pybiolib 1.2.71__py3-none-any.whl → 1.2.99.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
biolib/utils/seq_util.py CHANGED
@@ -1,7 +1,7 @@
1
1
  import re
2
- from io import BufferedIOBase
2
+ from io import BufferedIOBase, TextIOWrapper
3
3
 
4
- from biolib.typing_utils import Dict, List, Optional, Union
4
+ from biolib.typing_utils import Dict, List, Optional, Union, Iterator
5
5
 
6
6
 
7
7
  class SeqUtilRecord:
@@ -37,67 +37,75 @@ class SeqUtil:
37
37
  allow_any_sequence_characters: bool = False,
38
38
  allow_empty_sequence: bool = True,
39
39
  file_name: Optional[str] = None,
40
- ) -> List[SeqUtilRecord]:
40
+ ) -> Iterator[SeqUtilRecord]:
41
+ def process_and_yield_record(header: str, sequence_lines: list):
42
+ sequence = ''.join(sequence_lines)
43
+ sequence_id = header.split()[0]
44
+ if not allow_any_sequence_characters:
45
+ invalid_sequence_characters = SeqUtil._find_invalid_sequence_characters(sequence)
46
+ if invalid_sequence_characters:
47
+ raise Exception(
48
+ f'Error: Invalid character ("{invalid_sequence_characters[0]}") found in sequence {sequence_id}'
49
+ )
50
+ if not allow_empty_sequence and not sequence:
51
+ raise Exception(f'Error: No sequence found for fasta entry {sequence_id}')
52
+ yield SeqUtilRecord(
53
+ sequence=sequence,
54
+ sequence_id=sequence_id,
55
+ description=header[len(sequence_id):].strip()
56
+ )
57
+
58
+ def line_generator_from_buffered_io_base(file_handle: BufferedIOBase) -> Iterator[str]:
59
+ """Generates lines from a BufferedIOBase handle, decoding UTF-8."""
60
+ for line in file_handle:
61
+ yield line.decode('utf-8')
62
+
63
+ def line_generator_from_text_io_wrapper(file_handle: TextIOWrapper) -> Iterator[str]:
64
+ """Generates lines from a TextIOWrapper handle, decoding UTF-8."""
65
+ for line in file_handle:
66
+ yield line
67
+
41
68
  if input_file is None:
42
69
  if file_name:
43
70
  input_file = file_name
44
71
  else:
45
72
  raise ValueError('input_file must be a file name (str) or a BufferedIOBase object')
73
+
74
+ file_handle = None
46
75
  if isinstance(input_file, str):
47
- with open(input_file) as file_handle:
48
- data = file_handle.read().strip()
76
+ file_handle = open(input_file, "r")
77
+ line_iterator = line_generator_from_text_io_wrapper(file_handle)
49
78
  elif isinstance(input_file, BufferedIOBase):
50
- data = input_file.read().decode('utf-8')
79
+ line_iterator = line_generator_from_buffered_io_base(input_file)
51
80
  else:
52
81
  raise ValueError('input_file must be a file name (str) or a BufferedIOBase object')
53
- if not data:
54
- return []
55
-
56
- if '>' not in data:
57
- if default_header:
58
- lines_with_header = []
59
- for index, line in enumerate(data.split('\n')):
60
- index_string = str(index + 1) if index > 0 else ''
61
- lines_with_header.append(f'>{default_header}{index_string}\n{line}')
62
-
63
- data = '\n'.join(lines_with_header)
64
- else:
65
- raise Exception(f'No header line found in FASTA file "{file_name}"')
66
-
67
- splitted = []
68
- tmp_data = ''
69
- for line in data.splitlines():
70
- if line.startswith('>'):
71
- if tmp_data:
72
- splitted.append(tmp_data)
73
- tmp_data = line[1:].strip() + '\n'
74
- else:
75
- if line.strip():
76
- tmp_data += line.strip() + '\n'
77
-
78
- if tmp_data:
79
- splitted.append(tmp_data)
80
-
81
- parsed_sequences = []
82
- for sequence_data in splitted:
83
- sequence_data_splitted = sequence_data.strip().split('\n')
84
- header_line = sequence_data_splitted[0].split()
85
- sequence_id = header_line[0]
86
- description = sequence_data_splitted[0][len(sequence_id) :].strip()
87
- sequence = ''.join([seq.strip() for seq in sequence_data_splitted[1:]])
88
-
89
- if not allow_any_sequence_characters:
90
- invalid_sequence_characters = SeqUtil._find_invalid_sequence_characters(sequence)
91
- if len(invalid_sequence_characters) > 0:
92
- raise Exception(
93
- f'Error: Invalid character ("{invalid_sequence_characters[0]}") found in sequence {sequence_id}'
94
- )
95
- if not allow_empty_sequence and len(sequence) == 0:
96
- raise Exception(f'Error: No sequence found for fasta entry {sequence_id}')
97
-
98
- parsed_sequences.append(SeqUtilRecord(sequence=sequence, sequence_id=sequence_id, description=description))
99
82
 
100
- return parsed_sequences
83
+ header = None
84
+ sequence_lines: list[str] = []
85
+
86
+ try:
87
+ for line_number, line in enumerate(line_iterator):
88
+ line = line.strip()
89
+ if line.startswith('>'):
90
+ if header is not None:
91
+ yield from process_and_yield_record(header, sequence_lines)
92
+
93
+ header = line[1:].strip()
94
+ sequence_lines = []
95
+ else:
96
+ if header is None:
97
+ if default_header:
98
+ yield from process_and_yield_record(f"{default_header}{line_number}", [line])
99
+ else:
100
+ raise Exception(f'No header line found in FASTA file "{file_name}"')
101
+ else:
102
+ sequence_lines.append(line)
103
+
104
+ if header is not None:
105
+ yield from process_and_yield_record(header, sequence_lines)
106
+ finally:
107
+ if file_handle:
108
+ file_handle.close()
101
109
 
102
110
  @staticmethod
103
111
  def write_records_to_fasta(file_name: str, records: List[SeqUtilRecord]) -> None:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: pybiolib
3
- Version: 1.2.71
3
+ Version: 1.2.99.dev1
4
4
  Summary: BioLib Python Client
5
5
  Home-page: https://github.com/biolib
6
6
  License: MIT
@@ -117,10 +117,10 @@ biolib/utils/__init__.py,sha256=fwjciJyJicvYyZcVTzfDBgD0SKY13DeXqvTeG4qZIy8,5548
117
117
  biolib/utils/app_uri.py,sha256=Yq_-_VGugQhMMo6mM5f0G9yNlLkr0WK4j0Nrf3FE4xQ,2171
118
118
  biolib/utils/cache_state.py,sha256=u256F37QSRIVwqKlbnCyzAX4EMI-kl6Dwu6qwj-Qmag,3100
119
119
  biolib/utils/multipart_uploader.py,sha256=XvGP1I8tQuKhAH-QugPRoEsCi9qvbRk-DVBs5PNwwJo,8452
120
- biolib/utils/seq_util.py,sha256=ZQFcaE37B2dtucN2zDjOmdya_X0ITc1zBFZJNQY13XA,5183
120
+ biolib/utils/seq_util.py,sha256=fBJoGIkmqNqFebLIwSzxifXzkkUdBOepDYknBsks7Qw,5660
121
121
  biolib/utils/zip/remote_zip.py,sha256=0wErYlxir5921agfFeV1xVjf29l9VNgGQvNlWOlj2Yc,23232
122
- pybiolib-1.2.71.dist-info/LICENSE,sha256=F2h7gf8i0agDIeWoBPXDMYScvQOz02pAWkKhTGOHaaw,1067
123
- pybiolib-1.2.71.dist-info/METADATA,sha256=Fz8X4xfMgIOj-MyEF8-h8YkSZ0ZSHhi_uJtv56ikYvk,1506
124
- pybiolib-1.2.71.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
125
- pybiolib-1.2.71.dist-info/entry_points.txt,sha256=p6DyaP_2kctxegTX23WBznnrDi4mz6gx04O5uKtRDXg,42
126
- pybiolib-1.2.71.dist-info/RECORD,,
122
+ pybiolib-1.2.99.dev1.dist-info/LICENSE,sha256=F2h7gf8i0agDIeWoBPXDMYScvQOz02pAWkKhTGOHaaw,1067
123
+ pybiolib-1.2.99.dev1.dist-info/METADATA,sha256=FoK4QVbKuqHAXomf4Ezk2tGYhvl5MdeMyJqDOPSlhP0,1511
124
+ pybiolib-1.2.99.dev1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
125
+ pybiolib-1.2.99.dev1.dist-info/entry_points.txt,sha256=p6DyaP_2kctxegTX23WBznnrDi4mz6gx04O5uKtRDXg,42
126
+ pybiolib-1.2.99.dev1.dist-info/RECORD,,