pybiolib 1.2.63.dev1__py3-none-any.whl → 1.2.98.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
biolib/utils/seq_util.py CHANGED
@@ -1,7 +1,7 @@
1
1
  import re
2
- from io import BufferedIOBase
2
+ from io import BufferedIOBase, TextIOWrapper
3
3
 
4
- from biolib.typing_utils import Dict, List, Optional, Union
4
+ from biolib.typing_utils import Dict, List, Optional, Union, Iterator
5
5
 
6
6
 
7
7
  class SeqUtilRecord:
@@ -37,67 +37,76 @@ class SeqUtil:
37
37
  allow_any_sequence_characters: bool = False,
38
38
  allow_empty_sequence: bool = True,
39
39
  file_name: Optional[str] = None,
40
- ) -> List[SeqUtilRecord]:
40
+ ) -> Iterator[SeqUtilRecord]:
41
+ def process_and_yield_record(header: str, sequence_lines: list):
42
+ """Processes the sequence and yields a SeqUtilRecord."""
43
+ sequence = ''.join(sequence_lines)
44
+ sequence_id = header.split()[0]
45
+ if not allow_any_sequence_characters:
46
+ invalid_sequence_characters = SeqUtil._find_invalid_sequence_characters(sequence)
47
+ if invalid_sequence_characters:
48
+ raise Exception(
49
+ f'Error: Invalid character ("{invalid_sequence_characters[0]}") found in sequence {sequence_id}'
50
+ )
51
+ if not allow_empty_sequence and not sequence:
52
+ raise Exception(f'Error: No sequence found for fasta entry {sequence_id}')
53
+ yield SeqUtilRecord(
54
+ sequence=sequence,
55
+ sequence_id=sequence_id,
56
+ description=header[len(sequence_id):].strip()
57
+ )
58
+
59
+ def line_generator_from_buffered_io_base(file_handle: BufferedIOBase) -> Iterator[str]:
60
+ """Generates lines from a BufferedIOBase handle, decoding UTF-8."""
61
+ for line in file_handle:
62
+ yield line.decode('utf-8')
63
+
64
+ def line_generator_from_text_io_wrapper(file_handle: TextIOWrapper) -> Iterator[str]:
65
+ """Generates lines from a TextIOWrapper handle, decoding UTF-8."""
66
+ for line in file_handle:
67
+ yield line
68
+
41
69
  if input_file is None:
42
70
  if file_name:
43
71
  input_file = file_name
44
72
  else:
45
73
  raise ValueError('input_file must be a file name (str) or a BufferedIOBase object')
74
+
75
+ file_handle = None
46
76
  if isinstance(input_file, str):
47
- with open(input_file) as file_handle:
48
- data = file_handle.read().strip()
77
+ file_handle = open(input_file, "r")
78
+ line_iterator = line_generator_from_text_io_wrapper(file_handle)
49
79
  elif isinstance(input_file, BufferedIOBase):
50
- data = input_file.read().decode('utf-8')
80
+ line_iterator = line_generator_from_buffered_io_base(input_file)
51
81
  else:
52
82
  raise ValueError('input_file must be a file name (str) or a BufferedIOBase object')
53
- if not data:
54
- return []
55
-
56
- if '>' not in data:
57
- if default_header:
58
- lines_with_header = []
59
- for index, line in enumerate(data.split('\n')):
60
- index_string = str(index + 1) if index > 0 else ''
61
- lines_with_header.append(f'>{default_header}{index_string}\n{line}')
62
-
63
- data = '\n'.join(lines_with_header)
64
- else:
65
- raise Exception(f'No header line found in FASTA file "{file_name}"')
66
-
67
- splitted = []
68
- tmp_data = ''
69
- for line in data.splitlines():
70
- if line.startswith('>'):
71
- if tmp_data:
72
- splitted.append(tmp_data)
73
- tmp_data = line[1:].strip() + '\n'
74
- else:
75
- if line.strip():
76
- tmp_data += line.strip() + '\n'
77
-
78
- if tmp_data:
79
- splitted.append(tmp_data)
80
-
81
- parsed_sequences = []
82
- for sequence_data in splitted:
83
- sequence_data_splitted = sequence_data.strip().split('\n')
84
- header_line = sequence_data_splitted[0].split()
85
- sequence_id = header_line[0]
86
- description = sequence_data_splitted[0][len(sequence_id) :].strip()
87
- sequence = ''.join([seq.strip() for seq in sequence_data_splitted[1:]])
88
-
89
- if not allow_any_sequence_characters:
90
- invalid_sequence_characters = SeqUtil._find_invalid_sequence_characters(sequence)
91
- if len(invalid_sequence_characters) > 0:
92
- raise Exception(
93
- f'Error: Invalid character ("{invalid_sequence_characters[0]}") found in sequence {sequence_id}'
94
- )
95
- if not allow_empty_sequence and len(sequence) == 0:
96
- raise Exception(f'Error: No sequence found for fasta entry {sequence_id}')
97
-
98
- parsed_sequences.append(SeqUtilRecord(sequence=sequence, sequence_id=sequence_id, description=description))
99
83
 
100
- return parsed_sequences
84
+ header = None
85
+ sequence_lines: list[str] = []
86
+
87
+ try:
88
+ for line_number, line in enumerate(line_iterator):
89
+ line = line.strip()
90
+ if line.startswith('>'):
91
+ if header is not None:
92
+ yield from process_and_yield_record(header, sequence_lines)
93
+
94
+ header = line[1:].strip()
95
+ sequence_lines = []
96
+ else:
97
+ if header is None:
98
+ if default_header:
99
+ yield from process_and_yield_record(f"{default_header}{line_number}", [line])
100
+ else:
101
+ raise Exception(f'No header line found in FASTA file "{file_name}"')
102
+ else:
103
+ sequence_lines.append(line)
104
+
105
+ if header is not None:
106
+ yield from process_and_yield_record(header, sequence_lines)
107
+ finally:
108
+ if file_handle:
109
+ file_handle.close()
101
110
 
102
111
  @staticmethod
103
112
  def write_records_to_fasta(file_name: str, records: List[SeqUtilRecord]) -> None:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: pybiolib
3
- Version: 1.2.63.dev1
3
+ Version: 1.2.98.dev1
4
4
  Summary: BioLib Python Client
5
5
  Home-page: https://github.com/biolib
6
6
  License: MIT
@@ -117,10 +117,10 @@ biolib/utils/__init__.py,sha256=fwjciJyJicvYyZcVTzfDBgD0SKY13DeXqvTeG4qZIy8,5548
117
117
  biolib/utils/app_uri.py,sha256=Yq_-_VGugQhMMo6mM5f0G9yNlLkr0WK4j0Nrf3FE4xQ,2171
118
118
  biolib/utils/cache_state.py,sha256=u256F37QSRIVwqKlbnCyzAX4EMI-kl6Dwu6qwj-Qmag,3100
119
119
  biolib/utils/multipart_uploader.py,sha256=XvGP1I8tQuKhAH-QugPRoEsCi9qvbRk-DVBs5PNwwJo,8452
120
- biolib/utils/seq_util.py,sha256=ZQFcaE37B2dtucN2zDjOmdya_X0ITc1zBFZJNQY13XA,5183
120
+ biolib/utils/seq_util.py,sha256=oQGSg81qECf0p481dsvCXt2fwe0RNmDuTe2QDJA_bNY,5729
121
121
  biolib/utils/zip/remote_zip.py,sha256=0wErYlxir5921agfFeV1xVjf29l9VNgGQvNlWOlj2Yc,23232
122
- pybiolib-1.2.63.dev1.dist-info/LICENSE,sha256=F2h7gf8i0agDIeWoBPXDMYScvQOz02pAWkKhTGOHaaw,1067
123
- pybiolib-1.2.63.dev1.dist-info/METADATA,sha256=t8YhPOU4slbJKvi3kieokCcqoKqgyqaYf5Tjwi7Vs3s,1511
124
- pybiolib-1.2.63.dev1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
125
- pybiolib-1.2.63.dev1.dist-info/entry_points.txt,sha256=p6DyaP_2kctxegTX23WBznnrDi4mz6gx04O5uKtRDXg,42
126
- pybiolib-1.2.63.dev1.dist-info/RECORD,,
122
+ pybiolib-1.2.98.dev1.dist-info/LICENSE,sha256=F2h7gf8i0agDIeWoBPXDMYScvQOz02pAWkKhTGOHaaw,1067
123
+ pybiolib-1.2.98.dev1.dist-info/METADATA,sha256=IrQEwIzu-zbqMdFnI4_t-zsPRDMX2MqFl4n8-mfHWRo,1511
124
+ pybiolib-1.2.98.dev1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
125
+ pybiolib-1.2.98.dev1.dist-info/entry_points.txt,sha256=p6DyaP_2kctxegTX23WBznnrDi4mz6gx04O5uKtRDXg,42
126
+ pybiolib-1.2.98.dev1.dist-info/RECORD,,