PyPI - pybiolib - Versions diffs - 1.2.71__py3-none-any.whl → 1.2.99.dev1__py3-none-any.whl - Mend

pybiolib 1.2.71__py3-none-any.whl → 1.2.99.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

biolib/utils/seq_util.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import re
-from io import BufferedIOBase
+from io import BufferedIOBase, TextIOWrapper
-from biolib.typing_utils import Dict, List, Optional, Union
+from biolib.typing_utils import Dict, List, Optional, Union, Iterator
 class SeqUtilRecord:
@@ -37,67 +37,75 @@ class SeqUtil:
         allow_any_sequence_characters: bool = False,
         allow_empty_sequence: bool = True,
         file_name: Optional[str] = None,
-    ) -> List[SeqUtilRecord]:
+) -> Iterator[SeqUtilRecord]:
+        def process_and_yield_record(header: str, sequence_lines: list):
+            sequence = ''.join(sequence_lines)
+            sequence_id = header.split()[0]
+            if not allow_any_sequence_characters:
+                invalid_sequence_characters = SeqUtil._find_invalid_sequence_characters(sequence)
+                if invalid_sequence_characters:
+                    raise Exception(
+                        f'Error: Invalid character ("{invalid_sequence_characters[0]}") found in sequence {sequence_id}'
+                    )
+            if not allow_empty_sequence and not sequence:
+                raise Exception(f'Error: No sequence found for fasta entry {sequence_id}')
+            yield SeqUtilRecord(
+                sequence=sequence,
+                sequence_id=sequence_id,
+                description=header[len(sequence_id):].strip()
+            )
+        def line_generator_from_buffered_io_base(file_handle: BufferedIOBase) -> Iterator[str]:
+            """Generates lines from a BufferedIOBase handle, decoding UTF-8."""
+            for line in file_handle:
+                yield line.decode('utf-8')
+        def line_generator_from_text_io_wrapper(file_handle: TextIOWrapper) -> Iterator[str]:
+            """Generates lines from a TextIOWrapper handle, decoding UTF-8."""
+            for line in file_handle:
+                yield line
         if input_file is None:
             if file_name:
                 input_file = file_name
             else:
                 raise ValueError('input_file must be a file name (str) or a BufferedIOBase object')
+        file_handle = None
         if isinstance(input_file, str):
-            with open(input_file) as file_handle:
-                data = file_handle.read().strip()
+            file_handle = open(input_file, "r")
+            line_iterator = line_generator_from_text_io_wrapper(file_handle)
         elif isinstance(input_file, BufferedIOBase):
-            data = input_file.read().decode('utf-8')
+            line_iterator = line_generator_from_buffered_io_base(input_file)
         else:
             raise ValueError('input_file must be a file name (str) or a BufferedIOBase object')
-        if not data:
-            return []
-        if '>' not in data:
-            if default_header:
-                lines_with_header = []
-                for index, line in enumerate(data.split('\n')):
-                    index_string = str(index + 1) if index > 0 else ''
-                    lines_with_header.append(f'>{default_header}{index_string}\n{line}')
-                data = '\n'.join(lines_with_header)
-            else:
-                raise Exception(f'No header line found in FASTA file "{file_name}"')
-        splitted = []
-        tmp_data = ''
-        for line in data.splitlines():
-            if line.startswith('>'):
-                if tmp_data:
-                    splitted.append(tmp_data)
-                tmp_data = line[1:].strip() + '\n'
-            else:
-                if line.strip():
-                    tmp_data += line.strip() + '\n'
-        if tmp_data:
-            splitted.append(tmp_data)
-        parsed_sequences = []
-        for sequence_data in splitted:
-            sequence_data_splitted = sequence_data.strip().split('\n')
-            header_line = sequence_data_splitted[0].split()
-            sequence_id = header_line[0]
-            description = sequence_data_splitted[0][len(sequence_id) :].strip()
-            sequence = ''.join([seq.strip() for seq in sequence_data_splitted[1:]])
-            if not allow_any_sequence_characters:
-                invalid_sequence_characters = SeqUtil._find_invalid_sequence_characters(sequence)
-                if len(invalid_sequence_characters) > 0:
-                    raise Exception(
-                        f'Error: Invalid character ("{invalid_sequence_characters[0]}") found in sequence {sequence_id}'
-                    )
-            if not allow_empty_sequence and len(sequence) == 0:
-                raise Exception(f'Error: No sequence found for fasta entry {sequence_id}')
-            parsed_sequences.append(SeqUtilRecord(sequence=sequence, sequence_id=sequence_id, description=description))
-        return parsed_sequences
+        header = None
+        sequence_lines: list[str] = []
+        try:
+            for line_number, line in enumerate(line_iterator):
+                line = line.strip()
+                if line.startswith('>'):
+                    if header is not None:
+                        yield from process_and_yield_record(header, sequence_lines)
+                    header = line[1:].strip()
+                    sequence_lines = []
+                else:
+                    if header is None:
+                        if default_header:
+                            yield from process_and_yield_record(f"{default_header}{line_number}", [line])
+                        else:
+                            raise Exception(f'No header line found in FASTA file "{file_name}"')
+                    else:
+                        sequence_lines.append(line)
+            if header is not None:
+                yield from process_and_yield_record(header, sequence_lines)
+        finally:
+            if file_handle:
+                file_handle.close()
     @staticmethod
     def write_records_to_fasta(file_name: str, records: List[SeqUtilRecord]) -> None:

{pybiolib-1.2.71.dist-info → pybiolib-1.2.99.dev1.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: pybiolib
-Version: 1.2.71
+Version: 1.2.99.dev1
 Summary: BioLib Python Client
 Home-page: https://github.com/biolib
 License: MIT

{pybiolib-1.2.71.dist-info → pybiolib-1.2.99.dev1.dist-info}/RECORD RENAMED Viewed

@@ -117,10 +117,10 @@ biolib/utils/__init__.py,sha256=fwjciJyJicvYyZcVTzfDBgD0SKY13DeXqvTeG4qZIy8,5548
 biolib/utils/app_uri.py,sha256=Yq_-_VGugQhMMo6mM5f0G9yNlLkr0WK4j0Nrf3FE4xQ,2171
 biolib/utils/cache_state.py,sha256=u256F37QSRIVwqKlbnCyzAX4EMI-kl6Dwu6qwj-Qmag,3100
 biolib/utils/multipart_uploader.py,sha256=XvGP1I8tQuKhAH-QugPRoEsCi9qvbRk-DVBs5PNwwJo,8452
-biolib/utils/seq_util.py,sha256=ZQFcaE37B2dtucN2zDjOmdya_X0ITc1zBFZJNQY13XA,5183
+biolib/utils/seq_util.py,sha256=fBJoGIkmqNqFebLIwSzxifXzkkUdBOepDYknBsks7Qw,5660
 biolib/utils/zip/remote_zip.py,sha256=0wErYlxir5921agfFeV1xVjf29l9VNgGQvNlWOlj2Yc,23232
-pybiolib-1.2.71.dist-info/LICENSE,sha256=F2h7gf8i0agDIeWoBPXDMYScvQOz02pAWkKhTGOHaaw,1067
-pybiolib-1.2.71.dist-info/METADATA,sha256=Fz8X4xfMgIOj-MyEF8-h8YkSZ0ZSHhi_uJtv56ikYvk,1506
-pybiolib-1.2.71.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-pybiolib-1.2.71.dist-info/entry_points.txt,sha256=p6DyaP_2kctxegTX23WBznnrDi4mz6gx04O5uKtRDXg,42
-pybiolib-1.2.71.dist-info/RECORD,,
+pybiolib-1.2.99.dev1.dist-info/LICENSE,sha256=F2h7gf8i0agDIeWoBPXDMYScvQOz02pAWkKhTGOHaaw,1067
+pybiolib-1.2.99.dev1.dist-info/METADATA,sha256=FoK4QVbKuqHAXomf4Ezk2tGYhvl5MdeMyJqDOPSlhP0,1511
+pybiolib-1.2.99.dev1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+pybiolib-1.2.99.dev1.dist-info/entry_points.txt,sha256=p6DyaP_2kctxegTX23WBznnrDi4mz6gx04O5uKtRDXg,42
+pybiolib-1.2.99.dev1.dist-info/RECORD,,

{pybiolib-1.2.71.dist-info → pybiolib-1.2.99.dev1.dist-info}/LICENSE RENAMED Viewed

File without changes

{pybiolib-1.2.71.dist-info → pybiolib-1.2.99.dev1.dist-info}/WHEEL RENAMED Viewed

File without changes

{pybiolib-1.2.71.dist-info → pybiolib-1.2.99.dev1.dist-info}/entry_points.txt RENAMED Viewed

File without changes

pybiolib 1.2.71__py3-none-any.whl → 1.2.99.dev1__py3-none-any.whl

pybiolib 1.2.71__py3-none-any.whl → 1.2.99.dev1__py3-none-any.whl