pybiolib-1.2.71-py3-none-any.whl → pybiolib-1.2.99.dev1-py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
- biolib/utils/seq_util.py +61 -53
- {pybiolib-1.2.71.dist-info → pybiolib-1.2.99.dev1.dist-info}/METADATA +1 -1
- {pybiolib-1.2.71.dist-info → pybiolib-1.2.99.dev1.dist-info}/RECORD +6 -6
- {pybiolib-1.2.71.dist-info → pybiolib-1.2.99.dev1.dist-info}/LICENSE +0 -0
- {pybiolib-1.2.71.dist-info → pybiolib-1.2.99.dev1.dist-info}/WHEEL +0 -0
- {pybiolib-1.2.71.dist-info → pybiolib-1.2.99.dev1.dist-info}/entry_points.txt +0 -0
biolib/utils/seq_util.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
import re
|
2
|
-
from io import BufferedIOBase
|
2
|
+
from io import BufferedIOBase, TextIOWrapper
|
3
3
|
|
4
|
-
from biolib.typing_utils import Dict, List, Optional, Union
|
4
|
+
from biolib.typing_utils import Dict, List, Optional, Union, Iterator
|
5
5
|
|
6
6
|
|
7
7
|
class SeqUtilRecord:
|
@@ -37,67 +37,75 @@ class SeqUtil:
|
|
37
37
|
allow_any_sequence_characters: bool = False,
|
38
38
|
allow_empty_sequence: bool = True,
|
39
39
|
file_name: Optional[str] = None,
|
40
|
-
|
40
|
+
) -> Iterator[SeqUtilRecord]:
|
41
|
+
def process_and_yield_record(header: str, sequence_lines: list):
|
42
|
+
sequence = ''.join(sequence_lines)
|
43
|
+
sequence_id = header.split()[0]
|
44
|
+
if not allow_any_sequence_characters:
|
45
|
+
invalid_sequence_characters = SeqUtil._find_invalid_sequence_characters(sequence)
|
46
|
+
if invalid_sequence_characters:
|
47
|
+
raise Exception(
|
48
|
+
f'Error: Invalid character ("{invalid_sequence_characters[0]}") found in sequence {sequence_id}'
|
49
|
+
)
|
50
|
+
if not allow_empty_sequence and not sequence:
|
51
|
+
raise Exception(f'Error: No sequence found for fasta entry {sequence_id}')
|
52
|
+
yield SeqUtilRecord(
|
53
|
+
sequence=sequence,
|
54
|
+
sequence_id=sequence_id,
|
55
|
+
description=header[len(sequence_id):].strip()
|
56
|
+
)
|
57
|
+
|
58
|
+
def line_generator_from_buffered_io_base(file_handle: BufferedIOBase) -> Iterator[str]:
|
59
|
+
"""Generates lines from a BufferedIOBase handle, decoding UTF-8."""
|
60
|
+
for line in file_handle:
|
61
|
+
yield line.decode('utf-8')
|
62
|
+
|
63
|
+
def line_generator_from_text_io_wrapper(file_handle: TextIOWrapper) -> Iterator[str]:
|
64
|
+
"""Generates lines from a TextIOWrapper handle, decoding UTF-8."""
|
65
|
+
for line in file_handle:
|
66
|
+
yield line
|
67
|
+
|
41
68
|
if input_file is None:
|
42
69
|
if file_name:
|
43
70
|
input_file = file_name
|
44
71
|
else:
|
45
72
|
raise ValueError('input_file must be a file name (str) or a BufferedIOBase object')
|
73
|
+
|
74
|
+
file_handle = None
|
46
75
|
if isinstance(input_file, str):
|
47
|
-
|
48
|
-
|
76
|
+
file_handle = open(input_file, "r")
|
77
|
+
line_iterator = line_generator_from_text_io_wrapper(file_handle)
|
49
78
|
elif isinstance(input_file, BufferedIOBase):
|
50
|
-
|
79
|
+
line_iterator = line_generator_from_buffered_io_base(input_file)
|
51
80
|
else:
|
52
81
|
raise ValueError('input_file must be a file name (str) or a BufferedIOBase object')
|
53
|
-
if not data:
|
54
|
-
return []
|
55
|
-
|
56
|
-
if '>' not in data:
|
57
|
-
if default_header:
|
58
|
-
lines_with_header = []
|
59
|
-
for index, line in enumerate(data.split('\n')):
|
60
|
-
index_string = str(index + 1) if index > 0 else ''
|
61
|
-
lines_with_header.append(f'>{default_header}{index_string}\n{line}')
|
62
|
-
|
63
|
-
data = '\n'.join(lines_with_header)
|
64
|
-
else:
|
65
|
-
raise Exception(f'No header line found in FASTA file "{file_name}"')
|
66
|
-
|
67
|
-
splitted = []
|
68
|
-
tmp_data = ''
|
69
|
-
for line in data.splitlines():
|
70
|
-
if line.startswith('>'):
|
71
|
-
if tmp_data:
|
72
|
-
splitted.append(tmp_data)
|
73
|
-
tmp_data = line[1:].strip() + '\n'
|
74
|
-
else:
|
75
|
-
if line.strip():
|
76
|
-
tmp_data += line.strip() + '\n'
|
77
|
-
|
78
|
-
if tmp_data:
|
79
|
-
splitted.append(tmp_data)
|
80
|
-
|
81
|
-
parsed_sequences = []
|
82
|
-
for sequence_data in splitted:
|
83
|
-
sequence_data_splitted = sequence_data.strip().split('\n')
|
84
|
-
header_line = sequence_data_splitted[0].split()
|
85
|
-
sequence_id = header_line[0]
|
86
|
-
description = sequence_data_splitted[0][len(sequence_id) :].strip()
|
87
|
-
sequence = ''.join([seq.strip() for seq in sequence_data_splitted[1:]])
|
88
|
-
|
89
|
-
if not allow_any_sequence_characters:
|
90
|
-
invalid_sequence_characters = SeqUtil._find_invalid_sequence_characters(sequence)
|
91
|
-
if len(invalid_sequence_characters) > 0:
|
92
|
-
raise Exception(
|
93
|
-
f'Error: Invalid character ("{invalid_sequence_characters[0]}") found in sequence {sequence_id}'
|
94
|
-
)
|
95
|
-
if not allow_empty_sequence and len(sequence) == 0:
|
96
|
-
raise Exception(f'Error: No sequence found for fasta entry {sequence_id}')
|
97
|
-
|
98
|
-
parsed_sequences.append(SeqUtilRecord(sequence=sequence, sequence_id=sequence_id, description=description))
|
99
82
|
|
100
|
-
|
83
|
+
header = None
|
84
|
+
sequence_lines: list[str] = []
|
85
|
+
|
86
|
+
try:
|
87
|
+
for line_number, line in enumerate(line_iterator):
|
88
|
+
line = line.strip()
|
89
|
+
if line.startswith('>'):
|
90
|
+
if header is not None:
|
91
|
+
yield from process_and_yield_record(header, sequence_lines)
|
92
|
+
|
93
|
+
header = line[1:].strip()
|
94
|
+
sequence_lines = []
|
95
|
+
else:
|
96
|
+
if header is None:
|
97
|
+
if default_header:
|
98
|
+
yield from process_and_yield_record(f"{default_header}{line_number}", [line])
|
99
|
+
else:
|
100
|
+
raise Exception(f'No header line found in FASTA file "{file_name}"')
|
101
|
+
else:
|
102
|
+
sequence_lines.append(line)
|
103
|
+
|
104
|
+
if header is not None:
|
105
|
+
yield from process_and_yield_record(header, sequence_lines)
|
106
|
+
finally:
|
107
|
+
if file_handle:
|
108
|
+
file_handle.close()
|
101
109
|
|
102
110
|
@staticmethod
|
103
111
|
def write_records_to_fasta(file_name: str, records: List[SeqUtilRecord]) -> None:
|
@@ -117,10 +117,10 @@ biolib/utils/__init__.py,sha256=fwjciJyJicvYyZcVTzfDBgD0SKY13DeXqvTeG4qZIy8,5548
|
|
117
117
|
biolib/utils/app_uri.py,sha256=Yq_-_VGugQhMMo6mM5f0G9yNlLkr0WK4j0Nrf3FE4xQ,2171
|
118
118
|
biolib/utils/cache_state.py,sha256=u256F37QSRIVwqKlbnCyzAX4EMI-kl6Dwu6qwj-Qmag,3100
|
119
119
|
biolib/utils/multipart_uploader.py,sha256=XvGP1I8tQuKhAH-QugPRoEsCi9qvbRk-DVBs5PNwwJo,8452
|
120
|
-
biolib/utils/seq_util.py,sha256=
|
120
|
+
biolib/utils/seq_util.py,sha256=fBJoGIkmqNqFebLIwSzxifXzkkUdBOepDYknBsks7Qw,5660
|
121
121
|
biolib/utils/zip/remote_zip.py,sha256=0wErYlxir5921agfFeV1xVjf29l9VNgGQvNlWOlj2Yc,23232
|
122
|
-
pybiolib-1.2.
|
123
|
-
pybiolib-1.2.
|
124
|
-
pybiolib-1.2.
|
125
|
-
pybiolib-1.2.
|
126
|
-
pybiolib-1.2.
|
122
|
+
pybiolib-1.2.99.dev1.dist-info/LICENSE,sha256=F2h7gf8i0agDIeWoBPXDMYScvQOz02pAWkKhTGOHaaw,1067
|
123
|
+
pybiolib-1.2.99.dev1.dist-info/METADATA,sha256=FoK4QVbKuqHAXomf4Ezk2tGYhvl5MdeMyJqDOPSlhP0,1511
|
124
|
+
pybiolib-1.2.99.dev1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
125
|
+
pybiolib-1.2.99.dev1.dist-info/entry_points.txt,sha256=p6DyaP_2kctxegTX23WBznnrDi4mz6gx04O5uKtRDXg,42
|
126
|
+
pybiolib-1.2.99.dev1.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|