pybiolib 1.2.158.dev1__py3-none-any.whl → 1.2.163.dev1__py3-none-any.whl
This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- biolib/utils/seq_util.py +19 -2
- {pybiolib-1.2.158.dev1.dist-info → pybiolib-1.2.163.dev1.dist-info}/METADATA +1 -1
- {pybiolib-1.2.158.dev1.dist-info → pybiolib-1.2.163.dev1.dist-info}/RECORD +6 -6
- {pybiolib-1.2.158.dev1.dist-info → pybiolib-1.2.163.dev1.dist-info}/LICENSE +0 -0
- {pybiolib-1.2.158.dev1.dist-info → pybiolib-1.2.163.dev1.dist-info}/WHEEL +0 -0
- {pybiolib-1.2.158.dev1.dist-info → pybiolib-1.2.163.dev1.dist-info}/entry_points.txt +0 -0
biolib/utils/seq_util.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
import re
|
2
2
|
from io import BufferedIOBase, TextIOBase
|
3
3
|
|
4
|
-
from biolib.typing_utils import Dict, List, Optional, Union
|
4
|
+
from biolib.typing_utils import Dict, Iterator, List, Optional, Union
|
5
5
|
|
6
6
|
|
7
7
|
class SeqUtilRecord:
|
@@ -35,14 +35,22 @@ class SeqUtil:
|
|
35
35
|
input_file: Union[str, BufferedIOBase, None] = None,
|
36
36
|
default_header: Optional[str] = None,
|
37
37
|
allow_any_sequence_characters: bool = False,
|
38
|
+
use_strict_alphabet: Optional[bool] = False,
|
38
39
|
allow_empty_sequence: bool = True,
|
39
40
|
file_name: Optional[str] = None,
|
40
41
|
) -> Iterator[SeqUtilRecord]:
|
41
42
|
def process_and_yield_record(header: str, sequence_lines: List[str]):
|
42
43
|
sequence = ''.join(sequence_lines)
|
43
44
|
sequence_id = header.split()[0]
|
45
|
+
if allow_any_sequence_characters and use_strict_alphabet:
|
46
|
+
raise Exception(
|
47
|
+
'Error: Please choose either allow_any_sequence_characters or use_strict_alphabet'
|
48
|
+
)
|
44
49
|
if not allow_any_sequence_characters:
|
45
|
-
|
50
|
+
if use_strict_alphabet:
|
51
|
+
invalid_sequence_characters = SeqUtil._find_invalid_sequence_characters_strict(sequence)
|
52
|
+
else:
|
53
|
+
invalid_sequence_characters = SeqUtil._find_invalid_sequence_characters(sequence)
|
46
54
|
if invalid_sequence_characters:
|
47
55
|
raise Exception(
|
48
56
|
f'Error: Invalid character ("{invalid_sequence_characters[0]}") found in sequence {sequence_id}'
|
@@ -126,6 +134,15 @@ class SeqUtil:
|
|
126
134
|
invalid_chars = [char for char in sequence if char not in allowed_sequence_chars]
|
127
135
|
return invalid_chars
|
128
136
|
|
137
|
+
@staticmethod
|
138
|
+
def _find_invalid_sequence_characters_strict(sequence: str) -> List[str]:
|
139
|
+
# Equivalent to fair-esm alphabet, compatible with ESM-models
|
140
|
+
# Excludes digits, '_' and 'J' (ambiguous letter only used in mass-spec NMR)
|
141
|
+
# https://github.com/facebookresearch/esm/blob/2b369911bb5b4b0dda914521b9475cad1656b2ac/esm/constants.py#L8
|
142
|
+
allowed_sequence_chars = set('lagvsertidpkqnfymhwcxbuzoLAGVSERTIDPKQNFYMHWCXBUZO-.')
|
143
|
+
invalid_chars = [char for char in sequence if char not in allowed_sequence_chars]
|
144
|
+
return invalid_chars
|
145
|
+
|
129
146
|
@staticmethod
|
130
147
|
def _find_invalid_sequence_id_characters(sequence: str) -> List[str]:
|
131
148
|
allowed_chars = set('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_.:*#')
|
@@ -117,10 +117,10 @@ biolib/utils/__init__.py,sha256=fwjciJyJicvYyZcVTzfDBgD0SKY13DeXqvTeG4qZIy8,5548
|
|
117
117
|
biolib/utils/app_uri.py,sha256=Yq_-_VGugQhMMo6mM5f0G9yNlLkr0WK4j0Nrf3FE4xQ,2171
|
118
118
|
biolib/utils/cache_state.py,sha256=u256F37QSRIVwqKlbnCyzAX4EMI-kl6Dwu6qwj-Qmag,3100
|
119
119
|
biolib/utils/multipart_uploader.py,sha256=XvGP1I8tQuKhAH-QugPRoEsCi9qvbRk-DVBs5PNwwJo,8452
|
120
|
-
biolib/utils/seq_util.py,sha256=
|
120
|
+
biolib/utils/seq_util.py,sha256=Ozk0blGtPur_D9MwShD02r_mphyQmgZkx-lOHOwnlIM,6730
|
121
121
|
biolib/utils/zip/remote_zip.py,sha256=0wErYlxir5921agfFeV1xVjf29l9VNgGQvNlWOlj2Yc,23232
|
122
|
-
pybiolib-1.2.
|
123
|
-
pybiolib-1.2.
|
124
|
-
pybiolib-1.2.
|
125
|
-
pybiolib-1.2.
|
126
|
-
pybiolib-1.2.
|
122
|
+
pybiolib-1.2.163.dev1.dist-info/LICENSE,sha256=F2h7gf8i0agDIeWoBPXDMYScvQOz02pAWkKhTGOHaaw,1067
|
123
|
+
pybiolib-1.2.163.dev1.dist-info/METADATA,sha256=JK1zdgar9NsD7Wv30JWkX3NNIzcKLQHQVT-Hy6n-I38,1512
|
124
|
+
pybiolib-1.2.163.dev1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
125
|
+
pybiolib-1.2.163.dev1.dist-info/entry_points.txt,sha256=p6DyaP_2kctxegTX23WBznnrDi4mz6gx04O5uKtRDXg,42
|
126
|
+
pybiolib-1.2.163.dev1.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|