pybiolib 1.2.158.dev1__py3-none-any.whl → 1.2.163.dev1__py3-none-any.whl

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
biolib/utils/seq_util.py CHANGED
@@ -1,7 +1,7 @@
1
1
  import re
2
2
  from io import BufferedIOBase, TextIOBase
3
3
 
4
- from biolib.typing_utils import Dict, List, Optional, Union, Iterator
4
+ from biolib.typing_utils import Dict, Iterator, List, Optional, Union
5
5
 
6
6
 
7
7
  class SeqUtilRecord:
@@ -35,14 +35,22 @@ class SeqUtil:
35
35
  input_file: Union[str, BufferedIOBase, None] = None,
36
36
  default_header: Optional[str] = None,
37
37
  allow_any_sequence_characters: bool = False,
38
+ use_strict_alphabet: Optional[bool] = False,
38
39
  allow_empty_sequence: bool = True,
39
40
  file_name: Optional[str] = None,
40
41
  ) -> Iterator[SeqUtilRecord]:
41
42
  def process_and_yield_record(header: str, sequence_lines: List[str]):
42
43
  sequence = ''.join(sequence_lines)
43
44
  sequence_id = header.split()[0]
45
+ if allow_any_sequence_characters and use_strict_alphabet:
46
+ raise Exception(
47
+ 'Error: Please choose either allow_any_sequence_characters or use_strict_alphabet'
48
+ )
44
49
  if not allow_any_sequence_characters:
45
- invalid_sequence_characters = SeqUtil._find_invalid_sequence_characters(sequence)
50
+ if use_strict_alphabet:
51
+ invalid_sequence_characters = SeqUtil._find_invalid_sequence_characters_strict(sequence)
52
+ else:
53
+ invalid_sequence_characters = SeqUtil._find_invalid_sequence_characters(sequence)
46
54
  if invalid_sequence_characters:
47
55
  raise Exception(
48
56
  f'Error: Invalid character ("{invalid_sequence_characters[0]}") found in sequence {sequence_id}'
@@ -126,6 +134,15 @@ class SeqUtil:
126
134
  invalid_chars = [char for char in sequence if char not in allowed_sequence_chars]
127
135
  return invalid_chars
128
136
 
137
+ @staticmethod
138
+ def _find_invalid_sequence_characters_strict(sequence: str) -> List[str]:
139
+ # Equivalent to fair-esm alphabet, compatible with ESM-models
140
+ # Excludes digits, '_' and 'J' (ambiguous letter only used in mass-spec NMR)
141
+ # https://github.com/facebookresearch/esm/blob/2b369911bb5b4b0dda914521b9475cad1656b2ac/esm/constants.py#L8
142
+ allowed_sequence_chars = set('lagvsertidpkqnfymhwcxbuzoLAGVSERTIDPKQNFYMHWCXBUZO-.')
143
+ invalid_chars = [char for char in sequence if char not in allowed_sequence_chars]
144
+ return invalid_chars
145
+
129
146
  @staticmethod
130
147
  def _find_invalid_sequence_id_characters(sequence: str) -> List[str]:
131
148
  allowed_chars = set('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_.:*#')
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: pybiolib
3
- Version: 1.2.158.dev1
3
+ Version: 1.2.163.dev1
4
4
  Summary: BioLib Python Client
5
5
  Home-page: https://github.com/biolib
6
6
  License: MIT
@@ -117,10 +117,10 @@ biolib/utils/__init__.py,sha256=fwjciJyJicvYyZcVTzfDBgD0SKY13DeXqvTeG4qZIy8,5548
117
117
  biolib/utils/app_uri.py,sha256=Yq_-_VGugQhMMo6mM5f0G9yNlLkr0WK4j0Nrf3FE4xQ,2171
118
118
  biolib/utils/cache_state.py,sha256=u256F37QSRIVwqKlbnCyzAX4EMI-kl6Dwu6qwj-Qmag,3100
119
119
  biolib/utils/multipart_uploader.py,sha256=XvGP1I8tQuKhAH-QugPRoEsCi9qvbRk-DVBs5PNwwJo,8452
120
- biolib/utils/seq_util.py,sha256=WJnU9vZdwY8RHXvzATyV80OXzyJ7w9EkG33Tna9Nr6A,5698
120
+ biolib/utils/seq_util.py,sha256=Ozk0blGtPur_D9MwShD02r_mphyQmgZkx-lOHOwnlIM,6730
121
121
  biolib/utils/zip/remote_zip.py,sha256=0wErYlxir5921agfFeV1xVjf29l9VNgGQvNlWOlj2Yc,23232
122
- pybiolib-1.2.158.dev1.dist-info/LICENSE,sha256=F2h7gf8i0agDIeWoBPXDMYScvQOz02pAWkKhTGOHaaw,1067
123
- pybiolib-1.2.158.dev1.dist-info/METADATA,sha256=7gYFVnp6YM8fLfHzg0S_XDeC6DxdTzeDX6dfWesenm0,1512
124
- pybiolib-1.2.158.dev1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
125
- pybiolib-1.2.158.dev1.dist-info/entry_points.txt,sha256=p6DyaP_2kctxegTX23WBznnrDi4mz6gx04O5uKtRDXg,42
126
- pybiolib-1.2.158.dev1.dist-info/RECORD,,
122
+ pybiolib-1.2.163.dev1.dist-info/LICENSE,sha256=F2h7gf8i0agDIeWoBPXDMYScvQOz02pAWkKhTGOHaaw,1067
123
+ pybiolib-1.2.163.dev1.dist-info/METADATA,sha256=JK1zdgar9NsD7Wv30JWkX3NNIzcKLQHQVT-Hy6n-I38,1512
124
+ pybiolib-1.2.163.dev1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
125
+ pybiolib-1.2.163.dev1.dist-info/entry_points.txt,sha256=p6DyaP_2kctxegTX23WBznnrDi4mz6gx04O5uKtRDXg,42
126
+ pybiolib-1.2.163.dev1.dist-info/RECORD,,