pybiolib 1.1.2173__py3-none-any.whl → 1.1.2175__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biolib/runtime/__init__.py +1 -0
- biolib/utils/seq_util.py +38 -35
- {pybiolib-1.1.2173.dist-info → pybiolib-1.1.2175.dist-info}/METADATA +1 -1
- {pybiolib-1.1.2173.dist-info → pybiolib-1.1.2175.dist-info}/RECORD +7 -7
- {pybiolib-1.1.2173.dist-info → pybiolib-1.1.2175.dist-info}/LICENSE +0 -0
- {pybiolib-1.1.2173.dist-info → pybiolib-1.1.2175.dist-info}/WHEEL +0 -0
- {pybiolib-1.1.2173.dist-info → pybiolib-1.1.2175.dist-info}/entry_points.txt +0 -0
biolib/runtime/__init__.py
CHANGED
biolib/utils/seq_util.py
CHANGED
@@ -1,32 +1,26 @@
|
|
1
1
|
import re
|
2
2
|
from io import BufferedIOBase
|
3
|
-
from biolib.typing_utils import List, Optional, Dict, Union
|
4
3
|
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
def find_invalid_sequence_characters(sequence):
|
9
|
-
invalid_chars = [char for char in sequence if char not in allowed_sequence_chars]
|
10
|
-
return invalid_chars
|
4
|
+
from biolib.typing_utils import Dict, List, Optional, Union
|
11
5
|
|
12
6
|
|
13
7
|
class SeqUtilRecord:
|
14
8
|
def __init__(
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
9
|
+
self,
|
10
|
+
sequence: str,
|
11
|
+
sequence_id: str,
|
12
|
+
description: Optional['str'],
|
13
|
+
properties: Optional[Dict[str, str]] = None,
|
20
14
|
):
|
21
15
|
self.sequence = sequence
|
22
16
|
self.id = sequence_id # pylint: disable=invalid-name
|
23
17
|
self.description = description
|
24
18
|
|
25
19
|
if properties:
|
26
|
-
disallowed_pattern = re.compile(r
|
20
|
+
disallowed_pattern = re.compile(r'[=\[\]\n]')
|
27
21
|
for key, value in properties.items():
|
28
|
-
assert not bool(disallowed_pattern.search(key)),
|
29
|
-
assert not bool(disallowed_pattern.search(value)),
|
22
|
+
assert not bool(disallowed_pattern.search(key)), 'Key cannot contain characters =[] and newline'
|
23
|
+
assert not bool(disallowed_pattern.search(value)), 'Value cannot contain characters =[] and newline'
|
30
24
|
self.properties = properties
|
31
25
|
else:
|
32
26
|
self.properties = {}
|
@@ -38,24 +32,24 @@ class SeqUtilRecord:
|
|
38
32
|
class SeqUtil:
|
39
33
|
@staticmethod
|
40
34
|
def parse_fasta(
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
35
|
+
input_file: Union[str, BufferedIOBase, None] = None,
|
36
|
+
default_header: Optional[str] = None,
|
37
|
+
allow_any_sequence_characters: bool = False,
|
38
|
+
allow_empty_sequence: bool = False,
|
39
|
+
file_name: Optional[str] = None,
|
46
40
|
) -> List[SeqUtilRecord]:
|
47
41
|
if input_file is None:
|
48
42
|
if file_name:
|
49
43
|
input_file = file_name
|
50
44
|
else:
|
51
|
-
raise ValueError(
|
45
|
+
raise ValueError('input_file must be a file name (str) or a BufferedIOBase object')
|
52
46
|
if isinstance(input_file, str):
|
53
|
-
with open(input_file
|
47
|
+
with open(input_file) as file_handle:
|
54
48
|
data = file_handle.read().strip()
|
55
49
|
elif isinstance(input_file, BufferedIOBase):
|
56
50
|
data = input_file.read().decode('utf-8')
|
57
51
|
else:
|
58
|
-
raise ValueError(
|
52
|
+
raise ValueError('input_file must be a file name (str) or a BufferedIOBase object')
|
59
53
|
if not data:
|
60
54
|
return []
|
61
55
|
|
@@ -71,9 +65,9 @@ class SeqUtil:
|
|
71
65
|
raise Exception(f'No header line found in FASTA file "{file_name}"')
|
72
66
|
|
73
67
|
splitted = []
|
74
|
-
tmp_data =
|
68
|
+
tmp_data = ''
|
75
69
|
for line in data.splitlines():
|
76
|
-
if line.startswith(
|
70
|
+
if line.startswith('>'):
|
77
71
|
if tmp_data:
|
78
72
|
splitted.append(tmp_data)
|
79
73
|
tmp_data = line[1:].strip() + '\n'
|
@@ -89,23 +83,20 @@ class SeqUtil:
|
|
89
83
|
sequence_data_splitted = sequence_data.strip().split('\n')
|
90
84
|
header_line = sequence_data_splitted[0].split()
|
91
85
|
sequence_id = header_line[0]
|
92
|
-
description = sequence_data_splitted[0][len(sequence_id):].strip()
|
93
|
-
sequence =
|
86
|
+
description = sequence_data_splitted[0][len(sequence_id) :].strip()
|
87
|
+
sequence = ''.join([seq.strip() for seq in sequence_data_splitted[1:]])
|
94
88
|
|
95
89
|
if not allow_any_sequence_characters:
|
96
|
-
invalid_sequence_characters =
|
90
|
+
invalid_sequence_characters = SeqUtil._find_invalid_sequence_characters(sequence)
|
97
91
|
if len(invalid_sequence_characters) > 0:
|
98
92
|
raise Exception(
|
99
93
|
f'Error: Invalid character ("{invalid_sequence_characters[0]}") found in sequence {sequence_id}'
|
100
94
|
)
|
101
95
|
if not allow_empty_sequence and len(sequence) == 0:
|
102
|
-
raise Exception(
|
103
|
-
|
104
|
-
|
96
|
+
raise Exception(f'Error: No sequence found for fasta entry {sequence_id}')
|
97
|
+
|
98
|
+
parsed_sequences.append(SeqUtilRecord(sequence=sequence, sequence_id=sequence_id, description=description))
|
105
99
|
|
106
|
-
parsed_sequences.append(
|
107
|
-
SeqUtilRecord(sequence=sequence, sequence_id=sequence_id, description=description)
|
108
|
-
)
|
109
100
|
return parsed_sequences
|
110
101
|
|
111
102
|
@staticmethod
|
@@ -116,5 +107,17 @@ class SeqUtil:
|
|
116
107
|
if record.properties:
|
117
108
|
for key, value in record.properties.items():
|
118
109
|
optional_description += f' [{key}={value}]'
|
119
|
-
sequence = '\n'.join(record.sequence[i:i + 80] for i in range(0, len(record.sequence), 80))
|
110
|
+
sequence = '\n'.join(record.sequence[i : i + 80] for i in range(0, len(record.sequence), 80))
|
120
111
|
file_handle.write(f'>{record.id}{optional_description}\n{sequence}\n')
|
112
|
+
|
113
|
+
@staticmethod
|
114
|
+
def _find_invalid_sequence_characters(sequence: str) -> List[str]:
|
115
|
+
allowed_sequence_chars = set('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_.')
|
116
|
+
invalid_chars = [char for char in sequence if char not in allowed_sequence_chars]
|
117
|
+
return invalid_chars
|
118
|
+
|
119
|
+
@staticmethod
|
120
|
+
def _find_invalid_sequence_id_characters(sequence: str) -> List[str]:
|
121
|
+
allowed_chars = set('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_.:*#')
|
122
|
+
invalid_chars = [char for char in sequence if char not in allowed_chars]
|
123
|
+
return invalid_chars
|
@@ -102,7 +102,7 @@ biolib/jobs/__init__.py,sha256=aIb2H2DHjQbM2Bs-dysFijhwFcL58Blp0Co0gimED3w,32
|
|
102
102
|
biolib/jobs/job.py,sha256=OfG8cLd3AjGjiMWRlJRZdVVbLsRWSX-OM5nxJhR6mPQ,19136
|
103
103
|
biolib/jobs/job_result.py,sha256=rALHiKYNaC9lHi_JJqBob1RubzNLwG9Z386kwRJjd2M,5885
|
104
104
|
biolib/jobs/types.py,sha256=qhadtH2KDC2WUOOqPiwke0YgtQY4FtuB71Stekq1k48,970
|
105
|
-
biolib/runtime/__init__.py,sha256=
|
105
|
+
biolib/runtime/__init__.py,sha256=MlRepA11n2H-3plB5rzWyyHK2JmP6PiaP3i6x3vt0mg,506
|
106
106
|
biolib/sdk/__init__.py,sha256=qJ_V_Edxolzi4VBQCrvem5lYIkJ0FVH3VZepSDuXjTc,1895
|
107
107
|
biolib/tables.py,sha256=acH7VjwAbadLo8P84FSnKEZxCTVsF5rEg9VPuxElNs8,872
|
108
108
|
biolib/templates/__init__.py,sha256=Yx62sSyDCDesRQDQgmbDsLpfgEh93fWE8r9u4g2azXk,36
|
@@ -114,10 +114,10 @@ biolib/utils/__init__.py,sha256=fwjciJyJicvYyZcVTzfDBgD0SKY13DeXqvTeG4qZIy8,5548
|
|
114
114
|
biolib/utils/app_uri.py,sha256=Yq_-_VGugQhMMo6mM5f0G9yNlLkr0WK4j0Nrf3FE4xQ,2171
|
115
115
|
biolib/utils/cache_state.py,sha256=u256F37QSRIVwqKlbnCyzAX4EMI-kl6Dwu6qwj-Qmag,3100
|
116
116
|
biolib/utils/multipart_uploader.py,sha256=XvGP1I8tQuKhAH-QugPRoEsCi9qvbRk-DVBs5PNwwJo,8452
|
117
|
-
biolib/utils/seq_util.py,sha256=
|
117
|
+
biolib/utils/seq_util.py,sha256=WieuQ2RvV4QSJFUAMRVyvKXFs3YanFAmjh-CFIaQmQk,5184
|
118
118
|
biolib/utils/zip/remote_zip.py,sha256=0wErYlxir5921agfFeV1xVjf29l9VNgGQvNlWOlj2Yc,23232
|
119
|
-
pybiolib-1.1.
|
120
|
-
pybiolib-1.1.
|
121
|
-
pybiolib-1.1.
|
122
|
-
pybiolib-1.1.
|
123
|
-
pybiolib-1.1.
|
119
|
+
pybiolib-1.1.2175.dist-info/LICENSE,sha256=F2h7gf8i0agDIeWoBPXDMYScvQOz02pAWkKhTGOHaaw,1067
|
120
|
+
pybiolib-1.1.2175.dist-info/METADATA,sha256=2N5GV9P58sKVu7I1HPx2eQuP_I7igY1RIqfd1OZEQKI,1508
|
121
|
+
pybiolib-1.1.2175.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
122
|
+
pybiolib-1.1.2175.dist-info/entry_points.txt,sha256=p6DyaP_2kctxegTX23WBznnrDi4mz6gx04O5uKtRDXg,42
|
123
|
+
pybiolib-1.1.2175.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|