pybiolib 1.2.7.dev0__py3-none-any.whl → 1.2.105.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biolib/__init__.py +2 -2
- biolib/_data_record/data_record.py +22 -48
- biolib/_internal/data_record/push_data.py +67 -0
- biolib/_internal/push_application.py +34 -1
- biolib/_internal/types/__init__.py +1 -0
- biolib/_internal/types/data_record.py +1 -1
- biolib/_internal/types/resource_version.py +7 -0
- biolib/_internal/utils/multinode.py +7 -4
- biolib/compute_node/remote_host_proxy.py +9 -0
- biolib/utils/seq_util.py +63 -53
- {pybiolib-1.2.7.dev0.dist-info → pybiolib-1.2.105.dev1.dist-info}/METADATA +1 -1
- {pybiolib-1.2.7.dev0.dist-info → pybiolib-1.2.105.dev1.dist-info}/RECORD +16 -14
- /README.md → /PYPI_README.md +0 -0
- {pybiolib-1.2.7.dev0.dist-info → pybiolib-1.2.105.dev1.dist-info}/LICENSE +0 -0
- {pybiolib-1.2.7.dev0.dist-info → pybiolib-1.2.105.dev1.dist-info}/WHEEL +0 -0
- {pybiolib-1.2.7.dev0.dist-info → pybiolib-1.2.105.dev1.dist-info}/entry_points.txt +0 -0
biolib/__init__.py
CHANGED
@@ -41,8 +41,8 @@ def search(
     return apps
 
 
-def get_job(job_id: str) -> _Job:
-    return _Job.create_from_uuid(uuid=job_id)
+def get_job(job_id: str, job_token: Optional[str] = None) -> _Job:
+    return _Job.create_from_uuid(uuid=job_id, auth_token=job_token)
 
 
 def get_data_record(uri: str) -> _DataRecord:
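get_job() now accepts an optional job_token, so a job can be fetched with a job-scoped token instead of a full account login. A minimal usage sketch (the UUID and token values are placeholders):

```python
import biolib

# Placeholder values; in practice both come from a previously started job.
job = biolib.get_job(
    job_id='11111111-2222-3333-4444-555555555555',
    job_token='job-scoped-auth-token',
)
job.stream_logs()
```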
biolib/_data_record/data_record.py
CHANGED
@@ -6,19 +6,21 @@ from pathlib import Path
 from struct import Struct
 from typing import Callable, Dict, List, Optional, Union, cast
 
-from biolib import api
+from biolib import api
 from biolib._internal import types
 from biolib._internal.data_record import get_data_record_state_from_uri
 from biolib._internal.data_record.data_record import validate_sqlite_v1
+from biolib._internal.data_record.push_data import (
+    push_data_path,
+    validate_data_path_and_get_files_and_size_of_directory,
+)
 from biolib._internal.data_record.remote_storage_endpoint import DataRecordRemoteStorageEndpoint
-from biolib._internal.file_utils import get_files_and_size_of_directory, get_iterable_zip_stream
 from biolib._internal.http_client import HttpClient
 from biolib.api import client as api_client
 from biolib.biolib_api_client import BiolibApiClient
 from biolib.biolib_api_client.lfs_types import DataRecordInfo, DataRecordVersion, DataRecordVersionInfo
 from biolib.biolib_binary_format import LazyLoadedFile
 from biolib.biolib_binary_format.utils import RemoteIndexableBuffer
-from biolib.biolib_errors import BioLibError
 from biolib.biolib_logging import logger
 from biolib.utils.app_uri import parse_app_uri
 from biolib.utils.zip.remote_zip import RemoteZip
@@ -85,18 +87,8 @@ class DataRecord:
         self.download_files(output_dir=output_dir, path_filter=path_filter)
 
     def update(self, data_path: str, chunk_size_in_mb: Optional[int] = None) -> None:
-        assert os.path.isdir(data_path), f'The path "{data_path}" is not a directory.'
         BiolibApiClient.assert_is_signed_in(authenticated_action_description='push data to a Data Record')
-
-        if os.path.realpath(data_path) == '/':
-            raise BioLibError('Pushing your root directory is not possible')
-
-        original_working_dir = os.getcwd()
-        os.chdir(data_path)
-        files_to_zip, data_size_in_bytes = get_files_and_size_of_directory(directory=os.getcwd())
-
-        if data_size_in_bytes > 4_500_000_000_000:
-            raise BioLibError('Attempted to push directory with a size larger than the limit of 4.5 TB')
+        files_to_zip, data_size_in_bytes = validate_data_path_and_get_files_and_size_of_directory(data_path)
 
         # validate data record
         detailed_dict: types.DataRecordDetailedDict = self._get_detailed_dict()
@@ -114,40 +106,23 @@ class DataRecord:
         else:
             raise Exception(f"Error processing data record validation: unknown rule type {rule['type']}")
 
-        min_chunk_size_bytes = 10_000_000
-        chunk_size_in_bytes: int
-        if chunk_size_in_mb:
-            chunk_size_in_bytes = chunk_size_in_mb * 1_000_000  # Convert megabytes to bytes
-            if chunk_size_in_bytes < min_chunk_size_bytes:
-                logger.warning('Specified chunk size is too small, using minimum of 10 MB instead.')
-                chunk_size_in_bytes = min_chunk_size_bytes
-        else:
-            # Calculate chunk size based on max chunk count of 10_000, using 9_000 to be on the safe side
-            chunk_size_in_bytes = max(min_chunk_size_bytes, int(data_size_in_bytes / 9_000))
-
-        data_size_in_mb = round(data_size_in_bytes / 10**6)
-        logger.info(f'Zipping {len(files_to_zip)} files, in total ~{data_size_in_mb}mb of data')
-
         response = api.client.post(path='/lfs/versions/', data={'resource_uuid': self._state['resource_uuid']})
         data_record_version: DataRecordVersion = response.json()
-
-        iterable_zip_stream = get_iterable_zip_stream(files=files_to_zip, chunk_size=chunk_size_in_bytes)
-        multipart_uploader = utils.MultiPartUploader(
-            use_process_pool=True,
-            get_presigned_upload_url_request=dict(
-                headers=None,
-                requires_biolib_auth=True,
-                path=f"/lfs/versions/{data_record_version['uuid']}/presigned_upload_url/",
-            ),
-            complete_upload_request=dict(
-                headers=None,
-                requires_biolib_auth=True,
-                path=f"/lfs/versions/{data_record_version['uuid']}/complete_upload/",
-            ),
+        resource_version_uuid = data_record_version['uuid']
+
+        push_data_path(
+            data_path=data_path,
+            data_size_in_bytes=data_size_in_bytes,
+            files_to_zip=files_to_zip,
+            resource_version_uuid=resource_version_uuid,
+            chunk_size_in_mb=chunk_size_in_mb,
+        )
+
+        api.client.patch(
+            path=f'/resources/versions/{resource_version_uuid}/',
+            data={'state': 'published', 'set_as_active': True},
         )
 
-        multipart_uploader.upload(payload_iterator=iterable_zip_stream, payload_size_in_bytes=data_size_in_bytes)
-        os.chdir(original_working_dir)
         logger.info(f"Successfully pushed a new Data Record version '{data_record_version['uri']}'")
         self._state = get_data_record_state_from_uri(data_record_version['uri'])
@@ -177,12 +152,11 @@ class DataRecord:
         data_record_info: DataRecordInfo = response.json()
         logger.info(f"Successfully created new Data Record '{data_record_info['uri']}'")
 
+        data_record = DataRecord.get_by_uri(uri=data_record_info['uri'])
         if data_path is not None:
-            data_record = DataRecord.get_by_uri(uri=data_record_info['uri'])
             data_record.update(data_path=data_path)
-
-
-        return DataRecord.get_by_uri(uri=data_record_info['uri'])
+
+        return data_record
 
     @staticmethod
     def fetch(uri: Optional[str] = None, count: Optional[int] = None) -> List['DataRecord']:
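With this refactor, create() fetches the new record once, optionally pushes an initial version into it, and returns that same object instead of fetching the record a second time. A hedged usage sketch, assuming a create(destination=..., data_path=...) signature (not shown in this hunk) and hypothetical account and directory names:

```python
from biolib._data_record.data_record import DataRecord

# Create a record and push an initial version from a local directory
# (assumes './data' exists and is non-empty).
record = DataRecord.create(destination='myaccount/my-record', data_path='./data')

# Push a later version; chunk_size_in_mb is optional and falls back to the
# automatic sizing implemented in push_data_path().
record.update(data_path='./data', chunk_size_in_mb=50)
```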
biolib/_internal/data_record/push_data.py
ADDED
@@ -0,0 +1,67 @@
+import os
+
+from biolib import utils
+from biolib._internal.file_utils import get_files_and_size_of_directory, get_iterable_zip_stream
+from biolib._internal.types.typing import List, Optional, Tuple
+from biolib.biolib_errors import BioLibError
+from biolib.biolib_logging import logger
+
+
+def validate_data_path_and_get_files_and_size_of_directory(data_path: str) -> Tuple[List[str], int]:
+    assert os.path.isdir(data_path), f'The path "{data_path}" is not a directory.'
+
+    if os.path.realpath(data_path) == '/':
+        raise BioLibError('Pushing your root directory is not possible')
+
+    original_working_dir = os.getcwd()
+    os.chdir(data_path)
+    files_to_zip, data_size_in_bytes = get_files_and_size_of_directory(directory=os.getcwd())
+    os.chdir(original_working_dir)
+
+    if data_size_in_bytes > 4_500_000_000_000:
+        raise BioLibError('Attempted to push directory with a size larger than the limit of 4.5 TB')
+
+    return files_to_zip, data_size_in_bytes
+
+
+def push_data_path(
+    data_path: str,
+    data_size_in_bytes: int,
+    files_to_zip: List[str],
+    resource_version_uuid: str,
+    chunk_size_in_mb: Optional[int] = None,
+) -> None:
+    original_working_dir = os.getcwd()
+    os.chdir(data_path)
+
+    min_chunk_size_bytes = 10_000_000
+    chunk_size_in_bytes: int
+    if chunk_size_in_mb:
+        chunk_size_in_bytes = chunk_size_in_mb * 1_000_000  # Convert megabytes to bytes
+        if chunk_size_in_bytes < min_chunk_size_bytes:
+            logger.warning('Specified chunk size is too small, using minimum of 10 MB instead.')
+            chunk_size_in_bytes = min_chunk_size_bytes
+    else:
+        # Calculate chunk size based on max chunk count of 10_000, using 9_000 to be on the safe side
+        chunk_size_in_bytes = max(min_chunk_size_bytes, int(data_size_in_bytes / 9_000))
+
+    data_size_in_mb = round(data_size_in_bytes / 10**6)
+    logger.info(f'Zipping {len(files_to_zip)} files, in total ~{data_size_in_mb}mb of data')
+
+    iterable_zip_stream = get_iterable_zip_stream(files=files_to_zip, chunk_size=chunk_size_in_bytes)
+    multipart_uploader = utils.MultiPartUploader(
+        use_process_pool=True,
+        get_presigned_upload_url_request=dict(
+            headers=None,
+            requires_biolib_auth=True,
+            path=f'/lfs/versions/{resource_version_uuid}/presigned_upload_url/',
+        ),
+        complete_upload_request=dict(
+            headers=None,
+            requires_biolib_auth=True,
+            path=f'/lfs/versions/{resource_version_uuid}/complete_upload/',
+        ),
+    )
+
+    multipart_uploader.upload(payload_iterator=iterable_zip_stream, payload_size_in_bytes=data_size_in_bytes)
+    os.chdir(original_working_dir)
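The upload helpers were extracted here so that Data Records and app pushes can share them. Note the chunk sizing rule: an explicit chunk_size_in_mb is clamped to a 10 MB minimum, otherwise the chunk size targets at most ~9,000 parts to stay under the 10,000-chunk limit. A standalone sketch of that arithmetic (it mirrors, but does not call, the library code):

```python
MIN_CHUNK_SIZE_BYTES = 10_000_000  # 10 MB floor, same constant as push_data_path

def default_chunk_size_in_bytes(data_size_in_bytes: int) -> int:
    # Mirrors push_data_path: stay safely under the 10_000-chunk limit
    # by targeting at most ~9_000 chunks.
    return max(MIN_CHUNK_SIZE_BYTES, int(data_size_in_bytes / 9_000))

assert default_chunk_size_in_bytes(1_000_000_000) == 10_000_000    # 1 GB -> 10 MB floor
assert default_chunk_size_in_bytes(450_000_000_000) == 50_000_000  # 450 GB -> 50 MB chunks
```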
biolib/_internal/push_application.py
CHANGED
@@ -6,6 +6,10 @@ import rich.progress
 import yaml
 
 from biolib import api, utils
+from biolib._internal.data_record.push_data import (
+    push_data_path,
+    validate_data_path_and_get_files_and_size_of_directory,
+)
 from biolib._internal.file_utils import get_files_and_size_of_directory, get_iterable_zip_stream
 from biolib.biolib_api_client import BiolibApiClient
 from biolib.biolib_api_client.biolib_app_api import BiolibAppApi
@@ -94,10 +98,25 @@ def push_application(
     zip_filters.add('.biolib/config.yml')
 
     input_files_maps_to_root = False
+    app_data_path: Optional[Path] = None
     try:
         with open(config_yml_path) as config_yml_file:
             config = yaml.safe_load(config_yml_file.read())
 
+        app_data = config.get('app_data')
+        if app_data:
+            if not isinstance(app_data, str):
+                raise BioLibError(
+                    f'In .biolib/config.yml the value of "app_data" must be a string but got {type(app_data)}'
+                )
+
+            app_data_path = app_path_absolute.joinpath(app_data).resolve()
+            if not app_data_path.is_dir():
+                raise BioLibError(
+                    'In .biolib/config.yml the value of "app_data" must be a path to a directory '
+                    'in the application directory'
+                )
+
         license_file_relative_path = config.get('license_file', 'LICENSE')
         if app_path_absolute.joinpath(license_file_relative_path).is_file():
             zip_filters.add(license_file_relative_path)
@@ -118,11 +137,14 @@ def push_application(
         if module.get('source_files'):
             zip_filters.add('*')
 
-        for mapping in module
+        for mapping in module.get('input_files', []):
             mapping_parts = mapping.split(' ')
             if len(mapping_parts) == 3 and mapping_parts[2] == '/':
                 input_files_maps_to_root = True
 
+    except BioLibError as error:
+        raise error from None
+
     except Exception as error:
         raise BioLibError('Failed to parse the .biolib/config.yml file') from error
 
@@ -175,6 +197,17 @@ def push_application(
         else None,
     )
 
+    if app_data_path:
+        app_data_files_to_zip, app_data_size_in_bytes = validate_data_path_and_get_files_and_size_of_directory(
+            data_path=str(app_data_path),
+        )
+        push_data_path(
+            resource_version_uuid=new_app_version_json['public_id'],
+            data_path=str(app_data_path),
+            data_size_in_bytes=app_data_size_in_bytes,
+            files_to_zip=app_data_files_to_zip,
+        )
+
     # Don't push docker images if copying from another app version
     docker_tags = new_app_version_json.get('docker_tags', {})
     if not app_version_to_copy_images_from and docker_tags:
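Together these changes let an app declare an app_data directory in .biolib/config.yml that gets validated, zipped, and uploaded against the new app version's public_id, reusing the same push helpers as Data Records. A minimal hypothetical config illustrating the new key (all other fields elided):

```yaml
biolib_version: 2
app_data: data  # relative path to a directory inside the application directory
```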
biolib/_internal/types/data_record.py
CHANGED
@@ -7,7 +7,7 @@ class SqliteV1ForeignKey(TypedDict):
 
 
 class SqliteV1Column(TypedDict):
-    type: Literal['INTEGER', 'REAL', 'TEXT', 'JSON']
+    type: Literal['INTEGER', 'REAL', 'TEXT', 'JSON']
     nullable: Optional[bool]
     foreign_key: Optional[SqliteV1ForeignKey]
     json_schema: Optional[Dict]
biolib/_internal/utils/multinode.py
CHANGED
@@ -118,7 +118,9 @@ def fasta_batch_records(fasta_file, work_per_batch_min, work_per_residue=1, verb
     return batches
 
 
-def fasta_send_batches_biolib(
+def fasta_send_batches_biolib(
+    app_url, batches, args, args_fasta='fasta', machine='cpu.large', stream_all_jobs=True, verbose=1
+):
     """
     Send jobs through pybiolib interface
     """
@@ -128,7 +130,6 @@ def fasta_send_batches_biolib(app_url, batches, args, args_fasta='fasta', verbos
 
     # Login to biolib, prepare app
     # current_app = biolib.load(Runtime.get_app_uri())
-    biolib.login()
     current_app = biolib.load(app_url)  # Nb: uses "_" not "-"
 
     # Compute results
@@ -149,7 +150,7 @@ def fasta_send_batches_biolib(app_url, batches, args, args_fasta='fasta', verbos
         new_args_list = _args_dict_to_args_list(new_args)
 
         # Send job
-        job = current_app.cli(args=new_args_list, blocking=False)
+        job = current_app.cli(args=new_args_list, blocking=False, machine=machine)
         job_list.append(job)
 
     # Job stats
@@ -161,7 +162,9 @@ def fasta_send_batches_biolib(app_url, batches, args, args_fasta='fasta', verbos
     # Stream job output at a time
     print('Streaming job outputs ...')
     for i, job in enumerate(job_list):
-        job
+        # Try to print if verbose. Always on first job, otherwise only if stream_all_jobs set
+        if (i == 0 and verbose) or (stream_all_jobs and verbose):
+            job.stream_logs()
 
     # Check if job succeeded
     assert job.get_exit_code() == 0, f'Job failed with exit code {job.get_exit_code()}'
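The multinode helpers thus gain control over machine size and log streaming instead of hard-coding them. A hedged usage sketch (the app URL and FASTA file are placeholders; the batching call follows the fasta_batch_records signature shown above):

```python
from biolib._internal.utils.multinode import fasta_batch_records, fasta_send_batches_biolib

# Split a local FASTA file into batches of roughly equal work.
batches = fasta_batch_records('input.fasta', work_per_batch_min=100_000)

fasta_send_batches_biolib(
    app_url='myaccount/my_app',  # Nb: uses "_" not "-"
    batches=batches,
    args={},
    machine='cpu.large',    # new parameter; previously not configurable
    stream_all_jobs=False,  # new parameter; only stream the first job's logs
)
```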
biolib/compute_node/remote_host_proxy.py
CHANGED
@@ -291,6 +291,15 @@ http {{
             proxy_ssl_server_name on;
         }}
 
+        location /api/resources/data-records/ {{
+            proxy_pass https://$upstream_hostname$request_uri;
+            proxy_set_header authorization "";
+            proxy_set_header compute-node-auth-token "{compute_node_auth_token}";
+            proxy_set_header job-uuid "{self._job_uuid}";
+            proxy_set_header cookie "";
+            proxy_ssl_server_name on;
+        }}
+
         location /api/ {{
             proxy_pass https://$upstream_hostname$request_uri;
             proxy_set_header authorization "";
biolib/utils/seq_util.py
CHANGED
@@ -1,7 +1,7 @@
 import re
-from io import BufferedIOBase
+from io import BufferedIOBase, TextIOBase
 
-from biolib.typing_utils import Dict, List, Optional, Union
+from biolib.typing_utils import Dict, List, Optional, Union, Iterator
 
 
 class SeqUtilRecord:
@@ -37,67 +37,77 @@ class SeqUtil:
         allow_any_sequence_characters: bool = False,
         allow_empty_sequence: bool = True,
         file_name: Optional[str] = None,
-
+    ) -> Iterator[SeqUtilRecord]:
+        def process_and_yield_record(header: str, sequence_lines: List[str]):
+            sequence = ''.join(sequence_lines)
+            sequence_id = header.split()[0]
+            if not allow_any_sequence_characters:
+                invalid_sequence_characters = SeqUtil._find_invalid_sequence_characters(sequence)
+                if invalid_sequence_characters:
+                    raise Exception(
+                        f'Error: Invalid character ("{invalid_sequence_characters[0]}") found in sequence {sequence_id}'
+                    )
+            if not allow_empty_sequence and not sequence:
+                raise Exception(f'Error: No sequence found for fasta entry {sequence_id}')
+            yield SeqUtilRecord(
+                sequence=sequence,
+                sequence_id=sequence_id,
+                description=header[len(sequence_id):].strip()
+            )
+
+        def line_generator_from_buffered_io_base(file_handle: BufferedIOBase) -> Iterator[str]:
+            for line in file_handle:
+                yield line.decode('utf-8')
+
+        def line_generator_from_text_io_base(file_handle: TextIOBase) -> Iterator[str]:
+            for line in file_handle:
+                yield line
+
         if input_file is None:
             if file_name:
                 input_file = file_name
             else:
                 raise ValueError('input_file must be a file name (str) or a BufferedIOBase object')
+
+        file_handle = None
         if isinstance(input_file, str):
-
-
+            file_handle = open(input_file, "rb")
+            line_iterator = line_generator_from_buffered_io_base(file_handle)
         elif isinstance(input_file, BufferedIOBase):
-
+            line_iterator = line_generator_from_buffered_io_base(input_file)
+        elif isinstance(input_file, TextIOBase):
+            line_iterator = line_generator_from_text_io_base(input_file)
         else:
             raise ValueError('input_file must be a file name (str) or a BufferedIOBase object')
-        if not data:
-            return []
-
-        if '>' not in data:
-            if default_header:
-                lines_with_header = []
-                for index, line in enumerate(data.split('\n')):
-                    index_string = str(index + 1) if index > 0 else ''
-                    lines_with_header.append(f'>{default_header}{index_string}\n{line}')
-
-                data = '\n'.join(lines_with_header)
-            else:
-                raise Exception(f'No header line found in FASTA file "{file_name}"')
-
-        splitted = []
-        tmp_data = ''
-        for line in data.splitlines():
-            if line.startswith('>'):
-                if tmp_data:
-                    splitted.append(tmp_data)
-                tmp_data = line[1:].strip() + '\n'
-            else:
-                if line.strip():
-                    tmp_data += line.strip() + '\n'
-
-        if tmp_data:
-            splitted.append(tmp_data)
-
-        parsed_sequences = []
-        for sequence_data in splitted:
-            sequence_data_splitted = sequence_data.strip().split('\n')
-            header_line = sequence_data_splitted[0].split()
-            sequence_id = header_line[0]
-            description = sequence_data_splitted[0][len(sequence_id) :].strip()
-            sequence = ''.join([seq.strip() for seq in sequence_data_splitted[1:]])
-
-            if not allow_any_sequence_characters:
-                invalid_sequence_characters = SeqUtil._find_invalid_sequence_characters(sequence)
-                if len(invalid_sequence_characters) > 0:
-                    raise Exception(
-                        f'Error: Invalid character ("{invalid_sequence_characters[0]}") found in sequence {sequence_id}'
-                    )
-            if not allow_empty_sequence and len(sequence) == 0:
-                raise Exception(f'Error: No sequence found for fasta entry {sequence_id}')
-
-            parsed_sequences.append(SeqUtilRecord(sequence=sequence, sequence_id=sequence_id, description=description))
 
-
+        header = None
+        sequence_lines: List[str] = []
+
+        try:
+            for line_number, line in enumerate(line_iterator):
+                line = line.strip()
+                if not line:
+                    continue  # skip empty lines
+                if line.startswith('>'):
+                    if header is not None:
+                        yield from process_and_yield_record(header, sequence_lines)
+
+                    header = line[1:].strip()
+                    sequence_lines = []
+                else:
+                    if header is None:
+                        if default_header:
+                            yield from process_and_yield_record(f"{default_header}{line_number}", [line])
+                        else:
+                            raise Exception(f'No header line found in FASTA file "{file_name}"')
+                    else:
+                        sequence_lines.append(line)
+
+            if header is not None:
+                yield from process_and_yield_record(header, sequence_lines)
+        finally:
+            if file_handle:
+                file_handle.close()
 
     @staticmethod
     def write_records_to_fasta(file_name: str, records: List[SeqUtilRecord]) -> None:
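The parser is now a true generator: records are yielded while the input is read, so large FASTA files no longer have to fit in memory, and plain-text handles (TextIOBase) are accepted alongside file names and binary handles. A small sketch, assuming the method shown is SeqUtil.parse_fasta (its name sits outside this hunk) and that records.fasta exists locally:

```python
from io import StringIO

from biolib.utils.seq_util import SeqUtil

# Stream records lazily from a file on disk.
for record in SeqUtil.parse_fasta('records.fasta'):
    print(record.sequence_id, len(record.sequence))

# Text handles now work too, not just file names and binary handles.
handle = StringIO('>seq1 example record\nACGT\nACGT\n')
records = list(SeqUtil.parse_fasta(input_file=handle))
assert records[0].sequence == 'ACGTACGT'
```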
{pybiolib-1.2.7.dev0.dist-info → pybiolib-1.2.105.dev1.dist-info}/RECORD
CHANGED
@@ -1,10 +1,11 @@
 LICENSE,sha256=F2h7gf8i0agDIeWoBPXDMYScvQOz02pAWkKhTGOHaaw,1067
-
-biolib/__init__.py,sha256=
-biolib/_data_record/data_record.py,sha256=
+PYPI_README.md,sha256=_IH7pxFiqy2bIAmaVeA-iVTyUwWRjMIlfgtUbYTtmls,368
+biolib/__init__.py,sha256=q_YhAYw51Vq16IKtSk8_MJclDAa4CfCmPhvWDYmrSIg,4393
+biolib/_data_record/data_record.py,sha256=zVAhFU1RLI1-ptoQ_l639RNwrMANXV9j75yXHvB7dtA,10950
 biolib/_internal/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 biolib/_internal/data_record/__init__.py,sha256=fGdME6JGRU_2VxpJbYpGXYndjN-feUkmKY4fuMyq3cg,76
 biolib/_internal/data_record/data_record.py,sha256=g_-jdy5-Zem3dthwxJj2OuQqkDGTyc-iGqN1rtYYD1A,4418
+biolib/_internal/data_record/push_data.py,sha256=-L3a_7zZzDCXabBu3O4lWPMAMeBbeRPTrBlEM-_5SCI,2693
 biolib/_internal/data_record/remote_storage_endpoint.py,sha256=eCptuZ4DMAPnaNCVDvpWXwXGI6Jac9U1N5dqU8Cj95Q,1732
 biolib/_internal/file_utils.py,sha256=4jT6j7bB21c0JNn5BfnyWQib_zt0CVtJ_TiOFOStRcE,2604
 biolib/_internal/fuse_mount/__init__.py,sha256=B_tM6RM2dBw-vbpoHJC4X3tOAaN1H2RDvqYJOw3xFwg,55
@@ -14,16 +15,17 @@ biolib/_internal/lfs/__init__.py,sha256=gSWo_xg61UniYgD7yNYxeT4I9uaXBCBSi3_nmZjn
 biolib/_internal/lfs/cache.py,sha256=pQS2np21rdJ6I3DpoOutnzPHpLOZgUIS8TMltUJk_k4,2226
 biolib/_internal/libs/__init__.py,sha256=Jdf4tNPqe_oIIf6zYml6TiqhL_02Vyqwge6IELrAFhw,98
 biolib/_internal/libs/fusepy/__init__.py,sha256=AWDzNFS-XV_5yKb0Qx7kggIhPzq1nj_BZS5y2Nso08k,41944
-biolib/_internal/push_application.py,sha256=
+biolib/_internal/push_application.py,sha256=mKs3kIKW-ZYfz3Cy6LIyFBwsWkbcGZ9zgMk-xn5NDyg,11660
 biolib/_internal/runtime.py,sha256=BiHl4klUHr36MCpqKaUso4idHeBZfPAahLYRQrabFqA,486
-biolib/_internal/types/__init__.py,sha256=
+biolib/_internal/types/__init__.py,sha256=xLgOQJFh3GRtiqIJq7MaqHReZx4pp34_zcaFQ_JjuJ4,198
 biolib/_internal/types/app.py,sha256=Mz2QGD_jESX-K9JYnLWPo4YA__Q_1FQQTk9pvidCohU,118
-biolib/_internal/types/data_record.py,sha256=
+biolib/_internal/types/data_record.py,sha256=9r_vdhVs60YTnzU4XQFXfDrfS2P2MqD3BH2xa7lk6ck,852
 biolib/_internal/types/experiment.py,sha256=D94iBdn2nS92lRW-TOs1a2WKXJD5ZtmzL4ypggKX2ys,176
 biolib/_internal/types/resource.py,sha256=G-vPkZoe4Um6FPxsQZtRzAlbSW5sDW4NFkbjn21I3V4,372
+biolib/_internal/types/resource_version.py,sha256=sLxViYXloDDUhTDFgjegiQCj097OM1Ih1-uqlC_4ULA,174
 biolib/_internal/types/typing.py,sha256=D4EKKEe7kDx0K6lJi-H_XLtk-8w6nu2fdqn9bvzI-Xo,288
 biolib/_internal/utils/__init__.py,sha256=p5vsIFyu-zYqBgdSMfwW9NC_jk7rXvvCbV4Bzd3As7c,630
-biolib/_internal/utils/multinode.py,sha256=
+biolib/_internal/utils/multinode.py,sha256=zWrQhcVK5u_xdWX2oIM-D_2fINqNPlqF_h71fu4K8LY,8279
 biolib/_runtime/runtime.py,sha256=oVgTnDDJv9L4BUP1_sd0oAj4LLyyiPSQdhp7ixWARvw,2923
 biolib/api/__init__.py,sha256=mQ4u8FijqyLzjYMezMUUbbBGNB3iFmkNdjXnWPZ7Jlw,138
 biolib/api/client.py,sha256=FRpdH5aI187b_I_4HUNi680v4iOP65z5f2RcUo8D8MA,3559
@@ -87,7 +89,7 @@ biolib/compute_node/job_worker/large_file_system.py,sha256=XXqRlVtYhs-Ji9zQGIk5K
 biolib/compute_node/job_worker/mappings.py,sha256=Z48Kg4nbcOvsT2-9o3RRikBkqflgO4XeaWxTGz-CNvI,2499
 biolib/compute_node/job_worker/utilization_reporter_thread.py,sha256=7tm5Yk9coqJ9VbEdnO86tSXI0iM0omwIyKENxdxiVXk,8575
 biolib/compute_node/job_worker/utils.py,sha256=wgxcIA8yAhUPdCwyvuuJ0JmreyWmmUoBO33vWtG60xg,1282
-biolib/compute_node/remote_host_proxy.py,sha256=
+biolib/compute_node/remote_host_proxy.py,sha256=eTT7x7ht_cxXMQ-0yXCvhKZW6mKeYM4KrfBf75KTbc8,15651
 biolib/compute_node/socker_listener_thread.py,sha256=T5_UikA3MB9bD5W_dckYLPTgixh72vKUlgbBvj9dbM0,1601
 biolib/compute_node/socket_sender_thread.py,sha256=YgamPHeUm2GjMFGx8qk-99WlZhEs-kAb3q_2O6qByig,971
 biolib/compute_node/utils.py,sha256=M7i_WTyxbFM3Lri9RWZ_8FeQNYrQIWpKGLfp2I55oeY,4677
@@ -115,10 +117,10 @@ biolib/utils/__init__.py,sha256=fwjciJyJicvYyZcVTzfDBgD0SKY13DeXqvTeG4qZIy8,5548
 biolib/utils/app_uri.py,sha256=Yq_-_VGugQhMMo6mM5f0G9yNlLkr0WK4j0Nrf3FE4xQ,2171
 biolib/utils/cache_state.py,sha256=u256F37QSRIVwqKlbnCyzAX4EMI-kl6Dwu6qwj-Qmag,3100
 biolib/utils/multipart_uploader.py,sha256=XvGP1I8tQuKhAH-QugPRoEsCi9qvbRk-DVBs5PNwwJo,8452
-biolib/utils/seq_util.py,sha256=
+biolib/utils/seq_util.py,sha256=WJnU9vZdwY8RHXvzATyV80OXzyJ7w9EkG33Tna9Nr6A,5698
 biolib/utils/zip/remote_zip.py,sha256=0wErYlxir5921agfFeV1xVjf29l9VNgGQvNlWOlj2Yc,23232
-pybiolib-1.2.
-pybiolib-1.2.
-pybiolib-1.2.
-pybiolib-1.2.
-pybiolib-1.2.
+pybiolib-1.2.105.dev1.dist-info/LICENSE,sha256=F2h7gf8i0agDIeWoBPXDMYScvQOz02pAWkKhTGOHaaw,1067
+pybiolib-1.2.105.dev1.dist-info/METADATA,sha256=8gXSVU8uvqONlUb6KfFDPrrnjV4z9abxHxZ1FL4EtTo,1512
+pybiolib-1.2.105.dev1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+pybiolib-1.2.105.dev1.dist-info/entry_points.txt,sha256=p6DyaP_2kctxegTX23WBznnrDi4mz6gx04O5uKtRDXg,42
+pybiolib-1.2.105.dev1.dist-info/RECORD,,
/README.md → /PYPI_README.md
RENAMED
File without changes

{pybiolib-1.2.7.dev0.dist-info → pybiolib-1.2.105.dev1.dist-info}/LICENSE
RENAMED
File without changes

{pybiolib-1.2.7.dev0.dist-info → pybiolib-1.2.105.dev1.dist-info}/WHEEL
RENAMED
File without changes

{pybiolib-1.2.7.dev0.dist-info → pybiolib-1.2.105.dev1.dist-info}/entry_points.txt
RENAMED
File without changes