pybiolib 1.1.2236__py3-none-any.whl → 1.1.2250__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biolib/_internal/utils/multinode.py +264 -0
- biolib/cli/data_record.py +2 -0
- {pybiolib-1.1.2236.dist-info → pybiolib-1.1.2250.dist-info}/METADATA +1 -1
- {pybiolib-1.1.2236.dist-info → pybiolib-1.1.2250.dist-info}/RECORD +7 -6
- {pybiolib-1.1.2236.dist-info → pybiolib-1.1.2250.dist-info}/LICENSE +0 -0
- {pybiolib-1.1.2236.dist-info → pybiolib-1.1.2250.dist-info}/WHEEL +0 -0
- {pybiolib-1.1.2236.dist-info → pybiolib-1.1.2250.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,264 @@
|
|
1
|
+
import glob
|
2
|
+
import os
|
3
|
+
import re
|
4
|
+
import shutil
|
5
|
+
import subprocess
|
6
|
+
import tempfile
|
7
|
+
|
8
|
+
import biolib
|
9
|
+
from biolib.utils import SeqUtil
|
10
|
+
|
11
|
+
|
12
|
+
def natsorted(lst):
    """Sort the list using the natural sort key."""

    def _natural_sort_key(s):
        """A key function for natural sorting."""
        key = []
        # Split into alternating non-digit / digit chunks so 'job_10' > 'job_2'
        for chunk in re.split('([0-9]+)', s):
            key.append(int(chunk) if chunk.isdigit() else chunk.lower())
        return key

    return sorted(lst, key=_natural_sort_key)
|
20
|
+
|
21
|
+
|
22
|
+
def fasta_above_threshold(fasta_file, work_threshold, work_per_residue=1, verbose=False):
    """True if total FASTA residue work is at or above work_threshold.

    Args:
        fasta_file: Path to a FASTA file.
        work_threshold: Work-unit count at which this function returns True.
        work_per_residue: Work units attributed to each residue.
        verbose: If True, print how the threshold was (not) reached.

    Returns:
        True as soon as the accumulated work units reach work_threshold
        (remaining records are not scanned), otherwise False.
    """
    records = SeqUtil.parse_fasta(fasta_file)

    # Calculate work units, stopping early once the threshold is reached
    total_work_units = 0
    for i, record in enumerate(records):
        sequence_work_units = len(record.sequence) * work_per_residue
        total_work_units += sequence_work_units

        if total_work_units >= work_threshold:
            if verbose:
                print(f'FASTA above threshold (stopped at {total_work_units}) >= {work_threshold}')
                print(f'From {i+1}/{len(records)} sequences in {fasta_file}')
            return True

    if verbose:
        print(f'FASTA below threshold ({total_work_units}) < {work_threshold}')
        print(f'From {len(records)} sequences in {fasta_file}')

    return False
|
44
|
+
|
45
|
+
|
46
|
+
def run_locally(command_list, args):
    """Run script locally (no multi-node processing).

    Args:
        command_list: Base command, e.g. ['python3', 'predict.py'].
        args: argparse.Namespace holding the script arguments; any argument
            whose name starts with 'multinode' is stripped before running.
    """
    # Copy the mapping: vars(args) returns the live __dict__ of the Namespace,
    # so deleting keys from it directly would mutate the caller's args.
    new_args = dict(vars(args))

    # Delete multinode-specific input arguments
    for key in list(new_args.keys()):
        if str(key).startswith('multinode'):
            del new_args[key]

    # Convert to list format
    new_args_list = _args_dict_to_args_list(new_args)

    # Prepare command, e.g. ["python3", "predict.py"] + new_args_list
    command = command_list + new_args_list

    if args.verbose >= 1:
        print(f'Running {command}')

    # Run command; check=False so a non-zero exit is reported, not raised
    result = subprocess.run(command, capture_output=True, text=True, check=False)
    if result.returncode == 0:
        print(f'{result.stdout}')
    else:
        print(f'Error: {result.stderr}')
|
72
|
+
|
73
|
+
|
74
|
+
def fasta_batch_records(fasta_file, work_per_batch_min, work_per_residue=1, verbose=False):
    """Converts FASTA records to batches of records, based on thresholds.

    Args:
        fasta_file: Path to a FASTA file.
        work_per_batch_min: Minimum work units per batch; a batch is closed
            as soon as its accumulated work reaches this value.
        work_per_residue: Work units attributed to each residue.
        verbose: If True, print per-batch statistics.

    Returns:
        List of batches; each batch is a list of FASTA records.
    """

    def log_batches(batches):
        # One stats line per batch
        for i, batch in enumerate(batches):
            n_seqs = len(batch)
            n_res = sum(len(record.sequence) for record in batch)
            print(f'Batch {i+1}: {n_res} residues from {n_seqs} sequences')

    records = SeqUtil.parse_fasta(fasta_file)

    batches = []
    batch = []
    current_work_units = 0
    for record in records:
        # Add to batch, then account for its work
        batch.append(record)
        current_work_units += len(record.sequence) * work_per_residue

        # If at/above the limit, close this batch and start a new one
        if current_work_units >= work_per_batch_min:
            batches.append(batch)
            batch = []
            current_work_units = 0

    # Append last (partial) batch if present
    if batch:
        batches.append(batch)

    if verbose:
        log_batches(batches)

    return batches
|
119
|
+
|
120
|
+
|
121
|
+
def fasta_send_batches_biolib(app_url, batches, args, args_fasta='fasta', verbose=1):
    """
    Send jobs through pybiolib interface

    Each batch of FASTA records is written to a temporary FASTA file and
    submitted as a non-blocking job to the Biolib app at *app_url*; job
    outputs are then streamed sequentially and saved to job_output/job_<i+1>.

    NOTE(review): new_args = vars(args) is the live __dict__ of the caller's
    Namespace, so the keys set below (the FASTA path and
    'multinode_only_local') mutate the caller's args — confirm this is
    intended.
    NOTE(review): both the 'verbose' parameter and args.verbose are consulted
    in different places below — presumably intentional, but verify.
    """

    if args.verbose >= 1:
        print(f'Sending {len(batches)} batches to Biolib')

    # Login to biolib, prepare app
    # current_app = biolib.load(Runtime.get_app_uri())
    biolib.login()
    current_app = biolib.load(app_url)  # Nb: uses "_" not "-"

    # Compute results
    job_list = []
    for i, batch_records in enumerate(batches):  # MH
        # Write FASTA, send to server
        with tempfile.TemporaryDirectory() as tempdir:
            # New arguments (see NOTE above: this aliases args.__dict__)
            new_args = vars(args)

            # Write batched FASTA to send
            fasta_path = f'{tempdir}/input.fasta'
            SeqUtil.write_records_to_fasta(fasta_path, batch_records)
            new_args[args_fasta] = fasta_path
            # Tell the remote invocation to run locally (no further fan-out)
            new_args['multinode_only_local'] = True

            # Convert to list
            new_args_list = _args_dict_to_args_list(new_args)

            # Send job (non-blocking: all batches are submitted before streaming)
            job = current_app.cli(args=new_args_list, blocking=False)
            job_list.append(job)

            # Job stats
            if args.verbose:
                batch_dict = _get_batch_stats(batch_records)
                n_seqs, n_res = batch_dict['records'], batch_dict['residues']
                print(f'Sending job {i+1}: {n_res} residues from {n_seqs} sequences -> arg_list = {new_args_list}')

    # Stream job output at a time
    print('Streaming job outputs ...')
    for i, job in enumerate(job_list):
        job.stream_logs()

        # Check if job succeeded
        assert job.get_exit_code() == 0, f'Job failed with exit code {job.get_exit_code()}'

        # Write to disk
        output_dir = f'job_output/job_{i+1}'
        job.save_files(output_dir=output_dir)

        if verbose:
            print(f'Saving to {output_dir}')
|
175
|
+
|
176
|
+
|
177
|
+
def merge_folder(folder_name, job_out_dir='job_output', out_dir='output', verbose=1):
    """Merge <job_dir>/<folder_name> from every job directory into one folder.

    The first job's folder is moved wholesale to <out_dir>/<folder_name>;
    files from all remaining job folders are then moved into it.

    Args:
        folder_name: Name of the per-job output folder to merge.
        job_out_dir: Directory containing the job_* directories.
        out_dir: Destination directory for the merged folder.
        verbose: If truthy, print progress details.

    Raises:
        FileNotFoundError: If no job_* directories exist under job_out_dir.
    """
    os.makedirs(out_dir, exist_ok=True)

    job_dirs = natsorted(glob.glob(f'{job_out_dir}/job_*'))

    # Fail early with a clear message instead of an IndexError on job_dirs[0]
    if not job_dirs:
        raise FileNotFoundError(f'No job directories found in {job_out_dir}')

    # Move first folder, prepare to merge the rest into it
    first_folder = f'{job_dirs[0]}/{folder_name}'
    merged_folder = f'{out_dir}/{folder_name}'
    shutil.move(first_folder, merged_folder)

    if verbose:
        print(f'Merging {folder_name} from {len(job_dirs)} directories to {merged_folder}')

    # Move over files from every remaining job folder (no-op for a single job)
    for job_dir in job_dirs[1:]:
        extra_folder = f'{job_dir}/{folder_name}'
        for file_name in os.listdir(extra_folder):
            shutil.move(f'{extra_folder}/{file_name}', merged_folder)
|
203
|
+
|
204
|
+
|
205
|
+
def merge_file(
    file_name,
    header_lines_int=1,
    job_out_dir='job_output',
    out_dir='output',
    verbose=1,
):
    """Merge <job_dir>/<file_name> from every job directory into one file.

    The first job's file (including its header) is moved to
    <out_dir>/<file_name>; the remaining jobs' files are appended with their
    first header_lines_int lines skipped.

    Args:
        file_name: Name of the per-job output file to merge.
        header_lines_int: Number of header lines to skip in each extra file.
        job_out_dir: Directory containing the job_* directories.
        out_dir: Destination directory for the merged file.
        verbose: If truthy, print progress details.

    Raises:
        FileNotFoundError: If no job_* directories exist under job_out_dir.
    """
    os.makedirs(out_dir, exist_ok=True)

    job_dirs = natsorted(glob.glob(f'{job_out_dir}/job_*'))

    # Fail early with a clear message instead of an IndexError on job_dirs[0]
    if not job_dirs:
        raise FileNotFoundError(f'No job directories found in {job_out_dir}')

    # Move first file, prepare to merge the rest into it
    first_file = f'{job_dirs[0]}/{file_name}'
    merged_file = f'{out_dir}/{file_name}'
    shutil.move(first_file, merged_file)

    if verbose:
        print(f'Merging {file_name} from {len(job_dirs)} directories to {merged_file}')

    # Append remaining files to the first one, skipping their headers
    if len(job_dirs) >= 2:
        with open(merged_file, 'a') as merged_file_handle:
            for job_dir in job_dirs[1:]:
                with open(f'{job_dir}/{file_name}') as extra_file_handle:
                    # Skip first n header lines
                    for _ in range(header_lines_int):
                        next(extra_file_handle)

                    # Append remaining content to the merged file
                    merged_file_handle.write(extra_file_handle.read())
|
243
|
+
|
244
|
+
|
245
|
+
def _get_batch_stats(batch):
    """Return {'records': ..., 'residues': ...} stats for a batch of records."""
    residue_count = 0
    for record in batch:
        residue_count += len(record.sequence)

    return {'records': len(batch), 'residues': residue_count}
|
252
|
+
|
253
|
+
|
254
|
+
def _args_dict_to_args_list(new_args):
    """Converts args dict to list of arguments for Biolib."""
    # Each (key, value) pair becomes two consecutive tokens: '--key', 'value'
    return [token for key, value in new_args.items() for token in (f'--{key}', f'{value}')]
|
biolib/cli/data_record.py
CHANGED
@@ -6,6 +6,7 @@ from typing import Dict, List
|
|
6
6
|
import click
|
7
7
|
|
8
8
|
from biolib._data_record.data_record import DataRecord
|
9
|
+
from biolib.biolib_api_client import BiolibApiClient
|
9
10
|
from biolib.biolib_logging import logger, logger_no_user_data
|
10
11
|
from biolib.typing_utils import Optional
|
11
12
|
|
@@ -57,6 +58,7 @@ def download(uri: str, file: Optional[str], path_filter: Optional[str]) -> None:
|
|
57
58
|
@click.argument('uri', required=True)
|
58
59
|
@click.option('--json', 'output_as_json', is_flag=True, default=False, required=False, help='Format output as JSON')
|
59
60
|
def describe(uri: str, output_as_json: bool) -> None:
|
61
|
+
BiolibApiClient.assert_is_signed_in(authenticated_action_description='get Data Record description')
|
60
62
|
record = DataRecord.get_by_uri(uri)
|
61
63
|
files_info: List[Dict] = []
|
62
64
|
total_size_in_bytes = 0
|
@@ -23,6 +23,7 @@ biolib/_internal/types/experiment.py,sha256=D94iBdn2nS92lRW-TOs1a2WKXJD5ZtmzL4yp
|
|
23
23
|
biolib/_internal/types/resource.py,sha256=G-vPkZoe4Um6FPxsQZtRzAlbSW5sDW4NFkbjn21I3V4,372
|
24
24
|
biolib/_internal/types/typing.py,sha256=D4EKKEe7kDx0K6lJi-H_XLtk-8w6nu2fdqn9bvzI-Xo,288
|
25
25
|
biolib/_internal/utils/__init__.py,sha256=p5vsIFyu-zYqBgdSMfwW9NC_jk7rXvvCbV4Bzd3As7c,630
|
26
|
+
biolib/_internal/utils/multinode.py,sha256=UnM08GXc8U-p0eoSleer4BIgngIsn_fgh9FxRQJkIiI,8068
|
26
27
|
biolib/_runtime/runtime.py,sha256=oVgTnDDJv9L4BUP1_sd0oAj4LLyyiPSQdhp7ixWARvw,2923
|
27
28
|
biolib/api/__init__.py,sha256=mQ4u8FijqyLzjYMezMUUbbBGNB3iFmkNdjXnWPZ7Jlw,138
|
28
29
|
biolib/api/client.py,sha256=FRpdH5aI187b_I_4HUNi680v4iOP65z5f2RcUo8D8MA,3559
|
@@ -57,7 +58,7 @@ biolib/biolib_errors.py,sha256=5m4lK2l39DafpoXBImEBD4EPH3ayXBX0JgtPzmGClow,689
|
|
57
58
|
biolib/biolib_logging.py,sha256=J3E5H_LL5k6ZUim2C8gqN7E6lCBZMTpO4tnMpOPwG9U,2854
|
58
59
|
biolib/cli/__init__.py,sha256=0v3c_J-U0k46c5ZWeQjLG_kTaKDJm81LBxQpDO2B_aI,1286
|
59
60
|
biolib/cli/auth.py,sha256=rpWGmXs6Fz6CGrO9K8ibPRszOdXG78Vig_boKaVCD9A,2082
|
60
|
-
biolib/cli/data_record.py,sha256=
|
61
|
+
biolib/cli/data_record.py,sha256=t8DfJK2EZ_SNZ9drDA_N5Jqy8DNwf9f5SlFrIaOvtv0,3501
|
61
62
|
biolib/cli/download_container.py,sha256=HIZVHOPmslGE5M2Dsp9r2cCkAEJx__vcsDz5Wt5LRos,483
|
62
63
|
biolib/cli/init.py,sha256=wQOfii_au-d30Hp7DdH-WVw-WVraKvA_zY4za1w7DE8,821
|
63
64
|
biolib/cli/lfs.py,sha256=z2qHUwink85mv9yDgifbVKkVwuyknGhMDTfly_gLKJM,4151
|
@@ -116,8 +117,8 @@ biolib/utils/cache_state.py,sha256=u256F37QSRIVwqKlbnCyzAX4EMI-kl6Dwu6qwj-Qmag,3
|
|
116
117
|
biolib/utils/multipart_uploader.py,sha256=XvGP1I8tQuKhAH-QugPRoEsCi9qvbRk-DVBs5PNwwJo,8452
|
117
118
|
biolib/utils/seq_util.py,sha256=ZQFcaE37B2dtucN2zDjOmdya_X0ITc1zBFZJNQY13XA,5183
|
118
119
|
biolib/utils/zip/remote_zip.py,sha256=0wErYlxir5921agfFeV1xVjf29l9VNgGQvNlWOlj2Yc,23232
|
119
|
-
pybiolib-1.1.
|
120
|
-
pybiolib-1.1.
|
121
|
-
pybiolib-1.1.
|
122
|
-
pybiolib-1.1.
|
123
|
-
pybiolib-1.1.
|
120
|
+
pybiolib-1.1.2250.dist-info/LICENSE,sha256=F2h7gf8i0agDIeWoBPXDMYScvQOz02pAWkKhTGOHaaw,1067
|
121
|
+
pybiolib-1.1.2250.dist-info/METADATA,sha256=IwmPFCDmfGZxhiEc-2YbSTyVc7tARTMCYIeVlk0NHSo,1508
|
122
|
+
pybiolib-1.1.2250.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
123
|
+
pybiolib-1.1.2250.dist-info/entry_points.txt,sha256=p6DyaP_2kctxegTX23WBznnrDi4mz6gx04O5uKtRDXg,42
|
124
|
+
pybiolib-1.1.2250.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|