pybiolib 1.1.2236__py3-none-any.whl → 1.1.2250__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,264 @@
1
+ import glob
2
+ import os
3
+ import re
4
+ import shutil
5
+ import subprocess
6
+ import tempfile
7
+
8
+ import biolib
9
+ from biolib.utils import SeqUtil
10
+
11
+
12
def natsorted(lst):
    """Return a new list holding the items of ``lst`` in natural order.

    Runs of digits are compared as integers and everything else is compared
    case-insensitively, so ``'job_2'`` is placed before ``'job_10'``.
    """

    def _natural_sort_key(s):
        """Break ``s`` into alternating lower-cased text and integer chunks."""
        chunks = []
        for text in re.split('([0-9]+)', s):
            if text.isdigit():
                chunks.append(int(text))
            else:
                chunks.append(text.lower())
        return chunks

    ordered = list(lst)
    ordered.sort(key=_natural_sort_key)
    return ordered
20
+
21
+
22
def fasta_above_threshold(fasta_file, work_threshold, work_per_residue=1, verbose=False):
    """Return True if the total FASTA residue work reaches ``work_threshold``.

    Work is accumulated record by record as
    ``len(record.sequence) * work_per_residue`` over the records returned by
    ``SeqUtil.parse_fasta(fasta_file)``. Counting stops at the first record
    that brings the running total to ``work_threshold`` or above.

    Args:
        fasta_file: Input handed to ``SeqUtil.parse_fasta``.
        work_threshold: Work total at which the FASTA counts as "above".
        work_per_residue: Work units charged per residue.
        verbose: When truthy, print a short summary of the decision.

    Returns:
        True if the running total reached ``work_threshold``, otherwise False.
    """

    records = SeqUtil.parse_fasta(fasta_file)

    # Accumulate work and bail out early once the threshold is reached
    total_work_units = 0
    for i, record in enumerate(records):
        total_work_units += len(record.sequence) * work_per_residue

        if total_work_units >= work_threshold:
            if verbose:
                print(f'FASTA above threshold (stopped at {total_work_units}) >= {work_threshold}')
                print(f'From {i+1}/{len(records)} sequences in {fasta_file}')
            return True

    if verbose:
        print(f'FASTA below threshold ({total_work_units}) < {work_threshold}')
        print(f'From {len(records)} sequences in {fasta_file}')

    return False
44
+
45
+
46
def run_locally(command_list, args):
    """Run the script locally as a subprocess (no multi-node processing).

    Every attribute of ``args`` whose name does not start with ``multinode``
    is converted to ``--key value`` pairs and appended to ``command_list``.
    The command's stdout is printed on success; on a non-zero exit code its
    stderr is printed prefixed with ``Error:``. No exception is raised for a
    failing command.

    Args:
        command_list: Base command, e.g. ``["python3", "predict.py"]``.
        args: Parsed arguments object (presumably an ``argparse.Namespace``);
            must expose a numeric ``verbose`` attribute.
    """

    # Build a filtered copy rather than deleting from vars(args): vars()
    # hands back the object's own __dict__, so deleting keys there would
    # strip the multinode attributes off the caller's args object.
    new_args = {key: value for key, value in vars(args).items() if not str(key).startswith('multinode')}

    # Prepare command, e.g. ["python3", "predict.py"] + new_args_list
    command = command_list + _args_dict_to_args_list(new_args)

    if args.verbose >= 1:
        print(f'Running {command}')

    # Run command
    result = subprocess.run(command, capture_output=True, text=True, check=False)
    if result.returncode == 0:
        print(f'{result.stdout}')
    else:
        print(f'Error: {result.stderr}')
72
+
73
+
74
def fasta_batch_records(fasta_file, work_per_batch_min, work_per_residue=1, verbose=False):
    """Group the records of a FASTA file into batches based on a work threshold.

    Records from ``SeqUtil.parse_fasta(fasta_file)`` are added to the current
    batch in order. Each record contributes
    ``len(record.sequence) * work_per_residue`` work units; as soon as the
    current batch reaches ``work_per_batch_min`` it is closed and a new one is
    started. A trailing, partially filled batch is kept.

    Args:
        fasta_file: Input handed to ``SeqUtil.parse_fasta``.
        work_per_batch_min: Work units at which a batch is closed.
        work_per_residue: Work units charged per residue.
        verbose: When truthy, print one summary line per batch.

    Returns:
        A list of batches, each a non-empty list of records.
    """

    def log_batches(batches):
        """Print the residue and sequence count of every batch."""
        for i, batch in enumerate(batches):
            n_seqs = len(batch)
            n_res = sum(len(record.sequence) for record in batch)
            print(f'Batch {i+1}: {n_res} residues from {n_seqs} sequences')

    records = SeqUtil.parse_fasta(fasta_file)

    batches = []
    batch = []
    current_work_units = 0
    for record in records:
        # The record always joins the current batch, even if it overshoots
        batch.append(record)
        current_work_units += len(record.sequence) * work_per_residue

        # If at or above the limit, close this batch and start a new one
        if current_work_units >= work_per_batch_min:
            batches.append(batch)
            batch = []
            current_work_units = 0

    # Append last batch if present
    if batch:
        batches.append(batch)

    if verbose:
        log_batches(batches)

    return batches
119
+
120
+
121
def fasta_send_batches_biolib(app_url, batches, args, args_fasta='fasta', verbose=1):
    """Send one job per batch through the pybiolib interface and save the outputs.

    For every batch a temporary FASTA file is written, the arguments in
    ``args`` are re-used with ``args_fasta`` pointing at that file and
    ``multinode_only_local`` set to True, and a non-blocking job is started.
    Afterwards the logs of each job are streamed in submission order and its
    files are saved to ``job_output/job_<n>`` (1-based).

    Args:
        app_url: App identifier passed to ``biolib.load``.
        batches: Iterable of record batches, one job is started per batch.
        args: Parsed arguments object (presumably an ``argparse.Namespace``);
            must expose a ``verbose`` attribute. It is not modified.
        args_fasta: Name of the argument that receives the batch FASTA path.
        verbose: When truthy, print where each job's output is saved. The
            submission messages are controlled by ``args.verbose`` instead.

    Raises:
        AssertionError: If a job finishes with a non-zero exit code.
    """

    if args.verbose >= 1:
        print(f'Sending {len(batches)} batches to Biolib')

    # Login to biolib, prepare app
    biolib.login()
    current_app = biolib.load(app_url)  # Nb: uses "_" not "-"

    # Submit one job per batch
    job_list = []
    for i, batch_records in enumerate(batches):
        # Write FASTA, send to server
        with tempfile.TemporaryDirectory() as tempdir:
            # Work on a copy: vars(args) is the namespace's own __dict__, so
            # assigning into it would leave a soon-to-be-deleted temp path
            # and the multinode flag on the caller's args object.
            new_args = dict(vars(args))

            # Write batched FASTA to send
            fasta_path = f'{tempdir}/input.fasta'
            SeqUtil.write_records_to_fasta(fasta_path, batch_records)
            new_args[args_fasta] = fasta_path
            new_args['multinode_only_local'] = True

            # Convert to list
            new_args_list = _args_dict_to_args_list(new_args)

            # Send job
            job = current_app.cli(args=new_args_list, blocking=False)
            job_list.append(job)

            # Job stats
            if args.verbose:
                batch_dict = _get_batch_stats(batch_records)
                n_seqs, n_res = batch_dict['records'], batch_dict['residues']
                print(f'Sending job {i+1}: {n_res} residues from {n_seqs} sequences -> arg_list = {new_args_list}')

    # Stream job outputs one job at a time
    print('Streaming job outputs ...')
    for i, job in enumerate(job_list):
        job.stream_logs()

        # Check if job succeeded. Raised explicitly (instead of via an assert
        # statement) so the check is not stripped when Python runs with -O.
        exit_code = job.get_exit_code()
        if exit_code != 0:
            raise AssertionError(f'Job failed with exit code {exit_code}')

        # Write to disk
        output_dir = f'job_output/job_{i+1}'
        job.save_files(output_dir=output_dir)

        if verbose:
            print(f'Saving to {output_dir}')
175
+
176
+
177
def merge_folder(folder_name, job_out_dir='job_output', out_dir='output', verbose=1):
    """Collect ``folder_name`` from every job directory into ``out_dir``.

    Job directories are the ``job_*`` entries of ``job_out_dir`` in natural
    order. The folder of the first job is moved to ``out_dir/folder_name``;
    every entry found in the same-named folder of the remaining jobs is then
    moved into it. Raises ``IndexError`` when no job directory matches.
    """

    os.makedirs(out_dir, exist_ok=True)

    job_dirs = natsorted(glob.glob(f'{job_out_dir}/job_*'))

    # The first job's folder becomes the merge target
    merged_folder = f'{out_dir}/{folder_name}'
    shutil.move(f'{job_dirs[0]}/{folder_name}', merged_folder)

    if verbose:
        print(f'Merging {folder_name} from {len(job_dirs)} directories to {merged_folder}')

    # Remaining jobs (if any) contribute their entries to the target
    for job_dir in job_dirs[1:]:
        extra_folder = f'{job_dir}/{folder_name}'
        for file_name in os.listdir(extra_folder):
            shutil.move(f'{extra_folder}/{file_name}', merged_folder)
203
+
204
+
205
def merge_file(
    file_name,
    header_lines_int=1,
    job_out_dir='job_output',
    out_dir='output',
    verbose=1,
):
    """Merge ``file_name`` from every job directory into one file with a single header.

    Job directories are the ``job_*`` entries of ``job_out_dir`` in natural
    order. The file of the first job is moved to ``out_dir/file_name``
    unchanged; for every further job the first ``header_lines_int`` lines are
    skipped and the rest is appended.

    Args:
        file_name: File name relative to each job directory.
        header_lines_int: Number of leading lines to drop from every file
            except the first.
        job_out_dir: Directory holding the ``job_*`` directories.
        out_dir: Destination directory, created if missing.
        verbose: When truthy, print a one-line summary.

    Raises:
        IndexError: If no job directory matches.
    """

    os.makedirs(out_dir, exist_ok=True)

    job_dirs = glob.glob(f'{job_out_dir}/job_*')
    job_dirs = natsorted(job_dirs)

    # Move first file, prepare to merge
    first_file = f'{job_dirs[0]}/{file_name}'
    merged_file = f'{out_dir}/{file_name}'
    shutil.move(first_file, merged_file)

    if verbose:
        print(f'Merging {file_name} from {len(job_dirs)} directories to {merged_file}')

    # If more than one file, append to first
    if len(job_dirs) >= 2:
        with open(merged_file, 'a') as merged_file_handle:
            for job_dir in job_dirs[1:]:
                extra_file = f'{job_dir}/{file_name}'
                with open(extra_file) as extra_file_handle:
                    # Skip the header lines. next() gets a default so a file
                    # shorter than the header (e.g. an empty job output) is
                    # skipped quietly instead of raising StopIteration.
                    for _ in range(header_lines_int):
                        if next(extra_file_handle, None) is None:
                            break

                    # Stream the remainder instead of loading it at once
                    shutil.copyfileobj(extra_file_handle, merged_file_handle)
243
+
244
+
245
+ def _get_batch_stats(batch):
246
+ stats_dict = {
247
+ 'records': len(batch),
248
+ 'residues': sum(len(R.sequence) for R in batch),
249
+ }
250
+
251
+ return stats_dict
252
+
253
+
254
+ def _args_dict_to_args_list(new_args):
255
+ """Converts args dict to list of arguments for Biolib"""
256
+
257
+ nested_list = [[f'--{key}', f'{value}'] for key, value in new_args.items()]
258
+
259
+ arg_list = []
260
+ for lst in nested_list:
261
+ for item in lst:
262
+ arg_list.append(item)
263
+
264
+ return arg_list
biolib/cli/data_record.py CHANGED
@@ -6,6 +6,7 @@ from typing import Dict, List
6
6
  import click
7
7
 
8
8
  from biolib._data_record.data_record import DataRecord
9
+ from biolib.biolib_api_client import BiolibApiClient
9
10
  from biolib.biolib_logging import logger, logger_no_user_data
10
11
  from biolib.typing_utils import Optional
11
12
 
@@ -57,6 +58,7 @@ def download(uri: str, file: Optional[str], path_filter: Optional[str]) -> None:
57
58
  @click.argument('uri', required=True)
58
59
  @click.option('--json', 'output_as_json', is_flag=True, default=False, required=False, help='Format output as JSON')
59
60
  def describe(uri: str, output_as_json: bool) -> None:
61
+ BiolibApiClient.assert_is_signed_in(authenticated_action_description='get Data Record description')
60
62
  record = DataRecord.get_by_uri(uri)
61
63
  files_info: List[Dict] = []
62
64
  total_size_in_bytes = 0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: pybiolib
3
- Version: 1.1.2236
3
+ Version: 1.1.2250
4
4
  Summary: BioLib Python Client
5
5
  Home-page: https://github.com/biolib
6
6
  License: MIT
@@ -23,6 +23,7 @@ biolib/_internal/types/experiment.py,sha256=D94iBdn2nS92lRW-TOs1a2WKXJD5ZtmzL4yp
23
23
  biolib/_internal/types/resource.py,sha256=G-vPkZoe4Um6FPxsQZtRzAlbSW5sDW4NFkbjn21I3V4,372
24
24
  biolib/_internal/types/typing.py,sha256=D4EKKEe7kDx0K6lJi-H_XLtk-8w6nu2fdqn9bvzI-Xo,288
25
25
  biolib/_internal/utils/__init__.py,sha256=p5vsIFyu-zYqBgdSMfwW9NC_jk7rXvvCbV4Bzd3As7c,630
26
+ biolib/_internal/utils/multinode.py,sha256=UnM08GXc8U-p0eoSleer4BIgngIsn_fgh9FxRQJkIiI,8068
26
27
  biolib/_runtime/runtime.py,sha256=oVgTnDDJv9L4BUP1_sd0oAj4LLyyiPSQdhp7ixWARvw,2923
27
28
  biolib/api/__init__.py,sha256=mQ4u8FijqyLzjYMezMUUbbBGNB3iFmkNdjXnWPZ7Jlw,138
28
29
  biolib/api/client.py,sha256=FRpdH5aI187b_I_4HUNi680v4iOP65z5f2RcUo8D8MA,3559
@@ -57,7 +58,7 @@ biolib/biolib_errors.py,sha256=5m4lK2l39DafpoXBImEBD4EPH3ayXBX0JgtPzmGClow,689
57
58
  biolib/biolib_logging.py,sha256=J3E5H_LL5k6ZUim2C8gqN7E6lCBZMTpO4tnMpOPwG9U,2854
58
59
  biolib/cli/__init__.py,sha256=0v3c_J-U0k46c5ZWeQjLG_kTaKDJm81LBxQpDO2B_aI,1286
59
60
  biolib/cli/auth.py,sha256=rpWGmXs6Fz6CGrO9K8ibPRszOdXG78Vig_boKaVCD9A,2082
60
- biolib/cli/data_record.py,sha256=08JbZkFWKMo0PrnhhG0jQEKnNW7pPLti9cOw8s1TWfI,3344
61
+ biolib/cli/data_record.py,sha256=t8DfJK2EZ_SNZ9drDA_N5Jqy8DNwf9f5SlFrIaOvtv0,3501
61
62
  biolib/cli/download_container.py,sha256=HIZVHOPmslGE5M2Dsp9r2cCkAEJx__vcsDz5Wt5LRos,483
62
63
  biolib/cli/init.py,sha256=wQOfii_au-d30Hp7DdH-WVw-WVraKvA_zY4za1w7DE8,821
63
64
  biolib/cli/lfs.py,sha256=z2qHUwink85mv9yDgifbVKkVwuyknGhMDTfly_gLKJM,4151
@@ -116,8 +117,8 @@ biolib/utils/cache_state.py,sha256=u256F37QSRIVwqKlbnCyzAX4EMI-kl6Dwu6qwj-Qmag,3
116
117
  biolib/utils/multipart_uploader.py,sha256=XvGP1I8tQuKhAH-QugPRoEsCi9qvbRk-DVBs5PNwwJo,8452
117
118
  biolib/utils/seq_util.py,sha256=ZQFcaE37B2dtucN2zDjOmdya_X0ITc1zBFZJNQY13XA,5183
118
119
  biolib/utils/zip/remote_zip.py,sha256=0wErYlxir5921agfFeV1xVjf29l9VNgGQvNlWOlj2Yc,23232
119
- pybiolib-1.1.2236.dist-info/LICENSE,sha256=F2h7gf8i0agDIeWoBPXDMYScvQOz02pAWkKhTGOHaaw,1067
120
- pybiolib-1.1.2236.dist-info/METADATA,sha256=1ifPFsaJA8_xjhDV2TwwqQx5oJJo9dXWcy_G360yHHQ,1508
121
- pybiolib-1.1.2236.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
122
- pybiolib-1.1.2236.dist-info/entry_points.txt,sha256=p6DyaP_2kctxegTX23WBznnrDi4mz6gx04O5uKtRDXg,42
123
- pybiolib-1.1.2236.dist-info/RECORD,,
120
+ pybiolib-1.1.2250.dist-info/LICENSE,sha256=F2h7gf8i0agDIeWoBPXDMYScvQOz02pAWkKhTGOHaaw,1067
121
+ pybiolib-1.1.2250.dist-info/METADATA,sha256=IwmPFCDmfGZxhiEc-2YbSTyVc7tARTMCYIeVlk0NHSo,1508
122
+ pybiolib-1.1.2250.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
123
+ pybiolib-1.1.2250.dist-info/entry_points.txt,sha256=p6DyaP_2kctxegTX23WBznnrDi4mz6gx04O5uKtRDXg,42
124
+ pybiolib-1.1.2250.dist-info/RECORD,,