pybiolib 1.2.1056__py3-none-any.whl → 1.2.1642__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of pybiolib has been flagged by the registry; see its registry page for details.
- biolib/__init__.py +33 -10
- biolib/_data_record/data_record.py +24 -11
- biolib/_index/__init__.py +0 -0
- biolib/_index/index.py +51 -0
- biolib/_index/types.py +7 -0
- biolib/_internal/data_record/data_record.py +1 -1
- biolib/_internal/data_record/push_data.py +1 -1
- biolib/_internal/data_record/remote_storage_endpoint.py +3 -3
- biolib/_internal/file_utils.py +7 -4
- biolib/_internal/index/__init__.py +1 -0
- biolib/_internal/index/index.py +18 -0
- biolib/_internal/lfs/cache.py +4 -2
- biolib/_internal/push_application.py +89 -23
- biolib/_internal/runtime.py +2 -0
- biolib/_internal/templates/gui_template/App.tsx +38 -2
- biolib/_internal/templates/gui_template/Dockerfile +2 -0
- biolib/_internal/templates/gui_template/biolib-sdk.ts +37 -0
- biolib/_internal/templates/gui_template/dev-data/output.json +7 -0
- biolib/_internal/templates/gui_template/package.json +1 -0
- biolib/_internal/templates/gui_template/vite-plugin-dev-data.ts +49 -0
- biolib/_internal/templates/gui_template/vite.config.mts +2 -1
- biolib/_internal/templates/init_template/.github/workflows/biolib.yml +6 -1
- biolib/_internal/templates/init_template/Dockerfile +2 -0
- biolib/_internal/utils/__init__.py +25 -0
- biolib/_internal/utils/job_url.py +33 -0
- biolib/_runtime/runtime.py +9 -0
- biolib/_session/session.py +7 -5
- biolib/_shared/__init__.py +0 -0
- biolib/_shared/types/__init__.py +69 -0
- biolib/_shared/types/resource.py +17 -0
- biolib/_shared/types/resource_deploy_key.py +11 -0
- biolib/{_internal → _shared}/types/resource_permission.py +1 -1
- biolib/_shared/utils/__init__.py +7 -0
- biolib/_shared/utils/resource_uri.py +75 -0
- biolib/api/client.py +1 -1
- biolib/app/app.py +56 -23
- biolib/biolib_api_client/app_types.py +1 -6
- biolib/biolib_api_client/biolib_app_api.py +17 -0
- biolib/biolib_binary_format/module_input.py +8 -0
- biolib/biolib_binary_format/remote_endpoints.py +3 -3
- biolib/biolib_binary_format/remote_stream_seeker.py +39 -25
- biolib/cli/__init__.py +2 -1
- biolib/cli/data_record.py +17 -0
- biolib/cli/index.py +32 -0
- biolib/cli/lfs.py +1 -1
- biolib/cli/start.py +14 -1
- biolib/compute_node/job_worker/executors/docker_executor.py +31 -9
- biolib/compute_node/job_worker/executors/docker_types.py +1 -1
- biolib/compute_node/job_worker/executors/types.py +6 -5
- biolib/compute_node/job_worker/job_worker.py +149 -93
- biolib/compute_node/job_worker/large_file_system.py +2 -6
- biolib/compute_node/job_worker/network_alloc.py +99 -0
- biolib/compute_node/job_worker/network_buffer.py +240 -0
- biolib/compute_node/job_worker/utilization_reporter_thread.py +2 -2
- biolib/compute_node/remote_host_proxy.py +125 -67
- biolib/compute_node/utils.py +2 -0
- biolib/compute_node/webserver/compute_node_results_proxy.py +188 -0
- biolib/compute_node/webserver/proxy_utils.py +28 -0
- biolib/compute_node/webserver/webserver.py +64 -19
- biolib/experiments/experiment.py +98 -16
- biolib/jobs/job.py +119 -29
- biolib/jobs/job_result.py +70 -33
- biolib/jobs/types.py +1 -0
- biolib/sdk/__init__.py +17 -2
- biolib/typing_utils.py +1 -1
- biolib/utils/cache_state.py +2 -2
- biolib/utils/seq_util.py +1 -1
- {pybiolib-1.2.1056.dist-info → pybiolib-1.2.1642.dist-info}/METADATA +4 -2
- {pybiolib-1.2.1056.dist-info → pybiolib-1.2.1642.dist-info}/RECORD +84 -66
- {pybiolib-1.2.1056.dist-info → pybiolib-1.2.1642.dist-info}/WHEEL +1 -1
- biolib/_internal/types/__init__.py +0 -6
- biolib/utils/app_uri.py +0 -57
- /biolib/{_internal → _shared}/types/account.py +0 -0
- /biolib/{_internal → _shared}/types/account_member.py +0 -0
- /biolib/{_internal → _shared}/types/app.py +0 -0
- /biolib/{_internal → _shared}/types/data_record.py +0 -0
- /biolib/{_internal → _shared}/types/experiment.py +0 -0
- /biolib/{_internal → _shared}/types/file_node.py +0 -0
- /biolib/{_internal → _shared}/types/push.py +0 -0
- /biolib/{_internal/types/resource.py → _shared/types/resource_types.py} +0 -0
- /biolib/{_internal → _shared}/types/resource_version.py +0 -0
- /biolib/{_internal → _shared}/types/result.py +0 -0
- /biolib/{_internal → _shared}/types/typing.py +0 -0
- /biolib/{_internal → _shared}/types/user.py +0 -0
- {pybiolib-1.2.1056.dist-info → pybiolib-1.2.1642.dist-info}/entry_points.txt +0 -0
- {pybiolib-1.2.1056.dist-info → pybiolib-1.2.1642.dist-info/licenses}/LICENSE +0 -0
biolib/jobs/job.py
CHANGED
@@ -2,7 +2,7 @@ import base64
 import sys
 import time
 from collections import OrderedDict
-from datetime import datetime, timedelta
+from datetime import datetime, timedelta, timezone
 from pathlib import Path
 from urllib.parse import urlparse
 
@@ -10,7 +10,8 @@ import biolib.api.client
 from biolib import utils
 from biolib._internal.http_client import HttpClient
 from biolib._internal.tree_utils import build_tree_from_files, build_tree_str
-from biolib._internal.utils import open_browser_window_from_notebook
+from biolib._internal.utils import PathFilter, filter_lazy_loaded_files, open_browser_window_from_notebook
+from biolib._shared.utils import parse_resource_uri
 from biolib.api.client import ApiClient
 from biolib.biolib_api_client import BiolibApiClient, CreatedJobDict
 from biolib.biolib_api_client.biolib_app_api import BiolibAppApi
@@ -18,16 +19,16 @@ from biolib.biolib_api_client.biolib_job_api import BiolibJobApi
 from biolib.biolib_binary_format import LazyLoadedFile, ModuleInput, ModuleInputDict, ModuleOutputV2
 from biolib.biolib_binary_format.remote_endpoints import RemoteJobStorageEndpoint
 from biolib.biolib_binary_format.stdout_and_stderr import StdoutAndStderr
+from biolib.biolib_binary_format.utils import InMemoryIndexableBuffer
 from biolib.biolib_errors import BioLibError, CloudJobFinishedError
 from biolib.biolib_logging import logger, logger_no_user_data
 from biolib.compute_node.job_worker.job_storage import JobStorage
 from biolib.compute_node.utils import SystemExceptionCodeMap, SystemExceptionCodes
-from biolib.jobs.job_result import JobResult
+from biolib.jobs.job_result import JobResult
 from biolib.jobs.types import CloudJobDict, CloudJobStartedDict, JobDict
 from biolib.tables import BioLibTable
-from biolib.typing_utils import Dict, List, Optional, Union, cast
+from biolib.typing_utils import Dict, Generator, List, Optional, Tuple, Union, cast
 from biolib.utils import IS_RUNNING_IN_NOTEBOOK
-from biolib.utils.app_uri import parse_app_uri
 
 
 class Result:
@@ -49,7 +50,7 @@ class Result:
         self._auth_token: str = job_dict['auth_token']
 
         self._job_dict: JobDict = job_dict
-        self._job_dict_last_fetched_at: datetime = datetime.utcnow()
+        self._job_dict_last_fetched_at: datetime = datetime.now(timezone.utc)
         self._result: Optional[JobResult] = None
         self._cached_input_arguments: Optional[List[str]] = None
 
@@ -187,6 +188,45 @@ class Result:
         """
         return self.result.list_output_files(path_filter=path_filter)
 
+    def list_input_files(
+        self,
+        path_filter: Optional[PathFilter] = None,
+    ) -> List[LazyLoadedFile]:
+        """List input files from the result.
+
+        Args:
+            path_filter (PathFilter, optional): Filter to apply to the input files.
+                Can be a string glob pattern or a callable that takes a path string and returns a boolean.
+
+        Returns:
+            List[LazyLoadedFile]: List of input files.
+
+        Example::
+            >>> result = biolib.get_result("result_id")
+            >>> input_files = result.list_input_files()
+            >>> # Filter files with a glob pattern
+            >>> input_files = result.list_input_files("*.txt")
+        """
+        presigned_download_url = BiolibJobApi.get_job_storage_download_url(
+            job_uuid=self.id,
+            job_auth_token=self._auth_token,
+            storage_type='input',
+        )
+        response = HttpClient.request(url=presigned_download_url)
+        module_input_serialized: bytes = response.content
+        module_input = ModuleInput(module_input_serialized).deserialize()
+
+        files = []
+        for path, data in module_input['files'].items():
+            buffer = InMemoryIndexableBuffer(data)
+            lazy_file = LazyLoadedFile(path=path, buffer=buffer, start=0, length=len(data))
+            files.append(lazy_file)
+
+        if not path_filter:
+            return files
+
+        return filter_lazy_loaded_files(files, path_filter)
+
     def get_output_file(self, filename: str) -> LazyLoadedFile:
         return self.result.get_output_file(filename=filename)
 
@@ -256,14 +296,44 @@ class Result:
         self,
         output_dir: str,
         path_filter: Optional[PathFilter] = None,
-        skip_file_if_exists:
+        skip_file_if_exists: bool = False,
         overwrite: bool = False,
+        flat: bool = False,
     ) -> None:
+        """Save output files from the result to a local directory.
+
+        Args:
+            output_dir (str): Directory path where files will be saved.
+            path_filter (PathFilter, optional): Filter to apply to output files.
+                Can be a string glob pattern or a callable that takes a path and returns a boolean.
+            skip_file_if_exists (bool, optional): If True, skip files that already exist locally.
+                Defaults to False.
+            overwrite (bool, optional): If True, overwrite existing files by renaming them with a timestamp.
+                Defaults to False.
+            flat (bool, optional): If True, save all files directly to output_dir using only their basenames,
+                without creating subdirectories. When enabled, raises an error if duplicate basenames exist
+                in the filtered output or if any basename already exists in output_dir. Defaults to False.
+
+        Raises:
+            BioLibError: If flat=True and duplicate basenames are found in filtered output.
+            BioLibError: If flat=True and a file with the same basename already exists in output_dir.
+            BioLibError: If a file already exists and neither skip_file_if_exists nor overwrite is True.
+
+        Example::
+            >>> result = biolib.get_result("result_id")
+            >>> # Save all files preserving directory structure
+            >>> result.save_files("./output")
+            >>> # Save files flat without subdirectories
+            >>> result.save_files("./output", flat=True)
+            >>> # Save only specific files
+            >>> result.save_files("./output", path_filter="*.txt")
+        """
         self.result.save_files(
             output_dir=output_dir,
             path_filter=path_filter,
             skip_file_if_exists=skip_file_if_exists,
             overwrite=overwrite,
+            flat=flat,
         )
 
     def get_status(self) -> str:
@@ -358,6 +428,7 @@ class Result:
         >>> # Recompute with different arguments
         >>> new_result = result.recompute(arguments=["--new-arg", "value"])
         """
+        self._refetch_job_dict()
         app_response = BiolibAppApi.get_by_uri(uri=app_uri or self._job_dict['app_uri'])
 
         job_storage_input = RemoteJobStorageEndpoint(
@@ -385,6 +456,7 @@ class Result:
             app_uri=app_response['app_uri'],
             app_version_uuid=app_response['app_version']['public_id'],
             module_input_serialized=module_input_serialized,
+            override_command=self._job_dict['arguments_override_command'],
             machine=machine if machine else original_requested_machine,
         )
         if blocking:
@@ -451,16 +523,11 @@ class Result:
         return Result(job_dict)
 
     @staticmethod
-    def
+    def _yield_logs_packages(stdout_and_stderr_packages_b64) -> Generator[Tuple[str, bytes], None, None]:
         for stdout_and_stderr_package_b64 in stdout_and_stderr_packages_b64:
            stdout_and_stderr_package = base64.b64decode(stdout_and_stderr_package_b64)
            stdout_and_stderr = StdoutAndStderr(stdout_and_stderr_package).deserialize()
-
-            sys.stdout.write(stdout_and_stderr.decode())
-            if not IS_RUNNING_IN_NOTEBOOK:  # for some reason flushing in jupyter notebooks breaks \r handling
-                sys.stdout.flush()
-        # flush after having processed all packages
-        sys.stdout.flush()
+            yield ('stdout', stdout_and_stderr)
 
     def show(self) -> None:
         self._refetch_job_dict()
@@ -470,21 +537,44 @@ class Result:
             title=f'Result: {self._uuid}',
         ).print_table()
 
-    def stream_logs(self
+    def stream_logs(self, as_iterator: bool = False):
+        if as_iterator:
+            return self._iter_logs()
         self._stream_logs()
+        return None
 
     def _stream_logs(self, enable_print: bool = True) -> None:
+        try:
+            for stream_type, data in self._iter_logs(enable_print=enable_print):
+                if stream_type == 'stdout':
+                    if IS_RUNNING_IN_NOTEBOOK:
+                        sys.stdout.write(data.decode(encoding='utf-8', errors='replace'))
+                        # Note: we avoid flush() in notebook as that breaks \r handling
+                    else:
+                        sys.stdout.buffer.write(data)
+                        sys.stdout.buffer.flush()
+                elif stream_type == 'stderr':
+                    if IS_RUNNING_IN_NOTEBOOK:
+                        sys.stderr.write(data.decode(encoding='utf-8', errors='replace'))
+                        # Note: we avoid flush() in notebook as that breaks \r handling
+                    else:
+                        sys.stderr.buffer.write(data)
+                        sys.stderr.buffer.flush()
+        finally:
+            # Flush after having processed all packages
+            if IS_RUNNING_IN_NOTEBOOK:
+                sys.stdout.flush()
+                sys.stderr.flush()
+
+    def _iter_logs(self, enable_print: bool = True) -> Generator[Tuple[str, bytes], None, None]:
         try:
             cloud_job = self._get_cloud_job_awaiting_started()
         except CloudJobFinishedError:
             logger.info(f'--- The result {self.id} has already completed (no streaming will take place) ---')
             logger.info('--- The stdout log is printed below: ---')
-
-            print(self.get_stdout().decode(), file=sys.stdout)
-            sys.stdout.flush()
+            yield ('stdout', self.get_stdout())
             logger.info('--- The stderr log is printed below: ---')
-
-            sys.stderr.flush()
+            yield ('stderr', self.get_stderr())
             logger.info(f'--- The job {self.id} has already completed. Its output was printed above. ---')
             return
 
@@ -496,7 +586,7 @@ class Result:
             logger_no_user_data.debug(f'Using cloud proxy URL from env var BIOLIB_CLOUD_BASE_URL: {compute_node_url}')
 
         if enable_print:
-            self.
+            yield from self._yield_full_logs(node_url=compute_node_url)
 
         final_status_messages: List[str] = []
         while True:
@@ -515,8 +605,8 @@ class Result:
             # Print the status before writing stdout and stderr
             logger.info(f'Cloud: {status_update["log_message"]}')
 
-            if
-                self.
+            if enable_print:
+                yield from self._yield_logs_packages(status_json['stdout_and_stderr_packages_b64'])
 
             if 'error_code' in status_json:
                 error_code = status_json['error_code']
@@ -533,7 +623,7 @@ class Result:
 
         self.wait()  # Wait for compute node to tell the backend that the job is finished
 
-    def
+    def _yield_full_logs(self, node_url: str) -> Generator[Tuple[str, bytes], None, None]:
         try:
             response_json = HttpClient.request(url=f'{node_url}/v1/job/{self._uuid}/status/?logs=full').json()
         except Exception as error:
@@ -543,7 +633,7 @@ class Result:
         for status_update in response_json.get('previous_status_updates', []):
             logger.info(f'Cloud: {status_update["log_message"]}')
 
-        self.
+        yield from self._yield_logs_packages(response_json['streamed_logs_packages_b64'])
 
     def _get_cloud_job_awaiting_started(self) -> CloudJobStartedDict:
         retry_count = 0
@@ -589,11 +679,11 @@ class Result:
         )
 
     def _refetch_job_dict(self, force_refetch: Optional[bool] = False) -> None:
-        if not force_refetch and self._job_dict_last_fetched_at > datetime.utcnow() - timedelta(seconds=2):
+        if not force_refetch and self._job_dict_last_fetched_at > datetime.now(timezone.utc) - timedelta(seconds=2):
             return
 
         self._job_dict = self._get_job_dict(self._uuid, self._auth_token)
-        self._job_dict_last_fetched_at = datetime.utcnow()
+        self._job_dict_last_fetched_at = datetime.now(timezone.utc)
 
     @staticmethod
     def _start_job_in_cloud(
@@ -612,8 +702,8 @@ class Result:
    ) -> 'Result':
        if len(module_input_serialized) < 500_000 and temporary_client_secrets is None:
            _job_dict = BiolibJobApi.create_job_with_data(
-                app_resource_name_prefix=parse_app_uri(app_uri)['resource_name_prefix'],
                app_version_uuid=app_version_uuid,
+                app_resource_name_prefix=parse_resource_uri(app_uri)['resource_prefix'],
                arguments_override_command=override_command,
                experiment_uuid=experiment_id,
                module_input_serialized=module_input_serialized,
@@ -627,8 +717,8 @@ class Result:
            return Result(cast(JobDict, _job_dict))

        job_dict: CreatedJobDict = BiolibJobApi.create(
-            app_resource_name_prefix=parse_app_uri(app_uri)['resource_name_prefix'],
            app_version_id=app_version_uuid,
+            app_resource_name_prefix=parse_resource_uri(app_uri)['resource_prefix'],
            experiment_uuid=experiment_id,
            machine=machine,
            notify=notify,
biolib/jobs/job_result.py
CHANGED
@@ -1,16 +1,14 @@
 import time
-from fnmatch import fnmatch
 from pathlib import Path
 
+from biolib._internal.utils import PathFilter, filter_lazy_loaded_files
 from biolib.biolib_binary_format import ModuleOutputV2
 from biolib.biolib_binary_format.remote_endpoints import RemoteJobStorageEndpoint
 from biolib.biolib_binary_format.remote_stream_seeker import StreamSeeker
 from biolib.biolib_binary_format.utils import LazyLoadedFile, RemoteIndexableBuffer
 from biolib.biolib_errors import BioLibError
 from biolib.biolib_logging import logger
-from biolib.typing_utils import
-
-PathFilter = Union[str, Callable[[str], bool]]
+from biolib.typing_utils import Dict, List, Optional
 
 
 class JobResult:
@@ -38,18 +36,68 @@ class JobResult:
         self,
         output_dir: str,
         path_filter: Optional[PathFilter] = None,
-        skip_file_if_exists:
+        skip_file_if_exists: bool = False,
         overwrite: bool = False,
+        flat: bool = False,
     ) -> None:
         module_output = self._get_module_output()
         output_files = module_output.get_files()
-        filtered_output_files =
+        filtered_output_files = filter_lazy_loaded_files(output_files, path_filter) if path_filter else output_files
 
         if len(filtered_output_files) == 0:
             logger.debug('No output files to save')
             return
 
-
+        if flat:
+            basename_to_paths: Dict[str, List[str]] = {}
+            for file in filtered_output_files:
+                basename = Path(file.path).name
+                if basename not in basename_to_paths:
+                    basename_to_paths[basename] = []
+                basename_to_paths[basename].append(file.path)
+
+            duplicates = {basename: paths for basename, paths in basename_to_paths.items() if len(paths) > 1}
+
+            if duplicates:
+                max_shown = 3
+                error_parts = []
+                sorted_basenames = sorted(duplicates.keys())
+
+                for basename in sorted_basenames[:max_shown]:
+                    paths = duplicates[basename]
+                    error_parts.append(f'  {basename}: ({", ".join(paths)})')
+
+                error_message = 'Cannot save files in flat mode: duplicate filenames detected:\n' + '\n'.join(
+                    error_parts
+                )
+
+                if len(duplicates) > max_shown:
+                    remaining = len(duplicates) - max_shown
+                    error_message += f'\n  (and {remaining} more)'
+
+                raise BioLibError(error_message)
+
+        major_gap_threshold = 50_000
+        n = len(filtered_output_files)
+
+        next_break_end = [0] * n
+        if n > 0:
+            next_break_end[n - 1] = filtered_output_files[n - 1].start + filtered_output_files[n - 1].length
+        for i in range(n - 2, -1, -1):
+            end_i = filtered_output_files[i].start + filtered_output_files[i].length
+            gap = filtered_output_files[i + 1].start - end_i
+            if gap >= major_gap_threshold:
+                next_break_end[i] = end_i
+            else:
+                next_break_end[i] = next_break_end[i + 1]
+
+        total_files_data_to_download_in_bytes = 0
+        file_read_ahead_map = {}
+        for i, file in enumerate(filtered_output_files):
+            total_files_data_to_download_in_bytes += file.length
+            end_i = file.start + file.length
+            read_ahead_bytes = max(0, next_break_end[i] - end_i)
+            file_read_ahead_map[i] = read_ahead_bytes
 
         # Assume files are in order
         first_file = filtered_output_files[0]
@@ -57,14 +105,18 @@ class JobResult:
         stream_seeker = StreamSeeker(
             files_data_start=first_file.start,
             files_data_end=last_file.start + last_file.length,
-
+            max_chunk_size=min(total_files_data_to_download_in_bytes, 10_000_000),
             upstream_buffer=module_output.buffer,
         )
 
         logger.info(f'Saving {len(filtered_output_files)} files to {output_dir}...')
-        for file in filtered_output_files:
-
-
+        for file_index, file in enumerate(filtered_output_files):
+            if flat:
+                destination_file_path = Path(output_dir) / Path(file.path).name
+            else:
+                # Remove leading slash of file_path
+                destination_file_path = Path(output_dir) / Path(file.path.lstrip('/'))
+
             if destination_file_path.exists():
                 if skip_file_if_exists:
                     print(f'Skipping {destination_file_path} as a file with that name already exists locally.')
@@ -91,8 +143,12 @@ class JobResult:
             file_start += data_already_downloaded
             data_to_download -= data_already_downloaded
 
+            read_ahead_bytes = file_read_ahead_map[file_index]
+
             with open(partial_path, mode='ab') as partial_file:
-                for chunk in stream_seeker.seek_and_read(
+                for chunk in stream_seeker.seek_and_read(
+                    file_start=file_start, file_length=data_to_download, read_ahead_bytes=read_ahead_bytes
+                ):
                     partial_file.write(chunk)
 
             # rename partial file to actual file name
@@ -100,7 +156,7 @@ class JobResult:
 
     def get_output_file(self, filename) -> LazyLoadedFile:
         files = self._get_module_output().get_files()
-        filtered_files =
+        filtered_files = filter_lazy_loaded_files(files, path_filter=filename)
         if not filtered_files:
             raise BioLibError(f'File {filename} not found in results.')
 
@@ -114,26 +170,7 @@ class JobResult:
         if not path_filter:
             return files
 
-        return
-
-    @staticmethod
-    def _get_filtered_files(files: List[LazyLoadedFile], path_filter: PathFilter) -> List[LazyLoadedFile]:
-        if not (isinstance(path_filter, str) or callable(path_filter)):
-            raise Exception('Expected path_filter to be a string or a function')
-
-        if callable(path_filter):
-            return list(filter(lambda x: path_filter(x.path), files))  # type: ignore
-
-        glob_filter = cast(str, path_filter)
-
-        # since all file paths start with /, make sure filter does too
-        if not glob_filter.startswith('/'):
-            glob_filter = '/' + glob_filter
-
-        def _filter_function(file: LazyLoadedFile) -> bool:
-            return fnmatch(file.path, glob_filter)
-
-        return list(filter(_filter_function, files))
+        return filter_lazy_loaded_files(files, path_filter)
 
     def _get_module_output(self) -> ModuleOutputV2:
         if self._module_output is None:
biolib/jobs/types.py
CHANGED
biolib/sdk/__init__.py
CHANGED
@@ -2,6 +2,7 @@ from typing import Optional
 
 # Imports to hide and use as private internal utils
 from biolib._data_record.data_record import DataRecord as _DataRecord
+from biolib._index.index import Index as _Index
 from biolib._internal.push_application import push_application as _push_application
 from biolib._internal.push_application import set_app_version_as_active as _set_app_version_as_active
 from biolib._runtime.runtime import Runtime as _Runtime
@@ -12,8 +13,18 @@ from biolib.app import BioLibApp as _BioLibApp
 Runtime = _Runtime
 
 
-def get_session(
-
+def get_session(
+    refresh_token: str,
+    base_url: Optional[str] = None,
+    client_type: Optional[str] = None,
+    experiment: Optional[str] = None,
+) -> _Session:
+    return _Session.get_session(
+        refresh_token=refresh_token,
+        base_url=base_url,
+        client_type=client_type,
+        experiment=experiment,
+    )
 
 
 def push_app_version(uri: str, path: str) -> _BioLibApp:
@@ -64,3 +75,7 @@ def create_data_record(
         data_path=data_path,
         record_type=record_type,
     )
+
+
+def get_index(uri: str) -> _Index:
+    return _Index.get_by_uri(uri)
biolib/typing_utils.py
CHANGED
@@ -1,2 +1,2 @@
 # TODO: Deprecate and later remove this file
-from biolib.
+from biolib._shared.types.typing import *  # pylint: disable=wildcard-import, unused-wildcard-import
biolib/utils/cache_state.py
CHANGED
@@ -2,7 +2,7 @@ import os
 import abc
 import json
 import time
-from datetime import datetime
+from datetime import datetime, timezone
 
 import appdirs  # type: ignore
 
@@ -88,4 +88,4 @@ class CacheState(abc.ABC, Generic[StateType]):
 
     @staticmethod
     def get_timestamp_now() -> str:
-        return datetime.utcnow().isoformat()
+        return datetime.now(timezone.utc).isoformat()
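
A recurring change across this release (here and in job.py) is replacing naive datetime.utcnow() with timezone-aware datetime.now(timezone.utc). A minimal sketch of the difference, assuming nothing beyond the standard library:

from datetime import datetime, timezone

naive = datetime.utcnow()            # tzinfo is None; deprecated since Python 3.12,
                                     # and ordering comparisons against aware datetimes raise TypeError
aware = datetime.now(timezone.utc)   # tzinfo is timezone.utc

print(naive.tzinfo, aware.tzinfo)    # None UTC
print(aware.isoformat())             # e.g. 2024-01-01T12:00:00+00:00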
biolib/utils/seq_util.py
CHANGED
{pybiolib-1.2.1056.dist-info → pybiolib-1.2.1642.dist-info}/METADATA
CHANGED
@@ -1,8 +1,9 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: pybiolib
-Version: 1.2.1056
+Version: 1.2.1642
 Summary: BioLib Python Client
 License: MIT
+License-File: LICENSE
 Keywords: biolib
 Author: biolib
 Author-email: hello@biolib.com
@@ -17,6 +18,7 @@ Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
+Classifier: Programming Language :: Python :: 3.14
 Provides-Extra: compute-node
 Requires-Dist: appdirs (>=1.4.3)
 Requires-Dist: click (>=8.0.0)