pybiolib 1.2.1056__py3-none-any.whl → 1.2.1727__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of pybiolib might be problematic.

Files changed (93)
  1. biolib/__init__.py +33 -10
  2. biolib/_data_record/data_record.py +103 -26
  3. biolib/_index/__init__.py +0 -0
  4. biolib/_index/index.py +51 -0
  5. biolib/_index/types.py +7 -0
  6. biolib/_internal/data_record/data_record.py +1 -1
  7. biolib/_internal/data_record/push_data.py +65 -16
  8. biolib/_internal/data_record/remote_storage_endpoint.py +3 -3
  9. biolib/_internal/file_utils.py +7 -4
  10. biolib/_internal/index/__init__.py +1 -0
  11. biolib/_internal/index/index.py +18 -0
  12. biolib/_internal/lfs/cache.py +4 -2
  13. biolib/_internal/push_application.py +89 -23
  14. biolib/_internal/runtime.py +2 -0
  15. biolib/_internal/templates/gui_template/App.tsx +38 -2
  16. biolib/_internal/templates/gui_template/Dockerfile +2 -0
  17. biolib/_internal/templates/gui_template/biolib-sdk.ts +37 -0
  18. biolib/_internal/templates/gui_template/dev-data/output.json +7 -0
  19. biolib/_internal/templates/gui_template/package.json +1 -0
  20. biolib/_internal/templates/gui_template/vite-plugin-dev-data.ts +49 -0
  21. biolib/_internal/templates/gui_template/vite.config.mts +2 -1
  22. biolib/_internal/templates/init_template/.github/workflows/biolib.yml +6 -1
  23. biolib/_internal/templates/init_template/Dockerfile +2 -0
  24. biolib/_internal/utils/__init__.py +40 -0
  25. biolib/_internal/utils/auth.py +46 -0
  26. biolib/_internal/utils/job_url.py +33 -0
  27. biolib/_runtime/runtime.py +9 -0
  28. biolib/_session/session.py +7 -5
  29. biolib/_shared/__init__.py +0 -0
  30. biolib/_shared/types/__init__.py +74 -0
  31. biolib/_shared/types/resource.py +37 -0
  32. biolib/_shared/types/resource_deploy_key.py +11 -0
  33. biolib/{_internal → _shared}/types/resource_version.py +8 -2
  34. biolib/_shared/utils/__init__.py +7 -0
  35. biolib/_shared/utils/resource_uri.py +75 -0
  36. biolib/api/client.py +3 -47
  37. biolib/app/app.py +57 -33
  38. biolib/biolib_api_client/api_client.py +3 -47
  39. biolib/biolib_api_client/app_types.py +1 -6
  40. biolib/biolib_api_client/biolib_app_api.py +17 -0
  41. biolib/biolib_binary_format/module_input.py +8 -0
  42. biolib/biolib_binary_format/remote_endpoints.py +3 -3
  43. biolib/biolib_binary_format/remote_stream_seeker.py +39 -25
  44. biolib/cli/__init__.py +2 -1
  45. biolib/cli/data_record.py +82 -0
  46. biolib/cli/index.py +32 -0
  47. biolib/cli/init.py +39 -1
  48. biolib/cli/lfs.py +1 -1
  49. biolib/cli/run.py +8 -5
  50. biolib/cli/start.py +14 -1
  51. biolib/compute_node/job_worker/executors/docker_executor.py +31 -9
  52. biolib/compute_node/job_worker/executors/docker_types.py +1 -1
  53. biolib/compute_node/job_worker/executors/types.py +6 -5
  54. biolib/compute_node/job_worker/job_worker.py +149 -93
  55. biolib/compute_node/job_worker/large_file_system.py +2 -6
  56. biolib/compute_node/job_worker/network_alloc.py +99 -0
  57. biolib/compute_node/job_worker/network_buffer.py +240 -0
  58. biolib/compute_node/job_worker/utilization_reporter_thread.py +2 -2
  59. biolib/compute_node/remote_host_proxy.py +139 -79
  60. biolib/compute_node/utils.py +2 -0
  61. biolib/compute_node/webserver/compute_node_results_proxy.py +188 -0
  62. biolib/compute_node/webserver/proxy_utils.py +28 -0
  63. biolib/compute_node/webserver/webserver.py +64 -19
  64. biolib/experiments/experiment.py +111 -16
  65. biolib/jobs/job.py +119 -29
  66. biolib/jobs/job_result.py +70 -33
  67. biolib/jobs/types.py +1 -0
  68. biolib/sdk/__init__.py +17 -2
  69. biolib/typing_utils.py +1 -1
  70. biolib/utils/cache_state.py +2 -2
  71. biolib/utils/multipart_uploader.py +24 -18
  72. biolib/utils/seq_util.py +1 -1
  73. pybiolib-1.2.1727.dist-info/METADATA +41 -0
  74. {pybiolib-1.2.1056.dist-info → pybiolib-1.2.1727.dist-info}/RECORD +103 -85
  75. {pybiolib-1.2.1056.dist-info → pybiolib-1.2.1727.dist-info}/WHEEL +1 -1
  76. pybiolib-1.2.1727.dist-info/entry_points.txt +2 -0
  77. biolib/_internal/types/__init__.py +0 -6
  78. biolib/_internal/types/resource.py +0 -18
  79. biolib/utils/app_uri.py +0 -57
  80. pybiolib-1.2.1056.dist-info/METADATA +0 -50
  81. pybiolib-1.2.1056.dist-info/entry_points.txt +0 -3
  82. /biolib/{_internal → _shared}/types/account.py +0 -0
  83. /biolib/{_internal → _shared}/types/account_member.py +0 -0
  84. /biolib/{_internal → _shared}/types/app.py +0 -0
  85. /biolib/{_internal → _shared}/types/data_record.py +0 -0
  86. /biolib/{_internal → _shared}/types/experiment.py +0 -0
  87. /biolib/{_internal → _shared}/types/file_node.py +0 -0
  88. /biolib/{_internal → _shared}/types/push.py +0 -0
  89. /biolib/{_internal → _shared}/types/resource_permission.py +0 -0
  90. /biolib/{_internal → _shared}/types/result.py +0 -0
  91. /biolib/{_internal → _shared}/types/typing.py +0 -0
  92. /biolib/{_internal → _shared}/types/user.py +0 -0
  93. {pybiolib-1.2.1056.dist-info → pybiolib-1.2.1727.dist-info/licenses}/LICENSE +0 -0
biolib/jobs/job.py CHANGED
@@ -2,7 +2,7 @@ import base64
  import sys
  import time
  from collections import OrderedDict
- from datetime import datetime, timedelta
+ from datetime import datetime, timedelta, timezone
  from pathlib import Path
  from urllib.parse import urlparse

@@ -10,7 +10,8 @@ import biolib.api.client
  from biolib import utils
  from biolib._internal.http_client import HttpClient
  from biolib._internal.tree_utils import build_tree_from_files, build_tree_str
- from biolib._internal.utils import open_browser_window_from_notebook
+ from biolib._internal.utils import PathFilter, filter_lazy_loaded_files, open_browser_window_from_notebook
+ from biolib._shared.utils import parse_resource_uri
  from biolib.api.client import ApiClient
  from biolib.biolib_api_client import BiolibApiClient, CreatedJobDict
  from biolib.biolib_api_client.biolib_app_api import BiolibAppApi
@@ -18,16 +19,16 @@ from biolib.biolib_api_client.biolib_job_api import BiolibJobApi
  from biolib.biolib_binary_format import LazyLoadedFile, ModuleInput, ModuleInputDict, ModuleOutputV2
  from biolib.biolib_binary_format.remote_endpoints import RemoteJobStorageEndpoint
  from biolib.biolib_binary_format.stdout_and_stderr import StdoutAndStderr
+ from biolib.biolib_binary_format.utils import InMemoryIndexableBuffer
  from biolib.biolib_errors import BioLibError, CloudJobFinishedError
  from biolib.biolib_logging import logger, logger_no_user_data
  from biolib.compute_node.job_worker.job_storage import JobStorage
  from biolib.compute_node.utils import SystemExceptionCodeMap, SystemExceptionCodes
- from biolib.jobs.job_result import JobResult, PathFilter
+ from biolib.jobs.job_result import JobResult
  from biolib.jobs.types import CloudJobDict, CloudJobStartedDict, JobDict
  from biolib.tables import BioLibTable
- from biolib.typing_utils import Dict, List, Optional, Union, cast
+ from biolib.typing_utils import Dict, Generator, List, Optional, Tuple, Union, cast
  from biolib.utils import IS_RUNNING_IN_NOTEBOOK
- from biolib.utils.app_uri import parse_app_uri


  class Result:
@@ -49,7 +50,7 @@ class Result:
          self._auth_token: str = job_dict['auth_token']

          self._job_dict: JobDict = job_dict
-         self._job_dict_last_fetched_at: datetime = datetime.utcnow()
+         self._job_dict_last_fetched_at: datetime = datetime.now(timezone.utc)
          self._result: Optional[JobResult] = None
          self._cached_input_arguments: Optional[List[str]] = None

@@ -187,6 +188,45 @@ class Result:
          """
          return self.result.list_output_files(path_filter=path_filter)

+     def list_input_files(
+         self,
+         path_filter: Optional[PathFilter] = None,
+     ) -> List[LazyLoadedFile]:
+         """List input files from the result.
+
+         Args:
+             path_filter (PathFilter, optional): Filter to apply to the input files.
+                 Can be a string glob pattern or a callable that takes a path string and returns a boolean.
+
+         Returns:
+             List[LazyLoadedFile]: List of input files.
+
+         Example::
+             >>> result = biolib.get_result("result_id")
+             >>> input_files = result.list_input_files()
+             >>> # Filter files with a glob pattern
+             >>> input_files = result.list_input_files("*.txt")
+         """
+         presigned_download_url = BiolibJobApi.get_job_storage_download_url(
+             job_uuid=self.id,
+             job_auth_token=self._auth_token,
+             storage_type='input',
+         )
+         response = HttpClient.request(url=presigned_download_url)
+         module_input_serialized: bytes = response.content
+         module_input = ModuleInput(module_input_serialized).deserialize()
+
+         files = []
+         for path, data in module_input['files'].items():
+             buffer = InMemoryIndexableBuffer(data)
+             lazy_file = LazyLoadedFile(path=path, buffer=buffer, start=0, length=len(data))
+             files.append(lazy_file)
+
+         if not path_filter:
+             return files
+
+         return filter_lazy_loaded_files(files, path_filter)
+
      def get_output_file(self, filename: str) -> LazyLoadedFile:
          return self.result.get_output_file(filename=filename)

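The new `list_input_files` above accepts the same `PathFilter` union as the output-file helpers, so a callable works in place of a glob string. A minimal sketch of the callable form; the result ID is a placeholder:

```python
import biolib

result = biolib.get_result('result_id')  # placeholder ID

# Glob strings and callables are interchangeable as path filters:
fasta_inputs = result.list_input_files(lambda path: path.endswith(('.fasta', '.fa')))
for input_file in fasta_inputs:
    print(input_file.path)
```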
@@ -256,14 +296,44 @@
          self,
          output_dir: str,
          path_filter: Optional[PathFilter] = None,
-         skip_file_if_exists: Optional[bool] = None,
+         skip_file_if_exists: bool = False,
          overwrite: bool = False,
+         flat: bool = False,
      ) -> None:
+         """Save output files from the result to a local directory.
+
+         Args:
+             output_dir (str): Directory path where files will be saved.
+             path_filter (PathFilter, optional): Filter to apply to output files.
+                 Can be a string glob pattern or a callable that takes a path and returns a boolean.
+             skip_file_if_exists (bool, optional): If True, skip files that already exist locally.
+                 Defaults to False.
+             overwrite (bool, optional): If True, overwrite existing files by renaming them with a timestamp.
+                 Defaults to False.
+             flat (bool, optional): If True, save all files directly to output_dir using only their basenames,
+                 without creating subdirectories. When enabled, raises an error if duplicate basenames exist
+                 in the filtered output or if any basename already exists in output_dir. Defaults to False.
+
+         Raises:
+             BioLibError: If flat=True and duplicate basenames are found in filtered output.
+             BioLibError: If flat=True and a file with the same basename already exists in output_dir.
+             BioLibError: If a file already exists and neither skip_file_if_exists nor overwrite is True.
+
+         Example::
+             >>> result = biolib.get_result("result_id")
+             >>> # Save all files preserving directory structure
+             >>> result.save_files("./output")
+             >>> # Save files flat without subdirectories
+             >>> result.save_files("./output", flat=True)
+             >>> # Save only specific files
+             >>> result.save_files("./output", path_filter="*.txt")
+         """
          self.result.save_files(
              output_dir=output_dir,
              path_filter=path_filter,
              skip_file_if_exists=skip_file_if_exists,
              overwrite=overwrite,
+             flat=flat,
          )

      def get_status(self) -> str:
@@ -358,6 +428,7 @@ class Result:
          >>> # Recompute with different arguments
          >>> new_result = result.recompute(arguments=["--new-arg", "value"])
          """
+         self._refetch_job_dict()
          app_response = BiolibAppApi.get_by_uri(uri=app_uri or self._job_dict['app_uri'])

          job_storage_input = RemoteJobStorageEndpoint(
@@ -385,6 +456,7 @@
              app_uri=app_response['app_uri'],
              app_version_uuid=app_response['app_version']['public_id'],
              module_input_serialized=module_input_serialized,
+             override_command=self._job_dict['arguments_override_command'],
              machine=machine if machine else original_requested_machine,
          )
          if blocking:
@@ -451,16 +523,11 @@
          return Result(job_dict)

      @staticmethod
-     def print_logs_packages(stdout_and_stderr_packages_b64):
+     def _yield_logs_packages(stdout_and_stderr_packages_b64) -> Generator[Tuple[str, bytes], None, None]:
          for stdout_and_stderr_package_b64 in stdout_and_stderr_packages_b64:
              stdout_and_stderr_package = base64.b64decode(stdout_and_stderr_package_b64)
              stdout_and_stderr = StdoutAndStderr(stdout_and_stderr_package).deserialize()
-
-             sys.stdout.write(stdout_and_stderr.decode())
-             if not IS_RUNNING_IN_NOTEBOOK:  # for some reason flushing in jupyter notebooks breaks \r handling
-                 sys.stdout.flush()
-         # flush after having processed all packages
-         sys.stdout.flush()
+             yield ('stdout', stdout_and_stderr)

      def show(self) -> None:
          self._refetch_job_dict()
@@ -470,21 +537,44 @@
              title=f'Result: {self._uuid}',
          ).print_table()

-     def stream_logs(self) -> None:
+     def stream_logs(self, as_iterator: bool = False):
+         if as_iterator:
+             return self._iter_logs()
          self._stream_logs()
+         return None

      def _stream_logs(self, enable_print: bool = True) -> None:
+         try:
+             for stream_type, data in self._iter_logs(enable_print=enable_print):
+                 if stream_type == 'stdout':
+                     if IS_RUNNING_IN_NOTEBOOK:
+                         sys.stdout.write(data.decode(encoding='utf-8', errors='replace'))
+                         # Note: we avoid flush() in notebook as that breaks \r handling
+                     else:
+                         sys.stdout.buffer.write(data)
+                         sys.stdout.buffer.flush()
+                 elif stream_type == 'stderr':
+                     if IS_RUNNING_IN_NOTEBOOK:
+                         sys.stderr.write(data.decode(encoding='utf-8', errors='replace'))
+                         # Note: we avoid flush() in notebook as that breaks \r handling
+                     else:
+                         sys.stderr.buffer.write(data)
+                         sys.stderr.buffer.flush()
+         finally:
+             # Flush after having processed all packages
+             if IS_RUNNING_IN_NOTEBOOK:
+                 sys.stdout.flush()
+                 sys.stderr.flush()
+
+     def _iter_logs(self, enable_print: bool = True) -> Generator[Tuple[str, bytes], None, None]:
          try:
              cloud_job = self._get_cloud_job_awaiting_started()
          except CloudJobFinishedError:
              logger.info(f'--- The result {self.id} has already completed (no streaming will take place) ---')
              logger.info('--- The stdout log is printed below: ---')
-             sys.stdout.flush()
-             print(self.get_stdout().decode(), file=sys.stdout)
-             sys.stdout.flush()
+             yield ('stdout', self.get_stdout())
              logger.info('--- The stderr log is printed below: ---')
-             print(self.get_stderr().decode(), file=sys.stderr)
-             sys.stderr.flush()
+             yield ('stderr', self.get_stderr())
              logger.info(f'--- The job {self.id} has already completed. Its output was printed above. ---')
              return

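With `as_iterator=True`, the reworked `stream_logs` hands back the `(stream_type, data)` tuples from `_iter_logs` instead of writing to the terminal, so callers can route stdout and stderr themselves. A minimal consumption sketch, assuming `result` was obtained as in the examples above:

```python
# Consume streamed logs as (stream_type, bytes) tuples instead of printing them.
for stream_type, data in result.stream_logs(as_iterator=True):
    text = data.decode('utf-8', errors='replace')
    if stream_type == 'stderr':
        print(f'[stderr] {text}', end='')
    else:
        print(text, end='')
```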
@@ -496,7 +586,7 @@
              logger_no_user_data.debug(f'Using cloud proxy URL from env var BIOLIB_CLOUD_BASE_URL: {compute_node_url}')

          if enable_print:
-             self._print_full_logs(node_url=compute_node_url)
+             yield from self._yield_full_logs(node_url=compute_node_url)

          final_status_messages: List[str] = []
          while True:
@@ -515,8 +605,8 @@
              # Print the status before writing stdout and stderr
              logger.info(f'Cloud: {status_update["log_message"]}')

-             if 'stdout_and_stderr_packages_b64' and enable_print:
-                 self.print_logs_packages(status_json['stdout_and_stderr_packages_b64'])
+             if enable_print:
+                 yield from self._yield_logs_packages(status_json['stdout_and_stderr_packages_b64'])

              if 'error_code' in status_json:
                  error_code = status_json['error_code']
@@ -533,7 +623,7 @@

          self.wait()  # Wait for compute node to tell the backend that the job is finished

-     def _print_full_logs(self, node_url: str) -> None:
+     def _yield_full_logs(self, node_url: str) -> Generator[Tuple[str, bytes], None, None]:
          try:
              response_json = HttpClient.request(url=f'{node_url}/v1/job/{self._uuid}/status/?logs=full').json()
          except Exception as error:
@@ -543,7 +633,7 @@
          for status_update in response_json.get('previous_status_updates', []):
              logger.info(f'Cloud: {status_update["log_message"]}')

-         self.print_logs_packages(response_json['streamed_logs_packages_b64'])
+         yield from self._yield_logs_packages(response_json['streamed_logs_packages_b64'])

      def _get_cloud_job_awaiting_started(self) -> CloudJobStartedDict:
          retry_count = 0
@@ -589,11 +679,11 @@
          )

      def _refetch_job_dict(self, force_refetch: Optional[bool] = False) -> None:
-         if not force_refetch and self._job_dict_last_fetched_at > datetime.utcnow() - timedelta(seconds=2):
+         if not force_refetch and self._job_dict_last_fetched_at > datetime.now(timezone.utc) - timedelta(seconds=2):
              return

          self._job_dict = self._get_job_dict(self._uuid, self._auth_token)
-         self._job_dict_last_fetched_at = datetime.utcnow()
+         self._job_dict_last_fetched_at = datetime.now(timezone.utc)

      @staticmethod
      def _start_job_in_cloud(
@@ -612,8 +702,8 @@
      ) -> 'Result':
          if len(module_input_serialized) < 500_000 and temporary_client_secrets is None:
              _job_dict = BiolibJobApi.create_job_with_data(
-                 app_resource_name_prefix=parse_app_uri(app_uri)['resource_name_prefix'],
                  app_version_uuid=app_version_uuid,
+                 app_resource_name_prefix=parse_resource_uri(app_uri)['resource_prefix'],
                  arguments_override_command=override_command,
                  experiment_uuid=experiment_id,
                  module_input_serialized=module_input_serialized,
@@ -627,8 +717,8 @@
              return Result(cast(JobDict, _job_dict))

          job_dict: CreatedJobDict = BiolibJobApi.create(
-             app_resource_name_prefix=parse_app_uri(app_uri)['resource_name_prefix'],
              app_version_id=app_version_uuid,
+             app_resource_name_prefix=parse_resource_uri(app_uri)['resource_prefix'],
              experiment_uuid=experiment_id,
              machine=machine,
              notify=notify,
biolib/jobs/job_result.py CHANGED
@@ -1,16 +1,14 @@
  import time
- from fnmatch import fnmatch
  from pathlib import Path

+ from biolib._internal.utils import PathFilter, filter_lazy_loaded_files
  from biolib.biolib_binary_format import ModuleOutputV2
  from biolib.biolib_binary_format.remote_endpoints import RemoteJobStorageEndpoint
  from biolib.biolib_binary_format.remote_stream_seeker import StreamSeeker
  from biolib.biolib_binary_format.utils import LazyLoadedFile, RemoteIndexableBuffer
  from biolib.biolib_errors import BioLibError
  from biolib.biolib_logging import logger
- from biolib.typing_utils import Callable, List, Optional, Union, cast
-
- PathFilter = Union[str, Callable[[str], bool]]
+ from biolib.typing_utils import Dict, List, Optional


  class JobResult:
@@ -38,18 +36,68 @@ class JobResult:
          self,
          output_dir: str,
          path_filter: Optional[PathFilter] = None,
-         skip_file_if_exists: Optional[bool] = None,
+         skip_file_if_exists: bool = False,
          overwrite: bool = False,
+         flat: bool = False,
      ) -> None:
          module_output = self._get_module_output()
          output_files = module_output.get_files()
-         filtered_output_files = self._get_filtered_files(output_files, path_filter) if path_filter else output_files
+         filtered_output_files = filter_lazy_loaded_files(output_files, path_filter) if path_filter else output_files

          if len(filtered_output_files) == 0:
              logger.debug('No output files to save')
              return

-         total_files_data_to_download_in_bytes = sum(file.length for file in filtered_output_files)
+         if flat:
+             basename_to_paths: Dict[str, List[str]] = {}
+             for file in filtered_output_files:
+                 basename = Path(file.path).name
+                 if basename not in basename_to_paths:
+                     basename_to_paths[basename] = []
+                 basename_to_paths[basename].append(file.path)
+
+             duplicates = {basename: paths for basename, paths in basename_to_paths.items() if len(paths) > 1}
+
+             if duplicates:
+                 max_shown = 3
+                 error_parts = []
+                 sorted_basenames = sorted(duplicates.keys())
+
+                 for basename in sorted_basenames[:max_shown]:
+                     paths = duplicates[basename]
+                     error_parts.append(f' {basename}: ({", ".join(paths)})')
+
+                 error_message = 'Cannot save files in flat mode: duplicate filenames detected:\n' + '\n'.join(
+                     error_parts
+                 )
+
+                 if len(duplicates) > max_shown:
+                     remaining = len(duplicates) - max_shown
+                     error_message += f'\n (and {remaining} more)'
+
+                 raise BioLibError(error_message)
+
+         major_gap_threshold = 50_000
+         n = len(filtered_output_files)
+
+         next_break_end = [0] * n
+         if n > 0:
+             next_break_end[n - 1] = filtered_output_files[n - 1].start + filtered_output_files[n - 1].length
+         for i in range(n - 2, -1, -1):
+             end_i = filtered_output_files[i].start + filtered_output_files[i].length
+             gap = filtered_output_files[i + 1].start - end_i
+             if gap >= major_gap_threshold:
+                 next_break_end[i] = end_i
+             else:
+                 next_break_end[i] = next_break_end[i + 1]
+
+         total_files_data_to_download_in_bytes = 0
+         file_read_ahead_map = {}
+         for i, file in enumerate(filtered_output_files):
+             total_files_data_to_download_in_bytes += file.length
+             end_i = file.start + file.length
+             read_ahead_bytes = max(0, next_break_end[i] - end_i)
+             file_read_ahead_map[i] = read_ahead_bytes

          # Assume files are in order
          first_file = filtered_output_files[0]
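The backward scan above decides how far each download may read past its file's end: files separated by less than `major_gap_threshold` bytes are treated as one contiguous run served by a single read, while gaps of 50 KB or more break the run so they are never downloaded. A standalone sketch of the same computation on hypothetical `(start, length)` offsets:

```python
# Hypothetical (start, length) pairs; the third file sits beyond a >= 50 KB gap.
files = [(0, 10_000), (10_500, 20_000), (90_000, 5_000)]
major_gap_threshold = 50_000

n = len(files)
next_break_end = [0] * n
next_break_end[n - 1] = files[n - 1][0] + files[n - 1][1]
for i in range(n - 2, -1, -1):
    end_i = files[i][0] + files[i][1]
    gap = files[i + 1][0] - end_i
    next_break_end[i] = end_i if gap >= major_gap_threshold else next_break_end[i + 1]

read_ahead = [max(0, next_break_end[i] - (f[0] + f[1])) for i, f in enumerate(files)]
print(read_ahead)  # [20500, 0, 0] -> file 0 reads ahead through file 1; the large gap is skipped
```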
@@ -57,14 +105,18 @@
          stream_seeker = StreamSeeker(
              files_data_start=first_file.start,
              files_data_end=last_file.start + last_file.length,
-             download_chunk_size_in_bytes=min(total_files_data_to_download_in_bytes, 10_000_000),
+             max_chunk_size=min(total_files_data_to_download_in_bytes, 10_000_000),
              upstream_buffer=module_output.buffer,
          )

          logger.info(f'Saving {len(filtered_output_files)} files to {output_dir}...')
-         for file in filtered_output_files:
-             # Remove leading slash of file_path
-             destination_file_path = Path(output_dir) / Path(file.path.lstrip('/'))
+         for file_index, file in enumerate(filtered_output_files):
+             if flat:
+                 destination_file_path = Path(output_dir) / Path(file.path).name
+             else:
+                 # Remove leading slash of file_path
+                 destination_file_path = Path(output_dir) / Path(file.path.lstrip('/'))
+
              if destination_file_path.exists():
                  if skip_file_if_exists:
                      print(f'Skipping {destination_file_path} as a file with that name already exists locally.')
@@ -91,8 +143,12 @@
              file_start += data_already_downloaded
              data_to_download -= data_already_downloaded

+             read_ahead_bytes = file_read_ahead_map[file_index]
+
              with open(partial_path, mode='ab') as partial_file:
-                 for chunk in stream_seeker.seek_and_read(file_start=file_start, file_length=data_to_download):
+                 for chunk in stream_seeker.seek_and_read(
+                     file_start=file_start, file_length=data_to_download, read_ahead_bytes=read_ahead_bytes
+                 ):
                      partial_file.write(chunk)

              # rename partial file to actual file name
@@ -100,7 +156,7 @@

      def get_output_file(self, filename) -> LazyLoadedFile:
          files = self._get_module_output().get_files()
-         filtered_files = self._get_filtered_files(files, path_filter=filename)
+         filtered_files = filter_lazy_loaded_files(files, path_filter=filename)
          if not filtered_files:
              raise BioLibError(f'File {filename} not found in results.')

@@ -114,26 +170,7 @@
          if not path_filter:
              return files

-         return self._get_filtered_files(files, path_filter)
-
-     @staticmethod
-     def _get_filtered_files(files: List[LazyLoadedFile], path_filter: PathFilter) -> List[LazyLoadedFile]:
-         if not (isinstance(path_filter, str) or callable(path_filter)):
-             raise Exception('Expected path_filter to be a string or a function')
-
-         if callable(path_filter):
-             return list(filter(lambda x: path_filter(x.path), files))  # type: ignore
-
-         glob_filter = cast(str, path_filter)
-
-         # since all file paths start with /, make sure filter does too
-         if not glob_filter.startswith('/'):
-             glob_filter = '/' + glob_filter
-
-         def _filter_function(file: LazyLoadedFile) -> bool:
-             return fnmatch(file.path, glob_filter)
-
-         return list(filter(_filter_function, files))
+         return filter_lazy_loaded_files(files, path_filter)

      def _get_module_output(self) -> ModuleOutputV2:
          if self._module_output is None:
biolib/jobs/types.py CHANGED
@@ -26,6 +26,7 @@ class Result(TypedDict):

  class JobDict(TypedDict):
      app_uri: str
+     arguments_override_command: bool
      auth_token: str
      created_at: str
      ended_at: Optional[str]
biolib/sdk/__init__.py CHANGED
@@ -2,6 +2,7 @@ from typing import Optional

  # Imports to hide and use as private internal utils
  from biolib._data_record.data_record import DataRecord as _DataRecord
+ from biolib._index.index import Index as _Index
  from biolib._internal.push_application import push_application as _push_application
  from biolib._internal.push_application import set_app_version_as_active as _set_app_version_as_active
  from biolib._runtime.runtime import Runtime as _Runtime
@@ -12,8 +13,18 @@ from biolib.app import BioLibApp as _BioLibApp
  Runtime = _Runtime


- def get_session(refresh_token: str, base_url: Optional[str] = None, client_type: Optional[str] = None) -> _Session:
-     return _Session.get_session(refresh_token=refresh_token, base_url=base_url, client_type=client_type)
+ def get_session(
+     refresh_token: str,
+     base_url: Optional[str] = None,
+     client_type: Optional[str] = None,
+     experiment: Optional[str] = None,
+ ) -> _Session:
+     return _Session.get_session(
+         refresh_token=refresh_token,
+         base_url=base_url,
+         client_type=client_type,
+         experiment=experiment,
+     )


  def push_app_version(uri: str, path: str) -> _BioLibApp:
@@ -64,3 +75,7 @@ def create_data_record(
          data_path=data_path,
          record_type=record_type,
      )
+
+
+ def get_index(uri: str) -> _Index:
+     return _Index.get_by_uri(uri)
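A minimal usage sketch for the new SDK entry point; the URI below is a made-up `account/name` identifier:

```python
from biolib import sdk

index = sdk.get_index('my-account/my-index')  # hypothetical resource URI
```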
biolib/typing_utils.py CHANGED
@@ -1,2 +1,2 @@
  # TODO: Deprecate and later remove this file
- from biolib._internal.types.typing import *  # pylint: disable=wildcard-import, unused-wildcard-import
+ from biolib._shared.types.typing import *  # pylint: disable=wildcard-import, unused-wildcard-import
biolib/utils/cache_state.py CHANGED
@@ -2,7 +2,7 @@ import os
  import abc
  import json
  import time
- from datetime import datetime
+ from datetime import datetime, timezone

  import appdirs  # type: ignore

@@ -88,4 +88,4 @@ class CacheState(abc.ABC, Generic[StateType]):

      @staticmethod
      def get_timestamp_now() -> str:
-         return datetime.utcnow().isoformat()
+         return datetime.now(timezone.utc).isoformat()
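The `utcnow()` replacements here, and in `job.py` above, swap naive datetimes for timezone-aware ones; `datetime.utcnow()` is also deprecated as of Python 3.12. A quick standard-library illustration of the difference:

```python
from datetime import datetime, timezone

naive = datetime.utcnow()           # no tzinfo attached
aware = datetime.now(timezone.utc)  # tzinfo is timezone.utc

print(naive.tzinfo, aware.tzinfo)   # None UTC
# Ordering comparisons between the two raise:
# TypeError: can't compare offset-naive and offset-aware datetimes
```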
biolib/utils/multipart_uploader.py CHANGED
@@ -10,7 +10,7 @@ from biolib._internal.http_client import HttpClient
  from biolib.biolib_api_client import BiolibApiClient
  from biolib.biolib_errors import BioLibError
  from biolib.biolib_logging import logger, logger_no_user_data
- from biolib.typing_utils import TypedDict, List, Iterator, Tuple, Optional, Dict
+ from biolib.typing_utils import Callable, Dict, Iterator, List, Optional, Tuple, TypedDict


  def get_chunk_iterator_from_bytes(byte_buffer: bytes, chunk_size_in_bytes: int = 50_000_000) -> Iterator[bytes]:
@@ -45,19 +45,20 @@ _UploadChunkReturnType = Tuple[_PartMetadata, int]


  class MultiPartUploader:
-
      def __init__(
-             self,
-             complete_upload_request: RequestOptions,
-             get_presigned_upload_url_request: RequestOptions,
-             start_multipart_upload_request: Optional[RequestOptions] = None,
-             use_process_pool: Optional[bool] = None,
+         self,
+         complete_upload_request: RequestOptions,
+         get_presigned_upload_url_request: RequestOptions,
+         start_multipart_upload_request: Optional[RequestOptions] = None,
+         use_process_pool: Optional[bool] = None,
+         on_progress: Optional[Callable[[int, int], None]] = None,
      ):
          self._complete_upload_request = complete_upload_request
          self._get_presigned_upload_url_request = get_presigned_upload_url_request
          self._start_multipart_upload_request = start_multipart_upload_request
          self._bytes_uploaded: int = 0
          self._use_process_pool = use_process_pool
+         self._on_progress = on_progress

      def upload(self, payload_iterator: Iterator[bytes], payload_size_in_bytes: int) -> None:
          parts: List[_PartMetadata] = []
@@ -85,21 +86,22 @@ class MultiPartUploader:
                  self._update_progress_bar_and_parts(
                      upload_chunk_response=upload_chunk_response,
                      parts=parts,
-                     payload_size_in_bytes=payload_size_in_bytes
+                     payload_size_in_bytes=payload_size_in_bytes,
                  )
          else:
              # use 16 cores, unless less is available
              pool_size = min(16, multiprocessing.cpu_count() - 1)
-             process_pool = multiprocessing.Pool(pool_size) if self._use_process_pool else \
-                 multiprocessing.pool.ThreadPool(pool_size)
+             process_pool = (
+                 multiprocessing.Pool(pool_size)
+                 if self._use_process_pool
+                 else multiprocessing.pool.ThreadPool(pool_size)
+             )

              try:
                  response: _UploadChunkReturnType
                  for response in process_pool.imap(self._upload_chunk, iterator_with_index):
                      self._update_progress_bar_and_parts(
-                         upload_chunk_response=response,
-                         parts=parts,
-                         payload_size_in_bytes=payload_size_in_bytes
+                         upload_chunk_response=response, parts=parts, payload_size_in_bytes=payload_size_in_bytes
                      )
              finally:
                  logger_no_user_data.debug('Multipart upload closing process pool...')
@@ -148,8 +150,9 @@ class MultiPartUploader:
          if app_caller_proxy_job_storage_base_url:
              # Done to hit App Caller Proxy when uploading result from inside an app
              parsed_url = urlparse(presigned_upload_url)
-             presigned_upload_url = \
-                 f'{app_caller_proxy_job_storage_base_url}{parsed_url.path}?{parsed_url.query}'
+             presigned_upload_url = (
+                 f'{app_caller_proxy_job_storage_base_url}{parsed_url.path}?{parsed_url.query}'
+             )

          put_chunk_response = HttpClient.request(
              url=presigned_upload_url,
@@ -169,10 +172,10 @@ class MultiPartUploader:
              raise BioLibError(f'Max retries hit, when uploading part {part_number}. Exiting...')

      def _update_progress_bar_and_parts(
-             self,
-             upload_chunk_response: _UploadChunkReturnType,
-             parts: List[_PartMetadata],
-             payload_size_in_bytes: int,
+         self,
+         upload_chunk_response: _UploadChunkReturnType,
+         parts: List[_PartMetadata],
+         payload_size_in_bytes: int,
      ) -> None:
          part_metadata, chunk_byte_length = upload_chunk_response
          part_number = part_metadata['PartNumber']
@@ -180,6 +183,9 @@ class MultiPartUploader:
          parts.append(part_metadata)
          self._bytes_uploaded += chunk_byte_length

+         if self._on_progress is not None:
+             self._on_progress(self._bytes_uploaded, payload_size_in_bytes)
+
          approx_progress_percent = min(self._bytes_uploaded / (payload_size_in_bytes + 1) * 100, 100)
          approx_rounded_progress = round(approx_progress_percent, 2)
          logger_no_user_data.debug(
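The new `on_progress` hook is invoked after each completed part with `(bytes_uploaded, payload_size_in_bytes)`, matching its `Callable[[int, int], None]` annotation. A sketch of a compatible callback; the constructor's other arguments are internal `RequestOptions` and are elided:

```python
def report_progress(bytes_uploaded: int, total_bytes: int) -> None:
    percent = bytes_uploaded / max(total_bytes, 1) * 100
    print(f'Upload progress: {percent:.1f}%')

# uploader = MultiPartUploader(..., on_progress=report_progress)
```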
biolib/utils/seq_util.py CHANGED
@@ -9,7 +9,7 @@ class SeqUtilRecord:
          self,
          sequence: str,
          sequence_id: str,
-         description: Optional['str'],
+         description: Optional['str'] = None,
          properties: Optional[Dict[str, str]] = None,
      ):
          self.sequence = sequence
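With `description` now defaulting to `None`, a record can be built from just a sequence and an ID. A tiny sketch with made-up values:

```python
from biolib.utils.seq_util import SeqUtilRecord

record = SeqUtilRecord(sequence='MKVL', sequence_id='seq-1')  # description defaults to None
```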
pybiolib-1.2.1727.dist-info/METADATA ADDED
@@ -0,0 +1,41 @@
+ Metadata-Version: 2.4
+ Name: pybiolib
+ Version: 1.2.1727
+ Summary: BioLib Python Client
+ Project-URL: Homepage, https://github.com/biolib
+ Author-email: biolib <hello@biolib.com>
+ License-Expression: MIT
+ License-File: LICENSE
+ Keywords: biolib
+ Classifier: Operating System :: OS Independent
+ Classifier: Programming Language :: Python :: 3
+ Requires-Python: >=3.6.3
+ Requires-Dist: appdirs>=1.4.3
+ Requires-Dist: click>=8.0.0
+ Requires-Dist: docker>=5.0.3
+ Requires-Dist: importlib-metadata>=1.6.1
+ Requires-Dist: pyyaml>=5.3.1
+ Requires-Dist: rich>=12.4.4
+ Requires-Dist: typing-extensions>=4.1.0; python_version < '3.11'
+ Provides-Extra: compute-node
+ Requires-Dist: flask>=2.0.1; extra == 'compute-node'
+ Requires-Dist: gunicorn>=20.1.0; extra == 'compute-node'
+ Description-Content-Type: text/markdown
+
+ # PyBioLib
+
+ PyBioLib is a Python package for running BioLib applications from Python scripts and the command line.
+
+ ### Python Example
+ ```python
+ # pip3 install -U pybiolib
+ import biolib
+ samtools = biolib.load('samtools/samtools')
+ print(samtools.cli(args='--help'))
+ ```
+
+ ### Command Line Example
+ ```bash
+ pip3 install -U pybiolib
+ biolib run samtools/samtools --help
+ ```