pybiolib 1.2.911__py3-none-any.whl → 1.2.1642__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pybiolib might be problematic. Click here for more details.

Files changed (113)
  1. biolib/__init__.py +33 -10
  2. biolib/_data_record/data_record.py +24 -11
  3. biolib/_index/index.py +51 -0
  4. biolib/_index/types.py +7 -0
  5. biolib/_internal/add_copilot_prompts.py +3 -5
  6. biolib/_internal/add_gui_files.py +59 -0
  7. biolib/_internal/data_record/data_record.py +1 -1
  8. biolib/_internal/data_record/push_data.py +1 -1
  9. biolib/_internal/data_record/remote_storage_endpoint.py +3 -3
  10. biolib/_internal/file_utils.py +48 -0
  11. biolib/_internal/index/__init__.py +1 -0
  12. biolib/_internal/index/index.py +18 -0
  13. biolib/_internal/lfs/cache.py +4 -2
  14. biolib/_internal/push_application.py +89 -23
  15. biolib/_internal/runtime.py +2 -0
  16. biolib/_internal/string_utils.py +13 -0
  17. biolib/_internal/templates/copilot_template/.github/instructions/style-react-ts.instructions.md +47 -0
  18. biolib/_internal/templates/copilot_template/.github/prompts/biolib_onboard_repo.prompt.md +19 -0
  19. biolib/_internal/templates/gui_template/.yarnrc.yml +1 -0
  20. biolib/_internal/templates/gui_template/App.tsx +53 -0
  21. biolib/_internal/templates/gui_template/Dockerfile +28 -0
  22. biolib/_internal/templates/gui_template/biolib-sdk.ts +37 -0
  23. biolib/_internal/templates/gui_template/dev-data/output.json +7 -0
  24. biolib/_internal/templates/gui_template/index.css +5 -0
  25. biolib/_internal/templates/gui_template/index.html +13 -0
  26. biolib/_internal/templates/gui_template/index.tsx +10 -0
  27. biolib/_internal/templates/gui_template/package.json +27 -0
  28. biolib/_internal/templates/gui_template/tsconfig.json +24 -0
  29. biolib/_internal/templates/gui_template/vite-plugin-dev-data.ts +49 -0
  30. biolib/_internal/templates/gui_template/vite.config.mts +9 -0
  31. biolib/_internal/templates/init_template/.biolib/config.yml +1 -0
  32. biolib/_internal/templates/init_template/.github/workflows/biolib.yml +6 -1
  33. biolib/_internal/templates/init_template/Dockerfile +2 -0
  34. biolib/_internal/templates/init_template/run.sh +1 -0
  35. biolib/_internal/templates/templates.py +9 -1
  36. biolib/_internal/utils/__init__.py +25 -0
  37. biolib/_internal/utils/job_url.py +33 -0
  38. biolib/_internal/utils/multinode.py +12 -14
  39. biolib/_runtime/runtime.py +15 -2
  40. biolib/_session/session.py +7 -5
  41. biolib/_shared/__init__.py +0 -0
  42. biolib/_shared/types/__init__.py +69 -0
  43. biolib/_shared/types/account.py +12 -0
  44. biolib/_shared/types/account_member.py +8 -0
  45. biolib/{_internal → _shared}/types/experiment.py +1 -0
  46. biolib/_shared/types/resource.py +17 -0
  47. biolib/_shared/types/resource_deploy_key.py +11 -0
  48. biolib/{_internal → _shared}/types/resource_permission.py +1 -1
  49. biolib/{_internal → _shared}/types/user.py +5 -5
  50. biolib/_shared/utils/__init__.py +7 -0
  51. biolib/_shared/utils/resource_uri.py +75 -0
  52. biolib/api/client.py +1 -1
  53. biolib/app/app.py +96 -45
  54. biolib/biolib_api_client/app_types.py +1 -0
  55. biolib/biolib_api_client/biolib_app_api.py +26 -0
  56. biolib/biolib_binary_format/module_input.py +8 -0
  57. biolib/biolib_binary_format/remote_endpoints.py +3 -3
  58. biolib/biolib_binary_format/remote_stream_seeker.py +39 -25
  59. biolib/biolib_logging.py +1 -1
  60. biolib/cli/__init__.py +2 -1
  61. biolib/cli/auth.py +4 -16
  62. biolib/cli/data_record.py +17 -0
  63. biolib/cli/index.py +32 -0
  64. biolib/cli/init.py +93 -11
  65. biolib/cli/lfs.py +1 -1
  66. biolib/cli/run.py +1 -1
  67. biolib/cli/start.py +14 -1
  68. biolib/compute_node/job_worker/executors/docker_executor.py +31 -9
  69. biolib/compute_node/job_worker/executors/docker_types.py +1 -1
  70. biolib/compute_node/job_worker/executors/types.py +6 -5
  71. biolib/compute_node/job_worker/job_storage.py +2 -1
  72. biolib/compute_node/job_worker/job_worker.py +155 -90
  73. biolib/compute_node/job_worker/large_file_system.py +2 -6
  74. biolib/compute_node/job_worker/network_alloc.py +99 -0
  75. biolib/compute_node/job_worker/network_buffer.py +240 -0
  76. biolib/compute_node/job_worker/utilization_reporter_thread.py +2 -2
  77. biolib/compute_node/remote_host_proxy.py +135 -67
  78. biolib/compute_node/utils.py +2 -0
  79. biolib/compute_node/webserver/compute_node_results_proxy.py +188 -0
  80. biolib/compute_node/webserver/proxy_utils.py +28 -0
  81. biolib/compute_node/webserver/webserver.py +64 -19
  82. biolib/experiments/experiment.py +98 -16
  83. biolib/jobs/job.py +128 -31
  84. biolib/jobs/job_result.py +73 -33
  85. biolib/jobs/types.py +1 -0
  86. biolib/sdk/__init__.py +17 -2
  87. biolib/typing_utils.py +1 -1
  88. biolib/utils/cache_state.py +2 -2
  89. biolib/utils/seq_util.py +1 -1
  90. {pybiolib-1.2.911.dist-info → pybiolib-1.2.1642.dist-info}/METADATA +4 -2
  91. pybiolib-1.2.1642.dist-info/RECORD +180 -0
  92. {pybiolib-1.2.911.dist-info → pybiolib-1.2.1642.dist-info}/WHEEL +1 -1
  93. biolib/_internal/llm_instructions/.github/instructions/style-react-ts.instructions.md +0 -22
  94. biolib/_internal/types/__init__.py +0 -6
  95. biolib/_internal/types/account.py +0 -10
  96. biolib/utils/app_uri.py +0 -57
  97. pybiolib-1.2.911.dist-info/RECORD +0 -150
  98. /biolib/{_internal/llm_instructions → _index}/__init__.py +0 -0
  99. /biolib/_internal/{llm_instructions → templates/copilot_template}/.github/instructions/general-app-knowledge.instructions.md +0 -0
  100. /biolib/_internal/{llm_instructions → templates/copilot_template}/.github/instructions/style-general.instructions.md +0 -0
  101. /biolib/_internal/{llm_instructions → templates/copilot_template}/.github/instructions/style-python.instructions.md +0 -0
  102. /biolib/_internal/{llm_instructions → templates/copilot_template}/.github/prompts/biolib_app_inputs.prompt.md +0 -0
  103. /biolib/_internal/{llm_instructions → templates/copilot_template}/.github/prompts/biolib_run_apps.prompt.md +0 -0
  104. /biolib/{_internal → _shared}/types/app.py +0 -0
  105. /biolib/{_internal → _shared}/types/data_record.py +0 -0
  106. /biolib/{_internal → _shared}/types/file_node.py +0 -0
  107. /biolib/{_internal → _shared}/types/push.py +0 -0
  108. /biolib/{_internal/types/resource.py → _shared/types/resource_types.py} +0 -0
  109. /biolib/{_internal → _shared}/types/resource_version.py +0 -0
  110. /biolib/{_internal → _shared}/types/result.py +0 -0
  111. /biolib/{_internal → _shared}/types/typing.py +0 -0
  112. {pybiolib-1.2.911.dist-info → pybiolib-1.2.1642.dist-info}/entry_points.txt +0 -0
  113. {pybiolib-1.2.911.dist-info → pybiolib-1.2.1642.dist-info/licenses}/LICENSE +0 -0
biolib/jobs/job.py CHANGED
@@ -2,7 +2,7 @@ import base64
2
2
  import sys
3
3
  import time
4
4
  from collections import OrderedDict
5
- from datetime import datetime, timedelta
5
+ from datetime import datetime, timedelta, timezone
6
6
  from pathlib import Path
7
7
  from urllib.parse import urlparse
8
8
 
@@ -10,7 +10,8 @@ import biolib.api.client
10
10
  from biolib import utils
11
11
  from biolib._internal.http_client import HttpClient
12
12
  from biolib._internal.tree_utils import build_tree_from_files, build_tree_str
13
- from biolib._internal.utils import open_browser_window_from_notebook
13
+ from biolib._internal.utils import PathFilter, filter_lazy_loaded_files, open_browser_window_from_notebook
14
+ from biolib._shared.utils import parse_resource_uri
14
15
  from biolib.api.client import ApiClient
15
16
  from biolib.biolib_api_client import BiolibApiClient, CreatedJobDict
16
17
  from biolib.biolib_api_client.biolib_app_api import BiolibAppApi
@@ -18,16 +19,16 @@ from biolib.biolib_api_client.biolib_job_api import BiolibJobApi
18
19
  from biolib.biolib_binary_format import LazyLoadedFile, ModuleInput, ModuleInputDict, ModuleOutputV2
19
20
  from biolib.biolib_binary_format.remote_endpoints import RemoteJobStorageEndpoint
20
21
  from biolib.biolib_binary_format.stdout_and_stderr import StdoutAndStderr
22
+ from biolib.biolib_binary_format.utils import InMemoryIndexableBuffer
21
23
  from biolib.biolib_errors import BioLibError, CloudJobFinishedError
22
24
  from biolib.biolib_logging import logger, logger_no_user_data
23
25
  from biolib.compute_node.job_worker.job_storage import JobStorage
24
26
  from biolib.compute_node.utils import SystemExceptionCodeMap, SystemExceptionCodes
25
- from biolib.jobs.job_result import JobResult, PathFilter
27
+ from biolib.jobs.job_result import JobResult
26
28
  from biolib.jobs.types import CloudJobDict, CloudJobStartedDict, JobDict
27
29
  from biolib.tables import BioLibTable
28
- from biolib.typing_utils import Dict, List, Optional, Union, cast
30
+ from biolib.typing_utils import Dict, Generator, List, Optional, Tuple, Union, cast
29
31
  from biolib.utils import IS_RUNNING_IN_NOTEBOOK
30
- from biolib.utils.app_uri import parse_app_uri
31
32
 
32
33
 
33
34
  class Result:
@@ -49,7 +50,7 @@ class Result:
49
50
  self._auth_token: str = job_dict['auth_token']
50
51
 
51
52
  self._job_dict: JobDict = job_dict
52
- self._job_dict_last_fetched_at: datetime = datetime.utcnow()
53
+ self._job_dict_last_fetched_at: datetime = datetime.now(timezone.utc)
53
54
  self._result: Optional[JobResult] = None
54
55
  self._cached_input_arguments: Optional[List[str]] = None
55
56
 
@@ -187,6 +188,45 @@ class Result:
187
188
  """
188
189
  return self.result.list_output_files(path_filter=path_filter)
189
190
 
191
+ def list_input_files(
192
+ self,
193
+ path_filter: Optional[PathFilter] = None,
194
+ ) -> List[LazyLoadedFile]:
195
+ """List input files from the result.
196
+
197
+ Args:
198
+ path_filter (PathFilter, optional): Filter to apply to the input files.
199
+ Can be a string glob pattern or a callable that takes a path string and returns a boolean.
200
+
201
+ Returns:
202
+ List[LazyLoadedFile]: List of input files.
203
+
204
+ Example::
205
+ >>> result = biolib.get_result("result_id")
206
+ >>> input_files = result.list_input_files()
207
+ >>> # Filter files with a glob pattern
208
+ >>> input_files = result.list_input_files("*.txt")
209
+ """
210
+ presigned_download_url = BiolibJobApi.get_job_storage_download_url(
211
+ job_uuid=self.id,
212
+ job_auth_token=self._auth_token,
213
+ storage_type='input',
214
+ )
215
+ response = HttpClient.request(url=presigned_download_url)
216
+ module_input_serialized: bytes = response.content
217
+ module_input = ModuleInput(module_input_serialized).deserialize()
218
+
219
+ files = []
220
+ for path, data in module_input['files'].items():
221
+ buffer = InMemoryIndexableBuffer(data)
222
+ lazy_file = LazyLoadedFile(path=path, buffer=buffer, start=0, length=len(data))
223
+ files.append(lazy_file)
224
+
225
+ if not path_filter:
226
+ return files
227
+
228
+ return filter_lazy_loaded_files(files, path_filter)
229
+
190
230
  def get_output_file(self, filename: str) -> LazyLoadedFile:
191
231
  return self.result.get_output_file(filename=filename)
192
232
 
@@ -226,7 +266,7 @@ class Result:
226
266
 
227
267
  return self._cached_input_arguments
228
268
 
229
- def save_input_files(self, output_dir: str) -> None:
269
+ def save_input_files(self, output_dir: str, overwrite: bool = False) -> None:
230
270
  logger.info('Downloading input files...')
231
271
  module_input = self._get_module_input()
232
272
 
@@ -236,7 +276,12 @@ class Result:
236
276
  # Remove leading slash of file_path
237
277
  destination_file_path = Path(output_dir) / Path(path.lstrip('/'))
238
278
  if destination_file_path.exists():
239
- destination_file_path.rename(f'{destination_file_path}.biolib-renamed.{time.strftime("%Y%m%d%H%M%S")}')
279
+ if not overwrite:
280
+ raise BioLibError(f'File {destination_file_path} already exists. Set overwrite=True to overwrite.')
281
+ else:
282
+ destination_file_path.rename(
283
+ f'{destination_file_path}.biolib-renamed.{time.strftime("%Y%m%d%H%M%S")}'
284
+ )
240
285
 
241
286
  dir_path = destination_file_path.parent
242
287
  if dir_path:
@@ -251,12 +296,44 @@ class Result:
251
296
  self,
252
297
  output_dir: str,
253
298
  path_filter: Optional[PathFilter] = None,
254
- skip_file_if_exists: Optional[bool] = None,
299
+ skip_file_if_exists: bool = False,
300
+ overwrite: bool = False,
301
+ flat: bool = False,
255
302
  ) -> None:
303
+ """Save output files from the result to a local directory.
304
+
305
+ Args:
306
+ output_dir (str): Directory path where files will be saved.
307
+ path_filter (PathFilter, optional): Filter to apply to output files.
308
+ Can be a string glob pattern or a callable that takes a path and returns a boolean.
309
+ skip_file_if_exists (bool, optional): If True, skip files that already exist locally.
310
+ Defaults to False.
311
+ overwrite (bool, optional): If True, overwrite existing files by renaming them with a timestamp.
312
+ Defaults to False.
313
+ flat (bool, optional): If True, save all files directly to output_dir using only their basenames,
314
+ without creating subdirectories. When enabled, raises an error if duplicate basenames exist
315
+ in the filtered output or if any basename already exists in output_dir. Defaults to False.
316
+
317
+ Raises:
318
+ BioLibError: If flat=True and duplicate basenames are found in filtered output.
319
+ BioLibError: If flat=True and a file with the same basename already exists in output_dir.
320
+ BioLibError: If a file already exists and neither skip_file_if_exists nor overwrite is True.
321
+
322
+ Example::
323
+ >>> result = biolib.get_result("result_id")
324
+ >>> # Save all files preserving directory structure
325
+ >>> result.save_files("./output")
326
+ >>> # Save files flat without subdirectories
327
+ >>> result.save_files("./output", flat=True)
328
+ >>> # Save only specific files
329
+ >>> result.save_files("./output", path_filter="*.txt")
330
+ """
256
331
  self.result.save_files(
257
332
  output_dir=output_dir,
258
333
  path_filter=path_filter,
259
334
  skip_file_if_exists=skip_file_if_exists,
335
+ overwrite=overwrite,
336
+ flat=flat,
260
337
  )
261
338
 
262
339
  def get_status(self) -> str:
@@ -351,6 +428,7 @@ class Result:
351
428
  >>> # Recompute with different arguments
352
429
  >>> new_result = result.recompute(arguments=["--new-arg", "value"])
353
430
  """
431
+ self._refetch_job_dict()
354
432
  app_response = BiolibAppApi.get_by_uri(uri=app_uri or self._job_dict['app_uri'])
355
433
 
356
434
  job_storage_input = RemoteJobStorageEndpoint(
@@ -378,6 +456,7 @@ class Result:
378
456
  app_uri=app_response['app_uri'],
379
457
  app_version_uuid=app_response['app_version']['public_id'],
380
458
  module_input_serialized=module_input_serialized,
459
+ override_command=self._job_dict['arguments_override_command'],
381
460
  machine=machine if machine else original_requested_machine,
382
461
  )
383
462
  if blocking:
@@ -444,16 +523,11 @@ class Result:
444
523
  return Result(job_dict)
445
524
 
446
525
  @staticmethod
447
- def print_logs_packages(stdout_and_stderr_packages_b64):
526
+ def _yield_logs_packages(stdout_and_stderr_packages_b64) -> Generator[Tuple[str, bytes], None, None]:
448
527
  for stdout_and_stderr_package_b64 in stdout_and_stderr_packages_b64:
449
528
  stdout_and_stderr_package = base64.b64decode(stdout_and_stderr_package_b64)
450
529
  stdout_and_stderr = StdoutAndStderr(stdout_and_stderr_package).deserialize()
451
-
452
- sys.stdout.write(stdout_and_stderr.decode())
453
- if not IS_RUNNING_IN_NOTEBOOK: # for some reason flushing in jupyter notebooks breaks \r handling
454
- sys.stdout.flush()
455
- # flush after having processed all packages
456
- sys.stdout.flush()
530
+ yield ('stdout', stdout_and_stderr)
457
531
 
458
532
  def show(self) -> None:
459
533
  self._refetch_job_dict()
@@ -463,21 +537,44 @@ class Result:
463
537
  title=f'Result: {self._uuid}',
464
538
  ).print_table()
465
539
 
466
- def stream_logs(self) -> None:
540
+ def stream_logs(self, as_iterator: bool = False):
541
+ if as_iterator:
542
+ return self._iter_logs()
467
543
  self._stream_logs()
544
+ return None
468
545
 
469
546
  def _stream_logs(self, enable_print: bool = True) -> None:
547
+ try:
548
+ for stream_type, data in self._iter_logs(enable_print=enable_print):
549
+ if stream_type == 'stdout':
550
+ if IS_RUNNING_IN_NOTEBOOK:
551
+ sys.stdout.write(data.decode(encoding='utf-8', errors='replace'))
552
+ # Note: we avoid flush() in notebook as that breaks \r handling
553
+ else:
554
+ sys.stdout.buffer.write(data)
555
+ sys.stdout.buffer.flush()
556
+ elif stream_type == 'stderr':
557
+ if IS_RUNNING_IN_NOTEBOOK:
558
+ sys.stderr.write(data.decode(encoding='utf-8', errors='replace'))
559
+ # Note: we avoid flush() in notebook as that breaks \r handling
560
+ else:
561
+ sys.stderr.buffer.write(data)
562
+ sys.stderr.buffer.flush()
563
+ finally:
564
+ # Flush after having processed all packages
565
+ if IS_RUNNING_IN_NOTEBOOK:
566
+ sys.stdout.flush()
567
+ sys.stderr.flush()
568
+
569
+ def _iter_logs(self, enable_print: bool = True) -> Generator[Tuple[str, bytes], None, None]:
470
570
  try:
471
571
  cloud_job = self._get_cloud_job_awaiting_started()
472
572
  except CloudJobFinishedError:
473
573
  logger.info(f'--- The result {self.id} has already completed (no streaming will take place) ---')
474
574
  logger.info('--- The stdout log is printed below: ---')
475
- sys.stdout.flush()
476
- print(self.get_stdout().decode(), file=sys.stdout)
477
- sys.stdout.flush()
575
+ yield ('stdout', self.get_stdout())
478
576
  logger.info('--- The stderr log is printed below: ---')
479
- print(self.get_stderr().decode(), file=sys.stderr)
480
- sys.stderr.flush()
577
+ yield ('stderr', self.get_stderr())
481
578
  logger.info(f'--- The job {self.id} has already completed. Its output was printed above. ---')
482
579
  return
483
580
 
@@ -489,7 +586,7 @@ class Result:
489
586
  logger_no_user_data.debug(f'Using cloud proxy URL from env var BIOLIB_CLOUD_BASE_URL: {compute_node_url}')
490
587
 
491
588
  if enable_print:
492
- self._print_full_logs(node_url=compute_node_url)
589
+ yield from self._yield_full_logs(node_url=compute_node_url)
493
590
 
494
591
  final_status_messages: List[str] = []
495
592
  while True:
@@ -508,8 +605,8 @@ class Result:
508
605
  # Print the status before writing stdout and stderr
509
606
  logger.info(f'Cloud: {status_update["log_message"]}')
510
607
 
511
- if 'stdout_and_stderr_packages_b64' and enable_print:
512
- self.print_logs_packages(status_json['stdout_and_stderr_packages_b64'])
608
+ if enable_print:
609
+ yield from self._yield_logs_packages(status_json['stdout_and_stderr_packages_b64'])
513
610
 
514
611
  if 'error_code' in status_json:
515
612
  error_code = status_json['error_code']
@@ -526,7 +623,7 @@ class Result:
526
623
 
527
624
  self.wait() # Wait for compute node to tell the backend that the job is finished
528
625
 
529
- def _print_full_logs(self, node_url: str) -> None:
626
+ def _yield_full_logs(self, node_url: str) -> Generator[Tuple[str, bytes], None, None]:
530
627
  try:
531
628
  response_json = HttpClient.request(url=f'{node_url}/v1/job/{self._uuid}/status/?logs=full').json()
532
629
  except Exception as error:
@@ -536,7 +633,7 @@ class Result:
536
633
  for status_update in response_json.get('previous_status_updates', []):
537
634
  logger.info(f'Cloud: {status_update["log_message"]}')
538
635
 
539
- self.print_logs_packages(response_json['streamed_logs_packages_b64'])
636
+ yield from self._yield_logs_packages(response_json['streamed_logs_packages_b64'])
540
637
 
541
638
  def _get_cloud_job_awaiting_started(self) -> CloudJobStartedDict:
542
639
  retry_count = 0
@@ -582,11 +679,11 @@ class Result:
582
679
  )
583
680
 
584
681
  def _refetch_job_dict(self, force_refetch: Optional[bool] = False) -> None:
585
- if not force_refetch and self._job_dict_last_fetched_at > datetime.utcnow() - timedelta(seconds=2):
682
+ if not force_refetch and self._job_dict_last_fetched_at > datetime.now(timezone.utc) - timedelta(seconds=2):
586
683
  return
587
684
 
588
685
  self._job_dict = self._get_job_dict(self._uuid, self._auth_token)
589
- self._job_dict_last_fetched_at = datetime.utcnow()
686
+ self._job_dict_last_fetched_at = datetime.now(timezone.utc)
590
687
 
591
688
  @staticmethod
592
689
  def _start_job_in_cloud(
@@ -605,8 +702,8 @@ class Result:
605
702
  ) -> 'Result':
606
703
  if len(module_input_serialized) < 500_000 and temporary_client_secrets is None:
607
704
  _job_dict = BiolibJobApi.create_job_with_data(
608
- app_resource_name_prefix=parse_app_uri(app_uri)['resource_name_prefix'],
609
705
  app_version_uuid=app_version_uuid,
706
+ app_resource_name_prefix=parse_resource_uri(app_uri)['resource_prefix'],
610
707
  arguments_override_command=override_command,
611
708
  experiment_uuid=experiment_id,
612
709
  module_input_serialized=module_input_serialized,
@@ -620,8 +717,8 @@ class Result:
620
717
  return Result(cast(JobDict, _job_dict))
621
718
 
622
719
  job_dict: CreatedJobDict = BiolibJobApi.create(
623
- app_resource_name_prefix=parse_app_uri(app_uri)['resource_name_prefix'],
624
720
  app_version_id=app_version_uuid,
721
+ app_resource_name_prefix=parse_resource_uri(app_uri)['resource_prefix'],
625
722
  experiment_uuid=experiment_id,
626
723
  machine=machine,
627
724
  notify=notify,
biolib/jobs/job_result.py CHANGED
@@ -1,16 +1,14 @@
1
1
  import time
2
- from fnmatch import fnmatch
3
2
  from pathlib import Path
4
3
 
4
+ from biolib._internal.utils import PathFilter, filter_lazy_loaded_files
5
5
  from biolib.biolib_binary_format import ModuleOutputV2
6
6
  from biolib.biolib_binary_format.remote_endpoints import RemoteJobStorageEndpoint
7
7
  from biolib.biolib_binary_format.remote_stream_seeker import StreamSeeker
8
8
  from biolib.biolib_binary_format.utils import LazyLoadedFile, RemoteIndexableBuffer
9
9
  from biolib.biolib_errors import BioLibError
10
10
  from biolib.biolib_logging import logger
11
- from biolib.typing_utils import Callable, List, Optional, Union, cast
12
-
13
- PathFilter = Union[str, Callable[[str], bool]]
11
+ from biolib.typing_utils import Dict, List, Optional
14
12
 
15
13
 
16
14
  class JobResult:
@@ -38,17 +36,68 @@ class JobResult:
38
36
  self,
39
37
  output_dir: str,
40
38
  path_filter: Optional[PathFilter] = None,
41
- skip_file_if_exists: Optional[bool] = None,
39
+ skip_file_if_exists: bool = False,
40
+ overwrite: bool = False,
41
+ flat: bool = False,
42
42
  ) -> None:
43
43
  module_output = self._get_module_output()
44
44
  output_files = module_output.get_files()
45
- filtered_output_files = self._get_filtered_files(output_files, path_filter) if path_filter else output_files
45
+ filtered_output_files = filter_lazy_loaded_files(output_files, path_filter) if path_filter else output_files
46
46
 
47
47
  if len(filtered_output_files) == 0:
48
48
  logger.debug('No output files to save')
49
49
  return
50
50
 
51
- total_files_data_to_download_in_bytes = sum(file.length for file in filtered_output_files)
51
+ if flat:
52
+ basename_to_paths: Dict[str, List[str]] = {}
53
+ for file in filtered_output_files:
54
+ basename = Path(file.path).name
55
+ if basename not in basename_to_paths:
56
+ basename_to_paths[basename] = []
57
+ basename_to_paths[basename].append(file.path)
58
+
59
+ duplicates = {basename: paths for basename, paths in basename_to_paths.items() if len(paths) > 1}
60
+
61
+ if duplicates:
62
+ max_shown = 3
63
+ error_parts = []
64
+ sorted_basenames = sorted(duplicates.keys())
65
+
66
+ for basename in sorted_basenames[:max_shown]:
67
+ paths = duplicates[basename]
68
+ error_parts.append(f' {basename}: ({", ".join(paths)})')
69
+
70
+ error_message = 'Cannot save files in flat mode: duplicate filenames detected:\n' + '\n'.join(
71
+ error_parts
72
+ )
73
+
74
+ if len(duplicates) > max_shown:
75
+ remaining = len(duplicates) - max_shown
76
+ error_message += f'\n (and {remaining} more)'
77
+
78
+ raise BioLibError(error_message)
79
+
80
+ major_gap_threshold = 50_000
81
+ n = len(filtered_output_files)
82
+
83
+ next_break_end = [0] * n
84
+ if n > 0:
85
+ next_break_end[n - 1] = filtered_output_files[n - 1].start + filtered_output_files[n - 1].length
86
+ for i in range(n - 2, -1, -1):
87
+ end_i = filtered_output_files[i].start + filtered_output_files[i].length
88
+ gap = filtered_output_files[i + 1].start - end_i
89
+ if gap >= major_gap_threshold:
90
+ next_break_end[i] = end_i
91
+ else:
92
+ next_break_end[i] = next_break_end[i + 1]
93
+
94
+ total_files_data_to_download_in_bytes = 0
95
+ file_read_ahead_map = {}
96
+ for i, file in enumerate(filtered_output_files):
97
+ total_files_data_to_download_in_bytes += file.length
98
+ end_i = file.start + file.length
99
+ read_ahead_bytes = max(0, next_break_end[i] - end_i)
100
+ file_read_ahead_map[i] = read_ahead_bytes
52
101
 
53
102
  # Assume files are in order
54
103
  first_file = filtered_output_files[0]
@@ -56,18 +105,24 @@ class JobResult:
56
105
  stream_seeker = StreamSeeker(
57
106
  files_data_start=first_file.start,
58
107
  files_data_end=last_file.start + last_file.length,
59
- download_chunk_size_in_bytes=min(total_files_data_to_download_in_bytes, 10_000_000),
108
+ max_chunk_size=min(total_files_data_to_download_in_bytes, 10_000_000),
60
109
  upstream_buffer=module_output.buffer,
61
110
  )
62
111
 
63
112
  logger.info(f'Saving {len(filtered_output_files)} files to {output_dir}...')
64
- for file in filtered_output_files:
65
- # Remove leading slash of file_path
66
- destination_file_path = Path(output_dir) / Path(file.path.lstrip('/'))
113
+ for file_index, file in enumerate(filtered_output_files):
114
+ if flat:
115
+ destination_file_path = Path(output_dir) / Path(file.path).name
116
+ else:
117
+ # Remove leading slash of file_path
118
+ destination_file_path = Path(output_dir) / Path(file.path.lstrip('/'))
119
+
67
120
  if destination_file_path.exists():
68
121
  if skip_file_if_exists:
69
122
  print(f'Skipping {destination_file_path} as a file with that name already exists locally.')
70
123
  continue
124
+ elif not overwrite:
125
+ raise BioLibError(f'File {destination_file_path} already exists. Set overwrite=True to overwrite.')
71
126
  else:
72
127
  destination_file_path.rename(
73
128
  f'{destination_file_path}.biolib-renamed.{time.strftime("%Y%m%d%H%M%S")}'
@@ -88,8 +143,12 @@ class JobResult:
88
143
  file_start += data_already_downloaded
89
144
  data_to_download -= data_already_downloaded
90
145
 
146
+ read_ahead_bytes = file_read_ahead_map[file_index]
147
+
91
148
  with open(partial_path, mode='ab') as partial_file:
92
- for chunk in stream_seeker.seek_and_read(file_start=file_start, file_length=data_to_download):
149
+ for chunk in stream_seeker.seek_and_read(
150
+ file_start=file_start, file_length=data_to_download, read_ahead_bytes=read_ahead_bytes
151
+ ):
93
152
  partial_file.write(chunk)
94
153
 
95
154
  # rename partial file to actual file name
@@ -97,7 +156,7 @@ class JobResult:
97
156
 
98
157
  def get_output_file(self, filename) -> LazyLoadedFile:
99
158
  files = self._get_module_output().get_files()
100
- filtered_files = self._get_filtered_files(files, path_filter=filename)
159
+ filtered_files = filter_lazy_loaded_files(files, path_filter=filename)
101
160
  if not filtered_files:
102
161
  raise BioLibError(f'File {filename} not found in results.')
103
162
 
@@ -111,26 +170,7 @@ class JobResult:
111
170
  if not path_filter:
112
171
  return files
113
172
 
114
- return self._get_filtered_files(files, path_filter)
115
-
116
- @staticmethod
117
- def _get_filtered_files(files: List[LazyLoadedFile], path_filter: PathFilter) -> List[LazyLoadedFile]:
118
- if not (isinstance(path_filter, str) or callable(path_filter)):
119
- raise Exception('Expected path_filter to be a string or a function')
120
-
121
- if callable(path_filter):
122
- return list(filter(lambda x: path_filter(x.path), files)) # type: ignore
123
-
124
- glob_filter = cast(str, path_filter)
125
-
126
- # since all file paths start with /, make sure filter does too
127
- if not glob_filter.startswith('/'):
128
- glob_filter = '/' + glob_filter
129
-
130
- def _filter_function(file: LazyLoadedFile) -> bool:
131
- return fnmatch(file.path, glob_filter)
132
-
133
- return list(filter(_filter_function, files))
173
+ return filter_lazy_loaded_files(files, path_filter)
134
174
 
135
175
  def _get_module_output(self) -> ModuleOutputV2:
136
176
  if self._module_output is None:
biolib/jobs/types.py CHANGED
@@ -26,6 +26,7 @@ class Result(TypedDict):
26
26
 
27
27
  class JobDict(TypedDict):
28
28
  app_uri: str
29
+ arguments_override_command: bool
29
30
  auth_token: str
30
31
  created_at: str
31
32
  ended_at: Optional[str]
biolib/sdk/__init__.py CHANGED
@@ -2,6 +2,7 @@ from typing import Optional
2
2
 
3
3
  # Imports to hide and use as private internal utils
4
4
  from biolib._data_record.data_record import DataRecord as _DataRecord
5
+ from biolib._index.index import Index as _Index
5
6
  from biolib._internal.push_application import push_application as _push_application
6
7
  from biolib._internal.push_application import set_app_version_as_active as _set_app_version_as_active
7
8
  from biolib._runtime.runtime import Runtime as _Runtime
@@ -12,8 +13,18 @@ from biolib.app import BioLibApp as _BioLibApp
12
13
  Runtime = _Runtime
13
14
 
14
15
 
15
- def get_session(refresh_token: str, base_url: Optional[str] = None, client_type: Optional[str] = None) -> _Session:
16
- return _Session.get_session(refresh_token=refresh_token, base_url=base_url, client_type=client_type)
16
+ def get_session(
17
+ refresh_token: str,
18
+ base_url: Optional[str] = None,
19
+ client_type: Optional[str] = None,
20
+ experiment: Optional[str] = None,
21
+ ) -> _Session:
22
+ return _Session.get_session(
23
+ refresh_token=refresh_token,
24
+ base_url=base_url,
25
+ client_type=client_type,
26
+ experiment=experiment,
27
+ )
17
28
 
18
29
 
19
30
  def push_app_version(uri: str, path: str) -> _BioLibApp:
@@ -64,3 +75,7 @@ def create_data_record(
64
75
  data_path=data_path,
65
76
  record_type=record_type,
66
77
  )
78
+
79
+
80
+ def get_index(uri: str) -> _Index:
81
+ return _Index.get_by_uri(uri)
biolib/typing_utils.py CHANGED
@@ -1,2 +1,2 @@
1
1
  # TODO: Deprecate and later remove this file
2
- from biolib._internal.types.typing import * # pylint: disable=wildcard-import, unused-wildcard-import
2
+ from biolib._shared.types.typing import * # pylint: disable=wildcard-import, unused-wildcard-import
@@ -2,7 +2,7 @@ import os
2
2
  import abc
3
3
  import json
4
4
  import time
5
- from datetime import datetime
5
+ from datetime import datetime, timezone
6
6
 
7
7
  import appdirs # type: ignore
8
8
 
@@ -88,4 +88,4 @@ class CacheState(abc.ABC, Generic[StateType]):
88
88
 
89
89
  @staticmethod
90
90
  def get_timestamp_now() -> str:
91
- return datetime.utcnow().isoformat()
91
+ return datetime.now(timezone.utc).isoformat()
biolib/utils/seq_util.py CHANGED
@@ -9,7 +9,7 @@ class SeqUtilRecord:
9
9
  self,
10
10
  sequence: str,
11
11
  sequence_id: str,
12
- description: Optional['str'],
12
+ description: Optional['str'] = None,
13
13
  properties: Optional[Dict[str, str]] = None,
14
14
  ):
15
15
  self.sequence = sequence
@@ -1,8 +1,9 @@
1
- Metadata-Version: 2.3
1
+ Metadata-Version: 2.4
2
2
  Name: pybiolib
3
- Version: 1.2.911
3
+ Version: 1.2.1642
4
4
  Summary: BioLib Python Client
5
5
  License: MIT
6
+ License-File: LICENSE
6
7
  Keywords: biolib
7
8
  Author: biolib
8
9
  Author-email: hello@biolib.com
@@ -17,6 +18,7 @@ Classifier: Programming Language :: Python :: 3.10
17
18
  Classifier: Programming Language :: Python :: 3.11
18
19
  Classifier: Programming Language :: Python :: 3.12
19
20
  Classifier: Programming Language :: Python :: 3.13
21
+ Classifier: Programming Language :: Python :: 3.14
20
22
  Provides-Extra: compute-node
21
23
  Requires-Dist: appdirs (>=1.4.3)
22
24
  Requires-Dist: click (>=8.0.0)