PyPI - pybiolib - Versions diffs - 1.2.911__py3-none-any.whl → 1.2.1642__py3-none-any.whl - Mend

pybiolib 1.2.911__py3-none-any.whl → 1.2.1642__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of pybiolib might be problematic. Click here for more details.

Files changed (113)

biolib/__init__.py +33 -10
biolib/_data_record/data_record.py +24 -11
biolib/_index/index.py +51 -0
biolib/_index/types.py +7 -0
biolib/_internal/add_copilot_prompts.py +3 -5
biolib/_internal/add_gui_files.py +59 -0
biolib/_internal/data_record/data_record.py +1 -1
biolib/_internal/data_record/push_data.py +1 -1
biolib/_internal/data_record/remote_storage_endpoint.py +3 -3
biolib/_internal/file_utils.py +48 -0
biolib/_internal/index/__init__.py +1 -0
biolib/_internal/index/index.py +18 -0
biolib/_internal/lfs/cache.py +4 -2
biolib/_internal/push_application.py +89 -23
biolib/_internal/runtime.py +2 -0
biolib/_internal/string_utils.py +13 -0
biolib/_internal/templates/copilot_template/.github/instructions/style-react-ts.instructions.md +47 -0
biolib/_internal/templates/copilot_template/.github/prompts/biolib_onboard_repo.prompt.md +19 -0
biolib/_internal/templates/gui_template/.yarnrc.yml +1 -0
biolib/_internal/templates/gui_template/App.tsx +53 -0
biolib/_internal/templates/gui_template/Dockerfile +28 -0
biolib/_internal/templates/gui_template/biolib-sdk.ts +37 -0
biolib/_internal/templates/gui_template/dev-data/output.json +7 -0
biolib/_internal/templates/gui_template/index.css +5 -0
biolib/_internal/templates/gui_template/index.html +13 -0
biolib/_internal/templates/gui_template/index.tsx +10 -0
biolib/_internal/templates/gui_template/package.json +27 -0
biolib/_internal/templates/gui_template/tsconfig.json +24 -0
biolib/_internal/templates/gui_template/vite-plugin-dev-data.ts +49 -0
biolib/_internal/templates/gui_template/vite.config.mts +9 -0
biolib/_internal/templates/init_template/.biolib/config.yml +1 -0
biolib/_internal/templates/init_template/.github/workflows/biolib.yml +6 -1
biolib/_internal/templates/init_template/Dockerfile +2 -0
biolib/_internal/templates/init_template/run.sh +1 -0
biolib/_internal/templates/templates.py +9 -1
biolib/_internal/utils/__init__.py +25 -0
biolib/_internal/utils/job_url.py +33 -0
biolib/_internal/utils/multinode.py +12 -14
biolib/_runtime/runtime.py +15 -2
biolib/_session/session.py +7 -5
biolib/_shared/__init__.py +0 -0
biolib/_shared/types/__init__.py +69 -0
biolib/_shared/types/account.py +12 -0
biolib/_shared/types/account_member.py +8 -0
biolib/{_internal → _shared}/types/experiment.py +1 -0
biolib/_shared/types/resource.py +17 -0
biolib/_shared/types/resource_deploy_key.py +11 -0
biolib/{_internal → _shared}/types/resource_permission.py +1 -1
biolib/{_internal → _shared}/types/user.py +5 -5
biolib/_shared/utils/__init__.py +7 -0
biolib/_shared/utils/resource_uri.py +75 -0
biolib/api/client.py +1 -1
biolib/app/app.py +96 -45
biolib/biolib_api_client/app_types.py +1 -0
biolib/biolib_api_client/biolib_app_api.py +26 -0
biolib/biolib_binary_format/module_input.py +8 -0
biolib/biolib_binary_format/remote_endpoints.py +3 -3
biolib/biolib_binary_format/remote_stream_seeker.py +39 -25
biolib/biolib_logging.py +1 -1
biolib/cli/__init__.py +2 -1
biolib/cli/auth.py +4 -16
biolib/cli/data_record.py +17 -0
biolib/cli/index.py +32 -0
biolib/cli/init.py +93 -11
biolib/cli/lfs.py +1 -1
biolib/cli/run.py +1 -1
biolib/cli/start.py +14 -1
biolib/compute_node/job_worker/executors/docker_executor.py +31 -9
biolib/compute_node/job_worker/executors/docker_types.py +1 -1
biolib/compute_node/job_worker/executors/types.py +6 -5
biolib/compute_node/job_worker/job_storage.py +2 -1
biolib/compute_node/job_worker/job_worker.py +155 -90
biolib/compute_node/job_worker/large_file_system.py +2 -6
biolib/compute_node/job_worker/network_alloc.py +99 -0
biolib/compute_node/job_worker/network_buffer.py +240 -0
biolib/compute_node/job_worker/utilization_reporter_thread.py +2 -2
biolib/compute_node/remote_host_proxy.py +135 -67
biolib/compute_node/utils.py +2 -0
biolib/compute_node/webserver/compute_node_results_proxy.py +188 -0
biolib/compute_node/webserver/proxy_utils.py +28 -0
biolib/compute_node/webserver/webserver.py +64 -19
biolib/experiments/experiment.py +98 -16
biolib/jobs/job.py +128 -31
biolib/jobs/job_result.py +73 -33
biolib/jobs/types.py +1 -0
biolib/sdk/__init__.py +17 -2
biolib/typing_utils.py +1 -1
biolib/utils/cache_state.py +2 -2
biolib/utils/seq_util.py +1 -1
{pybiolib-1.2.911.dist-info → pybiolib-1.2.1642.dist-info}/METADATA +4 -2
pybiolib-1.2.1642.dist-info/RECORD +180 -0
{pybiolib-1.2.911.dist-info → pybiolib-1.2.1642.dist-info}/WHEEL +1 -1
biolib/_internal/llm_instructions/.github/instructions/style-react-ts.instructions.md +0 -22
biolib/_internal/types/__init__.py +0 -6
biolib/_internal/types/account.py +0 -10
biolib/utils/app_uri.py +0 -57
pybiolib-1.2.911.dist-info/RECORD +0 -150
/biolib/{_internal/llm_instructions → _index}/__init__.py +0 -0
/biolib/_internal/{llm_instructions → templates/copilot_template}/.github/instructions/general-app-knowledge.instructions.md +0 -0
/biolib/_internal/{llm_instructions → templates/copilot_template}/.github/instructions/style-general.instructions.md +0 -0
/biolib/_internal/{llm_instructions → templates/copilot_template}/.github/instructions/style-python.instructions.md +0 -0
/biolib/_internal/{llm_instructions → templates/copilot_template}/.github/prompts/biolib_app_inputs.prompt.md +0 -0
/biolib/_internal/{llm_instructions → templates/copilot_template}/.github/prompts/biolib_run_apps.prompt.md +0 -0
/biolib/{_internal → _shared}/types/app.py +0 -0
/biolib/{_internal → _shared}/types/data_record.py +0 -0
/biolib/{_internal → _shared}/types/file_node.py +0 -0
/biolib/{_internal → _shared}/types/push.py +0 -0
/biolib/{_internal/types/resource.py → _shared/types/resource_types.py} +0 -0
/biolib/{_internal → _shared}/types/resource_version.py +0 -0
/biolib/{_internal → _shared}/types/result.py +0 -0
/biolib/{_internal → _shared}/types/typing.py +0 -0
{pybiolib-1.2.911.dist-info → pybiolib-1.2.1642.dist-info}/entry_points.txt +0 -0
{pybiolib-1.2.911.dist-info → pybiolib-1.2.1642.dist-info/licenses}/LICENSE +0 -0

biolib/jobs/job.py CHANGED Viewed

@@ -2,7 +2,7 @@ import base64
 import sys
 import time
 from collections import OrderedDict
-from datetime import datetime, timedelta
+from datetime import datetime, timedelta, timezone
 from pathlib import Path
 from urllib.parse import urlparse
@@ -10,7 +10,8 @@ import biolib.api.client
 from biolib import utils
 from biolib._internal.http_client import HttpClient
 from biolib._internal.tree_utils import build_tree_from_files, build_tree_str
-from biolib._internal.utils import open_browser_window_from_notebook
+from biolib._internal.utils import PathFilter, filter_lazy_loaded_files, open_browser_window_from_notebook
+from biolib._shared.utils import parse_resource_uri
 from biolib.api.client import ApiClient
 from biolib.biolib_api_client import BiolibApiClient, CreatedJobDict
 from biolib.biolib_api_client.biolib_app_api import BiolibAppApi
@@ -18,16 +19,16 @@ from biolib.biolib_api_client.biolib_job_api import BiolibJobApi
 from biolib.biolib_binary_format import LazyLoadedFile, ModuleInput, ModuleInputDict, ModuleOutputV2
 from biolib.biolib_binary_format.remote_endpoints import RemoteJobStorageEndpoint
 from biolib.biolib_binary_format.stdout_and_stderr import StdoutAndStderr
+from biolib.biolib_binary_format.utils import InMemoryIndexableBuffer
 from biolib.biolib_errors import BioLibError, CloudJobFinishedError
 from biolib.biolib_logging import logger, logger_no_user_data
 from biolib.compute_node.job_worker.job_storage import JobStorage
 from biolib.compute_node.utils import SystemExceptionCodeMap, SystemExceptionCodes
-from biolib.jobs.job_result import JobResult, PathFilter
+from biolib.jobs.job_result import JobResult
 from biolib.jobs.types import CloudJobDict, CloudJobStartedDict, JobDict
 from biolib.tables import BioLibTable
-from biolib.typing_utils import Dict, List, Optional, Union, cast
+from biolib.typing_utils import Dict, Generator, List, Optional, Tuple, Union, cast
 from biolib.utils import IS_RUNNING_IN_NOTEBOOK
-from biolib.utils.app_uri import parse_app_uri
 class Result:
@@ -49,7 +50,7 @@ class Result:
         self._auth_token: str = job_dict['auth_token']
         self._job_dict: JobDict = job_dict
-        self._job_dict_last_fetched_at: datetime = datetime.utcnow()
+        self._job_dict_last_fetched_at: datetime = datetime.now(timezone.utc)
         self._result: Optional[JobResult] = None
         self._cached_input_arguments: Optional[List[str]] = None
@@ -187,6 +188,45 @@ class Result:
         """
         return self.result.list_output_files(path_filter=path_filter)
+    def list_input_files(
+        self,
+        path_filter: Optional[PathFilter] = None,
+    ) -> List[LazyLoadedFile]:
+        """List input files from the result.
+        Args:
+            path_filter (PathFilter, optional): Filter to apply to the input files.
+                Can be a string glob pattern or a callable that takes a path string and returns a boolean.
+        Returns:
+            List[LazyLoadedFile]: List of input files.
+        Example::
+            >>> result = biolib.get_result("result_id")
+            >>> input_files = result.list_input_files()
+            >>> # Filter files with a glob pattern
+            >>> input_files = result.list_input_files("*.txt")
+        """
+        presigned_download_url = BiolibJobApi.get_job_storage_download_url(
+            job_uuid=self.id,
+            job_auth_token=self._auth_token,
+            storage_type='input',
+        )
+        response = HttpClient.request(url=presigned_download_url)
+        module_input_serialized: bytes = response.content
+        module_input = ModuleInput(module_input_serialized).deserialize()
+        files = []
+        for path, data in module_input['files'].items():
+            buffer = InMemoryIndexableBuffer(data)
+            lazy_file = LazyLoadedFile(path=path, buffer=buffer, start=0, length=len(data))
+            files.append(lazy_file)
+        if not path_filter:
+            return files
+        return filter_lazy_loaded_files(files, path_filter)
     def get_output_file(self, filename: str) -> LazyLoadedFile:
         return self.result.get_output_file(filename=filename)
@@ -226,7 +266,7 @@ class Result:
         return self._cached_input_arguments
-    def save_input_files(self, output_dir: str) -> None:
+    def save_input_files(self, output_dir: str, overwrite: bool = False) -> None:
         logger.info('Downloading input files...')
         module_input = self._get_module_input()
@@ -236,7 +276,12 @@ class Result:
             # Remove leading slash of file_path
             destination_file_path = Path(output_dir) / Path(path.lstrip('/'))
             if destination_file_path.exists():
-                destination_file_path.rename(f'{destination_file_path}.biolib-renamed.{time.strftime("%Y%m%d%H%M%S")}')
+                if not overwrite:
+                    raise BioLibError(f'File {destination_file_path} already exists. Set overwrite=True to overwrite.')
+                else:
+                    destination_file_path.rename(
+                        f'{destination_file_path}.biolib-renamed.{time.strftime("%Y%m%d%H%M%S")}'
+                    )
             dir_path = destination_file_path.parent
             if dir_path:
@@ -251,12 +296,44 @@ class Result:
         self,
         output_dir: str,
         path_filter: Optional[PathFilter] = None,
-        skip_file_if_exists: Optional[bool] = None,
+        skip_file_if_exists: bool = False,
+        overwrite: bool = False,
+        flat: bool = False,
     ) -> None:
+        """Save output files from the result to a local directory.
+        Args:
+            output_dir (str): Directory path where files will be saved.
+            path_filter (PathFilter, optional): Filter to apply to output files.
+                Can be a string glob pattern or a callable that takes a path and returns a boolean.
+            skip_file_if_exists (bool, optional): If True, skip files that already exist locally.
+                Defaults to False.
+            overwrite (bool, optional): If True, overwrite existing files by renaming them with a timestamp.
+                Defaults to False.
+            flat (bool, optional): If True, save all files directly to output_dir using only their basenames,
+                without creating subdirectories. When enabled, raises an error if duplicate basenames exist
+                in the filtered output or if any basename already exists in output_dir. Defaults to False.
+        Raises:
+            BioLibError: If flat=True and duplicate basenames are found in filtered output.
+            BioLibError: If flat=True and a file with the same basename already exists in output_dir.
+            BioLibError: If a file already exists and neither skip_file_if_exists nor overwrite is True.
+        Example::
+            >>> result = biolib.get_result("result_id")
+            >>> # Save all files preserving directory structure
+            >>> result.save_files("./output")
+            >>> # Save files flat without subdirectories
+            >>> result.save_files("./output", flat=True)
+            >>> # Save only specific files
+            >>> result.save_files("./output", path_filter="*.txt")
+        """
         self.result.save_files(
             output_dir=output_dir,
             path_filter=path_filter,
             skip_file_if_exists=skip_file_if_exists,
+            overwrite=overwrite,
+            flat=flat,
         )
     def get_status(self) -> str:
@@ -351,6 +428,7 @@ class Result:
             >>> # Recompute with different arguments
             >>> new_result = result.recompute(arguments=["--new-arg", "value"])
         """
+        self._refetch_job_dict()
         app_response = BiolibAppApi.get_by_uri(uri=app_uri or self._job_dict['app_uri'])
         job_storage_input = RemoteJobStorageEndpoint(
@@ -378,6 +456,7 @@ class Result:
             app_uri=app_response['app_uri'],
             app_version_uuid=app_response['app_version']['public_id'],
             module_input_serialized=module_input_serialized,
+            override_command=self._job_dict['arguments_override_command'],
             machine=machine if machine else original_requested_machine,
         )
         if blocking:
@@ -444,16 +523,11 @@ class Result:
         return Result(job_dict)
     @staticmethod
-    def print_logs_packages(stdout_and_stderr_packages_b64):
+    def _yield_logs_packages(stdout_and_stderr_packages_b64) -> Generator[Tuple[str, bytes], None, None]:
         for stdout_and_stderr_package_b64 in stdout_and_stderr_packages_b64:
             stdout_and_stderr_package = base64.b64decode(stdout_and_stderr_package_b64)
             stdout_and_stderr = StdoutAndStderr(stdout_and_stderr_package).deserialize()
-            sys.stdout.write(stdout_and_stderr.decode())
-            if not IS_RUNNING_IN_NOTEBOOK:  # for some reason flushing in jupyter notebooks breaks \r handling
-                sys.stdout.flush()
-        # flush after having processed all packages
-        sys.stdout.flush()
+            yield ('stdout', stdout_and_stderr)
     def show(self) -> None:
         self._refetch_job_dict()
@@ -463,21 +537,44 @@ class Result:
             title=f'Result: {self._uuid}',
         ).print_table()
-    def stream_logs(self) -> None:
+    def stream_logs(self, as_iterator: bool = False):
+        if as_iterator:
+            return self._iter_logs()
         self._stream_logs()
+        return None
     def _stream_logs(self, enable_print: bool = True) -> None:
+        try:
+            for stream_type, data in self._iter_logs(enable_print=enable_print):
+                if stream_type == 'stdout':
+                    if IS_RUNNING_IN_NOTEBOOK:
+                        sys.stdout.write(data.decode(encoding='utf-8', errors='replace'))
+                        # Note: we avoid flush() in notebook as that breaks \r handling
+                    else:
+                        sys.stdout.buffer.write(data)
+                        sys.stdout.buffer.flush()
+                elif stream_type == 'stderr':
+                    if IS_RUNNING_IN_NOTEBOOK:
+                        sys.stderr.write(data.decode(encoding='utf-8', errors='replace'))
+                        # Note: we avoid flush() in notebook as that breaks \r handling
+                    else:
+                        sys.stderr.buffer.write(data)
+                        sys.stderr.buffer.flush()
+        finally:
+            # Flush after having processed all packages
+            if IS_RUNNING_IN_NOTEBOOK:
+                sys.stdout.flush()
+                sys.stderr.flush()
+    def _iter_logs(self, enable_print: bool = True) -> Generator[Tuple[str, bytes], None, None]:
         try:
             cloud_job = self._get_cloud_job_awaiting_started()
         except CloudJobFinishedError:
             logger.info(f'--- The result {self.id} has already completed (no streaming will take place) ---')
             logger.info('--- The stdout log is printed below: ---')
-            sys.stdout.flush()
-            print(self.get_stdout().decode(), file=sys.stdout)
-            sys.stdout.flush()
+            yield ('stdout', self.get_stdout())
             logger.info('--- The stderr log is printed below: ---')
-            print(self.get_stderr().decode(), file=sys.stderr)
-            sys.stderr.flush()
+            yield ('stderr', self.get_stderr())
             logger.info(f'--- The job {self.id} has already completed. Its output was printed above. ---')
             return
@@ -489,7 +586,7 @@ class Result:
             logger_no_user_data.debug(f'Using cloud proxy URL from env var BIOLIB_CLOUD_BASE_URL: {compute_node_url}')
         if enable_print:
-            self._print_full_logs(node_url=compute_node_url)
+            yield from self._yield_full_logs(node_url=compute_node_url)
         final_status_messages: List[str] = []
         while True:
@@ -508,8 +605,8 @@ class Result:
                     # Print the status before writing stdout and stderr
                     logger.info(f'Cloud: {status_update["log_message"]}')
-            if 'stdout_and_stderr_packages_b64' and enable_print:
-                self.print_logs_packages(status_json['stdout_and_stderr_packages_b64'])
+            if enable_print:
+                yield from self._yield_logs_packages(status_json['stdout_and_stderr_packages_b64'])
             if 'error_code' in status_json:
                 error_code = status_json['error_code']
@@ -526,7 +623,7 @@ class Result:
         self.wait()  # Wait for compute node to tell the backend that the job is finished
-    def _print_full_logs(self, node_url: str) -> None:
+    def _yield_full_logs(self, node_url: str) -> Generator[Tuple[str, bytes], None, None]:
         try:
             response_json = HttpClient.request(url=f'{node_url}/v1/job/{self._uuid}/status/?logs=full').json()
         except Exception as error:
@@ -536,7 +633,7 @@ class Result:
         for status_update in response_json.get('previous_status_updates', []):
             logger.info(f'Cloud: {status_update["log_message"]}')
-        self.print_logs_packages(response_json['streamed_logs_packages_b64'])
+        yield from self._yield_logs_packages(response_json['streamed_logs_packages_b64'])
     def _get_cloud_job_awaiting_started(self) -> CloudJobStartedDict:
         retry_count = 0
@@ -582,11 +679,11 @@ class Result:
         )
     def _refetch_job_dict(self, force_refetch: Optional[bool] = False) -> None:
-        if not force_refetch and self._job_dict_last_fetched_at > datetime.utcnow() - timedelta(seconds=2):
+        if not force_refetch and self._job_dict_last_fetched_at > datetime.now(timezone.utc) - timedelta(seconds=2):
             return
         self._job_dict = self._get_job_dict(self._uuid, self._auth_token)
-        self._job_dict_last_fetched_at = datetime.utcnow()
+        self._job_dict_last_fetched_at = datetime.now(timezone.utc)
     @staticmethod
     def _start_job_in_cloud(
@@ -605,8 +702,8 @@ class Result:
     ) -> 'Result':
         if len(module_input_serialized) < 500_000 and temporary_client_secrets is None:
             _job_dict = BiolibJobApi.create_job_with_data(
-                app_resource_name_prefix=parse_app_uri(app_uri)['resource_name_prefix'],
                 app_version_uuid=app_version_uuid,
+                app_resource_name_prefix=parse_resource_uri(app_uri)['resource_prefix'],
                 arguments_override_command=override_command,
                 experiment_uuid=experiment_id,
                 module_input_serialized=module_input_serialized,
@@ -620,8 +717,8 @@ class Result:
             return Result(cast(JobDict, _job_dict))
         job_dict: CreatedJobDict = BiolibJobApi.create(
-            app_resource_name_prefix=parse_app_uri(app_uri)['resource_name_prefix'],
             app_version_id=app_version_uuid,
+            app_resource_name_prefix=parse_resource_uri(app_uri)['resource_prefix'],
             experiment_uuid=experiment_id,
             machine=machine,
             notify=notify,

biolib/jobs/job_result.py CHANGED Viewed

@@ -1,16 +1,14 @@
 import time
-from fnmatch import fnmatch
 from pathlib import Path
+from biolib._internal.utils import PathFilter, filter_lazy_loaded_files
 from biolib.biolib_binary_format import ModuleOutputV2
 from biolib.biolib_binary_format.remote_endpoints import RemoteJobStorageEndpoint
 from biolib.biolib_binary_format.remote_stream_seeker import StreamSeeker
 from biolib.biolib_binary_format.utils import LazyLoadedFile, RemoteIndexableBuffer
 from biolib.biolib_errors import BioLibError
 from biolib.biolib_logging import logger
-from biolib.typing_utils import Callable, List, Optional, Union, cast
-PathFilter = Union[str, Callable[[str], bool]]
+from biolib.typing_utils import Dict, List, Optional
 class JobResult:
@@ -38,17 +36,68 @@ class JobResult:
         self,
         output_dir: str,
         path_filter: Optional[PathFilter] = None,
-        skip_file_if_exists: Optional[bool] = None,
+        skip_file_if_exists: bool = False,
+        overwrite: bool = False,
+        flat: bool = False,
     ) -> None:
         module_output = self._get_module_output()
         output_files = module_output.get_files()
-        filtered_output_files = self._get_filtered_files(output_files, path_filter) if path_filter else output_files
+        filtered_output_files = filter_lazy_loaded_files(output_files, path_filter) if path_filter else output_files
         if len(filtered_output_files) == 0:
             logger.debug('No output files to save')
             return
-        total_files_data_to_download_in_bytes = sum(file.length for file in filtered_output_files)
+        if flat:
+            basename_to_paths: Dict[str, List[str]] = {}
+            for file in filtered_output_files:
+                basename = Path(file.path).name
+                if basename not in basename_to_paths:
+                    basename_to_paths[basename] = []
+                basename_to_paths[basename].append(file.path)
+            duplicates = {basename: paths for basename, paths in basename_to_paths.items() if len(paths) > 1}
+            if duplicates:
+                max_shown = 3
+                error_parts = []
+                sorted_basenames = sorted(duplicates.keys())
+                for basename in sorted_basenames[:max_shown]:
+                    paths = duplicates[basename]
+                    error_parts.append(f'  {basename}: ({", ".join(paths)})')
+                error_message = 'Cannot save files in flat mode: duplicate filenames detected:\n' + '\n'.join(
+                    error_parts
+                )
+                if len(duplicates) > max_shown:
+                    remaining = len(duplicates) - max_shown
+                    error_message += f'\n  (and {remaining} more)'
+                raise BioLibError(error_message)
+        major_gap_threshold = 50_000
+        n = len(filtered_output_files)
+        next_break_end = [0] * n
+        if n > 0:
+            next_break_end[n - 1] = filtered_output_files[n - 1].start + filtered_output_files[n - 1].length
+            for i in range(n - 2, -1, -1):
+                end_i = filtered_output_files[i].start + filtered_output_files[i].length
+                gap = filtered_output_files[i + 1].start - end_i
+                if gap >= major_gap_threshold:
+                    next_break_end[i] = end_i
+                else:
+                    next_break_end[i] = next_break_end[i + 1]
+        total_files_data_to_download_in_bytes = 0
+        file_read_ahead_map = {}
+        for i, file in enumerate(filtered_output_files):
+            total_files_data_to_download_in_bytes += file.length
+            end_i = file.start + file.length
+            read_ahead_bytes = max(0, next_break_end[i] - end_i)
+            file_read_ahead_map[i] = read_ahead_bytes
         # Assume files are in order
         first_file = filtered_output_files[0]
@@ -56,18 +105,24 @@ class JobResult:
         stream_seeker = StreamSeeker(
             files_data_start=first_file.start,
             files_data_end=last_file.start + last_file.length,
-            download_chunk_size_in_bytes=min(total_files_data_to_download_in_bytes, 10_000_000),
+            max_chunk_size=min(total_files_data_to_download_in_bytes, 10_000_000),
             upstream_buffer=module_output.buffer,
         )
         logger.info(f'Saving {len(filtered_output_files)} files to {output_dir}...')
-        for file in filtered_output_files:
-            # Remove leading slash of file_path
-            destination_file_path = Path(output_dir) / Path(file.path.lstrip('/'))
+        for file_index, file in enumerate(filtered_output_files):
+            if flat:
+                destination_file_path = Path(output_dir) / Path(file.path).name
+            else:
+                # Remove leading slash of file_path
+                destination_file_path = Path(output_dir) / Path(file.path.lstrip('/'))
             if destination_file_path.exists():
                 if skip_file_if_exists:
                     print(f'Skipping {destination_file_path} as a file with that name already exists locally.')
                     continue
+                elif not overwrite:
+                    raise BioLibError(f'File {destination_file_path} already exists. Set overwrite=True to overwrite.')
                 else:
                     destination_file_path.rename(
                         f'{destination_file_path}.biolib-renamed.{time.strftime("%Y%m%d%H%M%S")}'
@@ -88,8 +143,12 @@ class JobResult:
                 file_start += data_already_downloaded
                 data_to_download -= data_already_downloaded
+            read_ahead_bytes = file_read_ahead_map[file_index]
             with open(partial_path, mode='ab') as partial_file:
-                for chunk in stream_seeker.seek_and_read(file_start=file_start, file_length=data_to_download):
+                for chunk in stream_seeker.seek_and_read(
+                    file_start=file_start, file_length=data_to_download, read_ahead_bytes=read_ahead_bytes
+                ):
                     partial_file.write(chunk)
             # rename partial file to actual file name
@@ -97,7 +156,7 @@ class JobResult:
     def get_output_file(self, filename) -> LazyLoadedFile:
         files = self._get_module_output().get_files()
-        filtered_files = self._get_filtered_files(files, path_filter=filename)
+        filtered_files = filter_lazy_loaded_files(files, path_filter=filename)
         if not filtered_files:
             raise BioLibError(f'File {filename} not found in results.')
@@ -111,26 +170,7 @@ class JobResult:
         if not path_filter:
             return files
-        return self._get_filtered_files(files, path_filter)
-    @staticmethod
-    def _get_filtered_files(files: List[LazyLoadedFile], path_filter: PathFilter) -> List[LazyLoadedFile]:
-        if not (isinstance(path_filter, str) or callable(path_filter)):
-            raise Exception('Expected path_filter to be a string or a function')
-        if callable(path_filter):
-            return list(filter(lambda x: path_filter(x.path), files))  # type: ignore
-        glob_filter = cast(str, path_filter)
-        # since all file paths start with /, make sure filter does too
-        if not glob_filter.startswith('/'):
-            glob_filter = '/' + glob_filter
-        def _filter_function(file: LazyLoadedFile) -> bool:
-            return fnmatch(file.path, glob_filter)
-        return list(filter(_filter_function, files))
+        return filter_lazy_loaded_files(files, path_filter)
     def _get_module_output(self) -> ModuleOutputV2:
         if self._module_output is None:

biolib/jobs/types.py CHANGED Viewed

@@ -26,6 +26,7 @@ class Result(TypedDict):
 class JobDict(TypedDict):
     app_uri: str
+    arguments_override_command: bool
     auth_token: str
     created_at: str
     ended_at: Optional[str]

biolib/sdk/__init__.py CHANGED Viewed

@@ -2,6 +2,7 @@ from typing import Optional
 # Imports to hide and use as private internal utils
 from biolib._data_record.data_record import DataRecord as _DataRecord
+from biolib._index.index import Index as _Index
 from biolib._internal.push_application import push_application as _push_application
 from biolib._internal.push_application import set_app_version_as_active as _set_app_version_as_active
 from biolib._runtime.runtime import Runtime as _Runtime
@@ -12,8 +13,18 @@ from biolib.app import BioLibApp as _BioLibApp
 Runtime = _Runtime
-def get_session(refresh_token: str, base_url: Optional[str] = None, client_type: Optional[str] = None) -> _Session:
-    return _Session.get_session(refresh_token=refresh_token, base_url=base_url, client_type=client_type)
+def get_session(
+    refresh_token: str,
+    base_url: Optional[str] = None,
+    client_type: Optional[str] = None,
+    experiment: Optional[str] = None,
+) -> _Session:
+    return _Session.get_session(
+        refresh_token=refresh_token,
+        base_url=base_url,
+        client_type=client_type,
+        experiment=experiment,
+    )
 def push_app_version(uri: str, path: str) -> _BioLibApp:
@@ -64,3 +75,7 @@ def create_data_record(
         data_path=data_path,
         record_type=record_type,
     )
+def get_index(uri: str) -> _Index:
+    return _Index.get_by_uri(uri)

biolib/typing_utils.py CHANGED Viewed

@@ -1,2 +1,2 @@
 # TODO: Deprecate and later remove this file
-from biolib._internal.types.typing import *  # pylint: disable=wildcard-import, unused-wildcard-import
+from biolib._shared.types.typing import *  # pylint: disable=wildcard-import, unused-wildcard-import

biolib/utils/cache_state.py CHANGED Viewed

@@ -2,7 +2,7 @@ import os
 import abc
 import json
 import time
-from datetime import datetime
+from datetime import datetime, timezone
 import appdirs  # type: ignore
@@ -88,4 +88,4 @@ class CacheState(abc.ABC, Generic[StateType]):
     @staticmethod
     def get_timestamp_now() -> str:
-        return datetime.utcnow().isoformat()
+        return datetime.now(timezone.utc).isoformat()

biolib/utils/seq_util.py CHANGED Viewed

@@ -9,7 +9,7 @@ class SeqUtilRecord:
         self,
         sequence: str,
         sequence_id: str,
-        description: Optional['str'],
+        description: Optional['str'] = None,
         properties: Optional[Dict[str, str]] = None,
     ):
         self.sequence = sequence

{pybiolib-1.2.911.dist-info → pybiolib-1.2.1642.dist-info}/METADATA RENAMED Viewed

@@ -1,8 +1,9 @@
-Metadata-Version: 2.3
+Metadata-Version: 2.4
 Name: pybiolib
-Version: 1.2.911
+Version: 1.2.1642
 Summary: BioLib Python Client
 License: MIT
+License-File: LICENSE
 Keywords: biolib
 Author: biolib
 Author-email: hello@biolib.com
@@ -17,6 +18,7 @@ Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
+Classifier: Programming Language :: Python :: 3.14
 Provides-Extra: compute-node
 Requires-Dist: appdirs (>=1.4.3)
 Requires-Dist: click (>=8.0.0)

pybiolib 1.2.911__py3-none-any.whl → 1.2.1642__py3-none-any.whl

Potentially problematic release.

pybiolib 1.2.911__py3-none-any.whl → 1.2.1642__py3-none-any.whl