pybiolib 0.2.951__py3-none-any.whl → 1.2.1890__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biolib/__init__.py +357 -11
- biolib/_data_record/data_record.py +380 -0
- biolib/_index/__init__.py +0 -0
- biolib/_index/index.py +55 -0
- biolib/_index/query_result.py +103 -0
- biolib/_internal/__init__.py +0 -0
- biolib/_internal/add_copilot_prompts.py +58 -0
- biolib/_internal/add_gui_files.py +81 -0
- biolib/_internal/data_record/__init__.py +1 -0
- biolib/_internal/data_record/data_record.py +85 -0
- biolib/_internal/data_record/push_data.py +116 -0
- biolib/_internal/data_record/remote_storage_endpoint.py +43 -0
- biolib/_internal/errors.py +5 -0
- biolib/_internal/file_utils.py +125 -0
- biolib/_internal/fuse_mount/__init__.py +1 -0
- biolib/_internal/fuse_mount/experiment_fuse_mount.py +209 -0
- biolib/_internal/http_client.py +159 -0
- biolib/_internal/lfs/__init__.py +1 -0
- biolib/_internal/lfs/cache.py +51 -0
- biolib/_internal/libs/__init__.py +1 -0
- biolib/_internal/libs/fusepy/__init__.py +1257 -0
- biolib/_internal/push_application.py +488 -0
- biolib/_internal/runtime.py +22 -0
- biolib/_internal/string_utils.py +13 -0
- biolib/_internal/templates/__init__.py +1 -0
- biolib/_internal/templates/copilot_template/.github/instructions/general-app-knowledge.instructions.md +10 -0
- biolib/_internal/templates/copilot_template/.github/instructions/style-general.instructions.md +20 -0
- biolib/_internal/templates/copilot_template/.github/instructions/style-python.instructions.md +16 -0
- biolib/_internal/templates/copilot_template/.github/instructions/style-react-ts.instructions.md +47 -0
- biolib/_internal/templates/copilot_template/.github/prompts/biolib_app_inputs.prompt.md +11 -0
- biolib/_internal/templates/copilot_template/.github/prompts/biolib_onboard_repo.prompt.md +19 -0
- biolib/_internal/templates/copilot_template/.github/prompts/biolib_run_apps.prompt.md +12 -0
- biolib/_internal/templates/dashboard_template/.biolib/config.yml +5 -0
- biolib/_internal/templates/github_workflow_template/.github/workflows/biolib.yml +21 -0
- biolib/_internal/templates/gitignore_template/.gitignore +10 -0
- biolib/_internal/templates/gui_template/.yarnrc.yml +1 -0
- biolib/_internal/templates/gui_template/App.tsx +53 -0
- biolib/_internal/templates/gui_template/Dockerfile +27 -0
- biolib/_internal/templates/gui_template/biolib-sdk.ts +82 -0
- biolib/_internal/templates/gui_template/dev-data/output.json +7 -0
- biolib/_internal/templates/gui_template/index.css +5 -0
- biolib/_internal/templates/gui_template/index.html +13 -0
- biolib/_internal/templates/gui_template/index.tsx +10 -0
- biolib/_internal/templates/gui_template/package.json +27 -0
- biolib/_internal/templates/gui_template/tsconfig.json +24 -0
- biolib/_internal/templates/gui_template/vite-plugin-dev-data.ts +50 -0
- biolib/_internal/templates/gui_template/vite.config.mts +10 -0
- biolib/_internal/templates/init_template/.biolib/config.yml +19 -0
- biolib/_internal/templates/init_template/Dockerfile +14 -0
- biolib/_internal/templates/init_template/requirements.txt +1 -0
- biolib/_internal/templates/init_template/run.py +12 -0
- biolib/_internal/templates/init_template/run.sh +4 -0
- biolib/_internal/templates/templates.py +25 -0
- biolib/_internal/tree_utils.py +106 -0
- biolib/_internal/utils/__init__.py +65 -0
- biolib/_internal/utils/auth.py +46 -0
- biolib/_internal/utils/job_url.py +33 -0
- biolib/_internal/utils/multinode.py +263 -0
- biolib/_runtime/runtime.py +157 -0
- biolib/_session/session.py +44 -0
- biolib/_shared/__init__.py +0 -0
- biolib/_shared/types/__init__.py +74 -0
- biolib/_shared/types/account.py +12 -0
- biolib/_shared/types/account_member.py +8 -0
- biolib/_shared/types/app.py +9 -0
- biolib/_shared/types/data_record.py +40 -0
- biolib/_shared/types/experiment.py +32 -0
- biolib/_shared/types/file_node.py +17 -0
- biolib/_shared/types/push.py +6 -0
- biolib/_shared/types/resource.py +37 -0
- biolib/_shared/types/resource_deploy_key.py +11 -0
- biolib/_shared/types/resource_permission.py +14 -0
- biolib/_shared/types/resource_version.py +19 -0
- biolib/_shared/types/result.py +14 -0
- biolib/_shared/types/typing.py +10 -0
- biolib/_shared/types/user.py +19 -0
- biolib/_shared/utils/__init__.py +7 -0
- biolib/_shared/utils/resource_uri.py +75 -0
- biolib/api/__init__.py +6 -0
- biolib/api/client.py +168 -0
- biolib/app/app.py +252 -49
- biolib/app/search_apps.py +45 -0
- biolib/biolib_api_client/api_client.py +126 -31
- biolib/biolib_api_client/app_types.py +24 -4
- biolib/biolib_api_client/auth.py +31 -8
- biolib/biolib_api_client/biolib_app_api.py +147 -52
- biolib/biolib_api_client/biolib_job_api.py +161 -141
- biolib/biolib_api_client/job_types.py +21 -5
- biolib/biolib_api_client/lfs_types.py +7 -23
- biolib/biolib_api_client/user_state.py +56 -0
- biolib/biolib_binary_format/__init__.py +1 -4
- biolib/biolib_binary_format/file_in_container.py +105 -0
- biolib/biolib_binary_format/module_input.py +24 -7
- biolib/biolib_binary_format/module_output_v2.py +149 -0
- biolib/biolib_binary_format/remote_endpoints.py +34 -0
- biolib/biolib_binary_format/remote_stream_seeker.py +59 -0
- biolib/biolib_binary_format/saved_job.py +3 -2
- biolib/biolib_binary_format/{attestation_document.py → stdout_and_stderr.py} +8 -8
- biolib/biolib_binary_format/system_status_update.py +3 -2
- biolib/biolib_binary_format/utils.py +175 -0
- biolib/biolib_docker_client/__init__.py +11 -2
- biolib/biolib_errors.py +36 -0
- biolib/biolib_logging.py +27 -10
- biolib/cli/__init__.py +38 -0
- biolib/cli/auth.py +46 -0
- biolib/cli/data_record.py +164 -0
- biolib/cli/index.py +32 -0
- biolib/cli/init.py +421 -0
- biolib/cli/lfs.py +101 -0
- biolib/cli/push.py +50 -0
- biolib/cli/run.py +63 -0
- biolib/cli/runtime.py +14 -0
- biolib/cli/sdk.py +16 -0
- biolib/cli/start.py +56 -0
- biolib/compute_node/cloud_utils/cloud_utils.py +110 -161
- biolib/compute_node/job_worker/cache_state.py +66 -88
- biolib/compute_node/job_worker/cache_types.py +1 -6
- biolib/compute_node/job_worker/docker_image_cache.py +112 -37
- biolib/compute_node/job_worker/executors/__init__.py +0 -3
- biolib/compute_node/job_worker/executors/docker_executor.py +532 -199
- biolib/compute_node/job_worker/executors/docker_types.py +9 -1
- biolib/compute_node/job_worker/executors/types.py +19 -9
- biolib/compute_node/job_worker/job_legacy_input_wait_timeout_thread.py +30 -0
- biolib/compute_node/job_worker/job_max_runtime_timer_thread.py +3 -5
- biolib/compute_node/job_worker/job_storage.py +108 -0
- biolib/compute_node/job_worker/job_worker.py +397 -212
- biolib/compute_node/job_worker/large_file_system.py +87 -38
- biolib/compute_node/job_worker/network_alloc.py +99 -0
- biolib/compute_node/job_worker/network_buffer.py +240 -0
- biolib/compute_node/job_worker/utilization_reporter_thread.py +197 -0
- biolib/compute_node/job_worker/utils.py +9 -24
- biolib/compute_node/remote_host_proxy.py +400 -98
- biolib/compute_node/utils.py +31 -9
- biolib/compute_node/webserver/compute_node_results_proxy.py +189 -0
- biolib/compute_node/webserver/proxy_utils.py +28 -0
- biolib/compute_node/webserver/webserver.py +130 -44
- biolib/compute_node/webserver/webserver_types.py +2 -6
- biolib/compute_node/webserver/webserver_utils.py +77 -12
- biolib/compute_node/webserver/worker_thread.py +183 -42
- biolib/experiments/__init__.py +0 -0
- biolib/experiments/experiment.py +356 -0
- biolib/jobs/__init__.py +1 -0
- biolib/jobs/job.py +741 -0
- biolib/jobs/job_result.py +185 -0
- biolib/jobs/types.py +50 -0
- biolib/py.typed +0 -0
- biolib/runtime/__init__.py +14 -0
- biolib/sdk/__init__.py +91 -0
- biolib/tables.py +34 -0
- biolib/typing_utils.py +2 -7
- biolib/user/__init__.py +1 -0
- biolib/user/sign_in.py +54 -0
- biolib/utils/__init__.py +162 -0
- biolib/utils/cache_state.py +94 -0
- biolib/utils/multipart_uploader.py +194 -0
- biolib/utils/seq_util.py +150 -0
- biolib/utils/zip/remote_zip.py +640 -0
- pybiolib-1.2.1890.dist-info/METADATA +41 -0
- pybiolib-1.2.1890.dist-info/RECORD +177 -0
- {pybiolib-0.2.951.dist-info → pybiolib-1.2.1890.dist-info}/WHEEL +1 -1
- pybiolib-1.2.1890.dist-info/entry_points.txt +2 -0
- README.md +0 -17
- biolib/app/app_result.py +0 -68
- biolib/app/utils.py +0 -62
- biolib/biolib-js/0-biolib.worker.js +0 -1
- biolib/biolib-js/1-biolib.worker.js +0 -1
- biolib/biolib-js/2-biolib.worker.js +0 -1
- biolib/biolib-js/3-biolib.worker.js +0 -1
- biolib/biolib-js/4-biolib.worker.js +0 -1
- biolib/biolib-js/5-biolib.worker.js +0 -1
- biolib/biolib-js/6-biolib.worker.js +0 -1
- biolib/biolib-js/index.html +0 -10
- biolib/biolib-js/main-biolib.js +0 -1
- biolib/biolib_api_client/biolib_account_api.py +0 -21
- biolib/biolib_api_client/biolib_large_file_system_api.py +0 -108
- biolib/biolib_binary_format/aes_encrypted_package.py +0 -42
- biolib/biolib_binary_format/module_output.py +0 -58
- biolib/biolib_binary_format/rsa_encrypted_aes_package.py +0 -57
- biolib/biolib_push.py +0 -114
- biolib/cli.py +0 -203
- biolib/cli_utils.py +0 -273
- biolib/compute_node/cloud_utils/enclave_parent_types.py +0 -7
- biolib/compute_node/enclave/__init__.py +0 -2
- biolib/compute_node/enclave/enclave_remote_hosts.py +0 -53
- biolib/compute_node/enclave/nitro_secure_module_utils.py +0 -64
- biolib/compute_node/job_worker/executors/base_executor.py +0 -18
- biolib/compute_node/job_worker/executors/pyppeteer_executor.py +0 -173
- biolib/compute_node/job_worker/executors/remote/__init__.py +0 -1
- biolib/compute_node/job_worker/executors/remote/nitro_enclave_utils.py +0 -81
- biolib/compute_node/job_worker/executors/remote/remote_executor.py +0 -51
- biolib/lfs.py +0 -196
- biolib/pyppeteer/.circleci/config.yml +0 -100
- biolib/pyppeteer/.coveragerc +0 -3
- biolib/pyppeteer/.gitignore +0 -89
- biolib/pyppeteer/.pre-commit-config.yaml +0 -28
- biolib/pyppeteer/CHANGES.md +0 -253
- biolib/pyppeteer/CONTRIBUTING.md +0 -26
- biolib/pyppeteer/LICENSE +0 -12
- biolib/pyppeteer/README.md +0 -137
- biolib/pyppeteer/docs/Makefile +0 -177
- biolib/pyppeteer/docs/_static/custom.css +0 -28
- biolib/pyppeteer/docs/_templates/layout.html +0 -10
- biolib/pyppeteer/docs/changes.md +0 -1
- biolib/pyppeteer/docs/conf.py +0 -299
- biolib/pyppeteer/docs/index.md +0 -21
- biolib/pyppeteer/docs/make.bat +0 -242
- biolib/pyppeteer/docs/reference.md +0 -211
- biolib/pyppeteer/docs/server.py +0 -60
- biolib/pyppeteer/poetry.lock +0 -1699
- biolib/pyppeteer/pyppeteer/__init__.py +0 -135
- biolib/pyppeteer/pyppeteer/accessibility.py +0 -286
- biolib/pyppeteer/pyppeteer/browser.py +0 -401
- biolib/pyppeteer/pyppeteer/browser_fetcher.py +0 -194
- biolib/pyppeteer/pyppeteer/command.py +0 -22
- biolib/pyppeteer/pyppeteer/connection/__init__.py +0 -242
- biolib/pyppeteer/pyppeteer/connection/cdpsession.py +0 -101
- biolib/pyppeteer/pyppeteer/coverage.py +0 -346
- biolib/pyppeteer/pyppeteer/device_descriptors.py +0 -787
- biolib/pyppeteer/pyppeteer/dialog.py +0 -79
- biolib/pyppeteer/pyppeteer/domworld.py +0 -597
- biolib/pyppeteer/pyppeteer/emulation_manager.py +0 -53
- biolib/pyppeteer/pyppeteer/errors.py +0 -48
- biolib/pyppeteer/pyppeteer/events.py +0 -63
- biolib/pyppeteer/pyppeteer/execution_context.py +0 -156
- biolib/pyppeteer/pyppeteer/frame/__init__.py +0 -299
- biolib/pyppeteer/pyppeteer/frame/frame_manager.py +0 -306
- biolib/pyppeteer/pyppeteer/helpers.py +0 -245
- biolib/pyppeteer/pyppeteer/input.py +0 -371
- biolib/pyppeteer/pyppeteer/jshandle.py +0 -598
- biolib/pyppeteer/pyppeteer/launcher.py +0 -683
- biolib/pyppeteer/pyppeteer/lifecycle_watcher.py +0 -169
- biolib/pyppeteer/pyppeteer/models/__init__.py +0 -103
- biolib/pyppeteer/pyppeteer/models/_protocol.py +0 -12460
- biolib/pyppeteer/pyppeteer/multimap.py +0 -82
- biolib/pyppeteer/pyppeteer/network_manager.py +0 -678
- biolib/pyppeteer/pyppeteer/options.py +0 -8
- biolib/pyppeteer/pyppeteer/page.py +0 -1728
- biolib/pyppeteer/pyppeteer/pipe_transport.py +0 -59
- biolib/pyppeteer/pyppeteer/target.py +0 -147
- biolib/pyppeteer/pyppeteer/task_queue.py +0 -24
- biolib/pyppeteer/pyppeteer/timeout_settings.py +0 -36
- biolib/pyppeteer/pyppeteer/tracing.py +0 -93
- biolib/pyppeteer/pyppeteer/us_keyboard_layout.py +0 -305
- biolib/pyppeteer/pyppeteer/util.py +0 -18
- biolib/pyppeteer/pyppeteer/websocket_transport.py +0 -47
- biolib/pyppeteer/pyppeteer/worker.py +0 -101
- biolib/pyppeteer/pyproject.toml +0 -97
- biolib/pyppeteer/spell.txt +0 -137
- biolib/pyppeteer/tox.ini +0 -72
- biolib/pyppeteer/utils/generate_protocol_types.py +0 -603
- biolib/start_cli.py +0 -7
- biolib/utils.py +0 -47
- biolib/validators/validate_app_version.py +0 -183
- biolib/validators/validate_argument.py +0 -134
- biolib/validators/validate_module.py +0 -323
- biolib/validators/validate_zip_file.py +0 -40
- biolib/validators/validator_utils.py +0 -103
- pybiolib-0.2.951.dist-info/LICENSE +0 -21
- pybiolib-0.2.951.dist-info/METADATA +0 -61
- pybiolib-0.2.951.dist-info/RECORD +0 -153
- pybiolib-0.2.951.dist-info/entry_points.txt +0 -3
- /LICENSE → /pybiolib-1.2.1890.dist-info/licenses/LICENSE +0 -0
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
import abc
|
|
2
|
+
import json
|
|
3
|
+
import os
|
|
4
|
+
import time
|
|
5
|
+
from datetime import datetime, timezone
|
|
6
|
+
|
|
7
|
+
import appdirs # type: ignore
|
|
8
|
+
|
|
9
|
+
from biolib.biolib_errors import BioLibError
|
|
10
|
+
from biolib.biolib_logging import logger_no_user_data
|
|
11
|
+
from biolib.typing_utils import Generic, Optional, TypeVar
|
|
12
|
+
|
|
13
|
+
# Type of the state object a concrete CacheState loads from and saves to its JSON file
StateType = TypeVar('StateType')  # pylint: disable=invalid-name


class CacheStateError(BioLibError):
    """Raised by CacheState when its lock cannot be acquired or released, or its state is undefined."""
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class CacheState(abc.ABC, Generic[StateType]):
    """Context manager giving exclusive access to a JSON state file guarded by a lock file.

    Entering creates the lock file (``<state path>.lock``), loads the state from the
    state file (or writes the default state if the file does not exist) and returns it.
    Exiting writes the state back to the state file and removes the lock file.

    Subclasses must implement ``_state_path`` and ``_get_default_state``.
    """

    @property
    @abc.abstractmethod
    def _state_path(self) -> str:
        """Path of the JSON file the state is persisted to."""
        raise NotImplementedError

    @abc.abstractmethod
    def _get_default_state(self) -> StateType:
        """Return the state to use (and persist) when no state file exists yet."""
        raise NotImplementedError

    @property
    def _user_cache_dir(self) -> str:
        """Per-user pybiolib cache directory; created on access if it does not exist."""
        user_cache_dir: str = appdirs.user_cache_dir(appname='pybiolib', appauthor='biolib')
        os.makedirs(user_cache_dir, exist_ok=True)
        return user_cache_dir

    @property
    def _state_lock_path(self) -> str:
        """Path of the lock file that guards the state file."""
        return f'{self._state_path}.lock'

    def __init__(self, fail_fast_on_lock_acquire: bool = False) -> None:
        """
        Args:
            fail_fast_on_lock_acquire: If True, raise CacheStateError on the first failed attempt
                to create the lock file instead of retrying.
        """
        self._state: Optional[StateType] = None
        self._fail_fast_on_lock_acquire: bool = fail_fast_on_lock_acquire

    def __enter__(self) -> StateType:
        """Acquire the lock, load (or initialize) the state and return it.

        Raises:
            CacheStateError: If the lock could not be acquired or the loaded state is None.
        """
        logger_no_user_data.debug(f'CacheState: Entering state path: {self._state_path}...')
        try:
            self._acquire_state_lock()
        except BaseException as error:  # pylint: disable=broad-except
            logger_no_user_data.debug(f'Could not get LFS lock, got error: {error}...')
            raise

        try:
            if os.path.exists(self._state_path):
                with open(self._state_path, mode='r') as file:
                    self._state = json.loads(file.read())
            else:
                self._state = self._get_default_state()
                with open(self._state_path, mode='w') as file:
                    file.write(json.dumps(self._state))

            # Check for type checking
            if self._state is None:
                raise CacheStateError('Internal state is not defined')
        except BaseException as error:  # pylint: disable=broad-except
            # __exit__ is not called when __enter__ raises, so the lock must be released here;
            # otherwise e.g. a corrupt state file would leave the lock file behind forever
            logger_no_user_data.debug(f'CacheState: Failed to load state, releasing lock. Got error: {error}')
            try:
                self._release_state_lock()
            except Exception as release_error:  # pylint: disable=broad-except
                # Do not let a release failure mask the original error
                logger_no_user_data.debug(f'CacheState: Failed to release lock, got error: {release_error}')
            raise

        return self._state

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        """Persist the state and release the lock; exceptions from the with-body are not suppressed."""
        try:
            with open(self._state_path, mode='w') as file:
                file.write(json.dumps(self._state))
        finally:
            # Release the lock even if the state could not be written
            self._release_state_lock()

        logger_no_user_data.debug(f'CacheState: Exited state path: {self._state_path}')

    def _acquire_state_lock(self) -> None:
        """Create the lock file, retrying up to 10 times with 0.5 seconds between attempts.

        Raises:
            CacheStateError: On the first failure if fail-fast is enabled, otherwise after all
                attempts have failed.
        """
        for _ in range(10):
            try:
                # mode='x' fails if the file already exists, so creating the file is taking the lock
                with open(self._state_lock_path, mode='x'):
                    pass
                return
            except Exception as error:  # pylint: disable=broad-except
                # Exception (not BaseException) so KeyboardInterrupt/SystemExit are not swallowed and retried
                logger_no_user_data.debug(f'Failed to acquire lock file "{self._state_lock_path}". Got error: {error}')
                if self._fail_fast_on_lock_acquire:
                    raise CacheStateError(f'Failed to acquire lock file "{self._state_lock_path}": {error}') from error

            time.sleep(0.5)

        raise CacheStateError(f'Cache state timed out waiting to acquire lock file "{self._state_lock_path}"')

    def _release_state_lock(self) -> None:
        """Remove the lock file.

        Raises:
            CacheStateError: If the lock file does not exist.
        """
        try:
            # Remove directly instead of checking existence first, to avoid a check-then-remove race
            os.remove(self._state_lock_path)
        except FileNotFoundError as error:
            raise CacheStateError('Cache state was not locked.') from error

    @staticmethod
    def get_timestamp_now() -> str:
        """Return the current UTC time as an ISO 8601 string."""
        return datetime.now(timezone.utc).isoformat()
|
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
import math
|
|
2
|
+
import multiprocessing
|
|
3
|
+
import multiprocessing.pool
|
|
4
|
+
import os
|
|
5
|
+
import time
|
|
6
|
+
from urllib.parse import urlparse
|
|
7
|
+
|
|
8
|
+
import biolib.api
|
|
9
|
+
from biolib._internal.http_client import HttpClient
|
|
10
|
+
from biolib.biolib_api_client import BiolibApiClient
|
|
11
|
+
from biolib.biolib_errors import BioLibError
|
|
12
|
+
from biolib.biolib_logging import logger, logger_no_user_data
|
|
13
|
+
from biolib.typing_utils import Callable, Dict, Iterator, List, Optional, Tuple, TypedDict
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def get_chunk_iterator_from_bytes(byte_buffer: bytes, chunk_size_in_bytes: int = 50_000_000) -> Iterator[bytes]:
    """Yield consecutive slices of `byte_buffer`, each at most `chunk_size_in_bytes` bytes long.

    The last slice holds the remainder; an empty buffer yields nothing.
    """
    total_chunks = math.ceil(len(byte_buffer) / chunk_size_in_bytes)
    offsets = (index * chunk_size_in_bytes for index in range(total_chunks))
    for offset in offsets:
        yield byte_buffer[offset : offset + chunk_size_in_bytes]
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def get_chunk_iterator_from_file_object(file_object, chunk_size_in_bytes: int = 50_000_000) -> Iterator[bytes]:
    """Yield successive `file_object.read(chunk_size_in_bytes)` results until a read returns nothing."""
    chunk = file_object.read(chunk_size_in_bytes)
    while chunk:
        yield chunk
        chunk = file_object.read(chunk_size_in_bytes)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class RequestOptions(TypedDict):
    """Description of one API request issued by MultiPartUploader."""

    # Extra HTTP headers to send with the request, or None
    headers: Optional[Dict[str, str]]
    # Whether the request is sent authenticated (and the auth token refreshed beforehand)
    requires_biolib_auth: bool
    # API path the request is sent to
    path: str
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class _PartMetadata(TypedDict):
    """Metadata of one uploaded part, as sent in the 'parts' list of the complete-upload request."""

    # Value of the 'ETag' response header returned when the part was uploaded
    ETag: str
    # 1-based number of the part within the upload
    PartNumber: int


# Input of a single part upload: (part number, chunk bytes)
_UploadChunkInputType = Tuple[int, bytes]
# Result of a single part upload: (part metadata, chunk length in bytes)
_UploadChunkReturnType = Tuple[_PartMetadata, int]
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class MultiPartUploader:
    """Upload a payload as numbered parts to presigned URLs, then complete the upload.

    For every chunk a presigned upload URL is requested from the BioLib API and the chunk
    is sent to it with an HTTP PUT. When all parts are uploaded, the complete-upload request
    is posted with the collected part metadata and the total number of bytes uploaded.
    """

    def __init__(
        self,
        complete_upload_request: RequestOptions,
        get_presigned_upload_url_request: RequestOptions,
        start_multipart_upload_request: Optional[RequestOptions] = None,
        use_process_pool: Optional[bool] = None,
        on_progress: Optional[Callable[[int, int], None]] = None,
    ):
        """
        Args:
            complete_upload_request: Request posted once all parts have been uploaded.
            get_presigned_upload_url_request: Request used to fetch the presigned URL of each part.
            start_multipart_upload_request: Optional request posted before any part is uploaded.
            use_process_pool: If truthy, parts are uploaded from a process pool, otherwise from a
                thread pool. Ignored when the upload runs sequentially (see `upload`).
            on_progress: Optional callback invoked after each part with
                (bytes uploaded so far, payload size in bytes).
        """
        self._complete_upload_request = complete_upload_request
        self._get_presigned_upload_url_request = get_presigned_upload_url_request
        self._start_multipart_upload_request = start_multipart_upload_request
        self._bytes_uploaded: int = 0
        self._use_process_pool = use_process_pool
        self._on_progress = on_progress

    def upload(self, payload_iterator: Iterator[bytes], payload_size_in_bytes: int) -> None:
        """Upload every chunk of `payload_iterator` as one part and complete the upload.

        Args:
            payload_iterator: Iterator of chunks; chunk i (1-based) is uploaded as part number i.
            payload_size_in_bytes: Total payload size, used for logging and progress reporting only.

        Raises:
            BioLibError: If a part could not be uploaded within the maximum number of retries.
        """
        parts: List[_PartMetadata] = []
        # Start from zero so 'size_bytes' and progress describe this upload only, even if the instance is reused
        self._bytes_uploaded = 0

        iterator_with_index: Iterator[_UploadChunkInputType] = enumerate(payload_iterator, 1)  # type: ignore
        logger_no_user_data.debug(f'Starting multipart upload of payload with size {payload_size_in_bytes} bytes')

        if self._start_multipart_upload_request:
            try:
                biolib.api.client.post(
                    authenticate=self._start_multipart_upload_request['requires_biolib_auth'],
                    headers=self._start_multipart_upload_request['headers'],
                    path=self._start_multipart_upload_request['path'],
                )
            except BaseException as error:
                logger_no_user_data.debug(f'Failed to start multipart upload got error: {error}')
                raise

        # if multiprocessing start method is spawn or we are running in a daemon process,
        # multiprocessing.Pool may fail when called from script
        if multiprocessing.get_start_method() == 'spawn' or multiprocessing.current_process().daemon:
            logger_no_user_data.debug('Uploading multipart from main process...')
            for chunk in iterator_with_index:
                upload_chunk_response = self._upload_chunk(chunk)
                self._update_progress_bar_and_parts(
                    upload_chunk_response=upload_chunk_response,
                    parts=parts,
                    payload_size_in_bytes=payload_size_in_bytes,
                )
        else:
            # use 16 workers, unless fewer cores are available; never less than 1, as a pool of
            # size 0 (single-core machine) raises ValueError
            pool_size = max(1, min(16, multiprocessing.cpu_count() - 1))
            process_pool = (
                multiprocessing.Pool(pool_size)
                if self._use_process_pool
                else multiprocessing.pool.ThreadPool(pool_size)
            )

            try:
                response: _UploadChunkReturnType
                # imap yields results in input order, so parts are appended in part-number order
                for response in process_pool.imap(self._upload_chunk, iterator_with_index):
                    self._update_progress_bar_and_parts(
                        upload_chunk_response=response, parts=parts, payload_size_in_bytes=payload_size_in_bytes
                    )
            finally:
                logger_no_user_data.debug('Multipart upload closing process pool...')
                process_pool.close()

        requires_biolib_auth = self._complete_upload_request['requires_biolib_auth']
        if requires_biolib_auth:
            BiolibApiClient.refresh_auth_token()

        logger_no_user_data.debug(f'Uploaded {len(parts)} parts, now calling complete upload...')
        biolib.api.client.post(
            authenticate=requires_biolib_auth,
            headers=self._complete_upload_request['headers'],
            data={'parts': parts, 'size_bytes': self._bytes_uploaded},
            path=self._complete_upload_request['path'],
        )

    def _upload_chunk(self, _input: _UploadChunkInputType) -> _UploadChunkReturnType:
        """Upload a single part, retrying on any error with an increasing delay.

        Args:
            _input: Tuple of (part number, chunk bytes).

        Returns:
            Tuple of (part metadata holding the part number and returned ETag, chunk length in bytes).

        Raises:
            BioLibError: If the part could not be uploaded after all retries.
        """
        part_number, chunk = _input
        requires_biolib_auth = self._get_presigned_upload_url_request['requires_biolib_auth']

        for index in range(20):  # will fail after approximately sum_i(i^2+2) = 41 min if range (20)
            if requires_biolib_auth:
                BiolibApiClient.refresh_auth_token()

            logger_no_user_data.debug(f'Uploading part number {part_number} with size {len(chunk)} bytes...')
            presigned_upload_url = None
            try:
                logger_no_user_data.debug(f'Getting upload URL for chunk {part_number}...')
                get_url_response = biolib.api.client.get(
                    authenticate=requires_biolib_auth,
                    headers=self._get_presigned_upload_url_request['headers'],
                    params={'part_number': part_number},
                    path=self._get_presigned_upload_url_request['path'],
                )

                presigned_upload_url = get_url_response.json()['presigned_upload_url']

            except Exception as error:  # pylint: disable=broad-except
                logger_no_user_data.warning(f'Error when getting url for part {part_number}. Retrying...')
                logger.debug(f'Upload error: {error}')

            if presigned_upload_url:
                try:
                    app_caller_proxy_job_storage_base_url = os.getenv('BIOLIB_CLOUD_JOB_STORAGE_BASE_URL', '')
                    if app_caller_proxy_job_storage_base_url:
                        # Done to hit App Caller Proxy when uploading result from inside an app:
                        # keep the path and query of the presigned URL but replace its base URL
                        parsed_url = urlparse(presigned_upload_url)
                        presigned_upload_url = (
                            f'{app_caller_proxy_job_storage_base_url}{parsed_url.path}?{parsed_url.query}'
                        )

                    put_chunk_response = HttpClient.request(
                        url=presigned_upload_url,
                        data=chunk,
                        method='PUT',
                        timeout_in_seconds=300,
                    )
                    return _PartMetadata(PartNumber=part_number, ETag=put_chunk_response.headers['ETag']), len(chunk)

                except Exception as error:  # pylint: disable=broad-except
                    logger_no_user_data.warning(f'Encountered error when uploading part {part_number}. Retrying...')
                    logger.debug(f'Upload error: {error} ({presigned_upload_url})')

            # Quadratic back-off before the next attempt
            time.sleep(index * index + 2)

        logger_no_user_data.debug(f'Max retries hit, when uploading part {part_number}. Exiting...')
        raise BioLibError(f'Max retries hit, when uploading part {part_number}. Exiting...')

    def _update_progress_bar_and_parts(
        self,
        upload_chunk_response: _UploadChunkReturnType,
        parts: List[_PartMetadata],
        payload_size_in_bytes: int,
    ) -> None:
        """Record an uploaded part, update the uploaded byte count and report progress.

        Args:
            upload_chunk_response: Result of `_upload_chunk` for the part.
            parts: List the part metadata is appended to (mutated in place).
            payload_size_in_bytes: Total payload size used to compute the progress.
        """
        part_metadata, chunk_byte_length = upload_chunk_response
        part_number = part_metadata['PartNumber']

        parts.append(part_metadata)
        self._bytes_uploaded += chunk_byte_length

        if self._on_progress is not None:
            self._on_progress(self._bytes_uploaded, payload_size_in_bytes)

        # The +1 avoids a division by zero for an empty payload
        approx_progress_percent = min(self._bytes_uploaded / (payload_size_in_bytes + 1) * 100, 100)
        approx_rounded_progress = round(approx_progress_percent, 2)
        logger_no_user_data.debug(
            f'Uploaded part number {part_number} with size {chunk_byte_length} bytes, '
            f'the approximate progress is {approx_rounded_progress}%'
        )
|
biolib/utils/seq_util.py
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from io import BufferedIOBase, TextIOBase
|
|
3
|
+
|
|
4
|
+
from biolib.typing_utils import Dict, Iterator, List, Optional, Union
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class SeqUtilRecord:
    """One sequence entry: its ID, the sequence string, an optional description and properties."""

    def __init__(
        self,
        sequence: str,
        sequence_id: str,
        description: Optional['str'] = None,
        properties: Optional[Dict[str, str]] = None,
    ):
        """
        Args:
            sequence: The sequence string.
            sequence_id: Identifier of the record, stored as `id`.
            description: Optional free-text description.
            properties: Optional key/value pairs; neither keys nor values may contain
                '=', '[', ']' or a newline. None or an empty dict results in `{}`.
        """
        self.sequence = sequence
        self.id = sequence_id  # pylint: disable=invalid-name
        self.description = description

        if not properties:
            self.properties: Dict[str, str] = {}
        else:
            # These characters delimit the ' [key=value]' pairs written by SeqUtil.write_records_to_fasta
            forbidden = re.compile(r'[=\[\]\n]')
            for name, text in properties.items():
                assert forbidden.search(name) is None, 'Key cannot contain characters =[] and newline'
                assert forbidden.search(text) is None, 'Value cannot contain characters =[] and newline'
            self.properties = properties

    def __repr__(self) -> str:
        return f'{type(self).__name__} ({self.id})'
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class SeqUtil:
    """Helpers for parsing, validating and writing FASTA sequence records."""

    # Allowed-character sets are built once here instead of on every validation call
    _ALLOWED_SEQUENCE_CHARS = frozenset('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_.')
    # Equivalent to fair-esm alphabet, compatible with ESM-models
    # Excludes digits, '_' and 'J' (ambiguous letter only used in mass-spec NMR)
    # https://github.com/facebookresearch/esm/blob/2b369911bb5b4b0dda914521b9475cad1656b2ac/esm/constants.py#L8
    _ALLOWED_SEQUENCE_CHARS_STRICT = frozenset('lagvsertidpkqnfymhwcxbuzoLAGVSERTIDPKQNFYMHWCXBUZO-.')
    _ALLOWED_SEQUENCE_ID_CHARS = frozenset('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_.:*#')

    @staticmethod
    def parse_fasta(
        input_file: Union[str, BufferedIOBase, TextIOBase, None] = None,
        default_header: Optional[str] = None,
        allow_any_sequence_characters: bool = False,
        use_strict_alphabet: Optional[bool] = False,
        allow_empty_sequence: bool = True,
        file_name: Optional[str] = None,
    ) -> Iterator['SeqUtilRecord']:
        """Lazily parse FASTA input and yield one SeqUtilRecord per entry.

        Args:
            input_file: Path of a FASTA file (read as UTF-8), or a binary or text file object.
            default_header: If set, sequence lines found before the first header each become
                their own record with the header `default_header` followed by the
                zero-based line number.
            allow_any_sequence_characters: Skip sequence character validation.
            use_strict_alphabet: Validate sequences against the strict (ESM compatible) alphabet.
                Mutually exclusive with `allow_any_sequence_characters`.
            allow_empty_sequence: If False, a header without a sequence raises.
            file_name: Path used when `input_file` is None; also used in error messages.

        Raises:
            ValueError: If no usable input was given.
            Exception: On invalid input: conflicting options, an empty header line, an invalid
                sequence character, a disallowed empty sequence, or sequence data without a
                header when no `default_header` is set.
        """

        def process_and_yield_record(header: str, sequence_lines: List[str]):
            header_parts = header.split(maxsplit=1)
            if not header_parts:
                # A bare ">" line carries no ID; report it clearly instead of failing with an IndexError
                raise Exception('Error: Empty header line found in FASTA input (missing sequence ID)')

            sequence_id = header_parts[0]
            # Everything after the ID is the description ('' if there is none)
            description = header_parts[1].strip() if len(header_parts) > 1 else ''
            sequence = ''.join(sequence_lines)

            if allow_any_sequence_characters and use_strict_alphabet:
                raise Exception(
                    'Error: Please choose either allow_any_sequence_characters or use_strict_alphabet'
                )
            if not allow_any_sequence_characters:
                if use_strict_alphabet:
                    invalid_sequence_characters = SeqUtil._find_invalid_sequence_characters_strict(sequence)
                else:
                    invalid_sequence_characters = SeqUtil._find_invalid_sequence_characters(sequence)
                if invalid_sequence_characters:
                    raise Exception(
                        f'Error: Invalid character ("{invalid_sequence_characters[0]}") found in sequence {sequence_id}'
                    )
            if not allow_empty_sequence and not sequence:
                raise Exception(f'Error: No sequence found for fasta entry {sequence_id}')
            yield SeqUtilRecord(
                sequence=sequence,
                sequence_id=sequence_id,
                description=description,
            )

        def line_generator_from_buffered_io_base(file_handle: BufferedIOBase) -> Iterator[str]:
            for line in file_handle:
                yield line.decode('utf-8')

        def line_generator_from_text_io_base(file_handle: TextIOBase) -> Iterator[str]:
            for line in file_handle:
                yield line

        if input_file is None:
            if not file_name:
                raise ValueError('input_file must be a file name (str) or a BufferedIOBase object')
            input_file = file_name

        # Name shown in error messages; None when reading from an unnamed file object
        source_name = file_name or (input_file if isinstance(input_file, str) else None)

        file_handle = None
        if isinstance(input_file, str):
            file_handle = open(input_file, 'rb')
            line_iterator = line_generator_from_buffered_io_base(file_handle)
        elif isinstance(input_file, BufferedIOBase):
            line_iterator = line_generator_from_buffered_io_base(input_file)
        elif isinstance(input_file, TextIOBase):
            line_iterator = line_generator_from_text_io_base(input_file)
        else:
            raise ValueError('input_file must be a file name (str) or a BufferedIOBase object')

        header = None
        sequence_lines: List[str] = []

        try:
            for line_number, line in enumerate(line_iterator):
                line = line.strip()
                if not line:
                    continue  # skip empty lines
                if line.startswith('>'):
                    if header is not None:
                        yield from process_and_yield_record(header, sequence_lines)

                    header = line[1:].strip()
                    sequence_lines = []
                else:
                    if header is None:
                        if default_header:
                            yield from process_and_yield_record(f'{default_header}{line_number}', [line])
                        elif source_name:
                            raise Exception(f'No header line found in FASTA file "{source_name}"')
                        else:
                            raise Exception('No header line found in FASTA input')
                    else:
                        sequence_lines.append(line)

            if header is not None:
                yield from process_and_yield_record(header, sequence_lines)
        finally:
            # Only a file opened by this function is closed; caller-provided objects are left open
            if file_handle:
                file_handle.close()

    @staticmethod
    def write_records_to_fasta(file_name: str, records: List['SeqUtilRecord']) -> None:
        """Write `records` to `file_name` in FASTA format with sequences wrapped at 80 characters.

        The header is `>id`, followed by the description (if any) and one ` [key=value]`
        entry per property. The file is written as UTF-8, matching what `parse_fasta` reads.
        """
        with open(file_name, mode='w', encoding='utf-8') as file_handle:
            for record in records:
                optional_description = f' {record.description}' if record.description else ''
                if record.properties:
                    for key, value in record.properties.items():
                        optional_description += f' [{key}={value}]'
                sequence = '\n'.join(record.sequence[i : i + 80] for i in range(0, len(record.sequence), 80))
                file_handle.write(f'>{record.id}{optional_description}\n{sequence}\n')

    @staticmethod
    def _find_invalid_sequence_characters(sequence: str) -> List[str]:
        """Return the characters of `sequence`, in order, that are not ASCII letters, digits, '-', '_' or '.'."""
        allowed_sequence_chars = SeqUtil._ALLOWED_SEQUENCE_CHARS
        return [char for char in sequence if char not in allowed_sequence_chars]

    @staticmethod
    def _find_invalid_sequence_characters_strict(sequence: str) -> List[str]:
        """Return the characters of `sequence`, in order, that are outside the strict alphabet."""
        allowed_sequence_chars = SeqUtil._ALLOWED_SEQUENCE_CHARS_STRICT
        return [char for char in sequence if char not in allowed_sequence_chars]

    @staticmethod
    def _find_invalid_sequence_id_characters(sequence: str) -> List[str]:
        """Return the characters of `sequence`, in order, that are not allowed in a sequence ID."""
        allowed_chars = SeqUtil._ALLOWED_SEQUENCE_ID_CHARS
        return [char for char in sequence if char not in allowed_chars]
|