pybiolib 1.2.883__py3-none-any.whl → 1.2.1890__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biolib/__init__.py +33 -10
- biolib/_data_record/data_record.py +220 -126
- biolib/_index/index.py +55 -0
- biolib/_index/query_result.py +103 -0
- biolib/_internal/add_copilot_prompts.py +24 -11
- biolib/_internal/add_gui_files.py +81 -0
- biolib/_internal/data_record/__init__.py +1 -1
- biolib/_internal/data_record/data_record.py +1 -18
- biolib/_internal/data_record/push_data.py +65 -16
- biolib/_internal/data_record/remote_storage_endpoint.py +18 -13
- biolib/_internal/file_utils.py +48 -0
- biolib/_internal/lfs/cache.py +4 -2
- biolib/_internal/push_application.py +95 -24
- biolib/_internal/runtime.py +2 -0
- biolib/_internal/string_utils.py +13 -0
- biolib/_internal/{llm_instructions → templates/copilot_template}/.github/instructions/style-general.instructions.md +5 -0
- biolib/_internal/templates/copilot_template/.github/instructions/style-react-ts.instructions.md +47 -0
- biolib/_internal/templates/copilot_template/.github/prompts/biolib_onboard_repo.prompt.md +19 -0
- biolib/_internal/templates/dashboard_template/.biolib/config.yml +5 -0
- biolib/_internal/templates/{init_template → github_workflow_template}/.github/workflows/biolib.yml +7 -2
- biolib/_internal/templates/gitignore_template/.gitignore +10 -0
- biolib/_internal/templates/gui_template/.yarnrc.yml +1 -0
- biolib/_internal/templates/gui_template/App.tsx +53 -0
- biolib/_internal/templates/gui_template/Dockerfile +27 -0
- biolib/_internal/templates/gui_template/biolib-sdk.ts +82 -0
- biolib/_internal/templates/gui_template/dev-data/output.json +7 -0
- biolib/_internal/templates/gui_template/index.css +5 -0
- biolib/_internal/templates/gui_template/index.html +13 -0
- biolib/_internal/templates/gui_template/index.tsx +10 -0
- biolib/_internal/templates/gui_template/package.json +27 -0
- biolib/_internal/templates/gui_template/tsconfig.json +24 -0
- biolib/_internal/templates/gui_template/vite-plugin-dev-data.ts +50 -0
- biolib/_internal/templates/gui_template/vite.config.mts +10 -0
- biolib/_internal/templates/init_template/.biolib/config.yml +1 -0
- biolib/_internal/templates/init_template/Dockerfile +5 -1
- biolib/_internal/templates/init_template/run.py +6 -15
- biolib/_internal/templates/init_template/run.sh +1 -0
- biolib/_internal/templates/templates.py +21 -1
- biolib/_internal/utils/__init__.py +47 -0
- biolib/_internal/utils/auth.py +46 -0
- biolib/_internal/utils/job_url.py +33 -0
- biolib/_internal/utils/multinode.py +12 -14
- biolib/_runtime/runtime.py +15 -2
- biolib/_session/session.py +7 -5
- biolib/_shared/__init__.py +0 -0
- biolib/_shared/types/__init__.py +74 -0
- biolib/_shared/types/account.py +12 -0
- biolib/_shared/types/account_member.py +8 -0
- biolib/{_internal → _shared}/types/experiment.py +1 -0
- biolib/_shared/types/resource.py +37 -0
- biolib/_shared/types/resource_deploy_key.py +11 -0
- biolib/{_internal → _shared}/types/resource_version.py +8 -2
- biolib/_shared/types/user.py +19 -0
- biolib/_shared/utils/__init__.py +7 -0
- biolib/_shared/utils/resource_uri.py +75 -0
- biolib/api/client.py +5 -48
- biolib/app/app.py +97 -55
- biolib/biolib_api_client/api_client.py +3 -47
- biolib/biolib_api_client/app_types.py +1 -1
- biolib/biolib_api_client/biolib_app_api.py +31 -6
- biolib/biolib_api_client/biolib_job_api.py +1 -1
- biolib/biolib_api_client/user_state.py +34 -2
- biolib/biolib_binary_format/module_input.py +8 -0
- biolib/biolib_binary_format/remote_endpoints.py +3 -3
- biolib/biolib_binary_format/remote_stream_seeker.py +39 -25
- biolib/biolib_logging.py +1 -1
- biolib/cli/__init__.py +2 -2
- biolib/cli/auth.py +4 -16
- biolib/cli/data_record.py +82 -0
- biolib/cli/index.py +32 -0
- biolib/cli/init.py +393 -71
- biolib/cli/lfs.py +1 -1
- biolib/cli/run.py +9 -6
- biolib/cli/start.py +14 -1
- biolib/compute_node/job_worker/executors/docker_executor.py +31 -9
- biolib/compute_node/job_worker/executors/docker_types.py +1 -1
- biolib/compute_node/job_worker/executors/types.py +6 -5
- biolib/compute_node/job_worker/job_storage.py +2 -1
- biolib/compute_node/job_worker/job_worker.py +155 -90
- biolib/compute_node/job_worker/large_file_system.py +2 -6
- biolib/compute_node/job_worker/network_alloc.py +99 -0
- biolib/compute_node/job_worker/network_buffer.py +240 -0
- biolib/compute_node/job_worker/utilization_reporter_thread.py +2 -2
- biolib/compute_node/remote_host_proxy.py +163 -79
- biolib/compute_node/utils.py +2 -0
- biolib/compute_node/webserver/compute_node_results_proxy.py +189 -0
- biolib/compute_node/webserver/proxy_utils.py +28 -0
- biolib/compute_node/webserver/webserver.py +64 -19
- biolib/experiments/experiment.py +111 -16
- biolib/jobs/job.py +128 -31
- biolib/jobs/job_result.py +74 -34
- biolib/jobs/types.py +1 -0
- biolib/sdk/__init__.py +28 -3
- biolib/typing_utils.py +1 -1
- biolib/utils/cache_state.py +8 -5
- biolib/utils/multipart_uploader.py +24 -18
- biolib/utils/seq_util.py +1 -1
- pybiolib-1.2.1890.dist-info/METADATA +41 -0
- pybiolib-1.2.1890.dist-info/RECORD +177 -0
- {pybiolib-1.2.883.dist-info → pybiolib-1.2.1890.dist-info}/WHEEL +1 -1
- pybiolib-1.2.1890.dist-info/entry_points.txt +2 -0
- biolib/_internal/llm_instructions/.github/instructions/style-react-ts.instructions.md +0 -22
- biolib/_internal/templates/init_template/.gitignore +0 -2
- biolib/_internal/types/__init__.py +0 -6
- biolib/_internal/types/resource.py +0 -18
- biolib/biolib_download_container.py +0 -38
- biolib/cli/download_container.py +0 -14
- biolib/utils/app_uri.py +0 -57
- pybiolib-1.2.883.dist-info/METADATA +0 -50
- pybiolib-1.2.883.dist-info/RECORD +0 -148
- pybiolib-1.2.883.dist-info/entry_points.txt +0 -3
- /biolib/{_internal/llm_instructions → _index}/__init__.py +0 -0
- /biolib/_internal/{llm_instructions → templates/copilot_template}/.github/instructions/general-app-knowledge.instructions.md +0 -0
- /biolib/_internal/{llm_instructions → templates/copilot_template}/.github/instructions/style-python.instructions.md +0 -0
- /biolib/_internal/{llm_instructions → templates/copilot_template}/.github/prompts/biolib_app_inputs.prompt.md +0 -0
- /biolib/_internal/{llm_instructions → templates/copilot_template}/.github/prompts/biolib_run_apps.prompt.md +0 -0
- /biolib/{_internal → _shared}/types/app.py +0 -0
- /biolib/{_internal → _shared}/types/data_record.py +0 -0
- /biolib/{_internal → _shared}/types/file_node.py +0 -0
- /biolib/{_internal → _shared}/types/push.py +0 -0
- /biolib/{_internal → _shared}/types/resource_permission.py +0 -0
- /biolib/{_internal → _shared}/types/result.py +0 -0
- /biolib/{_internal → _shared}/types/typing.py +0 -0
- {pybiolib-1.2.883.dist-info → pybiolib-1.2.1890.dist-info/licenses}/LICENSE +0 -0
biolib/__init__.py
CHANGED
@@ -1,3 +1,4 @@
+# ruff: noqa: I001
 # Imports to hide
 import os
 from urllib.parse import urlparse as _urlparse
@@ -15,6 +16,7 @@ from biolib.jobs.job import Result as _Result
 from biolib import user as _user
 from biolib.typing_utils import List, Optional, cast as _cast
 from biolib._data_record.data_record import DataRecord as _DataRecord
+from biolib._internal.utils.job_url import parse_result_id_or_url as _parse_result_id_or_url

 import biolib.api
 import biolib.app
@@ -22,7 +24,6 @@ import biolib.cli
 import biolib.sdk
 import biolib.utils

-
 # ------------------------------------ Function definitions for public Python API ------------------------------------


@@ -83,43 +84,65 @@ def search(


 def get_job(job_id: str, job_token: Optional[str] = None) -> _Result:
-    r"""Get a job by its ID.
+    r"""Get a job by its ID or full URL.

     Args:
-        job_id (str): The UUID of the job to retrieve
+        job_id (str): The UUID of the job to retrieve, or a full URL to the job.
+            Can be either:
+            - Job UUID (e.g., 'abc123')
+            - Full URL (e.g., 'https://biolib.com/result/abc123/?token=xyz789')
+            - Full URL with token parameter (e.g., 'biolib.com/result/abc123/token=xyz789')
         job_token (str, optional): Authentication token for accessing the job.
             Only needed for jobs that aren't owned by the current user.
+            If the URL contains a token, this parameter is ignored.

     Returns:
         Job: The job object

     Example::

+        >>> # Get by UUID
         >>> job = biolib.get_job('abc123')
-        >>> #
+        >>> # Get with explicit token
         >>> job = biolib.get_job('abc123', job_token='xyz789')
+        >>> # Get by full URL with token
+        >>> job = biolib.get_job('https://biolib.com/result/abc123/?token=xyz789')
+        >>> # Get by URL with inline token format
+        >>> job = biolib.get_job('biolib.com/result/abc123/token=xyz789')
     """
-
+    uuid, token = _parse_result_id_or_url(job_id, job_token)
+    return _Result.create_from_uuid(uuid=uuid, auth_token=token)


 def get_result(result_id: str, result_token: Optional[str] = None) -> _Result:
-    r"""Get a result by its ID.
+    r"""Get a result by its ID or full URL.

     Args:
-        result_id (str): The UUID of the result to retrieve
+        result_id (str): The UUID of the result to retrieve, or a full URL to the result.
+            Can be either:
+            - Result UUID (e.g., 'abc123')
+            - Full URL (e.g., 'https://biolib.com/result/abc123/?token=xyz789')
+            - Full URL with token parameter (e.g., 'biolib.com/result/abc123/token=xyz789')
         result_token (str, optional): Authentication token for accessing the result.
-            Only needed for
+            Only needed for results that aren't owned by the current user.
+            If the URL contains a token, this parameter is ignored.

     Returns:
         Result: The result object

     Example::

+        >>> # Get by UUID
         >>> result = biolib.get_result('abc123')
-        >>> #
+        >>> # Get with explicit token
         >>> result = biolib.get_result('abc123', result_token='xyz789')
+        >>> # Get by full URL with token
+        >>> result = biolib.get_result('https://biolib.com/result/abc123/?token=xyz789')
+        >>> # Get by URL with inline token format
+        >>> result = biolib.get_result('biolib.com/result/abc123/token=xyz789')
     """
-
+    uuid, token = _parse_result_id_or_url(result_id, result_token)
+    return _Result.create_from_uuid(uuid=uuid, auth_token=token)


 def get_data_record(uri: str) -> _DataRecord:
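The new _parse_result_id_or_url helper lets get_job and get_result accept a shared result URL as well as a bare UUID, with a token embedded in the URL taking precedence over the explicit token argument. A minimal usage sketch based on the docstring examples above (the UUID and token values are placeholders, not real identifiers):

    import biolib

    # Bare UUID plus an explicit token passed separately
    job = biolib.get_job('abc123', job_token='xyz789')

    # Shared result URL; the token embedded in the URL is used and job_token would be ignored
    job = biolib.get_job('https://biolib.com/result/abc123/?token=xyz789')

    # Same pattern for results, including the inline token format
    result = biolib.get_result('biolib.com/result/abc123/token=xyz789')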
biolib/_data_record/data_record.py
CHANGED
@@ -3,52 +3,157 @@ from collections import namedtuple
 from datetime import datetime
 from pathlib import Path
 from struct import Struct
-from typing import Callable, Dict, Iterable, List, Optional, Union, cast
+from typing import Callable, Dict, Iterable, Iterator, List, Optional, Union, cast

 from biolib import api
-from biolib._internal import types
-from biolib._internal.data_record import get_data_record_state_from_uri
 from biolib._internal.data_record.data_record import validate_sqlite_v1
 from biolib._internal.data_record.push_data import (
+    _upload_from_iterator,
     push_data_path,
     validate_data_path_and_get_files_and_size_of_directory,
 )
 from biolib._internal.data_record.remote_storage_endpoint import DataRecordRemoteStorageEndpoint
 from biolib._internal.http_client import HttpClient
-from biolib.
+from biolib._shared import types
+from biolib._shared.types import ResourceDetailedDict, ResourceVersionDetailedDict, ZipFileNodeDict
+from biolib._shared.utils import parse_resource_uri
 from biolib.api import client as api_client
 from biolib.biolib_api_client import BiolibApiClient
-from biolib.biolib_api_client.
+from biolib.biolib_api_client.biolib_app_api import _get_resource_uri_from_str
+from biolib.biolib_api_client.lfs_types import DataRecordInfo
 from biolib.biolib_binary_format import LazyLoadedFile
 from biolib.biolib_binary_format.utils import RemoteIndexableBuffer
 from biolib.biolib_logging import logger
-from biolib.utils.app_uri import parse_app_uri

 PathFilter = Union[str, List[str], Callable[[str], bool]]


 class DataRecord:
-    def __init__(self, _internal_state:
+    def __init__(self, _internal_state: ResourceDetailedDict):
         self._state = _internal_state

     def __repr__(self):
-        return f'DataRecord: {self._state["
+        return f'DataRecord: {self._state["uri"]}'

     @property
     def uri(self) -> str:
-        return self._state['
+        return self._state['uri']

     @property
     def uuid(self) -> str:
-        return self._state['
+        return self._state['uuid']

     @property
     def name(self) -> str:
-        uri_parsed =
-        if not uri_parsed['
+        uri_parsed = parse_resource_uri(self._state['uri'], use_account_as_name_default=False)
+        if not uri_parsed['resource_name']:
             raise ValueError('Expected parameter "resource_uri" to contain resource name')

-        return uri_parsed['
+        return uri_parsed['resource_name']
+
+    @staticmethod
+    def get_by_uri(uri: str) -> 'DataRecord':
+        normalized_uri = _get_resource_uri_from_str(uri)
+        resource_dict: ResourceDetailedDict = api_client.get(path='/resource/', params={'uri': normalized_uri}).json()
+        if resource_dict['type'] != 'data-record':
+            raise Exception(f'Resource "{resource_dict["uri"]}" is not a Data Record')
+
+        return DataRecord(_internal_state=resource_dict)
+
+    @staticmethod
+    def create(destination: str, data_path: Optional[str] = None, record_type: Optional[str] = None) -> 'DataRecord':
+        BiolibApiClient.assert_is_signed_in(authenticated_action_description='create a Data Record')
+        if data_path is not None:
+            assert os.path.isdir(data_path), f'The path "{data_path}" is not a directory.'
+        uri_parsed = parse_resource_uri(destination, use_account_as_name_default=False)
+        if uri_parsed['resource_name_normalized']:
+            data_record_uri = destination
+        else:
+            record_name = 'data-record-' + datetime.now().isoformat().split('.')[0].replace(':', '-')
+            data_record_uri = f'{destination}/{record_name}'
+
+        response = api.client.post(
+            path='/resources/data-records/',
+            data={
+                'uri': data_record_uri,
+                'type': record_type,
+            },
+        )
+        data_record_info: DataRecordInfo = response.json()
+        logger.info(f"Successfully created new Data Record '{data_record_info['uri']}'")
+
+        data_record = DataRecord.get_by_uri(uri=data_record_info['uri'])
+        if data_path is not None:
+            data_record.update(data_path=data_path)
+
+        return data_record
+
+    @staticmethod
+    def fetch(uri: Optional[str] = None, count: Optional[int] = None) -> List['DataRecord']:
+        # TODO: Simplify when backend exposes /api/resources/ instead of /api/apps/
+        max_page_size = 1_000
+        params: Dict[str, Union[str, int]] = {
+            'page_size': str(count or max_page_size),
+            'resource_type': 'data-record',
+        }
+        if uri:
+            uri_parsed = parse_resource_uri(uri, use_account_as_name_default=False)
+            params['account_handle'] = uri_parsed['account_handle_normalized']
+            if uri_parsed['resource_name_normalized']:
+                params['app_name'] = uri_parsed['resource_name_normalized']
+
+        results = api_client.get(path='/apps/', params=params).json()['results']
+        if count is None and len(results) == max_page_size:
+            logger.warning(
+                f'Fetch results exceeded maximum count of {max_page_size}. Some data records might not be fetched.'
+            )
+
+        return [
+            DataRecord(
+                _internal_state=ResourceDetailedDict(
+                    uri=result['resource_uri'],
+                    uuid=result['public_id'],
+                    name=result['name'],
+                    created_at=result['created_at'],
+                    type=result['type'],
+                    description=result['description'],
+                    account_uuid=result['account_id'],
+                    experiment=None,
+                )
+            )
+            for result in results
+        ]
+
+    @staticmethod
+    def clone(
+        source: 'DataRecord',
+        destination: 'DataRecord',
+        on_progress: Optional[Callable[[int, int], None]] = None,
+    ) -> 'DataRecord':
+        BiolibApiClient.assert_is_signed_in(authenticated_action_description='clone a Data Record')
+
+        # pylint: disable=protected-access
+        total_size_in_bytes = source._get_zip_size_bytes()
+
+        if total_size_in_bytes == 0:
+            raise ValueError('Source data record has no data to clone')
+
+        min_chunk_size_bytes = 10_000_000
+        chunk_size_in_bytes = max(min_chunk_size_bytes, int(total_size_in_bytes / 9_000))
+
+        zip_iterator = source._iter_zip_bytes(chunk_size_bytes=chunk_size_in_bytes)
+
+        new_resource_version_uuid = _upload_from_iterator(
+            resource_uuid=destination._state['uuid'],
+            payload_iterator=zip_iterator,
+            payload_size_in_bytes=total_size_in_bytes,
+            publish=True,
+            on_progress=on_progress,
+        )
+        # pylint: enable=protected-access
+
+        logger.info(f"Successfully cloned data to '{destination.uri}'")
+        return DataRecord._get_by_version_uuid(new_resource_version_uuid)

     def list_files(
         self,
@@ -71,9 +176,7 @@ class DataRecord:
         return files

     def download_zip(self, output_path: str):
-        remote_storage_endpoint = DataRecordRemoteStorageEndpoint(
-            resource_version_uuid=self._state['resource_version_uuid'],
-        )
+        remote_storage_endpoint = DataRecordRemoteStorageEndpoint(uri=self.uri)
         HttpClient.request(url=remote_storage_endpoint.get_remote_url(), response_path=output_path)

     def download_files(self, output_dir: str, path_filter: Optional[PathFilter] = None) -> None:
@@ -113,126 +216,37 @@ class DataRecord:
         else:
             raise Exception(f"Error processing data record validation: unknown rule type {rule['type']}")

-
-        data_record_version: DataRecordVersion = response.json()
-        resource_version_uuid = data_record_version['uuid']
-
-        push_data_path(
+        new_resource_version_uuid = push_data_path(
             data_path=data_path,
             data_size_in_bytes=data_size_in_bytes,
             files_to_zip=files_to_zip,
-
+            resource_uuid=self._state['uuid'],
             chunk_size_in_mb=chunk_size_in_mb,
+            publish=True,
         )

-
-
-
-        )
+        updated_record = DataRecord._get_by_version_uuid(new_resource_version_uuid)
+        self._state = updated_record._state  # pylint: disable=protected-access
+        logger.info(f"Successfully pushed a new Data Record version '{self.uri}'")

-
-
+    def delete(self) -> None:
+        """Delete the data record.

-
-
-
+        Example::
+            >>> record = DataRecord.get_by_uri("account/data-record")
+            >>> record.delete()
+        """
+        try:
+            api_client.delete(path=f'/apps/{self.uuid}/')
+            logger.info(f'Data record {self.uri} deleted')
+        except Exception as error:
+            raise Exception(f'Failed to delete data record {self.uri} due to: {error}') from error

     @staticmethod
-    def
-
-
-
-        uri_parsed = parse_app_uri(destination, use_account_as_name_default=False)
-        if uri_parsed['app_name_normalized']:
-            data_record_uri = destination
-        else:
-            record_name = 'data-record-' + datetime.now().isoformat().split('.')[0].replace(':', '-')
-            data_record_uri = f'{destination}/{record_name}'
-
-        response = api.client.post(
-            path='/resources/data-records/',
-            data={
-                'uri': data_record_uri,
-                'type': record_type,
-            },
-        )
-        data_record_info: DataRecordInfo = response.json()
-        logger.info(f"Successfully created new Data Record '{data_record_info['uri']}'")
-
-        data_record = DataRecord.get_by_uri(uri=data_record_info['uri'])
-        if data_path is not None:
-            data_record.update(data_path=data_path)
-
-        return data_record
-
-    @staticmethod
-    def fetch(uri: Optional[str] = None, count: Optional[int] = None) -> List['DataRecord']:
-        max_page_size = 1_000
-        params: Dict[str, Union[str, int]] = {
-            'page_size': str(count or max_page_size),
-            'resource_type': 'data-record',
-        }
-        if uri:
-            uri_parsed = parse_app_uri(uri, use_account_as_name_default=False)
-            params['account_handle'] = uri_parsed['account_handle_normalized']
-            if uri_parsed['app_name_normalized']:
-                params['app_name'] = uri_parsed['app_name_normalized']
-
-        results = api_client.get(path='/apps/', params=params).json()['results']
-        if count is None and len(results) == max_page_size:
-            logger.warning(
-                f'Fetch results exceeded maximum count of {max_page_size}. Some data records might not be fetched.'
-            )
-
-        return [
-            DataRecord(
-                _internal_state={
-                    'resource_uri': result['resource_uri'],
-                    'resource_uuid': result['public_id'],
-                    'resource_version_uuid': result['active_version'],
-                }
-            )
-            for result in results
-        ]
-
-    def _fetch_files(
-        self,
-        max_count: Optional[int],
-        path_filter: Optional[PathFilter] = None,
-    ) -> Iterable[LazyLoadedFile]:
-        if path_filter and not (isinstance(path_filter, (str, list)) or callable(path_filter)):
-            raise Exception('Expected path_filter to be a string, a list of strings or a function')
-
-        path_filters = (
-            [path_filter] if isinstance(path_filter, str) else path_filter if isinstance(path_filter, list) else []
-        )
-
-        resource_version_uuid = self._state['resource_version_uuid']
-        remote_storage_endpoint = DataRecordRemoteStorageEndpoint(resource_version_uuid)
-
-        page: Optional[int] = 1
-        yielded_files: int = 0
-        while page:
-            response = api.client.post(
-                path=f'/proxy/files/data-record-versions/{resource_version_uuid}/query/',
-                data=dict(page=page, page_size=1_000, path_filters=path_filters),
-            ).json()
-
-            for file_node_dict in cast(List[ZipFileNodeDict], response['results']):
-                if file_node_dict['is_dir']:
-                    continue
-
-                if callable(path_filter) and not path_filter(file_node_dict['dir_path'] + file_node_dict['name']):
-                    continue
-
-                yield self._get_file(remote_storage_endpoint, file_node_dict)
-                yielded_files += 1
-
-                if max_count is not None and yielded_files >= max_count:
-                    page = None
-                    break
-
-            page = page + 1 if page is not None and response['page_count'] > page else None
+    def _get_by_version_uuid(version_uuid: str) -> 'DataRecord':
+        response = api.client.get(path=f'/lfs/versions/{version_uuid}/')
+        version_info = response.json()
+        return DataRecord.get_by_uri(version_info['uri'])

     @staticmethod
     def _get_file(
@@ -282,5 +296,85 @@ class DataRecord:
             start_func=file_start_func,
         )

+    def _get_version(self) -> ResourceVersionDetailedDict:
+        if 'version' not in self._state:
+            # Version might be missing in state if initialized from the fetch method (list of data records)
+            self._state = self.get_by_uri(self.uri)._state
+
+        version = self._state.get('version')
+        if version is None:
+            raise Exception(f'Data Record "{self._state["uri"]}" has no active version')
+
+        return version
+
+    def _fetch_files(
+        self,
+        max_count: Optional[int],
+        path_filter: Optional[PathFilter] = None,
+    ) -> Iterable[LazyLoadedFile]:
+        if path_filter and not (isinstance(path_filter, (str, list)) or callable(path_filter)):
+            raise Exception('Expected path_filter to be a string, a list of strings or a function')
+
+        path_filters = (
+            [path_filter] if isinstance(path_filter, str) else path_filter if isinstance(path_filter, list) else []
+        )
+
+        version = self._get_version()
+        resource_version_uuid = version['uuid']
+        remote_storage_endpoint = DataRecordRemoteStorageEndpoint(uri=self.uri)
+
+        page: Optional[int] = 1
+        yielded_files: int = 0
+        while page:
+            response = api.client.post(
+                path=f'/proxy/files/data-record-versions/{resource_version_uuid}/query/',
+                data=dict(page=page, page_size=1_000, path_filters=path_filters),
+            ).json()
+
+            for file_node_dict in cast(List[ZipFileNodeDict], response['results']):
+                if file_node_dict['is_dir']:
+                    continue
+
+                if callable(path_filter) and not path_filter(file_node_dict['dir_path'] + file_node_dict['name']):
+                    continue
+
+                yield self._get_file(remote_storage_endpoint, file_node_dict)
+                yielded_files += 1
+
+                if max_count is not None and yielded_files >= max_count:
+                    page = None
+                    break
+
+            page = page + 1 if page is not None and response['page_count'] > page else None
+
     def _get_detailed_dict(self) -> types.DataRecordDetailedDict:
         return cast(types.DataRecordDetailedDict, api_client.get(f'/resources/data-records/{self.uuid}/').json())
+
+    def _get_zip_size_bytes(self) -> int:
+        remote_storage_endpoint = DataRecordRemoteStorageEndpoint(uri=self.uri)
+        presigned_url = remote_storage_endpoint.get_remote_url()
+        response = HttpClient.request(url=presigned_url, headers={'range': 'bytes=0-0'})
+        content_range = response.headers.get('Content-Range', '')
+        if not content_range or '/' not in content_range:
+            raise ValueError('Unable to determine zip size: Content-Range header missing or invalid')
+        total_size = int(content_range.split('/')[1])
+        return total_size
+
+    def _iter_zip_bytes(self, chunk_size_bytes: int) -> Iterator[bytes]:
+        remote_storage_endpoint = DataRecordRemoteStorageEndpoint(uri=self.uri)
+        presigned_url = remote_storage_endpoint.get_remote_url()
+        response = HttpClient.request(url=presigned_url, headers={'range': 'bytes=0-0'})
+        content_range = response.headers.get('Content-Range', '')
+        if not content_range or '/' not in content_range:
+            raise ValueError('Unable to determine zip size: Content-Range header missing or invalid')
+        total_size = int(content_range.split('/')[1])
+
+        for start in range(0, total_size, chunk_size_bytes):
+            end = min(start + chunk_size_bytes - 1, total_size - 1)
+            presigned_url = remote_storage_endpoint.get_remote_url()
+            response = HttpClient.request(
+                url=presigned_url,
+                headers={'range': f'bytes={start}-{end}'},
+                timeout_in_seconds=300,
+            )
+            yield response.content
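Taken together, these hunks move DataRecord onto the shared resource types and add get_by_uri, create, fetch, clone and delete as public entry points; clone streams the source zip in ranged chunks sized at max(10 MB, total_size / 9000) and re-uploads it via _upload_from_iterator with publish=True. A short usage sketch grounded in the signatures above (all URIs are placeholders; the two-int on_progress callback is assumed to report bytes done and total bytes):

    from biolib._data_record.data_record import DataRecord

    # Placeholders: replace 'account/source-record' and 'account/clone-target' with real resource URIs.
    source = DataRecord.get_by_uri('account/source-record')
    destination = DataRecord.create(destination='account/clone-target')

    # Streams the source zip in ranged chunks and publishes a new version on the destination resource.
    cloned = DataRecord.clone(
        source=source,
        destination=destination,
        on_progress=lambda done, total: print(f'{done}/{total} bytes'),  # argument order assumed
    )

    cloned.delete()  # removes the record created for this example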
biolib/_index/index.py
ADDED
@@ -0,0 +1,55 @@
+import json
+from typing import Any, Dict
+
+from biolib import api
+from biolib._shared.types import ResourceDetailedDict
+from biolib.biolib_api_client import BiolibApiClient
+from biolib.biolib_api_client.biolib_app_api import _get_resource_uri_from_str
+from biolib.biolib_logging import logger
+
+
+class Index:
+    def __init__(self, _internal_state: ResourceDetailedDict):
+        self._state = _internal_state
+
+    def __repr__(self) -> str:
+        return f'Index: {self._state["uri"]}'
+
+    @property
+    def uri(self) -> str:
+        return self._state['uri']
+
+    @property
+    def id(self) -> str:
+        return f'{self._state["account_uuid"]}.{self._state["uuid"]}'.replace('-', '_')
+
+    @staticmethod
+    def get_by_uri(uri: str) -> 'Index':
+        normalized_uri = _get_resource_uri_from_str(uri)
+        response: ResourceDetailedDict = api.client.get(path='/resource/', params={'uri': normalized_uri}).json()
+        if response['type'] != 'index':
+            raise Exception(f'Resource "{response["uri"]}" is not an Index')
+        return Index(_internal_state=response)
+
+    @staticmethod
+    def create(uri: str, config: Dict[str, Any]) -> str:
+        BiolibApiClient.assert_is_signed_in(authenticated_action_description='create an Index')
+
+        response = api.client.post(
+            path='/resources/indexes/',
+            data={
+                'uri': uri,
+                'index_config': config,
+            },
+        )
+        result = response.json()
+        created_uri: str = result['uri']
+        logger.info(f"Successfully created Index '{created_uri}'")
+        return created_uri
+
+    @staticmethod
+    def create_from_config_file(uri: str, config_path: str) -> str:
+        with open(config_path) as config_file:
+            index_config = json.load(config_file)
+
+        return Index.create(uri=uri, config=index_config)
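Index is a thin wrapper around the new /resources/indexes/ endpoint: create posts a URI plus an index_config dictionary, and get_by_uri resolves a resource URI and checks that its type is 'index'. A usage sketch with placeholder names (the expected index_config schema is not shown in this diff):

    from biolib._index.index import Index

    # 'myaccount/my-index' and 'index_config.json' are placeholders.
    created_uri = Index.create_from_config_file(uri='myaccount/my-index', config_path='index_config.json')

    index = Index.get_by_uri(created_uri)
    print(index.id)  # '<account_uuid>.<uuid>' with dashes replaced by underscores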
biolib/_index/query_result.py
ADDED
@@ -0,0 +1,103 @@
+import json
+from typing import Any, Dict, Iterator, List, Optional, Union
+
+from biolib import api
+from biolib._internal.http_client import HttpResponse
+from biolib._internal.utils import base64_encode_string
+from biolib._internal.utils.auth import decode_jwt_without_checking_signature
+from biolib._runtime.runtime import Runtime
+from biolib.biolib_api_client import BiolibApiClient
+from biolib.biolib_errors import BioLibError
+
+
+def _get_index_basic_auth_header() -> Optional[str]:
+    if Runtime.check_is_environment_biolib_app():
+        return None
+
+    deprecated_api_client = BiolibApiClient.get()
+    deprecated_api_client.refresh_access_token()
+    access_token = deprecated_api_client.access_token
+    if not access_token:
+        return None
+
+    decoded_token = decode_jwt_without_checking_signature(access_token)
+    user_uuid: Optional[str] = decoded_token['payload'].get('public_id')
+    if not user_uuid:
+        return None
+
+    normalized_user_uuid = user_uuid.replace('-', '_')
+    credentials = f'biolib_user|{normalized_user_uuid}:{access_token}'
+    return f'Basic {base64_encode_string(credentials)}'
+
+
+class IndexQueryResult:
+    """Result wrapper for index query responses."""
+
+    def __init__(self, response: HttpResponse, data_format: str):
+        self._response = response
+        self._data_format = data_format
+        self._json_data: Optional[Dict[str, Any]] = None
+        if data_format == 'json':
+            content = self._response.content
+            if content:
+                self._json_data = json.loads(content.decode('utf-8'))
+
+    def iter_rows(self) -> Iterator[Dict[str, Any]]:
+        """Return an iterator over the rows in the query result.
+
+        Returns:
+            Iterator[Dict[str, Any]]: An iterator yielding each row as a dictionary.
+        """
+        if self._json_data is None:
+            raise BioLibError('iter_rows() is only available when data_format is "json"')
+        return iter(self._json_data['data'])
+
+
+def query_index(
+    query: str,
+    data: Optional[Union[List[Dict[str, Any]], bytes]] = None,
+    data_format: str = 'json',
+) -> IndexQueryResult:
+    """Query the BioLib index with a SQL-like query.
+
+    Args:
+        query: The SQL query string to execute.
+        data: Optional input data. If data_format is "json", this should be a list of
+            dictionaries that will be JSON encoded. Otherwise, pass raw bytes.
+        data_format: The format for the query. Defaults to "json".
+
+    Returns:
+        IndexQueryResult: A result object wrapping the query response.
+
+    Raises:
+        BioLibError: If the query fails or returns a non-successful HTTP status code.
+    """
+    data_format = data_format.lower()
+
+    params: Dict[str, Union[str, int]] = {'default_format': data_format.upper()}
+    if data is not None:
+        params['query'] = query
+
+    if data is not None:
+        if data_format == 'json':
+            body: bytes = '\n'.join(json.dumps(item, ensure_ascii=False) for item in data).encode('utf-8')
+        else:
+            body = data  # type: ignore[assignment]
+    else:
+        body = query.encode('utf-8')
+
+    response = api.client.post(
+        path='proxy/index',
+        data=body,
+        params=params,
+        headers={
+            'Content-Type': 'text/plain; charset=utf-8',
+            'Authorization': _get_index_basic_auth_header(),
+        },
+        authenticate=False,
+    )
+
+    if response.status_code < 200 or response.status_code >= 300:
+        raise BioLibError(f'Index query failed with status code {response.status_code}: {response.text}')
+
+    return IndexQueryResult(response, data_format)
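query_index sends the request to the proxy/index endpoint with a basic-auth header derived from the current access token (skipped when running inside a BioLib app): without input data the SQL string itself is the request body, while with data the query moves to a URL parameter and JSON rows are encoded as newline-delimited JSON. A usage sketch with placeholder SQL (whether a given statement is valid for a particular index depends on the backend, which this diff does not show):

    from biolib._index.query_result import query_index

    # Plain query: the SQL string is sent as the request body.
    result = query_index(query='SELECT 1')
    for row in result.iter_rows():
        print(row)

    # Query with input rows: rows are sent as newline-delimited JSON and the query is passed as a URL parameter.
    result = query_index(query='SELECT count(*) FROM input', data=[{'value': 1}, {'value': 2}])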