pybiolib 1.2.883__py3-none-any.whl → 1.2.1890__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124)
  1. biolib/__init__.py +33 -10
  2. biolib/_data_record/data_record.py +220 -126
  3. biolib/_index/index.py +55 -0
  4. biolib/_index/query_result.py +103 -0
  5. biolib/_internal/add_copilot_prompts.py +24 -11
  6. biolib/_internal/add_gui_files.py +81 -0
  7. biolib/_internal/data_record/__init__.py +1 -1
  8. biolib/_internal/data_record/data_record.py +1 -18
  9. biolib/_internal/data_record/push_data.py +65 -16
  10. biolib/_internal/data_record/remote_storage_endpoint.py +18 -13
  11. biolib/_internal/file_utils.py +48 -0
  12. biolib/_internal/lfs/cache.py +4 -2
  13. biolib/_internal/push_application.py +95 -24
  14. biolib/_internal/runtime.py +2 -0
  15. biolib/_internal/string_utils.py +13 -0
  16. biolib/_internal/{llm_instructions → templates/copilot_template}/.github/instructions/style-general.instructions.md +5 -0
  17. biolib/_internal/templates/copilot_template/.github/instructions/style-react-ts.instructions.md +47 -0
  18. biolib/_internal/templates/copilot_template/.github/prompts/biolib_onboard_repo.prompt.md +19 -0
  19. biolib/_internal/templates/dashboard_template/.biolib/config.yml +5 -0
  20. biolib/_internal/templates/{init_template → github_workflow_template}/.github/workflows/biolib.yml +7 -2
  21. biolib/_internal/templates/gitignore_template/.gitignore +10 -0
  22. biolib/_internal/templates/gui_template/.yarnrc.yml +1 -0
  23. biolib/_internal/templates/gui_template/App.tsx +53 -0
  24. biolib/_internal/templates/gui_template/Dockerfile +27 -0
  25. biolib/_internal/templates/gui_template/biolib-sdk.ts +82 -0
  26. biolib/_internal/templates/gui_template/dev-data/output.json +7 -0
  27. biolib/_internal/templates/gui_template/index.css +5 -0
  28. biolib/_internal/templates/gui_template/index.html +13 -0
  29. biolib/_internal/templates/gui_template/index.tsx +10 -0
  30. biolib/_internal/templates/gui_template/package.json +27 -0
  31. biolib/_internal/templates/gui_template/tsconfig.json +24 -0
  32. biolib/_internal/templates/gui_template/vite-plugin-dev-data.ts +50 -0
  33. biolib/_internal/templates/gui_template/vite.config.mts +10 -0
  34. biolib/_internal/templates/init_template/.biolib/config.yml +1 -0
  35. biolib/_internal/templates/init_template/Dockerfile +5 -1
  36. biolib/_internal/templates/init_template/run.py +6 -15
  37. biolib/_internal/templates/init_template/run.sh +1 -0
  38. biolib/_internal/templates/templates.py +21 -1
  39. biolib/_internal/utils/__init__.py +47 -0
  40. biolib/_internal/utils/auth.py +46 -0
  41. biolib/_internal/utils/job_url.py +33 -0
  42. biolib/_internal/utils/multinode.py +12 -14
  43. biolib/_runtime/runtime.py +15 -2
  44. biolib/_session/session.py +7 -5
  45. biolib/_shared/__init__.py +0 -0
  46. biolib/_shared/types/__init__.py +74 -0
  47. biolib/_shared/types/account.py +12 -0
  48. biolib/_shared/types/account_member.py +8 -0
  49. biolib/{_internal → _shared}/types/experiment.py +1 -0
  50. biolib/_shared/types/resource.py +37 -0
  51. biolib/_shared/types/resource_deploy_key.py +11 -0
  52. biolib/{_internal → _shared}/types/resource_version.py +8 -2
  53. biolib/_shared/types/user.py +19 -0
  54. biolib/_shared/utils/__init__.py +7 -0
  55. biolib/_shared/utils/resource_uri.py +75 -0
  56. biolib/api/client.py +5 -48
  57. biolib/app/app.py +97 -55
  58. biolib/biolib_api_client/api_client.py +3 -47
  59. biolib/biolib_api_client/app_types.py +1 -1
  60. biolib/biolib_api_client/biolib_app_api.py +31 -6
  61. biolib/biolib_api_client/biolib_job_api.py +1 -1
  62. biolib/biolib_api_client/user_state.py +34 -2
  63. biolib/biolib_binary_format/module_input.py +8 -0
  64. biolib/biolib_binary_format/remote_endpoints.py +3 -3
  65. biolib/biolib_binary_format/remote_stream_seeker.py +39 -25
  66. biolib/biolib_logging.py +1 -1
  67. biolib/cli/__init__.py +2 -2
  68. biolib/cli/auth.py +4 -16
  69. biolib/cli/data_record.py +82 -0
  70. biolib/cli/index.py +32 -0
  71. biolib/cli/init.py +393 -71
  72. biolib/cli/lfs.py +1 -1
  73. biolib/cli/run.py +9 -6
  74. biolib/cli/start.py +14 -1
  75. biolib/compute_node/job_worker/executors/docker_executor.py +31 -9
  76. biolib/compute_node/job_worker/executors/docker_types.py +1 -1
  77. biolib/compute_node/job_worker/executors/types.py +6 -5
  78. biolib/compute_node/job_worker/job_storage.py +2 -1
  79. biolib/compute_node/job_worker/job_worker.py +155 -90
  80. biolib/compute_node/job_worker/large_file_system.py +2 -6
  81. biolib/compute_node/job_worker/network_alloc.py +99 -0
  82. biolib/compute_node/job_worker/network_buffer.py +240 -0
  83. biolib/compute_node/job_worker/utilization_reporter_thread.py +2 -2
  84. biolib/compute_node/remote_host_proxy.py +163 -79
  85. biolib/compute_node/utils.py +2 -0
  86. biolib/compute_node/webserver/compute_node_results_proxy.py +189 -0
  87. biolib/compute_node/webserver/proxy_utils.py +28 -0
  88. biolib/compute_node/webserver/webserver.py +64 -19
  89. biolib/experiments/experiment.py +111 -16
  90. biolib/jobs/job.py +128 -31
  91. biolib/jobs/job_result.py +74 -34
  92. biolib/jobs/types.py +1 -0
  93. biolib/sdk/__init__.py +28 -3
  94. biolib/typing_utils.py +1 -1
  95. biolib/utils/cache_state.py +8 -5
  96. biolib/utils/multipart_uploader.py +24 -18
  97. biolib/utils/seq_util.py +1 -1
  98. pybiolib-1.2.1890.dist-info/METADATA +41 -0
  99. pybiolib-1.2.1890.dist-info/RECORD +177 -0
  100. {pybiolib-1.2.883.dist-info → pybiolib-1.2.1890.dist-info}/WHEEL +1 -1
  101. pybiolib-1.2.1890.dist-info/entry_points.txt +2 -0
  102. biolib/_internal/llm_instructions/.github/instructions/style-react-ts.instructions.md +0 -22
  103. biolib/_internal/templates/init_template/.gitignore +0 -2
  104. biolib/_internal/types/__init__.py +0 -6
  105. biolib/_internal/types/resource.py +0 -18
  106. biolib/biolib_download_container.py +0 -38
  107. biolib/cli/download_container.py +0 -14
  108. biolib/utils/app_uri.py +0 -57
  109. pybiolib-1.2.883.dist-info/METADATA +0 -50
  110. pybiolib-1.2.883.dist-info/RECORD +0 -148
  111. pybiolib-1.2.883.dist-info/entry_points.txt +0 -3
  112. /biolib/{_internal/llm_instructions → _index}/__init__.py +0 -0
  113. /biolib/_internal/{llm_instructions → templates/copilot_template}/.github/instructions/general-app-knowledge.instructions.md +0 -0
  114. /biolib/_internal/{llm_instructions → templates/copilot_template}/.github/instructions/style-python.instructions.md +0 -0
  115. /biolib/_internal/{llm_instructions → templates/copilot_template}/.github/prompts/biolib_app_inputs.prompt.md +0 -0
  116. /biolib/_internal/{llm_instructions → templates/copilot_template}/.github/prompts/biolib_run_apps.prompt.md +0 -0
  117. /biolib/{_internal → _shared}/types/app.py +0 -0
  118. /biolib/{_internal → _shared}/types/data_record.py +0 -0
  119. /biolib/{_internal → _shared}/types/file_node.py +0 -0
  120. /biolib/{_internal → _shared}/types/push.py +0 -0
  121. /biolib/{_internal → _shared}/types/resource_permission.py +0 -0
  122. /biolib/{_internal → _shared}/types/result.py +0 -0
  123. /biolib/{_internal → _shared}/types/typing.py +0 -0
  124. {pybiolib-1.2.883.dist-info → pybiolib-1.2.1890.dist-info/licenses}/LICENSE +0 -0
biolib/__init__.py CHANGED
@@ -1,3 +1,4 @@
1
+ # ruff: noqa: I001
1
2
  # Imports to hide
2
3
  import os
3
4
  from urllib.parse import urlparse as _urlparse
@@ -15,6 +16,7 @@ from biolib.jobs.job import Result as _Result
15
16
  from biolib import user as _user
16
17
  from biolib.typing_utils import List, Optional, cast as _cast
17
18
  from biolib._data_record.data_record import DataRecord as _DataRecord
19
+ from biolib._internal.utils.job_url import parse_result_id_or_url as _parse_result_id_or_url
18
20
 
19
21
  import biolib.api
20
22
  import biolib.app
@@ -22,7 +24,6 @@ import biolib.cli
22
24
  import biolib.sdk
23
25
  import biolib.utils
24
26
 
25
-
26
27
  # ------------------------------------ Function definitions for public Python API ------------------------------------
27
28
 
28
29
 
@@ -83,43 +84,65 @@ def search(
83
84
 
84
85
 
85
86
  def get_job(job_id: str, job_token: Optional[str] = None) -> _Result:
86
- r"""Get a job by its ID.
87
+ r"""Get a job by its ID or full URL.
87
88
 
88
89
  Args:
89
- job_id (str): The UUID of the job to retrieve
90
+ job_id (str): The UUID of the job to retrieve, or a full URL to the job.
91
+ Can be either:
92
+ - Job UUID (e.g., 'abc123')
93
+ - Full URL (e.g., 'https://biolib.com/result/abc123/?token=xyz789')
94
+ - Full URL with token parameter (e.g., 'biolib.com/result/abc123/token=xyz789')
90
95
  job_token (str, optional): Authentication token for accessing the job.
91
96
  Only needed for jobs that aren't owned by the current user.
97
+ If the URL contains a token, this parameter is ignored.
92
98
 
93
99
  Returns:
94
100
  Job: The job object
95
101
 
96
102
  Example::
97
103
 
104
+ >>> # Get by UUID
98
105
  >>> job = biolib.get_job('abc123')
99
- >>> # Access shared job
106
+ >>> # Get with explicit token
100
107
  >>> job = biolib.get_job('abc123', job_token='xyz789')
108
+ >>> # Get by full URL with token
109
+ >>> job = biolib.get_job('https://biolib.com/result/abc123/?token=xyz789')
110
+ >>> # Get by URL with inline token format
111
+ >>> job = biolib.get_job('biolib.com/result/abc123/token=xyz789')
101
112
  """
102
- return _Result.create_from_uuid(uuid=job_id, auth_token=job_token)
113
+ uuid, token = _parse_result_id_or_url(job_id, job_token)
114
+ return _Result.create_from_uuid(uuid=uuid, auth_token=token)
103
115
 
104
116
 
105
117
  def get_result(result_id: str, result_token: Optional[str] = None) -> _Result:
106
- r"""Get a result by its ID.
118
+ r"""Get a result by its ID or full URL.
107
119
 
108
120
  Args:
109
- result_id (str): The UUID of the result to retrieve
121
+ result_id (str): The UUID of the result to retrieve, or a full URL to the result.
122
+ Can be either:
123
+ - Result UUID (e.g., 'abc123')
124
+ - Full URL (e.g., 'https://biolib.com/result/abc123/?token=xyz789')
125
+ - Full URL with token parameter (e.g., 'biolib.com/result/abc123/token=xyz789')
110
126
  result_token (str, optional): Authentication token for accessing the result.
111
- Only needed for result that aren't owned by the current user.
127
+ Only needed for results that aren't owned by the current user.
128
+ If the URL contains a token, this parameter is ignored.
112
129
 
113
130
  Returns:
114
131
  Result: The result object
115
132
 
116
133
  Example::
117
134
 
135
+ >>> # Get by UUID
118
136
  >>> result = biolib.get_result('abc123')
119
- >>> # Access shared result
137
+ >>> # Get with explicit token
120
138
  >>> result = biolib.get_result('abc123', result_token='xyz789')
139
+ >>> # Get by full URL with token
140
+ >>> result = biolib.get_result('https://biolib.com/result/abc123/?token=xyz789')
141
+ >>> # Get by URL with inline token format
142
+ >>> result = biolib.get_result('biolib.com/result/abc123/token=xyz789')
121
143
  """
122
- return _Result.create_from_uuid(uuid=result_id, auth_token=result_token)
144
+ uuid, token = _parse_result_id_or_url(result_id, result_token)
145
+ return _Result.create_from_uuid(uuid=uuid, auth_token=token)
123
146
 
124
147
 
125
148
  def get_data_record(uri: str) -> _DataRecord:
biolib/_data_record/data_record.py CHANGED
@@ -3,52 +3,157 @@ from collections import namedtuple
3
3
  from datetime import datetime
4
4
  from pathlib import Path
5
5
  from struct import Struct
6
- from typing import Callable, Dict, Iterable, List, Optional, Union, cast
6
+ from typing import Callable, Dict, Iterable, Iterator, List, Optional, Union, cast
7
7
 
8
8
  from biolib import api
9
- from biolib._internal import types
10
- from biolib._internal.data_record import get_data_record_state_from_uri
11
9
  from biolib._internal.data_record.data_record import validate_sqlite_v1
12
10
  from biolib._internal.data_record.push_data import (
11
+ _upload_from_iterator,
13
12
  push_data_path,
14
13
  validate_data_path_and_get_files_and_size_of_directory,
15
14
  )
16
15
  from biolib._internal.data_record.remote_storage_endpoint import DataRecordRemoteStorageEndpoint
17
16
  from biolib._internal.http_client import HttpClient
18
- from biolib._internal.types.file_node import ZipFileNodeDict
17
+ from biolib._shared import types
18
+ from biolib._shared.types import ResourceDetailedDict, ResourceVersionDetailedDict, ZipFileNodeDict
19
+ from biolib._shared.utils import parse_resource_uri
19
20
  from biolib.api import client as api_client
20
21
  from biolib.biolib_api_client import BiolibApiClient
21
- from biolib.biolib_api_client.lfs_types import DataRecordInfo, DataRecordVersion, DataRecordVersionInfo
22
+ from biolib.biolib_api_client.biolib_app_api import _get_resource_uri_from_str
23
+ from biolib.biolib_api_client.lfs_types import DataRecordInfo
22
24
  from biolib.biolib_binary_format import LazyLoadedFile
23
25
  from biolib.biolib_binary_format.utils import RemoteIndexableBuffer
24
26
  from biolib.biolib_logging import logger
25
- from biolib.utils.app_uri import parse_app_uri
26
27
 
27
28
  PathFilter = Union[str, List[str], Callable[[str], bool]]
28
29
 
29
30
 
30
31
  class DataRecord:
31
- def __init__(self, _internal_state: DataRecordVersionInfo):
32
+ def __init__(self, _internal_state: ResourceDetailedDict):
32
33
  self._state = _internal_state
33
34
 
34
35
  def __repr__(self):
35
- return f'DataRecord: {self._state["resource_uri"]}'
36
+ return f'DataRecord: {self._state["uri"]}'
36
37
 
37
38
  @property
38
39
  def uri(self) -> str:
39
- return self._state['resource_uri']
40
+ return self._state['uri']
40
41
 
41
42
  @property
42
43
  def uuid(self) -> str:
43
- return self._state['resource_uuid']
44
+ return self._state['uuid']
44
45
 
45
46
  @property
46
47
  def name(self) -> str:
47
- uri_parsed = parse_app_uri(self._state['resource_uri'], use_account_as_name_default=False)
48
- if not uri_parsed['app_name']:
48
+ uri_parsed = parse_resource_uri(self._state['uri'], use_account_as_name_default=False)
49
+ if not uri_parsed['resource_name']:
49
50
  raise ValueError('Expected parameter "resource_uri" to contain resource name')
50
51
 
51
- return uri_parsed['app_name']
52
+ return uri_parsed['resource_name']
53
+
54
+ @staticmethod
55
+ def get_by_uri(uri: str) -> 'DataRecord':
56
+ normalized_uri = _get_resource_uri_from_str(uri)
57
+ resource_dict: ResourceDetailedDict = api_client.get(path='/resource/', params={'uri': normalized_uri}).json()
58
+ if resource_dict['type'] != 'data-record':
59
+ raise Exception(f'Resource "{resource_dict["uri"]}" is not a Data Record')
60
+
61
+ return DataRecord(_internal_state=resource_dict)
62
+
63
+ @staticmethod
64
+ def create(destination: str, data_path: Optional[str] = None, record_type: Optional[str] = None) -> 'DataRecord':
65
+ BiolibApiClient.assert_is_signed_in(authenticated_action_description='create a Data Record')
66
+ if data_path is not None:
67
+ assert os.path.isdir(data_path), f'The path "{data_path}" is not a directory.'
68
+ uri_parsed = parse_resource_uri(destination, use_account_as_name_default=False)
69
+ if uri_parsed['resource_name_normalized']:
70
+ data_record_uri = destination
71
+ else:
72
+ record_name = 'data-record-' + datetime.now().isoformat().split('.')[0].replace(':', '-')
73
+ data_record_uri = f'{destination}/{record_name}'
74
+
75
+ response = api.client.post(
76
+ path='/resources/data-records/',
77
+ data={
78
+ 'uri': data_record_uri,
79
+ 'type': record_type,
80
+ },
81
+ )
82
+ data_record_info: DataRecordInfo = response.json()
83
+ logger.info(f"Successfully created new Data Record '{data_record_info['uri']}'")
84
+
85
+ data_record = DataRecord.get_by_uri(uri=data_record_info['uri'])
86
+ if data_path is not None:
87
+ data_record.update(data_path=data_path)
88
+
89
+ return data_record
90
+
91
+ @staticmethod
92
+ def fetch(uri: Optional[str] = None, count: Optional[int] = None) -> List['DataRecord']:
93
+ # TODO: Simplify when backend exposes /api/resources/ instead of /api/apps/
94
+ max_page_size = 1_000
95
+ params: Dict[str, Union[str, int]] = {
96
+ 'page_size': str(count or max_page_size),
97
+ 'resource_type': 'data-record',
98
+ }
99
+ if uri:
100
+ uri_parsed = parse_resource_uri(uri, use_account_as_name_default=False)
101
+ params['account_handle'] = uri_parsed['account_handle_normalized']
102
+ if uri_parsed['resource_name_normalized']:
103
+ params['app_name'] = uri_parsed['resource_name_normalized']
104
+
105
+ results = api_client.get(path='/apps/', params=params).json()['results']
106
+ if count is None and len(results) == max_page_size:
107
+ logger.warning(
108
+ f'Fetch results exceeded maximum count of {max_page_size}. Some data records might not be fetched.'
109
+ )
110
+
111
+ return [
112
+ DataRecord(
113
+ _internal_state=ResourceDetailedDict(
114
+ uri=result['resource_uri'],
115
+ uuid=result['public_id'],
116
+ name=result['name'],
117
+ created_at=result['created_at'],
118
+ type=result['type'],
119
+ description=result['description'],
120
+ account_uuid=result['account_id'],
121
+ experiment=None,
122
+ )
123
+ )
124
+ for result in results
125
+ ]
126
+
127
+ @staticmethod
128
+ def clone(
129
+ source: 'DataRecord',
130
+ destination: 'DataRecord',
131
+ on_progress: Optional[Callable[[int, int], None]] = None,
132
+ ) -> 'DataRecord':
133
+ BiolibApiClient.assert_is_signed_in(authenticated_action_description='clone a Data Record')
134
+
135
+ # pylint: disable=protected-access
136
+ total_size_in_bytes = source._get_zip_size_bytes()
137
+
138
+ if total_size_in_bytes == 0:
139
+ raise ValueError('Source data record has no data to clone')
140
+
141
+ min_chunk_size_bytes = 10_000_000
142
+ chunk_size_in_bytes = max(min_chunk_size_bytes, int(total_size_in_bytes / 9_000))
143
+
144
+ zip_iterator = source._iter_zip_bytes(chunk_size_bytes=chunk_size_in_bytes)
145
+
146
+ new_resource_version_uuid = _upload_from_iterator(
147
+ resource_uuid=destination._state['uuid'],
148
+ payload_iterator=zip_iterator,
149
+ payload_size_in_bytes=total_size_in_bytes,
150
+ publish=True,
151
+ on_progress=on_progress,
152
+ )
153
+ # pylint: enable=protected-access
154
+
155
+ logger.info(f"Successfully cloned data to '{destination.uri}'")
156
+ return DataRecord._get_by_version_uuid(new_resource_version_uuid)
52
157
 
53
158
  def list_files(
54
159
  self,
@@ -71,9 +176,7 @@ class DataRecord:
71
176
  return files
72
177
 
73
178
  def download_zip(self, output_path: str):
74
- remote_storage_endpoint = DataRecordRemoteStorageEndpoint(
75
- resource_version_uuid=self._state['resource_version_uuid'],
76
- )
179
+ remote_storage_endpoint = DataRecordRemoteStorageEndpoint(uri=self.uri)
77
180
  HttpClient.request(url=remote_storage_endpoint.get_remote_url(), response_path=output_path)
78
181
 
79
182
  def download_files(self, output_dir: str, path_filter: Optional[PathFilter] = None) -> None:
@@ -113,126 +216,37 @@ class DataRecord:
113
216
  else:
114
217
  raise Exception(f"Error processing data record validation: unknown rule type {rule['type']}")
115
218
 
116
- response = api.client.post(path='/lfs/versions/', data={'resource_uuid': self._state['resource_uuid']})
117
- data_record_version: DataRecordVersion = response.json()
118
- resource_version_uuid = data_record_version['uuid']
119
-
120
- push_data_path(
219
+ new_resource_version_uuid = push_data_path(
121
220
  data_path=data_path,
122
221
  data_size_in_bytes=data_size_in_bytes,
123
222
  files_to_zip=files_to_zip,
124
- resource_version_uuid=resource_version_uuid,
223
+ resource_uuid=self._state['uuid'],
125
224
  chunk_size_in_mb=chunk_size_in_mb,
225
+ publish=True,
126
226
  )
127
227
 
128
- api.client.patch(
129
- path=f'/resources/versions/{resource_version_uuid}/',
130
- data={'state': 'published', 'set_as_active': True},
131
- )
228
+ updated_record = DataRecord._get_by_version_uuid(new_resource_version_uuid)
229
+ self._state = updated_record._state # pylint: disable=protected-access
230
+ logger.info(f"Successfully pushed a new Data Record version '{self.uri}'")
132
231
 
133
- logger.info(f"Successfully pushed a new Data Record version '{data_record_version['uri']}'")
134
- self._state = get_data_record_state_from_uri(data_record_version['uri'])
232
+ def delete(self) -> None:
233
+ """Delete the data record.
135
234
 
136
- @staticmethod
137
- def get_by_uri(uri: str) -> 'DataRecord':
138
- return DataRecord(_internal_state=get_data_record_state_from_uri(uri))
235
+ Example::
236
+ >>> record = DataRecord.get_by_uri("account/data-record")
237
+ >>> record.delete()
238
+ """
239
+ try:
240
+ api_client.delete(path=f'/apps/{self.uuid}/')
241
+ logger.info(f'Data record {self.uri} deleted')
242
+ except Exception as error:
243
+ raise Exception(f'Failed to delete data record {self.uri} due to: {error}') from error
139
244
 
140
245
  @staticmethod
141
- def create(destination: str, data_path: Optional[str] = None, record_type: Optional[str] = None) -> 'DataRecord':
142
- BiolibApiClient.assert_is_signed_in(authenticated_action_description='create a Data Record')
143
- if data_path is not None:
144
- assert os.path.isdir(data_path), f'The path "{data_path}" is not a directory.'
145
- uri_parsed = parse_app_uri(destination, use_account_as_name_default=False)
146
- if uri_parsed['app_name_normalized']:
147
- data_record_uri = destination
148
- else:
149
- record_name = 'data-record-' + datetime.now().isoformat().split('.')[0].replace(':', '-')
150
- data_record_uri = f'{destination}/{record_name}'
151
-
152
- response = api.client.post(
153
- path='/resources/data-records/',
154
- data={
155
- 'uri': data_record_uri,
156
- 'type': record_type,
157
- },
158
- )
159
- data_record_info: DataRecordInfo = response.json()
160
- logger.info(f"Successfully created new Data Record '{data_record_info['uri']}'")
161
-
162
- data_record = DataRecord.get_by_uri(uri=data_record_info['uri'])
163
- if data_path is not None:
164
- data_record.update(data_path=data_path)
165
-
166
- return data_record
167
-
168
- @staticmethod
169
- def fetch(uri: Optional[str] = None, count: Optional[int] = None) -> List['DataRecord']:
170
- max_page_size = 1_000
171
- params: Dict[str, Union[str, int]] = {
172
- 'page_size': str(count or max_page_size),
173
- 'resource_type': 'data-record',
174
- }
175
- if uri:
176
- uri_parsed = parse_app_uri(uri, use_account_as_name_default=False)
177
- params['account_handle'] = uri_parsed['account_handle_normalized']
178
- if uri_parsed['app_name_normalized']:
179
- params['app_name'] = uri_parsed['app_name_normalized']
180
-
181
- results = api_client.get(path='/apps/', params=params).json()['results']
182
- if count is None and len(results) == max_page_size:
183
- logger.warning(
184
- f'Fetch results exceeded maximum count of {max_page_size}. Some data records might not be fetched.'
185
- )
186
-
187
- return [
188
- DataRecord(
189
- _internal_state={
190
- 'resource_uri': result['resource_uri'],
191
- 'resource_uuid': result['public_id'],
192
- 'resource_version_uuid': result['active_version'],
193
- }
194
- )
195
- for result in results
196
- ]
197
-
198
- def _fetch_files(
199
- self,
200
- max_count: Optional[int],
201
- path_filter: Optional[PathFilter] = None,
202
- ) -> Iterable[LazyLoadedFile]:
203
- if path_filter and not (isinstance(path_filter, (str, list)) or callable(path_filter)):
204
- raise Exception('Expected path_filter to be a string, a list of strings or a function')
205
-
206
- path_filters = (
207
- [path_filter] if isinstance(path_filter, str) else path_filter if isinstance(path_filter, list) else []
208
- )
209
-
210
- resource_version_uuid = self._state['resource_version_uuid']
211
- remote_storage_endpoint = DataRecordRemoteStorageEndpoint(resource_version_uuid)
212
-
213
- page: Optional[int] = 1
214
- yielded_files: int = 0
215
- while page:
216
- response = api.client.post(
217
- path=f'/proxy/files/data-record-versions/{resource_version_uuid}/query/',
218
- data=dict(page=page, page_size=1_000, path_filters=path_filters),
219
- ).json()
220
-
221
- for file_node_dict in cast(List[ZipFileNodeDict], response['results']):
222
- if file_node_dict['is_dir']:
223
- continue
224
-
225
- if callable(path_filter) and not path_filter(file_node_dict['dir_path'] + file_node_dict['name']):
226
- continue
227
-
228
- yield self._get_file(remote_storage_endpoint, file_node_dict)
229
- yielded_files += 1
230
-
231
- if max_count is not None and yielded_files >= max_count:
232
- page = None
233
- break
234
-
235
- page = page + 1 if page is not None and response['page_count'] > page else None
246
+ def _get_by_version_uuid(version_uuid: str) -> 'DataRecord':
247
+ response = api.client.get(path=f'/lfs/versions/{version_uuid}/')
248
+ version_info = response.json()
249
+ return DataRecord.get_by_uri(version_info['uri'])
236
250
 
237
251
  @staticmethod
238
252
  def _get_file(
@@ -282,5 +296,85 @@ class DataRecord:
282
296
  start_func=file_start_func,
283
297
  )
284
298
 
299
+ def _get_version(self) -> ResourceVersionDetailedDict:
300
+ if 'version' not in self._state:
301
+ # Version might be missing in state if initialized from the fetch method (list of data records)
302
+ self._state = self.get_by_uri(self.uri)._state
303
+
304
+ version = self._state.get('version')
305
+ if version is None:
306
+ raise Exception(f'Data Record "{self._state["uri"]}" has no active version')
307
+
308
+ return version
309
+
310
+ def _fetch_files(
311
+ self,
312
+ max_count: Optional[int],
313
+ path_filter: Optional[PathFilter] = None,
314
+ ) -> Iterable[LazyLoadedFile]:
315
+ if path_filter and not (isinstance(path_filter, (str, list)) or callable(path_filter)):
316
+ raise Exception('Expected path_filter to be a string, a list of strings or a function')
317
+
318
+ path_filters = (
319
+ [path_filter] if isinstance(path_filter, str) else path_filter if isinstance(path_filter, list) else []
320
+ )
321
+
322
+ version = self._get_version()
323
+ resource_version_uuid = version['uuid']
324
+ remote_storage_endpoint = DataRecordRemoteStorageEndpoint(uri=self.uri)
325
+
326
+ page: Optional[int] = 1
327
+ yielded_files: int = 0
328
+ while page:
329
+ response = api.client.post(
330
+ path=f'/proxy/files/data-record-versions/{resource_version_uuid}/query/',
331
+ data=dict(page=page, page_size=1_000, path_filters=path_filters),
332
+ ).json()
333
+
334
+ for file_node_dict in cast(List[ZipFileNodeDict], response['results']):
335
+ if file_node_dict['is_dir']:
336
+ continue
337
+
338
+ if callable(path_filter) and not path_filter(file_node_dict['dir_path'] + file_node_dict['name']):
339
+ continue
340
+
341
+ yield self._get_file(remote_storage_endpoint, file_node_dict)
342
+ yielded_files += 1
343
+
344
+ if max_count is not None and yielded_files >= max_count:
345
+ page = None
346
+ break
347
+
348
+ page = page + 1 if page is not None and response['page_count'] > page else None
349
+
285
350
  def _get_detailed_dict(self) -> types.DataRecordDetailedDict:
286
351
  return cast(types.DataRecordDetailedDict, api_client.get(f'/resources/data-records/{self.uuid}/').json())
352
+
353
+ def _get_zip_size_bytes(self) -> int:
354
+ remote_storage_endpoint = DataRecordRemoteStorageEndpoint(uri=self.uri)
355
+ presigned_url = remote_storage_endpoint.get_remote_url()
356
+ response = HttpClient.request(url=presigned_url, headers={'range': 'bytes=0-0'})
357
+ content_range = response.headers.get('Content-Range', '')
358
+ if not content_range or '/' not in content_range:
359
+ raise ValueError('Unable to determine zip size: Content-Range header missing or invalid')
360
+ total_size = int(content_range.split('/')[1])
361
+ return total_size
362
+
363
+ def _iter_zip_bytes(self, chunk_size_bytes: int) -> Iterator[bytes]:
364
+ remote_storage_endpoint = DataRecordRemoteStorageEndpoint(uri=self.uri)
365
+ presigned_url = remote_storage_endpoint.get_remote_url()
366
+ response = HttpClient.request(url=presigned_url, headers={'range': 'bytes=0-0'})
367
+ content_range = response.headers.get('Content-Range', '')
368
+ if not content_range or '/' not in content_range:
369
+ raise ValueError('Unable to determine zip size: Content-Range header missing or invalid')
370
+ total_size = int(content_range.split('/')[1])
371
+
372
+ for start in range(0, total_size, chunk_size_bytes):
373
+ end = min(start + chunk_size_bytes - 1, total_size - 1)
374
+ presigned_url = remote_storage_endpoint.get_remote_url()
375
+ response = HttpClient.request(
376
+ url=presigned_url,
377
+ headers={'range': f'bytes={start}-{end}'},
378
+ timeout_in_seconds=300,
379
+ )
380
+ yield response.content
biolib/_index/index.py ADDED
@@ -0,0 +1,55 @@
1
+ import json
2
+ from typing import Any, Dict
3
+
4
+ from biolib import api
5
+ from biolib._shared.types import ResourceDetailedDict
6
+ from biolib.biolib_api_client import BiolibApiClient
7
+ from biolib.biolib_api_client.biolib_app_api import _get_resource_uri_from_str
8
+ from biolib.biolib_logging import logger
9
+
10
+
11
+ class Index:
12
+ def __init__(self, _internal_state: ResourceDetailedDict):
13
+ self._state = _internal_state
14
+
15
+ def __repr__(self) -> str:
16
+ return f'Index: {self._state["uri"]}'
17
+
18
+ @property
19
+ def uri(self) -> str:
20
+ return self._state['uri']
21
+
22
+ @property
23
+ def id(self) -> str:
24
+ return f'{self._state["account_uuid"]}.{self._state["uuid"]}'.replace('-', '_')
25
+
26
+ @staticmethod
27
+ def get_by_uri(uri: str) -> 'Index':
28
+ normalized_uri = _get_resource_uri_from_str(uri)
29
+ response: ResourceDetailedDict = api.client.get(path='/resource/', params={'uri': normalized_uri}).json()
30
+ if response['type'] != 'index':
31
+ raise Exception(f'Resource "{response["uri"]}" is not an Index')
32
+ return Index(_internal_state=response)
33
+
34
+ @staticmethod
35
+ def create(uri: str, config: Dict[str, Any]) -> str:
36
+ BiolibApiClient.assert_is_signed_in(authenticated_action_description='create an Index')
37
+
38
+ response = api.client.post(
39
+ path='/resources/indexes/',
40
+ data={
41
+ 'uri': uri,
42
+ 'index_config': config,
43
+ },
44
+ )
45
+ result = response.json()
46
+ created_uri: str = result['uri']
47
+ logger.info(f"Successfully created Index '{created_uri}'")
48
+ return created_uri
49
+
50
+ @staticmethod
51
+ def create_from_config_file(uri: str, config_path: str) -> str:
52
+ with open(config_path) as config_file:
53
+ index_config = json.load(config_file)
54
+
55
+ return Index.create(uri=uri, config=index_config)
biolib/_index/query_result.py ADDED
@@ -0,0 +1,103 @@
1
+ import json
2
+ from typing import Any, Dict, Iterator, List, Optional, Union
3
+
4
+ from biolib import api
5
+ from biolib._internal.http_client import HttpResponse
6
+ from biolib._internal.utils import base64_encode_string
7
+ from biolib._internal.utils.auth import decode_jwt_without_checking_signature
8
+ from biolib._runtime.runtime import Runtime
9
+ from biolib.biolib_api_client import BiolibApiClient
10
+ from biolib.biolib_errors import BioLibError
11
+
12
+
13
+ def _get_index_basic_auth_header() -> Optional[str]:
14
+ if Runtime.check_is_environment_biolib_app():
15
+ return None
16
+
17
+ deprecated_api_client = BiolibApiClient.get()
18
+ deprecated_api_client.refresh_access_token()
19
+ access_token = deprecated_api_client.access_token
20
+ if not access_token:
21
+ return None
22
+
23
+ decoded_token = decode_jwt_without_checking_signature(access_token)
24
+ user_uuid: Optional[str] = decoded_token['payload'].get('public_id')
25
+ if not user_uuid:
26
+ return None
27
+
28
+ normalized_user_uuid = user_uuid.replace('-', '_')
29
+ credentials = f'biolib_user|{normalized_user_uuid}:{access_token}'
30
+ return f'Basic {base64_encode_string(credentials)}'
31
+
32
+
33
+ class IndexQueryResult:
34
+ """Result wrapper for index query responses."""
35
+
36
+ def __init__(self, response: HttpResponse, data_format: str):
37
+ self._response = response
38
+ self._data_format = data_format
39
+ self._json_data: Optional[Dict[str, Any]] = None
40
+ if data_format == 'json':
41
+ content = self._response.content
42
+ if content:
43
+ self._json_data = json.loads(content.decode('utf-8'))
44
+
45
+ def iter_rows(self) -> Iterator[Dict[str, Any]]:
46
+ """Return an iterator over the rows in the query result.
47
+
48
+ Returns:
49
+ Iterator[Dict[str, Any]]: An iterator yielding each row as a dictionary.
50
+ """
51
+ if self._json_data is None:
52
+ raise BioLibError('iter_rows() is only available when data_format is "json"')
53
+ return iter(self._json_data['data'])
54
+
55
+
56
+ def query_index(
57
+ query: str,
58
+ data: Optional[Union[List[Dict[str, Any]], bytes]] = None,
59
+ data_format: str = 'json',
60
+ ) -> IndexQueryResult:
61
+ """Query the BioLib index with a SQL-like query.
62
+
63
+ Args:
64
+ query: The SQL query string to execute.
65
+ data: Optional input data. If data_format is "json", this should be a list of
66
+ dictionaries that will be JSON encoded. Otherwise, pass raw bytes.
67
+ data_format: The format for the query. Defaults to "json".
68
+
69
+ Returns:
70
+ IndexQueryResult: A result object wrapping the query response.
71
+
72
+ Raises:
73
+ BioLibError: If the query fails or returns a non-successful HTTP status code.
74
+ """
75
+ data_format = data_format.lower()
76
+
77
+ params: Dict[str, Union[str, int]] = {'default_format': data_format.upper()}
78
+ if data is not None:
79
+ params['query'] = query
80
+
81
+ if data is not None:
82
+ if data_format == 'json':
83
+ body: bytes = '\n'.join(json.dumps(item, ensure_ascii=False) for item in data).encode('utf-8')
84
+ else:
85
+ body = data # type: ignore[assignment]
86
+ else:
87
+ body = query.encode('utf-8')
88
+
89
+ response = api.client.post(
90
+ path='proxy/index',
91
+ data=body,
92
+ params=params,
93
+ headers={
94
+ 'Content-Type': 'text/plain; charset=utf-8',
95
+ 'Authorization': _get_index_basic_auth_header(),
96
+ },
97
+ authenticate=False,
98
+ )
99
+
100
+ if response.status_code < 200 or response.status_code >= 300:
101
+ raise BioLibError(f'Index query failed with status code {response.status_code}: {response.text}')
102
+
103
+ return IndexQueryResult(response, data_format)