pybiolib 1.1.1881__py3-none-any.whl → 1.2.7.dev0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.
Files changed (60)
  1. biolib/__init__.py +11 -4
  2. biolib/_data_record/data_record.py +278 -0
  3. biolib/_internal/data_record/__init__.py +1 -1
  4. biolib/_internal/data_record/data_record.py +97 -151
  5. biolib/_internal/data_record/remote_storage_endpoint.py +18 -7
  6. biolib/_internal/file_utils.py +77 -0
  7. biolib/_internal/fuse_mount/__init__.py +1 -0
  8. biolib/_internal/fuse_mount/experiment_fuse_mount.py +209 -0
  9. biolib/_internal/http_client.py +31 -9
  10. biolib/_internal/lfs/__init__.py +1 -0
  11. biolib/_internal/libs/__init__.py +1 -0
  12. biolib/_internal/libs/fusepy/__init__.py +1257 -0
  13. biolib/_internal/push_application.py +6 -1
  14. biolib/_internal/runtime.py +3 -56
  15. biolib/_internal/types/__init__.py +4 -0
  16. biolib/_internal/types/app.py +9 -0
  17. biolib/_internal/types/data_record.py +40 -0
  18. biolib/_internal/types/experiment.py +10 -0
  19. biolib/_internal/types/resource.py +14 -0
  20. biolib/_internal/types/typing.py +7 -0
  21. biolib/_internal/utils/multinode.py +264 -0
  22. biolib/_runtime/runtime.py +84 -0
  23. biolib/api/__init__.py +1 -0
  24. biolib/api/client.py +39 -17
  25. biolib/app/app.py +34 -71
  26. biolib/biolib_api_client/api_client.py +9 -2
  27. biolib/biolib_api_client/app_types.py +3 -2
  28. biolib/biolib_api_client/biolib_job_api.py +6 -0
  29. biolib/biolib_api_client/job_types.py +4 -4
  30. biolib/biolib_api_client/lfs_types.py +8 -2
  31. biolib/biolib_binary_format/remote_endpoints.py +12 -10
  32. biolib/biolib_binary_format/utils.py +23 -3
  33. biolib/cli/auth.py +1 -1
  34. biolib/cli/data_record.py +45 -6
  35. biolib/cli/lfs.py +10 -6
  36. biolib/compute_node/cloud_utils/cloud_utils.py +13 -16
  37. biolib/compute_node/job_worker/executors/docker_executor.py +127 -108
  38. biolib/compute_node/job_worker/job_storage.py +17 -5
  39. biolib/compute_node/job_worker/job_worker.py +25 -15
  40. biolib/compute_node/remote_host_proxy.py +72 -84
  41. biolib/compute_node/webserver/webserver_types.py +0 -1
  42. biolib/compute_node/webserver/worker_thread.py +42 -39
  43. biolib/experiments/experiment.py +75 -44
  44. biolib/jobs/job.py +98 -19
  45. biolib/jobs/job_result.py +46 -21
  46. biolib/jobs/types.py +1 -1
  47. biolib/runtime/__init__.py +2 -1
  48. biolib/sdk/__init__.py +18 -7
  49. biolib/typing_utils.py +2 -7
  50. biolib/user/sign_in.py +2 -2
  51. biolib/utils/seq_util.py +38 -35
  52. {pybiolib-1.1.1881.dist-info → pybiolib-1.2.7.dev0.dist-info}/METADATA +1 -1
  53. {pybiolib-1.1.1881.dist-info → pybiolib-1.2.7.dev0.dist-info}/RECORD +57 -45
  54. biolib/experiments/types.py +0 -9
  55. biolib/lfs/__init__.py +0 -4
  56. biolib/lfs/utils.py +0 -153
  57. /biolib/{lfs → _internal/lfs}/cache.py +0 -0
  58. {pybiolib-1.1.1881.dist-info → pybiolib-1.2.7.dev0.dist-info}/LICENSE +0 -0
  59. {pybiolib-1.1.1881.dist-info → pybiolib-1.2.7.dev0.dist-info}/WHEEL +0 -0
  60. {pybiolib-1.1.1881.dist-info → pybiolib-1.2.7.dev0.dist-info}/entry_points.txt +0 -0
biolib/__init__.py CHANGED
@@ -12,8 +12,8 @@ from biolib.experiments.experiment import Experiment
 from biolib.biolib_api_client import BiolibApiClient as _BioLibApiClient, App
 from biolib.jobs import Job as _Job
 from biolib import user as _user
-from biolib.typing_utils import List, Optional
-from biolib._internal.data_record import DataRecord as _DataRecord
+from biolib.typing_utils import List, Optional, cast as _cast
+from biolib._data_record.data_record import DataRecord as _DataRecord
 
 import biolib.api
 import biolib.app
@@ -45,6 +45,10 @@ def get_job(job_id: str) -> _Job:
     return _Job.create_from_uuid(uuid=job_id)
 
 
+def get_data_record(uri: str) -> _DataRecord:
+    return _DataRecord.get_by_uri(uri)
+
+
 def fetch_jobs(count: int = 25) -> List[_Job]:
     return _Job.fetch_jobs(count)
 
@@ -53,8 +57,11 @@ def fetch_data_records(uri: Optional[str] = None, count: Optional[int] = None) -
     return _DataRecord.fetch(uri, count)
 
 
-def get_experiment(name: str) -> Experiment:
-    return Experiment(name)
+def get_experiment(uri: Optional[str] = None, name: Optional[str] = None) -> Experiment:
+    if (not uri and not name) or (uri and name):
+        raise ValueError('Must provide either uri or name')
+
+    return Experiment.get_by_uri(uri=_cast(str, uri or name))
 
 
 def show_jobs(count: int = 25) -> None:
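With these changes, the top-level helpers resolve resources by URI rather than creating them on access. Below is a minimal, hedged sketch of how the updated helpers might be called after this release; the account handle and resource names are placeholders, not values from this diff.

    import biolib

    # get_experiment() now resolves an existing experiment by URI (or legacy name).
    experiment = biolib.get_experiment(uri='account_handle/my-experiment')

    # New helper returning a DataRecord wrapper for an existing record.
    record = biolib.get_data_record('account_handle/my-data-record')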
biolib/_data_record/data_record.py ADDED
@@ -0,0 +1,278 @@
+import os
+from collections import namedtuple
+from datetime import datetime
+from fnmatch import fnmatch
+from pathlib import Path
+from struct import Struct
+from typing import Callable, Dict, List, Optional, Union, cast
+
+from biolib import api, utils
+from biolib._internal import types
+from biolib._internal.data_record import get_data_record_state_from_uri
+from biolib._internal.data_record.data_record import validate_sqlite_v1
+from biolib._internal.data_record.remote_storage_endpoint import DataRecordRemoteStorageEndpoint
+from biolib._internal.file_utils import get_files_and_size_of_directory, get_iterable_zip_stream
+from biolib._internal.http_client import HttpClient
+from biolib.api import client as api_client
+from biolib.biolib_api_client import BiolibApiClient
+from biolib.biolib_api_client.lfs_types import DataRecordInfo, DataRecordVersion, DataRecordVersionInfo
+from biolib.biolib_binary_format import LazyLoadedFile
+from biolib.biolib_binary_format.utils import RemoteIndexableBuffer
+from biolib.biolib_errors import BioLibError
+from biolib.biolib_logging import logger
+from biolib.utils.app_uri import parse_app_uri
+from biolib.utils.zip.remote_zip import RemoteZip
+
+PathFilter = Union[str, Callable[[str], bool]]
+
+
+class DataRecord:
+    def __init__(self, _internal_state: DataRecordVersionInfo):
+        self._state = _internal_state
+
+    def __repr__(self):
+        return f'DataRecord: {self._state["resource_uri"]}'
+
+    @property
+    def uri(self) -> str:
+        return self._state['resource_uri']
+
+    @property
+    def uuid(self) -> str:
+        return self._state['resource_uuid']
+
+    @property
+    def name(self) -> str:
+        uri_parsed = parse_app_uri(self._state['resource_uri'], use_account_as_name_default=False)
+        if not uri_parsed['app_name']:
+            raise ValueError('Expected parameter "resource_uri" to contain resource name')
+
+        return uri_parsed['app_name']
+
+    def list_files(self, path_filter: Optional[PathFilter] = None) -> List[LazyLoadedFile]:
+        remote_storage_endpoint = DataRecordRemoteStorageEndpoint(
+            resource_version_uuid=self._state['resource_version_uuid'],
+        )
+        files: List[LazyLoadedFile] = []
+        with RemoteZip(url=remote_storage_endpoint.get_remote_url()) as remote_zip:
+            central_directory = remote_zip.get_central_directory()
+            for file_info in central_directory.values():
+                files.append(self._get_file(remote_storage_endpoint, file_info))
+
+        return self._get_filtered_files(files=files, path_filter=path_filter) if path_filter else files
+
+    def download_zip(self, output_path: str):
+        remote_storage_endpoint = DataRecordRemoteStorageEndpoint(
+            resource_version_uuid=self._state['resource_version_uuid'],
+        )
+        HttpClient.request(url=remote_storage_endpoint.get_remote_url(), response_path=output_path)
+
+    def download_files(self, output_dir: str, path_filter: Optional[PathFilter] = None) -> None:
+        filtered_files = self.list_files(path_filter=path_filter)
+
+        if len(filtered_files) == 0:
+            logger.debug('No files to save')
+            return
+
+        for file in filtered_files:
+            file_path = os.path.join(output_dir, file.path)
+            os.makedirs(os.path.dirname(file_path), exist_ok=True)
+            with open(file_path, mode='wb') as file_handle:
+                for chunk in file.get_data_iterator():
+                    file_handle.write(chunk)
+
+    def save_files(self, output_dir: str, path_filter: Optional[PathFilter] = None) -> None:
+        self.download_files(output_dir=output_dir, path_filter=path_filter)
+
+    def update(self, data_path: str, chunk_size_in_mb: Optional[int] = None) -> None:
+        assert os.path.isdir(data_path), f'The path "{data_path}" is not a directory.'
+        BiolibApiClient.assert_is_signed_in(authenticated_action_description='push data to a Data Record')
+
+        if os.path.realpath(data_path) == '/':
+            raise BioLibError('Pushing your root directory is not possible')
+
+        original_working_dir = os.getcwd()
+        os.chdir(data_path)
+        files_to_zip, data_size_in_bytes = get_files_and_size_of_directory(directory=os.getcwd())
+
+        if data_size_in_bytes > 4_500_000_000_000:
+            raise BioLibError('Attempted to push directory with a size larger than the limit of 4.5 TB')
+
+        # validate data record
+        detailed_dict: types.DataRecordDetailedDict = self._get_detailed_dict()
+        if detailed_dict['type']:
+            # only validate if data record has a type
+            data_record_type: types.DataRecordTypeDict = detailed_dict['type']
+            logger.info(f"Validating data record of type {data_record_type['name']}")
+            for rule in data_record_type['validation_rules']:
+                logger.info(f"Validating rule {rule['type']} for {rule['path']}...")
+                if rule['type'] == 'sqlite-v1':
+                    try:
+                        validate_sqlite_v1(schema=rule['rule'], sqlite_file=Path(rule['path']))
+                    except Exception as error:
+                        raise Exception('Data Record Validation failed') from error
+                else:
+                    raise Exception(f"Error processing data record validation: unknown rule type {rule['type']}")
+
+        min_chunk_size_bytes = 10_000_000
+        chunk_size_in_bytes: int
+        if chunk_size_in_mb:
+            chunk_size_in_bytes = chunk_size_in_mb * 1_000_000  # Convert megabytes to bytes
+            if chunk_size_in_bytes < min_chunk_size_bytes:
+                logger.warning('Specified chunk size is too small, using minimum of 10 MB instead.')
+                chunk_size_in_bytes = min_chunk_size_bytes
+        else:
+            # Calculate chunk size based on max chunk count of 10_000, using 9_000 to be on the safe side
+            chunk_size_in_bytes = max(min_chunk_size_bytes, int(data_size_in_bytes / 9_000))
+
+        data_size_in_mb = round(data_size_in_bytes / 10**6)
+        logger.info(f'Zipping {len(files_to_zip)} files, in total ~{data_size_in_mb}mb of data')
+
+        response = api.client.post(path='/lfs/versions/', data={'resource_uuid': self._state['resource_uuid']})
+        data_record_version: DataRecordVersion = response.json()
+        iterable_zip_stream = get_iterable_zip_stream(files=files_to_zip, chunk_size=chunk_size_in_bytes)
+
+        multipart_uploader = utils.MultiPartUploader(
+            use_process_pool=True,
+            get_presigned_upload_url_request=dict(
+                headers=None,
+                requires_biolib_auth=True,
+                path=f"/lfs/versions/{data_record_version['uuid']}/presigned_upload_url/",
+            ),
+            complete_upload_request=dict(
+                headers=None,
+                requires_biolib_auth=True,
+                path=f"/lfs/versions/{data_record_version['uuid']}/complete_upload/",
+            ),
+        )
+
+        multipart_uploader.upload(payload_iterator=iterable_zip_stream, payload_size_in_bytes=data_size_in_bytes)
+        os.chdir(original_working_dir)
+        logger.info(f"Successfully pushed a new Data Record version '{data_record_version['uri']}'")
+        self._state = get_data_record_state_from_uri(data_record_version['uri'])
+
+    @staticmethod
+    def get_by_uri(uri: str) -> 'DataRecord':
+        return DataRecord(_internal_state=get_data_record_state_from_uri(uri))
+
+    @staticmethod
+    def create(destination: str, data_path: Optional[str] = None, record_type: Optional[str] = None) -> 'DataRecord':
+        BiolibApiClient.assert_is_signed_in(authenticated_action_description='create a Data Record')
+        if data_path is not None:
+            assert os.path.isdir(data_path), f'The path "{data_path}" is not a directory.'
+        uri_parsed = parse_app_uri(destination, use_account_as_name_default=False)
+        if uri_parsed['app_name_normalized']:
+            data_record_uri = destination
+        else:
+            record_name = 'data-record-' + datetime.now().isoformat().split('.')[0].replace(':', '-')
+            data_record_uri = f'{destination}/{record_name}'
+
+        response = api.client.post(
+            path='/resources/data-records/',
+            data={
+                'uri': data_record_uri,
+                'type': record_type,
+            },
+        )
+        data_record_info: DataRecordInfo = response.json()
+        logger.info(f"Successfully created new Data Record '{data_record_info['uri']}'")
+
+        if data_path is not None:
+            data_record = DataRecord.get_by_uri(uri=data_record_info['uri'])
+            data_record.update(data_path=data_path)
+            return data_record
+        else:
+            return DataRecord.get_by_uri(uri=data_record_info['uri'])
+
+    @staticmethod
+    def fetch(uri: Optional[str] = None, count: Optional[int] = None) -> List['DataRecord']:
+        max_page_size = 1_000
+        params: Dict[str, Union[str, int]] = {
+            'page_size': str(count or max_page_size),
+            'resource_type': 'data-record',
+        }
+        if uri:
+            uri_parsed = parse_app_uri(uri, use_account_as_name_default=False)
+            params['account_handle'] = uri_parsed['account_handle_normalized']
+            if uri_parsed['app_name_normalized']:
+                params['app_name'] = uri_parsed['app_name_normalized']
+
+        results = api_client.get(path='/apps/', params=params).json()['results']
+        if count is None and len(results) == max_page_size:
+            logger.warning(
+                f'Fetch results exceeded maximum count of {max_page_size}. Some data records might not be fetched.'
+            )
+
+        return [
+            DataRecord(
+                _internal_state={
+                    'resource_uri': result['resource_uri'],
+                    'resource_uuid': result['public_id'],
+                    'resource_version_uuid': result['active_version'],
+                }
+            )
+            for result in results
+        ]
+
+    @staticmethod
+    def _get_file(remote_storage_endpoint: DataRecordRemoteStorageEndpoint, file_info: Dict) -> LazyLoadedFile:
+        local_file_header_signature_bytes = b'\x50\x4b\x03\x04'
+        local_file_header_struct = Struct('<H2sHHHIIIHH')
+        LocalFileHeader = namedtuple(
+            'LocalFileHeader',
+            (
+                'version',
+                'flags',
+                'compression_raw',
+                'mod_time',
+                'mod_date',
+                'crc_32_expected',
+                'compressed_size_raw',
+                'uncompressed_size_raw',
+                'file_name_len',
+                'extra_field_len',
+            ),
+        )
+
+        local_file_header_start = file_info['header_offset'] + len(local_file_header_signature_bytes)
+        local_file_header_end = local_file_header_start + local_file_header_struct.size
+
+        def file_start_func() -> int:
+            local_file_header_response = HttpClient.request(
+                url=remote_storage_endpoint.get_remote_url(),
+                headers={'range': f'bytes={local_file_header_start}-{local_file_header_end - 1}'},
+                timeout_in_seconds=300,
+            )
+            local_file_header = LocalFileHeader._make(
+                local_file_header_struct.unpack(local_file_header_response.content)
+            )
+            file_start: int = (
+                local_file_header_end + local_file_header.file_name_len + local_file_header.extra_field_len
+            )
+            return file_start
+
+        return LazyLoadedFile(
+            buffer=RemoteIndexableBuffer(endpoint=remote_storage_endpoint),
+            length=file_info['file_size'],
+            path=file_info['filename'],
+            start=None,
+            start_func=file_start_func,
+        )
+
+    @staticmethod
+    def _get_filtered_files(files: List[LazyLoadedFile], path_filter: PathFilter) -> List[LazyLoadedFile]:
+        if not (isinstance(path_filter, str) or callable(path_filter)):
+            raise Exception('Expected path_filter to be a string or a function')
+
+        if callable(path_filter):
+            return list(filter(lambda x: path_filter(x.path), files))  # type: ignore
+
+        glob_filter = cast(str, path_filter)
+
+        def _filter_function(file: LazyLoadedFile) -> bool:
+            return fnmatch(file.path, glob_filter)
+
+        return list(filter(_filter_function, files))
+
+    def _get_detailed_dict(self) -> types.DataRecordDetailedDict:
+        return cast(types.DataRecordDetailedDict, api_client.get(f'/resources/data-records/{self.uuid}/').json())
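The class above is new in this release. The following is a hedged usage sketch based only on the signatures added here; the account handle, record URI, and local paths are placeholders, not values from this diff.

    from biolib._data_record.data_record import DataRecord

    # Create a record under an account (a timestamped name is generated when the
    # destination has no record name) and push an initial version from a directory.
    record = DataRecord.create(destination='account_handle', data_path='./my-data')

    # Re-open it later by URI, list files matching a glob, and download them.
    record = DataRecord.get_by_uri('account_handle/my-data-record')
    for file in record.list_files(path_filter='*.csv'):
        print(file.path)
    record.download_files(output_dir='./downloaded', path_filter='*.csv')

    # Push a new version of the record from a local directory.
    record.update(data_path='./my-data')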
biolib/_internal/data_record/__init__.py CHANGED
@@ -1 +1 @@
-from .data_record import DataRecord
+from .data_record import get_data_record_state_from_uri, validate_sqlite_v1
biolib/_internal/data_record/data_record.py CHANGED
@@ -1,153 +1,99 @@
-import os
-from collections import namedtuple
-from datetime import datetime
-from fnmatch import fnmatch
-from struct import Struct
-from typing import Callable, Dict, List, Optional, Union, cast
-
-from biolib import lfs
-from biolib._internal.data_record.remote_storage_endpoint import DataRecordRemoteStorageEndpoint
-from biolib._internal.http_client import HttpClient
+import sqlite3
+from pathlib import Path
+
+from biolib._internal.types.data_record import SqliteV1DatabaseSchema
 from biolib.api import client as api_client
 from biolib.biolib_api_client import AppGetResponse
-from biolib.biolib_binary_format import LazyLoadedFile
-from biolib.biolib_binary_format.utils import RemoteIndexableBuffer
-from biolib.biolib_logging import logger
-from biolib.utils.app_uri import parse_app_uri
-from biolib.utils.zip.remote_zip import RemoteZip  # type: ignore
-
-PathFilter = Union[str, Callable[[str], bool]]
-
-
-class DataRecord:
-    def __init__(self, uri: str):
-        self._uri = uri
-        uri_parsed = parse_app_uri(uri, use_account_as_name_default=False)
-        if not uri_parsed['app_name']:
-            raise ValueError('Expected parameter "uri" to contain resource name')
-
-        self._name = uri_parsed['app_name']
-
-    @property
-    def uri(self) -> str:
-        return self._uri
-
-    @property
-    def name(self) -> str:
-        return self._name
-
-    def list_files(self, path_filter: Optional[PathFilter] = None) -> List[LazyLoadedFile]:
-        app_response: AppGetResponse = api_client.get(path='/app/', params={'uri': self._uri}).json()
-        remote_storage_endpoint = DataRecordRemoteStorageEndpoint(
-            resource_version_uuid=app_response['app_version']['public_id'],
-        )
-        files: List[LazyLoadedFile] = []
-        with RemoteZip(url=remote_storage_endpoint.get_remote_url()) as remote_zip:
-            central_directory = remote_zip.get_central_directory()
-            for file_info in central_directory.values():
-                files.append(self._get_file(remote_storage_endpoint, file_info))
-
-        return self._get_filtered_files(files=files, path_filter=path_filter) if path_filter else files
-
-    def download_files(self, output_dir: str, path_filter: Optional[PathFilter] = None) -> None:
-        filtered_files = self.list_files(path_filter=path_filter)
-
-        if len(filtered_files) == 0:
-            logger.debug('No files to save')
-            return
-
-        for file in filtered_files:
-            file_path = os.path.join(output_dir, file.path)
-            os.makedirs(os.path.dirname(file_path), exist_ok=True)
-            with open(file_path, mode='wb') as file_handle:
-                file_handle.write(file.get_data())
-
-    def save_files(self, output_dir: str, path_filter: Optional[PathFilter] = None) -> None:
-        self.download_files(output_dir=output_dir, path_filter=path_filter)
-
-    @staticmethod
-    def create(destination: str, data_path: str, name: Optional[str] = None) -> 'DataRecord':
-        assert os.path.isdir(data_path), f'The path "{data_path}" is not a directory.'
-        record_name = name if name else 'data-record-' + datetime.now().isoformat().split('.')[0].replace(':', '-')
-        record_uri = lfs.create_large_file_system(lfs_uri=f'{destination}/{record_name}')
-        record_version_uri = lfs.push_large_file_system(lfs_uri=record_uri, input_dir=data_path)
-        return DataRecord(uri=record_version_uri)
-
-    @staticmethod
-    def fetch(uri: Optional[str] = None, count: Optional[int] = None) -> List['DataRecord']:
-        max_page_size = 1_000
-        params: Dict[str, Union[str, int]] = {
-            'page_size': str(count or max_page_size),
-            'resource_type': 'data-record',
-        }
-        if uri:
-            uri_parsed = parse_app_uri(uri, use_account_as_name_default=False)
-            params['account_handle'] = uri_parsed['account_handle_normalized']
-
-        results = api_client.get(path='/apps/', params=params).json()['results']
-        if count is None and len(results) == max_page_size:
-            logger.warning(
-                f'Fetch results exceeded maximum count of {max_page_size}. Some data records might not be fetched.'
-            )
-
-        return [DataRecord(result['resource_uri']) for result in results]
-
-    @staticmethod
-    def _get_file(remote_storage_endpoint: DataRecordRemoteStorageEndpoint, file_info: Dict) -> LazyLoadedFile:
-        local_file_header_signature_bytes = b'\x50\x4b\x03\x04'
-        local_file_header_struct = Struct('<H2sHHHIIIHH')
-        LocalFileHeader = namedtuple(
-            'LocalFileHeader',
-            (
-                'version',
-                'flags',
-                'compression_raw',
-                'mod_time',
-                'mod_date',
-                'crc_32_expected',
-                'compressed_size_raw',
-                'uncompressed_size_raw',
-                'file_name_len',
-                'extra_field_len',
-            ),
-        )
-
-        local_file_header_start = file_info['header_offset'] + len(local_file_header_signature_bytes)
-        local_file_header_end = local_file_header_start + local_file_header_struct.size
-
-        def file_start_func() -> int:
-            local_file_header_response = HttpClient.request(
-                url=remote_storage_endpoint.get_remote_url(),
-                headers={'range': f'bytes={local_file_header_start}-{local_file_header_end - 1}'},
-                timeout_in_seconds=300,
-            )
-            local_file_header = LocalFileHeader._make(
-                local_file_header_struct.unpack(local_file_header_response.content)
-            )
-            file_start: int = (
-                local_file_header_end + local_file_header.file_name_len + local_file_header.extra_field_len
-            )
-            return file_start
-
-        return LazyLoadedFile(
-            buffer=RemoteIndexableBuffer(endpoint=remote_storage_endpoint),
-            length=file_info['file_size'],
-            path=file_info['filename'],
-            start=None,
-            start_func=file_start_func,
-        )
-
-    @staticmethod
-    def _get_filtered_files(files: List[LazyLoadedFile], path_filter: PathFilter) -> List[LazyLoadedFile]:
-        if not (isinstance(path_filter, str) or callable(path_filter)):
-            raise Exception('Expected path_filter to be a string or a function')
-
-        if callable(path_filter):
-            return list(filter(lambda x: path_filter(x.path), files))  # type: ignore
-
-        glob_filter = cast(str, path_filter)
-
-        def _filter_function(file: LazyLoadedFile) -> bool:
-            return fnmatch(file.path, glob_filter)
-
-        return list(filter(_filter_function, files))
+from biolib.biolib_api_client.lfs_types import DataRecordVersionInfo
+
+
+def get_actual_schema(db_path):
+    if not db_path.exists():
+        raise Exception(f'File {db_path} not found.')
+    conn = sqlite3.connect(db_path)
+    cursor = conn.cursor()
+
+    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
+    tables = cursor.fetchall()
+
+    actual_schema: SqliteV1DatabaseSchema = {'tables': {}}
+    for table in tables:
+        table_name = table[0]
+        cursor.execute(f'PRAGMA table_info({table_name});')
+        columns = cursor.fetchall()
+        actual_schema['tables'][table_name] = {'columns': {}}
+        for column in columns:
+            actual_schema['tables'][table_name]['columns'][column[1]] = {
+                'type': column[2],
+                'nullable': not bool(column[3]),
+            }
+
+        cursor.execute(f'PRAGMA foreign_key_list({table_name});')
+        foreign_keys = cursor.fetchall()
+        for fk in foreign_keys:
+            actual_schema['tables'][table_name]['columns'][fk[3]]['foreign_key'] = {'table': fk[2], 'column': fk[4]}
+
+    conn.close()
+    return actual_schema
+
+
+def verify_schema(specification: SqliteV1DatabaseSchema, actual_schema: SqliteV1DatabaseSchema):
+    for table_name, table_spec in specification['tables'].items():
+        if table_name not in actual_schema['tables']:
+            raise Exception(f"Error: Table '{table_name}' is missing.")
+
+        for column_name, column_spec in table_spec['columns'].items():
+            if column_name not in actual_schema['tables'][table_name]['columns']:
+                raise Exception(f"Error: Column '{column_name}' in table '{table_name}' is missing.")
+
+            actual_column = actual_schema['tables'][table_name]['columns'][column_name]
+            if actual_column['type'] != column_spec['type']:
+                raise Exception(
+                    f"Error: Column '{column_name}' in table '{table_name}' "
+                    "has type '{actual_column['type']}' but expected '{column_spec['type']}'."
+                )
+
+            if not actual_column['nullable'] and column_spec.get('nullable', True):
+                raise Exception(
+                    f"Error: Column '{column_name}' in table '{table_name}' is "
+                    'not nullable but should be nullable according to the specification.'
+                )
+
+        for column_name, column_spec in table_spec['columns'].items():
+            if column_spec.get('foreign_key'):
+                foreign_key_spec = column_spec['foreign_key']
+                if actual_schema['tables'][table_name]['columns'][column_name].get('foreign_key'):
+                    fk = actual_schema['tables'][table_name]['columns'][column_name]['foreign_key']
+                    if (
+                        fk
+                        and foreign_key_spec
+                        and fk['table'] == foreign_key_spec['table']
+                        and fk['column'] == foreign_key_spec['column']
+                    ):
+                        raise Exception(
+                            f"Error: Column '{column_name}' in table '{table_name}' does "
+                            'not have the correct foreign key constraint.'
+                        )
+                else:
+                    raise Exception(
+                        f"Error: Column '{column_name}' in table '{table_name}' does "
+                        'not have a foreign key constraint.'
+                    )
+
+
+def get_data_record_state_from_uri(uri) -> 'DataRecordVersionInfo':
+    app_response: AppGetResponse = api_client.get(path='/app/', params={'uri': uri}).json()
+    if app_response['app']['type'] != 'data-record':
+        raise Exception(f'Resource "{uri}" is not a Data Record')
+    return DataRecordVersionInfo(
+        resource_uri=app_response['app_version']['app_uri'],
+        resource_uuid=app_response['app']['public_id'],
+        resource_version_uuid=app_response['app_version']['public_id'],
+    )
+
+
+def validate_sqlite_v1(schema: SqliteV1DatabaseSchema, sqlite_file: Path):
+    actual_schema = get_actual_schema(sqlite_file)
+    print(schema)
+    print(actual_schema)
+    verify_schema(specification=schema, actual_schema=actual_schema)
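For reference, the schema dictionary that validate_sqlite_v1 compares against follows the shape built by get_actual_schema and read by verify_schema. Below is a minimal sketch under that assumption; the table name, column names, and SQLite file name are made up for illustration.

    from pathlib import Path

    from biolib._internal.data_record.data_record import validate_sqlite_v1

    # Hypothetical specification matching the {'tables': {...: {'columns': {...}}}} shape.
    schema = {
        'tables': {
            'samples': {
                'columns': {
                    'id': {'type': 'INTEGER', 'nullable': False},
                    'name': {'type': 'TEXT', 'nullable': True},
                },
            },
        },
    }

    # Raises an Exception describing the first mismatch if the file does not satisfy the schema.
    validate_sqlite_v1(schema=schema, sqlite_file=Path('records.sqlite'))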
biolib/_internal/data_record/remote_storage_endpoint.py CHANGED
@@ -1,23 +1,34 @@
+import os
 from datetime import datetime, timedelta
+from urllib.parse import urlparse
 
 from biolib.api import client as api_client
-from biolib.biolib_api_client.lfs_types import LargeFileSystemVersion
+from biolib.biolib_api_client.lfs_types import DataRecordVersion
 from biolib.biolib_binary_format.utils import RemoteEndpoint
 from biolib.biolib_logging import logger
+from biolib.typing_utils import Optional
 
 
 class DataRecordRemoteStorageEndpoint(RemoteEndpoint):
     def __init__(self, resource_version_uuid: str):
         self._resource_version_uuid: str = resource_version_uuid
-        self._expires_at = None
-        self._presigned_url = None
+        self._expires_at: Optional[datetime] = None
+        self._presigned_url: Optional[str] = None
 
-    def get_remote_url(self):
-        if not self._presigned_url or datetime.utcnow() > self._expires_at:
-            lfs_version: LargeFileSystemVersion = api_client.get(
+    def get_remote_url(self) -> str:
+        if not self._presigned_url or not self._expires_at or datetime.utcnow() > self._expires_at:
+            lfs_version: DataRecordVersion = api_client.get(
                 path=f'/lfs/versions/{self._resource_version_uuid}/',
             ).json()
-            self._presigned_url = lfs_version['presigned_download_url']
+
+            app_caller_proxy_job_storage_base_url = os.getenv('BIOLIB_CLOUD_JOB_STORAGE_BASE_URL', '')
+            if app_caller_proxy_job_storage_base_url:
+                # Done to hit App Caller Proxy when downloading from inside an app
+                parsed_url = urlparse(lfs_version['presigned_download_url'])
+                self._presigned_url = f'{app_caller_proxy_job_storage_base_url}{parsed_url.path}?{parsed_url.query}'
+            else:
+                self._presigned_url = lfs_version['presigned_download_url']
+
             self._expires_at = datetime.utcnow() + timedelta(minutes=8)
             logger.debug(
                 f'DataRecord "{self._resource_version_uuid}" fetched presigned URL '