pybiolib 1.1.1881__py3-none-any.whl → 1.2.7.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biolib/__init__.py +11 -4
- biolib/_data_record/data_record.py +278 -0
- biolib/_internal/data_record/__init__.py +1 -1
- biolib/_internal/data_record/data_record.py +97 -151
- biolib/_internal/data_record/remote_storage_endpoint.py +18 -7
- biolib/_internal/file_utils.py +77 -0
- biolib/_internal/fuse_mount/__init__.py +1 -0
- biolib/_internal/fuse_mount/experiment_fuse_mount.py +209 -0
- biolib/_internal/http_client.py +31 -9
- biolib/_internal/lfs/__init__.py +1 -0
- biolib/_internal/libs/__init__.py +1 -0
- biolib/_internal/libs/fusepy/__init__.py +1257 -0
- biolib/_internal/push_application.py +6 -1
- biolib/_internal/runtime.py +3 -56
- biolib/_internal/types/__init__.py +4 -0
- biolib/_internal/types/app.py +9 -0
- biolib/_internal/types/data_record.py +40 -0
- biolib/_internal/types/experiment.py +10 -0
- biolib/_internal/types/resource.py +14 -0
- biolib/_internal/types/typing.py +7 -0
- biolib/_internal/utils/multinode.py +264 -0
- biolib/_runtime/runtime.py +84 -0
- biolib/api/__init__.py +1 -0
- biolib/api/client.py +39 -17
- biolib/app/app.py +34 -71
- biolib/biolib_api_client/api_client.py +9 -2
- biolib/biolib_api_client/app_types.py +3 -2
- biolib/biolib_api_client/biolib_job_api.py +6 -0
- biolib/biolib_api_client/job_types.py +4 -4
- biolib/biolib_api_client/lfs_types.py +8 -2
- biolib/biolib_binary_format/remote_endpoints.py +12 -10
- biolib/biolib_binary_format/utils.py +23 -3
- biolib/cli/auth.py +1 -1
- biolib/cli/data_record.py +45 -6
- biolib/cli/lfs.py +10 -6
- biolib/compute_node/cloud_utils/cloud_utils.py +13 -16
- biolib/compute_node/job_worker/executors/docker_executor.py +127 -108
- biolib/compute_node/job_worker/job_storage.py +17 -5
- biolib/compute_node/job_worker/job_worker.py +25 -15
- biolib/compute_node/remote_host_proxy.py +72 -84
- biolib/compute_node/webserver/webserver_types.py +0 -1
- biolib/compute_node/webserver/worker_thread.py +42 -39
- biolib/experiments/experiment.py +75 -44
- biolib/jobs/job.py +98 -19
- biolib/jobs/job_result.py +46 -21
- biolib/jobs/types.py +1 -1
- biolib/runtime/__init__.py +2 -1
- biolib/sdk/__init__.py +18 -7
- biolib/typing_utils.py +2 -7
- biolib/user/sign_in.py +2 -2
- biolib/utils/seq_util.py +38 -35
- {pybiolib-1.1.1881.dist-info → pybiolib-1.2.7.dev0.dist-info}/METADATA +1 -1
- {pybiolib-1.1.1881.dist-info → pybiolib-1.2.7.dev0.dist-info}/RECORD +57 -45
- biolib/experiments/types.py +0 -9
- biolib/lfs/__init__.py +0 -4
- biolib/lfs/utils.py +0 -153
- /biolib/{lfs → _internal/lfs}/cache.py +0 -0
- {pybiolib-1.1.1881.dist-info → pybiolib-1.2.7.dev0.dist-info}/LICENSE +0 -0
- {pybiolib-1.1.1881.dist-info → pybiolib-1.2.7.dev0.dist-info}/WHEEL +0 -0
- {pybiolib-1.1.1881.dist-info → pybiolib-1.2.7.dev0.dist-info}/entry_points.txt +0 -0
biolib/__init__.py
CHANGED
```diff
@@ -12,8 +12,8 @@ from biolib.experiments.experiment import Experiment
 from biolib.biolib_api_client import BiolibApiClient as _BioLibApiClient, App
 from biolib.jobs import Job as _Job
 from biolib import user as _user
-from biolib.typing_utils import List, Optional
-from biolib.
+from biolib.typing_utils import List, Optional, cast as _cast
+from biolib._data_record.data_record import DataRecord as _DataRecord
 
 import biolib.api
 import biolib.app
@@ -45,6 +45,10 @@ def get_job(job_id: str) -> _Job:
     return _Job.create_from_uuid(uuid=job_id)
 
 
+def get_data_record(uri: str) -> _DataRecord:
+    return _DataRecord.get_by_uri(uri)
+
+
 def fetch_jobs(count: int = 25) -> List[_Job]:
     return _Job.fetch_jobs(count)
 
@@ -53,8 +57,11 @@ def fetch_data_records(uri: Optional[str] = None, count: Optional[int] = None) -
     return _DataRecord.fetch(uri, count)
 
 
-def get_experiment(name: str) -> Experiment:
-
+def get_experiment(uri: Optional[str] = None, name: Optional[str] = None) -> Experiment:
+    if (not uri and not name) or (uri and name):
+        raise ValueError('Must provide either uri or name')
+
+    return Experiment.get_by_uri(uri=_cast(str, uri or name))
 
 
 def show_jobs(count: int = 25) -> None:
```
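Taken together, these hunks add a top-level `get_data_record()` helper and let `get_experiment()` resolve by URI as well as by name. A minimal usage sketch based on the signatures above (the URIs are placeholders):

```python
import biolib

# Look up a Data Record through the new top-level helper (placeholder URI).
record = biolib.get_data_record('my-account/my-data-record')

# get_experiment() now takes exactly one of uri= or name=;
# passing both (or neither) raises ValueError.
experiment = biolib.get_experiment(uri='my-account/my-experiment')
```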
biolib/_data_record/data_record.py
ADDED

```diff
@@ -0,0 +1,278 @@
+import os
+from collections import namedtuple
+from datetime import datetime
+from fnmatch import fnmatch
+from pathlib import Path
+from struct import Struct
+from typing import Callable, Dict, List, Optional, Union, cast
+
+from biolib import api, utils
+from biolib._internal import types
+from biolib._internal.data_record import get_data_record_state_from_uri
+from biolib._internal.data_record.data_record import validate_sqlite_v1
+from biolib._internal.data_record.remote_storage_endpoint import DataRecordRemoteStorageEndpoint
+from biolib._internal.file_utils import get_files_and_size_of_directory, get_iterable_zip_stream
+from biolib._internal.http_client import HttpClient
+from biolib.api import client as api_client
+from biolib.biolib_api_client import BiolibApiClient
+from biolib.biolib_api_client.lfs_types import DataRecordInfo, DataRecordVersion, DataRecordVersionInfo
+from biolib.biolib_binary_format import LazyLoadedFile
+from biolib.biolib_binary_format.utils import RemoteIndexableBuffer
+from biolib.biolib_errors import BioLibError
+from biolib.biolib_logging import logger
+from biolib.utils.app_uri import parse_app_uri
+from biolib.utils.zip.remote_zip import RemoteZip
+
+PathFilter = Union[str, Callable[[str], bool]]
+
+
+class DataRecord:
+    def __init__(self, _internal_state: DataRecordVersionInfo):
+        self._state = _internal_state
+
+    def __repr__(self):
+        return f'DataRecord: {self._state["resource_uri"]}'
+
+    @property
+    def uri(self) -> str:
+        return self._state['resource_uri']
+
+    @property
+    def uuid(self) -> str:
+        return self._state['resource_uuid']
+
+    @property
+    def name(self) -> str:
+        uri_parsed = parse_app_uri(self._state['resource_uri'], use_account_as_name_default=False)
+        if not uri_parsed['app_name']:
+            raise ValueError('Expected parameter "resource_uri" to contain resource name')
+
+        return uri_parsed['app_name']
+
+    def list_files(self, path_filter: Optional[PathFilter] = None) -> List[LazyLoadedFile]:
+        remote_storage_endpoint = DataRecordRemoteStorageEndpoint(
+            resource_version_uuid=self._state['resource_version_uuid'],
+        )
+        files: List[LazyLoadedFile] = []
+        with RemoteZip(url=remote_storage_endpoint.get_remote_url()) as remote_zip:
+            central_directory = remote_zip.get_central_directory()
+            for file_info in central_directory.values():
+                files.append(self._get_file(remote_storage_endpoint, file_info))
+
+        return self._get_filtered_files(files=files, path_filter=path_filter) if path_filter else files
+
+    def download_zip(self, output_path: str):
+        remote_storage_endpoint = DataRecordRemoteStorageEndpoint(
+            resource_version_uuid=self._state['resource_version_uuid'],
+        )
+        HttpClient.request(url=remote_storage_endpoint.get_remote_url(), response_path=output_path)
+
+    def download_files(self, output_dir: str, path_filter: Optional[PathFilter] = None) -> None:
+        filtered_files = self.list_files(path_filter=path_filter)
+
+        if len(filtered_files) == 0:
+            logger.debug('No files to save')
+            return
+
+        for file in filtered_files:
+            file_path = os.path.join(output_dir, file.path)
+            os.makedirs(os.path.dirname(file_path), exist_ok=True)
+            with open(file_path, mode='wb') as file_handle:
+                for chunk in file.get_data_iterator():
+                    file_handle.write(chunk)
+
+    def save_files(self, output_dir: str, path_filter: Optional[PathFilter] = None) -> None:
+        self.download_files(output_dir=output_dir, path_filter=path_filter)
+
+    def update(self, data_path: str, chunk_size_in_mb: Optional[int] = None) -> None:
+        assert os.path.isdir(data_path), f'The path "{data_path}" is not a directory.'
+        BiolibApiClient.assert_is_signed_in(authenticated_action_description='push data to a Data Record')
+
+        if os.path.realpath(data_path) == '/':
+            raise BioLibError('Pushing your root directory is not possible')
+
+        original_working_dir = os.getcwd()
+        os.chdir(data_path)
+        files_to_zip, data_size_in_bytes = get_files_and_size_of_directory(directory=os.getcwd())
+
+        if data_size_in_bytes > 4_500_000_000_000:
+            raise BioLibError('Attempted to push directory with a size larger than the limit of 4.5 TB')
+
+        # validate data record
+        detailed_dict: types.DataRecordDetailedDict = self._get_detailed_dict()
+        if detailed_dict['type']:
+            # only validate if data record has a type
+            data_record_type: types.DataRecordTypeDict = detailed_dict['type']
+            logger.info(f"Validating data record of type {data_record_type['name']}")
+            for rule in data_record_type['validation_rules']:
+                logger.info(f"Validating rule {rule['type']} for {rule['path']}...")
+                if rule['type'] == 'sqlite-v1':
+                    try:
+                        validate_sqlite_v1(schema=rule['rule'], sqlite_file=Path(rule['path']))
+                    except Exception as error:
+                        raise Exception('Data Record Validation failed') from error
+                else:
+                    raise Exception(f"Error processing data record validation: unknown rule type {rule['type']}")
+
+        min_chunk_size_bytes = 10_000_000
+        chunk_size_in_bytes: int
+        if chunk_size_in_mb:
+            chunk_size_in_bytes = chunk_size_in_mb * 1_000_000  # Convert megabytes to bytes
+            if chunk_size_in_bytes < min_chunk_size_bytes:
+                logger.warning('Specified chunk size is too small, using minimum of 10 MB instead.')
+                chunk_size_in_bytes = min_chunk_size_bytes
+        else:
+            # Calculate chunk size based on max chunk count of 10_000, using 9_000 to be on the safe side
+            chunk_size_in_bytes = max(min_chunk_size_bytes, int(data_size_in_bytes / 9_000))
+
+        data_size_in_mb = round(data_size_in_bytes / 10**6)
+        logger.info(f'Zipping {len(files_to_zip)} files, in total ~{data_size_in_mb}mb of data')
+
+        response = api.client.post(path='/lfs/versions/', data={'resource_uuid': self._state['resource_uuid']})
+        data_record_version: DataRecordVersion = response.json()
+        iterable_zip_stream = get_iterable_zip_stream(files=files_to_zip, chunk_size=chunk_size_in_bytes)
+
+        multipart_uploader = utils.MultiPartUploader(
+            use_process_pool=True,
+            get_presigned_upload_url_request=dict(
+                headers=None,
+                requires_biolib_auth=True,
+                path=f"/lfs/versions/{data_record_version['uuid']}/presigned_upload_url/",
+            ),
+            complete_upload_request=dict(
+                headers=None,
+                requires_biolib_auth=True,
+                path=f"/lfs/versions/{data_record_version['uuid']}/complete_upload/",
+            ),
+        )
+
+        multipart_uploader.upload(payload_iterator=iterable_zip_stream, payload_size_in_bytes=data_size_in_bytes)
+        os.chdir(original_working_dir)
+        logger.info(f"Successfully pushed a new Data Record version '{data_record_version['uri']}'")
+        self._state = get_data_record_state_from_uri(data_record_version['uri'])
+
+    @staticmethod
+    def get_by_uri(uri: str) -> 'DataRecord':
+        return DataRecord(_internal_state=get_data_record_state_from_uri(uri))
+
+    @staticmethod
+    def create(destination: str, data_path: Optional[str] = None, record_type: Optional[str] = None) -> 'DataRecord':
+        BiolibApiClient.assert_is_signed_in(authenticated_action_description='create a Data Record')
+        if data_path is not None:
+            assert os.path.isdir(data_path), f'The path "{data_path}" is not a directory.'
+        uri_parsed = parse_app_uri(destination, use_account_as_name_default=False)
+        if uri_parsed['app_name_normalized']:
+            data_record_uri = destination
+        else:
+            record_name = 'data-record-' + datetime.now().isoformat().split('.')[0].replace(':', '-')
+            data_record_uri = f'{destination}/{record_name}'
+
+        response = api.client.post(
+            path='/resources/data-records/',
+            data={
+                'uri': data_record_uri,
+                'type': record_type,
+            },
+        )
+        data_record_info: DataRecordInfo = response.json()
+        logger.info(f"Successfully created new Data Record '{data_record_info['uri']}'")
+
+        if data_path is not None:
+            data_record = DataRecord.get_by_uri(uri=data_record_info['uri'])
+            data_record.update(data_path=data_path)
+            return data_record
+        else:
+            return DataRecord.get_by_uri(uri=data_record_info['uri'])
+
+    @staticmethod
+    def fetch(uri: Optional[str] = None, count: Optional[int] = None) -> List['DataRecord']:
+        max_page_size = 1_000
+        params: Dict[str, Union[str, int]] = {
+            'page_size': str(count or max_page_size),
+            'resource_type': 'data-record',
+        }
+        if uri:
+            uri_parsed = parse_app_uri(uri, use_account_as_name_default=False)
+            params['account_handle'] = uri_parsed['account_handle_normalized']
+            if uri_parsed['app_name_normalized']:
+                params['app_name'] = uri_parsed['app_name_normalized']
+
+        results = api_client.get(path='/apps/', params=params).json()['results']
+        if count is None and len(results) == max_page_size:
+            logger.warning(
+                f'Fetch results exceeded maximum count of {max_page_size}. Some data records might not be fetched.'
+            )
+
+        return [
+            DataRecord(
+                _internal_state={
+                    'resource_uri': result['resource_uri'],
+                    'resource_uuid': result['public_id'],
+                    'resource_version_uuid': result['active_version'],
+                }
+            )
+            for result in results
+        ]
+
+    @staticmethod
+    def _get_file(remote_storage_endpoint: DataRecordRemoteStorageEndpoint, file_info: Dict) -> LazyLoadedFile:
+        local_file_header_signature_bytes = b'\x50\x4b\x03\x04'
+        local_file_header_struct = Struct('<H2sHHHIIIHH')
+        LocalFileHeader = namedtuple(
+            'LocalFileHeader',
+            (
+                'version',
+                'flags',
+                'compression_raw',
+                'mod_time',
+                'mod_date',
+                'crc_32_expected',
+                'compressed_size_raw',
+                'uncompressed_size_raw',
+                'file_name_len',
+                'extra_field_len',
+            ),
+        )
+
+        local_file_header_start = file_info['header_offset'] + len(local_file_header_signature_bytes)
+        local_file_header_end = local_file_header_start + local_file_header_struct.size
+
+        def file_start_func() -> int:
+            local_file_header_response = HttpClient.request(
+                url=remote_storage_endpoint.get_remote_url(),
+                headers={'range': f'bytes={local_file_header_start}-{local_file_header_end - 1}'},
+                timeout_in_seconds=300,
+            )
+            local_file_header = LocalFileHeader._make(
+                local_file_header_struct.unpack(local_file_header_response.content)
+            )
+            file_start: int = (
+                local_file_header_end + local_file_header.file_name_len + local_file_header.extra_field_len
+            )
+            return file_start
+
+        return LazyLoadedFile(
+            buffer=RemoteIndexableBuffer(endpoint=remote_storage_endpoint),
+            length=file_info['file_size'],
+            path=file_info['filename'],
+            start=None,
+            start_func=file_start_func,
+        )
+
+    @staticmethod
+    def _get_filtered_files(files: List[LazyLoadedFile], path_filter: PathFilter) -> List[LazyLoadedFile]:
+        if not (isinstance(path_filter, str) or callable(path_filter)):
+            raise Exception('Expected path_filter to be a string or a function')
+
+        if callable(path_filter):
+            return list(filter(lambda x: path_filter(x.path), files))  # type: ignore
+
+        glob_filter = cast(str, path_filter)
+
+        def _filter_function(file: LazyLoadedFile) -> bool:
+            return fnmatch(file.path, glob_filter)
+
+        return list(filter(_filter_function, files))
+
+    def _get_detailed_dict(self) -> types.DataRecordDetailedDict:
+        return cast(types.DataRecordDetailedDict, api_client.get(f'/resources/data-records/{self.uuid}/').json())
```
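The class above is the new public surface for Data Records. A hedged sketch of how it might be used (the account handle and local paths are placeholders; per the code above, `create()` generates a timestamped record name when the destination is only an account handle):

```python
from biolib._data_record.data_record import DataRecord

# Create a record under an account and push a local directory in one call.
record = DataRecord.create(destination='my-account', data_path='./results')

# Push a new version later; chunk sizes below the 10 MB minimum are bumped up.
record.update(data_path='./results', chunk_size_in_mb=50)

# path_filter accepts either a glob string or a callable taking the file path.
csv_files = record.list_files(path_filter='*.csv')
record.download_files(output_dir='./downloaded', path_filter=lambda path: path.endswith('.csv'))
```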
biolib/_internal/data_record/__init__.py
CHANGED

```diff
@@ -1 +1 @@
-from .data_record import
+from .data_record import get_data_record_state_from_uri, validate_sqlite_v1
```
biolib/_internal/data_record/data_record.py
CHANGED

```diff
@@ -1,153 +1,99 @@
-import
-from
-
-from
-from struct import Struct
-from typing import Callable, Dict, List, Optional, Union, cast
-
-from biolib import lfs
-from biolib._internal.data_record.remote_storage_endpoint import DataRecordRemoteStorageEndpoint
-from biolib._internal.http_client import HttpClient
+import sqlite3
+from pathlib import Path
+
+from biolib._internal.types.data_record import SqliteV1DatabaseSchema
 from biolib.api import client as api_client
 from biolib.biolib_api_client import AppGetResponse
-from biolib.
-[… 92 lines truncated …]
-                'mod_date',
-                'crc_32_expected',
-                'compressed_size_raw',
-                'uncompressed_size_raw',
-                'file_name_len',
-                'extra_field_len',
-            ),
-        )
-
-        local_file_header_start = file_info['header_offset'] + len(local_file_header_signature_bytes)
-        local_file_header_end = local_file_header_start + local_file_header_struct.size
-
-        def file_start_func() -> int:
-            local_file_header_response = HttpClient.request(
-                url=remote_storage_endpoint.get_remote_url(),
-                headers={'range': f'bytes={local_file_header_start}-{local_file_header_end - 1}'},
-                timeout_in_seconds=300,
-            )
-            local_file_header = LocalFileHeader._make(
-                local_file_header_struct.unpack(local_file_header_response.content)
-            )
-            file_start: int = (
-                local_file_header_end + local_file_header.file_name_len + local_file_header.extra_field_len
-            )
-            return file_start
-
-        return LazyLoadedFile(
-            buffer=RemoteIndexableBuffer(endpoint=remote_storage_endpoint),
-            length=file_info['file_size'],
-            path=file_info['filename'],
-            start=None,
-            start_func=file_start_func,
-        )
-
-    @staticmethod
-    def _get_filtered_files(files: List[LazyLoadedFile], path_filter: PathFilter) -> List[LazyLoadedFile]:
-        if not (isinstance(path_filter, str) or callable(path_filter)):
-            raise Exception('Expected path_filter to be a string or a function')
-
-        if callable(path_filter):
-            return list(filter(lambda x: path_filter(x.path), files))  # type: ignore
-
-        glob_filter = cast(str, path_filter)
-
-        def _filter_function(file: LazyLoadedFile) -> bool:
-            return fnmatch(file.path, glob_filter)
-
-        return list(filter(_filter_function, files))
+from biolib.biolib_api_client.lfs_types import DataRecordVersionInfo
+
+
+def get_actual_schema(db_path):
+    if not db_path.exists():
+        raise Exception(f'File {db_path} not found.')
+    conn = sqlite3.connect(db_path)
+    cursor = conn.cursor()
+
+    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
+    tables = cursor.fetchall()
+
+    actual_schema: SqliteV1DatabaseSchema = {'tables': {}}
+    for table in tables:
+        table_name = table[0]
+        cursor.execute(f'PRAGMA table_info({table_name});')
+        columns = cursor.fetchall()
+        actual_schema['tables'][table_name] = {'columns': {}}
+        for column in columns:
+            actual_schema['tables'][table_name]['columns'][column[1]] = {
+                'type': column[2],
+                'nullable': not bool(column[3]),
+            }
+
+        cursor.execute(f'PRAGMA foreign_key_list({table_name});')
+        foreign_keys = cursor.fetchall()
+        for fk in foreign_keys:
+            actual_schema['tables'][table_name]['columns'][fk[3]]['foreign_key'] = {'table': fk[2], 'column': fk[4]}
+
+    conn.close()
+    return actual_schema
+
+
+def verify_schema(specification: SqliteV1DatabaseSchema, actual_schema: SqliteV1DatabaseSchema):
+    for table_name, table_spec in specification['tables'].items():
+        if table_name not in actual_schema['tables']:
+            raise Exception(f"Error: Table '{table_name}' is missing.")
+
+        for column_name, column_spec in table_spec['columns'].items():
+            if column_name not in actual_schema['tables'][table_name]['columns']:
+                raise Exception(f"Error: Column '{column_name}' in table '{table_name}' is missing.")
+
+            actual_column = actual_schema['tables'][table_name]['columns'][column_name]
+            if actual_column['type'] != column_spec['type']:
+                raise Exception(
+                    f"Error: Column '{column_name}' in table '{table_name}' "
+                    "has type '{actual_column['type']}' but expected '{column_spec['type']}'."
+                )
+
+            if not actual_column['nullable'] and column_spec.get('nullable', True):
+                raise Exception(
+                    f"Error: Column '{column_name}' in table '{table_name}' is "
+                    'not nullable but should be nullable according to the specification.'
+                )
+
+        for column_name, column_spec in table_spec['columns'].items():
+            if column_spec.get('foreign_key'):
+                foreign_key_spec = column_spec['foreign_key']
+                if actual_schema['tables'][table_name]['columns'][column_name].get('foreign_key'):
+                    fk = actual_schema['tables'][table_name]['columns'][column_name]['foreign_key']
+                    if (
+                        fk
+                        and foreign_key_spec
+                        and fk['table'] == foreign_key_spec['table']
+                        and fk['column'] == foreign_key_spec['column']
+                    ):
+                        raise Exception(
+                            f"Error: Column '{column_name}' in table '{table_name}' does "
+                            'not have the correct foreign key constraint.'
+                        )
+                else:
+                    raise Exception(
+                        f"Error: Column '{column_name}' in table '{table_name}' does "
+                        'not have a foreign key constraint.'
+                    )
+
+
+def get_data_record_state_from_uri(uri) -> 'DataRecordVersionInfo':
+    app_response: AppGetResponse = api_client.get(path='/app/', params={'uri': uri}).json()
+    if app_response['app']['type'] != 'data-record':
+        raise Exception(f'Resource "{uri}" is not a Data Record')
+    return DataRecordVersionInfo(
+        resource_uri=app_response['app_version']['app_uri'],
+        resource_uuid=app_response['app']['public_id'],
+        resource_version_uuid=app_response['app_version']['public_id'],
+    )
+
+
+def validate_sqlite_v1(schema: SqliteV1DatabaseSchema, sqlite_file: Path):
+    actual_schema = get_actual_schema(sqlite_file)
+    print(schema)
+    print(actual_schema)
+    verify_schema(specification=schema, actual_schema=actual_schema)
```
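The validation helpers above compare a declared schema against what `PRAGMA table_info` and `PRAGMA foreign_key_list` report. A hedged sketch of calling them directly (the schema and file path are made up; the dict shape follows what `get_actual_schema()` builds):

```python
from pathlib import Path

from biolib._internal.data_record import validate_sqlite_v1
from biolib._internal.types.data_record import SqliteV1DatabaseSchema

# Hypothetical specification: tables -> columns -> {'type', 'nullable',
# optional 'foreign_key': {'table', 'column'}}.
schema: SqliteV1DatabaseSchema = {
    'tables': {
        'samples': {
            'columns': {
                'id': {'type': 'INTEGER', 'nullable': False},
                'name': {'type': 'TEXT', 'nullable': True},
            }
        }
    }
}

# Raises on a missing table/column or a type/nullability mismatch.
validate_sqlite_v1(schema=schema, sqlite_file=Path('my_record.sqlite'))
```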
biolib/_internal/data_record/remote_storage_endpoint.py
CHANGED

```diff
@@ -1,23 +1,34 @@
+import os
 from datetime import datetime, timedelta
+from urllib.parse import urlparse
 
 from biolib.api import client as api_client
-from biolib.biolib_api_client.lfs_types import
+from biolib.biolib_api_client.lfs_types import DataRecordVersion
 from biolib.biolib_binary_format.utils import RemoteEndpoint
 from biolib.biolib_logging import logger
+from biolib.typing_utils import Optional
 
 
 class DataRecordRemoteStorageEndpoint(RemoteEndpoint):
     def __init__(self, resource_version_uuid: str):
         self._resource_version_uuid: str = resource_version_uuid
-        self._expires_at = None
-        self._presigned_url = None
+        self._expires_at: Optional[datetime] = None
+        self._presigned_url: Optional[str] = None
 
-    def get_remote_url(self):
-        if not self._presigned_url or datetime.utcnow() > self._expires_at:
-            lfs_version:
+    def get_remote_url(self) -> str:
+        if not self._presigned_url or not self._expires_at or datetime.utcnow() > self._expires_at:
+            lfs_version: DataRecordVersion = api_client.get(
                 path=f'/lfs/versions/{self._resource_version_uuid}/',
             ).json()
-
+
+            app_caller_proxy_job_storage_base_url = os.getenv('BIOLIB_CLOUD_JOB_STORAGE_BASE_URL', '')
+            if app_caller_proxy_job_storage_base_url:
+                # Done to hit App Caller Proxy when downloading from inside an app
+                parsed_url = urlparse(lfs_version['presigned_download_url'])
+                self._presigned_url = f'{app_caller_proxy_job_storage_base_url}{parsed_url.path}?{parsed_url.query}'
+            else:
+                self._presigned_url = lfs_version['presigned_download_url']
+
             self._expires_at = datetime.utcnow() + timedelta(minutes=8)
             logger.debug(
                 f'DataRecord "{self._resource_version_uuid}" fetched presigned URL '
```