pybiolib 1.2.166__py3-none-any.whl → 1.2.174__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biolib/_data_record/data_record.py +66 -32
- biolib/_internal/types/file_node.py +17 -0
- biolib/compute_node/remote_host_proxy.py +9 -0
- {pybiolib-1.2.166.dist-info → pybiolib-1.2.174.dist-info}/METADATA +1 -1
- {pybiolib-1.2.166.dist-info → pybiolib-1.2.174.dist-info}/RECORD +8 -7
- {pybiolib-1.2.166.dist-info → pybiolib-1.2.174.dist-info}/LICENSE +0 -0
- {pybiolib-1.2.166.dist-info → pybiolib-1.2.174.dist-info}/WHEEL +0 -0
- {pybiolib-1.2.166.dist-info → pybiolib-1.2.174.dist-info}/entry_points.txt +0 -0
@@ -1,10 +1,9 @@
|
|
1
1
|
import os
|
2
2
|
from collections import namedtuple
|
3
3
|
from datetime import datetime
|
4
|
-
from fnmatch import fnmatch
|
5
4
|
from pathlib import Path
|
6
5
|
from struct import Struct
|
7
|
-
from typing import Callable, Dict, List, Optional, Union, cast
|
6
|
+
from typing import Callable, Dict, Iterable, List, Optional, Union, cast
|
8
7
|
|
9
8
|
from biolib import api
|
10
9
|
from biolib._internal import types
|
@@ -16,6 +15,7 @@ from biolib._internal.data_record.push_data import (
|
|
16
15
|
)
|
17
16
|
from biolib._internal.data_record.remote_storage_endpoint import DataRecordRemoteStorageEndpoint
|
18
17
|
from biolib._internal.http_client import HttpClient
|
18
|
+
from biolib._internal.types.file_node import ZipFileNodeDict
|
19
19
|
from biolib.api import client as api_client
|
20
20
|
from biolib.biolib_api_client import BiolibApiClient
|
21
21
|
from biolib.biolib_api_client.lfs_types import DataRecordInfo, DataRecordVersion, DataRecordVersionInfo
|
@@ -23,9 +23,8 @@ from biolib.biolib_binary_format import LazyLoadedFile
|
|
23
23
|
from biolib.biolib_binary_format.utils import RemoteIndexableBuffer
|
24
24
|
from biolib.biolib_logging import logger
|
25
25
|
from biolib.utils.app_uri import parse_app_uri
|
26
|
-
from biolib.utils.zip.remote_zip import RemoteZip
|
27
26
|
|
28
|
-
PathFilter = Union[str, Callable[[str], bool]]
|
27
|
+
PathFilter = Union[str, List[str], Callable[[str], bool]]
|
29
28
|
|
30
29
|
|
31
30
|
class DataRecord:
|
@@ -51,17 +50,25 @@ class DataRecord:
|
|
51
50
|
|
52
51
|
return uri_parsed['app_name']
|
53
52
|
|
54
|
-
def list_files(
|
55
|
-
|
56
|
-
|
53
|
+
def list_files(
|
54
|
+
self,
|
55
|
+
path_filter: Optional[PathFilter] = None,
|
56
|
+
max_count: Optional[int] = 100_000,
|
57
|
+
) -> List[LazyLoadedFile]:
|
58
|
+
files = list(
|
59
|
+
self._fetch_files(
|
60
|
+
path_filter=path_filter,
|
61
|
+
max_count=max_count + 1 if max_count is not None else None,
|
62
|
+
)
|
57
63
|
)
|
58
|
-
files: List[LazyLoadedFile] = []
|
59
|
-
with RemoteZip(url=remote_storage_endpoint.get_remote_url()) as remote_zip:
|
60
|
-
central_directory = remote_zip.get_central_directory()
|
61
|
-
for file_info in central_directory.values():
|
62
|
-
files.append(self._get_file(remote_storage_endpoint, file_info))
|
63
64
|
|
64
|
-
|
65
|
+
if max_count is not None and len(files) > max_count:
|
66
|
+
raise Exception(
|
67
|
+
f'list_files returned more than {max_count} files. '
|
68
|
+
f'Please set the keyword argument "max_count" to a higher number.'
|
69
|
+
)
|
70
|
+
|
71
|
+
return files
|
65
72
|
|
66
73
|
def download_zip(self, output_path: str):
|
67
74
|
remote_storage_endpoint = DataRecordRemoteStorageEndpoint(
|
@@ -188,8 +195,50 @@ class DataRecord:
|
|
188
195
|
for result in results
|
189
196
|
]
|
190
197
|
|
198
|
+
def _fetch_files(
|
199
|
+
self,
|
200
|
+
max_count: Optional[int],
|
201
|
+
path_filter: Optional[PathFilter] = None,
|
202
|
+
) -> Iterable[LazyLoadedFile]:
|
203
|
+
if path_filter and not (isinstance(path_filter, (str, list)) or callable(path_filter)):
|
204
|
+
raise Exception('Expected path_filter to be a string, a list of strings or a function')
|
205
|
+
|
206
|
+
path_filters = (
|
207
|
+
[path_filter] if isinstance(path_filter, str) else path_filter if isinstance(path_filter, list) else []
|
208
|
+
)
|
209
|
+
|
210
|
+
resource_version_uuid = self._state['resource_version_uuid']
|
211
|
+
remote_storage_endpoint = DataRecordRemoteStorageEndpoint(resource_version_uuid)
|
212
|
+
|
213
|
+
page: Optional[int] = 1
|
214
|
+
yielded_files: int = 0
|
215
|
+
while page:
|
216
|
+
response = api.client.post(
|
217
|
+
path=f'/proxy/files/data-record-versions/{resource_version_uuid}/query/',
|
218
|
+
data=dict(page=page, page_size=1_000, path_filters=path_filters),
|
219
|
+
).json()
|
220
|
+
|
221
|
+
for file_node_dict in cast(List[ZipFileNodeDict], response['results']):
|
222
|
+
if file_node_dict['is_dir']:
|
223
|
+
continue
|
224
|
+
|
225
|
+
if callable(path_filter) and not path_filter(file_node_dict['dir_path'] + file_node_dict['name']):
|
226
|
+
continue
|
227
|
+
|
228
|
+
yield self._get_file(remote_storage_endpoint, file_node_dict)
|
229
|
+
yielded_files += 1
|
230
|
+
|
231
|
+
if max_count is not None and yielded_files >= max_count:
|
232
|
+
page = None
|
233
|
+
break
|
234
|
+
|
235
|
+
page = page + 1 if page is not None and response['page_count'] > page else None
|
236
|
+
|
191
237
|
@staticmethod
|
192
|
-
def _get_file(
|
238
|
+
def _get_file(
|
239
|
+
remote_storage_endpoint: DataRecordRemoteStorageEndpoint,
|
240
|
+
file_node_dict: ZipFileNodeDict,
|
241
|
+
) -> LazyLoadedFile:
|
193
242
|
local_file_header_signature_bytes = b'\x50\x4b\x03\x04'
|
194
243
|
local_file_header_struct = Struct('<H2sHHHIIIHH')
|
195
244
|
LocalFileHeader = namedtuple(
|
@@ -208,7 +257,7 @@ class DataRecord:
|
|
208
257
|
),
|
209
258
|
)
|
210
259
|
|
211
|
-
local_file_header_start =
|
260
|
+
local_file_header_start = file_node_dict['zip_meta']['header_start'] + len(local_file_header_signature_bytes)
|
212
261
|
local_file_header_end = local_file_header_start + local_file_header_struct.size
|
213
262
|
|
214
263
|
def file_start_func() -> int:
|
@@ -227,26 +276,11 @@ class DataRecord:
|
|
227
276
|
|
228
277
|
return LazyLoadedFile(
|
229
278
|
buffer=RemoteIndexableBuffer(endpoint=remote_storage_endpoint),
|
230
|
-
length=
|
231
|
-
path=
|
279
|
+
length=file_node_dict['zip_meta']['size_on_disk'],
|
280
|
+
path=file_node_dict['dir_path'] + file_node_dict['name'],
|
232
281
|
start=None,
|
233
282
|
start_func=file_start_func,
|
234
283
|
)
|
235
284
|
|
236
|
-
@staticmethod
|
237
|
-
def _get_filtered_files(files: List[LazyLoadedFile], path_filter: PathFilter) -> List[LazyLoadedFile]:
|
238
|
-
if not (isinstance(path_filter, str) or callable(path_filter)):
|
239
|
-
raise Exception('Expected path_filter to be a string or a function')
|
240
|
-
|
241
|
-
if callable(path_filter):
|
242
|
-
return list(filter(lambda x: path_filter(x.path), files)) # type: ignore
|
243
|
-
|
244
|
-
glob_filter = cast(str, path_filter)
|
245
|
-
|
246
|
-
def _filter_function(file: LazyLoadedFile) -> bool:
|
247
|
-
return fnmatch(file.path, glob_filter)
|
248
|
-
|
249
|
-
return list(filter(_filter_function, files))
|
250
|
-
|
251
285
|
def _get_detailed_dict(self) -> types.DataRecordDetailedDict:
|
252
286
|
return cast(types.DataRecordDetailedDict, api_client.get(f'/resources/data-records/{self.uuid}/').json())
|
@@ -0,0 +1,17 @@
|
|
1
|
+
from .typing import TypedDict
|
2
|
+
|
3
|
+
|
4
|
+
class FileZipMetadataDict(TypedDict):
|
5
|
+
header_start: int
|
6
|
+
size_on_disk: int
|
7
|
+
|
8
|
+
|
9
|
+
class FileNodeDict(TypedDict):
|
10
|
+
dir_path: str
|
11
|
+
is_dir: bool
|
12
|
+
name: str
|
13
|
+
size: int
|
14
|
+
|
15
|
+
|
16
|
+
class ZipFileNodeDict(FileNodeDict):
|
17
|
+
zip_meta: FileZipMetadataDict
|
@@ -309,6 +309,15 @@ http {{
|
|
309
309
|
proxy_ssl_server_name on;
|
310
310
|
}}
|
311
311
|
|
312
|
+
location /api/proxy/files/data-record-versions/ {{
|
313
|
+
proxy_pass https://$upstream_hostname$request_uri;
|
314
|
+
proxy_set_header authorization "";
|
315
|
+
proxy_set_header compute-node-auth-token "{compute_node_auth_token}";
|
316
|
+
proxy_set_header job-uuid "{self._job_uuid}";
|
317
|
+
proxy_set_header cookie "";
|
318
|
+
proxy_ssl_server_name on;
|
319
|
+
}}
|
320
|
+
|
312
321
|
location /api/ {{
|
313
322
|
proxy_pass https://$upstream_hostname$request_uri;
|
314
323
|
proxy_set_header authorization "";
|
@@ -1,7 +1,7 @@
|
|
1
1
|
LICENSE,sha256=F2h7gf8i0agDIeWoBPXDMYScvQOz02pAWkKhTGOHaaw,1067
|
2
2
|
PYPI_README.md,sha256=_IH7pxFiqy2bIAmaVeA-iVTyUwWRjMIlfgtUbYTtmls,368
|
3
3
|
biolib/__init__.py,sha256=4Rfa0AJKztqkCG5D67kFgSwdOUiBTa5HkAzIOqHNREU,4431
|
4
|
-
biolib/_data_record/data_record.py,sha256=
|
4
|
+
biolib/_data_record/data_record.py,sha256=zKvnh5T-dIVY46-kgVzMBoZ666ZhcTCFQnWvZT0D6RM,12026
|
5
5
|
biolib/_internal/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
6
6
|
biolib/_internal/data_record/__init__.py,sha256=fGdME6JGRU_2VxpJbYpGXYndjN-feUkmKY4fuMyq3cg,76
|
7
7
|
biolib/_internal/data_record/data_record.py,sha256=g_-jdy5-Zem3dthwxJj2OuQqkDGTyc-iGqN1rtYYD1A,4418
|
@@ -21,6 +21,7 @@ biolib/_internal/types/__init__.py,sha256=xLgOQJFh3GRtiqIJq7MaqHReZx4pp34_zcaFQ_
|
|
21
21
|
biolib/_internal/types/app.py,sha256=Mz2QGD_jESX-K9JYnLWPo4YA__Q_1FQQTk9pvidCohU,118
|
22
22
|
biolib/_internal/types/data_record.py,sha256=9r_vdhVs60YTnzU4XQFXfDrfS2P2MqD3BH2xa7lk6ck,852
|
23
23
|
biolib/_internal/types/experiment.py,sha256=D94iBdn2nS92lRW-TOs1a2WKXJD5ZtmzL4ypggKX2ys,176
|
24
|
+
biolib/_internal/types/file_node.py,sha256=T6BIqo662f3nwMBRqtBHYsg6YuuUaKpiokHcVjv9_ME,283
|
24
25
|
biolib/_internal/types/resource.py,sha256=G-vPkZoe4Um6FPxsQZtRzAlbSW5sDW4NFkbjn21I3V4,372
|
25
26
|
biolib/_internal/types/resource_version.py,sha256=sLxViYXloDDUhTDFgjegiQCj097OM1Ih1-uqlC_4ULA,174
|
26
27
|
biolib/_internal/types/typing.py,sha256=D4EKKEe7kDx0K6lJi-H_XLtk-8w6nu2fdqn9bvzI-Xo,288
|
@@ -89,7 +90,7 @@ biolib/compute_node/job_worker/large_file_system.py,sha256=XXqRlVtYhs-Ji9zQGIk5K
|
|
89
90
|
biolib/compute_node/job_worker/mappings.py,sha256=Z48Kg4nbcOvsT2-9o3RRikBkqflgO4XeaWxTGz-CNvI,2499
|
90
91
|
biolib/compute_node/job_worker/utilization_reporter_thread.py,sha256=7tm5Yk9coqJ9VbEdnO86tSXI0iM0omwIyKENxdxiVXk,8575
|
91
92
|
biolib/compute_node/job_worker/utils.py,sha256=wgxcIA8yAhUPdCwyvuuJ0JmreyWmmUoBO33vWtG60xg,1282
|
92
|
-
biolib/compute_node/remote_host_proxy.py,sha256=
|
93
|
+
biolib/compute_node/remote_host_proxy.py,sha256=1EZnB0WWG359Yy22xBoqRqGEmmlA12rpiG5u8B2381M,16533
|
93
94
|
biolib/compute_node/socker_listener_thread.py,sha256=T5_UikA3MB9bD5W_dckYLPTgixh72vKUlgbBvj9dbM0,1601
|
94
95
|
biolib/compute_node/socket_sender_thread.py,sha256=YgamPHeUm2GjMFGx8qk-99WlZhEs-kAb3q_2O6qByig,971
|
95
96
|
biolib/compute_node/utils.py,sha256=M7i_WTyxbFM3Lri9RWZ_8FeQNYrQIWpKGLfp2I55oeY,4677
|
@@ -119,8 +120,8 @@ biolib/utils/cache_state.py,sha256=u256F37QSRIVwqKlbnCyzAX4EMI-kl6Dwu6qwj-Qmag,3
|
|
119
120
|
biolib/utils/multipart_uploader.py,sha256=XvGP1I8tQuKhAH-QugPRoEsCi9qvbRk-DVBs5PNwwJo,8452
|
120
121
|
biolib/utils/seq_util.py,sha256=Ozk0blGtPur_D9MwShD02r_mphyQmgZkx-lOHOwnlIM,6730
|
121
122
|
biolib/utils/zip/remote_zip.py,sha256=0wErYlxir5921agfFeV1xVjf29l9VNgGQvNlWOlj2Yc,23232
|
122
|
-
pybiolib-1.2.
|
123
|
-
pybiolib-1.2.
|
124
|
-
pybiolib-1.2.
|
125
|
-
pybiolib-1.2.
|
126
|
-
pybiolib-1.2.
|
123
|
+
pybiolib-1.2.174.dist-info/LICENSE,sha256=F2h7gf8i0agDIeWoBPXDMYScvQOz02pAWkKhTGOHaaw,1067
|
124
|
+
pybiolib-1.2.174.dist-info/METADATA,sha256=whtDRAsVaAK8ZdCekzmffyOrdtKgkagocbSclm4sjrc,1507
|
125
|
+
pybiolib-1.2.174.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
126
|
+
pybiolib-1.2.174.dist-info/entry_points.txt,sha256=p6DyaP_2kctxegTX23WBznnrDi4mz6gx04O5uKtRDXg,42
|
127
|
+
pybiolib-1.2.174.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|