pybiolib 1.1.2145__py3-none-any.whl → 1.1.2155__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biolib/_data_record/data_record.py +68 -11
- biolib/_internal/data_record/__init__.py +1 -1
- biolib/_internal/data_record/data_record.py +2 -63
- {pybiolib-1.1.2145.dist-info → pybiolib-1.1.2155.dist-info}/METADATA +1 -1
- {pybiolib-1.1.2145.dist-info → pybiolib-1.1.2155.dist-info}/RECORD +8 -8
- {pybiolib-1.1.2145.dist-info → pybiolib-1.1.2155.dist-info}/LICENSE +0 -0
- {pybiolib-1.1.2145.dist-info → pybiolib-1.1.2155.dist-info}/WHEEL +0 -0
- {pybiolib-1.1.2145.dist-info → pybiolib-1.1.2155.dist-info}/entry_points.txt +0 -0
@@ -5,21 +5,24 @@ from fnmatch import fnmatch
|
|
5
5
|
from struct import Struct
|
6
6
|
from typing import Callable, Dict, List, Union, cast
|
7
7
|
|
8
|
-
from biolib import api
|
8
|
+
from biolib import api, utils
|
9
9
|
from biolib._internal import types
|
10
|
-
from biolib._internal.data_record import get_data_record_state_from_uri
|
10
|
+
from biolib._internal.data_record import get_data_record_state_from_uri
|
11
11
|
from biolib._internal.data_record.remote_storage_endpoint import DataRecordRemoteStorageEndpoint
|
12
|
+
from biolib._internal.file_utils import get_files_and_size_of_directory, get_iterable_zip_stream
|
12
13
|
from biolib._internal.http_client import HttpClient
|
13
14
|
from biolib.api import client as api_client
|
14
15
|
from biolib.biolib_api_client import BiolibApiClient
|
15
|
-
from biolib.biolib_api_client.lfs_types import DataRecordInfo, DataRecordVersionInfo
|
16
|
+
from biolib.biolib_api_client.lfs_types import DataRecordInfo, DataRecordVersion, DataRecordVersionInfo
|
16
17
|
from biolib.biolib_binary_format import LazyLoadedFile
|
17
18
|
from biolib.biolib_binary_format.utils import RemoteIndexableBuffer
|
19
|
+
from biolib.biolib_errors import BioLibError
|
18
20
|
from biolib.biolib_logging import logger
|
19
21
|
from biolib.typing_utils import Optional as _Optional
|
20
22
|
from biolib.utils.app_uri import parse_app_uri
|
21
23
|
from biolib.utils.zip.remote_zip import RemoteZip
|
22
24
|
|
25
|
+
|
23
26
|
PathFilter = Union[str, Callable[[str], bool]]
|
24
27
|
|
25
28
|
|
@@ -83,10 +86,63 @@ class DataRecord:
|
|
83
86
|
|
84
87
|
def update(self, data_path: str, chunk_size_in_mb: _Optional[int] = None) -> None:
|
85
88
|
assert os.path.isdir(data_path), f'The path "{data_path}" is not a directory.'
|
86
|
-
|
87
|
-
|
89
|
+
BiolibApiClient.assert_is_signed_in(authenticated_action_description='push data to a Data Record')
|
90
|
+
|
91
|
+
if os.path.realpath(data_path) == '/':
|
92
|
+
raise BioLibError('Pushing your root directory is not possible')
|
93
|
+
|
94
|
+
original_working_dir = os.getcwd()
|
95
|
+
os.chdir(data_path)
|
96
|
+
files_to_zip, data_size_in_bytes = get_files_and_size_of_directory(directory=os.getcwd())
|
97
|
+
|
98
|
+
if data_size_in_bytes > 4_500_000_000_000:
|
99
|
+
raise BioLibError('Attempted to push directory with a size larger than the limit of 4.5 TB')
|
100
|
+
|
101
|
+
# validate data record
|
102
|
+
detailed_dict: types.DataRecordDetailedDict = self._get_detailed_dict()
|
103
|
+
if detailed_dict['type']:
|
104
|
+
# only validate if data record has a type
|
105
|
+
data_record_type: types.DataRecordTypeDict = detailed_dict['type']
|
106
|
+
logger.info(f"Validating data record of type {data_record_type['name']}")
|
107
|
+
for rule in data_record_type['validation_rules']:
|
108
|
+
logger.info(f"Validating rule {rule['type']} for {rule['path']}...")
|
109
|
+
|
110
|
+
min_chunk_size_bytes = 10_000_000
|
111
|
+
chunk_size_in_bytes: int
|
112
|
+
if chunk_size_in_mb:
|
113
|
+
chunk_size_in_bytes = chunk_size_in_mb * 1_000_000 # Convert megabytes to bytes
|
114
|
+
if chunk_size_in_bytes < min_chunk_size_bytes:
|
115
|
+
logger.warning('Specified chunk size is too small, using minimum of 10 MB instead.')
|
116
|
+
chunk_size_in_bytes = min_chunk_size_bytes
|
117
|
+
else:
|
118
|
+
# Calculate chunk size based on max chunk count of 10_000, using 9_000 to be on the safe side
|
119
|
+
chunk_size_in_bytes = max(min_chunk_size_bytes, int(data_size_in_bytes / 9_000))
|
120
|
+
|
121
|
+
data_size_in_mb = round(data_size_in_bytes / 10**6)
|
122
|
+
logger.info(f'Zipping {len(files_to_zip)} files, in total ~{data_size_in_mb}mb of data')
|
123
|
+
|
124
|
+
response = api.client.post(path='/lfs/versions/', data={'resource_uuid': self._state['resource_uuid']})
|
125
|
+
data_record_version: DataRecordVersion = response.json()
|
126
|
+
iterable_zip_stream = get_iterable_zip_stream(files=files_to_zip, chunk_size=chunk_size_in_bytes)
|
127
|
+
|
128
|
+
multipart_uploader = utils.MultiPartUploader(
|
129
|
+
use_process_pool=True,
|
130
|
+
get_presigned_upload_url_request=dict(
|
131
|
+
headers=None,
|
132
|
+
requires_biolib_auth=True,
|
133
|
+
path=f"/lfs/versions/{data_record_version['uuid']}/presigned_upload_url/",
|
134
|
+
),
|
135
|
+
complete_upload_request=dict(
|
136
|
+
headers=None,
|
137
|
+
requires_biolib_auth=True,
|
138
|
+
path=f"/lfs/versions/{data_record_version['uuid']}/complete_upload/",
|
139
|
+
),
|
88
140
|
)
|
89
|
-
|
141
|
+
|
142
|
+
multipart_uploader.upload(payload_iterator=iterable_zip_stream, payload_size_in_bytes=data_size_in_bytes)
|
143
|
+
os.chdir(original_working_dir)
|
144
|
+
logger.info(f"Successfully pushed a new Data Record version '{data_record_version['uri']}'")
|
145
|
+
self._state = get_data_record_state_from_uri(data_record_version['uri'])
|
90
146
|
|
91
147
|
@staticmethod
|
92
148
|
def get_by_uri(uri: str) -> 'DataRecord':
|
@@ -112,14 +168,15 @@ class DataRecord:
|
|
112
168
|
'name': uri_parsed['app_name'],
|
113
169
|
},
|
114
170
|
)
|
115
|
-
|
116
|
-
logger.info(f"Successfully created new Data Record '{
|
171
|
+
data_record_info: DataRecordInfo = response.json()
|
172
|
+
logger.info(f"Successfully created new Data Record '{data_record_info['uri']}'")
|
117
173
|
|
118
174
|
if data_path is not None:
|
119
|
-
|
120
|
-
|
175
|
+
data_record = DataRecord.get_by_uri(uri=data_record_info['uri'])
|
176
|
+
data_record.update(data_path=data_path)
|
177
|
+
return data_record
|
121
178
|
else:
|
122
|
-
return DataRecord.get_by_uri(uri=
|
179
|
+
return DataRecord.get_by_uri(uri=data_record_info['uri'])
|
123
180
|
|
124
181
|
@staticmethod
|
125
182
|
def fetch(uri: _Optional[str] = None, count: _Optional[int] = None) -> List['DataRecord']:
|
@@ -1 +1 @@
|
|
1
|
-
from .data_record import get_data_record_state_from_uri
|
1
|
+
from .data_record import get_data_record_state_from_uri
|
@@ -1,67 +1,6 @@
|
|
1
|
-
import os
|
2
|
-
from typing import Optional
|
3
|
-
|
4
|
-
from biolib import api, utils
|
5
|
-
from biolib._internal.file_utils import get_files_and_size_of_directory, get_iterable_zip_stream
|
6
1
|
from biolib.api import client as api_client
|
7
|
-
from biolib.biolib_api_client import AppGetResponse
|
8
|
-
from biolib.biolib_api_client.lfs_types import
|
9
|
-
from biolib.biolib_errors import BioLibError
|
10
|
-
from biolib.biolib_logging import logger
|
11
|
-
|
12
|
-
|
13
|
-
def push_data_record_version(data_record_uuid: str, input_dir: str, chunk_size_in_mb: Optional[int] = None) -> str:
|
14
|
-
BiolibApiClient.assert_is_signed_in(authenticated_action_description='push data to a Data Record')
|
15
|
-
|
16
|
-
if not os.path.isdir(input_dir):
|
17
|
-
raise BioLibError(f'Could not find folder at {input_dir}')
|
18
|
-
|
19
|
-
if os.path.realpath(input_dir) == '/':
|
20
|
-
raise BioLibError('Pushing your root directory is not possible')
|
21
|
-
|
22
|
-
original_working_dir = os.getcwd()
|
23
|
-
os.chdir(input_dir)
|
24
|
-
files_to_zip, data_size_in_bytes = get_files_and_size_of_directory(directory=os.getcwd())
|
25
|
-
|
26
|
-
if data_size_in_bytes > 4_500_000_000_000:
|
27
|
-
raise BioLibError('Attempted to push directory with a size larger than the limit of 4.5 TB')
|
28
|
-
|
29
|
-
min_chunk_size_bytes = 10_000_000
|
30
|
-
chunk_size_in_bytes: int
|
31
|
-
if chunk_size_in_mb:
|
32
|
-
chunk_size_in_bytes = chunk_size_in_mb * 1_000_000 # Convert megabytes to bytes
|
33
|
-
if chunk_size_in_bytes < min_chunk_size_bytes:
|
34
|
-
logger.warning('Specified chunk size is too small, using minimum of 10 MB instead.')
|
35
|
-
chunk_size_in_bytes = min_chunk_size_bytes
|
36
|
-
else:
|
37
|
-
# Calculate chunk size based on max chunk count of 10_000, using 9_000 to be on the safe side
|
38
|
-
chunk_size_in_bytes = max(min_chunk_size_bytes, int(data_size_in_bytes / 9_000))
|
39
|
-
|
40
|
-
data_size_in_mb = round(data_size_in_bytes / 10**6)
|
41
|
-
print(f'Zipping {len(files_to_zip)} files, in total ~{data_size_in_mb}mb of data')
|
42
|
-
|
43
|
-
response = api.client.post(path='/lfs/versions/', data={'resource_uuid': data_record_uuid})
|
44
|
-
data_record_version: DataRecordVersion = response.json()
|
45
|
-
iterable_zip_stream = get_iterable_zip_stream(files=files_to_zip, chunk_size=chunk_size_in_bytes)
|
46
|
-
|
47
|
-
multipart_uploader = utils.MultiPartUploader(
|
48
|
-
use_process_pool=True,
|
49
|
-
get_presigned_upload_url_request=dict(
|
50
|
-
headers=None,
|
51
|
-
requires_biolib_auth=True,
|
52
|
-
path=f"/lfs/versions/{data_record_version['uuid']}/presigned_upload_url/",
|
53
|
-
),
|
54
|
-
complete_upload_request=dict(
|
55
|
-
headers=None,
|
56
|
-
requires_biolib_auth=True,
|
57
|
-
path=f"/lfs/versions/{data_record_version['uuid']}/complete_upload/",
|
58
|
-
),
|
59
|
-
)
|
60
|
-
|
61
|
-
multipart_uploader.upload(payload_iterator=iterable_zip_stream, payload_size_in_bytes=data_size_in_bytes)
|
62
|
-
os.chdir(original_working_dir)
|
63
|
-
logger.info(f"Successfully pushed a new Data Record version '{data_record_version['uri']}'")
|
64
|
-
return data_record_version['uri']
|
2
|
+
from biolib.biolib_api_client import AppGetResponse
|
3
|
+
from biolib.biolib_api_client.lfs_types import DataRecordVersionInfo
|
65
4
|
|
66
5
|
|
67
6
|
def get_data_record_state_from_uri(uri) -> 'DataRecordVersionInfo':
|
@@ -1,10 +1,10 @@
|
|
1
1
|
LICENSE,sha256=F2h7gf8i0agDIeWoBPXDMYScvQOz02pAWkKhTGOHaaw,1067
|
2
2
|
README.md,sha256=_IH7pxFiqy2bIAmaVeA-iVTyUwWRjMIlfgtUbYTtmls,368
|
3
3
|
biolib/__init__.py,sha256=_tThyzISH81yS9KXP_X3qEiKXmsIp5XOBcJIODfLVnc,4338
|
4
|
-
biolib/_data_record/data_record.py,sha256=
|
4
|
+
biolib/_data_record/data_record.py,sha256=Sud8yXz7yR6YW4V6OqE7nO6I4a0TdqijmMTZwwU59j8,12152
|
5
5
|
biolib/_internal/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
6
|
-
biolib/_internal/data_record/__init__.py,sha256=
|
7
|
-
biolib/_internal/data_record/data_record.py,sha256=
|
6
|
+
biolib/_internal/data_record/__init__.py,sha256=wLOy3Pb7dWYik5eQtQM00DH2AWC-M5RbTjwh9InPiqo,56
|
7
|
+
biolib/_internal/data_record/data_record.py,sha256=If4SQj-XwKSPzCpaWA01LEGKalZ6DEjD5PJZRtl3Mao,556
|
8
8
|
biolib/_internal/data_record/remote_storage_endpoint.py,sha256=eCptuZ4DMAPnaNCVDvpWXwXGI6Jac9U1N5dqU8Cj95Q,1732
|
9
9
|
biolib/_internal/file_utils.py,sha256=4jT6j7bB21c0JNn5BfnyWQib_zt0CVtJ_TiOFOStRcE,2604
|
10
10
|
biolib/_internal/fuse_mount/__init__.py,sha256=B_tM6RM2dBw-vbpoHJC4X3tOAaN1H2RDvqYJOw3xFwg,55
|
@@ -116,8 +116,8 @@ biolib/utils/cache_state.py,sha256=u256F37QSRIVwqKlbnCyzAX4EMI-kl6Dwu6qwj-Qmag,3
|
|
116
116
|
biolib/utils/multipart_uploader.py,sha256=XvGP1I8tQuKhAH-QugPRoEsCi9qvbRk-DVBs5PNwwJo,8452
|
117
117
|
biolib/utils/seq_util.py,sha256=jC5WhH63FTD7SLFJbxQGA2hOt9NTwq9zHl_BEec1Z0c,4907
|
118
118
|
biolib/utils/zip/remote_zip.py,sha256=0wErYlxir5921agfFeV1xVjf29l9VNgGQvNlWOlj2Yc,23232
|
119
|
-
pybiolib-1.1.
|
120
|
-
pybiolib-1.1.
|
121
|
-
pybiolib-1.1.
|
122
|
-
pybiolib-1.1.
|
123
|
-
pybiolib-1.1.
|
119
|
+
pybiolib-1.1.2155.dist-info/LICENSE,sha256=F2h7gf8i0agDIeWoBPXDMYScvQOz02pAWkKhTGOHaaw,1067
|
120
|
+
pybiolib-1.1.2155.dist-info/METADATA,sha256=tu73TYRGUjbJ3MfO15gYq2UO81SYCy508-wwmoHN3WQ,1508
|
121
|
+
pybiolib-1.1.2155.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
122
|
+
pybiolib-1.1.2155.dist-info/entry_points.txt,sha256=p6DyaP_2kctxegTX23WBznnrDi4mz6gx04O5uKtRDXg,42
|
123
|
+
pybiolib-1.1.2155.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|