pybiolib 1.1.2038__py3-none-any.whl → 1.1.2097__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biolib/__init__.py +5 -1
- biolib/_data_record/data_record.py +208 -0
- biolib/_internal/data_record/__init__.py +1 -1
- biolib/_internal/data_record/data_record.py +67 -163
- biolib/_internal/data_record/remote_storage_endpoint.py +2 -2
- biolib/_internal/file_utils.py +77 -0
- biolib/_internal/lfs/__init__.py +1 -0
- biolib/_internal/push_application.py +1 -1
- biolib/_internal/runtime.py +1 -78
- biolib/_runtime/runtime.py +79 -0
- biolib/biolib_api_client/api_client.py +1 -1
- biolib/biolib_api_client/app_types.py +1 -0
- biolib/biolib_api_client/lfs_types.py +8 -2
- biolib/cli/data_record.py +42 -6
- biolib/cli/lfs.py +10 -6
- biolib/runtime/__init__.py +1 -1
- biolib/sdk/__init__.py +9 -5
- {pybiolib-1.1.2038.dist-info → pybiolib-1.1.2097.dist-info}/METADATA +1 -1
- {pybiolib-1.1.2038.dist-info → pybiolib-1.1.2097.dist-info}/RECORD +23 -21
- biolib/lfs/__init__.py +0 -4
- biolib/lfs/utils.py +0 -153
- /biolib/{lfs → _internal/lfs}/cache.py +0 -0
- {pybiolib-1.1.2038.dist-info → pybiolib-1.1.2097.dist-info}/LICENSE +0 -0
- {pybiolib-1.1.2038.dist-info → pybiolib-1.1.2097.dist-info}/WHEEL +0 -0
- {pybiolib-1.1.2038.dist-info → pybiolib-1.1.2097.dist-info}/entry_points.txt +0 -0
biolib/__init__.py
CHANGED
@@ -13,7 +13,7 @@ from biolib.biolib_api_client import BiolibApiClient as _BioLibApiClient, App
 from biolib.jobs import Job as _Job
 from biolib import user as _user
 from biolib.typing_utils import List, Optional
-from biolib.
+from biolib._data_record.data_record import DataRecord as _DataRecord

 import biolib.api
 import biolib.app
@@ -45,6 +45,10 @@ def get_job(job_id: str) -> _Job:
     return _Job.create_from_uuid(uuid=job_id)


+def get_data_record(uri: str) -> _DataRecord:
+    return _DataRecord.get_by_uri(uri)
+
+
 def fetch_jobs(count: int = 25) -> List[_Job]:
     return _Job.fetch_jobs(count)

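The hunk above adds a top-level helper that resolves a Data Record by URI. A minimal usage sketch (the URI is a placeholder and assumes the caller has access to the record):

    import biolib

    record = biolib.get_data_record('account-handle/my-data-record')  # hypothetical URI
    print(record.uri)
    for file in record.list_files():
        print(file.path, file.length)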
biolib/_data_record/data_record.py
ADDED
@@ -0,0 +1,208 @@
+from biolib import api
+from biolib._internal.data_record import get_data_record_state_from_uri, push_data_record_version
+from biolib._internal.data_record.remote_storage_endpoint import DataRecordRemoteStorageEndpoint
+from biolib._internal.http_client import HttpClient
+from biolib.api import client as api_client
+from biolib.biolib_api_client import BiolibApiClient
+from biolib.biolib_api_client.lfs_types import DataRecordInfo, DataRecordVersionInfo
+from biolib.biolib_binary_format import LazyLoadedFile
+from biolib.biolib_binary_format.utils import RemoteIndexableBuffer
+from biolib.biolib_logging import logger
+from biolib.typing_utils import Optional as _Optional
+from biolib.utils.app_uri import parse_app_uri
+from biolib.utils.zip.remote_zip import RemoteZip
+
+
+import os
+from collections import namedtuple
+from datetime import datetime
+from fnmatch import fnmatch
+from struct import Struct
+from typing import Callable, Dict, List, cast, Union
+
+PathFilter = Union[str, Callable[[str], bool]]
+
+
+class DataRecord:
+    def __init__(self, _internal_state: DataRecordVersionInfo):
+        self._state = _internal_state
+
+    def __repr__(self):
+        return f'DataRecord: {self._state["resource_uri"]}'
+
+    @property
+    def uri(self) -> str:
+        return self._state['resource_uri']
+
+    @property
+    def name(self) -> str:
+        uri_parsed = parse_app_uri(self._state['resource_uri'], use_account_as_name_default=False)
+        if not uri_parsed['app_name']:
+            raise ValueError('Expected parameter "resource_uri" to contain resource name')
+
+        return uri_parsed['app_name']
+
+    def list_files(self, path_filter: _Optional[PathFilter] = None) -> List[LazyLoadedFile]:
+        remote_storage_endpoint = DataRecordRemoteStorageEndpoint(
+            resource_version_uuid=self._state['resource_version_uuid'],
+        )
+        files: List[LazyLoadedFile] = []
+        with RemoteZip(url=remote_storage_endpoint.get_remote_url()) as remote_zip:
+            central_directory = remote_zip.get_central_directory()
+            for file_info in central_directory.values():
+                files.append(self._get_file(remote_storage_endpoint, file_info))
+
+        return self._get_filtered_files(files=files, path_filter=path_filter) if path_filter else files
+
+    def download_zip(self, output_path: str):
+        remote_storage_endpoint = DataRecordRemoteStorageEndpoint(
+            resource_version_uuid=self._state['resource_version_uuid'],
+        )
+        HttpClient.request(url=remote_storage_endpoint.get_remote_url(), response_path=output_path)
+
+    def download_files(self, output_dir: str, path_filter: _Optional[PathFilter] = None) -> None:
+        filtered_files = self.list_files(path_filter=path_filter)
+
+        if len(filtered_files) == 0:
+            logger.debug('No files to save')
+            return
+
+        for file in filtered_files:
+            file_path = os.path.join(output_dir, file.path)
+            os.makedirs(os.path.dirname(file_path), exist_ok=True)
+            with open(file_path, mode='wb') as file_handle:
+                for chunk in file.get_data_iterator():
+                    file_handle.write(chunk)
+
+    def save_files(self, output_dir: str, path_filter: _Optional[PathFilter] = None) -> None:
+        self.download_files(output_dir=output_dir, path_filter=path_filter)
+
+    def update(self, data_path: str, chunk_size_in_mb: _Optional[int] = None) -> None:
+        assert os.path.isdir(data_path), f'The path "{data_path}" is not a directory.'
+        uri = push_data_record_version(
+            data_record_uuid=self._state['resource_uuid'], input_dir=data_path, chunk_size_in_mb=chunk_size_in_mb
+        )
+        self._state = get_data_record_state_from_uri(uri)
+
+    @staticmethod
+    def get_by_uri(uri: str) -> 'DataRecord':
+        return DataRecord(_internal_state=get_data_record_state_from_uri(uri))
+
+    @staticmethod
+    def create(destination: str, data_path: _Optional[str] = None) -> 'DataRecord':
+        BiolibApiClient.assert_is_signed_in(authenticated_action_description='create a Data Record')
+        if data_path is not None:
+            assert os.path.isdir(data_path), f'The path "{data_path}" is not a directory.'
+        uri_parsed = parse_app_uri(destination, use_account_as_name_default=False)
+        if uri_parsed['app_name_normalized']:
+            data_record_uri = destination
+        else:
+            record_name = 'data-record-' + datetime.now().isoformat().split('.')[0].replace(':', '-')
+            data_record_uri = f'{destination}/{record_name}'
+
+        uri_parsed = parse_app_uri(data_record_uri)
+        response = api.client.post(
+            path='/lfs/',
+            data={
+                'account_handle': uri_parsed['account_handle_normalized'],
+                'name': uri_parsed['app_name'],
+            },
+        )
+        data_record: DataRecordInfo = response.json()
+        logger.info(f"Successfully created new Data Record '{data_record['uri']}'")
+
+        if data_path is not None:
+            record_version_uri = push_data_record_version(data_record_uuid=data_record['uuid'], input_dir=data_path)
+            return DataRecord.get_by_uri(uri=record_version_uri)
+        else:
+            return DataRecord.get_by_uri(uri=data_record_uri)
+
+    @staticmethod
+    def fetch(uri: _Optional[str] = None, count: _Optional[int] = None) -> List['DataRecord']:
+        max_page_size = 1_000
+        params: Dict[str, Union[str, int]] = {
+            'page_size': str(count or max_page_size),
+            'resource_type': 'data-record',
+        }
+        if uri:
+            uri_parsed = parse_app_uri(uri, use_account_as_name_default=False)
+            params['account_handle'] = uri_parsed['account_handle_normalized']
+            if uri_parsed['app_name_normalized']:
+                params['app_name'] = uri_parsed['app_name_normalized']
+
+        results = api_client.get(path='/apps/', params=params).json()['results']
+        if count is None and len(results) == max_page_size:
+            logger.warning(
+                f'Fetch results exceeded maximum count of {max_page_size}. Some data records might not be fetched.'
+            )
+
+        return [
+            DataRecord(
+                _internal_state={
+                    'resource_uri': result['resource_uri'],
+                    'resource_uuid': result['public_id'],
+                    'resource_version_uuid': result['active_version'],
+                }
+            )
+            for result in results
+        ]
+
+    @staticmethod
+    def _get_file(remote_storage_endpoint: DataRecordRemoteStorageEndpoint, file_info: Dict) -> LazyLoadedFile:
+        local_file_header_signature_bytes = b'\x50\x4b\x03\x04'
+        local_file_header_struct = Struct('<H2sHHHIIIHH')
+        LocalFileHeader = namedtuple(
+            'LocalFileHeader',
+            (
+                'version',
+                'flags',
+                'compression_raw',
+                'mod_time',
+                'mod_date',
+                'crc_32_expected',
+                'compressed_size_raw',
+                'uncompressed_size_raw',
+                'file_name_len',
+                'extra_field_len',
+            ),
+        )
+
+        local_file_header_start = file_info['header_offset'] + len(local_file_header_signature_bytes)
+        local_file_header_end = local_file_header_start + local_file_header_struct.size
+
+        def file_start_func() -> int:
+            local_file_header_response = HttpClient.request(
+                url=remote_storage_endpoint.get_remote_url(),
+                headers={'range': f'bytes={local_file_header_start}-{local_file_header_end - 1}'},
+                timeout_in_seconds=300,
+            )
+            local_file_header = LocalFileHeader._make(
+                local_file_header_struct.unpack(local_file_header_response.content)
+            )
+            file_start: int = (
+                local_file_header_end + local_file_header.file_name_len + local_file_header.extra_field_len
+            )
+            return file_start
+
+        return LazyLoadedFile(
+            buffer=RemoteIndexableBuffer(endpoint=remote_storage_endpoint),
+            length=file_info['file_size'],
+            path=file_info['filename'],
+            start=None,
+            start_func=file_start_func,
+        )
+
+    @staticmethod
+    def _get_filtered_files(files: List[LazyLoadedFile], path_filter: PathFilter) -> List[LazyLoadedFile]:
+        if not (isinstance(path_filter, str) or callable(path_filter)):
+            raise Exception('Expected path_filter to be a string or a function')
+
+        if callable(path_filter):
+            return list(filter(lambda x: path_filter(x.path), files))  # type: ignore
+
+        glob_filter = cast(str, path_filter)
+
+        def _filter_function(file: LazyLoadedFile) -> bool:
+            return fnmatch(file.path, glob_filter)
+
+        return list(filter(_filter_function, files))
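As a rough illustration of the DataRecord API added above (not part of the diff itself), the sketch below lists files with a glob filter, saves them locally, and pushes a new version from a local directory; the URI and paths are placeholders:

    from biolib._data_record.data_record import DataRecord

    record = DataRecord.get_by_uri('account-handle/my-data-record')   # hypothetical URI
    csv_files = record.list_files(path_filter='*.csv')                # glob filter on file paths
    record.save_files(output_dir='record-files', path_filter='*.csv')
    record.update(data_path='./new-version-of-data')                  # pushes a new Data Record version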
biolib/_internal/data_record/__init__.py
CHANGED
@@ -1 +1 @@
-from .data_record import
+from .data_record import get_data_record_state_from_uri, push_data_record_version
biolib/_internal/data_record/data_record.py
CHANGED
@@ -1,169 +1,73 @@
 import os
-from
-from datetime import datetime
-from fnmatch import fnmatch
-from struct import Struct
-from typing import Callable, Dict, List, Optional, Union, cast
+from typing import Optional

-from biolib import
-from biolib._internal.
-from biolib._internal.http_client import HttpClient
+from biolib import api, utils
+from biolib._internal.file_utils import get_files_and_size_of_directory, get_iterable_zip_stream
 from biolib.api import client as api_client
-from biolib.biolib_api_client import AppGetResponse
-from biolib.
-from biolib.
+from biolib.biolib_api_client import AppGetResponse, BiolibApiClient
+from biolib.biolib_api_client.lfs_types import DataRecordVersion, DataRecordVersionInfo
+from biolib.biolib_errors import BioLibError
 from biolib.biolib_logging import logger
-from biolib.utils.app_uri import parse_app_uri
-from biolib.utils.zip.remote_zip import RemoteZip  # type: ignore

-PathFilter = Union[str, Callable[[str], bool]]

-    [removed lines 21-81 are not shown in the original diff rendering]
-    @staticmethod
-    def create(destination: str, data_path: str, name: Optional[str] = None) -> 'DataRecord':
-        assert os.path.isdir(data_path), f'The path "{data_path}" is not a directory.'
-        record_name = name if name else 'data-record-' + datetime.now().isoformat().split('.')[0].replace(':', '-')
-        record_uri = lfs.create_large_file_system(lfs_uri=f'{destination}/{record_name}')
-        record_version_uri = lfs.push_large_file_system(lfs_uri=record_uri, input_dir=data_path)
-        return DataRecord(uri=record_version_uri)
-
-    @staticmethod
-    def fetch(uri: Optional[str] = None, count: Optional[int] = None) -> List['DataRecord']:
-        max_page_size = 1_000
-        params: Dict[str, Union[str, int]] = {
-            'page_size': str(count or max_page_size),
-            'resource_type': 'data-record',
-        }
-        if uri:
-            uri_parsed = parse_app_uri(uri, use_account_as_name_default=False)
-            params['account_handle'] = uri_parsed['account_handle_normalized']
-            if uri_parsed['app_name_normalized']:
-                params['app_name'] = uri_parsed['app_name_normalized']
-
-        results = api_client.get(path='/apps/', params=params).json()['results']
-        if count is None and len(results) == max_page_size:
-            logger.warning(
-                f'Fetch results exceeded maximum count of {max_page_size}. Some data records might not be fetched.'
-            )
-
-        return [DataRecord(result['resource_uri']) for result in results]
-
-    @staticmethod
-    def _get_file(remote_storage_endpoint: DataRecordRemoteStorageEndpoint, file_info: Dict) -> LazyLoadedFile:
-        local_file_header_signature_bytes = b'\x50\x4b\x03\x04'
-        local_file_header_struct = Struct('<H2sHHHIIIHH')
-        LocalFileHeader = namedtuple(
-            'LocalFileHeader',
-            (
-                'version',
-                'flags',
-                'compression_raw',
-                'mod_time',
-                'mod_date',
-                'crc_32_expected',
-                'compressed_size_raw',
-                'uncompressed_size_raw',
-                'file_name_len',
-                'extra_field_len',
-            ),
-        )
-
-        local_file_header_start = file_info['header_offset'] + len(local_file_header_signature_bytes)
-        local_file_header_end = local_file_header_start + local_file_header_struct.size
-
-        def file_start_func() -> int:
-            local_file_header_response = HttpClient.request(
-                url=remote_storage_endpoint.get_remote_url(),
-                headers={'range': f'bytes={local_file_header_start}-{local_file_header_end - 1}'},
-                timeout_in_seconds=300,
-            )
-            local_file_header = LocalFileHeader._make(
-                local_file_header_struct.unpack(local_file_header_response.content)
-            )
-            file_start: int = (
-                local_file_header_end + local_file_header.file_name_len + local_file_header.extra_field_len
-            )
-            return file_start
-
-        return LazyLoadedFile(
-            buffer=RemoteIndexableBuffer(endpoint=remote_storage_endpoint),
-            length=file_info['file_size'],
-            path=file_info['filename'],
-            start=None,
-            start_func=file_start_func,
-        )
-
-    @staticmethod
-    def _get_filtered_files(files: List[LazyLoadedFile], path_filter: PathFilter) -> List[LazyLoadedFile]:
-        if not (isinstance(path_filter, str) or callable(path_filter)):
-            raise Exception('Expected path_filter to be a string or a function')
-
-        if callable(path_filter):
-            return list(filter(lambda x: path_filter(x.path), files))  # type: ignore
-
-        glob_filter = cast(str, path_filter)
-
-        def _filter_function(file: LazyLoadedFile) -> bool:
-            return fnmatch(file.path, glob_filter)
-
-        return list(filter(_filter_function, files))
+def push_data_record_version(data_record_uuid: str, input_dir: str, chunk_size_in_mb: Optional[int] = None) -> str:
+    BiolibApiClient.assert_is_signed_in(authenticated_action_description='push data to a Data Record')
+
+    if not os.path.isdir(input_dir):
+        raise BioLibError(f'Could not find folder at {input_dir}')
+
+    if os.path.realpath(input_dir) == '/':
+        raise BioLibError('Pushing your root directory is not possible')
+
+    original_working_dir = os.getcwd()
+    os.chdir(input_dir)
+    files_to_zip, data_size_in_bytes = get_files_and_size_of_directory(directory=os.getcwd())
+
+    if data_size_in_bytes > 4_500_000_000_000:
+        raise BioLibError('Attempted to push directory with a size larger than the limit of 4.5 TB')
+
+    min_chunk_size_bytes = 10_000_000
+    chunk_size_in_bytes: int
+    if chunk_size_in_mb:
+        chunk_size_in_bytes = chunk_size_in_mb * 1_000_000  # Convert megabytes to bytes
+        if chunk_size_in_bytes < min_chunk_size_bytes:
+            logger.warning('Specified chunk size is too small, using minimum of 10 MB instead.')
+            chunk_size_in_bytes = min_chunk_size_bytes
+    else:
+        # Calculate chunk size based on max chunk count of 10_000, using 9_000 to be on the safe side
+        chunk_size_in_bytes = max(min_chunk_size_bytes, int(data_size_in_bytes / 9_000))
+
+    data_size_in_mb = round(data_size_in_bytes / 10**6)
+    print(f'Zipping {len(files_to_zip)} files, in total ~{data_size_in_mb}mb of data')
+
+    response = api.client.post(path='/lfs/versions/', data={'resource_uuid': data_record_uuid})
+    data_record_version: DataRecordVersion = response.json()
+    iterable_zip_stream = get_iterable_zip_stream(files=files_to_zip, chunk_size=chunk_size_in_bytes)
+
+    multipart_uploader = utils.MultiPartUploader(
+        use_process_pool=True,
+        get_presigned_upload_url_request=dict(
+            headers=None,
+            requires_biolib_auth=True,
+            path=f"/lfs/versions/{data_record_version['uuid']}/presigned_upload_url/",
+        ),
+        complete_upload_request=dict(
+            headers=None,
+            requires_biolib_auth=True,
+            path=f"/lfs/versions/{data_record_version['uuid']}/complete_upload/",
+        ),
+    )
+
+    multipart_uploader.upload(payload_iterator=iterable_zip_stream, payload_size_in_bytes=data_size_in_bytes)
+    os.chdir(original_working_dir)
+    logger.info(f"Successfully pushed a new Data Record version '{data_record_version['uri']}'")
+    return data_record_version['uri']
+
+
+def get_data_record_state_from_uri(uri) -> 'DataRecordVersionInfo':
+    app_response: AppGetResponse = api_client.get(path='/app/', params={'uri': uri}).json()
+    return DataRecordVersionInfo(
+        resource_uri=app_response['app_version']['app_uri'],
+        resource_uuid=app_response['app']['public_id'],
+        resource_version_uuid=app_response['app_version']['public_id'],
+    )
biolib/_internal/data_record/remote_storage_endpoint.py
CHANGED
@@ -3,7 +3,7 @@ from datetime import datetime, timedelta
 from urllib.parse import urlparse

 from biolib.api import client as api_client
-from biolib.biolib_api_client.lfs_types import
+from biolib.biolib_api_client.lfs_types import DataRecordVersion
 from biolib.biolib_binary_format.utils import RemoteEndpoint
 from biolib.biolib_logging import logger
 from biolib.typing_utils import Optional
@@ -17,7 +17,7 @@ class DataRecordRemoteStorageEndpoint(RemoteEndpoint):

     def get_remote_url(self) -> str:
         if not self._presigned_url or not self._expires_at or datetime.utcnow() > self._expires_at:
-            lfs_version:
+            lfs_version: DataRecordVersion = api_client.get(
                 path=f'/lfs/versions/{self._resource_version_uuid}/',
             ).json()

biolib/_internal/file_utils.py
ADDED
@@ -0,0 +1,77 @@
+import io
+import os
+import zipfile as zf
+from pathlib import Path
+
+from biolib.typing_utils import Iterator, List, Tuple
+
+
+def get_files_and_size_of_directory(directory: str) -> Tuple[List[str], int]:
+    data_size = 0
+    file_list: List[str] = []
+
+    for path, _, files in os.walk(directory):
+        for file in files:
+            file_path = os.path.join(path, file)
+            if os.path.islink(file_path):
+                continue  # skip symlinks
+
+            relative_file_path = file_path[len(directory) + 1 :]  # +1 to remove starting slash
+            file_list.append(relative_file_path)
+            data_size += os.path.getsize(file_path)
+
+    return file_list, data_size
+
+
+def get_iterable_zip_stream(files: List[str], chunk_size: int) -> Iterator[bytes]:
+    class ChunkedIOBuffer(io.RawIOBase):
+        def __init__(self, chunk_size: int):
+            super().__init__()
+            self.chunk_size = chunk_size
+            self.tmp_data = bytearray()
+
+        def get_buffer_size(self):
+            return len(self.tmp_data)
+
+        def read_chunk(self):
+            chunk = bytes(self.tmp_data[: self.chunk_size])
+            self.tmp_data = self.tmp_data[self.chunk_size :]
+            return chunk
+
+        def write(self, data):
+            data_length = len(data)
+            self.tmp_data += data
+            return data_length
+
+    # create chunked buffer to hold data temporarily
+    io_buffer = ChunkedIOBuffer(chunk_size)
+
+    # create zip writer that will write to the io buffer
+    zip_writer = zf.ZipFile(io_buffer, mode='w')  # type: ignore
+
+    for file_path in files:
+        # generate zip info and prepare zip pointer for writing
+        z_info = zf.ZipInfo.from_file(file_path)
+        zip_pointer = zip_writer.open(z_info, mode='w')
+        if Path(file_path).is_file():
+            # read file chunk by chunk
+            with open(file_path, 'br') as file_pointer:
+                while True:
+                    chunk = file_pointer.read(chunk_size)
+                    if len(chunk) == 0:
+                        break
+                    # write the chunk to the zip
+                    zip_pointer.write(chunk)
+                    # if writing the chunk caused us to go over chunk_size, flush it
+                    if io_buffer.get_buffer_size() > chunk_size:
+                        yield io_buffer.read_chunk()
+
+        zip_pointer.close()
+
+    # flush any remaining data in the stream (e.g. zip file meta data)
+    zip_writer.close()
+    while True:
+        chunk = io_buffer.read_chunk()
+        if len(chunk) == 0:
+            break
+        yield chunk
biolib/_internal/lfs/__init__.py
ADDED
@@ -0,0 +1 @@
+from .cache import prune_lfs_cache
biolib/_internal/push_application.py
CHANGED
@@ -6,12 +6,12 @@ import rich.progress
 import yaml

 from biolib import api, utils
+from biolib._internal.file_utils import get_files_and_size_of_directory, get_iterable_zip_stream
 from biolib.biolib_api_client import BiolibApiClient
 from biolib.biolib_api_client.biolib_app_api import BiolibAppApi
 from biolib.biolib_docker_client import BiolibDockerClient
 from biolib.biolib_errors import BioLibError
 from biolib.biolib_logging import logger
-from biolib.lfs.utils import get_files_and_size_of_directory, get_iterable_zip_stream
 from biolib.typing_utils import Iterable, Optional, Set, TypedDict

 REGEX_MARKDOWN_INLINE_IMAGE = re.compile(r'!\[(?P<alt>.*)\]\((?P<src>.*)\)')
biolib/_internal/runtime.py
CHANGED
@@ -1,8 +1,4 @@
-import
-import re
-
-from biolib import api
-from biolib.typing_utils import Optional, TypedDict, cast
+from biolib.typing_utils import TypedDict


 class RuntimeJobDataDict(TypedDict):
@@ -21,76 +17,3 @@ class BioLibRuntimeNotRecognizedError(BioLibRuntimeError):
     def __init__(self, message='The runtime is not recognized as a BioLib app'):
         self.message = message
         super().__init__(self.message)
-
-
-class Runtime:
-    _job_data: Optional[RuntimeJobDataDict] = None
-
-    @staticmethod
-    def check_is_environment_biolib_app() -> bool:
-        return bool(Runtime._try_to_get_job_data())
-
-    @staticmethod
-    def get_job_id() -> str:
-        return Runtime._get_job_data()['job_uuid']
-
-    @staticmethod
-    def get_job_auth_token() -> str:
-        return Runtime._get_job_data()['job_auth_token']
-
-    @staticmethod
-    def get_job_requested_machine() -> str:
-        return Runtime._get_job_data()['job_requested_machine']
-
-    @staticmethod
-    def get_app_uri() -> str:
-        return Runtime._get_job_data()['app_uri']
-
-    @staticmethod
-    def get_secret(secret_name: str) -> bytes:
-        assert re.match(
-            '^[a-zA-Z0-9_-]*$', secret_name
-        ), 'Secret name can only contain alphanumeric characters and dashes or underscores '
-        try:
-            with open(f'/biolib/secrets/{secret_name}', 'rb') as file:
-                return file.read()
-        except BaseException as error:
-            raise BioLibRuntimeError(f'Unable to get system secret: {secret_name}') from error
-
-    @staticmethod
-    def set_main_result_prefix(result_prefix: str) -> None:
-        job_data = Runtime._get_job_data()
-        api.client.patch(
-            data={'result_name_prefix': result_prefix},
-            headers={'Job-Auth-Token': job_data['job_auth_token']},
-            path=f"/jobs/{job_data['job_uuid']}/main_result/",
-        )
-
-    @staticmethod
-    def create_result_note(note: str) -> None:
-        job_id = Runtime.get_job_id()
-        # Note: Authentication is added by app caller proxy in compute node
-        api.client.post(data={'note': note}, path=f'/jobs/{job_id}/notes/')
-
-    @staticmethod
-    def _try_to_get_job_data() -> Optional[RuntimeJobDataDict]:
-        if not Runtime._job_data:
-            try:
-                with open('/biolib/secrets/biolib_system_secret') as file:
-                    job_data: RuntimeJobDataDict = json.load(file)
-            except BaseException:
-                return None
-
-            if not job_data['version'].startswith('1.'):
-                raise BioLibRuntimeError(f"Unexpected system secret version {job_data['version']} expected 1.x.x")
-
-            Runtime._job_data = job_data
-
-        return cast(RuntimeJobDataDict, Runtime._job_data)
-
-    @staticmethod
-    def _get_job_data() -> RuntimeJobDataDict:
-        job_data = Runtime._try_to_get_job_data()
-        if not job_data:
-            raise BioLibRuntimeNotRecognizedError() from None
-        return job_data
biolib/_runtime/runtime.py
ADDED
@@ -0,0 +1,79 @@
+from biolib import api
+from biolib._internal.runtime import BioLibRuntimeError, BioLibRuntimeNotRecognizedError, RuntimeJobDataDict
+from biolib.typing_utils import cast, Optional as _Optional
+
+import json
+import re
+
+
+class Runtime:
+    _job_data: _Optional[RuntimeJobDataDict] = None
+
+    @staticmethod
+    def check_is_environment_biolib_app() -> bool:
+        return bool(Runtime._try_to_get_job_data())
+
+    @staticmethod
+    def get_job_id() -> str:
+        return Runtime._get_job_data()['job_uuid']
+
+    @staticmethod
+    def get_job_auth_token() -> str:
+        return Runtime._get_job_data()['job_auth_token']
+
+    @staticmethod
+    def get_job_requested_machine() -> str:
+        return Runtime._get_job_data()['job_requested_machine']
+
+    @staticmethod
+    def get_app_uri() -> str:
+        return Runtime._get_job_data()['app_uri']
+
+    @staticmethod
+    def get_secret(secret_name: str) -> bytes:
+        assert re.match(
+            '^[a-zA-Z0-9_-]*$', secret_name
+        ), 'Secret name can only contain alphanumeric characters and dashes or underscores '
+        try:
+            with open(f'/biolib/secrets/{secret_name}', 'rb') as file:
+                return file.read()
+        except BaseException as error:
+            raise BioLibRuntimeError(f'Unable to get system secret: {secret_name}') from error
+
+    @staticmethod
+    def set_main_result_prefix(result_prefix: str) -> None:
+        job_data = Runtime._get_job_data()
+        api.client.patch(
+            data={'result_name_prefix': result_prefix},
+            headers={'Job-Auth-Token': job_data['job_auth_token']},
+            path=f"/jobs/{job_data['job_uuid']}/main_result/",
+        )
+
+    @staticmethod
+    def create_result_note(note: str) -> None:
+        job_id = Runtime.get_job_id()
+        # Note: Authentication is added by app caller proxy in compute node
+        api.client.post(data={'note': note}, path=f'/jobs/{job_id}/notes/')
+
+    @staticmethod
+    def _try_to_get_job_data() -> _Optional[RuntimeJobDataDict]:
+        if not Runtime._job_data:
+            try:
+                with open('/biolib/secrets/biolib_system_secret') as file:
+                    job_data: RuntimeJobDataDict = json.load(file)
+            except BaseException:
+                return None
+
+            if not job_data['version'].startswith('1.'):
+                raise BioLibRuntimeError(f"Unexpected system secret version {job_data['version']} expected 1.x.x")
+
+            Runtime._job_data = job_data
+
+        return cast(RuntimeJobDataDict, Runtime._job_data)
+
+    @staticmethod
+    def _get_job_data() -> RuntimeJobDataDict:
+        job_data = Runtime._try_to_get_job_data()
+        if not job_data:
+            raise BioLibRuntimeNotRecognizedError() from None
+        return job_data
biolib/biolib_api_client/api_client.py
CHANGED
@@ -6,7 +6,7 @@ import os
 from datetime import datetime, timezone
 from json.decoder import JSONDecodeError

-from biolib.
+from biolib._runtime.runtime import Runtime
 from biolib._internal.http_client import HttpClient
 from biolib.typing_utils import Optional
 from biolib.biolib_errors import BioLibError
biolib/biolib_api_client/lfs_types.py
CHANGED
@@ -1,13 +1,19 @@
 from biolib.typing_utils import TypedDict


-class
+class DataRecordVersion(TypedDict):
     presigned_download_url: str
     size_bytes: int
     uri: str
     uuid: str


-class
+class DataRecordInfo(TypedDict):
     uri: str
     uuid: str
+
+
+class DataRecordVersionInfo(TypedDict):
+    resource_uri: str
+    resource_uuid: str
+    resource_version_uuid: str
biolib/cli/data_record.py
CHANGED
@@ -1,9 +1,11 @@
+import json
 import logging
 import os
+from typing import Dict, List

 import click

-from biolib.
+from biolib._data_record.data_record import DataRecord
 from biolib.biolib_logging import logger, logger_no_user_data
 from biolib.typing_utils import Optional

@@ -15,11 +17,18 @@ def data_record() -> None:


 @data_record.command(help='Create a Data Record')
-@click.
+@click.argument('uri', required=True)
+@click.option('--data-path', required=True, type=click.Path(exists=True))
+def create(uri: str, data_path: str) -> None:
+    DataRecord.create(destination=uri, data_path=data_path)
+
+
+@data_record.command(help='Update a Data Record')
+@click.argument('uri', required=True)
 @click.option('--data-path', required=True, type=click.Path(exists=True))
-@click.option('--
-def
-    DataRecord.
+@click.option('--chunk-size', default=None, required=False, type=click.INT, help='The size of each chunk (In MB)')
+def update(uri: str, data_path: str, chunk_size: Optional[int]) -> None:
+    DataRecord.get_by_uri(uri=uri).update(data_path=data_path, chunk_size_in_mb=chunk_size)


 @data_record.command(help='Download files from a Data Record')
@@ -27,7 +36,7 @@ def create(destination: str, data_path: str, name: Optional[str] = None) -> None
 @click.option('--file', required=False, type=str)
 @click.option('--path-filter', required=False, type=str, hide_input=True)
 def download(uri: str, file: Optional[str], path_filter: Optional[str]) -> None:
-    record = DataRecord(uri=uri)
+    record = DataRecord.get_by_uri(uri=uri)
     if file is not None:
         try:
             file_obj = [file_obj for file_obj in record.list_files() if file_obj.path == file][0]
@@ -41,3 +50,30 @@ def download(uri: str, file: Optional[str], path_filter: Optional[str]) -> None:
     else:
         assert not os.path.exists(record.name), f'Directory with name {record.name} already exists in current directory'
         record.save_files(output_dir=record.name, path_filter=path_filter)
+
+
+@data_record.command(help='Describe a Data Record')
+@click.argument('uri', required=True)
+@click.option('--json', 'output_as_json', is_flag=True, default=False, required=False, help='Format output as JSON')
+def describe(uri: str, output_as_json: bool) -> None:
+    record = DataRecord.get_by_uri(uri)
+    files_info: List[Dict] = []
+    total_size_in_bytes = 0
+    for file in record.list_files():
+        files_info.append({'path': file.path, 'size_bytes': file.length})
+        total_size_in_bytes += file.length
+
+    if output_as_json:
+        print(
+            json.dumps(
+                obj={'uri': record.uri, 'size_bytes': total_size_in_bytes, 'files': files_info},
+                indent=4,
+            )
+        )
+    else:
+        print(f'Data Record {record.uri}\ntotal {total_size_in_bytes} bytes\n')
+        print('size bytes path')
+        for file_info in files_info:
+            size_string = str(file_info['size_bytes'])
+            leading_space_string = ' ' * (10 - len(size_string))
+            print(f"{leading_space_string}{size_string} {file_info['path']}")
biolib/cli/lfs.py
CHANGED
@@ -7,9 +7,9 @@ from typing import Dict, List
 import click

 from biolib import biolib_errors
-from biolib.
+from biolib._data_record.data_record import DataRecord
+from biolib._internal.lfs import prune_lfs_cache
 from biolib.biolib_logging import logger, logger_no_user_data
-from biolib.lfs import create_large_file_system, prune_lfs_cache, push_large_file_system
 from biolib.typing_utils import Optional


@@ -21,9 +21,10 @@ def lfs() -> None:
 @lfs.command(help='Create a Large File System')
 @click.argument('uri', required=True)
 def create(uri: str) -> None:
+    logger.warning('This is command deprecated, please use "biolib data-record create" instead.')
     logger.configure(default_log_level=logging.INFO)
     logger_no_user_data.configure(default_log_level=logging.INFO)
-
+    DataRecord.create(destination=uri)


 @lfs.command(help='Push a new version of a Large File System')
@@ -31,10 +32,11 @@ def create(uri: str) -> None:
 @click.option('--path', required=True, type=click.Path(exists=True))
 @click.option('--chunk-size', default=None, required=False, type=click.INT, help='The size of each chunk (In MB)')
 def push(uri: str, path: str, chunk_size: Optional[int]) -> None:
+    logger.warning('This is command deprecated, please use "biolib data-record update" instead.')
     logger.configure(default_log_level=logging.INFO)
     logger_no_user_data.configure(default_log_level=logging.INFO)
     try:
-
+        DataRecord.get_by_uri(uri=uri).update(data_path=path, chunk_size_in_mb=chunk_size)
     except biolib_errors.BioLibError as error:
         print(f'An error occurred:\n{error.message}', file=sys.stderr)
         exit(1)
@@ -44,10 +46,11 @@ def push(uri: str, path: str, chunk_size: Optional[int]) -> None:
 @click.argument('uri', required=True)
 @click.option('--file-path', required=True, type=str)
 def download_file(uri: str, file_path: str) -> None:
+    logger.warning('This is command deprecated, please use "biolib data-record download" instead.')
     logger.configure(default_log_level=logging.INFO)
     logger_no_user_data.configure(default_log_level=logging.INFO)
     try:
-        record = DataRecord(uri=uri)
+        record = DataRecord.get_by_uri(uri=uri)
         try:
             file_obj = [file_obj for file_obj in record.list_files() if file_obj.path == file_path][0]
         except IndexError:
@@ -66,7 +69,8 @@ def download_file(uri: str, file_path: str) -> None:
 @click.argument('uri', required=True)
 @click.option('--json', 'output_as_json', is_flag=True, default=False, required=False, help='Format output as JSON')
 def describe(uri: str, output_as_json: bool) -> None:
-
+    logger.warning('This is command deprecated, please use "biolib data-record describe" instead.')
+    data_record = DataRecord.get_by_uri(uri)
     files_info: List[Dict] = []
     total_size_in_bytes = 0
     for file in data_record.list_files():
biolib/runtime/__init__.py
CHANGED
biolib/sdk/__init__.py
CHANGED
@@ -1,12 +1,12 @@
 # Imports to hide and use as private internal utils
+from biolib._data_record.data_record import DataRecord as _DataRecord
 from biolib._internal.push_application import push_application as _push_application
 from biolib._internal.push_application import set_app_version_as_active as _set_app_version_as_active
 from biolib.app import BioLibApp as _BioLibApp
 from biolib.typing_utils import Optional as _Optional

-#
-from biolib.
-from biolib._internal.runtime import Runtime
+# Classes to expose as public API
+from biolib._runtime.runtime import Runtime


 def push_app_version(uri: str, path: str) -> _BioLibApp:
@@ -42,5 +42,9 @@ def get_app_version_pytest_plugin(app_version: _BioLibApp):
     return AppVersionFixturePlugin(app_version)


-def create_data_record(destination: str, data_path: str, name: _Optional[str] = None) ->
-
+def create_data_record(destination: str, data_path: str, name: _Optional[str] = None) -> _DataRecord:
+    if name:
+        destination_with_name = f"{destination}/{name}"
+    else:
+        destination_with_name = destination
+    return _DataRecord.create(destination_with_name, data_path)
{pybiolib-1.1.2038.dist-info → pybiolib-1.1.2097.dist-info}/RECORD
CHANGED
@@ -1,32 +1,37 @@
 LICENSE,sha256=F2h7gf8i0agDIeWoBPXDMYScvQOz02pAWkKhTGOHaaw,1067
 README.md,sha256=_IH7pxFiqy2bIAmaVeA-iVTyUwWRjMIlfgtUbYTtmls,368
-biolib/__init__.py,sha256=
+biolib/__init__.py,sha256=yX8w8bDiY7CIxfKHFRF0U1hhwgCCIXtVr18Td5iNLp8,4135
+biolib/_data_record/data_record.py,sha256=jUeCQjnVQLNLmlXO3rREEUnjXjOYuaQjBO7R66P6wFU,8909
 biolib/_internal/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-biolib/_internal/data_record/__init__.py,sha256=
-biolib/_internal/data_record/data_record.py,sha256=
-biolib/_internal/data_record/remote_storage_endpoint.py,sha256=
+biolib/_internal/data_record/__init__.py,sha256=0T0CV6PfKc8itjMu-48sCJjcZQEzXl1ZLBqG_LjJTqQ,82
+biolib/_internal/data_record/data_record.py,sha256=D0BaC8WhnkM564eKUI69hVHUkKY1In0cyfpjxYyWk18,3363
+biolib/_internal/data_record/remote_storage_endpoint.py,sha256=eCptuZ4DMAPnaNCVDvpWXwXGI6Jac9U1N5dqU8Cj95Q,1732
+biolib/_internal/file_utils.py,sha256=4jT6j7bB21c0JNn5BfnyWQib_zt0CVtJ_TiOFOStRcE,2604
 biolib/_internal/fuse_mount/__init__.py,sha256=B_tM6RM2dBw-vbpoHJC4X3tOAaN1H2RDvqYJOw3xFwg,55
 biolib/_internal/fuse_mount/experiment_fuse_mount.py,sha256=08aUdEq_bvqLBft_gSLjOClKDy5sBnMts1RfJf7AP_U,7012
 biolib/_internal/http_client.py,sha256=DdooXei93JKGYGV4aQmzue_oFzvHkozg2UCxgk9dfDM,5081
+biolib/_internal/lfs/__init__.py,sha256=gSWo_xg61UniYgD7yNYxeT4I9uaXBCBSi3_nmZjnPpE,35
+biolib/_internal/lfs/cache.py,sha256=pQS2np21rdJ6I3DpoOutnzPHpLOZgUIS8TMltUJk_k4,2226
 biolib/_internal/libs/__init__.py,sha256=Jdf4tNPqe_oIIf6zYml6TiqhL_02Vyqwge6IELrAFhw,98
 biolib/_internal/libs/fusepy/__init__.py,sha256=AWDzNFS-XV_5yKb0Qx7kggIhPzq1nj_BZS5y2Nso08k,41944
-biolib/_internal/push_application.py,sha256=
-biolib/_internal/runtime.py,sha256=
+biolib/_internal/push_application.py,sha256=8P7eXvySn7CRp5XBDkO3xjTGixS8g7-jD-_iwzM_XDI,10020
+biolib/_internal/runtime.py,sha256=9pZ3s3L7LGxdqOgnHh1KK3Jjyn_9MjhQmKHI-6hMT3U,448
 biolib/_internal/utils/__init__.py,sha256=p5vsIFyu-zYqBgdSMfwW9NC_jk7rXvvCbV4Bzd3As7c,630
+biolib/_runtime/runtime.py,sha256=zy9HrE4X5hBqm8doUHkckyflquSBDSXV3REhT2MQGas,2767
 biolib/api/__init__.py,sha256=mQ4u8FijqyLzjYMezMUUbbBGNB3iFmkNdjXnWPZ7Jlw,138
 biolib/api/client.py,sha256=9MD1qI52BnRC_QSydFGjyFquwFw0R9dkDfUrjUouuHQ,3490
 biolib/app/__init__.py,sha256=cdPtcfb_U-bxb9iSL4fCEq2rpD9OjkyY4W-Zw60B0LI,37
 biolib/app/app.py,sha256=8AvPYL1W2wxQ7t7BB2KeVU2WPrm3UL6vVuHPGs8g9L0,8388
 biolib/app/search_apps.py,sha256=K4a41f5XIWth2BWI7OffASgIsD0ko8elCax8YL2igaY,1470
 biolib/biolib_api_client/__init__.py,sha256=E5EMa19wJoblwSdQPYrxc_BtIeRsAuO0L_jQweWw-Yk,182
-biolib/biolib_api_client/api_client.py,sha256=
-biolib/biolib_api_client/app_types.py,sha256=
+biolib/biolib_api_client/api_client.py,sha256=krlSRmmAwtdMMyN1XzQhh1gihB1ERSIVslWQ-dqI1yU,7188
+biolib/biolib_api_client/app_types.py,sha256=FxSr4UqfnMhLe34p8bm02wsC3g1Jz8iaing5tRKDOQI,2442
 biolib/biolib_api_client/auth.py,sha256=kjm0ZHnH3I8so3su2sZbBxNHYp-ZUdrZ5lwQ0K36RSw,949
 biolib/biolib_api_client/biolib_app_api.py,sha256=DndlVxrNTes6DOaWyMINLGZQCRMWVvR7gwt5HVlyf5Y,4240
 biolib/biolib_api_client/biolib_job_api.py,sha256=IpFahcRzm7GNy8DJ-XHYe-x7r4Voba8o22IXw5puHn8,6782
 biolib/biolib_api_client/common_types.py,sha256=RH-1KNHqUF-EkTpfPOSTt5Mq1GPdfju_cqXDesscO1I,123
 biolib/biolib_api_client/job_types.py,sha256=Dl4NhU2xpgpXV-7YIoDf6WL63SLR5bni55OX8x5539M,1300
-biolib/biolib_api_client/lfs_types.py,sha256=
+biolib/biolib_api_client/lfs_types.py,sha256=joZWP6-sa5_Ug_6xIp5fHAgEo_bqLE3rbleQocZtDcg,339
 biolib/biolib_api_client/user_state.py,sha256=XcgWV-MgVk88mIlMmnu8yHxMu8OCaw8o0tk7TVo5Hcg,637
 biolib/biolib_binary_format/__init__.py,sha256=HMl5SdX_VUWE4OQzi4Jf_yFvC7b0bSPOGPHYi9dWM2Q,185
 biolib/biolib_binary_format/base_bbf_package.py,sha256=vxRV4iKy0dXeDOlFWnMFI0hGnDBYDH5Cgh5gAfuObt8,959
@@ -46,10 +51,10 @@ biolib/biolib_errors.py,sha256=5m4lK2l39DafpoXBImEBD4EPH3ayXBX0JgtPzmGClow,689
 biolib/biolib_logging.py,sha256=J3E5H_LL5k6ZUim2C8gqN7E6lCBZMTpO4tnMpOPwG9U,2854
 biolib/cli/__init__.py,sha256=0v3c_J-U0k46c5ZWeQjLG_kTaKDJm81LBxQpDO2B_aI,1286
 biolib/cli/auth.py,sha256=rpWGmXs6Fz6CGrO9K8ibPRszOdXG78Vig_boKaVCD9A,2082
-biolib/cli/data_record.py,sha256=
+biolib/cli/data_record.py,sha256=oDy8U6mv-h-hbeMihXRzVEvM-WrGQq6oBiBl3xDRaXs,3220
 biolib/cli/download_container.py,sha256=HIZVHOPmslGE5M2Dsp9r2cCkAEJx__vcsDz5Wt5LRos,483
 biolib/cli/init.py,sha256=wQOfii_au-d30Hp7DdH-WVw-WVraKvA_zY4za1w7DE8,821
-biolib/cli/lfs.py,sha256=
+biolib/cli/lfs.py,sha256=z2qHUwink85mv9yDgifbVKkVwuyknGhMDTfly_gLKJM,4151
 biolib/cli/push.py,sha256=TFi7O9tJ3zFe0VmtVTV3Vh9_xIMHnrc41xxcaBKU46g,813
 biolib/cli/run.py,sha256=BbvXLQ-XibjQ71Y2d4URMH_8dflNVwM0i3TIWhw_u_c,1634
 biolib/cli/runtime.py,sha256=Xv-nrma5xX8NidWcvbUKcUvuN5TCarZa4A8mPVmF-z0,361
@@ -92,11 +97,8 @@ biolib/jobs/__init__.py,sha256=aIb2H2DHjQbM2Bs-dysFijhwFcL58Blp0Co0gimED3w,32
 biolib/jobs/job.py,sha256=npnARoP408SXD2UqyzFRJYdEJsP_gHoBh2xQkNegYqg,18884
 biolib/jobs/job_result.py,sha256=rALHiKYNaC9lHi_JJqBob1RubzNLwG9Z386kwRJjd2M,5885
 biolib/jobs/types.py,sha256=qhadtH2KDC2WUOOqPiwke0YgtQY4FtuB71Stekq1k48,970
-biolib/
-biolib/
-biolib/lfs/utils.py,sha256=HSs7F2wXklYhhv5tabfaeC5noXJyxRjbGD5IhWOVdxs,5918
-biolib/runtime/__init__.py,sha256=x1Ivydtu9TFTaX-Cofg_kGA-TI0zLon-ccrFiiVgBok,492
-biolib/sdk/__init__.py,sha256=wkQs7ltIpYK9Xw0-FLLacblemmlNGz8J2UmlM0noGSs,1749
+biolib/runtime/__init__.py,sha256=Fg2ZIAmUegurLKagpBNfRgLcOwR2VZSmXQpb-ryRwI0,505
+biolib/sdk/__init__.py,sha256=qJ_V_Edxolzi4VBQCrvem5lYIkJ0FVH3VZepSDuXjTc,1895
 biolib/tables.py,sha256=acH7VjwAbadLo8P84FSnKEZxCTVsF5rEg9VPuxElNs8,872
 biolib/templates/__init__.py,sha256=Yx62sSyDCDesRQDQgmbDsLpfgEh93fWE8r9u4g2azXk,36
 biolib/templates/example_app.py,sha256=EB3E3RT4SeO_ii5nVQqJpi5KDGNE_huF1ub-e5ZFveE,715
@@ -109,8 +111,8 @@ biolib/utils/cache_state.py,sha256=u256F37QSRIVwqKlbnCyzAX4EMI-kl6Dwu6qwj-Qmag,3
 biolib/utils/multipart_uploader.py,sha256=XvGP1I8tQuKhAH-QugPRoEsCi9qvbRk-DVBs5PNwwJo,8452
 biolib/utils/seq_util.py,sha256=jC5WhH63FTD7SLFJbxQGA2hOt9NTwq9zHl_BEec1Z0c,4907
 biolib/utils/zip/remote_zip.py,sha256=0wErYlxir5921agfFeV1xVjf29l9VNgGQvNlWOlj2Yc,23232
-pybiolib-1.1.
-pybiolib-1.1.
-pybiolib-1.1.
-pybiolib-1.1.
-pybiolib-1.1.
+pybiolib-1.1.2097.dist-info/LICENSE,sha256=F2h7gf8i0agDIeWoBPXDMYScvQOz02pAWkKhTGOHaaw,1067
+pybiolib-1.1.2097.dist-info/METADATA,sha256=-4wEBR8SXfG_VDlLRZR7UgrlKee5VydzL-L6wMKP17Y,1508
+pybiolib-1.1.2097.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+pybiolib-1.1.2097.dist-info/entry_points.txt,sha256=p6DyaP_2kctxegTX23WBznnrDi4mz6gx04O5uKtRDXg,42
+pybiolib-1.1.2097.dist-info/RECORD,,
biolib/lfs/__init__.py
DELETED
biolib/lfs/utils.py
DELETED
@@ -1,153 +0,0 @@
-import io
-import os
-import zipfile as zf
-from pathlib import Path
-
-from biolib import utils, api
-from biolib.biolib_api_client import BiolibApiClient
-from biolib.biolib_api_client.lfs_types import LargeFileSystem, LargeFileSystemVersion
-from biolib.biolib_logging import logger
-from biolib.biolib_errors import BioLibError
-from biolib.typing_utils import List, Tuple, Iterator, Optional
-from biolib.utils.app_uri import parse_app_uri
-
-
-def get_files_and_size_of_directory(directory: str) -> Tuple[List[str], int]:
-    data_size = 0
-    file_list: List[str] = []
-
-    for path, _, files in os.walk(directory):
-        for file in files:
-            file_path = os.path.join(path, file)
-            if os.path.islink(file_path):
-                continue  # skip symlinks
-
-            relative_file_path = file_path[len(directory) + 1:]  # +1 to remove starting slash
-            file_list.append(relative_file_path)
-            data_size += os.path.getsize(file_path)
-
-    return file_list, data_size
-
-
-def get_iterable_zip_stream(files: List[str], chunk_size: int) -> Iterator[bytes]:
-    class ChunkedIOBuffer(io.RawIOBase):
-        def __init__(self, chunk_size: int):
-            super().__init__()
-            self.chunk_size = chunk_size
-            self.tmp_data = bytearray()
-
-        def get_buffer_size(self):
-            return len(self.tmp_data)
-
-        def read_chunk(self):
-            chunk = bytes(self.tmp_data[:self.chunk_size])
-            self.tmp_data = self.tmp_data[self.chunk_size:]
-            return chunk
-
-        def write(self, data):
-            data_length = len(data)
-            self.tmp_data += data
-            return data_length
-
-    # create chunked buffer to hold data temporarily
-    io_buffer = ChunkedIOBuffer(chunk_size)
-
-    # create zip writer that will write to the io buffer
-    zip_writer = zf.ZipFile(io_buffer, mode='w')  # type: ignore
-
-    for file_path in files:
-        # generate zip info and prepare zip pointer for writing
-        z_info = zf.ZipInfo.from_file(file_path)
-        zip_pointer = zip_writer.open(z_info, mode='w')
-        if Path(file_path).is_file():
-            # read file chunk by chunk
-            with open(file_path, 'br') as file_pointer:
-                while True:
-                    chunk = file_pointer.read(chunk_size)
-                    if len(chunk) == 0:
-                        break
-                    # write the chunk to the zip
-                    zip_pointer.write(chunk)
-                    # if writing the chunk caused us to go over chunk_size, flush it
-                    if io_buffer.get_buffer_size() > chunk_size:
-                        yield io_buffer.read_chunk()
-
-        zip_pointer.close()
-
-    # flush any remaining data in the stream (e.g. zip file meta data)
-    zip_writer.close()
-    while True:
-        chunk = io_buffer.read_chunk()
-        if len(chunk) == 0:
-            break
-        yield chunk
-
-
-def create_large_file_system(lfs_uri: str) -> str:
-    BiolibApiClient.assert_is_signed_in(authenticated_action_description='create a Large File System')
-
-    uri_parsed = parse_app_uri(lfs_uri)
-    response = api.client.post(
-        path='/lfs/',
-        data={
-            'account_handle': uri_parsed['account_handle_normalized'],
-            'name': uri_parsed['app_name'],
-        },
-    )
-    lfs: LargeFileSystem = response.json()
-    logger.info(f"Successfully created new Large File System '{lfs['uri']}'")
-    return lfs['uri']
-
-
-def push_large_file_system(lfs_uri: str, input_dir: str, chunk_size_in_mb: Optional[int] = None) -> str:
-    BiolibApiClient.assert_is_signed_in(authenticated_action_description='push data to a Large File System')
-
-    if not os.path.isdir(input_dir):
-        raise BioLibError(f'Could not find folder at {input_dir}')
-
-    if os.path.realpath(input_dir) == '/':
-        raise BioLibError('Pushing your root directory is not possible')
-
-    original_working_dir = os.getcwd()
-    os.chdir(input_dir)
-    files_to_zip, data_size_in_bytes = get_files_and_size_of_directory(directory=os.getcwd())
-
-    if data_size_in_bytes > 4_500_000_000_000:
-        raise BioLibError('Attempted to push directory with a size larger than the limit of 4.5 TB')
-
-    min_chunk_size_bytes = 10_000_000
-    chunk_size_in_bytes: int
-    if chunk_size_in_mb:
-        chunk_size_in_bytes = chunk_size_in_mb * 1_000_000  # Convert megabytes to bytes
-        if chunk_size_in_bytes < min_chunk_size_bytes:
-            logger.warning('Specified chunk size is too small, using minimum of 10 MB instead.')
-            chunk_size_in_bytes = min_chunk_size_bytes
-    else:
-        # Calculate chunk size based on max chunk count of 10_000, using 9_000 to be on the safe side
-        chunk_size_in_bytes = max(min_chunk_size_bytes, int(data_size_in_bytes / 9_000))
-
-    data_size_in_mb = round(data_size_in_bytes / 10 ** 6)
-    print(f'Zipping {len(files_to_zip)} files, in total ~{data_size_in_mb}mb of data')
-
-    response = api.client.post(path='/lfs/versions/', data={'resource_uri': lfs_uri})
-    lfs_version: LargeFileSystemVersion = response.json()
-    iterable_zip_stream = get_iterable_zip_stream(files=files_to_zip, chunk_size=chunk_size_in_bytes)
-
-    multipart_uploader = utils.MultiPartUploader(
-        use_process_pool=True,
-        get_presigned_upload_url_request=dict(
-            headers=None,
-            requires_biolib_auth=True,
-            path=f"/lfs/versions/{lfs_version['uuid']}/presigned_upload_url/",
-        ),
-        complete_upload_request=dict(
-            headers=None,
-            requires_biolib_auth=True,
-            path=f"/lfs/versions/{lfs_version['uuid']}/complete_upload/",
-        ),
-    )
-
-    multipart_uploader.upload(payload_iterator=iterable_zip_stream, payload_size_in_bytes=data_size_in_bytes)
-    os.chdir(original_working_dir)
-    logger.info(f"Successfully pushed a new LFS version '{lfs_version['uri']}'")
-    return lfs_version['uri']
Files without changes: biolib/{lfs → _internal/lfs}/cache.py (moved), and the dist-info LICENSE, WHEEL, and entry_points.txt files (renamed with the new version).