pybiolib 1.1.2038__py3-none-any.whl → 1.1.2097__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
biolib/__init__.py CHANGED
@@ -13,7 +13,7 @@ from biolib.biolib_api_client import BiolibApiClient as _BioLibApiClient, App
 from biolib.jobs import Job as _Job
 from biolib import user as _user
 from biolib.typing_utils import List, Optional
-from biolib._internal.data_record import DataRecord as _DataRecord
+from biolib._data_record.data_record import DataRecord as _DataRecord

 import biolib.api
 import biolib.app
@@ -45,6 +45,10 @@ def get_job(job_id: str) -> _Job:
     return _Job.create_from_uuid(uuid=job_id)


+def get_data_record(uri: str) -> _DataRecord:
+    return _DataRecord.get_by_uri(uri)
+
+
 def fetch_jobs(count: int = 25) -> List[_Job]:
     return _Job.fetch_jobs(count)

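This version exposes a new top-level helper, `biolib.get_data_record`, which resolves a Data Record by URI. A minimal usage sketch (the URI is a hypothetical placeholder and the call assumes a signed-in client):

    import biolib

    # Resolve a Data Record by URI and inspect its contents.
    record = biolib.get_data_record('account_handle/my-data-record')  # hypothetical URI
    print(record.uri)
    for file in record.list_files():
        print(file.path)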
biolib/_data_record/data_record.py ADDED
@@ -0,0 +1,208 @@
+from biolib import api
+from biolib._internal.data_record import get_data_record_state_from_uri, push_data_record_version
+from biolib._internal.data_record.remote_storage_endpoint import DataRecordRemoteStorageEndpoint
+from biolib._internal.http_client import HttpClient
+from biolib.api import client as api_client
+from biolib.biolib_api_client import BiolibApiClient
+from biolib.biolib_api_client.lfs_types import DataRecordInfo, DataRecordVersionInfo
+from biolib.biolib_binary_format import LazyLoadedFile
+from biolib.biolib_binary_format.utils import RemoteIndexableBuffer
+from biolib.biolib_logging import logger
+from biolib.typing_utils import Optional as _Optional
+from biolib.utils.app_uri import parse_app_uri
+from biolib.utils.zip.remote_zip import RemoteZip
+
+
+import os
+from collections import namedtuple
+from datetime import datetime
+from fnmatch import fnmatch
+from struct import Struct
+from typing import Callable, Dict, List, cast, Union
+
+PathFilter = Union[str, Callable[[str], bool]]
+
+
+class DataRecord:
+    def __init__(self, _internal_state: DataRecordVersionInfo):
+        self._state = _internal_state
+
+    def __repr__(self):
+        return f'DataRecord: {self._state["resource_uri"]}'
+
+    @property
+    def uri(self) -> str:
+        return self._state['resource_uri']
+
+    @property
+    def name(self) -> str:
+        uri_parsed = parse_app_uri(self._state['resource_uri'], use_account_as_name_default=False)
+        if not uri_parsed['app_name']:
+            raise ValueError('Expected parameter "resource_uri" to contain resource name')
+
+        return uri_parsed['app_name']
+
+    def list_files(self, path_filter: _Optional[PathFilter] = None) -> List[LazyLoadedFile]:
+        remote_storage_endpoint = DataRecordRemoteStorageEndpoint(
+            resource_version_uuid=self._state['resource_version_uuid'],
+        )
+        files: List[LazyLoadedFile] = []
+        with RemoteZip(url=remote_storage_endpoint.get_remote_url()) as remote_zip:
+            central_directory = remote_zip.get_central_directory()
+            for file_info in central_directory.values():
+                files.append(self._get_file(remote_storage_endpoint, file_info))
+
+        return self._get_filtered_files(files=files, path_filter=path_filter) if path_filter else files
+
+    def download_zip(self, output_path: str):
+        remote_storage_endpoint = DataRecordRemoteStorageEndpoint(
+            resource_version_uuid=self._state['resource_version_uuid'],
+        )
+        HttpClient.request(url=remote_storage_endpoint.get_remote_url(), response_path=output_path)
+
+    def download_files(self, output_dir: str, path_filter: _Optional[PathFilter] = None) -> None:
+        filtered_files = self.list_files(path_filter=path_filter)
+
+        if len(filtered_files) == 0:
+            logger.debug('No files to save')
+            return
+
+        for file in filtered_files:
+            file_path = os.path.join(output_dir, file.path)
+            os.makedirs(os.path.dirname(file_path), exist_ok=True)
+            with open(file_path, mode='wb') as file_handle:
+                for chunk in file.get_data_iterator():
+                    file_handle.write(chunk)
+
+    def save_files(self, output_dir: str, path_filter: _Optional[PathFilter] = None) -> None:
+        self.download_files(output_dir=output_dir, path_filter=path_filter)
+
+    def update(self, data_path: str, chunk_size_in_mb: _Optional[int] = None) -> None:
+        assert os.path.isdir(data_path), f'The path "{data_path}" is not a directory.'
+        uri = push_data_record_version(
+            data_record_uuid=self._state['resource_uuid'], input_dir=data_path, chunk_size_in_mb=chunk_size_in_mb
+        )
+        self._state = get_data_record_state_from_uri(uri)
+
+    @staticmethod
+    def get_by_uri(uri: str) -> 'DataRecord':
+        return DataRecord(_internal_state=get_data_record_state_from_uri(uri))
+
+    @staticmethod
+    def create(destination: str, data_path: _Optional[str] = None) -> 'DataRecord':
+        BiolibApiClient.assert_is_signed_in(authenticated_action_description='create a Data Record')
+        if data_path is not None:
+            assert os.path.isdir(data_path), f'The path "{data_path}" is not a directory.'
+        uri_parsed = parse_app_uri(destination, use_account_as_name_default=False)
+        if uri_parsed['app_name_normalized']:
+            data_record_uri = destination
+        else:
+            record_name = 'data-record-' + datetime.now().isoformat().split('.')[0].replace(':', '-')
+            data_record_uri = f'{destination}/{record_name}'
+
+        uri_parsed = parse_app_uri(data_record_uri)
+        response = api.client.post(
+            path='/lfs/',
+            data={
+                'account_handle': uri_parsed['account_handle_normalized'],
+                'name': uri_parsed['app_name'],
+            },
+        )
+        data_record: DataRecordInfo = response.json()
+        logger.info(f"Successfully created new Data Record '{data_record['uri']}'")
+
+        if data_path is not None:
+            record_version_uri = push_data_record_version(data_record_uuid=data_record['uuid'], input_dir=data_path)
+            return DataRecord.get_by_uri(uri=record_version_uri)
+        else:
+            return DataRecord.get_by_uri(uri=data_record_uri)
+
+    @staticmethod
+    def fetch(uri: _Optional[str] = None, count: _Optional[int] = None) -> List['DataRecord']:
+        max_page_size = 1_000
+        params: Dict[str, Union[str, int]] = {
+            'page_size': str(count or max_page_size),
+            'resource_type': 'data-record',
+        }
+        if uri:
+            uri_parsed = parse_app_uri(uri, use_account_as_name_default=False)
+            params['account_handle'] = uri_parsed['account_handle_normalized']
+            if uri_parsed['app_name_normalized']:
+                params['app_name'] = uri_parsed['app_name_normalized']
+
+        results = api_client.get(path='/apps/', params=params).json()['results']
+        if count is None and len(results) == max_page_size:
+            logger.warning(
+                f'Fetch results exceeded maximum count of {max_page_size}. Some data records might not be fetched.'
+            )
+
+        return [
+            DataRecord(
+                _internal_state={
+                    'resource_uri': result['resource_uri'],
+                    'resource_uuid': result['public_id'],
+                    'resource_version_uuid': result['active_version'],
+                }
+            )
+            for result in results
+        ]
+
+    @staticmethod
+    def _get_file(remote_storage_endpoint: DataRecordRemoteStorageEndpoint, file_info: Dict) -> LazyLoadedFile:
+        local_file_header_signature_bytes = b'\x50\x4b\x03\x04'
+        local_file_header_struct = Struct('<H2sHHHIIIHH')
+        LocalFileHeader = namedtuple(
+            'LocalFileHeader',
+            (
+                'version',
+                'flags',
+                'compression_raw',
+                'mod_time',
+                'mod_date',
+                'crc_32_expected',
+                'compressed_size_raw',
+                'uncompressed_size_raw',
+                'file_name_len',
+                'extra_field_len',
+            ),
+        )
+
+        local_file_header_start = file_info['header_offset'] + len(local_file_header_signature_bytes)
+        local_file_header_end = local_file_header_start + local_file_header_struct.size
+
+        def file_start_func() -> int:
+            local_file_header_response = HttpClient.request(
+                url=remote_storage_endpoint.get_remote_url(),
+                headers={'range': f'bytes={local_file_header_start}-{local_file_header_end - 1}'},
+                timeout_in_seconds=300,
+            )
+            local_file_header = LocalFileHeader._make(
+                local_file_header_struct.unpack(local_file_header_response.content)
+            )
+            file_start: int = (
+                local_file_header_end + local_file_header.file_name_len + local_file_header.extra_field_len
+            )
+            return file_start
+
+        return LazyLoadedFile(
+            buffer=RemoteIndexableBuffer(endpoint=remote_storage_endpoint),
+            length=file_info['file_size'],
+            path=file_info['filename'],
+            start=None,
+            start_func=file_start_func,
+        )
+
+    @staticmethod
+    def _get_filtered_files(files: List[LazyLoadedFile], path_filter: PathFilter) -> List[LazyLoadedFile]:
+        if not (isinstance(path_filter, str) or callable(path_filter)):
+            raise Exception('Expected path_filter to be a string or a function')
+
+        if callable(path_filter):
+            return list(filter(lambda x: path_filter(x.path), files))  # type: ignore
+
+        glob_filter = cast(str, path_filter)
+
+        def _filter_function(file: LazyLoadedFile) -> bool:
+            return fnmatch(file.path, glob_filter)
+
+        return list(filter(_filter_function, files))
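The relocated `DataRecord` class now carries its state as a `DataRecordVersionInfo` TypedDict and is obtained via `get_by_uri`, `create`, or `fetch` rather than constructed directly. A hedged usage sketch, with a hypothetical account handle and local paths and assuming a signed-in client:

    from biolib._data_record.data_record import DataRecord

    record = DataRecord.create(destination='my-account', data_path='./input-data')
    record.download_files(output_dir='./copy', path_filter='*.csv')  # glob-style filter
    record.update(data_path='./input-data', chunk_size_in_mb=50)     # push a new version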
biolib/_internal/data_record/__init__.py CHANGED
@@ -1 +1 @@
-from .data_record import DataRecord
+from .data_record import get_data_record_state_from_uri, push_data_record_version
biolib/_internal/data_record/data_record.py CHANGED
@@ -1,169 +1,73 @@
 import os
-from collections import namedtuple
-from datetime import datetime
-from fnmatch import fnmatch
-from struct import Struct
-from typing import Callable, Dict, List, Optional, Union, cast
+from typing import Optional

-from biolib import lfs
-from biolib._internal.data_record.remote_storage_endpoint import DataRecordRemoteStorageEndpoint
-from biolib._internal.http_client import HttpClient
+from biolib import api, utils
+from biolib._internal.file_utils import get_files_and_size_of_directory, get_iterable_zip_stream
 from biolib.api import client as api_client
-from biolib.biolib_api_client import AppGetResponse
-from biolib.biolib_binary_format import LazyLoadedFile
-from biolib.biolib_binary_format.utils import RemoteIndexableBuffer
+from biolib.biolib_api_client import AppGetResponse, BiolibApiClient
+from biolib.biolib_api_client.lfs_types import DataRecordVersion, DataRecordVersionInfo
+from biolib.biolib_errors import BioLibError
 from biolib.biolib_logging import logger
-from biolib.utils.app_uri import parse_app_uri
-from biolib.utils.zip.remote_zip import RemoteZip  # type: ignore

-PathFilter = Union[str, Callable[[str], bool]]

-
-class DataRecord:
-    def __init__(self, uri: str):
-        self._uri = uri
-
-    def __repr__(self):
-        return f'DataRecord: {self._uri}'
-
-    @property
-    def uri(self) -> str:
-        return self._uri
-
-    @property
-    def name(self) -> str:
-        uri_parsed = parse_app_uri(self.uri, use_account_as_name_default=False)
-        if not uri_parsed['app_name']:
-            raise ValueError('Expected parameter "uri" to contain resource name')
-
-        return uri_parsed['app_name']
-
-    def list_files(self, path_filter: Optional[PathFilter] = None) -> List[LazyLoadedFile]:
-        app_response: AppGetResponse = api_client.get(path='/app/', params={'uri': self._uri}).json()
-        remote_storage_endpoint = DataRecordRemoteStorageEndpoint(
-            resource_version_uuid=app_response['app_version']['public_id'],
-        )
-        files: List[LazyLoadedFile] = []
-        with RemoteZip(url=remote_storage_endpoint.get_remote_url()) as remote_zip:
-            central_directory = remote_zip.get_central_directory()
-            for file_info in central_directory.values():
-                files.append(self._get_file(remote_storage_endpoint, file_info))
-
-        return self._get_filtered_files(files=files, path_filter=path_filter) if path_filter else files
-
-    def download_zip(self, output_path: str):
-        app_response: AppGetResponse = api_client.get(path='/app/', params={'uri': self._uri}).json()
-        remote_storage_endpoint = DataRecordRemoteStorageEndpoint(
-            resource_version_uuid=app_response['app_version']['public_id'],
-        )
-        HttpClient.request(url=remote_storage_endpoint.get_remote_url(), response_path=output_path)
-
-    def download_files(self, output_dir: str, path_filter: Optional[PathFilter] = None) -> None:
-        filtered_files = self.list_files(path_filter=path_filter)
-
-        if len(filtered_files) == 0:
-            logger.debug('No files to save')
-            return
-
-        for file in filtered_files:
-            file_path = os.path.join(output_dir, file.path)
-            os.makedirs(os.path.dirname(file_path), exist_ok=True)
-            with open(file_path, mode='wb') as file_handle:
-                for chunk in file.get_data_iterator():
-                    file_handle.write(chunk)
-
-    def save_files(self, output_dir: str, path_filter: Optional[PathFilter] = None) -> None:
-        self.download_files(output_dir=output_dir, path_filter=path_filter)
-
-    def update(self, data_path: str) -> None:
-        assert os.path.isdir(data_path), f'The path "{data_path}" is not a directory.'
-        self._uri = lfs.push_large_file_system(lfs_uri=self._uri, input_dir=data_path)
-
-    @staticmethod
-    def create(destination: str, data_path: str, name: Optional[str] = None) -> 'DataRecord':
-        assert os.path.isdir(data_path), f'The path "{data_path}" is not a directory.'
-        record_name = name if name else 'data-record-' + datetime.now().isoformat().split('.')[0].replace(':', '-')
-        record_uri = lfs.create_large_file_system(lfs_uri=f'{destination}/{record_name}')
-        record_version_uri = lfs.push_large_file_system(lfs_uri=record_uri, input_dir=data_path)
-        return DataRecord(uri=record_version_uri)
-
-    @staticmethod
-    def fetch(uri: Optional[str] = None, count: Optional[int] = None) -> List['DataRecord']:
-        max_page_size = 1_000
-        params: Dict[str, Union[str, int]] = {
-            'page_size': str(count or max_page_size),
-            'resource_type': 'data-record',
-        }
-        if uri:
-            uri_parsed = parse_app_uri(uri, use_account_as_name_default=False)
-            params['account_handle'] = uri_parsed['account_handle_normalized']
-            if uri_parsed['app_name_normalized']:
-                params['app_name'] = uri_parsed['app_name_normalized']
-
-        results = api_client.get(path='/apps/', params=params).json()['results']
-        if count is None and len(results) == max_page_size:
-            logger.warning(
-                f'Fetch results exceeded maximum count of {max_page_size}. Some data records might not be fetched.'
-            )
-
-        return [DataRecord(result['resource_uri']) for result in results]
-
-    @staticmethod
-    def _get_file(remote_storage_endpoint: DataRecordRemoteStorageEndpoint, file_info: Dict) -> LazyLoadedFile:
-        local_file_header_signature_bytes = b'\x50\x4b\x03\x04'
-        local_file_header_struct = Struct('<H2sHHHIIIHH')
-        LocalFileHeader = namedtuple(
-            'LocalFileHeader',
-            (
-                'version',
-                'flags',
-                'compression_raw',
-                'mod_time',
-                'mod_date',
-                'crc_32_expected',
-                'compressed_size_raw',
-                'uncompressed_size_raw',
-                'file_name_len',
-                'extra_field_len',
-            ),
-        )
-
-        local_file_header_start = file_info['header_offset'] + len(local_file_header_signature_bytes)
-        local_file_header_end = local_file_header_start + local_file_header_struct.size
-
-        def file_start_func() -> int:
-            local_file_header_response = HttpClient.request(
-                url=remote_storage_endpoint.get_remote_url(),
-                headers={'range': f'bytes={local_file_header_start}-{local_file_header_end - 1}'},
-                timeout_in_seconds=300,
-            )
-            local_file_header = LocalFileHeader._make(
-                local_file_header_struct.unpack(local_file_header_response.content)
-            )
-            file_start: int = (
-                local_file_header_end + local_file_header.file_name_len + local_file_header.extra_field_len
-            )
-            return file_start
-
-        return LazyLoadedFile(
-            buffer=RemoteIndexableBuffer(endpoint=remote_storage_endpoint),
-            length=file_info['file_size'],
-            path=file_info['filename'],
-            start=None,
-            start_func=file_start_func,
-        )
-
-    @staticmethod
-    def _get_filtered_files(files: List[LazyLoadedFile], path_filter: PathFilter) -> List[LazyLoadedFile]:
-        if not (isinstance(path_filter, str) or callable(path_filter)):
-            raise Exception('Expected path_filter to be a string or a function')
-
-        if callable(path_filter):
-            return list(filter(lambda x: path_filter(x.path), files))  # type: ignore
-
-        glob_filter = cast(str, path_filter)
-
-        def _filter_function(file: LazyLoadedFile) -> bool:
-            return fnmatch(file.path, glob_filter)
-
-        return list(filter(_filter_function, files))
+def push_data_record_version(data_record_uuid: str, input_dir: str, chunk_size_in_mb: Optional[int] = None) -> str:
+    BiolibApiClient.assert_is_signed_in(authenticated_action_description='push data to a Data Record')
+
+    if not os.path.isdir(input_dir):
+        raise BioLibError(f'Could not find folder at {input_dir}')
+
+    if os.path.realpath(input_dir) == '/':
+        raise BioLibError('Pushing your root directory is not possible')
+
+    original_working_dir = os.getcwd()
+    os.chdir(input_dir)
+    files_to_zip, data_size_in_bytes = get_files_and_size_of_directory(directory=os.getcwd())
+
+    if data_size_in_bytes > 4_500_000_000_000:
+        raise BioLibError('Attempted to push directory with a size larger than the limit of 4.5 TB')
+
+    min_chunk_size_bytes = 10_000_000
+    chunk_size_in_bytes: int
+    if chunk_size_in_mb:
+        chunk_size_in_bytes = chunk_size_in_mb * 1_000_000  # Convert megabytes to bytes
+        if chunk_size_in_bytes < min_chunk_size_bytes:
+            logger.warning('Specified chunk size is too small, using minimum of 10 MB instead.')
+            chunk_size_in_bytes = min_chunk_size_bytes
+    else:
+        # Calculate chunk size based on max chunk count of 10_000, using 9_000 to be on the safe side
+        chunk_size_in_bytes = max(min_chunk_size_bytes, int(data_size_in_bytes / 9_000))
+
+    data_size_in_mb = round(data_size_in_bytes / 10**6)
+    print(f'Zipping {len(files_to_zip)} files, in total ~{data_size_in_mb}mb of data')
+
+    response = api.client.post(path='/lfs/versions/', data={'resource_uuid': data_record_uuid})
+    data_record_version: DataRecordVersion = response.json()
+    iterable_zip_stream = get_iterable_zip_stream(files=files_to_zip, chunk_size=chunk_size_in_bytes)
+
+    multipart_uploader = utils.MultiPartUploader(
+        use_process_pool=True,
+        get_presigned_upload_url_request=dict(
+            headers=None,
+            requires_biolib_auth=True,
+            path=f"/lfs/versions/{data_record_version['uuid']}/presigned_upload_url/",
+        ),
+        complete_upload_request=dict(
+            headers=None,
+            requires_biolib_auth=True,
+            path=f"/lfs/versions/{data_record_version['uuid']}/complete_upload/",
+        ),
+    )
+
+    multipart_uploader.upload(payload_iterator=iterable_zip_stream, payload_size_in_bytes=data_size_in_bytes)
+    os.chdir(original_working_dir)
+    logger.info(f"Successfully pushed a new Data Record version '{data_record_version['uri']}'")
+    return data_record_version['uri']
+
+
+def get_data_record_state_from_uri(uri) -> 'DataRecordVersionInfo':
+    app_response: AppGetResponse = api_client.get(path='/app/', params={'uri': uri}).json()
+    return DataRecordVersionInfo(
+        resource_uri=app_response['app_version']['app_uri'],
+        resource_uuid=app_response['app']['public_id'],
+        resource_version_uuid=app_response['app_version']['public_id'],
+    )
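`push_data_record_version` sizes upload chunks so the part count stays below a multipart-upload limit of 10,000 parts (it targets 9,000 to leave headroom, with a 10 MB floor). A worked sketch of that arithmetic, using assumed example figures:

    # Mirrors the chunk-size logic above; the directory size is a made-up example.
    data_size_in_bytes = 180_000_000_000  # ~180 GB directory
    min_chunk_size_bytes = 10_000_000     # 10 MB floor
    chunk_size_in_bytes = max(min_chunk_size_bytes, int(data_size_in_bytes / 9_000))
    print(chunk_size_in_bytes)            # 20_000_000 -> about 9_000 parts of 20 MB each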
biolib/_internal/data_record/remote_storage_endpoint.py CHANGED
@@ -3,7 +3,7 @@ from datetime import datetime, timedelta
 from urllib.parse import urlparse

 from biolib.api import client as api_client
-from biolib.biolib_api_client.lfs_types import LargeFileSystemVersion
+from biolib.biolib_api_client.lfs_types import DataRecordVersion
 from biolib.biolib_binary_format.utils import RemoteEndpoint
 from biolib.biolib_logging import logger
 from biolib.typing_utils import Optional
@@ -17,7 +17,7 @@ class DataRecordRemoteStorageEndpoint(RemoteEndpoint):

     def get_remote_url(self) -> str:
         if not self._presigned_url or not self._expires_at or datetime.utcnow() > self._expires_at:
-            lfs_version: LargeFileSystemVersion = api_client.get(
+            lfs_version: DataRecordVersion = api_client.get(
                 path=f'/lfs/versions/{self._resource_version_uuid}/',
             ).json()

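The endpoint caches its presigned download URL and only fetches a fresh one when the URL is missing or past its expiry timestamp. The same pattern, sketched standalone with hypothetical names:

    from datetime import datetime, timedelta

    class CachedUrl:
        # Minimal sketch of the expiry check used by get_remote_url above.
        def __init__(self, fetch_url, ttl_minutes=30):
            self._fetch_url = fetch_url  # callable returning a fresh presigned URL
            self._ttl = timedelta(minutes=ttl_minutes)
            self._url = None
            self._expires_at = None

        def get(self):
            if not self._url or not self._expires_at or datetime.utcnow() > self._expires_at:
                self._url = self._fetch_url()
                self._expires_at = datetime.utcnow() + self._ttl
            return self._url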
biolib/_internal/file_utils.py ADDED
@@ -0,0 +1,77 @@
+import io
+import os
+import zipfile as zf
+from pathlib import Path
+
+from biolib.typing_utils import Iterator, List, Tuple
+
+
+def get_files_and_size_of_directory(directory: str) -> Tuple[List[str], int]:
+    data_size = 0
+    file_list: List[str] = []
+
+    for path, _, files in os.walk(directory):
+        for file in files:
+            file_path = os.path.join(path, file)
+            if os.path.islink(file_path):
+                continue  # skip symlinks
+
+            relative_file_path = file_path[len(directory) + 1 :]  # +1 to remove starting slash
+            file_list.append(relative_file_path)
+            data_size += os.path.getsize(file_path)
+
+    return file_list, data_size
+
+
+def get_iterable_zip_stream(files: List[str], chunk_size: int) -> Iterator[bytes]:
+    class ChunkedIOBuffer(io.RawIOBase):
+        def __init__(self, chunk_size: int):
+            super().__init__()
+            self.chunk_size = chunk_size
+            self.tmp_data = bytearray()
+
+        def get_buffer_size(self):
+            return len(self.tmp_data)
+
+        def read_chunk(self):
+            chunk = bytes(self.tmp_data[: self.chunk_size])
+            self.tmp_data = self.tmp_data[self.chunk_size :]
+            return chunk
+
+        def write(self, data):
+            data_length = len(data)
+            self.tmp_data += data
+            return data_length
+
+    # create chunked buffer to hold data temporarily
+    io_buffer = ChunkedIOBuffer(chunk_size)
+
+    # create zip writer that will write to the io buffer
+    zip_writer = zf.ZipFile(io_buffer, mode='w')  # type: ignore
+
+    for file_path in files:
+        # generate zip info and prepare zip pointer for writing
+        z_info = zf.ZipInfo.from_file(file_path)
+        zip_pointer = zip_writer.open(z_info, mode='w')
+        if Path(file_path).is_file():
+            # read file chunk by chunk
+            with open(file_path, 'br') as file_pointer:
+                while True:
+                    chunk = file_pointer.read(chunk_size)
+                    if len(chunk) == 0:
+                        break
+                    # write the chunk to the zip
+                    zip_pointer.write(chunk)
+                    # if writing the chunk caused us to go over chunk_size, flush it
+                    if io_buffer.get_buffer_size() > chunk_size:
+                        yield io_buffer.read_chunk()
+
+        zip_pointer.close()
+
+    # flush any remaining data in the stream (e.g. zip file meta data)
+    zip_writer.close()
+    while True:
+        chunk = io_buffer.read_chunk()
+        if len(chunk) == 0:
+            break
+        yield chunk
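`get_iterable_zip_stream` zips the listed files on the fly and yields fixed-size byte chunks, so memory use stays near one chunk regardless of archive size. A hedged usage sketch writing the stream to disk (directory and output path are hypothetical; the stream opens the listed paths relative to the current working directory):

    import os
    from biolib._internal.file_utils import get_files_and_size_of_directory, get_iterable_zip_stream

    directory = os.path.abspath('./input-data')
    files, size_in_bytes = get_files_and_size_of_directory(directory)

    os.chdir(directory)  # paths returned above are relative to the directory
    with open('/tmp/archive.zip', 'wb') as zip_file:
        for chunk in get_iterable_zip_stream(files=files, chunk_size=10_000_000):
            zip_file.write(chunk)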
biolib/_internal/lfs/__init__.py ADDED
@@ -0,0 +1 @@
+from .cache import prune_lfs_cache
biolib/_internal/push_application.py CHANGED
@@ -6,12 +6,12 @@ import rich.progress
 import yaml

 from biolib import api, utils
+from biolib._internal.file_utils import get_files_and_size_of_directory, get_iterable_zip_stream
 from biolib.biolib_api_client import BiolibApiClient
 from biolib.biolib_api_client.biolib_app_api import BiolibAppApi
 from biolib.biolib_docker_client import BiolibDockerClient
 from biolib.biolib_errors import BioLibError
 from biolib.biolib_logging import logger
-from biolib.lfs.utils import get_files_and_size_of_directory, get_iterable_zip_stream
 from biolib.typing_utils import Iterable, Optional, Set, TypedDict

 REGEX_MARKDOWN_INLINE_IMAGE = re.compile(r'!\[(?P<alt>.*)\]\((?P<src>.*)\)')
biolib/_internal/runtime.py CHANGED
@@ -1,8 +1,4 @@
-import json
-import re
-
-from biolib import api
-from biolib.typing_utils import Optional, TypedDict, cast
+from biolib.typing_utils import TypedDict


 class RuntimeJobDataDict(TypedDict):
@@ -21,76 +17,3 @@ class BioLibRuntimeNotRecognizedError(BioLibRuntimeError):
     def __init__(self, message='The runtime is not recognized as a BioLib app'):
         self.message = message
         super().__init__(self.message)
-
-
-class Runtime:
-    _job_data: Optional[RuntimeJobDataDict] = None
-
-    @staticmethod
-    def check_is_environment_biolib_app() -> bool:
-        return bool(Runtime._try_to_get_job_data())
-
-    @staticmethod
-    def get_job_id() -> str:
-        return Runtime._get_job_data()['job_uuid']
-
-    @staticmethod
-    def get_job_auth_token() -> str:
-        return Runtime._get_job_data()['job_auth_token']
-
-    @staticmethod
-    def get_job_requested_machine() -> str:
-        return Runtime._get_job_data()['job_requested_machine']
-
-    @staticmethod
-    def get_app_uri() -> str:
-        return Runtime._get_job_data()['app_uri']
-
-    @staticmethod
-    def get_secret(secret_name: str) -> bytes:
-        assert re.match(
-            '^[a-zA-Z0-9_-]*$', secret_name
-        ), 'Secret name can only contain alphanumeric characters and dashes or underscores '
-        try:
-            with open(f'/biolib/secrets/{secret_name}', 'rb') as file:
-                return file.read()
-        except BaseException as error:
-            raise BioLibRuntimeError(f'Unable to get system secret: {secret_name}') from error
-
-    @staticmethod
-    def set_main_result_prefix(result_prefix: str) -> None:
-        job_data = Runtime._get_job_data()
-        api.client.patch(
-            data={'result_name_prefix': result_prefix},
-            headers={'Job-Auth-Token': job_data['job_auth_token']},
-            path=f"/jobs/{job_data['job_uuid']}/main_result/",
-        )
-
-    @staticmethod
-    def create_result_note(note: str) -> None:
-        job_id = Runtime.get_job_id()
-        # Note: Authentication is added by app caller proxy in compute node
-        api.client.post(data={'note': note}, path=f'/jobs/{job_id}/notes/')
-
-    @staticmethod
-    def _try_to_get_job_data() -> Optional[RuntimeJobDataDict]:
-        if not Runtime._job_data:
-            try:
-                with open('/biolib/secrets/biolib_system_secret') as file:
-                    job_data: RuntimeJobDataDict = json.load(file)
-            except BaseException:
-                return None
-
-            if not job_data['version'].startswith('1.'):
-                raise BioLibRuntimeError(f"Unexpected system secret version {job_data['version']} expected 1.x.x")
-
-            Runtime._job_data = job_data
-
-        return cast(RuntimeJobDataDict, Runtime._job_data)
-
-    @staticmethod
-    def _get_job_data() -> RuntimeJobDataDict:
-        job_data = Runtime._try_to_get_job_data()
-        if not job_data:
-            raise BioLibRuntimeNotRecognizedError() from None
-        return job_data
biolib/_runtime/runtime.py ADDED
@@ -0,0 +1,79 @@
+from biolib import api
+from biolib._internal.runtime import BioLibRuntimeError, BioLibRuntimeNotRecognizedError, RuntimeJobDataDict
+from biolib.typing_utils import cast, Optional as _Optional
+
+import json
+import re
+
+
+class Runtime:
+    _job_data: _Optional[RuntimeJobDataDict] = None
+
+    @staticmethod
+    def check_is_environment_biolib_app() -> bool:
+        return bool(Runtime._try_to_get_job_data())
+
+    @staticmethod
+    def get_job_id() -> str:
+        return Runtime._get_job_data()['job_uuid']
+
+    @staticmethod
+    def get_job_auth_token() -> str:
+        return Runtime._get_job_data()['job_auth_token']
+
+    @staticmethod
+    def get_job_requested_machine() -> str:
+        return Runtime._get_job_data()['job_requested_machine']
+
+    @staticmethod
+    def get_app_uri() -> str:
+        return Runtime._get_job_data()['app_uri']
+
+    @staticmethod
+    def get_secret(secret_name: str) -> bytes:
+        assert re.match(
+            '^[a-zA-Z0-9_-]*$', secret_name
+        ), 'Secret name can only contain alphanumeric characters and dashes or underscores '
+        try:
+            with open(f'/biolib/secrets/{secret_name}', 'rb') as file:
+                return file.read()
+        except BaseException as error:
+            raise BioLibRuntimeError(f'Unable to get system secret: {secret_name}') from error
+
+    @staticmethod
+    def set_main_result_prefix(result_prefix: str) -> None:
+        job_data = Runtime._get_job_data()
+        api.client.patch(
+            data={'result_name_prefix': result_prefix},
+            headers={'Job-Auth-Token': job_data['job_auth_token']},
+            path=f"/jobs/{job_data['job_uuid']}/main_result/",
+        )
+
+    @staticmethod
+    def create_result_note(note: str) -> None:
+        job_id = Runtime.get_job_id()
+        # Note: Authentication is added by app caller proxy in compute node
+        api.client.post(data={'note': note}, path=f'/jobs/{job_id}/notes/')
+
+    @staticmethod
+    def _try_to_get_job_data() -> _Optional[RuntimeJobDataDict]:
+        if not Runtime._job_data:
+            try:
+                with open('/biolib/secrets/biolib_system_secret') as file:
+                    job_data: RuntimeJobDataDict = json.load(file)
+            except BaseException:
+                return None
+
+            if not job_data['version'].startswith('1.'):
+                raise BioLibRuntimeError(f"Unexpected system secret version {job_data['version']} expected 1.x.x")
+
+            Runtime._job_data = job_data
+
+        return cast(RuntimeJobDataDict, Runtime._job_data)
+
+    @staticmethod
+    def _get_job_data() -> RuntimeJobDataDict:
+        job_data = Runtime._try_to_get_job_data()
+        if not job_data:
+            raise BioLibRuntimeNotRecognizedError() from None
+        return job_data
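The `Runtime` class moves unchanged from `biolib._internal.runtime` to the new `biolib._runtime.runtime` module; it lazily reads `/biolib/secrets/biolib_system_secret` once and caches the result. A sketch of typical in-app usage, only meaningful inside a running BioLib job (the prefix string is a hypothetical example):

    from biolib._runtime.runtime import Runtime

    if Runtime.check_is_environment_biolib_app():
        print(Runtime.get_job_id())
        Runtime.set_main_result_prefix('my-prefix')  # renames the job's main result
    # Outside a BioLib app, the getters raise BioLibRuntimeNotRecognizedError.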
biolib/biolib_api_client/api_client.py CHANGED
@@ -6,7 +6,7 @@ import os
 from datetime import datetime, timezone
 from json.decoder import JSONDecodeError

-from biolib._internal.runtime import Runtime
+from biolib._runtime.runtime import Runtime
 from biolib._internal.http_client import HttpClient
 from biolib.typing_utils import Optional
 from biolib.biolib_errors import BioLibError
biolib/biolib_api_client/app_types.py CHANGED
@@ -16,6 +16,7 @@ class AppVersion(AppVersionSlim):
     source_code_license: str
     stdout_render_type: Literal['text', 'markdown']
     main_output_file: Optional[str]
+    app_uri: str


 class App(TypedDict):
biolib/biolib_api_client/lfs_types.py CHANGED
@@ -1,13 +1,19 @@
 from biolib.typing_utils import TypedDict


-class LargeFileSystemVersion(TypedDict):
+class DataRecordVersion(TypedDict):
     presigned_download_url: str
     size_bytes: int
     uri: str
     uuid: str


-class LargeFileSystem(TypedDict):
+class DataRecordInfo(TypedDict):
     uri: str
     uuid: str
+
+
+class DataRecordVersionInfo(TypedDict):
+    resource_uri: str
+    resource_uuid: str
+    resource_version_uuid: str
biolib/cli/data_record.py CHANGED
@@ -1,9 +1,11 @@
+import json
 import logging
 import os
+from typing import Dict, List

 import click

-from biolib._internal.data_record import DataRecord
+from biolib._data_record.data_record import DataRecord
 from biolib.biolib_logging import logger, logger_no_user_data
 from biolib.typing_utils import Optional

@@ -15,11 +17,18 @@ def data_record() -> None:


 @data_record.command(help='Create a Data Record')
-@click.option('--destination', type=str, required=True)
+@click.argument('uri', required=True)
+@click.option('--data-path', required=True, type=click.Path(exists=True))
+def create(uri: str, data_path: str) -> None:
+    DataRecord.create(destination=uri, data_path=data_path)
+
+
+@data_record.command(help='Update a Data Record')
+@click.argument('uri', required=True)
 @click.option('--data-path', required=True, type=click.Path(exists=True))
-@click.option('--name', type=str, required=False)
-def create(destination: str, data_path: str, name: Optional[str] = None) -> None:
-    DataRecord.create(destination, data_path, name)
+@click.option('--chunk-size', default=None, required=False, type=click.INT, help='The size of each chunk (In MB)')
+def update(uri: str, data_path: str, chunk_size: Optional[int]) -> None:
+    DataRecord.get_by_uri(uri=uri).update(data_path=data_path, chunk_size_in_mb=chunk_size)


 @data_record.command(help='Download files from a Data Record')
@@ -27,7 +36,7 @@ def create(destination: str, data_path: str, name: Optional[str] = None) -> None
 @click.option('--file', required=False, type=str)
 @click.option('--path-filter', required=False, type=str, hide_input=True)
 def download(uri: str, file: Optional[str], path_filter: Optional[str]) -> None:
-    record = DataRecord(uri=uri)
+    record = DataRecord.get_by_uri(uri=uri)
     if file is not None:
         try:
             file_obj = [file_obj for file_obj in record.list_files() if file_obj.path == file][0]
@@ -41,3 +50,30 @@ def download(uri: str, file: Optional[str], path_filter: Optional[str]) -> None:
     else:
         assert not os.path.exists(record.name), f'Directory with name {record.name} already exists in current directory'
         record.save_files(output_dir=record.name, path_filter=path_filter)
+
+
+@data_record.command(help='Describe a Data Record')
+@click.argument('uri', required=True)
+@click.option('--json', 'output_as_json', is_flag=True, default=False, required=False, help='Format output as JSON')
+def describe(uri: str, output_as_json: bool) -> None:
+    record = DataRecord.get_by_uri(uri)
+    files_info: List[Dict] = []
+    total_size_in_bytes = 0
+    for file in record.list_files():
+        files_info.append({'path': file.path, 'size_bytes': file.length})
+        total_size_in_bytes += file.length
+
+    if output_as_json:
+        print(
+            json.dumps(
+                obj={'uri': record.uri, 'size_bytes': total_size_in_bytes, 'files': files_info},
+                indent=4,
+            )
+        )
+    else:
+        print(f'Data Record {record.uri}\ntotal {total_size_in_bytes} bytes\n')
+        print('size bytes path')
+        for file_info in files_info:
+            size_string = str(file_info['size_bytes'])
+            leading_space_string = ' ' * (10 - len(size_string))
+            print(f"{leading_space_string}{size_string} {file_info['path']}")
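The plain-text branch of `describe` right-aligns sizes in a 10-character column by hand; `str.rjust` is the equivalent built-in, shown here for comparison:

    size_string = str(123456)
    assert ' ' * (10 - len(size_string)) + size_string == size_string.rjust(10)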
biolib/cli/lfs.py CHANGED
@@ -7,9 +7,9 @@ from typing import Dict, List
 import click

 from biolib import biolib_errors
-from biolib._internal.data_record import DataRecord
+from biolib._data_record.data_record import DataRecord
+from biolib._internal.lfs import prune_lfs_cache
 from biolib.biolib_logging import logger, logger_no_user_data
-from biolib.lfs import create_large_file_system, prune_lfs_cache, push_large_file_system
 from biolib.typing_utils import Optional


@@ -21,9 +21,10 @@ def lfs() -> None:
 @lfs.command(help='Create a Large File System')
 @click.argument('uri', required=True)
 def create(uri: str) -> None:
+    logger.warning('This command is deprecated, please use "biolib data-record create" instead.')
     logger.configure(default_log_level=logging.INFO)
     logger_no_user_data.configure(default_log_level=logging.INFO)
-    create_large_file_system(lfs_uri=uri)
+    DataRecord.create(destination=uri)


 @lfs.command(help='Push a new version of a Large File System')
@@ -31,10 +32,11 @@ def create(uri: str) -> None:
 @click.option('--path', required=True, type=click.Path(exists=True))
 @click.option('--chunk-size', default=None, required=False, type=click.INT, help='The size of each chunk (In MB)')
 def push(uri: str, path: str, chunk_size: Optional[int]) -> None:
+    logger.warning('This command is deprecated, please use "biolib data-record update" instead.')
     logger.configure(default_log_level=logging.INFO)
     logger_no_user_data.configure(default_log_level=logging.INFO)
     try:
-        push_large_file_system(lfs_uri=uri, input_dir=path, chunk_size_in_mb=chunk_size)
+        DataRecord.get_by_uri(uri=uri).update(data_path=path, chunk_size_in_mb=chunk_size)
     except biolib_errors.BioLibError as error:
         print(f'An error occurred:\n{error.message}', file=sys.stderr)
         exit(1)
@@ -44,10 +46,11 @@ def push(uri: str, path: str, chunk_size: Optional[int]) -> None:
 @click.argument('uri', required=True)
 @click.option('--file-path', required=True, type=str)
 def download_file(uri: str, file_path: str) -> None:
+    logger.warning('This command is deprecated, please use "biolib data-record download" instead.')
     logger.configure(default_log_level=logging.INFO)
     logger_no_user_data.configure(default_log_level=logging.INFO)
     try:
-        record = DataRecord(uri=uri)
+        record = DataRecord.get_by_uri(uri=uri)
         try:
             file_obj = [file_obj for file_obj in record.list_files() if file_obj.path == file_path][0]
         except IndexError:
@@ -66,7 +69,8 @@ def download_file(uri: str, file_path: str) -> None:
 @click.argument('uri', required=True)
 @click.option('--json', 'output_as_json', is_flag=True, default=False, required=False, help='Format output as JSON')
 def describe(uri: str, output_as_json: bool) -> None:
-    data_record = DataRecord(uri)
+    logger.warning('This command is deprecated, please use "biolib data-record describe" instead.')
+    data_record = DataRecord.get_by_uri(uri)
     files_info: List[Dict] = []
     total_size_in_bytes = 0
     for file in data_record.list_files():
biolib/runtime/__init__.py CHANGED
@@ -1,5 +1,5 @@
 import warnings
-from biolib.sdk import Runtime as _Runtime
+from biolib._runtime.runtime import Runtime as _Runtime


 def set_main_result_prefix(result_prefix: str) -> None:
biolib/sdk/__init__.py CHANGED
@@ -1,12 +1,12 @@
 # Imports to hide and use as private internal utils
+from biolib._data_record.data_record import DataRecord as _DataRecord
 from biolib._internal.push_application import push_application as _push_application
 from biolib._internal.push_application import set_app_version_as_active as _set_app_version_as_active
 from biolib.app import BioLibApp as _BioLibApp
 from biolib.typing_utils import Optional as _Optional

-# Imports to expose as public API
-from biolib._internal.data_record import DataRecord
-from biolib._internal.runtime import Runtime
+# Classes to expose as public API
+from biolib._runtime.runtime import Runtime


 def push_app_version(uri: str, path: str) -> _BioLibApp:
@@ -42,5 +42,9 @@ def get_app_version_pytest_plugin(app_version: _BioLibApp):
     return AppVersionFixturePlugin(app_version)


-def create_data_record(destination: str, data_path: str, name: _Optional[str] = None) -> DataRecord:
-    return DataRecord.create(destination, data_path, name)
+def create_data_record(destination: str, data_path: str, name: _Optional[str] = None) -> _DataRecord:
+    if name:
+        destination_with_name = f"{destination}/{name}"
+    else:
+        destination_with_name = destination
+    return _DataRecord.create(destination_with_name, data_path)
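`sdk.create_data_record` keeps its old three-argument signature but now simply folds `name` into the destination before delegating to `_DataRecord.create`. Equivalent calls, with hypothetical account and path names:

    from biolib import sdk

    # These two calls create the same record:
    sdk.create_data_record(destination='my-account', data_path='./data', name='my-record')
    sdk.create_data_record(destination='my-account/my-record', data_path='./data')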
pybiolib-1.1.2038.dist-info/METADATA → pybiolib-1.1.2097.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: pybiolib
-Version: 1.1.2038
+Version: 1.1.2097
 Summary: BioLib Python Client
 Home-page: https://github.com/biolib
 License: MIT
pybiolib-1.1.2038.dist-info/RECORD → pybiolib-1.1.2097.dist-info/RECORD RENAMED
@@ -1,32 +1,37 @@
 LICENSE,sha256=F2h7gf8i0agDIeWoBPXDMYScvQOz02pAWkKhTGOHaaw,1067
 README.md,sha256=_IH7pxFiqy2bIAmaVeA-iVTyUwWRjMIlfgtUbYTtmls,368
-biolib/__init__.py,sha256=nfZvVkrHZLvjvvlAvFzhvem9NMfqgmw8NWaCH9HGzew,4045
+biolib/__init__.py,sha256=yX8w8bDiY7CIxfKHFRF0U1hhwgCCIXtVr18Td5iNLp8,4135
+biolib/_data_record/data_record.py,sha256=jUeCQjnVQLNLmlXO3rREEUnjXjOYuaQjBO7R66P6wFU,8909
 biolib/_internal/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-biolib/_internal/data_record/__init__.py,sha256=1Bk303i3rFet9veS56fIsrBYtT5X3n9vcsYMA6T6c5o,36
-biolib/_internal/data_record/data_record.py,sha256=NLzeyzqtzB9QOVUDcEiOn2WbMdMijbjZGYgy_p592_c,7372
-biolib/_internal/data_record/remote_storage_endpoint.py,sha256=hILu0TmFx-ZyDYbWPC4QEPogP0RRVEqwgJc8OEHYp50,1742
+biolib/_internal/data_record/__init__.py,sha256=0T0CV6PfKc8itjMu-48sCJjcZQEzXl1ZLBqG_LjJTqQ,82
+biolib/_internal/data_record/data_record.py,sha256=D0BaC8WhnkM564eKUI69hVHUkKY1In0cyfpjxYyWk18,3363
+biolib/_internal/data_record/remote_storage_endpoint.py,sha256=eCptuZ4DMAPnaNCVDvpWXwXGI6Jac9U1N5dqU8Cj95Q,1732
+biolib/_internal/file_utils.py,sha256=4jT6j7bB21c0JNn5BfnyWQib_zt0CVtJ_TiOFOStRcE,2604
 biolib/_internal/fuse_mount/__init__.py,sha256=B_tM6RM2dBw-vbpoHJC4X3tOAaN1H2RDvqYJOw3xFwg,55
 biolib/_internal/fuse_mount/experiment_fuse_mount.py,sha256=08aUdEq_bvqLBft_gSLjOClKDy5sBnMts1RfJf7AP_U,7012
 biolib/_internal/http_client.py,sha256=DdooXei93JKGYGV4aQmzue_oFzvHkozg2UCxgk9dfDM,5081
+biolib/_internal/lfs/__init__.py,sha256=gSWo_xg61UniYgD7yNYxeT4I9uaXBCBSi3_nmZjnPpE,35
+biolib/_internal/lfs/cache.py,sha256=pQS2np21rdJ6I3DpoOutnzPHpLOZgUIS8TMltUJk_k4,2226
 biolib/_internal/libs/__init__.py,sha256=Jdf4tNPqe_oIIf6zYml6TiqhL_02Vyqwge6IELrAFhw,98
 biolib/_internal/libs/fusepy/__init__.py,sha256=AWDzNFS-XV_5yKb0Qx7kggIhPzq1nj_BZS5y2Nso08k,41944
-biolib/_internal/push_application.py,sha256=H1PGNtVJ0vRC0li39gFMpPpjm6QeZ8Ob-7cLkLmxS_Y,10009
-biolib/_internal/runtime.py,sha256=BnFvRWYnxPXCgOtfxupN255Zxx9Gw6oPZyzUIGODw3k,3060
+biolib/_internal/push_application.py,sha256=8P7eXvySn7CRp5XBDkO3xjTGixS8g7-jD-_iwzM_XDI,10020
+biolib/_internal/runtime.py,sha256=9pZ3s3L7LGxdqOgnHh1KK3Jjyn_9MjhQmKHI-6hMT3U,448
 biolib/_internal/utils/__init__.py,sha256=p5vsIFyu-zYqBgdSMfwW9NC_jk7rXvvCbV4Bzd3As7c,630
+biolib/_runtime/runtime.py,sha256=zy9HrE4X5hBqm8doUHkckyflquSBDSXV3REhT2MQGas,2767
 biolib/api/__init__.py,sha256=mQ4u8FijqyLzjYMezMUUbbBGNB3iFmkNdjXnWPZ7Jlw,138
 biolib/api/client.py,sha256=9MD1qI52BnRC_QSydFGjyFquwFw0R9dkDfUrjUouuHQ,3490
 biolib/app/__init__.py,sha256=cdPtcfb_U-bxb9iSL4fCEq2rpD9OjkyY4W-Zw60B0LI,37
 biolib/app/app.py,sha256=8AvPYL1W2wxQ7t7BB2KeVU2WPrm3UL6vVuHPGs8g9L0,8388
 biolib/app/search_apps.py,sha256=K4a41f5XIWth2BWI7OffASgIsD0ko8elCax8YL2igaY,1470
 biolib/biolib_api_client/__init__.py,sha256=E5EMa19wJoblwSdQPYrxc_BtIeRsAuO0L_jQweWw-Yk,182
-biolib/biolib_api_client/api_client.py,sha256=J03jRVvod1bgwwAZ3BZVKlUSJi43-ev2DUB0j63GZpc,7189
-biolib/biolib_api_client/app_types.py,sha256=lm_mZ5knl-70eVB5Zj03jSMrPN1ERqu_5ofzcuSUwN4,2425
+biolib/biolib_api_client/api_client.py,sha256=krlSRmmAwtdMMyN1XzQhh1gihB1ERSIVslWQ-dqI1yU,7188
+biolib/biolib_api_client/app_types.py,sha256=FxSr4UqfnMhLe34p8bm02wsC3g1Jz8iaing5tRKDOQI,2442
 biolib/biolib_api_client/auth.py,sha256=kjm0ZHnH3I8so3su2sZbBxNHYp-ZUdrZ5lwQ0K36RSw,949
 biolib/biolib_api_client/biolib_app_api.py,sha256=DndlVxrNTes6DOaWyMINLGZQCRMWVvR7gwt5HVlyf5Y,4240
 biolib/biolib_api_client/biolib_job_api.py,sha256=IpFahcRzm7GNy8DJ-XHYe-x7r4Voba8o22IXw5puHn8,6782
 biolib/biolib_api_client/common_types.py,sha256=RH-1KNHqUF-EkTpfPOSTt5Mq1GPdfju_cqXDesscO1I,123
 biolib/biolib_api_client/job_types.py,sha256=Dl4NhU2xpgpXV-7YIoDf6WL63SLR5bni55OX8x5539M,1300
-biolib/biolib_api_client/lfs_types.py,sha256=xaGjE-yUyNVM3LyKdslJn5ZXWp6_kVCd4o-ch8Czfm4,227
+biolib/biolib_api_client/lfs_types.py,sha256=joZWP6-sa5_Ug_6xIp5fHAgEo_bqLE3rbleQocZtDcg,339
 biolib/biolib_api_client/user_state.py,sha256=XcgWV-MgVk88mIlMmnu8yHxMu8OCaw8o0tk7TVo5Hcg,637
 biolib/biolib_binary_format/__init__.py,sha256=HMl5SdX_VUWE4OQzi4Jf_yFvC7b0bSPOGPHYi9dWM2Q,185
 biolib/biolib_binary_format/base_bbf_package.py,sha256=vxRV4iKy0dXeDOlFWnMFI0hGnDBYDH5Cgh5gAfuObt8,959
@@ -46,10 +51,10 @@ biolib/biolib_errors.py,sha256=5m4lK2l39DafpoXBImEBD4EPH3ayXBX0JgtPzmGClow,689
 biolib/biolib_logging.py,sha256=J3E5H_LL5k6ZUim2C8gqN7E6lCBZMTpO4tnMpOPwG9U,2854
 biolib/cli/__init__.py,sha256=0v3c_J-U0k46c5ZWeQjLG_kTaKDJm81LBxQpDO2B_aI,1286
 biolib/cli/auth.py,sha256=rpWGmXs6Fz6CGrO9K8ibPRszOdXG78Vig_boKaVCD9A,2082
-biolib/cli/data_record.py,sha256=piN3QUbRAkMi4wpayghN4unFfuiNE5VCjI1gag4d8qg,1725
+biolib/cli/data_record.py,sha256=oDy8U6mv-h-hbeMihXRzVEvM-WrGQq6oBiBl3xDRaXs,3220
 biolib/cli/download_container.py,sha256=HIZVHOPmslGE5M2Dsp9r2cCkAEJx__vcsDz5Wt5LRos,483
 biolib/cli/init.py,sha256=wQOfii_au-d30Hp7DdH-WVw-WVraKvA_zY4za1w7DE8,821
-biolib/cli/lfs.py,sha256=S9Ov-HWwtpMeRcwclh0qItnzviOaQL4aI0nnaCcZ_MM,3771
+biolib/cli/lfs.py,sha256=z2qHUwink85mv9yDgifbVKkVwuyknGhMDTfly_gLKJM,4151
 biolib/cli/push.py,sha256=TFi7O9tJ3zFe0VmtVTV3Vh9_xIMHnrc41xxcaBKU46g,813
 biolib/cli/run.py,sha256=BbvXLQ-XibjQ71Y2d4URMH_8dflNVwM0i3TIWhw_u_c,1634
 biolib/cli/runtime.py,sha256=Xv-nrma5xX8NidWcvbUKcUvuN5TCarZa4A8mPVmF-z0,361
@@ -92,11 +97,8 @@ biolib/jobs/__init__.py,sha256=aIb2H2DHjQbM2Bs-dysFijhwFcL58Blp0Co0gimED3w,32
 biolib/jobs/job.py,sha256=npnARoP408SXD2UqyzFRJYdEJsP_gHoBh2xQkNegYqg,18884
 biolib/jobs/job_result.py,sha256=rALHiKYNaC9lHi_JJqBob1RubzNLwG9Z386kwRJjd2M,5885
 biolib/jobs/types.py,sha256=qhadtH2KDC2WUOOqPiwke0YgtQY4FtuB71Stekq1k48,970
-biolib/lfs/__init__.py,sha256=Qv8vdYeK43JecT4SsE93ZYE2VmNiZENdNpW8P9-omxs,115
-biolib/lfs/cache.py,sha256=pQS2np21rdJ6I3DpoOutnzPHpLOZgUIS8TMltUJk_k4,2226
-biolib/lfs/utils.py,sha256=HSs7F2wXklYhhv5tabfaeC5noXJyxRjbGD5IhWOVdxs,5918
-biolib/runtime/__init__.py,sha256=x1Ivydtu9TFTaX-Cofg_kGA-TI0zLon-ccrFiiVgBok,492
-biolib/sdk/__init__.py,sha256=wkQs7ltIpYK9Xw0-FLLacblemmlNGz8J2UmlM0noGSs,1749
+biolib/runtime/__init__.py,sha256=Fg2ZIAmUegurLKagpBNfRgLcOwR2VZSmXQpb-ryRwI0,505
+biolib/sdk/__init__.py,sha256=qJ_V_Edxolzi4VBQCrvem5lYIkJ0FVH3VZepSDuXjTc,1895
 biolib/tables.py,sha256=acH7VjwAbadLo8P84FSnKEZxCTVsF5rEg9VPuxElNs8,872
 biolib/templates/__init__.py,sha256=Yx62sSyDCDesRQDQgmbDsLpfgEh93fWE8r9u4g2azXk,36
 biolib/templates/example_app.py,sha256=EB3E3RT4SeO_ii5nVQqJpi5KDGNE_huF1ub-e5ZFveE,715
@@ -109,8 +111,8 @@ biolib/utils/cache_state.py,sha256=u256F37QSRIVwqKlbnCyzAX4EMI-kl6Dwu6qwj-Qmag,3
 biolib/utils/multipart_uploader.py,sha256=XvGP1I8tQuKhAH-QugPRoEsCi9qvbRk-DVBs5PNwwJo,8452
 biolib/utils/seq_util.py,sha256=jC5WhH63FTD7SLFJbxQGA2hOt9NTwq9zHl_BEec1Z0c,4907
 biolib/utils/zip/remote_zip.py,sha256=0wErYlxir5921agfFeV1xVjf29l9VNgGQvNlWOlj2Yc,23232
-pybiolib-1.1.2038.dist-info/LICENSE,sha256=F2h7gf8i0agDIeWoBPXDMYScvQOz02pAWkKhTGOHaaw,1067
-pybiolib-1.1.2038.dist-info/METADATA,sha256=egfRWzBO8-r0fKEdCvVcZHQAXCUzf7zwmQEutCJElHc,1508
-pybiolib-1.1.2038.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-pybiolib-1.1.2038.dist-info/entry_points.txt,sha256=p6DyaP_2kctxegTX23WBznnrDi4mz6gx04O5uKtRDXg,42
-pybiolib-1.1.2038.dist-info/RECORD,,
+pybiolib-1.1.2097.dist-info/LICENSE,sha256=F2h7gf8i0agDIeWoBPXDMYScvQOz02pAWkKhTGOHaaw,1067
+pybiolib-1.1.2097.dist-info/METADATA,sha256=-4wEBR8SXfG_VDlLRZR7UgrlKee5VydzL-L6wMKP17Y,1508
+pybiolib-1.1.2097.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+pybiolib-1.1.2097.dist-info/entry_points.txt,sha256=p6DyaP_2kctxegTX23WBznnrDi4mz6gx04O5uKtRDXg,42
+pybiolib-1.1.2097.dist-info/RECORD,,
biolib/lfs/__init__.py DELETED
@@ -1,4 +0,0 @@
-from .cache import prune_lfs_cache
-from .utils import \
-    push_large_file_system, \
-    create_large_file_system
biolib/lfs/utils.py DELETED
@@ -1,153 +0,0 @@
-import io
-import os
-import zipfile as zf
-from pathlib import Path
-
-from biolib import utils, api
-from biolib.biolib_api_client import BiolibApiClient
-from biolib.biolib_api_client.lfs_types import LargeFileSystem, LargeFileSystemVersion
-from biolib.biolib_logging import logger
-from biolib.biolib_errors import BioLibError
-from biolib.typing_utils import List, Tuple, Iterator, Optional
-from biolib.utils.app_uri import parse_app_uri
-
-
-def get_files_and_size_of_directory(directory: str) -> Tuple[List[str], int]:
-    data_size = 0
-    file_list: List[str] = []
-
-    for path, _, files in os.walk(directory):
-        for file in files:
-            file_path = os.path.join(path, file)
-            if os.path.islink(file_path):
-                continue  # skip symlinks
-
-            relative_file_path = file_path[len(directory) + 1:]  # +1 to remove starting slash
-            file_list.append(relative_file_path)
-            data_size += os.path.getsize(file_path)
-
-    return file_list, data_size
-
-
-def get_iterable_zip_stream(files: List[str], chunk_size: int) -> Iterator[bytes]:
-    class ChunkedIOBuffer(io.RawIOBase):
-        def __init__(self, chunk_size: int):
-            super().__init__()
-            self.chunk_size = chunk_size
-            self.tmp_data = bytearray()
-
-        def get_buffer_size(self):
-            return len(self.tmp_data)
-
-        def read_chunk(self):
-            chunk = bytes(self.tmp_data[:self.chunk_size])
-            self.tmp_data = self.tmp_data[self.chunk_size:]
-            return chunk
-
-        def write(self, data):
-            data_length = len(data)
-            self.tmp_data += data
-            return data_length
-
-    # create chunked buffer to hold data temporarily
-    io_buffer = ChunkedIOBuffer(chunk_size)
-
-    # create zip writer that will write to the io buffer
-    zip_writer = zf.ZipFile(io_buffer, mode='w')  # type: ignore
-
-    for file_path in files:
-        # generate zip info and prepare zip pointer for writing
-        z_info = zf.ZipInfo.from_file(file_path)
-        zip_pointer = zip_writer.open(z_info, mode='w')
-        if Path(file_path).is_file():
-            # read file chunk by chunk
-            with open(file_path, 'br') as file_pointer:
-                while True:
-                    chunk = file_pointer.read(chunk_size)
-                    if len(chunk) == 0:
-                        break
-                    # write the chunk to the zip
-                    zip_pointer.write(chunk)
-                    # if writing the chunk caused us to go over chunk_size, flush it
-                    if io_buffer.get_buffer_size() > chunk_size:
-                        yield io_buffer.read_chunk()
-
-        zip_pointer.close()
-
-    # flush any remaining data in the stream (e.g. zip file meta data)
-    zip_writer.close()
-    while True:
-        chunk = io_buffer.read_chunk()
-        if len(chunk) == 0:
-            break
-        yield chunk
-
-
-def create_large_file_system(lfs_uri: str) -> str:
-    BiolibApiClient.assert_is_signed_in(authenticated_action_description='create a Large File System')
-
-    uri_parsed = parse_app_uri(lfs_uri)
-    response = api.client.post(
-        path='/lfs/',
-        data={
-            'account_handle': uri_parsed['account_handle_normalized'],
-            'name': uri_parsed['app_name'],
-        },
-    )
-    lfs: LargeFileSystem = response.json()
-    logger.info(f"Successfully created new Large File System '{lfs['uri']}'")
-    return lfs['uri']
-
-
-def push_large_file_system(lfs_uri: str, input_dir: str, chunk_size_in_mb: Optional[int] = None) -> str:
-    BiolibApiClient.assert_is_signed_in(authenticated_action_description='push data to a Large File System')
-
-    if not os.path.isdir(input_dir):
-        raise BioLibError(f'Could not find folder at {input_dir}')
-
-    if os.path.realpath(input_dir) == '/':
-        raise BioLibError('Pushing your root directory is not possible')
-
-    original_working_dir = os.getcwd()
-    os.chdir(input_dir)
-    files_to_zip, data_size_in_bytes = get_files_and_size_of_directory(directory=os.getcwd())
-
-    if data_size_in_bytes > 4_500_000_000_000:
-        raise BioLibError('Attempted to push directory with a size larger than the limit of 4.5 TB')
-
-    min_chunk_size_bytes = 10_000_000
-    chunk_size_in_bytes: int
-    if chunk_size_in_mb:
-        chunk_size_in_bytes = chunk_size_in_mb * 1_000_000  # Convert megabytes to bytes
-        if chunk_size_in_bytes < min_chunk_size_bytes:
-            logger.warning('Specified chunk size is too small, using minimum of 10 MB instead.')
-            chunk_size_in_bytes = min_chunk_size_bytes
-    else:
-        # Calculate chunk size based on max chunk count of 10_000, using 9_000 to be on the safe side
-        chunk_size_in_bytes = max(min_chunk_size_bytes, int(data_size_in_bytes / 9_000))
-
-    data_size_in_mb = round(data_size_in_bytes / 10 ** 6)
-    print(f'Zipping {len(files_to_zip)} files, in total ~{data_size_in_mb}mb of data')
-
-    response = api.client.post(path='/lfs/versions/', data={'resource_uri': lfs_uri})
-    lfs_version: LargeFileSystemVersion = response.json()
-    iterable_zip_stream = get_iterable_zip_stream(files=files_to_zip, chunk_size=chunk_size_in_bytes)
-
-    multipart_uploader = utils.MultiPartUploader(
-        use_process_pool=True,
-        get_presigned_upload_url_request=dict(
-            headers=None,
-            requires_biolib_auth=True,
-            path=f"/lfs/versions/{lfs_version['uuid']}/presigned_upload_url/",
-        ),
-        complete_upload_request=dict(
-            headers=None,
-            requires_biolib_auth=True,
-            path=f"/lfs/versions/{lfs_version['uuid']}/complete_upload/",
-        ),
-    )
-
-    multipart_uploader.upload(payload_iterator=iterable_zip_stream, payload_size_in_bytes=data_size_in_bytes)
-    os.chdir(original_working_dir)
-    logger.info(f"Successfully pushed a new LFS version '{lfs_version['uri']}'")
-    return lfs_version['uri']