pybiolib 1.1.1881__py3-none-any.whl → 1.2.7.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. biolib/__init__.py +11 -4
  2. biolib/_data_record/data_record.py +278 -0
  3. biolib/_internal/data_record/__init__.py +1 -1
  4. biolib/_internal/data_record/data_record.py +97 -151
  5. biolib/_internal/data_record/remote_storage_endpoint.py +18 -7
  6. biolib/_internal/file_utils.py +77 -0
  7. biolib/_internal/fuse_mount/__init__.py +1 -0
  8. biolib/_internal/fuse_mount/experiment_fuse_mount.py +209 -0
  9. biolib/_internal/http_client.py +31 -9
  10. biolib/_internal/lfs/__init__.py +1 -0
  11. biolib/_internal/libs/__init__.py +1 -0
  12. biolib/_internal/libs/fusepy/__init__.py +1257 -0
  13. biolib/_internal/push_application.py +6 -1
  14. biolib/_internal/runtime.py +3 -56
  15. biolib/_internal/types/__init__.py +4 -0
  16. biolib/_internal/types/app.py +9 -0
  17. biolib/_internal/types/data_record.py +40 -0
  18. biolib/_internal/types/experiment.py +10 -0
  19. biolib/_internal/types/resource.py +14 -0
  20. biolib/_internal/types/typing.py +7 -0
  21. biolib/_internal/utils/multinode.py +264 -0
  22. biolib/_runtime/runtime.py +84 -0
  23. biolib/api/__init__.py +1 -0
  24. biolib/api/client.py +39 -17
  25. biolib/app/app.py +34 -71
  26. biolib/biolib_api_client/api_client.py +9 -2
  27. biolib/biolib_api_client/app_types.py +3 -2
  28. biolib/biolib_api_client/biolib_job_api.py +6 -0
  29. biolib/biolib_api_client/job_types.py +4 -4
  30. biolib/biolib_api_client/lfs_types.py +8 -2
  31. biolib/biolib_binary_format/remote_endpoints.py +12 -10
  32. biolib/biolib_binary_format/utils.py +23 -3
  33. biolib/cli/auth.py +1 -1
  34. biolib/cli/data_record.py +45 -6
  35. biolib/cli/lfs.py +10 -6
  36. biolib/compute_node/cloud_utils/cloud_utils.py +13 -16
  37. biolib/compute_node/job_worker/executors/docker_executor.py +127 -108
  38. biolib/compute_node/job_worker/job_storage.py +17 -5
  39. biolib/compute_node/job_worker/job_worker.py +25 -15
  40. biolib/compute_node/remote_host_proxy.py +72 -84
  41. biolib/compute_node/webserver/webserver_types.py +0 -1
  42. biolib/compute_node/webserver/worker_thread.py +42 -39
  43. biolib/experiments/experiment.py +75 -44
  44. biolib/jobs/job.py +98 -19
  45. biolib/jobs/job_result.py +46 -21
  46. biolib/jobs/types.py +1 -1
  47. biolib/runtime/__init__.py +2 -1
  48. biolib/sdk/__init__.py +18 -7
  49. biolib/typing_utils.py +2 -7
  50. biolib/user/sign_in.py +2 -2
  51. biolib/utils/seq_util.py +38 -35
  52. {pybiolib-1.1.1881.dist-info → pybiolib-1.2.7.dev0.dist-info}/METADATA +1 -1
  53. {pybiolib-1.1.1881.dist-info → pybiolib-1.2.7.dev0.dist-info}/RECORD +57 -45
  54. biolib/experiments/types.py +0 -9
  55. biolib/lfs/__init__.py +0 -4
  56. biolib/lfs/utils.py +0 -153
  57. /biolib/{lfs → _internal/lfs}/cache.py +0 -0
  58. {pybiolib-1.1.1881.dist-info → pybiolib-1.2.7.dev0.dist-info}/LICENSE +0 -0
  59. {pybiolib-1.1.1881.dist-info → pybiolib-1.2.7.dev0.dist-info}/WHEEL +0 -0
  60. {pybiolib-1.1.1881.dist-info → pybiolib-1.2.7.dev0.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,77 @@
1
+ import io
2
+ import os
3
+ import zipfile as zf
4
+ from pathlib import Path
5
+
6
+ from biolib.typing_utils import Iterator, List, Tuple
7
+
8
+
9
def get_files_and_size_of_directory(directory: str) -> Tuple[List[str], int]:
    """Collect the relative paths of all regular files under *directory* and their total size.

    Symbolic links are skipped entirely. Returns a tuple of
    (relative_file_paths, total_size_in_bytes).
    """
    total_bytes = 0
    relative_paths: List[str] = []
    prefix_length = len(directory) + 1  # +1 drops the path separator after the root

    for dir_path, _, filenames in os.walk(directory):
        for filename in filenames:
            absolute_path = os.path.join(dir_path, filename)
            if os.path.islink(absolute_path):
                continue  # ignore symlinks

            relative_paths.append(absolute_path[prefix_length:])
            total_bytes += os.path.getsize(absolute_path)

    return relative_paths, total_bytes
24
+
25
+
26
def get_iterable_zip_stream(files: List[str], chunk_size: int) -> Iterator[bytes]:
    """Stream a zip archive of *files* as a lazy sequence of byte chunks.

    Files are read and compressed incrementally, so the full archive is never
    held in memory. Chunks are roughly *chunk_size* bytes (the final flush may
    produce smaller ones).
    """

    class _ChunkBuffer(io.RawIOBase):
        """Write sink for ZipFile that accumulates bytes and hands them out in fixed-size chunks."""

        def __init__(self, max_chunk: int):
            super().__init__()
            self._max_chunk = max_chunk
            self._pending = bytearray()

        def buffered_byte_count(self) -> int:
            return len(self._pending)

        def pop_chunk(self) -> bytes:
            head = bytes(self._pending[: self._max_chunk])
            self._pending = self._pending[self._max_chunk :]
            return head

        def write(self, data):  # called by ZipFile as it emits archive bytes
            self._pending += data
            return len(data)

    # Temporary buffer the zip writer streams into
    sink = _ChunkBuffer(chunk_size)
    archive = zf.ZipFile(sink, mode='w')  # type: ignore

    for file_path in files:
        # Prepare the archive entry and an entry writer for incremental writes
        entry_info = zf.ZipInfo.from_file(file_path)
        with archive.open(entry_info, mode='w') as entry_writer:
            if Path(file_path).is_file():
                with open(file_path, 'br') as source:
                    # Read the file chunk by chunk and push each piece into the entry
                    for piece in iter(lambda: source.read(chunk_size), b''):
                        entry_writer.write(piece)
                        # Flush whenever the buffered archive data exceeds one chunk
                        if sink.buffered_byte_count() > chunk_size:
                            yield sink.pop_chunk()

    # Finalize the archive (writes central directory) and drain remaining bytes
    archive.close()
    while True:
        tail = sink.pop_chunk()
        if not tail:
            break
        yield tail
@@ -0,0 +1 @@
1
+ from .experiment_fuse_mount import ExperimentFuseMount
@@ -0,0 +1,209 @@
1
+ import errno
2
+ import os
3
+ import stat
4
+ from datetime import datetime, timezone
5
+ from time import time
6
+
7
+ from biolib._internal.libs.fusepy import FUSE, FuseOSError, Operations
8
+ from biolib.biolib_errors import BioLibError
9
+ from biolib.jobs import Job
10
+ from biolib.typing_utils import Dict, List, Optional, Tuple, TypedDict
11
+
12
+
13
class _AttributeDict(TypedDict):
    """File-attribute mapping in the shape FUSE's getattr() expects; field names match os.stat_result."""

    st_atime: int  # last access time, epoch seconds
    st_ctime: int  # inode change time, epoch seconds
    st_gid: int  # group id of the owner
    st_mode: int  # file type and permission bits
    st_mtime: int  # last modification time, epoch seconds
    st_nlink: int  # number of hard links
    st_size: int  # size in bytes
    st_uid: int  # user id of the owner


# Return value FUSE expects from handlers that succeed with nothing to report
_SUCCESS_CODE = 0
25
+
26
+
27
class ExperimentFuseMount(Operations):
    """Read-only FUSE filesystem exposing the output files of an experiment's completed jobs.

    The mount root contains one directory per completed job (keyed by job name);
    each job directory exposes that job's output files, fetched on demand over
    the network. All write operations raise EACCES.
    """

    def __init__(self, experiment):
        self._experiment = experiment
        # Cache of job name -> Job, refreshed at most once per second (see _get_job_names_map)
        self._job_names_map: Optional[Dict[str, Job]] = None
        self._jobs_last_fetched_at: float = 0.0
        self._mounted_at_epoch_seconds: int = int(time())

    @staticmethod
    def mount_experiment(experiment, mount_path: str) -> None:
        """Mount *experiment* at *mount_path* and block in the foreground until unmounted."""
        FUSE(
            operations=ExperimentFuseMount(experiment),
            mountpoint=mount_path,
            nothreads=True,
            foreground=True,
            allow_other=False,
        )

    def getattr(self, path: str, fh=None) -> _AttributeDict:
        """Return stat-like attributes for *path*; raises FuseOSError(ENOENT) if it does not exist."""
        if path == '/':
            return self._get_directory_attributes(timestamp_epoch_seconds=self._mounted_at_epoch_seconds)

        job, path_in_job = self._parse_path(path)
        # 'finished_at' is an ISO 8601 string; rstrip('Z') removes the UTC suffix so
        # fromisoformat can parse it (assumed to always be UTC — TODO confirm against API).
        job_finished_at_epoch_seconds: int = int(
            datetime.fromisoformat(job.to_dict()['finished_at'].rstrip('Z')).replace(tzinfo=timezone.utc).timestamp()
        )

        if path_in_job == '/':
            return self._get_directory_attributes(timestamp_epoch_seconds=job_finished_at_epoch_seconds)

        try:
            file = job.get_output_file(path_in_job)
            return self._get_file_attributes(
                timestamp_epoch_seconds=job_finished_at_epoch_seconds,
                size_in_bytes=file.length,
            )
        except BioLibError:
            # Not a file; fall through and check whether the path is a directory
            pass

        # Treat the path as a directory if some output file lives strictly below it.
        # The trailing '/' is required: a bare prefix check would wrongly report
        # '/foo' as a directory whenever a file '/foobar' exists.
        directory_prefix = path_in_job + '/'
        for file_path_in_job in (file.path for file in job.list_output_files()):
            if file_path_in_job.startswith(directory_prefix):
                return self._get_directory_attributes(timestamp_epoch_seconds=job_finished_at_epoch_seconds)

        raise FuseOSError(errno.ENOENT) from None  # No such file or directory

    def readdir(self, path: str, fh: int) -> List[str]:
        """List directory entries for *path*: job names at the root, output-file segments below."""
        directory_entries = ['.', '..']

        if path == '/':
            directory_entries.extend(self._get_job_names_map(refresh_jobs=True).keys())
        else:
            job, path_in_job = self._parse_path(path)
            dir_path_in_job = '/' if path_in_job == '/' else path_in_job + '/'
            # Entries are the path segment immediately below this directory
            depth = dir_path_in_job.count('/')
            directory_entries.extend(
                set(
                    [
                        file.path.split('/')[depth]
                        for file in job.list_output_files()
                        if file.path.startswith(dir_path_in_job)
                    ]
                )
            )

        return directory_entries

    def open(self, path: str, flags: int) -> int:
        """Validate that *path* is an existing output file; returns a dummy file handle."""
        job, path_in_job = self._parse_path(path)
        try:
            job.get_output_file(path_in_job)
        except BioLibError:
            # file not found
            raise FuseOSError(errno.ENOENT) from None

        return 1234  # dummy file handle (read() re-resolves the file from the path)

    def read(self, path: str, size: int, offset: int, fh: int) -> bytes:
        """Read *size* bytes at *offset* from the output file at *path*."""
        job, path_in_job = self._parse_path(path)
        try:
            file = job.get_output_file(path_in_job)
        except BioLibError:
            raise FuseOSError(errno.ENOENT) from None  # No such file or directory

        return file.get_data(start=offset, length=size)

    def release(self, path: str, fh: int) -> int:
        # Nothing to clean up; handles are stateless
        return _SUCCESS_CODE

    def releasedir(self, path: str, fh: int) -> int:
        return _SUCCESS_CODE

    def flush(self, path: str, fh: int) -> int:
        return _SUCCESS_CODE

    @staticmethod
    def _get_directory_attributes(timestamp_epoch_seconds: int) -> _AttributeDict:
        """Build attributes for a read-only directory stamped with the given timestamp."""
        return _AttributeDict(
            st_atime=timestamp_epoch_seconds,
            st_ctime=timestamp_epoch_seconds,
            st_gid=os.getgid(),
            st_mode=stat.S_IFDIR | 0o555,  # Directory that is readable and executable by owner, group, and others.
            st_mtime=timestamp_epoch_seconds,
            st_nlink=1,
            st_size=1,
            st_uid=os.getuid(),
        )

    @staticmethod
    def _get_file_attributes(timestamp_epoch_seconds: int, size_in_bytes: int) -> _AttributeDict:
        """Build attributes for a read-only regular file of the given size and timestamp."""
        return _AttributeDict(
            st_atime=timestamp_epoch_seconds,
            st_ctime=timestamp_epoch_seconds,
            st_gid=os.getgid(),
            st_mode=stat.S_IFREG | 0o444,  # Regular file with read permissions for owner, group, and others.
            st_mtime=timestamp_epoch_seconds,
            st_nlink=1,
            st_size=size_in_bytes,
            st_uid=os.getuid(),
        )

    def _get_job_names_map(self, refresh_jobs=False) -> Dict[str, Job]:
        """Return the cached job-name map, refetching completed jobs at most once per second."""
        current_time = time()
        if not self._job_names_map or (current_time - self._jobs_last_fetched_at > 1 and refresh_jobs):
            self._jobs_last_fetched_at = current_time
            self._job_names_map = {job.get_name(): job for job in self._experiment.get_jobs(status='completed')}

        return self._job_names_map

    def _parse_path(self, path: str) -> Tuple[Job, str]:
        """Split an absolute mount path into (job, path_within_job); raises ENOENT for unknown jobs."""
        path_splitted = path.split('/')
        job_name = path_splitted[1]
        path_in_job = '/' + '/'.join(path_splitted[2:])
        job = self._get_job_names_map().get(job_name)
        if not job:
            raise FuseOSError(errno.ENOENT)  # No such file or directory

        return job, path_in_job

    # ----------------------------------- File system methods not implemented below -----------------------------------

    def chmod(self, path, mode):
        raise FuseOSError(errno.EACCES)

    def chown(self, path, uid, gid):
        raise FuseOSError(errno.EACCES)

    def mknod(self, path, mode, dev):
        raise FuseOSError(errno.EACCES)

    def rmdir(self, path):
        raise FuseOSError(errno.EACCES)

    def mkdir(self, path, mode):
        raise FuseOSError(errno.EACCES)

    def unlink(self, path):
        raise FuseOSError(errno.EACCES)

    def symlink(self, target, source):
        raise FuseOSError(errno.EACCES)

    def rename(self, old, new):
        raise FuseOSError(errno.EACCES)

    def link(self, target, source):
        raise FuseOSError(errno.EACCES)

    def utimens(self, path, times=None):
        raise FuseOSError(errno.EACCES)

    def create(self, path, mode, fi=None):
        raise FuseOSError(errno.EACCES)

    def write(self, path, data, offset, fh):
        raise FuseOSError(errno.EACCES)

    def truncate(self, path, length, fh=None):
        raise FuseOSError(errno.EACCES)

    def fsync(self, path, datasync, fh):
        raise FuseOSError(errno.EACCES)
@@ -1,5 +1,6 @@
1
1
  import json
2
2
  import platform
3
+ import shutil
3
4
  import socket
4
5
  import ssl
5
6
  import subprocess
@@ -41,15 +42,24 @@ class HttpError(urllib.error.HTTPError):
41
42
 
42
43
 
43
44
  class HttpResponse:
44
- def __init__(self, response) -> None:
45
+ def __init__(self, response, response_path) -> None:
45
46
  self.headers: Dict[str, str] = dict(response.headers)
46
47
  self.status_code: int = int(response.status)
47
- self.content: bytes = response.read()
48
+ self.response_path = response_path
49
+ if self.response_path:
50
+ with open(self.response_path, 'wb') as out_file:
51
+ shutil.copyfileobj(response, out_file)
52
+ else:
53
+ self.content: bytes = response.read()
48
54
  self.url: str = response.geturl()
49
55
 
50
56
  @property
51
57
  def text(self) -> str:
52
- return cast(str, self.content.decode('utf-8'))
58
+ if self.response_path:
59
+ with open(self.response_path, 'rb') as fp:
60
+ return cast(str, fp.read().decode('utf-8'))
61
+ else:
62
+ return cast(str, self.content.decode('utf-8'))
53
63
 
54
64
  def json(self):
55
65
  return json.loads(self.text)
@@ -66,6 +76,7 @@ class HttpClient:
66
76
  headers: Optional[Dict[str, str]] = None,
67
77
  retries: int = 5,
68
78
  timeout_in_seconds: Optional[int] = None,
79
+ response_path: Optional[str] = None,
69
80
  ) -> HttpResponse:
70
81
  if not HttpClient.ssl_context:
71
82
  HttpClient.ssl_context = _create_ssl_context()
@@ -83,7 +94,7 @@ class HttpClient:
83
94
  if timeout_in_seconds is None:
84
95
  timeout_in_seconds = 60 if isinstance(data, dict) else 180 # TODO: Calculate timeout based on data size
85
96
 
86
- last_error: Optional[urllib.error.URLError] = None
97
+ last_error: Optional[Exception] = None
87
98
  for retry_count in range(retries + 1):
88
99
  if retry_count > 0:
89
100
  time.sleep(5 * retry_count)
@@ -94,23 +105,34 @@ class HttpClient:
94
105
  context=HttpClient.ssl_context,
95
106
  timeout=timeout_in_seconds,
96
107
  ) as response:
97
- return HttpResponse(response)
108
+ return HttpResponse(response, response_path)
98
109
 
99
110
  except urllib.error.HTTPError as error:
100
- if error.code == 502:
101
- logger_no_user_data.debug(f'HTTP {method} request failed with status 502 for "{url}"')
111
+ if error.code == 429:
112
+ logger_no_user_data.warning(f'HTTP {method} request failed with status 429 for "{url}"')
113
+ last_error = error
114
+ elif error.code == 502:
115
+ logger_no_user_data.warning(f'HTTP {method} request failed with status 502 for "{url}"')
102
116
  last_error = error
103
117
  elif error.code == 503:
104
- logger_no_user_data.debug(f'HTTP {method} request failed with status 503 for "{url}"')
118
+ logger_no_user_data.warning(f'HTTP {method} request failed with status 503 for "{url}"')
119
+ last_error = error
120
+ elif error.code == 504:
121
+ logger_no_user_data.warning(f'HTTP {method} request failed with status 504 for "{url}"')
105
122
  last_error = error
106
123
  else:
107
124
  raise HttpError(error) from None
108
125
 
109
126
  except urllib.error.URLError as error:
110
127
  if isinstance(error.reason, socket.timeout):
111
- logger_no_user_data.debug(f'HTTP {method} request failed with read timeout for "{url}"')
128
+ if retry_count > 0:
129
+ logger_no_user_data.warning(f'HTTP {method} request failed with read timeout for "{url}"')
112
130
  last_error = error
113
131
  else:
114
132
  raise error
133
+ except socket.timeout as error:
134
+ if retry_count > 0:
135
+ logger_no_user_data.warning(f'HTTP {method} request failed with read timeout for "{url}"')
136
+ last_error = error
115
137
 
116
138
  raise last_error or Exception(f'HTTP {method} request failed after {retries} retries for "{url}"')
@@ -0,0 +1 @@
1
+ from .cache import prune_lfs_cache
@@ -0,0 +1 @@
1
+ # Note: this directory is purely for libraries to be directly included instead of as dependencies