pybiolib 1.1.1881__py3-none-any.whl → 1.2.7.dev0__py3-none-any.whl
This diff compares publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
- biolib/__init__.py +11 -4
- biolib/_data_record/data_record.py +278 -0
- biolib/_internal/data_record/__init__.py +1 -1
- biolib/_internal/data_record/data_record.py +97 -151
- biolib/_internal/data_record/remote_storage_endpoint.py +18 -7
- biolib/_internal/file_utils.py +77 -0
- biolib/_internal/fuse_mount/__init__.py +1 -0
- biolib/_internal/fuse_mount/experiment_fuse_mount.py +209 -0
- biolib/_internal/http_client.py +31 -9
- biolib/_internal/lfs/__init__.py +1 -0
- biolib/_internal/libs/__init__.py +1 -0
- biolib/_internal/libs/fusepy/__init__.py +1257 -0
- biolib/_internal/push_application.py +6 -1
- biolib/_internal/runtime.py +3 -56
- biolib/_internal/types/__init__.py +4 -0
- biolib/_internal/types/app.py +9 -0
- biolib/_internal/types/data_record.py +40 -0
- biolib/_internal/types/experiment.py +10 -0
- biolib/_internal/types/resource.py +14 -0
- biolib/_internal/types/typing.py +7 -0
- biolib/_internal/utils/multinode.py +264 -0
- biolib/_runtime/runtime.py +84 -0
- biolib/api/__init__.py +1 -0
- biolib/api/client.py +39 -17
- biolib/app/app.py +34 -71
- biolib/biolib_api_client/api_client.py +9 -2
- biolib/biolib_api_client/app_types.py +3 -2
- biolib/biolib_api_client/biolib_job_api.py +6 -0
- biolib/biolib_api_client/job_types.py +4 -4
- biolib/biolib_api_client/lfs_types.py +8 -2
- biolib/biolib_binary_format/remote_endpoints.py +12 -10
- biolib/biolib_binary_format/utils.py +23 -3
- biolib/cli/auth.py +1 -1
- biolib/cli/data_record.py +45 -6
- biolib/cli/lfs.py +10 -6
- biolib/compute_node/cloud_utils/cloud_utils.py +13 -16
- biolib/compute_node/job_worker/executors/docker_executor.py +127 -108
- biolib/compute_node/job_worker/job_storage.py +17 -5
- biolib/compute_node/job_worker/job_worker.py +25 -15
- biolib/compute_node/remote_host_proxy.py +72 -84
- biolib/compute_node/webserver/webserver_types.py +0 -1
- biolib/compute_node/webserver/worker_thread.py +42 -39
- biolib/experiments/experiment.py +75 -44
- biolib/jobs/job.py +98 -19
- biolib/jobs/job_result.py +46 -21
- biolib/jobs/types.py +1 -1
- biolib/runtime/__init__.py +2 -1
- biolib/sdk/__init__.py +18 -7
- biolib/typing_utils.py +2 -7
- biolib/user/sign_in.py +2 -2
- biolib/utils/seq_util.py +38 -35
- {pybiolib-1.1.1881.dist-info → pybiolib-1.2.7.dev0.dist-info}/METADATA +1 -1
- {pybiolib-1.1.1881.dist-info → pybiolib-1.2.7.dev0.dist-info}/RECORD +57 -45
- biolib/experiments/types.py +0 -9
- biolib/lfs/__init__.py +0 -4
- biolib/lfs/utils.py +0 -153
- /biolib/{lfs → _internal/lfs}/cache.py +0 -0
- {pybiolib-1.1.1881.dist-info → pybiolib-1.2.7.dev0.dist-info}/LICENSE +0 -0
- {pybiolib-1.1.1881.dist-info → pybiolib-1.2.7.dev0.dist-info}/WHEEL +0 -0
- {pybiolib-1.1.1881.dist-info → pybiolib-1.2.7.dev0.dist-info}/entry_points.txt +0 -0
biolib/compute_node/job_worker/executors/docker_executor.py

@@ -1,44 +1,41 @@
+import io
 import json
+import os
+import re
+import shlex
 import subprocess
+import tarfile
 import tempfile
 import time
-import tarfile
 import zipfile
-import os
-import io
-import re
-import shlex
 from copy import copy
 from datetime import datetime

 import docker  # type: ignore
 import docker.types  # type: ignore
-
-from docker.errors import ImageNotFound, APIError  # type: ignore
+from docker.errors import APIError, ImageNotFound  # type: ignore
 from docker.models.containers import Container  # type: ignore

 from biolib import utils
-
 from biolib._internal.runtime import RuntimeJobDataDict
 from biolib.biolib_binary_format import ModuleInput, ModuleOutputV2
+from biolib.biolib_binary_format.file_in_container import FileInContainer
 from biolib.biolib_docker_client import BiolibDockerClient
-from biolib.biolib_errors import
+from biolib.biolib_errors import BioLibError, DockerContainerNotFoundDuringExecutionException
 from biolib.biolib_logging import logger, logger_no_user_data
 from biolib.compute_node import utils as compute_node_utils
 from biolib.compute_node.cloud_utils import CloudUtils
 from biolib.compute_node.job_worker.docker_image_cache import DockerImageCache
-from biolib.biolib_binary_format.file_in_container import FileInContainer
 from biolib.compute_node.job_worker.executors.docker_types import DockerDiffKind
-from biolib.compute_node.job_worker.executors.types import
+from biolib.compute_node.job_worker.executors.types import LocalExecutorOptions, StatusUpdate
 from biolib.compute_node.job_worker.mappings import Mappings, path_without_first_folder
-from biolib.compute_node.job_worker.utils import ComputeProcessException
 from biolib.compute_node.job_worker.utilization_reporter_thread import UtilizationReporterThread
+from biolib.compute_node.job_worker.utils import ComputeProcessException
 from biolib.compute_node.utils import SystemExceptionCodes
-from biolib.typing_utils import
+from biolib.typing_utils import Dict, List, Optional


 class DockerExecutor:
-
     def __init__(self, options: LocalExecutorOptions) -> None:
         self._options: LocalExecutorOptions = options
         self._is_cleaning_up = False
@@ -83,11 +80,12 @@ class DockerExecutor:
             raise Exception('Docker container was None')
         return self._docker_container

-    def execute_module(self,
+    def execute_module(self, module_input_path: str, module_output_path: str) -> None:
         try:
             job_uuid = self._options['job']['public_id']
             send_status_update = self._options['send_status_update']
-
+            with open(module_input_path, 'rb') as fp:
+                module_input = ModuleInput(fp.read()).deserialize()

             send_status_update(StatusUpdate(progress=55, log_message='Pulling images...'))

@@ -115,35 +113,49 @@
         except Exception:  # pylint: disable=broad-except
             logger_no_user_data.error('DockerExecutor failed to clean up container')

-    def _pull(self):
-
-
-
-
-                image_uri=self._absolute_image_uri,
-                estimated_image_size_bytes=self._options['module']['estimated_image_size_bytes'],
-                job_id=self._options['job']['public_id'],
-            )
-        else:
-            docker_client = BiolibDockerClient.get_docker_client()
-            try:
-                docker_client.images.get(self._absolute_image_uri)
-            except ImageNotFound:
-                job_uuid = self._options['job'].get('federated_job_uuid') or self._options['job']['public_id']
-                docker_client.images.pull(
-                    self._absolute_image_uri,
-                    auth_config={'username': 'biolib', 'password': f',{job_uuid}'},
-                )
+    def _pull(self) -> None:
+        retries = 3
+        last_error: Optional[Exception] = None
+        estimated_image_size_bytes = self._options['module']['estimated_image_size_bytes']
+        assert estimated_image_size_bytes is not None, 'No estimated image size'

-
+        for retry_count in range(retries + 1):
+            if retry_count > 0:
+                logger_no_user_data.debug(f'Retrying Docker image pull of "{self._absolute_image_uri}"')
+                time.sleep(5 * retry_count)
+            try:
+                start_time = time.time()
+                if utils.IS_RUNNING_IN_CLOUD and not self._options['job'].get('federated_job_uuid'):
+                    DockerImageCache().get(
+                        image_uri=self._absolute_image_uri,
+                        estimated_image_size_bytes=estimated_image_size_bytes,
+                        job_id=self._options['job']['public_id'],
+                    )
+                else:
+                    docker_client = BiolibDockerClient.get_docker_client()
+                    try:
+                        docker_client.images.get(self._absolute_image_uri)
+                    except ImageNotFound:
+                        job_uuid = self._options['job'].get('federated_job_uuid') or self._options['job']['public_id']
+                        docker_client.images.pull(
+                            self._absolute_image_uri,
+                            auth_config={'username': 'biolib', 'password': f',{job_uuid}'},
+                        )
+
+                logger_no_user_data.debug(f'Pulled image in: {time.time() - start_time}')
+                return
+            except Exception as error:
+                logger_no_user_data.warning(
+                    f'Pull of Docker image "{self._absolute_image_uri}" returned error: {error}'
+                )
+                last_error = error

-
-
-
-
-
-
-        ) from exception
+        raise ComputeProcessException(
+            last_error or Exception('Retries exceeded: failed to pull Docker image'),
+            SystemExceptionCodes.FAILED_TO_PULL_DOCKER_IMAGE.value,
+            self._send_system_exception,
+            may_contain_user_data=False,
+        )

     def _execute_helper(self, module_input, module_output_path: str) -> None:
         job_uuid = self._options['job']['public_id']
@@ -154,10 +166,10 @@
             logger_no_user_data.debug(f'Job "{job_uuid}" starting utilization metrics reporter thread...')
             config = CloudUtils.get_webserver_config()
             node_auth_token = config['compute_node_info']['auth_token']  # pylint: disable=unsubscriptable-object
-            cloud_job = self._options[
+            cloud_job = self._options['cloud_job']
             include_gpu_stats = False
             if cloud_job:
-                include_gpu_stats = cloud_job.get(
+                include_gpu_stats = cloud_job.get('reserved_gpu_count', 0) > 0
             UtilizationReporterThread(
                 container=self._container,
                 job_uuid=job_uuid,
@@ -212,7 +224,7 @@
                 raise ComputeProcessException(
                     MemoryError(),
                     SystemExceptionCodes.OUT_OF_MEMORY.value,
-                    self._send_system_exception
+                    self._send_system_exception,
                 )

         logger_no_user_data.debug(f'Docker container exited with code {exit_code} for {job_uuid}')
@@ -243,24 +255,23 @@
         for path_to_delete in [self._input_tar_path, self._runtime_tar_path]:
             if os.path.exists(path_to_delete):
                 os.remove(path_to_delete)
-        logger_no_user_data.debug(f
+        logger_no_user_data.debug(f'Deleted tars in: {time.time() - tar_time}')

         container_time = time.time()
         if self._docker_container:
             self._docker_container.remove(force=True)

         if utils.IS_RUNNING_IN_CLOUD:
-            DockerImageCache().detach_job(
-                image_uri=self._absolute_image_uri,
-                job_id=self._options['job']['public_id']
-            )
+            DockerImageCache().detach_job(image_uri=self._absolute_image_uri, job_id=self._options['job']['public_id'])

-        logger_no_user_data.debug(f
+        logger_no_user_data.debug(f'Deleted compute container in: {time.time() - container_time}')
         self._tmp_secrets_dir.cleanup()

     # TODO: type this method
     def _initialize_docker_container(self, module_input):
         try:
+            job_uuid = self._options['job']['public_id']
+            logger_no_user_data.debug(f'Job "{job_uuid}" initializing Docker container...')
             module = self._options['module']
             logger.debug(f"Initializing docker container with command: {module['command']}")

@@ -274,6 +285,8 @@
                 job_requested_machine=self._options['job']['requested_machine'],
                 job_uuid=self._options['job']['public_id'],
                 job_auth_token=self._options['job']['auth_token'],
+                app_uri=self._options['job']['app_uri'],
+                is_environment_biolib_cloud=bool(utils.IS_RUNNING_IN_CLOUD),
             )
             secrets: Dict[str, str] = dict(
                 **module.get('secrets', {}),
@@ -290,32 +303,40 @@
             )
             if app_version_created_at < datetime(2022, 11, 30, 0, 0):
                 environment_vars = module.get('secrets', {})
-                environment_vars.update(
-
-
-
+                environment_vars.update(
+                    {
+                        'BIOLIB_JOB_UUID': self._options['job']['public_id'],
+                        'BIOLIB_JOB_AUTH_TOKEN': self._options['job']['auth_token'],
+                    }
+                )

             if utils.IS_RUNNING_IN_CLOUD and self._options['cloud_job']:
-                environment_vars.update(
-
-
+                environment_vars.update(
+                    {
+                        'BIOLIB_JOB_MAX_RUNTIME_IN_SECONDS': self._options['cloud_job']['max_runtime_in_seconds'],
+                    }
+                )

+            logger_no_user_data.debug(f'Job "{job_uuid}" initializing Docker container. Getting IPs for proxies...')
             for proxy in self._options['remote_host_proxies']:
                 proxy_ip = proxy.get_ip_address_on_network(internal_network)
                 if proxy.is_app_caller_proxy:
                     logger_no_user_data.debug('Found app caller proxy, setting both base URLs in compute container')
-                    environment_vars.update(
-
-
-
-
-
-
-
-
+                    environment_vars.update(
+                        {
+                            'BIOLIB_BASE_URL': f'http://{proxy_ip}',
+                            'BIOLIB_CLOUD_BASE_URL': f'http://{proxy_ip}',
+                            # This should be removed eventually, but will break apps calling apps on older versions
+                            'BIOLIB_CLOUD_RESULTS_BASE_URL': f'http://{proxy_ip}',
+                            'BIOLIB_CLOUD_JOB_STORAGE_BASE_URL': f'http://{proxy_ip}',
+                            # Inform container if we are targeting public biolib as we change the BIOLIB_BASE_URL
+                            'BIOLIB_ENVIRONMENT_IS_PUBLIC_BIOLIB': bool(utils.BASE_URL_IS_PUBLIC_BIOLIB),
+                        }
+                    )
                 else:
                     extra_hosts[proxy.hostname] = proxy_ip

+            logger_no_user_data.debug(f'Job "{job_uuid}" initializing Docker container. Constructing container args...')
             create_container_args = {
                 'environment': environment_vars,
                 'extra_hosts': extra_hosts,
@@ -327,45 +348,38 @@

             if self._options['job'].get('arguments_override_command'):
                 # In this case, arguments contains a user specified command to run in the app
-                create_container_args.update({
-                    'command': module_input['arguments'],
-                    'entrypoint': ''
-                })
+                create_container_args.update({'command': module_input['arguments'], 'entrypoint': ''})

             else:
-                create_container_args.update({
-                    'command': shlex.split(module['command']) + module_input['arguments']
-                })
+                create_container_args.update({'command': shlex.split(module['command']) + module_input['arguments']})

             app_version = self._options['job']['app_version']
             if app_version.get('main_output_file') or app_version.get('stdout_render_type') == 'text':
                 create_container_args['tty'] = True

             if utils.IS_RUNNING_IN_CLOUD:
-                cloud_job = self._options[
+                cloud_job = self._options['cloud_job']
                 create_container_args['mem_limit'] = f'{cloud_job["reserved_memory_in_bytes"]}b'
-                create_container_args['nano_cpus'] = cloud_job[
+                create_container_args['nano_cpus'] = cloud_job['reserved_cpu_in_nano_shares']
+                create_container_args['pids_limit'] = 10_000

                 biolib_identity_user_email: Optional[str] = cloud_job.get('biolib_identity_user_email')
                 if biolib_identity_user_email:
-                    create_container_args['environment'].update(
-                        'BIOLIB_IDENTITY_USER_EMAIL': biolib_identity_user_email
-
+                    create_container_args['environment'].update(
+                        {'BIOLIB_IDENTITY_USER_EMAIL': biolib_identity_user_email}
+                    )

             docker_runtime = os.getenv('BIOLIB_DOCKER_RUNTIME')
             if docker_runtime is not None:
                 create_container_args['runtime'] = docker_runtime

-
-
-            )
-
-            logger_no_user_data.debug('Finished initializing docker container')
+            docker_client = BiolibDockerClient.get_docker_client()
+            logger_no_user_data.debug(f'Job "{job_uuid}" initializing Docker container. Creating container...')
+            self._docker_container = docker_client.containers.create(**create_container_args)
+            logger_no_user_data.debug(f'Job "{job_uuid}" finished initializing Docker container.')
         except Exception as exception:
             raise ComputeProcessException(
-                exception,
-                SystemExceptionCodes.FAILED_TO_START_COMPUTE_CONTAINER.value,
-                self._send_system_exception
+                exception, SystemExceptionCodes.FAILED_TO_START_COMPUTE_CONTAINER.value, self._send_system_exception
             ) from exception

     def _add_file_to_tar(self, tar, current_path, mapped_path, data):
@@ -432,7 +446,7 @@
             raise ComputeProcessException(
                 exception,
                 SystemExceptionCodes.FAILED_TO_COPY_INPUT_FILES_TO_COMPUTE_CONTAINER.value,
-                self._send_system_exception
+                self._send_system_exception,
             ) from exception

     def _map_and_copy_runtime_files_to_container(self, runtime_zip_data, arguments: List[str], remove_root_folder=True):
@@ -447,17 +461,17 @@
             raise ComputeProcessException(
                 exception,
                 SystemExceptionCodes.FAILED_TO_COPY_RUNTIME_FILES_TO_COMPUTE_CONTAINER.value,
-                self._send_system_exception
+                self._send_system_exception,
             ) from exception

     def _write_module_output_to_file(
-
-
-
-
-
-
-
+        self,
+        arguments: List[str],
+        exit_code: int,
+        module_output_path: str,
+        stderr: bytes,
+        stdout: bytes,
+        pre_start_diff: List[Dict],
     ) -> None:
         mapped_files: List[FileInContainer] = []
         try:
@@ -502,9 +516,11 @@
         result = subprocess.run(
             args=[
                 'ctr',
-                '--namespace',
+                '--namespace',
+                'moby',
                 'snapshots',
-                '--snapshotter',
+                '--snapshotter',
+                'nydus',
                 'mounts',
                 '/some_arbitrary_path',
                 str(self._container.id),
@@ -526,9 +542,10 @@
         pre_start_diff_paths = [obj['Path'] for obj in pre_start_diff]
         post_run_diff = self._container.diff()
         run_diff_paths: List[str] = [
-            obj['Path']
-            obj
-            obj['
+            obj['Path']
+            for obj in post_run_diff
+            if obj['Kind'] in (DockerDiffKind.CHANGED.value, DockerDiffKind.ADDED.value)
+            and obj['Path'] not in pre_start_diff_paths
         ]

         known_directories = set()
@@ -538,7 +555,7 @@
             if idx == 0:
                 continue  # always skip root

-            folder = '/' + '/'.join(parent_folders[1:idx + 1])
+            folder = '/' + '/'.join(parent_folders[1 : idx + 1])
             known_directories.add(folder)

         def path_is_included_in_from_mappings(path: str) -> bool:
@@ -558,11 +575,13 @@
         files_and_empty_dirs: List[FileInContainer] = []
         for path in run_diff_paths:
             if path not in known_directories and path_is_included_in_from_mappings(path):
-                files_and_empty_dirs.append(
-
-
-
-
+                files_and_empty_dirs.append(
+                    FileInContainer(
+                        container=self._container,
+                        overlay_upper_dir_path=overlay_upper_dir_path,
+                        path_in_container=path,
+                    )
+                )

         return files_and_empty_dirs

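The most substantial change above is the rewritten `DockerExecutor._pull`, which wraps the image pull in a bounded retry loop with linearly growing sleeps and remembers the last error so it can be raised once every attempt has failed. A minimal standalone sketch of that pattern (the `pull_image` callable is a hypothetical stand-in, not part of pybiolib):

```python
import time
from typing import Callable, Optional


def pull_with_retries(pull_image: Callable[[], None], retries: int = 3) -> None:
    """Run pull_image up to retries + 1 times, sleeping 5s, 10s, 15s between attempts."""
    last_error: Optional[Exception] = None
    for retry_count in range(retries + 1):
        if retry_count > 0:
            time.sleep(5 * retry_count)  # linear backoff, mirroring _pull above
        try:
            pull_image()
            return  # success: stop retrying
        except Exception as error:  # broad catch, as in _pull
            last_error = error
    raise last_error or Exception('Retries exceeded: failed to pull Docker image')
```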
biolib/compute_node/job_worker/job_storage.py

@@ -10,6 +10,7 @@ from biolib.utils.multipart_uploader import get_chunk_iterator_from_file_object


 class JobStorage:
+    module_input_file_name = 'input-output.bbf'
     module_output_file_name = 'module-output.bbf'

     @staticmethod
@@ -46,8 +47,21 @@
         module_output_path = os.path.join(job_temporary_dir, JobStorage.module_output_file_name)
         module_output_size = os.path.getsize(module_output_path)

+        # Calculate chunk size based on max chunk count of 10_000, using 9_000 to be on the safe side
+        max_chunk_count = 9_000
+        min_chunk_size_bytes = 50_000_000
+        chunk_size_in_bytes = max(min_chunk_size_bytes, module_output_size // max_chunk_count)
+
+        logger_no_user_data.debug(
+            f'Job "{job_uuid}" uploading result of size {module_output_size} bytes '
+            f'with chunk size of {chunk_size_in_bytes} bytes...'
+        )
+
         with open(module_output_path, mode='rb') as module_output_file:
-            module_output_iterator = get_chunk_iterator_from_file_object(
+            module_output_iterator = get_chunk_iterator_from_file_object(
+                file_object=module_output_file,
+                chunk_size_in_bytes=chunk_size_in_bytes,
+            )
             multipart_uploader = JobStorage._get_module_output_uploader(job_uuid)
             multipart_uploader.upload(
                 payload_iterator=module_output_iterator,
@@ -81,7 +95,7 @@
         )

     @staticmethod
-    def
+    def download_module_input(job: CreatedJobDict, path: str):
         job_uuid = job['public_id']
         logger_no_user_data.debug(f'Job "{job_uuid}" downloading module input...')
         presigned_download_url = BiolibJobApi.get_job_storage_download_url(
@@ -89,7 +103,5 @@
             job_auth_token=job['auth_token'],
             storage_type='input',
         )
-
-        data: bytes = response.content
+        HttpClient.request(url=presigned_download_url, response_path=path)
         logger_no_user_data.debug(f'Job "{job_uuid}" module input downloaded')
-        return data
biolib/compute_node/job_worker/job_worker.py

@@ -133,7 +133,8 @@ class JobWorker:
         ).start()

         try:
-
+            module_input_path = os.path.join(self.job_temporary_dir, JobStorage.module_input_file_name)
+            JobStorage.download_module_input(job=job, path=module_input_path)
         except StorageDownloadFailed:
             # Expect module input to be handled in a separate ModuleInput package
             self._legacy_input_wait_timeout_thread = JobLegacyInputWaitTimeout(
@@ -147,7 +148,7 @@
             raise error

         try:
-            self._run_root_job(
+            self._run_root_job(module_input_path)

         # This error occurs when trying to access the container after the job worker has cleaned it up.
         # In that case stop the computation.
@@ -165,7 +166,9 @@
             self._legacy_input_wait_timeout_thread.stop()

         try:
-            self.
+            module_input_path = os.path.join(self.job_temporary_dir, JobStorage.module_input_file_name)
+            open(module_input_path, 'wb').write(package)
+            self._run_root_job(module_input_path)

         # This error occurs when trying to access the container after the job worker has cleaned it up.
         # In that case stop the computation.
@@ -250,7 +253,6 @@

     def _start_network_and_remote_host_proxies(self, job: CreatedJobDict) -> None:
         app_version = job['app_version']
-        app = app_version.get('app', {})
         job_id = job['public_id']
         remote_hosts = app_version['remote_hosts']
         if utils.IS_RUNNING_IN_CLOUD:
@@ -313,7 +315,6 @@
                     self._internal_network,
                     job_id,
                     ports,
-                    can_push_data_record_for_user=app.get('can_push_data_record_for_user', False),
                 )
                 remote_host_proxy.start()
                 self._remote_host_proxies.append(remote_host_proxy)
@@ -331,15 +332,15 @@
     def _run_app_version(
         self,
         app_version_id: str,
-
+        module_input_path: str,
         caller_job: CreatedJobDict,
         main_module_output_path: str,
     ) -> None:
         job: CreatedJobDict = BiolibJobApi.create(app_version_id, caller_job=caller_job['public_id'])
         self._jobs[job['public_id']] = job
-        self._run_job(job,
+        self._run_job(job, module_input_path, main_module_output_path)

-    def _run_job(self, job: CreatedJobDict,
+    def _run_job(self, job: CreatedJobDict, module_input_path: str, main_module_output_path: str) -> None:
         job_uuid = job['public_id']
         logger_no_user_data.info(f'Job "{job_uuid}" running...')
         if self._root_job_wrapper is None:
@@ -406,7 +407,7 @@
                 send_system_exception=self.send_system_exception,
                 send_stdout_and_stderr=self.send_stdout_and_stderr,
             ),
-
+            module_input_path,
             main_module_output_path,
         )

@@ -417,15 +418,20 @@
     def _run_module(
         self,
         options: LocalExecutorOptions,
-
+        module_input_path: str,
         module_output_path: str,
     ) -> None:
         module = options['module']
         job_id = options['job']['public_id']
         logger_no_user_data.debug(f'Job "{job_id}" running module "{module["name"]}"...')
+
         executor_instance: DockerExecutor
         if module['environment'] == ModuleEnvironment.BIOLIB_APP.value:
+            if not self.job_temporary_dir:
+                raise BioLibError('Undefined job_temporary_dir')
             logger_no_user_data.debug(f'Job "{job_id}" starting child job...')
+            with open(module_input_path,'rb') as fp:
+                module_input_serialized = fp.read()
             module_input = ModuleInput(module_input_serialized).deserialize()
             module_input_with_runtime_zip = self._add_runtime_zip_and_command_to_module_input(options, module_input)
             module_input_with_runtime_zip_serialized = ModuleInput().serialize(
@@ -433,9 +439,11 @@
                 arguments=module_input_with_runtime_zip['arguments'],
                 files=module_input_with_runtime_zip['files'],
             )
+            module_input_path_new = os.path.join(self.job_temporary_dir, "runtime." + JobStorage.module_input_file_name)
+            open(module_input_path_new, 'wb').write(module_input_with_runtime_zip_serialized)
             return self._run_app_version(
                 module['image_uri'],
-
+                module_input_path_new,
                 options['job'],
                 module_output_path,
             )
@@ -461,7 +469,7 @@
         # Log memory and disk before pulling and executing module
         log_disk_and_memory_usage_info()

-        executor_instance.execute_module(
+        executor_instance.execute_module(module_input_path, module_output_path)

     def _connect_to_parent(self):
         try:
@@ -587,7 +595,7 @@
                 may_contain_user_data=False
             ) from exception

-    def _run_root_job(self,
+    def _run_root_job(self, module_input_path: str) -> str:
         # Make typechecker happy
         if not self._root_job_wrapper or not self.job_temporary_dir:
             raise BioLibError('Undefined job_wrapper or job_temporary_dir')
@@ -595,7 +603,7 @@
         main_module_output_path = os.path.join(self.job_temporary_dir, JobStorage.module_output_file_name)
         self._run_job(
             job=self._root_job_wrapper['job'],
-
+            module_input_path=module_input_path,
             main_module_output_path=main_module_output_path,
         )
         self._send_status_update(StatusUpdate(progress=94, log_message='Computation finished'))
@@ -614,7 +622,9 @@
             job_temporary_dir=job_temporary_dir,
         )
         self._start_network_and_remote_host_proxies(job_dict)
-
+        module_input_path = os.path.join(self.job_temporary_dir, JobStorage.module_input_file_name)
+        open(module_input_path, 'wb').write(module_input_serialized)
+        module_output_path = self._run_root_job(module_input_path)
        with open(module_output_path, mode='rb') as module_output_file:
             module_output_serialized = module_output_file.read()
             return ModuleOutputV2(InMemoryIndexableBuffer(module_output_serialized))
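Across these hunks, module input moves from an in-memory bytes payload to a file on disk: the worker writes the serialized input once and threads its path through `_run_root_job`, `_run_job`, and `DockerExecutor.execute_module`, which read and deserialize it only when needed. A simplified sketch of that flow (function names here are illustrative, not the pybiolib API):

```python
import os
import tempfile


def write_module_input(job_temporary_dir: str, module_input_serialized: bytes) -> str:
    """Persist serialized module input and hand back its path, as the worker now does."""
    module_input_path = os.path.join(job_temporary_dir, 'input-output.bbf')
    with open(module_input_path, 'wb') as fp:
        fp.write(module_input_serialized)
    return module_input_path


def execute_module(module_input_path: str, module_output_path: str) -> None:
    """Executors receive paths and read lazily instead of holding the payload in memory."""
    with open(module_input_path, 'rb') as fp:
        module_input_serialized = fp.read()
    # ... deserialize, run the container, then write the serialized output ...
    with open(module_output_path, 'wb') as fp:
        fp.write(module_input_serialized)  # placeholder for the real module output


with tempfile.TemporaryDirectory() as job_dir:
    input_path = write_module_input(job_dir, b'serialized-module-input')
    execute_module(input_path, os.path.join(job_dir, 'module-output.bbf'))
```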