pybiolib 1.2.1056__py3-none-any.whl → 1.2.1642__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of pybiolib might be problematic.
- biolib/__init__.py +33 -10
- biolib/_data_record/data_record.py +24 -11
- biolib/_index/__init__.py +0 -0
- biolib/_index/index.py +51 -0
- biolib/_index/types.py +7 -0
- biolib/_internal/data_record/data_record.py +1 -1
- biolib/_internal/data_record/push_data.py +1 -1
- biolib/_internal/data_record/remote_storage_endpoint.py +3 -3
- biolib/_internal/file_utils.py +7 -4
- biolib/_internal/index/__init__.py +1 -0
- biolib/_internal/index/index.py +18 -0
- biolib/_internal/lfs/cache.py +4 -2
- biolib/_internal/push_application.py +89 -23
- biolib/_internal/runtime.py +2 -0
- biolib/_internal/templates/gui_template/App.tsx +38 -2
- biolib/_internal/templates/gui_template/Dockerfile +2 -0
- biolib/_internal/templates/gui_template/biolib-sdk.ts +37 -0
- biolib/_internal/templates/gui_template/dev-data/output.json +7 -0
- biolib/_internal/templates/gui_template/package.json +1 -0
- biolib/_internal/templates/gui_template/vite-plugin-dev-data.ts +49 -0
- biolib/_internal/templates/gui_template/vite.config.mts +2 -1
- biolib/_internal/templates/init_template/.github/workflows/biolib.yml +6 -1
- biolib/_internal/templates/init_template/Dockerfile +2 -0
- biolib/_internal/utils/__init__.py +25 -0
- biolib/_internal/utils/job_url.py +33 -0
- biolib/_runtime/runtime.py +9 -0
- biolib/_session/session.py +7 -5
- biolib/_shared/__init__.py +0 -0
- biolib/_shared/types/__init__.py +69 -0
- biolib/_shared/types/resource.py +17 -0
- biolib/_shared/types/resource_deploy_key.py +11 -0
- biolib/{_internal → _shared}/types/resource_permission.py +1 -1
- biolib/_shared/utils/__init__.py +7 -0
- biolib/_shared/utils/resource_uri.py +75 -0
- biolib/api/client.py +1 -1
- biolib/app/app.py +56 -23
- biolib/biolib_api_client/app_types.py +1 -6
- biolib/biolib_api_client/biolib_app_api.py +17 -0
- biolib/biolib_binary_format/module_input.py +8 -0
- biolib/biolib_binary_format/remote_endpoints.py +3 -3
- biolib/biolib_binary_format/remote_stream_seeker.py +39 -25
- biolib/cli/__init__.py +2 -1
- biolib/cli/data_record.py +17 -0
- biolib/cli/index.py +32 -0
- biolib/cli/lfs.py +1 -1
- biolib/cli/start.py +14 -1
- biolib/compute_node/job_worker/executors/docker_executor.py +31 -9
- biolib/compute_node/job_worker/executors/docker_types.py +1 -1
- biolib/compute_node/job_worker/executors/types.py +6 -5
- biolib/compute_node/job_worker/job_worker.py +149 -93
- biolib/compute_node/job_worker/large_file_system.py +2 -6
- biolib/compute_node/job_worker/network_alloc.py +99 -0
- biolib/compute_node/job_worker/network_buffer.py +240 -0
- biolib/compute_node/job_worker/utilization_reporter_thread.py +2 -2
- biolib/compute_node/remote_host_proxy.py +125 -67
- biolib/compute_node/utils.py +2 -0
- biolib/compute_node/webserver/compute_node_results_proxy.py +188 -0
- biolib/compute_node/webserver/proxy_utils.py +28 -0
- biolib/compute_node/webserver/webserver.py +64 -19
- biolib/experiments/experiment.py +98 -16
- biolib/jobs/job.py +119 -29
- biolib/jobs/job_result.py +70 -33
- biolib/jobs/types.py +1 -0
- biolib/sdk/__init__.py +17 -2
- biolib/typing_utils.py +1 -1
- biolib/utils/cache_state.py +2 -2
- biolib/utils/seq_util.py +1 -1
- {pybiolib-1.2.1056.dist-info → pybiolib-1.2.1642.dist-info}/METADATA +4 -2
- {pybiolib-1.2.1056.dist-info → pybiolib-1.2.1642.dist-info}/RECORD +84 -66
- {pybiolib-1.2.1056.dist-info → pybiolib-1.2.1642.dist-info}/WHEEL +1 -1
- biolib/_internal/types/__init__.py +0 -6
- biolib/utils/app_uri.py +0 -57
- /biolib/{_internal → _shared}/types/account.py +0 -0
- /biolib/{_internal → _shared}/types/account_member.py +0 -0
- /biolib/{_internal → _shared}/types/app.py +0 -0
- /biolib/{_internal → _shared}/types/data_record.py +0 -0
- /biolib/{_internal → _shared}/types/experiment.py +0 -0
- /biolib/{_internal → _shared}/types/file_node.py +0 -0
- /biolib/{_internal → _shared}/types/push.py +0 -0
- /biolib/{_internal/types/resource.py → _shared/types/resource_types.py} +0 -0
- /biolib/{_internal → _shared}/types/resource_version.py +0 -0
- /biolib/{_internal → _shared}/types/result.py +0 -0
- /biolib/{_internal → _shared}/types/typing.py +0 -0
- /biolib/{_internal → _shared}/types/user.py +0 -0
- {pybiolib-1.2.1056.dist-info → pybiolib-1.2.1642.dist-info}/entry_points.txt +0 -0
- {pybiolib-1.2.1056.dist-info → pybiolib-1.2.1642.dist-info/licenses}/LICENSE +0 -0
biolib/compute_node/job_worker/job_worker.py

@@ -1,50 +1,63 @@
+import hashlib
 import io
 import json
-import
+import multiprocessing
+import os
 import shlex
+import signal
+import socket
 import sys
 import tempfile
 import zipfile
-from time import time
 from queue import Queue
-import
-import os
-import signal
+from time import time
 from types import FrameType

 from docker.models.networks import Network  # type: ignore
+from docker.types import IPAMConfig, IPAMPool  # type: ignore

-from biolib._internal.http_client import HttpClient
-from biolib.biolib_binary_format.stdout_and_stderr import StdoutAndStderr
-from biolib.compute_node.job_worker.job_legacy_input_wait_timeout_thread import JobLegacyInputWaitTimeout
-from biolib.compute_node.job_worker.job_storage import JobStorage
-from biolib.compute_node.job_worker.large_file_system import LargeFileSystem
-from biolib.biolib_errors import DockerContainerNotFoundDuringExecutionException, BioLibError, \
-    StorageDownloadFailed
-from biolib.compute_node.job_worker.job_max_runtime_timer_thread import JobMaxRuntimeTimerThread
-from biolib.compute_node.remote_host_proxy import RemoteHostProxy
-from biolib.typing_utils import Optional, List, Dict
 from biolib import utils
-from biolib.
-
+from biolib._internal.http_client import HttpClient
+from biolib.biolib_api_client import (
+    AppVersionOnJob,
+    BiolibApiClient,
+    CreatedJobDict,
+    JobWrapper,
+    Module,
+    ModuleEnvironment,
+)
 from biolib.biolib_api_client.biolib_job_api import BiolibJobApi
+from biolib.biolib_binary_format import (
+    InMemoryIndexableBuffer,
+    ModuleInput,
+    ModuleOutputV2,
+    SavedJob,
+    SystemException,
+    SystemStatusUpdate,
+)
+from biolib.biolib_binary_format.stdout_and_stderr import StdoutAndStderr
 from biolib.biolib_docker_client import BiolibDockerClient
+from biolib.biolib_errors import BioLibError, DockerContainerNotFoundDuringExecutionException, StorageDownloadFailed
 from biolib.biolib_logging import logger, logger_no_user_data
 from biolib.compute_node.job_worker.executors import DockerExecutor
 from biolib.compute_node.job_worker.executors.types import LocalExecutorOptions, StatusUpdate
-from biolib.compute_node.
-from biolib.compute_node.
+from biolib.compute_node.job_worker.job_legacy_input_wait_timeout_thread import JobLegacyInputWaitTimeout
+from biolib.compute_node.job_worker.job_max_runtime_timer_thread import JobMaxRuntimeTimerThread
+from biolib.compute_node.job_worker.job_storage import JobStorage
+from biolib.compute_node.job_worker.large_file_system import LargeFileSystem
 from biolib.compute_node.job_worker.mappings import Mappings, path_without_first_folder
+from biolib.compute_node.job_worker.network_buffer import NetworkBuffer
 from biolib.compute_node.job_worker.utils import ComputeProcessException, log_disk_and_memory_usage_info
-from biolib.compute_node.
-from biolib.
-
+from biolib.compute_node.remote_host_proxy import RemoteHostMapping, RemoteHostProxy, get_static_ip_from_network
+from biolib.compute_node.socker_listener_thread import SocketListenerThread
+from biolib.compute_node.socket_sender_thread import SocketSenderThread
+from biolib.compute_node.utils import SystemExceptionCodeMap, SystemExceptionCodes, get_package_type
+from biolib.typing_utils import Dict, List, Optional

 SOCKET_HOST = '127.0.0.1'


 class JobWorkerProcess(multiprocessing.Process):
-
     # note: this method is run in the parent process
     def __init__(self, socket_port: int, log_level: int):
         super().__init__()

@@ -69,6 +82,13 @@ class JobWorker:
         # handle termination signal from parent
         signal.signal(signal.SIGTERM, self._handle_exit_gracefully)

+        try:
+            docker_client = BiolibDockerClient.get_docker_client()
+            networks = docker_client.networks.list()
+            logger_no_user_data.debug(f'Docker networks at JobWorker init: {[net.name for net in networks]}')
+        except Exception as error:
+            logger_no_user_data.debug(f'Failed to list docker networks at init: {error}')
+
         self._socket_port = socket_port
         self._received_messages_queue: Queue = Queue()
         self._messages_to_send_queue: Queue = Queue()

@@ -80,9 +100,9 @@ class JobWorker:

         self._remote_host_proxies: List[RemoteHostProxy] = []
         self._internal_network: Optional[Network] = None
-        self._public_network: Optional[Network] = None
         self._executors: List[DockerExecutor] = []
         self.is_cleaning_up: bool = False
+        self._network_buffer = NetworkBuffer.get_instance()

         self.job_temporary_dir: Optional[str] = None

@@ -91,18 +111,18 @@ class JobWorker:
                 exception,
                 SystemExceptionCodes.FAILED_TO_INIT_COMPUTE_PROCESS_VARIABLES.value,
                 self.send_system_exception,
-                may_contain_user_data=False
+                may_contain_user_data=False,
             ) from exception

         if socket_port:
             self._connect_to_parent()

     def _handle_exit_gracefully(
-
-
-
+        self,
+        signum: int,
+        frame: Optional[FrameType],  # pylint: disable=unused-argument
     ) -> None:
-        job_id = self._root_job_wrapper[
+        job_id = self._root_job_wrapper['job']['public_id'] if self._root_job_wrapper else None
         logger_no_user_data.debug(
             f'_JobWorker ({job_id}) got exit signal {signal.Signals(signum).name}'  # pylint: disable=no-member
         )

@@ -187,9 +207,7 @@ class JobWorker:

         except Exception as exception:
             raise ComputeProcessException(
-                exception,
-                SystemExceptionCodes.UNKNOWN_COMPUTE_PROCESS_ERROR.value,
-                self.send_system_exception
+                exception, SystemExceptionCodes.UNKNOWN_COMPUTE_PROCESS_ERROR.value, self.send_system_exception
             ) from exception

     def _cleanup(self) -> None:

@@ -200,6 +218,8 @@ class JobWorker:
            executor.cleanup()

         proxy_count = len(self._remote_host_proxies)
+        cleaned_networks = set()
+
         if proxy_count > 0:
             logger_no_user_data.debug('Cleaning up proxies...')
             proxy_cleanup_start_time = time()

@@ -211,21 +231,37 @@ class JobWorker:
                     logger_no_user_data.error('Failed to clean up remote host proxy')
                     logger.error(exception)

+                for network in proxy.get_remote_host_networks():
+                    try:
+                        self._cleanup_network(network)
+                        cleaned_networks.add(network.id)
+                    except Exception as exception:  # pylint: disable=broad-except
+                        logger_no_user_data.error(f'Failed to clean up network {network.name}')
+                        logger.error(exception)
+
             self._remote_host_proxies = []
             logger_no_user_data.debug(f'Cleaned up {proxy_count} proxies in {time() - proxy_cleanup_start_time}')

         logger_no_user_data.debug('Cleaning up networks...')
-        self.
+        if self._internal_network and self._internal_network.id not in cleaned_networks:
+            self._cleanup_network(self._internal_network)
         self._internal_network = None
-
-
+
+        try:
+            logger_no_user_data.debug('Refilling network buffer...')
+            created = self._network_buffer.fill_buffer()
+            logger_no_user_data.debug(f'Refilled buffer with {created} new networks')
+        except Exception as exception:  # pylint: disable=broad-except
+            logger_no_user_data.error('Failed to refill network buffer')
+            logger.error(exception)
+
         logger_no_user_data.debug('Cleaned up networks...')

     @staticmethod
     def _cleanup_network(network: Optional[Network]) -> None:
         if network:
             network_cleanup_start_time = time()
-            network_name = network
+            network_name = network.name
             try:
                 network.remove()
             except Exception as exception:  # pylint: disable=broad-except

@@ -237,10 +273,7 @@ class JobWorker:
     def _handle_save_job_wrapper(self, package: bytes):
         job_wrapper_json_string = SavedJob(package).deserialize()
         job_wrapper: JobWrapper = json.loads(job_wrapper_json_string)
-        BiolibApiClient.initialize(
-            base_url=job_wrapper['BASE_URL'],
-            access_token=job_wrapper['access_token']
-        )
+        BiolibApiClient.initialize(base_url=job_wrapper['BASE_URL'], access_token=job_wrapper['access_token'])
         self._root_job_wrapper = job_wrapper
         if not utils.IS_RUNNING_IN_CLOUD:
             job_wrapper['cloud_job'] = None

@@ -253,10 +286,10 @@ class JobWorker:
         app_version = job['app_version']
         modules = app_version.get('modules', [])
         for module in modules:
-
-            if
+            module_ports = module.get('ports', [])
+            if module_ports:
                 logger_no_user_data.debug(
-                    f"Job '{job['public_id']}' module '{module['name']}' has
+                    f"Job '{job['public_id']}' module '{module['name']}' has ports: {module_ports}"
                 )

         if job['app_version'].get('modules') is not None and BiolibDockerClient.is_docker_running():

@@ -268,44 +301,33 @@ class JobWorker:
         app_version = job['app_version']
         job_id = job['public_id']
         remote_hosts = app_version['remote_hosts']
-        if utils.IS_RUNNING_IN_CLOUD:
-            remote_hosts.append(
-                {
-                    'hostname': 'AppCallerProxy',
-                },
-            )
-
         docker_client = BiolibDockerClient.get_docker_client()
         try:
+            name_hash = int(hashlib.sha256(job_id.encode()).hexdigest(), 16)
+            third_octet = name_hash % 256
+            internal_subnet = f'172.29.{third_octet}.0/24'
+
+            ipam_pool = IPAMPool(subnet=internal_subnet)
+            ipam_config = IPAMConfig(pool_configs=[ipam_pool])
+
             self._internal_network = docker_client.networks.create(
                 name=f'biolib-sandboxed-network-{job_id}',
                 internal=True,
                 driver='bridge',
+                ipam=ipam_config,
             )
+            logger_no_user_data.debug(f'Created internal network for job {job_id} with subnet {internal_subnet}')
         except Exception as exception:
             raise ComputeProcessException(
                 exception,
                 SystemExceptionCodes.FAILED_TO_CREATE_DOCKER_NETWORKS.value,
                 self.send_system_exception,
-                may_contain_user_data=False
+                may_contain_user_data=False,
             ) from exception

         if len(remote_hosts) > 0:
-            logger_no_user_data.debug(f'Job "{job_id}"
-
-                self._public_network = docker_client.networks.create(
-                    name=f'biolib-proxy-network-{job_id}',
-                    internal=False,
-                    driver='bridge',
-                )
-            except Exception as exception:
-                raise ComputeProcessException(
-                    exception,
-                    SystemExceptionCodes.FAILED_TO_CREATE_DOCKER_NETWORKS.value,
-                    self.send_system_exception,
-                    may_contain_user_data=False
-                ) from exception
-            logger_no_user_data.debug(f'Job "{job_id}" starting proxies for remote hosts: {remote_hosts}')
+            logger_no_user_data.debug(f'Job "{job_id}" starting proxy for remote hosts: {remote_hosts}')
+            created_networks: List[Network] = []
             try:
                 hostname_to_ports: Dict[str, List[int]] = {}
                 for remote_host in remote_hosts:
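Note: in the hunk above the internal network's subnet is now a pure function of the job id. A minimal standalone sketch of that derivation, assuming only the hashlib-based scheme shown in the diff (the job id value below is a made-up example, and with only 256 buckets two jobs can map to the same /24):

import hashlib

job_id = '1a2b3c4d-0000-0000-0000-000000000000'  # hypothetical job public id
name_hash = int(hashlib.sha256(job_id.encode()).hexdigest(), 16)
third_octet = name_hash % 256  # 256 possible buckets, so collisions between jobs are possible
internal_subnet = f'172.29.{third_octet}.0/24'
print(internal_subnet)  # prints some /24 inside 172.29.0.0/16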
@@ -321,33 +343,67 @@ class JobWorker:
                     else:
                         hostname_to_ports[hostname] = [port]

-
+                remote_host_mappings: List[RemoteHostMapping] = []
+                networks = self._network_buffer.allocate_networks(job_id, len(hostname_to_ports))
+                created_networks.extend(networks)
+
+                for (hostname, ports), network in zip(hostname_to_ports.items(), networks):
+                    static_ip = get_static_ip_from_network(network, offset=2)
+
+                    mapping = RemoteHostMapping(
+                        hostname=hostname,
+                        ports=ports,
+                        network=network,
+                        static_ip=static_ip,
+                    )
+                    remote_host_mappings.append(mapping)
+
+                if remote_host_mappings:
                     remote_host_proxy = RemoteHostProxy(
-
-
-
-                        job_id,
-                        ports,
+                        remote_host_mappings=remote_host_mappings,
+                        job_id=job_id,
+                        app_caller_network=None,
                     )
                     remote_host_proxy.start()
                     self._remote_host_proxies.append(remote_host_proxy)
+                    num_hosts = len(remote_host_mappings)
+                    logger_no_user_data.debug(f'Started single proxy container for {num_hosts} remote hosts')

             except Exception as exception:
+                for network in created_networks:
+                    self._cleanup_network(network)
+
                 raise ComputeProcessException(
                     exception,
                     SystemExceptionCodes.FAILED_TO_START_REMOTE_HOST_PROXIES.value,
                     self.send_system_exception,
-                    may_contain_user_data=False
+                    may_contain_user_data=False,
                 ) from exception

-
+        if utils.IS_RUNNING_IN_CLOUD:
+            try:
+                app_caller_proxy = RemoteHostProxy(
+                    remote_host_mappings=[],
+                    job_id=job_id,
+                    app_caller_network=self._internal_network,
+                )
+                app_caller_proxy.start()
+                self._remote_host_proxies.append(app_caller_proxy)
+                logger_no_user_data.debug('Started app caller proxy')
+            except Exception as exception:
+                raise ComputeProcessException(
+                    exception,
+                    SystemExceptionCodes.FAILED_TO_START_REMOTE_HOST_PROXIES.value,
+                    self.send_system_exception,
+                    may_contain_user_data=False,
+                ) from exception

     def _run_app_version(
-
-
-
-
-
+        self,
+        app_version_id: str,
+        module_input_path: str,
+        caller_job: CreatedJobDict,
+        main_module_output_path: str,
     ) -> None:
         job: CreatedJobDict = BiolibJobApi.create(app_version_id, caller_job=caller_job['public_id'])
         self._jobs[job['public_id']] = job

@@ -366,17 +422,17 @@ class JobWorker:
         root_job_id = root_job['public_id']
         if job.get('arguments_override_command') and not job['app_version']['app']['allow_client_side_execution']:
             raise ComputeProcessException(
-                Exception(
+                Exception('Command override not allowed'),
                 SystemExceptionCodes.COMMAND_OVERRIDE_NOT_ALLOWED.value,
-                self.send_system_exception
+                self.send_system_exception,
             )

         modules = job['app_version'].get('modules')
         if not modules:
             raise ComputeProcessException(
-                Exception(
+                Exception('No modules found on job'),
                 SystemExceptionCodes.NO_MODULES_FOUND_ON_JOB.value,
-                self.send_system_exception
+                self.send_system_exception,
             )

         main_module = self._get_module_from_name(modules, module_name='main')

@@ -429,8 +485,8 @@ class JobWorker:
            log_disk_and_memory_usage_info()

     def _run_module(
-
-
+        self,
+        options: LocalExecutorOptions,
     ) -> None:
         module = options['module']
         job_id = options['job']['public_id']

@@ -443,7 +499,7 @@ class JobWorker:
            if not self.job_temporary_dir:
                raise BioLibError('Undefined job_temporary_dir')
            logger_no_user_data.debug(f'Job "{job_id}" starting child job...')
-            with open(module_input_path,'rb') as fp:
+            with open(module_input_path, 'rb') as fp:
                module_input_serialized = fp.read()
            module_input = ModuleInput(module_input_serialized).deserialize()
            module_input_with_runtime_zip = self._add_runtime_zip_and_command_to_module_input(options, module_input)

@@ -452,7 +508,7 @@ class JobWorker:
                arguments=module_input_with_runtime_zip['arguments'],
                files=module_input_with_runtime_zip['files'],
            )
-            module_input_path_new = os.path.join(self.job_temporary_dir,
+            module_input_path_new = os.path.join(self.job_temporary_dir, 'runtime.' + JobStorage.module_input_file_name)
            open(module_input_path_new, 'wb').write(module_input_with_runtime_zip_serialized)
            return self._run_app_version(
                module['image_uri'],

@@ -469,7 +525,7 @@ class JobWorker:
                    exception,
                    SystemExceptionCodes.FAILED_TO_INITIALIZE_DOCKER_EXECUTOR.value,
                    self.send_system_exception,
-                    may_contain_user_data=False
+                    may_contain_user_data=False,
                ) from exception
        else:
            err_string = f'Job "{job_id}" hit unsupported module environment "{module["environment"]}"'

@@ -494,7 +550,7 @@ class JobWorker:
                exception,
                SystemExceptionCodes.FAILED_TO_CONNECT_TO_WORKER_THREAD_SOCKET.value,
                self.send_system_exception,
-                may_contain_user_data=False
+                may_contain_user_data=False,
            ) from exception

        try:

@@ -505,7 +561,7 @@ class JobWorker:
                exception,
                SystemExceptionCodes.FAILED_TO_START_SENDER_THREAD_OR_RECEIVER_THREAD.value,
                self.send_system_exception,
-                may_contain_user_data=False
+                may_contain_user_data=False,
            ) from exception

        # TODO: move this mapping logic to the ModuleInput class

@@ -533,7 +589,7 @@ class JobWorker:
                exception,
                SystemExceptionCodes.FAILED_TO_CREATE_NEW_JOB.value,
                self.send_system_exception,
-                may_contain_user_data=False
+                may_contain_user_data=False,
            ) from exception

        return module_input

@@ -559,7 +615,7 @@ class JobWorker:
                exception,
                SystemExceptionCodes.FAILED_TO_DOWNLOAD_RUNTIME_ZIP.value,
                self.send_system_exception,
-                may_contain_user_data=False
+                may_contain_user_data=False,
            ) from exception
        finally:
            download_time = time() - start_time

@@ -605,7 +661,7 @@ class JobWorker:
                exception,
                SystemExceptionCodes.FAILED_TO_SEND_STATUS_UPDATE.value,
                self.send_system_exception,
-                may_contain_user_data=False
+                may_contain_user_data=False,
            ) from exception

    def _run_root_job(self, module_input_path: str) -> str:

biolib/compute_node/job_worker/large_file_system.py

@@ -56,17 +56,13 @@ class LargeFileSystem:
         self._path_on_disk_for_write: Optional[str] = None
         self._send_status_update: Callable[[StatusUpdate], None] = send_status_update

-    @property
-    def _is_initialized(self) -> bool:
-        return self._path_on_disk is not None
-
     @property
     def uuid(self) -> str:
         return self._lfs_mapping['uuid']

     @property
     def docker_mount(self) -> docker.types.Mount:
-        if not self.
+        if not self._path_on_disk:
             raise LargeFileSystemError('LargeFileSystem not initialized')

         return docker.types.Mount(

@@ -77,7 +73,7 @@ class LargeFileSystem:
         )

     def initialize(self) -> None:
-        if self.
+        if self._path_on_disk:
             logger_no_user_data.debug(f'LFS {self.uuid} is already initialized')
             return

biolib/compute_node/job_worker/network_alloc.py (new file)

@@ -0,0 +1,99 @@
+import hashlib
+import ipaddress
+import uuid
+from typing import Dict, Optional, cast
+
+from docker.errors import APIError
+from docker.models.networks import Network
+from docker.types import IPAMConfig, IPAMPool
+
+from biolib.biolib_errors import BioLibError
+from biolib.biolib_logging import logger_no_user_data
+from biolib.compute_node.remote_host_proxy import get_static_ip_from_network
+
+
+def _iter_network_subnets(existing_network):
+    ipam_config = existing_network.attrs.get('IPAM', {}).get('Config', [])
+    for cfg in ipam_config:
+        subnet_str = cfg.get('Subnet')
+        if not subnet_str:
+            continue
+        try:
+            yield ipaddress.ip_network(subnet_str, strict=False)
+        except ValueError:
+            continue
+
+
+def _find_overlap(candidate_network, existing_networks):
+    for existing in existing_networks:
+        for subnet in _iter_network_subnets(existing):
+            if candidate_network.overlaps(subnet):
+                return existing, str(subnet)
+    return None
+
+
+def _allocate_network_with_retries(
+    name_prefix: str,
+    docker_client,
+    internal: bool = True,
+    driver: str = 'bridge',
+    max_attempts: int = 10,
+    labels: Optional[Dict[str, str]] = None,
+) -> Network:
+    base_network = ipaddress.ip_network('172.28.0.0/16', strict=False)
+
+    suffix = uuid.uuid4().hex
+    full_name = f'{name_prefix}{suffix}'
+    name_hash = int(hashlib.sha256(full_name.encode()).hexdigest(), 16)
+    starting_offset = name_hash % 256
+
+    for attempt in range(max_attempts):
+        offset = (starting_offset + attempt) % 256
+
+        if base_network.prefixlen == 16:
+            third_octet = offset
+            candidate_subnet = f'{base_network.network_address.exploded.rsplit(".", 2)[0]}.{third_octet}.0/24'
+        else:
+            candidate_subnet = f'{base_network.network_address.exploded.rsplit(".", 1)[0]}.{offset}.0/24'
+
+        candidate_network = ipaddress.ip_network(candidate_subnet, strict=False)
+
+        existing_networks = docker_client.networks.list()
+        overlap = _find_overlap(candidate_network, existing_networks)
+        if overlap:
+            existing_network, existing_subnet = overlap
+            logger_no_user_data.debug(
+                f'Subnet {candidate_subnet} conflicts with existing network '
+                f'{existing_network.name} ({existing_subnet}), trying next candidate'
+            )
+            continue
+
+        ipam_pool = IPAMPool(subnet=candidate_subnet)
+        computed_ipam_config = IPAMConfig(pool_configs=[ipam_pool])
+
+        try:
+            network = cast(
+                Network,
+                docker_client.networks.create(
+                    name=full_name,
+                    internal=internal,
+                    driver=driver,
+                    ipam=computed_ipam_config,
+                    labels=labels or {},
+                ),
+            )
+            static_ip = get_static_ip_from_network(network, offset=2)
+            logger_no_user_data.debug(
+                f'Created network {full_name} with subnet {candidate_subnet} and static IP {static_ip}'
+            )
+            return network
+        except APIError as api_error:
+            logger_no_user_data.debug(
+                f'Network creation failed with Docker API error for subnet {candidate_subnet}: {api_error}, '
+                f'trying next candidate (attempt {attempt + 1}/{max_attempts})'
+            )
+            continue
+
+    raise BioLibError(
+        f'Failed to allocate and create network {full_name} after {max_attempts} attempts. ' f'Base CIDR: 172.28.0.0/16'
+    )