pybiolib 1.2.883__py3-none-any.whl → 1.2.1890__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as published to a supported public registry. It is provided for informational purposes only.
- biolib/__init__.py +33 -10
- biolib/_data_record/data_record.py +220 -126
- biolib/_index/index.py +55 -0
- biolib/_index/query_result.py +103 -0
- biolib/_internal/add_copilot_prompts.py +24 -11
- biolib/_internal/add_gui_files.py +81 -0
- biolib/_internal/data_record/__init__.py +1 -1
- biolib/_internal/data_record/data_record.py +1 -18
- biolib/_internal/data_record/push_data.py +65 -16
- biolib/_internal/data_record/remote_storage_endpoint.py +18 -13
- biolib/_internal/file_utils.py +48 -0
- biolib/_internal/lfs/cache.py +4 -2
- biolib/_internal/push_application.py +95 -24
- biolib/_internal/runtime.py +2 -0
- biolib/_internal/string_utils.py +13 -0
- biolib/_internal/{llm_instructions → templates/copilot_template}/.github/instructions/style-general.instructions.md +5 -0
- biolib/_internal/templates/copilot_template/.github/instructions/style-react-ts.instructions.md +47 -0
- biolib/_internal/templates/copilot_template/.github/prompts/biolib_onboard_repo.prompt.md +19 -0
- biolib/_internal/templates/dashboard_template/.biolib/config.yml +5 -0
- biolib/_internal/templates/{init_template → github_workflow_template}/.github/workflows/biolib.yml +7 -2
- biolib/_internal/templates/gitignore_template/.gitignore +10 -0
- biolib/_internal/templates/gui_template/.yarnrc.yml +1 -0
- biolib/_internal/templates/gui_template/App.tsx +53 -0
- biolib/_internal/templates/gui_template/Dockerfile +27 -0
- biolib/_internal/templates/gui_template/biolib-sdk.ts +82 -0
- biolib/_internal/templates/gui_template/dev-data/output.json +7 -0
- biolib/_internal/templates/gui_template/index.css +5 -0
- biolib/_internal/templates/gui_template/index.html +13 -0
- biolib/_internal/templates/gui_template/index.tsx +10 -0
- biolib/_internal/templates/gui_template/package.json +27 -0
- biolib/_internal/templates/gui_template/tsconfig.json +24 -0
- biolib/_internal/templates/gui_template/vite-plugin-dev-data.ts +50 -0
- biolib/_internal/templates/gui_template/vite.config.mts +10 -0
- biolib/_internal/templates/init_template/.biolib/config.yml +1 -0
- biolib/_internal/templates/init_template/Dockerfile +5 -1
- biolib/_internal/templates/init_template/run.py +6 -15
- biolib/_internal/templates/init_template/run.sh +1 -0
- biolib/_internal/templates/templates.py +21 -1
- biolib/_internal/utils/__init__.py +47 -0
- biolib/_internal/utils/auth.py +46 -0
- biolib/_internal/utils/job_url.py +33 -0
- biolib/_internal/utils/multinode.py +12 -14
- biolib/_runtime/runtime.py +15 -2
- biolib/_session/session.py +7 -5
- biolib/_shared/__init__.py +0 -0
- biolib/_shared/types/__init__.py +74 -0
- biolib/_shared/types/account.py +12 -0
- biolib/_shared/types/account_member.py +8 -0
- biolib/{_internal → _shared}/types/experiment.py +1 -0
- biolib/_shared/types/resource.py +37 -0
- biolib/_shared/types/resource_deploy_key.py +11 -0
- biolib/{_internal → _shared}/types/resource_version.py +8 -2
- biolib/_shared/types/user.py +19 -0
- biolib/_shared/utils/__init__.py +7 -0
- biolib/_shared/utils/resource_uri.py +75 -0
- biolib/api/client.py +5 -48
- biolib/app/app.py +97 -55
- biolib/biolib_api_client/api_client.py +3 -47
- biolib/biolib_api_client/app_types.py +1 -1
- biolib/biolib_api_client/biolib_app_api.py +31 -6
- biolib/biolib_api_client/biolib_job_api.py +1 -1
- biolib/biolib_api_client/user_state.py +34 -2
- biolib/biolib_binary_format/module_input.py +8 -0
- biolib/biolib_binary_format/remote_endpoints.py +3 -3
- biolib/biolib_binary_format/remote_stream_seeker.py +39 -25
- biolib/biolib_logging.py +1 -1
- biolib/cli/__init__.py +2 -2
- biolib/cli/auth.py +4 -16
- biolib/cli/data_record.py +82 -0
- biolib/cli/index.py +32 -0
- biolib/cli/init.py +393 -71
- biolib/cli/lfs.py +1 -1
- biolib/cli/run.py +9 -6
- biolib/cli/start.py +14 -1
- biolib/compute_node/job_worker/executors/docker_executor.py +31 -9
- biolib/compute_node/job_worker/executors/docker_types.py +1 -1
- biolib/compute_node/job_worker/executors/types.py +6 -5
- biolib/compute_node/job_worker/job_storage.py +2 -1
- biolib/compute_node/job_worker/job_worker.py +155 -90
- biolib/compute_node/job_worker/large_file_system.py +2 -6
- biolib/compute_node/job_worker/network_alloc.py +99 -0
- biolib/compute_node/job_worker/network_buffer.py +240 -0
- biolib/compute_node/job_worker/utilization_reporter_thread.py +2 -2
- biolib/compute_node/remote_host_proxy.py +163 -79
- biolib/compute_node/utils.py +2 -0
- biolib/compute_node/webserver/compute_node_results_proxy.py +189 -0
- biolib/compute_node/webserver/proxy_utils.py +28 -0
- biolib/compute_node/webserver/webserver.py +64 -19
- biolib/experiments/experiment.py +111 -16
- biolib/jobs/job.py +128 -31
- biolib/jobs/job_result.py +74 -34
- biolib/jobs/types.py +1 -0
- biolib/sdk/__init__.py +28 -3
- biolib/typing_utils.py +1 -1
- biolib/utils/cache_state.py +8 -5
- biolib/utils/multipart_uploader.py +24 -18
- biolib/utils/seq_util.py +1 -1
- pybiolib-1.2.1890.dist-info/METADATA +41 -0
- pybiolib-1.2.1890.dist-info/RECORD +177 -0
- {pybiolib-1.2.883.dist-info → pybiolib-1.2.1890.dist-info}/WHEEL +1 -1
- pybiolib-1.2.1890.dist-info/entry_points.txt +2 -0
- biolib/_internal/llm_instructions/.github/instructions/style-react-ts.instructions.md +0 -22
- biolib/_internal/templates/init_template/.gitignore +0 -2
- biolib/_internal/types/__init__.py +0 -6
- biolib/_internal/types/resource.py +0 -18
- biolib/biolib_download_container.py +0 -38
- biolib/cli/download_container.py +0 -14
- biolib/utils/app_uri.py +0 -57
- pybiolib-1.2.883.dist-info/METADATA +0 -50
- pybiolib-1.2.883.dist-info/RECORD +0 -148
- pybiolib-1.2.883.dist-info/entry_points.txt +0 -3
- /biolib/{_internal/llm_instructions → _index}/__init__.py +0 -0
- /biolib/_internal/{llm_instructions → templates/copilot_template}/.github/instructions/general-app-knowledge.instructions.md +0 -0
- /biolib/_internal/{llm_instructions → templates/copilot_template}/.github/instructions/style-python.instructions.md +0 -0
- /biolib/_internal/{llm_instructions → templates/copilot_template}/.github/prompts/biolib_app_inputs.prompt.md +0 -0
- /biolib/_internal/{llm_instructions → templates/copilot_template}/.github/prompts/biolib_run_apps.prompt.md +0 -0
- /biolib/{_internal → _shared}/types/app.py +0 -0
- /biolib/{_internal → _shared}/types/data_record.py +0 -0
- /biolib/{_internal → _shared}/types/file_node.py +0 -0
- /biolib/{_internal → _shared}/types/push.py +0 -0
- /biolib/{_internal → _shared}/types/resource_permission.py +0 -0
- /biolib/{_internal → _shared}/types/result.py +0 -0
- /biolib/{_internal → _shared}/types/typing.py +0 -0
- {pybiolib-1.2.883.dist-info → pybiolib-1.2.1890.dist-info/licenses}/LICENSE +0 -0
biolib/compute_node/job_worker/job_worker.py +155 -90

@@ -1,50 +1,63 @@
+import hashlib
 import io
 import json
-import
+import multiprocessing
+import os
 import shlex
+import signal
+import socket
 import sys
 import tempfile
 import zipfile
-from time import time
 from queue import Queue
-import
-import os
-import signal
+from time import time
 from types import FrameType
 
 from docker.models.networks import Network  # type: ignore
+from docker.types import IPAMConfig, IPAMPool  # type: ignore
 
-from biolib._internal.http_client import HttpClient
-from biolib.biolib_binary_format.stdout_and_stderr import StdoutAndStderr
-from biolib.compute_node.job_worker.job_legacy_input_wait_timeout_thread import JobLegacyInputWaitTimeout
-from biolib.compute_node.job_worker.job_storage import JobStorage
-from biolib.compute_node.job_worker.large_file_system import LargeFileSystem
-from biolib.biolib_errors import DockerContainerNotFoundDuringExecutionException, BioLibError, \
-    StorageDownloadFailed
-from biolib.compute_node.job_worker.job_max_runtime_timer_thread import JobMaxRuntimeTimerThread
-from biolib.compute_node.remote_host_proxy import RemoteHostProxy
-from biolib.typing_utils import Optional, List, Dict
 from biolib import utils
-from biolib.
-
+from biolib._internal.http_client import HttpClient
+from biolib.biolib_api_client import (
+    AppVersionOnJob,
+    BiolibApiClient,
+    CreatedJobDict,
+    JobWrapper,
+    Module,
+    ModuleEnvironment,
+)
 from biolib.biolib_api_client.biolib_job_api import BiolibJobApi
+from biolib.biolib_binary_format import (
+    InMemoryIndexableBuffer,
+    ModuleInput,
+    ModuleOutputV2,
+    SavedJob,
+    SystemException,
+    SystemStatusUpdate,
+)
+from biolib.biolib_binary_format.stdout_and_stderr import StdoutAndStderr
 from biolib.biolib_docker_client import BiolibDockerClient
+from biolib.biolib_errors import BioLibError, DockerContainerNotFoundDuringExecutionException, StorageDownloadFailed
 from biolib.biolib_logging import logger, logger_no_user_data
 from biolib.compute_node.job_worker.executors import DockerExecutor
 from biolib.compute_node.job_worker.executors.types import LocalExecutorOptions, StatusUpdate
-from biolib.compute_node.
-from biolib.compute_node.
+from biolib.compute_node.job_worker.job_legacy_input_wait_timeout_thread import JobLegacyInputWaitTimeout
+from biolib.compute_node.job_worker.job_max_runtime_timer_thread import JobMaxRuntimeTimerThread
+from biolib.compute_node.job_worker.job_storage import JobStorage
+from biolib.compute_node.job_worker.large_file_system import LargeFileSystem
 from biolib.compute_node.job_worker.mappings import Mappings, path_without_first_folder
+from biolib.compute_node.job_worker.network_buffer import NetworkBuffer
 from biolib.compute_node.job_worker.utils import ComputeProcessException, log_disk_and_memory_usage_info
-from biolib.compute_node.
-from biolib.
-
+from biolib.compute_node.remote_host_proxy import RemoteHostMapping, RemoteHostProxy, get_static_ip_from_network
+from biolib.compute_node.socker_listener_thread import SocketListenerThread
+from biolib.compute_node.socket_sender_thread import SocketSenderThread
+from biolib.compute_node.utils import SystemExceptionCodeMap, SystemExceptionCodes, get_package_type
+from biolib.typing_utils import Dict, List, Optional
 
 SOCKET_HOST = '127.0.0.1'
 
 
 class JobWorkerProcess(multiprocessing.Process):
-
     # note: this method is run in the parent process
     def __init__(self, socket_port: int, log_level: int):
         super().__init__()
@@ -69,6 +82,13 @@ class JobWorker:
         # handle termination signal from parent
         signal.signal(signal.SIGTERM, self._handle_exit_gracefully)
 
+        try:
+            docker_client = BiolibDockerClient.get_docker_client()
+            networks = docker_client.networks.list()
+            logger_no_user_data.debug(f'Docker networks at JobWorker init: {[net.name for net in networks]}')
+        except Exception as error:
+            logger_no_user_data.debug(f'Failed to list docker networks at init: {error}')
+
         self._socket_port = socket_port
         self._received_messages_queue: Queue = Queue()
         self._messages_to_send_queue: Queue = Queue()
@@ -80,9 +100,9 @@ class JobWorker:
 
         self._remote_host_proxies: List[RemoteHostProxy] = []
         self._internal_network: Optional[Network] = None
-        self._public_network: Optional[Network] = None
         self._executors: List[DockerExecutor] = []
         self.is_cleaning_up: bool = False
+        self._network_buffer = NetworkBuffer.get_instance()
 
         self.job_temporary_dir: Optional[str] = None
 
@@ -91,18 +111,18 @@ class JobWorker:
                 exception,
                 SystemExceptionCodes.FAILED_TO_INIT_COMPUTE_PROCESS_VARIABLES.value,
                 self.send_system_exception,
-                may_contain_user_data=False
+                may_contain_user_data=False,
             ) from exception
 
         if socket_port:
            self._connect_to_parent()
 
     def _handle_exit_gracefully(
-
-
-
+        self,
+        signum: int,
+        frame: Optional[FrameType],  # pylint: disable=unused-argument
     ) -> None:
-        job_id = self._root_job_wrapper[
+        job_id = self._root_job_wrapper['job']['public_id'] if self._root_job_wrapper else None
         logger_no_user_data.debug(
             f'_JobWorker ({job_id}) got exit signal {signal.Signals(signum).name}'  # pylint: disable=no-member
         )
@@ -187,9 +207,7 @@ class JobWorker:
 
         except Exception as exception:
             raise ComputeProcessException(
-                exception,
-                SystemExceptionCodes.UNKNOWN_COMPUTE_PROCESS_ERROR.value,
-                self.send_system_exception
+                exception, SystemExceptionCodes.UNKNOWN_COMPUTE_PROCESS_ERROR.value, self.send_system_exception
             ) from exception
 
     def _cleanup(self) -> None:
@@ -200,6 +218,8 @@ class JobWorker:
             executor.cleanup()
 
         proxy_count = len(self._remote_host_proxies)
+        cleaned_networks = set()
+
        if proxy_count > 0:
            logger_no_user_data.debug('Cleaning up proxies...')
            proxy_cleanup_start_time = time()
@@ -211,21 +231,37 @@ class JobWorker:
                    logger_no_user_data.error('Failed to clean up remote host proxy')
                    logger.error(exception)
 
+                for network in proxy.get_remote_host_networks():
+                    try:
+                        self._cleanup_network(network)
+                        cleaned_networks.add(network.id)
+                    except Exception as exception:  # pylint: disable=broad-except
+                        logger_no_user_data.error(f'Failed to clean up network {network.name}')
+                        logger.error(exception)
+
            self._remote_host_proxies = []
            logger_no_user_data.debug(f'Cleaned up {proxy_count} proxies in {time() - proxy_cleanup_start_time}')
 
        logger_no_user_data.debug('Cleaning up networks...')
-        self.
+        if self._internal_network and self._internal_network.id not in cleaned_networks:
+            self._cleanup_network(self._internal_network)
        self._internal_network = None
-
-
+
+        try:
+            logger_no_user_data.debug('Refilling network buffer...')
+            created = self._network_buffer.fill_buffer()
+            logger_no_user_data.debug(f'Refilled buffer with {created} new networks')
+        except Exception as exception:  # pylint: disable=broad-except
+            logger_no_user_data.error('Failed to refill network buffer')
+            logger.error(exception)
+
        logger_no_user_data.debug('Cleaned up networks...')
 
    @staticmethod
    def _cleanup_network(network: Optional[Network]) -> None:
        if network:
            network_cleanup_start_time = time()
-            network_name = network
+            network_name = network.name
            try:
                network.remove()
            except Exception as exception:  # pylint: disable=broad-except
@@ -237,10 +273,7 @@ class JobWorker:
    def _handle_save_job_wrapper(self, package: bytes):
        job_wrapper_json_string = SavedJob(package).deserialize()
        job_wrapper: JobWrapper = json.loads(job_wrapper_json_string)
-        BiolibApiClient.initialize(
-            base_url=job_wrapper['BASE_URL'],
-            access_token=job_wrapper['access_token']
-        )
+        BiolibApiClient.initialize(base_url=job_wrapper['BASE_URL'], access_token=job_wrapper['access_token'])
        self._root_job_wrapper = job_wrapper
        if not utils.IS_RUNNING_IN_CLOUD:
            job_wrapper['cloud_job'] = None
@@ -250,6 +283,15 @@ class JobWorker:
        job = job_wrapper['job']
        self._jobs[job['public_id']] = job
 
+        app_version = job['app_version']
+        modules = app_version.get('modules', [])
+        for module in modules:
+            module_ports = module.get('ports', [])
+            if module_ports:
+                logger_no_user_data.debug(
+                    f"Job '{job['public_id']}' module '{module['name']}' has ports: {module_ports}"
+                )
+
        if job['app_version'].get('modules') is not None and BiolibDockerClient.is_docker_running():
            self._start_network_and_remote_host_proxies(job)
 
@@ -259,44 +301,33 @@ class JobWorker:
        app_version = job['app_version']
        job_id = job['public_id']
        remote_hosts = app_version['remote_hosts']
-        if utils.IS_RUNNING_IN_CLOUD:
-            remote_hosts.append(
-                {
-                    'hostname': 'AppCallerProxy',
-                },
-            )
-
        docker_client = BiolibDockerClient.get_docker_client()
        try:
+            name_hash = int(hashlib.sha256(job_id.encode()).hexdigest(), 16)
+            third_octet = name_hash % 256
+            internal_subnet = f'172.29.{third_octet}.0/24'
+
+            ipam_pool = IPAMPool(subnet=internal_subnet)
+            ipam_config = IPAMConfig(pool_configs=[ipam_pool])
+
            self._internal_network = docker_client.networks.create(
                name=f'biolib-sandboxed-network-{job_id}',
                internal=True,
                driver='bridge',
+                ipam=ipam_config,
            )
+            logger_no_user_data.debug(f'Created internal network for job {job_id} with subnet {internal_subnet}')
        except Exception as exception:
            raise ComputeProcessException(
                exception,
                SystemExceptionCodes.FAILED_TO_CREATE_DOCKER_NETWORKS.value,
                self.send_system_exception,
-                may_contain_user_data=False
+                may_contain_user_data=False,
            ) from exception
 
        if len(remote_hosts) > 0:
-            logger_no_user_data.debug(f'Job "{job_id}"
-
-            self._public_network = docker_client.networks.create(
-                name=f'biolib-proxy-network-{job_id}',
-                internal=False,
-                driver='bridge',
-            )
-        except Exception as exception:
-            raise ComputeProcessException(
-                exception,
-                SystemExceptionCodes.FAILED_TO_CREATE_DOCKER_NETWORKS.value,
-                self.send_system_exception,
-                may_contain_user_data=False
-            ) from exception
-            logger_no_user_data.debug(f'Job "{job_id}" starting proxies for remote hosts: {remote_hosts}')
+            logger_no_user_data.debug(f'Job "{job_id}" starting proxy for remote hosts: {remote_hosts}')
+            created_networks: List[Network] = []
            try:
                hostname_to_ports: Dict[str, List[int]] = {}
                for remote_host in remote_hosts:
@@ -312,33 +343,67 @@ class JobWorker:
                    else:
                        hostname_to_ports[hostname] = [port]
 
-
+                remote_host_mappings: List[RemoteHostMapping] = []
+                networks = self._network_buffer.allocate_networks(job_id, len(hostname_to_ports))
+                created_networks.extend(networks)
+
+                for (hostname, ports), network in zip(hostname_to_ports.items(), networks):
+                    static_ip = get_static_ip_from_network(network, offset=2)
+
+                    mapping = RemoteHostMapping(
+                        hostname=hostname,
+                        ports=ports,
+                        network=network,
+                        static_ip=static_ip,
+                    )
+                    remote_host_mappings.append(mapping)
+
+                if remote_host_mappings:
                    remote_host_proxy = RemoteHostProxy(
-
-
-
-                        job_id,
-                        ports,
+                        remote_host_mappings=remote_host_mappings,
+                        job=job,
+                        app_caller_network=None,
                    )
                    remote_host_proxy.start()
                    self._remote_host_proxies.append(remote_host_proxy)
+                    num_hosts = len(remote_host_mappings)
+                    logger_no_user_data.debug(f'Started single proxy container for {num_hosts} remote hosts')
 
            except Exception as exception:
+                for network in created_networks:
+                    self._cleanup_network(network)
+
                raise ComputeProcessException(
                    exception,
                    SystemExceptionCodes.FAILED_TO_START_REMOTE_HOST_PROXIES.value,
                    self.send_system_exception,
-                    may_contain_user_data=False
+                    may_contain_user_data=False,
                ) from exception
 
-
+        if utils.IS_RUNNING_IN_CLOUD:
+            try:
+                app_caller_proxy = RemoteHostProxy(
+                    remote_host_mappings=[],
+                    job=job,
+                    app_caller_network=self._internal_network,
+                )
+                app_caller_proxy.start()
+                self._remote_host_proxies.append(app_caller_proxy)
+                logger_no_user_data.debug('Started app caller proxy')
+            except Exception as exception:
+                raise ComputeProcessException(
+                    exception,
+                    SystemExceptionCodes.FAILED_TO_START_REMOTE_HOST_PROXIES.value,
+                    self.send_system_exception,
+                    may_contain_user_data=False,
+                ) from exception
 
    def _run_app_version(
-
-
-
-
-
+        self,
+        app_version_id: str,
+        module_input_path: str,
+        caller_job: CreatedJobDict,
+        main_module_output_path: str,
    ) -> None:
        job: CreatedJobDict = BiolibJobApi.create(app_version_id, caller_job=caller_job['public_id'])
        self._jobs[job['public_id']] = job
@@ -357,17 +422,17 @@ class JobWorker:
        root_job_id = root_job['public_id']
        if job.get('arguments_override_command') and not job['app_version']['app']['allow_client_side_execution']:
            raise ComputeProcessException(
-                Exception(
+                Exception('Command override not allowed'),
                SystemExceptionCodes.COMMAND_OVERRIDE_NOT_ALLOWED.value,
-                self.send_system_exception
+                self.send_system_exception,
            )
 
        modules = job['app_version'].get('modules')
        if not modules:
            raise ComputeProcessException(
-                Exception(
+                Exception('No modules found on job'),
                SystemExceptionCodes.NO_MODULES_FOUND_ON_JOB.value,
-                self.send_system_exception
+                self.send_system_exception,
            )
 
        main_module = self._get_module_from_name(modules, module_name='main')
@@ -420,8 +485,8 @@ class JobWorker:
        log_disk_and_memory_usage_info()
 
    def _run_module(
-
-
+        self,
+        options: LocalExecutorOptions,
    ) -> None:
        module = options['module']
        job_id = options['job']['public_id']
@@ -434,7 +499,7 @@ class JobWorker:
            if not self.job_temporary_dir:
                raise BioLibError('Undefined job_temporary_dir')
            logger_no_user_data.debug(f'Job "{job_id}" starting child job...')
-            with open(module_input_path,'rb') as fp:
+            with open(module_input_path, 'rb') as fp:
                module_input_serialized = fp.read()
            module_input = ModuleInput(module_input_serialized).deserialize()
            module_input_with_runtime_zip = self._add_runtime_zip_and_command_to_module_input(options, module_input)
@@ -443,7 +508,7 @@ class JobWorker:
                arguments=module_input_with_runtime_zip['arguments'],
                files=module_input_with_runtime_zip['files'],
            )
-            module_input_path_new = os.path.join(self.job_temporary_dir,
+            module_input_path_new = os.path.join(self.job_temporary_dir, 'runtime.' + JobStorage.module_input_file_name)
            open(module_input_path_new, 'wb').write(module_input_with_runtime_zip_serialized)
            return self._run_app_version(
                module['image_uri'],
@@ -460,7 +525,7 @@ class JobWorker:
                    exception,
                    SystemExceptionCodes.FAILED_TO_INITIALIZE_DOCKER_EXECUTOR.value,
                    self.send_system_exception,
-                    may_contain_user_data=False
+                    may_contain_user_data=False,
                ) from exception
        else:
            err_string = f'Job "{job_id}" hit unsupported module environment "{module["environment"]}"'
@@ -485,7 +550,7 @@ class JobWorker:
                exception,
                SystemExceptionCodes.FAILED_TO_CONNECT_TO_WORKER_THREAD_SOCKET.value,
                self.send_system_exception,
-                may_contain_user_data=False
+                may_contain_user_data=False,
            ) from exception
 
        try:
@@ -496,7 +561,7 @@ class JobWorker:
                exception,
                SystemExceptionCodes.FAILED_TO_START_SENDER_THREAD_OR_RECEIVER_THREAD.value,
                self.send_system_exception,
-                may_contain_user_data=False
+                may_contain_user_data=False,
            ) from exception
 
        # TODO: move this mapping logic to the ModuleInput class
@@ -524,7 +589,7 @@ class JobWorker:
                exception,
                SystemExceptionCodes.FAILED_TO_CREATE_NEW_JOB.value,
                self.send_system_exception,
-                may_contain_user_data=False
+                may_contain_user_data=False,
            ) from exception
 
        return module_input
@@ -550,7 +615,7 @@ class JobWorker:
                exception,
                SystemExceptionCodes.FAILED_TO_DOWNLOAD_RUNTIME_ZIP.value,
                self.send_system_exception,
-                may_contain_user_data=False
+                may_contain_user_data=False,
            ) from exception
        finally:
            download_time = time() - start_time
@@ -596,7 +661,7 @@ class JobWorker:
            exception,
            SystemExceptionCodes.FAILED_TO_SEND_STATUS_UPDATE.value,
            self.send_system_exception,
-            may_contain_user_data=False
+            may_contain_user_data=False,
        ) from exception
 
    def _run_root_job(self, module_input_path: str) -> str:
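The internal sandbox network is now created with a deterministic per-job subnet instead of one picked by Docker. A minimal sketch of the derivation used in the hunk above; the helper name `internal_subnet_for_job` is ours, for illustration only:

```python
import hashlib

def internal_subnet_for_job(job_id: str) -> str:
    # SHA-256 of the job id, reduced mod 256, selects the third octet,
    # so a given job id always maps to the same 172.29.X.0/24 subnet.
    name_hash = int(hashlib.sha256(job_id.encode()).hexdigest(), 16)
    third_octet = name_hash % 256
    return f'172.29.{third_octet}.0/24'

print(internal_subnet_for_job('a1b2c3'))  # same /24 on every call for this id
```

Two distinct job ids can hash to the same octet; the new network_alloc.py module (further below) handles such collisions by probing alternative /24 candidates and skipping any subnet that overlaps an existing Docker network.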
biolib/compute_node/job_worker/large_file_system.py +2 -6

@@ -56,17 +56,13 @@ class LargeFileSystem:
        self._path_on_disk_for_write: Optional[str] = None
        self._send_status_update: Callable[[StatusUpdate], None] = send_status_update
 
-    @property
-    def _is_initialized(self) -> bool:
-        return self._path_on_disk is not None
-
    @property
    def uuid(self) -> str:
        return self._lfs_mapping['uuid']
 
    @property
    def docker_mount(self) -> docker.types.Mount:
-        if not self._is_initialized:
+        if not self._path_on_disk:
            raise LargeFileSystemError('LargeFileSystem not initialized')
 
        return docker.types.Mount(
@@ -77,7 +73,7 @@ class LargeFileSystem:
        )
 
    def initialize(self) -> None:
-        if self._is_initialized:
+        if self._path_on_disk:
            logger_no_user_data.debug(f'LFS {self.uuid} is already initialized')
            return
 
biolib/compute_node/job_worker/network_alloc.py +99 -0 (new file)

@@ -0,0 +1,99 @@
+import hashlib
+import ipaddress
+import uuid
+from typing import Dict, Optional, cast
+
+from docker.errors import APIError
+from docker.models.networks import Network
+from docker.types import IPAMConfig, IPAMPool
+
+from biolib.biolib_errors import BioLibError
+from biolib.biolib_logging import logger_no_user_data
+from biolib.compute_node.remote_host_proxy import get_static_ip_from_network
+
+
+def _iter_network_subnets(existing_network):
+    ipam_config = existing_network.attrs.get('IPAM', {}).get('Config', [])
+    for cfg in ipam_config:
+        subnet_str = cfg.get('Subnet')
+        if not subnet_str:
+            continue
+        try:
+            yield ipaddress.ip_network(subnet_str, strict=False)
+        except ValueError:
+            continue
+
+
+def _find_overlap(candidate_network, existing_networks):
+    for existing in existing_networks:
+        for subnet in _iter_network_subnets(existing):
+            if candidate_network.overlaps(subnet):
+                return existing, str(subnet)
+    return None
+
+
+def _allocate_network_with_retries(
+    name_prefix: str,
+    docker_client,
+    internal: bool = True,
+    driver: str = 'bridge',
+    max_attempts: int = 10,
+    labels: Optional[Dict[str, str]] = None,
+) -> Network:
+    base_network = ipaddress.ip_network('172.28.0.0/16', strict=False)
+
+    suffix = uuid.uuid4().hex
+    full_name = f'{name_prefix}{suffix}'
+    name_hash = int(hashlib.sha256(full_name.encode()).hexdigest(), 16)
+    starting_offset = name_hash % 256
+
+    for attempt in range(max_attempts):
+        offset = (starting_offset + attempt) % 256
+
+        if base_network.prefixlen == 16:
+            third_octet = offset
+            candidate_subnet = f'{base_network.network_address.exploded.rsplit(".", 2)[0]}.{third_octet}.0/24'
+        else:
+            candidate_subnet = f'{base_network.network_address.exploded.rsplit(".", 1)[0]}.{offset}.0/24'
+
+        candidate_network = ipaddress.ip_network(candidate_subnet, strict=False)
+
+        existing_networks = docker_client.networks.list()
+        overlap = _find_overlap(candidate_network, existing_networks)
+        if overlap:
+            existing_network, existing_subnet = overlap
+            logger_no_user_data.debug(
+                f'Subnet {candidate_subnet} conflicts with existing network '
+                f'{existing_network.name} ({existing_subnet}), trying next candidate'
+            )
+            continue
+
+        ipam_pool = IPAMPool(subnet=candidate_subnet)
+        computed_ipam_config = IPAMConfig(pool_configs=[ipam_pool])
+
+        try:
+            network = cast(
+                Network,
+                docker_client.networks.create(
+                    name=full_name,
+                    internal=internal,
+                    driver=driver,
+                    ipam=computed_ipam_config,
+                    labels=labels or {},
+                ),
+            )
+            static_ip = get_static_ip_from_network(network, offset=2)
+            logger_no_user_data.debug(
+                f'Created network {full_name} with subnet {candidate_subnet} and static IP {static_ip}'
+            )
+            return network
+        except APIError as api_error:
+            logger_no_user_data.debug(
+                f'Network creation failed with Docker API error for subnet {candidate_subnet}: {api_error}, '
+                f'trying next candidate (attempt {attempt + 1}/{max_attempts})'
+            )
+            continue
+
+    raise BioLibError(
+        f'Failed to allocate and create network {full_name} after {max_attempts} attempts. ' f'Base CIDR: 172.28.0.0/16'
+    )