pybiolib 1.1.1881__py3-none-any.whl → 1.2.7.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biolib/__init__.py +11 -4
- biolib/_data_record/data_record.py +278 -0
- biolib/_internal/data_record/__init__.py +1 -1
- biolib/_internal/data_record/data_record.py +97 -151
- biolib/_internal/data_record/remote_storage_endpoint.py +18 -7
- biolib/_internal/file_utils.py +77 -0
- biolib/_internal/fuse_mount/__init__.py +1 -0
- biolib/_internal/fuse_mount/experiment_fuse_mount.py +209 -0
- biolib/_internal/http_client.py +31 -9
- biolib/_internal/lfs/__init__.py +1 -0
- biolib/_internal/libs/__init__.py +1 -0
- biolib/_internal/libs/fusepy/__init__.py +1257 -0
- biolib/_internal/push_application.py +6 -1
- biolib/_internal/runtime.py +3 -56
- biolib/_internal/types/__init__.py +4 -0
- biolib/_internal/types/app.py +9 -0
- biolib/_internal/types/data_record.py +40 -0
- biolib/_internal/types/experiment.py +10 -0
- biolib/_internal/types/resource.py +14 -0
- biolib/_internal/types/typing.py +7 -0
- biolib/_internal/utils/multinode.py +264 -0
- biolib/_runtime/runtime.py +84 -0
- biolib/api/__init__.py +1 -0
- biolib/api/client.py +39 -17
- biolib/app/app.py +34 -71
- biolib/biolib_api_client/api_client.py +9 -2
- biolib/biolib_api_client/app_types.py +3 -2
- biolib/biolib_api_client/biolib_job_api.py +6 -0
- biolib/biolib_api_client/job_types.py +4 -4
- biolib/biolib_api_client/lfs_types.py +8 -2
- biolib/biolib_binary_format/remote_endpoints.py +12 -10
- biolib/biolib_binary_format/utils.py +23 -3
- biolib/cli/auth.py +1 -1
- biolib/cli/data_record.py +45 -6
- biolib/cli/lfs.py +10 -6
- biolib/compute_node/cloud_utils/cloud_utils.py +13 -16
- biolib/compute_node/job_worker/executors/docker_executor.py +127 -108
- biolib/compute_node/job_worker/job_storage.py +17 -5
- biolib/compute_node/job_worker/job_worker.py +25 -15
- biolib/compute_node/remote_host_proxy.py +72 -84
- biolib/compute_node/webserver/webserver_types.py +0 -1
- biolib/compute_node/webserver/worker_thread.py +42 -39
- biolib/experiments/experiment.py +75 -44
- biolib/jobs/job.py +98 -19
- biolib/jobs/job_result.py +46 -21
- biolib/jobs/types.py +1 -1
- biolib/runtime/__init__.py +2 -1
- biolib/sdk/__init__.py +18 -7
- biolib/typing_utils.py +2 -7
- biolib/user/sign_in.py +2 -2
- biolib/utils/seq_util.py +38 -35
- {pybiolib-1.1.1881.dist-info → pybiolib-1.2.7.dev0.dist-info}/METADATA +1 -1
- {pybiolib-1.1.1881.dist-info → pybiolib-1.2.7.dev0.dist-info}/RECORD +57 -45
- biolib/experiments/types.py +0 -9
- biolib/lfs/__init__.py +0 -4
- biolib/lfs/utils.py +0 -153
- /biolib/{lfs → _internal/lfs}/cache.py +0 -0
- {pybiolib-1.1.1881.dist-info → pybiolib-1.2.7.dev0.dist-info}/LICENSE +0 -0
- {pybiolib-1.1.1881.dist-info → pybiolib-1.2.7.dev0.dist-info}/WHEEL +0 -0
- {pybiolib-1.1.1881.dist-info → pybiolib-1.2.7.dev0.dist-info}/entry_points.txt +0 -0
@@ -1,21 +1,22 @@
|
|
1
|
+
import base64
|
1
2
|
import io
|
2
|
-
import tarfile
|
3
3
|
import subprocess
|
4
|
+
import tarfile
|
4
5
|
import time
|
6
|
+
from urllib.parse import urlparse
|
5
7
|
|
6
|
-
from docker.models.containers import Container # type: ignore
|
7
8
|
from docker.errors import ImageNotFound # type: ignore
|
9
|
+
from docker.models.containers import Container # type: ignore
|
8
10
|
from docker.models.images import Image # type: ignore
|
9
11
|
from docker.models.networks import Network # type: ignore
|
10
12
|
|
11
13
|
from biolib import utils
|
12
|
-
from biolib.
|
13
|
-
from biolib.compute_node.cloud_utils import CloudUtils
|
14
|
-
from biolib.typing_utils import Optional, List
|
15
|
-
from biolib.biolib_api_client import RemoteHost
|
14
|
+
from biolib.biolib_api_client import BiolibApiClient, RemoteHost
|
16
15
|
from biolib.biolib_docker_client import BiolibDockerClient
|
16
|
+
from biolib.biolib_errors import BioLibError
|
17
17
|
from biolib.biolib_logging import logger_no_user_data
|
18
|
-
from biolib.
|
18
|
+
from biolib.compute_node.cloud_utils import CloudUtils
|
19
|
+
from biolib.typing_utils import List, Optional
|
19
20
|
|
20
21
|
|
21
22
|
# Prepare for remote hosts with specified port
|
@@ -24,31 +25,23 @@ class RemoteHostExtended(RemoteHost):
|
|
24
25
|
|
25
26
|
|
26
27
|
class RemoteHostProxy:
|
27
|
-
|
28
28
|
def __init__(
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
can_push_data_record_for_user: bool,
|
29
|
+
self,
|
30
|
+
remote_host: RemoteHost,
|
31
|
+
public_network: Network,
|
32
|
+
internal_network: Optional[Network],
|
33
|
+
job_id: str,
|
34
|
+
ports: List[int],
|
36
35
|
):
|
37
|
-
self._can_push_data_record_for_user: bool = can_push_data_record_for_user
|
38
36
|
self.is_app_caller_proxy = remote_host['hostname'] == 'AppCallerProxy'
|
39
|
-
|
40
|
-
# Default to port 443 for now until backend serves remote_hosts with port specified
|
41
|
-
self._remote_host: RemoteHostExtended = RemoteHostExtended(
|
42
|
-
hostname=remote_host['hostname'],
|
43
|
-
ports=ports
|
44
|
-
)
|
37
|
+
self._remote_host: RemoteHostExtended = RemoteHostExtended(hostname=remote_host['hostname'], ports=ports)
|
45
38
|
self._public_network: Network = public_network
|
46
39
|
self._internal_network: Optional[Network] = internal_network
|
47
40
|
|
48
41
|
if not job_id:
|
49
42
|
raise Exception('RemoteHostProxy missing argument "job_id"')
|
50
43
|
|
51
|
-
self._name = f
|
44
|
+
self._name = f'biolib-remote-host-proxy-{job_id}-{self.hostname}'
|
52
45
|
self._job_uuid = job_id
|
53
46
|
self._container: Optional[Container] = None
|
54
47
|
self._enclave_traffic_forwarder_processes: List[subprocess.Popen] = []
|
@@ -154,32 +147,24 @@ class RemoteHostProxy:
|
|
154
147
|
raise Exception('RemoteHostProxy container not defined when attempting to write NGINX config')
|
155
148
|
|
156
149
|
docker = BiolibDockerClient.get_docker_client()
|
157
|
-
|
150
|
+
upstream_hostname = urlparse(BiolibApiClient.get().base_url).hostname
|
158
151
|
if self.is_app_caller_proxy:
|
152
|
+
if not utils.IS_RUNNING_IN_CLOUD:
|
153
|
+
raise BioLibError('Calling apps inside apps is not supported in local compute environment')
|
154
|
+
|
159
155
|
logger_no_user_data.debug(f'Job "{self._job_uuid}" writing config for and starting App Caller Proxy...')
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
if base_url in ('https://biolib.com', 'https://staging-elb.biolib.com'):
|
164
|
-
cloud_base_url = 'https://biolibcloud.com'
|
165
|
-
else:
|
166
|
-
raise BioLibError('Calling apps inside apps is not supported in local compute environment')
|
167
|
-
|
168
|
-
if utils.IS_RUNNING_IN_CLOUD:
|
169
|
-
config = CloudUtils.get_webserver_config()
|
170
|
-
s3_results_bucket_name = config['s3_general_storage_bucket_name']
|
171
|
-
s3_results_base_url = f'https://{s3_results_bucket_name}.s3.amazonaws.com'
|
172
|
-
else:
|
173
|
-
if base_url in ('https://biolib.com', 'https://staging-elb.biolib.com'):
|
174
|
-
s3_results_base_url = 'https://biolib-cloud-api.s3.amazonaws.com'
|
175
|
-
else:
|
176
|
-
raise BioLibError("Calling apps inside apps locally is only supported on biolib.com")
|
156
|
+
config = CloudUtils.get_webserver_config()
|
157
|
+
compute_node_uuid = config['compute_node_info']['public_id']
|
158
|
+
compute_node_auth_token = config['compute_node_info']['auth_token']
|
177
159
|
|
178
160
|
# TODO: Get access_token from new API class instead
|
179
161
|
access_token = BiolibApiClient.get().access_token
|
180
162
|
bearer_token = f'Bearer {access_token}' if access_token else ''
|
181
163
|
|
182
|
-
|
164
|
+
biolib_index_basic_auth = f'compute_node|admin:{compute_node_auth_token},{self._job_uuid}'
|
165
|
+
biolib_index_basic_auth_base64 = base64.b64encode(biolib_index_basic_auth.encode('utf-8')).decode('utf-8')
|
166
|
+
|
167
|
+
nginx_config = f"""
|
183
168
|
events {{
|
184
169
|
worker_connections 1024;
|
185
170
|
}}
|
@@ -196,29 +181,19 @@ http {{
|
|
196
181
|
default "";
|
197
182
|
}}
|
198
183
|
|
199
|
-
map $request_method $bearer_token_on_patch {{
|
200
|
-
PATCH "{bearer_token}";
|
201
|
-
default "";
|
202
|
-
}}
|
203
|
-
|
204
184
|
map $request_method $bearer_token_on_patch_and_get {{
|
205
185
|
PATCH "{bearer_token}";
|
206
186
|
GET "{bearer_token}";
|
207
187
|
default "";
|
208
188
|
}}
|
209
189
|
|
210
|
-
map $request_method $bearer_token_on_post_and_get {{
|
211
|
-
POST "{bearer_token}";
|
212
|
-
GET "{bearer_token}";
|
213
|
-
default "";
|
214
|
-
}}
|
215
|
-
|
216
190
|
server {{
|
217
191
|
listen 80;
|
218
|
-
resolver 127.0.0.11 valid=30s;
|
192
|
+
resolver 127.0.0.11 ipv6=off valid=30s;
|
193
|
+
set $upstream_hostname {upstream_hostname};
|
219
194
|
|
220
195
|
location ~* "^/api/jobs/cloud/(?<job_id>[a-z0-9-]{{36}})/status/$" {{
|
221
|
-
proxy_pass
|
196
|
+
proxy_pass https://$upstream_hostname/api/jobs/cloud/$job_id/status/;
|
222
197
|
proxy_set_header authorization $bearer_token_on_get;
|
223
198
|
proxy_set_header cookie "";
|
224
199
|
proxy_ssl_server_name on;
|
@@ -226,35 +201,35 @@ http {{
|
|
226
201
|
|
227
202
|
location ~* "^/api/jobs/cloud/$" {{
|
228
203
|
# Note: Using $1 here as URI part from regex must be used for proxy_pass
|
229
|
-
proxy_pass
|
204
|
+
proxy_pass https://$upstream_hostname/api/jobs/cloud/$1;
|
230
205
|
proxy_set_header authorization $bearer_token_on_post;
|
231
206
|
proxy_set_header cookie "";
|
232
207
|
proxy_ssl_server_name on;
|
233
208
|
}}
|
234
209
|
|
235
210
|
location ~* "^/api/jobs/(?<job_id>[a-z0-9-]{{36}})/storage/input/start_upload/$" {{
|
236
|
-
proxy_pass
|
211
|
+
proxy_pass https://$upstream_hostname/api/jobs/$job_id/storage/input/start_upload/;
|
237
212
|
proxy_set_header authorization "";
|
238
213
|
proxy_set_header cookie "";
|
239
214
|
proxy_ssl_server_name on;
|
240
215
|
}}
|
241
216
|
|
242
217
|
location ~* "^/api/jobs/(?<job_id>[a-z0-9-]{{36}})/storage/input/presigned_upload_url/$" {{
|
243
|
-
proxy_pass
|
218
|
+
proxy_pass https://$upstream_hostname/api/jobs/$job_id/storage/input/presigned_upload_url/$is_args$args;
|
244
219
|
proxy_set_header authorization "";
|
245
220
|
proxy_set_header cookie "";
|
246
221
|
proxy_ssl_server_name on;
|
247
222
|
}}
|
248
223
|
|
249
224
|
location ~* "^/api/jobs/(?<job_id>[a-z0-9-]{{36}})/storage/input/complete_upload/$" {{
|
250
|
-
proxy_pass
|
225
|
+
proxy_pass https://$upstream_hostname/api/jobs/$job_id/storage/input/complete_upload/;
|
251
226
|
proxy_set_header authorization "";
|
252
227
|
proxy_set_header cookie "";
|
253
228
|
proxy_ssl_server_name on;
|
254
229
|
}}
|
255
230
|
|
256
231
|
location ~* "^/api/jobs/(?<job_id>[a-z0-9-]{{36}})/main_result/$" {{
|
257
|
-
proxy_pass
|
232
|
+
proxy_pass https://$upstream_hostname/api/jobs/$job_id/main_result/;
|
258
233
|
proxy_set_header authorization "";
|
259
234
|
proxy_set_header cookie "";
|
260
235
|
proxy_pass_request_headers on;
|
@@ -262,7 +237,7 @@ http {{
|
|
262
237
|
}}
|
263
238
|
|
264
239
|
location ~* "^/api/jobs/(?<job_id>[a-z0-9-]{{36}})/$" {{
|
265
|
-
proxy_pass
|
240
|
+
proxy_pass https://$upstream_hostname/api/jobs/$job_id/;
|
266
241
|
proxy_set_header authorization $bearer_token_on_patch_and_get;
|
267
242
|
proxy_set_header caller-job-uuid "{self._job_uuid}";
|
268
243
|
proxy_set_header cookie "";
|
@@ -271,7 +246,7 @@ http {{
|
|
271
246
|
|
272
247
|
location ~* "^/api/jobs/create_job_with_data/$" {{
|
273
248
|
# Note: Using $1 here as URI part from regex must be used for proxy_pass
|
274
|
-
proxy_pass
|
249
|
+
proxy_pass https://$upstream_hostname/api/jobs/create_job_with_data/$1;
|
275
250
|
proxy_set_header authorization $bearer_token_on_post;
|
276
251
|
proxy_set_header caller-job-uuid "{self._job_uuid}";
|
277
252
|
proxy_set_header cookie "";
|
@@ -280,78 +255,91 @@ http {{
|
|
280
255
|
|
281
256
|
location ~* "^/api/jobs/$" {{
|
282
257
|
# Note: Using $1 here as URI part from regex must be used for proxy_pass
|
283
|
-
proxy_pass
|
258
|
+
proxy_pass https://$upstream_hostname/api/jobs/$1;
|
284
259
|
proxy_set_header authorization $bearer_token_on_post;
|
285
260
|
proxy_set_header caller-job-uuid "{self._job_uuid}";
|
286
261
|
proxy_set_header cookie "";
|
287
262
|
proxy_ssl_server_name on;
|
288
263
|
}}
|
289
264
|
|
290
|
-
location
|
291
|
-
proxy_pass
|
292
|
-
|
265
|
+
location ~ "^/api/jobs/{self._job_uuid}/notes/$" {{
|
266
|
+
# Note: Using $1 here as URI part from regex must be used for proxy_pass
|
267
|
+
proxy_pass https://$upstream_hostname/api/jobs/{self._job_uuid}/notes/$1;
|
268
|
+
proxy_set_header authorization "";
|
269
|
+
proxy_set_header job-auth-token "";
|
270
|
+
proxy_set_header compute-node-auth-token "{compute_node_auth_token}";
|
271
|
+
proxy_set_header compute-node-uuid "{compute_node_uuid}";
|
293
272
|
proxy_set_header cookie "";
|
294
273
|
proxy_ssl_server_name on;
|
295
274
|
}}
|
296
275
|
|
297
|
-
location /api/ {{
|
298
|
-
proxy_pass
|
276
|
+
location /api/lfs/ {{
|
277
|
+
proxy_pass https://$upstream_hostname$request_uri;
|
299
278
|
proxy_set_header authorization "";
|
279
|
+
proxy_set_header compute-node-auth-token "{compute_node_auth_token}";
|
280
|
+
proxy_set_header job-uuid "{self._job_uuid}";
|
300
281
|
proxy_set_header cookie "";
|
301
282
|
proxy_ssl_server_name on;
|
302
283
|
}}
|
303
284
|
|
304
|
-
location /
|
305
|
-
proxy_pass
|
285
|
+
location /api/app/ {{
|
286
|
+
proxy_pass https://$upstream_hostname$request_uri;
|
306
287
|
proxy_set_header authorization "";
|
288
|
+
proxy_set_header compute-node-auth-token "{compute_node_auth_token}";
|
289
|
+
proxy_set_header job-uuid "{self._job_uuid}";
|
307
290
|
proxy_set_header cookie "";
|
308
291
|
proxy_ssl_server_name on;
|
309
292
|
}}
|
310
293
|
|
311
|
-
location /
|
312
|
-
proxy_pass
|
294
|
+
location /api/ {{
|
295
|
+
proxy_pass https://$upstream_hostname$request_uri;
|
313
296
|
proxy_set_header authorization "";
|
314
297
|
proxy_set_header cookie "";
|
315
298
|
proxy_ssl_server_name on;
|
316
299
|
}}
|
317
300
|
|
318
301
|
location /proxy/storage/job-storage/ {{
|
319
|
-
proxy_pass
|
302
|
+
proxy_pass https://$upstream_hostname$request_uri;
|
320
303
|
proxy_set_header authorization "";
|
321
304
|
proxy_set_header cookie "";
|
322
305
|
proxy_ssl_server_name on;
|
323
306
|
}}
|
324
307
|
|
325
|
-
{f"""
|
326
308
|
location /proxy/storage/lfs/versions/ {{
|
327
|
-
proxy_pass
|
309
|
+
proxy_pass https://$upstream_hostname$request_uri;
|
328
310
|
proxy_set_header authorization "";
|
329
311
|
proxy_set_header cookie "";
|
330
312
|
proxy_ssl_server_name on;
|
331
313
|
}}
|
332
|
-
""" if self._can_push_data_record_for_user else ''}
|
333
314
|
|
334
315
|
location /proxy/cloud/ {{
|
335
|
-
proxy_pass
|
316
|
+
proxy_pass https://$upstream_hostname$request_uri;
|
336
317
|
proxy_set_header authorization "";
|
337
318
|
proxy_set_header cookie "";
|
338
319
|
proxy_ssl_server_name on;
|
339
320
|
}}
|
340
321
|
|
322
|
+
location /proxy/index/ {{
|
323
|
+
proxy_pass https://$upstream_hostname$request_uri;
|
324
|
+
proxy_set_header authorization "Basic {biolib_index_basic_auth_base64}";
|
325
|
+
proxy_set_header cookie "";
|
326
|
+
proxy_ssl_server_name on;
|
327
|
+
}}
|
328
|
+
|
341
329
|
location / {{
|
342
330
|
return 404 "Not found";
|
343
331
|
}}
|
344
332
|
}}
|
345
333
|
}}
|
346
|
-
|
334
|
+
"""
|
347
335
|
else:
|
348
|
-
nginx_config =
|
336
|
+
nginx_config = """
|
349
337
|
events {}
|
350
338
|
error_log /dev/stdout info;
|
351
339
|
stream {
|
352
|
-
resolver 127.0.0.11 valid=30s;
|
340
|
+
resolver 127.0.0.11 valid=30s;"""
|
353
341
|
for idx, upstream_server_port in enumerate(upstream_server_ports):
|
354
|
-
nginx_config += f
|
342
|
+
nginx_config += f"""
|
355
343
|
map "" $upstream_{idx} {{
|
356
344
|
default {upstream_server_name}:{upstream_server_port};
|
357
345
|
}}
|
@@ -364,11 +352,11 @@ stream {
|
|
364
352
|
server {{
|
365
353
|
listen {self._remote_host['ports'][idx]} udp;
|
366
354
|
proxy_pass $upstream_{idx};
|
367
|
-
}}
|
355
|
+
}}"""
|
368
356
|
|
369
|
-
nginx_config +=
|
357
|
+
nginx_config += """
|
370
358
|
}
|
371
|
-
|
359
|
+
"""
|
372
360
|
|
373
361
|
nginx_config_bytes = nginx_config.encode()
|
374
362
|
tarfile_in_memory = io.BytesIO()
|
@@ -2,23 +2,23 @@ import base64
|
|
2
2
|
import os
|
3
3
|
import random
|
4
4
|
import shutil
|
5
|
+
import socket
|
5
6
|
import sys
|
6
|
-
import time
|
7
7
|
import threading
|
8
|
-
import
|
8
|
+
import time
|
9
9
|
from queue import Queue
|
10
10
|
|
11
|
-
from biolib import utils
|
11
|
+
from biolib import api, utils
|
12
|
+
from biolib.biolib_binary_format import ModuleOutputV2, SystemException, SystemStatusUpdate
|
12
13
|
from biolib.biolib_binary_format.utils import LocalFileIndexableBuffer
|
14
|
+
from biolib.biolib_logging import logger, logger_no_user_data
|
13
15
|
from biolib.compute_node.cloud_utils import CloudUtils
|
14
16
|
from biolib.compute_node.job_worker import JobWorkerProcess
|
15
17
|
from biolib.compute_node.job_worker.job_storage import JobStorage
|
16
18
|
from biolib.compute_node.socker_listener_thread import SocketListenerThread
|
17
19
|
from biolib.compute_node.socket_sender_thread import SocketSenderThread
|
20
|
+
from biolib.compute_node.utils import SystemExceptionCodes, WorkerThreadException, get_package_type
|
18
21
|
from biolib.compute_node.webserver import webserver_utils
|
19
|
-
from biolib.biolib_binary_format import SystemStatusUpdate, SystemException, ModuleOutputV2
|
20
|
-
from biolib.compute_node.utils import get_package_type, WorkerThreadException, SystemExceptionCodes
|
21
|
-
from biolib.biolib_logging import logger, logger_no_user_data
|
22
22
|
|
23
23
|
SOCKET_HOST = '127.0.0.1'
|
24
24
|
|
@@ -37,7 +37,7 @@ class WorkerThread(threading.Thread):
|
|
37
37
|
self._sender_thread = None
|
38
38
|
self._start_and_connect_to_compute_process()
|
39
39
|
|
40
|
-
logger.debug(f
|
40
|
+
logger.debug(f'WorkerThread connected to port {self._socket_port}')
|
41
41
|
|
42
42
|
except Exception as exception:
|
43
43
|
logger_no_user_data.error(exception)
|
@@ -79,20 +79,16 @@ class WorkerThread(threading.Thread):
|
|
79
79
|
if progress == 94:
|
80
80
|
# Get Job exit code
|
81
81
|
try:
|
82
|
-
module_output_path = os.path.join(
|
83
|
-
|
84
|
-
|
85
|
-
buffer=LocalFileIndexableBuffer(
|
86
|
-
filename=module_output_path
|
87
|
-
)
|
82
|
+
module_output_path = os.path.join(
|
83
|
+
self._job_temporary_dir,
|
84
|
+
JobStorage.module_output_file_name,
|
88
85
|
)
|
86
|
+
module_output = ModuleOutputV2(buffer=LocalFileIndexableBuffer(filename=module_output_path))
|
89
87
|
self.compute_state['exit_code'] = module_output.get_exit_code()
|
90
88
|
logger_no_user_data.debug(f"Got exit code: {self.compute_state['exit_code']}")
|
91
89
|
|
92
90
|
except Exception as error: # pylint: disable=broad-except
|
93
|
-
logger_no_user_data.error(
|
94
|
-
f'Could not get exit_code from module output due to: {error}'
|
95
|
-
)
|
91
|
+
logger_no_user_data.error(f'Could not get exit_code from module output due to: {error}')
|
96
92
|
|
97
93
|
if utils.IS_RUNNING_IN_CLOUD:
|
98
94
|
JobStorage.upload_module_output(
|
@@ -107,7 +103,7 @@ class WorkerThread(threading.Thread):
|
|
107
103
|
elif package_type == 'SystemException':
|
108
104
|
error_code = SystemException(package).deserialize()
|
109
105
|
self.compute_state['status']['error_code'] = error_code
|
110
|
-
logger.debug(
|
106
|
+
logger.debug('Hit error. Terminating Worker Thread and Compute Process')
|
111
107
|
self.compute_state['progress'] = 95
|
112
108
|
self.terminate()
|
113
109
|
|
@@ -153,10 +149,10 @@ class WorkerThread(threading.Thread):
|
|
153
149
|
|
154
150
|
# Start a thread that accepts connections before starting the process that should connect to the socket
|
155
151
|
logger_no_user_data.debug('Starting connection thread')
|
156
|
-
self._connection_thread = threading.Thread(
|
157
|
-
|
158
|
-
messages_to_send_queue
|
159
|
-
|
152
|
+
self._connection_thread = threading.Thread(
|
153
|
+
target=self._accept_new_socket_connection,
|
154
|
+
args=[received_messages_queue, messages_to_send_queue],
|
155
|
+
)
|
160
156
|
self._connection_thread.start()
|
161
157
|
logger_no_user_data.debug('Started connection thread')
|
162
158
|
logger_no_user_data.debug('Starting compute process')
|
@@ -177,6 +173,16 @@ class WorkerThread(threading.Thread):
|
|
177
173
|
self._sender_thread.start()
|
178
174
|
|
179
175
|
def terminate(self) -> None:
|
176
|
+
cloud_job_uuid = self.compute_state['cloud_job_id']
|
177
|
+
exit_code = self.compute_state.get('exit_code')
|
178
|
+
system_exception_code = self.compute_state['status'].get('error_code')
|
179
|
+
if utils.IS_RUNNING_IN_CLOUD:
|
180
|
+
CloudUtils.finish_cloud_job(
|
181
|
+
cloud_job_id=cloud_job_uuid,
|
182
|
+
system_exception_code=system_exception_code,
|
183
|
+
exit_code=exit_code,
|
184
|
+
)
|
185
|
+
|
180
186
|
deregistered_due_to_error = False
|
181
187
|
if self._job_worker_process:
|
182
188
|
logger_no_user_data.debug(
|
@@ -184,7 +190,8 @@ class WorkerThread(threading.Thread):
|
|
184
190
|
)
|
185
191
|
self._job_worker_process.terminate()
|
186
192
|
|
187
|
-
|
193
|
+
clean_up_timeout_in_seconds = 600
|
194
|
+
for _ in range(clean_up_timeout_in_seconds):
|
188
195
|
if self._job_worker_process.exitcode is not None:
|
189
196
|
logger_no_user_data.debug(
|
190
197
|
f'Job "{self._job_uuid}" worker process exitcode {self._job_worker_process.exitcode}'
|
@@ -196,28 +203,18 @@ class WorkerThread(threading.Thread):
|
|
196
203
|
|
197
204
|
if self._job_worker_process.exitcode is None:
|
198
205
|
# TODO: Figure out if more error handling is necessary here
|
199
|
-
logger_no_user_data.error(
|
206
|
+
logger_no_user_data.error(
|
207
|
+
f'Job {self._job_uuid} worker process did not exit within {clean_up_timeout_in_seconds} seconds'
|
208
|
+
)
|
200
209
|
if utils.IS_RUNNING_IN_CLOUD:
|
201
210
|
logger_no_user_data.error('Deregistering compute node...')
|
202
211
|
CloudUtils.deregister(error='job_cleanup_timed_out')
|
203
212
|
deregistered_due_to_error = True
|
204
213
|
|
205
214
|
# Delete result as error occurred
|
206
|
-
system_exception_code = self.compute_state['status'].get('error_code')
|
207
215
|
if system_exception_code and os.path.exists(self._job_temporary_dir):
|
208
216
|
shutil.rmtree(self._job_temporary_dir)
|
209
217
|
|
210
|
-
exit_code = self.compute_state.get('exit_code')
|
211
|
-
|
212
|
-
if utils.IS_RUNNING_IN_CLOUD:
|
213
|
-
# Get and send compute node exception code and job exit code if present
|
214
|
-
logger_no_user_data.debug(f"Sending exit code {exit_code}")
|
215
|
-
CloudUtils.finish_cloud_job(
|
216
|
-
cloud_job_id=self.compute_state['cloud_job_id'],
|
217
|
-
system_exception_code=system_exception_code,
|
218
|
-
exit_code=exit_code
|
219
|
-
)
|
220
|
-
|
221
218
|
if self._socket:
|
222
219
|
self._socket.close()
|
223
220
|
|
@@ -225,7 +222,7 @@ class WorkerThread(threading.Thread):
|
|
225
222
|
self._connection.close()
|
226
223
|
|
227
224
|
if self.compute_state['progress'] == 95:
|
228
|
-
seconds_to_sleep =
|
225
|
+
seconds_to_sleep = 5
|
229
226
|
logger_no_user_data.debug(
|
230
227
|
f'Job "{self._job_uuid}" worker thread sleeping for {seconds_to_sleep} seconds before cleaning up'
|
231
228
|
)
|
@@ -234,7 +231,7 @@ class WorkerThread(threading.Thread):
|
|
234
231
|
|
235
232
|
compute_state_dict = webserver_utils.JOB_ID_TO_COMPUTE_STATE_DICT
|
236
233
|
if self._job_uuid in compute_state_dict:
|
237
|
-
# Delete result as user has not started download
|
234
|
+
# Delete result as user has not started download
|
238
235
|
if compute_state_dict[self._job_uuid]['progress'] == 95 and os.path.exists(self._job_temporary_dir):
|
239
236
|
shutil.rmtree(self._job_temporary_dir)
|
240
237
|
|
@@ -245,12 +242,18 @@ class WorkerThread(threading.Thread):
|
|
245
242
|
f'Job "{self._job_uuid}" could not be found, maybe it has already been cleaned up'
|
246
243
|
)
|
247
244
|
|
248
|
-
logger_no_user_data.debug(f'Job "{self._job_uuid}" worker thread terminated')
|
249
|
-
|
250
245
|
if utils.IS_RUNNING_IN_CLOUD:
|
246
|
+
config = CloudUtils.get_webserver_config()
|
247
|
+
logger_no_user_data.debug(f'Job "{self._job_uuid}" reporting CloudJob "{cloud_job_uuid}" as cleaned up...')
|
248
|
+
api.client.post(
|
249
|
+
path=f'/internal/compute-nodes/cloud-jobs/{cloud_job_uuid}/cleaned-up/',
|
250
|
+
headers={'Compute-Node-Auth-Token': config['compute_node_info']['auth_token']},
|
251
|
+
)
|
252
|
+
|
251
253
|
if deregistered_due_to_error:
|
252
254
|
CloudUtils.shutdown() # shutdown now
|
253
255
|
else:
|
254
256
|
webserver_utils.update_auto_shutdown_time()
|
255
257
|
|
258
|
+
logger_no_user_data.debug(f'Job "{self._job_uuid}" worker thread exiting...')
|
256
259
|
sys.exit()
|