fractal-server 2.2.0a1__py3-none-any.whl → 2.3.0a0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fractal_server/__init__.py +1 -1
- fractal_server/app/models/v1/state.py +1 -2
- fractal_server/app/routes/admin/v1.py +2 -2
- fractal_server/app/routes/admin/v2.py +2 -2
- fractal_server/app/routes/api/v1/job.py +2 -2
- fractal_server/app/routes/api/v1/task_collection.py +4 -4
- fractal_server/app/routes/api/v2/__init__.py +23 -3
- fractal_server/app/routes/api/v2/job.py +2 -2
- fractal_server/app/routes/api/v2/submit.py +6 -0
- fractal_server/app/routes/api/v2/task_collection.py +74 -34
- fractal_server/app/routes/api/v2/task_collection_custom.py +144 -0
- fractal_server/app/routes/api/v2/task_collection_ssh.py +125 -0
- fractal_server/app/routes/aux/_runner.py +10 -2
- fractal_server/app/runner/compress_folder.py +120 -0
- fractal_server/app/runner/executors/slurm/__init__.py +0 -3
- fractal_server/app/runner/executors/slurm/_batching.py +0 -1
- fractal_server/app/runner/executors/slurm/_slurm_config.py +9 -9
- fractal_server/app/runner/executors/slurm/ssh/__init__.py +3 -0
- fractal_server/app/runner/executors/slurm/ssh/_executor_wait_thread.py +112 -0
- fractal_server/app/runner/executors/slurm/ssh/_slurm_job.py +120 -0
- fractal_server/app/runner/executors/slurm/ssh/executor.py +1490 -0
- fractal_server/app/runner/executors/slurm/sudo/__init__.py +3 -0
- fractal_server/app/runner/executors/slurm/{_check_jobs_status.py → sudo/_check_jobs_status.py} +1 -1
- fractal_server/app/runner/executors/slurm/{_executor_wait_thread.py → sudo/_executor_wait_thread.py} +1 -1
- fractal_server/app/runner/executors/slurm/{_subprocess_run_as_user.py → sudo/_subprocess_run_as_user.py} +1 -1
- fractal_server/app/runner/executors/slurm/{executor.py → sudo/executor.py} +12 -12
- fractal_server/app/runner/extract_archive.py +38 -0
- fractal_server/app/runner/v1/__init__.py +78 -40
- fractal_server/app/runner/v1/_slurm/__init__.py +1 -1
- fractal_server/app/runner/v2/__init__.py +147 -62
- fractal_server/app/runner/v2/_local_experimental/__init__.py +22 -12
- fractal_server/app/runner/v2/_local_experimental/executor.py +12 -8
- fractal_server/app/runner/v2/_slurm/__init__.py +1 -6
- fractal_server/app/runner/v2/_slurm_ssh/__init__.py +126 -0
- fractal_server/app/runner/v2/_slurm_ssh/_submit_setup.py +83 -0
- fractal_server/app/runner/v2/_slurm_ssh/get_slurm_config.py +182 -0
- fractal_server/app/runner/v2/runner_functions_low_level.py +9 -11
- fractal_server/app/runner/versions.py +30 -0
- fractal_server/app/schemas/v1/__init__.py +1 -0
- fractal_server/app/schemas/{state.py → v1/state.py} +4 -21
- fractal_server/app/schemas/v2/__init__.py +4 -1
- fractal_server/app/schemas/v2/task_collection.py +97 -27
- fractal_server/config.py +184 -3
- fractal_server/main.py +25 -1
- fractal_server/ssh/__init__.py +4 -0
- fractal_server/ssh/_fabric.py +190 -0
- fractal_server/tasks/utils.py +12 -64
- fractal_server/tasks/v1/background_operations.py +2 -2
- fractal_server/tasks/{endpoint_operations.py → v1/endpoint_operations.py} +7 -12
- fractal_server/tasks/v1/utils.py +67 -0
- fractal_server/tasks/v2/_TaskCollectPip.py +61 -32
- fractal_server/tasks/v2/_venv_pip.py +195 -0
- fractal_server/tasks/v2/background_operations.py +257 -295
- fractal_server/tasks/v2/background_operations_ssh.py +304 -0
- fractal_server/tasks/v2/endpoint_operations.py +136 -0
- fractal_server/tasks/v2/templates/_1_create_venv.sh +46 -0
- fractal_server/tasks/v2/templates/_2_upgrade_pip.sh +30 -0
- fractal_server/tasks/v2/templates/_3_pip_install.sh +32 -0
- fractal_server/tasks/v2/templates/_4_pip_freeze.sh +21 -0
- fractal_server/tasks/v2/templates/_5_pip_show.sh +59 -0
- fractal_server/tasks/v2/utils.py +54 -0
- {fractal_server-2.2.0a1.dist-info → fractal_server-2.3.0a0.dist-info}/METADATA +4 -2
- {fractal_server-2.2.0a1.dist-info → fractal_server-2.3.0a0.dist-info}/RECORD +66 -42
- fractal_server/tasks/v2/get_collection_data.py +0 -14
- {fractal_server-2.2.0a1.dist-info → fractal_server-2.3.0a0.dist-info}/LICENSE +0 -0
- {fractal_server-2.2.0a1.dist-info → fractal_server-2.3.0a0.dist-info}/WHEEL +0 -0
- {fractal_server-2.2.0a1.dist-info → fractal_server-2.3.0a0.dist-info}/entry_points.txt +0 -0
@@ -1,14 +1,26 @@
|
|
1
|
+
from datetime import datetime
|
2
|
+
from enum import Enum
|
1
3
|
from pathlib import Path
|
4
|
+
from typing import Any
|
2
5
|
from typing import Literal
|
3
6
|
from typing import Optional
|
4
7
|
|
5
8
|
from pydantic import BaseModel
|
6
|
-
from pydantic import
|
9
|
+
from pydantic import root_validator
|
7
10
|
from pydantic import validator
|
8
11
|
|
9
12
|
from .._validators import valdictkeys
|
10
13
|
from .._validators import valstr
|
11
|
-
from .
|
14
|
+
from fractal_server.app.schemas._validators import valutc
|
15
|
+
from fractal_server.app.schemas.v2 import ManifestV2
|
16
|
+
|
17
|
+
|
18
|
+
class CollectionStatusV2(str, Enum):
|
19
|
+
PENDING = "pending"
|
20
|
+
INSTALLING = "installing"
|
21
|
+
COLLECTING = "collecting"
|
22
|
+
FAIL = "fail"
|
23
|
+
OK = "OK"
|
12
24
|
|
13
25
|
|
14
26
|
class TaskCollectPipV2(BaseModel):
|
@@ -41,7 +53,7 @@ class TaskCollectPipV2(BaseModel):
|
|
41
53
|
package: str
|
42
54
|
package_version: Optional[str] = None
|
43
55
|
package_extras: Optional[str] = None
|
44
|
-
python_version: Optional[
|
56
|
+
python_version: Optional[Literal["3.9", "3.10", "3.11", "3.12"]] = None
|
45
57
|
pinned_package_versions: Optional[dict[str, str]] = None
|
46
58
|
|
47
59
|
_pinned_package_versions = validator(
|
@@ -70,40 +82,98 @@ class TaskCollectPipV2(BaseModel):
|
|
70
82
|
|
71
83
|
@validator("package_version")
|
72
84
|
def package_version_validator(cls, v, values):
|
73
|
-
|
74
|
-
valstr("package_version")(v)
|
75
|
-
|
85
|
+
v = valstr("package_version")(v)
|
76
86
|
if values["package"].endswith(".whl"):
|
77
87
|
raise ValueError(
|
78
|
-
"Cannot provide version when package is a
|
88
|
+
"Cannot provide package version when package is a wheel file."
|
79
89
|
)
|
80
90
|
return v
|
81
91
|
|
82
92
|
|
83
|
-
class
|
93
|
+
class TaskCollectCustomV2(BaseModel):
|
84
94
|
"""
|
85
|
-
TaskCollectStatus class
|
86
|
-
|
87
95
|
Attributes:
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
96
|
+
manifest: Manifest of a Fractal task package (this is typically the
|
97
|
+
content of `__FRACTAL_MANIFEST__.json`).
|
98
|
+
python_interpreter: Absolute path to the Python interpreter to be used
|
99
|
+
for running tasks.
|
100
|
+
source: A common label identifying this package.
|
101
|
+
package_root: The folder where the package is installed.
|
102
|
+
If not provided, it will be extracted via `pip show`
|
103
|
+
(requires `package_name` to be set).
|
104
|
+
package_name: Name of the package, as used for `import <package_name>`;
|
105
|
+
this is then used to extract the package directory (`package_root`)
|
106
|
+
via `pip show <package_name>`.
|
107
|
+
version: Optional version of tasks to be collected.
|
94
108
|
"""
|
95
109
|
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
110
|
+
manifest: ManifestV2
|
111
|
+
python_interpreter: str
|
112
|
+
source: str
|
113
|
+
package_root: Optional[str]
|
114
|
+
package_name: Optional[str]
|
115
|
+
version: Optional[str]
|
116
|
+
|
117
|
+
@root_validator(pre=True)
|
118
|
+
def one_of_package_root_or_name(cls, values):
|
119
|
+
package_root = values["package_root"]
|
120
|
+
package_name = values["package_name"]
|
121
|
+
if (package_root is None and package_name is None) or (
|
122
|
+
package_root is not None and package_name is not None
|
123
|
+
):
|
124
|
+
raise ValueError(
|
125
|
+
"One and only one must be set between "
|
126
|
+
"'package_root' and 'package_name'"
|
127
|
+
)
|
128
|
+
return values
|
102
129
|
|
103
|
-
|
130
|
+
@validator("package_name")
|
131
|
+
def package_name_prevent_injection(cls, value: str):
|
104
132
|
"""
|
105
|
-
|
133
|
+
Remove all whitespace characters, and reject values containing `;`.
|
106
134
|
"""
|
107
|
-
|
108
|
-
|
109
|
-
|
135
|
+
if value is not None:
|
136
|
+
if ";" in value:
|
137
|
+
raise ValueError(f"Invalid package_name: {value}")
|
138
|
+
value = value.replace(" ", "")
|
139
|
+
return value
|
140
|
+
|
141
|
+
@validator("package_root")
|
142
|
+
def package_root_validator(cls, value):
|
143
|
+
if (value is not None) and (not Path(value).is_absolute()):
|
144
|
+
raise ValueError(
|
145
|
+
f"'package_root' must be an absolute path: (given {value})."
|
146
|
+
)
|
147
|
+
return value
|
148
|
+
|
149
|
+
@validator("python_interpreter")
|
150
|
+
def python_interpreter_validator(cls, value):
|
151
|
+
if not Path(value).is_absolute():
|
152
|
+
raise ValueError(
|
153
|
+
f"Python interpreter path must be absolute: (given {value})."
|
154
|
+
)
|
155
|
+
return value
|
156
|
+
|
157
|
+
# Valstr
|
158
|
+
_python_interpreter = validator("python_interpreter", allow_reuse=True)(
|
159
|
+
valstr("python_interpreter")
|
160
|
+
)
|
161
|
+
_source = validator("source", allow_reuse=True)(valstr("source"))
|
162
|
+
_package_root = validator("package_root", allow_reuse=True)(
|
163
|
+
valstr("package_root", accept_none=True)
|
164
|
+
)
|
165
|
+
_package_name = validator("package_name", allow_reuse=True)(
|
166
|
+
valstr("package_name", accept_none=True)
|
167
|
+
)
|
168
|
+
_version = validator("version", allow_reuse=True)(
|
169
|
+
valstr("version", accept_none=True)
|
170
|
+
)
|
171
|
+
|
172
|
+
|
173
|
+
class CollectionStateReadV2(BaseModel):
|
174
|
+
|
175
|
+
id: Optional[int]
|
176
|
+
data: dict[str, Any]
|
177
|
+
timestamp: datetime
|
178
|
+
|
179
|
+
_timestamp = validator("timestamp", allow_reuse=True)(valutc("timestamp"))
|
fractal_server/config.py
CHANGED
@@ -13,6 +13,7 @@
|
|
13
13
|
# Zurich.
|
14
14
|
import logging
|
15
15
|
import shutil
|
16
|
+
import sys
|
16
17
|
from os import environ
|
17
18
|
from os import getenv
|
18
19
|
from os.path import abspath
|
@@ -323,7 +324,10 @@ class Settings(BaseSettings):
|
|
323
324
|
return FRACTAL_RUNNER_WORKING_BASE_DIR_path
|
324
325
|
|
325
326
|
FRACTAL_RUNNER_BACKEND: Literal[
|
326
|
-
"local",
|
327
|
+
"local",
|
328
|
+
"local_experimental",
|
329
|
+
"slurm",
|
330
|
+
"slurm_ssh",
|
327
331
|
] = "local"
|
328
332
|
"""
|
329
333
|
Select which runner backend to use.
|
@@ -366,10 +370,126 @@ class Settings(BaseSettings):
|
|
366
370
|
|
367
371
|
FRACTAL_SLURM_WORKER_PYTHON: Optional[str] = None
|
368
372
|
"""
|
369
|
-
|
370
|
-
not specified, the same interpreter that runs the server is used.
|
373
|
+
Absolute path to Python interpreter that will run the jobs on the SLURM
|
374
|
+
nodes. If not specified, the same interpreter that runs the server is used.
|
371
375
|
"""
|
372
376
|
|
377
|
+
@validator("FRACTAL_SLURM_WORKER_PYTHON", always=True)
|
378
|
+
def absolute_FRACTAL_SLURM_WORKER_PYTHON(cls, v):
|
379
|
+
"""
|
380
|
+
If `FRACTAL_SLURM_WORKER_PYTHON` is a relative path, fail.
|
381
|
+
"""
|
382
|
+
if v is None:
|
383
|
+
return None
|
384
|
+
elif not Path(v).is_absolute():
|
385
|
+
raise FractalConfigurationError(
|
386
|
+
f"Non-absolute value for FRACTAL_SLURM_WORKER_PYTHON={v}"
|
387
|
+
)
|
388
|
+
else:
|
389
|
+
return v
|
390
|
+
|
391
|
+
FRACTAL_TASKS_PYTHON_DEFAULT_VERSION: Optional[
|
392
|
+
Literal["3.9", "3.10", "3.11", "3.12"]
|
393
|
+
] = None
|
394
|
+
"""
|
395
|
+
Default Python version to be used for task collection. Defaults to the
|
396
|
+
current version. Requires the corresponding variable (e.g
|
397
|
+
`FRACTAL_TASKS_PYTHON_3_10`) to be set.
|
398
|
+
"""
|
399
|
+
|
400
|
+
FRACTAL_TASKS_PYTHON_3_9: Optional[str] = None
|
401
|
+
"""
|
402
|
+
Absolute path to the Python 3.9 interpreter that serves as base for virtual
|
403
|
+
environments tasks. Note that this interpreter must have the `venv` module
|
404
|
+
installed. If set, this must be an absolute path. If the version specified
|
405
|
+
in `FRACTAL_TASKS_PYTHON_DEFAULT_VERSION` is `"3.9"` and this attribute is
|
406
|
+
unset, `sys.executable` is used as a default.
|
407
|
+
"""
|
408
|
+
|
409
|
+
FRACTAL_TASKS_PYTHON_3_10: Optional[str] = None
|
410
|
+
"""
|
411
|
+
Same as `FRACTAL_TASKS_PYTHON_3_9`, for Python 3.10.
|
412
|
+
"""
|
413
|
+
|
414
|
+
FRACTAL_TASKS_PYTHON_3_11: Optional[str] = None
|
415
|
+
"""
|
416
|
+
Same as `FRACTAL_TASKS_PYTHON_3_9`, for Python 3.11.
|
417
|
+
"""
|
418
|
+
|
419
|
+
FRACTAL_TASKS_PYTHON_3_12: Optional[str] = None
|
420
|
+
"""
|
421
|
+
Same as `FRACTAL_TASKS_PYTHON_3_9`, for Python 3.12.
|
422
|
+
"""
|
423
|
+
|
424
|
+
@root_validator(pre=True)
|
425
|
+
def check_tasks_python(cls, values) -> None:
|
426
|
+
"""
|
427
|
+
Perform multiple checks of the Python-intepreter variables.
|
428
|
+
|
429
|
+
1. Each `FRACTAL_TASKS_PYTHON_X_Y` variable must be an absolute path,
|
430
|
+
if set.
|
431
|
+
2. If `FRACTAL_TASKS_PYTHON_DEFAULT_VERSION` is unset, use
|
432
|
+
`sys.executable` and set the corresponding
|
433
|
+
`FRACTAL_TASKS_PYTHON_X_Y` (and unset all others).
|
434
|
+
"""
|
435
|
+
|
436
|
+
# `FRACTAL_TASKS_PYTHON_X_Y` variables can only be absolute paths
|
437
|
+
for version in ["3_9", "3_10", "3_11", "3_12"]:
|
438
|
+
key = f"FRACTAL_TASKS_PYTHON_{version}"
|
439
|
+
value = values.get(key)
|
440
|
+
if value is not None and not Path(value).is_absolute():
|
441
|
+
raise FractalConfigurationError(
|
442
|
+
f"Non-absolute value {key}={value}"
|
443
|
+
)
|
444
|
+
|
445
|
+
default_version = values.get("FRACTAL_TASKS_PYTHON_DEFAULT_VERSION")
|
446
|
+
|
447
|
+
if default_version is not None:
|
448
|
+
# "production/slurm" branch
|
449
|
+
# If a default version is set, then the corresponding interpreter
|
450
|
+
# must also be set
|
451
|
+
default_version_undescore = default_version.replace(".", "_")
|
452
|
+
key = f"FRACTAL_TASKS_PYTHON_{default_version_undescore}"
|
453
|
+
value = values.get(key)
|
454
|
+
if value is None:
|
455
|
+
msg = (
|
456
|
+
f"FRACTAL_TASKS_PYTHON_DEFAULT_VERSION={default_version} "
|
457
|
+
f"but {key}={value}."
|
458
|
+
)
|
459
|
+
logging.error(msg)
|
460
|
+
raise FractalConfigurationError(msg)
|
461
|
+
|
462
|
+
else:
|
463
|
+
# If no default version is set, then only `sys.executable` is made
|
464
|
+
# available
|
465
|
+
_info = sys.version_info
|
466
|
+
current_version = f"{_info.major}_{_info.minor}"
|
467
|
+
current_version_dot = f"{_info.major}.{_info.minor}"
|
468
|
+
values[
|
469
|
+
"FRACTAL_TASKS_PYTHON_DEFAULT_VERSION"
|
470
|
+
] = current_version_dot
|
471
|
+
logging.info(
|
472
|
+
"Setting FRACTAL_TASKS_PYTHON_DEFAULT_VERSION to "
|
473
|
+
f"{current_version_dot}"
|
474
|
+
)
|
475
|
+
|
476
|
+
# Unset all existing intepreters variable
|
477
|
+
for _version in ["3_9", "3_10", "3_11", "3_12"]:
|
478
|
+
key = f"FRACTAL_TASKS_PYTHON_{_version}"
|
479
|
+
if _version == current_version:
|
480
|
+
values[key] = sys.executable
|
481
|
+
logging.info(f"Setting {key} to {sys.executable}.")
|
482
|
+
else:
|
483
|
+
value = values.get(key)
|
484
|
+
if value is not None:
|
485
|
+
logging.info(
|
486
|
+
f"Setting {key} to None (given: {value}), "
|
487
|
+
"because FRACTAL_TASKS_PYTHON_DEFAULT_VERSION was "
|
488
|
+
"not set."
|
489
|
+
)
|
490
|
+
values[key] = None
|
491
|
+
return values
|
492
|
+
|
373
493
|
FRACTAL_SLURM_POLL_INTERVAL: int = 5
|
374
494
|
"""
|
375
495
|
Interval to wait (in seconds) before checking whether unfinished job are
|
@@ -392,6 +512,25 @@ class Settings(BaseSettings):
|
|
392
512
|
`JobExecutionError`.
|
393
513
|
"""
|
394
514
|
|
515
|
+
FRACTAL_SLURM_SSH_HOST: Optional[str] = None
|
516
|
+
"""
|
517
|
+
SSH-reachable host where a SLURM client is available.
|
518
|
+
"""
|
519
|
+
FRACTAL_SLURM_SSH_USER: Optional[str] = None
|
520
|
+
"""
|
521
|
+
User on `FRACTAL_SLURM_SSH_HOST`.
|
522
|
+
"""
|
523
|
+
FRACTAL_SLURM_SSH_PRIVATE_KEY_PATH: Optional[str] = None
|
524
|
+
"""
|
525
|
+
Private key for connecting to `FRACTAL_SLURM_SSH_HOST` as
|
526
|
+
`FRACTAL_SLURM_SSH_USER`.
|
527
|
+
"""
|
528
|
+
# FIXME SSH: Split this into two folders (for tasks and for jobs)
|
529
|
+
FRACTAL_SLURM_SSH_WORKING_BASE_DIR: Optional[str] = None
|
530
|
+
"""
|
531
|
+
Remote folder on `FRACTAL_SLURM_SSH_HOST`.
|
532
|
+
"""
|
533
|
+
|
395
534
|
FRACTAL_API_SUBMIT_RATE_LIMIT: int = 2
|
396
535
|
"""
|
397
536
|
Interval to wait (in seconds) to be allowed to call again
|
@@ -480,6 +619,48 @@ class Settings(BaseSettings):
|
|
480
619
|
raise FractalConfigurationError(
|
481
620
|
f"{info} but `squeue` command not found."
|
482
621
|
)
|
622
|
+
elif self.FRACTAL_RUNNER_BACKEND == "slurm_ssh":
|
623
|
+
if self.FRACTAL_SLURM_WORKER_PYTHON is None:
|
624
|
+
raise FractalConfigurationError(
|
625
|
+
f"Must set FRACTAL_SLURM_WORKER_PYTHON when {info}"
|
626
|
+
)
|
627
|
+
if self.FRACTAL_SLURM_SSH_USER is None:
|
628
|
+
raise FractalConfigurationError(
|
629
|
+
f"Must set FRACTAL_SLURM_SSH_USER when {info}"
|
630
|
+
)
|
631
|
+
if self.FRACTAL_SLURM_SSH_HOST is None:
|
632
|
+
raise FractalConfigurationError(
|
633
|
+
f"Must set FRACTAL_SLURM_SSH_HOST when {info}"
|
634
|
+
)
|
635
|
+
if self.FRACTAL_SLURM_SSH_PRIVATE_KEY_PATH is None:
|
636
|
+
raise FractalConfigurationError(
|
637
|
+
f"Must set FRACTAL_SLURM_SSH_PRIVATE_KEY_PATH when {info}"
|
638
|
+
)
|
639
|
+
if self.FRACTAL_SLURM_SSH_WORKING_BASE_DIR is None:
|
640
|
+
raise FractalConfigurationError(
|
641
|
+
f"Must set FRACTAL_SLURM_SSH_WORKING_BASE_DIR when {info}"
|
642
|
+
)
|
643
|
+
|
644
|
+
from fractal_server.app.runner.executors.slurm._slurm_config import ( # noqa: E501
|
645
|
+
load_slurm_config_file,
|
646
|
+
)
|
647
|
+
|
648
|
+
if not self.FRACTAL_SLURM_CONFIG_FILE:
|
649
|
+
raise FractalConfigurationError(
|
650
|
+
f"Must set FRACTAL_SLURM_CONFIG_FILE when {info}"
|
651
|
+
)
|
652
|
+
else:
|
653
|
+
if not self.FRACTAL_SLURM_CONFIG_FILE.exists():
|
654
|
+
raise FractalConfigurationError(
|
655
|
+
f"{info} but FRACTAL_SLURM_CONFIG_FILE="
|
656
|
+
f"{self.FRACTAL_SLURM_CONFIG_FILE} not found."
|
657
|
+
)
|
658
|
+
|
659
|
+
load_slurm_config_file(self.FRACTAL_SLURM_CONFIG_FILE)
|
660
|
+
if not shutil.which("ssh"):
|
661
|
+
raise FractalConfigurationError(
|
662
|
+
f"{info} but `ssh` command not found."
|
663
|
+
)
|
483
664
|
else: # i.e. self.FRACTAL_RUNNER_BACKEND == "local"
|
484
665
|
if self.FRACTAL_LOCAL_CONFIG_FILE:
|
485
666
|
if not self.FRACTAL_LOCAL_CONFIG_FILE.exists():
|
fractal_server/main.py
CHANGED
@@ -20,6 +20,7 @@ from contextlib import asynccontextmanager
|
|
20
20
|
|
21
21
|
from fastapi import FastAPI
|
22
22
|
|
23
|
+
from .app.routes.aux._runner import _backend_supports_shutdown # FIXME: change
|
23
24
|
from .app.runner.shutdown import cleanup_after_shutdown
|
24
25
|
from .app.security import _create_first_user
|
25
26
|
from .config import get_settings
|
@@ -97,17 +98,38 @@ async def lifespan(app: FastAPI):
|
|
97
98
|
is_superuser=True,
|
98
99
|
is_verified=True,
|
99
100
|
)
|
101
|
+
|
102
|
+
if settings.FRACTAL_RUNNER_BACKEND == "slurm_ssh":
|
103
|
+
from fractal_server.ssh._fabric import get_ssh_connection
|
104
|
+
|
105
|
+
app.state.connection = get_ssh_connection()
|
106
|
+
logger.info(
|
107
|
+
f"Created SSH connection "
|
108
|
+
f"({app.state.connection.is_connected=})."
|
109
|
+
)
|
110
|
+
else:
|
111
|
+
app.state.connection = None
|
112
|
+
|
100
113
|
config_uvicorn_loggers()
|
101
114
|
logger.info("End application startup")
|
102
115
|
reset_logger_handlers(logger)
|
103
116
|
yield
|
104
117
|
logger = get_logger("fractal_server.lifespan")
|
105
118
|
logger.info("Start application shutdown")
|
119
|
+
|
120
|
+
if settings.FRACTAL_RUNNER_BACKEND == "slurm_ssh":
|
121
|
+
logger.info(
|
122
|
+
f"Closing SSH connection "
|
123
|
+
f"(current: {app.state.connection.is_connected=})."
|
124
|
+
)
|
125
|
+
|
126
|
+
app.state.connection.close()
|
127
|
+
|
106
128
|
logger.info(
|
107
129
|
f"Current worker with pid {os.getpid()} is shutting down. "
|
108
130
|
f"Current jobs: {app.state.jobsV1=}, {app.state.jobsV2=}"
|
109
131
|
)
|
110
|
-
if settings.FRACTAL_RUNNER_BACKEND
|
132
|
+
if _backend_supports_shutdown(settings.FRACTAL_RUNNER_BACKEND):
|
111
133
|
try:
|
112
134
|
await cleanup_after_shutdown(
|
113
135
|
jobsV1=app.state.jobsV1,
|
@@ -120,6 +142,8 @@ async def lifespan(app: FastAPI):
|
|
120
142
|
"some of running jobs are not shutdown properly. "
|
121
143
|
f"Original error: {e}"
|
122
144
|
)
|
145
|
+
else:
|
146
|
+
logger.info("Shutdown not available for this backend runner.")
|
123
147
|
|
124
148
|
logger.info("End application shutdown")
|
125
149
|
reset_logger_handlers(logger)
|
@@ -0,0 +1,190 @@
|
|
1
|
+
import time
|
2
|
+
from typing import Optional
|
3
|
+
|
4
|
+
from fabric import Connection
|
5
|
+
from invoke import UnexpectedExit
|
6
|
+
from paramiko.ssh_exception import NoValidConnectionsError
|
7
|
+
|
8
|
+
from ..logger import get_logger
|
9
|
+
from ..logger import set_logger
|
10
|
+
from fractal_server.config import get_settings
|
11
|
+
from fractal_server.syringe import Inject
|
12
|
+
|
13
|
+
logger = set_logger(__name__)
|
14
|
+
|
15
|
+
MAX_ATTEMPTS = 5
|
16
|
+
|
17
|
+
|
18
|
+
def get_ssh_connection(
|
19
|
+
*,
|
20
|
+
host: Optional[str] = None,
|
21
|
+
user: Optional[str] = None,
|
22
|
+
key_filename: Optional[str] = None,
|
23
|
+
) -> Connection:
|
24
|
+
"""
|
25
|
+
Create a `fabric.Connection` object based on fractal-server settings
|
26
|
+
or explicit arguments.
|
27
|
+
|
28
|
+
Args:
|
29
|
+
host:
|
30
|
+
user:
|
31
|
+
key_filename:
|
32
|
+
|
33
|
+
Returns:
|
34
|
+
Fabric connection object
|
35
|
+
"""
|
36
|
+
settings = Inject(get_settings)
|
37
|
+
if host is None:
|
38
|
+
host = settings.FRACTAL_SLURM_SSH_HOST
|
39
|
+
if user is None:
|
40
|
+
user = settings.FRACTAL_SLURM_SSH_USER
|
41
|
+
if key_filename is None:
|
42
|
+
key_filename = settings.FRACTAL_SLURM_SSH_PRIVATE_KEY_PATH
|
43
|
+
|
44
|
+
connection = Connection(
|
45
|
+
host=host,
|
46
|
+
user=user,
|
47
|
+
connect_kwargs={"key_filename": key_filename},
|
48
|
+
)
|
49
|
+
logger.debug(f"Now created {connection=}.")
|
50
|
+
return connection
|
51
|
+
|
52
|
+
|
53
|
+
def check_connection(connection: Connection) -> None:
|
54
|
+
"""
|
55
|
+
Open the SSH connection and handle exceptions.
|
56
|
+
|
57
|
+
This function can be called from within other functions that use
|
58
|
+
`connection`, so that we can provide a meaningful error in case the
|
59
|
+
SSH connection cannot be opened.
|
60
|
+
|
61
|
+
Args:
|
62
|
+
connection: Fabric connection object
|
63
|
+
"""
|
64
|
+
if not connection.is_connected:
|
65
|
+
try:
|
66
|
+
connection.open()
|
67
|
+
except Exception as e:
|
68
|
+
raise RuntimeError(
|
69
|
+
f"Cannot open SSH connection (original error: '{str(e)}')."
|
70
|
+
)
|
71
|
+
|
72
|
+
|
73
|
+
def run_command_over_ssh(
|
74
|
+
*,
|
75
|
+
cmd: str,
|
76
|
+
connection: Connection,
|
77
|
+
max_attempts: int = MAX_ATTEMPTS,
|
78
|
+
base_interval: float = 3.0,
|
79
|
+
) -> str:
|
80
|
+
"""
|
81
|
+
Run a command within an open SSH connection.
|
82
|
+
|
83
|
+
Args:
|
84
|
+
cmd: Command to be run
|
85
|
+
connection: Fabric connection object
|
86
|
+
|
87
|
+
Returns:
|
88
|
+
Standard output of the command, if successful.
|
89
|
+
"""
|
90
|
+
t_0 = time.perf_counter()
|
91
|
+
ind_attempt = 0
|
92
|
+
while ind_attempt <= max_attempts:
|
93
|
+
ind_attempt += 1
|
94
|
+
prefix = f"[attempt {ind_attempt}/{max_attempts}]"
|
95
|
+
logger.info(f"{prefix} START running '{cmd}' over SSH.")
|
96
|
+
try:
|
97
|
+
# Case 1: Command runs successfully
|
98
|
+
res = connection.run(cmd, hide=True)
|
99
|
+
t_1 = time.perf_counter()
|
100
|
+
logger.info(
|
101
|
+
f"{prefix} END running '{cmd}' over SSH, "
|
102
|
+
f"elapsed {t_1-t_0:.3f}"
|
103
|
+
)
|
104
|
+
logger.debug(f"STDOUT: {res.stdout}")
|
105
|
+
logger.debug(f"STDERR: {res.stderr}")
|
106
|
+
return res.stdout
|
107
|
+
except NoValidConnectionsError as e:
|
108
|
+
# Case 2: Command fails with a connection error
|
109
|
+
logger.warning(
|
110
|
+
f"{prefix} Running command `{cmd}` over SSH failed.\n"
|
111
|
+
f"Original NoValidConnectionError:\n{str(e)}.\n"
|
112
|
+
f"{e.errors=}\n"
|
113
|
+
)
|
114
|
+
if ind_attempt < max_attempts:
|
115
|
+
sleeptime = (
|
116
|
+
base_interval**ind_attempt
|
117
|
+
) # FIXME SSH: add jitter?
|
118
|
+
logger.warning(
|
119
|
+
f"{prefix} Now sleep {sleeptime:.3f} seconds and continue."
|
120
|
+
)
|
121
|
+
time.sleep(sleeptime)
|
122
|
+
continue
|
123
|
+
else:
|
124
|
+
logger.error(f"{prefix} Reached last attempt")
|
125
|
+
break
|
126
|
+
except UnexpectedExit as e:
|
127
|
+
# Case 3: Command fails with an actual error
|
128
|
+
error_msg = (
|
129
|
+
f"{prefix} Running command `{cmd}` over SSH failed.\n"
|
130
|
+
f"Original error:\n{str(e)}."
|
131
|
+
)
|
132
|
+
logger.error(error_msg)
|
133
|
+
raise ValueError(error_msg)
|
134
|
+
except Exception as e:
|
135
|
+
logger.error(
|
136
|
+
f"Running command `{cmd}` over SSH failed.\n"
|
137
|
+
f"Original Error:\n{str(e)}."
|
138
|
+
)
|
139
|
+
raise e
|
140
|
+
|
141
|
+
raise ValueError(
|
142
|
+
f"Reached last attempt ({max_attempts=}) for running '{cmd}' over SSH"
|
143
|
+
)
|
144
|
+
|
145
|
+
|
146
|
+
def put_over_ssh(
|
147
|
+
*,
|
148
|
+
local: str,
|
149
|
+
remote: str,
|
150
|
+
connection: Connection,
|
151
|
+
logger_name: Optional[str] = None,
|
152
|
+
) -> None:
|
153
|
+
"""
|
154
|
+
Transfer a file via SSH
|
155
|
+
|
156
|
+
Args:
|
157
|
+
local: Local path to file
|
158
|
+
remote: Target path on remote host
|
159
|
+
connection: Fabric connection object
|
160
|
+
logger_name: Name of the logger
|
161
|
+
|
162
|
+
"""
|
163
|
+
try:
|
164
|
+
connection.put(local=local, remote=remote)
|
165
|
+
except Exception as e:
|
166
|
+
logger = get_logger(logger_name=logger_name)
|
167
|
+
logger.error(
|
168
|
+
f"Transferring {local=} to {remote=} over SSH failed.\n"
|
169
|
+
f"Original Error:\n{str(e)}."
|
170
|
+
)
|
171
|
+
raise e
|
172
|
+
|
173
|
+
|
174
|
+
def _mkdir_over_ssh(
|
175
|
+
*, folder: str, connection: Connection, parents: bool = True
|
176
|
+
) -> None:
|
177
|
+
"""
|
178
|
+
Create a folder remotely via SSH.
|
179
|
+
|
180
|
+
Args:
|
181
|
+
folder:
|
182
|
+
connection:
|
183
|
+
parents:
|
184
|
+
"""
|
185
|
+
# FIXME SSH: try using `mkdir` method of `paramiko.SFTPClient`
|
186
|
+
if parents:
|
187
|
+
cmd = f"mkdir -p {folder}"
|
188
|
+
else:
|
189
|
+
cmd = f"mkdir {folder}"
|
190
|
+
run_command_over_ssh(cmd=cmd, connection=connection)
|