fractal-server 2.2.0a0__py3-none-any.whl → 2.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fractal_server/__init__.py +1 -1
- fractal_server/app/db/__init__.py +1 -1
- fractal_server/app/models/v1/state.py +1 -2
- fractal_server/app/routes/admin/v1.py +2 -2
- fractal_server/app/routes/admin/v2.py +2 -2
- fractal_server/app/routes/api/v1/job.py +2 -2
- fractal_server/app/routes/api/v1/task_collection.py +4 -4
- fractal_server/app/routes/api/v2/__init__.py +23 -3
- fractal_server/app/routes/api/v2/job.py +2 -2
- fractal_server/app/routes/api/v2/submit.py +6 -0
- fractal_server/app/routes/api/v2/task_collection.py +74 -34
- fractal_server/app/routes/api/v2/task_collection_custom.py +170 -0
- fractal_server/app/routes/api/v2/task_collection_ssh.py +125 -0
- fractal_server/app/routes/aux/_runner.py +10 -2
- fractal_server/app/runner/compress_folder.py +120 -0
- fractal_server/app/runner/executors/slurm/__init__.py +0 -3
- fractal_server/app/runner/executors/slurm/_batching.py +0 -1
- fractal_server/app/runner/executors/slurm/_slurm_config.py +9 -9
- fractal_server/app/runner/executors/slurm/ssh/__init__.py +3 -0
- fractal_server/app/runner/executors/slurm/ssh/_executor_wait_thread.py +112 -0
- fractal_server/app/runner/executors/slurm/ssh/_slurm_job.py +120 -0
- fractal_server/app/runner/executors/slurm/ssh/executor.py +1488 -0
- fractal_server/app/runner/executors/slurm/sudo/__init__.py +3 -0
- fractal_server/app/runner/executors/slurm/{_check_jobs_status.py → sudo/_check_jobs_status.py} +1 -1
- fractal_server/app/runner/executors/slurm/{_executor_wait_thread.py → sudo/_executor_wait_thread.py} +1 -1
- fractal_server/app/runner/executors/slurm/{_subprocess_run_as_user.py → sudo/_subprocess_run_as_user.py} +1 -1
- fractal_server/app/runner/executors/slurm/{executor.py → sudo/executor.py} +12 -12
- fractal_server/app/runner/extract_archive.py +38 -0
- fractal_server/app/runner/v1/__init__.py +78 -40
- fractal_server/app/runner/v1/_slurm/__init__.py +1 -1
- fractal_server/app/runner/v2/__init__.py +183 -82
- fractal_server/app/runner/v2/_local_experimental/__init__.py +22 -12
- fractal_server/app/runner/v2/_local_experimental/executor.py +12 -8
- fractal_server/app/runner/v2/_slurm/__init__.py +1 -6
- fractal_server/app/runner/v2/_slurm_ssh/__init__.py +125 -0
- fractal_server/app/runner/v2/_slurm_ssh/_submit_setup.py +83 -0
- fractal_server/app/runner/v2/_slurm_ssh/get_slurm_config.py +182 -0
- fractal_server/app/runner/v2/runner_functions_low_level.py +9 -11
- fractal_server/app/runner/versions.py +30 -0
- fractal_server/app/schemas/v1/__init__.py +1 -0
- fractal_server/app/schemas/{state.py → v1/state.py} +4 -21
- fractal_server/app/schemas/v2/__init__.py +4 -1
- fractal_server/app/schemas/v2/task_collection.py +101 -30
- fractal_server/config.py +222 -21
- fractal_server/main.py +27 -1
- fractal_server/migrations/env.py +1 -1
- fractal_server/ssh/__init__.py +4 -0
- fractal_server/ssh/_fabric.py +245 -0
- fractal_server/tasks/utils.py +12 -64
- fractal_server/tasks/v1/background_operations.py +2 -2
- fractal_server/tasks/{endpoint_operations.py → v1/endpoint_operations.py} +7 -12
- fractal_server/tasks/v1/utils.py +67 -0
- fractal_server/tasks/v2/_TaskCollectPip.py +61 -32
- fractal_server/tasks/v2/_venv_pip.py +195 -0
- fractal_server/tasks/v2/background_operations.py +257 -295
- fractal_server/tasks/v2/background_operations_ssh.py +317 -0
- fractal_server/tasks/v2/endpoint_operations.py +136 -0
- fractal_server/tasks/v2/templates/_1_create_venv.sh +46 -0
- fractal_server/tasks/v2/templates/_2_upgrade_pip.sh +30 -0
- fractal_server/tasks/v2/templates/_3_pip_install.sh +32 -0
- fractal_server/tasks/v2/templates/_4_pip_freeze.sh +21 -0
- fractal_server/tasks/v2/templates/_5_pip_show.sh +59 -0
- fractal_server/tasks/v2/utils.py +54 -0
- {fractal_server-2.2.0a0.dist-info → fractal_server-2.3.0.dist-info}/METADATA +6 -2
- {fractal_server-2.2.0a0.dist-info → fractal_server-2.3.0.dist-info}/RECORD +68 -44
- fractal_server/tasks/v2/get_collection_data.py +0 -14
- {fractal_server-2.2.0a0.dist-info → fractal_server-2.3.0.dist-info}/LICENSE +0 -0
- {fractal_server-2.2.0a0.dist-info → fractal_server-2.3.0.dist-info}/WHEEL +0 -0
- {fractal_server-2.2.0a0.dist-info → fractal_server-2.3.0.dist-info}/entry_points.txt +0 -0
fractal_server/config.py
CHANGED
@@ -13,6 +13,7 @@
|
|
13
13
|
# Zurich.
|
14
14
|
import logging
|
15
15
|
import shutil
|
16
|
+
import sys
|
16
17
|
from os import environ
|
17
18
|
from os import getenv
|
18
19
|
from os.path import abspath
|
@@ -166,7 +167,7 @@ class Settings(BaseSettings):
|
|
166
167
|
###########################################################################
|
167
168
|
# DATABASE
|
168
169
|
###########################################################################
|
169
|
-
DB_ENGINE: Literal["sqlite", "postgres"] = "sqlite"
|
170
|
+
DB_ENGINE: Literal["sqlite", "postgres", "postgres-psycopg"] = "sqlite"
|
170
171
|
"""
|
171
172
|
Select which database engine to use (supported: `sqlite`, `postgres` and `postgres-psycopg`).
|
172
173
|
"""
|
@@ -201,39 +202,51 @@ class Settings(BaseSettings):
|
|
201
202
|
"""
|
202
203
|
|
203
204
|
@property
|
204
|
-
def
|
205
|
-
if self.DB_ENGINE == "
|
206
|
-
if not self.SQLITE_PATH:
|
207
|
-
raise FractalConfigurationError(
|
208
|
-
"SQLITE_PATH path cannot be None"
|
209
|
-
)
|
210
|
-
sqlite_path = abspath(self.SQLITE_PATH)
|
205
|
+
def DATABASE_ASYNC_URL(self) -> URL:
|
206
|
+
if self.DB_ENGINE == "postgres":
|
211
207
|
url = URL.create(
|
212
|
-
drivername="
|
213
|
-
|
208
|
+
drivername="postgresql+asyncpg",
|
209
|
+
username=self.POSTGRES_USER,
|
210
|
+
password=self.POSTGRES_PASSWORD,
|
211
|
+
host=self.POSTGRES_HOST,
|
212
|
+
port=self.POSTGRES_PORT,
|
213
|
+
database=self.POSTGRES_DB,
|
214
214
|
)
|
215
|
-
|
216
|
-
elif "postgres":
|
215
|
+
elif self.DB_ENGINE == "postgres-psycopg":
|
217
216
|
url = URL.create(
|
218
|
-
drivername="postgresql+
|
217
|
+
drivername="postgresql+psycopg",
|
219
218
|
username=self.POSTGRES_USER,
|
220
219
|
password=self.POSTGRES_PASSWORD,
|
221
220
|
host=self.POSTGRES_HOST,
|
222
221
|
port=self.POSTGRES_PORT,
|
223
222
|
database=self.POSTGRES_DB,
|
224
223
|
)
|
225
|
-
|
224
|
+
else:
|
225
|
+
if not self.SQLITE_PATH:
|
226
|
+
raise FractalConfigurationError(
|
227
|
+
"SQLITE_PATH path cannot be None"
|
228
|
+
)
|
229
|
+
sqlite_path = abspath(self.SQLITE_PATH)
|
230
|
+
url = URL.create(
|
231
|
+
drivername="sqlite+aiosqlite",
|
232
|
+
database=sqlite_path,
|
233
|
+
)
|
234
|
+
return url
|
226
235
|
|
227
236
|
@property
|
228
237
|
def DATABASE_SYNC_URL(self):
|
229
|
-
if self.DB_ENGINE == "
|
238
|
+
if self.DB_ENGINE == "postgres":
|
239
|
+
return self.DATABASE_ASYNC_URL.set(
|
240
|
+
drivername="postgresql+psycopg2"
|
241
|
+
)
|
242
|
+
elif self.DB_ENGINE == "postgres-psycopg":
|
243
|
+
return self.DATABASE_ASYNC_URL.set(drivername="postgresql+psycopg")
|
244
|
+
else:
|
230
245
|
if not self.SQLITE_PATH:
|
231
246
|
raise FractalConfigurationError(
|
232
247
|
"SQLITE_PATH path cannot be None"
|
233
248
|
)
|
234
|
-
return self.
|
235
|
-
elif self.DB_ENGINE == "postgres":
|
236
|
-
return self.DATABASE_URL.set(drivername="postgresql+psycopg2")
|
249
|
+
return self.DATABASE_ASYNC_URL.set(drivername="sqlite")
|
237
250
|
|
238
251
|
###########################################################################
|
239
252
|
# FRACTAL SPECIFIC
|
@@ -311,7 +324,10 @@ class Settings(BaseSettings):
|
|
311
324
|
return FRACTAL_RUNNER_WORKING_BASE_DIR_path
|
312
325
|
|
313
326
|
FRACTAL_RUNNER_BACKEND: Literal[
|
314
|
-
"local",
|
327
|
+
"local",
|
328
|
+
"local_experimental",
|
329
|
+
"slurm",
|
330
|
+
"slurm_ssh",
|
315
331
|
] = "local"
|
316
332
|
"""
|
317
333
|
Select which runner backend to use.
|
@@ -354,9 +370,125 @@ class Settings(BaseSettings):
|
|
354
370
|
|
355
371
|
FRACTAL_SLURM_WORKER_PYTHON: Optional[str] = None
|
356
372
|
"""
|
357
|
-
|
358
|
-
not specified, the same interpreter that runs the server is used.
|
373
|
+
Absolute path to Python interpreter that will run the jobs on the SLURM
|
374
|
+
nodes. If not specified, the same interpreter that runs the server is used.
|
375
|
+
"""
|
376
|
+
|
377
|
+
@validator("FRACTAL_SLURM_WORKER_PYTHON", always=True)
def absolute_FRACTAL_SLURM_WORKER_PYTHON(cls, v):
    """
    Reject a `FRACTAL_SLURM_WORKER_PYTHON` value that is a relative path.

    An unset value (`None`) is accepted and returned unchanged.
    """
    # Only a set-but-relative path is an error; everything else passes
    # through as given.
    if v is not None and not Path(v).is_absolute():
        raise FractalConfigurationError(
            f"Non-absolute value for FRACTAL_SLURM_WORKER_PYTHON={v}"
        )
    return v
|
390
|
+
|
391
|
+
FRACTAL_TASKS_PYTHON_DEFAULT_VERSION: Optional[
|
392
|
+
Literal["3.9", "3.10", "3.11", "3.12"]
|
393
|
+
] = None
|
394
|
+
"""
|
395
|
+
Default Python version to be used for task collection. Defaults to the
|
396
|
+
current version. Requires the corresponding variable (e.g.
|
397
|
+
`FRACTAL_TASKS_PYTHON_3_10`) to be set.
|
398
|
+
"""
|
399
|
+
|
400
|
+
FRACTAL_TASKS_PYTHON_3_9: Optional[str] = None
|
359
401
|
"""
|
402
|
+
Absolute path to the Python 3.9 interpreter that serves as base for virtual
|
403
|
+
environments tasks. Note that this interpreter must have the `venv` module
|
404
|
+
installed. If set, this must be an absolute path. If the version specified
|
405
|
+
in `FRACTAL_TASKS_PYTHON_DEFAULT_VERSION` is `"3.9"` and this attribute is
|
406
|
+
unset, `sys.executable` is used as a default.
|
407
|
+
"""
|
408
|
+
|
409
|
+
FRACTAL_TASKS_PYTHON_3_10: Optional[str] = None
|
410
|
+
"""
|
411
|
+
Same as `FRACTAL_TASKS_PYTHON_3_9`, for Python 3.10.
|
412
|
+
"""
|
413
|
+
|
414
|
+
FRACTAL_TASKS_PYTHON_3_11: Optional[str] = None
|
415
|
+
"""
|
416
|
+
Same as `FRACTAL_TASKS_PYTHON_3_9`, for Python 3.11.
|
417
|
+
"""
|
418
|
+
|
419
|
+
FRACTAL_TASKS_PYTHON_3_12: Optional[str] = None
|
420
|
+
"""
|
421
|
+
Same as `FRACTAL_TASKS_PYTHON_3_9`, for Python 3.12.
|
422
|
+
"""
|
423
|
+
|
424
|
+
@root_validator(pre=True)
def check_tasks_python(cls, values) -> dict:
    """
    Perform multiple checks of the Python-interpreter variables.

    1. Each `FRACTAL_TASKS_PYTHON_X_Y` variable must be an absolute path,
       if set.
    2. If `FRACTAL_TASKS_PYTHON_DEFAULT_VERSION` is set, the corresponding
       `FRACTAL_TASKS_PYTHON_X_Y` variable must be set as well.
    3. If `FRACTAL_TASKS_PYTHON_DEFAULT_VERSION` is unset, use
       `sys.executable` and set the corresponding
       `FRACTAL_TASKS_PYTHON_X_Y` (and unset all others).

    Args:
        values: Raw (pre-validation) settings values.

    Returns:
        The `values` dictionary, possibly updated as per point 3.

    Raises:
        FractalConfigurationError: If an interpreter path is not absolute,
            or if the default version is set but its interpreter is not.
    """
    supported_versions = ("3_9", "3_10", "3_11", "3_12")

    # `FRACTAL_TASKS_PYTHON_X_Y` variables can only be absolute paths
    for version in supported_versions:
        key = f"FRACTAL_TASKS_PYTHON_{version}"
        value = values.get(key)
        if value is not None and not Path(value).is_absolute():
            raise FractalConfigurationError(
                f"Non-absolute value {key}={value}"
            )

    default_version = values.get("FRACTAL_TASKS_PYTHON_DEFAULT_VERSION")

    if default_version is not None:
        # "production/slurm" branch
        # If a default version is set, then the corresponding interpreter
        # must also be set
        default_version_underscore = default_version.replace(".", "_")
        key = f"FRACTAL_TASKS_PYTHON_{default_version_underscore}"
        value = values.get(key)
        if value is None:
            msg = (
                f"FRACTAL_TASKS_PYTHON_DEFAULT_VERSION={default_version} "
                f"but {key}={value}."
            )
            logging.error(msg)
            raise FractalConfigurationError(msg)

    else:
        # If no default version is set, then only `sys.executable` is made
        # available
        _info = sys.version_info
        current_version = f"{_info.major}_{_info.minor}"
        current_version_dot = f"{_info.major}.{_info.minor}"
        values[
            "FRACTAL_TASKS_PYTHON_DEFAULT_VERSION"
        ] = current_version_dot
        logging.info(
            "Setting FRACTAL_TASKS_PYTHON_DEFAULT_VERSION to "
            f"{current_version_dot}"
        )

        # Unset all existing interpreter variables, except the one that
        # matches the running interpreter
        for _version in supported_versions:
            key = f"FRACTAL_TASKS_PYTHON_{_version}"
            if _version == current_version:
                values[key] = sys.executable
                logging.info(f"Setting {key} to {sys.executable}.")
            else:
                value = values.get(key)
                if value is not None:
                    logging.info(
                        f"Setting {key} to None (given: {value}), "
                        "because FRACTAL_TASKS_PYTHON_DEFAULT_VERSION was "
                        "not set."
                    )
                    values[key] = None

    # A root validator must hand the values back
    return values
|
360
492
|
|
361
493
|
FRACTAL_SLURM_POLL_INTERVAL: int = 5
|
362
494
|
"""
|
@@ -380,6 +512,25 @@ class Settings(BaseSettings):
|
|
380
512
|
`JobExecutionError`.
|
381
513
|
"""
|
382
514
|
|
515
|
+
FRACTAL_SLURM_SSH_HOST: Optional[str] = None
|
516
|
+
"""
|
517
|
+
SSH-reachable host where a SLURM client is available.
|
518
|
+
"""
|
519
|
+
FRACTAL_SLURM_SSH_USER: Optional[str] = None
|
520
|
+
"""
|
521
|
+
User on `FRACTAL_SLURM_SSH_HOST`.
|
522
|
+
"""
|
523
|
+
FRACTAL_SLURM_SSH_PRIVATE_KEY_PATH: Optional[str] = None
|
524
|
+
"""
|
525
|
+
Private key for connecting to `FRACTAL_SLURM_SSH_HOST` as
|
526
|
+
`FRACTAL_SLURM_SSH_USER`.
|
527
|
+
"""
|
528
|
+
# FIXME SSH: Split this into two folders (for tasks and for jobs)
|
529
|
+
FRACTAL_SLURM_SSH_WORKING_BASE_DIR: Optional[str] = None
|
530
|
+
"""
|
531
|
+
Remote folder on `FRACTAL_SLURM_SSH_HOST`.
|
532
|
+
"""
|
533
|
+
|
383
534
|
FRACTAL_API_SUBMIT_RATE_LIMIT: int = 2
|
384
535
|
"""
|
385
536
|
Interval to wait (in seconds) to be allowed to call again
|
@@ -420,6 +571,14 @@ class Settings(BaseSettings):
|
|
420
571
|
"DB engine is `postgres` but `psycopg2` or `asyncpg` "
|
421
572
|
"are not available"
|
422
573
|
)
|
574
|
+
elif self.DB_ENGINE == "postgres-psycopg":
|
575
|
+
try:
|
576
|
+
import psycopg # noqa: F401
|
577
|
+
except ModuleNotFoundError:
|
578
|
+
raise FractalConfigurationError(
|
579
|
+
"DB engine is `postgres-psycopg` but `psycopg` is not "
|
580
|
+
"available"
|
581
|
+
)
|
423
582
|
else:
|
424
583
|
if not self.SQLITE_PATH:
|
425
584
|
raise FractalConfigurationError(
|
@@ -460,6 +619,48 @@ class Settings(BaseSettings):
|
|
460
619
|
raise FractalConfigurationError(
|
461
620
|
f"{info} but `squeue` command not found."
|
462
621
|
)
|
622
|
+
elif self.FRACTAL_RUNNER_BACKEND == "slurm_ssh":
|
623
|
+
if self.FRACTAL_SLURM_WORKER_PYTHON is None:
|
624
|
+
raise FractalConfigurationError(
|
625
|
+
f"Must set FRACTAL_SLURM_WORKER_PYTHON when {info}"
|
626
|
+
)
|
627
|
+
if self.FRACTAL_SLURM_SSH_USER is None:
|
628
|
+
raise FractalConfigurationError(
|
629
|
+
f"Must set FRACTAL_SLURM_SSH_USER when {info}"
|
630
|
+
)
|
631
|
+
if self.FRACTAL_SLURM_SSH_HOST is None:
|
632
|
+
raise FractalConfigurationError(
|
633
|
+
f"Must set FRACTAL_SLURM_SSH_HOST when {info}"
|
634
|
+
)
|
635
|
+
if self.FRACTAL_SLURM_SSH_PRIVATE_KEY_PATH is None:
|
636
|
+
raise FractalConfigurationError(
|
637
|
+
f"Must set FRACTAL_SLURM_SSH_PRIVATE_KEY_PATH when {info}"
|
638
|
+
)
|
639
|
+
if self.FRACTAL_SLURM_SSH_WORKING_BASE_DIR is None:
|
640
|
+
raise FractalConfigurationError(
|
641
|
+
f"Must set FRACTAL_SLURM_SSH_WORKING_BASE_DIR when {info}"
|
642
|
+
)
|
643
|
+
|
644
|
+
from fractal_server.app.runner.executors.slurm._slurm_config import ( # noqa: E501
|
645
|
+
load_slurm_config_file,
|
646
|
+
)
|
647
|
+
|
648
|
+
if not self.FRACTAL_SLURM_CONFIG_FILE:
|
649
|
+
raise FractalConfigurationError(
|
650
|
+
f"Must set FRACTAL_SLURM_CONFIG_FILE when {info}"
|
651
|
+
)
|
652
|
+
else:
|
653
|
+
if not self.FRACTAL_SLURM_CONFIG_FILE.exists():
|
654
|
+
raise FractalConfigurationError(
|
655
|
+
f"{info} but FRACTAL_SLURM_CONFIG_FILE="
|
656
|
+
f"{self.FRACTAL_SLURM_CONFIG_FILE} not found."
|
657
|
+
)
|
658
|
+
|
659
|
+
load_slurm_config_file(self.FRACTAL_SLURM_CONFIG_FILE)
|
660
|
+
if not shutil.which("ssh"):
|
661
|
+
raise FractalConfigurationError(
|
662
|
+
f"{info} but `ssh` command not found."
|
663
|
+
)
|
463
664
|
else: # i.e. self.FRACTAL_RUNNER_BACKEND == "local"
|
464
665
|
if self.FRACTAL_LOCAL_CONFIG_FILE:
|
465
666
|
if not self.FRACTAL_LOCAL_CONFIG_FILE.exists():
|
fractal_server/main.py
CHANGED
@@ -20,6 +20,7 @@ from contextlib import asynccontextmanager
|
|
20
20
|
|
21
21
|
from fastapi import FastAPI
|
22
22
|
|
23
|
+
from .app.routes.aux._runner import _backend_supports_shutdown # FIXME: change
|
23
24
|
from .app.runner.shutdown import cleanup_after_shutdown
|
24
25
|
from .app.security import _create_first_user
|
25
26
|
from .config import get_settings
|
@@ -97,17 +98,40 @@ async def lifespan(app: FastAPI):
|
|
97
98
|
is_superuser=True,
|
98
99
|
is_verified=True,
|
99
100
|
)
|
101
|
+
|
102
|
+
if settings.FRACTAL_RUNNER_BACKEND == "slurm_ssh":
|
103
|
+
from fractal_server.ssh._fabric import get_ssh_connection
|
104
|
+
from fractal_server.ssh._fabric import FractalSSH
|
105
|
+
|
106
|
+
connection = get_ssh_connection()
|
107
|
+
app.state.fractal_ssh = FractalSSH(connection=connection)
|
108
|
+
logger.info(
|
109
|
+
f"Created SSH connection "
|
110
|
+
f"({app.state.fractal_ssh.is_connected=})."
|
111
|
+
)
|
112
|
+
else:
|
113
|
+
app.state.fractal_ssh = None
|
114
|
+
|
100
115
|
config_uvicorn_loggers()
|
101
116
|
logger.info("End application startup")
|
102
117
|
reset_logger_handlers(logger)
|
103
118
|
yield
|
104
119
|
logger = get_logger("fractal_server.lifespan")
|
105
120
|
logger.info("Start application shutdown")
|
121
|
+
|
122
|
+
if settings.FRACTAL_RUNNER_BACKEND == "slurm_ssh":
|
123
|
+
logger.info(
|
124
|
+
f"Closing SSH connection "
|
125
|
+
f"(current: {app.state.fractal_ssh.is_connected=})."
|
126
|
+
)
|
127
|
+
|
128
|
+
app.state.fractal_ssh.close()
|
129
|
+
|
106
130
|
logger.info(
|
107
131
|
f"Current worker with pid {os.getpid()} is shutting down. "
|
108
132
|
f"Current jobs: {app.state.jobsV1=}, {app.state.jobsV2=}"
|
109
133
|
)
|
110
|
-
if settings.FRACTAL_RUNNER_BACKEND
|
134
|
+
if _backend_supports_shutdown(settings.FRACTAL_RUNNER_BACKEND):
|
111
135
|
try:
|
112
136
|
await cleanup_after_shutdown(
|
113
137
|
jobsV1=app.state.jobsV1,
|
@@ -120,6 +144,8 @@ async def lifespan(app: FastAPI):
|
|
120
144
|
"some of running jobs are not shutdown properly. "
|
121
145
|
f"Original error: {e}"
|
122
146
|
)
|
147
|
+
else:
|
148
|
+
logger.info("Shutdown not available for this backend runner.")
|
123
149
|
|
124
150
|
logger.info("End application shutdown")
|
125
151
|
reset_logger_handlers(logger)
|
fractal_server/migrations/env.py
CHANGED
@@ -54,7 +54,7 @@ def run_migrations_offline() -> None:
|
|
54
54
|
settings = Inject(get_settings)
|
55
55
|
settings.check_db()
|
56
56
|
context.configure(
|
57
|
-
url=settings.
|
57
|
+
url=settings.DATABASE_ASYNC_URL,
|
58
58
|
target_metadata=target_metadata,
|
59
59
|
literal_binds=True,
|
60
60
|
dialect_opts={"paramstyle": "named"},
|
@@ -0,0 +1,245 @@
|
|
1
|
+
import time
|
2
|
+
from contextlib import contextmanager
|
3
|
+
from threading import Lock
|
4
|
+
from typing import Any
|
5
|
+
from typing import Optional
|
6
|
+
|
7
|
+
from fabric import Connection
|
8
|
+
from fabric import Result
|
9
|
+
from invoke import UnexpectedExit
|
10
|
+
from paramiko.ssh_exception import NoValidConnectionsError
|
11
|
+
|
12
|
+
from ..logger import get_logger
|
13
|
+
from ..logger import set_logger
|
14
|
+
from fractal_server.config import get_settings
|
15
|
+
from fractal_server.syringe import Inject
|
16
|
+
|
17
|
+
logger = set_logger(__name__)
|
18
|
+
|
19
|
+
MAX_ATTEMPTS = 5
|
20
|
+
|
21
|
+
|
22
|
+
class TimeoutException(Exception):
    """Raised when a lock cannot be acquired within the given timeout."""
|
24
|
+
|
25
|
+
|
26
|
+
@contextmanager
def acquire_timeout(lock: Lock, timeout: int) -> Any:
    """
    Context manager that acquires `lock`, waiting at most `timeout` seconds.

    Args:
        lock: Lock to be acquired.
        timeout: Maximum time (in seconds) to wait for the lock.

    Yields:
        The (truthy) result of the successful `lock.acquire` call.

    Raises:
        TimeoutException: If the lock is not acquired within `timeout`.
    """
    logger.debug(f"Trying to acquire lock, with {timeout=}")
    acquired = lock.acquire(timeout=timeout)
    if not acquired:
        # The lock is not held here, so there is nothing to release
        raise TimeoutException(
            f"Failed to acquire lock within {timeout} seconds"
        )
    try:
        logger.debug("Lock was acquired.")
        yield acquired
    finally:
        lock.release()
        logger.debug("Lock was released")
|
41
|
+
|
42
|
+
|
43
|
+
class FractalSSH(object):
    """
    Wrapper of a `fabric.Connection` object, with a lock that is held
    during `put`, `get` and `run` calls.

    Attributes:
        lock: Lock acquired by `put`, `get` and `run`.
        conn: The wrapped connection object.
        default_timeout: Timeout (in seconds) for acquiring `lock`, when no
            explicit timeout is provided.
    """

    lock: Lock
    conn: Connection
    default_timeout: int

    # FIXME SSH: maybe extend the actual_timeout logic to other methods

    def __init__(self, connection: Connection, default_timeout: int = 250):
        self.lock = Lock()
        self.conn = connection
        self.default_timeout = default_timeout

    @property
    def is_connected(self) -> bool:
        """Return the `is_connected` attribute of the wrapped connection."""
        return self.conn.is_connected

    def put(self, *args, timeout: Optional[int] = None, **kwargs) -> Result:
        """
        Call `put` on the wrapped connection, while holding the lock.

        Args:
            timeout: Timeout (in seconds) for acquiring the lock; a falsy
                value (`None` or `0`) falls back to `default_timeout`.
        """
        actual_timeout = timeout or self.default_timeout
        with acquire_timeout(self.lock, timeout=actual_timeout):
            return self.conn.put(*args, **kwargs)

    def get(self, *args, **kwargs) -> Result:
        """Call `get` on the wrapped connection, while holding the lock."""
        with acquire_timeout(self.lock, timeout=self.default_timeout):
            return self.conn.get(*args, **kwargs)

    def run(self, *args, **kwargs) -> Any:
        """Call `run` on the wrapped connection, while holding the lock."""
        with acquire_timeout(self.lock, timeout=self.default_timeout):
            return self.conn.run(*args, **kwargs)

    def close(self):
        """Call `close` on the wrapped connection (lock is not acquired)."""
        return self.conn.close()

    def sftp(self):
        """Call `sftp` on the wrapped connection (lock is not acquired)."""
        return self.conn.sftp()

    def check_connection(self) -> None:
        """
        Open the SSH connection and handle exceptions.

        This function can be called from within other functions that use
        `connection`, so that we can provide a meaningful error in case the
        SSH connection cannot be opened.

        Raises:
            RuntimeError: If opening the connection fails; the original
                exception is attached as its cause.
        """
        if not self.conn.is_connected:
            try:
                self.conn.open()
            except Exception as e:
                raise RuntimeError(
                    f"Cannot open SSH connection (original error: '{str(e)}')."
                ) from e
|
93
|
+
|
94
|
+
|
95
|
+
def get_ssh_connection(
    *,
    host: Optional[str] = None,
    user: Optional[str] = None,
    key_filename: Optional[str] = None,
) -> Connection:
    """
    Build a `fabric.Connection` object, taking from fractal-server settings
    any argument that is not explicitly provided.

    Args:
        host: Remote host; if `None`, use `FRACTAL_SLURM_SSH_HOST`.
        user: Remote user; if `None`, use `FRACTAL_SLURM_SSH_USER`.
        key_filename: Private-key path; if `None`, use
            `FRACTAL_SLURM_SSH_PRIVATE_KEY_PATH`.

    Returns:
        Fabric connection object
    """
    settings = Inject(get_settings)

    # Settings are only read for the arguments that were left unset
    host = settings.FRACTAL_SLURM_SSH_HOST if host is None else host
    user = settings.FRACTAL_SLURM_SSH_USER if user is None else user
    key_filename = (
        settings.FRACTAL_SLURM_SSH_PRIVATE_KEY_PATH
        if key_filename is None
        else key_filename
    )

    connect_kwargs = {"key_filename": key_filename}
    connection = Connection(
        host=host, user=user, connect_kwargs=connect_kwargs
    )
    logger.debug(f"Now created {connection=}.")
    return connection
|
128
|
+
|
129
|
+
|
130
|
+
def run_command_over_ssh(
    *,
    cmd: str,
    fractal_ssh: FractalSSH,
    max_attempts: int = MAX_ATTEMPTS,
    base_interval: float = 3.0,
) -> str:
    """
    Run a command within an open SSH connection.

    Connection errors (`NoValidConnectionsError`) are retried, sleeping
    `base_interval**attempt` seconds between attempts. Any other failure
    is logged and raised right away.

    Args:
        cmd: Command to be run
        fractal_ssh: FractalSSH connection object with custom lock
        max_attempts: Maximum number of attempts.
        base_interval: Base (in seconds) of the exponentially-growing sleep
            time between two attempts.

    Returns:
        Standard output of the command, if successful.

    Raises:
        ValueError: If the command fails with `UnexpectedExit`, or if the
            last attempt failed with a connection error. The original
            exception is attached as the cause.
    """
    t_0 = time.perf_counter()
    last_connection_error = None
    ind_attempt = 0
    # NOTE: with the `<=` bound, one attempt is made even for
    # `max_attempts=0`; for `max_attempts>=1` the loop is left through the
    # `break` below after the `max_attempts`-th connection failure.
    while ind_attempt <= max_attempts:
        ind_attempt += 1
        prefix = f"[attempt {ind_attempt}/{max_attempts}]"
        logger.info(f"{prefix} START running '{cmd}' over SSH.")
        try:
            # Case 1: Command runs successfully
            res = fractal_ssh.run(cmd, hide=True)
            t_1 = time.perf_counter()
            logger.info(
                f"{prefix} END running '{cmd}' over SSH, "
                f"elapsed {t_1-t_0:.3f}"
            )
            logger.debug(f"STDOUT: {res.stdout}")
            logger.debug(f"STDERR: {res.stderr}")
            return res.stdout
        except NoValidConnectionsError as e:
            # Case 2: Command fails with a connection error
            # Keep a reference, since `e` is unbound after this block
            last_connection_error = e
            logger.warning(
                f"{prefix} Running command `{cmd}` over SSH failed.\n"
                f"Original NoValidConnectionError:\n{str(e)}.\n"
                f"{e.errors=}\n"
            )
            if ind_attempt < max_attempts:
                sleeptime = base_interval**ind_attempt
                logger.warning(
                    f"{prefix} Now sleep {sleeptime:.3f} seconds and continue."
                )
                time.sleep(sleeptime)
                continue
            else:
                logger.error(f"{prefix} Reached last attempt")
                break
        except UnexpectedExit as e:
            # Case 3: Command fails with an actual error
            error_msg = (
                f"{prefix} Running command `{cmd}` over SSH failed.\n"
                f"Original error:\n{str(e)}."
            )
            logger.error(error_msg)
            raise ValueError(error_msg) from e
        except Exception as e:
            # Case 4: Any other error is logged and propagated unchanged
            logger.error(
                f"Running command `{cmd}` over SSH failed.\n"
                f"Original Error:\n{str(e)}."
            )
            raise

    raise ValueError(
        f"Reached last attempt ({max_attempts=}) for running '{cmd}' over SSH"
    ) from last_connection_error
|
199
|
+
|
200
|
+
|
201
|
+
def put_over_ssh(
    *,
    local: str,
    remote: str,
    fractal_ssh: FractalSSH,
    logger_name: Optional[str] = None,
) -> None:
    """
    Upload a local file to the remote host, through `fractal_ssh.put`.

    Any failure is logged as an error and then re-raised.

    Args:
        local: Local path to file
        remote: Target path on remote host
        fractal_ssh: FractalSSH connection object with custom lock
        logger_name: Name of the logger used for the error message
    """
    try:
        fractal_ssh.put(local=local, remote=remote)
    except Exception as e:
        error_logger = get_logger(logger_name=logger_name)
        failure_msg = (
            f"Transferring {local=} to {remote=} over SSH failed.\n"
            f"Original Error:\n{str(e)}."
        )
        error_logger.error(failure_msg)
        raise e
|
227
|
+
|
228
|
+
|
229
|
+
def _mkdir_over_ssh(
    *, folder: str, fractal_ssh: FractalSSH, parents: bool = True
) -> None:
    """
    Run `mkdir` for `folder` on the remote host, via SSH.

    Args:
        folder: Path of the folder to create; inserted as-is in the command.
        fractal_ssh: FractalSSH object used to run the command.
        parents: If true, run `mkdir -p` rather than plain `mkdir`.
    """
    # FIXME SSH: try using `mkdir` method of `paramiko.SFTPClient`
    # NOTE(review): `folder` is not shell-quoted, so paths with spaces or
    # shell metacharacters are interpreted by the remote shell.
    mkdir = "mkdir -p" if parents else "mkdir"
    run_command_over_ssh(cmd=f"{mkdir} {folder}", fractal_ssh=fractal_ssh)
|