fractal-server 2.2.0a1__py3-none-any.whl → 2.3.0a0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. fractal_server/__init__.py +1 -1
  2. fractal_server/app/models/v1/state.py +1 -2
  3. fractal_server/app/routes/admin/v1.py +2 -2
  4. fractal_server/app/routes/admin/v2.py +2 -2
  5. fractal_server/app/routes/api/v1/job.py +2 -2
  6. fractal_server/app/routes/api/v1/task_collection.py +4 -4
  7. fractal_server/app/routes/api/v2/__init__.py +23 -3
  8. fractal_server/app/routes/api/v2/job.py +2 -2
  9. fractal_server/app/routes/api/v2/submit.py +6 -0
  10. fractal_server/app/routes/api/v2/task_collection.py +74 -34
  11. fractal_server/app/routes/api/v2/task_collection_custom.py +144 -0
  12. fractal_server/app/routes/api/v2/task_collection_ssh.py +125 -0
  13. fractal_server/app/routes/aux/_runner.py +10 -2
  14. fractal_server/app/runner/compress_folder.py +120 -0
  15. fractal_server/app/runner/executors/slurm/__init__.py +0 -3
  16. fractal_server/app/runner/executors/slurm/_batching.py +0 -1
  17. fractal_server/app/runner/executors/slurm/_slurm_config.py +9 -9
  18. fractal_server/app/runner/executors/slurm/ssh/__init__.py +3 -0
  19. fractal_server/app/runner/executors/slurm/ssh/_executor_wait_thread.py +112 -0
  20. fractal_server/app/runner/executors/slurm/ssh/_slurm_job.py +120 -0
  21. fractal_server/app/runner/executors/slurm/ssh/executor.py +1490 -0
  22. fractal_server/app/runner/executors/slurm/sudo/__init__.py +3 -0
  23. fractal_server/app/runner/executors/slurm/{_check_jobs_status.py → sudo/_check_jobs_status.py} +1 -1
  24. fractal_server/app/runner/executors/slurm/{_executor_wait_thread.py → sudo/_executor_wait_thread.py} +1 -1
  25. fractal_server/app/runner/executors/slurm/{_subprocess_run_as_user.py → sudo/_subprocess_run_as_user.py} +1 -1
  26. fractal_server/app/runner/executors/slurm/{executor.py → sudo/executor.py} +12 -12
  27. fractal_server/app/runner/extract_archive.py +38 -0
  28. fractal_server/app/runner/v1/__init__.py +78 -40
  29. fractal_server/app/runner/v1/_slurm/__init__.py +1 -1
  30. fractal_server/app/runner/v2/__init__.py +147 -62
  31. fractal_server/app/runner/v2/_local_experimental/__init__.py +22 -12
  32. fractal_server/app/runner/v2/_local_experimental/executor.py +12 -8
  33. fractal_server/app/runner/v2/_slurm/__init__.py +1 -6
  34. fractal_server/app/runner/v2/_slurm_ssh/__init__.py +126 -0
  35. fractal_server/app/runner/v2/_slurm_ssh/_submit_setup.py +83 -0
  36. fractal_server/app/runner/v2/_slurm_ssh/get_slurm_config.py +182 -0
  37. fractal_server/app/runner/v2/runner_functions_low_level.py +9 -11
  38. fractal_server/app/runner/versions.py +30 -0
  39. fractal_server/app/schemas/v1/__init__.py +1 -0
  40. fractal_server/app/schemas/{state.py → v1/state.py} +4 -21
  41. fractal_server/app/schemas/v2/__init__.py +4 -1
  42. fractal_server/app/schemas/v2/task_collection.py +97 -27
  43. fractal_server/config.py +184 -3
  44. fractal_server/main.py +25 -1
  45. fractal_server/ssh/__init__.py +4 -0
  46. fractal_server/ssh/_fabric.py +190 -0
  47. fractal_server/tasks/utils.py +12 -64
  48. fractal_server/tasks/v1/background_operations.py +2 -2
  49. fractal_server/tasks/{endpoint_operations.py → v1/endpoint_operations.py} +7 -12
  50. fractal_server/tasks/v1/utils.py +67 -0
  51. fractal_server/tasks/v2/_TaskCollectPip.py +61 -32
  52. fractal_server/tasks/v2/_venv_pip.py +195 -0
  53. fractal_server/tasks/v2/background_operations.py +257 -295
  54. fractal_server/tasks/v2/background_operations_ssh.py +304 -0
  55. fractal_server/tasks/v2/endpoint_operations.py +136 -0
  56. fractal_server/tasks/v2/templates/_1_create_venv.sh +46 -0
  57. fractal_server/tasks/v2/templates/_2_upgrade_pip.sh +30 -0
  58. fractal_server/tasks/v2/templates/_3_pip_install.sh +32 -0
  59. fractal_server/tasks/v2/templates/_4_pip_freeze.sh +21 -0
  60. fractal_server/tasks/v2/templates/_5_pip_show.sh +59 -0
  61. fractal_server/tasks/v2/utils.py +54 -0
  62. {fractal_server-2.2.0a1.dist-info → fractal_server-2.3.0a0.dist-info}/METADATA +4 -2
  63. {fractal_server-2.2.0a1.dist-info → fractal_server-2.3.0a0.dist-info}/RECORD +66 -42
  64. fractal_server/tasks/v2/get_collection_data.py +0 -14
  65. {fractal_server-2.2.0a1.dist-info → fractal_server-2.3.0a0.dist-info}/LICENSE +0 -0
  66. {fractal_server-2.2.0a1.dist-info → fractal_server-2.3.0a0.dist-info}/WHEEL +0 -0
  67. {fractal_server-2.2.0a1.dist-info → fractal_server-2.3.0a0.dist-info}/entry_points.txt +0 -0
@@ -1,14 +1,26 @@
1
+ from datetime import datetime
2
+ from enum import Enum
1
3
  from pathlib import Path
4
+ from typing import Any
2
5
  from typing import Literal
3
6
  from typing import Optional
4
7
 
5
8
  from pydantic import BaseModel
6
- from pydantic import Field
9
+ from pydantic import root_validator
7
10
  from pydantic import validator
8
11
 
9
12
  from .._validators import valdictkeys
10
13
  from .._validators import valstr
11
- from .task import TaskReadV2
14
+ from fractal_server.app.schemas._validators import valutc
15
+ from fractal_server.app.schemas.v2 import ManifestV2
16
+
17
+
18
class CollectionStatusV2(str, Enum):
    """
    Status values of a task collection.

    Members are also `str` instances, so each of them compares equal to its
    plain-string value (e.g. `CollectionStatusV2.OK == "OK"`).
    """

    PENDING = "pending"
    INSTALLING = "installing"
    COLLECTING = "collecting"
    FAIL = "fail"
    OK = "OK"
12
24
 
13
25
 
14
26
  class TaskCollectPipV2(BaseModel):
@@ -41,7 +53,7 @@ class TaskCollectPipV2(BaseModel):
41
53
  package: str
42
54
  package_version: Optional[str] = None
43
55
  package_extras: Optional[str] = None
44
- python_version: Optional[str] = None
56
+ python_version: Optional[Literal["3.9", "3.10", "3.11", "3.12"]] = None
45
57
  pinned_package_versions: Optional[dict[str, str]] = None
46
58
 
47
59
  _pinned_package_versions = validator(
@@ -70,40 +82,98 @@ class TaskCollectPipV2(BaseModel):
70
82
 
71
83
  @validator("package_version")
72
84
  def package_version_validator(cls, v, values):
73
-
74
- valstr("package_version")(v)
75
-
85
+ v = valstr("package_version")(v)
76
86
  if values["package"].endswith(".whl"):
77
87
  raise ValueError(
78
- "Cannot provide version when package is a Wheel file."
88
+ "Cannot provide package version when package is a wheel file."
79
89
  )
80
90
  return v
81
91
 
82
92
 
83
- class TaskCollectStatusV2(BaseModel):
93
+ class TaskCollectCustomV2(BaseModel):
84
94
  """
85
- TaskCollectStatus class
86
-
87
95
  Attributes:
88
- status:
89
- package:
90
- venv_path:
91
- task_list:
92
- log:
93
- info:
96
+ manifest: Manifest of a Fractal task package (this is typically the
97
+ content of `__FRACTAL_MANIFEST__.json`).
98
+ python_interpreter: Absolute path to the Python interpreter to be used
99
+ for running tasks.
100
+ source: A common label identifying this package.
101
+ package_root: The folder where the package is installed.
102
+ If not provided, it will be extracted via `pip show`
103
+ (requires `package_name` to be set).
104
+ package_name: Name of the package, as used for `import <package_name>`;
105
+ this is then used to extract the package directory (`package_root`)
106
+ via `pip show <package_name>`.
107
+ version: Optional version of tasks to be collected.
94
108
  """
95
109
 
96
- status: Literal["pending", "installing", "collecting", "fail", "OK"]
97
- package: str
98
- venv_path: Path
99
- task_list: Optional[list[TaskReadV2]] = Field(default=[])
100
- log: Optional[str]
101
- info: Optional[str]
110
+ manifest: ManifestV2
111
+ python_interpreter: str
112
+ source: str
113
+ package_root: Optional[str]
114
+ package_name: Optional[str]
115
+ version: Optional[str]
116
+
117
@root_validator(pre=True)
def one_of_package_root_or_name(cls, values):
    """
    Require that exactly one of `package_root` and `package_name` is set.

    This validator runs with `pre=True`, i.e. on the raw input: an optional
    key that was not provided is simply absent from `values`. A missing
    key is therefore read with `.get()` and treated like an explicit
    `None`, rather than raising `KeyError`.

    Raises:
        ValueError: If both or neither of the two attributes are set.
    """
    package_root = values.get("package_root")
    package_name = values.get("package_name")
    # Equal "is None" flags mean that both are set or both are unset.
    if (package_root is None) == (package_name is None):
        raise ValueError(
            "One and only one must be set between "
            "'package_root' and 'package_name'"
        )
    return values
102
129
 
103
- def sanitised_dict(self):
130
@validator("package_name")
def package_name_prevent_injection(cls, value: str):
    """
    Remove all whitespace characters, and reject values containing `;`.

    `None` is passed through unchanged.

    Raises:
        ValueError: If `value` contains `;`.
    """
    if value is not None:
        if ";" in value:
            raise ValueError(f"Invalid package_name: {value}")
        # `str.split()` without arguments splits on any whitespace (spaces,
        # tabs, newlines, ...), so that all of it is dropped and not only
        # the plain space character.
        value = "".join(value.split())
    return value
140
+
141
@validator("package_root")
def package_root_validator(cls, value):
    """
    Accept `None` or an absolute path, and reject any relative path.
    """
    if value is None:
        return value
    if Path(value).is_absolute():
        return value
    raise ValueError(
        f"'package_root' must be an absolute path: (given {value})."
    )
148
+
149
@validator("python_interpreter")
def python_interpreter_validator(cls, value):
    """
    Reject a `python_interpreter` value which is not an absolute path.
    """
    interpreter_is_absolute = Path(value).is_absolute()
    if interpreter_is_absolute:
        return value
    raise ValueError(
        f"Python interpreter path must be absolute: (given {value})."
    )
156
+
157
+ # Valstr
158
+ _python_interpreter = validator("python_interpreter", allow_reuse=True)(
159
+ valstr("python_interpreter")
160
+ )
161
+ _source = validator("source", allow_reuse=True)(valstr("source"))
162
+ _package_root = validator("package_root", allow_reuse=True)(
163
+ valstr("package_root", accept_none=True)
164
+ )
165
+ _package_name = validator("package_name", allow_reuse=True)(
166
+ valstr("package_name", accept_none=True)
167
+ )
168
+ _version = validator("version", allow_reuse=True)(
169
+ valstr("version", accept_none=True)
170
+ )
171
+
172
+
173
class CollectionStateReadV2(BaseModel):
    """
    Schema with the `id`, `data` and `timestamp` of a collection state.

    Attributes:
        id: Optional integer identifier.
        data: Arbitrary dictionary with string keys.
        timestamp: Datetime, validated through `valutc`.
    """

    id: Optional[int]
    data: dict[str, Any]
    timestamp: datetime

    _timestamp = validator("timestamp", allow_reuse=True)(valutc("timestamp"))
fractal_server/config.py CHANGED
@@ -13,6 +13,7 @@
13
13
  # Zurich.
14
14
  import logging
15
15
  import shutil
16
+ import sys
16
17
  from os import environ
17
18
  from os import getenv
18
19
  from os.path import abspath
@@ -323,7 +324,10 @@ class Settings(BaseSettings):
323
324
  return FRACTAL_RUNNER_WORKING_BASE_DIR_path
324
325
 
325
326
  FRACTAL_RUNNER_BACKEND: Literal[
326
- "local", "local_experimental", "slurm"
327
+ "local",
328
+ "local_experimental",
329
+ "slurm",
330
+ "slurm_ssh",
327
331
  ] = "local"
328
332
  """
329
333
  Select which runner backend to use.
@@ -366,10 +370,126 @@ class Settings(BaseSettings):
366
370
 
367
371
  FRACTAL_SLURM_WORKER_PYTHON: Optional[str] = None
368
372
  """
369
- Path to Python interpreter that will run the jobs on the SLURM nodes. If
370
- not specified, the same interpreter that runs the server is used.
373
+ Absolute path to Python interpreter that will run the jobs on the SLURM
374
+ nodes. If not specified, the same interpreter that runs the server is used.
371
375
  """
372
376
 
377
@validator("FRACTAL_SLURM_WORKER_PYTHON", always=True)
def absolute_FRACTAL_SLURM_WORKER_PYTHON(cls, v):
    """
    Fail if `FRACTAL_SLURM_WORKER_PYTHON` is set to a relative path.

    An unset (`None`) value and an absolute path are both returned as they
    are.
    """
    if v is not None and not Path(v).is_absolute():
        raise FractalConfigurationError(
            f"Non-absolute value for FRACTAL_SLURM_WORKER_PYTHON={v}"
        )
    return v
390
+
391
+ FRACTAL_TASKS_PYTHON_DEFAULT_VERSION: Optional[
392
+ Literal["3.9", "3.10", "3.11", "3.12"]
393
+ ] = None
394
+ """
395
+ Default Python version to be used for task collection. Defaults to the
396
+ current version. Requires the corresponding variable (e.g
397
+ `FRACTAL_TASKS_PYTHON_3_10`) to be set.
398
+ """
399
+
400
+ FRACTAL_TASKS_PYTHON_3_9: Optional[str] = None
401
+ """
402
+ Absolute path to the Python 3.9 interpreter that serves as base for virtual
403
+ environments tasks. Note that this interpreter must have the `venv` module
404
+ installed. If set, this must be an absolute path. If the version specified
405
+ in `FRACTAL_TASKS_PYTHON_DEFAULT_VERSION` is `"3.9"` and this attribute is
406
+ unset, `sys.executable` is used as a default.
407
+ """
408
+
409
+ FRACTAL_TASKS_PYTHON_3_10: Optional[str] = None
410
+ """
411
+ Same as `FRACTAL_TASKS_PYTHON_3_9`, for Python 3.10.
412
+ """
413
+
414
+ FRACTAL_TASKS_PYTHON_3_11: Optional[str] = None
415
+ """
416
+ Same as `FRACTAL_TASKS_PYTHON_3_9`, for Python 3.11.
417
+ """
418
+
419
+ FRACTAL_TASKS_PYTHON_3_12: Optional[str] = None
420
+ """
421
+ Same as `FRACTAL_TASKS_PYTHON_3_9`, for Python 3.12.
422
+ """
423
+
424
@root_validator(pre=True)
def check_tasks_python(cls, values) -> dict:
    """
    Perform multiple checks of the Python-interpreter variables.

    1. Each `FRACTAL_TASKS_PYTHON_X_Y` variable must be an absolute path,
       if set.
    2. If `FRACTAL_TASKS_PYTHON_DEFAULT_VERSION` is set, the corresponding
       `FRACTAL_TASKS_PYTHON_X_Y` variable must be set as well.
    3. If `FRACTAL_TASKS_PYTHON_DEFAULT_VERSION` is unset, use
       `sys.executable` and set the corresponding
       `FRACTAL_TASKS_PYTHON_X_Y` (and unset all others).

    Args:
        values: Raw settings values (this validator runs with `pre=True`).

    Returns:
        The (possibly updated) `values` dictionary.

    Raises:
        FractalConfigurationError: If an interpreter variable is set to a
            relative path, or if a default version is set but its
            interpreter variable is not.
    """
    version_tags = ("3_9", "3_10", "3_11", "3_12")

    # `FRACTAL_TASKS_PYTHON_X_Y` variables can only be absolute paths
    for version in version_tags:
        key = f"FRACTAL_TASKS_PYTHON_{version}"
        value = values.get(key)
        if value is not None and not Path(value).is_absolute():
            raise FractalConfigurationError(
                f"Non-absolute value {key}={value}"
            )

    default_version = values.get("FRACTAL_TASKS_PYTHON_DEFAULT_VERSION")

    if default_version is not None:
        # "production/slurm" branch
        # If a default version is set, then the corresponding interpreter
        # must also be set
        default_version_underscore = default_version.replace(".", "_")
        key = f"FRACTAL_TASKS_PYTHON_{default_version_underscore}"
        value = values.get(key)
        if value is None:
            msg = (
                f"FRACTAL_TASKS_PYTHON_DEFAULT_VERSION={default_version} "
                f"but {key}={value}."
            )
            logging.error(msg)
            raise FractalConfigurationError(msg)

    else:
        # If no default version is set, then only `sys.executable` is made
        # available
        _info = sys.version_info
        current_version = f"{_info.major}_{_info.minor}"
        current_version_dot = f"{_info.major}.{_info.minor}"
        values[
            "FRACTAL_TASKS_PYTHON_DEFAULT_VERSION"
        ] = current_version_dot
        logging.info(
            "Setting FRACTAL_TASKS_PYTHON_DEFAULT_VERSION to "
            f"{current_version_dot}"
        )

        # Unset all existing interpreter variables, apart from the one
        # matching the current Python version
        for _version in version_tags:
            key = f"FRACTAL_TASKS_PYTHON_{_version}"
            if _version == current_version:
                values[key] = sys.executable
                logging.info(f"Setting {key} to {sys.executable}.")
            else:
                value = values.get(key)
                if value is not None:
                    logging.info(
                        f"Setting {key} to None (given: {value}), "
                        "because FRACTAL_TASKS_PYTHON_DEFAULT_VERSION was "
                        "not set."
                    )
                values[key] = None
    return values
492
+
373
493
  FRACTAL_SLURM_POLL_INTERVAL: int = 5
374
494
  """
375
495
  Interval to wait (in seconds) before checking whether unfinished job are
@@ -392,6 +512,25 @@ class Settings(BaseSettings):
392
512
  `JobExecutionError`.
393
513
  """
394
514
 
515
+ FRACTAL_SLURM_SSH_HOST: Optional[str] = None
516
+ """
517
+ SSH-reachable host where a SLURM client is available.
518
+ """
519
+ FRACTAL_SLURM_SSH_USER: Optional[str] = None
520
+ """
521
+ User on `FRACTAL_SLURM_SSH_HOST`.
522
+ """
523
+ FRACTAL_SLURM_SSH_PRIVATE_KEY_PATH: Optional[str] = None
524
+ """
525
+ Private key for connecting to `FRACTAL_SLURM_SSH_HOST` as
526
+ `FRACTAL_SLURM_SSH_USER`.
527
+ """
528
+ # FIXME SSH: Split this into two folders (for tasks and for jobs)
529
+ FRACTAL_SLURM_SSH_WORKING_BASE_DIR: Optional[str] = None
530
+ """
531
+ Remote folder on `FRACTAL_SLURM_SSH_HOST`.
532
+ """
533
+
395
534
  FRACTAL_API_SUBMIT_RATE_LIMIT: int = 2
396
535
  """
397
536
  Interval to wait (in seconds) to be allowed to call again
@@ -480,6 +619,48 @@ class Settings(BaseSettings):
480
619
  raise FractalConfigurationError(
481
620
  f"{info} but `squeue` command not found."
482
621
  )
622
+ elif self.FRACTAL_RUNNER_BACKEND == "slurm_ssh":
623
+ if self.FRACTAL_SLURM_WORKER_PYTHON is None:
624
+ raise FractalConfigurationError(
625
+ f"Must set FRACTAL_SLURM_WORKER_PYTHON when {info}"
626
+ )
627
+ if self.FRACTAL_SLURM_SSH_USER is None:
628
+ raise FractalConfigurationError(
629
+ f"Must set FRACTAL_SLURM_SSH_USER when {info}"
630
+ )
631
+ if self.FRACTAL_SLURM_SSH_HOST is None:
632
+ raise FractalConfigurationError(
633
+ f"Must set FRACTAL_SLURM_SSH_HOST when {info}"
634
+ )
635
+ if self.FRACTAL_SLURM_SSH_PRIVATE_KEY_PATH is None:
636
+ raise FractalConfigurationError(
637
+ f"Must set FRACTAL_SLURM_SSH_PRIVATE_KEY_PATH when {info}"
638
+ )
639
+ if self.FRACTAL_SLURM_SSH_WORKING_BASE_DIR is None:
640
+ raise FractalConfigurationError(
641
+ f"Must set FRACTAL_SLURM_SSH_WORKING_BASE_DIR when {info}"
642
+ )
643
+
644
+ from fractal_server.app.runner.executors.slurm._slurm_config import ( # noqa: E501
645
+ load_slurm_config_file,
646
+ )
647
+
648
+ if not self.FRACTAL_SLURM_CONFIG_FILE:
649
+ raise FractalConfigurationError(
650
+ f"Must set FRACTAL_SLURM_CONFIG_FILE when {info}"
651
+ )
652
+ else:
653
+ if not self.FRACTAL_SLURM_CONFIG_FILE.exists():
654
+ raise FractalConfigurationError(
655
+ f"{info} but FRACTAL_SLURM_CONFIG_FILE="
656
+ f"{self.FRACTAL_SLURM_CONFIG_FILE} not found."
657
+ )
658
+
659
+ load_slurm_config_file(self.FRACTAL_SLURM_CONFIG_FILE)
660
+ if not shutil.which("ssh"):
661
+ raise FractalConfigurationError(
662
+ f"{info} but `ssh` command not found."
663
+ )
483
664
  else: # i.e. self.FRACTAL_RUNNER_BACKEND == "local"
484
665
  if self.FRACTAL_LOCAL_CONFIG_FILE:
485
666
  if not self.FRACTAL_LOCAL_CONFIG_FILE.exists():
fractal_server/main.py CHANGED
@@ -20,6 +20,7 @@ from contextlib import asynccontextmanager
20
20
 
21
21
  from fastapi import FastAPI
22
22
 
23
+ from .app.routes.aux._runner import _backend_supports_shutdown # FIXME: change
23
24
  from .app.runner.shutdown import cleanup_after_shutdown
24
25
  from .app.security import _create_first_user
25
26
  from .config import get_settings
@@ -97,17 +98,38 @@ async def lifespan(app: FastAPI):
97
98
  is_superuser=True,
98
99
  is_verified=True,
99
100
  )
101
+
102
+ if settings.FRACTAL_RUNNER_BACKEND == "slurm_ssh":
103
+ from fractal_server.ssh._fabric import get_ssh_connection
104
+
105
+ app.state.connection = get_ssh_connection()
106
+ logger.info(
107
+ f"Created SSH connection "
108
+ f"({app.state.connection.is_connected=})."
109
+ )
110
+ else:
111
+ app.state.connection = None
112
+
100
113
  config_uvicorn_loggers()
101
114
  logger.info("End application startup")
102
115
  reset_logger_handlers(logger)
103
116
  yield
104
117
  logger = get_logger("fractal_server.lifespan")
105
118
  logger.info("Start application shutdown")
119
+
120
+ if settings.FRACTAL_RUNNER_BACKEND == "slurm_ssh":
121
+ logger.info(
122
+ f"Closing SSH connection "
123
+ f"(current: {app.state.connection.is_connected=})."
124
+ )
125
+
126
+ app.state.connection.close()
127
+
106
128
  logger.info(
107
129
  f"Current worker with pid {os.getpid()} is shutting down. "
108
130
  f"Current jobs: {app.state.jobsV1=}, {app.state.jobsV2=}"
109
131
  )
110
- if settings.FRACTAL_RUNNER_BACKEND == "slurm":
132
+ if _backend_supports_shutdown(settings.FRACTAL_RUNNER_BACKEND):
111
133
  try:
112
134
  await cleanup_after_shutdown(
113
135
  jobsV1=app.state.jobsV1,
@@ -120,6 +142,8 @@ async def lifespan(app: FastAPI):
120
142
  "some of running jobs are not shutdown properly. "
121
143
  f"Original error: {e}"
122
144
  )
145
+ else:
146
+ logger.info("Shutdown not available for this backend runner.")
123
147
 
124
148
  logger.info("End application shutdown")
125
149
  reset_logger_handlers(logger)
@@ -0,0 +1,4 @@
1
+ """
2
+ The `fractal_server.ssh` subpackage is meant as a layer in front of some SSH
3
+ library (e.g. `fabric` or `asyncssh`).
4
+ """
@@ -0,0 +1,190 @@
1
+ import time
2
+ from typing import Optional
3
+
4
+ from fabric import Connection
5
+ from invoke import UnexpectedExit
6
+ from paramiko.ssh_exception import NoValidConnectionsError
7
+
8
+ from ..logger import get_logger
9
+ from ..logger import set_logger
10
+ from fractal_server.config import get_settings
11
+ from fractal_server.syringe import Inject
12
+
13
+ logger = set_logger(__name__)
14
+
15
+ MAX_ATTEMPTS = 5
16
+
17
+
18
def get_ssh_connection(
    *,
    host: Optional[str] = None,
    user: Optional[str] = None,
    key_filename: Optional[str] = None,
) -> Connection:
    """
    Build a `fabric.Connection` object, taking from the fractal-server
    settings every argument that is not explicitly provided.

    Args:
        host: Host to connect to; if `None`, use
            `settings.FRACTAL_SLURM_SSH_HOST`.
        user: User on `host`; if `None`, use
            `settings.FRACTAL_SLURM_SSH_USER`.
        key_filename: Passed to `Connection` as the `key_filename` item of
            `connect_kwargs`; if `None`, use
            `settings.FRACTAL_SLURM_SSH_PRIVATE_KEY_PATH`.

    Returns:
        Fabric connection object
    """
    settings = Inject(get_settings)
    ssh_host = settings.FRACTAL_SLURM_SSH_HOST if host is None else host
    ssh_user = settings.FRACTAL_SLURM_SSH_USER if user is None else user
    ssh_key = (
        settings.FRACTAL_SLURM_SSH_PRIVATE_KEY_PATH
        if key_filename is None
        else key_filename
    )

    connection = Connection(
        host=ssh_host,
        user=ssh_user,
        connect_kwargs={"key_filename": ssh_key},
    )
    logger.debug(f"Now created {connection=}.")
    return connection
51
+
52
+
53
def check_connection(connection: Connection) -> None:
    """
    Open the SSH connection and handle exceptions.

    This function can be called from within other functions that use
    `connection`, so that we can provide a meaningful error in case the
    SSH connection cannot be opened. Nothing is done if the connection is
    already open.

    Args:
        connection: Fabric connection object

    Raises:
        RuntimeError: If the connection cannot be opened; the original
            exception is attached as its `__cause__`.
    """
    if connection.is_connected:
        return
    try:
        connection.open()
    except Exception as e:
        # Chain the original exception, so that its traceback is not lost.
        raise RuntimeError(
            f"Cannot open SSH connection (original error: '{str(e)}')."
        ) from e
71
+
72
+
73
def run_command_over_ssh(
    *,
    cmd: str,
    connection: Connection,
    max_attempts: int = MAX_ATTEMPTS,
    base_interval: float = 3.0,
) -> str:
    """
    Run a command within an open SSH connection.

    A `NoValidConnectionsError` is retried, sleeping
    `base_interval**ind_attempt` seconds after failed attempt number
    `ind_attempt`. Any other failure is raised immediately.

    Args:
        cmd: Command to be run
        connection: Fabric connection object
        max_attempts: Maximum number of attempts, upon connection errors.
        base_interval: Base of the exponentially-growing sleep time (in
            seconds) between two attempts.

    Returns:
        Standard output of the command, if successful.

    Raises:
        ValueError: If the command fails with `UnexpectedExit`, or if the
            last attempt fails with a connection error. The original
            exception is attached as `__cause__`.
    """
    t_0 = time.perf_counter()
    ind_attempt = 0
    last_connection_error = None
    # NOTE: after a connection error on attempt number `max_attempts`, the
    # loop is left through the `break` statement below.
    while ind_attempt <= max_attempts:
        ind_attempt += 1
        prefix = f"[attempt {ind_attempt}/{max_attempts}]"
        logger.info(f"{prefix} START running '{cmd}' over SSH.")
        try:
            # Case 1: Command runs successfully
            res = connection.run(cmd, hide=True)
            t_1 = time.perf_counter()
            logger.info(
                f"{prefix} END running '{cmd}' over SSH, "
                f"elapsed {t_1-t_0:.3f}"
            )
            logger.debug(f"STDOUT: {res.stdout}")
            logger.debug(f"STDERR: {res.stderr}")
            return res.stdout
        except NoValidConnectionsError as e:
            # Case 2: Command fails with a connection error
            last_connection_error = e
            logger.warning(
                f"{prefix} Running command `{cmd}` over SSH failed.\n"
                f"Original NoValidConnectionError:\n{str(e)}.\n"
                f"{e.errors=}\n"
            )
            if ind_attempt < max_attempts:
                sleeptime = (
                    base_interval**ind_attempt
                )  # FIXME SSH: add jitter?
                logger.warning(
                    f"{prefix} Now sleep {sleeptime:.3f} seconds and continue."
                )
                time.sleep(sleeptime)
                continue
            else:
                logger.error(f"{prefix} Reached last attempt")
                break
        except UnexpectedExit as e:
            # Case 3: Command fails with an actual error
            error_msg = (
                f"{prefix} Running command `{cmd}` over SSH failed.\n"
                f"Original error:\n{str(e)}."
            )
            logger.error(error_msg)
            raise ValueError(error_msg) from e
        except Exception as e:
            # Case 4: Any other error is logged and re-raised as it is
            logger.error(
                f"Running command `{cmd}` over SSH failed.\n"
                f"Original Error:\n{str(e)}."
            )
            raise

    # Keep the last connection error (if any) as the cause of the failure
    raise ValueError(
        f"Reached last attempt ({max_attempts=}) for running '{cmd}' over SSH"
    ) from last_connection_error
144
+
145
+
146
def put_over_ssh(
    *,
    local: str,
    remote: str,
    connection: Connection,
    logger_name: Optional[str] = None,
) -> None:
    """
    Copy a local file to the remote host, through the SSH connection.

    Any failure is logged (on the logger named `logger_name`) and then
    re-raised.

    Args:
        local: Local path to file
        remote: Target path on remote host
        connection: Fabric connection object
        logger_name: Name of the logger used for reporting a failure

    """
    try:
        connection.put(local=local, remote=remote)
    except Exception as e:
        error_logger = get_logger(logger_name=logger_name)
        error_msg = (
            f"Transferring {local=} to {remote=} over SSH failed.\n"
            f"Original Error:\n{str(e)}."
        )
        error_logger.error(error_msg)
        raise e
172
+
173
+
174
def _mkdir_over_ssh(
    *, folder: str, connection: Connection, parents: bool = True
) -> None:
    """
    Create a folder remotely via SSH.

    Args:
        folder: Path of the remote folder to be created. It is shell-quoted
            before being included in the command, so that it is taken
            literally (e.g. it may contain spaces, and it is not subject
            to shell expansion).
        connection: Fabric connection object
        parents: If `True`, run `mkdir -p` rather than `mkdir`.
    """
    from shlex import quote

    # FIXME SSH: try using `mkdir` method of `paramiko.SFTPClient`
    mkdir_cmd = "mkdir -p" if parents else "mkdir"
    # NOTE: `quote` leaves unchanged a string made only of "safe" characters
    # (letters, digits and `@%+=:,./-_`), so that the command is the same
    # as the unquoted one for ordinary paths.
    cmd = f"{mkdir_cmd} {quote(folder)}"
    run_command_over_ssh(cmd=cmd, connection=connection)