lsst-ctrl-bps-parsl 27.2024.3100.tar.gz → 29.2025.4900.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {lsst_ctrl_bps_parsl-27.2024.3100/python/lsst_ctrl_bps_parsl.egg-info → lsst_ctrl_bps_parsl-29.2025.4900}/PKG-INFO +7 -5
- {lsst_ctrl_bps_parsl-27.2024.3100 → lsst_ctrl_bps_parsl-29.2025.4900}/pyproject.toml +17 -4
- {lsst_ctrl_bps_parsl-27.2024.3100 → lsst_ctrl_bps_parsl-29.2025.4900}/python/lsst/ctrl/bps/parsl/job.py +5 -4
- {lsst_ctrl_bps_parsl-27.2024.3100 → lsst_ctrl_bps_parsl-29.2025.4900}/python/lsst/ctrl/bps/parsl/site.py +3 -2
- {lsst_ctrl_bps_parsl-27.2024.3100 → lsst_ctrl_bps_parsl-29.2025.4900}/python/lsst/ctrl/bps/parsl/sites/__init__.py +1 -0
- lsst_ctrl_bps_parsl-29.2025.4900/python/lsst/ctrl/bps/parsl/sites/ccin2p3.py +363 -0
- {lsst_ctrl_bps_parsl-27.2024.3100 → lsst_ctrl_bps_parsl-29.2025.4900}/python/lsst/ctrl/bps/parsl/sites/local.py +1 -1
- {lsst_ctrl_bps_parsl-27.2024.3100 → lsst_ctrl_bps_parsl-29.2025.4900}/python/lsst/ctrl/bps/parsl/sites/princeton.py +4 -15
- {lsst_ctrl_bps_parsl-27.2024.3100 → lsst_ctrl_bps_parsl-29.2025.4900}/python/lsst/ctrl/bps/parsl/sites/slurm.py +3 -0
- lsst_ctrl_bps_parsl-29.2025.4900/python/lsst/ctrl/bps/parsl/sites/torque.py +273 -0
- {lsst_ctrl_bps_parsl-27.2024.3100 → lsst_ctrl_bps_parsl-29.2025.4900}/python/lsst/ctrl/bps/parsl/sites/work_queue.py +1 -1
- lsst_ctrl_bps_parsl-29.2025.4900/python/lsst/ctrl/bps/parsl/version.py +2 -0
- {lsst_ctrl_bps_parsl-27.2024.3100 → lsst_ctrl_bps_parsl-29.2025.4900}/python/lsst/ctrl/bps/parsl/workflow.py +2 -1
- {lsst_ctrl_bps_parsl-27.2024.3100 → lsst_ctrl_bps_parsl-29.2025.4900/python/lsst_ctrl_bps_parsl.egg-info}/PKG-INFO +7 -5
- {lsst_ctrl_bps_parsl-27.2024.3100 → lsst_ctrl_bps_parsl-29.2025.4900}/python/lsst_ctrl_bps_parsl.egg-info/SOURCES.txt +1 -0
- {lsst_ctrl_bps_parsl-27.2024.3100 → lsst_ctrl_bps_parsl-29.2025.4900}/python/lsst_ctrl_bps_parsl.egg-info/requires.txt +1 -1
- lsst_ctrl_bps_parsl-27.2024.3100/python/lsst/ctrl/bps/parsl/sites/ccin2p3.py +0 -245
- lsst_ctrl_bps_parsl-27.2024.3100/python/lsst/ctrl/bps/parsl/version.py +0 -2
- {lsst_ctrl_bps_parsl-27.2024.3100 → lsst_ctrl_bps_parsl-29.2025.4900}/COPYRIGHT +0 -0
- {lsst_ctrl_bps_parsl-27.2024.3100 → lsst_ctrl_bps_parsl-29.2025.4900}/LICENSE +0 -0
- {lsst_ctrl_bps_parsl-27.2024.3100 → lsst_ctrl_bps_parsl-29.2025.4900}/README.md +0 -0
- {lsst_ctrl_bps_parsl-27.2024.3100 → lsst_ctrl_bps_parsl-29.2025.4900}/bsd_license.txt +0 -0
- {lsst_ctrl_bps_parsl-27.2024.3100 → lsst_ctrl_bps_parsl-29.2025.4900}/gpl-v3.0.txt +0 -0
- {lsst_ctrl_bps_parsl-27.2024.3100 → lsst_ctrl_bps_parsl-29.2025.4900}/python/lsst/ctrl/bps/parsl/__init__.py +0 -0
- {lsst_ctrl_bps_parsl-27.2024.3100 → lsst_ctrl_bps_parsl-29.2025.4900}/python/lsst/ctrl/bps/parsl/configuration.py +1 -1
- {lsst_ctrl_bps_parsl-27.2024.3100 → lsst_ctrl_bps_parsl-29.2025.4900}/python/lsst/ctrl/bps/parsl/environment.py +0 -0
- {lsst_ctrl_bps_parsl-27.2024.3100 → lsst_ctrl_bps_parsl-29.2025.4900}/python/lsst/ctrl/bps/parsl/service.py +0 -0
- {lsst_ctrl_bps_parsl-27.2024.3100 → lsst_ctrl_bps_parsl-29.2025.4900}/python/lsst/ctrl/bps/parsl/sites/nersc.py +0 -0
- {lsst_ctrl_bps_parsl-27.2024.3100 → lsst_ctrl_bps_parsl-29.2025.4900}/python/lsst/ctrl/bps/parsl/sites/slac.py +0 -0
- {lsst_ctrl_bps_parsl-27.2024.3100 → lsst_ctrl_bps_parsl-29.2025.4900}/python/lsst_ctrl_bps_parsl.egg-info/dependency_links.txt +0 -0
- {lsst_ctrl_bps_parsl-27.2024.3100 → lsst_ctrl_bps_parsl-29.2025.4900}/python/lsst_ctrl_bps_parsl.egg-info/top_level.txt +0 -0
- {lsst_ctrl_bps_parsl-27.2024.3100 → lsst_ctrl_bps_parsl-29.2025.4900}/python/lsst_ctrl_bps_parsl.egg-info/zip-safe +0 -0
- {lsst_ctrl_bps_parsl-27.2024.3100 → lsst_ctrl_bps_parsl-29.2025.4900}/setup.cfg +0 -0
- {lsst_ctrl_bps_parsl-27.2024.3100 → lsst_ctrl_bps_parsl-29.2025.4900}/tests/test_config.py +0 -0
- {lsst_ctrl_bps_parsl-27.2024.3100 → lsst_ctrl_bps_parsl-29.2025.4900}/tests/test_import.py +0 -0
- {lsst_ctrl_bps_parsl-27.2024.3100 → lsst_ctrl_bps_parsl-29.2025.4900}/tests/test_job.py +0 -0
{lsst_ctrl_bps_parsl-27.2024.3100/python/lsst_ctrl_bps_parsl.egg-info → lsst_ctrl_bps_parsl-29.2025.4900}/PKG-INFO:

```diff
@@ -1,17 +1,18 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: lsst-ctrl-bps-parsl
-Version:
+Version: 29.2025.4900
 Summary: Parsl-based plugin for lsst-ctrl-bps.
 Author-email: Rubin Observatory Data Management <dm-admin@lists.lsst.org>
-License: BSD
+License-Expression: BSD-3-Clause OR GPL-3.0-or-later
 Project-URL: Homepage, https://github.com/lsst/ctrl_bps_parsl
 Keywords: lsst
 Classifier: Intended Audience :: Science/Research
-Classifier: License :: OSI Approved :: BSD License
 Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Programming Language :: Python :: 3.14
 Classifier: Topic :: Scientific/Engineering :: Astronomy
 Requires-Python: >=3.11.0
 Description-Content-Type: text/markdown
@@ -20,10 +21,11 @@ License-File: LICENSE
 License-File: bsd_license.txt
 License-File: gpl-v3.0.txt
 Requires-Dist: lsst-ctrl-bps
-Requires-Dist: parsl
+Requires-Dist: parsl>=2024.03.04
 Provides-Extra: test
 Requires-Dist: pytest>=3.2; extra == "test"
 Requires-Dist: pytest-openfiles>=0.5.0; extra == "test"
+Dynamic: license-file
 
 # ctrl_bps_parsl
 
```
pyproject.toml:

```diff
@@ -6,24 +6,26 @@ build-backend = "setuptools.build_meta"
 name = "lsst-ctrl-bps-parsl"
 requires-python = ">=3.11.0"
 description = "Parsl-based plugin for lsst-ctrl-bps."
-license =
+license = "BSD-3-Clause OR GPL-3.0-or-later"
+license-files = ["COPYRIGHT", "LICENSE", "bsd_license.txt", "gpl-v3.0.txt"]
 readme = "README.md"
 authors = [
     {name="Rubin Observatory Data Management", email="dm-admin@lists.lsst.org"},
 ]
 classifiers = [
     "Intended Audience :: Science/Research",
-    "License :: OSI Approved :: BSD License",
     "Operating System :: OS Independent",
     "Programming Language :: Python :: 3",
     "Programming Language :: Python :: 3.11",
     "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: 3.13",
+    "Programming Language :: Python :: 3.14",
     "Topic :: Scientific/Engineering :: Astronomy",
 ]
 keywords = ["lsst"]
 dependencies = [
     "lsst-ctrl-bps",
-    "parsl",
+    "parsl >= 2024.03.04",
 ]
 dynamic = ["version"]
 
@@ -41,7 +43,6 @@ where = ["python"]
 
 [tool.setuptools]
 zip-safe = true
-license-files = ["COPYRIGHT", "LICENSE", "bsd_license.txt", "gpl-v3.0.txt"]
 
 [tool.setuptools.package-data]
 "lsst.ctrl.bps.parsl" = ["etc/*.yaml"]
@@ -94,6 +95,7 @@ target-version = ["py311"]
 [tool.isort]
 profile = "black"
 line_length = 110
+known_first_party = ["lsst"]
 
 [tool.lsst_versions]
 write_to = "python/lsst/ctrl/bps/parsl/version.py"
@@ -144,17 +146,28 @@ select = [
     "D",  # pydocstyle
     "UP",  # pyupgrade
     "C4",
+    "I",  # isort
+    "RUF022",  # sort __all__
+    "B",  # bugbear
 ]
 extend-select = [
     "RUF100",  # Warn about unused noqa
 ]
 
+[tool.ruff.lint.isort]
+known-first-party = ["lsst"]
+known-third-party = ["parsl"]
+
 [tool.ruff.lint.pycodestyle]
 max-doc-length = 79
 
 [tool.ruff.lint.pydocstyle]
 convention = "numpy"
 
+[tool.ruff.format]
+docstring-code-format = true
+docstring-code-line-length = 79
+
 [tool.numpydoc_validation]
 checks = [
     "all",  # All except the rules listed below.
```
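The packaging changes do three things at once: they switch to a PEP 639 SPDX license expression (`license = "BSD-3-Clause OR GPL-3.0-or-later"` plus an explicit `license-files` list, replacing the deprecated `License ::` classifier and the old `[tool.setuptools]` key), they add Python 3.13/3.14 classifiers, and they put a floor on the `parsl` dependency. The new `[tool.ruff.lint.isort]` section marking `lsst` as first-party is also what drives the import reshuffling in the `job.py`, `site.py`, and `workflow.py` hunks below. A minimal sketch, using only the standard library, of how to confirm the published metadata in an environment where the new version is installed; the expected values are the ones shown in the diff above:

```python
# Minimal sketch: read the installed distribution's metadata with the
# standard library and check the fields changed in this release.
from importlib.metadata import metadata, requires

meta = metadata("lsst-ctrl-bps-parsl")
print(meta["Metadata-Version"])    # 2.4 for wheels built with this pyproject
print(meta["License-Expression"])  # BSD-3-Clause OR GPL-3.0-or-later

# The dependency floor shows up in Requires-Dist.
for req in requires("lsst-ctrl-bps-parsl") or []:
    if req.startswith("parsl"):
        print(req)                 # parsl>=2024.03.04
```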
python/lsst/ctrl/bps/parsl/job.py:

```diff
@@ -34,13 +34,14 @@ from functools import partial
 from textwrap import dedent
 from typing import Any
 
-from lsst.ctrl.bps import BpsConfig, GenericWorkflow, GenericWorkflowJob
 from parsl.app.bash import BashApp
 from parsl.app.futures import Future
 
+from lsst.ctrl.bps import BpsConfig, GenericWorkflow, GenericWorkflowJob
+
 from .configuration import get_bps_config_value
 
-__all__ = ("
+__all__ = ("ParslJob", "get_file_paths")
 
 _env_regex = re.compile(r"<ENV:(\S+)>")  # Regex for replacing <ENV:WHATEVER> in BPS job command-lines
 _file_regex = re.compile(r"<FILE:(\S+)>")  # Regex for replacing <FILE:WHATEVER> in BPS job command-lines
@@ -282,12 +283,12 @@ class ParslJob:
         command = self.evaluate_command_line(command)
         if command_prefix:
             command = command_prefix + "\n" + command
-        resources = self.get_resources() if add_resources else
+        resources = self.get_resources() if add_resources else {}
 
         # Add a layer of indirection to which we can add a useful name.
         # This name is used by parsl for tracking workflow status.
         func = partial(run_command)
-
+        func.__name__ = self.generic.label  # type: ignore
 
         self.future = app(func)(
             command,
```
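The second `job.py` hunk is worth a note: `functools.partial` objects carry no `__name__`, and Parsl derives its task label (used in monitoring and status displays) from the wrapped callable's name, so the code assigns the BPS job label to the partial. A standalone sketch of the mechanism; the `run_command` stub and the `"calibrate"` label here are illustrative stand-ins, not the real ones from `job.py`:

```python
# Sketch of why the partial is given a name: a bare functools.partial has no
# __name__, and Parsl uses the callable's name as the task label.
from functools import partial


def run_command(command: str) -> str:
    # Stand-in for the real run_command in job.py.
    return command


func = partial(run_command)
print(hasattr(func, "__name__"))  # False: partials carry no name by default

func.__name__ = "calibrate"  # e.g. the GenericWorkflowJob label, as in the hunk
print(func.__name__)         # "calibrate", what Parsl will display for this task
```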
python/lsst/ctrl/bps/parsl/site.py:

```diff
@@ -30,12 +30,13 @@ from types import ModuleType
 from typing import TYPE_CHECKING
 
 import parsl.config
-from lsst.ctrl.bps import BpsConfig
-from lsst.utils import doImport
 from parsl.addresses import address_by_hostname
 from parsl.executors.base import ParslExecutor
 from parsl.monitoring import MonitoringHub
 
+from lsst.ctrl.bps import BpsConfig
+from lsst.utils import doImport
+
 from .configuration import get_bps_config_value, get_workflow_name
 from .environment import export_environment
 
```
python/lsst/ctrl/bps/parsl/sites/ccin2p3.py (new file, +363 lines):

```python
import copy
import platform
from typing import TYPE_CHECKING, Any

import parsl.config
from parsl.executors import HighThroughputExecutor
from parsl.executors.base import ParslExecutor
from parsl.providers import SlurmProvider

from ..configuration import get_bps_config_value
from ..site import SiteConfig

if TYPE_CHECKING:
    from .job import ParslJob

__all__ = ("Ccin2p3",)

Kwargs = dict[str, Any]


class Ccin2p3(SiteConfig):
    """Configuration for executing Parsl jobs in the CC-IN2P3 Slurm batch
    farm.

    This class provides four job slot sizes, each with its specific
    requirements, in particular in terms of memory. Those slot sizes are named
    "small", "medium", "large" and "xlarge".

    Sensible default values for those requirements are provided for each
    job slot, but you can overwrite those defaults either in the
    BPS submission file or in a site configuration file that you
    include in your BPS submission file.

    If you don't need to modify the default requirements for the job slot
    sizes, use the site specification below in your BPS configuration
    file:

    .. code-block:: yaml

        wmsServiceClass: lsst.ctrl.bps.parsl.ParslService
        computeSite: ccin2p3

        site:
          ccin2p3:
            class: lsst.ctrl.bps.parsl.sites.ccin2p3.Ccin2p3

    If you do need to modify those defaults, you can overwrite them for
    all job slots or for specific job slots. Requirements specified
    for a job slot take priority over those specified for all job slots
    at the level of entry '.site.ccin2p3:'.

    This is an example of how to overwrite selected requirements in your BPS
    submission file:

    .. code-block:: yaml

        wmsServiceClass: lsst.ctrl.bps.parsl.ParslService
        computeSite: ccin2p3

        site:
          ccin2p3:
            class: lsst.ctrl.bps.parsl.sites.ccin2p3.Ccin2p3
            walltime: "72:00:00"
            scheduler_options:
              - "--licenses=sps"
              - "--qos=normal"
            small:
              memory: 6
              partition: "flash"
            medium:
              memory: 10
              partition: "lsst,htc"
            large:
              memory: 80
            xlarge:
              memory: 180
              partition: "lsst"
              scheduler_options:
                - "--constraint=el7"
                - "--licenses=my_product"
                - "--reservation=my_reservation"

    At the level of entry 'site.ccin2p3:' in the BPS submission file, the
    following configuration parameters are accepted, which apply to all slot
    sizes:

    - `partition` (`str`): name of one or more configured partitions. If
      more than one, separate them with a comma (',').
      (Default: "lsst,htc")
    - `walltime` (`str`): walltime to require for the job (Default: "72:00:00")
    - `scheduler_options` (`list` [`str`]): scheduler options to send to Slurm
      for scheduling purposes.
      (Default: "--licenses=sps")

    In addition, as shown in the previous example, for each job slot (i.e.
    "small", "medium", etc.) you can specify the requirements above as well as
    the following:

    - `max_blocks` (`int`): maximum number of Slurm jobs that your workflow
      can simultaneously use.
    - `memory` (`int`): required amount of memory for each job, in gigabytes.
      (Defaults: 4 for "small", 10 for "medium", 50 for "large" and
      150 for "xlarge").

    Parameters
    ----------
    *args : optional
        Arguments to initialize the super-class.
    **kwargs : optional
        Keyword arguments to initialize the super-class.

    Returns
    -------
    Ccin2p3 : `SiteConfig`
        Concrete instance of a `SiteConfig` specific for the CC-IN2P3 Slurm
        farm.
    """

    DEFAULT_ACCOUNT: str = "lsst"
    DEFAULT_WALLTIME: str = "72:00:00"
    DEFAULT_SCHEDULER_OPTIONS: list[str] = [
        "--licenses=sps",
    ]

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._account = get_bps_config_value(self.site, ".account", str, self.DEFAULT_ACCOUNT)
        self._scheduler_options = get_bps_config_value(
            self.site, ".scheduler_options", list, self.DEFAULT_SCHEDULER_OPTIONS
        )
        self._slot_size = {
            "small": {
                "memory": get_bps_config_value(self.site, ".small.memory", int, 4),
                "walltime": self._get_walltime_for_slot("small"),
                "partition": self._get_partition_for_slot("small"),
                "max_blocks": get_bps_config_value(self.site, ".small.max_blocks", int, 3_000),
                "scheduler_options": get_bps_config_value(self.site, ".small.scheduler_options", list, []),
            },
            "medium": {
                "memory": get_bps_config_value(self.site, ".medium.memory", int, 10),
                "walltime": self._get_walltime_for_slot("medium"),
                "partition": self._get_partition_for_slot("medium"),
                "max_blocks": get_bps_config_value(self.site, ".medium.max_blocks", int, 1_000),
                "scheduler_options": get_bps_config_value(self.site, ".medium.scheduler_options", list, []),
            },
            "large": {
                "memory": get_bps_config_value(self.site, ".large.memory", int, 50),
                "walltime": self._get_walltime_for_slot("large"),
                "partition": self._get_partition_for_slot("large"),
                "max_blocks": get_bps_config_value(self.site, ".large.max_blocks", int, 100),
                "scheduler_options": get_bps_config_value(self.site, ".large.scheduler_options", list, []),
            },
            "xlarge": {
                "memory": get_bps_config_value(self.site, ".xlarge.memory", int, 150),
                "walltime": self._get_walltime_for_slot("xlarge"),
                "partition": self._get_partition_for_slot("xlarge"),
                "max_blocks": get_bps_config_value(self.site, ".xlarge.max_blocks", int, 10),
                "scheduler_options": get_bps_config_value(self.site, ".xlarge.scheduler_options", list, []),
            },
        }

    def _get_partition_for_slot(self, slot: str) -> str:
        """Return the Slurm partition Parsl must use to submit jobs for the
        job slot `slot`. Values of `slot` can be "small", "medium", "large"
        or "xlarge".
        """
        # The target Slurm partition must be selected according to the type
        # of the job slot but also according to the CPU architecture of the
        # compute node.
        #
        # Parsl requires the CPU architecture of its orchestrator to
        # be identical to the architecture of its executors. Therefore,
        # we need to ensure that Slurm schedules our Parsl executors on
        # compute nodes with the same architecture as the host where this
        # orchestrator runs.

        # Default target Slurm partitions per CPU architecture.
        default_partition = {
            "aarch64": {
                "small": "htc_arm",
                "medium": "htc_arm",
                "large": "htc_arm",
                "xlarge": "htc_arm",
            },
            "x86_64": {
                "small": "lsst,htc",
                "medium": "lsst",
                "large": "lsst",
                "xlarge": "lsst",
            },
        }
        architecture = platform.machine()
        if architecture not in default_partition:
            raise ValueError(f"architecture {architecture} is not supported")

        # If a partition was specified in the workflow description file
        # specifically for this job slot, use that partition. For instance:
        #
        #   site:
        #     ccin2p3:
        #       class: lsst.ctrl.bps.parsl.sites.ccin2p3.Ccin2p3
        #       small:
        #         partition: htc
        slot_partition = get_bps_config_value(self.site, f".{slot}.partition", str, "")
        if slot_partition != "":
            return slot_partition

        # If a partition was specified in the workflow description file at
        # the site level, use that partition. For instance:
        #
        #   site:
        #     ccin2p3:
        #       class: lsst.ctrl.bps.parsl.sites.ccin2p3.Ccin2p3
        #       partition: htc
        #
        # Otherwise, use the default for this slot on this architecture.
        return get_bps_config_value(self.site, ".partition", str, default_partition[architecture][slot])

    def _get_walltime_for_slot(self, slot: str) -> str:
        """Return the value for walltime Parsl must use to submit jobs for the
        job slot `slot`. Values of `slot` can be "small", "medium", "large"
        or "xlarge".
        """
        # If a specific walltime value was specified for this job slot in the
        # configuration, use that value. For instance:
        #
        #   site:
        #     ccin2p3:
        #       class: lsst.ctrl.bps.parsl.sites.ccin2p3.Ccin2p3
        #       small:
        #         walltime: "3:00:00"
        slot_walltime = get_bps_config_value(self.site, f".{slot}.walltime", str, "")
        if slot_walltime != "":
            return slot_walltime

        # If a walltime value was specified for the site, use that value.
        # Otherwise, use the default walltime. For instance:
        #
        #   site:
        #     ccin2p3:
        #       class: lsst.ctrl.bps.parsl.sites.ccin2p3.Ccin2p3
        #       walltime: "3:00:00"
        return get_bps_config_value(self.site, ".walltime", str, self.DEFAULT_WALLTIME)

    def get_executors(self) -> list[ParslExecutor]:
        """Get a list of Parsl executors that can be used for processing a
        workflow.

        Each executor must have a unique ``label``.
        """
        executors: list[ParslExecutor] = []
        for label, slot in self._slot_size.items():
            # Compute the scheduler options for this job slot. Options
            # specified at the slot level in the configuration file
            # overwrite those specified at the site level.
            scheduler_options = copy.deepcopy(self._scheduler_options)
            if slot_scheduler_options := slot.get("scheduler_options", []):
                scheduler_options = copy.deepcopy(slot_scheduler_options)

            options = f"#SBATCH {' '.join(opt for opt in scheduler_options)}" if scheduler_options else ""

            executor = HighThroughputExecutor(
                label,
                provider=SlurmProvider(
                    # Slurm partition to request blocks from.
                    partition=slot["partition"],
                    # Slurm account to which to charge resources used by the
                    # job.
                    account=self._account,
                    # Nodes to provision per block (1 block = 1 CPU core).
                    nodes_per_block=1,
                    # Number of CPU cores to provision per node.
                    cores_per_node=1,
                    # Memory per node (GB) for each Slurm job.
                    mem_per_node=slot["memory"],
                    # Initial number of blocks.
                    init_blocks=0,
                    # Minimum number of blocks to maintain.
                    min_blocks=0,
                    # Maximum number of blocks to maintain.
                    max_blocks=slot["max_blocks"],
                    # Time limit for each Slurm job.
                    walltime=slot["walltime"],
                    # '#SBATCH' directives to prepend to the Slurm submission
                    # script.
                    scheduler_options=options,
                    # Set the number of file descriptors and processes to
                    # the maximum allowed.
                    worker_init="ulimit -n hard && ulimit -u hard",
                    # Do not request exclusive nodes; they may be shared with
                    # other running jobs.
                    exclusive=False,
                ),
                # Address to connect to the main Parsl process.
                address=self.get_address(),
                # GB of memory required per worker. If specified, the node
                # manager will check the available memory at startup and limit
                # the number of workers such that there's sufficient memory
                # for each worker.
                mem_per_worker=None,
                # Caps the number of workers launched per node.
                max_workers_per_node=1,
                # Timeout period (in milliseconds) to be used by the
                # executor components.
                poll_period=1_000,
                # Retry submitting to Slurm in case of submission error.
                block_error_handler=False,
            )
            executors.append(executor)

        return executors

    def select_executor(self, job: "ParslJob") -> str:
        """Get the ``label`` of the executor to use to execute ``job``.

        Parameters
        ----------
        job : `ParslJob`
            Job to be executed.

        Returns
        -------
        label : `str`
            Label of executor to use to execute ``job``.
        """
        # We choose the executor to use based only on the memory required
        # by the job.
        memory = job.generic.request_memory / 1024  # Convert to GB
        for label in ("small", "medium", "large"):
            if memory <= self._slot_size[label]["memory"]:
                return label

        return "xlarge"

    def get_parsl_config(self) -> parsl.config.Config:
        """Get Parsl configuration for using the CC-IN2P3 Slurm farm as a
        Parsl execution site.

        Returns
        -------
        config : `parsl.config.Config`
            The configuration to be used to initialize Parsl for this site.
        """
        executors = self.get_executors()
        monitor = self.get_monitor()

        # Number of retries in case of job failure.
        retries = get_bps_config_value(self.site, ".retries", int, 0)

        # Path to run directory.
        run_dir = get_bps_config_value(self.site, ".run_dir", str, "parsl_runinfo")

        # Strategy for scaling blocks according to workflow needs.
        # Use a strategy that allows for scaling Parsl workers up and down.
        strategy = get_bps_config_value(self.site, ".strategy", str, "htex_auto_scale")

        return parsl.config.Config(
            executors=executors,
            monitoring=monitor,
            retries=retries,
            checkpoint_mode="task_exit",
            run_dir=run_dir,
            strategy=strategy,
        )
```
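The routing logic in `select_executor` is simple: convert the job's `request_memory` (BPS stores it in MB) to GB and walk the slots from smallest to largest, falling through to `"xlarge"`. A self-contained sketch of that walk, using the class's default memory bounds; the numbers are illustrative, not site policy:

```python
# Self-contained sketch of Ccin2p3.select_executor's slot routing. The bounds
# mirror the class defaults above (GB) and are illustrative only.
SLOT_MEMORY_GB = {"small": 4, "medium": 10, "large": 50}


def select_slot(request_memory_mb: float) -> str:
    memory_gb = request_memory_mb / 1024  # BPS request_memory is in MB
    for label in ("small", "medium", "large"):
        if memory_gb <= SLOT_MEMORY_GB[label]:
            return label
    return "xlarge"  # anything larger lands in the biggest slot


assert select_slot(2_048) == "small"     # 2 GB job
assert select_slot(16_384) == "large"    # 16 GB job
assert select_slot(204_800) == "xlarge"  # 200 GB job
```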
python/lsst/ctrl/bps/parsl/sites/local.py:

```diff
@@ -53,7 +53,7 @@ class Local(SiteConfig):
         Each executor should have a unique ``label``.
         """
         cores = get_bps_config_value(self.site, "cores", int, required=True)
-        return [HighThroughputExecutor("local", provider=LocalProvider(),
+        return [HighThroughputExecutor("local", provider=LocalProvider(), max_workers_per_node=cores)]
 
     def select_executor(self, job: "ParslJob") -> str:
         """Get the ``label`` of the executor to use to execute a job.
```
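This one-line change tracks Parsl's rename of the `HighThroughputExecutor` keyword from `max_workers` to `max_workers_per_node`, which is presumably also why `pyproject.toml` now requires `parsl >= 2024.03.04`. A minimal sketch of the equivalent hand-written local configuration, assuming a Parsl at or above that floor:

```python
# Minimal local Parsl configuration using the renamed keyword; assumes
# parsl >= 2024.03.04, where max_workers_per_node replaced max_workers.
import parsl
from parsl.config import Config
from parsl.executors import HighThroughputExecutor
from parsl.providers import LocalProvider

config = Config(
    executors=[
        HighThroughputExecutor(
            "local",
            provider=LocalProvider(),
            max_workers_per_node=4,  # plays the role of the BPS "cores" setting
        )
    ]
)
parsl.load(config)
```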
python/lsst/ctrl/bps/parsl/sites/princeton.py:

```diff
@@ -58,19 +58,8 @@ class Tiger(Slurm):
       ``True``.
 
     When running on the Tiger cluster, you should operate on the
-    ``/scratch/gpfs`` filesystem, rather than ``/projects`` or ``/tigress
-
-    head nodes. Your BPS config should contain::
-
-        includeConfigs:
-            - ${CTRL_BPS_PARSL_DIR}/etc/execution_butler_copy_files.yaml
-
-    This will cause the necessary files to be transferred from your repo
-    (presumably on ``/projects`` or ``/tigress``) to the execution butler in
-    your submission directory (presumably on ``/scratch/gpfs``). Failure to do
-    so will result in about a 6x slowdown, and probably degrading performance
-    for other users. The results will be copied back to the original repo when
-    everything has completed.
+    ``/scratch/gpfs`` filesystem, rather than ``/projects`` or ``/tigress``;
+    the latter are not even mounted on the cluster nodes any more.
     """
 
     def get_executors(self) -> list[ParslExecutor]:
@@ -94,9 +83,9 @@ class Tiger(Slurm):
             self.make_executor(
                 "tiger",
                 nodes=4,
-                cores_per_node=
+                cores_per_node=112,
                 walltime="05:00:00",  # Ensures we get into qos=tiger-vshort, which cuts off at 5h
-                mem_per_node=
+                mem_per_node=980,  # Ensures all nodes are available, reserving a little for OS services
                 singleton=True,
                 provider_options={
                     "init_blocks": 1,
```
python/lsst/ctrl/bps/parsl/sites/slurm.py:

```diff
@@ -75,6 +75,7 @@ class Slurm(SiteConfig):
       default we use whatever Slurm gives us.
     - ``singleton`` (`bool`): allow only one job to run at a time; by default
       ``False``.
+    - ``account`` (`str`): account to use for Slurm jobs.
     - ``scheduler_options`` (`str`): text to prepend to the Slurm submission
       script (each line usually starting with ``#SBATCH``).
     """
@@ -135,6 +136,7 @@ class Slurm(SiteConfig):
         mem_per_node = get_bps_config_value(self.site, "mem_per_node", int, mem_per_node)
         qos = get_bps_config_value(self.site, "qos", str, qos)
         singleton = get_bps_config_value(self.site, "singleton", bool, singleton)
+        account = get_bps_config_value(self.site, "account", str)
         scheduler_options = get_bps_config_value(self.site, "scheduler_options", str, scheduler_options)
 
         job_name = get_workflow_name(self.config)
@@ -163,6 +165,7 @@ class Slurm(SiteConfig):
                 cores_per_node=cores_per_node,
                 mem_per_node=mem_per_node,
                 walltime=walltime,
+                account=account,
                 scheduler_options=scheduler_options,
                 **(provider_options or {}),
             ),
```
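The new `account` key is read without a default, so it is presumably `None` when unset, and Parsl's `SlurmProvider` already accepts `account=None` (it simply emits no `#SBATCH --account=...` directive), which keeps existing site configs working unchanged. A hedged sketch of the wiring, with a dictionary lookup standing in for the real `get_bps_config_value`:

```python
# Hedged sketch of how the new "account" setting reaches Slurm. The dict
# lookup below is a stand-in for get_bps_config_value(self.site, ...).
from parsl.providers import SlurmProvider

site_config = {"walltime": "02:00:00"}  # no "account" key configured


def lookup(key: str, default=None):
    return site_config.get(key, default)


provider = SlurmProvider(
    walltime=lookup("walltime"),
    # account=None makes SlurmProvider omit the --account directive, so
    # configs that predate this setting behave exactly as before.
    account=lookup("account"),
)
```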
python/lsst/ctrl/bps/parsl/sites/torque.py (new file, +273 lines):

```python
# This file is part of ctrl_bps_parsl.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org) and the LSST DESC (https://www.lsstdesc.org/).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

from typing import TYPE_CHECKING, Any

from parsl.executors import HighThroughputExecutor
from parsl.executors.base import ParslExecutor
from parsl.launchers import MpiRunLauncher
from parsl.providers import TorqueProvider

from ..configuration import get_bps_config_value, get_workflow_name
from ..site import SiteConfig

if TYPE_CHECKING:
    from ..job import ParslJob

__all__ = ("Torque",)


Kwargs = dict[str, Any]


class Torque(SiteConfig):
    """Configuration for a generic Torque cluster.

    This can be used directly as the site configuration for a Torque cluster
    by setting the BPS config, e.g.:

    .. code-block:: yaml

        computeSite: torque
        site:
          torque:
            class: lsst.ctrl.bps.parsl.sites.Torque
            nodes: 4
            tasks_per_node: 20
            walltime: "00:59:00"  # Note: always quote walltime in YAML

    Alternatively, it can be used as a base class for Torque cluster
    configurations.

    The following BPS configuration parameters are recognised (and required
    unless there is a default mentioned here, or provided by a subclass):

    - ``queue`` (`str`): queue for the Torque job.
    - ``nodes`` (`int`): number of nodes for each Torque job.
    - ``tasks_per_node`` (`int`): number of cores per node for each Torque
      job; by default we use all cores on the node.
    - ``walltime`` (`str`): time limit for each Torque job.
    - ``scheduler_options`` (`str`): text to prepend to the Torque submission
      script (each line usually starting with ``#PBS``).
    """

    def make_executor(
        self,
        label: str,
        *,
        queue: str | None = None,
        nodes: int | None = None,
        tasks_per_node: int | None = None,
        walltime: str | None = None,
        mem_per_worker: float | None = None,
        scheduler_options: str | None = None,
        worker_init: str | None = None,
        provider_options: Kwargs | None = None,
        executor_options: Kwargs | None = None,
    ) -> ParslExecutor:
        """Return an executor for running on a Torque cluster.

        Parameters
        ----------
        label : `str`
            Label for executor.
        queue : `str`, optional
            Queue for the Torque job.
        nodes : `int`, optional
            Default number of nodes for each Torque job.
        tasks_per_node : `int`, optional
            Default number of cores per node for each Torque job.
        walltime : `str`, optional
            Default time limit for each Torque job.
        mem_per_worker : `float`, optional
            Minimum memory per worker (GB), limited by the executor.
        worker_init : `str`, optional
            Environment initiation command.
        scheduler_options : `str`, optional
            ``#PBS`` directives to prepend to the Torque submission script.
        provider_options : `dict`, optional
            Additional arguments for the `TorqueProvider` constructor.
        executor_options : `dict`, optional
            Additional arguments for the `HighThroughputExecutor` constructor.

        Returns
        -------
        executor : `HighThroughputExecutor`
            Executor for Torque jobs.
        """
        nodes = get_bps_config_value(self.site, "nodes", int, nodes, required=True)
        walltime = get_bps_config_value(self.site, "walltime", str, walltime, required=True)
        queue = get_bps_config_value(self.site, "queue", str, queue)
        tasks_per_node = get_bps_config_value(self.site, "tasks_per_node", int, tasks_per_node)
        worker_init = get_bps_config_value(self.site, "worker_init", str, "")
        scheduler_options = get_bps_config_value(self.site, "scheduler_options", str, scheduler_options)

        if tasks_per_node is None:
            tasks_per_node = 1

        job_name = get_workflow_name(self.config)

        if scheduler_options is None:
            scheduler_options = ""
        else:
            scheduler_options += "\n"
        scheduler_options += f"#PBS -N {job_name}\n"
        if queue:
            scheduler_options += f"#PBS -q {queue}\n"

        if worker_init is None:
            worker_init = ""

        launcher = PbsMpiRunLauncher(overrides=f"--map-by core:{tasks_per_node}")

        return HighThroughputExecutor(
            label,
            provider=PbsTorqueProvider(
                nodes_per_block=nodes,
                tasks_per_node=tasks_per_node,
                queue=queue,
                walltime=walltime,
                scheduler_options=scheduler_options,
                worker_init=worker_init,
                launcher=launcher,
                **(provider_options or {}),
            ),
            max_workers_per_node=1,
            mem_per_worker=mem_per_worker,
            address=self.get_address(),
            **(executor_options or {}),
        )

    def get_executors(self) -> list[ParslExecutor]:
        """Get a list of executors to be used in processing.

        Each executor should have a unique ``label``.
        """
        return [self.make_executor("torque")]

    def select_executor(self, job: "ParslJob") -> str:
        """Get the ``label`` of the executor to use to execute a job.

        Parameters
        ----------
        job : `ParslJob`
            Job to be executed.

        Returns
        -------
        label : `str`
            Label of executor to use to execute ``job``.
        """
        return "torque"


class PbsTorqueProvider(TorqueProvider):
    """Torque execution provider.

    This provider uses qsub to submit, qstat for status, and qdel to cancel
    jobs. The qsub script to be used is created from a template file in this
    same module.

    This subclass allows the ``tasks_per_node`` to be set at construction time
    instead of at submission time.
    """

    def __init__(self, *args, tasks_per_node: int = 1, **kwargs):
        super().__init__(*args, **kwargs)
        self.tasks_per_node = tasks_per_node

    def submit(self, command, tasks_per_node, job_name="parsl.torque"):
        """Submit the command onto a Local Resource Manager job.

        This function returns an ID that corresponds to the task that was just
        submitted.

        The ``tasks_per_node`` parameter is ignored in this provider, as it is
        set at construction time.

        Parameters
        ----------
        command : `str`
            Command-line invocation to be made on the remote side.
        tasks_per_node : `int`
            Number of tasks to be launched per node. This is ignored in this
            provider.
        job_name : `str`
            Name for the job; must be unique.

        Returns
        -------
        job_id : `str` or `None`
            Identifier for the job, or `None` if at capacity and no more jobs
            can be provisioned.
        """
        return super().submit(
            command=command,
            tasks_per_node=self.tasks_per_node,
            job_name=job_name,
        )


class PbsMpiRunLauncher(MpiRunLauncher):
    """Worker launcher that wraps the user's command with the framework to
    launch multiple command invocations via ``mpirun``.

    This wrapper sets the bash env variable ``CORES`` to the number of cores
    on the machine.

    This launcher makes the following assumptions:

    - mpirun is installed and can be located in ``$PATH``
    - The provider makes available the ``$PBS_NODEFILE`` environment variable
    """

    def __init__(
        self,
        debug: bool = True,
        bash_location: str = "/bin/bash",
        overrides: str = "",
    ):
        super().__init__(debug=debug, bash_location=bash_location, overrides=overrides)

    def __call__(self, command: str, tasks_per_node: int, nodes_per_block: int) -> str:
        """Wrap the user's command with the mpirun invocation."""
        worker_count = nodes_per_block * tasks_per_node
        debug_num = int(self.debug)

        return f"""set -e
export CORES=$(getconf _NPROCESSORS_ONLN)
[[ "{debug_num}" == "1" ]] && echo "Found cores : $CORES"
WORKERCOUNT={worker_count}

cat << MPIRUN_EOF > cmd_$JOBNAME.sh
{command}
MPIRUN_EOF
chmod u+x cmd_$JOBNAME.sh

mpirun -np $WORKERCOUNT {self.overrides} {self.bash_location} cmd_$JOBNAME.sh

[[ "{debug_num}" == "1" ]] && echo "All workers done"
"""
```
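To see what `PbsMpiRunLauncher.__call__` actually hands to Torque without a cluster at hand, the template can be rendered standalone. This sketch duplicates the f-string from the class above (same variable names) in a plain function; the command string passed at the bottom is a placeholder:

```python
# Standalone rendering of the wrapper script produced by
# PbsMpiRunLauncher.__call__, for inspection without parsl or a cluster.
def render_wrapper(
    command: str,
    tasks_per_node: int,
    nodes_per_block: int,
    overrides: str = "",
    bash_location: str = "/bin/bash",
    debug: bool = True,
) -> str:
    worker_count = nodes_per_block * tasks_per_node
    debug_num = int(debug)
    return f"""set -e
export CORES=$(getconf _NPROCESSORS_ONLN)
[[ "{debug_num}" == "1" ]] && echo "Found cores : $CORES"
WORKERCOUNT={worker_count}

cat << MPIRUN_EOF > cmd_$JOBNAME.sh
{command}
MPIRUN_EOF
chmod u+x cmd_$JOBNAME.sh

mpirun -np $WORKERCOUNT {overrides} {bash_location} cmd_$JOBNAME.sh

[[ "{debug_num}" == "1" ]] && echo "All workers done"
"""


# With 4 nodes and 20 tasks per node, mpirun fans the command out 80 ways.
print(render_wrapper("echo hello", tasks_per_node=20, nodes_per_block=4))
```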
python/lsst/ctrl/bps/parsl/workflow.py:

```diff
@@ -32,11 +32,12 @@ from collections.abc import Iterable, Mapping
 
 import parsl
 import parsl.config
-from lsst.ctrl.bps import BaseWmsWorkflow, BpsConfig, GenericWorkflow, GenericWorkflowJob
 from parsl.app.app import bash_app
 from parsl.app.bash import BashApp
 from parsl.app.futures import Future
 
+from lsst.ctrl.bps import BaseWmsWorkflow, BpsConfig, GenericWorkflow, GenericWorkflowJob
+
 from .configuration import get_workflow_filename, set_parsl_logging
 from .job import ParslJob, get_file_paths
 from .site import SiteConfig
```
python/lsst_ctrl_bps_parsl.egg-info/PKG-INFO:

The changes are identical to those in the top-level PKG-INFO shown above; the egg-info copy carries the same metadata (new Metadata-Version 2.4, version 29.2025.4900, the SPDX license expression, the Python 3.13/3.14 classifiers, the `parsl>=2024.03.04` floor, and the `Dynamic: license-file` field).
python/lsst_ctrl_bps_parsl.egg-info/SOURCES.txt:

```diff
@@ -20,6 +20,7 @@ python/lsst/ctrl/bps/parsl/sites/nersc.py
 python/lsst/ctrl/bps/parsl/sites/princeton.py
 python/lsst/ctrl/bps/parsl/sites/slac.py
 python/lsst/ctrl/bps/parsl/sites/slurm.py
+python/lsst/ctrl/bps/parsl/sites/torque.py
 python/lsst/ctrl/bps/parsl/sites/work_queue.py
 python/lsst_ctrl_bps_parsl.egg-info/PKG-INFO
 python/lsst_ctrl_bps_parsl.egg-info/SOURCES.txt
```
lsst_ctrl_bps_parsl-27.2024.3100/python/lsst/ctrl/bps/parsl/sites/ccin2p3.py (removed file, -245 lines; replaced by the rewritten ccin2p3.py above):

```python
from typing import TYPE_CHECKING, Any

import parsl.config
from parsl.executors import HighThroughputExecutor
from parsl.executors.base import ParslExecutor
from parsl.providers import SlurmProvider

from ..configuration import get_bps_config_value
from ..site import SiteConfig

if TYPE_CHECKING:
    from .job import ParslJob

__all__ = ("Ccin2p3",)

Kwargs = dict[str, Any]


class Ccin2p3(SiteConfig):
    """Configuration for running Parsl jobs in the CC-IN2P3 Slurm batch farm.

    This class provides 4 job slot sizes with different requirements, in
    particular in terms of memory. Those slot sizes are named "small",
    "medium", "large" and "xlarge".

    Sensible default values for those requirements are provided for each
    kind of job but you can specify different values either in the
    BPS submission file or in a site configuration file that you
    include in your BPS submission file.

    This is an example of how to modify the specifications for those job
    slot sizes in the BPS submission file:

    .. code-block:: yaml

        wmsServiceClass: lsst.ctrl.bps.parsl.ParslService
        computeSite: ccin2p3

        site:
          ccin2p3:
            class: lsst.ctrl.bps.parsl.sites.Ccin2p3
            walltime: "72:00:00"
            qos: "normal"
            small:
              memory: 4
              partition: "flash"
            medium:
              memory: 10
              partition: "lsst,htc"
            large:
              memory: 50
            xlarge:
              memory: 150
              partition: "lsst"

    At the level of the 'site:' entry in the BPS submission file, the
    following configuration parameters are accepted, which apply to all slot
    sizes:

    - `partition` (`str`): name of one or more configured partitions. If
      more than one, separate them with a comma (',').
      (Default: "lsst,htc")
    - `qos` (`str`): quality of service to use (Default: "normal")
    - `walltime` (`str`): walltime to require for the job (Default: "72:00:00")

    For each kind of job slot (i.e. "small", "medium", etc.) you can specify
    the parameters above as well as:

    - `max_blocks` (`int`): maximum number of Slurm jobs that your workflow
      can simultaneously use.
    - ``memory`` (`int`): required amount of memory in Gigabytes.

    as shown in the example above.

    If you don't need to modify those values and use the default configuration
    for all the job slot sizes, use:

    .. code-block:: yaml

        wmsServiceClass: lsst.ctrl.bps.parsl.ParslService
        computeSite: ccin2p3

        site:
          ccin2p3:
            class: lsst.ctrl.bps.parsl.sites.Ccin2p3

    Parameters
    ----------
    *args : optional
        Arguments to initialize the super-class.
    **kwargs : optional
        Keyword arguments to initialize the super-class.

    Returns
    -------
    Ccin2p3 : `SiteConfig`
        Concrete instance of a `SiteConfig` specific for the CC-IN2P3 Slurm
        farm.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self._account = get_bps_config_value(self.site, "account", str, "lsst")
        default_partition = get_bps_config_value(self.site, "partition", str, "lsst,htc")
        default_qos = get_bps_config_value(self.site, "qos", str, "normal")
        default_walltime = get_bps_config_value(self.site, "walltime", str, "72:00:00")

        self._slot_size = {
            "small": {
                "max_blocks": get_bps_config_value(self.site, "small.max_blocks", int, 3_000),
                "memory": get_bps_config_value(self.site, "small.memory", int, 4),
                "partition": get_bps_config_value(self.site, "small.partition", str, default_partition),
                "qos": get_bps_config_value(self.site, "small.qos", str, default_qos),
                "walltime": get_bps_config_value(self.site, "small.walltime", str, default_walltime),
            },
            "medium": {
                "max_blocks": get_bps_config_value(self.site, "medium.max_blocks", int, 1_000),
                "memory": get_bps_config_value(self.site, "medium.memory", int, 10),
                "partition": get_bps_config_value(self.site, "medium.partition", str, "lsst"),
                "qos": get_bps_config_value(self.site, "medium.qos", str, default_qos),
                "walltime": get_bps_config_value(self.site, "medium.walltime", str, default_walltime),
            },
            "large": {
                "max_blocks": get_bps_config_value(self.site, "large.max_blocks", int, 100),
                "memory": get_bps_config_value(self.site, "large.memory", int, 50),
                "partition": get_bps_config_value(self.site, "large.partition", str, "lsst"),
                "qos": get_bps_config_value(self.site, "large.qos", str, default_qos),
                "walltime": get_bps_config_value(self.site, "large.walltime", str, default_walltime),
            },
            "xlarge": {
                "max_blocks": get_bps_config_value(self.site, "xlarge.max_blocks", int, 10),
                "memory": get_bps_config_value(self.site, "xlarge.memory", int, 150),
                "partition": get_bps_config_value(self.site, "xlarge.partition", str, "lsst"),
                "qos": get_bps_config_value(self.site, "xlarge.qos", str, default_qos),
                "walltime": get_bps_config_value(self.site, "xlarge.walltime", str, default_walltime),
            },
        }

    def get_executors(self) -> list[ParslExecutor]:
        """Get a list of executors to be used for processing a workflow.
        Each executor must have a unique ``label``.
        """
        executors: list[ParslExecutor] = []
        for label, slot in self._slot_size.items():
            qos = slot["qos"]
            executor = HighThroughputExecutor(
                label,
                provider=SlurmProvider(
                    # Slurm partition to request blocks from.
                    partition=slot["partition"],
                    # Slurm account to which to charge resources used by the
                    # job.
                    account=self._account,
                    # Nodes to provision per block (1 block = 1 CPU core).
                    nodes_per_block=1,
                    # Number of CPU cores to provision per node.
                    cores_per_node=1,
                    # Memory per node (GB) for each Slurm job.
                    mem_per_node=slot["memory"],
                    # Initial number of blocks.
                    init_blocks=0,
                    # Minimum number of blocks to maintain.
                    min_blocks=0,
                    # Maximum number of blocks to maintain.
                    max_blocks=slot["max_blocks"],
                    # Time limit for each Slurm job.
                    walltime=slot["walltime"],
                    # '#SBATCH' directives to prepend to the Slurm submission
                    # script.
                    scheduler_options=f"#SBATCH --qos={qos} --licenses=sps",
                    # Set the number of file descriptors and processes to
                    # the maximum allowed.
                    worker_init="ulimit -n hard && ulimit -u hard",
                    # Requests nodes which are not shared with other running
                    # jobs.
                    exclusive=False,
                    # Should files be moved by Parsl?
                    move_files=False,
                ),
                # Address to connect to the main Parsl process.
                address=self.get_address(),
                # GB of memory required per worker. If specified the node
                # manager will check the available memory at startup and limit
                # the number of workers such that there's sufficient memory
                # for each worker.
                mem_per_worker=None,
                # Caps the number of workers launched per node.
                max_workers=1,
                # Timeout period (in milliseconds) to be used by the
                # executor components.
                poll_period=1_000,
                # Retry submitting to Slurm in case of submission error.
                block_error_handler=False,
            )
            executors.append(executor)

        return executors

    def select_executor(self, job: "ParslJob") -> str:
        """Get the ``label`` of the executor to use to execute ``job``.

        Parameters
        ----------
        job : `ParslJob`
            Job to be executed.

        Returns
        -------
        label : `str`
            Label of executor to use to execute ``job``.
        """
        # We choose the executor to use based only on the memory required
        # by the job.
        memory = job.generic.request_memory / 1024  # Convert to GB
        for label in ("small", "medium", "large"):
            if memory <= self._slot_size[label]["memory"]:
                return label

        return "xlarge"

    def get_parsl_config(self) -> parsl.config.Config:
        """Get Parsl configuration for using the CC-IN2P3 Slurm farm as a
        Parsl execution site.

        Returns
        -------
        config : `parsl.config.Config`
            The configuration to be used to initialize Parsl for this site.
        """
        executors = self.get_executors()
        monitor = self.get_monitor()
        retries = get_bps_config_value(self.site, "retries", int, 1)
        run_dir = get_bps_config_value(self.site, "run_dir", str, "parsl_runinfo")
        # Strategy for scaling blocks according to workflow needs.
        # Use a strategy that allows for scaling in and out Parsl
        # workers.
        strategy = get_bps_config_value(self.site, "strategy", str, "htex_auto_scale")
        return parsl.config.Config(
            executors=executors,
            monitoring=monitor,
            retries=retries,
            checkpoint_mode="task_exit",
            run_dir=run_dir,
            strategy=strategy,
        )
```