flux-batch 0.0.0__tar.gz → 0.0.11__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {flux_batch-0.0.0/flux_batch.egg-info → flux_batch-0.0.11}/PKG-INFO +31 -6
- {flux_batch-0.0.0 → flux_batch-0.0.11}/README.md +25 -5
- {flux_batch-0.0.0 → flux_batch-0.0.11}/flux_batch/__init__.py +2 -1
- {flux_batch-0.0.0 → flux_batch-0.0.11}/flux_batch/jobspec.py +48 -1
- {flux_batch-0.0.0 → flux_batch-0.0.11}/flux_batch/models.py +5 -0
- flux_batch-0.0.11/flux_batch/script/__init__.py +16 -0
- flux_batch-0.0.11/flux_batch/script/save_logs.sh +16 -0
- flux_batch-0.0.11/flux_batch/service/__init__.py +81 -0
- flux_batch-0.0.11/flux_batch/service/scribe/__init__.py +2 -0
- flux_batch-0.0.11/flux_batch/service/scribe/__main__.py +113 -0
- flux_batch-0.0.11/flux_batch/service/scribe/database.py +150 -0
- flux_batch-0.0.11/flux_batch/service/scribe/models.py +94 -0
- flux_batch-0.0.11/flux_batch/service/scribe/template.py +53 -0
- flux_batch-0.0.11/flux_batch/submit.py +86 -0
- {flux_batch-0.0.0 → flux_batch-0.0.11}/flux_batch/version.py +3 -2
- {flux_batch-0.0.0 → flux_batch-0.0.11/flux_batch.egg-info}/PKG-INFO +31 -6
- {flux_batch-0.0.0 → flux_batch-0.0.11}/flux_batch.egg-info/SOURCES.txt +7 -1
- {flux_batch-0.0.0 → flux_batch-0.0.11}/flux_batch.egg-info/requires.txt +6 -0
- {flux_batch-0.0.0 → flux_batch-0.0.11}/setup.py +2 -0
- flux_batch-0.0.0/flux_batch/service/__init__.py +0 -31
- flux_batch-0.0.0/flux_batch/service/scribe.py +0 -12
- flux_batch-0.0.0/flux_batch/submit.py +0 -53
- {flux_batch-0.0.0 → flux_batch-0.0.11}/LICENSE +0 -0
- {flux_batch-0.0.0 → flux_batch-0.0.11}/MANIFEST.in +0 -0
- {flux_batch-0.0.0 → flux_batch-0.0.11}/NOTICE +0 -0
- {flux_batch-0.0.0 → flux_batch-0.0.11}/flux_batch/logger/__init__.py +0 -0
- {flux_batch-0.0.0 → flux_batch-0.0.11}/flux_batch/logger/generate.py +0 -0
- {flux_batch-0.0.0 → flux_batch-0.0.11}/flux_batch/logger/logger.py +0 -0
- {flux_batch-0.0.0 → flux_batch-0.0.11}/flux_batch/utils/__init__.py +0 -0
- {flux_batch-0.0.0 → flux_batch-0.0.11}/flux_batch/utils/fileio.py +0 -0
- {flux_batch-0.0.0 → flux_batch-0.0.11}/flux_batch/utils/text.py +0 -0
- {flux_batch-0.0.0 → flux_batch-0.0.11}/flux_batch/utils/timer.py +0 -0
- {flux_batch-0.0.0 → flux_batch-0.0.11}/flux_batch.egg-info/dependency_links.txt +0 -0
- {flux_batch-0.0.0 → flux_batch-0.0.11}/flux_batch.egg-info/entry_points.txt +0 -0
- {flux_batch-0.0.0 → flux_batch-0.0.11}/flux_batch.egg-info/not-zip-safe +0 -0
- {flux_batch-0.0.0 → flux_batch-0.0.11}/flux_batch.egg-info/top_level.txt +0 -0
- {flux_batch-0.0.0 → flux_batch-0.0.11}/pyproject.toml +0 -0
- {flux_batch-0.0.0 → flux_batch-0.0.11}/setup.cfg +0 -0
- {flux_batch-0.0.0 → flux_batch-0.0.11}/tests/test_flux_batch.py +0 -0

--- flux_batch-0.0.0/flux_batch.egg-info/PKG-INFO
+++ flux_batch-0.0.11/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: flux-batch
-Version: 0.0.0
+Version: 0.0.11
 Summary: Python SDK for flux batch jobs and services
 Home-page: https://github.com/converged-computing/flux-batch
 Author: Vanessa Sochat
@@ -26,12 +26,17 @@ Provides-Extra: all
 Requires-Dist: pyyaml; extra == "all"
 Requires-Dist: ply; extra == "all"
 Requires-Dist: pytest>=4.6.2; extra == "all"
+Requires-Dist: sqlalchemy; extra == "all"
+Requires-Dist: rich; extra == "all"
+Provides-Extra: scribe
+Requires-Dist: sqlalchemy; extra == "scribe"
+Requires-Dist: rich; extra == "scribe"
 
 # flux-batch
 
 > Python SDK to generate Flux batch jobs and services
 
-
+![img](https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fzenodo.org%2Fapi%2Frecords%2F17929049&query=%24.metadata.doi&label=DOI)
 
 ![img](https://raw.githubusercontent.com/converged-computing/flux-batch/main/img/flux-batch.png)
 
@@ -46,6 +51,7 @@ Requires-Dist: pytest>=4.6.2; extra == "all"
 
 - **flux-scribe**: Write job events to a local sqlite database via the JournalConsumer (not added yet, written and needs testing)
 
+
 ## Usage
 
 This is a small Flux utility that makes it easy to create Flux batch jobs and services.
@@ -60,9 +66,26 @@ flux start
 pip install -e . --break-system-packages
 ```
 
-###
+### Examples
+
+We have a few simple examples:
+
+#### Saving Logs
+
+```bash
+python3 ./examples/save_logs.py
+```
+
+#### Flux Scribe Module
+
+```bash
+export FLUX_SCRIBE_DATABASE=sqlite:///flux-batch-job.db
+python3 ./examples/flux_scribe_module.py
+```
+
+#### General Test
 
-
+Or run the controlled example to see a batch job with prolog and epilog run and complete:
 
 ```bash
 python3 ./tests/test_flux_batch.py
@@ -117,7 +140,9 @@ jobspec = flux_batch.BatchJobspecV1.from_jobs(
     nodes=1,
     nslots=1,
     time_limit="10m",
-    job_name="test-batch"
+    job_name="test-batch",
+    # Add saving of logs, info, and metadata
+    logs_dir="./logs",
 )
 
 # Add a prolog and epilog
@@ -125,7 +150,7 @@ jobspec.add_prolog("echo 'Batch Wrapper Starting'")
 jobspec.add_epilog("echo 'Batch Wrapper Finished'")
 
 # Add a service (this assumes user level that exists)
-
+jobspec.add_service("flux-scribe")
 
 # Preview it
 print(flux_batch.submit(handle, jobspec, dry_run=True))

--- flux_batch-0.0.0/README.md
+++ flux_batch-0.0.11/README.md
@@ -2,7 +2,7 @@
 
 > Python SDK to generate Flux batch jobs and services
 
-
+![img](https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fzenodo.org%2Fapi%2Frecords%2F17929049&query=%24.metadata.doi&label=DOI)
 
 ![img](https://raw.githubusercontent.com/converged-computing/flux-batch/main/img/flux-batch.png)
 
@@ -17,6 +17,7 @@
 
 - **flux-scribe**: Write job events to a local sqlite database via the JournalConsumer (not added yet, written and needs testing)
 
+
 ## Usage
 
 This is a small Flux utility that makes it easy to create Flux batch jobs and services.
@@ -31,9 +32,26 @@ flux start
 pip install -e . --break-system-packages
 ```
 
-###
+### Examples
+
+We have a few simple examples:
+
+#### Saving Logs
+
+```bash
+python3 ./examples/save_logs.py
+```
+
+#### Flux Scribe Module
+
+```bash
+export FLUX_SCRIBE_DATABASE=sqlite:///flux-batch-job.db
+python3 ./examples/flux_scribe_module.py
+```
+
+#### General Test
 
-
+Or run the controlled example to see a batch job with prolog and epilog run and complete:
 
 ```bash
 python3 ./tests/test_flux_batch.py
@@ -88,7 +106,9 @@ jobspec = flux_batch.BatchJobspecV1.from_jobs(
     nodes=1,
     nslots=1,
     time_limit="10m",
-    job_name="test-batch"
+    job_name="test-batch",
+    # Add saving of logs, info, and metadata
+    logs_dir="./logs",
 )
 
 # Add a prolog and epilog
@@ -96,7 +116,7 @@ jobspec.add_prolog("echo 'Batch Wrapper Starting'")
 jobspec.add_epilog("echo 'Batch Wrapper Finished'")
 
 # Add a service (this assumes user level that exists)
-
+jobspec.add_service("flux-scribe")
 
 # Preview it
 print(flux_batch.submit(handle, jobspec, dry_run=True))
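
Pulling the fragments in the README hunks above together: a minimal end-to-end sketch of the 0.0.11 workflow, assuming the package is installed inside a running `flux start` instance. The README's `from_jobs(...)` call is cut off in this diff, so the sketch uses `from_command` (visible as context in the `flux_batch/jobspec.py` hunks below) and assumes its extra keyword arguments map onto `BatchAttributesV1` fields.

```python
import flux
import flux_batch

handle = flux.Flux()

# Keyword arguments mirror the README snippet in this diff; passing them
# through from_command(**kwargs) is an assumption, not shown in the hunks.
jobspec = flux_batch.BatchJobspecV1.from_command(
    ["sleep", "10"],
    nodes=1,
    nslots=1,
    time_limit="10m",
    job_name="test-batch",
    logs_dir="./logs",  # new in 0.0.11: save logs, info, and metadata
)

# Wrap the batch script with a prolog/epilog and the flux-scribe user service
jobspec.add_prolog("echo 'Batch Wrapper Starting'")
jobspec.add_epilog("echo 'Batch Wrapper Finished'")
jobspec.add_service("flux-scribe")

# dry_run=True returns the generated wrapper script instead of submitting
print(flux_batch.submit(handle, jobspec, dry_run=True))
```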

--- flux_batch-0.0.0/flux_batch/__init__.py
+++ flux_batch-0.0.11/flux_batch/__init__.py
@@ -1,7 +1,8 @@
 from .jobspec import BatchJobspecV1
 from .models import BatchAttributesV1, BatchJobV1
+from .submit import preview as jobspec
 from .submit import submit
 
-__all__ = ["BatchJobV1", "BatchAttributesV1", "BatchJobspecV1", "submit"]
+__all__ = ["BatchJobV1", "BatchAttributesV1", "BatchJobspecV1", "submit", "jobspec"]
 
 from .version import __version__ # noqa

--- flux_batch-0.0.0/flux_batch/jobspec.py
+++ flux_batch-0.0.11/flux_batch/jobspec.py
@@ -1,7 +1,9 @@
+import os
 import shlex
 from typing import List
 
 import flux_batch.models as models
+import flux_batch.script as scripts
 
 
 class BatchJobspecV1:
@@ -18,6 +20,7 @@ class BatchJobspecV1:
         self.prologs: List[str] = []
         self.epilogs: List[str] = []
         self.services: List[str] = []
+        self.modules: List[str] = []
 
     @classmethod
     def from_command(cls, command: List[str], **kwargs):
@@ -51,6 +54,9 @@ class BatchJobspecV1:
     def add_epilog(self, cmd: str):
         self.epilogs.append(cmd)
 
+    def add_module(self, service_name: str):
+        self.modules.append(service_name)
+
     def get_cli_flags(self) -> List[str]:
         """
         Converts BatchAttributesV1 into a list of strings for subprocess.
@@ -107,8 +113,25 @@ class BatchJobspecV1:
             for val in getattr(attr, field_name):
                 flags.extend([flag, str(val)])
 
+        # If we have modules, ensure they are added --conf <module>=true
+        if self.modules:
+            # Tell Flux to look in our user home for rc scripts
+            modprobe_path = os.path.expanduser("~/.flux-batch")
+            flags.extend(["--env", f"FLUX_MODPROBE_PATH_APPEND={modprobe_path}"])
+
+            # If modules are used, we need to pass the service names into the Flux config
+            # so the @task 'needs_config' filter allows them to run
+            for mod in self.modules:
+                flags.extend(["--conf", f"{mod}=true"])
+
         return flags
 
+    def render(self) -> str:
+        """
+        Generate the jobspec.
+        """
+        return self.generate_wrapper_script()
+
     def generate_wrapper_script(self) -> str:
         """
         Generate the wrapper script.
@@ -119,14 +142,38 @@ class BatchJobspecV1:
         4. Add jobs/commands
         5. Stop services
         6. And epilogs
-
+        7. Custom scripts
 
+        Yes, it's redundant to write them as comments but I like the organization. -v
+        """
+        # hashbang
         lines = ["#!/bin/bash"]
+
+        # prologs
         lines.extend(self.prologs)
         for s in self.services:
             lines.append(f"systemctl --user start {s}")
+
+        # commands that are derived from jobs or command
         lines.extend(self.commands)
+
+        # stop services
         for s in reversed(self.services):
             lines.append(f"systemctl --user stop {s}")
+
+        # epilogs
         lines.extend(self.epilogs)
+
+        # custom user scripts
+        if self.attributes.logs_dir is not None:
+            lines.append(self.script_save_logs())
         return "\n".join(lines)
+
+    def script_save_logs(self):
+        """
+        Custom saving of logs. This is what we wrote for our peformance study!
+        """
+        script_path = scripts.get_script("save_logs.sh")
+
+        # Determine output directory (use home default if not defined)
+        return f"bash {script_path} {self.attributes.logs_dir}"
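
For orientation, the module handling added to `get_cli_flags` above turns each registered module into a `--conf <name>=true` flag and injects one `--env FLUX_MODPROBE_PATH_APPEND=~/.flux-batch` flag so Flux modprobe picks up the generated rc scripts. A standalone sketch of just that logic (not the class itself), showing the arguments that end up on the `flux batch` command line:

```python
import os

# Mirror of the module-flag logic added to BatchJobspecV1.get_cli_flags in 0.0.11
modules = ["flux-scribe"]
flags = []
if modules:
    # Tell Flux to also look under ~/.flux-batch for modprobe rc scripts
    modprobe_path = os.path.expanduser("~/.flux-batch")
    flags.extend(["--env", f"FLUX_MODPROBE_PATH_APPEND={modprobe_path}"])
    # Enable each module in the instance config so the needs_config filter passes
    for mod in modules:
        flags.extend(["--conf", f"{mod}=true"])

print(flags)
# ['--env', 'FLUX_MODPROBE_PATH_APPEND=/home/<user>/.flux-batch', '--conf', 'flux-scribe=true']
```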

--- flux_batch-0.0.0/flux_batch/models.py
+++ flux_batch-0.0.11/flux_batch/models.py
@@ -24,6 +24,11 @@ class BatchAttributesV1:
     Explicitly defined arguments allowed by flux batch for V1 spec
     """
 
+    # These are added / custom to our module
+    # If logs directory defined (not None) save output there
+    # We force the user to provide something.
+    logs_dir: Optional[Union[bool, str]] = None
+
     # Resources
     nslots: Optional[int] = None # -n
     cores_per_slot: Optional[int] = None # -c

--- /dev/null
+++ flux_batch-0.0.11/flux_batch/script/__init__.py
@@ -0,0 +1,16 @@
+import os
+
+import flux_batch
+
+
+def get_script(name):
+    """
+    Get a script by name
+    """
+    # Find the path to the installed script
+    base_path = os.path.dirname(os.path.abspath(flux_batch.__file__))
+    script_path = os.path.join(base_path, "script", name)
+    if not os.path.exists(script_path):
+        print(f"Warning: script {name} does not exist")
+        return
+    return script_path

--- /dev/null
+++ flux_batch-0.0.11/flux_batch/script/save_logs.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+output=$1
+mkdir -p $output
+
+echo "Saving logs and job metadata to ${output}"
+
+# This will save logs, events, and jobspecs
+for jobid in $(flux jobs -a --json | jq -r .jobs[].id)
+do
+    echo "Parsing jobid ${jobid}"
+    flux job attach $jobid &> $output/${jobid}.out
+    echo "START OF JOBSPEC" >> $output/${jobid}.out
+    flux job info $jobid jobspec >> $output/${jobid}.out
+    echo "START OF EVENTLOG" >> $output/${jobid}.out
+    flux job info $jobid guest.exec.eventlog >> $output/${jobid}.out
+done

--- /dev/null
+++ flux_batch-0.0.11/flux_batch/service/__init__.py
@@ -0,0 +1,81 @@
+import os
+import subprocess
+import sys
+
+import flux_batch.service.scribe as scribe
+
+# Lookup of known services
+services = {"flux-scribe": scribe.SERVICE_TEMPLATE}
+modules = {
+    "flux-scribe": {
+        "startup": scribe.START_MODULE_TEMPLATE,
+        "shutdown": scribe.STOP_MODULE_TEMPLATE,
+        "module": scribe.MODULE_NAME,
+    }
+}
+
+
+def write_modprobe_script(rc_path, script, args=None):
+    """
+    Shared function to write service file.
+    """
+    args = args or {}
+    if not os.path.exists(rc_path):
+        with open(rc_path, "w") as f:
+            f.write(script.format(**args))
+
+
+def ensure_modprobe_scripts(service_name: str):
+    """
+    Ensures rc1.d (start) and rc3.d (stop) scripts exist for the service.
+    """
+    if service_name not in modules:
+        print("Warning: module {service_name} is not known.")
+        return
+
+    # We will add these to FLUX_MODPROBE_PATH_APPEND
+    base_dir = os.path.expanduser("~/.flux-batch")
+    for subdir in ["rc1.d", "rc3.d"]:
+        os.makedirs(os.path.join(base_dir, subdir), exist_ok=True)
+
+    service_func = service_name.replace("-", "_")
+
+    # Path for rc1.d (startup)
+    args = {
+        "service_name": service_name,
+        "service_func": service_func,
+        "python_bin": sys.executable,
+        "module_name": modules[service_name]["module"],
+    }
+    rc1_path = os.path.join(base_dir, "rc1.d", f"{service_name}.py")
+    script = modules[service_name]["startup"]
+    write_modprobe_script(rc1_path, script, args=args)
+
+    # Path for rc3.d (shutdown)
+    args = {"service_name": service_name, "service_func": service_func}
+    rc3_path = os.path.join(base_dir, "rc3.d", f"{service_name}.py")
+    script = modules[service_name]["shutdown"]
+    write_modprobe_script(rc3_path, script, args=args)
+
+
+def ensure_user_service(service_name: str):
+    """
+    Checks for the existence of a systemd service file in the user's home.
+    If it doesn't exist, it creates it and reloads the daemon.
+    """
+    user_systemd_dir = os.path.expanduser("~/.config/systemd/user")
+    os.makedirs(user_systemd_dir, exist_ok=True)
+    service_path = os.path.join(user_systemd_dir, f"{service_name}.service")
+
+    if not os.path.exists(service_path):
+        if service_name in services:
+            template = services[service_name]
+            print(f"[*] Provisioning {service_name} at {service_path}")
+            with open(service_path, "w") as f:
+                f.write(template.format(python_path=sys.executable))
+
+        else:
+            print(f"[*] Service {service_name} is not known, assuming exists.")
+
+        # Reload the user-session manager to recognize the new unit
+        subprocess.run(["systemctl", "--user", "daemon-reload"], check=True)
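
A short sketch of calling the new provisioning helpers directly (this is what `flux_batch.submit.setup` does for each service and module on a spec). It assumes a systemd user session is available and that writing under `~/.config/systemd/user` and `~/.flux-batch` is acceptable:

```python
import flux_batch.service as services

# Writes ~/.config/systemd/user/flux-scribe.service from SERVICE_TEMPLATE
# (if missing) and runs `systemctl --user daemon-reload`
services.ensure_user_service("flux-scribe")

# Writes ~/.flux-batch/rc1.d/flux-scribe.py and ~/.flux-batch/rc3.d/flux-scribe.py
# from the START/STOP modprobe templates (if missing)
services.ensure_modprobe_scripts("flux-scribe")
```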

--- /dev/null
+++ flux_batch-0.0.11/flux_batch/service/scribe/__main__.py
@@ -0,0 +1,113 @@
+#!/usr/bin/env python3
+import errno
+import logging
+import os
+import sys
+import time
+
+import flux
+import flux.job
+
+# Not necessary, but it makes it pretty
+from rich import print
+
+# Use the synchronous version of the backend to avoid asyncio-in-thread conflicts
+from flux_batch.service.scribe.database import SQLAlchemyBackend
+
+# Setup logging to stderr (to avoid polluting stdout if run manually)
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s %(levelname)s: %(message)s", stream=sys.stderr
+)
+logger = logging.getLogger("flux-scribe")
+
+
+class JournalScribe:
+    def __init__(self, db_url: str):
+        """
+        Initializes the Scribe with a synchronous DB backend and a Flux Journal Consumer.
+        """
+        # Setup Database
+        logger.info(f"Connecting to Database: {db_url}")
+        self.db = SQLAlchemyBackend(db_url)
+        self.db.initialize()
+
+        try:
+            self.handle = flux.Flux()
+            logger.info("Connected to Flux instance.")
+        except Exception as e:
+            logger.critical(f"Failed to connect to Flux: {e}")
+            sys.exit(1)
+
+        # Initialize Journal Consumer
+        # This consumes the global event log for the entire instance
+        self.consumer = flux.job.JournalConsumer(self.handle)
+        self.running = True
+
+    def _normalize_event(self, event) -> dict:
+        """
+        Converts a Flux event object into the dictionary format expected by record_event.
+        Matches the logic provided in your EventsEngine reference.
+        """
+        # Convert the SWIG/CFFI event object to a dictionary
+        payload = dict(event)
+
+        return {
+            "id": str(getattr(event, "jobid", "unknown")),
+            "type": getattr(event, "name", "unknown"),
+            "timestamp": getattr(event, "timestamp", time.time()),
+            "payload": payload,
+            "R": getattr(event, "R", None),
+            "jobspec": getattr(event, "jobspec", None),
+        }
+
+    def run(self):
+        """
+        Main execution loop. Polls the journal and writes to the DB.
+        """
+        try:
+            logger.info("🚀 Flux Scribe (Journal Consumer) started.")
+            self.consumer.start()
+
+            while self.running:
+                try:
+                    # Non-blocking poll (100ms timeout)
+                    # This allows the loop to check for shutdown signals regularly
+                    event = self.consumer.poll(timeout=0.1)
+
+                    if event:
+                        print(event)
+                        # We only care about events associated with a job
+                        if hasattr(event, "jobid"):
+                            clean_event = self._normalize_event(event)
+                            self.db.record_event("local", clean_event)
+                    else:
+                        # If no event, yield a tiny bit of CPU
+                        time.sleep(0.01)
+
+                except EnvironmentError as e:
+                    # Ignore timeouts (no data)
+                    if e.errno == errno.ETIMEDOUT:
+                        continue
+                    logger.error(f"Flux connection error: {e}")
+                    time.sleep(1)
+
+                except Exception as e:
+                    logger.error(f"Unexpected error in event loop: {e}")
+                    time.sleep(1)
+
+        except Exception as e:
+            logger.critical(f"EventsEngine crashed: {e}")
+        finally:
+            self.db.close()
+            logger.info("EventsEngine thread exiting.")
+
+
+def main():
+    # Retrieve DB path from environment or use a default
+    db_path = os.environ.get("FLUX_SCRIBE_DATABASE", "sqlite:///server_state.db")
+    scribe = JournalScribe(db_path)
+    scribe.run()
+
+
+if __name__ == "__main__":
+    main()
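
This `__main__` module is what the generated systemd unit (and the modprobe startup task) ultimately runs. A hedged sketch of launching it by hand, assuming it executes inside a running Flux instance with the `scribe` extra installed; the sqlite path is just an example:

```python
import os
import runpy

# Same variable the README example exports; the daemon defaults to
# sqlite:///server_state.db when it is unset.
os.environ.setdefault("FLUX_SCRIBE_DATABASE", "sqlite:///flux-batch-job.db")

# Equivalent to `python3 -m flux_batch.service.scribe`, the ExecStart of the
# generated unit; blocks and consumes the journal until interrupted.
runpy.run_module("flux_batch.service.scribe", run_name="__main__")
```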

--- /dev/null
+++ flux_batch-0.0.11/flux_batch/service/scribe/database.py
@@ -0,0 +1,150 @@
+import time
+from typing import Any, Dict, List, Optional
+
+from sqlalchemy import and_, create_engine, select, update
+from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine
+from sqlalchemy.orm import sessionmaker
+
+from flux_batch.service.scribe.models import Base, EventModel, EventRecord, JobModel, JobRecord
+
+
+def _record_event_internal(session, cluster: str, event: Dict[str, Any]):
+    """
+    Shared synchronous logic for recording events.
+    Used by both Sync and Async backends.
+    """
+    job_id = event.get("id")
+    event_type = event.get("type")
+    data = event.get("payload", {})
+    timestamp = event.get("timestamp", time.time())
+
+    new_event = EventModel(
+        job_id=job_id,
+        cluster=cluster,
+        timestamp=timestamp,
+        event_type=event_type,
+        payload=data,
+    )
+    session.add(new_event)
+
+    if event_type == "submit":
+        stmt = select(JobModel).where(and_(JobModel.job_id == job_id, JobModel.cluster == cluster))
+        job = session.execute(stmt).scalar_one_or_none()
+
+        if not job:
+            job = JobModel(
+                job_id=job_id,
+                cluster=cluster,
+                user=str(data.get("userid", "unknown")),
+                state="submitted",
+                workdir=data.get("cwd", ""),
+                submit_time=timestamp,
+                last_updated=timestamp,
+            )
+            session.add(job)
+        else:
+            job.state = "submitted"
+            job.last_updated = timestamp
+
+    # state transitions
+    elif event_type == "state" or (event_type and event_type.endswith(".finish")):
+        state_name = data.get("state_name", event_type)
+        stmt = select(JobModel).where(and_(JobModel.job_id == job_id, JobModel.cluster == cluster))
+        job = session.execute(stmt).scalar_one_or_none()
+        if job:
+            job.state = state_name
+            job.last_updated = time.time()
+            if "status" in data:
+                job.exit_code = data["status"]
+
+
+class AsyncSQLAlchemyBackend:
+    """
+    Asynchronous backend for the MCP Gateway.
+    """
+
+    def __init__(self, db_url: str):
+        self.engine = create_async_engine(db_url, echo=False)
+        self.SessionLocal = async_sessionmaker(self.engine, expire_on_commit=False)
+
+    async def initialize(self):
+        async with self.engine.begin() as conn:
+            await conn.run_sync(Base.metadata.create_all)
+
+    async def close(self):
+        await self.engine.dispose()
+
+    async def record_event(self, cluster: str, event: Dict[str, Any]):
+        async with self.SessionLocal() as session:
+            # run_sync bridges our shared logic into the async session
+            await session.run_sync(_record_event_internal, cluster, event)
+            await session.commit()
+
+    async def get_job(self, cluster: str, job_id: int) -> Optional[JobRecord]:
+        async with self.SessionLocal() as session:
+            result = await session.execute(
+                select(JobModel).where(and_(JobModel.job_id == job_id, JobModel.cluster == cluster))
+            )
+            job = result.scalar_one_or_none()
+            return job.to_record() if job else None
+
+    async def get_event_history(self, cluster: str, job_id: int) -> List[EventRecord]:
+        async with self.SessionLocal() as session:
+            result = await session.execute(
+                select(EventModel)
+                .where(and_(EventModel.job_id == job_id, EventModel.cluster == cluster))
+                .order_by(EventModel.timestamp.asc())
+            )
+            return [e.to_record() for e in result.scalars().all()]
+
+    async def search_jobs(
+        self, cluster: str = None, state: str = None, limit: int = 10
+    ) -> List[JobRecord]:
+        async with self.SessionLocal() as session:
+            stmt = select(JobModel)
+            if cluster:
+                stmt = stmt.where(JobModel.cluster == cluster)
+            if state:
+                stmt = stmt.where(JobModel.state == state)
+            result = await session.execute(stmt.limit(limit))
+            return [j.to_record() for j in result.scalars().all()]
+
+
+class SQLAlchemyBackend:
+    """
+    Synchronous backend for the standalone Scribe daemon.
+    """
+
+    def __init__(self, db_url: str):
+        # strip 'aiosqlite+' or similar if passed from shared config
+        url = db_url.replace("+aiosqlite", "").replace("+asyncpg", "")
+        self.engine = create_engine(url, echo=False)
+        self.SessionLocal = sessionmaker(bind=self.engine, expire_on_commit=False)
+
+    def initialize(self):
+        Base.metadata.create_all(self.engine)
+
+    def close(self):
+        self.engine.dispose()
+
+    def record_event(self, cluster: str, event: Dict[str, Any]):
+        with self.SessionLocal() as session:
+            with session.begin():
+                _record_event_internal(session, cluster, event)
+
+    def get_unwatched_job_ids(self, cluster: str) -> List[int]:
+        """Specific for Scribe: find jobs that need a watcher."""
+        with self.SessionLocal() as session:
+            stmt = select(JobModel.job_id).where(
+                and_(JobModel.cluster == cluster, JobModel.state == "submitted")
+            )
+            return list(session.execute(stmt).scalars().all())
+
+    def mark_job_as_watched(self, cluster: str, job_id: int):
+        with self.SessionLocal() as session:
+            with session.begin():
+                session.execute(
+                    update(JobModel)
+                    .where(and_(JobModel.job_id == job_id, JobModel.cluster == cluster))
+                    .values(state="watching")
+                )
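
A small usage sketch of the synchronous backend, using the same event dictionary shape that `JournalScribe._normalize_event` produces and `record_event` consumes. It assumes `sqlalchemy` is installed; the sqlite URL and job id are arbitrary examples:

```python
import time

from flux_batch.service.scribe.database import SQLAlchemyBackend

db = SQLAlchemyBackend("sqlite:///flux-batch-job.db")
db.initialize()

# A "submit" event inserts an events row and creates (or updates) a jobs row
db.record_event(
    "local",
    {
        "id": "123456",
        "type": "submit",
        "timestamp": time.time(),
        "payload": {"userid": 1000, "cwd": "/tmp"},
    },
)

# Jobs still in the "submitted" state that have no watcher yet
print(db.get_unwatched_job_ids("local"))
db.close()
```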

--- /dev/null
+++ flux_batch-0.0.11/flux_batch/service/scribe/models.py
@@ -0,0 +1,94 @@
+from dataclasses import dataclass
+from typing import Any, Dict, Optional
+
+from sqlalchemy import JSON, Float, Integer, String
+from sqlalchemy.ext.asyncio import AsyncAttrs
+from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column
+
+# DTOs are "Public Data Transfer Objects" and they are used by
+# our interfaces and tools
+
+
+@dataclass
+class JobRecord:
+    """
+    Represents a snapshot of a job state.
+    Returned by get_job() and search_jobs().
+    """
+
+    job_id: int
+    cluster: str
+    state: str
+    user: str
+    workdir: Optional[str] = None
+    exit_code: Optional[int] = None
+    submit_time: float = 0.0
+    last_updated: float = 0.0
+
+
+@dataclass
+class EventRecord:
+    """
+    Represents a single historical event.
+    Returned by get_event_history().
+    """
+
+    timestamp: float
+    event_type: str
+    payload: Dict[str, Any]
+
+
+# Database models for SQLAlchemy ORM
+
+
+class Base(AsyncAttrs, DeclarativeBase):
+    pass
+
+
+class JobModel(Base):
+    __tablename__ = "jobs"
+
+    # Composite Primary Key
+    job_id: Mapped[int] = mapped_column(Integer, primary_key=True)
+    cluster: Mapped[str] = mapped_column(String(255), primary_key=True)
+
+    state: Mapped[str] = mapped_column(String(50))
+    user: Mapped[str] = mapped_column(String(255), nullable=True)
+    workdir: Mapped[Optional[str]] = mapped_column(String, nullable=True)
+    exit_code: Mapped[Optional[int]] = mapped_column(Integer, nullable=True)
+    submit_time: Mapped[float] = mapped_column(Float, default=0.0)
+    last_updated: Mapped[float] = mapped_column(Float, default=0.0)
+
+    def to_record(self) -> JobRecord:
+        """
+        Helper to convert ORM model to public DTO
+        """
+        return JobRecord(
+            job_id=self.job_id,
+            cluster=self.cluster,
+            state=self.state,
+            user=self.user,
+            workdir=self.workdir,
+            exit_code=self.exit_code,
+            submit_time=self.submit_time,
+            last_updated=self.last_updated,
+        )
+
+
+class EventModel(Base):
+    __tablename__ = "events"
+
+    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
+    job_id: Mapped[int] = mapped_column(Integer, index=True)
+    cluster: Mapped[str] = mapped_column(String(255), index=True)
+    timestamp: Mapped[float] = mapped_column(Float)
+    event_type: Mapped[str] = mapped_column(String(50))
+    payload: Mapped[Dict[str, Any]] = mapped_column(JSON)
+
+    def to_record(self) -> EventRecord:
+        """
+        Helper to convert ORM model to public DTO
+        """
+        return EventRecord(
+            timestamp=self.timestamp, event_type=self.event_type, payload=self.payload
+        )

--- /dev/null
+++ flux_batch-0.0.11/flux_batch/service/scribe/template.py
@@ -0,0 +1,53 @@
+# Template for the Scribe Journal Consumer
+SERVICE_TEMPLATE = """[Unit]
+Description=Flux Scribe Journal Consumer
+After=network.target
+
+[Service]
+ExecStart={python_path} -m flux_batch.service.scribe
+Restart=on-failure
+
+[Install]
+WantedBy=default.target
+"""
+
+START_MODULE_TEMPLATE = """
+from flux.modprobe import task
+import flux.subprocess as subprocess
+
+@task(
+    "start-{service_name}",
+    ranks="0",
+    needs_config=["{service_name}"],
+    after=["resource", "job-list"],
+)
+def start_{service_func}(context):
+    # This triggers the systemd user service provisioned earlier
+    # context.bash("systemctl --user start {service_name}")
+    subprocess.rexec_bg(
+        context.handle,
+        ["{python_bin}", "-m", "{module_name}"],
+        label="{service_name}",
+        nodeid=0
+    )
+"""
+
+STOP_MODULE_TEMPLATE = """
+from flux.modprobe import task
+import flux.subprocess as subprocess
+
+@task(
+    "stop-{service_name}",
+    ranks="0",
+    needs_config=["{service_name}"],
+    before=["resource", "job-list"],
+)
+def stop_{service_func}(context):
+    # context.bash("systemctl --user stop {service_name}")
+    subprocess.kill(context.handle, signum=2, label="{service_name}").get()
+    try:
+        status = subprocess.wait(context.handle, label="{service_name}").get()["status"]
+        print(status)
+    except:
+        pass
+"""
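
To see what actually lands in `~/.flux-batch/rc1.d/`, the startup template can be rendered the same way `write_modprobe_script` does, via `script.format(**args)`. The argument values below are illustrative; in particular `module_name` normally comes from `scribe.MODULE_NAME`, which is not shown in this diff, so the value here is an assumption:

```python
import sys

from flux_batch.service.scribe.template import START_MODULE_TEMPLATE

# Same keys that ensure_modprobe_scripts passes for the rc1.d script
args = {
    "service_name": "flux-scribe",
    "service_func": "flux_scribe",
    "python_bin": sys.executable,
    "module_name": "flux_batch.service.scribe",  # assumed value of scribe.MODULE_NAME
}
print(START_MODULE_TEMPLATE.format(**args))
```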

--- /dev/null
+++ flux_batch-0.0.11/flux_batch/submit.py
@@ -0,0 +1,86 @@
+import json
+import os
+import stat
+import subprocess
+import tempfile
+
+import flux
+import flux.job
+
+import flux_batch.models as models
+import flux_batch.service as services
+import flux_batch.utils as utils
+
+
+def setup(spec):
+    """
+    shared function to generate services / modules from a spec.
+    """
+    # Provision services (like flux-scribe) if requested
+    for service in spec.services:
+        services.ensure_user_service(service)
+    for module in spec.modules:
+        services.ensure_modprobe_scripts(module)
+
+
+def generate_jobspec(spec, script, wrapper_path):
+    """
+    Shared function to write a script to a wrapper path and generate
+    a jobspec for it via flux batch --dry-run.
+    """
+    utils.write_file(script, wrapper_path)
+
+    # Make the script executable so 'flux batch' can analyze it
+    os.chmod(wrapper_path, os.stat(wrapper_path).st_mode | stat.S_IEXEC)
+
+    # Generate the RFC 25 Jobspec JSON via the Flux CLI
+    # This handles all resource mapping (-N, -n, etc.)
+    cmd = ["flux", "batch"] + spec.get_cli_flags() + ["--dry-run", wrapper_path]
+
+    try:
+        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
+    except subprocess.CalledProcessError as e:
+        print(f"Error during flux batch dryrun: {e.stderr}")
+        raise
+    return result.stdout
+
+
+def preview(spec: models.BatchJobV1) -> int:
+    """
+    Preview the jobspec.
+    """
+    setup(spec)
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+
+        # Write the wrapper script (handling prologs, services, and jobs)
+        wrapper_path = os.path.join(tmpdir, "wrapper.sh")
+        script = spec.generate_wrapper_script()
+        jobspec = generate_jobspec(spec, script, wrapper_path)
+        return json.loads(jobspec)
+
+
+def submit(handle: flux.Flux, spec: models.BatchJobV1, dry_run=False) -> int:
+    """
+    Orchestrates the submission process:
+    1. Provisions any required user-space services.
+    2. Generates the wrapper shell script.
+    3. Uses 'flux batch --dryrun' to compile the Jobspec JSON.
+    4. Submits the Jobspec to the Flux instance.
+    """
+    setup(spec)
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        # Write the wrapper script (handling prologs, services, and jobs)
+        wrapper_path = os.path.join(tmpdir, "wrapper.sh")
+
+        # dry run here just displays it
+        script = spec.generate_wrapper_script()
+        if dry_run:
+            return script
+
+        jobspec = generate_jobspec(spec, script, wrapper_path)
+
+        # Submit the JSON string to the Flux instance
+        # The result.stdout contains the raw JSON Jobspec
+        return flux.job.submit(handle, jobspec)
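
Note that the package `__init__` (earlier in this diff) re-exports `preview` under the alias `flux_batch.jobspec`, so the RFC 25 jobspec can be inspected without submitting. A quick sketch, again assuming `from_command` keyword arguments flow through to the batch attributes:

```python
import json

import flux_batch

spec = flux_batch.BatchJobspecV1.from_command(["sleep", "10"], nodes=1)

# preview() writes the wrapper script, runs `flux batch --dry-run` on it,
# and returns the parsed jobspec as a dict
print(json.dumps(flux_batch.jobspec(spec), indent=2))
```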

--- flux_batch-0.0.0/flux_batch/version.py
+++ flux_batch-0.0.11/flux_batch/version.py
@@ -1,4 +1,4 @@
-__version__ = "0.0.0"
+__version__ = "0.0.11"
 AUTHOR = "Vanessa Sochat"
 AUTHOR_EMAIL = "vsoch@users.noreply.github.com"
 NAME = "flux-batch"
@@ -13,4 +13,5 @@ INSTALL_REQUIRES = (
 )
 
 TESTS_REQUIRES = (("pytest", {"min_version": "4.6.2"}),)
-
+SCRIBE_REQUIRES = (("sqlalchemy", {"min_version": None}), ("rich", {"min_version": None}))
+INSTALL_REQUIRES_ALL = INSTALL_REQUIRES + TESTS_REQUIRES + SCRIBE_REQUIRES

--- flux_batch-0.0.0/PKG-INFO
+++ flux_batch-0.0.11/flux_batch.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: flux-batch
-Version: 0.0.0
+Version: 0.0.11
 Summary: Python SDK for flux batch jobs and services
 Home-page: https://github.com/converged-computing/flux-batch
 Author: Vanessa Sochat
@@ -26,12 +26,17 @@ Provides-Extra: all
 Requires-Dist: pyyaml; extra == "all"
 Requires-Dist: ply; extra == "all"
 Requires-Dist: pytest>=4.6.2; extra == "all"
+Requires-Dist: sqlalchemy; extra == "all"
+Requires-Dist: rich; extra == "all"
+Provides-Extra: scribe
+Requires-Dist: sqlalchemy; extra == "scribe"
+Requires-Dist: rich; extra == "scribe"
 
 # flux-batch
 
 > Python SDK to generate Flux batch jobs and services
 
-
+![img](https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fzenodo.org%2Fapi%2Frecords%2F17929049&query=%24.metadata.doi&label=DOI)
 
 ![img](https://raw.githubusercontent.com/converged-computing/flux-batch/main/img/flux-batch.png)
 
@@ -46,6 +51,7 @@ Requires-Dist: pytest>=4.6.2; extra == "all"
 
 - **flux-scribe**: Write job events to a local sqlite database via the JournalConsumer (not added yet, written and needs testing)
 
+
 ## Usage
 
 This is a small Flux utility that makes it easy to create Flux batch jobs and services.
@@ -60,9 +66,26 @@ flux start
 pip install -e . --break-system-packages
 ```
 
-###
+### Examples
+
+We have a few simple examples:
+
+#### Saving Logs
+
+```bash
+python3 ./examples/save_logs.py
+```
+
+#### Flux Scribe Module
+
+```bash
+export FLUX_SCRIBE_DATABASE=sqlite:///flux-batch-job.db
+python3 ./examples/flux_scribe_module.py
+```
+
+#### General Test
 
-
+Or run the controlled example to see a batch job with prolog and epilog run and complete:
 
 ```bash
 python3 ./tests/test_flux_batch.py
@@ -117,7 +140,9 @@ jobspec = flux_batch.BatchJobspecV1.from_jobs(
     nodes=1,
     nslots=1,
     time_limit="10m",
-    job_name="test-batch"
+    job_name="test-batch",
+    # Add saving of logs, info, and metadata
+    logs_dir="./logs",
 )
 
 # Add a prolog and epilog
@@ -125,7 +150,7 @@ jobspec.add_prolog("echo 'Batch Wrapper Starting'")
 jobspec.add_epilog("echo 'Batch Wrapper Finished'")
 
 # Add a service (this assumes user level that exists)
-
+jobspec.add_service("flux-scribe")
 
 # Preview it
 print(flux_batch.submit(handle, jobspec, dry_run=True))

--- flux_batch-0.0.0/flux_batch.egg-info/SOURCES.txt
+++ flux_batch-0.0.11/flux_batch.egg-info/SOURCES.txt
@@ -20,8 +20,14 @@ flux_batch.egg-info/top_level.txt
 flux_batch/logger/__init__.py
 flux_batch/logger/generate.py
 flux_batch/logger/logger.py
+flux_batch/script/__init__.py
+flux_batch/script/save_logs.sh
 flux_batch/service/__init__.py
-flux_batch/service/scribe.py
+flux_batch/service/scribe/__init__.py
+flux_batch/service/scribe/__main__.py
+flux_batch/service/scribe/database.py
+flux_batch/service/scribe/models.py
+flux_batch/service/scribe/template.py
 flux_batch/utils/__init__.py
 flux_batch/utils/fileio.py
 flux_batch/utils/text.py

--- flux_batch-0.0.0/setup.py
+++ flux_batch-0.0.11/setup.py
@@ -62,6 +62,7 @@ if __name__ == "__main__":
     INSTALL_REQUIRES = get_reqs(lookup)
     TESTS_REQUIRES = get_reqs(lookup, "TESTS_REQUIRES")
     INSTALL_REQUIRES_ALL = get_reqs(lookup, "INSTALL_REQUIRES_ALL")
+    INSTALL_REQUIRES_SCRIBE = get_reqs(lookup, "SCRIBE_REQUIRES")
 
     setup(
         name=NAME,
@@ -83,6 +84,7 @@ if __name__ == "__main__":
         tests_require=TESTS_REQUIRES,
         extras_require={
             "all": [INSTALL_REQUIRES_ALL],
+            "scribe": [INSTALL_REQUIRES_SCRIBE],
         },
         classifiers=[
             "Intended Audience :: Science/Research",

--- flux_batch-0.0.0/flux_batch/service/__init__.py
+++ /dev/null
@@ -1,31 +0,0 @@
-import os
-import subprocess
-import sys
-
-from .scribe import SERVICE_TEMPLATE as scribe_template
-
-# Lookup of known services
-services = {"scribe": scribe_template}
-
-
-def ensure_user_service(service_name: str):
-    """
-    Checks for the existence of a systemd service file in the user's home.
-    If it doesn't exist, it creates it and reloads the daemon.
-    """
-    user_systemd_dir = os.path.expanduser("~/.config/systemd/user")
-    os.makedirs(user_systemd_dir, exist_ok=True)
-    service_path = os.path.join(user_systemd_dir, f"{service_name}.service")
-
-    if not os.path.exists(service_path):
-        if service_name in services:
-            template = services[service_name]
-            print(f"[*] Provisioning {service_name} at {service_path}")
-            with open(service_path, "w") as f:
-                f.write(template.format(python_path=sys.executable))
-
-        else:
-            print(f"[*] Service {service_name} is not known, assuming exists.")
-
-        # Reload the user-session manager to recognize the new unit
-        subprocess.run(["systemctl", "--user", "daemon-reload"], check=True)

--- flux_batch-0.0.0/flux_batch/service/scribe.py
+++ /dev/null
@@ -1,12 +0,0 @@
-# Template for the Scribe Journal Consumer
-SERVICE_TEMPLATE = """[Unit]
-Description=Flux Scribe Journal Consumer
-After=network.target
-
-[Service]
-ExecStart={python_path} -m flux_mcp_server.scribe
-Restart=on-failure
-
-[Install]
-WantedBy=default.target
-"""

--- flux_batch-0.0.0/flux_batch/submit.py
+++ /dev/null
@@ -1,53 +0,0 @@
-import os
-import stat
-import subprocess
-import tempfile
-
-import flux
-import flux.job
-
-import flux_batch.models as models
-import flux_batch.utils as utils
-from flux_batch.service import ensure_user_service
-
-
-def submit(handle: flux.Flux, spec: models.BatchJobV1, dry_run=False) -> int:
-    """
-    Orchestrates the submission process:
-    1. Provisions any required user-space services.
-    2. Generates the wrapper shell script.
-    3. Uses 'flux batch --dryrun' to compile the Jobspec JSON.
-    4. Submits the Jobspec to the Flux instance.
-    """
-
-    # Provision services (like flux-scribe) if requested
-    for service in spec.services:
-        ensure_user_service(service)
-
-    with tempfile.TemporaryDirectory() as tmpdir:
-        # Write the wrapper script (handling prologs, services, and jobs)
-        wrapper_path = os.path.join(tmpdir, "wrapper.sh")
-
-        # dry run here just displays it
-        script = spec.generate_wrapper_script()
-        if dry_run:
-            return script
-
-        utils.write_file(script, wrapper_path)
-
-        # Make the script executable so 'flux batch' can analyze it
-        os.chmod(wrapper_path, os.stat(wrapper_path).st_mode | stat.S_IEXEC)
-
-        # Generate the RFC 25 Jobspec JSON via the Flux CLI
-        # This handles all resource mapping (-N, -n, etc.)
-        cmd = ["flux", "batch"] + spec.get_cli_flags() + ["--dry-run", wrapper_path]
-
-        try:
-            result = subprocess.run(cmd, capture_output=True, text=True, check=True)
-        except subprocess.CalledProcessError as e:
-            print(f"Error during flux batch dryrun: {e.stderr}")
-            raise
-
-        # Submit the JSON string to the Flux instance
-        # The result.stdout contains the raw JSON Jobspec
-        return flux.job.submit(handle, result.stdout)