flux-batch 0.0.0.tar.gz → 0.0.11.tar.gz

This diff shows the changes between publicly available versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (39)
  1. {flux_batch-0.0.0/flux_batch.egg-info → flux_batch-0.0.11}/PKG-INFO +31 -6
  2. {flux_batch-0.0.0 → flux_batch-0.0.11}/README.md +25 -5
  3. {flux_batch-0.0.0 → flux_batch-0.0.11}/flux_batch/__init__.py +2 -1
  4. {flux_batch-0.0.0 → flux_batch-0.0.11}/flux_batch/jobspec.py +48 -1
  5. {flux_batch-0.0.0 → flux_batch-0.0.11}/flux_batch/models.py +5 -0
  6. flux_batch-0.0.11/flux_batch/script/__init__.py +16 -0
  7. flux_batch-0.0.11/flux_batch/script/save_logs.sh +16 -0
  8. flux_batch-0.0.11/flux_batch/service/__init__.py +81 -0
  9. flux_batch-0.0.11/flux_batch/service/scribe/__init__.py +2 -0
  10. flux_batch-0.0.11/flux_batch/service/scribe/__main__.py +113 -0
  11. flux_batch-0.0.11/flux_batch/service/scribe/database.py +150 -0
  12. flux_batch-0.0.11/flux_batch/service/scribe/models.py +94 -0
  13. flux_batch-0.0.11/flux_batch/service/scribe/template.py +53 -0
  14. flux_batch-0.0.11/flux_batch/submit.py +86 -0
  15. {flux_batch-0.0.0 → flux_batch-0.0.11}/flux_batch/version.py +3 -2
  16. {flux_batch-0.0.0 → flux_batch-0.0.11/flux_batch.egg-info}/PKG-INFO +31 -6
  17. {flux_batch-0.0.0 → flux_batch-0.0.11}/flux_batch.egg-info/SOURCES.txt +7 -1
  18. {flux_batch-0.0.0 → flux_batch-0.0.11}/flux_batch.egg-info/requires.txt +6 -0
  19. {flux_batch-0.0.0 → flux_batch-0.0.11}/setup.py +2 -0
  20. flux_batch-0.0.0/flux_batch/service/__init__.py +0 -31
  21. flux_batch-0.0.0/flux_batch/service/scribe.py +0 -12
  22. flux_batch-0.0.0/flux_batch/submit.py +0 -53
  23. {flux_batch-0.0.0 → flux_batch-0.0.11}/LICENSE +0 -0
  24. {flux_batch-0.0.0 → flux_batch-0.0.11}/MANIFEST.in +0 -0
  25. {flux_batch-0.0.0 → flux_batch-0.0.11}/NOTICE +0 -0
  26. {flux_batch-0.0.0 → flux_batch-0.0.11}/flux_batch/logger/__init__.py +0 -0
  27. {flux_batch-0.0.0 → flux_batch-0.0.11}/flux_batch/logger/generate.py +0 -0
  28. {flux_batch-0.0.0 → flux_batch-0.0.11}/flux_batch/logger/logger.py +0 -0
  29. {flux_batch-0.0.0 → flux_batch-0.0.11}/flux_batch/utils/__init__.py +0 -0
  30. {flux_batch-0.0.0 → flux_batch-0.0.11}/flux_batch/utils/fileio.py +0 -0
  31. {flux_batch-0.0.0 → flux_batch-0.0.11}/flux_batch/utils/text.py +0 -0
  32. {flux_batch-0.0.0 → flux_batch-0.0.11}/flux_batch/utils/timer.py +0 -0
  33. {flux_batch-0.0.0 → flux_batch-0.0.11}/flux_batch.egg-info/dependency_links.txt +0 -0
  34. {flux_batch-0.0.0 → flux_batch-0.0.11}/flux_batch.egg-info/entry_points.txt +0 -0
  35. {flux_batch-0.0.0 → flux_batch-0.0.11}/flux_batch.egg-info/not-zip-safe +0 -0
  36. {flux_batch-0.0.0 → flux_batch-0.0.11}/flux_batch.egg-info/top_level.txt +0 -0
  37. {flux_batch-0.0.0 → flux_batch-0.0.11}/pyproject.toml +0 -0
  38. {flux_batch-0.0.0 → flux_batch-0.0.11}/setup.cfg +0 -0
  39. {flux_batch-0.0.0 → flux_batch-0.0.11}/tests/test_flux_batch.py +0 -0
{flux_batch-0.0.0/flux_batch.egg-info → flux_batch-0.0.11}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: flux-batch
-Version: 0.0.0
+Version: 0.0.11
 Summary: Python SDK for flux batch jobs and services
 Home-page: https://github.com/converged-computing/flux-batch
 Author: Vanessa Sochat
@@ -26,12 +26,17 @@ Provides-Extra: all
 Requires-Dist: pyyaml; extra == "all"
 Requires-Dist: ply; extra == "all"
 Requires-Dist: pytest>=4.6.2; extra == "all"
+Requires-Dist: sqlalchemy; extra == "all"
+Requires-Dist: rich; extra == "all"
+Provides-Extra: scribe
+Requires-Dist: sqlalchemy; extra == "scribe"
+Requires-Dist: rich; extra == "scribe"

 # flux-batch

 > Python SDK to generate Flux batch jobs and services

-[![PyPI version](https://badge.fury.io/py/flux-batch.svg)](https://badge.fury.io/py/flux-batch)
+![PyPI - Version](https://img.shields.io/pypi/v/flux-batch)

 ![https://github.com/converged-computing/flux-batch/raw/main/img/flux-batch-small.png](https://github.com/converged-computing/flux-batch/raw/main/img/flux-batch-small.png)

@@ -46,6 +51,7 @@ Requires-Dist: pytest>=4.6.2; extra == "all"

 - **flux-scribe**: Write job events to a local sqlite database via the JournalConsumer (not added yet, written and needs testing)

+
 ## Usage

 This is a small Flux utility that makes it easy to create Flux batch jobs and services.
@@ -60,9 +66,26 @@ flux start
 pip install -e . --break-system-packages
 ```

-### Example
+### Examples
+
+We have a few simple examples:
+
+#### Saving Logs
+
+```bash
+python3 ./examples/save_logs.py
+```
+
+#### Flux Scribe Module
+
+```bash
+export FLUX_SCRIBE_DATABASE=sqlite:///flux-batch-job.db
+python3 ./examples/flux_scribe_module.py
+```
+
+#### General Test

-Run the controlled example to see a batch job with prolog and epilog run and complete:
+Or run the controlled example to see a batch job with prolog and epilog run and complete:

 ```bash
 python3 ./tests/test_flux_batch.py
@@ -117,7 +140,9 @@ jobspec = flux_batch.BatchJobspecV1.from_jobs(
     nodes=1,
     nslots=1,
     time_limit="10m",
-    job_name="test-batch"
+    job_name="test-batch",
+    # Add saving of logs, info, and metadata
+    logs_dir="./logs",
 )

 # Add a prolog and epilog
@@ -125,7 +150,7 @@ jobspec.add_prolog("echo 'Batch Wrapper Starting'")
 jobspec.add_epilog("echo 'Batch Wrapper Finished'")

 # Add a service (this assumes user level that exists)
-# jobspec.add_service("my-service'")
+jobspec.add_service("flux-scribe")

 # Preview it
 print(flux_batch.submit(handle, jobspec, dry_run=True))

{flux_batch-0.0.0 → flux_batch-0.0.11}/README.md

@@ -2,7 +2,7 @@

 > Python SDK to generate Flux batch jobs and services

-[![PyPI version](https://badge.fury.io/py/flux-batch.svg)](https://badge.fury.io/py/flux-batch)
+![PyPI - Version](https://img.shields.io/pypi/v/flux-batch)

 ![https://github.com/converged-computing/flux-batch/raw/main/img/flux-batch-small.png](https://github.com/converged-computing/flux-batch/raw/main/img/flux-batch-small.png)

@@ -17,6 +17,7 @@

 - **flux-scribe**: Write job events to a local sqlite database via the JournalConsumer (not added yet, written and needs testing)

+
 ## Usage

 This is a small Flux utility that makes it easy to create Flux batch jobs and services.
@@ -31,9 +32,26 @@ flux start
 pip install -e . --break-system-packages
 ```

-### Example
+### Examples
+
+We have a few simple examples:
+
+#### Saving Logs
+
+```bash
+python3 ./examples/save_logs.py
+```
+
+#### Flux Scribe Module
+
+```bash
+export FLUX_SCRIBE_DATABASE=sqlite:///flux-batch-job.db
+python3 ./examples/flux_scribe_module.py
+```
+
+#### General Test

-Run the controlled example to see a batch job with prolog and epilog run and complete:
+Or run the controlled example to see a batch job with prolog and epilog run and complete:

 ```bash
 python3 ./tests/test_flux_batch.py
@@ -88,7 +106,9 @@ jobspec = flux_batch.BatchJobspecV1.from_jobs(
     nodes=1,
     nslots=1,
     time_limit="10m",
-    job_name="test-batch"
+    job_name="test-batch",
+    # Add saving of logs, info, and metadata
+    logs_dir="./logs",
 )

 # Add a prolog and epilog
@@ -96,7 +116,7 @@ jobspec.add_prolog("echo 'Batch Wrapper Starting'")
 jobspec.add_epilog("echo 'Batch Wrapper Finished'")

 # Add a service (this assumes user level that exists)
-# jobspec.add_service("my-service'")
+jobspec.add_service("flux-scribe")

 # Preview it
 print(flux_batch.submit(handle, jobspec, dry_run=True))

{flux_batch-0.0.0 → flux_batch-0.0.11}/flux_batch/__init__.py

@@ -1,7 +1,8 @@
 from .jobspec import BatchJobspecV1
 from .models import BatchAttributesV1, BatchJobV1
+from .submit import preview as jobspec
 from .submit import submit

-__all__ = ["BatchJobV1", "BatchAttributesV1", "BatchJobspecV1", "submit"]
+__all__ = ["BatchJobV1", "BatchAttributesV1", "BatchJobspecV1", "submit", "jobspec"]

 from .version import __version__  # noqa

{flux_batch-0.0.0 → flux_batch-0.0.11}/flux_batch/jobspec.py

@@ -1,7 +1,9 @@
+import os
 import shlex
 from typing import List

 import flux_batch.models as models
+import flux_batch.script as scripts


 class BatchJobspecV1:
@@ -18,6 +20,7 @@ class BatchJobspecV1:
         self.prologs: List[str] = []
         self.epilogs: List[str] = []
         self.services: List[str] = []
+        self.modules: List[str] = []

     @classmethod
     def from_command(cls, command: List[str], **kwargs):
@@ -51,6 +54,9 @@ class BatchJobspecV1:
     def add_epilog(self, cmd: str):
         self.epilogs.append(cmd)

+    def add_module(self, service_name: str):
+        self.modules.append(service_name)
+
     def get_cli_flags(self) -> List[str]:
         """
         Converts BatchAttributesV1 into a list of strings for subprocess.
@@ -107,8 +113,25 @@ class BatchJobspecV1:
                 for val in getattr(attr, field_name):
                     flags.extend([flag, str(val)])

+        # If we have modules, ensure they are added --conf <module>=true
+        if self.modules:
+            # Tell Flux to look in our user home for rc scripts
+            modprobe_path = os.path.expanduser("~/.flux-batch")
+            flags.extend(["--env", f"FLUX_MODPROBE_PATH_APPEND={modprobe_path}"])
+
+            # If modules are used, we need to pass the service names into the Flux config
+            # so the @task 'needs_config' filter allows them to run
+            for mod in self.modules:
+                flags.extend(["--conf", f"{mod}=true"])
+
         return flags

+    def render(self) -> str:
+        """
+        Generate the jobspec.
+        """
+        return self.generate_wrapper_script()
+
     def generate_wrapper_script(self) -> str:
         """
         Generate the wrapper script.
@@ -119,14 +142,38 @@ class BatchJobspecV1:
         4. Add jobs/commands
         5. Stop services
         6. And epilogs
-        """
+        7. Custom scripts

+        Yes, it's redundant to write them as comments but I like the organization. -v
+        """
+        # hashbang
         lines = ["#!/bin/bash"]
+
+        # prologs
         lines.extend(self.prologs)
         for s in self.services:
             lines.append(f"systemctl --user start {s}")
+
+        # commands that are derived from jobs or command
         lines.extend(self.commands)
+
+        # stop services
         for s in reversed(self.services):
             lines.append(f"systemctl --user stop {s}")
+
+        # epilogs
         lines.extend(self.epilogs)
+
+        # custom user scripts
+        if self.attributes.logs_dir is not None:
+            lines.append(self.script_save_logs())
         return "\n".join(lines)
+
+    def script_save_logs(self):
+        """
+        Custom saving of logs. This is what we wrote for our performance study!
+        """
+        script_path = scripts.get_script("save_logs.sh")
+
+        # Determine output directory (use home default if not defined)
+        return f"bash {script_path} {self.attributes.logs_dir}"

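For orientation, here is a small hedged sketch (not taken from the package or its tests) of how the new `logs_dir` attribute, `add_module()`, and `render()` might be exercised together. It assumes flux-batch 0.0.11 is installed and that `from_command` accepts the same keyword attributes shown in the README's `from_jobs` example above.

```python
# Hedged sketch: keyword handling of from_command is assumed to mirror from_jobs.
import flux_batch

spec = flux_batch.BatchJobspecV1.from_command(
    ["hostname"],
    nodes=1,
    nslots=1,
    time_limit="10m",
    job_name="demo-batch",
    logs_dir="./logs",  # a non-None logs_dir appends the save_logs.sh step
)
spec.add_prolog("echo 'Batch Wrapper Starting'")
spec.add_epilog("echo 'Batch Wrapper Finished'")

# Registers a modprobe-managed module: get_cli_flags() will then add
# FLUX_MODPROBE_PATH_APPEND and --conf flux-scribe=true.
spec.add_module("flux-scribe")

# render() is a thin alias for generate_wrapper_script(); the output shows the
# prolog/commands/epilog ordering plus the trailing logs step.
print(spec.render())
print(spec.get_cli_flags())
```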
{flux_batch-0.0.0 → flux_batch-0.0.11}/flux_batch/models.py

@@ -24,6 +24,11 @@ class BatchAttributesV1:
     Explicitly defined arguments allowed by flux batch for V1 spec
     """

+    # These are added / custom to our module
+    # If logs directory defined (not None) save output there
+    # We force the user to provide something.
+    logs_dir: Optional[Union[bool, str]] = None
+
     # Resources
     nslots: Optional[int] = None  # -n
     cores_per_slot: Optional[int] = None  # -c

flux_batch-0.0.11/flux_batch/script/__init__.py

@@ -0,0 +1,16 @@
+import os
+
+import flux_batch
+
+
+def get_script(name):
+    """
+    Get a script by name
+    """
+    # Find the path to the installed script
+    base_path = os.path.dirname(os.path.abspath(flux_batch.__file__))
+    script_path = os.path.join(base_path, "script", name)
+    if not os.path.exists(script_path):
+        print(f"Warning: script {name} does not exist")
+        return
+    return script_path

flux_batch-0.0.11/flux_batch/script/save_logs.sh

@@ -0,0 +1,16 @@
+#!/bin/bash
+output=$1
+mkdir -p $output
+
+echo "Saving logs and job metadata to ${output}"
+
+# This will save logs, events, and jobspecs
+for jobid in $(flux jobs -a --json | jq -r .jobs[].id)
+do
+  echo "Parsing jobid ${jobid}"
+  flux job attach $jobid &> $output/${jobid}.out
+  echo "START OF JOBSPEC" >> $output/${jobid}.out
+  flux job info $jobid jobspec >> $output/${jobid}.out
+  echo "START OF EVENTLOG" >> $output/${jobid}.out
+  flux job info $jobid guest.exec.eventlog >> $output/${jobid}.out
+done

flux_batch-0.0.11/flux_batch/service/__init__.py

@@ -0,0 +1,81 @@
+import os
+import subprocess
+import sys
+
+import flux_batch.service.scribe as scribe
+
+# Lookup of known services
+services = {"flux-scribe": scribe.SERVICE_TEMPLATE}
+modules = {
+    "flux-scribe": {
+        "startup": scribe.START_MODULE_TEMPLATE,
+        "shutdown": scribe.STOP_MODULE_TEMPLATE,
+        "module": scribe.MODULE_NAME,
+    }
+}
+
+
+def write_modprobe_script(rc_path, script, args=None):
+    """
+    Shared function to write service file.
+    """
+    args = args or {}
+    if not os.path.exists(rc_path):
+        with open(rc_path, "w") as f:
+            f.write(script.format(**args))
+
+
+def ensure_modprobe_scripts(service_name: str):
+    """
+    Ensures rc1.d (start) and rc3.d (stop) scripts exist for the service.
+    """
+    if service_name not in modules:
+        print(f"Warning: module {service_name} is not known.")
+        return
+
+    # We will add these to FLUX_MODPROBE_PATH_APPEND
+    base_dir = os.path.expanduser("~/.flux-batch")
+    for subdir in ["rc1.d", "rc3.d"]:
+        os.makedirs(os.path.join(base_dir, subdir), exist_ok=True)
+
+    service_func = service_name.replace("-", "_")
+
+    # Path for rc1.d (startup)
+    args = {
+        "service_name": service_name,
+        "service_func": service_func,
+        "python_bin": sys.executable,
+        "module_name": modules[service_name]["module"],
+    }
+    rc1_path = os.path.join(base_dir, "rc1.d", f"{service_name}.py")
+    script = modules[service_name]["startup"]
+    write_modprobe_script(rc1_path, script, args=args)
+
+    # Path for rc3.d (shutdown)
+    args = {"service_name": service_name, "service_func": service_func}
+    rc3_path = os.path.join(base_dir, "rc3.d", f"{service_name}.py")
+    script = modules[service_name]["shutdown"]
+    write_modprobe_script(rc3_path, script, args=args)
+
+
+def ensure_user_service(service_name: str):
+    """
+    Checks for the existence of a systemd service file in the user's home.
+    If it doesn't exist, it creates it and reloads the daemon.
+    """
+    user_systemd_dir = os.path.expanduser("~/.config/systemd/user")
+    os.makedirs(user_systemd_dir, exist_ok=True)
+    service_path = os.path.join(user_systemd_dir, f"{service_name}.service")
+
+    if not os.path.exists(service_path):
+        if service_name in services:
+            template = services[service_name]
+            print(f"[*] Provisioning {service_name} at {service_path}")
+            with open(service_path, "w") as f:
+                f.write(template.format(python_path=sys.executable))
+
+        else:
+            print(f"[*] Service {service_name} is not known, assuming exists.")
+
+    # Reload the user-session manager to recognize the new unit
+    subprocess.run(["systemctl", "--user", "daemon-reload"], check=True)

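As a hedged illustration of the provisioning helpers above (they write real files under the user's home directory and reload the user systemd manager, so treat this as a sketch rather than something to run casually):

```python
# Hedged sketch: assumes flux-batch 0.0.11 with the scribe extra installed
# and a user systemd session available for `systemctl --user`.
from flux_batch.service import ensure_modprobe_scripts, ensure_user_service

# Writes ~/.config/systemd/user/flux-scribe.service if it is missing,
# then runs `systemctl --user daemon-reload`.
ensure_user_service("flux-scribe")

# Writes ~/.flux-batch/rc1.d/flux-scribe.py and ~/.flux-batch/rc3.d/flux-scribe.py
# so flux modprobe can start and stop the scribe when --conf flux-scribe=true is set.
ensure_modprobe_scripts("flux-scribe")
```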
flux_batch-0.0.11/flux_batch/service/scribe/__init__.py

@@ -0,0 +1,2 @@
+MODULE_NAME = "flux_batch.service.scribe"
+from .template import SERVICE_TEMPLATE, START_MODULE_TEMPLATE, STOP_MODULE_TEMPLATE

flux_batch-0.0.11/flux_batch/service/scribe/__main__.py

@@ -0,0 +1,113 @@
+#!/usr/bin/env python3
+import errno
+import logging
+import os
+import sys
+import time
+
+import flux
+import flux.job
+
+# Not necessary, but it makes it pretty
+from rich import print
+
+# Use the synchronous version of the backend to avoid asyncio-in-thread conflicts
+from flux_batch.service.scribe.database import SQLAlchemyBackend
+
+# Setup logging to stderr (to avoid polluting stdout if run manually)
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s %(levelname)s: %(message)s", stream=sys.stderr
+)
+logger = logging.getLogger("flux-scribe")
+
+
+class JournalScribe:
+    def __init__(self, db_url: str):
+        """
+        Initializes the Scribe with a synchronous DB backend and a Flux Journal Consumer.
+        """
+        # Setup Database
+        logger.info(f"Connecting to Database: {db_url}")
+        self.db = SQLAlchemyBackend(db_url)
+        self.db.initialize()
+
+        try:
+            self.handle = flux.Flux()
+            logger.info("Connected to Flux instance.")
+        except Exception as e:
+            logger.critical(f"Failed to connect to Flux: {e}")
+            sys.exit(1)
+
+        # Initialize Journal Consumer
+        # This consumes the global event log for the entire instance
+        self.consumer = flux.job.JournalConsumer(self.handle)
+        self.running = True
+
+    def _normalize_event(self, event) -> dict:
+        """
+        Converts a Flux event object into the dictionary format expected by record_event.
+        Matches the logic provided in your EventsEngine reference.
+        """
+        # Convert the SWIG/CFFI event object to a dictionary
+        payload = dict(event)
+
+        return {
+            "id": str(getattr(event, "jobid", "unknown")),
+            "type": getattr(event, "name", "unknown"),
+            "timestamp": getattr(event, "timestamp", time.time()),
+            "payload": payload,
+            "R": getattr(event, "R", None),
+            "jobspec": getattr(event, "jobspec", None),
+        }
+
+    def run(self):
+        """
+        Main execution loop. Polls the journal and writes to the DB.
+        """
+        try:
+            logger.info("🚀 Flux Scribe (Journal Consumer) started.")
+            self.consumer.start()
+
+            while self.running:
+                try:
+                    # Non-blocking poll (100ms timeout)
+                    # This allows the loop to check for shutdown signals regularly
+                    event = self.consumer.poll(timeout=0.1)
+
+                    if event:
+                        print(event)
+                        # We only care about events associated with a job
+                        if hasattr(event, "jobid"):
+                            clean_event = self._normalize_event(event)
+                            self.db.record_event("local", clean_event)
+                    else:
+                        # If no event, yield a tiny bit of CPU
+                        time.sleep(0.01)
+
+                except EnvironmentError as e:
+                    # Ignore timeouts (no data)
+                    if e.errno == errno.ETIMEDOUT:
+                        continue
+                    logger.error(f"Flux connection error: {e}")
+                    time.sleep(1)
+
+                except Exception as e:
+                    logger.error(f"Unexpected error in event loop: {e}")
+                    time.sleep(1)
+
+        except Exception as e:
+            logger.critical(f"EventsEngine crashed: {e}")
+        finally:
+            self.db.close()
+            logger.info("EventsEngine thread exiting.")
+
+
+def main():
+    # Retrieve DB path from environment or use a default
+    db_path = os.environ.get("FLUX_SCRIBE_DATABASE", "sqlite:///server_state.db")
+    scribe = JournalScribe(db_path)
+    scribe.run()
+
+
+if __name__ == "__main__":
+    main()

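The module is intended to be launched as `{python} -m flux_batch.service.scribe` by the systemd unit or modprobe task, but a hedged sketch of driving `JournalScribe` directly (assuming it runs inside a `flux start` instance with the scribe extra installed) could look like this:

```python
# Hedged sketch: requires a reachable Flux instance and the scribe extra
# (sqlalchemy, rich). The database URL is an illustrative SQLite file.
import os

os.environ.setdefault("FLUX_SCRIBE_DATABASE", "sqlite:///flux-batch-job.db")

from flux_batch.service.scribe.__main__ import JournalScribe

scribe = JournalScribe(os.environ["FLUX_SCRIBE_DATABASE"])
scribe.run()  # blocks: polls the journal and writes job events to the database
```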
flux_batch-0.0.11/flux_batch/service/scribe/database.py

@@ -0,0 +1,150 @@
+import time
+from typing import Any, Dict, List, Optional
+
+from sqlalchemy import and_, create_engine, select, update
+from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine
+from sqlalchemy.orm import sessionmaker
+
+from flux_batch.service.scribe.models import Base, EventModel, EventRecord, JobModel, JobRecord
+
+
+def _record_event_internal(session, cluster: str, event: Dict[str, Any]):
+    """
+    Shared synchronous logic for recording events.
+    Used by both Sync and Async backends.
+    """
+    job_id = event.get("id")
+    event_type = event.get("type")
+    data = event.get("payload", {})
+    timestamp = event.get("timestamp", time.time())
+
+    new_event = EventModel(
+        job_id=job_id,
+        cluster=cluster,
+        timestamp=timestamp,
+        event_type=event_type,
+        payload=data,
+    )
+    session.add(new_event)
+
+    if event_type == "submit":
+        stmt = select(JobModel).where(and_(JobModel.job_id == job_id, JobModel.cluster == cluster))
+        job = session.execute(stmt).scalar_one_or_none()
+
+        if not job:
+            job = JobModel(
+                job_id=job_id,
+                cluster=cluster,
+                user=str(data.get("userid", "unknown")),
+                state="submitted",
+                workdir=data.get("cwd", ""),
+                submit_time=timestamp,
+                last_updated=timestamp,
+            )
+            session.add(job)
+        else:
+            job.state = "submitted"
+            job.last_updated = timestamp
+
+    # state transitions
+    elif event_type == "state" or (event_type and event_type.endswith(".finish")):
+        state_name = data.get("state_name", event_type)
+        stmt = select(JobModel).where(and_(JobModel.job_id == job_id, JobModel.cluster == cluster))
+        job = session.execute(stmt).scalar_one_or_none()
+        if job:
+            job.state = state_name
+            job.last_updated = time.time()
+            if "status" in data:
+                job.exit_code = data["status"]
+
+
+class AsyncSQLAlchemyBackend:
+    """
+    Asynchronous backend for the MCP Gateway.
+    """
+
+    def __init__(self, db_url: str):
+        self.engine = create_async_engine(db_url, echo=False)
+        self.SessionLocal = async_sessionmaker(self.engine, expire_on_commit=False)
+
+    async def initialize(self):
+        async with self.engine.begin() as conn:
+            await conn.run_sync(Base.metadata.create_all)
+
+    async def close(self):
+        await self.engine.dispose()
+
+    async def record_event(self, cluster: str, event: Dict[str, Any]):
+        async with self.SessionLocal() as session:
+            # run_sync bridges our shared logic into the async session
+            await session.run_sync(_record_event_internal, cluster, event)
+            await session.commit()
+
+    async def get_job(self, cluster: str, job_id: int) -> Optional[JobRecord]:
+        async with self.SessionLocal() as session:
+            result = await session.execute(
+                select(JobModel).where(and_(JobModel.job_id == job_id, JobModel.cluster == cluster))
+            )
+            job = result.scalar_one_or_none()
+            return job.to_record() if job else None
+
+    async def get_event_history(self, cluster: str, job_id: int) -> List[EventRecord]:
+        async with self.SessionLocal() as session:
+            result = await session.execute(
+                select(EventModel)
+                .where(and_(EventModel.job_id == job_id, EventModel.cluster == cluster))
+                .order_by(EventModel.timestamp.asc())
+            )
+            return [e.to_record() for e in result.scalars().all()]
+
+    async def search_jobs(
+        self, cluster: str = None, state: str = None, limit: int = 10
+    ) -> List[JobRecord]:
+        async with self.SessionLocal() as session:
+            stmt = select(JobModel)
+            if cluster:
+                stmt = stmt.where(JobModel.cluster == cluster)
+            if state:
+                stmt = stmt.where(JobModel.state == state)
+            result = await session.execute(stmt.limit(limit))
+            return [j.to_record() for j in result.scalars().all()]
+
+
+class SQLAlchemyBackend:
+    """
+    Synchronous backend for the standalone Scribe daemon.
+    """
+
+    def __init__(self, db_url: str):
+        # strip 'aiosqlite+' or similar if passed from shared config
+        url = db_url.replace("+aiosqlite", "").replace("+asyncpg", "")
+        self.engine = create_engine(url, echo=False)
+        self.SessionLocal = sessionmaker(bind=self.engine, expire_on_commit=False)
+
+    def initialize(self):
+        Base.metadata.create_all(self.engine)
+
+    def close(self):
+        self.engine.dispose()
+
+    def record_event(self, cluster: str, event: Dict[str, Any]):
+        with self.SessionLocal() as session:
+            with session.begin():
+                _record_event_internal(session, cluster, event)
+
+    def get_unwatched_job_ids(self, cluster: str) -> List[int]:
+        """Specific for Scribe: find jobs that need a watcher."""
+        with self.SessionLocal() as session:
+            stmt = select(JobModel.job_id).where(
+                and_(JobModel.cluster == cluster, JobModel.state == "submitted")
+            )
+            return list(session.execute(stmt).scalars().all())
+
+    def mark_job_as_watched(self, cluster: str, job_id: int):
+        with self.SessionLocal() as session:
+            with session.begin():
+                session.execute(
+                    update(JobModel)
+                    .where(and_(JobModel.job_id == job_id, JobModel.cluster == cluster))
+                    .values(state="watching")
+                )

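A brief, hedged sketch of the synchronous backend in isolation, useful for inspecting the schema without a running Flux instance; the event dictionary is a fabricated minimal example of the shape `_normalize_event` produces, not a real journal record:

```python
# Hedged sketch: assumes sqlalchemy is installed (the 'scribe' extra).
import time

from flux_batch.service.scribe.database import SQLAlchemyBackend

db = SQLAlchemyBackend("sqlite:///scratch-scribe.db")
db.initialize()  # creates the jobs and events tables via Base.metadata

# Illustrative 'submit' event in the shape record_event expects
db.record_event(
    "local",
    {
        "id": 123456,
        "type": "submit",
        "timestamp": time.time(),
        "payload": {"userid": 1000, "cwd": "/tmp"},
    },
)

print(db.get_unwatched_job_ids("local"))  # jobs still in the 'submitted' state
db.close()
```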
flux_batch-0.0.11/flux_batch/service/scribe/models.py

@@ -0,0 +1,94 @@
+from dataclasses import dataclass
+from typing import Any, Dict, Optional
+
+from sqlalchemy import JSON, Float, Integer, String
+from sqlalchemy.ext.asyncio import AsyncAttrs
+from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column
+
+# DTOs are "Public Data Transfer Objects" and they are used by
+# our interfaces and tools
+
+
+@dataclass
+class JobRecord:
+    """
+    Represents a snapshot of a job state.
+    Returned by get_job() and search_jobs().
+    """
+
+    job_id: int
+    cluster: str
+    state: str
+    user: str
+    workdir: Optional[str] = None
+    exit_code: Optional[int] = None
+    submit_time: float = 0.0
+    last_updated: float = 0.0
+
+
+@dataclass
+class EventRecord:
+    """
+    Represents a single historical event.
+    Returned by get_event_history().
+    """
+
+    timestamp: float
+    event_type: str
+    payload: Dict[str, Any]
+
+
+# Database models for SQLAlchemy ORM
+
+
+class Base(AsyncAttrs, DeclarativeBase):
+    pass
+
+
+class JobModel(Base):
+    __tablename__ = "jobs"
+
+    # Composite Primary Key
+    job_id: Mapped[int] = mapped_column(Integer, primary_key=True)
+    cluster: Mapped[str] = mapped_column(String(255), primary_key=True)
+
+    state: Mapped[str] = mapped_column(String(50))
+    user: Mapped[str] = mapped_column(String(255), nullable=True)
+    workdir: Mapped[Optional[str]] = mapped_column(String, nullable=True)
+    exit_code: Mapped[Optional[int]] = mapped_column(Integer, nullable=True)
+    submit_time: Mapped[float] = mapped_column(Float, default=0.0)
+    last_updated: Mapped[float] = mapped_column(Float, default=0.0)
+
+    def to_record(self) -> JobRecord:
+        """
+        Helper to convert ORM model to public DTO
+        """
+        return JobRecord(
+            job_id=self.job_id,
+            cluster=self.cluster,
+            state=self.state,
+            user=self.user,
+            workdir=self.workdir,
+            exit_code=self.exit_code,
+            submit_time=self.submit_time,
+            last_updated=self.last_updated,
+        )
+
+
+class EventModel(Base):
+    __tablename__ = "events"
+
+    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
+    job_id: Mapped[int] = mapped_column(Integer, index=True)
+    cluster: Mapped[str] = mapped_column(String(255), index=True)
+    timestamp: Mapped[float] = mapped_column(Float)
+    event_type: Mapped[str] = mapped_column(String(50))
+    payload: Mapped[Dict[str, Any]] = mapped_column(JSON)
+
+    def to_record(self) -> EventRecord:
+        """
+        Helper to convert ORM model to public DTO
+        """
+        return EventRecord(
+            timestamp=self.timestamp, event_type=self.event_type, payload=self.payload
+        )

flux_batch-0.0.11/flux_batch/service/scribe/template.py

@@ -0,0 +1,53 @@
+# Template for the Scribe Journal Consumer
+SERVICE_TEMPLATE = """[Unit]
+Description=Flux Scribe Journal Consumer
+After=network.target
+
+[Service]
+ExecStart={python_path} -m flux_batch.service.scribe
+Restart=on-failure
+
+[Install]
+WantedBy=default.target
+"""
+
+START_MODULE_TEMPLATE = """
+from flux.modprobe import task
+import flux.subprocess as subprocess
+
+@task(
+    "start-{service_name}",
+    ranks="0",
+    needs_config=["{service_name}"],
+    after=["resource", "job-list"],
+)
+def start_{service_func}(context):
+    # This triggers the systemd user service provisioned earlier
+    # context.bash("systemctl --user start {service_name}")
+    subprocess.rexec_bg(
+        context.handle,
+        ["{python_bin}", "-m", "{module_name}"],
+        label="{service_name}",
+        nodeid=0
+    )
+"""
+
+STOP_MODULE_TEMPLATE = """
+from flux.modprobe import task
+import flux.subprocess as subprocess
+
+@task(
+    "stop-{service_name}",
+    ranks="0",
+    needs_config=["{service_name}"],
+    before=["resource", "job-list"],
+)
+def stop_{service_func}(context):
+    # context.bash("systemctl --user stop {service_name}")
+    subprocess.kill(context.handle, signum=2, label="{service_name}").get()
+    try:
+        status = subprocess.wait(context.handle, label="{service_name}").get()["status"]
+        print(status)
+    except:
+        pass
+"""

flux_batch-0.0.11/flux_batch/submit.py

@@ -0,0 +1,86 @@
+import json
+import os
+import stat
+import subprocess
+import tempfile
+
+import flux
+import flux.job
+
+import flux_batch.models as models
+import flux_batch.service as services
+import flux_batch.utils as utils
+
+
+def setup(spec):
+    """
+    Shared function to generate services / modules from a spec.
+    """
+    # Provision services (like flux-scribe) if requested
+    for service in spec.services:
+        services.ensure_user_service(service)
+    for module in spec.modules:
+        services.ensure_modprobe_scripts(module)
+
+
+def generate_jobspec(spec, script, wrapper_path):
+    """
+    Shared function to write a script to a wrapper path and generate
+    a jobspec for it via flux batch --dry-run.
+    """
+    utils.write_file(script, wrapper_path)
+
+    # Make the script executable so 'flux batch' can analyze it
+    os.chmod(wrapper_path, os.stat(wrapper_path).st_mode | stat.S_IEXEC)
+
+    # Generate the RFC 25 Jobspec JSON via the Flux CLI
+    # This handles all resource mapping (-N, -n, etc.)
+    cmd = ["flux", "batch"] + spec.get_cli_flags() + ["--dry-run", wrapper_path]
+
+    try:
+        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
+    except subprocess.CalledProcessError as e:
+        print(f"Error during flux batch dryrun: {e.stderr}")
+        raise
+    return result.stdout
+
+
+def preview(spec: models.BatchJobV1) -> int:
+    """
+    Preview the jobspec.
+    """
+    setup(spec)
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+
+        # Write the wrapper script (handling prologs, services, and jobs)
+        wrapper_path = os.path.join(tmpdir, "wrapper.sh")
+        script = spec.generate_wrapper_script()
+        jobspec = generate_jobspec(spec, script, wrapper_path)
+        return json.loads(jobspec)
+
+
+def submit(handle: flux.Flux, spec: models.BatchJobV1, dry_run=False) -> int:
+    """
+    Orchestrates the submission process:
+    1. Provisions any required user-space services.
+    2. Generates the wrapper shell script.
+    3. Uses 'flux batch --dryrun' to compile the Jobspec JSON.
+    4. Submits the Jobspec to the Flux instance.
+    """
+    setup(spec)
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        # Write the wrapper script (handling prologs, services, and jobs)
+        wrapper_path = os.path.join(tmpdir, "wrapper.sh")
+
+        # dry run here just displays it
+        script = spec.generate_wrapper_script()
+        if dry_run:
+            return script
+
+        jobspec = generate_jobspec(spec, script, wrapper_path)
+
+        # Submit the JSON string to the Flux instance
+        # The result.stdout contains the raw JSON Jobspec
+        return flux.job.submit(handle, jobspec)

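To tie the new `preview` helper and the reworked `submit` together, a hedged sketch (mirroring the README snippet reproduced earlier, and assuming it runs under `flux start` so `flux.Flux()` can connect) might look like this:

```python
# Hedged sketch: requires the flux-core Python bindings and a running Flux instance.
import flux

import flux_batch

handle = flux.Flux()

spec = flux_batch.BatchJobspecV1.from_command(
    ["sleep", "10"], nodes=1, nslots=1, time_limit="10m", job_name="demo"
)

# dry_run=True returns the generated wrapper script instead of submitting
print(flux_batch.submit(handle, spec, dry_run=True))

# flux_batch.jobspec is the alias for submit.preview added in __init__.py;
# it shells out to `flux batch --dry-run` and returns the parsed jobspec dict.
print(flux_batch.jobspec(spec))

# An actual submission returns the Flux job id:
# jobid = flux_batch.submit(handle, spec)
```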
{flux_batch-0.0.0 → flux_batch-0.0.11}/flux_batch/version.py

@@ -1,4 +1,4 @@
-__version__ = "0.0.0"
+__version__ = "0.0.11"
 AUTHOR = "Vanessa Sochat"
 AUTHOR_EMAIL = "vsoch@users.noreply.github.com"
 NAME = "flux-batch"
@@ -13,4 +13,5 @@ INSTALL_REQUIRES = (
 )

 TESTS_REQUIRES = (("pytest", {"min_version": "4.6.2"}),)
-INSTALL_REQUIRES_ALL = INSTALL_REQUIRES + TESTS_REQUIRES
+SCRIBE_REQUIRES = (("sqlalchemy", {"min_version": None}), ("rich", {"min_version": None}))
+INSTALL_REQUIRES_ALL = INSTALL_REQUIRES + TESTS_REQUIRES + SCRIBE_REQUIRES

{flux_batch-0.0.0 → flux_batch-0.0.11/flux_batch.egg-info}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: flux-batch
-Version: 0.0.0
+Version: 0.0.11
 Summary: Python SDK for flux batch jobs and services
 Home-page: https://github.com/converged-computing/flux-batch
 Author: Vanessa Sochat
@@ -26,12 +26,17 @@ Provides-Extra: all
 Requires-Dist: pyyaml; extra == "all"
 Requires-Dist: ply; extra == "all"
 Requires-Dist: pytest>=4.6.2; extra == "all"
+Requires-Dist: sqlalchemy; extra == "all"
+Requires-Dist: rich; extra == "all"
+Provides-Extra: scribe
+Requires-Dist: sqlalchemy; extra == "scribe"
+Requires-Dist: rich; extra == "scribe"

 # flux-batch

 > Python SDK to generate Flux batch jobs and services

-[![PyPI version](https://badge.fury.io/py/flux-batch.svg)](https://badge.fury.io/py/flux-batch)
+![PyPI - Version](https://img.shields.io/pypi/v/flux-batch)

 ![https://github.com/converged-computing/flux-batch/raw/main/img/flux-batch-small.png](https://github.com/converged-computing/flux-batch/raw/main/img/flux-batch-small.png)

@@ -46,6 +51,7 @@ Requires-Dist: pytest>=4.6.2; extra == "all"

 - **flux-scribe**: Write job events to a local sqlite database via the JournalConsumer (not added yet, written and needs testing)

+
 ## Usage

 This is a small Flux utility that makes it easy to create Flux batch jobs and services.
@@ -60,9 +66,26 @@ flux start
 pip install -e . --break-system-packages
 ```

-### Example
+### Examples
+
+We have a few simple examples:
+
+#### Saving Logs
+
+```bash
+python3 ./examples/save_logs.py
+```
+
+#### Flux Scribe Module
+
+```bash
+export FLUX_SCRIBE_DATABASE=sqlite:///flux-batch-job.db
+python3 ./examples/flux_scribe_module.py
+```
+
+#### General Test

-Run the controlled example to see a batch job with prolog and epilog run and complete:
+Or run the controlled example to see a batch job with prolog and epilog run and complete:

 ```bash
 python3 ./tests/test_flux_batch.py
@@ -117,7 +140,9 @@ jobspec = flux_batch.BatchJobspecV1.from_jobs(
     nodes=1,
     nslots=1,
     time_limit="10m",
-    job_name="test-batch"
+    job_name="test-batch",
+    # Add saving of logs, info, and metadata
+    logs_dir="./logs",
 )

 # Add a prolog and epilog
@@ -125,7 +150,7 @@ jobspec.add_prolog("echo 'Batch Wrapper Starting'")
 jobspec.add_epilog("echo 'Batch Wrapper Finished'")

 # Add a service (this assumes user level that exists)
-# jobspec.add_service("my-service'")
+jobspec.add_service("flux-scribe")

 # Preview it
 print(flux_batch.submit(handle, jobspec, dry_run=True))

{flux_batch-0.0.0 → flux_batch-0.0.11}/flux_batch.egg-info/SOURCES.txt

@@ -20,8 +20,14 @@ flux_batch.egg-info/top_level.txt
 flux_batch/logger/__init__.py
 flux_batch/logger/generate.py
 flux_batch/logger/logger.py
+flux_batch/script/__init__.py
+flux_batch/script/save_logs.sh
 flux_batch/service/__init__.py
-flux_batch/service/scribe.py
+flux_batch/service/scribe/__init__.py
+flux_batch/service/scribe/__main__.py
+flux_batch/service/scribe/database.py
+flux_batch/service/scribe/models.py
+flux_batch/service/scribe/template.py
 flux_batch/utils/__init__.py
 flux_batch/utils/fileio.py
 flux_batch/utils/text.py

{flux_batch-0.0.0 → flux_batch-0.0.11}/flux_batch.egg-info/requires.txt

@@ -5,3 +5,9 @@ ply
 pyyaml
 ply
 pytest>=4.6.2
+sqlalchemy
+rich
+
+[scribe]
+sqlalchemy
+rich

{flux_batch-0.0.0 → flux_batch-0.0.11}/setup.py

@@ -62,6 +62,7 @@ if __name__ == "__main__":
     INSTALL_REQUIRES = get_reqs(lookup)
     TESTS_REQUIRES = get_reqs(lookup, "TESTS_REQUIRES")
     INSTALL_REQUIRES_ALL = get_reqs(lookup, "INSTALL_REQUIRES_ALL")
+    INSTALL_REQUIRES_SCRIBE = get_reqs(lookup, "SCRIBE_REQUIRES")

     setup(
         name=NAME,
@@ -83,6 +84,7 @@ if __name__ == "__main__":
         tests_require=TESTS_REQUIRES,
         extras_require={
             "all": [INSTALL_REQUIRES_ALL],
+            "scribe": [INSTALL_REQUIRES_SCRIBE],
         },
         classifiers=[
             "Intended Audience :: Science/Research",

flux_batch-0.0.0/flux_batch/service/__init__.py

@@ -1,31 +0,0 @@
-import os
-import subprocess
-import sys
-
-from .scribe import SERVICE_TEMPLATE as scribe_template
-
-# Lookup of known services
-services = {"scribe": scribe_template}
-
-
-def ensure_user_service(service_name: str):
-    """
-    Checks for the existence of a systemd service file in the user's home.
-    If it doesn't exist, it creates it and reloads the daemon.
-    """
-    user_systemd_dir = os.path.expanduser("~/.config/systemd/user")
-    os.makedirs(user_systemd_dir, exist_ok=True)
-    service_path = os.path.join(user_systemd_dir, f"{service_name}.service")
-
-    if not os.path.exists(service_path):
-        if service_name in services:
-            template = services[service_name]
-            print(f"[*] Provisioning {service_name} at {service_path}")
-            with open(service_path, "w") as f:
-                f.write(template.format(python_path=sys.executable))
-
-        else:
-            print(f"[*] Service {service_name} is not known, assuming exists.")
-
-    # Reload the user-session manager to recognize the new unit
-    subprocess.run(["systemctl", "--user", "daemon-reload"], check=True)

flux_batch-0.0.0/flux_batch/service/scribe.py

@@ -1,12 +0,0 @@
-# Template for the Scribe Journal Consumer
-SERVICE_TEMPLATE = """[Unit]
-Description=Flux Scribe Journal Consumer
-After=network.target
-
-[Service]
-ExecStart={python_path} -m flux_mcp_server.scribe
-Restart=on-failure
-
-[Install]
-WantedBy=default.target
-"""

flux_batch-0.0.0/flux_batch/submit.py

@@ -1,53 +0,0 @@
-import os
-import stat
-import subprocess
-import tempfile
-
-import flux
-import flux.job
-
-import flux_batch.models as models
-import flux_batch.utils as utils
-from flux_batch.service import ensure_user_service
-
-
-def submit(handle: flux.Flux, spec: models.BatchJobV1, dry_run=False) -> int:
-    """
-    Orchestrates the submission process:
-    1. Provisions any required user-space services.
-    2. Generates the wrapper shell script.
-    3. Uses 'flux batch --dryrun' to compile the Jobspec JSON.
-    4. Submits the Jobspec to the Flux instance.
-    """
-
-    # Provision services (like flux-scribe) if requested
-    for service in spec.services:
-        ensure_user_service(service)
-
-    with tempfile.TemporaryDirectory() as tmpdir:
-        # Write the wrapper script (handling prologs, services, and jobs)
-        wrapper_path = os.path.join(tmpdir, "wrapper.sh")
-
-        # dry run here just displays it
-        script = spec.generate_wrapper_script()
-        if dry_run:
-            return script
-
-        utils.write_file(script, wrapper_path)
-
-        # Make the script executable so 'flux batch' can analyze it
-        os.chmod(wrapper_path, os.stat(wrapper_path).st_mode | stat.S_IEXEC)
-
-        # Generate the RFC 25 Jobspec JSON via the Flux CLI
-        # This handles all resource mapping (-N, -n, etc.)
-        cmd = ["flux", "batch"] + spec.get_cli_flags() + ["--dry-run", wrapper_path]
-
-        try:
-            result = subprocess.run(cmd, capture_output=True, text=True, check=True)
-        except subprocess.CalledProcessError as e:
-            print(f"Error during flux batch dryrun: {e.stderr}")
-            raise
-
-        # Submit the JSON string to the Flux instance
-        # The result.stdout contains the raw JSON Jobspec
-        return flux.job.submit(handle, result.stdout)

The remaining files (items 23–39 in the list above) are unchanged between versions.