flux-batch 0.0.11__py3-none-any.whl → 0.0.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flux_batch/service/scribe/__main__.py +16 -2
- flux_batch/service/scribe/database.py +25 -9
- flux_batch/service/scribe/models.py +18 -5
- flux_batch/service/usernetes/__init__.py +2 -0
- flux_batch/service/usernetes/__main__.py +115 -0
- flux_batch/service/usernetes/database.py +151 -0
- flux_batch/service/usernetes/models.py +98 -0
- flux_batch/service/usernetes/template.py +53 -0
- flux_batch/version.py +1 -1
- {flux_batch-0.0.11.dist-info → flux_batch-0.0.13.dist-info}/METADATA +10 -7
- {flux_batch-0.0.11.dist-info → flux_batch-0.0.13.dist-info}/RECORD +16 -11
- {flux_batch-0.0.11.dist-info → flux_batch-0.0.13.dist-info}/LICENSE +0 -0
- {flux_batch-0.0.11.dist-info → flux_batch-0.0.13.dist-info}/NOTICE +0 -0
- {flux_batch-0.0.11.dist-info → flux_batch-0.0.13.dist-info}/WHEEL +0 -0
- {flux_batch-0.0.11.dist-info → flux_batch-0.0.13.dist-info}/entry_points.txt +0 -0
- {flux_batch-0.0.11.dist-info → flux_batch-0.0.13.dist-info}/top_level.txt +0 -0
flux_batch/service/scribe/__main__.py CHANGED

@@ -26,9 +26,12 @@ class JournalScribe:
         """
         Initializes the Scribe with a synchronous DB backend and a Flux Journal Consumer.
         """
+        # Get settings from environment
+        self.init_settings()
+
         # Setup Database
         logger.info(f"Connecting to Database: {db_url}")
-        self.db = SQLAlchemyBackend(db_url)
+        self.db = SQLAlchemyBackend(db_url, self.settings["use_ssl"])
         self.db.initialize()

         try:
@@ -38,11 +41,22 @@
             logger.critical(f"Failed to connect to Flux: {e}")
             sys.exit(1)

+        logger.info(f"🍳 Handle: {self.handle.attr_get('local-uri')}")
+
         # Initialize Journal Consumer
         # This consumes the global event log for the entire instance
         self.consumer = flux.job.JournalConsumer(self.handle)
         self.running = True

+    def init_settings(self):
+        """
+        Initialize settings.
+        """
+        self.settings = {
+            "use_ssl": os.environ.get("FLUX_SCRIBE_SSL") not in [None, "no", "false", "off"],
+            "uid": os.environ.get("FLUX_SCRIBE_UID"),
+        }
+
     def _normalize_event(self, event) -> dict:
         """
         Converts a Flux event object into the dictionary format expected by record_event.
@@ -79,7 +93,7 @@
                         # We only care about events associated with a job
                         if hasattr(event, "jobid"):
                             clean_event = self._normalize_event(event)
-                            self.db.record_event("local", clean_event)
+                            self.db.record_event("local", self.settings['uid'], clean_event)
                     else:
                         # If no event, yield a tiny bit of CPU
                         time.sleep(0.01)
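For reference, the `init_settings` addition above reads two environment variables. A minimal sketch of that behavior (the values below are illustrative, not part of the release):

```python
import os

# Illustrative values only: any value other than unset/"no"/"false"/"off" enables SSL.
os.environ["FLUX_SCRIBE_SSL"] = "on"
os.environ["FLUX_SCRIBE_UID"] = "1000"

settings = {
    "use_ssl": os.environ.get("FLUX_SCRIBE_SSL") not in [None, "no", "false", "off"],
    "uid": os.environ.get("FLUX_SCRIBE_UID"),
}
print(settings)  # {'use_ssl': True, 'uid': '1000'}
```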
flux_batch/service/scribe/database.py CHANGED

@@ -8,7 +8,7 @@ from sqlalchemy.orm import sessionmaker
 from flux_batch.service.scribe.models import Base, EventModel, EventRecord, JobModel, JobRecord


-def _record_event_internal(session, cluster: str, event: Dict[str, Any]):
+def _record_event_internal(session, cluster: str, uid: str, event: Dict[str, Any]):
     """
     Shared synchronous logic for recording events.
     Used by both Sync and Async backends.
@@ -18,11 +18,13 @@ def _record_event_internal(session, cluster: str, event: Dict[str, Any]):
     data = event.get("payload", {})
     timestamp = event.get("timestamp", time.time())

+    # Add the new event with all metadata
     new_event = EventModel(
         job_id=job_id,
         cluster=cluster,
         timestamp=timestamp,
         event_type=event_type,
+        uid=uid,
         payload=data,
     )
     session.add(new_event)
@@ -40,6 +42,7 @@ def _record_event_internal(session, cluster: str, event: Dict[str, Any]):
                 workdir=data.get("cwd", ""),
                 submit_time=timestamp,
                 last_updated=timestamp,
+                uid=uid,
             )
             session.add(job)
         else:
@@ -63,8 +66,18 @@ class AsyncSQLAlchemyBackend:
     Asynchronous backend for the MCP Gateway.
     """

-    def __init__(self, db_url: str):
-        self.engine = create_async_engine(db_url, echo=False)
+    def __init__(self, db_url: str, use_ssl: bool = False):
+        # Use asyncio connection via asyncmy
+        if db_url.startswith("mysql://") or db_url.startswith("mariadb://"):
+            db_url = db_url.replace("mysql://", "mysql+asyncmy://", 1)
+            db_url = db_url.replace("mariadb://", "mysql+asyncmy://", 1)
+
+        connect_args = {"connect_timeout": 10}
+        if use_ssl:
+            print("SSL is enabled.")
+            connect_args["ssl"] = {"ssl_mode": "REQUIRED", "check_hostname": False}
+
+        self.engine = create_async_engine(db_url, echo=False, connect_args=connect_args)
         self.SessionLocal = async_sessionmaker(self.engine, expire_on_commit=False)

     async def initialize(self):
@@ -115,10 +128,13 @@ class SQLAlchemyBackend:
     Synchronous backend for the standalone Scribe daemon.
     """

-    def __init__(self, db_url: str):
-        # strip 'aiosqlite+' or similar if passed from shared config
-        url = db_url.replace("+aiosqlite", "").replace("+asyncpg", "")
-        self.engine = create_engine(url, echo=False)
+    def __init__(self, db_url: str, use_ssl: bool = False):
+        connect_args = {"connect_timeout": 10}
+        if use_ssl:
+            print("SSL is enabled.")
+            connect_args["ssl"] = {"ssl_mode": "REQUIRED", "check_hostname": False}
+
+        self.engine = create_engine(db_url, echo=False, connect_args=connect_args)
         self.SessionLocal = sessionmaker(bind=self.engine, expire_on_commit=False)

     def initialize(self):
@@ -127,10 +143,10 @@ class SQLAlchemyBackend:
     def close(self):
         self.engine.dispose()

-    def record_event(self, cluster: str, event: Dict[str, Any]):
+    def record_event(self, cluster: str, uid: str, event: Dict[str, Any]):
         with self.SessionLocal() as session:
             with session.begin():
-                _record_event_internal(session, cluster, event)
+                _record_event_internal(session, cluster, uid, event)

     def get_unwatched_job_ids(self, cluster: str) -> List[int]:
         """Specific for Scribe: find jobs that need a watcher."""
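The SSL support added to both backends reduces to building a `connect_args` dictionary for SQLAlchemy's engine. A standalone sketch of that logic, under the assumption that a MySQL/MariaDB driver would be installed before actually connecting (no connection is opened here):

```python
from sqlalchemy import create_engine

def make_engine(db_url: str, use_ssl: bool = False):
    """Mirror of the connect_args logic added to SQLAlchemyBackend in 0.0.13."""
    connect_args = {"connect_timeout": 10}
    if use_ssl:
        # Require SSL but skip hostname verification, as in the release
        connect_args["ssl"] = {"ssl_mode": "REQUIRED", "check_hostname": False}
    # Engine creation is lazy; no connection is opened here
    return create_engine(db_url, echo=False, connect_args=connect_args)

# Placeholder URL; requires a MySQL driver (e.g. pymysql) to be installed before use:
# engine = make_engine("mysql+pymysql://user:pass@db.example.com/flux", use_ssl=True)
```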
flux_batch/service/scribe/models.py CHANGED

@@ -16,7 +16,7 @@ class JobRecord:
     Returned by get_job() and search_jobs().
     """

-    job_id: int
+    job_id: str
     cluster: str
     state: str
     user: str
@@ -24,6 +24,7 @@ class JobRecord:
     exit_code: Optional[int] = None
     submit_time: float = 0.0
     last_updated: float = 0.0
+    uid: str = None


 @dataclass
@@ -36,6 +37,7 @@ class EventRecord:
     timestamp: float
     event_type: str
     payload: Dict[str, Any]
+    uid: str = None


 # Database models for SQLAlchemy ORM
@@ -49,12 +51,16 @@ class JobModel(Base):
     __tablename__ = "jobs"

     # Composite Primary Key
-    job_id: Mapped[int] = mapped_column(Integer, primary_key=True)
     cluster: Mapped[str] = mapped_column(String(255), primary_key=True)
+    job_id: Mapped[str] = mapped_column(String(255), primary_key=True)
+    uid: Mapped[str] = mapped_column(String(255), nullable=True)

     state: Mapped[str] = mapped_column(String(50))
     user: Mapped[str] = mapped_column(String(255), nullable=True)
-    workdir: Mapped[Optional[str]] = mapped_column(String, nullable=True)
+
+    # Fixed: Added length 1024 for MySQL/MariaDB compatibility
+    workdir: Mapped[Optional[str]] = mapped_column(String(1024), nullable=True)
+
     exit_code: Mapped[Optional[int]] = mapped_column(Integer, nullable=True)
     submit_time: Mapped[float] = mapped_column(Float, default=0.0)
     last_updated: Mapped[float] = mapped_column(Float, default=0.0)
@@ -72,6 +78,7 @@ class JobModel(Base):
             exit_code=self.exit_code,
             submit_time=self.submit_time,
             last_updated=self.last_updated,
+            uid=self.uid,
         )


@@ -79,16 +86,22 @@ class EventModel(Base):
     __tablename__ = "events"

     id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
-    job_id: Mapped[int] = mapped_column(Integer, index=True)
+    job_id: Mapped[str] = mapped_column(String(255), primary_key=True)
     cluster: Mapped[str] = mapped_column(String(255), index=True)
     timestamp: Mapped[float] = mapped_column(Float)
     event_type: Mapped[str] = mapped_column(String(50))
     payload: Mapped[Dict[str, Any]] = mapped_column(JSON)
+    uid: Mapped[str] = mapped_column(String(255), nullable=True)

     def to_record(self) -> EventRecord:
         """
         Helper to convert ORM model to public DTO
         """
         return EventRecord(
-            timestamp=self.timestamp,
+            timestamp=self.timestamp,
+            event_type=self.event_type,
+            payload=self.payload,
+            cluster=self.cluster,
+            job_id=self.job_id,
+            uid=self.uid,
         )
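With the model changes above, the public DTOs carry a string `job_id` and an optional `uid`. A small sketch, assuming flux-batch 0.0.13 is installed:

```python
from flux_batch.service.scribe.models import JobRecord

# job_id is now a string (the scribe records Flux job ids via str()), and uid is optional
record = JobRecord(
    job_id="f12345",
    cluster="local",
    state="submitted",
    user="1000",
    uid="1000",
)
print(record.uid)
```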
flux_batch/service/usernetes/__main__.py ADDED

@@ -0,0 +1,115 @@
+#!/usr/bin/env python3
+import errno
+import logging
+import os
+import sys
+import time
+
+import flux
+import flux.job
+
+# Not necessary, but it makes it pretty
+from rich import print
+
+# Use the synchronous version of the backend to avoid asyncio-in-thread conflicts
+from flux_batch.service.scribe.database import SQLAlchemyBackend
+
+# Setup logging to stderr (to avoid polluting stdout if run manually)
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s %(levelname)s: %(message)s", stream=sys.stderr
+)
+logger = logging.getLogger("flux-scribe")
+
+
+class JournalScribe:
+    def __init__(self, db_url: str):
+        """
+        Initializes the Scribe with a synchronous DB backend and a Flux Journal Consumer.
+        """
+        # Setup Database
+        logger.info(f"Connecting to Database: {db_url}")
+        self.db = SQLAlchemyBackend(db_url)
+        self.db.initialize()
+
+        try:
+            self.handle = flux.Flux()
+            logger.info("Connected to Flux instance.")
+        except Exception as e:
+            logger.critical(f"Failed to connect to Flux: {e}")
+            sys.exit(1)
+
+        logger.info(f"🍳 Handle: {self.handle.attr_get('local-uri')}")
+
+        # Initialize Journal Consumer
+        # This consumes the global event log for the entire instance
+        self.consumer = flux.job.JournalConsumer(self.handle)
+        self.running = True
+
+    def _normalize_event(self, event) -> dict:
+        """
+        Converts a Flux event object into the dictionary format expected by record_event.
+        Matches the logic provided in your EventsEngine reference.
+        """
+        # Convert the SWIG/CFFI event object to a dictionary
+        payload = dict(event)
+
+        return {
+            "id": str(getattr(event, "jobid", "unknown")),
+            "type": getattr(event, "name", "unknown"),
+            "timestamp": getattr(event, "timestamp", time.time()),
+            "payload": payload,
+            "R": getattr(event, "R", None),
+            "jobspec": getattr(event, "jobspec", None),
+        }
+
+    def run(self):
+        """
+        Main execution loop. Polls the journal and writes to the DB.
+        """
+        try:
+            logger.info("🚀 Flux Scribe (Journal Consumer) started.")
+            self.consumer.start()
+
+            while self.running:
+                try:
+                    # Non-blocking poll (100ms timeout)
+                    # This allows the loop to check for shutdown signals regularly
+                    event = self.consumer.poll(timeout=0.1)
+
+                    if event:
+                        print(event)
+                        # We only care about events associated with a job
+                        if hasattr(event, "jobid"):
+                            clean_event = self._normalize_event(event)
+                            self.db.record_event("local", clean_event)
+                    else:
+                        # If no event, yield a tiny bit of CPU
+                        time.sleep(0.01)
+
+                except EnvironmentError as e:
+                    # Ignore timeouts (no data)
+                    if e.errno == errno.ETIMEDOUT:
+                        continue
+                    logger.error(f"Flux connection error: {e}")
+                    time.sleep(1)
+
+                except Exception as e:
+                    logger.error(f"Unexpected error in event loop: {e}")
+                    time.sleep(1)
+
+        except Exception as e:
+            logger.critical(f"EventsEngine crashed: {e}")
+        finally:
+            self.db.close()
+            logger.info("EventsEngine thread exiting.")
+
+
+def main():
+    # Retrieve DB path from environment or use a default
+    db_path = os.environ.get("FLUX_SCRIBE_DATABASE", "sqlite:///server_state.db")
+    scribe = JournalScribe(db_path)
+    scribe.run()
+
+
+if __name__ == "__main__":
+    main()
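As added in this release, the usernetes entry point is still a copy of the scribe consumer: it reads `FLUX_SCRIBE_DATABASE`, imports the scribe backend, and logs as `flux-scribe`. A hedged sketch of launching it (the database URL is an example; a running Flux instance and the Flux Python bindings are required for it to do anything useful):

```python
import os
import subprocess
import sys

# Example only: run the new module under the current Python, against a local sqlite DB.
# Without a reachable Flux instance the process logs a critical error and exits.
env = dict(os.environ, FLUX_SCRIBE_DATABASE="sqlite:///server_state.db")
subprocess.run([sys.executable, "-m", "flux_batch.service.usernetes"], env=env, check=False)
```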
flux_batch/service/usernetes/database.py ADDED

@@ -0,0 +1,151 @@
+import time
+from typing import Any, Dict, List, Optional
+
+from sqlalchemy import and_, create_engine, select, update
+from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine
+from sqlalchemy.orm import sessionmaker
+
+from flux_batch.service.scribe.models import Base, EventModel, EventRecord, JobModel, JobRecord
+
+
+def _record_event_internal(session, cluster: str, event: Dict[str, Any]):
+    """
+    Shared synchronous logic for recording events.
+    Used by both Sync and Async backends.
+    """
+    job_id = event.get("id")
+    event_type = event.get("type")
+    data = event.get("payload", {})
+    timestamp = event.get("timestamp", time.time())
+
+    # Add the new event with all metadata
+    new_event = EventModel(
+        job_id=job_id,
+        cluster=cluster,
+        timestamp=timestamp,
+        event_type=event_type,
+        payload=data,
+    )
+    session.add(new_event)
+
+    if event_type == "submit":
+        stmt = select(JobModel).where(and_(JobModel.job_id == job_id, JobModel.cluster == cluster))
+        job = session.execute(stmt).scalar_one_or_none()
+
+        if not job:
+            job = JobModel(
+                job_id=job_id,
+                cluster=cluster,
+                user=str(data.get("userid", "unknown")),
+                state="submitted",
+                workdir=data.get("cwd", ""),
+                submit_time=timestamp,
+                last_updated=timestamp,
+            )
+            session.add(job)
+        else:
+            job.state = "submitted"
+            job.last_updated = timestamp
+
+    # state transitions
+    elif event_type == "state" or (event_type and event_type.endswith(".finish")):
+        state_name = data.get("state_name", event_type)
+        stmt = select(JobModel).where(and_(JobModel.job_id == job_id, JobModel.cluster == cluster))
+        job = session.execute(stmt).scalar_one_or_none()
+        if job:
+            job.state = state_name
+            job.last_updated = time.time()
+            if "status" in data:
+                job.exit_code = data["status"]
+
+
+class AsyncSQLAlchemyBackend:
+    """
+    Asynchronous backend for the MCP Gateway.
+    """
+
+    def __init__(self, db_url: str):
+        self.engine = create_async_engine(db_url, echo=False)
+        self.SessionLocal = async_sessionmaker(self.engine, expire_on_commit=False)
+
+    async def initialize(self):
+        async with self.engine.begin() as conn:
+            await conn.run_sync(Base.metadata.create_all)
+
+    async def close(self):
+        await self.engine.dispose()
+
+    async def record_event(self, cluster: str, event: Dict[str, Any]):
+        async with self.SessionLocal() as session:
+            # run_sync bridges our shared logic into the async session
+            await session.run_sync(_record_event_internal, cluster, event)
+            await session.commit()
+
+    async def get_job(self, cluster: str, job_id: int) -> Optional[JobRecord]:
+        async with self.SessionLocal() as session:
+            result = await session.execute(
+                select(JobModel).where(and_(JobModel.job_id == job_id, JobModel.cluster == cluster))
+            )
+            job = result.scalar_one_or_none()
+            return job.to_record() if job else None
+
+    async def get_event_history(self, cluster: str, job_id: int) -> List[EventRecord]:
+        async with self.SessionLocal() as session:
+            result = await session.execute(
+                select(EventModel)
+                .where(and_(EventModel.job_id == job_id, EventModel.cluster == cluster))
+                .order_by(EventModel.timestamp.asc())
+            )
+            return [e.to_record() for e in result.scalars().all()]
+
+    async def search_jobs(
+        self, cluster: str = None, state: str = None, limit: int = 10
+    ) -> List[JobRecord]:
+        async with self.SessionLocal() as session:
+            stmt = select(JobModel)
+            if cluster:
+                stmt = stmt.where(JobModel.cluster == cluster)
+            if state:
+                stmt = stmt.where(JobModel.state == state)
+            result = await session.execute(stmt.limit(limit))
+            return [j.to_record() for j in result.scalars().all()]
+
+
+class SQLAlchemyBackend:
+    """
+    Synchronous backend for the standalone Scribe daemon.
+    """
+
+    def __init__(self, db_url: str):
+        # strip 'aiosqlite+' or similar if passed from shared config
+        url = db_url.replace("+aiosqlite", "").replace("+asyncpg", "")
+        self.engine = create_engine(url, echo=False)
+        self.SessionLocal = sessionmaker(bind=self.engine, expire_on_commit=False)
+
+    def initialize(self):
+        Base.metadata.create_all(self.engine)
+
+    def close(self):
+        self.engine.dispose()
+
+    def record_event(self, cluster: str, event: Dict[str, Any]):
+        with self.SessionLocal() as session:
+            with session.begin():
+                _record_event_internal(session, cluster, event)
+
+    def get_unwatched_job_ids(self, cluster: str) -> List[int]:
+        """Specific for Scribe: find jobs that need a watcher."""
+        with self.SessionLocal() as session:
+            stmt = select(JobModel.job_id).where(
+                and_(JobModel.cluster == cluster, JobModel.state == "submitted")
+            )
+            return list(session.execute(stmt).scalars().all())
+
+    def mark_job_as_watched(self, cluster: str, job_id: int):
+        with self.SessionLocal() as session:
+            with session.begin():
+                session.execute(
+                    update(JobModel)
+                    .where(and_(JobModel.job_id == job_id, JobModel.cluster == cluster))
+                    .values(state="watching")
+                )
flux_batch/service/usernetes/models.py ADDED

@@ -0,0 +1,98 @@
+from dataclasses import dataclass
+from typing import Any, Dict, Optional
+
+from sqlalchemy import JSON, Float, Integer, String
+from sqlalchemy.ext.asyncio import AsyncAttrs
+from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column
+
+# DTOs are "Public Data Transfer Objects" and they are used by
+# our interfaces and tools
+
+
+@dataclass
+class JobRecord:
+    """
+    Represents a snapshot of a job state.
+    Returned by get_job() and search_jobs().
+    """
+
+    job_id: int
+    cluster: str
+    state: str
+    user: str
+    workdir: Optional[str] = None
+    exit_code: Optional[int] = None
+    submit_time: float = 0.0
+    last_updated: float = 0.0
+
+
+@dataclass
+class EventRecord:
+    """
+    Represents a single historical event.
+    Returned by get_event_history().
+    """
+
+    timestamp: float
+    event_type: str
+    payload: Dict[str, Any]
+
+
+# Database models for SQLAlchemy ORM
+
+
+class Base(AsyncAttrs, DeclarativeBase):
+    pass
+
+
+class JobModel(Base):
+    __tablename__ = "jobs"
+
+    # Composite Primary Key
+    job_id: Mapped[int] = mapped_column(Integer, primary_key=True)
+    cluster: Mapped[str] = mapped_column(String(255), primary_key=True)
+
+    state: Mapped[str] = mapped_column(String(50))
+    user: Mapped[str] = mapped_column(String(255), nullable=True)
+    workdir: Mapped[Optional[str]] = mapped_column(String, nullable=True)
+    exit_code: Mapped[Optional[int]] = mapped_column(Integer, nullable=True)
+    submit_time: Mapped[float] = mapped_column(Float, default=0.0)
+    last_updated: Mapped[float] = mapped_column(Float, default=0.0)
+
+    def to_record(self) -> JobRecord:
+        """
+        Helper to convert ORM model to public DTO
+        """
+        return JobRecord(
+            job_id=self.job_id,
+            cluster=self.cluster,
+            state=self.state,
+            user=self.user,
+            workdir=self.workdir,
+            exit_code=self.exit_code,
+            submit_time=self.submit_time,
+            last_updated=self.last_updated,
+        )
+
+
+class EventModel(Base):
+    __tablename__ = "events"
+
+    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
+    job_id: Mapped[int] = mapped_column(Integer, index=True)
+    cluster: Mapped[str] = mapped_column(String(255), index=True)
+    timestamp: Mapped[float] = mapped_column(Float)
+    event_type: Mapped[str] = mapped_column(String(50))
+    payload: Mapped[Dict[str, Any]] = mapped_column(JSON)
+
+    def to_record(self) -> EventRecord:
+        """
+        Helper to convert ORM model to public DTO
+        """
+        return EventRecord(
+            timestamp=self.timestamp,
+            event_type=self.event_type,
+            payload=self.payload,
+            cluster=self.cluster,
+            job_id=self.job_id,
+        )
flux_batch/service/usernetes/template.py ADDED

@@ -0,0 +1,53 @@
+# Template for the Scribe Journal Consumer
+SERVICE_TEMPLATE = """[Unit]
+Description=Flux Scribe Journal Consumer
+After=network.target
+
+[Service]
+ExecStart={python_path} -m flux_batch.service.scribe
+Restart=on-failure
+
+[Install]
+WantedBy=default.target
+"""
+
+START_MODULE_TEMPLATE = """
+from flux.modprobe import task
+import flux.subprocess as subprocess
+
+@task(
+    "start-{service_name}",
+    ranks="0",
+    needs_config=["{service_name}"],
+    after=["resource", "job-list"],
+)
+def start_{service_func}(context):
+    # This triggers the systemd user service provisioned earlier
+    # context.bash("systemctl --user start {service_name}")
+    subprocess.rexec_bg(
+        context.handle,
+        ["{python_bin}", "-m", "{module_name}"],
+        label="{service_name}",
+        nodeid=0
+    )
+"""
+
+STOP_MODULE_TEMPLATE = """
+from flux.modprobe import task
+import flux.subprocess as subprocess
+
+@task(
+    "stop-{service_name}",
+    ranks="0",
+    needs_config=["{service_name}"],
+    before=["resource", "job-list"],
+)
+def stop_{service_func}(context):
+    # context.bash("systemctl --user stop {service_name}")
+    subprocess.kill(context.handle, signum=2, label="{service_name}").get()
+    try:
+        status = subprocess.wait(context.handle, label="{service_name}").get()["status"]
+        print(status)
+    except:
+        pass
+"""
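The templates above are plain Python format strings. A sketch of how they might be rendered; the values passed to `format` are illustrative, and the code that consumes these templates is not part of this diff:

```python
import sys

from flux_batch.service.usernetes.template import SERVICE_TEMPLATE, START_MODULE_TEMPLATE

# Fill the {python_path} placeholder in the systemd unit
print(SERVICE_TEMPLATE.format(python_path=sys.executable))

# Fill the modprobe task placeholders; these names are made up for illustration
print(
    START_MODULE_TEMPLATE.format(
        service_name="flux-usernetes",
        service_func="flux_usernetes",
        python_bin=sys.executable,
        module_name="flux_batch.service.usernetes",
    )
)
```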
flux_batch/version.py CHANGED

{flux_batch-0.0.11.dist-info → flux_batch-0.0.13.dist-info}/METADATA CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: flux-batch
-Version: 0.0.11
+Version: 0.0.13
 Summary: Python SDK for flux batch jobs and services
 Home-page: https://github.com/converged-computing/flux-batch
 Author: Vanessa Sochat
@@ -47,6 +47,8 @@ Requires-Dist: rich ; extra == 'scribe'
 - [fractale-mcp](https://github.com/compspec/fractale-mcp): (fractale) MCP orchestration (agents, databases, ui interfaces).
 - [hpc-mcp](https://github.com/converged-computing/hpc-mcp): HPC tools for a larger set of HPC and converged computing use cases.

+If you are looking for `flux batch` please see the documentation [here](https://flux-framework.readthedocs.io/projects/flux-core/en/latest/man1/flux-batch.html). This library supports Flux Framework and is experimental.
+
 ## Services

 - **flux-scribe**: Write job events to a local sqlite database via the JournalConsumer (not added yet, written and needs testing)
@@ -135,7 +137,7 @@ batch.add_job(["sleep", "5"])
 batch.add_job(["echo", "Job 2 finished"])

 # Wrap it up into a jobspec
-jobspec = flux_batch.BatchJobspecV1.from_jobs(
+spec = flux_batch.BatchJobspecV1.from_jobs(
     batch,
     nodes=1,
     nslots=1,
@@ -146,14 +148,15 @@ jobspec = flux_batch.BatchJobspecV1.from_jobs(
 )

 # Add a prolog and epilog
-
-
+spec.add_prolog("echo 'Batch Wrapper Starting'")
+spec.add_epilog("echo 'Batch Wrapper Finished'")

 # Add a service (this assumes user level that exists)
-
+spec.add_service("flux-scribe")

-# Preview it
-print(flux_batch.submit(handle,
+# Preview it (batch wrapper), or generate the jobspec (json)
+print(flux_batch.submit(handle, spec, dry_run=True))
+jobspec = flux_batch.jobspec(spec)

 # Submit that bad boi.
 jobid = flux_batch.submit(handle, jobspec)
{flux_batch-0.0.11.dist-info → flux_batch-0.0.13.dist-info}/RECORD CHANGED

@@ -2,7 +2,7 @@ flux_batch/__init__.py,sha256=35032OVyHkBiQIptg7NBY_6mdhthFu1hpqOCsREm5lQ,280
 flux_batch/jobspec.py,sha256=MUMI4Y_DgjCRssgzUDagTTFw8L3t4L_5az1E3tmT1FI,5464
 flux_batch/models.py,sha256=JjrFqi4Skrop_cyIWfgAHTdY53kotkiGD0JpTnVlByI,2200
 flux_batch/submit.py,sha256=tNJMDvnsxbAuXg_5TkB8HurzQ9I-Dot_gXXpIU5LtTA,2654
-flux_batch/version.py,sha256=
+flux_batch/version.py,sha256=W-9wI-nhKTdrBIA6HgtR7eb_HyV0qAzlIAHd9UibgX4,643
 flux_batch/logger/__init__.py,sha256=eDdpw_uppR5mPLHE39qT_haqMxu-2wniLlJZDigRC2k,52
 flux_batch/logger/generate.py,sha256=L9JyMY2oapp0ss7f7LGuihbLomzVJsMq7sByy9NhbZI,4017
 flux_batch/logger/logger.py,sha256=HKymVBNcoPdX87QWy69er5wUzHVeriiKp9p0bIYboUo,5927
@@ -11,18 +11,23 @@ flux_batch/script/save_logs.sh,sha256=HeapqvL0iR8aX7LtbwlcFTy19j5WxkQ-1M2J9epu-E
 flux_batch/service/__init__.py,sha256=pD7RYpdSLZvYU4qwShYXA6UcaJy191fKtzqJL3uttEc,2724
 flux_batch/service/scribe.py,sha256=dY6geiLvXYIRcIzuP_naZscKgzX4Y5dPzxoWf9Wywg0,253
 flux_batch/service/scribe/__init__.py,sha256=773-AzF_WRY6udG5nQSexf7Vga4K1YZLy1sfQAEd1uo,126
-flux_batch/service/scribe/__main__.py,sha256=
-flux_batch/service/scribe/database.py,sha256=
-flux_batch/service/scribe/models.py,sha256=
+flux_batch/service/scribe/__main__.py,sha256=tyaETaimVptrg1qfWasgcavhh0AFj0kGAhFp2LKtwm4,4236
+flux_batch/service/scribe/database.py,sha256=0BImmobV8yzskcu4kagQCLqYDzUAtDyxUUJRlj-Yf2A,6343
+flux_batch/service/scribe/models.py,sha256=9eHYp_lO-oVYidY3G3khA1mDgm2AzmuPTpzPGEgxruA,3066
 flux_batch/service/scribe/template.py,sha256=yDsheid2FXeYr458loxB3HyZKSWpGQpM8iUQfnfOA-s,1336
+flux_batch/service/usernetes/__init__.py,sha256=BvrHIsjevkT-nFqUKcn9VKwvtUGA1G48jqk4z0O9xK4,129
+flux_batch/service/usernetes/__main__.py,sha256=xQHIxRd2pjGxpVcIyv2tq88ts3e2IsmhrQuMVG-wrOw,3851
+flux_batch/service/usernetes/database.py,sha256=kagh_YJFpKMtixJ5-Zs762vLFDGEU7Fq9lOYEgeKPmo,5668
+flux_batch/service/usernetes/models.py,sha256=wVHgX5X9PUpUSDtZtMaz1M_fGC1Vyv8JdUt58AoEA3c,2759
+flux_batch/service/usernetes/template.py,sha256=yDsheid2FXeYr458loxB3HyZKSWpGQpM8iUQfnfOA-s,1336
 flux_batch/utils/__init__.py,sha256=CqMhw_mBfR0HBcHwv7LtFITq0J7LBV413VQE9xrz8ks,42
 flux_batch/utils/fileio.py,sha256=Elz8WkNkJ9B6x7WmCwiIBW0GgsRSSFCcbuJh7aqu2z4,4879
 flux_batch/utils/text.py,sha256=Ci1BqHs2IbOSn2o60zhLkT4kIA7CSNuGj8mdiGaDIGk,606
 flux_batch/utils/timer.py,sha256=_Weec7Wd5hWQ1r4ZHjownG4YdoIowpVqilXhvYFmIgA,491
-flux_batch-0.0.
-flux_batch-0.0.
-flux_batch-0.0.
-flux_batch-0.0.
-flux_batch-0.0.
-flux_batch-0.0.
-flux_batch-0.0.
+flux_batch-0.0.13.dist-info/LICENSE,sha256=AlyLB1m_z0CENCx1ob0PedLTTohtH2VLZhs2kfygrfc,1108
+flux_batch-0.0.13.dist-info/METADATA,sha256=Y0somwoZuJzlzyT1KmkwWliawzy9foVPGd4Y2UZt4J0,5641
+flux_batch-0.0.13.dist-info/NOTICE,sha256=9CR93geVKl_4ZrJORbXN0fzkEM2y4DglWhY1hn9ZwQw,1167
+flux_batch-0.0.13.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+flux_batch-0.0.13.dist-info/entry_points.txt,sha256=ynoKpD82xn2V2sD-aZIQoq7NnfOu9VEKqW55Y1AoPGI,67
+flux_batch-0.0.13.dist-info/top_level.txt,sha256=jj8zAsZzMmbjiBISJL7lRtA37MSEAQYfObGLUncn9Lw,11
+flux_batch-0.0.13.dist-info/RECORD,,

File without changes
File without changes
File without changes
File without changes
File without changes