abstractgateway 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,429 @@
1
+ """Run Gateway runner worker (AbstractGateway).
2
+
3
+ Backlog: 307-Framework: Durable Run Gateway (Command Inbox + Ledger Stream)
4
+
5
+ Key properties (v0):
6
+ - Commands are accepted by being appended to a durable JSONL inbox (idempotent by command_id).
7
+ - A background worker polls the inbox and applies commands to persisted runs.
8
+ - A tick loop progresses RUNNING runs by calling Runtime.tick(...) and appending StepRecords.
9
+ - Clients render by replaying the durable ledger (cursor/offset semantics), not by relying on live RPC.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import logging
15
+ import os
16
+ import threading
17
+ from concurrent.futures import ThreadPoolExecutor
18
+ from dataclasses import dataclass
19
+ from pathlib import Path
20
+ from typing import Any, Dict, Optional, Protocol
21
+
22
+ from abstractruntime import JsonFileCommandCursorStore, JsonlCommandStore, Runtime
23
+ from abstractruntime.core.event_keys import build_event_wait_key
24
+ from abstractruntime.core.models import RunStatus, WaitReason
25
+ from abstractruntime.storage.commands import CommandRecord
26
+
27
+
28
+ logger = logging.getLogger(__name__)
29
+
30
+
31
+ def _is_pause_wait(waiting: Any, *, run_id: str) -> bool:
32
+ if waiting is None:
33
+ return False
34
+ wait_key = getattr(waiting, "wait_key", None)
35
+ if isinstance(wait_key, str) and wait_key == f"pause:{run_id}":
36
+ return True
37
+ details = getattr(waiting, "details", None)
38
+ if isinstance(details, dict) and details.get("kind") == "pause":
39
+ return True
40
+ return False
41
+
42
+
43
class GatewayHost(Protocol):
    """Host capability needed by GatewayRunner to tick/resume runs.

    This is a structural interface: any object exposing these members can be
    handed to ``GatewayRunner`` as its ``host``.
    """

    @property
    def run_store(self) -> Any:
        """Run store; the runner probes it for ``list_runs``,
        ``list_due_wait_until`` and ``list_children``."""
        ...

    @property
    def ledger_store(self) -> Any:
        """Ledger store handed to the ``Runtime`` used for run control."""
        ...

    @property
    def artifact_store(self) -> Any:
        """Artifact store handed to the ``Runtime`` used for run control."""
        ...

    def runtime_and_workflow_for_run(self, run_id: str) -> tuple[Runtime, Any]:
        """Return the ``(runtime, workflow)`` pair used to tick or resume ``run_id``."""
        ...
56
+
57
+
58
@dataclass(frozen=True)
class GatewayRunnerConfig:
    """Immutable tuning knobs for ``GatewayRunner``."""

    # Seconds the main loop waits between iterations.
    poll_interval_s: float = 0.25
    # Maximum number of inbox commands fetched per poll.
    command_batch_limit: int = 200
    # ``max_steps`` passed to ``Runtime.tick`` for each scheduled run.
    tick_max_steps: int = 100
    # Size of the thread pool that executes ticks (the runner uses at least 1).
    tick_workers: int = 2
    # ``limit`` passed to the run-store scans that look for runs to tick.
    run_scan_limit: int = 200
65
+
66
+
67
class GatewayRunner:
    """Background worker: poll command inbox + tick runs forward.

    One daemon thread runs ``_loop``: each iteration applies newly appended
    inbox commands and then submits ticks for runnable runs to a thread pool.
    A best-effort file lock (``gateway_runner.lock`` under ``base_dir``) keeps
    several processes from running the loop against the same directory.
    """

    def __init__(
        self,
        *,
        base_dir: Path,
        host: GatewayHost,
        config: Optional[GatewayRunnerConfig] = None,
        enable: bool = True,
    ) -> None:
        """Create a runner; nothing runs until ``start()`` is called.

        Args:
            base_dir: Directory holding the command inbox, the cursor file
                (``commands_cursor.json``) and the singleton lock file.
            host: Provides the stores and per-run runtime/workflow pairs.
            config: Tuning knobs; defaults to ``GatewayRunnerConfig()``.
            enable: When false, ``start()`` is a logged no-op.
        """
        self._base_dir = Path(base_dir)
        self._host = host
        self._cfg = config or GatewayRunnerConfig()
        self._enable = bool(enable)

        self._command_store = JsonlCommandStore(self._base_dir)
        self._cursor_store = JsonFileCommandCursorStore(self._base_dir / "commands_cursor.json")

        self._stop = threading.Event()
        self._thread: Optional[threading.Thread] = None
        self._executor = self._new_executor()
        # Set by stop(); lets start() know the pool must be rebuilt.
        self._executor_closed = False
        # Run ids that currently have a tick queued or executing.
        self._inflight: set[str] = set()
        self._inflight_lock = threading.Lock()

        self._singleton_lock_path = self._base_dir / "gateway_runner.lock"
        self._singleton_lock_fh: Optional[Any] = None

    def _new_executor(self) -> ThreadPoolExecutor:
        """Build the tick thread pool sized from the config (at least one worker)."""
        return ThreadPoolExecutor(max_workers=max(1, int(self._cfg.tick_workers or 1)))

    @property
    def enabled(self) -> bool:
        """Whether this runner is allowed to start."""
        return self._enable

    @property
    def command_store(self) -> JsonlCommandStore:
        """The durable command inbox this runner polls."""
        return self._command_store

    @property
    def run_store(self) -> Any:
        """The host's run store."""
        return self._host.run_store

    @property
    def ledger_store(self) -> Any:
        """The host's ledger store."""
        return self._host.ledger_store

    @property
    def artifact_store(self) -> Any:
        """The host's artifact store."""
        return self._host.artifact_store

    def start(self) -> None:
        """Start the background loop thread.

        Does nothing when the runner is disabled, when the loop thread is
        already alive, or when the singleton lock cannot be acquired.
        """
        if not self._enable:
            logger.info("GatewayRunner disabled by config/env")
            return
        if self._thread is not None and self._thread.is_alive():
            return
        if not self._acquire_singleton_lock():
            logger.warning("GatewayRunner not started: another process holds %s", self._singleton_lock_path)
            return
        # A ThreadPoolExecutor cannot accept work after shutdown(), so a
        # restart after stop() needs a fresh pool.
        if getattr(self, "_executor_closed", False):
            self._executor = self._new_executor()
            self._executor_closed = False
        self._stop.clear()
        self._thread = threading.Thread(target=self._loop, name="abstractgateway-runner", daemon=True)
        self._thread.start()
        logger.info("GatewayRunner started (base_dir=%s)", self._base_dir)

    def stop(self, timeout_s: float = 5.0) -> None:
        """Signal the loop to stop, shut the tick pool down and release the lock.

        Args:
            timeout_s: Maximum seconds to wait for the loop thread to exit.
        """
        self._stop.set()
        worker = self._thread
        if worker is not None:
            worker.join(timeout=timeout_s)
            if worker.is_alive():
                logger.warning("GatewayRunner loop still running after %ss", timeout_s)
            self._thread = None
        try:
            try:
                self._executor.shutdown(wait=False, cancel_futures=True)  # type: ignore[call-arg]
            except TypeError:
                # Python < 3.9 has no cancel_futures; still shut the pool down.
                self._executor.shutdown(wait=False)
        except Exception:
            logger.exception("GatewayRunner executor shutdown failed")
        self._executor_closed = True
        self._release_singleton_lock()

    def _acquire_singleton_lock(self) -> bool:
        """Best-effort process singleton lock (prevents multi-worker double ticking).

        Returns:
            ``True`` when the lock is held by this instance afterwards, or when
            ``fcntl`` is unavailable (no locking possible); ``False`` when the
            lock file could not be opened or locked.
        """
        try:
            import fcntl  # Unix only
        except Exception:  # pragma: no cover
            return True

        if self._singleton_lock_fh is not None:
            # Already holding the lock: a second flock() through a new handle
            # would conflict with our own lock and wrongly report failure.
            return True

        fh = None
        try:
            self._singleton_lock_path.parent.mkdir(parents=True, exist_ok=True)
            fh = self._singleton_lock_path.open("a", encoding="utf-8")
            fcntl.flock(fh.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
            fh.write(f"pid={os.getpid()}\n")
            fh.flush()
        except Exception:
            # Close the handle we just opened so a failed attempt leaks nothing.
            if fh is not None:
                try:
                    fh.close()
                except Exception:
                    pass
            return False

        self._singleton_lock_fh = fh
        return True

    def _release_singleton_lock(self) -> None:
        """Close the lock file handle (which drops the lock), ignoring close errors."""
        try:
            if self._singleton_lock_fh is not None:
                self._singleton_lock_fh.close()
        except Exception:
            pass
        self._singleton_lock_fh = None

    # ---------------------------------------------------------------------
    # Main loop
    # ---------------------------------------------------------------------

    def _loop(self) -> None:
        """Thread body: poll commands, schedule ticks, sleep; until stopped.

        Errors in either phase are logged and do not end the loop.
        """
        cursor = int(self._cursor_store.load() or 0)
        while not self._stop.is_set():
            try:
                cursor = self._poll_commands(cursor)
            except Exception as e:
                logger.exception("GatewayRunner command poll error: %s", e)
            try:
                self._schedule_ticks()
            except Exception as e:
                logger.exception("GatewayRunner tick scheduling error: %s", e)
            self._stop.wait(timeout=float(self._cfg.poll_interval_s or 0.25))

    def _poll_commands(self, cursor: int) -> int:
        """Apply the next batch of inbox commands after ``cursor``.

        Args:
            cursor: Sequence number of the last command already processed.

        Returns:
            The cursor to use for the next poll.
        """
        start = int(cursor or 0)
        items, next_cursor = self._command_store.list_after(after=start, limit=int(self._cfg.command_batch_limit))
        if not items:
            return start

        cur = start
        save_error: Optional[Exception] = None
        for rec in items:
            try:
                self._apply_command(rec)
            except Exception as e:
                # Durable inbox: we advance cursor even if a command fails so it does not block the stream.
                logger.exception("GatewayRunner failed applying command %s: %s", rec.command_id, e)
            cur = max(cur, int(rec.seq or cur))
            # Persist after each command for restart safety (at-least-once acceptance).
            try:
                self._cursor_store.save(cur)
                save_error = None
            except Exception as e:
                save_error = e
        if save_error is not None:
            # Report once per batch: the in-memory cursor still advances, but a
            # restart would re-read from the last successfully saved position.
            logger.warning("GatewayRunner could not persist command cursor %s: %s", cur, save_error)
        return max(int(next_cursor or 0), cur)

    def _schedule_ticks(self) -> None:
        """Submit a tick for every gateway-owned run that is RUNNING or due.

        Only runs whose ``actor_id`` is ``"gateway"`` are ticked. Store methods
        are looked up dynamically; a store lacking them contributes no runs.
        """
        limit = int(self._cfg.run_scan_limit)

        list_runs = getattr(self.run_store, "list_runs", None)
        runs = list_runs(status=RunStatus.RUNNING, limit=limit) if callable(list_runs) else []

        due: Any = []
        list_due = getattr(self.run_store, "list_due_wait_until", None)
        if callable(list_due):
            try:
                from abstractruntime.scheduler.scheduler import utc_now_iso

                due = list_due(now_iso=utc_now_iso(), limit=limit)
            except Exception as e:
                logger.debug("GatewayRunner: due-run scan failed: %s", e)
                due = []

        # A run may appear in both lists; submit it at most once per pass so a
        # fast first tick cannot be followed by a duplicate in the same pass.
        submitted: set[str] = set()
        for r in list(runs or []) + list(due or []):
            rid = getattr(r, "run_id", None)
            if not isinstance(rid, str) or not rid:
                continue
            if getattr(r, "actor_id", None) != "gateway":
                continue
            if rid in submitted:
                continue
            submitted.add(rid)
            self._submit_tick(rid)

    def _submit_tick(self, run_id: str) -> None:
        """Queue a tick for ``run_id`` unless one is already queued or running.

        Raises:
            Whatever ``executor.submit`` raises (e.g. ``RuntimeError`` after
            shutdown); the run is removed from the in-flight set first.
        """
        with self._inflight_lock:
            if run_id in self._inflight:
                return
            self._inflight.add(run_id)

        def _done(_f: Any) -> None:
            with self._inflight_lock:
                self._inflight.discard(run_id)

        try:
            fut = self._executor.submit(self._tick_run, run_id)
        except Exception:
            # Nothing was queued: un-mark the run so later passes can retry it.
            _done(None)
            raise
        try:
            fut.add_done_callback(_done)
        except Exception:
            _done(fut)

    # ---------------------------------------------------------------------
    # Command application
    # ---------------------------------------------------------------------

    def _apply_command(self, rec: CommandRecord) -> None:
        """Dispatch one inbox command (pause / resume / cancel / emit_event).

        Raises:
            ValueError: Unknown command type or missing ``run_id``.
        """
        typ = str(rec.type or "").strip().lower()
        if typ not in {"pause", "resume", "cancel", "emit_event"}:
            raise ValueError(f"Unknown command type '{typ}'")

        payload = dict(rec.payload or {})
        run_id = str(rec.run_id or "").strip()
        if not run_id:
            raise ValueError("Command.run_id is required")

        # pause/cancel are durability operations; apply to full run tree.
        if typ in {"pause", "cancel"}:
            self._apply_run_control(typ, run_id=run_id, payload=payload, apply_to_tree=True)
            return

        # resume can mean either:
        # - resume a paused run (no payload.payload provided) [tree-wide]
        # - resume a WAITING run with a payload (payload.payload provided) [single run]
        if typ == "resume":
            wants_wait_resume = "payload" in payload
            self._apply_run_control(typ, run_id=run_id, payload=payload, apply_to_tree=not wants_wait_resume)
            return

        # emit_event: host-side signal -> resume matching WAIT_EVENT runs
        if typ == "emit_event":
            self._apply_emit_event(payload, default_session_id=run_id, client_id=rec.client_id)
            return

    def _apply_run_control(self, typ: str, *, run_id: str, payload: Dict[str, Any], apply_to_tree: bool) -> None:
        """Apply pause / resume / cancel to a run, optionally to its whole tree.

        Args:
            typ: ``"pause"`` or ``"resume"``; any other value is treated as cancel.
            run_id: Root run the command targets.
            payload: Command payload (``reason``; for wait-resume ``payload``
                and optional ``wait_key`` / ``waitKey``).
            apply_to_tree: When true, also apply to descendant runs.

        Raises:
            ValueError: A resume carries ``payload`` that is not a dict.
        """
        runtime = Runtime(run_store=self.run_store, ledger_store=self.ledger_store, artifact_store=self.artifact_store)

        reason = payload.get("reason")
        reason_str = str(reason).strip() if isinstance(reason, str) and reason.strip() else None

        targets = self._list_descendant_run_ids(runtime, run_id) if apply_to_tree else [run_id]

        if typ == "pause":
            for rid in targets:
                runtime.pause_run(rid, reason=reason_str)
            return

        if typ != "resume":
            for rid in targets:
                runtime.cancel_run(rid, reason=reason_str or "Cancelled")
            return

        if "payload" not in payload:
            # No durable payload: interpret resume as "resume paused run".
            for rid in targets:
                runtime.resume_run(rid)
            return

        # Resume WAITING runs when the client provides a durable resume payload.
        resume_payload = payload.get("payload")
        if not isinstance(resume_payload, dict):
            raise ValueError("resume command requires payload.payload to be an object")
        wait_key = payload.get("wait_key") or payload.get("waitKey")
        wait_key2 = str(wait_key).strip() if isinstance(wait_key, str) and wait_key.strip() else None
        for rid in targets:
            rt2, wf2 = self._host.runtime_and_workflow_for_run(rid)
            rt2.resume(workflow=wf2, run_id=rid, wait_key=wait_key2, payload=resume_payload, max_steps=0)

    def _apply_emit_event(self, payload: Dict[str, Any], *, default_session_id: str, client_id: Optional[str]) -> None:
        """Resume every WAITING(EVENT) run whose wait key matches the emitted event.

        Args:
            payload: Event description; ``name`` is required, ``scope``
                defaults to ``"session"``; snake_case and camelCase keys are
                both accepted for the id fields.
            default_session_id: Session id used when the payload has none.
            client_id: Recorded in the event envelope's ``emitter``.

        Raises:
            ValueError: ``payload`` has no usable ``name``.
        """
        name = payload.get("name")
        name2 = str(name or "").strip()
        if not name2:
            raise ValueError("emit_event requires payload.name")

        scope = payload.get("scope") or "session"
        scope2 = str(scope or "session").strip().lower() or "session"
        session_id = payload.get("session_id") or payload.get("sessionId") or default_session_id
        workflow_id = payload.get("workflow_id") or payload.get("workflowId")
        run_id = payload.get("run_id") or payload.get("runId")
        event_payload = payload.get("payload")
        if isinstance(event_payload, dict):
            payload2 = dict(event_payload)
        else:
            # Non-dict payloads are wrapped so waiters always receive a dict.
            payload2 = {"value": event_payload}

        wait_key = build_event_wait_key(
            scope=scope2,
            name=name2,
            session_id=str(session_id) if isinstance(session_id, str) and session_id else None,
            workflow_id=str(workflow_id) if isinstance(workflow_id, str) and workflow_id else None,
            run_id=str(run_id) if isinstance(run_id, str) and run_id else None,
        )

        envelope: Dict[str, Any] = {
            "event_id": payload.get("event_id") or payload.get("eventId"),
            "name": name2,
            "scope": scope2,
            "session_id": session_id,
            "payload": payload2,
            "emitted_at": payload.get("emitted_at") or payload.get("emittedAt"),
            "emitter": {"source": "external", "client_id": client_id},
        }

        # Find matching WAIT_EVENT runs and resume them.
        list_runs = getattr(self.run_store, "list_runs", None)
        if not callable(list_runs):
            return

        waiting_runs = list_runs(status=RunStatus.WAITING, wait_reason=WaitReason.EVENT, limit=10_000)
        for r in waiting_runs or []:
            if getattr(r, "waiting", None) is None:
                continue
            if getattr(r.waiting, "wait_key", None) != wait_key:
                continue
            # Paused runs must stay paused even if the event matches.
            if _is_pause_wait(getattr(r, "waiting", None), run_id=str(getattr(r, "run_id", "") or "")):
                continue
            runtime, wf = self._host.runtime_and_workflow_for_run(r.run_id)
            runtime.resume(workflow=wf, run_id=r.run_id, wait_key=wait_key, payload=envelope, max_steps=0)

    def _list_descendant_run_ids(self, runtime: Runtime, root_run_id: str) -> list[str]:
        """Return root + descendants (best-effort), in breadth-first order.

        Children come from ``runtime.run_store.list_children`` when the store
        has it; a failing lookup is treated as "no children". Each id appears
        once even if the store reports cycles or shared children.
        """
        from collections import deque

        out: list[str] = []
        queue = deque([root_run_id])
        seen: set[str] = set()
        list_children = getattr(runtime.run_store, "list_children", None)
        while queue:
            rid = queue.popleft()
            if rid in seen:
                continue
            seen.add(rid)
            out.append(rid)
            if not callable(list_children):
                continue
            try:
                children = list_children(parent_run_id=rid) or []
            except Exception:
                children = []
            for c in children:
                cid = getattr(c, "run_id", None)
                if isinstance(cid, str) and cid and cid not in seen:
                    queue.append(cid)
        return out

    # ---------------------------------------------------------------------
    # Tick execution + subworkflow parent resumption
    # ---------------------------------------------------------------------

    def _tick_run(self, run_id: str) -> None:
        """Advance one run by up to ``tick_max_steps`` steps (runs on the pool).

        Failures are logged instead of raised: this executes inside a Future
        nobody inspects, so a raised exception would otherwise vanish.
        """
        try:
            runtime, wf = self._host.runtime_and_workflow_for_run(run_id)
        except Exception as e:
            logger.debug("GatewayRunner: cannot build runtime for %s: %s", run_id, e)
            return

        try:
            state = runtime.tick(workflow=wf, run_id=run_id, max_steps=int(self._cfg.tick_max_steps or 100))
        except Exception:
            logger.exception("GatewayRunner tick failed for run %s", run_id)
            return

        # If this run completed, it may unblock a parent WAITING(SUBWORKFLOW).
        if getattr(state, "status", None) == RunStatus.COMPLETED:
            try:
                self._resume_subworkflow_parents(child_run_id=run_id, child_output=state.output or {})
            except Exception:
                logger.exception("GatewayRunner failed resuming parents of run %s", run_id)

    def _resume_subworkflow_parents(self, *, child_run_id: str, child_output: Dict[str, Any]) -> None:
        """Resume WAITING(SUBWORKFLOW) runs that were waiting on ``child_run_id``.

        Args:
            child_run_id: Id of the run that just completed.
            child_output: Its output, delivered to each parent as ``"output"``.
        """
        list_runs = getattr(self.run_store, "list_runs", None)
        if not callable(list_runs):
            return
        waiting = list_runs(status=RunStatus.WAITING, limit=2000)
        for r in waiting or []:
            wait = getattr(r, "waiting", None)
            if wait is None or getattr(wait, "reason", None) != WaitReason.SUBWORKFLOW:
                continue
            details = getattr(wait, "details", None)
            if not isinstance(details, dict) or details.get("sub_run_id") != child_run_id:
                continue
            # Leave paused parents alone.
            if _is_pause_wait(wait, run_id=str(getattr(r, "run_id", "") or "")):
                continue
            runtime, wf = self._host.runtime_and_workflow_for_run(r.run_id)
            runtime.resume(
                workflow=wf,
                run_id=r.run_id,
                wait_key=getattr(wait, "wait_key", None),
                payload={"sub_run_id": child_run_id, "output": child_output},
                max_steps=0,
            )
428
+
429
+
@@ -0,0 +1,5 @@
1
+ from .gateway_security import GatewayAuthPolicy, GatewaySecurityMiddleware, load_gateway_auth_policy_from_env
2
+
3
+ __all__ = ["GatewayAuthPolicy", "GatewaySecurityMiddleware", "load_gateway_auth_policy_from_env"]
4
+
5
+