abstractgateway 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. abstractgateway/__init__.py +1 -2
  2. abstractgateway/__main__.py +7 -0
  3. abstractgateway/app.py +4 -4
  4. abstractgateway/cli.py +568 -8
  5. abstractgateway/config.py +15 -5
  6. abstractgateway/embeddings_config.py +45 -0
  7. abstractgateway/host_metrics.py +274 -0
  8. abstractgateway/hosts/bundle_host.py +528 -55
  9. abstractgateway/hosts/visualflow_host.py +30 -3
  10. abstractgateway/integrations/__init__.py +2 -0
  11. abstractgateway/integrations/email_bridge.py +782 -0
  12. abstractgateway/integrations/telegram_bridge.py +534 -0
  13. abstractgateway/maintenance/__init__.py +5 -0
  14. abstractgateway/maintenance/action_tokens.py +100 -0
  15. abstractgateway/maintenance/backlog_exec_runner.py +1592 -0
  16. abstractgateway/maintenance/backlog_parser.py +184 -0
  17. abstractgateway/maintenance/draft_generator.py +451 -0
  18. abstractgateway/maintenance/llm_assist.py +212 -0
  19. abstractgateway/maintenance/notifier.py +109 -0
  20. abstractgateway/maintenance/process_manager.py +1064 -0
  21. abstractgateway/maintenance/report_models.py +81 -0
  22. abstractgateway/maintenance/report_parser.py +219 -0
  23. abstractgateway/maintenance/text_similarity.py +123 -0
  24. abstractgateway/maintenance/triage.py +507 -0
  25. abstractgateway/maintenance/triage_queue.py +142 -0
  26. abstractgateway/migrate.py +155 -0
  27. abstractgateway/routes/__init__.py +2 -2
  28. abstractgateway/routes/gateway.py +10817 -179
  29. abstractgateway/routes/triage.py +118 -0
  30. abstractgateway/runner.py +689 -14
  31. abstractgateway/security/gateway_security.py +425 -110
  32. abstractgateway/service.py +213 -6
  33. abstractgateway/stores.py +64 -4
  34. abstractgateway/workflow_deprecations.py +225 -0
  35. abstractgateway-0.1.1.dist-info/METADATA +135 -0
  36. abstractgateway-0.1.1.dist-info/RECORD +40 -0
  37. abstractgateway-0.1.0.dist-info/METADATA +0 -101
  38. abstractgateway-0.1.0.dist-info/RECORD +0 -18
  39. {abstractgateway-0.1.0.dist-info → abstractgateway-0.1.1.dist-info}/WHEEL +0 -0
  40. {abstractgateway-0.1.0.dist-info → abstractgateway-0.1.1.dist-info}/entry_points.txt +0 -0
abstractgateway/runner.py CHANGED
@@ -11,18 +11,27 @@ Key properties (v0):
11
11
 
12
12
  from __future__ import annotations
13
13
 
14
+ import datetime
14
15
  import logging
15
16
  import os
17
+ import re
16
18
  import threading
17
19
  from concurrent.futures import ThreadPoolExecutor
18
20
  from dataclasses import dataclass
19
21
  from pathlib import Path
20
22
  from typing import Any, Dict, Optional, Protocol
21
23
 
22
- from abstractruntime import JsonFileCommandCursorStore, JsonlCommandStore, Runtime
24
+ from abstractruntime import Runtime
23
25
  from abstractruntime.core.event_keys import build_event_wait_key
24
- from abstractruntime.core.models import RunStatus, WaitReason
25
- from abstractruntime.storage.commands import CommandRecord
26
+ from abstractruntime.core.models import Effect, EffectType, RunStatus, StepRecord, WaitReason
27
+ from abstractruntime.scheduler.scheduler import utc_now_iso
28
+ from abstractruntime.storage.commands import (
29
+ CommandCursorStore,
30
+ CommandRecord,
31
+ CommandStore,
32
+ JsonFileCommandCursorStore,
33
+ JsonlCommandStore,
34
+ )
26
35
 
27
36
 
28
37
  logger = logging.getLogger(__name__)
@@ -60,7 +69,7 @@ class GatewayRunnerConfig:
60
69
  poll_interval_s: float = 0.25
61
70
  command_batch_limit: int = 200
62
71
  tick_max_steps: int = 100
63
- tick_workers: int = 2
72
+ tick_workers: int = 4
64
73
  run_scan_limit: int = 200
65
74
 
66
75
 
@@ -74,14 +83,18 @@ class GatewayRunner:
74
83
  host: GatewayHost,
75
84
  config: Optional[GatewayRunnerConfig] = None,
76
85
  enable: bool = True,
86
+ command_store: CommandStore | None = None,
87
+ cursor_store: CommandCursorStore | None = None,
77
88
  ) -> None:
78
89
  self._base_dir = Path(base_dir)
79
90
  self._host = host
80
91
  self._cfg = config or GatewayRunnerConfig()
81
92
  self._enable = bool(enable)
82
93
 
83
- self._command_store = JsonlCommandStore(self._base_dir)
84
- self._cursor_store = JsonFileCommandCursorStore(self._base_dir / "commands_cursor.json")
94
+ self._command_store: CommandStore = command_store or JsonlCommandStore(self._base_dir)
95
+ self._cursor_store: CommandCursorStore = cursor_store or JsonFileCommandCursorStore(
96
+ self._base_dir / "commands_cursor.json"
97
+ )
85
98
 
86
99
  self._stop = threading.Event()
87
100
  self._thread: Optional[threading.Thread] = None
@@ -97,7 +110,7 @@ class GatewayRunner:
97
110
  return self._enable
98
111
 
99
112
  @property
100
- def command_store(self) -> JsonlCommandStore:
113
+ def command_store(self) -> CommandStore:
101
114
  return self._command_store
102
115
 
103
116
  @property
@@ -137,6 +150,50 @@ class GatewayRunner:
137
150
  pass
138
151
  self._release_singleton_lock()
139
152
 
153
+ def emit_event(
154
+ self,
155
+ *,
156
+ name: str,
157
+ payload: Any,
158
+ session_id: str,
159
+ scope: str = "session",
160
+ workflow_id: Optional[str] = None,
161
+ run_id: Optional[str] = None,
162
+ event_id: Optional[str] = None,
163
+ emitted_at: Optional[str] = None,
164
+ client_id: Optional[str] = None,
165
+ ) -> None:
166
+ """Emit an external event into the runtime (resume matching WAIT_EVENT runs).
167
+
168
+ This is a thin wrapper around the internal emit_event command handling so
169
+ integrations (Telegram bridge, webhooks, etc.) don't need to know the
170
+ command-store format.
171
+ """
172
+
173
+ name2 = str(name or "").strip()
174
+ if not name2:
175
+ raise ValueError("name is required")
176
+ sid = str(session_id or "").strip()
177
+ if not sid:
178
+ raise ValueError("session_id is required")
179
+
180
+ body: Dict[str, Any] = {
181
+ "name": name2,
182
+ "scope": str(scope or "session").strip().lower() or "session",
183
+ "session_id": sid,
184
+ "payload": payload,
185
+ }
186
+ if isinstance(workflow_id, str) and workflow_id.strip():
187
+ body["workflow_id"] = workflow_id.strip()
188
+ if isinstance(run_id, str) and run_id.strip():
189
+ body["run_id"] = run_id.strip()
190
+ if isinstance(event_id, str) and event_id.strip():
191
+ body["event_id"] = event_id.strip()
192
+ if isinstance(emitted_at, str) and emitted_at.strip():
193
+ body["emitted_at"] = emitted_at.strip()
194
+
195
+ self._apply_emit_event(body, default_session_id=sid, client_id=client_id)
196
+
140
197
  def _acquire_singleton_lock(self) -> bool:
141
198
  """Best-effort process singleton lock (prevents multi-worker double ticking)."""
142
199
  try:
@@ -215,8 +272,6 @@ class GatewayRunner:
215
272
  list_due = getattr(self.run_store, "list_due_wait_until", None)
216
273
  if callable(list_due):
217
274
  try:
218
- from abstractruntime.scheduler.scheduler import utc_now_iso
219
-
220
275
  due = list_due(now_iso=utc_now_iso(), limit=int(self._cfg.run_scan_limit))
221
276
  except Exception:
222
277
  due = []
@@ -234,6 +289,85 @@ class GatewayRunner:
234
289
  continue
235
290
  self._submit_tick(rid)
236
291
 
292
+ # Best-effort recovery: if we restart after a child run reaches a terminal state,
293
+ # parents blocked on WAITING(SUBWORKFLOW) can remain stuck because we don't tick
294
+ # terminal child runs. Detect such cases and resume parents.
295
+ try:
296
+ self._repair_terminal_subworkflow_waits()
297
+ except Exception:
298
+ pass
299
+
300
+ def _repair_terminal_subworkflow_waits(self) -> None:
301
+ list_runs = getattr(self.run_store, "list_runs", None)
302
+ if not callable(list_runs):
303
+ return
304
+
305
+ try:
306
+ waiting = list_runs(status=RunStatus.WAITING, wait_reason=WaitReason.SUBWORKFLOW, limit=int(self._cfg.run_scan_limit))
307
+ except TypeError:
308
+ # Older/alternate stores may not support wait_reason filtering.
309
+ waiting = list_runs(status=RunStatus.WAITING, limit=int(self._cfg.run_scan_limit))
310
+ except Exception:
311
+ waiting = []
312
+
313
+ for r in waiting or []:
314
+ # Only repair gateway-owned run trees.
315
+ if getattr(r, "actor_id", None) != "gateway":
316
+ continue
317
+ wait = getattr(r, "waiting", None)
318
+ if wait is None or getattr(wait, "reason", None) != WaitReason.SUBWORKFLOW:
319
+ continue
320
+ details = getattr(wait, "details", None)
321
+ if not isinstance(details, dict):
322
+ continue
323
+ sub_run_id = details.get("sub_run_id")
324
+ if not isinstance(sub_run_id, str) or not sub_run_id.strip():
325
+ continue
326
+ child = self.run_store.load(sub_run_id.strip())
327
+ if child is None:
328
+ continue
329
+
330
+ st = getattr(child, "status", None)
331
+ if st not in (RunStatus.COMPLETED, RunStatus.FAILED, RunStatus.CANCELLED):
332
+ continue
333
+
334
+ child_out_raw: Any = getattr(child, "output", None)
335
+ if isinstance(child_out_raw, dict):
336
+ child_out: Dict[str, Any] = dict(child_out_raw)
337
+ else:
338
+ child_out = {"result": child_out_raw}
339
+
340
+ if st != RunStatus.COMPLETED:
341
+ child_out.setdefault("success", False)
342
+ if st == RunStatus.CANCELLED:
343
+ child_out.setdefault("cancelled", True)
344
+ err = getattr(child, "error", None)
345
+ if isinstance(err, str) and err.strip():
346
+ child_out.setdefault("error", err.strip())
347
+
348
+ runtime, wf = self._host.runtime_and_workflow_for_run(r.run_id)
349
+ payload: Dict[str, Any] = {"sub_run_id": sub_run_id.strip(), "output": child_out}
350
+ try:
351
+ include_traces = bool(details.get("include_traces") or details.get("includeTraces"))
352
+ except Exception:
353
+ include_traces = False
354
+ if include_traces:
355
+ try:
356
+ payload["node_traces"] = runtime.get_node_traces(sub_run_id.strip()) or {}
357
+ except Exception:
358
+ payload["node_traces"] = {}
359
+ try:
360
+ runtime.resume(
361
+ workflow=wf,
362
+ run_id=r.run_id,
363
+ wait_key=getattr(wait, "wait_key", None),
364
+ payload=payload,
365
+ max_steps=0,
366
+ )
367
+ except Exception:
368
+ # Best-effort recovery only; avoid blocking the runner loop on a single bad tree.
369
+ continue
370
+
237
371
  def _submit_tick(self, run_id: str) -> None:
238
372
  with self._inflight_lock:
239
373
  if run_id in self._inflight:
@@ -256,7 +390,7 @@ class GatewayRunner:
256
390
 
257
391
  def _apply_command(self, rec: CommandRecord) -> None:
258
392
  typ = str(rec.type or "").strip().lower()
259
- if typ not in {"pause", "resume", "cancel", "emit_event"}:
393
+ if typ not in {"pause", "resume", "cancel", "emit_event", "update_schedule", "compact_memory"}:
260
394
  raise ValueError(f"Unknown command type '{typ}'")
261
395
 
262
396
  payload = dict(rec.payload or {})
@@ -282,6 +416,14 @@ class GatewayRunner:
282
416
  self._apply_emit_event(payload, default_session_id=run_id, client_id=rec.client_id)
283
417
  return
284
418
 
419
+ if typ == "update_schedule":
420
+ self._apply_update_schedule(payload, run_id=run_id, command_id=str(rec.command_id), client_id=rec.client_id)
421
+ return
422
+
423
+ if typ == "compact_memory":
424
+ self._apply_compact_memory(payload, run_id=run_id, command_id=str(rec.command_id), client_id=rec.client_id)
425
+ return
426
+
285
427
  def _apply_run_control(self, typ: str, *, run_id: str, payload: Dict[str, Any], apply_to_tree: bool) -> None:
286
428
  runtime = Runtime(run_store=self.run_store, ledger_store=self.ledger_store, artifact_store=self.artifact_store)
287
429
 
@@ -309,6 +451,37 @@ class GatewayRunner:
309
451
  else:
310
452
  runtime.cancel_run(rid, reason=reason_str or "Cancelled")
311
453
 
454
+ # UX affordance for scheduled runs: resuming a paused schedule should trigger the next
455
+ # WAIT_UNTIL immediately so the schedule "wakes up" right away.
456
+ if typ == "resume" and apply_to_tree and "payload" not in payload:
457
+ try:
458
+ self._maybe_trigger_scheduled_wait_now(run_id)
459
+ except Exception:
460
+ pass
461
+
462
+ def _maybe_trigger_scheduled_wait_now(self, run_id: str) -> None:
463
+ run = self.run_store.load(str(run_id))
464
+ if run is None:
465
+ return
466
+
467
+ root = run if self._is_scheduled_parent_run(run) else self._find_scheduled_root(run)
468
+ if root is None or not self._scheduled_parent_is_recurrent(root):
469
+ return
470
+
471
+ waiting = getattr(root, "waiting", None)
472
+ if getattr(root, "status", None) != RunStatus.WAITING or waiting is None:
473
+ return
474
+ if getattr(waiting, "reason", None) != WaitReason.UNTIL:
475
+ return
476
+ until = getattr(waiting, "until", None)
477
+ if not isinstance(until, str) or not until.strip():
478
+ return
479
+
480
+ now = utc_now_iso()
481
+ waiting.until = now # type: ignore[attr-defined]
482
+ root.updated_at = now
483
+ self.run_store.save(root)
484
+
312
485
  def _apply_emit_event(self, payload: Dict[str, Any], *, default_session_id: str, client_id: Optional[str]) -> None:
313
486
  name = payload.get("name")
314
487
  name2 = str(name or "").strip()
@@ -394,12 +567,67 @@ class GatewayRunner:
394
567
  logger.debug("GatewayRunner: cannot build runtime for %s: %s", run_id, e)
395
568
  return
396
569
 
397
- state = runtime.tick(workflow=wf, run_id=run_id, max_steps=int(self._cfg.tick_max_steps or 100))
570
+ try:
571
+ state = runtime.tick(workflow=wf, run_id=run_id, max_steps=int(self._cfg.tick_max_steps or 100))
572
+ except Exception as e:
573
+ # Never leave runs stuck in RUNNING due to an unhandled exception.
574
+ #
575
+ # Rationale: VisualFlow node planning can raise (e.g. missing optional deps),
576
+ # and effects can raise before the runtime has a chance to persist status.
577
+ # In gateway mode, a stuck RUNNING run can deadlock parents waiting on
578
+ # SUBWORKFLOW completion (KG ingest).
579
+ logger.exception("GatewayRunner: tick failed for %s", run_id)
580
+ err = f"{type(e).__name__}: {e}"
581
+ try:
582
+ latest = runtime.run_store.load(run_id)
583
+ if latest is None:
584
+ return
585
+ if getattr(latest, "status", None) == RunStatus.RUNNING:
586
+ latest.status = RunStatus.FAILED
587
+ latest.error = err
588
+ latest.updated_at = utc_now_iso()
589
+ runtime.run_store.save(latest)
590
+ try:
591
+ rec = StepRecord.start(
592
+ run=latest,
593
+ node_id=str(getattr(latest, "current_node", None) or "runtime"),
594
+ effect=None,
595
+ idempotency_key=f"system:tick_exception:{run_id}",
596
+ )
597
+ rec.finish_failure(err)
598
+ self.ledger_store.append(rec)
599
+ except Exception:
600
+ logger.exception("GatewayRunner: failed to append tick_exception record for %s", run_id)
601
+ state = latest
602
+ except Exception:
603
+ return
604
+
605
+ # Auto-compaction for scheduled workflows (best-effort).
606
+ try:
607
+ self._maybe_auto_compact(state)
608
+ except Exception:
609
+ pass
398
610
 
399
611
  # If this run completed, it may unblock a parent WAITING(SUBWORKFLOW).
400
- if getattr(state, "status", None) == RunStatus.COMPLETED:
612
+ if getattr(state, "status", None) in (RunStatus.COMPLETED, RunStatus.FAILED, RunStatus.CANCELLED):
401
613
  try:
402
- self._resume_subworkflow_parents(child_run_id=run_id, child_output=state.output or {})
614
+ child_out_raw: Any = getattr(state, "output", None)
615
+ child_out: Dict[str, Any]
616
+ if isinstance(child_out_raw, dict):
617
+ child_out = dict(child_out_raw)
618
+ else:
619
+ child_out = {"result": child_out_raw}
620
+
621
+ if getattr(state, "status", None) != RunStatus.COMPLETED:
622
+ # Preserve a stable shape so visual subflow nodes can proceed.
623
+ child_out.setdefault("success", False)
624
+ if getattr(state, "status", None) == RunStatus.CANCELLED:
625
+ child_out.setdefault("cancelled", True)
626
+ err = getattr(state, "error", None)
627
+ if isinstance(err, str) and err.strip():
628
+ child_out.setdefault("error", err.strip())
629
+
630
+ self._resume_subworkflow_parents(child_run_id=run_id, child_output=child_out)
403
631
  except Exception:
404
632
  pass
405
633
 
@@ -418,12 +646,459 @@ class GatewayRunner:
418
646
  if _is_pause_wait(wait, run_id=str(getattr(r, "run_id", "") or "")):
419
647
  continue
420
648
  runtime, wf = self._host.runtime_and_workflow_for_run(r.run_id)
649
+ payload: Dict[str, Any] = {"sub_run_id": child_run_id, "output": child_output}
650
+ try:
651
+ include_traces = bool(details.get("include_traces") or details.get("includeTraces"))
652
+ except Exception:
653
+ include_traces = False
654
+ if include_traces:
655
+ try:
656
+ payload["node_traces"] = runtime.get_node_traces(child_run_id) or {}
657
+ except Exception:
658
+ payload["node_traces"] = {}
421
659
  runtime.resume(
422
660
  workflow=wf,
423
661
  run_id=r.run_id,
424
662
  wait_key=getattr(wait, "wait_key", None),
425
- payload={"sub_run_id": child_run_id, "output": child_output},
663
+ payload=payload,
426
664
  max_steps=0,
427
665
  )
428
666
 
667
+ # ---------------------------------------------------------------------
668
+ # Scheduled workflow commands
669
+ # ---------------------------------------------------------------------
670
+
671
+ _INTERVAL_RE = re.compile(r"^\s*(\d+(?:\.\d+)?)\s*(ms|s|m|h|d)\s*$", re.IGNORECASE)
672
+ _UNIT_SECONDS: Dict[str, float] = {"ms": 0.001, "s": 1.0, "m": 60.0, "h": 3600.0, "d": 86400.0}
673
+
674
+ def _is_scheduled_parent_run(self, run: Any) -> bool:
675
+ wid = getattr(run, "workflow_id", None)
676
+ if isinstance(wid, str) and wid.startswith("scheduled:"):
677
+ return True
678
+ vars_obj = getattr(run, "vars", None)
679
+ meta = vars_obj.get("_meta") if isinstance(vars_obj, dict) else None
680
+ schedule = meta.get("schedule") if isinstance(meta, dict) else None
681
+ if isinstance(schedule, dict) and schedule.get("kind") == "scheduled_run":
682
+ return True
683
+ return False
684
+
685
+ def _scheduled_parent_is_recurrent(self, run: Any) -> bool:
686
+ vars_obj = getattr(run, "vars", None)
687
+ meta = vars_obj.get("_meta") if isinstance(vars_obj, dict) else None
688
+ schedule = meta.get("schedule") if isinstance(meta, dict) else None
689
+ if not isinstance(schedule, dict):
690
+ return False
691
+ interval = schedule.get("interval")
692
+ return isinstance(interval, str) and interval.strip() != ""
693
+
694
+ def _find_scheduled_root(self, run: Any) -> Optional[Any]:
695
+ """Return the scheduled parent run (root) for a run tree, if any."""
696
+ cur = run
697
+ seen: set[str] = set()
698
+ while True:
699
+ rid = getattr(cur, "run_id", None)
700
+ if isinstance(rid, str) and rid:
701
+ if rid in seen:
702
+ break
703
+ seen.add(rid)
704
+ parent_id = getattr(cur, "parent_run_id", None)
705
+ if not isinstance(parent_id, str) or not parent_id.strip():
706
+ return cur if self._is_scheduled_parent_run(cur) else None
707
+ parent = self.run_store.load(parent_id.strip())
708
+ if parent is None:
709
+ return None
710
+ cur = parent
711
+ return None
712
+
713
+ def _parse_interval_seconds(self, raw: str) -> Optional[float]:
714
+ s = str(raw or "").strip()
715
+ if not s:
716
+ return None
717
+ m = self._INTERVAL_RE.match(s)
718
+ if not m:
719
+ # ISO timestamps are accepted by on_schedule but are one-shot; treat as non-interval.
720
+ return None
721
+ amount = float(m.group(1))
722
+ unit = str(m.group(2)).lower()
723
+ return float(amount) * float(self._UNIT_SECONDS.get(unit, 1.0))
724
+
725
+ def _mutate_schedule_interval_in_visualflow(self, raw: Dict[str, Any], *, interval: str) -> bool:
726
+ nodes = raw.get("nodes")
727
+ if not isinstance(nodes, list):
728
+ return False
729
+ changed = False
730
+ for n in nodes:
731
+ if not isinstance(n, dict):
732
+ continue
733
+ if str(n.get("id") or "") != "wait_interval":
734
+ continue
735
+ data = n.get("data")
736
+ if not isinstance(data, dict):
737
+ data = {}
738
+ n["data"] = data
739
+ event_cfg = data.get("eventConfig")
740
+ if not isinstance(event_cfg, dict):
741
+ event_cfg = {}
742
+ data["eventConfig"] = event_cfg
743
+ event_cfg["schedule"] = str(interval)
744
+ changed = True
745
+ return changed
746
+
747
+ def _apply_update_schedule(
748
+ self, payload: Dict[str, Any], *, run_id: str, command_id: str, client_id: Optional[str]
749
+ ) -> None:
750
+ del client_id
751
+ requested_run_id = str(run_id or "").strip()
752
+ raw_interval = payload.get("interval")
753
+ if raw_interval is None:
754
+ raw_interval = payload.get("schedule")
755
+ interval = str(raw_interval or "").strip()
756
+ if not interval:
757
+ raise ValueError("update_schedule requires payload.interval")
758
+
759
+ # Validate interval is a relative duration (not an ISO timestamp).
760
+ interval_s = self._parse_interval_seconds(interval)
761
+ if interval_s is None or interval_s <= 0:
762
+ raise ValueError("update_schedule interval must be a relative duration like '20m', '1h', '0.5s'")
763
+
764
+ parent = self.run_store.load(run_id)
765
+ if parent is None:
766
+ raise KeyError(f"Run '{run_id}' not found")
767
+ if not self._is_scheduled_parent_run(parent):
768
+ root = self._find_scheduled_root(parent)
769
+ if root is None:
770
+ raise ValueError("update_schedule is only supported for scheduled runs (or runs inside a scheduled run tree)")
771
+ parent = root
772
+ run_id = str(getattr(parent, "run_id", run_id))
773
+ if not self._scheduled_parent_is_recurrent(parent):
774
+ raise ValueError("update_schedule requires a recurrent scheduled run (interval must be set)")
775
+
776
+ workflow_id = getattr(parent, "workflow_id", None)
777
+ if not isinstance(workflow_id, str) or not workflow_id.strip():
778
+ raise ValueError("Scheduled run missing workflow_id")
779
+
780
+ # Update durable schedule metadata on the run (for UI).
781
+ vars_obj = getattr(parent, "vars", None)
782
+ if not isinstance(vars_obj, dict):
783
+ vars_obj = {}
784
+ parent.vars = vars_obj # type: ignore[attr-defined]
785
+ meta = vars_obj.get("_meta")
786
+ if not isinstance(meta, dict):
787
+ meta = {}
788
+ vars_obj["_meta"] = meta
789
+ sched = meta.get("schedule")
790
+ if not isinstance(sched, dict):
791
+ sched = {}
792
+ meta["schedule"] = sched
793
+ sched["interval"] = interval
794
+ sched["updated_at"] = utc_now_iso()
795
+ parent.updated_at = utc_now_iso()
796
+ self.run_store.save(parent)
797
+
798
+ # Update the persisted dynamic wrapper flow + registry entry (wait_interval node).
799
+ load_raw = getattr(self._host, "load_dynamic_visualflow", None)
800
+ upsert = getattr(self._host, "upsert_dynamic_visualflow", None)
801
+ if not callable(load_raw) or not callable(upsert):
802
+ raise RuntimeError("Host does not support editing dynamic workflows (load_dynamic_visualflow/upsert_dynamic_visualflow)")
803
+ raw_flow = load_raw(workflow_id)
804
+ if raw_flow is None:
805
+ raise RuntimeError(f"Dynamic wrapper flow not found on disk for workflow_id={workflow_id}")
806
+ if not self._mutate_schedule_interval_in_visualflow(raw_flow, interval=interval):
807
+ raise RuntimeError("Failed to locate wait_interval node in scheduled wrapper flow")
808
+
809
+ # Re-register so subsequent ticks use the updated spec.
810
+ upsert(raw_flow, persist=True)
811
+
812
+ # Optional: if currently blocked on the interval wait, recompute the concrete until timestamp.
813
+ apply_immediately = payload.get("apply_immediately")
814
+ apply_immediately_flag = True if apply_immediately is None else bool(apply_immediately)
815
+ waiting = getattr(parent, "waiting", None)
816
+ if (
817
+ apply_immediately_flag
818
+ and getattr(parent, "status", None) == RunStatus.WAITING
819
+ and waiting is not None
820
+ and getattr(waiting, "reason", None) == WaitReason.UNTIL
821
+ and str(getattr(parent, "current_node", "") or "") == "wait_interval"
822
+ ):
823
+ now = datetime.datetime.now(datetime.timezone.utc)
824
+ until = (now + datetime.timedelta(seconds=float(interval_s))).isoformat()
825
+ waiting.until = until # type: ignore[attr-defined]
826
+ parent.updated_at = utc_now_iso()
827
+ self.run_store.save(parent)
828
+
829
+ # Best-effort observability marker.
830
+ try:
831
+ runtime_ns = vars_obj.get("_runtime")
832
+ if not isinstance(runtime_ns, dict):
833
+ runtime_ns = {}
834
+ vars_obj["_runtime"] = runtime_ns
835
+ runtime_ns["last_schedule_update"] = {
836
+ "command_id": command_id,
837
+ "interval": interval,
838
+ "updated_at": utc_now_iso(),
839
+ "requested_run_id": requested_run_id,
840
+ "scheduled_root_run_id": str(run_id),
841
+ }
842
+ self.run_store.save(parent)
843
+ except Exception:
844
+ pass
845
+
846
+ def _resolve_compaction_target_run_id(self, root_run_id: str) -> Optional[str]:
847
+ """Pick the best-effort run_id whose vars contain context.messages to compact."""
848
+
849
+ def _has_messages(r: Any) -> bool:
850
+ vars_obj = getattr(r, "vars", None)
851
+ ctx = vars_obj.get("context") if isinstance(vars_obj, dict) else None
852
+ msgs = ctx.get("messages") if isinstance(ctx, dict) else None
853
+ return isinstance(msgs, list) and len(msgs) > 0
854
+
855
+ cur = self.run_store.load(root_run_id)
856
+ if cur is None:
857
+ return None
858
+ if _has_messages(cur):
859
+ return str(getattr(cur, "run_id"))
860
+
861
+ # Prefer following active SUBWORKFLOW wait chains (deepest active run).
862
+ seen: set[str] = set()
863
+ while True:
864
+ rid = str(getattr(cur, "run_id", "") or "")
865
+ if not rid or rid in seen:
866
+ break
867
+ seen.add(rid)
868
+ waiting = getattr(cur, "waiting", None)
869
+ details = getattr(waiting, "details", None) if waiting is not None else None
870
+ sub_id = details.get("sub_run_id") if isinstance(details, dict) else None
871
+ if not isinstance(sub_id, str) or not sub_id.strip():
872
+ break
873
+ nxt = self.run_store.load(sub_id.strip())
874
+ if nxt is None:
875
+ break
876
+ cur = nxt
877
+ if _has_messages(cur):
878
+ return str(getattr(cur, "run_id"))
879
+
880
+ # Fallback: compact most recent descendant that has messages (best-effort).
881
+ list_children = getattr(self.run_store, "list_children", None)
882
+ if not callable(list_children):
883
+ return None
884
+ try:
885
+ children = list_children(parent_run_id=root_run_id) or []
886
+ except Exception:
887
+ children = []
888
+ if not children:
889
+ return None
890
+
891
+ def _ts(r: Any) -> str:
892
+ return str(getattr(r, "updated_at", None) or getattr(r, "created_at", None) or "")
893
+
894
+ for child in sorted(children, key=_ts, reverse=True):
895
+ cid = getattr(child, "run_id", None)
896
+ if not isinstance(cid, str) or not cid:
897
+ continue
898
+ target = self._resolve_compaction_target_run_id(cid)
899
+ if target:
900
+ return target
901
+ return None
902
+
903
+ def _apply_compact_memory(
904
+ self, payload: Dict[str, Any], *, run_id: str, command_id: str, client_id: Optional[str]
905
+ ) -> None:
906
+ del client_id
907
+ requested_run_id = str(run_id or "").strip()
908
+ parent = self.run_store.load(run_id)
909
+ if parent is None:
910
+ raise KeyError(f"Run '{run_id}' not found")
911
+ if not self._is_scheduled_parent_run(parent):
912
+ root = self._find_scheduled_root(parent)
913
+ if root is None:
914
+ raise ValueError("compact_memory is only supported for scheduled runs (or runs inside a scheduled run tree)")
915
+ parent = root
916
+ run_id = str(getattr(parent, "run_id", run_id))
917
+
918
+ target_run_id = payload.get("target_run_id") or payload.get("targetRunId")
919
+ if isinstance(target_run_id, str) and target_run_id.strip():
920
+ target_id = target_run_id.strip()
921
+ else:
922
+ target_id = self._resolve_compaction_target_run_id(run_id) or ""
923
+ if not target_id:
924
+ raise RuntimeError("No compactable run found (no context.messages in the scheduled run tree)")
925
+
926
+ target = self.run_store.load(target_id)
927
+ if target is None:
928
+ raise KeyError(f"Target run '{target_id}' not found")
929
+
930
+ # Build effect payload.
931
+ preserve_recent_raw = payload.get("preserve_recent")
932
+ if preserve_recent_raw is None:
933
+ preserve_recent_raw = payload.get("preserveRecent")
934
+ try:
935
+ preserve_recent = int(preserve_recent_raw) if preserve_recent_raw is not None else 6
936
+ except Exception:
937
+ preserve_recent = 6
938
+ if preserve_recent < 0:
939
+ preserve_recent = 0
940
+ compression_mode = str(payload.get("compression_mode") or payload.get("compressionMode") or "standard").strip().lower()
941
+ if compression_mode not in {"light", "standard", "heavy"}:
942
+ compression_mode = "standard"
943
+ focus = payload.get("focus")
944
+ focus_text = str(focus).strip() if isinstance(focus, str) and focus.strip() else None
945
+
946
+ eff_payload: Dict[str, Any] = {
947
+ "preserve_recent": preserve_recent,
948
+ "compression_mode": compression_mode,
949
+ }
950
+ if focus_text is not None:
951
+ eff_payload["focus"] = focus_text
952
+
953
+ # Execute the memory_compact effect as an out-of-band action on the target run.
954
+ runtime = Runtime(run_store=self.run_store, ledger_store=self.ledger_store, artifact_store=self.artifact_store)
955
+ # Enable subworkflow lookups in MEMORY_COMPACT (it spawns a small LLM sub-run).
956
+ try:
957
+ runtime.set_workflow_registry(getattr(self._host, "workflow_registry", None))
958
+ except Exception:
959
+ pass
960
+
961
+ eff = Effect(type=EffectType.MEMORY_COMPACT, payload=eff_payload, result_key="_temp.command.compact_memory")
962
+ idem = f"command:compact_memory:{command_id}"
963
+ outcome = runtime._execute_effect_with_retry( # type: ignore[attr-defined]
964
+ run=target,
965
+ node_id="compact_memory",
966
+ effect=eff,
967
+ idempotency_key=idem,
968
+ default_next_node=None,
969
+ )
970
+
971
+ # MEMORY_COMPACT mutates run.vars but only saves when targeting a different run. When compacting
972
+ # the target itself out-of-band, explicitly persist the updated checkpoint.
973
+ try:
974
+ target.updated_at = utc_now_iso()
975
+ self.run_store.save(target)
976
+ except Exception:
977
+ pass
978
+
979
+ if getattr(outcome, "status", None) == "failed":
980
+ raise RuntimeError(getattr(outcome, "error", None) or "compact_memory failed")
981
+
982
+ # Best-effort observability marker (on the scheduled parent/root run).
983
+ try:
984
+ vars_obj = getattr(parent, "vars", None)
985
+ if not isinstance(vars_obj, dict):
986
+ vars_obj = {}
987
+ parent.vars = vars_obj # type: ignore[attr-defined]
988
+ runtime_ns = vars_obj.get("_runtime")
989
+ if not isinstance(runtime_ns, dict):
990
+ runtime_ns = {}
991
+ vars_obj["_runtime"] = runtime_ns
992
+ runtime_ns["last_compact_memory"] = {
993
+ "command_id": command_id,
994
+ "updated_at": utc_now_iso(),
995
+ "requested_run_id": requested_run_id,
996
+ "scheduled_root_run_id": str(run_id),
997
+ "target_run_id": str(target_id),
998
+ }
999
+ parent.updated_at = utc_now_iso()
1000
+ self.run_store.save(parent)
1001
+ except Exception:
1002
+ pass
1003
+
1004
+ # ---------------------------------------------------------------------
1005
+ # Auto-compaction for scheduled workflows
1006
+ # ---------------------------------------------------------------------
1007
+
1008
+ def _maybe_auto_compact(self, run: Any) -> None:
1009
+ """Auto-compact scheduled workflows when nearing context limits (best-effort)."""
1010
+ root = self._find_scheduled_root(run)
1011
+ if root is None or not self._scheduled_parent_is_recurrent(root):
1012
+ return
1013
+
1014
+ vars_obj = getattr(run, "vars", None)
1015
+ if not isinstance(vars_obj, dict):
1016
+ return
1017
+ ctx = vars_obj.get("context")
1018
+ msgs = ctx.get("messages") if isinstance(ctx, dict) else None
1019
+ if not isinstance(msgs, list) or len(msgs) < 12:
1020
+ return
1021
+
1022
+ limits = vars_obj.get("_limits")
1023
+ if not isinstance(limits, dict):
1024
+ return
1025
+ used = limits.get("estimated_tokens_used")
1026
+ if used is None or isinstance(used, bool):
1027
+ return
1028
+ try:
1029
+ used_i = int(used)
1030
+ except Exception:
1031
+ return
1032
+ if used_i <= 0:
1033
+ return
1034
+
1035
+ budget = limits.get("max_input_tokens")
1036
+ if budget is None:
1037
+ budget = limits.get("max_tokens")
1038
+ try:
1039
+ budget_i = int(budget) if budget is not None else 0
1040
+ except Exception:
1041
+ budget_i = 0
1042
+ if budget_i <= 0:
1043
+ return
1044
+
1045
+ pct = used_i / float(budget_i)
1046
+ if pct < 0.9:
1047
+ return
429
1048
 
1049
+ runtime_ns = vars_obj.get("_runtime")
1050
+ if not isinstance(runtime_ns, dict):
1051
+ runtime_ns = {}
1052
+ vars_obj["_runtime"] = runtime_ns
1053
+ auto = runtime_ns.get("auto_compact")
1054
+ if not isinstance(auto, dict):
1055
+ auto = {}
1056
+ runtime_ns["auto_compact"] = auto
1057
+ last_used = auto.get("last_trigger_tokens_used")
1058
+ try:
1059
+ last_used_i = int(last_used) if last_used is not None else -1
1060
+ except Exception:
1061
+ last_used_i = -1
1062
+ if used_i <= last_used_i:
1063
+ return
1064
+
1065
+ # Record guard before running to avoid thrash if compaction fails.
1066
+ auto["last_trigger_tokens_used"] = used_i
1067
+ auto["last_triggered_at"] = utc_now_iso()
1068
+ try:
1069
+ self.run_store.save(run)
1070
+ except Exception:
1071
+ pass
1072
+
1073
+ runtime = Runtime(run_store=self.run_store, ledger_store=self.ledger_store, artifact_store=self.artifact_store)
1074
+ try:
1075
+ runtime.set_workflow_registry(getattr(self._host, "workflow_registry", None))
1076
+ except Exception:
1077
+ pass
1078
+
1079
+ eff = Effect(
1080
+ type=EffectType.MEMORY_COMPACT,
1081
+ payload={"preserve_recent": 6, "compression_mode": "standard", "focus": None},
1082
+ result_key="_temp.runtime.auto_compact",
1083
+ )
1084
+ idem = f"runtime:auto_compact:{utc_now_iso()}:{used_i}"
1085
+ outcome = runtime._execute_effect_with_retry( # type: ignore[attr-defined]
1086
+ run=run,
1087
+ node_id="auto_compact",
1088
+ effect=eff,
1089
+ idempotency_key=idem,
1090
+ default_next_node=None,
1091
+ )
1092
+ try:
1093
+ run.updated_at = utc_now_iso()
1094
+ self.run_store.save(run)
1095
+ except Exception:
1096
+ pass
1097
+ if getattr(outcome, "status", None) == "failed":
1098
+ # Best-effort: record the error for debuggability but do not fail ticking.
1099
+ try:
1100
+ auto["last_error"] = getattr(outcome, "error", None)
1101
+ auto["last_error_at"] = utc_now_iso()
1102
+ self.run_store.save(run)
1103
+ except Exception:
1104
+ pass