nthlayer-workers 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (175) hide show
  1. nthlayer_workers/__init__.py +5 -0
  2. nthlayer_workers/cli.py +234 -0
  3. nthlayer_workers/correlate/__init__.py +1 -0
  4. nthlayer_workers/correlate/cli.py +847 -0
  5. nthlayer_workers/correlate/config.py +111 -0
  6. nthlayer_workers/correlate/correlation/__init__.py +1 -0
  7. nthlayer_workers/correlate/correlation/changes.py +87 -0
  8. nthlayer_workers/correlate/correlation/dedup.py +62 -0
  9. nthlayer_workers/correlate/correlation/engine.py +244 -0
  10. nthlayer_workers/correlate/correlation/temporal.py +79 -0
  11. nthlayer_workers/correlate/correlation/topology.py +104 -0
  12. nthlayer_workers/correlate/ingestion/__init__.py +1 -0
  13. nthlayer_workers/correlate/ingestion/protocol.py +10 -0
  14. nthlayer_workers/correlate/ingestion/severity.py +18 -0
  15. nthlayer_workers/correlate/ingestion/webhook.py +197 -0
  16. nthlayer_workers/correlate/notifications.py +85 -0
  17. nthlayer_workers/correlate/prometheus.py +234 -0
  18. nthlayer_workers/correlate/reasoning.py +375 -0
  19. nthlayer_workers/correlate/session.py +189 -0
  20. nthlayer_workers/correlate/snapshot/__init__.py +1 -0
  21. nthlayer_workers/correlate/snapshot/generator.py +170 -0
  22. nthlayer_workers/correlate/snapshot/model.py +177 -0
  23. nthlayer_workers/correlate/snapshot/token.py +14 -0
  24. nthlayer_workers/correlate/state.py +88 -0
  25. nthlayer_workers/correlate/store/__init__.py +5 -0
  26. nthlayer_workers/correlate/store/protocol.py +48 -0
  27. nthlayer_workers/correlate/store/sqlite.py +443 -0
  28. nthlayer_workers/correlate/summary.py +180 -0
  29. nthlayer_workers/correlate/traces/__init__.py +1 -0
  30. nthlayer_workers/correlate/traces/protocol.py +120 -0
  31. nthlayer_workers/correlate/traces/tempo.py +667 -0
  32. nthlayer_workers/correlate/traces/topology.py +39 -0
  33. nthlayer_workers/correlate/types.py +77 -0
  34. nthlayer_workers/correlate/worker.py +630 -0
  35. nthlayer_workers/learn/__init__.py +5 -0
  36. nthlayer_workers/learn/__main__.py +5 -0
  37. nthlayer_workers/learn/cli.py +164 -0
  38. nthlayer_workers/learn/retrospective.py +381 -0
  39. nthlayer_workers/learn/trends.py +102 -0
  40. nthlayer_workers/learn/worker.py +366 -0
  41. nthlayer_workers/measure/__init__.py +3 -0
  42. nthlayer_workers/measure/__main__.py +5 -0
  43. nthlayer_workers/measure/_parsing.py +15 -0
  44. nthlayer_workers/measure/adapters/__init__.py +0 -0
  45. nthlayer_workers/measure/adapters/_util.py +24 -0
  46. nthlayer_workers/measure/adapters/devin.py +119 -0
  47. nthlayer_workers/measure/adapters/gastown.py +88 -0
  48. nthlayer_workers/measure/adapters/prometheus.py +277 -0
  49. nthlayer_workers/measure/adapters/protocol.py +20 -0
  50. nthlayer_workers/measure/adapters/webhook.py +161 -0
  51. nthlayer_workers/measure/api/__init__.py +0 -0
  52. nthlayer_workers/measure/api/normalise.py +50 -0
  53. nthlayer_workers/measure/api/queue.py +243 -0
  54. nthlayer_workers/measure/api/response.py +51 -0
  55. nthlayer_workers/measure/api/server.py +504 -0
  56. nthlayer_workers/measure/calibration/__init__.py +0 -0
  57. nthlayer_workers/measure/calibration/loop.py +62 -0
  58. nthlayer_workers/measure/calibration/slos.py +212 -0
  59. nthlayer_workers/measure/calibration/verdict_calibration.py +31 -0
  60. nthlayer_workers/measure/cli.py +753 -0
  61. nthlayer_workers/measure/config.py +191 -0
  62. nthlayer_workers/measure/detection/__init__.py +6 -0
  63. nthlayer_workers/measure/detection/detector.py +82 -0
  64. nthlayer_workers/measure/detection/protocol.py +29 -0
  65. nthlayer_workers/measure/governance/__init__.py +0 -0
  66. nthlayer_workers/measure/governance/engine.py +163 -0
  67. nthlayer_workers/measure/manifest.py +77 -0
  68. nthlayer_workers/measure/notifications.py +53 -0
  69. nthlayer_workers/measure/pipeline/__init__.py +0 -0
  70. nthlayer_workers/measure/pipeline/evaluator.py +155 -0
  71. nthlayer_workers/measure/pipeline/router.py +160 -0
  72. nthlayer_workers/measure/store/__init__.py +0 -0
  73. nthlayer_workers/measure/store/protocol.py +38 -0
  74. nthlayer_workers/measure/store/sqlite.py +276 -0
  75. nthlayer_workers/measure/telemetry.py +116 -0
  76. nthlayer_workers/measure/tiering/__init__.py +0 -0
  77. nthlayer_workers/measure/tiering/classifier.py +58 -0
  78. nthlayer_workers/measure/tiering/promotion.py +118 -0
  79. nthlayer_workers/measure/trends/__init__.py +0 -0
  80. nthlayer_workers/measure/trends/tracker.py +72 -0
  81. nthlayer_workers/measure/types.py +75 -0
  82. nthlayer_workers/measure/worker.py +439 -0
  83. nthlayer_workers/observe/__init__.py +25 -0
  84. nthlayer_workers/observe/__main__.py +5 -0
  85. nthlayer_workers/observe/api/__init__.py +1 -0
  86. nthlayer_workers/observe/assessment.py +95 -0
  87. nthlayer_workers/observe/cli.py +737 -0
  88. nthlayer_workers/observe/config.py +11 -0
  89. nthlayer_workers/observe/db/__init__.py +1 -0
  90. nthlayer_workers/observe/decision_records.py +220 -0
  91. nthlayer_workers/observe/dependencies/__init__.py +18 -0
  92. nthlayer_workers/observe/dependencies/discovery.py +294 -0
  93. nthlayer_workers/observe/dependencies/providers/__init__.py +48 -0
  94. nthlayer_workers/observe/dependencies/providers/backstage.py +467 -0
  95. nthlayer_workers/observe/dependencies/providers/base.py +76 -0
  96. nthlayer_workers/observe/dependencies/providers/consul.py +518 -0
  97. nthlayer_workers/observe/dependencies/providers/etcd.py +360 -0
  98. nthlayer_workers/observe/dependencies/providers/kubernetes.py +682 -0
  99. nthlayer_workers/observe/dependencies/providers/prometheus.py +368 -0
  100. nthlayer_workers/observe/dependencies/providers/zookeeper.py +399 -0
  101. nthlayer_workers/observe/deployments/__init__.py +1 -0
  102. nthlayer_workers/observe/discovery/__init__.py +14 -0
  103. nthlayer_workers/observe/discovery/classifier.py +66 -0
  104. nthlayer_workers/observe/discovery/client.py +189 -0
  105. nthlayer_workers/observe/discovery/models.py +53 -0
  106. nthlayer_workers/observe/drift/__init__.py +26 -0
  107. nthlayer_workers/observe/drift/analyzer.py +383 -0
  108. nthlayer_workers/observe/drift/models.py +174 -0
  109. nthlayer_workers/observe/drift/patterns.py +88 -0
  110. nthlayer_workers/observe/explanation.py +118 -0
  111. nthlayer_workers/observe/gate/__init__.py +39 -0
  112. nthlayer_workers/observe/gate/conditions.py +92 -0
  113. nthlayer_workers/observe/gate/correlator.py +154 -0
  114. nthlayer_workers/observe/gate/evaluator.py +192 -0
  115. nthlayer_workers/observe/gate/policies.py +226 -0
  116. nthlayer_workers/observe/gate_adapter.py +40 -0
  117. nthlayer_workers/observe/incident.py +36 -0
  118. nthlayer_workers/observe/portfolio/__init__.py +17 -0
  119. nthlayer_workers/observe/portfolio/aggregator.py +168 -0
  120. nthlayer_workers/observe/portfolio/scorer.py +13 -0
  121. nthlayer_workers/observe/slo/__init__.py +19 -0
  122. nthlayer_workers/observe/slo/collector.py +235 -0
  123. nthlayer_workers/observe/slo/spec_loader.py +40 -0
  124. nthlayer_workers/observe/sqlite_store.py +152 -0
  125. nthlayer_workers/observe/store.py +92 -0
  126. nthlayer_workers/observe/verification/__init__.py +22 -0
  127. nthlayer_workers/observe/verification/exporter_guidance.py +146 -0
  128. nthlayer_workers/observe/verification/extractor.py +127 -0
  129. nthlayer_workers/observe/verification/models.py +101 -0
  130. nthlayer_workers/observe/verification/verifier.py +111 -0
  131. nthlayer_workers/observe/worker.py +332 -0
  132. nthlayer_workers/respond/__init__.py +2 -0
  133. nthlayer_workers/respond/__main__.py +4 -0
  134. nthlayer_workers/respond/agents/__init__.py +0 -0
  135. nthlayer_workers/respond/agents/base.py +556 -0
  136. nthlayer_workers/respond/agents/communication.py +115 -0
  137. nthlayer_workers/respond/agents/investigation.py +124 -0
  138. nthlayer_workers/respond/agents/remediation.py +219 -0
  139. nthlayer_workers/respond/agents/triage.py +132 -0
  140. nthlayer_workers/respond/cli.py +772 -0
  141. nthlayer_workers/respond/config.py +135 -0
  142. nthlayer_workers/respond/context_store.py +256 -0
  143. nthlayer_workers/respond/coordinator.py +487 -0
  144. nthlayer_workers/respond/metrics.py +104 -0
  145. nthlayer_workers/respond/notification_backends/__init__.py +1 -0
  146. nthlayer_workers/respond/notification_backends/ntfy_backend.py +158 -0
  147. nthlayer_workers/respond/notification_backends/protocol.py +59 -0
  148. nthlayer_workers/respond/notification_backends/slack_backend.py +203 -0
  149. nthlayer_workers/respond/notification_backends/stdout_backend.py +56 -0
  150. nthlayer_workers/respond/notifications.py +247 -0
  151. nthlayer_workers/respond/oncall/__init__.py +1 -0
  152. nthlayer_workers/respond/oncall/escalation.py +103 -0
  153. nthlayer_workers/respond/oncall/runner.py +193 -0
  154. nthlayer_workers/respond/oncall/schedule.py +243 -0
  155. nthlayer_workers/respond/safe_actions/__init__.py +0 -0
  156. nthlayer_workers/respond/safe_actions/actions.py +139 -0
  157. nthlayer_workers/respond/safe_actions/registry.py +171 -0
  158. nthlayer_workers/respond/safe_actions/webhook.py +194 -0
  159. nthlayer_workers/respond/server.py +357 -0
  160. nthlayer_workers/respond/sre/__init__.py +1 -0
  161. nthlayer_workers/respond/sre/brief.py +175 -0
  162. nthlayer_workers/respond/sre/delegation.py +101 -0
  163. nthlayer_workers/respond/sre/post_incident.py +146 -0
  164. nthlayer_workers/respond/sre/shift_report.py +129 -0
  165. nthlayer_workers/respond/sre/suppression.py +91 -0
  166. nthlayer_workers/respond/types.py +109 -0
  167. nthlayer_workers/respond/verdict_submission.py +56 -0
  168. nthlayer_workers/respond/worker.py +533 -0
  169. nthlayer_workers/respond/worker_helpers.py +140 -0
  170. nthlayer_workers/runner.py +198 -0
  171. nthlayer_workers-1.0.0.dist-info/METADATA +19 -0
  172. nthlayer_workers-1.0.0.dist-info/RECORD +175 -0
  173. nthlayer_workers-1.0.0.dist-info/WHEEL +5 -0
  174. nthlayer_workers-1.0.0.dist-info/entry_points.txt +2 -0
  175. nthlayer_workers-1.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,135 @@
1
+ # src/nthlayer_respond/config.py
2
+ """nthlayer-respond configuration."""
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ from dataclasses import dataclass
7
+
8
+ import structlog
9
+ import yaml
10
+
11
+ logger = structlog.get_logger()
12
+
13
+
14
@dataclass
class RespondConfig:
    """Settings for nthlayer-respond.

    Every field carries a default, so ``RespondConfig()`` is always
    constructible. The four worker-mode timing fields are validated in
    ``__post_init__`` and must be non-negative.
    """

    # --- Coordinator -----------------------------------------------------
    poll_interval_seconds: int = 30
    escalation_threshold: float = 0.3

    # --- Agents ----------------------------------------------------------
    # NTHLAYER_MODEL takes precedence over the hardcoded model name. The
    # lookup happens once, when this class body is executed.
    model: str = os.environ.get("NTHLAYER_MODEL", "claude-sonnet-4-20250514")
    max_tokens: int = 4096
    triage_timeout: int = 15
    investigation_timeout: int = 60
    communication_timeout: int = 20
    remediation_timeout: int = 30
    root_cause_threshold: float = 0.7

    # --- Safe actions ----------------------------------------------------
    cooldown_seconds: int = 300
    arbiter_url: str = "http://localhost:8080"

    # --- Stores ----------------------------------------------------------
    verdict_store_path: str = "verdicts.db"
    context_store_path: str = "respond-incidents.db"

    # --- Topology --------------------------------------------------------
    manifests_dir: str | None = None

    # --- Server ----------------------------------------------------------
    server_host: str = "0.0.0.0"
    server_port: int = 8090

    # --- Approval --------------------------------------------------------
    approval_timeout_seconds: int = 900

    # --- Slack (interactive buttons) -------------------------------------
    slack_signing_secret: str = ""
    slack_bot_token: str = ""

    # --- Notification backends (on-call escalation) ----------------------
    ntfy_server_url: str = ""
    ntfy_auth_token: str = ""
    twilio_account_sid: str = ""
    twilio_auth_token: str = ""
    twilio_from_number: str = ""
    pagerduty_routing_key: str = ""
    webhook_base_url: str = "http://localhost:8090"

    # --- Worker mode (P3-E.1) --------------------------------------------
    # Set by the nthlayer-workers CLI from nthlayer.yaml, not from
    # respond.yaml. The defaults keep legacy CLI construction unaffected.
    cycle_interval_seconds: float = 30.0
    fallback_threshold_seconds: float = 60.0
    terminal_retention_seconds: float = 86400.0
    step_timeout_seconds: float = 90.0

    def __post_init__(self) -> None:
        """Reject negative worker-mode timing values.

        Raises:
            ValueError: for the first timing field (in declaration order)
                whose value is below zero.
        """
        # A negative value silently inverts cutoff/threshold semantics: a
        # negative fallback threshold, for instance, puts the cutoff in the
        # future and matches every breach, including ones that may still
        # receive a snapshot. Zero stays legal everywhere:
        #   * threshold == 0    -> fallback fires immediately (integration tests)
        #   * retention == 0    -> terminal incidents pruned on the next cycle
        #   * cycle == 0        -> busy loop (degenerate, not catastrophic)
        #   * step_timeout == 0 -> falls through coordinator._step_timeout()'s
        #                          `> 0` guard, disabling the per-step timeout
        # Checking here makes the dangerous (negative) case fail loudly at
        # construction time.
        timing_fields = (
            "cycle_interval_seconds",
            "fallback_threshold_seconds",
            "terminal_retention_seconds",
            "step_timeout_seconds",
        )
        offender = next((n for n in timing_fields if getattr(self, n) < 0), None)
        if offender is not None:
            bad_value = getattr(self, offender)
            raise ValueError(f"RespondConfig.{offender} must be >= 0, got {bad_value!r}")
79
+
80
+
81
def _section(parent: dict, key: str) -> dict:
    """Return ``parent[key]`` as a mapping; a missing or null entry yields ``{}``.

    YAML such as ``coordinator:`` (key present, no value) parses to ``None``,
    which would otherwise break every chained ``.get()`` call.

    Raises:
        ValueError: if the entry exists but is not a mapping.
    """
    value = parent.get(key)
    if value is None:
        return {}
    if not isinstance(value, dict):
        raise ValueError(f"config section {key!r} must be a mapping, got {type(value).__name__}")
    return value


def _is_number(value: object) -> bool:
    """Return True for int/float values, excluding bool (an int subclass)."""
    return isinstance(value, (int, float)) and not isinstance(value, bool)


def load_config(path: str) -> RespondConfig:
    """Load config from a YAML file; return defaults if the file is missing.

    Fallback values come from ``RespondConfig()`` so the loader and the
    dataclass cannot drift apart. In particular, a file that omits
    ``agents.model`` still honours the NTHLAYER_MODEL-aware default.

    The worker-mode timing fields are not read here; they keep their
    dataclass defaults.

    Args:
        path: Location of the YAML configuration file.

    Returns:
        A populated ``RespondConfig``.

    Raises:
        ValueError: if the document or a section is not a mapping, or if
            ``poll_interval_seconds`` / ``escalation_threshold`` is invalid.
    """
    try:
        with open(path, encoding="utf-8") as f:
            data = yaml.safe_load(f) or {}
    except FileNotFoundError:
        logger.info("config_not_found", path=path)
        return RespondConfig()

    if not isinstance(data, dict):
        raise ValueError(f"config root must be a mapping, got {type(data).__name__}")

    defaults = RespondConfig()

    coord = _section(data, "coordinator")
    agents = _section(data, "agents")
    safe = _section(data, "safe_actions")
    verdict = _section(_section(data, "verdict"), "store")
    ctx_store = _section(data, "context_store")
    topo = _section(data, "topology")
    server = _section(data, "server")
    approval = _section(data, "approval")
    slack = _section(data, "slack")
    notifications = _section(data, "notifications")

    triage = _section(agents, "triage")
    investigation = _section(agents, "investigation")
    communication = _section(agents, "communication")
    remediation = _section(agents, "remediation")
    ntfy = _section(notifications, "ntfy")
    twilio = _section(notifications, "twilio")
    pagerduty = _section(notifications, "pagerduty")
    webhook = _section(notifications, "webhook")

    poll_interval = coord.get("poll_interval_seconds", defaults.poll_interval_seconds)
    escalation_thresh = coord.get("escalation_threshold", defaults.escalation_threshold)
    if not _is_number(poll_interval) or poll_interval <= 0:
        raise ValueError(f"poll_interval_seconds must be a positive number, got {poll_interval!r}")
    if not _is_number(escalation_thresh) or not (0.0 <= escalation_thresh <= 1.0):
        raise ValueError(f"escalation_threshold must be between 0.0 and 1.0, got {escalation_thresh!r}")

    return RespondConfig(
        poll_interval_seconds=int(poll_interval),
        escalation_threshold=float(escalation_thresh),
        # An explicit YAML value wins; otherwise use the dataclass default,
        # which already accounts for NTHLAYER_MODEL.
        model=agents.get("model", defaults.model),
        max_tokens=agents.get("max_tokens", defaults.max_tokens),
        triage_timeout=triage.get("timeout", defaults.triage_timeout),
        investigation_timeout=investigation.get("timeout", defaults.investigation_timeout),
        communication_timeout=communication.get("timeout", defaults.communication_timeout),
        remediation_timeout=remediation.get("timeout", defaults.remediation_timeout),
        root_cause_threshold=investigation.get("root_cause_threshold", defaults.root_cause_threshold),
        cooldown_seconds=safe.get("cooldown_seconds", defaults.cooldown_seconds),
        arbiter_url=safe.get("arbiter_url", defaults.arbiter_url),
        verdict_store_path=verdict.get("path", defaults.verdict_store_path),
        context_store_path=ctx_store.get("path", defaults.context_store_path),
        manifests_dir=topo.get("manifests_dir"),
        server_host=server.get("host", defaults.server_host),
        server_port=int(server.get("port", defaults.server_port)),
        approval_timeout_seconds=int(approval.get("timeout_seconds", defaults.approval_timeout_seconds)),
        slack_signing_secret=slack.get("signing_secret", defaults.slack_signing_secret),
        slack_bot_token=slack.get("bot_token", defaults.slack_bot_token),
        ntfy_server_url=ntfy.get("server_url", defaults.ntfy_server_url),
        ntfy_auth_token=ntfy.get("auth_token", defaults.ntfy_auth_token),
        twilio_account_sid=twilio.get("account_sid", defaults.twilio_account_sid),
        twilio_auth_token=twilio.get("auth_token", defaults.twilio_auth_token),
        twilio_from_number=twilio.get("from_number", defaults.twilio_from_number),
        pagerduty_routing_key=pagerduty.get("routing_key", defaults.pagerduty_routing_key),
        webhook_base_url=webhook.get("public_url", defaults.webhook_base_url),
    )
@@ -0,0 +1,256 @@
1
+ """SQLite-backed context store for incident crash recovery."""
2
+ from __future__ import annotations
3
+
4
+ import dataclasses
5
+ import json
6
+ import sqlite3
7
+ from typing import Protocol
8
+
9
+ from nthlayer_workers.respond.types import (
10
+ CommunicationResult,
11
+ CommunicationUpdate,
12
+ Hypothesis,
13
+ IncidentContext,
14
+ IncidentState,
15
+ InvestigationResult,
16
+ RemediationResult,
17
+ TERMINAL_STATES,
18
+ TriageResult,
19
+ )
20
+
21
+
22
class ContextStore(Protocol):
    """Structural interface for incident-context persistence backends."""

    def save(self, context: IncidentContext) -> None:
        """Persist *context*."""

    def load(self, incident_id: str) -> IncidentContext | None:
        """Return the context stored under *incident_id*, or None."""

    def list_active(self) -> list[str]:
        """Return incident ids."""

    def list_all(self, limit: int = 50) -> list[IncidentContext]:
        """Return up to *limit* stored contexts."""

    def get_metadata(self, key: str) -> str | None:
        """Return the metadata value for *key*, or None."""

    def set_metadata(self, key: str, value: str) -> None:
        """Store *value* under the metadata *key*."""

    def close(self) -> None:
        """Release any resources held by the store."""
30
+
31
+
32
def incident_context_to_dict(ctx: IncidentContext) -> dict:
    """Flatten an IncidentContext into a plain dict for JSON encoding.

    The conversion is delegated to ``dataclasses.asdict()``, which walks
    nested dataclasses, lists and dicts recursively.

    ``verdict_chain`` is a dataclass field on IncidentContext, so it is part
    of the result. That matters for lineage continuity across worker
    restarts: after a restore, ``_emit_verdict`` reads
    ``context.verdict_chain[-1]`` to chain new verdicts to their
    predecessors, and an empty or missing chain would create verdicts with
    parent=None. See ``test_state_roundtrip_preserves_all_fields``.
    """
    # NOTE(review): asdict() leaves enum members untouched; JSON encoding of
    # ``state`` presumably relies on IncidentState being a str-based enum —
    # confirm against nthlayer_workers.respond.types.
    serialised = dataclasses.asdict(ctx)
    return serialised
46
+
47
+
48
# Keys that must be present in a serialised incident for it to be restorable.
_REQUIRED_INCIDENT_FIELDS = ("id", "state", "created_at", "updated_at", "trigger_source")


def incident_context_from_dict(data: dict) -> IncidentContext:
    """Reconstruct a fully typed IncidentContext from its serialised dict form.

    Required fields (id, state, created_at, updated_at, trigger_source) MUST
    be present. Corrupt or malformed input raises ValueError so callers (for
    example the worker's restore_state) can skip the record instead of
    building an incident from made-up defaults.

    Args:
        data: The dict produced by ``incident_context_to_dict`` (typically
            after a JSON round-trip).

    Returns:
        The rebuilt IncidentContext, with nested results restored to their
        dataclass types.

    Raises:
        ValueError: if *data* is not a dict, a required key is missing, a
            nested section is malformed, or ``state`` is not a valid
            IncidentState value.
    """
    if not isinstance(data, dict):
        raise ValueError(f"incident_context_from_dict: expected dict, got {type(data).__name__}")
    missing = [f for f in _REQUIRED_INCIDENT_FIELDS if f not in data]
    if missing:
        raise ValueError(f"incident_context_from_dict: missing required fields {missing}")

    # Nested dataclasses are rebuilt by hand; dict unpacking alone would
    # leave them as plain dicts. A section with the wrong shape (not a dict,
    # unexpected or missing keys) surfaces as TypeError/AttributeError, which
    # is translated to the documented ValueError.
    try:
        triage: TriageResult | None = None
        if data.get("triage") is not None:
            triage = TriageResult(**data["triage"])

        investigation: InvestigationResult | None = None
        if data.get("investigation") is not None:
            inv = data["investigation"]
            hypotheses = [Hypothesis(**h) for h in inv.get("hypotheses", [])]
            investigation = InvestigationResult(
                hypotheses=hypotheses,
                root_cause=inv.get("root_cause"),
                root_cause_confidence=inv.get("root_cause_confidence", 0.0),
                reasoning=inv.get("reasoning", ""),
                confidence=inv.get("confidence"),
            )

        communication: CommunicationResult | None = None
        if data.get("communication") is not None:
            comm = data["communication"]
            updates_sent = [CommunicationUpdate(**u) for u in comm.get("updates_sent", [])]
            communication = CommunicationResult(
                updates_sent=updates_sent,
                reasoning=comm.get("reasoning", ""),
                confidence=comm.get("confidence"),
            )

        remediation: RemediationResult | None = None
        if data.get("remediation") is not None:
            remediation = RemediationResult(**data["remediation"])
    except (TypeError, AttributeError) as err:
        raise ValueError(f"incident_context_from_dict: malformed nested payload: {err}") from err

    # The ``state`` key is guaranteed present by the check above. A null or
    # empty value falls back to TRIGGERED (a real IncidentState value); any
    # other unknown value makes IncidentState() raise ValueError.
    raw_state = data["state"] or IncidentState.TRIGGERED.value
    return IncidentContext(
        id=data["id"],
        state=IncidentState(raw_state),
        created_at=data["created_at"],
        updated_at=data["updated_at"],
        trigger_source=data["trigger_source"],
        trigger_verdict_ids=data.get("trigger_verdict_ids", []),
        topology=data.get("topology", {}),
        triage=triage,
        investigation=investigation,
        communication=communication,
        remediation=remediation,
        verdict_chain=data.get("verdict_chain", []),
        last_completed_step_index=data.get("last_completed_step_index"),
        error=data.get("error"),
        metadata=data.get("metadata", {}),
    )
120
+
121
+
122
# Schema DDL. Every statement uses IF NOT EXISTS, so running them against an
# already-initialised database is a no-op.

# One row per incident; ``data`` holds the JSON-serialised context, while
# ``state`` / ``error`` / timestamps are duplicated into columns for querying.
_CREATE_INCIDENTS = """
CREATE TABLE IF NOT EXISTS incidents (
    id TEXT PRIMARY KEY,
    state TEXT NOT NULL,
    error TEXT,
    data TEXT NOT NULL,
    created_at TEXT NOT NULL,
    updated_at TEXT NOT NULL
)
"""

# Free-form key/value table.
_CREATE_METADATA = """
CREATE TABLE IF NOT EXISTS metadata (
    key TEXT PRIMARY KEY,
    value TEXT NOT NULL
)
"""

# Indexes on the two columns the store filters and sorts by.
_CREATE_IDX_STATE = "CREATE INDEX IF NOT EXISTS idx_incidents_state ON incidents (state)"
_CREATE_IDX_UPDATED = "CREATE INDEX IF NOT EXISTS idx_incidents_updated_at ON incidents (updated_at DESC)"
142
+
143
+
144
class SQLiteContextStore:
    """SQLite-backed store for IncidentContext objects.

    Uses WAL journal mode and a 5 000 ms busy timeout so concurrent readers
    (e.g. CLI status queries) do not block the coordinator.
    """

    def __init__(self, db_path: str) -> None:
        """Open (or create) the database at *db_path* and ensure the schema.

        Raises:
            sqlite3.Error: if the connection cannot be configured or the
                schema cannot be created; the connection is closed first.
        """
        self._conn = sqlite3.connect(db_path, check_same_thread=False)
        try:
            for statement in (
                "PRAGMA journal_mode=WAL",
                "PRAGMA busy_timeout=5000",
                _CREATE_INCIDENTS,
                _CREATE_METADATA,
                _CREATE_IDX_STATE,
                _CREATE_IDX_UPDATED,
            ):
                self._conn.execute(statement)
            self._conn.commit()
        except sqlite3.Error:
            # Don't leak the handle when setup fails part-way through.
            self._conn.close()
            raise

    # ------------------------------------------------------------------
    # Core CRUD
    # ------------------------------------------------------------------

    def save(self, context: IncidentContext) -> None:
        """Persist context, overwriting any previous record with the same id.

        On conflict the original ``created_at`` column is kept; state, error,
        data and ``updated_at`` are replaced.
        """
        data_json = json.dumps(incident_context_to_dict(context))
        self._conn.execute(
            """
            INSERT INTO incidents (id, state, error, data, created_at, updated_at)
            VALUES (?, ?, ?, ?, ?, ?)
            ON CONFLICT(id) DO UPDATE SET
                state = excluded.state,
                error = excluded.error,
                data = excluded.data,
                updated_at = excluded.updated_at
            """,
            (
                context.id,
                context.state.value,
                context.error,
                data_json,
                context.created_at,
                context.updated_at,
            ),
        )
        self._conn.commit()

    def load(self, incident_id: str) -> IncidentContext | None:
        """Return a fully typed IncidentContext, or None if not found.

        Unlike ``list_all``, a corrupt row is not skipped here: decoding
        errors propagate to the caller.
        """
        row = self._conn.execute(
            "SELECT data FROM incidents WHERE id = ?",
            (incident_id,),
        ).fetchone()
        if row is None:
            return None
        return incident_context_from_dict(json.loads(row[0]))

    # ------------------------------------------------------------------
    # Queries
    # ------------------------------------------------------------------

    def list_active(self) -> list[str]:
        """Return ids of all incidents not in a terminal state."""
        terminal_values = tuple(s.value for s in TERMINAL_STATES)
        # Only "?" markers are interpolated into the SQL text; the state
        # values themselves are bound as parameters.
        placeholders = ",".join("?" * len(terminal_values))
        rows = self._conn.execute(
            f"SELECT id FROM incidents WHERE state NOT IN ({placeholders})",
            terminal_values,
        ).fetchall()
        return [row[0] for row in rows]

    def list_all(self, limit: int = 50) -> list[IncidentContext]:
        """Return up to *limit* incidents ordered by most recently updated.

        Rows that cannot be decoded or rebuilt are skipped rather than
        aborting the whole listing.
        """
        rows = self._conn.execute(
            "SELECT data FROM incidents ORDER BY updated_at DESC LIMIT ?",
            (limit,),
        ).fetchall()
        results = []
        for (raw,) in rows:
            try:
                payload = json.loads(raw)
                results.append(incident_context_from_dict(payload))
            except (KeyError, TypeError, AttributeError, ValueError):
                # ValueError also covers json.JSONDecodeError. TypeError and
                # AttributeError cover rows whose nested sections no longer
                # match the dataclass constructors.
                continue  # skip corrupted rows
        return results

    # ------------------------------------------------------------------
    # Metadata key-value store
    # ------------------------------------------------------------------

    def get_metadata(self, key: str) -> str | None:
        """Return a stored metadata value, or None if the key is absent."""
        row = self._conn.execute(
            "SELECT value FROM metadata WHERE key = ?",
            (key,),
        ).fetchone()
        return row[0] if row is not None else None

    def set_metadata(self, key: str, value: str) -> None:
        """Insert or replace a metadata key-value pair."""
        self._conn.execute(
            """
            INSERT INTO metadata (key, value) VALUES (?, ?)
            ON CONFLICT(key) DO UPDATE SET value = excluded.value
            """,
            (key, value),
        )
        self._conn.commit()

    # ------------------------------------------------------------------
    # Lifecycle
    # ------------------------------------------------------------------

    def close(self) -> None:
        """Close the underlying database connection."""
        self._conn.close()