PyPI - nthlayer-workers - Versions diffs - 1.0.0__py3-none-any.whl - Mend

nthlayer-workers 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (175) hide show

nthlayer_workers/__init__.py +5 -0
nthlayer_workers/cli.py +234 -0
nthlayer_workers/correlate/__init__.py +1 -0
nthlayer_workers/correlate/cli.py +847 -0
nthlayer_workers/correlate/config.py +111 -0
nthlayer_workers/correlate/correlation/__init__.py +1 -0
nthlayer_workers/correlate/correlation/changes.py +87 -0
nthlayer_workers/correlate/correlation/dedup.py +62 -0
nthlayer_workers/correlate/correlation/engine.py +244 -0
nthlayer_workers/correlate/correlation/temporal.py +79 -0
nthlayer_workers/correlate/correlation/topology.py +104 -0
nthlayer_workers/correlate/ingestion/__init__.py +1 -0
nthlayer_workers/correlate/ingestion/protocol.py +10 -0
nthlayer_workers/correlate/ingestion/severity.py +18 -0
nthlayer_workers/correlate/ingestion/webhook.py +197 -0
nthlayer_workers/correlate/notifications.py +85 -0
nthlayer_workers/correlate/prometheus.py +234 -0
nthlayer_workers/correlate/reasoning.py +375 -0
nthlayer_workers/correlate/session.py +189 -0
nthlayer_workers/correlate/snapshot/__init__.py +1 -0
nthlayer_workers/correlate/snapshot/generator.py +170 -0
nthlayer_workers/correlate/snapshot/model.py +177 -0
nthlayer_workers/correlate/snapshot/token.py +14 -0
nthlayer_workers/correlate/state.py +88 -0
nthlayer_workers/correlate/store/__init__.py +5 -0
nthlayer_workers/correlate/store/protocol.py +48 -0
nthlayer_workers/correlate/store/sqlite.py +443 -0
nthlayer_workers/correlate/summary.py +180 -0
nthlayer_workers/correlate/traces/__init__.py +1 -0
nthlayer_workers/correlate/traces/protocol.py +120 -0
nthlayer_workers/correlate/traces/tempo.py +667 -0
nthlayer_workers/correlate/traces/topology.py +39 -0
nthlayer_workers/correlate/types.py +77 -0
nthlayer_workers/correlate/worker.py +630 -0
nthlayer_workers/learn/__init__.py +5 -0
nthlayer_workers/learn/__main__.py +5 -0
nthlayer_workers/learn/cli.py +164 -0
nthlayer_workers/learn/retrospective.py +381 -0
nthlayer_workers/learn/trends.py +102 -0
nthlayer_workers/learn/worker.py +366 -0
nthlayer_workers/measure/__init__.py +3 -0
nthlayer_workers/measure/__main__.py +5 -0
nthlayer_workers/measure/_parsing.py +15 -0
nthlayer_workers/measure/adapters/__init__.py +0 -0
nthlayer_workers/measure/adapters/_util.py +24 -0
nthlayer_workers/measure/adapters/devin.py +119 -0
nthlayer_workers/measure/adapters/gastown.py +88 -0
nthlayer_workers/measure/adapters/prometheus.py +277 -0
nthlayer_workers/measure/adapters/protocol.py +20 -0
nthlayer_workers/measure/adapters/webhook.py +161 -0
nthlayer_workers/measure/api/__init__.py +0 -0
nthlayer_workers/measure/api/normalise.py +50 -0
nthlayer_workers/measure/api/queue.py +243 -0
nthlayer_workers/measure/api/response.py +51 -0
nthlayer_workers/measure/api/server.py +504 -0
nthlayer_workers/measure/calibration/__init__.py +0 -0
nthlayer_workers/measure/calibration/loop.py +62 -0
nthlayer_workers/measure/calibration/slos.py +212 -0
nthlayer_workers/measure/calibration/verdict_calibration.py +31 -0
nthlayer_workers/measure/cli.py +753 -0
nthlayer_workers/measure/config.py +191 -0
nthlayer_workers/measure/detection/__init__.py +6 -0
nthlayer_workers/measure/detection/detector.py +82 -0
nthlayer_workers/measure/detection/protocol.py +29 -0
nthlayer_workers/measure/governance/__init__.py +0 -0
nthlayer_workers/measure/governance/engine.py +163 -0
nthlayer_workers/measure/manifest.py +77 -0
nthlayer_workers/measure/notifications.py +53 -0
nthlayer_workers/measure/pipeline/__init__.py +0 -0
nthlayer_workers/measure/pipeline/evaluator.py +155 -0
nthlayer_workers/measure/pipeline/router.py +160 -0
nthlayer_workers/measure/store/__init__.py +0 -0
nthlayer_workers/measure/store/protocol.py +38 -0
nthlayer_workers/measure/store/sqlite.py +276 -0
nthlayer_workers/measure/telemetry.py +116 -0
nthlayer_workers/measure/tiering/__init__.py +0 -0
nthlayer_workers/measure/tiering/classifier.py +58 -0
nthlayer_workers/measure/tiering/promotion.py +118 -0
nthlayer_workers/measure/trends/__init__.py +0 -0
nthlayer_workers/measure/trends/tracker.py +72 -0
nthlayer_workers/measure/types.py +75 -0
nthlayer_workers/measure/worker.py +439 -0
nthlayer_workers/observe/__init__.py +25 -0
nthlayer_workers/observe/__main__.py +5 -0
nthlayer_workers/observe/api/__init__.py +1 -0
nthlayer_workers/observe/assessment.py +95 -0
nthlayer_workers/observe/cli.py +737 -0
nthlayer_workers/observe/config.py +11 -0
nthlayer_workers/observe/db/__init__.py +1 -0
nthlayer_workers/observe/decision_records.py +220 -0
nthlayer_workers/observe/dependencies/__init__.py +18 -0
nthlayer_workers/observe/dependencies/discovery.py +294 -0
nthlayer_workers/observe/dependencies/providers/__init__.py +48 -0
nthlayer_workers/observe/dependencies/providers/backstage.py +467 -0
nthlayer_workers/observe/dependencies/providers/base.py +76 -0
nthlayer_workers/observe/dependencies/providers/consul.py +518 -0
nthlayer_workers/observe/dependencies/providers/etcd.py +360 -0
nthlayer_workers/observe/dependencies/providers/kubernetes.py +682 -0
nthlayer_workers/observe/dependencies/providers/prometheus.py +368 -0
nthlayer_workers/observe/dependencies/providers/zookeeper.py +399 -0
nthlayer_workers/observe/deployments/__init__.py +1 -0
nthlayer_workers/observe/discovery/__init__.py +14 -0
nthlayer_workers/observe/discovery/classifier.py +66 -0
nthlayer_workers/observe/discovery/client.py +189 -0
nthlayer_workers/observe/discovery/models.py +53 -0
nthlayer_workers/observe/drift/__init__.py +26 -0
nthlayer_workers/observe/drift/analyzer.py +383 -0
nthlayer_workers/observe/drift/models.py +174 -0
nthlayer_workers/observe/drift/patterns.py +88 -0
nthlayer_workers/observe/explanation.py +118 -0
nthlayer_workers/observe/gate/__init__.py +39 -0
nthlayer_workers/observe/gate/conditions.py +92 -0
nthlayer_workers/observe/gate/correlator.py +154 -0
nthlayer_workers/observe/gate/evaluator.py +192 -0
nthlayer_workers/observe/gate/policies.py +226 -0
nthlayer_workers/observe/gate_adapter.py +40 -0
nthlayer_workers/observe/incident.py +36 -0
nthlayer_workers/observe/portfolio/__init__.py +17 -0
nthlayer_workers/observe/portfolio/aggregator.py +168 -0
nthlayer_workers/observe/portfolio/scorer.py +13 -0
nthlayer_workers/observe/slo/__init__.py +19 -0
nthlayer_workers/observe/slo/collector.py +235 -0
nthlayer_workers/observe/slo/spec_loader.py +40 -0
nthlayer_workers/observe/sqlite_store.py +152 -0
nthlayer_workers/observe/store.py +92 -0
nthlayer_workers/observe/verification/__init__.py +22 -0
nthlayer_workers/observe/verification/exporter_guidance.py +146 -0
nthlayer_workers/observe/verification/extractor.py +127 -0
nthlayer_workers/observe/verification/models.py +101 -0
nthlayer_workers/observe/verification/verifier.py +111 -0
nthlayer_workers/observe/worker.py +332 -0
nthlayer_workers/respond/__init__.py +2 -0
nthlayer_workers/respond/__main__.py +4 -0
nthlayer_workers/respond/agents/__init__.py +0 -0
nthlayer_workers/respond/agents/base.py +556 -0
nthlayer_workers/respond/agents/communication.py +115 -0
nthlayer_workers/respond/agents/investigation.py +124 -0
nthlayer_workers/respond/agents/remediation.py +219 -0
nthlayer_workers/respond/agents/triage.py +132 -0
nthlayer_workers/respond/cli.py +772 -0
nthlayer_workers/respond/config.py +135 -0
nthlayer_workers/respond/context_store.py +256 -0
nthlayer_workers/respond/coordinator.py +487 -0
nthlayer_workers/respond/metrics.py +104 -0
nthlayer_workers/respond/notification_backends/__init__.py +1 -0
nthlayer_workers/respond/notification_backends/ntfy_backend.py +158 -0
nthlayer_workers/respond/notification_backends/protocol.py +59 -0
nthlayer_workers/respond/notification_backends/slack_backend.py +203 -0
nthlayer_workers/respond/notification_backends/stdout_backend.py +56 -0
nthlayer_workers/respond/notifications.py +247 -0
nthlayer_workers/respond/oncall/__init__.py +1 -0
nthlayer_workers/respond/oncall/escalation.py +103 -0
nthlayer_workers/respond/oncall/runner.py +193 -0
nthlayer_workers/respond/oncall/schedule.py +243 -0
nthlayer_workers/respond/safe_actions/__init__.py +0 -0
nthlayer_workers/respond/safe_actions/actions.py +139 -0
nthlayer_workers/respond/safe_actions/registry.py +171 -0
nthlayer_workers/respond/safe_actions/webhook.py +194 -0
nthlayer_workers/respond/server.py +357 -0
nthlayer_workers/respond/sre/__init__.py +1 -0
nthlayer_workers/respond/sre/brief.py +175 -0
nthlayer_workers/respond/sre/delegation.py +101 -0
nthlayer_workers/respond/sre/post_incident.py +146 -0
nthlayer_workers/respond/sre/shift_report.py +129 -0
nthlayer_workers/respond/sre/suppression.py +91 -0
nthlayer_workers/respond/types.py +109 -0
nthlayer_workers/respond/verdict_submission.py +56 -0
nthlayer_workers/respond/worker.py +533 -0
nthlayer_workers/respond/worker_helpers.py +140 -0
nthlayer_workers/runner.py +198 -0
nthlayer_workers-1.0.0.dist-info/METADATA +19 -0
nthlayer_workers-1.0.0.dist-info/RECORD +175 -0
nthlayer_workers-1.0.0.dist-info/WHEEL +5 -0
nthlayer_workers-1.0.0.dist-info/entry_points.txt +2 -0
nthlayer_workers-1.0.0.dist-info/top_level.txt +1 -0

nthlayer_workers/respond/config.py ADDED Viewed

@@ -0,0 +1,135 @@
+# nthlayer_workers/respond/config.py
+"""nthlayer-respond configuration."""
+from __future__ import annotations
+import os
+from dataclasses import dataclass
+import structlog
+import yaml
+logger = structlog.get_logger()
@dataclass
class RespondConfig:
    """Settings for the respond worker, with defaults for every field.

    Instances are built either directly (all defaults) or by
    ``load_config`` from a YAML file. The four worker-mode timing fields are
    validated in ``__post_init__``.

    Raises:
        ValueError: if any worker-mode timing field is not a real number,
            is NaN, or is negative.
    """

    # Coordinator
    poll_interval_seconds: int = 30
    escalation_threshold: float = 0.3

    # Agents — NTHLAYER_MODEL env var takes precedence over hardcoded default.
    # NOTE: the environment lookup runs once, when this class body executes
    # (i.e. at import time); later changes to NTHLAYER_MODEL do not alter the
    # default of already-imported code.
    model: str = os.environ.get("NTHLAYER_MODEL", "claude-sonnet-4-20250514")
    max_tokens: int = 4096
    triage_timeout: int = 15
    investigation_timeout: int = 60
    communication_timeout: int = 20
    remediation_timeout: int = 30
    root_cause_threshold: float = 0.7

    # Safe actions
    cooldown_seconds: int = 300
    arbiter_url: str = "http://localhost:8080"

    # Stores
    verdict_store_path: str = "verdicts.db"
    context_store_path: str = "respond-incidents.db"

    # Topology
    manifests_dir: str | None = None

    # Server
    server_host: str = "0.0.0.0"
    server_port: int = 8090

    # Approval
    approval_timeout_seconds: int = 900

    # Slack (interactive buttons)
    slack_signing_secret: str = ""
    slack_bot_token: str = ""

    # Notification backends (on-call escalation)
    ntfy_server_url: str = ""
    ntfy_auth_token: str = ""
    twilio_account_sid: str = ""
    twilio_auth_token: str = ""
    twilio_from_number: str = ""
    pagerduty_routing_key: str = ""
    webhook_base_url: str = "http://localhost:8090"

    # Worker mode (P3-E.1) — set by nthlayer-workers CLI from nthlayer.yaml,
    # not from respond.yaml. Defaults exist so legacy CLI construction is
    # unaffected.
    cycle_interval_seconds: float = 30.0
    fallback_threshold_seconds: float = 60.0
    terminal_retention_seconds: float = 86400.0
    step_timeout_seconds: float = 90.0

    def __post_init__(self) -> None:
        """Fail loudly on unusable worker-mode timing values.

        Negative values silently invert cutoff/threshold semantics (e.g. a
        negative fallback threshold turns the cutoff into a future timestamp,
        matching every breach). Zero is allowed for all four fields:
        threshold=0 fires fallback immediately (used by integration tests),
        retention=0 prunes terminal incidents on the next cycle, cycle=0 is a
        busy loop (degenerate but not catastrophic), and step_timeout=0
        disables the per-step timeout.
        """
        for name in (
            "cycle_interval_seconds",
            "fallback_threshold_seconds",
            "terminal_retention_seconds",
            "step_timeout_seconds",
        ):
            value = getattr(self, name)
            # A null or string value (e.g. from a sloppy YAML file) would
            # otherwise surface as a TypeError from the comparison below.
            if isinstance(value, bool) or not isinstance(value, (int, float)):
                raise ValueError(
                    f"RespondConfig.{name} must be a number >= 0, got {value!r}"
                )
            # ``not value >= 0`` (rather than ``value < 0``) also rejects NaN,
            # for which every ordering comparison is False.
            if not value >= 0:
                raise ValueError(f"RespondConfig.{name} must be >= 0, got {value!r}")
def _mapping_section(container: dict, key: str) -> dict:
    """Return the mapping stored under *key*, or ``{}`` if absent or null.

    A YAML key with no value (``agents:``) parses to ``None``; treating that
    as an empty section keeps the defaults instead of crashing on ``.get``.

    Raises:
        ValueError: if the value is present but is not a mapping.
    """
    value = container.get(key)
    if value is None:
        return {}
    if not isinstance(value, dict):
        raise ValueError(
            f"config section {key!r} must be a mapping, got {type(value).__name__}"
        )
    return value


def load_config(path: str) -> RespondConfig:
    """Load config from a YAML file; return defaults if the file is missing.

    Only the respond.yaml sections are read here; the worker-mode timing
    fields of ``RespondConfig`` keep their defaults.

    Args:
        path: Location of the YAML configuration file.

    Returns:
        A populated ``RespondConfig``.

    Raises:
        ValueError: if the document or one of its sections is not a mapping,
            or if ``poll_interval_seconds`` / ``escalation_threshold`` are
            out of range.
    """
    try:
        with open(path, encoding="utf-8") as f:
            data = yaml.safe_load(f) or {}
    except FileNotFoundError:
        logger.info("config_not_found", path=path)
        return RespondConfig()

    if not isinstance(data, dict):
        raise ValueError(
            f"config file {path!r} must contain a mapping at top level, "
            f"got {type(data).__name__}"
        )

    coord = _mapping_section(data, "coordinator")
    agents = _mapping_section(data, "agents")
    safe = _mapping_section(data, "safe_actions")
    verdict = _mapping_section(_mapping_section(data, "verdict"), "store")
    ctx_store = _mapping_section(data, "context_store")
    topo = _mapping_section(data, "topology")
    server = _mapping_section(data, "server")
    approval = _mapping_section(data, "approval")
    slack = _mapping_section(data, "slack")
    notifications = _mapping_section(data, "notifications")

    investigation = _mapping_section(agents, "investigation")
    ntfy = _mapping_section(notifications, "ntfy")
    twilio = _mapping_section(notifications, "twilio")

    poll_interval = coord.get("poll_interval_seconds", 30)
    escalation_thresh = coord.get("escalation_threshold", 0.3)
    if not isinstance(poll_interval, (int, float)) or poll_interval <= 0:
        raise ValueError(f"poll_interval_seconds must be a positive number, got {poll_interval!r}")
    # The field is an int, so a fractional value below 1 would be truncated
    # to 0 after having passed the positivity check above.
    if int(poll_interval) <= 0:
        raise ValueError(f"poll_interval_seconds must be at least 1 second, got {poll_interval!r}")
    if not isinstance(escalation_thresh, (int, float)) or not (0.0 <= escalation_thresh <= 1.0):
        raise ValueError(f"escalation_threshold must be between 0.0 and 1.0, got {escalation_thresh!r}")

    return RespondConfig(
        poll_interval_seconds=int(poll_interval),
        escalation_threshold=float(escalation_thresh),
        # Honour NTHLAYER_MODEL when the file does not name a model, matching
        # the default used when no config file exists at all.
        model=agents.get(
            "model", os.environ.get("NTHLAYER_MODEL", "claude-sonnet-4-20250514")
        ),
        max_tokens=agents.get("max_tokens", 4096),
        triage_timeout=_mapping_section(agents, "triage").get("timeout", 15),
        investigation_timeout=investigation.get("timeout", 60),
        communication_timeout=_mapping_section(agents, "communication").get("timeout", 20),
        remediation_timeout=_mapping_section(agents, "remediation").get("timeout", 30),
        root_cause_threshold=investigation.get("root_cause_threshold", 0.7),
        cooldown_seconds=safe.get("cooldown_seconds", 300),
        arbiter_url=safe.get("arbiter_url", "http://localhost:8080"),
        verdict_store_path=verdict.get("path", "verdicts.db"),
        context_store_path=ctx_store.get("path", "respond-incidents.db"),
        manifests_dir=topo.get("manifests_dir"),
        server_host=server.get("host", "0.0.0.0"),
        server_port=int(server.get("port", 8090)),
        approval_timeout_seconds=int(approval.get("timeout_seconds", 900)),
        slack_signing_secret=slack.get("signing_secret", ""),
        slack_bot_token=slack.get("bot_token", ""),
        ntfy_server_url=ntfy.get("server_url", ""),
        ntfy_auth_token=ntfy.get("auth_token", ""),
        twilio_account_sid=twilio.get("account_sid", ""),
        twilio_auth_token=twilio.get("auth_token", ""),
        twilio_from_number=twilio.get("from_number", ""),
        pagerduty_routing_key=_mapping_section(notifications, "pagerduty").get("routing_key", ""),
        webhook_base_url=_mapping_section(notifications, "webhook").get(
            "public_url", "http://localhost:8090"
        ),
    )

nthlayer_workers/respond/context_store.py ADDED Viewed

@@ -0,0 +1,256 @@
+"""SQLite-backed context store for incident crash recovery."""
+from __future__ import annotations
+import dataclasses
+import json
+import sqlite3
+from typing import Protocol
+from nthlayer_workers.respond.types import (
+    CommunicationResult,
+    CommunicationUpdate,
+    Hypothesis,
+    IncidentContext,
+    IncidentState,
+    InvestigationResult,
+    RemediationResult,
+    TERMINAL_STATES,
+    TriageResult,
+)
class ContextStore(Protocol):
    """Structural interface for incident-context persistence backends.

    ``SQLiteContextStore`` in this module is one implementation; any object
    providing these methods satisfies the protocol.
    """

    def save(self, context: IncidentContext) -> None:
        """Persist *context*."""
        ...

    def load(self, incident_id: str) -> IncidentContext | None:
        """Return the context stored under *incident_id*, or ``None``."""
        ...

    def list_active(self) -> list[str]:
        """Return the ids of incidents that are still active."""
        ...

    def list_all(self, limit: int = 50) -> list[IncidentContext]:
        """Return at most *limit* stored incident contexts."""
        ...

    def get_metadata(self, key: str) -> str | None:
        """Return the metadata value for *key*, or ``None`` if unset."""
        ...

    def set_metadata(self, key: str, value: str) -> None:
        """Store *value* under the metadata *key*."""
        ...

    def close(self) -> None:
        """Release any resources held by the store."""
        ...
def incident_context_to_dict(ctx: IncidentContext) -> dict:
    """Flatten an IncidentContext into a dict of plain containers.

    ``dataclasses.asdict()`` recurses through nested dataclasses, lists,
    tuples and dicts. Any other value — enum members included — is copied
    through unchanged rather than reduced to its ``.value``.
    NOTE(review): ``SQLiteContextStore.save`` feeds this result to
    ``json.dumps``, so ``state`` presumably is a ``str``-based enum that
    encodes as its string value — confirm against ``IncidentState``.

    ``verdict_chain`` is a dataclass field on IncidentContext and is
    therefore carried over. That matters for lineage continuity across
    worker restarts: after a restore, ``_emit_verdict`` reads
    ``context.verdict_chain[-1]`` to chain new verdicts to their
    predecessors, so a lost chain would produce verdicts with parent=None.
    See ``test_state_roundtrip_preserves_all_fields``.
    """
    as_plain = dataclasses.asdict(ctx)
    return as_plain
_REQUIRED_INCIDENT_FIELDS = ("id", "state", "created_at", "updated_at", "trigger_source")


def incident_context_from_dict(data: dict) -> IncidentContext:
    """Reconstruct a fully typed IncidentContext from its serialised dict form.

    Required fields (id, state, created_at, updated_at, trigger_source) MUST
    be present. Corrupt or malformed input raises ValueError so callers such
    as the worker's restore path or ``list_all`` can skip the record instead
    of producing an incident made of default values.

    Args:
        data: The dict produced by ``incident_context_to_dict`` (typically
            after a JSON round trip).

    Returns:
        The rebuilt IncidentContext, with nested results restored to their
        dataclass types.

    Raises:
        ValueError: if *data* is not a dict, lacks a required field, carries
            an unknown ``state`` value, or has nested sections whose shape
            does not match the corresponding dataclass.
    """
    if not isinstance(data, dict):
        raise ValueError(f"incident_context_from_dict: expected dict, got {type(data).__name__}")
    missing = [f for f in _REQUIRED_INCIDENT_FIELDS if f not in data]
    if missing:
        raise ValueError(f"incident_context_from_dict: missing required fields {missing}")

    try:
        # Nested dataclasses are rebuilt by hand because dict unpacking alone
        # would leave them as plain dicts.
        triage: TriageResult | None = None
        if data.get("triage") is not None:
            triage = TriageResult(**data["triage"])

        investigation: InvestigationResult | None = None
        if data.get("investigation") is not None:
            inv = data["investigation"]
            investigation = InvestigationResult(
                hypotheses=[Hypothesis(**h) for h in inv.get("hypotheses", [])],
                root_cause=inv.get("root_cause"),
                root_cause_confidence=inv.get("root_cause_confidence", 0.0),
                reasoning=inv.get("reasoning", ""),
                confidence=inv.get("confidence"),
            )

        communication: CommunicationResult | None = None
        if data.get("communication") is not None:
            comm = data["communication"]
            communication = CommunicationResult(
                updates_sent=[CommunicationUpdate(**u) for u in comm.get("updates_sent", [])],
                reasoning=comm.get("reasoning", ""),
                confidence=comm.get("confidence"),
            )

        remediation: RemediationResult | None = None
        if data.get("remediation") is not None:
            remediation = RemediationResult(**data["remediation"])

        # The "state" key is guaranteed present by the check above, but a
        # null/empty value still falls back to TRIGGERED (a real
        # IncidentState member). An unrecognised value makes IncidentState()
        # raise ValueError, which propagates unchanged.
        raw_state = data["state"] or IncidentState.TRIGGERED.value

        return IncidentContext(
            id=data["id"],
            state=IncidentState(raw_state),
            created_at=data["created_at"],
            updated_at=data["updated_at"],
            trigger_source=data["trigger_source"],
            trigger_verdict_ids=data.get("trigger_verdict_ids", []),
            topology=data.get("topology", {}),
            triage=triage,
            investigation=investigation,
            communication=communication,
            remediation=remediation,
            verdict_chain=data.get("verdict_chain", []),
            last_completed_step_index=data.get("last_completed_step_index"),
            error=data.get("error"),
            metadata=data.get("metadata", {}),
        )
    except (AttributeError, TypeError) as exc:
        # Unexpected keys, or a nested section that is not a mapping/list,
        # surface here; report them as the documented ValueError.
        raise ValueError(
            f"incident_context_from_dict: malformed incident data: {exc}"
        ) from exc
# Schema: one row per incident. ``data`` holds the full JSON-encoded context;
# ``state``, ``error`` and the timestamps are duplicated into columns so they
# can be queried without decoding the JSON.
_CREATE_INCIDENTS = """
CREATE TABLE IF NOT EXISTS incidents (
    id         TEXT PRIMARY KEY,
    state      TEXT NOT NULL,
    error      TEXT,
    data       TEXT NOT NULL,
    created_at TEXT NOT NULL,
    updated_at TEXT NOT NULL
)
"""
# Free-form key/value pairs stored alongside the incidents.
_CREATE_METADATA = """
CREATE TABLE IF NOT EXISTS metadata (
    key   TEXT PRIMARY KEY,
    value TEXT NOT NULL
)
"""
_CREATE_IDX_STATE = "CREATE INDEX IF NOT EXISTS idx_incidents_state ON incidents (state)"
_CREATE_IDX_UPDATED = "CREATE INDEX IF NOT EXISTS idx_incidents_updated_at ON incidents (updated_at DESC)"


class SQLiteContextStore:
    """SQLite-backed store for IncidentContext objects.

    Uses WAL journal mode and a 5 000 ms busy timeout so concurrent readers
    (e.g. CLI status queries) do not block the coordinator.
    """

    def __init__(self, db_path: str) -> None:
        """Open *db_path* and create the schema if it does not exist yet.

        Raises:
            sqlite3.Error: if the database cannot be opened or initialised;
                the connection is closed before the error propagates.
        """
        # check_same_thread=False lets the connection be used from threads
        # other than the creating one; no locking is done in this class.
        conn = sqlite3.connect(db_path, check_same_thread=False)
        try:
            conn.execute("PRAGMA journal_mode=WAL")
            conn.execute("PRAGMA busy_timeout=5000")
            conn.execute(_CREATE_INCIDENTS)
            conn.execute(_CREATE_METADATA)
            conn.execute(_CREATE_IDX_STATE)
            conn.execute(_CREATE_IDX_UPDATED)
            conn.commit()
        except Exception:
            # Do not leak the handle when schema setup fails.
            conn.close()
            raise
        self._conn = conn

    # ------------------------------------------------------------------
    # Core CRUD
    # ------------------------------------------------------------------
    def save(self, context: IncidentContext) -> None:
        """Persist context, overwriting any previous record with the same id.

        On conflict ``created_at`` is left untouched; every other column is
        replaced by the new values.
        """
        data_json = json.dumps(incident_context_to_dict(context))
        self._conn.execute(
            """
            INSERT INTO incidents (id, state, error, data, created_at, updated_at)
            VALUES (?, ?, ?, ?, ?, ?)
            ON CONFLICT(id) DO UPDATE SET
                state      = excluded.state,
                error      = excluded.error,
                data       = excluded.data,
                updated_at = excluded.updated_at
            """,
            (
                context.id,
                context.state.value,
                context.error,
                data_json,
                context.created_at,
                context.updated_at,
            ),
        )
        self._conn.commit()

    def load(self, incident_id: str) -> IncidentContext | None:
        """Return a fully typed IncidentContext, or None if not found.

        Unlike ``list_all``, a corrupt row is not skipped: decoding errors
        propagate to the caller.
        """
        row = self._conn.execute(
            "SELECT data FROM incidents WHERE id = ?",
            (incident_id,),
        ).fetchone()
        if row is None:
            return None
        return incident_context_from_dict(json.loads(row[0]))

    # ------------------------------------------------------------------
    # Queries
    # ------------------------------------------------------------------
    def list_active(self) -> list[str]:
        """Return ids of all incidents not in a terminal state."""
        terminal_values = tuple(s.value for s in TERMINAL_STATES)
        # Only "?" placeholders are interpolated into the SQL text; the state
        # values themselves are bound as parameters.
        placeholders = ",".join("?" * len(terminal_values))
        rows = self._conn.execute(
            f"SELECT id FROM incidents WHERE state NOT IN ({placeholders})",
            terminal_values,
        ).fetchall()
        return [row[0] for row in rows]

    def list_all(self, limit: int = 50) -> list[IncidentContext]:
        """Return up to *limit* incidents ordered by most recently updated.

        Rows whose stored JSON cannot be decoded or rebuilt into an
        IncidentContext are skipped rather than failing the whole listing.
        """
        rows = self._conn.execute(
            "SELECT data FROM incidents ORDER BY updated_at DESC LIMIT ?",
            (limit,),
        ).fetchall()
        results = []
        for (raw,) in rows:
            try:
                payload = json.loads(raw)
                results.append(incident_context_from_dict(payload))
            except (AttributeError, KeyError, TypeError, ValueError):
                # json.JSONDecodeError is a ValueError subclass. TypeError and
                # AttributeError cover nested sections whose shape no longer
                # matches the dataclasses.
                continue  # skip corrupted rows
        return results

    # ------------------------------------------------------------------
    # Metadata key-value store
    # ------------------------------------------------------------------
    def get_metadata(self, key: str) -> str | None:
        """Return a stored metadata value, or None if the key is absent."""
        row = self._conn.execute(
            "SELECT value FROM metadata WHERE key = ?",
            (key,),
        ).fetchone()
        return row[0] if row is not None else None

    def set_metadata(self, key: str, value: str) -> None:
        """Insert or replace a metadata key-value pair."""
        self._conn.execute(
            """
            INSERT INTO metadata (key, value) VALUES (?, ?)
            ON CONFLICT(key) DO UPDATE SET value = excluded.value
            """,
            (key, value),
        )
        self._conn.commit()

    # ------------------------------------------------------------------
    # Lifecycle
    # ------------------------------------------------------------------
    def close(self) -> None:
        """Close the underlying database connection."""
        self._conn.close()