delimit-cli 4.6.0 → 4.6.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,185 @@
1
+ """LED-1415 — CLI subprocess contract.
2
+
3
+ The deliberation engine drives 4 model CLIs as subprocesses
4
+ (claude / codex / gemini / cursor) and treats their stdout as model
5
+ verdict text. Three classes of bug have surfaced in this pipeline:
6
+
7
+ 1. Banner contamination — the Delimit governance shim leaks ASCII
8
+ art onto stdout instead of stderr (PR #154, fixed by LED-1428).
9
+ 2. Empty/silent responses — CLI exits 0 but stdout is empty
10
+ (transient API issues, OOM, network blips). Caught by LED-1416's
11
+ retry state machine.
12
+ 3. Schema drift — CLI changes its output shape between versions
13
+ (e.g., adds an auto-correction line at the top). Caught
14
+ reactively by failing deliberation panels.
15
+
16
+ This module holds the ONE contract that every CLI response must
17
+ satisfy + the ONE validator that enforces it. Both the per-CLI mock
18
+ tests (tests/test_cli_contract.py) AND the weekly real-CLI smoke
19
+ script (scripts/smoke_cli_contracts.py) call validate_cli_contract()
20
+ so the contract definition lives in exactly one place — extending
21
+ it means editing this module only, not remembering to update two places.
22
+ """
23
+
24
+ from __future__ import annotations
25
+
26
+ import re
27
+ from dataclasses import dataclass, field
28
+ from typing import List, Optional
29
+
30
+
31
# The four CLIs the deliberation engine targets. "cursor" is listed even
# though it is not yet installed in the dev environment, so the contract
# surface is ready when it lands; the smoke script skips a CLI whose
# binary is not present.
KNOWN_CLI_NAMES = ("claude", "codex", "gemini", "cursor")


# Minimum scrubbed-response length accepted as "looks like a real model
# verdict" rather than "leftover garbage after banner strip".
# Calibrated against historical scrub-debug.jsonl entries: every real
# round-1/round-2 verdict was >= 60 chars and every banner-only
# contamination was < 30 chars. 30 is the cutoff the production scrubber
# already uses, so the validator and the scrubber agree.
MIN_VERDICT_LEN = 30


# Patterns that signal "the response is contamination, not a verdict".
# A match REJECTS the response even if the length and scrub checks pass.
# Every pattern is anchored at the start of the scrubbed text; none uses
# re.MULTILINE, so "^" and "$" refer to the whole string.
_CONTAMINATION_MARKERS = (
    re.compile(r"^\[scrub:\s*contaminated\b", re.IGNORECASE),
    re.compile(r"^\[.+\bunavailable\b.+\bnot found in PATH\]", re.IGNORECASE),
    re.compile(r"^\[.+\bskipped under INTERNAL_PYTEST_GUARD", re.IGNORECASE),
    re.compile(r"^\[.+\btimed out after\b", re.IGNORECASE),
    re.compile(r"^\[.+\breturned empty response\]", re.IGNORECASE),
    re.compile(r"^\[.+\berror:.+\]\s*$", re.IGNORECASE),
)


# A response should contain at least ONE of these markers to be
# recognizable as a panel verdict. The deliberation engine prompts all
# models to end with `VERDICT: ...`, so that is the primary marker; the
# bare keywords are the fallback for chatty models that omit the
# trailing VERDICT line.
#
# The closing word boundary applies to the keywords only. Putting "\b"
# after "VERDICT:" would require a word character directly after the
# colon, so the usual "VERDICT: AGREE" (colon followed by a space) would
# never match that alternative.
_VERDICT_HINT_RE = re.compile(
    r"\b(VERDICT:|(?:AGREE|DISAGREE|REMEDIATE|APPROVE|REJECT)\b)",
    re.IGNORECASE,
)
70
+
71
+
72
@dataclass
class CliContractResult:
    """Result of checking a single CLI response against the contract.

    Attributes:
        cli: name of the CLI that produced the response.
        raw_len: length of the raw stdout text.
        scrubbed_len: length of the text after scrubbing.
        ok: True only when no contract clause failed.
        failures: one short reason string per clause that fired; the
            smoke script forwards this list so the operator can see what
            shape the regression took.
        preview: leading slice of the scrubbed text, for log readability.
    """

    cli: str
    raw_len: int
    scrubbed_len: int
    ok: bool
    failures: List[str] = field(default_factory=list)
    # First 200 chars of the scrubbed text; empty when nothing was scrubbed.
    preview: str = field(default="")
86
+
87
+
88
def validate_cli_contract(
    cli_name: str,
    raw_stdout: str,
    raw_stderr: str = "",
    expect_verdict_hint: bool = True,
) -> CliContractResult:
    """Check one subprocess response against the per-CLI contract.

    The stdout text is first passed through the production scrubber
    (``ai.deliberation._scrub_cli_output``) so the validator judges the
    same text the deliberation engine would see. Each failed clause adds
    a short reason string; an empty ``failures`` list means the response
    is contract-clean.

    Args:
        cli_name: which CLI produced this (claude/codex/gemini/cursor);
            passed to the scrubber as ``source`` and stored on the result.
        raw_stdout: subprocess stdout, already decoded to str.
        raw_stderr: subprocess stderr, already decoded to str. Accepted
            for interface completeness; the checks below do not read it.
        expect_verdict_hint: when True, the response fails if it contains
            no verdict marker. Pass False for low-content responses such
            as a ``--version`` smoke.

    Returns:
        CliContractResult with ``ok``, ``failures`` and a preview. If the
        scrubber cannot be imported or raises, the result carries a
        single ``scrub_failed:...`` failure and ``scrubbed_len`` 0.
    """
    # The scrubber is imported lazily so this module stays importable in
    # contexts where ai.deliberation is not available.
    try:
        from ai.deliberation import _scrub_cli_output

        cleaned = _scrub_cli_output(raw_stdout, source=cli_name).strip()
    except Exception as exc:
        return CliContractResult(
            cli=cli_name,
            raw_len=len(raw_stdout),
            scrubbed_len=0,
            ok=False,
            failures=[f"scrub_failed:{type(exc).__name__}:{str(exc)[:80]}"],
            preview="",
        )

    problems: List[str] = []

    # Clause 1: a contamination marker left by the scrubber or a tool.
    # Clause 2: minimum length. Only reported when clause 1 did not fire,
    # because a marker already explains why the text is short.
    marker = next((rx for rx in _CONTAMINATION_MARKERS if rx.search(cleaned)), None)
    if marker is not None:
        problems.append(f"contamination_marker:{marker.pattern[:40]}")
    elif len(cleaned) < MIN_VERDICT_LEN:
        problems.append(f"too_short:{len(cleaned)}<{MIN_VERDICT_LEN}")

    # Clause 3: at least one verdict marker, unless the caller opted out.
    if expect_verdict_hint and _VERDICT_HINT_RE.search(cleaned) is None:
        problems.append("no_verdict_hint")

    # Clause 4: defense-in-depth on top of the scrubber. A bracketed
    # prefix is usually a tool-emitted status line; the known ones are
    # left to clause 1, anything else is surfaced for inspection.
    known_prefixes = ("[delimit", "[scrub:", "[claude", "[codex", "[gemini", "[cursor")
    if cleaned.startswith("[") and not cleaned.lower().startswith(known_prefixes):
        problems.append(f"unknown_bracketed_prefix:{cleaned[:40]!r}")

    return CliContractResult(
        cli=cli_name,
        raw_len=len(raw_stdout),
        scrubbed_len=len(cleaned),
        ok=not problems,
        failures=problems,
        preview=cleaned[:200],
    )
170
+
171
+
172
def format_contract_report(results: List[CliContractResult]) -> str:
    """Render validation results as a multi-line summary for ntfy / logs.

    The first line counts the clean results. Each result then gets one
    status line; a failing result is followed by one line per failure
    reason and, when a preview is present, the first 100 characters of it.
    """
    clean_count = len([res for res in results if res.ok])
    report = [f"CLI contract: {clean_count}/{len(results)} clean"]
    for res in results:
        status = "FAIL" if not res.ok else "OK"
        report.append(
            f" [{status}] {res.cli:8s} raw={res.raw_len}B scrubbed={res.scrubbed_len}B"
        )
        if res.ok:
            continue
        # Detail lines are only emitted for failing results.
        report.extend(f" ↳ {reason}" for reason in res.failures)
        if res.preview:
            report.append(f" preview: {res.preview[:100]!r}")
    return "\n".join(report)
@@ -13,7 +13,10 @@ This replaces _with_next_steps — governance IS the next step system.
13
13
  import json
14
14
  import logging
15
15
  import os
16
+ import re
17
+ import subprocess
16
18
  import time
19
+ from datetime import datetime, timezone
17
20
  from pathlib import Path
18
21
  from typing import Any, Dict, List, Optional
19
22
 
@@ -826,6 +829,184 @@ def govern(tool_name: str, result: Dict[str, Any], project_path: str = ".") -> D
826
829
  return governed_result
827
830
 
828
831
 
832
+ # ─────────────────────────────────────────────────────────────────────
833
+ # LED-2214b-followup — sensor_github_issue sync impl
834
+ # ─────────────────────────────────────────────────────────────────────
835
+ #
836
+ # The outreach daemon's monitor_phase needs to call the same logic that
837
+ # delimit_sensor_github_issue (MCP tool) runs, but synchronously and
838
+ # without the _with_next_steps wrapping. Before this extraction the
839
+ # daemon tried to import the impl from two paths that don't exist —
840
+ # `ai.governance._sensor_github_issue_impl` and
841
+ # `backends.governance_bridge.sensor_github_issue` — and silently fell
842
+ # back to "monitor skipped" on every tick, leaving the entire reply-
843
+ # tracking cycle dead.
844
+ #
845
+ # Now both callers share this function. The MCP tool wraps the result
846
+ # with `_with_next_steps`; the daemon consumes the raw dict.
847
+
848
# Lower-case phrases that mark a comment as a negative reply. They are
# matched as plain substrings of the lower-cased comment bodies.
_NEGATIVE_KEYWORDS = (
    "not interested", "won't be", "will not", "don't need", "do not need",
    "no thanks", "pass on", "not a fit", "not for us", "closing",
    "won't adopt", "will not adopt", "reject", "declined",
)

# "owner/repo" shape check. Anchored with \Z rather than $, because $
# also matches just before a trailing newline and would let
# "owner/repo\n" through validation.
_REPO_FORMAT_RE = re.compile(r"^[\w.-]+/[\w.-]+\Z")

# Module-local guard so the warning fires at most once per process.
_REPO_ALLOWLIST_WARNED = False
858
+
859
+
860
def _check_repo_allowlist(repo: str) -> Optional[Dict[str, Any]]:
    """Return a refusal dict when ``repo`` is not in DELIMIT_ALLOWED_REPOS.

    DELIMIT_ALLOWED_REPOS is a comma-separated list; entries are trimmed
    and compared case-insensitively. When the variable is unset or blank
    every repo is allowed, and a warning is logged once per process.

    This intentionally duplicates ai.server._check_repo_allowlist:
    importing from ai.server would create a circular import (server.py
    imports from governance). Both copies must stay in sync until LED-216
    splits the allowlist into its own module.

    Returns:
        None when the call may proceed, otherwise a dict with ``error``,
        ``repo``, ``allowed`` (sorted) and ``hint``.
    """
    global _REPO_ALLOWLIST_WARNED

    configured = os.environ.get("DELIMIT_ALLOWED_REPOS", "").strip()
    if configured:
        permitted = set()
        for item in configured.split(","):
            cleaned = item.strip()
            if cleaned:
                permitted.add(cleaned.lower())
        candidate = (repo or "").strip().lower()
        if candidate in permitted:
            return None
        return {
            "error": "repo_not_allowlisted",
            "repo": repo,
            "allowed": sorted(permitted),
            "hint": (
                "Repo not in DELIMIT_ALLOWED_REPOS. Add it or use a tool "
                "that does not reach external APIs."
            ),
        }

    # No allowlist configured: allow everything, but say so once.
    if not _REPO_ALLOWLIST_WARNED:
        logger.warning(
            "DELIMIT_ALLOWED_REPOS unset — sensor_github_issue calls "
            "pass through to gh api using the caller's token."
        )
        _REPO_ALLOWLIST_WARNED = True
    return None
890
+
891
+
892
def _decode_json_documents(text: str) -> List[Any]:
    """Decode every whitespace-separated JSON document found in ``text``."""
    decoder = json.JSONDecoder()
    documents: List[Any] = []
    pos, end = 0, len(text)
    while True:
        while pos < end and text[pos].isspace():
            pos += 1
        if pos >= end:
            return documents
        document, pos = decoder.raw_decode(text, pos)
        documents.append(document)


def _sensor_github_issue_impl(
    repo: str,
    issue_number: int,
    since_comment_id: int = 0,
) -> Dict[str, Any]:
    """Sync implementation of the sensor_github_issue MCP tool.

    Returns the RAW result dict (no _with_next_steps wrapping); callers
    that want the MCP wrapping apply it themselves. Every failure mode
    returns ``{"error": ..., "has_new_activity": False}`` instead of
    raising — the outreach daemon's monitor loop relies on fail-soft
    behavior so one bad LED doesn't kill the whole tick.

    Args:
        repo: ``owner/repo``; must match _REPO_FORMAT_RE, must not
            contain ``..`` and must pass the DELIMIT_ALLOWED_REPOS check.
        issue_number: positive int. Booleans are rejected even though
            ``bool`` is a subclass of ``int``.
        since_comment_id: only comments with a larger id count as new.

    Result schema (success path):
        {
          "repo": str, "issue_number": str,
          "signal": {id, venture, metric, source, timestamp, severity},
          "issue_state": "open" | "closed" | "unknown",
          "new_comments": [{id, author, created_at, body}, ...],
          "latest_comment_id": int,
          "total_comments": int,
          "has_new_activity": bool,
        }
    """
    # Validate inputs — defense-in-depth even though subprocess.run with
    # list argv (no shell=True) makes classic injection inert.
    if not _REPO_FORMAT_RE.match(repo or ""):
        return {"error": f"Invalid repo format: {repo!r}. Use owner/repo.",
                "has_new_activity": False}
    if ".." in repo:
        return {"error": "Invalid repo: path traversal sequences not allowed",
                "has_new_activity": False}
    if (isinstance(issue_number, bool)
            or not isinstance(issue_number, int)
            or issue_number <= 0):
        return {"error": f"Invalid issue number: {issue_number}",
                "has_new_activity": False}

    refusal = _check_repo_allowlist(repo)
    if refusal is not None:
        refusal.setdefault("has_new_activity", False)
        return refusal

    try:
        # Fetch comments. The REST endpoint is paginated (30 per page by
        # default), so a single request would silently drop everything
        # after the first page and long threads would never show new
        # activity. --paginate follows every page; the jq filter runs per
        # page, so stdout holds one JSON array per page.
        comments_jq = (
            "[.[] | {id: .id, author: .user.login, "
            "created_at: .created_at, body: (.body | .[0:500])}]"
        )
        comments_proc = subprocess.run(
            ["gh", "api", "--paginate",
             f"repos/{repo}/issues/{issue_number}/comments?per_page=100",
             "--jq", comments_jq],
            capture_output=True, text=True, timeout=30,
        )
        if comments_proc.returncode != 0:
            return {
                "error": f"gh api comments failed: {(comments_proc.stderr or '').strip()[:200]}",
                "has_new_activity": False,
            }
        all_comments: List[Dict[str, Any]] = []
        for page in _decode_json_documents(comments_proc.stdout or ""):
            if not isinstance(page, list):
                raise ValueError("unexpected gh api comments payload: expected a JSON array per page")
            all_comments.extend(page)
        new_comments = [c for c in all_comments if c.get("id", 0) > since_comment_id]

        # Fetch issue state
        issue_jq = "{state: .state, labels: [.labels[].name], reactions: .reactions.total_count}"
        issue_proc = subprocess.run(
            ["gh", "api",
             f"repos/{repo}/issues/{issue_number}",
             "--jq", issue_jq],
            capture_output=True, text=True, timeout=30,
        )
        if issue_proc.returncode != 0:
            return {
                "error": f"gh api issue failed: {(issue_proc.stderr or '').strip()[:200]}",
                "has_new_activity": False,
            }
        issue_info = json.loads(issue_proc.stdout) if issue_proc.stdout.strip() else {}
        issue_state = issue_info.get("state", "unknown")

        # Severity classification — green default; amber on closed; red on
        # negative keyword in any new comment body.
        severity = "green"
        combined_body = " ".join(c.get("body", "") or "" for c in new_comments).lower()
        has_negative = any(kw in combined_body for kw in _NEGATIVE_KEYWORDS)
        if has_negative:
            severity = "red"
        elif issue_state == "closed":
            severity = "amber"

        # With no comments at all the cursor stays where the caller left it.
        latest_comment_id = max((c.get("id", 0) for c in all_comments), default=since_comment_id)
        repo_key = repo.replace("/", "_")

        return {
            "repo": repo,
            "issue_number": str(issue_number),
            "signal": {
                "id": f"sensor:github_issue:{repo_key}:{issue_number}",
                "venture": "delimit",
                "metric": "outreach_issue_activity",
                "source": f"https://github.com/{repo}/issues/{issue_number}",
                "timestamp": datetime.now(timezone.utc).isoformat(),
                "severity": severity,
            },
            "issue_state": issue_state,
            "new_comments": new_comments,
            "latest_comment_id": latest_comment_id,
            "total_comments": len(all_comments),
            "has_new_activity": len(new_comments) > 0,
        }
    except subprocess.TimeoutExpired:
        return {"error": "gh command timed out after 30s",
                "has_new_activity": False}
    except json.JSONDecodeError as exc:
        return {"error": f"Failed to parse gh output: {exc}",
                "has_new_activity": False}
    except Exception as exc:  # noqa: BLE001 — sensor must fail soft
        logger.error("sensor_github_issue impl error: %s", exc)
        return {"error": str(exc), "has_new_activity": False}
1008
+
1009
+
829
1010
  def _deep_get(d: Dict, key: str) -> Any:
830
1011
  """Get a value from a dict, supporting nested keys with dots."""
831
1012
  if "." in key:
@@ -0,0 +1,290 @@
1
+ """Heartbeat liveness framework — Phase 1 local file-based (LED-1412).
2
+
3
+ Solves the silent-staleness class that the 2026-05-15 session exposed:
4
+ delimit-reddit-proxy.service was inactive/disabled for 13 days, all
5
+ reddit scans failed silently with 429/403, and the founder noticed via
6
+ "3 day old posts" — not the system. There was no central liveness
7
+ reporting and no alert.
8
+
9
+ Phase 1 (this module): every scheduled task writes a heartbeat file
10
+ when it runs. A central check tool walks the heartbeat directory and
11
+ flags anything stale. Local-only — Codex's correct caveat that
12
+ heartbeats can't catch a full-host outage motivates Phase 2 (external
13
+ deadman ping, tracked separately as LED-1414).
14
+
15
+ Heartbeat file format — one per service at ~/.delimit/heartbeats/<service>.json:
16
+ {
17
+ "service": "delimit-reddit-proxy",
18
+ "last_run": "2026-05-15T14:23:51Z",
19
+ "last_success": "2026-05-15T14:23:51Z", # may differ from last_run on partial failure
20
+ "status": "ok" | "degraded" | "failed",
21
+ "next_expected": "2026-05-15T15:23:51Z",
22
+ "detail": "string — optional one-line context for status != ok"
23
+ }
24
+
25
+ Memory anchor: feedback_corrupted_worktree_phantom_failures.md (sister
26
+ failure class — both surface as "system reports stale data because no-one
27
+ checks freshness").
28
+ """
29
+
30
+ from __future__ import annotations
31
+
32
+ import json
33
+ import os
34
+ import time
35
+ from pathlib import Path
36
+ from typing import Any, Dict, List, Optional
37
+
38
# Single directory that holds every heartbeat file. Tests can point
# elsewhere through the environment (see _heartbeat_dir).
DEFAULT_HEARTBEAT_DIR = Path.home().joinpath(".delimit", "heartbeats")

# Per-service staleness thresholds, in seconds. A config file at
# ~/.delimit/heartbeats/_thresholds.json can override them. Keys match
# the `service` value written by write_heartbeat().
DEFAULT_STALENESS_THRESHOLDS: Dict[str, int] = {
    # Reddit scanner / hourly social loop: stale after 2 hours.
    "delimit-reddit-proxy": 2 * 60 * 60,
    "delimit-social-loop": 2 * 60 * 60,
    # Inbox daemon polls every 5 minutes: stale after 30 minutes.
    "delimit-inbox": 30 * 60,
    # License watch runs on a daily timer: stale after 36 hours.
    "delimit-license-watch": 36 * 60 * 60,
    # Drift check runs daily: stale after 36 hours.
    "delimit-drift-check": 36 * 60 * 60,
    # stake.one INJ-claim runs daily at 13:00 UTC: stale after 30 hours.
    "stakeone-inj-claim": 30 * 60 * 60,
}

# Threshold applied to any service missing from the map above (24 hours).
DEFAULT_FALLBACK_STALENESS = 24 * 60 * 60
60
+
61
+
62
def _heartbeat_dir(override: Optional[str] = None) -> Path:
    """Pick the heartbeat directory.

    Precedence, first non-empty value wins:
      1. the explicit ``override`` argument
      2. the DELIMIT_HEARTBEAT_DIR environment variable
      3. DEFAULT_HEARTBEAT_DIR
    """
    chosen = override or os.environ.get("DELIMIT_HEARTBEAT_DIR")
    return Path(chosen) if chosen else DEFAULT_HEARTBEAT_DIR
74
+
75
+
76
def _now_iso() -> str:
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ``.

    Second precision with a literal ``Z`` suffix, matching the existing
    delimit timestamp convention.
    """
    utc_now = time.gmtime()
    return time.strftime("%Y-%m-%dT%H:%M:%SZ", utc_now)
80
+
81
+
82
def _parse_iso(ts: str) -> Optional[float]:
    """Parse a ``YYYY-MM-DDTHH:MM:SSZ`` UTC timestamp to a unix epoch float.

    Args:
        ts: timestamp in the format produced by _now_iso() (UTC, second
            precision, literal ``Z`` suffix).

    Returns:
        Seconds since the epoch, or None when ``ts`` is empty, not a
        string, or not in the expected format. Callers treat None as
        "unknown" (degraded but not actionable).
    """
    # Function-scope import keeps the fix self-contained in this block.
    import calendar

    if not ts:
        return None
    try:
        parsed = time.strptime(ts, "%Y-%m-%dT%H:%M:%SZ")
    except (ValueError, TypeError):
        return None
    # timegm() reads the struct_time as UTC. mktime() minus time.timezone
    # is not equivalent: mktime() works in local time and applies DST,
    # while time.timezone is the non-DST offset, so the result is off by
    # the DST offset whenever the local zone observes DST at that instant.
    return float(calendar.timegm(parsed))
93
+
94
+
95
def write_heartbeat(
    service: str,
    status: str = "ok",
    next_expected_in: Optional[int] = None,
    detail: str = "",
    success: bool = True,
    heartbeat_dir: Optional[str] = None,
) -> Dict[str, Any]:
    """Write a heartbeat for `service`.

    Called by every scheduled task at the end of its run. On success,
    pass status='ok' and success=True (default). On partial failure
    (e.g., one of N subreddits 429'd but most succeeded), pass
    status='degraded'. On total failure, status='failed' + success=False.

    Args:
        service: stable service identifier (e.g., 'delimit-reddit-proxy').
            Should match the systemd unit name where applicable. Used
            verbatim as the file name (``<service>.json``).
        status: 'ok' | 'degraded' | 'failed'.
        next_expected_in: seconds until the next run is expected. When
            given (and non-zero) it is stored as the ``next_expected``
            timestamp; otherwise ``next_expected`` is an empty string.
        detail: optional one-line context for operators.
        success: True if the run achieved its primary purpose (independent
            of `status` — a successful run can still be 'degraded' if
            some optional sub-tasks failed). last_success only advances
            when True; otherwise the previously recorded value is kept.
        heartbeat_dir: override the heartbeat directory (for tests).

    Returns:
        Dict with the written record (also persisted to disk).

    Raises:
        OSError: if the directory or file cannot be written.
    """
    target_dir = _heartbeat_dir(heartbeat_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    file_path = target_dir / f"{service}.json"

    now = _now_iso()
    next_expected = ""
    if next_expected_in:
        next_expected_epoch = time.time() + next_expected_in
        next_expected = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime(next_expected_epoch))

    # Preserve last_success across runs (only update if this run succeeded).
    last_success = now if success else ""
    if not success:
        try:
            prior = json.loads(file_path.read_text())
        except (OSError, ValueError):
            # Missing, unreadable, undecodable or corrupted prior file:
            # treat as "no last_success known".
            prior = None
        # A prior file holding valid JSON that is not an object has no
        # last_success to carry over either.
        if isinstance(prior, dict):
            last_success = prior.get("last_success", "")

    record = {
        "service": service,
        "last_run": now,
        "last_success": last_success,
        "status": status,
        "next_expected": next_expected,
        "detail": detail,
    }

    # Write to a sibling temp file and rename it into place so a
    # concurrent reader never sees a half-written heartbeat. The ".tmp"
    # suffix keeps the temp file out of the "*.json" scan.
    payload = json.dumps(record, indent=2) + "\n"
    tmp_path = file_path.with_name(f"{file_path.name}.{os.getpid()}.tmp")
    try:
        tmp_path.write_text(payload)
        os.replace(tmp_path, file_path)
    except BaseException:
        try:
            tmp_path.unlink()
        except OSError:
            pass
        raise
    return record
155
+
156
+
157
def read_heartbeats(heartbeat_dir: Optional[str] = None) -> List[Dict[str, Any]]:
    """Read every heartbeat file in the directory.

    Files are visited in sorted name order and ``_thresholds.json`` is
    skipped. A file that cannot be read, is not valid JSON, or holds JSON
    that is not an object is NOT dropped: it is reported as a synthetic
    record with ``status == "parse_error"`` so the operator sees it.

    Args:
        heartbeat_dir: override the heartbeat directory (for tests).

    Returns:
        One dict per heartbeat file; an empty list when the directory
        does not exist.
    """
    target_dir = _heartbeat_dir(heartbeat_dir)
    if not target_dir.exists():
        return []
    out: List[Dict[str, Any]] = []
    for path in sorted(target_dir.glob("*.json")):
        # Skip the threshold config file
        if path.name == "_thresholds.json":
            continue
        problem = ""
        data: Any = None
        try:
            data = json.loads(path.read_text())
        except (OSError, ValueError) as e:
            # ValueError covers both JSONDecodeError and UnicodeDecodeError.
            problem = f"{type(e).__name__}: {e}"
        else:
            # Callers use dict methods on every record, so valid JSON of
            # another type (list, string, null, ...) counts as corrupted.
            if not isinstance(data, dict):
                problem = f"expected a JSON object, got {type(data).__name__}"
        if not problem:
            out.append(data)
            continue
        out.append({
            "service": path.stem,
            "status": "parse_error",
            "detail": f"heartbeat file {path.name} unreadable: {problem}",
            "last_run": "",
            "last_success": "",
            "next_expected": "",
        })
    return out
182
+
183
+
184
def _load_thresholds(heartbeat_dir: Optional[str] = None) -> Dict[str, int]:
    """Merge defaults with the optional override at <dir>/_thresholds.json.

    The override must be a JSON object mapping service name to a number
    of seconds. Loading is best-effort: a missing, unreadable or
    malformed file leaves the defaults untouched, and individual entries
    that are not usable numbers (strings, booleans, NaN, Infinity) are
    skipped without discarding the rest.

    Args:
        heartbeat_dir: override the heartbeat directory (for tests).

    Returns:
        A new dict; DEFAULT_STALENESS_THRESHOLDS is never mutated.
    """
    thresholds = dict(DEFAULT_STALENESS_THRESHOLDS)
    override_path = _heartbeat_dir(heartbeat_dir) / "_thresholds.json"
    try:
        override = json.loads(override_path.read_text())
    except (OSError, ValueError):
        # No override file, or it is unreadable / not valid JSON.
        return thresholds
    if not isinstance(override, dict):
        return thresholds
    for name, value in override.items():
        # bool is a subclass of int; True/False are not meaningful here.
        if isinstance(value, bool) or not isinstance(value, (int, float)):
            continue
        try:
            thresholds[name] = int(value)
        except (ValueError, OverflowError):
            # json accepts NaN / Infinity; int() rejects them.
            continue
    return thresholds
197
+
198
+
199
def check_staleness(heartbeat_dir: Optional[str] = None) -> Dict[str, Any]:
    """Walk all heartbeats and classify each by staleness.

    Args:
        heartbeat_dir: override the heartbeat directory (for tests).

    Returns:
        {
          "checked_at": ISO8601 string,
          "summary": {"ok": N, "stale": N, "degraded": N, "failed": N,
                      "parse_error": N, "never_seen": N, "unknown_age": N},
          "services": [{service, status, last_run, last_success, age_seconds,
                        threshold_seconds, classification, detail}],
          "stale_services": [<names classified stale, failed, parse_error
                              or never_seen>],  # convenience for alerts
        }

    Classification rules (first match wins):
      - parse_error: heartbeat file unreadable
      - failed: status='failed' in the record
      - unknown_age: last_run missing or unparseable, or the computed age
        is negative (age_seconds is -1 when last_run cannot be parsed)
      - stale: last_run older than threshold
      - degraded: status='degraded' in the record
      - ok: anything else, i.e. last_run within threshold
      - never_seen: a service configured in thresholds that has no
        heartbeat record (surfaces "scheduled task never ran since
        heartbeat instrumentation landed")

    unknown_age, degraded and ok are counted in the summary but are not
    listed in stale_services.
    """
    now = time.time()
    records = read_heartbeats(heartbeat_dir)
    thresholds = _load_thresholds(heartbeat_dir)

    by_service: Dict[str, Dict[str, Any]] = {}
    for rec in records:
        if not isinstance(rec, dict):
            # Defensive: a record that is not a JSON object cannot be
            # inspected, so report it instead of crashing the whole check.
            rec = {
                "service": "?unknown?",
                "status": "parse_error",
                "detail": f"heartbeat record is not a JSON object: {type(rec).__name__}",
            }
        service = rec.get("service", "?unknown?")
        if not isinstance(service, str):
            # The name is used as a dict key below; a malformed file may
            # hold a list or object here, which would be unhashable.
            service = str(service)
        last_run_epoch = _parse_iso(rec.get("last_run", ""))
        threshold = thresholds.get(service, DEFAULT_FALLBACK_STALENESS)
        # -1 is the "age unknown" sentinel.
        age_seconds = int(now - last_run_epoch) if last_run_epoch is not None else -1

        status = rec.get("status")
        if status == "parse_error":
            classification = "parse_error"
        elif status == "failed":
            classification = "failed"
        elif age_seconds < 0:
            classification = "unknown_age"
        elif age_seconds > threshold:
            classification = "stale"
        elif status == "degraded":
            classification = "degraded"
        else:
            classification = "ok"

        by_service[service] = {
            "service": service,
            "status": rec.get("status", "?"),
            "last_run": rec.get("last_run", ""),
            "last_success": rec.get("last_success", ""),
            "age_seconds": age_seconds,
            "threshold_seconds": threshold,
            "classification": classification,
            "detail": rec.get("detail", ""),
        }

    # Add never_seen entries for configured services that have no record
    for service, threshold in thresholds.items():
        if service not in by_service:
            by_service[service] = {
                "service": service,
                "status": "never_seen",
                "last_run": "",
                "last_success": "",
                "age_seconds": -1,
                "threshold_seconds": threshold,
                "classification": "never_seen",
                "detail": "no heartbeat file found — service may not be instrumented yet",
            }

    services = list(by_service.values())
    summary = {"ok": 0, "stale": 0, "degraded": 0, "failed": 0, "parse_error": 0,
               "never_seen": 0, "unknown_age": 0}
    stale_services = []
    for svc in services:
        c = svc["classification"]
        summary[c] = summary.get(c, 0) + 1
        if c in ("stale", "failed", "parse_error", "never_seen"):
            stale_services.append(svc["service"])

    return {
        "checked_at": _now_iso(),
        "summary": summary,
        "services": services,
        "stale_services": stale_services,
    }