@cleocode/skills 2026.5.16 → 2026.5.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/skills/ct-council/SKILL.md +377 -0
- package/skills/ct-council/optimization/HARDENING-PLAYBOOK.md +107 -0
- package/skills/ct-council/optimization/README.md +74 -0
- package/skills/ct-council/optimization/scenarios.yaml +121 -0
- package/skills/ct-council/optimization/scripts/campaign.py +543 -0
- package/skills/ct-council/optimization/scripts/test_campaign.py +143 -0
- package/skills/ct-council/references/chairman.md +119 -0
- package/skills/ct-council/references/contrarian.md +70 -0
- package/skills/ct-council/references/evidence-pack.md +145 -0
- package/skills/ct-council/references/examples.md +235 -0
- package/skills/ct-council/references/executor.md +83 -0
- package/skills/ct-council/references/expansionist.md +68 -0
- package/skills/ct-council/references/first-principles.md +73 -0
- package/skills/ct-council/references/outsider.md +73 -0
- package/skills/ct-council/references/peer-review.md +125 -0
- package/skills/ct-council/scripts/analyze_runs.py +293 -0
- package/skills/ct-council/scripts/fixtures/executor_multi.md +198 -0
- package/skills/ct-council/scripts/fixtures/missing_advisor.md +117 -0
- package/skills/ct-council/scripts/fixtures/missing_convergence.md +190 -0
- package/skills/ct-council/scripts/fixtures/thin_evidence.md +193 -0
- package/skills/ct-council/scripts/fixtures/valid.md +226 -0
- package/skills/ct-council/scripts/fixtures/valid_with_llmtxt.md +226 -0
- package/skills/ct-council/scripts/llmtxt_ref.py +223 -0
- package/skills/ct-council/scripts/run_council.py +578 -0
- package/skills/ct-council/scripts/telemetry.py +624 -0
- package/skills/ct-council/scripts/test_telemetry.py +509 -0
- package/skills/ct-council/scripts/test_validate.py +452 -0
- package/skills/ct-council/scripts/validate.py +396 -0
- package/skills.json +19 -0
|
@@ -0,0 +1,509 @@
|
|
|
1
|
+
"""Tests for telemetry.py and analyze_runs.py."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import re
|
|
7
|
+
import sys
|
|
8
|
+
import unittest
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
|
|
11
|
+
SCRIPTS_DIR = Path(__file__).resolve().parent
|
|
12
|
+
FIXTURES = SCRIPTS_DIR / "fixtures"
|
|
13
|
+
|
|
14
|
+
sys.path.insert(0, str(SCRIPTS_DIR))
|
|
15
|
+
import telemetry # noqa: E402
|
|
16
|
+
import analyze_runs # noqa: E402
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def load_fixture(name: str) -> str:
    """Return the full text of the named fixture file under ``fixtures/``."""
    fixture_path = FIXTURES / name
    return fixture_path.read_text()
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class TestTelemetryExtractValidFixture(unittest.TestCase):
    """Telemetry on the canonical valid.md fixture."""

    @classmethod
    def setUpClass(cls):
        # Extract once for the whole class; the JSON round-trip mirrors how
        # records are persisted (JSONL) before analysis consumes them.
        cls.md = load_fixture("valid.md")
        cls.record = telemetry.extract_record(cls.md)
        cls.payload = json.loads(json.dumps(telemetry.asdict(cls.record), default=str))

    def test_schema_version(self):
        self.assertEqual(self.payload["schema_version"], telemetry.SCHEMA_VERSION)

    def test_validation_passes(self):
        self.assertTrue(self.payload["validation"]["valid"])
        self.assertEqual(self.payload["validation"]["structural_violations"], 0)

    def test_question_extracted(self):
        self.assertIn("retry-on-timeout", self.payload["question"])

    def test_evidence_pack_count(self):
        self.assertEqual(self.payload["evidence_pack"]["count"], 3)
        self.assertFalse(self.payload["evidence_pack"]["has_llmtxt"])

    def test_all_five_advisors_present(self):
        for advisor in telemetry.ADVISORS:
            self.assertIn(advisor, self.payload["advisors"])

    def test_each_advisor_has_four_gate_results(self):
        for advisor, body in self.payload["advisors"].items():
            with self.subTest(advisor=advisor):
                self.assertEqual(set(body["gates"].keys()), {"G1", "G2", "G3", "G4"})
                # Only the verdicts are checked, so iterate values directly
                # (the previous version bound an unused gate key via .items()).
                for verdict in body["gates"].values():
                    self.assertIn(verdict, ("PASS", "FAIL", "MISSING"))

    def test_advisors_get_full_weight_when_4_of_4(self):
        # Advisor names are not needed here; iterate the bodies only.
        for body in self.payload["advisors"].values():
            self.assertEqual(body["gate_pass_count"], 4)
            self.assertEqual(body["weight"], "full")

    def test_each_advisor_has_a_reviewer(self):
        # Per the fixed rotation, every advisor is the reviewee of exactly one peer.
        for body in self.payload["advisors"].values():
            self.assertIsNotNone(body["reviewer"])

    def test_sharpest_points_extracted(self):
        for body in self.payload["advisors"].values():
            self.assertIsNotNone(body["sharpest"])
            self.assertGreater(len(body["sharpest"]), 0)

    def test_peer_reviews_match_rotation(self):
        prs = self.payload["peer_reviews"]
        pairs = [(p["reviewer"], p["reviewee"]) for p in prs]
        self.assertEqual(pairs, list(telemetry.PEER_REVIEW_ROTATION))

    def test_all_peer_reviews_have_disposition(self):
        for pr in self.payload["peer_reviews"]:
            self.assertIn(pr["disposition"], ("Accept", "Modify", "Reject"))

    def test_convergence_flag_cleared(self):
        # The fixture explicitly says "no convergence flag".
        self.assertEqual(self.payload["convergence"]["flag"], False)

    def test_chairman_confidence_high(self):
        self.assertEqual(self.payload["chairman"]["confidence"], "high")

    def test_chairman_recommendation_and_action_present(self):
        self.assertTrue(self.payload["chairman"]["recommendation_present"])
        self.assertTrue(self.payload["chairman"]["next_action_present"])
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
class TestTelemetryExtractWithLlmtxt(unittest.TestCase):
    """Evidence packs that cite llm.txt sources must set the has_llmtxt flag."""

    def test_llmtxt_flag_set_when_evidence_pack_uses_it(self):
        record = telemetry.extract_record(load_fixture("valid_with_llmtxt.md"))
        serialized = json.dumps(telemetry.asdict(record), default=str)
        payload = json.loads(serialized)
        self.assertTrue(payload["evidence_pack"]["has_llmtxt"])
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
class TestTelemetryHandlesInvalid(unittest.TestCase):
    """Structurally broken fixtures must be marked invalid, not raise."""

    @staticmethod
    def _extract(fixture_name):
        # Small shortcut: load a fixture and run it through the extractor.
        return telemetry.extract_record(load_fixture(fixture_name))

    def test_invalid_record_marks_validation_false(self):
        record = self._extract("missing_advisor.md")
        self.assertFalse(record.validation["valid"])
        self.assertGreater(record.validation["structural_violations"], 0)

    def test_thin_evidence_pack_flagged(self):
        record = self._extract("thin_evidence.md")
        self.assertFalse(record.validation["valid"])
        self.assertLess(record.evidence_pack["count"], 3)
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
class TestSyntheticGateFails(unittest.TestCase):
    """Synthesize an output with a deliberate gate FAIL on a target peer review."""

    def _replace_gate_in_review(self, md: str, reviewer: str, reviewee: str, gate: str, new_verdict: str) -> str:
        """Replace `- <gate>: PASS|FAIL` inside the section `### <reviewer> reviewing <reviewee>`."""
        marker = f"### {reviewer} reviewing {reviewee}"
        section_start = md.index(marker)
        line_marker = f"- {gate}: PASS"
        # The first matching gate line after the section header is the target.
        pos = md.find(line_marker, section_start)
        assert pos != -1, f"Did not find {line_marker!r} after {marker!r}"
        head = md[:pos]
        tail = md[pos + len(line_marker):]
        return head + f"- {gate}: {new_verdict}" + tail

    def test_synthetic_fail_propagates_to_advisor_record(self):
        edited_md = self._replace_gate_in_review(
            load_fixture("valid.md"),
            reviewer="Outsider",
            reviewee="Executor",
            gate="G1 Rigor",
            new_verdict="FAIL",
        )
        record = telemetry.extract_record(edited_md)
        # Outsider reviews Executor → Executor's G1 should now read FAIL.
        executor = record.advisors["Executor"]
        self.assertEqual(executor["gates"]["G1"], "FAIL")
        self.assertEqual(executor["gate_pass_count"], 3)
        self.assertEqual(executor["weight"], "high")
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
class TestAnalyzeRuns(unittest.TestCase):
    """Aggregation over a small synthetic run log."""

    def setUp(self):
        # Build a small synthetic JSONL with 3 runs: 1 clean, 1 gate-fail, 1 convergence-flagged.
        source_md = load_fixture("valid.md")
        clean_record = telemetry.extract_record(source_md)

        # Run 2: synthesize a FAIL on Executor's G1 by editing `Outsider reviewing Executor`.
        section_start = source_md.index("### Outsider reviewing Executor")
        pass_line = "- G1 Rigor: PASS"
        pos = source_md.find(pass_line, section_start)
        fail_md = source_md[:pos] + "- G1 Rigor: FAIL" + source_md[pos + len(pass_line):]
        gate_fail_record = telemetry.extract_record(fail_md, tokens=42000, wall_clock=70.0)

        # Run 3: synthesize convergence flag raised.
        conv_md = source_md.replace(
            "Distinct subjects: retry storms, idempotency, breaker wiring, ADR-precondition gap, test-first action. No convergence flag raised. Proceeding to Phase 3.",
            "All five sharpest points reduce to retry-storm risk. Convergence flag raised. Reran Contrarian.",
        )
        conv_record = telemetry.extract_record(conv_md, tokens=58000, wall_clock=110.0)

        self.records = [clean_record, gate_fail_record, conv_record]
        self.runs = [json.loads(json.dumps(telemetry.asdict(r), default=str)) for r in self.records]

    def test_gate_hotspots_finds_the_fail(self):
        rows = analyze_runs.gate_hotspots(self.runs)
        # The Executor G1 fail in run 2 should sit on top of the hotspot list.
        hits = [row for row in rows if row["advisor"] == "Executor" and row["gate"] == "G1"]
        self.assertEqual(len(hits), 1)
        self.assertEqual(hits[0]["fail"], 1)
        self.assertGreater(hits[0]["fail_rate"], 0)

    def test_disposition_distribution_counts_correctly(self):
        dispositions = analyze_runs.disposition_distribution(self.runs)
        # 3 runs × 5 peer reviews = 15 total. All Accept in the source fixture.
        self.assertEqual(dispositions["overall"].get("Accept"), 15)

    def test_convergence_flag_detected(self):
        rates = analyze_runs.convergence_rate(self.runs)
        self.assertEqual(rates["raised"], 1)
        self.assertEqual(rates["cleared"], 2)

    def test_confidence_distribution(self):
        distribution = analyze_runs.confidence_distribution(self.runs)
        self.assertEqual(distribution["counts"].get("high"), 3)

    def test_cost_distribution_handles_partial_metrics(self):
        cost = analyze_runs.cost_distribution(self.runs)
        # Only runs 2 and 3 have tokens stamped.
        self.assertEqual(cost["tokens"]["n"], 2)

    def test_exit_criteria_token_spread_outside_20pct(self):
        criteria = analyze_runs.build_report(self.runs)["exit_criteria"]
        # 42000 vs 58000 → spread > 20%.
        self.assertFalse(criteria["token_spread_within_20pct"])

    def test_exit_criteria_advisor_min_average(self):
        criteria = analyze_runs.build_report(self.runs)["exit_criteria"]
        # Executor avg = (4 + 3 + 4) / 3 = 3.67.
        self.assertGreaterEqual(criteria["advisor_gate_avg"]["Executor"], 3.0)
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
class TestPhase25Extractor(unittest.TestCase):
    """Tests for the Phase 2.5 structured extractor (T-shakedown-1 verdict)."""

    def setUp(self):
        # One import per line (PEP 8 E401); the combined form was a lint smell.
        import tempfile
        import shutil
        self.tmpdir = Path(tempfile.mkdtemp(prefix="council-25-"))
        self.addCleanup(shutil.rmtree, self.tmpdir, ignore_errors=True)
        # Minimal run.json so run_id is preserved.
        (self.tmpdir / "run.json").write_text(json.dumps({"run_id": "abcd1234"}))

    def _write_phase1(self, slug: str, sharpest: str) -> None:
        """Write a minimal but structurally valid phase1-<slug>.md advisor file."""
        body = (
            f"### Advisor: {slug.replace('-', ' ').title()}\n\n"
            "**Frame:** ...\n\n"
            "**Evidence anchored:**\n"
            "- foo — bar\n"
            "- baz — qux\n\n"
            "**Verdict from this lens:** ...\n\n"
            f"**Single sharpest point:** {sharpest}\n"
        )
        (self.tmpdir / f"phase1-{slug}.md").write_text(body)

    def test_extract_5_distinct_points_no_clique(self):
        # 5 distinct topics → no pairwise overlap, no clique.
        self._write_phase1("contrarian", "Retry storms cascade under upstream latency spikes.")
        self._write_phase1("first-principles", "Idempotency classification is the missing atomic truth.")
        self._write_phase1("expansionist", "Wire the dormant circuit breaker for system-wide resilience.")
        self._write_phase1("outsider", "ADR-021 says the precondition has been met for years.")
        self._write_phase1("executor", "Write a failing test that pins the GET-vs-POST retry contract.")
        verdict = telemetry.extract_phase_2_5(self.tmpdir)
        self.assertEqual(verdict["flag_mechanical"], False)
        self.assertEqual(verdict["pairwise_same"], [])
        self.assertEqual(verdict["missing_advisors"], [])
        self.assertEqual(verdict["run_id"], "abcd1234")

    def test_extract_3_clique_raises_flag(self):
        # 3 of 5 sentences are near-identical; all pairwise Jaccard ≥ 0.6 → clique → flag=True.
        self._write_phase1("contrarian", "Wire the dormant circuit breaker before retries land.")
        self._write_phase1("first-principles", "Wire the dormant circuit breaker before retries land in production.")
        self._write_phase1("expansionist", "Wire the dormant circuit breaker before retries land for safety.")
        self._write_phase1("outsider", "Completely unrelated observation about ADR drift over time.")
        self._write_phase1("executor", "Run a failing test next inside packages core.")
        verdict = telemetry.extract_phase_2_5(self.tmpdir)
        self.assertTrue(verdict["flag_mechanical"], f"Expected flag=True, got verdict: {verdict}")
        self.assertGreaterEqual(len(verdict["pairwise_same"]), 3)

    def test_anchor_distinguishes_inline_marker_from_structural_marker(self):
        """Regression test: the marker text inside a paragraph must NOT match.

        Specifically guards against the bug surfaced in shakedown #1, where the
        Executor's action body referenced `**Single sharpest point:**` as a
        parse target and the un-anchored regex matched the inline mention.
        """
        body = (
            "### Advisor: Executor\n\n"
            "**Frame:** ...\n\n"
            "**Evidence anchored:**\n"
            "- foo — bar\n"
            "- baz — qux\n\n"
            "**The action (one):** Add a parser that reads each `**Single sharpest point:**` line "
            "from per-advisor files and emits structured JSON for this run.\n\n"
            "**Expected outcome:** ...\n\n"
            "**Single sharpest point:** Ship the structured-output extractor.\n"
        )
        (self.tmpdir / "phase1-executor.md").write_text(body)
        # Other advisors absent — only Executor's sharpest is being tested.
        sharpest = telemetry._read_sharpest(self.tmpdir, "Executor")
        self.assertIsNotNone(sharpest)
        self.assertIn("Ship the structured-output extractor", sharpest)
        self.assertNotIn("Add a parser", sharpest)

    def test_missing_advisor_file_recorded(self):
        self._write_phase1("contrarian", "X.")
        self._write_phase1("first-principles", "Y.")
        # Three other phase1 files absent.
        verdict = telemetry.extract_phase_2_5(self.tmpdir)
        self.assertEqual(set(verdict["missing_advisors"]), {"Expansionist", "Outsider", "Executor"})

    def test_jaccard_threshold_respected(self):
        # Near-identical sentences sit above 0.6; unrelated topics sit below it.
        a = telemetry._tokenize("retry storms cascade under upstream latency spikes")
        b = telemetry._tokenize("retry storms cascade when upstream latency spikes happen")
        self.assertGreater(telemetry._jaccard(a, b), 0.6)
        c = telemetry._tokenize("idempotency classification atomic truth")
        d = telemetry._tokenize("circuit breaker resilience platform")
        self.assertLess(telemetry._jaccard(c, d), 0.6)

    def test_3_clique_detection_logic(self):
        # Pairs covering all 3 edges of vertices {0,1,2} → clique.
        self.assertTrue(telemetry._has_3_clique([[0, 1], [1, 2], [0, 2]], n=5))
        # Star pattern around vertex 0 → no triangle.
        self.assertFalse(telemetry._has_3_clique([[0, 1], [0, 2], [0, 3]], n=5))
        # Empty.
        self.assertFalse(telemetry._has_3_clique([], n=5))

    def test_extracts_real_shakedown_1_correctly(self):
        """Live regression: extract from the actual shakedown #1 run dir."""
        run_dir = SCRIPTS_DIR.parent / ".runs" / "20260425T023423Z-0f82cea9"
        if not run_dir.exists():
            self.skipTest("shakedown #1 run dir not present")
        verdict = telemetry.extract_phase_2_5(run_dir)
        self.assertEqual(verdict["flag_mechanical"], False)
        self.assertEqual(verdict["missing_advisors"], [])
        # Each sharpest point should be the full final sentence, not a truncated
        # fragment from the action body.
        executor_sentence = next(
            sp["sentence"] for sp in verdict["sharpest_points"] if sp["advisor"] == "Executor"
        )
        self.assertIn("--phase 2.5-extract", executor_sentence)
        self.assertTrue(executor_sentence.startswith("Within the hour"))
|
|
317
|
+
|
|
318
|
+
|
|
319
|
+
class TestVerdictAndTldrRendering(unittest.TestCase):
    """The Council emits THREE artifacts: verdict.md (the deliverable),
    tldr.md (for PR comments), output.md (audit trail). Tests that the lean
    deliverables extract correctly from a validated full output."""

    def setUp(self):
        self.md = load_fixture("valid.md")

    def test_verdict_has_question_header(self):
        v = telemetry.render_verdict(self.md)
        self.assertTrue(v.startswith("# Council Verdict — "))
        self.assertIn("retry-on-timeout", v)

    def test_verdict_contains_chairman_section(self):
        v = telemetry.render_verdict(self.md)
        self.assertIn("## Phase 3 — Chairman's verdict", v)
        self.assertIn("### Gate summary", v)
        self.assertIn("### Recommendation", v)
        self.assertIn("### Next 60-minute action", v)
        self.assertIn("### Confidence", v)

    def test_verdict_omits_upstream_phases(self):
        v = telemetry.render_verdict(self.md)
        # Should NOT contain the per-advisor sections or peer reviews.
        self.assertNotIn("### Advisor: Contrarian", v)
        self.assertNotIn("Contrarian reviewing First Principles", v)
        self.assertNotIn("## Phase 1 — Advisor analyses", v)
        self.assertNotIn("## Phase 2 — Shuffled peer reviews", v)

    def test_verdict_is_significantly_shorter_than_output(self):
        v_lines = telemetry.render_verdict(self.md).count("\n")
        full_lines = self.md.count("\n")
        # Verdict should be <40% of full output's line count.
        self.assertLess(v_lines, full_lines * 0.4,
                        f"Verdict {v_lines} lines vs full {full_lines} — should be much leaner")

    def test_tldr_has_question_header(self):
        t = telemetry.render_tldr(self.md)
        self.assertTrue(t.startswith("# Council TL;DR — "))

    def test_tldr_contains_load_bearing_fields(self):
        t = telemetry.render_tldr(self.md)
        self.assertIn("**Recommendation**", t)
        self.assertIn("**Next 60-minute action**", t)
        self.assertIn("**Confidence**", t)
        self.assertIn("Conditions:", t)

    def test_tldr_confidence_is_just_the_level(self):
        t = telemetry.render_tldr(self.md)
        # Should NOT contain the full confidence justification (e.g., "four independent frames converged").
        # Accept em-dash, en-dash, or hyphen as separator.
        m = re.search(r"\*\*Confidence\*\*\s*[—–\-]\s*(.+?)\n", t)
        self.assertIsNotNone(m, f"TLDR did not match Confidence pattern. Got:\n{t}")
        confidence_field = m.group(1).strip()
        # Must be short — just the level.
        self.assertLess(len(confidence_field), 30,
                        f"Confidence field too long: {confidence_field!r}")
        self.assertIn(confidence_field.lower(), ("low", "medium", "high", "medium-high", "medium-low"))

    def test_tldr_is_under_15_lines(self):
        t = telemetry.render_tldr(self.md)
        # Strictly bounded — TL;DR must fit in a chat message / PR comment.
        self.assertLess(t.count("\n"), 16, f"TL;DR too long:\n{t}")

    def test_tldr_points_to_full_artifacts(self):
        t = telemetry.render_tldr(self.md)
        self.assertIn("verdict.md", t)
        self.assertIn("output.md", t)

    def test_render_verdict_raises_on_missing_phase3(self):
        # Use a synthetic output with no Phase 3 section at all. (A previous
        # version also loaded missing_advisor.md into an unused variable —
        # that fixture may still contain a Phase 3, so it proved nothing.)
        no_phase3 = "# The Council — test\n\n## Evidence pack\n\n1. `foo` — bar\n"
        with self.assertRaises(ValueError):
            telemetry.render_verdict(no_phase3)
|
|
394
|
+
|
|
395
|
+
|
|
396
|
+
class TestAnalyzeRunsEdgeCases(unittest.TestCase):
    """Degenerate inputs to the report builder and renderer."""

    def test_empty_log_produces_empty_report(self):
        report = analyze_runs.build_report([])
        self.assertEqual(report["n_runs"], 0)
        self.assertEqual(report["gate_hotspots"], [])

    def test_render_report_is_non_empty(self):
        record = telemetry.extract_record(load_fixture("valid.md"))
        runs = [json.loads(json.dumps(telemetry.asdict(record), default=str))]
        rendered = analyze_runs.render_report(analyze_runs.build_report(runs))
        self.assertIn("Council telemetry", rendered)
        self.assertIn("Exit-criteria scorecard", rendered)
|
|
409
|
+
|
|
410
|
+
|
|
411
|
+
class TestRunIndex(unittest.TestCase):
    """Tests for INDEX.jsonl auto-generation in run_council.py."""

    def setUp(self):
        # run_council lives next to telemetry.py. Guard the path insertion so
        # repeated setUp calls do not keep prepending duplicate entries to
        # sys.path (the old unconditional insert grew sys.path once per test).
        if str(SCRIPTS_DIR) not in sys.path:
            sys.path.insert(0, str(SCRIPTS_DIR))
        import run_council  # noqa: E402
        self.run_council = run_council

        # One import per line (PEP 8 E401).
        import tempfile
        import shutil
        self.tmp_root = Path(tempfile.mkdtemp(prefix="council-index-"))
        self.runs_dir = self.tmp_root / ".cleo" / "council-runs"
        self.runs_dir.mkdir(parents=True)
        self.addCleanup(shutil.rmtree, self.tmp_root, ignore_errors=True)

    def test_auto_title_strips_should_prefix(self):
        t = self.run_council._auto_title("Should we adopt X for the new schema?")
        self.assertNotIn("Should we", t)
        self.assertTrue(t.startswith("Adopt") or t.startswith("adopt"),
                        f"Title should start with derived verb, got: {t!r}")

    def test_auto_title_truncates_with_ellipsis(self):
        long_q = "Should we " + "adopt the new architectural pattern that requires significant rework " * 3
        t = self.run_council._auto_title(long_q, max_len=60)
        self.assertLessEqual(len(t), 60)
        self.assertTrue(t.endswith("…"))

    def test_auto_title_handles_short_question(self):
        t = self.run_council._auto_title("Should we ship X?")
        self.assertEqual(t, "Ship X")

    def test_auto_title_normalizes_whitespace(self):
        t = self.run_council._auto_title("Should we\n\tadopt X?")
        self.assertNotIn("  ", t)
        self.assertNotIn("\n", t)
        self.assertNotIn("\t", t)

    def test_upsert_index_creates_new_entry(self):
        result = self.run_council._upsert_index(self.runs_dir, "abc123", {
            "title": "Test", "status": "initialized",
        })
        self.assertEqual(result["run_id"], "abc123")
        self.assertEqual(result["status"], "initialized")
        # File must exist now.
        self.assertTrue((self.runs_dir / "INDEX.jsonl").exists())

    def test_upsert_index_updates_existing_entry_no_duplicates(self):
        self.run_council._upsert_index(self.runs_dir, "abc123", {
            "title": "Test", "status": "initialized",
        })
        self.run_council._upsert_index(self.runs_dir, "abc123", {
            "status": "ingested", "verdict_recommendation": "Ship it.",
        })
        entries = self.run_council._read_index(self.runs_dir)
        # Single entry, with merged fields.
        self.assertEqual(len(entries), 1)
        self.assertEqual(entries[0]["status"], "ingested")
        self.assertEqual(entries[0]["title"], "Test")
        self.assertEqual(entries[0]["verdict_recommendation"], "Ship it.")

    def test_upsert_index_preserves_other_entries(self):
        self.run_council._upsert_index(self.runs_dir, "aaa", {"title": "A"})
        self.run_council._upsert_index(self.runs_dir, "bbb", {"title": "B"})
        self.run_council._upsert_index(self.runs_dir, "aaa", {"status": "ingested"})
        entries = self.run_council._read_index(self.runs_dir)
        self.assertEqual(len(entries), 2)
        ids = {e["run_id"] for e in entries}
        self.assertEqual(ids, {"aaa", "bbb"})

    def test_extract_recommendation_snippet_pulls_first_sentence(self):
        verdict_md = (
            "# Council Verdict — Should we ship X?\n\n"
            "## Phase 3 — Chairman's verdict\n\n"
            "### Gate summary\n\n| ... | ... |\n\n"
            "### Recommendation\n\n"
            "**Reject the binary.** This is a longer second sentence with more detail. "
            "And a third sentence even longer than the second.\n\n"
            "### Why this, not the alternatives\n\n..."
        )
        snippet = self.run_council._extract_recommendation_snippet(verdict_md)
        self.assertIsNotNone(snippet)
        self.assertIn("Reject the binary", snippet)
        # Should be just the first sentence — not the longer follow-ups.
        self.assertNotIn("And a third sentence", snippet)

    def test_extract_recommendation_snippet_returns_none_when_missing(self):
        verdict_md = "# Council Verdict — empty test\n\n## Phase 3 — Chairman's verdict\n\nNo recommendation here.\n"
        self.assertIsNone(self.run_council._extract_recommendation_snippet(verdict_md))

    def test_extract_recommendation_snippet_truncates_long_first_sentence(self):
        body = "x" * 500 + "."
        verdict_md = f"### Recommendation\n\n{body}\n\n### Next 60-minute action\n"
        snippet = self.run_council._extract_recommendation_snippet(verdict_md, max_len=200)
        self.assertLessEqual(len(snippet), 200)
        self.assertTrue(snippet.endswith("…"))
|
|
506
|
+
|
|
507
|
+
|
|
508
|
+
# Allow running this suite directly: `python test_telemetry.py`.
if __name__ == "__main__":
    unittest.main()
|