@rm0nroe/coach-claw 1.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +311 -0
- package/coach/README.md +99 -0
- package/coach/bin/aggregate_facets.py +274 -0
- package/coach/bin/analyze.py +678 -0
- package/coach/bin/bank.py +247 -0
- package/coach/bin/banner_themes.py +645 -0
- package/coach/bin/coach_paths.py +33 -0
- package/coach/bin/coexistence_check.py +129 -0
- package/coach/bin/configure.py +245 -0
- package/coach/bin/cron_check.py +81 -0
- package/coach/bin/default_statusline.py +135 -0
- package/coach/bin/doctor.py +663 -0
- package/coach/bin/insights-llm.sh +264 -0
- package/coach/bin/insights.sh +163 -0
- package/coach/bin/insights_window.py +111 -0
- package/coach/bin/marker_io.py +154 -0
- package/coach/bin/merge.py +671 -0
- package/coach/bin/redact.py +86 -0
- package/coach/bin/render_env.py +148 -0
- package/coach/bin/reward_hints.py +87 -0
- package/coach/bin/run-insights.sh +20 -0
- package/coach/bin/run_with_lock.py +85 -0
- package/coach/bin/scoring.py +260 -0
- package/coach/bin/skill_inventory.py +215 -0
- package/coach/bin/stats.py +459 -0
- package/coach/bin/status.py +293 -0
- package/coach/bin/statusline_self_patch.py +205 -0
- package/coach/bin/statusline_variants.py +146 -0
- package/coach/bin/statusline_wrap.py +244 -0
- package/coach/bin/statusline_wrap_action.py +460 -0
- package/coach/bin/switch_to_plugin.py +256 -0
- package/coach/bin/themes.py +256 -0
- package/coach/bin/user_config.py +176 -0
- package/coach/bin/xp_accounting.py +98 -0
- package/coach/changelog.md +4 -0
- package/coach/default-statusline-command.sh +19 -0
- package/coach/default-statusline-wrap-command.sh +15 -0
- package/coach/profile.yaml +37 -0
- package/coach/tests/conftest.py +13 -0
- package/coach/tests/test_aggregate_facets.py +379 -0
- package/coach/tests/test_analyze_aggregate.py +153 -0
- package/coach/tests/test_analyze_redaction.py +105 -0
- package/coach/tests/test_analyze_strengths.py +165 -0
- package/coach/tests/test_bank_atomic_write.py +61 -0
- package/coach/tests/test_bank_concurrency.py +126 -0
- package/coach/tests/test_banner_themes.py +981 -0
- package/coach/tests/test_celebrate_dedup.py +409 -0
- package/coach/tests/test_coach_paths.py +50 -0
- package/coach/tests/test_coexistence_check.py +128 -0
- package/coach/tests/test_configure.py +258 -0
- package/coach/tests/test_cron_check.py +118 -0
- package/coach/tests/test_cron_nudge_hook.py +134 -0
- package/coach/tests/test_detection_parity.py +105 -0
- package/coach/tests/test_doctor.py +595 -0
- package/coach/tests/test_hook_bespoke_dispatch.py +288 -0
- package/coach/tests/test_hook_module_resolution.py +116 -0
- package/coach/tests/test_hook_relevance.py +996 -0
- package/coach/tests/test_hook_render_env.py +364 -0
- package/coach/tests/test_hook_session_id_guard.py +160 -0
- package/coach/tests/test_insights_llm.py +759 -0
- package/coach/tests/test_insights_llm_venv_path.py +109 -0
- package/coach/tests/test_insights_window.py +237 -0
- package/coach/tests/test_install.py +1150 -0
- package/coach/tests/test_install_pyyaml_fallback.py +142 -0
- package/coach/tests/test_marker_consumption.py +167 -0
- package/coach/tests/test_marker_writer_locking.py +305 -0
- package/coach/tests/test_merge.py +413 -0
- package/coach/tests/test_no_broken_mktemp.py +90 -0
- package/coach/tests/test_render_env.py +137 -0
- package/coach/tests/test_render_env_glyphs.py +119 -0
- package/coach/tests/test_reward_hints.py +59 -0
- package/coach/tests/test_scoring.py +147 -0
- package/coach/tests/test_session_start_weekly_trigger.py +92 -0
- package/coach/tests/test_skill_inventory.py +368 -0
- package/coach/tests/test_stats_hybrid.py +142 -0
- package/coach/tests/test_status_accounting.py +41 -0
- package/coach/tests/test_statusline_failsafe.py +70 -0
- package/coach/tests/test_statusline_self_patch.py +261 -0
- package/coach/tests/test_statusline_variants.py +110 -0
- package/coach/tests/test_statusline_wrap.py +196 -0
- package/coach/tests/test_statusline_wrap_action.py +408 -0
- package/coach/tests/test_switch_to_plugin.py +360 -0
- package/coach/tests/test_themes.py +104 -0
- package/coach/tests/test_user_config.py +160 -0
- package/coach/tests/test_wrap_announce_hook.py +130 -0
- package/coach/tests/test_xp_accounting.py +55 -0
- package/hooks/coach-session-start.py +536 -0
- package/hooks/coach-user-prompt.py +2288 -0
- package/install-launchd.sh +102 -0
- package/install.sh +597 -0
- package/launchd/com.local.claude-coach.plist.template +34 -0
- package/launchd/run-insights.sh +20 -0
- package/npm/coach-claw.js +259 -0
- package/package.json +52 -0
- package/requirements.txt +11 -0
- package/settings-snippet.json +31 -0
- package/skills/coach/SKILL.md +107 -0
- package/skills/coach-insights/SKILL.md +78 -0
- package/skills/config/SKILL.md +149 -0
|
@@ -0,0 +1,379 @@
|
|
|
1
|
+
"""Unit tests for coach/bin/aggregate_facets.py.
|
|
2
|
+
|
|
3
|
+
Mock facets/*.json sidecar fixtures, assert threshold-based emit shape
|
|
4
|
+
matches what merge.py expects on its --detections input.
|
|
5
|
+
"""
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import json
|
|
9
|
+
import subprocess
|
|
10
|
+
import sys
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
|
|
13
|
+
import aggregate_facets
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _write_facet(dir_: Path, name: str, payload: dict) -> Path:
    """Serialize *payload* as JSON into ``<dir_>/<name>.json`` and return the path."""
    target = dir_ / (name + ".json")
    target.write_text(json.dumps(payload))
    return target
|
21
|
+
|
|
22
|
+
def _make_session(
    *,
    friction: dict | None = None,
    primary_success: str | None = None,
    friction_detail: str = "",
    brief_summary: str = "",
    session_id: str = "test-session",
) -> dict:
    """Build a minimal facet payload; optional keys appear only when supplied."""
    session: dict = {"session_id": session_id}
    # None means "omit the key entirely" for these two, mirroring real facets.
    for key, value in (
        ("friction_counts", friction),
        ("primary_success", primary_success),
    ):
        if value is not None:
            session[key] = value
    # Empty strings are likewise omitted rather than stored.
    if friction_detail:
        session["friction_detail"] = friction_detail
    if brief_summary:
        session["brief_summary"] = brief_summary
    return session
|
+
|
|
41
|
+
|
|
42
|
+
def test_friction_counts_emits_negative_detection(tmp_path: Path) -> None:
    """A friction key present in 4 of 5 sessions clears the bar → negative emit."""
    for idx in range(4):
        payload = _make_session(
            friction={"misunderstood_request": 2},
            friction_detail=f"session {idx} got off-track on the first attempt",
            session_id=f"s{idx}",
        )
        _write_facet(tmp_path, f"s{idx}", payload)
    # One clean session so the ratio denominator is 5.
    _write_facet(tmp_path, "s4", _make_session(session_id="s4"))

    detections = aggregate_facets.aggregate(tmp_path, window_days=7, cap=8)
    assert "misunderstood-request" in [d["id"] for d in detections]
    hit = next(d for d in detections if d["id"] == "misunderstood-request")
    assert hit["direction"] == "negative"
    assert hit["ratio"] == 0.8
    assert hit["n_sessions"] == 5
|
+
|
|
64
|
+
|
|
65
|
+
def test_primary_success_emits_positive_detection(tmp_path: Path) -> None:
    """primary_success=good_debugging in 6 of 10 sessions → positive emit."""
    for idx in range(10):
        if idx < 6:
            payload = _make_session(
                primary_success="good_debugging",
                brief_summary=f"session {idx}: drove the bug to root cause",
                session_id=f"s{idx}",
            )
        else:
            payload = _make_session(session_id=f"s{idx}")
        _write_facet(tmp_path, f"s{idx}", payload)

    detections = aggregate_facets.aggregate(tmp_path, window_days=7, cap=8)
    assert "good-debugging" in [d["id"] for d in detections]
    hit = next(d for d in detections if d["id"] == "good-debugging")
    assert hit["direction"] == "positive"
    assert hit["ratio"] == 0.6
|
+
|
|
87
|
+
|
|
88
|
+
def test_below_threshold_drops_detection(tmp_path: Path) -> None:
    """Friction in only 2 of 10 sessions stays under the 25% floor → no emit."""
    for idx in range(10):
        session = (
            _make_session(friction={"buggy_code": 1}, session_id=f"s{idx}")
            if idx < 2
            else _make_session(session_id=f"s{idx}")
        )
        _write_facet(tmp_path, f"s{idx}", session)

    detections = aggregate_facets.aggregate(tmp_path, window_days=7, cap=8)
    assert "buggy-code" not in [d["id"] for d in detections]
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def test_strength_threshold_higher_than_negative(tmp_path: Path) -> None:
    """Positives need ≥60%: exactly 5 of 10 primary_success stays silent."""
    for idx in range(10):
        if idx < 5:
            session = _make_session(
                primary_success="multi_file_changes", session_id=f"s{idx}"
            )
        else:
            session = _make_session(session_id=f"s{idx}")
        _write_facet(tmp_path, f"s{idx}", session)

    detections = aggregate_facets.aggregate(tmp_path, window_days=7, cap=8)
    assert "multi-file-changes" not in [d["id"] for d in detections]
+
|
|
117
|
+
|
|
118
|
+
def test_strength_at_threshold_emits(tmp_path: Path) -> None:
    """The 60% strength bar is inclusive: 6 of 10 → emits."""
    for idx in range(10):
        if idx < 6:
            session = _make_session(
                primary_success="multi_file_changes", session_id=f"s{idx}"
            )
        else:
            session = _make_session(session_id=f"s{idx}")
        _write_facet(tmp_path, f"s{idx}", session)

    detections = aggregate_facets.aggregate(tmp_path, window_days=7, cap=8)
    assert "multi-file-changes" in [d["id"] for d in detections]
131
|
+
|
|
132
|
+
|
|
133
|
+
def test_id_kebab_normalization(tmp_path: Path) -> None:
    """Underscore friction keys surface as kebab-case detection ids."""
    for idx in range(3):
        _write_facet(
            tmp_path,
            f"s{idx}",
            _make_session(
                friction={"misunderstood_request": 1, "wrong_approach": 1},
                session_id=f"s{idx}",
            ),
        )
    _write_facet(tmp_path, "s3", _make_session(session_id="s3"))

    detections = aggregate_facets.aggregate(tmp_path, window_days=7, cap=8)
    emitted = {d["id"] for d in detections}
    # Both keys hit 3/4 = 75%, comfortably over the 25% bar.
    assert "misunderstood-request" in emitted
    assert "wrong-approach" in emitted
    # No id may leak the underscore form.
    for det in detections:
        assert "_" not in det["id"]
154
|
+
|
|
155
|
+
|
|
156
|
+
def test_examples_capped_and_redacted(tmp_path: Path) -> None:
    """5 friction_detail strings → capped at 3, each ≤120 chars; file paths
    redacted."""
    # Five details, each carrying a different flavor of sensitive token:
    # absolute paths under /Users and /tmp, plus bare filenames with
    # recognizable source extensions (.py/.go/.ts/.md).
    raw_examples = [
        "Edited /Users/foo/project/src/main.py and broke the build for an hour",
        "The settings.py change cascaded into a migration regression",
        "Wrong approach in /tmp/bar/src/handler.go before we caught it on PR",
        "Went down the wrong rabbit hole on test_runner.ts for 30 minutes",
        "Misread the spec — the README.md said the opposite of what I assumed",
    ]
    # Every session reports the same friction key, so all five details feed
    # the same detection's example pool.
    for i, detail in enumerate(raw_examples):
        _write_facet(
            tmp_path,
            f"s{i}",
            _make_session(
                friction={"misunderstood_request": 1},
                friction_detail=detail,
                session_id=f"s{i}",
            ),
        )

    dets = aggregate_facets.aggregate(tmp_path, window_days=7, cap=8)
    det = next(d for d in dets if d["id"] == "misunderstood-request")
    # Example pool is hard-capped at 3 regardless of how many were collected.
    assert len(det["examples"]) == 3
    for ex in det["examples"]:
        # Each example is length-clamped for downstream prompt budgets.
        assert len(ex) <= 120
        # File-path tokens redacted.
        assert "/Users/foo/" not in ex
        assert "/tmp/bar/" not in ex
        # File-extension tokens redacted.
        assert "settings.py" not in ex
        assert "handler.go" not in ex
        assert "test_runner.ts" not in ex
        assert "README.md" not in ex
+
|
|
191
|
+
|
|
192
|
+
def test_window_filtering(tmp_path: Path) -> None:
    """Facets whose mtime falls outside the window must be ignored."""
    import os
    import time as _time

    # One stale facet, backdated 14 days via utime so it misses a 7-day window.
    stale_path = _write_facet(
        tmp_path,
        "stale",
        _make_session(friction={"misunderstood_request": 5}, session_id="stale"),
    )
    two_weeks_ago = _time.time() - 14 * 86400
    os.utime(stale_path, (two_weeks_ago, two_weeks_ago))

    # Three fresh facets written just now, all carrying buggy_code.
    for idx in range(3):
        _write_facet(
            tmp_path,
            f"fresh{idx}",
            _make_session(friction={"buggy_code": 1}, session_id=f"fresh{idx}"),
        )

    detections = aggregate_facets.aggregate(tmp_path, window_days=7, cap=8)
    seen = [d["id"] for d in detections]
    # All 3 in-window sessions report buggy_code → it emits.
    assert "buggy-code" in seen
    # The stale session's signal must not leak into the window.
    assert "misunderstood-request" not in seen
    fresh_det = next(d for d in detections if d["id"] == "buggy-code")
    assert fresh_det["n_sessions"] == 3
219
|
+
|
|
220
|
+
|
|
221
|
+
def test_missing_facets_dir_returns_empty(tmp_path: Path) -> None:
    """A facets dir that doesn't exist yields [] instead of raising."""
    missing = tmp_path / "nope"
    assert aggregate_facets.aggregate(missing, window_days=7, cap=8) == []
226
|
+
|
|
227
|
+
|
|
228
|
+
def test_empty_facets_dir_returns_empty(tmp_path: Path) -> None:
    """A directory that exists but holds no facet files yields []."""
    assert aggregate_facets.aggregate(tmp_path, window_days=7, cap=8) == []
232
|
+
|
|
233
|
+
|
|
234
|
+
def test_malformed_json_skipped(tmp_path: Path) -> None:
    """Unparseable facet files are ignored without failing the whole run."""
    (tmp_path / "broken.json").write_text("not valid json {{{")
    for idx in range(3):
        _write_facet(
            tmp_path,
            f"s{idx}",
            _make_session(friction={"buggy_code": 1}, session_id=f"s{idx}"),
        )

    detections = aggregate_facets.aggregate(tmp_path, window_days=7, cap=8)
    # The three valid facets still aggregate normally.
    assert "buggy-code" in [d["id"] for d in detections]
|
+
|
|
247
|
+
|
|
248
|
+
def test_cap_enforced(tmp_path: Path) -> None:
    """Four friction keys at 100% but cap=2 → exactly two detections emitted."""
    keys = ["misunderstood_request", "wrong_approach", "buggy_code", "edge_case"]
    for idx in range(5):
        payload = _make_session(
            friction=dict.fromkeys(keys, 1),
            session_id=f"s{idx}",
        )
        _write_facet(tmp_path, f"s{idx}", payload)

    detections = aggregate_facets.aggregate(tmp_path, window_days=7, cap=2)
    assert len(detections) == 2
264
|
+
|
|
265
|
+
|
|
266
|
+
def test_schema_shape_matches_merge_input(tmp_path: Path) -> None:
    """Every detection must carry the fields merge.py reads downstream."""
    for idx in range(3):
        _write_facet(
            tmp_path,
            f"s{idx}",
            _make_session(
                friction={"misunderstood_request": 1},
                friction_detail=f"detail {idx}",
                session_id=f"s{idx}",
            ),
        )

    detections = aggregate_facets.aggregate(tmp_path, window_days=7, cap=8)
    assert detections, "expected at least one detection"
    for det in detections:
        # Non-empty string id, a valid direction, and the merge-facing keys.
        assert "id" in det and isinstance(det["id"], str) and det["id"]
        assert det["direction"] in ("positive", "negative")
        assert "name" in det
        assert "nudge" in det
        assert "examples" in det and isinstance(det["examples"], list)
        assert det.get("source") == "insights-weekly"
|
+
|
|
289
|
+
|
|
290
|
+
def test_zero_count_friction_not_emitted(tmp_path: Path) -> None:
    """A friction key whose count is 0 must not register as present."""
    for idx in range(5):
        _write_facet(
            tmp_path,
            f"s{idx}",
            _make_session(
                friction={"misunderstood_request": 0}, session_id=f"s{idx}"
            ),
        )

    detections = aggregate_facets.aggregate(tmp_path, window_days=7, cap=8)
    assert "misunderstood-request" not in [d["id"] for d in detections]
301
|
+
|
|
302
|
+
|
|
303
|
+
# --- CLI-level evidence gate (v0.5.1 P1 #1b) -------------------------------
# `aggregate()` continues to return [] for empty/missing dirs (those tests
# above stay green). The CLI `main()` adds an "evidence gate": if
# n_sessions == 0 in the requested window, exit 3 (EXIT_NO_EVIDENCE) and
# print no JSON to stdout. The wrapper translates that to its own exit 7.
# Reasoning: empty detections WITH n_sessions > 0 is valid (clean week,
# merges normally); empty detections WITH n_sessions == 0 is no evidence
# and must NOT advance absence-based streaks.

# Absolute path to the CLI under test (../bin/aggregate_facets.py relative
# to this test file), resolved so pytest's working directory doesn't matter.
SCRIPT = Path(__file__).resolve().parent.parent / "bin" / "aggregate_facets.py"
|
+
|
|
314
|
+
|
|
315
|
+
def test_no_sessions_in_window_returns_3(tmp_path: Path) -> None:
    """Empty facets dir: the CLI exits EXIT_NO_EVIDENCE (3), writes nothing
    to stdout, and names the window in its stderr diagnostic."""
    empty = tmp_path / "empty-facets"
    empty.mkdir()

    cmd = [
        sys.executable,
        str(SCRIPT),
        "--facets-dir",
        str(empty),
        "--window-days",
        "7",
    ]
    result = subprocess.run(cmd, capture_output=True, text=True)

    assert result.returncode == aggregate_facets.EXIT_NO_EVIDENCE == 3, (
        f"expected exit 3, got {result.returncode}\nstderr={result.stderr}"
    )
    assert "no sessions in last 7 days" in result.stderr
    assert "refusing to emit detections" in result.stderr
    # Stdout must stay empty so a caller piping into merge fails loudly on a
    # parse error instead of silently merging `[]`.
    assert result.stdout.strip() == "", (
        f"stdout should be empty when bailing on no-evidence: {result.stdout!r}"
    )
|
+
|
|
339
|
+
|
|
340
|
+
def test_nonexistent_facets_dir_cli_returns_3(tmp_path: Path) -> None:
    """A missing --facets-dir trips the same gate: aggregate() returns [],
    and the CLI's n_sessions recount sees zero evidence."""
    missing = tmp_path / "does-not-exist"

    result = subprocess.run(
        [sys.executable, str(SCRIPT), "--facets-dir", str(missing)],
        capture_output=True,
        text=True,
    )

    assert result.returncode == 3
    assert "no sessions" in result.stderr
    assert result.stdout.strip() == ""
|
+
|
|
355
|
+
|
|
356
|
+
def test_session_with_no_detections_still_exits_0(tmp_path: Path) -> None:
    """The gate keys on n_sessions==0, never on detections==0. One clean
    session (no friction, no primary_success) is legitimate evidence of a
    clean week: the CLI must print `[]` and exit 0, not bail with exit 3."""
    _write_facet(tmp_path, "s0", _make_session(session_id="s0"))

    result = subprocess.run(
        [sys.executable, str(SCRIPT), "--facets-dir", str(tmp_path)],
        capture_output=True,
        text=True,
    )

    assert result.returncode == 0, (
        f"single-session-no-friction should exit 0, got {result.returncode}\n"
        f"stderr={result.stderr}"
    )
    assert "n_sessions=1" in result.stderr
    assert "detections=0" in result.stderr
    # An empty list on stdout — NOT empty stdout — so the wrapper merges
    # this as a clean week.
    assert json.loads(result.stdout) == []
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
"""analyze.py aggregator: per-project skill invocation breakdown.
|
|
2
|
+
|
|
3
|
+
Locks in the data shape that flows through insights.sh → merge.py and
|
|
4
|
+
ultimately becomes the rolling accumulator the inventory inference
|
|
5
|
+
reads. Pre-2026-04-24 the aggregator collapsed all projects into a
|
|
6
|
+
flat `skills_used` Counter and lost the project association — this
|
|
7
|
+
file guards that the new `skills_by_project` field stays correct
|
|
8
|
+
across the canonical cases.
|
|
9
|
+
"""
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import analyze
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# --- _project_name_from_slug -----------------------------------------------
|
|
16
|
+
|
|
17
|
+
def test_project_name_from_simple_slug():
    """`~/Desktop/dev/widget` slugs to `-Users-alice-Desktop-dev-widget`;
    the final dash-separated segment is the project name."""
    slug = "-Users-alice-Desktop-dev-widget"
    assert analyze._project_name_from_slug(slug) == "widget"
|
+
|
|
23
|
+
|
|
24
|
+
def test_project_name_from_hyphenated_slug_collapses_to_last_segment():
    """Known limitation: a hyphen inside a project name is indistinguishable
    from the slash→dash separator, so `acme-app` collapses to `app`. The
    hook tokenizer splits cwd anchors on dashes too, so the partial name
    still matches at filter time."""
    slug = "-Users-r-Desktop-dev-acme-app"
    assert analyze._project_name_from_slug(slug) == "app"
|
+
|
|
32
|
+
|
|
33
|
+
def test_project_name_from_empty_or_garbage():
    """Degenerate slugs: empty input and trailing separator runs.

    NOTE: a previous revision also asserted on ``None or ""``, which
    evaluates to ``""`` *before* the call — it byte-duplicated the empty-
    string assertion while appearing to cover ``None``. The duplicate is
    dropped; ``_project_name_from_slug``'s contract takes a str.
    """
    assert analyze._project_name_from_slug("") == ""
    # Trailing dashes stripped before split.
    assert analyze._project_name_from_slug("-Users-x-foo--") == "foo"
|
+
|
|
39
|
+
|
|
40
|
+
def test_project_name_lowercases():
    """The hook compares anchor tokens in lowercase, so the slug parser
    emits lowercase at source — consumers never re-normalize."""
    slug = "-Users-r-Desktop-dev-MyProject"
    assert analyze._project_name_from_slug(slug) == "myproject"
|
+
|
|
47
|
+
|
|
48
|
+
# --- aggregate(): skills_by_project shape ----------------------------------
|
|
49
|
+
|
|
50
|
+
def _make_session(*, project: str, skills: dict[str, int],
                  assistant_turns: int = 5) -> dict:
    """Minimal session shape sufficient for aggregate() to consume.
    aggregate() only reads project, skills_invoked, and a few fields
    from the detection branches; we provide the per-skill counts and
    fill the rest with neutral defaults."""
    return {
        # Fields the tests actually vary.
        "project": project,
        "skills_invoked": dict(skills),  # copy so callers can't alias
        "session_hash": "abcd1234",
        # Neutral turn/tool bookkeeping defaults.
        "tool_counts": {},
        "user_turns": 1,
        "assistant_turns": assistant_turns,
        # Timestamps left unset: the timing branches should all no-op.
        "first_ts": None,
        "last_ts": None,
        "first_user_ts": None,
        "first_edit_ts": None,
        "first_plan_ts": None,
        # Zeroed activity counters so no detection branch fires.
        "task_create_count": 0,
        "exit_plan_count": 0,
        "edit_count": 0,
        "write_count": 0,
        "bash_count": 0,
        "commit_count": 0,
        "test_run_count": 0,
        "has_any_commit": False,
        "has_any_test_run": False,
        "bash_rm_rf_count": 0,
        "read_count": 0,
        "grep_count": 0,
        "glob_count": 0,
        "agent_count": 0,
        # Kept consistent with skills_invoked by construction.
        "skill_count": sum(skills.values()),
        "sec_first_user_to_first_edit": None,
        "plan_before_edit": False,
    }
|
+
|
|
87
|
+
|
|
88
|
+
def test_aggregate_emits_skills_by_project():
    """Smallest proof the pipe is open: one session, one project, one skill."""
    session = _make_session(
        project="-Users-r-Desktop-dev-service",
        skills={"deploy-staging": 3},
    )
    _detections, summary = analyze.aggregate([session])
    assert summary["skills_by_project"] == {"service": {"deploy-staging": 3}}
|
+
|
|
97
|
+
|
|
98
|
+
def test_aggregate_sums_across_sessions_in_same_project():
    """Counts for the same project accumulate across its sessions."""
    project = "-Users-r-Desktop-dev-service"
    sessions = [
        _make_session(project=project, skills={"deploy-staging": 2}),
        _make_session(project=project, skills={"deploy-staging": 1, "design": 1}),
    ]
    _detections, summary = analyze.aggregate(sessions)
    service = summary["skills_by_project"]["service"]
    assert service["deploy-staging"] == 3
    assert service["design"] == 1
|
+
|
|
109
|
+
|
|
110
|
+
def test_aggregate_separates_distinct_projects():
    """Two projects never share a bucket in the breakdown."""
    sessions = [
        _make_session(
            project="-Users-r-Desktop-dev-service",
            skills={"deploy-staging": 2},
        ),
        _make_session(
            project="-Users-r-Desktop-dev-widget",
            skills={"widget-build": 4},
        ),
    ]
    _detections, summary = analyze.aggregate(sessions)
    breakdown = summary["skills_by_project"]
    assert breakdown["service"] == {"deploy-staging": 2}
    assert breakdown["widget"] == {"widget-build": 4}
|
+
|
|
122
|
+
|
|
123
|
+
def test_aggregate_skips_sessions_with_no_project():
    """A project-less session (corrupted state, transcript without a parent
    dir) must neither crash aggregate() nor land its invocations under an
    empty-string project key."""
    sessions = [
        _make_session(project="", skills={"deploy-staging": 1}),
        _make_session(
            project="-Users-r-Desktop-dev-service",
            skills={"deploy-staging": 1},
        ),
    ]
    _detections, summary = analyze.aggregate(sessions)
    breakdown = summary["skills_by_project"]
    assert "" not in breakdown
    assert breakdown == {"service": {"deploy-staging": 1}}
|
+
|
|
136
|
+
|
|
137
|
+
def test_aggregate_keeps_skills_used_in_sync_with_skills_by_project():
    """skills_used (flat) and skills_by_project (nested) derive from the
    same source; diverging totals would mean a bookkeeping bug in
    aggregate()."""
    sessions = [
        _make_session(
            project="-Users-r-Desktop-dev-service",
            skills={"deploy-staging": 2, "design": 1},
        ),
        _make_session(
            project="-Users-r-Desktop-dev-widget",
            skills={"design": 3},
        ),
    ]
    _detections, summary = analyze.aggregate(sessions)

    # Roll the per-project breakdown back up into a flat skill→count map.
    rollup: dict[str, int] = {}
    for per_project in summary["skills_by_project"].values():
        for skill_id, count in per_project.items():
            rollup[skill_id] = rollup.get(skill_id, 0) + count
    assert summary["skills_used"] == rollup
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
import analyze
|
|
7
|
+
import redact
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _assistant_record(*, command: str = "") -> dict:
    """Build one assistant-shaped transcript record; a Bash tool_use block
    is attached only when *command* is non-empty."""
    blocks: list = []
    if command:
        blocks.append(
            {"type": "tool_use", "name": "Bash", "input": {"command": command}}
        )
    return {
        "type": "assistant",
        "message": {"role": "assistant", "content": blocks},
    }
|
+
|
|
23
|
+
|
|
24
|
+
def test_analyze_redacts_each_record_before_json_parsing(tmp_path, monkeypatch):
    """Redaction runs per-record, before json.loads ever sees transcript text."""
    # A fake OpenAI-style key embedded in a Bash command in the transcript.
    secret = "sk-" + ("A" * 40)
    transcript = tmp_path / "session.jsonl"
    transcript.write_text(
        json.dumps(_assistant_record(command=f"echo {secret}; pytest")) + "\n"
    )

    real_loads = analyze.json.loads
    parsed_inputs: list[str] = []

    def loads_spy(text, *args, **kwargs):
        # Core claim: by the time a record reaches json.loads, the secret
        # must already have been redacted out.
        parsed_inputs.append(text)
        assert secret not in text
        return real_loads(text, *args, **kwargs)

    # Patch json.loads as seen from analyze; monkeypatch restores it after.
    monkeypatch.setattr(analyze.json, "loads", loads_spy)

    sig = analyze.analyze_session(transcript)

    assert sig is not None
    # The trailing `pytest` still counts as a test run post-redaction.
    assert sig["test_run_count"] == 1
    assert parsed_inputs
    assert "[REDACTED:openai-key]" in parsed_inputs[0]
|
+
|
|
48
|
+
|
|
49
|
+
def test_analyze_streams_transcript_without_reading_whole_file(tmp_path, monkeypatch):
    """analyze_session must iterate the transcript line by line; any
    whole-file Path.read_text is an immediate failure."""
    transcript = tmp_path / "large.jsonl"
    record_line = json.dumps(_assistant_record()) + "\n"
    transcript.write_text(record_line * 5000)

    def fail_read_text(*args, **kwargs):
        raise AssertionError("analyze_session must not read whole transcripts")

    # Booby-trap read_text AFTER the fixture is written.
    monkeypatch.setattr(Path, "read_text", fail_read_text)

    sig = analyze.analyze_session(transcript)

    assert sig is not None
    assert sig["assistant_turns"] == 5000
|
+
|
|
65
|
+
|
|
66
|
+
# --- redact.py pattern coverage -------------------------------------------
|
|
67
|
+
# Each test passes a bare token in prose context (not a `KEY=value`
|
|
68
|
+
# assignment) so we know the token-shape pattern itself catches it, not the
|
|
69
|
+
# `.env`-style fallback.
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def test_redact_stripe_live_key():
    """A live Stripe secret key in prose is replaced with its tag."""
    sample = "we use sk_live_" + "a1B2c3D4e5F6g7H8i9J0k1L2" + " for prod"
    cleaned = redact.redact(sample)
    assert "sk_live_" not in cleaned
    assert "[REDACTED:stripe-live-key]" in cleaned
|
+
|
|
78
|
+
|
|
79
|
+
def test_redact_stripe_test_key():
    """A Stripe test-mode key is caught by the same token-shape pattern."""
    sample = "test creds: sk_test_" + ("A" * 30)
    cleaned = redact.redact(sample)
    assert "sk_test_" not in cleaned
    assert "[REDACTED:stripe-test-key]" in cleaned
|
+
|
|
85
|
+
|
|
86
|
+
def test_redact_huggingface_token():
    """A Hugging Face access token embedded in prose is redacted."""
    sample = "use hf_" + ("a" * 35) + " to download"
    cleaned = redact.redact(sample)
    assert "hf_a" not in cleaned
    assert "[REDACTED:huggingface-token]" in cleaned
|
+
|
|
92
|
+
|
|
93
|
+
def test_redact_npm_publish_token():
    """An npm publish token embedded in prose is redacted."""
    sample = "npm_" + ("X" * 36) + " is the publish token"
    cleaned = redact.redact(sample)
    assert "npm_X" not in cleaned
    assert "[REDACTED:npm-token]" in cleaned
|
+
|
|
99
|
+
|
|
100
|
+
def test_redact_does_not_collapse_short_lookalikes():
    """Fragments that merely share a secret prefix but fall below the
    minimum token length must survive untouched."""
    cleaned = redact.redact("sk_live_short hf_short npm_short")
    assert "REDACTED" not in cleaned