claude_toolstack_cli-1.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- claude_toolstack_cli-1.0.0.dist-info/METADATA +354 -0
- claude_toolstack_cli-1.0.0.dist-info/RECORD +48 -0
- claude_toolstack_cli-1.0.0.dist-info/WHEEL +5 -0
- claude_toolstack_cli-1.0.0.dist-info/entry_points.txt +2 -0
- claude_toolstack_cli-1.0.0.dist-info/licenses/LICENSE +21 -0
- claude_toolstack_cli-1.0.0.dist-info/top_level.txt +1 -0
- cts/__init__.py +3 -0
- cts/__main__.py +5 -0
- cts/autopilot.py +633 -0
- cts/bundle.py +958 -0
- cts/cli.py +2858 -0
- cts/confidence.py +218 -0
- cts/config.py +19 -0
- cts/corpus/__init__.py +139 -0
- cts/corpus/apply.py +305 -0
- cts/corpus/archive.py +309 -0
- cts/corpus/baseline.py +294 -0
- cts/corpus/evaluate.py +409 -0
- cts/corpus/experiment_eval.py +585 -0
- cts/corpus/experiment_schema.py +380 -0
- cts/corpus/extract.py +353 -0
- cts/corpus/load.py +44 -0
- cts/corpus/model.py +114 -0
- cts/corpus/patch.py +467 -0
- cts/corpus/registry.py +420 -0
- cts/corpus/report.py +745 -0
- cts/corpus/scan.py +87 -0
- cts/corpus/store.py +63 -0
- cts/corpus/trends.py +478 -0
- cts/corpus/tuning_schema.py +313 -0
- cts/corpus/variants.py +335 -0
- cts/ctags.py +133 -0
- cts/diff_context.py +92 -0
- cts/errors.py +109 -0
- cts/http.py +89 -0
- cts/ranking.py +466 -0
- cts/render.py +388 -0
- cts/schema.py +96 -0
- cts/semantic/__init__.py +47 -0
- cts/semantic/candidates.py +150 -0
- cts/semantic/chunker.py +184 -0
- cts/semantic/config.py +120 -0
- cts/semantic/embedder.py +151 -0
- cts/semantic/indexer.py +159 -0
- cts/semantic/search.py +252 -0
- cts/semantic/store.py +330 -0
- cts/sidecar.py +431 -0
- cts/structural.py +305 -0
cts/corpus/evaluate.py
ADDED
@@ -0,0 +1,409 @@
"""Evaluation engine: compare before vs after corpora.

Given a baseline corpus (before tuning) and an updated corpus (after
tuning), compute KPI deltas that prove whether the changes improved
things.

KPIs tracked:
- confidence_final_mean: average final confidence
- confidence_delta_mean: average autopilot lift
- truncation_rate: fraction of truncated artifacts
- autopilot_low_lift_rate: fraction of low-lift autopilot runs
- bundle_bytes_p90: 90th percentile bundle size
- should_autopilot_count: artifacts that should have had autopilot

Verdict logic:
- ``improved``: majority of tracked KPIs got better
- ``regressed``: majority got worse
- ``mixed``: some better, some worse
- ``no_data``: insufficient data for comparison
"""

from __future__ import annotations

import json
import math
from typing import Any, Dict, List


# ---------------------------------------------------------------------------
# KPI extraction (from corpus records directly)
# ---------------------------------------------------------------------------


def _percentile(values: List[float], p: float) -> float:
    """Compute percentile (same as in report.py)."""
    if not values:
        return 0.0
    s = sorted(values)
    k = (p / 100.0) * (len(s) - 1)
    f = math.floor(k)
    c = math.ceil(k)
    if f == c:
        return s[int(k)]
    return s[f] * (c - k) + s[c] * (k - f)


def _mean(values: List[float]) -> float:
    if not values:
        return 0.0
    return sum(values) / len(values)


def extract_kpis(records: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Extract KPI values from corpus records.

    Returns a dict of KPI name → value, suitable for comparison.
    """
    total = len(records)
    if total == 0:
        return {"total": 0}

    # Confidence
    finals = [
        r["confidence_final"] for r in records if r.get("confidence_final") is not None
    ]
    deltas = [
        r["confidence_delta"] for r in records if r.get("confidence_delta") is not None
    ]

    # Truncation
    truncated = sum(
        1 for r in records if r.get("truncation_flags", {}).get("truncated", False)
    )

    # Autopilot waste
    autopilot_runs = [r for r in records if r.get("passes_count", 0) > 0]
    low_lift = [
        r
        for r in autopilot_runs
        if r.get("confidence_delta") is not None and r["confidence_delta"] < 0.05
    ]

    # Should-have-autopiloted
    should_auto = [
        r
        for r in records
        if r.get("passes_count", 0) == 0
        and r.get("confidence_pass1") is not None
        and r["confidence_pass1"] < 0.6
    ]

    # Bundle sizes
    sizes = [
        r["bundle_bytes_final"]
        for r in records
        if r.get("bundle_bytes_final") is not None
    ]

    autopilot_count = len(autopilot_runs)

    # Semantic augmentation metrics (Phase 4)
    semantic_runs = [r for r in records if r.get("semantic_invoked", False)]
    semantic_action_runs = [r for r in records if r.get("semantic_action_fired", False)]
    semantic_lifts = [
        r["semantic_lift"] for r in records if r.get("semantic_lift") is not None
    ]
    semantic_times = [
        r["semantic_time_ms"] for r in records if r.get("semantic_time_ms") is not None
    ]

    return {
        "total": total,
        "confidence_final_mean": round(_mean(finals), 4),
        "confidence_delta_mean": round(_mean(deltas), 4),
        "truncation_rate": round(truncated / total, 4) if total else 0.0,
        "autopilot_low_lift_rate": (
            round(len(low_lift) / autopilot_count, 4) if autopilot_count else 0.0
        ),
        "bundle_bytes_p90": round(_percentile(sizes, 90), 0),
        "should_autopilot_count": len(should_auto),
        "autopilot_runs": autopilot_count,
        "truncated_count": truncated,
        # Semantic KPIs
        "semantic_invoked_rate": (
            round(len(semantic_runs) / total, 4) if total else 0.0
        ),
        "semantic_action_rate": (
            round(len(semantic_action_runs) / total, 4) if total else 0.0
        ),
        "semantic_lift_mean": round(_mean(semantic_lifts), 4),
        "semantic_time_ms_p90": (
            round(_percentile(semantic_times, 90), 1) if semantic_times else 0.0
        ),
    }


# ---------------------------------------------------------------------------
# Comparison
# ---------------------------------------------------------------------------

# KPIs where higher is better
_HIGHER_BETTER = {
    "confidence_final_mean",
    "confidence_delta_mean",
    "semantic_lift_mean",
}

# KPIs where lower is better
_LOWER_BETTER = {
    "truncation_rate",
    "autopilot_low_lift_rate",
    "bundle_bytes_p90",
    "should_autopilot_count",
    "semantic_time_ms_p90",
}

# Informational KPIs (tracked but not used for verdict direction)
_INFORMATIONAL = {
    "semantic_invoked_rate",
    "semantic_action_rate",
}

# Minimum absolute change to count as a real delta (avoid noise)
_NOISE_THRESHOLDS: Dict[str, float] = {
    "confidence_final_mean": 0.005,
    "confidence_delta_mean": 0.005,
    "truncation_rate": 0.01,
    "autopilot_low_lift_rate": 0.02,
    "bundle_bytes_p90": 1024,
    "should_autopilot_count": 1,
    "semantic_lift_mean": 0.01,
    "semantic_time_ms_p90": 5.0,
}


def compare_kpis(
    before: Dict[str, Any],
    after: Dict[str, Any],
) -> Dict[str, Any]:
    """Compare two KPI snapshots and produce a verdict.

    Returns:
        comparison dict with per-KPI deltas, directions, and overall verdict.
    """
    if before.get("total", 0) == 0 or after.get("total", 0) == 0:
        return {
            "verdict": "no_data",
            "reason": "Insufficient data in one or both corpora",
            "kpis": {},
        }

    kpi_results: Dict[str, Dict[str, Any]] = {}
    improved_count = 0
    regressed_count = 0
    tracked_count = 0

    all_kpis = _HIGHER_BETTER | _LOWER_BETTER
    for kpi_name in sorted(all_kpis):
        before_val = before.get(kpi_name)
        after_val = after.get(kpi_name)

        if before_val is None or after_val is None:
            continue

        tracked_count += 1
        delta = after_val - before_val
        threshold = _NOISE_THRESHOLDS.get(kpi_name, 0.001)
        abs_delta = abs(delta)

        # Direction
        if abs_delta < threshold:
            direction = "unchanged"
        elif kpi_name in _HIGHER_BETTER:
            direction = "improved" if delta > 0 else "regressed"
        else:
            direction = "improved" if delta < 0 else "regressed"

        if direction == "improved":
            improved_count += 1
        elif direction == "regressed":
            regressed_count += 1

        kpi_results[kpi_name] = {
            "before": before_val,
            "after": after_val,
            "delta": round(delta, 4),
            "direction": direction,
        }

    # Verdict
    if tracked_count == 0:
        verdict = "no_data"
        reason = "No comparable KPIs found"
    elif improved_count > regressed_count:
        verdict = "improved"
        reason = (
            f"{improved_count}/{tracked_count} KPIs improved, "
            f"{regressed_count} regressed"
        )
    elif regressed_count > improved_count:
        verdict = "regressed"
        reason = (
            f"{regressed_count}/{tracked_count} KPIs regressed, "
            f"{improved_count} improved"
        )
    elif improved_count > 0:
        verdict = "mixed"
        reason = (
            f"{improved_count} improved, {regressed_count} regressed "
            f"out of {tracked_count}"
        )
    else:
        verdict = "unchanged"
        reason = f"No significant changes across {tracked_count} KPIs"

    return {
        "verdict": verdict,
        "reason": reason,
        "improved_count": improved_count,
        "regressed_count": regressed_count,
        "tracked_count": tracked_count,
        "kpis": kpi_results,
    }


# ---------------------------------------------------------------------------
# Full evaluation
# ---------------------------------------------------------------------------


def evaluate(
    before_records: List[Dict[str, Any]],
    after_records: List[Dict[str, Any]],
) -> Dict[str, Any]:
    """Run full evaluation comparing before and after corpora.

    Args:
        before_records: Corpus records before tuning.
        after_records: Corpus records after tuning.

    Returns:
        Evaluation report dict.
    """
    before_kpis = extract_kpis(before_records)
    after_kpis = extract_kpis(after_records)
    comparison = compare_kpis(before_kpis, after_kpis)

    return {
        "before": before_kpis,
        "after": after_kpis,
        "comparison": comparison,
    }


# ---------------------------------------------------------------------------
# Renderers
# ---------------------------------------------------------------------------


def render_evaluation_text(result: Dict[str, Any]) -> str:
    """Render evaluation result as human-readable text."""
    lines: List[str] = []
    lines.append("=" * 60)
    lines.append("TUNING EVALUATION REPORT")
    lines.append("=" * 60)

    comparison = result.get("comparison", {})
    verdict = comparison.get("verdict", "no_data")
    reason = comparison.get("reason", "")

    lines.append("")
    v_label = verdict.upper()
    lines.append(f"Verdict: {v_label}")
    lines.append(f"Reason: {reason}")

    before_kpis = result.get("before", {})
    after_kpis = result.get("after", {})
    lines.append("")
    lines.append(
        f"Before: {before_kpis.get('total', 0)} artifacts "
        f"After: {after_kpis.get('total', 0)} artifacts"
    )

    kpis = comparison.get("kpis", {})
    if kpis:
        lines.append("")
        lines.append("-" * 60)
        header = f"{'KPI':<30} {'Before':>10} {'After':>10} {'Delta':>10} Dir"
        lines.append(header)
        lines.append("-" * 60)
        for name, kpi in sorted(kpis.items()):
            bv = kpi["before"]
            av = kpi["after"]
            delta = kpi["delta"]
            direction = kpi["direction"]

            # Format values
            if isinstance(bv, float) and bv < 10:
                bv_s = f"{bv:.4f}"
                av_s = f"{av:.4f}"
                delta_s = f"{delta:+.4f}"
            else:
                bv_s = f"{bv}"
                av_s = f"{av}"
                delta_s = f"{delta:+}"

            # Direction indicator
            if direction == "improved":
                ind = " ✓"
            elif direction == "regressed":
                ind = " ✗"
            else:
                ind = " ="

            lines.append(f"{name:<30} {bv_s:>10} {av_s:>10} {delta_s:>10}{ind}")

    lines.append("")
    return "\n".join(lines)


def render_evaluation_json(result: Dict[str, Any]) -> str:
    """Render evaluation result as JSON."""
    return json.dumps(result, indent=2, default=str)


def render_evaluation_markdown(result: Dict[str, Any]) -> str:
    """Render evaluation result as markdown."""
    lines: List[str] = []
    comparison = result.get("comparison", {})
    verdict = comparison.get("verdict", "no_data")
    reason = comparison.get("reason", "")

    lines.append("# Tuning Evaluation Report")
    lines.append("")
    lines.append(f"**Verdict:** {verdict.upper()}")
    lines.append(f"**Reason:** {reason}")

    before_kpis = result.get("before", {})
    after_kpis = result.get("after", {})
    lines.append("")
    lines.append(f"- Before: {before_kpis.get('total', 0)} artifacts")
    lines.append(f"- After: {after_kpis.get('total', 0)} artifacts")

    kpis = comparison.get("kpis", {})
    if kpis:
        lines.append("")
        lines.append("## KPI Comparison")
        lines.append("")
        lines.append("| KPI | Before | After | Delta | Direction |")
        lines.append("|-----|--------|-------|-------|-----------|")
        for name, kpi in sorted(kpis.items()):
            bv = kpi["before"]
            av = kpi["after"]
            delta = kpi["delta"]
            direction = kpi["direction"]

            if isinstance(bv, float) and bv < 10:
                bv_s = f"{bv:.4f}"
                av_s = f"{av:.4f}"
                delta_s = f"{delta:+.4f}"
            else:
                bv_s = f"{bv}"
                av_s = f"{av}"
                delta_s = f"{delta:+}"

            lines.append(f"| {name} | {bv_s} | {av_s} | {delta_s} | {direction} |")

    lines.append("")
    return "\n".join(lines)
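
Usage sketch (not part of the package): a minimal, hypothetical example of driving this module end to end. The record dicts below are invented for illustration and populate only the fields extract_kpis actually reads (confidence_final, confidence_pass1, confidence_delta, passes_count, bundle_bytes_final, truncation_flags); real corpora would presumably come from the loaders elsewhere in cts/corpus/.

    # Hypothetical corpora, invented for illustration.
    from cts.corpus.evaluate import evaluate, render_evaluation_text

    before = [
        {"confidence_final": 0.55, "confidence_pass1": 0.53, "confidence_delta": 0.02,
         "passes_count": 2, "bundle_bytes_final": 40960,
         "truncation_flags": {"truncated": True}},
        {"confidence_final": 0.48, "confidence_pass1": 0.48,
         "passes_count": 0, "bundle_bytes_final": 30720, "truncation_flags": {}},
    ]
    after = [
        {"confidence_final": 0.72, "confidence_pass1": 0.62, "confidence_delta": 0.10,
         "passes_count": 1, "bundle_bytes_final": 28672, "truncation_flags": {}},
        {"confidence_final": 0.66, "confidence_pass1": 0.58, "confidence_delta": 0.08,
         "passes_count": 1, "bundle_bytes_final": 26624, "truncation_flags": {}},
    ]

    result = evaluate(before, after)
    print(result["comparison"]["verdict"])  # "improved" for these inputs
    print(render_evaluation_text(result))   # fixed-width KPI table with ✓/✗ markers

Note that per-KPI deltas only count toward the verdict when they clear the _NOISE_THRESHOLDS floor; here, for example, truncation_rate drops 0.5 → 0.0 (well past its 0.01 threshold), while the semantic KPIs stay at 0.0 and are reported as unchanged.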