claude-toolstack-cli 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. claude_toolstack_cli-1.0.0.dist-info/METADATA +354 -0
  2. claude_toolstack_cli-1.0.0.dist-info/RECORD +48 -0
  3. claude_toolstack_cli-1.0.0.dist-info/WHEEL +5 -0
  4. claude_toolstack_cli-1.0.0.dist-info/entry_points.txt +2 -0
  5. claude_toolstack_cli-1.0.0.dist-info/licenses/LICENSE +21 -0
  6. claude_toolstack_cli-1.0.0.dist-info/top_level.txt +1 -0
  7. cts/__init__.py +3 -0
  8. cts/__main__.py +5 -0
  9. cts/autopilot.py +633 -0
  10. cts/bundle.py +958 -0
  11. cts/cli.py +2858 -0
  12. cts/confidence.py +218 -0
  13. cts/config.py +19 -0
  14. cts/corpus/__init__.py +139 -0
  15. cts/corpus/apply.py +305 -0
  16. cts/corpus/archive.py +309 -0
  17. cts/corpus/baseline.py +294 -0
  18. cts/corpus/evaluate.py +409 -0
  19. cts/corpus/experiment_eval.py +585 -0
  20. cts/corpus/experiment_schema.py +380 -0
  21. cts/corpus/extract.py +353 -0
  22. cts/corpus/load.py +44 -0
  23. cts/corpus/model.py +114 -0
  24. cts/corpus/patch.py +467 -0
  25. cts/corpus/registry.py +420 -0
  26. cts/corpus/report.py +745 -0
  27. cts/corpus/scan.py +87 -0
  28. cts/corpus/store.py +63 -0
  29. cts/corpus/trends.py +478 -0
  30. cts/corpus/tuning_schema.py +313 -0
  31. cts/corpus/variants.py +335 -0
  32. cts/ctags.py +133 -0
  33. cts/diff_context.py +92 -0
  34. cts/errors.py +109 -0
  35. cts/http.py +89 -0
  36. cts/ranking.py +466 -0
  37. cts/render.py +388 -0
  38. cts/schema.py +96 -0
  39. cts/semantic/__init__.py +47 -0
  40. cts/semantic/candidates.py +150 -0
  41. cts/semantic/chunker.py +184 -0
  42. cts/semantic/config.py +120 -0
  43. cts/semantic/embedder.py +151 -0
  44. cts/semantic/indexer.py +159 -0
  45. cts/semantic/search.py +252 -0
  46. cts/semantic/store.py +330 -0
  47. cts/sidecar.py +431 -0
  48. cts/structural.py +305 -0
cts/corpus/evaluate.py ADDED
@@ -0,0 +1,409 @@
1
"""Evaluation engine: compare before vs after corpora.

Given a baseline corpus (before tuning) and an updated corpus (after
tuning), compute KPI deltas that prove whether the changes improved
things.

KPIs tracked:
- confidence_final_mean: average final confidence
- confidence_delta_mean: average autopilot lift
- truncation_rate: fraction of truncated artifacts
- autopilot_low_lift_rate: fraction of low-lift autopilot runs
- bundle_bytes_p90: 90th percentile bundle size
- should_autopilot_count: artifacts that should have had autopilot

Verdict logic:
- ``improved``: majority of tracked KPIs got better
- ``regressed``: majority got worse
- ``mixed``: some better, some worse
- ``unchanged``: no tracked KPI moved beyond its noise threshold
- ``no_data``: insufficient data for comparison
"""

from __future__ import annotations

import json
import math
from typing import Any, Dict, List
27
+
28
+
29
+ # ---------------------------------------------------------------------------
30
+ # KPI extraction (from corpus records directly)
31
+ # ---------------------------------------------------------------------------
32
+
33
+
34
+ def _percentile(values: List[float], p: float) -> float:
35
+ """Compute percentile (same as in report.py)."""
36
+ if not values:
37
+ return 0.0
38
+ s = sorted(values)
39
+ k = (p / 100.0) * (len(s) - 1)
40
+ f = math.floor(k)
41
+ c = math.ceil(k)
42
+ if f == c:
43
+ return s[int(k)]
44
+ return s[f] * (c - k) + s[c] * (k - f)
45
+
46
+
47
+ def _mean(values: List[float]) -> float:
48
+ if not values:
49
+ return 0.0
50
+ return sum(values) / len(values)
51
+
52
+
53
def extract_kpis(records: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Extract KPI values from corpus records.

    Args:
        records: Corpus record dicts, one per artifact.

    Returns:
        Dict of KPI name -> value, suitable for comparison.
        Just ``{"total": 0}`` when *records* is empty.
    """
    total = len(records)
    if not records:
        return {"total": 0}

    def values_of(field: str) -> List[Any]:
        # Every non-None value for *field*, in record order.
        return [r[field] for r in records if r.get(field) is not None]

    def avg(nums: List[float]) -> float:
        return sum(nums) / len(nums) if nums else 0.0

    def pct(nums: List[float], p: float) -> float:
        # Linear-interpolation percentile (same scheme as _percentile()).
        if not nums:
            return 0.0
        ordered = sorted(nums)
        rank = (p / 100.0) * (len(ordered) - 1)
        lo, hi = math.floor(rank), math.ceil(rank)
        if lo == hi:
            return ordered[int(rank)]
        return ordered[lo] * (hi - rank) + ordered[hi] * (rank - lo)

    # Confidence and size samples.
    finals = values_of("confidence_final")
    deltas = values_of("confidence_delta")
    sizes = values_of("bundle_bytes_final")
    semantic_lifts = values_of("semantic_lift")
    semantic_times = values_of("semantic_time_ms")

    # Truncation: records whose truncation_flags mark them truncated.
    truncated = sum(
        1 for r in records if r.get("truncation_flags", {}).get("truncated", False)
    )

    # Autopilot waste: runs whose confidence lift stayed below 0.05.
    autopilot_records = [r for r in records if r.get("passes_count", 0) > 0]
    autopilot_count = len(autopilot_records)
    low_lift_count = sum(
        1
        for r in autopilot_records
        if r.get("confidence_delta") is not None and r["confidence_delta"] < 0.05
    )

    # Records that skipped autopilot despite a weak (< 0.6) first pass.
    should_auto_count = sum(
        1
        for r in records
        if r.get("passes_count", 0) == 0
        and r.get("confidence_pass1") is not None
        and r["confidence_pass1"] < 0.6
    )

    # Semantic augmentation metrics (Phase 4).
    semantic_invoked = sum(1 for r in records if r.get("semantic_invoked", False))
    semantic_actions = sum(
        1 for r in records if r.get("semantic_action_fired", False)
    )

    return {
        "total": total,
        "confidence_final_mean": round(avg(finals), 4),
        "confidence_delta_mean": round(avg(deltas), 4),
        "truncation_rate": round(truncated / total, 4),
        "autopilot_low_lift_rate": (
            round(low_lift_count / autopilot_count, 4) if autopilot_count else 0.0
        ),
        "bundle_bytes_p90": round(pct(sizes, 90), 0),
        "should_autopilot_count": should_auto_count,
        "autopilot_runs": autopilot_count,
        "truncated_count": truncated,
        # Semantic KPIs
        "semantic_invoked_rate": round(semantic_invoked / total, 4),
        "semantic_action_rate": round(semantic_actions / total, 4),
        "semantic_lift_mean": round(avg(semantic_lifts), 4),
        "semantic_time_ms_p90": (
            round(pct(semantic_times, 90), 1) if semantic_times else 0.0
        ),
    }
135
+
136
+
137
# ---------------------------------------------------------------------------
# Comparison
# ---------------------------------------------------------------------------

# KPIs whose value should rise after a good tuning pass.
_HIGHER_BETTER = {
    "confidence_final_mean",
    "confidence_delta_mean",
    "semantic_lift_mean",
}

# KPIs whose value should fall after a good tuning pass.
_LOWER_BETTER = {
    "truncation_rate",
    "autopilot_low_lift_rate",
    "bundle_bytes_p90",
    "should_autopilot_count",
    "semantic_time_ms_p90",
}

# Tracked for reporting only; compare_kpis() never scores these.
_INFORMATIONAL = {
    "semantic_invoked_rate",
    "semantic_action_rate",
}

# Minimum absolute change before a delta counts as signal rather than noise.
_NOISE_THRESHOLDS: Dict[str, float] = {
    "confidence_final_mean": 0.005,
    "confidence_delta_mean": 0.005,
    "truncation_rate": 0.01,
    "autopilot_low_lift_rate": 0.02,
    "bundle_bytes_p90": 1024,
    "should_autopilot_count": 1,
    "semantic_lift_mean": 0.01,
    "semantic_time_ms_p90": 5.0,
}


def compare_kpis(
    before: Dict[str, Any],
    after: Dict[str, Any],
) -> Dict[str, Any]:
    """Compare two KPI snapshots and produce a verdict.

    Args:
        before: KPI snapshot from the baseline corpus.
        after: KPI snapshot from the updated corpus.

    Returns:
        Comparison dict with per-KPI deltas, directions, and overall verdict
        (``improved`` / ``regressed`` / ``mixed`` / ``unchanged`` / ``no_data``).
    """
    # No verdict is possible when either side has zero artifacts.
    if not before.get("total", 0) or not after.get("total", 0):
        return {
            "verdict": "no_data",
            "reason": "Insufficient data in one or both corpora",
            "kpis": {},
        }

    per_kpi: Dict[str, Dict[str, Any]] = {}
    better = 0
    worse = 0
    tracked = 0

    for name in sorted(_HIGHER_BETTER | _LOWER_BETTER):
        b_val = before.get(name)
        a_val = after.get(name)
        if b_val is None or a_val is None:
            # KPI missing on one side — skip it entirely.
            continue

        tracked += 1
        diff = a_val - b_val

        # Classify the movement; sub-threshold deltas are treated as noise.
        if abs(diff) < _NOISE_THRESHOLDS.get(name, 0.001):
            direction = "unchanged"
        elif (diff > 0) == (name in _HIGHER_BETTER):
            direction = "improved"
        else:
            direction = "regressed"

        if direction == "improved":
            better += 1
        elif direction == "regressed":
            worse += 1

        per_kpi[name] = {
            "before": b_val,
            "after": a_val,
            "delta": round(diff, 4),
            "direction": direction,
        }

    # Majority vote across tracked KPIs decides the verdict.
    if tracked == 0:
        verdict = "no_data"
        reason = "No comparable KPIs found"
    elif better > worse:
        verdict = "improved"
        reason = f"{better}/{tracked} KPIs improved, {worse} regressed"
    elif worse > better:
        verdict = "regressed"
        reason = f"{worse}/{tracked} KPIs regressed, {better} improved"
    elif better > 0:
        verdict = "mixed"
        reason = f"{better} improved, {worse} regressed out of {tracked}"
    else:
        verdict = "unchanged"
        reason = f"No significant changes across {tracked} KPIs"

    return {
        "verdict": verdict,
        "reason": reason,
        "improved_count": better,
        "regressed_count": worse,
        "tracked_count": tracked,
        "kpis": per_kpi,
    }
264
+
265
+
266
+ # ---------------------------------------------------------------------------
267
+ # Full evaluation
268
+ # ---------------------------------------------------------------------------
269
+
270
+
271
def evaluate(
    before_records: List[Dict[str, Any]],
    after_records: List[Dict[str, Any]],
) -> Dict[str, Any]:
    """Run full evaluation comparing before and after corpora.

    Args:
        before_records: Corpus records before tuning.
        after_records: Corpus records after tuning.

    Returns:
        Evaluation report dict with ``before``/``after`` KPI snapshots
        and their ``comparison``.
    """
    snapshot_before = extract_kpis(before_records)
    snapshot_after = extract_kpis(after_records)
    return {
        "before": snapshot_before,
        "after": snapshot_after,
        "comparison": compare_kpis(snapshot_before, snapshot_after),
    }
293
+
294
+
295
+ # ---------------------------------------------------------------------------
296
+ # Renderers
297
+ # ---------------------------------------------------------------------------
298
+
299
+
300
def render_evaluation_text(result: Dict[str, Any]) -> str:
    """Render evaluation result as human-readable text."""
    rule = "=" * 60
    thin_rule = "-" * 60
    out: List[str] = [rule, "TUNING EVALUATION REPORT", rule]

    comparison = result.get("comparison", {})
    out.append("")
    out.append(f"Verdict: {comparison.get('verdict', 'no_data').upper()}")
    out.append(f"Reason: {comparison.get('reason', '')}")

    out.append("")
    out.append(
        f"Before: {result.get('before', {}).get('total', 0)} artifacts "
        f"After: {result.get('after', {}).get('total', 0)} artifacts"
    )

    kpi_rows = comparison.get("kpis", {})
    if kpi_rows:
        out.append("")
        out.append(thin_rule)
        out.append(f"{'KPI':<30} {'Before':>10} {'After':>10} {'Delta':>10} Dir")
        out.append(thin_rule)
        markers = {"improved": " ✓", "regressed": " ✗"}
        for name, row in sorted(kpi_rows.items()):
            bv = row["before"]
            av = row["after"]
            dv = row["delta"]

            # Small floats get 4 decimal places; anything else plain str().
            if isinstance(bv, float) and bv < 10:
                bv_s, av_s, dv_s = f"{bv:.4f}", f"{av:.4f}", f"{dv:+.4f}"
            else:
                bv_s, av_s, dv_s = f"{bv}", f"{av}", f"{dv:+}"

            marker = markers.get(row["direction"], " =")
            out.append(f"{name:<30} {bv_s:>10} {av_s:>10} {dv_s:>10}{marker}")

    out.append("")
    return "\n".join(out)
359
+
360
+
361
def render_evaluation_json(result: Dict[str, Any]) -> str:
    """Serialize the evaluation result as pretty-printed JSON.

    ``default=str`` stringifies any non-JSON-native values so rendering
    never raises on exotic payloads.
    """
    return json.dumps(result, default=str, indent=2)
364
+
365
+
366
def render_evaluation_markdown(result: Dict[str, Any]) -> str:
    """Render evaluation result as markdown."""
    comparison = result.get("comparison", {})
    md: List[str] = [
        "# Tuning Evaluation Report",
        "",
        f"**Verdict:** {comparison.get('verdict', 'no_data').upper()}",
        f"**Reason:** {comparison.get('reason', '')}",
        "",
        f"- Before: {result.get('before', {}).get('total', 0)} artifacts",
        f"- After: {result.get('after', {}).get('total', 0)} artifacts",
    ]

    kpi_rows = comparison.get("kpis", {})
    if kpi_rows:
        md.append("")
        md.append("## KPI Comparison")
        md.append("")
        md.append("| KPI | Before | After | Delta | Direction |")
        md.append("|-----|--------|-------|-------|-----------|")
        for name, row in sorted(kpi_rows.items()):
            bv = row["before"]
            av = row["after"]
            dv = row["delta"]

            # Mirror the text renderer: 4 decimals for small floats.
            if isinstance(bv, float) and bv < 10:
                bv_s, av_s, dv_s = f"{bv:.4f}", f"{av:.4f}", f"{dv:+.4f}"
            else:
                bv_s, av_s, dv_s = f"{bv}", f"{av}", f"{dv:+}"

            md.append(
                f"| {name} | {bv_s} | {av_s} | {dv_s} | {row['direction']} |"
            )

    md.append("")
    return "\n".join(md)