claude-toolstack-cli 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. claude_toolstack_cli-1.0.0.dist-info/METADATA +354 -0
  2. claude_toolstack_cli-1.0.0.dist-info/RECORD +48 -0
  3. claude_toolstack_cli-1.0.0.dist-info/WHEEL +5 -0
  4. claude_toolstack_cli-1.0.0.dist-info/entry_points.txt +2 -0
  5. claude_toolstack_cli-1.0.0.dist-info/licenses/LICENSE +21 -0
  6. claude_toolstack_cli-1.0.0.dist-info/top_level.txt +1 -0
  7. cts/__init__.py +3 -0
  8. cts/__main__.py +5 -0
  9. cts/autopilot.py +633 -0
  10. cts/bundle.py +958 -0
  11. cts/cli.py +2858 -0
  12. cts/confidence.py +218 -0
  13. cts/config.py +19 -0
  14. cts/corpus/__init__.py +139 -0
  15. cts/corpus/apply.py +305 -0
  16. cts/corpus/archive.py +309 -0
  17. cts/corpus/baseline.py +294 -0
  18. cts/corpus/evaluate.py +409 -0
  19. cts/corpus/experiment_eval.py +585 -0
  20. cts/corpus/experiment_schema.py +380 -0
  21. cts/corpus/extract.py +353 -0
  22. cts/corpus/load.py +44 -0
  23. cts/corpus/model.py +114 -0
  24. cts/corpus/patch.py +467 -0
  25. cts/corpus/registry.py +420 -0
  26. cts/corpus/report.py +745 -0
  27. cts/corpus/scan.py +87 -0
  28. cts/corpus/store.py +63 -0
  29. cts/corpus/trends.py +478 -0
  30. cts/corpus/tuning_schema.py +313 -0
  31. cts/corpus/variants.py +335 -0
  32. cts/ctags.py +133 -0
  33. cts/diff_context.py +92 -0
  34. cts/errors.py +109 -0
  35. cts/http.py +89 -0
  36. cts/ranking.py +466 -0
  37. cts/render.py +388 -0
  38. cts/schema.py +96 -0
  39. cts/semantic/__init__.py +47 -0
  40. cts/semantic/candidates.py +150 -0
  41. cts/semantic/chunker.py +184 -0
  42. cts/semantic/config.py +120 -0
  43. cts/semantic/embedder.py +151 -0
  44. cts/semantic/indexer.py +159 -0
  45. cts/semantic/search.py +252 -0
  46. cts/semantic/store.py +330 -0
  47. cts/sidecar.py +431 -0
  48. cts/structural.py +305 -0
@@ -0,0 +1,380 @@
1
+ """Experiment schema for A/B tuning experiments.
2
+
3
+ Defines a versioned envelope for controlled tuning experiments.
4
+ An experiment compares two (or more) candidate tuning variants
5
+ against a baseline corpus using defined KPIs and decision rules.
6
+
7
+ Consumers check ``experiment_schema_version`` to decide if they
8
+ can parse the payload.
9
+
10
+ Assignment modes:
11
+ - ``time_window``: A runs for date range 1, B for date range 2
12
+ - ``repo_partition``: A runs on set of repos, B on another set
13
+ - ``manual``: operator assigns explicitly
14
+
15
+ Decision rules:
16
+ - ``primary_kpi``: the KPI that decides the winner
17
+ - ``constraints``: conditions that must hold (e.g. truncation
18
+ rate must not worsen beyond a threshold)
19
+ - ``tie_breakers``: fallback KPIs if primary is tied
20
+ """
21
+
22
+ from __future__ import annotations
23
+
24
+ import time
25
+ import uuid
26
+ from dataclasses import dataclass, field
27
+ from typing import Any, Dict, List, Optional
28
+
29
+ EXPERIMENT_SCHEMA_VERSION = 1
30
+
31
+ # Default KPIs tracked in experiments (same as evaluate.py)
32
+ DEFAULT_KPIS = [
33
+ "confidence_final_mean",
34
+ "confidence_delta_mean",
35
+ "truncation_rate",
36
+ "autopilot_low_lift_rate",
37
+ "bundle_bytes_p90",
38
+ "should_autopilot_count",
39
+ ]
40
+
41
+ # Extended KPIs including semantic augmentation (Phase 4)
42
+ SEMANTIC_KPIS = DEFAULT_KPIS + [
43
+ "semantic_invoked_rate",
44
+ "semantic_action_rate",
45
+ "semantic_lift_mean",
46
+ ]
47
+
48
+
49
@dataclass
class VariantSpec:
    """Specification for one experiment variant (A, B, ...)."""

    name: str  # variant label, e.g. "A" or "B"
    tuning_ref: str = ""  # path to tuning JSON
    patch_ref: str = ""  # path to patch diff
    apply_plan: Dict[str, Any] = field(default_factory=dict)
    expected_effects: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a plain dict; empty optional fields are omitted."""
        payload: Dict[str, Any] = {
            "name": self.name,
            "tuning_ref": self.tuning_ref,
            "patch_ref": self.patch_ref,
        }
        # Optional collections only appear in the payload when non-empty.
        for key, value in (
            ("apply_plan", self.apply_plan),
            ("expected_effects", self.expected_effects),
        ):
            if value:
                payload[key] = value
        return payload
70
+
71
+
72
@dataclass
class DecisionRule:
    """How to pick the winner."""

    primary_kpi: str = "confidence_final_mean"  # KPI that decides the winner
    constraints: List[Dict[str, Any]] = field(default_factory=list)
    tie_breakers: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Serialize every field to a plain dict."""
        return dict(
            primary_kpi=self.primary_kpi,
            constraints=self.constraints,
            tie_breakers=self.tie_breakers,
        )
86
+
87
+
88
@dataclass
class AssignmentSpec:
    """How runs are assigned to variants."""

    # One of: time_window | repo_partition | manual
    mode: str = "manual"
    details: Dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a plain dict."""
        return {"mode": self.mode, "details": self.details}
100
+
101
+
102
@dataclass
class ExperimentEnvelope:
    """Top-level envelope for an A/B experiment."""

    experiment_schema_version: int = EXPERIMENT_SCHEMA_VERSION
    id: str = ""
    created_at: float = 0.0  # epoch seconds
    description: str = ""
    hypothesis: str = ""
    kpis: List[str] = field(default_factory=lambda: list(DEFAULT_KPIS))
    baseline: Dict[str, Any] = field(default_factory=dict)
    variants: List[VariantSpec] = field(default_factory=list)
    assignment: AssignmentSpec = field(default_factory=AssignmentSpec)
    decision_rule: DecisionRule = field(default_factory=DecisionRule)
    audit: Dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the envelope, recursing into nested spec objects."""
        payload: Dict[str, Any] = {
            "experiment_schema_version": self.experiment_schema_version,
            "id": self.id,
            "created_at": self.created_at,
            "description": self.description,
            "hypothesis": self.hypothesis,
            "kpis": self.kpis,
            "baseline": self.baseline,
        }
        # Nested specs serialize through their own to_dict() methods.
        payload["variants"] = [variant.to_dict() for variant in self.variants]
        payload["assignment"] = self.assignment.to_dict()
        payload["decision_rule"] = self.decision_rule.to_dict()
        payload["audit"] = self.audit
        return payload
132
+
133
+
134
def parse_constraint(spec: str) -> Dict[str, Any]:
    """Parse a constraint string like ``truncation_rate<=+0.02``.

    Supported operators: ``<=``, ``>=``, ``<``, ``>``.

    Returns dict with kpi, operator, threshold.
    """
    # Two-character operators are checked first so "<=" is not
    # mistaken for "<".
    for candidate in ("<=", ">=", "<", ">"):
        if candidate not in spec:
            continue
        kpi_part, _, value_part = spec.partition(candidate)
        try:
            threshold = float(value_part.strip())
        except ValueError:
            # Unparsable threshold degrades to 0.0 rather than failing.
            threshold = 0.0
        return {
            "kpi": kpi_part.strip(),
            "operator": candidate,
            "threshold": threshold,
        }
    # No operator present: treat the whole string as the KPI name.
    return {"kpi": spec, "operator": "<=", "threshold": 0.0}
155
+
156
+
157
def create_experiment(
    *,
    id: str = "",
    description: str = "",
    hypothesis: str = "",
    variant_names: Optional[List[str]] = None,
    primary_kpi: str = "confidence_final_mean",
    constraints: Optional[List[str]] = None,
    assignment_mode: str = "manual",
) -> ExperimentEnvelope:
    """Create a new experiment envelope with sensible defaults.

    Args:
        id: Experiment ID (auto-generated if empty).
        description: What the experiment tests.
        hypothesis: Expected outcome.
        variant_names: List of variant names (default: ["A", "B"]).
        primary_kpi: KPI that decides the winner.
        constraints: Constraint strings (e.g. "truncation_rate<=+0.02").
        assignment_mode: How to assign runs (manual/time_window/repo_partition).

    Returns:
        A fully-specified ExperimentEnvelope.
    """
    # Auto-generate a short unique id when the caller supplied none.
    experiment_id = id or f"exp-{uuid.uuid4().hex[:8]}"

    variants = [
        VariantSpec(name=label) for label in (variant_names or ["A", "B"])
    ]
    parsed_constraints = [parse_constraint(raw) for raw in (constraints or [])]

    return ExperimentEnvelope(
        id=experiment_id,
        created_at=time.time(),
        description=description,
        hypothesis=hypothesis,
        variants=variants,
        assignment=AssignmentSpec(mode=assignment_mode),
        decision_rule=DecisionRule(
            primary_kpi=primary_kpi,
            constraints=parsed_constraints,
        ),
    )
204
+
205
+
206
def create_semantic_experiment(
    *,
    id: str = "",
    description: str = "",
    hypothesis: str = "",
    assignment_mode: str = "manual",
) -> ExperimentEnvelope:
    """Create a Phase 4 semantic A/B experiment.

    Template: A = lexical-only baseline, B = lexical + semantic_fallback.

    Uses SEMANTIC_KPIS (includes semantic_lift_mean) and applies
    a constraint to prevent truncation regression.

    Args:
        id: Experiment ID (auto-generated if empty).
        description: What the experiment tests.
        hypothesis: Expected outcome.
        assignment_mode: How to assign runs.

    Returns:
        A fully-specified ExperimentEnvelope for semantic comparison.
    """
    # Fill in template defaults for any field the caller left empty.
    experiment_id = id or f"exp-semantic-{uuid.uuid4().hex[:8]}"
    summary = (
        description
        or "Compare lexical-only (A) vs lexical + semantic_fallback (B)"
    )
    expected = hypothesis or (
        "Semantic fallback improves confidence_final_mean "
        "when lexical search produces sparse matches"
    )

    baseline_variant = VariantSpec(
        name="A",
        expected_effects=["Baseline: lexical search only"],
    )
    semantic_variant = VariantSpec(
        name="B",
        expected_effects=[
            "semantic_fallback action enabled in autopilot",
            "Expected: higher confidence when matches are sparse",
        ],
    )

    # Guardrails: the semantic variant must not regress truncation
    # or blow up bundle size.
    rule = DecisionRule(
        primary_kpi="confidence_final_mean",
        constraints=[
            {"kpi": "truncation_rate", "operator": "<=", "threshold": 0.02},
            {"kpi": "bundle_bytes_p90", "operator": "<=", "threshold": 5000},
        ],
        tie_breakers=["semantic_lift_mean", "confidence_delta_mean"],
    )

    return ExperimentEnvelope(
        id=experiment_id,
        created_at=time.time(),
        description=summary,
        hypothesis=expected,
        kpis=list(SEMANTIC_KPIS),
        variants=[baseline_variant, semantic_variant],
        assignment=AssignmentSpec(mode=assignment_mode),
        decision_rule=rule,
    )
272
+
273
+
274
def create_narrowing_experiment(
    *,
    id: str = "",
    description: str = "",
    hypothesis: str = "",
    assignment_mode: str = "manual",
) -> ExperimentEnvelope:
    """Create a Phase 4.2 narrowing A/B experiment.

    Template: A = semantic_fallback (baseline), B = semantic_fallback
    + candidate narrowing (exclude_top_k).

    Primary KPI is semantic_lift_mean — we want to verify narrowing
    preserves lift while reducing latency.

    Args:
        id: Experiment ID (auto-generated if empty).
        description: What the experiment tests.
        hypothesis: Expected outcome.
        assignment_mode: How to assign runs.

    Returns:
        A fully-specified ExperimentEnvelope for narrowing comparison.
    """
    # Fill in template defaults for any field the caller left empty.
    experiment_id = id or f"exp-narrowing-{uuid.uuid4().hex[:8]}"
    summary = description or (
        "Compare semantic_fallback without narrowing (A) "
        "vs semantic_fallback with exclude_top_k narrowing (B)"
    )
    expected = hypothesis or (
        "Candidate narrowing reduces semantic_time_ms p90 "
        "without materially reducing semantic_lift_mean"
    )

    baseline_variant = VariantSpec(
        name="A",
        expected_effects=[
            "Baseline: semantic_fallback searches all chunks",
        ],
    )
    narrowed_variant = VariantSpec(
        name="B",
        expected_effects=[
            "exclude_top_k=10 narrowing enabled",
            "Expected: lower latency, comparable lift",
        ],
    )

    # Narrowing must preserve lift (primary KPI) without regressing
    # the truncation rate.
    rule = DecisionRule(
        primary_kpi="semantic_lift_mean",
        constraints=[
            {
                "kpi": "truncation_rate",
                "operator": "<=",
                "threshold": 0.02,
            },
        ],
        tie_breakers=["confidence_final_mean", "confidence_delta_mean"],
    )

    return ExperimentEnvelope(
        id=experiment_id,
        created_at=time.time(),
        description=summary,
        hypothesis=expected,
        kpis=list(SEMANTIC_KPIS),
        variants=[baseline_variant, narrowed_variant],
        assignment=AssignmentSpec(mode=assignment_mode),
        decision_rule=rule,
    )
349
+
350
+
351
def validate_experiment(data: Dict[str, Any]) -> List[str]:
    """Validate an experiment envelope dict.

    Returns a list of error strings (empty = valid).

    Fix: the validator previously raised (AttributeError/TypeError)
    instead of reporting an error when ``variants`` was not a list,
    a variant entry was not a dict, or ``decision_rule`` was not a
    dict. A validator should collect errors, never crash on
    malformed input. Error messages for well-formed input are
    unchanged.
    """
    errors: List[str] = []

    if data.get("experiment_schema_version") != EXPERIMENT_SCHEMA_VERSION:
        errors.append(
            f"Unsupported schema version: "
            f"{data.get('experiment_schema_version')} "
            f"(expected {EXPERIMENT_SCHEMA_VERSION})"
        )

    if not data.get("id"):
        errors.append("Missing experiment id")

    variants = data.get("variants", [])
    if not isinstance(variants, list):
        errors.append("variants must be a list")
        variants = []  # continue validating what we can
    if len(variants) < 2:
        errors.append(f"Need at least 2 variants, got {len(variants)}")

    # Collect names only from well-formed (dict) entries; flag the rest.
    names: List[str] = []
    for v in variants:
        if isinstance(v, dict):
            names.append(v.get("name", ""))
        else:
            errors.append("Variant entries must be objects")
    if len(names) != len(set(names)):
        errors.append("Duplicate variant names")

    dr = data.get("decision_rule", {})
    if not isinstance(dr, dict) or not dr.get("primary_kpi"):
        errors.append("Missing decision_rule.primary_kpi")

    return errors