claude-toolstack-cli 1.0.0 (claude_toolstack_cli-1.0.0-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- claude_toolstack_cli-1.0.0.dist-info/METADATA +354 -0
- claude_toolstack_cli-1.0.0.dist-info/RECORD +48 -0
- claude_toolstack_cli-1.0.0.dist-info/WHEEL +5 -0
- claude_toolstack_cli-1.0.0.dist-info/entry_points.txt +2 -0
- claude_toolstack_cli-1.0.0.dist-info/licenses/LICENSE +21 -0
- claude_toolstack_cli-1.0.0.dist-info/top_level.txt +1 -0
- cts/__init__.py +3 -0
- cts/__main__.py +5 -0
- cts/autopilot.py +633 -0
- cts/bundle.py +958 -0
- cts/cli.py +2858 -0
- cts/confidence.py +218 -0
- cts/config.py +19 -0
- cts/corpus/__init__.py +139 -0
- cts/corpus/apply.py +305 -0
- cts/corpus/archive.py +309 -0
- cts/corpus/baseline.py +294 -0
- cts/corpus/evaluate.py +409 -0
- cts/corpus/experiment_eval.py +585 -0
- cts/corpus/experiment_schema.py +380 -0
- cts/corpus/extract.py +353 -0
- cts/corpus/load.py +44 -0
- cts/corpus/model.py +114 -0
- cts/corpus/patch.py +467 -0
- cts/corpus/registry.py +420 -0
- cts/corpus/report.py +745 -0
- cts/corpus/scan.py +87 -0
- cts/corpus/store.py +63 -0
- cts/corpus/trends.py +478 -0
- cts/corpus/tuning_schema.py +313 -0
- cts/corpus/variants.py +335 -0
- cts/ctags.py +133 -0
- cts/diff_context.py +92 -0
- cts/errors.py +109 -0
- cts/http.py +89 -0
- cts/ranking.py +466 -0
- cts/render.py +388 -0
- cts/schema.py +96 -0
- cts/semantic/__init__.py +47 -0
- cts/semantic/candidates.py +150 -0
- cts/semantic/chunker.py +184 -0
- cts/semantic/config.py +120 -0
- cts/semantic/embedder.py +151 -0
- cts/semantic/indexer.py +159 -0
- cts/semantic/search.py +252 -0
- cts/semantic/store.py +330 -0
- cts/sidecar.py +431 -0
- cts/structural.py +305 -0
cts/corpus/experiment_schema.py
@@ -0,0 +1,380 @@
"""Experiment schema for A/B tuning experiments.

Defines a versioned envelope for controlled tuning experiments.
An experiment compares two (or more) candidate tuning variants
against a baseline corpus using defined KPIs and decision rules.

Consumers check ``experiment_schema_version`` to decide whether they
can parse the payload.

Assignment modes:
- ``time_window``: A runs during one date range, B during another
- ``repo_partition``: A runs on one set of repos, B on another
- ``manual``: the operator assigns runs explicitly

Decision rules:
- ``primary_kpi``: the KPI that decides the winner
- ``constraints``: conditions that must hold (e.g. the truncation
  rate must not worsen beyond a threshold)
- ``tie_breakers``: fallback KPIs if the primary KPI is tied
"""

from __future__ import annotations

import time
import uuid
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional

EXPERIMENT_SCHEMA_VERSION = 1

# Default KPIs tracked in experiments (same as evaluate.py)
DEFAULT_KPIS = [
    "confidence_final_mean",
    "confidence_delta_mean",
    "truncation_rate",
    "autopilot_low_lift_rate",
    "bundle_bytes_p90",
    "should_autopilot_count",
]

# Extended KPIs including semantic augmentation (Phase 4)
SEMANTIC_KPIS = DEFAULT_KPIS + [
    "semantic_invoked_rate",
    "semantic_action_rate",
    "semantic_lift_mean",
]

@dataclass
class VariantSpec:
    """Specification for one experiment variant (A, B, ...)."""

    name: str  # "A", "B", etc.
    tuning_ref: str = ""  # path to tuning JSON
    patch_ref: str = ""  # path to patch diff
    apply_plan: Dict[str, Any] = field(default_factory=dict)
    expected_effects: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        d: Dict[str, Any] = {
            "name": self.name,
            "tuning_ref": self.tuning_ref,
            "patch_ref": self.patch_ref,
        }
        if self.apply_plan:
            d["apply_plan"] = self.apply_plan
        if self.expected_effects:
            d["expected_effects"] = self.expected_effects
        return d


@dataclass
class DecisionRule:
    """How to pick the winner."""

    primary_kpi: str = "confidence_final_mean"
    constraints: List[Dict[str, Any]] = field(default_factory=list)
    tie_breakers: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        return {
            "primary_kpi": self.primary_kpi,
            "constraints": self.constraints,
            "tie_breakers": self.tie_breakers,
        }


@dataclass
class AssignmentSpec:
    """How runs are assigned to variants."""

    mode: str = "manual"  # time_window | repo_partition | manual
    details: Dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        return {
            "mode": self.mode,
            "details": self.details,
        }


@dataclass
class ExperimentEnvelope:
    """Top-level envelope for an A/B experiment."""

    experiment_schema_version: int = EXPERIMENT_SCHEMA_VERSION
    id: str = ""
    created_at: float = 0.0
    description: str = ""
    hypothesis: str = ""
    kpis: List[str] = field(default_factory=lambda: list(DEFAULT_KPIS))
    baseline: Dict[str, Any] = field(default_factory=dict)
    variants: List[VariantSpec] = field(default_factory=list)
    assignment: AssignmentSpec = field(default_factory=AssignmentSpec)
    decision_rule: DecisionRule = field(default_factory=DecisionRule)
    audit: Dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        return {
            "experiment_schema_version": self.experiment_schema_version,
            "id": self.id,
            "created_at": self.created_at,
            "description": self.description,
            "hypothesis": self.hypothesis,
            "kpis": self.kpis,
            "baseline": self.baseline,
            "variants": [v.to_dict() for v in self.variants],
            "assignment": self.assignment.to_dict(),
            "decision_rule": self.decision_rule.to_dict(),
            "audit": self.audit,
        }

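
# Illustrative sketch (editor's addition, not part of the shipped file):
# a hand-assembled envelope serializes to a plain, JSON-ready dict, with
# defaults filled in by the dataclass field factories above.
_demo = ExperimentEnvelope(
    id="exp-manual",
    created_at=time.time(),
    variants=[VariantSpec(name="A"), VariantSpec(name="B")],
).to_dict()
assert _demo["experiment_schema_version"] == 1
assert _demo["kpis"] == DEFAULT_KPIS
assert _demo["assignment"] == {"mode": "manual", "details": {}}
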

def parse_constraint(spec: str) -> Dict[str, Any]:
    """Parse a constraint string like ``truncation_rate<=+0.02``.

    Supported operators: ``<=``, ``>=``, ``<``, ``>``.

    Returns a dict with ``kpi``, ``operator``, and ``threshold``.
    """
    for op in ("<=", ">=", "<", ">"):
        if op in spec:
            parts = spec.split(op, 1)
            kpi = parts[0].strip()
            try:
                threshold = float(parts[1].strip())
            except ValueError:
                threshold = 0.0
            return {
                "kpi": kpi,
                "operator": op,
                "threshold": threshold,
            }
    return {"kpi": spec, "operator": "<=", "threshold": 0.0}

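
# Illustrative sketch (editor's addition, not part of the shipped file):
# the parser splits on the first operator it finds; a missing operator or
# non-numeric right-hand side degrades to a 0.0 threshold.
assert parse_constraint("truncation_rate<=+0.02") == {
    "kpi": "truncation_rate",
    "operator": "<=",
    "threshold": 0.02,
}
assert parse_constraint("no operator here") == {
    "kpi": "no operator here",
    "operator": "<=",
    "threshold": 0.0,
}
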

def create_experiment(
    *,
    id: str = "",
    description: str = "",
    hypothesis: str = "",
    variant_names: Optional[List[str]] = None,
    primary_kpi: str = "confidence_final_mean",
    constraints: Optional[List[str]] = None,
    assignment_mode: str = "manual",
) -> ExperimentEnvelope:
    """Create a new experiment envelope with sensible defaults.

    Args:
        id: Experiment ID (auto-generated if empty).
        description: What the experiment tests.
        hypothesis: Expected outcome.
        variant_names: List of variant names (default: ["A", "B"]).
        primary_kpi: KPI that decides the winner.
        constraints: Constraint strings (e.g. "truncation_rate<=+0.02").
        assignment_mode: How to assign runs (manual/time_window/repo_partition).

    Returns:
        A fully-specified ExperimentEnvelope.
    """
    if not id:
        id = f"exp-{uuid.uuid4().hex[:8]}"

    names = variant_names or ["A", "B"]
    variants = [VariantSpec(name=n) for n in names]

    parsed_constraints = []
    if constraints:
        for c in constraints:
            parsed_constraints.append(parse_constraint(c))

    return ExperimentEnvelope(
        id=id,
        created_at=time.time(),
        description=description,
        hypothesis=hypothesis,
        variants=variants,
        assignment=AssignmentSpec(mode=assignment_mode),
        decision_rule=DecisionRule(
            primary_kpi=primary_kpi,
            constraints=parsed_constraints,
        ),
    )

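
# Illustrative sketch (editor's addition, not part of the shipped file):
# the typical entry point. Constraint strings are parsed into structured
# dicts on the decision rule, and the id is auto-generated when omitted.
_exp = create_experiment(
    description="Tighter ranking cutoff",
    hypothesis="Cutoff raises confidence without extra truncation",
    constraints=["truncation_rate<=+0.02"],
)
assert _exp.id.startswith("exp-")
assert _exp.decision_rule.constraints[0]["operator"] == "<="
assert [v.name for v in _exp.variants] == ["A", "B"]
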

def create_semantic_experiment(
    *,
    id: str = "",
    description: str = "",
    hypothesis: str = "",
    assignment_mode: str = "manual",
) -> ExperimentEnvelope:
    """Create a Phase 4 semantic A/B experiment.

    Template: A = lexical-only baseline, B = lexical + semantic_fallback.

    Uses SEMANTIC_KPIS (which include semantic_lift_mean) and applies
    constraints to guard against truncation and bundle-size regressions.

    Args:
        id: Experiment ID (auto-generated if empty).
        description: What the experiment tests.
        hypothesis: Expected outcome.
        assignment_mode: How to assign runs.

    Returns:
        A fully-specified ExperimentEnvelope for semantic comparison.
    """
    if not id:
        id = f"exp-semantic-{uuid.uuid4().hex[:8]}"

    if not description:
        description = "Compare lexical-only (A) vs lexical + semantic_fallback (B)"

    if not hypothesis:
        hypothesis = (
            "Semantic fallback improves confidence_final_mean "
            "when lexical search produces sparse matches"
        )

    variants = [
        VariantSpec(
            name="A",
            expected_effects=["Baseline: lexical search only"],
        ),
        VariantSpec(
            name="B",
            expected_effects=[
                "semantic_fallback action enabled in autopilot",
                "Expected: higher confidence when matches are sparse",
            ],
        ),
    ]

    return ExperimentEnvelope(
        id=id,
        created_at=time.time(),
        description=description,
        hypothesis=hypothesis,
        kpis=list(SEMANTIC_KPIS),
        variants=variants,
        assignment=AssignmentSpec(mode=assignment_mode),
        decision_rule=DecisionRule(
            primary_kpi="confidence_final_mean",
            constraints=[
                {"kpi": "truncation_rate", "operator": "<=", "threshold": 0.02},
                {"kpi": "bundle_bytes_p90", "operator": "<=", "threshold": 5000},
            ],
            tie_breakers=["semantic_lift_mean", "confidence_delta_mean"],
        ),
    )

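
# Illustrative sketch (editor's addition, not part of the shipped file):
# the semantic template pins its guardrails up front, so the serialized
# decision rule documents the constraints alongside the primary KPI.
_sem = create_semantic_experiment().to_dict()
assert _sem["decision_rule"]["primary_kpi"] == "confidence_final_mean"
assert _sem["decision_rule"]["tie_breakers"][0] == "semantic_lift_mean"
assert _sem["kpis"] == SEMANTIC_KPIS
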

def create_narrowing_experiment(
    *,
    id: str = "",
    description: str = "",
    hypothesis: str = "",
    assignment_mode: str = "manual",
) -> ExperimentEnvelope:
    """Create a Phase 4.2 narrowing A/B experiment.

    Template: A = semantic_fallback (baseline), B = semantic_fallback
    + candidate narrowing (exclude_top_k).

    The primary KPI is semantic_lift_mean: we want to verify that
    narrowing preserves lift while reducing latency.

    Args:
        id: Experiment ID (auto-generated if empty).
        description: What the experiment tests.
        hypothesis: Expected outcome.
        assignment_mode: How to assign runs.

    Returns:
        A fully-specified ExperimentEnvelope for narrowing comparison.
    """
    if not id:
        id = f"exp-narrowing-{uuid.uuid4().hex[:8]}"

    if not description:
        description = (
            "Compare semantic_fallback without narrowing (A) "
            "vs semantic_fallback with exclude_top_k narrowing (B)"
        )

    if not hypothesis:
        hypothesis = (
            "Candidate narrowing reduces semantic_time_ms p90 "
            "without materially reducing semantic_lift_mean"
        )

    variants = [
        VariantSpec(
            name="A",
            expected_effects=[
                "Baseline: semantic_fallback searches all chunks",
            ],
        ),
        VariantSpec(
            name="B",
            expected_effects=[
                "exclude_top_k=10 narrowing enabled",
                "Expected: lower latency, comparable lift",
            ],
        ),
    ]

    return ExperimentEnvelope(
        id=id,
        created_at=time.time(),
        description=description,
        hypothesis=hypothesis,
        kpis=list(SEMANTIC_KPIS),
        variants=variants,
        assignment=AssignmentSpec(mode=assignment_mode),
        decision_rule=DecisionRule(
            primary_kpi="semantic_lift_mean",
            constraints=[
                {
                    "kpi": "truncation_rate",
                    "operator": "<=",
                    "threshold": 0.02,
                },
            ],
            tie_breakers=["confidence_final_mean", "confidence_delta_mean"],
        ),
    )


def validate_experiment(data: Dict[str, Any]) -> List[str]:
    """Validate an experiment envelope dict.

    Returns a list of error strings (empty = valid).
    """
    errors: List[str] = []

    if data.get("experiment_schema_version") != EXPERIMENT_SCHEMA_VERSION:
        errors.append(
            "Unsupported schema version: "
            f"{data.get('experiment_schema_version')} "
            f"(expected {EXPERIMENT_SCHEMA_VERSION})"
        )

    if not data.get("id"):
        errors.append("Missing experiment id")

    variants = data.get("variants", [])
    if len(variants) < 2:
        errors.append(f"Need at least 2 variants, got {len(variants)}")

    names = [v.get("name", "") for v in variants]
    if len(names) != len(set(names)):
        errors.append("Duplicate variant names")

    dr = data.get("decision_rule", {})
    if not dr.get("primary_kpi"):
        errors.append("Missing decision_rule.primary_kpi")

    return errors
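
# Illustrative sketch (editor's addition, not part of the shipped file):
# a round trip through the helpers above. A freshly created experiment
# validates cleanly, while an empty dict reports every missing piece
# (schema version, id, variant count, primary KPI).
assert validate_experiment(create_experiment(description="smoke").to_dict()) == []
assert len(validate_experiment({})) == 4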