claude_toolstack_cli-1.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- claude_toolstack_cli-1.0.0.dist-info/METADATA +354 -0
- claude_toolstack_cli-1.0.0.dist-info/RECORD +48 -0
- claude_toolstack_cli-1.0.0.dist-info/WHEEL +5 -0
- claude_toolstack_cli-1.0.0.dist-info/entry_points.txt +2 -0
- claude_toolstack_cli-1.0.0.dist-info/licenses/LICENSE +21 -0
- claude_toolstack_cli-1.0.0.dist-info/top_level.txt +1 -0
- cts/__init__.py +3 -0
- cts/__main__.py +5 -0
- cts/autopilot.py +633 -0
- cts/bundle.py +958 -0
- cts/cli.py +2858 -0
- cts/confidence.py +218 -0
- cts/config.py +19 -0
- cts/corpus/__init__.py +139 -0
- cts/corpus/apply.py +305 -0
- cts/corpus/archive.py +309 -0
- cts/corpus/baseline.py +294 -0
- cts/corpus/evaluate.py +409 -0
- cts/corpus/experiment_eval.py +585 -0
- cts/corpus/experiment_schema.py +380 -0
- cts/corpus/extract.py +353 -0
- cts/corpus/load.py +44 -0
- cts/corpus/model.py +114 -0
- cts/corpus/patch.py +467 -0
- cts/corpus/registry.py +420 -0
- cts/corpus/report.py +745 -0
- cts/corpus/scan.py +87 -0
- cts/corpus/store.py +63 -0
- cts/corpus/trends.py +478 -0
- cts/corpus/tuning_schema.py +313 -0
- cts/corpus/variants.py +335 -0
- cts/ctags.py +133 -0
- cts/diff_context.py +92 -0
- cts/errors.py +109 -0
- cts/http.py +89 -0
- cts/ranking.py +466 -0
- cts/render.py +388 -0
- cts/schema.py +96 -0
- cts/semantic/__init__.py +47 -0
- cts/semantic/candidates.py +150 -0
- cts/semantic/chunker.py +184 -0
- cts/semantic/config.py +120 -0
- cts/semantic/embedder.py +151 -0
- cts/semantic/indexer.py +159 -0
- cts/semantic/search.py +252 -0
- cts/semantic/store.py +330 -0
- cts/sidecar.py +431 -0
- cts/structural.py +305 -0
cts/corpus/experiment_eval.py
@@ -0,0 +1,585 @@
"""Experiment evaluation: assign runs to variants and pick a winner.

Implements the full experiment lifecycle:
1. **Assignment** — tag each corpus record with a variant name
2. **Per-variant KPIs** — extract KPIs for each variant's records
3. **Decision** — apply decision rules (primary KPI, constraints,
   tie-breakers) to determine the winner

Assignment modes:
- ``manual``: records already tagged (variant field present)
- ``repo_partition``: map repo → variant via a partition dict
- ``time_window``: map timestamp → variant via date ranges

Decision algorithm:
1. Check constraints — a variant with any constraint violation is
   eliminated (unless all variants violate the same constraint).
2. Compare primary KPI — the variant with the better value wins.
3. Tie-breakers — if primary KPI is within noise threshold, fall
   through to tie-breaker KPIs in order.
4. If still tied → verdict ``tie``.
"""

from __future__ import annotations

from typing import Any, Dict, List, Tuple

from cts.corpus.evaluate import extract_kpis

# ---------------------------------------------------------------------------
# Assignment: tag records with variant names
# ---------------------------------------------------------------------------

# KPI direction maps (higher = better or lower = better)
_HIGHER_BETTER = {
    "confidence_final_mean",
    "confidence_delta_mean",
    "semantic_lift_mean",
}
_LOWER_BETTER = {
    "truncation_rate",
    "autopilot_low_lift_rate",
    "bundle_bytes_p90",
    "should_autopilot_count",
}

# Noise thresholds — differences below these are not significant
_NOISE: Dict[str, float] = {
    "confidence_final_mean": 0.02,
    "confidence_delta_mean": 0.02,
    "truncation_rate": 0.02,
    "autopilot_low_lift_rate": 0.05,
    "bundle_bytes_p90": 5000,
    "should_autopilot_count": 1,
    "semantic_lift_mean": 0.01,
}


def assign_manual(
    records: List[Dict[str, Any]],
) -> Dict[str, List[Dict[str, Any]]]:
    """Group records by their existing ``variant`` field.

    Records without a ``variant`` field are placed in "unassigned".
    """
    groups: Dict[str, List[Dict[str, Any]]] = {}
    for rec in records:
        variant = rec.get("variant", "unassigned")
        groups.setdefault(variant, []).append(rec)
    return groups
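
# Editor's illustration (not part of the released file): a minimal sketch of
# assign_manual on hand-made records. The "repo" field below is invented for
# the example; only the "variant" key is consulted.
#
#   recs = [{"variant": "A", "repo": "x"}, {"repo": "y"}]
#   assign_manual(recs)
#   # -> {"A": [{"variant": "A", "repo": "x"}], "unassigned": [{"repo": "y"}]}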


def assign_repo_partition(
    records: List[Dict[str, Any]],
    partition: Dict[str, List[str]],
) -> Dict[str, List[Dict[str, Any]]]:
    """Assign records to variants based on repo membership.

    Args:
        records: Corpus records with a ``repo`` field.
        partition: Mapping ``{variant_name: [repo1, repo2, ...]}``.

    Returns:
        Dict of variant_name → list of records.
    """
    # Build reverse lookup: repo → variant
    repo_to_variant: Dict[str, str] = {}
    for variant, repos in partition.items():
        for repo in repos:
            repo_to_variant[repo] = variant

    groups: Dict[str, List[Dict[str, Any]]] = {}
    for rec in records:
        repo = rec.get("repo", "")
        variant = repo_to_variant.get(repo, "unassigned")
        groups.setdefault(variant, []).append(rec)
    return groups
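
# Editor's illustration (not part of the released file): assign_repo_partition
# with an invented partition mapping. A record whose repo appears in no list
# falls into "unassigned".
#
#   recs = [{"repo": "app"}, {"repo": "lib"}, {"repo": "docs"}]
#   assign_repo_partition(recs, {"control": ["app"], "treatment": ["lib"]})
#   # -> {"control": [{"repo": "app"}],
#   #     "treatment": [{"repo": "lib"}],
#   #     "unassigned": [{"repo": "docs"}]}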


def assign_time_window(
    records: List[Dict[str, Any]],
    windows: Dict[str, Tuple[float, float]],
) -> Dict[str, List[Dict[str, Any]]]:
    """Assign records to variants based on timestamp ranges.

    Args:
        records: Corpus records with a ``timestamp`` field.
        windows: Mapping ``{variant_name: (start_ts, end_ts)}``.
            Timestamps are epoch seconds. Intervals are half-open:
            ``start <= ts < end``.

    Returns:
        Dict of variant_name → list of records.
    """
    groups: Dict[str, List[Dict[str, Any]]] = {}
    for rec in records:
        ts = rec.get("timestamp", 0.0)
        assigned = "unassigned"
        for variant, (start, end) in windows.items():
            if start <= ts < end:
                assigned = variant
                break
        groups.setdefault(assigned, []).append(rec)
    return groups
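
# Editor's illustration (not part of the released file): time-window assignment
# with invented epoch timestamps. Windows are half-open, so a record whose
# timestamp equals one window's end boundary lands in the next window that
# starts there (or in "unassigned").
#
#   recs = [{"timestamp": 100.0}, {"timestamp": 200.0}, {"timestamp": 300.0}]
#   assign_time_window(recs, {"before": (0.0, 200.0), "after": (200.0, 400.0)})
#   # -> {"before": [{"timestamp": 100.0}],
#   #     "after": [{"timestamp": 200.0}, {"timestamp": 300.0}]}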


def assign_records(
    records: List[Dict[str, Any]],
    assignment: Dict[str, Any],
) -> Dict[str, List[Dict[str, Any]]]:
    """Route to the correct assignment function based on mode.

    Args:
        records: Corpus records.
        assignment: Assignment spec dict with ``mode`` and ``details``.

    Returns:
        Dict of variant_name → list of records.
    """
    mode = assignment.get("mode", "manual")
    details = assignment.get("details", {})

    if mode == "repo_partition":
        return assign_repo_partition(records, details)
    elif mode == "time_window":
        # Convert detail values to tuples
        windows: Dict[str, Tuple[float, float]] = {}
        for name, window in details.items():
            if isinstance(window, (list, tuple)) and len(window) == 2:
                windows[name] = (float(window[0]), float(window[1]))
        return assign_time_window(records, windows)
    else:
        return assign_manual(records)
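
# Editor's illustration (not part of the released file): the same records
# routed through assign_records using an assignment spec dict. The spec shape
# ({"mode": ..., "details": ...}) mirrors what this function reads; the repo
# names are invented. Any unrecognized mode falls back to manual grouping.
#
#   spec = {"mode": "repo_partition", "details": {"A": ["app"], "B": ["lib"]}}
#   assign_records([{"repo": "app"}, {"repo": "lib"}], spec)
#   # -> {"A": [{"repo": "app"}], "B": [{"repo": "lib"}]}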


# ---------------------------------------------------------------------------
# Decision logic
# ---------------------------------------------------------------------------


def _check_constraint(
    kpis: Dict[str, Any],
    constraint: Dict[str, Any],
) -> bool:
    """Check if a single constraint is satisfied.

    Returns True if constraint passes, False if violated.
    """
    kpi_name = constraint.get("kpi", "")
    operator = constraint.get("operator", "<=")
    threshold = constraint.get("threshold", 0.0)

    value = kpis.get(kpi_name)
    if value is None:
        return True  # Can't check — assume ok

    try:
        val = float(value)
        thr = float(threshold)
    except (ValueError, TypeError):
        return True

    if operator == "<=":
        return val <= thr
    elif operator == ">=":
        return val >= thr
    elif operator == "<":
        return val < thr
    elif operator == ">":
        return val > thr
    return True


def check_constraints(
    kpis: Dict[str, Any],
    constraints: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
    """Check all constraints against KPIs.

    Returns list of violation dicts (empty if all pass).
    """
    violations: List[Dict[str, Any]] = []
    for c in constraints:
        if not _check_constraint(kpis, c):
            violations.append(
                {
                    "kpi": c.get("kpi", ""),
                    "operator": c.get("operator", "<="),
                    "threshold": c.get("threshold", 0.0),
                    "actual": kpis.get(c.get("kpi", "")),
                }
            )
    return violations
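
# Editor's illustration (not part of the released file): a constraint that a
# KPI dict violates. The KPI values here are invented. A constraint whose KPI
# is missing from the dict is treated as satisfied.
#
#   kpis = {"truncation_rate": 0.10}
#   check_constraints(
#       kpis, [{"kpi": "truncation_rate", "operator": "<=", "threshold": 0.05}]
#   )
#   # -> [{"kpi": "truncation_rate", "operator": "<=",
#   #      "threshold": 0.05, "actual": 0.1}]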


def _compare_kpi(
    kpi_name: str,
    value_a: float,
    value_b: float,
) -> str:
    """Compare two KPI values accounting for direction and noise.

    Returns "a_wins", "b_wins", or "tie".
    """
    noise = _NOISE.get(kpi_name, 0.01)

    if abs(value_a - value_b) <= noise:
        return "tie"

    if kpi_name in _HIGHER_BETTER:
        return "a_wins" if value_a > value_b else "b_wins"
    elif kpi_name in _LOWER_BETTER:
        return "a_wins" if value_a < value_b else "b_wins"

    # Unknown direction — treat as higher=better
    return "a_wins" if value_a > value_b else "b_wins"


def pick_winner(
    variant_kpis: Dict[str, Dict[str, Any]],
    decision_rule: Dict[str, Any],
) -> Dict[str, Any]:
    """Apply decision rules to determine the experiment winner.

    Args:
        variant_kpis: Mapping ``{variant_name: kpi_dict}``.
        decision_rule: Decision rule with primary_kpi, constraints,
            tie_breakers.

    Returns:
        Result dict with winner, verdict, constraint info, and reasoning.
    """
    primary_kpi = decision_rule.get("primary_kpi", "confidence_final_mean")
    constraints = decision_rule.get("constraints", [])
    tie_breakers = decision_rule.get("tie_breakers", [])

    variant_names = sorted(variant_kpis.keys())

    # Need at least 2 variants to compare
    if len(variant_names) < 2:
        return {
            "winner": variant_names[0] if variant_names else None,
            "verdict": "insufficient_variants",
            "reasoning": f"Only {len(variant_names)} variant(s) — need at least 2",
            "per_variant": {},
        }

    # Check for data in each variant
    per_variant: Dict[str, Any] = {}
    for name in variant_names:
        kpis = variant_kpis[name]
        total = kpis.get("total", 0)
        violations = check_constraints(kpis, constraints)
        per_variant[name] = {
            "total": total,
            "kpis": kpis,
            "constraint_violations": violations,
            "eliminated": len(violations) > 0,
        }

    # Check if any variant has no data
    no_data = [n for n in variant_names if per_variant[n]["total"] == 0]
    if no_data:
        return {
            "winner": None,
            "verdict": "no_data",
            "reasoning": f"Variant(s) with no data: {', '.join(no_data)}",
            "per_variant": per_variant,
        }

    # Eliminate variants that violate constraints
    # (unless ALL variants violate the same constraint)
    surviving = [n for n in variant_names if not per_variant[n]["eliminated"]]

    if len(surviving) == 0:
        # All variants violate constraints — compare anyway but flag it
        surviving = list(variant_names)
        all_violated = True
    else:
        all_violated = False

    if len(surviving) == 1:
        return {
            "winner": surviving[0],
            "verdict": "winner_by_elimination",
            "reasoning": (
                f"{surviving[0]} is the only variant passing all constraints"
            ),
            "per_variant": per_variant,
        }

    # Compare primary KPI across surviving variants
    # For >2 variants, do pairwise from first surviving
    best = surviving[0]
    reasoning_parts: List[str] = []
    resolved_by_tiebreaker = False

    for challenger in surviving[1:]:
        best_val = per_variant[best]["kpis"].get(primary_kpi, 0)
        chal_val = per_variant[challenger]["kpis"].get(primary_kpi, 0)

        try:
            best_val = float(best_val)
            chal_val = float(chal_val)
        except (ValueError, TypeError):
            continue

        result = _compare_kpi(primary_kpi, best_val, chal_val)

        if result == "b_wins":
            # Build the message before promoting the challenger so it names
            # the variant that was actually beaten.
            reasoning_parts.append(
                f"{challenger} beats {best} on {primary_kpi} "
                f"({chal_val:.4f} vs {best_val:.4f})"
            )
            best = challenger
        elif result == "tie":
            # Try tie-breakers
            tie_broken = False
            for tb_kpi in tie_breakers:
                tb_best = per_variant[best]["kpis"].get(tb_kpi, 0)
                tb_chal = per_variant[challenger]["kpis"].get(tb_kpi, 0)
                try:
                    tb_best = float(tb_best)
                    tb_chal = float(tb_chal)
                except (ValueError, TypeError):
                    continue

                tb_result = _compare_kpi(tb_kpi, tb_best, tb_chal)
                if tb_result == "b_wins":
                    best = challenger
                    reasoning_parts.append(
                        f"Tied on {primary_kpi}, {challenger} wins tie-breaker {tb_kpi}"
                    )
                    tie_broken = True
                    resolved_by_tiebreaker = True
                    break
                elif tb_result == "a_wins":
                    reasoning_parts.append(
                        f"Tied on {primary_kpi}, {best} wins tie-breaker {tb_kpi}"
                    )
                    tie_broken = True
                    resolved_by_tiebreaker = True
                    break

            if not tie_broken:
                reasoning_parts.append(
                    f"Tied on {primary_kpi} between {best} and {challenger}"
                )
        else:
            # best stays
            reasoning_parts.append(
                f"{best} beats {challenger} on {primary_kpi} "
                f"({best_val:.4f} vs {chal_val:.4f})"
            )

    # Determine final verdict
    if resolved_by_tiebreaker:
        # Tie-breaker resolved it — trust the result
        verdict = "winner"
        winner = best
    else:
        # Check if the "best" is actually clearly better or tied
        all_kpi_vals = []
        for name in surviving:
            val = per_variant[name]["kpis"].get(primary_kpi, 0)
            try:
                all_kpi_vals.append((name, float(val)))
            except (ValueError, TypeError):
                all_kpi_vals.append((name, 0.0))

        noise = _NOISE.get(primary_kpi, 0.01)
        best_val_f = dict(all_kpi_vals).get(best, 0.0)
        all_within_noise = all(abs(v - best_val_f) <= noise for _, v in all_kpi_vals)

        if all_within_noise and len(surviving) > 1:
            verdict = "tie"
            winner = None
        else:
            verdict = "winner"
            winner = best

    if all_violated:
        verdict = f"{verdict}_all_constraints_violated"

    return {
        "winner": winner,
        "verdict": verdict,
        "reasoning": (
            "; ".join(reasoning_parts) if reasoning_parts else "No comparison needed"
        ),
        "per_variant": per_variant,
    }
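
# Editor's illustration (not part of the released file): a two-variant decision
# with invented KPI dicts. B clears the primary-KPI noise band (0.02) and both
# variants pass the single constraint, so B wins outright.
#
#   kpis = {
#       "A": {"total": 40, "confidence_final_mean": 0.78, "truncation_rate": 0.03},
#       "B": {"total": 38, "confidence_final_mean": 0.84, "truncation_rate": 0.04},
#   }
#   rule = {
#       "primary_kpi": "confidence_final_mean",
#       "constraints": [{"kpi": "truncation_rate", "operator": "<=", "threshold": 0.05}],
#       "tie_breakers": ["truncation_rate"],
#   }
#   pick_winner(kpis, rule)["winner"]   # -> "B"
#   pick_winner(kpis, rule)["verdict"]  # -> "winner"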


# ---------------------------------------------------------------------------
# Full experiment evaluation
# ---------------------------------------------------------------------------


def evaluate_experiment(
    records: List[Dict[str, Any]],
    experiment: Dict[str, Any],
) -> Dict[str, Any]:
    """End-to-end experiment evaluation.

    1. Assign records to variants.
    2. Extract KPIs per variant.
    3. Apply decision rules to pick a winner.

    Args:
        records: Corpus records (all variants mixed).
        experiment: Experiment envelope dict.

    Returns:
        Result dict with per-variant KPIs, winner, and reasoning.
    """
    assignment = experiment.get("assignment", {"mode": "manual"})
    decision_rule = experiment.get("decision_rule", {})
    variant_names = [
        v.get("name", f"V{i}") for i, v in enumerate(experiment.get("variants", []))
    ]

    # Assign records to variants
    groups = assign_records(records, assignment)

    # Extract KPIs per variant
    variant_kpis: Dict[str, Dict[str, Any]] = {}
    for name in variant_names:
        recs = groups.get(name, [])
        variant_kpis[name] = extract_kpis(recs)

    # Also capture unassigned if any
    unassigned = groups.get("unassigned", [])

    # Pick winner
    result = pick_winner(variant_kpis, decision_rule)

    return {
        "experiment_id": experiment.get("id", ""),
        "variant_count": len(variant_names),
        "total_records": len(records),
        "unassigned_records": len(unassigned),
        "per_variant": result["per_variant"],
        "winner": result["winner"],
        "verdict": result["verdict"],
        "reasoning": result["reasoning"],
        "decision_rule": decision_rule,
    }
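
# Editor's illustration (not part of the released file): an end-to-end sketch.
# The envelope below only uses fields this function reads (id, variants,
# assignment, decision_rule); the repo names are invented, and corpus_records
# stands for a list of corpus record dicts loaded elsewhere.
#
#   experiment = {
#       "id": "ranker-tuning-01",
#       "variants": [{"name": "control"}, {"name": "treatment"}],
#       "assignment": {
#           "mode": "repo_partition",
#           "details": {"control": ["app"], "treatment": ["lib"]},
#       },
#       "decision_rule": {
#           "primary_kpi": "confidence_final_mean",
#           "constraints": [],
#           "tie_breakers": [],
#       },
#   }
#   result = evaluate_experiment(corpus_records, experiment)
#   result["winner"], result["verdict"], result["reasoning"]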


# ---------------------------------------------------------------------------
# Renderers
# ---------------------------------------------------------------------------


def render_experiment_result_text(result: Dict[str, Any]) -> str:
    """Render experiment evaluation as plain text."""
    lines: List[str] = []
    lines.append("=" * 60)
    lines.append("EXPERIMENT EVALUATION")
    lines.append("=" * 60)
    lines.append("")
    lines.append(f"Experiment: {result.get('experiment_id', '?')}")
    lines.append(f"Records: {result.get('total_records', 0)}")
    lines.append(f"Unassigned: {result.get('unassigned_records', 0)}")
    lines.append(f"Verdict: {result.get('verdict', '?')}")
    lines.append(f"Winner: {result.get('winner') or 'none'}")
    lines.append(f"Reasoning: {result.get('reasoning', '')}")
    lines.append("")

    per_variant = result.get("per_variant", {})
    for name in sorted(per_variant):
        pv = per_variant[name]
        kpis = pv.get("kpis", {})
        lines.append(f"--- Variant: {name} ---")
        lines.append(f" Records: {pv.get('total', 0)}")
        lines.append(f" Eliminated: {pv.get('eliminated', False)}")
        violations = pv.get("constraint_violations", [])
        if violations:
            for v in violations:
                lines.append(
                    f" VIOLATION: {v['kpi']} {v['operator']} "
                    f"{v['threshold']} (actual: {v['actual']})"
                )
        for k, v in sorted(kpis.items()):
            if k != "total":
                lines.append(f" {k}: {v}")
        lines.append("")

    return "\n".join(lines)


def render_experiment_result_json(result: Dict[str, Any]) -> str:
    """Render experiment evaluation as JSON."""
    import json

    return json.dumps(result, indent=2, default=str)


def render_experiment_result_markdown(result: Dict[str, Any]) -> str:
    """Render experiment evaluation as Markdown."""
    lines: List[str] = []
    lines.append("# Experiment Evaluation")
    lines.append("")

    winner = result.get("winner") or "none"
    verdict = result.get("verdict", "?")
    lines.append(f"**Experiment:** {result.get('experiment_id', '?')}")
    lines.append(f"**Records:** {result.get('total_records', 0)}")
    lines.append(f"**Verdict:** {verdict}")
    lines.append(f"**Winner:** {winner}")
    lines.append(f"**Reasoning:** {result.get('reasoning', '')}")
    lines.append("")

    # KPI comparison table
    per_variant = result.get("per_variant", {})
    variant_names = sorted(per_variant.keys())
    if variant_names:
        # Collect all KPI names
        all_kpis = set()
        for pv in per_variant.values():
            all_kpis.update(pv.get("kpis", {}).keys())
        all_kpis.discard("total")
        kpi_names = sorted(all_kpis)

        # Header
        header = "| KPI |"
        sep = "|-----|"
        for name in variant_names:
            header += f" {name} |"
            sep += "------|"
        lines.append(header)
        lines.append(sep)

        # Records row
        row = "| Records |"
        for name in variant_names:
            row += f" {per_variant[name].get('total', 0)} |"
        lines.append(row)

        # KPI rows
        for kpi in kpi_names:
            row = f"| {kpi} |"
            for name in variant_names:
                val = per_variant[name].get("kpis", {}).get(kpi, "—")
                if isinstance(val, float):
                    row += f" {val:.4f} |"
                else:
                    row += f" {val} |"
            lines.append(row)

        lines.append("")

    # Constraint violations
    for name in variant_names:
        violations = per_variant[name].get("constraint_violations", [])
        if violations:
            lines.append(f"### ⚠ {name} — Constraint Violations")
            for v in violations:
                lines.append(
                    f"- `{v['kpi']}` {v['operator']} {v['threshold']} "
                    f"(actual: {v['actual']})"
                )
            lines.append("")

    return "\n".join(lines)
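
# Editor's illustration (not part of the released file): each renderer takes
# the dict returned by evaluate_experiment. The Markdown renderer, for example,
# emits a KPI table whose columns are the variant names (summary lines above
# the table omitted here; values are illustrative).
#
#   print(render_experiment_result_markdown(result))
#   # | KPI | control | treatment |
#   # |-----|------|------|
#   # | Records | 40 | 38 |
#   # | confidence_final_mean | 0.7800 | 0.8400 |
#   # ...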