harness-evolver 4.3.1 → 4.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +1 -1
- package/agents/evolver-evaluator.md +18 -1
- package/agents/evolver-testgen.md +5 -3
- package/package.json +1 -1
- package/skills/evolve/SKILL.md +74 -10
- package/skills/status/SKILL.md +15 -19
- package/tools/constraint_check.py +154 -0
- package/tools/dataset_health.py +31 -0
- package/tools/evolution_chart.py +326 -0
- package/tools/mine_sessions.py +150 -0
- package/tools/read_results.py +110 -4
- package/tools/secret_filter.py +97 -0
- package/tools/seed_from_traces.py +15 -0
- package/tools/setup.py +17 -1
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "harness-evolver",
|
|
3
3
|
"description": "LangSmith-native autonomous agent optimization — evolves LLM agent code using multi-agent proposers, LangSmith experiments, and git worktrees",
|
|
4
|
-
"version": "4.
|
|
4
|
+
"version": "4.5.0",
|
|
5
5
|
"author": {
|
|
6
6
|
"name": "Raphael Valdetaro"
|
|
7
7
|
},
|
|
@@ -85,7 +85,24 @@ For each run, apply the requested evaluators. The evaluators you may be asked to
|
|
|
85
85
|
#### correctness
|
|
86
86
|
Judge: **Is the output a correct, accurate, and complete response to the input?**
|
|
87
87
|
|
|
88
|
-
|
|
88
|
+
**Rubric-aware scoring:** Some dataset examples have an `expected_behavior` rubric in their metadata. Before scoring, fetch example metadata:
|
|
89
|
+
|
|
90
|
+
```bash
|
|
91
|
+
langsmith-cli --json examples list \
|
|
92
|
+
--dataset "{dataset_name}" \
|
|
93
|
+
--fields id,metadata \
|
|
94
|
+
--limit 200 \
|
|
95
|
+
--output example_metadata.jsonl
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
Build a map of `reference_example_id → expected_behavior`. When scoring a run whose example has a rubric, evaluate against the rubric criteria specifically.
|
|
99
|
+
|
|
100
|
+
**With rubric:**
|
|
101
|
+
- `1.0` — Response satisfies all criteria in the rubric
|
|
102
|
+
- `0.5` — Response partially satisfies the rubric (some criteria met, others missing)
|
|
103
|
+
- `0.0` — Response fails to meet the rubric criteria
|
|
104
|
+
|
|
105
|
+
**Without rubric** (generic scoring):
|
|
89
106
|
- `1.0` — Correct and complete. The response accurately addresses the input.
|
|
90
107
|
- `0.0` — Incorrect, incomplete, or off-topic.
|
|
91
108
|
|
|
@@ -37,16 +37,18 @@ Do NOT copy production inputs verbatim — generate VARIATIONS.
|
|
|
37
37
|
|
|
38
38
|
### Phase 3: Generate Inputs
|
|
39
39
|
|
|
40
|
-
Generate 30 test inputs as a JSON file:
|
|
40
|
+
Generate 30 test inputs as a JSON file. Each example MUST include an `expected_behavior` rubric — a description of what a correct response should cover (NOT exact expected text):
|
|
41
41
|
|
|
42
42
|
```json
|
|
43
43
|
[
|
|
44
|
-
{"input": "
|
|
45
|
-
{"input": "
|
|
44
|
+
{"input": "What is Kotlin?", "expected_behavior": "Should explain Kotlin is a JVM language by JetBrains, mention null safety, and reference Android development as primary use case", "difficulty": "easy", "category": "knowledge"},
|
|
45
|
+
{"input": "Calculate 2^32", "expected_behavior": "Should return 4294967296, showing the calculation step", "difficulty": "easy", "category": "calculation"},
|
|
46
46
|
...
|
|
47
47
|
]
|
|
48
48
|
```
|
|
49
49
|
|
|
50
|
+
The `expected_behavior` is a **rubric**, not exact text. The LLM judge uses it to score responses. Write 1-3 specific, verifiable criteria per example.
|
|
51
|
+
|
|
50
52
|
Distribution:
|
|
51
53
|
- **40% Standard** (12): typical, well-formed inputs
|
|
52
54
|
- **20% Edge Cases** (6): boundary conditions, minimal inputs
|
package/package.json
CHANGED
package/skills/evolve/SKILL.md
CHANGED
|
@@ -189,6 +189,14 @@ wait # Wait for all data gathering to complete
|
|
|
189
189
|
```
|
|
190
190
|
|
|
191
191
|
If `best_results.json` exists, parse it to find failing examples (score < 0.7). Group by metadata or error pattern.
|
|
192
|
+
**For each failing example, include the judge's feedback comment** (from the `feedback` field) in the strategy. This gives proposers specific, actionable information about WHY examples fail:
|
|
193
|
+
|
|
194
|
+
```
|
|
195
|
+
## Failing Examples (with judge feedback)
|
|
196
|
+
- "What is Kotlin?" (score: 0.3) — Judge: "Response was factually correct but missed null safety and Android development use cases"
|
|
197
|
+
- "Calculate 2^32" (score: 0.0) — Judge: "Run failed with timeout error"
|
|
198
|
+
```
|
|
199
|
+
|
|
192
200
|
This failure data feeds into the strategy and lens generation step (1.8a).
|
|
193
201
|
If no best_results.json (first iteration without baseline), all proposers work from code analysis only — no failure data available.
|
|
194
202
|
|
|
@@ -373,7 +381,7 @@ Then spawn ONE evaluator agent that scores ALL candidates in a single pass. This
|
|
|
373
381
|
Agent(
|
|
374
382
|
subagent_type: "evolver-evaluator",
|
|
375
383
|
description: "Evaluate all candidates for iteration v{NNN}",
|
|
376
|
-
prompt: "Experiments to evaluate: {comma-separated experiment names from non-abstained proposers}. Evaluators: {llm_evaluator_list}. Framework: {framework}. Entry point: {entry_point}."
|
|
384
|
+
prompt: "Experiments to evaluate: {comma-separated experiment names from non-abstained proposers}. Evaluators: {llm_evaluator_list}. Framework: {framework}. Entry point: {entry_point}. Dataset: {dataset_name}. NOTE: Some examples have expected_behavior rubrics in their metadata — fetch example metadata and use rubrics for scoring when available."
|
|
377
385
|
)
|
|
378
386
|
```
|
|
379
387
|
|
|
@@ -385,17 +393,47 @@ Wait for the evaluator agent to complete before proceeding.
|
|
|
385
393
|
$EVOLVER_PY $TOOLS/read_results.py \
|
|
386
394
|
--experiments "{comma-separated list of experiment names from non-abstained proposers}" \
|
|
387
395
|
--config .evolver.json \
|
|
396
|
+
--split held_out \
|
|
388
397
|
--output comparison.json
|
|
389
398
|
```
|
|
390
399
|
|
|
391
400
|
Parse `comparison.json`:
|
|
392
|
-
- `comparison.winner` — highest combined score
|
|
401
|
+
- `comparison.winner` — highest combined score **on held-out data** (never seen during optimization)
|
|
393
402
|
- `comparison.champion` — per-task champion (for next iteration's context)
|
|
403
|
+
- `comparison.pareto_front` — non-dominated candidates across evaluators (if >1, report tradeoffs)
|
|
394
404
|
- `comparison.all_candidates` — all scores for reporting
|
|
395
405
|
|
|
406
|
+
If `comparison.pareto_front` has more than 1 entry, report it:
|
|
407
|
+
```
|
|
408
|
+
Pareto front ({N} non-dominated candidates):
|
|
409
|
+
v{NNN}-1: {evaluator_scores} (winner by combined score)
|
|
410
|
+
v{NNN}-3: {evaluator_scores} (different tradeoff)
|
|
411
|
+
```
|
|
412
|
+
|
|
413
|
+
### 4.5. Constraint Gate
|
|
414
|
+
|
|
415
|
+
Before merging, validate the winner passes hard constraints:
|
|
416
|
+
|
|
417
|
+
```bash
|
|
418
|
+
$EVOLVER_PY $TOOLS/constraint_check.py \
|
|
419
|
+
--config .evolver.json \
|
|
420
|
+
--worktree-path "{winner_worktree_path}" \
|
|
421
|
+
--baseline-path "." \
|
|
422
|
+
--output constraint_result.json
|
|
423
|
+
```
|
|
424
|
+
|
|
425
|
+
If `all_pass` is false, skip this candidate and try the next-best from `comparison.all_candidates`. If NO candidates pass constraints, log a warning and proceed to next iteration without merging:
|
|
426
|
+
|
|
427
|
+
```
|
|
428
|
+
WARNING: No candidates passed constraint gates. Skipping merge.
|
|
429
|
+
growth: {growth_pct}% (limit: 30%)
|
|
430
|
+
entry_point: {pass/fail}
|
|
431
|
+
tests: {pass/fail}
|
|
432
|
+
```
|
|
433
|
+
|
|
396
434
|
### 5. Merge Winner
|
|
397
435
|
|
|
398
|
-
If the winner scored higher than the current best:
|
|
436
|
+
If the winner scored higher than the current best AND passed constraint gates:
|
|
399
437
|
|
|
400
438
|
```bash
|
|
401
439
|
# Get the winning worktree's branch
|
|
@@ -405,7 +443,20 @@ WINNER_BRANCH={winning_worktree_branch}
|
|
|
405
443
|
git merge $WINNER_BRANCH --no-edit -m "evolve: merge v{NNN}-{lens_id} (score: {score})"
|
|
406
444
|
```
|
|
407
445
|
|
|
408
|
-
Update `.evolver.json
|
|
446
|
+
Update `.evolver.json` with enriched history entry:
|
|
447
|
+
|
|
448
|
+
Extract winner metrics for the chart:
|
|
449
|
+
- `tokens`, `latency_ms`, `errors` → from `comparison.all_candidates` for the winner
|
|
450
|
+
- `passing`, `total` → count per_example scores ≥0.5 vs total from best_results.json (re-read for winner experiment)
|
|
451
|
+
- `per_evaluator` → average each evaluator's scores across per_example from best_results.json
|
|
452
|
+
- `approach` → first line of `## Approach` section from winner's proposal.md
|
|
453
|
+
- `lens` → the `source` field from the winning proposer's lens in lenses.json
|
|
454
|
+
- `code_loc` → count lines of code after merge for growth tracking:
|
|
455
|
+
|
|
456
|
+
```bash
|
|
457
|
+
CODE_LOC=$(find . -name "*.py" -not -path "./.venv/*" -not -path "./venv/*" -not -path "./__pycache__/*" | xargs wc -l 2>/dev/null | tail -1 | awk '{print $1}')
|
|
458
|
+
```
|
|
459
|
+
|
|
409
460
|
```python
|
|
410
461
|
import json
|
|
411
462
|
c = json.load(open('.evolver.json'))
|
|
@@ -415,7 +466,16 @@ c['iterations'] = c['iterations'] + 1
|
|
|
415
466
|
c['history'].append({
|
|
416
467
|
'version': 'v{NNN}',
|
|
417
468
|
'experiment': '{winner_experiment}',
|
|
418
|
-
'score': {winner_score}
|
|
469
|
+
'score': {winner_score},
|
|
470
|
+
'tokens': {winner_tokens},
|
|
471
|
+
'latency_ms': {winner_latency_ms},
|
|
472
|
+
'error_count': {winner_errors},
|
|
473
|
+
'passing': {winner_passing},
|
|
474
|
+
'total': {winner_total},
|
|
475
|
+
'per_evaluator': {winner_per_evaluator_dict},
|
|
476
|
+
'approach': '{approach_from_proposal_md}',
|
|
477
|
+
'lens': '{lens_source}',
|
|
478
|
+
'code_loc': {code_loc}
|
|
419
479
|
})
|
|
420
480
|
json.dump(c, open('.evolver.json', 'w'), indent=2)
|
|
421
481
|
```
|
|
@@ -529,9 +589,13 @@ If stopping, skip to the final report. If continuing, proceed to next iteration.
|
|
|
529
589
|
|
|
530
590
|
## When Loop Ends — Final Report
|
|
531
591
|
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
592
|
+
Display the evolution chart:
|
|
593
|
+
|
|
594
|
+
```bash
|
|
595
|
+
$EVOLVER_PY $TOOLS/evolution_chart.py --config .evolver.json
|
|
596
|
+
```
|
|
597
|
+
|
|
598
|
+
Then add:
|
|
599
|
+
- LangSmith experiment URL for the best experiment (construct from project name)
|
|
600
|
+
- `git log --oneline` from baseline to current HEAD (key changes summary)
|
|
537
601
|
- Suggest: `/evolver:deploy` to finalize
|
package/skills/status/SKILL.md
CHANGED
|
@@ -10,27 +10,23 @@ Show current evolution progress.
|
|
|
10
10
|
|
|
11
11
|
## What To Do
|
|
12
12
|
|
|
13
|
-
|
|
13
|
+
### Resolve Tool Path
|
|
14
14
|
|
|
15
15
|
```bash
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
c = json.load(open('.evolver.json'))
|
|
19
|
-
print(f'Project: {c[\"project\"]}')
|
|
20
|
-
print(f'Dataset: {c[\"dataset\"]}')
|
|
21
|
-
print(f'Framework: {c[\"framework\"]}')
|
|
22
|
-
print(f'Evaluators: {c[\"evaluators\"]}')
|
|
23
|
-
print(f'Iterations: {c[\"iterations\"]}')
|
|
24
|
-
print(f'Best: {c[\"best_experiment\"]} (score: {c[\"best_score\"]:.3f})')
|
|
25
|
-
print(f'Baseline: {c[\"history\"][0][\"score\"]:.3f}' if c['history'] else 'No baseline')
|
|
26
|
-
print()
|
|
27
|
-
print('History:')
|
|
28
|
-
for h in c.get('history', []):
|
|
29
|
-
print(f' {h[\"version\"]}: {h[\"score\"]:.3f}')
|
|
30
|
-
"
|
|
16
|
+
TOOLS="${EVOLVER_TOOLS:-$([ -d ".evolver/tools" ] && echo ".evolver/tools" || echo "$HOME/.evolver/tools")}"
|
|
17
|
+
EVOLVER_PY="${EVOLVER_PY:-$([ -f "$HOME/.evolver/venv/bin/python" ] && echo "$HOME/.evolver/venv/bin/python" || echo "python3")}"
|
|
31
18
|
```
|
|
32
19
|
|
|
33
|
-
|
|
34
|
-
Detect regression: if current best is lower than a previous best, warn.
|
|
20
|
+
### Display Chart
|
|
35
21
|
|
|
36
|
-
|
|
22
|
+
```bash
|
|
23
|
+
$EVOLVER_PY $TOOLS/evolution_chart.py --config .evolver.json
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
### Additional Analysis
|
|
27
|
+
|
|
28
|
+
After displaying the chart:
|
|
29
|
+
|
|
30
|
+
- Detect stagnation: if last 3 scores within 1% of each other, warn and suggest `/evolver:evolve` with architect trigger.
|
|
31
|
+
- Detect regression: if current best is lower than a previous best, warn.
|
|
32
|
+
- Print LangSmith experiment URL for the best experiment if available.
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Constraint checker for evolution proposals.
|
|
3
|
+
|
|
4
|
+
Validates that a candidate proposal doesn't violate hard constraints
|
|
5
|
+
before it's merged. Inspired by Hermes Agent Self-Evolution.
|
|
6
|
+
|
|
7
|
+
Usage:
|
|
8
|
+
python3 constraint_check.py \
|
|
9
|
+
--config .evolver.json \
|
|
10
|
+
--worktree-path /tmp/worktree \
|
|
11
|
+
--baseline-path /path/to/main \
|
|
12
|
+
--output constraint_result.json
|
|
13
|
+
|
|
14
|
+
Stdlib-only — no langsmith dependency.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import argparse
|
|
18
|
+
import json
|
|
19
|
+
import os
|
|
20
|
+
import subprocess
|
|
21
|
+
import sys
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def count_loc(directory, extensions=(".py", ".js", ".ts", ".jsx", ".tsx")):
    """Count lines of code in a directory, excluding venvs and node_modules."""
    skip_dirs = {".venv", "venv", "node_modules", "__pycache__", ".git"}
    suffixes = tuple(extensions)
    line_count = 0
    for root, subdirs, filenames in os.walk(directory):
        # Prune ignored directories in place so os.walk never descends into them.
        subdirs[:] = [name for name in subdirs if name not in skip_dirs]
        for name in filenames:
            if not name.endswith(suffixes):
                continue
            try:
                with open(os.path.join(root, name)) as handle:
                    line_count += sum(1 for _ in handle)
            except (OSError, UnicodeDecodeError):
                # Unreadable or non-text files simply don't contribute.
                continue
    return line_count
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def check_growth(baseline_loc, candidate_loc, max_growth_pct=30):
    """Check code didn't grow beyond threshold."""
    if baseline_loc == 0:
        # Nothing to compare against — treat as passing.
        return {"pass": True, "reason": "no baseline LOC"}
    growth_pct = (candidate_loc - baseline_loc) / baseline_loc * 100
    within_limit = growth_pct <= max_growth_pct
    comparison = "<=" if within_limit else ">"
    return {
        "pass": within_limit,
        "baseline_loc": baseline_loc,
        "candidate_loc": candidate_loc,
        "growth_pct": round(growth_pct, 1),
        "max_growth_pct": max_growth_pct,
        "reason": f"Code growth {growth_pct:.1f}% {comparison} {max_growth_pct}% limit",
    }
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def check_entry_point(worktree_path, entry_point):
    """Check that the entry point is still runnable (syntax check).

    Scans the entry-point command for the first token that looks like a
    script file (.py/.js/.ts/.sh). Python scripts additionally get a
    py_compile syntax check.

    Args:
        worktree_path: Root of the candidate worktree.
        entry_point: Shell-style command, e.g. "python3 app.py --flag".

    Returns:
        dict with "pass" (bool) and a human-readable "reason".
    """
    parts = entry_point.split()
    script_file = None
    for part in parts:
        if part.endswith((".py", ".js", ".ts", ".sh")):
            script_file = part
            break

    if not script_file:
        # No recognizable script in the command — nothing we can validate.
        return {"pass": True, "reason": "no script file detected in entry_point"}

    full_path = os.path.join(worktree_path, script_file)
    if not os.path.exists(full_path):
        return {"pass": False, "reason": f"entry point file missing: {script_file}"}

    if script_file.endswith(".py"):
        # Use the current interpreter: a bare "python3" may not be on PATH
        # (e.g. Windows or some venv layouts) and would raise FileNotFoundError;
        # sys.executable is always the interpreter running this tool.
        result = subprocess.run(
            [sys.executable or "python3", "-m", "py_compile", full_path],
            capture_output=True, text=True,
        )
        if result.returncode != 0:
            return {"pass": False, "reason": f"syntax error: {result.stderr[:200]}"}

    return {"pass": True, "reason": "entry point exists and has valid syntax"}
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def check_tests(worktree_path):
    """Run test suite if it exists. Returns pass if no tests found.

    Looks for test_*.py files under tests/ or test/ in the worktree. A
    missing suite or a missing pytest installation counts as a pass
    (skipped=True) since we cannot gate on what isn't there.

    Returns:
        dict with "pass" (bool), "reason" (str, capped at 200 chars),
        and "skipped" (bool).
    """
    has_tests = False
    for td in ("tests", "test"):
        test_path = os.path.join(worktree_path, td)
        if os.path.isdir(test_path):
            if any(f.startswith("test_") and f.endswith(".py") for f in os.listdir(test_path)):
                has_tests = True
                break

    if not has_tests:
        return {"pass": True, "reason": "no test suite found (skipped)", "skipped": True}

    try:
        result = subprocess.run(
            # sys.executable is the interpreter running this tool; a bare
            # "python3" may not exist on PATH (e.g. Windows).
            [sys.executable or "python3", "-m", "pytest", "-q", "--tb=no"],
            capture_output=True, text=True,
            cwd=worktree_path, timeout=120,
        )
        if result.returncode != 0 and "No module named pytest" in result.stderr:
            # Interpreter exists but pytest isn't installed — treat as skipped,
            # matching the FileNotFoundError path below.
            return {"pass": True, "reason": "pytest not available (skipped)", "skipped": True}
        passed = result.returncode == 0
        # pytest prints failure summaries on STDOUT; stderr is usually empty,
        # so prefer stdout and fall back to stderr for the failure reason.
        reason = result.stdout.strip() if passed else (result.stdout.strip() or result.stderr.strip())
        return {"pass": passed, "reason": reason[:200], "skipped": False}
    except FileNotFoundError:
        return {"pass": True, "reason": "pytest not available (skipped)", "skipped": True}
    except subprocess.TimeoutExpired:
        return {"pass": False, "reason": "test suite timed out after 120s", "skipped": False}
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def main():
    """CLI entry point: run all constraint checks and emit a JSON verdict.

    Reads the evolver config, compares the candidate worktree against the
    baseline (code growth, entry-point syntax, test suite), writes/prints a
    JSON report, and exits 0 when every constraint passes, 1 otherwise.
    """
    parser = argparse.ArgumentParser(description="Check constraints on a proposal")
    parser.add_argument("--config", default=".evolver.json")
    parser.add_argument("--worktree-path", required=True, help="Candidate worktree path")
    parser.add_argument("--baseline-path", default=".", help="Baseline (main) path")
    parser.add_argument("--max-growth", type=int, default=30, help="Max code growth %% (default 30)")
    parser.add_argument("--output", default=None)
    args = parser.parse_args()

    with open(args.config) as f:
        config = json.load(f)

    entry_point = config.get("entry_point", "")
    # Strip a leading interpreter so check_entry_point sees the script path.
    # Longest prefix FIRST: splitting on "python " before "python3 " would
    # truncate "python3 app.py" to "3 app.py".
    ep_for_check = entry_point.split("python3 ")[-1].split("python ")[-1]

    results = {
        "growth": check_growth(
            count_loc(args.baseline_path),
            count_loc(args.worktree_path),
            args.max_growth,
        ),
        "entry_point": check_entry_point(args.worktree_path, ep_for_check),
        "tests": check_tests(args.worktree_path),
    }

    all_pass = all(r["pass"] for r in results.values())
    output = {"all_pass": all_pass, "constraints": results}

    out_str = json.dumps(output, indent=2)
    if args.output:
        with open(args.output, "w") as f:
            f.write(out_str)
    # Always echo to stdout so callers can capture the report either way.
    print(out_str)

    sys.exit(0 if all_pass else 1)
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
# Allow direct invocation: python3 constraint_check.py --worktree-path ...
if __name__ == "__main__":
    main()
|
package/tools/dataset_health.py
CHANGED
|
@@ -15,6 +15,14 @@ import os
|
|
|
15
15
|
import sys
|
|
16
16
|
from datetime import datetime, timezone
|
|
17
17
|
|
|
18
|
+
# Secret detection (local import from same directory)
|
|
19
|
+
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
|
20
|
+
try:
|
|
21
|
+
from secret_filter import has_secrets
|
|
22
|
+
except ImportError:
|
|
23
|
+
def has_secrets(text):
|
|
24
|
+
return False
|
|
25
|
+
|
|
18
26
|
|
|
19
27
|
def ensure_langsmith_api_key():
|
|
20
28
|
"""Load API key from langsmith-cli credentials if not in env."""
|
|
@@ -344,10 +352,32 @@ def main():
|
|
|
344
352
|
except Exception:
|
|
345
353
|
pass
|
|
346
354
|
|
|
355
|
+
# Check for secrets in dataset examples
|
|
356
|
+
secrets_check = {"checked": True, "flagged_count": 0, "flagged_ids": [], "clean": True}
|
|
357
|
+
for ex in examples:
|
|
358
|
+
text = str(getattr(ex, 'inputs', '') or '') + str(getattr(ex, 'outputs', '') or '')
|
|
359
|
+
if has_secrets(text):
|
|
360
|
+
secrets_check["flagged_count"] += 1
|
|
361
|
+
secrets_check["flagged_ids"].append(str(ex.id))
|
|
362
|
+
secrets_check["clean"] = False
|
|
363
|
+
secrets_check["flagged_ids"] = secrets_check["flagged_ids"][:10] # Cap at 10
|
|
364
|
+
|
|
347
365
|
# Compute health score and build report
|
|
348
366
|
health_score = compute_health_score(size_info, difficulty, dead, coverage, splits)
|
|
349
367
|
issues, corrections = build_issues_and_corrections(size_info, difficulty, dead, coverage, splits)
|
|
350
368
|
|
|
369
|
+
# Add secret issues
|
|
370
|
+
if not secrets_check["clean"]:
|
|
371
|
+
issues.append({
|
|
372
|
+
"severity": "critical",
|
|
373
|
+
"message": f"{secrets_check['flagged_count']} example(s) contain potential secrets (API keys, tokens)",
|
|
374
|
+
})
|
|
375
|
+
corrections.append({
|
|
376
|
+
"action": "remove_secrets",
|
|
377
|
+
"description": f"Remove or redact {secrets_check['flagged_count']} examples with detected secrets",
|
|
378
|
+
"example_ids": secrets_check["flagged_ids"],
|
|
379
|
+
})
|
|
380
|
+
|
|
351
381
|
report = {
|
|
352
382
|
"generated_at": datetime.now(timezone.utc).isoformat(),
|
|
353
383
|
"health_score": health_score,
|
|
@@ -357,6 +387,7 @@ def main():
|
|
|
357
387
|
"dead_examples": dead,
|
|
358
388
|
"coverage": coverage,
|
|
359
389
|
"splits": splits,
|
|
390
|
+
"secrets": secrets_check,
|
|
360
391
|
"issues": issues,
|
|
361
392
|
"corrections": corrections,
|
|
362
393
|
}
|
|
@@ -0,0 +1,326 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Evolution chart — ASCII visualization of agent optimization progress.
|
|
3
|
+
|
|
4
|
+
Reads .evolver.json history and optionally best_results.json to render
|
|
5
|
+
a rich terminal chart with score progression, per-evaluator breakdown,
|
|
6
|
+
change narrative, and horizontal bar chart.
|
|
7
|
+
|
|
8
|
+
Usage:
|
|
9
|
+
python3 evolution_chart.py --config .evolver.json
|
|
10
|
+
python3 evolution_chart.py --config .evolver.json --no-color
|
|
11
|
+
|
|
12
|
+
Stdlib-only — no langsmith dependency.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
import argparse
|
|
16
|
+
import json
|
|
17
|
+
import os
|
|
18
|
+
import sys
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class Colors:
    """ANSI escape codes for terminal styling; every attribute is '' when disabled."""

    # Attribute name -> escape sequence. G/R/Y/C are green/red/yellow/cyan,
    # B is bold, D is dim grey, RST resets all styling.
    _CODES = {
        'G': '\033[32m',
        'R': '\033[31m',
        'Y': '\033[33m',
        'C': '\033[36m',
        'B': '\033[1m',
        'D': '\033[90m',
        'RST': '\033[0m',
    }

    def __init__(self, enabled=True):
        for name, code in self._CODES.items():
            setattr(self, name, code if enabled else '')
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def sparkline(values):
    """Render a numeric series as a min-max scaled row of unicode blocks."""
    if not values:
        return ''
    levels = ' ▁▂▃▄▅▆▇█'
    lo = min(values)
    span = (max(values) - lo) or 1  # avoid /0 when all values are equal
    chars = []
    for v in values:
        idx = int((v - lo) / span * 8)
        chars.append(levels[idx if idx < 8 else 8])
    return ''.join(chars)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def hbar(val, width, c):
    """Horizontal bar: `val` (0..1) rendered as filled blocks plus dim shade."""
    n_filled = round(val * width)
    filled_part = '█' * n_filled
    empty_part = '░' * (width - n_filled)
    return f'{c.G}{filled_part}{c.D}{empty_part}{c.RST}'
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def fmt_tokens(t):
    """Humanize a token count: 1500 -> '1.5k', 2_500_000 -> '2.5M', falsy -> em dash."""
    if not t:
        return '—'
    for threshold, suffix in ((1_000_000, 'M'), (1000, 'k')):
        if t >= threshold:
            return f'{t / threshold:.1f}{suffix}'
    return str(t)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def trend_icon(delta, is_best, c):
    """Pick a colored glyph for a score delta: ★ best, ▲ up, ▼ down, ━ flat."""
    if is_best and delta >= 0:
        glyph, shade = '★', c.G
    elif delta > 0:
        glyph, shade = '▲', c.G
    elif delta < -0.01:
        glyph, shade = '▼', c.R
    else:
        # Tiny negative drift (within -0.01) and exact zero both render flat.
        glyph, shade = '━', c.Y
    return f'{shade}{glyph}{c.RST}'
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def render_header(config, history, scores, c):
    """Render the boxed report header: project, dataset, evaluators, trend.

    Returns a multi-line string framed with box-drawing characters.
    Assumes `history` is non-empty and `scores` aligns with it — caller
    guarantees this (main exits early when history is empty).
    """
    project = config.get('project', 'unknown')
    dataset = config.get('dataset', 'unknown')
    evals = config.get('evaluators', [])
    # Example count: prefer the baseline history entry, fall back to config.
    total = history[0].get('total', config.get('num_examples', '?'))
    base_score = scores[0]
    best_score = max(scores)
    # First history entry is the baseline, so iterations = entries - 1.
    iters = len(history) - 1
    pct = ((best_score - base_score) / base_score * 100) if base_score > 0 else 0
    spark = sparkline(scores)
    evals_str = ' · '.join(evals)

    # W is the inner box width; the padding arithmetic below is tuned to it.
    W = 70
    lines = []
    lines.append(f' {c.C}╔{"═" * W}╗{c.RST}')
    lines.append(f' {c.C}║{c.RST} {c.B}EVOLUTION REPORT{c.RST}{" " * (W - 18)}{c.C}║{c.RST}')
    lines.append(f' {c.C}║{c.RST} {project:<{W - 16}}{c.D}{iters} iterations{c.RST} {c.C}║{c.RST}')
    # NOTE(review): padding assumes dataset/eval names fit within W; very long
    # names would overflow the box — confirm upstream limits if that matters.
    lines.append(f' {c.C}║{c.RST} {c.D}dataset{c.RST} {dataset} ({total} examples){" " * max(0, W - 22 - len(dataset) - len(str(total)))}{c.C}║{c.RST}')
    lines.append(f' {c.C}║{c.RST} {c.D}evals{c.RST} {evals_str:<{W - 11}}{c.C}║{c.RST}')
    lines.append(f' {c.C}║{c.RST} {c.D}trend{c.RST} {spark} {base_score:.3f} → {c.G}{c.B}{best_score:.3f}{c.RST} {c.G}(+{pct:.1f}%){c.RST}{" " * max(0, W - 40 - len(spark))}{c.C}║{c.RST}')
    lines.append(f' {c.C}╚{"═" * W}╝{c.RST}')
    return '\n'.join(lines)
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def render_score_table(history, scores, c):
    """Render the per-iteration score table (score, delta, pass/err/tokens/latency).

    One row per history entry; the baseline row (index 0) shows em dashes
    for delta columns. The LOC column only appears when any entry carries
    a 'code_loc' field. Returns a multi-line string.
    """
    base = scores[0]
    best = max(scores)
    W = 70

    lines = []
    lines.append(f' {c.B}SCORE PROGRESSION{c.RST}')
    lines.append(f' {c.D}{"─" * W}{c.RST}')
    has_loc = any(h.get('code_loc') for h in history)
    loc_hdr = f'{"LOC":>6}' if has_loc else ''
    lines.append(f' {c.D}{"Version":<10}{"Score":>6}{"Δ":>8}{"vs Base":>9}{"Pass":>7}{"Err":>5}{"Tokens":>8}{"Latency":>9}{loc_hdr}{c.RST}')
    lines.append(f' {c.D}{"─" * W}{c.RST}')

    for i, h in enumerate(history):
        v = h['version']
        s = h['score']
        passing = h.get('passing')
        total = h.get('total')
        # Older history entries used 'errors'; newer ones use 'error_count'.
        errors = h.get('error_count', h.get('errors'))
        tokens = h.get('tokens', 0)
        latency = h.get('latency_ms', 0)

        # Highlight the best score in bold green.
        s_str = f'{c.G}{c.B}{s:.3f}{c.RST}' if s == best else f'{s:.3f}'

        if i == 0:
            # Baseline row: no previous entry to diff against.
            d_str = f'{c.D}{"—":>7}{c.RST}'
            p_str = f'{c.D}{"—":>8}{c.RST}'
            icon = ''
        else:
            d = s - history[i - 1]['score']
            pct = ((s - base) / base * 100) if base > 0 else 0
            dc = c.G if d > 0 else (c.R if d < 0 else c.Y)
            d_str = f'{dc}{d:>+7.3f}{c.RST}'
            p_str = f'{dc}{pct:>+7.1f}%{c.RST}'
            # ★ only on the last row when it holds the best score.
            icon = trend_icon(d, i == len(history) - 1 and s == best, c)

        if passing is not None and total is not None:
            pass_str = f'{passing}/{total}'
        else:
            pass_str = '—'

        if errors is not None:
            e_str = f'{c.R}{errors}{c.RST}' if errors > 0 else f'{c.G}{errors}{c.RST}'
        else:
            e_str = '—'

        tok_str = fmt_tokens(tokens)
        lat_str = f'{latency}ms' if latency else '—'

        loc_str = ''
        if has_loc:
            loc = h.get('code_loc')
            if loc:
                base_loc = history[0].get('code_loc', 0)
                # Flag LOC that exceeds the 30% growth constraint vs baseline.
                if base_loc and loc > base_loc * 1.3:
                    loc_str = f' {c.R}{loc}{c.RST}⚠'
                else:
                    loc_str = f' {loc:>5}'
            else:
                loc_str = ' —'

        lines.append(f' {v:<10}{s_str:>6} {d_str} {p_str} {pass_str:>5} {e_str:>3} {tok_str:>6} {lat_str:>6}{loc_str} {icon}')

    return '\n'.join(lines)
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def render_evaluator_breakdown(history, config, best_results, c):
    """Render per-evaluator scores: history trend if available, else averages.

    Prefers per-iteration 'per_evaluator' data from history (base → best with
    bar and sparkline); falls back to averaging per-example scores from
    best_results. Returns a multi-line string, or None when there is nothing
    to show (no evaluators, or neither data source available).
    """
    evaluators = config.get('evaluators', [])
    if not evaluators:
        return None

    has_per_eval = any(h.get('per_evaluator') for h in history)

    if not has_per_eval and not best_results:
        return None

    W = 70
    lines = []
    lines.append(f' {c.B}PER-EVALUATOR BREAKDOWN{c.RST}')
    lines.append(f' {c.D}{"─" * W}{c.RST}')

    if has_per_eval:
        lines.append(f' {c.D}{"Evaluator":<20}{"Base":>6}{"Best":>7}{"Δ":>7} {"":20} Trend{c.RST}')
        lines.append(f' {c.D}{"─" * W}{c.RST}')

        for ev in evaluators:
            # Missing evaluator data defaults to 0 for that iteration.
            vals = [h.get('per_evaluator', {}).get(ev, 0) for h in history]
            bv = vals[0]
            # NOTE(review): "best" here is the LAST iteration's value, not the
            # max across iterations — confirm that is the intended semantics.
            best_v = vals[-1]
            delta = best_v - bv
            dc = c.G if delta > 0 else c.R
            spark_ev = sparkline(vals)

            lines.append(
                f' {ev:<20}{bv:>5.2f} → {dc}{c.B}{best_v:.2f}{c.RST}'
                f' {dc}{delta:>+6.2f}{c.RST}'
                f' {hbar(best_v, 20, c)}'
                f' {spark_ev}'
            )
    elif best_results:
        lines.append(f' {c.D}{"Evaluator":<20}{"Avg Score":>10} {"":20}{c.RST}')
        lines.append(f' {c.D}{"─" * W}{c.RST}')

        # Collect every per-example score keyed by evaluator name.
        eval_scores = {}
        for ex_data in best_results.get('per_example', {}).values():
            for ev_name, ev_score in ex_data.get('scores', {}).items():
                eval_scores.setdefault(ev_name, []).append(ev_score)

        for ev in evaluators:
            if ev in eval_scores:
                avg = sum(eval_scores[ev]) / len(eval_scores[ev])
                lines.append(f' {ev:<20}{avg:>9.3f} {hbar(avg, 20, c)}')

    return '\n'.join(lines)
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
def render_what_changed(history, c):
    """Render the change narrative: per-iteration approach, lens, and score delta.

    Returns a multi-line string, or None when no iteration after the
    baseline carries an 'approach' field.
    """
    if not any(entry.get('approach') for entry in history[1:]):
        return None

    rule_width = 70
    top_score = max(entry['score'] for entry in history)
    out = [
        f' {c.B}WHAT CHANGED{c.RST}',
        f' {c.D}{"─" * rule_width}{c.RST}',
    ]

    last_idx = len(history) - 1
    # Walk consecutive (previous, current) pairs; baseline has no narrative row.
    for idx, (prev, cur) in enumerate(zip(history, history[1:]), start=1):
        delta = cur['score'] - prev['score']
        if delta > 0:
            shade = c.G
        elif delta < 0:
            shade = c.R
        else:
            shade = c.Y
        mark = trend_icon(delta, idx == last_idx and cur['score'] == top_score, c)
        summary = (cur.get('approach') or '—')[:42]
        lens = cur.get('lens', '')
        lens_tag = f' {c.D}[{lens}]{c.RST}' if lens else ''
        out.append(f' {cur["version"]:<6} {mark} {shade}{delta:>+.3f}{c.RST} {summary:<42}{lens_tag}')

    return '\n'.join(out)
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
def render_bar_chart(history, scores, c):
    """Render a horizontal bar chart of combined scores, one row per iteration."""
    top = max(scores)
    top_idx = scores.index(top)
    bar_width = 40
    rule_width = 70

    rows = [
        f' {c.B}SCORE CHART{c.RST}',
        f' {c.D}{"─" * rule_width}{c.RST}',
    ]

    for idx, entry in enumerate(history):
        score = entry['score']
        n_filled = round(score * bar_width)
        blocks = '█' * n_filled
        shade = '░' * (bar_width - n_filled)

        if idx == top_idx:
            # Best iteration: green bar with bold green score.
            bar = f'{c.G}{blocks}{c.D}{shade}{c.RST}'
            label = f'{c.G}{c.B}{score:.3f}{c.RST}'
        elif idx == 0:
            # Baseline: cyan so it stands apart from later iterations.
            bar = f'{c.C}{blocks}{c.D}{shade}{c.RST}'
            label = f'{c.C}{score:.3f}{c.RST}'
        else:
            bar = f'{c.G}{blocks}{c.D}{shade}{c.RST}'
            label = f'{score:.3f}'

        rows.append(f' {entry["version"]:<10}{bar} {label}')

    # Axis ticks at 0, .25, .50, .75, 1.0 beneath the 40-character bars.
    rows.append(f' {c.D}{" " * 10}|{" " * 9}|{" " * 9}|{" " * 9}|{" " * 9}|{c.RST}')
    rows.append(f' {c.D}{" " * 10}0{" " * 8}.25{" " * 7}.50{" " * 7}.75{" " * 8}1.0{c.RST}')

    return '\n'.join(rows)
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
def main():
    """CLI entry point: load config + history, render every chart section."""
    parser = argparse.ArgumentParser(description="Render evolution progress chart")
    parser.add_argument("--config", default=".evolver.json", help="Path to .evolver.json")
    parser.add_argument("--best-results", default=None, help="Path to best_results.json (auto-detected if not set)")
    parser.add_argument("--no-color", action="store_true", help="Disable ANSI colors")
    parser.add_argument("--output", default=None, help="Write output to file instead of stdout")
    args = parser.parse_args()

    if not os.path.exists(args.config):
        print(f"Config not found: {args.config}", file=sys.stderr)
        sys.exit(1)

    with open(args.config) as fh:
        cfg = json.load(fh)

    history = cfg.get('history', [])
    if not history:
        print("No history data in config.", file=sys.stderr)
        sys.exit(1)

    # best_results.json is optional; look next to the config file unless
    # an explicit path was supplied.
    results_path = args.best_results or os.path.join(os.path.dirname(args.config) or '.', 'best_results.json')
    best_results = None
    if os.path.exists(results_path):
        with open(results_path) as fh:
            best_results = json.load(fh)

    # Colors only make sense on an interactive terminal writing to stdout.
    colors_on = not args.no_color and sys.stdout.isatty() and args.output is None
    palette = Colors(enabled=colors_on)

    scores = [entry['score'] for entry in history]

    sections = [
        '',
        render_header(cfg, history, scores, palette),
        '',
        render_score_table(history, scores, palette),
        '',
        render_evaluator_breakdown(history, cfg, best_results, palette),
        '',
        render_what_changed(history, palette),
        '',
        render_bar_chart(history, scores, palette),
        '',
    ]
    rendered = '\n'.join(part for part in sections if part is not None)

    if args.output:
        with open(args.output, 'w') as fh:
            fh.write(rendered + '\n')
        print(f"Chart written to {args.output}", file=sys.stderr)
    else:
        print(rendered)


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Mine Claude Code session history for eval dataset examples.
|
|
3
|
+
|
|
4
|
+
Reads ~/.claude/ session files to extract real user interactions
|
|
5
|
+
that can be used as evaluation data. Filters for relevance to the
|
|
6
|
+
agent being optimized, detects and skips secrets.
|
|
7
|
+
|
|
8
|
+
Usage:
|
|
9
|
+
python3 mine_sessions.py \
|
|
10
|
+
--agent-description "A ReAct agent that answers questions using tools" \
|
|
11
|
+
--output session_examples.json \
|
|
12
|
+
[--max-examples 50]
|
|
13
|
+
|
|
14
|
+
Stdlib-only except for secret_filter (local import).
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import argparse
|
|
18
|
+
import glob
|
|
19
|
+
import json
|
|
20
|
+
import os
|
|
21
|
+
import sys
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def find_session_files():
    """Locate Claude Code session history files under ~/.claude.

    Checks both the flat history log and per-session message logs;
    returns whichever paths exist (possibly an empty list).
    """
    patterns = (
        os.path.expanduser("~/.claude/history.jsonl"),
        os.path.expanduser("~/.claude/sessions/*/messages.jsonl"),
    )
    matches = []
    for pat in patterns:
        matches += glob.glob(pat)
    return matches
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def extract_messages(file_path):
    """Extract user->assistant message pairs from a session JSONL file.

    Returns a list of dicts with truncated 'input' (500 chars),
    'output_preview' (200 chars) and 'source_file'. Unreadable files
    and malformed JSON lines are silently skipped.
    """
    def as_text(content):
        # Content may be a plain string or a list of typed parts; keep
        # only the text parts in the latter case.
        if isinstance(content, list):
            return " ".join(
                part.get("text", "") for part in content
                if isinstance(part, dict) and part.get("type") == "text"
            )
        return content

    results = []
    try:
        with open(file_path) as fh:
            records = []
            for raw in fh:
                raw = raw.strip()
                if not raw:
                    continue
                try:
                    records.append(json.loads(raw))
                except json.JSONDecodeError:
                    continue

        for idx in range(len(records) - 1):
            cur, nxt = records[idx], records[idx + 1]
            if cur.get("role") != "user" or nxt.get("role") != "assistant":
                continue
            user_text = as_text(cur.get("content", ""))
            asst_text = as_text(nxt.get("content", ""))
            # Trivially short prompts make poor eval inputs — skip them.
            if user_text and len(user_text) > 10:
                results.append({
                    "input": user_text[:500],
                    "output_preview": asst_text[:200] if asst_text else "",
                    "source_file": os.path.basename(file_path),
                })
    except (OSError, UnicodeDecodeError):
        pass
    return results
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def filter_relevant(pairs, agent_description, max_examples=50):
    """Keep the pairs most relevant to the agent description.

    Relevance is the number of distinct description keywords (words
    longer than 3 chars, minus stop words) appearing in the pair's
    input. Zero-overlap pairs are dropped; the rest come back in
    descending-overlap order, capped at max_examples.
    """
    STOP = {"a", "an", "the", "is", "are", "was", "were", "that", "this",
            "and", "or", "for", "to", "in", "on", "with", "using"}
    keywords = {
        word.lower() for word in agent_description.split()
        if len(word) > 3 and word.lower() not in STOP
    }

    ranked = []
    for item in pairs:
        hits = len(keywords & set(item["input"].lower().split()))
        if hits:
            ranked.append((hits, item))

    # list.sort is stable: ties keep their original relative order.
    ranked.sort(key=lambda entry: -entry[0])
    return [item for _, item in ranked[:max_examples]]
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def main():
    """CLI entry point: mine sessions, filter, and write a JSON dataset."""
    parser = argparse.ArgumentParser(description="Mine Claude Code sessions for eval data")
    parser.add_argument("--agent-description", required=True, help="Description of the agent being optimized")
    parser.add_argument("--output", default="session_examples.json")
    parser.add_argument("--max-examples", type=int, default=50)
    args = parser.parse_args()

    # secret_filter lives next to this script; degrade to a no-op check
    # when it cannot be imported.
    sys.path.insert(0, os.path.dirname(__file__))
    try:
        from secret_filter import has_secrets
    except ImportError:
        def has_secrets(text):
            return False

    paths = find_session_files()
    if not paths:
        print("No Claude Code session files found.", file=sys.stderr)
        print(json.dumps({"mined": 0, "output": args.output}))
        sys.exit(0)

    print(f"Found {len(paths)} session file(s)", file=sys.stderr)

    clean_pairs = []
    skipped = 0
    for path in paths:
        for pair in extract_messages(path):
            # Never let credentials leak into an eval dataset.
            if has_secrets(pair["input"]) or has_secrets(pair.get("output_preview", "")):
                skipped += 1
            else:
                clean_pairs.append(pair)

    print(f"Extracted {len(clean_pairs)} message pairs ({skipped} skipped for secrets)", file=sys.stderr)

    relevant = filter_relevant(clean_pairs, args.agent_description, args.max_examples)
    print(f"Filtered to {len(relevant)} relevant examples", file=sys.stderr)

    examples = [
        {
            "input": pair["input"],
            "metadata": {"source": "session_mining", "source_file": pair["source_file"]},
        }
        for pair in relevant
    ]

    payload = {"examples": examples, "count": len(examples), "source": "claude_code_sessions"}
    with open(args.output, "w") as fh:
        json.dump(payload, fh, indent=2)

    print(json.dumps({"mined": len(examples), "output": args.output}))


if __name__ == "__main__":
    main()
|
package/tools/read_results.py
CHANGED
|
@@ -61,7 +61,28 @@ def ensure_langsmith_api_key():
|
|
|
61
61
|
return False
|
|
62
62
|
|
|
63
63
|
|
|
64
|
-
def
|
|
64
|
+
def weighted_score(scores, weights=None):
    """Weighted average of evaluator scores.

    With no weights, every evaluator counts equally. Weights need not
    sum to 1 (they are normalized); evaluators missing from the weight
    map default to weight 1.0. Empty scores — or a non-positive total
    weight — yield 0.0.
    """
    if not scores:
        return 0.0
    if not weights:
        return sum(scores.values()) / len(scores)

    pairs = [(val, weights.get(key, 1.0)) for key, val in scores.items()]
    denom = sum(w for _, w in pairs)
    if denom <= 0:
        return 0.0
    return sum(val * w for val, w in pairs) / denom
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def read_experiment(client, experiment_name, weights=None):
|
|
65
86
|
"""Read results from a single LangSmith experiment."""
|
|
66
87
|
try:
|
|
67
88
|
# List runs for this experiment
|
|
@@ -103,13 +124,17 @@ def read_experiment(client, experiment_name):
|
|
|
103
124
|
# Read feedback/scores from pre-fetched batch
|
|
104
125
|
feedbacks = fb_map.get(str(run.id), [])
|
|
105
126
|
scores = {}
|
|
127
|
+
feedback_comments = {}
|
|
106
128
|
for fb in feedbacks:
|
|
107
129
|
if fb.score is not None:
|
|
108
130
|
scores[fb.key] = fb.score
|
|
131
|
+
if fb.comment:
|
|
132
|
+
feedback_comments[fb.key] = fb.comment
|
|
109
133
|
|
|
110
134
|
per_example[example_id] = {
|
|
111
|
-
"score":
|
|
135
|
+
"score": weighted_score(scores, weights),
|
|
112
136
|
"scores": scores,
|
|
137
|
+
"feedback": feedback_comments,
|
|
113
138
|
"tokens": tokens,
|
|
114
139
|
"latency_ms": latency_ms,
|
|
115
140
|
"error": run.error[:200] if run.error else None,
|
|
@@ -136,6 +161,50 @@ def read_experiment(client, experiment_name):
|
|
|
136
161
|
return {"experiment": experiment_name, "error": str(e), "combined_score": 0.0}
|
|
137
162
|
|
|
138
163
|
|
|
164
|
+
def pareto_front(candidates):
    """Return the Pareto-optimal subset of candidates.

    Candidate A is dominated by B when B scores >= A on every evaluator
    key they share and strictly > on at least one. Candidates without
    evaluator scores are always kept. Falls back to the first candidate
    if everything ends up dominated.
    """
    if len(candidates) <= 1:
        return candidates

    def dominated_by(challenger_scores, own_scores):
        # challenger must match-or-beat every shared key and beat one.
        if not challenger_scores:
            return False
        beats_one = False
        for key, val in own_scores.items():
            if key not in challenger_scores:
                continue
            if challenger_scores[key] < val:
                return False
            if challenger_scores[key] > val:
                beats_one = True
        return beats_one

    keep = []
    for pos, cand in enumerate(candidates):
        own = cand.get("evaluator_scores", {})
        if not own:
            keep.append(cand)
            continue
        is_dominated = any(
            dominated_by(other.get("evaluator_scores", {}), own)
            for k, other in enumerate(candidates) if k != pos
        )
        if not is_dominated:
            keep.append(cand)

    return keep if keep else candidates[:1]
|
|
206
|
+
|
|
207
|
+
|
|
139
208
|
def compare_experiments(results_list):
|
|
140
209
|
"""Compare multiple experiment results and find winner + per-task champion."""
|
|
141
210
|
if not results_list:
|
|
@@ -173,12 +242,27 @@ def compare_experiments(results_list):
|
|
|
173
242
|
"task_wins": task_wins[champion_name],
|
|
174
243
|
}
|
|
175
244
|
|
|
245
|
+
# Compute per-evaluator averages for Pareto analysis
|
|
246
|
+
for result in valid:
|
|
247
|
+
eval_avgs = {}
|
|
248
|
+
for ex_data in result.get("per_example", {}).values():
|
|
249
|
+
for ev_key, ev_score in ex_data.get("scores", {}).items():
|
|
250
|
+
eval_avgs.setdefault(ev_key, []).append(ev_score)
|
|
251
|
+
result["evaluator_scores"] = {k: sum(v) / len(v) for k, v in eval_avgs.items()}
|
|
252
|
+
|
|
253
|
+
front = pareto_front(valid)
|
|
254
|
+
|
|
176
255
|
return {
|
|
177
256
|
"winner": {
|
|
178
257
|
"experiment": winner["experiment"],
|
|
179
258
|
"score": winner["combined_score"],
|
|
180
259
|
},
|
|
181
260
|
"champion": champion,
|
|
261
|
+
"pareto_front": [
|
|
262
|
+
{"experiment": r["experiment"], "score": r["combined_score"],
|
|
263
|
+
"evaluator_scores": r.get("evaluator_scores", {})}
|
|
264
|
+
for r in front
|
|
265
|
+
],
|
|
182
266
|
"all_candidates": [
|
|
183
267
|
{
|
|
184
268
|
"experiment": r["experiment"],
|
|
@@ -231,12 +315,19 @@ def main():
|
|
|
231
315
|
args = parser.parse_args()
|
|
232
316
|
ensure_langsmith_api_key()
|
|
233
317
|
|
|
318
|
+
# Load evaluator weights from config if available
|
|
319
|
+
weights = None
|
|
320
|
+
if os.path.exists(args.config):
|
|
321
|
+
with open(args.config) as f:
|
|
322
|
+
cfg = json.load(f)
|
|
323
|
+
weights = cfg.get("evaluator_weights")
|
|
324
|
+
|
|
234
325
|
from langsmith import Client
|
|
235
326
|
client = Client()
|
|
236
327
|
|
|
237
328
|
if args.experiment:
|
|
238
329
|
# Single experiment
|
|
239
|
-
result = read_experiment(client, args.experiment)
|
|
330
|
+
result = read_experiment(client, args.experiment, weights=weights)
|
|
240
331
|
if not result:
|
|
241
332
|
print(f"No results found for experiment: {args.experiment}", file=sys.stderr)
|
|
242
333
|
sys.exit(1)
|
|
@@ -267,10 +358,25 @@ def main():
|
|
|
267
358
|
experiment_names = [e.strip() for e in args.experiments.split(",")]
|
|
268
359
|
results_list = []
|
|
269
360
|
|
|
361
|
+
# Load split filter if requested
|
|
362
|
+
split_example_ids = None
|
|
363
|
+
if args.split:
|
|
364
|
+
with open(args.config) as f:
|
|
365
|
+
cfg_for_split = json.load(f)
|
|
366
|
+
split_example_ids = set()
|
|
367
|
+
for ex in client.list_examples(dataset_name=cfg_for_split["dataset"], splits=[args.split]):
|
|
368
|
+
split_example_ids.add(str(ex.id))
|
|
369
|
+
|
|
270
370
|
for name in experiment_names:
|
|
271
371
|
print(f"Reading experiment: {name}...", file=sys.stderr)
|
|
272
|
-
result = read_experiment(client, name)
|
|
372
|
+
result = read_experiment(client, name, weights=weights)
|
|
273
373
|
if result:
|
|
374
|
+
# Apply split filter to each experiment
|
|
375
|
+
if split_example_ids is not None and "per_example" in result:
|
|
376
|
+
result["per_example"] = {k: v for k, v in result["per_example"].items() if k in split_example_ids}
|
|
377
|
+
all_scores = [v["score"] for v in result["per_example"].values()]
|
|
378
|
+
result["combined_score"] = sum(all_scores) / len(all_scores) if all_scores else 0.0
|
|
379
|
+
result["num_examples"] = len(result["per_example"])
|
|
274
380
|
results_list.append(result)
|
|
275
381
|
|
|
276
382
|
if not results_list:
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Secret detection and filtering for eval datasets.
|
|
3
|
+
|
|
4
|
+
Detects API keys, tokens, passwords, and other sensitive data in text.
|
|
5
|
+
Used by seed_from_traces.py and dataset_health.py.
|
|
6
|
+
|
|
7
|
+
Usage:
|
|
8
|
+
echo "text with sk-ant-api..." | python3 secret_filter.py
|
|
9
|
+
python3 secret_filter.py < file.txt
|
|
10
|
+
|
|
11
|
+
Stdlib-only — no external dependencies.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
import re
|
|
15
|
+
import json
|
|
16
|
+
import sys
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
# Provider/API key formats: each alternative is a known key prefix
# followed by a long non-space run (length floors cut false positives
# on short look-alike strings). Specific "sk-ant-api"/"sk-or-v1-"
# prefixes are listed before the generic "sk-" catch-all.
SECRET_PATTERNS = re.compile(
    r'('
    r'sk-ant-api\S{20,}'
    r'|sk-or-v1-\S{20,}'
    r'|sk-\S{20,}'
    r'|ghp_\S{20,}'
    r'|gho_\S{20,}'
    r'|github_pat_\S{20,}'
    r'|xoxb-\S{20,}'
    r'|xapp-\S{20,}'
    r'|ntn_\S{20,}'
    r'|AKIA[A-Z0-9]{16}'
    r'|Bearer\s+[A-Za-z0-9\-._~+/]{20,}'
    r'|-----BEGIN\s+(RSA\s+)?PRIVATE\s+KEY-----'
    r')',
    re.IGNORECASE,
)

# Well-known environment-variable names being assigned a long value,
# e.g. "OPENAI_API_KEY=..." or "DATABASE_URL: ...".
ENV_PATTERNS = re.compile(
    r'(?:ANTHROPIC_API_KEY|OPENAI_API_KEY|LANGSMITH_API_KEY|LANGCHAIN_API_KEY'
    r'|AWS_SECRET_ACCESS_KEY|DATABASE_URL|POSTGRES_PASSWORD'
    r'|SLACK_TOKEN|GITHUB_TOKEN|API_KEY|SECRET_KEY'
    r')\s*[=:]\s*["\']?\S{10,}',
    re.IGNORECASE,
)

# Generic credential-looking assignments ("password = ...", etc.).
ASSIGN_PATTERNS = re.compile(
    r'(?:password|secret|token|api_key|apikey)\s*[=:]\s*["\']?\S{10,}',
    re.IGNORECASE,
)


def detect_secrets(text):
    """Scan text and return one finding dict per secret match.

    Each finding has 'pattern' (which rule fired), 'match' (the hit,
    shortened to head...tail when longer than 20 chars so the full
    secret is never reproduced) and 'position' (character offset).
    """
    if not text:
        return []

    rules = (
        ("secret_key", SECRET_PATTERNS),
        ("env_variable", ENV_PATTERNS),
        ("assignment", ASSIGN_PATTERNS),
    )
    hits = []
    for label, rx in rules:
        for match in rx.finditer(text):
            raw = match.group()
            if len(raw) > 20:
                shown = raw[:10] + "..." + raw[-4:]
            else:
                shown = raw
            hits.append({
                "pattern": label,
                "match": shown,
                "position": match.start(),
            })
    return hits


def has_secrets(text):
    """True if any detection rule matches text; False for empty input."""
    if not text:
        return False
    return any(rx.search(text) for rx in (SECRET_PATTERNS, ENV_PATTERNS, ASSIGN_PATTERNS))


def redact_secrets(text):
    """Return text with every detected secret replaced by [REDACTED]."""
    if not text:
        return text
    for rx in (SECRET_PATTERNS, ENV_PATTERNS, ASSIGN_PATTERNS):
        text = rx.sub("[REDACTED]", text)
    return text
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
if __name__ == "__main__":
    # CLI mode: read stdin, report findings as JSON. Exit status 1 when
    # secrets are present so shell pipelines can branch on the result.
    results = detect_secrets(sys.stdin.read())
    if not results:
        print(json.dumps({"has_secrets": False, "count": 0}))
        sys.exit(0)
    print(json.dumps({"has_secrets": True, "count": len(results), "findings": results}, indent=2))
    sys.exit(1)
|
|
@@ -22,6 +22,14 @@ import sys
|
|
|
22
22
|
from collections import Counter
|
|
23
23
|
from datetime import datetime, timezone
|
|
24
24
|
|
|
25
|
+
# Secret detection (local import from same directory)
|
|
26
|
+
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
|
27
|
+
try:
|
|
28
|
+
from secret_filter import has_secrets
|
|
29
|
+
except ImportError:
|
|
30
|
+
def has_secrets(text):
|
|
31
|
+
return False
|
|
32
|
+
|
|
25
33
|
|
|
26
34
|
def extract_input(run):
|
|
27
35
|
"""Extract user input from a run's inputs field."""
|
|
@@ -118,9 +126,16 @@ def analyze_runs(runs):
|
|
|
118
126
|
token_counts = []
|
|
119
127
|
feedbacks = {"positive": 0, "negative": 0, "none": 0}
|
|
120
128
|
|
|
129
|
+
secrets_filtered = 0
|
|
121
130
|
for run in runs:
|
|
122
131
|
user_input = extract_input(run)
|
|
123
132
|
output = extract_output(run)
|
|
133
|
+
|
|
134
|
+
# Skip runs containing secrets (API keys, tokens, passwords)
|
|
135
|
+
if has_secrets(str(user_input or '')) or has_secrets(str(output or '')):
|
|
136
|
+
secrets_filtered += 1
|
|
137
|
+
continue
|
|
138
|
+
|
|
124
139
|
error = run.get("error")
|
|
125
140
|
tokens = run.get("total_tokens") or 0
|
|
126
141
|
latency_ms = None
|
package/tools/setup.py
CHANGED
|
@@ -180,9 +180,24 @@ def create_dataset_from_file(client, dataset_name, file_path):
|
|
|
180
180
|
elif "expected" in item:
|
|
181
181
|
ex["outputs"] = {"expected": item["expected"]}
|
|
182
182
|
|
|
183
|
+
# Include rubric/expected behavior in metadata
|
|
184
|
+
if "expected_behavior" in item:
|
|
185
|
+
if "metadata" not in ex:
|
|
186
|
+
ex["metadata"] = {}
|
|
187
|
+
ex["metadata"]["expected_behavior"] = item["expected_behavior"]
|
|
188
|
+
|
|
189
|
+
# Include difficulty and category in metadata
|
|
190
|
+
for field in ("difficulty", "category"):
|
|
191
|
+
if field in item:
|
|
192
|
+
if "metadata" not in ex:
|
|
193
|
+
ex["metadata"] = {}
|
|
194
|
+
ex["metadata"][field] = item[field]
|
|
195
|
+
|
|
183
196
|
# Include metadata
|
|
184
|
-
if "metadata" in item:
|
|
197
|
+
if "metadata" in item and "metadata" not in ex:
|
|
185
198
|
ex["metadata"] = item["metadata"]
|
|
199
|
+
elif "metadata" in item:
|
|
200
|
+
ex["metadata"].update(item["metadata"])
|
|
186
201
|
|
|
187
202
|
if "metadata" not in ex:
|
|
188
203
|
ex["metadata"] = {}
|
|
@@ -548,6 +563,7 @@ def main():
|
|
|
548
563
|
"project_dir": project_dir,
|
|
549
564
|
"entry_point": entry_point,
|
|
550
565
|
"evaluators": evaluator_keys,
|
|
566
|
+
"evaluator_weights": None,
|
|
551
567
|
"optimization_goals": goals,
|
|
552
568
|
"production_project": args.production_project,
|
|
553
569
|
"baseline_experiment": baseline_experiment,
|