harness-evolver 4.5.1 → 4.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +1 -1
- package/README.md +81 -28
- package/package.json +1 -1
- package/tools/add_evaluator.py +28 -11
- package/tools/constraint_check.py +28 -3
- package/tools/evolution_chart.py +10 -1
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "harness-evolver",
|
|
3
3
|
"description": "LangSmith-native autonomous agent optimization — evolves LLM agent code using multi-agent proposers, LangSmith experiments, and git worktrees",
|
|
4
|
-
"version": "4.5.1",
|
|
4
|
+
"version": "4.5.2",
|
|
5
5
|
"author": {
|
|
6
6
|
"name": "Raphael Valdetaro"
|
|
7
7
|
},
|
package/README.md
CHANGED
|
@@ -48,8 +48,9 @@ export LANGSMITH_API_KEY="lsv2_pt_..."
|
|
|
48
48
|
claude
|
|
49
49
|
|
|
50
50
|
/evolver:setup # explores project, configures LangSmith
|
|
51
|
+
/evolver:health # check dataset quality (auto-corrects issues)
|
|
51
52
|
/evolver:evolve # runs the optimization loop
|
|
52
|
-
/evolver:status # check progress
|
|
53
|
+
/evolver:status # check progress (rich ASCII chart)
|
|
53
54
|
/evolver:deploy # tag, push, finalize
|
|
54
55
|
```
|
|
55
56
|
|
|
@@ -64,19 +65,43 @@ claude
|
|
|
64
65
|
</tr>
|
|
65
66
|
<tr>
|
|
66
67
|
<td><b>Real Code Evolution</b></td>
|
|
67
|
-
<td>Proposers modify your actual agent code — not a wrapper. Each candidate works in an isolated git worktree. Winners are merged automatically.</td>
|
|
68
|
+
<td>Proposers modify your actual agent code — not a wrapper. Each candidate works in an isolated git worktree. Winners are merged automatically. Config files (.evolver.json, .env) are auto-propagated to worktrees.</td>
|
|
68
69
|
</tr>
|
|
69
70
|
<tr>
|
|
70
71
|
<td><b>Self-Organizing Proposers</b></td>
|
|
71
72
|
<td>Each iteration generates dynamic investigation lenses from failure data, architecture analysis, production traces, and evolution memory. Proposers self-organize their approach — no fixed strategies. They can self-abstain when their contribution would be redundant. Inspired by <a href="https://arxiv.org/abs/2603.28990">Dochkina (2026)</a>.</td>
|
|
72
73
|
</tr>
|
|
73
74
|
<tr>
|
|
75
|
+
<td><b>Rubric-Based Evaluation</b></td>
|
|
76
|
+
<td>Dataset examples support <code>expected_behavior</code> rubrics — specific criteria the judge evaluates against ("should mention null safety and Android development"), not just generic correctness. Partial scoring (0.5) for partially-met rubrics. Inspired by <a href="https://github.com/NousResearch/hermes-agent-self-evolution">Hermes Agent Self-Evolution</a>.</td>
|
|
77
|
+
</tr>
|
|
78
|
+
<tr>
|
|
79
|
+
<td><b>Constraint Gates</b></td>
|
|
80
|
+
<td>Proposals must pass hard constraints before merge: code growth ≤30%, entry point syntax valid, test suite passes. Candidates that fail are rejected and the next-best is tried. Prevents code bloat and broken merges.</td>
|
|
81
|
+
</tr>
|
|
82
|
+
<tr>
|
|
83
|
+
<td><b>Weighted Evaluators + Pareto</b></td>
|
|
84
|
+
<td>Configure <code>evaluator_weights</code> to prioritize what matters (e.g., correctness 50%, latency 30%). When candidates offer genuinely different tradeoffs, the Pareto front is reported instead of forcing a single winner.</td>
|
|
85
|
+
</tr>
|
|
86
|
+
<tr>
|
|
74
87
|
<td><b>Agent-Based Evaluation</b></td>
|
|
75
|
-
<td>The evaluator agent reads experiment outputs via langsmith-cli, judges correctness using
|
|
88
|
+
<td>The evaluator agent reads experiment outputs via langsmith-cli, judges correctness using rubrics when available, and writes scores back. Judge feedback (textual comments explaining WHY scores were given) is surfaced to proposers for targeted mutations.</td>
|
|
89
|
+
</tr>
|
|
90
|
+
<tr>
|
|
91
|
+
<td><b>Canary Preflight</b></td>
|
|
92
|
+
<td>Before running the full evaluation, 1 example is tested as a canary. If the agent produces no output, evaluation stops immediately — no API quota wasted on broken agents.</td>
|
|
93
|
+
</tr>
|
|
94
|
+
<tr>
|
|
95
|
+
<td><b>Secret Detection</b></td>
|
|
96
|
+
<td>Detects 15+ secret patterns (API keys, tokens, PEM keys) in production traces and dataset examples. Secrets are filtered from <code>seed_from_traces</code> and flagged as critical issues in dataset health checks.</td>
|
|
97
|
+
</tr>
|
|
98
|
+
<tr>
|
|
99
|
+
<td><b>Evolution Chart</b></td>
|
|
100
|
+
<td>Rich ASCII visualization with ANSI colors: sparkline trend, score progression table (per-evaluator breakdown), what-changed narrative, horizontal bar chart, and code growth tracking with warnings.</td>
|
|
76
101
|
</tr>
|
|
77
102
|
<tr>
|
|
78
103
|
<td><b>Production Traces</b></td>
|
|
79
|
-
<td>Auto-discovers existing LangSmith production projects. Uses real user inputs for test generation and real error patterns for targeted optimization.</td>
|
|
104
|
+
<td>Auto-discovers existing LangSmith production projects. Uses real user inputs for test generation and real error patterns for targeted optimization. Can also mine Claude Code session history for eval data.</td>
|
|
80
105
|
</tr>
|
|
81
106
|
<tr>
|
|
82
107
|
<td><b>Active Critic</b></td>
|
|
@@ -92,11 +117,11 @@ claude
|
|
|
92
117
|
</tr>
|
|
93
118
|
<tr>
|
|
94
119
|
<td><b>Dataset Health</b></td>
|
|
95
|
-
<td>Pre-flight dataset quality check: size adequacy, difficulty distribution, dead example detection, production coverage analysis, train/held-out splits. Auto-corrects issues before evolution starts.</td>
|
|
120
|
+
<td>Pre-flight dataset quality check: size adequacy, difficulty distribution, dead example detection, production coverage analysis, train/held-out splits, and secret scanning. Auto-corrects issues before evolution starts.</td>
|
|
96
121
|
</tr>
|
|
97
122
|
<tr>
|
|
98
123
|
<td><b>Smart Gating</b></td>
|
|
99
|
-
<td>Claude assesses gate conditions directly — score plateau, target reached, diminishing returns.
|
|
124
|
+
<td>Claude assesses gate conditions directly — score plateau, target reached, diminishing returns. Holdout enforcement ensures final comparison uses unseen data. Baseline is re-scored with LLM-judge before the loop to prevent inflated starting scores.</td>
|
|
100
125
|
</tr>
|
|
101
126
|
<tr>
|
|
102
127
|
<td><b>Background Mode</b></td>
|
|
@@ -111,9 +136,9 @@ claude
|
|
|
111
136
|
| Command | What it does |
|
|
112
137
|
|---|---|
|
|
113
138
|
| `/evolver:setup` | Explore project, configure LangSmith (dataset, evaluators), run baseline |
|
|
114
|
-
| `/evolver:health` | Check dataset quality (size, difficulty, coverage, splits), auto-correct
|
|
139
|
+
| `/evolver:health` | Check dataset quality (size, difficulty, coverage, splits, secrets), auto-correct |
|
|
115
140
|
| `/evolver:evolve` | Run the optimization loop (dynamic self-organizing proposers in worktrees) |
|
|
116
|
-
| `/evolver:status` | Show progress
|
|
141
|
+
| `/evolver:status` | Show progress with rich ASCII evolution chart |
|
|
117
142
|
| `/evolver:deploy` | Tag, push, clean up temporary files |
|
|
118
143
|
|
|
119
144
|
---
|
|
@@ -123,11 +148,11 @@ claude
|
|
|
123
148
|
| Agent | Role | Color |
|
|
124
149
|
|---|---|---|
|
|
125
150
|
| **Proposer** | Self-organizing — investigates a data-driven lens, decides own approach, may abstain | Green |
|
|
126
|
-
| **Evaluator** | LLM-as-judge —
|
|
151
|
+
| **Evaluator** | LLM-as-judge — rubric-aware scoring via langsmith-cli, textual feedback | Yellow |
|
|
127
152
|
| **Architect** | ULTRAPLAN mode — deep topology analysis with Opus model | Blue |
|
|
128
153
|
| **Critic** | Active — detects gaming AND implements stricter evaluators | Red |
|
|
129
154
|
| **Consolidator** | Cross-iteration memory consolidation (autoDream-inspired) | Cyan |
|
|
130
|
-
| **TestGen** | Generates test inputs + adversarial injection mode | Cyan |
|
|
155
|
+
| **TestGen** | Generates test inputs with rubrics + adversarial injection mode | Cyan |
|
|
131
156
|
|
|
132
157
|
---
|
|
133
158
|
|
|
@@ -136,20 +161,22 @@ claude
|
|
|
136
161
|
```
|
|
137
162
|
/evolver:evolve
|
|
138
163
|
|
|
|
139
|
-
+- 0.5 Validate state (
|
|
140
|
-
+- 0.6 /evolver:health — dataset quality
|
|
164
|
+
+- 0.5 Validate state (check .evolver.json vs LangSmith)
|
|
165
|
+
+- 0.6 /evolver:health — dataset quality + secret scan + auto-correct
|
|
166
|
+
+- 0.7 Baseline LLM-judge — re-score baseline with correctness if only has_output exists
|
|
141
167
|
+- 1. Read state (.evolver.json + LangSmith experiments)
|
|
142
|
-
+- 1.5 Gather trace insights (cluster errors, tokens, latency)
|
|
143
|
-
+- 1.8 Analyze per-task failures (train split only
|
|
168
|
+
+- 1.5 Gather trace insights + judge feedback (cluster errors, tokens, latency)
|
|
169
|
+
+- 1.8 Analyze per-task failures with judge comments (train split only)
|
|
144
170
|
+- 1.8a Claude generates strategy.md + lenses.json from analysis data
|
|
145
171
|
+- 1.9 Prepare shared proposer context (KV cache-optimized prefix)
|
|
146
172
|
+- 2. Spawn N self-organizing proposers in parallel (each in a git worktree)
|
|
147
|
-
+- 3.
|
|
148
|
-
+- 3.5 Spawn evaluator agent (LLM-as-judge via langsmith-cli)
|
|
149
|
-
+- 4. Compare experiments ->
|
|
173
|
+
+- 3. Copy .evolver.json + .env to worktrees, run canary, evaluate candidates
|
|
174
|
+
+- 3.5 Spawn evaluator agent (rubric-aware LLM-as-judge via langsmith-cli)
|
|
175
|
+
+- 4. Compare experiments on held-out split -> winner + Pareto front
|
|
176
|
+
+- 4.5 Constraint gate — reject candidates that break size/tests/entry-point
|
|
150
177
|
+- 5. Merge winning worktree into main branch
|
|
151
178
|
+- 5.5 Regression tracking (auto-add guard examples to dataset)
|
|
152
|
-
+- 6. Report results
|
|
179
|
+
+- 6. Report results + evolution chart
|
|
153
180
|
+- 6.2 Consolidator agent updates evolution memory (runs in background)
|
|
154
181
|
+- 6.5 Auto-trigger Active Critic (detect + fix evaluator gaming)
|
|
155
182
|
+- 7. Auto-trigger ULTRAPLAN Architect (opus model, deep analysis)
|
|
@@ -166,27 +193,31 @@ Plugin hook (SessionStart)
|
|
|
166
193
|
|
|
167
194
|
Skills (markdown)
|
|
168
195
|
├── /evolver:setup → explores project, smart defaults, runs setup.py
|
|
169
|
-
├── /evolver:health → dataset quality
|
|
196
|
+
├── /evolver:health → dataset quality + secret scan + auto-correct
|
|
170
197
|
├── /evolver:evolve → orchestrates the evolution loop
|
|
171
|
-
├── /evolver:status →
|
|
198
|
+
├── /evolver:status → rich ASCII evolution chart + stagnation detection
|
|
172
199
|
└── /evolver:deploy → tags and pushes
|
|
173
200
|
|
|
174
201
|
Agents (markdown)
|
|
175
202
|
├── Proposer (xN) → self-organizing, lens-driven, isolated git worktrees
|
|
176
|
-
├── Evaluator → LLM-as-judge via langsmith-cli
|
|
203
|
+
├── Evaluator → rubric-aware LLM-as-judge via langsmith-cli
|
|
177
204
|
├── Critic → detects gaming + implements stricter evaluators
|
|
178
205
|
├── Architect → ULTRAPLAN deep analysis (opus model)
|
|
179
206
|
├── Consolidator → cross-iteration memory (autoDream-inspired)
|
|
180
|
-
└── TestGen → generates test inputs + adversarial injection
|
|
207
|
+
└── TestGen → generates test inputs with rubrics + adversarial injection
|
|
181
208
|
|
|
182
|
-
Tools (Python + langsmith SDK)
|
|
183
|
-
├── setup.py → creates datasets, configures evaluators
|
|
184
|
-
├── run_eval.py → runs target against dataset
|
|
185
|
-
├── read_results.py →
|
|
209
|
+
Tools (Python)
|
|
210
|
+
├── setup.py → creates datasets, configures evaluators + weights
|
|
211
|
+
├── run_eval.py → runs target against dataset (canary preflight, {input_text})
|
|
212
|
+
├── read_results.py → weighted scoring, Pareto front, judge feedback
|
|
186
213
|
├── trace_insights.py → clusters errors from traces
|
|
187
|
-
├── seed_from_traces.py → imports production traces
|
|
214
|
+
├── seed_from_traces.py → imports production traces (secret-filtered)
|
|
215
|
+
├── evolution_chart.py → rich ASCII chart (stdlib-only)
|
|
216
|
+
├── constraint_check.py → validates proposals (growth, syntax, tests) (stdlib-only)
|
|
217
|
+
├── secret_filter.py → detects 15+ secret patterns (stdlib-only)
|
|
218
|
+
├── mine_sessions.py → extracts eval data from Claude Code history (stdlib-only)
|
|
219
|
+
├── dataset_health.py → dataset quality diagnostic + secret scanning
|
|
188
220
|
├── validate_state.py → validates config vs LangSmith state
|
|
189
|
-
├── dataset_health.py → dataset quality diagnostic (size, difficulty, coverage, splits)
|
|
190
221
|
├── regression_tracker.py → tracks regressions, adds guard examples
|
|
191
222
|
├── add_evaluator.py → programmatically adds evaluators
|
|
192
223
|
└── adversarial_inject.py → detects memorization, injects adversarial tests
|
|
@@ -194,6 +225,27 @@ Tools (Python + langsmith SDK)
|
|
|
194
225
|
|
|
195
226
|
---
|
|
196
227
|
|
|
228
|
+
## Entry Point Placeholders
|
|
229
|
+
|
|
230
|
+
When configuring your agent's entry point during setup, use the placeholder that matches how your agent takes input:
|
|
231
|
+
|
|
232
|
+
| Placeholder | Behavior | Use when |
|
|
233
|
+
|---|---|---|
|
|
234
|
+
| `{input_text}` | Extracts plain text, shell-escapes it | Agent takes `--query "text"` or positional args |
|
|
235
|
+
| `{input}` | Passes path to a JSON file | Agent reads structured JSON from file |
|
|
236
|
+
| `{input_json}` | Passes raw JSON string inline | Agent parses JSON from command line |
|
|
237
|
+
|
|
238
|
+
**Example:**
|
|
239
|
+
```bash
|
|
240
|
+
# Agent that takes a query as text:
|
|
241
|
+
python agent.py --query {input_text}
|
|
242
|
+
|
|
243
|
+
# Agent that reads a JSON file:
|
|
244
|
+
python agent.py {input}
|
|
245
|
+
```
|
|
246
|
+
|
|
247
|
+
---
|
|
248
|
+
|
|
197
249
|
## Requirements
|
|
198
250
|
|
|
199
251
|
- **LangSmith account** + `LANGSMITH_API_KEY`
|
|
@@ -223,6 +275,7 @@ LangSmith traces **any** AI framework. The evolver works with all of them:
|
|
|
223
275
|
|
|
224
276
|
- [Meta-Harness: End-to-End Optimization of Model Harnesses](https://arxiv.org/abs/2603.28052) — Lee et al., 2026
|
|
225
277
|
- [Drop the Hierarchy and Roles: How Self-Organizing LLM Agents Outperform Designed Structures](https://arxiv.org/abs/2603.28990) — Dochkina, 2026
|
|
278
|
+
- [Hermes Agent Self-Evolution](https://github.com/NousResearch/hermes-agent-self-evolution) — NousResearch (rubric-based eval, constraint gates)
|
|
226
279
|
- [Darwin Godel Machine](https://sakana.ai/dgm/) — Sakana AI
|
|
227
280
|
- [AlphaEvolve](https://deepmind.google/blog/alphaevolve/) — DeepMind
|
|
228
281
|
- [LangSmith Evaluation](https://docs.smith.langchain.com/evaluation) — LangChain
|
package/package.json
CHANGED
package/tools/add_evaluator.py
CHANGED
|
@@ -39,26 +39,39 @@ CODE_EVALUATOR_TEMPLATES = {
|
|
|
39
39
|
|
|
40
40
|
|
|
41
41
|
def add_evaluator(config_path, evaluator_name, eval_type, pattern=None):
|
|
42
|
-
"""Add evaluator to config.
|
|
42
|
+
"""Add evaluator to config using partial update to avoid race conditions.
|
|
43
|
+
|
|
44
|
+
Re-reads the config immediately before writing to minimize the window
|
|
45
|
+
where concurrent updates (e.g., main loop updating best_score) could
|
|
46
|
+
be lost. Only modifies 'evaluators' and 'code_evaluators' fields.
|
|
47
|
+
"""
|
|
48
|
+
# First read to check if evaluator already exists
|
|
43
49
|
with open(config_path) as f:
|
|
44
50
|
config = json.load(f)
|
|
45
51
|
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
if evaluator_name in evaluators:
|
|
52
|
+
if evaluator_name in config.get("evaluators", []):
|
|
49
53
|
print(f"Evaluator '{evaluator_name}' already exists", file=sys.stderr)
|
|
50
54
|
return False
|
|
51
55
|
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
56
|
+
# Prepare what we need to add
|
|
57
|
+
new_code_eval = None
|
|
55
58
|
if eval_type == "code" and pattern:
|
|
56
|
-
|
|
57
|
-
code_evals[evaluator_name] = {"pattern": pattern, "type": "regex"}
|
|
58
|
-
config["code_evaluators"] = code_evals
|
|
59
|
+
new_code_eval = {"pattern": pattern, "type": "regex"}
|
|
59
60
|
elif eval_type == "code" and evaluator_name in CODE_EVALUATOR_TEMPLATES:
|
|
61
|
+
new_code_eval = CODE_EVALUATOR_TEMPLATES[evaluator_name]
|
|
62
|
+
|
|
63
|
+
# Re-read config right before write to pick up concurrent changes
|
|
64
|
+
with open(config_path) as f:
|
|
65
|
+
config = json.load(f)
|
|
66
|
+
|
|
67
|
+
evaluators = config.get("evaluators", [])
|
|
68
|
+
if evaluator_name not in evaluators:
|
|
69
|
+
evaluators.append(evaluator_name)
|
|
70
|
+
config["evaluators"] = evaluators
|
|
71
|
+
|
|
72
|
+
if new_code_eval:
|
|
60
73
|
code_evals = config.get("code_evaluators", {})
|
|
61
|
-
code_evals[evaluator_name] =
|
|
74
|
+
code_evals[evaluator_name] = new_code_eval
|
|
62
75
|
config["code_evaluators"] = code_evals
|
|
63
76
|
|
|
64
77
|
with open(config_path, "w") as f:
|
|
@@ -77,12 +90,16 @@ def main():
|
|
|
77
90
|
args = parser.parse_args()
|
|
78
91
|
|
|
79
92
|
if args.remove:
|
|
93
|
+
# Re-read right before write to avoid race conditions
|
|
80
94
|
with open(args.config) as f:
|
|
81
95
|
config = json.load(f)
|
|
82
96
|
evaluators = config.get("evaluators", [])
|
|
83
97
|
if args.evaluator in evaluators:
|
|
84
98
|
evaluators.remove(args.evaluator)
|
|
85
99
|
config["evaluators"] = evaluators
|
|
100
|
+
code_evals = config.get("code_evaluators", {})
|
|
101
|
+
code_evals.pop(args.evaluator, None)
|
|
102
|
+
config["code_evaluators"] = code_evals
|
|
86
103
|
with open(args.config, "w") as f:
|
|
87
104
|
json.dump(config, f, indent=2)
|
|
88
105
|
print(f"Removed evaluator: {args.evaluator}")
|
package/tools/constraint_check.py
CHANGED
|
@@ -80,7 +80,30 @@ def check_entry_point(worktree_path, entry_point):
|
|
|
80
80
|
return {"pass": True, "reason": "entry point exists and has valid syntax"}
|
|
81
81
|
|
|
82
82
|
|
|
83
|
-
def check_tests(worktree_path):
|
|
83
|
+
def find_project_python(worktree_path, config=None):
|
|
84
|
+
"""Find the project's Python interpreter (venv > entry_point > system).
|
|
85
|
+
|
|
86
|
+
Checks for venv in the worktree, then extracts from entry_point config,
|
|
87
|
+
then falls back to system python3.
|
|
88
|
+
"""
|
|
89
|
+
# Check for venv in worktree
|
|
90
|
+
for venv_dir in [".venv", "venv"]:
|
|
91
|
+
venv_python = os.path.join(worktree_path, venv_dir, "bin", "python")
|
|
92
|
+
if os.path.isfile(venv_python):
|
|
93
|
+
return venv_python
|
|
94
|
+
|
|
95
|
+
# Extract from entry_point in config
|
|
96
|
+
if config:
|
|
97
|
+
entry = config.get("entry_point", "")
|
|
98
|
+
for part in entry.split():
|
|
99
|
+
if part.endswith("/python") or part.endswith("/python3"):
|
|
100
|
+
if os.path.isfile(part):
|
|
101
|
+
return part
|
|
102
|
+
|
|
103
|
+
return "python3"
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def check_tests(worktree_path, config=None):
|
|
84
107
|
"""Run test suite if it exists. Returns pass if no tests found."""
|
|
85
108
|
test_dirs = ["tests", "test"]
|
|
86
109
|
has_tests = False
|
|
@@ -95,9 +118,11 @@ def check_tests(worktree_path):
|
|
|
95
118
|
if not has_tests:
|
|
96
119
|
return {"pass": True, "reason": "no test suite found (skipped)", "skipped": True}
|
|
97
120
|
|
|
121
|
+
python = find_project_python(worktree_path, config)
|
|
122
|
+
|
|
98
123
|
try:
|
|
99
124
|
result = subprocess.run(
|
|
100
|
-
[
|
|
125
|
+
[python, "-m", "pytest", "-q", "--tb=no"],
|
|
101
126
|
capture_output=True, text=True,
|
|
102
127
|
cwd=worktree_path, timeout=120,
|
|
103
128
|
)
|
|
@@ -135,7 +160,7 @@ def main():
|
|
|
135
160
|
args.max_growth,
|
|
136
161
|
),
|
|
137
162
|
"entry_point": check_entry_point(args.worktree_path, ep_for_check),
|
|
138
|
-
"tests": check_tests(args.worktree_path),
|
|
163
|
+
"tests": check_tests(args.worktree_path, config),
|
|
139
164
|
}
|
|
140
165
|
|
|
141
166
|
all_pass = all(r["pass"] for r in results.values())
|
package/tools/evolution_chart.py
CHANGED
|
@@ -73,7 +73,16 @@ def render_header(config, history, scores, c):
|
|
|
73
73
|
project = config.get('project', 'unknown')
|
|
74
74
|
dataset = config.get('dataset', 'unknown')
|
|
75
75
|
evals = config.get('evaluators', [])
|
|
76
|
-
|
|
76
|
+
# Find example count from multiple sources
|
|
77
|
+
total = history[0].get('total')
|
|
78
|
+
if not total:
|
|
79
|
+
# Check any history entry that has it
|
|
80
|
+
for h in history:
|
|
81
|
+
if h.get('total'):
|
|
82
|
+
total = h['total']
|
|
83
|
+
break
|
|
84
|
+
if not total:
|
|
85
|
+
total = config.get('num_examples', '?')
|
|
77
86
|
base_score = scores[0]
|
|
78
87
|
best_score = max(scores)
|
|
79
88
|
iters = len(history) - 1
|