harness-evolver 2.9.1 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +62 -117
- package/agents/evolver-architect.md +53 -0
- package/agents/evolver-critic.md +44 -0
- package/agents/evolver-proposer.md +128 -0
- package/agents/evolver-testgen.md +67 -0
- package/bin/install.js +181 -171
- package/package.json +7 -7
- package/skills/deploy/SKILL.md +49 -56
- package/skills/evolve/SKILL.md +156 -687
- package/skills/setup/SKILL.md +182 -0
- package/skills/status/SKILL.md +23 -21
- package/tools/read_results.py +240 -0
- package/tools/run_eval.py +202 -0
- package/tools/seed_from_traces.py +36 -8
- package/tools/setup.py +393 -0
- package/tools/trace_insights.py +86 -14
- package/agents/harness-evolver-architect.md +0 -173
- package/agents/harness-evolver-critic.md +0 -132
- package/agents/harness-evolver-judge.md +0 -110
- package/agents/harness-evolver-proposer.md +0 -317
- package/agents/harness-evolver-testgen.md +0 -112
- package/examples/classifier/README.md +0 -25
- package/examples/classifier/config.json +0 -3
- package/examples/classifier/eval.py +0 -58
- package/examples/classifier/harness.py +0 -111
- package/examples/classifier/tasks/task_001.json +0 -1
- package/examples/classifier/tasks/task_002.json +0 -1
- package/examples/classifier/tasks/task_003.json +0 -1
- package/examples/classifier/tasks/task_004.json +0 -1
- package/examples/classifier/tasks/task_005.json +0 -1
- package/examples/classifier/tasks/task_006.json +0 -1
- package/examples/classifier/tasks/task_007.json +0 -1
- package/examples/classifier/tasks/task_008.json +0 -1
- package/examples/classifier/tasks/task_009.json +0 -1
- package/examples/classifier/tasks/task_010.json +0 -1
- package/skills/architect/SKILL.md +0 -93
- package/skills/compare/SKILL.md +0 -73
- package/skills/critic/SKILL.md +0 -67
- package/skills/diagnose/SKILL.md +0 -96
- package/skills/import-traces/SKILL.md +0 -102
- package/skills/init/SKILL.md +0 -293
- package/tools/__pycache__/detect_stack.cpython-313.pyc +0 -0
- package/tools/__pycache__/init.cpython-313.pyc +0 -0
- package/tools/__pycache__/seed_from_traces.cpython-313.pyc +0 -0
- package/tools/__pycache__/trace_logger.cpython-313.pyc +0 -0
- package/tools/eval_llm_judge.py +0 -233
- package/tools/eval_passthrough.py +0 -55
- package/tools/evaluate.py +0 -255
- package/tools/import_traces.py +0 -229
- package/tools/init.py +0 -531
- package/tools/llm_api.py +0 -125
- package/tools/state.py +0 -219
- package/tools/test_growth.py +0 -230
- package/tools/trace_logger.py +0 -42
package/tools/init.py
DELETED
|
@@ -1,531 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
|
-
"""Project initializer for Harness Evolver.
|
|
3
|
-
|
|
4
|
-
Usage:
|
|
5
|
-
init.py [DIR] # auto-detect in DIR (or CWD)
|
|
6
|
-
init.py --harness PATH --eval PATH --tasks PATH # explicit paths
|
|
7
|
-
init.py --base-dir PATH [--harness-config PATH] # advanced options
|
|
8
|
-
|
|
9
|
-
Auto-detects harness.py, eval.py, tasks/ and config.json in the working directory.
|
|
10
|
-
Falls back to fuzzy matching (*harness*, *eval*, *score*, dirs with .json files).
|
|
11
|
-
Stdlib-only. No external dependencies.
|
|
12
|
-
"""
|
|
13
|
-
|
|
14
|
-
import argparse
|
|
15
|
-
import glob
|
|
16
|
-
import json
|
|
17
|
-
import os
|
|
18
|
-
import shutil
|
|
19
|
-
import subprocess
|
|
20
|
-
import sys
|
|
21
|
-
import tempfile
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
def _auto_detect(search_dir):
|
|
25
|
-
"""Auto-detect harness, eval, and tasks in a directory.
|
|
26
|
-
|
|
27
|
-
Returns (harness_path, eval_path, tasks_path, config_path) or raises SystemExit.
|
|
28
|
-
"""
|
|
29
|
-
search_dir = os.path.abspath(search_dir)
|
|
30
|
-
|
|
31
|
-
# Exact convention names first
|
|
32
|
-
harness = None
|
|
33
|
-
eval_script = None
|
|
34
|
-
tasks = None
|
|
35
|
-
config = None
|
|
36
|
-
|
|
37
|
-
# 1. Exact matches
|
|
38
|
-
for name in ["harness.py"]:
|
|
39
|
-
p = os.path.join(search_dir, name)
|
|
40
|
-
if os.path.isfile(p):
|
|
41
|
-
harness = p
|
|
42
|
-
for name in ["eval.py"]:
|
|
43
|
-
p = os.path.join(search_dir, name)
|
|
44
|
-
if os.path.isfile(p):
|
|
45
|
-
eval_script = p
|
|
46
|
-
for name in ["tasks", "tasks/"]:
|
|
47
|
-
p = os.path.join(search_dir, name.rstrip("/"))
|
|
48
|
-
if os.path.isdir(p):
|
|
49
|
-
tasks = p
|
|
50
|
-
for name in ["config.json"]:
|
|
51
|
-
p = os.path.join(search_dir, name)
|
|
52
|
-
if os.path.isfile(p):
|
|
53
|
-
config = p
|
|
54
|
-
|
|
55
|
-
# 2. Fuzzy fallback for harness
|
|
56
|
-
if not harness:
|
|
57
|
-
candidates = [f for f in glob.glob(os.path.join(search_dir, "*.py"))
|
|
58
|
-
if any(k in os.path.basename(f).lower() for k in ["harness", "agent", "run"])]
|
|
59
|
-
if len(candidates) == 1:
|
|
60
|
-
harness = candidates[0]
|
|
61
|
-
|
|
62
|
-
# 3. Fuzzy fallback for eval
|
|
63
|
-
if not eval_script:
|
|
64
|
-
candidates = [f for f in glob.glob(os.path.join(search_dir, "*.py"))
|
|
65
|
-
if any(k in os.path.basename(f).lower() for k in ["eval", "score", "judge"])
|
|
66
|
-
and f != harness]
|
|
67
|
-
if len(candidates) == 1:
|
|
68
|
-
eval_script = candidates[0]
|
|
69
|
-
|
|
70
|
-
# 4. Fuzzy fallback for tasks
|
|
71
|
-
if not tasks:
|
|
72
|
-
for d in os.listdir(search_dir):
|
|
73
|
-
dp = os.path.join(search_dir, d)
|
|
74
|
-
if os.path.isdir(dp) and any(f.endswith(".json") for f in os.listdir(dp)):
|
|
75
|
-
# Check if at least one JSON has "id" and "input" keys
|
|
76
|
-
for f in os.listdir(dp):
|
|
77
|
-
if f.endswith(".json"):
|
|
78
|
-
try:
|
|
79
|
-
with open(os.path.join(dp, f)) as fh:
|
|
80
|
-
data = json.load(fh)
|
|
81
|
-
if "id" in data and "input" in data:
|
|
82
|
-
tasks = dp
|
|
83
|
-
break
|
|
84
|
-
except (json.JSONDecodeError, KeyError):
|
|
85
|
-
pass
|
|
86
|
-
if tasks:
|
|
87
|
-
break
|
|
88
|
-
|
|
89
|
-
return harness, eval_script, tasks, config
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
def _detect_api_keys():
|
|
93
|
-
"""Detect which LLM/service API keys are available in the environment."""
|
|
94
|
-
KNOWN_KEYS = {
|
|
95
|
-
"ANTHROPIC_API_KEY": "Anthropic (Claude)",
|
|
96
|
-
"OPENAI_API_KEY": "OpenAI (GPT)",
|
|
97
|
-
"GOOGLE_API_KEY": "Google (Gemini)",
|
|
98
|
-
"GEMINI_API_KEY": "Google Gemini",
|
|
99
|
-
"OPENROUTER_API_KEY": "OpenRouter",
|
|
100
|
-
"LANGSMITH_API_KEY": "LangSmith",
|
|
101
|
-
"TOGETHER_API_KEY": "Together AI",
|
|
102
|
-
"GROQ_API_KEY": "Groq",
|
|
103
|
-
"MISTRAL_API_KEY": "Mistral",
|
|
104
|
-
"COHERE_API_KEY": "Cohere",
|
|
105
|
-
"FIREWORKS_API_KEY": "Fireworks AI",
|
|
106
|
-
"DEEPSEEK_API_KEY": "DeepSeek",
|
|
107
|
-
"XAI_API_KEY": "xAI (Grok)",
|
|
108
|
-
}
|
|
109
|
-
detected = {}
|
|
110
|
-
for env_var, display_name in KNOWN_KEYS.items():
|
|
111
|
-
if os.environ.get(env_var):
|
|
112
|
-
detected[env_var] = {"name": display_name, "status": "detected"}
|
|
113
|
-
return detected
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
def _detect_langsmith():
|
|
117
|
-
"""Auto-detect LangSmith API key and return config section."""
|
|
118
|
-
if os.environ.get("LANGSMITH_API_KEY"):
|
|
119
|
-
return {
|
|
120
|
-
"enabled": True,
|
|
121
|
-
"api_key_env": "LANGSMITH_API_KEY",
|
|
122
|
-
"project_prefix": "harness-evolver",
|
|
123
|
-
}
|
|
124
|
-
return {"enabled": False}
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
def _detect_langsmith_project(search_dir="."):
|
|
128
|
-
"""Auto-detect the app's existing LangSmith project name.
|
|
129
|
-
|
|
130
|
-
Checks (in order):
|
|
131
|
-
1. LANGCHAIN_PROJECT env var (standard LangChain convention)
|
|
132
|
-
2. LANGSMITH_PROJECT env var (alternative)
|
|
133
|
-
3. .env file in the project directory
|
|
134
|
-
"""
|
|
135
|
-
for var in ("LANGCHAIN_PROJECT", "LANGSMITH_PROJECT"):
|
|
136
|
-
project = os.environ.get(var)
|
|
137
|
-
if project:
|
|
138
|
-
return project
|
|
139
|
-
|
|
140
|
-
# Parse .env file
|
|
141
|
-
for env_name in (".env", ".env.local"):
|
|
142
|
-
env_path = os.path.join(search_dir, env_name)
|
|
143
|
-
if os.path.exists(env_path):
|
|
144
|
-
try:
|
|
145
|
-
with open(env_path) as f:
|
|
146
|
-
for line in f:
|
|
147
|
-
line = line.strip()
|
|
148
|
-
if line.startswith("#") or "=" not in line:
|
|
149
|
-
continue
|
|
150
|
-
key, _, val = line.partition("=")
|
|
151
|
-
key = key.strip()
|
|
152
|
-
val = val.strip().strip("'\"")
|
|
153
|
-
if key in ("LANGCHAIN_PROJECT", "LANGSMITH_PROJECT") and val:
|
|
154
|
-
return val
|
|
155
|
-
except OSError:
|
|
156
|
-
pass
|
|
157
|
-
|
|
158
|
-
return None
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
def _check_langsmith_cli():
|
|
162
|
-
"""Check if langsmith-cli is installed."""
|
|
163
|
-
try:
|
|
164
|
-
r = subprocess.run(["langsmith-cli", "self", "detect"],
|
|
165
|
-
capture_output=True, text=True, timeout=5)
|
|
166
|
-
return r.returncode == 0
|
|
167
|
-
except FileNotFoundError:
|
|
168
|
-
return False
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
def _resolve_python():
|
|
172
|
-
"""Resolve the Python interpreter for subprocesses.
|
|
173
|
-
|
|
174
|
-
Uses the current interpreter (sys.executable) instead of hardcoded 'python3'.
|
|
175
|
-
This prevents version mismatches in monorepo setups where the harness may
|
|
176
|
-
need a specific venv Python different from the system python3.
|
|
177
|
-
"""
|
|
178
|
-
exe = sys.executable
|
|
179
|
-
if exe and os.path.isfile(exe):
|
|
180
|
-
return exe
|
|
181
|
-
return "python3"
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
def _detect_stack(harness_path):
|
|
185
|
-
"""Detect technology stack from harness imports."""
|
|
186
|
-
detect_stack_py = os.path.join(os.path.dirname(__file__), "detect_stack.py")
|
|
187
|
-
if not os.path.exists(detect_stack_py):
|
|
188
|
-
return {}
|
|
189
|
-
try:
|
|
190
|
-
r = subprocess.run(
|
|
191
|
-
[_resolve_python(), detect_stack_py, harness_path],
|
|
192
|
-
capture_output=True, text=True, timeout=30,
|
|
193
|
-
)
|
|
194
|
-
if r.returncode == 0 and r.stdout.strip():
|
|
195
|
-
return json.loads(r.stdout)
|
|
196
|
-
except Exception:
|
|
197
|
-
pass
|
|
198
|
-
return {}
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
def _check_context7_available():
|
|
202
|
-
"""Check if Context7 MCP is configured in Claude Code."""
|
|
203
|
-
settings_paths = [
|
|
204
|
-
os.path.expanduser("~/.claude/settings.json"),
|
|
205
|
-
os.path.expanduser("~/.claude.json"),
|
|
206
|
-
]
|
|
207
|
-
for path in settings_paths:
|
|
208
|
-
if os.path.exists(path):
|
|
209
|
-
try:
|
|
210
|
-
with open(path) as f:
|
|
211
|
-
settings = json.load(f)
|
|
212
|
-
mcp = settings.get("mcpServers", {})
|
|
213
|
-
if "context7" in mcp or "Context7" in mcp:
|
|
214
|
-
return True
|
|
215
|
-
except (json.JSONDecodeError, KeyError):
|
|
216
|
-
pass
|
|
217
|
-
return False
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
def main():
    """Initialize a .harness-evolver/ workspace for an existing project.

    End-to-end flow: parse/auto-detect paths -> copy baseline harness, eval
    script, and tasks into the workspace -> generate config.json (with API
    key / LangSmith / stack / architecture detection folded in) -> optionally
    seed production traces -> validate and evaluate the baseline via the
    evaluate.py helper -> record the baseline score via state.py.

    Exits non-zero when required inputs cannot be found, validation fails,
    or state initialization fails.
    """
    parser = argparse.ArgumentParser(
        description="Initialize Harness Evolver project",
        usage="init.py [DIR] [--harness PATH] [--eval PATH] [--tasks PATH]",
    )
    parser.add_argument("dir", nargs="?", default=".",
                        help="Directory to scan (default: current directory)")
    parser.add_argument("--harness", default=None, help="Path to harness script")
    parser.add_argument("--eval", default=None, help="Path to eval script")
    parser.add_argument("--tasks", default=None, help="Path to tasks directory")
    parser.add_argument("--base-dir", default=None, help="Path for .harness-evolver/")
    parser.add_argument("--harness-config", default=None, help="Path to harness config.json")
    parser.add_argument("--tools-dir", default=None, help="Path to tools directory")
    parser.add_argument("--validation-timeout", type=int, default=30,
                        help="Timeout for harness validation in seconds (default: 30). "
                             "Increase for LLM-powered agents that make real API calls.")
    parser.add_argument("--skip-validation", action="store_true",
                        help="Skip harness validation step. Use when you know the harness "
                             "works but validation times out (e.g. real LLM agent calls).")
    parser.add_argument("--langsmith-project", default=None,
                        help="Existing LangSmith project name with production traces. "
                             "Auto-detected from LANGCHAIN_PROJECT / LANGSMITH_PROJECT env vars or .env file.")
    args = parser.parse_args()

    # Auto-detect missing args; explicit CLI values always win.
    search_dir = os.path.abspath(args.dir)
    if not args.harness or not args.eval or not args.tasks:
        detected_harness, detected_eval, detected_tasks, detected_config = _auto_detect(search_dir)
        if not args.harness:
            args.harness = detected_harness
        if not args.eval:
            args.eval = detected_eval
        if not args.tasks:
            args.tasks = detected_tasks
        if not args.harness_config and detected_config:
            args.harness_config = detected_config

    # Validate we have everything; harness, eval, and tasks are mandatory.
    missing = []
    if not args.harness:
        missing.append("harness (no harness.py or *harness*.py found)")
    if not args.eval:
        missing.append("eval (no eval.py or *eval*.py found)")
    if not args.tasks:
        missing.append("tasks (no tasks/ directory with JSON files found)")
    if missing:
        print("Could not auto-detect:", file=sys.stderr)
        for m in missing:
            print(f" - {m}", file=sys.stderr)
        print(f"\nSearched in: {search_dir}", file=sys.stderr)
        print("\nProvide explicitly:", file=sys.stderr)
        print(" /harness-evolve-init --harness PATH --eval PATH --tasks PATH", file=sys.stderr)
        sys.exit(1)

    # Print what was detected so the user can sanity-check the paths.
    print(f"Harness: {os.path.relpath(args.harness, search_dir)}")
    print(f"Eval: {os.path.relpath(args.eval, search_dir)}")
    print(f"Tasks: {os.path.relpath(args.tasks, search_dir)}/")
    if args.harness_config:
        print(f"Config: {os.path.relpath(args.harness_config, search_dir)}")
    print()

    # Workspace root and the helper scripts that live next to this file.
    base = args.base_dir or os.path.join(search_dir, ".harness-evolver")
    tools = args.tools_dir or os.path.dirname(__file__)

    evaluate_py = os.path.join(tools, "evaluate.py")
    state_py = os.path.join(tools, "state.py")

    # 1. Create directory structure
    for d in ["baseline", "eval/tasks", "harnesses"]:
        os.makedirs(os.path.join(base, d), exist_ok=True)

    # 2. Copy baseline harness (copy2 preserves mtime/permissions)
    shutil.copy2(args.harness, os.path.join(base, "baseline", "harness.py"))
    if args.harness_config and os.path.exists(args.harness_config):
        shutil.copy2(args.harness_config, os.path.join(base, "baseline", "config.json"))

    # 3. Copy eval script and tasks (top-level files only, no subdirectories)
    shutil.copy2(args.eval, os.path.join(base, "eval", "eval.py"))
    for f in os.listdir(args.tasks):
        src = os.path.join(args.tasks, f)
        if os.path.isfile(src):
            shutil.copy2(src, os.path.join(base, "eval", "tasks", f))

    # 4. Generate config.json. The {placeholder} strings in args are
    # substituted later by the evaluate.py runner, not here.
    harness_name = os.path.basename(args.harness)
    eval_name = os.path.basename(args.eval)
    config = {
        "version": "0.1.0",
        "harness": {
            "command": f"python3 {harness_name}",
            "args": ["--input", "{input}", "--output", "{output}",
                     "--traces-dir", "{traces_dir}", "--config", "{config}"],
            "timeout_per_task_sec": 60,
        },
        "eval": {
            "command": f"python3 {eval_name}",
            "args": ["--results-dir", "{results_dir}", "--tasks-dir", "{tasks_dir}",
                     "--scores", "{scores}"],
            "langsmith": _detect_langsmith(),
            "production_project": args.langsmith_project or _detect_langsmith_project(search_dir),
        },
        "evolution": {
            "max_iterations": 10,
            "candidates_per_iter": 1,
            "stagnation_limit": 3,
            "stagnation_threshold": 0.01,
            "target_score": None,
        },
        "paths": {
            "baseline": "baseline/",
            "eval_tasks": "eval/tasks/",
            "eval_script": "eval/eval.py",
            "harnesses": "harnesses/",
        },
    }
    # Detect API keys available in environment
    api_keys = _detect_api_keys()
    config["api_keys"] = api_keys

    with open(os.path.join(base, "config.json"), "w") as f:
        json.dump(config, f, indent=2)

    if api_keys:
        print("API keys detected:")
        for env_var, info in api_keys.items():
            print(f" {info['name']} ({env_var})")
        print()

    ls_config = config["eval"].get("langsmith", {})
    if ls_config.get("enabled"):
        print(" LangSmith tracing enabled (LANGSMITH_API_KEY detected)")
        if _check_langsmith_cli():
            print(" langsmith-cli detected — proposer will use it for trace analysis")
        else:
            print(" Recommendation: install langsmith-cli for rich trace analysis:")
            print(" uv tool install langsmith-cli && langsmith-cli auth login")

    # Detect stack — try original harness first, then baseline copy, then scan entire source dir
    stack = _detect_stack(os.path.abspath(args.harness))
    if not stack:
        stack = _detect_stack(os.path.join(base, "baseline", "harness.py"))
    if not stack:
        # Scan the original directory for any .py files with known imports.
        # NOTE(review): this inlines the subprocess call instead of reusing
        # _detect_stack because the helper is invoked on a directory here.
        harness_dir = os.path.dirname(os.path.abspath(args.harness))
        detect_stack_py = os.path.join(os.path.dirname(__file__), "detect_stack.py")
        if os.path.exists(detect_stack_py):
            try:
                r = subprocess.run(
                    [_resolve_python(), detect_stack_py, harness_dir],
                    capture_output=True, text=True, timeout=30,
                )
                if r.returncode == 0 and r.stdout.strip():
                    stack = json.loads(r.stdout)
            except Exception:
                pass
    config["stack"] = {
        "detected": stack if stack else {},
        "documentation_hint": "use context7",
        "auto_detected": True,
    }
    # Re-write config.json with stack section added
    with open(os.path.join(base, "config.json"), "w") as f:
        json.dump(config, f, indent=2)

    if stack:
        print("Stack detected:")
        # Assumes each detect_stack.py entry has a 'display' key — the helper
        # is not visible here; confirm against detect_stack.py's output schema.
        for lib_info in stack.values():
            print(f" {lib_info['display']}")
        if not _check_context7_available():
            print("\nRecommendation: install Context7 MCP for up-to-date documentation:")
            print(" claude mcp add context7 -- npx -y @upstash/context7-mcp@latest")

    # Architecture analysis (quick, advisory)
    # Auto-detect additional source files by scanning for .py files near the harness
    analyze_py = os.path.join(tools, "analyze_architecture.py")
    if os.path.exists(analyze_py):
        try:
            harness_dir = os.path.dirname(os.path.abspath(args.harness))
            source_files = []
            for fname in os.listdir(harness_dir):
                fpath = os.path.join(harness_dir, fname)
                if fname.endswith(".py") and os.path.isfile(fpath) and fpath != os.path.abspath(args.harness):
                    source_files.append(fpath)
            arch_cmd = [_resolve_python(), analyze_py, "--harness", args.harness]
            if source_files:
                # Cap at 10 extra files to keep the analysis call bounded.
                arch_cmd.extend(["--source-files"] + source_files[:10])
            r = subprocess.run(
                arch_cmd,
                capture_output=True, text=True, timeout=30,
            )
            if r.returncode == 0 and r.stdout.strip():
                arch_signals = json.loads(r.stdout)
                config["architecture"] = {
                    "current_topology": arch_signals.get("code_signals", {}).get("estimated_topology", "unknown"),
                    "auto_analyzed": True,
                }
                # Re-write config with architecture
                with open(os.path.join(base, "config.json"), "w") as f:
                    json.dump(config, f, indent=2)
                topo = config["architecture"]["current_topology"]
                if topo != "unknown":
                    print(f"Architecture: {topo}")
        except Exception:
            # Advisory only: analysis failure never blocks init.
            pass

    # 4.5 Fetch production traces seed (if LangSmith production project detected)
    prod_project = config["eval"].get("production_project")
    if prod_project and os.environ.get("LANGSMITH_API_KEY"):
        seed_py = os.path.join(tools, "seed_from_traces.py")
        if os.path.exists(seed_py):
            print(f"Fetching production traces from LangSmith project '{prod_project}'...")
            try:
                r = subprocess.run(
                    [_resolve_python(), seed_py,
                     "--project", prod_project,
                     "--output-md", os.path.join(base, "production_seed.md"),
                     "--output-json", os.path.join(base, "production_seed.json"),
                     "--limit", "100"],
                    capture_output=True, text=True, timeout=60,
                )
                if r.returncode == 0:
                    print(r.stdout.strip())
                else:
                    print(f" Could not fetch production traces: {r.stderr.strip()[:200]}")
            except Exception as e:
                print(f" Production trace fetch failed: {e}")
    elif prod_project:
        print(f"Production LangSmith project detected: {prod_project}")
        print(" Set LANGSMITH_API_KEY to auto-fetch production traces during init.")

    # 5. Validate baseline harness (a single smoke-test run via evaluate.py)
    config_path = os.path.join(base, "baseline", "config.json")
    if args.skip_validation:
        print("Skipping baseline validation (--skip-validation).")
    else:
        print(f"Validating baseline harness (timeout: {args.validation_timeout}s)...")
        val_args = [_resolve_python(), evaluate_py, "validate",
                    "--harness", os.path.join(base, "baseline", "harness.py"),
                    "--timeout", str(args.validation_timeout)]
        if os.path.exists(config_path):
            val_args.extend(["--config", config_path])
        r = subprocess.run(val_args, capture_output=True, text=True)
        if r.returncode != 0:
            hint = ""
            # "TIMEOUT" in stderr is evaluate.py's timeout marker — presumably;
            # confirm against evaluate.py's error output format.
            if "TIMEOUT" in r.stderr:
                hint = (f"\n\nHint: The harness timed out after {args.validation_timeout}s. "
                        "This is common for LLM-powered agents that make real API calls.\n"
                        "Try: --validation-timeout 120 (or --skip-validation to bypass)")
            print(f"FAIL: baseline harness validation failed.\n{r.stderr}{hint}", file=sys.stderr)
            sys.exit(1)
        print(r.stdout.strip())

    # 6. Evaluate baseline. Overall budget: at least 300s, otherwise
    # (tasks x per-task timeout) plus 60s of slack.
    num_tasks = len([f for f in os.listdir(os.path.join(base, "eval", "tasks")) if f.endswith(".json")])
    per_task_timeout = max(args.validation_timeout, 60)
    eval_timeout = max(num_tasks * per_task_timeout + 60, 300)
    print(f"Evaluating baseline harness ({num_tasks} tasks, timeout: {eval_timeout}s)...")
    baseline_traces = tempfile.mkdtemp()
    baseline_scores = os.path.join(base, "baseline_scores.json")
    eval_args = [
        _resolve_python(), evaluate_py, "run",
        "--harness", os.path.join(base, "baseline", "harness.py"),
        "--tasks-dir", os.path.join(base, "eval", "tasks"),
        "--eval", os.path.join(base, "eval", "eval.py"),
        "--traces-dir", baseline_traces,
        "--scores", baseline_scores,
        "--timeout", str(per_task_timeout),
    ]
    if os.path.exists(config_path):
        eval_args.extend(["--config", config_path])
    try:
        r = subprocess.run(eval_args, capture_output=True, text=True, timeout=eval_timeout)
    except subprocess.TimeoutExpired:
        print(f"WARNING: baseline evaluation timed out after {eval_timeout}s "
              f"({num_tasks} tasks at {per_task_timeout}s/task). "
              f"Using score 0.0. Run evaluation separately with more time.",
              file=sys.stderr)
        r = None
    # Evaluation failure is non-fatal: init proceeds with a 0.0 baseline.
    if r is not None and r.returncode != 0:
        print(f"WARNING: baseline evaluation failed. Using score 0.0.\n{r.stderr}", file=sys.stderr)
    if r is not None and r.returncode == 0:
        print(r.stdout.strip())
    if r is not None and r.returncode == 0 and os.path.exists(baseline_scores):
        with open(baseline_scores) as f:
            scores = json.load(f)
        baseline_score = scores.get("combined_score", 0.0)
    else:
        baseline_score = 0.0

    # Scores file was only needed to read the combined score; clean it up.
    if os.path.exists(baseline_scores):
        os.remove(baseline_scores)

    # 7. Initialize state with baseline score
    print(f"Baseline score: {baseline_score:.2f}")
    r = subprocess.run(
        [_resolve_python(), state_py, "init",
         "--base-dir", base,
         "--baseline-score", str(baseline_score)],
        capture_output=True, text=True,
    )
    if r.returncode != 0:
        print(f"FAIL: state init failed.\n{r.stderr}", file=sys.stderr)
        sys.exit(1)

    print(f"\nInitialized .harness-evolver/ at {base}")
    print(f"Baseline score: {baseline_score:.2f}")
    print("Run /harness-evolve to start the optimization loop.")


if __name__ == "__main__":
    main()
|
package/tools/llm_api.py
DELETED
|
@@ -1,125 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
|
-
"""Shared LLM API calling utility. Stdlib-only (urllib).
|
|
3
|
-
|
|
4
|
-
Auto-detects the best available provider from environment variables.
|
|
5
|
-
Supports: Gemini, OpenAI, Anthropic, OpenRouter.
|
|
6
|
-
"""
|
|
7
|
-
|
|
8
|
-
import json
|
|
9
|
-
import os
|
|
10
|
-
import time
|
|
11
|
-
from urllib.request import Request, urlopen
|
|
12
|
-
from urllib.error import HTTPError
|
|
13
|
-
|
|
14
|
-
# Ordered fallback list for provider auto-detection: the first entry whose
# environment variable is set wins.  Each tuple is
# (env var name, provider id understood by call_llm, default model id).
PROVIDER_PRIORITY = [
    ("GEMINI_API_KEY", "gemini", "gemini-2.5-flash"),
    ("GOOGLE_API_KEY", "gemini", "gemini-2.5-flash"),
    ("OPENROUTER_API_KEY", "openrouter", "google/gemini-2.5-flash"),
    ("OPENAI_API_KEY", "openai", "gpt-4o-mini"),
    ("ANTHROPIC_API_KEY", "anthropic", "claude-haiku-4-5-20251001"),
]
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
def detect_provider():
    """Auto-detect best available LLM provider from env vars.

    Walks PROVIDER_PRIORITY in order and returns the first entry whose
    environment variable holds a non-empty value, as a tuple of
    (provider_name, api_key, model). Raises RuntimeError when none is set.
    """
    for env_var, provider, model in PROVIDER_PRIORITY:
        api_key = os.environ.get(env_var, "")
        if not api_key:
            continue
        return provider, api_key, model
    expected = ", ".join(e for e, _, _ in PROVIDER_PRIORITY)
    raise RuntimeError(
        "No LLM API key found. Set one of: " + expected
    )
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
def call_llm(provider, api_key, model, prompt, max_tokens=4096, temperature=0.0):
    """Call LLM API via urllib. Returns response text. Retries 3x with backoff.

    ValueError (including an unknown provider, and JSON decode errors from a
    provider helper) aborts immediately; any other failure is retried with
    exponential backoff (1s, 2s) before the final attempt re-raises.
    """
    handlers = {
        "gemini": _call_gemini,
        "openai": _call_openai,
        "anthropic": _call_anthropic,
        "openrouter": _call_openrouter,
    }
    for attempt in range(3):
        try:
            handler = handlers.get(provider)
            if handler is None:
                raise ValueError(f"Unknown provider: {provider}")
            return handler(api_key, model, prompt, max_tokens, temperature)
        except ValueError:
            raise
        except Exception:
            if attempt == 2:
                raise
            time.sleep(2 ** attempt)
    raise RuntimeError("All retries failed")
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
def _call_gemini(api_key, model, prompt, max_tokens, temperature):
    """POST a single-turn prompt to Gemini's generateContent endpoint.

    Returns the text of the first candidate. The API key is sent in the
    x-goog-api-key header (supported by the Gemini REST API) instead of a
    URL query parameter, so the secret no longer leaks into proxy/server
    logs that record request URLs.
    """
    url = (
        f"https://generativelanguage.googleapis.com/v1beta/models/"
        f"{model}:generateContent"
    )
    body = json.dumps({
        "contents": [{"parts": [{"text": prompt}]}],
        "generationConfig": {
            "maxOutputTokens": max_tokens,
            # Clamp: Gemini rejects negative temperatures.
            "temperature": max(temperature, 0.0),
        },
    }).encode()
    req = Request(url, data=body, headers={
        "Content-Type": "application/json",
        "x-goog-api-key": api_key,
    })
    with urlopen(req, timeout=60) as resp:
        data = json.loads(resp.read())
    # Raises KeyError/IndexError if the response has no candidates (e.g.
    # safety-blocked); call_llm's retry loop handles that as a failure.
    return data["candidates"][0]["content"]["parts"][0]["text"]
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
def _call_openai(api_key, model, prompt, max_tokens, temperature):
    """POST a single-turn chat completion to OpenAI; return the reply text."""
    endpoint = "https://api.openai.com/v1/chat/completions"
    payload = {
        "model": model,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "messages": [{"role": "user", "content": prompt}],
    }
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}",
    }
    request = Request(endpoint, data=json.dumps(payload).encode(), headers=headers)
    with urlopen(request, timeout=60) as response:
        parsed = json.loads(response.read())
    return parsed["choices"][0]["message"]["content"]
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
def _call_anthropic(api_key, model, prompt, max_tokens, temperature):
    """POST a single-turn prompt to the Anthropic Messages API; return text.

    Fix: `temperature` was accepted but silently dropped from the request
    body, so Anthropic calls ignored the caller's setting while every other
    provider honored it. It is now forwarded like the other backends do.
    """
    url = "https://api.anthropic.com/v1/messages"
    body = json.dumps({
        "model": model,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "messages": [{"role": "user", "content": prompt}],
    }).encode()
    req = Request(url, data=body, headers={
        "Content-Type": "application/json",
        "x-api-key": api_key,
        "anthropic-version": "2023-06-01",
    })
    with urlopen(req, timeout=60) as resp:
        data = json.loads(resp.read())
    return data["content"][0]["text"]
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
def _call_openrouter(api_key, model, prompt, max_tokens, temperature):
    """POST a single-turn chat completion to OpenRouter; return the reply text."""
    endpoint = "https://openrouter.ai/api/v1/chat/completions"
    payload = {
        "model": model,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "messages": [{"role": "user", "content": prompt}],
    }
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}",
    }
    request = Request(endpoint, data=json.dumps(payload).encode(), headers=headers)
    with urlopen(request, timeout=60) as response:
        parsed = json.loads(response.read())
    return parsed["choices"][0]["message"]["content"]
|