harness-evolver 2.0.0 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/agents/harness-evolver-judge.md +110 -0
- package/agents/harness-evolver-testgen.md +97 -0
- package/package.json +1 -1
- package/skills/architect/SKILL.md +2 -10
- package/skills/critic/SKILL.md +2 -10
- package/skills/evolve/SKILL.md +43 -39
- package/skills/init/SKILL.md +36 -3
- package/tools/eval_llm_judge.py +233 -0
- package/tools/eval_passthrough.py +55 -0
- package/tools/llm_api.py +125 -0
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: harness-evolver-judge
|
|
3
|
+
description: |
|
|
4
|
+
Use this agent to evaluate harness outputs using multi-dimensional LLM-as-judge scoring.
|
|
5
|
+
Spawned by the evolve skill when eval returns pending scores (eval_type=pending-judge).
|
|
6
|
+
tools: Read, Write, Bash, Grep, Glob
|
|
7
|
+
color: yellow
|
|
8
|
+
---
|
|
9
|
+
|
|
10
|
+
# Harness Evolver — Judge Agent
|
|
11
|
+
|
|
12
|
+
You are an expert evaluator. Your job is to score harness outputs on multiple quality dimensions.
|
|
13
|
+
|
|
14
|
+
## Bootstrap
|
|
15
|
+
|
|
16
|
+
If your prompt contains a `<files_to_read>` block, you MUST use the Read tool to load
|
|
17
|
+
every file listed there before performing any other actions.
|
|
18
|
+
|
|
19
|
+
## Return Protocol
|
|
20
|
+
|
|
21
|
+
When done, end your response with:
|
|
22
|
+
|
|
23
|
+
## JUDGE COMPLETE
|
|
24
|
+
- **Tasks scored**: {N}
|
|
25
|
+
- **Combined score**: {score}
|
|
26
|
+
- **Dimensions**: accuracy={X}, completeness={X}, relevance={X}, no_hallucination={X}
|
|
27
|
+
|
|
28
|
+
## Your Workflow
|
|
29
|
+
|
|
30
|
+
### Phase 1: Load All Tasks and Outputs
|
|
31
|
+
|
|
32
|
+
Read the scores.json file (which has per_task entries with input/output but score=-1).
|
|
33
|
+
For each task, you have the input (what was asked) and the output (what the harness produced).
|
|
34
|
+
|
|
35
|
+
Also read the task files from eval/tasks/ to get any additional context (expected answers, metadata).
|
|
36
|
+
|
|
37
|
+
### Phase 2: Score Each Task
|
|
38
|
+
|
|
39
|
+
For each task, evaluate the output on 4 dimensions (1-5 integer scale):
|
|
40
|
+
|
|
41
|
+
**1. Accuracy (weight 0.4)**
|
|
42
|
+
- 5: Perfectly correct, addresses the question precisely
|
|
43
|
+
- 4: Mostly correct, minor inaccuracies
|
|
44
|
+
- 3: Partially correct, significant gaps
|
|
45
|
+
- 2: Mostly incorrect, but shows some understanding
|
|
46
|
+
- 1: Completely wrong or irrelevant
|
|
47
|
+
|
|
48
|
+
**2. Completeness (weight 0.2)**
|
|
49
|
+
- 5: Covers all aspects of the question
|
|
50
|
+
- 4: Covers most aspects
|
|
51
|
+
- 3: Covers some aspects, misses important ones
|
|
52
|
+
- 2: Very incomplete
|
|
53
|
+
- 1: Barely addresses the question
|
|
54
|
+
|
|
55
|
+
**3. Relevance (weight 0.2)**
|
|
56
|
+
- 5: Entirely focused on the question
|
|
57
|
+
- 4: Mostly relevant with minor tangents
|
|
58
|
+
- 3: Somewhat relevant but includes irrelevant information
|
|
59
|
+
- 2: Mostly irrelevant
|
|
60
|
+
- 1: Completely off-topic
|
|
61
|
+
|
|
62
|
+
**4. No-hallucination (weight 0.2)**
|
|
63
|
+
- 5: All claims supported by context/facts
|
|
64
|
+
- 4: Minor unsupported details
|
|
65
|
+
- 3: Some fabricated information
|
|
66
|
+
- 2: Significant hallucination
|
|
67
|
+
- 1: Mostly fabricated
|
|
68
|
+
|
|
69
|
+
If the task has an `expected` field, use it as a reference for accuracy scoring.
|
|
70
|
+
If no `expected` field, judge based on the quality and correctness of the output alone.
|
|
71
|
+
|
|
72
|
+
### Phase 3: Calculate Scores
|
|
73
|
+
|
|
74
|
+
For each task:
|
|
75
|
+
- Normalize each dimension: (score - 1) / 4 → 0.0 to 1.0
|
|
76
|
+
- Combined per-task score = accuracy*0.4 + completeness*0.2 + relevance*0.2 + no_hallucination*0.2
|
|
77
|
+
|
|
78
|
+
Overall combined_score = mean of all per-task combined scores.
|
|
79
|
+
|
|
80
|
+
### Phase 4: Write scores.json
|
|
81
|
+
|
|
82
|
+
Overwrite `.harness-evolver/harnesses/{version}/scores.json` with:
|
|
83
|
+
|
|
84
|
+
```json
|
|
85
|
+
{
|
|
86
|
+
"combined_score": 0.78,
|
|
87
|
+
"eval_type": "llm-judge",
|
|
88
|
+
"dimensions": {"accuracy": 0.85, "completeness": 0.72, "relevance": 0.80, "no_hallucination": 0.75},
|
|
89
|
+
"weights": {"accuracy": 0.4, "completeness": 0.2, "relevance": 0.2, "no_hallucination": 0.2},
|
|
90
|
+
"total_tasks": 30,
|
|
91
|
+
"per_task": {
|
|
92
|
+
"task_001": {
|
|
93
|
+
"score": 0.85,
|
|
94
|
+
"accuracy": 4,
|
|
95
|
+
"completeness": 3,
|
|
96
|
+
"relevance": 4,
|
|
97
|
+
"no_hallucination": 4,
|
|
98
|
+
"reasoning": "Brief explanation of scoring"
|
|
99
|
+
}
|
|
100
|
+
}
|
|
101
|
+
}
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
## Rules
|
|
105
|
+
|
|
106
|
+
1. **Be consistent** — similar quality outputs should get similar scores across tasks
|
|
107
|
+
2. **Be fair** — don't penalize for style/format if the content is correct
|
|
108
|
+
3. **Be specific in reasoning** — cite what's wrong or right, don't just say "good" or "bad"
|
|
109
|
+
4. **Don't score based on length** — a concise correct answer scores higher than a verbose wrong one
|
|
110
|
+
5. **Handle edge cases** — empty output = score 1 on all dimensions; error output = score 1 on all dimensions
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: harness-evolver-testgen
|
|
3
|
+
description: |
|
|
4
|
+
Use this agent to generate synthetic test cases from harness source code analysis.
|
|
5
|
+
Spawned by the init skill when no test cases exist in the project.
|
|
6
|
+
tools: Read, Write, Bash, Glob, Grep
|
|
7
|
+
color: cyan
|
|
8
|
+
---
|
|
9
|
+
|
|
10
|
+
# Harness Evolver — Test Generation Agent
|
|
11
|
+
|
|
12
|
+
You are a test case generator. Your job is to read the harness source code, understand its domain, and generate diverse, challenging test cases.
|
|
13
|
+
|
|
14
|
+
## Bootstrap
|
|
15
|
+
|
|
16
|
+
If your prompt contains a `<files_to_read>` block, you MUST use the Read tool to load
|
|
17
|
+
every file listed there before performing any other actions.
|
|
18
|
+
|
|
19
|
+
## Return Protocol
|
|
20
|
+
|
|
21
|
+
When done, end your response with:
|
|
22
|
+
|
|
23
|
+
## TESTGEN COMPLETE
|
|
24
|
+
- **Tasks generated**: {N}
|
|
25
|
+
- **Categories covered**: {list}
|
|
26
|
+
- **Distribution**: {N} standard, {N} edge, {N} cross-domain, {N} adversarial
|
|
27
|
+
|
|
28
|
+
## Your Workflow
|
|
29
|
+
|
|
30
|
+
### Phase 1: Understand the Domain
|
|
31
|
+
|
|
32
|
+
Read the harness source code to understand:
|
|
33
|
+
- What kind of agent is this? (Q&A bot, RAG, classifier, coding agent, etc.)
|
|
34
|
+
- What format does it expect for inputs?
|
|
35
|
+
- What categories/topics does it cover?
|
|
36
|
+
- What are its likely failure modes?
|
|
37
|
+
- Are there any data files (knowledge bases, docs, etc.) that define the domain?
|
|
38
|
+
|
|
39
|
+
### Phase 2: Design Test Distribution
|
|
40
|
+
|
|
41
|
+
Plan 30 test cases with this distribution:
|
|
42
|
+
- **40% Standard** (12 tasks): typical, well-formed inputs representative of the domain
|
|
43
|
+
- **20% Edge Cases** (6 tasks): boundary conditions, minimal inputs, unusual but valid
|
|
44
|
+
- **20% Cross-Domain** (6 tasks): inputs spanning multiple categories or requiring nuanced judgment
|
|
45
|
+
- **20% Adversarial** (6 tasks): misleading, ambiguous, or designed to expose weaknesses
|
|
46
|
+
|
|
47
|
+
Ensure all categories/topics from the harness are covered.
|
|
48
|
+
|
|
49
|
+
### Phase 3: Generate Tasks
|
|
50
|
+
|
|
51
|
+
Create each task as a JSON file in the tasks/ directory.
|
|
52
|
+
|
|
53
|
+
Format (WITHOUT expected — for LLM-as-judge eval):
|
|
54
|
+
```json
|
|
55
|
+
{
|
|
56
|
+
"id": "task_001",
|
|
57
|
+
"input": "The actual question or request",
|
|
58
|
+
"metadata": {
|
|
59
|
+
"difficulty": "easy|medium|hard",
|
|
60
|
+
"category": "the domain category",
|
|
61
|
+
"type": "standard|edge|cross_domain|adversarial"
|
|
62
|
+
}
|
|
63
|
+
}
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
Format (WITH expected — when using keyword eval):
|
|
67
|
+
```json
|
|
68
|
+
{
|
|
69
|
+
"id": "task_001",
|
|
70
|
+
"input": "The actual question or request",
|
|
71
|
+
"expected": "The expected answer or key phrases",
|
|
72
|
+
"metadata": {
|
|
73
|
+
"difficulty": "easy|medium|hard",
|
|
74
|
+
"category": "the domain category",
|
|
75
|
+
"type": "standard|edge|cross_domain|adversarial"
|
|
76
|
+
}
|
|
77
|
+
}
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
Use the Write tool to create each file. Name them task_001.json through task_030.json.
|
|
81
|
+
|
|
82
|
+
### Phase 4: Validate
|
|
83
|
+
|
|
84
|
+
After generating all tasks:
|
|
85
|
+
- Verify each file is valid JSON
|
|
86
|
+
- Verify all IDs are unique
|
|
87
|
+
- Verify the distribution matches the target (40/20/20/20)
|
|
88
|
+
- Verify all domain categories are represented
|
|
89
|
+
|
|
90
|
+
## Rules
|
|
91
|
+
|
|
92
|
+
1. **Inputs must be realistic** — questions a real user would ask, not synthetic-sounding
|
|
93
|
+
2. **Vary phrasing** — don't use the same sentence structure repeatedly
|
|
94
|
+
3. **Include some hard questions** — questions that require reasoning, not just lookup
|
|
95
|
+
4. **Include out-of-scope questions** — 2-3 questions the agent should NOT be able to answer
|
|
96
|
+
5. **Test failure modes** — ambiguous questions, misspellings, multi-part questions
|
|
97
|
+
6. **Use the domain's language** — if the harness handles Portuguese, write inputs in Portuguese
|
package/package.json
CHANGED
|
@@ -48,21 +48,13 @@ python3 $TOOLS/analyze_architecture.py \
|
|
|
48
48
|
-o .harness-evolver/architecture_signals.json
|
|
49
49
|
```
|
|
50
50
|
|
|
51
|
-
3.
|
|
52
|
-
```bash
|
|
53
|
-
cat ~/.claude/agents/harness-evolver-architect.md
|
|
54
|
-
```
|
|
55
|
-
|
|
56
|
-
4. Dispatch using the Agent tool — include the agent definition in the prompt:
|
|
51
|
+
3. Dispatch using the Agent tool with `subagent_type: "harness-evolver-architect"`:
|
|
57
52
|
|
|
58
53
|
```
|
|
59
54
|
Agent(
|
|
55
|
+
subagent_type: "harness-evolver-architect",
|
|
60
56
|
description: "Architect: topology analysis",
|
|
61
57
|
prompt: |
|
|
62
|
-
<agent_instructions>
|
|
63
|
-
{paste the FULL content of harness-evolver-architect.md here}
|
|
64
|
-
</agent_instructions>
|
|
65
|
-
|
|
66
58
|
<objective>
|
|
67
59
|
Analyze the harness architecture and recommend the optimal multi-agent topology.
|
|
68
60
|
{If called from evolve: "The evolution loop stagnated/regressed after N iterations."}
|
package/skills/critic/SKILL.md
CHANGED
|
@@ -22,21 +22,13 @@ TOOLS=$([ -d ".harness-evolver/tools" ] && echo ".harness-evolver/tools" || echo
|
|
|
22
22
|
|
|
23
23
|
1. Read `summary.json` and identify the suspicious pattern (score jump, premature convergence).
|
|
24
24
|
|
|
25
|
-
2.
|
|
26
|
-
```bash
|
|
27
|
-
cat ~/.claude/agents/harness-evolver-critic.md
|
|
28
|
-
```
|
|
29
|
-
|
|
30
|
-
3. Dispatch using the Agent tool — include the agent definition in the prompt:
|
|
25
|
+
2. Dispatch using the Agent tool with `subagent_type: "harness-evolver-critic"`:
|
|
31
26
|
|
|
32
27
|
```
|
|
33
28
|
Agent(
|
|
29
|
+
subagent_type: "harness-evolver-critic",
|
|
34
30
|
description: "Critic: analyze eval quality",
|
|
35
31
|
prompt: |
|
|
36
|
-
<agent_instructions>
|
|
37
|
-
{paste the FULL content of harness-evolver-critic.md here}
|
|
38
|
-
</agent_instructions>
|
|
39
|
-
|
|
40
32
|
<objective>
|
|
41
33
|
Analyze eval quality for this harness evolution project.
|
|
42
34
|
The best version is {version} with score {score} achieved in {iterations} iteration(s).
|
package/skills/evolve/SKILL.md
CHANGED
|
@@ -57,6 +57,7 @@ If `LS_PROJECT` is empty, langsmith-cli is not available or no projects exist
|
|
|
57
57
|
```bash
|
|
58
58
|
if [ -n "$LS_PROJECT" ]; then
|
|
59
59
|
langsmith-cli --json runs list --project "$LS_PROJECT" --failed --fields id,name,error,inputs --limit 10 > .harness-evolver/langsmith_diagnosis.json 2>/dev/null || echo "[]" > .harness-evolver/langsmith_diagnosis.json
|
|
60
|
+
langsmith-cli --json runs list --project "$LS_PROJECT" --fields id,name,inputs,outputs,latency_ms,total_tokens --limit 20 > .harness-evolver/langsmith_runs.json 2>/dev/null || echo "[]" > .harness-evolver/langsmith_runs.json
|
|
60
61
|
langsmith-cli --json runs stats --project "$LS_PROJECT" > .harness-evolver/langsmith_stats.json 2>/dev/null || echo "{}" > .harness-evolver/langsmith_stats.json
|
|
61
62
|
echo "$LS_PROJECT" > .harness-evolver/langsmith_project.txt
|
|
62
63
|
else
|
|
@@ -72,28 +73,20 @@ These files are included in the proposer's `<files_to_read>` so it has real trac
|
|
|
72
73
|
Spawn 3 proposer agents IN PARALLEL, each with a different evolutionary strategy.
|
|
73
74
|
This follows the DGM/AlphaEvolve pattern: exploit + explore + crossover.
|
|
74
75
|
|
|
75
|
-
|
|
76
|
-
```bash
|
|
77
|
-
cat ~/.claude/agents/harness-evolver-proposer.md
|
|
78
|
-
```
|
|
79
|
-
|
|
80
|
-
Then determine parents for each strategy:
|
|
76
|
+
Determine parents for each strategy:
|
|
81
77
|
- **Exploiter parent**: current best version (from summary.json `best.version`)
|
|
82
78
|
- **Explorer parent**: a non-best version with low offspring count (read summary.json history, pick one that scored >0 but is NOT the best and has NOT been parent to many children)
|
|
83
79
|
- **Crossover parents**: best version + a different high-scorer from a different lineage
|
|
84
80
|
|
|
85
|
-
Spawn all 3 using the Agent tool
|
|
81
|
+
Spawn all 3 using the Agent tool with `subagent_type: "harness-evolver-proposer"`. The first 2 use `run_in_background: true`, the 3rd blocks:
|
|
86
82
|
|
|
87
83
|
**Candidate A (Exploiter)** — `run_in_background: true`:
|
|
88
84
|
```
|
|
89
85
|
Agent(
|
|
86
|
+
subagent_type: "harness-evolver-proposer",
|
|
90
87
|
description: "Proposer A (exploit): targeted fix for {version}",
|
|
91
88
|
run_in_background: true,
|
|
92
89
|
prompt: |
|
|
93
|
-
<agent_instructions>
|
|
94
|
-
{FULL content of harness-evolver-proposer.md}
|
|
95
|
-
</agent_instructions>
|
|
96
|
-
|
|
97
90
|
<strategy>
|
|
98
91
|
APPROACH: exploitation
|
|
99
92
|
You are the EXPLOITER. Make the SMALLEST, most targeted change that fixes
|
|
@@ -115,6 +108,7 @@ Agent(
|
|
|
115
108
|
- .harness-evolver/harnesses/{best_version}/proposal.md
|
|
116
109
|
- .harness-evolver/langsmith_diagnosis.json (if exists)
|
|
117
110
|
- .harness-evolver/langsmith_stats.json (if exists)
|
|
111
|
+
- .harness-evolver/langsmith_runs.json (if exists)
|
|
118
112
|
- .harness-evolver/architecture.json (if exists)
|
|
119
113
|
</files_to_read>
|
|
120
114
|
|
|
@@ -128,13 +122,10 @@ Agent(
|
|
|
128
122
|
**Candidate B (Explorer)** — `run_in_background: true`:
|
|
129
123
|
```
|
|
130
124
|
Agent(
|
|
125
|
+
subagent_type: "harness-evolver-proposer",
|
|
131
126
|
description: "Proposer B (explore): bold change from {explorer_parent}",
|
|
132
127
|
run_in_background: true,
|
|
133
128
|
prompt: |
|
|
134
|
-
<agent_instructions>
|
|
135
|
-
{FULL content of harness-evolver-proposer.md}
|
|
136
|
-
</agent_instructions>
|
|
137
|
-
|
|
138
129
|
<strategy>
|
|
139
130
|
APPROACH: exploration
|
|
140
131
|
You are the EXPLORER. Try a FUNDAMENTALLY DIFFERENT approach.
|
|
@@ -156,6 +147,7 @@ Agent(
|
|
|
156
147
|
- .harness-evolver/harnesses/{explorer_parent}/harness.py
|
|
157
148
|
- .harness-evolver/harnesses/{explorer_parent}/scores.json
|
|
158
149
|
- .harness-evolver/langsmith_diagnosis.json (if exists)
|
|
150
|
+
- .harness-evolver/langsmith_runs.json (if exists)
|
|
159
151
|
- .harness-evolver/architecture.json (if exists)
|
|
160
152
|
</files_to_read>
|
|
161
153
|
|
|
@@ -169,12 +161,9 @@ Agent(
|
|
|
169
161
|
**Candidate C (Crossover)** — blocks (last one):
|
|
170
162
|
```
|
|
171
163
|
Agent(
|
|
164
|
+
subagent_type: "harness-evolver-proposer",
|
|
172
165
|
description: "Proposer C (crossover): combine {parent_a} + {parent_b}",
|
|
173
166
|
prompt: |
|
|
174
|
-
<agent_instructions>
|
|
175
|
-
{FULL content of harness-evolver-proposer.md}
|
|
176
|
-
</agent_instructions>
|
|
177
|
-
|
|
178
167
|
<strategy>
|
|
179
168
|
APPROACH: crossover
|
|
180
169
|
You are the CROSSOVER agent. Combine the STRENGTHS of two different versions:
|
|
@@ -196,6 +185,7 @@ Agent(
|
|
|
196
185
|
- .harness-evolver/harnesses/{parent_b}/harness.py
|
|
197
186
|
- .harness-evolver/harnesses/{parent_b}/scores.json
|
|
198
187
|
- .harness-evolver/langsmith_diagnosis.json (if exists)
|
|
188
|
+
- .harness-evolver/langsmith_runs.json (if exists)
|
|
199
189
|
- .harness-evolver/architecture.json (if exists)
|
|
200
190
|
</files_to_read>
|
|
201
191
|
|
|
@@ -261,6 +251,36 @@ python3 $TOOLS/evaluate.py run \
|
|
|
261
251
|
--timeout 60
|
|
262
252
|
```
|
|
263
253
|
|
|
254
|
+
### 4.5. Judge (if eval returned pending scores)
|
|
255
|
+
|
|
256
|
+
For each evaluated candidate, read its scores.json. If `eval_type` is `"pending-judge"` (combined_score == -1), the eval was a passthrough and needs judge scoring.
|
|
257
|
+
|
|
258
|
+
Spawn judge subagent with `subagent_type: "harness-evolver-judge"` for EACH candidate that needs judging:
|
|
259
|
+
|
|
260
|
+
```
|
|
261
|
+
Agent(
|
|
262
|
+
subagent_type: "harness-evolver-judge",
|
|
263
|
+
description: "Judge: score {version}{suffix} outputs",
|
|
264
|
+
prompt: |
|
|
265
|
+
<objective>
|
|
266
|
+
Score the outputs of harness version {version}{suffix} across all {N} tasks.
|
|
267
|
+
</objective>
|
|
268
|
+
|
|
269
|
+
<files_to_read>
|
|
270
|
+
- .harness-evolver/harnesses/{version}{suffix}/scores.json
|
|
271
|
+
- .harness-evolver/eval/tasks/ (read all task files)
|
|
272
|
+
</files_to_read>
|
|
273
|
+
|
|
274
|
+
<output>
|
|
275
|
+
Overwrite .harness-evolver/harnesses/{version}{suffix}/scores.json with real scores.
|
|
276
|
+
</output>
|
|
277
|
+
)
|
|
278
|
+
```
|
|
279
|
+
|
|
280
|
+
Wait for `## JUDGE COMPLETE`.
|
|
281
|
+
|
|
282
|
+
If eval_type is NOT "pending-judge", the eval.py already produced real scores — skip this step.
|
|
283
|
+
|
|
264
284
|
### 5. Select Winner + Update State
|
|
265
285
|
|
|
266
286
|
Compare scores of all evaluated candidates. The winner is the one with highest combined_score.
|
|
@@ -312,21 +332,13 @@ python3 $TOOLS/evaluate.py run \
|
|
|
312
332
|
--timeout 60
|
|
313
333
|
```
|
|
314
334
|
|
|
315
|
-
|
|
316
|
-
```bash
|
|
317
|
-
cat ~/.claude/agents/harness-evolver-critic.md
|
|
318
|
-
```
|
|
319
|
-
|
|
320
|
-
Then dispatch:
|
|
335
|
+
Dispatch the critic agent:
|
|
321
336
|
|
|
322
337
|
```
|
|
323
338
|
Agent(
|
|
339
|
+
subagent_type: "harness-evolver-critic",
|
|
324
340
|
description: "Critic: analyze eval quality",
|
|
325
341
|
prompt: |
|
|
326
|
-
<agent_instructions>
|
|
327
|
-
{paste the FULL content of harness-evolver-critic.md here}
|
|
328
|
-
</agent_instructions>
|
|
329
|
-
|
|
330
342
|
<objective>
|
|
331
343
|
EVAL GAMING DETECTED: Score jumped from {parent_score} to {score} in one iteration.
|
|
332
344
|
Analyze the eval quality and propose a stricter eval.
|
|
@@ -389,21 +401,13 @@ python3 $TOOLS/analyze_architecture.py \
|
|
|
389
401
|
-o .harness-evolver/architecture_signals.json
|
|
390
402
|
```
|
|
391
403
|
|
|
392
|
-
|
|
393
|
-
```bash
|
|
394
|
-
cat ~/.claude/agents/harness-evolver-architect.md
|
|
395
|
-
```
|
|
396
|
-
|
|
397
|
-
Then dispatch:
|
|
404
|
+
Dispatch the architect agent:
|
|
398
405
|
|
|
399
406
|
```
|
|
400
407
|
Agent(
|
|
408
|
+
subagent_type: "harness-evolver-architect",
|
|
401
409
|
description: "Architect: analyze topology after {stagnation/regression}",
|
|
402
410
|
prompt: |
|
|
403
|
-
<agent_instructions>
|
|
404
|
-
{paste the FULL content of harness-evolver-architect.md here}
|
|
405
|
-
</agent_instructions>
|
|
406
|
-
|
|
407
411
|
<objective>
|
|
408
412
|
The evolution loop has {stagnated/regressed} after {iterations} iterations (best: {best_score}).
|
|
409
413
|
Analyze the harness architecture and recommend a topology change.
|
package/skills/init/SKILL.md
CHANGED
|
@@ -36,9 +36,42 @@ Three artifacts needed. For each — use existing if found, create if not.
|
|
|
36
36
|
|
|
37
37
|
**Harness** (`harness.py`): If user's entry point doesn't match our CLI interface (`--input`, `--output`, `--traces-dir`, `--config`), create a thin wrapper that imports their code. Read their entry point first to understand the I/O format. Ask if unsure.
|
|
38
38
|
|
|
39
|
-
**Eval** (`eval.py`):
|
|
40
|
-
|
|
41
|
-
|
|
39
|
+
**Eval** (`eval.py`): If an eval script exists, use it.
|
|
40
|
+
|
|
41
|
+
If NO eval exists:
|
|
42
|
+
- Copy `eval_passthrough.py` from `$TOOLS/eval_passthrough.py` as the project's eval.py:
|
|
43
|
+
```bash
|
|
44
|
+
cp $TOOLS/eval_passthrough.py eval.py
|
|
45
|
+
```
|
|
46
|
+
- This passthrough eval collects outputs for the judge subagent to score during evolve.
|
|
47
|
+
- Print: "No eval found. Using LLM-as-judge (Claude Code scores outputs directly)."
|
|
48
|
+
|
|
49
|
+
**Tasks** (`tasks/`): If test tasks exist, use them.
|
|
50
|
+
|
|
51
|
+
If NO tasks exist:
|
|
52
|
+
- Spawn testgen subagent with `subagent_type: "harness-evolver-testgen"`:
|
|
53
|
+
```
|
|
54
|
+
Agent(
|
|
55
|
+
subagent_type: "harness-evolver-testgen",
|
|
56
|
+
description: "TestGen: generate test cases for this project",
|
|
57
|
+
prompt: |
|
|
58
|
+
<objective>
|
|
59
|
+
Generate 30 diverse test cases for this project. Write them to tasks/ directory.
|
|
60
|
+
</objective>
|
|
61
|
+
|
|
62
|
+
<files_to_read>
|
|
63
|
+
- {harness source file path}
|
|
64
|
+
- {any data files found in the project}
|
|
65
|
+
</files_to_read>
|
|
66
|
+
|
|
67
|
+
<output>
|
|
68
|
+
Create tasks/ directory with task_001.json through task_030.json.
|
|
69
|
+
No expected field needed (judge subagent will score outputs).
|
|
70
|
+
</output>
|
|
71
|
+
)
|
|
72
|
+
```
|
|
73
|
+
- Wait for `## TESTGEN COMPLETE`.
|
|
74
|
+
- Print: "Generated {N} test cases from code analysis."
|
|
42
75
|
|
|
43
76
|
## Phase 3: Run Init
|
|
44
77
|
|
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""LLM-as-judge evaluation script for Harness Evolver.
|
|
3
|
+
|
|
4
|
+
Scores harness outputs using an LLM judge across multiple quality dimensions:
|
|
5
|
+
accuracy, completeness, relevance, no_hallucination.
|
|
6
|
+
|
|
7
|
+
CLI interface matches existing evals: --results-dir, --tasks-dir, --scores.
|
|
8
|
+
Stdlib-only. No external dependencies.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import argparse
|
|
12
|
+
import json
|
|
13
|
+
import os
|
|
14
|
+
import re
|
|
15
|
+
import sys
|
|
16
|
+
|
|
17
|
+
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
|
18
|
+
from llm_api import detect_provider, call_llm
|
|
19
|
+
|
|
20
|
+
# Quality dimensions the judge scores, in canonical reporting order.
DIMENSIONS = ["accuracy", "completeness", "relevance", "no_hallucination"]

# Weight of each dimension in the combined score; accuracy dominates,
# and the weights sum to 1.0.
WEIGHTS = {
    "accuracy": 0.4,
    "completeness": 0.2,
    "relevance": 0.2,
    "no_hallucination": 0.2,
}
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def build_judge_prompt(task, result):
    """Assemble the judge prompt for one task/output pair.

    The prompt shows the task input and the harness output, optionally a
    reference answer (when the task carries an ``expected`` field), and
    the 1-5 scoring rubric with the exact JSON shape the judge must emit.
    """
    header = [
        "You are an expert evaluator. Assess the quality of the following output.",
        "",
        "QUESTION/INPUT:",
        str(task.get("input", "")),
        "",
        "OUTPUT TO EVALUATE:",
        str(result.get("output", "")),
    ]

    # Only include a reference section when the task provides one.
    reference = []
    if "expected" in task:
        reference = [
            "",
            "REFERENCE ANSWER:",
            str(task["expected"]),
        ]

    rubric = [
        "",
        "Score each dimension from 1 (worst) to 5 (best):",
        "- accuracy: Is the output factually correct and properly addresses the input?",
        "- completeness: Does it cover all relevant aspects?",
        "- relevance: Is it focused and on-topic?",
        "- no_hallucination: Does it avoid fabricating information not supported by context?",
        "",
        "Think step by step, then respond with ONLY this JSON:",
        '{"reasoning": "your analysis", "accuracy": N, "completeness": N, "relevance": N, "no_hallucination": N}',
    ]

    return "\n".join(header + reference + rubric)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def extract_json_scores(response):
    """Extract the scoring JSON object from an LLM judge response.

    Handles bare JSON, markdown-fenced JSON, and JSON embedded in
    surrounding prose — including objects whose string values (e.g.
    ``reasoning``) contain brace characters, which the previous
    brace-matching regexes could not parse.

    Returns the parsed dict when it contains an ``accuracy`` key,
    otherwise None. Non-dict JSON (e.g. a list containing the string
    "accuracy") is rejected.
    """
    # Cheap candidates first: the whole (stripped) response, then the
    # contents of a ```json fenced block if present.
    candidates = [response.strip()]
    fence_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', response, re.DOTALL)
    if fence_match:
        candidates.append(fence_match.group(1))

    for text in candidates:
        try:
            data = json.loads(text)
        except (json.JSONDecodeError, ValueError):
            continue
        if isinstance(data, dict) and "accuracy" in data:
            return data

    # Robust fallback: try to decode a JSON object starting at each '{'.
    # raw_decode tolerates trailing prose and braces inside string values.
    decoder = json.JSONDecoder()
    for brace in re.finditer(r'\{', response):
        try:
            data, _ = decoder.raw_decode(response, brace.start())
        except (json.JSONDecodeError, ValueError):
            continue
        if isinstance(data, dict) and "accuracy" in data:
            return data

    return None
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def normalize_score(raw_score):
    """Map a 1-5 judge score onto the 0.0-1.0 range.

    Out-of-range values are clamped to the nearest bound first, so
    1 -> 0.0, 3 -> 0.5, 5 -> 1.0.
    """
    value = int(raw_score)
    if value < 1:
        value = 1
    elif value > 5:
        value = 5
    return (value - 1) / 4.0
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def compute_combined_score(scores_dict):
    """Return the weighted sum of the normalized dimension scores.

    Missing dimensions contribute 0.0; weights come from the
    module-level WEIGHTS table.
    """
    return sum(scores_dict.get(dim, 0.0) * WEIGHTS[dim] for dim in DIMENSIONS)
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def evaluate_task(provider, api_key, model, task, result):
    """Score one task's output with the LLM judge.

    Returns a per-task dict with the weighted combined score (0.0-1.0),
    the raw 1-5 value for each dimension, and the judge's reasoning.
    If the LLM call fails or its reply cannot be parsed, every dimension
    falls back to 1, the score to 0.0, and an "error" field is set.
    """
    prompt = build_judge_prompt(task, result)

    try:
        response = call_llm(provider, api_key, model, prompt, max_tokens=2048)
    except Exception as e:
        # Provider/transport failure — report the worst possible score.
        return {
            "score": 0.0,
            "accuracy": 1, "completeness": 1, "relevance": 1, "no_hallucination": 1,
            "reasoning": f"LLM call failed: {e}",
            "error": str(e),
        }

    parsed = extract_json_scores(response)
    if parsed is None:
        # Unparseable reply — worst score, with a snippet for debugging.
        return {
            "score": 0.0,
            "accuracy": 1, "completeness": 1, "relevance": 1, "no_hallucination": 1,
            "reasoning": f"Failed to parse judge response: {response[:200]}",
            "error": "parse_failed",
        }

    # Raw 1-5 values (missing dimensions default to 1) and their
    # normalized copies for the weighted combination.
    raw = {dim: parsed.get(dim, 1) for dim in DIMENSIONS}
    normalized = {dim: normalize_score(raw[dim]) for dim in DIMENSIONS}

    scored = {"score": round(compute_combined_score(normalized), 4)}
    scored.update(raw)
    scored["reasoning"] = parsed.get("reasoning", "")
    return scored
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def main():
    """CLI entry point: judge every task's output and write scores JSON.

    Reads each task JSON from --tasks-dir, pairs it with the harness
    output of the same filename in --results-dir, scores it via
    evaluate_task, and writes the aggregate scores file to --scores.
    Exits 1 when the tasks directory holds no .json files.
    """
    parser = argparse.ArgumentParser(description="LLM-as-judge evaluation")
    parser.add_argument("--results-dir", required=True,
                        help="Directory with harness output JSON files")
    parser.add_argument("--tasks-dir", required=True,
                        help="Directory with task JSON files")
    parser.add_argument("--scores", required=True,
                        help="Output path for scores JSON")
    args = parser.parse_args()

    # Pick whichever LLM provider the environment exposes.
    provider, api_key, model = detect_provider()

    task_files = sorted(f for f in os.listdir(args.tasks_dir) if f.endswith(".json"))
    if not task_files:
        print(f"FAIL: no .json task files in {args.tasks_dir}", file=sys.stderr)
        sys.exit(1)

    per_task = {}
    evaluated = []  # one entry per task FILE, even if task ids collide
    for fname in task_files:
        with open(os.path.join(args.tasks_dir, fname)) as fh:
            task = json.load(fh)
        task_id = task["id"]

        # A missing output file is scored as an empty (failed) answer.
        result_path = os.path.join(args.results_dir, fname)
        if os.path.exists(result_path):
            with open(result_path) as fh:
                result = json.load(fh)
        else:
            result = {"id": task_id, "output": "", "error": "no output file"}

        task_scores = evaluate_task(provider, api_key, model, task, result)
        per_task[task_id] = task_scores
        evaluated.append(task_scores)

    # Aggregate: mean combined score plus per-dimension normalized means.
    total_tasks = len(evaluated)
    if total_tasks > 0:
        combined_score = round(
            sum(s["score"] for s in evaluated) / total_tasks, 4)
        avg_dimensions = {
            dim: round(
                sum(normalize_score(s[dim]) for s in evaluated) / total_tasks, 4)
            for dim in DIMENSIONS
        }
    else:
        combined_score = 0.0
        avg_dimensions = {dim: 0.0 for dim in DIMENSIONS}

    scores = {
        "combined_score": combined_score,
        "eval_type": "llm-judge",
        "judge_provider": provider,
        "judge_model": model,
        "dimensions": avg_dimensions,
        "weights": WEIGHTS,
        "total_tasks": total_tasks,
        "per_task": per_task,
    }

    # Make sure the destination directory exists, then persist the scores.
    os.makedirs(os.path.dirname(os.path.abspath(args.scores)), exist_ok=True)
    with open(args.scores, "w") as fh:
        json.dump(scores, fh, indent=2)

    print(f"LLM judge evaluation complete. combined_score: {combined_score} "
          f"({total_tasks} tasks, provider: {provider}/{model})")


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Passthrough eval — collects outputs for judge subagent scoring.
|
|
3
|
+
|
|
4
|
+
When no custom eval.py exists, this is used as the default. It does NOT score
|
|
5
|
+
outputs — it collects them and marks them for the judge subagent to evaluate.
|
|
6
|
+
The evolve skill detects eval_type=pending-judge and spawns the judge agent.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import argparse
|
|
10
|
+
import json
|
|
11
|
+
import os
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def main():
    """Collect harness outputs into a pending-judge scores file.

    Pairs each task JSON in --tasks-dir with the same-named result file in
    --results-dir and writes a scores.json (to --scores) in which every task
    has score=-1. The evolve skill detects eval_type=pending-judge and spawns
    the judge subagent to fill in real scores.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--results-dir", required=True)
    parser.add_argument("--tasks-dir", required=True)
    parser.add_argument("--scores", required=True)
    args = parser.parse_args()

    per_task = {}
    for fname in sorted(os.listdir(args.tasks_dir)):
        if not fname.endswith(".json"):
            continue
        with open(os.path.join(args.tasks_dir, fname)) as f:
            task = json.load(f)
        task_id = task["id"]

        # A missing result file means the harness produced no output for this
        # task; record an empty string so the judge can score it accordingly.
        result_path = os.path.join(args.results_dir, fname)
        output = ""
        if os.path.exists(result_path):
            with open(result_path) as f:
                result = json.load(f)
            output = str(result.get("output", ""))

        per_task[task_id] = {
            "score": -1,  # sentinel: not yet scored; the judge replaces this
            # Truncated to 500 chars — presumably to bound the judge prompt
            # size; TODO confirm the judge re-reads full outputs when needed.
            "input": str(task.get("input", ""))[:500],
            "output": output[:500],
        }

    scores = {
        "combined_score": -1,
        "eval_type": "pending-judge",  # signals the evolve skill to spawn the judge
        "total_tasks": len(per_task),
        "per_task": per_task,
    }
    # Ensure the destination directory exists before writing, matching the
    # behavior of eval_llm_judge.py (which calls os.makedirs the same way).
    os.makedirs(os.path.dirname(os.path.abspath(args.scores)), exist_ok=True)
    with open(args.scores, "w") as f:
        json.dump(scores, f, indent=2)

    print(f"Collected {len(per_task)} task outputs for judge scoring.")


if __name__ == "__main__":
    main()
|
package/tools/llm_api.py
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Shared LLM API calling utility. Stdlib-only (urllib).
|
|
3
|
+
|
|
4
|
+
Auto-detects the best available provider from environment variables.
|
|
5
|
+
Supports: Gemini, OpenAI, Anthropic, OpenRouter.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import json
|
|
9
|
+
import os
|
|
10
|
+
import time
|
|
11
|
+
from urllib.request import Request, urlopen
|
|
12
|
+
from urllib.error import HTTPError
|
|
13
|
+
|
|
14
|
+
# Ordered preference list of (env var, provider name, default model).
# The first environment variable found set and non-empty wins.
PROVIDER_PRIORITY = [
    ("GEMINI_API_KEY", "gemini", "gemini-2.5-flash"),
    ("GOOGLE_API_KEY", "gemini", "gemini-2.5-flash"),
    ("OPENROUTER_API_KEY", "openrouter", "google/gemini-2.5-flash"),
    ("OPENAI_API_KEY", "openai", "gpt-4o-mini"),
    ("ANTHROPIC_API_KEY", "anthropic", "claude-haiku-4-5-20251001"),
]


def detect_provider():
    """Auto-detect the best available LLM provider from environment variables.

    Returns:
        A ``(provider_name, api_key, model)`` tuple for the first entry in
        PROVIDER_PRIORITY whose environment variable is set and non-empty.

    Raises:
        RuntimeError: when none of the known API-key variables are set.
    """
    match = next(
        (
            (provider, os.environ[env_var], model)
            for env_var, provider, model in PROVIDER_PRIORITY
            if os.environ.get(env_var)
        ),
        None,
    )
    if match is not None:
        return match
    raise RuntimeError(
        "No LLM API key found. Set one of: " +
        ", ".join(e for e, _, _ in PROVIDER_PRIORITY)
    )
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def call_llm(provider, api_key, model, prompt, max_tokens=4096, temperature=0.0):
|
|
37
|
+
"""Call LLM API via urllib. Returns response text. Retries 3x with backoff."""
|
|
38
|
+
for attempt in range(3):
|
|
39
|
+
try:
|
|
40
|
+
if provider == "gemini":
|
|
41
|
+
return _call_gemini(api_key, model, prompt, max_tokens, temperature)
|
|
42
|
+
elif provider == "openai":
|
|
43
|
+
return _call_openai(api_key, model, prompt, max_tokens, temperature)
|
|
44
|
+
elif provider == "anthropic":
|
|
45
|
+
return _call_anthropic(api_key, model, prompt, max_tokens, temperature)
|
|
46
|
+
elif provider == "openrouter":
|
|
47
|
+
return _call_openrouter(api_key, model, prompt, max_tokens, temperature)
|
|
48
|
+
else:
|
|
49
|
+
raise ValueError(f"Unknown provider: {provider}")
|
|
50
|
+
except ValueError:
|
|
51
|
+
raise
|
|
52
|
+
except Exception as e:
|
|
53
|
+
if attempt == 2:
|
|
54
|
+
raise
|
|
55
|
+
time.sleep(2 ** attempt)
|
|
56
|
+
raise RuntimeError("All retries failed")
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _call_gemini(api_key, model, prompt, max_tokens, temperature):
    """POST a generateContent request to the Gemini API; return the reply text."""
    endpoint = (
        f"https://generativelanguage.googleapis.com/v1beta/models/"
        f"{model}:generateContent?key={api_key}"
    )
    payload = {
        "contents": [{"parts": [{"text": prompt}]}],
        "generationConfig": {
            "maxOutputTokens": max_tokens,
            # Gemini rejects negative temperatures; clamp at 0.0.
            "temperature": max(temperature, 0.0),
        },
    }
    request = Request(
        endpoint,
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json"},
    )
    with urlopen(request, timeout=60) as resp:
        parsed = json.loads(resp.read())
    return parsed["candidates"][0]["content"]["parts"][0]["text"]
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _call_openai(api_key, model, prompt, max_tokens, temperature):
    """POST a chat-completions request to the OpenAI API; return the reply text."""
    payload = {
        "model": model,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "messages": [{"role": "user", "content": prompt}],
    }
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}",
    }
    request = Request(
        "https://api.openai.com/v1/chat/completions",
        data=json.dumps(payload).encode(),
        headers=headers,
    )
    with urlopen(request, timeout=60) as resp:
        parsed = json.loads(resp.read())
    return parsed["choices"][0]["message"]["content"]
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _call_anthropic(api_key, model, prompt, max_tokens, temperature):
    """POST a Messages API request to Anthropic; return the reply text."""
    url = "https://api.anthropic.com/v1/messages"
    # Bug fix: `temperature` was accepted but silently dropped from the
    # request body, so calls ran at Anthropic's server-side default instead
    # of the requested value (the judge expects deterministic 0.0). Pass it
    # through like the other three providers do.
    body = json.dumps({
        "model": model,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "messages": [{"role": "user", "content": prompt}],
    }).encode()
    req = Request(url, data=body, headers={
        "Content-Type": "application/json",
        "x-api-key": api_key,
        "anthropic-version": "2023-06-01",
    })
    with urlopen(req, timeout=60) as resp:
        data = json.loads(resp.read())
    return data["content"][0]["text"]
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def _call_openrouter(api_key, model, prompt, max_tokens, temperature):
    """POST a chat-completions request to OpenRouter; return the reply text."""
    payload = {
        "model": model,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "messages": [{"role": "user", "content": prompt}],
    }
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}",
    }
    request = Request(
        "https://openrouter.ai/api/v1/chat/completions",
        data=json.dumps(payload).encode(),
        headers=headers,
    )
    with urlopen(request, timeout=60) as resp:
        parsed = json.loads(resp.read())
    return parsed["choices"][0]["message"]["content"]
|