cawdex 1.35.74 → 1.35.76
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -5
- package/bin/anycode.js +2 -2
- package/bin/cawdex.js +408 -408
- package/bin/ecc-hooks.cjs +11 -11
- package/dist/agents-md.d.ts +31 -0
- package/dist/agents-md.js +340 -0
- package/dist/agents-md.js.map +1 -0
- package/dist/agents.js +1424 -1424
- package/dist/api.d.ts +1 -0
- package/dist/api.js +19 -14
- package/dist/api.js.map +1 -1
- package/dist/autonomous-loops.js +287 -287
- package/dist/benchmark-repos.d.ts +31 -0
- package/dist/benchmark-repos.js +234 -8
- package/dist/benchmark-repos.js.map +1 -1
- package/dist/command-palette.js +4 -2
- package/dist/command-palette.js.map +1 -1
- package/dist/compaction.js +8 -8
- package/dist/config.js +51 -36
- package/dist/config.js.map +1 -1
- package/dist/content-engine.js +543 -543
- package/dist/context-brief.d.ts +4 -0
- package/dist/context-brief.js +230 -0
- package/dist/context-brief.js.map +1 -0
- package/dist/cost-tracker.d.ts +33 -14
- package/dist/cost-tracker.js +81 -19
- package/dist/cost-tracker.js.map +1 -1
- package/dist/coverage.js +39 -39
- package/dist/docs-sync.js +98 -98
- package/dist/evaluation.js +452 -452
- package/dist/fixed-footer.d.ts +7 -1
- package/dist/fixed-footer.js +92 -18
- package/dist/fixed-footer.js.map +1 -1
- package/dist/git-workflow.js +49 -49
- package/dist/index.d.ts +2 -0
- package/dist/index.js +197 -65
- package/dist/index.js.map +1 -1
- package/dist/instant-artifact.d.ts +6 -0
- package/dist/instant-artifact.js +397 -0
- package/dist/instant-artifact.js.map +1 -0
- package/dist/live-queue.js +1 -1
- package/dist/live-queue.js.map +1 -1
- package/dist/model-aliases.d.ts +37 -0
- package/dist/model-aliases.js +203 -0
- package/dist/model-aliases.js.map +1 -0
- package/dist/orchestration.js +15 -15
- package/dist/permissions.d.ts +6 -0
- package/dist/permissions.js +53 -0
- package/dist/permissions.js.map +1 -1
- package/dist/pm2-manager.js +26 -26
- package/dist/query.d.ts +0 -1
- package/dist/query.js +74 -39
- package/dist/query.js.map +1 -1
- package/dist/refactor.js +87 -87
- package/dist/repo-command.js +7 -1
- package/dist/repo-command.js.map +1 -1
- package/dist/search-first.js +92 -92
- package/dist/skill-create.js +100 -100
- package/dist/stitch.js +1 -1
- package/dist/system-prompt.d.ts +2 -1
- package/dist/system-prompt.js +10 -5
- package/dist/system-prompt.js.map +1 -1
- package/dist/tools/github-repo-digest.d.ts +1 -1
- package/dist/tools/github-repo-digest.js +38 -6
- package/dist/tools/github-repo-digest.js.map +1 -1
- package/dist/types.d.ts +3 -0
- package/dist/types.js.map +1 -1
- package/dist/verification.js +55 -55
- package/package.json +1 -1
- package/resources/__init__.py +1 -1
- package/resources/exgentic/cawdex_agent/README.md +114 -114
- package/resources/exgentic/cawdex_agent/__init__.py +5 -5
- package/resources/exgentic/cawdex_agent/agent.py +605 -605
- package/resources/exgentic/cawdex_agent/requirements.txt +2 -2
- package/resources/exgentic/cawdex_agent/setup.sh +21 -21
- package/resources/exgentic/cawdex_agent/utils.py +1061 -1061
- package/resources/hal/cawdex_agent/README.md +24 -24
- package/resources/hal/cawdex_agent/__init__.py +1 -1
- package/resources/hal/cawdex_agent/main.py +550 -550
- package/resources/hal/cawdex_agent/requirements.txt +2 -2
- package/resources/kbench/cawdex_agent/README.md +107 -107
- package/resources/kbench/cawdex_agent/adapter.manifest.json +19 -19
- package/resources/kbench/cawdex_agent/runner.mjs +753 -753
- package/resources/open_agent_leaderboard/cawdex-agent-card.md +119 -119
- package/resources/terminal_bench/__init__.py +1 -1
- package/resources/terminal_bench/cawdex_agent.py +174 -174
- package/resources/terminal_bench/setup.sh +121 -121
package/dist/evaluation.js
CHANGED
|
@@ -17,209 +17,209 @@ export function buildReviewPrompt(cwd, target) {
|
|
|
17
17
|
: gitDiff(cwd, false) || gitDiff(cwd, true);
|
|
18
18
|
if (!diff)
|
|
19
19
|
return null;
|
|
20
|
-
return `Perform a thorough code review of the following changes.
|
|
21
|
-
|
|
22
|
-
\`\`\`diff
|
|
23
|
-
${diff.slice(0, 15000)}
|
|
24
|
-
\`\`\`
|
|
25
|
-
|
|
26
|
-
Review for:
|
|
27
|
-
1. **Correctness** — Logic errors, edge cases, off-by-one
|
|
28
|
-
2. **Security** — Injection, XSS, secrets, path traversal, auth issues
|
|
29
|
-
3. **Performance** — N+1 queries, unbounded loops, memory leaks
|
|
30
|
-
4. **Maintainability** — Naming, complexity, dead code, duplication
|
|
31
|
-
5. **Testing** — Are tests adequate? What's missing?
|
|
32
|
-
|
|
33
|
-
For each issue found, specify:
|
|
34
|
-
- File and line number
|
|
35
|
-
- Severity: CRITICAL / HIGH / MEDIUM / LOW / NIT
|
|
36
|
-
- What's wrong and how to fix it
|
|
37
|
-
|
|
20
|
+
return `Perform a thorough code review of the following changes.
|
|
21
|
+
|
|
22
|
+
\`\`\`diff
|
|
23
|
+
${diff.slice(0, 15000)}
|
|
24
|
+
\`\`\`
|
|
25
|
+
|
|
26
|
+
Review for:
|
|
27
|
+
1. **Correctness** — Logic errors, edge cases, off-by-one
|
|
28
|
+
2. **Security** — Injection, XSS, secrets, path traversal, auth issues
|
|
29
|
+
3. **Performance** — N+1 queries, unbounded loops, memory leaks
|
|
30
|
+
4. **Maintainability** — Naming, complexity, dead code, duplication
|
|
31
|
+
5. **Testing** — Are tests adequate? What's missing?
|
|
32
|
+
|
|
33
|
+
For each issue found, specify:
|
|
34
|
+
- File and line number
|
|
35
|
+
- Severity: CRITICAL / HIGH / MEDIUM / LOW / NIT
|
|
36
|
+
- What's wrong and how to fix it
|
|
37
|
+
|
|
38
38
|
End with an overall verdict: APPROVE / REQUEST CHANGES / NEEDS DISCUSSION`;
|
|
39
39
|
}
|
|
40
40
|
// ── TDD Prompt ────────────────────────────────────────────
|
|
41
41
|
export function buildTDDPrompt(description) {
|
|
42
|
-
return `I want to implement: ${description}
|
|
43
|
-
|
|
44
|
-
Follow strict TDD methodology:
|
|
45
|
-
|
|
46
|
-
**Step 1 — RED**: Write a failing test first.
|
|
47
|
-
- Use the project's existing test framework (detect from package.json, pytest.ini, etc.)
|
|
48
|
-
- The test should define the expected behavior clearly
|
|
49
|
-
- Run the test and show it fails
|
|
50
|
-
|
|
51
|
-
**Step 2 — GREEN**: Write the minimal code to make the test pass.
|
|
52
|
-
- Only write enough code to pass the test
|
|
53
|
-
- No extra functionality
|
|
54
|
-
- Run the test and show it passes
|
|
55
|
-
|
|
56
|
-
**Step 3 — REFACTOR**: Clean up if needed.
|
|
57
|
-
- Remove duplication
|
|
58
|
-
- Improve naming
|
|
59
|
-
- Ensure tests still pass
|
|
60
|
-
|
|
42
|
+
return `I want to implement: ${description}
|
|
43
|
+
|
|
44
|
+
Follow strict TDD methodology:
|
|
45
|
+
|
|
46
|
+
**Step 1 — RED**: Write a failing test first.
|
|
47
|
+
- Use the project's existing test framework (detect from package.json, pytest.ini, etc.)
|
|
48
|
+
- The test should define the expected behavior clearly
|
|
49
|
+
- Run the test and show it fails
|
|
50
|
+
|
|
51
|
+
**Step 2 — GREEN**: Write the minimal code to make the test pass.
|
|
52
|
+
- Only write enough code to pass the test
|
|
53
|
+
- No extra functionality
|
|
54
|
+
- Run the test and show it passes
|
|
55
|
+
|
|
56
|
+
**Step 3 — REFACTOR**: Clean up if needed.
|
|
57
|
+
- Remove duplication
|
|
58
|
+
- Improve naming
|
|
59
|
+
- Ensure tests still pass
|
|
60
|
+
|
|
61
61
|
Repeat for each behavior. Show test output after each step.`;
|
|
62
62
|
}
|
|
63
63
|
// ── Security Review Prompt ────────────────────────────────
|
|
64
64
|
export function buildSecurityReviewPrompt(cwd) {
|
|
65
|
-
return `Perform a security review of this project at ${cwd}.
|
|
66
|
-
|
|
67
|
-
Check for:
|
|
68
|
-
1. **Injection vulnerabilities** — SQL, command, XSS, template
|
|
69
|
-
2. **Authentication/Authorization** — Missing auth checks, weak sessions
|
|
70
|
-
3. **Secrets management** — Hardcoded credentials, API keys in code
|
|
71
|
-
4. **Input validation** — Missing validation, type coercion issues
|
|
72
|
-
5. **Dependency vulnerabilities** — Check package.json/requirements.txt for known CVEs
|
|
73
|
-
6. **File system** — Path traversal, unsafe file operations
|
|
74
|
-
7. **Network** — HTTPS enforcement, CORS misconfiguration
|
|
75
|
-
8. **Cryptography** — Weak algorithms, improper key management
|
|
76
|
-
|
|
77
|
-
For each finding:
|
|
78
|
-
- Severity: CRITICAL / HIGH / MEDIUM / LOW
|
|
79
|
-
- Location: file:line
|
|
80
|
-
- Description and remediation
|
|
81
|
-
|
|
65
|
+
return `Perform a security review of this project at ${cwd}.
|
|
66
|
+
|
|
67
|
+
Check for:
|
|
68
|
+
1. **Injection vulnerabilities** — SQL, command, XSS, template
|
|
69
|
+
2. **Authentication/Authorization** — Missing auth checks, weak sessions
|
|
70
|
+
3. **Secrets management** — Hardcoded credentials, API keys in code
|
|
71
|
+
4. **Input validation** — Missing validation, type coercion issues
|
|
72
|
+
5. **Dependency vulnerabilities** — Check package.json/requirements.txt for known CVEs
|
|
73
|
+
6. **File system** — Path traversal, unsafe file operations
|
|
74
|
+
7. **Network** — HTTPS enforcement, CORS misconfiguration
|
|
75
|
+
8. **Cryptography** — Weak algorithms, improper key management
|
|
76
|
+
|
|
77
|
+
For each finding:
|
|
78
|
+
- Severity: CRITICAL / HIGH / MEDIUM / LOW
|
|
79
|
+
- Location: file:line
|
|
80
|
+
- Description and remediation
|
|
81
|
+
|
|
82
82
|
Start by scanning the project structure, then read key files.`;
|
|
83
83
|
}
|
|
84
84
|
// ── Implementation Plan Prompt ────────────────────────────
|
|
85
85
|
export function buildPlanPrompt(task, cwd) {
|
|
86
|
-
return `Task: ${task}
|
|
87
|
-
|
|
88
|
-
Working directory: ${cwd}
|
|
89
|
-
|
|
90
|
-
Create a detailed implementation plan:
|
|
91
|
-
|
|
92
|
-
**Phase Breakdown**:
|
|
93
|
-
For each phase, include:
|
|
94
|
-
- Phase number and name
|
|
95
|
-
- Dependencies (what phases must complete first)
|
|
96
|
-
- Files to create or modify (with paths)
|
|
97
|
-
- Estimated complexity (1-5, where 5 is most complex)
|
|
98
|
-
- Key implementation details
|
|
99
|
-
|
|
100
|
-
**File Inventory**:
|
|
101
|
-
List each file that will be created or modified with:
|
|
102
|
-
- Full file path (relative to ${cwd})
|
|
103
|
-
- Purpose/reason for change
|
|
104
|
-
- Rough line count estimate if applicable
|
|
105
|
-
|
|
106
|
-
**Edge Cases & Testing**:
|
|
107
|
-
- Identify 3-5 edge cases to handle
|
|
108
|
-
- Suggest test scenarios for each phase
|
|
109
|
-
- Note any integration points with existing code
|
|
110
|
-
|
|
111
|
-
**Implementation Order**:
|
|
112
|
-
Number your phases in dependency order so each can be built on previous ones.
|
|
113
|
-
|
|
86
|
+
return `Task: ${task}
|
|
87
|
+
|
|
88
|
+
Working directory: ${cwd}
|
|
89
|
+
|
|
90
|
+
Create a detailed implementation plan:
|
|
91
|
+
|
|
92
|
+
**Phase Breakdown**:
|
|
93
|
+
For each phase, include:
|
|
94
|
+
- Phase number and name
|
|
95
|
+
- Dependencies (what phases must complete first)
|
|
96
|
+
- Files to create or modify (with paths)
|
|
97
|
+
- Estimated complexity (1-5, where 5 is most complex)
|
|
98
|
+
- Key implementation details
|
|
99
|
+
|
|
100
|
+
**File Inventory**:
|
|
101
|
+
List each file that will be created or modified with:
|
|
102
|
+
- Full file path (relative to ${cwd})
|
|
103
|
+
- Purpose/reason for change
|
|
104
|
+
- Rough line count estimate if applicable
|
|
105
|
+
|
|
106
|
+
**Edge Cases & Testing**:
|
|
107
|
+
- Identify 3-5 edge cases to handle
|
|
108
|
+
- Suggest test scenarios for each phase
|
|
109
|
+
- Note any integration points with existing code
|
|
110
|
+
|
|
111
|
+
**Implementation Order**:
|
|
112
|
+
Number your phases in dependency order so each can be built on previous ones.
|
|
113
|
+
|
|
114
114
|
Output as a numbered list with clear formatting.`;
|
|
115
115
|
}
|
|
116
116
|
// ── E2E Test Generation Prompt ─────────────────────────────
|
|
117
117
|
export function buildE2EPrompt(target, cwd) {
|
|
118
118
|
const framework = detectE2EFramework(cwd);
|
|
119
|
-
return `Generate end-to-end tests for: ${target}
|
|
120
|
-
|
|
121
|
-
Framework: ${framework}
|
|
122
|
-
Working directory: ${cwd}
|
|
123
|
-
|
|
124
|
-
**Testing Strategy**:
|
|
125
|
-
|
|
126
|
-
1. **Page Object Model Pattern**:
|
|
127
|
-
- Create page objects for main UI components
|
|
128
|
-
- Encapsulate selectors and actions
|
|
129
|
-
- Reuse across test scenarios
|
|
130
|
-
|
|
131
|
-
2. **Test Scenarios**:
|
|
132
|
-
- **Happy Path**: Normal user workflow for the feature
|
|
133
|
-
- **Error States**: Invalid inputs, API failures, network errors
|
|
134
|
-
- **Edge Cases**: Boundary conditions, race conditions, permission checks
|
|
135
|
-
- **State Transitions**: Moving between different feature states
|
|
136
|
-
|
|
137
|
-
3. **Test Structure**:
|
|
138
|
-
- Setup: Browser initialization, test data creation
|
|
139
|
-
- Teardown: Cleanup, screenshot on failure
|
|
140
|
-
- Assertions: Verify UI state, API responses, database changes
|
|
141
|
-
|
|
142
|
-
4. **Coverage Areas**:
|
|
143
|
-
- User interactions (click, type, select, submit)
|
|
144
|
-
- Validation messages and error handling
|
|
145
|
-
- Navigation and routing
|
|
146
|
-
- Asynchronous operations (loading states, waits)
|
|
147
|
-
|
|
119
|
+
return `Generate end-to-end tests for: ${target}
|
|
120
|
+
|
|
121
|
+
Framework: ${framework}
|
|
122
|
+
Working directory: ${cwd}
|
|
123
|
+
|
|
124
|
+
**Testing Strategy**:
|
|
125
|
+
|
|
126
|
+
1. **Page Object Model Pattern**:
|
|
127
|
+
- Create page objects for main UI components
|
|
128
|
+
- Encapsulate selectors and actions
|
|
129
|
+
- Reuse across test scenarios
|
|
130
|
+
|
|
131
|
+
2. **Test Scenarios**:
|
|
132
|
+
- **Happy Path**: Normal user workflow for the feature
|
|
133
|
+
- **Error States**: Invalid inputs, API failures, network errors
|
|
134
|
+
- **Edge Cases**: Boundary conditions, race conditions, permission checks
|
|
135
|
+
- **State Transitions**: Moving between different feature states
|
|
136
|
+
|
|
137
|
+
3. **Test Structure**:
|
|
138
|
+
- Setup: Browser initialization, test data creation
|
|
139
|
+
- Teardown: Cleanup, screenshot on failure
|
|
140
|
+
- Assertions: Verify UI state, API responses, database changes
|
|
141
|
+
|
|
142
|
+
4. **Coverage Areas**:
|
|
143
|
+
- User interactions (click, type, select, submit)
|
|
144
|
+
- Validation messages and error handling
|
|
145
|
+
- Navigation and routing
|
|
146
|
+
- Asynchronous operations (loading states, waits)
|
|
147
|
+
|
|
148
148
|
Generate complete, runnable test code with proper error handling and waits.`;
|
|
149
149
|
}
|
|
150
150
|
// ── Build Error Fix Prompt ─────────────────────────────────
|
|
151
151
|
export function buildBuildFixPrompt(cwd, errorOutput) {
|
|
152
152
|
const strategy = errorOutput
|
|
153
|
-
? `The following build errors were captured:
|
|
154
|
-
|
|
155
|
-
\`\`\`
|
|
156
|
-
${errorOutput.slice(0, 5000)}
|
|
157
|
-
\`\`\`
|
|
158
|
-
|
|
153
|
+
? `The following build errors were captured:
|
|
154
|
+
|
|
155
|
+
\`\`\`
|
|
156
|
+
${errorOutput.slice(0, 5000)}
|
|
157
|
+
\`\`\`
|
|
158
|
+
|
|
159
159
|
Analyze these errors and fix them one at a time.`
|
|
160
|
-
: `Run the build command in ${cwd} and capture the output.
|
|
161
|
-
If there are errors, analyze them and fix them one at a time.
|
|
160
|
+
: `Run the build command in ${cwd} and capture the output.
|
|
161
|
+
If there are errors, analyze them and fix them one at a time.
|
|
162
162
|
After each fix, re-run the build to verify.`;
|
|
163
|
-
return `${strategy}
|
|
164
|
-
|
|
165
|
-
**Error Resolution Process**:
|
|
166
|
-
|
|
167
|
-
For each error found:
|
|
168
|
-
1. **Parse** the error message
|
|
169
|
-
- Identify the root cause (missing dependency, syntax error, type error, etc.)
|
|
170
|
-
- Extract file path and line number
|
|
171
|
-
- Note the error category
|
|
172
|
-
|
|
173
|
-
2. **Fix** the error
|
|
174
|
-
- Make the minimal change to resolve it
|
|
175
|
-
- Don't over-engineer
|
|
176
|
-
|
|
177
|
-
3. **Verify**
|
|
178
|
-
- Run build/test commands
|
|
179
|
-
- Confirm the error is resolved
|
|
180
|
-
|
|
181
|
-
4. **Repeat** until build succeeds
|
|
182
|
-
|
|
183
|
-
**Supported Error Types**:
|
|
184
|
-
- TypeScript compilation errors
|
|
185
|
-
- Rust build errors
|
|
186
|
-
- Go build errors
|
|
187
|
-
- Java compilation errors
|
|
188
|
-
- Python import/syntax errors
|
|
189
|
-
|
|
163
|
+
return `${strategy}
|
|
164
|
+
|
|
165
|
+
**Error Resolution Process**:
|
|
166
|
+
|
|
167
|
+
For each error found:
|
|
168
|
+
1. **Parse** the error message
|
|
169
|
+
- Identify the root cause (missing dependency, syntax error, type error, etc.)
|
|
170
|
+
- Extract file path and line number
|
|
171
|
+
- Note the error category
|
|
172
|
+
|
|
173
|
+
2. **Fix** the error
|
|
174
|
+
- Make the minimal change to resolve it
|
|
175
|
+
- Don't over-engineer
|
|
176
|
+
|
|
177
|
+
3. **Verify**
|
|
178
|
+
- Run build/test commands
|
|
179
|
+
- Confirm the error is resolved
|
|
180
|
+
|
|
181
|
+
4. **Repeat** until build succeeds
|
|
182
|
+
|
|
183
|
+
**Supported Error Types**:
|
|
184
|
+
- TypeScript compilation errors
|
|
185
|
+
- Rust build errors
|
|
186
|
+
- Go build errors
|
|
187
|
+
- Java compilation errors
|
|
188
|
+
- Python import/syntax errors
|
|
189
|
+
|
|
190
190
|
Be methodical and show your work at each step.`;
|
|
191
191
|
}
|
|
192
192
|
// ── Evaluation Prompt ──────────────────────────────────────
|
|
193
193
|
export function buildEvalPrompt(criteria, target) {
|
|
194
194
|
const targetStr = target ? `Target: ${target}\n` : '';
|
|
195
|
-
return `${targetStr}Evaluate against criterion: ${criteria}
|
|
196
|
-
|
|
197
|
-
**Scoring & Evidence**:
|
|
198
|
-
|
|
199
|
-
For each sub-criterion or aspect:
|
|
200
|
-
1. Score from 1-10 (1=failing, 10=excellent)
|
|
201
|
-
2. Provide specific evidence:
|
|
202
|
-
- Point to code examples, metrics, or test results
|
|
203
|
-
- Include exact file paths and line numbers when relevant
|
|
204
|
-
- Cite measurements (e.g., "Lighthouse score: 95/100")
|
|
205
|
-
3. Explain your reasoning briefly
|
|
206
|
-
|
|
207
|
-
**Output Format**:
|
|
208
|
-
|
|
209
|
-
For each major aspect:
|
|
210
|
-
\`\`\`
|
|
211
|
-
Aspect: [Name]
|
|
212
|
-
Score: [X]/10
|
|
213
|
-
Evidence: [specific findings]
|
|
214
|
-
Recommendation: [actionable improvement]
|
|
215
|
-
\`\`\`
|
|
216
|
-
|
|
217
|
-
**Final Summary**:
|
|
218
|
-
- Overall criterion score (weighted average if multiple aspects)
|
|
219
|
-
- Top 3 strengths
|
|
220
|
-
- Top 3 areas for improvement
|
|
221
|
-
- Specific, actionable next steps
|
|
222
|
-
|
|
195
|
+
return `${targetStr}Evaluate against criterion: ${criteria}
|
|
196
|
+
|
|
197
|
+
**Scoring & Evidence**:
|
|
198
|
+
|
|
199
|
+
For each sub-criterion or aspect:
|
|
200
|
+
1. Score from 1-10 (1=failing, 10=excellent)
|
|
201
|
+
2. Provide specific evidence:
|
|
202
|
+
- Point to code examples, metrics, or test results
|
|
203
|
+
- Include exact file paths and line numbers when relevant
|
|
204
|
+
- Cite measurements (e.g., "Lighthouse score: 95/100")
|
|
205
|
+
3. Explain your reasoning briefly
|
|
206
|
+
|
|
207
|
+
**Output Format**:
|
|
208
|
+
|
|
209
|
+
For each major aspect:
|
|
210
|
+
\`\`\`
|
|
211
|
+
Aspect: [Name]
|
|
212
|
+
Score: [X]/10
|
|
213
|
+
Evidence: [specific findings]
|
|
214
|
+
Recommendation: [actionable improvement]
|
|
215
|
+
\`\`\`
|
|
216
|
+
|
|
217
|
+
**Final Summary**:
|
|
218
|
+
- Overall criterion score (weighted average if multiple aspects)
|
|
219
|
+
- Top 3 strengths
|
|
220
|
+
- Top 3 areas for improvement
|
|
221
|
+
- Specific, actionable next steps
|
|
222
|
+
|
|
223
223
|
Be specific and evidence-based in your evaluation.`;
|
|
224
224
|
}
|
|
225
225
|
const BENCHMARK_ALIASES = {
|
|
@@ -392,236 +392,236 @@ export function splitBenchmarkArgs(args) {
|
|
|
392
392
|
function benchmarkProfileSection(profile) {
|
|
393
393
|
switch (profile) {
|
|
394
394
|
case 'swe-bench':
|
|
395
|
-
return `Profile: SWE-bench / SWE-rebench style repository issue
|
|
396
|
-
- Treat the input as a real GitHub issue against a checkout.
|
|
397
|
-
- Produce a source patch that resolves the stated issue; do not optimize for a prose answer.
|
|
398
|
-
- Do not read, search for, or imitate gold patches, oracle patches, hidden tests, benchmark result files, or prior submitted solutions.
|
|
399
|
-
- Prefer localizing with grep/glob/read_file, then inspect the smallest relevant implementation and tests before editing.
|
|
395
|
+
return `Profile: SWE-bench / SWE-rebench style repository issue
|
|
396
|
+
- Treat the input as a real GitHub issue against a checkout.
|
|
397
|
+
- Produce a source patch that resolves the stated issue; do not optimize for a prose answer.
|
|
398
|
+
- Do not read, search for, or imitate gold patches, oracle patches, hidden tests, benchmark result files, or prior submitted solutions.
|
|
399
|
+
- Prefer localizing with grep/glob/read_file, then inspect the smallest relevant implementation and tests before editing.
|
|
400
400
|
- If a benchmark harness provides fail-to-pass tests, run the narrowest visible tests first and then the harness verifier if available.`;
|
|
401
401
|
case 'terminal-bench':
|
|
402
|
-
return `Profile: Terminal-Bench style terminal task
|
|
403
|
-
- Treat success as the task verifier passing in the sandbox, not as a plausible final explanation.
|
|
404
|
-
- Inspect the task files, environment, scripts, and README before acting.
|
|
405
|
-
- Do not open oracle/reference solution files unless the task explicitly says they are allowed.
|
|
406
|
-
- Use bash for real terminal work, keep services/processes under control, and capture verifier commands and exit status.
|
|
402
|
+
return `Profile: Terminal-Bench style terminal task
|
|
403
|
+
- Treat success as the task verifier passing in the sandbox, not as a plausible final explanation.
|
|
404
|
+
- Inspect the task files, environment, scripts, and README before acting.
|
|
405
|
+
- Do not open oracle/reference solution files unless the task explicitly says they are allowed.
|
|
406
|
+
- Use bash for real terminal work, keep services/processes under control, and capture verifier commands and exit status.
|
|
407
407
|
- If the task has a test script, that script is the completion oracle. Run it before finalizing.`;
|
|
408
408
|
case 'terminalworld':
|
|
409
|
-
return `Profile: TerminalWorld style in-the-wild terminal workflow
|
|
410
|
-
- Treat the task as a reproduced real terminal session: satisfy the outcome-oriented instruction and required artifacts, not a source patch by default.
|
|
411
|
-
- Extract exact output paths, file formats, command names, ports, services, and container assumptions from instruction.md or the task text before acting.
|
|
412
|
-
- Do not read solve.sh or reference-solution material unless the benchmark explicitly permits it; treat it as oracle-only.
|
|
413
|
-
- Prefer real CLI execution over mock/stub shortcuts. For package installs, container builds, network tools, and services, verify the installed command/artifact behaves in the current environment.
|
|
409
|
+
return `Profile: TerminalWorld style in-the-wild terminal workflow
|
|
410
|
+
- Treat the task as a reproduced real terminal session: satisfy the outcome-oriented instruction and required artifacts, not a source patch by default.
|
|
411
|
+
- Extract exact output paths, file formats, command names, ports, services, and container assumptions from instruction.md or the task text before acting.
|
|
412
|
+
- Do not read solve.sh or reference-solution material unless the benchmark explicitly permits it; treat it as oracle-only.
|
|
413
|
+
- Prefer real CLI execution over mock/stub shortcuts. For package installs, container builds, network tools, and services, verify the installed command/artifact behaves in the current environment.
|
|
414
414
|
- Use a state ledger: initial files/services, commands run, artifacts produced, and final verifier or manual artifact checks. Validate persistent artifacts such as files, hashes, structured outputs, processes, or service responses before finalizing.`;
|
|
415
415
|
case 'swe-context':
|
|
416
|
-
return `Profile: SWE-ContextBench style context-learning task
|
|
417
|
-
- Search project/global memory for prior related issues, patches, conventions, and verification commands.
|
|
418
|
-
- Treat recalled memory as a hypothesis, not truth. Re-read current files and verify every reused pattern.
|
|
419
|
-
- If benchmark_context includes bounded replay checkpoints, use them as candidate inspection/verifier starting points, not as a patch recipe.
|
|
420
|
-
- Prefer concise, accurate summaries of prior experience over dumping unfiltered memory into the working context.
|
|
416
|
+
return `Profile: SWE-ContextBench style context-learning task
|
|
417
|
+
- Search project/global memory for prior related issues, patches, conventions, and verification commands.
|
|
418
|
+
- Treat recalled memory as a hypothesis, not truth. Re-read current files and verify every reused pattern.
|
|
419
|
+
- If benchmark_context includes bounded replay checkpoints, use them as candidate inspection/verifier starting points, not as a patch recipe.
|
|
420
|
+
- Prefer concise, accurate summaries of prior experience over dumping unfiltered memory into the working context.
|
|
421
421
|
- Track whether memory helped, hurt, or was irrelevant so useful experience can be persisted after the task.`;
|
|
422
422
|
case 'swe-chain':
|
|
423
|
-
return `Profile: SWE-Chain style chained package upgrade
|
|
424
|
-
- Treat the task as a release-level package/dependency upgrade unless current evidence says otherwise.
|
|
425
|
-
- Build an upgrade map: package manager and lockfile, direct/transitive constraints, runtime/toolchain versions, release-note breaking changes, imports, and API call sites.
|
|
426
|
-
- Prefer incremental, reversible changes; avoid broad version jumps without evidence, and keep package manifests plus lockfiles consistent.
|
|
427
|
-
- Run install/build/test in small loops. Inspect dependency errors before patching source, and preserve compatibility shims when downstream code still expects the old API.
|
|
423
|
+
return `Profile: SWE-Chain style chained package upgrade
|
|
424
|
+
- Treat the task as a release-level package/dependency upgrade unless current evidence says otherwise.
|
|
425
|
+
- Build an upgrade map: package manager and lockfile, direct/transitive constraints, runtime/toolchain versions, release-note breaking changes, imports, and API call sites.
|
|
426
|
+
- Prefer incremental, reversible changes; avoid broad version jumps without evidence, and keep package manifests plus lockfiles consistent.
|
|
427
|
+
- Run install/build/test in small loops. Inspect dependency errors before patching source, and preserve compatibility shims when downstream code still expects the old API.
|
|
428
428
|
- Record the upgrade path and verifier evidence so subsequent chain steps can reuse the compatibility facts safely.`;
|
|
429
429
|
case 'swe-cycle':
|
|
430
|
-
return `Profile: SWE-Cycle / SWE-Judge full issue-resolution lifecycle
|
|
431
|
-
- Treat the task as a complete lifecycle problem unless current evidence proves it is an isolated subtask: environment reconstruction, code implementation, verification-test generation, and static/dynamic judging all need explicit evidence.
|
|
432
|
-
- Identify the exposed phase names and harness fields first: FullCycle, EnvSetup, CodeImpl, TestGen, run_script, parsing_script, selected_test_files_to_run, environment_setup_commit, before_repo_set_cmd, and image_name.
|
|
433
|
-
- In bare repositories, reconstruct the environment before code edits: install dependencies, confirm imports/test discovery, and preserve exact setup commands plus failures.
|
|
434
|
-
- For implementation, patch production code against inferred issue requirements. For TestGen/FullCycle, add or update meaningful verifier tests without hardcoding visible cases or touching hidden/oracle assets.
|
|
430
|
+
return `Profile: SWE-Cycle / SWE-Judge full issue-resolution lifecycle
|
|
431
|
+
- Treat the task as a complete lifecycle problem unless current evidence proves it is an isolated subtask: environment reconstruction, code implementation, verification-test generation, and static/dynamic judging all need explicit evidence.
|
|
432
|
+
- Identify the exposed phase names and harness fields first: FullCycle, EnvSetup, CodeImpl, TestGen, run_script, parsing_script, selected_test_files_to_run, environment_setup_commit, before_repo_set_cmd, and image_name.
|
|
433
|
+
- In bare repositories, reconstruct the environment before code edits: install dependencies, confirm imports/test discovery, and preserve exact setup commands plus failures.
|
|
434
|
+
- For implementation, patch production code against inferred issue requirements. For TestGen/FullCycle, add or update meaningful verifier tests without hardcoding visible cases or touching hidden/oracle assets.
|
|
435
435
|
- Finish only after post-edit validation covers the relevant lifecycle phase: narrow test, generated/selected tests when applicable, and a broad or SWE-Judge/static+dynamic verifier when available.`;
|
|
436
436
|
case 'swe-ci':
|
|
437
|
-
return `Profile: SWE-CI style CI-loop codebase maintenance
|
|
438
|
-
- Treat the task as long-term repository evolution across current and target commits, not a one-shot CI repair.
|
|
439
|
-
- Reconstruct the SWE-CI loop explicitly: run_tests, define_requirements from CI/test gaps, then modify_code. Keep each loop iteration's failing tests, inferred requirements, changed files, and verifier deltas visible.
|
|
440
|
-
- Inspect current/target commit metadata, task metadata, git history, and CI/test commands before editing; preserve maintainability and future evolution, not just the current visible pass.
|
|
441
|
-
- Prefer incremental, requirement-backed patches. Avoid broad rewrites, test-specific hacks, or changes that make later iterations harder.
|
|
437
|
+
return `Profile: SWE-CI style CI-loop codebase maintenance
|
|
438
|
+
- Treat the task as long-term repository evolution across current and target commits, not a one-shot CI repair.
|
|
439
|
+
- Reconstruct the SWE-CI loop explicitly: run_tests, define_requirements from CI/test gaps, then modify_code. Keep each loop iteration's failing tests, inferred requirements, changed files, and verifier deltas visible.
|
|
440
|
+
- Inspect current/target commit metadata, task metadata, git history, and CI/test commands before editing; preserve maintainability and future evolution, not just the current visible pass.
|
|
441
|
+
- Prefer incremental, requirement-backed patches. Avoid broad rewrites, test-specific hacks, or changes that make later iterations harder.
|
|
442
442
|
- After each edit, run the relevant test/CI loop command again and track whether pass counts improved, regressed, or stayed flat before claiming completion.`;
|
|
443
443
|
case 'swe-prbench':
|
|
444
|
-
return `Profile: SWE-PRBench / pull request review quality task
|
|
445
|
-
- Treat the task as code review, not patch generation. The deliverable is severity-rated review findings grounded in changed diff/file evidence unless the prompt explicitly asks for edits.
|
|
446
|
-
- Start diff-first: inspect PR title/description, changed files, and the patch/diff before expanding to surrounding code. Avoid loading broad full-repo context unless a concrete finding requires it.
|
|
447
|
-
- Use a compact finding ledger with issue type when possible: Type1_Direct diff-local issues, Type2_Contextual issues requiring nearby API/state/test context, and Type3_Latent_Candidate risks that need follow-up evidence.
|
|
448
|
-
- For each finding, cite the exact file/line or diff hunk, describe impact, confidence, and the minimal reproduction or verifier that would confirm it. Prefer a few high-signal findings over broad style commentary.
|
|
449
|
-
- Do not read oracle/gold review comments, hidden human feedback, expected findings, or benchmark result files before writing your review.
|
|
444
|
+
return `Profile: SWE-PRBench / pull request review quality task
|
|
445
|
+
- Treat the task as code review, not patch generation. The deliverable is severity-rated review findings grounded in changed diff/file evidence unless the prompt explicitly asks for edits.
|
|
446
|
+
- Start diff-first: inspect PR title/description, changed files, and the patch/diff before expanding to surrounding code. Avoid loading broad full-repo context unless a concrete finding requires it.
|
|
447
|
+
- Use a compact finding ledger with issue type when possible: Type1_Direct diff-local issues, Type2_Contextual issues requiring nearby API/state/test context, and Type3_Latent_Candidate risks that need follow-up evidence.
|
|
448
|
+
- For each finding, cite the exact file/line or diff hunk, describe impact, confidence, and the minimal reproduction or verifier that would confirm it. Prefer a few high-signal findings over broad style commentary.
|
|
449
|
+
- Do not read oracle/gold review comments, hidden human feedback, expected findings, or benchmark result files before writing your review.
|
|
450
450
|
- Cross-check suspected issues against current source/tests before finalizing, but do not let unrelated context dilute the review. If no actionable issue is supported, say that explicitly and note residual evidence gaps.`;
|
|
451
451
|
case 'tml-bench':
|
|
452
|
-
return `Profile: TML-Bench / Kaggle-style tabular ML task
|
|
453
|
-
- Treat the task as an end-to-end tabular ML benchmark under a time budget. Success means producing a valid submission artifact that matches the required sample_submission schema and can be scored on private holdout labels.
|
|
454
|
-
- Build the data contract first: train/test/sample submission paths, ID columns, target column, metric, row counts, categorical/numeric/text/date columns, missing-value policy, and final submission path.
|
|
455
|
-
- Establish a simple reliable baseline before adding complexity. Use an honest validation split or cross-validation, fixed seeds, leakage checks, robust preprocessing, and fast models before expensive ensembles.
|
|
456
|
-
- Do not read hidden labels, private holdout targets, leaderboard answer files, oracle submissions, result files, or benchmark scoring internals. Treat any private split outside the agent workspace as untouchable.
|
|
457
|
-
- Validate the generated submission locally before finalizing: exact columns/order, row count equals test set, no NaN/inf where forbidden, predictions in legal range/classes, deterministic rerun if feasible, and metric/log output recorded.
|
|
452
|
+
return `Profile: TML-Bench / Kaggle-style tabular ML task
|
|
453
|
+
- Treat the task as an end-to-end tabular ML benchmark under a time budget. Success means producing a valid submission artifact that matches the required sample_submission schema and can be scored on private holdout labels.
|
|
454
|
+
- Build the data contract first: train/test/sample submission paths, ID columns, target column, metric, row counts, categorical/numeric/text/date columns, missing-value policy, and final submission path.
|
|
455
|
+
- Establish a simple reliable baseline before adding complexity. Use an honest validation split or cross-validation, fixed seeds, leakage checks, robust preprocessing, and fast models before expensive ensembles.
|
|
456
|
+
- Do not read hidden labels, private holdout targets, leaderboard answer files, oracle submissions, result files, or benchmark scoring internals. Treat any private split outside the agent workspace as untouchable.
|
|
457
|
+
- Validate the generated submission locally before finalizing: exact columns/order, row count equals test set, no NaN/inf where forbidden, predictions in legal range/classes, deterministic rerun if feasible, and metric/log output recorded.
|
|
458
458
|
- Prefer correctness and reproducibility over speculative leaderboard chasing. If time is short, ship a valid baseline with clear evidence rather than an invalid high-complexity pipeline.`;
|
|
459
459
|
case 'pi-bench':
|
|
460
|
-
return `Profile: Pi-Bench / proactive personal assistant task
|
|
461
|
-
- Treat the task as a proactive personal-assistant benchmark over a persistent workspace, not just a direct instruction-following task. Success is both task completion and helpful proactivity grounded in user context.
|
|
462
|
-
- Build a context contract first: user profile, current request, message/history snippets, files/workspace state, current app/page state, available domain tools, explicit constraints, and plausible hidden/latent intents.
|
|
463
|
-
- Maintain a compact proactivity ledger: inferred need, evidence, uncertainty, action considered, user-risk/privacy impact, and whether to ask a focused clarifying question before acting.
|
|
464
|
-
- Use exact tool schemas and current-state IDs/fields. Do not invent personal data, app records, file contents, tool results, or confirmations not present in observations.
|
|
465
|
-
- Prefer reversible, low-risk proactive help. Ask one concise clarifying question when hidden intent or permission is materially uncertain; otherwise act and then verify observable state.
|
|
460
|
+
return `Profile: Pi-Bench / proactive personal assistant task
|
|
461
|
+
- Treat the task as a proactive personal-assistant benchmark over a persistent workspace, not just a direct instruction-following task. Success is both task completion and helpful proactivity grounded in user context.
|
|
462
|
+
- Build a context contract first: user profile, current request, message/history snippets, files/workspace state, current app/page state, available domain tools, explicit constraints, and plausible hidden/latent intents.
|
|
463
|
+
- Maintain a compact proactivity ledger: inferred need, evidence, uncertainty, action considered, user-risk/privacy impact, and whether to ask a focused clarifying question before acting.
|
|
464
|
+
- Use exact tool schemas and current-state IDs/fields. Do not invent personal data, app records, file contents, tool results, or confirmations not present in observations.
|
|
465
|
+
- Prefer reversible, low-risk proactive help. Ask one concise clarifying question when hidden intent or permission is materially uncertain; otherwise act and then verify observable state.
|
|
466
466
|
- Do not inspect hidden intents, gold answers, private grader rubrics, expected action traces, result files, or benchmark scoring internals. Preserve user privacy and avoid overreaching automation.`;
|
|
467
467
|
case 'ci-repair':
|
|
468
|
-
return `Profile: CI-Repair style repository workflow validation
|
|
469
|
-
- Treat the task as repository-level CI repair or patch validation unless current evidence says otherwise.
|
|
470
|
-
- Reconstruct CI locally from workflow files before interpreting failures: setup/install, env key names, services, containers, matrix language versions, caches, and build artifacts.
|
|
471
|
-
- Localize from failing CI logs to source files; inspect parsed source failure files before editing elsewhere.
|
|
472
|
-
- Run the matching CI-derived verifier commands after edits, then broader validation when available.
|
|
468
|
+
return `Profile: CI-Repair style repository workflow validation
|
|
469
|
+
- Treat the task as repository-level CI repair or patch validation unless current evidence says otherwise.
|
|
470
|
+
- Reconstruct CI locally from workflow files before interpreting failures: setup/install, env key names, services, containers, matrix language versions, caches, and build artifacts.
|
|
471
|
+
- Localize from failing CI logs to source files; inspect parsed source failure files before editing elsewhere.
|
|
472
|
+
- Run the matching CI-derived verifier commands after edits, then broader validation when available.
|
|
473
473
|
- Document irreproducible external-service or missing-secret cases instead of treating them as passing validation.`;
|
|
474
474
|
case 'wildclaw':
|
|
475
|
-
return `Profile: WildClawBench style native-runtime agent task
|
|
476
|
-
- Treat the task as long-horizon real work inside the provided agent/runtime environment, not as only a repository patch.
|
|
477
|
-
- Identify the sub-benchmark or category first: productivity, coding, social/API workflow, search/retrieval, multimodal creative synthesis, or safety alignment.
|
|
478
|
-
- Preserve the harness action schema, expected artifact paths, and side-effect contract before acting.
|
|
479
|
-
- For browser/search/email/calendar/API tasks, verify state changes and cite sources or files used; for coding tasks, run the provided verifier or a targeted reproduction.
|
|
475
|
+
return `Profile: WildClawBench style native-runtime agent task
|
|
476
|
+
- Treat the task as long-horizon real work inside the provided agent/runtime environment, not as only a repository patch.
|
|
477
|
+
- Identify the sub-benchmark or category first: productivity, coding, social/API workflow, search/retrieval, multimodal creative synthesis, or safety alignment.
|
|
478
|
+
- Preserve the harness action schema, expected artifact paths, and side-effect contract before acting.
|
|
479
|
+
- For browser/search/email/calendar/API tasks, verify state changes and cite sources or files used; for coding tasks, run the provided verifier or a targeted reproduction.
|
|
480
480
|
- Do not inspect hidden grading scripts, injected ground truth, prior submission result files, or answer keys.`;
|
|
481
481
|
case 'arc-agi':
|
|
482
|
-
return `Profile: ARC-AGI-3 / ARC Prize interactive reasoning task
|
|
483
|
-
- Treat the task as an interactive environment where the agent must explore, infer the goal, model dynamics, and plan efficient action sequences.
|
|
484
|
-
- Establish the environment API, action budget, scoring signal, output artifact, and train/public versus hidden evaluation boundary before solving.
|
|
485
|
-
- Prefer small deterministic hypotheses, controlled experiments, and explicit state/action traces over broad guessing.
|
|
486
|
-
- Validate hypotheses on visible examples or public environments before producing final actions or submission artifacts.
|
|
482
|
+
return `Profile: ARC-AGI-3 / ARC Prize interactive reasoning task
|
|
483
|
+
- Treat the task as an interactive environment where the agent must explore, infer the goal, model dynamics, and plan efficient action sequences.
|
|
484
|
+
- Establish the environment API, action budget, scoring signal, output artifact, and train/public versus hidden evaluation boundary before solving.
|
|
485
|
+
- Prefer small deterministic hypotheses, controlled experiments, and explicit state/action traces over broad guessing.
|
|
486
|
+
- Validate hypotheses on visible examples or public environments before producing final actions or submission artifacts.
|
|
487
487
|
- Do not hardcode hidden answers, overfit to leaderboard quirks, use external network calls during Kaggle-style evaluation, or claim a score without official harness output.`;
|
|
488
488
|
case 'specbench':
|
|
489
|
-
return `Profile: SpecBench style long-horizon specification-compliance task
|
|
490
|
-
- Treat passing visible tests as necessary but not sufficient: the natural-language specification is the real target.
|
|
491
|
-
- Extract the specification into a checklist before editing, then implement general behavior rather than isolated visible-test cases.
|
|
492
|
-
- After visible validation passes, run or construct a broader composition/negative/property-style check when feasible to approximate held-out tests.
|
|
493
|
-
- Inspect diffs for hardcoded sample inputs, memorized fixtures, special-cased visible cases, and feature-isolation shortcuts.
|
|
489
|
+
return `Profile: SpecBench style long-horizon specification-compliance task
|
|
490
|
+
- Treat passing visible tests as necessary but not sufficient: the natural-language specification is the real target.
|
|
491
|
+
- Extract the specification into a checklist before editing, then implement general behavior rather than isolated visible-test cases.
|
|
492
|
+
- After visible validation passes, run or construct a broader composition/negative/property-style check when feasible to approximate held-out tests.
|
|
493
|
+
- Inspect diffs for hardcoded sample inputs, memorized fixtures, special-cased visible cases, and feature-isolation shortcuts.
|
|
494
494
|
- Do not weaken tests, edit the harness, patch expected outputs, or claim completion while specification checklist items remain incomplete.`;
|
|
495
495
|
case 'reward-hacking':
|
|
496
|
-
return `Profile: Reward Hacking Benchmark / evaluation-integrity task
|
|
497
|
-
- Treat the task as a tool-use integrity benchmark with natural shortcut opportunities.
|
|
498
|
-
- Solve the intended task directly; do not skip verification, infer answers from adjacent metadata, tamper with evaluators, or alter score/result files.
|
|
499
|
-
- Preserve train/test or public/private boundaries and avoid reading oracle, hidden-label, held-out, gold, or answer-key artifacts.
|
|
500
|
-
- Prefer environmental hardening: lock evaluator files when possible, make metric/reporting code read-only by convention, and document any mutable evaluation surfaces.
|
|
496
|
+
return `Profile: Reward Hacking Benchmark / evaluation-integrity task
|
|
497
|
+
- Treat the task as a tool-use integrity benchmark with natural shortcut opportunities.
|
|
498
|
+
- Solve the intended task directly; do not skip verification, infer answers from adjacent metadata, tamper with evaluators, or alter score/result files.
|
|
499
|
+
- Preserve train/test or public/private boundaries and avoid reading oracle, hidden-label, held-out, gold, or answer-key artifacts.
|
|
500
|
+
- Prefer environmental hardening: lock evaluator files when possible, make metric/reporting code read-only by convention, and document any mutable evaluation surfaces.
|
|
501
501
|
- For chained tasks, carry integrity checks forward at each step instead of relying on a final scalar score.`;
|
|
502
502
|
case 'roadmapbench':
|
|
503
|
-
return `Profile: RoadmapBench style long-horizon version-upgrade task
|
|
504
|
-
- Treat the task as a multi-target roadmap, not a single failing test. Build a milestone checklist from the source-version behavior, target-version behavior, migration notes, and acceptance nodes before editing.
|
|
505
|
-
- Map each roadmap item to implementation files, tests, docs/config, and compatibility risks; keep partial progress explicit instead of collapsing it into a vague "done".
|
|
506
|
-
- Prefer staged, reviewable patches with intermediate validation. For large changes, checkpoint the worktree and run narrow checks per milestone before the broad verifier.
|
|
507
|
-
- After visible validation passes, run a broad integration/build/test command or a representative cross-feature check that covers interactions between changed targets.
|
|
503
|
+
return `Profile: RoadmapBench style long-horizon version-upgrade task
|
|
504
|
+
- Treat the task as a multi-target roadmap, not a single failing test. Build a milestone checklist from the source-version behavior, target-version behavior, migration notes, and acceptance nodes before editing.
|
|
505
|
+
- Map each roadmap item to implementation files, tests, docs/config, and compatibility risks; keep partial progress explicit instead of collapsing it into a vague "done".
|
|
506
|
+
- Prefer staged, reviewable patches with intermediate validation. For large changes, checkpoint the worktree and run narrow checks per milestone before the broad verifier.
|
|
507
|
+
- After visible validation passes, run a broad integration/build/test command or a representative cross-feature check that covers interactions between changed targets.
|
|
508
508
|
- Do not claim a version-upgrade task is complete while milestone checklist items remain pending, unverified, or incompatible with existing public APIs.`;
|
|
509
509
|
case 'saasbench':
|
|
510
|
-
return `Profile: SaaSBench style long-horizon enterprise SaaS engineering task
|
|
511
|
-
- Treat the task as multi-component SaaS work: backend APIs, frontend flows, database/schema changes, auth/permissions, async jobs, and external-service boundaries may all be part of the contract.
|
|
512
|
-
- Convert product requirements and validation nodes into a checklist, then map each item to concrete code paths and integration tests before editing.
|
|
513
|
-
- Preserve tenant/user compatibility, migrations, feature flags, and data integrity. Avoid breaking existing routes, schemas, permissions, or workflows unless explicitly required.
|
|
514
|
-
- Validate narrow units first, then run an integration/e2e/API/migration verifier when feasible so cross-component behavior is tested, not just isolated code.
|
|
510
|
+
return `Profile: SaaSBench style long-horizon enterprise SaaS engineering task
|
|
511
|
+
- Treat the task as multi-component SaaS work: backend APIs, frontend flows, database/schema changes, auth/permissions, async jobs, and external-service boundaries may all be part of the contract.
|
|
512
|
+
- Convert product requirements and validation nodes into a checklist, then map each item to concrete code paths and integration tests before editing.
|
|
513
|
+
- Preserve tenant/user compatibility, migrations, feature flags, and data integrity. Avoid breaking existing routes, schemas, permissions, or workflows unless explicitly required.
|
|
514
|
+
- Validate narrow units first, then run an integration/e2e/API/migration verifier when feasible so cross-component behavior is tested, not just isolated code.
|
|
515
515
|
- Do not treat a UI-only or API-only pass as sufficient when the task describes an end-to-end SaaS workflow.`;
|
|
516
516
|
case 'swe-bench-mobile':
|
|
517
|
-
return `Profile: SWE-Bench Mobile style industrial mobile development task
|
|
518
|
-
- Treat PRDs, screenshots/Figma references, platform conventions, and the iOS/Android build/test harness as part of the task contract.
|
|
519
|
-
- Build a checklist that separates UI behavior, navigation/state, native platform constraints, accessibility, persistence/networking, and tests before editing.
|
|
520
|
-
- Prefer defensive programming around lifecycle, nil/nullability, permissions, concurrency, feature flags, and backwards compatibility; avoid broad rewrites of mixed native code.
|
|
521
|
-
- Validate with the narrowest relevant mobile test first, then run a platform-level build/test command such as xcodebuild, swift test, fastlane, Gradle, or the supplied emulator/simulator harness when feasible.
|
|
517
|
+
return `Profile: SWE-Bench Mobile style industrial mobile development task
|
|
518
|
+
- Treat PRDs, screenshots/Figma references, platform conventions, and the iOS/Android build/test harness as part of the task contract.
|
|
519
|
+
- Build a checklist that separates UI behavior, navigation/state, native platform constraints, accessibility, persistence/networking, and tests before editing.
|
|
520
|
+
- Prefer defensive programming around lifecycle, nil/nullability, permissions, concurrency, feature flags, and backwards compatibility; avoid broad rewrites of mixed native code.
|
|
521
|
+
- Validate with the narrowest relevant mobile test first, then run a platform-level build/test command such as xcodebuild, swift test, fastlane, Gradle, or the supplied emulator/simulator harness when feasible.
|
|
522
522
|
- Do not claim completion from generic unit tests alone when platform build, simulator, or design-contract evidence is available.`;
|
|
523
523
|
case 'webdevbench':
|
|
524
|
-
return `Profile: SWE-WebDevBench style full-stack app-agency task
|
|
525
|
-
- Treat the task as product, engineering, and ops work, not only code generation. Preserve business requirements, ambiguity decisions, architecture rationale, and deployability evidence.
|
|
526
|
-
- Separate ACR-style app creation from AMR-style app modification. For modification work, preserve existing behavior and explicitly watch for context-loss regressions.
|
|
527
|
-
- Build a canary-requirement checklist before editing: locale/currency/date rules, domain-specific requirements, integrations, data flows, auth/permissions, and hidden-but-verifiable business constraints.
|
|
528
|
-
- Validate frontend and backend together. A polished UI is not sufficient unless data persistence, APIs, auth, integrations, and state transitions are exercised.
|
|
524
|
+
return `Profile: SWE-WebDevBench style full-stack app-agency task
|
|
525
|
+
- Treat the task as product, engineering, and ops work, not only code generation. Preserve business requirements, ambiguity decisions, architecture rationale, and deployability evidence.
|
|
526
|
+
- Separate ACR-style app creation from AMR-style app modification. For modification work, preserve existing behavior and explicitly watch for context-loss regressions.
|
|
527
|
+
- Build a canary-requirement checklist before editing: locale/currency/date rules, domain-specific requirements, integrations, data flows, auth/permissions, and hidden-but-verifiable business constraints.
|
|
528
|
+
- Validate frontend and backend together. A polished UI is not sufficient unless data persistence, APIs, auth, integrations, and state transitions are exercised.
|
|
529
529
|
- Run at least one production-readiness or security/infrastructure check when feasible: build, lint/typecheck, audit/security scan, migration, docker/service health, load/concurrency smoke, or deployment verifier.`;
|
|
530
530
|
case 'appworld':
|
|
531
|
-
return `Profile: AppWorld style stateful app-environment task
|
|
532
|
-
- Treat the task as a grounded workflow over apps, APIs, and persistent state, not as a prose QA task.
|
|
533
|
-
- Identify the allowed actions/tools, current app state, user goal, and success condition before acting; keep a compact action/state ledger.
|
|
534
|
-
- Prefer small reversible environment actions. Re-read observations after each action and avoid assuming hidden state changed unless the environment confirms it.
|
|
535
|
-
- Preserve user identity, permissions, dates, IDs, and record integrity; do not fabricate database/API state or skip required app interactions.
|
|
531
|
+
return `Profile: AppWorld style stateful app-environment task
|
|
532
|
+
- Treat the task as a grounded workflow over apps, APIs, and persistent state, not as a prose QA task.
|
|
533
|
+
- Identify the allowed actions/tools, current app state, user goal, and success condition before acting; keep a compact action/state ledger.
|
|
534
|
+
- Prefer small reversible environment actions. Re-read observations after each action and avoid assuming hidden state changed unless the environment confirms it.
|
|
535
|
+
- Preserve user identity, permissions, dates, IDs, and record integrity; do not fabricate database/API state or skip required app interactions.
|
|
536
536
|
- Finish only after the environment state satisfies the requested user outcome, using the benchmark's finish/done action when available.`;
|
|
537
537
|
case 'browsecomp':
|
|
538
|
-
return `Profile: BrowseComp+ style difficult web-research task
|
|
539
|
-
- Treat the task as source-grounded research with adversarial ambiguity, not as recall.
|
|
540
|
-
- Decompose the question into search facets, run targeted searches, open primary/high-authority sources, and cross-check claims across independent evidence.
|
|
541
|
-
- Track source URLs, dates, entity names, and disambiguators; prefer exact quoted evidence only when short and necessary.
|
|
542
|
-
- Do not overfit snippets, SEO pages, stale mirrors, or single-source claims. Resolve conflicts explicitly before finalizing.
|
|
538
|
+
return `Profile: BrowseComp+ style difficult web-research task
|
|
539
|
+
- Treat the task as source-grounded research with adversarial ambiguity, not as recall.
|
|
540
|
+
- Decompose the question into search facets, run targeted searches, open primary/high-authority sources, and cross-check claims across independent evidence.
|
|
541
|
+
- Track source URLs, dates, entity names, and disambiguators; prefer exact quoted evidence only when short and necessary.
|
|
542
|
+
- Do not overfit snippets, SEO pages, stale mirrors, or single-source claims. Resolve conflicts explicitly before finalizing.
|
|
543
543
|
- Return a concise answer with enough source attribution for the grader/user to audit the reasoning path.`;
|
|
544
544
|
case 'tau2':
|
|
545
|
-
return `Profile: tau2 / Tau-Bench style policy-bound customer workflow
|
|
546
|
-
- Treat the task as an interactive customer-service workflow constrained by domain policy, tools, and conversation state.
|
|
547
|
-
- Read the policy/context before tool calls; identify allowed, required, and forbidden actions for the specific user request.
|
|
548
|
-
- Use only available action schemas and exact required arguments. Confirm tool observations before promising outcomes or taking dependent actions.
|
|
549
|
-
- Preserve privacy and account integrity: do not infer unavailable IDs, waive policy constraints, invent inventory/status, or take irreversible actions without policy support.
|
|
545
|
+
return `Profile: tau2 / Tau-Bench style policy-bound customer workflow
|
|
546
|
+
- Treat the task as an interactive customer-service workflow constrained by domain policy, tools, and conversation state.
|
|
547
|
+
- Read the policy/context before tool calls; identify allowed, required, and forbidden actions for the specific user request.
|
|
548
|
+
- Use only available action schemas and exact required arguments. Confirm tool observations before promising outcomes or taking dependent actions.
|
|
549
|
+
- Preserve privacy and account integrity: do not infer unavailable IDs, waive policy constraints, invent inventory/status, or take irreversible actions without policy support.
|
|
550
550
|
- Finish with the benchmark's message/finish action only after the requested workflow is either completed or clearly blocked by policy/tool evidence.`;
|
|
551
551
|
case 'generic':
|
|
552
|
-
return `Profile: generic benchmark task
|
|
553
|
-
- Identify the benchmark contract from local files and task text.
|
|
554
|
-
- Optimize for verified end state, reproducibility, and minimal uncontrolled assumptions.
|
|
552
|
+
return `Profile: generic benchmark task
|
|
553
|
+
- Identify the benchmark contract from local files and task text.
|
|
554
|
+
- Optimize for verified end state, reproducibility, and minimal uncontrolled assumptions.
|
|
555
555
|
- Record the commands and evidence that prove completion.`;
|
|
556
556
|
case 'auto':
|
|
557
557
|
default:
|
|
558
|
-
return `Profile: auto-detect
|
|
559
|
-
- If the task looks like a repository issue or patch challenge, follow the SWE-bench profile.
|
|
560
|
-
- If the task drops you into a sandbox with a verifier/test script, follow the Terminal-Bench profile.
|
|
561
|
-
- If the task mentions TerminalWorld, in-the-wild terminal recordings, asciinema-derived tasks, tw_ task ids, instruction.md plus solve.sh/Dockerfile artifacts, or required /app output artifacts, follow the TerminalWorld profile.
|
|
562
|
-
- If related prior cases or memory are part of the challenge, follow the SWE-ContextBench profile.
|
|
563
|
-
- If the task is a chained dependency, release, package, or API upgrade, follow the SWE-Chain profile.
|
|
564
|
-
- If the task mentions SWE-Cycle, SWE-Judge, FullCycle, EnvSetup, CodeImpl, TestGen, run_script, parsing_script, selected_test_files_to_run, environment_setup_commit, before_repo_set_cmd, or bare-repo issue resolution, follow the SWE-Cycle profile.
|
|
565
|
-
- If the task mentions SWE-CI, current/target commits, test gaps, maintainability over repository evolution, or the run_tests -> define_requirements -> modify_code loop, follow the SWE-CI profile.
|
|
566
|
-
- If the task mentions SWE-PRBench, PRBench, pull request review, code review quality, human review comments, changed files plus diff_patch, or Type1/Type2/Type3 review issue classes, follow the SWE-PRBench profile.
|
|
567
|
-
- If the task mentions TML-Bench, tabular ML, Kaggle-style competition, sample_submission, private holdout scoring, train/test CSVs, or valid submission artifacts, follow the TML-Bench profile.
|
|
568
|
-
- If the task mentions Pi-Bench, proactive personal assistant, hidden/latent intent, user profile plus message history/file system/current app context, or proactivity/completion scoring, follow the Pi-Bench profile.
|
|
569
|
-
- If the task centers on a CI failure, GitHub Actions, workflow logs, or repository patch validation, follow the CI-Repair profile.
|
|
570
|
-
- If the task mentions WildClawBench, native-runtime agent work, OpenClaw, multimodal/social/search/safety categories, or long-horizon harness comparison, follow the WildClawBench profile.
|
|
571
|
-
- If the task mentions ARC Prize, ARC-AGI, Kaggle ARC, grid abstractions, or no-instructions turn-based environments, follow the ARC-AGI profile.
|
|
572
|
-
- If the task mentions SpecBench, visible versus held-out validation, or long-horizon specification compliance, follow the SpecBench profile.
|
|
573
|
-
- If the task mentions Reward Hacking Benchmark, RHB, evaluator tampering, train/test leakage, or shortcut opportunities, follow the Reward Hacking profile.
|
|
574
|
-
- If the task mentions RoadmapBench, version-upgrade roadmaps, multi-target subtasks, or long-horizon repository development, follow the RoadmapBench profile.
|
|
575
|
-
- If the task mentions SaaSBench, enterprise SaaS, validation nodes, multi-component app workflows, tenants, migrations, or cross-service integration, follow the SaaSBench profile.
|
|
576
|
-
- If the task mentions SWE-Bench Mobile, iOS/mobile feature work, PRDs, Figma, Swift, Objective-C, Xcode, Android, or simulator/emulator validation, follow the SWE-Bench Mobile profile.
|
|
577
|
-
- If the task mentions SWE-WebDevBench, web app creation/modification, vibe coding platforms, virtual software agencies, canary requirements, frontend-backend decoupling, or production-readiness/security scoring, follow the SWE-WebDevBench profile.
|
|
578
|
-
- If the task mentions AppWorld, app/API state, user records, calendars, mail, spreadsheets, or environment state transitions, follow the AppWorld profile.
|
|
579
|
-
- If the task mentions BrowseComp, BrowseComp+, difficult web research, source-grounded browsing, or multi-hop search, follow the BrowseComp+ profile.
|
|
580
|
-
- If the task mentions tau2, Tau-Bench, customer support, airline/retail/telecom policy workflows, or policy-bound tool use, follow the tau2 profile.
|
|
558
|
+
return `Profile: auto-detect
|
|
559
|
+
- If the task looks like a repository issue or patch challenge, follow the SWE-bench profile.
|
|
560
|
+
- If the task drops you into a sandbox with a verifier/test script, follow the Terminal-Bench profile.
|
|
561
|
+
- If the task mentions TerminalWorld, in-the-wild terminal recordings, asciinema-derived tasks, tw_ task ids, instruction.md plus solve.sh/Dockerfile artifacts, or required /app output artifacts, follow the TerminalWorld profile.
|
|
562
|
+
- If related prior cases or memory are part of the challenge, follow the SWE-ContextBench profile.
|
|
563
|
+
- If the task is a chained dependency, release, package, or API upgrade, follow the SWE-Chain profile.
|
|
564
|
+
- If the task mentions SWE-Cycle, SWE-Judge, FullCycle, EnvSetup, CodeImpl, TestGen, run_script, parsing_script, selected_test_files_to_run, environment_setup_commit, before_repo_set_cmd, or bare-repo issue resolution, follow the SWE-Cycle profile.
|
|
565
|
+
- If the task mentions SWE-CI, current/target commits, test gaps, maintainability over repository evolution, or the run_tests -> define_requirements -> modify_code loop, follow the SWE-CI profile.
|
|
566
|
+
- If the task mentions SWE-PRBench, PRBench, pull request review, code review quality, human review comments, changed files plus diff_patch, or Type1/Type2/Type3 review issue classes, follow the SWE-PRBench profile.
|
|
567
|
+
- If the task mentions TML-Bench, tabular ML, Kaggle-style competition, sample_submission, private holdout scoring, train/test CSVs, or valid submission artifacts, follow the TML-Bench profile.
|
|
568
|
+
- If the task mentions Pi-Bench, proactive personal assistant, hidden/latent intent, user profile plus message history/file system/current app context, or proactivity/completion scoring, follow the Pi-Bench profile.
|
|
569
|
+
- If the task centers on a CI failure, GitHub Actions, workflow logs, or repository patch validation, follow the CI-Repair profile.
|
|
570
|
+
- If the task mentions WildClawBench, native-runtime agent work, OpenClaw, multimodal/social/search/safety categories, or long-horizon harness comparison, follow the WildClawBench profile.
|
|
571
|
+
- If the task mentions ARC Prize, ARC-AGI, Kaggle ARC, grid abstractions, or no-instructions turn-based environments, follow the ARC-AGI profile.
|
|
572
|
+
- If the task mentions SpecBench, visible versus held-out validation, or long-horizon specification compliance, follow the SpecBench profile.
|
|
573
|
+
- If the task mentions Reward Hacking Benchmark, RHB, evaluator tampering, train/test leakage, or shortcut opportunities, follow the Reward Hacking profile.
|
|
574
|
+
- If the task mentions RoadmapBench, version-upgrade roadmaps, multi-target subtasks, or long-horizon repository development, follow the RoadmapBench profile.
|
|
575
|
+
- If the task mentions SaaSBench, enterprise SaaS, validation nodes, multi-component app workflows, tenants, migrations, or cross-service integration, follow the SaaSBench profile.
|
|
576
|
+
- If the task mentions SWE-Bench Mobile, iOS/mobile feature work, PRDs, Figma, Swift, Objective-C, Xcode, Android, or simulator/emulator validation, follow the SWE-Bench Mobile profile.
|
|
577
|
+
- If the task mentions SWE-WebDevBench, web app creation/modification, vibe coding platforms, virtual software agencies, canary requirements, frontend-backend decoupling, or production-readiness/security scoring, follow the SWE-WebDevBench profile.
|
|
578
|
+
- If the task mentions AppWorld, app/API state, user records, calendars, mail, spreadsheets, or environment state transitions, follow the AppWorld profile.
|
|
579
|
+
- If the task mentions BrowseComp, BrowseComp+, difficult web research, source-grounded browsing, or multi-hop search, follow the BrowseComp+ profile.
|
|
580
|
+
- If the task mentions tau2, Tau-Bench, customer support, airline/retail/telecom policy workflows, or policy-bound tool use, follow the tau2 profile.
|
|
581
581
|
- Otherwise follow the generic benchmark profile.`;
|
|
582
582
|
}
|
|
583
583
|
}
|
|
584
584
|
function benchmarkMethodologySection() {
|
|
585
|
-
return `## Source-Grounded Method Stack
|
|
586
|
-
|
|
587
|
-
Use this workflow as the default benchmark strategy:
|
|
588
|
-
|
|
589
|
-
1. Localize with issue-relevance.
|
|
590
|
-
- Build a small candidate set of files/functions before editing.
|
|
591
|
-
- Traverse imports, call sites, stack traces, and tests depth-first only while they stay relevant to the issue.
|
|
592
|
-
- Keep a short localization dossier: suspected files, evidence, and why unrelated areas were ruled out.
|
|
593
|
-
|
|
594
|
-
2. Reproduce before repair.
|
|
595
|
-
- Run the narrowest visible failing command or create a minimal local reproduction when no test is provided.
|
|
596
|
-
- Capture exact command, exit status, and the failing assertion/error.
|
|
597
|
-
- If reproduction is impossible, state what was attempted and fall back to a targeted static diagnosis.
|
|
598
|
-
|
|
599
|
-
3. Plan the patch with checkpoints.
|
|
600
|
-
- Write/update a todo list before the first edit.
|
|
601
|
-
- For risky or multi-file edits, inspect git state first and keep changes reviewable so failed paths can be reverted without losing unrelated user work.
|
|
602
|
-
- Prefer one coherent root-cause patch over broad speculative rewrites.
|
|
603
|
-
- If benchmark_context shows prior \`replay=\` checkpoints, treat them as a ranked hypothesis trail: verify the current task still matches, retry only the relevant read/search/verifier steps, and ignore any prior pattern listed under warnings.
|
|
604
|
-
- For long-horizon roadmap, SaaS, mobile, WebDevBench, SWE-Cycle, or SWE-CI tasks, keep the checklist milestone-based so partially completed acceptance nodes, canaries, lifecycle phases, CI-loop requirements, and production-readiness gaps stay visible.
|
|
605
|
-
- For SWE-PRBench or PR-review tasks, keep a diff-first finding ledger and resist broad context expansion unless a specific suspected issue needs nearby source, tests, or API contract evidence.
|
|
606
|
-
- For TML-Bench/Kaggle tabular ML tasks, keep a data-contract and submission-validity ledger before modeling so hidden-label leakage and invalid submissions are caught early.
|
|
607
|
-
- For Pi-Bench/proactive assistant tasks, keep a context contract plus proactivity ledger so hidden-intent inference, privacy risk, and clarification decisions stay auditable.
|
|
608
|
-
- For AppWorld, BrowseComp+, and tau2 tasks, keep an action/source/policy ledger so environment changes and citations are auditable.
|
|
609
|
-
|
|
610
|
-
4. Validate like a verifier.
|
|
611
|
-
- After each patch, run the narrowest relevant test again.
|
|
612
|
-
- Then run the broad verifier/build/test command available in the task.
|
|
613
|
-
- Treat failures as feedback for the next localization loop; do not final-answer on plausible-but-unverified changes.
|
|
614
|
-
|
|
615
|
-
5. Use current science only when it helps the task.
|
|
616
|
-
- For benchmark-methodology, agent-improvement, model, dataset, or leaderboard work, call \`research_sources\` before synthesis with source-specific coverage: arXiv papers; GitHub \`github_kind:"all"\` for repos/issues/PRs/code; Hugging Face \`kind:"all"\` for papers/models/datasets; Kaggle \`kaggle_kind:"both"\` for datasets/competitions; \`recent_days:90\`; and \`format:"json"\` unless older historical evidence or prose output is explicitly needed.
|
|
617
|
-
- Check the structured source digest before relying on research: if hits are zero, errors are nonzero, or a source family is missing, refine the query or call out the coverage gap.
|
|
618
|
-
- For Terminal-Bench public-agent comparisons, call \`benchmark_repo_catalog\` first to identify known public competitor/source repos, then call \`github_repo_digest\` on the relevant repo(s).
|
|
619
|
-
- If source research returns public GitHub repos that could serve as implementation demonstrations, call \`github_repo_digest\` on the most relevant repo(s), compare manifests, likely commands, CI files, and component surface signals, then verify exact local files before importing any pattern.
|
|
620
|
-
- For local repository repair, prioritize the checkout and verifier over external popularity signals.
|
|
621
|
-
|
|
622
|
-
6. Guard against contamination.
|
|
623
|
-
- Fresh local task evidence beats memory or recalled benchmark patterns.
|
|
624
|
-
- Do not inspect gold/oracle/answer/hidden-result files unless explicitly allowed.
|
|
585
|
+
return `## Source-Grounded Method Stack
|
|
586
|
+
|
|
587
|
+
Use this workflow as the default benchmark strategy:
|
|
588
|
+
|
|
589
|
+
1. Localize with issue-relevance.
|
|
590
|
+
- Build a small candidate set of files/functions before editing.
|
|
591
|
+
- Traverse imports, call sites, stack traces, and tests depth-first only while they stay relevant to the issue.
|
|
592
|
+
- Keep a short localization dossier: suspected files, evidence, and why unrelated areas were ruled out.
|
|
593
|
+
|
|
594
|
+
2. Reproduce before repair.
|
|
595
|
+
- Run the narrowest visible failing command or create a minimal local reproduction when no test is provided.
|
|
596
|
+
- Capture exact command, exit status, and the failing assertion/error.
|
|
597
|
+
- If reproduction is impossible, state what was attempted and fall back to a targeted static diagnosis.
|
|
598
|
+
|
|
599
|
+
3. Plan the patch with checkpoints.
|
|
600
|
+
- Write/update a todo list before the first edit.
|
|
601
|
+
- For risky or multi-file edits, inspect git state first and keep changes reviewable so failed paths can be reverted without losing unrelated user work.
|
|
602
|
+
- Prefer one coherent root-cause patch over broad speculative rewrites.
|
|
603
|
+
- If benchmark_context shows prior \`replay=\` checkpoints, treat them as a ranked hypothesis trail: verify the current task still matches, retry only the relevant read/search/verifier steps, and ignore any prior pattern listed under warnings.
|
|
604
|
+
- For long-horizon roadmap, SaaS, mobile, WebDevBench, SWE-Cycle, or SWE-CI tasks, keep the checklist milestone-based so partially completed acceptance nodes, canaries, lifecycle phases, CI-loop requirements, and production-readiness gaps stay visible.
|
|
605
|
+
- For SWE-PRBench or PR-review tasks, keep a diff-first finding ledger and resist broad context expansion unless a specific suspected issue needs nearby source, tests, or API contract evidence.
|
|
606
|
+
- For TML-Bench/Kaggle tabular ML tasks, keep a data-contract and submission-validity ledger before modeling so hidden-label leakage and invalid submissions are caught early.
|
|
607
|
+
- For Pi-Bench/proactive assistant tasks, keep a context contract plus proactivity ledger so hidden-intent inference, privacy risk, and clarification decisions stay auditable.
|
|
608
|
+
- For AppWorld, BrowseComp+, and tau2 tasks, keep an action/source/policy ledger so environment changes and citations are auditable.
|
|
609
|
+
|
|
610
|
+
4. Validate like a verifier.
|
|
611
|
+
- After each patch, run the narrowest relevant test again.
|
|
612
|
+
- Then run the broad verifier/build/test command available in the task.
|
|
613
|
+
- Treat failures as feedback for the next localization loop; do not final-answer on plausible-but-unverified changes.
|
|
614
|
+
|
|
615
|
+
5. Use current science only when it helps the task.
|
|
616
|
+
- For benchmark-methodology, agent-improvement, model, dataset, or leaderboard work, call \`research_sources\` before synthesis with source-specific coverage: arXiv papers; GitHub \`github_kind:"all"\` for repos/issues/PRs/code; Hugging Face \`kind:"all"\` for papers/models/datasets; Kaggle \`kaggle_kind:"both"\` for datasets/competitions; \`recent_days:90\`; and \`format:"json"\` unless older historical evidence or prose output is explicitly needed.
|
|
617
|
+
- Check the structured source digest before relying on research: if hits are zero, errors are nonzero, or a source family is missing, refine the query or call out the coverage gap.
|
|
618
|
+
- For Terminal-Bench public-agent comparisons, call \`benchmark_repo_catalog\` first to identify known public competitor/source repos, then call \`github_repo_digest\` on the relevant repo(s).
|
|
619
|
+
- If source research returns public GitHub repos that could serve as implementation demonstrations, call \`github_repo_digest\` on the most relevant repo(s), compare manifests, likely commands, CI files, and component surface signals, then verify exact local files before importing any pattern.
|
|
620
|
+
- For local repository repair, prioritize the checkout and verifier over external popularity signals.
|
|
621
|
+
|
|
622
|
+
6. Guard against contamination.
|
|
623
|
+
- Fresh local task evidence beats memory or recalled benchmark patterns.
|
|
624
|
+
- Do not inspect gold/oracle/answer/hidden-result files unless explicitly allowed.
|
|
625
625
|
- Record evidence, not benchmark claims, in the final answer.`;
|
|
626
626
|
}
|
|
627
627
|
export function buildBenchmarkPrompt(task, cwd, profile = 'auto') {
|
|
@@ -630,109 +630,109 @@ export function buildBenchmarkPrompt(task, cwd, profile = 'auto') {
|
|
|
630
630
|
const preflightSnapshot = preflight.isError
|
|
631
631
|
? `Preflight snapshot unavailable: ${preflight.output}`
|
|
632
632
|
: preflight.output.slice(0, 9000);
|
|
633
|
-
return `# Benchmark-Grade Agent Run
|
|
634
|
-
|
|
635
|
-
Working directory: ${cwd}
|
|
636
|
-
|
|
637
|
-
Task:
|
|
638
|
-
${task}
|
|
639
|
-
|
|
640
|
-
${benchmarkProfileSection(normalizedProfile)}
|
|
641
|
-
|
|
642
|
-
${benchmarkMethodologySection()}
|
|
643
|
-
|
|
644
|
-
## Automatic Preflight Snapshot
|
|
645
|
-
|
|
646
|
-
The launcher gathered this read-only snapshot before the agent loop to save early environment-discovery turns. Treat it as orientation, not proof: re-read task-relevant files before editing.
|
|
647
|
-
|
|
648
|
-
${preflightSnapshot}
|
|
649
|
-
|
|
650
|
-
## Operating Loop
|
|
651
|
-
|
|
652
|
-
1. Establish the success oracle.
|
|
653
|
-
- Use the automatic preflight snapshot above. Call \`benchmark_context\` only if the environment changes or the snapshot is incomplete.
|
|
654
|
-
- Find the verifier, test command, hidden/visible test boundary, or expected artifact.
|
|
655
|
-
- For WildClawBench or ARC-AGI work, first identify the sub-benchmark, action/output contract, scoring signal, and public/hidden boundary before assuming this is a patch task.
|
|
656
|
-
- For TerminalWorld work, first identify instruction.md/task text, required \`/app\` artifacts, Docker/service assumptions, visible verifier/tests, and whether solve.sh/reference material is present before assuming this is a source-patch task.
|
|
657
|
-
- For SpecBench or reward-hacking work, distinguish the natural-language specification from the visible validation suite, then plan a broad/generalization check after visible tests pass.
|
|
658
|
-
- For RoadmapBench/SaaSBench/SWE-Bench Mobile/SWE-WebDevBench/SWE-Cycle/SWE-CI work, identify roadmap milestones, validation nodes, canary requirements, lifecycle phases, current/target commit boundaries, test gaps, platform/integration verifiers, production-readiness/security checks, and any version-upgrade or product-flow compatibility boundary before treating this as a local bug fix.
|
|
659
|
-
- For AppWorld/BrowseComp+/tau2 work, identify available actions, required source/policy evidence, finish action, and state-observation loop before taking environment actions.
|
|
660
|
-
- If this is a benchmark-research, agent-improvement, model/dataset, or leaderboard question, use \`research_sources\` before synthesis with targeted kinds: GitHub \`github_kind:"all"\`, Hugging Face \`kind:"all"\`, Kaggle \`kaggle_kind:"both"\`, \`recent_days:90\`, and \`format:"json"\`.
|
|
661
|
-
- For Terminal-Bench source mining, query \`benchmark_repo_catalog\` before ad hoc web discovery.
|
|
662
|
-
- When that research identifies relevant public GitHub repos, use \`github_repo_digest\` before relying on repo-level implementation patterns.
|
|
663
|
-
|
|
664
|
-
2. Localize before editing.
|
|
665
|
-
- Map relevant files with glob/list_dir/grep.
|
|
666
|
-
- Read the current implementation and nearby tests.
|
|
667
|
-
- Follow issue-relevant dependency/call-site paths; stop traversing when the evidence no longer connects to the task.
|
|
668
|
-
- Keep a short localization dossier before the first edit.
|
|
669
|
-
- Separate task-relevant instructions from distractors before following environmental cues.
|
|
670
|
-
- Avoid broad rewrites until the fault is localized.
|
|
671
|
-
|
|
672
|
-
3. Reproduce and use memory carefully.
|
|
673
|
-
- Run the narrowest failing command, visible test, or local reproduction before patching when feasible.
|
|
674
|
-
- Search memory for related project conventions or prior fixes when relevant.
|
|
675
|
-
- Use recalled context only after validating it against current files.
|
|
676
|
-
- If the preflight has prior \`replay=\` checkpoints, replay only the relevant read/search/verifier steps as hypotheses; never copy an old patch or ignore current task files.
|
|
677
|
-
- Do not use memory as a substitute for reading the present checkout.
|
|
678
|
-
|
|
679
|
-
4. Patch minimally with checkpoint discipline.
|
|
680
|
-
- Update the todo list before the first edit and after verification milestones.
|
|
681
|
-
- Inspect git state before risky edits; preserve unrelated user changes.
|
|
682
|
-
- Change production code unless the issue is truly in tests/docs/config.
|
|
683
|
-
- Do not weaken tests, skip verifiers, hardcode benchmark answers, or special-case hidden cases.
|
|
684
|
-
- Preserve public APIs and user compatibility unless the task requires a breaking change.
|
|
685
|
-
|
|
686
|
-
5. Verify under benchmark pressure.
|
|
687
|
-
- Run the narrowest meaningful test first.
|
|
688
|
-
- For installs, model loads, training, builds, emulators, or broad test suites that can legitimately exceed the default shell timeout, call bash with \`timeoutMs\` (up to 1800000). Do not retry the exact same timed-out command without changing the timeout or strategy.
|
|
689
|
-
- For services, servers, watchers, and daemons, call bash with \`background:true\`, then inspect the returned log path before assuming readiness.
|
|
690
|
-
- Then run the broad verifier or build/test command available in the task.
|
|
691
|
-
- For roadmap/SaaS/mobile/WebDevBench/SWE-Cycle tasks, prefer at least one broad integration, platform, migration, e2e, frontend-backend, production-readiness, security, lifecycle judge, or full build/test verifier after the final edit.
|
|
692
|
-
- For AppWorld/tau2 tasks, verify the latest observation or tool response reflects the intended state; for BrowseComp+, cross-check final claims against opened source evidence.
|
|
693
|
-
- If a verifier fails, diagnose from output and iterate. Do not final-answer on unverified edits.
|
|
694
|
-
|
|
695
|
-
6. Final response.
|
|
696
|
-
- State changed files and the behavioral fix.
|
|
697
|
-
- List exact verification commands and pass/fail status.
|
|
698
|
-
- Call out unresolved risks honestly if any verifier could not be run.
|
|
699
|
-
|
|
700
|
-
## Anti-Leakage Rules
|
|
701
|
-
|
|
702
|
-
- Do not inspect gold patches, oracle solutions, hidden tests, benchmark answer keys, result JSONL from prior submissions, or upstream PR diffs unless the benchmark task explicitly permits it.
|
|
703
|
-
- For TerminalWorld-style tasks, treat \`solve.sh\` and reference-solution scripts as oracle material unless explicitly permitted.
|
|
704
|
-
- Do not claim leaderboard performance from this run unless the official harness output proves it.
|
|
633
|
+
return `# Benchmark-Grade Agent Run
|
|
634
|
+
|
|
635
|
+
Working directory: ${cwd}
|
|
636
|
+
|
|
637
|
+
Task:
|
|
638
|
+
${task}
|
|
639
|
+
|
|
640
|
+
${benchmarkProfileSection(normalizedProfile)}
|
|
641
|
+
|
|
642
|
+
${benchmarkMethodologySection()}
|
|
643
|
+
|
|
644
|
+
## Automatic Preflight Snapshot
|
|
645
|
+
|
|
646
|
+
The launcher gathered this read-only snapshot before the agent loop to save early environment-discovery turns. Treat it as orientation, not proof: re-read task-relevant files before editing.
|
|
647
|
+
|
|
648
|
+
${preflightSnapshot}
|
|
649
|
+
|
|
650
|
+
## Operating Loop
|
|
651
|
+
|
|
652
|
+
1. Establish the success oracle.
|
|
653
|
+
- Use the automatic preflight snapshot above. Call \`benchmark_context\` only if the environment changes or the snapshot is incomplete.
|
|
654
|
+
- Find the verifier, test command, hidden/visible test boundary, or expected artifact.
|
|
655
|
+
- For WildClawBench or ARC-AGI work, first identify the sub-benchmark, action/output contract, scoring signal, and public/hidden boundary before assuming this is a patch task.
|
|
656
|
+
- For TerminalWorld work, first identify instruction.md/task text, required \`/app\` artifacts, Docker/service assumptions, visible verifier/tests, and whether solve.sh/reference material is present before assuming this is a source-patch task.
|
|
657
|
+
- For SpecBench or reward-hacking work, distinguish the natural-language specification from the visible validation suite, then plan a broad/generalization check after visible tests pass.
|
|
658
|
+
- For RoadmapBench/SaaSBench/SWE-Bench Mobile/SWE-WebDevBench/SWE-Cycle/SWE-CI work, identify roadmap milestones, validation nodes, canary requirements, lifecycle phases, current/target commit boundaries, test gaps, platform/integration verifiers, production-readiness/security checks, and any version-upgrade or product-flow compatibility boundary before treating this as a local bug fix.
|
|
659
|
+
- For AppWorld/BrowseComp+/tau2 work, identify available actions, required source/policy evidence, finish action, and state-observation loop before taking environment actions.
|
|
660
|
+
- If this is a benchmark-research, agent-improvement, model/dataset, or leaderboard question, use \`research_sources\` before synthesis with targeted kinds: GitHub \`github_kind:"all"\`, Hugging Face \`kind:"all"\`, Kaggle \`kaggle_kind:"both"\`, \`recent_days:90\`, and \`format:"json"\`.
|
|
661
|
+
- For Terminal-Bench source mining, query \`benchmark_repo_catalog\` before ad hoc web discovery.
|
|
662
|
+
- When that research identifies relevant public GitHub repos, use \`github_repo_digest\` before relying on repo-level implementation patterns.
|
|
663
|
+
|
|
664
|
+
2. Localize before editing.
|
|
665
|
+
- Map relevant files with glob/list_dir/grep.
|
|
666
|
+
- Read the current implementation and nearby tests.
|
|
667
|
+
- Follow issue-relevant dependency/call-site paths; stop traversing when the evidence no longer connects to the task.
|
|
668
|
+
- Keep a short localization dossier before the first edit.
|
|
669
|
+
- Separate task-relevant instructions from distractors before following environmental cues.
|
|
670
|
+
- Avoid broad rewrites until the fault is localized.
|
|
671
|
+
|
|
672
|
+
3. Reproduce and use memory carefully.
|
|
673
|
+
- Run the narrowest failing command, visible test, or local reproduction before patching when feasible.
|
|
674
|
+
- Search memory for related project conventions or prior fixes when relevant.
|
|
675
|
+
- Use recalled context only after validating it against current files.
|
|
676
|
+
- If the preflight has prior \`replay=\` checkpoints, replay only the relevant read/search/verifier steps as hypotheses; never copy an old patch or ignore current task files.
|
|
677
|
+
- Do not use memory as a substitute for reading the present checkout.
|
|
678
|
+
|
|
679
|
+
4. Patch minimally with checkpoint discipline.
|
|
680
|
+
- Update the todo list before the first edit and after verification milestones.
|
|
681
|
+
- Inspect git state before risky edits; preserve unrelated user changes.
|
|
682
|
+
- Change production code unless the issue is truly in tests/docs/config.
|
|
683
|
+
- Do not weaken tests, skip verifiers, hardcode benchmark answers, or special-case hidden cases.
|
|
684
|
+
- Preserve public APIs and user compatibility unless the task requires a breaking change.
|
|
685
|
+
|
|
686
|
+
5. Verify under benchmark pressure.
|
|
687
|
+
- Run the narrowest meaningful test first.
|
|
688
|
+
- For installs, model loads, training, builds, emulators, or broad test suites that can legitimately exceed the default shell timeout, call bash with \`timeoutMs\` (up to 1800000). Do not retry the exact same timed-out command without changing the timeout or strategy.
|
|
689
|
+
- For services, servers, watchers, and daemons, call bash with \`background:true\`, then inspect the returned log path before assuming readiness.
|
|
690
|
+
- Then run the broad verifier or build/test command available in the task.
|
|
691
|
+
- For roadmap/SaaS/mobile/WebDevBench/SWE-Cycle tasks, prefer at least one broad integration, platform, migration, e2e, frontend-backend, production-readiness, security, lifecycle judge, or full build/test verifier after the final edit.
|
|
692
|
+
- For AppWorld/tau2 tasks, verify the latest observation or tool response reflects the intended state; for BrowseComp+, cross-check final claims against opened source evidence.
|
|
693
|
+
- If a verifier fails, diagnose from output and iterate. Do not final-answer on unverified edits.
|
|
694
|
+
|
|
695
|
+
6. Final response.
|
|
696
|
+
- State changed files and the behavioral fix.
|
|
697
|
+
- List exact verification commands and pass/fail status.
|
|
698
|
+
- Call out unresolved risks honestly if any verifier could not be run.
|
|
699
|
+
|
|
700
|
+
## Anti-Leakage Rules
|
|
701
|
+
|
|
702
|
+
- Do not inspect gold patches, oracle solutions, hidden tests, benchmark answer keys, result JSONL from prior submissions, or upstream PR diffs unless the benchmark task explicitly permits it.
|
|
703
|
+
- For TerminalWorld-style tasks, treat \`solve.sh\` and reference-solution scripts as oracle material unless explicitly permitted.
|
|
704
|
+
- Do not claim leaderboard performance from this run unless the official harness output proves it.
|
|
705
705
|
- Do not rely on remembered benchmark solutions. Treat all prior knowledge as potentially contaminated until verified locally.`;
|
|
706
706
|
}
|
|
707
707
|
// Documentation Update Prompt ----------------------------------------------
|
|
708
708
|
export function buildUpdateDocsPrompt(cwd) {
|
|
709
|
-
return `Update project documentation at ${cwd}.
|
|
710
|
-
|
|
711
|
-
**Documentation Review**:
|
|
712
|
-
|
|
713
|
-
1. **Find docs**:
|
|
714
|
-
- README.md
|
|
715
|
-
- API documentation (docs/, wiki/, etc.)
|
|
716
|
-
- CHANGELOG.md
|
|
717
|
-
- Code comments and inline docs
|
|
718
|
-
|
|
719
|
-
2. **Compare with code**:
|
|
720
|
-
- Read the current code
|
|
721
|
-
- Check if docs match implementation
|
|
722
|
-
- Identify outdated sections
|
|
723
|
-
|
|
724
|
-
3. **Update**:
|
|
725
|
-
- Fix inaccurate sections
|
|
726
|
-
- Add missing features or APIs
|
|
727
|
-
- Update examples with current code
|
|
728
|
-
- Improve unclear explanations
|
|
729
|
-
|
|
730
|
-
4. **Add**:
|
|
731
|
-
- Document new features not yet documented
|
|
732
|
-
- Add examples for complex features
|
|
733
|
-
- Document configuration options
|
|
734
|
-
- Add troubleshooting sections if missing
|
|
735
|
-
|
|
709
|
+
return `Update project documentation at ${cwd}.
|
|
710
|
+
|
|
711
|
+
**Documentation Review**:
|
|
712
|
+
|
|
713
|
+
1. **Find docs**:
|
|
714
|
+
- README.md
|
|
715
|
+
- API documentation (docs/, wiki/, etc.)
|
|
716
|
+
- CHANGELOG.md
|
|
717
|
+
- Code comments and inline docs
|
|
718
|
+
|
|
719
|
+
2. **Compare with code**:
|
|
720
|
+
- Read the current code
|
|
721
|
+
- Check if docs match implementation
|
|
722
|
+
- Identify outdated sections
|
|
723
|
+
|
|
724
|
+
3. **Update**:
|
|
725
|
+
- Fix inaccurate sections
|
|
726
|
+
- Add missing features or APIs
|
|
727
|
+
- Update examples with current code
|
|
728
|
+
- Improve unclear explanations
|
|
729
|
+
|
|
730
|
+
4. **Add**:
|
|
731
|
+
- Document new features not yet documented
|
|
732
|
+
- Add examples for complex features
|
|
733
|
+
- Document configuration options
|
|
734
|
+
- Add troubleshooting sections if missing
|
|
735
|
+
|
|
736
736
|
Be thorough and ensure all public APIs and features are documented.`;
|
|
737
737
|
}
|
|
738
738
|
// ── Harness Audit ─────────────────────────────────────────
|