swegen 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- swegen/__init__.py +14 -0
- swegen/analyze/__init__.py +24 -0
- swegen/analyze/classifier.py +637 -0
- swegen/analyze/classify_prompt.txt +241 -0
- swegen/analyze/models.py +253 -0
- swegen/analyze/run.py +656 -0
- swegen/analyze/verdict_prompt.txt +126 -0
- swegen/cli.py +411 -0
- swegen/config.py +142 -0
- swegen/create/__init__.py +22 -0
- swegen/create/claude_code_runner.py +988 -0
- swegen/create/claude_code_utils.py +95 -0
- swegen/create/create.py +706 -0
- swegen/create/diff_utils.py +142 -0
- swegen/create/orchestrator.py +368 -0
- swegen/create/pr_fetcher.py +187 -0
- swegen/create/repo_cache.py +175 -0
- swegen/create/task_instruction.py +363 -0
- swegen/create/task_reference.py +130 -0
- swegen/create/task_skeleton.py +266 -0
- swegen/create/utils.py +350 -0
- swegen/farm/__init__.py +13 -0
- swegen/farm/farm_hand.py +342 -0
- swegen/farm/fetcher.py +341 -0
- swegen/farm/state.py +231 -0
- swegen/farm/stream_farm.py +430 -0
- swegen/tools/__init__.py +16 -0
- swegen/tools/harbor_runner.py +191 -0
- swegen/tools/validate.py +523 -0
- swegen/tools/validate_utils.py +142 -0
- swegen-0.1.0.dist-info/METADATA +292 -0
- swegen-0.1.0.dist-info/RECORD +35 -0
- swegen-0.1.0.dist-info/WHEEL +4 -0
- swegen-0.1.0.dist-info/entry_points.txt +3 -0
- swegen-0.1.0.dist-info/licenses/LICENSE +201 -0
|
@@ -0,0 +1,988 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import logging
|
|
5
|
+
import os
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
from claude_agent_sdk import (
|
|
10
|
+
AssistantMessage,
|
|
11
|
+
ClaudeAgentOptions,
|
|
12
|
+
HookMatcher,
|
|
13
|
+
TextBlock,
|
|
14
|
+
query,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
from swegen.create.claude_code_utils import Colors, print_sdk_message
|
|
18
|
+
from swegen.tools.harbor_runner import parse_harbor_outcome
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass
class ClaudeCodeResult:
    """Result of the CC session.

    Summarizes a Claude Code run that fills in a Harbor task skeleton and
    validates it with harbor NOP/Oracle runs.
    """

    # Overall success flag for the session; exact criteria are set by the
    # caller that constructs this result — presumably both checks below
    # passed (confirm at the call site).
    success: bool
    nop_passed: bool  # reward=0 (tests fail on buggy code)
    oracle_passed: bool  # reward=1 (tests pass after fix)
    # Human-readable reason when the session failed, if any.
    error_message: str | None = None
    # Captured Claude Code output/transcript, when available.
    cc_output: str | None = None
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# Prompt sent to Claude Code when a previously-validated reference task from
# the same repo exists. Instead of analyzing the repository from scratch, CC
# fills in the skeleton's TODOs by mirroring the reference task's working
# Dockerfile/test.sh, then runs harbor NOP/Oracle validation. Placeholders
# such as {repo}, {task_dir}, {reference_task_id} are substituted later by
# the caller.
CC_REFERENCE_PROMPT = """
## Your Task: Fill In Skeleton Using Reference Task as Example

**GREAT NEWS**: We have a working task from PR #{reference_pr} (task: `{reference_task_id}`)!

Your job is MUCH SIMPLER than usual:
1. **Look at the reference task** to see what was added (runtime, packages, env vars, build steps, test command)
2. **Fill in your skeleton's TODOs** with the same things
3. **Update test file paths** to match this PR
4. **Run harbor validation** to confirm it works

## Context

**Repository**: {repo} (cloned at `{repo_path}`)
**Current PR**: #{pr_number}
**Reference Task**: `{reference_task_id}` (from PR #{reference_pr}, tested and validated)
**Current Task Directory**: `{task_dir}` ← Your skeleton (CORRECT hashes already!)
**Reference Task Directory**: `{reference_task_dir}` ← Working example to learn from
**Dataset Path**: `{dataset_path}`

## Test Files for This PR

{test_files_list}

## What's Already Done

✓ Skeleton Dockerfile with CORRECT git SHAs ({head_sha}) and basic structure
✓ Skeleton test.sh with TODO for test command
✓ bug.patch and fix.patch are ready
✓ instruction.md and task.toml are ready
✓ Reference task has working Dockerfile and test.sh as examples

## IMPORTANT: Your Skeleton Already Has Correct Hashes!

**DO NOT copy files from reference and replace hashes** - that's error-prone!

Instead:
1. Read `{task_dir}/environment/Dockerfile` - it has TODO comments
2. Read `{reference_task_dir}/environment/Dockerfile` - see what was filled in
3. Add the same things to YOUR skeleton's TODO sections

The skeleton already has:
✓ Correct git clone URL
✓ Correct HEAD SHA ({head_sha})
✓ Basic apt packages (git, curl, patch, build-essential)
✓ Correct bug.patch application

## Your Process

### Step 1: Compare Reference Dockerfile to Your Skeleton

Read both files:
```bash
# Your skeleton (has TODO comments to fill in)
cat {task_dir}/environment/Dockerfile

# Reference (shows what was filled in for a similar PR)
cat {reference_task_dir}/environment/Dockerfile
```

Look for what the reference added beyond the basic skeleton:
- Language runtime installation (Python, Node.js, Go, Rust, Ruby, Java, etc.)
- Additional system packages (python3-dev, libssl-dev, etc.)
- Package manager setup
- Environment variables (CI=true, NODE_ENV=test, etc.)
- Dependency installation commands
- Build steps
- Post-patch rebuild steps

### Step 2: Fill In Your Skeleton's TODOs

**CRITICAL: Always use Ubuntu base image**
- The skeleton Dockerfile starts with `FROM ubuntu:24.04` - **DO NOT change this**
- **NEVER** use language-specific base images (node:XX, python:XX, golang:XX)
- Install language runtimes via apt-get or official installers

Add the same things from the reference to your skeleton. For example:

**If reference has:**
```dockerfile
# Install Python
RUN apt-get update && apt-get install -y \\
    python3 python3-pip python3-venv python3-dev \\
    && rm -rf /var/lib/apt/lists/*
```

**Then replace your TODO:**
```dockerfile
# TODO: Install language runtime
```

**With the same installation commands.**

**DO NOT just copy the entire reference file** - the git SHAs would be wrong!
**DO fill in the TODOs** using the reference as a guide.

### Step 3: Fill In test.sh Test Command

Read both test files:
```bash
# Your skeleton (has TODO for test command)
cat {task_dir}/tests/test.sh

# Reference (shows what test command worked)
cat {reference_task_dir}/tests/test.sh
```

**CRITICAL**: Update the test command to run ONLY the test files for THIS PR!

**Current test files for THIS PR**:
{test_files_list}

The reference test.sh will show you the test runner pattern.
**Copy the pattern but update the file paths** to match this PR's test files.

**DO NOT use**:
- `npm test`, `pytest`, `go test ./...` without specific paths ❌ (runs entire suite)
- Any command without specific file paths ❌

Replace the TODO placeholder with the actual test command running THIS PR's test files.

### Step 4: Run Harbor Validation

For each validation attempt, increment the run number (-1, -2, -3, etc.):

```bash
# Test NOP - should get reward=0
harbor run --agent nop -p {dataset_path} -t {task_id} --jobs-dir {jobs_dir}/{task_id}-nop-1 --no-delete --env {environment}

# Test Oracle - should get reward=1
harbor run --agent oracle -p {dataset_path} -t {task_id} --jobs-dir {jobs_dir}/{task_id}-oracle-1 --env {environment}
```

If you need to re-run after fixing issues, increment the number:
- First NOP attempt: `{task_id}-nop-1`, second: `{task_id}-nop-2`, etc.
- First Oracle attempt: `{task_id}-oracle-1`, second: `{task_id}-oracle-2`, etc.

### Step 5: Fix Issues (if validation fails)

If harbor fails, check:
1. **Test file paths** - Most common issue (make sure you updated them for THIS PR)
2. **Missing build step** - Did you copy the build steps from reference?
3. **Missing packages** - Did you copy the system packages from reference?
4. **Post-patch rebuild** - For compiled languages, you MUST rebuild after applying bug.patch

### Step 6: Final Cleanup

**Once both NOP (reward=0) and Oracle (reward=1) pass**, clean up your files:

1. **Remove ALL TODO comments** from Dockerfile and test.sh
2. **Remove ALL template/example comments** that are no longer relevant
3. **Keep only meaningful comments** that explain non-obvious steps

**Files to clean:**
- `{task_dir}/environment/Dockerfile` - Remove TODOs, keep comments explaining non-standard steps
- `{task_dir}/tests/test.sh` - Remove TODOs and example templates, keep test-specific comments

## Tips

- **Your skeleton is the source of truth** - it has correct hashes
- **Reference is just an example** - shows you what to fill in
- **Don't copy entire files** - just the extra pieces (runtime, packages, env vars, build steps)
- **Update test paths** - most PRs touch different test files

You're done when both NOP (reward=0) and Oracle (reward=1) pass AND files are cleaned up!
"""
|
|
199
|
+
|
|
200
|
+
# Prompt sent to Claude Code when no reference task is available: CC must
# analyze the repository from scratch (language, build system, test
# framework, dependencies), fill in the skeleton Dockerfile/test.sh, then
# iterate until harbor NOP/Oracle validation passes. Placeholders such as
# {repo}, {task_dir}, {test_files_list} are substituted later by the caller;
# literal braces in the body are doubled ({{ }}) so they survive the
# substitution (presumably str.format — the call site is elsewhere).
CC_PROMPT = """
## Your Task: Make This Harbor Task Work

You have a skeleton Harbor task that needs to be completed. Your job is to:
1. **Analyze the repository** to detect language, build system, test framework, dependencies
2. **Fill in the TODO sections** in Dockerfile and test.sh
3. **Run harbor validation** and iterate until it passes

## Context

**Repository**: {repo} (cloned at `{repo_path}`)
**PR**: #{pr_number}
**Task Directory**: `{task_dir}`
**Dataset Path**: `{dataset_path}`

The repo is already cloned locally. You can browse it, read files, and run commands.

## Skeleton Files to Complete

The skeleton files have been generated with the deterministic parts filled in:
- Git clone commands with correct SHAs ✓
- Basic apt packages (git, curl, ca-certificates, patch, build-essential) ✓
- bug.patch/fix.patch ✓

**You need to fill in the TODOs:**

### `{task_dir}/environment/Dockerfile`
- **Language runtime**: Detect and install (Python, Node.js, Go, Rust, Ruby, Java, etc.)
- **System packages**: Additional packages needed (dev headers, native dependencies)
- **Package manager**: Set up if needed (pip, npm, cargo, bundler, etc.)
- **Environment variables**: CI=true, etc.
- **Dependencies**: Install project dependencies
- **Build step**: If needed (TypeScript, Rust, Go, Java, etc.)
- **Rebuild after bug.patch**: Required for compiled languages

### `{task_dir}/tests/test.sh`
- **Environment variables**: For test runner
- **Test command**: The actual command to run the specific test files

## Step 1: Deep Repository Analysis

Before filling anything in, thoroughly analyze the repository to detect the language and setup:

### 1.1 Detect Language and Runtime

Check for language indicators:
```bash
# List files to detect language
ls -la {repo_path}

# Check for language-specific files
cat {repo_path}/package.json 2>/dev/null # Node.js/JavaScript/TypeScript
cat {repo_path}/pyproject.toml 2>/dev/null # Python (modern)
cat {repo_path}/setup.py 2>/dev/null # Python (legacy)
cat {repo_path}/requirements.txt 2>/dev/null # Python
cat {repo_path}/go.mod 2>/dev/null # Go
cat {repo_path}/Cargo.toml 2>/dev/null # Rust
cat {repo_path}/Gemfile 2>/dev/null # Ruby
cat {repo_path}/pom.xml 2>/dev/null # Java (Maven)
cat {repo_path}/build.gradle 2>/dev/null # Java/Kotlin (Gradle)
```

### 1.2 Check for Version Files
```bash
# Language version specifications
cat {repo_path}/.nvmrc 2>/dev/null # Node.js
cat {repo_path}/.node-version 2>/dev/null # Node.js
cat {repo_path}/.python-version 2>/dev/null # Python (pyenv)
cat {repo_path}/.ruby-version 2>/dev/null # Ruby
cat {repo_path}/rust-toolchain.toml 2>/dev/null # Rust
cat {repo_path}/.tool-versions 2>/dev/null # asdf (multiple languages)
```

### 1.3 Check CI Configuration (GOLD MINE for setup hints!)
```bash
cat {repo_path}/.github/workflows/*.yml 2>/dev/null | head -300
```
CI configs often reveal:
- Exact language version and runtime setup
- Required system packages
- Environment variables
- Pre/post-install steps
- How tests are actually run

### 1.4 Check Test Configuration
Look for test framework configs:
```bash
# JavaScript/TypeScript
ls -la {repo_path}/*.config.* {repo_path}/jest.config.* {repo_path}/vitest.config.* 2>/dev/null

# Python
cat {repo_path}/pytest.ini 2>/dev/null
cat {repo_path}/pyproject.toml 2>/dev/null | grep -A20 "tool.pytest"
cat {repo_path}/setup.cfg 2>/dev/null | grep -A10 "tool:pytest"

# Go - tests are built into the language
# Rust - tests are built into the language
# Ruby
cat {repo_path}/.rspec 2>/dev/null
```

### 1.5 Analyze the Test Files
Read the test files from `{task_dir}/tests/` to understand:
- What test framework they use (look at imports)
- Any special setup requirements
- Test file naming conventions

## Test Files from PR

**CRITICAL**: You MUST run ONLY these specific test files, NOT the entire test suite!

These test files have been extracted to `{task_dir}/tests/`:
{test_files_list}

In test.sh, these get copied from `/tests/` into the container before running.

**Your test command MUST run ONLY these files.** Examples by language:

### Python
```bash
pytest -xvs path/to/test_file.py
python -m pytest path/to/test_file.py path/to/test_other.py
```

### JavaScript/TypeScript (TRICKY - read carefully!)

**Common test frameworks and their commands:**
```bash
# Jest (most common)
npx jest test/foo.test.js test/bar.test.js --coverage=false

# Vitest (Vite projects)
npx vitest run test/foo.test.ts --coverage.enabled=false

# Mocha
npx mocha test/foo.test.js test/bar.test.js

# TAP / borp (used by fastify, pino, undici, etc.)
npx borp test/foo.test.js --no-check-coverage
npx tap test/foo.test.js --no-check-coverage

# AVA
npx ava test/foo.test.js

# Node.js native test runner (node:test)
node --test test/foo.test.js
```

**CRITICAL JS/TS GOTCHAS:**
1. **NEVER run `npm test` or `npm run test` without file args** - runs entire suite!
2. **Disable coverage thresholds** - running a subset fails coverage checks:
   - Jest: `--coverage=false`
   - Vitest: `--coverage.enabled=false`
   - TAP/borp: `--no-check-coverage`
3. **TypeScript projects need build step** before AND after applying bug.patch
4. **Check for Deno/Bun-specific tests** - skip if using `Deno.test()` or `bun:test`
5. **Some repos use fixture discovery** (like webpack) - run the discovery test, not fixtures

## JS/TS Test File Compatibility Check (CRITICAL!)

**Not all test files may be compatible with Node.js!** Check test files for:

**Node.js / Jest / Vitest / Mocha tests** (COMPATIBLE):
- Standard ES imports/requires
- Framework-specific APIs: `describe`, `it`, `test`, `expect`

**Deno tests** (INCOMPATIBLE with Node.js - SKIP these):
- `Deno.test()`
- `import {{ ... }} from "https://deno.land/..."`
- `.ts` extensions in imports without bundler

**Bun tests** (INCOMPATIBLE with Node.js - SKIP these):
- `Bun.test()`
- `import {{ ... }} from "bun:test"`

If you find incompatible test files, **remove them from test.sh** - don't try to run them!

## JS/TS package.json Analysis

When analyzing a Node.js project, check package.json carefully:
```bash
cat {repo_path}/package.json
```

Look for:
- `engines.node` - Required Node version
- `scripts.test` - What runs tests? (but don't use it directly!)
- `scripts.build` - Build command for TypeScript?
- `dependencies` / `devDependencies`:
  - Test frameworks: jest, vitest, mocha, ava, tap, borp
  - Native modules needing node-gyp: @parcel/watcher, fsevents, better-sqlite3, etc.

## JS/TS Test Configuration Files

Check for coverage thresholds that will fail when running a subset:
```bash
ls -la {repo_path}/*.config.* {repo_path}/.* 2>/dev/null | grep -E "(jest|vitest|mocha|tap|nyc)"
cat {repo_path}/jest.config.* 2>/dev/null | grep -i coverage
cat {repo_path}/.taprc 2>/dev/null
cat {repo_path}/.nycrc* 2>/dev/null
```

If you see coverage thresholds, you MUST disable them:
- TAP/borp: `--no-check-coverage`
- Jest: `--coverage=false`
- Vitest: `--coverage.enabled=false`

### Go
```bash
go test -v ./path/to/package/...
go test -v -run TestSpecificName ./...
```

### Rust
```bash
cargo test --test test_name -- --nocapture
cargo test specific_test_name -- --nocapture
```

### Ruby
```bash
bundle exec rspec spec/path/to/spec.rb
bundle exec ruby -Itest test/path/to/test.rb
```

### Java
```bash
mvn test -Dtest=TestClassName
gradle test --tests TestClassName
```

**DO NOT run the entire test suite** - it's too slow and may have unrelated failures!

## Step 2: Fill In the Skeleton Files

Based on your analysis, edit the Dockerfile and test.sh.

### Dockerfile Guidelines

**CRITICAL: Always use Ubuntu base image**
- The skeleton starts with `FROM ubuntu:24.04` - **DO NOT change this**
- **NEVER** use language-specific base images (node:XX, python:XX, golang:XX)
- Install language runtimes via apt-get or official installers

**Language Runtime Installation Examples:**

**Python (PREFER uv for speed):**
```dockerfile
# Install Python and uv (much faster than pip)
RUN apt-get update && apt-get install -y \\
    python3 python3-pip python3-venv python3-dev \\
    && rm -rf /var/lib/apt/lists/*

# Install uv for fast package management
RUN curl -LsSf https://astral.sh/uv/install.sh | sh && \\
    mv /root/.local/bin/uv /usr/local/bin/uv
```

**Node.js (check .nvmrc or package.json engines for version!):**
```dockerfile
# Check .nvmrc, .node-version, or package.json "engines.node" for required version
# Default to Node 20 if not specified
RUN curl -fsSL https://deb.nodesource.com/setup_20.x | bash - && \\
    apt-get install -y nodejs && \\
    rm -rf /var/lib/apt/lists/*

# Package manager setup - detect from lock file:
# pnpm-lock.yaml → pnpm
# yarn.lock → yarn
# bun.lockb → bun
# package-lock.json or none → npm

# For pnpm:
RUN corepack enable && corepack prepare pnpm@latest --activate

# For yarn (classic or berry):
RUN corepack enable

# For bun:
RUN curl -fsSL https://bun.sh/install | bash && ln -s /root/.bun/bin/bun /usr/local/bin/bun

# npm is included with Node.js (no extra setup needed)
```

**Node.js native dependencies (node-gyp):**
```dockerfile
# Many npm packages need native compilation (node-gyp)
# Add these if you see gyp errors during npm install:
RUN apt-get update && apt-get install -y \\
    python3 make g++ \\
    && rm -rf /var/lib/apt/lists/*
```

**Go:**
```dockerfile
RUN curl -fsSL https://go.dev/dl/go1.22.0.linux-amd64.tar.gz | tar -C /usr/local -xzf - && \\
    ln -s /usr/local/go/bin/go /usr/local/bin/go
```

**Rust:**
```dockerfile
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
ENV PATH="/root/.cargo/bin:${{PATH}}"
```

**Ruby:**
```dockerfile
RUN apt-get update && apt-get install -y ruby ruby-dev && \\
    rm -rf /var/lib/apt/lists/*
RUN gem install bundler
```

**Java:**
```dockerfile
RUN apt-get update && apt-get install -y openjdk-17-jdk maven && \\
    rm -rf /var/lib/apt/lists/*
```

**Dependency Installation Examples:**

- **Python (PREFER uv):**
  ```dockerfile
  # Create venv and install with uv (10-100x faster than pip)
  RUN uv venv /opt/venv && \\
      uv pip install --python /opt/venv/bin/python -e ".[dev,test]"
  # Or for requirements.txt:
  # RUN uv pip install --python /opt/venv/bin/python -r requirements.txt
  ENV PATH="/opt/venv/bin:${{PATH}}"
  ```
- **Node.js (use frozen lockfile!):**
  - npm: `npm ci` (NOT `npm install`)
  - yarn: `yarn install --frozen-lockfile`
  - pnpm: `pnpm install --frozen-lockfile`
  - bun: `bun install`
- **Go:** `go mod download`
- **Rust:** `cargo fetch`
- **Ruby:** `bundle install`
- **Java:** `mvn dependency:resolve`

**Build Steps (for compiled languages):**

After installing dependencies AND after applying bug.patch, you may need to build:
- **TypeScript:** `npm run build` or `tsc` or `yarn build` or `pnpm build`
- **Go:** `go build ./...`
- **Rust:** `cargo build`
- **Java:** `mvn compile` or `gradle build`

**CRITICAL**: For compiled languages, you MUST rebuild AFTER applying bug.patch!

**TypeScript Projects - IMPORTANT:**
```dockerfile
# After npm install - build the project
RUN npm run build
# Or if no build script: RUN npx tsc

# Apply bug.patch
COPY bug.patch /tmp/bug.patch
RUN patch -p1 < /tmp/bug.patch && rm /tmp/bug.patch

# MUST rebuild after patching TypeScript source!
RUN npm run build
```

Check for TypeScript by looking for:
- `tsconfig.json` in repo root
- `.ts` or `.tsx` files in src/
- `typescript` in devDependencies
- `build` or `compile` scripts in package.json

### test.sh Guidelines

**CRITICAL**: Run ONLY the specific test files, NOT the entire test suite!

The test files you MUST run are:
{test_files_list}

Replace the TODO placeholder with the actual test command.

**Test command patterns (run MULTIPLE files by passing all paths):**

```bash
# Python (pytest) - with multiple files
pytest -xvs path/to/test_file.py path/to/test_other.py

# Jest - run specific files (can pass multiple files)
npx jest path/to/test1.js path/to/test2.js --coverage=false

# Vitest - run specific files (can pass multiple files)
npx vitest run path/to/test1.ts path/to/test2.ts --coverage.enabled=false

# TAP / borp - run specific files (disable coverage threshold)
# IMPORTANT: Pass the test file paths directly to the test runner, NOT through npm test
npx borp path/to/test1.js path/to/test2.js --no-check-coverage # For borp (used by fastify, pino, etc.)
npx tap path/to/test1.js path/to/test2.js --no-check-coverage # For standard tap

# Mocha - run specific files (can pass multiple files)
npx mocha path/to/test1.js path/to/test2.js

# If you must use npm/pnpm/yarn, use `--` separator and pass file paths:
npm run test -- path/to/test1.js path/to/test2.js
pnpm test -- path/to/test1.js path/to/test2.js
```

**Example with multiple test files:**
If you have test files: `test/foo.test.js`, `test/bar.test.js`, `tests/subdir/baz.test.js`
Run: `npx jest test/foo.test.js test/bar.test.js tests/subdir/baz.test.js --coverage=false`

**CRITICAL WARNING**: Running `npm test` or `npm run test` without file arguments runs the ENTIRE test suite!
This wastes time (100+ seconds), may hit timeouts, and is WRONG for this task.
You MUST pass the specific test file paths as arguments to run ONLY the tests from this PR.

**Discovery-based tests** (like webpack):
Some repos use a test runner that discovers fixtures, not direct test files.
In this case, run the discovery test file, not the individual fixtures.

## Harbor Validation Commands

For each validation attempt, increment the run number (-1, -2, -3, etc.):

```bash
# Test NOP - should get reward=0 (tests FAIL on buggy code)
harbor run --agent nop -p {dataset_path} -t {task_id} --jobs-dir {jobs_dir}/{task_id}-nop-1 --no-delete --env {environment}

# Test Oracle - should get reward=1 (tests PASS after applying fix)
harbor run --agent oracle -p {dataset_path} -t {task_id} --jobs-dir {jobs_dir}/{task_id}-oracle-1 --env {environment}
```

If you need to re-run after fixing issues, increment the number:
- First NOP attempt: `{task_id}-nop-1`, second: `{task_id}-nop-2`, etc.
- First Oracle attempt: `{task_id}-oracle-1`, second: `{task_id}-oracle-2`, etc.

## Success Criteria

You're done when BOTH pass:
- **NOP**: reward=0 (tests fail because bug.patch reverted the fix)
- **Oracle**: reward=1 (tests pass after solve.sh applies the fix)

## Finding Logs

After harbor runs, check `{jobs_dir}`:
- `{jobs_dir}/{task_id}-nop-N/<timestamp>/result.json` - NOP job result (N = run number)
- `{jobs_dir}/{task_id}-oracle-N/<timestamp>/result.json` - Oracle job result

Inside each job directory:
- `result.json` - Overall result with reward
- `verifier_stdout.txt` - Test output
- `verifier_stderr.txt` - Test errors

## Common Issues & Fixes

### Docker build fails
- **Missing language runtime** → Add installation commands
- **Missing system packages** → Check CI config, add to apt-get
- **Version mismatch** → Check version files (.nvmrc, .python-version, etc.)
- **Node.js: node-gyp errors** → Add `python3 make g++` to apt-get
- **Node.js: wrong version** → Check .nvmrc or package.json engines field

### Tests fail unexpectedly
- **Missing build step** → Check if compiled language needs build
- **Wrong test command** → Check how tests are run in CI config
- **Missing env vars** → Check CI config for env setup
- **Coverage threshold fails** → Add --no-check-coverage or similar flag

### JS/TS Specific Issues
- **"npm test" runs too many tests** → Use `npx <runner>` with specific files instead
- **Coverage threshold fails** → Add `--coverage=false` (Jest) or `--no-check-coverage` (TAP)
- **TypeScript compilation errors** → Check for missing build step
- **"Cannot find module"** → May need to run build before tests
- **Tests pass but shouldn't** → Check if tests are actually being run (look at output)
- **Deno/Bun tests incompatible** → Skip tests with `Deno.test()` or `bun:test` imports

### NOP gets reward=1 (should be 0)
- Tests don't actually test the bug
- Wrong test files being run
- Tests are skipped or not executed (check test output!)

### Oracle gets reward=0 (should be 1)
- fix.patch doesn't apply cleanly
- **TypeScript: MUST rebuild after patching** (most common JS/TS issue!)
- Missing post-patch setup steps

## Your Approach

1. **Read the skeleton files** first
2. **Detect language** from repo files (package.json, go.mod, Cargo.toml, etc.)
3. **Deep-analyze the repo** (package.json, CI config, test configs, version files)
4. **Check test file compatibility** (JS/TS: filter out Deno/Bun tests!)
5. **Fill in Dockerfile and test.sh**
6. **Run NOP** and iterate until reward=0
7. **Run Oracle** and iterate until reward=1
8. **Clean up files** - Remove ALL TODO comments and template examples
9. Done when both pass AND files are cleaned up!

## Final Cleanup

**Once both NOP (reward=0) and Oracle (reward=1) pass**, you MUST clean up the files:

1. **Remove ALL TODO comments** from Dockerfile and test.sh
2. **Remove ALL template/example comments** (e.g., "Examples: CI=true, NODE_ENV=test...")
3. **Remove large comment blocks** listing framework examples that aren't relevant
4. **Keep only meaningful comments** that explain non-obvious steps specific to this task

**Files to clean:**
- `{task_dir}/environment/Dockerfile` - Remove TODOs, keep comments explaining non-standard steps
- `{task_dir}/tests/test.sh` - Remove TODOs and all example templates, keep only test-specific comments
"""
|
|
707
|
+
|
|
708
|
+
|
|
709
|
+
def run_claude_code_session(
    repo: str,
    pr_number: int,
    repo_path: Path,
    task_dir: Path,
    task_id: str,
    dataset_path: Path,
    test_files: list[str],
    timeout: int = 900,  # 15 minutes
    verbose: bool = False,
    reference_task_id: str | None = None,
    reference_pr: int | None = None,
    head_sha: str | None = None,
    environment: str = "docker",
) -> ClaudeCodeResult:
    """
    Run Claude Code session to complete skeleton and make harbor pass.

    Synchronous wrapper: forwards every argument unchanged to
    ``_run_claude_code_session_async`` and drives it with ``asyncio.run``.

    Args:
        repo: Repository in "owner/repo" format
        pr_number: PR number
        repo_path: Path to local repo clone
        task_dir: Path to the task directory
        task_id: Task identifier
        dataset_path: Path to Harbor dataset root
        test_files: List of test file paths
        timeout: Maximum time for session (seconds)
        verbose: If True, stream output to console
        reference_task_id: If provided, task_id to copy Dockerfile/test.sh from
        reference_pr: If provided, PR number of the reference task
        head_sha: If provided, new HEAD SHA to use in Dockerfile
        environment: Environment type for Harbor runs (docker, daytona, etc.)

    Returns:
        ClaudeCodeResult with success status.
        (Docstring previously said "MakeItWorkResult", which does not match
        the annotated return type.)
    """
    # Run the async implementation in a sync context.
    return asyncio.run(
        _run_claude_code_session_async(
            repo=repo,
            pr_number=pr_number,
            repo_path=repo_path,
            task_dir=task_dir,
            task_id=task_id,
            dataset_path=dataset_path,
            test_files=test_files,
            timeout=timeout,
            verbose=verbose,
            reference_task_id=reference_task_id,
            reference_pr=reference_pr,
            head_sha=head_sha,
            environment=environment,
        )
    )
|
|
763
|
+
|
|
764
|
+
|
|
765
|
+
async def _run_claude_code_session_async(
    repo: str,
    pr_number: int,
    repo_path: Path,
    task_dir: Path,
    task_id: str,
    dataset_path: Path,
    test_files: list[str],
    timeout: int = 900,
    verbose: bool = False,
    reference_task_id: str | None = None,
    reference_pr: int | None = None,
    head_sha: str | None = None,
    environment: str = "docker",
) -> ClaudeCodeResult:
    """Async implementation of Claude Code session.

    Builds a prompt (either the reference-copy variant or the full
    from-skeleton variant), streams the Claude Agent SDK session under a
    timeout, and finally derives success from the harbor job results on
    disk rather than from the model's own text output.

    Returns:
        ClaudeCodeResult reflecting the most recent NOP/Oracle harbor runs
        found in the jobs directory.
    """
    logger = logging.getLogger("swegen")
    logger.info("Starting Claude Code session for: %s", task_id)

    # Resolve all paths to absolute paths for reliable usage
    # (the prompt embeds them verbatim, so relative paths would break
    # once the agent cd's around).
    dataset_path = Path(dataset_path).resolve()
    task_dir = Path(task_dir).resolve()
    repo_path = Path(repo_path).resolve()

    # Jobs directory for harbor output; created up front so the later
    # result scan (_check_validation_state) always has a directory to read.
    jobs_dir = dataset_path.parent / ".state" / "harbor-jobs"
    jobs_dir.mkdir(parents=True, exist_ok=True)
    jobs_dir = jobs_dir.resolve()

    # Format test files list for interpolation into the prompt template.
    if test_files:
        test_files_list = "\n".join(f" - {tf}" for tf in test_files)
    else:
        test_files_list = " (none)"

    # Choose prompt based on whether we're using a reference task.
    # Both reference_task_id AND reference_pr must be set to take the
    # cheaper "copy from reference" path.
    if reference_task_id and reference_pr:
        reference_task_dir = (dataset_path / reference_task_id).resolve()
        prompt_text = CC_REFERENCE_PROMPT.format(
            repo=repo,
            pr_number=pr_number,
            reference_pr=reference_pr,
            reference_task_id=reference_task_id,
            reference_task_dir=reference_task_dir,
            repo_path=repo_path,
            task_dir=task_dir,
            task_id=task_id,
            dataset_path=dataset_path,
            jobs_dir=jobs_dir,
            test_files_list=test_files_list,
            # Fallback string is shown to the agent when no SHA was supplied.
            head_sha=head_sha or "(check metadata)",
            environment=environment,
        )
        logger.info(
            f"Using reference prompt (copying from {reference_task_id}, PR #{reference_pr})"
        )
    else:
        prompt_text = CC_PROMPT.format(
            repo=repo,
            pr_number=pr_number,
            repo_path=repo_path,
            task_dir=task_dir,
            task_id=task_id,
            dataset_path=dataset_path,
            jobs_dir=jobs_dir,
            test_files_list=test_files_list,
            environment=environment,
        )
        logger.info("Using full prompt (generating from skeleton)")

    # Create hook for logging Harbor validation attempts.
    # Collected commands are kept for debugging; the hook is only wired
    # into the SDK when verbose is set (see options below).
    harbor_runs: list[str] = []

    async def log_harbor_runs(input_data: dict, tool_use_id: str, context: dict) -> dict:
        """Log Harbor validation attempts for debugging."""
        # Shape of input_data per the SDK's PreToolUse hook payload —
        # assumes tool_input.command holds the Bash command; TODO confirm
        # against the claude_agent_sdk hook docs.
        command = input_data.get("tool_input", {}).get("command", "")
        if "harbor run" in command:
            harbor_runs.append(command)
            if verbose:
                print(f"{Colors.YELLOW}[Harbor]{Colors.RESET} {command}", flush=True)
        # Empty dict means "no modification / allow" to the SDK.
        return {}

    try:
        logger.info("Invoking Claude Code SDK with %ds timeout...", timeout)

        if verbose:
            project_root = os.getcwd()
            print("[SDK] Running Claude Code Agent SDK", flush=True)
            print(f"[SDK] Working directory: {project_root}", flush=True)
            print(f"[SDK] Repo path: {repo_path}", flush=True)
            print(f"[SDK] Task dir: {task_dir}", flush=True)
            print("-" * 60, flush=True)

        # Configure SDK options
        options = ClaudeAgentOptions(
            allowed_tools=["Read", "Write", "Edit", "Glob", "Grep", "LS", "Bash"],
            permission_mode="bypassPermissions",  # Auto-approve actions
            cwd=os.getcwd(),  # Run from project root
            model="sonnet",  # Use Sonnet model
            # Hook only attached in verbose mode — harbor_runs stays empty
            # otherwise.
            hooks={
                "PreToolUse": [HookMatcher(matcher="Bash", hooks=[log_harbor_runs])]
            } if verbose else {},
        )

        # Run with timeout
        try:
            async with asyncio.timeout(timeout):
                response_parts = []

                if verbose:
                    # Stream messages with real-time display
                    async for message in query(prompt=prompt_text, options=options):
                        print_sdk_message(message)

                        # Collect text for final result
                        if isinstance(message, AssistantMessage):
                            for block in message.content:
                                if isinstance(block, TextBlock):
                                    response_parts.append(block.text)
                else:
                    # Collect messages without printing
                    async for message in query(prompt=prompt_text, options=options):
                        if isinstance(message, AssistantMessage):
                            for block in message.content:
                                if isinstance(block, TextBlock):
                                    response_parts.append(block.text)

        except TimeoutError:
            # asyncio.timeout raises TimeoutError on expiry (3.11+).
            # Even on timeout we inspect job results — the agent may have
            # already produced passing NOP/Oracle runs before being cut off.
            logger.warning("Claude Code session timed out after %ds", timeout)
            if verbose:
                print(f"\n[SDK] Timed out after {timeout}s", flush=True)
            return _check_validation_state(jobs_dir, task_id, logger, timed_out=True)

        if verbose:
            print("-" * 60, flush=True)
            print("[SDK] Session complete", flush=True)

        # Check final state from job files — on-disk results are the source
        # of truth, not the collected response text.
        return _check_validation_state(jobs_dir, task_id, logger)

    except Exception as e:
        # Broad catch at this boundary: any SDK failure is reported as a
        # failed result instead of propagating to the caller.
        logger.error("Claude Code session failed: %s", e)
        return ClaudeCodeResult(
            success=False,
            nop_passed=False,
            oracle_passed=False,
            error_message=f"SDK failed: {e}",
        )
|
|
913
|
+
|
|
914
|
+
|
|
915
|
+
def _check_validation_state(
    jobs_dir: Path,
    task_id: str,
    logger: logging.Logger,
    timed_out: bool = False,
) -> ClaudeCodeResult:
    """Build a ClaudeCodeResult from the latest harbor job outcomes.

    Success requires the NOP run to yield reward=0 AND the Oracle run to
    yield reward=1. On failure, an error message is assembled from the
    applicable reasons (timeout, NOP failure, Oracle failure).
    """
    nop_ok, oracle_ok = _check_job_results(jobs_dir, task_id)
    both_ok = nop_ok and oracle_ok

    message = None
    if not both_ok:
        # Collect every applicable failure reason, in a fixed order.
        reasons = [
            text
            for failed, text in (
                (timed_out, "CC timed out"),
                (not nop_ok, "NOP failed (expected reward=0)"),
                (not oracle_ok, "Oracle failed (expected reward=1)"),
            )
            if failed
        ]
        message = "; ".join(reasons) if reasons else None

    return ClaudeCodeResult(
        success=both_ok,
        nop_passed=nop_ok,
        oracle_passed=oracle_ok,
        error_message=message,
    )
|
|
942
|
+
|
|
943
|
+
|
|
944
|
+
def _check_job_results(jobs_dir: Path, task_id: str) -> tuple[bool, bool]:
|
|
945
|
+
"""Check the actual job results to determine validation state.
|
|
946
|
+
|
|
947
|
+
Looks for job directories matching:
|
|
948
|
+
- {task_id}-nop-N (where N is 1, 2, 3, etc.)
|
|
949
|
+
- {task_id}-oracle-N
|
|
950
|
+
|
|
951
|
+
Finds the most recent result.json by modification time.
|
|
952
|
+
"""
|
|
953
|
+
nop_passed = False
|
|
954
|
+
oracle_passed = False
|
|
955
|
+
|
|
956
|
+
if not jobs_dir.exists():
|
|
957
|
+
return nop_passed, oracle_passed
|
|
958
|
+
|
|
959
|
+
def find_most_recent_result(pattern: str) -> Path | None:
|
|
960
|
+
"""Find most recent result.json matching pattern."""
|
|
961
|
+
best_path = None
|
|
962
|
+
best_mtime = 0.0
|
|
963
|
+
|
|
964
|
+
for job_dir in jobs_dir.glob(pattern):
|
|
965
|
+
if not job_dir.is_dir():
|
|
966
|
+
continue
|
|
967
|
+
# Find result.json (Harbor creates a timestamped subdir inside --jobs-dir)
|
|
968
|
+
for result_file in job_dir.rglob("result.json"):
|
|
969
|
+
mtime = result_file.stat().st_mtime
|
|
970
|
+
if mtime > best_mtime:
|
|
971
|
+
best_mtime = mtime
|
|
972
|
+
best_path = result_file
|
|
973
|
+
|
|
974
|
+
return best_path
|
|
975
|
+
|
|
976
|
+
# Find most recent NOP result
|
|
977
|
+
nop_result_path = find_most_recent_result(f"{task_id}-nop-*")
|
|
978
|
+
if nop_result_path:
|
|
979
|
+
reward = parse_harbor_outcome(nop_result_path).reward
|
|
980
|
+
nop_passed = reward == 0
|
|
981
|
+
|
|
982
|
+
# Find most recent Oracle result
|
|
983
|
+
oracle_result_path = find_most_recent_result(f"{task_id}-oracle-*")
|
|
984
|
+
if oracle_result_path:
|
|
985
|
+
reward = parse_harbor_outcome(oracle_result_path).reward
|
|
986
|
+
oracle_passed = reward == 1
|
|
987
|
+
|
|
988
|
+
return nop_passed, oracle_passed
|