swegen 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,988 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import logging
5
+ import os
6
+ from dataclasses import dataclass
7
+ from pathlib import Path
8
+
9
+ from claude_agent_sdk import (
10
+ AssistantMessage,
11
+ ClaudeAgentOptions,
12
+ HookMatcher,
13
+ TextBlock,
14
+ query,
15
+ )
16
+
17
+ from swegen.create.claude_code_utils import Colors, print_sdk_message
18
+ from swegen.tools.harbor_runner import parse_harbor_outcome
19
+
20
+
21
@dataclass
class ClaudeCodeResult:
    """Result of the CC session.

    Outcome of a Claude Code run that completes a Harbor task skeleton and
    validates it with NOP/Oracle harbor runs.
    """

    # Overall session success flag (set by the session runner).
    success: bool
    nop_passed: bool  # reward=0 (tests fail on buggy code)
    oracle_passed: bool  # reward=1 (tests pass after fix)
    # Human-readable error description when the session failed, else None.
    error_message: str | None = None
    # Captured Claude Code output, if any was collected.
    cc_output: str | None = None
30
+
31
+
32
+ # Prompt for CC when a previously validated reference task exists: it only
+ # needs to mirror that task's setup into the skeleton (a much simpler task).
33
+ CC_REFERENCE_PROMPT = """
34
+ ## Your Task: Fill In Skeleton Using Reference Task as Example
35
+
36
+ **GREAT NEWS**: We have a working task from PR #{reference_pr} (task: `{reference_task_id}`)!
37
+
38
+ Your job is MUCH SIMPLER than usual:
39
+ 1. **Look at the reference task** to see what was added (runtime, packages, env vars, build steps, test command)
40
+ 2. **Fill in your skeleton's TODOs** with the same things
41
+ 3. **Update test file paths** to match this PR
42
+ 4. **Run harbor validation** to confirm it works
43
+
44
+ ## Context
45
+
46
+ **Repository**: {repo} (cloned at `{repo_path}`)
47
+ **Current PR**: #{pr_number}
48
+ **Reference Task**: `{reference_task_id}` (from PR #{reference_pr}, tested and validated)
49
+ **Current Task Directory**: `{task_dir}` ← Your skeleton (CORRECT hashes already!)
50
+ **Reference Task Directory**: `{reference_task_dir}` ← Working example to learn from
51
+ **Dataset Path**: `{dataset_path}`
52
+
53
+ ## Test Files for This PR
54
+
55
+ {test_files_list}
56
+
57
+ ## What's Already Done
58
+
59
+ ✓ Skeleton Dockerfile with CORRECT git SHAs ({head_sha}) and basic structure
60
+ ✓ Skeleton test.sh with TODO for test command
61
+ ✓ bug.patch and fix.patch are ready
62
+ ✓ instruction.md and task.toml are ready
63
+ ✓ Reference task has working Dockerfile and test.sh as examples
64
+
65
+ ## IMPORTANT: Your Skeleton Already Has Correct Hashes!
66
+
67
+ **DO NOT copy files from reference and replace hashes** - that's error-prone!
68
+
69
+ Instead:
70
+ 1. Read `{task_dir}/environment/Dockerfile` - it has TODO comments
71
+ 2. Read `{reference_task_dir}/environment/Dockerfile` - see what was filled in
72
+ 3. Add the same things to YOUR skeleton's TODO sections
73
+
74
+ The skeleton already has:
75
+ ✓ Correct git clone URL
76
+ ✓ Correct HEAD SHA ({head_sha})
77
+ ✓ Basic apt packages (git, curl, patch, build-essential)
78
+ ✓ Correct bug.patch application
79
+
80
+ ## Your Process
81
+
82
+ ### Step 1: Compare Reference Dockerfile to Your Skeleton
83
+
84
+ Read both files:
85
+ ```bash
86
+ # Your skeleton (has TODO comments to fill in)
87
+ cat {task_dir}/environment/Dockerfile
88
+
89
+ # Reference (shows what was filled in for a similar PR)
90
+ cat {reference_task_dir}/environment/Dockerfile
91
+ ```
92
+
93
+ Look for what the reference added beyond the basic skeleton:
94
+ - Language runtime installation (Python, Node.js, Go, Rust, Ruby, Java, etc.)
95
+ - Additional system packages (python3-dev, libssl-dev, etc.)
96
+ - Package manager setup
97
+ - Environment variables (CI=true, NODE_ENV=test, etc.)
98
+ - Dependency installation commands
99
+ - Build steps
100
+ - Post-patch rebuild steps
101
+
102
+ ### Step 2: Fill In Your Skeleton's TODOs
103
+
104
+ **CRITICAL: Always use Ubuntu base image**
105
+ - The skeleton Dockerfile starts with `FROM ubuntu:24.04` - **DO NOT change this**
106
+ - **NEVER** use language-specific base images (node:XX, python:XX, golang:XX)
107
+ - Install language runtimes via apt-get or official installers
108
+
109
+ Add the same things from the reference to your skeleton. For example:
110
+
111
+ **If reference has:**
112
+ ```dockerfile
113
+ # Install Python
114
+ RUN apt-get update && apt-get install -y \\
115
+ python3 python3-pip python3-venv python3-dev \\
116
+ && rm -rf /var/lib/apt/lists/*
117
+ ```
118
+
119
+ **Then replace your TODO:**
120
+ ```dockerfile
121
+ # TODO: Install language runtime
122
+ ```
123
+
124
+ **With the same installation commands.**
125
+
126
+ **DO NOT just copy the entire reference file** - the git SHAs would be wrong!
127
+ **DO fill in the TODOs** using the reference as a guide.
128
+
129
+ ### Step 3: Fill In test.sh Test Command
130
+
131
+ Read both test files:
132
+ ```bash
133
+ # Your skeleton (has TODO for test command)
134
+ cat {task_dir}/tests/test.sh
135
+
136
+ # Reference (shows what test command worked)
137
+ cat {reference_task_dir}/tests/test.sh
138
+ ```
139
+
140
+ **CRITICAL**: Update the test command to run ONLY the test files for THIS PR!
141
+
142
+ **Current test files for THIS PR**:
143
+ {test_files_list}
144
+
145
+ The reference test.sh will show you the test runner pattern.
146
+ **Copy the pattern but update the file paths** to match this PR's test files.
147
+
148
+ **DO NOT use**:
149
+ - `npm test`, `pytest`, `go test ./...` without specific paths ❌ (runs entire suite)
150
+ - Any command without specific file paths ❌
151
+
152
+ Replace the TODO placeholder with the actual test command running THIS PR's test files.
153
+
154
+ ### Step 4: Run Harbor Validation
155
+
156
+ For each validation attempt, increment the run number (-1, -2, -3, etc.):
157
+
158
+ ```bash
159
+ # Test NOP - should get reward=0
160
+ harbor run --agent nop -p {dataset_path} -t {task_id} --jobs-dir {jobs_dir}/{task_id}-nop-1 --no-delete --env {environment}
161
+
162
+ # Test Oracle - should get reward=1
163
+ harbor run --agent oracle -p {dataset_path} -t {task_id} --jobs-dir {jobs_dir}/{task_id}-oracle-1 --env {environment}
164
+ ```
165
+
166
+ If you need to re-run after fixing issues, increment the number:
167
+ - First NOP attempt: `{task_id}-nop-1`, second: `{task_id}-nop-2`, etc.
168
+ - First Oracle attempt: `{task_id}-oracle-1`, second: `{task_id}-oracle-2`, etc.
169
+
170
+ ### Step 5: Fix Issues (if validation fails)
171
+
172
+ If harbor fails, check:
173
+ 1. **Test file paths** - Most common issue (make sure you updated them for THIS PR)
174
+ 2. **Missing build step** - Did you copy the build steps from reference?
175
+ 3. **Missing packages** - Did you copy the system packages from reference?
176
+ 4. **Post-patch rebuild** - For compiled languages, you MUST rebuild after applying bug.patch
177
+
178
+ ### Step 6: Final Cleanup
179
+
180
+ **Once both NOP (reward=0) and Oracle (reward=1) pass**, clean up your files:
181
+
182
+ 1. **Remove ALL TODO comments** from Dockerfile and test.sh
183
+ 2. **Remove ALL template/example comments** that are no longer relevant
184
+ 3. **Keep only meaningful comments** that explain non-obvious steps
185
+
186
+ **Files to clean:**
187
+ - `{task_dir}/environment/Dockerfile` - Remove TODOs, keep comments explaining non-standard steps
188
+ - `{task_dir}/tests/test.sh` - Remove TODOs and example templates, keep test-specific comments
189
+
190
+ ## Tips
191
+
192
+ - **Your skeleton is the source of truth** - it has correct hashes
193
+ - **Reference is just an example** - shows you what to fill in
194
+ - **Don't copy entire files** - just the extra pieces (runtime, packages, env vars, build steps)
195
+ - **Update test paths** - most PRs touch different test files
196
+
197
+ You're done when both NOP (reward=0) and Oracle (reward=1) pass AND files are cleaned up!
198
+ """
199
+
200
+ # Prompt for CC when no reference task exists: analyze the repository from
+ # scratch and fill in the skeleton's Dockerfile/test.sh TODOs.
201
+ CC_PROMPT = """
202
+ ## Your Task: Make This Harbor Task Work
203
+
204
+ You have a skeleton Harbor task that needs to be completed. Your job is to:
205
+ 1. **Analyze the repository** to detect language, build system, test framework, dependencies
206
+ 2. **Fill in the TODO sections** in Dockerfile and test.sh
207
+ 3. **Run harbor validation** and iterate until it passes
208
+
209
+ ## Context
210
+
211
+ **Repository**: {repo} (cloned at `{repo_path}`)
212
+ **PR**: #{pr_number}
213
+ **Task Directory**: `{task_dir}`
214
+ **Dataset Path**: `{dataset_path}`
215
+
216
+ The repo is already cloned locally. You can browse it, read files, and run commands.
217
+
218
+ ## Skeleton Files to Complete
219
+
220
+ The skeleton files have been generated with the deterministic parts filled in:
221
+ - Git clone commands with correct SHAs ✓
222
+ - Basic apt packages (git, curl, ca-certificates, patch, build-essential) ✓
223
+ - bug.patch/fix.patch ✓
224
+
225
+ **You need to fill in the TODOs:**
226
+
227
+ ### `{task_dir}/environment/Dockerfile`
228
+ - **Language runtime**: Detect and install (Python, Node.js, Go, Rust, Ruby, Java, etc.)
229
+ - **System packages**: Additional packages needed (dev headers, native dependencies)
230
+ - **Package manager**: Set up if needed (pip, npm, cargo, bundler, etc.)
231
+ - **Environment variables**: CI=true, etc.
232
+ - **Dependencies**: Install project dependencies
233
+ - **Build step**: If needed (TypeScript, Rust, Go, Java, etc.)
234
+ - **Rebuild after bug.patch**: Required for compiled languages
235
+
236
+ ### `{task_dir}/tests/test.sh`
237
+ - **Environment variables**: For test runner
238
+ - **Test command**: The actual command to run the specific test files
239
+
240
+ ## Step 1: Deep Repository Analysis
241
+
242
+ Before filling anything in, thoroughly analyze the repository to detect the language and setup:
243
+
244
+ ### 1.1 Detect Language and Runtime
245
+
246
+ Check for language indicators:
247
+ ```bash
248
+ # List files to detect language
249
+ ls -la {repo_path}
250
+
251
+ # Check for language-specific files
252
+ cat {repo_path}/package.json 2>/dev/null # Node.js/JavaScript/TypeScript
253
+ cat {repo_path}/pyproject.toml 2>/dev/null # Python (modern)
254
+ cat {repo_path}/setup.py 2>/dev/null # Python (legacy)
255
+ cat {repo_path}/requirements.txt 2>/dev/null # Python
256
+ cat {repo_path}/go.mod 2>/dev/null # Go
257
+ cat {repo_path}/Cargo.toml 2>/dev/null # Rust
258
+ cat {repo_path}/Gemfile 2>/dev/null # Ruby
259
+ cat {repo_path}/pom.xml 2>/dev/null # Java (Maven)
260
+ cat {repo_path}/build.gradle 2>/dev/null # Java/Kotlin (Gradle)
261
+ ```
262
+
263
+ ### 1.2 Check for Version Files
264
+ ```bash
265
+ # Language version specifications
266
+ cat {repo_path}/.nvmrc 2>/dev/null # Node.js
267
+ cat {repo_path}/.node-version 2>/dev/null # Node.js
268
+ cat {repo_path}/.python-version 2>/dev/null # Python (pyenv)
269
+ cat {repo_path}/.ruby-version 2>/dev/null # Ruby
270
+ cat {repo_path}/rust-toolchain.toml 2>/dev/null # Rust
271
+ cat {repo_path}/.tool-versions 2>/dev/null # asdf (multiple languages)
272
+ ```
273
+
274
+ ### 1.3 Check CI Configuration (GOLD MINE for setup hints!)
275
+ ```bash
276
+ cat {repo_path}/.github/workflows/*.yml 2>/dev/null | head -300
277
+ ```
278
+ CI configs often reveal:
279
+ - Exact language version and runtime setup
280
+ - Required system packages
281
+ - Environment variables
282
+ - Pre/post-install steps
283
+ - How tests are actually run
284
+
285
+ ### 1.4 Check Test Configuration
286
+ Look for test framework configs:
287
+ ```bash
288
+ # JavaScript/TypeScript
289
+ ls -la {repo_path}/*.config.* {repo_path}/jest.config.* {repo_path}/vitest.config.* 2>/dev/null
290
+
291
+ # Python
292
+ cat {repo_path}/pytest.ini 2>/dev/null
293
+ cat {repo_path}/pyproject.toml 2>/dev/null | grep -A20 "tool.pytest"
294
+ cat {repo_path}/setup.cfg 2>/dev/null | grep -A10 "tool:pytest"
295
+
296
+ # Go - tests are built into the language
297
+ # Rust - tests are built into the language
298
+ # Ruby
299
+ cat {repo_path}/.rspec 2>/dev/null
300
+ ```
301
+
302
+ ### 1.5 Analyze the Test Files
303
+ Read the test files from `{task_dir}/tests/` to understand:
304
+ - What test framework they use (look at imports)
305
+ - Any special setup requirements
306
+ - Test file naming conventions
307
+
308
+ ## Test Files from PR
309
+
310
+ **CRITICAL**: You MUST run ONLY these specific test files, NOT the entire test suite!
311
+
312
+ These test files have been extracted to `{task_dir}/tests/`:
313
+ {test_files_list}
314
+
315
+ In test.sh, these get copied from `/tests/` into the container before running.
316
+
317
+ **Your test command MUST run ONLY these files.** Examples by language:
318
+
319
+ ### Python
320
+ ```bash
321
+ pytest -xvs path/to/test_file.py
322
+ python -m pytest path/to/test_file.py path/to/test_other.py
323
+ ```
324
+
325
+ ### JavaScript/TypeScript (TRICKY - read carefully!)
326
+
327
+ **Common test frameworks and their commands:**
328
+ ```bash
329
+ # Jest (most common)
330
+ npx jest test/foo.test.js test/bar.test.js --coverage=false
331
+
332
+ # Vitest (Vite projects)
333
+ npx vitest run test/foo.test.ts --coverage.enabled=false
334
+
335
+ # Mocha
336
+ npx mocha test/foo.test.js test/bar.test.js
337
+
338
+ # TAP / borp (used by fastify, pino, undici, etc.)
339
+ npx borp test/foo.test.js --no-check-coverage
340
+ npx tap test/foo.test.js --no-check-coverage
341
+
342
+ # AVA
343
+ npx ava test/foo.test.js
344
+
345
+ # Node.js native test runner (node:test)
346
+ node --test test/foo.test.js
347
+ ```
348
+
349
+ **CRITICAL JS/TS GOTCHAS:**
350
+ 1. **NEVER run `npm test` or `npm run test` without file args** - runs entire suite!
351
+ 2. **Disable coverage thresholds** - running a subset fails coverage checks:
352
+ - Jest: `--coverage=false`
353
+ - Vitest: `--coverage.enabled=false`
354
+ - TAP/borp: `--no-check-coverage`
355
+ 3. **TypeScript projects need build step** before AND after applying bug.patch
356
+ 4. **Check for Deno/Bun-specific tests** - skip if using `Deno.test()` or `bun:test`
357
+ 5. **Some repos use fixture discovery** (like webpack) - run the discovery test, not fixtures
358
+
359
+ ## JS/TS Test File Compatibility Check (CRITICAL!)
360
+
361
+ **Not all test files may be compatible with Node.js!** Check test files for:
362
+
363
+ **Node.js / Jest / Vitest / Mocha tests** (COMPATIBLE):
364
+ - Standard ES imports/requires
365
+ - Framework-specific APIs: `describe`, `it`, `test`, `expect`
366
+
367
+ **Deno tests** (INCOMPATIBLE with Node.js - SKIP these):
368
+ - `Deno.test()`
369
+ - `import {{ ... }} from "https://deno.land/..."`
370
+ - `.ts` extensions in imports without bundler
371
+
372
+ **Bun tests** (INCOMPATIBLE with Node.js - SKIP these):
373
+ - `Bun.test()`
374
+ - `import {{ ... }} from "bun:test"`
375
+
376
+ If you find incompatible test files, **remove them from test.sh** - don't try to run them!
377
+
378
+ ## JS/TS package.json Analysis
379
+
380
+ When analyzing a Node.js project, check package.json carefully:
381
+ ```bash
382
+ cat {repo_path}/package.json
383
+ ```
384
+
385
+ Look for:
386
+ - `engines.node` - Required Node version
387
+ - `scripts.test` - What runs tests? (but don't use it directly!)
388
+ - `scripts.build` - Build command for TypeScript?
389
+ - `dependencies` / `devDependencies`:
390
+ - Test frameworks: jest, vitest, mocha, ava, tap, borp
391
+ - Native modules needing node-gyp: @parcel/watcher, fsevents, better-sqlite3, etc.
392
+
393
+ ## JS/TS Test Configuration Files
394
+
395
+ Check for coverage thresholds that will fail when running a subset:
396
+ ```bash
397
+ ls -la {repo_path}/*.config.* {repo_path}/.* 2>/dev/null | grep -E "(jest|vitest|mocha|tap|nyc)"
398
+ cat {repo_path}/jest.config.* 2>/dev/null | grep -i coverage
399
+ cat {repo_path}/.taprc 2>/dev/null
400
+ cat {repo_path}/.nycrc* 2>/dev/null
401
+ ```
402
+
403
+ If you see coverage thresholds, you MUST disable them:
404
+ - TAP/borp: `--no-check-coverage`
405
+ - Jest: `--coverage=false`
406
+ - Vitest: `--coverage.enabled=false`
407
+
408
+ ### Go
409
+ ```bash
410
+ go test -v ./path/to/package/...
411
+ go test -v -run TestSpecificName ./...
412
+ ```
413
+
414
+ ### Rust
415
+ ```bash
416
+ cargo test --test test_name -- --nocapture
417
+ cargo test specific_test_name -- --nocapture
418
+ ```
419
+
420
+ ### Ruby
421
+ ```bash
422
+ bundle exec rspec spec/path/to/spec.rb
423
+ bundle exec ruby -Itest test/path/to/test.rb
424
+ ```
425
+
426
+ ### Java
427
+ ```bash
428
+ mvn test -Dtest=TestClassName
429
+ gradle test --tests TestClassName
430
+ ```
431
+
432
+ **DO NOT run the entire test suite** - it's too slow and may have unrelated failures!
433
+
434
+ ## Step 2: Fill In the Skeleton Files
435
+
436
+ Based on your analysis, edit the Dockerfile and test.sh.
437
+
438
+ ### Dockerfile Guidelines
439
+
440
+ **CRITICAL: Always use Ubuntu base image**
441
+ - The skeleton starts with `FROM ubuntu:24.04` - **DO NOT change this**
442
+ - **NEVER** use language-specific base images (node:XX, python:XX, golang:XX)
443
+ - Install language runtimes via apt-get or official installers
444
+
445
+ **Language Runtime Installation Examples:**
446
+
447
+ **Python (PREFER uv for speed):**
448
+ ```dockerfile
449
+ # Install Python and uv (much faster than pip)
450
+ RUN apt-get update && apt-get install -y \\
451
+ python3 python3-pip python3-venv python3-dev \\
452
+ && rm -rf /var/lib/apt/lists/*
453
+
454
+ # Install uv for fast package management
455
+ RUN curl -LsSf https://astral.sh/uv/install.sh | sh && \\
456
+ mv /root/.local/bin/uv /usr/local/bin/uv
457
+ ```
458
+
459
+ **Node.js (check .nvmrc or package.json engines for version!):**
460
+ ```dockerfile
461
+ # Check .nvmrc, .node-version, or package.json "engines.node" for required version
462
+ # Default to Node 20 if not specified
463
+ RUN curl -fsSL https://deb.nodesource.com/setup_20.x | bash - && \\
464
+ apt-get install -y nodejs && \\
465
+ rm -rf /var/lib/apt/lists/*
466
+
467
+ # Package manager setup - detect from lock file:
468
+ # pnpm-lock.yaml → pnpm
469
+ # yarn.lock → yarn
470
+ # bun.lockb → bun
471
+ # package-lock.json or none → npm
472
+
473
+ # For pnpm:
474
+ RUN corepack enable && corepack prepare pnpm@latest --activate
475
+
476
+ # For yarn (classic or berry):
477
+ RUN corepack enable
478
+
479
+ # For bun:
480
+ RUN curl -fsSL https://bun.sh/install | bash && ln -s /root/.bun/bin/bun /usr/local/bin/bun
481
+
482
+ # npm is included with Node.js (no extra setup needed)
483
+ ```
484
+
485
+ **Node.js native dependencies (node-gyp):**
486
+ ```dockerfile
487
+ # Many npm packages need native compilation (node-gyp)
488
+ # Add these if you see gyp errors during npm install:
489
+ RUN apt-get update && apt-get install -y \\
490
+ python3 make g++ \\
491
+ && rm -rf /var/lib/apt/lists/*
492
+ ```
493
+
494
+ **Go:**
495
+ ```dockerfile
496
+ RUN curl -fsSL https://go.dev/dl/go1.22.0.linux-amd64.tar.gz | tar -C /usr/local -xzf - && \\
497
+ ln -s /usr/local/go/bin/go /usr/local/bin/go
498
+ ```
499
+
500
+ **Rust:**
501
+ ```dockerfile
502
+ RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
503
+ ENV PATH="/root/.cargo/bin:${{PATH}}"
504
+ ```
505
+
506
+ **Ruby:**
507
+ ```dockerfile
508
+ RUN apt-get update && apt-get install -y ruby ruby-dev && \\
509
+ rm -rf /var/lib/apt/lists/*
510
+ RUN gem install bundler
511
+ ```
512
+
513
+ **Java:**
514
+ ```dockerfile
515
+ RUN apt-get update && apt-get install -y openjdk-17-jdk maven && \\
516
+ rm -rf /var/lib/apt/lists/*
517
+ ```
518
+
519
+ **Dependency Installation Examples:**
520
+
521
+ - **Python (PREFER uv):**
522
+ ```dockerfile
523
+ # Create venv and install with uv (10-100x faster than pip)
524
+ RUN uv venv /opt/venv && \\
525
+ uv pip install --python /opt/venv/bin/python -e ".[dev,test]"
526
+ # Or for requirements.txt:
527
+ # RUN uv pip install --python /opt/venv/bin/python -r requirements.txt
528
+ ENV PATH="/opt/venv/bin:${{PATH}}"
529
+ ```
530
+ - **Node.js (use frozen lockfile!):**
531
+ - npm: `npm ci` (NOT `npm install`)
532
+ - yarn: `yarn install --frozen-lockfile`
533
+ - pnpm: `pnpm install --frozen-lockfile`
534
+ - bun: `bun install`
535
+ - **Go:** `go mod download`
536
+ - **Rust:** `cargo fetch`
537
+ - **Ruby:** `bundle install`
538
+ - **Java:** `mvn dependency:resolve`
539
+
540
+ **Build Steps (for compiled languages):**
541
+
542
+ After installing dependencies AND after applying bug.patch, you may need to build:
543
+ - **TypeScript:** `npm run build` or `tsc` or `yarn build` or `pnpm build`
544
+ - **Go:** `go build ./...`
545
+ - **Rust:** `cargo build`
546
+ - **Java:** `mvn compile` or `gradle build`
547
+
548
+ **CRITICAL**: For compiled languages, you MUST rebuild AFTER applying bug.patch!
549
+
550
+ **TypeScript Projects - IMPORTANT:**
551
+ ```dockerfile
552
+ # After npm install - build the project
553
+ RUN npm run build
554
+ # Or if no build script: RUN npx tsc
555
+
556
+ # Apply bug.patch
557
+ COPY bug.patch /tmp/bug.patch
558
+ RUN patch -p1 < /tmp/bug.patch && rm /tmp/bug.patch
559
+
560
+ # MUST rebuild after patching TypeScript source!
561
+ RUN npm run build
562
+ ```
563
+
564
+ Check for TypeScript by looking for:
565
+ - `tsconfig.json` in repo root
566
+ - `.ts` or `.tsx` files in src/
567
+ - `typescript` in devDependencies
568
+ - `build` or `compile` scripts in package.json
569
+
570
+ ### test.sh Guidelines
571
+
572
+ **CRITICAL**: Run ONLY the specific test files, NOT the entire test suite!
573
+
574
+ The test files you MUST run are:
575
+ {test_files_list}
576
+
577
+ Replace the TODO placeholder with the actual test command.
578
+
579
+ **Test command patterns (run MULTIPLE files by passing all paths):**
580
+
581
+ ```bash
582
+ # Python (pytest) - with multiple files
583
+ pytest -xvs path/to/test_file.py path/to/test_other.py
584
+
585
+ # Jest - run specific files (can pass multiple files)
586
+ npx jest path/to/test1.js path/to/test2.js --coverage=false
587
+
588
+ # Vitest - run specific files (can pass multiple files)
589
+ npx vitest run path/to/test1.ts path/to/test2.ts --coverage.enabled=false
590
+
591
+ # TAP / borp - run specific files (disable coverage threshold)
592
+ # IMPORTANT: Pass the test file paths directly to the test runner, NOT through npm test
593
+ npx borp path/to/test1.js path/to/test2.js --no-check-coverage # For borp (used by fastify, pino, etc.)
594
+ npx tap path/to/test1.js path/to/test2.js --no-check-coverage # For standard tap
595
+
596
+ # Mocha - run specific files (can pass multiple files)
597
+ npx mocha path/to/test1.js path/to/test2.js
598
+
599
+ # If you must use npm/pnpm/yarn, use `--` separator and pass file paths:
600
+ npm run test -- path/to/test1.js path/to/test2.js
601
+ pnpm test -- path/to/test1.js path/to/test2.js
602
+ ```
603
+
604
+ **Example with multiple test files:**
605
+ If you have test files: `test/foo.test.js`, `test/bar.test.js`, `tests/subdir/baz.test.js`
606
+ Run: `npx jest test/foo.test.js test/bar.test.js tests/subdir/baz.test.js --coverage=false`
607
+
608
+ **CRITICAL WARNING**: Running `npm test` or `npm run test` without file arguments runs the ENTIRE test suite!
609
+ This wastes time (100+ seconds), may hit timeouts, and is WRONG for this task.
610
+ You MUST pass the specific test file paths as arguments to run ONLY the tests from this PR.
611
+
612
+ **Discovery-based tests** (like webpack):
613
+ Some repos use a test runner that discovers fixtures, not direct test files.
614
+ In this case, run the discovery test file, not the individual fixtures.
615
+
616
+ ## Harbor Validation Commands
617
+
618
+ For each validation attempt, increment the run number (-1, -2, -3, etc.):
619
+
620
+ ```bash
621
+ # Test NOP - should get reward=0 (tests FAIL on buggy code)
622
+ harbor run --agent nop -p {dataset_path} -t {task_id} --jobs-dir {jobs_dir}/{task_id}-nop-1 --no-delete --env {environment}
623
+
624
+ # Test Oracle - should get reward=1 (tests PASS after applying fix)
625
+ harbor run --agent oracle -p {dataset_path} -t {task_id} --jobs-dir {jobs_dir}/{task_id}-oracle-1 --env {environment}
626
+ ```
627
+
628
+ If you need to re-run after fixing issues, increment the number:
629
+ - First NOP attempt: `{task_id}-nop-1`, second: `{task_id}-nop-2`, etc.
630
+ - First Oracle attempt: `{task_id}-oracle-1`, second: `{task_id}-oracle-2`, etc.
631
+
632
+ ## Success Criteria
633
+
634
+ You're done when BOTH pass:
635
+ - **NOP**: reward=0 (tests fail because bug.patch reverted the fix)
636
+ - **Oracle**: reward=1 (tests pass after solve.sh applies the fix)
637
+
638
+ ## Finding Logs
639
+
640
+ After harbor runs, check `{jobs_dir}`:
641
+ - `{jobs_dir}/{task_id}-nop-N/<timestamp>/result.json` - NOP job result (N = run number)
642
+ - `{jobs_dir}/{task_id}-oracle-N/<timestamp>/result.json` - Oracle job result
643
+
644
+ Inside each job directory:
645
+ - `result.json` - Overall result with reward
646
+ - `verifier_stdout.txt` - Test output
647
+ - `verifier_stderr.txt` - Test errors
648
+
649
+ ## Common Issues & Fixes
650
+
651
+ ### Docker build fails
652
+ - **Missing language runtime** → Add installation commands
653
+ - **Missing system packages** → Check CI config, add to apt-get
654
+ - **Version mismatch** → Check version files (.nvmrc, .python-version, etc.)
655
+ - **Node.js: node-gyp errors** → Add `python3 make g++` to apt-get
656
+ - **Node.js: wrong version** → Check .nvmrc or package.json engines field
657
+
658
+ ### Tests fail unexpectedly
659
+ - **Missing build step** → Check if compiled language needs build
660
+ - **Wrong test command** → Check how tests are run in CI config
661
+ - **Missing env vars** → Check CI config for env setup
662
+ - **Coverage threshold fails** → Add --no-check-coverage or similar flag
663
+
664
+ ### JS/TS Specific Issues
665
+ - **"npm test" runs too many tests** → Use `npx <runner>` with specific files instead
666
+ - **Coverage threshold fails** → Add `--coverage=false` (Jest) or `--no-check-coverage` (TAP)
667
+ - **TypeScript compilation errors** → Check for missing build step
668
+ - **"Cannot find module"** → May need to run build before tests
669
+ - **Tests pass but shouldn't** → Check if tests are actually being run (look at output)
670
+ - **Deno/Bun tests incompatible** → Skip tests with `Deno.test()` or `bun:test` imports
671
+
672
+ ### NOP gets reward=1 (should be 0)
673
+ - Tests don't actually test the bug
674
+ - Wrong test files being run
675
+ - Tests are skipped or not executed (check test output!)
676
+
677
+ ### Oracle gets reward=0 (should be 1)
678
+ - fix.patch doesn't apply cleanly
679
+ - **TypeScript: MUST rebuild after patching** (most common JS/TS issue!)
680
+ - Missing post-patch setup steps
681
+
682
+ ## Your Approach
683
+
684
+ 1. **Read the skeleton files** first
685
+ 2. **Detect language** from repo files (package.json, go.mod, Cargo.toml, etc.)
686
+ 3. **Deep-analyze the repo** (package.json, CI config, test configs, version files)
687
+ 4. **Check test file compatibility** (JS/TS: filter out Deno/Bun tests!)
688
+ 5. **Fill in Dockerfile and test.sh**
689
+ 6. **Run NOP** and iterate until reward=0
690
+ 7. **Run Oracle** and iterate until reward=1
691
+ 8. **Clean up files** - Remove ALL TODO comments and template examples
692
+ 9. Done when both pass AND files are cleaned up!
693
+
694
+ ## Final Cleanup
695
+
696
+ **Once both NOP (reward=0) and Oracle (reward=1) pass**, you MUST clean up the files:
697
+
698
+ 1. **Remove ALL TODO comments** from Dockerfile and test.sh
699
+ 2. **Remove ALL template/example comments** (e.g., "Examples: CI=true, NODE_ENV=test...")
700
+ 3. **Remove large comment blocks** listing framework examples that aren't relevant
701
+ 4. **Keep only meaningful comments** that explain non-obvious steps specific to this task
702
+
703
+ **Files to clean:**
704
+ - `{task_dir}/environment/Dockerfile` - Remove TODOs, keep comments explaining non-standard steps
705
+ - `{task_dir}/tests/test.sh` - Remove TODOs and all example templates, keep only test-specific comments
706
+ """
707
+
708
+
709
def run_claude_code_session(
    repo: str,
    pr_number: int,
    repo_path: Path,
    task_dir: Path,
    task_id: str,
    dataset_path: Path,
    test_files: list[str],
    timeout: int = 900,  # 15 minutes
    verbose: bool = False,
    reference_task_id: str | None = None,
    reference_pr: int | None = None,
    head_sha: str | None = None,
    environment: str = "docker",
) -> ClaudeCodeResult:
    """
    Run Claude Code session to complete skeleton and make harbor pass.

    Thin synchronous wrapper: drives ``_run_claude_code_session_async`` to
    completion on a fresh event loop via ``asyncio.run``.

    Args:
        repo: Repository in "owner/repo" format
        pr_number: PR number
        repo_path: Path to local repo clone
        task_dir: Path to the task directory
        task_id: Task identifier
        dataset_path: Path to Harbor dataset root
        test_files: List of test file paths
        timeout: Maximum time for session
        verbose: If True, stream output to console
        reference_task_id: If provided, task_id to copy Dockerfile/test.sh from
        reference_pr: If provided, PR number of the reference task
        head_sha: If provided, new HEAD SHA to use in Dockerfile
        environment: Environment type for Harbor runs (docker, daytona, etc.)

    Returns:
        ClaudeCodeResult with success status
    """
    # Bundle the arguments once, then hand everything to the async worker.
    session_kwargs = {
        "repo": repo,
        "pr_number": pr_number,
        "repo_path": repo_path,
        "task_dir": task_dir,
        "task_id": task_id,
        "dataset_path": dataset_path,
        "test_files": test_files,
        "timeout": timeout,
        "verbose": verbose,
        "reference_task_id": reference_task_id,
        "reference_pr": reference_pr,
        "head_sha": head_sha,
        "environment": environment,
    }
    # asyncio.run creates, runs, and tears down the event loop for us.
    return asyncio.run(_run_claude_code_session_async(**session_kwargs))
763
+
764
+
765
async def _run_claude_code_session_async(
    repo: str,
    pr_number: int,
    repo_path: Path,
    task_dir: Path,
    task_id: str,
    dataset_path: Path,
    test_files: list[str],
    timeout: int = 900,
    verbose: bool = False,
    reference_task_id: str | None = None,
    reference_pr: int | None = None,
    head_sha: str | None = None,
    environment: str = "docker",
) -> ClaudeCodeResult:
    """Async implementation of Claude Code session.

    Renders the appropriate prompt (reference-based when both
    ``reference_task_id`` and ``reference_pr`` are given, otherwise the
    full skeleton prompt), runs the Claude Agent SDK under a timeout,
    and finally inspects the harbor job files to decide whether the
    NOP/Oracle validations succeeded.

    Returns:
        ClaudeCodeResult with the validation flags and the concatenated
        assistant text in ``cc_output`` (None when nothing was produced).
    """
    logger = logging.getLogger("swegen")
    logger.info("Starting Claude Code session for: %s", task_id)

    # Resolve all paths to absolute paths for reliable usage
    dataset_path = Path(dataset_path).resolve()
    task_dir = Path(task_dir).resolve()
    repo_path = Path(repo_path).resolve()

    # Jobs directory for harbor output
    jobs_dir = dataset_path.parent / ".state" / "harbor-jobs"
    jobs_dir.mkdir(parents=True, exist_ok=True)
    jobs_dir = jobs_dir.resolve()

    # Format test files list
    if test_files:
        test_files_list = "\n".join(f" - {tf}" for tf in test_files)
    else:
        test_files_list = " (none)"

    # Choose prompt based on whether we're using a reference task
    if reference_task_id and reference_pr:
        reference_task_dir = (dataset_path / reference_task_id).resolve()
        prompt_text = CC_REFERENCE_PROMPT.format(
            repo=repo,
            pr_number=pr_number,
            reference_pr=reference_pr,
            reference_task_id=reference_task_id,
            reference_task_dir=reference_task_dir,
            repo_path=repo_path,
            task_dir=task_dir,
            task_id=task_id,
            dataset_path=dataset_path,
            jobs_dir=jobs_dir,
            test_files_list=test_files_list,
            head_sha=head_sha or "(check metadata)",
            environment=environment,
        )
        # Lazy %-style args match the logging convention used elsewhere here.
        logger.info(
            "Using reference prompt (copying from %s, PR #%s)",
            reference_task_id,
            reference_pr,
        )
    else:
        prompt_text = CC_PROMPT.format(
            repo=repo,
            pr_number=pr_number,
            repo_path=repo_path,
            task_dir=task_dir,
            task_id=task_id,
            dataset_path=dataset_path,
            jobs_dir=jobs_dir,
            test_files_list=test_files_list,
            environment=environment,
        )
        logger.info("Using full prompt (generating from skeleton)")

    # Create hook for logging Harbor validation attempts
    harbor_runs: list[str] = []

    async def log_harbor_runs(input_data: dict, tool_use_id: str, context: dict) -> dict:
        """Log Harbor validation attempts for debugging."""
        command = input_data.get("tool_input", {}).get("command", "")
        if "harbor run" in command:
            harbor_runs.append(command)
            if verbose:
                print(f"{Colors.YELLOW}[Harbor]{Colors.RESET} {command}", flush=True)
        return {}

    # Assistant text collected across the whole session.  Defined OUTSIDE
    # the timeout scope so the TimeoutError path can still surface partial
    # output via ClaudeCodeResult.cc_output (previously it was collected
    # and then silently discarded, and was unbound on an early timeout).
    response_parts: list[str] = []

    def _collect_text(message: object) -> None:
        """Accumulate assistant text blocks for the final cc_output."""
        if isinstance(message, AssistantMessage):
            for block in message.content:
                if isinstance(block, TextBlock):
                    response_parts.append(block.text)

    def _cc_output() -> str | None:
        return "".join(response_parts) or None

    try:
        logger.info("Invoking Claude Code SDK with %ds timeout...", timeout)

        if verbose:
            project_root = os.getcwd()
            print("[SDK] Running Claude Code Agent SDK", flush=True)
            print(f"[SDK] Working directory: {project_root}", flush=True)
            print(f"[SDK] Repo path: {repo_path}", flush=True)
            print(f"[SDK] Task dir: {task_dir}", flush=True)
            print("-" * 60, flush=True)

        # Configure SDK options
        options = ClaudeAgentOptions(
            allowed_tools=["Read", "Write", "Edit", "Glob", "Grep", "LS", "Bash"],
            permission_mode="bypassPermissions",  # Auto-approve actions
            cwd=os.getcwd(),  # Run from project root
            model="sonnet",  # Use Sonnet model
            hooks={
                "PreToolUse": [HookMatcher(matcher="Bash", hooks=[log_harbor_runs])]
            } if verbose else {},
        )

        # Run with timeout; one loop serves both verbose and quiet modes.
        try:
            async with asyncio.timeout(timeout):
                async for message in query(prompt=prompt_text, options=options):
                    if verbose:
                        # Stream messages with real-time display
                        print_sdk_message(message)
                    _collect_text(message)

        except TimeoutError:
            logger.warning("Claude Code session timed out after %ds", timeout)
            if verbose:
                print(f"\n[SDK] Timed out after {timeout}s", flush=True)
            result = _check_validation_state(jobs_dir, task_id, logger, timed_out=True)
            result.cc_output = _cc_output()
            return result

        if verbose:
            print("-" * 60, flush=True)
            print("[SDK] Session complete", flush=True)

        # Check final state from job files
        result = _check_validation_state(jobs_dir, task_id, logger)
        result.cc_output = _cc_output()
        return result

    except Exception as e:
        logger.error("Claude Code session failed: %s", e)
        return ClaudeCodeResult(
            success=False,
            nop_passed=False,
            oracle_passed=False,
            error_message=f"SDK failed: {e}",
            cc_output=_cc_output(),
        )
913
+
914
+
915
def _check_validation_state(
    jobs_dir: Path,
    task_id: str,
    logger: logging.Logger,
    timed_out: bool = False,
) -> ClaudeCodeResult:
    """Summarize harbor job outcomes into a ClaudeCodeResult.

    Success requires both checks: NOP (tests fail on the buggy code,
    reward=0) and Oracle (tests pass after the fix, reward=1).  When
    either fails, every applicable reason is joined into one message.
    """
    nop_ok, oracle_ok = _check_job_results(jobs_dir, task_id)
    overall = nop_ok and oracle_ok

    message = None
    if not overall:
        # Table of (condition, reason) pairs keeps the message assembly flat.
        reasons = [
            text
            for cond, text in (
                (timed_out, "CC timed out"),
                (not nop_ok, "NOP failed (expected reward=0)"),
                (not oracle_ok, "Oracle failed (expected reward=1)"),
            )
            if cond
        ]
        message = "; ".join(reasons) if reasons else None

    return ClaudeCodeResult(
        success=overall,
        nop_passed=nop_ok,
        oracle_passed=oracle_ok,
        error_message=message,
    )
942
+
943
+
944
def _check_job_results(jobs_dir: Path, task_id: str) -> tuple[bool, bool]:
    """Check the actual job results to determine validation state.

    Looks for job directories matching:
    - {task_id}-nop-N (where N is 1, 2, 3, etc.)
    - {task_id}-oracle-N

    For each kind, inspects the most recent result.json by modification
    time and compares its reward against the expected value.
    """
    if not jobs_dir.exists():
        return False, False

    def newest_result(pattern: str) -> Path | None:
        """Most recent result.json under job dirs matching *pattern*."""
        # Harbor creates a timestamped subdir inside --jobs-dir, so the
        # result file must be searched recursively within each job dir.
        candidates = (
            result_file
            for job_dir in jobs_dir.glob(pattern)
            if job_dir.is_dir()
            for result_file in job_dir.rglob("result.json")
        )
        return max(candidates, key=lambda p: p.stat().st_mtime, default=None)

    # NOP must yield reward=0 (tests fail on the buggy code).
    nop_ok = False
    nop_path = newest_result(f"{task_id}-nop-*")
    if nop_path:
        nop_ok = parse_harbor_outcome(nop_path).reward == 0

    # Oracle must yield reward=1 (tests pass after the fix).
    oracle_ok = False
    oracle_path = newest_result(f"{task_id}-oracle-*")
    if oracle_path:
        oracle_ok = parse_harbor_outcome(oracle_path).reward == 1

    return nop_ok, oracle_ok