swegen 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- swegen/__init__.py +14 -0
- swegen/analyze/__init__.py +24 -0
- swegen/analyze/classifier.py +637 -0
- swegen/analyze/classify_prompt.txt +241 -0
- swegen/analyze/models.py +253 -0
- swegen/analyze/run.py +656 -0
- swegen/analyze/verdict_prompt.txt +126 -0
- swegen/cli.py +411 -0
- swegen/config.py +142 -0
- swegen/create/__init__.py +22 -0
- swegen/create/claude_code_runner.py +988 -0
- swegen/create/claude_code_utils.py +95 -0
- swegen/create/create.py +706 -0
- swegen/create/diff_utils.py +142 -0
- swegen/create/orchestrator.py +368 -0
- swegen/create/pr_fetcher.py +187 -0
- swegen/create/repo_cache.py +175 -0
- swegen/create/task_instruction.py +363 -0
- swegen/create/task_reference.py +130 -0
- swegen/create/task_skeleton.py +266 -0
- swegen/create/utils.py +350 -0
- swegen/farm/__init__.py +13 -0
- swegen/farm/farm_hand.py +342 -0
- swegen/farm/fetcher.py +341 -0
- swegen/farm/state.py +231 -0
- swegen/farm/stream_farm.py +430 -0
- swegen/tools/__init__.py +16 -0
- swegen/tools/harbor_runner.py +191 -0
- swegen/tools/validate.py +523 -0
- swegen/tools/validate_utils.py +142 -0
- swegen-0.1.0.dist-info/METADATA +292 -0
- swegen-0.1.0.dist-info/RECORD +35 -0
- swegen-0.1.0.dist-info/WHEEL +4 -0
- swegen-0.1.0.dist-info/entry_points.txt +3 -0
- swegen-0.1.0.dist-info/licenses/LICENSE +201 -0
swegen/tools/validate.py
ADDED
|
@@ -0,0 +1,523 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import shutil
|
|
5
|
+
import subprocess
|
|
6
|
+
import sys
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
from harbor.models.environment_type import EnvironmentType
|
|
11
|
+
from rich.console import Console
|
|
12
|
+
from rich.panel import Panel
|
|
13
|
+
from rich.progress import BarColumn, Progress, SpinnerColumn, TaskProgressColumn, TextColumn
|
|
14
|
+
from rich.table import Table
|
|
15
|
+
|
|
16
|
+
from .harbor_runner import parse_harbor_outcome, run_harbor_agent
|
|
17
|
+
|
|
18
|
+
DOCKER_CLEANUP_CMD = "docker system prune -af"
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass
class ValidateArgs:
    """Arguments for the `validate` command.

    Whether a run is single-task or batch is decided later from `path` and
    `task` (see `_resolve_paths`).
    """

    path: Path  # Task directory or a dataset directory containing tasks
    task: str | None  # Explicit task id; forces single-task mode when set
    jobs_dir: Path  # Directory where Harbor job output is written
    agent: str  # "both" | "nop" | "oracle"
    timeout_multiplier: float | None = None  # Optional scaling of Harbor timeouts
    verbose: bool = False  # NOTE(review): not read in this module — presumably consumed by the CLI layer
    quiet: bool = False  # NOTE(review): not read in this module — presumably consumed by the CLI layer
    environment: EnvironmentType = EnvironmentType.DOCKER  # Execution backend for Harbor
    max_parallel: int = 8  # Max concurrent validations in batch mode
    show_passed: bool = False  # Also list passing tasks in the results table
    output_file: Path | None = None  # Write results to file as they complete
    docker_prune_batch: int = 5  # Run docker cleanup after every N tasks (0 to disable)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@dataclass
class ValidationResult:
    """Result of validating a single task."""

    task_id: str  # Name of the task directory
    nop_reward: float | None  # NOP agent reward (None if not run or errored)
    oracle_reward: float | None  # Oracle agent reward (None if not run or errored)
    nop_exit_code: int  # Harbor exit code for the NOP run (-1 when an exception occurred)
    oracle_exit_code: int  # Harbor exit code for the Oracle run (-1 when an exception occurred)
    passed: bool  # True when rewards met expectations (see _check_passed)
    error: str | None = None  # Exception text when the validation itself failed
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def run_validate(args: ValidateArgs) -> None:
    """Entry point: dispatch to single-task or batch validation.

    Single-task mode is chosen when path resolution yields a concrete task
    id; otherwise every task under the dataset directory is validated.
    """
    dataset_path, task_id, task_dir = _resolve_paths(args)
    if task_id is not None:
        _run_single_mode(args, dataset_path, task_id, task_dir)
        return
    _run_batch_mode(args, dataset_path)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _resolve_paths(args: ValidateArgs) -> tuple[Path, str | None, Path | None]:
|
|
61
|
+
"""Resolve paths and determine if single or batch mode.
|
|
62
|
+
|
|
63
|
+
Returns: (dataset_path, task_id, task_dir)
|
|
64
|
+
task_id/task_dir are None for batch mode
|
|
65
|
+
"""
|
|
66
|
+
path = args.path.resolve()
|
|
67
|
+
|
|
68
|
+
if args.task:
|
|
69
|
+
# Explicit task ID: single mode
|
|
70
|
+
return path, args.task, path / args.task
|
|
71
|
+
|
|
72
|
+
if path.is_dir() and (path / "tests" / "test.sh").exists():
|
|
73
|
+
# Path is a task directory: single mode
|
|
74
|
+
return path.parent, path.name, path
|
|
75
|
+
|
|
76
|
+
if path.is_dir():
|
|
77
|
+
# Check if directory contains tasks: batch mode
|
|
78
|
+
tasks = [d for d in path.iterdir() if d.is_dir() and (d / "tests" / "test.sh").exists()]
|
|
79
|
+
if tasks:
|
|
80
|
+
return path, None, None
|
|
81
|
+
raise SystemExit(
|
|
82
|
+
f"No tasks found in directory: {path}\nExpected directories with tests/test.sh"
|
|
83
|
+
)
|
|
84
|
+
|
|
85
|
+
raise SystemExit(
|
|
86
|
+
"Path must be:\n"
|
|
87
|
+
" 1. A task directory (containing tests/test.sh), or\n"
|
|
88
|
+
" 2. A dataset directory with multiple tasks"
|
|
89
|
+
)
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
# ============================================================================
|
|
93
|
+
# SINGLE TASK MODE
|
|
94
|
+
# ============================================================================
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def _run_single_mode(args: ValidateArgs, dataset_path: Path, task_id: str, task_dir: Path) -> None:
    """Validate a single task with traditional (plain print) output.

    Runs the requested agent(s) through Harbor, then checks the rewards
    against expectations (NOP must score 0, Oracle must score 1) and exits
    non-zero on failure.

    Fix: previously only agent == "both" was checked; with agent "nop" or
    "oracle" the results were never verified and the process always exited
    0, inconsistent with batch mode's _check_passed.

    Args:
        args: Parsed CLI arguments.
        dataset_path: Harbor dataset root containing the task.
        task_id: Identifier of the task to validate.
        task_dir: Path to the task directory (kept for interface
            compatibility; not used directly here).
    """
    jobs_dir = args.jobs_dir.resolve()
    jobs_dir.mkdir(parents=True, exist_ok=True)

    # Run regular validation
    print("[validate] Running regular validation...")
    nop_reward, oracle_reward = _run_agents(
        task_id, dataset_path, jobs_dir, args.agent, args.timeout_multiplier, args.environment
    )

    # (label, expected reward, actual reward) for each agent that ran.
    checks = []
    if args.agent in ("nop", "both"):
        checks.append(("NOP", 0, nop_reward))
    if args.agent in ("oracle", "both"):
        checks.append(("ORACLE", 1, oracle_reward))

    if any(got != want for _, want, got in checks):
        print("\n[validate] FAILED: Harbor validation did not meet expectations")
        for name, want, got in checks:
            print(f"  {name}: expected reward={want}, got reward={got}")
        sys.exit(1)

    print("\n[validate] PASSED: Harbor validation met expectations")
    for name, _, got in checks:
        print(f"  {name}: reward={got} ✓")
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def _run_agents(
    task_id: str,
    dataset_path: Path,
    jobs_dir: Path,
    agent: str,
    timeout_multiplier: float | None,
    environment: EnvironmentType = EnvironmentType.DOCKER,
) -> tuple[float | None, float | None]:
    """Run the requested agent(s) and report their rewards.

    Returns:
        (nop_reward, oracle_reward); a reward is None when that agent was
        not requested.
    """

    def _launch(which: str, delete_after: bool) -> float | None:
        # run_harbor_agent yields (exit code, job result); the reward is
        # parsed out of the job result afterwards.
        exit_code, job_result = run_harbor_agent(
            task_id,
            dataset_path,
            jobs_dir,
            which,
            timeout_multiplier,
            delete_after=delete_after,
            environment=environment,
        )
        reward = parse_harbor_outcome(job_result).reward
        print(f"[validate] {which} exit={exit_code}, reward={reward}")
        return reward

    nop_reward: float | None = None
    oracle_reward: float | None = None

    if agent in ("nop", "both"):
        # Keep the image when oracle runs next so it can be reused; delete
        # it only when nop is the sole agent.
        nop_reward = _launch("nop", delete_after=(agent == "nop"))

    if agent in ("oracle", "both"):
        # Oracle is always the last run, so always clean up the image.
        oracle_reward = _launch("oracle", delete_after=True)

    return nop_reward, oracle_reward
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
# ============================================================================
|
|
165
|
+
# BATCH MODE
|
|
166
|
+
# ============================================================================
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def _run_batch_mode(args: ValidateArgs, dataset_path: Path) -> None:
    """Validate every task in the dataset concurrently with clean output."""
    console = Console()
    jobs_dir = args.jobs_dir.resolve()
    jobs_dir.mkdir(parents=True, exist_ok=True)

    # A task is any subdirectory shipping a tests/test.sh entry point.
    task_dirs = [
        child
        for child in dataset_path.iterdir()
        if child.is_dir() and (child / "tests" / "test.sh").exists()
    ]
    if not task_dirs:
        console.print("[yellow]No tasks found[/yellow]")
        return

    # Announce the run configuration up front.
    console.print(f"[blue]Found {len(task_dirs)} task(s) to validate[/blue]")
    console.print(f"[blue]Parallel: {args.max_parallel} | Agent: {args.agent}[/blue]")
    if args.output_file:
        console.print(f"[blue]Output: {args.output_file}[/blue]")
    # Docker pruning only applies to the local docker backend.
    prune_active = args.environment == EnvironmentType.DOCKER and args.docker_prune_batch > 0
    if prune_active:
        console.print(f"[blue]Docker prune: every {args.docker_prune_batch} tasks[/blue]")
    console.print()

    # Fan out the validations on the event loop.
    results = asyncio.run(
        _validate_batch(
            task_dirs,
            dataset_path,
            jobs_dir,
            args.agent,
            args.max_parallel,
            args.timeout_multiplier,
            args.environment,
            console,
            args.output_file,
            args.docker_prune_batch,
        )
    )

    _print_results(results, args.agent, args.show_passed, console)

    # Propagate any failure to the shell.
    if any(not r.passed for r in results):
        sys.exit(1)
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
async def _validate_batch(
    task_dirs: list[Path],
    dataset_path: Path,
    jobs_dir: Path,
    agent: str,
    max_parallel: int,
    timeout_multiplier: float | None,
    environment: EnvironmentType,
    console: Console,
    output_file: Path | None = None,
    docker_prune_batch: int = 5,
) -> list[ValidationResult]:
    """Run validations in parallel with progress bar.

    Concurrency is bounded by a semaphore of `max_parallel`; each task's
    Harbor runs happen in a worker thread via asyncio.to_thread. Results
    are optionally streamed to `output_file` as they complete, and the
    local docker daemon is pruned every `docker_prune_batch` completions.

    Args:
        task_dirs: Task directories to validate.
        dataset_path: Harbor dataset root.
        jobs_dir: Directory for Harbor job output.
        agent: "both" | "nop" | "oracle".
        max_parallel: Maximum concurrent validations.
        timeout_multiplier: Optional scaling of Harbor timeouts.
        environment: Harbor execution backend.
        console: Rich console for progress/status output.
        output_file: Optional file to stream per-task result lines to.
        docker_prune_batch: Prune docker every N completions (0 disables).

    Returns:
        One ValidationResult per task, in completion order.
    """
    semaphore = asyncio.Semaphore(max_parallel)

    # Track completed count for docker pruning
    completed_count = 0
    prune_lock = asyncio.Lock()

    # Lock and file handle for sequential writes
    write_lock = asyncio.Lock()
    # NOTE(review): opened without an explicit encoding — platform default
    # is used; confirm whether UTF-8 should be forced.
    file_handle = None
    if output_file:
        output_file.parent.mkdir(parents=True, exist_ok=True)
        file_handle = open(output_file, "w")
        # Write header
        file_handle.write(f"# Validation results - {len(task_dirs)} tasks\n")
        file_handle.write("# Format: TASK_ID: NOP=<reward> ORACLE=<reward> <STATUS>\n\n")
        file_handle.flush()

    async def write_result(result: ValidationResult) -> None:
        """Write a single result to file (thread-safe)."""
        if file_handle is None:
            return
        async with write_lock:
            line = _format_result_line(result, agent)
            file_handle.write(line + "\n")
            file_handle.flush()  # Ensure immediate write to disk

    async def validate_one(task_dir: Path) -> ValidationResult:
        # The semaphore caps how many tasks run Harbor at once.
        async with semaphore:
            try:
                nop_reward = oracle_reward = None
                nop_code = oracle_code = 0

                # Run NOP (capture_output=True to suppress Harbor's verbose output)
                if agent in ("nop", "both"):
                    # When running both, keep image for nop so oracle can reuse it
                    delete_after = agent == "nop"  # Only delete if ONLY running nop
                    nop_code, job = await asyncio.to_thread(
                        run_harbor_agent,
                        task_dir.name,
                        dataset_path,
                        jobs_dir,
                        "nop",
                        timeout_multiplier,
                        True,
                        delete_after,
                        environment,
                    )
                    nop_reward = parse_harbor_outcome(job).reward

                # Run Oracle (capture_output=True to suppress Harbor's verbose output)
                if agent in ("oracle", "both"):
                    # Oracle always deletes (cleanup)
                    oracle_code, job = await asyncio.to_thread(
                        run_harbor_agent,
                        task_dir.name,
                        dataset_path,
                        jobs_dir,
                        "oracle",
                        timeout_multiplier,
                        True,
                        True,
                        environment,
                    )
                    oracle_reward = parse_harbor_outcome(job).reward

                # Determine pass/fail
                passed = _check_passed(agent, nop_reward, oracle_reward)

                result = ValidationResult(
                    task_id=task_dir.name,
                    nop_reward=nop_reward,
                    oracle_reward=oracle_reward,
                    nop_exit_code=nop_code,
                    oracle_exit_code=oracle_code,
                    passed=passed,
                )
            except Exception as e:
                # Any exception becomes an error result with sentinel exit codes.
                result = ValidationResult(
                    task_id=task_dir.name,
                    nop_reward=None,
                    oracle_reward=None,
                    nop_exit_code=-1,
                    oracle_exit_code=-1,
                    passed=False,
                    error=str(e),
                )

            # Write to file immediately
            await write_result(result)
            return result

    async def maybe_prune_docker(count: int) -> None:
        """Run docker prune if conditions are met (local docker only, every N tasks)."""
        if environment != EnvironmentType.DOCKER:
            return
        if docker_prune_batch <= 0:
            return
        if count % docker_prune_batch != 0:
            return

        # Serialize prunes so concurrent completions don't stack cleanups.
        async with prune_lock:
            await asyncio.to_thread(_prune_docker, console)

    # Run with progress bar
    results = []
    try:
        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            TaskProgressColumn(),
            console=console,
        ) as progress:
            task_prog = progress.add_task("[cyan]Validating tasks...", total=len(task_dirs))

            for coro in asyncio.as_completed([validate_one(d) for d in task_dirs]):
                results.append(await coro)
                progress.update(task_prog, advance=1)

                # Docker cleanup after batch (local docker only)
                completed_count = len(results)
                await maybe_prune_docker(completed_count)
    finally:
        if file_handle:
            # Write summary at end
            passed = sum(1 for r in results if r.passed and not r.error)
            failed = sum(1 for r in results if not r.passed and not r.error)
            errors = sum(1 for r in results if r.error)
            file_handle.write(f"\n# Summary: {passed} passed, {failed} failed, {errors} errors\n")
            file_handle.close()

    return results
|
|
361
|
+
|
|
362
|
+
|
|
363
|
+
def _format_result_line(result: ValidationResult, agent: str) -> str:
|
|
364
|
+
"""Format a single result as a text line."""
|
|
365
|
+
parts = [result.task_id + ":"]
|
|
366
|
+
|
|
367
|
+
if agent in ("nop", "both"):
|
|
368
|
+
if result.nop_reward is not None:
|
|
369
|
+
parts.append(f"NOP={result.nop_reward}")
|
|
370
|
+
else:
|
|
371
|
+
parts.append("NOP=ERROR")
|
|
372
|
+
|
|
373
|
+
if agent in ("oracle", "both"):
|
|
374
|
+
if result.oracle_reward is not None:
|
|
375
|
+
parts.append(f"ORACLE={result.oracle_reward}")
|
|
376
|
+
else:
|
|
377
|
+
parts.append("ORACLE=ERROR")
|
|
378
|
+
|
|
379
|
+
if result.error:
|
|
380
|
+
parts.append(f"ERROR: {result.error}")
|
|
381
|
+
elif result.passed:
|
|
382
|
+
parts.append("PASS")
|
|
383
|
+
else:
|
|
384
|
+
parts.append("FAIL")
|
|
385
|
+
|
|
386
|
+
return " ".join(parts)
|
|
387
|
+
|
|
388
|
+
|
|
389
|
+
def _check_passed(agent: str, nop_reward: float | None, oracle_reward: float | None) -> bool:
|
|
390
|
+
"""Check if validation passed based on agent type and rewards."""
|
|
391
|
+
if agent == "both":
|
|
392
|
+
return nop_reward == 0 and oracle_reward == 1
|
|
393
|
+
elif agent == "nop":
|
|
394
|
+
return nop_reward == 0
|
|
395
|
+
elif agent == "oracle":
|
|
396
|
+
return oracle_reward == 1
|
|
397
|
+
return False
|
|
398
|
+
|
|
399
|
+
|
|
400
|
+
def _print_results(
    results: list[ValidationResult], agent: str, show_passed: bool, console: Console
) -> None:
    """Show a results table (failures only by default) and a summary."""
    # Partition results into three disjoint buckets in one pass.
    passed: list[ValidationResult] = []
    failed: list[ValidationResult] = []
    errors: list[ValidationResult] = []
    for r in results:
        if r.error:
            errors.append(r)
        elif r.passed:
            passed.append(r)
        else:
            failed.append(r)

    # The table is rendered when there is something bad to show, or when
    # the caller asked to see passing rows too.
    if failed or errors or show_passed:
        table = Table(
            title="Validation Results" if show_passed else "Validation Failures",
            title_style="bold cyan",
            show_lines=True,
        )
        table.add_column("Task ID", style="cyan")
        if agent in ("nop", "both"):
            table.add_column("NOP", justify="center")
        if agent in ("oracle", "both"):
            table.add_column("Oracle", justify="center")
        table.add_column("Status", justify="center")
        table.add_column("Notes")

        # Errors and failures always appear; passed rows only on request.
        visible = errors + failed
        if show_passed:
            visible = visible + passed
        for item in sorted(visible, key=lambda r: r.task_id):
            _add_result_row(table, item, agent)

        console.print("\n")
        console.print(table)

    # The summary is printed unconditionally.
    console.print("\n[bold]Summary:[/bold]")
    console.print(f"  ✅ Passed: {len(passed)}")
    console.print(f"  ❌ Failed: {len(failed)}")
    console.print(f"  ⚠️  Errors: {len(errors)}")
    console.print(f"  📊 Total: {len(results)}")

    if not failed and not errors:
        console.print(f"\n[bold green]🎉 All {len(passed)} task(s) passed validation![/bold green]")
|
|
443
|
+
|
|
444
|
+
|
|
445
|
+
def _add_result_row(table: Table, result: ValidationResult, agent: str) -> None:
|
|
446
|
+
"""Add a single result row to the table."""
|
|
447
|
+
row = [result.task_id]
|
|
448
|
+
|
|
449
|
+
if result.error:
|
|
450
|
+
# Error row
|
|
451
|
+
if agent in ("nop", "both"):
|
|
452
|
+
row.append("?")
|
|
453
|
+
if agent in ("oracle", "both"):
|
|
454
|
+
row.append("?")
|
|
455
|
+
row.extend(["❌ ERROR", result.error])
|
|
456
|
+
table.add_row(*row, style="red")
|
|
457
|
+
return
|
|
458
|
+
|
|
459
|
+
if result.passed:
|
|
460
|
+
# Passed row (only shown if show_passed=True)
|
|
461
|
+
if agent in ("nop", "both"):
|
|
462
|
+
row.append(f"✓ ({result.nop_reward})" if result.nop_reward is not None else "—")
|
|
463
|
+
if agent in ("oracle", "both"):
|
|
464
|
+
row.append(f"✓ ({result.oracle_reward})" if result.oracle_reward is not None else "—")
|
|
465
|
+
row.extend(["✅ PASS", ""])
|
|
466
|
+
table.add_row(*row, style="green")
|
|
467
|
+
return
|
|
468
|
+
|
|
469
|
+
# Failed row
|
|
470
|
+
notes = []
|
|
471
|
+
|
|
472
|
+
if agent in ("nop", "both"):
|
|
473
|
+
if result.nop_reward is not None:
|
|
474
|
+
row.append(f"{'✓' if result.nop_reward == 0 else '✗'} ({result.nop_reward})")
|
|
475
|
+
if result.nop_reward != 0:
|
|
476
|
+
notes.append(f"NOP expected 0, got {result.nop_reward}")
|
|
477
|
+
else:
|
|
478
|
+
row.append("—")
|
|
479
|
+
|
|
480
|
+
if agent in ("oracle", "both"):
|
|
481
|
+
if result.oracle_reward is not None:
|
|
482
|
+
row.append(f"{'✓' if result.oracle_reward == 1 else '✗'} ({result.oracle_reward})")
|
|
483
|
+
if result.oracle_reward != 1:
|
|
484
|
+
notes.append(f"Oracle expected 1, got {result.oracle_reward}")
|
|
485
|
+
else:
|
|
486
|
+
row.append("—")
|
|
487
|
+
|
|
488
|
+
row.extend(["❌ FAIL", "; ".join(notes)])
|
|
489
|
+
table.add_row(*row, style="red")
|
|
490
|
+
|
|
491
|
+
|
|
492
|
+
def _prune_docker(console: Console) -> None:
    """Free disk space by pruning unused Docker data (best-effort)."""
    # Without a docker binary there is nothing to prune.
    if shutil.which("docker") is None:
        console.print(
            "[yellow]Skipping docker prune (docker binary not found in PATH).[/yellow]"
        )
        return

    banner = Panel(
        f"Running docker cleanup: {DOCKER_CLEANUP_CMD}",
        title="Disk cleanup",
        border_style="yellow",
    )
    console.print(banner)

    try:
        proc = subprocess.run(
            DOCKER_CLEANUP_CMD,
            shell=True,
            capture_output=True,
            text=True,
            timeout=600,
        )
        if proc.returncode == 0:
            console.print("[green]Docker cleanup completed[/green]")
        else:
            console.print(f"[yellow]Docker cleanup returned code {proc.returncode}[/yellow]")
    except subprocess.TimeoutExpired:
        console.print("[yellow]Docker cleanup timed out after 10 minutes[/yellow]")
    except Exception as e:
        # Cleanup is best-effort: never let a prune failure kill the run.
        console.print(f"[yellow]Docker cleanup failed: {e}[/yellow]")
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from harbor.models.environment_type import EnvironmentType
|
|
7
|
+
from harbor.models.task.task import Task
|
|
8
|
+
|
|
9
|
+
from .harbor_runner import parse_harbor_outcome, run_harbor_agent
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class ValidationError(Exception):
    """Raised when Harbor validation fails (NOP or Oracle), or when a
    task's on-disk structure is invalid (see validate_task_structure)."""

    pass
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def validate_task_structure(task_dir: Path) -> bool:
    """Validate task structure using Harbor's Task model.

    Ensures a generated task has every required file and a valid layout
    before Harbor validation is attempted.

    Args:
        task_dir: Path to the task directory.

    Returns:
        True if the task is structurally valid.

    Raises:
        ValidationError: If the task structure is invalid, with details.
    """
    logger = logging.getLogger("swegen")

    try:
        # Harbor's Task model does the heavy lifting of loading/parsing.
        task = Task(task_dir)

        # A real instruction must be present and non-trivial.
        if not task.instruction or len(task.instruction.strip()) < 10:
            raise ValidationError("Invalid instruction: too short or empty")

        if not task.config:
            raise ValidationError("Missing or invalid task.toml")

        # Every file the Harbor runner needs must exist on disk.
        layout = task.paths
        expected = (
            (layout.instruction_path, "instruction.md"),
            (layout.config_path, "task.toml"),
            (layout.solve_path, "solution/solve.sh"),
            (layout.test_path, "tests/test.sh"),
        )
        for candidate, label in expected:
            if not candidate.exists():
                raise ValidationError(f"Missing required file: {label}")

        logger.debug(f"✓ Task structure validated: {task.name}")
        return True

    except ValidationError:
        # Already descriptive; propagate untouched.
        raise
    except Exception as e:
        logger.error(f"Task validation failed: {e}")
        raise ValidationError(f"Task structure validation failed: {e}") from e
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def run_nop_oracle(
    task_id: str,
    dataset_path: Path,
    jobs_dir: Path,
    timeout_multiplier: float | None = None,
    environment: EnvironmentType = EnvironmentType.DOCKER,
) -> tuple[float | None, float | None, dict[str, Path | None]]:
    """Run both NOP and Oracle validations sequentially.

    Validations are always run sequentially to avoid Docker conflict issues.
    NOP keeps the Docker image so Oracle can reuse it (much faster).
    Oracle deletes the image after running (cleanup).

    Args:
        task_id: Task identifier
        dataset_path: Harbor dataset root path
        jobs_dir: Jobs directory path
        timeout_multiplier: Optional timeout multiplier
        environment: Environment type (docker, daytona, e2b, modal, runloop, gke)

    Returns:
        Tuple of (nop_reward, oracle_reward, job_dirs) where:
        - nop_reward: 0 if tests fail on buggy code (expected), None if error
        - oracle_reward: 1 if tests pass after fix (expected), None if error
        - job_dirs: Dict mapping "nop"/"oracle" to job result paths
    """
    job_dirs: dict[str, Path | None] = {"nop": None, "oracle": None}

    # NOP: Keep image (delete_after=False) so Oracle can reuse it.
    # Exit code is discarded; only the parsed reward is reported.
    _, nop_result = run_harbor_agent(
        task_id=task_id,
        dataset_path=dataset_path,
        jobs_dir=jobs_dir,
        agent="nop",
        timeout_multiplier=timeout_multiplier,
        capture_output=True,
        delete_after=False,
        environment=environment,
    )
    nop_reward = parse_harbor_outcome(nop_result).reward
    job_dirs["nop"] = nop_result.parent if nop_result else None

    # Oracle: Delete image after running (cleanup)
    _, oracle_result = run_harbor_agent(
        task_id=task_id,
        dataset_path=dataset_path,
        jobs_dir=jobs_dir,
        agent="oracle",
        timeout_multiplier=timeout_multiplier,
        capture_output=True,
        delete_after=True,
        environment=environment,
    )
    oracle_reward = parse_harbor_outcome(oracle_result).reward
    job_dirs["oracle"] = oracle_result.parent if oracle_result else None

    return nop_reward, oracle_reward, job_dirs
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def check_validation_passed(nop_reward: int | None, oracle_reward: int | None) -> bool:
    """Return True when both expectations hold: NOP reward 0, Oracle reward 1."""
    nop_ok = nop_reward == 0
    oracle_ok = oracle_reward == 1
    return nop_ok and oracle_ok
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
# Re-export for convenience: the public validation helpers defined in this
# module plus the low-level Harbor runner primitives they wrap.
__all__ = [
    "ValidationError",
    "validate_task_structure",
    "run_nop_oracle",
    "check_validation_passed",
    # Low-level (from harbor_runner)
    "run_harbor_agent",
    "parse_harbor_outcome",
]
|