swegen 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,523 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import shutil
5
+ import subprocess
6
+ import sys
7
+ from dataclasses import dataclass
8
+ from pathlib import Path
9
+
10
+ from harbor.models.environment_type import EnvironmentType
11
+ from rich.console import Console
12
+ from rich.panel import Panel
13
+ from rich.progress import BarColumn, Progress, SpinnerColumn, TaskProgressColumn, TextColumn
14
+ from rich.table import Table
15
+
16
+ from .harbor_runner import parse_harbor_outcome, run_harbor_agent
17
+
18
+ DOCKER_CLEANUP_CMD = "docker system prune -af"  # Command executed by _prune_docker to free Docker disk space
19
+
20
+
21
@dataclass
class ValidateArgs:
    """Options for one validation run, consumed by ``run_validate``."""

    # Task directory or dataset directory to validate.
    path: Path
    # Explicit task ID under ``path``; when set, single-task mode is used.
    task: str | None
    # Directory passed to Harbor for job output; created if it does not exist.
    jobs_dir: Path
    # Which agent(s) to run: "both" | "nop" | "oracle".
    agent: str
    # Forwarded unchanged to the Harbor runner; None leaves it unset.
    timeout_multiplier: float | None = None
    # NOTE(review): verbose/quiet are not read by the code visible in this
    # module -- confirm whether callers rely on them elsewhere.
    verbose: bool = False
    quiet: bool = False
    # Execution environment forwarded to the Harbor runner.
    environment: EnvironmentType = EnvironmentType.DOCKER
    # Batch mode: upper bound on tasks validated at the same time.
    max_parallel: int = 8
    # Batch mode: include passing tasks in the results table.
    show_passed: bool = False
    # Batch mode: write each result to this file as soon as it completes.
    output_file: Path | None = None
    # Batch mode: run docker cleanup after every N tasks (0 to disable).
    docker_prune_batch: int = 5
35
+
36
+
37
@dataclass
class ValidationResult:
    """Outcome of validating one task with the NOP and/or Oracle agent."""

    # Name of the task directory that was validated.
    task_id: str
    # Reward parsed from the NOP run; None if NOP was not run or gave no reward.
    nop_reward: float | None
    # Reward parsed from the Oracle run; None if Oracle was not run or gave no reward.
    oracle_reward: float | None
    # Exit codes reported by the Harbor runner (batch mode stores -1 on error).
    nop_exit_code: int
    oracle_exit_code: int
    # True when the rewards matched the expectation for the selected agent(s).
    passed: bool
    # Message of the exception that aborted validation, if any.
    error: str | None = None
48
+
49
+
50
def run_validate(args: ValidateArgs) -> None:
    """Validate either one task or a whole dataset, depending on ``args``.

    ``_resolve_paths`` decides the mode: a resolved task ID means a single
    task is validated, otherwise every task under the dataset is validated.
    """
    dataset_path, task_id, task_dir = _resolve_paths(args)

    if task_id is not None:
        _run_single_mode(args, dataset_path, task_id, task_dir)
        return

    _run_batch_mode(args, dataset_path)
58
+
59
+
60
def _resolve_paths(args: ValidateArgs) -> tuple[Path, str | None, Path | None]:
    """Work out the dataset root and, for single mode, the task to validate.

    Returns:
        ``(dataset_path, task_id, task_dir)``. ``task_id`` and ``task_dir``
        are both None when the path is a dataset directory (batch mode).

    Raises:
        SystemExit: If the path is not a directory, or is a directory that
            neither is a task nor contains any task.
    """
    root = args.path.resolve()

    def has_test_script(candidate: Path) -> bool:
        # A task is recognised by the presence of tests/test.sh.
        return (candidate / "tests" / "test.sh").exists()

    # An explicit task ID always selects single mode.
    if args.task:
        return root, args.task, root / args.task

    if not root.is_dir():
        raise SystemExit(
            "Path must be:\n"
            " 1. A task directory (containing tests/test.sh), or\n"
            " 2. A dataset directory with multiple tasks"
        )

    # The path itself is a task directory: single mode.
    if has_test_script(root):
        return root.parent, root.name, root

    # Otherwise it must be a dataset holding at least one task: batch mode.
    if any(child.is_dir() and has_test_script(child) for child in root.iterdir()):
        return root, None, None

    raise SystemExit(
        f"No tasks found in directory: {root}\nExpected directories with tests/test.sh"
    )
90
+
91
+
92
+ # ============================================================================
93
+ # SINGLE TASK MODE
94
+ # ============================================================================
95
+
96
+
97
def _run_single_mode(args: ValidateArgs, dataset_path: Path, task_id: str, task_dir: Path) -> None:
    """Validate a single task, print the verdict and exit non-zero on failure.

    The verdict is computed with ``_check_passed`` for every agent selection,
    so a run with only ``nop`` or only ``oracle`` also fails when its reward
    is not the expected one (previously only ``agent == "both"`` was checked
    and single-agent runs always exited successfully).

    Args:
        args: Validation options (agent selection, jobs dir, environment, ...).
        dataset_path: Dataset root that contains the task.
        task_id: Name of the task to validate.
        task_dir: Path of the task directory (not used here; kept so the
            signature matches what ``run_validate`` passes).

    Raises:
        SystemExit: With status 1 when the rewards do not meet expectations.
    """
    jobs_dir = args.jobs_dir.resolve()
    jobs_dir.mkdir(parents=True, exist_ok=True)

    print("[validate] Running regular validation...")
    nop_reward, oracle_reward = _run_agents(
        task_id, dataset_path, jobs_dir, args.agent, args.timeout_multiplier, args.environment
    )

    # (label, expected reward, actual reward) for each agent that was run.
    expectations: list[tuple[str, int, float | None]] = []
    if args.agent in ("nop", "both"):
        expectations.append(("NOP", 0, nop_reward))
    if args.agent in ("oracle", "both"):
        expectations.append(("ORACLE", 1, oracle_reward))

    if not _check_passed(args.agent, nop_reward, oracle_reward):
        print("\n[validate] FAILED: Harbor validation did not meet expectations")
        for label, expected, actual in expectations:
            print(f" {label}: expected reward={expected}, got reward={actual}")
        sys.exit(1)

    print("\n[validate] PASSED: Harbor validation met expectations")
    for label, _expected, actual in expectations:
        print(f" {label}: reward={actual} ✓")
119
+
120
+
121
def _run_agents(
    task_id: str,
    dataset_path: Path,
    jobs_dir: Path,
    agent: str,
    timeout_multiplier: float | None,
    environment: EnvironmentType = EnvironmentType.DOCKER,
) -> tuple[float | None, float | None]:
    """Run the selected Harbor agent(s) for one task and report their rewards.

    Args:
        task_id: Task to run.
        dataset_path: Dataset root that contains the task.
        jobs_dir: Directory for Harbor job output.
        agent: "nop", "oracle" or "both".
        timeout_multiplier: Forwarded to the Harbor runner.
        environment: Environment forwarded to the Harbor runner.

    Returns:
        ``(nop_reward, oracle_reward)``; an entry is None when that agent was
        not run.
    """
    wants_nop = agent in ("nop", "both")
    wants_oracle = agent in ("oracle", "both")

    nop_reward = None
    oracle_reward = None

    if wants_nop:
        # Delete the image only when NOP runs alone; when Oracle follows,
        # the image is kept so Oracle can reuse it.
        nop_only = agent == "nop"
        code, job_result = run_harbor_agent(
            task_id,
            dataset_path,
            jobs_dir,
            "nop",
            timeout_multiplier,
            delete_after=nop_only,
            environment=environment,
        )
        nop_reward = parse_harbor_outcome(job_result).reward
        print(f"[validate] nop exit={code}, reward={nop_reward}")

    if wants_oracle:
        # Oracle is the last run, so it always cleans up.
        code, job_result = run_harbor_agent(
            task_id,
            dataset_path,
            jobs_dir,
            "oracle",
            timeout_multiplier,
            delete_after=True,
            environment=environment,
        )
        oracle_reward = parse_harbor_outcome(job_result).reward
        print(f"[validate] oracle exit={code}, reward={oracle_reward}")

    return nop_reward, oracle_reward
162
+
163
+
164
+ # ============================================================================
165
+ # BATCH MODE
166
+ # ============================================================================
167
+
168
+
169
def _run_batch_mode(args: ValidateArgs, dataset_path: Path) -> None:
    """Validate every task found under ``dataset_path`` and print a report.

    A task is any sub-directory containing ``tests/test.sh``. The process
    exits with status 1 when at least one task did not pass.
    """
    console = Console()
    jobs_dir = args.jobs_dir.resolve()
    jobs_dir.mkdir(parents=True, exist_ok=True)

    # Discover task directories.
    task_dirs = []
    for entry in dataset_path.iterdir():
        if entry.is_dir() and (entry / "tests" / "test.sh").exists():
            task_dirs.append(entry)

    if not task_dirs:
        console.print("[yellow]No tasks found[/yellow]")
        return

    # Banner describing the run.
    console.print(f"[blue]Found {len(task_dirs)} task(s) to validate[/blue]")
    console.print(f"[blue]Parallel: {args.max_parallel} | Agent: {args.agent}[/blue]")
    if args.output_file:
        console.print(f"[blue]Output: {args.output_file}[/blue]")
    # The prune setting only applies to local docker runs.
    local_docker = args.environment == EnvironmentType.DOCKER
    if local_docker and args.docker_prune_batch > 0:
        console.print(f"[blue]Docker prune: every {args.docker_prune_batch} tasks[/blue]")
    console.print()

    batch = _validate_batch(
        task_dirs=task_dirs,
        dataset_path=dataset_path,
        jobs_dir=jobs_dir,
        agent=args.agent,
        max_parallel=args.max_parallel,
        timeout_multiplier=args.timeout_multiplier,
        environment=args.environment,
        console=console,
        output_file=args.output_file,
        docker_prune_batch=args.docker_prune_batch,
    )
    results = asyncio.run(batch)

    _print_results(results, args.agent, args.show_passed, console)

    # Signal failure to the caller if any task did not pass.
    if any(not outcome.passed for outcome in results):
        sys.exit(1)
214
+
215
+
216
async def _validate_batch(
    task_dirs: list[Path],
    dataset_path: Path,
    jobs_dir: Path,
    agent: str,
    max_parallel: int,
    timeout_multiplier: float | None,
    environment: EnvironmentType,
    console: Console,
    output_file: Path | None = None,
    docker_prune_batch: int = 5,
) -> list[ValidationResult]:
    """Validate all tasks concurrently while showing a progress bar.

    Args:
        task_dirs: Task directories to validate.
        dataset_path: Dataset root that contains the tasks.
        jobs_dir: Directory for Harbor job output.
        agent: "nop", "oracle" or "both".
        max_parallel: Maximum number of tasks validated at once; values
            below 1 are treated as 1.
        timeout_multiplier: Forwarded to the Harbor runner.
        environment: Environment forwarded to the Harbor runner.
        console: Rich console used for the progress bar and prune messages.
        output_file: Optional file that receives one line per result as soon
            as it completes, followed by a summary line.
        docker_prune_batch: Run docker cleanup after every N completed tasks
            (local docker only, 0 disables).

    Returns:
        One ``ValidationResult`` per task, in completion order.
    """
    # Semaphore(0) would block every task forever and a negative value
    # raises ValueError, so never go below one slot.
    semaphore = asyncio.Semaphore(max(1, max_parallel))
    prune_lock = asyncio.Lock()
    # Serialises writes so result lines are never interleaved.
    write_lock = asyncio.Lock()
    file_handle = None
    results: list[ValidationResult] = []

    async def write_result(result: ValidationResult) -> None:
        """Append one result line to the output file, if one is open."""
        if file_handle is None:
            return
        async with write_lock:
            file_handle.write(_format_result_line(result, agent) + "\n")
            file_handle.flush()  # Make the line visible on disk immediately

    async def validate_one(task_dir: Path) -> ValidationResult:
        """Run the selected agent(s) for one task; never raises Exception."""
        async with semaphore:
            try:
                nop_reward = oracle_reward = None
                nop_code = oracle_code = 0

                if agent in ("nop", "both"):
                    # Keep the image when Oracle follows so it can reuse it;
                    # delete it only when NOP is the sole agent.
                    delete_after = agent == "nop"
                    nop_code, job = await asyncio.to_thread(
                        run_harbor_agent,
                        task_dir.name,
                        dataset_path,
                        jobs_dir,
                        "nop",
                        timeout_multiplier,
                        True,  # capture_output: suppress Harbor's verbose output
                        delete_after,
                        environment,
                    )
                    nop_reward = parse_harbor_outcome(job).reward

                if agent in ("oracle", "both"):
                    oracle_code, job = await asyncio.to_thread(
                        run_harbor_agent,
                        task_dir.name,
                        dataset_path,
                        jobs_dir,
                        "oracle",
                        timeout_multiplier,
                        True,  # capture_output: suppress Harbor's verbose output
                        True,  # delete_after: Oracle always cleans up
                        environment,
                    )
                    oracle_reward = parse_harbor_outcome(job).reward

                result = ValidationResult(
                    task_id=task_dir.name,
                    nop_reward=nop_reward,
                    oracle_reward=oracle_reward,
                    nop_exit_code=nop_code,
                    oracle_exit_code=oracle_code,
                    passed=_check_passed(agent, nop_reward, oracle_reward),
                )
            except Exception as e:
                result = ValidationResult(
                    task_id=task_dir.name,
                    nop_reward=None,
                    oracle_reward=None,
                    nop_exit_code=-1,
                    oracle_exit_code=-1,
                    passed=False,
                    # Some exceptions have an empty message; an empty string
                    # is falsy and would make the result look like a plain
                    # failure instead of an error, so fall back to the type.
                    error=str(e) or type(e).__name__,
                )

            # Persist the result as soon as it is known.
            await write_result(result)
            return result

    async def maybe_prune_docker(count: int) -> None:
        """Run docker cleanup after every ``docker_prune_batch`` tasks."""
        if environment != EnvironmentType.DOCKER:
            return
        if docker_prune_batch <= 0:
            return
        if count % docker_prune_batch != 0:
            return

        # NOTE(review): other validations may still be running while the
        # prune executes -- confirm it cannot remove an image that a NOP run
        # kept for its pending Oracle run.
        async with prune_lock:
            await asyncio.to_thread(_prune_docker, console)

    try:
        # Opened inside the try block so the handle is closed even if
        # writing the header fails.
        if output_file:
            output_file.parent.mkdir(parents=True, exist_ok=True)
            # Explicit UTF-8: result lines may carry non-ASCII error text.
            file_handle = open(output_file, "w", encoding="utf-8")
            file_handle.write(f"# Validation results - {len(task_dirs)} tasks\n")
            file_handle.write("# Format: TASK_ID: NOP=<reward> ORACLE=<reward> <STATUS>\n\n")
            file_handle.flush()

        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            TaskProgressColumn(),
            console=console,
        ) as progress:
            task_prog = progress.add_task("[cyan]Validating tasks...", total=len(task_dirs))

            for coro in asyncio.as_completed([validate_one(d) for d in task_dirs]):
                results.append(await coro)
                progress.update(task_prog, advance=1)

                # Periodic docker cleanup (local docker only).
                await maybe_prune_docker(len(results))
    finally:
        if file_handle is not None:
            try:
                passed = sum(1 for r in results if r.passed and not r.error)
                failed = sum(1 for r in results if not r.passed and not r.error)
                errors = sum(1 for r in results if r.error)
                file_handle.write(
                    f"\n# Summary: {passed} passed, {failed} failed, {errors} errors\n"
                )
            finally:
                file_handle.close()

    return results
361
+
362
+
363
def _format_result_line(result: ValidationResult, agent: str) -> str:
    """Format one result as a single text line for the output file.

    The line has the shape ``TASK_ID: NOP=<reward> ORACLE=<reward> <STATUS>``,
    where the NOP/ORACLE parts appear only for the agents that were selected
    and a missing reward is written as ``ERROR``.

    Args:
        result: The result to format.
        agent: "nop", "oracle" or "both".

    Returns:
        The formatted line, without a trailing newline. Line breaks inside
        the error message are collapsed to spaces so that every result stays
        on exactly one line.
    """
    parts = [result.task_id + ":"]

    if agent in ("nop", "both"):
        if result.nop_reward is not None:
            parts.append(f"NOP={result.nop_reward}")
        else:
            parts.append("NOP=ERROR")

    if agent in ("oracle", "both"):
        if result.oracle_reward is not None:
            parts.append(f"ORACLE={result.oracle_reward}")
        else:
            parts.append("ORACLE=ERROR")

    if result.error:
        # Exception messages can span several lines; keep the file
        # one-result-per-line by joining the non-empty lines with spaces.
        flattened = " ".join(
            piece.strip() for piece in str(result.error).splitlines() if piece.strip()
        )
        parts.append(f"ERROR: {flattened}")
    elif result.passed:
        parts.append("PASS")
    else:
        parts.append("FAIL")

    return " ".join(parts)
387
+
388
+
389
def _check_passed(agent: str, nop_reward: float | None, oracle_reward: float | None) -> bool:
    """Tell whether the rewards match what is expected for ``agent``.

    NOP is expected to yield reward 0 and Oracle reward 1. Only the agents
    selected by ``agent`` are checked; an unrecognised ``agent`` never passes.
    """
    if agent == "nop":
        return nop_reward == 0
    if agent == "oracle":
        return oracle_reward == 1
    if agent != "both":
        return False
    return nop_reward == 0 and oracle_reward == 1
398
+
399
+
400
def _print_results(
    results: list[ValidationResult], agent: str, show_passed: bool, console: Console
) -> None:
    """Print the results table and the summary counts.

    The table lists errors and failures, plus passing tasks when
    ``show_passed`` is set; it is omitted when it would have no reason to be
    shown. The summary is always printed.
    """
    # Split the results into the three reporting buckets in one pass.
    passed: list[ValidationResult] = []
    failed: list[ValidationResult] = []
    errors: list[ValidationResult] = []
    for outcome in results:
        if outcome.error:
            errors.append(outcome)
        elif outcome.passed:
            passed.append(outcome)
        else:
            failed.append(outcome)

    if failed or errors or show_passed:
        heading = "Validation Results" if show_passed else "Validation Failures"
        table = Table(title=heading, title_style="bold cyan", show_lines=True)

        table.add_column("Task ID", style="cyan")
        if agent in ("nop", "both"):
            table.add_column("NOP", justify="center")
        if agent in ("oracle", "both"):
            table.add_column("Oracle", justify="center")
        table.add_column("Status", justify="center")
        table.add_column("Notes")

        # Rows are ordered by task ID.
        rows = errors + failed
        if show_passed:
            rows = rows + passed
        for entry in sorted(rows, key=lambda r: r.task_id):
            _add_result_row(table, entry, agent)

        console.print("\n")
        console.print(table)

    # The summary is shown regardless of the table.
    console.print("\n[bold]Summary:[/bold]")
    console.print(f" ✅ Passed: {len(passed)}")
    console.print(f" ❌ Failed: {len(failed)}")
    console.print(f" ⚠️ Errors: {len(errors)}")
    console.print(f" 📊 Total: {len(results)}")

    if not (failed or errors):
        console.print(f"\n[bold green]🎉 All {len(passed)} task(s) passed validation![/bold green]")
443
+
444
+
445
def _add_result_row(table: Table, result: ValidationResult, agent: str) -> None:
    """Append the table row describing ``result``.

    Error rows show "?" for every agent column, passing rows show the reward
    with a check mark, and failing rows mark each reward as matching or not
    and explain the mismatches in the notes column.
    """
    has_nop_column = agent in ("nop", "both")
    has_oracle_column = agent in ("oracle", "both")
    cells = [result.task_id]

    # Error: the rewards are not meaningful, show placeholders.
    if result.error:
        if has_nop_column:
            cells.append("?")
        if has_oracle_column:
            cells.append("?")
        cells += ["❌ ERROR", result.error]
        table.add_row(*cells, style="red")
        return

    # Pass (only reached when passing rows were requested).
    if result.passed:
        if has_nop_column:
            cells.append("—" if result.nop_reward is None else f"✓ ({result.nop_reward})")
        if has_oracle_column:
            cells.append("—" if result.oracle_reward is None else f"✓ ({result.oracle_reward})")
        cells += ["✅ PASS", ""]
        table.add_row(*cells, style="green")
        return

    # Failure: flag each reward and collect an explanation per mismatch.
    problems = []

    if has_nop_column:
        nop = result.nop_reward
        if nop is None:
            cells.append("—")
        else:
            mark = "✓" if nop == 0 else "✗"
            cells.append(f"{mark} ({nop})")
            if nop != 0:
                problems.append(f"NOP expected 0, got {nop}")

    if has_oracle_column:
        oracle = result.oracle_reward
        if oracle is None:
            cells.append("—")
        else:
            mark = "✓" if oracle == 1 else "✗"
            cells.append(f"{mark} ({oracle})")
            if oracle != 1:
                problems.append(f"Oracle expected 1, got {oracle}")

    cells += ["❌ FAIL", "; ".join(problems)]
    table.add_row(*cells, style="red")
490
+
491
+
492
def _prune_docker(console: Console) -> None:
    """Run the docker cleanup command to free disk space.

    This is best effort: a missing docker binary, a non-zero exit status, a
    timeout or any other failure is reported on ``console`` and never raised.

    Args:
        console: Rich console that receives the progress and status messages.
    """
    import shlex

    if shutil.which("docker") is None:
        console.print(
            "[yellow]Skipping docker prune (docker binary not found in PATH).[/yellow]"
        )
        return

    console.print(
        Panel(
            f"Running docker cleanup: {DOCKER_CLEANUP_CMD}",
            title="Disk cleanup",
            border_style="yellow",
        )
    )

    try:
        # Run the command as an argument list; no shell is needed for a
        # fixed command and this avoids shell parsing entirely.
        result = subprocess.run(
            shlex.split(DOCKER_CLEANUP_CMD),
            capture_output=True,
            text=True,
            timeout=600,
            check=False,
        )
        if result.returncode == 0:
            console.print("[green]Docker cleanup completed[/green]")
        else:
            console.print(f"[yellow]Docker cleanup returned code {result.returncode}[/yellow]")
            # Surface the reason instead of discarding the captured stderr.
            stderr_lines = (result.stderr or "").strip().splitlines()
            if stderr_lines:
                # markup=False: docker output is arbitrary text, not Rich markup.
                console.print(stderr_lines[-1], style="yellow", markup=False)
    except subprocess.TimeoutExpired:
        console.print("[yellow]Docker cleanup timed out after 10 minutes[/yellow]")
    except Exception as e:
        console.print(f"[yellow]Docker cleanup failed: {e}[/yellow]")
@@ -0,0 +1,142 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ from pathlib import Path
5
+
6
+ from harbor.models.environment_type import EnvironmentType
7
+ from harbor.models.task.task import Task
8
+
9
+ from .harbor_runner import parse_harbor_outcome, run_harbor_agent
10
+
11
+
12
class ValidationError(Exception):
    """Raised when Harbor validation fails (NOP or Oracle)."""
16
+
17
+
18
def validate_task_structure(task_dir: Path) -> bool:
    """Check that a task directory loads as a Harbor ``Task`` and is complete.

    The task must load through Harbor's ``Task`` model, carry an instruction
    of at least 10 non-whitespace-trimmed characters, have a config, and
    contain instruction.md, task.toml, solution/solve.sh and tests/test.sh.

    Args:
        task_dir: Path to the task directory.

    Returns:
        True when every check succeeds.

    Raises:
        ValidationError: If a check fails, or if loading or inspecting the
            task raises any other exception (chained as the cause).
    """
    log = logging.getLogger("swegen")

    try:
        # Loading the model is itself the first structural check.
        task = Task(task_dir)

        if not (task.instruction and len(task.instruction.strip()) >= 10):
            raise ValidationError("Invalid instruction: too short or empty")

        if not task.config:
            raise ValidationError("Missing or invalid task.toml")

        locations = task.paths
        expected = [
            (locations.instruction_path, "instruction.md"),
            (locations.config_path, "task.toml"),
            (locations.solve_path, "solution/solve.sh"),
            (locations.test_path, "tests/test.sh"),
        ]
        # Report the first required file that is absent.
        missing = next((label for location, label in expected if not location.exists()), None)
        if missing is not None:
            raise ValidationError(f"Missing required file: {missing}")

        log.debug(f"✓ Task structure validated: {task.name}")
        return True

    except ValidationError:
        # Already the right exception type; pass it through untouched.
        raise
    except Exception as e:
        log.error(f"Task validation failed: {e}")
        raise ValidationError(f"Task structure validation failed: {e}") from e
67
+
68
+
69
def run_nop_oracle(
    task_id: str,
    dataset_path: Path,
    jobs_dir: Path,
    timeout_multiplier: float | None = None,
    environment: EnvironmentType = EnvironmentType.DOCKER,
) -> tuple[int | None, int | None, dict[str, Path | None]]:
    """Run the NOP validation and then the Oracle validation for one task.

    The two runs happen one after the other. NOP is told not to delete the
    image afterwards so Oracle can reuse it; Oracle deletes it when done.

    Args:
        task_id: Task identifier.
        dataset_path: Harbor dataset root path.
        jobs_dir: Jobs directory path.
        timeout_multiplier: Optional timeout multiplier.
        environment: Environment type forwarded to the Harbor runner.

    Returns:
        ``(nop_reward, oracle_reward, job_dirs)``:
        - nop_reward: reward parsed from the NOP run (0 is the expected value).
        - oracle_reward: reward parsed from the Oracle run (1 is the expected value).
        - job_dirs: maps "nop"/"oracle" to the parent directory of that
          run's result path, or None when the run produced no result path.
    """
    # Arguments common to both runs.
    shared = {
        "task_id": task_id,
        "dataset_path": dataset_path,
        "jobs_dir": jobs_dir,
        "timeout_multiplier": timeout_multiplier,
        "capture_output": True,
        "environment": environment,
    }

    # NOP first, keeping the image for the Oracle run.
    _, nop_result = run_harbor_agent(agent="nop", delete_after=False, **shared)
    nop_reward = parse_harbor_outcome(nop_result).reward

    # Oracle second, cleaning the image up afterwards.
    _, oracle_result = run_harbor_agent(agent="oracle", delete_after=True, **shared)
    oracle_reward = parse_harbor_outcome(oracle_result).reward

    job_dirs: dict[str, Path | None] = {
        "nop": nop_result.parent if nop_result else None,
        "oracle": oracle_result.parent if oracle_result else None,
    }
    return nop_reward, oracle_reward, job_dirs
126
+
127
+
128
def check_validation_passed(nop_reward: int | None, oracle_reward: int | None) -> bool:
    """Return True only when NOP scored 0 and Oracle scored 1."""
    if nop_reward != 0:
        return False
    return oracle_reward == 1
131
+
132
+
133
+ # Re-export for convenience
134
# Public API of this module.
__all__ = [
    # High-level validation helpers defined here
    "ValidationError",
    "validate_task_structure",
    "run_nop_oracle",
    "check_validation_passed",
    # Low-level helpers re-exported from harbor_runner
    "run_harbor_agent",
    "parse_harbor_outcome",
]