swegen-0.1.0-py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their public registries.
- swegen/__init__.py +14 -0
- swegen/analyze/__init__.py +24 -0
- swegen/analyze/classifier.py +637 -0
- swegen/analyze/classify_prompt.txt +241 -0
- swegen/analyze/models.py +253 -0
- swegen/analyze/run.py +656 -0
- swegen/analyze/verdict_prompt.txt +126 -0
- swegen/cli.py +411 -0
- swegen/config.py +142 -0
- swegen/create/__init__.py +22 -0
- swegen/create/claude_code_runner.py +988 -0
- swegen/create/claude_code_utils.py +95 -0
- swegen/create/create.py +706 -0
- swegen/create/diff_utils.py +142 -0
- swegen/create/orchestrator.py +368 -0
- swegen/create/pr_fetcher.py +187 -0
- swegen/create/repo_cache.py +175 -0
- swegen/create/task_instruction.py +363 -0
- swegen/create/task_reference.py +130 -0
- swegen/create/task_skeleton.py +266 -0
- swegen/create/utils.py +350 -0
- swegen/farm/__init__.py +13 -0
- swegen/farm/farm_hand.py +342 -0
- swegen/farm/fetcher.py +341 -0
- swegen/farm/state.py +231 -0
- swegen/farm/stream_farm.py +430 -0
- swegen/tools/__init__.py +16 -0
- swegen/tools/harbor_runner.py +191 -0
- swegen/tools/validate.py +523 -0
- swegen/tools/validate_utils.py +142 -0
- swegen-0.1.0.dist-info/METADATA +292 -0
- swegen-0.1.0.dist-info/RECORD +35 -0
- swegen-0.1.0.dist-info/WHEEL +4 -0
- swegen-0.1.0.dist-info/entry_points.txt +3 -0
- swegen-0.1.0.dist-info/licenses/LICENSE +201 -0
swegen/analyze/run.py
ADDED
@@ -0,0 +1,656 @@
+from __future__ import annotations
+
+import os
+import subprocess
+import time
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any
+
+from harbor.models.trial.result import TrialResult
+from harbor.models.environment_type import EnvironmentType
+from rich.console import Console
+from rich.panel import Panel
+from rich.progress import BarColumn, Progress, SpinnerColumn, TaskProgressColumn, TextColumn
+from rich.table import Table
+
+from .models import (
+    BaselineValidation,
+    Classification,
+    TaskVerdict,
+    TrialClassification,
+)
+from .classifier import (
+    TrialClassifier,
+    classify_baseline_result,
+    compute_task_verdict,
+    write_trial_analysis_files,
+)
+from swegen.tools.harbor_runner import (
+    harbor_cmd_base,
+    parse_harbor_outcome,
+    run_harbor_agent,
+)
+
+
+def _setup_claude_auth_preference(console: Console) -> None:
+    """Setup Claude Code to prefer OAuth token over API key.
+
+    For Claude Code trials and classification, we prefer OAuth token:
+    1. CLAUDE_CODE_OAUTH_TOKEN (preferred - run 'claude setup-token')
+    2. ANTHROPIC_API_KEY (fallback)
+
+    Displays which authentication method is being used.
+    """
+    has_oauth = bool(os.getenv("CLAUDE_CODE_OAUTH_TOKEN"))
+    has_api_key = bool(os.getenv("ANTHROPIC_API_KEY"))
+
+    if has_oauth:
+        # Prefer OAuth - unset API key to ensure OAuth is used
+        if "ANTHROPIC_API_KEY" in os.environ:
+            os.environ.pop("ANTHROPIC_API_KEY")
+        console.print("[dim]🔐 Claude Code authentication: OAuth token (preferred)[/dim]")
+    elif has_api_key:
+        # Use API key - unset OAuth to ensure API key is used
+        if "CLAUDE_CODE_OAUTH_TOKEN" in os.environ:
+            os.environ.pop("CLAUDE_CODE_OAUTH_TOKEN")
+        console.print("[dim]🔐 Claude Code authentication: API key (fallback)[/dim]")
+        console.print("[dim]   Tip: For better security, use OAuth token ('claude setup-token')[/dim]")
+    else:
+        console.print("[yellow]⚠️  No Claude Code authentication configured[/yellow]")
+        console.print("[yellow]   Set CLAUDE_CODE_OAUTH_TOKEN (preferred) or ANTHROPIC_API_KEY[/yellow]")
+
+
+@dataclass
+class TrialOutcome:
+    """Result of a single trial (basic info before classification)."""
+
+    trial_name: str
+    trial_dir: Path
+    reward: float | None
+    exception_type: str | None
+    exception_message: str | None
+
+
+@dataclass
+class QualityCheckResult:
+    """Result of static quality check."""
+
+    passed: bool
+    issues: list[str] = field(default_factory=list)
+    details: dict[str, Any] = field(default_factory=dict)
+
+
+@dataclass
+class AnalysisResult:
+    """Complete analysis result for a task."""
+
+    task_id: str
+    task_path: Path
+
+    # Quality check
+    quality_check: QualityCheckResult | None
+
+    # Baseline validation
+    baseline: BaselineValidation | None
+
+    # Trial results
+    trials_run: int
+    success_rate: float
+    trial_outcomes: list[TrialOutcome]
+
+    # Classifications (NEW)
+    classifications: list[TrialClassification]
+
+    # Task verdict (NEW)
+    verdict: TaskVerdict
+
+    # Job directory
+    job_dir: Path | None
+
+
+@dataclass
+class AnalyzeArgs:
+    """Arguments for the analyze command."""
+
+    task_path: Path
+    agent: str = "claude-code"
+    model: str = "anthropic/claude-sonnet-4-5"
+    n_trials: int = 3
+    n_concurrent: int = 1  # Number of concurrent trials (matches Harbor's -n flag)
+    jobs_dir: Path = Path(".state/analyze-jobs")
+    skip_quality_check: bool = False
+    skip_baseline: bool = False  # Skip baseline validation (nop/oracle)
+    skip_classify: bool = False  # Skip Claude Code classification
+    analysis_model: str = "claude-sonnet-4-5"  # Model for Claude Code classification
+    environment: str = "docker"  # Environment type (docker|daytona|e2b|modal|runloop|gke)
+    verbose: bool = False
+    timeout_multiplier: float = 1.0
+    classification_timeout: int = 300  # Timeout per classification in seconds (5 min default)
+    verdict_timeout: int = 180  # Timeout for verdict synthesis in seconds (3 min default)
+    save_to_dir: bool = False  # Write trajectory-analysis.{md,json} to each trial dir
+
+
+def run_analyze(args: AnalyzeArgs) -> AnalysisResult:
+    """Main entry point for task analysis."""
+    console = Console()
+
+    # Resolve task path
+    task_path = args.task_path.resolve()
+    if not task_path.is_dir():
+        console.print(f"[red]Error: Task path does not exist: {task_path}[/red]")
+        raise SystemExit(1)
+
+    task_id = task_path.name
+    dataset_path = task_path.parent
+
+    # Check task structure
+    if not (task_path / "tests" / "test.sh").exists():
+        console.print(f"[red]Error: Not a valid task (missing tests/test.sh): {task_path}[/red]")
+        raise SystemExit(1)
+
+    # Setup and display Claude authentication for Claude Code agent
+    if args.agent == "claude-code":
+        _setup_claude_auth_preference(console)
+
+    console.print(
+        Panel.fit(
+            f"Agent: {args.agent} | Model: {args.model} | Trials: {args.n_trials}",
+            title=task_id,
+        )
+    )
+
+    # Run analysis steps
+    result = _run_analysis(args, task_id, task_path, dataset_path, console)
+
+    # Print final report
+    _print_report(result, console)
+
+    return result
+
+
+def _run_analysis(
+    args: AnalyzeArgs,
+    task_id: str,
+    task_path: Path,
+    dataset_path: Path,
+    console: Console,
+) -> AnalysisResult:
+    """Run all analysis steps."""
+
+    # Step 1: Static quality check
+    quality_check = None
+    if not args.skip_quality_check:
+        console.print("\n[bold blue]Step 1/4: Static Quality Check[/bold blue]")
+        quality_check = _run_quality_check(task_path, args.analysis_model, console)
+    else:
+        console.print("\n[dim]Step 1/4: Static Quality Check (skipped)[/dim]")
+
+    # Step 2: Baseline validation (necessary but not sufficient)
+    # Oracle/nop prove the task is technically solvable and requires changes,
+    # but they can't detect: underspecified instructions, overspecified tests,
+    # ambiguous requirements, or tests checking details not in instructions.
+    # That's what the trial classification step (Step 4) is for.
+    baseline = None
+    if not args.skip_baseline:
+        console.print("\n[bold blue]Step 2/4: Baseline Validation (nop/oracle)[/bold blue]")
+        baseline = _run_baseline_validation(args, task_id, dataset_path, console)
+    else:
+        console.print("\n[dim]Step 2/4: Baseline Validation (skipped)[/dim]")
+
+    # Step 3: Run agent trials
+    console.print(f"\n[bold blue]Step 3/4: Running {args.n_trials} Agent Trials[/bold blue]")
+    job_dir, trial_outcomes = _run_agent_trials(args, task_id, dataset_path, console)
+
+    successes = sum(1 for t in trial_outcomes if t.reward == 1)
+    failures = sum(1 for t in trial_outcomes if t.reward is not None and t.reward != 1)
+    errors = sum(1 for t in trial_outcomes if t.exception_type is not None)
+    success_rate = successes / len(trial_outcomes) if trial_outcomes else 0.0
+
+    console.print(f"  Results: {successes} passed, {failures} failed, {errors} errors")
+    console.print(f"  Success rate: {success_rate:.1%}")
+
+    # Step 4: Classify trials (detects issues baseline validation can't catch)
+    # Each trial is classified independently to identify:
+    # - Underspecified instructions (agent lacks critical details)
+    # - Overspecified/brittle tests (tests coupled to specific implementation)
+    # - Ambiguous requirements (multiple valid interpretations)
+    # - Tests checking for details not mentioned in instructions
+    # Then we aggregate across trials to detect systematic vs random issues.
+    classifications: list[TrialClassification] = []
+    if not args.skip_classify and trial_outcomes:
+        console.print("\n[bold blue]Step 4/4: Classifying Trial Outcomes[/bold blue]")
+
+        # Get trial directories for classification
+        trial_dirs = [t.trial_dir for t in trial_outcomes if t.trial_dir.exists()]
+
+        if trial_dirs:
+            classifier = TrialClassifier(
+                model=args.analysis_model,
+                verbose=args.verbose,
+                timeout=args.classification_timeout,
+            )
+            classifications = classifier.classify_trials_sync(trial_dirs, task_path, console)
+
+            # Write per-trial outputs if requested
+            if args.save_to_dir:
+                for classification in classifications:
+                    # Find the matching trial directory
+                    trial_dir = next(
+                        (t.trial_dir for t in trial_outcomes if t.trial_name == classification.trial_name),
+                        None
+                    )
+                    if trial_dir and trial_dir.exists():
+                        write_trial_analysis_files(
+                            trial_dir=trial_dir,
+                            classification=classification,
+                            task_id=task_id,
+                            agent=args.agent,
+                            model=args.model,
+                        )
+                        if args.verbose:
+                            console.print(f"  [dim]Wrote analysis to {trial_dir}/trajectory-analysis.*[/dim]")
+
+            # Show classification summary
+            task_problems = sum(1 for c in classifications if c.is_task_problem)
+            agent_problems = sum(1 for c in classifications if c.classification == Classification.GOOD_FAILURE)
+
+            if task_problems > 0:
+                console.print(f"  [yellow]⚠ {task_problems} trial(s) indicate task problems[/yellow]")
+            if agent_problems > 0:
+                console.print(f"  [green]✓ {agent_problems} trial(s) are normal agent failures[/green]")
+        else:
+            console.print("  [dim]No trial directories found to classify[/dim]")
+    else:
+        console.print("\n[dim]Step 4/4: Classifying Trial Outcomes (skipped)[/dim]")
+
+    # Compute task verdict (uses LLM synthesis)
+    quality_passed = quality_check is None or quality_check.passed
+    verdict = compute_task_verdict(
+        classifications,
+        baseline,
+        quality_passed,
+        model=args.analysis_model,
+        console=console,
+        verbose=args.verbose,
+        timeout=args.verdict_timeout,
+    )
+
+    return AnalysisResult(
+        task_id=task_id,
+        task_path=task_path,
+        quality_check=quality_check,
+        baseline=baseline,
+        trials_run=len(trial_outcomes),
+        success_rate=success_rate,
+        trial_outcomes=trial_outcomes,
+        classifications=classifications,
+        verdict=verdict,
+        job_dir=job_dir,
+    )
+
+
+def _run_quality_check(
+    task_path: Path,
+    model: str,
+    console: Console,
+) -> QualityCheckResult:
+    """Run Harbor's static quality check on the task."""
+    cmd = harbor_cmd_base() + [
+        "tasks",
+        "check",
+        str(task_path),
+        "-m",
+        model,
+    ]
+
+    with console.status("[cyan]Running quality check..."):
+        proc = subprocess.run(cmd, capture_output=True, text=True)
+
+    # Parse output to extract issues
+    issues = []
+    details: dict[str, Any] = {}
+
+    output = proc.stdout + proc.stderr
+
+    # Look for failed checks in output
+    fail_keywords = ["fail", "FAIL", "❌"]
+    for line in output.split("\n"):
+        for keyword in fail_keywords:
+            if keyword in line and "passed" not in line.lower():
+                clean_line = line.strip()
+                if clean_line and "│" in clean_line:
+                    parts = [p.strip() for p in clean_line.split("│")]
+                    if len(parts) >= 2 and any(k in parts[1].lower() for k in ["fail"]):
+                        issues.append(parts[0])
+
+    passed = proc.returncode == 0 and len(issues) == 0
+
+    if passed:
+        console.print("  [green]✓ Quality check passed[/green]")
+    else:
+        console.print("  [yellow]⚠ Quality check found issues:[/yellow]")
+        for issue in issues[:5]:
+            console.print(f"    - {issue}")
+
+    return QualityCheckResult(passed=passed, issues=issues, details=details)
+
+
+def _run_baseline_validation(
+    args: AnalyzeArgs,
+    task_id: str,
+    dataset_path: Path,
+    console: Console,
+) -> BaselineValidation:
+    """Run nop and oracle baseline agents to validate task correctness."""
+
+    jobs_parent = args.jobs_dir.resolve()
+    jobs_parent.mkdir(parents=True, exist_ok=True)
+
+    baseline = BaselineValidation()
+    env = EnvironmentType(args.environment)
+
+    # Run nop agent (should fail - reward=0)
+    console.print("  Running nop agent (should fail)...")
+    nop_code, nop_job = run_harbor_agent(
+        task_id,
+        dataset_path,
+        jobs_parent,
+        "nop",
+        args.timeout_multiplier,
+        capture_output=True,
+        # Keep image when we will immediately run oracle; oracle will cleanup.
+        delete_after=False,
+        environment=env,
+    )
+    nop_outcome = parse_harbor_outcome(nop_job)
+    nop_reward = nop_outcome.reward
+    nop_error = nop_outcome.error
+    if nop_error is None and nop_reward is None:
+        if nop_job is None:
+            nop_error = "No Harbor job result found"
+        elif nop_code != 0:
+            nop_error = f"Harbor exited with code {nop_code}"
+        else:
+            nop_error = "Could not parse reward from Harbor job result"
+    baseline.nop = classify_baseline_result("nop", nop_reward, nop_error)
+
+    if baseline.nop.is_expected:
+        console.print("  [green]✓ nop failed as expected[/green]")
+    else:
+        console.print("  [red]✗ CRITICAL: nop passed - task may be pre-solved![/red]")
+
+    # Run oracle agent (should pass - reward=1)
+    console.print("  Running oracle agent (should pass)...")
+    oracle_code, oracle_job = run_harbor_agent(
+        task_id,
+        dataset_path,
+        jobs_parent,
+        "oracle",
+        args.timeout_multiplier,
+        capture_output=True,
+        delete_after=True,
+        environment=env,
+    )
+    oracle_outcome = parse_harbor_outcome(oracle_job)
+    oracle_reward = oracle_outcome.reward
+    oracle_error = oracle_outcome.error
+    if oracle_error is None and oracle_reward is None:
+        if oracle_job is None:
+            oracle_error = "No Harbor job result found"
+        elif oracle_code != 0:
+            oracle_error = f"Harbor exited with code {oracle_code}"
+        else:
+            oracle_error = "Could not parse reward from Harbor job result"
+    baseline.oracle = classify_baseline_result("oracle", oracle_reward, oracle_error)
+
+    if baseline.oracle.is_expected:
+        console.print("  [green]✓ oracle passed as expected[/green]")
+    else:
+        console.print("  [red]✗ CRITICAL: oracle failed - reference solution broken![/red]")
+
+    return baseline
+
+
+def _run_agent_trials(
+    args: AnalyzeArgs,
+    task_id: str,
+    dataset_path: Path,
+    console: Console,
+) -> tuple[Path | None, list[TrialOutcome]]:
+    """Run multiple agent trials on the task."""
+
+    _timestamp = int(time.time())
+    jobs_parent = args.jobs_dir.resolve()
+    jobs_parent.mkdir(parents=True, exist_ok=True)
+    unique_parent = jobs_parent / f"{task_id}.{args.agent}.{_timestamp}"
+    unique_parent.mkdir(parents=True, exist_ok=True)
+    before = set(unique_parent.iterdir())
+
+    cmd = harbor_cmd_base() + [
+        "run",
+        "-p", str(dataset_path),
+        "-t", task_id,
+        "-a", args.agent,
+        "-m", args.model,
+        "-k", str(args.n_trials),
+        "-n", str(args.n_concurrent),  # Matches Harbor's -n flag
+        "-e", args.environment,
+        "--jobs-dir", str(unique_parent),
+        "--timeout-multiplier", str(args.timeout_multiplier),
+    ]
+
+    with Progress(
+        SpinnerColumn(),
+        TextColumn("[progress.description]{task.description}"),
+        BarColumn(),
+        TaskProgressColumn(),
+        console=console,
+    ) as progress:
+        concurrent_msg = f" ({args.n_concurrent} concurrent)" if args.n_concurrent > 1 else ""
+        task = progress.add_task(
+            f"[cyan]Running {args.n_trials} trials with {args.agent}{concurrent_msg}...", total=None
+        )
+
+        _proc = subprocess.run(cmd, capture_output=True, text=True)
+        progress.update(task, completed=True)
+
+    # Find the job directory that was created inside unique_parent
+    after = set(unique_parent.iterdir()) if unique_parent.exists() else set()
+    new_dirs = [p for p in (after - before) if p.is_dir()]
+    job_dirs = sorted(new_dirs, key=lambda p: p.stat().st_mtime, reverse=True)
+    job_dir = job_dirs[0] if job_dirs else None
+
+    # Parse trial results
+    trial_outcomes = []
+    if job_dir:
+        trial_outcomes = _parse_trial_results(job_dir)
+
+    return job_dir, trial_outcomes
+
+
+def _parse_trial_results(job_dir: Path) -> list[TrialOutcome]:
+    """Parse trial results from a job directory."""
+    outcomes = []
+
+    for trial_dir in job_dir.iterdir():
+        if not trial_dir.is_dir():
+            continue
+
+        result_path = trial_dir / "result.json"
+        if not result_path.exists():
+            continue
+
+        try:
+            result = TrialResult.model_validate_json(result_path.read_text())
+
+            reward = None
+            if result.verifier_result and result.verifier_result.rewards:
+                reward = result.verifier_result.rewards.get("reward")
+
+            exception_type = None
+            exception_message = None
+            if result.exception_info:
+                exception_type = result.exception_info.exception_type
+                exception_message = result.exception_info.exception_message
+
+            outcomes.append(
+                TrialOutcome(
+                    trial_name=result.trial_name,
+                    trial_dir=trial_dir,
+                    reward=reward,
+                    exception_type=exception_type,
+                    exception_message=exception_message,
+                )
+            )
+        except Exception as e:
+            console = Console()
+            console.print(f"[dim]Warning: Could not parse {result_path}: {e}[/dim]")
+
+    return outcomes
+
+
+def _print_report(result: AnalysisResult, console: Console) -> None:
+    """Print the final analysis report."""
+    console.print("\n")
+
+    # Overall verdict
+    verdict = result.verdict
+    if verdict.is_good:
+        verdict_style = "bold green"
+        verdict_icon = "✅"
+        verdict_text = f"GOOD TASK (confidence: {verdict.confidence})"
+    else:
+        verdict_style = "bold red"
+        verdict_icon = "❌"
+        verdict_text = "NEEDS REVIEW"
+
+    console.print(
+        Panel.fit(
+            f"[{verdict_style}]{verdict_icon} {verdict_text}[/{verdict_style}]",
+            title=f"Task Verdict: {result.task_id}",
+        )
+    )
+
+    # Summary table
+    table = Table(show_header=True, header_style="bold")
+    table.add_column("Check", style="cyan")
+    table.add_column("Result")
+    table.add_column("Details")
+
+    # Quality check row
+    if result.quality_check:
+        qc_status = (
+            "✅ Passed"
+            if result.quality_check.passed
+            else f"⚠️ {len(result.quality_check.issues)} issues"
+        )
+        qc_style = "green" if result.quality_check.passed else "yellow"
+        table.add_row(
+            "Quality Check",
+            f"[{qc_style}]{qc_status}[/{qc_style}]",
+            ", ".join(result.quality_check.issues[:3])
+            if result.quality_check.issues
+            else "All checks passed",
+        )
+
+    # Baseline validation row
+    if result.baseline:
+        baseline_ok = result.baseline.is_valid
+        if baseline_ok:
+            baseline_status = "✅ Valid"
+            baseline_style = "green"
+            baseline_details = "nop fails, oracle passes"
+        else:
+            baseline_status = "❌ Invalid"
+            baseline_style = "red"
+            baseline_details = "; ".join(result.baseline.issues)
+        table.add_row(
+            "Baseline (nop/oracle)",
+            f"[{baseline_style}]{baseline_status}[/{baseline_style}]",
+            baseline_details,
+        )
+
+    # Trials row
+    trials_status = f"{result.success_rate:.0%} success rate"
+    if result.success_rate >= 0.67:
+        trials_style = "green"
+        trials_icon = "✅"
+    elif result.success_rate >= 0.33:
+        trials_style = "yellow"
+        trials_icon = "⚠️"
+    else:
+        trials_style = "red"
+        trials_icon = "❌"
+
+    successes = sum(1 for t in result.trial_outcomes if t.reward == 1)
+    failures = sum(1 for t in result.trial_outcomes if t.reward is not None and t.reward != 1)
+    errors = sum(1 for t in result.trial_outcomes if t.exception_type)
+
+    table.add_row(
+        f"Agent Trials ({result.trials_run})",
+        f"[{trials_style}]{trials_icon} {trials_status}[/{trials_style}]",
+        f"{successes} passed, {failures} failed, {errors} errors",
+    )
+
+    # Classification summary row
+    if result.classifications:
+        task_problems = verdict.task_problem_count
+        agent_problems = verdict.agent_problem_count
+
+        if task_problems > 0:
+            class_status = f"⚠️ {task_problems} task problem(s)"
+            class_style = "yellow"
+        else:
+            class_status = f"✅ {agent_problems} agent failure(s)"
+            class_style = "green"
+
+        table.add_row(
+            "Classification",
+            f"[{class_style}]{class_status}[/{class_style}]",
+            f"{verdict.success_count} success, {task_problems} task issue, {agent_problems} agent issue",
+        )
+
+    console.print(table)
+
+    # Show classification details
+    if result.classifications:
+        console.print("\n[bold]Trial Classifications:[/bold]")
+
+        for c in result.classifications:
+            # Color based on classification
+            if c.classification == Classification.GOOD_SUCCESS:
+                icon = "✅"
+                style = "green"
+            elif c.classification == Classification.GOOD_FAILURE:
+                icon = "⚪"
+                style = "dim"
+            elif c.classification == Classification.BAD_SUCCESS:
+                icon = "🔴"
+                style = "red"
+            elif c.classification == Classification.BAD_FAILURE:
+                icon = "🟡"
+                style = "yellow"
+            else:
+                icon = "⚫"
+                style = "dim"
+
+            console.print(f"\n  [{style}]{icon} {c.trial_name}: {c.classification.value} - {c.subtype}[/{style}]")
+            console.print(f"    [dim]Evidence:[/dim] {c.evidence}")
+            console.print(f"    [dim]Root cause:[/dim] {c.root_cause}")
+            if c.is_task_problem and c.recommendation != "N/A - task is fine":
+                console.print(f"      [yellow]Recommendation:[/yellow] {c.recommendation}")
+
+    # Show recommendations
+    if verdict.recommendations:
+        console.print("\n[bold yellow]Recommendations to Fix Task:[/bold yellow]")
+        for i, rec in enumerate(verdict.recommendations, 1):
+            console.print(f"  {i}. {rec}")
+
+    # Primary issue
+    if verdict.primary_issue:
+        console.print(f"\n[bold]Primary Issue:[/bold] {verdict.primary_issue}")
+
+    # Job directory
+    if result.job_dir:
+        console.print(f"\n[dim]Job artifacts: {result.job_dir}[/dim]")