swegen 0.1.0__py3-none-any.whl

swegen/analyze/run.py ADDED
@@ -0,0 +1,656 @@
+ from __future__ import annotations
+
+ import os
+ import subprocess
+ import time
+ from dataclasses import dataclass, field
+ from pathlib import Path
+ from typing import Any
+
+ from harbor.models.trial.result import TrialResult
+ from harbor.models.environment_type import EnvironmentType
+ from rich.console import Console
+ from rich.panel import Panel
+ from rich.progress import BarColumn, Progress, SpinnerColumn, TaskProgressColumn, TextColumn
+ from rich.table import Table
+
+ from .models import (
+     BaselineValidation,
+     Classification,
+     TaskVerdict,
+     TrialClassification,
+ )
+ from .classifier import (
+     TrialClassifier,
+     classify_baseline_result,
+     compute_task_verdict,
+     write_trial_analysis_files,
+ )
+ from swegen.tools.harbor_runner import (
+     harbor_cmd_base,
+     parse_harbor_outcome,
+     run_harbor_agent,
+ )
+
+
+ def _setup_claude_auth_preference(console: Console) -> None:
+     """Set up Claude Code to prefer the OAuth token over the API key.
+
+     For Claude Code trials and classification, we prefer the OAuth token:
+     1. CLAUDE_CODE_OAUTH_TOKEN (preferred - run 'claude setup-token')
+     2. ANTHROPIC_API_KEY (fallback)
+
+     Displays which authentication method is being used.
+     """
+     has_oauth = bool(os.getenv("CLAUDE_CODE_OAUTH_TOKEN"))
+     has_api_key = bool(os.getenv("ANTHROPIC_API_KEY"))
+
+     if has_oauth:
+         # Prefer OAuth - unset the API key to ensure OAuth is used
+         if "ANTHROPIC_API_KEY" in os.environ:
+             os.environ.pop("ANTHROPIC_API_KEY")
+         console.print("[dim]🔐 Claude Code authentication: OAuth token (preferred)[/dim]")
+     elif has_api_key:
+         # Use the API key - unset OAuth to ensure the API key is used
+         if "CLAUDE_CODE_OAUTH_TOKEN" in os.environ:
+             os.environ.pop("CLAUDE_CODE_OAUTH_TOKEN")
+         console.print("[dim]🔐 Claude Code authentication: API key (fallback)[/dim]")
+         console.print("[dim] Tip: For better security, use OAuth token ('claude setup-token')[/dim]")
+     else:
+         console.print("[yellow]⚠️ No Claude Code authentication configured[/yellow]")
+         console.print("[yellow] Set CLAUDE_CODE_OAUTH_TOKEN (preferred) or ANTHROPIC_API_KEY[/yellow]")
+
+
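+ # Illustrative use of _setup_claude_auth_preference (not part of the module's
+ # public API): when both variables are set, the OAuth token wins and the API
+ # key is dropped from this process's environment. Token values are placeholders.
+ #
+ #     os.environ["CLAUDE_CODE_OAUTH_TOKEN"] = "<oauth-token>"
+ #     os.environ["ANTHROPIC_API_KEY"] = "<api-key>"
+ #     _setup_claude_auth_preference(Console())  # keeps only CLAUDE_CODE_OAUTH_TOKEN
+
+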
+ @dataclass
+ class TrialOutcome:
+     """Result of a single trial (basic info before classification)."""
+
+     trial_name: str
+     trial_dir: Path
+     reward: float | None
+     exception_type: str | None
+     exception_message: str | None
+
+
+ @dataclass
+ class QualityCheckResult:
+     """Result of static quality check."""
+
+     passed: bool
+     issues: list[str] = field(default_factory=list)
+     details: dict[str, Any] = field(default_factory=dict)
+
+
+ @dataclass
+ class AnalysisResult:
+     """Complete analysis result for a task."""
+
+     task_id: str
+     task_path: Path
+
+     # Quality check
+     quality_check: QualityCheckResult | None
+
+     # Baseline validation
+     baseline: BaselineValidation | None
+
+     # Trial results
+     trials_run: int
+     success_rate: float
+     trial_outcomes: list[TrialOutcome]
+
+     # Classifications (NEW)
+     classifications: list[TrialClassification]
+
+     # Task verdict (NEW)
+     verdict: TaskVerdict
+
+     # Job directory
+     job_dir: Path | None
+
+
+ @dataclass
+ class AnalyzeArgs:
+     """Arguments for the analyze command."""
+
+     task_path: Path
+     agent: str = "claude-code"
+     model: str = "anthropic/claude-sonnet-4-5"
+     n_trials: int = 3
+     n_concurrent: int = 1  # Number of concurrent trials (matches Harbor's -n flag)
+     jobs_dir: Path = Path(".state/analyze-jobs")
+     skip_quality_check: bool = False
+     skip_baseline: bool = False  # Skip baseline validation (nop/oracle)
+     skip_classify: bool = False  # Skip Claude Code classification
+     analysis_model: str = "claude-sonnet-4-5"  # Model for Claude Code classification
+     environment: str = "docker"  # Environment type (docker|daytona|e2b|modal|runloop|gke)
+     verbose: bool = False
+     timeout_multiplier: float = 1.0
+     classification_timeout: int = 300  # Timeout per classification in seconds (5 min default)
+     verdict_timeout: int = 180  # Timeout for verdict synthesis in seconds (3 min default)
+     save_to_dir: bool = False  # Write trajectory-analysis.{md,json} to each trial dir
+
+
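+ # Example invocation (illustrative; only task_path is required, every other
+ # field above has a default, and "tasks/my-task" is a placeholder path):
+ #
+ #     result = run_analyze(AnalyzeArgs(task_path=Path("tasks/my-task")))
+ #     print(result.verdict.is_good, result.success_rate)
+
+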
+ def run_analyze(args: AnalyzeArgs) -> AnalysisResult:
+     """Main entry point for task analysis."""
+     console = Console()
+
+     # Resolve task path
+     task_path = args.task_path.resolve()
+     if not task_path.is_dir():
+         console.print(f"[red]Error: Task path does not exist or is not a directory: {task_path}[/red]")
+         raise SystemExit(1)
+
+     task_id = task_path.name
+     dataset_path = task_path.parent
+
+     # Check task structure
+     if not (task_path / "tests" / "test.sh").exists():
+         console.print(f"[red]Error: Not a valid task (missing tests/test.sh): {task_path}[/red]")
+         raise SystemExit(1)
+
+     # Set up and display Claude authentication for the Claude Code agent
+     if args.agent == "claude-code":
+         _setup_claude_auth_preference(console)
+
+     console.print(
+         Panel.fit(
+             f"Agent: {args.agent} | Model: {args.model} | Trials: {args.n_trials}",
+             title=task_id,
+         )
+     )
+
+     # Run analysis steps
+     result = _run_analysis(args, task_id, task_path, dataset_path, console)
+
+     # Print final report
+     _print_report(result, console)
+
+     return result
+
+
+ def _run_analysis(
+     args: AnalyzeArgs,
+     task_id: str,
+     task_path: Path,
+     dataset_path: Path,
+     console: Console,
+ ) -> AnalysisResult:
+     """Run all analysis steps."""
+
+     # Step 1: Static quality check
+     quality_check = None
+     if not args.skip_quality_check:
+         console.print("\n[bold blue]Step 1/4: Static Quality Check[/bold blue]")
+         quality_check = _run_quality_check(task_path, args.analysis_model, console)
+     else:
+         console.print("\n[dim]Step 1/4: Static Quality Check (skipped)[/dim]")
+
+     # Step 2: Baseline validation (necessary but not sufficient).
+     # Oracle/nop prove the task is technically solvable and requires changes,
+     # but they can't detect: underspecified instructions, overspecified tests,
+     # ambiguous requirements, or tests checking details not in the instructions.
+     # That's what the trial classification step (Step 4) is for.
+     baseline = None
+     if not args.skip_baseline:
+         console.print("\n[bold blue]Step 2/4: Baseline Validation (nop/oracle)[/bold blue]")
+         baseline = _run_baseline_validation(args, task_id, dataset_path, console)
+     else:
+         console.print("\n[dim]Step 2/4: Baseline Validation (skipped)[/dim]")
+
+     # Step 3: Run agent trials
+     console.print(f"\n[bold blue]Step 3/4: Running {args.n_trials} Agent Trials[/bold blue]")
+     job_dir, trial_outcomes = _run_agent_trials(args, task_id, dataset_path, console)
+
+     successes = sum(1 for t in trial_outcomes if t.reward == 1)
+     failures = sum(1 for t in trial_outcomes if t.reward is not None and t.reward != 1)
+     errors = sum(1 for t in trial_outcomes if t.exception_type is not None)
+     success_rate = successes / len(trial_outcomes) if trial_outcomes else 0.0
+
+     console.print(f" Results: {successes} passed, {failures} failed, {errors} errors")
+     console.print(f" Success rate: {success_rate:.1%}")
+
+     # Step 4: Classify trials (detects issues baseline validation can't catch).
+     # Each trial is classified independently to identify:
+     # - Underspecified instructions (agent lacks critical details)
+     # - Overspecified/brittle tests (tests coupled to a specific implementation)
+     # - Ambiguous requirements (multiple valid interpretations)
+     # - Tests checking for details not mentioned in the instructions
+     # Then we aggregate across trials to separate systematic from random issues.
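+     # A plausible reading of the four Classification buckets, inferred from the
+     # report rendering in _print_report (the authoritative definitions live in
+     # .models; illustrative only):
+     #     GOOD_SUCCESS - agent solved a well-posed task
+     #     GOOD_FAILURE - agent failed, but the task itself looks sound
+     #     BAD_SUCCESS  - trial passed despite a task problem
+     #     BAD_FAILURE  - trial failed because of a task problem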
+     classifications: list[TrialClassification] = []
+     if not args.skip_classify and trial_outcomes:
+         console.print("\n[bold blue]Step 4/4: Classifying Trial Outcomes[/bold blue]")
+
+         # Get trial directories for classification
+         trial_dirs = [t.trial_dir for t in trial_outcomes if t.trial_dir.exists()]
+
+         if trial_dirs:
+             classifier = TrialClassifier(
+                 model=args.analysis_model,
+                 verbose=args.verbose,
+                 timeout=args.classification_timeout,
+             )
+             classifications = classifier.classify_trials_sync(trial_dirs, task_path, console)
+
+             # Write per-trial outputs if requested
+             if args.save_to_dir:
+                 for classification in classifications:
+                     # Find the matching trial directory
+                     trial_dir = next(
+                         (t.trial_dir for t in trial_outcomes if t.trial_name == classification.trial_name),
+                         None
+                     )
+                     if trial_dir and trial_dir.exists():
+                         write_trial_analysis_files(
+                             trial_dir=trial_dir,
+                             classification=classification,
+                             task_id=task_id,
+                             agent=args.agent,
+                             model=args.model,
+                         )
+                         if args.verbose:
+                             console.print(f" [dim]Wrote analysis to {trial_dir}/trajectory-analysis.*[/dim]")
+
+             # Show classification summary
+             task_problems = sum(1 for c in classifications if c.is_task_problem)
+             agent_problems = sum(1 for c in classifications if c.classification == Classification.GOOD_FAILURE)
+
+             if task_problems > 0:
+                 console.print(f" [yellow]⚠ {task_problems} trial(s) indicate task problems[/yellow]")
+             if agent_problems > 0:
+                 console.print(f" [green]✓ {agent_problems} trial(s) are normal agent failures[/green]")
+         else:
+             console.print(" [dim]No trial directories found to classify[/dim]")
+     else:
+         console.print("\n[dim]Step 4/4: Classifying Trial Outcomes (skipped)[/dim]")
+
+     # Compute task verdict (uses LLM synthesis)
+     quality_passed = quality_check is None or quality_check.passed
+     verdict = compute_task_verdict(
+         classifications,
+         baseline,
+         quality_passed,
+         model=args.analysis_model,
+         console=console,
+         verbose=args.verbose,
+         timeout=args.verdict_timeout,
+     )
+
+     return AnalysisResult(
+         task_id=task_id,
+         task_path=task_path,
+         quality_check=quality_check,
+         baseline=baseline,
+         trials_run=len(trial_outcomes),
+         success_rate=success_rate,
+         trial_outcomes=trial_outcomes,
+         classifications=classifications,
+         verdict=verdict,
+         job_dir=job_dir,
+     )
+
+
+ def _run_quality_check(
+     task_path: Path,
+     model: str,
+     console: Console,
+ ) -> QualityCheckResult:
+     """Run Harbor's static quality check on the task."""
+     cmd = harbor_cmd_base() + [
+         "tasks",
+         "check",
+         str(task_path),
+         "-m",
+         model,
+     ]
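+     # For illustration, if harbor_cmd_base() resolves to ["harbor"] (an
+     # assumption; see swegen.tools.harbor_runner), the assembled command is
+     # roughly: harbor tasks check <task_path> -m <model>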
+
+     with console.status("[cyan]Running quality check..."):
+         proc = subprocess.run(cmd, capture_output=True, text=True)
+
+     # Parse output to extract issues
+     issues = []
+     details: dict[str, Any] = {}
+
+     output = proc.stdout + proc.stderr
+
+     # Look for failed checks in the output. Harbor renders results as a Rich
+     # table, so a failing row looks roughly like "<check name> │ FAIL │ ..."
+     # (illustrative); split on the box-drawing separator to recover the name.
+     fail_keywords = ["fail", "FAIL", "❌"]
+     for line in output.split("\n"):
+         for keyword in fail_keywords:
+             if keyword in line and "passed" not in line.lower():
+                 clean_line = line.strip()
+                 if clean_line and "│" in clean_line:
+                     parts = [p.strip() for p in clean_line.split("│")]
+                     if len(parts) >= 2 and "fail" in parts[1].lower():
+                         issues.append(parts[0])
+                         break  # avoid duplicate entries when several keywords match one line
+
+     passed = proc.returncode == 0 and len(issues) == 0
+
+     if passed:
+         console.print(" [green]✓ Quality check passed[/green]")
+     else:
+         console.print(" [yellow]⚠ Quality check found issues:[/yellow]")
+         for issue in issues[:5]:
+             console.print(f" - {issue}")
+
+     return QualityCheckResult(passed=passed, issues=issues, details=details)
+
+
+ def _run_baseline_validation(
+     args: AnalyzeArgs,
+     task_id: str,
+     dataset_path: Path,
+     console: Console,
+ ) -> BaselineValidation:
+     """Run nop and oracle baseline agents to validate task correctness."""
+
+     jobs_parent = args.jobs_dir.resolve()
+     jobs_parent.mkdir(parents=True, exist_ok=True)
+
+     baseline = BaselineValidation()
+     env = EnvironmentType(args.environment)
+
+     # Run the nop agent (should fail - reward=0)
+     console.print(" Running nop agent (should fail)...")
+     nop_code, nop_job = run_harbor_agent(
+         task_id,
+         dataset_path,
+         jobs_parent,
+         "nop",
+         args.timeout_multiplier,
+         capture_output=True,
+         # Keep the image since we will immediately run oracle; the oracle run
+         # will clean up.
+         delete_after=False,
+         environment=env,
+     )
+     nop_outcome = parse_harbor_outcome(nop_job)
+     nop_reward = nop_outcome.reward
+     nop_error = nop_outcome.error
+     if nop_error is None and nop_reward is None:
+         if nop_job is None:
+             nop_error = "No Harbor job result found"
+         elif nop_code != 0:
+             nop_error = f"Harbor exited with code {nop_code}"
+         else:
+             nop_error = "Could not parse reward from Harbor job result"
+     baseline.nop = classify_baseline_result("nop", nop_reward, nop_error)
+
+     if baseline.nop.is_expected:
+         console.print(" [green]✓ nop failed as expected[/green]")
+     else:
+         console.print(" [red]✗ CRITICAL: nop passed - task may be pre-solved![/red]")
+
+     # Run the oracle agent (should pass - reward=1)
+     console.print(" Running oracle agent (should pass)...")
+     oracle_code, oracle_job = run_harbor_agent(
+         task_id,
+         dataset_path,
+         jobs_parent,
+         "oracle",
+         args.timeout_multiplier,
+         capture_output=True,
+         delete_after=True,
+         environment=env,
+     )
+     oracle_outcome = parse_harbor_outcome(oracle_job)
+     oracle_reward = oracle_outcome.reward
+     oracle_error = oracle_outcome.error
+     if oracle_error is None and oracle_reward is None:
+         if oracle_job is None:
+             oracle_error = "No Harbor job result found"
+         elif oracle_code != 0:
+             oracle_error = f"Harbor exited with code {oracle_code}"
+         else:
+             oracle_error = "Could not parse reward from Harbor job result"
+     baseline.oracle = classify_baseline_result("oracle", oracle_reward, oracle_error)
+
+     if baseline.oracle.is_expected:
+         console.print(" [green]✓ oracle passed as expected[/green]")
+     else:
+         console.print(" [red]✗ CRITICAL: oracle failed - reference solution broken![/red]")
+
+     return baseline
+
+
+ def _run_agent_trials(
+     args: AnalyzeArgs,
+     task_id: str,
+     dataset_path: Path,
+     console: Console,
+ ) -> tuple[Path | None, list[TrialOutcome]]:
+     """Run multiple agent trials on the task."""
+
+     _timestamp = int(time.time())
+     jobs_parent = args.jobs_dir.resolve()
+     jobs_parent.mkdir(parents=True, exist_ok=True)
+     unique_parent = jobs_parent / f"{task_id}.{args.agent}.{_timestamp}"
+     unique_parent.mkdir(parents=True, exist_ok=True)
+     before = set(unique_parent.iterdir())
+
+     cmd = harbor_cmd_base() + [
+         "run",
+         "-p", str(dataset_path),
+         "-t", task_id,
+         "-a", args.agent,
+         "-m", args.model,
+         "-k", str(args.n_trials),
+         "-n", str(args.n_concurrent),  # Matches Harbor's -n flag
+         "-e", args.environment,
+         "--jobs-dir", str(unique_parent),
+         "--timeout-multiplier", str(args.timeout_multiplier),
+     ]
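+     # For illustration, with the defaults above and harbor_cmd_base() resolving
+     # to ["harbor"] (an assumption), this runs roughly:
+     #     harbor run -p <dataset> -t <task_id> -a claude-code \
+     #         -m anthropic/claude-sonnet-4-5 -k 3 -n 1 -e docker \
+     #         --jobs-dir <unique_parent> --timeout-multiplier 1.0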
+
+     with Progress(
+         SpinnerColumn(),
+         TextColumn("[progress.description]{task.description}"),
+         BarColumn(),
+         TaskProgressColumn(),
+         console=console,
+     ) as progress:
+         concurrent_msg = f" ({args.n_concurrent} concurrent)" if args.n_concurrent > 1 else ""
+         task = progress.add_task(
+             f"[cyan]Running {args.n_trials} trials with {args.agent}{concurrent_msg}...", total=None
+         )
+
+         _proc = subprocess.run(cmd, capture_output=True, text=True)
+         # Mark the indeterminate task (total=None) as finished; completed=True
+         # would only set the counter to 1 without ever filling the bar.
+         progress.update(task, total=1, completed=1)
+
+     # Find the job directory that was created inside unique_parent
+     after = set(unique_parent.iterdir()) if unique_parent.exists() else set()
+     new_dirs = [p for p in (after - before) if p.is_dir()]
+     job_dirs = sorted(new_dirs, key=lambda p: p.stat().st_mtime, reverse=True)
+     job_dir = job_dirs[0] if job_dirs else None
+
+     # Parse trial results
+     trial_outcomes = []
+     if job_dir:
+         trial_outcomes = _parse_trial_results(job_dir)
+
+     return job_dir, trial_outcomes
+
+
+ def _parse_trial_results(job_dir: Path) -> list[TrialOutcome]:
+     """Parse trial results from a job directory."""
+     outcomes = []
+
+     for trial_dir in job_dir.iterdir():
+         if not trial_dir.is_dir():
+             continue
+
+         result_path = trial_dir / "result.json"
+         if not result_path.exists():
+             continue
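+         # Minimal result.json shape implied by the parsing below; the full
+         # harbor TrialResult schema may carry more fields (illustrative only):
+         #     {"trial_name": "...",
+         #      "verifier_result": {"rewards": {"reward": 1.0}},
+         #      "exception_info": null}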
+
+         try:
+             result = TrialResult.model_validate_json(result_path.read_text())
+
+             reward = None
+             if result.verifier_result and result.verifier_result.rewards:
+                 reward = result.verifier_result.rewards.get("reward")
+
+             exception_type = None
+             exception_message = None
+             if result.exception_info:
+                 exception_type = result.exception_info.exception_type
+                 exception_message = result.exception_info.exception_message
+
+             outcomes.append(
+                 TrialOutcome(
+                     trial_name=result.trial_name,
+                     trial_dir=trial_dir,
+                     reward=reward,
+                     exception_type=exception_type,
+                     exception_message=exception_message,
+                 )
+             )
+         except Exception as e:
+             console = Console()
+             console.print(f"[dim]Warning: Could not parse {result_path}: {e}[/dim]")
+
+     return outcomes
+
+
+ def _print_report(result: AnalysisResult, console: Console) -> None:
+     """Print the final analysis report."""
+     console.print("\n")
+
+     # Overall verdict
+     verdict = result.verdict
+     if verdict.is_good:
+         verdict_style = "bold green"
+         verdict_icon = "✅"
+         verdict_text = f"GOOD TASK (confidence: {verdict.confidence})"
+     else:
+         verdict_style = "bold red"
+         verdict_icon = "❌"
+         verdict_text = "NEEDS REVIEW"
+
+     console.print(
+         Panel.fit(
+             f"[{verdict_style}]{verdict_icon} {verdict_text}[/{verdict_style}]",
+             title=f"Task Verdict: {result.task_id}",
+         )
+     )
+
+     # Summary table
+     table = Table(show_header=True, header_style="bold")
+     table.add_column("Check", style="cyan")
+     table.add_column("Result")
+     table.add_column("Details")
+
+     # Quality check row
+     if result.quality_check:
+         qc_status = (
+             "✅ Passed"
+             if result.quality_check.passed
+             else f"⚠️ {len(result.quality_check.issues)} issues"
+         )
+         qc_style = "green" if result.quality_check.passed else "yellow"
+         table.add_row(
+             "Quality Check",
+             f"[{qc_style}]{qc_status}[/{qc_style}]",
+             ", ".join(result.quality_check.issues[:3])
+             if result.quality_check.issues
+             else "All checks passed",
+         )
+
+     # Baseline validation row
+     if result.baseline:
+         baseline_ok = result.baseline.is_valid
+         if baseline_ok:
+             baseline_status = "✅ Valid"
+             baseline_style = "green"
+             baseline_details = "nop fails, oracle passes"
+         else:
+             baseline_status = "❌ Invalid"
+             baseline_style = "red"
+             baseline_details = "; ".join(result.baseline.issues)
+         table.add_row(
+             "Baseline (nop/oracle)",
+             f"[{baseline_style}]{baseline_status}[/{baseline_style}]",
+             baseline_details,
+         )
+
+     # Trials row
+     trials_status = f"{result.success_rate:.0%} success rate"
+     if result.success_rate >= 0.67:
+         trials_style = "green"
+         trials_icon = "✅"
+     elif result.success_rate >= 0.33:
+         trials_style = "yellow"
+         trials_icon = "⚠️"
+     else:
+         trials_style = "red"
+         trials_icon = "❌"
+
+     successes = sum(1 for t in result.trial_outcomes if t.reward == 1)
+     failures = sum(1 for t in result.trial_outcomes if t.reward is not None and t.reward != 1)
+     errors = sum(1 for t in result.trial_outcomes if t.exception_type)
+
+     table.add_row(
+         f"Agent Trials ({result.trials_run})",
+         f"[{trials_style}]{trials_icon} {trials_status}[/{trials_style}]",
+         f"{successes} passed, {failures} failed, {errors} errors",
+     )
+
+     # Classification summary row
+     if result.classifications:
+         task_problems = verdict.task_problem_count
+         agent_problems = verdict.agent_problem_count
+
+         if task_problems > 0:
+             class_status = f"⚠️ {task_problems} task problem(s)"
+             class_style = "yellow"
+         else:
+             class_status = f"✅ {agent_problems} agent failure(s)"
+             class_style = "green"
+
+         table.add_row(
+             "Classification",
+             f"[{class_style}]{class_status}[/{class_style}]",
+             f"{verdict.success_count} success, {task_problems} task issue, {agent_problems} agent issue",
+         )
+
+     console.print(table)
+
+     # Show classification details
+     if result.classifications:
+         console.print("\n[bold]Trial Classifications:[/bold]")
+
+         for c in result.classifications:
+             # Color based on classification
+             if c.classification == Classification.GOOD_SUCCESS:
+                 icon = "✅"
+                 style = "green"
+             elif c.classification == Classification.GOOD_FAILURE:
+                 icon = "⚪"
+                 style = "dim"
+             elif c.classification == Classification.BAD_SUCCESS:
+                 icon = "🔴"
+                 style = "red"
+             elif c.classification == Classification.BAD_FAILURE:
+                 icon = "🟡"
+                 style = "yellow"
+             else:
+                 icon = "⚫"
+                 style = "dim"
+
+             console.print(f"\n [{style}]{icon} {c.trial_name}: {c.classification.value} - {c.subtype}[/{style}]")
+             console.print(f" [dim]Evidence:[/dim] {c.evidence}")
+             console.print(f" [dim]Root cause:[/dim] {c.root_cause}")
+             if c.is_task_problem and c.recommendation != "N/A - task is fine":
+                 console.print(f" [yellow]Recommendation:[/yellow] {c.recommendation}")
+
+     # Show recommendations
+     if verdict.recommendations:
+         console.print("\n[bold yellow]Recommendations to Fix Task:[/bold yellow]")
+         for i, rec in enumerate(verdict.recommendations, 1):
+             console.print(f" {i}. {rec}")
+
+     # Primary issue
+     if verdict.primary_issue:
+         console.print(f"\n[bold]Primary Issue:[/bold] {verdict.primary_issue}")
+
+     # Job directory
+     if result.job_dir:
+         console.print(f"\n[dim]Job artifacts: {result.job_dir}[/dim]")