swegen-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
swegen/__init__.py ADDED
@@ -0,0 +1,14 @@
+ from swegen.config import (
+     CreateConfig,
+     FarmConfig,
+     ValidateConfig,
+ )
+
+ __version__ = "0.1.0"
+
+ __all__ = [
+     "CreateConfig",
+     "FarmConfig",
+     "ValidateConfig",
+     "__version__",
+ ]
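
For orientation (an editorial sketch, not part of the package): the root module simply re-exports the three config classes and pins the version, so downstream code can import them directly from the package root. Constructor fields are defined in swegen.config and are not shown in this diff, so only the import surface is illustrated:

    import swegen
    from swegen import CreateConfig, FarmConfig, ValidateConfig

    # Re-exported from swegen.config; their fields live there, not in this diff.
    print(swegen.__version__)  # -> "0.1.0"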
swegen/analyze/__init__.py ADDED
@@ -0,0 +1,24 @@
+ from swegen.analyze.models import (
+     BaselineResult,
+     BaselineValidation,
+     Classification,
+     Subtype,
+     TaskVerdict,
+     TrialClassification,
+ )
+ from swegen.analyze.classifier import TrialClassifier, write_trial_analysis_files
+ from swegen.analyze.run import AnalyzeArgs, AnalysisResult, run_analyze
+
+ __all__ = [
+     "AnalysisResult",
+     "AnalyzeArgs",
+     "BaselineResult",
+     "BaselineValidation",
+     "Classification",
+     "Subtype",
+     "TaskVerdict",
+     "TrialClassification",
+     "TrialClassifier",
+     "run_analyze",
+     "write_trial_analysis_files",
+ ]
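
A similar hedged sketch of the analysis surface this __init__ assembles. How run_analyze consumes AnalyzeArgs is defined in swegen/analyze/run.py, which is not part of this diff, so the commented call below is an assumption:

    from swegen.analyze import AnalyzeArgs, AnalysisResult, run_analyze

    # Assumption: run_analyze takes an AnalyzeArgs and yields an AnalysisResult;
    # the actual signature is in swegen/analyze/run.py, not shown here.
    # result: AnalysisResult = run_analyze(AnalyzeArgs(...))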
swegen/analyze/classifier.py ADDED
@@ -0,0 +1,637 @@
+ from __future__ import annotations
+
+ import asyncio
+ import os
+ from pathlib import Path
+ from typing import Any
+
+ from claude_agent_sdk import (
+     ClaudeAgentOptions,
+     ClaudeSDKClient,
+     ResultMessage,
+ )
+ from harbor.models.trial.result import TrialResult
+ from rich.console import Console
+
+ from swegen.create.claude_code_utils import Colors, print_sdk_message
+
+ from .models import (
+     BaselineResult,
+     BaselineValidation,
+     Classification,
+     TaskVerdict,
+     TaskVerdictModel,
+     TrialClassification,
+     TrialClassificationModel,
+ )
+
+
+ # Load prompt templates
+ _CLASSIFY_PROMPT_PATH = Path(__file__).parent / "classify_prompt.txt"
+ _CLASSIFY_PROMPT = _CLASSIFY_PROMPT_PATH.read_text()
+
+ _VERDICT_PROMPT_PATH = Path(__file__).parent / "verdict_prompt.txt"
+ _VERDICT_PROMPT = _VERDICT_PROMPT_PATH.read_text()
+
+
+ def write_trial_analysis_files(
+     trial_dir: Path,
+     classification: TrialClassification,
+     task_id: str,
+     agent: str,
+     model: str,
+ ) -> None:
+     """Write trajectory analysis files to trial directory.
+
+     Creates three files in the trial directory:
+     - trajectory-analysis.json: Structured JSON with classification results
+     - trajectory-analysis.md: Human-readable markdown report
+     - trajectory-analysis-raw.json: Raw classification data (same as JSON for now)
+
+     Args:
+         trial_dir: Path to trial directory
+         classification: TrialClassification result
+         task_id: Task identifier
+         agent: Agent name
+         model: Model name
+     """
+     import json
+
+     # Write JSON
+     json_data = {
+         "task_id": task_id,
+         "agent": agent,
+         "model": model,
+         "classification": classification.classification.value,
+         "subtype": classification.subtype,
+         "evidence": classification.evidence,
+         "root_cause": classification.root_cause,
+         "recommendation": classification.recommendation,
+     }
+
+     (trial_dir / "trajectory-analysis.json").write_text(
+         json.dumps(json_data, indent=2)
+     )
+
+     # Write markdown
+     md_content = f"""# Trajectory Analysis
+
+ **Task:** {task_id}
+ **Agent:** {agent}
+ **Model:** {model}
+
+ ---
+
+ ### Classification
+ {classification.classification.value} - {classification.subtype}
+
+ ### Evidence
+ {classification.evidence}
+
+ ### Root Cause
+ {classification.root_cause}
+
+ ### Recommendation
+ {classification.recommendation}
+ """
+
+     (trial_dir / "trajectory-analysis.md").write_text(md_content)
+
+     # Write raw (same as JSON for now, could include full SDK response)
+     (trial_dir / "trajectory-analysis-raw.json").write_text(
+         json.dumps(json_data, indent=2)
+     )
+
+
+ class TrialClassifier:
+     """Classifies trial outcomes using Claude Code to identify task quality issues.
+
+     Uses Claude Agent SDK with file access to explore trial artifacts
+     and classify whether outcomes reveal task problems.
+
+     Authentication (in priority order):
+     1. CLAUDE_CODE_OAUTH_TOKEN environment variable (recommended)
+        - Generate with: claude setup-token (requires Claude Pro/Max)
+     2. ANTHROPIC_API_KEY environment variable (fallback)
+     """
+
+     def __init__(
+         self,
+         model: str = "claude-sonnet-4-5",
+         verbose: bool = False,
+         timeout: int = 300,  # 5 minutes per classification
+     ):
+         """Initialize the classifier.
+
+         Args:
+             model: Model name for Claude Code (default: claude-sonnet-4-5)
+             verbose: If True, stream Claude Code output to console
+             timeout: Maximum time per classification in seconds (default: 300 = 5 min)
+         """
+         self._model = model
+         self._verbose = verbose
+         self._timeout = timeout
+         self._setup_authentication()
+
+     def _setup_authentication(self) -> None:
+         """Set up authentication for Claude Code.
+
+         Prefers OAuth token over API key. If OAuth token is available,
+         unset API key to ensure OAuth is used.
+         """
+         has_oauth = bool(os.getenv("CLAUDE_CODE_OAUTH_TOKEN"))
+         has_api_key = bool(os.getenv("ANTHROPIC_API_KEY"))
+
+         if has_oauth:
+             # Prefer OAuth - unset API key to ensure OAuth is used
+             if "ANTHROPIC_API_KEY" in os.environ:
+                 os.environ.pop("ANTHROPIC_API_KEY")
+             # No action needed - Claude SDK will use CLAUDE_CODE_OAUTH_TOKEN
+         elif has_api_key:
+             # Use API key - unset OAuth to ensure API key is used
+             if "CLAUDE_CODE_OAUTH_TOKEN" in os.environ:
+                 os.environ.pop("CLAUDE_CODE_OAUTH_TOKEN")
+             # No action needed - Claude SDK will use ANTHROPIC_API_KEY
+         else:
+             # No authentication available - will fail when trying to classify
+             # We'll handle this gracefully in classify_trial
+             pass
+
+     async def classify_trial(
+         self,
+         trial_dir: Path,
+         task_dir: Path,
+     ) -> TrialClassification:
+         """Classify a single trial outcome using Claude Code.
+
+         Args:
+             trial_dir: Path to trial directory (contains result.json, agent/, verifier/)
+             task_dir: Path to task directory (contains instruction.md, solution/, tests/)
+
+         Returns:
+             TrialClassification with classification, evidence, and recommendations
+         """
+         # Read trial result to get the verified outcome
+         result_path = trial_dir / "result.json"
+         if not result_path.exists():
+             return TrialClassification(
+                 trial_name=trial_dir.name,
+                 classification=Classification.HARNESS_ERROR,
+                 subtype="Missing Result",
+                 evidence="result.json not found in trial directory",
+                 root_cause="Trial did not complete - no result.json file",
+                 recommendation="Check Harbor logs for infrastructure issues",
+                 reward=None,
+             )
+
+         try:
+             result = TrialResult.model_validate_json(result_path.read_text())
+         except Exception as e:
+             return TrialClassification(
+                 trial_name=trial_dir.name,
+                 classification=Classification.HARNESS_ERROR,
+                 subtype="Invalid Result",
+                 evidence=f"Could not parse result.json: {e}",
+                 root_cause="Trial result file is corrupted or malformed",
+                 recommendation="Check Harbor logs for what went wrong",
+                 reward=None,
+             )
+
+         # Extract reward
+         reward = None
+         if result.verifier_result and result.verifier_result.rewards:
+             reward = result.verifier_result.rewards.get("reward")
+
+         # Determine result string for prompt
+         if reward == 1.0:
+             result_str = "pass"
+         elif reward == 0.0:
+             result_str = "fail"
+         else:
+             result_str = f"unknown (reward={reward})"
+
+         # Build prompt with paths for Claude to explore
+         prompt = _CLASSIFY_PROMPT.format(
+             result=result_str,
+             task_dir=str(task_dir),
+             trial_dir=str(trial_dir),
+         )
+
+         # Run Claude Code with file access
+         options = ClaudeAgentOptions(
+             permission_mode="bypassPermissions",
+             allowed_tools=["Read", "Glob"],
+             cwd=str(trial_dir),
+             add_dirs=[str(task_dir)],
+             model=self._model,
+             # Prefer structured output when supported by the SDK/runtime.
+             # This avoids brittle "parse JSON from text" logic entirely.
+             output_format={
+                 "type": "json_schema",
+                 "schema": TrialClassificationModel.model_json_schema(),
+             },
+         )
+
+         structured_output: Any = None
+         try:
+             # Check for authentication before attempting to classify
+             has_auth = bool(os.getenv("CLAUDE_CODE_OAUTH_TOKEN") or os.getenv("ANTHROPIC_API_KEY"))
+             if not has_auth:
+                 raise RuntimeError(
+                     "No authentication configured. Set either CLAUDE_CODE_OAUTH_TOKEN "
+                     "(preferred, run 'claude setup-token') or ANTHROPIC_API_KEY"
+                 )
+
+             if self._verbose:
+                 print(f"{Colors.YELLOW}[Classifier] Running Claude Code classification (timeout: {self._timeout}s)...{Colors.RESET}", flush=True)
+                 print(f"{Colors.YELLOW}[Classifier] Trial: {trial_dir.name}{Colors.RESET}", flush=True)
+                 print(f"{Colors.YELLOW}[Classifier] Task: {task_dir.name}{Colors.RESET}", flush=True)
+                 print("-" * 60, flush=True)
+
+             # Run with timeout
+             try:
+                 async with asyncio.timeout(self._timeout):
+                     async with ClaudeSDKClient(options=options) as client:
+                         await client.query(prompt)
+
+                         async for message in client.receive_response():
+                             if self._verbose:
+                                 print_sdk_message(message)
+                             if isinstance(message, ResultMessage):
+                                 structured_output = message.structured_output
+             except TimeoutError:
+                 if self._verbose:
+                     print(f"{Colors.RED}[Classifier] Timed out after {self._timeout}s{Colors.RESET}", flush=True)
+                 return TrialClassification(
+                     trial_name=trial_dir.name,
+                     classification=Classification.HARNESS_ERROR,
+                     subtype="Timeout",
+                     evidence=f"Classification timed out after {self._timeout} seconds",
+                     root_cause="Claude Code classification exceeded time limit",
+                     recommendation="Review trial manually or increase timeout",
+                     reward=reward,
+                 )
+
+             if structured_output is None:
+                 raise RuntimeError("Claude Agent SDK did not return structured_output for this request")
+
+             if self._verbose:
+                 print("-" * 60, flush=True)
+                 print(f"{Colors.GREEN}[Classifier] Classification complete for {trial_dir.name}{Colors.RESET}", flush=True)
+
+             return self._parse_trial_classification_structured(structured_output, trial_dir.name, reward)
+
+         except Exception as e:
+             # Fallback classification based on reward
+             if reward == 1.0:
+                 classification = Classification.GOOD_SUCCESS
+                 subtype = "Presumed Correct"
+             elif reward == 0.0:
+                 classification = Classification.GOOD_FAILURE
+                 subtype = "Presumed Agent Error"
+             else:
+                 classification = Classification.HARNESS_ERROR
+                 subtype = "Classification Failed"
+
+             return TrialClassification(
+                 trial_name=trial_dir.name,
+                 classification=classification,
+                 subtype=subtype,
+                 evidence=f"Claude Code classification failed: {e}",
+                 root_cause="Could not analyze trial with Claude Code",
+                 recommendation="Review trial manually",
+                 reward=reward,
+             )
+
+     def _parse_trial_classification_structured(
+         self,
+         structured_output: Any,
+         trial_name: str,
+         reward: float | None,
+     ) -> TrialClassification:
+         """Parse and validate structured classification output (preferred path)."""
+         try:
+             data: Any = structured_output
+
+             # Allow mild nesting from some SDK wrappers
+             if isinstance(data, dict):
+                 if "structured_output" in data and isinstance(data["structured_output"], dict):
+                     data = data["structured_output"]
+                 if "result" in data and isinstance(data["result"], dict):
+                     data = data["result"]
+
+             model = TrialClassificationModel.model_validate(data)
+             classification = TrialClassification.from_model(
+                 trial_name=trial_name, model=model, reward=reward
+             )
+
+             # Enforce classification/result consistency (defensive)
+             if reward == 1.0 and not classification.classification.is_success:
+                 classification.classification = Classification.BAD_SUCCESS
+                 classification.subtype = "Inconsistent Output"
+                 classification.evidence = (
+                     f"Claude returned {model.classification} but verified result was pass (reward=1.0). "
+                     + classification.evidence
+                 ).strip()
+             if reward == 0.0 and classification.classification.is_success:
+                 classification.classification = Classification.HARNESS_ERROR
+                 classification.subtype = "Inconsistent Output"
+                 classification.evidence = (
+                     f"Claude returned {model.classification} but verified result was fail (reward=0.0). "
+                     + classification.evidence
+                 ).strip()
+
+             return classification
+         except Exception as e:
+             return TrialClassification(
+                 trial_name=trial_name,
+                 classification=Classification.HARNESS_ERROR,
+                 subtype="Parse Error",
+                 evidence=f"Could not parse structured output: {e}",
+                 root_cause="Claude's structured output did not match expected schema",
+                 recommendation="Review trial manually",
+                 reward=reward,
+             )
+
+     def classify_trial_sync(
+         self,
+         trial_dir: Path,
+         task_dir: Path,
+     ) -> TrialClassification:
+         """Synchronous wrapper for classify_trial."""
+         return asyncio.run(self.classify_trial(trial_dir, task_dir))
+
+     async def classify_trials(
+         self,
+         trial_dirs: list[Path],
+         task_dir: Path,
+         console: "Console | None" = None,
+     ) -> list[TrialClassification]:
+         """Classify multiple trials.
+
+         Note: Runs sequentially to avoid overwhelming Claude Code.
+
+         Args:
+             trial_dirs: List of trial directories to classify
+             task_dir: Path to task directory
+             console: Optional console for progress output
+
+         Returns:
+             List of TrialClassification results
+         """
+         if console:
+             console.print(f" Classifying {len(trial_dirs)} trial(s) with Claude Code...")
+
+         classifications = []
+         for i, trial_dir in enumerate(trial_dirs):
+             if console:
+                 console.print(f" [{i+1}/{len(trial_dirs)}] {trial_dir.name}...")
+
+             try:
+                 classification = await self.classify_trial(trial_dir, task_dir)
+                 classifications.append(classification)
+             except Exception as e:
+                 classifications.append(TrialClassification(
+                     trial_name=trial_dir.name,
+                     classification=Classification.HARNESS_ERROR,
+                     subtype="Classification Error",
+                     evidence=str(e),
+                     root_cause="Exception during classification",
+                     recommendation="Review trial manually",
+                     reward=None,
+                 ))
+
+         return classifications
+
+     def classify_trials_sync(
+         self,
+         trial_dirs: list[Path],
+         task_dir: Path,
+         console: "Console | None" = None,
+     ) -> list[TrialClassification]:
+         """Synchronous wrapper for classify_trials."""
+         return asyncio.run(self.classify_trials(trial_dirs, task_dir, console))
+
+
+ async def compute_task_verdict_with_llm(
+     classifications: list[TrialClassification],
+     baseline: BaselineValidation | None = None,
+     quality_check_passed: bool = True,
+     model: str = "claude-sonnet-4-5",
+     console: "Console | None" = None,
+     verbose: bool = False,
+     timeout: int = 180,  # 3 minutes for verdict synthesis
+ ) -> TaskVerdict:
+     """Compute task verdict using LLM to synthesize trial analyses.
+
+     Args:
+         classifications: List of individual trial classifications
+         baseline: Optional baseline validation results
+         quality_check_passed: Whether static quality check passed
+         model: Model name for Claude Code
+         console: Optional console for progress output
+         verbose: If True, stream Claude Code output to console
+         timeout: Maximum time for verdict synthesis in seconds (default: 180 = 3 min)
+
+     Returns:
+         TaskVerdict with LLM-synthesized analysis
+     """
+     if not classifications:
+         return TaskVerdict(
+             is_good=False,
+             confidence="low",
+             primary_issue="No trials to analyze",
+             recommendations=["Run agent trials first"],
+         )
+
+     # Format baseline summary
+     if baseline:
+         if baseline.is_valid:
+             baseline_summary = "✓ Passed (nop failed as expected, oracle passed as expected)"
+         else:
+             baseline_summary = "✗ FAILED:\n" + "\n".join(f" - {issue}" for issue in baseline.issues)
+     else:
+         baseline_summary = "Not run"
+
+     # Format quality check summary
+     quality_check_summary = "✓ Passed" if quality_check_passed else "✗ Failed"
+
+     # Format trial classifications
+     trial_lines = []
+     for i, c in enumerate(classifications, 1):
+         trial_lines.append(f"""Trial {i}: {c.trial_name}
+   Classification: {c.classification.value}
+   Subtype: {c.subtype}
+   Reward: {c.reward}
+   Evidence: {c.evidence}
+   Root Cause: {c.root_cause}
+   Recommendation: {c.recommendation}
+ """)
+     trial_classifications = "\n".join(trial_lines)
+
+     # Build prompt
+     prompt = _VERDICT_PROMPT.format(
+         num_trials=len(classifications),
+         baseline_summary=baseline_summary,
+         quality_check_summary=quality_check_summary,
+         trial_classifications=trial_classifications,
+     )
+
+     if console:
+         console.print(" [dim]Synthesizing verdict with LLM...[/dim]")
+
+     # Run Claude Code with simple query (no file access needed)
+     options = ClaudeAgentOptions(
+         permission_mode="bypassPermissions",
+         allowed_tools=[],  # No file access needed
+         model=model,
+         output_format={
+             "type": "json_schema",
+             "schema": TaskVerdictModel.model_json_schema(),
+         },
+     )
+
+     # Check for authentication
+     has_auth = bool(os.getenv("CLAUDE_CODE_OAUTH_TOKEN") or os.getenv("ANTHROPIC_API_KEY"))
+     if not has_auth:
+         raise RuntimeError(
+             "No Claude authentication configured for verdict synthesis. "
+             "Set either CLAUDE_CODE_OAUTH_TOKEN (preferred, run 'claude setup-token') "
+             "or ANTHROPIC_API_KEY"
+         )
+
+     if verbose:
+         print(f"\n{Colors.YELLOW}[Verdict] Synthesizing task verdict with LLM (timeout: {timeout}s)...{Colors.RESET}", flush=True)
+         print("-" * 60, flush=True)
+
+     structured_output: Any = None
+     try:
+         async with asyncio.timeout(timeout):
+             async with ClaudeSDKClient(options=options) as client:
+                 await client.query(prompt)
+
+                 async for message in client.receive_response():
+                     if verbose:
+                         print_sdk_message(message)
+                     if isinstance(message, ResultMessage):
+                         structured_output = message.structured_output
+
+         if verbose:
+             print("-" * 60, flush=True)
+             print(f"{Colors.GREEN}[Verdict] Verdict synthesis complete{Colors.RESET}\n", flush=True)
+     except TimeoutError:
+         if verbose:
+             print("-" * 60, flush=True)
+             print(f"{Colors.RED}[Verdict] Timed out after {timeout}s{Colors.RESET}\n", flush=True)
+         # Return a fallback verdict based on simple heuristics
+         if console:
+             console.print(" [yellow]⚠ Verdict synthesis timed out, using fallback heuristics[/yellow]")
+
+         task_problem_count = sum(1 for c in classifications if c.is_task_problem)
+         return TaskVerdict(
+             is_good=task_problem_count == 0,
+             confidence="low",
+             primary_issue=f"Verdict synthesis timed out ({task_problem_count} task problems detected)",
+             recommendations=["Retry analysis with increased timeout", "Review trial classifications manually"],
+             task_problem_count=task_problem_count,
+             agent_problem_count=sum(1 for c in classifications if c.classification == Classification.GOOD_FAILURE),
+             success_count=sum(1 for c in classifications if c.classification in (Classification.GOOD_SUCCESS, Classification.BAD_SUCCESS)),
+             harness_error_count=sum(1 for c in classifications if c.classification == Classification.HARNESS_ERROR),
+             classifications=classifications,
+             baseline=baseline,
+         )
+
+     if structured_output is None:
+         raise RuntimeError("Claude Agent SDK did not return structured_output for verdict synthesis")
+     verdict_model = _parse_verdict_structured(structured_output)
+
+     # Build TaskVerdict from LLM response
+     task_problem_count = sum(1 for c in classifications if c.is_task_problem)
+     agent_problem_count = sum(1 for c in classifications if c.classification == Classification.GOOD_FAILURE)
+     success_count = sum(1 for c in classifications if c.classification in (Classification.GOOD_SUCCESS, Classification.BAD_SUCCESS))
+     harness_error_count = sum(1 for c in classifications if c.classification == Classification.HARNESS_ERROR)
+
+     return TaskVerdict(
+         is_good=verdict_model.is_good,
+         confidence=verdict_model.confidence,
+         primary_issue=verdict_model.primary_issue,
+         recommendations=verdict_model.recommendations,
+         task_problem_count=task_problem_count,
+         agent_problem_count=agent_problem_count,
+         success_count=success_count,
+         harness_error_count=harness_error_count,
+         classifications=classifications,
+         baseline=baseline,
+     )
+
+
+ def _parse_verdict_structured(structured_output: Any) -> TaskVerdictModel:
+     """Parse and validate verdict from SDK structured output (preferred path)."""
+     data: Any = structured_output
+     if isinstance(data, dict):
+         if "verdict" in data and isinstance(data["verdict"], dict):
+             data = data["verdict"]
+         if "result" in data and isinstance(data["result"], dict):
+             data = data["result"]
+         if "structured_output" in data and isinstance(data["structured_output"], dict):
+             data = data["structured_output"]
+     return TaskVerdictModel.model_validate(data)
+
+
+ def compute_task_verdict(
+     classifications: list[TrialClassification],
+     baseline: BaselineValidation | None = None,
+     quality_check_passed: bool = True,
+     model: str = "claude-sonnet-4-5",
+     console: "Console | None" = None,
+     verbose: bool = False,
+     timeout: int = 180,
+ ) -> TaskVerdict:
+     """Compute overall task verdict from trial classifications using LLM synthesis.
+
+     Uses Claude to intelligently synthesize individual trial analyses into a final verdict.
+     Performs pattern recognition, root cause analysis, and generates actionable recommendations.
+
+     Args:
+         classifications: List of trial classifications
+         baseline: Optional baseline validation results
+         quality_check_passed: Whether static quality check passed
+         model: Model name for Claude synthesis (default: claude-sonnet-4-5)
+         console: Optional console for progress output
+         verbose: If True, stream Claude Code output to console
+         timeout: Maximum time for verdict synthesis in seconds (default: 180 = 3 min)
+
+     Returns:
+         TaskVerdict with is_good, confidence, and recommendations
+
+     Raises:
+         RuntimeError: If no Claude authentication is configured
+     """
+     # Use async LLM-based synthesis
+     return asyncio.run(
+         compute_task_verdict_with_llm(
+             classifications, baseline, quality_check_passed, model, console, verbose, timeout
+         )
+     )
+
+ def classify_baseline_result(
+     agent: str,
+     reward: float | None,
+     error: str | None = None,
+ ) -> BaselineResult:
+     """Create a BaselineResult from agent run outcome.
+
+     Args:
+         agent: "nop" or "oracle"
+         reward: Reward value (1.0 = pass, 0.0 = fail)
+         error: Optional error message if agent failed to run
+
+     Returns:
+         BaselineResult with pass/fail status
+     """
+     passed = reward == 1.0 if reward is not None else False
+     return BaselineResult(
+         agent=agent,  # type: ignore
+         passed=passed,
+         reward=reward,
+         error=error,
+     )
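
To tie the module together, a minimal end-to-end usage sketch built only from the signatures above. The paths and agent name are hypothetical, the swegen.analyze.classifier import path is inferred from the analyze __init__, and CLAUDE_CODE_OAUTH_TOKEN or ANTHROPIC_API_KEY must be set as the TrialClassifier docstring describes:

    from pathlib import Path

    from swegen.analyze import TrialClassifier, write_trial_analysis_files
    from swegen.analyze.classifier import compute_task_verdict

    # Hypothetical layout: one task directory, several Harbor trial directories.
    task_dir = Path("tasks/example-task")
    trial_dirs = sorted(Path("runs/example-task").glob("trial-*"))

    classifier = TrialClassifier(model="claude-sonnet-4-5", verbose=False, timeout=300)
    classifications = classifier.classify_trials_sync(trial_dirs, task_dir)

    # Persist the trajectory-analysis artifacts next to each trial's result.json.
    for trial_dir, c in zip(trial_dirs, classifications):
        write_trial_analysis_files(
            trial_dir, c, task_id=task_dir.name, agent="example-agent", model="claude-sonnet-4-5"
        )

    # Synthesize the overall verdict (runs a second Claude query internally).
    verdict = compute_task_verdict(classifications)
    print(verdict.is_good, verdict.confidence, verdict.primary_issue)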