swegen-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- swegen/__init__.py +14 -0
- swegen/analyze/__init__.py +24 -0
- swegen/analyze/classifier.py +637 -0
- swegen/analyze/classify_prompt.txt +241 -0
- swegen/analyze/models.py +253 -0
- swegen/analyze/run.py +656 -0
- swegen/analyze/verdict_prompt.txt +126 -0
- swegen/cli.py +411 -0
- swegen/config.py +142 -0
- swegen/create/__init__.py +22 -0
- swegen/create/claude_code_runner.py +988 -0
- swegen/create/claude_code_utils.py +95 -0
- swegen/create/create.py +706 -0
- swegen/create/diff_utils.py +142 -0
- swegen/create/orchestrator.py +368 -0
- swegen/create/pr_fetcher.py +187 -0
- swegen/create/repo_cache.py +175 -0
- swegen/create/task_instruction.py +363 -0
- swegen/create/task_reference.py +130 -0
- swegen/create/task_skeleton.py +266 -0
- swegen/create/utils.py +350 -0
- swegen/farm/__init__.py +13 -0
- swegen/farm/farm_hand.py +342 -0
- swegen/farm/fetcher.py +341 -0
- swegen/farm/state.py +231 -0
- swegen/farm/stream_farm.py +430 -0
- swegen/tools/__init__.py +16 -0
- swegen/tools/harbor_runner.py +191 -0
- swegen/tools/validate.py +523 -0
- swegen/tools/validate_utils.py +142 -0
- swegen-0.1.0.dist-info/METADATA +292 -0
- swegen-0.1.0.dist-info/RECORD +35 -0
- swegen-0.1.0.dist-info/WHEEL +4 -0
- swegen-0.1.0.dist-info/entry_points.txt +3 -0
- swegen-0.1.0.dist-info/licenses/LICENSE +201 -0
swegen/analyze/__init__.py
ADDED
@@ -0,0 +1,24 @@
+from swegen.analyze.models import (
+    BaselineResult,
+    BaselineValidation,
+    Classification,
+    Subtype,
+    TaskVerdict,
+    TrialClassification,
+)
+from swegen.analyze.classifier import TrialClassifier, write_trial_analysis_files
+from swegen.analyze.run import AnalyzeArgs, AnalysisResult, run_analyze
+
+__all__ = [
+    "AnalysisResult",
+    "AnalyzeArgs",
+    "BaselineResult",
+    "BaselineValidation",
+    "Classification",
+    "Subtype",
+    "TaskVerdict",
+    "TrialClassification",
+    "TrialClassifier",
+    "run_analyze",
+    "write_trial_analysis_files",
+]
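This `__init__.py` simply re-exports the analyze API so that everything below is reachable from one import surface. For orientation, a minimal usage sketch against that surface (not part of the package; the paths are hypothetical, and a CLAUDE_CODE_OAUTH_TOKEN or ANTHROPIC_API_KEY must be set for a real run):

```python
from pathlib import Path

from swegen.analyze import TrialClassifier

# classify_trial_sync (defined in classifier.py below) wraps the async
# classify_trial with asyncio.run().
classifier = TrialClassifier(model="claude-sonnet-4-5", verbose=True)
result = classifier.classify_trial_sync(
    trial_dir=Path("jobs/my-run/trial-0"),  # hypothetical trial directory
    task_dir=Path("tasks/my-task"),         # hypothetical task directory
)
print(result.classification.value, result.subtype)
```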
swegen/analyze/classifier.py
ADDED
@@ -0,0 +1,637 @@
+from __future__ import annotations
+
+import asyncio
+import os
+from pathlib import Path
+from typing import Any
+
+from claude_agent_sdk import (
+    ClaudeAgentOptions,
+    ClaudeSDKClient,
+    ResultMessage,
+)
+from harbor.models.trial.result import TrialResult
+from rich.console import Console
+
+from swegen.create.claude_code_utils import Colors, print_sdk_message
+
+from .models import (
+    BaselineResult,
+    BaselineValidation,
+    Classification,
+    TaskVerdict,
+    TaskVerdictModel,
+    TrialClassification,
+    TrialClassificationModel,
+)
+
+
+# Load prompt templates
+_CLASSIFY_PROMPT_PATH = Path(__file__).parent / "classify_prompt.txt"
+_CLASSIFY_PROMPT = _CLASSIFY_PROMPT_PATH.read_text()
+
+_VERDICT_PROMPT_PATH = Path(__file__).parent / "verdict_prompt.txt"
+_VERDICT_PROMPT = _VERDICT_PROMPT_PATH.read_text()
+
+
+def write_trial_analysis_files(
+    trial_dir: Path,
+    classification: TrialClassification,
+    task_id: str,
+    agent: str,
+    model: str,
+) -> None:
+    """Write trajectory analysis files to trial directory.
+
+    Creates three files in the trial directory:
+    - trajectory-analysis.json: Structured JSON with classification results
+    - trajectory-analysis.md: Human-readable markdown report
+    - trajectory-analysis-raw.json: Raw classification data (same as JSON for now)
+
+    Args:
+        trial_dir: Path to trial directory
+        classification: TrialClassification result
+        task_id: Task identifier
+        agent: Agent name
+        model: Model name
+    """
+    import json
+
+    # Write JSON
+    json_data = {
+        "task_id": task_id,
+        "agent": agent,
+        "model": model,
+        "classification": classification.classification.value,
+        "subtype": classification.subtype,
+        "evidence": classification.evidence,
+        "root_cause": classification.root_cause,
+        "recommendation": classification.recommendation,
+    }
+
+    (trial_dir / "trajectory-analysis.json").write_text(
+        json.dumps(json_data, indent=2)
+    )
+
+    # Write markdown
+    md_content = f"""# Trajectory Analysis
+
+**Task:** {task_id}
+**Agent:** {agent}
+**Model:** {model}
+
+---
+
+### Classification
+{classification.classification.value} - {classification.subtype}
+
+### Evidence
+{classification.evidence}
+
+### Root Cause
+{classification.root_cause}
+
+### Recommendation
+{classification.recommendation}
+"""
+
+    (trial_dir / "trajectory-analysis.md").write_text(md_content)
+
+    # Write raw (same as JSON for now, could include full SDK response)
+    (trial_dir / "trajectory-analysis-raw.json").write_text(
+        json.dumps(json_data, indent=2)
+    )
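A hedged sketch of driving the writer above (not part of the package): the classification here is hand-built with placeholder values; in real use it comes from the TrialClassifier class, which the file defines next.

```python
from pathlib import Path

from swegen.analyze import Classification, TrialClassification, write_trial_analysis_files

# Hand-built only to exercise the writer; field values are illustrative.
classification = TrialClassification(
    trial_name="trial-0",
    classification=Classification.GOOD_FAILURE,
    subtype="Presumed Agent Error",
    evidence="Agent patch did not apply cleanly.",
    root_cause="Agent error, not a task problem.",
    recommendation="No task changes needed.",
    reward=0.0,
)

write_trial_analysis_files(
    trial_dir=Path("jobs/my-run/trial-0"),  # hypothetical; must exist
    classification=classification,
    task_id="my-task",  # placeholder identifiers
    agent="my-agent",
    model="claude-sonnet-4-5",
)
# Writes trajectory-analysis.json, trajectory-analysis.md, and
# trajectory-analysis-raw.json into the trial directory.
```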
swegen/analyze/classifier.py (continued)
+class TrialClassifier:
+    """Classifies trial outcomes using Claude Code to identify task quality issues.
+
+    Uses Claude Agent SDK with file access to explore trial artifacts
+    and classify whether outcomes reveal task problems.
+
+    Authentication (in priority order):
+    1. CLAUDE_CODE_OAUTH_TOKEN environment variable (recommended)
+       - Generate with: claude setup-token (requires Claude Pro/Max)
+    2. ANTHROPIC_API_KEY environment variable (fallback)
+    """
+
+    def __init__(
+        self,
+        model: str = "claude-sonnet-4-5",
+        verbose: bool = False,
+        timeout: int = 300,  # 5 minutes per classification
+    ):
+        """Initialize the classifier.
+
+        Args:
+            model: Model name for Claude Code (default: claude-sonnet-4-5)
+            verbose: If True, stream Claude Code output to console
+            timeout: Maximum time per classification in seconds (default: 300 = 5 min)
+        """
+        self._model = model
+        self._verbose = verbose
+        self._timeout = timeout
+        self._setup_authentication()
+
+    def _setup_authentication(self) -> None:
+        """Setup authentication for Claude Code.
+
+        Prefers OAuth token over API key. If OAuth token is available,
+        unset API key to ensure OAuth is used.
+        """
+        has_oauth = bool(os.getenv("CLAUDE_CODE_OAUTH_TOKEN"))
+        has_api_key = bool(os.getenv("ANTHROPIC_API_KEY"))
+
+        if has_oauth:
+            # Prefer OAuth - unset API key to ensure OAuth is used
+            if "ANTHROPIC_API_KEY" in os.environ:
+                os.environ.pop("ANTHROPIC_API_KEY")
+            # No action needed - Claude SDK will use CLAUDE_CODE_OAUTH_TOKEN
+        elif has_api_key:
+            # Use API key - unset OAuth to ensure API key is used
+            if "CLAUDE_CODE_OAUTH_TOKEN" in os.environ:
+                os.environ.pop("CLAUDE_CODE_OAUTH_TOKEN")
+            # No action needed - Claude SDK will use ANTHROPIC_API_KEY
+        else:
+            # No authentication available - will fail when trying to classify
+            # We'll handle this gracefully in classify_trial
+            pass
+
+    async def classify_trial(
+        self,
+        trial_dir: Path,
+        task_dir: Path,
+    ) -> TrialClassification:
+        """Classify a single trial outcome using Claude Code.
+
+        Args:
+            trial_dir: Path to trial directory (contains result.json, agent/, verifier/)
+            task_dir: Path to task directory (contains instruction.md, solution/, tests/)
+
+        Returns:
+            TrialClassification with classification, evidence, and recommendations
+        """
+        # Read trial result to get the verified outcome
+        result_path = trial_dir / "result.json"
+        if not result_path.exists():
+            return TrialClassification(
+                trial_name=trial_dir.name,
+                classification=Classification.HARNESS_ERROR,
+                subtype="Missing Result",
+                evidence="result.json not found in trial directory",
+                root_cause="Trial did not complete - no result.json file",
+                recommendation="Check Harbor logs for infrastructure issues",
+                reward=None,
+            )
+
+        try:
+            result = TrialResult.model_validate_json(result_path.read_text())
+        except Exception as e:
+            return TrialClassification(
+                trial_name=trial_dir.name,
+                classification=Classification.HARNESS_ERROR,
+                subtype="Invalid Result",
+                evidence=f"Could not parse result.json: {e}",
+                root_cause="Trial result file is corrupted or malformed",
+                recommendation="Check Harbor logs for what went wrong",
+                reward=None,
+            )
+
+        # Extract reward
+        reward = None
+        if result.verifier_result and result.verifier_result.rewards:
+            reward = result.verifier_result.rewards.get("reward")
+
+        # Determine result string for prompt
+        if reward == 1.0:
+            result_str = "pass"
+        elif reward == 0.0:
+            result_str = "fail"
+        else:
+            result_str = f"unknown (reward={reward})"
+
+        # Build prompt with paths for Claude to explore
+        prompt = _CLASSIFY_PROMPT.format(
+            result=result_str,
+            task_dir=str(task_dir),
+            trial_dir=str(trial_dir),
+        )
+
+        # Run Claude Code with file access
+        options = ClaudeAgentOptions(
+            permission_mode="bypassPermissions",
+            allowed_tools=["Read", "Glob"],
+            cwd=str(trial_dir),
+            add_dirs=[str(task_dir)],
+            model=self._model,
+            # Prefer structured output when supported by the SDK/runtime.
+            # This avoids brittle "parse JSON from text" logic entirely.
+            output_format={
+                "type": "json_schema",
+                "schema": TrialClassificationModel.model_json_schema(),
+            },
+        )
+
+        structured_output: Any = None
+        try:
+            # Check for authentication before attempting to classify
+            has_auth = bool(os.getenv("CLAUDE_CODE_OAUTH_TOKEN") or os.getenv("ANTHROPIC_API_KEY"))
+            if not has_auth:
+                raise RuntimeError(
+                    "No authentication configured. Set either CLAUDE_CODE_OAUTH_TOKEN "
+                    "(preferred, run 'claude setup-token') or ANTHROPIC_API_KEY"
+                )
+
+            if self._verbose:
+                print(f"{Colors.YELLOW}[Classifier] Running Claude Code classification (timeout: {self._timeout}s)...{Colors.RESET}", flush=True)
+                print(f"{Colors.YELLOW}[Classifier] Trial: {trial_dir.name}{Colors.RESET}", flush=True)
+                print(f"{Colors.YELLOW}[Classifier] Task: {task_dir.name}{Colors.RESET}", flush=True)
+                print("-" * 60, flush=True)
+
+            # Run with timeout
+            try:
+                async with asyncio.timeout(self._timeout):
+                    async with ClaudeSDKClient(options=options) as client:
+                        await client.query(prompt)
+
+                        async for message in client.receive_response():
+                            if self._verbose:
+                                print_sdk_message(message)
+                            if isinstance(message, ResultMessage):
+                                structured_output = message.structured_output
+            except TimeoutError:
+                if self._verbose:
+                    print(f"{Colors.RED}[Classifier] Timed out after {self._timeout}s{Colors.RESET}", flush=True)
+                return TrialClassification(
+                    trial_name=trial_dir.name,
+                    classification=Classification.HARNESS_ERROR,
+                    subtype="Timeout",
+                    evidence=f"Classification timed out after {self._timeout} seconds",
+                    root_cause="Claude Code classification exceeded time limit",
+                    recommendation="Review trial manually or increase timeout",
+                    reward=reward,
+                )
+
+            if structured_output is None:
+                raise RuntimeError("Claude Agent SDK did not return structured_output for this request")
+
+            if self._verbose:
+                print("-" * 60, flush=True)
+                print(f"{Colors.GREEN}[Classifier] Classification complete for {trial_dir.name}{Colors.RESET}", flush=True)
+
+            return self._parse_trial_classification_structured(structured_output, trial_dir.name, reward)
+
+        except Exception as e:
+            # Fallback classification based on reward
+            if reward == 1.0:
+                classification = Classification.GOOD_SUCCESS
+                subtype = "Presumed Correct"
+            elif reward == 0.0:
+                classification = Classification.GOOD_FAILURE
+                subtype = "Presumed Agent Error"
+            else:
+                classification = Classification.HARNESS_ERROR
+                subtype = "Classification Failed"
+
+            return TrialClassification(
+                trial_name=trial_dir.name,
+                classification=classification,
+                subtype=subtype,
+                evidence=f"Claude Code classification failed: {e}",
+                root_cause="Could not analyze trial with Claude Code",
+                recommendation="Review trial manually",
+                reward=reward,
+            )
+
+    def _parse_trial_classification_structured(
+        self,
+        structured_output: Any,
+        trial_name: str,
+        reward: float | None,
+    ) -> TrialClassification:
+        """Parse and validate structured classification output (preferred path)."""
+        try:
+            data: Any = structured_output
+
+            # Allow mild nesting from some SDK wrappers
+            if isinstance(data, dict):
+                if "structured_output" in data and isinstance(data["structured_output"], dict):
+                    data = data["structured_output"]
+                if "result" in data and isinstance(data["result"], dict):
+                    data = data["result"]
+
+            model = TrialClassificationModel.model_validate(data)
+            classification = TrialClassification.from_model(
+                trial_name=trial_name, model=model, reward=reward
+            )
+
+            # Enforce classification/result consistency (defensive)
+            if reward == 1.0 and not classification.classification.is_success:
+                classification.classification = Classification.BAD_SUCCESS
+                classification.subtype = "Inconsistent Output"
+                classification.evidence = (
+                    f"Claude returned {model.classification} but verified result was pass (reward=1.0). "
+                    + classification.evidence
+                ).strip()
+            if reward == 0.0 and classification.classification.is_success:
+                classification.classification = Classification.HARNESS_ERROR
+                classification.subtype = "Inconsistent Output"
+                classification.evidence = (
+                    f"Claude returned {model.classification} but verified result was fail (reward=0.0). "
+                    + classification.evidence
+                ).strip()
+
+            return classification
+        except Exception as e:
+            return TrialClassification(
+                trial_name=trial_name,
+                classification=Classification.HARNESS_ERROR,
+                subtype="Parse Error",
+                evidence=f"Could not parse structured output: {e}",
+                root_cause="Claude's structured output did not match expected schema",
+                recommendation="Review trial manually",
+                reward=reward,
+            )
+
+    def classify_trial_sync(
+        self,
+        trial_dir: Path,
+        task_dir: Path,
+    ) -> TrialClassification:
+        """Synchronous wrapper for classify_trial."""
+        return asyncio.run(self.classify_trial(trial_dir, task_dir))
+
+    async def classify_trials(
+        self,
+        trial_dirs: list[Path],
+        task_dir: Path,
+        console: "Console | None" = None,
+    ) -> list[TrialClassification]:
+        """Classify multiple trials.
+
+        Note: Runs sequentially to avoid overwhelming Claude Code.
+
+        Args:
+            trial_dirs: List of trial directories to classify
+            task_dir: Path to task directory
+            console: Optional console for progress output
+
+        Returns:
+            List of TrialClassification results
+        """
+        if console:
+            console.print(f"  Classifying {len(trial_dirs)} trial(s) with Claude Code...")
+
+        classifications = []
+        for i, trial_dir in enumerate(trial_dirs):
+            if console:
+                console.print(f"  [{i+1}/{len(trial_dirs)}] {trial_dir.name}...")
+
+            try:
+                classification = await self.classify_trial(trial_dir, task_dir)
+                classifications.append(classification)
+            except Exception as e:
+                classifications.append(TrialClassification(
+                    trial_name=trial_dir.name,
+                    classification=Classification.HARNESS_ERROR,
+                    subtype="Classification Error",
+                    evidence=str(e),
+                    root_cause="Exception during classification",
+                    recommendation="Review trial manually",
+                    reward=None,
+                ))
+
+        return classifications
+
+    def classify_trials_sync(
+        self,
+        trial_dirs: list[Path],
+        task_dir: Path,
+        console: "Console | None" = None,
+    ) -> list[TrialClassification]:
+        """Synchronous wrapper for classify_trials."""
+        return asyncio.run(self.classify_trials(trial_dirs, task_dir, console))
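With the classifier class complete, a hedged sketch of batch classification through the synchronous wrapper (not part of the package; the directory layout is hypothetical):

```python
from pathlib import Path

from rich.console import Console

from swegen.analyze import TrialClassifier

task_dir = Path("tasks/my-task")                          # hypothetical
trial_dirs = sorted(Path("jobs/my-run").glob("trial-*"))  # hypothetical layout

# classify_trials runs sequentially to avoid overwhelming Claude Code.
classifier = TrialClassifier(timeout=300)
classifications = classifier.classify_trials_sync(trial_dirs, task_dir, console=Console())

for c in classifications:
    print(f"{c.trial_name}: {c.classification.value} ({c.subtype})")
```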
swegen/analyze/classifier.py (continued)
+async def compute_task_verdict_with_llm(
+    classifications: list[TrialClassification],
+    baseline: BaselineValidation | None = None,
+    quality_check_passed: bool = True,
+    model: str = "claude-sonnet-4-5",
+    console: "Console | None" = None,
+    verbose: bool = False,
+    timeout: int = 180,  # 3 minutes for verdict synthesis
+) -> TaskVerdict:
+    """Compute task verdict using LLM to synthesize trial analyses.
+
+    Args:
+        classifications: List of individual trial classifications
+        baseline: Optional baseline validation results
+        quality_check_passed: Whether static quality check passed
+        model: Model name for Claude Code
+        console: Optional console for progress output
+        verbose: If True, stream Claude Code output to console
+        timeout: Maximum time for verdict synthesis in seconds (default: 180 = 3 min)
+
+    Returns:
+        TaskVerdict with LLM-synthesized analysis
+    """
+    if not classifications:
+        return TaskVerdict(
+            is_good=False,
+            confidence="low",
+            primary_issue="No trials to analyze",
+            recommendations=["Run agent trials first"],
+        )
+
+    # Format baseline summary
+    if baseline:
+        if baseline.is_valid:
+            baseline_summary = "✓ Passed (nop failed as expected, oracle passed as expected)"
+        else:
+            baseline_summary = "✗ FAILED:\n" + "\n".join(f"  - {issue}" for issue in baseline.issues)
+    else:
+        baseline_summary = "Not run"
+
+    # Format quality check summary
+    quality_check_summary = "✓ Passed" if quality_check_passed else "✗ Failed"
+
+    # Format trial classifications
+    trial_lines = []
+    for i, c in enumerate(classifications, 1):
+        trial_lines.append(f"""Trial {i}: {c.trial_name}
+  Classification: {c.classification.value}
+  Subtype: {c.subtype}
+  Reward: {c.reward}
+  Evidence: {c.evidence}
+  Root Cause: {c.root_cause}
+  Recommendation: {c.recommendation}
+""")
+    trial_classifications = "\n".join(trial_lines)
+
+    # Build prompt
+    prompt = _VERDICT_PROMPT.format(
+        num_trials=len(classifications),
+        baseline_summary=baseline_summary,
+        quality_check_summary=quality_check_summary,
+        trial_classifications=trial_classifications,
+    )
+
+    if console:
+        console.print("  [dim]Synthesizing verdict with LLM...[/dim]")
+
+    # Run Claude Code with simple query (no file access needed)
+    options = ClaudeAgentOptions(
+        permission_mode="bypassPermissions",
+        allowed_tools=[],  # No file access needed
+        model=model,
+        output_format={
+            "type": "json_schema",
+            "schema": TaskVerdictModel.model_json_schema(),
+        },
+    )
+
+    # Check for authentication
+    has_auth = bool(os.getenv("CLAUDE_CODE_OAUTH_TOKEN") or os.getenv("ANTHROPIC_API_KEY"))
+    if not has_auth:
+        raise RuntimeError(
+            "No Claude authentication configured for verdict synthesis. "
+            "Set either CLAUDE_CODE_OAUTH_TOKEN (preferred, run 'claude setup-token') "
+            "or ANTHROPIC_API_KEY"
+        )
+
+    if verbose:
+        print(f"\n{Colors.YELLOW}[Verdict] Synthesizing task verdict with LLM (timeout: {timeout}s)...{Colors.RESET}", flush=True)
+        print("-" * 60, flush=True)
+
+    structured_output: Any = None
+    try:
+        async with asyncio.timeout(timeout):
+            async with ClaudeSDKClient(options=options) as client:
+                await client.query(prompt)
+
+                async for message in client.receive_response():
+                    if verbose:
+                        print_sdk_message(message)
+                    if isinstance(message, ResultMessage):
+                        structured_output = message.structured_output
+
+        if verbose:
+            print("-" * 60, flush=True)
+            print(f"{Colors.GREEN}[Verdict] Verdict synthesis complete{Colors.RESET}\n", flush=True)
+    except TimeoutError:
+        if verbose:
+            print("-" * 60, flush=True)
+            print(f"{Colors.RED}[Verdict] Timed out after {timeout}s{Colors.RESET}\n", flush=True)
+        # Return a fallback verdict based on simple heuristics
+        if console:
+            console.print("  [yellow]⚠ Verdict synthesis timed out, using fallback heuristics[/yellow]")
+
+        task_problem_count = sum(1 for c in classifications if c.is_task_problem)
+        return TaskVerdict(
+            is_good=task_problem_count == 0,
+            confidence="low",
+            primary_issue=f"Verdict synthesis timed out ({task_problem_count} task problems detected)",
+            recommendations=["Retry analysis with increased timeout", "Review trial classifications manually"],
+            task_problem_count=task_problem_count,
+            agent_problem_count=sum(1 for c in classifications if c.classification == Classification.GOOD_FAILURE),
+            success_count=sum(1 for c in classifications if c.classification in (Classification.GOOD_SUCCESS, Classification.BAD_SUCCESS)),
+            harness_error_count=sum(1 for c in classifications if c.classification == Classification.HARNESS_ERROR),
+            classifications=classifications,
+            baseline=baseline,
+        )
+
+    if structured_output is None:
+        raise RuntimeError("Claude Agent SDK did not return structured_output for verdict synthesis")
+    verdict_model = _parse_verdict_structured(structured_output)
+
+    # Build TaskVerdict from LLM response
+    task_problem_count = sum(1 for c in classifications if c.is_task_problem)
+    agent_problem_count = sum(1 for c in classifications if c.classification == Classification.GOOD_FAILURE)
+    success_count = sum(1 for c in classifications if c.classification in (Classification.GOOD_SUCCESS, Classification.BAD_SUCCESS))
+    harness_error_count = sum(1 for c in classifications if c.classification == Classification.HARNESS_ERROR)
+
+    return TaskVerdict(
+        is_good=verdict_model.is_good,
+        confidence=verdict_model.confidence,
+        primary_issue=verdict_model.primary_issue,
+        recommendations=verdict_model.recommendations,
+        task_problem_count=task_problem_count,
+        agent_problem_count=agent_problem_count,
+        success_count=success_count,
+        harness_error_count=harness_error_count,
+        classifications=classifications,
+        baseline=baseline,
+    )
+
+
+def _parse_verdict_structured(structured_output: Any) -> TaskVerdictModel:
+    """Parse and validate verdict from SDK structured output (preferred path)."""
+    data: Any = structured_output
+    if isinstance(data, dict):
+        if "verdict" in data and isinstance(data["verdict"], dict):
+            data = data["verdict"]
+        if "result" in data and isinstance(data["result"], dict):
+            data = data["result"]
+        if "structured_output" in data and isinstance(data["structured_output"], dict):
+            data = data["structured_output"]
+    return TaskVerdictModel.model_validate(data)
+
+
+def compute_task_verdict(
+    classifications: list[TrialClassification],
+    baseline: BaselineValidation | None = None,
+    quality_check_passed: bool = True,
+    model: str = "claude-sonnet-4-5",
+    console: "Console | None" = None,
+    verbose: bool = False,
+    timeout: int = 180,
+) -> TaskVerdict:
+    """Compute overall task verdict from trial classifications using LLM synthesis.
+
+    Uses Claude to intelligently synthesize individual trial analyses into a final verdict.
+    Performs pattern recognition, root cause analysis, and generates actionable recommendations.
+
+    Args:
+        classifications: List of trial classifications
+        baseline: Optional baseline validation results
+        quality_check_passed: Whether static quality check passed
+        model: Model name for Claude synthesis (default: claude-sonnet-4-5)
+        console: Optional console for progress output
+        verbose: If True, stream Claude Code output to console
+        timeout: Maximum time for verdict synthesis in seconds (default: 180 = 3 min)
+
+    Returns:
+        TaskVerdict with is_good, confidence, and recommendations
+
+    Raises:
+        RuntimeError: If no Claude authentication is configured
+    """
+    # Use async LLM-based synthesis
+    return asyncio.run(
+        compute_task_verdict_with_llm(
+            classifications, baseline, quality_check_passed, model, console, verbose, timeout
+        )
+    )
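A hedged end-to-end sketch feeding trial classifications into the synchronous verdict entry point (not part of the package; paths are hypothetical and Claude authentication is assumed):

```python
from pathlib import Path

from rich.console import Console

from swegen.analyze import TrialClassifier
from swegen.analyze.classifier import compute_task_verdict

classifier = TrialClassifier()
classifications = classifier.classify_trials_sync(
    sorted(Path("jobs/my-run").glob("trial-*")),  # hypothetical
    Path("tasks/my-task"),                        # hypothetical
)

# compute_task_verdict wraps compute_task_verdict_with_llm in asyncio.run().
verdict = compute_task_verdict(classifications, console=Console())
print(verdict.is_good, verdict.confidence, verdict.primary_issue)
```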
swegen/analyze/classifier.py (continued)
+def classify_baseline_result(
+    agent: str,
+    reward: float | None,
+    error: str | None = None,
+) -> BaselineResult:
+    """Create a BaselineResult from agent run outcome.
+
+    Args:
+        agent: "nop" or "oracle"
+        reward: Reward value (1.0 = pass, 0.0 = fail)
+        error: Optional error message if agent failed to run
+
+    Returns:
+        BaselineResult with pass/fail status
+    """
+    passed = reward == 1.0 if reward is not None else False
+    return BaselineResult(
+        agent=agent,  # type: ignore
+        passed=passed,
+        reward=reward,
+        error=error,
+    )
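Finally, a hedged sketch of the baseline helper above; the reward values illustrate a healthy baseline, where the no-op agent fails the verifier and the oracle solution passes it:

```python
from swegen.analyze.classifier import classify_baseline_result

nop = classify_baseline_result(agent="nop", reward=0.0)
oracle = classify_baseline_result(agent="oracle", reward=1.0)

print(nop.passed)     # False: nop is expected to fail
print(oracle.passed)  # True: oracle is expected to pass
```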