swegen 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- swegen/__init__.py +14 -0
- swegen/analyze/__init__.py +24 -0
- swegen/analyze/classifier.py +637 -0
- swegen/analyze/classify_prompt.txt +241 -0
- swegen/analyze/models.py +253 -0
- swegen/analyze/run.py +656 -0
- swegen/analyze/verdict_prompt.txt +126 -0
- swegen/cli.py +411 -0
- swegen/config.py +142 -0
- swegen/create/__init__.py +22 -0
- swegen/create/claude_code_runner.py +988 -0
- swegen/create/claude_code_utils.py +95 -0
- swegen/create/create.py +706 -0
- swegen/create/diff_utils.py +142 -0
- swegen/create/orchestrator.py +368 -0
- swegen/create/pr_fetcher.py +187 -0
- swegen/create/repo_cache.py +175 -0
- swegen/create/task_instruction.py +363 -0
- swegen/create/task_reference.py +130 -0
- swegen/create/task_skeleton.py +266 -0
- swegen/create/utils.py +350 -0
- swegen/farm/__init__.py +13 -0
- swegen/farm/farm_hand.py +342 -0
- swegen/farm/fetcher.py +341 -0
- swegen/farm/state.py +231 -0
- swegen/farm/stream_farm.py +430 -0
- swegen/tools/__init__.py +16 -0
- swegen/tools/harbor_runner.py +191 -0
- swegen/tools/validate.py +523 -0
- swegen/tools/validate_utils.py +142 -0
- swegen-0.1.0.dist-info/METADATA +292 -0
- swegen-0.1.0.dist-info/RECORD +35 -0
- swegen-0.1.0.dist-info/WHEEL +4 -0
- swegen-0.1.0.dist-info/entry_points.txt +3 -0
- swegen-0.1.0.dist-info/licenses/LICENSE +201 -0
swegen/analyze/classify_prompt.txt
ADDED
@@ -0,0 +1,241 @@
You are analyzing a Harbor task trial to determine if the task is well-specified.

## Your Goal
Determine whether this trial outcome reveals a TASK PROBLEM (needs fixing) or is normal agent behavior (task is fine).

**Critical Context:** This task has already passed baseline validation (oracle passes, nop fails). Your job is to detect problems that baseline validation CANNOT catch:
- Underspecified instructions (agent lacks critical details)
- Overspecified/brittle tests (tests coupled to specific implementation)
- Ambiguous requirements (multiple valid interpretations)
- Tests checking for details not mentioned in instructions

## CRITICAL: Calibration for Hard Tasks

**Hard tasks are SUPPOSED to be hard.** A 20-40% pass rate is EXPECTED and DESIRABLE for good benchmark tasks. Do NOT classify a failure as a task problem just because:
- The agent had to explore the codebase to understand what to change
- The instruction doesn't explicitly list every file that needs modification
- The agent tried a reasonable approach that turned out to be wrong
- The task requires significant investigation or domain expertise

**The bar for BAD_FAILURE is HIGH.** Only classify as BAD_FAILURE if:
- Information is GENUINELY IMPOSSIBLE to derive from instruction + codebase combined
- Tests check for something that contradicts the instruction
- Multiple valid solutions exist but tests only accept one specific approach
- Tests are flaky or depend on non-deterministic behavior

**Default to GOOD_FAILURE** when the agent fails. Agent failures are the norm for hard tasks.

## CRITICAL: What the Agent Can and Cannot See

**During the trial, the agent ONLY has access to:**
- The `instruction.md` file describing the bug/task
- The buggy codebase (repository code with the bug present)
- Standard development tools (editor, terminal, etc.)

**The agent CANNOT see and has NO knowledge of:**
- `solution/` directory - contains fix.patch and solve.sh (used ONLY for oracle validation)
- `tests/` directory - test files are copied in AFTER the agent finishes (for verification only)
- Any patches, diffs, or reference solutions

**This means:**
- The agent must figure out the fix from scratch using only instruction.md and the buggy code
- The agent has NO access to any "solution patch" - do NOT fault the agent for not using it
- The agent cannot see how tests verify the solution - they work blind

## The Verified Result
**Test outcome: {result}** (pass = reward 1.0, fail = reward 0.0)

This result is FINAL and has been verified by running the tests. Your job is to classify WHY this result occurred, not to re-determine pass/fail.

**Classification constraints based on verified result:**
- If result = 'pass' → classify as GOOD_SUCCESS or BAD_SUCCESS
- If result = 'fail' → classify as GOOD_FAILURE, BAD_FAILURE, or HARNESS_ERROR

## Where to Look (For YOUR Analysis - NOT What Agent Saw)

**Task Definition ({task_dir}):** (for your analysis)
- instruction.md - What the agent was told (ONLY thing agent sees from task)
- solution/solve.sh - Reference solution (agent CANNOT see this)
- tests/ - Test files that verify (agent CANNOT see these)

**Trial Execution ({trial_dir}):**
- agent/ - Agent execution logs and trajectory
- verifier/test-stdout.txt - Test output
- result.json - Contains verifier_result.rewards.reward

Read the relevant files to understand WHY the result occurred, then classify accordingly.

**Task directory structure:**

```
<task-dir>
├── instruction.md
├── task.toml
├── environment
│   ├── Dockerfile
│   └── bug.patch
├── solution
│   ├── solve.sh
│   └── fix.patch
└── tests
    ├── test.sh
    └── # test files (e.g., test_*.py, *.test.ts, *_test.go, etc.)
```

## Classification Taxonomy

### HARNESS_ERROR (Infrastructure Issue)
The agent never ran properly:
- Agent binary not found (e.g., 'bash: claude: command not found')
- Docker/container setup failures
- Missing dependencies in test environment
- Empty trajectory files

### GOOD_FAILURE (Agent's Fault - Task is Fine) ✓ DEFAULT FOR FAILURES
Agent ran but couldn't solve it due to its own limitations. **This is the expected outcome for hard tasks.**
- **Timeout**: Task requires many steps, agent ran out of time
- **Wrong Approach**: Agent tried reasonable approaches but couldn't find the right solution
- **Implementation Bugs**: Agent understood task but made coding errors
- **Context Loss**: Agent forgot earlier context or requirements
- **Premature Stop**: Agent gave up early or declared success incorrectly
- **Complexity Overwhelm**: Task is genuinely difficult and agent couldn't handle it
- **Insufficient Exploration**: Agent didn't explore the codebase enough to understand what to change
- **Incomplete Understanding**: Agent misunderstood the problem or solution space

**Key insight**: If the agent COULD have solved it with more effort, better exploration, or smarter reasoning, it's GOOD_FAILURE even if the task is hard.

### BAD_FAILURE (Task's Fault - Needs Fix) ⚠️
Agent failed due to task specification issues.

**⚠️ IMPORTANT: The bar for BAD_FAILURE is VERY HIGH. Default to GOOD_FAILURE.**

**Underspecified Instruction** - Information is IMPOSSIBLE to derive:
- Tests require behavior that is NOT mentioned in instruction AND NOT discoverable from codebase
- The instruction is actively misleading or contradicts what tests expect
- Example: Instruction says "validate cookies" but tests ONLY check "authorization" header (completely different requirement)

**NOT underspecified** (classify as GOOD_FAILURE instead):
- Instruction describes the problem but agent must explore to find which files to change
- Tests check specific files that a competent developer could identify by investigation
- Agent needs to understand the codebase structure to implement the fix
- Example: Instruction says "fix version references" - agent must explore to find go.mod files

**Rigid/Brittle Tests** - Tests reject CORRECT solutions:
- Tests check exact string matches instead of behavior (e.g., `assert "duplicate" in msg` rejects valid "conflicts with")
- Tests require specific variable/function names not specified in instruction
- Agent's solution is FUNCTIONALLY CORRECT but fails due to superficial differences
- Example: Agent fixes the bug correctly but test fails because it expects specific error message format

**NOT brittle** (classify as GOOD_FAILURE instead):
- Tests check for the correct behavior and agent's solution doesn't implement it
- Agent's approach was reasonable but wrong (this is expected for hard tasks)

**Non-deterministic Tests** - Flaky/unpredictable:
- Tests fail or pass inconsistently due to timing, race conditions, or randomness
- Tests depend on external state or network conditions
- Tests have order dependencies

**Environment Issues** - Task-specific setup problems:
- Missing dependencies, wrong package versions, docker/container problems specific to this task
- Not general infrastructure failures (those are HARNESS_ERROR)

**Missing File Reference** - Tests expect files not mentioned:
- Tests check for outputs to files that instruction doesn't specify
- Tests import modules that instruction doesn't mention needing

**Ambiguous Requirements** - Multiple valid interpretations:
- Instruction wording allows different correct implementations, but tests only accept one
- Example: "Handle duplicates" could mean reject, skip, or merge - instruction doesn't specify

⚠️ **KEY SIGNAL FOR BAD_FAILURE:** The agent's solution is FUNCTIONALLY CORRECT but tests reject it for superficial reasons, OR the task requires information that doesn't exist anywhere.

⚠️ **KEY SIGNAL FOR GOOD_FAILURE:** The agent failed to find the right solution, but the information WAS available in the codebase if they had explored more thoroughly.

### GOOD_SUCCESS (Legitimate Solution)
Agent correctly solved the task:
- Solution matches expected behavior from instruction
- Tests verify actual functionality
- Agent demonstrated understanding of the problem

### BAD_SUCCESS (Cheating/Gaming) ⚠️
Agent "passed" but through illegitimate means:

**Hardcoding** - Agent hardcodes expected outputs:
- Returns specific values instead of computing them
- Pattern-matched test cases and hardcoded responses
- Example: Returns "success" for every input instead of actual validation logic

**Test Inspection** - Agent somehow accessed test files:
- NOTE: Tests should NOT be visible to agent - if they accessed them, that's an infrastructure issue
- Look for evidence in trajectory of agent reading test files from unexpected locations
- Based solution on what tests check rather than instruction requirements

**Oracle Copying** - Agent somehow accessed solution:
- NOTE: solution/ should NOT be visible to agent - if they accessed it, that's an infrastructure issue
- Look for evidence in trajectory of agent finding/reading solution files
- This is rare and indicates a task setup problem, not normal agent cheating

**Minimal Compliance** - Bare minimum to pass:
- Technically passes tests but doesn't solve the actual problem
- Only handles the specific test cases, would fail on similar inputs
- Example: Hardcodes 3 expected values instead of implementing the algorithm

**Tests Too Permissive** - Tests accept bad solutions:
- Tests don't actually verify the requirement from instruction
- Tests pass for trivial/wrong implementations
- Example: Test checks function exists but doesn't verify behavior

**Task Pre-solved** - Solution already present:
- Repository already contained working code, agent just had to find it
- Tests pass without any meaningful changes

⚠️ **KEY SIGNAL:** If agent passed but their implementation is suspiciously minimal or hardcodes specific values, classify as BAD_SUCCESS. If they somehow accessed solution/ or tests/ (which should be hidden), note this as an infrastructure concern.

## How to Analyze

1. **Remember agent visibility** - Agent only saw instruction.md + buggy code. No tests, no solution, no patches.
2. **Read the test output** (verifier/test-stdout.txt) - What specifically failed or passed?
3. **Compare instruction vs tests** - Are tests checking for things NOT in instructions?
4. **Examine agent trajectory** (agent/) - Did the agent try reasonable approaches given what they could see?
5. **Check for cheating patterns** - Did agent hardcode values? (Accessing tests/solution should be impossible)
6. **Consider consistency** - Would other agents likely have the same outcome?
7. **Alternative solution test** - Would a different valid approach (that matches instruction) pass the tests?

## Key Questions for Task Quality

**For BAD_FAILURE (instruction/test problems) - ALL must be true:**
- Is the required information IMPOSSIBLE to derive from instruction + codebase?
- Did the agent implement something that is FUNCTIONALLY CORRECT but tests reject it?
- Would ANY competent developer struggle because the spec is genuinely ambiguous or contradictory?

**For GOOD_FAILURE (task is fine, agent failed) - ANY is sufficient:**
- Could a skilled developer solve this by exploring the codebase carefully?
- Is the information technically available but just requires investigation?
- Did the agent fail to explore enough or make reasoning errors?
- Is this just a hard problem that requires expertise?

**For BAD_SUCCESS (cheating/too easy):**
- Did the agent hardcode outputs instead of implementing logic?
- Could an agent pass by pattern-matching without understanding the problem?
- Do tests actually verify the requirement or just check superficial things?
- Is there evidence the agent somehow accessed hidden files? (This shouldn't be possible normally)

**Critical distinction (GOOD vs BAD):**
- **GOOD_FAILURE**: Agent tried reasonable approaches but couldn't solve it (agent's limitation)
- **BAD_FAILURE**: Agent tried reasonable approaches but tests rejected valid solutions (task's fault)
- **GOOD_SUCCESS**: Agent solved it properly by understanding and implementing requirements
- **BAD_SUCCESS**: Agent "solved" it by cheating, hardcoding, or tests are too permissive

## Output Format

REMEMBER: Your classification MUST match the verified result!
- Result '{result}' means you must choose a matching classification (SUCCESS for pass, FAILURE for fail)

Output ONLY valid JSON with this exact structure (no markdown, no code blocks, no explanation):
{{
  "classification": "HARNESS_ERROR | GOOD_FAILURE | BAD_FAILURE | GOOD_SUCCESS | BAD_SUCCESS",
  "subtype": "specific subtype from the taxonomy above",
  "evidence": "Quote specific test names, error messages, or code snippets that support your classification",
  "root_cause": "1-2 sentence explanation of what specifically caused this outcome",
  "recommendation": "If BAD_FAILURE or BAD_SUCCESS, explain how to fix the task. Otherwise write 'N/A - task is fine'"
}}
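Editor's note: the single-brace placeholders ({result}, {task_dir}, {trial_dir}) together with the doubled braces around the JSON skeleton suggest this template is rendered with Python's str.format, presumably by the classifier code elsewhere in the package (not shown here). A minimal sketch under that assumption, with invented example values:

```python
from pathlib import Path

# Assumption: the template is filled with str.format, which is why the JSON
# skeleton escapes its braces as {{ ... }} - they become literal { } after rendering.
template = Path("swegen/analyze/classify_prompt.txt").read_text()
prompt = template.format(
    result="fail",                               # verified test outcome
    task_dir="/tasks/example-task",              # invented path, for illustration
    trial_dir="/trials/example-task/trial-001",  # invented path, for illustration
)
```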
swegen/analyze/models.py
ADDED
@@ -0,0 +1,253 @@
from __future__ import annotations

from dataclasses import dataclass, field
from enum import Enum
from typing import Literal

from pydantic import BaseModel, Field


class Classification(str, Enum):
    """Top-level classification of a trial outcome.

    The classification indicates whether the outcome reveals a problem
    with the task (BAD_*) or is expected behavior (GOOD_*, HARNESS_ERROR).
    """

    # Infrastructure problem - agent never ran
    HARNESS_ERROR = "HARNESS_ERROR"

    # Agent ran but failed - task is fine, agent couldn't solve it
    GOOD_FAILURE = "GOOD_FAILURE"

    # Agent failed due to task issues - task needs fixing
    BAD_FAILURE = "BAD_FAILURE"

    # Agent solved it legitimately - task is working
    GOOD_SUCCESS = "GOOD_SUCCESS"

    # Agent "solved" it by cheating or task is broken - task needs fixing
    BAD_SUCCESS = "BAD_SUCCESS"

    @property
    def is_task_problem(self) -> bool:
        """Returns True if this classification indicates a task issue."""
        return self in (Classification.BAD_FAILURE, Classification.BAD_SUCCESS)

    @property
    def is_success(self) -> bool:
        """Returns True if tests passed."""
        return self in (Classification.GOOD_SUCCESS, Classification.BAD_SUCCESS)


class Subtype(str, Enum):
    """Detailed subtype explaining the classification.

    These provide actionable information about what specifically
    caused the outcome.
    """

    # HARNESS_ERROR subtypes
    AGENT_NOT_FOUND = "Agent Not Found"
    CONTAINER_FAILURE = "Container/Docker Failure"
    MISSING_DEPENDENCIES = "Missing Dependencies"
    EMPTY_TRAJECTORY = "Empty Trajectory"
    INFRASTRUCTURE_ERROR = "Infrastructure Error"

    # GOOD_FAILURE subtypes (agent's fault)
    TIMEOUT = "Timeout"
    WRONG_APPROACH = "Wrong Approach"
    IMPLEMENTATION_BUGS = "Implementation Bugs"
    CONTEXT_LOSS = "Context Loss"
    PREMATURE_STOP = "Premature Stop"
    COMPLEXITY_OVERWHELM = "Complexity Overwhelm"
    INCOMPLETE_SOLUTION = "Incomplete Solution"
    LOGIC_ERROR = "Logic Error"

    # BAD_FAILURE subtypes (task's fault)
    UNDERSPECIFIED_INSTRUCTION = "Underspecified Instruction"
    RIGID_BRITTLE_TESTS = "Rigid/Brittle Tests"
    NONDETERMINISTIC_TESTS = "Non-deterministic Tests"
    ENVIRONMENT_ISSUES = "Environment Issues"
    MISSING_FILE_REFERENCE = "Missing File Reference"
    AMBIGUOUS_REQUIREMENTS = "Ambiguous Requirements"
    IMPLEMENTATION_DETAILS_REQUIRED = "Implementation Details Required"
    EDGE_CASES_NOT_SPECIFIED = "Edge Cases Not Specified"
    TEST_EXPECTS_SPECIFIC_FORMAT = "Test Expects Specific Format"

    # GOOD_SUCCESS subtypes
    CORRECT_SOLUTION = "Correct Solution"
    ALTERNATIVE_VALID_SOLUTION = "Alternative Valid Solution"

    # BAD_SUCCESS subtypes (cheating/gaming)
    HARDCODING = "Hardcoding"
    TEST_INSPECTION = "Test Inspection"
    ORACLE_COPYING = "Oracle Copying"
    MINIMAL_COMPLIANCE = "Minimal Compliance"
    TESTS_TOO_PERMISSIVE = "Tests Too Permissive"
    TASK_PRE_SOLVED = "Task Pre-solved"


class TrialClassificationModel(BaseModel):
    """Pydantic model for LLM structured output."""

    classification: Literal[
        "HARNESS_ERROR", "GOOD_FAILURE", "BAD_FAILURE", "GOOD_SUCCESS", "BAD_SUCCESS"
    ] = Field(description="Top-level classification")

    subtype: str = Field(
        description="Specific subtype from the taxonomy (e.g., 'Timeout', 'Underspecified Instruction')"
    )

    evidence: str = Field(
        description="Specific evidence from files: test names, error messages, code snippets"
    )

    root_cause: str = Field(
        description="1-2 sentence explanation of what caused this outcome"
    )

    recommendation: str = Field(
        description="How to fix the task (if BAD_FAILURE or BAD_SUCCESS), or 'N/A' if task is fine"
    )


class TaskVerdictModel(BaseModel):
    """Pydantic model for LLM structured output for the overall task verdict."""

    is_good: bool = Field(description="Whether the task is good (true) or needs review (false)")
    confidence: Literal["high", "medium", "low"] = Field(description="Confidence level")
    primary_issue: str | None = Field(
        default=None, description="Primary issue if task needs review, else null"
    )
    recommendations: list[str] = Field(
        default_factory=list, description="Actionable recommendations (3-5 for bad tasks)"
    )
    reasoning: str | None = Field(
        default=None, description="1-2 sentence explanation of the verdict (optional)"
    )


@dataclass
class TrialClassification:
    """Classification result for a single trial.

    This captures why a trial succeeded or failed, and whether
    the outcome indicates a task problem that needs fixing.
    """

    trial_name: str
    classification: Classification
    subtype: str
    evidence: str
    root_cause: str
    recommendation: str

    # Derived from verifier
    reward: float | None = None

    @property
    def is_task_problem(self) -> bool:
        """Returns True if this trial reveals a task issue."""
        return self.classification.is_task_problem

    @classmethod
    def from_model(cls, trial_name: str, model: TrialClassificationModel, reward: float | None = None) -> "TrialClassification":
        """Create from Pydantic model response."""
        return cls(
            trial_name=trial_name,
            classification=Classification(model.classification),
            subtype=model.subtype,
            evidence=model.evidence,
            root_cause=model.root_cause,
            recommendation=model.recommendation,
            reward=reward,
        )


@dataclass
class BaselineResult:
    """Result from running a baseline agent (nop or oracle)."""

    agent: Literal["nop", "oracle"]
    passed: bool  # reward == 1
    reward: float | None
    error: str | None = None

    @property
    def is_expected(self) -> bool:
        """Returns True if the result is what we expect for a good task."""
        if self.agent == "nop":
            # nop should FAIL (reward=0) - tests should require changes
            return not self.passed
        else:
            # oracle should PASS (reward=1) - reference solution should work
            return self.passed


@dataclass
class BaselineValidation:
    """Results from baseline validation (nop and oracle runs).

    For a well-formed task:
    - nop should FAIL (tests require actual work)
    - oracle should PASS (reference solution works)
    """

    nop: BaselineResult | None = None
    oracle: BaselineResult | None = None

    @property
    def is_valid(self) -> bool:
        """Returns True if baseline validation passes."""
        nop_ok = self.nop is None or self.nop.is_expected
        oracle_ok = self.oracle is None or self.oracle.is_expected
        return nop_ok and oracle_ok

    @property
    def issues(self) -> list[str]:
        """Returns list of baseline validation issues."""
        issues = []
        if self.nop and not self.nop.is_expected:
            issues.append(
                "CRITICAL: nop agent passed - task may be pre-solved or tests are broken"
            )
        if self.oracle and not self.oracle.is_expected:
            issues.append(
                "CRITICAL: oracle agent failed - reference solution doesn't work"
            )
        return issues


@dataclass
class TaskVerdict:
    """Final verdict on task quality based on all analysis.

    This aggregates results from:
    - Static quality checks
    - Baseline validation (nop/oracle)
    - Agent trial classifications
    """

    is_good: bool
    confidence: Literal["high", "medium", "low"]
    primary_issue: str | None
    recommendations: list[str] = field(default_factory=list)

    # Breakdown
    task_problem_count: int = 0
    agent_problem_count: int = 0
    success_count: int = 0
    harness_error_count: int = 0

    # From classifications
    classifications: list[TrialClassification] = field(default_factory=list)
    baseline: BaselineValidation | None = None

    def summary(self) -> str:
        """Return a one-line summary of the verdict."""
        if self.is_good:
            return f"✅ GOOD TASK (confidence: {self.confidence})"
        else:
            return f"❌ NEEDS REVIEW: {self.primary_issue}"
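Editor's note: read alongside classify_prompt.txt above, these models validate the classifier's JSON reply and carry it through the analysis. A minimal usage sketch (an illustration, not code from the package; the trial name, reply values, and rewards are invented):

```python
from swegen.analyze.models import (
    BaselineResult,
    BaselineValidation,
    TrialClassification,
    TrialClassificationModel,
)

# Hypothetical classifier reply following the JSON contract in classify_prompt.txt.
reply = TrialClassificationModel(
    classification="GOOD_FAILURE",
    subtype="Timeout",
    evidence="agent log ends mid-edit after hitting the step limit",
    root_cause="The agent ran out of steps before completing the fix.",
    recommendation="N/A - task is fine",
)
trial = TrialClassification.from_model("trial-001", reply, reward=0.0)
print(trial.is_task_problem)  # False - GOOD_FAILURE is the agent's fault, not the task's

# Baseline expectations for a well-formed task: nop fails, oracle passes.
baseline = BaselineValidation(
    nop=BaselineResult(agent="nop", passed=False, reward=0.0),
    oracle=BaselineResult(agent="oracle", passed=True, reward=1.0),
)
print(baseline.is_valid, baseline.issues)  # True []
```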