swegen 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- swegen/__init__.py +14 -0
- swegen/analyze/__init__.py +24 -0
- swegen/analyze/classifier.py +637 -0
- swegen/analyze/classify_prompt.txt +241 -0
- swegen/analyze/models.py +253 -0
- swegen/analyze/run.py +656 -0
- swegen/analyze/verdict_prompt.txt +126 -0
- swegen/cli.py +411 -0
- swegen/config.py +142 -0
- swegen/create/__init__.py +22 -0
- swegen/create/claude_code_runner.py +988 -0
- swegen/create/claude_code_utils.py +95 -0
- swegen/create/create.py +706 -0
- swegen/create/diff_utils.py +142 -0
- swegen/create/orchestrator.py +368 -0
- swegen/create/pr_fetcher.py +187 -0
- swegen/create/repo_cache.py +175 -0
- swegen/create/task_instruction.py +363 -0
- swegen/create/task_reference.py +130 -0
- swegen/create/task_skeleton.py +266 -0
- swegen/create/utils.py +350 -0
- swegen/farm/__init__.py +13 -0
- swegen/farm/farm_hand.py +342 -0
- swegen/farm/fetcher.py +341 -0
- swegen/farm/state.py +231 -0
- swegen/farm/stream_farm.py +430 -0
- swegen/tools/__init__.py +16 -0
- swegen/tools/harbor_runner.py +191 -0
- swegen/tools/validate.py +523 -0
- swegen/tools/validate_utils.py +142 -0
- swegen-0.1.0.dist-info/METADATA +292 -0
- swegen-0.1.0.dist-info/RECORD +35 -0
- swegen-0.1.0.dist-info/WHEEL +4 -0
- swegen-0.1.0.dist-info/entry_points.txt +3 -0
- swegen-0.1.0.dist-info/licenses/LICENSE +201 -0
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import shutil
|
|
4
|
+
import subprocess
|
|
5
|
+
import time
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
from harbor.models.environment_type import EnvironmentType
|
|
10
|
+
from harbor.models.job.result import JobResult
|
|
11
|
+
from harbor.models.trial.paths import TrialPaths
|
|
12
|
+
from harbor.models.trial.result import TrialResult
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def harbor_cmd_base() -> list[str]:
    """Get the base command used to invoke Harbor.

    Resolution order:

    1. A ``harbor`` executable found on ``PATH``.
    2. ``uv run harbor`` when a ``uv`` executable is found on ``PATH``.
    3. ``<current interpreter> -m harbor`` as a last resort.

    Returns:
        A new list holding the argv prefix; callers append the Harbor
        sub-command and its options to it.
    """
    # Local import: ``sys`` is not among the imports visible at the top of
    # this module, and this is the only place that needs it.
    import sys

    if shutil.which("harbor"):
        return ["harbor"]
    if shutil.which("uv"):
        return ["uv", "run", "harbor"]
    # Prefer the interpreter running this process over whatever "python"
    # resolves to on PATH: the bare name may not exist (python3-only systems)
    # or may point at a different environment. Keep the literal name only when
    # the interpreter path cannot be determined.
    return [sys.executable or "python", "-m", "harbor"]
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def run_harbor_agent(
    task_id: str,
    dataset_path: Path,
    jobs_dir: Path,
    agent: str,
    timeout_multiplier: float | None = None,
    capture_output: bool = False,
    delete_after: bool = True,
    environment: EnvironmentType = EnvironmentType.DOCKER,
) -> tuple[int, Path | None]:
    """Run a Harbor agent and return (exit_code, job_result_path).

    Args:
        task_id: The task identifier (passed to Harbor via ``-t``).
        dataset_path: Path to the Harbor dataset root (passed via ``-p``).
        jobs_dir: Parent directory under which a fresh, per-run job
            directory is created.
        agent: Agent type ("nop" or "oracle").
        timeout_multiplier: Optional timeout multiplier for long tasks; the
            ``--timeout-multiplier`` flag is only passed when this is not None.
        capture_output: If True, capture the subprocess's stdout/stderr
            instead of letting them reach the terminal (for rich console
            usage). The captured text is not returned.
        delete_after: If False, ``--no-delete`` is passed to Harbor so images
            are kept for faster subsequent runs (default: True).
        environment: Environment type; its ``.value`` is passed via ``--env``.

    Returns:
        Tuple of (exit_code, path_to_result_json or None). The path points at
        ``result.json`` inside the most recently modified sub-directory of
        the per-run job directory; it is None when the job directory vanished
        or contains no sub-directory. The file itself is not checked for
        existence.
    """
    # Create a job directory that is guaranteed to be new. The timestamp only
    # has one-second resolution, so two runs of the same task/agent started
    # within the same second would otherwise share a directory; on collision
    # a numeric suffix is appended until an unused name is found.
    base_name = f"{task_id}.{agent}.{int(time.time())}"
    unique_parent = jobs_dir / base_name
    attempt = 0
    while True:
        try:
            unique_parent.mkdir(parents=True, exist_ok=False)
            break
        except FileExistsError:
            attempt += 1
            unique_parent = jobs_dir / f"{base_name}.{attempt}"

    cmd = [
        *harbor_cmd_base(),
        "run",
        "--agent",
        agent,
        "-p",
        str(dataset_path),
        "-t",
        task_id,
        "--jobs-dir",
        str(unique_parent),
        "--env",
        environment.value,
    ]
    if timeout_multiplier is not None:
        cmd += ["--timeout-multiplier", str(timeout_multiplier)]

    # Control image deletion: --no-delete keeps images for faster subsequent runs
    if not delete_after:
        cmd.append("--no-delete")

    # Only the return code is used, so a single call covers both modes;
    # a non-zero exit status is reported to the caller rather than raised.
    proc = subprocess.run(cmd, check=False, capture_output=capture_output, text=True)

    # The subprocess may have removed the job directory.
    if not unique_parent.exists():
        return proc.returncode, None

    # The directory was empty when created, so every sub-directory found now
    # was produced by this run; pick the most recently modified one.
    new_dirs = [p for p in unique_parent.iterdir() if p.is_dir()]
    if not new_dirs:
        return proc.returncode, None

    job_dir = max(new_dirs, key=lambda p: p.stat().st_mtime)
    return proc.returncode, (job_dir / "result.json").resolve()
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
@dataclass(frozen=True)
class HarborOutcome:
    """Immutable pair describing what a Harbor job produced.

    Either field is ``None`` when the corresponding piece of information
    is not available.
    """

    # Reward extracted from the job result, or None when unavailable.
    reward: int | None
    # Best-effort exception message (or exception type), or None.
    error: str | None
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def _trial_exception_message(trial_result: object) -> str | None:
    """Return the exception message (or, failing that, type) on a trial result, if any."""
    exc = getattr(trial_result, "exception_info", None)
    if not exc:
        return None
    msg = getattr(exc, "exception_message", None) or getattr(exc, "exception_type", None)
    return str(msg) if msg else None


def _trial_reward(trial_result: TrialResult) -> int | None:
    """Return the verifier's "reward" entry truncated to an int, or None when absent."""
    verifier_result = trial_result.verifier_result
    if not verifier_result or not verifier_result.rewards:
        return None
    reward_value = verifier_result.rewards.get("reward")
    if reward_value is None:
        return None
    return int(float(reward_value))


def parse_harbor_outcome(job_result_path: Path | None) -> HarborOutcome:
    """Parse a Harbor job result and return both reward and error (best-effort).

    The job-level file is parsed with Harbor's ``JobResult`` model. The reward
    is then looked up in three places, in order:

    1. the ``"reward"`` entry of the first eval's ``reward_stats``;
    2. the verifier rewards of the trial results embedded in the job result;
    3. per-trial result files found in sub-directories next to the job
       result, parsed with ``TrialResult``.

    This function never raises for parsing problems: any failure yields
    ``reward=None`` while keeping whatever error message was already found.

    Args:
        job_result_path: Path to the job-level result.json (may be None).

    Returns:
        HarborOutcome with:
        - reward: 1 or 0 when taken from reward_stats, otherwise the trial's
          reward value truncated to an int (or None if unavailable)
        - error: best-effort exception message (or None)
    """
    if not job_result_path or not job_result_path.exists():
        return HarborOutcome(reward=None, error=None)

    # Initialised before the try block so that a failure part-way through
    # does not discard an exception message that was already extracted.
    error: str | None = None

    try:
        # Read explicitly as UTF-8 instead of relying on the locale's
        # default encoding.
        job_result = JobResult.model_validate_json(job_result_path.read_text(encoding="utf-8"))

        # Prefer structured exception info from typed trial results; the
        # first trial that yields a message wins.
        for trial_result in job_result.trial_results:
            error = _trial_exception_message(trial_result)
            if error is not None:
                break

        # Method 1: Check reward_stats in job stats (fastest)
        if job_result.stats.evals:
            # Get first eval (typically only one for single-task runs)
            first_eval = next(iter(job_result.stats.evals.values()))

            if first_eval.reward_stats and "reward" in first_eval.reward_stats:
                reward_map = first_eval.reward_stats["reward"]

                # 1 == 1.0 and 0 == 0.0 (with equal hashes), so a single
                # membership test covers both int and float entries.
                # Check for reward=1 first (oracle success)
                if 1 in reward_map:
                    return HarborOutcome(reward=1, error=error)
                # Then check for reward=0 (nop success)
                if 0 in reward_map:
                    return HarborOutcome(reward=0, error=error)

        # Method 2: Check trial results directly
        for trial_result in job_result.trial_results:
            reward = _trial_reward(trial_result)
            if reward is not None:
                return HarborOutcome(reward=reward, error=error)

        # Method 3: Fallback - scan trial directories using TrialPaths
        job_root = job_result_path.parent
        for trial_dir in job_root.iterdir():
            if not trial_dir.is_dir():
                continue
            try:
                trial_paths = TrialPaths(trial_dir)
                if not trial_paths.result_path.exists():
                    continue
                trial_result = TrialResult.model_validate_json(
                    trial_paths.result_path.read_text(encoding="utf-8")
                )

                if error is None:
                    error = _trial_exception_message(trial_result)

                reward = _trial_reward(trial_result)
                if reward is not None:
                    return HarborOutcome(reward=reward, error=error)
            except Exception:
                # Not a valid trial directory, continue searching
                continue

    except Exception:
        # Deliberately best-effort: report "no reward" rather than raising,
        # but keep any error message gathered before the failure.
        return HarborOutcome(reward=None, error=error)

    return HarborOutcome(reward=None, error=error)
|