swegen 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- swegen/__init__.py +14 -0
- swegen/analyze/__init__.py +24 -0
- swegen/analyze/classifier.py +637 -0
- swegen/analyze/classify_prompt.txt +241 -0
- swegen/analyze/models.py +253 -0
- swegen/analyze/run.py +656 -0
- swegen/analyze/verdict_prompt.txt +126 -0
- swegen/cli.py +411 -0
- swegen/config.py +142 -0
- swegen/create/__init__.py +22 -0
- swegen/create/claude_code_runner.py +988 -0
- swegen/create/claude_code_utils.py +95 -0
- swegen/create/create.py +706 -0
- swegen/create/diff_utils.py +142 -0
- swegen/create/orchestrator.py +368 -0
- swegen/create/pr_fetcher.py +187 -0
- swegen/create/repo_cache.py +175 -0
- swegen/create/task_instruction.py +363 -0
- swegen/create/task_reference.py +130 -0
- swegen/create/task_skeleton.py +266 -0
- swegen/create/utils.py +350 -0
- swegen/farm/__init__.py +13 -0
- swegen/farm/farm_hand.py +342 -0
- swegen/farm/fetcher.py +341 -0
- swegen/farm/state.py +231 -0
- swegen/farm/stream_farm.py +430 -0
- swegen/tools/__init__.py +16 -0
- swegen/tools/harbor_runner.py +191 -0
- swegen/tools/validate.py +523 -0
- swegen/tools/validate_utils.py +142 -0
- swegen-0.1.0.dist-info/METADATA +292 -0
- swegen-0.1.0.dist-info/RECORD +35 -0
- swegen-0.1.0.dist-info/WHEEL +4 -0
- swegen-0.1.0.dist-info/entry_points.txt +3 -0
- swegen-0.1.0.dist-info/licenses/LICENSE +201 -0
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import subprocess
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class RepoCache:
|
|
9
|
+
"""Manages local clones of repositories for CC analysis."""
|
|
10
|
+
|
|
11
|
+
def __init__(self, cache_dir: Path | None = None):
|
|
12
|
+
"""
|
|
13
|
+
Initialize the repo cache.
|
|
14
|
+
|
|
15
|
+
Args:
|
|
16
|
+
cache_dir: Directory to store clones. Defaults to .cache/repos
|
|
17
|
+
"""
|
|
18
|
+
self.cache_dir = cache_dir or Path(".cache/repos")
|
|
19
|
+
self.cache_dir.mkdir(parents=True, exist_ok=True)
|
|
20
|
+
self.logger = logging.getLogger("swegen")
|
|
21
|
+
|
|
22
|
+
def get_or_clone(
|
|
23
|
+
self,
|
|
24
|
+
repo: str,
|
|
25
|
+
head_sha: str,
|
|
26
|
+
repo_url: str | None = None,
|
|
27
|
+
) -> Path:
|
|
28
|
+
"""
|
|
29
|
+
Get cached repo or clone it. Checkout the specified commit.
|
|
30
|
+
|
|
31
|
+
Args:
|
|
32
|
+
repo: Repository in "owner/repo" format
|
|
33
|
+
head_sha: Commit SHA to checkout
|
|
34
|
+
repo_url: Optional clone URL (defaults to https://github.com/{repo}.git)
|
|
35
|
+
|
|
36
|
+
Returns:
|
|
37
|
+
Path to the repository root
|
|
38
|
+
"""
|
|
39
|
+
owner, name = self._parse_repo(repo)
|
|
40
|
+
repo_path = self.cache_dir / owner / name
|
|
41
|
+
|
|
42
|
+
if repo_url is None:
|
|
43
|
+
repo_url = f"https://github.com/{repo}.git"
|
|
44
|
+
|
|
45
|
+
if repo_path.exists() and (repo_path / ".git").exists():
|
|
46
|
+
self.logger.debug("Using cached repo: %s", repo_path)
|
|
47
|
+
self._fetch_and_checkout(repo_path, head_sha)
|
|
48
|
+
else:
|
|
49
|
+
self.logger.info("Cloning repo to cache: %s -> %s", repo, repo_path)
|
|
50
|
+
self._clone(repo_url, repo_path, head_sha)
|
|
51
|
+
|
|
52
|
+
return repo_path
|
|
53
|
+
|
|
54
|
+
def _parse_repo(self, repo: str) -> tuple[str, str]:
|
|
55
|
+
"""Parse 'owner/repo' into (owner, repo) tuple."""
|
|
56
|
+
# Handle full URLs
|
|
57
|
+
if repo.startswith("https://"):
|
|
58
|
+
repo = repo.replace("https://github.com/", "").rstrip(".git")
|
|
59
|
+
if repo.startswith("git@"):
|
|
60
|
+
repo = repo.replace("git@github.com:", "").rstrip(".git")
|
|
61
|
+
|
|
62
|
+
parts = repo.split("/")
|
|
63
|
+
if len(parts) != 2:
|
|
64
|
+
raise ValueError(f"Invalid repo format: {repo}. Expected 'owner/repo'")
|
|
65
|
+
return parts[0], parts[1]
|
|
66
|
+
|
|
67
|
+
def _clone(self, repo_url: str, repo_path: Path, head_sha: str) -> None:
|
|
68
|
+
"""Clone a repository and checkout the specified commit."""
|
|
69
|
+
repo_path.parent.mkdir(parents=True, exist_ok=True)
|
|
70
|
+
|
|
71
|
+
# Full clone for maximum CC context
|
|
72
|
+
self.logger.debug("Cloning %s...", repo_url)
|
|
73
|
+
subprocess.run(
|
|
74
|
+
["git", "clone", repo_url, str(repo_path)],
|
|
75
|
+
check=True,
|
|
76
|
+
capture_output=True,
|
|
77
|
+
)
|
|
78
|
+
|
|
79
|
+
# Checkout the target commit
|
|
80
|
+
self._checkout(repo_path, head_sha)
|
|
81
|
+
|
|
82
|
+
def _fetch_and_checkout(self, repo_path: Path, head_sha: str) -> None:
|
|
83
|
+
"""Fetch latest and checkout the specified commit."""
|
|
84
|
+
self.logger.debug("Fetching updates for %s...", repo_path)
|
|
85
|
+
|
|
86
|
+
# Fetch all refs
|
|
87
|
+
subprocess.run(
|
|
88
|
+
["git", "fetch", "--all"],
|
|
89
|
+
cwd=str(repo_path),
|
|
90
|
+
check=True,
|
|
91
|
+
capture_output=True,
|
|
92
|
+
)
|
|
93
|
+
|
|
94
|
+
# Try to checkout the commit
|
|
95
|
+
self._checkout(repo_path, head_sha)
|
|
96
|
+
|
|
97
|
+
def _clean_repo(self, repo_path: Path) -> None:
|
|
98
|
+
"""Thoroughly clean the repository, including submodules."""
|
|
99
|
+
# Deinit all submodules to remove their contents
|
|
100
|
+
subprocess.run(
|
|
101
|
+
["git", "submodule", "deinit", "--all", "-f"],
|
|
102
|
+
cwd=str(repo_path),
|
|
103
|
+
capture_output=True, # Don't check - might fail if no submodules
|
|
104
|
+
)
|
|
105
|
+
# Reset any tracked changes
|
|
106
|
+
subprocess.run(
|
|
107
|
+
["git", "reset", "--hard"],
|
|
108
|
+
cwd=str(repo_path),
|
|
109
|
+
check=True,
|
|
110
|
+
capture_output=True,
|
|
111
|
+
)
|
|
112
|
+
# Clean untracked files, including nested git repos (-ff) and ignored files (-x)
|
|
113
|
+
subprocess.run(
|
|
114
|
+
["git", "clean", "-ffdx"],
|
|
115
|
+
cwd=str(repo_path),
|
|
116
|
+
check=True,
|
|
117
|
+
capture_output=True,
|
|
118
|
+
)
|
|
119
|
+
|
|
120
|
+
def _checkout(self, repo_path: Path, sha: str) -> None:
|
|
121
|
+
"""Checkout a specific commit, fetching if needed."""
|
|
122
|
+
try:
|
|
123
|
+
# First, thoroughly clean the repo
|
|
124
|
+
self._clean_repo(repo_path)
|
|
125
|
+
|
|
126
|
+
# Try direct checkout
|
|
127
|
+
subprocess.run(
|
|
128
|
+
["git", "checkout", sha],
|
|
129
|
+
cwd=str(repo_path),
|
|
130
|
+
check=True,
|
|
131
|
+
capture_output=True,
|
|
132
|
+
)
|
|
133
|
+
self.logger.debug("Checked out %s", sha[:8])
|
|
134
|
+
except subprocess.CalledProcessError as e:
|
|
135
|
+
# Commit not available, fetch it specifically
|
|
136
|
+
self.logger.debug(
|
|
137
|
+
"Commit %s not found, fetching... (stderr: %s)",
|
|
138
|
+
sha[:8],
|
|
139
|
+
e.stderr.decode() if e.stderr else "",
|
|
140
|
+
)
|
|
141
|
+
try:
|
|
142
|
+
subprocess.run(
|
|
143
|
+
["git", "fetch", "origin", sha],
|
|
144
|
+
cwd=str(repo_path),
|
|
145
|
+
check=True,
|
|
146
|
+
capture_output=True,
|
|
147
|
+
)
|
|
148
|
+
# Clean again before checkout to ensure no untracked files
|
|
149
|
+
self._clean_repo(repo_path)
|
|
150
|
+
subprocess.run(
|
|
151
|
+
["git", "checkout", sha],
|
|
152
|
+
cwd=str(repo_path),
|
|
153
|
+
check=True,
|
|
154
|
+
capture_output=True,
|
|
155
|
+
)
|
|
156
|
+
self.logger.debug("Fetched and checked out %s", sha[:8])
|
|
157
|
+
except subprocess.CalledProcessError as fetch_err:
|
|
158
|
+
# Provide more context in the error
|
|
159
|
+
stderr = fetch_err.stderr.decode() if fetch_err.stderr else ""
|
|
160
|
+
self.logger.error("Failed to checkout %s: %s", sha[:8], stderr)
|
|
161
|
+
raise RuntimeError(
|
|
162
|
+
f"Cannot checkout commit {sha[:8]}. It may have been force-pushed or deleted. Error: {stderr}"
|
|
163
|
+
) from fetch_err
|
|
164
|
+
|
|
165
|
+
# Update submodules if any
|
|
166
|
+
try:
|
|
167
|
+
subprocess.run(
|
|
168
|
+
["git", "submodule", "update", "--init", "--recursive"],
|
|
169
|
+
cwd=str(repo_path),
|
|
170
|
+
check=True,
|
|
171
|
+
capture_output=True,
|
|
172
|
+
timeout=120,
|
|
173
|
+
)
|
|
174
|
+
except (subprocess.CalledProcessError, subprocess.TimeoutExpired):
|
|
175
|
+
self.logger.debug("Submodule update skipped or failed (non-fatal)")
|
|
@@ -0,0 +1,363 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import os
|
|
5
|
+
|
|
6
|
+
from openai import OpenAI
|
|
7
|
+
|
|
8
|
+
from .utils import CombinedPRTaskEvaluation
|
|
9
|
+
|
|
10
|
+
# Prompt-assembly limits: caps on how much issue/PR/test text is fed to the LLM.
MAX_LINKED_ISSUES = 5  # Keep at most this many linked issues (longest bodies first)
MAX_ISSUE_BODY_LENGTH = 2500  # Max chars of each issue body before truncation
MAX_PR_BODY_LENGTH = 2500  # Max chars of the PR description before truncation
MAX_TEST_FILE_LENGTH = 3000  # Max chars per test file
MAX_TOTAL_TEST_LENGTH = 10000  # Max total chars for all test files
MIN_INSTRUCTION_LENGTH = 100  # Generated instructions shorter than this are rejected
OPENAI_API_TIMEOUT = 90.0  # Request timeout in seconds (generous for reasoning models)
MAX_COMPLETION_TOKENS = 4096  # Token budget for the structured completion
MODEL_NAME = "gpt-5.2"  # Default OpenAI model used for evaluation + task generation
DEBUG_REASON_TRUNCATE_LENGTH = 100  # Chars of the LLM's reason shown in debug logs
|
|
20
|
+
|
|
21
|
+
COMBINED_SYSTEM_PROMPT = """You are evaluating GitHub pull requests and converting substantial ones into SWE-bench tasks.
|
|
22
|
+
|
|
23
|
+
Your job has TWO PHASES:
|
|
24
|
+
|
|
25
|
+
PHASE 1 - Evaluate Substantiality:
|
|
26
|
+
Determine if the PR is substantial enough to generate a coding task.
|
|
27
|
+
|
|
28
|
+
SKIP (is_substantial=false) if the PR is:
|
|
29
|
+
- Pure documentation updates including:
|
|
30
|
+
* README, docs/, markdown files
|
|
31
|
+
* docs_src/, doc_src/, examples/ (documentation example code)
|
|
32
|
+
* tests/test_tutorial/, tests/test_docs/, test_examples/ (tests for documentation)
|
|
33
|
+
- Only dependency/package updates (requirements.txt, package.json, etc.)
|
|
34
|
+
- Simple typo or formatting fixes with no functional changes
|
|
35
|
+
- CI/config changes only (.github/workflows, .travis.yml, etc.)
|
|
36
|
+
- Version bumps or release commits
|
|
37
|
+
- Other trivial maintenance tasks
|
|
38
|
+
- Changes to only a single file (not substantial enough)
|
|
39
|
+
- Simple one-line fixes or trivial changes (even across multiple files)
|
|
40
|
+
- Purely cosmetic refactoring (renaming variables, reformatting, etc.)
|
|
41
|
+
- Adding simple logging or print statements without logic changes
|
|
42
|
+
|
|
43
|
+
KEEP (is_substantial=true) if the PR:
|
|
44
|
+
- Fixes a non-trivial bug with changes across MULTIPLE source files
|
|
45
|
+
- Adds or modifies functional tests AND implements corresponding source code changes
|
|
46
|
+
- Implements a feature or enhancement with changes to MULTIPLE source files
|
|
47
|
+
- Has meaningful behavioral changes affecting multiple components or modules
|
|
48
|
+
- Requires coordination between different parts of the codebase
|
|
49
|
+
|
|
50
|
+
CRITICAL REQUIREMENT for is_substantial=true:
|
|
51
|
+
The PR MUST modify multiple files (at least 2-3 meaningful source code files, not counting trivial changes).
|
|
52
|
+
Single-file changes are almost never substantial enough unless they involve major refactoring or complex logic.
|
|
53
|
+
|
|
54
|
+
PHASE 2 - Generate Task (ONLY if substantial):
|
|
55
|
+
If is_substantial=true, write a DETAILED bug report that an engineer can solve.
|
|
56
|
+
|
|
57
|
+
SOURCE PRIORITY:
|
|
58
|
+
1. Linked issues (if available) - for the problem description
|
|
59
|
+
2. PR title and body - for context and details
|
|
60
|
+
3. Test files - for expected behavior and API specifications
|
|
61
|
+
|
|
62
|
+
CRITICAL INSTRUCTIONS:
|
|
63
|
+
- Write a clear description of the PROBLEM that needs to be solved
|
|
64
|
+
- Include specific function/class/method names IF they appear in tests or issues
|
|
65
|
+
- Include exact error messages that users see or that tests expect
|
|
66
|
+
- Include expected behavior vs actual behavior
|
|
67
|
+
- If tests show specific API calls, mention them (e.g., "implement validate_email() method")
|
|
68
|
+
|
|
69
|
+
IMPORTANT - ABOUT TEST FILES:
|
|
70
|
+
You may see test file contents to help you understand what needs to be implemented. However:
|
|
71
|
+
✗ DO NOT mention the test files themselves (e.g., "from the test sample", "the test fixture", "the provided test")
|
|
72
|
+
✗ DO NOT reference test file names or paths
|
|
73
|
+
✗ DO NOT say things like "the test shows" or "according to the tests"
|
|
74
|
+
|
|
75
|
+
Instead, write as if describing the problem from a user/issue perspective:
|
|
76
|
+
✓ "When calling foo() with X, it should return Y but currently returns Z"
|
|
77
|
+
✓ "The function should handle these cases: ..."
|
|
78
|
+
✓ "Expected behavior: ... Actual behavior: ..."
|
|
79
|
+
|
|
80
|
+
The agent solving this task will NOT see the test files, so any reference to them will be confusing.
|
|
81
|
+
|
|
82
|
+
WHAT TO INCLUDE:
|
|
83
|
+
✓ Problem description from issue/PR
|
|
84
|
+
✓ Expected behavior vs actual behavior
|
|
85
|
+
✓ Error messages users see
|
|
86
|
+
✓ Function/method/class names that tests call or issue mentions
|
|
87
|
+
✓ Expected return values or outputs
|
|
88
|
+
✓ Code examples showing the bug (if in issue/PR)
|
|
89
|
+
✓ Specific scenarios/cases that should work (derived from tests, but written as requirements)
|
|
90
|
+
|
|
91
|
+
WHAT TO EXCLUDE:
|
|
92
|
+
✗ File paths or module locations (e.g., "fix in utils/validators.py")
|
|
93
|
+
✗ Test file names, paths, or references (e.g., "test_foo.py", "the test fixture")
|
|
94
|
+
✗ Phrases like "from the test", "the test shows", "according to the tests"
|
|
95
|
+
✗ Implementation approaches (e.g., "use a try-catch", "add caching")
|
|
96
|
+
✗ How the PR fixed it (e.g., "I changed X to Y")
|
|
97
|
+
✗ Internal implementation details not visible in tests/issue
|
|
98
|
+
|
|
99
|
+
FORMAT RULES:
|
|
100
|
+
- Be clear and specific enough that an engineer knows what to implement
|
|
101
|
+
- Include code snippets from issues/tests if they clarify the expected behavior
|
|
102
|
+
- DO NOT use sections like "Impact:", "Acceptance criteria:", "Notes:", "Additional considerations:"
|
|
103
|
+
- Write naturally, as if explaining to a colleague
|
|
104
|
+
|
|
105
|
+
EXAMPLE GOOD INSTRUCTION:
|
|
106
|
+
"The email validation is failing for valid email addresses. When calling user.validate_email('test@example.com'),
|
|
107
|
+
it should return True, but currently returns False for addresses with subdomains. The validation should accept
|
|
108
|
+
any email matching the pattern <local>@<domain>.<tld> including subdomains like test@mail.example.com."
|
|
109
|
+
|
|
110
|
+
EXAMPLE BAD INSTRUCTION:
|
|
111
|
+
"Fix the email validator in utils/auth.py by changing the regex pattern to support subdomains using a more
|
|
112
|
+
permissive regex."
|
|
113
|
+
|
|
114
|
+
TAGS:
|
|
115
|
+
Generate exactly 3 tags in this order:
|
|
116
|
+
1. Primary programming language (e.g., "python", "javascript", "typescript", "go", "rust", "java", "ruby", "cpp")
|
|
117
|
+
2. Tier/area: Choose ONE from: "backend", "frontend", "fullstack", "cli", "library", "framework"
|
|
118
|
+
3. Framework/library name (e.g., "fastapi", "django", "react", "nextjs", "axios", "express") OR a specific category (e.g., "http", "async", "testing")
|
|
119
|
+
|
|
120
|
+
Examples:
|
|
121
|
+
- FastAPI backend project: ["python", "backend", "fastapi"]
|
|
122
|
+
- Next.js frontend: ["typescript", "frontend", "nextjs"]
|
|
123
|
+
- Ripgrep CLI tool: ["rust", "cli", "regex"]
|
|
124
|
+
|
|
125
|
+
IMPORTANT: Generate exactly 3 tags.
|
|
126
|
+
|
|
127
|
+
If NOT substantial, set instruction to null and provide a brief reason.
|
|
128
|
+
"""
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def _format_user_prompt(
    pr_title: str,
    pr_body: str,
    repo: str,
    changed_files: list[str],
    linked_issues: list[dict] | None = None,
    force_generate_instruction: bool = False,
    test_contents: dict[str, str] | None = None,
) -> str:
    """Format user prompt for combined evaluation + task generation.

    Prioritizes linked issues and avoids leaking solution details (files, diff, commits).

    Args:
        pr_title: Pull request title.
        pr_body: Pull request body (may be empty or None).
        repo: Repository name ("owner/repo").
        changed_files: Paths of files changed by the PR (used only for counts).
        linked_issues: Optional issue dicts with 'number', 'title', 'body'.
        force_generate_instruction: If True, tell the model to always generate
            an instruction regardless of PR complexity.
        test_contents: Optional mapping of test file path -> file contents.

    Returns:
        The assembled user prompt string.
    """
    # Calculate basic stats for evaluation (no file names - just counts)
    total = len(changed_files or [])
    tests = sum(1 for p in (changed_files or []) if "test" in (p or "").lower())
    docs = sum(
        1
        for p in (changed_files or [])
        if any(seg in (p or "").lower() for seg in ("docs/", "doc/"))
    )
    source_files = total - tests - docs

    # Modify ending instruction based on force_generate_instruction flag
    if force_generate_instruction:
        ending_instruction = (
            "\nIMPORTANT: Generate a detailed instruction for this PR regardless of complexity.\n"
            "You should ALWAYS set is_substantial=true and write a comprehensive bug report/task instruction.\n"
            "Even if the PR seems simple, treat it as a valid task and describe the problem that was fixed.\n"
            "Include specific function/method/class names that appear in the tests or issue.\n"
            "Focus on what needs to be implemented, not where or how to implement it.\n"
            "REMEMBER: Do NOT mention test files - the agent won't see them. Write from a user/issue perspective."
        )
    else:
        ending_instruction = (
            "\nFirst, evaluate if this PR is substantial enough to generate a task.\n"
            "Remember: PRs with changes to only 1-2 files are usually too trivial unless they involve major complexity.\n"
            "Look for changes across multiple source files that demonstrate real cross-component coordination.\n"
            "If substantial, write a detailed bug report describing the PROBLEM and what needs to be implemented.\n"
            "Include specific function/method/class names from tests or issues, but NOT file paths or implementation details.\n"
            "REMEMBER: Do NOT mention test files - the agent won't see them. Write from a user/issue perspective.\n"
            "If not substantial, explain why briefly and set instruction to null."
        )

    # Build test contents section if provided
    # NOTE: Tests help the LLM understand expected behavior, but it should NOT
    # mention test files in the instruction since the agent won't see them
    test_section = ""
    if test_contents and len(test_contents) > 0:
        test_lines = ["Test Files (for understanding behavior - do NOT reference these in your instruction):"]
        total_length = 0
        files_included = 0

        # Sort by file size (smaller first) to prioritize including more files
        sorted_tests = sorted(test_contents.items(), key=lambda x: len(x[1]))

        for test_file, content in sorted_tests:
            # Truncate individual file if too long
            if len(content) > MAX_TEST_FILE_LENGTH:
                content = content[:MAX_TEST_FILE_LENGTH] + "\n... (truncated)"

            # Check if adding this file would exceed total limit
            if total_length + len(content) > MAX_TOTAL_TEST_LENGTH:
                # BUGFIX: the omitted count was previously computed from
                # len(test_lines), but test_lines grows by TWO entries
                # (separator + content) per included file plus one header,
                # so the reported number was wrong whenever any file had
                # already been added. Track included files explicitly.
                test_lines.append(f"\n... ({len(test_contents) - files_included} more test files omitted)")
                break

            test_lines.append(f"\n--- {test_file} ---")
            test_lines.append(content)
            total_length += len(content)
            files_included += 1

        test_section = "\n".join(test_lines) + "\n\n"

    # MODE 1: Linked issues exist - use issue + PR body + tests
    if linked_issues and len(linked_issues) > 0:
        # Sort by body length (longer = more detail = more useful), take top N
        sorted_issues = sorted(
            linked_issues, key=lambda x: len(x.get("body", "") or ""), reverse=True
        )[:MAX_LINKED_ISSUES]

        issue_lines = []
        for issue in sorted_issues:
            issue_num = issue.get("number", "")
            issue_title = issue.get("title", "")
            issue_body = (issue.get("body", "") or "").strip()
            # Truncate issue body if too long
            if len(issue_body) > MAX_ISSUE_BODY_LENGTH:
                issue_body = issue_body[:MAX_ISSUE_BODY_LENGTH] + "\n...(truncated)"

            issue_lines.append(f"Issue #{issue_num}: {issue_title}")
            if issue_body:
                issue_lines.append(f"{issue_body}\n")

        issues_section = "\n".join(issue_lines)

        # Include PR body for additional context
        pr_body_truncated = (pr_body or "").strip()
        if len(pr_body_truncated) > MAX_PR_BODY_LENGTH:
            pr_body_truncated = pr_body_truncated[:MAX_PR_BODY_LENGTH] + "\n...(truncated)"

        pr_body_section = ""
        if pr_body_truncated:
            pr_body_section = f"PR Description (for additional context):\n{pr_body_truncated}\n\n"

        return (
            f"Repository: {repo}\n"
            f"PR Title: {pr_title}\n\n"
            f"Linked Issue(s):\n{issues_section}\n\n"
            + pr_body_section
            + test_section
            + f"Scope (for evaluation only): {source_files} source files, {tests} test files changed\n"
            + ending_instruction
        )

    # MODE 2: No linked issue - use PR title + body + tests
    pr_body_truncated = (pr_body or "").strip()
    if len(pr_body_truncated) > MAX_PR_BODY_LENGTH:
        pr_body_truncated = pr_body_truncated[:MAX_PR_BODY_LENGTH] + "\n...(truncated)"

    return (
        f"Repository: {repo}\n"
        f"PR Title: {pr_title}\n\n"
        + (f"PR Description:\n{pr_body_truncated}\n\n" if pr_body_truncated else "")
        + test_section
        + f"Scope (for evaluation only): {source_files} source files, {tests} test files changed\n\n"
        + ending_instruction
    )
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
def evaluate_and_generate_task(
    metadata: dict,
    files: list[dict],
    repo: str,
    model: str = MODEL_NAME,
    api_key: str | None = None,
    linked_issues: list[dict] | None = None,
    force_generate_instruction: bool = False,
    test_contents: dict[str, str] | None = None,
) -> CombinedPRTaskEvaluation:
    """Evaluate PR substantiality and generate task description in one LLM call.

    Uses OpenAI's structured outputs with the parse() method for type-safe responses.

    Args:
        metadata: PR metadata dict
        files: List of changed files
        repo: Repository name
        model: OpenAI model to use
        api_key: Optional OpenAI API key
        linked_issues: Optional list of linked issue dicts (with 'title', 'body', 'number')
        force_generate_instruction: If True, always generate an instruction even if PR seems trivial
        test_contents: Optional dict mapping test file paths to their contents

    Returns:
        CombinedPRTaskEvaluation with evaluation and task details

    Raises:
        RuntimeError: If API key is missing or LLM call fails
    """
    logger = logging.getLogger("swegen")

    # Resolve the API key up front; fail fast when none is available.
    resolved_key = api_key or os.getenv("OPENAI_API_KEY")
    if not resolved_key:
        raise RuntimeError("OPENAI_API_KEY not set")

    # Assemble the prompt. The diff and commit list are deliberately
    # omitted so the prompt cannot leak the solution.
    prompt = _format_user_prompt(
        metadata.get("title", ""),
        metadata.get("body", ""),
        repo,
        [f.get("filename", "") for f in files],
        linked_issues=linked_issues,
        force_generate_instruction=force_generate_instruction,
        test_contents=test_contents,
    )

    # Longer timeout for reasoning models.
    client = OpenAI(api_key=resolved_key, timeout=OPENAI_API_TIMEOUT)

    try:
        # Structured outputs via parse(): the response is validated directly
        # into a CombinedPRTaskEvaluation instance.
        completion = client.beta.chat.completions.parse(
            model=model,
            messages=[
                {"role": "system", "content": COMBINED_SYSTEM_PROMPT},
                {"role": "user", "content": prompt},
            ],
            response_format=CombinedPRTaskEvaluation,
            max_completion_tokens=MAX_COMPLETION_TOKENS,
            # TODO: consider setting reasoning_effort explicitly.
        )

        parsed = completion.choices[0].message.parsed
        if parsed is None:
            raise RuntimeError("LLM returned no parsed result")

        logger.debug(
            f"Combined evaluation: is_substantial={parsed.is_substantial}, reason={parsed.reason[:DEBUG_REASON_TRUNCATE_LENGTH]}..."
        )

        # Post-processing applies only to substantial results; skipped PRs
        # legitimately carry no tags/instruction.
        if parsed.is_substantial:
            tag_count = len(parsed.tags)
            if tag_count < 1:
                logger.error(f"❌ LLM generated only {tag_count} tags")
                raise RuntimeError(f"LLM generated only {tag_count} tags")

            # The instruction must be present and non-trivially long.
            # NOTE: the reported char count is of the raw (unstripped) text.
            if not parsed.instruction or len(parsed.instruction.strip()) < MIN_INSTRUCTION_LENGTH:
                reported = len(parsed.instruction) if parsed.instruction else 0
                logger.error(
                    f"❌ LLM generated instruction too short: {reported} chars"
                )
                raise RuntimeError(
                    f"Instruction too short: {reported} chars (need {MIN_INSTRUCTION_LENGTH}+)"
                )

            # Fill in defaults the model may have left blank.
            if not parsed.difficulty:
                parsed.difficulty = "medium"
            if not parsed.category:
                parsed.category = "bugfix"

        return parsed

    except Exception as exc:
        # Include the concrete exception type for easier debugging; any
        # failure (API or validation) surfaces uniformly as RuntimeError.
        logger.error(f"Combined LLM call failed ({type(exc).__name__}): {exc}")
        raise RuntimeError(f"Combined LLM call failed: {exc}") from exc
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import logging
|
|
5
|
+
from dataclasses import asdict, dataclass
|
|
6
|
+
from datetime import UTC, datetime
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
logger = logging.getLogger("swegen")
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass
class TaskReference:
    """Reference to a successful task that can be reused."""

    # Repository in "owner/repo" form the task was created for.
    repo: str
    # Identifier of the successful task.
    task_id: str
    # Pull request number the task was generated from.
    pr_number: int
    # ISO-8601 timestamp of when the reference was recorded (UTC);
    # None for references saved before timestamps were tracked.
    created_at: str | None = None
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class TaskReferenceStore:
    """Stores references to successful tasks for reuse across PRs."""

    def __init__(self, reference_file: Path | None = None):
        """
        Initialize task reference store.

        Args:
            reference_file: Path to JSON file storing references (default: .state/task_references.json)
        """
        self.reference_file = reference_file or Path(".state/task_references.json")
        self.reference_file.parent.mkdir(parents=True, exist_ok=True)

    def _load_references(self) -> dict[str, TaskReference]:
        """Read every stored reference, keyed by repo; {} when missing/corrupt."""
        if not self.reference_file.exists():
            return {}

        try:
            raw = json.loads(self.reference_file.read_text())
            return {name: TaskReference(**payload) for name, payload in raw.items()}
        except Exception as e:
            # Best-effort store: a corrupt file is treated as empty.
            logger.warning(f"Failed to load task references: {e}")
            return {}

    def _save_references(self, references: dict[str, TaskReference]) -> None:
        """Serialize the full reference map back to disk as pretty JSON."""
        serializable = {name: asdict(ref) for name, ref in references.items()}
        self.reference_file.write_text(json.dumps(serializable, indent=2))

    def save(
        self,
        repo: str,
        task_id: str,
        pr_number: int,
    ) -> bool:
        """
        Save a reference to a successful task.

        Args:
            repo: Repository name (owner/repo)
            task_id: Task identifier of the successful task
            pr_number: PR number

        Returns:
            True if reference was saved successfully
        """
        try:
            references = self._load_references()
            # Overwrite any previous entry for this repo with a fresh,
            # timestamped reference.
            references[repo] = TaskReference(
                repo=repo,
                task_id=task_id,
                pr_number=pr_number,
                created_at=datetime.now(UTC).isoformat(),
            )
            self._save_references(references)

            logger.info(f"✓ Saved task reference for {repo} → {task_id}")
            return True

        except Exception as e:
            # Persisting a reference is an optimization, never fatal.
            logger.warning(f"Failed to save task reference: {e}")
            return False

    def get(
        self,
        repo: str,
        max_age_days: int = 180,
    ) -> TaskReference | None:
        """
        Get reference to a successful task for reuse.

        Args:
            repo: Repository name (owner/repo)
            max_age_days: Maximum age of reference in days (default: 180)

        Returns:
            TaskReference if valid reference exists, None otherwise
        """
        try:
            reference = self._load_references().get(repo)
            if reference is None:
                logger.debug(f"No task reference found for {repo}")
                return None

            # Age gate: entries without a timestamp are accepted as-is.
            if reference.created_at:
                created = datetime.fromisoformat(reference.created_at)
                age_days = (datetime.now(UTC) - created).days
                if age_days > max_age_days:
                    logger.debug(f"Reference too old for {repo}: {age_days} days > {max_age_days}")
                    return None

            logger.info(
                f"✓ Found task reference for {repo} → {reference.task_id} "
                f"(from PR #{reference.pr_number})"
            )
            return reference

        except Exception as e:
            logger.warning(f"Failed to get task reference for {repo}: {e}")
            return None
|