swegen-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,142 @@
+ from __future__ import annotations
+
+ import logging
+ import subprocess
+ from pathlib import Path
+
+ from .utils import _is_relevant_source, strip_tests_prefix
+
+
+ def generate_diffs(
+     repo_path: Path,
+     base_sha: str,
+     head_sha: str,
+     test_file_paths: list[str],
+ ) -> tuple[str, str]:
+     """
+     Generate fix.patch and bug.patch from a repository.
+
+     Reversed Baseline Strategy:
+     - fix.patch: base→head, SOURCE files only (what oracle applies to fix)
+     - bug.patch: head→base, ALL files (reverts everything to BASE state)
+
+     Args:
+         repo_path: Path to the git repository
+         base_sha: Base commit SHA (pre-PR state)
+         head_sha: Head commit SHA (post-PR state with fix)
+         test_file_paths: List of test file paths (for logging only)
+
+     Returns:
+         Tuple of (solution_diff, bug_diff)
+     """
+     logger = logging.getLogger("swegen")
+
+     # Get all changed files
+     result = subprocess.run(
+         ["git", "diff", "--name-only", base_sha, head_sha],
+         cwd=str(repo_path),
+         check=True,
+         capture_output=True,
+         text=True,
+     )
+     all_changed = [f for f in result.stdout.strip().split("\n") if f]
+
+     # Filter for source files (exclude tests and CI; includes docs, examples, configs)
+     source_files = [f for f in all_changed if _is_relevant_source(f)]
+
+     logger.debug("Total changed files: %d", len(all_changed))
+     logger.debug("Relevant source files (for fix.patch): %d", len(source_files))
+     logger.debug("Test files (included in bug.patch): %s", test_file_paths)
+
+     # Generate fix.patch (base → head, source only)
+     # This is what the oracle applies to fix the bug
+     logger.debug("Generating fix.patch (base → head, source only)...")
+     if source_files:
+         result = subprocess.run(
+             ["git", "diff", base_sha, head_sha, "--"] + source_files,
+             cwd=str(repo_path),
+             check=True,
+             capture_output=True,
+             text=True,
+         )
+         solution_diff = result.stdout
+     else:
+         logger.warning("No source files changed! fix.patch will be empty.")
+         solution_diff = ""
+
+     # Generate bug.patch (head → base, ALL files)
+     # This reverts everything so agent sees BASE state
+     logger.debug("Generating bug.patch (head → base, ALL files)...")
+     result = subprocess.run(
+         ["git", "diff", head_sha, base_sha],
+         cwd=str(repo_path),
+         check=True,
+         capture_output=True,
+         text=True,
+     )
+     bug_diff = result.stdout
+
+     return solution_diff, bug_diff
+
+
+ def extract_test_files(
+     repo_path: Path,
+     test_file_paths: list[str],
+     head_sha: str,
+     output_dir: Path,
+ ) -> list[str]:
+     """
+     Extract test files from HEAD commit to task/tests/ directory.
+
+     These files will be copied into the container at verification time,
+     overwriting the BASE state test files (after bug.patch is applied).
+
+     Args:
+         repo_path: Path to the git repository
+         test_file_paths: List of repo-relative test file paths
+         head_sha: Commit SHA to extract files from
+         output_dir: Task output directory (tests/ subdir will be used)
+
+     Returns:
+         List of successfully extracted test file paths (repo-relative)
+     """
+     logger = logging.getLogger("swegen")
+     logger.debug("Extracting test files from HEAD commit...")
+
+     # Create tests directory in task output
+     test_dir = output_dir / "tests"
+     test_dir.mkdir(exist_ok=True, parents=True)
+     test_dir = test_dir.resolve()
+
+     if not test_file_paths:
+         logger.warning("No test files found in PR!")
+         return []
+
+     extracted = []
+     for test_file_path in test_file_paths:
+         try:
+             # Extract file content directly from HEAD commit
+             content = subprocess.run(
+                 ["git", "show", f"{head_sha}:{test_file_path}"],
+                 cwd=str(repo_path),
+                 check=True,
+                 capture_output=True,
+             ).stdout
+
+             # Preserve directory structure under tests/
+             # Strip leading "tests/" since we're already putting in tests dir
+             relative_path = Path(strip_tests_prefix(test_file_path))
+
+             dest_path = test_dir / relative_path
+             dest_path.parent.mkdir(parents=True, exist_ok=True)
+             dest_path.write_bytes(content)
+
+             logger.debug("Extracted test file: %s -> %s", test_file_path, dest_path)
+             extracted.append(test_file_path)
+
+         except subprocess.CalledProcessError:
+             logger.warning("Test file not found in HEAD: %s", test_file_path)
+             continue
+
+     logger.debug("Extracted %d/%d test files to %s", len(extracted), len(test_file_paths), test_dir)
+     return extracted
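
Taken together, these two helpers produce the patches and test fixtures for a task. A minimal sketch of driving them directly is below; the module path, repository location, SHAs, and file names are illustrative assumptions, not values from this package.

    # Sketch only: assumes a pre-cloned repo and made-up SHAs/paths.
    from pathlib import Path

    from swegen.diff_utils import extract_test_files, generate_diffs  # assumed module path

    repo = Path("/tmp/example-repo")            # hypothetical pre-cloned working copy
    base_sha, head_sha = "abc1234", "def5678"   # hypothetical base/head commits of a merged PR
    test_paths = ["tests/test_example.py"]      # hypothetical test files touched by the PR

    fix_patch, bug_patch = generate_diffs(repo, base_sha, head_sha, test_paths)
    task_dir = Path("task-out")
    (task_dir / "solution").mkdir(parents=True, exist_ok=True)
    (task_dir / "solution" / "fix.patch").write_text(fix_patch)
    (task_dir / "environment").mkdir(exist_ok=True)
    (task_dir / "environment" / "bug.patch").write_text(bug_patch)

    extracted = extract_test_files(repo, test_paths, head_sha, task_dir)
    print(f"Extracted {len(extracted)} test file(s) into {task_dir / 'tests'}")
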
@@ -0,0 +1,368 @@
+ from __future__ import annotations
+
+ import logging
+ import shutil
+ from pathlib import Path
+
+ from harbor.models.task.paths import TaskPaths
+
+ from .claude_code_runner import ClaudeCodeResult, run_claude_code_session
+ from .diff_utils import extract_test_files, generate_diffs
+ from .pr_fetcher import GitHubPRFetcher
+ from .repo_cache import RepoCache
+ from .task_instruction import evaluate_and_generate_task
+ from .task_reference import TaskReference, TaskReferenceStore
+ from .task_skeleton import (
+     SkeletonParams,
+     generate_instruction_md,
+     generate_task_toml,
+     generate_dockerfile,
+     generate_solve_sh,
+     generate_test_sh,
+ )
+ from .utils import check_multi_file_requirement, identify_test_files
+
+
+ class TrivialPRError(Exception):
+     """Raised when a PR is too trivial to generate a task from."""
+
+     pass
+
+
+ class MissingIssueError(Exception):
+     """Raised when a PR has no linked issue and require_issue is enabled."""
+
+     pass
+
+
+ class PRToHarborPipeline:
+     """Orchestrates the conversion of a GitHub PR into a Harbor-compatible task."""
+
+     def __init__(self, repo: str, pr_number: int, github_token: str | None = None):
+         """
+         Initialize the pipeline.
+
+         Args:
+             repo: GitHub repo in format "owner/repo" or full URL
+             pr_number: PR number
+             github_token: Optional GitHub token for API access
+         """
+         self.pr_fetcher = GitHubPRFetcher(repo, pr_number, github_token)
+         self.repo = self.pr_fetcher.repo
+         self.pr_number = pr_number
+         # Lowercase repo name for task_id (used in Docker image names which must be lowercase)
+         # Format: owner__repo-number (SWE-bench convention)
+         repo_slug = self.repo.lower().replace("/", "__")
+         self.task_id = f"{repo_slug}-{pr_number}"
+
+     def create_task_scaffold(self, tasks_root: Path, overwrite: bool = False) -> Path:
+         """
+         Create task directory structure.
+
+         Returns the task directory path.
+         """
+         logger = logging.getLogger("swegen")
+         logger.debug("Creating task scaffold...")
+
+         task_dir = tasks_root / self.task_id
+
+         # Check if task already exists
+         if task_dir.exists():
+             logger.debug(f"Task directory already exists: {task_dir}")
+             if overwrite:
+                 logger.debug("Removing existing directory (forced)...")
+                 shutil.rmtree(task_dir)
+             else:
+                 raise FileExistsError(f"Task already exists: {task_dir}\nUse --force to overwrite.")
+
+         # Create the task directory
+         logger.debug(f"Creating directory: {task_dir}")
+         task_dir.mkdir(parents=True, exist_ok=True)
+
+         return task_dir
+
+     def generate_task(
+         self,
+         tasks_root: Path,
+         overwrite: bool = False,
+         cache_dir: Path | None = None,
+         repo_path: Path | None = None,
+         metadata: dict | None = None,
+         linked_issues: list | None = None,
+         run_cc: bool = True,
+         cc_timeout: int = 3200,
+         verbose: bool = True,
+         use_cache: bool = True,
+         state_dir: Path | None = None,
+         require_minimum_difficulty: bool = True,
+         min_source_files: int = 3,
+         max_source_files: int = 10,
+         environment: str = "docker",
+     ) -> tuple[Path, ClaudeCodeResult | None, list[str], TaskReference | None]:
+         """
+         Generate a Harbor task using skeleton + Claude Code.
+
+         This is the language-agnostic pipeline that works for any repository.
+         Claude Code analyzes the repo to detect language, runtime, build system,
+         and test framework, then fills in the skeleton accordingly.
+
+         Flow:
+         1. Clone/update repo to local cache
+         2. Generate skeleton (language-agnostic Dockerfile, test.sh)
+         3. Run Claude Code to detect language and fill in skeleton
+         4. Validate with Harbor NOP/Oracle agents
+
+         Args:
+             tasks_root: Output root directory (Harbor tasks go here)
+             overwrite: If True, remove existing task dir
+             cache_dir: Directory for repo cache (default: .cache/repos)
+             repo_path: Pre-cloned repo path (skips cloning if provided)
+             metadata: Pre-fetched PR metadata (skips API call if provided)
+             linked_issues: Pre-fetched linked issues (skips API call if provided)
+             run_cc: If True, run CC to complete skeleton (default: True)
+             cc_timeout: Timeout for CC session in seconds
+             verbose: If True, stream CC output
+             use_cache: If True, try to reuse cached artifacts from previous successful PRs
+             state_dir: State directory for task references (default: .state)
+             require_minimum_difficulty: If True, require at least min_source_files modified source files
+             min_source_files: Minimum number of source files required (default: 3)
+             max_source_files: Maximum number of source files allowed to avoid large refactors (default: 10)
+
+         Returns:
+             Tuple of (task_dir, cc_result, extracted_test_files, task_reference)
+             cc_result is None if run_cc=False
+             task_reference is None if no cached reference exists or use_cache=False
+         """
+         logger = logging.getLogger("swegen")
+         logger.info("=" * 60)
+         logger.info("Task Generation")
+         logger.info("Repo: %s, PR: #%d", self.repo, self.pr_number)
+         logger.info("=" * 60)
+
+         # Initialize reference store with proper state directory
+         reference_store = None
+         if use_cache:
+             reference_file = (state_dir / "task_references.json") if state_dir else None
+             reference_store = TaskReferenceStore(reference_file=reference_file)
+
+         # Step 1: Fetch PR metadata (use provided or fetch)
+         if metadata is None:
+             metadata = self.pr_fetcher.fetch_pr_metadata(allow_unmerged=self.config.allow_unmerged)
+
+         # Fetch linked issues for better task descriptions (use provided or fetch)
+         if linked_issues is None:
+             linked_issues = []
+             try:
+                 linked_issues = self.pr_fetcher.fetch_linked_issues()
+                 if linked_issues:
+                     logger.info("Found %d linked issue(s)", len(linked_issues))
+             except Exception as e:
+                 logger.debug("Could not fetch linked issues: %s", str(e))
+
+         files = self.pr_fetcher.fetch_pr_files()
+
+         # Step 2: Multi-file requirement check (fail fast before expensive operations)
+         # Use generic language detection - CC will figure out the actual language
+         if require_minimum_difficulty:
+             passes, reason, source_count = check_multi_file_requirement(
+                 files, min_files=min_source_files, max_files=max_source_files
+             )
+             if not passes:
+                 logger.warning("Skipping PR - source file count out of range: %s", reason)
+                 raise TrivialPRError(f"PR #{self.pr_number}: {reason}")
+             logger.info(
+                 "Multi-file check passed: %d source files (excluding tests, range: %d-%d)",
+                 source_count,
+                 min_source_files,
+                 max_source_files,
+             )
+         else:
+             logger.info("Skipping minimum difficulty check (require_minimum_difficulty=False)")
+
+         # Step 3: Identify test files (language-agnostic patterns)
+         test_file_paths = identify_test_files(files)
+         logger.info("Identified %d test files", len(test_file_paths))
+
+         # Step 4: Clone/update repo to local cache (use provided or clone)
+         if repo_path is None:
+             repo_cache = RepoCache(cache_dir)
+             repo_path = repo_cache.get_or_clone(
+                 repo=self.repo,
+                 head_sha=metadata["head_sha"],
+                 repo_url=metadata["repo_url"],
+             )
+         logger.info("Repo at: %s", repo_path)
+
+         # Step 5: Create task scaffold
+         task_dir = self.create_task_scaffold(tasks_root, overwrite=overwrite)
+         paths = TaskPaths(task_dir)
+         paths.environment_dir.mkdir(exist_ok=True)
+         paths.solution_dir.mkdir(exist_ok=True)
+         paths.tests_dir.mkdir(exist_ok=True)
+
+         try:
+             # Step 6: Try to get reference to previous successful task
+             task_reference = None
+             if reference_store:
+                 task_reference = reference_store.get(
+                     repo=self.repo,
+                     max_age_days=180,
+                 )
+                 if task_reference:
+                     logger.info(
+                         f"Found task reference: {task_reference.task_id} "
+                         f"(from PR #{task_reference.pr_number}, created {task_reference.created_at[:10]})"
+                     )
+
+             # Step 7: Generate diffs from local repo (language-agnostic)
+             solution_diff, bug_diff = generate_diffs(
+                 repo_path=repo_path,
+                 base_sha=metadata["base_sha"],
+                 head_sha=metadata["head_sha"],
+                 test_file_paths=test_file_paths,
+             )
+
+             # Step 8: Extract test files
+             extracted_test_files = extract_test_files(
+                 repo_path=repo_path,
+                 test_file_paths=test_file_paths,
+                 head_sha=metadata["head_sha"],
+                 output_dir=task_dir,
+             )
+
+             # Step 8b: Read test file contents for instruction generation
+             test_contents = {}
+             test_dir = task_dir / "tests"
+             if test_dir.exists():
+                 for test_file in test_dir.rglob("*"):
+                     if test_file.is_file():
+                         try:
+                             # Read as text, skip binary files
+                             content = test_file.read_text(encoding='utf-8', errors='ignore')
+                             # Store with relative path from tests/ dir
+                             rel_path = test_file.relative_to(test_dir)
+                             test_contents[str(rel_path)] = content
+                         except Exception as e:
+                             logger.debug(f"Could not read test file {test_file}: {e}")
+
+             # Step 9: Generate evaluation + instruction (uses LLM but not CC)
+             logger.info("Evaluating PR and generating instruction...")
+             try:
+                 combined_result = evaluate_and_generate_task(
+                     metadata,
+                     files,
+                     self.repo,
+                     linked_issues=linked_issues,
+                     force_generate_instruction=(not require_minimum_difficulty),
+                     test_contents=test_contents,
+                 )
+
+                 if not combined_result.is_substantial:
+                     if require_minimum_difficulty:
+                         logger.warning("Skipping trivial PR: %s", combined_result.reason)
+                         shutil.rmtree(task_dir)
+                         raise TrivialPRError(
+                             f"PR #{self.pr_number} is too trivial: {combined_result.reason}"
+                         )
+                     else:
+                         logger.warning(
+                             "PR deemed trivial by LLM, but proceeding anyway: %s",
+                             combined_result.reason,
+                         )
+
+                 instruction_data = {
+                     "instruction": combined_result.instruction,
+                     "difficulty": combined_result.difficulty,
+                     "category": combined_result.category,
+                     "tags": combined_result.tags,
+                 }
+             except TrivialPRError:
+                 raise
+             except Exception:
+                 if task_dir.exists():
+                     shutil.rmtree(task_dir)
+                 raise
+
+             # Step 10: Write skeleton files
+             logger.info("Writing skeleton task files...")
+
+             # Create skeleton params
+             skeleton_params = SkeletonParams(
+                 repo_url=metadata["repo_url"],
+                 head_sha=metadata["head_sha"],
+                 base_sha=metadata["base_sha"],
+                 pr_number=self.pr_number,
+             )
+
+             # bug.patch
+             (paths.environment_dir / "bug.patch").write_text(bug_diff)
+
+             # Dockerfile (with TODOs for CC)
+             dockerfile = generate_dockerfile(skeleton_params)
+             (paths.environment_dir / "Dockerfile").write_text(dockerfile)
+
+             # test.sh (with TODOs for CC)
+             test_sh_content = generate_test_sh(extracted_test_files)
+             paths.test_path.write_text(test_sh_content)
+             paths.test_path.chmod(0o755)
+
+             # instruction.md and task.toml
+             paths.instruction_path.write_text(generate_instruction_md(instruction_data))
+             paths.config_path.write_text(generate_task_toml(instruction_data))
+
+             # solution/fix.patch - the actual fix to apply
+             (paths.solution_dir / "fix.patch").write_text(solution_diff)
+
+             # solution/solve.sh - applies fix.patch (same for all languages)
+             paths.solve_path.write_text(generate_solve_sh())
+             paths.solve_path.chmod(0o755)
+
+             logger.info("Skeleton generated: %s", task_dir)
+
+             # Step 11: Run CC to complete skeleton and make harbor pass
+             cc_result = None
+             if run_cc:
+                 if task_reference:
+                     logger.info(
+                         f"Running CC with reference task {task_reference.task_id} "
+                         f"from PR #{task_reference.pr_number} (should be much faster)..."
+                     )
+                 else:
+                     logger.info(
+                         "Running CC session (will detect language automatically)..."
+                     )
+
+                 cc_result = run_claude_code_session(
+                     repo=self.repo,
+                     pr_number=self.pr_number,
+                     repo_path=repo_path,
+                     task_dir=task_dir,
+                     task_id=self.task_id,
+                     dataset_path=tasks_root,
+                     test_files=extracted_test_files,
+                     timeout=cc_timeout,
+                     verbose=verbose,
+                     reference_task_id=task_reference.task_id if task_reference else None,
+                     reference_pr=task_reference.pr_number if task_reference else None,
+                     head_sha=metadata.get("head_sha"),
+                     environment=environment,
+                 )
+
+                 if cc_result.success:
+                     logger.info("✓ CC completed task successfully!")
+                     # Save reference to this successful task for future PRs
+                     if reference_store and not task_reference:
+                         reference_store.save(
+                             repo=self.repo,
+                             task_id=self.task_id,
+                             pr_number=self.pr_number,
+                         )
+                 else:
+                     logger.warning("✗ CC did not complete task: %s", cc_result.error_message)
+
+             return task_dir, cc_result, extracted_test_files, task_reference
+
+         except Exception:
+             if task_dir.exists():
+                 shutil.rmtree(task_dir)
+             raise
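
For context, a hedged sketch of how this pipeline might be invoked end to end; the module path, repository, and PR number are hypothetical, and metadata is pre-fetched here so that generate_task skips its own metadata lookup.

    # Sketch only: module path, repo, and PR number are assumptions, not values from the package.
    from pathlib import Path

    from swegen.pipeline import PRToHarborPipeline, TrivialPRError  # assumed module path

    pipeline = PRToHarborPipeline("octocat/hello-world", pr_number=123)  # hypothetical repo/PR
    metadata = pipeline.pr_fetcher.fetch_pr_metadata()       # pre-fetch PR metadata
    issues = pipeline.pr_fetcher.fetch_linked_issues()

    try:
        task_dir, cc_result, test_files, reference = pipeline.generate_task(
            tasks_root=Path("tasks"),
            metadata=metadata,
            linked_issues=issues,
            run_cc=False,         # skeleton only; skip the Claude Code session
            use_cache=False,
        )
        print(f"Skeleton written to {task_dir} with {len(test_files)} extracted test file(s)")
    except TrivialPRError as exc:
        print(f"Skipped trivial PR: {exc}")
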
@@ -0,0 +1,187 @@
+ from __future__ import annotations
+
+ import logging
+ import os
+ import re
+ from urllib.parse import urlparse
+
+ import requests
+
+
+ class GitHubPRFetcher:
+     """Fetches PR metadata from the GitHub API."""
+
+     def __init__(self, repo: str, pr_number: int, github_token: str | None = None):
+         """
+         Initialize the PR fetcher.
+
+         Args:
+             repo: GitHub repo in format "owner/repo" or full URL
+             pr_number: PR number
+             github_token: Optional GitHub token for API access
+         """
+         self.repo = self._parse_repo(repo)
+         self.pr_number = pr_number
+         self.github_token = github_token or os.environ.get("GITHUB_TOKEN")
+
+         # API setup
+         self.api_base = "https://api.github.com"
+         self.headers = {"Accept": "application/vnd.github.v3+json"}
+         if self.github_token:
+             self.headers["Authorization"] = f"token {self.github_token}"
+
+     def _parse_repo(self, repo: str) -> str:
+         """Parse a repo URL or owner/repo string into owner/repo form."""
+         if repo.startswith("http"):
+             parsed = urlparse(repo)
+             # Extract owner/repo from the URL path
+             path = parsed.path.strip("/")
+             if path.endswith(".git"):
+                 path = path[:-4]
+             return path
+         return repo
+
+     def _api_get(self, endpoint: str) -> dict | list:
+         """Make a GET request to the GitHub API."""
+         url = f"{self.api_base}{endpoint}"
+         response = requests.get(url, headers=self.headers)
+         response.raise_for_status()
+         return response.json()
+
+     def fetch_pr_metadata(self, allow_unmerged: bool = False) -> dict:
+         """Fetch PR metadata from the GitHub API.
+
+         Args:
+             allow_unmerged: If True, allow unmerged PRs (for testing/preview). Default False.
+         """
+         logger = logging.getLogger("swegen")
+         logger.debug("Fetching PR #%s metadata from %s...", self.pr_number, self.repo)
+         pr_data = self._api_get(f"/repos/{self.repo}/pulls/{self.pr_number}")
+
+         if not allow_unmerged and not pr_data.get("merged"):
+             raise ValueError(f"PR #{self.pr_number} is not merged yet!")
+
+         # Get the commits
+         base_sha = pr_data["base"]["sha"]
+         head_sha = pr_data["head"]["sha"]
+         merge_commit_sha = pr_data.get("merge_commit_sha")
+         logger.debug("Base SHA: %s", base_sha)
+         logger.debug("Head SHA: %s", head_sha)
+         logger.debug("Merge SHA: %s", merge_commit_sha)
+
+         return {
+             "title": pr_data["title"],
+             "body": pr_data.get("body", ""),
+             "base_sha": base_sha,
+             "head_sha": head_sha,
+             "merge_commit_sha": merge_commit_sha,
+             "base_ref": pr_data["base"]["ref"],
+             "head_ref": pr_data["head"]["ref"],
+             "repo_url": pr_data["base"]["repo"]["clone_url"],
+             "html_url": pr_data["html_url"],
+             "created_at": pr_data["created_at"],
+             "merged_at": pr_data["merged_at"],
+         }
+
+     def fetch_pr_files(self) -> list[dict]:
+         """Fetch the list of files changed in the PR."""
+         logger = logging.getLogger("swegen")
+         logger.debug("Fetching changed files for PR #%s...", self.pr_number)
+         files_response = self._api_get(f"/repos/{self.repo}/pulls/{self.pr_number}/files")
+         # The API may return a dict with pagination info or a list directly
+         files = (
+             files_response if isinstance(files_response, list) else files_response.get("files", [])
+         )
+         logger.debug("Found %d changed files", len(files))
+         for f in files:
+             logger.debug(" %s %s", f["status"], f["filename"])
+
+         return files
+
+     def fetch_linked_issues(self) -> list[dict]:
+         """Fetch issues linked/referenced in the PR.
+
+         Uses the BROADEST approach possible:
+         1. GitHub Timeline API (catches manual links and cross-references)
+         2. PR title parsing
+         3. PR body parsing
+
+         Returns a list of issue dictionaries with 'number', 'title', and 'body'.
+         """
+         logger = logging.getLogger("swegen")
+         logger.debug("Fetching linked issues for PR #%s...", self.pr_number)
+
+         issues = []
+         issue_numbers = set()
+
+         try:
+             # Method 1: Use timeline API to find closing references and manual links
+             timeline_url = f"/repos/{self.repo}/issues/{self.pr_number}/timeline"
+             headers = self.headers.copy()
+             headers["Accept"] = "application/vnd.github.mockingbird-preview+json"
+
+             url = f"{self.api_base}{timeline_url}"
+             response = requests.get(url, headers=headers)
+             response.raise_for_status()
+             timeline = response.json()
+
+             for event in timeline:
+                 if event.get("event") == "cross-referenced":
+                     source = event.get("source", {})
+                     if source.get("type") == "issue":
+                         issue_data = source.get("issue", {})
+                         issue_num = issue_data.get("number")
+                         if issue_num and issue_num != self.pr_number:
+                             issue_numbers.add(issue_num)
+         except Exception as e:
+             logger.debug("Timeline API failed (may not have access): %s", str(e))
+
+         try:
+             # Method 2: Parse PR title and body for issue references
+             pr_data = self._api_get(f"/repos/{self.repo}/pulls/{self.pr_number}")
+             pr_title = pr_data.get("title", "") or ""
+             pr_body = pr_data.get("body", "") or ""
+
+             # Combine title and body
+             text = f"{pr_title}\n{pr_body}"
+
+             # Remove HTML comments before parsing (like SWE-smith does)
+             comments_pat = re.compile(r"(?s)<!--.*?-->")
+             text = comments_pat.sub("", text)
+
+             # Match patterns - BROADEST approach (keywords optional, standalone #123 also matches)
+             patterns = [
+                 r"(?:fix(?:es|ed)?|close(?:s|d)?|resolve(?:s|d)?)\s+#(\d+)",  # With keywords
+                 r"(?:fix(?:es|ed)?|close(?:s|d)?|resolve(?:s|d)?)\s+https?://github\.com/[^/]+/[^/]+/issues/(\d+)",  # Full URLs with keywords
+                 r"#(\d+)",  # Standalone #123 (no keyword required - broadest)
+                 r"https?://github\.com/[^/]+/[^/]+/issues/(\d+)",  # Full URLs without keywords
+             ]
+
+             for pattern in patterns:
+                 matches = re.finditer(pattern, text, re.IGNORECASE)
+                 for match in matches:
+                     issue_num = int(match.group(1))
+                     if issue_num != self.pr_number:  # Don't include the PR itself
+                         issue_numbers.add(issue_num)
+         except Exception as e:
+             logger.debug("Failed to parse PR title/body for issue refs: %s", str(e))
+
+         # Fetch full issue data for each linked issue
+         for issue_num in sorted(issue_numbers):
+             try:
+                 issue_data = self._api_get(f"/repos/{self.repo}/issues/{issue_num}")
+                 issues.append(
+                     {
+                         "number": issue_data["number"],
+                         "title": issue_data["title"],
+                         "body": issue_data.get("body", ""),
+                         "state": issue_data.get("state", ""),
+                         "html_url": issue_data.get("html_url", ""),
+                     }
+                 )
+                 logger.debug(" Found linked issue #%d: %s", issue_num, issue_data["title"])
+             except Exception as e:
+                 logger.debug(" Failed to fetch issue #%d: %s", issue_num, str(e))
+
+         logger.debug("Collected %d linked issues", len(issues))
+         return issues
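
For completeness, a hedged usage sketch of the fetcher on its own; the module path, repository URL, and PR number are hypothetical, and GITHUB_TOKEN is picked up from the environment if present.

    # Sketch only: repo and PR number are placeholders, not values from the package.
    from swegen.pr_fetcher import GitHubPRFetcher  # assumed module path

    fetcher = GitHubPRFetcher("https://github.com/octocat/hello-world", pr_number=123)
    meta = fetcher.fetch_pr_metadata()      # raises ValueError if the PR is not merged
    files = fetcher.fetch_pr_files()
    issues = fetcher.fetch_linked_issues()

    print(meta["title"], meta["base_sha"][:7], "->", meta["head_sha"][:7])
    print(f"{len(files)} changed file(s), {len(issues)} linked issue(s)")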