swegen 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- swegen/__init__.py +14 -0
- swegen/analyze/__init__.py +24 -0
- swegen/analyze/classifier.py +637 -0
- swegen/analyze/classify_prompt.txt +241 -0
- swegen/analyze/models.py +253 -0
- swegen/analyze/run.py +656 -0
- swegen/analyze/verdict_prompt.txt +126 -0
- swegen/cli.py +411 -0
- swegen/config.py +142 -0
- swegen/create/__init__.py +22 -0
- swegen/create/claude_code_runner.py +988 -0
- swegen/create/claude_code_utils.py +95 -0
- swegen/create/create.py +706 -0
- swegen/create/diff_utils.py +142 -0
- swegen/create/orchestrator.py +368 -0
- swegen/create/pr_fetcher.py +187 -0
- swegen/create/repo_cache.py +175 -0
- swegen/create/task_instruction.py +363 -0
- swegen/create/task_reference.py +130 -0
- swegen/create/task_skeleton.py +266 -0
- swegen/create/utils.py +350 -0
- swegen/farm/__init__.py +13 -0
- swegen/farm/farm_hand.py +342 -0
- swegen/farm/fetcher.py +341 -0
- swegen/farm/state.py +231 -0
- swegen/farm/stream_farm.py +430 -0
- swegen/tools/__init__.py +16 -0
- swegen/tools/harbor_runner.py +191 -0
- swegen/tools/validate.py +523 -0
- swegen/tools/validate_utils.py +142 -0
- swegen-0.1.0.dist-info/METADATA +292 -0
- swegen-0.1.0.dist-info/RECORD +35 -0
- swegen-0.1.0.dist-info/WHEEL +4 -0
- swegen-0.1.0.dist-info/entry_points.txt +3 -0
- swegen-0.1.0.dist-info/licenses/LICENSE +201 -0
swegen/farm/farm_hand.py
ADDED
@@ -0,0 +1,342 @@
from __future__ import annotations

import shutil
import time
import traceback
from dataclasses import dataclass
from datetime import UTC, datetime
from pathlib import Path

from rich.console import Console
from rich.panel import Panel

from swegen.config import CreateConfig, FarmConfig
from swegen.create import MissingIssueError, TrivialPRError, ValidationError
from swegen.create.create import run_reversal
from swegen.create.task_reference import TaskReferenceStore


def _now_utc() -> datetime:
    return datetime.now(UTC)


def _slug(repo: str) -> str:
    """Convert repo to slug using SWEBench convention: owner/repo -> owner__repo"""
    return repo.replace("/", "__")


def _task_id(repo: str, pr_number: int) -> str:
    """Generate task ID using SWEBench convention: owner__repo-number"""
    return f"{_slug(repo)}-{pr_number}"


@dataclass
class PRCandidate:
    """A candidate PR for task generation."""

    number: int
    title: str
    created_at: str
    merged_at: str
    author: str
    files_changed: int
    additions: int
    deletions: int
    url: str


@dataclass
class TaskResult:
    """Result of processing a single PR into a task."""

    repo: str
    pr_number: int
    task_id: str
    status: str  # "success", "failed", or "dry-run"
    message: str
    duration_seconds: float
    timestamp: str
    category: str = None  # Category for detailed tracking


def _cleanup_task(task_id: str, tasks_root: Path, console: Console) -> None:
    removed_any = False
    paths = [
        tasks_root / task_id,
        Path("trash") / task_id,
    ]
    for path in paths:
        if path.exists():
            shutil.rmtree(path, ignore_errors=True)
            removed_any = True
    if removed_any:
        console.print(f"[dim]Cleaned up incomplete task directory: {task_id}[/dim]")


def _classify_failure(stderr: str) -> tuple[str, str]:
    """Classify failure reason and return (category, message).

    Categories:
    - trivial: Trivial PR (too small/simple)
    - no_issue: No linked issue
    - no_tests: No tests detected
    - validation_failed: Harbor validation failed
    - already_exists: Task already exists
    - rate_limit: GitHub API rate limit
    - quota_exceeded: OpenAI quota exceeded
    - timeout: Command timeout
    - git_error: Git checkout/commit errors
    - other: Unknown/other errors
    """
    lowered = stderr.lower()
    if "trivial" in stderr:
        return "trivial", "Trivial PR (skipped)"
    if "no linked issue" in lowered or "missingissueerror" in lowered:
        return "no_issue", "No linked issue (skipped)"
    if "validation failed" in lowered or "harbor validation" in lowered:
        return "validation_failed", "Validation failed (NOP or Oracle)"
    if "task already exists" in lowered or "file exists" in lowered:
        return "already_exists", "Task already exists (skipped)"
    if "no test" in stderr:
        return "no_tests", "No tests detected"
    if "rate limit exceeded" in lowered and "github" in lowered:
        return "rate_limit", "GitHub API rate limit exceeded (set GITHUB_TOKEN)"
    if "insufficient_quota" in lowered or "exceeded your current quota" in lowered:
        return "quota_exceeded", "OpenAI API quota exceeded (check billing)"
    if "timed out" in lowered or "timeout" in lowered:
        return "timeout", "Command timed out"
    if "cannot checkout commit" in lowered or "force-pushed or deleted" in lowered:
        return "git_error", "Git commit not found (may be force-pushed or deleted)"
    if "git checkout" in lowered:
        return "git_error", "Git checkout failed (repo cache may be corrupted)"

    message = (stderr or "Unknown error").replace("\n", " ")
    return "other", message


def _print_success(
    console: Console,
    pr: PRCandidate,
    task_id: str,
    harbor_dir: Path,
) -> None:
    console.print(
        Panel.fit(
            f"🎉 Successfully generated task\n[bold]{task_id}[/bold]\nHarbor: {harbor_dir}",
            title=f"PR #{pr.number}",
            border_style="green",
        )
    )


def _gate_task(
    task_id: str,
    tasks_root: Path,
) -> tuple[bool, str]:
    """
    Validate that the task directory exists.

    Returns:
        Tuple of (success, message)
    """
    task_dir = tasks_root / task_id
    if not task_dir.exists():
        return False, f"Task directory missing: {task_dir}"

    return True, f"Task generated successfully at {task_dir}"


def _run_reversal_for_pr(
    pr: PRCandidate,
    config: FarmConfig,
    tasks_root: Path,
    console: Console,
) -> TaskResult:
    start = time.time()
    task_id = _task_id(config.repo, pr.number)
    harbor_dir = tasks_root / task_id

    # Wrap everything in try-except to catch unexpected errors
    try:
        return _run_reversal_for_pr_impl(
            pr, config, tasks_root, console, task_id, harbor_dir, start
        )
    except Exception as e:
        # Catch any unexpected exception and return proper error
        error_msg = f"Unexpected error: {type(e).__name__}: {str(e)}"
        console.print(f"[red]✗ PR #{pr.number}: {error_msg}[/red]")
        console.print(f"[dim]{traceback.format_exc()}[/dim]")
        _cleanup_task(task_id, tasks_root, console)
        return TaskResult(
            repo=config.repo,
            pr_number=pr.number,
            task_id=task_id,
            status="failed",
            message=error_msg,
            duration_seconds=round(time.time() - start, 2),
            timestamp=_now_utc().isoformat(),
            category="other",
        )


def _run_reversal_for_pr_impl(
    pr: PRCandidate,
    config: FarmConfig,
    tasks_root: Path,
    console: Console,
    task_id: str,
    harbor_dir: Path,
    start: float,
) -> TaskResult:
    if config.dry_run:
        console.print(f"[cyan]DRY RUN[/cyan] would generate task for PR #{pr.number} -> {task_id}")
        return TaskResult(
            repo=config.repo,
            pr_number=pr.number,
            task_id=task_id,
            status="dry-run",
            message="Dry run (skipped actual execution)",
            duration_seconds=0.0,
            timestamp=_now_utc().isoformat(),
            category=None,
        )

    # Build CreateConfig for run_reversal
    create_config = CreateConfig(
        repo=config.repo,
        pr=pr.number,
        output=config.output,
        cc_timeout=config.cc_timeout,
        validate=config.validate,  # Run Harbor validation if --validate flag is set
        force=config.force,
        state_dir=config.state_dir,
        verbose=config.verbose,
        quiet=False,
        use_cache=not config.no_cache,
        require_minimum_difficulty=config.require_minimum_difficulty,
        min_source_files=config.min_source_files,
        max_source_files=config.max_source_files,
        require_issue=config.issue_only,
        environment=config.environment,
    )

    # Capture any errors from the pipeline
    success = False
    error_msg = ""
    error_category = None

    try:
        # Call the pipeline directly instead of using subprocess
        run_reversal(create_config)
        success = True
    except TrivialPRError as e:
        # Trivial PR - not an error, just skip it
        error_msg = str(e)
        error_category = "trivial"
        success = False
    except MissingIssueError as e:
        # No linked issue - not an error, just skip it
        error_msg = str(e)
        error_category = "no_issue"
        success = False
    except ValidationError as e:
        # Validation failed - not an error, just skip it
        error_msg = str(e)
        error_category = "validation_failed"
        success = False
    except FileExistsError as e:
        # Task already exists - skip it
        error_msg = f"Task already exists: {str(e)}"
        error_category = "already_exists"
        success = False
    except Exception as e:
        # Other errors
        error_msg = f"{type(e).__name__}: {str(e)}"
        if config.verbose:
            console.print(f"[red]{traceback.format_exc()}[/red]")
        # Classify the error
        error_category, _ = _classify_failure(error_msg)
        success = False

    if success:
        if not harbor_dir.exists():
            # Check for trivial PR (should have been caught by TrivialPRError)
            if "trivial" in error_msg.lower():
                failure_reason = "Trivial PR (skipped)"
                failure_category = "trivial"
            else:
                failure_reason = (
                    "Pipeline reported success but Harbor task directory was not created."
                )
                failure_category = "other"
            _cleanup_task(task_id, tasks_root, console)
            console.print(f"[red]✗ PR #{pr.number}: {failure_reason}[/red]")
            return TaskResult(
                repo=config.repo,
                pr_number=pr.number,
                task_id=task_id,
                status="failed",
                message=failure_reason,
                duration_seconds=round(time.time() - start, 2),
                timestamp=_now_utc().isoformat(),
                category=failure_category,
            )

        # Task is already in Harbor format (create now generates directly to Harbor)
        duration = time.time() - start
        gate_ok, gate_msg = _gate_task(task_id, tasks_root)
        if gate_ok:
            _print_success(console, pr, task_id, harbor_dir)

            # Save task reference for future PRs
            try:
                reference_store = TaskReferenceStore()
                reference_store.save(
                    repo=config.repo,
                    task_id=task_id,
                    pr_number=pr.number,
                )
            except Exception as e:
                console.print(f"[yellow]Warning: Could not save task reference: {e}[/yellow]")

            return TaskResult(
                repo=config.repo,
                pr_number=pr.number,
                task_id=task_id,
                status="success",
                message=gate_msg,
                duration_seconds=round(duration, 2),
                timestamp=_now_utc().isoformat(),
                category=None,
            )

        # Gate failed
        failure_reason = gate_msg
        failure_category = "other"
        _cleanup_task(task_id, tasks_root, console)
        console.print(f"[red]✗ PR #{pr.number}: {failure_reason}[/red]")
        return TaskResult(
            repo=config.repo,
            pr_number=pr.number,
            task_id=task_id,
            status="failed",
            message=failure_reason,
            duration_seconds=round(duration, 2),
            timestamp=_now_utc().isoformat(),
            category=failure_category,
        )

    # Pipeline failed
    failure_category, failure_reason = _classify_failure(error_msg)
    _cleanup_task(task_id, tasks_root, console)
    console.print(f"[red]✗ PR #{pr.number}: {failure_reason}[/red]")
    return TaskResult(
        repo=config.repo,
        pr_number=pr.number,
        task_id=task_id,
        status="failed",
        message=failure_reason,
        duration_seconds=round(time.time() - start, 2),
        timestamp=_now_utc().isoformat(),
        category=failure_category,
    )
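The helpers above encode the SWEBench-style naming convention (owner/repo becomes owner__repo, and the task ID appends the PR number) and map raw error text onto the failure categories reported by the farm. A minimal sketch of that behavior, assuming swegen 0.1.0 is installed; _slug, _task_id, and _classify_failure are underscore-prefixed internals, imported here purely for illustration and not part of any documented public API:

# Illustrative only: exercises the private helpers listed in farm_hand.py above.
from swegen.farm.farm_hand import _classify_failure, _slug, _task_id

# SWEBench-style naming: owner/repo -> owner__repo; task ID appends the PR number.
assert _slug("python/pillow") == "python__pillow"
assert _task_id("python/pillow", 9272) == "python__pillow-9272"

# The classifier maps raw error text to a (category, message) pair.
category, message = _classify_failure("GitHub rate limit exceeded for this token")
print(category, message)  # rate_limit GitHub API rate limit exceeded (set GITHUB_TOKEN)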
swegen/farm/fetcher.py
ADDED
@@ -0,0 +1,341 @@
from __future__ import annotations

import os
import time
from collections.abc import Iterator
from datetime import datetime
from pathlib import Path
from typing import Any

import requests
from rich.console import Console

from swegen.create import is_test_file

from .farm_hand import PRCandidate, _slug
from .state import StreamState


def load_skip_list(skip_list_file: Path, repo: str) -> set[int]:
    """Load PR numbers from a skip list file for the given repository.

    The file should contain task IDs like (SWEBench format):
        owner__repo-123
        owner__repo-456

    This function extracts PR numbers matching the current repo.

    Args:
        skip_list_file: Path to the skip list file
        repo: Repository in owner/repo format (e.g., "python/pillow")

    Returns:
        Set of PR numbers to skip
    """
    if not skip_list_file.exists():
        return set()

    # Create expected prefix from repo (e.g., "python/pillow" -> "python__pillow-")
    repo_slug = _slug(repo)
    prefix = f"{repo_slug}-"

    skip_prs: set[int] = set()
    try:
        content = skip_list_file.read_text()
        for line in content.strip().split("\n"):
            line = line.strip()
            if not line or line.startswith("#"):
                continue

            # Check if this task ID matches our repo
            if line.startswith(prefix):
                # Extract PR number from task ID (e.g., "python__pillow-9272" -> 9272)
                pr_part = line[len(prefix) :]
                try:
                    pr_number = int(pr_part)
                    skip_prs.add(pr_number)
                except ValueError:
                    # Ignore malformed entries
                    pass
    except Exception:
        # If file read fails, return empty set
        pass

    return skip_prs


class StreamingPRFetcher:
    """Fetches PRs from GitHub in a streaming fashion.

    Yields PRs one at a time after filtering. Handles pagination,
    rate limiting, and various filters (merged, has tests).

    Attributes:
        repo: Repository in "owner/repo" format
        console: Rich console for output
        state: StreamState for tracking processed PRs
        min_files: Minimum total files changed (early approximate filter)
        require_tests: Whether PRs must have test file changes
        api_delay: Delay between API calls in seconds
    """

    def __init__(
        self,
        repo: str,
        console: Console,
        state: StreamState,
        min_files: int = 3,
        require_tests: bool = True,
        api_delay: float = 0.5,
    ):
        self.repo = repo
        self.console = console
        self.state = state
        self.min_files = min_files
        self.require_tests = require_tests
        self.api_delay = api_delay

        # GitHub API setup
        self.api_base = "https://api.github.com"
        self.github_token = (
            os.getenv("GITHUB_TOKEN") or os.getenv("GH_TOKEN") or os.getenv("REPO_CREATION_TOKEN")
        )
        self.headers = {
            "Accept": "application/vnd.github+json",
            "User-Agent": "swegen-stream-farm",
        }
        if self.github_token:
            self.headers["Authorization"] = f"token {self.github_token}"

    def stream_prs(
        self,
        resume_from_time: str | None = None,
    ) -> Iterator[PRCandidate]:
        """Stream PRs from GitHub API, skipping already processed ones.

        Yields PRs one at a time after validation. Fetches in pages
        but yields immediately, allowing processing to happen concurrently.

        Works backwards in time from present day (or resume point) by PR creation time.

        Args:
            resume_from_time: If specified, only process PRs created before this timestamp.
                Format: ISO 8601 string (e.g., "2024-01-15T23:59:59.999999+00:00")
                This allows resuming from a specific time and continuing backwards.

        Yields:
            PRCandidate instances for each PR that passes filters
        """
        yielded = 0
        page = 1

        # Fetch closed PRs sorted by created time descending
        # This gives us all merged PRs in reverse chronological order (by creation)
        params_base = {
            "state": "closed",
            "sort": "created",
            "direction": "desc",
            "per_page": 100,
        }

        self.console.print(f"[dim]Streaming PRs from {self.repo}...[/dim]")
        if resume_from_time is not None:
            resume_dt = datetime.fromisoformat(resume_from_time.replace("Z", "+00:00"))
            self.console.print(
                f"[yellow]Resuming from {resume_dt.strftime('%Y-%m-%d %H:%M:%S UTC')} "
                f"(only processing PRs created before this time)[/yellow]"
            )
        elif self.state.total_processed > 0:
            self.console.print(
                f"[yellow]Resuming: {self.state.total_processed} PRs already processed "
                f"({self.state.successful} successful, {self.state.failed} failed)[/yellow]"
            )
            if self.state.last_created_at:
                last_dt = datetime.fromisoformat(self.state.last_created_at.replace("Z", "+00:00"))
                self.console.print(
                    f"[yellow]Last processed PR created at: {last_dt.strftime('%Y-%m-%d %H:%M:%S UTC')}[/yellow]"
                )

        skipped_stats = {
            "already_processed": 0,
            "in_skip_list": 0,
            "not_merged": 0,
            "too_few_changes": 0,
            "no_tests": 0,
            "api_error": 0,
            "after_resume_time": 0,
        }

        while True:
            # Fetch next page
            url = f"{self.api_base}/repos/{self.repo}/pulls"
            params: dict[str, Any] = {**params_base, "page": page}

            try:
                resp = requests.get(url, headers=self.headers, params=params, timeout=30)
                resp.raise_for_status()
            except requests.exceptions.RequestException as exc:
                self.console.print(f"[red]API error on page {page}: {exc}[/red]")
                skipped_stats["api_error"] += 1
                break

            prs = resp.json()
            if not prs:
                self.console.print("[dim]No more PRs available[/dim]")
                break

            # Check rate limiting
            remaining = int(resp.headers.get("X-RateLimit-Remaining", 999))
            if remaining < 10:
                reset_time = int(resp.headers.get("X-RateLimit-Reset", 0))
                wait_seconds = max(0, reset_time - time.time())
                self.console.print(
                    f"[yellow]Rate limit low ({remaining}), waiting {wait_seconds:.0f}s...[/yellow]"
                )
                time.sleep(wait_seconds + 1)

            # Process PRs from this page
            for pr_data in prs:
                pr_number = pr_data["number"]

                # Filter: must be merged
                merged_at = pr_data.get("merged_at")
                if not merged_at:
                    skipped_stats["not_merged"] += 1
                    continue

                # Get creation time
                created_at = pr_data.get("created_at")

                # Skip if this PR was created after our resume time
                # (we're working backwards, so we only want PRs created before the resume point)
                if resume_from_time is not None and created_at:
                    pr_created_dt = datetime.fromisoformat(created_at.replace("Z", "+00:00"))
                    resume_dt = datetime.fromisoformat(resume_from_time.replace("Z", "+00:00"))
                    if pr_created_dt >= resume_dt:
                        skipped_stats["after_resume_time"] += 1
                        continue

                # Skip if already processed
                if pr_number in self.state.processed_prs:
                    skipped_stats["already_processed"] += 1
                    continue

                # Skip if in external skip list
                if pr_number in self.state.skip_list_prs:
                    skipped_stats["in_skip_list"] += 1
                    continue

                # Fetch full PR details
                try:
                    pr_url = f"{self.api_base}/repos/{self.repo}/pulls/{pr_number}"
                    pr_resp = requests.get(pr_url, headers=self.headers, timeout=30)
                    pr_resp.raise_for_status()
                    pr_full = pr_resp.json()
                    time.sleep(self.api_delay)
                except requests.exceptions.RequestException:
                    skipped_stats["api_error"] += 1
                    continue

                # Get file change count for metadata
                files_changed = pr_full.get("changed_files", 0)

                # Filter: minimum files changed (early approximate filter to save API calls)
                # Note: This is total files (including tests/docs/CI)
                # The accurate source-only check happens later in the pipeline
                if files_changed < self.min_files:
                    skipped_stats["too_few_changes"] += 1
                    continue

                # Filter: test file changes (if required)
                if self.require_tests:
                    try:
                        has_tests = self._pr_has_test_changes(pr_number)
                        time.sleep(self.api_delay)
                        if not has_tests:
                            skipped_stats["no_tests"] += 1
                            continue
                    except requests.exceptions.RequestException:
                        skipped_stats["api_error"] += 1
                        continue

                # Passed all filters - yield this PR
                candidate = PRCandidate(
                    number=pr_number,
                    title=pr_full.get("title", ""),
                    created_at=pr_full.get("created_at", ""),
                    merged_at=pr_full.get("merged_at", ""),
                    author=pr_full.get("user", {}).get("login", "unknown"),
                    files_changed=files_changed,
                    additions=pr_full.get("additions", 0),
                    deletions=pr_full.get("deletions", 0),
                    url=pr_full.get("html_url", ""),
                )

                self.state.total_fetched += 1
                yielded += 1

                yield candidate

            # Move to next page
            page += 1

            # Break if we got fewer results than expected (last page)
            if len(prs) < 100:
                self.console.print("[dim]Reached last page of PRs[/dim]")
                break

        # Final stats
        self._print_stats(skipped_stats)
        self.console.print(
            f"[green]Stream complete: {yielded} PRs yielded, "
            f"{self.state.total_processed} total processed[/green]"
        )

    def _pr_has_test_changes(self, pr_number: int) -> bool:
        """Check if PR modifies test files.

        Args:
            pr_number: PR number to check

        Returns:
            True if PR has test file changes
        """
        files_url = f"{self.api_base}/repos/{self.repo}/pulls/{pr_number}/files"
        page = 1

        while True:
            params = {"page": page, "per_page": 100}
            resp = requests.get(files_url, headers=self.headers, params=params, timeout=30)
            resp.raise_for_status()

            files = resp.json()
            if not files:
                break

            for file in files:
                filename = file.get("filename", "")
                # Use centralized test file detection (supports all languages)
                if is_test_file(filename):
                    return True

            if len(files) < 100:
                break
            page += 1

        return False

    def _print_stats(self, skipped: dict) -> None:
        """Print skipping statistics.

        Args:
            skipped: Dict of skip reasons to counts
        """
        total_skipped = sum(skipped.values())
        if total_skipped == 0:
            return

        self.console.print("\n[dim]Skipped PRs:[/dim]")
        for reason, count in skipped.items():
            if count > 0:
                self.console.print(f" [dim]• {reason}: {count}[/dim]")