github-pr-context-mcp 0.2.5__py3-none-any.whl

auth/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from auth.gmail_identity import GmailIdentityStore, GmailTokenVerifier, RegistrationResult
+
+ __all__ = ["GmailIdentityStore", "GmailTokenVerifier", "RegistrationResult"]
auth/gmail_identity.py ADDED
@@ -0,0 +1,236 @@
+ from __future__ import annotations
+
+ import hashlib
+ import hmac
+ import json
+ import re
+ import secrets
+ import sqlite3
+ from copy import deepcopy
+ from dataclasses import dataclass
+ from datetime import datetime, timezone
+ from pathlib import Path
+ from typing import Any
+
+ from mcp.server.auth.provider import AccessToken, TokenVerifier
+
+ GMAIL_EMAIL_RE = re.compile(r"^[A-Za-z0-9._%+-]+@gmail\.com$", re.IGNORECASE)
+ ALLOWED_LLM_PROVIDERS = {"cerebras", "openai", "anthropic", "ollama", "groq", "gemini"}
+
+
+ @dataclass(frozen=True)
+ class RegistrationResult:
+     email: str
+     token: str
+     settings: dict[str, str]
+
+
+ class GmailIdentityStore:
+     """Store one registered bearer token per Gmail address backed by thread-safe SQLite."""
+
+     def __init__(self, file_path: str):
+         # Swap existing json suffixes to .db without breaking integrations
+         p = Path(file_path)
+         if p.suffix == '.json':
+             self._path = p.with_suffix('.db')
+         else:
+             self._path = p
+
+         self._path.parent.mkdir(parents=True, exist_ok=True)
+         self._init_db()
+
+     def _get_conn(self) -> sqlite3.Connection:
+         # isolation_level=None enables autocommit for simple operations
+         # check_same_thread=False allows sharing across async workers
+         return sqlite3.connect(str(self._path), isolation_level=None, check_same_thread=False)
+
+     def _init_db(self):
+         with self._get_conn() as conn:
+             conn.execute('''
+                 CREATE TABLE IF NOT EXISTS users (
+                     email TEXT PRIMARY KEY,
+                     token_hash TEXT,
+                     registered_at TEXT,
+                     last_seen TEXT,
+                     revoked INTEGER DEFAULT 0,
+                     settings TEXT
+                 )
+             ''')
+
+     def _utc_now(self) -> str:
+         return datetime.now(timezone.utc).isoformat()
+
+     def _normalize_email(self, email: str) -> str:
+         candidate = email.strip().lower()
+         if not GMAIL_EMAIL_RE.fullmatch(candidate):
+             raise ValueError("Only gmail.com addresses are allowed")
+         return candidate
+
+     def _hash_token(self, token: str) -> str:
+         return hashlib.sha256(token.encode("utf-8")).hexdigest()
+
+     def _normalize_optional(self, value: Any, field_name: str, max_len: int = 512) -> str | None:
+         if value is None:
+             return None
+         candidate = str(value).strip()
+         if not candidate:
+             return None
+         if len(candidate) > max_len:
+             raise ValueError(f"{field_name} is too long")
+         return candidate
+
+     def _sanitize_settings(self, settings: dict[str, Any] | None) -> dict[str, str]:
+         if not settings:
+             return {}
+         if not isinstance(settings, dict):
+             raise ValueError("settings must be an object")
+
+         sanitized: dict[str, str] = {}
+
+         github_token = self._normalize_optional(settings.get("github_token"), "github_token")
+         if github_token:
+             sanitized["github_token"] = github_token
+
+         llm_provider = self._normalize_optional(settings.get("llm_provider"), "llm_provider", max_len=64)
+         if llm_provider:
+             provider = llm_provider.lower()
+             if provider not in ALLOWED_LLM_PROVIDERS:
+                 options = ", ".join(sorted(ALLOWED_LLM_PROVIDERS))
+                 raise ValueError(f"llm_provider must be one of: {options}")
+             sanitized["llm_provider"] = provider
+
+         llm_model = self._normalize_optional(settings.get("llm_model"), "llm_model", max_len=128)
+         if llm_model:
+             sanitized["llm_model"] = llm_model
+
+         llm_api_key = self._normalize_optional(settings.get("llm_api_key"), "llm_api_key")
+         if llm_api_key:
+             sanitized["llm_api_key"] = llm_api_key
+
+         llm_base_url = self._normalize_optional(settings.get("llm_base_url"), "llm_base_url")
+         if llm_base_url:
+             lowered = llm_base_url.lower()
+             if not (lowered.startswith("http://") or lowered.startswith("https://")):
+                 raise ValueError("llm_base_url must start with http:// or https://")
+             sanitized["llm_base_url"] = llm_base_url
+
+         return sanitized
+
+     def _masked_settings(self, settings: dict[str, str]) -> dict[str, str]:
+         masked = deepcopy(settings)
+         for key in ("github_token", "llm_api_key"):
+             if key in masked:
+                 masked[key] = "***"
+         return masked
+
+     def register_email(self, email: str, settings: dict[str, Any] | None = None) -> RegistrationResult:
+         normalized_email = self._normalize_email(email)
+         sanitized_settings = self._sanitize_settings(settings)
+
+         token = secrets.token_urlsafe(32)
+         token_hash = self._hash_token(token)
+         now = self._utc_now()
+         settings_json = json.dumps(sanitized_settings)
+
+         with self._get_conn() as conn:
+             try:
+                 conn.execute(
+                     "INSERT INTO users (email, token_hash, registered_at, last_seen, revoked, settings) VALUES (?, ?, ?, ?, ?, ?)",
+                     (normalized_email, token_hash, now, None, 0, settings_json)
+                 )
+             except sqlite3.IntegrityError:
+                 raise ValueError("This Gmail address is already registered")
+
+         return RegistrationResult(
+             email=normalized_email,
+             token=token,
+             settings=self._masked_settings(sanitized_settings),
+         )
+
+     def get_user_settings(self, email: str) -> dict[str, str]:
+         normalized_email = self._normalize_email(email)
+         with self._get_conn() as conn:
+             row = conn.execute("SELECT revoked, settings FROM users WHERE email = ?", (normalized_email,)).fetchone()
+
+         if not row:
+             return {}
+
+         revoked, settings_json = row
+         if revoked:
+             return {}
+
+         try:
+             return json.loads(settings_json) if settings_json else {}
+         except Exception:
+             return {}
+
+     def update_user_settings(self, email: str, settings: dict[str, Any]) -> dict[str, str]:
+         normalized_email = self._normalize_email(email)
+         sanitized_settings = self._sanitize_settings(settings)
+
+         with self._get_conn() as conn:
+             row = conn.execute("SELECT revoked, settings FROM users WHERE email = ?", (normalized_email,)).fetchone()
+             if not row:
+                 raise ValueError("User not found")
+
+             revoked, existing_settings_json = row
+             if revoked:
+                 raise ValueError("User not found")
+
+             existing = {}
+             if existing_settings_json:
+                 try:
+                     existing = json.loads(existing_settings_json)
+                 except Exception:
+                     pass
+
+             if sanitized_settings:
+                 existing.update(sanitized_settings)
+                 conn.execute("UPDATE users SET settings = ? WHERE email = ?", (json.dumps(existing), normalized_email))
+
+         return self._masked_settings(existing)
+
+     def revoke_email(self, email: str) -> bool:
+         normalized_email = self._normalize_email(email)
+         with self._get_conn() as conn:
+             cursor = conn.execute("UPDATE users SET revoked = 1 WHERE email = ?", (normalized_email,))
+             return cursor.rowcount > 0
+
+     def verify_token(self, token: str) -> AccessToken | None:
+         if not token:
+             return None
+
+         token_hash = self._hash_token(token)
+         now = self._utc_now()
+
+         with self._get_conn() as conn:
+             # Iterate over active rows and compare hashes with hmac.compare_digest so matching stays constant-time.
+             # Note: at larger scale, an indexed `WHERE token_hash = ?` lookup would be faster.
+             cursor = conn.execute("SELECT email, revoked, token_hash FROM users WHERE revoked = 0")
+             matched_email = None
+             for email, revoked, stored_hash in cursor:
+                 if isinstance(stored_hash, str) and hmac.compare_digest(stored_hash, token_hash):
+                     matched_email = email
+                     break
+
+             if not matched_email:
+                 return None
+
+             conn.execute("UPDATE users SET last_seen = ? WHERE email = ?", (now, matched_email))
+
+         scopes = [f"identity:{matched_email}"]
+         return AccessToken(token=token, client_id=matched_email, scopes=scopes)
+
+     def whoami(self, token: str) -> dict[str, Any] | None:
+         token_info = self.verify_token(token)
+         if not token_info:
+             return None
+         return {"email": token_info.client_id, "scopes": token_info.scopes}
+
+
+ class GmailTokenVerifier(TokenVerifier):
+     def __init__(self, store: GmailIdentityStore):
+         self._store = store
+
+     async def verify_token(self, token: str) -> AccessToken | None:
+         return self._store.verify_token(token)
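A minimal usage sketch (not part of the wheel) of the `GmailIdentityStore` API added above; the file path, email address, and settings values are placeholders. `register_email` is the only place the plaintext token is available, and `verify_token` maps a presented token back to the registered email.

```python
# Hypothetical usage of auth/gmail_identity.py; path, email, and keys are placeholders.
from auth.gmail_identity import GmailIdentityStore

store = GmailIdentityStore("identities.json")  # .json is transparently migrated to identities.db

# One registration per Gmail address; the plaintext token is only returned here.
result = store.register_email(
    "reviewer@gmail.com",
    settings={"llm_provider": "anthropic", "llm_api_key": "placeholder-key"},
)
print(result.email, result.settings)  # secret values come back masked as "***"

# Later requests present the bearer token; verify_token returns an AccessToken
# whose client_id is the registered email, or None for unknown/revoked tokens.
access = store.verify_token(result.token)
assert access is not None and access.client_id == "reviewer@gmail.com"

store.revoke_email("reviewer@gmail.com")
assert store.verify_token(result.token) is None
```

`GmailTokenVerifier` simply wraps the same `verify_token` call behind the async `TokenVerifier` interface expected by the MCP auth provider.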
@@ -0,0 +1,34 @@
+ import os
+ import sys
+ import time
+ import threading
+ import requests
+ from app.mcp_app import mcp
+
+ def _run_keep_alive():
+     url = os.getenv("KEEP_ALIVE_URL")
+     if not url:
+         return
+
+     url = f"{url.rstrip('/')}/healthz"
+     print(f"Keep-alive service started. Pinging {url} every 60s.", file=sys.stderr)
+
+     # Wait for server to boot
+     time.sleep(10)
+
+     while True:
+         try:
+             requests.get(url, timeout=5)
+         except Exception:
+             pass
+         time.sleep(60)
+
+ def main() -> None:
+     if os.getenv("KEEP_ALIVE_URL"):
+         threading.Thread(target=_run_keep_alive, daemon=True).start()
+
+     mcp.run(transport="streamable-http")
+
+
+ if __name__ == "__main__":
+     main()
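The entry point above only pings `{KEEP_ALIVE_URL}/healthz`; the route it expects is not part of this diff. Assuming `app.mcp_app` exposes a FastMCP instance (as the import suggests), a matching health route could be registered roughly like this hypothetical sketch:

```python
# Hypothetical sketch — the actual /healthz route is not shown in this diff.
from starlette.requests import Request
from starlette.responses import PlainTextResponse

from app.mcp_app import mcp  # assumed to be a FastMCP instance


@mcp.custom_route("/healthz", methods=["GET"])
async def healthz(_: Request) -> PlainTextResponse:
    # Cheap liveness probe for the 60-second keep-alive loop above.
    return PlainTextResponse("ok")
```

With a route like that in place, setting `KEEP_ALIVE_URL` to the deployment's public URL keeps a free-tier host from idling out.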
@@ -0,0 +1,273 @@
+ import argparse
+ import hashlib
+ import os
+ import platform
+ import sys
+ import threading
+
+
+ def _machine_fingerprint() -> str:
+     """Generates a stable, anonymous machine fingerprint. No PII.
+     Safe across Windows, macOS, Linux, and IDE-spawned processes.
+     """
+     parts = [platform.node(), platform.system(), platform.machine()]
+
+     # os.getlogin() can raise OSError in IDE-spawned/non-TTY environments, so fall back to env vars or the uid
+     for fn in (
+         lambda: os.environ.get("USER") or os.environ.get("USERNAME") or "",
+         lambda: str(os.getuid()) if hasattr(os, "getuid") else "",
+     ):
+         try:
+             val = fn()
+             if val:
+                 parts.append(val)
+                 break
+         except Exception:
+             pass
+
+     raw = "-".join(p for p in parts if p)
+     return hashlib.sha256(raw.encode()).hexdigest()[:32]
+
+
+ def _send_startup_ping(mode: str) -> None:
+     """Fire-and-forget anonymous ping to the Render server for user counting.
+     Telemetry is opt-in: it only runs when TELEMETRY=true and TELEMETRY_ENDPOINT is configured.
+     Never blocks startup — runs in a daemon thread.
+     """
+     telemetry = os.getenv("TELEMETRY", "false").strip().lower()
+     if telemetry not in {"1", "true", "yes", "on"}:
+         return
+
+     endpoint = os.getenv("TELEMETRY_ENDPOINT", "").strip()
+     if not endpoint:
+         return
+
+     try:
+         import requests  # always available — in pyproject.toml deps
+         fingerprint = _machine_fingerprint()
+         requests.post(
+             f"{endpoint.rstrip('/')}/ping",
+             json={"id": fingerprint, "mode": mode},
+             timeout=3,
+         )
+     except Exception:
+         pass  # Never surface telemetry errors to the user
+
+
+ def _check_for_updates() -> None:
+     """Check if a newer version is available on GitHub and notify via stderr.
+     This check is non-blocking and runs in a daemon thread.
+     """
+     try:
+         from importlib.metadata import version
+         import requests
+         import re
+
+         current_version = version("github-pr-context-mcp")
+         # Check raw pyproject.toml on main branch for the latest version
+         # This is faster and more reliable than the GitHub releases API for development versions
+         url = "https://raw.githubusercontent.com/paarths-collab/github-pr-context-mcp/main/pyproject.toml"
+         response = requests.get(url, timeout=3)
+         if response.status_code == 200:
+             match = re.search(r'version\s*=\s*"([^"]+)"', response.text)
+             if match:
+                 latest_version = match.group(1)
+                 if latest_version != current_version:
+                     print(
+                         f"\n[UPDATE AVAILABLE] A new version of github-pr-context-mcp is available: {latest_version} (Current: {current_version})\n"
+                         f"Run: pipx upgrade github-pr-context-mcp\n",
+                         file=sys.stderr
+                     )
+     except Exception:
+         pass  # Never block startup if network or version lookup fails
+
+
+ def _detect_mode() -> str:
+     """Detect how this server was launched.
+
+     Detection logic:
+     - UV_PROJECT_ENVIRONMENT is typically set in uv/uvx-managed environments
+     - PIPX_HOME or PIPX_LOCAL_VENVS are set by pipx
+     - MCP_MODE can be set manually in the IDE env block for explicit override
+     - Falls back to 'local' (git clone / direct python call)
+     """
+     # MCP_MODE explicit override takes precedence
+     explicit = os.getenv("MCP_MODE", "").strip().lower()
+     if explicit in {"uvx", "pipx", "local"}:
+         return explicit
+
+     # uv/uvx sets UV_PROJECT_ENVIRONMENT when running in a managed venv
+     if os.getenv("UV_PROJECT_ENVIRONMENT"):
+         return "uvx"
+
+     # pipx sets PIPX_HOME when installing packages
+     if os.getenv("PIPX_HOME") or os.getenv("PIPX_LOCAL_VENVS"):
+         return "pipx"
+
+     return "local"
+
+
+ def main() -> None:
+     parser = argparse.ArgumentParser(
+         description="GitHub PR Context MCP Server - Provides historical PR review context for code reviews.",
+         formatter_class=argparse.RawDescriptionHelpFormatter,
+         epilog="""
+ Tools Overview:
+   - ensure_repo_ready: Prepares a repository for querying (indexes PRs).
+   - semantic_search_reviews: Search past review comments by meaning.
+   - review_code_with_history: Get a code review based on past team patterns.
+   - generate_code_from_history: Generate new code grounded in team history.
+   - get_team_review_patterns: Identify recurring feedback in a repository.
+   - list_indexed_repos: See which repositories are already available.
+
+ Configuration (Environment Variables):
+   - GITHUB_TOKEN: (Required) Personal Access Token with 'repo' scope.
+   - LLM_PROVIDER: (Optional) cerebras|openai|anthropic|gemini|ollama (default: cerebras).
+   - LLM_API_KEY: (Optional) API key for your chosen provider.
+   - CHROMA_PERSIST_DIR: (Optional) Custom path for persistent storage (default: ~/.github-pr-mcp/chroma_db).
+   - TELEMETRY: (Optional) set to 'true' to enable anonymous usage pings (disabled by default).
+
+ Important Concepts:
+   - Permanent Storage: Indexed data is saved to disk and persists across restarts.
+   - Temporary Storage: Indexed data is kept in memory and lost when the server stops.
+   - Namespace: Use namespaces to isolate indexed data between different teams or users.
+
+ Example Usage (Claude Desktop Config):
+   {
+     "mcpServers": {
+       "github-pr-context": {
+         "command": "github-pr-context-mcp",
+         "env": {
+           "GITHUB_TOKEN": "your_github_token_here",
+           "LLM_PROVIDER": "anthropic",
+           "LLM_API_KEY": "your_anthropic_key_here"
+         }
+       }
+     }
+   }
+
+ Path & Installation:
+   The executable is typically installed to your user's local bin directory.
+   - Windows: %USERPROFILE%\\.local\\bin\\github-pr-context-mcp.exe
+   - macOS/Linux: ~/.local/bin/github-pr-context-mcp
+
+   If you are configuring Claude Desktop or another IDE, ensure you use the
+   ABSOLUTE PATH to the executable to avoid "command not found" errors.
+
+ Tool Selection & Strategy (When to use what):
+   - Indexing: Always start with `ensure_repo_ready`. Use it again if the repo has changed significantly.
+   - Research: Use `semantic_search_reviews` when you have a specific technical question (e.g., "How do we handle auth?").
+   - Writing Code: Use `generate_code_from_history` for new features or refactors to stay consistent with team patterns.
+   - Code Review: Use `review_code_with_history` before submitting a PR to catch issues early.
+   - Analysis: Use `get_team_review_patterns` to understand the team's "soul" and recurring feedback themes.
+
+ Tool Selection Strategy (JSON for AI Agents):
+   Load the block below at the START of every session. Match the user task to a trigger -> call that tool.
+   Full reference: https://github.com/paarths-collab/github-pr-context-mcp/blob/main/docs/tools_strategy.md
+
+   ```json
+   {
+     "tools": {
+       "ensure_repo_ready": { "call_when": "session start / new repo / repo changed" },
+       "set_active_repo": { "call_when": "user says switch/use a different repo" },
+       "list_indexed_repos": { "call_when": "user asks what repos are indexed" },
+       "delete_repo_index": { "call_when": "user wants to remove/reset index" },
+       "semantic_search_reviews": { "call_when": "user asks technical question / wants past examples" },
+       "review_code_with_history": { "call_when": "user pastes code and asks for review" },
+       "generate_code_from_history": { "call_when": "user asks to write/implement/generate code" },
+       "get_team_review_patterns": { "call_when": "user wants team norms / onboarding / standards" },
+       "get_index_stats": { "call_when": "verify index is complete / how many docs" },
+       "update_settings": { "call_when": "change token or LLM key (hosted mode only)" },
+       "get_usage_stats": { "call_when": "admin asks for adoption metrics" },
+       "generate_repo_rules": { "call_when": "user wants .cursorrules / CLAUDE.md / copilot-instructions.md from repo history" }
+     },
+     "session_flow": [
+       "1. ensure_repo_ready",
+       "2. get_team_review_patterns (optional)",
+       "2b. generate_repo_rules (optional — writes rules file once for future sessions)",
+       "3. semantic_search_reviews | generate_code_from_history | review_code_with_history",
+       "4. get_index_stats (optional)"
+     ]
+   }
+   ```
+
+ Troubleshooting:
+   - "command not found": Use the absolute path. Run `github-pr-context-mcp config` to get it.
+   - "invalid character": Fixed! This server now uses stderr for logs.
+   - Rate limits: Ensure GITHUB_TOKEN is valid and has 'repo' scope.
+   - Windows [WinError 32] (PermissionError):
+     This happens when trying to 'pipx upgrade' while the server is running.
+     1. Close MCP clients (Cursor, Claude Desktop).
+     2. Run: taskkill /F /IM github-pr-context-mcp.exe
+     3. Retry: pipx upgrade github-pr-context-mcp
+
+ Troubleshooting (JSON for AI Agents):
+   ```json
+   {
+     "errors": {
+       "WinError 32": {
+         "cause": "Process lock. Binary is currently running/locked by Windows.",
+         "remediation": [
+           "taskkill /F /IM github-pr-context-mcp.exe",
+           "Close IDEs (Cursor/Claude Desktop)",
+           "Retry pipx upgrade"
+         ]
+       }
+     }
+   }
+   ```
+ """
+     )
+     parser.add_argument("command", nargs="?", choices=["config"], help="Run a helper command (e.g. 'config' to get your IDE snippet)")
+
+     args = parser.parse_args()
+
+     if args.command == "config":
+         import json
+         import sys
+
+         # Detect absolute path of the current binary/script
+         abs_path = os.path.abspath(sys.argv[0])
+         command_val = abs_path
+
+         # If running from source (.py file), prefix with python
+         if abs_path.endswith(".py"):
+             python_exe = sys.executable
+             command_val = f"{python_exe} {abs_path}"
+
+         detected_os = platform.system()
+
+         config = {
+             "mcpServers": {
+                 "github-pr-context": {
+                     "command": command_val,
+                     "env": {
+                         "GITHUB_TOKEN": "YOUR_GITHUB_TOKEN",
+                         "LLM_PROVIDER": "cerebras",
+                         "LLM_API_KEY": "YOUR_LLM_API_KEY"
+                     }
+                 }
+             }
+         }
+         print(f"\n=== {detected_os.upper()} CONFIG SNIPPET ===", file=sys.stderr)
+         print(f"Detected binary at: {command_val}", file=sys.stderr)
+         print("Copy the JSON below into your mcpConfig.json file:", file=sys.stderr)
+         print(json.dumps(config, indent=2))
+         print("\nNOTE: Ensure you replace YOUR_GITHUB_TOKEN and YOUR_LLM_API_KEY.\n", file=sys.stderr)
+         sys.exit(0)
+
+     # Import here so that env vars from IDE env block are set before mcp_app loads
+     from app.mcp_app import mcp
+
+     mode = _detect_mode()
+     # Send ping in background — startup is never delayed by telemetry
+     threading.Thread(target=_send_startup_ping, args=(mode,), daemon=True).start()
+     # Check for updates in background
+     threading.Thread(target=_check_for_updates, daemon=True).start()
+
+     # Run the server
+     mcp.run(transport="stdio")
+
+
+ if __name__ == "__main__":
+     main()
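For context on the telemetry protocol used by `_send_startup_ping`, here is a hypothetical receiving side (not part of this package): it accepts the `{"id": ..., "mode": ...}` payload on `POST /ping` and counts distinct fingerprints. The host, port, and in-memory storage are placeholder choices.

```python
# Hypothetical /ping receiver for the startup telemetry above (not shipped in the wheel).
import json
from http.server import BaseHTTPRequestHandler, HTTPServer

seen_ids: set[str] = set()  # in-memory only; a real deployment would persist this


class PingHandler(BaseHTTPRequestHandler):
    def do_POST(self):
        if self.path != "/ping":
            self.send_response(404)
            self.end_headers()
            return
        length = int(self.headers.get("Content-Length", 0))
        payload = json.loads(self.rfile.read(length) or b"{}")
        # Payload shape sent by _send_startup_ping: {"id": <sha256 prefix>, "mode": "uvx"|"pipx"|"local"}
        seen_ids.add(str(payload.get("id", "unknown")))
        self.send_response(204)
        self.end_headers()


if __name__ == "__main__":
    HTTPServer(("0.0.0.0", 8080), PingHandler).serve_forever()
```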
fetcher/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from fetcher.client import fetch_prs
+
+ __all__ = ["fetch_prs"]
fetcher/client.py ADDED
@@ -0,0 +1,131 @@
+ # HTTP client for the GitHub GraphQL API.
+ # Handles: auth, pagination, rate limit detection, and user-friendly errors.
+
+ import requests
+ import os
+ import sys
+ from dotenv import load_dotenv
+ from fetcher.queries import PR_QUERY
+ from fetcher.transform import flatten_prs
+
+ load_dotenv()
+
+ GITHUB_GRAPHQL_URL = "https://api.github.com/graphql"
+ MAX_PAGES = 10  # Hard cap to prevent accidental runaway fetches
+
+
+ def _headers(github_token: str | None = None) -> dict:
+     token = (github_token or "").strip() or os.getenv("GITHUB_TOKEN")
+     if not token:
+         raise EnvironmentError(
+             "GITHUB_TOKEN is not set. Add it to your .env file.\n"
+             "Get one at: https://github.com/settings/tokens (repo scope required)"
+         )
+     return {
+         "Authorization": f"Bearer {token}",
+         "Content-Type": "application/json",
+     }
+
+
+ def _check_rate_limit(response: requests.Response) -> None:
+     """Warn if approaching GitHub's GraphQL rate limit."""
+     remaining = response.headers.get("X-RateLimit-Remaining")
+     if remaining is not None and int(remaining) < 100:
+         reset_at = response.headers.get("X-RateLimit-Reset", "unknown")
+         print(
+             f"⚠️ GitHub rate limit low: {remaining} points remaining. "
+             f"Resets at unix timestamp {reset_at}.",
+             file=sys.stderr
+         )
+
+
+ def run_query(query: str, variables: dict, github_token: str | None = None) -> dict:
+     """Execute a raw GraphQL query against the GitHub API."""
+     try:
+         resp = requests.post(
+             GITHUB_GRAPHQL_URL,
+             json={"query": query, "variables": variables},
+             headers=_headers(github_token=github_token),
+             timeout=30,
+         )
+     except requests.exceptions.ConnectionError:
+         raise ConnectionError(
+             "Could not reach GitHub API. Check your internet connection."
+         )
+     except requests.exceptions.Timeout:
+         raise TimeoutError(
+             "GitHub API timed out after 30s. Try again or reduce --pages."
+         )
+
+     _check_rate_limit(resp)
+
+     # Surface actionable errors instead of raw HTTP codes
+     if resp.status_code == 401:
+         raise PermissionError(
+             "GitHub returned 401 Unauthorized. Your GITHUB_TOKEN is invalid or expired.\n"
+             "Generate a new one at: https://github.com/settings/tokens"
+         )
+     if resp.status_code == 403:
+         raise PermissionError(
+             "GitHub returned 403 Forbidden. Your token may lack 'repo' scope, "
+             "or you've exceeded the rate limit."
+         )
+
+     resp.raise_for_status()
+     data = resp.json()
+
+     if "errors" in data:
+         errors = data["errors"]
+         # Repo not found is the most common user error — give a specific message
+         if any(e.get("type") == "NOT_FOUND" for e in errors):
+             owner = variables.get("owner", "?")
+             repo = variables.get("repo", "?")
+             raise ValueError(
+                 f"Repository '{owner}/{repo}' not found or not accessible with your token. "
+                 "Check the owner/repo spelling and that your token has 'repo' scope."
+             )
+         raise ValueError(f"GitHub GraphQL errors: {errors}")
+
+     return data
+
+
+ def fetch_prs(owner: str, repo: str, pages: int = 2, github_token: str | None = None) -> list[dict]:
+     """
+     Fetch up to pages*30 merged/closed PRs with all review context.
+
+     Args:
+         owner: GitHub username or org, e.g. 'psf'
+         repo: Repository name, e.g. 'black'
+         pages: Number of pages to fetch (30 PRs per page).
+             Capped at MAX_PAGES (10) to prevent runaway fetches.
+
+     Returns:
+         List of flattened PR dicts with review comments.
+     """
+     if pages < 1:
+         raise ValueError("pages must be at least 1.")
+     if pages > MAX_PAGES:
+         print(f"⚠️ pages capped at {MAX_PAGES} (requested {pages}).", file=sys.stderr)
+         pages = MAX_PAGES
+
+     all_prs = []
+     cursor = None
+
+     for page_num in range(1, pages + 1):
+         variables = {"owner": owner, "repo": repo}
+         if cursor:
+             variables["cursor"] = cursor
+
+         print(f" Fetching page {page_num}/{pages} for {owner}/{repo}...", file=sys.stderr)
+         data = run_query(PR_QUERY, variables, github_token=github_token)
+         pr_data = data["data"]["repository"]["pullRequests"]
+
+         batch = flatten_prs(pr_data["nodes"])
+         all_prs.extend(batch)
+
+         page_info = pr_data["pageInfo"]
+         if not page_info["hasPreviousPage"]:
+             break
+         cursor = page_info["startCursor"]
+
+     return all_prs
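A minimal usage sketch of `fetch_prs` (not part of the package); `psf/black` is the example repository from the docstring, and a valid `GITHUB_TOKEN` plus network access are assumed.

```python
# Hypothetical caller of fetcher/client.py; requires a real GITHUB_TOKEN at runtime.
import os

from fetcher.client import fetch_prs

try:
    prs = fetch_prs("psf", "black", pages=1, github_token=os.getenv("GITHUB_TOKEN"))
    print(f"Fetched {len(prs)} PRs with review context")
except (OSError, ValueError) as exc:
    # run_query maps common failures (bad token, missing repo, network issues,
    # timeouts) onto PermissionError/ConnectionError/TimeoutError/ValueError.
    print(f"Fetch failed: {exc}")
```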