github-pr-context-mcp 0.2.5__py3-none-any.whl

auth/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from auth.gmail_identity import GmailIdentityStore, GmailTokenVerifier, RegistrationResult
+
+ __all__ = ["GmailIdentityStore", "GmailTokenVerifier", "RegistrationResult"]
auth/gmail_identity.py ADDED
@@ -0,0 +1,236 @@
+ from __future__ import annotations
+
+ import hashlib
+ import hmac
+ import json
+ import re
+ import secrets
+ import sqlite3
+ from copy import deepcopy
+ from dataclasses import dataclass
+ from datetime import datetime, timezone
+ from pathlib import Path
+ from typing import Any
+
+ from mcp.server.auth.provider import AccessToken, TokenVerifier
+
+ GMAIL_EMAIL_RE = re.compile(r"^[A-Za-z0-9._%+-]+@gmail\.com$", re.IGNORECASE)
+ ALLOWED_LLM_PROVIDERS = {"cerebras", "openai", "anthropic", "ollama", "groq", "gemini"}
+
+
+ @dataclass(frozen=True)
+ class RegistrationResult:
+     email: str
+     token: str
+     settings: dict[str, str]
+
+
+ class GmailIdentityStore:
+     """Store one registered bearer token per Gmail address backed by thread-safe SQLite."""
+
+     def __init__(self, file_path: str):
+         # Swap existing json suffixes to .db without breaking integrations
+         p = Path(file_path)
+         if p.suffix == '.json':
+             self._path = p.with_suffix('.db')
+         else:
+             self._path = p
+
+         self._path.parent.mkdir(parents=True, exist_ok=True)
+         self._init_db()
+
+     def _get_conn(self) -> sqlite3.Connection:
+         # isolation_level=None enables autocommit for simple operations
+         # check_same_thread=False allows sharing across async workers
+         return sqlite3.connect(str(self._path), isolation_level=None, check_same_thread=False)
+
+     def _init_db(self):
+         with self._get_conn() as conn:
+             conn.execute('''
+                 CREATE TABLE IF NOT EXISTS users (
+                     email TEXT PRIMARY KEY,
+                     token_hash TEXT,
+                     registered_at TEXT,
+                     last_seen TEXT,
+                     revoked INTEGER DEFAULT 0,
+                     settings TEXT
+                 )
+             ''')
+
+     def _utc_now(self) -> str:
+         return datetime.now(timezone.utc).isoformat()
+
+     def _normalize_email(self, email: str) -> str:
+         candidate = email.strip().lower()
+         if not GMAIL_EMAIL_RE.fullmatch(candidate):
+             raise ValueError("Only gmail.com addresses are allowed")
+         return candidate
+
+     def _hash_token(self, token: str) -> str:
+         return hashlib.sha256(token.encode("utf-8")).hexdigest()
+
+     def _normalize_optional(self, value: Any, field_name: str, max_len: int = 512) -> str | None:
+         if value is None:
+             return None
+         candidate = str(value).strip()
+         if not candidate:
+             return None
+         if len(candidate) > max_len:
+             raise ValueError(f"{field_name} is too long")
+         return candidate
+
+     def _sanitize_settings(self, settings: dict[str, Any] | None) -> dict[str, str]:
+         if not settings:
+             return {}
+         if not isinstance(settings, dict):
+             raise ValueError("settings must be an object")
+
+         sanitized: dict[str, str] = {}
+
+         github_token = self._normalize_optional(settings.get("github_token"), "github_token")
+         if github_token:
+             sanitized["github_token"] = github_token
+
+         llm_provider = self._normalize_optional(settings.get("llm_provider"), "llm_provider", max_len=64)
+         if llm_provider:
+             provider = llm_provider.lower()
+             if provider not in ALLOWED_LLM_PROVIDERS:
+                 options = ", ".join(sorted(ALLOWED_LLM_PROVIDERS))
+                 raise ValueError(f"llm_provider must be one of: {options}")
+             sanitized["llm_provider"] = provider
+
+         llm_model = self._normalize_optional(settings.get("llm_model"), "llm_model", max_len=128)
+         if llm_model:
+             sanitized["llm_model"] = llm_model
+
+         llm_api_key = self._normalize_optional(settings.get("llm_api_key"), "llm_api_key")
+         if llm_api_key:
+             sanitized["llm_api_key"] = llm_api_key
+
+         llm_base_url = self._normalize_optional(settings.get("llm_base_url"), "llm_base_url")
+         if llm_base_url:
+             lowered = llm_base_url.lower()
+             if not (lowered.startswith("http://") or lowered.startswith("https://")):
+                 raise ValueError("llm_base_url must start with http:// or https://")
+             sanitized["llm_base_url"] = llm_base_url
+
+         return sanitized
+
+     def _masked_settings(self, settings: dict[str, str]) -> dict[str, str]:
+         masked = deepcopy(settings)
+         for key in ("github_token", "llm_api_key"):
+             if key in masked:
+                 masked[key] = "***"
+         return masked
+
+     def register_email(self, email: str, settings: dict[str, Any] | None = None) -> RegistrationResult:
+         normalized_email = self._normalize_email(email)
+         sanitized_settings = self._sanitize_settings(settings)
+
+         token = secrets.token_urlsafe(32)
+         token_hash = self._hash_token(token)
+         now = self._utc_now()
+         settings_json = json.dumps(sanitized_settings)
+
+         with self._get_conn() as conn:
+             try:
+                 conn.execute(
+                     "INSERT INTO users (email, token_hash, registered_at, last_seen, revoked, settings) VALUES (?, ?, ?, ?, ?, ?)",
+                     (normalized_email, token_hash, now, None, 0, settings_json)
+                 )
+             except sqlite3.IntegrityError:
+                 raise ValueError("This Gmail address is already registered")
+
+         return RegistrationResult(
+             email=normalized_email,
+             token=token,
+             settings=self._masked_settings(sanitized_settings),
+         )
+
+     def get_user_settings(self, email: str) -> dict[str, str]:
+         normalized_email = self._normalize_email(email)
+         with self._get_conn() as conn:
+             row = conn.execute("SELECT revoked, settings FROM users WHERE email = ?", (normalized_email,)).fetchone()
+
+         if not row:
+             return {}
+
+         revoked, settings_json = row
+         if revoked:
+             return {}
+
+         try:
+             return json.loads(settings_json) if settings_json else {}
+         except Exception:
+             return {}
+
+     def update_user_settings(self, email: str, settings: dict[str, Any]) -> dict[str, str]:
+         normalized_email = self._normalize_email(email)
+         sanitized_settings = self._sanitize_settings(settings)
+
+         with self._get_conn() as conn:
+             row = conn.execute("SELECT revoked, settings FROM users WHERE email = ?", (normalized_email,)).fetchone()
+             if not row:
+                 raise ValueError("User not found")
+
+             revoked, existing_settings_json = row
+             if revoked:
+                 raise ValueError("User not found")
+
+             existing = {}
+             if existing_settings_json:
+                 try:
+                     existing = json.loads(existing_settings_json)
+                 except Exception:
+                     pass
+
+             if sanitized_settings:
+                 existing.update(sanitized_settings)
+                 conn.execute("UPDATE users SET settings = ? WHERE email = ?", (json.dumps(existing), normalized_email))
+
+         return self._masked_settings(existing)
+
+     def revoke_email(self, email: str) -> bool:
+         normalized_email = self._normalize_email(email)
+         with self._get_conn() as conn:
+             cursor = conn.execute("UPDATE users SET revoked = 1 WHERE email = ?", (normalized_email,))
+             return cursor.rowcount > 0
+
+     def verify_token(self, token: str) -> AccessToken | None:
+         if not token:
+             return None
+
+         token_hash = self._hash_token(token)
+         now = self._utc_now()
+
+         with self._get_conn() as conn:
+             # Iterate over active rows and compare hashes with hmac.compare_digest so matching stays constant-time.
+             # Note: at larger scale, an indexed `WHERE token_hash = ?` lookup would be faster.
+             cursor = conn.execute("SELECT email, revoked, token_hash FROM users WHERE revoked = 0")
+             matched_email = None
+             for email, revoked, stored_hash in cursor:
+                 if isinstance(stored_hash, str) and hmac.compare_digest(stored_hash, token_hash):
+                     matched_email = email
+                     break
+
+             if not matched_email:
+                 return None
+
+             conn.execute("UPDATE users SET last_seen = ? WHERE email = ?", (now, matched_email))
+
+         scopes = [f"identity:{matched_email}"]
+         return AccessToken(token=token, client_id=matched_email, scopes=scopes)
+
+     def whoami(self, token: str) -> dict[str, Any] | None:
+         token_info = self.verify_token(token)
+         if not token_info:
+             return None
+         return {"email": token_info.client_id, "scopes": token_info.scopes}
+
+
+ class GmailTokenVerifier(TokenVerifier):
+     def __init__(self, store: GmailIdentityStore):
+         self._store = store
+
+     async def verify_token(self, token: str) -> AccessToken | None:
+         return self._store.verify_token(token)
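A minimal usage sketch (not part of the wheel) of the `GmailIdentityStore` API added above; the file path, email address, and settings values are placeholders. `register_email` is the only place the plaintext token is available, and `verify_token` maps a presented token back to the registered email.

```python
# Hypothetical usage of auth/gmail_identity.py; path, email, and keys are placeholders.
from auth.gmail_identity import GmailIdentityStore

store = GmailIdentityStore("identities.json")  # .json is transparently migrated to identities.db

# One registration per Gmail address; the plaintext token is only returned here.
result = store.register_email(
    "reviewer@gmail.com",
    settings={"llm_provider": "anthropic", "llm_api_key": "placeholder-key"},
)
print(result.email, result.settings)  # secret values come back masked as "***"

# Later requests present the bearer token; verify_token returns an AccessToken
# whose client_id is the registered email, or None for unknown/revoked tokens.
access = store.verify_token(result.token)
assert access is not None and access.client_id == "reviewer@gmail.com"

store.revoke_email("reviewer@gmail.com")
assert store.verify_token(result.token) is None
```

`GmailTokenVerifier` simply wraps the same `verify_token` call behind the async `TokenVerifier` interface expected by the MCP auth provider.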
@@ -0,0 +1,34 @@
+ import os
+ import sys
+ import time
+ import threading
+ import requests
+ from app.mcp_app import mcp
+
+ def _run_keep_alive():
+     url = os.getenv("KEEP_ALIVE_URL")
+     if not url:
+         return
+
+     url = f"{url.rstrip('/')}/healthz"
+     print(f"Keep-alive service started. Pinging {url} every 60s.", file=sys.stderr)
+
+     # Wait for server to boot
+     time.sleep(10)
+
+     while True:
+         try:
+             requests.get(url, timeout=5)
+         except Exception:
+             pass
+         time.sleep(60)
+
+ def main() -> None:
+     if os.getenv("KEEP_ALIVE_URL"):
+         threading.Thread(target=_run_keep_alive, daemon=True).start()
+
+     mcp.run(transport="streamable-http")
+
+
+ if __name__ == "__main__":
+     main()
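The entry point above only pings `{KEEP_ALIVE_URL}/healthz`; the route it expects is not part of this diff. Assuming `app.mcp_app` exposes a FastMCP instance (as the import suggests), a matching health route could be registered roughly like this hypothetical sketch:

```python
# Hypothetical sketch — the actual /healthz route is not shown in this diff.
from starlette.requests import Request
from starlette.responses import PlainTextResponse

from app.mcp_app import mcp  # assumed to be a FastMCP instance


@mcp.custom_route("/healthz", methods=["GET"])
async def healthz(_: Request) -> PlainTextResponse:
    # Cheap liveness probe for the 60-second keep-alive loop above.
    return PlainTextResponse("ok")
```

With a route like that in place, setting `KEEP_ALIVE_URL` to the deployment's public URL keeps a free-tier host from idling out.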
@@ -0,0 +1,273 @@
+ import argparse
+ import hashlib
+ import os
+ import platform
+ import sys
+ import threading
+
+
+ def _machine_fingerprint() -> str:
+     """Generates a stable, anonymous machine fingerprint. No PII.
+     Safe across Windows, macOS, Linux, and IDE-spawned processes.
+     """
+     parts = [platform.node(), platform.system(), platform.machine()]
+
+     # os.getlogin() can raise OSError in IDE-spawned/non-TTY environments, so fall back to env vars or the uid
+     for fn in (
+         lambda: os.environ.get("USER") or os.environ.get("USERNAME") or "",
+         lambda: str(os.getuid()) if hasattr(os, "getuid") else "",
+     ):
+         try:
+             val = fn()
+             if val:
+                 parts.append(val)
+                 break
+         except Exception:
+             pass
+
+     raw = "-".join(p for p in parts if p)
+     return hashlib.sha256(raw.encode()).hexdigest()[:32]
+
+
+ def _send_startup_ping(mode: str) -> None:
+     """Fire-and-forget anonymous ping to the Render server for user counting.
+     Telemetry is opt-in: it only runs when TELEMETRY=true and TELEMETRY_ENDPOINT is configured.
+     Never blocks startup — runs in a daemon thread.
+     """
+     telemetry = os.getenv("TELEMETRY", "false").strip().lower()
+     if telemetry not in {"1", "true", "yes", "on"}:
+         return
+
+     endpoint = os.getenv("TELEMETRY_ENDPOINT", "").strip()
+     if not endpoint:
+         return
+
+     try:
+         import requests  # always available — in pyproject.toml deps
+         fingerprint = _machine_fingerprint()
+         requests.post(
+             f"{endpoint.rstrip('/')}/ping",
+             json={"id": fingerprint, "mode": mode},
+             timeout=3,
+         )
+     except Exception:
+         pass  # Never surface telemetry errors to the user
+
+
+ def _check_for_updates() -> None:
+     """Check if a newer version is available on GitHub and notify via stderr.
+     This check is non-blocking and runs in a daemon thread.
+     """
+     try:
+         from importlib.metadata import version
+         import requests
+         import re
+
+         current_version = version("github-pr-context-mcp")
+         # Check raw pyproject.toml on main branch for the latest version
+         # This is faster and more reliable than the GitHub releases API for development versions
+         url = "https://raw.githubusercontent.com/paarths-collab/github-pr-context-mcp/main/pyproject.toml"
+         response = requests.get(url, timeout=3)
+         if response.status_code == 200:
+             match = re.search(r'version\s*=\s*"([^"]+)"', response.text)
+             if match:
+                 latest_version = match.group(1)
+                 if latest_version != current_version:
+                     print(
+                         f"\n[UPDATE AVAILABLE] A new version of github-pr-context-mcp is available: {latest_version} (Current: {current_version})\n"
+                         f"Run: pipx upgrade github-pr-context-mcp\n",
+                         file=sys.stderr
+                     )
+     except Exception:
+         pass  # Never block startup if network or version lookup fails
+
+
+ def _detect_mode() -> str:
+     """Detect how this server was launched.
+
+     Detection logic:
+     - UV_PROJECT_ENVIRONMENT is typically set in uv/uvx-managed environments
+     - PIPX_HOME or PIPX_LOCAL_VENVS are set by pipx
+     - MCP_MODE can be set manually in the IDE env block for explicit override
+     - Falls back to 'local' (git clone / direct python call)
+     """
+     # MCP_MODE explicit override takes precedence
+     explicit = os.getenv("MCP_MODE", "").strip().lower()
+     if explicit in {"uvx", "pipx", "local"}:
+         return explicit
+
+     # uv/uvx sets UV_PROJECT_ENVIRONMENT when running in a managed venv
+     if os.getenv("UV_PROJECT_ENVIRONMENT"):
+         return "uvx"
+
+     # pipx sets PIPX_HOME when installing packages
+     if os.getenv("PIPX_HOME") or os.getenv("PIPX_LOCAL_VENVS"):
+         return "pipx"
+
+     return "local"
+
+
+ def main() -> None:
+     parser = argparse.ArgumentParser(
+         description="GitHub PR Context MCP Server - Provides historical PR review context for code reviews.",
+         formatter_class=argparse.RawDescriptionHelpFormatter,
+         epilog="""
+ Tools Overview:
+   - ensure_repo_ready: Prepares a repository for querying (indexes PRs).
+   - semantic_search_reviews: Search past review comments by meaning.
+   - review_code_with_history: Get a code review based on past team patterns.
+   - generate_code_from_history: Generate new code grounded in team history.
+   - get_team_review_patterns: Identify recurring feedback in a repository.
+   - list_indexed_repos: See which repositories are already available.
+
+ Configuration (Environment Variables):
+   - GITHUB_TOKEN: (Required) Personal Access Token with 'repo' scope.
+   - LLM_PROVIDER: (Optional) cerebras|openai|anthropic|gemini|ollama (default: cerebras).
+   - LLM_API_KEY: (Optional) API key for your chosen provider.
+   - CHROMA_PERSIST_DIR: (Optional) Custom path for persistent storage (default: ~/.github-pr-mcp/chroma_db).
+   - TELEMETRY: (Optional) set to 'true' to enable anonymous usage pings (disabled by default).
+
+ Important Concepts:
+   - Permanent Storage: Indexed data is saved to disk and persists across restarts.
+   - Temporary Storage: Indexed data is kept in memory and lost when the server stops.
+   - Namespace: Use namespaces to isolate indexed data between different teams or users.
+
+ Example Usage (Claude Desktop Config):
+   {
+     "mcpServers": {
+       "github-pr-context": {
+         "command": "github-pr-context-mcp",
+         "env": {
+           "GITHUB_TOKEN": "your_github_token_here",
+           "LLM_PROVIDER": "anthropic",
+           "LLM_API_KEY": "your_anthropic_key_here"
+         }
+       }
+     }
+   }
+
+ Path & Installation:
+   The executable is typically installed to your user's local bin directory.
+   - Windows: %USERPROFILE%\\.local\\bin\\github-pr-context-mcp.exe
+   - macOS/Linux: ~/.local/bin/github-pr-context-mcp
+
+   If you are configuring Claude Desktop or another IDE, ensure you use the
+   ABSOLUTE PATH to the executable to avoid "command not found" errors.
+
+ Tool Selection & Strategy (When to use what):
+   - Indexing: Always start with `ensure_repo_ready`. Use it again if the repo has changed significantly.
+   - Research: Use `semantic_search_reviews` when you have a specific technical question (e.g., "How do we handle auth?").
+   - Writing Code: Use `generate_code_from_history` for new features or refactors to stay consistent with team patterns.
+   - Code Review: Use `review_code_with_history` before submitting a PR to catch issues early.
+   - Analysis: Use `get_team_review_patterns` to understand the team's "soul" and recurring feedback themes.
+
+ Tool Selection Strategy (JSON for AI Agents):
+   Load the block below at the START of every session. Match the user task to a trigger -> call that tool.
+   Full reference: https://github.com/paarths-collab/github-pr-context-mcp/blob/main/docs/tools_strategy.md
+
+   ```json
+   {
+     "tools": {
+       "ensure_repo_ready": { "call_when": "session start / new repo / repo changed" },
+       "set_active_repo": { "call_when": "user says switch/use a different repo" },
+       "list_indexed_repos": { "call_when": "user asks what repos are indexed" },
+       "delete_repo_index": { "call_when": "user wants to remove/reset index" },
+       "semantic_search_reviews": { "call_when": "user asks technical question / wants past examples" },
+       "review_code_with_history": { "call_when": "user pastes code and asks for review" },
+       "generate_code_from_history": { "call_when": "user asks to write/implement/generate code" },
+       "get_team_review_patterns": { "call_when": "user wants team norms / onboarding / standards" },
+       "get_index_stats": { "call_when": "verify index is complete / how many docs" },
+       "update_settings": { "call_when": "change token or LLM key (hosted mode only)" },
+       "get_usage_stats": { "call_when": "admin asks for adoption metrics" },
+       "generate_repo_rules": { "call_when": "user wants .cursorrules / CLAUDE.md / copilot-instructions.md from repo history" }
+     },
+     "session_flow": [
+       "1. ensure_repo_ready",
+       "2. get_team_review_patterns (optional)",
+       "2b. generate_repo_rules (optional — writes rules file once for future sessions)",
+       "3. semantic_search_reviews | generate_code_from_history | review_code_with_history",
+       "4. get_index_stats (optional)"
+     ]
+   }
+   ```
+
+ Troubleshooting:
+   - "command not found": Use the absolute path. Run `github-pr-context-mcp config` to get it.
+   - "invalid character": Fixed! This server now uses stderr for logs.
+   - Rate limits: Ensure GITHUB_TOKEN is valid and has 'repo' scope.
+   - Windows [WinError 32] (PermissionError):
+     This happens when trying to 'pipx upgrade' while the server is running.
+     1. Close MCP clients (Cursor, Claude Desktop).
+     2. Run: taskkill /F /IM github-pr-context-mcp.exe
+     3. Retry: pipx upgrade github-pr-context-mcp
+
+ Troubleshooting (JSON for AI Agents):
+   ```json
+   {
+     "errors": {
+       "WinError 32": {
+         "cause": "Process lock. Binary is currently running/locked by Windows.",
+         "remediation": [
+           "taskkill /F /IM github-pr-context-mcp.exe",
+           "Close IDEs (Cursor/Claude Desktop)",
+           "Retry pipx upgrade"
+         ]
+       }
+     }
+   }
+   ```
+ """
+     )
+     parser.add_argument("command", nargs="?", choices=["config"], help="Run a helper command (e.g. 'config' to get your IDE snippet)")
+
+     args = parser.parse_args()
+
+     if args.command == "config":
+         import json
+         import sys
+
+         # Detect absolute path of the current binary/script
+         abs_path = os.path.abspath(sys.argv[0])
+         command_val = abs_path
+
+         # If running from source (.py file), prefix with python
+         if abs_path.endswith(".py"):
+             python_exe = sys.executable
+             command_val = f"{python_exe} {abs_path}"
+
+         detected_os = platform.system()
+
+         config = {
+             "mcpServers": {
+                 "github-pr-context": {
+                     "command": command_val,
+                     "env": {
+                         "GITHUB_TOKEN": "YOUR_GITHUB_TOKEN",
+                         "LLM_PROVIDER": "cerebras",
+                         "LLM_API_KEY": "YOUR_LLM_API_KEY"
+                     }
+                 }
+             }
+         }
+         print(f"\n=== {detected_os.upper()} CONFIG SNIPPET ===", file=sys.stderr)
+         print(f"Detected binary at: {command_val}", file=sys.stderr)
+         print("Copy the JSON below into your mcpConfig.json file:", file=sys.stderr)
+         print(json.dumps(config, indent=2))
+         print("\nNOTE: Ensure you replace YOUR_GITHUB_TOKEN and YOUR_LLM_API_KEY.\n", file=sys.stderr)
+         sys.exit(0)
+
+     # Import here so that env vars from IDE env block are set before mcp_app loads
+     from app.mcp_app import mcp
+
+     mode = _detect_mode()
+     # Send ping in background — startup is never delayed by telemetry
+     threading.Thread(target=_send_startup_ping, args=(mode,), daemon=True).start()
+     # Check for updates in background
+     threading.Thread(target=_check_for_updates, daemon=True).start()
+
+     # Run the server
+     mcp.run(transport="stdio")
+
+
+ if __name__ == "__main__":
+     main()
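For context on the telemetry protocol used by `_send_startup_ping`, here is a hypothetical receiving side (not part of this package): it accepts the `{"id": ..., "mode": ...}` payload on `POST /ping` and counts distinct fingerprints. The host, port, and in-memory storage are placeholder choices.

```python
# Hypothetical /ping receiver for the startup telemetry above (not shipped in the wheel).
import json
from http.server import BaseHTTPRequestHandler, HTTPServer

seen_ids: set[str] = set()  # in-memory only; a real deployment would persist this


class PingHandler(BaseHTTPRequestHandler):
    def do_POST(self):
        if self.path != "/ping":
            self.send_response(404)
            self.end_headers()
            return
        length = int(self.headers.get("Content-Length", 0))
        payload = json.loads(self.rfile.read(length) or b"{}")
        # Payload shape sent by _send_startup_ping: {"id": <sha256 prefix>, "mode": "uvx"|"pipx"|"local"}
        seen_ids.add(str(payload.get("id", "unknown")))
        self.send_response(204)
        self.end_headers()


if __name__ == "__main__":
    HTTPServer(("0.0.0.0", 8080), PingHandler).serve_forever()
```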
fetcher/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from fetcher.client import fetch_prs
+
+ __all__ = ["fetch_prs"]
fetcher/client.py ADDED
@@ -0,0 +1,131 @@
+ # HTTP client for the GitHub GraphQL API.
+ # Handles: auth, pagination, rate limit detection, and user-friendly errors.
+
+ import requests
+ import os
+ import sys
+ from dotenv import load_dotenv
+ from fetcher.queries import PR_QUERY
+ from fetcher.transform import flatten_prs
+
+ load_dotenv()
+
+ GITHUB_GRAPHQL_URL = "https://api.github.com/graphql"
+ MAX_PAGES = 10  # Hard cap to prevent accidental runaway fetches
+
+
+ def _headers(github_token: str | None = None) -> dict:
+     token = (github_token or "").strip() or os.getenv("GITHUB_TOKEN")
+     if not token:
+         raise EnvironmentError(
+             "GITHUB_TOKEN is not set. Add it to your .env file.\n"
+             "Get one at: https://github.com/settings/tokens (repo scope required)"
+         )
+     return {
+         "Authorization": f"Bearer {token}",
+         "Content-Type": "application/json",
+     }
+
+
+ def _check_rate_limit(response: requests.Response) -> None:
+     """Warn if approaching GitHub's GraphQL rate limit."""
+     remaining = response.headers.get("X-RateLimit-Remaining")
+     if remaining is not None and int(remaining) < 100:
+         reset_at = response.headers.get("X-RateLimit-Reset", "unknown")
+         print(
+             f"⚠️ GitHub rate limit low: {remaining} points remaining. "
+             f"Resets at unix timestamp {reset_at}.",
+             file=sys.stderr
+         )
+
+
+ def run_query(query: str, variables: dict, github_token: str | None = None) -> dict:
+     """Execute a raw GraphQL query against the GitHub API."""
+     try:
+         resp = requests.post(
+             GITHUB_GRAPHQL_URL,
+             json={"query": query, "variables": variables},
+             headers=_headers(github_token=github_token),
+             timeout=30,
+         )
+     except requests.exceptions.ConnectionError:
+         raise ConnectionError(
+             "Could not reach GitHub API. Check your internet connection."
+         )
+     except requests.exceptions.Timeout:
+         raise TimeoutError(
+             "GitHub API timed out after 30s. Try again or reduce --pages."
+         )
+
+     _check_rate_limit(resp)
+
+     # Surface actionable errors instead of raw HTTP codes
+     if resp.status_code == 401:
+         raise PermissionError(
+             "GitHub returned 401 Unauthorized. Your GITHUB_TOKEN is invalid or expired.\n"
+             "Generate a new one at: https://github.com/settings/tokens"
+         )
+     if resp.status_code == 403:
+         raise PermissionError(
+             "GitHub returned 403 Forbidden. Your token may lack 'repo' scope, "
+             "or you've exceeded the rate limit."
+         )
+
+     resp.raise_for_status()
+     data = resp.json()
+
+     if "errors" in data:
+         errors = data["errors"]
+         # Repo not found is the most common user error — give a specific message
+         if any(e.get("type") == "NOT_FOUND" for e in errors):
+             owner = variables.get("owner", "?")
+             repo = variables.get("repo", "?")
+             raise ValueError(
+                 f"Repository '{owner}/{repo}' not found or not accessible with your token. "
+                 "Check the owner/repo spelling and that your token has 'repo' scope."
+             )
+         raise ValueError(f"GitHub GraphQL errors: {errors}")
+
+     return data
+
+
+ def fetch_prs(owner: str, repo: str, pages: int = 2, github_token: str | None = None) -> list[dict]:
+     """
+     Fetch up to pages*30 merged/closed PRs with all review context.
+
+     Args:
+         owner: GitHub username or org, e.g. 'psf'
+         repo: Repository name, e.g. 'black'
+         pages: Number of pages to fetch (30 PRs per page).
+             Capped at MAX_PAGES (10) to prevent runaway fetches.
+
+     Returns:
+         List of flattened PR dicts with review comments.
+     """
+     if pages < 1:
+         raise ValueError("pages must be at least 1.")
+     if pages > MAX_PAGES:
+         print(f"⚠️ pages capped at {MAX_PAGES} (requested {pages}).", file=sys.stderr)
+         pages = MAX_PAGES
+
+     all_prs = []
+     cursor = None
+
+     for page_num in range(1, pages + 1):
+         variables = {"owner": owner, "repo": repo}
+         if cursor:
+             variables["cursor"] = cursor
+
+         print(f" Fetching page {page_num}/{pages} for {owner}/{repo}...", file=sys.stderr)
+         data = run_query(PR_QUERY, variables, github_token=github_token)
+         pr_data = data["data"]["repository"]["pullRequests"]
+
+         batch = flatten_prs(pr_data["nodes"])
+         all_prs.extend(batch)
+
+         page_info = pr_data["pageInfo"]
+         if not page_info["hasPreviousPage"]:
+             break
+         cursor = page_info["startCursor"]
+
+     return all_prs
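A minimal usage sketch of `fetch_prs` (not part of the package); `psf/black` is the example repository from the docstring, and a valid `GITHUB_TOKEN` plus network access are assumed.

```python
# Hypothetical caller of fetcher/client.py; requires a real GITHUB_TOKEN at runtime.
import os

from fetcher.client import fetch_prs

try:
    prs = fetch_prs("psf", "black", pages=1, github_token=os.getenv("GITHUB_TOKEN"))
    print(f"Fetched {len(prs)} PRs with review context")
except (OSError, ValueError) as exc:
    # run_query maps common failures (bad token, missing repo, network issues,
    # timeouts) onto PermissionError/ConnectionError/TimeoutError/ValueError.
    print(f"Fetch failed: {exc}")
```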