github-pr-context-mcp 0.2.5__py3-none-any.whl

inference/review.py ADDED
@@ -0,0 +1,175 @@
+ # LLM inference for code review — model-agnostic.
+ # Uses inference/providers.py for the actual LLM call.
+ # Swap providers by changing LLM_PROVIDER in .env, no code changes needed.
+
+ from inference.providers import chat
+
+ REVIEW_SYSTEM_PROMPT = """You are a senior software engineer doing code review.
+ You have access to historical PR review comments from this repository.
+ Use the provided context to give reviews that match the team's standards and catch issues
+ they've flagged before. Be specific, reference line numbers when possible, be concise.
+ Do not be sycophantic. Flag real problems."""
+
+
+ def review_with_context(
+     diff_or_code: str,
+     retrieved_context: list[dict],
+     repo: str,
+     settings: dict | None = None,
+ ) -> str:
+     """Use retrieved RAG context + LLM to do a context-aware code review."""
+     context_text = "\n---\n".join([
+         f"[{c['similarity']:.2f}] {c['text'][:400]}"
+         for c in retrieved_context[:6]
+     ])
+
+     user_message = f"""Repository: {repo}
+
+ HISTORICAL REVIEW CONTEXT (from past PRs in this repo):
+ {context_text}
+
+ ---
+ CODE TO REVIEW:
+ {diff_or_code}
+
+ ---
+ Provide a thorough code review. Reference specific past patterns where relevant.
+ Flag issues the team has flagged before. Note what looks good too."""
+
+     return chat(
+         messages=[{"role": "user", "content": user_message}],
+         system=REVIEW_SYSTEM_PROMPT,
+         max_tokens=1024,
+         settings=settings,
+     )
+
+
+ def summarize_patterns(retrieved_context: list[dict], repo: str, settings: dict | None = None) -> str:
+     """Summarize what this team commonly flags in reviews."""
+     context_text = "\n---\n".join([c["text"][:350] for c in retrieved_context])
+
+     return chat(
+         messages=[{
+             "role": "user",
+             "content": (
+                 f"Repository: {repo}\n\n"
+                 f"Here are past code review comments from this team:\n{context_text}\n\n"
+                 "List the top 5 patterns this team commonly flags in code reviews. "
+                 "Be specific. Quote examples where useful."
+             ),
+         }],
+         max_tokens=512,
+         settings=settings,
+     )
+
+
+ GENERATE_SYSTEM_PROMPT = """You are a senior software engineer assistant.
+ You write code that follows the repository's established patterns, naming conventions, and best practices.
+ You have access to historical PR commits and review comments from this repository.
+ Use the provided context to ensure your generated code matches the team's style and avoids issues they've flagged in the past."""
+
+
+ def generate_with_context(
+     task: str,
+     retrieved_context: list[dict],
+     repo: str,
+     settings: dict | None = None,
+     repo_rules: str | None = None,
+ ) -> str:
+     """Use retrieved RAG context + LLM to generate code grounded in team patterns.
+
+     Args:
+         task: Description of what to implement.
+         retrieved_context: RAG documents from the indexed repo.
+         repo: GitHub repo identifier (owner/name).
+         settings: Optional LLM provider override dict.
+         repo_rules: Contents of a .cursorrules / CLAUDE.md file. When provided,
+             these rules are injected as hard constraints before historical context.
+
+     Returns:
+         Generated code string.
+     """
+     context_text = "\n---\n".join([
+         f"[{c['similarity']:.2f}] {c['text'][:400]}"
+         for c in retrieved_context[:8]
+     ])
+
+     rules_block = ""
+     if repo_rules and repo_rules.strip():
+         # Truncate to 2000 chars to bound prompt size; rules files tend to front-load their most important constraints.
+         trimmed_rules = repo_rules.strip()[:2000]
+         rules_block = f"\nREPO RULES (enforce in ALL generated code):\n{trimmed_rules}\n\n---"
+
+     user_message = f"""Repository: {repo}
+
+ TASK:
+ {task}
+ {rules_block}
+
+ HISTORICAL CONTEXT (from past PRs in this repo):
+ {context_text}
+
+ ---
+ Write the code to complete the task. You MUST follow all REPO RULES above without exception.
+ Ensure the output also matches the coding style, naming conventions, and best practices seen in the historical context.
+ Avoid issues the team has flagged before in similar situations.
+ Provide only the code and necessary brief explanations."""
+
+     return chat(
+         messages=[{"role": "user", "content": user_message}],
+         system=GENERATE_SYSTEM_PROMPT,
+         max_tokens=2048,
+         settings=settings,
+     )
+
+
+ RULES_SYSTEM_PROMPT = """You are a senior engineering lead.
+ Your job is to synthesize a repository's historical PR review comments into a concise,
+ actionable set of rules for IDE agents (Cursor, GitHub Copilot, Claude).
+
+ Output format rules:
+ - Write in clear, imperative statements ("Always ...", "Never ...", "Prefer ...").
+ - Group rules under the headings: Code Quality, Architecture, Testing, Documentation.
+ - Maximum 30 rules total. Be specific. Reference concrete examples from the context.
+ - Do NOT include generic advice not backed by the repo's real history.
+ - Do NOT include any preamble or explanation outside the rule file content itself."""
+
+
+ def generate_rules_content(
+     retrieved_context: list[dict],
+     repo: str,
+     settings: dict | None = None,
+ ) -> str:
+     """Synthesise a .cursorrules / CLAUDE.md / copilot-instructions.md file from indexed PR history.
+
+     Args:
+         retrieved_context: Retrieved RAG documents from the indexed repo.
+         repo: The GitHub repo identifier (owner/name).
+         settings: Optional LLM provider override dict.
+
+     Returns:
+         A markdown string ready to be written as a rules file.
+     """
+     context_text = "\n\n".join([c["text"] for c in retrieved_context])
+
+     user_message = (
+         f"Repository: {repo}\n\n"
+         f"Here are historical PR review comments, commit messages, and code patterns from this repository:\n\n"
+         f"{context_text}\n\n"
+         f"---\n"
+         f"Generate a complete `.cursorrules` / `CLAUDE.md` / `copilot-instructions.md` file "
+         f"for this repository. The file will be loaded automatically by IDE agents so they "
+         f"adhere to this team's standards without needing to re-analyse the PR history.\n\n"
+         f"Start the file with:\n"
+         f"# {repo} — AI Agent Rules\n"
+         f"# Auto-generated by github-pr-context-mcp from repository PR history.\n"
+         f"# Regenerate at any time with: generate_repo_rules tool.\n\n"
+         f"Then write the rules."
+     )
+
+     return chat(
+         messages=[{"role": "user", "content": user_message}],
+         system=RULES_SYSTEM_PROMPT,
+         max_tokens=2048,
+         settings=settings,
+     )
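
A minimal usage sketch (not part of the package) showing how the pieces above are meant to compose: retrieve context with query_similar from the storage layer (added later in this diff), then feed it to review_with_context. The repo name and diff text are hypothetical placeholders, and an LLM_PROVIDER is assumed to be configured in .env.

    # usage_sketch_review.py — illustrative only, not shipped in the wheel
    from storage import query_similar
    from inference.review import review_with_context

    repo = "acme/widgets"  # hypothetical owner/name
    diff = "def handle(req):\n    return req.json['id']  # no validation"

    # Retrieve past review comments most similar to the code under review,
    # then ask the LLM for a review grounded in that history.
    context = query_similar(repo, diff, n_results=6)
    print(review_with_context(diff, context, repo))
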
storage/__init__.py ADDED
@@ -0,0 +1,19 @@
+ from storage.vector_store import (
+     index_prs,
+     query_similar,
+     get_collection_stats,
+     list_all_repos,
+     delete_repo_index,
+     repo_is_indexed_permanently,
+     repo_is_indexed_temporarily,
+ )
+
+ __all__ = [
+     "index_prs",
+     "query_similar",
+     "get_collection_stats",
+     "list_all_repos",
+     "delete_repo_index",
+     "repo_is_indexed_permanently",
+     "repo_is_indexed_temporarily",
+ ]
storage/document_builder.py ADDED
@@ -0,0 +1,74 @@
+ # Converts raw PR dicts → text documents ready for embedding + storage.
+ # No model, no ChromaDB, no GitHub calls here.
+
+ import json
+
+
+ def build_documents(prs: list[dict]) -> tuple[list, list, list]:
+     """
+     Convert a list of PR dicts into (documents, metadatas, ids)
+     ready to be encoded and upserted into ChromaDB.
+     """
+     docs, metadatas, ids = [], [], []
+
+     for pr in prs:
+         pr_num = pr["number"]
+
+         # PR description
+         if pr["body"].strip():
+             docs.append(f"PR #{pr_num}: {pr['title']}\n{pr['body']}")
+             metadatas.append({
+                 "type": "pr_description",
+                 "pr_number": pr_num,
+                 "author": pr["author"],
+                 "files": json.dumps([f["path"] for f in pr["files"]]),
+             })
+             ids.append(f"pr-{pr_num}-desc")
+
+         # Inline review comments + code context
+         for i, comment in enumerate(pr["review_comments"]):
+             if not comment["body"].strip():
+                 continue
+
+             diff_text = f"\nCode Context:\n{comment['diff_hunk']}" if comment.get("diff_hunk") else ""
+             docs.append(
+                 f"PR #{pr_num} | File: {comment['file']} | Line: {comment['line']}{diff_text}\n"
+                 f"Reviewer ({comment['author']}): {comment['body']}"
+             )
+             metadatas.append({
+                 "type": "review_comment",
+                 "pr_number": pr_num,
+                 "file": comment["file"],
+                 "author": comment["author"],
+                 "resolved": comment["resolved"],
+             })
+             ids.append(f"pr-{pr_num}-comment-{i}")
+
+         # Commit messages
+         for i, commit in enumerate(pr.get("commits", [])):
+             if not commit["message"].strip():
+                 continue
+             docs.append(f"PR #{pr_num} Commit: {commit['message']}")
+             metadatas.append({
+                 "type": "commit_message",
+                 "pr_number": pr_num,
+             })
+             ids.append(f"pr-{pr_num}-commit-{i}")
+
+         # Overall review summaries (only those with a written body)
+         for i, review in enumerate(pr["reviews"]):
+             if not review["body"].strip():
+                 continue
+             docs.append(
+                 f"PR #{pr_num} overall review by {review['author']} "
+                 f"[{review['state']}]: {review['body']}"
+             )
+             metadatas.append({
+                 "type": "review_summary",
+                 "pr_number": pr_num,
+                 "state": review["state"],
+                 "author": review["author"],
+             })
+             ids.append(f"pr-{pr_num}-review-{i}")
+
+     return docs, metadatas, ids
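
For reference, a sketch (not in the package) of the minimal PR dict shape build_documents expects, inferred from the keys it reads above; all values are hypothetical placeholders.

    pr = {
        "number": 42,
        "title": "Add retry logic",
        "body": "Retries transient GitHub API failures.",
        "author": "octocat",
        "files": [{"path": "client.py"}],
        "review_comments": [{
            "body": "Use exponential backoff here.",
            "file": "client.py",
            "line": 10,
            "author": "reviewer1",
            "resolved": True,
            "diff_hunk": "@@ -8,3 +8,4 @@",   # optional; omit to skip the code-context block
        }],
        "commits": [{"message": "Add retry wrapper"}],
        "reviews": [{"body": "LGTM", "state": "APPROVED", "author": "reviewer1"}],
    }

    docs, metadatas, ids = build_documents([pr])
    assert ids[0] == "pr-42-desc"  # the description document comes first
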
storage/encoder.py ADDED
@@ -0,0 +1,35 @@
+ # SentenceTransformer model loading and text encoding only.
+ # No ChromaDB, no PR logic here.
+ #
+ # Model is lazy-loaded on first call instead of at import time.
+ # This prevents Render health checks from failing during cold start
+ # (the model download takes ~20-30s, which exceeds Render's health check window).
+
+ from __future__ import annotations
+ from threading import Lock
+
+ _model = None
+ _model_lock = Lock()
+ _MODEL_NAME = "all-MiniLM-L6-v2"
+
+
+ def _get_model():
+     """Lazy-load the SentenceTransformer model — only once, thread-safe."""
+     global _model
+     if _model is None:
+         with _model_lock:
+             if _model is None:  # double-checked locking
+                 from sentence_transformers import SentenceTransformer
+                 _model = SentenceTransformer(_MODEL_NAME)
+     return _model
+
+
+ def encode(text: str) -> list[float]:
+     """Encode a single string into a vector."""
+     return _get_model().encode(text).tolist()
+
+
+ def encode_batch(texts: list[str]) -> list[list[float]]:
+     """Encode a list of strings into vectors in one batched pass."""
+     model = _get_model()  # encode() accepts a list and batches internally
+     return model.encode(texts).tolist()
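
A quick sketch (not in the package) of the encoder in use. all-MiniLM-L6-v2 produces 384-dimensional vectors, which pair with the cosine-distance collections configured in the vector store below.

    from storage.encoder import encode, encode_batch

    vec = encode("Reviewer: please add input validation")
    assert len(vec) == 384  # all-MiniLM-L6-v2 embedding size

    vecs = encode_batch(["add tests", "missing error handling"])
    assert len(vecs) == 2 and len(vecs[0]) == 384
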
storage/vector_store.py ADDED
@@ -0,0 +1,270 @@
+ # ChromaDB client management, indexing, querying, and repo listing.
+ # No ML model loading, no PR transformation, no GitHub calls here.
+
+ import chromadb
+ import os
+ import hashlib
+ import re
+ import sys
+ from dotenv import load_dotenv
+ from storage.encoder import encode
+ from storage.document_builder import build_documents
+
+ load_dotenv()
+
+ _DEFAULT_CHROMA_DIR = os.path.join(os.path.expanduser("~"), ".github-pr-mcp", "chroma_db")
+ PERSIST_DIR = os.getenv("CHROMA_PERSIST_DIR", _DEFAULT_CHROMA_DIR)
+
+ # Persistent = survives restarts, stored on disk
+ _persistent_client = chromadb.PersistentClient(path=PERSIST_DIR)
+
+ # Ephemeral = in-memory only, wiped when the MCP server process stops
+ _ephemeral_client = chromadb.EphemeralClient()
+
+
+ # ── Internal helpers ──────────────────────────────────────────────────────────
+
+ def _normalize_namespace(namespace: str | None) -> str | None:
+     if namespace is None:
+         return None
+     ns = namespace.strip()
+     return ns or None
+
+ def _safe_namespace(namespace: str | None) -> str | None:
+     ns = _normalize_namespace(namespace)
+     if ns is None:
+         return None
+     # Keep names portable across Chroma backends.
+     return re.sub(r"[^A-Za-z0-9_-]", "-", ns)
+
+ def _safe_name(repo_key: str) -> str:
+     return repo_key.replace("/", "--")
+
+ def _collection_name(repo_key: str, namespace: str | None = None) -> str:
+     # Strictly ONE collection per repository, so the collection count stays bounded no matter how many namespaces index the same repo.
+     # User isolation is handled by injecting the namespace into document metadata and applying `where` filters.
+     return _safe_name(repo_key)
+
+ def _collection_metadata(repo_key: str, namespace: str | None = None) -> dict:
+     metadata = {
+         "hnsw:space": "cosine",
+         "repo": repo_key,
+     }
+     ns = _normalize_namespace(namespace)
+     if ns is not None:
+         metadata["namespace"] = ns
+     return metadata
+
+ def _collection_repo(col) -> str:
+     meta = col.metadata or {}
+     if "repo" in meta:
+         return meta["repo"]
+     # Backward compatibility for collections created before metadata tagging.
+     return col.name.replace("--", "/")
+
+ def _collection_namespace(col) -> str | None:
+     meta = col.metadata or {}
+     ns = meta.get("namespace")
+     return _normalize_namespace(ns) if isinstance(ns, str) else None
+
+ def _client(temporary: bool):
+     return _ephemeral_client if temporary else _persistent_client
+
+ def _get_collection(repo_key: str, temporary: bool = False, namespace: str | None = None):
+     return _client(temporary).get_or_create_collection(
+         name=_collection_name(repo_key, namespace=namespace),
+         metadata=_collection_metadata(repo_key, namespace=namespace),
+     )
+
+
+ # ── Status checks ─────────────────────────────────────────────────────────────
+
+ def repo_is_indexed_permanently(repo_key: str, namespace: str | None = None) -> bool:
+     try:
+         col = _persistent_client.get_collection(_collection_name(repo_key, namespace=namespace))
+         return col.count() > 0
+     except Exception:
+         return False
+
+ def repo_is_indexed_temporarily(repo_key: str, namespace: str | None = None) -> bool:
+     try:
+         col = _ephemeral_client.get_collection(_collection_name(repo_key, namespace=namespace))
+         return col.count() > 0
+     except Exception:
+         return False
+
+
+ # ── Listing ───────────────────────────────────────────────────────────────────
+
+ def list_all_repos(namespace: str | None = None) -> list[dict]:
+     ns_filter = _normalize_namespace(namespace)
+
+     def _rows(client, storage_label: str) -> list[dict]:
+         items = []
+         for col in client.list_collections():
+             repo = _collection_repo(col)
+             repo_ns = _collection_namespace(col)
+             if ns_filter is not None and repo_ns != ns_filter:
+                 continue
+             items.append({
+                 "repo": repo,
+                 "namespace": repo_ns,
+                 "total_documents": col.count(),
+                 "storage": storage_label,
+             })
+         return items
+
+     permanent = _rows(_persistent_client, "permanent")
+     temporary = _rows(_ephemeral_client, "temporary")
+     return permanent + temporary
+
+
+ # ── Indexing ──────────────────────────────────────────────────────────────────
+
+ def index_prs(
+     repo_key: str,
+     prs: list[dict],
+     temporary: bool = False,
+     namespace: str | None = None,
+ ) -> int:
+     """
+     Embed and store all PR documents.
+     temporary=False → persistent on-disk ChromaDB
+     temporary=True → ephemeral in-memory (lost on server restart)
+     """
+     collection = _get_collection(repo_key, temporary=temporary, namespace=namespace)
+     docs, metadatas, ids = build_documents(prs)
+
+     if not docs:
+         return 0
+
+     ns = _normalize_namespace(namespace)
+     for meta in metadatas:
+         if ns:
+             meta["namespace"] = ns
+
+     embeddings = [encode(doc) for doc in docs]
+     collection.upsert(documents=docs, embeddings=embeddings, metadatas=metadatas, ids=ids)
+
+     label = "temporary (in-memory)" if temporary else "permanent (disk)"
+     ns = _normalize_namespace(namespace)
+     ns_suffix = f", namespace={ns}" if ns else ""
+     print(f"Indexed {len(docs)} documents for {repo_key} [{label}{ns_suffix}]", file=sys.stderr)
+     return len(docs)
+
+
+ # ── Querying ──────────────────────────────────────────────────────────────────
+
+ def query_similar(
+     repo_key: str,
+     query_text: str,
+     n_results: int = 8,
+     temporary: bool = False,
+     namespace: str | None = None,
+ ) -> list[dict]:
+     collection = _get_collection(repo_key, temporary=temporary, namespace=namespace)
+     total = collection.count()
+     if total == 0:
+         return []
+
+     ns = _normalize_namespace(namespace)
+     where_filter = {"namespace": ns} if ns else None
+
+     # We must explicitly query with a where_filter to isolate queries to this namespace's vectors
+     results = collection.query(
+         query_embeddings=[encode(query_text)],
+         n_results=n_results,  # We might get fewer than n_results back, which is fine
+         where=where_filter,
+         include=["documents", "metadatas", "distances"],
+     )
+
+     if not results["documents"] or not results["documents"][0]:
+         return []
+
+     return [
+         {
+             "text": doc,
+             "metadata": meta,
+             "similarity": round(1 - dist, 4),
+         }
+         for doc, meta, dist in zip(
+             results["documents"][0],
+             results["metadatas"][0],
+             results["distances"][0],
+         )
+     ]
+
+
+ # ── Stats ─────────────────────────────────────────────────────────────────────
+
+ def get_collection_stats(
+     repo_key: str,
+     temporary: bool = False,
+     namespace: str | None = None,
+ ) -> dict:
+     collection = _get_collection(repo_key, temporary=temporary, namespace=namespace)
+     ns = _normalize_namespace(namespace)
+     where_filter = {"namespace": ns} if ns else None
+
+     try:
+         data = collection.get(where=where_filter, include=[])
+         count = len(data["ids"]) if data and "ids" in data else 0
+     except Exception:
+         count = 0
+
+     return {
+         "repo": repo_key,
+         "namespace": ns,
+         "total_documents": count,
+         "storage": "temporary" if temporary else "permanent",
+     }
+
+
+ # ── Deletion ──────────────────────────────────────────────────────────────────
+
+ def delete_repo_index(
+     repo_key: str,
+     storage: str = "both",
+     namespace: str | None = None,
+ ) -> dict:
+     if storage not in {"temporary", "permanent", "both"}:
+         raise ValueError("storage must be one of: temporary, permanent, both")
+
+     name = _collection_name(repo_key, namespace=namespace)
+     ns = _normalize_namespace(namespace)
+     where_filter = {"namespace": ns} if ns else None
+
+     deleted = {
+         "temporary": False,
+         "permanent": False,
+     }
+
+     if storage in {"temporary", "both"}:
+         try:
+             col = _ephemeral_client.get_collection(name)
+             if where_filter:
+                 col.delete(where=where_filter)
+             else:
+                 _ephemeral_client.delete_collection(name)
+             deleted["temporary"] = True
+         except Exception:
+             pass
+
+     if storage in {"permanent", "both"}:
+         try:
+             col = _persistent_client.get_collection(name)
+             if where_filter:
+                 col.delete(where=where_filter)
+             else:
+                 _persistent_client.delete_collection(name)
+             deleted["permanent"] = True
+         except Exception:
+             pass
+
+     return {
+         "repo": repo_key,
+         "namespace": _normalize_namespace(namespace),
+         "storage": storage,
+         "deleted": deleted,
+         "deleted_any": any(deleted.values()),
+     }
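
Finally, an end-to-end sketch (not in the package) tying the storage layer together: index one PR into the in-memory store under a namespace, then query it. It reuses the placeholder pr dict from the document_builder sketch above; in the real server the PR list comes from a GitHub-fetching layer this diff does not show.

    from storage import index_prs, query_similar, get_collection_stats

    repo = "acme/widgets"  # hypothetical owner/name
    count = index_prs(repo, [pr], temporary=True, namespace="alice")

    hits = query_similar(
        repo, "error handling in the API client",
        n_results=5, temporary=True, namespace="alice",
    )
    for h in hits:
        print(h["similarity"], h["metadata"]["type"], h["text"][:80])

    print(get_collection_stats(repo, temporary=True, namespace="alice"))
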