github-pr-context-mcp 0.2.5__py3-none-any.whl

inference/review.py ADDED
@@ -0,0 +1,175 @@
+ # LLM inference for code review — model-agnostic.
+ # Uses inference/providers.py for the actual LLM call.
+ # Swap providers by changing LLM_PROVIDER in .env, no code changes needed.
+
+ from inference.providers import chat
+
+ REVIEW_SYSTEM_PROMPT = """You are a senior software engineer doing code review.
+ You have access to historical PR review comments from this repository.
+ Use the provided context to give reviews that match the team's standards and catch issues
+ they've flagged before. Be specific, reference line numbers when possible, be concise.
+ Do not be sycophantic. Flag real problems."""
+
+
+ def review_with_context(
+     diff_or_code: str,
+     retrieved_context: list[dict],
+     repo: str,
+     settings: dict | None = None,
+ ) -> str:
+     """Use retrieved RAG context + LLM to do a context-aware code review."""
+     context_text = "\n---\n".join([
+         f"[{c['similarity']:.2f}] {c['text'][:400]}"
+         for c in retrieved_context[:6]
+     ])
+
+     user_message = f"""Repository: {repo}
+
+ HISTORICAL REVIEW CONTEXT (from past PRs in this repo):
+ {context_text}
+
+ ---
+ CODE TO REVIEW:
+ {diff_or_code}
+
+ ---
+ Provide a thorough code review. Reference specific past patterns where relevant.
+ Flag issues the team has flagged before. Note what looks good too."""
+
+     return chat(
+         messages=[{"role": "user", "content": user_message}],
+         system=REVIEW_SYSTEM_PROMPT,
+         max_tokens=1024,
+         settings=settings,
+     )
+
+
+ def summarize_patterns(retrieved_context: list[dict], repo: str, settings: dict | None = None) -> str:
+     """Summarize what this team commonly flags in reviews."""
+     context_text = "\n---\n".join([c["text"][:350] for c in retrieved_context])
+
+     return chat(
+         messages=[{
+             "role": "user",
+             "content": (
+                 f"Repository: {repo}\n\n"
+                 f"Here are past code review comments from this team:\n{context_text}\n\n"
+                 "List the top 5 patterns this team commonly flags in code reviews. "
+                 "Be specific. Quote examples where useful."
+             ),
+         }],
+         max_tokens=512,
+         settings=settings,
+     )
+
+
+ GENERATE_SYSTEM_PROMPT = """You are a senior software engineer assistant.
+ You write code that follows the repository's established patterns, naming conventions, and best practices.
+ You have access to historical PR commits and review comments from this repository.
+ Use the provided context to ensure your generated code matches the team's style and avoids issues they've flagged in the past."""
+
+
+ def generate_with_context(
+     task: str,
+     retrieved_context: list[dict],
+     repo: str,
+     settings: dict | None = None,
+     repo_rules: str | None = None,
+ ) -> str:
+     """Use retrieved RAG context + LLM to generate code grounded in team patterns.
+
+     Args:
+         task: Description of what to implement.
+         retrieved_context: RAG documents from the indexed repo.
+         repo: GitHub repo identifier (owner/name).
+         settings: Optional LLM provider override dict.
+         repo_rules: Contents of a .cursorrules / CLAUDE.md file. When provided,
+             these rules are injected as hard constraints before historical context.
+
+     Returns:
+         Generated code string.
+     """
+     context_text = "\n---\n".join([
+         f"[{c['similarity']:.2f}] {c['text'][:400]}"
+         for c in retrieved_context[:8]
+     ])
+
+     rules_block = ""
+     if repo_rules and repo_rules.strip():
+         # Truncate to 2000 chars to bound prompt size; rules files tend to front-load their most important constraints.
+         trimmed_rules = repo_rules.strip()[:2000]
+         rules_block = f"\nREPO RULES (enforce in ALL generated code):\n{trimmed_rules}\n\n---"
+
+     user_message = f"""Repository: {repo}
+
+ TASK:
+ {task}
+ {rules_block}
+
+ HISTORICAL CONTEXT (from past PRs in this repo):
+ {context_text}
+
+ ---
+ Write the code to complete the task. You MUST follow all REPO RULES above without exception.
+ Ensure the output also matches the coding style, naming conventions, and best practices seen in the historical context.
+ Avoid issues the team has flagged before in similar situations.
+ Provide only the code and necessary brief explanations."""
+
+     return chat(
+         messages=[{"role": "user", "content": user_message}],
+         system=GENERATE_SYSTEM_PROMPT,
+         max_tokens=2048,
+         settings=settings,
+     )
+
+
+ RULES_SYSTEM_PROMPT = """You are a senior engineering lead.
+ Your job is to synthesize a repository's historical PR review comments into a concise,
+ actionable set of rules for IDE agents (Cursor, GitHub Copilot, Claude).
+
+ Output format rules:
+ - Write in clear, imperative statements ("Always ...", "Never ...", "Prefer ...").
+ - Group rules under the headings: Code Quality, Architecture, Testing, Documentation.
+ - Maximum 30 rules total. Be specific. Reference concrete examples from the context.
+ - Do NOT include generic advice not backed by the repo's real history.
+ - Do NOT include any preamble or explanation outside the rule file content itself."""
+
+
+ def generate_rules_content(
+     retrieved_context: list[dict],
+     repo: str,
+     settings: dict | None = None,
+ ) -> str:
+     """Synthesise a .cursorrules / CLAUDE.md / copilot-instructions.md file from indexed PR history.
+
+     Args:
+         retrieved_context: Retrieved RAG documents from the indexed repo.
+         repo: The GitHub repo identifier (owner/name).
+         settings: Optional LLM provider override dict.
+
+     Returns:
+         A markdown string ready to be written as a rules file.
+     """
+     context_text = "\n\n".join([c["text"] for c in retrieved_context])
+
+     user_message = (
+         f"Repository: {repo}\n\n"
+         f"Here are historical PR review comments, commit messages, and code patterns from this repository:\n\n"
+         f"{context_text}\n\n"
+         f"---\n"
+         f"Generate a complete `.cursorrules` / `CLAUDE.md` / `copilot-instructions.md` file "
+         f"for this repository. The file will be loaded automatically by IDE agents so they "
+         f"adhere to this team's standards without needing to re-analyse the PR history.\n\n"
+         f"Start the file with:\n"
+         f"# {repo} — AI Agent Rules\n"
+         f"# Auto-generated by github-pr-context-mcp from repository PR history.\n"
+         f"# Regenerate at any time with: generate_repo_rules tool.\n\n"
+         f"Then write the rules."
+     )
+
+     return chat(
+         messages=[{"role": "user", "content": user_message}],
+         system=RULES_SYSTEM_PROMPT,
+         max_tokens=2048,
+         settings=settings,
+     )
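
A minimal usage sketch (not part of the package) showing how the pieces above are meant to compose: retrieve context with query_similar from the storage layer (added later in this diff), then feed it to review_with_context. The repo name and diff text are hypothetical placeholders, and an LLM_PROVIDER is assumed to be configured in .env.

    # usage_sketch_review.py — illustrative only, not shipped in the wheel
    from storage import query_similar
    from inference.review import review_with_context

    repo = "acme/widgets"  # hypothetical owner/name
    diff = "def handle(req):\n    return req.json['id']  # no validation"

    # Retrieve past review comments most similar to the code under review,
    # then ask the LLM for a review grounded in that history.
    context = query_similar(repo, diff, n_results=6)
    print(review_with_context(diff, context, repo))
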
storage/__init__.py ADDED
@@ -0,0 +1,19 @@
+ from storage.vector_store import (
+     index_prs,
+     query_similar,
+     get_collection_stats,
+     list_all_repos,
+     delete_repo_index,
+     repo_is_indexed_permanently,
+     repo_is_indexed_temporarily,
+ )
+
+ __all__ = [
+     "index_prs",
+     "query_similar",
+     "get_collection_stats",
+     "list_all_repos",
+     "delete_repo_index",
+     "repo_is_indexed_permanently",
+     "repo_is_indexed_temporarily",
+ ]
storage/document_builder.py ADDED
@@ -0,0 +1,74 @@
+ # Converts raw PR dicts → text documents ready for embedding + storage.
+ # No model, no ChromaDB, no GitHub calls here.
+
+ import json
+
+
+ def build_documents(prs: list[dict]) -> tuple[list, list, list]:
+     """
+     Convert a list of PR dicts into (documents, metadatas, ids)
+     ready to be encoded and upserted into ChromaDB.
+     """
+     docs, metadatas, ids = [], [], []
+
+     for pr in prs:
+         pr_num = pr["number"]
+
+         # PR description
+         if pr["body"].strip():
+             docs.append(f"PR #{pr_num}: {pr['title']}\n{pr['body']}")
+             metadatas.append({
+                 "type": "pr_description",
+                 "pr_number": pr_num,
+                 "author": pr["author"],
+                 "files": json.dumps([f["path"] for f in pr["files"]]),
+             })
+             ids.append(f"pr-{pr_num}-desc")
+
+         # Inline review comments + code context
+         for i, comment in enumerate(pr["review_comments"]):
+             if not comment["body"].strip():
+                 continue
+
+             diff_text = f"\nCode Context:\n{comment['diff_hunk']}" if comment.get("diff_hunk") else ""
+             docs.append(
+                 f"PR #{pr_num} | File: {comment['file']} | Line: {comment['line']}{diff_text}\n"
+                 f"Reviewer ({comment['author']}): {comment['body']}"
+             )
+             metadatas.append({
+                 "type": "review_comment",
+                 "pr_number": pr_num,
+                 "file": comment["file"],
+                 "author": comment["author"],
+                 "resolved": comment["resolved"],
+             })
+             ids.append(f"pr-{pr_num}-comment-{i}")
+
+         # Commit messages
+         for i, commit in enumerate(pr.get("commits", [])):
+             if not commit["message"].strip():
+                 continue
+             docs.append(f"PR #{pr_num} Commit: {commit['message']}")
+             metadatas.append({
+                 "type": "commit_message",
+                 "pr_number": pr_num,
+             })
+             ids.append(f"pr-{pr_num}-commit-{i}")
+
+         # Overall review summaries (only those with a written body)
+         for i, review in enumerate(pr["reviews"]):
+             if not review["body"].strip():
+                 continue
+             docs.append(
+                 f"PR #{pr_num} overall review by {review['author']} "
+                 f"[{review['state']}]: {review['body']}"
+             )
+             metadatas.append({
+                 "type": "review_summary",
+                 "pr_number": pr_num,
+                 "state": review["state"],
+                 "author": review["author"],
+             })
+             ids.append(f"pr-{pr_num}-review-{i}")
+
+     return docs, metadatas, ids
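
For reference, a sketch (not in the package) of the minimal PR dict shape build_documents expects, inferred from the keys it reads above; all values are hypothetical placeholders.

    pr = {
        "number": 42,
        "title": "Add retry logic",
        "body": "Retries transient GitHub API failures.",
        "author": "octocat",
        "files": [{"path": "client.py"}],
        "review_comments": [{
            "body": "Use exponential backoff here.",
            "file": "client.py",
            "line": 10,
            "author": "reviewer1",
            "resolved": True,
            "diff_hunk": "@@ -8,3 +8,4 @@",   # optional; omit to skip the code-context block
        }],
        "commits": [{"message": "Add retry wrapper"}],
        "reviews": [{"body": "LGTM", "state": "APPROVED", "author": "reviewer1"}],
    }

    docs, metadatas, ids = build_documents([pr])
    assert ids[0] == "pr-42-desc"  # the description document comes first
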
storage/encoder.py ADDED
@@ -0,0 +1,35 @@
+ # SentenceTransformer model loading and text encoding only.
+ # No ChromaDB, no PR logic here.
+ #
+ # Model is lazy-loaded on first call instead of at import time.
+ # This prevents Render health checks from failing during cold start
+ # (the model download takes ~20-30s, which exceeds Render's health check window).
+
+ from __future__ import annotations
+ from threading import Lock
+
+ _model = None
+ _model_lock = Lock()
+ _MODEL_NAME = "all-MiniLM-L6-v2"
+
+
+ def _get_model():
+     """Lazy-load the SentenceTransformer model — only once, thread-safe."""
+     global _model
+     if _model is None:
+         with _model_lock:
+             if _model is None:  # double-checked locking
+                 from sentence_transformers import SentenceTransformer
+                 _model = SentenceTransformer(_MODEL_NAME)
+     return _model
+
+
+ def encode(text: str) -> list[float]:
+     """Encode a single string into a vector."""
+     return _get_model().encode(text).tolist()
+
+
+ def encode_batch(texts: list[str]) -> list[list[float]]:
+     """Encode a list of strings into vectors in one batched pass."""
+     model = _get_model()  # encode() accepts a list and batches internally
+     return model.encode(texts).tolist()
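
A quick sketch (not in the package) of the encoder in use. all-MiniLM-L6-v2 produces 384-dimensional vectors, which pair with the cosine-distance collections configured in the vector store below.

    from storage.encoder import encode, encode_batch

    vec = encode("Reviewer: please add input validation")
    assert len(vec) == 384  # all-MiniLM-L6-v2 embedding size

    vecs = encode_batch(["add tests", "missing error handling"])
    assert len(vecs) == 2 and len(vecs[0]) == 384
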
storage/vector_store.py ADDED
@@ -0,0 +1,270 @@
+ # ChromaDB client management, indexing, querying, and repo listing.
+ # No ML model loading, no PR transformation, no GitHub calls here.
+
+ import chromadb
+ import os
+ import hashlib
+ import re
+ import sys
+ from dotenv import load_dotenv
+ from storage.encoder import encode
+ from storage.document_builder import build_documents
+
+ load_dotenv()
+
+ _DEFAULT_CHROMA_DIR = os.path.join(os.path.expanduser("~"), ".github-pr-mcp", "chroma_db")
+ PERSIST_DIR = os.getenv("CHROMA_PERSIST_DIR", _DEFAULT_CHROMA_DIR)
+
+ # Persistent = survives restarts, stored on disk
+ _persistent_client = chromadb.PersistentClient(path=PERSIST_DIR)
+
+ # Ephemeral = in-memory only, wiped when the MCP server process stops
+ _ephemeral_client = chromadb.EphemeralClient()
+
+
+ # ── Internal helpers ──────────────────────────────────────────────────────────
+
+ def _normalize_namespace(namespace: str | None) -> str | None:
+     if namespace is None:
+         return None
+     ns = namespace.strip()
+     return ns or None
+
+ def _safe_namespace(namespace: str | None) -> str | None:
+     ns = _normalize_namespace(namespace)
+     if ns is None:
+         return None
+     # Keep names portable across Chroma backends.
+     return re.sub(r"[^A-Za-z0-9_-]", "-", ns)
+
+ def _safe_name(repo_key: str) -> str:
+     return repo_key.replace("/", "--")
+
+ def _collection_name(repo_key: str, namespace: str | None = None) -> str:
+     # Strictly ONE collection per repository, so the collection count stays bounded no matter how many namespaces index the same repo.
+     # User isolation is handled by injecting the namespace into document metadata and applying `where` filters.
+     return _safe_name(repo_key)
+
+ def _collection_metadata(repo_key: str, namespace: str | None = None) -> dict:
+     metadata = {
+         "hnsw:space": "cosine",
+         "repo": repo_key,
+     }
+     ns = _normalize_namespace(namespace)
+     if ns is not None:
+         metadata["namespace"] = ns
+     return metadata
+
+ def _collection_repo(col) -> str:
+     meta = col.metadata or {}
+     if "repo" in meta:
+         return meta["repo"]
+     # Backward compatibility for collections created before metadata tagging.
+     return col.name.replace("--", "/")
+
+ def _collection_namespace(col) -> str | None:
+     meta = col.metadata or {}
+     ns = meta.get("namespace")
+     return _normalize_namespace(ns) if isinstance(ns, str) else None
+
+ def _client(temporary: bool):
+     return _ephemeral_client if temporary else _persistent_client
+
+ def _get_collection(repo_key: str, temporary: bool = False, namespace: str | None = None):
+     return _client(temporary).get_or_create_collection(
+         name=_collection_name(repo_key, namespace=namespace),
+         metadata=_collection_metadata(repo_key, namespace=namespace),
+     )
+
+
+ # ── Status checks ─────────────────────────────────────────────────────────────
+
+ def repo_is_indexed_permanently(repo_key: str, namespace: str | None = None) -> bool:
+     try:
+         col = _persistent_client.get_collection(_collection_name(repo_key, namespace=namespace))
+         return col.count() > 0
+     except Exception:
+         return False
+
+ def repo_is_indexed_temporarily(repo_key: str, namespace: str | None = None) -> bool:
+     try:
+         col = _ephemeral_client.get_collection(_collection_name(repo_key, namespace=namespace))
+         return col.count() > 0
+     except Exception:
+         return False
+
+
+ # ── Listing ───────────────────────────────────────────────────────────────────
+
+ def list_all_repos(namespace: str | None = None) -> list[dict]:
+     ns_filter = _normalize_namespace(namespace)
+
+     def _rows(client, storage_label: str) -> list[dict]:
+         items = []
+         for col in client.list_collections():
+             repo = _collection_repo(col)
+             repo_ns = _collection_namespace(col)
+             if ns_filter is not None and repo_ns != ns_filter:
+                 continue
+             items.append({
+                 "repo": repo,
+                 "namespace": repo_ns,
+                 "total_documents": col.count(),
+                 "storage": storage_label,
+             })
+         return items
+
+     permanent = _rows(_persistent_client, "permanent")
+     temporary = _rows(_ephemeral_client, "temporary")
+     return permanent + temporary
+
+
+ # ── Indexing ──────────────────────────────────────────────────────────────────
+
+ def index_prs(
+     repo_key: str,
+     prs: list[dict],
+     temporary: bool = False,
+     namespace: str | None = None,
+ ) -> int:
+     """
+     Embed and store all PR documents.
+     temporary=False → persistent on-disk ChromaDB
+     temporary=True → ephemeral in-memory (lost on server restart)
+     """
+     collection = _get_collection(repo_key, temporary=temporary, namespace=namespace)
+     docs, metadatas, ids = build_documents(prs)
+
+     if not docs:
+         return 0
+
+     ns = _normalize_namespace(namespace)
+     for meta in metadatas:
+         if ns:
+             meta["namespace"] = ns
+
+     embeddings = [encode(doc) for doc in docs]
+     collection.upsert(documents=docs, embeddings=embeddings, metadatas=metadatas, ids=ids)
+
+     label = "temporary (in-memory)" if temporary else "permanent (disk)"
+     ns = _normalize_namespace(namespace)
+     ns_suffix = f", namespace={ns}" if ns else ""
+     print(f"Indexed {len(docs)} documents for {repo_key} [{label}{ns_suffix}]", file=sys.stderr)
+     return len(docs)
+
+
+ # ── Querying ──────────────────────────────────────────────────────────────────
+
+ def query_similar(
+     repo_key: str,
+     query_text: str,
+     n_results: int = 8,
+     temporary: bool = False,
+     namespace: str | None = None,
+ ) -> list[dict]:
+     collection = _get_collection(repo_key, temporary=temporary, namespace=namespace)
+     total = collection.count()
+     if total == 0:
+         return []
+
+     ns = _normalize_namespace(namespace)
+     where_filter = {"namespace": ns} if ns else None
+
+     # We must explicitly query with a where_filter to isolate queries to this namespace's vectors
+     results = collection.query(
+         query_embeddings=[encode(query_text)],
+         n_results=n_results,  # We might get fewer than n_results back, which is fine
+         where=where_filter,
+         include=["documents", "metadatas", "distances"],
+     )
+
+     if not results["documents"] or not results["documents"][0]:
+         return []
+
+     return [
+         {
+             "text": doc,
+             "metadata": meta,
+             "similarity": round(1 - dist, 4),
+         }
+         for doc, meta, dist in zip(
+             results["documents"][0],
+             results["metadatas"][0],
+             results["distances"][0],
+         )
+     ]
+
+
+ # ── Stats ─────────────────────────────────────────────────────────────────────
+
+ def get_collection_stats(
+     repo_key: str,
+     temporary: bool = False,
+     namespace: str | None = None,
+ ) -> dict:
+     collection = _get_collection(repo_key, temporary=temporary, namespace=namespace)
+     ns = _normalize_namespace(namespace)
+     where_filter = {"namespace": ns} if ns else None
+
+     try:
+         data = collection.get(where=where_filter, include=[])
+         count = len(data["ids"]) if data and "ids" in data else 0
+     except Exception:
+         count = 0
+
+     return {
+         "repo": repo_key,
+         "namespace": ns,
+         "total_documents": count,
+         "storage": "temporary" if temporary else "permanent",
+     }
+
+
+ # ── Deletion ──────────────────────────────────────────────────────────────────
+
+ def delete_repo_index(
+     repo_key: str,
+     storage: str = "both",
+     namespace: str | None = None,
+ ) -> dict:
+     if storage not in {"temporary", "permanent", "both"}:
+         raise ValueError("storage must be one of: temporary, permanent, both")
+
+     name = _collection_name(repo_key, namespace=namespace)
+     ns = _normalize_namespace(namespace)
+     where_filter = {"namespace": ns} if ns else None
+
+     deleted = {
+         "temporary": False,
+         "permanent": False,
+     }
+
+     if storage in {"temporary", "both"}:
+         try:
+             col = _ephemeral_client.get_collection(name)
+             if where_filter:
+                 col.delete(where=where_filter)
+             else:
+                 _ephemeral_client.delete_collection(name)
+             deleted["temporary"] = True
+         except Exception:
+             pass
+
+     if storage in {"permanent", "both"}:
+         try:
+             col = _persistent_client.get_collection(name)
+             if where_filter:
+                 col.delete(where=where_filter)
+             else:
+                 _persistent_client.delete_collection(name)
+             deleted["permanent"] = True
+         except Exception:
+             pass
+
+     return {
+         "repo": repo_key,
+         "namespace": _normalize_namespace(namespace),
+         "storage": storage,
+         "deleted": deleted,
+         "deleted_any": any(deleted.values()),
+     }
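
Finally, an end-to-end sketch (not in the package) tying the storage layer together: index one PR into the in-memory store under a namespace, then query it. It reuses the placeholder pr dict from the document_builder sketch above; in the real server the PR list comes from a GitHub-fetching layer this diff does not show.

    from storage import index_prs, query_similar, get_collection_stats

    repo = "acme/widgets"  # hypothetical owner/name
    count = index_prs(repo, [pr], temporary=True, namespace="alice")

    hits = query_similar(
        repo, "error handling in the API client",
        n_results=5, temporary=True, namespace="alice",
    )
    for h in hits:
        print(h["similarity"], h["metadata"]["type"], h["text"][:80])

    print(get_collection_stats(repo, temporary=True, namespace="alice"))
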