contextl 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,201 @@
1
+ """
2
+ Repository Intelligence Engine — Universal MCP Server
3
+
4
+ Exposes the engine as an MCP (Model Context Protocol) server over stdio.
5
+ Works with any MCP-compatible IDE: Cursor, Windsurf, Claude Code, VS Code, etc.
6
+
7
+ The IDE launches this script automatically when it needs it.
8
+ No manual server to keep running.
9
+
10
+ Usage (the IDE does this for you, but you can test manually):
11
+ python mcp_server.py
12
+
13
+ Tools exposed:
14
+ query_repo — rank files by relevance to a natural-language query
15
+ scan_repo — list all source files the engine can see in a repo
16
+ """
17
+
18
+ import asyncio
19
+ import json
20
+ import sys
21
+ from pathlib import Path
22
+
23
+ import mcp.types as types
24
+ from mcp.server import Server
25
+ from mcp.server.stdio import stdio_server
26
+
27
+ # Make sure the engine modules are importable from the same directory
28
+ sys.path.insert(0, str(Path(__file__).parent))
29
+
30
+ from scanner import scan_repo
31
+ from import_parser import parse_imports
32
+ from graph_builder import build_graph
33
+ from query_engine import query as engine_query
34
+ from main import _confidence, _reasoning
35
+
36
+
37
+ # ---------------------------------------------------------------------------
38
+ # Server setup
39
+ # ---------------------------------------------------------------------------
40
# Module-level MCP server object; the tool listing and tool-call handlers in
# this file attach themselves to it through its decorator methods.
server = Server("repo-intelligence")
41
+
42
+
43
+ # ---------------------------------------------------------------------------
44
+ # Tool definitions
45
+ # ---------------------------------------------------------------------------
46
@server.list_tools()
async def list_tools() -> list[types.Tool]:
    """Describe the tools this server offers.

    Returns:
        Two tool descriptors, ``query_repo`` followed by ``scan_repo``, each
        with a human-readable description and a JSON Schema for its arguments.
    """

    def repo_path_property() -> dict:
        # Built fresh on every call so the two schemas never share one dict.
        return {
            "type": "string",
            "description": "Absolute or relative path to the repository root.",
        }

    query_description = (
        "Find the most relevant files in a code repository for a given "
        "change request or natural-language query. "
        "Returns a ranked list of files with relevance scores, confidence "
        "levels, matched terms, and a one-line reasoning string. "
        "Use this before editing code to identify exactly which files need "
        "to change — avoids reading irrelevant files."
    )
    query_schema = {
        "type": "object",
        "properties": {
            "repo_path": repo_path_property(),
            "query": {
                "type": "string",
                "description": (
                    "Natural-language description of the change you want to make. "
                    "Examples: 'fix the upload error handler', "
                    "'change the login page logo', 'add dark mode to footer'."
                ),
            },
            "top_n": {
                "type": "integer",
                "description": "Maximum number of files to return (default: 5, max: 20).",
                "default": 5,
                "minimum": 1,
                "maximum": 20,
            },
        },
        "required": ["repo_path", "query"],
    }

    scan_description = (
        "Scan a repository and list all source files the engine can see. "
        "Useful for understanding the repository structure before querying. "
        "Returns file paths, extensions, and sizes."
    )
    scan_schema = {
        "type": "object",
        "properties": {
            "repo_path": repo_path_property(),
        },
        "required": ["repo_path"],
    }

    query_tool = types.Tool(
        name="query_repo",
        description=query_description,
        inputSchema=query_schema,
    )
    scan_tool = types.Tool(
        name="scan_repo",
        description=scan_description,
        inputSchema=scan_schema,
    )
    return [query_tool, scan_tool]
104
+
105
+
106
+ # ---------------------------------------------------------------------------
107
+ # Tool handlers
108
+ # ---------------------------------------------------------------------------
109
@server.call_tool()
async def call_tool(name: str, arguments: dict) -> list[types.TextContent]:
    """Route a tool invocation to the handler registered for ``name``.

    Args:
        name: Tool name sent by the client.
        arguments: Tool arguments, passed through to the handler untouched.

    Returns:
        The handler's response, or a single text item holding a JSON
        ``{"error": ...}`` object when the tool name is not recognised.
    """
    handlers = {
        "query_repo": _handle_query,
        "scan_repo": _handle_scan,
    }

    handler = handlers.get(name)
    if handler is None:
        payload = json.dumps({"error": f"Unknown tool: {name}"})
        return [types.TextContent(type="text", text=payload)]

    return await handler(arguments)
122
+
123
+
124
async def _handle_query(args: dict) -> list[types.TextContent]:
    """Handle the ``query_repo`` tool: rank repository files for a query.

    Args:
        args: Tool arguments. Reads ``repo_path`` and ``query`` (both default
            to ``""``) and the optional ``top_n`` (default 5).

    Returns:
        A single text item containing the JSON-encoded result. Failures are
        reported inside that JSON under an ``"error"`` key, alongside the
        ``query``, ``repo`` and an empty ``results`` list, instead of being
        raised.
    """
    repo_path = args.get("repo_path", "")
    query_str = args.get("query", "")

    raw_top_n = args.get("top_n")
    if raw_top_n is None:
        raw_top_n = 5  # missing or explicit null falls back to the schema default
    try:
        top_n = int(raw_top_n)
    except (TypeError, ValueError, OverflowError):
        # Report bad input in the same shape as every other failure.
        result = {
            "error": f"Invalid top_n: {raw_top_n!r}",
            "query": query_str,
            "repo": repo_path,
            "results": [],
        }
        return [types.TextContent(type="text", text=json.dumps(result, indent=2))]

    # Enforce the advertised schema bounds (minimum 1, maximum 20). Without
    # the lower bound a negative value would slice results from the end.
    top_n = max(1, min(top_n, 20))

    # The engine is synchronous, so run it in the default executor to keep
    # the event loop free. Inside a coroutine the running loop is the one
    # to use.
    loop = asyncio.get_running_loop()
    try:
        result = await loop.run_in_executor(None, _run_query, repo_path, query_str, top_n)
    except Exception as e:
        result = {"error": str(e), "query": query_str, "repo": repo_path, "results": []}

    return [types.TextContent(type="text", text=json.dumps(result, indent=2))]
137
+
138
+
139
def _run_query(repo_path: str, query_str: str, top_n: int) -> dict:
    """Run scan -> import parse -> graph build -> query, synchronously.

    Intended to be called from a worker thread by the async handler.

    Args:
        repo_path: Repository root handed to the scanner.
        query_str: Natural-language query handed to the query engine.
        top_n: Maximum number of ranked files requested from the engine.

    Returns:
        A JSON-serialisable dict with the query, the scanned root, the number
        of files scanned and one entry per ranked file.
    """
    scan = scan_repo(repo_path)
    repo_graph = build_graph(scan, parse_imports(scan))
    hits = engine_query(query_str, repo_graph, top_n=top_n)

    entries = []
    for position, hit in enumerate(hits, start=1):
        entries.append(
            {
                "rank": position,
                "path": hit.path,
                "score": round(hit.total_score, 4),
                "confidence": _confidence(hit.total_score),
                "matched_terms": sorted(hit.matched_terms),
                "reasoning": _reasoning(hit, repo_graph),
            }
        )

    return {
        "query": query_str,
        "repo": scan.root,
        "total_files_scanned": scan.total_files,
        "results": entries,
    }
162
+
163
+
164
async def _handle_scan(args: dict) -> list[types.TextContent]:
    """Handle the ``scan_repo`` tool: list the source files in a repository.

    Args:
        args: Tool arguments. Reads ``repo_path`` (defaults to ``""``).

    Returns:
        A single text item containing the JSON-encoded scan result. If the
        scan raises, the JSON instead carries ``"error"`` and ``"repo"``.
    """
    repo_path = args.get("repo_path", "")

    # The scan is blocking filesystem work, so it runs in the default
    # executor. get_running_loop() is the recommended call inside a coroutine.
    loop = asyncio.get_running_loop()
    try:
        result = await loop.run_in_executor(None, _run_scan, repo_path)
    except Exception as e:
        result = {"error": str(e), "repo": repo_path}

    return [types.TextContent(type="text", text=json.dumps(result, indent=2))]
174
+
175
+
176
def _run_scan(repo_path: str) -> dict:
    """Scan ``repo_path`` and convert the result to a JSON-serialisable dict.

    Returns:
        A dict with the scanned root, the file count, and one
        ``{"path", "extension", "size_bytes"}`` entry per discovered file.
    """
    scan = scan_repo(repo_path)

    listing = []
    for entry in scan.files:
        listing.append(
            {
                "path": entry.path,
                "extension": entry.extension,
                "size_bytes": entry.size_bytes,
            }
        )

    return {"repo": scan.root, "total_files": scan.total_files, "files": listing}
186
+
187
+
188
+ # ---------------------------------------------------------------------------
189
+ # Entry point
190
+ # ---------------------------------------------------------------------------
191
async def main():
    """Run the MCP server over the stdio transport."""
    async with stdio_server() as streams:
        incoming, outgoing = streams
        options = server.create_initialization_options()
        await server.run(incoming, outgoing, options)


# Only start serving when executed as a script, not when imported.
if __name__ == "__main__":
    asyncio.run(main())
@@ -0,0 +1,252 @@
1
+ """
2
+ Repository Intelligence Engine
3
+ Step 4: Query Engine
4
+
5
+ Accepts a natural-language query and ranks files by relevance using:
6
+
7
+ 1. Keyword match score — does the filename / path contain query terms?
8
+ 2. Content match score — do the file's contents mention query terms?
9
+ 3. Graph neighbor bonus — if a high-scoring file is nearby in the graph,
10
+ its neighbors get a proximity boost
11
+ 4. Centrality weight — more connected files rank slightly higher when
12
+ scores are otherwise equal
13
+
14
+ No LLM. No embeddings. Pure text + graph.
15
+ """
16
+
17
+ import re
18
+ from dataclasses import dataclass, field
19
+
20
+ from scanner import scan_repo
21
+ from import_parser import parse_imports
22
+ from graph_builder import build_graph, RepoGraph
23
+
24
+
25
+ # ---------------------------------------------------------------------------
26
+ # Stop words — stripped from queries before matching
27
+ # ---------------------------------------------------------------------------
28
# Words ignored when tokenizing queries and paths: articles, prepositions,
# question words, generic edit verbs and pronouns. They carry no signal
# about which file a request refers to. ("the" was listed twice before.)
STOP_WORDS = {
    "a", "an", "the", "in", "on", "at", "to", "for", "of", "and",
    "or", "is", "it", "this", "that", "with", "from", "how", "what",
    "where", "when", "which", "who", "make", "change", "update", "fix",
    "add", "remove", "edit", "modify", "my", "our", "i", "we",
}
34
+
35
+
36
@dataclass
class RankedFile:
    """One ranked file together with the components of its score."""

    path: str
    total_score: float
    keyword_score: float   # contribution from path / filename matches
    content_score: float   # contribution from matches in the file's contents
    neighbor_bonus: float  # boost received from high-scoring graph neighbours
    centrality: float      # centrality value of the graph node (noted upstream as a PageRank weight)
    matched_terms: list[str] = field(default_factory=list)

    def explain(self) -> str:
        """Return a three-line, human-readable breakdown of this result."""
        if self.matched_terms:
            term_text = ", ".join(self.matched_terms)
        else:
            term_text = "none"

        pieces = [
            f" {self.path}\n",
            f" score={self.total_score:.4f} ",
            f"keyword={self.keyword_score:.3f} ",
            f"content={self.content_score:.3f} ",
            f"neighbor={self.neighbor_bonus:.3f} ",
            f"centrality={self.centrality:.4f}\n",
            f" matched terms: {term_text}",
        ]
        return "".join(pieces)
58
+
59
+
60
def _tokenize(text: str) -> list[str]:
    """Split ``text`` into lowercase alphanumeric tokens.

    Single-character tokens and anything listed in ``STOP_WORDS`` are dropped;
    the remaining tokens keep their original order (duplicates included).
    """
    kept = []
    for word in re.findall(r"[a-z0-9]+", text.lower()):
        if word in STOP_WORDS:
            continue
        if len(word) > 1:
            kept.append(word)
    return kept
64
+
65
+
66
def _path_tokens(path: str) -> list[str]:
    """Extract lowercase word tokens from a file path.

    The path is split on ``/``, ``\\`` and ``.``; each component is then split
    at camelCase boundaries, and finally on any non-alphanumeric character
    (which covers kebab-case and snake_case).

    Acronyms are kept together: ``HTMLParser`` yields ``html`` and ``parser``.
    Inserting a break before every capital letter would instead produce
    single letters that the length filter below throws away.

    Args:
        path: Relative or absolute file path, with either separator style.

    Returns:
        Tokens in path order, excluding stop words and one-character tokens.
    """
    tokens: list[str] = []
    for part in re.split(r"[/\\.]", path):
        # Break between lower/digit -> Upper (loginHeader) and between an
        # acronym and the capitalised word that follows it (HTMLParser).
        spaced = re.sub(
            r"(?<=[a-z0-9])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])",
            " ",
            part,
        )
        tokens.extend(re.findall(r"[a-z0-9]+", spaced.lower()))
    return [t for t in tokens if t not in STOP_WORDS and len(t) > 1]
76
+
77
+
78
def _keyword_score(query_terms: list[str], file_path: str) -> tuple[float, list[str]]:
    """Score how well the query terms match a file's path.

    Each query term contributes 1.0 when it appears in the filename, or 0.4
    when it appears only elsewhere in the path. The sum is divided by the
    number of query terms.

    Args:
        query_terms: Tokenized query.
        file_path: Path of the candidate file. Both ``/`` and ``\\`` are
            accepted as separators, because the scanner emits OS-native paths.

    Returns:
        ``(score, matched)`` where ``matched`` lists each matching term once,
        in first-seen order.
    """
    path_toks = set(_path_tokens(file_path))
    # Take the last component using either separator. Splitting on "/" alone
    # would treat a whole Windows path as the filename.
    filename = re.split(r"[/\\]", file_path)[-1]
    filename_toks = set(_path_tokens(filename))

    matched: list[str] = []
    score = 0.0

    for term in query_terms:
        if term in filename_toks:
            score += 1.0  # Strong signal: term in filename
        elif term in path_toks:
            score += 0.4  # Weaker signal: term in directory path
        else:
            continue
        if term not in matched:
            matched.append(term)

    # Normalize by number of query terms
    if query_terms:
        score = score / len(query_terms)

    return score, matched
103
+
104
+
105
+ def _content_score(query_terms: list[str], file_path: str) -> tuple[float, list[str]]:
106
+ """
107
+ Score how often query terms appear in the file's source code.
108
+ Uses term frequency, capped to avoid huge files dominating.
109
+ """
110
+ try:
111
+ content = open(file_path, encoding="utf-8").read().lower()
112
+ except Exception:
113
+ return 0.0, []
114
+
115
+ content_tokens = set(re.findall(r"[a-z0-9]+", content))
116
+ matched = [t for t in query_terms if t in content_tokens]
117
+
118
+ if not query_terms:
119
+ return 0.0, []
120
+
121
+ score = len(matched) / len(query_terms)
122
+ return score, matched
123
+
124
+
125
def _apply_neighbor_bonus(
    scores: dict[str, float],
    repo_graph: RepoGraph,
    boost: float = 0.15,
    depth: int = 1,
) -> dict[str, float]:
    """Give graph neighbours of well-scoring files a proximity boost.

    A single pass over the input scores: every file scoring at least 0.1 adds
    ``boost * (its score / highest score)`` to each of its neighbours that is
    present in ``scores``. Boosts are computed from the original scores, so
    they do not cascade.

    Args:
        scores: Base score per file path.
        repo_graph: Graph queried through ``get_neighbors(path, depth=...)``.
        boost: Largest bonus a single source can hand to a neighbour.
        depth: Neighbourhood depth forwarded to ``get_neighbors``.

    Returns:
        A new dict of boosted scores; the input itself when it is empty.
    """
    if not scores:
        return scores

    # Guard against dividing by zero when every base score is 0.
    ceiling = max(scores.values()) or 1.0
    result = dict(scores)

    for source, source_score in scores.items():
        if source_score < 0.1:
            # Too weak a signal to propagate.
            continue

        for nearby in repo_graph.get_neighbors(source, depth=depth):
            if nearby not in result:
                continue
            result[nearby] += boost * (source_score / ceiling)

    return result
153
+
154
+
155
def query(
    q: str,
    repo_graph: RepoGraph,
    top_n: int = 10,
) -> list[RankedFile]:
    """
    Rank all files in the repository by relevance to a natural-language query.

    Scoring runs in four passes: path-keyword and content scores per file, an
    equal-weight combination of the two, a neighbour bonus from the graph,
    and finally a small centrality term (``centrality * 0.05``).

    Args:
        q: Natural-language query string.
        repo_graph: Built graph from build_graph(). Must expose ``nodes``
            (path -> node with ``centrality``), ``root`` and ``get_neighbors``.
        top_n: Maximum number of results to return. Values <= 0 yield ``[]``.

    Returns:
        List of RankedFile sorted by total_score descending, restricted to
        files scoring above 0.01. Empty when the query has no usable terms.
    """
    from pathlib import Path  # local import: this block cannot assume a file-level one

    query_terms = _tokenize(q)

    # A negative top_n would otherwise slice results off the end of the list.
    if not query_terms or top_n <= 0:
        return []

    root = Path(repo_graph.root)

    # --- Pass 1: keyword + content scores per file ---
    keyword_scores: dict[str, float] = {}
    content_scores: dict[str, float] = {}
    matched_terms: dict[str, list[str]] = {}

    for path, node in repo_graph.nodes.items():
        kscore, kterms = _keyword_score(query_terms, path)

        # Read from the node's absolute path when it has one. The previous
        # code tested for `absolute_path` but then passed `node.path`.
        # Otherwise resolve the graph key against the repo root.
        # NOTE(review): assumes node.absolute_path, when present, is a
        # filesystem path string; confirm against graph_builder.
        disk_path = getattr(node, "absolute_path", None) or str(root / path)
        cscore, cterms = _content_score(query_terms, disk_path)

        keyword_scores[path] = kscore
        content_scores[path] = cscore
        # Sorted so the term order is stable across runs.
        matched_terms[path] = sorted(set(kterms) | set(cterms))

    # --- Pass 2: combine into base score ---
    base_scores: dict[str, float] = {
        path: keyword_scores[path] * 0.5 + content_scores[path] * 0.5
        for path in repo_graph.nodes
    }

    # --- Pass 3: neighbor bonus ---
    boosted_scores = _apply_neighbor_bonus(base_scores, repo_graph)

    # --- Pass 4: centrality tiebreaker ---
    results = []
    for path, node in repo_graph.nodes.items():
        neighbor_bonus = boosted_scores[path] - base_scores[path]
        total = boosted_scores[path] + node.centrality * 0.05

        results.append(RankedFile(
            path=path,
            total_score=total,
            keyword_score=keyword_scores[path],
            content_score=content_scores[path],
            neighbor_bonus=neighbor_bonus,
            centrality=node.centrality,
            matched_terms=matched_terms[path],
        ))

    # Sort by total score descending
    results.sort(key=lambda r: r.total_score, reverse=True)

    # Return only files with non-zero score, up to top_n
    return [r for r in results if r.total_score > 0.01][:top_n]
222
+
223
+
224
# Command-line entry point: python query_engine.py <repo_path> '<query>'
if __name__ == "__main__":
    import sys

    cli_args = sys.argv[1:]
    if len(cli_args) < 2:
        print("Usage: python query_engine.py <repo_path> '<query>'")
        print("Example: python query_engine.py ../Lazy-Footers 'change the download button'")
        sys.exit(1)

    target, user_query = cli_args[0], cli_args[1]

    print(f"Query: '{user_query}'")
    print(f"Repo: {target}")
    print()

    # Same pipeline the server uses: scan, parse imports, build graph, query.
    scan = scan_repo(target)
    parse = parse_imports(scan)
    repo_graph = build_graph(scan, parse)

    ranked = query(user_query, repo_graph)

    if ranked:
        print(f"Top {len(ranked)} relevant files:\n")
        for position, result in enumerate(ranked, 1):
            print(f"{position}.")
            print(result.explain())
            print()
    else:
        print("No relevant files found.")
@@ -0,0 +1,125 @@
1
+ """
2
+ Repository Intelligence Engine
3
+ Step 1: Repository Scanner
4
+
5
+ Walks a repository and discovers all relevant source files.
6
+ Filters by supported extensions for the MVP (Next.js / React / TypeScript).
7
+ """
8
+
9
+ import os
10
+ from pathlib import Path
11
+ from dataclasses import dataclass, field
12
+
13
+
14
# File suffixes (lowercase, with leading dot) that the scanner reports.
SUPPORTED_EXTENSIONS = {
    ".js",
    ".jsx",
    ".ts",
    ".tsx",
}

# Directory names pruned from the walk wherever they appear in the tree.
IGNORED_DIRS = {
    # version control / dependencies
    ".git",
    "node_modules",
    # build output
    ".next",
    "build",
    "dist",
    # caches and tooling artefacts
    ".cache",
    ".turbo",
    "__pycache__",
    "coverage",
}
27
+
28
+
29
@dataclass
class ScannedFile:
    """A single source file found while walking the repository.

    Attributes:
        path: Location relative to the repository root.
        absolute_path: Full location on disk.
        extension: File suffix.
        size_bytes: Size of the file in bytes.
    """

    path: str
    absolute_path: str
    extension: str
    size_bytes: int
36
+
37
+
38
@dataclass
class ScanResult:
    """Everything discovered by one repository scan.

    Attributes:
        root: Repository root the scan started from.
        files: Discovered files; starts empty and is filled by the scanner.
    """

    root: str
    files: list[ScannedFile] = field(default_factory=list)

    @property
    def total_files(self) -> int:
        """Number of files discovered."""
        return len(self.files)

    def summary(self) -> str:
        """Return a multi-line report: root, file count, and counts per extension."""
        tally: dict[str, int] = {}
        for entry in self.files:
            tally.setdefault(entry.extension, 0)
            tally[entry.extension] += 1

        report = [
            f"Repository: {self.root}",
            f"Total files: {self.total_files}",
            "",
            "By extension:",
        ]
        # Alphabetical by extension so the report is stable between runs.
        for ext in sorted(tally):
            count = tally[ext]
            report.append(f" {ext:8s} {count} file{'s' if count != 1 else ''}")

        return "\n".join(report)
63
+
64
+
65
def scan_repo(repo_path: str) -> ScanResult:
    """
    Walk the repository at repo_path and return a ScanResult
    containing all discovered source files.

    Directories named in IGNORED_DIRS are pruned from the walk, and only
    files whose lowercased suffix is in SUPPORTED_EXTENSIONS are reported.
    Files that cannot be stat'ed (for example a broken symlink, or a file
    removed while the walk is running) are skipped rather than aborting the
    scan.

    Args:
        repo_path: Path to the repository root (absolute or relative).

    Returns:
        ScanResult with all discovered files, sorted by relative path.

    Raises:
        ValueError: If repo_path does not exist or is not a directory.
    """
    root = Path(repo_path).resolve()

    if not root.exists():
        raise ValueError(f"Path does not exist: {root}")
    if not root.is_dir():
        raise ValueError(f"Path is not a directory: {root}")

    result = ScanResult(root=str(root))

    for dirpath, dirnames, filenames in os.walk(root):
        # Prune ignored directories in-place so os.walk skips them
        dirnames[:] = [d for d in dirnames if d not in IGNORED_DIRS]

        for filename in filenames:
            filepath = Path(dirpath) / filename
            ext = filepath.suffix.lower()

            if ext not in SUPPORTED_EXTENSIONS:
                continue

            try:
                size_bytes = filepath.stat().st_size
            except OSError:
                # os.walk lists dangling symlinks as files and stat() follows
                # them. One unreadable entry must not fail the whole scan.
                continue

            result.files.append(
                ScannedFile(
                    path=str(filepath.relative_to(root)),
                    absolute_path=str(filepath),
                    extension=ext,
                    size_bytes=size_bytes,
                )
            )

    # Sort by path for deterministic output
    result.files.sort(key=lambda f: f.path)

    return result
113
+
114
+
115
# Command-line entry point: python scanner.py [repo_path]  (defaults to ".")
if __name__ == "__main__":
    import sys

    if len(sys.argv) > 1:
        target = sys.argv[1]
    else:
        target = "."

    result = scan_repo(target)

    print(result.summary())
    print()
    print("Discovered files:")
    for entry in result.files:
        print(f" {entry.path} ({entry.size_bytes} bytes)")