contextl 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +132 -0
- package/bin/contextl.js +182 -0
- package/package.json +40 -0
- package/python/graph_builder.py +171 -0
- package/python/import_parser.py +271 -0
- package/python/main.py +237 -0
- package/python/mcp_server.py +201 -0
- package/python/query_engine.py +252 -0
- package/python/scanner.py +125 -0
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Repository Intelligence Engine — Universal MCP Server
|
|
3
|
+
|
|
4
|
+
Exposes the engine as an MCP (Model Context Protocol) server over stdio.
|
|
5
|
+
Works with any MCP-compatible IDE: Cursor, Windsurf, Claude Code, VS Code, etc.
|
|
6
|
+
|
|
7
|
+
The IDE launches this script automatically when it needs it.
|
|
8
|
+
No manual server to keep running.
|
|
9
|
+
|
|
10
|
+
Usage (the IDE does this for you, but you can test manually):
|
|
11
|
+
python mcp_server.py
|
|
12
|
+
|
|
13
|
+
Tools exposed:
|
|
14
|
+
query_repo — rank files by relevance to a natural-language query
|
|
15
|
+
scan_repo — list all source files the engine can see in a repo
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
import asyncio
|
|
19
|
+
import json
|
|
20
|
+
import sys
|
|
21
|
+
from pathlib import Path
|
|
22
|
+
|
|
23
|
+
import mcp.types as types
|
|
24
|
+
from mcp.server import Server
|
|
25
|
+
from mcp.server.stdio import stdio_server
|
|
26
|
+
|
|
27
|
+
# Make sure the engine modules are importable from the same directory
|
|
28
|
+
sys.path.insert(0, str(Path(__file__).parent))
|
|
29
|
+
|
|
30
|
+
from scanner import scan_repo
|
|
31
|
+
from import_parser import parse_imports
|
|
32
|
+
from graph_builder import build_graph
|
|
33
|
+
from query_engine import query as engine_query
|
|
34
|
+
from main import _confidence, _reasoning
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
# ---------------------------------------------------------------------------
|
|
38
|
+
# Server setup
|
|
39
|
+
# ---------------------------------------------------------------------------
|
|
40
|
+
server = Server("repo-intelligence")
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
# ---------------------------------------------------------------------------
|
|
44
|
+
# Tool definitions
|
|
45
|
+
# ---------------------------------------------------------------------------
|
|
46
|
+
@server.list_tools()
async def list_tools() -> list[types.Tool]:
    """Describe the tools this server offers.

    Returns:
        Two tool descriptors, ``query_repo`` followed by ``scan_repo``, each
        with a JSON-Schema description of the arguments it accepts.
    """

    def repo_path_property() -> dict:
        # Both tools take the same ``repo_path`` argument; build a fresh dict
        # per schema so the two descriptors never share a mutable object.
        return {
            "type": "string",
            "description": "Absolute or relative path to the repository root.",
        }

    query_schema = {
        "type": "object",
        "properties": {
            "repo_path": repo_path_property(),
            "query": {
                "type": "string",
                "description": (
                    "Natural-language description of the change you want to make. "
                    "Examples: 'fix the upload error handler', "
                    "'change the login page logo', 'add dark mode to footer'."
                ),
            },
            "top_n": {
                "type": "integer",
                "description": "Maximum number of files to return (default: 5, max: 20).",
                "default": 5,
                "minimum": 1,
                "maximum": 20,
            },
        },
        "required": ["repo_path", "query"],
    }
    query_tool = types.Tool(
        name="query_repo",
        description=(
            "Find the most relevant files in a code repository for a given "
            "change request or natural-language query. "
            "Returns a ranked list of files with relevance scores, confidence "
            "levels, matched terms, and a one-line reasoning string. "
            "Use this before editing code to identify exactly which files need "
            "to change — avoids reading irrelevant files."
        ),
        inputSchema=query_schema,
    )

    scan_schema = {
        "type": "object",
        "properties": {"repo_path": repo_path_property()},
        "required": ["repo_path"],
    }
    scan_tool = types.Tool(
        name="scan_repo",
        description=(
            "Scan a repository and list all source files the engine can see. "
            "Useful for understanding the repository structure before querying. "
            "Returns file paths, extensions, and sizes."
        ),
        inputSchema=scan_schema,
    )

    return [query_tool, scan_tool]
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
# ---------------------------------------------------------------------------
|
|
107
|
+
# Tool handlers
|
|
108
|
+
# ---------------------------------------------------------------------------
|
|
109
|
+
@server.call_tool()
async def call_tool(name: str, arguments: dict) -> list[types.TextContent]:
    """Route a tool invocation to the handler registered for ``name``.

    Args:
        name: Tool name sent by the client.
        arguments: Raw argument mapping for that tool.

    Returns:
        The handler's response, or a single text item holding a JSON object
        with an ``error`` key when the tool name is not recognised.
    """
    if name == "query_repo":
        handler = _handle_query
    elif name == "scan_repo":
        handler = _handle_scan
    else:
        # Unknown names are reported in-band rather than raised.
        message = json.dumps({"error": f"Unknown tool: {name}"})
        return [types.TextContent(type="text", text=message)]

    return await handler(arguments)
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
async def _handle_query(args: dict) -> list[types.TextContent]:
    """Handle a ``query_repo`` call and return the ranking as JSON text.

    Args:
        args: Tool arguments. ``repo_path`` and ``query`` fall back to ``""``
            when absent; ``top_n`` falls back to 5 when absent or ``None``.

    Returns:
        A single text item whose body is the JSON-encoded result of
        ``_run_query``. If anything fails (including a ``top_n`` that cannot
        be converted to an integer) the body is a JSON object with an
        ``error`` message and an empty ``results`` list instead.
    """
    repo_path = args.get("repo_path", "")
    query_str = args.get("query", "")

    try:
        raw_top_n = args.get("top_n")
        if raw_top_n is None:
            raw_top_n = 5
        # Clamp into the 1..20 range advertised by the tool schema; a zero or
        # negative value would otherwise reach the result slice downstream.
        top_n = max(1, min(int(raw_top_n), 20))

        # The engine is synchronous, so run it on the default executor to
        # keep the event loop responsive.
        loop = asyncio.get_running_loop()
        result = await loop.run_in_executor(None, _run_query, repo_path, query_str, top_n)
    except Exception as e:
        # Tool boundary: report every failure in-band as JSON.
        result = {"error": str(e), "query": query_str, "repo": repo_path, "results": []}

    return [types.TextContent(type="text", text=json.dumps(result, indent=2))]
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def _run_query(repo_path: str, query_str: str, top_n: int) -> dict:
    """Run scan, import parsing, graph build and ranking, then package the hits.

    Blocking; the async handler calls this through a thread executor.

    Returns:
        A JSON-serialisable dict with the query, the scanned root, the number
        of files scanned, and one entry per ranked file.
    """
    scan = scan_repo(repo_path)
    repo_graph = build_graph(scan, parse_imports(scan))
    ranked = engine_query(query_str, repo_graph, top_n=top_n)

    response = {
        "query": query_str,
        "repo": scan.root,
        "total_files_scanned": scan.total_files,
        "results": [],
    }
    for position, hit in enumerate(ranked, start=1):
        response["results"].append({
            "rank": position,
            "path": hit.path,
            "score": round(hit.total_score, 4),
            "confidence": _confidence(hit.total_score),
            "matched_terms": sorted(hit.matched_terms),
            "reasoning": _reasoning(hit, repo_graph),
        })
    return response
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
async def _handle_scan(args: dict) -> list[types.TextContent]:
    """Handle a ``scan_repo`` call and return the file listing as JSON text.

    Args:
        args: Tool arguments; ``repo_path`` falls back to ``""`` when absent.

    Returns:
        A single text item whose body is the JSON-encoded result of
        ``_run_scan``, or a JSON object with an ``error`` message if the scan
        raises.
    """
    repo_path = args.get("repo_path", "")

    try:
        # The scan walks the filesystem synchronously, so run it on the
        # default executor. get_running_loop() is the preferred way to obtain
        # the loop from inside a coroutine.
        loop = asyncio.get_running_loop()
        result = await loop.run_in_executor(None, _run_scan, repo_path)
    except Exception as e:
        # Tool boundary: report every failure in-band as JSON.
        result = {"error": str(e), "repo": repo_path}

    return [types.TextContent(type="text", text=json.dumps(result, indent=2))]
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def _run_scan(repo_path: str) -> dict:
    """Scan ``repo_path`` and describe every discovered file.

    Returns:
        A JSON-serialisable dict with the scanned root, the file count, and
        a ``files`` list of path / extension / size records.
    """
    scan = scan_repo(repo_path)

    listing = {
        "repo": scan.root,
        "total_files": scan.total_files,
        "files": [],
    }
    for entry in scan.files:
        listing["files"].append({
            "path": entry.path,
            "extension": entry.extension,
            "size_bytes": entry.size_bytes,
        })
    return listing
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
# ---------------------------------------------------------------------------
|
|
189
|
+
# Entry point
|
|
190
|
+
# ---------------------------------------------------------------------------
|
|
191
|
+
async def main():
    """Run the MCP server over the stdio transport."""
    async with stdio_server() as streams:
        read_stream, write_stream = streams
        init_options = server.create_initialization_options()
        await server.run(read_stream, write_stream, init_options)


if __name__ == "__main__":
    # Start serving only when executed as a script, not on import.
    asyncio.run(main())
|
|
@@ -0,0 +1,252 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Repository Intelligence Engine
|
|
3
|
+
Step 4: Query Engine
|
|
4
|
+
|
|
5
|
+
Accepts a natural-language query and ranks files by relevance using:
|
|
6
|
+
|
|
7
|
+
1. Keyword match score — does the filename / path contain query terms?
|
|
8
|
+
2. Content match score — do the file's contents mention query terms?
|
|
9
|
+
3. Graph neighbor bonus — if a high-scoring file is nearby in the graph,
|
|
10
|
+
its neighbors get a proximity boost
|
|
11
|
+
4. Centrality weight — more connected files rank slightly higher when
|
|
12
|
+
scores are otherwise equal
|
|
13
|
+
|
|
14
|
+
No LLM. No embeddings. Pure text + graph.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import re
|
|
18
|
+
from dataclasses import dataclass, field
|
|
19
|
+
|
|
20
|
+
from scanner import scan_repo
|
|
21
|
+
from import_parser import parse_imports
|
|
22
|
+
from graph_builder import build_graph, RepoGraph
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# ---------------------------------------------------------------------------
|
|
26
|
+
# Stop words — stripped from queries before matching
|
|
27
|
+
# ---------------------------------------------------------------------------
|
|
28
|
+
# Words dropped from queries and path tokens before matching: common English
# function words, generic change verbs, and first-person pronouns.
STOP_WORDS = {
    # articles, prepositions, conjunctions, question words
    "a", "an", "and", "at", "for", "from", "how", "in", "is", "it", "of",
    "on", "or", "that", "the", "this", "to", "what", "when", "where",
    "which", "who", "with",
    # generic verbs that describe the request rather than the code
    "add", "change", "edit", "fix", "make", "modify", "remove", "update",
    # pronouns
    "i", "my", "our", "we",
}
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@dataclass
class RankedFile:
    """One ranked result: a file path, its final score, and the score parts."""

    path: str
    total_score: float
    keyword_score: float  # contribution from path / filename matches
    content_score: float  # contribution from matches inside the file
    neighbor_bonus: float  # boost received from nearby high-scoring files
    centrality: float  # graph centrality weight (PageRank)
    matched_terms: list[str] = field(default_factory=list)

    def explain(self) -> str:
        """Return a three-line, human-readable breakdown of this result."""
        if self.matched_terms:
            terms = ", ".join(self.matched_terms)
        else:
            terms = "none"

        breakdown = " ".join([
            f"score={self.total_score:.4f}",
            f"keyword={self.keyword_score:.3f}",
            f"content={self.content_score:.3f}",
            f"neighbor={self.neighbor_bonus:.3f}",
            f"centrality={self.centrality:.4f}",
        ])
        return "\n".join([
            f" {self.path}",
            f" {breakdown}",
            f" matched terms: {terms}",
        ])
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _tokenize(text: str) -> list[str]:
    """Split ``text`` into lowercase alphanumeric words, minus noise.

    Stop words and single-character tokens are discarded; order and
    duplicates are preserved.
    """
    kept = []
    for word in re.findall(r"[a-z0-9]+", text.lower()):
        if len(word) <= 1 or word in STOP_WORDS:
            continue
        kept.append(word)
    return kept
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _path_tokens(path: str) -> list[str]:
    """Break a file path into lowercase word tokens.

    The path is cut at slashes, backslashes and dots. Within each piece a
    space is inserted before every capital letter (``LoginHeader`` becomes
    ``login``, ``header``), and any other non-alphanumeric character such as
    ``-`` or ``_`` also ends a token. Stop words and single-character tokens
    are dropped.
    """
    words = []
    for segment in re.split(r"[/\\.]", path):
        spaced = re.sub(r"([A-Z])", r" \1", segment)
        words += re.findall(r"[a-z0-9]+", spaced.lower())
    return [w for w in words if len(w) > 1 and w not in STOP_WORDS]
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def _keyword_score(query_terms: list[str], file_path: str) -> tuple[float, list[str]]:
    """Score how well the query terms match a file's path.

    A term found in the filename adds 1.0; a term found only in a directory
    component adds 0.4. The sum is divided by the number of query terms.

    Args:
        query_terms: Tokenised query terms.
        file_path: Path of the file, using ``/`` or ``\\`` separators.

    Returns:
        ``(score, matched)`` where ``matched`` lists each matching term once,
        in query order.
    """
    path_toks = set(_path_tokens(file_path))
    # Take the last component on either separator. Splitting on "/" alone
    # treats a whole backslash-separated path as the filename, which promotes
    # directory matches to filename weight.
    filename = re.split(r"[/\\]", file_path)[-1]
    filename_toks = set(_path_tokens(filename))

    matched = []
    score = 0.0

    for term in query_terms:
        if term in filename_toks:
            score += 1.0  # strong signal: term in filename
        elif term in path_toks:
            score += 0.4  # weaker signal: term in a directory name
        else:
            continue
        if term not in matched:
            matched.append(term)

    # Normalise by the number of query terms.
    if query_terms:
        score /= len(query_terms)

    return score, matched
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def _content_score(query_terms: list[str], file_path: str) -> tuple[float, list[str]]:
|
|
106
|
+
"""
|
|
107
|
+
Score how often query terms appear in the file's source code.
|
|
108
|
+
Uses term frequency, capped to avoid huge files dominating.
|
|
109
|
+
"""
|
|
110
|
+
try:
|
|
111
|
+
content = open(file_path, encoding="utf-8").read().lower()
|
|
112
|
+
except Exception:
|
|
113
|
+
return 0.0, []
|
|
114
|
+
|
|
115
|
+
content_tokens = set(re.findall(r"[a-z0-9]+", content))
|
|
116
|
+
matched = [t for t in query_terms if t in content_tokens]
|
|
117
|
+
|
|
118
|
+
if not query_terms:
|
|
119
|
+
return 0.0, []
|
|
120
|
+
|
|
121
|
+
score = len(matched) / len(query_terms)
|
|
122
|
+
return score, matched
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def _apply_neighbor_bonus(
    scores: dict[str, float],
    repo_graph: RepoGraph,
    boost: float = 0.15,
    depth: int = 1,
) -> dict[str, float]:
    """Give graph neighbours of well-scoring files a proximity boost.

    Single pass: every file whose own score is at least 0.1 adds
    ``boost * (score / highest_score)`` to each of its neighbours (as
    returned by ``repo_graph.get_neighbors``) that has an entry in ``scores``.

    Returns:
        A new dict of boosted scores, or ``scores`` itself when it is empty.
    """
    if not scores:
        return scores

    # Fall back to 1.0 so the ratio below never divides by zero.
    highest = max(scores.values()) or 1.0
    result = dict(scores)

    for source, source_score in scores.items():
        if not source_score < 0.1:
            # The boost is proportional to the strength of the source signal.
            share = boost * (source_score / highest)
            for nearby in repo_graph.get_neighbors(source, depth=depth):
                if nearby in result:
                    result[nearby] += share

    return result
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def query(
    q: str,
    repo_graph: RepoGraph,
    top_n: int = 10,
) -> list[RankedFile]:
    """Rank all files in the repository by relevance to a natural-language query.

    Each file's base score is the even blend of its path-keyword score and
    its content score. Neighbours of well-scoring files then receive a
    proximity bonus, and a small centrality term is added last.

    Args:
        q: Natural-language query string.
        repo_graph: Built graph from build_graph().
        top_n: Maximum number of results to return; values below zero are
            treated as zero.

    Returns:
        List of RankedFile sorted by total_score descending, limited to files
        scoring above 0.01. Empty when the query has no usable terms.
    """
    from pathlib import Path  # local import: only needed to locate files on disk

    query_terms = _tokenize(q)
    if not query_terms:
        return []

    # --- Pass 1: keyword + content scores per file ---
    keyword_scores: dict[str, float] = {}
    content_scores: dict[str, float] = {}
    matched_terms: dict[str, list[str]] = {}

    for path, node in repo_graph.nodes.items():
        kscore, kterms = _keyword_score(query_terms, path)

        # Read the file from the node's recorded on-disk location when it has
        # one; otherwise resolve the graph key against the repository root.
        disk_path = getattr(node, "absolute_path", None) or str(Path(repo_graph.root) / path)
        cscore, cterms = _content_score(query_terms, disk_path)

        keyword_scores[path] = kscore
        content_scores[path] = cscore
        # Sorted so the reported terms are stable from run to run.
        matched_terms[path] = sorted(set(kterms + cterms))

    # --- Pass 2: combine into base score ---
    base_scores: dict[str, float] = {
        path: keyword_scores[path] * 0.5 + content_scores[path] * 0.5
        for path in repo_graph.nodes
    }

    # --- Pass 3: neighbor bonus ---
    boosted_scores = _apply_neighbor_bonus(base_scores, repo_graph)

    # --- Pass 4: centrality tiebreaker ---
    results = []
    for path, node in repo_graph.nodes.items():
        results.append(RankedFile(
            path=path,
            total_score=boosted_scores[path] + node.centrality * 0.05,
            keyword_score=keyword_scores[path],
            content_score=content_scores[path],
            neighbor_bonus=boosted_scores[path] - base_scores[path],
            centrality=node.centrality,
            matched_terms=matched_terms[path],
        ))

    results.sort(key=lambda r: r.total_score, reverse=True)

    # Keep only files with a meaningful score. The clamp stops a negative
    # top_n from slicing entries off the end of the list instead.
    relevant = [r for r in results if r.total_score > 0.01]
    return relevant[:max(top_n, 0)]
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
if __name__ == "__main__":
    # Command-line entry point: rank files in <repo_path> for '<query>'.
    import sys

    if len(sys.argv) < 3:
        print("Usage: python query_engine.py <repo_path> '<query>'")
        print("Example: python query_engine.py ../Lazy-Footers 'change the download button'")
        sys.exit(1)

    target, user_query = sys.argv[1], sys.argv[2]

    print(f"Query: '{user_query}'")
    print(f"Repo: {target}")
    print()

    # Full pipeline: scan -> parse imports -> build graph -> rank.
    scan = scan_repo(target)
    parse = parse_imports(scan)
    repo_graph = build_graph(scan, parse)

    ranked = query(user_query, repo_graph)

    if ranked:
        print(f"Top {len(ranked)} relevant files:\n")
        for i, result in enumerate(ranked, 1):
            # Numbered heading, the breakdown, then a blank separator line.
            print(f"{i}.\n{result.explain()}\n")
    else:
        print("No relevant files found.")
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Repository Intelligence Engine
|
|
3
|
+
Step 1: Repository Scanner
|
|
4
|
+
|
|
5
|
+
Walks a repository and discovers all relevant source files.
|
|
6
|
+
Filters by supported extensions for the MVP (Next.js / React / TypeScript).
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import os
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from dataclasses import dataclass, field
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
# File suffixes the scanner reports; anything else is skipped.
SUPPORTED_EXTENSIONS = {".js", ".jsx", ".ts", ".tsx"}

# Directory names that are never descended into (dependencies, VCS data,
# build output and caches).
IGNORED_DIRS = {
    ".cache",
    ".git",
    ".next",
    ".turbo",
    "__pycache__",
    "build",
    "coverage",
    "dist",
    "node_modules",
}
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass
class ScannedFile:
    """A source file found by the scanner, with its location and size."""

    # Location relative to the scanned repository root.
    path: str
    # Full location of the file on disk.
    absolute_path: str
    # File name suffix, e.g. ".ts".
    extension: str
    # Size of the file in bytes.
    size_bytes: int
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@dataclass
class ScanResult:
    """Outcome of one repository scan: the root walked and the files found."""

    root: str
    files: list[ScannedFile] = field(default_factory=list)

    @property
    def total_files(self) -> int:
        """Number of files discovered."""
        return len(self.files)

    def summary(self) -> str:
        """Return a multi-line report: root, file count, per-extension tally."""
        tally: dict[str, int] = {}
        for entry in self.files:
            tally.setdefault(entry.extension, 0)
            tally[entry.extension] += 1

        header = [
            f"Repository: {self.root}",
            f"Total files: {self.total_files}",
            "",
            "By extension:",
        ]
        # One row per extension, in alphabetical order.
        rows = [
            f" {ext:8s} {count} file{'s' if count != 1 else ''}"
            for ext, count in sorted(tally.items())
        ]
        return "\n".join(header + rows)
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def scan_repo(repo_path: str) -> ScanResult:
    """
    Walk the repository at repo_path and return a ScanResult
    containing all discovered source files.

    Directories named in IGNORED_DIRS are not descended into, and only files
    whose lowercased suffix is in SUPPORTED_EXTENSIONS are reported. Files
    that cannot be stat'ed are skipped rather than aborting the scan.

    Args:
        repo_path: Path to the repository root (absolute or relative).

    Returns:
        ScanResult with all discovered files, sorted by relative path.

    Raises:
        ValueError: If repo_path does not exist or is not a directory.
    """
    root = Path(repo_path).resolve()

    if not root.exists():
        raise ValueError(f"Path does not exist: {root}")
    if not root.is_dir():
        raise ValueError(f"Path is not a directory: {root}")

    result = ScanResult(root=str(root))

    for dirpath, dirnames, filenames in os.walk(root):
        # Prune ignored directories in-place so os.walk skips them.
        dirnames[:] = [d for d in dirnames if d not in IGNORED_DIRS]

        for filename in filenames:
            filepath = Path(dirpath) / filename
            ext = filepath.suffix.lower()

            if ext not in SUPPORTED_EXTENSIONS:
                continue

            try:
                size_bytes = filepath.stat().st_size
            except OSError:
                # Broken symlink, or a file removed or made unreadable after
                # os.walk listed it: leave it out instead of failing the scan.
                continue

            result.files.append(
                ScannedFile(
                    path=str(filepath.relative_to(root)),
                    absolute_path=str(filepath),
                    extension=ext,
                    size_bytes=size_bytes,
                )
            )

    # Sort by path for deterministic output.
    result.files.sort(key=lambda f: f.path)

    return result
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
if __name__ == "__main__":
    # Command-line entry point: scan the given directory (default: the
    # current one) and print a summary followed by every discovered file.
    import sys

    target = "." if len(sys.argv) <= 1 else sys.argv[1]
    result = scan_repo(target)

    print(result.summary(), "", "Discovered files:", sep="\n")
    for f in result.files:
        print(f" {f.path} ({f.size_bytes} bytes)")
|