repomap-cli 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- repomap/__init__.py +320 -0
- repomap/ai.py +1108 -0
- repomap/check.py +1212 -0
- repomap/cli/__init__.py +3 -0
- repomap/cli/__main__.py +12 -0
- repomap/cli/cli.py +2475 -0
- repomap/core.py +730 -0
- repomap/lsp.py +753 -0
- repomap/parser.py +1697 -0
- repomap/ranking.py +639 -0
- repomap/resolver.py +906 -0
- repomap/toolkit.py +850 -0
- repomap/topic.py +600 -0
- repomap_cli-1.0.0.dist-info/METADATA +284 -0
- repomap_cli-1.0.0.dist-info/RECORD +18 -0
- repomap_cli-1.0.0.dist-info/WHEEL +4 -0
- repomap_cli-1.0.0.dist-info/entry_points.txt +2 -0
- repomap_cli-1.0.0.dist-info/licenses/LICENSE +21 -0
repomap/core.py
ADDED
|
@@ -0,0 +1,730 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Repo Map Core — Tree-sitter Analysis Engine (Coordinator Layer)
|
|
4
|
+
================================================================
|
|
5
|
+
给 RepoMap CLI 提供扫描、解析、图构建和 AI overview 能力。
|
|
6
|
+
|
|
7
|
+
目标:AI 在逐文件阅读代码之前,先通过这个工具建立
|
|
8
|
+
"项目地图"——了解业务模块划分、核心函数调用关系、
|
|
9
|
+
高密度文件分布、入口点等,从而更高效地定位和理解代码。
|
|
10
|
+
|
|
11
|
+
安装 & 运行(CLI 模式):
|
|
12
|
+
uv run python -m repomap_cli overview --project /path/to/your/project
|
|
13
|
+
|
|
14
|
+
本地调试(直接打印 repo map):
|
|
15
|
+
python -m repomap_cli overview --project /path/to/your/project
|
|
16
|
+
python -m repomap_cli call-chain --project /path/to/your/project --symbol MyClassName
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
import logging
|
|
22
|
+
import os
|
|
23
|
+
import subprocess
|
|
24
|
+
import sys
|
|
25
|
+
from pathlib import Path, PurePosixPath
|
|
26
|
+
from typing import Any
|
|
27
|
+
|
|
28
|
+
from .ai import (
|
|
29
|
+
render_call_chain_report,
|
|
30
|
+
render_file_detail_report,
|
|
31
|
+
render_overview_report,
|
|
32
|
+
)
|
|
33
|
+
from .parser import EXT_TO_LANG, TreeSitterAdapter
|
|
34
|
+
from .ranking import EdgeBuilder, GraphAnalyzer
|
|
35
|
+
from .resolver import ImportResolver
|
|
36
|
+
from . import (
|
|
37
|
+
RepoGraph,
|
|
38
|
+
ScanStats,
|
|
39
|
+
Symbol,
|
|
40
|
+
get_incremental_cache_path,
|
|
41
|
+
serialize_symbol,
|
|
42
|
+
)
|
|
43
|
+
|
|
44
|
+
# ── Logging: always write to stderr, never pollute CLI stdout ───────────────
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    stream=sys.stderr,
)
logger = logging.getLogger("repomap")

# Default per-file size ceiling (512 KiB); overridable via REPOMAP_MAX_FILE_BYTES
# (see RepoMapEngine._read_max_file_bytes).
DEFAULT_MAX_FILE_BYTES = 512 * 1024

# Directory names excluded from scanning: VCS metadata, tool caches,
# virtualenvs, build output, and package-manager stores.
SKIP_DIR_NAMES = {
    ".cache",
    ".git",
    ".hg",
    ".idea",
    ".mypy_cache",
    ".next",
    ".nox",
    ".nuxt",
    ".parcel-cache",
    ".pnpm-store",
    ".pytest_cache",
    ".ruff_cache",
    ".svelte-kit",
    ".tox",
    ".turbo",
    ".venv",
    ".vscode",
    ".yarn",
    "__pypackages__",
    "__pycache__",
    "build",
    "coverage",
    "dist",
    "env",
    "ENV",
    "node_modules",
    "site-packages",
    "target",
    "venv",
    # Third-party / vendored library directories
    "monaco-editor",
    "monaco",
    "vendor",
    "third_party",
    "third-party",
    "libs",
    "external",
}

# Lockfiles: machine-generated, huge, and useless for the symbol graph.
SKIP_FILE_NAMES = {
    "package-lock.json",
    "npm-shrinkwrap.json",
    "bun.lock",
    "bun.lockb",
    "yarn.lock",
    "pnpm-lock.yaml",
    "Cargo.lock",
}

# Docs, manifests, and tool configs surfaced by RepoMapEngine.supporting_files()
# as "worth reading first" even though they hold no code symbols.
SUPPORTING_FILE_NAMES = {
    "AGENTS.md",
    "CLAUDE.md",
    "README.md",
    "SKILL.md",
    "CONTRIBUTING.md",
    "CHANGELOG.md",
    "Makefile",
    "Dockerfile",
    "docker-compose.yml",
    "compose.yml",
    "package.json",
    "pyproject.toml",
    "Cargo.toml",
    "go.mod",
    "requirements.txt",
    "tsconfig.json",
    "tsconfig.app.json",
    "tsconfig.node.json",
    "vitest.config.ts",
    "vitest.config.js",
    "vite.config.ts",
    "vite.config.js",
    "eslint.config.js",
    "eslint.config.mjs",
    "pytest.ini",
    "tox.ini",
}

# Env files are never surfaced as supporting files — they may contain secrets
# (see RepoMapEngine._should_skip_supporting_path).
SENSITIVE_SUPPORTING_FILE_NAMES = {
    ".env",
    ".env.local",
    ".env.development",
    ".env.production",
    ".env.test",
}
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
# ═══════════════════════════════════════════════════════════════════════════════
|
|
143
|
+
# 核心引擎(协调层)
|
|
144
|
+
# ═══════════════════════════════════════════════════════════════════════════════
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
class RepoMapEngine:
    """
    Project-map engine: scan the codebase → build a symbol dependency graph →
    PageRank → emit AI-friendly summaries.

    The "project map" handed to an AI includes:
        1. Module/file distribution (dense files are likely core business logic)
        2. Entry points (main/app/index, etc.)
        3. Important symbols (high PageRank = called/imported from many places)
        4. Call chains (who calls a given function, and what it calls)
    """

    # Relative edge weights for the dependency graph; consumed downstream by
    # EdgeBuilder/GraphAnalyzer (exact usage lives in ranking.py — not visible here).
    IMPORT_WEIGHT = 0.35
    CALL_WEIGHT = 0.50

    def __init__(self, project_root: str) -> None:
        self.project_root = Path(project_root).resolve()
        self.ts = TreeSitterAdapter()
        self.graph = RepoGraph()
        # file -> mtime incremental cache (only mtimes are stored — tree objects
        # are deliberately not retained, to avoid memory leaks)
        self._cache: dict[str, float] = {}
        # Lifecycle flag: "idle" → "invalid" (while scanning) → "scanned" on success.
        self.scan_state = "idle"
        self.max_file_bytes = self._read_max_file_bytes()
        self.scan_stats = ScanStats()
        # Sub-components; the resolver is created lazily in _build_edges().
        self._resolver: ImportResolver | None = None
        self._analyzer = GraphAnalyzer(self.graph)
        # Accumulated HTTP-route extraction results (filled by _process_file).
        self.routes: list = []

    @staticmethod
    def _read_max_file_bytes() -> int:
        """Read the per-file size limit from REPOMAP_MAX_FILE_BYTES.

        Falls back to DEFAULT_MAX_FILE_BYTES on a non-integer value and
        clamps negatives to 0.
        """
        raw = os.getenv("REPOMAP_MAX_FILE_BYTES", str(DEFAULT_MAX_FILE_BYTES))
        try:
            value = int(raw)
        except ValueError:
            return DEFAULT_MAX_FILE_BYTES
        return max(0, value)

    # ═══════════════════════════════════════════════════════════════════════════
    # Main scan pipeline
    # ═══════════════════════════════════════════════════════════════════════════

    def scan(self, max_files: int = 8000, max_scan_time: float = 300.0,
             incremental: bool = False) -> None:
        """Three-phase scan: extract symbols → build dependency edges → PageRank.

        Args:
            max_files: Maximum number of files to scan.
            max_scan_time: Scan timeout in seconds; default 300 (5 minutes).
            incremental: Attempt an incremental scan — re-parse only
                git-changed files, restoring the rest from the cached baseline.
        """
        import time
        start_time = time.time()

        # Mark the graph invalid until the whole pipeline succeeds.
        self.scan_state = "invalid"
        if not self.ts.parsers:
            raise RuntimeError(
                "未检测到任何 tree-sitter 语言绑定。\n"
                "请安装:pip install tree-sitter tree-sitter-python tree-sitter-javascript ..."
            )

        # Reset all per-scan state so repeated scans start from a clean slate.
        self.graph = RepoGraph()
        self._cache = {}
        self.scan_stats = ScanStats()
        self.routes = []
        self._inc_cache_loaded = False

        # Try to load the incremental cache baseline.
        inc_cache = None
        if incremental:
            inc_cache = self._load_incremental_cache_if_valid()
        if inc_cache:
            changed_files, deleted_files = self._git_changed_files()
            all_candidate_files = self._list_files(max_files)
            # Filter: keep only changed files that are still in the project.
            changed_set = set(changed_files) & set(all_candidate_files)
            unchanged_set = set(inc_cache.files.keys()) - changed_set - set(deleted_files)
            # Parse only the changed files.
            files_to_scan = [f for f in all_candidate_files if f in changed_set]
            logger.info(
                f"Incremental scan: {len(files_to_scan)} changed, "
                f"{len(unchanged_set)} unchanged, {len(deleted_files)} deleted"
            )
            # Restore unchanged files straight from the cache.
            for f in sorted(unchanged_set):
                if f in all_candidate_files:
                    self._restore_from_inc_cache(f, inc_cache.files[f])
            self._inc_cache_loaded = True
        else:
            files_to_scan = self._list_files(max_files)
            logger.info(f"Found {len(files_to_scan)} source files")

        try:
            for f in files_to_scan:
                # Timeout circuit breaker: abandon the remainder of the file list.
                elapsed = time.time() - start_time
                if elapsed > max_scan_time:
                    self.scan_stats.timeout_triggered = True
                    logger.warning(f"扫描超时熔断:已运行 {elapsed:.1f}s,超过 {max_scan_time}s 限制")
                    break

                try:
                    self._process_file(f)
                except Exception as e:
                    # Record at most 5 failures in stats; a bad file never aborts the scan.
                    if len(self.scan_stats.failed_files) < 5:
                        self.scan_stats.failed_files.append(f"{f}: {type(e).__name__}: {str(e)[:50]}")
                    logger.warning(f"Failed to process file {f}: {e}")

            self._build_edges()
            # Rebind the analyzer in case self.graph was replaced above.
            self._analyzer = GraphAnalyzer(self.graph)
            self._calculate_pagerank()
            self.scan_state = "scanned"
        except Exception:
            self.scan_state = "invalid"
            raise
        finally:
            self.scan_stats.scan_duration_ms = int((time.time() - start_time) * 1000)

        # After a successful *full* scan, persist the incremental baseline
        # (best effort — failures are only logged at debug level).
        if not self._inc_cache_loaded and self.scan_state == "scanned":
            try:
                from .toolkit import save_incremental_cache
                save_incremental_cache(str(self.project_root), self)
            except Exception as e:
                logger.debug(f"Failed to save incremental cache: {e}")

        sym_count = len(self.graph.symbols)
        edge_count = sum(len(v) for v in self.graph.outgoing.values())

        summary_parts = [f"Scan complete — {sym_count} symbols, {edge_count} edges, {self.scan_stats.scan_duration_ms}ms"]
        if self.scan_stats.skipped_files:
            summary_parts.append(f", {self.scan_stats.skipped_files} skipped (unchanged)")
        if self.scan_stats.failed_files:
            summary_parts.append(f", {len(self.scan_stats.failed_files)} failed files")
        if self.scan_stats.timeout_triggered:
            summary_parts.append(", timeout triggered")

        # Escalate to WARNING when anything went wrong during the scan.
        if self.scan_stats.failed_files or self.scan_stats.timeout_triggered:
            logger.warning("".join(summary_parts))
        else:
            logger.info("".join(summary_parts))

    def is_scanned(self) -> bool:
        """Return True once scan() has completed successfully."""
        return self.scan_state == "scanned"

    # ── Incremental-scan helpers ─────────────────────────────────────────────

    def _load_incremental_cache_if_valid(self) -> Any | None:
        """Load the incremental cache and validate it (project path + git HEAD match).

        Returns None when the cache is missing, empty, stale (HEAD moved), or
        git is unavailable with a nonzero exit; any unexpected error also
        yields None (incremental scanning is strictly best-effort).
        """
        try:
            from .toolkit import load_incremental_cache
            cache = load_incremental_cache(str(self.project_root))
            if cache is None or not cache.files:
                return None
            # Validate that git HEAD still matches the cached baseline.
            try:
                result = subprocess.run(
                    ["git", "rev-parse", "HEAD"],
                    cwd=self.project_root, capture_output=True, text=True, timeout=5,
                )
                if result.returncode != 0:
                    return None
                if cache.git_head and cache.git_head != result.stdout.strip():
                    logger.debug("Incremental cache stale: git HEAD changed")
                    return None
            except Exception:
                # git missing/timed out: keep the cache rather than discard it.
                pass
            return cache
        except Exception:
            return None

    def _git_changed_files(self) -> tuple[list[str], list[str]]:
        """Return (modified_files, deleted_files), paths relative to the project root.

        Best effort: any git failure yields whatever was collected so far
        (possibly two empty lists). Both lists are de-duplicated and sorted.
        """
        modified, deleted = [], []
        try:
            # unstaged + staged modifications
            for status_cmd in (["git", "diff", "--name-only", "HEAD"],):
                result = subprocess.run(
                    status_cmd, cwd=self.project_root, capture_output=True, text=True, timeout=10,
                )
                if result.returncode == 0:
                    for line in result.stdout.strip().split("\n"):
                        if line:
                            modified.append(line)
            # deleted files
            result = subprocess.run(
                ["git", "diff", "--name-only", "--diff-filter=D", "HEAD"],
                cwd=self.project_root, capture_output=True, text=True, timeout=10,
            )
            if result.returncode == 0:
                for line in result.stdout.strip().split("\n"):
                    if line:
                        deleted.append(line)
        except Exception:
            pass
        return sorted(set(modified)), sorted(set(deleted))

    def _restore_from_inc_cache(self, file_path: str, entry: Any) -> None:
        """Restore one file's parse results from the incremental cache, skipping tree-sitter.

        `entry` is a cache record carrying mtime plus serialized symbols,
        imports, import/export bindings, and calls (shape defined in toolkit.py
        — not visible here).
        """
        # Refuse to restore when the on-disk mtime no longer matches the cache.
        full = self.project_root / file_path
        if full.exists():
            actual_mtime = full.stat().st_mtime
            if abs(actual_mtime - entry.mtime) > 0.001:
                return  # mtime mismatch — do not restore

        # Restore symbols.
        self.graph.file_symbols.setdefault(file_path, [])
        for sym_dict in entry.symbols_json:
            sym = Symbol(
                id=sym_dict["id"], name=sym_dict["name"], kind=sym_dict["kind"],
                file=sym_dict["file"], line=sym_dict["line"],
                end_line=sym_dict.get("end_line", sym_dict["line"]),
                col=sym_dict.get("col", 0),
                visibility=sym_dict.get("visibility", "private"),
                docstring=sym_dict.get("docstring", ""),
                signature=sym_dict.get("signature", ""),
                pagerank=sym_dict.get("pagerank", 0.0),
            )
            self.graph.symbols[sym.id] = sym
            self.graph.file_symbols[file_path].append(sym.id)

        # Restore imports.
        self.graph.file_imports[file_path] = list(entry.imports)

        # Restore import bindings.
        from . import JSImportBinding
        self.graph.file_import_bindings[file_path] = [
            JSImportBinding(
                local_name=b["local_name"], imported_name=b["imported_name"],
                module=b["module"], line=b["line"], kind=b.get("kind", "named"),
            )
            for b in entry.import_bindings_json
        ]

        # Restore exports.
        from . import JSExportBinding
        self.graph.file_exports[file_path] = [
            JSExportBinding(
                exported_name=b["exported_name"], source_name=b.get("source_name"),
                module=b.get("module"), line=b["line"], kind=b.get("kind", "local"),
            )
            for b in entry.exports_json
        ]

        # Restore calls as (name, line, kind) tuples.
        self.graph.file_calls[file_path] = [
            (c["name"], c["line"], c.get("kind", "direct"))
            for c in entry.calls_json
        ]

        # Update the mtime cache so _process_file treats this file as unchanged.
        self._cache[file_path] = entry.mtime
        self.scan_stats.processed_files += 1

    # ── File processing ──────────────────────────────────────────────────────

    def _list_files(self, max_files: int) -> list[str]:
        """List source files quickly with ripgrep; fall back to pathlib.

        Returns at most `max_files` relative paths with known extensions,
        after path-based skip filtering; updates scan_stats counters.
        """
        rg_cmd = ["rg", "--files", "--hidden", "-g", "!**/*.min.js"]
        for ext in sorted(EXT_TO_LANG):
            rg_cmd.extend(["-g", f"**/*{ext}"])
        try:
            result = subprocess.run(
                rg_cmd, cwd=self.project_root,
                capture_output=True, text=True, timeout=30,
            )
            candidates = sorted(
                line for line in result.stdout.strip().split("\n")
                if line
                and Path(line).suffix.lower() in EXT_TO_LANG
            )
        except Exception:
            # Fallback: a single walk filtering by extension.
            valid_exts = set(EXT_TO_LANG)
            candidates = sorted(
                str(p.relative_to(self.project_root))
                for p in self.project_root.rglob("*")
                if p.is_file()
                and p.suffix.lower() in valid_exts
            )

        filtered_files: list[str] = []
        for file in candidates:
            if self._should_skip_path(file):
                self.scan_stats.filtered_path_files += 1
                continue
            filtered_files.append(file)

        self.scan_stats.listed_source_files = len(candidates)
        if len(filtered_files) > max_files:
            self.scan_stats.truncated_files = len(filtered_files) - max_files
        selected_files = filtered_files[:max_files]
        self.scan_stats.selected_source_files = len(selected_files)
        return selected_files

    def supporting_files(self, limit: int = 8) -> list[dict[str, Any]]:
        """List docs, scripts, and configs worth reading first, outside the symbol graph.

        Returns at most `limit` dicts with keys "file", "role", "reason",
        ordered by classification priority then path.
        """
        rows: list[dict[str, Any]] = []
        seen: set[str] = set()
        for file in self._list_supporting_file_candidates():
            if file in seen:
                continue
            seen.add(file)
            classified = self._classify_supporting_file(file)
            if not classified:
                continue
            priority, role, reason = classified
            rows.append({"file": file, "role": role, "reason": reason, "priority": priority})
        rows.sort(key=lambda row: (row["priority"], row["file"]))
        # Strip the internal "priority" key before returning.
        return [
            {"file": row["file"], "role": row["role"], "reason": row["reason"]}
            for row in rows[:limit]
        ]

    def _list_supporting_file_candidates(self) -> list[str]:
        """Quickly list repository files for the lightweight supporting-file list; file contents are never read."""
        try:
            result = subprocess.run(
                ["rg", "--files", "--hidden", "-g", "!**/*.min.js"],
                cwd=self.project_root,
                capture_output=True,
                text=True,
                timeout=30,
            )
            candidates = sorted(line for line in result.stdout.strip().split("\n") if line)
        except Exception:
            candidates = sorted(
                str(p.relative_to(self.project_root))
                for p in self.project_root.rglob("*")
            if p.is_file()
            )
        # Always consider the root-level AI/readme context files, even if the
        # listing above missed them (e.g. ignored by ripgrep config).
        root_context_files = [
            name
            for name in ("AGENTS.md", "CLAUDE.md", "README.md", "SKILL.md")
            if (self.project_root / name).is_file()
        ]
        candidates = sorted(set(root_context_files + candidates))
        return [file for file in candidates if not self._should_skip_supporting_path(file)]

    def _should_skip_supporting_path(self, file: str) -> bool:
        """True for paths that must never appear as supporting files (skip dirs, env files, key material)."""
        path = Path(file)
        name = path.name
        name_lower = name.lower()
        if self._should_skip_path(file):
            return True
        if name in SENSITIVE_SUPPORTING_FILE_NAMES or name_lower.startswith(".env."):
            return True
        # Private keys / certificate bundles.
        if name_lower.endswith((".pem", ".key", ".p12", ".pfx")):
            return True
        return False

    @staticmethod
    def _classify_supporting_file(file: str) -> tuple[int, str, str] | None:
        """Classify a path as (priority, role, reason); None when it is not a supporting file.

        Lower priority sorts first. Reasons are user-facing strings (kept verbatim).
        """
        path = PurePosixPath(file)
        parts = path.parts
        name = path.name
        name_lower = name.lower()
        suffix = path.suffix.lower()
        depth = len(parts)

        if name in {"AGENTS.md", "CLAUDE.md"}:
            return 0, "agent-context", "注入的项目结构、规则和工作流上下文"
        if name == "SKILL.md":
            return 1, "skill-doc", "技能入口说明,通常是 skill 仓库核心"
        if name == "README.md":
            return 2, "readme", "用户/项目说明入口"
        if name in {"package.json", "pyproject.toml", "Cargo.toml", "go.mod", "requirements.txt"}:
            return 3, "manifest", "依赖、脚本或包元数据"
        if name.startswith("tsconfig") and suffix == ".json":
            return 4, "tooling-config", "TypeScript 编译配置"
        if name_lower.startswith(("vite.config", "vitest.config", "eslint.config")):
            return 4, "tooling-config", "构建、测试或 lint 配置"
        if name in {"Makefile", "Dockerfile", "docker-compose.yml", "compose.yml"}:
            return 5, "automation", "构建、容器或自动化入口"
        if suffix == ".service":
            return 5, "service", "服务部署/启动配置"
        # Shell scripts near the root or under scripts/bin.
        if suffix == ".sh" and (depth <= 2 or (parts and parts[0] in {"scripts", "bin"})):
            return 6, "script", "启动、验证或维护脚本"
        # Markdown near the root or under docs/references.
        if suffix == ".md" and (depth <= 2 or (parts and parts[0] in {"docs", "references"})):
            return 7, "docs", "补充文档或参考资料"
        if name in SUPPORTING_FILE_NAMES:
            return 8, "supporting", "项目支撑文件"
        return None

    def _should_skip_path(self, file: str) -> bool:
        """True when the path is minified JS, a lockfile, or lives under a skip directory."""
        path = Path(file)
        if path.name.endswith(".min.js"):
            return True
        if path.name in SKIP_FILE_NAMES:
            return True
        return any(part in SKIP_DIR_NAMES for part in path.parts)

    def _should_skip_large_file(self, path: Path) -> bool:
        """True when the file exceeds max_file_bytes (unless REPOMAP_SCAN_LARGE_FILES=1) or cannot be stat'ed."""
        if os.getenv("REPOMAP_SCAN_LARGE_FILES", "0") == "1":
            return False
        try:
            return path.stat().st_size > self.max_file_bytes
        except OSError:
            # Unreadable/vanished file: treat as skippable.
            return True

    def _process_file(self, file: str) -> None:
        """Parse one file with tree-sitter and record its symbols, imports, exports, calls, and routes."""
        path = self.project_root / file
        if not path.exists():
            return
        if self._should_skip_large_file(path):
            self.scan_stats.filtered_large_files += 1
            logger.debug(f"Skip oversized file: {file}")
            return

        mtime = path.stat().st_mtime
        cached_mtime = self._cache.get(file)
        if cached_mtime == mtime:
            self.scan_stats.skipped_files += 1
            return  # unchanged — reuse cached results

        ext = Path(file).suffix.lower()
        lang = EXT_TO_LANG.get(ext)
        if not lang or lang not in self.ts.parsers:
            return

        content = path.read_bytes()
        tree = self.ts.parse(content, lang)
        if not tree:
            return

        symbols = self.ts.extract_symbols(tree, lang, file, content)
        self.graph.file_symbols.setdefault(file, [])
        for sym in symbols:
            self.graph.symbols[sym.id] = sym
            self.graph.file_symbols[file].append(sym.id)

        # Merge module names from both plain imports and JS/TS import bindings.
        imports = self.ts.extract_imports(tree, lang)
        import_bindings = self.ts.extract_js_ts_import_bindings(content, lang, tree=tree)
        import_modules = {module for module, _ in imports}
        import_modules.update(binding.module for binding in import_bindings if binding.module)
        self.graph.file_imports[file] = sorted(import_modules)
        self.graph.file_import_bindings[file] = import_bindings
        self.graph.file_exports[file] = self.ts.extract_js_ts_export_bindings(content, lang, tree=tree)
        self._mark_exported_symbols(file)

        self.graph.file_calls[file] = self.ts.extract_calls(tree, lang)

        # Extract HTTP routes (Python/JS/TS/Rust).
        routes = self.ts.extract_http_routes(tree, lang, file)
        if routes:
            self.routes.extend(routes)

        # Release the tree object immediately to avoid memory growth; only the
        # mtime is cached.
        del tree
        self._cache[file] = mtime
        self.scan_stats.processed_files += 1

        # Purge cache entries whose files have disappeared.
        # NOTE(review): this walk runs once per processed file — O(cache) each
        # time; presumably acceptable at this scale, but worth confirming.
        stale = [k for k in list(self._cache) if not (self.project_root / k).exists()]
        for k in stale:
            del self._cache[k]

    def _mark_exported_symbols(self, file: str) -> None:
        """Flip visibility to "exported" for symbols named in the file's local (non-re-export) export bindings."""
        exported_names = {
            binding.source_name
            for binding in self.graph.file_exports.get(file, [])
            # module is None ⇒ a local export (not `export ... from "mod"`);
            # "*" wildcard re-exports carry no usable symbol name.
            if binding.module is None and binding.source_name and binding.source_name != "*"
        }
        if not exported_names:
            return
        for symbol_id in self.graph.file_symbols.get(file, []):
            symbol = self.graph.symbols.get(symbol_id)
            if symbol and symbol.name in exported_names:
                symbol.visibility = "exported"

    # ── Edge construction ────────────────────────────────────────────────────

    def _build_edges(self) -> None:
        """Create the import resolver and let EdgeBuilder populate graph edges."""
        self._resolver = ImportResolver(self.project_root, self.graph)
        edge_builder = EdgeBuilder(self.graph, self._resolver)
        edge_builder.build_edges()

    # ── PageRank ─────────────────────────────────────────────────────────────

    def _calculate_pagerank(self, damping: float = 0.85, max_iter: int = 50,
                            tol: float = 1e-6) -> None:
        """Delegate PageRank computation to the analyzer (standard damping 0.85)."""
        self._analyzer.calculate_pagerank(damping, max_iter, tol)

    # ═══════════════════════════════════════════════════════════════════════════
    # Query interface (delegated to the analyzer)
    # ═══════════════════════════════════════════════════════════════════════════

    def query_symbol(self, name: str) -> list[Any]:
        """Fuzzy-find symbols by name, returned in descending PageRank order."""
        return self._analyzer.query_symbol(name)

    def call_chain(self, symbol_id: str, direction: str = "both",
                   max_depth: int = 3) -> dict[str, list[Any]]:
        """
        Return the call chain for the given symbol.
        direction: "callers" | "callees" | "both"

        Raises:
            ValueError: if `direction` is not one of the three allowed values.
        """
        if direction not in ("callers", "callees", "both"):
            raise ValueError("direction must be 'callers', 'callees', or 'both'")
        return self._analyzer.call_chain(symbol_id, direction, max_depth)

    def hotspots(self, limit: int = 15) -> list[dict]:
        """Identify high-density files."""
        return self._analyzer.hotspots(limit)

    def entry_points(self) -> list[str]:
        """Identify entry-point files."""
        return self._analyzer.entry_points()

    def file_analysis(self) -> dict[str, dict[str, Any]]:
        """Analyze each file's complexity and connectivity."""
        return self._analyzer.file_analysis()

    def module_summary(self, limit: int = 8) -> list[dict[str, Any]]:
        """Produce a module-level summary."""
        return self._analyzer.module_summary(limit)

    def suggested_reading_order(self, limit: int = 8) -> list[dict[str, Any]]:
        """Produce a recommended reading order for AI consumption."""
        return self._analyzer.suggested_reading_order(limit)

    def list_routes(self) -> list:
        """Return the list of extracted HTTP routes."""
        return self.routes

    def summary_symbols(self, limit_files: int = 6, per_file: int = 4) -> list[dict[str, Any]]:
        """Return key implementation symbols suitable for the overview display."""
        return self._analyzer.summary_symbols(limit_files, per_file)

    def _scan_summary_lines(self) -> list[str]:
        """Render scan statistics as human-readable bullet lines (Chinese UI strings kept verbatim)."""
        lines = [
            f"- 文件数: {self.scan_stats.processed_files}",
            f"- 符号数: {len(self.graph.symbols)}",
            f"- 依赖边: {sum(len(v) for v in self.graph.outgoing.values())}",
            f"- 过滤路径: {self.scan_stats.filtered_path_files}",
            f"- 过滤大文件: {self.scan_stats.filtered_large_files}",
        ]
        if self._resolver and self._resolver.import_configs:
            lines.append(f"- 解析配置: {len(self._resolver.import_configs)}")
        # Timeout circuit-breaker notice.
        if self.scan_stats.timeout_triggered:
            lines.append(f"- ⚠️ 扫描超时熔断: 部分文件未处理,结果不完整")
        # Failed-file notice (show at most 3).
        if self.scan_stats.failed_files:
            lines.append(f"- 处理失败: {len(self.scan_stats.failed_files)} 个文件")
            for ff in self.scan_stats.failed_files[:3]:
                lines.append(f"  - {ff}")
        return lines

    # ═══════════════════════════════════════════════════════════════════════════
    # AI output formatting (delegated to repomap.ai)
    # ═══════════════════════════════════════════════════════════════════════════

    def render_overview(self, max_chars: int = 16000, with_heat: bool = False,
                        with_co_change: bool = False, granularity: str = "auto") -> str:
        """Render the project overview report (delegates to repomap.ai)."""
        return render_overview_report(self, max_chars, with_heat=with_heat,
                                      with_co_change=with_co_change,
                                      granularity=granularity)

    def render_call_chain(self, symbol_name: str, max_depth: int = 3) -> str:
        """Render a call-chain report for `symbol_name` (delegates to repomap.ai)."""
        return render_call_chain_report(self, symbol_name, max_depth)

    def render_file_detail(self, file_path: str, max_symbols: int = 12, max_chars: int = 6000) -> str:
        """Render a single-file detail report (delegates to repomap.ai)."""
        return render_file_detail_report(self, file_path, max_symbols=max_symbols, max_chars=max_chars)
|
|
712
|
+
|
|
713
|
+
|
|
714
|
+
# ═══════════════════════════════════════════════════════════════════════════════
# Backward-compatibility exports
# ═══════════════════════════════════════════════════════════════════════════════

# Re-export constants from the parser module to preserve the old public surface.
from .parser import QUERIES

__all__ = [
    "DEFAULT_MAX_FILE_BYTES",
    "EXT_TO_LANG",
    "QUERIES",
    "RepoMapEngine",
    "SKIP_DIR_NAMES",
    "SKIP_FILE_NAMES",
    "TreeSitterAdapter",
    "logger",
]
|