repomap-cli 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
repomap/topic.py ADDED
@@ -0,0 +1,600 @@
1
+ """
2
+ 主题评分引擎 + 测试匹配 + 文件角色分类。
3
+
4
+ 被 query、impact、diff-risk、overview 共用。
5
+ 零外部依赖(只依赖 repomap_support 的数据结构)。
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import re
11
+ import subprocess
12
+ from collections import defaultdict
13
+ from dataclasses import dataclass, field
14
+ from pathlib import Path, PurePosixPath
15
+ from typing import TYPE_CHECKING, Any
16
+
17
+ if TYPE_CHECKING:
18
+ from . import RepoGraph
19
+
20
+
21
+ # ═══════════════════════════════════════════════════════════════════════════════
22
+ # 数据结构
23
+ # ═══════════════════════════════════════════════════════════════════════════════
24
+
25
+
26
@dataclass
class FileMatch:
    """One file matched against a query, with its score and explanation."""

    path: str  # repo-relative file path
    role: str  # role label (presumably from classify_file_role — not enforced here)
    score: float  # relevance score
    reasons: list[str] = field(default_factory=list)  # human-readable match reasons
32
+
33
+
34
@dataclass
class TestMatch:
    """A (test file, target file) association produced by find_related_tests."""

    test_file: str
    target_file: str
    confidence: str  # high | medium | low
    reason: str  # matching rationale; its prefix is used for ranking in _dedupe_test_matches
40
+
41
+
42
+ # ═══════════════════════════════════════════════════════════════════════════════
43
+ # 噪音文件判断
44
+ # ═══════════════════════════════════════════════════════════════════════════════
45
+
46
# Substring patterns that mark build artifacts / bundled output.
NOISE_PATTERNS = [
    "public/monaco-editor/",
    ".min.js",
    ".bundle.js",
    ".generated.",
    ".d.ts",
]

# Mirrors core's SKIP_DIR_NAMES as an extra reference.
NOISE_PATH_SEGMENTS = {
    "monaco-editor",
    "vendor",
    "third_party",
    "third-party",
    "node_modules",
    ".next",
    "dist",
    "build",
    ".cache",
}


def is_noise_file(file_path: str) -> bool:
    """Return True for noise files (build artifacts, vendored code, etc.)."""
    lowered = file_path.lower()
    if any(pattern in lowered for pattern in NOISE_PATTERNS):
        return True
    segments = PurePosixPath(file_path).parts
    return any(segment.lower() in NOISE_PATH_SEGMENTS for segment in segments)
79
+
80
+
81
+ # ═══════════════════════════════════════════════════════════════════════════════
82
+ # 文件角色分类
83
+ # ═══════════════════════════════════════════════════════════════════════════════
84
+
85
+
86
def classify_file_role(file_path: str, graph: "RepoGraph | None" = None) -> str:
    """Classify a file's role from its path and (optionally) its symbol info.

    Returns one of: test, frontend-ui, frontend-state, backend, config,
    core, model, service, other.
    """
    lowered = file_path.lower()

    # Path-based heuristics come first; they are cheap and decisive.
    if is_test_like_file(file_path):
        return "test"
    if any(marker in lowered for marker in ("/components/", "/pages/", "/views/")):
        return "frontend-ui"
    if any(marker in lowered for marker in ("/stores/", "/hooks/")):
        return "frontend-state"
    if lowered.startswith("server/") or any(
        marker in lowered for marker in ("/server/", "/routes/", "/api/")
    ):
        return "backend"
    if lowered.endswith((".config.ts", ".config.js", ".config.tsx", "package.json")):
        return "config"

    # Refine using symbol info when a graph is available.
    if graph is not None:
        sids = graph.file_symbols.get(file_path, [])
        if sids:
            kind_counts: dict[str, int] = defaultdict(int)
            exported = 0
            for sid in sids:
                sym = graph.symbols.get(sid)
                if not sym:
                    continue
                kind_counts[sym.kind] += 1
                if sym.visibility == "exported":
                    exported += 1

            # Many exported symbols -> likely a core module.
            if exported >= 3:
                return "core"
            # Several classes/interfaces -> likely models / type definitions.
            if kind_counts["class"] >= 2 or kind_counts["interface"] >= 2:
                return "model"
            # Several functions -> likely utilities / services.
            if kind_counts["function"] >= 3:
                return "service"

    return "other"
127
+
128
+
129
+ # ═══════════════════════════════════════════════════════════════════════════════
130
+ # 标识符拆分
131
+ # ═══════════════════════════════════════════════════════════════════════════════
132
+
133
_CAMEL_SPLIT_RE = re.compile(r'([A-Z][a-z0-9]+|[a-z0-9]+|[A-Z0-9]+(?=[A-Z]|$))')


def split_identifier(name: str) -> list[str]:
    """Split a camelCase/PascalCase/snake_case identifier into lowercase tokens.

    "VirtualKeyboard" -> ["virtual", "keyboard"]
    "queueInput" -> ["queue", "input"]
    "terminal_store" -> ["terminal", "store"]
    """
    # Normalize word separators to spaces, then split each word on case changes.
    normalized = name.replace("_", " ").replace("-", " ")
    return [
        token.lower()
        for word in normalized.split()
        for token in _CAMEL_SPLIT_RE.findall(word)
        if token
    ]
149
+
150
+
151
+ # ═══════════════════════════════════════════════════════════════════════════════
152
+ # 主题评分
153
+ # ═══════════════════════════════════════════════════════════════════════════════
154
+
155
+
156
def topic_score(
    query: str,
    file_path: str,
    file_data: dict,
    graph: "RepoGraph",
    keyword_weights: dict[str, float] | None = None,
) -> float:
    """Score a file's relevance to the query keywords.

    First version: hand-tuned weighted matching for a quick launch; can be
    upgraded to BM25 later. keyword_weights implements a high-frequency
    penalty: the more files a keyword hits, the lower its weight.
    """
    weights = keyword_weights if keyword_weights else {}
    lowered_path = file_path.lower()
    stem = PurePosixPath(file_path).stem
    stem_lower = stem.lower()
    stem_tokens = split_identifier(stem)
    symbol_ids = graph.file_symbols.get(file_path, [])

    total = 0.0
    for kw in query.lower().split():
        w = weights.get(kw, 1.0)

        # 1. Path hit (weight 30, doubled when the file name itself matches).
        if kw in lowered_path:
            total += (60.0 if kw in stem_lower else 30.0) * w

        # 2. File-name hit (25), else camelCase/snake_case token hit (15).
        if kw in stem_lower:
            total += 25 * w
        elif any(kw in token for token in stem_tokens):
            total += 15 * w

        # 3. Symbol-name hit (15) — counted at most once per keyword per file.
        if any(kw in sym.name.lower() for sym in map(graph.symbols.get, symbol_ids) if sym):
            total += 15 * w

    # 4. Noise penalty (build artifacts, vendored code).
    if is_noise_file(file_path):
        total *= 0.05

    # 5. Down-weight tests (implementation first, tests second by default).
    if is_test_like_file(file_path):
        total *= 0.55

    return total
204
+
205
+
206
def compute_keyword_weights(
    keywords: list[str],
    candidate_files: list[str],
    graph: "RepoGraph",
) -> dict[str, float]:
    """Compute an IDF-style weight per keyword.

    hit ratio > 80% of files -> weight 0.2
    hit ratio > 50% of files -> weight 0.5
    otherwise               -> weight 1.0
    """
    total = len(candidate_files)
    if total == 0:
        return dict.fromkeys(keywords, 1.0)

    def count_hits(kw: str) -> int:
        # A file counts once: path/name/token match first, symbol names second.
        hits = 0
        for path in candidate_files:
            stem = PurePosixPath(path).stem
            if (
                kw in path.lower()
                or kw in stem.lower()
                or any(kw in token for token in split_identifier(stem))
            ):
                hits += 1
                continue
            for sid in graph.file_symbols.get(path, []):
                sym = graph.symbols.get(sid)
                if sym and kw in sym.name.lower():
                    hits += 1
                    break
        return hits

    weights: dict[str, float] = {}
    for kw in keywords:
        ratio = count_hits(kw.lower()) / total
        if ratio > 0.8:
            weights[kw] = 0.2
        elif ratio > 0.5:
            weights[kw] = 0.5
        else:
            weights[kw] = 1.0
    return weights
248
+
249
+
250
+ # ═══════════════════════════════════════════════════════════════════════════════
251
+ # 测试文件判断
252
+ # ═══════════════════════════════════════════════════════════════════════════════
253
+
254
+
255
def is_test_like_file(file_path: str) -> bool:
    """Heuristically decide whether a path points at a test file."""
    path = PurePosixPath(file_path)
    # Any path segment that is a conventional test directory name counts.
    if any(segment.lower() in {"test", "tests", "__tests__"} for segment in path.parts):
        return True
    name = path.name.lower()
    if name.startswith("test_"):
        return True
    return name.endswith(("_test.py", ".spec.ts", ".test.ts", ".test.tsx", ".spec.tsx"))
262
+
263
+
264
+ def _is_boilerplate_test(file_path: str) -> bool:
265
+ """排除低语义测试文件(package marker、pytest fixture 等)。"""
266
+ name = PurePosixPath(file_path).name
267
+ return name in ("__init__.py", "conftest.py", "__init__.pyi")
268
+
269
+
270
+ def _bare_name(stem: str) -> str:
271
+ """去除测试相关后缀,返回基础文件名。
272
+
273
+ "VirtualKeyboard.test" -> "VirtualKeyboard"
274
+ "terminal_test" -> "terminal"
275
+ """
276
+ for suffix in (".test", ".spec", "_test", "Test"):
277
+ if stem.endswith(suffix) and len(stem) > len(suffix):
278
+ return stem[: -len(suffix)]
279
+ if stem.startswith("test_") and len(stem) > 5:
280
+ return stem[5:]
281
+ return stem
282
+
283
+
284
+ # ═══════════════════════════════════════════════════════════════════════════════
285
+ # 测试匹配
286
+ # ═══════════════════════════════════════════════════════════════════════════════
287
+
288
+
289
def find_related_tests(
    target_files: list[str],
    graph: "RepoGraph",
    analysis: dict,
    project_root: str,
) -> list[TestMatch]:
    """Find tests related to the target files via a 5-level priority match.

    Strategies, in order, per (test, target) pair: exact base-name match,
    shared test directory, test-imports-target, symbol reference edges, and
    git co-change history. Duplicates are resolved by _dedupe_test_matches.
    `analysis` is currently unused here — kept for interface compatibility;
    TODO confirm callers before removing.
    """
    results: list[TestMatch] = []
    # Candidate tests: every known test file minus boilerplate (conftest etc.).
    test_files = [
        f for f in graph.file_symbols
        if is_test_like_file(f) and not _is_boilerplate_test(f)
    ]
    if not test_files:
        return _dedupe_test_matches(results)

    for target in target_files:
        target_name = PurePosixPath(target).stem
        target_bare = _bare_name(target_name)
        target_symbol_ids = set(graph.file_symbols.get(target, []))

        for test_file in test_files:
            test_bare = _bare_name(PurePosixPath(test_file).stem)

            # Strategy 1: exact base-name match (high confidence).
            if test_bare == target_bare:
                results.append(TestMatch(
                    test_file, target, "high",
                    "文件名精确匹配",
                ))
                continue

            # Strategy 2: path proximity (medium confidence).
            if _share_test_dir(test_file, target):
                results.append(TestMatch(
                    test_file, target, "medium",
                    "同测试目录",
                ))
                continue

            # Strategy 3: import path hit (high confidence).
            if _test_imports_target(test_file, target, graph):
                results.append(TestMatch(
                    test_file, target, "high",
                    "测试 import 了目标模块",
                ))
                continue

            # Strategy 4: symbol edge hit (medium confidence).
            # First edge from any test symbol into the target file wins.
            test_symbols = graph.file_symbols.get(test_file, [])
            for sid in test_symbols:
                found = False
                for edge in graph.outgoing.get(sid, []):
                    if edge.target in target_symbol_ids:
                        target_sym = graph.symbols.get(edge.target)
                        sym_name = target_sym.name if target_sym else "?"
                        results.append(TestMatch(
                            test_file, target, "medium",
                            f"测试引用了 {sym_name}",
                        ))
                        found = True
                        break
                if found:
                    break
            else:
                # for/else: only reached when strategy 4 found no edge.
                # Strategy 5: git co-change history (medium confidence).
                co_score = _get_co_change_score(project_root, test_file, target)
                if co_score >= 3:
                    results.append(TestMatch(
                        test_file, target, "medium",
                        f"git 共变更 {co_score} 次",
                    ))

    return _dedupe_test_matches(results)
362
+
363
+
364
+ def _dedupe_test_matches(matches: list[TestMatch]) -> list[TestMatch]:
365
+ confidence_rank = {"high": 3, "medium": 2, "low": 1}
366
+ reason_rank = {
367
+ "文件名精确匹配": 5,
368
+ "测试 import 了目标模块": 4,
369
+ "测试引用了": 3,
370
+ "同测试目录": 2,
371
+ "git 共变更": 1,
372
+ }
373
+
374
+ def score(match: TestMatch) -> tuple[int, int]:
375
+ reason_score = 0
376
+ for prefix, value in reason_rank.items():
377
+ if match.reason.startswith(prefix):
378
+ reason_score = value
379
+ break
380
+ return confidence_rank.get(match.confidence, 0), reason_score
381
+
382
+ best: dict[tuple[str, str], TestMatch] = {}
383
+ order: list[tuple[str, str]] = []
384
+ for match in matches:
385
+ key = (match.test_file, match.target_file)
386
+ if key not in best:
387
+ best[key] = match
388
+ order.append(key)
389
+ continue
390
+ if score(match) > score(best[key]):
391
+ best[key] = match
392
+ return [best[key] for key in order]
393
+
394
+ def _share_test_dir(test_file: str, target: str) -> bool:
395
+ """检查测试文件和目标文件是否在同一目录下(含 __tests__ 相邻目录)。"""
396
+ test_path = PurePosixPath(test_file)
397
+ target_path = PurePosixPath(target)
398
+ test_parent = test_path.parent
399
+ target_parent = target_path.parent
400
+ # 同为根目录文件不视为"同目录"
401
+ if test_parent == target_parent:
402
+ name = test_parent.name if test_parent.name else str(test_parent)
403
+ if name in ("", "."):
404
+ return False
405
+ return True
406
+ # __tests__/foo.test.ts 对应 ../foo.ts
407
+ if test_parent.name in ("__tests__", "tests", "test"):
408
+ grandparent = test_parent.parent
409
+ if grandparent == target_parent:
410
+ gp_name = grandparent.name if grandparent.name else str(grandparent)
411
+ if gp_name in ("", "."):
412
+ return False
413
+ return True
414
+ return False
415
+
416
+
417
def _test_imports_target(test_file: str, target: str, graph: "RepoGraph") -> bool:
    """Check whether the test file imports the target's module path."""
    module = _file_to_module_path(target)
    return any(
        module in imp or imp.endswith(module)
        for imp in graph.file_imports.get(test_file, [])
    )
425
+
426
+
427
+ def _file_to_module_path(file_path: str) -> str:
428
+ """将文件路径转为模块路径。
429
+
430
+ "src/components/terminal/VirtualKeyboard.tsx" -> "src/components/terminal/VirtualKeyboard"
431
+ """
432
+ p = PurePosixPath(file_path)
433
+ # 去除扩展名
434
+ stem_path = str(p.parent / p.stem) if p.suffix else str(p)
435
+ # 去除 index
436
+ if p.stem == "index":
437
+ stem_path = str(p.parent)
438
+ return stem_path
439
+
440
+
441
+ # ═══════════════════════════════════════════════════════════════════════════════
442
+ # Git 共变更热度
443
+ # ═══════════════════════════════════════════════════════════════════════════════
444
+
445
+ _co_change_cache: dict[str, dict[tuple[str, str], int]] = {}
446
+
447
+
448
+ def get_co_change_score(project_root: str, file_a: str, file_b: str) -> int:
449
+ """查询两个文件的 git 共变更次数(带缓存,公开接口)。"""
450
+ cache = _co_change_cache.get(project_root)
451
+ if cache is None:
452
+ cache = _load_co_change_scores(project_root)
453
+ _co_change_cache[project_root] = cache
454
+ a, b = sorted([file_a, file_b])
455
+ return cache.get((a, b), 0)
456
+
457
+
458
+ # 向后兼容别名
459
+ _get_co_change_score = get_co_change_score
460
+
461
+
462
def get_co_change_neighbors(
    project_root: str, file_path: str, top_n: int = 5,
) -> list[tuple[str, int]]:
    """Return the files that co-change most often with *file_path*, descending.

    Purpose: surface implicit coupling — files that frequently change together
    in git history may be related even without an explicit code dependency.
    """
    cache = _co_change_cache.get(project_root)
    if cache is None:
        cache = _load_co_change_scores(project_root)
        _co_change_cache[project_root] = cache

    counts: dict[str, int] = {}
    for (left, right), count in cache.items():
        if left == file_path:
            counts[right] = count
        elif right == file_path:
            counts[left] = count

    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    return ranked[:top_n]
481
+
482
+
483
+ def _load_co_change_scores(project_root: str) -> dict[tuple[str, str], int]:
484
+ """统计项目中文件对的 git 共变更次数。"""
485
+ scores: dict[tuple[str, str], int] = defaultdict(int)
486
+ try:
487
+ result = subprocess.run(
488
+ ["git", "log", "--name-only", "--pretty=format:", "--since=90.days.ago", "--", "."],
489
+ cwd=project_root,
490
+ capture_output=True,
491
+ text=True,
492
+ timeout=30,
493
+ )
494
+ except Exception:
495
+ return dict(scores)
496
+
497
+ current_commit_files: list[str] = []
498
+ for line in result.stdout.split("\n"):
499
+ stripped = line.strip()
500
+ if not stripped:
501
+ # 空行分隔 commit
502
+ if len(current_commit_files) > 1:
503
+ for i in range(len(current_commit_files)):
504
+ for j in range(i + 1, len(current_commit_files)):
505
+ a, b = sorted([current_commit_files[i], current_commit_files[j]])
506
+ scores[(a, b)] += 1
507
+ current_commit_files = []
508
+ else:
509
+ current_commit_files.append(stripped)
510
+
511
+ # 处理最后一个 commit
512
+ if len(current_commit_files) > 1:
513
+ for i in range(len(current_commit_files)):
514
+ for j in range(i + 1, len(current_commit_files)):
515
+ a, b = sorted([current_commit_files[i], current_commit_files[j]])
516
+ scores[(a, b)] += 1
517
+
518
+ return dict(scores)
519
+
520
+
521
+ # ═══════════════════════════════════════════════════════════════════════════════
522
+ # 测试盲区检测
523
+ # ═══════════════════════════════════════════════════════════════════════════════
524
+
525
+ LOW_SIGNAL_KINDS = {"element", "selector", "class_selector", "id_selector", "json_key"}
526
+
527
+
528
+ def _signal_weight_for_symbol(sym: Any) -> float:
529
+ """独立版符号信号权重,不依赖 GraphAnalyzer 实例。"""
530
+ kind = getattr(sym, "kind", "") if hasattr(sym, "kind") else sym.get("kind", "")
531
+ name = getattr(sym, "name", "") if hasattr(sym, "name") else sym.get("name", "")
532
+ visibility = getattr(sym, "visibility", "") if hasattr(sym, "visibility") else sym.get("visibility", "")
533
+ if kind in LOW_SIGNAL_KINDS:
534
+ return 0.002
535
+ if name in {"__init__", "__main__"}:
536
+ return 0.35
537
+ if name.startswith("_") and visibility == "private":
538
+ return 0.85
539
+ return 1.0
540
+
541
+
542
def find_untested_symbols(
    graph: "RepoGraph",
    min_incoming_calls: int = 2,
    min_score: float = 5.0,
    max_results: int = 30,
) -> list[dict]:
    """Find symbols without test coverage, sorted by risk score (descending).

    risk score = incoming_calls × signal_weight × 5.0
    Only returns symbols in non-test files that are called somewhere but have
    no association with any test. Returns [] when the repo has no test
    symbols at all (coverage cannot be inferred in that case).
    """
    # Collect the IDs of all symbols that live inside test files.
    test_symbol_ids: set[str] = set()
    for f in graph.file_symbols:
        if is_test_like_file(f):
            test_symbol_ids.update(graph.file_symbols[f])

    if not test_symbol_ids:
        return []

    # One BFS layer: non-test symbols directly referenced by a test symbol
    # are treated as "covered".
    covered: set[str] = set()
    for tsid in test_symbol_ids:
        for edge in graph.outgoing.get(tsid, []):
            if edge.target not in test_symbol_ids:
                covered.add(edge.target)

    untested = []
    for sid, sym in graph.symbols.items():
        if sid in test_symbol_ids or sid in covered:
            continue
        # Skip low-signal kinds (CSS selectors, JSON keys, ...).
        kind = getattr(sym, "kind", "")
        if kind in LOW_SIGNAL_KINDS:
            continue
        # Only "call" edges count toward usage; other edge kinds are ignored.
        incoming = sum(1 for e in graph.incoming.get(sid, []) if e.kind == "call")
        if incoming < min_incoming_calls:
            continue
        sw = _signal_weight_for_symbol(sym)
        score = incoming * sw * 5.0
        if score < min_score:
            continue
        untested.append({
            "symbol": getattr(sym, "name", str(sid)),
            "kind": getattr(sym, "kind", ""),
            "file": getattr(sym, "file", ""),
            "line": getattr(sym, "line", 0),
            "incoming_calls": incoming,
            "risk_score": round(score, 1),
        })

    untested.sort(key=lambda x: -x["risk_score"])
    return untested[:max_results]
594
+
595
+
596
def fuzzy_symbol_suggest(query: str, graph: "RepoGraph", limit: int = 5) -> list[str]:
    """Suggest the closest symbol names by edit distance.

    Used to give a friendly hint when a query returns no results.
    """
    import difflib

    # Very short names (< 3 chars) are too noisy to suggest.
    candidates = {sym.name for sym in graph.symbols.values() if len(sym.name) >= 3}
    return difflib.get_close_matches(query, sorted(candidates), n=limit, cutoff=0.5)