code-graph-builder 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93)
  1. code_graph_builder/__init__.py +82 -0
  2. code_graph_builder/builder.py +366 -0
  3. code_graph_builder/cgb_cli.py +32 -0
  4. code_graph_builder/cli.py +564 -0
  5. code_graph_builder/commands_cli.py +1288 -0
  6. code_graph_builder/config.py +340 -0
  7. code_graph_builder/constants.py +708 -0
  8. code_graph_builder/embeddings/__init__.py +40 -0
  9. code_graph_builder/embeddings/qwen3_embedder.py +573 -0
  10. code_graph_builder/embeddings/vector_store.py +584 -0
  11. code_graph_builder/examples/__init__.py +0 -0
  12. code_graph_builder/examples/example_configuration.py +276 -0
  13. code_graph_builder/examples/example_kuzu_usage.py +109 -0
  14. code_graph_builder/examples/example_semantic_search_full.py +347 -0
  15. code_graph_builder/examples/generate_wiki.py +915 -0
  16. code_graph_builder/examples/graph_export_example.py +100 -0
  17. code_graph_builder/examples/rag_example.py +206 -0
  18. code_graph_builder/examples/test_cli_demo.py +129 -0
  19. code_graph_builder/examples/test_embedding_api.py +153 -0
  20. code_graph_builder/examples/test_kuzu_local.py +190 -0
  21. code_graph_builder/examples/test_rag_redis.py +390 -0
  22. code_graph_builder/graph_updater.py +605 -0
  23. code_graph_builder/guidance/__init__.py +1 -0
  24. code_graph_builder/guidance/agent.py +123 -0
  25. code_graph_builder/guidance/prompts.py +74 -0
  26. code_graph_builder/guidance/toolset.py +264 -0
  27. code_graph_builder/language_spec.py +536 -0
  28. code_graph_builder/mcp/__init__.py +21 -0
  29. code_graph_builder/mcp/api_doc_generator.py +764 -0
  30. code_graph_builder/mcp/file_editor.py +207 -0
  31. code_graph_builder/mcp/pipeline.py +777 -0
  32. code_graph_builder/mcp/server.py +161 -0
  33. code_graph_builder/mcp/tools.py +1800 -0
  34. code_graph_builder/models.py +115 -0
  35. code_graph_builder/parser_loader.py +344 -0
  36. code_graph_builder/parsers/__init__.py +7 -0
  37. code_graph_builder/parsers/call_processor.py +306 -0
  38. code_graph_builder/parsers/call_resolver.py +139 -0
  39. code_graph_builder/parsers/definition_processor.py +796 -0
  40. code_graph_builder/parsers/factory.py +119 -0
  41. code_graph_builder/parsers/import_processor.py +293 -0
  42. code_graph_builder/parsers/structure_processor.py +145 -0
  43. code_graph_builder/parsers/type_inference.py +143 -0
  44. code_graph_builder/parsers/utils.py +134 -0
  45. code_graph_builder/rag/__init__.py +68 -0
  46. code_graph_builder/rag/camel_agent.py +429 -0
  47. code_graph_builder/rag/client.py +298 -0
  48. code_graph_builder/rag/config.py +239 -0
  49. code_graph_builder/rag/cypher_generator.py +67 -0
  50. code_graph_builder/rag/llm_backend.py +210 -0
  51. code_graph_builder/rag/markdown_generator.py +352 -0
  52. code_graph_builder/rag/prompt_templates.py +440 -0
  53. code_graph_builder/rag/rag_engine.py +640 -0
  54. code_graph_builder/rag/review_report.md +172 -0
  55. code_graph_builder/rag/tests/__init__.py +3 -0
  56. code_graph_builder/rag/tests/test_camel_agent.py +313 -0
  57. code_graph_builder/rag/tests/test_client.py +221 -0
  58. code_graph_builder/rag/tests/test_config.py +177 -0
  59. code_graph_builder/rag/tests/test_markdown_generator.py +240 -0
  60. code_graph_builder/rag/tests/test_prompt_templates.py +160 -0
  61. code_graph_builder/services/__init__.py +39 -0
  62. code_graph_builder/services/graph_service.py +465 -0
  63. code_graph_builder/services/kuzu_service.py +665 -0
  64. code_graph_builder/services/memory_service.py +171 -0
  65. code_graph_builder/settings.py +75 -0
  66. code_graph_builder/tests/ACCEPTANCE_CRITERIA_PHASE2.md +401 -0
  67. code_graph_builder/tests/__init__.py +1 -0
  68. code_graph_builder/tests/run_acceptance_check.py +378 -0
  69. code_graph_builder/tests/test_api_find.py +231 -0
  70. code_graph_builder/tests/test_api_find_integration.py +226 -0
  71. code_graph_builder/tests/test_basic.py +78 -0
  72. code_graph_builder/tests/test_c_api_extraction.py +388 -0
  73. code_graph_builder/tests/test_call_resolution_scenarios.py +504 -0
  74. code_graph_builder/tests/test_embedder.py +411 -0
  75. code_graph_builder/tests/test_integration_semantic.py +434 -0
  76. code_graph_builder/tests/test_mcp_protocol.py +298 -0
  77. code_graph_builder/tests/test_mcp_user_flow.py +190 -0
  78. code_graph_builder/tests/test_rag.py +404 -0
  79. code_graph_builder/tests/test_settings.py +135 -0
  80. code_graph_builder/tests/test_step1_graph_build.py +264 -0
  81. code_graph_builder/tests/test_step2_api_docs.py +323 -0
  82. code_graph_builder/tests/test_step3_embedding.py +278 -0
  83. code_graph_builder/tests/test_vector_store.py +552 -0
  84. code_graph_builder/tools/__init__.py +40 -0
  85. code_graph_builder/tools/graph_query.py +495 -0
  86. code_graph_builder/tools/semantic_search.py +387 -0
  87. code_graph_builder/types.py +333 -0
  88. code_graph_builder/utils/__init__.py +0 -0
  89. code_graph_builder/utils/path_utils.py +30 -0
  90. code_graph_builder-0.2.0.dist-info/METADATA +321 -0
  91. code_graph_builder-0.2.0.dist-info/RECORD +93 -0
  92. code_graph_builder-0.2.0.dist-info/WHEEL +4 -0
  93. code_graph_builder-0.2.0.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,915 @@
1
+ #!/usr/bin/env python3
2
+ """通用代码 Wiki 生成器 - 对齐 deepwiki 两阶段流程。
3
+
4
+ 流程(对齐 deepwiki):
5
+ 阶段一(determineWikiStructure):
6
+ 读取文件树 + README → LLM 规划 XML 格式 Wiki 目录结构
7
+ → 解析出若干页面(每页含标题、描述、相关源文件)
8
+
9
+ 阶段二(generatePageContent):
10
+ 对每个规划页面,用页面标题作 query → 向量检索相关源码片段
11
+ → 使用 deepwiki page content prompt 生成含 Mermaid 图/表格/行号引用的 Markdown
12
+
13
+ 输出结构(对齐 deepwiki):
14
+ output_dir/
15
+ ├── index.md # summary hub:项目概览 + 页面索引表
16
+ └── wiki/
17
+ ├── <page-id>.md # 每个规划页面独立一个文件
18
+ └── ...
19
+
20
+ Wiki 模式(对齐 deepwiki):
21
+ --comprehensive 生成 8-12 页详细 wiki(默认)
22
+ --concise 生成 4-6 页简洁 wiki
23
+
24
+ Usage:
25
+ python generate_wiki.py --repo-path /path/to/repo
26
+ python generate_wiki.py --repo-path /path/to/repo --concise
27
+ python generate_wiki.py --repo-path /path/to/repo --max-pages 12
28
+ python generate_wiki.py --repo-path /path/to/repo --rebuild
29
+ python generate_wiki.py --repo-path /path/to/repo --output-dir ./my_wiki
30
+ python generate_wiki.py --repo-path /path/to/repo --pages page-1 page-3 # 重跑指定页面
31
+ """
32
+
33
+ from __future__ import annotations
34
+
35
+ import argparse
36
+ import os
37
+ import pickle
38
+ import re
39
+ import shutil
40
+ import subprocess
41
+ import sys
42
+ import tempfile
43
+ from datetime import datetime
44
+ from pathlib import Path
45
+
46
# Truncation limit for one function's source text placed into the LLM context.
MAX_SOURCE_CHARS_PER_FUNC = 2000
# Maximum number of retrieved functions included per page-generation prompt.
MAX_FUNCS_IN_CONTEXT = 8
# Batch size used when embedding function sources.
EMBED_BATCH_SIZE = 10

# Page-count caps for the two wiki modes (--comprehensive / --concise).
MAX_PAGES_COMPREHENSIVE = 10
MAX_PAGES_CONCISE = 5

# Maximum number of file-tree lines, to keep the planning prompt from growing too long.
MAX_FILETREE_LINES = 300
# Maximum number of README characters included in the prompt.
MAX_README_CHARS = 3000

# Repository root: three levels up from this file (examples/ -> package -> repo root).
PROJECT_ROOT = Path(__file__).parent.parent.parent
59
+
60
+
61
def setup_environment():
    """Prepare the process environment for wiki generation.

    Puts the project root on ``sys.path``, loads ``KEY=VALUE`` pairs from a
    ``.env`` file at the project root (already-set environment variables
    win), then exits with an error message if either required API key is
    still missing.
    """
    sys.path.insert(0, str(PROJECT_ROOT))

    env_file = PROJECT_ROOT / ".env"
    if env_file.exists():
        for raw in env_file.read_text().splitlines():
            entry = raw.strip()
            # Skip blanks, comments, and lines without a KEY=VALUE separator.
            if not entry or entry.startswith("#") or "=" not in entry:
                continue
            key, value = entry.split("=", 1)
            # setdefault: never clobber variables already set in the environment.
            os.environ.setdefault(key.strip(), value.strip())

    required = (
        ("MOONSHOT_API_KEY", "错误: MOONSHOT_API_KEY 未设置,请在 .env 文件或环境变量中配置"),
        ("DASHSCOPE_API_KEY", "错误: DASHSCOPE_API_KEY 未设置,向量检索需要 DashScope API Key"),
    )
    for key, message in required:
        if not os.getenv(key):
            print(message)
            sys.exit(1)
77
+
78
+
79
+ # ---------------------------------------------------------------------------
80
+ # 图构建
81
+ # ---------------------------------------------------------------------------
82
+
83
def build_or_load_graph(repo_path: Path, db_path: Path, rebuild: bool):
    """Build the code graph, or reuse an existing Kuzu database.

    Returns the CodeGraphBuilder instance either way; the graph is only
    (re)built when *rebuild* is set or when no database exists yet.
    """
    from code_graph_builder import CodeGraphBuilder

    builder = CodeGraphBuilder(
        repo_path=str(repo_path),
        backend="kuzu",
        backend_config={"db_path": str(db_path), "batch_size": 1000},
    )

    if not rebuild and db_path.exists():
        # Fast path: an earlier run already materialized the graph.
        print(f"复用已有图数据库: {db_path}")
    else:
        print(f"构建代码图: {repo_path} -> {db_path}")
        result = builder.build_graph(clean=rebuild)
        print(f" 节点: {result.nodes_created:,} 关系: {result.relationships_created:,}")

    return builder
101
+
102
+
103
+ # ---------------------------------------------------------------------------
104
+ # 源码读取(通用路径推导)
105
+ # ---------------------------------------------------------------------------
106
+
107
+ def resolve_source_file(qname: str, repo_path: Path) -> Path | None:
108
+ parts = qname.split(".")
109
+ if len(parts) < 3:
110
+ return None
111
+ dir_parts = parts[1:-1]
112
+ for depth in range(len(dir_parts), 0, -1):
113
+ for suffix in (".c", ".py", ".h", ".cpp", ".go", ".rs", ".js", ".ts"):
114
+ candidate = repo_path.joinpath(*dir_parts[:depth]).with_suffix(suffix)
115
+ if candidate.exists():
116
+ return candidate
117
+ return None
118
+
119
+
120
def read_function_source(func: dict, repo_path: Path) -> str | None:
    """Return the source text for one function record, or ``None``.

    *func* carries ``qualified_name``/``start_line``/``end_line`` as produced
    by the graph query. Records without usable line info (zero start, or
    start == end — presumably a declaration rather than a definition; TODO
    confirm against the graph builder) are skipped. Long bodies are cut at
    MAX_SOURCE_CHARS_PER_FUNC characters with a truncation marker.
    """
    start = func.get("start_line", 0)
    end = func.get("end_line", 0)
    if start == 0 or start == end:
        return None

    path = resolve_source_file(func.get("qualified_name", ""), repo_path)
    if path is None:
        return None

    try:
        with open(path, encoding="utf-8", errors="replace") as handle:
            file_lines = handle.readlines()
    except OSError:
        # Unreadable file — treat the same as "no source available".
        return None

    snippet = "".join(file_lines[start - 1 : end])
    if len(snippet) > MAX_SOURCE_CHARS_PER_FUNC:
        snippet = snippet[:MAX_SOURCE_CHARS_PER_FUNC] + "\n /* ... truncated ... */"
    return snippet
138
+
139
+
140
def build_source_context(results: list[dict], repo_path: Path) -> str:
    """Assemble retrieved function snippets into an LLM source-context string.

    Snippets are grouped by source file; each snippet is annotated with its
    function name and line range, and each file group is headed by the file's
    repo-relative path. Returns "" when no snippet could be read.

    Bug fixed: the group header previously emitted the literal text
    "## File Path: (unknown)" for every file (an f-string with no
    placeholder), so the LLM could never attribute snippets to files; it now
    interpolates the resolved filename.
    """
    file_chunks: dict[tuple[str, str], list[str]] = {}
    for func in results:
        source = read_function_source(func, repo_path)
        if not source:
            continue
        # Re-resolve the path to label the snippet; read_function_source
        # already verified the file exists and is readable.
        file_path = resolve_source_file(func.get("qualified_name", ""), repo_path)
        filename = str(file_path.relative_to(repo_path)) if file_path else "unknown"
        suffix = file_path.suffix.lstrip(".") if file_path else "c"
        entry = f"// {func['name']} (line {func['start_line']}-{func['end_line']})\n{source}"
        file_chunks.setdefault((filename, suffix), []).append(entry)

    if not file_chunks:
        return ""

    result_parts = []
    for (filename, suffix), chunks in file_chunks.items():
        # Interpolate the actual filename (was hard-coded to "(unknown)").
        header = f"## File Path: {filename}"
        body = "\n\n".join(f"```{suffix}\n{chunk}\n```" for chunk in chunks)
        result_parts.append(f"{header}\n\n{body}")

    return "\n\n----------\n\n".join(result_parts)
163
+
164
+
165
+ # ---------------------------------------------------------------------------
166
+ # Mermaid 语法验证(mmdc)
167
+ # ---------------------------------------------------------------------------
168
+
169
# Path to the Mermaid CLI; fall back to a conventional install location when
# `mmdc` is not on PATH (the fallback may not exist — validation then fails
# at subprocess time).
MMDC_PATH = shutil.which("mmdc") or "/usr/local/bin/mmdc"
170
+
171
+
172
def validate_mermaid_blocks(content: str) -> list[dict]:
    """Validate every Mermaid fenced block in *content* with the mmdc CLI.

    Returns a list of error dicts ``{index, code, error}`` where ``index``
    is the 1-based position of the failing block within *content*. An empty
    list means every block rendered (or there were no blocks at all).
    """
    blocks = re.findall(r"```mermaid\n(.*?)```", content, re.DOTALL)
    if not blocks:
        return []

    failures: list[dict] = []
    for position, diagram in enumerate(blocks, 1):
        # Write the diagram to a temp .mmd file; mmdc renders it to SVG.
        with tempfile.NamedTemporaryFile(suffix=".mmd", mode="w", delete=False) as handle:
            handle.write(diagram)
            in_path = handle.name
        out_path = in_path.replace(".mmd", ".svg")
        try:
            proc = subprocess.run(
                [MMDC_PATH, "-i", in_path, "-o", out_path],
                capture_output=True,
                text=True,
                timeout=30,
            )
            if proc.returncode != 0:
                raw = (proc.stderr or proc.stdout).strip()
                # Keep only the "Error:" line, dropping the stack trace.
                summaries = [ln for ln in raw.splitlines() if ln.startswith("Error:")]
                summary = summaries[0] if summaries else raw.split("\n")[0]
                failures.append({"index": position, "code": diagram.strip(), "error": summary})
        except subprocess.TimeoutExpired:
            failures.append({"index": position, "code": diagram.strip(), "error": "mmdc timeout"})
        finally:
            # Always clean up both temp files, even on timeout.
            Path(in_path).unlink(missing_ok=True)
            Path(out_path).unlink(missing_ok=True)

    return failures
208
+
209
+
210
# How many LLM repair attempts a broken Mermaid block gets before it is
# dropped from the page.
MAX_MERMAID_FIX_ATTEMPTS = 3
211
+
212
+
213
def _try_fix_once(code: str, error: str, attempt: int, agent) -> str | None:
    """Ask the LLM once to repair a broken Mermaid block.

    Returns the repaired code only when it re-validates cleanly through
    mmdc; any failure (agent error, empty reply, still-invalid syntax)
    yields ``None`` so the caller can retry or give up.
    """
    attempt_note = f"(第 {attempt} 次尝试)" if attempt > 1 else ""
    prompt = f"""以下 Mermaid 图表代码存在语法错误,请修复它。{attempt_note}

错误信息:{error}

原始代码:
```mermaid
{code}
```

要求:
- 只输出修复后的 Mermaid 代码,不要任何解释或 markdown 围栏
- 保持图表的原始意图和内容不变
- 使用合法的 Mermaid 语法(graph TD、sequenceDiagram、classDiagram 等)
- 节点 ID 只使用字母、数字和下划线,不使用特殊字符
- 节点标签中的特殊字符用双引号包裹

直接输出修复后的代码:"""

    try:
        reply = agent.analyze(task=prompt)
        # Strip any markdown fences the model added despite instructions.
        candidate = reply.content.strip()
        candidate = re.sub(r"^```mermaid\s*\n?", "", candidate)
        candidate = re.sub(r"\n?```\s*$", "", candidate)
        candidate = candidate.strip()
        if not candidate:
            return None
        # Accept the repair only if mmdc validates it.
        if validate_mermaid_blocks(f"```mermaid\n{candidate}\n```"):
            return None
        return candidate
    except Exception:
        # Any failure — agent error, mmdc missing, etc. — counts as "not fixed".
        return None
248
+
249
+
250
def fix_mermaid_errors(content: str, errors: list[dict], agent) -> tuple[str, list[dict]]:
    """Repair (or drop) every Mermaid block in *content* flagged in *errors*.

    Each flagged block gets up to MAX_MERMAID_FIX_ATTEMPTS LLM repair rounds;
    a block that never validates is removed from the page. Returns the
    updated content together with the list of error entries whose blocks
    were deleted.
    """
    if not errors:
        return content, []

    by_index = {entry["index"]: entry for entry in errors}
    dropped: list[dict] = []
    pieces: list[str] = []
    cursor = 0

    for block_no, match in enumerate(re.finditer(r"```mermaid\n(.*?)```", content, re.DOTALL), 1):
        # Copy the text between the previous block and this one verbatim.
        pieces.append(content[cursor:match.start()])

        if block_no not in by_index:
            # Block validated earlier — keep it untouched.
            pieces.append(match.group(0))
        else:
            entry = by_index[block_no]
            repaired = None
            for attempt in range(1, MAX_MERMAID_FIX_ATTEMPTS + 1):
                print(f" 修复 Mermaid 块 #{block_no}(第 {attempt}/{MAX_MERMAID_FIX_ATTEMPTS} 次)...")
                repaired = _try_fix_once(entry["code"], entry["error"], attempt, agent)
                if repaired is not None:
                    print(f" 块 #{block_no} 第 {attempt} 次修复成功")
                    break
                print(f" 块 #{block_no} 第 {attempt} 次修复失败")

            if repaired is not None:
                pieces.append(f"```mermaid\n{repaired}\n```")
            else:
                # Out of attempts: drop the block from the page entirely.
                dropped.append(entry)
                print(f" 块 #{block_no} 超过 {MAX_MERMAID_FIX_ATTEMPTS} 次仍失败,已删除")

        cursor = match.end()

    pieces.append(content[cursor:])
    return "".join(pieces), dropped
293
+
294
+
295
+ # ---------------------------------------------------------------------------
296
+ # Embedding 索引构建(deepwiki 风格)
297
+ # ---------------------------------------------------------------------------
298
+
299
def build_vector_index(builder, repo_path: Path, vectors_path: Path, rebuild: bool):
    """Embed every function's source and return the in-memory vector index.

    Returns ``(vector_store, embedder, func_map)``, where *func_map* maps the
    stored ``node_id`` (the function's position in the query result) back to
    its metadata dict. Results are pickled to *vectors_path* and reloaded on
    subsequent runs unless *rebuild* is set.

    Fix: the record-building loop wrapped its ``zip`` in ``enumerate()`` but
    never used the resulting index — the dead variable is removed.
    """
    from code_graph_builder.embeddings.qwen3_embedder import create_embedder
    from code_graph_builder.embeddings.vector_store import MemoryVectorStore, VectorRecord

    embedder = create_embedder(batch_size=EMBED_BATCH_SIZE)

    if not rebuild and vectors_path.exists():
        print(f"从缓存加载向量索引: {vectors_path}")
        with open(vectors_path, "rb") as fh:
            # NOTE: pickle.load executes arbitrary code if the cache file is
            # tampered with — acceptable for a local cache this script wrote
            # itself, but never load caches from untrusted sources.
            cache = pickle.load(fh)
        vector_store: MemoryVectorStore = cache["vector_store"]
        func_map: dict[int, dict] = cache["func_map"]
        print(f" 已加载 {len(vector_store)} 条 embedding")
        return vector_store, embedder, func_map

    print("构建向量索引(Embedding 所有函数源码)...")
    rows = builder.query(
        "MATCH (f:Function) RETURN f.name, f.qualified_name, f.start_line, f.end_line"
    )
    all_funcs: list[dict] = []
    for row in rows:
        # Backends differ in row shape: some wrap values under "result",
        # others return a plain column dict — presumably; verify per backend.
        vals = row.get("result") or list(row.values())
        name, qname, start_line, end_line = vals
        all_funcs.append({
            "name": name,
            "qualified_name": qname,
            "start_line": start_line or 0,
            "end_line": end_line or 0,
        })

    # Keep only functions whose source we can actually read from disk.
    embeddable: list[tuple[int, dict, str]] = []
    for i, func in enumerate(all_funcs):
        source = read_function_source(func, repo_path)
        if source:
            text = f"// {func['name']}\n{source}"
            embeddable.append((i, func, text))

    print(f" 共 {len(all_funcs)} 个函数,{len(embeddable)} 个有源码,开始 embedding...")
    texts = [t for _, _, t in embeddable]
    embeddings = embedder.embed_documents(texts, show_progress=True)

    vector_store = MemoryVectorStore(dimension=embedder.get_embedding_dimension())
    func_map: dict[int, dict] = {}
    records = []
    # Pair each embeddable entry with its embedding (same order as `texts`).
    for (node_id, func, _), embedding in zip(embeddable, embeddings):
        records.append(VectorRecord(
            node_id=node_id,
            qualified_name=func["qualified_name"],
            embedding=embedding,
            metadata={
                "name": func["name"],
                "start_line": func["start_line"],
                "end_line": func["end_line"],
            },
        ))
        func_map[node_id] = func

    vector_store.store_embeddings_batch(records)
    print(f" 写入 {len(records)} 条 embedding,保存缓存到 {vectors_path}")
    with open(vectors_path, "wb") as fh:
        pickle.dump({"vector_store": vector_store, "func_map": func_map}, fh)

    return vector_store, embedder, func_map
364
+
365
def semantic_search_funcs(
    query: str,
    vector_store,
    embedder,
    func_map: dict[int, dict],
    top_k: int,
) -> list[dict]:
    """Return up to *top_k* function dicts semantically matching *query*.

    Embedding failures degrade gracefully to an empty list; hits whose
    ``node_id`` is missing from *func_map* are silently dropped.
    """
    try:
        query_embedding = embedder.embed_query(query)
    except Exception:
        # No embedding, no search — callers treat [] as "no context found".
        return []
    hits = vector_store.search_similar(query_embedding, top_k=top_k)
    return [func for hit in hits if (func := func_map.get(hit.node_id))]
384
+
385
+
386
+ # ---------------------------------------------------------------------------
387
+ # 阶段一:deepwiki determineWikiStructure — 规划 Wiki 目录
388
+ # ---------------------------------------------------------------------------
389
+
390
def build_file_tree(repo_path: Path) -> str:
    """Produce a newline-separated file listing of *repo_path*.

    Hidden entries (dot-prefixed), common build/vendor directories, and
    ``*.egg-info`` directories are skipped. Output is capped at
    MAX_FILETREE_LINES entries to keep the planning prompt small.

    Bug fixed: the literal string "*.egg-info" sat in the ignore set, where
    exact membership can never match a real path component (set lookup is
    not glob matching); egg-info build artifacts are now filtered via
    ``endswith``.
    """
    ignore_dirs = {
        ".git", ".github", "__pycache__", "node_modules", ".venv", "venv",
        "dist", "build", ".idea", ".vscode",
    }

    def _ignored(part: str) -> bool:
        # Hidden entries, known junk directories, and setuptools egg-info dirs.
        return part.startswith(".") or part in ignore_dirs or part.endswith(".egg-info")

    lines = []
    for p in sorted(repo_path.rglob("*")):
        parts = p.relative_to(repo_path).parts
        if any(_ignored(part) for part in parts):
            continue
        if p.is_file():
            lines.append(str(p.relative_to(repo_path)))
            if len(lines) >= MAX_FILETREE_LINES:
                lines.append("... (truncated)")
                break
    return "\n".join(lines)
409
+
410
+
411
def read_readme(repo_path: Path) -> str:
    """Return the repository README text, truncated to MAX_README_CHARS.

    Tries the common README filenames in order; falls back to a placeholder
    string when none exists.
    """
    for candidate in ("README.md", "README.rst", "README.txt", "README"):
        path = repo_path / candidate
        if not path.exists():
            continue
        body = path.read_text(encoding="utf-8", errors="replace")
        if len(body) > MAX_README_CHARS:
            body = body[:MAX_README_CHARS] + "\n... (truncated)"
        return body
    return "(no README found)"
421
+
422
+
423
def plan_wiki_structure(agent, repo_path: Path, project_name: str, comprehensive: bool) -> list[dict]:
    """Stage one: ask the LLM to plan the wiki structure; return the page list.

    Mirrors deepwiki's determineWikiStructure prompt. Each returned page is a
    dict: {id, title, description, importance, relevant_files, related_pages}.
    Returns [] when no <wiki_structure> XML can be extracted from the reply.
    """
    file_tree = build_file_tree(repo_path)
    readme = read_readme(repo_path)
    page_count = "8-12" if comprehensive else "4-6"

    prompt = f"""Analyze this repository "{project_name}" and create a wiki structure for it.

1. The complete file tree of the project:
<file_tree>
{file_tree}
</file_tree>

2. The README file of the project:
<readme>
{readme}
</readme>

I want to create a wiki for this repository. Determine the most logical structure for a wiki based on the repository's content.

The wiki content will be generated in Mandarin Chinese (中文).

When designing the wiki structure, include pages that would benefit from visual diagrams, such as:
- Architecture overviews
- Data flow descriptions
- Component relationships
- Process workflows
- State machines
- Class hierarchies

{"Create a structured wiki with sections covering: Overview, System Architecture, Core Features, Data Management/Flow, Key Modules/Components, APIs/Interfaces, and Deployment/Configuration." if comprehensive else "Create a concise wiki focusing on the most important aspects."}

Return your analysis in the following XML format:

<wiki_structure>
<title>[Overall title for the wiki]</title>
<description>[Brief description of the repository]</description>
<pages>
<page id="page-1">
<title>[Page title]</title>
<description>[Brief description of what this page will cover]</description>
<importance>high|medium|low</importance>
<relevant_files>
<file_path>[Path to a relevant file]</file_path>
</relevant_files>
<related_pages>
<related>page-2</related>
</related_pages>
</page>
</pages>
</wiki_structure>

IMPORTANT FORMATTING INSTRUCTIONS:
- Return ONLY the valid XML structure specified above
- DO NOT wrap the XML in markdown code blocks
- DO NOT include any explanation text before or after the XML
- Start directly with <wiki_structure> and end with </wiki_structure>

IMPORTANT:
1. Create {page_count} pages that would make a {"comprehensive" if comprehensive else "concise"} wiki for this repository
2. Each page should focus on a specific aspect of the codebase
3. The relevant_files should be actual files from the repository
4. Return ONLY valid XML"""

    print("阶段一:规划 Wiki 目录结构...")
    response = agent.analyze(task=prompt)
    xml_text = response.content.strip()

    # Extract the XML (in case the LLM wrapped it in a markdown code block).
    xml_match = re.search(r"<wiki_structure>.*?</wiki_structure>", xml_text, re.DOTALL)
    if not xml_match:
        print(" 警告:未能解析 XML,使用空结构")
        return []

    xml_text = xml_match.group(0)

    # Parse the page list out of the structure XML with lightweight regexes.
    pages = []
    for page_match in re.finditer(r"<page\s+id=[\"']([^\"']+)[\"']>(.*?)</page>", xml_text, re.DOTALL):
        page_id = page_match.group(1)
        page_xml = page_match.group(2)

        title_m = re.search(r"<title>(.*?)</title>", page_xml, re.DOTALL)
        desc_m = re.search(r"<description>(.*?)</description>", page_xml, re.DOTALL)
        importance_m = re.search(r"<importance>(.*?)</importance>", page_xml)
        files = re.findall(r"<file_path>(.*?)</file_path>", page_xml)
        related = re.findall(r"<related>(.*?)</related>", page_xml)

        # Fall back to sensible defaults when an optional element is absent.
        pages.append({
            "id": page_id,
            "title": title_m.group(1).strip() if title_m else page_id,
            "description": desc_m.group(1).strip() if desc_m else "",
            "importance": importance_m.group(1).strip() if importance_m else "medium",
            "relevant_files": [f.strip() for f in files],
            "related_pages": related,
        })

    print(f" 规划了 {len(pages)} 个页面:")
    for p in pages:
        print(f" [{p['importance']}] {p['id']}: {p['title']}")

    return pages
529
+
530
+
531
+ # ---------------------------------------------------------------------------
532
+ # 阶段二:deepwiki generatePageContent — 生成页面内容
533
+ # ---------------------------------------------------------------------------
534
+
535
def generate_page_content(
    page: dict,
    agent,
    repo_path: Path,
    vector_store,
    embedder,
    func_map: dict[int, dict],
) -> str:
    """Stage two: generate one page's Markdown, mirroring deepwiki's
    generatePageContent prompt.

    - The page title + description seed a vector search for relevant source.
    - The prompt demands: a leading <details> source-file block, Mermaid
      diagrams, tables, and line-number citations in the output.
    """
    # Vector-search for source code related to this page.
    query = f"{page['title']} {page['description']}"
    funcs = semantic_search_funcs(query, vector_store, embedder, func_map, MAX_FUNCS_IN_CONTEXT)

    # Also read any files the planner explicitly listed (first 5 only).
    extra_context_parts = []
    for rel_file in page.get("relevant_files", [])[:5]:
        fpath = repo_path / rel_file
        if fpath.exists() and fpath.is_file():
            try:
                text = fpath.read_text(encoding="utf-8", errors="replace")
                if len(text) > 4000:
                    text = text[:4000] + "\n... (truncated)"
                suffix = fpath.suffix.lstrip(".") or "txt"
                extra_context_parts.append(
                    f"## File Path: {rel_file}\n\n```{suffix}\n{text}\n```"
                )
            except OSError:
                pass

    source_context = build_source_context(funcs, repo_path)

    # Combine planner-specified files with vector-retrieved snippets.
    all_context_parts = []
    if extra_context_parts:
        all_context_parts.extend(extra_context_parts)
    if source_context:
        all_context_parts.append(source_context)

    full_context = "\n\n----------\n\n".join(all_context_parts) if all_context_parts else "(源码暂不可访问)"

    # Every referenced file, for the <details> block at the top of the page.
    file_refs = list(page.get("relevant_files", []))
    for func in funcs:
        fp = resolve_source_file(func.get("qualified_name", ""), repo_path)
        if fp:
            rel = str(fp.relative_to(repo_path))
            if rel not in file_refs:
                file_refs.append(rel)

    details_files = "\n".join(f"- {f}" for f in file_refs) if file_refs else "- (自动检索)"

    prompt = f"""你是一名专家级技术作家和软件架构师。
你的任务是为软件项目生成一篇关于特定功能、系统或模块的全面、准确的技术 Wiki 页面(Markdown 格式)。

Wiki 页面主题:**{page['title']}**
页面描述:{page['description']}

以下是从项目中检索到的相关源文件内容,你必须以此作为内容的唯一依据:

<START_OF_CONTEXT>
{full_context}
<END_OF_CONTEXT>

请严格按照以下要求生成内容:

**开头必须是 `<details>` 块**,列出所有参考源文件,格式如下(不得在此之前输出任何内容):

<details>
<summary>Relevant source files</summary>

以下文件被用于生成本 Wiki 页面:

{details_files}
</details>

紧接 `<details>` 块之后,使用 H1 标题:`# {page['title']}`

然后按以下要求生成正文内容:

1. **引言**:1-2 段,说明本页面主题的目的、范围和高层概述。

2. **详细章节**:使用 H2/H3 标题分节,说明架构、组件、数据流或核心逻辑。
识别关键函数、类、数据结构、API 端点或配置项。

3. **Mermaid 图表**(必须大量使用):
- 使用 `graph TD`(从上到下,禁止 `graph LR`)、`sequenceDiagram`、`classDiagram`、`erDiagram` 等
- 图表必须准确反映源文件中的实际结构和流程
- 每个图表前后都要有简短说明
- 序列图箭头规范:`->>` 请求、`-->>` 响应、`->x` 失败

4. **表格**(必须使用):用 Markdown 表格汇总关键信息,如:
- 关键函数/组件及其描述
- API 参数、类型、说明
- 配置项及默认值
- 数据模型字段

5. **代码片段**(可选):直接引用源文件中的关键实现片段,标注语言。

6. **源码引用**(极其重要):
- 每个重要信息点、图表、表格后必须标注来源
- 格式:`Sources: [filename.ext:start_line-end_line]()`
- 整篇文档必须引用至少 5 个不同源文件

7. **技术准确性**:所有信息必须且只能来自上方提供的源文件。

8. **结语**:用简短段落总结本页面的关键内容及其在项目中的意义。

请用**中文**生成内容。记住:
- 每个论断都必须来自源文件
- 优先保证准确性和对代码实际功能的直接描述
- 文档结构要便于其他开发者理解"""

    response = agent.analyze(task=prompt)
    return response.content
653
+
654
+
655
+ # ---------------------------------------------------------------------------
656
+ # Wiki 生成主流程
657
+ # ---------------------------------------------------------------------------
658
+
659
+ def generate_wiki(
660
+ builder,
661
+ repo_path: Path,
662
+ output_dir: Path,
663
+ max_pages: int,
664
+ rebuild: bool,
665
+ comprehensive: bool = True,
666
+ only_pages: list[str] | None = None,
667
+ ) -> tuple[Path, int]:
668
+ from code_graph_builder.rag.camel_agent import CamelAgent
669
+ from code_graph_builder.rag.client import create_llm_client
670
+
671
+ project_name = repo_path.name
672
+
673
+ llm_client = create_llm_client(
674
+ api_key=os.getenv("MOONSHOT_API_KEY"),
675
+ model=os.getenv("MOONSHOT_MODEL", "kimi-k2.5"),
676
+ temperature=1.0,
677
+ )
678
+ agent = CamelAgent(
679
+ role=f"{project_name} 技术文档专家",
680
+ goal=f"结合真实源码,为 {project_name} 生成专业、准确、图文并茂的技术 Wiki",
681
+ backstory=f"拥有丰富的技术写作和代码阅读经验,深入理解 {project_name} 源码架构",
682
+ llm_client=llm_client,
683
+ )
684
+
685
+ vectors_path = output_dir / f"{project_name}_vectors.pkl"
686
+ output_dir.mkdir(parents=True, exist_ok=True)
687
+
688
+ # 构建向量索引
689
+ vector_store, embedder, func_map = build_vector_index(
690
+ builder, repo_path, vectors_path, rebuild
691
+ )
692
+
693
+ # 阶段一:规划 Wiki 目录(或加载已有规划)
694
+ structure_cache = output_dir / f"{project_name}_structure.pkl"
695
+ if not rebuild and structure_cache.exists():
696
+ print(f"从缓存加载 Wiki 结构: {structure_cache}")
697
+ with open(structure_cache, "rb") as fh:
698
+ planned_pages = pickle.load(fh)
699
+ print(f" 已加载 {len(planned_pages)} 个页面规划")
700
+ else:
701
+ planned_pages = plan_wiki_structure(agent, repo_path, project_name, comprehensive)
702
+ with open(structure_cache, "wb") as fh:
703
+ pickle.dump(planned_pages, fh)
704
+
705
+ # 截取页面数量
706
+ if only_pages:
707
+ pages_to_generate = [p for p in planned_pages if p["id"] in only_pages]
708
+ else:
709
+ # 高重要性优先,再按规划顺序,取 max_pages 个
710
+ high = [p for p in planned_pages if p["importance"] == "high"]
711
+ others = [p for p in planned_pages if p["importance"] != "high"]
712
+ ordered = high + others
713
+ pages_to_generate = ordered[:max_pages]
714
+
715
+ wiki_mode = "详细(Comprehensive,8-12页)" if comprehensive else "简洁(Concise,4-6页)"
716
+ print(f"\n将生成 {len(pages_to_generate)} 个 Wiki 页面 [模式: {wiki_mode}]")
717
+
718
+ # 阶段二:逐页生成内容
719
+ wiki_dir = output_dir / "wiki"
720
+ wiki_dir.mkdir(parents=True, exist_ok=True)
721
+
722
+ gen_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
723
+ model_name = os.getenv("MOONSHOT_MODEL", "kimi-k2.5")
724
+
725
+ generated: list[dict] = []
726
+ all_mermaid_errors: dict[str, list[dict]] = {}
727
+
728
+ for i, page in enumerate(pages_to_generate, 1):
729
+ print(f"\n[{i}/{len(pages_to_generate)}] {page['id']}: {page['title']}...")
730
+ try:
731
+ content = generate_page_content(
732
+ page, agent, repo_path, vector_store, embedder, func_map
733
+ )
734
+ mermaid_errors = validate_mermaid_blocks(content)
735
+ if mermaid_errors:
736
+ print(f" ⚠️ Mermaid 语法错误: {len(mermaid_errors)} 个块,尝试修复...")
737
+ content, deleted = fix_mermaid_errors(content, mermaid_errors, agent)
738
+ if deleted:
739
+ all_mermaid_errors[page["id"]] = deleted
740
+ print(f" ⚠️ {len(deleted)} 个块超过 {MAX_MERMAID_FIX_ATTEMPTS} 次修复失败,已删除")
741
+ else:
742
+ print(f" ✓ 全部 {len(mermaid_errors)} 个 Mermaid 块修复成功")
743
+ page_file = wiki_dir / f"{page['id']}.md"
744
+ page_file.write_text(content, encoding="utf-8")
745
+ generated.append({**page, "content": content})
746
+ print(f" 完成 ({len(content)} 字符, {page_file.stat().st_size:,} 字节)")
747
+ except Exception as e:
748
+ print(f" 失败: {e}")
749
+ err_content = f"# {page['title']}\n\n*生成失败: {e}*"
750
+ page_file = wiki_dir / f"{page['id']}.md"
751
+ page_file.write_text(err_content, encoding="utf-8")
752
+ generated.append({**page, "content": err_content})
753
+
754
+ # only_pages 模式只更新指定页面,跳过 index.md
755
+ if only_pages:
756
+ print(f"\n页面文件已更新:")
757
+ for p in generated:
758
+ pf = wiki_dir / f"{p['id']}.md"
759
+ print(f" wiki/{p['id']}.md ({pf.stat().st_size:,} 字节)")
760
+ if all_mermaid_errors:
761
+ total_deleted = sum(len(v) for v in all_mermaid_errors.values())
762
+ print(f"\nMermaid 删除报告 ({total_deleted} 个块已删除):")
763
+ for pid, errs in all_mermaid_errors.items():
764
+ for e in errs:
765
+ print(f" [{pid}] 块#{e['index']}: {e['error']}")
766
+ return output_dir / "index.md", len(generated)
767
+
768
+ # 统计数据
769
+ total_funcs_row = builder.query("MATCH (f:Function) RETURN count(f) AS cnt")
770
+ total_funcs = list(total_funcs_row[0].values())[0] if total_funcs_row else 0
771
+ total_calls_row = builder.query("MATCH ()-[r:CALLS]->() RETURN count(r) AS cnt")
772
+ total_calls = list(total_calls_row[0].values())[0] if total_calls_row else 0
773
+
774
+ # 写 index.md
775
+ mode_label = "详细 Comprehensive" if comprehensive else "简洁 Concise"
776
+ index_path = output_dir / "index.md"
777
+ index_lines = [
778
+ f"# {project_name} 源码 Wiki",
779
+ "",
780
+ f"*生成时间: {gen_time}*",
781
+ f"*模型: {model_name} | 模式: {mode_label} | 上下文检索: 向量语义检索(Qwen3 Embedding)*",
782
+ "",
783
+ "---",
784
+ "",
785
+ "## 项目概览",
786
+ "",
787
+ "| 指标 | 数值 |",
788
+ "|------|------|",
789
+ f"| 总函数数 | {total_funcs:,} |",
790
+ f"| 总调用关系 | {total_calls:,} |",
791
+ f"| 本次生成页面 | {len(generated)} |",
792
+ "",
793
+ "---",
794
+ "",
795
+ "## Wiki 页面索引",
796
+ "",
797
+ "| 重要性 | 页面 | 描述 |",
798
+ "|--------|------|------|",
799
+ ]
800
+ for p in generated:
801
+ importance_icon = {"high": "🔴", "medium": "🟡", "low": "🟢"}.get(p["importance"], "⚪")
802
+ index_lines.append(
803
+ f"| {importance_icon} {p['importance']} | [{p['title']}](./wiki/{p['id']}.md) | {p['description'][:60]}... |"
804
+ if len(p["description"]) > 60
805
+ else f"| {importance_icon} {p['importance']} | [{p['title']}](./wiki/{p['id']}.md) | {p['description']} |"
806
+ )
807
+ index_lines += ["", "---", "", "## 详细文档", ""]
808
+ for p in generated:
809
+ index_lines.append(f"- [{p['title']}](./wiki/{p['id']}.md) — {p['description']}")
810
+
811
+ index_path.write_text("\n".join(index_lines), encoding="utf-8")
812
+
813
+ total_size = sum((wiki_dir / f"{p['id']}.md").stat().st_size for p in generated) + index_path.stat().st_size
814
+ print(f"\nWiki 已保存到: {output_dir}/")
815
+ print(f" index.md ({index_path.stat().st_size:,} 字节)")
816
+ for p in generated:
817
+ pf = wiki_dir / f"{p['id']}.md"
818
+ print(f" wiki/{p['id']}.md ({pf.stat().st_size:,} 字节)")
819
+ print(f"总大小: {total_size:,} 字节 | 总页面数: {len(generated)}")
820
+ if all_mermaid_errors:
821
+ total_deleted = sum(len(v) for v in all_mermaid_errors.values())
822
+ print(f"\nMermaid 删除报告 ({total_deleted} 个块已删除,共 {len(all_mermaid_errors)} 个页面):")
823
+ for pid, errs in all_mermaid_errors.items():
824
+ for e in errs:
825
+ print(f" [{pid}] 块#{e['index']}: {e['error']}")
826
+ else:
827
+ print("Mermaid 验证: 全部通过 ✓")
828
+ return index_path, len(generated)
829
+
830
+
831
+ # ---------------------------------------------------------------------------
832
+ # 入口
833
+ # ---------------------------------------------------------------------------
834
+
835
def main():
    """CLI entry point: parse arguments, build/load the code graph, and generate the wiki.

    Exits with status 1 when the repository path does not exist or when wiki
    generation raises; otherwise prints a summary of the generated pages.
    """
    cli = argparse.ArgumentParser(
        description="通用代码 Wiki 生成器(对齐 deepwiki 两阶段流程)",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
示例:
  python generate_wiki.py --repo-path /path/to/redis # 默认详细模式
  python generate_wiki.py --repo-path /path/to/redis --concise # 简洁模式
  python generate_wiki.py --repo-path /path/to/redis --max-pages 12
  python generate_wiki.py --repo-path /path/to/redis --rebuild # 重新规划 + 重新 embedding
  python generate_wiki.py --repo-path /path/to/redis --pages page-1 page-3 # 重跑指定页面
""",
    )
    cli.add_argument("--repo-path", type=Path, required=True, help="目标代码仓库路径")
    # Exactly one of --comprehensive/--concise may be given; comprehensive is the default.
    mode = cli.add_mutually_exclusive_group()
    mode.add_argument("--comprehensive", action="store_true", default=False,
                      help="生成详细 wiki(8-12 页,默认)")
    mode.add_argument("--concise", action="store_true", default=False,
                      help="生成简洁 wiki(4-6 页)")
    cli.add_argument("--max-pages", type=int, default=None,
                     help="最多生成几个页面(默认: comprehensive=10,concise=5)")
    cli.add_argument("--output-dir", type=Path, default=None,
                     help="wiki 输出目录(默认: ./<repo_name>_wiki/)")
    cli.add_argument("--db-path", type=Path, default=None,
                     help="Kùzu 数据库路径(默认: ./<repo_name>_graph.db)")
    cli.add_argument("--rebuild", action="store_true",
                     help="强制重新构建图、向量索引和 Wiki 结构规划")
    cli.add_argument("--pages", nargs="+", default=None, metavar="PAGE_ID",
                     help="只重新生成指定 page-id 的页面(空格分隔),需先有结构缓存")
    opts = cli.parse_args()

    setup_environment()

    # Guard clause: refuse to run against a path that does not exist.
    repo_path = opts.repo_path.resolve()
    if not repo_path.exists():
        print(f"错误: 仓库路径不存在: {repo_path}")
        sys.exit(1)

    # --comprehensive is the default mode, so only --concise flips it off.
    is_comprehensive = not opts.concise
    if opts.max_pages is not None:
        page_limit = opts.max_pages
    elif is_comprehensive:
        page_limit = MAX_PAGES_COMPREHENSIVE
    else:
        page_limit = MAX_PAGES_CONCISE

    # Derive default artifact locations from the repository directory name.
    project_name = repo_path.name
    db_path = opts.db_path or Path(f"./{project_name}_graph.db")
    output_dir = opts.output_dir or Path(f"./{project_name}_wiki")

    banner = [
        "=" * 60,
        "通用代码 Wiki 生成器(deepwiki 两阶段流程)",
        "=" * 60,
        f"仓库: {repo_path}",
        f"项目名: {project_name}",
        f"数据库: {db_path}",
        f"输出目录: {output_dir}",
        f"模式: {'详细 Comprehensive' if is_comprehensive else '简洁 Concise'}",
        f"最大页面: {page_limit}",
    ]
    print("\n".join(banner))

    builder = build_or_load_graph(repo_path, db_path, opts.rebuild)

    try:
        index_path, page_count = generate_wiki(
            builder=builder,
            repo_path=repo_path,
            output_dir=output_dir,
            max_pages=page_limit,
            rebuild=opts.rebuild,
            comprehensive=is_comprehensive,
            only_pages=opts.pages,
        )
        print(f"\n完成! 生成了 {page_count} 个页面")
        print(f"目录: {index_path.parent}/")
        print(f"入口: {index_path}")
    except Exception as e:
        # Top-level boundary: report, dump the traceback, and exit non-zero.
        print(f"\n错误: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)
912
+
913
+
914
+ if __name__ == "__main__":
915
+ main()