code-graph-builder 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93) hide show
  1. code_graph_builder/__init__.py +82 -0
  2. code_graph_builder/builder.py +366 -0
  3. code_graph_builder/cgb_cli.py +32 -0
  4. code_graph_builder/cli.py +564 -0
  5. code_graph_builder/commands_cli.py +1288 -0
  6. code_graph_builder/config.py +340 -0
  7. code_graph_builder/constants.py +708 -0
  8. code_graph_builder/embeddings/__init__.py +40 -0
  9. code_graph_builder/embeddings/qwen3_embedder.py +573 -0
  10. code_graph_builder/embeddings/vector_store.py +584 -0
  11. code_graph_builder/examples/__init__.py +0 -0
  12. code_graph_builder/examples/example_configuration.py +276 -0
  13. code_graph_builder/examples/example_kuzu_usage.py +109 -0
  14. code_graph_builder/examples/example_semantic_search_full.py +347 -0
  15. code_graph_builder/examples/generate_wiki.py +915 -0
  16. code_graph_builder/examples/graph_export_example.py +100 -0
  17. code_graph_builder/examples/rag_example.py +206 -0
  18. code_graph_builder/examples/test_cli_demo.py +129 -0
  19. code_graph_builder/examples/test_embedding_api.py +153 -0
  20. code_graph_builder/examples/test_kuzu_local.py +190 -0
  21. code_graph_builder/examples/test_rag_redis.py +390 -0
  22. code_graph_builder/graph_updater.py +605 -0
  23. code_graph_builder/guidance/__init__.py +1 -0
  24. code_graph_builder/guidance/agent.py +123 -0
  25. code_graph_builder/guidance/prompts.py +74 -0
  26. code_graph_builder/guidance/toolset.py +264 -0
  27. code_graph_builder/language_spec.py +536 -0
  28. code_graph_builder/mcp/__init__.py +21 -0
  29. code_graph_builder/mcp/api_doc_generator.py +764 -0
  30. code_graph_builder/mcp/file_editor.py +207 -0
  31. code_graph_builder/mcp/pipeline.py +777 -0
  32. code_graph_builder/mcp/server.py +161 -0
  33. code_graph_builder/mcp/tools.py +1800 -0
  34. code_graph_builder/models.py +115 -0
  35. code_graph_builder/parser_loader.py +344 -0
  36. code_graph_builder/parsers/__init__.py +7 -0
  37. code_graph_builder/parsers/call_processor.py +306 -0
  38. code_graph_builder/parsers/call_resolver.py +139 -0
  39. code_graph_builder/parsers/definition_processor.py +796 -0
  40. code_graph_builder/parsers/factory.py +119 -0
  41. code_graph_builder/parsers/import_processor.py +293 -0
  42. code_graph_builder/parsers/structure_processor.py +145 -0
  43. code_graph_builder/parsers/type_inference.py +143 -0
  44. code_graph_builder/parsers/utils.py +134 -0
  45. code_graph_builder/rag/__init__.py +68 -0
  46. code_graph_builder/rag/camel_agent.py +429 -0
  47. code_graph_builder/rag/client.py +298 -0
  48. code_graph_builder/rag/config.py +239 -0
  49. code_graph_builder/rag/cypher_generator.py +67 -0
  50. code_graph_builder/rag/llm_backend.py +210 -0
  51. code_graph_builder/rag/markdown_generator.py +352 -0
  52. code_graph_builder/rag/prompt_templates.py +440 -0
  53. code_graph_builder/rag/rag_engine.py +640 -0
  54. code_graph_builder/rag/review_report.md +172 -0
  55. code_graph_builder/rag/tests/__init__.py +3 -0
  56. code_graph_builder/rag/tests/test_camel_agent.py +313 -0
  57. code_graph_builder/rag/tests/test_client.py +221 -0
  58. code_graph_builder/rag/tests/test_config.py +177 -0
  59. code_graph_builder/rag/tests/test_markdown_generator.py +240 -0
  60. code_graph_builder/rag/tests/test_prompt_templates.py +160 -0
  61. code_graph_builder/services/__init__.py +39 -0
  62. code_graph_builder/services/graph_service.py +465 -0
  63. code_graph_builder/services/kuzu_service.py +665 -0
  64. code_graph_builder/services/memory_service.py +171 -0
  65. code_graph_builder/settings.py +75 -0
  66. code_graph_builder/tests/ACCEPTANCE_CRITERIA_PHASE2.md +401 -0
  67. code_graph_builder/tests/__init__.py +1 -0
  68. code_graph_builder/tests/run_acceptance_check.py +378 -0
  69. code_graph_builder/tests/test_api_find.py +231 -0
  70. code_graph_builder/tests/test_api_find_integration.py +226 -0
  71. code_graph_builder/tests/test_basic.py +78 -0
  72. code_graph_builder/tests/test_c_api_extraction.py +388 -0
  73. code_graph_builder/tests/test_call_resolution_scenarios.py +504 -0
  74. code_graph_builder/tests/test_embedder.py +411 -0
  75. code_graph_builder/tests/test_integration_semantic.py +434 -0
  76. code_graph_builder/tests/test_mcp_protocol.py +298 -0
  77. code_graph_builder/tests/test_mcp_user_flow.py +190 -0
  78. code_graph_builder/tests/test_rag.py +404 -0
  79. code_graph_builder/tests/test_settings.py +135 -0
  80. code_graph_builder/tests/test_step1_graph_build.py +264 -0
  81. code_graph_builder/tests/test_step2_api_docs.py +323 -0
  82. code_graph_builder/tests/test_step3_embedding.py +278 -0
  83. code_graph_builder/tests/test_vector_store.py +552 -0
  84. code_graph_builder/tools/__init__.py +40 -0
  85. code_graph_builder/tools/graph_query.py +495 -0
  86. code_graph_builder/tools/semantic_search.py +387 -0
  87. code_graph_builder/types.py +333 -0
  88. code_graph_builder/utils/__init__.py +0 -0
  89. code_graph_builder/utils/path_utils.py +30 -0
  90. code_graph_builder-0.2.0.dist-info/METADATA +321 -0
  91. code_graph_builder-0.2.0.dist-info/RECORD +93 -0
  92. code_graph_builder-0.2.0.dist-info/WHEEL +4 -0
  93. code_graph_builder-0.2.0.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,777 @@
1
+ """Pipeline with progress callbacks for MCP: graph (+ api_docs) → embedding → wiki.
2
+
3
+ Each stage calls `progress_cb(message)` after every meaningful unit of work
4
+ so the MCP server can relay real-time updates to the client.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import json
10
+ import os
11
+ import pickle
12
+ from collections.abc import Callable
13
+ from datetime import datetime
14
+ from pathlib import Path
15
+ from typing import Any
16
+
17
+ from loguru import logger
18
+
19
+ try:
20
+ from ..rag.client import create_llm_client, LLMClient
21
+ except ImportError:
22
+ create_llm_client = None # type: ignore[assignment,misc]
23
+ LLMClient = None # type: ignore[assignment,misc]
24
+
25
+ ProgressCb = Callable[[str, float], None] | None
26
+ """Progress callback: (message, percentage_0_to_100) -> None.
27
+
28
+ Pipeline weight allocation:
29
+ Step 1 (graph + API docs): 0 – 15 %
30
+ Step 2 (embeddings): 15 – 40 %
31
+ Step 3 (wiki generation): 40 – 100 %
32
+ """
33
+
34
+
35
+ # ---------------------------------------------------------------------------
36
+ # Helpers shared with generate_wiki.py
37
+ # ---------------------------------------------------------------------------
38
+
39
+ def _resolve_source_file(qname: str, repo_path: Path) -> Path | None:
40
+ parts = qname.split(".")
41
+ if len(parts) < 3:
42
+ return None
43
+ dir_parts = parts[1:-1]
44
+ for depth in range(len(dir_parts), 0, -1):
45
+ for suffix in (".c", ".py", ".h", ".cpp", ".go", ".rs", ".js", ".ts"):
46
+ candidate = repo_path.joinpath(*dir_parts[:depth]).with_suffix(suffix)
47
+ if candidate.exists():
48
+ return candidate
49
+ return None
50
+
51
+
52
+ _MAX_SOURCE_CHARS = 2000
53
+
54
+
55
+ def _read_function_source(func: dict, repo_path: Path) -> str | None:
56
+ qname = func.get("qualified_name", "")
57
+ start_line = func.get("start_line", 0)
58
+ end_line = func.get("end_line", 0)
59
+ if start_line == 0 or start_line == end_line:
60
+ return None
61
+ file_path = _resolve_source_file(qname, repo_path)
62
+ if file_path is None:
63
+ return None
64
+ try:
65
+ with open(file_path, encoding="utf-8", errors="replace") as fh:
66
+ lines = fh.readlines()
67
+ source = "".join(lines[start_line - 1: end_line])
68
+ if len(source) > _MAX_SOURCE_CHARS:
69
+ source = source[:_MAX_SOURCE_CHARS] + "\n /* ... truncated ... */"
70
+ return source
71
+ except OSError:
72
+ return None
73
+
74
+
75
+ # ---------------------------------------------------------------------------
76
+ # Step 1: graph build + API docs generation
77
+ # ---------------------------------------------------------------------------
78
+
79
# --- Cypher queries for the API-docs step ---------------------------------
# The resulting rows are passed to generate_api_docs() in
# generate_api_docs_step() below.

# Every function together with its defining module.
_FUNC_DOC_QUERY = """
MATCH (m:Module)-[:DEFINES]->(f:Function)
RETURN DISTINCT m.qualified_name, m.path,
f.qualified_name, f.name, f.signature, f.return_type,
f.visibility, f.parameters, f.docstring,
f.start_line, f.end_line, f.path, f.kind
ORDER BY m.qualified_name, f.start_line
"""

# Class-like nodes for the type reference pages.
_TYPE_DOC_QUERY_CLASS = """
MATCH (c:Class)
RETURN DISTINCT c.qualified_name, c.name, c.kind, c.signature,
c.parameters, c.start_line, c.end_line
ORDER BY c.qualified_name, c.start_line
"""

# Other named Type nodes (note: no `parameters` column, unlike Class).
_TYPE_DOC_QUERY_TYPE = """
MATCH (t:Type)
RETURN DISTINCT t.qualified_name, t.name, t.kind, t.signature,
t.start_line, t.end_line
ORDER BY t.qualified_name, t.start_line
"""

# Caller -> callee edges, with the callee's location for cross-references.
_CALLS_QUERY = """
MATCH (caller:Function)-[:CALLS]->(callee:Function)
RETURN DISTINCT caller.qualified_name, callee.qualified_name,
callee.path, callee.start_line
"""
107
+
108
+
109
def build_graph(
    repo_path: Path,
    db_path: Path,
    rebuild: bool,
    progress_cb: ProgressCb = None,
    backend: str = "kuzu",
) -> Any:
    """Build (or reuse) the code knowledge graph and return the builder.

    Only the graph database itself is produced here — API docs, embeddings
    and wiki generation are separate pipeline steps.
    """
    from ..builder import CodeGraphBuilder

    builder = CodeGraphBuilder(
        repo_path=str(repo_path),
        backend=backend,
        backend_config={"db_path": str(db_path), "batch_size": 1000},
    )

    needs_build = rebuild or not db_path.exists()
    if needs_build:
        outcome = builder.build_graph(clean=rebuild)
        message = (
            f"Graph built: "
            f"{outcome.nodes_created} nodes, "
            f"{outcome.relationships_created} relationships, "
            f"{outcome.files_processed} files processed."
        )
    else:
        stats = builder.get_statistics()
        message = (
            f"Reusing existing graph: "
            f"{stats.get('node_count', '?')} nodes, "
            f"{stats.get('relationship_count', '?')} relationships."
        )

    # Step 1 owns the 0-15% band; 10% marks the database as ready.
    if progress_cb:
        progress_cb(message, 10.0)

    return builder
150
+
151
+
152
+ # ---------------------------------------------------------------------------
153
+ # Step 2: API docs generation (graph-only, no embeddings needed)
154
+ # ---------------------------------------------------------------------------
155
+
156
def generate_api_docs_step(
    builder: Any,
    artifact_dir: Path,
    rebuild: bool,
    progress_cb: ProgressCb = None,
) -> dict[str, Any]:
    """Generate hierarchical API docs from the knowledge graph.

    Needs only a populated graph database — no embeddings or LLM involved.
    Returns a status dict: "cached", "skipped" (with the error), or
    "success" merged with the generator's counters.
    """
    from .api_doc_generator import generate_api_docs

    index_file = artifact_dir / "api_docs" / "index.md"

    # An existing index means a previous run finished; reuse it unless forced.
    if not rebuild and index_file.exists():
        if progress_cb:
            progress_cb("Reusing cached API docs.", 15.0)
        return {"status": "cached"}

    try:
        func_rows = builder.query(_FUNC_DOC_QUERY)
        type_rows = builder.query(_TYPE_DOC_QUERY_CLASS) + builder.query(_TYPE_DOC_QUERY_TYPE)
        call_rows = builder.query(_CALLS_QUERY)
    except Exception as exc:
        # Best-effort step: report and continue the pipeline without docs.
        msg = f"API docs skipped — graph query failed: {exc}"
        logger.warning(msg)
        if progress_cb:
            progress_cb(msg, 15.0)
        return {"status": "skipped", "error": str(exc)}

    result = generate_api_docs(func_rows, type_rows, call_rows, artifact_dir)
    if progress_cb:
        progress_cb(
            f"API docs generated: "
            f"{result['module_count']} modules, "
            f"{result['func_count']} functions, "
            f"{result['type_count']} types.",
            15.0,
        )
    return {"status": "success", **result}
197
+
198
+
199
+ # ---------------------------------------------------------------------------
200
+ # Step 1b: LLM-powered description generation for undocumented functions
201
+ # ---------------------------------------------------------------------------
202
+
203
# System prompt for the batched description-generation calls in
# generate_descriptions_step() (Step 1b).
_DESC_SYSTEM_PROMPT = """\
You are a code documentation assistant. Given a C/C++ function's signature, \
source code, and module context, generate a single concise sentence (in the \
same language as any existing comments in the code, defaulting to English) \
describing what the function does. Focus on the function's PURPOSE, not its \
implementation details. Do NOT include the function name in the description. \
Reply with ONLY the description sentence, nothing else."""

# Number of functions described per LLM request.
_DESC_BATCH_SIZE = 10
212
+
213
+
214
+ def _build_desc_prompt(funcs: list[dict]) -> str:
215
+ """Build a batched prompt for multiple functions."""
216
+ parts: list[str] = []
217
+ for i, f in enumerate(funcs):
218
+ sig = f.get("signature") or f.get("name", "unknown")
219
+ source = f.get("source", "")
220
+ module = f.get("module_qn", "")
221
+ parts.append(
222
+ f"[{i+1}] Module: {module}\n"
223
+ f" Signature: {sig}\n"
224
+ f" Source:\n{source}\n"
225
+ )
226
+ parts.append(
227
+ f"\nGenerate exactly {len(funcs)} descriptions, one per line, "
228
+ f"numbered [1] to [{len(funcs)}]. Each description should be a "
229
+ f"single concise sentence."
230
+ )
231
+ return "\n".join(parts)
232
+
233
+
234
+ def _parse_desc_response(response: str, count: int) -> list[str]:
235
+ """Parse numbered descriptions from LLM response."""
236
+ import re
237
+
238
+ descriptions: list[str] = [""] * count
239
+ for line in response.strip().splitlines():
240
+ line = line.strip()
241
+ if not line:
242
+ continue
243
+ # Match "[N] desc", "N. desc", or "N) desc" with regex
244
+ m = re.match(r"^\[?(\d+)[.\)\]]\s*(.*)", line)
245
+ if m:
246
+ idx = int(m.group(1)) - 1 # 1-based to 0-based
247
+ desc = m.group(2).strip()
248
+ if 0 <= idx < count and desc:
249
+ descriptions[idx] = desc
250
+ return descriptions
251
+
252
+
253
def _parse_todo_func(md_file: Path, content: str) -> dict | None:
    """Parse one L3 API doc file into a function-facts dict, or None.

    Extracts the heading name, the backticked signature, the module
    qualified name, and the fenced source under the "## 实现" section.
    Returns None when the file has no "# name" heading to key on.
    """
    info: dict = {"path": md_file, "content": content}
    for line in content.splitlines():
        if line.startswith("# "):
            info["name"] = line[2:].strip()
        elif line.startswith("- 签名:") or line.startswith("- 定义:"):
            # The signature is wrapped in backticks on the bullet line.
            start = line.find("`")
            end = line.rfind("`")
            if start != -1 and end > start:
                info["signature"] = line[start + 1 : end]
        elif line.startswith("- 模块:"):
            info["module_qn"] = (
                line[len("- 模块:") :].strip().split(" —")[0].strip()
            )

    # Pull the function body out of the fenced code block under "## 实现".
    if "## 实现" in content:
        source_start = content.index("## 实现")
        code_start = content.find("```", source_start)
        code_end = content.find("```", code_start + 3) if code_start != -1 else -1
        if code_start != -1 and code_end != -1:
            # Skip the ```c / ```cpp fence line. Using find() (not index())
            # avoids a ValueError when the fence has no trailing newline.
            first_newline = content.find("\n", code_start)
            if first_newline != -1:
                info["source"] = content[first_newline + 1 : code_end].strip()

    return info if "name" in info else None


def _write_description(func_info: dict, desc: str) -> None:
    """Replace the TODO placeholder line in the doc file with *desc*."""
    rewritten: list[str] = []
    for line in func_info["content"].splitlines(keepends=True):
        if "<!-- TODO:" in line and "-->" in line:
            rewritten.append(f"> {desc}\n")
        else:
            rewritten.append(line)
    func_info["path"].write_text("".join(rewritten), encoding="utf-8")


def generate_descriptions_step(
    artifact_dir: Path,
    repo_path: Path,
    progress_cb: ProgressCb = None,
) -> dict[str, Any]:
    """Generate LLM descriptions for functions missing docstrings.

    Reads L3 API doc files, finds those with TODO placeholders,
    generates descriptions via the LLM in batches of _DESC_BATCH_SIZE,
    and writes them back in place.

    This step is optional — it is skipped silently when no LLM client or
    API key is configured, or when no API docs exist yet.

    Args:
        artifact_dir: Workspace artifact directory containing api_docs/.
        repo_path: Repository root (unused here; kept for pipeline symmetry).
        progress_cb: Optional (message, percent) callback.

    Returns:
        Summary dict with generated_count, skipped_count, error_count.
    """
    nothing_done = {"generated_count": 0, "skipped_count": 0, "error_count": 0}

    if create_llm_client is None:
        logger.info("LLM client not available, skipping description generation")
        return dict(nothing_done)

    try:
        client = create_llm_client()
    except (ValueError, RuntimeError) as e:
        logger.info(f"No LLM API key configured, skipping description generation: {e}")
        return dict(nothing_done)

    funcs_dir = artifact_dir / "api_docs" / "funcs"
    if not funcs_dir.exists():
        logger.warning("No API docs found, skipping description generation")
        return dict(nothing_done)

    # Collect every doc file that still carries a TODO placeholder.
    todo_funcs: list[dict] = []
    for md_file in sorted(funcs_dir.glob("*.md")):
        content = md_file.read_text(encoding="utf-8")
        if "<!-- TODO:" not in content:
            continue
        info = _parse_todo_func(md_file, content)
        if info is not None:
            todo_funcs.append(info)

    if not todo_funcs:
        logger.info("All functions already have descriptions")
        return dict(nothing_done)

    logger.info(f"Generating descriptions for {len(todo_funcs)} functions")

    generated = 0
    errors = 0
    total_batches = (len(todo_funcs) + _DESC_BATCH_SIZE - 1) // _DESC_BATCH_SIZE

    for batch_idx in range(0, len(todo_funcs), _DESC_BATCH_SIZE):
        batch = todo_funcs[batch_idx : batch_idx + _DESC_BATCH_SIZE]
        current_batch = batch_idx // _DESC_BATCH_SIZE + 1

        if progress_cb:
            pct = int(current_batch / total_batches * 100)
            progress_cb(
                f"Generating descriptions: batch {current_batch}/{total_batches}",
                float(pct),
            )

        try:
            response = client.chat(
                query=_build_desc_prompt(batch),
                system_prompt=_DESC_SYSTEM_PROMPT,
                max_tokens=1024,
                temperature=0.3,
            )
            descriptions = _parse_desc_response(response.content, len(batch))
            for func_info, desc in zip(batch, descriptions):
                if not desc:
                    # The model skipped this slot; count it as an error.
                    errors += 1
                    continue
                _write_description(func_info, desc)
                generated += 1
        except Exception as e:
            # One failed batch must not abort the rest of the run.
            logger.warning(f"LLM description generation failed for batch: {e}")
            errors += len(batch)

    logger.info(f"Generated {generated} descriptions, {errors} errors")
    return {
        "generated_count": generated,
        "skipped_count": len(todo_funcs) - generated - errors,
        "error_count": errors,
    }
383
+
384
+
385
+ # ---------------------------------------------------------------------------
386
+ # Step 2: vector index with per-batch progress
387
+ # ---------------------------------------------------------------------------
388
+
389
+ _EMBED_BATCH_SIZE = 10
390
+
391
+
392
+ def _build_embedding_text(
393
+ func: dict,
394
+ callers: list[str],
395
+ callees: list[str],
396
+ source: str,
397
+ ) -> str:
398
+ """Compose rich embedding text for a function.
399
+
400
+ Combines name, file location, docstring, call relationships, and source
401
+ code so that semantic search can match abstract descriptions even when
402
+ functions lack formal documentation.
403
+ """
404
+ parts: list[str] = [f"Function: {func['name']}"]
405
+ if func.get("path"):
406
+ parts.append(f"File: {func['path']}")
407
+ if func.get("docstring"):
408
+ parts.append(f"Description: {func['docstring']}")
409
+ if callers:
410
+ parts.append(f"Called by: {', '.join(callers[:10])}")
411
+ if callees:
412
+ parts.append(f"Calls: {', '.join(callees[:10])}")
413
+ parts.append("---")
414
+ parts.append(source)
415
+ return "\n".join(parts)
416
+
417
+
418
def build_vector_index(
    builder: Any,
    repo_path: Path,
    vectors_path: Path,
    rebuild: bool,
    progress_cb: ProgressCb = None,
) -> tuple[Any, Any, dict[int, dict]]:
    """Build or load vector embeddings, reporting after every API batch call.

    Args:
        builder: Graph builder exposing ``query()``.
        repo_path: Repository root used to read function source snippets.
        vectors_path: Pickle cache file holding the store and function map.
        rebuild: When True, ignore any existing cache and re-embed.
        progress_cb: Optional (message, percent) callback; this step owns
            the 16-40% band of the overall pipeline.

    Returns:
        (vector_store, embedder, func_map) where func_map maps the store's
        node ids back to the function dicts they were built from.
    """
    from collections import defaultdict

    from ..embeddings.qwen3_embedder import create_embedder
    from ..embeddings.vector_store import MemoryVectorStore, VectorRecord

    embedder = create_embedder(batch_size=_EMBED_BATCH_SIZE)

    if not rebuild and vectors_path.exists():
        # NOTE: the cache is a trusted local artifact written by this very
        # function, so unpickling it is acceptable here.
        try:
            with open(vectors_path, "rb") as fh:
                cache = pickle.load(fh)
            vector_store: MemoryVectorStore = cache["vector_store"]
            func_map: dict[int, dict] = cache["func_map"]
        except Exception as exc:
            # Robustness fix: a corrupt/partial cache used to crash the whole
            # pipeline — log it and fall through to a full rebuild instead.
            logger.warning(f"Vector cache {vectors_path} unreadable ({exc}); re-embedding")
        else:
            if progress_cb:
                progress_cb(
                    f"[Step 2/3] Loaded {len(vector_store)} embeddings from cache: {vectors_path}",
                    40.0,
                )
            return vector_store, embedder, func_map

    # ---- Query functions with docstring and module path ----
    rows = builder.query("""
MATCH (m:Module)-[:DEFINES]->(f:Function)
RETURN DISTINCT f.name AS name,
f.qualified_name AS qualified_name,
f.start_line AS start_line,
f.end_line AS end_line,
f.docstring AS docstring,
m.path AS path
""")
    all_funcs: list[dict] = []
    seen_qn: set[str] = set()
    for row in rows:
        qn = row.get("qualified_name") or ""
        if not qn or qn in seen_qn:
            continue
        seen_qn.add(qn)
        all_funcs.append({
            "name": row.get("name") or "",
            "qualified_name": qn,
            "start_line": row.get("start_line") or 0,
            "end_line": row.get("end_line") or 0,
            "docstring": row.get("docstring") or "",
            "path": row.get("path") or "",
        })

    # ---- Build caller/callee maps for richer embedding context ----
    call_rows = builder.query("""
MATCH (caller:Function)-[:CALLS]->(callee:Function)
RETURN DISTINCT caller.qualified_name AS caller_qn,
callee.qualified_name AS callee_qn
""")
    callees_of: dict[str, list[str]] = defaultdict(list)
    callers_of: dict[str, list[str]] = defaultdict(list)
    seen_edges: set[tuple[str, str]] = set()
    for row in call_rows:
        caller_qn = row.get("caller_qn") or ""
        callee_qn = row.get("callee_qn") or ""
        if not caller_qn or not callee_qn:
            continue
        edge = (caller_qn, callee_qn)
        if edge in seen_edges:
            continue
        seen_edges.add(edge)
        # Keep only the bare function name — reads better in embedding text.
        callees_of[caller_qn].append(callee_qn.split(".")[-1])
        callers_of[callee_qn].append(caller_qn.split(".")[-1])

    # Only functions whose source can actually be read are embeddable.
    embeddable: list[tuple[int, dict, str]] = []
    for i, func in enumerate(all_funcs):
        source = _read_function_source(func, repo_path)
        if source:
            text = _build_embedding_text(
                func,
                callers=callers_of.get(func["qualified_name"], []),
                callees=callees_of.get(func["qualified_name"], []),
                source=source,
            )
            embeddable.append((i, func, text))

    total = len(embeddable)
    if progress_cb:
        progress_cb(
            f"[Step 2/3] Embedding {total} functions "
            f"(batch size {_EMBED_BATCH_SIZE}, {(total + _EMBED_BATCH_SIZE - 1) // _EMBED_BATCH_SIZE} API calls)...",
            16.0,
        )

    vector_store = MemoryVectorStore(dimension=embedder.get_embedding_dimension())
    func_map = {}
    records: list[VectorRecord] = []

    for batch_start in range(0, total, _EMBED_BATCH_SIZE):
        batch = embeddable[batch_start: batch_start + _EMBED_BATCH_SIZE]
        batch_texts = [t for _, _, t in batch]

        batch_embeddings = embedder.embed_batch(batch_texts)

        for (node_id, func, _), embedding in zip(batch, batch_embeddings):
            records.append(VectorRecord(
                node_id=node_id,
                qualified_name=func["qualified_name"],
                embedding=embedding,
                metadata={
                    "name": func["name"],
                    "start_line": func["start_line"],
                    "end_line": func["end_line"],
                },
            ))
            func_map[node_id] = func

        done = min(batch_start + _EMBED_BATCH_SIZE, total)
        local_pct = done * 100 // total
        # Map local 0-100% to overall 16-40%
        overall_pct = 16.0 + (done / total) * 24.0
        if progress_cb:
            progress_cb(
                f"[Step 2/3] Embedded {done}/{total} functions ({local_pct}%).",
                overall_pct,
            )

    vector_store.store_embeddings_batch(records)

    with open(vectors_path, "wb") as fh:
        pickle.dump({"vector_store": vector_store, "func_map": func_map}, fh)

    if progress_cb:
        progress_cb(f"[Step 2/3] Done — {len(records)} embeddings saved.", 40.0)

    return vector_store, embedder, func_map
553
+
554
+
555
+ # ---------------------------------------------------------------------------
556
+ # Step 3: wiki generation with per-page progress
557
+ # ---------------------------------------------------------------------------
558
+
559
def run_wiki_generation(
    builder: Any,
    repo_path: Path,
    output_dir: Path,
    max_pages: int,
    rebuild: bool,
    comprehensive: bool,
    vector_store: Any,
    embedder: Any,
    func_map: dict[int, dict],
    progress_cb: ProgressCb = None,
) -> tuple[Path, int]:
    """Two-phase wiki generation with per-page progress callbacks.

    Phase 1 plans the page structure (cached in a pickle under
    *output_dir*); Phase 2 generates each page, validating and repairing
    any Mermaid diagrams. This step owns the 40-100% band of overall
    progress.

    Args:
        builder: Graph builder exposing ``query()`` (used for index stats).
        repo_path: Repository root; its basename becomes the project name.
        output_dir: Directory receiving ``index.md`` and the ``wiki/`` pages.
        max_pages: Upper bound on generated pages (high-importance first).
        rebuild: When True, ignore the cached structure plan.
        comprehensive: Verbose vs. concise page mode.
        vector_store: Semantic-search store from step 2.
        embedder: Embedder from step 2.
        func_map: Node-id -> function dict map from step 2.
        progress_cb: Optional (message, percent) callback.

    Returns:
        (index_path, generated_page_count). When no LLM API key is
        configured the step is skipped and the count is 0.
    """
    # Fix: the original imported re, datetime, MAX_MERMAID_FIX_ATTEMPTS,
    # build_source_context and semantic_search_funcs here without using
    # them; only the names actually referenced are imported now.
    from ..examples.generate_wiki import (
        fix_mermaid_errors,
        plan_wiki_structure,
        generate_page_content,
        validate_mermaid_blocks,
    )
    from ..rag.camel_agent import CamelAgent
    from ..rag.llm_backend import create_llm_backend

    project_name = repo_path.name
    output_dir.mkdir(parents=True, exist_ok=True)

    structure_cache = output_dir / f"{project_name}_structure.pkl"

    llm_backend = create_llm_backend(temperature=1.0)

    # Without an LLM there is nothing to generate — report and bail out.
    if not llm_backend.available:
        if progress_cb:
            progress_cb(
                "[Step 3/3] Skipped — no LLM API key configured. "
                "Set LLM_API_KEY, OPENAI_API_KEY, or MOONSHOT_API_KEY to enable wiki generation.",
                100.0,
            )
        return output_dir / "index.md", 0
    agent = CamelAgent(
        role=f"{project_name} 技术文档专家",
        goal=f"结合真实源码,为 {project_name} 生成专业、准确、图文并茂的技术 Wiki",
        backstory=f"拥有丰富的技术写作和代码阅读经验,深入理解 {project_name} 源码架构",
    )

    # Phase 1: plan structure (or load cache)
    if not rebuild and structure_cache.exists():
        with open(structure_cache, "rb") as fh:
            planned_pages = pickle.load(fh)
        if progress_cb:
            progress_cb(
                f"[Step 3/3] Loaded wiki structure from cache: {len(planned_pages)} pages.",
                45.0,
            )
    else:
        if progress_cb:
            progress_cb("[Step 3/3] Planning wiki structure (Phase 1)...", 41.0)
        planned_pages = plan_wiki_structure(agent, repo_path, project_name, comprehensive)
        with open(structure_cache, "wb") as fh:
            pickle.dump(planned_pages, fh)
        if progress_cb:
            progress_cb(
                f"[Step 3/3] Wiki structure planned: {len(planned_pages)} pages.",
                45.0,
            )

    # High-importance pages come first so a small max_pages budget is
    # spent on the most valuable content.
    high = [p for p in planned_pages if p["importance"] == "high"]
    others = [p for p in planned_pages if p["importance"] != "high"]
    pages_to_generate = (high + others)[:max_pages]
    total_pages = len(pages_to_generate)

    if progress_cb:
        progress_cb(
            f"[Step 3/3] Generating {total_pages} wiki pages "
            f"({'comprehensive' if comprehensive else 'concise'} mode)...",
            46.0,
        )

    wiki_dir = output_dir / "wiki"
    wiki_dir.mkdir(parents=True, exist_ok=True)
    gen_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    model_name = os.getenv("MOONSHOT_MODEL", "kimi-k2.5")

    generated: list[dict] = []

    for i, page in enumerate(pages_to_generate, 1):
        try:
            content = generate_page_content(
                page, agent, repo_path, vector_store, embedder, func_map
            )
            # Repair invalid Mermaid blocks before writing the page out.
            mermaid_errors = validate_mermaid_blocks(content)
            if mermaid_errors:
                content, _ = fix_mermaid_errors(content, mermaid_errors, agent)

            page_file = wiki_dir / f"{page['id']}.md"
            page_file.write_text(content, encoding="utf-8")
            generated.append({**page, "content": content})

            # Map page progress to overall 46-98%
            page_pct = 46.0 + (i / total_pages) * 52.0
            if progress_cb:
                progress_cb(
                    f"[Step 3/3] Page {i}/{total_pages} done: {page['id']} — {page['title']} "
                    f"({len(content)} chars).",
                    page_pct,
                )
        except Exception as exc:
            # A failed page still gets a stub file so the index stays complete.
            err_content = f"# {page['title']}\n\n*生成失败: {exc}*"
            (wiki_dir / f"{page['id']}.md").write_text(err_content, encoding="utf-8")
            generated.append({**page, "content": err_content})
            page_pct = 46.0 + (i / total_pages) * 52.0
            if progress_cb:
                progress_cb(
                    f"[Step 3/3] Page {i}/{total_pages} FAILED: {page['id']} — {exc}",
                    page_pct,
                )

    # Write index.md
    total_funcs_row = builder.query("MATCH (f:Function) RETURN count(f) AS cnt")
    total_funcs = list(total_funcs_row[0].values())[0] if total_funcs_row else 0
    total_calls_row = builder.query("MATCH ()-[r:CALLS]->() RETURN count(r) AS cnt")
    total_calls = list(total_calls_row[0].values())[0] if total_calls_row else 0

    mode_label = "详细 Comprehensive" if comprehensive else "简洁 Concise"
    index_path = output_dir / "index.md"
    index_lines = [
        f"# {project_name} 源码 Wiki",
        "",
        f"*生成时间: {gen_time}*",
        f"*模型: {model_name} | 模式: {mode_label} | 上下文检索: 向量语义检索(Qwen3 Embedding)*",
        "",
        "---",
        "",
        "## 项目概览",
        "",
        "| 指标 | 数值 |",
        "|------|------|",
        f"| 总函数数 | {total_funcs:,} |",
        f"| 总调用关系 | {total_calls:,} |",
        f"| 本次生成页面 | {len(generated)} |",
        "",
        "---",
        "",
        "## Wiki 页面索引",
        "",
        "| 重要性 | 页面 | 描述 |",
        "|--------|------|------|",
    ]
    for p in generated:
        importance_icon = {"high": "🔴", "medium": "🟡", "low": "🟢"}.get(p["importance"], "⚪")
        desc = p["description"]
        short_desc = desc[:60] + "..." if len(desc) > 60 else desc
        index_lines.append(
            f"| {importance_icon} {p['importance']} | [{p['title']}](./wiki/{p['id']}.md) | {short_desc} |"
        )
    index_lines += ["", "---", "", "## 详细文档", ""]
    for p in generated:
        index_lines.append(f"- [{p['title']}](./wiki/{p['id']}.md) — {p['description']}")

    index_path.write_text("\n".join(index_lines), encoding="utf-8")

    if progress_cb:
        progress_cb(
            f"[Step 3/3] Wiki complete: {len(generated)} pages at {output_dir}/",
            100.0,
        )

    return index_path, len(generated)
731
+
732
+
733
+ # ---------------------------------------------------------------------------
734
+ # Workspace helpers
735
+ # ---------------------------------------------------------------------------
736
+
737
def save_meta(artifact_dir: Path, repo_path: Path, wiki_page_count: int) -> None:
    """Save or update artifact metadata (``meta.json``).

    Preserves existing fields (like step-completion flags), refreshes the
    timestamp and wiki page count, and auto-detects which pipeline
    artifacts currently exist on disk.

    Args:
        artifact_dir: Per-repo artifact directory holding meta.json.
        repo_path: Repository root the artifacts were built from.
        wiki_page_count: Pages generated in the most recent wiki run.
    """
    meta_file = artifact_dir / "meta.json"
    existing: dict = {}
    if meta_file.exists():
        try:
            existing = json.loads(meta_file.read_text(encoding="utf-8"))
        except (json.JSONDecodeError, OSError):
            pass  # best-effort: a broken meta file is simply rewritten

    # Auto-detect which artifacts exist
    has_graph = (artifact_dir / "graph.db").exists()
    has_api_docs = (artifact_dir / "api_docs" / "index.md").exists()
    has_embeddings = (artifact_dir / "vectors.pkl").exists()
    has_wiki = wiki_page_count > 0 or (artifact_dir / "wiki" / "index.md").exists()

    meta = {
        **existing,
        "repo_path": str(repo_path),
        "repo_name": repo_path.name,
        "indexed_at": datetime.now().isoformat(),
        "wiki_page_count": wiki_page_count,
        "steps": {
            "graph": has_graph,
            "api_docs": has_api_docs,
            "embeddings": has_embeddings,
            "wiki": has_wiki,
        },
    }
    # Bug fix: write UTF-8 explicitly — ensure_ascii=False can emit
    # non-ASCII text, which would fail under a non-UTF-8 default locale.
    meta_file.write_text(
        json.dumps(meta, ensure_ascii=False, indent=2), encoding="utf-8"
    )
771
+
772
+
773
def artifact_dir_for(workspace: Path, repo_path: Path) -> Path:
    """Return a stable per-repo artifact directory inside *workspace*.

    The directory name combines the repo's basename with a short hash of
    its path, so distinct checkouts sharing a name don't collide.
    (md5 is used purely as a cheap fingerprint, not for security.)
    """
    import hashlib

    digest = hashlib.md5(str(repo_path).encode()).hexdigest()
    return workspace / f"{repo_path.name}_{digest[:8]}"