code-graph-builder 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- code_graph_builder/__init__.py +82 -0
- code_graph_builder/builder.py +366 -0
- code_graph_builder/cgb_cli.py +32 -0
- code_graph_builder/cli.py +564 -0
- code_graph_builder/commands_cli.py +1288 -0
- code_graph_builder/config.py +340 -0
- code_graph_builder/constants.py +708 -0
- code_graph_builder/embeddings/__init__.py +40 -0
- code_graph_builder/embeddings/qwen3_embedder.py +573 -0
- code_graph_builder/embeddings/vector_store.py +584 -0
- code_graph_builder/examples/__init__.py +0 -0
- code_graph_builder/examples/example_configuration.py +276 -0
- code_graph_builder/examples/example_kuzu_usage.py +109 -0
- code_graph_builder/examples/example_semantic_search_full.py +347 -0
- code_graph_builder/examples/generate_wiki.py +915 -0
- code_graph_builder/examples/graph_export_example.py +100 -0
- code_graph_builder/examples/rag_example.py +206 -0
- code_graph_builder/examples/test_cli_demo.py +129 -0
- code_graph_builder/examples/test_embedding_api.py +153 -0
- code_graph_builder/examples/test_kuzu_local.py +190 -0
- code_graph_builder/examples/test_rag_redis.py +390 -0
- code_graph_builder/graph_updater.py +605 -0
- code_graph_builder/guidance/__init__.py +1 -0
- code_graph_builder/guidance/agent.py +123 -0
- code_graph_builder/guidance/prompts.py +74 -0
- code_graph_builder/guidance/toolset.py +264 -0
- code_graph_builder/language_spec.py +536 -0
- code_graph_builder/mcp/__init__.py +21 -0
- code_graph_builder/mcp/api_doc_generator.py +764 -0
- code_graph_builder/mcp/file_editor.py +207 -0
- code_graph_builder/mcp/pipeline.py +777 -0
- code_graph_builder/mcp/server.py +161 -0
- code_graph_builder/mcp/tools.py +1800 -0
- code_graph_builder/models.py +115 -0
- code_graph_builder/parser_loader.py +344 -0
- code_graph_builder/parsers/__init__.py +7 -0
- code_graph_builder/parsers/call_processor.py +306 -0
- code_graph_builder/parsers/call_resolver.py +139 -0
- code_graph_builder/parsers/definition_processor.py +796 -0
- code_graph_builder/parsers/factory.py +119 -0
- code_graph_builder/parsers/import_processor.py +293 -0
- code_graph_builder/parsers/structure_processor.py +145 -0
- code_graph_builder/parsers/type_inference.py +143 -0
- code_graph_builder/parsers/utils.py +134 -0
- code_graph_builder/rag/__init__.py +68 -0
- code_graph_builder/rag/camel_agent.py +429 -0
- code_graph_builder/rag/client.py +298 -0
- code_graph_builder/rag/config.py +239 -0
- code_graph_builder/rag/cypher_generator.py +67 -0
- code_graph_builder/rag/llm_backend.py +210 -0
- code_graph_builder/rag/markdown_generator.py +352 -0
- code_graph_builder/rag/prompt_templates.py +440 -0
- code_graph_builder/rag/rag_engine.py +640 -0
- code_graph_builder/rag/review_report.md +172 -0
- code_graph_builder/rag/tests/__init__.py +3 -0
- code_graph_builder/rag/tests/test_camel_agent.py +313 -0
- code_graph_builder/rag/tests/test_client.py +221 -0
- code_graph_builder/rag/tests/test_config.py +177 -0
- code_graph_builder/rag/tests/test_markdown_generator.py +240 -0
- code_graph_builder/rag/tests/test_prompt_templates.py +160 -0
- code_graph_builder/services/__init__.py +39 -0
- code_graph_builder/services/graph_service.py +465 -0
- code_graph_builder/services/kuzu_service.py +665 -0
- code_graph_builder/services/memory_service.py +171 -0
- code_graph_builder/settings.py +75 -0
- code_graph_builder/tests/ACCEPTANCE_CRITERIA_PHASE2.md +401 -0
- code_graph_builder/tests/__init__.py +1 -0
- code_graph_builder/tests/run_acceptance_check.py +378 -0
- code_graph_builder/tests/test_api_find.py +231 -0
- code_graph_builder/tests/test_api_find_integration.py +226 -0
- code_graph_builder/tests/test_basic.py +78 -0
- code_graph_builder/tests/test_c_api_extraction.py +388 -0
- code_graph_builder/tests/test_call_resolution_scenarios.py +504 -0
- code_graph_builder/tests/test_embedder.py +411 -0
- code_graph_builder/tests/test_integration_semantic.py +434 -0
- code_graph_builder/tests/test_mcp_protocol.py +298 -0
- code_graph_builder/tests/test_mcp_user_flow.py +190 -0
- code_graph_builder/tests/test_rag.py +404 -0
- code_graph_builder/tests/test_settings.py +135 -0
- code_graph_builder/tests/test_step1_graph_build.py +264 -0
- code_graph_builder/tests/test_step2_api_docs.py +323 -0
- code_graph_builder/tests/test_step3_embedding.py +278 -0
- code_graph_builder/tests/test_vector_store.py +552 -0
- code_graph_builder/tools/__init__.py +40 -0
- code_graph_builder/tools/graph_query.py +495 -0
- code_graph_builder/tools/semantic_search.py +387 -0
- code_graph_builder/types.py +333 -0
- code_graph_builder/utils/__init__.py +0 -0
- code_graph_builder/utils/path_utils.py +30 -0
- code_graph_builder-0.2.0.dist-info/METADATA +321 -0
- code_graph_builder-0.2.0.dist-info/RECORD +93 -0
- code_graph_builder-0.2.0.dist-info/WHEEL +4 -0
- code_graph_builder-0.2.0.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,777 @@
|
|
|
1
|
+
"""Pipeline with progress callbacks for MCP: graph (+ api_docs) → embedding → wiki.
|
|
2
|
+
|
|
3
|
+
Each stage calls `progress_cb(message)` after every meaningful unit of work
|
|
4
|
+
so the MCP server can relay real-time updates to the client.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import json
|
|
10
|
+
import os
|
|
11
|
+
import pickle
|
|
12
|
+
from collections.abc import Callable
|
|
13
|
+
from datetime import datetime
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
from typing import Any
|
|
16
|
+
|
|
17
|
+
from loguru import logger
|
|
18
|
+
|
|
19
|
+
try:
|
|
20
|
+
from ..rag.client import create_llm_client, LLMClient
|
|
21
|
+
except ImportError:
|
|
22
|
+
create_llm_client = None # type: ignore[assignment,misc]
|
|
23
|
+
LLMClient = None # type: ignore[assignment,misc]
|
|
24
|
+
|
|
25
|
+
# Optional progress sink shared by every pipeline stage; None disables reporting.
ProgressCb = Callable[[str, float], None] | None
"""Progress callback: (message, percentage_0_to_100) -> None.

Pipeline weight allocation:
    Step 1 (graph + API docs):   0 – 15 %
    Step 2 (embeddings):        15 – 40 %
    Step 3 (wiki generation):   40 – 100 %
"""
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
# ---------------------------------------------------------------------------
|
|
36
|
+
# Helpers shared with generate_wiki.py
|
|
37
|
+
# ---------------------------------------------------------------------------
|
|
38
|
+
|
|
39
|
+
def _resolve_source_file(qname: str, repo_path: Path) -> Path | None:
|
|
40
|
+
parts = qname.split(".")
|
|
41
|
+
if len(parts) < 3:
|
|
42
|
+
return None
|
|
43
|
+
dir_parts = parts[1:-1]
|
|
44
|
+
for depth in range(len(dir_parts), 0, -1):
|
|
45
|
+
for suffix in (".c", ".py", ".h", ".cpp", ".go", ".rs", ".js", ".ts"):
|
|
46
|
+
candidate = repo_path.joinpath(*dir_parts[:depth]).with_suffix(suffix)
|
|
47
|
+
if candidate.exists():
|
|
48
|
+
return candidate
|
|
49
|
+
return None
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
# Cap on how many characters of a function's source are kept when reading it
# back for prompts/embeddings; longer bodies are truncated with a marker.
_MAX_SOURCE_CHARS = 2000
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _read_function_source(func: dict, repo_path: Path) -> str | None:
    """Read the raw source text of *func* from disk, truncated to a safe size.

    Returns ``None`` when the line numbers are missing or degenerate, when the
    file cannot be located from the qualified name, or when reading fails.
    """
    start = func.get("start_line", 0)
    end = func.get("end_line", 0)
    # A zero start or a zero-span range means we have no usable location data.
    if start == 0 or start == end:
        return None
    src_file = _resolve_source_file(func.get("qualified_name", ""), repo_path)
    if src_file is None:
        return None
    try:
        with open(src_file, encoding="utf-8", errors="replace") as handle:
            all_lines = handle.readlines()
    except OSError:
        return None
    snippet = "".join(all_lines[start - 1: end])
    if len(snippet) > _MAX_SOURCE_CHARS:
        # Keep prompts/embedding texts bounded; mark the cut explicitly.
        snippet = snippet[:_MAX_SOURCE_CHARS] + "\n /* ... truncated ... */"
    return snippet
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
# ---------------------------------------------------------------------------
|
|
76
|
+
# Step 1: graph build + API docs generation
|
|
77
|
+
# ---------------------------------------------------------------------------
|
|
78
|
+
|
|
79
|
+
# Cypher queries used by the API-doc step.  All use DISTINCT so multi-edge
# matches do not hand duplicate rows to the doc generator.

# Every function together with its defining module (location + signature
# metadata), ordered for stable per-module output.
_FUNC_DOC_QUERY = """
MATCH (m:Module)-[:DEFINES]->(f:Function)
RETURN DISTINCT m.qualified_name, m.path,
f.qualified_name, f.name, f.signature, f.return_type,
f.visibility, f.parameters, f.docstring,
f.start_line, f.end_line, f.path, f.kind
ORDER BY m.qualified_name, f.start_line
"""

# Class nodes; unlike the Type query below this one also returns `parameters`.
_TYPE_DOC_QUERY_CLASS = """
MATCH (c:Class)
RETURN DISTINCT c.qualified_name, c.name, c.kind, c.signature,
c.parameters, c.start_line, c.end_line
ORDER BY c.qualified_name, c.start_line
"""

# Plain Type nodes (no `parameters` property is queried for these).
_TYPE_DOC_QUERY_TYPE = """
MATCH (t:Type)
RETURN DISTINCT t.qualified_name, t.name, t.kind, t.signature,
t.start_line, t.end_line
ORDER BY t.qualified_name, t.start_line
"""

# Caller -> callee edges, used to render call-relationship sections.
_CALLS_QUERY = """
MATCH (caller:Function)-[:CALLS]->(callee:Function)
RETURN DISTINCT caller.qualified_name, callee.qualified_name,
callee.path, callee.start_line
"""
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def build_graph(
    repo_path: Path,
    db_path: Path,
    rebuild: bool,
    progress_cb: ProgressCb = None,
    backend: str = "kuzu",
) -> Any:
    """Build or reuse a code knowledge graph.

    Only the graph database is produced here; API docs, embeddings, and wiki
    generation are separate pipeline steps.

    Args:
        repo_path: Repository to index.
        db_path: Location of the graph database; reused when it already exists.
        rebuild: Force a clean rebuild even if ``db_path`` exists.
        progress_cb: Optional ``(message, pct)`` sink; this step reports 10 %.
        backend: Graph backend identifier (default ``"kuzu"``).

    Returns:
        The ``CodeGraphBuilder`` instance, ready for subsequent queries.
    """
    from ..builder import CodeGraphBuilder

    builder = CodeGraphBuilder(
        repo_path=str(repo_path),
        backend=backend,
        backend_config={"db_path": str(db_path), "batch_size": 1000},
    )

    needs_build = rebuild or not db_path.exists()
    if needs_build:
        result = builder.build_graph(clean=rebuild)
        message = (
            f"Graph built: "
            f"{result.nodes_created} nodes, "
            f"{result.relationships_created} relationships, "
            f"{result.files_processed} files processed."
        )
    else:
        stats = builder.get_statistics()
        message = (
            f"Reusing existing graph: "
            f"{stats.get('node_count', '?')} nodes, "
            f"{stats.get('relationship_count', '?')} relationships."
        )
    if progress_cb:
        progress_cb(message, 10.0)

    return builder
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
# ---------------------------------------------------------------------------
|
|
153
|
+
# Step 2: API docs generation (graph-only, no embeddings needed)
|
|
154
|
+
# ---------------------------------------------------------------------------
|
|
155
|
+
|
|
156
|
+
def generate_api_docs_step(
    builder: Any,
    artifact_dir: Path,
    rebuild: bool,
    progress_cb: ProgressCb = None,
) -> dict[str, Any]:
    """Generate hierarchical API docs from the knowledge graph.

    Requires only a populated graph database — no embeddings or LLM needed.

    Returns a status dict: ``{"status": "cached"}`` when an existing index is
    reused, ``{"status": "skipped", "error": ...}`` when the graph query
    fails, or ``{"status": "success", ...}`` merged with the generator stats.
    """
    from .api_doc_generator import generate_api_docs

    docs_root = artifact_dir / "api_docs"
    index_path = docs_root / "index.md"

    # Cheap cache check: a surviving index.md means docs were built before.
    if index_path.exists() and not rebuild:
        if progress_cb:
            progress_cb("Reusing cached API docs.", 15.0)
        return {"status": "cached"}

    try:
        func_rows = builder.query(_FUNC_DOC_QUERY)
        class_rows = builder.query(_TYPE_DOC_QUERY_CLASS)
        plain_type_rows = builder.query(_TYPE_DOC_QUERY_TYPE)
        call_rows = builder.query(_CALLS_QUERY)
    except Exception as exc:
        msg = f"API docs skipped — graph query failed: {exc}"
        logger.warning(msg)
        if progress_cb:
            progress_cb(msg, 15.0)
        return {"status": "skipped", "error": str(exc)}

    result = generate_api_docs(
        func_rows, class_rows + plain_type_rows, call_rows, artifact_dir
    )
    if progress_cb:
        progress_cb(
            f"API docs generated: "
            f"{result['module_count']} modules, "
            f"{result['func_count']} functions, "
            f"{result['type_count']} types.",
            15.0,
        )
    return {"status": "success", **result}
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
# ---------------------------------------------------------------------------
|
|
200
|
+
# Step 1b: LLM-powered description generation for undocumented functions
|
|
201
|
+
# ---------------------------------------------------------------------------
|
|
202
|
+
|
|
203
|
+
# System prompt for generating one-sentence function descriptions.  Kept terse
# so the model replies with only the sentence (no preamble, no code fences).
_DESC_SYSTEM_PROMPT = """\
You are a code documentation assistant. Given a C/C++ function's signature, \
source code, and module context, generate a single concise sentence (in the \
same language as any existing comments in the code, defaulting to English) \
describing what the function does. Focus on the function's PURPOSE, not its \
implementation details. Do NOT include the function name in the description. \
Reply with ONLY the description sentence, nothing else."""

# Number of functions bundled into a single LLM request.
_DESC_BATCH_SIZE = 10
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def _build_desc_prompt(funcs: list[dict]) -> str:
|
|
215
|
+
"""Build a batched prompt for multiple functions."""
|
|
216
|
+
parts: list[str] = []
|
|
217
|
+
for i, f in enumerate(funcs):
|
|
218
|
+
sig = f.get("signature") or f.get("name", "unknown")
|
|
219
|
+
source = f.get("source", "")
|
|
220
|
+
module = f.get("module_qn", "")
|
|
221
|
+
parts.append(
|
|
222
|
+
f"[{i+1}] Module: {module}\n"
|
|
223
|
+
f" Signature: {sig}\n"
|
|
224
|
+
f" Source:\n{source}\n"
|
|
225
|
+
)
|
|
226
|
+
parts.append(
|
|
227
|
+
f"\nGenerate exactly {len(funcs)} descriptions, one per line, "
|
|
228
|
+
f"numbered [1] to [{len(funcs)}]. Each description should be a "
|
|
229
|
+
f"single concise sentence."
|
|
230
|
+
)
|
|
231
|
+
return "\n".join(parts)
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
def _parse_desc_response(response: str, count: int) -> list[str]:
|
|
235
|
+
"""Parse numbered descriptions from LLM response."""
|
|
236
|
+
import re
|
|
237
|
+
|
|
238
|
+
descriptions: list[str] = [""] * count
|
|
239
|
+
for line in response.strip().splitlines():
|
|
240
|
+
line = line.strip()
|
|
241
|
+
if not line:
|
|
242
|
+
continue
|
|
243
|
+
# Match "[N] desc", "N. desc", or "N) desc" with regex
|
|
244
|
+
m = re.match(r"^\[?(\d+)[.\)\]]\s*(.*)", line)
|
|
245
|
+
if m:
|
|
246
|
+
idx = int(m.group(1)) - 1 # 1-based to 0-based
|
|
247
|
+
desc = m.group(2).strip()
|
|
248
|
+
if 0 <= idx < count and desc:
|
|
249
|
+
descriptions[idx] = desc
|
|
250
|
+
return descriptions
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
def generate_descriptions_step(
    artifact_dir: Path,
    repo_path: Path,
    progress_cb: ProgressCb = None,
) -> dict[str, Any]:
    """Generate LLM descriptions for functions missing docstrings.

    Reads L3 API doc files, finds those with TODO placeholders,
    generates descriptions via LLM, and writes them back.

    This step is optional -- skipped silently if no LLM API key is configured.

    Returns:
        Summary dict with generated_count, skipped_count, error_count.
    """
    # NOTE(review): repo_path is not referenced anywhere in this body —
    # confirm whether it is kept for interface symmetry before removing.
    if create_llm_client is None:
        # The rag.client import failed at module load; silently degrade.
        logger.info("LLM client not available, skipping description generation")
        return {"generated_count": 0, "skipped_count": 0, "error_count": 0}

    try:
        client = create_llm_client()
    except (ValueError, RuntimeError) as e:
        logger.info(f"No LLM API key configured, skipping description generation: {e}")
        return {"generated_count": 0, "skipped_count": 0, "error_count": 0}

    funcs_dir = artifact_dir / "api_docs" / "funcs"
    if not funcs_dir.exists():
        logger.warning("No API docs found, skipping description generation")
        return {"generated_count": 0, "skipped_count": 0, "error_count": 0}

    # Collect functions needing descriptions.  Only files containing the
    # "<!-- TODO:" placeholder are candidates; everything else is left alone.
    todo_funcs: list[dict] = []  # {path, name, signature, source, module_qn, content}

    for md_file in sorted(funcs_dir.glob("*.md")):
        content = md_file.read_text(encoding="utf-8")
        if "<!-- TODO:" not in content:
            continue

        # Parse minimal info from the markdown.  The doc files use Chinese
        # section markers ("签名" = signature, "定义" = definition,
        # "模块" = module, "实现" = implementation) — these are the literal
        # markers emitted by the API-doc generator, do not translate them.
        func_info: dict = {"path": md_file, "content": content}
        for line in content.splitlines():
            if line.startswith("# "):
                # H1 heading carries the function name.
                func_info["name"] = line[2:].strip()
            elif line.startswith("- 签名:") or line.startswith("- 定义:"):
                # Extract signature from backticks
                start = line.find("`")
                end = line.rfind("`")
                if start != -1 and end > start:
                    func_info["signature"] = line[start + 1 : end]
            elif line.startswith("- 模块:"):
                # Strip any " —"-separated trailing annotation after the name.
                func_info["module_qn"] = (
                    line[len("- 模块:") :].strip().split(" —")[0].strip()
                )

        # Read source from the implementation section
        if "## 实现" in content:
            source_start = content.index("## 实现")
            # Extract code between ``` markers
            code_start = content.find("```", source_start)
            code_end = (
                content.find("```", code_start + 3) if code_start != -1 else -1
            )
            if code_start != -1 and code_end != -1:
                # Skip the ```c or ```cpp line
                first_newline = content.index("\n", code_start)
                func_info["source"] = content[first_newline + 1 : code_end].strip()

        if "name" in func_info:
            todo_funcs.append(func_info)

    if not todo_funcs:
        logger.info("All functions already have descriptions")
        return {"generated_count": 0, "skipped_count": 0, "error_count": 0}

    logger.info(f"Generating descriptions for {len(todo_funcs)} functions")

    generated = 0
    errors = 0
    # Ceiling division: number of LLM calls needed for all candidates.
    total_batches = (len(todo_funcs) + _DESC_BATCH_SIZE - 1) // _DESC_BATCH_SIZE

    for batch_idx in range(0, len(todo_funcs), _DESC_BATCH_SIZE):
        batch = todo_funcs[batch_idx : batch_idx + _DESC_BATCH_SIZE]
        current_batch = batch_idx // _DESC_BATCH_SIZE + 1

        if progress_cb:
            # Percentage here is batch-local (0-100), not the pipeline-wide
            # weighting used by the other steps.
            pct = int(current_batch / total_batches * 100)
            progress_cb(
                f"Generating descriptions: batch {current_batch}/{total_batches}",
                float(pct),
            )

        prompt = _build_desc_prompt(batch)

        try:
            response = client.chat(
                query=prompt,
                system_prompt=_DESC_SYSTEM_PROMPT,
                max_tokens=1024,
                temperature=0.3,
            )

            descriptions = _parse_desc_response(response.content, len(batch))

            for func_info, desc in zip(batch, descriptions):
                if not desc:
                    # The model skipped this slot; count it as an error.
                    errors += 1
                    continue

                # Replace TODO placeholder with generated description
                old_content = func_info["content"]
                new_content = ""
                for line in old_content.splitlines(keepends=True):
                    if "<!-- TODO:" in line and "-->" in line:
                        # Swap the placeholder line for a blockquote description.
                        new_content += f"> {desc}\n"
                    else:
                        new_content += line

                func_info["path"].write_text(new_content, encoding="utf-8")
                generated += 1

        except Exception as e:
            # Best-effort step: a failed batch is logged and charged as errors,
            # but the remaining batches still run.
            logger.warning(f"LLM description generation failed for batch: {e}")
            errors += len(batch)

    logger.info(f"Generated {generated} descriptions, {errors} errors")
    return {
        "generated_count": generated,
        "skipped_count": len(todo_funcs) - generated - errors,
        "error_count": errors,
    }
|
|
383
|
+
|
|
384
|
+
|
|
385
|
+
# ---------------------------------------------------------------------------
|
|
386
|
+
# Step 2: vector index with per-batch progress
|
|
387
|
+
# ---------------------------------------------------------------------------
|
|
388
|
+
|
|
389
|
+
# Number of texts sent per embedding API call; also the granularity at which
# build_vector_index reports progress.
_EMBED_BATCH_SIZE = 10
|
|
390
|
+
|
|
391
|
+
|
|
392
|
+
def _build_embedding_text(
|
|
393
|
+
func: dict,
|
|
394
|
+
callers: list[str],
|
|
395
|
+
callees: list[str],
|
|
396
|
+
source: str,
|
|
397
|
+
) -> str:
|
|
398
|
+
"""Compose rich embedding text for a function.
|
|
399
|
+
|
|
400
|
+
Combines name, file location, docstring, call relationships, and source
|
|
401
|
+
code so that semantic search can match abstract descriptions even when
|
|
402
|
+
functions lack formal documentation.
|
|
403
|
+
"""
|
|
404
|
+
parts: list[str] = [f"Function: {func['name']}"]
|
|
405
|
+
if func.get("path"):
|
|
406
|
+
parts.append(f"File: {func['path']}")
|
|
407
|
+
if func.get("docstring"):
|
|
408
|
+
parts.append(f"Description: {func['docstring']}")
|
|
409
|
+
if callers:
|
|
410
|
+
parts.append(f"Called by: {', '.join(callers[:10])}")
|
|
411
|
+
if callees:
|
|
412
|
+
parts.append(f"Calls: {', '.join(callees[:10])}")
|
|
413
|
+
parts.append("---")
|
|
414
|
+
parts.append(source)
|
|
415
|
+
return "\n".join(parts)
|
|
416
|
+
|
|
417
|
+
|
|
418
|
+
def build_vector_index(
    builder: Any,
    repo_path: Path,
    vectors_path: Path,
    rebuild: bool,
    progress_cb: ProgressCb = None,
) -> tuple[Any, Any, dict[int, dict]]:
    """Build or load vector embeddings, reporting after every API batch call.

    Returns (vector_store, embedder, func_map) where func_map maps the
    synthetic node_id (index into the function list) back to its metadata.
    Progress is mapped into the pipeline's 16-40 % window for this step.
    """
    from ..embeddings.qwen3_embedder import create_embedder
    from ..embeddings.vector_store import MemoryVectorStore, VectorRecord

    embedder = create_embedder(batch_size=_EMBED_BATCH_SIZE)

    if not rebuild and vectors_path.exists():
        # NOTE(review): pickle.load on a cache file is only safe because the
        # file is produced locally by this same function — never point
        # vectors_path at untrusted data.
        with open(vectors_path, "rb") as fh:
            cache = pickle.load(fh)
        vector_store: MemoryVectorStore = cache["vector_store"]
        func_map: dict[int, dict] = cache["func_map"]
        if progress_cb:
            progress_cb(
                f"[Step 2/3] Loaded {len(vector_store)} embeddings from cache: {vectors_path}",
                40.0,
            )
        return vector_store, embedder, func_map

    # ---- Query functions with docstring and module path ----
    rows = builder.query("""
MATCH (m:Module)-[:DEFINES]->(f:Function)
RETURN DISTINCT f.name AS name,
f.qualified_name AS qualified_name,
f.start_line AS start_line,
f.end_line AS end_line,
f.docstring AS docstring,
m.path AS path
""")
    # Deduplicate by qualified name; missing fields are normalised to ""/0.
    all_funcs: list[dict] = []
    seen_qn: set[str] = set()
    for row in rows:
        qn = row.get("qualified_name") or ""
        if not qn or qn in seen_qn:
            continue
        seen_qn.add(qn)
        all_funcs.append({
            "name": row.get("name") or "",
            "qualified_name": qn,
            "start_line": row.get("start_line") or 0,
            "end_line": row.get("end_line") or 0,
            "docstring": row.get("docstring") or "",
            "path": row.get("path") or "",
        })

    # ---- Build caller/callee maps for richer embedding context ----
    from collections import defaultdict
    call_rows = builder.query("""
MATCH (caller:Function)-[:CALLS]->(callee:Function)
RETURN DISTINCT caller.qualified_name AS caller_qn,
callee.qualified_name AS callee_qn
""")
    callees_of: dict[str, list[str]] = defaultdict(list)
    callers_of: dict[str, list[str]] = defaultdict(list)
    seen_edges: set[tuple[str, str]] = set()
    for row in call_rows:
        caller_qn = row.get("caller_qn") or ""
        callee_qn = row.get("callee_qn") or ""
        if not caller_qn or not callee_qn:
            continue
        edge = (caller_qn, callee_qn)
        if edge in seen_edges:
            continue
        seen_edges.add(edge)
        # Only the short (last-segment) name is kept for embedding text.
        callees_of[caller_qn].append(callee_qn.split(".")[-1])
        callers_of[callee_qn].append(caller_qn.split(".")[-1])

    # Only functions whose source can actually be read from disk are embedded.
    embeddable: list[tuple[int, dict, str]] = []
    for i, func in enumerate(all_funcs):
        source = _read_function_source(func, repo_path)
        if source:
            text = _build_embedding_text(
                func,
                callers=callers_of.get(func["qualified_name"], []),
                callees=callees_of.get(func["qualified_name"], []),
                source=source,
            )
            embeddable.append((i, func, text))

    total = len(embeddable)
    if progress_cb:
        progress_cb(
            f"[Step 2/3] Embedding {total} functions "
            f"(batch size {_EMBED_BATCH_SIZE}, {(total + _EMBED_BATCH_SIZE - 1) // _EMBED_BATCH_SIZE} API calls)...",
            16.0,
        )

    vector_store = MemoryVectorStore(dimension=embedder.get_embedding_dimension())
    func_map = {}
    records: list[VectorRecord] = []

    for batch_start in range(0, total, _EMBED_BATCH_SIZE):
        batch = embeddable[batch_start: batch_start + _EMBED_BATCH_SIZE]
        batch_texts = [t for _, _, t in batch]

        # One remote embedding call per batch — this is the slow part.
        batch_embeddings = embedder.embed_batch(batch_texts)

        for (node_id, func, _), embedding in zip(batch, batch_embeddings):
            records.append(VectorRecord(
                node_id=node_id,
                qualified_name=func["qualified_name"],
                embedding=embedding,
                metadata={
                    "name": func["name"],
                    "start_line": func["start_line"],
                    "end_line": func["end_line"],
                },
            ))
            func_map[node_id] = func

        done = min(batch_start + _EMBED_BATCH_SIZE, total)
        local_pct = done * 100 // total
        # Map local 0-100% to overall 16-40%
        overall_pct = 16.0 + (done / total) * 24.0
        if progress_cb:
            progress_cb(
                f"[Step 2/3] Embedded {done}/{total} functions ({local_pct}%).",
                overall_pct,
            )

    vector_store.store_embeddings_batch(records)

    # Persist both the store and the id->metadata map so a later run (or the
    # cached branch above) can restore them together.
    with open(vectors_path, "wb") as fh:
        pickle.dump({"vector_store": vector_store, "func_map": func_map}, fh)

    if progress_cb:
        progress_cb(f"[Step 2/3] Done — {len(records)} embeddings saved.", 40.0)

    return vector_store, embedder, func_map
|
|
553
|
+
|
|
554
|
+
|
|
555
|
+
# ---------------------------------------------------------------------------
|
|
556
|
+
# Step 3: wiki generation with per-page progress
|
|
557
|
+
# ---------------------------------------------------------------------------
|
|
558
|
+
|
|
559
|
+
def run_wiki_generation(
    builder: Any,
    repo_path: Path,
    output_dir: Path,
    max_pages: int,
    rebuild: bool,
    comprehensive: bool,
    vector_store: Any,
    embedder: Any,
    func_map: dict[int, dict],
    progress_cb: ProgressCb = None,
) -> tuple[Path, int]:
    """Two-phase wiki generation with per-page progress callbacks.

    Phase 1 plans the wiki page structure via the LLM agent (cached on disk
    as ``<project>_structure.pkl`` unless ``rebuild`` is set); Phase 2
    generates each planned page, validates/repairs embedded Mermaid blocks,
    and finally writes an ``index.md`` summary with graph statistics.

    Args:
        builder: Graph builder exposing a Cypher-style ``query`` method.
        repo_path: Root of the repository being documented.
        output_dir: Directory receiving ``index.md`` and the ``wiki/`` pages.
        max_pages: Upper bound on the number of pages generated this run.
        rebuild: When True, ignore any cached structure plan and re-plan.
        comprehensive: Comprehensive vs. concise generation mode.
        vector_store: Vector store used for semantic context retrieval.
        embedder: Embedding model paired with ``vector_store``.
        func_map: Node-id -> function-record map for context lookup.
        progress_cb: Optional ``(message: str, overall_percent: float)``
            callback; page progress is mapped onto the overall 46-98% band.

    Returns:
        Tuple of (path to ``index.md``, number of pages generated). Returns
        early with a count of 0 when no LLM backend is configured.
    """
    from datetime import datetime

    # NOTE(review): dropped previously-imported-but-unused names
    # (re, MAX_MERMAID_FIX_ATTEMPTS, build_source_context,
    # semantic_search_funcs) — none were referenced in this function.
    from ..examples.generate_wiki import (
        fix_mermaid_errors,
        generate_page_content,
        plan_wiki_structure,
        validate_mermaid_blocks,
    )
    from ..rag.camel_agent import CamelAgent
    from ..rag.llm_backend import create_llm_backend

    project_name = repo_path.name
    output_dir.mkdir(parents=True, exist_ok=True)

    structure_cache = output_dir / f"{project_name}_structure.pkl"

    llm_backend = create_llm_backend(temperature=1.0)

    # Without an API key, wiki generation is impossible: report and bail out
    # gracefully instead of failing mid-pipeline.
    if not llm_backend.available:
        if progress_cb:
            progress_cb(
                "[Step 3/3] Skipped — no LLM API key configured. "
                "Set LLM_API_KEY, OPENAI_API_KEY, or MOONSHOT_API_KEY to enable wiki generation.",
                100.0,
            )
        return output_dir / "index.md", 0
    agent = CamelAgent(
        role=f"{project_name} 技术文档专家",
        goal=f"结合真实源码,为 {project_name} 生成专业、准确、图文并茂的技术 Wiki",
        backstory=f"拥有丰富的技术写作和代码阅读经验,深入理解 {project_name} 源码架构",
    )

    # Phase 1: plan structure (or load cache)
    if not rebuild and structure_cache.exists():
        # NOTE(review): pickle cache is local, written by this tool itself —
        # acceptable, but do not point structure_cache at untrusted data.
        with open(structure_cache, "rb") as fh:
            planned_pages = pickle.load(fh)
        if progress_cb:
            progress_cb(
                f"[Step 3/3] Loaded wiki structure from cache: {len(planned_pages)} pages.",
                45.0,
            )
    else:
        if progress_cb:
            progress_cb("[Step 3/3] Planning wiki structure (Phase 1)...", 41.0)
        planned_pages = plan_wiki_structure(agent, repo_path, project_name, comprehensive)
        with open(structure_cache, "wb") as fh:
            pickle.dump(planned_pages, fh)
        if progress_cb:
            progress_cb(
                f"[Step 3/3] Wiki structure planned: {len(planned_pages)} pages.",
                45.0,
            )

    # High-importance pages are generated first so a truncated run (max_pages)
    # still covers the most valuable content.
    high = [p for p in planned_pages if p["importance"] == "high"]
    others = [p for p in planned_pages if p["importance"] != "high"]
    pages_to_generate = (high + others)[:max_pages]
    total_pages = len(pages_to_generate)

    if progress_cb:
        progress_cb(
            f"[Step 3/3] Generating {total_pages} wiki pages "
            f"({'comprehensive' if comprehensive else 'concise'} mode)...",
            46.0,
        )

    wiki_dir = output_dir / "wiki"
    wiki_dir.mkdir(parents=True, exist_ok=True)
    gen_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    model_name = os.getenv("MOONSHOT_MODEL", "kimi-k2.5")

    generated: list[dict] = []

    # Phase 2: generate each page; a per-page failure writes a stub page and
    # continues so one bad page never aborts the whole wiki.
    for i, page in enumerate(pages_to_generate, 1):
        try:
            content = generate_page_content(
                page, agent, repo_path, vector_store, embedder, func_map
            )
            mermaid_errors = validate_mermaid_blocks(content)
            if mermaid_errors:
                content, _ = fix_mermaid_errors(content, mermaid_errors, agent)

            page_file = wiki_dir / f"{page['id']}.md"
            page_file.write_text(content, encoding="utf-8")
            generated.append({**page, "content": content})

            # Map page progress to overall 46-98%
            page_pct = 46.0 + (i / total_pages) * 52.0
            if progress_cb:
                progress_cb(
                    f"[Step 3/3] Page {i}/{total_pages} done: {page['id']} — {page['title']} "
                    f"({len(content)} chars).",
                    page_pct,
                )
        except Exception as exc:
            err_content = f"# {page['title']}\n\n*生成失败: {exc}*"
            (wiki_dir / f"{page['id']}.md").write_text(err_content, encoding="utf-8")
            generated.append({**page, "content": err_content})
            page_pct = 46.0 + (i / total_pages) * 52.0
            if progress_cb:
                progress_cb(
                    f"[Step 3/3] Page {i}/{total_pages} FAILED: {page['id']} — {exc}",
                    page_pct,
                )

    # Write index.md
    total_funcs_row = builder.query("MATCH (f:Function) RETURN count(f) AS cnt")
    total_funcs = list(total_funcs_row[0].values())[0] if total_funcs_row else 0
    total_calls_row = builder.query("MATCH ()-[r:CALLS]->() RETURN count(r) AS cnt")
    total_calls = list(total_calls_row[0].values())[0] if total_calls_row else 0

    mode_label = "详细 Comprehensive" if comprehensive else "简洁 Concise"
    index_path = output_dir / "index.md"
    index_lines = [
        f"# {project_name} 源码 Wiki",
        "",
        f"*生成时间: {gen_time}*",
        f"*模型: {model_name} | 模式: {mode_label} | 上下文检索: 向量语义检索(Qwen3 Embedding)*",
        "",
        "---",
        "",
        "## 项目概览",
        "",
        "| 指标 | 数值 |",
        "|------|------|",
        f"| 总函数数 | {total_funcs:,} |",
        f"| 总调用关系 | {total_calls:,} |",
        f"| 本次生成页面 | {len(generated)} |",
        "",
        "---",
        "",
        "## Wiki 页面索引",
        "",
        "| 重要性 | 页面 | 描述 |",
        "|--------|------|------|",
    ]
    for p in generated:
        importance_icon = {"high": "🔴", "medium": "🟡", "low": "🟢"}.get(p["importance"], "⚪")
        desc = p["description"]
        short_desc = desc[:60] + "..." if len(desc) > 60 else desc
        index_lines.append(
            f"| {importance_icon} {p['importance']} | [{p['title']}](./wiki/{p['id']}.md) | {short_desc} |"
        )
    index_lines += ["", "---", "", "## 详细文档", ""]
    for p in generated:
        index_lines.append(f"- [{p['title']}](./wiki/{p['id']}.md) — {p['description']}")

    index_path.write_text("\n".join(index_lines), encoding="utf-8")

    if progress_cb:
        progress_cb(
            f"[Step 3/3] Wiki complete: {len(generated)} pages at {output_dir}/",
            100.0,
        )

    return index_path, len(generated)
|
|
731
|
+
|
|
732
|
+
|
|
733
|
+
# ---------------------------------------------------------------------------
|
|
734
|
+
# Workspace helpers
|
|
735
|
+
# ---------------------------------------------------------------------------
|
|
736
|
+
|
|
737
|
+
def save_meta(artifact_dir: Path, repo_path: Path, wiki_page_count: int) -> None:
    """Save or update artifact metadata in ``meta.json``.

    Preserves existing fields (like step-completion flags) and updates
    the timestamp and wiki page count.

    Args:
        artifact_dir: Directory holding the build artifacts and ``meta.json``.
        repo_path: Repository the artifacts were built from.
        wiki_page_count: Number of wiki pages generated this run.
    """
    meta_file = artifact_dir / "meta.json"
    existing: dict = {}
    if meta_file.exists():
        try:
            existing = json.loads(meta_file.read_text(encoding="utf-8"))
        except (json.JSONDecodeError, OSError):
            # Best-effort: a corrupt or unreadable meta file is simply rebuilt.
            pass

    # Auto-detect which artifacts exist on disk rather than trusting any
    # previously-stored flags.
    has_graph = (artifact_dir / "graph.db").exists()
    has_api_docs = (artifact_dir / "api_docs" / "index.md").exists()
    has_embeddings = (artifact_dir / "vectors.pkl").exists()
    has_wiki = wiki_page_count > 0 or (artifact_dir / "wiki" / "index.md").exists()

    meta = {
        **existing,
        "repo_path": str(repo_path),
        "repo_name": repo_path.name,
        "indexed_at": datetime.now().isoformat(),
        "wiki_page_count": wiki_page_count,
        "steps": {
            "graph": has_graph,
            "api_docs": has_api_docs,
            "embeddings": has_embeddings,
            "wiki": has_wiki,
        },
    }
    # Pin UTF-8 explicitly: ensure_ascii=False can emit non-ASCII text, which
    # would break under a non-UTF-8 locale default encoding (e.g. Windows
    # cp1252), and the read path above already assumes UTF-8.
    meta_file.write_text(
        json.dumps(meta, ensure_ascii=False, indent=2), encoding="utf-8"
    )
|
|
771
|
+
|
|
772
|
+
|
|
773
|
+
def artifact_dir_for(workspace: Path, repo_path: Path) -> Path:
    """Return the per-repository artifact directory inside *workspace*.

    The directory name joins the repo's basename with a short, stable MD5
    digest of its full path, so two repos that share a basename still map
    to distinct artifact directories.
    """
    import hashlib

    # MD5 is used purely as a fast, stable path fingerprint — not for security.
    digest = hashlib.md5(str(repo_path).encode()).hexdigest()
    suffix = digest[:8]
    return workspace / "{}_{}".format(repo_path.name, suffix)
|