code-graph-builder 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93) hide show
  1. code_graph_builder/__init__.py +82 -0
  2. code_graph_builder/builder.py +366 -0
  3. code_graph_builder/cgb_cli.py +32 -0
  4. code_graph_builder/cli.py +564 -0
  5. code_graph_builder/commands_cli.py +1288 -0
  6. code_graph_builder/config.py +340 -0
  7. code_graph_builder/constants.py +708 -0
  8. code_graph_builder/embeddings/__init__.py +40 -0
  9. code_graph_builder/embeddings/qwen3_embedder.py +573 -0
  10. code_graph_builder/embeddings/vector_store.py +584 -0
  11. code_graph_builder/examples/__init__.py +0 -0
  12. code_graph_builder/examples/example_configuration.py +276 -0
  13. code_graph_builder/examples/example_kuzu_usage.py +109 -0
  14. code_graph_builder/examples/example_semantic_search_full.py +347 -0
  15. code_graph_builder/examples/generate_wiki.py +915 -0
  16. code_graph_builder/examples/graph_export_example.py +100 -0
  17. code_graph_builder/examples/rag_example.py +206 -0
  18. code_graph_builder/examples/test_cli_demo.py +129 -0
  19. code_graph_builder/examples/test_embedding_api.py +153 -0
  20. code_graph_builder/examples/test_kuzu_local.py +190 -0
  21. code_graph_builder/examples/test_rag_redis.py +390 -0
  22. code_graph_builder/graph_updater.py +605 -0
  23. code_graph_builder/guidance/__init__.py +1 -0
  24. code_graph_builder/guidance/agent.py +123 -0
  25. code_graph_builder/guidance/prompts.py +74 -0
  26. code_graph_builder/guidance/toolset.py +264 -0
  27. code_graph_builder/language_spec.py +536 -0
  28. code_graph_builder/mcp/__init__.py +21 -0
  29. code_graph_builder/mcp/api_doc_generator.py +764 -0
  30. code_graph_builder/mcp/file_editor.py +207 -0
  31. code_graph_builder/mcp/pipeline.py +777 -0
  32. code_graph_builder/mcp/server.py +161 -0
  33. code_graph_builder/mcp/tools.py +1800 -0
  34. code_graph_builder/models.py +115 -0
  35. code_graph_builder/parser_loader.py +344 -0
  36. code_graph_builder/parsers/__init__.py +7 -0
  37. code_graph_builder/parsers/call_processor.py +306 -0
  38. code_graph_builder/parsers/call_resolver.py +139 -0
  39. code_graph_builder/parsers/definition_processor.py +796 -0
  40. code_graph_builder/parsers/factory.py +119 -0
  41. code_graph_builder/parsers/import_processor.py +293 -0
  42. code_graph_builder/parsers/structure_processor.py +145 -0
  43. code_graph_builder/parsers/type_inference.py +143 -0
  44. code_graph_builder/parsers/utils.py +134 -0
  45. code_graph_builder/rag/__init__.py +68 -0
  46. code_graph_builder/rag/camel_agent.py +429 -0
  47. code_graph_builder/rag/client.py +298 -0
  48. code_graph_builder/rag/config.py +239 -0
  49. code_graph_builder/rag/cypher_generator.py +67 -0
  50. code_graph_builder/rag/llm_backend.py +210 -0
  51. code_graph_builder/rag/markdown_generator.py +352 -0
  52. code_graph_builder/rag/prompt_templates.py +440 -0
  53. code_graph_builder/rag/rag_engine.py +640 -0
  54. code_graph_builder/rag/review_report.md +172 -0
  55. code_graph_builder/rag/tests/__init__.py +3 -0
  56. code_graph_builder/rag/tests/test_camel_agent.py +313 -0
  57. code_graph_builder/rag/tests/test_client.py +221 -0
  58. code_graph_builder/rag/tests/test_config.py +177 -0
  59. code_graph_builder/rag/tests/test_markdown_generator.py +240 -0
  60. code_graph_builder/rag/tests/test_prompt_templates.py +160 -0
  61. code_graph_builder/services/__init__.py +39 -0
  62. code_graph_builder/services/graph_service.py +465 -0
  63. code_graph_builder/services/kuzu_service.py +665 -0
  64. code_graph_builder/services/memory_service.py +171 -0
  65. code_graph_builder/settings.py +75 -0
  66. code_graph_builder/tests/ACCEPTANCE_CRITERIA_PHASE2.md +401 -0
  67. code_graph_builder/tests/__init__.py +1 -0
  68. code_graph_builder/tests/run_acceptance_check.py +378 -0
  69. code_graph_builder/tests/test_api_find.py +231 -0
  70. code_graph_builder/tests/test_api_find_integration.py +226 -0
  71. code_graph_builder/tests/test_basic.py +78 -0
  72. code_graph_builder/tests/test_c_api_extraction.py +388 -0
  73. code_graph_builder/tests/test_call_resolution_scenarios.py +504 -0
  74. code_graph_builder/tests/test_embedder.py +411 -0
  75. code_graph_builder/tests/test_integration_semantic.py +434 -0
  76. code_graph_builder/tests/test_mcp_protocol.py +298 -0
  77. code_graph_builder/tests/test_mcp_user_flow.py +190 -0
  78. code_graph_builder/tests/test_rag.py +404 -0
  79. code_graph_builder/tests/test_settings.py +135 -0
  80. code_graph_builder/tests/test_step1_graph_build.py +264 -0
  81. code_graph_builder/tests/test_step2_api_docs.py +323 -0
  82. code_graph_builder/tests/test_step3_embedding.py +278 -0
  83. code_graph_builder/tests/test_vector_store.py +552 -0
  84. code_graph_builder/tools/__init__.py +40 -0
  85. code_graph_builder/tools/graph_query.py +495 -0
  86. code_graph_builder/tools/semantic_search.py +387 -0
  87. code_graph_builder/types.py +333 -0
  88. code_graph_builder/utils/__init__.py +0 -0
  89. code_graph_builder/utils/path_utils.py +30 -0
  90. code_graph_builder-0.2.0.dist-info/METADATA +321 -0
  91. code_graph_builder-0.2.0.dist-info/RECORD +93 -0
  92. code_graph_builder-0.2.0.dist-info/WHEEL +4 -0
  93. code_graph_builder-0.2.0.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,123 @@
1
+ """GuidanceAgent — ReAct-loop LLM agent that produces code generation guidance.
2
+
3
+ The agent receives a design document, uses tools to research the target
4
+ codebase, then synthesises a structured guidance Markdown file.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import asyncio
10
+ import json
11
+ from typing import Any
12
+
13
+ from loguru import logger
14
+
15
+ from ..rag.llm_backend import ChatMessage, LLMBackend, ToolCall
16
+ from .prompts import SYSTEM_PROMPT
17
+ from .toolset import ToolSet
18
+
19
+
20
class GuidanceAgent:
    """LLM agent with a tool-calling (ReAct) loop.

    Depends only on :class:`ToolSet` (abstract) and :class:`LLMBackend` —
    has no knowledge of concrete MCP services.

    Args:
        toolset: Supplies the tool specs and executes tool calls.
        llm: Backend whose ``chat_with_tools`` method is invoked each turn.
        max_iterations: Maximum number of LLM turns that may request tools
            before a final, tool-free answer is forced.
        max_tokens: ``max_tokens`` value forwarded to every LLM call.
    """

    def __init__(
        self,
        toolset: ToolSet,
        llm: LLMBackend,
        max_iterations: int = 8,
        max_tokens: int = 8192,
    ) -> None:
        self._toolset = toolset
        self._llm = llm
        self._max_iterations = max_iterations
        self._max_tokens = max_tokens

    async def run(self, design_doc: str) -> str:
        """Execute the ReAct loop and return the guidance Markdown.

        Args:
            design_doc: Design document text, sent as the first user message.

        Returns:
            The content of the first LLM response that requests no tools, or
            the content of a forced tool-free response once
            ``max_iterations`` turns have been used. An empty string is
            returned when that response has no content.
        """
        messages: list[dict[str, Any]] = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": design_doc},
        ]

        tool_specs = self._toolset.tool_specs()

        for iteration in range(self._max_iterations):
            logger.debug(f"GuidanceAgent iteration {iteration + 1}/{self._max_iterations}")

            # chat_with_tools is called in a worker thread so that the event
            # loop is not blocked while waiting for the LLM.
            response = await asyncio.to_thread(
                self._llm.chat_with_tools,
                messages,
                tools=tool_specs or None,
                max_tokens=self._max_tokens,
            )

            if not response.tool_calls:
                # No tool calls — final output
                return response.content or ""

            # Record the assistant turn first, then one "tool" message per
            # requested call, keyed by the call id.
            messages.append(self._assistant_msg(response))
            for tc in response.tool_calls:
                result = await self._safe_call(tc)
                messages.append({
                    "role": "tool",
                    "tool_call_id": tc.id,
                    "content": result,
                })

        # Hit max iterations — force a final output without tools
        logger.warning(
            f"GuidanceAgent reached max iterations ({self._max_iterations}), "
            "forcing final output."
        )
        messages.append({
            "role": "user",
            "content": (
                "You have reached the maximum number of tool calls. "
                "Please produce the final guidance document now based on "
                "the information you have already gathered."
            ),
        })
        final = await asyncio.to_thread(
            self._llm.chat_with_tools,
            messages,
            tools=None,
            max_tokens=self._max_tokens,
        )
        return final.content or ""

    # -- Helpers -------------------------------------------------------------

    async def _safe_call(self, tc: ToolCall) -> str:
        """Execute a tool call, returning any failure as a JSON error string.

        Malformed arguments and exceptions raised by the toolset are reported
        back to the LLM as ``{"error": ...}`` instead of aborting the run.

        Args:
            tc: Tool call whose ``arguments`` is a JSON-encoded string.

        Returns:
            The toolset's result string, or a JSON object with an ``error``
            key describing what went wrong.
        """
        try:
            args = json.loads(tc.arguments)
        except json.JSONDecodeError:
            return json.dumps({"error": f"Invalid JSON arguments: {tc.arguments}"})

        # ToolSet.call expects a dict; valid JSON such as a list or null
        # must be rejected here rather than forwarded.
        if not isinstance(args, dict):
            return json.dumps(
                {"error": f"Tool arguments must be a JSON object: {tc.arguments}"}
            )

        logger.debug(f"Tool call: {tc.function_name}({args})")
        try:
            return await self._toolset.call(tc.function_name, args)
        except Exception as exc:
            # A failing tool should not abort the whole agent run; let the
            # LLM see the failure and decide how to proceed.
            logger.warning(f"Tool '{tc.function_name}' failed: {exc}")
            return json.dumps({"error": f"Tool execution failed: {exc}"})

    @staticmethod
    def _assistant_msg(response: ChatMessage) -> dict[str, Any]:
        """Build the assistant message dict for the conversation history.

        ``content`` and ``tool_calls`` are only included when the response
        actually carries them.
        """
        msg: dict[str, Any] = {"role": "assistant"}
        if response.content:
            msg["content"] = response.content
        if response.tool_calls:
            msg["tool_calls"] = [
                {
                    "id": tc.id,
                    "type": "function",
                    "function": {
                        "name": tc.function_name,
                        "arguments": tc.arguments,
                    },
                }
                for tc in response.tool_calls
            ]
        return msg
@@ -0,0 +1,74 @@
1
+ """System prompt templates for the GuidanceAgent."""
2
+
3
+ from __future__ import annotations
4
+
5
+ SYSTEM_PROMPT = """\
6
+ You are a code architecture expert. Your task is to convert a **design \
7
+ document** into a **code generation guidance file** by researching the target \
8
+ codebase.
9
+
10
+ ## Workflow
11
+
12
+ 1. **Read** the design document carefully. Identify the modules, functions, \
13
+ data types, and interfaces that will be created or modified.
14
+ 2. **Search** the codebase using the tools available to you:
15
+ - Use `find_api` to locate existing APIs that the new code must integrate \
16
+ with. This is the most important step — the generated code must call real \
17
+ interfaces with correct signatures.
18
+ - Use `semantic_search` to find similar implementations that can serve as \
19
+ reference patterns (code style, error handling, naming conventions).
20
+ - Use `query_code_graph` to understand call relationships and dependency \
21
+ chains — who calls what, which modules depend on which.
22
+ 3. **Synthesise** everything into a single Markdown guidance document \
23
+ (described below).
24
+
25
+ ## Guidelines
26
+
27
+ - Be efficient: use the minimum number of tool calls needed. Do not repeat \
28
+ searches with near-identical queries.
29
+ - When a tool returns no useful results, move on rather than retrying with \
30
+ trivial variations.
31
+ - Focus on information that a code-generation agent **cannot infer** from the \
32
+ design document alone: real function signatures, existing patterns, actual \
33
+ file paths.
34
+
35
+ ## Output Format
36
+
37
+ Produce a single Markdown document with the following sections. Omit a \
38
+ section if you found no relevant information for it.
39
+
40
+ ```
41
+ # Code Generation Guidance
42
+
43
+ ## Implementation Goal
44
+ [One-paragraph summary of what needs to be built, derived from the design \
45
+ document.]
46
+
47
+ ## Existing APIs to Use
48
+ [For each API the new code must call, list:]
49
+ - Fully qualified name
50
+ - Signature (parameters + return type)
51
+ - File path and line number
52
+ - Brief usage note
53
+
54
+ ## Reference Implementations
55
+ [2-3 most relevant existing functions that demonstrate the coding patterns \
56
+ to follow. Include file path and key code snippets.]
57
+
58
+ ## Dependency & Call Relationships
59
+ [Upstream: who will call the new code. Downstream: what the new code needs \
60
+ to call. Module-level dependency notes.]
61
+
62
+ ## Type Definitions
63
+ [Structs, enums, interfaces, or classes that the new code will consume or \
64
+ produce.]
65
+
66
+ ## Code Conventions
67
+ [Naming style, error handling pattern, comment format, return conventions — \
68
+ derived from the reference implementations above.]
69
+
70
+ ## Implementation Constraints
71
+ [Constraints from the design document + any architectural constraints \
72
+ discovered during research.]
73
+ ```
74
+ """
@@ -0,0 +1,264 @@
1
+ """ToolSet abstraction and MCPToolSet adapter for GuidanceAgent.
2
+
3
+ The ``ToolSet`` protocol defines the contract between the agent and its tools.
4
+ ``MCPToolSet`` implements this contract by wrapping the existing MCP services
5
+ (semantic search, Cypher generation, API doc lookup) without going through
6
+ the MCP protocol layer.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import json
12
+ from pathlib import Path
13
+ from typing import Any, Protocol
14
+
15
+ from loguru import logger
16
+
17
+
18
class ToolSet(Protocol):
    """Structural interface for the tools a GuidanceAgent relies on."""

    def tool_specs(self) -> list[dict[str, Any]]:
        """Describe the available tools in OpenAI function-calling format."""
        ...

    async def call(self, name: str, arguments: dict[str, Any]) -> str:
        """Run the tool called *name* with the supplied *arguments*.

        The returned value is a JSON-encoded string that can be placed in
        the LLM conversation as a tool result message.
        """
        ...
32
+
33
+
34
+ # ---------------------------------------------------------------------------
35
+ # Tool schema definitions (OpenAI function-calling format)
36
+ # ---------------------------------------------------------------------------
37
+
38
def _function_spec(
    name: str,
    description: str,
    properties: dict[str, Any],
    required: list[str],
) -> dict[str, Any]:
    """Wrap a tool's name, description and parameters in the function-calling envelope."""
    return {
        "type": "function",
        "function": {
            "name": name,
            "description": description,
            "parameters": {
                "type": "object",
                "properties": properties,
                "required": required,
            },
        },
    }


def _top_k_property() -> dict[str, Any]:
    """Return a fresh schema entry for the optional ``top_k`` argument."""
    return {
        "type": "integer",
        "description": "Number of results to return (default: 5)",
    }


# Similarity search over code entities; returns source snippets with scores.
_SEMANTIC_SEARCH_SPEC: dict[str, Any] = _function_spec(
    name="semantic_search",
    description=(
        "Search the codebase for functions, classes, or methods that are "
        "semantically similar to the query. Returns source code snippets "
        "with similarity scores."
    ),
    properties={
        "query": {
            "type": "string",
            "description": "Natural language description of what to search for",
        },
        "top_k": _top_k_property(),
    },
    required=["query"],
)

# API lookup; same arguments as semantic_search.
_FIND_API_SPEC: dict[str, Any] = _function_spec(
    name="find_api",
    description=(
        "Find existing API interfaces by semantic search and return their "
        "detailed documentation including function signatures, parameters, "
        "call trees, and source code."
    ),
    properties={
        "query": {
            "type": "string",
            "description": "Natural language description of the API to find",
        },
        "top_k": _top_k_property(),
    },
    required=["query"],
)

# Natural-language question answered against the code knowledge graph.
_QUERY_CODE_GRAPH_SPEC: dict[str, Any] = _function_spec(
    name="query_code_graph",
    description=(
        "Query the code knowledge graph using natural language. "
        "Useful for finding call relationships, module dependencies, "
        "class hierarchies, and structural patterns in the codebase."
    ),
    properties={
        "question": {
            "type": "string",
            "description": "Natural language question about code structure",
        },
    },
    required=["question"],
)

_ALL_SPECS = [_SEMANTIC_SEARCH_SPEC, _FIND_API_SPEC, _QUERY_CODE_GRAPH_SPEC]

# Maximum characters per tool result to avoid blowing up the context window.
_DEFAULT_MAX_RESULT_CHARS = 4000
116
+
117
+
118
+ # ---------------------------------------------------------------------------
119
+ # MCPToolSet — adapter that wraps existing Python services
120
+ # ---------------------------------------------------------------------------
121
+
122
+
123
class MCPToolSet:
    """Adapter that exposes existing MCP services as a :class:`ToolSet`.

    This calls the underlying Python service objects directly — it does NOT
    go through the MCP protocol.

    Args:
        semantic_service: Object with a ``search(query, top_k=...)`` method,
            or ``None`` to disable ``semantic_search`` and ``find_api``.
        cypher_gen: Object with a ``generate(question)`` method, or ``None``.
        ingestor: Object with a ``query(cypher)`` method, or ``None``.
            ``query_code_graph`` needs both ``cypher_gen`` and ``ingestor``.
        artifact_dir: Directory that may contain ``api_docs/funcs/*.md``
            files used to enrich ``find_api`` results, or ``None``.
        max_result_chars: Tool results longer than this many characters are
            cut and suffixed with a truncation marker.
    """

    def __init__(
        self,
        semantic_service: Any | None,
        cypher_gen: Any | None,
        ingestor: Any | None,
        artifact_dir: Path | None,
        max_result_chars: int = _DEFAULT_MAX_RESULT_CHARS,
    ) -> None:
        self._semantic_service = semantic_service
        self._cypher_gen = cypher_gen
        self._ingestor = ingestor
        self._artifact_dir = artifact_dir
        self._max_chars = max_result_chars

        # Tool name -> coroutine implementing it.
        self._dispatch = {
            "semantic_search": self._call_semantic_search,
            "find_api": self._call_find_api,
            "query_code_graph": self._call_query_code_graph,
        }

    def tool_specs(self) -> list[dict[str, Any]]:
        """Return tool definitions, excluding tools whose services are unavailable."""
        specs: list[dict[str, Any]] = []
        if self._semantic_service is not None:
            specs.append(_SEMANTIC_SEARCH_SPEC)
            specs.append(_FIND_API_SPEC)
        if self._cypher_gen is not None and self._ingestor is not None:
            specs.append(_QUERY_CODE_GRAPH_SPEC)
        return specs

    async def call(self, name: str, arguments: dict[str, Any]) -> str:
        """Execute the tool *name* and return its result as a JSON string.

        Unknown tool names and any exception raised by the handler are
        returned as a JSON object with an ``error`` key rather than raised.
        Results longer than ``max_result_chars`` are truncated, so a
        truncated result is no longer valid JSON.
        """
        handler = self._dispatch.get(name)
        if handler is None:
            return json.dumps({"error": f"Unknown tool: {name}"}, ensure_ascii=False)

        try:
            result = await handler(**arguments)
        except Exception as exc:
            logger.warning(f"Tool '{name}' failed: {exc}")
            return json.dumps(
                {"error": f"Tool execution failed: {exc}"},
                ensure_ascii=False,
                default=str,
            )

        text = json.dumps(result, ensure_ascii=False, default=str)
        if len(text) > self._max_chars:
            text = text[: self._max_chars] + "\n... (truncated)"
        return text

    # -- Tool implementations ------------------------------------------------

    def _require_semantic_service(self) -> Any:
        """Return the semantic service or raise if it was not configured."""
        # Explicit check instead of ``assert`` so the guard survives ``-O``
        # and produces a readable error for the LLM via ``call``.
        if self._semantic_service is None:
            raise RuntimeError("Semantic search service is not available")
        return self._semantic_service

    async def _call_semantic_search(
        self, query: str, top_k: int = 5, **_: Any
    ) -> dict[str, Any]:
        """Run a semantic search and return the hits with their source code.

        Extra keyword arguments are accepted and ignored.
        """
        service = self._require_semantic_service()
        results = service.search(query, top_k=top_k)
        return {
            "query": query,
            "result_count": len(results),
            "results": [
                {
                    "qualified_name": r.qualified_name,
                    "name": r.name,
                    "type": r.type,
                    "score": r.score,
                    "file_path": r.file_path,
                    "start_line": r.start_line,
                    "end_line": r.end_line,
                    "source_code": r.source_code,
                }
                for r in results
            ],
        }

    async def _call_find_api(
        self, query: str, top_k: int = 5, **_: Any
    ) -> dict[str, Any]:
        """Run a semantic search and attach the generated API doc to each hit.

        The doc for a hit is read from
        ``<artifact_dir>/api_docs/funcs/<qualified_name>.md`` (with path
        separators in the qualified name replaced by ``_``) when that file
        exists; otherwise ``api_doc`` stays ``None``.
        """
        service = self._require_semantic_service()
        results = service.search(query, top_k=top_k)

        # Resolve the docs directory once; None means "no docs to attach".
        funcs_dir: Path | None = None
        if self._artifact_dir:
            candidate = self._artifact_dir / "api_docs" / "funcs"
            if candidate.exists():
                funcs_dir = candidate
        has_api_docs = funcs_dir is not None

        combined = []
        for r in results:
            entry: dict[str, Any] = {
                "qualified_name": r.qualified_name,
                "name": r.name,
                "type": r.type,
                "score": r.score,
                "file_path": r.file_path,
                "source_code": r.source_code,
                "api_doc": None,
            }
            if funcs_dir is not None and r.qualified_name:
                # Strip path separators so the name maps to a single file
                # inside funcs_dir.
                safe_qn = r.qualified_name.replace("/", "_").replace("\\", "_")
                doc_file = funcs_dir / f"{safe_qn}.md"
                if doc_file.exists():
                    entry["api_doc"] = doc_file.read_text(
                        encoding="utf-8", errors="ignore"
                    )
            combined.append(entry)

        return {
            "query": query,
            "result_count": len(combined),
            "api_docs_available": has_api_docs,
            "results": combined,
        }

    async def _call_query_code_graph(
        self, question: str, **_: Any
    ) -> dict[str, Any]:
        """Translate *question* to Cypher, run it, and return the rows.

        Rows exposing ``get`` contribute their ``"result"`` entry (or the
        whole row when that key is absent); other rows are used as-is.
        Lists and tuples are returned as lists.
        """
        if self._cypher_gen is None or self._ingestor is None:
            raise RuntimeError("Code graph query service is not available")

        cypher = self._cypher_gen.generate(question)
        rows = self._ingestor.query(cypher)

        serialisable = []
        for row in rows:
            # Not every row is guaranteed to be a mapping; fall back to the
            # row itself when it has no ``get``.
            getter = getattr(row, "get", None)
            raw = getter("result", row) if callable(getter) else row
            if isinstance(raw, (list, tuple)):
                serialisable.append(list(raw))
            else:
                serialisable.append(raw)

        return {
            "question": question,
            "cypher": cypher,
            "row_count": len(serialisable),
            "rows": serialisable,
        }