code-graph-builder 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- code_graph_builder/__init__.py +82 -0
- code_graph_builder/builder.py +366 -0
- code_graph_builder/cgb_cli.py +32 -0
- code_graph_builder/cli.py +564 -0
- code_graph_builder/commands_cli.py +1288 -0
- code_graph_builder/config.py +340 -0
- code_graph_builder/constants.py +708 -0
- code_graph_builder/embeddings/__init__.py +40 -0
- code_graph_builder/embeddings/qwen3_embedder.py +573 -0
- code_graph_builder/embeddings/vector_store.py +584 -0
- code_graph_builder/examples/__init__.py +0 -0
- code_graph_builder/examples/example_configuration.py +276 -0
- code_graph_builder/examples/example_kuzu_usage.py +109 -0
- code_graph_builder/examples/example_semantic_search_full.py +347 -0
- code_graph_builder/examples/generate_wiki.py +915 -0
- code_graph_builder/examples/graph_export_example.py +100 -0
- code_graph_builder/examples/rag_example.py +206 -0
- code_graph_builder/examples/test_cli_demo.py +129 -0
- code_graph_builder/examples/test_embedding_api.py +153 -0
- code_graph_builder/examples/test_kuzu_local.py +190 -0
- code_graph_builder/examples/test_rag_redis.py +390 -0
- code_graph_builder/graph_updater.py +605 -0
- code_graph_builder/guidance/__init__.py +1 -0
- code_graph_builder/guidance/agent.py +123 -0
- code_graph_builder/guidance/prompts.py +74 -0
- code_graph_builder/guidance/toolset.py +264 -0
- code_graph_builder/language_spec.py +536 -0
- code_graph_builder/mcp/__init__.py +21 -0
- code_graph_builder/mcp/api_doc_generator.py +764 -0
- code_graph_builder/mcp/file_editor.py +207 -0
- code_graph_builder/mcp/pipeline.py +777 -0
- code_graph_builder/mcp/server.py +161 -0
- code_graph_builder/mcp/tools.py +1800 -0
- code_graph_builder/models.py +115 -0
- code_graph_builder/parser_loader.py +344 -0
- code_graph_builder/parsers/__init__.py +7 -0
- code_graph_builder/parsers/call_processor.py +306 -0
- code_graph_builder/parsers/call_resolver.py +139 -0
- code_graph_builder/parsers/definition_processor.py +796 -0
- code_graph_builder/parsers/factory.py +119 -0
- code_graph_builder/parsers/import_processor.py +293 -0
- code_graph_builder/parsers/structure_processor.py +145 -0
- code_graph_builder/parsers/type_inference.py +143 -0
- code_graph_builder/parsers/utils.py +134 -0
- code_graph_builder/rag/__init__.py +68 -0
- code_graph_builder/rag/camel_agent.py +429 -0
- code_graph_builder/rag/client.py +298 -0
- code_graph_builder/rag/config.py +239 -0
- code_graph_builder/rag/cypher_generator.py +67 -0
- code_graph_builder/rag/llm_backend.py +210 -0
- code_graph_builder/rag/markdown_generator.py +352 -0
- code_graph_builder/rag/prompt_templates.py +440 -0
- code_graph_builder/rag/rag_engine.py +640 -0
- code_graph_builder/rag/review_report.md +172 -0
- code_graph_builder/rag/tests/__init__.py +3 -0
- code_graph_builder/rag/tests/test_camel_agent.py +313 -0
- code_graph_builder/rag/tests/test_client.py +221 -0
- code_graph_builder/rag/tests/test_config.py +177 -0
- code_graph_builder/rag/tests/test_markdown_generator.py +240 -0
- code_graph_builder/rag/tests/test_prompt_templates.py +160 -0
- code_graph_builder/services/__init__.py +39 -0
- code_graph_builder/services/graph_service.py +465 -0
- code_graph_builder/services/kuzu_service.py +665 -0
- code_graph_builder/services/memory_service.py +171 -0
- code_graph_builder/settings.py +75 -0
- code_graph_builder/tests/ACCEPTANCE_CRITERIA_PHASE2.md +401 -0
- code_graph_builder/tests/__init__.py +1 -0
- code_graph_builder/tests/run_acceptance_check.py +378 -0
- code_graph_builder/tests/test_api_find.py +231 -0
- code_graph_builder/tests/test_api_find_integration.py +226 -0
- code_graph_builder/tests/test_basic.py +78 -0
- code_graph_builder/tests/test_c_api_extraction.py +388 -0
- code_graph_builder/tests/test_call_resolution_scenarios.py +504 -0
- code_graph_builder/tests/test_embedder.py +411 -0
- code_graph_builder/tests/test_integration_semantic.py +434 -0
- code_graph_builder/tests/test_mcp_protocol.py +298 -0
- code_graph_builder/tests/test_mcp_user_flow.py +190 -0
- code_graph_builder/tests/test_rag.py +404 -0
- code_graph_builder/tests/test_settings.py +135 -0
- code_graph_builder/tests/test_step1_graph_build.py +264 -0
- code_graph_builder/tests/test_step2_api_docs.py +323 -0
- code_graph_builder/tests/test_step3_embedding.py +278 -0
- code_graph_builder/tests/test_vector_store.py +552 -0
- code_graph_builder/tools/__init__.py +40 -0
- code_graph_builder/tools/graph_query.py +495 -0
- code_graph_builder/tools/semantic_search.py +387 -0
- code_graph_builder/types.py +333 -0
- code_graph_builder/utils/__init__.py +0 -0
- code_graph_builder/utils/path_utils.py +30 -0
- code_graph_builder-0.2.0.dist-info/METADATA +321 -0
- code_graph_builder-0.2.0.dist-info/RECORD +93 -0
- code_graph_builder-0.2.0.dist-info/WHEEL +4 -0
- code_graph_builder-0.2.0.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,605 @@
|
|
|
1
|
+
"""Graph updater for building code knowledge graphs."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import sys
|
|
6
|
+
from collections import OrderedDict, defaultdict
|
|
7
|
+
from collections.abc import Callable, ItemsView, KeysView
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import TYPE_CHECKING
|
|
10
|
+
|
|
11
|
+
from loguru import logger
|
|
12
|
+
from tree_sitter import Node, Parser
|
|
13
|
+
|
|
14
|
+
from . import constants as cs
|
|
15
|
+
from .language_spec import get_language_spec
|
|
16
|
+
from .parsers.factory import ProcessorFactory
|
|
17
|
+
from .services import IngestorProtocol
|
|
18
|
+
from .types import (
|
|
19
|
+
FunctionRegistry,
|
|
20
|
+
LanguageQueries,
|
|
21
|
+
NodeType,
|
|
22
|
+
PropertyDict,
|
|
23
|
+
QualifiedName,
|
|
24
|
+
SimpleNameLookup,
|
|
25
|
+
TrieNode,
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
if TYPE_CHECKING:
|
|
29
|
+
from .embeddings.qwen3_embedder import BaseEmbedder
|
|
30
|
+
from .embeddings.vector_store import VectorStore
|
|
31
|
+
from .utils.path_utils import should_skip_path
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class FunctionRegistryTrie:
    """Trie-based registry for efficient function lookup.

    Maps qualified names (e.g. ``"pkg.module.Class.method"``) to a
    ``NodeType``. Two structures are kept in sync:

    - ``_entries``: a flat dict for O(1) exact lookups, iteration and len().
    - ``root``: a trie keyed on dot-separated name parts for prefix queries.
      Terminal trie nodes carry the internal marker keys ``cs.TRIE_TYPE_KEY``
      and ``cs.TRIE_QN_KEY`` alongside ordinary child-part keys; internal
      keys are distinguished by the ``cs.TRIE_INTERNAL_PREFIX`` prefix.

    An optional ``simple_name_lookup`` (simple name -> set of qualified
    names) can be supplied as a fast path for ``find_ending_with``; note it
    is never updated by this class (``insert`` does not touch it), so the
    owner is responsible for keeping it current.
    """

    def __init__(self, simple_name_lookup: SimpleNameLookup | None = None) -> None:
        # Trie of dot-separated name parts; terminal nodes hold marker keys.
        self.root: TrieNode = {}
        # Flat mirror of all inserted entries for exact-match operations.
        self._entries: FunctionRegistry = {}
        # Optional external index used as a fast path in find_ending_with().
        self._simple_name_lookup = simple_name_lookup

    def insert(self, qualified_name: QualifiedName, func_type: NodeType) -> None:
        """Register *qualified_name* with *func_type* in both structures."""
        self._entries[qualified_name] = func_type

        parts = qualified_name.split(cs.SEPARATOR_DOT)
        current: TrieNode = self.root

        # Walk (and extend as needed) the trie one name part at a time.
        for part in parts:
            if part not in current:
                current[part] = {}
            child = current[part]
            assert isinstance(child, dict)
            current = child

        # Mark the terminal node with the entry's metadata.
        current[cs.TRIE_TYPE_KEY] = func_type
        current[cs.TRIE_QN_KEY] = qualified_name

    def get(
        self, qualified_name: QualifiedName, default: NodeType | None = None
    ) -> NodeType | None:
        """Exact lookup; returns *default* when the name is unknown."""
        return self._entries.get(qualified_name, default)

    def __contains__(self, qualified_name: QualifiedName) -> bool:
        return qualified_name in self._entries

    def __getitem__(self, qualified_name: QualifiedName) -> NodeType:
        return self._entries[qualified_name]

    def __setitem__(self, qualified_name: QualifiedName, func_type: NodeType) -> None:
        # Dict-style assignment is an alias for insert().
        self.insert(qualified_name, func_type)

    def __delitem__(self, qualified_name: QualifiedName) -> None:
        """Remove an entry; unknown names are silently ignored."""
        if qualified_name not in self._entries:
            return

        del self._entries[qualified_name]

        # Prune the corresponding trie path as well.
        parts = qualified_name.split(cs.SEPARATOR_DOT)
        self._cleanup_trie_path(parts, self.root)

    def _cleanup_trie_path(self, parts: list[str], node: TrieNode) -> bool:
        """Recursively unlink *parts* from the trie, pruning empty nodes.

        Returns True when *node* has become prunable (no entry markers and
        no remaining child parts) so the parent should delete it.
        """
        if not parts:
            # Reached the terminal node: drop its entry markers.
            node.pop(cs.TRIE_QN_KEY, None)
            node.pop(cs.TRIE_TYPE_KEY, None)
            return not node

        part = parts[0]
        if part not in node:
            # Path no longer present; nothing to prune.
            return False

        child = node[part]
        assert isinstance(child, dict)
        if self._cleanup_trie_path(parts[1:], child):
            del node[part]

        # This node is prunable only if it is neither an entry endpoint
        # itself nor still has any (non-internal) child parts.
        is_endpoint = cs.TRIE_QN_KEY in node
        has_children = any(not key.startswith(cs.TRIE_INTERNAL_PREFIX) for key in node)
        return not has_children and not is_endpoint

    def _navigate_to_prefix(self, prefix: str) -> TrieNode | None:
        """Return the trie node at *prefix*, or None if the path is absent.

        An empty prefix resolves to the root (matches everything).
        """
        parts = prefix.split(cs.SEPARATOR_DOT) if prefix else []
        current: TrieNode = self.root
        for part in parts:
            if part not in current:
                return None
            child = current[part]
            assert isinstance(child, dict)
            current = child
        return current

    def _collect_from_subtree(
        self,
        node: TrieNode,
        filter_fn: Callable[[QualifiedName], bool] | None = None,
    ) -> list[tuple[QualifiedName, NodeType]]:
        """Depth-first collect all (qualified_name, type) entries under *node*.

        *filter_fn*, when given, must return True for an entry to be kept.
        """
        results: list[tuple[QualifiedName, NodeType]] = []

        def dfs(n: TrieNode) -> None:
            if cs.TRIE_QN_KEY in n:
                qn = n[cs.TRIE_QN_KEY]
                func_type = n[cs.TRIE_TYPE_KEY]
                assert isinstance(qn, str) and isinstance(func_type, NodeType)
                if filter_fn is None or filter_fn(qn):
                    results.append((qn, func_type))

            for key, child in n.items():
                # Internal marker keys are metadata, not child parts.
                if not key.startswith(cs.TRIE_INTERNAL_PREFIX):
                    assert isinstance(child, dict)
                    dfs(child)

        dfs(node)
        return results

    def keys(self) -> KeysView[QualifiedName]:
        return self._entries.keys()

    def items(self) -> ItemsView[QualifiedName, NodeType]:
        return self._entries.items()

    def __len__(self) -> int:
        return len(self._entries)

    def find_with_prefix_and_suffix(
        self, prefix: str, suffix: str
    ) -> list[QualifiedName]:
        """Names under *prefix* that end with ``.suffix``."""
        node = self._navigate_to_prefix(prefix)
        if node is None:
            return []
        suffix_pattern = f".{suffix}"
        matches = self._collect_from_subtree(
            node, lambda qn: qn.endswith(suffix_pattern)
        )
        return [qn for qn, _ in matches]

    def find_ending_with(self, suffix: str) -> list[QualifiedName]:
        """All names ending with ``.suffix``.

        Uses the externally maintained simple-name index when it covers
        *suffix*; otherwise falls back to a linear scan of all entries.
        """
        if self._simple_name_lookup is not None and suffix in self._simple_name_lookup:
            return list(self._simple_name_lookup[suffix])
        return [qn for qn in self._entries.keys() if qn.endswith(f".{suffix}")]

    def find_with_prefix(self, prefix: str) -> list[tuple[QualifiedName, NodeType]]:
        """All (name, type) entries in the subtree rooted at *prefix*."""
        node = self._navigate_to_prefix(prefix)
        return [] if node is None else self._collect_from_subtree(node)
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
class BoundedASTCache:
    """LRU cache mapping file paths to parsed (AST root, language) pairs.

    Eviction is least-recently-used and is triggered on insertion by
    either an entry-count cap or an approximate memory budget.
    """

    def __init__(
        self,
        max_entries: int = 1000,
        max_memory_mb: int = 500,
    ):
        # OrderedDict insertion order doubles as recency order: the
        # left end is always the least recently used entry.
        self.cache: OrderedDict[Path, tuple[Node, cs.SupportedLanguage]] = OrderedDict()
        self.max_entries = max_entries
        self.max_memory_bytes = max_memory_mb * cs.BYTES_PER_MB

    def __setitem__(self, key: Path, value: tuple[Node, cs.SupportedLanguage]) -> None:
        # Re-inserting refreshes recency: discard any stale slot first so
        # the new entry lands at the most-recent end.
        self.cache.pop(key, None)
        self.cache[key] = value
        self._enforce_limits()

    def __getitem__(self, key: Path) -> tuple[Node, cs.SupportedLanguage]:
        entry = self.cache[key]
        # A hit promotes the key to most recently used.
        self.cache.move_to_end(key)
        return entry

    def __delitem__(self, key: Path) -> None:
        # Deleting a missing key is a silent no-op.
        self.cache.pop(key, None)

    def __contains__(self, key: Path) -> bool:
        return key in self.cache

    def items(self) -> ItemsView[Path, tuple[Node, cs.SupportedLanguage]]:
        return self.cache.items()

    def _enforce_limits(self) -> None:
        """Evict LRU entries until both the count and memory caps hold."""
        # Hard cap on entry count.
        while len(self.cache) > self.max_entries:
            self.cache.popitem(last=False)

        # Soft memory cap: shed roughly 10% of entries (at least one).
        if self._should_evict_for_memory():
            for _ in range(max(1, len(self.cache) // 10)):
                if not self.cache:
                    break
                self.cache.popitem(last=False)

    def _should_evict_for_memory(self) -> bool:
        """Best-effort check of the cached values' (shallow) footprint.

        ``sys.getsizeof`` does not follow references, so this is an
        approximation; on failure an occupancy heuristic is used instead.
        """
        try:
            approx_bytes = sum(map(sys.getsizeof, self.cache.values()))
            return approx_bytes > self.max_memory_bytes
        except Exception:
            return len(self.cache) > int(self.max_entries * 0.8)
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
class GraphUpdater:
    """Main coordinator for building code knowledge graphs.

    Drives a multi-pass pipeline over a repository (see ``run``):
    structure discovery, per-file definition extraction, call processing
    (plus method-override resolution), and an optional semantic-embedding
    pass. Parsed ASTs are held in a bounded LRU cache (``ast_cache``) so
    later passes can reuse them without re-parsing.
    """

    def __init__(
        self,
        ingestor: IngestorProtocol,
        repo_path: Path,
        parsers: dict[cs.SupportedLanguage, Parser],
        queries: dict[cs.SupportedLanguage, LanguageQueries],
        unignore_paths: frozenset[str] | None = None,
        exclude_paths: frozenset[str] | None = None,
        embedder: BaseEmbedder | None = None,
        vector_store: VectorStore | None = None,
        embedding_config: dict[str, bool | int | str] | None = None,
    ):
        """Set up shared state and the processor factory.

        Args:
            ingestor: Sink receiving graph nodes and relationships.
            repo_path: Root directory of the repository to analyze.
            parsers: tree-sitter parsers keyed by language.
            queries: Compiled tree-sitter queries keyed by language.
            unignore_paths: Paths exempted from skip rules (forwarded to
                ``should_skip_path`` and the factory).
            exclude_paths: Path fragments to exclude from processing.
            embedder: Optional embedder used in Pass 4.
            vector_store: Optional store for embedding vectors.
            embedding_config: Embedding options; ``"enabled"`` and
                ``"batch_size"`` are read here.
        """
        self.ingestor = ingestor
        self.repo_path = repo_path
        self.parsers = parsers
        self.queries = queries
        # The resolved directory name doubles as the root segment of every
        # qualified name produced for this project.
        self.project_name = repo_path.resolve().name
        self.simple_name_lookup: SimpleNameLookup = defaultdict(set)
        self.function_registry = FunctionRegistryTrie(
            simple_name_lookup=self.simple_name_lookup
        )
        self.ast_cache = BoundedASTCache()
        self.unignore_paths = unignore_paths
        self.exclude_paths = exclude_paths

        # Optional semantic-embedding pipeline (Pass 4).
        self.embedder = embedder
        self.vector_store = vector_store
        self.embedding_config = embedding_config or {}
        self._embedding_enabled = self.embedding_config.get("enabled", False)

        # The factory wires the per-pass processors to the shared state above.
        self.factory = ProcessorFactory(
            ingestor=self.ingestor,
            repo_path=self.repo_path,
            project_name=self.project_name,
            queries=self.queries,
            function_registry=self.function_registry,
            simple_name_lookup=self.simple_name_lookup,
            ast_cache=self.ast_cache,
            unignore_paths=self.unignore_paths,
            exclude_paths=self.exclude_paths,
        )

    def _is_dependency_file(self, file_name: str, filepath: Path) -> bool:
        """True for dependency manifests (per cs.DEPENDENCY_FILES, plus .csproj)."""
        return (
            file_name.lower() in cs.DEPENDENCY_FILES
            or filepath.suffix.lower() == ".csproj"
        )

    def run(self) -> None:
        """Run the graph building process.

        Executes the passes in order — structure, file definitions, calls
        (plus method-override resolution), optional embeddings — and then
        flushes the ingestor.
        """
        logger.info(f"Building graph for project: {self.project_name}")

        # Pass 1: Structure
        logger.info("Pass 1: Identifying project structure")
        self.factory.structure_processor.identify_structure()

        # Pass 2: Files
        logger.info("Pass 2: Processing files")
        self._process_files()

        logger.info(f"Found {len(self.function_registry)} functions")

        # Pass 3: Calls
        logger.info("Pass 3: Processing function calls")
        self._process_function_calls()

        # Process method overrides
        self.factory.definition_processor.process_all_method_overrides()

        # Pass 4: Semantic Embeddings (optional)
        if self._embedding_enabled and self.embedder and self.vector_store:
            logger.info("Pass 4: Generating semantic embeddings")
            self._generate_semantic_embeddings()

        logger.info("Analysis complete")
        self.ingestor.flush_all()

    def _process_files(self) -> None:
        """Process all files in the repository."""
        try:
            from .utils.path_utils import should_skip_path
        except ImportError:
            # Fallback if utils not available
            def should_skip_path(
                filepath: Path,
                repo_path: Path,
                exclude_paths: frozenset[str] | None = None,
                # Accepted for signature parity with the real helper, but
                # unused by this fallback implementation.
                unignore_paths: frozenset[str] | None = None,
            ) -> bool:
                rel_path = filepath.relative_to(repo_path)
                path_str = str(rel_path)

                # Skip common directories
                skip_dirs = {".git", "__pycache__", "node_modules", "venv", ".venv", ".pytest_cache"}
                if any(part in skip_dirs for part in rel_path.parts):
                    return True

                # Skip excluded paths (substring match against the
                # repo-relative path).
                if exclude_paths:
                    for pattern in exclude_paths:
                        if pattern in path_str:
                            return True

                return False

        # Sort files so header files (.h) are processed before source files (.c)
        # to populate header declarations before visibility resolution.
        all_files = sorted(
            self.repo_path.rglob("*"),
            key=lambda p: (0 if p.suffix == cs.EXT_H else 1, str(p)),
        )
        for filepath in all_files:
            if filepath.is_file() and not should_skip_path(
                filepath,
                self.repo_path,
                exclude_paths=self.exclude_paths,
                unignore_paths=self.unignore_paths,
            ):
                lang_config = get_language_spec(filepath.suffix)
                # Fallback: if the mapped language (e.g. CPP for .h) isn't
                # available but C is, use C for header files.
                if (
                    lang_config
                    and isinstance(lang_config.language, cs.SupportedLanguage)
                    and lang_config.language not in self.parsers
                    and filepath.suffix == cs.EXT_H
                    and cs.SupportedLanguage.C in self.parsers
                ):
                    from .language_spec import LANGUAGE_SPECS
                    lang_config = LANGUAGE_SPECS.get(cs.SupportedLanguage.C)
                if (
                    lang_config
                    and isinstance(lang_config.language, cs.SupportedLanguage)
                    and lang_config.language in self.parsers
                ):
                    # Parsed source file: extract definitions and cache the AST
                    # for the later call-processing pass.
                    result = self.factory.definition_processor.process_file(
                        filepath,
                        lang_config.language,
                        self.queries,
                        self.factory.structure_processor.structural_elements,
                    )
                    if result:
                        root_node, language = result
                        self.ast_cache[filepath] = (root_node, language)
                elif self._is_dependency_file(filepath.name, filepath):
                    self.factory.definition_processor.process_dependencies(filepath)

                    # NOTE(review): this generic-file registration sits inside
                    # the dependency-file branch; confirm it was not intended
                    # to run for every non-code file instead.
                    self.factory.structure_processor.process_generic_file(
                        filepath, filepath.name
                    )

    def _process_function_calls(self) -> None:
        """Process function calls in all cached ASTs."""
        # Snapshot the items so the call processor can safely touch the
        # LRU cache while we iterate.
        ast_cache_items = list(self.ast_cache.items())
        for file_path, (root_node, language) in ast_cache_items:
            self.factory.call_processor.process_calls_in_file(
                file_path, root_node, language, self.queries
            )

    def _generate_semantic_embeddings(self) -> None:
        """Generate semantic embeddings for functions and classes.

        This is Pass 4 of the graph building process.
        Extracts source code for each function/method and generates
        embeddings using the configured embedder.
        """
        if not self.embedder or not self.vector_store:
            logger.warning("Embedder or vector store not configured, skipping embeddings")
            return

        try:
            from .embeddings.vector_store import VectorRecord

            # Shared batch accumulators; _embed_and_store_batch stores and
            # clears records_to_store after each flushed batch.
            records_to_store: list[VectorRecord] = []
            texts_to_embed: list[str] = []
            node_info: list[tuple[int, str, PropertyDict]] = []

            batch_size = self.embedding_config.get("batch_size", 32)

            for qn, node_type in self.function_registry.items():
                # Only embed code entities (functions, methods, classes).
                if node_type not in (NodeType.FUNCTION, NodeType.METHOD, NodeType.CLASS):
                    continue

                try:
                    source_code = self._extract_source_for_qualified_name(qn)
                    if not source_code:
                        continue

                    node_id = self._get_node_id_for_qualified_name(qn)
                    if node_id is None:
                        continue

                    texts_to_embed.append(source_code)
                    node_info.append((node_id, qn, {"type": str(node_type)}))

                    # Flush once a full batch has accumulated.
                    if len(texts_to_embed) >= batch_size:
                        self._embed_and_store_batch(
                            texts_to_embed, node_info, records_to_store
                        )
                        texts_to_embed = []
                        node_info = []

                except Exception as e:
                    # Best-effort: one bad entity must not abort the pass.
                    logger.warning(f"Failed to prepare embedding for {qn}: {e}")
                    continue

            # Flush the final partial batch.
            if texts_to_embed:
                self._embed_and_store_batch(texts_to_embed, node_info, records_to_store)

            stats = self.vector_store.get_stats()
            logger.info(f"Generated embeddings for {stats['count']} code entities")

        except Exception as e:
            logger.error(f"Failed to generate semantic embeddings: {e}")

    def _embed_and_store_batch(
        self,
        texts: list[str],
        node_info: list[tuple[int, str, PropertyDict]],
        records: list,
    ) -> None:
        """Embed a batch of texts and store in vector store.

        Args:
            texts: Source code texts to embed
            node_info: Tuple of (node_id, qualified_name, metadata)
            records: Accumulated records list (cleared in place on success)
        """
        from .embeddings.vector_store import VectorRecord

        if not self.embedder or not self.vector_store:
            return

        try:
            embeddings = self.embedder.embed_documents(texts, show_progress=False)

            # texts and node_info are appended in lockstep by the caller,
            # so a plain zip pairs them one-to-one.
            for (node_id, qn, metadata), embedding in zip(node_info, embeddings):
                record = VectorRecord(
                    node_id=node_id,
                    qualified_name=qn,
                    embedding=embedding,
                    metadata=metadata,
                )
                records.append(record)

            self.vector_store.store_embeddings_batch(records)
            # Clear in place so the caller's shared accumulator is reset too.
            records.clear()

        except Exception as e:
            # Best-effort: a failed batch is logged and skipped.
            logger.warning(f"Failed to embed batch: {e}")

    def _extract_source_for_qualified_name(self, qualified_name: str) -> str | None:
        """Extract source code for a qualified name.

        Uses a line-scan heuristic rather than the AST: the first line that
        looks like a definition of the entity anchors a window of two lines
        of leading context plus up to 50 lines of body.

        Args:
            qualified_name: Fully qualified name of the entity

        Returns:
            Source code string or None if not found
        """
        try:
            parts = qualified_name.split(cs.SEPARATOR_DOT)
            if len(parts) < 2:
                # Bare project name — nothing file-level to extract.
                return None

            file_path = self._resolve_file_from_qn(parts)
            if not file_path or not file_path.exists():
                return None

            # Only files successfully parsed in Pass 2 qualify.
            if file_path not in self.ast_cache:
                return None

            # NOTE(review): the unpacked AST is unused below — this lookup
            # only refreshes the cache entry's LRU position.
            root_node, language = self.ast_cache[file_path]

            source_code = file_path.read_text(encoding="utf-8", errors="ignore")

            entity_name = parts[-1]
            lines = source_code.split("\n")

            for i, line in enumerate(lines):
                if entity_name in line and self._is_definition_line(line, entity_name):
                    # Two lines of leading context (e.g. decorators) and up
                    # to 50 lines of body.
                    start_line = max(0, i - 2)
                    end_line = min(len(lines), i + 50)
                    return "\n".join(lines[start_line:end_line])

            # Fallback: no definition line matched; embed the file head.
            return source_code[:2000]

        except Exception as e:
            logger.debug(f"Failed to extract source for {qualified_name}: {e}")
            return None

    def _is_definition_line(self, line: str, name: str) -> bool:
        """Check if a line contains a definition for the given name.

        Args:
            line: Source code line
            name: Entity name to check

        Returns:
            True if this looks like a definition line
        """
        stripped = line.strip()
        # Definition keywords across the languages this extractor targets.
        keywords = ["def ", "class ", "function ", "fn ", "func "]
        return any(kw in stripped for kw in keywords) and name in stripped

    def _resolve_file_from_qn(self, parts: list[str]) -> Path | None:
        """Resolve file path from qualified name parts.

        Tries progressively shorter prefixes of the name as a filesystem
        path under the repo root: first verbatim, then with common source
        extensions appended.

        Args:
            parts: Parts of the qualified name

        Returns:
            Path object or None if not resolved
        """
        try:
            if parts[0] != self.project_name:
                # Names outside this project cannot map to repo files.
                return None

            relative_parts = parts[1:]

            for i in range(len(relative_parts), 0, -1):
                candidate = self.repo_path.joinpath(*relative_parts[:i])
                if candidate.exists() and candidate.is_file():
                    return candidate

                for ext in [".py", ".js", ".ts", ".rs", ".go", ".java", ".cpp", ".c"]:
                    candidate_with_ext = self.repo_path.joinpath(
                        *relative_parts[:i]
                    ).with_suffix(ext)
                    if candidate_with_ext.exists():
                        return candidate_with_ext

            return None

        except Exception:
            return None

    def _get_node_id_for_qualified_name(self, qualified_name: str) -> int | None:
        """Get node ID for a qualified name from the ingestor.

        Args:
            qualified_name: Fully qualified name

        Returns:
            Node ID or None if not found
        """
        try:
            # Prefer the ingestor's own ID cache when it exposes one
            # (reaches into a private attribute — depends on ingestor
            # internals).
            if hasattr(self.ingestor, "_node_id_cache"):
                cache = self.ingestor._node_id_cache
                for key, node_id in cache.items():
                    # Cache keys are assumed to be tuples carrying the
                    # qualified name in slot 2 — TODO confirm with ingestor.
                    if isinstance(key, tuple) and len(key) >= 3:
                        if key[2] == qualified_name:
                            return node_id

            # NOTE(review): hash() is salted per interpreter run
            # (PYTHONHASHSEED), so this fallback ID is not stable across
            # processes — verify downstream consumers tolerate that.
            return hash(qualified_name) % (2**31)

        except Exception:
            return None

    def remove_file_from_state(self, file_path: Path) -> None:
        """Remove a file from the internal state."""
        logger.debug(f"Removing state for: {file_path}")

        if file_path in self.ast_cache:
            del self.ast_cache[file_path]

        relative_path = file_path.relative_to(self.repo_path)
        # A package __init__.py maps to its directory's module path; any
        # other file maps to its own path with the suffix dropped.
        path_parts = (
            relative_path.parent.parts
            if file_path.name == cs.INIT_PY
            else relative_path.with_suffix("").parts
        )
        module_qn_prefix = cs.SEPARATOR_DOT.join([self.project_name, *path_parts])

        qns_to_remove = set()

        # Drop every registry entry defined by this module (or the module
        # entry itself). Iterate over a snapshot since we delete inside.
        for qn in list(self.function_registry.keys()):
            if qn.startswith(f"{module_qn_prefix}.") or qn == module_qn_prefix:
                qns_to_remove.add(qn)
                del self.function_registry[qn]

        # Purge the removed names from the simple-name index as well.
        for simple_name, qn_set in self.simple_name_lookup.items():
            original_count = len(qn_set)
            new_qn_set = qn_set - qns_to_remove
            if len(new_qn_set) < original_count:
                self.simple_name_lookup[simple_name] = new_qn_set
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Guidance agent: converts design documents into code generation guidance."""
|