code-graph-builder 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- code_graph_builder/__init__.py +82 -0
- code_graph_builder/builder.py +366 -0
- code_graph_builder/cgb_cli.py +32 -0
- code_graph_builder/cli.py +564 -0
- code_graph_builder/commands_cli.py +1288 -0
- code_graph_builder/config.py +340 -0
- code_graph_builder/constants.py +708 -0
- code_graph_builder/embeddings/__init__.py +40 -0
- code_graph_builder/embeddings/qwen3_embedder.py +573 -0
- code_graph_builder/embeddings/vector_store.py +584 -0
- code_graph_builder/examples/__init__.py +0 -0
- code_graph_builder/examples/example_configuration.py +276 -0
- code_graph_builder/examples/example_kuzu_usage.py +109 -0
- code_graph_builder/examples/example_semantic_search_full.py +347 -0
- code_graph_builder/examples/generate_wiki.py +915 -0
- code_graph_builder/examples/graph_export_example.py +100 -0
- code_graph_builder/examples/rag_example.py +206 -0
- code_graph_builder/examples/test_cli_demo.py +129 -0
- code_graph_builder/examples/test_embedding_api.py +153 -0
- code_graph_builder/examples/test_kuzu_local.py +190 -0
- code_graph_builder/examples/test_rag_redis.py +390 -0
- code_graph_builder/graph_updater.py +605 -0
- code_graph_builder/guidance/__init__.py +1 -0
- code_graph_builder/guidance/agent.py +123 -0
- code_graph_builder/guidance/prompts.py +74 -0
- code_graph_builder/guidance/toolset.py +264 -0
- code_graph_builder/language_spec.py +536 -0
- code_graph_builder/mcp/__init__.py +21 -0
- code_graph_builder/mcp/api_doc_generator.py +764 -0
- code_graph_builder/mcp/file_editor.py +207 -0
- code_graph_builder/mcp/pipeline.py +777 -0
- code_graph_builder/mcp/server.py +161 -0
- code_graph_builder/mcp/tools.py +1800 -0
- code_graph_builder/models.py +115 -0
- code_graph_builder/parser_loader.py +344 -0
- code_graph_builder/parsers/__init__.py +7 -0
- code_graph_builder/parsers/call_processor.py +306 -0
- code_graph_builder/parsers/call_resolver.py +139 -0
- code_graph_builder/parsers/definition_processor.py +796 -0
- code_graph_builder/parsers/factory.py +119 -0
- code_graph_builder/parsers/import_processor.py +293 -0
- code_graph_builder/parsers/structure_processor.py +145 -0
- code_graph_builder/parsers/type_inference.py +143 -0
- code_graph_builder/parsers/utils.py +134 -0
- code_graph_builder/rag/__init__.py +68 -0
- code_graph_builder/rag/camel_agent.py +429 -0
- code_graph_builder/rag/client.py +298 -0
- code_graph_builder/rag/config.py +239 -0
- code_graph_builder/rag/cypher_generator.py +67 -0
- code_graph_builder/rag/llm_backend.py +210 -0
- code_graph_builder/rag/markdown_generator.py +352 -0
- code_graph_builder/rag/prompt_templates.py +440 -0
- code_graph_builder/rag/rag_engine.py +640 -0
- code_graph_builder/rag/review_report.md +172 -0
- code_graph_builder/rag/tests/__init__.py +3 -0
- code_graph_builder/rag/tests/test_camel_agent.py +313 -0
- code_graph_builder/rag/tests/test_client.py +221 -0
- code_graph_builder/rag/tests/test_config.py +177 -0
- code_graph_builder/rag/tests/test_markdown_generator.py +240 -0
- code_graph_builder/rag/tests/test_prompt_templates.py +160 -0
- code_graph_builder/services/__init__.py +39 -0
- code_graph_builder/services/graph_service.py +465 -0
- code_graph_builder/services/kuzu_service.py +665 -0
- code_graph_builder/services/memory_service.py +171 -0
- code_graph_builder/settings.py +75 -0
- code_graph_builder/tests/ACCEPTANCE_CRITERIA_PHASE2.md +401 -0
- code_graph_builder/tests/__init__.py +1 -0
- code_graph_builder/tests/run_acceptance_check.py +378 -0
- code_graph_builder/tests/test_api_find.py +231 -0
- code_graph_builder/tests/test_api_find_integration.py +226 -0
- code_graph_builder/tests/test_basic.py +78 -0
- code_graph_builder/tests/test_c_api_extraction.py +388 -0
- code_graph_builder/tests/test_call_resolution_scenarios.py +504 -0
- code_graph_builder/tests/test_embedder.py +411 -0
- code_graph_builder/tests/test_integration_semantic.py +434 -0
- code_graph_builder/tests/test_mcp_protocol.py +298 -0
- code_graph_builder/tests/test_mcp_user_flow.py +190 -0
- code_graph_builder/tests/test_rag.py +404 -0
- code_graph_builder/tests/test_settings.py +135 -0
- code_graph_builder/tests/test_step1_graph_build.py +264 -0
- code_graph_builder/tests/test_step2_api_docs.py +323 -0
- code_graph_builder/tests/test_step3_embedding.py +278 -0
- code_graph_builder/tests/test_vector_store.py +552 -0
- code_graph_builder/tools/__init__.py +40 -0
- code_graph_builder/tools/graph_query.py +495 -0
- code_graph_builder/tools/semantic_search.py +387 -0
- code_graph_builder/types.py +333 -0
- code_graph_builder/utils/__init__.py +0 -0
- code_graph_builder/utils/path_utils.py +30 -0
- code_graph_builder-0.2.0.dist-info/METADATA +321 -0
- code_graph_builder-0.2.0.dist-info/RECORD +93 -0
- code_graph_builder-0.2.0.dist-info/WHEEL +4 -0
- code_graph_builder-0.2.0.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,278 @@
|
|
|
1
|
+
"""Step 3 integration test: vector embedding generation from tinycc graph.
|
|
2
|
+
|
|
3
|
+
Builds graph, generates API docs, then creates vector embeddings.
|
|
4
|
+
Validates embedding quality, vector store integrity, and semantic search.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import os
|
|
10
|
+
import pickle
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
|
|
13
|
+
import pytest
|
|
14
|
+
|
|
15
|
+
TINYCC_PATH = Path(__file__).resolve().parents[3] / "tinycc"
|
|
16
|
+
|
|
17
|
+
pytestmark = [
|
|
18
|
+
pytest.mark.skipif(
|
|
19
|
+
not TINYCC_PATH.exists(),
|
|
20
|
+
reason=f"tinycc source not found at {TINYCC_PATH}",
|
|
21
|
+
),
|
|
22
|
+
pytest.mark.skipif(
|
|
23
|
+
not os.environ.get("DASHSCOPE_API_KEY"),
|
|
24
|
+
reason="DASHSCOPE_API_KEY not set (required for embedding API)",
|
|
25
|
+
),
|
|
26
|
+
]
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@pytest.fixture(scope="module")
def pipeline_artifacts(tmp_path_factory):
    """Run Step 1 (graph) + Step 2 (api-docs) + Step 3 (embedding) once."""
    # Imported lazily so the module can be collected even when the package's
    # optional pipeline dependencies are unavailable.
    from code_graph_builder.mcp.pipeline import (
        build_graph,
        build_vector_index,
        generate_api_docs_step,
    )

    work_dir = tmp_path_factory.mktemp("artifacts")
    graph_db = work_dir / "graph.db"
    vector_file = work_dir / "vectors.pkl"

    # Step 1: build the code graph for the tinycc checkout (kuzu backend).
    graph_builder = build_graph(
        repo_path=TINYCC_PATH,
        db_path=graph_db,
        rebuild=True,
        backend="kuzu",
    )

    # Step 2: derive API documentation artifacts from the graph.
    generate_api_docs_step(builder=graph_builder, artifact_dir=work_dir, rebuild=True)

    # Step 3: embed functions and persist the vector index to disk.
    store, query_embedder, function_map = build_vector_index(
        builder=graph_builder,
        repo_path=TINYCC_PATH,
        vectors_path=vector_file,
        rebuild=True,
    )

    yield {
        "builder": graph_builder,
        "vector_store": store,
        "embedder": query_embedder,
        "func_map": function_map,
        "vectors_path": vector_file,
        "artifact_dir": work_dir,
    }

    # Teardown: release the graph backend if it exposes a close() hook.
    if hasattr(graph_builder, "close"):
        graph_builder.close()
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
# ---------------------------------------------------------------------------
|
|
75
|
+
# Vector store basics
|
|
76
|
+
# ---------------------------------------------------------------------------
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
class TestVectorStoreStructure:
|
|
80
|
+
"""Verify the vector store is populated correctly."""
|
|
81
|
+
|
|
82
|
+
def test_store_not_empty(self, pipeline_artifacts):
|
|
83
|
+
vs = pipeline_artifacts["vector_store"]
|
|
84
|
+
assert len(vs) > 50, f"Expected many embeddings, got {len(vs)}"
|
|
85
|
+
|
|
86
|
+
def test_func_map_matches_store(self, pipeline_artifacts):
|
|
87
|
+
vs = pipeline_artifacts["vector_store"]
|
|
88
|
+
fm = pipeline_artifacts["func_map"]
|
|
89
|
+
assert len(fm) == len(vs), (
|
|
90
|
+
f"func_map ({len(fm)}) should match vector_store ({len(vs)})"
|
|
91
|
+
)
|
|
92
|
+
|
|
93
|
+
def test_embedding_dimension(self, pipeline_artifacts):
|
|
94
|
+
"""Embeddings should have a reasonable dimension (1024 or 1536)."""
|
|
95
|
+
vs = pipeline_artifacts["vector_store"]
|
|
96
|
+
for record in vs._records.values():
|
|
97
|
+
dim = len(record.embedding)
|
|
98
|
+
assert dim in (1024, 1536), (
|
|
99
|
+
f"Unexpected embedding dimension: {dim}"
|
|
100
|
+
)
|
|
101
|
+
break
|
|
102
|
+
|
|
103
|
+
def test_vectors_file_persisted(self, pipeline_artifacts):
|
|
104
|
+
vp = pipeline_artifacts["vectors_path"]
|
|
105
|
+
assert vp.exists()
|
|
106
|
+
assert vp.stat().st_size > 1000, "vectors.pkl should be substantial"
|
|
107
|
+
|
|
108
|
+
def test_vectors_file_loadable(self, pipeline_artifacts):
|
|
109
|
+
vp = pipeline_artifacts["vectors_path"]
|
|
110
|
+
with open(vp, "rb") as f:
|
|
111
|
+
cache = pickle.load(f)
|
|
112
|
+
assert "vector_store" in cache
|
|
113
|
+
assert "func_map" in cache
|
|
114
|
+
assert len(cache["func_map"]) > 0
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
# ---------------------------------------------------------------------------
|
|
118
|
+
# Embedding content quality
|
|
119
|
+
# ---------------------------------------------------------------------------
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
class TestEmbeddingContent:
|
|
123
|
+
"""Verify embedding text is rich and includes expected context."""
|
|
124
|
+
|
|
125
|
+
def test_func_map_has_names(self, pipeline_artifacts):
|
|
126
|
+
fm = pipeline_artifacts["func_map"]
|
|
127
|
+
named = sum(1 for f in fm.values() if f.get("name"))
|
|
128
|
+
assert named == len(fm), "All funcs in func_map should have names"
|
|
129
|
+
|
|
130
|
+
def test_func_map_has_qualified_names(self, pipeline_artifacts):
|
|
131
|
+
fm = pipeline_artifacts["func_map"]
|
|
132
|
+
qn_count = sum(1 for f in fm.values() if f.get("qualified_name"))
|
|
133
|
+
assert qn_count == len(fm), "All funcs should have qualified_name"
|
|
134
|
+
|
|
135
|
+
def test_func_map_has_paths(self, pipeline_artifacts):
|
|
136
|
+
"""Most functions should have file paths."""
|
|
137
|
+
fm = pipeline_artifacts["func_map"]
|
|
138
|
+
with_path = sum(1 for f in fm.values() if f.get("path"))
|
|
139
|
+
ratio = with_path / len(fm) if fm else 0
|
|
140
|
+
assert ratio > 0.5, f"Only {ratio:.0%} functions have paths"
|
|
141
|
+
|
|
142
|
+
def test_func_map_has_line_numbers(self, pipeline_artifacts):
|
|
143
|
+
fm = pipeline_artifacts["func_map"]
|
|
144
|
+
with_lines = sum(
|
|
145
|
+
1 for f in fm.values()
|
|
146
|
+
if f.get("start_line") and f.get("end_line")
|
|
147
|
+
)
|
|
148
|
+
ratio = with_lines / len(fm) if fm else 0
|
|
149
|
+
assert ratio > 0.8, f"Only {ratio:.0%} functions have line numbers"
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
# ---------------------------------------------------------------------------
|
|
153
|
+
# Semantic search
|
|
154
|
+
# ---------------------------------------------------------------------------
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
class TestSemanticSearch:
|
|
158
|
+
"""Verify semantic search returns meaningful results."""
|
|
159
|
+
|
|
160
|
+
def test_search_by_function_name(self, pipeline_artifacts):
|
|
161
|
+
"""Searching for a known function name should return it."""
|
|
162
|
+
vs = pipeline_artifacts["vector_store"]
|
|
163
|
+
embedder = pipeline_artifacts["embedder"]
|
|
164
|
+
|
|
165
|
+
query_emb = embedder.embed_query("tcc_compile")
|
|
166
|
+
results = vs.search_similar(query_emb, top_k=5)
|
|
167
|
+
|
|
168
|
+
assert len(results) > 0, "Search should return results"
|
|
169
|
+
# Check that at least one result is related to compilation
|
|
170
|
+
qns = [r.qualified_name for r in results]
|
|
171
|
+
found = any("compile" in qn.lower() or "tcc" in qn.lower() for qn in qns)
|
|
172
|
+
assert found, f"Expected compilation-related results, got: {qns}"
|
|
173
|
+
|
|
174
|
+
def test_search_by_concept(self, pipeline_artifacts):
|
|
175
|
+
"""Searching by abstract concept should return relevant functions."""
|
|
176
|
+
vs = pipeline_artifacts["vector_store"]
|
|
177
|
+
embedder = pipeline_artifacts["embedder"]
|
|
178
|
+
|
|
179
|
+
query_emb = embedder.embed_query("parse expression")
|
|
180
|
+
results = vs.search_similar(query_emb, top_k=10)
|
|
181
|
+
|
|
182
|
+
assert len(results) > 0
|
|
183
|
+
qns = [r.qualified_name for r in results]
|
|
184
|
+
# Should find parsing-related functions
|
|
185
|
+
found = any(
|
|
186
|
+
"parse" in qn.lower() or "expr" in qn.lower()
|
|
187
|
+
for qn in qns
|
|
188
|
+
)
|
|
189
|
+
assert found, f"Expected parse/expr results, got: {qns}"
|
|
190
|
+
|
|
191
|
+
def test_search_returns_scores(self, pipeline_artifacts):
|
|
192
|
+
"""Search results should have similarity scores between 0 and 1."""
|
|
193
|
+
vs = pipeline_artifacts["vector_store"]
|
|
194
|
+
embedder = pipeline_artifacts["embedder"]
|
|
195
|
+
|
|
196
|
+
query_emb = embedder.embed_query("memory allocation")
|
|
197
|
+
results = vs.search_similar(query_emb, top_k=5)
|
|
198
|
+
|
|
199
|
+
assert len(results) > 0
|
|
200
|
+
for r in results:
|
|
201
|
+
assert 0.0 <= r.score <= 1.0, f"Score {r.score} out of range"
|
|
202
|
+
|
|
203
|
+
def test_search_scores_descending(self, pipeline_artifacts):
|
|
204
|
+
"""Results should be sorted by score descending."""
|
|
205
|
+
vs = pipeline_artifacts["vector_store"]
|
|
206
|
+
embedder = pipeline_artifacts["embedder"]
|
|
207
|
+
|
|
208
|
+
query_emb = embedder.embed_query("generate code")
|
|
209
|
+
results = vs.search_similar(query_emb, top_k=10)
|
|
210
|
+
|
|
211
|
+
scores = [r.score for r in results]
|
|
212
|
+
assert scores == sorted(scores, reverse=True), "Scores should be descending"
|
|
213
|
+
|
|
214
|
+
def test_search_top_k_limit(self, pipeline_artifacts):
|
|
215
|
+
"""Should return at most top_k results."""
|
|
216
|
+
vs = pipeline_artifacts["vector_store"]
|
|
217
|
+
embedder = pipeline_artifacts["embedder"]
|
|
218
|
+
|
|
219
|
+
query_emb = embedder.embed_query("function")
|
|
220
|
+
results = vs.search_similar(query_emb, top_k=3)
|
|
221
|
+
assert len(results) <= 3
|
|
222
|
+
|
|
223
|
+
def test_different_queries_different_results(self, pipeline_artifacts):
|
|
224
|
+
"""Different queries should return different top results."""
|
|
225
|
+
vs = pipeline_artifacts["vector_store"]
|
|
226
|
+
embedder = pipeline_artifacts["embedder"]
|
|
227
|
+
|
|
228
|
+
emb1 = embedder.embed_query("parse tokens lexer")
|
|
229
|
+
emb2 = embedder.embed_query("generate machine code output")
|
|
230
|
+
|
|
231
|
+
r1 = vs.search_similar(emb1, top_k=3)
|
|
232
|
+
r2 = vs.search_similar(emb2, top_k=3)
|
|
233
|
+
|
|
234
|
+
qns1 = {r.qualified_name for r in r1}
|
|
235
|
+
qns2 = {r.qualified_name for r in r2}
|
|
236
|
+
# At least some results should differ
|
|
237
|
+
assert qns1 != qns2, "Different queries should return different results"
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
# ---------------------------------------------------------------------------
|
|
241
|
+
# Cache reload
|
|
242
|
+
# ---------------------------------------------------------------------------
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
class TestCacheReload:
    """Verify embeddings can be loaded from cache."""

    def test_reload_matches_original(self, pipeline_artifacts):
        """Reloaded vector store should have same size as original."""
        from code_graph_builder.mcp.pipeline import build_vector_index

        original_store = pipeline_artifacts["vector_store"]
        graph_builder = pipeline_artifacts["builder"]
        cache_path = pipeline_artifacts["vectors_path"]

        # rebuild=False forces the pipeline to load the on-disk pickle cache
        # instead of re-embedding.
        cached_store, _, cached_map = build_vector_index(
            builder=graph_builder,
            repo_path=TINYCC_PATH,
            vectors_path=cache_path,
            rebuild=False,
        )

        assert len(cached_store) == len(original_store), "Cached store size should match"
        assert len(cached_map) == len(pipeline_artifacts["func_map"])

    def test_cached_search_works(self, pipeline_artifacts):
        """Semantic search on cached store should work."""
        cache_path = pipeline_artifacts["vectors_path"]
        model = pipeline_artifacts["embedder"]

        # Deserialize the persisted cache directly, bypassing the pipeline.
        with open(cache_path, "rb") as fh:
            payload = pickle.load(fh)
        store = payload["vector_store"]

        hits = store.search_similar(model.embed_query("compile source file"), top_k=5)
        assert len(hits) > 0, "Cached store search should return results"
|