code-graph-builder 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93) hide show
  1. code_graph_builder/__init__.py +82 -0
  2. code_graph_builder/builder.py +366 -0
  3. code_graph_builder/cgb_cli.py +32 -0
  4. code_graph_builder/cli.py +564 -0
  5. code_graph_builder/commands_cli.py +1288 -0
  6. code_graph_builder/config.py +340 -0
  7. code_graph_builder/constants.py +708 -0
  8. code_graph_builder/embeddings/__init__.py +40 -0
  9. code_graph_builder/embeddings/qwen3_embedder.py +573 -0
  10. code_graph_builder/embeddings/vector_store.py +584 -0
  11. code_graph_builder/examples/__init__.py +0 -0
  12. code_graph_builder/examples/example_configuration.py +276 -0
  13. code_graph_builder/examples/example_kuzu_usage.py +109 -0
  14. code_graph_builder/examples/example_semantic_search_full.py +347 -0
  15. code_graph_builder/examples/generate_wiki.py +915 -0
  16. code_graph_builder/examples/graph_export_example.py +100 -0
  17. code_graph_builder/examples/rag_example.py +206 -0
  18. code_graph_builder/examples/test_cli_demo.py +129 -0
  19. code_graph_builder/examples/test_embedding_api.py +153 -0
  20. code_graph_builder/examples/test_kuzu_local.py +190 -0
  21. code_graph_builder/examples/test_rag_redis.py +390 -0
  22. code_graph_builder/graph_updater.py +605 -0
  23. code_graph_builder/guidance/__init__.py +1 -0
  24. code_graph_builder/guidance/agent.py +123 -0
  25. code_graph_builder/guidance/prompts.py +74 -0
  26. code_graph_builder/guidance/toolset.py +264 -0
  27. code_graph_builder/language_spec.py +536 -0
  28. code_graph_builder/mcp/__init__.py +21 -0
  29. code_graph_builder/mcp/api_doc_generator.py +764 -0
  30. code_graph_builder/mcp/file_editor.py +207 -0
  31. code_graph_builder/mcp/pipeline.py +777 -0
  32. code_graph_builder/mcp/server.py +161 -0
  33. code_graph_builder/mcp/tools.py +1800 -0
  34. code_graph_builder/models.py +115 -0
  35. code_graph_builder/parser_loader.py +344 -0
  36. code_graph_builder/parsers/__init__.py +7 -0
  37. code_graph_builder/parsers/call_processor.py +306 -0
  38. code_graph_builder/parsers/call_resolver.py +139 -0
  39. code_graph_builder/parsers/definition_processor.py +796 -0
  40. code_graph_builder/parsers/factory.py +119 -0
  41. code_graph_builder/parsers/import_processor.py +293 -0
  42. code_graph_builder/parsers/structure_processor.py +145 -0
  43. code_graph_builder/parsers/type_inference.py +143 -0
  44. code_graph_builder/parsers/utils.py +134 -0
  45. code_graph_builder/rag/__init__.py +68 -0
  46. code_graph_builder/rag/camel_agent.py +429 -0
  47. code_graph_builder/rag/client.py +298 -0
  48. code_graph_builder/rag/config.py +239 -0
  49. code_graph_builder/rag/cypher_generator.py +67 -0
  50. code_graph_builder/rag/llm_backend.py +210 -0
  51. code_graph_builder/rag/markdown_generator.py +352 -0
  52. code_graph_builder/rag/prompt_templates.py +440 -0
  53. code_graph_builder/rag/rag_engine.py +640 -0
  54. code_graph_builder/rag/review_report.md +172 -0
  55. code_graph_builder/rag/tests/__init__.py +3 -0
  56. code_graph_builder/rag/tests/test_camel_agent.py +313 -0
  57. code_graph_builder/rag/tests/test_client.py +221 -0
  58. code_graph_builder/rag/tests/test_config.py +177 -0
  59. code_graph_builder/rag/tests/test_markdown_generator.py +240 -0
  60. code_graph_builder/rag/tests/test_prompt_templates.py +160 -0
  61. code_graph_builder/services/__init__.py +39 -0
  62. code_graph_builder/services/graph_service.py +465 -0
  63. code_graph_builder/services/kuzu_service.py +665 -0
  64. code_graph_builder/services/memory_service.py +171 -0
  65. code_graph_builder/settings.py +75 -0
  66. code_graph_builder/tests/ACCEPTANCE_CRITERIA_PHASE2.md +401 -0
  67. code_graph_builder/tests/__init__.py +1 -0
  68. code_graph_builder/tests/run_acceptance_check.py +378 -0
  69. code_graph_builder/tests/test_api_find.py +231 -0
  70. code_graph_builder/tests/test_api_find_integration.py +226 -0
  71. code_graph_builder/tests/test_basic.py +78 -0
  72. code_graph_builder/tests/test_c_api_extraction.py +388 -0
  73. code_graph_builder/tests/test_call_resolution_scenarios.py +504 -0
  74. code_graph_builder/tests/test_embedder.py +411 -0
  75. code_graph_builder/tests/test_integration_semantic.py +434 -0
  76. code_graph_builder/tests/test_mcp_protocol.py +298 -0
  77. code_graph_builder/tests/test_mcp_user_flow.py +190 -0
  78. code_graph_builder/tests/test_rag.py +404 -0
  79. code_graph_builder/tests/test_settings.py +135 -0
  80. code_graph_builder/tests/test_step1_graph_build.py +264 -0
  81. code_graph_builder/tests/test_step2_api_docs.py +323 -0
  82. code_graph_builder/tests/test_step3_embedding.py +278 -0
  83. code_graph_builder/tests/test_vector_store.py +552 -0
  84. code_graph_builder/tools/__init__.py +40 -0
  85. code_graph_builder/tools/graph_query.py +495 -0
  86. code_graph_builder/tools/semantic_search.py +387 -0
  87. code_graph_builder/types.py +333 -0
  88. code_graph_builder/utils/__init__.py +0 -0
  89. code_graph_builder/utils/path_utils.py +30 -0
  90. code_graph_builder-0.2.0.dist-info/METADATA +321 -0
  91. code_graph_builder-0.2.0.dist-info/RECORD +93 -0
  92. code_graph_builder-0.2.0.dist-info/WHEEL +4 -0
  93. code_graph_builder-0.2.0.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,584 @@
1
+ """Vector store for code embeddings.
2
+
3
+ This module provides an abstract base class and implementations for storing
4
+ and searching code embeddings.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from abc import ABC, abstractmethod
10
+ from dataclasses import dataclass, field
11
+ from pathlib import Path
12
+ from typing import TYPE_CHECKING
13
+
14
+ from loguru import logger
15
+
16
+ if TYPE_CHECKING:
17
+ from ..types import PropertyDict
18
+
19
+
20
def cosine_similarity(a: list[float], b: list[float]) -> float:
    """Calculate cosine similarity between two vectors.

    Args:
        a: First vector
        b: Second vector

    Returns:
        Cosine similarity, guaranteed to lie in the closed interval
        [-1.0, 1.0] for finite inputs

    Raises:
        ValueError: If vectors have different lengths or are zero vectors
    """
    import math

    if len(a) != len(b):
        raise ValueError(f"Vectors have different lengths: {len(a)} vs {len(b)}")

    dot_product = sum(x * y for x, y in zip(a, b))
    norm_a = math.sqrt(sum(x * x for x in a))
    norm_b = math.sqrt(sum(x * x for x in b))

    # Also covers the empty-vector case, where both norms are 0.
    if norm_a == 0 or norm_b == 0:
        raise ValueError("Cannot compute cosine similarity for zero vectors")

    similarity = dot_product / (norm_a * norm_b)

    # Rounding in the sqrt/multiply/divide chain can push the ratio a hair
    # past +/-1 (e.g. 1.0000000000000002 for identical vectors). Clamp with
    # explicit comparisons rather than min()/max() so a NaN result is passed
    # through unchanged instead of being silently turned into a bound.
    if similarity > 1.0:
        return 1.0
    if similarity < -1.0:
        return -1.0
    return similarity
46
+
47
+
48
@dataclass
class VectorRecord:
    """One stored embedding together with the identity of its code entity.

    Attributes:
        node_id: Unique node identifier
        qualified_name: Fully qualified name of the code entity
        embedding: Embedding vector
        metadata: Additional scalar metadata; empty when not supplied
    """

    node_id: int
    qualified_name: str
    embedding: list[float]
    # default_factory gives each record its own dict instead of a shared one.
    metadata: dict[str, str | int | float | None] = field(default_factory=dict)
63
+
64
+
65
@dataclass
class SearchResult:
    """A single hit returned by a vector similarity search.

    Attributes:
        node_id: Node identifier of the matched record
        score: Similarity score (higher is better)
        qualified_name: Fully qualified name of the matched code entity
    """

    node_id: int
    score: float
    qualified_name: str
78
+
79
+
80
class VectorStore(ABC):
    """Interface that every embedding storage backend implements."""

    @abstractmethod
    def store_embedding(
        self,
        node_id: int,
        qualified_name: str,
        embedding: list[float],
        metadata: PropertyDict | None = None,
        **kwargs,
    ) -> None:
        """Persist a single embedding.

        Args:
            node_id: Unique node identifier
            qualified_name: Fully qualified name of the code entity
            embedding: Embedding vector
            metadata: Additional metadata
            **kwargs: Extra keyword arguments; their handling is left to
                the concrete backend
        """
        ...

    @abstractmethod
    def store_embeddings_batch(self, records: list[VectorRecord]) -> None:
        """Persist several embeddings in one call.

        Args:
            records: Vector records to store
        """
        ...

    @abstractmethod
    def search_similar(
        self,
        query_embedding: list[float],
        top_k: int = 5,
        filter_metadata: PropertyDict | None = None,
    ) -> list[SearchResult]:
        """Find the stored embeddings most similar to a query vector.

        Args:
            query_embedding: Query embedding vector
            top_k: Number of results to return
            filter_metadata: Optional metadata filter

        Returns:
            List of search results
        """
        ...

    @abstractmethod
    def delete_by_node_id(self, node_id: int) -> bool:
        """Remove the embedding stored under a node ID.

        Args:
            node_id: Node identifier to delete

        Returns:
            True if deleted, False if not found
        """
        ...

    @abstractmethod
    def clear(self) -> None:
        """Remove every embedding from the store."""
        ...

    @abstractmethod
    def get_stats(self) -> dict[str, int]:
        """Report statistics about the store.

        Returns:
            Dictionary with statistics (count, dimension, etc.)
        """
        ...
159
+
160
+
161
class MemoryVectorStore(VectorStore):
    """In-memory vector store implementation.

    Uses cosine similarity for search. Suitable for testing and
    small datasets.

    Args:
        dimension: Expected embedding dimension
    """

    def __init__(self, dimension: int = 1024):
        self.dimension = dimension
        # Records keyed by node_id; storing an existing id overwrites it.
        self._records: dict[int, VectorRecord] = {}

    def __len__(self) -> int:
        """Return the number of stored embeddings."""
        return len(self._records)

    @staticmethod
    def _coerce_metadata(source) -> dict[str, str | int | float | None]:
        """Reduce a mapping to the scalar value types a record may hold.

        Strings, ints, floats and None are kept unchanged. Booleans are also
        kept unchanged because bool is a subclass of int. Lists are stored
        as their ``str()`` representation. Values of any other type are
        dropped.

        Args:
            source: Mapping of metadata keys to arbitrary values

        Returns:
            New dictionary containing only the supported entries
        """
        coerced: dict[str, str | int | float | None] = {}
        for key, value in source.items():
            if value is None or isinstance(value, (str, int, float)):
                coerced[key] = value
            elif isinstance(value, list):
                coerced[key] = str(value)
        return coerced

    def _cosine_similarity(self, a: list[float], b: list[float]) -> float:
        """Calculate cosine similarity between two vectors, never raising.

        Args:
            a: First vector
            b: Second vector

        Returns:
            Cosine similarity (-1 to 1), or 0.0 when the module-level
            ``cosine_similarity`` rejects the pair (length mismatch or a
            zero vector)
        """
        try:
            return cosine_similarity(a, b)
        except ValueError:
            # Best effort: an incomparable record ranks as "unrelated"
            # instead of aborting the whole search.
            return 0.0

    def store_embedding(
        self,
        node_id: int,
        qualified_name: str,
        embedding: list[float],
        metadata: PropertyDict | None = None,
        **kwargs,
    ) -> None:
        """Store an embedding vector in memory.

        Args:
            node_id: Unique node identifier
            qualified_name: Fully qualified name of the code entity
            embedding: Embedding vector
            metadata: Additional metadata dictionary
            **kwargs: Additional keyword arguments (stored as metadata;
                they override ``metadata`` entries with the same key)

        Raises:
            ValueError: If embedding dimension doesn't match or embedding is empty
        """
        if not embedding:
            raise ValueError("Embedding cannot be empty")

        if len(embedding) != self.dimension:
            raise ValueError(
                f"Embedding dimension mismatch: expected {self.dimension}, got {len(embedding)}"
            )

        meta = self._coerce_metadata(metadata) if metadata else {}
        meta.update(self._coerce_metadata(kwargs))

        self._records[node_id] = VectorRecord(
            node_id=node_id,
            qualified_name=qualified_name,
            embedding=embedding,
            metadata=meta,
        )

    def store_embeddings_batch(
        self,
        records: list[VectorRecord],
    ) -> None:
        """Store multiple embeddings in batch.

        Records are stored as given, without the validation performed by
        ``store_embedding``. Records whose embedding length differs from
        the store dimension are still accepted, but a warning is logged
        because such records always score 0.0 against a query of the
        expected dimension.

        Args:
            records: List of vector records to store
        """
        mismatched = 0
        for record in records:
            if len(record.embedding) != self.dimension:
                mismatched += 1
            self._records[record.node_id] = record

        if mismatched:
            logger.warning(
                f"Stored {mismatched} record(s) whose embedding length differs "
                f"from the store dimension {self.dimension}"
            )

    def search_similar(
        self,
        query_embedding: list[float],
        top_k: int = 5,
        filter_metadata: PropertyDict | None = None,
    ) -> list[SearchResult]:
        """Search for similar embeddings using cosine similarity.

        Args:
            query_embedding: Query embedding vector
            top_k: Maximum number of results to return
            filter_metadata: Optional exact-match filter; a record is kept
                only if every given key equals the record's metadata value

        Returns:
            Up to ``top_k`` results ordered by descending score, with
            scores rounded to 4 decimal places

        Raises:
            ValueError: If ``top_k`` is negative
        """
        import heapq

        if top_k < 0:
            raise ValueError(f"top_k must be non-negative, got {top_k}")

        if not self._records:
            return []

        scored: list[tuple[int, float, str]] = []
        for node_id, record in self._records.items():
            if filter_metadata and not all(
                record.metadata.get(k) == v for k, v in filter_metadata.items()
            ):
                continue

            similarity = self._cosine_similarity(query_embedding, record.embedding)
            scored.append((node_id, similarity, record.qualified_name))

        # nlargest is documented as equivalent to
        # sorted(..., key=..., reverse=True)[:top_k], including tie order,
        # but avoids sorting the full candidate list when top_k is small.
        best = heapq.nlargest(top_k, scored, key=lambda item: item[1])

        return [
            SearchResult(node_id=node_id, score=round(score, 4), qualified_name=qn)
            for node_id, score, qn in best
        ]

    def delete_by_node_id(self, node_id: int) -> bool:
        """Delete an embedding by node ID.

        Args:
            node_id: Node identifier to delete

        Returns:
            True if deleted, False if not found
        """
        if node_id in self._records:
            del self._records[node_id]
            return True
        return False

    # Alias for compatibility with tests
    delete_embedding = delete_by_node_id

    def get_embedding(self, node_id: int) -> "VectorRecord | None":
        """Get an embedding record by node ID.

        Args:
            node_id: Node identifier

        Returns:
            VectorRecord if found, None otherwise
        """
        return self._records.get(node_id)

    def clear(self) -> None:
        """Clear all embeddings."""
        self._records.clear()

    def get_stats(self) -> dict[str, int]:
        """Get store statistics.

        Returns:
            Dictionary with the record ``count`` and configured ``dimension``
        """
        return {
            "count": len(self._records),
            "dimension": self.dimension,
        }

    def get_all_records(self) -> list[VectorRecord]:
        """Get all records (for testing/debugging).

        Returns:
            List of all vector records
        """
        return list(self._records.values())
332
+
333
+
334
class QdrantVectorStore(VectorStore):
    """Qdrant-based vector store implementation.

    Requires qdrant-client to be installed. The client is created lazily on
    first use, so constructing this object never imports qdrant-client.

    Args:
        collection_name: Name of the Qdrant collection
        dimension: Embedding dimension
        db_path: Path for local Qdrant storage (optional)
        host: Qdrant server host (if not using local)
        port: Qdrant server port
    """

    def __init__(
        self,
        collection_name: str = "code_embeddings",
        dimension: int = 1024,
        db_path: str | Path | None = None,
        host: str | None = None,
        port: int = 6333,
    ):
        self.collection_name = collection_name
        self.dimension = dimension
        self.db_path = Path(db_path) if db_path else None
        self.host = host
        self.port = port

        self._client: "QdrantClient | None" = None
        self._initialized = False

    def _lazy_init(self) -> None:
        """Create the Qdrant client (once) and make sure the collection exists.

        Connection target precedence: ``db_path`` (local storage), then
        ``host``/``port`` (server), otherwise an in-memory instance.

        Raises:
            RuntimeError: If qdrant-client is not installed
            Exception: Any error raised by qdrant-client is logged and re-raised
        """
        if self._initialized:
            return

        try:
            from qdrant_client import QdrantClient
            from qdrant_client.models import Distance, VectorParams

            # Reuse an existing client. clear() resets _initialized to get
            # the collection recreated; building a second client at that
            # point would discard the ":memory:" instance and reopen local
            # storage that the first client still holds.
            if self._client is None:
                if self.db_path:
                    self._client = QdrantClient(path=str(self.db_path))
                elif self.host:
                    self._client = QdrantClient(host=self.host, port=self.port)
                else:
                    self._client = QdrantClient(location=":memory:")

            if not self._client.collection_exists(self.collection_name):
                self._client.create_collection(
                    collection_name=self.collection_name,
                    vectors_config=VectorParams(
                        size=self.dimension,
                        distance=Distance.COSINE,
                    ),
                )
                logger.info(f"Created Qdrant collection: {self.collection_name}")

            self._initialized = True

        except ImportError as e:
            logger.error(f"Failed to import qdrant-client: {e}")
            raise RuntimeError(
                "qdrant-client required for QdrantVectorStore. "
                "Install with: pip install qdrant-client"
            ) from e
        except Exception as e:
            logger.error(f"Failed to initialize Qdrant: {e}")
            raise

    def _require_client(self) -> "QdrantClient":
        """Return the initialized client, initializing it on first use.

        Raises:
            RuntimeError: If initialization finished without a client
        """
        self._lazy_init()
        # Explicit check instead of ``assert`` so it survives ``python -O``.
        if self._client is None:
            raise RuntimeError("Qdrant client is not initialized")
        return self._client

    @staticmethod
    def _build_payload(
        node_id: int,
        qualified_name: str,
        *sources,
    ) -> dict[str, str | int | float | None]:
        """Build a point payload from identity fields plus metadata mappings.

        Strings, ints, floats and None are kept unchanged. Booleans are also
        kept unchanged because bool is a subclass of int. Lists are stored
        as their ``str()`` representation. Values of any other type are
        dropped. Later sources override earlier ones, and any source may
        override ``node_id`` / ``qualified_name``.

        Args:
            node_id: Unique node identifier
            qualified_name: Fully qualified name of the code entity
            *sources: Metadata mappings to merge, in order; falsy ones are skipped

        Returns:
            Payload dictionary ready to attach to a point
        """
        payload: dict[str, str | int | float | None] = {
            "node_id": node_id,
            "qualified_name": qualified_name,
        }
        for source in sources:
            if not source:
                continue
            for key, value in source.items():
                if value is None or isinstance(value, (str, int, float)):
                    payload[key] = value
                elif isinstance(value, list):
                    payload[key] = str(value)
        return payload

    def store_embedding(
        self,
        node_id: int,
        qualified_name: str,
        embedding: list[float],
        metadata: PropertyDict | None = None,
        **kwargs,
    ) -> None:
        """Store an embedding vector in Qdrant.

        Args:
            node_id: Unique node identifier; also used as the point id
            qualified_name: Fully qualified name of the code entity
            embedding: Embedding vector
            metadata: Additional metadata dictionary
            **kwargs: Additional keyword arguments (stored as metadata;
                they override ``metadata`` entries with the same key)
        """
        client = self._require_client()

        from qdrant_client.models import PointStruct

        payload = self._build_payload(node_id, qualified_name, metadata, kwargs)

        client.upsert(
            collection_name=self.collection_name,
            points=[
                PointStruct(
                    id=node_id,
                    vector=embedding,
                    payload=payload,
                )
            ],
        )

    def store_embeddings_batch(
        self,
        records: list[VectorRecord],
    ) -> None:
        """Store multiple embeddings in batch.

        Record metadata is copied into the payload as-is. Nothing is sent
        to Qdrant when ``records`` is empty.

        Args:
            records: List of vector records to store
        """
        client = self._require_client()

        from qdrant_client.models import PointStruct

        points = []
        for record in records:
            payload: dict[str, str | int | float | None] = {
                "node_id": record.node_id,
                "qualified_name": record.qualified_name,
            }
            payload.update(record.metadata)

            points.append(
                PointStruct(
                    id=record.node_id,
                    vector=record.embedding,
                    payload=payload,
                )
            )

        if points:
            client.upsert(
                collection_name=self.collection_name,
                points=points,
            )

    def search_similar(
        self,
        query_embedding: list[float],
        top_k: int = 5,
        filter_metadata: PropertyDict | None = None,
    ) -> list[SearchResult]:
        """Search for similar embeddings in Qdrant.

        Args:
            query_embedding: Query embedding vector
            top_k: Maximum number of results to return
            filter_metadata: Optional exact-match filter. Only str and int
                (including bool) values are turned into conditions; other
                values are skipped with a warning.

        Returns:
            List of search results as returned by Qdrant; hits without a
            payload are omitted

        Raises:
            ValueError: If ``top_k`` is negative
        """
        # Same contract as MemoryVectorStore: negative is an error, zero
        # yields no results (and needs no round-trip to Qdrant).
        if top_k < 0:
            raise ValueError(f"top_k must be non-negative, got {top_k}")
        if top_k == 0:
            return []

        client = self._require_client()

        from qdrant_client.models import Filter, FieldCondition, MatchValue

        search_filter = None
        if filter_metadata:
            conditions = []
            for k, v in filter_metadata.items():
                if isinstance(v, (str, int)):
                    conditions.append(
                        FieldCondition(key=k, match=MatchValue(value=v))
                    )
                else:
                    # Surface the skip: otherwise the caller receives results
                    # that look filtered but are not.
                    logger.warning(
                        f"Ignoring metadata filter '{k}': unsupported value type "
                        f"{type(v).__name__}"
                    )
            if conditions:
                search_filter = Filter(must=conditions)

        results = client.query_points(
            collection_name=self.collection_name,
            query=query_embedding,
            limit=top_k,
            query_filter=search_filter,
        )

        return [
            SearchResult(
                # Fall back to the point id when the payload has no node_id
                # instead of failing the whole search with KeyError.
                node_id=hit.payload.get("node_id", hit.id),
                score=hit.score,
                qualified_name=str(hit.payload.get("qualified_name", "")),
            )
            for hit in results.points
            if hit.payload is not None
        ]

    def delete_by_node_id(self, node_id: int) -> bool:
        """Delete an embedding by node ID.

        Args:
            node_id: Node identifier to delete

        Returns:
            True if deleted, False if not found
        """
        client = self._require_client()

        # The delete call reports an operation id whether or not the point
        # existed, so look the point up first to honor the return contract.
        existing = client.retrieve(
            collection_name=self.collection_name,
            ids=[node_id],
            with_payload=False,
            with_vectors=False,
        )
        if not existing:
            return False

        client.delete(
            collection_name=self.collection_name,
            points_selector=[node_id],
        )
        return True

    def clear(self) -> None:
        """Clear all embeddings by dropping and recreating the collection."""
        client = self._require_client()

        client.delete_collection(self.collection_name)
        # Re-run initialization so the (now missing) collection is recreated
        # on the same client.
        self._initialized = False
        self._lazy_init()

    def get_stats(self) -> dict[str, int]:
        """Get store statistics.

        Returns:
            Dictionary with the point ``count`` (0 when Qdrant does not
            report one) and the configured ``dimension``
        """
        client = self._require_client()

        info = client.get_collection(self.collection_name)

        return {
            "count": info.points_count or 0,
            "dimension": self.dimension,
        }
553
+
554
+
555
def create_vector_store(
    backend: str = "memory",
    dimension: int = 1024,
    **kwargs: str | int | Path | None,
) -> VectorStore:
    """Factory function to create vector store.

    Args:
        backend: Backend type ("memory" or "qdrant")
        dimension: Embedding dimension
        **kwargs: Additional arguments for specific backends. The "qdrant"
            backend reads ``collection_name``, ``db_path``, ``host`` and
            ``port``; a missing or None ``collection_name`` / ``port`` falls
            back to "code_embeddings" / 6333. The "memory" backend ignores
            all of them.

    Returns:
        VectorStore instance

    Raises:
        ValueError: If backend is unknown
    """
    if backend == "memory":
        return MemoryVectorStore(dimension=dimension)

    if backend == "qdrant":
        # Treat an explicit None like an omitted argument: int(None) would
        # raise TypeError and str(None) would name the collection "None".
        collection_name = kwargs.get("collection_name")
        port = kwargs.get("port")
        return QdrantVectorStore(
            dimension=dimension,
            collection_name=(
                "code_embeddings" if collection_name is None else str(collection_name)
            ),
            db_path=kwargs.get("db_path"),
            host=kwargs.get("host"),
            port=6333 if port is None else int(port),
        )

    raise ValueError(f"Unknown vector store backend: {backend}")
File without changes