code-graph-rag 0.0.88__tar.gz → 0.0.100__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. {code_graph_rag-0.0.88/code_graph_rag.egg-info → code_graph_rag-0.0.100}/PKG-INFO +1 -1
  2. {code_graph_rag-0.0.88 → code_graph_rag-0.0.100/code_graph_rag.egg-info}/PKG-INFO +1 -1
  3. {code_graph_rag-0.0.88 → code_graph_rag-0.0.100}/codebase_rag/cli.py +12 -7
  4. {code_graph_rag-0.0.88 → code_graph_rag-0.0.100}/codebase_rag/config.py +6 -0
  5. {code_graph_rag-0.0.88 → code_graph_rag-0.0.100}/codebase_rag/constants.py +32 -3
  6. {code_graph_rag-0.0.88 → code_graph_rag-0.0.100}/codebase_rag/cypher_queries.py +21 -0
  7. code_graph_rag-0.0.100/codebase_rag/embedder.py +183 -0
  8. {code_graph_rag-0.0.88 → code_graph_rag-0.0.100}/codebase_rag/exceptions.py +1 -0
  9. {code_graph_rag-0.0.88 → code_graph_rag-0.0.100}/codebase_rag/graph_loader.py +12 -0
  10. {code_graph_rag-0.0.88 → code_graph_rag-0.0.100}/codebase_rag/graph_updater.py +217 -46
  11. {code_graph_rag-0.0.88 → code_graph_rag-0.0.100}/codebase_rag/logs.py +44 -1
  12. {code_graph_rag-0.0.88 → code_graph_rag-0.0.100}/codebase_rag/parser_loader.py +12 -13
  13. {code_graph_rag-0.0.88 → code_graph_rag-0.0.100}/codebase_rag/unixcoder.py +11 -0
  14. code_graph_rag-0.0.100/codebase_rag/vector_store.py +169 -0
  15. {code_graph_rag-0.0.88 → code_graph_rag-0.0.100}/pyproject.toml +1 -1
  16. code_graph_rag-0.0.88/codebase_rag/embedder.py +0 -48
  17. code_graph_rag-0.0.88/codebase_rag/vector_store.py +0 -80
  18. {code_graph_rag-0.0.88 → code_graph_rag-0.0.100}/LICENSE +0 -0
  19. {code_graph_rag-0.0.88 → code_graph_rag-0.0.100}/PYPI_README.md +0 -0
  20. {code_graph_rag-0.0.88 → code_graph_rag-0.0.100}/README.md +0 -0
  21. {code_graph_rag-0.0.88 → code_graph_rag-0.0.100}/cgr/__init__.py +0 -0
  22. {code_graph_rag-0.0.88 → code_graph_rag-0.0.100}/code_graph_rag.egg-info/SOURCES.txt +0 -0
  23. {code_graph_rag-0.0.88 → code_graph_rag-0.0.100}/code_graph_rag.egg-info/dependency_links.txt +0 -0
  24. {code_graph_rag-0.0.88 → code_graph_rag-0.0.100}/code_graph_rag.egg-info/entry_points.txt +0 -0
  25. {code_graph_rag-0.0.88 → code_graph_rag-0.0.100}/code_graph_rag.egg-info/requires.txt +0 -0
  26. {code_graph_rag-0.0.88 → code_graph_rag-0.0.100}/code_graph_rag.egg-info/top_level.txt +0 -0
  27. {code_graph_rag-0.0.88 → code_graph_rag-0.0.100}/codebase_rag/__init__.py +0 -0
  28. {code_graph_rag-0.0.88 → code_graph_rag-0.0.100}/codebase_rag/cli_help.py +0 -0
  29. {code_graph_rag-0.0.88 → code_graph_rag-0.0.100}/codebase_rag/decorators.py +0 -0
  30. {code_graph_rag-0.0.88 → code_graph_rag-0.0.100}/codebase_rag/language_spec.py +0 -0
  31. {code_graph_rag-0.0.88 → code_graph_rag-0.0.100}/codebase_rag/main.py +0 -0
  32. {code_graph_rag-0.0.88 → code_graph_rag-0.0.100}/codebase_rag/models.py +0 -0
  33. {code_graph_rag-0.0.88 → code_graph_rag-0.0.100}/codebase_rag/prompts.py +0 -0
  34. {code_graph_rag-0.0.88 → code_graph_rag-0.0.100}/codebase_rag/readme_sections.py +0 -0
  35. {code_graph_rag-0.0.88 → code_graph_rag-0.0.100}/codebase_rag/schema_builder.py +0 -0
  36. {code_graph_rag-0.0.88 → code_graph_rag-0.0.100}/codebase_rag/schemas.py +0 -0
  37. {code_graph_rag-0.0.88 → code_graph_rag-0.0.100}/codebase_rag/tool_errors.py +0 -0
  38. {code_graph_rag-0.0.88 → code_graph_rag-0.0.100}/codebase_rag/types_defs.py +0 -0
  39. {code_graph_rag-0.0.88 → code_graph_rag-0.0.100}/codec/__init__.py +0 -0
  40. {code_graph_rag-0.0.88 → code_graph_rag-0.0.100}/codec/schema_pb2.py +0 -0
  41. {code_graph_rag-0.0.88 → code_graph_rag-0.0.100}/codec/schema_pb2.pyi +0 -0
  42. {code_graph_rag-0.0.88 → code_graph_rag-0.0.100}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: code-graph-rag
3
- Version: 0.0.88
3
+ Version: 0.0.100
4
4
  Summary: The ultimate RAG for your monorepo. Query, understand, and edit multi-language codebases with the power of AI and knowledge graphs
5
5
  License-Expression: MIT
6
6
  Keywords: rag,retrieval-augmented-generation,knowledge-graph,code-analysis,tree-sitter,mcp,mcp-server,llm,graph-database,semantic-search,codebase,memgraph,developer-tools,monorepo
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: code-graph-rag
3
- Version: 0.0.88
3
+ Version: 0.0.100
4
4
  Summary: The ultimate RAG for your monorepo. Query, understand, and edit multi-language codebases with the power of AI and knowledge graphs
5
5
  License-Expression: MIT
6
6
  Keywords: rag,retrieval-augmented-generation,knowledge-graph,code-analysis,tree-sitter,mcp,mcp-server,llm,graph-database,semantic-search,codebase,memgraph,developer-tools,monorepo
@@ -169,12 +169,12 @@ def start(
169
169
  parsers, queries = load_parsers()
170
170
 
171
171
  updater = GraphUpdater(
172
- ingestor,
173
- repo_to_update,
174
- parsers,
175
- queries,
176
- unignore_paths,
177
- exclude_paths,
172
+ ingestor=ingestor,
173
+ repo_path=repo_to_update,
174
+ parsers=parsers,
175
+ queries=queries,
176
+ unignore_paths=unignore_paths,
177
+ exclude_paths=exclude_paths,
178
178
  )
179
179
  updater.run()
180
180
 
@@ -245,7 +245,12 @@ def index(
245
245
  )
246
246
  parsers, queries = load_parsers()
247
247
  updater = GraphUpdater(
248
- ingestor, repo_to_index, parsers, queries, unignore_paths, exclude_paths
248
+ ingestor=ingestor,
249
+ repo_path=repo_to_index,
250
+ parsers=parsers,
251
+ queries=queries,
252
+ unignore_paths=unignore_paths,
253
+ exclude_paths=exclude_paths,
249
254
  )
250
255
 
251
256
  updater.run()
@@ -246,9 +246,15 @@ class AppConfig(BaseSettings):
246
246
  QDRANT_COLLECTION_NAME: str = "code_embeddings"
247
247
  QDRANT_VECTOR_DIM: int = 768
248
248
  QDRANT_TOP_K: int = 5
249
+ QDRANT_UPSERT_RETRIES: int = Field(default=3, gt=0)
250
+ QDRANT_RETRY_BASE_DELAY: float = Field(default=0.5, gt=0)
251
+ QDRANT_BATCH_SIZE: int = Field(default=50, gt=0)
249
252
  EMBEDDING_MAX_LENGTH: int = 512
250
253
  EMBEDDING_PROGRESS_INTERVAL: int = 10
251
254
 
255
+ FLUSH_THREAD_POOL_SIZE: int = Field(default=4, gt=0)
256
+ FILE_FLUSH_INTERVAL: int = Field(default=500, gt=0)
257
+
252
258
  CACHE_MAX_ENTRIES: int = 1000
253
259
  CACHE_MAX_MEMORY_MB: int = 500
254
260
  CACHE_EVICTION_DIVISOR: int = 10
@@ -150,6 +150,8 @@ V1_PATH = "/v1"
150
150
  HTTP_OK = 200
151
151
 
152
152
  UNIXCODER_MODEL = "microsoft/unixcoder-base"
153
+ EMBEDDING_DEFAULT_BATCH_SIZE = 32
154
+ EMBEDDING_CACHE_FILENAME = ".embedding_cache.json"
153
155
 
154
156
  KEY_NODES = "nodes"
155
157
  KEY_RELATIONSHIPS = "relationships"
@@ -417,14 +419,21 @@ CSPROJ_SUFFIX = ".csproj"
417
419
  # (H) Cypher queries
418
420
  CYPHER_DEFAULT_LIMIT = 50
419
421
 
420
- CYPHER_QUERY_EMBEDDINGS = """
422
+ _CYPHER_EMBEDDING_BASE = """
421
423
  MATCH (m:Module)-[:DEFINES]->(n)
422
424
  WHERE (n:Function OR n:Method)
423
425
  AND m.qualified_name STARTS WITH ($project_name + '.')
424
- RETURN id(n) AS node_id, n.qualified_name AS qualified_name,
426
+ """
427
+
428
+ CYPHER_QUERY_EMBEDDINGS = (
429
+ _CYPHER_EMBEDDING_BASE
430
+ + """RETURN id(n) AS node_id, n.qualified_name AS qualified_name,
425
431
  n.start_line AS start_line, n.end_line AS end_line,
426
432
  m.path AS path
427
433
  """
434
+ )
435
+
436
+ CYPHER_QUERY_PROJECT_NODE_IDS = _CYPHER_EMBEDDING_BASE + "RETURN id(n) AS node_id\n"
428
437
 
429
438
 
430
439
  class SupportedLanguage(StrEnum):
@@ -883,7 +892,7 @@ PYINSTALLER_ARG_HIDDEN_IMPORT = "--hidden-import"
883
892
  PYINSTALLER_ARG_EXCLUDE_MODULE = "--exclude-module"
884
893
  PYINSTALLER_ENTRY_POINT = "main.py"
885
894
 
886
- PYINSTALLER_EXCLUDED_MODULES = ["logfire", "logfire_api"]
895
+ PYINSTALLER_EXCLUDED_MODULES = ["logfire"]
887
896
 
888
897
  # (H) TOML parsing constants
889
898
  TOML_KEY_PROJECT = "project"
@@ -908,6 +917,7 @@ PYINSTALLER_PACKAGES: list["PyInstallerPackage"] = [
908
917
  PyInstallerPackage(name="loguru", collect_all=True),
909
918
  PyInstallerPackage(name="toml", collect_all=True),
910
919
  PyInstallerPackage(name="protobuf", collect_all=True),
920
+ PyInstallerPackage(name="genai_prices", collect_all=True),
911
921
  ]
912
922
 
913
923
  ALLOWED_COMMENT_MARKERS = frozenset(
@@ -964,6 +974,22 @@ CYPHER_PREFIX = "cypher"
964
974
  CYPHER_SEMICOLON = ";"
965
975
  CYPHER_BACKTICK = "`"
966
976
  CYPHER_MATCH_KEYWORD = "MATCH"
977
+ CYPHER_DANGEROUS_KEYWORDS: frozenset[str] = frozenset(
978
+ {
979
+ "DELETE",
980
+ "DETACH",
981
+ "DROP",
982
+ "CREATE INDEX",
983
+ "CREATE CONSTRAINT",
984
+ "REMOVE",
985
+ "SET",
986
+ "MERGE",
987
+ "CREATE",
988
+ "CALL",
989
+ "LOAD CSV",
990
+ "FOREACH",
991
+ }
992
+ )
967
993
 
968
994
  # (H) Tool success messages
969
995
  MSG_SURGICAL_SUCCESS = "Successfully applied surgical code replacement in: {path}"
@@ -1572,6 +1598,9 @@ GOMOD_COMMENT_PREFIX = "//"
1572
1598
  # (H) Gemfile parsing patterns
1573
1599
  GEMFILE_GEM_PREFIX = "gem "
1574
1600
 
1601
+ # (H) Incremental update hash cache
1602
+ HASH_CACHE_FILENAME = ".cgr-hash-cache.json"
1603
+
1575
1604
  # (H) Import processor cache config
1576
1605
  IMPORT_CACHE_TTL = 3600
1577
1606
  IMPORT_CACHE_DIR = ".cache/codebase_rag"
@@ -126,3 +126,24 @@ def build_merge_relationship_query(
126
126
  )
127
127
  query += CYPHER_SET_PROPS_RETURN_COUNT if has_props else CYPHER_RETURN_COUNT
128
128
  return query
129
+
130
+
131
+ def build_create_node_query(label: str, id_key: str) -> str:
132
+ return f"CREATE (n:{label} {{{id_key}: row.id}})\nSET n += row.props"
133
+
134
+
135
+ def build_create_relationship_query(
136
+ from_label: str,
137
+ from_key: str,
138
+ rel_type: str,
139
+ to_label: str,
140
+ to_key: str,
141
+ has_props: bool = False,
142
+ ) -> str:
143
+ query = (
144
+ f"MATCH (a:{from_label} {{{from_key}: row.from_val}}), "
145
+ f"(b:{to_label} {{{to_key}: row.to_val}})\n"
146
+ f"CREATE (a)-[r:{rel_type}]->(b)\n"
147
+ )
148
+ query += CYPHER_SET_PROPS_RETURN_COUNT if has_props else CYPHER_RETURN_COUNT
149
+ return query
@@ -0,0 +1,183 @@
1
+ from __future__ import annotations
2
+
3
+ import hashlib
4
+ import json
5
+ from functools import lru_cache
6
+ from pathlib import Path
7
+
8
+ from loguru import logger
9
+
10
+ from . import constants as cs
11
+ from . import exceptions as ex
12
+ from . import logs as ls
13
+ from .config import settings
14
+ from .utils.dependencies import has_torch, has_transformers
15
+
16
+
17
+ class EmbeddingCache:
18
+ __slots__ = ("_cache", "_path")
19
+
20
+ def __init__(self, path: Path | None = None) -> None:
21
+ self._cache: dict[str, list[float]] = {}
22
+ self._path = path
23
+
24
+ @staticmethod
25
+ def _content_hash(content: str) -> str:
26
+ return hashlib.sha256(content.encode()).hexdigest()
27
+
28
+ def get(self, content: str) -> list[float] | None:
29
+ return self._cache.get(self._content_hash(content))
30
+
31
+ def put(self, content: str, embedding: list[float]) -> None:
32
+ self._cache[self._content_hash(content)] = embedding
33
+
34
+ def get_many(self, snippets: list[str]) -> dict[int, list[float]]:
35
+ results: dict[int, list[float]] = {}
36
+ for i, snippet in enumerate(snippets):
37
+ if (cached := self.get(snippet)) is not None:
38
+ results[i] = cached
39
+ return results
40
+
41
+ def put_many(self, snippets: list[str], embeddings: list[list[float]]) -> None:
42
+ for snippet, embedding in zip(snippets, embeddings):
43
+ self.put(snippet, embedding)
44
+
45
+ def save(self) -> None:
46
+ if self._path is None:
47
+ return
48
+ try:
49
+ self._path.parent.mkdir(parents=True, exist_ok=True)
50
+ with self._path.open("w", encoding="utf-8") as f:
51
+ json.dump(self._cache, f)
52
+ except Exception as e:
53
+ logger.warning(ls.EMBEDDING_CACHE_SAVE_FAILED, path=self._path, error=e)
54
+
55
+ def load(self) -> None:
56
+ if self._path is None or not self._path.exists():
57
+ return
58
+ try:
59
+ with self._path.open("r", encoding="utf-8") as f:
60
+ self._cache = json.load(f)
61
+ logger.debug(
62
+ ls.EMBEDDING_CACHE_LOADED, count=len(self._cache), path=self._path
63
+ )
64
+ except Exception as e:
65
+ logger.warning(ls.EMBEDDING_CACHE_LOAD_FAILED, path=self._path, error=e)
66
+ self._cache = {}
67
+
68
+ def clear(self) -> None:
69
+ self._cache.clear()
70
+
71
+ def __len__(self) -> int:
72
+ return len(self._cache)
73
+
74
+
75
+ _embedding_cache: EmbeddingCache | None = None
76
+
77
+
78
+ def get_embedding_cache() -> EmbeddingCache:
79
+ global _embedding_cache
80
+ if _embedding_cache is None:
81
+ cache_path = Path(settings.QDRANT_DB_PATH) / cs.EMBEDDING_CACHE_FILENAME
82
+ _embedding_cache = EmbeddingCache(path=cache_path)
83
+ _embedding_cache.load()
84
+ return _embedding_cache
85
+
86
+
87
+ def clear_embedding_cache() -> None:
88
+ global _embedding_cache
89
+ if _embedding_cache is not None:
90
+ _embedding_cache.clear()
91
+ _embedding_cache = None
92
+
93
+
94
+ if has_torch() and has_transformers():
95
+ import numpy as np
96
+ import torch
97
+ from numpy.typing import NDArray
98
+
99
+ from .unixcoder import UniXcoder
100
+
101
+ @lru_cache(maxsize=1)
102
+ def get_model() -> UniXcoder:
103
+ model = UniXcoder(cs.UNIXCODER_MODEL)
104
+ model.eval()
105
+ if torch.cuda.is_available():
106
+ model = model.cuda()
107
+ return model
108
+
109
+ def embed_code(code: str, max_length: int | None = None) -> list[float]:
110
+ cache = get_embedding_cache()
111
+ if (cached := cache.get(code)) is not None:
112
+ return cached
113
+
114
+ if max_length is None:
115
+ max_length = settings.EMBEDDING_MAX_LENGTH
116
+ model = get_model()
117
+ device = next(model.parameters()).device
118
+ tokens = model.tokenize([code], max_length=max_length)
119
+ tokens_tensor = torch.tensor(tokens).to(device)
120
+ with torch.no_grad():
121
+ _, sentence_embeddings = model(tokens_tensor)
122
+ embedding: NDArray[np.float32] = sentence_embeddings.cpu().numpy()
123
+ result: list[float] = embedding[0].tolist()
124
+
125
+ cache.put(code, result)
126
+ return result
127
+
128
+ def embed_code_batch(
129
+ snippets: list[str],
130
+ max_length: int | None = None,
131
+ batch_size: int = cs.EMBEDDING_DEFAULT_BATCH_SIZE,
132
+ ) -> list[list[float]]:
133
+ if not snippets:
134
+ return []
135
+
136
+ if max_length is None:
137
+ max_length = settings.EMBEDDING_MAX_LENGTH
138
+
139
+ cache = get_embedding_cache()
140
+ cached_results = cache.get_many(snippets)
141
+
142
+ if len(cached_results) == len(snippets):
143
+ logger.debug(ls.EMBEDDING_CACHE_HIT, count=len(snippets))
144
+ return [cached_results[i] for i in range(len(snippets))]
145
+
146
+ uncached_indices = [i for i in range(len(snippets)) if i not in cached_results]
147
+ uncached_snippets = [snippets[i] for i in uncached_indices]
148
+
149
+ model = get_model()
150
+ device = next(model.parameters()).device
151
+
152
+ all_new_embeddings: list[list[float]] = []
153
+ for start in range(0, len(uncached_snippets), batch_size):
154
+ batch = uncached_snippets[start : start + batch_size]
155
+ tokens_list = model.tokenize(batch, max_length=max_length, padding=True)
156
+ tokens_tensor = torch.tensor(tokens_list).to(device)
157
+ with torch.no_grad():
158
+ _, sentence_embeddings = model(tokens_tensor)
159
+ batch_np: NDArray[np.float32] = sentence_embeddings.cpu().numpy()
160
+ for row in batch_np:
161
+ all_new_embeddings.append(row.tolist())
162
+
163
+ cache.put_many(uncached_snippets, all_new_embeddings)
164
+
165
+ results: list[list[float]] = [[] for _ in snippets]
166
+ for i, emb in cached_results.items():
167
+ results[i] = emb
168
+ for idx, orig_i in enumerate(uncached_indices):
169
+ results[orig_i] = all_new_embeddings[idx]
170
+
171
+ return results
172
+
173
+ else:
174
+
175
+ def embed_code(code: str, max_length: int | None = None) -> list[float]:
176
+ raise RuntimeError(ex.SEMANTIC_EXTRA)
177
+
178
+ def embed_code_batch(
179
+ snippets: list[str],
180
+ max_length: int | None = None,
181
+ batch_size: int = cs.EMBEDDING_DEFAULT_BATCH_SIZE,
182
+ ) -> list[list[float]]:
183
+ raise RuntimeError(ex.SEMANTIC_EXTRA)
@@ -42,6 +42,7 @@ NO_LANGUAGES = "No Tree-sitter languages available."
42
42
  # (H) LLM errors
43
43
  LLM_INIT_CYPHER = "Failed to initialize CypherGenerator: {error}"
44
44
  LLM_INVALID_QUERY = "LLM did not generate a valid query. Output: {output}"
45
+ LLM_DANGEROUS_QUERY = "LLM generated a destructive Cypher query (found '{keyword}'). Query rejected: {query}"
45
46
  LLM_GENERATION_FAILED = "Cypher generation failed: {error}"
46
47
  LLM_INIT_ORCHESTRATOR = "Failed to initialize RAG Orchestrator: {error}"
47
48
 
@@ -13,6 +13,18 @@ from .types_defs import GraphData, GraphMetadata, GraphSummary, PropertyValue
13
13
 
14
14
 
15
15
  class GraphLoader:
16
+ __slots__ = (
17
+ "file_path",
18
+ "_data",
19
+ "_nodes",
20
+ "_relationships",
21
+ "_nodes_by_id",
22
+ "_nodes_by_label",
23
+ "_outgoing_rels",
24
+ "_incoming_rels",
25
+ "_property_indexes",
26
+ )
27
+
16
28
  def __init__(self, file_path: str):
17
29
  self.file_path = Path(file_path)
18
30
  self._data: GraphData | None = None