PyPI - code-graph-rag - Versions diffs - 0.0.88__tar.gz → 0.0.100__tar.gz - Mend

code-graph-rag 0.0.88__tar.gz → 0.0.100__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (42)

{code_graph_rag-0.0.88/code_graph_rag.egg-info → code_graph_rag-0.0.100}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: code-graph-rag
-Version: 0.0.88
+Version: 0.0.100
 Summary: The ultimate RAG for your monorepo. Query, understand, and edit multi-language codebases with the power of AI and knowledge graphs
 License-Expression: MIT
 Keywords: rag,retrieval-augmented-generation,knowledge-graph,code-analysis,tree-sitter,mcp,mcp-server,llm,graph-database,semantic-search,codebase,memgraph,developer-tools,monorepo

{code_graph_rag-0.0.88 → code_graph_rag-0.0.100/code_graph_rag.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: code-graph-rag
-Version: 0.0.88
+Version: 0.0.100
 Summary: The ultimate RAG for your monorepo. Query, understand, and edit multi-language codebases with the power of AI and knowledge graphs
 License-Expression: MIT
 Keywords: rag,retrieval-augmented-generation,knowledge-graph,code-analysis,tree-sitter,mcp,mcp-server,llm,graph-database,semantic-search,codebase,memgraph,developer-tools,monorepo

{code_graph_rag-0.0.88 → code_graph_rag-0.0.100}/codebase_rag/cli.py RENAMED Viewed

@@ -169,12 +169,12 @@ def start(
             parsers, queries = load_parsers()
             updater = GraphUpdater(
-                ingestor,
-                repo_to_update,
-                parsers,
-                queries,
-                unignore_paths,
-                exclude_paths,
+                ingestor=ingestor,
+                repo_path=repo_to_update,
+                parsers=parsers,
+                queries=queries,
+                unignore_paths=unignore_paths,
+                exclude_paths=exclude_paths,
             )
             updater.run()
@@ -245,7 +245,12 @@ def index(
         )
         parsers, queries = load_parsers()
         updater = GraphUpdater(
-            ingestor, repo_to_index, parsers, queries, unignore_paths, exclude_paths
+            ingestor=ingestor,
+            repo_path=repo_to_index,
+            parsers=parsers,
+            queries=queries,
+            unignore_paths=unignore_paths,
+            exclude_paths=exclude_paths,
         )
         updater.run()

{code_graph_rag-0.0.88 → code_graph_rag-0.0.100}/codebase_rag/config.py RENAMED Viewed

@@ -246,9 +246,15 @@ class AppConfig(BaseSettings):
     QDRANT_COLLECTION_NAME: str = "code_embeddings"
     QDRANT_VECTOR_DIM: int = 768
     QDRANT_TOP_K: int = 5
+    QDRANT_UPSERT_RETRIES: int = Field(default=3, gt=0)
+    QDRANT_RETRY_BASE_DELAY: float = Field(default=0.5, gt=0)
+    QDRANT_BATCH_SIZE: int = Field(default=50, gt=0)
     EMBEDDING_MAX_LENGTH: int = 512
     EMBEDDING_PROGRESS_INTERVAL: int = 10
+    FLUSH_THREAD_POOL_SIZE: int = Field(default=4, gt=0)
+    FILE_FLUSH_INTERVAL: int = Field(default=500, gt=0)
     CACHE_MAX_ENTRIES: int = 1000
     CACHE_MAX_MEMORY_MB: int = 500
     CACHE_EVICTION_DIVISOR: int = 10

{code_graph_rag-0.0.88 → code_graph_rag-0.0.100}/codebase_rag/constants.py RENAMED Viewed

@@ -150,6 +150,8 @@ V1_PATH = "/v1"
 HTTP_OK = 200
 UNIXCODER_MODEL = "microsoft/unixcoder-base"
+EMBEDDING_DEFAULT_BATCH_SIZE = 32
+EMBEDDING_CACHE_FILENAME = ".embedding_cache.json"
 KEY_NODES = "nodes"
 KEY_RELATIONSHIPS = "relationships"
@@ -417,14 +419,21 @@ CSPROJ_SUFFIX = ".csproj"
 # (H) Cypher queries
 CYPHER_DEFAULT_LIMIT = 50
-CYPHER_QUERY_EMBEDDINGS = """
+_CYPHER_EMBEDDING_BASE = """
 MATCH (m:Module)-[:DEFINES]->(n)
 WHERE (n:Function OR n:Method)
   AND m.qualified_name STARTS WITH ($project_name + '.')
-RETURN id(n) AS node_id, n.qualified_name AS qualified_name,
+"""
+CYPHER_QUERY_EMBEDDINGS = (
+    _CYPHER_EMBEDDING_BASE
+    + """RETURN id(n) AS node_id, n.qualified_name AS qualified_name,
        n.start_line AS start_line, n.end_line AS end_line,
        m.path AS path
 """
+)
+CYPHER_QUERY_PROJECT_NODE_IDS = _CYPHER_EMBEDDING_BASE + "RETURN id(n) AS node_id\n"
 class SupportedLanguage(StrEnum):
@@ -883,7 +892,7 @@ PYINSTALLER_ARG_HIDDEN_IMPORT = "--hidden-import"
 PYINSTALLER_ARG_EXCLUDE_MODULE = "--exclude-module"
 PYINSTALLER_ENTRY_POINT = "main.py"
-PYINSTALLER_EXCLUDED_MODULES = ["logfire", "logfire_api"]
+PYINSTALLER_EXCLUDED_MODULES = ["logfire"]
 # (H) TOML parsing constants
 TOML_KEY_PROJECT = "project"
@@ -908,6 +917,7 @@ PYINSTALLER_PACKAGES: list["PyInstallerPackage"] = [
     PyInstallerPackage(name="loguru", collect_all=True),
     PyInstallerPackage(name="toml", collect_all=True),
     PyInstallerPackage(name="protobuf", collect_all=True),
+    PyInstallerPackage(name="genai_prices", collect_all=True),
 ]
 ALLOWED_COMMENT_MARKERS = frozenset(
@@ -964,6 +974,22 @@ CYPHER_PREFIX = "cypher"
 CYPHER_SEMICOLON = ";"
 CYPHER_BACKTICK = "`"
 CYPHER_MATCH_KEYWORD = "MATCH"
+CYPHER_DANGEROUS_KEYWORDS: frozenset[str] = frozenset(
+    {
+        "DELETE",
+        "DETACH",
+        "DROP",
+        "CREATE INDEX",
+        "CREATE CONSTRAINT",
+        "REMOVE",
+        "SET",
+        "MERGE",
+        "CREATE",
+        "CALL",
+        "LOAD CSV",
+        "FOREACH",
+    }
+)
 # (H) Tool success messages
 MSG_SURGICAL_SUCCESS = "Successfully applied surgical code replacement in: {path}"
@@ -1572,6 +1598,9 @@ GOMOD_COMMENT_PREFIX = "//"
 # (H) Gemfile parsing patterns
 GEMFILE_GEM_PREFIX = "gem "
+# (H) Incremental update hash cache
+HASH_CACHE_FILENAME = ".cgr-hash-cache.json"
 # (H) Import processor cache config
 IMPORT_CACHE_TTL = 3600
 IMPORT_CACHE_DIR = ".cache/codebase_rag"

{code_graph_rag-0.0.88 → code_graph_rag-0.0.100}/codebase_rag/cypher_queries.py RENAMED Viewed

@@ -126,3 +126,24 @@ def build_merge_relationship_query(
     )
     query += CYPHER_SET_PROPS_RETURN_COUNT if has_props else CYPHER_RETURN_COUNT
     return query
+def build_create_node_query(label: str, id_key: str) -> str:
+    return f"CREATE (n:{label} {{{id_key}: row.id}})\nSET n += row.props"
+def build_create_relationship_query(
+    from_label: str,
+    from_key: str,
+    rel_type: str,
+    to_label: str,
+    to_key: str,
+    has_props: bool = False,
+) -> str:
+    query = (
+        f"MATCH (a:{from_label} {{{from_key}: row.from_val}}), "
+        f"(b:{to_label} {{{to_key}: row.to_val}})\n"
+        f"CREATE (a)-[r:{rel_type}]->(b)\n"
+    )
+    query += CYPHER_SET_PROPS_RETURN_COUNT if has_props else CYPHER_RETURN_COUNT
+    return query

code_graph_rag-0.0.100/codebase_rag/embedder.py ADDED Viewed

@@ -0,0 +1,183 @@
+from __future__ import annotations
+import hashlib
+import json
+from functools import lru_cache
+from pathlib import Path
+from loguru import logger
+from . import constants as cs
+from . import exceptions as ex
+from . import logs as ls
+from .config import settings
+from .utils.dependencies import has_torch, has_transformers
+class EmbeddingCache:
+    __slots__ = ("_cache", "_path")
+    def __init__(self, path: Path | None = None) -> None:
+        self._cache: dict[str, list[float]] = {}
+        self._path = path
+    @staticmethod
+    def _content_hash(content: str) -> str:
+        return hashlib.sha256(content.encode()).hexdigest()
+    def get(self, content: str) -> list[float] | None:
+        return self._cache.get(self._content_hash(content))
+    def put(self, content: str, embedding: list[float]) -> None:
+        self._cache[self._content_hash(content)] = embedding
+    def get_many(self, snippets: list[str]) -> dict[int, list[float]]:
+        results: dict[int, list[float]] = {}
+        for i, snippet in enumerate(snippets):
+            if (cached := self.get(snippet)) is not None:
+                results[i] = cached
+        return results
+    def put_many(self, snippets: list[str], embeddings: list[list[float]]) -> None:
+        for snippet, embedding in zip(snippets, embeddings):
+            self.put(snippet, embedding)
+    def save(self) -> None:
+        if self._path is None:
+            return
+        try:
+            self._path.parent.mkdir(parents=True, exist_ok=True)
+            with self._path.open("w", encoding="utf-8") as f:
+                json.dump(self._cache, f)
+        except Exception as e:
+            logger.warning(ls.EMBEDDING_CACHE_SAVE_FAILED, path=self._path, error=e)
+    def load(self) -> None:
+        if self._path is None or not self._path.exists():
+            return
+        try:
+            with self._path.open("r", encoding="utf-8") as f:
+                self._cache = json.load(f)
+            logger.debug(
+                ls.EMBEDDING_CACHE_LOADED, count=len(self._cache), path=self._path
+            )
+        except Exception as e:
+            logger.warning(ls.EMBEDDING_CACHE_LOAD_FAILED, path=self._path, error=e)
+            self._cache = {}
+    def clear(self) -> None:
+        self._cache.clear()
+    def __len__(self) -> int:
+        return len(self._cache)
+_embedding_cache: EmbeddingCache | None = None
+def get_embedding_cache() -> EmbeddingCache:
+    global _embedding_cache
+    if _embedding_cache is None:
+        cache_path = Path(settings.QDRANT_DB_PATH) / cs.EMBEDDING_CACHE_FILENAME
+        _embedding_cache = EmbeddingCache(path=cache_path)
+        _embedding_cache.load()
+    return _embedding_cache
+def clear_embedding_cache() -> None:
+    global _embedding_cache
+    if _embedding_cache is not None:
+        _embedding_cache.clear()
+        _embedding_cache = None
+if has_torch() and has_transformers():
+    import numpy as np
+    import torch
+    from numpy.typing import NDArray
+    from .unixcoder import UniXcoder
+    @lru_cache(maxsize=1)
+    def get_model() -> UniXcoder:
+        model = UniXcoder(cs.UNIXCODER_MODEL)
+        model.eval()
+        if torch.cuda.is_available():
+            model = model.cuda()
+        return model
+    def embed_code(code: str, max_length: int | None = None) -> list[float]:
+        cache = get_embedding_cache()
+        if (cached := cache.get(code)) is not None:
+            return cached
+        if max_length is None:
+            max_length = settings.EMBEDDING_MAX_LENGTH
+        model = get_model()
+        device = next(model.parameters()).device
+        tokens = model.tokenize([code], max_length=max_length)
+        tokens_tensor = torch.tensor(tokens).to(device)
+        with torch.no_grad():
+            _, sentence_embeddings = model(tokens_tensor)
+            embedding: NDArray[np.float32] = sentence_embeddings.cpu().numpy()
+        result: list[float] = embedding[0].tolist()
+        cache.put(code, result)
+        return result
+    def embed_code_batch(
+        snippets: list[str],
+        max_length: int | None = None,
+        batch_size: int = cs.EMBEDDING_DEFAULT_BATCH_SIZE,
+    ) -> list[list[float]]:
+        if not snippets:
+            return []
+        if max_length is None:
+            max_length = settings.EMBEDDING_MAX_LENGTH
+        cache = get_embedding_cache()
+        cached_results = cache.get_many(snippets)
+        if len(cached_results) == len(snippets):
+            logger.debug(ls.EMBEDDING_CACHE_HIT, count=len(snippets))
+            return [cached_results[i] for i in range(len(snippets))]
+        uncached_indices = [i for i in range(len(snippets)) if i not in cached_results]
+        uncached_snippets = [snippets[i] for i in uncached_indices]
+        model = get_model()
+        device = next(model.parameters()).device
+        all_new_embeddings: list[list[float]] = []
+        for start in range(0, len(uncached_snippets), batch_size):
+            batch = uncached_snippets[start : start + batch_size]
+            tokens_list = model.tokenize(batch, max_length=max_length, padding=True)
+            tokens_tensor = torch.tensor(tokens_list).to(device)
+            with torch.no_grad():
+                _, sentence_embeddings = model(tokens_tensor)
+                batch_np: NDArray[np.float32] = sentence_embeddings.cpu().numpy()
+            for row in batch_np:
+                all_new_embeddings.append(row.tolist())
+        cache.put_many(uncached_snippets, all_new_embeddings)
+        results: list[list[float]] = [[] for _ in snippets]
+        for i, emb in cached_results.items():
+            results[i] = emb
+        for idx, orig_i in enumerate(uncached_indices):
+            results[orig_i] = all_new_embeddings[idx]
+        return results
+else:
+    def embed_code(code: str, max_length: int | None = None) -> list[float]:
+        raise RuntimeError(ex.SEMANTIC_EXTRA)
+    def embed_code_batch(
+        snippets: list[str],
+        max_length: int | None = None,
+        batch_size: int = cs.EMBEDDING_DEFAULT_BATCH_SIZE,
+    ) -> list[list[float]]:
+        raise RuntimeError(ex.SEMANTIC_EXTRA)

{code_graph_rag-0.0.88 → code_graph_rag-0.0.100}/codebase_rag/exceptions.py RENAMED Viewed

@@ -42,6 +42,7 @@ NO_LANGUAGES = "No Tree-sitter languages available."
 # (H) LLM errors
 LLM_INIT_CYPHER = "Failed to initialize CypherGenerator: {error}"
 LLM_INVALID_QUERY = "LLM did not generate a valid query. Output: {output}"
+LLM_DANGEROUS_QUERY = "LLM generated a destructive Cypher query (found '{keyword}'). Query rejected: {query}"
 LLM_GENERATION_FAILED = "Cypher generation failed: {error}"
 LLM_INIT_ORCHESTRATOR = "Failed to initialize RAG Orchestrator: {error}"

{code_graph_rag-0.0.88 → code_graph_rag-0.0.100}/codebase_rag/graph_loader.py RENAMED Viewed

@@ -13,6 +13,18 @@ from .types_defs import GraphData, GraphMetadata, GraphSummary, PropertyValue
 class GraphLoader:
+    __slots__ = (
+        "file_path",
+        "_data",
+        "_nodes",
+        "_relationships",
+        "_nodes_by_id",
+        "_nodes_by_label",
+        "_outgoing_rels",
+        "_incoming_rels",
+        "_property_indexes",
+    )
     def __init__(self, file_path: str):
         self.file_path = Path(file_path)
         self._data: GraphData | None = None

code-graph-rag 0.0.88__tar.gz → 0.0.100__tar.gz

code-graph-rag 0.0.88__tar.gz → 0.0.100__tar.gz