PyPI - victor-codegraph - Versions diffs - 0.0.1__py3-none-any.whl - Mend

victor-codegraph 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

victor_codegraph/__init__.py +47 -0
victor_codegraph/adapter.py +94 -0
victor_codegraph/config.py +48 -0
victor_codegraph/languages.py +69 -0
victor_codegraph/model.py +148 -0
victor_codegraph/parser.py +120 -0
victor_codegraph/python_parser.py +270 -0
victor_codegraph/sizing.py +121 -0
victor_codegraph/treesitter_parser.py +268 -0
victor_codegraph-0.0.1.dist-info/METADATA +109 -0
victor_codegraph-0.0.1.dist-info/RECORD +13 -0
victor_codegraph-0.0.1.dist-info/WHEEL +5 -0
victor_codegraph-0.0.1.dist-info/top_level.txt +1 -0

victor_codegraph/__init__.py ADDED Viewed

@@ -0,0 +1,47 @@
+"""victor-codegraph — shared code->CPG chunker.
+One tree-sitter symbol+relation chunker, three consumers (Victor, ProximaDB SDK,
+AnvaiOps). See ProximaDB ADR-029 / Victor ADR-014.
+Example usage::
+    from victor_codegraph import chunk, parse, to_proxima_records, ChunkConfig
+    chunks = chunk(source, file_path="foo.py")          # size-capped, embeddable
+    parsed = parse(source, file_path="foo.py")           # symbols + relations
+    records = to_proxima_records(parsed, repo_graph_id="myrepo")
+"""
+from __future__ import annotations
+from .adapter import relation_to_record, symbol_to_record, to_proxima_records
+from .config import ChunkConfig
+from .languages import detect_language
+from .model import (
+    CodeChunk,
+    CodeRelation,
+    CodeRelationType,
+    CodeSymbol,
+    CodeSymbolType,
+    ParsedCode,
+    SourceLocation,
+)
+from .parser import chunk, parse
+# Package version string; also re-exported through ``__all__`` below.
+__version__ = "0.0.1"
+# Public API of the package: the version, the two entrypoints (``chunk``/``parse``),
+# the config and language detection helpers, the record projections, and the
+# data-model classes imported above.
+__all__ = [
+    "__version__",
+    "chunk",
+    "parse",
+    "ChunkConfig",
+    "detect_language",
+    "to_proxima_records",
+    "symbol_to_record",
+    "relation_to_record",
+    "CodeChunk",
+    "CodeSymbol",
+    "CodeRelation",
+    "CodeSymbolType",
+    "CodeRelationType",
+    "ParsedCode",
+    "SourceLocation",
+]

victor_codegraph/adapter.py ADDED Viewed

@@ -0,0 +1,94 @@
+"""Projection to the ProximaDB substrate-keystone ``ProximaRecord`` shape.
+Per ProximaDB ``CODE_GRAPH_CORRELATED_SUBSTRATE_2026_06_22.adoc`` a code symbol is *one*
+record addressable as a relational row, a graph node, and a vector at once. This adapter
+emits the **shape** as plain dicts — it does not import proximadb, embed, or write. The
+consumer (Victor embedded, AnvaiOps service) supplies the embedder and the DB write.
+"""
+from __future__ import annotations
+from typing import Any, Callable
+from .model import CodeRelation, CodeSymbol, ParsedCode
+# Shape of the caller-supplied embedding function: source text in, vector out.
+Embedder = Callable[[str], list[float]]
+def _symbol_oid(repo_graph_id: str, symbol: CodeSymbol) -> str:
+    """Return the graph-node object id for ``symbol`` within ``repo_graph_id``."""
+    node_id = symbol.id
+    return f"graph/{repo_graph_id}/node/{node_id}"
+def symbol_to_record(
+    symbol: CodeSymbol,
+    repo_graph_id: str,
+    branch_id: str = "main",
+    embedder: Embedder | None = None,
+    model_id: str = "bge-small-en-v1.5",
+    dim: int = 384,
+) -> dict[str, Any]:
+    """Build the node record (row + graph node + optional vector) for one symbol.
+    Args:
+        symbol: The symbol to project.
+        repo_graph_id: Graph namespace used in the record's ``oid``.
+        branch_id: Branch tag copied onto the record.
+        embedder: Optional callable applied to ``symbol.source_code``; when omitted
+            the record's ``embeddings`` list stays empty.
+        model_id: Model identifier stored alongside the embedding.
+        dim: Dimension stored alongside the embedding (not checked against the
+            length of the vector the embedder returns).
+    Returns:
+        A plain dict with ``oid``, ``labels``, ``branch_id``, ``props`` and
+        ``embeddings`` keys.
+    """
+    location = symbol.location
+    is_private = "private" in symbol.modifiers
+    props: dict[str, Any] = {
+        "name": symbol.simple_name,
+        "fully_qualified_name": symbol.fully_qualified_name,
+        "file": location.file_path,
+        "line": location.start_line,
+        "end_line": location.end_line,
+        "lang": symbol.language,
+        "ast_kind": symbol.symbol_type.name,
+        "signature": symbol.signature,
+        "visibility": "private" if is_private else "public",
+        "module_path": "::".join(symbol.scope_chain),
+        "snippet": symbol.source_code,
+        "documentation": symbol.documentation,
+    }
+    embeddings: list[dict[str, Any]] = []
+    if embedder is not None:
+        # The embedder sees exactly the text stored as the ``snippet`` prop.
+        vector = embedder(symbol.source_code)
+        embeddings.append(
+            {"model_id": model_id, "modality": "code", "dim": dim, "values": vector}
+        )
+    return {
+        "oid": _symbol_oid(repo_graph_id, symbol),
+        "labels": ["graph_node", "code_symbol"],
+        "branch_id": branch_id,
+        "props": props,
+        "embeddings": embeddings,
+    }
+def relation_to_record(relation: CodeRelation, repo_graph_id: str, branch_id: str = "main") -> dict[str, Any]:
+    """Build the edge record for one relation.
+    Both endpoints are rendered as ``graph/<repo_graph_id>/node/<symbol id>``; the
+    relation's ``confidence`` and ``context`` travel in ``props``.
+    """
+    node_prefix = f"graph/{repo_graph_id}/node/"
+    edge = {
+        "from_oid": node_prefix + f"{relation.from_symbol_id}",
+        "to_oid": node_prefix + f"{relation.to_symbol_id}",
+        "edge_type": relation.relation_type.name,
+    }
+    props = {"confidence": relation.confidence, "context": relation.context}
+    return {"labels": ["graph_edge"], "branch_id": branch_id, "edge": edge, "props": props}
+def to_proxima_records(
+    parsed: ParsedCode,
+    repo_graph_id: str,
+    branch_id: str = "main",
+    embedder: Embedder | None = None,
+    model_id: str = "bge-small-en-v1.5",
+    dim: int = 384,
+) -> list[dict[str, Any]]:
+    """Project an entire parsed file to node + edge records (shapes only).
+    Args:
+        parsed: Parse result whose symbols and relations are projected.
+        repo_graph_id: Graph namespace used in every generated oid.
+        branch_id: Branch tag copied onto every record.
+        embedder: Optional embedding callable forwarded to ``symbol_to_record``.
+        model_id: Embedding model identifier forwarded to ``symbol_to_record``, so a
+            non-default embedder is not mislabelled with the default model name.
+        dim: Embedding dimension forwarded to ``symbol_to_record``.
+    Returns:
+        All symbol (node) records followed by all relation (edge) records.
+    """
+    records = [
+        symbol_to_record(s, repo_graph_id, branch_id, embedder, model_id, dim)
+        for s in parsed.symbols
+    ]
+    records.extend(
+        relation_to_record(r, repo_graph_id, branch_id) for r in parsed.relations
+    )
+    return records

victor_codegraph/config.py ADDED Viewed

@@ -0,0 +1,48 @@
+"""Chunking configuration — the size discipline both donor parsers needed.
+ProximaDB's ``code.py`` had *no* size-capping (a huge function became one huge chunk);
+LlamaIndex ``CodeSplitter`` and Victor's chunker both cap size. This config carries the
+cap so the merged parser never emits an over-budget chunk.
+"""
+from __future__ import annotations
+from dataclasses import dataclass, field
+@dataclass
+class ChunkConfig:
+    """Size + scope knobs for code chunking.
+    The token budget is matched to the embedding model (BGE-small 384-d ~ 512 tokens),
+    not an arbitrary char count. ``chars_per_token`` is conservative to avoid truncation.
+    ``max_chunk_chars`` and ``chunk_overlap_chars`` are derived once in
+    ``__post_init__``; they are not recomputed if the token fields are mutated later.
+    Raises:
+        ValueError: If ``chars_per_token`` is not a positive number.
+    """
+    # Size cap (the gap fix). A symbol whose body exceeds the budget is split.
+    max_chunk_tokens: int = 512
+    chunk_overlap_tokens: int = 64
+    chars_per_token: float = 3.5
+    # Symbols below this many lines are never body-split (cheap, keep whole).
+    large_symbol_threshold_lines: int = 30
+    # Scope filters.
+    include_private: bool = True
+    include_tests: bool = True
+    extract_relations: bool = True
+    # Restrict to these languages (None = all detectable).
+    languages: list[str] | None = None
+    # Computed budgets (chars), derived in __post_init__.
+    max_chunk_chars: int = field(init=False, default=0)
+    chunk_overlap_chars: int = field(init=False, default=0)
+    def __post_init__(self) -> None:
+        """Validate the chars/token ratio and derive the character budgets."""
+        # A zero/negative (or NaN) ratio would make ``estimate_tokens`` divide by zero
+        # or return negative counts, so reject it at construction time.
+        if not self.chars_per_token > 0:
+            raise ValueError(f"chars_per_token must be > 0, got {self.chars_per_token!r}")
+        # At least one char per chunk, so splitters always make progress.
+        self.max_chunk_chars = max(1, int(self.max_chunk_tokens * self.chars_per_token))
+        # Overlap is clamped to [0, max_chunk_chars - 1]: strictly smaller than a chunk.
+        self.chunk_overlap_chars = max(
+            0, min(int(self.chunk_overlap_tokens * self.chars_per_token), self.max_chunk_chars - 1)
+        )
+    def estimate_tokens(self, text: str) -> int:
+        """Conservative token estimate for ``text`` (always at least 1)."""
+        return int(len(text) / self.chars_per_token) + 1

victor_codegraph/languages.py ADDED Viewed

@@ -0,0 +1,69 @@
+"""Language detection + the tree-sitter grammar name map.
+The Python path uses the stdlib ``ast`` (no grammar needed); everything else routes to
+the generic tree-sitter extractor when the grammar pack is installed.
+"""
+from __future__ import annotations
+import os
+# Extension -> canonical language name.
+# Keys are lower-case and include the leading dot; ``detect_language`` lower-cases the
+# extension before looking it up here. ``.tsx`` gets its own name rather than
+# "typescript", and ``.h`` is treated as C.
+EXTENSION_TO_LANGUAGE: dict[str, str] = {
+    ".py": "python",
+    ".pyi": "python",
+    ".js": "javascript",
+    ".jsx": "javascript",
+    ".mjs": "javascript",
+    ".cjs": "javascript",
+    ".ts": "typescript",
+    ".tsx": "tsx",
+    ".rs": "rust",
+    ".go": "go",
+    ".java": "java",
+    ".c": "c",
+    ".h": "c",
+    ".cpp": "cpp",
+    ".cc": "cpp",
+    ".cxx": "cpp",
+    ".hpp": "cpp",
+    ".cs": "csharp",
+    ".rb": "ruby",
+    ".php": "php",
+    ".swift": "swift",
+    ".kt": "kotlin",
+    ".scala": "scala",
+    ".sh": "bash",
+    ".bash": "bash",
+    ".lua": "lua",
+    ".sql": "sql",
+}
+# Canonical language name -> tree-sitter-language-pack grammar name. (Most are 1:1;
+# this indirection lets us split TS/TSX which share one canonical extraction path.)
+# "python" has no entry: per the module docstring the Python path uses the stdlib
+# ``ast`` and needs no grammar. Every other value in EXTENSION_TO_LANGUAGE has a key.
+TREE_SITTER_GRAMMAR: dict[str, str] = {
+    "javascript": "javascript",
+    "typescript": "typescript",
+    "tsx": "tsx",
+    "rust": "rust",
+    "go": "go",
+    "java": "java",
+    "c": "c",
+    "cpp": "cpp",
+    "csharp": "csharp",
+    "ruby": "ruby",
+    "php": "php",
+    "swift": "swift",
+    "kotlin": "kotlin",
+    "scala": "scala",
+    "bash": "bash",
+    "lua": "lua",
+    "sql": "sql",
+}
+def detect_language(file_path: str) -> str | None:
+    """Map ``file_path``'s extension (case-insensitively) to a canonical language.
+    Returns ``None`` when the extension is missing or not in EXTENSION_TO_LANGUAGE.
+    """
+    extension = os.path.splitext(file_path)[1].lower()
+    return EXTENSION_TO_LANGUAGE.get(extension)

victor_codegraph/model.py ADDED Viewed

@@ -0,0 +1,148 @@
+"""Canonical, neutral data model for code symbols, relations, and chunks.
+This is the *union* of the two donor implementations (ProximaDB SDK ``code.py`` and
+Victor ``victor-coding``): ProximaDB contributed the richer symbol/relation taxonomy;
+Victor contributed the size-aware ``CodeChunk`` with hierarchical IDs. The model carries
+no SaaS/DB/framework concept, so every consumer can depend on it freely.
+"""
+from __future__ import annotations
+import hashlib
+from dataclasses import dataclass, field
+from enum import IntEnum
+from typing import Any
+class CodeSymbolType(IntEnum):
+    """Kinds of code symbol that can be extracted (superset across languages).
+    Members are ``IntEnum`` values with explicitly assigned numbers; the record and
+    chunk projections in this package emit the member ``.name`` rather than the int.
+    """
+    # NOTE(review): the explicit numbering is presumably shared with the donor
+    # implementations / persisted data — confirm before renumbering or reordering.
+    FILE = 1
+    MODULE = 2
+    PACKAGE = 3
+    CLASS = 4
+    INTERFACE = 5
+    TRAIT = 6
+    STRUCT = 7
+    ENUM = 8
+    FUNCTION = 9
+    METHOD = 10
+    CONSTRUCTOR = 11
+    PROPERTY = 12
+    FIELD = 13
+    CONSTANT = 14
+    VARIABLE = 15
+    PARAMETER = 16
+    TYPE_ALIAS = 17
+    MACRO = 18
+class CodeRelationType(IntEnum):
+    """Directed relationships between code symbols.
+    Many members come in forward/inverse pairs by name (e.g. ``CALLS`` /
+    ``CALLED_BY``). The stdlib-``ast`` Python parser in this package emits only
+    ``CALLS`` and ``EXTENDS`` edges.
+    """
+    # NOTE(review): numbering is explicit and presumably part of a shared/persisted
+    # format — confirm before renumbering.
+    CALLS = 1
+    CALLED_BY = 2
+    EXTENDS = 3
+    IMPLEMENTS = 4
+    USES_TYPE = 5
+    RETURNS_TYPE = 6
+    IMPORTS = 7
+    IMPORTED_BY = 8
+    DEPENDS_ON = 9
+    CONTAINS = 10
+    CONTAINED_BY = 11
+    DEFINES = 12
+    REFERENCES = 13
+    REFERENCED_BY = 14
+    OVERRIDES = 15
+    OVERRIDDEN_BY = 16
+    TESTS = 17
+    TESTED_BY = 18
+@dataclass
+class SourceLocation:
+    """Where a symbol lives in source. Lines are 1-based; bytes are 0-based.
+    Every positional field defaults to 0. NOTE(review): because lines are 1-based, a
+    line of 0 presumably means "not set" — confirm with the producers.
+    """
+    file_path: str
+    start_line: int = 0
+    start_column: int = 0
+    end_line: int = 0
+    end_column: int = 0
+    # The stdlib-ast Python parser in this package never fills the two byte fields,
+    # so they stay 0 for Python symbols.
+    byte_offset: int = 0
+    byte_length: int = 0
+@dataclass
+class CodeSymbol:
+    """A semantic code unit (function/class/method/struct/...)."""
+    # Stable identifier; the Python parser derives it with ``deterministic_symbol_id``
+    # from (file, name, line, column).
+    id: str
+    symbol_type: CodeSymbolType
+    # "::"-joined path: file-derived prefix, enclosing scopes, then the simple name.
+    fully_qualified_name: str
+    simple_name: str
+    location: SourceLocation
+    # Exact source text of the symbol; this is what gets chunked and embedded.
+    source_code: str
+    language: str
+    documentation: str | None = None
+    signature: str | None = None
+    # Free-form tags. The Python parser emits "@<decorator>", "async", "dunder",
+    # "private" and "extends(<bases>)"; consumers test for "private" by membership.
+    modifiers: list[str] = field(default_factory=list)
+    # Names of the enclosing scopes, outermost first.
+    scope_chain: list[str] = field(default_factory=list)
+    # One dict per parameter; the Python parser uses the keys "name", optional
+    # "type", and the flags "is_variadic" / "is_kwonly".
+    parameters: list[dict[str, Any]] = field(default_factory=list)
+    return_type: str | None = None
+    # The Python parser stores {"cyclomatic": ..., "lines": ...} here.
+    complexity: dict[str, int] | None = None
+    # Open extension slot; the Python parser leaves it empty.
+    metadata: dict[str, Any] = field(default_factory=dict)
+@dataclass
+class CodeRelation:
+    """A directed edge between two symbols (by id)."""
+    from_symbol_id: str
+    # Usually a symbol id; the Python parser keeps the bare textual name here when
+    # the target cannot be resolved to a symbol in the same file.
+    to_symbol_id: str
+    relation_type: CodeRelationType
+    # Location of the call expression for CALLS edges produced by the Python parser
+    # (only file, start line and start column are filled in).
+    call_site: SourceLocation | None = None
+    # The Python parser stores the textual callee / base-class name here.
+    context: str | None = None
+    # 1.0 by default; the Python parser lowers it to 0.5 for unresolved targets.
+    confidence: float = 1.0
+@dataclass
+class ParsedCode:
+    """Result of parsing one source file."""
+    file_path: str
+    # Canonical language name; the orchestrator uses "text" when none was detected.
+    language: str
+    # Empty when no structured parser succeeded; ``chunk`` then falls back to
+    # sliding-window chunks over the raw text.
+    symbols: list[CodeSymbol] = field(default_factory=list)
+    relations: list[CodeRelation] = field(default_factory=list)
+    # For Python: the unparsed text of each top-level import statement.
+    imports: list[str] = field(default_factory=list)
+    # SHA-256 hex digest of the file content (see ``content_hash``); "" if unset.
+    content_hash: str = ""
+@dataclass
+class CodeChunk:
+    """An embeddable, size-capped chunk projected from a symbol.
+    A symbol within the size budget yields exactly one chunk; an oversized symbol is
+    body-split into several chunks sharing ``symbol_id`` (see ``sizing``). ``chunk_id``
+    is hierarchical and deterministic so incremental re-index is an idempotent upsert.
+    """
+    # "<symbol id>#0" for a whole symbol, "<symbol id>#body#<n>" for a body-split
+    # piece, "<file path>#window#<n>" for the sliding-window fallback.
+    chunk_id: str
+    text: str
+    # Parent symbol id; sliding-window chunks reuse their own ``chunk_id`` here.
+    symbol_id: str
+    # Symbol chunks derive these from the symbol's ``byte_offset`` and UTF-8 length;
+    # sliding-window chunks set both to 0.
+    start_pos: int
+    end_pos: int
+    # Per-chunk details such as file path, language, line range and chunk index.
+    metadata: dict[str, Any] = field(default_factory=dict)
+def deterministic_symbol_id(file_path: str, name: str, line: int, column: int = 0) -> str:
+    """Stable 16-hex id keyed on (file, name, line, col) — same input, same id.
+    Args:
+        file_path: Path of the file that defines the symbol.
+        name: Simple name of the symbol.
+        line: Line of the definition.
+        column: Column of the definition.
+    Returns:
+        The first 16 hex characters of the SHA-256 of ``"file:name:line:col"``.
+    """
+    key = f"{file_path}:{name}:{line}:{column}"
+    # ``surrogatepass`` keeps ids identical for ordinary text but no longer raises
+    # UnicodeEncodeError for paths carrying lone surrogates (e.g. undecodable file
+    # names surfaced by the OS via ``surrogateescape``).
+    return hashlib.sha256(key.encode("utf-8", "surrogatepass")).hexdigest()[:16]
+def content_hash(content: str) -> str:
+    """SHA-256 of file content, for change detection on the re-index hot path.
+    Returns the 64-character hex digest of the UTF-8 encoded ``content``.
+    """
+    # ``surrogatepass`` yields the same digest for ordinary text, and avoids a
+    # UnicodeEncodeError when the text contains lone surrogates (e.g. a file that was
+    # decoded with ``errors="surrogateescape"``).
+    return hashlib.sha256(content.encode("utf-8", "surrogatepass")).hexdigest()

victor_codegraph/parser.py ADDED Viewed

@@ -0,0 +1,120 @@
+"""Public entrypoints: ``parse`` (symbols+relations) and ``chunk`` (size-capped).
+Fallback chain (Victor's posture): python-ast -> tree-sitter -> sliding-window. A parse
+never hard-fails; an unknown or grammar-less language degrades to line-window chunks.
+"""
+from __future__ import annotations
+from .config import ChunkConfig
+from .languages import detect_language
+from .model import CodeChunk, ParsedCode, content_hash
+from .python_parser import parse_python
+from .sizing import chunks_for_symbol
+from .treesitter_parser import GrammarUnavailable, parse_treesitter
+def parse(content: str, language: str | None = None, file_path: str = "<unknown>") -> ParsedCode:
+    """Parse source into symbols + relations, falling back gracefully.
+    Args:
+        content: Source text of one file.
+        language: Canonical language name; detected from ``file_path`` when omitted.
+        file_path: Path used for language detection and recorded on the result.
+    Returns:
+        The structured parse when the Python or tree-sitter path succeeds; otherwise
+        a ``ParsedCode`` with no symbols, which ``chunk`` turns into sliding-window
+        chunks.
+    """
+    language = language or detect_language(file_path)
+    if language == "python":
+        try:
+            return parse_python(content, file_path)
+        except (SyntaxError, ValueError, RecursionError):
+            # SyntaxError: invalid source. ValueError: ``ast.parse`` rejects NUL bytes
+            # on older interpreters. RecursionError: pathologically nested code can
+            # overflow ``ast.parse``/``ast.unparse``. All degrade to the empty result
+            # below instead of breaking the "never hard-fails" contract.
+            pass
+    if language is not None and language != "python":
+        try:
+            return parse_treesitter(content, file_path, language)
+        except GrammarUnavailable:
+            pass  # grammar pack missing or language unsupported: fall through
+    # Last resort: no symbols (caller's chunk() will sliding-window the raw text).
+    return ParsedCode(
+        file_path=file_path,
+        language=language or "text",
+        symbols=[],
+        relations=[],
+        imports=[],
+        content_hash=content_hash(content),
+    )
+def _sliding_window(content: str, file_path: str, language: str, config: ChunkConfig) -> list[CodeChunk]:
+    """Universal fallback when no symbols were extracted.
+    Packs consecutive whole lines into chunks of at most ``config.max_chunk_chars``
+    characters (no overlap). A single line longer than the budget — e.g. minified
+    code — is cut into budget-sized pieces so that no chunk exceeds the cap.
+    Args:
+        content: Raw file text.
+        file_path: Used for chunk ids and metadata.
+        language: Recorded in each chunk's metadata.
+        config: Supplies ``max_chunk_chars``.
+    Returns:
+        Chunks in file order; empty for empty ``content``. ``start_pos``/``end_pos``
+        are always 0 here; the line range lives in ``metadata``.
+    """
+    if not content:
+        return []
+    max_chars = max(1, config.max_chunk_chars)
+    out: list[CodeChunk] = []
+    def emit(text: str, first_line: int, last_line: int) -> None:
+        # Chunk index == position in ``out``; window chunks act as their own symbol.
+        idx = len(out)
+        window_id = f"{file_path}#window#{idx}"
+        out.append(
+            CodeChunk(
+                chunk_id=window_id,
+                text=text,
+                symbol_id=window_id,
+                start_pos=0,
+                end_pos=0,
+                metadata={
+                    "file_path": file_path,
+                    "language": language,
+                    "chunk_index": idx,
+                    "start_line": first_line,
+                    "end_line": last_line,
+                    "strategy": "sliding_window",
+                },
+            )
+        )
+    lines = content.splitlines(keepends=True)
+    cur: list[str] = []
+    cur_len = 0
+    start_line = 1
+    for n, ln in enumerate(lines, start=1):
+        # Flush the pending window when this line would push it over budget.
+        if cur and cur_len + len(ln) > max_chars:
+            emit("".join(cur), start_line, n - 1)
+            cur, cur_len, start_line = [], 0, n
+        if len(ln) > max_chars:
+            # ``cur`` is necessarily empty here (it was flushed just above). Hard-split
+            # the over-long line; every piece reports the same source line.
+            for off in range(0, len(ln), max_chars):
+                emit(ln[off : off + max_chars], n, n)
+            start_line = n + 1
+            continue
+        cur.append(ln)
+        cur_len += len(ln)
+    if cur:
+        emit("".join(cur), start_line, len(lines))
+    return out
+def chunk(
+    content: str,
+    language: str | None = None,
+    file_path: str = "<unknown>",
+    config: ChunkConfig | None = None,
+) -> list[CodeChunk]:
+    """Parse ``content`` and return size-capped, embeddable chunks.
+    With no extracted symbols the raw text is sliding-window chunked. Otherwise each
+    symbol is projected through ``chunks_for_symbol``; symbols tagged "private" are
+    skipped when ``config.include_private`` is false.
+    """
+    config = config or ChunkConfig()
+    parsed = parse(content, language, file_path)
+    if not parsed.symbols:
+        return _sliding_window(content, file_path, parsed.language, config)
+    def wanted(symbol) -> bool:
+        # Keep everything unless private symbols are excluded and this one is private.
+        return config.include_private or "private" not in symbol.modifiers
+    return [
+        piece
+        for symbol in parsed.symbols
+        if wanted(symbol)
+        for piece in chunks_for_symbol(symbol, config)
+    ]

victor_codegraph/python_parser.py ADDED Viewed

@@ -0,0 +1,270 @@
+"""Python parser using the stdlib ``ast`` module.
+This is the primary Python path (Victor's approach): it needs no native grammar, is
+deterministic, and works fully offline. Extracts modules/classes/functions/methods with
+signatures, docstrings, decorators, parameters, cyclomatic complexity, and CALLS edges.
+"""
+from __future__ import annotations
+import ast
+from .model import (
+    CodeRelation,
+    CodeRelationType,
+    CodeSymbol,
+    CodeSymbolType,
+    ParsedCode,
+    SourceLocation,
+    content_hash,
+    deterministic_symbol_id,
+)
+# AST node types that each add one to the complexity count in ``_cyclomatic``.
+_BRANCH_NODES = (
+    ast.If,
+    ast.For,
+    ast.AsyncFor,
+    ast.While,
+    ast.With,
+    ast.AsyncWith,
+    ast.Try,
+    ast.ExceptHandler,
+    ast.BoolOp,
+    ast.IfExp,
+    ast.comprehension,
+)
+def _cyclomatic(node: ast.AST) -> dict[str, int]:
+    """Return ``{"cyclomatic": ..., "lines": ...}`` for ``node``.
+    The count starts at 1 and grows by one per ``_BRANCH_NODES`` instance found in
+    ``node``'s subtree (``node`` included). ``lines`` is the inclusive line span,
+    falling back to a single line when the node carries no position.
+    """
+    branches = sum(1 for sub in ast.walk(node) if isinstance(sub, _BRANCH_NODES))
+    first_line = getattr(node, "lineno", 1)
+    last_line = getattr(node, "end_lineno", first_line) or first_line
+    return {"cyclomatic": 1 + branches, "lines": last_line - first_line + 1}
+def _params(args: ast.arguments) -> list[dict]:
+    """Describe a function's parameters as a list of dicts, in declaration order.
+    Positional parameters named ``self``/``cls`` are omitted. Each entry has a
+    ``name``; annotated parameters add ``type``; ``*args``/``**kwargs`` carry
+    ``is_variadic`` and keyword-only parameters carry ``is_kwonly``.
+    """
+    def describe(arg: ast.arg, **flags) -> dict:
+        entry: dict = {"name": arg.arg, **flags}
+        if arg.annotation is not None:
+            entry["type"] = ast.unparse(arg.annotation)
+        return entry
+    positional = [*getattr(args, "posonlyargs", []), *args.args]
+    collected: list[dict] = [
+        describe(arg) for arg in positional if arg.arg not in ("self", "cls")
+    ]
+    if args.vararg is not None:
+        collected.append({"name": f"*{args.vararg.arg}", "is_variadic": True})
+    collected.extend(describe(arg, is_kwonly=True) for arg in args.kwonlyargs)
+    if args.kwarg is not None:
+        collected.append({"name": f"**{args.kwarg.arg}", "is_variadic": True})
+    return collected
+def _signature(name: str, args: ast.arguments, returns: ast.AST | None) -> str:
+    """Render ``name(params) -> return`` from a function's AST pieces.
+    Parameters come from ``_params`` (so ``self``/``cls`` are omitted and defaults are
+    not shown). When keyword-only parameters are present without a ``*args``, a bare
+    ``*`` separator is inserted so they are not rendered as if they were positional.
+    Args:
+        name: Function name.
+        args: The function's ``ast.arguments``.
+        returns: Return annotation node, or ``None`` when absent.
+    """
+    parts: list[str] = []
+    star_seen = False
+    for p in _params(args):
+        pname = p["name"]
+        if pname.startswith("*") and not pname.startswith("**"):
+            star_seen = True  # ``*args`` already separates the keyword-only params
+        elif p.get("is_kwonly") and not star_seen:
+            parts.append("*")
+            star_seen = True
+        rendered = pname
+        if p.get("type"):
+            rendered += f": {p['type']}"
+        parts.append(rendered)
+    sig = f"{name}({', '.join(parts)})"
+    if returns is not None:
+        sig += f" -> {ast.unparse(returns)}"
+    return sig
+def _modifiers(name: str, decorators: list[ast.expr], is_async: bool) -> list[str]:
+    """Collect modifier tags: ``@decorator`` entries, then ``async``, then visibility.
+    A name both starting and ending with ``__`` is tagged ``dunder``; any other name
+    starting with ``_`` is tagged ``private``.
+    """
+    tags: list[str] = []
+    tags.extend(f"@{ast.unparse(d)}" for d in decorators)
+    if is_async:
+        tags.append("async")
+    is_dunder = name.startswith("__") and name.endswith("__")
+    if is_dunder:
+        tags.append("dunder")
+    elif name.startswith("_"):
+        tags.append("private")
+    return tags
+def _callee_name(call: ast.Call) -> str | None:
+    """Return the textual callee of ``call``: ``f`` for ``f()``, ``m`` for ``x.m()``.
+    Any other call target (a subscript, another call, ...) yields ``None``.
+    """
+    target = call.func
+    if isinstance(target, ast.Attribute):
+        return target.attr
+    if isinstance(target, ast.Name):
+        return target.id
+    return None
+class _Visitor:
+    """Collect symbols, relations and imports from one parsed Python module.
+    This is not an ``ast.NodeVisitor``: ``run`` dispatches over the module's top-level
+    statements and ``visit_class`` recurses into class bodies. Functions nested inside
+    other functions do not become symbols of their own, although the calls they make
+    are attributed to the enclosing function.
+    """
+    def __init__(self, file_path: str, source: str) -> None:
+        """Store the inputs and start with empty result lists."""
+        self.file_path = file_path
+        self.source = source
+        self.symbols: list[CodeSymbol] = []
+        self.relations: list[CodeRelation] = []
+        self.imports: list[str] = []
+        # Path separators become dots; the file extension is kept as-is.
+        self._fqn_prefix = file_path.replace("/", ".").replace("\\", ".")
+    def _src(self, node: ast.AST) -> str:
+        """Return the source text of ``node``, or "" when it cannot be recovered."""
+        try:
+            return ast.get_source_segment(self.source, node) or ""
+        except Exception:
+            # Deliberate best-effort: a missing snippet must not abort the parse.
+            return ""
+    def _make_symbol(
+        self,
+        node: ast.AST,
+        name: str,
+        symbol_type: CodeSymbolType,
+        scope: list[str],
+        signature: str | None = None,
+        params: list[dict] | None = None,
+        return_type: str | None = None,
+        modifiers: list[str] | None = None,
+    ) -> CodeSymbol:
+        """Build a ``CodeSymbol`` for ``node`` (the caller appends it to ``symbols``).
+        Args:
+            node: Definition node; position attributes default to line 1 / column 0
+                when the node carries none.
+            name: Simple name of the symbol.
+            symbol_type: Kind to record.
+            scope: Names of the enclosing classes, outermost first.
+            signature: Rendered signature, if any.
+            params: Parameter descriptions, if any.
+            return_type: Unparsed return annotation, if any.
+            modifiers: Modifier tags, if any.
+        """
+        lineno = getattr(node, "lineno", 1)
+        end = getattr(node, "end_lineno", lineno) or lineno
+        col = getattr(node, "col_offset", 0)
+        fqn = "::".join([self._fqn_prefix, *scope, name])
+        return CodeSymbol(
+            id=deterministic_symbol_id(self.file_path, name, lineno, col),
+            symbol_type=symbol_type,
+            fully_qualified_name=fqn,
+            simple_name=name,
+            location=SourceLocation(
+                file_path=self.file_path,
+                start_line=lineno,
+                start_column=col,
+                end_line=end,
+                end_column=getattr(node, "end_col_offset", 0) or 0,
+            ),
+            source_code=self._src(node),
+            language="python",
+            # ``ast.get_docstring`` only accepts def/class/module nodes.
+            documentation=(
+                ast.get_docstring(node)
+                if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef))
+                else None
+            ),
+            signature=signature,
+            modifiers=modifiers or [],
+            scope_chain=list(scope),
+            parameters=params or [],
+            return_type=return_type,
+            complexity=_cyclomatic(node),
+        )
+    def visit_function(
+        self, node: ast.FunctionDef | ast.AsyncFunctionDef, scope: list[str]
+    ) -> CodeSymbol:
+        """Record a function/method symbol plus one CALLS relation per call inside it.
+        A non-empty ``scope`` means the def sits in a class body: ``__init__`` becomes
+        a CONSTRUCTOR, anything else a METHOD. Top-level defs are FUNCTIONs.
+        """
+        name = node.name
+        if scope:
+            stype = CodeSymbolType.CONSTRUCTOR if name == "__init__" else CodeSymbolType.METHOD
+        else:
+            stype = CodeSymbolType.FUNCTION
+        sym = self._make_symbol(
+            node,
+            name,
+            stype,
+            scope,
+            signature=_signature(name, node.args, node.returns),
+            params=_params(node.args),
+            return_type=ast.unparse(node.returns) if node.returns is not None else None,
+            modifiers=_modifiers(name, node.decorator_list, isinstance(node, ast.AsyncFunctionDef)),
+        )
+        self.symbols.append(sym)
+        # CALLS edges. ``to_symbol_id`` is the textual callee here; ``parse_python``
+        # resolves it to a real in-file symbol id when the callee is defined locally
+        # and otherwise keeps it as a bare name (so cross-file/external calls — e.g.
+        # a CPG's blast radius — are not silently dropped). ``call_site`` records the
+        # call line for consumers that need it.
+        for child in ast.walk(node):
+            if isinstance(child, ast.Call):
+                callee = _callee_name(child)
+                if callee:
+                    self.relations.append(
+                        CodeRelation(
+                            from_symbol_id=sym.id,
+                            to_symbol_id=callee,
+                            relation_type=CodeRelationType.CALLS,
+                            context=callee,
+                            call_site=SourceLocation(
+                                file_path=self.file_path,
+                                start_line=getattr(child, "lineno", 0),
+                                start_column=getattr(child, "col_offset", 0),
+                            ),
+                        )
+                    )
+        return sym
+    def visit_class(self, node: ast.ClassDef, scope: list[str]) -> None:
+        """Record a class symbol, its EXTENDS relations, and its methods/nested classes."""
+        bases = [ast.unparse(b) for b in node.bases]
+        # Use the same tagging as functions so ``_Private`` classes carry "private"
+        # too; consumers decide visibility by testing for that tag.
+        mods = _modifiers(node.name, node.decorator_list, False)
+        if bases:
+            mods.append(f"extends({','.join(bases)})")
+        cls = self._make_symbol(node, node.name, CodeSymbolType.CLASS, scope, modifiers=mods)
+        self.symbols.append(cls)
+        # Like CALLS, EXTENDS targets are textual here and resolved by ``parse_python``.
+        for base in bases:
+            self.relations.append(
+                CodeRelation(
+                    from_symbol_id=cls.id,
+                    to_symbol_id=base,
+                    relation_type=CodeRelationType.EXTENDS,
+                    context=base,
+                )
+            )
+        inner = [*scope, node.name]
+        for child in node.body:
+            if isinstance(child, (ast.FunctionDef, ast.AsyncFunctionDef)):
+                self.visit_function(child, inner)
+            elif isinstance(child, ast.ClassDef):
+                self.visit_class(child, inner)
+    def run(self, tree: ast.Module) -> None:
+        """Process the module's top-level imports, functions and classes."""
+        for node in tree.body:
+            if isinstance(node, (ast.Import, ast.ImportFrom)):
+                self.imports.append(ast.unparse(node))
+            elif isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
+                self.visit_function(node, [])
+            elif isinstance(node, ast.ClassDef):
+                self.visit_class(node, [])
+def parse_python(content: str, file_path: str) -> ParsedCode:
+    """Parse Python source into symbols + relations using the stdlib ``ast``.
+    Args:
+        content: Python source text.
+        file_path: Path recorded on every symbol and used in symbol ids.
+    Returns:
+        A ``ParsedCode`` with the collected symbols, resolved relations, top-level
+        imports and the content hash.
+    Raises:
+        SyntaxError: Propagated from ``ast.parse`` for invalid source.
+    """
+    tree = ast.parse(content)
+    v = _Visitor(file_path, content)
+    v.run(tree)
+    # Resolve CALLS/EXTENDS targets to real in-file symbol ids when possible.
+    # Unresolved targets (external / cross-file callees and bases) are RETAINED with
+    # ``to_symbol_id`` = the textual name and ``confidence`` < 1.0, so consumers that
+    # need outgoing-call coverage (e.g. a CPG's blast radius) are not silently lossy.
+    # Only self-references (recursive calls) are dropped.
+    # A simple name defined more than once in the file (several ``__init__`` methods,
+    # same-named methods on different classes) cannot be resolved by name alone, so
+    # it is treated as unresolved instead of being pinned to whichever definition
+    # happened to come last.
+    ids_by_name: dict[str, list[str]] = {}
+    for s in v.symbols:
+        ids_by_name.setdefault(s.simple_name, []).append(s.id)
+    resolved: list[CodeRelation] = []
+    for r in v.relations:
+        candidates = ids_by_name.get(r.to_symbol_id, [])
+        target_id = candidates[0] if len(candidates) == 1 else None
+        if target_id is not None and target_id == r.from_symbol_id:
+            continue  # self-reference (recursive call) — emit no self-edge
+        resolved.append(
+            CodeRelation(
+                from_symbol_id=r.from_symbol_id,
+                to_symbol_id=target_id if target_id is not None else r.to_symbol_id,
+                relation_type=r.relation_type,
+                context=r.context,
+                call_site=r.call_site,
+                confidence=1.0 if target_id is not None else 0.5,
+            )
+        )
+    return ParsedCode(
+        file_path=file_path,
+        language="python",
+        symbols=v.symbols,
+        relations=resolved,
+        imports=v.imports,
+        content_hash=content_hash(content),
+    )

victor_codegraph/sizing.py ADDED Viewed

@@ -0,0 +1,121 @@
+"""Size-capping / body-split — the discipline ProximaDB's ``code.py`` lacked.
+A symbol within budget yields one chunk. An oversized symbol is split into
+line-aligned, overlapping sub-chunks (LlamaIndex ``CodeSplitter`` style: respect
+structure, but never exceed ``max_chunk_chars``). Sub-chunks share the parent
+``symbol_id`` and carry hierarchical, deterministic ``chunk_id``s (Victor's pattern).
+"""
+from __future__ import annotations
+from .config import ChunkConfig
+from .model import CodeChunk, CodeSymbol
+def _base_metadata(symbol: CodeSymbol) -> dict:
+    """Return the metadata shared by every chunk cut from ``symbol``.
+    ``modifiers`` and ``scope_chain`` are copied so that chunks do not alias the
+    symbol's own lists.
+    """
+    location = symbol.location
+    meta: dict = {
+        "symbol_id": symbol.id,
+        "symbol_type": symbol.symbol_type.name,
+    }
+    # Plain attribute pass-throughs whose key equals the attribute name.
+    for attr in ("fully_qualified_name", "simple_name", "language"):
+        meta[attr] = getattr(symbol, attr)
+    meta["file_path"] = location.file_path
+    meta["start_line"] = location.start_line
+    meta["end_line"] = location.end_line
+    meta["signature"] = symbol.signature
+    meta["documentation"] = symbol.documentation
+    meta["modifiers"] = list(symbol.modifiers)
+    meta["scope_chain"] = list(symbol.scope_chain)
+    meta["return_type"] = symbol.return_type
+    meta["complexity"] = symbol.complexity
+    return meta
+def chunks_for_symbol(symbol: CodeSymbol, config: ChunkConfig) -> list[CodeChunk]:
+    """Project one symbol into one or more size-capped chunks.
+    A symbol whose source fits within ``config.max_chunk_chars`` becomes a single
+    chunk with id ``"<symbol id>#0"``. Anything larger is body-split, however few
+    lines it spans — the char budget is the only criterion, so
+    ``config.large_symbol_threshold_lines`` does not influence the outcome.
+    Args:
+        symbol: Symbol to project.
+        config: Supplies the character budget (and overlap for the split path).
+    Returns:
+        One chunk for an in-budget symbol, otherwise the body-split chunks.
+    """
+    source = symbol.source_code
+    if len(source) > config.max_chunk_chars:
+        return _body_split(symbol, config)
+    meta = _base_metadata(symbol)
+    meta["chunk_index"] = 0
+    meta["chunk_total"] = 1
+    start = symbol.location.byte_offset
+    return [
+        CodeChunk(
+            chunk_id=f"{symbol.id}#0",
+            text=source,
+            symbol_id=symbol.id,
+            start_pos=start,
+            # Positions are byte-based, hence the UTF-8 length rather than len(source).
+            end_pos=start + len(source.encode("utf-8")),
+            metadata=meta,
+        )
+    ]
+def _body_split(symbol: CodeSymbol, config: ChunkConfig) -> list[CodeChunk]:
+    """Split an oversized symbol body into overlapping, line-aligned sub-chunks.
+    Whole lines are packed into windows of at most ``config.max_chunk_chars``
+    characters. Each new window starts with a tail of the previous one, up to
+    ``config.chunk_overlap_chars``, shrunk when needed so that tail + next line still
+    fits the budget. A single line longer than the budget is cut into budget-sized
+    pieces (nothing is discarded) and carries no overlap.
+    Args:
+        symbol: The oversized symbol.
+        config: Supplies ``max_chunk_chars`` and ``chunk_overlap_chars``.
+    Returns:
+        Chunks with ids ``"<symbol id>#body#<n>"`` that share ``symbol_id``. Metadata
+        ``start_line``/``end_line`` describe each window; ``start_pos``/``end_pos``
+        describe the whole symbol.
+    """
+    lines = symbol.source_code.splitlines(keepends=True)
+    max_chars = max(1, config.max_chunk_chars)
+    overlap_chars = config.chunk_overlap_chars
+    # (first_line_offset, last_line_offset, text), offsets relative to the symbol.
+    windows: list[tuple[int, int, str]] = []
+    cur: list[str] = []
+    cur_len = 0
+    cur_start = 0
+    for i, ln in enumerate(lines):
+        if len(ln) > max_chars:
+            # Degenerate minified case: flush what is pending, then hard-split the
+            # line so every piece respects the cap and no text is lost.
+            if cur:
+                windows.append((cur_start, i - 1, "".join(cur)))
+            for off in range(0, len(ln), max_chars):
+                windows.append((i, i, ln[off : off + max_chars]))
+            cur, cur_len, cur_start = [], 0, i + 1
+            continue
+        if cur and cur_len + len(ln) > max_chars:
+            windows.append((cur_start, i - 1, "".join(cur)))
+            # Build the overlap tail by walking back from the end of the flushed
+            # window, stopping before it would exceed the overlap budget or leave no
+            # room for the incoming line.
+            tail: list[str] = []
+            tail_len = 0
+            j = i - 1
+            while j >= cur_start:
+                grown = tail_len + len(lines[j])
+                if grown > overlap_chars or grown + len(ln) > max_chars:
+                    break
+                tail.insert(0, lines[j])
+                tail_len = grown
+                j -= 1
+            cur = tail
+            cur_len = tail_len
+            cur_start = j + 1
+        cur.append(ln)
+        cur_len += len(ln)
+    if cur:
+        windows.append((cur_start, len(lines) - 1, "".join(cur)))
+    total = len(windows)
+    out: list[CodeChunk] = []
+    base_line = symbol.location.start_line
+    start_pos = symbol.location.byte_offset
+    end_pos = start_pos + len(symbol.source_code.encode("utf-8"))
+    for idx, (first_off, last_off, text) in enumerate(windows):
+        meta = _base_metadata(symbol)
+        meta["chunk_index"] = idx
+        meta["chunk_total"] = total
+        meta["is_body_split"] = True
+        meta["start_line"] = base_line + first_off
+        meta["end_line"] = base_line + last_off
+        out.append(
+            CodeChunk(
+                chunk_id=f"{symbol.id}#body#{idx}",
+                text=text,
+                symbol_id=symbol.id,
+                start_pos=start_pos,
+                end_pos=end_pos,
+                metadata=meta,
+            )
+        )
+    return out

victor_codegraph/treesitter_parser.py ADDED Viewed

@@ -0,0 +1,268 @@
+"""Generic tree-sitter parser for non-Python languages.
+Includes a *real* JavaScript/TypeScript extractor — the donor ProximaDB ``code.py``
+shipped a JS/TS stub that returned no symbols (CLAUDE-mandate "plausible-but-wrong"
+failure). Here JS/TS, plus a language-agnostic node-walk for common grammars, produce
+functions, classes, methods, and imports.
+Requires the ``treesitter`` extra (``tree-sitter`` + ``tree-sitter-language-pack``).
+If the grammar is unavailable, :func:`parse_treesitter` raises :class:`GrammarUnavailable`
+so the orchestrator can fall back.
+"""
+from __future__ import annotations
+from .languages import TREE_SITTER_GRAMMAR
+from .model import (
+    CodeRelation,
+    CodeRelationType,
+    CodeSymbol,
+    CodeSymbolType,
+    ParsedCode,
+    SourceLocation,
+    content_hash,
+    deterministic_symbol_id,
+)
+class GrammarUnavailable(RuntimeError):
+    """Raised when a tree-sitter grammar can't be loaded for a language.
+    Raised by :func:`_get_parser` when the tree-sitter packages fail to import or
+    the grammar lookup fails, and by :func:`parse_treesitter` when the language has
+    no entry in ``TREE_SITTER_GRAMMAR``. Callers catch it to fall back to another
+    parsing strategy.
+    """
+# Node types that denote a callable, per common tree-sitter grammars.
+# Matched against ``node.type`` in ``_walk_collect``; every hit becomes a
+# FUNCTION / METHOD / CONSTRUCTOR symbol and its body is not descended into.
+_FUNC_NODES = {
+    "function_declaration",
+    "function_definition",
+    "function_item",  # rust
+    "method_definition",  # js/ts
+    "method_declaration",  # java/go
+    "function",
+    "arrow_function",
+}
+# Node types that open a named container. ``_walk_collect`` emits a
+# CLASS / STRUCT / INTERFACE symbol for each hit and recurses with the
+# container's name appended to the scope chain.
+_CLASS_NODES = {
+    "class_declaration",
+    "class_definition",
+    "class_specifier",
+    "struct_item",  # rust
+    "struct_specifier",
+    "interface_declaration",
+    "impl_item",  # rust
+}
+# Node types whose source text ``parse_treesitter`` records verbatim as an
+# import. Only direct children of the root node are checked against this set.
+_IMPORT_NODES = {
+    "import_statement",
+    "import_declaration",
+    "import_from_statement",
+    "use_declaration",  # rust
+    "preproc_include",  # c/cpp
+}
+def _get_parser(grammar: str):
+    # Build an OFFICIAL `tree_sitter.Parser` from the pack's Language, rather than
+    # `tree_sitter_language_pack.get_parser()` — the latter can return a vendored
+    # binding whose nodes are a minimal `builtins.Node` lacking `.type`/`.children`.
+    # The official Parser yields standard nodes (type/children/start_byte properties).
+    try:
+        from tree_sitter import Parser
+        from tree_sitter_language_pack import get_language
+    except Exception as e:  # ImportError or native load failure
+        raise GrammarUnavailable(f"tree-sitter unavailable: {e}") from e
+    try:
+        language = get_language(grammar)
+    except Exception as e:
+        raise GrammarUnavailable(f"grammar '{grammar}' unavailable: {e}") from e
+    try:
+        return Parser(language)  # tree_sitter >= 0.23
+    except TypeError:
+        parser = Parser()  # older API: set the language attribute
+        parser.language = language
+        return parser
+def _text(node, src: bytes) -> str:
+    return src[node.start_byte : node.end_byte].decode("utf-8", errors="replace")
+def _attr(obj, name):
+    """Access a tree-sitter attribute that may be a property OR a zero-arg method.
+    tree-sitter-language-pack's bundled binding exposes `root_node`/`children` as
+    *methods* (callables), whereas the canonical `tree_sitter` exposes them as
+    properties. A list/Node is never callable, so this is safe for both shapes.
+    """
+    v = getattr(obj, name)
+    return v() if callable(v) else v
+def _children(node):
+    """Return a node's children across tree-sitter binding flavors.
+    `children` may be a property, a zero-arg method, or absent entirely (the
+    bundled binding exposes only `child_count` + `child(i)`, the universal C API).
+    """
+    children = getattr(node, "children", None)
+    if children is not None:
+        return children() if callable(children) else children
+    count = _attr(node, "child_count")
+    return [node.child(i) for i in range(count)]
+def _name_of(node, src: bytes) -> str | None:
+    """Find the declared name of a function/class node (grammar-agnostic)."""
+    field = node.child_by_field_name("name")
+    if field is not None:
+        return _text(field, src)
+    for child in _children(node):
+        if child.type in ("identifier", "type_identifier", "field_identifier", "property_identifier"):
+            return _text(child, src)
+    return None
+def _walk_collect(node, src, file_path, language, scope, symbols, relations):
+    for child in _children(node):
+        t = child.type
+        if t in _CLASS_NODES:
+            name = _name_of(child, src) or "<anonymous>"
+            stype = (
+                CodeSymbolType.STRUCT
+                if t.startswith("struct")
+                else CodeSymbolType.INTERFACE
+                if "interface" in t
+                else CodeSymbolType.CLASS
+            )
+            sym = _mk(child, src, file_path, language, name, stype, scope)
+            symbols.append(sym)
+            # Recurse into the class body with the class name on the scope chain.
+            body = child.child_by_field_name("body")
+            _walk_collect(
+                body if body is not None else child,
+                src,
+                file_path,
+                language,
+                [*scope, name],
+                symbols,
+                relations,
+            )
+        elif t in _FUNC_NODES:
+            name = _name_of(child, src)
+            if name is None and t == "arrow_function":
+                continue  # anonymous arrow not bound to a name; skip
+            name = name or "<anonymous>"
+            stype = CodeSymbolType.METHOD if scope else CodeSymbolType.FUNCTION
+            if name in ("constructor", "__init__", "new"):
+                stype = CodeSymbolType.CONSTRUCTOR
+            symbols.append(_mk(child, src, file_path, language, name, stype, scope))
+            # Don't recurse into function bodies for nested defs (kept flat, like donors).
+        else:
+            _walk_collect(child, src, file_path, language, scope, symbols, relations)
+def _handle_const_arrow(node, src, file_path, language, symbols):
+    """JS/TS: ``const foo = (...) => {...}`` / ``export const foo = () => {}``."""
+    for decl in _children(node):
+        if decl.type != "variable_declarator":
+            continue
+        name_node = decl.child_by_field_name("name")
+        value = decl.child_by_field_name("value")
+        if name_node is not None and value is not None and value.type == "arrow_function":
+            name = _text(name_node, src)
+            symbols.append(
+                _mk(decl, src, file_path, language, name, CodeSymbolType.FUNCTION, [])
+            )
+def _mk(node, src, file_path, language, name, stype, scope) -> CodeSymbol:
+    start_line = node.start_point[0] + 1
+    end_line = node.end_point[0] + 1
+    fqn = "::".join([file_path.replace("/", "."), *scope, name])
+    return CodeSymbol(
+        id=deterministic_symbol_id(file_path, name, start_line, node.start_point[1]),
+        symbol_type=stype,
+        fully_qualified_name=fqn,
+        simple_name=name,
+        location=SourceLocation(
+            file_path=file_path,
+            start_line=start_line,
+            start_column=node.start_point[1],
+            end_line=end_line,
+            end_column=node.end_point[1],
+            byte_offset=node.start_byte,
+            byte_length=node.end_byte - node.start_byte,
+        ),
+        source_code=_text(node, src),
+        language=language,
+        scope_chain=list(scope),
+        complexity={"lines": end_line - start_line + 1},
+    )
+def parse_treesitter(content: str, file_path: str, language: str) -> ParsedCode:
+    """Parse non-Python source via tree-sitter. Raises GrammarUnavailable on fallback."""
+    grammar = TREE_SITTER_GRAMMAR.get(language)
+    if grammar is None:
+        raise GrammarUnavailable(f"no grammar mapping for language '{language}'")
+    parser = _get_parser(grammar)
+    src = content.encode("utf-8")
+    # tree-sitter's Parser.parse() takes bytes on most builds, but some
+    # (notably via tree-sitter-language-pack) require a str and raise
+    # TypeError on bytes. Node byte offsets are UTF-8 positions either way,
+    # so `src` stays valid for slicing in _text() regardless of which we pass.
+    try:
+        tree = parser.parse(src)
+    except TypeError:
+        tree = parser.parse(content)
+    root = _attr(tree, "root_node")
+    symbols: list[CodeSymbol] = []
+    relations: list[CodeRelation] = []
+    imports: list[str] = []
+    for child in _children(root):
+        if child.type in _IMPORT_NODES:
+            imports.append(_text(child, src))
+    _walk_collect(root, src, file_path, language, [], symbols, relations)
+    # JS/TS arrow-function-as-const: a real surface the stub missed.
+    if language in ("javascript", "typescript", "tsx"):
+        for child in _children(root):
+            target = child
+            # unwrap `export const ...`
+            if child.type in ("export_statement",) and child.child_count:
+                for c in _children(child):
+                    if c.type in ("lexical_declaration", "variable_declaration"):
+                        target = c
+                        break
+            if target.type in ("lexical_declaration", "variable_declaration"):
+                _handle_const_arrow(target, src, file_path, language, symbols)
+    # Best-effort CONTAINS edges (class -> its methods) by scope.
+    by_scope: dict[str, str] = {
+        s.simple_name: s.id for s in symbols if s.symbol_type == CodeSymbolType.CLASS
+    }
+    for s in symbols:
+        if s.scope_chain:
+            parent = by_scope.get(s.scope_chain[-1])
+            if parent is not None and parent != s.id:
+                relations.append(
+                    CodeRelation(
+                        from_symbol_id=parent,
+                        to_symbol_id=s.id,
+                        relation_type=CodeRelationType.CONTAINS,
+                    )
+                )
+    return ParsedCode(
+        file_path=file_path,
+        language=language,
+        symbols=symbols,
+        relations=relations,
+        imports=imports,
+        content_hash=content_hash(content),
+    )

victor_codegraph-0.0.1.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,109 @@
+Metadata-Version: 2.4
+Name: victor-codegraph
+Version: 0.0.1
+Summary: Code->CPG chunker: tree-sitter symbol + relation extraction, size-capped chunks, ProximaRecord projection. Shared by Victor, ProximaDB SDK, and AnvaiOps.
+Author-email: Vijaykumar Singh <singhvjd@gmail.com>
+License: Apache-2.0
+Project-URL: Homepage, https://github.com/vjsingh1984/victor
+Project-URL: Repository, https://github.com/vjsingh1984/victor
+Keywords: code-graph,cpg,chunking,tree-sitter,ast,embeddings,rag
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: Apache Software License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Software Development :: Libraries
+Requires-Python: >=3.10
+Description-Content-Type: text/markdown
+Provides-Extra: treesitter
+Requires-Dist: tree-sitter>=0.23; extra == "treesitter"
+Requires-Dist: tree-sitter-language-pack>=1.0; extra == "treesitter"
+Provides-Extra: contracts
+Requires-Dist: victor-contracts<1.0,>=0.7.0; extra == "contracts"
+Provides-Extra: dev
+Requires-Dist: pytest>=8.0; extra == "dev"
+Requires-Dist: ruff>=0.5; extra == "dev"
+Requires-Dist: tree-sitter>=0.23; extra == "dev"
+Requires-Dist: tree-sitter-language-pack>=1.0; extra == "dev"
+# victor-codegraph
+Shared **code → Code-Property-Graph chunker**: tree-sitter symbol + relation extraction,
+size-capped embeddable chunks, and a `ProximaRecord` projection. One chunker, three
+consumers — Victor (owner), the ProximaDB SDK (`[codegraph]` extra), and AnvaiOps (SaaS
+code-graph vertical).
+> Design: ProximaDB `ADR-029` (authoritative) · Victor `ADR-014` (owner/donor) ·
+> AnvaiOps `ADR-0018` (consumer). This package is the **TD-CG1** scaffold.
+## Why
+The same tree-sitter code→symbol+relation chunker existed twice (ProximaDB SDK `code.py`
+and Victor `victor-coding`) and was about to be written a third time in AnvaiOps. This
+package is the single neutral home. It merges the best of both donors and fixes their two
+gaps:
+- **Size-capping** — ProximaDB's `code.py` emitted one chunk per symbol with *no* size
+  bound (a huge function became a huge chunk). Here, oversized symbols are body-split with
+  overlap (LlamaIndex `CodeSplitter` discipline). See `sizing.py`.
+- **Real JS/TS** — the donor JS/TS parser was a stub returning no symbols. Here JS/TS get a
+  real tree-sitter extractor (functions, classes, methods, `const … = () =>`, imports).
+## Install
+Not yet published to PyPI — use an **editable install** from the monorepo for now. Consumers
+(Victor, the ProximaDB SDK, AnvaiOps) reference it editable until the first `victor-codegraph-v*`
+release is cut.
+```bash
+# dev: editable, with tree-sitter grammars + test tooling
+make -C victor-codegraph dev          # = pip install -e ../victor-contracts && pip install -e ".[dev]"
+# minimal: Python-only (stdlib ast) path, zero native deps
+pip install -e ./victor-codegraph
+# once published:
+#   pip install victor-codegraph                 # Python path
+#   pip install "victor-codegraph[treesitter]"   # + multi-language grammars
+```
+### Releasing
+CI: `.github/workflows/ci-codegraph.yml` runs the suite (editable install, grammars on) for every
+PR touching `victor-codegraph/**`. Publishing: push a tag `victor-codegraph-v0.1.0` to trigger
+`.github/workflows/release-codegraph.yml`, which builds and publishes via **PyPI Trusted Publishing**
+(OIDC — no API token). Configure the publisher once on PyPI (owner `vjsingh1984`, repo `victor`,
+workflow `release-codegraph.yml`, environments `pypi` / `testpypi`); see the header of that workflow.
+## Use
+```python
+from victor_codegraph import chunk, parse, to_proxima_records, ChunkConfig
+# Size-capped, embeddable chunks:
+chunks = chunk(source, file_path="app/service.py", config=ChunkConfig(max_chunk_tokens=512))
+# Symbols + relations:
+parsed = parse(source, file_path="app/service.py")
+# Project to the ProximaDB substrate-keystone record shape (one symbol = row+node+vector):
+records = to_proxima_records(parsed, repo_graph_id="myrepo", branch_id="main",
+                             embedder=my_embed_fn)  # embedder optional
+```
+## Design principles (the "best posture" this encodes)
+1. Chunk at **symbol** granularity (not statement, not fixed-size).
+2. **AST-aligned and size-capped** — never split mid-statement, never exceed the budget.
+3. Extract **relations** (CALLS/EXTENDS/CONTAINS/…) and project to a CPG.
+4. **Deterministic IDs + content hash** → idempotent incremental re-index.
+5. **Graceful fallback chain**: python-ast → tree-sitter → sliding-window.
+6. Token budget **matched to the embedding model** (BGE-small 384-d ≈ 512 tokens).
+## Status
+`0.0.1` — TD-CG1 scaffold. Python (stdlib `ast`) is the primary, fully-offline path.
+Multi-language extraction is best-effort via tree-sitter; deeper per-language relation
+extraction (the donor parsers' Rust/Go/Java specifics) lands incrementally.

victor_codegraph-0.0.1.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,13 @@
+victor_codegraph/__init__.py,sha256=HFoP1DTpRjFEP6UJD-0ZshZ6miKtQlE54i0RUKo5kMc,1205
+victor_codegraph/adapter.py,sha256=x6h6X3t6Ks-bOCmKAfgq5n8fhq-ZlVlq10wLvdxQ7rU,3217
+victor_codegraph/config.py,sha256=RastshUhVaPQMltdLfQesxJlX2k4BCUPvxUc7YKOjtE,1793
+victor_codegraph/languages.py,sha256=qS4uwkEaEZqL3f7QM0cgkhpVsXws2PziMFRDkZ9VRFA,1672
+victor_codegraph/model.py,sha256=BVu3wBiGN5FgRCdzsYWSqjqhqWmmQyqjUgfv1SqW2DQ,3903
+victor_codegraph/parser.py,sha256=2pB02ygojCmefpUy94fceymZFqN4TwxIP1Zjscmuc54,3951
+victor_codegraph/python_parser.py,sha256=DSBKnF8GnruvOGAqmnqms-cL9saaxhln1N6OY8hedRM,9655
+victor_codegraph/sizing.py,sha256=K7B866OrrmbsbqGRUSEdkVqa3atDPHMVbEmXNYRoyEE,4446
+victor_codegraph/treesitter_parser.py,sha256=NKRWRjl3ryVfljHiD2yUEIXnAyXISt0m2UZdJRQLsmc,9813
+victor_codegraph-0.0.1.dist-info/METADATA,sha256=7l3h1akUIuM2PMC1pWmcqxkJBGkigR5LXscLOQLLULQ,5097
+victor_codegraph-0.0.1.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
+victor_codegraph-0.0.1.dist-info/top_level.txt,sha256=CrB8C8JPZO8WqMdhwbWlXCuBGfz31FjKgpx1mW09YnM,17
+victor_codegraph-0.0.1.dist-info/RECORD,,

victor_codegraph-0.0.1.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,5 @@
+Wheel-Version: 1.0
+Generator: setuptools (82.0.1)
+Root-Is-Purelib: true
+Tag: py3-none-any

victor_codegraph-0.0.1.dist-info/top_level.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+ victor_codegraph