victor-codegraph 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,47 @@
1
+ """victor-codegraph — shared code->CPG chunker.
2
+
3
+ One tree-sitter symbol+relation chunker, three consumers (Victor, ProximaDB SDK,
4
+ AnvaiOps). See ProximaDB ADR-029 / Victor ADR-014.
5
+
6
+ from victor_codegraph import chunk, parse, to_proxima_records, ChunkConfig
7
+
8
+ chunks = chunk(source, file_path="foo.py") # size-capped, embeddable
9
+ parsed = parse(source, file_path="foo.py") # symbols + relations
10
+ records = to_proxima_records(parsed, repo_graph_id="myrepo")
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ from .adapter import relation_to_record, symbol_to_record, to_proxima_records
16
+ from .config import ChunkConfig
17
+ from .languages import detect_language
18
+ from .model import (
19
+ CodeChunk,
20
+ CodeRelation,
21
+ CodeRelationType,
22
+ CodeSymbol,
23
+ CodeSymbolType,
24
+ ParsedCode,
25
+ SourceLocation,
26
+ )
27
+ from .parser import chunk, parse
28
+
29
# Package version string.
__version__ = "0.0.1"

# Names re-exported from the submodules as the package's public surface.
__all__ = [
    "__version__",
    # Entry points.
    "chunk",
    "parse",
    # Configuration and language detection.
    "ChunkConfig",
    "detect_language",
    # Projection to ProximaDB record shapes.
    "to_proxima_records",
    "symbol_to_record",
    "relation_to_record",
    # Data model.
    "CodeChunk",
    "CodeSymbol",
    "CodeRelation",
    "CodeSymbolType",
    "CodeRelationType",
    "ParsedCode",
    "SourceLocation",
]
@@ -0,0 +1,94 @@
1
+ """Projection to the ProximaDB substrate-keystone ``ProximaRecord`` shape.
2
+
3
+ Per ProximaDB ``CODE_GRAPH_CORRELATED_SUBSTRATE_2026_06_22.adoc`` a code symbol is *one*
4
+ record addressable as a relational row, a graph node, and a vector at once. This adapter
5
+ emits the **shape** as plain dicts — it does not import proximadb, embed, or write. The
6
+ consumer (Victor embedded, AnvaiOps service) supplies the embedder and the DB write.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from typing import Any, Callable
12
+
13
+ from .model import CodeRelation, CodeSymbol, ParsedCode
14
+
15
# Caller-supplied embedding function: takes source text, returns a vector of floats.
Embedder = Callable[[str], list[float]]


def _symbol_oid(repo_graph_id: str, symbol: CodeSymbol) -> str:
    """Return the graph-node object id for ``symbol`` within ``repo_graph_id``."""

    node_oid = f"graph/{repo_graph_id}/node/{symbol.id}"
    return node_oid
20
+
21
+
22
def symbol_to_record(
    symbol: CodeSymbol,
    repo_graph_id: str,
    branch_id: str = "main",
    embedder: Embedder | None = None,
    model_id: str = "bge-small-en-v1.5",
    dim: int = 384,
) -> dict[str, Any]:
    """Build the node record (row + graph node + optional vector) for one symbol.

    Args:
        symbol: The symbol to project.
        repo_graph_id: Graph namespace used to build the record's ``oid``.
        branch_id: Branch tag copied onto the record.
        embedder: Optional callable applied to ``symbol.source_code``. When it is
            ``None`` the record's ``embeddings`` list stays empty.
        model_id: Label stored alongside the vector.
        dim: Dimension stored alongside the vector. It is recorded as given and is
            not checked against the length of the vector the embedder returns.

    Returns:
        A plain dict with the keys ``oid``, ``labels``, ``branch_id``, ``props`` and
        ``embeddings``.
    """

    oid = _symbol_oid(repo_graph_id, symbol)
    location = symbol.location
    # Anything not explicitly tagged "private" is reported as public.
    visibility = "private" if "private" in symbol.modifiers else "public"

    props: dict[str, Any] = {
        "name": symbol.simple_name,
        "fully_qualified_name": symbol.fully_qualified_name,
        "file": location.file_path,
        "line": location.start_line,
        "end_line": location.end_line,
        "lang": symbol.language,
        "ast_kind": symbol.symbol_type.name,
        "signature": symbol.signature,
        "visibility": visibility,
        "module_path": "::".join(symbol.scope_chain),
        "snippet": symbol.source_code,
        "documentation": symbol.documentation,
    }

    embeddings: list[dict[str, Any]] = []
    if embedder is not None:
        vector = {
            "model_id": model_id,
            "modality": "code",
            "dim": dim,
            "values": embedder(symbol.source_code),
        }
        embeddings.append(vector)

    return {
        "oid": oid,
        "labels": ["graph_node", "code_symbol"],
        "branch_id": branch_id,
        "props": props,
        "embeddings": embeddings,
    }
63
+
64
+
65
def relation_to_record(relation: CodeRelation, repo_graph_id: str, branch_id: str = "main") -> dict[str, Any]:
    """Build the edge record for one relation.

    Both endpoints are addressed with the same ``graph/<repo>/node/<id>`` scheme used
    for node records. The endpoint ids are used verbatim, whatever they contain.
    """

    source_oid = f"graph/{repo_graph_id}/node/{relation.from_symbol_id}"
    target_oid = f"graph/{repo_graph_id}/node/{relation.to_symbol_id}"
    edge = {
        "from_oid": source_oid,
        "to_oid": target_oid,
        "edge_type": relation.relation_type.name,
    }
    props = {"confidence": relation.confidence, "context": relation.context}
    return {
        "labels": ["graph_edge"],
        "branch_id": branch_id,
        "edge": edge,
        "props": props,
    }
78
+
79
+
80
def to_proxima_records(
    parsed: ParsedCode,
    repo_graph_id: str,
    branch_id: str = "main",
    embedder: Embedder | None = None,
    model_id: str = "bge-small-en-v1.5",
    dim: int = 384,
) -> list[dict[str, Any]]:
    """Project an entire parsed file to node + edge records (shapes only).

    Args:
        parsed: Parse result whose ``symbols`` and ``relations`` are projected.
        repo_graph_id: Graph namespace used in every record's object ids.
        branch_id: Branch tag copied onto every record.
        embedder: Optional embedding callable forwarded to ``symbol_to_record``.
        model_id: Embedding model label forwarded to ``symbol_to_record``. Pass it
            together with ``dim`` whenever ``embedder`` is not the default model, so
            the stored vectors are not mislabelled.
        dim: Embedding dimension forwarded to ``symbol_to_record``.

    Returns:
        All node records (in ``parsed.symbols`` order) followed by all edge records
        (in ``parsed.relations`` order).
    """

    records = [
        symbol_to_record(s, repo_graph_id, branch_id, embedder, model_id, dim)
        for s in parsed.symbols
    ]
    records.extend(
        relation_to_record(r, repo_graph_id, branch_id) for r in parsed.relations
    )
    return records
@@ -0,0 +1,48 @@
1
+ """Chunking configuration — the size discipline both donor parsers needed.
2
+
3
+ ProximaDB's ``code.py`` had *no* size-capping (a huge function became one huge chunk);
4
+ LlamaIndex ``CodeSplitter`` and Victor's chunker both cap size. This config carries the
5
+ cap so the merged parser never emits an over-budget chunk.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from dataclasses import dataclass, field
11
+
12
+
13
@dataclass
class ChunkConfig:
    """Size + scope knobs for code chunking.

    The token budget is matched to the embedding model (BGE-small 384-d ~ 512 tokens),
    not an arbitrary char count. ``chars_per_token`` is conservative to avoid truncation.

    ``max_chunk_chars`` and ``chunk_overlap_chars`` are derived once, in
    ``__post_init__``; they are not recomputed if the token fields are changed later.
    """

    # Size cap (the gap fix). A symbol whose body exceeds the budget is split.
    max_chunk_tokens: int = 512
    chunk_overlap_tokens: int = 64
    chars_per_token: float = 3.5
    # Line-count threshold below which a symbol counts as "small".
    # NOTE(review): the sizing code still body-splits a small symbol whose text is
    # over the char budget, so this knob does not currently exempt anything.
    large_symbol_threshold_lines: int = 30

    # Scope filters.
    include_private: bool = True
    include_tests: bool = True
    extract_relations: bool = True
    # Restrict to these languages (None = all detectable).
    languages: list[str] | None = None

    # Computed budgets (chars), derived in __post_init__.
    max_chunk_chars: int = field(init=False, default=0)
    chunk_overlap_chars: int = field(init=False, default=0)

    def __post_init__(self) -> None:
        """Derive the character budgets from the token budgets."""

        # Never below one character, even for zero/negative inputs.
        self.max_chunk_chars = max(1, int(self.max_chunk_tokens * self.chars_per_token))
        # The overlap is clamped to [0, max_chunk_chars - 1] so it is always strictly
        # smaller than a full chunk.
        requested_overlap = int(self.chunk_overlap_tokens * self.chars_per_token)
        self.chunk_overlap_chars = max(0, min(requested_overlap, self.max_chunk_chars - 1))

    def estimate_tokens(self, text: str) -> int:
        """Conservative token estimate for ``text`` (always at least 1)."""

        if self.chars_per_token <= 0:
            # A non-positive ratio is tolerated by __post_init__ (it clamps), so do
            # not divide by it here: assume one token per character, the most
            # pessimistic estimate, instead of raising or returning a negative count.
            return len(text) + 1
        return int(len(text) / self.chars_per_token) + 1
@@ -0,0 +1,69 @@
1
+ """Language detection + the tree-sitter grammar name map.
2
+
3
+ The Python path uses the stdlib ``ast`` (no grammar needed); everything else routes to
4
+ the generic tree-sitter extractor when the grammar pack is installed.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import os
10
+
11
# Extension -> canonical language name. Written grouped by language; the resulting
# dict keeps extensions in the order they are listed here.
EXTENSION_TO_LANGUAGE: dict[str, str] = {
    extension: language
    for language, extensions in (
        ("python", (".py", ".pyi")),
        ("javascript", (".js", ".jsx", ".mjs", ".cjs")),
        ("typescript", (".ts",)),
        ("tsx", (".tsx",)),
        ("rust", (".rs",)),
        ("go", (".go",)),
        ("java", (".java",)),
        ("c", (".c", ".h")),
        ("cpp", (".cpp", ".cc", ".cxx", ".hpp")),
        ("csharp", (".cs",)),
        ("ruby", (".rb",)),
        ("php", (".php",)),
        ("swift", (".swift",)),
        ("kotlin", (".kt",)),
        ("scala", (".scala",)),
        ("bash", (".sh", ".bash")),
        ("lua", (".lua",)),
        ("sql", (".sql",)),
    )
    for extension in extensions
}
41
+
42
# Canonical language name -> tree-sitter-language-pack grammar name. Every entry is
# currently 1:1; the mapping is kept as an indirection so TS and TSX, which share one
# canonical extraction path, can still name separate grammars. Python has no entry.
TREE_SITTER_GRAMMAR: dict[str, str] = {
    language: language
    for language in (
        "javascript",
        "typescript",
        "tsx",
        "rust",
        "go",
        "java",
        "c",
        "cpp",
        "csharp",
        "ruby",
        "php",
        "swift",
        "kotlin",
        "scala",
        "bash",
        "lua",
        "sql",
    )
}
63
+
64
+
65
def detect_language(file_path: str) -> str | None:
    """Guess the canonical language of ``file_path`` from its extension.

    The extension is matched case-insensitively. Returns ``None`` when the path has
    no extension or the extension is not listed in ``EXTENSION_TO_LANGUAGE``.
    """

    extension = os.path.splitext(file_path)[1].lower()
    return EXTENSION_TO_LANGUAGE.get(extension)
@@ -0,0 +1,148 @@
1
+ """Canonical, neutral data model for code symbols, relations, and chunks.
2
+
3
+ This is the *union* of the two donor implementations (ProximaDB SDK ``code.py`` and
4
+ Victor ``victor-coding``): ProximaDB contributed the richer symbol/relation taxonomy;
5
+ Victor contributed the size-aware ``CodeChunk`` with hierarchical IDs. The model carries
6
+ no SaaS/DB/framework concept, so every consumer can depend on it freely.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import hashlib
12
+ from dataclasses import dataclass, field
13
+ from enum import IntEnum
14
+ from typing import Any
15
+
16
+
17
class CodeSymbolType(IntEnum):
    """Kind of an extracted code symbol; a superset covering several languages."""

    # Files and namespaces.
    FILE = 1
    MODULE = 2
    PACKAGE = 3

    # Type-like definitions.
    CLASS = 4
    INTERFACE = 5
    TRAIT = 6
    STRUCT = 7
    ENUM = 8

    # Callables.
    FUNCTION = 9
    METHOD = 10
    CONSTRUCTOR = 11

    # Members and values.
    PROPERTY = 12
    FIELD = 13
    CONSTANT = 14
    VARIABLE = 15
    PARAMETER = 16

    # Remaining kinds.
    TYPE_ALIAS = 17
    MACRO = 18
38
+
39
+
40
class CodeRelationType(IntEnum):
    """Kind of a directed relationship between two code symbols."""

    # Calls.
    CALLS = 1
    CALLED_BY = 2

    # Type hierarchy and type usage.
    EXTENDS = 3
    IMPLEMENTS = 4
    USES_TYPE = 5
    RETURNS_TYPE = 6

    # Imports and dependencies.
    IMPORTS = 7
    IMPORTED_BY = 8
    DEPENDS_ON = 9

    # Containment and definition.
    CONTAINS = 10
    CONTAINED_BY = 11
    DEFINES = 12

    # References and overrides.
    REFERENCES = 13
    REFERENCED_BY = 14
    OVERRIDES = 15
    OVERRIDDEN_BY = 16

    # Test coverage.
    TESTS = 17
    TESTED_BY = 18
61
+
62
+
63
@dataclass
class SourceLocation:
    """Position of a symbol (or call site) in a source file.

    Lines are 1-based; bytes are 0-based. Every numeric field defaults to 0 when it
    is not supplied.
    """

    # Path of the file the location refers to.
    file_path: str
    # Line/column span. The Python parser fills the columns from ``ast`` offsets.
    start_line: int = 0
    start_column: int = 0
    end_line: int = 0
    end_column: int = 0
    # Byte span of the symbol within the file.
    byte_offset: int = 0
    byte_length: int = 0
74
+
75
+
76
@dataclass
class CodeSymbol:
    """One semantic unit of code: a function, class, method, struct, and so on."""

    # Identity and classification.
    id: str
    symbol_type: CodeSymbolType
    fully_qualified_name: str
    simple_name: str
    # Where the symbol lives and what its text is.
    location: SourceLocation
    source_code: str
    language: str
    # Optional descriptive details; left at their defaults when a parser has none.
    documentation: str | None = None
    signature: str | None = None
    modifiers: list[str] = field(default_factory=list)
    scope_chain: list[str] = field(default_factory=list)
    parameters: list[dict[str, Any]] = field(default_factory=list)
    return_type: str | None = None
    complexity: dict[str, int] | None = None
    # Free-form extras.
    metadata: dict[str, Any] = field(default_factory=dict)
95
+
96
+
97
@dataclass
class CodeRelation:
    """A directed edge from one symbol to another, both referenced by id."""

    # Endpoints. The Python parser leaves an unresolved target as its textual name.
    from_symbol_id: str
    to_symbol_id: str
    relation_type: CodeRelationType
    # Optional location of the call that produced the edge.
    call_site: SourceLocation | None = None
    # Optional free-text context for the edge.
    context: str | None = None
    # Confidence in the edge; defaults to full confidence.
    confidence: float = 1.0
107
+
108
+
109
@dataclass
class ParsedCode:
    """Everything extracted from a single source file."""

    file_path: str
    language: str
    # Extracted graph content; each list is empty when nothing was extracted.
    symbols: list[CodeSymbol] = field(default_factory=list)
    relations: list[CodeRelation] = field(default_factory=list)
    imports: list[str] = field(default_factory=list)
    # Hash of the file content; empty string when not computed.
    content_hash: str = ""
119
+
120
+
121
@dataclass
class CodeChunk:
    """A size-capped, embeddable piece of text projected from a symbol.

    A symbol that fits the size budget becomes exactly one chunk. An oversized symbol
    is body-split into several chunks that share one ``symbol_id`` (see ``sizing``).
    ``chunk_id`` is hierarchical and deterministic, so an incremental re-index can be
    an idempotent upsert.
    """

    chunk_id: str
    text: str
    symbol_id: str
    # Position span associated with the chunk.
    start_pos: int
    end_pos: int
    # Descriptive details about the chunk and its parent symbol.
    metadata: dict[str, Any] = field(default_factory=dict)
136
+
137
+
138
def deterministic_symbol_id(file_path: str, name: str, line: int, column: int = 0) -> str:
    """Stable 16-hex id keyed on (file, name, line, col) — same input, same id.

    Args:
        file_path: Path of the file that defines the symbol.
        name: Simple name of the symbol.
        line: Line of the definition.
        column: Column of the definition.

    Returns:
        The first 16 hex characters of the SHA-256 of ``"file:name:line:column"``.
    """

    key = f"{file_path}:{name}:{line}:{column}"
    # "surrogatepass" gives identical bytes for every well-formed string, and also
    # encodes lone surrogates instead of raising UnicodeEncodeError. File names that
    # are not valid UTF-8 reach Python as strings containing such surrogates.
    encoded = key.encode("utf-8", errors="surrogatepass")
    return hashlib.sha256(encoded).hexdigest()[:16]
143
+
144
+
145
def content_hash(content: str) -> str:
    """SHA-256 of file content, for change detection on the re-index hot path.

    Returns the full 64-character hex digest of the UTF-8 encoding of ``content``.
    """

    # "surrogatepass" gives identical bytes for every well-formed string, and also
    # encodes lone surrogates (e.g. from text read with errors="surrogateescape")
    # instead of raising UnicodeEncodeError.
    encoded = content.encode("utf-8", errors="surrogatepass")
    return hashlib.sha256(encoded).hexdigest()
@@ -0,0 +1,120 @@
1
+ """Public entrypoints: ``parse`` (symbols+relations) and ``chunk`` (size-capped).
2
+
3
+ Fallback chain (Victor's posture): python-ast -> tree-sitter -> sliding-window. A parse
4
+ never hard-fails; an unknown or grammar-less language degrades to line-window chunks.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from .config import ChunkConfig
10
+ from .languages import detect_language
11
+ from .model import CodeChunk, ParsedCode, content_hash
12
+ from .python_parser import parse_python
13
+ from .sizing import chunks_for_symbol
14
+ from .treesitter_parser import GrammarUnavailable, parse_treesitter
15
+
16
+
17
def parse(content: str, language: str | None = None, file_path: str = "<unknown>") -> ParsedCode:
    """Parse source into symbols + relations, falling back gracefully.

    Args:
        content: Source text to parse.
        language: Canonical language name. When falsy it is detected from
            ``file_path``.
        file_path: Path used for language detection and recorded on the result.

    Returns:
        The result of the Python or tree-sitter parser when one succeeds. Otherwise a
        ``ParsedCode`` with no symbols, relations or imports, whose ``language`` is
        the resolved language or ``"text"`` when none could be determined.
    """

    language = language or detect_language(file_path)

    if language == "python":
        try:
            return parse_python(content, file_path)
        except (SyntaxError, ValueError, RecursionError):
            # SyntaxError: the source is not valid Python.
            # ValueError: NUL bytes on older CPython; also UnicodeEncodeError (a
            #   ValueError subclass) when the text contains lone surrogates.
            # RecursionError: pathologically nested source.
            # All of these degrade to the symbol-less result below instead of
            # propagating to the caller.
            pass
    elif language is not None:
        try:
            return parse_treesitter(content, file_path, language)
        except GrammarUnavailable:
            pass

    # Last resort: no symbols (caller's chunk() will sliding-window the raw text).
    return ParsedCode(
        file_path=file_path,
        language=language or "text",
        symbols=[],
        relations=[],
        imports=[],
        content_hash=content_hash(content),
    )
43
+
44
+
45
def _sliding_window(content: str, file_path: str, language: str, config: ChunkConfig) -> list[CodeChunk]:
    """Universal fallback when no symbols were extracted.

    Lines are packed greedily into consecutive, non-overlapping windows of at most
    ``config.max_chunk_chars`` characters. A single line longer than the budget is
    cut into budget-sized slices, so no window exceeds the budget.

    Args:
        content: Raw file text.
        file_path: Path recorded in the ids and metadata.
        language: Language name recorded in the metadata.
        config: Supplies ``max_chunk_chars``.

    Returns:
        The windows in file order; an empty list for empty ``content``. Each chunk
        uses ``"<file_path>#window#<index>"`` as both ``chunk_id`` and ``symbol_id``.
        ``start_pos`` and ``end_pos`` are always 0 (positions are not tracked here);
        the line span is in the metadata.
    """

    if not content:
        return []

    out: list[CodeChunk] = []

    def emit(text: str, first_line: int, last_line: int) -> None:
        # The index is the chunk's position in the output list.
        idx = len(out)
        window_id = f"{file_path}#window#{idx}"
        out.append(
            CodeChunk(
                chunk_id=window_id,
                text=text,
                symbol_id=window_id,
                start_pos=0,
                end_pos=0,
                metadata={
                    "file_path": file_path,
                    "language": language,
                    "chunk_index": idx,
                    "start_line": first_line,
                    "end_line": last_line,
                    "strategy": "sliding_window",
                },
            )
        )

    # Guard against a hand-edited config: the slicing step below must be positive.
    budget = max(1, config.max_chunk_chars)
    lines = content.splitlines(keepends=True)
    cur: list[str] = []
    cur_len = 0
    start_line = 1
    for n, ln in enumerate(lines, start=1):
        if cur and cur_len + len(ln) > budget:
            emit("".join(cur), start_line, n - 1)
            cur, cur_len, start_line = [], 0, n
        if len(ln) > budget:
            # ``cur`` is empty here: either it already was, or the flush above fired.
            # Slice the over-long line so that no window exceeds the budget.
            for k in range(0, len(ln), budget):
                emit(ln[k : k + budget], n, n)
            start_line = n + 1
            continue
        cur.append(ln)
        cur_len += len(ln)
    if cur:
        emit("".join(cur), start_line, len(lines))
    return out
99
+
100
+
101
def chunk(
    content: str,
    language: str | None = None,
    file_path: str = "<unknown>",
    config: ChunkConfig | None = None,
) -> list[CodeChunk]:
    """Parse ``content`` and project the result into size-capped chunks.

    When parsing yields no symbols the raw text is chunked by the sliding-window
    fallback. Otherwise every symbol is sized individually; symbols tagged
    ``"private"`` are skipped when ``config.include_private`` is false. A missing
    ``config`` means the ``ChunkConfig`` defaults.
    """

    cfg = config or ChunkConfig()
    parsed = parse(content, language, file_path)
    symbols = parsed.symbols

    if not symbols:
        return _sliding_window(content, file_path, parsed.language, cfg)

    skip_private = not cfg.include_private
    chunks: list[CodeChunk] = []
    for symbol in symbols:
        if skip_private and "private" in symbol.modifiers:
            continue
        chunks.extend(chunks_for_symbol(symbol, cfg))
    return chunks
@@ -0,0 +1,270 @@
1
+ """Python parser using the stdlib ``ast`` module.
2
+
3
+ This is the primary Python path (Victor's approach): it needs no native grammar, is
4
+ deterministic, and works fully offline. Extracts modules/classes/functions/methods with
5
+ signatures, docstrings, decorators, parameters, cyclomatic complexity, and CALLS edges.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import ast
11
+
12
+ from .model import (
13
+ CodeRelation,
14
+ CodeRelationType,
15
+ CodeSymbol,
16
+ CodeSymbolType,
17
+ ParsedCode,
18
+ SourceLocation,
19
+ content_hash,
20
+ deterministic_symbol_id,
21
+ )
22
+
23
# AST node types that each add one to the complexity count.
_BRANCH_NODES = (
    # Conditionals and loops.
    ast.If,
    ast.For,
    ast.AsyncFor,
    ast.While,
    # Context managers and exception handling.
    ast.With,
    ast.AsyncWith,
    ast.Try,
    ast.ExceptHandler,
    # Expression-level branching.
    ast.BoolOp,
    ast.IfExp,
    ast.comprehension,
)


def _cyclomatic(node: ast.AST) -> dict[str, int]:
    """Return a complexity count and the line span of ``node``.

    ``"cyclomatic"`` is one plus the number of ``_BRANCH_NODES`` instances found
    anywhere under ``node`` (the node itself included). ``"lines"`` is the inclusive
    line span; a node without line information counts as a single line.
    """

    branches = sum(1 for child in ast.walk(node) if isinstance(child, _BRANCH_NODES))
    first_line = getattr(node, "lineno", 1)
    last_line = getattr(node, "end_lineno", first_line) or first_line
    return {"cyclomatic": branches + 1, "lines": last_line - first_line + 1}
46
+
47
+
48
def _params(args: ast.arguments) -> list[dict]:
    """Describe the parameters of a function as a list of small dicts.

    Order: positional-only and regular parameters (any named ``self`` or ``cls`` are
    left out), then ``*args``, then keyword-only parameters, then ``**kwargs``.
    Annotated parameters carry a ``"type"`` entry; variadics are flagged
    ``"is_variadic"`` and keyword-only parameters ``"is_kwonly"``.
    """

    def describe(arg: ast.arg, **flags: bool) -> dict:
        entry: dict = {"name": arg.arg, **flags}
        if arg.annotation is not None:
            entry["type"] = ast.unparse(arg.annotation)
        return entry

    positional = [*getattr(args, "posonlyargs", []), *args.args]
    described: list[dict] = [describe(a) for a in positional if a.arg not in ("self", "cls")]

    if args.vararg is not None:
        described.append({"name": f"*{args.vararg.arg}", "is_variadic": True})

    described.extend(describe(a, is_kwonly=True) for a in args.kwonlyargs)

    if args.kwarg is not None:
        described.append({"name": f"**{args.kwarg.arg}", "is_variadic": True})
    return described
68
+
69
+
70
def _signature(name: str, args: ast.arguments, returns: ast.AST | None) -> str:
    """Render ``name(params) -> returns`` from a function's AST pieces.

    Parameters come from ``_params``, so they follow its order and omissions. The
    return arrow is added only when a return annotation is present.
    """

    rendered = []
    for param in _params(args):
        text = param["name"]
        rendered.append(text + f": {param['type']}" if param.get("type") else text)

    sig = f"{name}({', '.join(rendered)})"
    if returns is None:
        return sig
    return sig + f" -> {ast.unparse(returns)}"
81
+
82
+
83
def _modifiers(name: str, decorators: list[ast.expr], is_async: bool) -> list[str]:
    """List the modifier tags of a function.

    The result holds one ``@decorator`` entry per decorator, then ``"async"`` for
    coroutines, then at most one naming tag: ``"dunder"`` for ``__name__``-style
    names, otherwise ``"private"`` for names with a leading underscore.
    """

    decorator_tags = [f"@{ast.unparse(d)}" for d in decorators]
    async_tag = ["async"] if is_async else []

    if name.startswith("__") and name.endswith("__"):
        naming_tag = ["dunder"]
    elif name.startswith("_"):
        naming_tag = ["private"]
    else:
        naming_tag = []

    return [*decorator_tags, *async_tag, *naming_tag]
92
+
93
+
94
def _callee_name(call: ast.Call) -> str | None:
    """Return the textual name being called, or ``None`` when there is none.

    A bare name yields that name; an attribute access yields only the final
    attribute (``a.b.c()`` gives ``"c"``). Any other callee expression gives ``None``.
    """

    target = call.func
    if isinstance(target, ast.Attribute):
        return target.attr
    return target.id if isinstance(target, ast.Name) else None
101
+
102
+
103
class _Visitor:
    """Collect symbols, relations and imports from one parsed Python module.

    Only module-level definitions and the direct members of classes become symbols.
    A function nested inside another function is not emitted separately, but its
    calls are attributed to the enclosing function.
    """

    def __init__(self, file_path: str, source: str) -> None:
        self.file_path = file_path
        self.source = source
        # Path separators become dots; the rest of the path is kept as is.
        self._fqn_prefix = file_path.replace("/", ".").replace("\\", ".")
        self.symbols: list[CodeSymbol] = []
        self.relations: list[CodeRelation] = []
        self.imports: list[str] = []

    def _src(self, node: ast.AST) -> str:
        """Return the source text of ``node``, or ``""`` when it cannot be recovered."""
        try:
            segment = ast.get_source_segment(self.source, node)
        except Exception:
            return ""
        return segment or ""

    def _make_symbol(
        self,
        node: ast.AST,
        name: str,
        symbol_type: CodeSymbolType,
        scope: list[str],
        signature: str | None = None,
        params: list[dict] | None = None,
        return_type: str | None = None,
        modifiers: list[str] | None = None,
    ) -> CodeSymbol:
        """Build the ``CodeSymbol`` for ``node``; the caller records it."""
        first_line = getattr(node, "lineno", 1)
        last_line = getattr(node, "end_lineno", first_line) or first_line
        first_col = getattr(node, "col_offset", 0)

        location = SourceLocation(
            file_path=self.file_path,
            start_line=first_line,
            start_column=first_col,
            end_line=last_line,
            end_column=getattr(node, "end_col_offset", 0) or 0,
        )

        # Docstrings are only looked up on definitions that can carry one.
        docstring = None
        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
            docstring = ast.get_docstring(node)

        return CodeSymbol(
            id=deterministic_symbol_id(self.file_path, name, first_line, first_col),
            symbol_type=symbol_type,
            fully_qualified_name="::".join([self._fqn_prefix, *scope, name]),
            simple_name=name,
            location=location,
            source_code=self._src(node),
            language="python",
            documentation=docstring,
            signature=signature,
            modifiers=modifiers or [],
            scope_chain=list(scope),
            parameters=params or [],
            return_type=return_type,
            complexity=_cyclomatic(node),
        )

    def visit_function(
        self, node: ast.FunctionDef | ast.AsyncFunctionDef, scope: list[str]
    ) -> CodeSymbol:
        """Record a function or method symbol plus one CALLS edge per call inside it."""
        name = node.name
        if not scope:
            kind = CodeSymbolType.FUNCTION
        elif name == "__init__":
            kind = CodeSymbolType.CONSTRUCTOR
        else:
            kind = CodeSymbolType.METHOD

        returns = node.returns
        sym = self._make_symbol(
            node,
            name,
            kind,
            scope,
            signature=_signature(name, node.args, returns),
            params=_params(node.args),
            return_type=ast.unparse(returns) if returns is not None else None,
            modifiers=_modifiers(name, node.decorator_list, isinstance(node, ast.AsyncFunctionDef)),
        )
        self.symbols.append(sym)

        # CALLS edges. ``to_symbol_id`` holds the textual callee at this stage;
        # ``parse_python`` later swaps in a real in-file symbol id where it can and
        # otherwise keeps the bare name, so cross-file/external calls are not dropped.
        # ``call_site`` records where the call starts.
        for descendant in ast.walk(node):
            if not isinstance(descendant, ast.Call):
                continue
            callee = _callee_name(descendant)
            if not callee:
                continue
            site = SourceLocation(
                file_path=self.file_path,
                start_line=getattr(descendant, "lineno", 0),
                start_column=getattr(descendant, "col_offset", 0),
            )
            self.relations.append(
                CodeRelation(
                    from_symbol_id=sym.id,
                    to_symbol_id=callee,
                    relation_type=CodeRelationType.CALLS,
                    context=callee,
                    call_site=site,
                )
            )
        return sym

    def visit_class(self, node: ast.ClassDef, scope: list[str]) -> None:
        """Record a class symbol, its EXTENDS edges, and its direct members."""
        base_names = [ast.unparse(base) for base in node.bases]
        modifiers = [f"@{ast.unparse(d)}" for d in node.decorator_list]
        if base_names:
            modifiers.append(f"extends({','.join(base_names)})")

        cls = self._make_symbol(node, node.name, CodeSymbolType.CLASS, scope, modifiers=modifiers)
        self.symbols.append(cls)

        # One EXTENDS edge per base, targeting the base's textual name for now.
        self.relations.extend(
            CodeRelation(
                from_symbol_id=cls.id,
                to_symbol_id=base_name,
                relation_type=CodeRelationType.EXTENDS,
                context=base_name,
            )
            for base_name in base_names
        )

        member_scope = [*scope, node.name]
        for member in node.body:
            if isinstance(member, ast.ClassDef):
                self.visit_class(member, member_scope)
            elif isinstance(member, (ast.FunctionDef, ast.AsyncFunctionDef)):
                self.visit_function(member, member_scope)

    def run(self, tree: ast.Module) -> None:
        """Walk the module's top-level statements, collecting imports and symbols."""
        for statement in tree.body:
            if isinstance(statement, ast.ClassDef):
                self.visit_class(statement, [])
            elif isinstance(statement, (ast.FunctionDef, ast.AsyncFunctionDef)):
                self.visit_function(statement, [])
            elif isinstance(statement, (ast.Import, ast.ImportFrom)):
                self.imports.append(ast.unparse(statement))
234
+
235
+
236
def parse_python(content: str, file_path: str) -> ParsedCode:
    """Parse Python source into symbols + relations using the stdlib ``ast``.

    Args:
        content: Python source text.
        file_path: Path recorded on the result and used when building symbol ids.

    Returns:
        A ``ParsedCode`` holding the collected symbols, top-level imports, resolved
        relations, and the content hash.

    Raises:
        SyntaxError: If ``content`` is not valid Python (propagated from
            ``ast.parse``).
    """

    tree = ast.parse(content)
    v = _Visitor(file_path, content)
    v.run(tree)

    # Resolve CALLS/EXTENDS targets to real in-file symbol ids when possible.
    # Resolution is by simple name, so a name is only trusted when exactly one symbol
    # in the file carries it. A shared name (every class's ``__init__``, two
    # classes' ``run`` ...) cannot be attributed reliably, so such targets are
    # treated like any other unresolved target instead of being pinned to an
    # arbitrary symbol with full confidence.
    ids_by_name: dict[str, list[str]] = {}
    for s in v.symbols:
        ids_by_name.setdefault(s.simple_name, []).append(s.id)

    # Unresolved targets (external / cross-file / ambiguous callees and bases) are
    # RETAINED with ``to_symbol_id`` = the textual name and ``confidence`` < 1.0, so
    # consumers that need outgoing-call coverage (e.g. a CPG's blast radius) are not
    # silently lossy. Only unambiguous self-references (recursive calls) are dropped.
    resolved: list[CodeRelation] = []
    for r in v.relations:
        candidates = ids_by_name.get(r.to_symbol_id, [])
        target_id = candidates[0] if len(candidates) == 1 else None
        if target_id is not None and target_id == r.from_symbol_id:
            continue  # self-reference (recursive call) — emit no self-edge
        resolved.append(
            CodeRelation(
                from_symbol_id=r.from_symbol_id,
                to_symbol_id=target_id if target_id is not None else r.to_symbol_id,
                relation_type=r.relation_type,
                context=r.context,
                call_site=r.call_site,
                confidence=1.0 if target_id is not None else 0.5,
            )
        )
    return ParsedCode(
        file_path=file_path,
        language="python",
        symbols=v.symbols,
        relations=resolved,
        imports=v.imports,
        content_hash=content_hash(content),
    )
@@ -0,0 +1,121 @@
1
+ """Size-capping / body-split — the discipline ProximaDB's ``code.py`` lacked.
2
+
3
+ A symbol within budget yields one chunk. An oversized symbol is split into
4
+ line-aligned, overlapping sub-chunks (LlamaIndex ``CodeSplitter`` style: respect
5
+ structure, but never exceed ``max_chunk_chars``). Sub-chunks share the parent
6
+ ``symbol_id`` and carry hierarchical, deterministic ``chunk_id``s (Victor's pattern).
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from .config import ChunkConfig
12
+ from .model import CodeChunk, CodeSymbol
13
+
14
+
15
def _base_metadata(symbol: CodeSymbol) -> dict:
    """Return the metadata shared by every chunk cut from ``symbol``.

    The ``modifiers`` and ``scope_chain`` lists are copied, so changing the returned
    dict's lists does not change the symbol's. ``complexity`` is passed by reference.
    """

    location = symbol.location
    return dict(
        symbol_id=symbol.id,
        symbol_type=symbol.symbol_type.name,
        fully_qualified_name=symbol.fully_qualified_name,
        simple_name=symbol.simple_name,
        language=symbol.language,
        file_path=location.file_path,
        start_line=location.start_line,
        end_line=location.end_line,
        signature=symbol.signature,
        documentation=symbol.documentation,
        modifiers=list(symbol.modifiers),
        scope_chain=list(symbol.scope_chain),
        return_type=symbol.return_type,
        complexity=symbol.complexity,
    )
32
+
33
+
34
def chunks_for_symbol(symbol: CodeSymbol, config: ChunkConfig) -> list[CodeChunk]:
    """Project one symbol into one or more size-capped chunks.

    The only criterion is the character budget: a symbol whose text fits
    ``config.max_chunk_chars`` becomes a single chunk with id ``"<symbol id>#0"``;
    anything larger is handed to ``_body_split``, however few lines it has.

    Args:
        symbol: The symbol to chunk.
        config: Supplies the character budget.

    Returns:
        A one-element list for a symbol that fits, otherwise the body-split chunks.
    """

    source = symbol.source_code
    if len(source) > config.max_chunk_chars:
        return _body_split(symbol, config)

    meta = _base_metadata(symbol)
    meta["chunk_index"] = 0
    meta["chunk_total"] = 1
    start = symbol.location.byte_offset
    # "surrogatepass" measures well-formed text exactly like the strict codec and
    # keeps text containing lone surrogates from raising UnicodeEncodeError.
    byte_length = len(source.encode("utf-8", errors="surrogatepass"))
    return [
        CodeChunk(
            chunk_id=f"{symbol.id}#0",
            text=source,
            symbol_id=symbol.id,
            start_pos=start,
            end_pos=start + byte_length,
            metadata=meta,
        )
    ]
61
+
62
+
63
def _body_split(symbol: CodeSymbol, config: ChunkConfig) -> list[CodeChunk]:
    """Split an oversized symbol body into overlapping, line-aligned sub-chunks.

    Lines are packed greedily into windows of at most ``config.max_chunk_chars``
    characters. Each new window starts with a tail of whole lines from the previous
    one, up to ``config.chunk_overlap_chars``; the tail is shortened when it would
    push the new window over the budget together with the next line. A single line
    longer than the budget is cut into budget-sized slices, so no text is dropped
    and no window exceeds the budget.

    Args:
        symbol: The symbol whose ``source_code`` is split.
        config: Supplies the character budget and the overlap.

    Returns:
        Chunks with ids ``"<symbol id>#body#<index>"`` that share ``symbol.id``.
        Each chunk's metadata carries its own ``start_line``/``end_line``.
        ``start_pos``/``end_pos`` span the whole parent symbol on every chunk.
    """

    lines = symbol.source_code.splitlines(keepends=True)
    # Guard against a hand-edited config: the slicing step below must be positive.
    max_chars = max(1, config.max_chunk_chars)
    overlap_chars = config.chunk_overlap_chars

    # (first_line_offset, last_line_offset, text), offsets relative to the symbol.
    windows: list[tuple[int, int, str]] = []
    cur: list[str] = []
    cur_len = 0
    cur_start = 0
    for i, ln in enumerate(lines):
        if cur and cur_len + len(ln) > max_chars:
            windows.append((cur_start, i - 1, "".join(cur)))
            # Walk back from the end of the flushed window to pick the overlap tail.
            # It must fit the overlap budget AND leave room for the incoming line.
            tail_len = 0
            j = i - 1
            while j >= cur_start:
                grown = tail_len + len(lines[j])
                if grown > overlap_chars or grown + len(ln) > max_chars:
                    break
                tail_len = grown
                j -= 1
            cur_start = j + 1
            cur = lines[cur_start:i]
            cur_len = tail_len
        if len(ln) > max_chars:
            # ``cur`` is empty here: no tail can share a window with this line.
            # Slice the line (degenerate minified case) instead of truncating it.
            for k in range(0, len(ln), max_chars):
                windows.append((i, i, ln[k : k + max_chars]))
            cur_start = i + 1
            continue
        cur.append(ln)
        cur_len += len(ln)
    if cur:
        windows.append((cur_start, len(lines) - 1, "".join(cur)))

    total = len(windows)
    base_line = symbol.location.start_line
    # Loop-invariant: every sub-chunk reports the parent symbol's byte span.
    symbol_start = symbol.location.byte_offset
    symbol_end = symbol_start + len(symbol.source_code.encode("utf-8", errors="surrogatepass"))

    out: list[CodeChunk] = []
    for idx, (first_off, last_off, text) in enumerate(windows):
        meta = _base_metadata(symbol)
        meta["chunk_index"] = idx
        meta["chunk_total"] = total
        meta["is_body_split"] = True
        # Narrow the line span from the whole symbol to this window.
        meta["start_line"] = base_line + first_off
        meta["end_line"] = base_line + last_off
        out.append(
            CodeChunk(
                chunk_id=f"{symbol.id}#body#{idx}",
                text=text,
                symbol_id=symbol.id,
                start_pos=symbol_start,
                end_pos=symbol_end,
                metadata=meta,
            )
        )
    return out
@@ -0,0 +1,268 @@
1
+ """Generic tree-sitter parser for non-Python languages.
2
+
3
+ Includes a *real* JavaScript/TypeScript extractor — the donor ProximaDB ``code.py``
4
+ shipped a JS/TS stub that returned no symbols (CLAUDE-mandate "plausible-but-wrong"
5
+ failure). Here JS/TS, plus a language-agnostic node-walk for common grammars, produce
6
+ functions, classes, methods, and imports.
7
+
8
+ Requires the ``treesitter`` extra (``tree-sitter`` + ``tree-sitter-language-pack``).
9
+ If the grammar is unavailable, :func:`parse_treesitter` raises :class:`GrammarUnavailable`
10
+ so the orchestrator can fall back.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ from .languages import TREE_SITTER_GRAMMAR
16
+ from .model import (
17
+ CodeRelation,
18
+ CodeRelationType,
19
+ CodeSymbol,
20
+ CodeSymbolType,
21
+ ParsedCode,
22
+ SourceLocation,
23
+ content_hash,
24
+ deterministic_symbol_id,
25
+ )
26
+
27
+
28
class GrammarUnavailable(RuntimeError):
    """Signal that no usable tree-sitter grammar could be loaded for a language.

    Raised so that the caller can fall back to another parsing strategy.
    """
30
+
31
+
32
# tree-sitter node types treated as callables (functions / methods).
_FUNC_NODES = {
    # generic
    "function_declaration",
    "function_definition",
    "function",
    "arrow_function",
    # rust
    "function_item",
    # js/ts
    "method_definition",
    # java/go
    "method_declaration",
}

# tree-sitter node types treated as type containers (class / struct / interface).
_CLASS_NODES = {
    "class_declaration",
    "class_definition",
    "class_specifier",
    "struct_specifier",
    "interface_declaration",
    # rust
    "struct_item",
    "impl_item",
}

# tree-sitter node types treated as import statements.
_IMPORT_NODES = {
    "import_statement",
    "import_declaration",
    "import_from_statement",
    # rust
    "use_declaration",
    # c/cpp
    "preproc_include",
}
58
+
59
+
60
def _get_parser(grammar: str):
    """Return a ``tree_sitter.Parser`` configured for *grammar*.

    The parser is built from the language pack's ``Language`` object using the
    official ``tree_sitter.Parser`` class instead of the pack's own
    ``get_parser()``: per the original author's note, the latter can hand back a
    vendored binding whose nodes lack ``.type`` / ``.children``.

    Raises:
        GrammarUnavailable: if tree-sitter (or the language pack) cannot be
            imported, or the pack cannot provide *grammar*.
    """
    try:
        from tree_sitter import Parser
        from tree_sitter_language_pack import get_language
    except Exception as exc:  # ImportError or native load failure
        raise GrammarUnavailable(f"tree-sitter unavailable: {exc}") from exc

    try:
        lang = get_language(grammar)
    except Exception as exc:
        raise GrammarUnavailable(f"grammar '{grammar}' unavailable: {exc}") from exc

    try:
        # tree_sitter >= 0.23 accepts the language in the constructor.
        parser = Parser(lang)
    except TypeError:
        # Older API: construct bare, then assign the language attribute.
        parser = Parser()
        parser.language = lang
    return parser
80
+
81
+
82
def _text(node, src: bytes) -> str:
    """Return the source text spanned by *node*.

    Slices *src* by the node's ``start_byte``/``end_byte`` and decodes it as
    UTF-8, substituting the replacement character for undecodable bytes.
    """
    raw = src[node.start_byte : node.end_byte]
    return raw.decode("utf-8", errors="replace")
84
+
85
+
86
def _attr(obj, name):
    """Read attribute *name* from *obj*, calling it first if it is callable.

    Some tree-sitter bindings expose values such as ``root_node`` as zero-arg
    methods while the canonical ``tree_sitter`` package exposes them as
    properties. The values read through this helper (lists, nodes, ints) are
    not themselves callable, so "call it if callable" covers both shapes.
    """
    value = getattr(obj, name)
    if callable(value):
        return value()
    return value
96
+
97
+
98
def _children(node):
    """List the child nodes of *node*, whatever binding flavor produced it.

    ``children`` is used when present, either as a property or as a zero-arg
    method. When it is missing (or ``None``) the children are fetched one by
    one through ``child_count`` and ``child(i)``.
    """
    kids = getattr(node, "children", None)
    if kids is None:
        total = _attr(node, "child_count")
        return [node.child(idx) for idx in range(total)]
    if callable(kids):
        return kids()
    return kids
110
+
111
+
112
def _name_of(node, src: bytes) -> str | None:
    """Find the declared name of a function/class node (grammar-agnostic).

    Lookup order:

    1. The node's ``name`` field, when the grammar provides one.
    2. The ``declarator`` field chain. C-family grammars put a function's name
       inside nested declarators rather than in a ``name`` field, and the
       node's direct children start with the *return type*, so the chain must
       be tried before scanning direct children.
    3. The first direct child that is an identifier-like node.

    Returns:
        The name text, or ``None`` when no name could be located.
    """
    ident_types = (
        "identifier",
        "type_identifier",
        "field_identifier",
        "property_identifier",
    )

    named = node.child_by_field_name("name")
    if named is not None:
        return _text(named, src)

    decl = node.child_by_field_name("declarator")
    # Declarator chains are shallow; the bound only guards against odd grammars.
    for _ in range(32):
        if decl is None:
            break
        if decl.type in ident_types:
            return _text(decl, src)
        # e.g. a qualified name node that carries its own ``name`` field.
        inner = decl.child_by_field_name("name")
        if inner is not None:
            return _text(inner, src)
        decl = decl.child_by_field_name("declarator")

    for child in _children(node):
        if child.type in ident_types:
            return _text(child, src)
    return None
122
+
123
+
124
def _walk_collect(node, src, file_path, language, scope, symbols, relations):
    """Collect class-like and function-like symbols beneath *node*.

    Descendants of *node* are visited in document (pre-)order:

    * A class-like node (``_CLASS_NODES``) yields a STRUCT / INTERFACE / CLASS
      symbol, then its ``body`` field (or the node itself when there is no
      body) is searched with the class name appended to the scope chain.
    * A function-like node (``_FUNC_NODES``) yields a METHOD symbol when a scope
      is active, otherwise a FUNCTION; the names ``constructor``, ``__init__``
      and ``new`` yield a CONSTRUCTOR. Function bodies are not searched, so
      nested definitions are not emitted. Unnamed arrow functions are skipped.
    * Any other node is searched transparently.

    The traversal uses an explicit stack rather than recursion, so deeply
    nested syntax trees cannot exhaust Python's recursion limit.

    Args:
        node: Node whose children are searched.
        src: UTF-8 source bytes the tree was parsed from.
        file_path: Path recorded on every emitted symbol.
        language: Language name recorded on every emitted symbol.
        scope: Names of the enclosing containers, outermost first.
        symbols: Output list; new symbols are appended in document order.
        relations: Accepted for signature compatibility; not modified here.
    """
    # Children are pushed in reverse so that pop() yields them in source order,
    # and a node's descendants are handled before its next sibling.
    pending = [(child, scope) for child in reversed(list(_children(node)))]
    while pending:
        child, chain = pending.pop()
        t = child.type
        if t in _CLASS_NODES:
            name = _name_of(child, src) or "<anonymous>"
            if t.startswith("struct"):
                stype = CodeSymbolType.STRUCT
            elif "interface" in t:
                stype = CodeSymbolType.INTERFACE
            else:
                stype = CodeSymbolType.CLASS
            symbols.append(_mk(child, src, file_path, language, name, stype, chain))
            # Search the class body with the class name on the scope chain.
            body = child.child_by_field_name("body")
            container = body if body is not None else child
            inner_chain = [*chain, name]
            pending.extend(
                (member, inner_chain) for member in reversed(list(_children(container)))
            )
        elif t in _FUNC_NODES:
            name = _name_of(child, src)
            if name is None and t == "arrow_function":
                continue  # anonymous arrow not bound to a name; skip
            name = name or "<anonymous>"
            stype = CodeSymbolType.METHOD if chain else CodeSymbolType.FUNCTION
            if name in ("constructor", "__init__", "new"):
                stype = CodeSymbolType.CONSTRUCTOR
            symbols.append(_mk(child, src, file_path, language, name, stype, chain))
            # Function bodies are intentionally not searched (symbols stay flat).
        else:
            pending.extend((sub, chain) for sub in reversed(list(_children(child))))
161
+
162
+
163
def _handle_const_arrow(node, src, file_path, language, symbols):
    """Emit FUNCTION symbols for arrow functions bound in a declaration.

    Covers JS/TS ``const foo = (...) => {...}`` (also when the declaration was
    unwrapped from an ``export`` statement by the caller). Each
    ``variable_declarator`` child of *node* whose ``value`` is an
    ``arrow_function`` is appended to *symbols* with an empty scope chain; the
    symbol spans the whole declarator.
    """
    for declarator in _children(node):
        if declarator.type != "variable_declarator":
            continue
        ident = declarator.child_by_field_name("name")
        init = declarator.child_by_field_name("value")
        if ident is None or init is None or init.type != "arrow_function":
            continue
        bound_name = _text(ident, src)
        symbol = _mk(
            declarator, src, file_path, language, bound_name, CodeSymbolType.FUNCTION, []
        )
        symbols.append(symbol)
176
+
177
+
178
def _mk(node, src, file_path, language, name, stype, scope) -> CodeSymbol:
    """Build a ``CodeSymbol`` describing *node*.

    Lines are reported 1-based (tree-sitter rows are 0-based); columns are
    passed through unchanged. The fully qualified name joins the file path
    (with ``/`` replaced by ``.``), the scope chain and *name* using ``::``.
    The symbol id is derived from the file path, name, start line and start
    column.
    """
    start_col = node.start_point[1]
    first_line = node.start_point[0] + 1
    last_line = node.end_point[0] + 1

    qualified_parts = [file_path.replace("/", "."), *scope, name]
    location = SourceLocation(
        file_path=file_path,
        start_line=first_line,
        start_column=start_col,
        end_line=last_line,
        end_column=node.end_point[1],
        byte_offset=node.start_byte,
        byte_length=node.end_byte - node.start_byte,
    )
    return CodeSymbol(
        id=deterministic_symbol_id(file_path, name, first_line, start_col),
        symbol_type=stype,
        fully_qualified_name="::".join(qualified_parts),
        simple_name=name,
        location=location,
        source_code=_text(node, src),
        language=language,
        scope_chain=list(scope),
        complexity={"lines": last_line - first_line + 1},
    )
201
+
202
+
203
def parse_treesitter(content: str, file_path: str, language: str) -> ParsedCode:
    """Parse non-Python source via tree-sitter into symbols, relations and imports.

    Args:
        content: Source text of the file.
        file_path: Path recorded on the result and on every symbol.
        language: Language key looked up in ``TREE_SITTER_GRAMMAR``.

    Returns:
        A ``ParsedCode`` holding the collected symbols, best-effort CONTAINS
        relations, the text of top-level import statements, and the content
        hash of *content*.

    Raises:
        GrammarUnavailable: if *language* has no grammar mapping or the grammar
            cannot be loaded, so the caller can fall back to another strategy.
    """
    grammar = TREE_SITTER_GRAMMAR.get(language)
    if grammar is None:
        raise GrammarUnavailable(f"no grammar mapping for language '{language}'")
    parser = _get_parser(grammar)

    src = content.encode("utf-8")
    # tree-sitter's Parser.parse() takes bytes on most builds, but some
    # (notably via tree-sitter-language-pack) require a str and raise
    # TypeError on bytes. Node byte offsets are UTF-8 positions either way,
    # so `src` stays valid for slicing in _text() regardless of which we pass.
    try:
        tree = parser.parse(src)
    except TypeError:
        tree = parser.parse(content)
    root = _attr(tree, "root_node")
    top_level = _children(root)

    symbols: list[CodeSymbol] = []
    relations: list[CodeRelation] = []
    # Only top-level import statements are recorded.
    imports: list[str] = [
        _text(child, src) for child in top_level if child.type in _IMPORT_NODES
    ]

    _walk_collect(root, src, file_path, language, [], symbols, relations)

    # JS/TS arrow-function-as-const: `const foo = () => {}` at module level.
    if language in ("javascript", "typescript", "tsx"):
        decl_types = ("lexical_declaration", "variable_declaration")
        for child in top_level:
            target = child
            # Unwrap `export const ...`. Children go through _children() so
            # every binding flavor is handled the same way.
            if child.type == "export_statement":
                target = next(
                    (c for c in _children(child) if c.type in decl_types), child
                )
            if target.type in decl_types:
                _handle_const_arrow(target, src, file_path, language, symbols)

    # Best-effort CONTAINS edges (container -> its members), matched by the
    # innermost scope name. _walk_collect scopes members under struct and
    # interface containers as well as classes, so all three kinds are indexed.
    # Classes are written last so they keep precedence on a name collision.
    by_scope: dict[str, str] = {
        s.simple_name: s.id
        for s in symbols
        if s.symbol_type in (CodeSymbolType.STRUCT, CodeSymbolType.INTERFACE)
    }
    by_scope.update(
        {s.simple_name: s.id for s in symbols if s.symbol_type == CodeSymbolType.CLASS}
    )
    for s in symbols:
        if not s.scope_chain:
            continue
        parent = by_scope.get(s.scope_chain[-1])
        if parent is not None and parent != s.id:
            relations.append(
                CodeRelation(
                    from_symbol_id=parent,
                    to_symbol_id=s.id,
                    relation_type=CodeRelationType.CONTAINS,
                )
            )

    return ParsedCode(
        file_path=file_path,
        language=language,
        symbols=symbols,
        relations=relations,
        imports=imports,
        content_hash=content_hash(content),
    )
@@ -0,0 +1,109 @@
1
+ Metadata-Version: 2.4
2
+ Name: victor-codegraph
3
+ Version: 0.0.1
4
+ Summary: Code->CPG chunker: tree-sitter symbol + relation extraction, size-capped chunks, ProximaRecord projection. Shared by Victor, ProximaDB SDK, and AnvaiOps.
5
+ Author-email: Vijaykumar Singh <singhvjd@gmail.com>
6
+ License: Apache-2.0
7
+ Project-URL: Homepage, https://github.com/vjsingh1984/victor
8
+ Project-URL: Repository, https://github.com/vjsingh1984/victor
9
+ Keywords: code-graph,cpg,chunking,tree-sitter,ast,embeddings,rag
10
+ Classifier: Development Status :: 3 - Alpha
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: License :: OSI Approved :: Apache Software License
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Topic :: Software Development :: Libraries
18
+ Requires-Python: >=3.10
19
+ Description-Content-Type: text/markdown
20
+ Provides-Extra: treesitter
21
+ Requires-Dist: tree-sitter>=0.23; extra == "treesitter"
22
+ Requires-Dist: tree-sitter-language-pack>=1.0; extra == "treesitter"
23
+ Provides-Extra: contracts
24
+ Requires-Dist: victor-contracts<1.0,>=0.7.0; extra == "contracts"
25
+ Provides-Extra: dev
26
+ Requires-Dist: pytest>=8.0; extra == "dev"
27
+ Requires-Dist: ruff>=0.5; extra == "dev"
28
+ Requires-Dist: tree-sitter>=0.23; extra == "dev"
29
+ Requires-Dist: tree-sitter-language-pack>=1.0; extra == "dev"
30
+
31
+ # victor-codegraph
32
+
33
+ Shared **code → Code-Property-Graph chunker**: tree-sitter symbol + relation extraction,
34
+ size-capped embeddable chunks, and a `ProximaRecord` projection. One chunker, three
35
+ consumers — Victor (owner), the ProximaDB SDK (`[codegraph]` extra), and AnvaiOps (SaaS
36
+ code-graph vertical).
37
+
38
+ > Design: ProximaDB `ADR-029` (authoritative) · Victor `ADR-014` (owner/donor) ·
39
+ > AnvaiOps `ADR-0018` (consumer). This package is the **TD-CG1** scaffold.
40
+
41
+ ## Why
42
+
43
+ The same tree-sitter code→symbol+relation chunker existed twice (ProximaDB SDK `code.py`
44
+ and Victor `victor-coding`) and was about to be written a third time in AnvaiOps. This
45
+ package is the single neutral home. It merges the best of both donors and fixes their two
46
+ gaps:
47
+
48
+ - **Size-capping** — ProximaDB's `code.py` emitted one chunk per symbol with *no* size
49
+ bound (a huge function became a huge chunk). Here, oversized symbols are body-split with
50
+ overlap (LlamaIndex `CodeSplitter` discipline). See `sizing.py`.
51
+ - **Real JS/TS** — the donor JS/TS parser was a stub returning no symbols. Here JS/TS get a
52
+ real tree-sitter extractor (functions, classes, methods, `const … = () =>`, imports).
53
+
54
+ ## Install
55
+
56
+ Not yet published to PyPI — use an **editable install** from the monorepo for now. Consumers
57
+ (Victor, the ProximaDB SDK, AnvaiOps) reference it editable until the first `victor-codegraph-v*`
58
+ release is cut.
59
+
60
+ ```bash
61
+ # dev: editable, with tree-sitter grammars + test tooling
62
+ make -C victor-codegraph dev # = pip install -e ../victor-contracts && pip install -e ".[dev]"
63
+
64
+ # minimal: Python-only (stdlib ast) path, zero native deps
65
+ pip install -e ./victor-codegraph
66
+
67
+ # once published:
68
+ # pip install victor-codegraph # Python path
69
+ # pip install "victor-codegraph[treesitter]" # + multi-language grammars
70
+ ```
71
+
72
+ ### Releasing
73
+
74
+ CI: `.github/workflows/ci-codegraph.yml` runs the suite (editable install, grammars on) for every
75
+ PR touching `victor-codegraph/**`. Publishing: push a tag `victor-codegraph-v0.1.0` to trigger
76
+ `.github/workflows/release-codegraph.yml`, which builds and publishes via **PyPI Trusted Publishing**
77
+ (OIDC — no API token). Configure the publisher once on PyPI (owner `vjsingh1984`, repo `victor`,
78
+ workflow `release-codegraph.yml`, environments `pypi` / `testpypi`); see the header of that workflow.
79
+
80
+ ## Use
81
+
82
+ ```python
83
+ from victor_codegraph import chunk, parse, to_proxima_records, ChunkConfig
84
+
85
+ # Size-capped, embeddable chunks:
86
+ chunks = chunk(source, file_path="app/service.py", config=ChunkConfig(max_chunk_tokens=512))
87
+
88
+ # Symbols + relations:
89
+ parsed = parse(source, file_path="app/service.py")
90
+
91
+ # Project to the ProximaDB substrate-keystone record shape (one symbol = row+node+vector):
92
+ records = to_proxima_records(parsed, repo_graph_id="myrepo", branch_id="main",
93
+ embedder=my_embed_fn) # embedder optional
94
+ ```
95
+
96
+ ## Design principles (the "best posture" this encodes)
97
+
98
+ 1. Chunk at **symbol** granularity (not statement, not fixed-size).
99
+ 2. **AST-aligned and size-capped** — never split mid-statement, never exceed the budget.
100
+ 3. Extract **relations** (CALLS/EXTENDS/CONTAINS/…) and project to a CPG.
101
+ 4. **Deterministic IDs + content hash** → idempotent incremental re-index.
102
+ 5. **Graceful fallback chain**: python-ast → tree-sitter → sliding-window.
103
+ 6. Token budget **matched to the embedding model** (BGE-small 384-d ≈ 512 tokens).
104
+
105
+ ## Status
106
+
107
+ `0.0.1` — TD-CG1 scaffold. Python (stdlib `ast`) is the primary, fully-offline path.
108
+ Multi-language extraction is best-effort via tree-sitter; deeper per-language relation
109
+ extraction (the donor parsers' Rust/Go/Java specifics) lands incrementally.
@@ -0,0 +1,13 @@
1
+ victor_codegraph/__init__.py,sha256=HFoP1DTpRjFEP6UJD-0ZshZ6miKtQlE54i0RUKo5kMc,1205
2
+ victor_codegraph/adapter.py,sha256=x6h6X3t6Ks-bOCmKAfgq5n8fhq-ZlVlq10wLvdxQ7rU,3217
3
+ victor_codegraph/config.py,sha256=RastshUhVaPQMltdLfQesxJlX2k4BCUPvxUc7YKOjtE,1793
4
+ victor_codegraph/languages.py,sha256=qS4uwkEaEZqL3f7QM0cgkhpVsXws2PziMFRDkZ9VRFA,1672
5
+ victor_codegraph/model.py,sha256=BVu3wBiGN5FgRCdzsYWSqjqhqWmmQyqjUgfv1SqW2DQ,3903
6
+ victor_codegraph/parser.py,sha256=2pB02ygojCmefpUy94fceymZFqN4TwxIP1Zjscmuc54,3951
7
+ victor_codegraph/python_parser.py,sha256=DSBKnF8GnruvOGAqmnqms-cL9saaxhln1N6OY8hedRM,9655
8
+ victor_codegraph/sizing.py,sha256=K7B866OrrmbsbqGRUSEdkVqa3atDPHMVbEmXNYRoyEE,4446
9
+ victor_codegraph/treesitter_parser.py,sha256=NKRWRjl3ryVfljHiD2yUEIXnAyXISt0m2UZdJRQLsmc,9813
10
+ victor_codegraph-0.0.1.dist-info/METADATA,sha256=7l3h1akUIuM2PMC1pWmcqxkJBGkigR5LXscLOQLLULQ,5097
11
+ victor_codegraph-0.0.1.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
12
+ victor_codegraph-0.0.1.dist-info/top_level.txt,sha256=CrB8C8JPZO8WqMdhwbWlXCuBGfz31FjKgpx1mW09YnM,17
13
+ victor_codegraph-0.0.1.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1 @@
1
+ victor_codegraph