victor-codegraph 0.1.0__tar.gz → 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {victor_codegraph-0.1.0 → victor_codegraph-0.1.1}/PKG-INFO +1 -1
- {victor_codegraph-0.1.0 → victor_codegraph-0.1.1}/pyproject.toml +1 -1
- {victor_codegraph-0.1.0 → victor_codegraph-0.1.1}/tests/test_adapter.py +4 -0
- victor_codegraph-0.1.1/tests/test_line_base_contract.py +30 -0
- victor_codegraph-0.1.1/tests/test_repo.py +62 -0
- {victor_codegraph-0.1.0 → victor_codegraph-0.1.1}/victor_codegraph/__init__.py +8 -1
- {victor_codegraph-0.1.0 → victor_codegraph-0.1.1}/victor_codegraph/adapter.py +6 -1
- {victor_codegraph-0.1.0 → victor_codegraph-0.1.1}/victor_codegraph/model.py +24 -1
- victor_codegraph-0.1.1/victor_codegraph/repo.py +113 -0
- {victor_codegraph-0.1.0 → victor_codegraph-0.1.1}/victor_codegraph.egg-info/PKG-INFO +1 -1
- {victor_codegraph-0.1.0 → victor_codegraph-0.1.1}/victor_codegraph.egg-info/SOURCES.txt +3 -0
- {victor_codegraph-0.1.0 → victor_codegraph-0.1.1}/README.md +0 -0
- {victor_codegraph-0.1.0 → victor_codegraph-0.1.1}/setup.cfg +0 -0
- {victor_codegraph-0.1.0 → victor_codegraph-0.1.1}/tests/test_python_parser.py +0 -0
- {victor_codegraph-0.1.0 → victor_codegraph-0.1.1}/tests/test_sizing_and_chunk.py +0 -0
- {victor_codegraph-0.1.0 → victor_codegraph-0.1.1}/tests/test_treesitter_jsts.py +0 -0
- {victor_codegraph-0.1.0 → victor_codegraph-0.1.1}/victor_codegraph/config.py +0 -0
- {victor_codegraph-0.1.0 → victor_codegraph-0.1.1}/victor_codegraph/languages.py +0 -0
- {victor_codegraph-0.1.0 → victor_codegraph-0.1.1}/victor_codegraph/parser.py +0 -0
- {victor_codegraph-0.1.0 → victor_codegraph-0.1.1}/victor_codegraph/python_parser.py +0 -0
- {victor_codegraph-0.1.0 → victor_codegraph-0.1.1}/victor_codegraph/sizing.py +0 -0
- {victor_codegraph-0.1.0 → victor_codegraph-0.1.1}/victor_codegraph/treesitter_parser.py +0 -0
- {victor_codegraph-0.1.0 → victor_codegraph-0.1.1}/victor_codegraph.egg-info/dependency_links.txt +0 -0
- {victor_codegraph-0.1.0 → victor_codegraph-0.1.1}/victor_codegraph.egg-info/requires.txt +0 -0
- {victor_codegraph-0.1.0 → victor_codegraph-0.1.1}/victor_codegraph.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: victor-codegraph
|
|
3
|
-
Version: 0.1.0
|
|
3
|
+
Version: 0.1.1
|
|
4
4
|
Summary: Code->CPG chunker: tree-sitter symbol + relation extraction, size-capped chunks, ProximaRecord projection. Shared by Victor, ProximaDB SDK, and AnvaiOps.
|
|
5
5
|
Author-email: Vijaykumar Singh <singhvjd@gmail.com>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "victor-codegraph"
|
|
7
|
-
version = "0.1.0"
|
|
7
|
+
version = "0.1.1"
|
|
8
8
|
description = "Code->CPG chunker: tree-sitter symbol + relation extraction, size-capped chunks, ProximaRecord projection. Shared by Victor, ProximaDB SDK, and AnvaiOps."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.10"
|
|
@@ -37,6 +37,10 @@ def test_edge_records_reference_node_oids():
|
|
|
37
37
|
assert e["edge"]["from_oid"].startswith("graph/repo1/node/")
|
|
38
38
|
assert e["edge"]["to_oid"].startswith("graph/repo1/node/")
|
|
39
39
|
assert e["edge"]["edge_type"] == "CALLS"
|
|
40
|
+
# edge props always carry a call-site line (0 when unknown; >0 once the parser
|
|
41
|
+
# preserves call_site through resolution — see the call-site-fidelity change).
|
|
42
|
+
assert "line" in e["props"]
|
|
43
|
+
assert isinstance(e["props"]["line"], int)
|
|
40
44
|
|
|
41
45
|
|
|
42
46
|
def test_embedder_populates_embedding_cell():
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
"""Pins the canonical 1-based line-number contract (see model.LINE_BASE).
|
|
2
|
+
|
|
3
|
+
This is the cross-surface contract — Victor, the ProximaDB SDK, and AnvaiOps all
|
|
4
|
+
consume this package and must agree on a symbol's (file, name, line) coordinate.
|
|
5
|
+
If a refactor ever shifts line numbers, this test fails loudly.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from victor_codegraph import LINE_BASE, parse
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def test_line_base_is_one():
|
|
14
|
+
assert LINE_BASE == 1
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def test_parse_emits_1_based_lines():
|
|
18
|
+
# The def is on the FIRST line → start_line must be 1 (not 0).
|
|
19
|
+
parsed = parse("def first():\n return second()\n\n\ndef second():\n return 1\n", file_path="m.py")
|
|
20
|
+
by_name = {s.simple_name: s for s in parsed.symbols}
|
|
21
|
+
assert by_name["first"].location.start_line == 1
|
|
22
|
+
assert by_name["second"].location.start_line == 5 # 1-based line of the 2nd def
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def test_call_site_line_is_1_based():
|
|
26
|
+
parsed = parse("def a():\n return b()\n\n\ndef b():\n return 1\n", file_path="m.py")
|
|
27
|
+
calls = [r for r in parsed.relations if r.relation_type.name == "CALLS"]
|
|
28
|
+
assert calls
|
|
29
|
+
# the b() call is on line 2 (1-based), not 1
|
|
30
|
+
assert any(r.call_site is not None and r.call_site.start_line == 2 for r in calls)
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
"""Repo-walk tests — fully offline (tmp tree + stdlib ast for the .py files)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from victor_codegraph import chunk_path, chunk_repo, iter_source_files, parse_path
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def _make_tree(root):
|
|
9
|
+
(root / "pkg").mkdir()
|
|
10
|
+
(root / "pkg" / "a.py").write_text("def a():\n return b()\ndef b():\n return 1\n")
|
|
11
|
+
(root / "pkg" / "notes.txt").write_text("not source") # unknown extension
|
|
12
|
+
(root / "README.md").write_text("# docs") # markdown is not in the language map
|
|
13
|
+
# noise dirs that must be skipped
|
|
14
|
+
(root / "node_modules").mkdir()
|
|
15
|
+
(root / "node_modules" / "dep.py").write_text("def vendored():\n pass\n")
|
|
16
|
+
(root / ".git").mkdir()
|
|
17
|
+
(root / ".git" / "hook.py").write_text("def hook():\n pass\n")
|
|
18
|
+
return root
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def test_iter_source_files_filters_and_skips_noise(tmp_path):
|
|
22
|
+
_make_tree(tmp_path)
|
|
23
|
+
found = {p.name for p in iter_source_files(tmp_path)}
|
|
24
|
+
assert found == {"a.py"} # .txt/.md skipped; node_modules/.git pruned
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def test_iter_source_files_language_filter(tmp_path):
|
|
28
|
+
_make_tree(tmp_path)
|
|
29
|
+
assert {p.name for p in iter_source_files(tmp_path, languages=["python"])} == {"a.py"}
|
|
30
|
+
assert list(iter_source_files(tmp_path, languages=["rust"])) == []
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def test_iter_single_file(tmp_path):
|
|
34
|
+
f = tmp_path / "solo.py"
|
|
35
|
+
f.write_text("def f():\n return 1\n")
|
|
36
|
+
assert [p.name for p in iter_source_files(f)] == ["solo.py"]
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def test_chunk_path(tmp_path):
|
|
40
|
+
f = tmp_path / "m.py"
|
|
41
|
+
f.write_text("def f():\n return 1\n")
|
|
42
|
+
chunks = chunk_path(f)
|
|
43
|
+
assert chunks and any(c.metadata.get("simple_name") == "f" for c in chunks)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def test_chunk_path_unreadable_returns_empty(tmp_path):
|
|
47
|
+
assert chunk_path(tmp_path / "does_not_exist.py") == []
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def test_parse_path(tmp_path):
|
|
51
|
+
f = tmp_path / "m.py"
|
|
52
|
+
f.write_text("def a():\n return b()\ndef b():\n return 1\n")
|
|
53
|
+
parsed = parse_path(f)
|
|
54
|
+
assert parsed is not None
|
|
55
|
+
assert {s.simple_name for s in parsed.symbols} == {"a", "b"}
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def test_chunk_repo_streams_all_files(tmp_path):
|
|
59
|
+
_make_tree(tmp_path)
|
|
60
|
+
chunks = list(chunk_repo(tmp_path))
|
|
61
|
+
names = {c.metadata.get("simple_name") for c in chunks}
|
|
62
|
+
assert {"a", "b"} <= names # symbols from pkg/a.py, vendored dirs excluded
|
|
@@ -16,6 +16,7 @@ from .adapter import relation_to_record, symbol_to_record, to_proxima_records
|
|
|
16
16
|
from .config import ChunkConfig
|
|
17
17
|
from .languages import detect_language
|
|
18
18
|
from .model import (
|
|
19
|
+
LINE_BASE,
|
|
19
20
|
CodeChunk,
|
|
20
21
|
CodeRelation,
|
|
21
22
|
CodeRelationType,
|
|
@@ -25,14 +26,20 @@ from .model import (
|
|
|
25
26
|
SourceLocation,
|
|
26
27
|
)
|
|
27
28
|
from .parser import chunk, parse
|
|
29
|
+
from .repo import chunk_path, chunk_repo, iter_source_files, parse_path
|
|
28
30
|
|
|
29
|
-
__version__ = "0.1.0"
|
|
31
|
+
__version__ = "0.1.1"
|
|
30
32
|
|
|
31
33
|
__all__ = [
|
|
32
34
|
"__version__",
|
|
33
35
|
"chunk",
|
|
34
36
|
"parse",
|
|
37
|
+
"chunk_repo",
|
|
38
|
+
"chunk_path",
|
|
39
|
+
"parse_path",
|
|
40
|
+
"iter_source_files",
|
|
35
41
|
"ChunkConfig",
|
|
42
|
+
"LINE_BASE",
|
|
36
43
|
"detect_language",
|
|
37
44
|
"to_proxima_records",
|
|
38
45
|
"symbol_to_record",
|
|
@@ -73,7 +73,12 @@ def relation_to_record(relation: CodeRelation, repo_graph_id: str, branch_id: st
|
|
|
73
73
|
"to_oid": f"graph/{repo_graph_id}/node/{relation.to_symbol_id}",
|
|
74
74
|
"edge_type": relation.relation_type.name,
|
|
75
75
|
},
|
|
76
|
-
"props": {
|
|
76
|
+
"props": {
|
|
77
|
+
"confidence": relation.confidence,
|
|
78
|
+
"context": relation.context,
|
|
79
|
+
# call-site line (0 when unknown) — lets the graph carry where the edge fires.
|
|
80
|
+
"line": relation.call_site.start_line if relation.call_site is not None else 0,
|
|
81
|
+
},
|
|
77
82
|
}
|
|
78
83
|
|
|
79
84
|
|
|
@@ -13,6 +13,23 @@ from dataclasses import dataclass, field
|
|
|
13
13
|
from enum import IntEnum
|
|
14
14
|
from typing import Any
|
|
15
15
|
|
|
16
|
+
# ─────────────────────────────────────────────────────────────────────────────
|
|
17
|
+
# CANONICAL LINE-NUMBER CONTRACT
|
|
18
|
+
#
|
|
19
|
+
# Every line number this package emits (``SourceLocation.start_line`` / ``end_line``,
|
|
20
|
+
# ``CodeChunk.metadata['start_line']``/``['end_line']``, the ``call_site`` line on a
|
|
21
|
+
# ``CodeRelation``) is **1-based** — line 1 is the first line of the file. This matches
|
|
22
|
+
# editors (Monaco/VS Code), pgwire, and how humans reference code.
|
|
23
|
+
#
|
|
24
|
+
# victor-codegraph is the SHARED parser across Victor, the ProximaDB SDK, and AnvaiOps,
|
|
25
|
+
# so this convention is the cross-surface contract: a symbol's ``(file, name, line)``
|
|
26
|
+
# coordinate must be identical in every consumer for the correlated code-graph to
|
|
27
|
+
# reconcile. Consumers MUST pass these line numbers through unchanged — do NOT convert
|
|
28
|
+
# to 0-based (a `- 1` shift) for an internal/tree-sitter-compat convention; that breaks
|
|
29
|
+
# cross-surface reconciliation. Byte offsets (``byte_offset``/``start_pos``) are 0-based.
|
|
30
|
+
LINE_BASE = 1
|
|
31
|
+
# ─────────────────────────────────────────────────────────────────────────────
|
|
32
|
+
|
|
16
33
|
|
|
17
34
|
class CodeSymbolType(IntEnum):
|
|
18
35
|
"""Kinds of code symbol that can be extracted (superset across languages)."""
|
|
@@ -62,7 +79,13 @@ class CodeRelationType(IntEnum):
|
|
|
62
79
|
|
|
63
80
|
@dataclass
|
|
64
81
|
class SourceLocation:
|
|
65
|
-
"""Where a symbol lives in source."""
|
|
82
|
+
"""Where a symbol lives in source.
|
|
83
|
+
|
|
84
|
+
Lines (``start_line``/``end_line``) are **1-based** (see ``LINE_BASE`` and the
|
|
85
|
+
canonical line-number contract at the top of this module); byte offsets
|
|
86
|
+
(``byte_offset``/``byte_length``) are 0-based. Pass line numbers through
|
|
87
|
+
unchanged — do not convert to 0-based.
|
|
88
|
+
"""
|
|
66
89
|
|
|
67
90
|
file_path: str
|
|
68
91
|
start_line: int = 0
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
"""Repo-walking convenience — iterate source files and chunk/parse a whole tree.
|
|
2
|
+
|
|
3
|
+
Every consumer (Victor codebase indexing, AnvaiOps code-graph-sync) needs the same
|
|
4
|
+
loop: walk a repository, skip noise directories, and chunk/parse each source file
|
|
5
|
+
whose extension maps to a known language. This puts that loop in one place.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import os
|
|
11
|
+
from collections.abc import Iterator
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
|
|
14
|
+
from .config import ChunkConfig
|
|
15
|
+
from .languages import detect_language
|
|
16
|
+
from .model import CodeChunk, ParsedCode
|
|
17
|
+
from .parser import chunk as _chunk
|
|
18
|
+
from .parser import parse as _parse
|
|
19
|
+
|
|
20
|
+
# Directories never worth indexing (VCS, caches, vendored deps, build output).
|
|
21
|
+
DEFAULT_EXCLUDE_DIRS: frozenset[str] = frozenset(
|
|
22
|
+
{
|
|
23
|
+
".git",
|
|
24
|
+
".hg",
|
|
25
|
+
".svn",
|
|
26
|
+
"node_modules",
|
|
27
|
+
"__pycache__",
|
|
28
|
+
".venv",
|
|
29
|
+
"venv",
|
|
30
|
+
".mypy_cache",
|
|
31
|
+
".pytest_cache",
|
|
32
|
+
".ruff_cache",
|
|
33
|
+
".tox",
|
|
34
|
+
"dist",
|
|
35
|
+
"build",
|
|
36
|
+
"target",
|
|
37
|
+
".idea",
|
|
38
|
+
".gradle",
|
|
39
|
+
".next",
|
|
40
|
+
"site-packages",
|
|
41
|
+
"vendor",
|
|
42
|
+
}
|
|
43
|
+
)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def iter_source_files(
|
|
47
|
+
root: str | os.PathLike[str],
|
|
48
|
+
*,
|
|
49
|
+
languages: list[str] | None = None,
|
|
50
|
+
exclude_dirs: frozenset[str] | set[str] = DEFAULT_EXCLUDE_DIRS,
|
|
51
|
+
follow_symlinks: bool = False,
|
|
52
|
+
) -> Iterator[Path]:
|
|
53
|
+
"""Yield files under ``root`` whose extension maps to a known language.
|
|
54
|
+
|
|
55
|
+
Skips ``exclude_dirs`` and dot-directories. If ``root`` is itself a file, yields it
|
|
56
|
+
(when its extension is recognized). ``languages`` restricts to those language names.
|
|
57
|
+
"""
|
|
58
|
+
root_path = Path(root)
|
|
59
|
+
if root_path.is_file():
|
|
60
|
+
if detect_language(str(root_path)) is not None:
|
|
61
|
+
yield root_path
|
|
62
|
+
return
|
|
63
|
+
allowed = set(languages) if languages else None
|
|
64
|
+
excl = set(exclude_dirs)
|
|
65
|
+
for dirpath, dirnames, filenames in os.walk(root_path, followlinks=follow_symlinks):
|
|
66
|
+
# Prune in place so os.walk does not descend into excluded/hidden dirs.
|
|
67
|
+
dirnames[:] = [d for d in dirnames if d not in excl and not d.startswith(".")]
|
|
68
|
+
for fn in sorted(filenames):
|
|
69
|
+
p = Path(dirpath) / fn
|
|
70
|
+
lang = detect_language(str(p))
|
|
71
|
+
if lang is None:
|
|
72
|
+
continue
|
|
73
|
+
if allowed is not None and lang not in allowed:
|
|
74
|
+
continue
|
|
75
|
+
yield p
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def parse_path(
|
|
79
|
+
path: str | os.PathLike[str], *, encoding: str = "utf-8"
|
|
80
|
+
) -> ParsedCode | None:
|
|
81
|
+
"""Parse a single file into symbols + relations. Returns ``None`` if unreadable."""
|
|
82
|
+
p = Path(path)
|
|
83
|
+
try:
|
|
84
|
+
content = p.read_text(encoding=encoding)
|
|
85
|
+
except (OSError, UnicodeDecodeError):
|
|
86
|
+
return None
|
|
87
|
+
return _parse(content, file_path=str(p))
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def chunk_path(
|
|
91
|
+
path: str | os.PathLike[str],
|
|
92
|
+
config: ChunkConfig | None = None,
|
|
93
|
+
*,
|
|
94
|
+
encoding: str = "utf-8",
|
|
95
|
+
) -> list[CodeChunk]:
|
|
96
|
+
"""Read + chunk a single file. Returns an empty list if it can't be read."""
|
|
97
|
+
p = Path(path)
|
|
98
|
+
try:
|
|
99
|
+
content = p.read_text(encoding=encoding)
|
|
100
|
+
except (OSError, UnicodeDecodeError):
|
|
101
|
+
return []
|
|
102
|
+
return _chunk(content, file_path=str(p), config=config)
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def chunk_repo(
|
|
106
|
+
root: str | os.PathLike[str],
|
|
107
|
+
config: ChunkConfig | None = None,
|
|
108
|
+
*,
|
|
109
|
+
languages: list[str] | None = None,
|
|
110
|
+
) -> Iterator[CodeChunk]:
|
|
111
|
+
"""Walk ``root`` and yield chunks for every source file (streaming, low-memory)."""
|
|
112
|
+
for p in iter_source_files(root, languages=languages):
|
|
113
|
+
yield from chunk_path(p, config)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: victor-codegraph
|
|
3
|
-
Version: 0.1.0
|
|
3
|
+
Version: 0.1.1
|
|
4
4
|
Summary: Code->CPG chunker: tree-sitter symbol + relation extraction, size-capped chunks, ProximaRecord projection. Shared by Victor, ProximaDB SDK, and AnvaiOps.
|
|
5
5
|
Author-email: Vijaykumar Singh <singhvjd@gmail.com>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -1,7 +1,9 @@
|
|
|
1
1
|
README.md
|
|
2
2
|
pyproject.toml
|
|
3
3
|
tests/test_adapter.py
|
|
4
|
+
tests/test_line_base_contract.py
|
|
4
5
|
tests/test_python_parser.py
|
|
6
|
+
tests/test_repo.py
|
|
5
7
|
tests/test_sizing_and_chunk.py
|
|
6
8
|
tests/test_treesitter_jsts.py
|
|
7
9
|
victor_codegraph/__init__.py
|
|
@@ -11,6 +13,7 @@ victor_codegraph/languages.py
|
|
|
11
13
|
victor_codegraph/model.py
|
|
12
14
|
victor_codegraph/parser.py
|
|
13
15
|
victor_codegraph/python_parser.py
|
|
16
|
+
victor_codegraph/repo.py
|
|
14
17
|
victor_codegraph/sizing.py
|
|
15
18
|
victor_codegraph/treesitter_parser.py
|
|
16
19
|
victor_codegraph.egg-info/PKG-INFO
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{victor_codegraph-0.1.0 → victor_codegraph-0.1.1}/victor_codegraph.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|