victor-codegraph 0.0.1__tar.gz → 0.1.1__tar.gz

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Files changed (25)
  1. {victor_codegraph-0.0.1 → victor_codegraph-0.1.1}/PKG-INFO +1 -1
  2. {victor_codegraph-0.0.1 → victor_codegraph-0.1.1}/pyproject.toml +1 -1
  3. {victor_codegraph-0.0.1 → victor_codegraph-0.1.1}/tests/test_adapter.py +4 -0
  4. victor_codegraph-0.1.1/tests/test_line_base_contract.py +30 -0
  5. victor_codegraph-0.1.1/tests/test_repo.py +62 -0
  6. {victor_codegraph-0.0.1 → victor_codegraph-0.1.1}/victor_codegraph/__init__.py +8 -1
  7. {victor_codegraph-0.0.1 → victor_codegraph-0.1.1}/victor_codegraph/adapter.py +6 -1
  8. {victor_codegraph-0.0.1 → victor_codegraph-0.1.1}/victor_codegraph/model.py +24 -1
  9. victor_codegraph-0.1.1/victor_codegraph/repo.py +113 -0
  10. {victor_codegraph-0.0.1 → victor_codegraph-0.1.1}/victor_codegraph.egg-info/PKG-INFO +1 -1
  11. {victor_codegraph-0.0.1 → victor_codegraph-0.1.1}/victor_codegraph.egg-info/SOURCES.txt +3 -0
  12. {victor_codegraph-0.0.1 → victor_codegraph-0.1.1}/README.md +0 -0
  13. {victor_codegraph-0.0.1 → victor_codegraph-0.1.1}/setup.cfg +0 -0
  14. {victor_codegraph-0.0.1 → victor_codegraph-0.1.1}/tests/test_python_parser.py +0 -0
  15. {victor_codegraph-0.0.1 → victor_codegraph-0.1.1}/tests/test_sizing_and_chunk.py +0 -0
  16. {victor_codegraph-0.0.1 → victor_codegraph-0.1.1}/tests/test_treesitter_jsts.py +0 -0
  17. {victor_codegraph-0.0.1 → victor_codegraph-0.1.1}/victor_codegraph/config.py +0 -0
  18. {victor_codegraph-0.0.1 → victor_codegraph-0.1.1}/victor_codegraph/languages.py +0 -0
  19. {victor_codegraph-0.0.1 → victor_codegraph-0.1.1}/victor_codegraph/parser.py +0 -0
  20. {victor_codegraph-0.0.1 → victor_codegraph-0.1.1}/victor_codegraph/python_parser.py +0 -0
  21. {victor_codegraph-0.0.1 → victor_codegraph-0.1.1}/victor_codegraph/sizing.py +0 -0
  22. {victor_codegraph-0.0.1 → victor_codegraph-0.1.1}/victor_codegraph/treesitter_parser.py +0 -0
  23. {victor_codegraph-0.0.1 → victor_codegraph-0.1.1}/victor_codegraph.egg-info/dependency_links.txt +0 -0
  24. {victor_codegraph-0.0.1 → victor_codegraph-0.1.1}/victor_codegraph.egg-info/requires.txt +0 -0
  25. {victor_codegraph-0.0.1 → victor_codegraph-0.1.1}/victor_codegraph.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: victor-codegraph
3
- Version: 0.0.1
3
+ Version: 0.1.1
4
4
  Summary: Code->CPG chunker: tree-sitter symbol + relation extraction, size-capped chunks, ProximaRecord projection. Shared by Victor, ProximaDB SDK, and AnvaiOps.
5
5
  Author-email: Vijaykumar Singh <singhvjd@gmail.com>
6
6
  License: Apache-2.0
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "victor-codegraph"
7
- version = "0.0.1"
7
+ version = "0.1.1"
8
8
  description = "Code->CPG chunker: tree-sitter symbol + relation extraction, size-capped chunks, ProximaRecord projection. Shared by Victor, ProximaDB SDK, and AnvaiOps."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.10"
@@ -37,6 +37,10 @@ def test_edge_records_reference_node_oids():
37
37
  assert e["edge"]["from_oid"].startswith("graph/repo1/node/")
38
38
  assert e["edge"]["to_oid"].startswith("graph/repo1/node/")
39
39
  assert e["edge"]["edge_type"] == "CALLS"
40
+ # edge props always carry a call-site line (0 when unknown; >0 once the parser
41
+ # preserves call_site through resolution — see the call-site-fidelity change).
42
+ assert "line" in e["props"]
43
+ assert isinstance(e["props"]["line"], int)
40
44
 
41
45
 
42
46
  def test_embedder_populates_embedding_cell():
@@ -0,0 +1,30 @@
1
+ """Pins the canonical 1-based line-number contract (see model.LINE_BASE).
2
+
3
+ This is the cross-surface contract — Victor, the ProximaDB SDK, and AnvaiOps all
4
+ consume this package and must agree on a symbol's (file, name, line) coordinate.
5
+ If a refactor ever shifts line numbers, this test fails loudly.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from victor_codegraph import LINE_BASE, parse
11
+
12
+
13
+ def test_line_base_is_one():
14
+ assert LINE_BASE == 1
15
+
16
+
17
+ def test_parse_emits_1_based_lines():
18
+ # The def is on the FIRST line → start_line must be 1 (not 0).
19
+ parsed = parse("def first():\n return second()\n\n\ndef second():\n return 1\n", file_path="m.py")
20
+ by_name = {s.simple_name: s for s in parsed.symbols}
21
+ assert by_name["first"].location.start_line == 1
22
+ assert by_name["second"].location.start_line == 5 # 1-based line of the 2nd def
23
+
24
+
25
+ def test_call_site_line_is_1_based():
26
+ parsed = parse("def a():\n return b()\n\n\ndef b():\n return 1\n", file_path="m.py")
27
+ calls = [r for r in parsed.relations if r.relation_type.name == "CALLS"]
28
+ assert calls
29
+ # the b() call is on line 2 (1-based), not 1
30
+ assert any(r.call_site is not None and r.call_site.start_line == 2 for r in calls)
@@ -0,0 +1,62 @@
1
+ """Repo-walk tests — fully offline (tmp tree + stdlib ast for the .py files)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from victor_codegraph import chunk_path, chunk_repo, iter_source_files, parse_path
6
+
7
+
8
+ def _make_tree(root):
9
+ (root / "pkg").mkdir()
10
+ (root / "pkg" / "a.py").write_text("def a():\n return b()\ndef b():\n return 1\n")
11
+ (root / "pkg" / "notes.txt").write_text("not source") # unknown extension
12
+ (root / "README.md").write_text("# docs") # markdown is not in the language map
13
+ # noise dirs that must be skipped
14
+ (root / "node_modules").mkdir()
15
+ (root / "node_modules" / "dep.py").write_text("def vendored():\n pass\n")
16
+ (root / ".git").mkdir()
17
+ (root / ".git" / "hook.py").write_text("def hook():\n pass\n")
18
+ return root
19
+
20
+
21
+ def test_iter_source_files_filters_and_skips_noise(tmp_path):
22
+ _make_tree(tmp_path)
23
+ found = {p.name for p in iter_source_files(tmp_path)}
24
+ assert found == {"a.py"} # .txt/.md skipped; node_modules/.git pruned
25
+
26
+
27
+ def test_iter_source_files_language_filter(tmp_path):
28
+ _make_tree(tmp_path)
29
+ assert {p.name for p in iter_source_files(tmp_path, languages=["python"])} == {"a.py"}
30
+ assert list(iter_source_files(tmp_path, languages=["rust"])) == []
31
+
32
+
33
+ def test_iter_single_file(tmp_path):
34
+ f = tmp_path / "solo.py"
35
+ f.write_text("def f():\n return 1\n")
36
+ assert [p.name for p in iter_source_files(f)] == ["solo.py"]
37
+
38
+
39
+ def test_chunk_path(tmp_path):
40
+ f = tmp_path / "m.py"
41
+ f.write_text("def f():\n return 1\n")
42
+ chunks = chunk_path(f)
43
+ assert chunks and any(c.metadata.get("simple_name") == "f" for c in chunks)
44
+
45
+
46
+ def test_chunk_path_unreadable_returns_empty(tmp_path):
47
+ assert chunk_path(tmp_path / "does_not_exist.py") == []
48
+
49
+
50
+ def test_parse_path(tmp_path):
51
+ f = tmp_path / "m.py"
52
+ f.write_text("def a():\n return b()\ndef b():\n return 1\n")
53
+ parsed = parse_path(f)
54
+ assert parsed is not None
55
+ assert {s.simple_name for s in parsed.symbols} == {"a", "b"}
56
+
57
+
58
+ def test_chunk_repo_streams_all_files(tmp_path):
59
+ _make_tree(tmp_path)
60
+ chunks = list(chunk_repo(tmp_path))
61
+ names = {c.metadata.get("simple_name") for c in chunks}
62
+ assert {"a", "b"} <= names # symbols from pkg/a.py, vendored dirs excluded
@@ -16,6 +16,7 @@ from .adapter import relation_to_record, symbol_to_record, to_proxima_records
16
16
  from .config import ChunkConfig
17
17
  from .languages import detect_language
18
18
  from .model import (
19
+ LINE_BASE,
19
20
  CodeChunk,
20
21
  CodeRelation,
21
22
  CodeRelationType,
@@ -25,14 +26,20 @@ from .model import (
25
26
  SourceLocation,
26
27
  )
27
28
  from .parser import chunk, parse
29
+ from .repo import chunk_path, chunk_repo, iter_source_files, parse_path
28
30
 
29
- __version__ = "0.0.1"
31
+ __version__ = "0.1.1"
30
32
 
31
33
  __all__ = [
32
34
  "__version__",
33
35
  "chunk",
34
36
  "parse",
37
+ "chunk_repo",
38
+ "chunk_path",
39
+ "parse_path",
40
+ "iter_source_files",
35
41
  "ChunkConfig",
42
+ "LINE_BASE",
36
43
  "detect_language",
37
44
  "to_proxima_records",
38
45
  "symbol_to_record",
@@ -73,7 +73,12 @@ def relation_to_record(relation: CodeRelation, repo_graph_id: str, branch_id: st
73
73
  "to_oid": f"graph/{repo_graph_id}/node/{relation.to_symbol_id}",
74
74
  "edge_type": relation.relation_type.name,
75
75
  },
76
- "props": {"confidence": relation.confidence, "context": relation.context},
76
+ "props": {
77
+ "confidence": relation.confidence,
78
+ "context": relation.context,
79
+ # call-site line (0 when unknown) — lets the graph carry where the edge fires.
80
+ "line": relation.call_site.start_line if relation.call_site is not None else 0,
81
+ },
77
82
  }
78
83
 
79
84
 
@@ -13,6 +13,23 @@ from dataclasses import dataclass, field
13
13
  from enum import IntEnum
14
14
  from typing import Any
15
15
 
16
+ # ─────────────────────────────────────────────────────────────────────────────
17
+ # CANONICAL LINE-NUMBER CONTRACT
18
+ #
19
+ # Every line number this package emits (``SourceLocation.start_line`` / ``end_line``,
20
+ # ``CodeChunk.metadata['start_line']``/``['end_line']``, the ``call_site`` line on a
21
+ # ``CodeRelation``) is **1-based** — line 1 is the first line of the file. This matches
22
+ # editors (Monaco/VS Code), pgwire, and how humans reference code.
23
+ #
24
+ # victor-codegraph is the SHARED parser across Victor, the ProximaDB SDK, and AnvaiOps,
25
+ # so this convention is the cross-surface contract: a symbol's ``(file, name, line)``
26
+ # coordinate must be identical in every consumer for the correlated code-graph to
27
+ # reconcile. Consumers MUST pass these line numbers through unchanged — do NOT convert
28
+ # to 0-based (a `- 1` shift) for an internal/tree-sitter-compat convention; that breaks
29
+ # cross-surface reconciliation. Byte offsets (``byte_offset``/``start_pos``) are 0-based.
30
+ LINE_BASE = 1
31
+ # ─────────────────────────────────────────────────────────────────────────────
32
+
16
33
 
17
34
  class CodeSymbolType(IntEnum):
18
35
  """Kinds of code symbol that can be extracted (superset across languages)."""
@@ -62,7 +79,13 @@ class CodeRelationType(IntEnum):
62
79
 
63
80
  @dataclass
64
81
  class SourceLocation:
65
- """Where a symbol lives in source. Lines are 1-based; bytes are 0-based."""
82
+ """Where a symbol lives in source.
83
+
84
+ Lines (``start_line``/``end_line``) are **1-based** (see ``LINE_BASE`` and the
85
+ canonical line-number contract at the top of this module); byte offsets
86
+ (``byte_offset``/``byte_length``) are 0-based. Pass line numbers through
87
+ unchanged — do not convert to 0-based.
88
+ """
66
89
 
67
90
  file_path: str
68
91
  start_line: int = 0
@@ -0,0 +1,113 @@
1
+ """Repo-walking convenience — iterate source files and chunk/parse a whole tree.
2
+
3
+ Every consumer (Victor codebase indexing, AnvaiOps code-graph-sync) needs the same
4
+ loop: walk a repository, skip noise directories, and chunk/parse each source file
5
+ whose extension maps to a known language. This puts that loop in one place.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import os
11
+ from collections.abc import Iterator
12
+ from pathlib import Path
13
+
14
+ from .config import ChunkConfig
15
+ from .languages import detect_language
16
+ from .model import CodeChunk, ParsedCode
17
+ from .parser import chunk as _chunk
18
+ from .parser import parse as _parse
19
+
20
+ # Directories never worth indexing (VCS, caches, vendored deps, build output).
21
+ DEFAULT_EXCLUDE_DIRS: frozenset[str] = frozenset(
22
+ {
23
+ ".git",
24
+ ".hg",
25
+ ".svn",
26
+ "node_modules",
27
+ "__pycache__",
28
+ ".venv",
29
+ "venv",
30
+ ".mypy_cache",
31
+ ".pytest_cache",
32
+ ".ruff_cache",
33
+ ".tox",
34
+ "dist",
35
+ "build",
36
+ "target",
37
+ ".idea",
38
+ ".gradle",
39
+ ".next",
40
+ "site-packages",
41
+ "vendor",
42
+ }
43
+ )
44
+
45
+
46
+ def iter_source_files(
47
+ root: str | os.PathLike[str],
48
+ *,
49
+ languages: list[str] | None = None,
50
+ exclude_dirs: frozenset[str] | set[str] = DEFAULT_EXCLUDE_DIRS,
51
+ follow_symlinks: bool = False,
52
+ ) -> Iterator[Path]:
53
+ """Yield files under ``root`` whose extension maps to a known language.
54
+
55
+ Skips ``exclude_dirs`` and dot-directories. If ``root`` is itself a file, yields it
56
+ (when its extension is recognized). ``languages`` restricts to those language names.
57
+ """
58
+ root_path = Path(root)
59
+ if root_path.is_file():
60
+ if detect_language(str(root_path)) is not None:
61
+ yield root_path
62
+ return
63
+ allowed = set(languages) if languages else None
64
+ excl = set(exclude_dirs)
65
+ for dirpath, dirnames, filenames in os.walk(root_path, followlinks=follow_symlinks):
66
+ # Prune in place so os.walk does not descend into excluded/hidden dirs.
67
+ dirnames[:] = [d for d in dirnames if d not in excl and not d.startswith(".")]
68
+ for fn in sorted(filenames):
69
+ p = Path(dirpath) / fn
70
+ lang = detect_language(str(p))
71
+ if lang is None:
72
+ continue
73
+ if allowed is not None and lang not in allowed:
74
+ continue
75
+ yield p
76
+
77
+
78
+ def parse_path(
79
+ path: str | os.PathLike[str], *, encoding: str = "utf-8"
80
+ ) -> ParsedCode | None:
81
+ """Parse a single file into symbols + relations. Returns ``None`` if unreadable."""
82
+ p = Path(path)
83
+ try:
84
+ content = p.read_text(encoding=encoding)
85
+ except (OSError, UnicodeDecodeError):
86
+ return None
87
+ return _parse(content, file_path=str(p))
88
+
89
+
90
+ def chunk_path(
91
+ path: str | os.PathLike[str],
92
+ config: ChunkConfig | None = None,
93
+ *,
94
+ encoding: str = "utf-8",
95
+ ) -> list[CodeChunk]:
96
+ """Read + chunk a single file. Returns an empty list if it can't be read."""
97
+ p = Path(path)
98
+ try:
99
+ content = p.read_text(encoding=encoding)
100
+ except (OSError, UnicodeDecodeError):
101
+ return []
102
+ return _chunk(content, file_path=str(p), config=config)
103
+
104
+
105
+ def chunk_repo(
106
+ root: str | os.PathLike[str],
107
+ config: ChunkConfig | None = None,
108
+ *,
109
+ languages: list[str] | None = None,
110
+ ) -> Iterator[CodeChunk]:
111
+ """Walk ``root`` and yield chunks for every source file (streaming, low-memory)."""
112
+ for p in iter_source_files(root, languages=languages):
113
+ yield from chunk_path(p, config)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: victor-codegraph
3
- Version: 0.0.1
3
+ Version: 0.1.1
4
4
  Summary: Code->CPG chunker: tree-sitter symbol + relation extraction, size-capped chunks, ProximaRecord projection. Shared by Victor, ProximaDB SDK, and AnvaiOps.
5
5
  Author-email: Vijaykumar Singh <singhvjd@gmail.com>
6
6
  License: Apache-2.0
@@ -1,7 +1,9 @@
1
1
  README.md
2
2
  pyproject.toml
3
3
  tests/test_adapter.py
4
+ tests/test_line_base_contract.py
4
5
  tests/test_python_parser.py
6
+ tests/test_repo.py
5
7
  tests/test_sizing_and_chunk.py
6
8
  tests/test_treesitter_jsts.py
7
9
  victor_codegraph/__init__.py
@@ -11,6 +13,7 @@ victor_codegraph/languages.py
11
13
  victor_codegraph/model.py
12
14
  victor_codegraph/parser.py
13
15
  victor_codegraph/python_parser.py
16
+ victor_codegraph/repo.py
14
17
  victor_codegraph/sizing.py
15
18
  victor_codegraph/treesitter_parser.py
16
19
  victor_codegraph.egg-info/PKG-INFO