victor-codegraph 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- victor_codegraph-0.0.1/PKG-INFO +109 -0
- victor_codegraph-0.0.1/README.md +79 -0
- victor_codegraph-0.0.1/pyproject.toml +74 -0
- victor_codegraph-0.0.1/setup.cfg +4 -0
- victor_codegraph-0.0.1/tests/test_adapter.py +52 -0
- victor_codegraph-0.0.1/tests/test_python_parser.py +108 -0
- victor_codegraph-0.0.1/tests/test_sizing_and_chunk.py +62 -0
- victor_codegraph-0.0.1/tests/test_treesitter_jsts.py +74 -0
- victor_codegraph-0.0.1/victor_codegraph/__init__.py +47 -0
- victor_codegraph-0.0.1/victor_codegraph/adapter.py +94 -0
- victor_codegraph-0.0.1/victor_codegraph/config.py +48 -0
- victor_codegraph-0.0.1/victor_codegraph/languages.py +69 -0
- victor_codegraph-0.0.1/victor_codegraph/model.py +148 -0
- victor_codegraph-0.0.1/victor_codegraph/parser.py +120 -0
- victor_codegraph-0.0.1/victor_codegraph/python_parser.py +270 -0
- victor_codegraph-0.0.1/victor_codegraph/sizing.py +121 -0
- victor_codegraph-0.0.1/victor_codegraph/treesitter_parser.py +268 -0
- victor_codegraph-0.0.1/victor_codegraph.egg-info/PKG-INFO +109 -0
- victor_codegraph-0.0.1/victor_codegraph.egg-info/SOURCES.txt +20 -0
- victor_codegraph-0.0.1/victor_codegraph.egg-info/dependency_links.txt +1 -0
- victor_codegraph-0.0.1/victor_codegraph.egg-info/requires.txt +13 -0
- victor_codegraph-0.0.1/victor_codegraph.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: victor-codegraph
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: Code->CPG chunker: tree-sitter symbol + relation extraction, size-capped chunks, ProximaRecord projection. Shared by Victor, ProximaDB SDK, and AnvaiOps.
|
|
5
|
+
Author-email: Vijaykumar Singh <singhvjd@gmail.com>
|
|
6
|
+
License: Apache-2.0
|
|
7
|
+
Project-URL: Homepage, https://github.com/vjsingh1984/victor
|
|
8
|
+
Project-URL: Repository, https://github.com/vjsingh1984/victor
|
|
9
|
+
Keywords: code-graph,cpg,chunking,tree-sitter,ast,embeddings,rag
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
18
|
+
Requires-Python: >=3.10
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
Provides-Extra: treesitter
|
|
21
|
+
Requires-Dist: tree-sitter>=0.23; extra == "treesitter"
|
|
22
|
+
Requires-Dist: tree-sitter-language-pack>=1.0; extra == "treesitter"
|
|
23
|
+
Provides-Extra: contracts
|
|
24
|
+
Requires-Dist: victor-contracts<1.0,>=0.7.0; extra == "contracts"
|
|
25
|
+
Provides-Extra: dev
|
|
26
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
27
|
+
Requires-Dist: ruff>=0.5; extra == "dev"
|
|
28
|
+
Requires-Dist: tree-sitter>=0.23; extra == "dev"
|
|
29
|
+
Requires-Dist: tree-sitter-language-pack>=1.0; extra == "dev"
|
|
30
|
+
|
|
31
|
+
# victor-codegraph
|
|
32
|
+
|
|
33
|
+
Shared **code → Code-Property-Graph chunker**: tree-sitter symbol + relation extraction,
|
|
34
|
+
size-capped embeddable chunks, and a `ProximaRecord` projection. One chunker, three
|
|
35
|
+
consumers — Victor (owner), the ProximaDB SDK (`[codegraph]` extra), and AnvaiOps (SaaS
|
|
36
|
+
code-graph vertical).
|
|
37
|
+
|
|
38
|
+
> Design: ProximaDB `ADR-029` (authoritative) · Victor `ADR-014` (owner/donor) ·
|
|
39
|
+
> AnvaiOps `ADR-0018` (consumer). This package is the **TD-CG1** scaffold.
|
|
40
|
+
|
|
41
|
+
## Why
|
|
42
|
+
|
|
43
|
+
The same tree-sitter code→symbol+relation chunker existed twice (ProximaDB SDK `code.py`
|
|
44
|
+
and Victor `victor-coding`) and was about to be written a third time in AnvaiOps. This
|
|
45
|
+
package is the single neutral home. It merges the best of both donors and fixes their two
|
|
46
|
+
gaps:
|
|
47
|
+
|
|
48
|
+
- **Size-capping** — ProximaDB's `code.py` emitted one chunk per symbol with *no* size
|
|
49
|
+
bound (a huge function became a huge chunk). Here, oversized symbols are body-split with
|
|
50
|
+
overlap (LlamaIndex `CodeSplitter` discipline). See `sizing.py`.
|
|
51
|
+
- **Real JS/TS** — the donor JS/TS parser was a stub returning no symbols. Here JS/TS get a
|
|
52
|
+
real tree-sitter extractor (functions, classes, methods, `const … = () =>`, imports).
|
|
53
|
+
|
|
54
|
+
## Install
|
|
55
|
+
|
|
56
|
+
Not yet published to PyPI — use an **editable install** from the monorepo for now. Consumers
|
|
57
|
+
(Victor, the ProximaDB SDK, AnvaiOps) reference it editable until the first `victor-codegraph-v*`
|
|
58
|
+
release is cut.
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
# dev: editable, with tree-sitter grammars + test tooling
|
|
62
|
+
make -C victor-codegraph dev # = pip install -e ../victor-contracts && pip install -e ".[dev]"
|
|
63
|
+
|
|
64
|
+
# minimal: Python-only (stdlib ast) path, zero native deps
|
|
65
|
+
pip install -e ./victor-codegraph
|
|
66
|
+
|
|
67
|
+
# once published:
|
|
68
|
+
# pip install victor-codegraph # Python path
|
|
69
|
+
# pip install "victor-codegraph[treesitter]" # + multi-language grammars
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
### Releasing
|
|
73
|
+
|
|
74
|
+
CI: `.github/workflows/ci-codegraph.yml` runs the suite (editable install, grammars on) for every
|
|
75
|
+
PR touching `victor-codegraph/**`. Publishing: push a tag `victor-codegraph-v0.1.0` to trigger
|
|
76
|
+
`.github/workflows/release-codegraph.yml`, which builds and publishes via **PyPI Trusted Publishing**
|
|
77
|
+
(OIDC — no API token). Configure the publisher once on PyPI (owner `vjsingh1984`, repo `victor`,
|
|
78
|
+
workflow `release-codegraph.yml`, environments `pypi` / `testpypi`); see the header of that workflow.
|
|
79
|
+
|
|
80
|
+
## Use
|
|
81
|
+
|
|
82
|
+
```python
|
|
83
|
+
from victor_codegraph import chunk, parse, to_proxima_records, ChunkConfig
|
|
84
|
+
|
|
85
|
+
# Size-capped, embeddable chunks:
|
|
86
|
+
chunks = chunk(source, file_path="app/service.py", config=ChunkConfig(max_chunk_tokens=512))
|
|
87
|
+
|
|
88
|
+
# Symbols + relations:
|
|
89
|
+
parsed = parse(source, file_path="app/service.py")
|
|
90
|
+
|
|
91
|
+
# Project to the ProximaDB substrate-keystone record shape (one symbol = row+node+vector):
|
|
92
|
+
records = to_proxima_records(parsed, repo_graph_id="myrepo", branch_id="main",
|
|
93
|
+
embedder=my_embed_fn) # embedder optional
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
## Design principles (the "best posture" this encodes)
|
|
97
|
+
|
|
98
|
+
1. Chunk at **symbol** granularity (not statement, not fixed-size).
|
|
99
|
+
2. **AST-aligned and size-capped** — never split mid-statement, never exceed the budget.
|
|
100
|
+
3. Extract **relations** (CALLS/EXTENDS/CONTAINS/…) and project to a CPG.
|
|
101
|
+
4. **Deterministic IDs + content hash** → idempotent incremental re-index.
|
|
102
|
+
5. **Graceful fallback chain**: python-ast → tree-sitter → sliding-window.
|
|
103
|
+
6. Token budget **matched to the embedding model** (e.g. BGE-small: 384-d vectors, 512-token input limit).
|
|
104
|
+
|
|
105
|
+
## Status
|
|
106
|
+
|
|
107
|
+
`0.0.1` — TD-CG1 scaffold. Python (stdlib `ast`) is the primary, fully-offline path.
|
|
108
|
+
Multi-language extraction is best-effort via tree-sitter; deeper per-language relation
|
|
109
|
+
extraction (the donor parsers' Rust/Go/Java specifics) lands incrementally.
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
# victor-codegraph
|
|
2
|
+
|
|
3
|
+
Shared **code → Code-Property-Graph chunker**: tree-sitter symbol + relation extraction,
|
|
4
|
+
size-capped embeddable chunks, and a `ProximaRecord` projection. One chunker, three
|
|
5
|
+
consumers — Victor (owner), the ProximaDB SDK (`[codegraph]` extra), and AnvaiOps (SaaS
|
|
6
|
+
code-graph vertical).
|
|
7
|
+
|
|
8
|
+
> Design: ProximaDB `ADR-029` (authoritative) · Victor `ADR-014` (owner/donor) ·
|
|
9
|
+
> AnvaiOps `ADR-0018` (consumer). This package is the **TD-CG1** scaffold.
|
|
10
|
+
|
|
11
|
+
## Why
|
|
12
|
+
|
|
13
|
+
The same tree-sitter code→symbol+relation chunker existed twice (ProximaDB SDK `code.py`
|
|
14
|
+
and Victor `victor-coding`) and was about to be written a third time in AnvaiOps. This
|
|
15
|
+
package is the single neutral home. It merges the best of both donors and fixes their two
|
|
16
|
+
gaps:
|
|
17
|
+
|
|
18
|
+
- **Size-capping** — ProximaDB's `code.py` emitted one chunk per symbol with *no* size
|
|
19
|
+
bound (a huge function became a huge chunk). Here, oversized symbols are body-split with
|
|
20
|
+
overlap (LlamaIndex `CodeSplitter` discipline). See `sizing.py`.
|
|
21
|
+
- **Real JS/TS** — the donor JS/TS parser was a stub returning no symbols. Here JS/TS get a
|
|
22
|
+
real tree-sitter extractor (functions, classes, methods, `const … = () =>`, imports).
|
|
23
|
+
|
|
24
|
+
## Install
|
|
25
|
+
|
|
26
|
+
Not yet published to PyPI — use an **editable install** from the monorepo for now. Consumers
|
|
27
|
+
(Victor, the ProximaDB SDK, AnvaiOps) reference it editable until the first `victor-codegraph-v*`
|
|
28
|
+
release is cut.
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
# dev: editable, with tree-sitter grammars + test tooling
|
|
32
|
+
make -C victor-codegraph dev # = pip install -e ../victor-contracts && pip install -e ".[dev]"
|
|
33
|
+
|
|
34
|
+
# minimal: Python-only (stdlib ast) path, zero native deps
|
|
35
|
+
pip install -e ./victor-codegraph
|
|
36
|
+
|
|
37
|
+
# once published:
|
|
38
|
+
# pip install victor-codegraph # Python path
|
|
39
|
+
# pip install "victor-codegraph[treesitter]" # + multi-language grammars
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
### Releasing
|
|
43
|
+
|
|
44
|
+
CI: `.github/workflows/ci-codegraph.yml` runs the suite (editable install, grammars on) for every
|
|
45
|
+
PR touching `victor-codegraph/**`. Publishing: push a tag `victor-codegraph-v0.1.0` to trigger
|
|
46
|
+
`.github/workflows/release-codegraph.yml`, which builds and publishes via **PyPI Trusted Publishing**
|
|
47
|
+
(OIDC — no API token). Configure the publisher once on PyPI (owner `vjsingh1984`, repo `victor`,
|
|
48
|
+
workflow `release-codegraph.yml`, environments `pypi` / `testpypi`); see the header of that workflow.
|
|
49
|
+
|
|
50
|
+
## Use
|
|
51
|
+
|
|
52
|
+
```python
|
|
53
|
+
from victor_codegraph import chunk, parse, to_proxima_records, ChunkConfig
|
|
54
|
+
|
|
55
|
+
# Size-capped, embeddable chunks:
|
|
56
|
+
chunks = chunk(source, file_path="app/service.py", config=ChunkConfig(max_chunk_tokens=512))
|
|
57
|
+
|
|
58
|
+
# Symbols + relations:
|
|
59
|
+
parsed = parse(source, file_path="app/service.py")
|
|
60
|
+
|
|
61
|
+
# Project to the ProximaDB substrate-keystone record shape (one symbol = row+node+vector):
|
|
62
|
+
records = to_proxima_records(parsed, repo_graph_id="myrepo", branch_id="main",
|
|
63
|
+
embedder=my_embed_fn) # embedder optional
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
## Design principles (the "best posture" this encodes)
|
|
67
|
+
|
|
68
|
+
1. Chunk at **symbol** granularity (not statement, not fixed-size).
|
|
69
|
+
2. **AST-aligned and size-capped** — never split mid-statement, never exceed the budget.
|
|
70
|
+
3. Extract **relations** (CALLS/EXTENDS/CONTAINS/…) and project to a CPG.
|
|
71
|
+
4. **Deterministic IDs + content hash** → idempotent incremental re-index.
|
|
72
|
+
5. **Graceful fallback chain**: python-ast → tree-sitter → sliding-window.
|
|
73
|
+
6. Token budget **matched to the embedding model** (e.g. BGE-small: 384-d vectors, 512-token input limit).
|
|
74
|
+
|
|
75
|
+
## Status
|
|
76
|
+
|
|
77
|
+
`0.0.1` — TD-CG1 scaffold. Python (stdlib `ast`) is the primary, fully-offline path.
|
|
78
|
+
Multi-language extraction is best-effort via tree-sitter; deeper per-language relation
|
|
79
|
+
extraction (the donor parsers' Rust/Go/Java specifics) lands incrementally.
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "victor-codegraph"
|
|
7
|
+
version = "0.0.1"
|
|
8
|
+
description = "Code->CPG chunker: tree-sitter symbol + relation extraction, size-capped chunks, ProximaRecord projection. Shared by Victor, ProximaDB SDK, and AnvaiOps."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
license = { text = "Apache-2.0" }
|
|
12
|
+
authors = [
|
|
13
|
+
{ name = "Vijaykumar Singh", email = "singhvjd@gmail.com" },
|
|
14
|
+
]
|
|
15
|
+
keywords = [
|
|
16
|
+
"code-graph",
|
|
17
|
+
"cpg",
|
|
18
|
+
"chunking",
|
|
19
|
+
"tree-sitter",
|
|
20
|
+
"ast",
|
|
21
|
+
"embeddings",
|
|
22
|
+
"rag",
|
|
23
|
+
]
|
|
24
|
+
classifiers = [
|
|
25
|
+
"Development Status :: 3 - Alpha",
|
|
26
|
+
"Intended Audience :: Developers",
|
|
27
|
+
"License :: OSI Approved :: Apache Software License",
|
|
28
|
+
"Programming Language :: Python :: 3",
|
|
29
|
+
"Programming Language :: Python :: 3.10",
|
|
30
|
+
"Programming Language :: Python :: 3.11",
|
|
31
|
+
"Programming Language :: Python :: 3.12",
|
|
32
|
+
"Topic :: Software Development :: Libraries",
|
|
33
|
+
]
|
|
34
|
+
|
|
35
|
+
# Thin + neutral by design (ADR-014 / ProximaDB ADR-029). The base install is
|
|
36
|
+
# ZERO-dependency: the Python path uses the stdlib `ast` and needs no native grammars,
|
|
37
|
+
# so `pip install -e .` works fully offline. Opt-in extras add multi-language grammars
|
|
38
|
+
# (`treesitter`) and the contracts Protocol boundary (`contracts`). victor-contracts
|
|
39
|
+
# becomes a hard dep once the parser is exposed as a contracts Protocol (ADR-014).
|
|
40
|
+
dependencies = []
|
|
41
|
+
|
|
42
|
+
[project.optional-dependencies]
|
|
43
|
+
treesitter = [
|
|
44
|
+
"tree-sitter>=0.23",
|
|
45
|
+
"tree-sitter-language-pack>=1.0",
|
|
46
|
+
]
|
|
47
|
+
contracts = [
|
|
48
|
+
"victor-contracts>=0.7.0,<1.0",
|
|
49
|
+
]
|
|
50
|
+
dev = [
|
|
51
|
+
"pytest>=8.0",
|
|
52
|
+
"ruff>=0.5",
|
|
53
|
+
"tree-sitter>=0.23",
|
|
54
|
+
"tree-sitter-language-pack>=1.0",
|
|
55
|
+
]
|
|
56
|
+
|
|
57
|
+
[project.urls]
|
|
58
|
+
Homepage = "https://github.com/vjsingh1984/victor"
|
|
59
|
+
Repository = "https://github.com/vjsingh1984/victor"
|
|
60
|
+
|
|
61
|
+
[tool.setuptools.packages.find]
|
|
62
|
+
where = ["."]
|
|
63
|
+
include = ["victor_codegraph*"]
|
|
64
|
+
|
|
65
|
+
[tool.ruff]
|
|
66
|
+
line-length = 100
|
|
67
|
+
target-version = "py310"
|
|
68
|
+
|
|
69
|
+
# Self-contained pytest config so running `pytest` from this package directory uses
|
|
70
|
+
# THIS file as the rootdir/inifile and does NOT inherit the victor monorepo root's
|
|
71
|
+
# `addopts = --cov=victor` (pytest-cov isn't a dep here -> "unrecognized arguments").
|
|
72
|
+
[tool.pytest.ini_options]
|
|
73
|
+
testpaths = ["tests"]
|
|
74
|
+
addopts = ""
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
"""Adapter tests — ProximaRecord shape projection (no DB, no embed by default)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from victor_codegraph import parse, to_proxima_records
|
|
6
|
+
|
|
7
|
+
SAMPLE = '''\
|
|
8
|
+
def a():
|
|
9
|
+
return b()
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def b():
|
|
13
|
+
return 1
|
|
14
|
+
'''
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def test_symbol_records_shape():
|
|
18
|
+
parsed = parse(SAMPLE, file_path="m.py")
|
|
19
|
+
records = to_proxima_records(parsed, repo_graph_id="repo1")
|
|
20
|
+
nodes = [r for r in records if "graph_node" in r["labels"]]
|
|
21
|
+
assert len(nodes) == 2
|
|
22
|
+
n = nodes[0]
|
|
23
|
+
assert n["oid"].startswith("graph/repo1/node/")
|
|
24
|
+
assert n["labels"] == ["graph_node", "code_symbol"]
|
|
25
|
+
assert n["branch_id"] == "main"
|
|
26
|
+
assert n["props"]["lang"] == "python"
|
|
27
|
+
assert n["props"]["ast_kind"] in ("FUNCTION", "METHOD", "CONSTRUCTOR")
|
|
28
|
+
assert n["embeddings"] == [] # no embedder supplied
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def test_edge_records_reference_node_oids():
|
|
32
|
+
parsed = parse(SAMPLE, file_path="m.py")
|
|
33
|
+
records = to_proxima_records(parsed, repo_graph_id="repo1")
|
|
34
|
+
edges = [r for r in records if "graph_edge" in r["labels"]]
|
|
35
|
+
assert edges, "expected a CALLS edge (a -> b)"
|
|
36
|
+
e = edges[0]
|
|
37
|
+
assert e["edge"]["from_oid"].startswith("graph/repo1/node/")
|
|
38
|
+
assert e["edge"]["to_oid"].startswith("graph/repo1/node/")
|
|
39
|
+
assert e["edge"]["edge_type"] == "CALLS"
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def test_embedder_populates_embedding_cell():
|
|
43
|
+
parsed = parse(SAMPLE, file_path="m.py")
|
|
44
|
+
records = to_proxima_records(
|
|
45
|
+
parsed, repo_graph_id="repo1", embedder=lambda text: [0.0] * 384
|
|
46
|
+
)
|
|
47
|
+
node = next(r for r in records if "graph_node" in r["labels"])
|
|
48
|
+
assert len(node["embeddings"]) == 1
|
|
49
|
+
cell = node["embeddings"][0]
|
|
50
|
+
assert cell["modality"] == "code"
|
|
51
|
+
assert cell["dim"] == 384
|
|
52
|
+
assert len(cell["values"]) == 384
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
"""Python parser tests — run fully offline (stdlib ast, no grammar needed)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from victor_codegraph import CodeSymbolType, parse
|
|
6
|
+
from victor_codegraph.model import CodeRelationType
|
|
7
|
+
|
|
8
|
+
SAMPLE = '''\
|
|
9
|
+
"""Module doc."""
|
|
10
|
+
import os
|
|
11
|
+
from typing import Any
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def top_level(x: int, y: str = "a") -> bool:
|
|
15
|
+
"""A function."""
|
|
16
|
+
return helper(x)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def helper(x: int) -> int:
|
|
20
|
+
return x + 1
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class Greeter(Base):
|
|
24
|
+
"""A class."""
|
|
25
|
+
|
|
26
|
+
def __init__(self, name: str) -> None:
|
|
27
|
+
self.name = name
|
|
28
|
+
|
|
29
|
+
def greet(self) -> str:
|
|
30
|
+
return top_level(1)
|
|
31
|
+
'''
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def test_extracts_functions_classes_methods():
|
|
35
|
+
parsed = parse(SAMPLE, file_path="pkg/mod.py")
|
|
36
|
+
names = {s.simple_name for s in parsed.symbols}
|
|
37
|
+
assert {"top_level", "helper", "Greeter", "__init__", "greet"} <= names
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def test_symbol_types_and_constructor():
|
|
41
|
+
parsed = parse(SAMPLE, file_path="pkg/mod.py")
|
|
42
|
+
by_name = {s.simple_name: s for s in parsed.symbols}
|
|
43
|
+
assert by_name["top_level"].symbol_type == CodeSymbolType.FUNCTION
|
|
44
|
+
assert by_name["greet"].symbol_type == CodeSymbolType.METHOD
|
|
45
|
+
assert by_name["__init__"].symbol_type == CodeSymbolType.CONSTRUCTOR
|
|
46
|
+
assert by_name["Greeter"].symbol_type == CodeSymbolType.CLASS
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def test_signature_and_docstring_and_params():
|
|
50
|
+
parsed = parse(SAMPLE, file_path="pkg/mod.py")
|
|
51
|
+
fn = next(s for s in parsed.symbols if s.simple_name == "top_level")
|
|
52
|
+
assert fn.signature == "top_level(x: int, y: str) -> bool"
|
|
53
|
+
assert fn.documentation == "A function."
|
|
54
|
+
assert {p["name"] for p in fn.parameters} == {"x", "y"}
|
|
55
|
+
assert fn.return_type == "bool"
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def test_imports():
|
|
59
|
+
parsed = parse(SAMPLE, file_path="pkg/mod.py")
|
|
60
|
+
assert any("import os" in i for i in parsed.imports)
|
|
61
|
+
assert any("from typing import Any" in i for i in parsed.imports)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def test_calls_relation_resolved_to_ids():
|
|
65
|
+
parsed = parse(SAMPLE, file_path="pkg/mod.py")
|
|
66
|
+
ids = {s.id for s in parsed.symbols}
|
|
67
|
+
calls = [r for r in parsed.relations if r.relation_type == CodeRelationType.CALLS]
|
|
68
|
+
assert calls, "expected at least one CALLS edge"
|
|
69
|
+
for r in calls:
|
|
70
|
+
assert r.from_symbol_id in ids
|
|
71
|
+
assert r.to_symbol_id in ids # resolved to a real symbol, not a bare name
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def test_unresolved_callee_is_retained():
|
|
75
|
+
"""An outgoing call to a symbol not defined in-file is KEPT (bare-name target,
|
|
76
|
+
confidence < 1.0, with a call_site) so cross-file/external references survive for
|
|
77
|
+
CPG blast-radius / impact analysis rather than being silently dropped."""
|
|
78
|
+
parsed = parse("def authenticate():\n return verify()\n", file_path="svc/auth.py")
|
|
79
|
+
calls = list(parsed.relations)
|
|
80
|
+
assert len(calls) == 1
|
|
81
|
+
r = calls[0]
|
|
82
|
+
assert r.to_symbol_id == "verify" # bare name — NOT dropped
|
|
83
|
+
assert r.confidence < 1.0
|
|
84
|
+
assert r.call_site is not None
|
|
85
|
+
assert r.call_site.start_line == 2 # the verify() call line (1-based)
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def test_recursive_self_call_is_dropped():
|
|
89
|
+
"""A recursive call to the enclosing symbol emits no self-edge."""
|
|
90
|
+
parsed = parse("def f():\n return f()\n", file_path="m.py")
|
|
91
|
+
assert [r for r in parsed.relations] == []
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def test_extends_relation():
|
|
95
|
+
parsed = parse(SAMPLE, file_path="pkg/mod.py")
|
|
96
|
+
cls = next(s for s in parsed.symbols if s.simple_name == "Greeter")
|
|
97
|
+
assert any("extends(Base)" in m for m in cls.modifiers)
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def test_deterministic_ids():
|
|
101
|
+
a = parse(SAMPLE, file_path="pkg/mod.py")
|
|
102
|
+
b = parse(SAMPLE, file_path="pkg/mod.py")
|
|
103
|
+
assert [s.id for s in a.symbols] == [s.id for s in b.symbols]
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def test_syntax_error_falls_back_to_no_symbols():
|
|
107
|
+
parsed = parse("def broken(:\n", file_path="bad.py")
|
|
108
|
+
assert parsed.symbols == []
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
"""Size-capping tests — the headline gap fix vs ProximaDB's donor code.py."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from victor_codegraph import ChunkConfig, chunk
|
|
6
|
+
from victor_codegraph.model import (
|
|
7
|
+
CodeSymbol,
|
|
8
|
+
CodeSymbolType,
|
|
9
|
+
SourceLocation,
|
|
10
|
+
)
|
|
11
|
+
from victor_codegraph.sizing import chunks_for_symbol
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _big_symbol(n_lines: int) -> CodeSymbol:
|
|
15
|
+
body = "\n".join(f" x{i} = compute({i})" for i in range(n_lines))
|
|
16
|
+
source = f"def big():\n{body}\n"
|
|
17
|
+
return CodeSymbol(
|
|
18
|
+
id="sym1",
|
|
19
|
+
symbol_type=CodeSymbolType.FUNCTION,
|
|
20
|
+
fully_qualified_name="m::big",
|
|
21
|
+
simple_name="big",
|
|
22
|
+
location=SourceLocation(file_path="m.py", start_line=1, end_line=n_lines + 2),
|
|
23
|
+
source_code=source,
|
|
24
|
+
language="python",
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def test_small_symbol_is_single_chunk():
|
|
29
|
+
sym = _big_symbol(3)
|
|
30
|
+
chunks = chunks_for_symbol(sym, ChunkConfig())
|
|
31
|
+
assert len(chunks) == 1
|
|
32
|
+
assert chunks[0].metadata["chunk_total"] == 1
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def test_oversized_symbol_is_body_split():
|
|
36
|
+
sym = _big_symbol(400)
|
|
37
|
+
cfg = ChunkConfig(max_chunk_tokens=128, chunk_overlap_tokens=16)
|
|
38
|
+
chunks = chunks_for_symbol(sym, cfg)
|
|
39
|
+
assert len(chunks) > 1, "oversized symbol must split"
|
|
40
|
+
# No chunk exceeds the char budget.
|
|
41
|
+
for c in chunks:
|
|
42
|
+
assert len(c.text) <= cfg.max_chunk_chars
|
|
43
|
+
# All sub-chunks share the parent symbol id and have hierarchical ids.
|
|
44
|
+
assert all(c.symbol_id == "sym1" for c in chunks)
|
|
45
|
+
assert all(c.chunk_id.startswith("sym1#body#") for c in chunks)
|
|
46
|
+
assert all(c.metadata["chunk_total"] == len(chunks) for c in chunks)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def test_chunk_end_to_end_python_respects_budget():
|
|
50
|
+
src = "def f():\n" + "\n".join(f" a{i} = {i}" for i in range(300)) + "\n"
|
|
51
|
+
cfg = ChunkConfig(max_chunk_tokens=100)
|
|
52
|
+
chunks = chunk(src, file_path="big.py", config=cfg)
|
|
53
|
+
assert chunks
|
|
54
|
+
for c in chunks:
|
|
55
|
+
assert len(c.text) <= cfg.max_chunk_chars
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def test_unknown_language_falls_back_to_sliding_window():
|
|
59
|
+
src = "\n".join(f"line {i}" for i in range(200))
|
|
60
|
+
chunks = chunk(src, file_path="notes.unknownext", config=ChunkConfig(max_chunk_tokens=50))
|
|
61
|
+
assert chunks
|
|
62
|
+
assert all(c.metadata.get("strategy") == "sliding_window" for c in chunks)
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
"""JS/TS extraction tests — the donor stub fix.
|
|
2
|
+
|
|
3
|
+
Guarded with importorskip so the suite stays green where the grammar pack isn't
|
|
4
|
+
installed; where it is, these assert the stub is genuinely replaced (non-empty symbols).
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import pytest
|
|
10
|
+
|
|
11
|
+
pytest.importorskip("tree_sitter_language_pack")
|
|
12
|
+
|
|
13
|
+
from victor_codegraph import CodeSymbolType, parse # noqa: E402
|
|
14
|
+
from victor_codegraph.treesitter_parser import GrammarUnavailable, parse_treesitter # noqa: E402
|
|
15
|
+
|
|
16
|
+
JS = """\
|
|
17
|
+
import { x } from "./x";
|
|
18
|
+
|
|
19
|
+
export function add(a, b) {
|
|
20
|
+
return a + b;
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
const mul = (a, b) => a * b;
|
|
24
|
+
|
|
25
|
+
class Calc {
|
|
26
|
+
constructor(seed) {
|
|
27
|
+
this.seed = seed;
|
|
28
|
+
}
|
|
29
|
+
run(n) {
|
|
30
|
+
return add(n, this.seed);
|
|
31
|
+
}
|
|
32
|
+
}
|
|
33
|
+
"""
|
|
34
|
+
|
|
35
|
+
TS = """\
|
|
36
|
+
export function greet(name: string): string {
|
|
37
|
+
return `hi ${name}`;
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
class Service {
|
|
41
|
+
handle(req: Request): Response {
|
|
42
|
+
return new Response();
|
|
43
|
+
}
|
|
44
|
+
}
|
|
45
|
+
"""
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _names(language: str, src: str):
|
|
49
|
+
try:
|
|
50
|
+
parsed = parse_treesitter(src, f"f.{language}", language)
|
|
51
|
+
except GrammarUnavailable:
|
|
52
|
+
pytest.skip(f"{language} grammar not installed")
|
|
53
|
+
return {s.simple_name for s in parsed.symbols}, parsed
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def test_javascript_functions_classes_methods_arrow():
|
|
57
|
+
names, parsed = _names("javascript", JS)
|
|
58
|
+
# The donor stub returned []; here we must see real symbols.
|
|
59
|
+
assert {"add", "Calc", "run"} <= names
|
|
60
|
+
assert "mul" in names # const arrow function
|
|
61
|
+
assert any(s.symbol_type == CodeSymbolType.CLASS for s in parsed.symbols)
|
|
62
|
+
assert any("import" in i for i in parsed.imports)
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def test_typescript_functions_and_methods():
|
|
66
|
+
names, _ = _names("typescript", TS)
|
|
67
|
+
assert {"greet", "Service", "handle"} <= names
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def test_chunk_jsts_is_size_capped():
|
|
71
|
+
parsed = parse(JS, language="javascript", file_path="f.js")
|
|
72
|
+
if not parsed.symbols:
|
|
73
|
+
pytest.skip("javascript grammar not installed")
|
|
74
|
+
assert parsed.symbols # routed through tree-sitter, not the empty stub
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
"""victor-codegraph — shared code->CPG chunker.
|
|
2
|
+
|
|
3
|
+
One tree-sitter symbol+relation chunker, three consumers (Victor, ProximaDB SDK,
|
|
4
|
+
AnvaiOps). See ProximaDB ADR-029 / Victor ADR-014.
|
|
5
|
+
|
|
6
|
+
from victor_codegraph import chunk, parse, to_proxima_records, ChunkConfig
|
|
7
|
+
|
|
8
|
+
chunks = chunk(source, file_path="foo.py") # size-capped, embeddable
|
|
9
|
+
parsed = parse(source, file_path="foo.py") # symbols + relations
|
|
10
|
+
records = to_proxima_records(parsed, repo_graph_id="myrepo")
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
from .adapter import relation_to_record, symbol_to_record, to_proxima_records
|
|
16
|
+
from .config import ChunkConfig
|
|
17
|
+
from .languages import detect_language
|
|
18
|
+
from .model import (
|
|
19
|
+
CodeChunk,
|
|
20
|
+
CodeRelation,
|
|
21
|
+
CodeRelationType,
|
|
22
|
+
CodeSymbol,
|
|
23
|
+
CodeSymbolType,
|
|
24
|
+
ParsedCode,
|
|
25
|
+
SourceLocation,
|
|
26
|
+
)
|
|
27
|
+
from .parser import chunk, parse
|
|
28
|
+
|
|
29
|
+
__version__ = "0.0.1"
|
|
30
|
+
|
|
31
|
+
__all__ = [
|
|
32
|
+
"__version__",
|
|
33
|
+
"chunk",
|
|
34
|
+
"parse",
|
|
35
|
+
"ChunkConfig",
|
|
36
|
+
"detect_language",
|
|
37
|
+
"to_proxima_records",
|
|
38
|
+
"symbol_to_record",
|
|
39
|
+
"relation_to_record",
|
|
40
|
+
"CodeChunk",
|
|
41
|
+
"CodeSymbol",
|
|
42
|
+
"CodeRelation",
|
|
43
|
+
"CodeSymbolType",
|
|
44
|
+
"CodeRelationType",
|
|
45
|
+
"ParsedCode",
|
|
46
|
+
"SourceLocation",
|
|
47
|
+
]
|