deltagraphar 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. deltagraphar-0.1.0/LICENSE +21 -0
  2. deltagraphar-0.1.0/PKG-INFO +204 -0
  3. deltagraphar-0.1.0/README.md +172 -0
  4. deltagraphar-0.1.0/deltagraphar/__init__.py +0 -0
  5. deltagraphar-0.1.0/deltagraphar/cli.py +140 -0
  6. deltagraphar-0.1.0/deltagraphar/format/__init__.py +0 -0
  7. deltagraphar-0.1.0/deltagraphar/format/paths.py +35 -0
  8. deltagraphar-0.1.0/deltagraphar/format/reader.py +58 -0
  9. deltagraphar-0.1.0/deltagraphar/format/schema.py +136 -0
  10. deltagraphar-0.1.0/deltagraphar/format/writer.py +14 -0
  11. deltagraphar-0.1.0/deltagraphar/store/__init__.py +0 -0
  12. deltagraphar-0.1.0/deltagraphar/store/compaction.py +119 -0
  13. deltagraphar-0.1.0/deltagraphar/store/graphstore.py +270 -0
  14. deltagraphar-0.1.0/deltagraphar/store/ids.py +99 -0
  15. deltagraphar-0.1.0/deltagraphar/versioning/__init__.py +0 -0
  16. deltagraphar-0.1.0/deltagraphar/versioning/backend.py +40 -0
  17. deltagraphar-0.1.0/deltagraphar/versioning/lakefs_backend.py +91 -0
  18. deltagraphar-0.1.0/deltagraphar/versioning/local_backend.py +86 -0
  19. deltagraphar-0.1.0/deltagraphar.egg-info/PKG-INFO +204 -0
  20. deltagraphar-0.1.0/deltagraphar.egg-info/SOURCES.txt +32 -0
  21. deltagraphar-0.1.0/deltagraphar.egg-info/dependency_links.txt +1 -0
  22. deltagraphar-0.1.0/deltagraphar.egg-info/entry_points.txt +2 -0
  23. deltagraphar-0.1.0/deltagraphar.egg-info/requires.txt +9 -0
  24. deltagraphar-0.1.0/deltagraphar.egg-info/top_level.txt +1 -0
  25. deltagraphar-0.1.0/pyproject.toml +50 -0
  26. deltagraphar-0.1.0/setup.cfg +4 -0
  27. deltagraphar-0.1.0/tests/test_append_and_read.py +105 -0
  28. deltagraphar-0.1.0/tests/test_cli.py +68 -0
  29. deltagraphar-0.1.0/tests/test_compaction.py +157 -0
  30. deltagraphar-0.1.0/tests/test_correctness_property.py +116 -0
  31. deltagraphar-0.1.0/tests/test_format_roundtrip.py +163 -0
  32. deltagraphar-0.1.0/tests/test_schema_evolution.py +79 -0
  33. deltagraphar-0.1.0/tests/test_time_travel.py +112 -0
  34. deltagraphar-0.1.0/tests/test_versioning.py +141 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Nishank Mahore
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,204 @@
1
+ Metadata-Version: 2.4
2
+ Name: deltagraphar
3
+ Version: 0.1.0
4
+ Summary: A mutable, versioned property-graph store built on the GraphAr physical layout with ACID semantics delegated to LakeFS.
5
+ Author-email: Nishank Mahore <nishankmahore@gmail.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/nishankmahore/DeltaGraphAr
8
+ Project-URL: Repository, https://github.com/nishankmahore/DeltaGraphAr
9
+ Project-URL: Bug Tracker, https://github.com/nishankmahore/DeltaGraphAr/issues
10
+ Keywords: graph,graphar,lakefs,parquet,versioning,property-graph
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Science/Research
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Topic :: Database
19
+ Classifier: Topic :: Scientific/Engineering
20
+ Requires-Python: >=3.10
21
+ Description-Content-Type: text/markdown
22
+ License-File: LICENSE
23
+ Requires-Dist: pyarrow>=14.0
24
+ Requires-Dist: pyyaml>=6.0
25
+ Requires-Dist: lakefs>=0.7
26
+ Requires-Dist: lakefs-spec>=0.12
27
+ Provides-Extra: dev
28
+ Requires-Dist: pytest>=8.0; extra == "dev"
29
+ Requires-Dist: hypothesis>=6.100; extra == "dev"
30
+ Requires-Dist: pandas>=2.0; extra == "dev"
31
+ Dynamic: license-file
32
+
33
+ # DeltaGraphAr
34
+
35
+ A mutable, versioned property-graph store built on the [GraphAr](https://graphar.apache.org) physical layout (chunked Parquet + YAML metadata) with ACID semantics delegated to [LakeFS](https://lakefs.io).
36
+
37
+ Pure-Python reference implementation. Suitable for graph datasets that evolve over time and need repeatable reads at arbitrary historical snapshots.
38
+
39
+ ## What it does
40
+
41
+ - Stores vertices and edges as chunked Parquet files following the GraphAr layout spec.
42
+ - Appends edges to an unordered "delta" region; CSR-ordered adjacency is built on demand via `compact()`.
43
+ - Every mutating operation produces a versioned commit. Any commit ref can be used as a `ref=` argument to read historical state.
44
+ - Vertices are identified by arbitrary string logical IDs; the ID map translates to contiguous physical chunk-aligned integers for storage.
45
+ - LakeFS backend delegates branching, tagging, and atomic commits to a running LakeFS instance. The local backend (copy-on-commit) requires no external dependencies.
46
+
47
+ ## Install
48
+
49
+ ```bash
50
+ pip install deltagraphar
51
+ ```
52
+
53
+ Requires Python ≥ 3.10.
54
+
55
+ For development (includes pytest, hypothesis, pandas):
56
+
57
+ ```bash
58
+ git clone https://github.com/nishankmahore/DeltaGraphAr.git
59
+ cd DeltaGraphAr
60
+ pip install -e ".[dev]"
61
+ ```
62
+
63
+ ## Quickstart
64
+
65
+ ```bash
66
+ python examples/quickstart.py
67
+ ```
68
+
69
+ Or with LakeFS (requires `docker compose up` first):
70
+
71
+ ```bash
72
+ docker compose up -d
73
+ python examples/ldbc_snb_tiny_loader.py
74
+ ```
75
+
76
+ ## API
77
+
78
+ ```python
79
+ from deltagraphar.versioning.local_backend import LocalBackend
80
+ from deltagraphar.store.graphstore import GraphStore
81
+ from deltagraphar.format.schema import GraphInfo, VertexInfo, EdgeInfo
82
+
83
+ b = LocalBackend("/path/to/repo")
84
+ vi = VertexInfo(label="person", chunk_size=65_536)
85
+ ei = EdgeInfo("person", "knows", "person", chunk_size=1_048_576, src_chunk_size=65_536)
86
+ gi = GraphInfo(name="social", prefix="", vertex_infos=[vi], edge_infos=[ei])
87
+
88
+ gs = GraphStore.create(b, gi)
89
+ gs.add_vertices("person", [{"id": "alice"}, {"id": "bob"}])
90
+ gs.add_edges(("person", "knows", "person"), [{"src": "alice", "dst": "bob"}])
91
+ gs.compact(("person", "knows", "person"))
92
+
93
+ neighbors = gs.out_neighbors("person", "alice", ("person", "knows", "person"))
94
+ # → ["bob"]
95
+
96
+ # Time travel
97
+ ref = gs.snapshots()[1].ref
98
+ old_neighbors = gs.out_neighbors("person", "alice", ("person", "knows", "person"), ref=ref)
99
+ ```
100
+
101
+ ## CLI
102
+
103
+ ```bash
104
+ deltagraphar log --repo /path/to/repo
105
+ deltagraphar neighbors --repo /path/to/repo --label person --vertex alice --etype person,knows,person
106
+ deltagraphar compact --repo /path/to/repo --etype person,knows,person
107
+ deltagraphar tag --repo /path/to/repo v1
108
+ ```
109
+
110
+ ## Schema evolution
111
+
112
+ Add a new property group to existing vertices without rewriting existing data:
113
+
114
+ ```python
115
+ from deltagraphar.format.schema import PropertyGroup, Property
116
+
117
+ pg = PropertyGroup([Property("score", "float64")], prefix="person_score")
118
+ gs.add_property_group("vertex:person", pg, {"alice": 0.9, "bob": 0.7})
119
+ ```
120
+
121
+ ## Tests
122
+
123
+ ```bash
124
+ pytest
125
+ ```
126
+
127
+ 51 tests, 2 skipped (LakeFS integration — requires `docker compose up`).
128
+
129
+ ## Benchmarks
130
+
131
+ ```bash
132
+ python benchmarks/bench_v1.py --rows 10000 --queries 1000
133
+ ```
134
+
135
+ ## Architecture
136
+
137
+ ```
138
+ GraphStore
139
+ ├── IDMap — logical ↔ physical vertex ID, chunk-aligned Parquet
140
+ ├── compaction.py — delta→CSR merge, offset sweep, property reorder
141
+ └── VersioningBackend (ABC)
142
+ ├── LocalBackend — copy-on-commit snapshots, no external deps
143
+ └── LakeFSBackend — atomic commits, branching, tagging via LakeFS API
144
+
145
+ Physical layout (GraphAr spec)
146
+ vertex/<label>/<pg_prefix>/chunk<k> — vertex property tables
147
+ vertex/<label>/__vid_map__/chunk<k> — ID map
148
+ edge/<src>_<et>_<dst>/ordered_by_source/ — CSR adj list + offsets
149
+ edge/<src>_<et>_<dst>/unordered_by_source/ — delta (append-only per vchunk)
150
+ ```
151
+
152
+ ## Data storage layout
153
+
154
+ Data is stored as chunked Parquet files under a local repo directory. Using the movie graph as an example (`repo_dir = "/tmp/movies_repo"`):
155
+
156
+ ```
157
+ /tmp/movies_repo/
158
+ ├── work/ ← current HEAD (mutable working copy)
159
+ │ ├── movies.graph.yml ← graph manifest
160
+ │ ├── Person.vertex.yml ← vertex schema
161
+ │ ├── Movie.vertex.yml
162
+ │ ├── vertex/
163
+ │ │ ├── Person/
164
+ │ │ │ ├── person_name/
165
+ │ │ │ │ └── chunk0 ← name column (Parquet)
166
+ │ │ │ └── __vid_map__/
167
+ │ │ │ └── chunk0 ← logical↔physical ID map
168
+ │ │ └── Movie/
169
+ │ │ └── movie_props/
170
+ │ │ └── chunk0 ← title, released columns (Parquet)
171
+ │ └── edge/
172
+ │ └── Person_ACTED_IN_Movie/
173
+ │ ├── Person_ACTED_IN_Movie.edge.yml ← edge schema
174
+ │ ├── ordered_by_source/ ← CSR (written after compact)
175
+ │ │ ├── adj_list/
176
+ │ │ │ └── part0/chunk0 ← sorted src/dst pairs (Parquet)
177
+ │ │ └── offset/
178
+ │ │ └── part0/chunk0 ← CSR offset array (Parquet)
179
+ │ └── unordered_by_source/ ← delta (append-only, pre-compact)
180
+ │ └── adj_list/
181
+ │ └── part0/chunk0 ← unsorted src/dst pairs (Parquet)
182
+ └── snapshots/
183
+ ├── <sha1ref>/ ← immutable copy-on-commit snapshot
184
+ ├── <sha1ref>/
185
+ └── ... ← one directory per commit
186
+ ```
187
+
188
+ To persist data across runs, replace `tempfile.TemporaryDirectory()` with a fixed path:
189
+
190
+ ```python
191
+ repo_dir = "/tmp/movies_repo"
192
+ b = LocalBackend(repo_dir)
193
+ ```
194
+
195
+ To inspect any chunk file directly:
196
+
197
+ ```python
198
+ import pyarrow.parquet as pq
199
+ pq.read_table("/tmp/movies_repo/work/vertex/Person/person_name/chunk0").to_pandas()
200
+ ```
201
+
202
+ ## License
203
+
204
+ MIT — see [LICENSE](LICENSE)
@@ -0,0 +1,172 @@
1
+ # DeltaGraphAr
2
+
3
+ A mutable, versioned property-graph store built on the [GraphAr](https://graphar.apache.org) physical layout (chunked Parquet + YAML metadata) with ACID semantics delegated to [LakeFS](https://lakefs.io).
4
+
5
+ Pure-Python reference implementation. Suitable for graph datasets that evolve over time and need repeatable reads at arbitrary historical snapshots.
6
+
7
+ ## What it does
8
+
9
+ - Stores vertices and edges as chunked Parquet files following the GraphAr layout spec.
10
+ - Appends edges to an unordered "delta" region; CSR-ordered adjacency is built on demand via `compact()`.
11
+ - Every mutating operation produces a versioned commit. Any commit ref can be used as a `ref=` argument to read historical state.
12
+ - Vertices are identified by arbitrary string logical IDs; the ID map translates to contiguous physical chunk-aligned integers for storage.
13
+ - LakeFS backend delegates branching, tagging, and atomic commits to a running LakeFS instance. The local backend (copy-on-commit) requires no external dependencies.
14
+
15
+ ## Install
16
+
17
+ ```bash
18
+ pip install deltagraphar
19
+ ```
20
+
21
+ Requires Python ≥ 3.10.
22
+
23
+ For development (includes pytest, hypothesis, pandas):
24
+
25
+ ```bash
26
+ git clone https://github.com/nishankmahore/DeltaGraphAr.git
27
+ cd DeltaGraphAr
28
+ pip install -e ".[dev]"
29
+ ```
30
+
31
+ ## Quickstart
32
+
33
+ ```bash
34
+ python examples/quickstart.py
35
+ ```
36
+
37
+ Or with LakeFS (requires `docker compose up` first):
38
+
39
+ ```bash
40
+ docker compose up -d
41
+ python examples/ldbc_snb_tiny_loader.py
42
+ ```
43
+
44
+ ## API
45
+
46
+ ```python
47
+ from deltagraphar.versioning.local_backend import LocalBackend
48
+ from deltagraphar.store.graphstore import GraphStore
49
+ from deltagraphar.format.schema import GraphInfo, VertexInfo, EdgeInfo
50
+
51
+ b = LocalBackend("/path/to/repo")
52
+ vi = VertexInfo(label="person", chunk_size=65_536)
53
+ ei = EdgeInfo("person", "knows", "person", chunk_size=1_048_576, src_chunk_size=65_536)
54
+ gi = GraphInfo(name="social", prefix="", vertex_infos=[vi], edge_infos=[ei])
55
+
56
+ gs = GraphStore.create(b, gi)
57
+ gs.add_vertices("person", [{"id": "alice"}, {"id": "bob"}])
58
+ gs.add_edges(("person", "knows", "person"), [{"src": "alice", "dst": "bob"}])
59
+ gs.compact(("person", "knows", "person"))
60
+
61
+ neighbors = gs.out_neighbors("person", "alice", ("person", "knows", "person"))
62
+ # → ["bob"]
63
+
64
+ # Time travel
65
+ ref = gs.snapshots()[1].ref
66
+ old_neighbors = gs.out_neighbors("person", "alice", ("person", "knows", "person"), ref=ref)
67
+ ```
68
+
69
+ ## CLI
70
+
71
+ ```bash
72
+ deltagraphar log --repo /path/to/repo
73
+ deltagraphar neighbors --repo /path/to/repo --label person --vertex alice --etype person,knows,person
74
+ deltagraphar compact --repo /path/to/repo --etype person,knows,person
75
+ deltagraphar tag --repo /path/to/repo v1
76
+ ```
77
+
78
+ ## Schema evolution
79
+
80
+ Add a new property group to existing vertices without rewriting existing data:
81
+
82
+ ```python
83
+ from deltagraphar.format.schema import PropertyGroup, Property
84
+
85
+ pg = PropertyGroup([Property("score", "float64")], prefix="person_score")
86
+ gs.add_property_group("vertex:person", pg, {"alice": 0.9, "bob": 0.7})
87
+ ```
88
+
89
+ ## Tests
90
+
91
+ ```bash
92
+ pytest
93
+ ```
94
+
95
+ 51 tests, 2 skipped (LakeFS integration — requires `docker compose up`).
96
+
97
+ ## Benchmarks
98
+
99
+ ```bash
100
+ python benchmarks/bench_v1.py --rows 10000 --queries 1000
101
+ ```
102
+
103
+ ## Architecture
104
+
105
+ ```
106
+ GraphStore
107
+ ├── IDMap — logical ↔ physical vertex ID, chunk-aligned Parquet
108
+ ├── compaction.py — delta→CSR merge, offset sweep, property reorder
109
+ └── VersioningBackend (ABC)
110
+ ├── LocalBackend — copy-on-commit snapshots, no external deps
111
+ └── LakeFSBackend — atomic commits, branching, tagging via LakeFS API
112
+
113
+ Physical layout (GraphAr spec)
114
+ vertex/<label>/<pg_prefix>/chunk<k> — vertex property tables
115
+ vertex/<label>/__vid_map__/chunk<k> — ID map
116
+ edge/<src>_<et>_<dst>/ordered_by_source/ — CSR adj list + offsets
117
+ edge/<src>_<et>_<dst>/unordered_by_source/ — delta (append-only per vchunk)
118
+ ```
119
+
120
+ ## Data storage layout
121
+
122
+ Data is stored as chunked Parquet files under a local repo directory. Using the movie graph as an example (`repo_dir = "/tmp/movies_repo"`):
123
+
124
+ ```
125
+ /tmp/movies_repo/
126
+ ├── work/ ← current HEAD (mutable working copy)
127
+ │ ├── movies.graph.yml ← graph manifest
128
+ │ ├── Person.vertex.yml ← vertex schema
129
+ │ ├── Movie.vertex.yml
130
+ │ ├── vertex/
131
+ │ │ ├── Person/
132
+ │ │ │ ├── person_name/
133
+ │ │ │ │ └── chunk0 ← name column (Parquet)
134
+ │ │ │ └── __vid_map__/
135
+ │ │ │ └── chunk0 ← logical↔physical ID map
136
+ │ │ └── Movie/
137
+ │ │ └── movie_props/
138
+ │ │ └── chunk0 ← title, released columns (Parquet)
139
+ │ └── edge/
140
+ │ └── Person_ACTED_IN_Movie/
141
+ │ ├── Person_ACTED_IN_Movie.edge.yml ← edge schema
142
+ │ ├── ordered_by_source/ ← CSR (written after compact)
143
+ │ │ ├── adj_list/
144
+ │ │ │ └── part0/chunk0 ← sorted src/dst pairs (Parquet)
145
+ │ │ └── offset/
146
+ │ │ └── part0/chunk0 ← CSR offset array (Parquet)
147
+ │ └── unordered_by_source/ ← delta (append-only, pre-compact)
148
+ │ └── adj_list/
149
+ │ └── part0/chunk0 ← unsorted src/dst pairs (Parquet)
150
+ └── snapshots/
151
+ ├── <sha1ref>/ ← immutable copy-on-commit snapshot
152
+ ├── <sha1ref>/
153
+ └── ... ← one directory per commit
154
+ ```
155
+
156
+ To persist data across runs, replace `tempfile.TemporaryDirectory()` with a fixed path:
157
+
158
+ ```python
159
+ repo_dir = "/tmp/movies_repo"
160
+ b = LocalBackend(repo_dir)
161
+ ```
162
+
163
+ To inspect any chunk file directly:
164
+
165
+ ```python
166
+ import pyarrow.parquet as pq
167
+ pq.read_table("/tmp/movies_repo/work/vertex/Person/person_name/chunk0").to_pandas()
168
+ ```
169
+
170
+ ## License
171
+
172
+ MIT — see [LICENSE](LICENSE)
File without changes
@@ -0,0 +1,140 @@
1
+ """Command-line interface for DeltaGraphAr."""
2
+ from __future__ import annotations
3
+
4
+ import argparse
5
+ import json
6
+ import sys
7
+
8
+
9
def _get_backend(args):
    """Return a ``LocalBackend`` for the repository path in ``args.repo``."""
    # Deferred import: the backend module is only loaded when a command runs.
    from deltagraphar.versioning import local_backend

    return local_backend.LocalBackend(args.repo)
12
+
13
+
14
def cmd_log(args):
    """Print the commit log of the repository, one line per commit.

    Each line contains the first 8 characters of the commit ref, the commit
    timestamp formatted in UTC, the commit message and, when it is truthy,
    the commit metadata.  Prints ``(no commits)`` when the log is empty.
    """
    from datetime import datetime, timezone

    history = _get_backend(args).log()
    if not history:
        print("(no commits)")
        return

    for commit in history:
        stamp = datetime.fromtimestamp(commit.timestamp, tz=timezone.utc)
        when = stamp.strftime("%Y-%m-%d %H:%M:%S UTC")
        suffix = f" {commit.metadata}" if commit.metadata else ""
        print(f"{commit.ref[:8]} {when} {commit.message}{suffix}")
25
+
26
+
27
def cmd_tag(args):
    """Tag the last entry of the commit log with ``args.name``.

    When the log is empty, an error is written to stderr and the process
    exits with status 1.
    """
    backend = _get_backend(args)
    history = backend.log()
    if history:
        target = history[-1].ref
        backend.tag(args.name, target)
        print(f"tagged {target[:8]} as {args.name!r}")
        return

    print("error: no commits to tag", file=sys.stderr)
    sys.exit(1)
36
+
37
+
38
def cmd_neighbors(args):
    """Print the result of ``GraphStore.out_neighbors`` for one vertex as JSON.

    The vertex and edge schemas are read from their YAML files (at
    ``args.ref`` when given), a single-vertex/single-edge ``GraphInfo`` is
    assembled from them, and the neighbor query is run against it.
    Exits with status 1 when ``--etype`` does not have exactly three
    comma-separated parts.
    """
    from deltagraphar.versioning.local_backend import LocalBackend
    from deltagraphar.store.graphstore import GraphStore
    from deltagraphar.format.reader import read_yaml
    from deltagraphar.format.paths import vertex_yaml_path, edge_yaml_path
    from deltagraphar.format.schema import GraphInfo, VertexInfo, EdgeInfo

    backend = LocalBackend(args.repo)
    # An empty --ref string is treated the same as "not given".
    ref = args.ref or None

    etype = tuple(args.etype.split(","))
    if len(etype) != 3:
        print("error: --etype must be 'src,edge,dst'", file=sys.stderr)
        sys.exit(1)

    src_label, edge_label, dst_label = etype
    vertex_doc = read_yaml(backend, vertex_yaml_path(args.label), ref=ref)
    edge_doc = read_yaml(
        backend, edge_yaml_path(src_label, edge_label, dst_label), ref=ref
    )

    vertex_info = VertexInfo(
        label=vertex_doc["label"],
        chunk_size=vertex_doc["chunk_size"],
    )
    edge_fields = ("src_type", "edge_type", "dst_type", "chunk_size", "src_chunk_size")
    edge_info = EdgeInfo(**{field: edge_doc[field] for field in edge_fields})

    graph_info = GraphInfo(
        name="graph",
        prefix="",
        vertex_infos=[vertex_info],
        edge_infos=[edge_info],
    )
    store = GraphStore(backend, graph_info, vertex_chunk_size=vertex_doc["chunk_size"])
    neighbors = store.out_neighbors(args.label, args.vertex, etype, ref=ref)
    print(json.dumps(neighbors))
69
+
70
+
71
def cmd_compact(args):
    """Run ``GraphStore.compact`` for one edge type and print the new ref.

    The source-vertex and edge schemas are read from their YAML files and
    used to assemble a minimal ``GraphInfo``.  ``args.vchunks``, when given,
    is parsed as a comma-separated list of integers; otherwise ``None`` is
    passed to ``compact``.  Exits with status 1 when ``--etype`` does not
    have exactly three comma-separated parts.
    """
    from deltagraphar.versioning.local_backend import LocalBackend
    from deltagraphar.store.graphstore import GraphStore
    from deltagraphar.format.reader import read_yaml
    from deltagraphar.format.paths import vertex_yaml_path, edge_yaml_path
    from deltagraphar.format.schema import GraphInfo, VertexInfo, EdgeInfo

    backend = LocalBackend(args.repo)
    etype = tuple(args.etype.split(","))
    if len(etype) != 3:
        print("error: --etype must be 'src,edge,dst'", file=sys.stderr)
        sys.exit(1)

    src_label, edge_label, dst_label = etype
    vertex_doc = read_yaml(backend, vertex_yaml_path(src_label))
    edge_doc = read_yaml(backend, edge_yaml_path(src_label, edge_label, dst_label))

    vertex_info = VertexInfo(
        label=vertex_doc["label"],
        chunk_size=vertex_doc["chunk_size"],
    )
    edge_fields = ("src_type", "edge_type", "dst_type", "chunk_size", "src_chunk_size")
    edge_info = EdgeInfo(**{field: edge_doc[field] for field in edge_fields})

    graph_info = GraphInfo(
        name="graph",
        prefix="",
        vertex_infos=[vertex_info],
        edge_infos=[edge_info],
    )
    store = GraphStore(backend, graph_info, vertex_chunk_size=vertex_doc["chunk_size"])

    if args.vchunks:
        vchunks = list(map(int, args.vchunks.split(",")))
    else:
        vchunks = None
    new_ref = store.compact(etype, vchunks=vchunks)
    print(f"compacted → {new_ref[:8]}")
102
+
103
+
104
def main(argv=None):
    """Entry point for the ``deltagraphar`` command-line tool.

    Builds the argument parser with the ``log``, ``tag``, ``neighbors`` and
    ``compact`` sub-commands, parses the arguments and dispatches to the
    handler registered for the chosen sub-command.

    Args:
        argv: Optional list of argument strings to parse.  ``None`` (the
            default) makes argparse read ``sys.argv[1:]``, which is the
            original behaviour; passing a list allows the CLI to be invoked
            programmatically.
    """
    parser = argparse.ArgumentParser(
        prog="deltagraphar",
        description="DeltaGraphAr — versioned property-graph store",
    )
    # required=True makes argparse reject an invocation without a sub-command.
    sub = parser.add_subparsers(dest="command", required=True)

    p_log = sub.add_parser("log", help="Show commit history")
    p_log.add_argument("--repo", required=True, help="Path to local repo")
    p_log.set_defaults(func=cmd_log)

    p_tag = sub.add_parser("tag", help="Tag the latest commit")
    p_tag.add_argument("--repo", required=True)
    p_tag.add_argument("name", help="Tag name")
    p_tag.set_defaults(func=cmd_tag)

    p_nbr = sub.add_parser("neighbors", help="List out-neighbors of a vertex at a ref")
    p_nbr.add_argument("--repo", required=True)
    p_nbr.add_argument("--label", required=True, help="Vertex label")
    p_nbr.add_argument("--vertex", required=True, help="Logical vertex ID")
    p_nbr.add_argument("--etype", required=True, help="Edge type as 'src,edge,dst'")
    p_nbr.add_argument("--ref", default=None, help="Commit ref or tag (default: HEAD)")
    p_nbr.set_defaults(func=cmd_neighbors)

    p_compact = sub.add_parser("compact", help="Compact delta into ordered CSR")
    p_compact.add_argument("--repo", required=True)
    p_compact.add_argument("--etype", required=True, help="Edge type as 'src,edge,dst'")
    p_compact.add_argument("--vchunks", default=None,
                           help="Comma-separated vchunk indices (default: auto-discover)")
    p_compact.set_defaults(func=cmd_compact)

    # Each sub-parser stored its handler under ``func``; call the selected one.
    args = parser.parse_args(argv)
    args.func(args)


if __name__ == "__main__":
    main()
File without changes
@@ -0,0 +1,35 @@
1
def vertex_chunk_path(label: str, pg_prefix: str, chunk_idx: int) -> str:
    """Return the path of one vertex property chunk for a label and property group."""
    template = "vertex/{}/{}/chunk{}"
    return template.format(label, pg_prefix, chunk_idx)
3
+
4
+
5
def vid_map_chunk_path(label: str, chunk_idx: int) -> str:
    """Return the path of one ID-map chunk (``__vid_map__``) for a vertex label."""
    template = "vertex/{}/__vid_map__/chunk{}"
    return template.format(label, chunk_idx)
7
+
8
+
9
def adj_list_chunk_path(
    src: str, etype: str, dst: str, adj_type: str, vchunk: int, chunk_idx: int
) -> str:
    """Return the path of one adjacency-list chunk.

    The path is built from the edge triple directory, the adjacency type,
    the ``part<vchunk>`` partition and the ``chunk<chunk_idx>`` file name.
    """
    edge_dir = f"edge/{src}_{etype}_{dst}"
    partition = f"part{vchunk}"
    chunk_name = f"chunk{chunk_idx}"
    return f"{edge_dir}/{adj_type}/adj_list/{partition}/{chunk_name}"
13
+
14
+
15
def offset_chunk_path(src: str, etype: str, dst: str, vchunk: int) -> str:
    """Return the path of the offset chunk for one vchunk.

    The file is always ``chunk0`` under ``ordered_by_source/offset/part<vchunk>``.
    """
    edge_dir = f"edge/{src}_{etype}_{dst}"
    return f"{edge_dir}/ordered_by_source/offset/part{vchunk}/chunk0"
17
+
18
+
19
def edge_prop_chunk_path(
    src: str, etype: str, dst: str,
    adj_type: str, pg_prefix: str, vchunk: int, chunk_idx: int,
) -> str:
    """Return the path of one edge property chunk.

    Same layout as the adjacency-list path, with the property-group prefix
    in place of the ``adj_list`` directory.
    """
    edge_dir = f"edge/{src}_{etype}_{dst}"
    partition = f"part{vchunk}"
    chunk_name = f"chunk{chunk_idx}"
    return f"{edge_dir}/{adj_type}/{pg_prefix}/{partition}/{chunk_name}"
24
+
25
+
26
def graph_yaml_path(name: str) -> str:
    """Return the file name of the graph manifest: ``<name>.graph.yml``."""
    return "{}.graph.yml".format(name)
28
+
29
+
30
def vertex_yaml_path(label: str) -> str:
    """Return the file name of a vertex schema: ``<label>.vertex.yml``."""
    return "{}.vertex.yml".format(label)
32
+
33
+
34
def edge_yaml_path(src: str, etype: str, dst: str) -> str:
    """Return the file name of an edge schema: ``<src>_<etype>_<dst>.edge.yml``."""
    triple = f"{src}_{etype}_{dst}"
    return f"{triple}.edge.yml"
@@ -0,0 +1,58 @@
1
+ import io
2
+ import yaml
3
+ import pyarrow as pa
4
+ import pyarrow.parquet as pq
5
+ import pyarrow.compute as pc
6
+
7
+ from deltagraphar.format.paths import offset_chunk_path
8
+
9
+
10
def read_table(backend, path: str, ref=None) -> pa.Table:
    """Load the Parquet file at ``path`` from ``backend`` as an Arrow table.

    ``ref`` is forwarded unchanged to ``backend.read_file``.
    """
    raw = backend.read_file(path, ref=ref)
    # Wrap the bytes in a file-like object so pyarrow can read them.
    buffer = io.BytesIO(raw)
    return pq.read_table(buffer)
13
+
14
+
15
def read_yaml(backend, path: str, ref=None) -> dict:
    """Read the file at ``path`` from ``backend`` and parse it with ``yaml.safe_load``.

    ``ref`` is forwarded unchanged to ``backend.read_file``.
    """
    raw = backend.read_file(path, ref=ref)
    return yaml.safe_load(raw)
17
+
18
+
19
def read_adj_list(
    backend, src: str, etype: str, dst: str, adj_type: str, vchunk: int, ref=None
) -> pa.Table:
    """Concatenate all chunk files for one (adj_type, vchunk) partition.

    Chunk files are concatenated in ascending *numeric* chunk-index order
    (``chunk2`` before ``chunk10``).  A plain lexicographic sort would place
    ``chunk10`` before ``chunk2`` and scramble the row order as soon as a
    partition holds more than ten chunks.

    Args:
        backend: Object providing ``list(prefix, ref=...)`` and ``read_file``.
        src, etype, dst: The edge triple identifying the edge directory.
        adj_type: Adjacency sub-directory, e.g. ``"ordered_by_source"``.
        vchunk: Partition index (``part<vchunk>``).
        ref: Optional ref forwarded to the backend.

    Returns:
        The concatenated table, or an empty table with int64 columns
        ``src_physical`` and ``dst_physical`` when no chunk files are listed.
    """

    def _chunk_order(path: str):
        # Sort key: numeric index for ``.../chunk<N>`` names; anything else
        # sorts after all numbered chunks, lexicographically.
        name = path.rsplit("/", 1)[-1]
        digits = name[len("chunk"):] if name.startswith("chunk") else ""
        if digits.isascii() and digits.isdigit():
            return (0, int(digits), path)
        return (1, 0, path)

    prefix = f"edge/{src}_{etype}_{dst}/{adj_type}/adj_list/part{vchunk}"
    # NOTE(review): if backend.list matches by raw string prefix, "part1"
    # would also match "part10", "part11", ... — confirm backend semantics.
    paths = sorted(backend.list(prefix, ref=ref), key=_chunk_order)
    if not paths:
        return pa.table({
            "src_physical": pa.array([], type=pa.int64()),
            "dst_physical": pa.array([], type=pa.int64()),
        })
    return pa.concat_tables([read_table(backend, p, ref=ref) for p in paths])
31
+
32
+
33
def read_offsets(backend, src: str, etype: str, dst: str, vchunk: int, ref=None) -> list[int]:
    """Return the ``offset`` column of a vchunk's offset chunk as a Python list.

    An empty list is returned when reading the chunk raises
    ``FileNotFoundError`` (i.e. the offset file does not exist at ``ref``).
    """
    offsets_path = offset_chunk_path(src, etype, dst, vchunk)
    try:
        table = read_table(backend, offsets_path, ref=ref)
        values = table["offset"].to_pylist()
    except FileNotFoundError:
        values = []
    return values
41
+
42
+
43
def scan_delta(
    backend, src: str, etype: str, dst: str, vchunk: int, src_physical: int, ref=None
) -> list[int]:
    """List the ``dst_physical`` values of delta rows whose source matches.

    Reads the ``unordered_by_source`` adjacency list of the given vchunk and
    keeps the rows whose ``src_physical`` equals the requested value.
    Returns ``[]`` when the delta table has no rows.
    """
    delta_tbl = read_adj_list(
        backend, src, etype, dst, "unordered_by_source", vchunk, ref=ref
    )
    if not len(delta_tbl):
        return []
    is_match = pc.equal(delta_tbl["src_physical"], src_physical)
    matching_rows = delta_tbl.filter(is_match)
    return matching_rows["dst_physical"].to_pylist()
52
+
53
+
54
def count_rows(
    backend, src: str, etype: str, dst: str, adj_type: str, vchunk: int, ref=None
) -> int:
    """Return the number of rows in one (adj_type, vchunk) adjacency partition."""
    return len(
        read_adj_list(backend, src, etype, dst, adj_type, vchunk, ref=ref)
    )