deltagraphar 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltagraphar-0.1.0/LICENSE +21 -0
- deltagraphar-0.1.0/PKG-INFO +204 -0
- deltagraphar-0.1.0/README.md +172 -0
- deltagraphar-0.1.0/deltagraphar/__init__.py +0 -0
- deltagraphar-0.1.0/deltagraphar/cli.py +140 -0
- deltagraphar-0.1.0/deltagraphar/format/__init__.py +0 -0
- deltagraphar-0.1.0/deltagraphar/format/paths.py +35 -0
- deltagraphar-0.1.0/deltagraphar/format/reader.py +58 -0
- deltagraphar-0.1.0/deltagraphar/format/schema.py +136 -0
- deltagraphar-0.1.0/deltagraphar/format/writer.py +14 -0
- deltagraphar-0.1.0/deltagraphar/store/__init__.py +0 -0
- deltagraphar-0.1.0/deltagraphar/store/compaction.py +119 -0
- deltagraphar-0.1.0/deltagraphar/store/graphstore.py +270 -0
- deltagraphar-0.1.0/deltagraphar/store/ids.py +99 -0
- deltagraphar-0.1.0/deltagraphar/versioning/__init__.py +0 -0
- deltagraphar-0.1.0/deltagraphar/versioning/backend.py +40 -0
- deltagraphar-0.1.0/deltagraphar/versioning/lakefs_backend.py +91 -0
- deltagraphar-0.1.0/deltagraphar/versioning/local_backend.py +86 -0
- deltagraphar-0.1.0/deltagraphar.egg-info/PKG-INFO +204 -0
- deltagraphar-0.1.0/deltagraphar.egg-info/SOURCES.txt +32 -0
- deltagraphar-0.1.0/deltagraphar.egg-info/dependency_links.txt +1 -0
- deltagraphar-0.1.0/deltagraphar.egg-info/entry_points.txt +2 -0
- deltagraphar-0.1.0/deltagraphar.egg-info/requires.txt +9 -0
- deltagraphar-0.1.0/deltagraphar.egg-info/top_level.txt +1 -0
- deltagraphar-0.1.0/pyproject.toml +50 -0
- deltagraphar-0.1.0/setup.cfg +4 -0
- deltagraphar-0.1.0/tests/test_append_and_read.py +105 -0
- deltagraphar-0.1.0/tests/test_cli.py +68 -0
- deltagraphar-0.1.0/tests/test_compaction.py +157 -0
- deltagraphar-0.1.0/tests/test_correctness_property.py +116 -0
- deltagraphar-0.1.0/tests/test_format_roundtrip.py +163 -0
- deltagraphar-0.1.0/tests/test_schema_evolution.py +79 -0
- deltagraphar-0.1.0/tests/test_time_travel.py +112 -0
- deltagraphar-0.1.0/tests/test_versioning.py +141 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Nishank Mahore
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: deltagraphar
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A mutable, versioned property-graph store built on the GraphAr physical layout with ACID semantics delegated to LakeFS.
|
|
5
|
+
Author-email: Nishank Mahore <nishankmahore@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/nishankmahore/DeltaGraphAr
|
|
8
|
+
Project-URL: Repository, https://github.com/nishankmahore/DeltaGraphAr
|
|
9
|
+
Project-URL: Bug Tracker, https://github.com/nishankmahore/DeltaGraphAr/issues
|
|
10
|
+
Keywords: graph,graphar,lakefs,parquet,versioning,property-graph
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Topic :: Database
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering
|
|
20
|
+
Requires-Python: >=3.10
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
License-File: LICENSE
|
|
23
|
+
Requires-Dist: pyarrow>=14.0
|
|
24
|
+
Requires-Dist: pyyaml>=6.0
|
|
25
|
+
Requires-Dist: lakefs>=0.7
|
|
26
|
+
Requires-Dist: lakefs-spec>=0.12
|
|
27
|
+
Provides-Extra: dev
|
|
28
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
29
|
+
Requires-Dist: hypothesis>=6.100; extra == "dev"
|
|
30
|
+
Requires-Dist: pandas>=2.0; extra == "dev"
|
|
31
|
+
Dynamic: license-file
|
|
32
|
+
|
|
33
|
+
# DeltaGraphAr
|
|
34
|
+
|
|
35
|
+
A mutable, versioned property-graph store built on the [GraphAr](https://graphar.apache.org) physical layout (chunked Parquet + YAML metadata) with ACID semantics delegated to [LakeFS](https://lakefs.io).
|
|
36
|
+
|
|
37
|
+
Pure-Python reference implementation. Suitable for graph datasets that evolve over time and need repeatable reads at arbitrary historical snapshots.
|
|
38
|
+
|
|
39
|
+
## What it does
|
|
40
|
+
|
|
41
|
+
- Stores vertices and edges as chunked Parquet files following the GraphAr layout spec.
|
|
42
|
+
- Appends edges to an unordered "delta" region; CSR-ordered adjacency is built on demand via `compact()`.
|
|
43
|
+
- Every mutating operation produces a versioned commit. Any commit ref can be used as a `ref=` argument to read historical state.
|
|
44
|
+
- Vertices are identified by arbitrary string logical IDs; the ID map translates to contiguous physical chunk-aligned integers for storage.
|
|
45
|
+
- LakeFS backend delegates branching, tagging, and atomic commits to a running LakeFS instance. The local backend (copy-on-commit) requires no external dependencies.
|
|
46
|
+
|
|
47
|
+
## Install
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
pip install deltagraphar
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
Requires Python ≥ 3.10.
|
|
54
|
+
|
|
55
|
+
For development (includes pytest, hypothesis, pandas):
|
|
56
|
+
|
|
57
|
+
```bash
|
|
58
|
+
git clone https://github.com/nishankmahore/DeltaGraphAr.git
|
|
59
|
+
cd DeltaGraphAr
|
|
60
|
+
pip install -e ".[dev]"
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
## Quickstart
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
python examples/quickstart.py
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
Or with LakeFS (requires `docker compose up` first):
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
docker compose up -d
|
|
73
|
+
python examples/ldbc_snb_tiny_loader.py
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
## API
|
|
77
|
+
|
|
78
|
+
```python
|
|
79
|
+
from deltagraphar.versioning.local_backend import LocalBackend
|
|
80
|
+
from deltagraphar.store.graphstore import GraphStore
|
|
81
|
+
from deltagraphar.format.schema import GraphInfo, VertexInfo, EdgeInfo
|
|
82
|
+
|
|
83
|
+
b = LocalBackend("/path/to/repo")
|
|
84
|
+
vi = VertexInfo(label="person", chunk_size=65_536)
|
|
85
|
+
ei = EdgeInfo("person", "knows", "person", chunk_size=1_048_576, src_chunk_size=65_536)
|
|
86
|
+
gi = GraphInfo(name="social", prefix="", vertex_infos=[vi], edge_infos=[ei])
|
|
87
|
+
|
|
88
|
+
gs = GraphStore.create(b, gi)
|
|
89
|
+
gs.add_vertices("person", [{"id": "alice"}, {"id": "bob"}])
|
|
90
|
+
gs.add_edges(("person", "knows", "person"), [{"src": "alice", "dst": "bob"}])
|
|
91
|
+
gs.compact(("person", "knows", "person"))
|
|
92
|
+
|
|
93
|
+
neighbors = gs.out_neighbors("person", "alice", ("person", "knows", "person"))
|
|
94
|
+
# → ["bob"]
|
|
95
|
+
|
|
96
|
+
# Time travel
|
|
97
|
+
ref = gs.snapshots()[1].ref
|
|
98
|
+
old_neighbors = gs.out_neighbors("person", "alice", ("person", "knows", "person"), ref=ref)
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
## CLI
|
|
102
|
+
|
|
103
|
+
```bash
|
|
104
|
+
deltagraphar log --repo /path/to/repo
|
|
105
|
+
deltagraphar neighbors --repo /path/to/repo --label person --vertex alice --etype person,knows,person
|
|
106
|
+
deltagraphar compact --repo /path/to/repo --etype person,knows,person
|
|
107
|
+
deltagraphar tag --repo /path/to/repo v1
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
## Schema evolution
|
|
111
|
+
|
|
112
|
+
Add a new property group to existing vertices without rewriting existing data:
|
|
113
|
+
|
|
114
|
+
```python
|
|
115
|
+
from deltagraphar.format.schema import PropertyGroup, Property
|
|
116
|
+
|
|
117
|
+
pg = PropertyGroup([Property("score", "float64")], prefix="person_score")
|
|
118
|
+
gs.add_property_group("vertex:person", pg, {"alice": 0.9, "bob": 0.7})
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
## Tests
|
|
122
|
+
|
|
123
|
+
```bash
|
|
124
|
+
pytest
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
51 tests, 2 skipped (LakeFS integration — requires `docker compose up`).
|
|
128
|
+
|
|
129
|
+
## Benchmarks
|
|
130
|
+
|
|
131
|
+
```bash
|
|
132
|
+
python benchmarks/bench_v1.py --rows 10000 --queries 1000
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
## Architecture
|
|
136
|
+
|
|
137
|
+
```
|
|
138
|
+
GraphStore
|
|
139
|
+
├── IDMap — logical ↔ physical vertex ID, chunk-aligned Parquet
|
|
140
|
+
├── compaction.py — delta→CSR merge, offset sweep, property reorder
|
|
141
|
+
└── VersioningBackend (ABC)
|
|
142
|
+
├── LocalBackend — copy-on-commit snapshots, no external deps
|
|
143
|
+
└── LakeFSBackend — atomic commits, branching, tagging via LakeFS API
|
|
144
|
+
|
|
145
|
+
Physical layout (GraphAr spec)
|
|
146
|
+
vertex/<label>/<pg_prefix>/chunk<k> — vertex property tables
|
|
147
|
+
vertex/<label>/__vid_map__/chunk<k> — ID map
|
|
148
|
+
edge/<src>_<et>_<dst>/ordered_by_source/ — CSR adj list + offsets
|
|
149
|
+
edge/<src>_<et>_<dst>/unordered_by_source/ — delta (append-only per vchunk)
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
## Data storage layout
|
|
153
|
+
|
|
154
|
+
Data is stored as chunked Parquet files under a local repo directory. Using the movie graph as an example (`repo_dir = "/tmp/movies_repo"`):
|
|
155
|
+
|
|
156
|
+
```
|
|
157
|
+
/tmp/movies_repo/
|
|
158
|
+
├── work/ ← current HEAD (mutable working copy)
|
|
159
|
+
│ ├── movies.graph.yml ← graph manifest
|
|
160
|
+
│ ├── Person.vertex.yml ← vertex schema
|
|
161
|
+
│ ├── Movie.vertex.yml
|
|
162
|
+
│ ├── vertex/
|
|
163
|
+
│ │ ├── Person/
|
|
164
|
+
│ │ │ ├── person_name/
|
|
165
|
+
│ │ │ │ └── chunk0 ← name column (Parquet)
|
|
166
|
+
│ │ │ └── __vid_map__/
|
|
167
|
+
│ │ │ └── chunk0 ← logical↔physical ID map
|
|
168
|
+
│ │ └── Movie/
|
|
169
|
+
│ │ └── movie_props/
|
|
170
|
+
│ │ └── chunk0 ← title, released columns (Parquet)
|
|
171
|
+
│ └── edge/
|
|
172
|
+
│ └── Person_ACTED_IN_Movie/
|
|
173
|
+
│ ├── Person_ACTED_IN_Movie.edge.yml ← edge schema
|
|
174
|
+
│ ├── ordered_by_source/ ← CSR (written after compact)
|
|
175
|
+
│ │ ├── adj_list/
|
|
176
|
+
│ │ │ └── part0/chunk0 ← sorted src/dst pairs (Parquet)
|
|
177
|
+
│ │ └── offset/
|
|
178
|
+
│ │ └── part0/chunk0 ← CSR offset array (Parquet)
|
|
179
|
+
│ └── unordered_by_source/ ← delta (append-only, pre-compact)
|
|
180
|
+
│ └── adj_list/
|
|
181
|
+
│ └── part0/chunk0 ← unsorted src/dst pairs (Parquet)
|
|
182
|
+
└── snapshots/
|
|
183
|
+
├── <sha1ref>/ ← immutable copy-on-commit snapshot
|
|
184
|
+
├── <sha1ref>/
|
|
185
|
+
└── ... ← one directory per commit
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
To persist data across runs, pass a fixed directory path to `LocalBackend` instead of a `tempfile.TemporaryDirectory()`:
|
|
189
|
+
|
|
190
|
+
```python
|
|
191
|
+
repo_dir = "/tmp/movies_repo"
|
|
192
|
+
b = LocalBackend(repo_dir)
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
To inspect any chunk file directly:
|
|
196
|
+
|
|
197
|
+
```python
|
|
198
|
+
import pyarrow.parquet as pq
|
|
199
|
+
pq.read_table("/tmp/movies_repo/work/vertex/Person/person_name/chunk0").to_pandas()
|
|
200
|
+
```
|
|
201
|
+
|
|
202
|
+
## License
|
|
203
|
+
|
|
204
|
+
MIT — see [LICENSE](LICENSE)
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
# DeltaGraphAr
|
|
2
|
+
|
|
3
|
+
A mutable, versioned property-graph store built on the [GraphAr](https://graphar.apache.org) physical layout (chunked Parquet + YAML metadata) with ACID semantics delegated to [LakeFS](https://lakefs.io).
|
|
4
|
+
|
|
5
|
+
Pure-Python reference implementation. Suitable for graph datasets that evolve over time and need repeatable reads at arbitrary historical snapshots.
|
|
6
|
+
|
|
7
|
+
## What it does
|
|
8
|
+
|
|
9
|
+
- Stores vertices and edges as chunked Parquet files following the GraphAr layout spec.
|
|
10
|
+
- Appends edges to an unordered "delta" region; CSR-ordered adjacency is built on demand via `compact()`.
|
|
11
|
+
- Every mutating operation produces a versioned commit. Any commit ref can be used as a `ref=` argument to read historical state.
|
|
12
|
+
- Vertices are identified by arbitrary string logical IDs; the ID map translates to contiguous physical chunk-aligned integers for storage.
|
|
13
|
+
- LakeFS backend delegates branching, tagging, and atomic commits to a running LakeFS instance. The local backend (copy-on-commit) requires no external dependencies.
|
|
14
|
+
|
|
15
|
+
## Install
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
pip install deltagraphar
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
Requires Python ≥ 3.10.
|
|
22
|
+
|
|
23
|
+
For development (includes pytest, hypothesis, pandas):
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
git clone https://github.com/nishankmahore/DeltaGraphAr.git
|
|
27
|
+
cd DeltaGraphAr
|
|
28
|
+
pip install -e ".[dev]"
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
## Quickstart
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
python examples/quickstart.py
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
Or with LakeFS (requires `docker compose up` first):
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
docker compose up -d
|
|
41
|
+
python examples/ldbc_snb_tiny_loader.py
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## API
|
|
45
|
+
|
|
46
|
+
```python
|
|
47
|
+
from deltagraphar.versioning.local_backend import LocalBackend
|
|
48
|
+
from deltagraphar.store.graphstore import GraphStore
|
|
49
|
+
from deltagraphar.format.schema import GraphInfo, VertexInfo, EdgeInfo
|
|
50
|
+
|
|
51
|
+
b = LocalBackend("/path/to/repo")
|
|
52
|
+
vi = VertexInfo(label="person", chunk_size=65_536)
|
|
53
|
+
ei = EdgeInfo("person", "knows", "person", chunk_size=1_048_576, src_chunk_size=65_536)
|
|
54
|
+
gi = GraphInfo(name="social", prefix="", vertex_infos=[vi], edge_infos=[ei])
|
|
55
|
+
|
|
56
|
+
gs = GraphStore.create(b, gi)
|
|
57
|
+
gs.add_vertices("person", [{"id": "alice"}, {"id": "bob"}])
|
|
58
|
+
gs.add_edges(("person", "knows", "person"), [{"src": "alice", "dst": "bob"}])
|
|
59
|
+
gs.compact(("person", "knows", "person"))
|
|
60
|
+
|
|
61
|
+
neighbors = gs.out_neighbors("person", "alice", ("person", "knows", "person"))
|
|
62
|
+
# → ["bob"]
|
|
63
|
+
|
|
64
|
+
# Time travel
|
|
65
|
+
ref = gs.snapshots()[1].ref
|
|
66
|
+
old_neighbors = gs.out_neighbors("person", "alice", ("person", "knows", "person"), ref=ref)
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## CLI
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
deltagraphar log --repo /path/to/repo
|
|
73
|
+
deltagraphar neighbors --repo /path/to/repo --label person --vertex alice --etype person,knows,person
|
|
74
|
+
deltagraphar compact --repo /path/to/repo --etype person,knows,person
|
|
75
|
+
deltagraphar tag --repo /path/to/repo v1
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
## Schema evolution
|
|
79
|
+
|
|
80
|
+
Add a new property group to existing vertices without rewriting existing data:
|
|
81
|
+
|
|
82
|
+
```python
|
|
83
|
+
from deltagraphar.format.schema import PropertyGroup, Property
|
|
84
|
+
|
|
85
|
+
pg = PropertyGroup([Property("score", "float64")], prefix="person_score")
|
|
86
|
+
gs.add_property_group("vertex:person", pg, {"alice": 0.9, "bob": 0.7})
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
## Tests
|
|
90
|
+
|
|
91
|
+
```bash
|
|
92
|
+
pytest
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
51 tests, 2 skipped (LakeFS integration — requires `docker compose up`).
|
|
96
|
+
|
|
97
|
+
## Benchmarks
|
|
98
|
+
|
|
99
|
+
```bash
|
|
100
|
+
python benchmarks/bench_v1.py --rows 10000 --queries 1000
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
## Architecture
|
|
104
|
+
|
|
105
|
+
```
|
|
106
|
+
GraphStore
|
|
107
|
+
├── IDMap — logical ↔ physical vertex ID, chunk-aligned Parquet
|
|
108
|
+
├── compaction.py — delta→CSR merge, offset sweep, property reorder
|
|
109
|
+
└── VersioningBackend (ABC)
|
|
110
|
+
├── LocalBackend — copy-on-commit snapshots, no external deps
|
|
111
|
+
└── LakeFSBackend — atomic commits, branching, tagging via LakeFS API
|
|
112
|
+
|
|
113
|
+
Physical layout (GraphAr spec)
|
|
114
|
+
vertex/<label>/<pg_prefix>/chunk<k> — vertex property tables
|
|
115
|
+
vertex/<label>/__vid_map__/chunk<k> — ID map
|
|
116
|
+
edge/<src>_<et>_<dst>/ordered_by_source/ — CSR adj list + offsets
|
|
117
|
+
edge/<src>_<et>_<dst>/unordered_by_source/ — delta (append-only per vchunk)
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
## Data storage layout
|
|
121
|
+
|
|
122
|
+
Data is stored as chunked Parquet files under a local repo directory. Using the movie graph as an example (`repo_dir = "/tmp/movies_repo"`):
|
|
123
|
+
|
|
124
|
+
```
|
|
125
|
+
/tmp/movies_repo/
|
|
126
|
+
├── work/ ← current HEAD (mutable working copy)
|
|
127
|
+
│ ├── movies.graph.yml ← graph manifest
|
|
128
|
+
│ ├── Person.vertex.yml ← vertex schema
|
|
129
|
+
│ ├── Movie.vertex.yml
|
|
130
|
+
│ ├── vertex/
|
|
131
|
+
│ │ ├── Person/
|
|
132
|
+
│ │ │ ├── person_name/
|
|
133
|
+
│ │ │ │ └── chunk0 ← name column (Parquet)
|
|
134
|
+
│ │ │ └── __vid_map__/
|
|
135
|
+
│ │ │ └── chunk0 ← logical↔physical ID map
|
|
136
|
+
│ │ └── Movie/
|
|
137
|
+
│ │ └── movie_props/
|
|
138
|
+
│ │ └── chunk0 ← title, released columns (Parquet)
|
|
139
|
+
│ └── edge/
|
|
140
|
+
│ └── Person_ACTED_IN_Movie/
|
|
141
|
+
│ ├── Person_ACTED_IN_Movie.edge.yml ← edge schema
|
|
142
|
+
│ ├── ordered_by_source/ ← CSR (written after compact)
|
|
143
|
+
│ │ ├── adj_list/
|
|
144
|
+
│ │ │ └── part0/chunk0 ← sorted src/dst pairs (Parquet)
|
|
145
|
+
│ │ └── offset/
|
|
146
|
+
│ │ └── part0/chunk0 ← CSR offset array (Parquet)
|
|
147
|
+
│ └── unordered_by_source/ ← delta (append-only, pre-compact)
|
|
148
|
+
│ └── adj_list/
|
|
149
|
+
│ └── part0/chunk0 ← unsorted src/dst pairs (Parquet)
|
|
150
|
+
└── snapshots/
|
|
151
|
+
├── <sha1ref>/ ← immutable copy-on-commit snapshot
|
|
152
|
+
├── <sha1ref>/
|
|
153
|
+
└── ... ← one directory per commit
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
To persist data across runs, pass a fixed directory path to `LocalBackend` instead of a `tempfile.TemporaryDirectory()`:
|
|
157
|
+
|
|
158
|
+
```python
|
|
159
|
+
repo_dir = "/tmp/movies_repo"
|
|
160
|
+
b = LocalBackend(repo_dir)
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
To inspect any chunk file directly:
|
|
164
|
+
|
|
165
|
+
```python
|
|
166
|
+
import pyarrow.parquet as pq
|
|
167
|
+
pq.read_table("/tmp/movies_repo/work/vertex/Person/person_name/chunk0").to_pandas()
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
## License
|
|
171
|
+
|
|
172
|
+
MIT — see [LICENSE](LICENSE)
|
|
File without changes
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
"""Command-line interface for DeltaGraphAr."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import argparse
|
|
5
|
+
import json
|
|
6
|
+
import sys
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def _get_backend(args):
    """Return a ``LocalBackend`` for the repository named by ``args.repo``.

    ``args`` is the parsed argparse namespace; only its ``repo`` attribute is
    read. The backend module is imported at call time rather than at module
    import time.
    """
    from deltagraphar.versioning import local_backend

    return local_backend.LocalBackend(args.repo)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def cmd_log(args):
    """Print the commit history of the repo, one line per commit.

    Each line shows the first 8 characters of the commit ref, the commit
    timestamp rendered in UTC, the commit message and, when present, the
    commit metadata. Prints ``(no commits)`` when the backend's log is empty.

    Args:
        args: Parsed argparse namespace; forwarded to ``_get_backend``.
    """
    # Imported once per call instead of once per loop iteration.
    from datetime import datetime, timezone

    commits = _get_backend(args).log()
    if not commits:
        print("(no commits)")
        return

    for commit in commits:
        stamp = datetime.fromtimestamp(commit.timestamp, tz=timezone.utc)
        when = stamp.strftime("%Y-%m-%d %H:%M:%S UTC")
        meta = f" {commit.metadata}" if commit.metadata else ""
        print(f"{commit.ref[:8]} {when} {commit.message}{meta}")
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def cmd_tag(args):
    """Tag the last entry of the backend's log with ``args.name``.

    The CLI help describes the tagged commit as the latest one; this function
    uses ``log()[-1]``. When the log is empty an error is written to stderr
    and the process exits with status 1.

    Args:
        args: Parsed argparse namespace; ``args.name`` is the tag name and the
            namespace is forwarded to ``_get_backend``.
    """
    backend = _get_backend(args)
    history = backend.log()
    if history:
        target = history[-1].ref
        backend.tag(args.name, target)
        print(f"tagged {target[:8]} as {args.name!r}")
        return

    print("error: no commits to tag", file=sys.stderr)
    sys.exit(1)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def cmd_neighbors(args):
    """Print the out-neighbors of one vertex as a JSON document on stdout.

    The vertex and edge schemas are loaded from the repo's YAML files at the
    requested ref, a single-vertex/single-edge ``GraphInfo`` (named
    ``"graph"``) is assembled from them, and ``GraphStore.out_neighbors`` is
    queried.

    Args:
        args: Parsed argparse namespace. Reads ``repo`` (via
            ``_get_backend``), ``label``, ``vertex``, ``etype`` (a
            ``'src,edge,dst'`` string) and ``ref`` (falsy means no ref is
            passed to the readers).

    Exits with status 1, after printing an error to stderr, when ``--etype``
    does not consist of exactly three non-empty comma-separated parts.
    """
    from deltagraphar.store.graphstore import GraphStore
    from deltagraphar.format.reader import read_yaml
    from deltagraphar.format.paths import vertex_yaml_path, edge_yaml_path
    from deltagraphar.format.schema import GraphInfo, VertexInfo, EdgeInfo

    # Validate user input before constructing the backend so that a typo
    # fails fast with a clean message. Surrounding whitespace is tolerated;
    # empty components (e.g. "a,,b") are rejected here instead of surfacing
    # later as a confusing lookup failure.
    etype = tuple(part.strip() for part in args.etype.split(","))
    if len(etype) != 3 or not all(etype):
        print("error: --etype must be 'src,edge,dst'", file=sys.stderr)
        sys.exit(1)

    # Use the shared factory so every subcommand builds its backend the same way.
    b = _get_backend(args)
    ref = args.ref or None

    src_label, et, dst_label = etype
    vi_data = read_yaml(b, vertex_yaml_path(args.label), ref=ref)
    ei_data = read_yaml(b, edge_yaml_path(src_label, et, dst_label), ref=ref)

    vi = VertexInfo(label=vi_data["label"], chunk_size=vi_data["chunk_size"])
    ei = EdgeInfo(
        src_type=ei_data["src_type"],
        edge_type=ei_data["edge_type"],
        dst_type=ei_data["dst_type"],
        chunk_size=ei_data["chunk_size"],
        src_chunk_size=ei_data["src_chunk_size"],
    )
    gi = GraphInfo(name="graph", prefix="", vertex_infos=[vi], edge_infos=[ei])
    gs = GraphStore(b, gi, vertex_chunk_size=vi_data["chunk_size"])

    nbrs = gs.out_neighbors(args.label, args.vertex, etype, ref=ref)
    print(json.dumps(nbrs))
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def cmd_compact(args):
    """Run ``GraphStore.compact`` for one edge type and print the resulting ref.

    The source-vertex and edge schemas are loaded from the repo's YAML files
    (no ref is passed to the readers), a single-vertex/single-edge
    ``GraphInfo`` (named ``"graph"``) is assembled from them, and the store is
    asked to compact the given edge type. The first 8 characters of the
    returned ref are printed.

    Args:
        args: Parsed argparse namespace. Reads ``repo`` (via
            ``_get_backend``), ``etype`` (a ``'src,edge,dst'`` string) and
            ``vchunks`` (optional comma-separated integers; when falsy,
            ``None`` is passed to ``compact``).

    Exits with status 1, after printing an error to stderr, when ``--etype``
    does not consist of exactly three non-empty comma-separated parts or when
    ``--vchunks`` contains something that is not an integer.
    """
    from deltagraphar.store.graphstore import GraphStore
    from deltagraphar.format.reader import read_yaml
    from deltagraphar.format.paths import vertex_yaml_path, edge_yaml_path
    from deltagraphar.format.schema import GraphInfo, VertexInfo, EdgeInfo

    # Validate all user input before constructing the backend so that a typo
    # fails fast with a clean message instead of a traceback.
    etype = tuple(part.strip() for part in args.etype.split(","))
    if len(etype) != 3 or not all(etype):
        print("error: --etype must be 'src,edge,dst'", file=sys.stderr)
        sys.exit(1)

    vchunks = None
    if args.vchunks:
        try:
            vchunks = [int(v) for v in args.vchunks.split(",")]
        except ValueError:
            print("error: --vchunks must be comma-separated integers", file=sys.stderr)
            sys.exit(1)

    # Use the shared factory so every subcommand builds its backend the same way.
    b = _get_backend(args)

    src_label, et, dst_label = etype
    vi_data = read_yaml(b, vertex_yaml_path(src_label))
    ei_data = read_yaml(b, edge_yaml_path(src_label, et, dst_label))

    vi = VertexInfo(label=vi_data["label"], chunk_size=vi_data["chunk_size"])
    ei = EdgeInfo(
        src_type=ei_data["src_type"],
        edge_type=ei_data["edge_type"],
        dst_type=ei_data["dst_type"],
        chunk_size=ei_data["chunk_size"],
        src_chunk_size=ei_data["src_chunk_size"],
    )
    gi = GraphInfo(name="graph", prefix="", vertex_infos=[vi], edge_infos=[ei])
    gs = GraphStore(b, gi, vertex_chunk_size=vi_data["chunk_size"])

    ref = gs.compact(etype, vchunks=vchunks)
    print(f"compacted → {ref[:8]}")
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def main(argv=None):
    """Parse command-line arguments and dispatch to the selected subcommand.

    Subcommands: ``log``, ``tag``, ``neighbors`` and ``compact``. Each
    subparser stores its handler in ``args.func``, which is then called with
    the parsed namespace.

    Args:
        argv: Optional list of argument strings to parse. With the default
            ``None``, argparse falls back to ``sys.argv[1:]``, which is the
            behavior of the original zero-argument ``main()``. Passing a list
            lets callers and tests drive the CLI in-process.
    """
    parser = argparse.ArgumentParser(
        prog="deltagraphar",
        description="DeltaGraphAr — versioned property-graph store",
    )
    sub = parser.add_subparsers(dest="command", required=True)

    p_log = sub.add_parser("log", help="Show commit history")
    p_log.add_argument("--repo", required=True, help="Path to local repo")
    p_log.set_defaults(func=cmd_log)

    p_tag = sub.add_parser("tag", help="Tag the latest commit")
    p_tag.add_argument("--repo", required=True)
    p_tag.add_argument("name", help="Tag name")
    p_tag.set_defaults(func=cmd_tag)

    p_nbr = sub.add_parser("neighbors", help="List out-neighbors of a vertex at a ref")
    p_nbr.add_argument("--repo", required=True)
    p_nbr.add_argument("--label", required=True, help="Vertex label")
    p_nbr.add_argument("--vertex", required=True, help="Logical vertex ID")
    p_nbr.add_argument("--etype", required=True, help="Edge type as 'src,edge,dst'")
    p_nbr.add_argument("--ref", default=None, help="Commit ref or tag (default: HEAD)")
    p_nbr.set_defaults(func=cmd_neighbors)

    p_compact = sub.add_parser("compact", help="Compact delta into ordered CSR")
    p_compact.add_argument("--repo", required=True)
    p_compact.add_argument("--etype", required=True, help="Edge type as 'src,edge,dst'")
    p_compact.add_argument("--vchunks", default=None,
                           help="Comma-separated vchunk indices (default: auto-discover)")
    p_compact.set_defaults(func=cmd_compact)

    # argv=None makes argparse read sys.argv[1:], as before.
    args = parser.parse_args(argv)
    args.func(args)


if __name__ == "__main__":
    main()
|
|
File without changes
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
def vertex_chunk_path(label: str, pg_prefix: str, chunk_idx: int) -> str:
    """Return ``vertex/<label>/<pg_prefix>/chunk<chunk_idx>``."""
    return "vertex/{}/{}/chunk{}".format(label, pg_prefix, chunk_idx)
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def vid_map_chunk_path(label: str, chunk_idx: int) -> str:
    """Return ``vertex/<label>/__vid_map__/chunk<chunk_idx>``."""
    return "vertex/{}/__vid_map__/chunk{}".format(label, chunk_idx)
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def adj_list_chunk_path(
    src: str, etype: str, dst: str, adj_type: str, vchunk: int, chunk_idx: int
) -> str:
    """Return the path of one adjacency-list chunk.

    The result has the form
    ``edge/<src>_<etype>_<dst>/<adj_type>/adj_list/part<vchunk>/chunk<chunk_idx>``.
    """
    edge_dir = f"edge/{src}_{etype}_{dst}"
    partition = f"part{vchunk}"
    return f"{edge_dir}/{adj_type}/adj_list/{partition}/chunk{chunk_idx}"
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def offset_chunk_path(src: str, etype: str, dst: str, vchunk: int) -> str:
    """Return the path of the offset chunk for one vertex chunk.

    The result has the form
    ``edge/<src>_<etype>_<dst>/ordered_by_source/offset/part<vchunk>/chunk0``;
    the adjacency type and the chunk index are fixed.
    """
    edge_dir = f"edge/{src}_{etype}_{dst}"
    return f"{edge_dir}/ordered_by_source/offset/part{vchunk}/chunk0"
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def edge_prop_chunk_path(
    src: str, etype: str, dst: str,
    adj_type: str, pg_prefix: str, vchunk: int, chunk_idx: int,
) -> str:
    """Return the path of one edge property chunk.

    The result has the form
    ``edge/<src>_<etype>_<dst>/<adj_type>/<pg_prefix>/part<vchunk>/chunk<chunk_idx>``.
    """
    edge_dir = f"edge/{src}_{etype}_{dst}"
    group_dir = f"{edge_dir}/{adj_type}/{pg_prefix}"
    return f"{group_dir}/part{vchunk}/chunk{chunk_idx}"
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def graph_yaml_path(name: str) -> str:
    """Return ``<name>.graph.yml``."""
    return "{}.graph.yml".format(name)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def vertex_yaml_path(label: str) -> str:
    """Return ``<label>.vertex.yml``."""
    return "{}.vertex.yml".format(label)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def edge_yaml_path(src: str, etype: str, dst: str) -> str:
    """Return ``<src>_<etype>_<dst>.edge.yml``."""
    stem = "{}_{}_{}".format(src, etype, dst)
    return stem + ".edge.yml"
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
import io
|
|
2
|
+
import yaml
|
|
3
|
+
import pyarrow as pa
|
|
4
|
+
import pyarrow.parquet as pq
|
|
5
|
+
import pyarrow.compute as pc
|
|
6
|
+
|
|
7
|
+
from deltagraphar.format.paths import offset_chunk_path
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def read_table(backend, path: str, ref=None) -> pa.Table:
    """Read ``path`` through ``backend`` and parse the bytes as a Parquet table.

    ``ref`` is forwarded unchanged to ``backend.read_file``.
    """
    raw = backend.read_file(path, ref=ref)
    buffer = io.BytesIO(raw)
    return pq.read_table(buffer)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def read_yaml(backend, path: str, ref=None) -> dict:
    """Read ``path`` through ``backend`` and parse it with ``yaml.safe_load``.

    ``ref`` is forwarded unchanged to ``backend.read_file``.
    """
    raw = backend.read_file(path, ref=ref)
    return yaml.safe_load(raw)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _chunk_order_key(path):
    """Sort key that orders ``...chunk<N>`` paths by the integer ``N``.

    Paths that do not end in ``chunk<digits>`` sort after all numbered chunks,
    in lexicographic order.
    """
    text = str(path)
    tail = text.rpartition("chunk")[2]
    if tail.isascii() and tail.isdigit():
        return (0, int(tail), text)
    return (1, 0, text)


def read_adj_list(
    backend, src: str, etype: str, dst: str, adj_type: str, vchunk: int, ref=None
) -> pa.Table:
    """Concatenate all chunk files for one (adj_type, vchunk) partition.

    Chunks are concatenated in ascending numeric chunk-index order. Chunk
    file names are not zero-padded (``chunk0`` … ``chunk10``), so a plain
    lexicographic sort would place ``chunk10`` before ``chunk2`` and scramble
    the row order of any partition with more than ten chunks.

    Args:
        backend: Object providing ``list(prefix, ref=...)`` and the
            ``read_file`` used by ``read_table``.
        src, etype, dst: Components of the edge directory name.
        adj_type: Adjacency directory name, e.g. ``"ordered_by_source"``.
        vchunk: Vertex-chunk (partition) index.
        ref: Forwarded unchanged to the backend calls.

    Returns:
        The concatenated table, or an empty table with int64 columns
        ``src_physical`` and ``dst_physical`` when the backend lists nothing
        under the partition prefix.
    """
    # NOTE(review): if backend.list matches by raw string prefix, "part1"
    # would also match "part10/..." — confirm the backend treats the prefix
    # as a directory.
    prefix = f"edge/{src}_{etype}_{dst}/{adj_type}/adj_list/part{vchunk}"
    paths = sorted(backend.list(prefix, ref=ref), key=_chunk_order_key)
    if not paths:
        return pa.table({
            "src_physical": pa.array([], type=pa.int64()),
            "dst_physical": pa.array([], type=pa.int64()),
        })
    return pa.concat_tables([read_table(backend, p, ref=ref) for p in paths])
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def read_offsets(backend, src: str, etype: str, dst: str, vchunk: int, ref=None) -> list[int]:
    """Return the ``offset`` column of a vchunk's offset table as a list.

    The table is located with ``offset_chunk_path``. If reading it raises
    ``FileNotFoundError`` (the partition has not been compacted yet), an
    empty list is returned.
    """
    offsets_path = offset_chunk_path(src, etype, dst, vchunk)
    try:
        offsets_table = read_table(backend, offsets_path, ref=ref)
    except FileNotFoundError:
        return []
    return offsets_table["offset"].to_pylist()
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def scan_delta(
    backend, src: str, etype: str, dst: str, vchunk: int, src_physical: int, ref=None
) -> list[int]:
    """Return the ``dst_physical`` values of delta rows for one source vertex.

    Reads the ``unordered_by_source`` adjacency list of the given vchunk and
    keeps the rows whose ``src_physical`` equals the argument. Returns an
    empty list when the delta has no rows.
    """
    unordered = read_adj_list(
        backend, src, etype, dst, "unordered_by_source", vchunk, ref=ref
    )
    if unordered.num_rows == 0:
        return []
    is_match = pc.equal(unordered["src_physical"], src_physical)
    matching_rows = unordered.filter(is_match)
    return matching_rows["dst_physical"].to_pylist()
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def count_rows(
    backend, src: str, etype: str, dst: str, adj_type: str, vchunk: int, ref=None
) -> int:
    """Return the number of rows in one (adj_type, vchunk) adjacency partition.

    The count is taken from the table produced by ``read_adj_list``, so every
    chunk of the partition is read.
    """
    return len(read_adj_list(backend, src, etype, dst, adj_type, vchunk, ref=ref))
|