deltagraphar 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
File without changes
deltagraphar/cli.py ADDED
@@ -0,0 +1,140 @@
1
+ """Command-line interface for DeltaGraphAr."""
2
+ from __future__ import annotations
3
+
4
+ import argparse
5
+ import json
6
+ import sys
7
+
8
+
9
def _get_backend(args):
    """Open the local backend for the repository given on the command line.

    ``args`` is the parsed argparse namespace; only ``args.repo`` is read.
    The backend module is imported at call time rather than at module import.
    """
    from deltagraphar.versioning import local_backend

    repo_path = args.repo
    return local_backend.LocalBackend(repo_path)
12
+
13
+
14
def cmd_log(args):
    """Print the commit history of the repository, one commit per line.

    Each line contains the first 8 characters of the commit ref, the commit
    timestamp rendered in UTC, the commit message and, when it is truthy,
    the commit metadata. Prints ``(no commits)`` when the log is empty.
    """
    # Imported once per call, outside the loop that formats each commit.
    from datetime import datetime, timezone

    commits = _get_backend(args).log()
    if not commits:
        print("(no commits)")
        return

    for c in commits:
        stamp = datetime.fromtimestamp(c.timestamp, tz=timezone.utc)
        dt = stamp.strftime("%Y-%m-%d %H:%M:%S UTC")
        meta = f" {c.metadata}" if c.metadata else ""
        print(f"{c.ref[:8]} {dt} {c.message}{meta}")
25
+
26
+
27
def cmd_tag(args):
    """Tag the last entry of the backend's log with ``args.name``.

    Exits with status 1 and an error on stderr when the log is empty.
    """
    backend = _get_backend(args)
    history = backend.log()
    if history:
        # NOTE(review): assumes the last log entry is the latest commit — confirm
        # against the ordering returned by the backend's log().
        target = history[-1].ref
        backend.tag(args.name, target)
        print(f"tagged {target[:8]} as {args.name!r}")
        return

    print("error: no commits to tag", file=sys.stderr)
    sys.exit(1)
36
+
37
+
38
def cmd_neighbors(args):
    """Print the out-neighbors of one vertex as a JSON document on stdout.

    Reads the vertex and edge YAML metadata at ``args.ref`` (``None`` when no
    ref was given), builds a single-vertex/single-edge ``GraphInfo`` around
    them and asks a ``GraphStore`` for the out-neighbors of ``args.vertex``.
    Exits with status 1 when ``--etype`` does not split into three parts.
    """
    from deltagraphar.versioning.local_backend import LocalBackend
    from deltagraphar.store.graphstore import GraphStore
    from deltagraphar.format.reader import read_yaml
    from deltagraphar.format.paths import vertex_yaml_path, edge_yaml_path
    from deltagraphar.format.schema import GraphInfo, VertexInfo, EdgeInfo

    backend = LocalBackend(args.repo)
    # An empty --ref string is treated the same as no ref at all.
    ref = args.ref if args.ref else None

    triple = tuple(args.etype.split(","))
    if len(triple) != 3:
        print("error: --etype must be 'src,edge,dst'", file=sys.stderr)
        sys.exit(1)
    src_label, edge_label, dst_label = triple

    vertex_meta = read_yaml(backend, vertex_yaml_path(args.label), ref=ref)
    edge_meta = read_yaml(
        backend, edge_yaml_path(src_label, edge_label, dst_label), ref=ref
    )

    vertex_info = VertexInfo(
        label=vertex_meta["label"],
        chunk_size=vertex_meta["chunk_size"],
    )
    edge_info = EdgeInfo(
        src_type=edge_meta["src_type"],
        edge_type=edge_meta["edge_type"],
        dst_type=edge_meta["dst_type"],
        chunk_size=edge_meta["chunk_size"],
        src_chunk_size=edge_meta["src_chunk_size"],
    )
    graph_info = GraphInfo(
        name="graph",
        prefix="",
        vertex_infos=[vertex_info],
        edge_infos=[edge_info],
    )

    store = GraphStore(
        backend, graph_info, vertex_chunk_size=vertex_meta["chunk_size"]
    )
    neighbors = store.out_neighbors(args.label, args.vertex, triple, ref=ref)
    print(json.dumps(neighbors))
69
+
70
+
71
def cmd_compact(args):
    """Compact the delta of one edge type and print the resulting ref.

    ``args.etype`` must be ``'src,edge,dst'``; ``args.vchunks`` is an optional
    comma-separated list of integer vchunk indices (``None`` is passed to
    ``GraphStore.compact`` when it is not given). Invalid values for either
    option print an error on stderr and exit with status 1.

    The vertex and edge descriptions are rebuilt with the schema classes'
    ``from_dict`` constructors so that optional fields recorded in the YAML
    (property groups, ``directed``, ``adj_lists``, ``version``) are kept
    instead of being replaced by defaults.
    NOTE(review): presumably ``GraphStore.compact`` reads the edge property
    groups from this ``EdgeInfo`` — confirm against the store implementation.
    """
    from deltagraphar.versioning.local_backend import LocalBackend
    from deltagraphar.store.graphstore import GraphStore
    from deltagraphar.format.reader import read_yaml
    from deltagraphar.format.paths import vertex_yaml_path, edge_yaml_path
    from deltagraphar.format.schema import GraphInfo, VertexInfo, EdgeInfo

    b = LocalBackend(args.repo)
    etype = tuple(args.etype.split(","))
    if len(etype) != 3:
        print("error: --etype must be 'src,edge,dst'", file=sys.stderr)
        sys.exit(1)

    # Validate --vchunks up front so bad input fails with a clear message
    # rather than a ValueError traceback after the metadata has been read.
    vchunks = None
    if args.vchunks:
        try:
            vchunks = [int(v) for v in args.vchunks.split(",")]
        except ValueError:
            print("error: --vchunks must be comma-separated integers", file=sys.stderr)
            sys.exit(1)

    src_label, et, dst_label = etype
    vi_data = read_yaml(b, vertex_yaml_path(src_label))
    ei_data = read_yaml(b, edge_yaml_path(src_label, et, dst_label))

    vi = VertexInfo.from_dict(vi_data)
    ei = EdgeInfo.from_dict(ei_data)
    gi = GraphInfo(name="graph", prefix="", vertex_infos=[vi], edge_infos=[ei])
    gs = GraphStore(b, gi, vertex_chunk_size=vi_data["chunk_size"])

    ref = gs.compact(etype, vchunks=vchunks)
    print(f"compacted → {ref[:8]}")
102
+
103
+
104
def main(argv=None):
    """Parse the command line and dispatch to the selected sub-command.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default)
            argparse falls back to ``sys.argv[1:]``, so existing callers and
            console-script entry points that call ``main()`` are unaffected.

    A sub-command is required; each sub-parser stores its handler in
    ``args.func`` via ``set_defaults``.
    """
    parser = argparse.ArgumentParser(
        prog="deltagraphar",
        description="DeltaGraphAr — versioned property-graph store",
    )
    sub = parser.add_subparsers(dest="command", required=True)

    p_log = sub.add_parser("log", help="Show commit history")
    p_log.add_argument("--repo", required=True, help="Path to local repo")
    p_log.set_defaults(func=cmd_log)

    p_tag = sub.add_parser("tag", help="Tag the latest commit")
    p_tag.add_argument("--repo", required=True)
    p_tag.add_argument("name", help="Tag name")
    p_tag.set_defaults(func=cmd_tag)

    p_nbr = sub.add_parser("neighbors", help="List out-neighbors of a vertex at a ref")
    p_nbr.add_argument("--repo", required=True)
    p_nbr.add_argument("--label", required=True, help="Vertex label")
    p_nbr.add_argument("--vertex", required=True, help="Logical vertex ID")
    p_nbr.add_argument("--etype", required=True, help="Edge type as 'src,edge,dst'")
    p_nbr.add_argument("--ref", default=None, help="Commit ref or tag (default: HEAD)")
    p_nbr.set_defaults(func=cmd_neighbors)

    p_compact = sub.add_parser("compact", help="Compact delta into ordered CSR")
    p_compact.add_argument("--repo", required=True)
    p_compact.add_argument("--etype", required=True, help="Edge type as 'src,edge,dst'")
    p_compact.add_argument("--vchunks", default=None,
                           help="Comma-separated vchunk indices (default: auto-discover)")
    p_compact.set_defaults(func=cmd_compact)

    args = parser.parse_args(argv)
    args.func(args)
137
+
138
+
139
+ if __name__ == "__main__":
140
+ main()
File without changes
@@ -0,0 +1,35 @@
1
def vertex_chunk_path(label: str, pg_prefix: str, chunk_idx: int) -> str:
    """Return the path of one vertex property chunk for a label and property group."""
    group_dir = f"vertex/{label}/{pg_prefix}"
    return f"{group_dir}/chunk{chunk_idx}"
3
+
4
+
5
def vid_map_chunk_path(label: str, chunk_idx: int) -> str:
    """Return the path of one chunk under the ``__vid_map__`` directory of a label."""
    map_dir = f"vertex/{label}/__vid_map__"
    return f"{map_dir}/chunk{chunk_idx}"
7
+
8
+
9
def adj_list_chunk_path(
    src: str, etype: str, dst: str, adj_type: str, vchunk: int, chunk_idx: int
) -> str:
    """Return the path of one adjacency-list chunk.

    The path is ``edge/<src>_<etype>_<dst>/<adj_type>/adj_list/part<vchunk>/chunk<chunk_idx>``.
    """
    edge_dir = f"edge/{src}_{etype}_{dst}"
    partition_dir = f"{edge_dir}/{adj_type}/adj_list/part{vchunk}"
    return f"{partition_dir}/chunk{chunk_idx}"
13
+
14
+
15
def offset_chunk_path(src: str, etype: str, dst: str, vchunk: int) -> str:
    """Return the path of the offset chunk of one vchunk.

    Offsets live under ``ordered_by_source`` and always use ``chunk0``.
    """
    edge_dir = f"edge/{src}_{etype}_{dst}"
    return f"{edge_dir}/ordered_by_source/offset/part{vchunk}/chunk0"
17
+
18
+
19
def edge_prop_chunk_path(
    src: str, etype: str, dst: str,
    adj_type: str, pg_prefix: str, vchunk: int, chunk_idx: int,
) -> str:
    """Return the path of one edge property-group chunk.

    The path is ``edge/<src>_<etype>_<dst>/<adj_type>/<pg_prefix>/part<vchunk>/chunk<chunk_idx>``.
    """
    edge_dir = f"edge/{src}_{etype}_{dst}"
    partition_dir = f"{edge_dir}/{adj_type}/{pg_prefix}/part{vchunk}"
    return f"{partition_dir}/chunk{chunk_idx}"
24
+
25
+
26
def graph_yaml_path(name: str) -> str:
    """Return the file name of the graph manifest for ``name``."""
    suffix = ".graph.yml"
    return f"{name}{suffix}"
28
+
29
+
30
def vertex_yaml_path(label: str) -> str:
    """Return the file name of the vertex description for ``label``."""
    suffix = ".vertex.yml"
    return f"{label}{suffix}"
32
+
33
+
34
def edge_yaml_path(src: str, etype: str, dst: str) -> str:
    """Return the file name of the edge description for a ``(src, etype, dst)`` triple."""
    stem = f"{src}_{etype}_{dst}"
    return f"{stem}.edge.yml"
@@ -0,0 +1,58 @@
1
+ import io
2
+ import yaml
3
+ import pyarrow as pa
4
+ import pyarrow.parquet as pq
5
+ import pyarrow.compute as pc
6
+
7
+ from deltagraphar.format.paths import offset_chunk_path
8
+
9
+
10
def read_table(backend, path: str, ref=None) -> pa.Table:
    """Read the file at ``path`` (optionally at ``ref``) and parse it as a Parquet table."""
    raw = backend.read_file(path, ref=ref)
    buffer = io.BytesIO(raw)
    return pq.read_table(buffer)
13
+
14
+
15
def read_yaml(backend, path: str, ref=None) -> dict:
    """Read the file at ``path`` (optionally at ``ref``) and parse it with ``yaml.safe_load``."""
    raw = backend.read_file(path, ref=ref)
    return yaml.safe_load(raw)
17
+
18
+
19
def read_adj_list(
    backend, src: str, etype: str, dst: str, adj_type: str, vchunk: int, ref=None
) -> pa.Table:
    """Load every chunk of one (adj_type, vchunk) partition as a single table.

    Returns an empty table with int64 ``src_physical`` / ``dst_physical``
    columns when the backend lists no files under the partition.
    """
    partition_dir = f"edge/{src}_{etype}_{dst}/{adj_type}/adj_list/part{vchunk}"
    # NOTE(review): plain string sort puts "chunk10" before "chunk2" — confirm
    # whether callers rely on numeric chunk order for partitions with 10+ chunks.
    chunk_paths = sorted(backend.list(partition_dir, ref=ref))
    if chunk_paths:
        tables = [read_table(backend, chunk, ref=ref) for chunk in chunk_paths]
        return pa.concat_tables(tables)

    return pa.table({
        "src_physical": pa.array([], type=pa.int64()),
        "dst_physical": pa.array([], type=pa.int64()),
    })
31
+
32
+
33
def read_offsets(backend, src: str, etype: str, dst: str, vchunk: int, ref=None) -> list[int]:
    """Return the ``offset`` column of a vchunk's offset file as a Python list.

    A ``FileNotFoundError`` while reading is treated as "not yet compacted"
    and yields an empty list.
    """
    offsets_path = offset_chunk_path(src, etype, dst, vchunk)
    try:
        offsets_column = read_table(backend, offsets_path, ref=ref)["offset"]
        result = offsets_column.to_pylist()
    except FileNotFoundError:
        result = []
    return result
41
+
42
+
43
def scan_delta(
    backend, src: str, etype: str, dst: str, vchunk: int, src_physical: int, ref=None
) -> list[int]:
    """List the ``dst_physical`` values of delta rows whose source matches.

    Only the ``unordered_by_source`` partition of ``vchunk`` is scanned.
    """
    delta = read_adj_list(
        backend, src, etype, dst, "unordered_by_source", vchunk, ref=ref
    )
    if len(delta) > 0:
        matches = pc.equal(delta["src_physical"], src_physical)
        selected = delta.filter(matches)
        return selected["dst_physical"].to_pylist()
    return []
52
+
53
+
54
def count_rows(
    backend, src: str, etype: str, dst: str, adj_type: str, vchunk: int, ref=None
) -> int:
    """Return the number of adjacency rows in one (adj_type, vchunk) partition."""
    return len(read_adj_list(backend, src, etype, dst, adj_type, vchunk, ref=ref))
@@ -0,0 +1,136 @@
1
+ from __future__ import annotations
2
+ from dataclasses import dataclass, field
3
+
4
+
5
@dataclass
class Property:
    """A named property together with its data type name."""

    name: str
    # One of: int32 | int64 | float32 | float64 | string | bool
    data_type: str
9
+
10
+
11
@dataclass
class PropertyGroup:
    """A set of properties stored together."""

    properties: list[Property]
    file_type: str = "parquet"
    # Subdirectory name used in chunk paths.
    prefix: str = ""
16
+
17
+
18
@dataclass
class VertexInfo:
    """Description of one vertex label: chunk size, property groups and format version."""

    label: str
    chunk_size: int
    property_groups: list[PropertyGroup] = field(default_factory=list)
    version: str = "gar/v1"

    def to_dict(self) -> dict:
        """Serialize to a plain dict; property groups are converted with ``_pg_to_dict``."""
        serialized_groups = [_pg_to_dict(group) for group in self.property_groups]
        payload = {
            "label": self.label,
            "chunk_size": self.chunk_size,
            "property_groups": serialized_groups,
            "version": self.version,
        }
        return payload

    @classmethod
    def from_dict(cls, d: dict) -> VertexInfo:
        """Build an instance from a dict produced by ``to_dict``.

        ``label`` and ``chunk_size`` are required; ``property_groups`` defaults
        to an empty list and ``version`` to ``"gar/v1"``.
        """
        label = d["label"]
        chunk_size = d["chunk_size"]
        groups = [_pg_from_dict(raw) for raw in d.get("property_groups", [])]
        version = d.get("version", "gar/v1")
        return cls(
            label=label,
            chunk_size=chunk_size,
            property_groups=groups,
            version=version,
        )
41
+
42
+
43
def _default_adj_lists() -> list[dict]:
    """Return a fresh copy of the two adjacency-list layouts emitted by v1."""
    return [
        {"ordered": True, "aligned_by": "src", "file_type": "parquet", "prefix": "ordered_by_source"},
        {"ordered": False, "aligned_by": "src", "file_type": "parquet", "prefix": "unordered_by_source"},
    ]


@dataclass
class EdgeInfo:
    """Description of one edge type: endpoints, chunk sizes, layouts and property groups."""

    src_type: str
    edge_type: str
    dst_type: str
    chunk_size: int
    src_chunk_size: int
    directed: bool = True
    property_groups: list[PropertyGroup] = field(default_factory=list)
    version: str = "gar/v1"
    # adj_lists stored so round-trips are lossless; v1 always emits ordered + unordered by src
    adj_lists: list[dict] = field(default_factory=_default_adj_lists)

    @property
    def etype(self) -> tuple[str, str, str]:
        """The ``(src_type, edge_type, dst_type)`` triple identifying this edge type."""
        return (self.src_type, self.edge_type, self.dst_type)

    def to_dict(self) -> dict:
        """Serialize to a plain dict.

        The ``adj_lists`` entries are copied so that callers mutating the
        returned dict cannot change this instance.
        """
        return {
            "src_type": self.src_type,
            "edge_type": self.edge_type,
            "dst_type": self.dst_type,
            "directed": self.directed,
            "chunk_size": self.chunk_size,
            "src_chunk_size": self.src_chunk_size,
            "adj_lists": [dict(entry) for entry in self.adj_lists],
            "property_groups": [_pg_to_dict(pg) for pg in self.property_groups],
            "version": self.version,
        }

    @classmethod
    def from_dict(cls, d: dict) -> EdgeInfo:
        """Build an instance from a dict produced by ``to_dict``.

        The five type/size keys are required. ``directed`` defaults to
        ``True``, ``property_groups`` to an empty list, ``version`` to
        ``"gar/v1"``. A missing (or ``None``) ``adj_lists`` falls back to the
        v1 default layouts; a present one is copied so the new instance does
        not share mutable state with ``d``.
        """
        raw_adj_lists = d.get("adj_lists")
        if raw_adj_lists is None:
            adj_lists = _default_adj_lists()
        else:
            adj_lists = [dict(entry) for entry in raw_adj_lists]
        return cls(
            src_type=d["src_type"],
            edge_type=d["edge_type"],
            dst_type=d["dst_type"],
            chunk_size=d["chunk_size"],
            src_chunk_size=d["src_chunk_size"],
            directed=d.get("directed", True),
            property_groups=[_pg_from_dict(pg) for pg in d.get("property_groups", [])],
            version=d.get("version", "gar/v1"),
            adj_lists=adj_lists,
        )
92
+
93
+
94
@dataclass
class GraphInfo:
    """Top-level graph manifest: name, prefix and the vertex/edge descriptions it contains."""

    name: str
    prefix: str
    vertex_infos: list[VertexInfo] = field(default_factory=list)
    edge_infos: list[EdgeInfo] = field(default_factory=list)
    version: str = "gar/v1"

    def to_dict(self) -> dict:
        """Serialize to a manifest dict that names the vertex/edge YAML files."""
        vertex_files = []
        for vi in self.vertex_infos:
            vertex_files.append(f"{vi.label}.vertex.yml")

        edge_files = []
        for ei in self.edge_infos:
            edge_files.append(f"{ei.src_type}_{ei.edge_type}_{ei.dst_type}.edge.yml")

        return {
            "name": self.name,
            "prefix": self.prefix,
            "vertices": vertex_files,
            "edges": edge_files,
            "version": self.version,
        }

    @classmethod
    def from_dict(cls, d: dict, vertex_infos: list[VertexInfo] | None = None, edge_infos: list[EdgeInfo] | None = None) -> GraphInfo:
        """Rebuild a GraphInfo from a manifest dict.

        The manifest only carries file names, so already-loaded vertex/edge
        infos can be supplied; a falsy value for either becomes an empty list.
        """
        loaded_vertices = vertex_infos if vertex_infos else []
        loaded_edges = edge_infos if edge_infos else []
        return cls(
            name=d["name"],
            prefix=d.get("prefix", ""),
            vertex_infos=loaded_vertices,
            edge_infos=loaded_edges,
            version=d.get("version", "gar/v1"),
        )
121
+
122
+
123
+ def _pg_to_dict(pg: PropertyGroup) -> dict:
124
+ return {
125
+ "properties": [{"name": p.name, "data_type": p.data_type} for p in pg.properties],
126
+ "file_type": pg.file_type,
127
+ "prefix": pg.prefix,
128
+ }
129
+
130
+
131
def _pg_from_dict(d: dict) -> PropertyGroup:
    """Rebuild a property group from its dict form.

    ``properties`` is required; each entry is expanded as keyword arguments to
    ``Property``. ``file_type`` defaults to ``"parquet"`` and ``prefix`` to ``""``.
    """
    props = [Property(**raw) for raw in d["properties"]]
    file_type = d.get("file_type", "parquet")
    prefix = d.get("prefix", "")
    return PropertyGroup(properties=props, file_type=file_type, prefix=prefix)
@@ -0,0 +1,14 @@
1
+ import io
2
+ import yaml
3
+ import pyarrow as pa
4
+ import pyarrow.parquet as pq
5
+
6
+
7
def write_table(backend, path: str, table: pa.Table) -> None:
    """Serialize ``table`` as Parquet in memory and hand the bytes to the backend."""
    sink = io.BytesIO()
    pq.write_table(table, sink)
    payload = sink.getvalue()
    backend.write_file(path, payload)
11
+
12
+
13
def write_yaml(backend, path: str, data: dict) -> None:
    """Dump ``data`` as block-style YAML and hand the encoded bytes to the backend."""
    text = yaml.dump(data, default_flow_style=False)
    backend.write_file(path, text.encode())
File without changes
@@ -0,0 +1,119 @@
1
+ from __future__ import annotations
2
+ import pyarrow as pa
3
+ import pyarrow.compute as pc
4
+
5
+ from deltagraphar.format.paths import adj_list_chunk_path, offset_chunk_path, edge_prop_chunk_path
6
+ from deltagraphar.format.reader import read_adj_list, count_rows, read_table
7
+ from deltagraphar.format.writer import write_table
8
+
9
+
10
def should_compact(backend, etype, vchunk, threshold_ratio, min_rows):
    """Decide whether a vchunk's delta is large enough to compact.

    Returns ``False`` for an empty delta; otherwise ``True`` when the delta row
    count exceeds ``max(threshold_ratio * base_rows, min_rows)``.
    """
    src, et, dst = etype
    pending = count_rows(backend, src, et, dst, "unordered_by_source", vchunk)
    if pending == 0:
        # Nothing to merge; the ordered partition is not even counted.
        return False

    compacted = count_rows(backend, src, et, dst, "ordered_by_source", vchunk)
    limit = max(threshold_ratio * compacted, min_rows)
    return pending > limit
18
+
19
+
20
def compact_vchunk(backend, etype, vchunk, vsize, edge_chunk_size, pg_prefixes):
    """Fold the unordered delta of one vchunk into its ordered_by_source data.

    Steps, all confined to files under ``part<vchunk>``:
      1. read ordered and unordered adjacency rows and merge them;
      2. sort by (src_physical, dst_physical) for a deterministic layout;
      3. rewrite the ordered adjacency chunks and the offset array;
      4. apply the same row permutation to every edge property group;
      5. truncate the delta.
    Returns without writing anything when both partitions are empty.
    """
    src, et, dst = etype
    ordered_name = "ordered_by_source"
    delta_name = "unordered_by_source"

    base_adj = read_adj_list(backend, src, et, dst, ordered_name, vchunk)
    delta_adj = read_adj_list(backend, src, et, dst, delta_name, vchunk)

    non_empty = [tbl for tbl in (base_adj, delta_adj) if len(tbl) > 0]
    if not non_empty:
        return
    merged = non_empty[0] if len(non_empty) == 1 else pa.concat_tables(non_empty)

    # Sort by (src, dst) — required for CSR correctness and determinism
    order = pc.sort_indices(
        merged, sort_keys=[("src_physical", "ascending"), ("dst_physical", "ascending")]
    )
    merged = merged.take(order)

    # Write new ordered adj_list chunks
    def adj_path(ci):
        return adj_list_chunk_path(src, et, dst, ordered_name, vchunk, ci)

    _write_chunks(backend, merged, edge_chunk_size, adj_path)

    # Recompute CSR offset array: offsets[i] = index of first row with src_physical >= vstart+i
    first_vertex = vchunk * vsize
    sources = merged["src_physical"].to_pylist()
    row_count = len(sources)
    cursor = 0
    offsets: list[int] = []
    for target in range(first_vertex, first_vertex + vsize + 1):
        # The cursor only moves forward because the rows are sorted by source.
        while cursor < row_count and sources[cursor] < target:
            cursor += 1
        offsets.append(cursor)

    offsets_table = pa.table({"offset": pa.array(offsets, type=pa.int64())})
    write_table(backend, offset_chunk_path(src, et, dst, vchunk), offsets_table)

    # Compact edge property groups in tandem, preserving row alignment with adj_list
    for pg_prefix in pg_prefixes:
        candidates = (
            _read_all_pg_chunks(backend, src, et, dst, ordered_name, pg_prefix, vchunk),
            _read_all_pg_chunks(backend, src, et, dst, delta_name, pg_prefix, vchunk),
        )
        present = [tbl for tbl in candidates if tbl is not None and len(tbl) > 0]
        if not present:
            continue
        combined = present[0] if len(present) == 1 else pa.concat_tables(present)
        # NOTE(review): assumes the property rows line up one-to-one with the
        # merged adjacency rows before sorting — confirm with the writer side.
        reordered = combined.take(order)

        # Bind the prefix as a default so the path builder does not depend on the loop variable.
        def prop_path(ci, _p=pg_prefix):
            return edge_prop_chunk_path(src, et, dst, ordered_name, _p, vchunk, ci)

        _write_chunks(backend, reordered, edge_chunk_size, prop_path)

    # Truncate delta: overwrite all existing delta chunks with an empty table
    _truncate_delta(backend, src, et, dst, vchunk, pg_prefixes)
80
+
81
+
82
def _write_chunks(backend, table, chunk_size, path_fn):
    """Write ``table`` as consecutive chunks of at most ``chunk_size`` rows.

    ``path_fn(i)`` supplies the destination of chunk ``i``. An empty table is
    still written once, to ``path_fn(0)``.
    """
    total_rows = len(table)
    if total_rows == 0:
        write_table(backend, path_fn(0), table)
        return

    chunk_index = 0
    for start in range(0, total_rows, chunk_size):
        piece = table.slice(start, chunk_size)
        write_table(backend, path_fn(chunk_index), piece)
        chunk_index += 1
89
+
90
+
91
def _read_all_pg_chunks(backend, src, et, dst, adj_type, pg_prefix, vchunk):
    """Load every chunk of one edge property-group partition as a single table.

    Returns ``None`` when the backend lists no files under the partition.
    Zero-row chunks are skipped before concatenation: delta truncation writes
    column-less empty tables as sentinels, and concatenating such a table with
    a chunk that has real columns would fail on the schema mismatch. When every
    chunk is empty, the first (zero-row) table is returned.
    """
    prefix = f"edge/{src}_{et}_{dst}/{adj_type}/{pg_prefix}/part{vchunk}"
    paths = sorted(backend.list(prefix))
    if not paths:
        return None

    tables = [read_table(backend, p) for p in paths]
    populated = [t for t in tables if len(t) > 0]
    if not populated:
        # Keep the "zero rows, not None" result for callers that check len().
        return tables[0]
    return pa.concat_tables(populated)
97
+
98
+
99
def _truncate_delta(backend, src, et, dst, vchunk, pg_prefixes=()):
    """Empty the unordered delta of one vchunk (adj_list plus every listed property group)."""
    delta_root = f"edge/{src}_{et}_{dst}/unordered_by_source"

    empty_adj = pa.table({
        "src_physical": pa.array([], type=pa.int64()),
        "dst_physical": pa.array([], type=pa.int64()),
    })
    _zero_prefix(backend, f"{delta_root}/adj_list/part{vchunk}", empty_adj)

    for pg_prefix in pg_prefixes:
        # The property schema is not known here, so a column-less table with
        # zero rows serves as the "empty" sentinel.
        sentinel = pa.table({})
        _zero_prefix(backend, f"{delta_root}/{pg_prefix}/part{vchunk}", sentinel)
110
+
111
+
112
def _zero_prefix(backend, prefix, empty_table):
    """Overwrite every file listed under ``prefix`` with ``empty_table``.

    When nothing is listed, a single ``chunk0`` sentinel is written instead so
    that future compactions see the partition as already cleared.
    """
    existing = backend.list(prefix)
    targets = existing if existing else [prefix + "/chunk0"]
    for target in targets:
        write_table(backend, target, empty_table)