deltagraphar 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltagraphar/__init__.py +0 -0
- deltagraphar/cli.py +140 -0
- deltagraphar/format/__init__.py +0 -0
- deltagraphar/format/paths.py +35 -0
- deltagraphar/format/reader.py +58 -0
- deltagraphar/format/schema.py +136 -0
- deltagraphar/format/writer.py +14 -0
- deltagraphar/store/__init__.py +0 -0
- deltagraphar/store/compaction.py +119 -0
- deltagraphar/store/graphstore.py +270 -0
- deltagraphar/store/ids.py +99 -0
- deltagraphar/versioning/__init__.py +0 -0
- deltagraphar/versioning/backend.py +40 -0
- deltagraphar/versioning/lakefs_backend.py +91 -0
- deltagraphar/versioning/local_backend.py +86 -0
- deltagraphar-0.1.0.dist-info/METADATA +204 -0
- deltagraphar-0.1.0.dist-info/RECORD +21 -0
- deltagraphar-0.1.0.dist-info/WHEEL +5 -0
- deltagraphar-0.1.0.dist-info/entry_points.txt +2 -0
- deltagraphar-0.1.0.dist-info/licenses/LICENSE +21 -0
- deltagraphar-0.1.0.dist-info/top_level.txt +1 -0
deltagraphar/__init__.py
ADDED
|
File without changes
|
deltagraphar/cli.py
ADDED
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
"""Command-line interface for DeltaGraphAr."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import argparse
|
|
5
|
+
import json
|
|
6
|
+
import sys
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def _get_backend(args):
    """Return a ``LocalBackend`` for the repository path in ``args.repo``.

    The backend module is imported inside the function, so it is only
    loaded when a command actually needs a backend.
    """
    from deltagraphar.versioning import local_backend

    repo_path = args.repo
    return local_backend.LocalBackend(repo_path)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def cmd_log(args):
    """Print the commit history of the repository, one line per commit.

    Each line holds the first 8 characters of the commit ref, the commit
    timestamp rendered in UTC, the commit message and, when it is truthy,
    the commit metadata. Prints ``(no commits)`` when the log is empty.

    Args:
        args: Parsed CLI namespace; ``args.repo`` is forwarded to the backend.
    """
    # Imported once here rather than on every iteration of the loop below.
    from datetime import datetime, timezone

    b = _get_backend(args)
    commits = b.log()
    if not commits:
        print("(no commits)")
        return
    for c in commits:
        # c.timestamp is interpreted as seconds since the epoch and shown in UTC.
        dt = datetime.fromtimestamp(c.timestamp, tz=timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC")
        meta = f" {c.metadata}" if c.metadata else ""
        print(f"{c.ref[:8]} {dt} {c.message}{meta}")
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def cmd_tag(args):
    """Tag the last commit in the backend's log with ``args.name``.

    Reports an error on stderr and exits with status 1 when the log is empty.
    """
    backend = _get_backend(args)
    history = backend.log()
    if history:
        # The last entry of the log is the commit that receives the tag.
        latest_ref = history[-1].ref
        backend.tag(args.name, latest_ref)
        print(f"tagged {latest_ref[:8]} as {args.name!r}")
        return
    print("error: no commits to tag", file=sys.stderr)
    sys.exit(1)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def cmd_neighbors(args):
    """Print the out-neighbors of one vertex as a JSON document.

    Reads the vertex and edge YAML descriptors at ``args.ref`` (or the
    backend default when no ref is given), builds a one-vertex/one-edge
    ``GraphInfo`` around them and asks ``GraphStore.out_neighbors`` for
    the neighbors of ``args.vertex``.

    Exits with status 1 when ``--etype`` does not split into exactly three
    comma-separated parts.
    """
    from deltagraphar.versioning.local_backend import LocalBackend
    from deltagraphar.store.graphstore import GraphStore
    from deltagraphar.format.reader import read_yaml
    from deltagraphar.format.paths import vertex_yaml_path, edge_yaml_path
    from deltagraphar.format.schema import GraphInfo, VertexInfo, EdgeInfo

    backend = LocalBackend(args.repo)
    # An empty --ref string is treated the same as "not given".
    ref = args.ref or None

    etype = tuple(args.etype.split(","))
    if len(etype) != 3:
        print("error: --etype must be 'src,edge,dst'", file=sys.stderr)
        sys.exit(1)
    src_label, et, dst_label = etype

    # NOTE(review): the vertex descriptor is loaded for --label while the edge
    # descriptor uses the source label from --etype; confirm they are meant to
    # be allowed to differ.
    vertex_doc = read_yaml(backend, vertex_yaml_path(args.label), ref=ref)
    edge_doc = read_yaml(backend, edge_yaml_path(src_label, et, dst_label), ref=ref)

    vertex_info = VertexInfo(label=vertex_doc["label"], chunk_size=vertex_doc["chunk_size"])
    edge_fields = ("src_type", "edge_type", "dst_type", "chunk_size", "src_chunk_size")
    edge_info = EdgeInfo(**{key: edge_doc[key] for key in edge_fields})

    graph_info = GraphInfo(
        name="graph",
        prefix="",
        vertex_infos=[vertex_info],
        edge_infos=[edge_info],
    )
    store = GraphStore(backend, graph_info, vertex_chunk_size=vertex_doc["chunk_size"])
    # args.vertex is passed through exactly as argparse delivered it (a string).
    neighbors = store.out_neighbors(args.label, args.vertex, etype, ref=ref)
    print(json.dumps(neighbors))
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def cmd_compact(args):
    """Compact the unordered delta of one edge type and print the new ref.

    The vertex and edge descriptors are loaded from their YAML files with
    ``VertexInfo.from_dict`` / ``EdgeInfo.from_dict`` so that optional fields
    (``property_groups``, ``directed``, ``adj_lists``, ``version``) survive;
    building the objects from the five mandatory keys alone silently drops
    them. NOTE(review): ``GraphStore.compact`` is not visible here; presumably
    it derives the property-group prefixes to compact from the ``EdgeInfo`` —
    confirm.

    Args:
        args: Parsed CLI namespace with ``repo``, ``etype`` ("src,edge,dst")
            and optional ``vchunks`` (comma-separated integers).

    Exits with status 1 when ``--etype`` or ``--vchunks`` is malformed.
    """
    from deltagraphar.versioning.local_backend import LocalBackend
    from deltagraphar.store.graphstore import GraphStore
    from deltagraphar.format.reader import read_yaml
    from deltagraphar.format.paths import vertex_yaml_path, edge_yaml_path
    from deltagraphar.format.schema import GraphInfo, VertexInfo, EdgeInfo

    b = LocalBackend(args.repo)
    etype = tuple(args.etype.split(","))
    if len(etype) != 3:
        print("error: --etype must be 'src,edge,dst'", file=sys.stderr)
        sys.exit(1)

    # Validate --vchunks up front so bad input yields the same style of
    # error as --etype instead of a ValueError traceback.
    vchunks = None
    if args.vchunks:
        try:
            vchunks = [int(v) for v in args.vchunks.split(",")]
        except ValueError:
            print("error: --vchunks must be comma-separated integers", file=sys.stderr)
            sys.exit(1)

    src_label, et, dst_label = etype
    vi_data = read_yaml(b, vertex_yaml_path(src_label))
    ei_data = read_yaml(b, edge_yaml_path(src_label, et, dst_label))

    # from_dict keeps every field stored in the descriptors.
    vi = VertexInfo.from_dict(vi_data)
    ei = EdgeInfo.from_dict(ei_data)
    gi = GraphInfo(name="graph", prefix="", vertex_infos=[vi], edge_infos=[ei])
    gs = GraphStore(b, gi, vertex_chunk_size=vi_data["chunk_size"])

    ref = gs.compact(etype, vchunks=vchunks)
    print(f"compacted → {ref[:8]}")
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def main():
    """Parse the command line and run the handler of the chosen sub-command.

    Sub-commands: ``log``, ``tag``, ``neighbors`` and ``compact``. Each one
    registers its handler through ``set_defaults(func=...)``; a sub-command
    is mandatory.
    """
    parser = argparse.ArgumentParser(
        prog="deltagraphar",
        description="DeltaGraphAr — versioned property-graph store",
    )
    commands = parser.add_subparsers(dest="command", required=True)

    # log
    log_cmd = commands.add_parser("log", help="Show commit history")
    log_cmd.add_argument("--repo", required=True, help="Path to local repo")
    log_cmd.set_defaults(func=cmd_log)

    # tag
    tag_cmd = commands.add_parser("tag", help="Tag the latest commit")
    tag_cmd.add_argument("--repo", required=True)
    tag_cmd.add_argument("name", help="Tag name")
    tag_cmd.set_defaults(func=cmd_tag)

    # neighbors
    nbr_cmd = commands.add_parser("neighbors", help="List out-neighbors of a vertex at a ref")
    nbr_cmd.add_argument("--repo", required=True)
    nbr_cmd.add_argument("--label", required=True, help="Vertex label")
    nbr_cmd.add_argument("--vertex", required=True, help="Logical vertex ID")
    nbr_cmd.add_argument("--etype", required=True, help="Edge type as 'src,edge,dst'")
    nbr_cmd.add_argument("--ref", default=None, help="Commit ref or tag (default: HEAD)")
    nbr_cmd.set_defaults(func=cmd_neighbors)

    # compact
    compact_cmd = commands.add_parser("compact", help="Compact delta into ordered CSR")
    compact_cmd.add_argument("--repo", required=True)
    compact_cmd.add_argument("--etype", required=True, help="Edge type as 'src,edge,dst'")
    compact_cmd.add_argument(
        "--vchunks",
        default=None,
        help="Comma-separated vchunk indices (default: auto-discover)",
    )
    compact_cmd.set_defaults(func=cmd_compact)

    namespace = parser.parse_args()
    namespace.func(namespace)


if __name__ == "__main__":
    main()
|
|
File without changes
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
def vertex_chunk_path(label: str, pg_prefix: str, chunk_idx: int) -> str:
    """Return the path of chunk ``chunk_idx`` of a vertex property group."""
    chunk_path = f"vertex/{label}/{pg_prefix}/chunk{chunk_idx}"
    return chunk_path
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def vid_map_chunk_path(label: str, chunk_idx: int) -> str:
    """Return the path of chunk ``chunk_idx`` under the label's ``__vid_map__`` directory."""
    chunk_path = f"vertex/{label}/__vid_map__/chunk{chunk_idx}"
    return chunk_path
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def adj_list_chunk_path(
    src: str,
    etype: str,
    dst: str,
    adj_type: str,
    vchunk: int,
    chunk_idx: int,
) -> str:
    """Return the path of one adjacency-list chunk.

    ``adj_type`` is the layout directory (for example ``ordered_by_source``),
    ``vchunk`` selects the ``part<N>`` directory and ``chunk_idx`` the file.
    """
    chunk_path = f"edge/{src}_{etype}_{dst}/{adj_type}/adj_list/part{vchunk}/chunk{chunk_idx}"
    return chunk_path
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def offset_chunk_path(src: str, etype: str, dst: str, vchunk: int) -> str:
    """Return the path of the offset file of a vertex chunk.

    The file always lives under ``ordered_by_source`` and is always ``chunk0``.
    """
    offset_path = f"edge/{src}_{etype}_{dst}/ordered_by_source/offset/part{vchunk}/chunk0"
    return offset_path
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def edge_prop_chunk_path(
    src: str,
    etype: str,
    dst: str,
    adj_type: str,
    pg_prefix: str,
    vchunk: int,
    chunk_idx: int,
) -> str:
    """Return the path of one edge property-group chunk.

    Same layout as the adjacency-list path, with the property group's
    ``pg_prefix`` in place of the ``adj_list`` directory.
    """
    chunk_path = f"edge/{src}_{etype}_{dst}/{adj_type}/{pg_prefix}/part{vchunk}/chunk{chunk_idx}"
    return chunk_path
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def graph_yaml_path(name: str) -> str:
    """Return the file name of the graph-level YAML descriptor."""
    yaml_name = f"{name}.graph.yml"
    return yaml_name
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def vertex_yaml_path(label: str) -> str:
    """Return the file name of the YAML descriptor for a vertex label."""
    yaml_name = f"{label}.vertex.yml"
    return yaml_name
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def edge_yaml_path(src: str, etype: str, dst: str) -> str:
    """Return the file name of the YAML descriptor for an edge triple."""
    yaml_name = f"{src}_{etype}_{dst}.edge.yml"
    return yaml_name
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
import io
|
|
2
|
+
import yaml
|
|
3
|
+
import pyarrow as pa
|
|
4
|
+
import pyarrow.parquet as pq
|
|
5
|
+
import pyarrow.compute as pc
|
|
6
|
+
|
|
7
|
+
from deltagraphar.format.paths import offset_chunk_path
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def read_table(backend, path: str, ref=None) -> pa.Table:
    """Read the file at ``path`` (at ``ref``) from the backend and parse it as Parquet."""
    raw = backend.read_file(path, ref=ref)
    buffer = io.BytesIO(raw)
    return pq.read_table(buffer)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def read_yaml(backend, path: str, ref=None) -> dict:
    """Read the file at ``path`` (at ``ref``) from the backend and parse it with ``yaml.safe_load``."""
    raw = backend.read_file(path, ref=ref)
    return yaml.safe_load(raw)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def read_adj_list(
    backend, src: str, etype: str, dst: str, adj_type: str, vchunk: int, ref=None
) -> pa.Table:
    """Load one (adj_type, vchunk) adjacency partition as a single table.

    Every path the backend lists under the partition prefix is read and the
    tables are concatenated in sorted-path order. When nothing is listed, an
    empty table with int64 ``src_physical`` / ``dst_physical`` columns is
    returned.
    """
    prefix = f"edge/{src}_{etype}_{dst}/{adj_type}/adj_list/part{vchunk}"
    # NOTE(review): this is a plain string sort, so "chunk10" orders before
    # "chunk2". Compaction reads property chunks with the same ordering, so
    # any change here has to be mirrored there to keep rows aligned.
    chunk_paths = sorted(backend.list(prefix, ref=ref))
    if chunk_paths:
        chunk_tables = [read_table(backend, chunk, ref=ref) for chunk in chunk_paths]
        return pa.concat_tables(chunk_tables)
    return pa.table({
        "src_physical": pa.array([], type=pa.int64()),
        "dst_physical": pa.array([], type=pa.int64()),
    })
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def read_offsets(backend, src: str, etype: str, dst: str, vchunk: int, ref=None) -> list[int]:
    """Return the ``offset`` column of a vchunk's offset file as a Python list.

    A missing offset file (``FileNotFoundError`` from the read) means the
    vchunk has not been compacted yet; an empty list is returned then.
    """
    try:
        offsets_table = read_table(backend, offset_chunk_path(src, etype, dst, vchunk), ref=ref)
    except FileNotFoundError:
        return []
    return offsets_table["offset"].to_pylist()
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def scan_delta(
    backend, src: str, etype: str, dst: str, vchunk: int, src_physical: int, ref=None
) -> list[int]:
    """Return the ``dst_physical`` values of delta rows whose source matches.

    Only the ``unordered_by_source`` partition of ``vchunk`` is scanned.
    """
    delta = read_adj_list(backend, src, etype, dst, "unordered_by_source", vchunk, ref=ref)
    if len(delta) > 0:
        matches_source = pc.equal(delta["src_physical"], src_physical)
        matching_rows = delta.filter(matches_source)
        return matching_rows["dst_physical"].to_pylist()
    return []
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def count_rows(
    backend, src: str, etype: str, dst: str, adj_type: str, vchunk: int, ref=None
) -> int:
    """Return the number of rows in one (adj_type, vchunk) adjacency partition.

    The partition is fully loaded through ``read_adj_list`` to be counted.
    """
    return len(read_adj_list(backend, src, etype, dst, adj_type, vchunk, ref=ref))
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from dataclasses import dataclass, field
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
@dataclass
class Property:
    """One named, typed property."""

    name: str
    # One of: int32 | int64 | float32 | float64 | string | bool
    data_type: str
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass
class PropertyGroup:
    """A set of properties that share a file type and a path prefix."""

    properties: list[Property]
    file_type: str = "parquet"
    # Subdirectory name used for this group in chunk paths.
    prefix: str = ""
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass
class VertexInfo:
    """Descriptor of one vertex label: chunk size, property groups and format version."""

    label: str
    chunk_size: int
    property_groups: list[PropertyGroup] = field(default_factory=list)
    version: str = "gar/v1"

    def to_dict(self) -> dict:
        """Return the descriptor as a plain dict, property groups included."""
        groups = [_pg_to_dict(group) for group in self.property_groups]
        return dict(
            label=self.label,
            chunk_size=self.chunk_size,
            property_groups=groups,
            version=self.version,
        )

    @classmethod
    def from_dict(cls, d: dict) -> VertexInfo:
        """Build a ``VertexInfo`` from a dict shaped like ``to_dict`` output.

        ``label`` and ``chunk_size`` are required; ``property_groups``
        defaults to none and ``version`` to ``"gar/v1"``.
        """
        raw_groups = d.get("property_groups", [])
        return cls(
            label=d["label"],
            chunk_size=d["chunk_size"],
            property_groups=[_pg_from_dict(raw) for raw in raw_groups],
            version=d.get("version", "gar/v1"),
        )
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@dataclass
class EdgeInfo:
    """Descriptor of one (source, edge, destination) edge type."""

    src_type: str
    edge_type: str
    dst_type: str
    chunk_size: int
    src_chunk_size: int
    directed: bool = True
    property_groups: list[PropertyGroup] = field(default_factory=list)
    version: str = "gar/v1"
    # adj_lists stored so round-trips are lossless; v1 always emits ordered + unordered by src
    adj_lists: list[dict] = field(
        default_factory=lambda: [
            {"ordered": True, "aligned_by": "src", "file_type": "parquet", "prefix": "ordered_by_source"},
            {"ordered": False, "aligned_by": "src", "file_type": "parquet", "prefix": "unordered_by_source"},
        ]
    )

    @property
    def etype(self) -> tuple[str, str, str]:
        """The ``(src_type, edge_type, dst_type)`` triple."""
        return self.src_type, self.edge_type, self.dst_type

    def to_dict(self) -> dict:
        """Return the descriptor as a plain dict, property groups included."""
        groups = [_pg_to_dict(group) for group in self.property_groups]
        return dict(
            src_type=self.src_type,
            edge_type=self.edge_type,
            dst_type=self.dst_type,
            directed=self.directed,
            chunk_size=self.chunk_size,
            src_chunk_size=self.src_chunk_size,
            adj_lists=self.adj_lists,
            property_groups=groups,
            version=self.version,
        )

    @classmethod
    def from_dict(cls, d: dict) -> EdgeInfo:
        """Build an ``EdgeInfo`` from a dict shaped like ``to_dict`` output.

        The three type names and both chunk sizes are required; every other
        key falls back to the same default as the dataclass field.
        """
        fallback_adj_lists = [
            {"ordered": True, "aligned_by": "src", "file_type": "parquet", "prefix": "ordered_by_source"},
            {"ordered": False, "aligned_by": "src", "file_type": "parquet", "prefix": "unordered_by_source"},
        ]
        raw_groups = d.get("property_groups", [])
        return cls(
            src_type=d["src_type"],
            edge_type=d["edge_type"],
            dst_type=d["dst_type"],
            chunk_size=d["chunk_size"],
            src_chunk_size=d["src_chunk_size"],
            directed=d.get("directed", True),
            property_groups=[_pg_from_dict(raw) for raw in raw_groups],
            version=d.get("version", "gar/v1"),
            adj_lists=d.get("adj_lists", fallback_adj_lists),
        )
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
@dataclass
class GraphInfo:
    """Top-level graph manifest: name, prefix and the vertex/edge descriptors."""

    name: str
    prefix: str
    vertex_infos: list[VertexInfo] = field(default_factory=list)
    edge_infos: list[EdgeInfo] = field(default_factory=list)
    version: str = "gar/v1"

    def to_dict(self) -> dict:
        """Return the manifest as a plain dict.

        Vertex and edge descriptors are referenced by YAML file name only;
        their contents are not embedded.
        """
        vertex_files = [f"{vi.label}.vertex.yml" for vi in self.vertex_infos]
        edge_files = [f"{ei.src_type}_{ei.edge_type}_{ei.dst_type}.edge.yml" for ei in self.edge_infos]
        return dict(
            name=self.name,
            prefix=self.prefix,
            vertices=vertex_files,
            edges=edge_files,
            version=self.version,
        )

    @classmethod
    def from_dict(
        cls,
        d: dict,
        vertex_infos: list[VertexInfo] | None = None,
        edge_infos: list[EdgeInfo] | None = None,
    ) -> GraphInfo:
        """Reconstruct GraphInfo from manifest dict. Pass pre-loaded vertex/edge infos if available.

        The file names stored under ``vertices`` / ``edges`` are not read
        here; omitted (or empty) info lists become empty lists.
        """
        loaded_vertices = vertex_infos or []
        loaded_edges = edge_infos or []
        return cls(
            name=d["name"],
            prefix=d.get("prefix", ""),
            vertex_infos=loaded_vertices,
            edge_infos=loaded_edges,
            version=d.get("version", "gar/v1"),
        )
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def _pg_to_dict(pg: PropertyGroup) -> dict:
|
|
124
|
+
return {
|
|
125
|
+
"properties": [{"name": p.name, "data_type": p.data_type} for p in pg.properties],
|
|
126
|
+
"file_type": pg.file_type,
|
|
127
|
+
"prefix": pg.prefix,
|
|
128
|
+
}
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def _pg_from_dict(d: dict) -> PropertyGroup:
    """Build a ``PropertyGroup`` from a dict shaped like ``_pg_to_dict`` output.

    ``properties`` is required and each entry is expanded as keyword
    arguments to ``Property``; ``file_type`` defaults to ``"parquet"`` and
    ``prefix`` to the empty string.
    """
    parsed_properties = [Property(**raw) for raw in d["properties"]]
    file_type = d.get("file_type", "parquet")
    prefix = d.get("prefix", "")
    return PropertyGroup(properties=parsed_properties, file_type=file_type, prefix=prefix)
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import io
|
|
2
|
+
import yaml
|
|
3
|
+
import pyarrow as pa
|
|
4
|
+
import pyarrow.parquet as pq
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def write_table(backend, path: str, table: pa.Table) -> None:
    """Serialise ``table`` as Parquet in memory and store the bytes at ``path`` via the backend."""
    sink = io.BytesIO()
    pq.write_table(table, sink)
    payload = sink.getvalue()
    backend.write_file(path, payload)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def write_yaml(backend, path: str, data: dict) -> None:
    """Dump ``data`` as block-style YAML and store the encoded text at ``path`` via the backend."""
    text = yaml.dump(data, default_flow_style=False)
    backend.write_file(path, text.encode())
|
|
File without changes
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
import pyarrow as pa
|
|
3
|
+
import pyarrow.compute as pc
|
|
4
|
+
|
|
5
|
+
from deltagraphar.format.paths import adj_list_chunk_path, offset_chunk_path, edge_prop_chunk_path
|
|
6
|
+
from deltagraphar.format.reader import read_adj_list, count_rows, read_table
|
|
7
|
+
from deltagraphar.format.writer import write_table
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def should_compact(backend, etype, vchunk, threshold_ratio, min_rows):
    """Decide whether a vchunk's delta is large enough to be worth compacting.

    Returns True when the delta row count is greater than
    ``max(threshold_ratio * base_rows, min_rows)``; an empty delta never
    triggers compaction.
    """
    src, et, dst = etype
    pending = count_rows(backend, src, et, dst, "unordered_by_source", vchunk)
    if not pending:
        # Nothing to merge; skip counting the base partition.
        return False
    compacted = count_rows(backend, src, et, dst, "ordered_by_source", vchunk)
    limit = max(threshold_ratio * compacted, min_rows)
    return pending > limit
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def compact_vchunk(backend, etype, vchunk, vsize, edge_chunk_size, pg_prefixes):
    """Merge unordered delta into ordered_by_source CSR and recompute offsets.

    Only files under part<vchunk> change. Rows are sorted by
    (src_physical, dst_physical) for a deterministic layout, and every edge
    property group is re-ordered with the same permutation so its rows stay
    aligned with the adjacency list.

    Args:
        backend: Versioning backend used for all reads and writes.
        etype: ``(src, edge, dst)`` triple naming the edge type.
        vchunk: Index of the vertex chunk (``part<vchunk>``) to compact.
        vsize: Number of vertices per vertex chunk; ``vsize + 1`` offsets are
            written.
        edge_chunk_size: Maximum rows per written chunk file.
        pg_prefixes: Iterable of edge property-group prefixes to compact.

    Raises:
        ValueError: If a property group's combined base+delta row count
            differs from the merged adjacency row count. This is checked
            before anything is written, so a misaligned group cannot leave a
            half-compacted partition or be silently re-ordered against the
            wrong rows.
    """
    src, et, dst = etype
    # Materialise once: the prefixes are walked here and again when truncating.
    pg_prefixes = list(pg_prefixes)

    base_adj = read_adj_list(backend, src, et, dst, "ordered_by_source", vchunk)
    delta_adj = read_adj_list(backend, src, et, dst, "unordered_by_source", vchunk)

    if len(base_adj) == 0 and len(delta_adj) == 0:
        return

    parts = [t for t in [base_adj, delta_adj] if len(t) > 0]
    merged = pa.concat_tables(parts) if len(parts) > 1 else parts[0]

    # Load every property group up front and verify it lines up row-for-row
    # with the adjacency rows before any file is rewritten.
    pending_pgs = []
    for pg_prefix in pg_prefixes:
        base_pg = _read_all_pg_chunks(backend, src, et, dst, "ordered_by_source", pg_prefix, vchunk)
        delta_pg = _read_all_pg_chunks(backend, src, et, dst, "unordered_by_source", pg_prefix, vchunk)
        pg_parts = [t for t in [base_pg, delta_pg] if t is not None and len(t) > 0]
        if not pg_parts:
            continue
        pg_table = pa.concat_tables(pg_parts) if len(pg_parts) > 1 else pg_parts[0]
        if len(pg_table) != len(merged):
            raise ValueError(
                f"property group {pg_prefix!r} of edge {src}_{et}_{dst} part{vchunk} has "
                f"{len(pg_table)} rows but the adjacency list has {len(merged)}; "
                "refusing to compact misaligned data"
            )
        pending_pgs.append((pg_prefix, pg_table))

    # Sort by (src, dst) — required for CSR correctness and determinism
    sort_indices = pc.sort_indices(
        merged, sort_keys=[("src_physical", "ascending"), ("dst_physical", "ascending")]
    )
    merged = merged.take(sort_indices)

    # Write new ordered adj_list chunks
    _write_chunks(
        backend, merged, edge_chunk_size,
        lambda ci: adj_list_chunk_path(src, et, dst, "ordered_by_source", vchunk, ci),
    )

    # Recompute CSR offset array: offsets[i] = index of first row with src_physical >= vstart+i
    vstart = vchunk * vsize
    src_arr = merged["src_physical"].to_pylist()
    ptr = 0
    offsets: list[int] = []
    for i in range(vsize + 1):
        target = vstart + i
        # src_arr is sorted, so ptr only ever moves forward across iterations.
        while ptr < len(src_arr) and src_arr[ptr] < target:
            ptr += 1
        offsets.append(ptr)

    write_table(
        backend,
        offset_chunk_path(src, et, dst, vchunk),
        pa.table({"offset": pa.array(offsets, type=pa.int64())}),
    )

    # Compact edge property groups in tandem, preserving row alignment with adj_list
    for pg_prefix, pg_table in pending_pgs:
        merged_pg = pg_table.take(sort_indices)
        _write_chunks(
            backend, merged_pg, edge_chunk_size,
            # _p binds the current prefix; a bare closure would see the last one.
            lambda ci, _p=pg_prefix: edge_prop_chunk_path(src, et, dst, "ordered_by_source", _p, vchunk, ci),
        )

    # Truncate delta: overwrite all existing delta chunks with an empty table
    _truncate_delta(backend, src, et, dst, vchunk, pg_prefixes)
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def _write_chunks(backend, table, chunk_size, path_fn):
|
|
83
|
+
n = len(table)
|
|
84
|
+
if n == 0:
|
|
85
|
+
write_table(backend, path_fn(0), table)
|
|
86
|
+
return
|
|
87
|
+
for ci, start in enumerate(range(0, n, chunk_size)):
|
|
88
|
+
write_table(backend, path_fn(ci), table.slice(start, chunk_size))
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def _read_all_pg_chunks(backend, src, et, dst, adj_type, pg_prefix, vchunk):
    """Load every chunk of one edge property group partition as a single table.

    Returns ``None`` when the backend lists no files under the partition
    prefix. Zero-row chunks are left out of the concatenation: delta
    truncation overwrites property chunks with a column-less empty table,
    and concatenating such a placeholder with real chunks fails because the
    schemas differ. When every chunk is empty, the first one is returned so
    the result is still a (zero-row) table rather than ``None``.
    """
    prefix = f"edge/{src}_{et}_{dst}/{adj_type}/{pg_prefix}/part{vchunk}"
    # NOTE(review): plain string sort ("chunk10" before "chunk2"); this has to
    # stay in step with the ordering read_adj_list uses, since rows are
    # matched up by position.
    paths = sorted(backend.list(prefix))
    if not paths:
        return None
    tables = [read_table(backend, p) for p in paths]
    populated = [t for t in tables if len(t) > 0]
    if not populated:
        return tables[0]
    return pa.concat_tables(populated)
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _truncate_delta(backend, src, et, dst, vchunk, pg_prefixes=()):
    """Empty the unordered delta of a vchunk: its adjacency list and each listed property group.

    Adjacency chunks are replaced by a zero-row table with the int64
    ``src_physical`` / ``dst_physical`` columns; property-group chunks are
    replaced by a table with no columns at all.
    """
    adj_prefix = f"edge/{src}_{et}_{dst}/unordered_by_source/adj_list/part{vchunk}"
    adj_placeholder = pa.table({
        "src_physical": pa.array([], type=pa.int64()),
        "dst_physical": pa.array([], type=pa.int64()),
    })
    _zero_prefix(backend, adj_prefix, adj_placeholder)

    for pg_prefix in pg_prefixes:
        group_prefix = f"edge/{src}_{et}_{dst}/unordered_by_source/{pg_prefix}/part{vchunk}"
        # empty table with no columns — schema unknown here, but zero rows is correct sentinel
        _zero_prefix(backend, group_prefix, pa.table({}))
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def _zero_prefix(backend, prefix, empty_table):
    """Overwrite every file listed under ``prefix`` with ``empty_table``.

    When nothing is listed, a single ``chunk0`` is written instead, as a
    sentinel so future compactions see "already cleared".
    """
    targets = backend.list(prefix) or [prefix + "/chunk0"]
    for target in targets:
        write_table(backend, target, empty_table)
|