icebug-format 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- icebug_format-0.1.0/PKG-INFO +160 -0
- icebug_format-0.1.0/README.md +144 -0
- icebug_format-0.1.0/icebug_format/__init__.py +3 -0
- icebug_format-0.1.0/icebug_format/cli.py +765 -0
- icebug_format-0.1.0/icebug_format.egg-info/PKG-INFO +160 -0
- icebug_format-0.1.0/icebug_format.egg-info/SOURCES.txt +10 -0
- icebug_format-0.1.0/icebug_format.egg-info/dependency_links.txt +1 -0
- icebug_format-0.1.0/icebug_format.egg-info/entry_points.txt +2 -0
- icebug_format-0.1.0/icebug_format.egg-info/requires.txt +7 -0
- icebug_format-0.1.0/icebug_format.egg-info/top_level.txt +1 -0
- icebug_format-0.1.0/pyproject.toml +35 -0
- icebug_format-0.1.0/setup.cfg +4 -0
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: icebug-format
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Convert graph data from DuckDB to CSR format for Icebug
|
|
5
|
+
Project-URL: Homepage, https://github.com/anomalyco/icebug-format
|
|
6
|
+
Project-URL: Repository, https://github.com/anomalyco/icebug-format
|
|
7
|
+
Project-URL: PyPI, https://pypi.org/project/icebug-format
|
|
8
|
+
Requires-Python: >=3.13
|
|
9
|
+
Description-Content-Type: text/markdown
|
|
10
|
+
Requires-Dist: duckdb>=1.3.2
|
|
11
|
+
Provides-Extra: full
|
|
12
|
+
Requires-Dist: real_ladybug>=0.14.1; extra == "full"
|
|
13
|
+
Requires-Dist: networkx>=3.5; extra == "full"
|
|
14
|
+
Requires-Dist: pandas>=2.3.2; extra == "full"
|
|
15
|
+
Requires-Dist: pyarrow>=21.0.0; extra == "full"
|
|
16
|
+
|
|
17
|
+
# Icebug Format
|
|
18
|
+
|
|
19
|
+
> **Note**: This project was formerly called **graph-std**.
|
|
20
|
+
|
|
21
|
+
Icebug is a standardized graph format designed for efficient graph data interchange. It comes in two formats:
|
|
22
|
+
|
|
23
|
+
- **icebug-disk**: Parquet-based format for object storage
|
|
24
|
+
- **icebug-memory**: Apache Arrow-based format for in-memory processing
|
|
25
|
+
|
|
26
|
+
This project provides tools to convert graph data from simple DuckDB databases or Parquet files containing `nodes_*` and `edges_*` tables, along with a `schema.cypher` file, into standardized graph formats for efficient processing.
|
|
27
|
+
|
|
28
|
+
## Sample Usage
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
uv run icebug-format.py \
|
|
32
|
+
--source-db karate/karate_random.duckdb \
|
|
33
|
+
--output-db karate/karate_csr.duckdb \
|
|
34
|
+
--csr-table karate \
|
|
35
|
+
--schema karate/karate_csr/schema.cypher
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
This will create a CSR representation with multiple tables depending on the number of node and edge types:
|
|
39
|
+
|
|
40
|
+
- `{table_name}_indptr_{edge_name}`: Array of size N+1 for row pointers (one per edge table)
|
|
41
|
+
- `{table_name}_indices_{edge_name}`: Array of size E containing column indices (one per edge table)
|
|
42
|
+
- `{table_name}_nodes_{node_name}`: Original nodes table with node attributes (one per node table)
|
|
43
|
+
- `{table_name}_mapping_{node_name}`: Maps original node IDs to contiguous indices (one per node table)
|
|
44
|
+
- `{table_name}_metadata`: Global graph metadata (node count, edge count, directed flag)
|
|
45
|
+
- `schema.cypher`: A cypher schema that a graph database can mount without ingesting
|
|
46
|
+
|
|
47
|
+
## More information about Icebug and Apache GraphAR
|
|
48
|
+
|
|
49
|
+
[Blog Post](https://adsharma.github.io/graph-archiving/)
|
|
50
|
+
|
|
51
|
+
## Recreating demo-db/icebug-disk
|
|
52
|
+
|
|
53
|
+
Start from a simple demo-db.duckdb that looks like this
|
|
54
|
+
|
|
55
|
+
```
|
|
56
|
+
Querying database: demo-db.duckdb
|
|
57
|
+
================================
|
|
58
|
+
|
|
59
|
+
--- Table: edges_follows ---
|
|
60
|
+
┌────────┬────────┬───────┐
|
|
61
|
+
│ source │ target │ since │
|
|
62
|
+
│ int32 │ int32 │ int32 │
|
|
63
|
+
├────────┼────────┼───────┤
|
|
64
|
+
│ 100 │ 250 │ 2020 │
|
|
65
|
+
│ 300 │ 75 │ 2022 │
|
|
66
|
+
│ 250 │ 300 │ 2021 │
|
|
67
|
+
│ 100 │ 300 │ 2020 │
|
|
68
|
+
└────────┴────────┴───────┘
|
|
69
|
+
================================
|
|
70
|
+
|
|
71
|
+
--- Table: edges_livesin ---
|
|
72
|
+
┌────────┬────────┐
|
|
73
|
+
│ source │ target │
|
|
74
|
+
│ int32 │ int32 │
|
|
75
|
+
├────────┼────────┤
|
|
76
|
+
│ 100 │ 700 │
|
|
77
|
+
│ 250 │ 700 │
|
|
78
|
+
│ 300 │ 600 │
|
|
79
|
+
│ 75 │ 500 │
|
|
80
|
+
└────────┴────────┘
|
|
81
|
+
================================
|
|
82
|
+
|
|
83
|
+
--- Table: nodes_city ---
|
|
84
|
+
┌───────┬───────────┬────────────┐
|
|
85
|
+
│ id │ name │ population │
|
|
86
|
+
│ int32 │ varchar │ int64 │
|
|
87
|
+
├───────┼───────────┼────────────┤
|
|
88
|
+
│ 500 │ Guelph │ 75000 │
|
|
89
|
+
│ 600 │ Kitchener │ 200000 │
|
|
90
|
+
│ 700 │ Waterloo │ 150000 │
|
|
91
|
+
└───────┴───────────┴────────────┘
|
|
92
|
+
================================
|
|
93
|
+
|
|
94
|
+
--- Table: nodes_user ---
|
|
95
|
+
┌───────┬─────────┬───────┐
|
|
96
|
+
│ id │ name │ age │
|
|
97
|
+
│ int32 │ varchar │ int64 │
|
|
98
|
+
├───────┼─────────┼───────┤
|
|
99
|
+
│ 100 │ Adam │ 30 │
|
|
100
|
+
│ 250 │ Karissa │ 40 │
|
|
101
|
+
│ 75 │ Noura │ 25 │
|
|
102
|
+
│ 300 │ Zhang │ 50 │
|
|
103
|
+
└───────┴─────────┴───────┘
|
|
104
|
+
================================
|
|
105
|
+
|
|
106
|
+
--- Schema: schema.cypher ---
|
|
107
|
+
CREATE NODE TABLE User(id INT64, name STRING, age INT64, PRIMARY KEY (id));
|
|
108
|
+
CREATE NODE TABLE City(id INT64, name STRING, population INT64, PRIMARY KEY (id));
|
|
109
|
+
CREATE REL TABLE Follows(FROM User TO User, since INT64);
|
|
110
|
+
CREATE REL TABLE LivesIn(FROM User TO City);
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
and run:
|
|
114
|
+
|
|
115
|
+
```
|
|
116
|
+
uv run icebug-format.py \
|
|
117
|
+
--directed \
|
|
118
|
+
--source-db demo-db.duckdb \
|
|
119
|
+
--output-db demo-db_csr.duckdb \
|
|
120
|
+
--csr-table demo \
|
|
121
|
+
--schema demo-db/schema.cypher
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
You'll get a demo-db_csr.duckdb AND the object storage ready representation aka icebug-disk.
|
|
125
|
+
|
|
126
|
+
## Verification
|
|
127
|
+
|
|
128
|
+
You can verify that the conversion went ok by running `scan.py`. It's also a good way to understand the icebug-disk format.
|
|
129
|
+
|
|
130
|
+
```
|
|
131
|
+
uv run scan.py --input demo-db_csr --prefix demo
|
|
132
|
+
Metadata: 7 nodes, 8 edges, directed=True
|
|
133
|
+
|
|
134
|
+
Node Tables:
|
|
135
|
+
|
|
136
|
+
Table: demo_nodes_user
|
|
137
|
+
(100, 'Adam', 30)
|
|
138
|
+
(250, 'Karissa', 40)
|
|
139
|
+
(75, 'Noura', 25)
|
|
140
|
+
(300, 'Zhang', 50)
|
|
141
|
+
|
|
142
|
+
Table: demo_nodes_city
|
|
143
|
+
(500, 'Guelph', 75000)
|
|
144
|
+
(600, 'Kitchener', 200000)
|
|
145
|
+
(700, 'Waterloo', 150000)
|
|
146
|
+
|
|
147
|
+
Edge Tables (reconstructed from CSR):
|
|
148
|
+
|
|
149
|
+
Table: follows (FROM user TO user)
|
|
150
|
+
(100, 250, 2020)
|
|
151
|
+
(100, 300, 2020)
|
|
152
|
+
(250, 300, 2021)
|
|
153
|
+
(300, 75, 2022)
|
|
154
|
+
|
|
155
|
+
Table: livesin (FROM user TO city)
|
|
156
|
+
(75, 500)
|
|
157
|
+
(100, 700)
|
|
158
|
+
(250, 700)
|
|
159
|
+
(300, 600)
|
|
160
|
+
```
|
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
# Icebug Format
|
|
2
|
+
|
|
3
|
+
> **Note**: This project was formerly called **graph-std**.
|
|
4
|
+
|
|
5
|
+
Icebug is a standardized graph format designed for efficient graph data interchange. It comes in two formats:
|
|
6
|
+
|
|
7
|
+
- **icebug-disk**: Parquet-based format for object storage
|
|
8
|
+
- **icebug-memory**: Apache Arrow-based format for in-memory processing
|
|
9
|
+
|
|
10
|
+
This project provides tools to convert graph data from simple DuckDB databases or Parquet files containing `nodes_*` and `edges_*` tables, along with a `schema.cypher` file, into standardized graph formats for efficient processing.
|
|
11
|
+
|
|
12
|
+
## Sample Usage
|
|
13
|
+
|
|
14
|
+
```bash
|
|
15
|
+
uv run icebug-format.py \
|
|
16
|
+
--source-db karate/karate_random.duckdb \
|
|
17
|
+
--output-db karate/karate_csr.duckdb \
|
|
18
|
+
--csr-table karate \
|
|
19
|
+
--schema karate/karate_csr/schema.cypher
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
This will create a CSR representation with multiple tables depending on the number of node and edge types:
|
|
23
|
+
|
|
24
|
+
- `{table_name}_indptr_{edge_name}`: Array of size N+1 for row pointers (one per edge table)
|
|
25
|
+
- `{table_name}_indices_{edge_name}`: Array of size E containing column indices (one per edge table)
|
|
26
|
+
- `{table_name}_nodes_{node_name}`: Original nodes table with node attributes (one per node table)
|
|
27
|
+
- `{table_name}_mapping_{node_name}`: Maps original node IDs to contiguous indices (one per node table)
|
|
28
|
+
- `{table_name}_metadata`: Global graph metadata (node count, edge count, directed flag)
|
|
29
|
+
- `schema.cypher`: A cypher schema that a graph database can mount without ingesting
|
|
30
|
+
|
|
31
|
+
## More information about Icebug and Apache GraphAR
|
|
32
|
+
|
|
33
|
+
[Blog Post](https://adsharma.github.io/graph-archiving/)
|
|
34
|
+
|
|
35
|
+
## Recreating demo-db/icebug-disk
|
|
36
|
+
|
|
37
|
+
Start from a simple demo-db.duckdb that looks like this
|
|
38
|
+
|
|
39
|
+
```
|
|
40
|
+
Querying database: demo-db.duckdb
|
|
41
|
+
================================
|
|
42
|
+
|
|
43
|
+
--- Table: edges_follows ---
|
|
44
|
+
┌────────┬────────┬───────┐
|
|
45
|
+
│ source │ target │ since │
|
|
46
|
+
│ int32 │ int32 │ int32 │
|
|
47
|
+
├────────┼────────┼───────┤
|
|
48
|
+
│ 100 │ 250 │ 2020 │
|
|
49
|
+
│ 300 │ 75 │ 2022 │
|
|
50
|
+
│ 250 │ 300 │ 2021 │
|
|
51
|
+
│ 100 │ 300 │ 2020 │
|
|
52
|
+
└────────┴────────┴───────┘
|
|
53
|
+
================================
|
|
54
|
+
|
|
55
|
+
--- Table: edges_livesin ---
|
|
56
|
+
┌────────┬────────┐
|
|
57
|
+
│ source │ target │
|
|
58
|
+
│ int32 │ int32 │
|
|
59
|
+
├────────┼────────┤
|
|
60
|
+
│ 100 │ 700 │
|
|
61
|
+
│ 250 │ 700 │
|
|
62
|
+
│ 300 │ 600 │
|
|
63
|
+
│ 75 │ 500 │
|
|
64
|
+
└────────┴────────┘
|
|
65
|
+
================================
|
|
66
|
+
|
|
67
|
+
--- Table: nodes_city ---
|
|
68
|
+
┌───────┬───────────┬────────────┐
|
|
69
|
+
│ id │ name │ population │
|
|
70
|
+
│ int32 │ varchar │ int64 │
|
|
71
|
+
├───────┼───────────┼────────────┤
|
|
72
|
+
│ 500 │ Guelph │ 75000 │
|
|
73
|
+
│ 600 │ Kitchener │ 200000 │
|
|
74
|
+
│ 700 │ Waterloo │ 150000 │
|
|
75
|
+
└───────┴───────────┴────────────┘
|
|
76
|
+
================================
|
|
77
|
+
|
|
78
|
+
--- Table: nodes_user ---
|
|
79
|
+
┌───────┬─────────┬───────┐
|
|
80
|
+
│ id │ name │ age │
|
|
81
|
+
│ int32 │ varchar │ int64 │
|
|
82
|
+
├───────┼─────────┼───────┤
|
|
83
|
+
│ 100 │ Adam │ 30 │
|
|
84
|
+
│ 250 │ Karissa │ 40 │
|
|
85
|
+
│ 75 │ Noura │ 25 │
|
|
86
|
+
│ 300 │ Zhang │ 50 │
|
|
87
|
+
└───────┴─────────┴───────┘
|
|
88
|
+
================================
|
|
89
|
+
|
|
90
|
+
--- Schema: schema.cypher ---
|
|
91
|
+
CREATE NODE TABLE User(id INT64, name STRING, age INT64, PRIMARY KEY (id));
|
|
92
|
+
CREATE NODE TABLE City(id INT64, name STRING, population INT64, PRIMARY KEY (id));
|
|
93
|
+
CREATE REL TABLE Follows(FROM User TO User, since INT64);
|
|
94
|
+
CREATE REL TABLE LivesIn(FROM User TO City);
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
and run:
|
|
98
|
+
|
|
99
|
+
```
|
|
100
|
+
uv run icebug-format.py \
|
|
101
|
+
--directed \
|
|
102
|
+
--source-db demo-db.duckdb \
|
|
103
|
+
--output-db demo-db_csr.duckdb \
|
|
104
|
+
--csr-table demo \
|
|
105
|
+
--schema demo-db/schema.cypher
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
You'll get a demo-db_csr.duckdb AND the object storage ready representation aka icebug-disk.
|
|
109
|
+
|
|
110
|
+
## Verification
|
|
111
|
+
|
|
112
|
+
You can verify that the conversion went ok by running `scan.py`. It's also a good way to understand the icebug-disk format.
|
|
113
|
+
|
|
114
|
+
```
|
|
115
|
+
uv run scan.py --input demo-db_csr --prefix demo
|
|
116
|
+
Metadata: 7 nodes, 8 edges, directed=True
|
|
117
|
+
|
|
118
|
+
Node Tables:
|
|
119
|
+
|
|
120
|
+
Table: demo_nodes_user
|
|
121
|
+
(100, 'Adam', 30)
|
|
122
|
+
(250, 'Karissa', 40)
|
|
123
|
+
(75, 'Noura', 25)
|
|
124
|
+
(300, 'Zhang', 50)
|
|
125
|
+
|
|
126
|
+
Table: demo_nodes_city
|
|
127
|
+
(500, 'Guelph', 75000)
|
|
128
|
+
(600, 'Kitchener', 200000)
|
|
129
|
+
(700, 'Waterloo', 150000)
|
|
130
|
+
|
|
131
|
+
Edge Tables (reconstructed from CSR):
|
|
132
|
+
|
|
133
|
+
Table: follows (FROM user TO user)
|
|
134
|
+
(100, 250, 2020)
|
|
135
|
+
(100, 300, 2020)
|
|
136
|
+
(250, 300, 2021)
|
|
137
|
+
(300, 75, 2022)
|
|
138
|
+
|
|
139
|
+
Table: livesin (FROM user TO city)
|
|
140
|
+
(75, 500)
|
|
141
|
+
(100, 700)
|
|
142
|
+
(250, 700)
|
|
143
|
+
(300, 600)
|
|
144
|
+
```
|
|
@@ -0,0 +1,765 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Script to convert graph data from DuckDB to CSR (Compressed Sparse Row) format.
|
|
4
|
+
|
|
5
|
+
This script reads graph data from a DuckDB database containing an edges table
|
|
6
|
+
with source and target columns representing edges, and converts it to CSR format for
|
|
7
|
+
efficient processing with NetworkKit.
|
|
8
|
+
|
|
9
|
+
The conversion process:
|
|
10
|
+
1. Reads graph data from DuckDB (edges table with source, target columns)
|
|
11
|
+
2. Handles sparse node IDs by creating a dense mapping (original_id -> csr_index)
|
|
12
|
+
3. Converts edges to CSR (Compressed Sparse Row) format
|
|
13
|
+
4. Pre-sorts edges by source using DuckDB for memory efficiency
|
|
14
|
+
5. Saves CSR data and node mapping to DuckDB for reuse
|
|
15
|
+
6. Exports to parquet format and generates schema.cypher for ladybugdb
|
|
16
|
+
|
|
17
|
+
Key Features:
|
|
18
|
+
- Memory efficient: Uses database-level sorting and PyArrow for large graph processing
|
|
19
|
+
- Handles sparse node IDs: Works with any node ID range (e.g., 1000, 5000, 9999)
|
|
20
|
+
- Scalable: Optimized for large graphs using DuckDB's efficient sorting
|
|
21
|
+
- Multi-table support: Processes multiple node/edge tables (prefix: nodes*, edges*)
|
|
22
|
+
|
|
23
|
+
Usage Examples:
|
|
24
|
+
# Convert edges in karate_random.duckdb to CSR format and save to csr_graph.db
|
|
25
|
+
python convert_csr.py --source-db karate_random.duckdb --output-db csr_graph.db
|
|
26
|
+
|
|
27
|
+
# Convert with limited data for testing
|
|
28
|
+
python convert_csr.py --source-db karate_random.duckdb --test --limit 50000 --output-db test.db
|
|
29
|
+
"""
|
|
30
|
+
|
|
31
|
+
import argparse
|
|
32
|
+
import re
|
|
33
|
+
from pathlib import Path
|
|
34
|
+
|
|
35
|
+
import duckdb
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def parse_schema_cypher(schema_path: Path) -> dict:
    """Extract edge relationship info (FROM/TO node types) from a schema.cypher file.

    Args:
        schema_path: Path to the schema.cypher file.

    Returns:
        Dict mapping lowercased edge names to (from_node_type, to_node_type)
        tuples; empty when the schema file does not exist.
    """
    if not schema_path.exists():
        return {}

    # Matches: CREATE REL TABLE Follows(FROM User TO User, ...);
    # Identifiers may optionally be backtick-quoted, e.g.
    # CREATE REL TABLE `edges` (FROM `nodes` TO `nodes`, ...)
    rel_pattern = (
        r"CREATE\s+REL\s+TABLE\s+`?(\w+)`?\s*\(\s*FROM\s+`?(\w+)`?\s+TO\s+`?(\w+)`?"
    )
    content = schema_path.read_text()
    # Later duplicate edge names overwrite earlier ones, matching dict-assign semantics.
    return {
        m.group(1).lower(): (m.group(2).lower(), m.group(3).lower())
        for m in re.finditer(rel_pattern, content, re.IGNORECASE)
    }
|
+
|
|
66
|
+
def get_node_and_edge_tables(
    con, db_alias: str = "orig"
) -> tuple[list[str], list[str]]:
    """
    Discover node and edge tables in the source database.

    Tables starting with 'nodes' are considered node tables.
    Tables starting with 'edges' are considered edge tables.

    Args:
        con: DuckDB connection.
        db_alias: Catalog name the source database is attached under.

    Returns:
        Tuple of (node_table_names, edge_table_names)
    """
    # Use a bound parameter instead of f-string interpolation so an odd
    # catalog name (quotes, etc.) cannot break or inject into the query.
    result = con.execute(
        "SELECT table_name FROM information_schema.tables WHERE table_catalog = ?",
        [db_alias],
    ).fetchall()
    all_tables = [row[0] for row in result]

    node_tables = [t for t in all_tables if t.startswith("nodes")]
    edge_tables = [t for t in all_tables if t.startswith("edges")]

    return node_tables, edge_tables
|
88
|
+
|
|
89
|
+
def duckdb_type_to_cypher_type(duckdb_type: str) -> str:
    """Map a DuckDB column type name to its Cypher/Ladybug equivalent.

    Parameterized types such as DECIMAL(10,2) are matched on their base
    name only; any type without a known mapping falls back to STRING.
    """
    mapping = {
        "BIGINT": "INT64",
        "INTEGER": "INT32",
        "SMALLINT": "INT16",
        "TINYINT": "INT8",
        "HUGEINT": "INT128",
        "UBIGINT": "UINT64",
        "UINTEGER": "UINT32",
        "USMALLINT": "UINT16",
        "UTINYINT": "UINT8",
        "DOUBLE": "DOUBLE",
        "FLOAT": "FLOAT",
        "REAL": "FLOAT",
        "BOOLEAN": "BOOL",
        "VARCHAR": "STRING",
        "TEXT": "STRING",
        "CHAR": "STRING",
        "DATE": "DATE",
        "TIMESTAMP": "TIMESTAMP",
        "TIME": "TIME",
        "BLOB": "BLOB",
    }
    # Normalize case, then strip any parameter list: "decimal(10,2)" -> "DECIMAL".
    base = duckdb_type.upper().split("(", 1)[0].strip()
    return mapping.get(base, "STRING")
|
+
|
|
119
|
+
def generate_schema_cypher(
    con,
    csr_table_name: str,
    node_tables: list[str],
    edge_tables: list[str],
    parquet_dir: Path,
    edge_relationships: dict,
    node_type_to_table: dict,
    storage_path: str,
) -> str:
    """
    Generate schema.cypher content for ladybugdb.

    Emits one CREATE NODE TABLE statement per node table and one
    CREATE REL TABLE statement per edge table, each with a
    WITH (storage = ...) clause pointing at the parquet output.

    Args:
        con: DuckDB connection
        csr_table_name: Prefix for CSR tables
        node_tables: List of original node table names
        edge_tables: List of original edge table names
        parquet_dir: Path to the parquet output directory (for storage path)
        edge_relationships: Dict of edge relationships from schema
        node_type_to_table: Mapping of node types to table names
        storage_path: Storage path string for schema.cypher

    Returns:
        String containing the schema.cypher content

    NOTE(review): parquet_dir is accepted but never referenced in this body;
    the storage clause comes from storage_path instead — confirm whether the
    parameter can be dropped upstream.
    """
    lines = []

    # Helper to derive display name from table name (lowercase)
    # nodes => nodes, nodes_person => person, nodes_foo => foo
    def get_node_display_name(table_name: str) -> str:
        if table_name == "nodes":
            return "nodes"
        elif table_name.startswith("nodes_"):
            return table_name[6:].lower()  # Remove "nodes_" prefix and lowercase
        return table_name.lower()

    # Same convention for edge tables: edges => edges, edges_follows => follows
    def get_edge_display_name(table_name: str) -> str:
        if table_name == "edges":
            return "edges"
        elif table_name.startswith("edges_"):
            return table_name[6:].lower()  # Remove "edges_" prefix and lowercase
        return table_name.lower()

    # Build mapping of original table names to display names
    node_display_names = {nt: get_node_display_name(nt) for nt in node_tables}

    # Generate NODE TABLE definitions for each node table.
    # Column names/types are read back from the CSR output tables via DESCRIBE.
    for node_table in node_tables:
        table_name = f"{csr_table_name}_{node_table}"
        try:
            cols = con.execute(f"DESCRIBE {table_name}").fetchall()
            col_defs = []
            pk_col = None
            for col in cols:
                col_name, col_type = col[0], col[1]
                cypher_type = duckdb_type_to_cypher_type(col_type)
                col_defs.append(f"{col_name} {cypher_type}")
                # First column is typically the primary key
                if pk_col is None:
                    pk_col = col_name

            cols_str = ", ".join(col_defs)
            display_name = node_display_names[node_table]
            lines.append(
                f"CREATE NODE TABLE {display_name}({cols_str}, PRIMARY KEY({pk_col})) "
                f"WITH (storage = '{storage_path}');"
            )
        except Exception as e:
            # Best-effort: a missing/undescribable table only drops its statement.
            print(
                f"Warning: Could not generate schema for node table {table_name}: {e}"
            )

    # Generate REL TABLE definitions for each edge table
    for edge_table in edge_tables:
        rel_name = get_edge_display_name(edge_table)
        edge_name = (
            edge_table[6:].lower()
            if edge_table.startswith("edges_")
            else edge_table.lower()
        )
        # Resolve FROM/TO node tables from the parsed schema when both
        # endpoints are known; otherwise fall back to the first node table.
        src_node_type, dst_node_type = edge_relationships.get(edge_name, (None, None))
        if (
            src_node_type
            and dst_node_type
            and src_node_type in node_type_to_table
            and dst_node_type in node_type_to_table
        ):
            src_nt = node_type_to_table[src_node_type]
            dst_nt = node_type_to_table[dst_node_type]
            src_table = node_display_names[src_nt]
            dst_table = node_display_names[dst_nt]
        else:
            src_table = node_display_names[node_tables[0]] if node_tables else "nodes"
            dst_table = src_table

        # Get columns from indices table; its non-"target" columns are the
        # edge properties carried into the REL TABLE definition.
        indices_table = f"{csr_table_name}_indices_{edge_name}"
        try:
            cols = con.execute(f"DESCRIBE {indices_table}").fetchall()
            col_defs = []
            for col in cols:
                col_name, col_type = col[0], col[1]
                if col_name == "target":
                    continue
                cypher_type = duckdb_type_to_cypher_type(col_type)
                col_defs.append(f"{col_name} {cypher_type}")
            props_str = ", ".join(col_defs)
            lines.append(
                f"CREATE REL TABLE {rel_name}(FROM {src_table} TO {dst_table}"
                f"{', ' + props_str if props_str else ''}) WITH (storage = '{storage_path}');"
            )
        except Exception as e:
            print(f"Warning: Could not generate schema for rel table {rel_name}: {e}")

    return "\n".join(lines) + "\n"
236
|
+
|
|
237
|
+
def export_to_parquet_and_cypher(
    con,
    output_db_path: str,
    csr_table_name: str,
    node_tables: list[str],
    edge_tables: list[str],
    edge_relationships: dict,
    node_type_to_table: dict,
    storage_path: str | None = None,
) -> None:
    """
    Export all tables to parquet format and generate schema.cypher.

    Writes one <table>.parquet file (lowercased name) per table currently in
    the connected database into a directory named after the output database,
    then writes schema.cypher alongside them and deletes legacy SQL files.

    Args:
        con: DuckDB connection
        output_db_path: Path to output DuckDB database
        csr_table_name: Prefix for CSR tables
        node_tables: List of original node table names
        edge_tables: List of original edge table names
        edge_relationships: Dict of edge relationships parsed from the schema
        node_type_to_table: Mapping of node types to table names
        storage_path: Storage path for schema.cypher (default: output_db without .duckdb + csr_table_name)
    """
    print("\n=== Exporting to Parquet and Generating schema.cypher ===")

    # Create output directory next to the database
    output_path = Path(output_db_path)
    parquet_dir = output_path.parent / output_path.stem
    parquet_dir.mkdir(parents=True, exist_ok=True)

    print(f"Parquet output directory: {parquet_dir}")

    # Compute storage path if not provided
    if storage_path is None:
        storage_path = f"./{output_path.stem}/{csr_table_name}"

    # Get all tables to export — everything in the output DB, not just CSR tables
    result = con.execute("SHOW TABLES").fetchall()
    all_tables = [row[0] for row in result]

    # Export each table to parquet (lowercase filenames)
    for table_name in all_tables:
        parquet_file = parquet_dir / f"{table_name.lower()}.parquet"
        con.execute(f"COPY {table_name} TO '{parquet_file}' (FORMAT 'parquet')")
        print(f" Exported: {table_name} -> {parquet_file.name}")

    # Generate schema.cypher
    schema_cypher = generate_schema_cypher(
        con,
        csr_table_name,
        node_tables,
        edge_tables,
        parquet_dir,
        edge_relationships,
        node_type_to_table,
        storage_path,
    )
    schema_file = parquet_dir / "schema.cypher"
    schema_file.write_text(schema_cypher)
    print(f" Generated: {schema_file.name}")

    # Remove old SQL files if they exist (superseded by schema.cypher)
    for old_file in ["schema.sql", "load.sql"]:
        old_path = parquet_dir / old_file
        if old_path.exists():
            old_path.unlink()
            print(f" Removed: {old_file}")

    print(f"✓ Export complete. Files saved to: {parquet_dir}")
+
|
|
305
|
+
|
|
306
|
+
def create_csr_graph_to_duckdb(
    source_db_path: str,
    output_db_path: str,
    limit_rels: int | None = None,
    directed: bool = False,
    csr_table_name: str = "csr_graph",
    node_table: str | None = None,
    edge_table: str | None = None,
    schema_path: str | None = None,
    storage_path: str | None = None,
) -> None:
    """
    Create CSR graph data and save to DuckDB using optimized SQL approach.

    Args:
        source_db_path: Path to source DuckDB with edges table
        output_db_path: Path to output DuckDB for CSR data
        limit_rels: Limit number of relationships for testing
        directed: Whether graph is directed
        csr_table_name: Name of table to store CSR data
        node_table: Specific node table to use (default: auto-discover)
        edge_table: Specific edge table to use (default: auto-discover)
        schema_path: Path to schema.cypher for edge relationship info
        storage_path: Storage path for schema.cypher (default: output_db without .duckdb + csr_table_name)
    """
    # NOTE(review): all SQL below is assembled with f-strings; table/column
    # names come from the attached source database and CLI arguments, so this
    # is only safe for trusted local inputs — do not point it at untrusted DBs.
    print("\n=== Creating CSR Graph Data (Optimized SQL Approach) ===")

    # Connect to a fresh DuckDB database for output
    con = duckdb.connect(output_db_path)

    # Drop all existing tables to recreate from scratch, so reruns against the
    # same output file start clean.
    result = con.execute("SHOW TABLES").fetchall()
    existing_tables = [row[0] for row in result]
    for table in existing_tables:
        con.execute(f"DROP TABLE IF EXISTS {table}")
    if existing_tables:
        print(f"Dropped {len(existing_tables)} existing tables")

    try:
        print("Step 0: Loading edges and nodes from original DB into new DB...")

        # Attach the source database read-side under the alias "orig" so node
        # and edge tables can be copied with plain SQL.
        con.execute(f"ATTACH '{source_db_path}' AS orig;")

        # Discover node and edge tables (helper defined elsewhere in this file;
        # presumably matches tables named "nodes*" / "edges*" — see error below).
        node_tables, edge_tables = get_node_and_edge_tables(con, "orig")

        # Use specified tables or discovered ones. An explicit table that is
        # not present in the source yields an empty list (and an error below
        # for edges).
        if node_table:
            node_tables = [node_table] if node_table in node_tables else []
        if edge_table:
            edge_tables = [edge_table] if edge_table in edge_tables else []

        if not edge_tables:
            raise ValueError(
                "No edge tables found in source database (tables must start with 'edges')"
            )

        print(f"Discovered node tables: {node_tables}")
        print(f"Discovered edge tables: {edge_tables}")

        # Parse schema.cypher for edge relationships.
        # Expected shape: {edge_name: (src_node_type, dst_node_type)}.
        edge_relationships = {}
        if schema_path:
            schema_file = Path(schema_path)
            edge_relationships = parse_schema_cypher(schema_file)
            print(f"Parsed edge relationships from schema: {edge_relationships}")

        # Build mapping from node type names to table names
        # e.g., "user" -> "nodes_user", "city" -> "nodes_city"
        node_type_to_table = {}
        for nt in node_tables:
            if nt == "nodes":
                node_type_to_table["nodes"] = nt
            elif nt.startswith("nodes_"):
                node_type_name = nt[6:].lower()  # Remove "nodes_" prefix and lowercase
                node_type_to_table[node_type_name] = nt

        print(f"Node type to table mapping: {node_type_to_table}")

        # Copy all node tables with proper prefixing and create per-table
        # mappings from original node IDs to dense 0-based CSR indices.
        node_counts = {}  # Track node counts per table
        for nt in node_tables:
            try:
                # Get the primary key column (first column of original node table)
                cols = con.execute(f"DESCRIBE orig.{nt}").fetchall()
                pk_col = cols[0][0] if cols else "id"

                # Copy sorted by PK so row order matches the mapping below.
                con.execute(
                    f"CREATE TABLE {csr_table_name}_{nt} AS SELECT * FROM orig.{nt} ORDER BY {pk_col};"
                )
                print(f" Copied node table: {nt} -> {csr_table_name}_{nt}")

                # Create per-table node mapping: row_number() over the PK order
                # assigns each original ID a contiguous csr_index starting at 0.
                node_type = nt[6:].lower() if nt.startswith("nodes_") else nt.lower()
                mapping_table = f"{csr_table_name}_mapping_{node_type}"
                con.execute(
                    f"""
                    CREATE TABLE {mapping_table} AS
                    SELECT
                        row_number() OVER (ORDER BY {pk_col}) - 1 AS csr_index,
                        {pk_col} AS original_node_id
                    FROM {csr_table_name}_{nt}
                    ORDER BY csr_index;
                    """
                )
                print(f" Created node mapping: {mapping_table}")

                # Track node count (used later for indptr sizing and summary).
                result = con.execute(
                    f"SELECT COUNT(*) FROM {csr_table_name}_{nt}"
                ).fetchone()
                node_counts[nt] = result[0] if result else 0
            except Exception as e:
                # Best-effort: a node table that fails to copy is skipped with
                # a warning rather than aborting the whole conversion.
                print(f"Warning: Could not copy node table {nt}: {e}")

        # Process each edge table separately to create per-edge CSR structures
        print("\nStep 1: Building per-edge-table CSR structures...")

        for et in edge_tables:
            # Determine source and target node types from schema
            edge_name = (
                et[6:].lower() if et.startswith("edges_") else et.lower()
            )  # Remove "edges_" prefix and lowercase
            src_node_type, dst_node_type = edge_relationships.get(
                edge_name, (None, None)
            )

            # Find the corresponding node tables
            src_table = node_type_to_table.get(src_node_type)
            dst_table = node_type_to_table.get(dst_node_type)

            fallback_node_type = None
            if src_table and dst_table:
                # Schema told us the FROM/TO node types: join against their
                # dedicated mapping tables.
                src_mapping = f"{csr_table_name}_mapping_{src_node_type}"
                dst_mapping = f"{csr_table_name}_mapping_{dst_node_type}"
                num_src_nodes = node_counts.get(src_table, 0)
                print(
                    f"\n Processing {et}: {src_node_type} ({num_src_nodes} nodes) -> {dst_node_type}"
                )
            else:
                # Fallback: use first node table for both endpoints (no schema
                # info available for this edge type).
                fallback_table = node_tables[0] if node_tables else "nodes"
                fallback_node_type = (
                    fallback_table[6:].lower()
                    if fallback_table.startswith("nodes_")
                    else fallback_table.lower()
                )
                src_mapping = f"{csr_table_name}_mapping_{fallback_node_type}"
                dst_mapping = src_mapping
                num_src_nodes = node_counts.get(fallback_table, 0)
                print(f"\n Processing {et}: using fallback mapping {src_mapping}")

            # Get edge columns excluding source and target (edge attributes
            # such as weights/timestamps are carried through into indices).
            edge_cols_result = con.execute(f"DESCRIBE orig.{et}").fetchall()
            edge_col_names = [col[0] for col in edge_cols_result]
            edge_cols = [c for c in edge_col_names if c not in ["source", "target"]]

            # Prepare select column strings:
            # - select_cols: forward direction (src -> dst)
            # - reverse_select_cols: swapped mapping join aliases, for the
            #   undirected UNION branch that re-scans the source table
            # - reverse_cols: column swap over an already-materialized CTE
            select_cols = "m1.csr_index AS csr_source, m2.csr_index AS csr_target"
            if edge_cols:
                select_cols += ", " + ", ".join([f"e.{c}" for c in edge_cols])
            reverse_select_cols = (
                "m2.csr_index AS csr_source, m1.csr_index AS csr_target"
            )
            if edge_cols:
                reverse_select_cols += ", " + ", ".join([f"e.{c}" for c in edge_cols])
            reverse_cols = "csr_target AS csr_source, csr_source AS csr_target"
            if edge_cols:
                reverse_cols += ", " + ", ".join(edge_cols)

            # Create relations table for this edge type. Self-loops are always
            # filtered out (WHERE e.source != e.target); undirected graphs get
            # each edge in both directions.
            if limit_rels:
                # The global edge cap is split evenly across edge tables.
                limit_per_table = limit_rels // len(edge_tables)
                if directed:
                    rel_query = f"""
                        SELECT {select_cols}
                        FROM orig.{et} e
                        JOIN {src_mapping} m1 ON e.source = m1.original_node_id
                        JOIN {dst_mapping} m2 ON e.target = m2.original_node_id
                        WHERE e.source != e.target
                        LIMIT {limit_per_table}
                    """
                else:
                    # Limit first, then mirror, so the cap applies to unique
                    # edges rather than to directed half-edges.
                    rel_query = f"""
                        WITH limited AS (
                            SELECT {select_cols}
                            FROM orig.{et} e
                            JOIN {src_mapping} m1 ON e.source = m1.original_node_id
                            JOIN {dst_mapping} m2 ON e.target = m2.original_node_id
                            WHERE e.source != e.target
                            LIMIT {limit_per_table}
                        )
                        SELECT * FROM limited
                        UNION ALL
                        SELECT {reverse_cols} FROM limited
                    """
            else:
                if directed:
                    rel_query = f"""
                        SELECT {select_cols}
                        FROM orig.{et} e
                        JOIN {src_mapping} m1 ON e.source = m1.original_node_id
                        JOIN {dst_mapping} m2 ON e.target = m2.original_node_id
                        WHERE e.source != e.target
                    """
                else:
                    rel_query = f"""
                        SELECT {select_cols}
                        FROM orig.{et} e
                        JOIN {src_mapping} m1 ON e.source = m1.original_node_id
                        JOIN {dst_mapping} m2 ON e.target = m2.original_node_id
                        WHERE e.source != e.target
                        UNION ALL
                        SELECT {reverse_select_cols}
                        FROM orig.{et} e
                        JOIN {src_mapping} m1 ON e.source = m1.original_node_id
                        JOIN {dst_mapping} m2 ON e.target = m2.original_node_id
                        WHERE e.source != e.target
                    """

            con.execute(f"CREATE TABLE relations_{edge_name} AS {rel_query};")

            result = con.execute(
                f"SELECT COUNT(*) FROM relations_{edge_name}"
            ).fetchone()
            edge_count = result[0] if result else 0
            print(f" Edges: {edge_count:,}")

            # Build CSR indptr for this edge type: cumulative out-degree per
            # source node. node_range covers ALL source nodes so zero-degree
            # nodes still get an entry (via the LEFT JOIN + COALESCE).
            indptr_table = f"{csr_table_name}_indptr_{edge_name}"
            con.execute(
                f"""
                CREATE TABLE {indptr_table} AS
                WITH node_range AS (
                    SELECT unnest(range(0, {num_src_nodes})) AS node_id
                ),
                degrees AS (
                    SELECT csr_source AS src, COUNT(*) AS deg
                    FROM relations_{edge_name}
                    GROUP BY csr_source
                ),
                cumulative AS (
                    SELECT
                        node_range.node_id,
                        COALESCE(SUM(degrees.deg) OVER (ORDER BY node_range.node_id ROWS UNBOUNDED PRECEDING), 0) AS ptr
                    FROM node_range
                    LEFT JOIN degrees ON node_range.node_id = degrees.src
                )
                SELECT ptr FROM cumulative
                ORDER BY node_id;
                """
            )

            # Recreate with leading zero so indptr has the canonical N+1
            # entries. Ordering by ptr is safe because the sequence is
            # non-decreasing (ties from zero-degree nodes are interchangeable).
            con.execute(
                f"""
                CREATE OR REPLACE TABLE {indptr_table} AS
                SELECT 0::BIGINT AS ptr
                UNION ALL
                SELECT ptr::int64 FROM {indptr_table}
                ORDER BY ptr;
                """
            )

            result = con.execute(f"SELECT COUNT(*) FROM {indptr_table}").fetchone()
            indptr_size = result[0] if result else 0
            print(f" indptr: {indptr_size} entries")

            # Build CSR indices for this edge type: targets (plus any edge
            # attributes) sorted by source then target, matching indptr order.
            indices_table = f"{csr_table_name}_indices_{edge_name}"
            con.execute(
                f"""
                CREATE TABLE {indices_table} AS
                SELECT csr_target AS target{', ' + ', '.join(edge_cols) if edge_cols else ''}
                FROM relations_{edge_name}
                ORDER BY csr_source, csr_target;
                """
            )

            result = con.execute(f"SELECT COUNT(*) FROM {indices_table}").fetchone()
            indices_size = result[0] if result else 0
            print(f" indices: {indices_size} entries")

            # Drop temporary relations table (edge_name is already lowercase,
            # so the extra .lower() is a no-op).
            con.execute(f"DROP TABLE IF EXISTS relations_{edge_name.lower()};")

        # Count total nodes and edges for summary. For undirected graphs this
        # counts both directions of each edge, since indices stores the
        # mirrored rows too.
        total_nodes = sum(node_counts.values())
        total_edges = 0
        for et in edge_tables:
            edge_name = et[6:].lower() if et.startswith("edges_") else et.lower()
            result = con.execute(
                f"SELECT COUNT(*) FROM {csr_table_name}_indices_{edge_name}"
            ).fetchone()
            total_edges += result[0] if result else 0

        # Create global metadata. The Python bool `directed` interpolates as
        # True/False, which DuckDB parses as a boolean literal.
        con.execute(
            f"""
            CREATE TABLE {csr_table_name}_metadata AS
            SELECT {total_nodes} AS n_nodes, {total_edges} AS n_edges, {directed} AS directed
            """
        )

        # List per-table node mappings for output
        node_mapping_tables = [
            f"{csr_table_name}_mapping_{nt[6:].lower() if nt.startswith('nodes_') else nt.lower()}"
            for nt in node_tables
        ]

        print("\n✅ CSR format built and cleaned up. Final tables:")
        for mapping_table in node_mapping_tables:
            print(f" - {mapping_table} (orig_id → mapped_id)")
        for i, et in enumerate(edge_tables):
            edge_name = et[6:].lower() if et.startswith("edges_") else et.lower()
            print(f" - {csr_table_name}_indptr_{edge_name}")
            print(f" - {csr_table_name}_indices_{edge_name}")
        print(f" - {csr_table_name}_metadata (global)")

        print(
            f"\n✓ Built CSR format: {total_nodes} nodes, {total_edges} edges across {len(edge_tables)} edge types"
        )
        print(f"✓ Saved CSR graph data to {output_db_path}")

        # Export to parquet and generate schema.cypher (icebug-disk layout).
        export_to_parquet_and_cypher(
            con,
            output_db_path,
            csr_table_name,
            node_tables,
            edge_tables,
            edge_relationships,
            node_type_to_table,
            storage_path,
        )

    except Exception as e:
        print(f"Error building CSR format: {e}")
        raise
    finally:
        # Always close the output connection, even on failure.
        con.close()

    print(f"\nAll data saved to: {output_db_path}")
|
|
650
|
+
|
|
651
|
+
|
|
652
|
+
def main():
    """Main function to convert DuckDB edges to CSR format.

    Parses command-line options, echoes the effective configuration, then
    hands the actual conversion off to create_csr_graph_to_duckdb().
    """
    arg_parser = argparse.ArgumentParser(
        description="Convert graph data from DuckDB to CSR format"
    )

    # (flag, keyword-arguments) pairs; registration order is preserved so the
    # generated --help output is unchanged.
    option_specs = [
        (
            "--source-db",
            dict(
                type=str,
                default="karate_random.duckdb",
                help="Source DuckDB database path (default: karate_random.duckdb)",
            ),
        ),
        (
            "--output-db",
            dict(
                type=str,
                default="csr_graph.db",
                help="Output DuckDB database path (default: csr_graph.db)",
            ),
        ),
        (
            "--csr-table",
            dict(
                type=str,
                default="csr_graph",
                help="Table name prefix for CSR data (default: csr_graph)",
            ),
        ),
        (
            "--node-table",
            dict(
                type=str,
                default=None,
                help="Specific node table to use (default: auto-discover tables starting with 'nodes')",
            ),
        ),
        (
            "--edge-table",
            dict(
                type=str,
                default=None,
                help="Specific edge table to use (default: auto-discover tables starting with 'edges')",
            ),
        ),
        (
            "--test",
            dict(action="store_true", help="Run in test mode with limited data"),
        ),
        (
            "--limit",
            dict(
                type=int,
                default=50000,
                help="Number of edges to use in test mode (default: 50000)",
            ),
        ),
        (
            "--directed",
            dict(
                action="store_true",
                help="Treat graph as directed (default: undirected)",
            ),
        ),
        (
            "--storage",
            dict(
                type=str,
                default=None,
                help="Storage path for schema.cypher (default: output_db path without .duckdb extension)",
            ),
        ),
        (
            "--schema",
            dict(
                type=str,
                default=None,
                help="Path to schema.cypher for edge relationship info (FROM/TO node types)",
            ),
        ),
    ]
    for flag, kwargs in option_specs:
        arg_parser.add_argument(flag, **kwargs)

    opts = arg_parser.parse_args()

    print("=== DuckDB to CSR Format Converter ===\n")

    # Configuration: the DuckDB source database.
    source_db = opts.source_db

    # Only cap the relationship count when test mode was requested.
    edge_cap = opts.limit if opts.test else None
    if edge_cap:
        print(f"Creating CSR graph in TEST MODE with limit: {edge_cap} edges")
    else:
        print("Creating CSR graph on FULL DATASET")

    print(f"Source database: {source_db}")
    print(f"CSR output database: {opts.output_db}")
    print(f"CSR table prefix: {opts.csr_table}")
    print(f"Directed: {opts.directed}")

    # Default storage path: output_db stem (no .duckdb extension) plus the
    # CSR table prefix, when --storage was not supplied.
    schema_storage = opts.storage
    if schema_storage is None:
        schema_storage = f"./{Path(opts.output_db).stem}/{opts.csr_table}"
    print(f"Storage path: {schema_storage}")

    if opts.node_table:
        print(f"Node table filter: {opts.node_table}")
    if opts.edge_table:
        print(f"Edge table filter: {opts.edge_table}")
    if opts.schema:
        print(f"Schema file: {opts.schema}")

    create_csr_graph_to_duckdb(
        source_db_path=source_db,
        output_db_path=opts.output_db,
        limit_rels=edge_cap,
        directed=opts.directed,
        csr_table_name=opts.csr_table,
        node_table=opts.node_table,
        edge_table=opts.edge_table,
        schema_path=opts.schema,
        storage_path=schema_storage,
    )

    print("\n=== Conversion Completed Successfully! ===")
    print(f"CSR graph data saved to: {opts.output_db}")
|
|
762
|
+
|
|
763
|
+
|
|
764
|
+
# Script entry point: run the CLI driver when executed directly (also the
# target of the `icebug-format` console script via icebug_format:main).
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: icebug-format
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Convert graph data from DuckDB to CSR format for Icebug
|
|
5
|
+
Project-URL: Homepage, https://github.com/anomalyco/icebug-format
|
|
6
|
+
Project-URL: Repository, https://github.com/anomalyco/icebug-format
|
|
7
|
+
Project-URL: PyPI, https://pypi.org/project/icebug-format
|
|
8
|
+
Requires-Python: >=3.13
|
|
9
|
+
Description-Content-Type: text/markdown
|
|
10
|
+
Requires-Dist: duckdb>=1.3.2
|
|
11
|
+
Provides-Extra: full
|
|
12
|
+
Requires-Dist: real_ladybug>=0.14.1; extra == "full"
|
|
13
|
+
Requires-Dist: networkx>=3.5; extra == "full"
|
|
14
|
+
Requires-Dist: pandas>=2.3.2; extra == "full"
|
|
15
|
+
Requires-Dist: pyarrow>=21.0.0; extra == "full"
|
|
16
|
+
|
|
17
|
+
# Icebug Format
|
|
18
|
+
|
|
19
|
+
> **Note**: This project was formerly called **graph-std**.
|
|
20
|
+
|
|
21
|
+
Icebug is a standardized graph format designed for efficient graph data interchange. It comes in two formats:
|
|
22
|
+
|
|
23
|
+
- **icebug-disk**: Parquet-based format for object storage
|
|
24
|
+
- **icebug-memory**: Apache Arrow-based format for in-memory processing
|
|
25
|
+
|
|
26
|
+
This project provides tools to convert graph data from simple DuckDB databases or Parquet files containing `nodes_*` and `edges_*` tables, along with a `schema.cypher` file, into standardized graph formats for efficient processing.
|
|
27
|
+
|
|
28
|
+
## Sample Usage
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
uv run icebug-format.py \
|
|
32
|
+
--source-db karate/karate_random.duckdb \
|
|
33
|
+
--output-db karate/karate_csr.duckdb \
|
|
34
|
+
--csr-table karate \
|
|
35
|
+
--schema karate/karate_csr/schema.cypher
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
This will create a CSR representation with multiple tables depending on the number of node and edge types:
|
|
39
|
+
|
|
40
|
+
- `{table_name}_indptr_{edge_name}`: Array of size N+1 for row pointers (one per edge table)
|
|
41
|
+
- `{table_name}_indices_{edge_name}`: Array of size E containing column indices (one per edge table)
|
|
42
|
+
- `{table_name}_nodes_{node_name}`: Original nodes table with node attributes (one per node table)
|
|
43
|
+
- `{table_name}_mapping_{node_name}`: Maps original node IDs to contiguous indices (one per node table)
|
|
44
|
+
- `{table_name}_metadata`: Global graph metadata (node count, edge count, directed flag)
|
|
45
|
+
- `schema.cypher`: A cypher schema that a graph database can mount without ingesting
|
|
46
|
+
|
|
47
|
+
## More information about Icebug and Apache GraphAR
|
|
48
|
+
|
|
49
|
+
[Blog Post](https://adsharma.github.io/graph-archiving/)
|
|
50
|
+
|
|
51
|
+
## Recreating demo-db/icebug-disk
|
|
52
|
+
|
|
53
|
+
Start from a simple demo-db.duckdb that looks like this
|
|
54
|
+
|
|
55
|
+
```
|
|
56
|
+
Querying database: demo-db.duckdb
|
|
57
|
+
================================
|
|
58
|
+
|
|
59
|
+
--- Table: edges_follows ---
|
|
60
|
+
┌────────┬────────┬───────┐
|
|
61
|
+
│ source │ target │ since │
|
|
62
|
+
│ int32 │ int32 │ int32 │
|
|
63
|
+
├────────┼────────┼───────┤
|
|
64
|
+
│ 100 │ 250 │ 2020 │
|
|
65
|
+
│ 300 │ 75 │ 2022 │
|
|
66
|
+
│ 250 │ 300 │ 2021 │
|
|
67
|
+
│ 100 │ 300 │ 2020 │
|
|
68
|
+
└────────┴────────┴───────┘
|
|
69
|
+
================================
|
|
70
|
+
|
|
71
|
+
--- Table: edges_livesin ---
|
|
72
|
+
┌────────┬────────┐
|
|
73
|
+
│ source │ target │
|
|
74
|
+
│ int32 │ int32 │
|
|
75
|
+
├────────┼────────┤
|
|
76
|
+
│ 100 │ 700 │
|
|
77
|
+
│ 250 │ 700 │
|
|
78
|
+
│ 300 │ 600 │
|
|
79
|
+
│ 75 │ 500 │
|
|
80
|
+
└────────┴────────┘
|
|
81
|
+
================================
|
|
82
|
+
|
|
83
|
+
--- Table: nodes_city ---
|
|
84
|
+
┌───────┬───────────┬────────────┐
|
|
85
|
+
│ id │ name │ population │
|
|
86
|
+
│ int32 │ varchar │ int64 │
|
|
87
|
+
├───────┼───────────┼────────────┤
|
|
88
|
+
│ 500 │ Guelph │ 75000 │
|
|
89
|
+
│ 600 │ Kitchener │ 200000 │
|
|
90
|
+
│ 700 │ Waterloo │ 150000 │
|
|
91
|
+
└───────┴───────────┴────────────┘
|
|
92
|
+
================================
|
|
93
|
+
|
|
94
|
+
--- Table: nodes_user ---
|
|
95
|
+
┌───────┬─────────┬───────┐
|
|
96
|
+
│ id │ name │ age │
|
|
97
|
+
│ int32 │ varchar │ int64 │
|
|
98
|
+
├───────┼─────────┼───────┤
|
|
99
|
+
│ 100 │ Adam │ 30 │
|
|
100
|
+
│ 250 │ Karissa │ 40 │
|
|
101
|
+
│ 75 │ Noura │ 25 │
|
|
102
|
+
│ 300 │ Zhang │ 50 │
|
|
103
|
+
└───────┴─────────┴───────┘
|
|
104
|
+
================================
|
|
105
|
+
|
|
106
|
+
--- Schema: schema.cypher ---
|
|
107
|
+
CREATE NODE TABLE User(id INT64, name STRING, age INT64, PRIMARY KEY (id));
|
|
108
|
+
CREATE NODE TABLE City(id INT64, name STRING, population INT64, PRIMARY KEY (id));
|
|
109
|
+
CREATE REL TABLE Follows(FROM User TO User, since INT64);
|
|
110
|
+
CREATE REL TABLE LivesIn(FROM User TO City);
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
and run:
|
|
114
|
+
|
|
115
|
+
```
|
|
116
|
+
uv run icebug-format.py \
|
|
117
|
+
--directed \
|
|
118
|
+
--source-db demo-db.duckdb \
|
|
119
|
+
--output-db demo-db_csr.duckdb \
|
|
120
|
+
--csr-table demo \
|
|
121
|
+
--schema demo-db/schema.cypher
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
You'll get a demo-db_csr.duckdb AND the object storage ready representation aka icebug-disk.
|
|
125
|
+
|
|
126
|
+
## Verification
|
|
127
|
+
|
|
128
|
+
You can verify that the conversion went ok by running `scan.py`. It's also a good way to understand the icebug-disk format.
|
|
129
|
+
|
|
130
|
+
```
|
|
131
|
+
uv run scan.py --input demo-db_csr --prefix demo
|
|
132
|
+
Metadata: 7 nodes, 8 edges, directed=True
|
|
133
|
+
|
|
134
|
+
Node Tables:
|
|
135
|
+
|
|
136
|
+
Table: demo_nodes_user
|
|
137
|
+
(100, 'Adam', 30)
|
|
138
|
+
(250, 'Karissa', 40)
|
|
139
|
+
(75, 'Noura', 25)
|
|
140
|
+
(300, 'Zhang', 50)
|
|
141
|
+
|
|
142
|
+
Table: demo_nodes_city
|
|
143
|
+
(500, 'Guelph', 75000)
|
|
144
|
+
(600, 'Kitchener', 200000)
|
|
145
|
+
(700, 'Waterloo', 150000)
|
|
146
|
+
|
|
147
|
+
Edge Tables (reconstructed from CSR):
|
|
148
|
+
|
|
149
|
+
Table: follows (FROM user TO user)
|
|
150
|
+
(100, 250, 2020)
|
|
151
|
+
(100, 300, 2020)
|
|
152
|
+
(250, 300, 2021)
|
|
153
|
+
(300, 75, 2022)
|
|
154
|
+
|
|
155
|
+
Table: livesin (FROM user TO city)
|
|
156
|
+
(75, 500)
|
|
157
|
+
(100, 700)
|
|
158
|
+
(250, 700)
|
|
159
|
+
(300, 600)
|
|
160
|
+
```
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
icebug_format/__init__.py
|
|
4
|
+
icebug_format/cli.py
|
|
5
|
+
icebug_format.egg-info/PKG-INFO
|
|
6
|
+
icebug_format.egg-info/SOURCES.txt
|
|
7
|
+
icebug_format.egg-info/dependency_links.txt
|
|
8
|
+
icebug_format.egg-info/entry_points.txt
|
|
9
|
+
icebug_format.egg-info/requires.txt
|
|
10
|
+
icebug_format.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
icebug_format
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "icebug-format"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Convert graph data from DuckDB to CSR format for Icebug"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.13"
|
|
7
|
+
dependencies = [
|
|
8
|
+
"duckdb>=1.3.2",
|
|
9
|
+
]
|
|
10
|
+
|
|
11
|
+
[project.optional-dependencies]
|
|
12
|
+
full = [
|
|
13
|
+
"real_ladybug>=0.14.1",
|
|
14
|
+
"networkx>=3.5",
|
|
15
|
+
"pandas>=2.3.2",
|
|
16
|
+
"pyarrow>=21.0.0",
|
|
17
|
+
]
|
|
18
|
+
|
|
19
|
+
[project.scripts]
|
|
20
|
+
icebug-format = "icebug_format:main"
|
|
21
|
+
|
|
22
|
+
[project.urls]
|
|
23
|
+
Homepage = "https://github.com/anomalyco/icebug-format"
|
|
24
|
+
Repository = "https://github.com/anomalyco/icebug-format"
|
|
25
|
+
PyPI = "https://pypi.org/project/icebug-format"
|
|
26
|
+
|
|
27
|
+
[tool.setuptools.packages.find]
|
|
28
|
+
include = ["icebug_format*"]
|
|
29
|
+
|
|
30
|
+
[tool.isort]
|
|
31
|
+
profile = "black"
|
|
32
|
+
|
|
33
|
+
[tool.uv]
|
|
34
|
+
package = true
|
|
35
|
+
dev-dependencies = []
|