graphzero 0.2.0__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
graphzero/__init__.py ADDED
@@ -0,0 +1,15 @@
1
+ from .graphzero import Graph, convert_csv_to_gl, convert_csv_to_gd, DataType, FeatureStore
2
+
3
+ # Package metadata
4
+ __version__ = "0.2.0"
5
+ __author__ = "Krish Singaria"
6
+ __license__ = "MIT"
7
+
8
+ # Public API: the names exported by "from graphzero import *"
9
+ __all__ = [
10
+ "Graph",
11
+ "convert_csv_to_gl",
12
+ "convert_csv_to_gd",
13
+ "DataType",
14
+ "FeatureStore"
15
+ ]
Binary file
@@ -0,0 +1,170 @@
1
+ import enum
2
+ from typing import List, Tuple, Any, Union
3
+ import numpy as np
4
+ import numpy.typing as npt
5
+
6
+ class Graph:
7
+ """
8
+ Graph Class contains the graphfile and its relevant functions and methods.
9
+ It holds the mmap / zero-copy memory.
10
+ """
11
+
12
+ num_nodes: int
13
+ num_edges: int
14
+
15
+ def __init__(self, filename: str) -> None:
16
+ """
17
+ Args:
18
+ filename (str): either absolute path or relative path (depends on the current working directory).
19
+ Returns:
20
+ Graph class instance.
21
+ """
22
+ ...
23
+
24
+ def get_degree(self, node_id: int) -> int:
25
+ """
26
+ Get the degree of a node.
27
+
28
+ Args:
29
+ node_id (int)
30
+ Returns:
31
+ degree (int)
32
+ """
33
+ ...
34
+
35
+ def get_neighbours(self, node_id: int) -> npt.NDArray[np.uint64]:
36
+ """
37
+ Returns the neighbours of a node.
38
+
39
+ Args:
40
+ node_id (int)
41
+ Returns:
42
+ 1-D numpy ndarray of neighbour node IDs (dtype: platform-size integer).
43
+ """
44
+ ...
45
+
46
+ def batch_random_walk(self, start_nodes: List[int], walk_length: int, p: float = 1.0, q: float = 1.0) -> npt.NDArray[np.int64]:
47
+ """
48
+ Performs 2nd-order random walks (Node2Vec style).
49
+
50
+ Args:
51
+ start_nodes (list): list of starting nodes.
52
+ walk_length (int): how long walks should be (e.g. 10).
53
+ p (float): Return parameter; Low = keeps walk local (BFS-like).
54
+ q (float): In-out parameter; Low = explores far away (DFS-like).
55
+
56
+ Returns:
57
+ ndarray of shape (len(start_nodes), walk_length) with dtype np.int64.
58
+ """
59
+ ...
60
+
61
+ def batch_random_walk_uniform(self, start_nodes: List[int], walk_length: int) -> npt.NDArray[np.int64]:
62
+ """
63
+ Performs uniform random walks.
64
+
65
+ Args:
66
+ start_nodes (list): list of starting nodes.
67
+ walk_length (int): how long walks should be (e.g. 10).
68
+
69
+ Returns:
70
+ ndarray of shape (len(start_nodes), walk_length) with dtype np.int64.
71
+ """
72
+ ...
73
+
74
+ def batch_random_fanout(self, start_nodes: List[int], K: int) -> npt.NDArray[np.int64]:
75
+ """
76
+ Performs uniform random fanout sampling.
77
+
78
+ Args:
79
+ start_nodes (list): list of starting nodes.
80
+ K (int): how many neighbours to sample.
81
+
82
+ Returns:
83
+ ndarray of shape (len(start_nodes), K) with dtype np.int64.
84
+ """
85
+ ...
86
+
87
+ def sample_neighbours(self, start_node: int, K: int) -> npt.NDArray[np.int64]:
88
+ """
89
+ Performs uniform random neighbour sampling for a node.
90
+
91
+ Args:
92
+ start_node (int): node id to sample from.
93
+ K (int): how many neighbours to sample.
94
+
95
+ Returns:
96
+ 1-D ndarray with up to K neighbour ids (dtype np.int64).
97
+ """
98
+ ...
99
+
100
+ def __getstate__(self) -> tuple: ...
101
+ def __setstate__(self, state: tuple) -> None: ...
102
+
103
+
104
+ class DataType(enum.Enum):
105
+ """
106
+ Enum DataType only For FeatureStore datatype
107
+
108
+ use gz.DataType.INT32, gz.DataType.INT64, gz.DataType.FLOAT32, gz.DataType.FLOAT64 when converting csv to gd.
109
+ """
110
+ INT32 = 0,
111
+ INT64 = 1,
112
+ FLOAT32 = 2,
113
+ FLOAT64 = 3
114
+
115
+
116
+ class FeatureStore:
117
+ """
118
+ Handle to a GraphZero feature data file, with the accessors that read from it.
119
+ It holds the mmap / zero-copy memory of the feature matrix.
120
+ """
121
+ num_nodes: int
122
+ feature_dim: int
123
+
124
+ def __init__(self, filename: str) -> None:
125
+ """
126
+ Args:
127
+ filename (str): path to the data file, absolute or relative to the current working directory.
128
+ Returns:
129
+ FeatureStore class instance.
130
+ """
131
+ ...
132
+ def get_data(self, node_id: int) -> npt.NDArray[Any]:
133
+ """
134
+ Get the feature vector of a single node.
135
+
136
+ Args:
137
+ node_id (int): Node ID to retrieve features for.
138
+ Returns:
139
+ 1-D numpy ndarray of shape (feature_dim,).
140
+ """
141
+ ...
142
+ def get_tensor(self) -> npt.NDArray[Any]:
143
+ """
144
+ Return the entire feature matrix.
145
+ Returns:
146
+ 2-D numpy ndarray of shape (num_nodes, feature_dim); dtype presumably the DataType used at conversion (TODO confirm).
147
+ """
148
+ ...
149
+
150
+ def convert_csv_to_gl(csv_path: str, out_path: str, directed: bool = False) -> None:
151
+ """
152
+ Convert a CSV edge list to the GraphZero binary graph format (.gl).
153
+
154
+ Args:
155
+ csv_path (str): Path to the input CSV edge list.
156
+ out_path (str): Path to the output .gl file.
157
+ directed (bool): Whether the graph is directed. Defaults to False.
158
+ """
159
+ ...
160
+
161
+ def convert_csv_to_gd(csv_path: str, out_path: str, dtype: DataType) -> None:
162
+ """
163
+ Convert CSV feature data to the GraphZero data format (.gd).
164
+
165
+ Args:
166
+ csv_path (str): Path to the input CSV.
167
+ out_path (str): Path to the output .gd file.
168
+ dtype (DataType): Data type of the features; required, there is no default.
169
+ """
170
+ ...
@@ -0,0 +1,167 @@
1
+ Metadata-Version: 2.2
2
+ Name: graphzero
3
+ Version: 0.2.0
4
+ Summary: High-performance Zero-Copy Graph Engine
5
+ Author-Email: Krish <krishsingaria2005@gmail.com>
6
+ Requires-Python: >=3.8
7
+ Requires-Dist: numpy
8
+ Requires-Dist: torch
9
+ Description-Content-Type: text/markdown
10
+
11
+ # GraphZero
12
+
13
+ **High-Performance, Zero-Copy Graph Engine for Massive Datasets on Consumer Hardware.**
14
+
15
+ GraphZero is a C++ graph processing engine with lightweight Python bindings designed to solve the **"Memory Wall"** in Graph Neural Networks (GNNs). It allows you to load and sample **100 Million+ node graphs** (like `ogbn-papers100M`) and their massive feature matrices on a standard 16GB RAM laptop—something standard libraries like PyTorch Geometric (PyG) or DGL cannot do.
16
+
17
+ ## The Problem
18
+
19
+ GNN datasets can be massive. `ogbn-papers100M` contains **111 Million nodes**, **1.6 Billion edges**, and gigabytes of node embeddings.
20
+
21
+ * **Standard approach (PyG/NetworkX):** Tries to load the entire graph structure and all node features into **RAM** before training begins.
22
+ * **The Result:** `MemoryError` (OOM) on consumer hardware. You need 64GB+ **RAM** servers just to *load* the data.
23
+
24
+ ## The Solution:
25
+
26
+ GraphZero abandons the "Load-to-RAM" model. Instead, it uses a custom **Zero-Copy Architecture**:
27
+
28
+ * **Memory Mapping (`mmap`):** The graph and its features stay on disk. The OS only loads the specific "hot" pages needed for computation into RAM via page faults.
29
+ * **Compressed CSR (`.gl`):** A custom binary format that compresses raw edges by **~60%** (30GB CSV $\to$ 13GB Binary).
30
+ * **Columnar Tensor Store (`.gd`):** A raw, C-contiguous binary format for node features that instantly translates to PyTorch tensors without memory allocation.
31
+ * **Parallel Sampling:** OpenMP-accelerated random walks that saturate NVMe SSD throughput, using thread-local RNGs to eliminate lock contention.
32
+
33
+ ## 🏆 Benchmarks: GraphZero vs. PyTorch Geometric
34
+
35
+ **Task:** Load `ogbn-papers100M` (56GB Raw) and perform random walks.
36
+ **Hardware:** Windows Laptop (16GB RAM, NVMe SSD).
37
+
38
+ | Metric | GraphZero (v0.2) | PyTorch Geometric |
39
+ | --- | --- | --- |
40
+ | **Load Time** | **0.000000 s** ⚡ | **FAILED** (Crash) ❌ |
41
+ | **Peak RAM Usage** | **~5.1 GB** (OS Cache) | **>24.1 GB** (Required) |
42
+ | **Throughput** | **1,264,000 steps/s** | N/A |
43
+ | **Status** | ✅ **Success** | ❌ **OOM Error** |
44
+
45
+ ### Proof of Performance
46
+
47
+ > *Left: GraphZero loading instantly and utilizing OS Page Cache. Right: PyG crashing with `Unable to allocate 24.1 GiB`.*
48
+
49
+ <p float="left">
50
+ <img src="benchmark/images/gz_bench.png" width="45%" />
51
+ <img src="benchmark/images/py_crash.png" width="45%" />
52
+ </p>
53
+
54
+
55
+ ## 📦 Installation
56
+
57
+ GraphZero is available on PyPI:
58
+
59
+ ```bash
60
+ pip install graphzero
61
+
62
+ ```
63
+
64
+
65
+ ## 🚀 Quick Start
66
+
67
+ ### 1. Convert Your Data (Topology & Features)
68
+
69
+ GraphZero uses high-efficiency binary formats. Convert your generic CSV lists once.
70
+
71
+ Example `edges.csv` (the `weight` column is optional):
72
+ ```csv
73
+ src,dst,weight
74
+ 0,1,0.5
75
+ 1,2,1.0
76
+ ```
77
+
78
+ ```python
79
+ import graphzero as gz
80
+
81
+ # 1. Convert Topology (Edges & Weights) to .gl
82
+ gz.convert_csv_to_gl(
83
+ csv_path="dataset/edges.csv",
83
+ out_path="graph.gl",
85
+ directed=True
86
+ )
87
+
88
+ # 2. Convert Node Features to .gd (Float32, Int64, etc.)
89
+ gz.convert_csv_to_gd(
90
+ csv_path="dataset/features.csv",
91
+ out_path="features.gd",
92
+ dtype=gz.DataType.FLOAT32
93
+ )
94
+
95
+ ```
96
+
97
+ ### 2. High-Speed Sampling & Zero-Copy Tensors
98
+
99
+ Once converted, the graph and its multi-gigabyte feature matrix are instantly accessible without consuming RAM.
100
+
101
+ ```python
102
+ import graphzero as gz
103
+ import numpy as np
104
+
105
+ # TOPOLOGY
106
+ # Zero-Copy Load (Instant)
107
+ g = gz.Graph("graph.gl")
108
+
109
+ # Define Start Nodes
110
+ start_nodes = np.random.randint(0, g.num_nodes, 1000).astype(np.uint64)
111
+
112
+ # Parallel Biased Random Walk (Node2Vec style: p=1.0, q=0.5)
113
+ walks = g.batch_random_walk(
114
+ start_nodes=start_nodes,
115
+ walk_length=10,
116
+ p=1.0,
117
+ q=0.5
118
+ )
119
+
120
+ # FEATURES
121
+ # Zero-Copy Feature Load (Instant)
122
+ fs = gz.FeatureStore("features.gd")
123
+
124
+ # Get a perfect 2D Numpy/PyTorch Tensor mapping directly to the SSD
125
+ # RAM used: 0 Bytes!
126
+ node_features = fs.get_tensor()
127
+
128
+ print(f"Graph loaded. Feature Matrix Shape: {node_features.shape}")
129
+
130
+ ```
131
+
132
+ ## ⚙️ Under the Hood
133
+
134
+ GraphZero is built for **Systems & GNN** enthusiasts.
135
+
136
+ * **Core:** C++20 with `nanobind` for Python bindings.
137
+ * **Parallelism:** Uses `#pragma omp` with thread-local deterministic RNGs.
138
+ * **IO:** Direct `CreateFileMapping` (Windows) and `mmap` (Linux) calls with alignment optimization (4KB/2MB pages).
139
+
140
+ ## 🌟 Current Features List (v0.2)
141
+
142
+ GraphZero currently supports the following high-performance ML capabilities:
143
+
144
+ **Graph Structural Engine**
145
+
146
+ * **Instant Ingestion:** Fast `mmap`-backed loading of directed, undirected, and weighted graphs.
147
+ * **Zero-Copy CSR:** Custom `.gl` binary format for dense, continuous memory alignment and 64-byte CPU cache line optimization.
148
+ * **Thread-Safe Sampling:** OpenMP-accelerated `batch_random_walk_uniform` and `batch_random_fanout`.
149
+ * **Biased Walks (Node2Vec):** Hardware-optimized Alias Table generation for $O(1)$ weighted sampling (`batch_random_walk` with `p` and `q` parameters).
150
+ * **Fault-Tolerant:** Automatic handling of dead-ends (sinks) and out-of-bounds nodes.
151
+
152
+ **Graph Data Engine**
153
+
154
+ * **Columnar Tensor Store:** Custom `.gd` binary format for storing $N \times F$ feature matrices.
155
+ * **Strong Typing:** Native C++ template dispatching supporting `FLOAT32`, `FLOAT64`, `INT32`, and `INT64`.
156
+ * **Zero-Copy Bridge:** Direct translation of `mmap` pointers to Numpy/PyTorch multidimensional arrays.
157
+
158
+ ## 🗺️ Roadmap
159
+ - v0.3 (The Algorithmic Core): High-performance analytics engine adding OpenMP-accelerated Parallel BFS/DFS, PageRank, and Connected Components.
160
+
161
+ - v0.4 (Dynamic Updates): Breaking the immutable CSR barrier via an LSM-Tree/Adjacency List memory overlay to allow real-time edge/node insertions.
162
+
163
+ - v0.5 (Production Hardening): ACID-compliant safety for multi-process PyTorch training using Reader-Writer Locks, Write-Ahead Logging (WAL), and graceful exception handling.
164
+
165
+ ## 📄 License
166
+
167
+ MIT License. Created by **Krish Singaria** (IIT Mandi).
@@ -0,0 +1,7 @@
1
+ graphzero/__init__.py,sha256=lQhujCJEdJ5NlEZpfr2nkqmP-SFMpFF1jVX7U8sgEkU,367
2
+ graphzero/graphzero.cp313-win_amd64.pyd,sha256=IaBWdT3PLYVFWu4rKHNyLSGJJOX37YdYwc7345qN9rg,206336
3
+ graphzero/graphzero.pyi,sha256=wp3QbBetsxvnQNqblkTrKAfK9N4DvWqIRBfgGDz-hvE,5121
4
+ graphzero-0.2.0.dist-info/METADATA,sha256=t-QpVocD59fxNUirxVSyICZuApHinypF4boQFU55wco,6092
5
+ graphzero-0.2.0.dist-info/WHEEL,sha256=UZrbbE4r80xj7Ncfa6JoeTVe-77bdXLkKUA63V8pKWQ,106
6
+ graphzero-0.2.0.dist-info/licenses/LICENSE,sha256=3f1lwtCZQI2fiQt-KWJ98nFb5EGrjZQzz-wZ62p-T8w,1081
7
+ graphzero-0.2.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: scikit-build-core 0.12.2
3
+ Root-Is-Purelib: false
4
+ Tag: cp313-cp313-win_amd64
5
+
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Krish
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.