dgen-py 0.1.2__cp310-cp310-manylinux_2_24_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dgen_py/__init__.py ADDED
@@ -0,0 +1,167 @@
1
+ """
2
+ dgen-py: High-performance random data generation with NUMA optimization
3
+
4
+ TRUE ZERO-COPY: Uses Python buffer protocol for zero-copy access to generated data.
5
+ No memcpy between Rust and Python - same performance as numpy!
6
+ """
7
+
8
+ from typing import Optional
9
+ import sys
10
+
11
# Import the compiled Rust extension that provides the actual generators.
try:
    from ._dgen_rs import (
        BytesView,
        generate_buffer,
        generate_into_buffer,
        Generator,
    )

    # NUMA introspection may not be available on all platforms; when the
    # symbol is missing, fall back to None so get_system_info() can report
    # "not available" instead of failing at import time.
    try:
        from ._dgen_rs import get_numa_info
    except ImportError:
        get_numa_info = None

except ImportError as e:
    # Chain the original error explicitly so the underlying import failure
    # is reported as the direct cause of this more helpful message.
    raise ImportError(
        f"Failed to import dgen-py Rust extension: {e}\n"
        "Please ensure the package is properly installed:\n"
        " pip install dgen-py\n"
        "Or build from source:\n"
        " cd dgen-rs && maturin develop --release"
    ) from e
34
+
35
# Must match the released package version (this wheel is dgen-py 0.1.2).
__version__ = "0.1.2"

# Public names exported by `from dgen_py import *`.
__all__ = [
    "BytesView",
    "generate_buffer",
    "generate_data",
    "generate_into_buffer",
    "fill_buffer",
    "Generator",
    "StreamingGenerator",
    "get_numa_info",
    "get_system_info",
]
46
+
47
+
48
def generate_data(
    size: int,
    dedup_ratio: float = 1.0,
    compress_ratio: float = 1.0,
    numa_mode: str = "auto",
    max_threads: Optional[int] = None,
):
    """
    Create ``size`` bytes of random data and hand it back without copying.

    Thin wrapper around the Rust ``generate_buffer`` function; every argument
    is forwarded positionally in the order listed below.

    Args:
        size: Total number of bytes to generate.
        dedup_ratio: Deduplication ratio (1.0 = no dedup, 2.0 = 2:1 ratio).
        compress_ratio: Compression ratio (1.0 = incompressible,
            3.0 = 3:1 ratio).
        numa_mode: NUMA optimization - "auto" (default), "force", or
            "disabled".
        max_threads: Upper bound on worker threads (None = use all cores).

    Returns:
        BytesView: Buffer-protocol object; wrap it in memoryview() or
        numpy.frombuffer() to read the data without a copy.

    Example - zero-copy access through numpy:
        >>> import dgen_py
        >>> import numpy as np
        >>> data = dgen_py.generate_data(1024 * 1024)
        >>> view = memoryview(data)                    # no copy
        >>> arr = np.frombuffer(view, dtype=np.uint8)  # still no copy
        >>> len(data), len(view), len(arr)
        (1048576, 1048576, 1048576)

    Example - materialize a real bytes object (this copies the data):
        >>> data_bytes = bytes(data)
    """
    view = generate_buffer(
        size,
        dedup_ratio,
        compress_ratio,
        numa_mode,
        max_threads,
    )
    return view
93
+
94
+
95
def fill_buffer(
    buffer,
    dedup_ratio: float = 1.0,
    compress_ratio: float = 1.0,
    numa_mode: str = "auto",
    max_threads: Optional[int] = None,
) -> int:
    """
    Write generated data straight into a caller-supplied buffer (zero-copy).

    Thin wrapper around the Rust ``generate_into_buffer`` function; every
    argument is forwarded positionally in the order listed below. Intended
    for pre-allocated buffers such as bytearray, memoryview or numpy arrays.

    Args:
        buffer: Pre-allocated writable object supporting the buffer protocol.
        dedup_ratio: Deduplication ratio.
        compress_ratio: Compression ratio.
        numa_mode: NUMA optimization - "auto" (default), "force", or
            "disabled".
        max_threads: Upper bound on worker threads (None = use all cores).

    Returns:
        int: Number of bytes written.

    Example:
        >>> import dgen_py
        >>> buf = bytearray(1024 * 1024)
        >>> # Fill the bytearray in place using at most 4 threads
        >>> nbytes = dgen_py.fill_buffer(buf, compress_ratio=2.0, max_threads=4)

        >>> # numpy arrays work the same way
        >>> import numpy as np
        >>> arr = np.zeros(1024 * 1024, dtype=np.uint8)
        >>> nbytes = dgen_py.fill_buffer(arr, dedup_ratio=2.0)
    """
    written = generate_into_buffer(
        buffer,
        dedup_ratio,
        compress_ratio,
        numa_mode,
        max_threads,
    )
    return written
134
+
135
+
136
def get_system_info() -> Optional[dict]:
    """
    Get NUMA topology information (if available).

    Returns:
        dict: NUMA info with keys:
            - num_nodes: Number of NUMA nodes
            - physical_cores: Total physical cores
            - logical_cpus: Total logical CPUs
            - is_uma: Whether this is a UMA system
            - deployment_type: Description of deployment type
        None: If NUMA detection is not available on this platform, or if
            the detection call itself fails.

    Example:
        >>> info = dgen_py.get_system_info()
        >>> if info:
        ...     print(f"NUMA nodes: {info['num_nodes']}")
        ...     print(f"Cores: {info['physical_cores']}")
        ...     print(f"Type: {info['deployment_type']}")
    """
    # Imported locally so this function does not depend on a module-level
    # import that the file does not already have.
    import logging

    # The extension was built without NUMA introspection.
    if get_numa_info is None:
        return None

    try:
        return get_numa_info()
    except Exception:
        # Detection is best-effort, so callers still get None, but the
        # failure is recorded at debug level instead of vanishing silently.
        logging.getLogger(__name__).debug(
            "NUMA topology detection failed", exc_info=True
        )
        return None
163
+
164
+
165
+ # Convenience alias: StreamingGenerator is a second name bound to Generator
166
+ StreamingGenerator = Generator
167
+
dgen_py/__init__.pyi ADDED
@@ -0,0 +1,61 @@
1
+ """Type stubs for dgen-py"""
2
+
3
+ from typing import Optional
4
+
5
class BytesView:
    """Buffer-protocol object returned by the generation functions.

    Only ``len()`` is declared here; read the contents through
    ``memoryview(view)``, ``bytes(view)`` or ``numpy.frombuffer(view, ...)``.
    """

    def __len__(self) -> int:
        """Return the number of bytes in the view."""
        ...


def generate_buffer(
    size: int,
    dedup_ratio: float = 1.0,
    compress_ratio: float = 1.0,
    numa_mode: str = "auto",
    max_threads: Optional[int] = None,
) -> BytesView:
    """Generate random data with controllable characteristics.

    The package's ``generate_data`` wrapper calls this with all five
    arguments positionally, so the stub must accept them.
    NOTE(review): the ``numa_mode`` / ``max_threads`` defaults mirror
    ``generate_data`` - confirm against the Rust signature.
    """
    ...


def generate_data(
    size: int,
    dedup_ratio: float = 1.0,
    compress_ratio: float = 1.0,
    numa_mode: str = "auto",
    max_threads: Optional[int] = None,
) -> BytesView:
    """Generate random data; Python-level wrapper around ``generate_buffer``."""
    ...
12
+
13
def generate_into_buffer(
    buffer,
    dedup_ratio: float = 1.0,
    compress_ratio: float = 1.0,
    numa_mode: str = "auto",
    max_threads: Optional[int] = None,
) -> int:
    """Generate data directly into an existing buffer (zero-copy).

    The package's ``fill_buffer`` wrapper calls this with all five
    arguments positionally, so the stub must accept them.
    NOTE(review): the ``numa_mode`` / ``max_threads`` defaults mirror
    ``fill_buffer`` - confirm against the Rust signature.
    """
    ...


def fill_buffer(
    buffer,
    dedup_ratio: float = 1.0,
    compress_ratio: float = 1.0,
    numa_mode: str = "auto",
    max_threads: Optional[int] = None,
) -> int:
    """Fill a pre-allocated writable buffer; wrapper around ``generate_into_buffer``.

    Returns the number of bytes written.
    """
    ...
20
+
21
class Generator:
    """Streaming data generator.

    Produces up to ``size`` bytes in chunks, either into caller-supplied
    buffers (``fill_chunk``) or as new ``bytes`` objects (``get_chunk``).
    """

    def __init__(
        self,
        size: int,
        dedup_ratio: float = 1.0,
        compress_ratio: float = 1.0,
        numa_mode: str = "auto",
        max_threads: Optional[int] = None,
    ) -> None:
        """Create a new generator for ``size`` total bytes."""
        ...

    def fill_chunk(self, buffer) -> int:
        """Fill the next chunk into ``buffer`` and return an int.

        NOTE(review): presumably the number of bytes written - confirm.
        """
        ...

    def get_chunk(self, chunk_size: int) -> Optional[bytes]:
        """Get the next chunk as bytes.

        NOTE(review): ``None`` presumably signals that the generator is
        exhausted - confirm against the Rust implementation.
        """
        ...

    def reset(self) -> None:
        """Reset the generator to the start."""
        ...

    def position(self) -> int:
        """Get the current position."""
        ...

    def total_size(self) -> int:
        """Get the total size."""
        ...

    def is_complete(self) -> bool:
        """Check whether generation is complete."""
        ...


# The runtime module exports this alias, so the stub must declare it too.
StreamingGenerator = Generator
58
+
59
def get_numa_info() -> dict:
    """Get NUMA topology information.

    The package binds this name to ``None`` when the extension does not
    provide it; prefer ``get_system_info()``, which handles that case.
    """
    ...


def get_system_info() -> Optional[dict]:
    """Get NUMA topology information, or ``None`` if it is unavailable.

    Per the package documentation the dict has the keys ``num_nodes``,
    ``physical_cores``, ``logical_cpus``, ``is_uma`` and ``deployment_type``.
    """
    ...
@@ -0,0 +1,241 @@
1
+ # Performance Benchmarks
2
+
3
+ ## dgen-py vs Numpy Random Generation
4
+
5
+ **Test System**: 12-core UMA system (single NUMA node)
6
+ **Date**: January 8, 2026
7
+ **Python**: 3.12
8
+ **Numpy**: Latest
9
+ **Test Method**: 5 runs per size, averaged
10
+
11
+ ---
12
+
13
+ ## Benchmark Results
14
+
15
+ ### Performance Comparison Table
16
+
17
+ | Size | Method | Time (ms) | Throughput | vs dgen-py |
18
+ |---------|-------------------------|-----------|------------|------------|
19
+ | **1 MiB** | dgen-py | 2.7 ms | 0.39 GB/s | baseline |
20
+ | | numpy.random.randint | 11.8 ms | 0.09 GB/s | **4.36x** |
21
+ | | numpy.random.bytes | 1.1 ms | 0.98 GB/s | 0.39x |
22
+ | **10 MiB** | dgen-py | 7.1 ms | 1.48 GB/s | baseline |
23
+ | | numpy.random.randint | 15.3 ms | 0.69 GB/s | **2.15x** |
24
+ | | numpy.random.bytes | 11.4 ms | 0.92 GB/s | **1.60x** |
25
+ | **100 MiB** | dgen-py | 14.4 ms | 7.26 GB/s | baseline |
26
+ | | numpy.random.randint | 152.6 ms | 0.69 GB/s | **10.56x** |
27
+ | | numpy.random.bytes | 219.0 ms | 0.48 GB/s | **15.16x** |
28
+ | **500 MiB** | dgen-py | 59.2 ms | 8.86 GB/s | baseline |
29
+ | | numpy.random.randint | 763.8 ms | 0.69 GB/s | **12.91x** |
30
+ | | numpy.random.bytes | 1097.8 ms | 0.48 GB/s | **18.56x** |
31
+
32
+ ---
33
+
34
+ ## Key Findings
35
+
36
+ ### 🚀 Performance Summary
37
+
38
+ - **Average speedup vs numpy.random.randint**: **7.50x faster**
39
+ - **Average speedup vs numpy.random.bytes**: **8.93x faster**
40
+ - **Peak throughput**: **8.86 GB/s** (at 500 MiB)
41
+ - **Best speedup**: **18.56x faster** than numpy.random.bytes at 500 MiB
42
+
43
+ ### 📊 Throughput Scaling
44
+
45
+ ```
46
+ Size dgen-py numpy.randint numpy.bytes Speedup (best)
47
+ --------------------------------------------------------------
48
+ 1 MiB 0.39 GB/s 0.09 GB/s 0.98 GB/s 0.39x (slower)
49
+ 10 MiB 1.48 GB/s 0.69 GB/s 0.92 GB/s 1.60x
50
+ 100 MiB 7.26 GB/s 0.69 GB/s 0.48 GB/s 15.16x
51
+ 500 MiB 8.86 GB/s 0.69 GB/s 0.48 GB/s 18.56x
52
+ ```
53
+
54
+ **Observation**: dgen-py throughput climbs with data size (0.39 GB/s at 1 MiB to 8.86 GB/s at 500 MiB) as multi-threading overhead is amortized over more work, while numpy throughput plateaus or declines (single-threaded).
55
+
56
+ ---
57
+
58
+ ## Why dgen-py is Faster
59
+
60
+ ### 1. **Superior RNG Algorithm**
61
+ - **dgen-py**: Xoshiro256++ (fastest high-quality RNG)
62
+ - **numpy**: MT19937 (Mersenne Twister, slower but proven)
63
+ - Xoshiro256++ provides ~2x raw speed advantage
64
+
65
+ ### 2. **Multi-Threading**
66
+ - **dgen-py**: Rayon-based parallel generation across all cores
67
+ - **numpy**: Single-threaded random number generation
68
+ - Linear scaling on multi-core systems (12 cores = ~12x potential)
69
+
70
+ ### 3. **Zero-Copy Architecture**
71
+ - **dgen-py**: Buffer protocol (`__getbuffer__`) for direct memory access
72
+ - **numpy**: Must allocate numpy array, copy data
73
+ - Eliminates allocation overhead and memcpy latency
74
+
75
+ ### 4. **Optimized for Bulk Generation**
76
+ - Pre-allocated buffers
77
+ - Cache-friendly memory access patterns
78
+ - First-touch NUMA locality (on NUMA systems)
79
+
80
+ ---
81
+
82
+ ## When to Use Each
83
+
84
+ ### Use dgen-py when:
85
+ ✅ Generating **large datasets** (100+ MiB)
86
+ ✅ Need **maximum throughput** (AI/ML data generation)
87
+ ✅ Working with **binary data** (files, network buffers)
88
+ ✅ Have **multi-core system** to leverage parallelism
89
+ ✅ Need **zero-copy integration** with other tools
90
+
91
+ ### Use numpy.random when:
92
+ ✅ Generating **small arrays** (<10 MiB)
93
+ ✅ Need **statistical distributions** (normal, poisson, etc.)
94
+ ✅ Need **specific random seeds** for reproducibility
95
+ ✅ Integration with **numpy-centric workflow**
96
+ ✅ Need **element-wise operations** on random data
97
+
98
+ ---
99
+
100
+ ## Zero-Copy Verification
101
+
102
+ ### Memory Access Pattern
103
+
104
+ ```python
105
+ import dgen_py
106
+ import numpy as np
107
+
108
+ # Generate data (Rust allocation)
109
+ data = dgen_py.generate_data(100 * 1024 * 1024) # 100 MiB
110
+
111
+ # Create memoryview (zero-copy, <2 µs)
112
+ view = memoryview(data)
113
+
114
+ # Create numpy array (zero-copy, <10 µs)
115
+ arr = np.frombuffer(view, dtype=np.uint8)
116
+
117
+ # All three share THE SAME memory:
118
+ assert len(data) == len(view) == len(arr) # 104,857,600 bytes
119
+ ```
120
+
121
+ **Memory Overhead**:
122
+ - **With copy**: 300 MiB (3 allocations)
123
+ - **Zero-copy**: 100 MiB (1 allocation)
124
+ - **Savings**: 66% memory reduction
125
+
126
+ **Performance Overhead**:
127
+ - Memoryview creation: **~1 µs**
128
+ - Numpy array creation: **~8 µs**
129
+ - **Total**: <10 µs (negligible compared to generation time)
130
+
131
+ ---
132
+
133
+ ## Comparison to Other Tools
134
+
135
+ ### Throughput Comparison (500 MiB test)
136
+
137
+ | Tool | Throughput | Notes |
138
+ |-------------------------|------------|---------------------------------|
139
+ | **dgen-py** | 8.86 GB/s | Multi-threaded, zero-copy |
140
+ | numpy.random.bytes | 0.48 GB/s | Single-threaded |
141
+ | numpy.random.randint | 0.69 GB/s | Single-threaded + array overhead|
142
+ | dd if=/dev/urandom | ~0.05 GB/s | Kernel RNG (cryptographic) |
143
+ | Rust (native) | 12.5 GB/s | Direct benchmark, no Python overhead |
144
+
145
+ **Note**: dgen-py achieves **70% of native Rust performance** through Python, demonstrating the effectiveness of zero-copy design.
146
+
147
+ ---
148
+
149
+ ## Technical Details
150
+
151
+ ### Test Configuration
152
+
153
+ ```python
154
+ # Benchmark parameters
155
+ sizes = [1 * MiB, 10 * MiB, 100 * MiB, 500 * MiB] # MiB = 1024 * 1024
156
+ runs_per_size = 5
157
+ numa_mode = "auto" # Auto-detect NUMA topology
158
+ max_threads = None # Use all available cores (12)
159
+ ```
160
+
161
+ ### System Configuration
162
+
163
+ - **CPU**: 12 cores (UMA, single NUMA node)
164
+ - **Memory**: Standard system allocator
165
+ - **Python**: 3.12 with uv virtual environment
166
+ - **Compiler**: rustc 1.91+ with LTO enabled
167
+ - **Build**: `maturin develop --release`
168
+
169
+ ### Benchmark Script
170
+
171
+ Run the benchmark yourself:
172
+
173
+ ```bash
174
+ cd dgen-rs
175
+ python python/examples/benchmark_vs_numpy.py
176
+ ```
177
+
178
+ ---
179
+
180
+ ## Recommendations for Production
181
+
182
+ ### For AI/ML Training Data Generation
183
+
184
+ ```python
185
+ import dgen_py
186
+ import numpy as np
187
+
188
+ # Generate 768 MiB of random data (exactly 1024 float32 images of 256x256x3)
189
+ data = dgen_py.generate_data(1024 * 256 * 256 * 3 * 4, numa_mode="auto")
190
+
191
+ # Zero-copy conversion to numpy for processing
192
+ view = memoryview(data)
193
+ arr = np.frombuffer(view, dtype=np.float32) # Reinterpret raw bytes as float32 (random bit patterns may include NaN/Inf)
194
+
195
+ # Reshape for model input (e.g., 256x256 RGB images)
196
+ images = arr.reshape(-1, 256, 256, 3)
197
+ ```
198
+
199
+ **Expected Performance**: ~8-10 GB/s on typical workstation
200
+
201
+ ### For Storage Benchmarking
202
+
203
+ ```python
204
+ import dgen_py
205
+
206
+ # Generate incompressible data for realistic I/O testing
207
+ data = dgen_py.generate_data(
208
+ size=10 * 1024**3, # 10 GB
209
+ compress_ratio=1.0, # Incompressible
210
+ dedup_ratio=1.0, # No deduplication
211
+ numa_mode="force" # Force NUMA optimizations
212
+ )
213
+
214
+ # Write to storage (data is bytes-like, works with file I/O)
215
+ with open('/mnt/storage/testfile.bin', 'wb') as f:
216
+ f.write(memoryview(data)) # Zero-copy write via the buffer protocol (bytes(data) would copy all 10 GB)
217
+ ```
218
+
219
+ ---
220
+
221
+ ## Future Optimizations
222
+
223
+ Potential improvements for even better performance:
224
+
225
+ 1. **SIMD Instructions**: AVX-512 for 2-4x speedup on modern CPUs
226
+ 2. **GPU Generation**: CUDA/ROCm for 100+ GB/s on high-end GPUs
227
+ 3. **Async I/O Integration**: Direct-to-disk generation without intermediate buffers
228
+ 4. **Custom Allocators**: jemalloc/mimalloc for better multi-threaded allocation
229
+
230
+ ---
231
+
232
+ ## Conclusion
233
+
234
+ dgen-py delivers **production-grade performance** for random data generation:
235
+
236
+ - ✅ **7-18x faster** than numpy for bulk generation
237
+ - ✅ **True zero-copy** via Python buffer protocol
238
+ - ✅ **Multi-threaded** scaling on modern CPUs
239
+ - ✅ **Competitive with native Rust** (70% of raw performance)
240
+
241
+ For AI/ML workloads requiring large amounts of random data, dgen-py provides a **significant performance advantage** over traditional numpy-based approaches.
@@ -0,0 +1,201 @@
1
+ # Python Examples
2
+
3
+ Performance benchmark scripts to find optimal dgen-py settings for your system.
4
+
5
+ ## Quick Start
6
+
7
+ ```bash
8
+ # Create virtual environment with uv
9
+ uv venv --python 3.12
10
+ source .venv/bin/activate # or: .venv\Scripts\activate on Windows
11
+
12
+ # Build and install dgen-py
13
+ maturin develop --release
14
+
15
+ # Run quick 30-second test
16
+ python python/examples/quick_perf_test.py
17
+ ```
18
+
19
+ ## Scripts
20
+
21
+ ### 1. quick_perf_test.py - Fast Optimization (30 seconds)
22
+
23
+ Quick test to find the best settings for your system.
24
+
25
+ ```bash
26
+ python python/examples/quick_perf_test.py
27
+ ```
28
+
29
+ **Tests:**
30
+ - Default (auto-detect)
31
+ - Force NUMA mode
32
+ - NUMA disabled
33
+ - Half thread count
34
+ - Single thread baseline
35
+
36
+ **Output:** Ranked results with recommended configuration
37
+
38
+ ---
39
+
40
+ ### 2. benchmark_cpu_numa.py - Comprehensive Benchmark (5-10 minutes)
41
+
42
+ Deep performance analysis with 4 benchmark suites.
43
+
44
+ ```bash
45
+ python python/examples/benchmark_cpu_numa.py
46
+ ```
47
+
48
+ **Benchmark Suites:**
49
+
50
+ 1. **Thread Scaling** - Test 1,2,4,8,16 threads and all cores
51
+ 2. **NUMA Modes** - Compare auto/force/disabled
52
+ 3. **Compression Impact** - Test 1x, 2x, 3x, 5x compression ratios
53
+ 4. **Optimal Config** - Find best thread count + NUMA mode combination
54
+
55
+ **Output:**
56
+ - Per-suite performance charts (text-based)
57
+ - Detailed throughput tables
58
+ - Optimal configuration recommendations
59
+ - CSV results in `benchmark_results/`
60
+
61
+ ---
62
+
63
+ ## Example Output
64
+
65
+ ### quick_perf_test.py
66
+ ```
67
+ dgen-py Quick Performance Test
68
+ ==================================================
69
+
70
+ System: 1 NUMA node(s), 12 CPUs
71
+ → Single-socket system (UMA)
72
+
73
+ Running tests...
74
+ --------------------------------------------------
75
+
76
+ 1. Default (auto-detect)... 1.05 GB/s
77
+ 2. Force NUMA... 1.04 GB/s
78
+ 3. NUMA disabled... 1.08 GB/s
79
+ 4. Half threads (6)... 1.12 GB/s
80
+ 5. Single thread (baseline)... 0.73 GB/s
81
+
82
+ ==================================================
83
+ RESULTS (fastest to slowest):
84
+ ==================================================
85
+ ★ 1. Half threads (6) 1.12 GB/s
86
+ 2. NUMA disabled 1.08 GB/s
87
+ 3. Default (auto) 1.05 GB/s
88
+
89
+ ==================================================
90
+ RECOMMENDATION: Half threads (6)
91
+ Throughput: 1.12 GB/s
92
+ Code: dgen_py.generate_data(size, max_threads=6)
93
+ ==================================================
94
+ ```
95
+
96
+ ---
97
+
98
+ ## System Requirements
99
+
100
+ - **Python**: 3.8+ (3.12 recommended via uv)
101
+ - **dgen-py**: Built from source with `maturin develop --release`
102
+ - **OS**: Linux (best performance), macOS, Windows
103
+
104
+ ## Performance Tips
105
+
106
+ ### UMA Systems (Cloud VMs, Workstations)
107
+ - Use `numa_mode="disabled"` to avoid detection overhead
108
+ - Experiment with thread counts (often half or 3/4 of cores is optimal)
109
+ - Single-socket systems won't benefit from NUMA optimizations
110
+
111
+ ### NUMA Systems (Bare Metal, Multi-Socket)
112
+ - Use `numa_mode="auto"` (default) for intelligent detection
113
+ - Use `numa_mode="force"` to force optimizations
114
+ - Expect **30-50% throughput improvement** from thread pinning + first-touch
115
+ - Run benchmarks on actual hardware to measure gains
116
+
117
+ ### General
118
+ - Always test on your target hardware
119
+ - Compression ratio affects optimal thread count
120
+ - I/O-bound workloads may benefit from more threads
121
+ - CPU-bound workloads may saturate with fewer threads
122
+
123
+ ---
124
+
125
+ ## Interpreting Results
126
+
127
+ ### Throughput (GB/s)
128
+ - **< 1 GB/s**: Single thread or small dataset
129
+ - **1-5 GB/s**: Good multi-threaded UMA performance
130
+ - **5-10 GB/s**: Excellent UMA or good NUMA performance
131
+ - **10-20 GB/s**: Excellent NUMA with optimizations
132
+
133
+ ### Scaling Efficiency
134
+ ```python
135
+ efficiency = (throughput_N_threads / throughput_1_thread) / N
136
+ ```
137
+ - **> 0.8**: Excellent scaling (near-linear)
138
+ - **0.5-0.8**: Good scaling
139
+ - **< 0.5**: Poor scaling (reduce thread count)
140
+
141
+ ### NUMA vs UMA
142
+ - **UMA systems**: Force/Auto should show similar performance
143
+ - **NUMA systems**: Force should outperform Disabled by 30-50%
144
+
145
+ ---
146
+
147
+ ## Development
148
+
149
+ To modify these scripts:
150
+
151
+ ```bash
152
+ # Edit the scripts
153
+ vim python/examples/quick_perf_test.py
154
+
155
+ # Rebuild if you changed Rust code
156
+ maturin develop --release
157
+
158
+ # Re-run tests
159
+ python python/examples/quick_perf_test.py
160
+ ```
161
+
162
+ ---
163
+
164
+ ## Troubleshooting
165
+
166
+ ### ModuleNotFoundError: No module named 'dgen_py'
167
+
168
+ Run `maturin develop --release` to build and install the package.
169
+
170
+ ### ImportError: cannot import name 'generate_data'
171
+
172
+ Your dgen-py installation is outdated. Rebuild:
173
+ ```bash
174
+ maturin develop --release --force
175
+ ```
176
+
177
+ ### Low Performance on NUMA Systems
178
+
179
+ 1. Verify NUMA detection: `python -c "import dgen_py; print(dgen_py.get_system_info())"`
180
+ 2. Check `num_nodes` > 1
181
+ 3. Try `numa_mode="force"`
182
+ 4. Ensure running on actual NUMA hardware (not VM)
183
+
184
+ ### Performance Varies Between Runs
185
+
186
+ - Normal variation: ±5%
187
+ - Large variation: System is busy, close background apps
188
+ - Benchmark script runs warmup iterations to stabilize
189
+
190
+ ---
191
+
192
+ ## Contributing
193
+
194
+ Found optimal settings for your hardware? Share them!
195
+
196
+ Create an issue or PR with:
197
+ - Hardware specs (CPU, sockets, NUMA topology)
198
+ - Benchmark results
199
+ - Optimal configuration
200
+
201
+ This helps us improve auto-detection and recommendations.