dgen-py 0.1.2__cp310-cp310-manylinux_2_24_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dgen_py/__init__.py +167 -0
- dgen_py/__init__.pyi +61 -0
- dgen_py/_dgen_rs.cpython-310-x86_64-linux-gnu.so +0 -0
- dgen_py/docs/PERFORMANCE.md +241 -0
- dgen_py/examples/README.md +201 -0
- dgen_py/examples/benchmark_cpu_numa.py +299 -0
- dgen_py/examples/benchmark_vs_numpy.py +146 -0
- dgen_py/examples/demo.py +107 -0
- dgen_py/examples/quick_perf_test.py +107 -0
- dgen_py/examples/zero_copy_demo.py +97 -0
- dgen_py-0.1.2.dist-info/METADATA +271 -0
- dgen_py-0.1.2.dist-info/RECORD +14 -0
- dgen_py-0.1.2.dist-info/WHEEL +4 -0
- dgen_py-0.1.2.dist-info/licenses/LICENSE +39 -0
dgen_py/__init__.py
ADDED
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
"""
|
|
2
|
+
dgen-py: High-performance random data generation with NUMA optimization
|
|
3
|
+
|
|
4
|
+
TRUE ZERO-COPY: Uses Python buffer protocol for zero-copy access to generated data.
|
|
5
|
+
No memcpy between Rust and Python - same performance as numpy!
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from typing import Optional
|
|
9
|
+
import sys
|
|
10
|
+
|
|
11
|
+
# Import Rust extension module
|
|
12
|
+
try:
|
|
13
|
+
from ._dgen_rs import (
|
|
14
|
+
BytesView,
|
|
15
|
+
generate_buffer,
|
|
16
|
+
generate_into_buffer,
|
|
17
|
+
Generator,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
# Try to import NUMA info (may not be available on all platforms)
|
|
21
|
+
try:
|
|
22
|
+
from ._dgen_rs import get_numa_info
|
|
23
|
+
except ImportError:
|
|
24
|
+
get_numa_info = None
|
|
25
|
+
|
|
26
|
+
except ImportError as e:
|
|
27
|
+
raise ImportError(
|
|
28
|
+
f"Failed to import dgen-py Rust extension: {e}\n"
|
|
29
|
+
"Please ensure the package is properly installed:\n"
|
|
30
|
+
" pip install dgen-py\n"
|
|
31
|
+
"Or build from source:\n"
|
|
32
|
+
" cd dgen-rs && maturin develop --release"
|
|
33
|
+
)
|
|
34
|
+
|
|
35
|
+
# Keep in sync with the distribution metadata (this wheel is dgen_py-0.1.2).
__version__ = "0.1.2"
|
|
36
|
+
__all__ = [
|
|
37
|
+
"BytesView",
|
|
38
|
+
"generate_buffer",
|
|
39
|
+
"generate_data",
|
|
40
|
+
"generate_into_buffer",
|
|
41
|
+
"fill_buffer",
|
|
42
|
+
"Generator",
|
|
43
|
+
"get_numa_info",
|
|
44
|
+
"get_system_info",
|
|
45
|
+
]
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def generate_data(
|
|
49
|
+
size: int,
|
|
50
|
+
dedup_ratio: float = 1.0,
|
|
51
|
+
compress_ratio: float = 1.0,
|
|
52
|
+
numa_mode: str = "auto",
|
|
53
|
+
max_threads: Optional[int] = None,
|
|
54
|
+
):
|
|
55
|
+
"""
|
|
56
|
+
Generate random data with ZERO-COPY access via buffer protocol.
|
|
57
|
+
|
|
58
|
+
Returns a BytesView object that supports memoryview() for true zero-copy access.
|
|
59
|
+
No memcpy between Rust and Python - same memory is shared!
|
|
60
|
+
|
|
61
|
+
Args:
|
|
62
|
+
size: Total bytes to generate
|
|
63
|
+
dedup_ratio: Deduplication ratio (1.0 = no dedup, 2.0 = 2:1 ratio)
|
|
64
|
+
compress_ratio: Compression ratio (1.0 = incompressible, 3.0 = 3:1 ratio)
|
|
65
|
+
numa_mode: NUMA optimization - "auto" (default), "force", or "disabled"
|
|
66
|
+
max_threads: Maximum threads to use (None = use all cores)
|
|
67
|
+
|
|
68
|
+
Returns:
|
|
69
|
+
BytesView: Zero-copy buffer (use memoryview() or numpy.frombuffer() for access)
|
|
70
|
+
|
|
71
|
+
Example - Zero-copy with numpy (fastest):
|
|
72
|
+
>>> import dgen_py
|
|
73
|
+
>>> import numpy as np
|
|
74
|
+
>>>
|
|
75
|
+
>>> # Generate data (no copy)
|
|
76
|
+
>>> data = dgen_py.generate_data(1024 * 1024)
|
|
77
|
+
>>>
|
|
78
|
+
>>> # Create memoryview (no copy)
|
|
79
|
+
>>> view = memoryview(data)
|
|
80
|
+
>>>
|
|
81
|
+
>>> # Create numpy array (STILL no copy!)
|
|
82
|
+
>>> arr = np.frombuffer(view, dtype=np.uint8)
|
|
83
|
+
>>>
|
|
84
|
+
>>> # All three share the SAME memory - zero copies!
|
|
85
|
+
>>> len(data), len(view), len(arr)
|
|
86
|
+
(1048576, 1048576, 1048576)
|
|
87
|
+
|
|
88
|
+
Example - Get Python bytes (copies data):
|
|
89
|
+
>>> # If you need actual bytes object, call bytes()
|
|
90
|
+
>>> data_bytes = bytes(data) # This copies, but gives you bytes object
|
|
91
|
+
"""
|
|
92
|
+
return generate_buffer(size, dedup_ratio, compress_ratio, numa_mode, max_threads)
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def fill_buffer(
|
|
96
|
+
buffer,
|
|
97
|
+
dedup_ratio: float = 1.0,
|
|
98
|
+
compress_ratio: float = 1.0,
|
|
99
|
+
numa_mode: str = "auto",
|
|
100
|
+
max_threads: Optional[int] = None,
|
|
101
|
+
) -> int:
|
|
102
|
+
"""
|
|
103
|
+
Generate data directly into an existing buffer (zero-copy).
|
|
104
|
+
|
|
105
|
+
This is the most efficient API for pre-allocated buffers.
|
|
106
|
+
Works with bytearray, memoryview, numpy arrays, etc.
|
|
107
|
+
|
|
108
|
+
Args:
|
|
109
|
+
buffer: Pre-allocated writable buffer (supports buffer protocol)
|
|
110
|
+
dedup_ratio: Deduplication ratio
|
|
111
|
+
compress_ratio: Compression ratio
|
|
112
|
+
numa_mode: NUMA optimization - "auto" (default), "force", or "disabled"
|
|
113
|
+
max_threads: Maximum threads to use (None = use all cores)
|
|
114
|
+
|
|
115
|
+
Returns:
|
|
116
|
+
int: Number of bytes written
|
|
117
|
+
|
|
118
|
+
Example:
|
|
119
|
+
>>> import dgen_py
|
|
120
|
+
>>>
|
|
121
|
+
>>> # Pre-allocate buffer
|
|
122
|
+
>>> buf = bytearray(1024 * 1024)
|
|
123
|
+
>>>
|
|
124
|
+
>>> # Generate directly into buffer (zero-copy) using 4 threads
|
|
125
|
+
>>> nbytes = dgen_py.fill_buffer(buf, compress_ratio=2.0, max_threads=4)
|
|
126
|
+
>>> print(f"Wrote {nbytes} bytes")
|
|
127
|
+
|
|
128
|
+
>>> # Works with numpy arrays
|
|
129
|
+
>>> import numpy as np
|
|
130
|
+
>>> arr = np.zeros(1024 * 1024, dtype=np.uint8)
|
|
131
|
+
>>> nbytes = dgen_py.fill_buffer(arr, dedup_ratio=2.0)
|
|
132
|
+
"""
|
|
133
|
+
return generate_into_buffer(buffer, dedup_ratio, compress_ratio, numa_mode, max_threads)
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def get_system_info() -> Optional[dict]:
|
|
137
|
+
"""
|
|
138
|
+
Get NUMA topology information (if available).
|
|
139
|
+
|
|
140
|
+
Returns:
|
|
141
|
+
dict: NUMA info with keys:
|
|
142
|
+
- num_nodes: Number of NUMA nodes
|
|
143
|
+
- physical_cores: Total physical cores
|
|
144
|
+
- logical_cpus: Total logical CPUs
|
|
145
|
+
- is_uma: Whether this is a UMA system
|
|
146
|
+
- deployment_type: Description of deployment type
|
|
147
|
+
None: If NUMA detection is not available on this platform
|
|
148
|
+
|
|
149
|
+
Example:
|
|
150
|
+
>>> info = dgen_py.get_system_info()
|
|
151
|
+
>>> if info:
|
|
152
|
+
... print(f"NUMA nodes: {info['num_nodes']}")
|
|
153
|
+
... print(f"Cores: {info['physical_cores']}")
|
|
154
|
+
... print(f"Type: {info['deployment_type']}")
|
|
155
|
+
"""
|
|
156
|
+
if get_numa_info is None:
|
|
157
|
+
return None
|
|
158
|
+
|
|
159
|
+
try:
|
|
160
|
+
return get_numa_info()
|
|
161
|
+
except Exception:
|
|
162
|
+
return None
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
# Convenience alias
|
|
166
|
+
StreamingGenerator = Generator
|
|
167
|
+
|
dgen_py/__init__.pyi
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
"""Type stubs for dgen-py"""
|
|
2
|
+
|
|
3
|
+
from typing import Optional
|
|
4
|
+
|
|
5
|
+
class BytesView:
    """Zero-copy view over data generated by the Rust extension.

    The package's ``generate_data`` wrapper documents this type as its
    return value and uses it with ``len()`` and ``memoryview()``.
    """

    def __len__(self) -> int:
        """Return the number of bytes in the view."""
        ...

    def __buffer__(self, flags: int, /) -> memoryview:
        """Expose the data through the buffer protocol (``memoryview(view)``)."""
        ...


def generate_buffer(
    size: int,
    dedup_ratio: float = 1.0,
    compress_ratio: float = 1.0,
    # The package's own wrapper passes these two positionally, so they must be
    # part of the signature. Defaults mirror that wrapper.
    # NOTE(review): confirm the defaults against the Rust signature.
    numa_mode: str = "auto",
    max_threads: Optional[int] = None,
) -> BytesView:
    """Generate random data with controllable characteristics.

    Args:
        size: Total bytes to generate.
        dedup_ratio: Deduplication ratio (1.0 = no dedup).
        compress_ratio: Compression ratio (1.0 = incompressible).
        numa_mode: NUMA optimization - "auto", "force", or "disabled".
        max_threads: Maximum threads to use (None = use all cores).

    Returns:
        BytesView: Buffer-protocol object holding the generated data.
    """
    ...
|
|
12
|
+
|
|
13
|
+
def generate_into_buffer(
    buffer,
    dedup_ratio: float = 1.0,
    compress_ratio: float = 1.0,
    # The package's ``fill_buffer`` wrapper passes these two positionally, so
    # they must be part of the signature. Defaults mirror that wrapper.
    # NOTE(review): confirm the defaults against the Rust signature.
    numa_mode: str = "auto",
    max_threads: Optional[int] = None,
) -> int:
    """Generate data directly into an existing buffer (zero-copy).

    Args:
        buffer: Pre-allocated writable object supporting the buffer protocol.
        dedup_ratio: Deduplication ratio (1.0 = no dedup).
        compress_ratio: Compression ratio (1.0 = incompressible).
        numa_mode: NUMA optimization - "auto", "force", or "disabled".
        max_threads: Maximum threads to use (None = use all cores).

    Returns:
        int: Number of bytes written.
    """
    ...
|
|
20
|
+
|
|
21
|
+
class Generator:
|
|
22
|
+
"""Streaming data generator"""
|
|
23
|
+
|
|
24
|
+
def __init__(
|
|
25
|
+
self,
|
|
26
|
+
size: int,
|
|
27
|
+
dedup_ratio: float = 1.0,
|
|
28
|
+
compress_ratio: float = 1.0,
|
|
29
|
+
numa_mode: str = "auto",
|
|
30
|
+
max_threads: Optional[int] = None
|
|
31
|
+
) -> None:
|
|
32
|
+
"""Create new generator"""
|
|
33
|
+
...
|
|
34
|
+
|
|
35
|
+
def fill_chunk(self, buffer) -> int:
|
|
36
|
+
"""Fill next chunk into buffer"""
|
|
37
|
+
...
|
|
38
|
+
|
|
39
|
+
def get_chunk(self, chunk_size: int) -> Optional[bytes]:
|
|
40
|
+
"""Get next chunk as bytes"""
|
|
41
|
+
...
|
|
42
|
+
|
|
43
|
+
def reset(self) -> None:
|
|
44
|
+
"""Reset to start"""
|
|
45
|
+
...
|
|
46
|
+
|
|
47
|
+
def position(self) -> int:
|
|
48
|
+
"""Get current position"""
|
|
49
|
+
...
|
|
50
|
+
|
|
51
|
+
def total_size(self) -> int:
|
|
52
|
+
"""Get total size"""
|
|
53
|
+
...
|
|
54
|
+
|
|
55
|
+
def is_complete(self) -> bool:
|
|
56
|
+
"""Check if complete"""
|
|
57
|
+
...
|
|
58
|
+
|
|
59
|
+
def get_numa_info() -> dict:
|
|
60
|
+
"""Get NUMA topology information"""
|
|
61
|
+
...
|
|
Binary file
|
|
@@ -0,0 +1,241 @@
|
|
|
1
|
+
# Performance Benchmarks
|
|
2
|
+
|
|
3
|
+
## dgen-py vs Numpy Random Generation
|
|
4
|
+
|
|
5
|
+
**Test System**: 12-core UMA system (single NUMA node)
|
|
6
|
+
**Date**: January 8, 2026
|
|
7
|
+
**Python**: 3.12
|
|
8
|
+
**Numpy**: Latest
|
|
9
|
+
**Test Method**: 5 runs per size, averaged
|
|
10
|
+
|
|
11
|
+
---
|
|
12
|
+
|
|
13
|
+
## Benchmark Results
|
|
14
|
+
|
|
15
|
+
### Performance Comparison Table
|
|
16
|
+
|
|
17
|
+
| Size | Method | Time (ms) | Throughput | vs dgen-py |
|
|
18
|
+
|---------|-------------------------|-----------|------------|------------|
|
|
19
|
+
| **1 MiB** | dgen-py | 2.7 ms | 0.39 GB/s | baseline |
|
|
20
|
+
| | numpy.random.randint | 11.8 ms | 0.09 GB/s | **4.36x** |
|
|
21
|
+
| | numpy.random.bytes | 1.1 ms | 0.98 GB/s | 0.39x |
|
|
22
|
+
| **10 MiB** | dgen-py | 7.1 ms | 1.48 GB/s | baseline |
|
|
23
|
+
| | numpy.random.randint | 15.3 ms | 0.69 GB/s | **2.15x** |
|
|
24
|
+
| | numpy.random.bytes | 11.4 ms | 0.92 GB/s | **1.60x** |
|
|
25
|
+
| **100 MiB** | dgen-py | 14.4 ms | 7.26 GB/s | baseline |
|
|
26
|
+
| | numpy.random.randint | 152.6 ms | 0.69 GB/s | **10.56x** |
|
|
27
|
+
| | numpy.random.bytes | 219.0 ms | 0.48 GB/s | **15.16x** |
|
|
28
|
+
| **500 MiB** | dgen-py | 59.2 ms | 8.86 GB/s | baseline |
|
|
29
|
+
| | numpy.random.randint | 763.8 ms | 0.69 GB/s | **12.91x** |
|
|
30
|
+
| | numpy.random.bytes | 1097.8 ms | 0.48 GB/s | **18.56x** |
|
|
31
|
+
|
|
32
|
+
---
|
|
33
|
+
|
|
34
|
+
## Key Findings
|
|
35
|
+
|
|
36
|
+
### 🚀 Performance Summary
|
|
37
|
+
|
|
38
|
+
- **Average speedup vs numpy.random.randint**: **7.50x faster**
|
|
39
|
+
- **Average speedup vs numpy.random.bytes**: **8.93x faster**
|
|
40
|
+
- **Peak throughput**: **8.86 GB/s** (at 500 MiB)
|
|
41
|
+
- **Best speedup**: **18.56x faster** than numpy.random.bytes at 500 MiB
|
|
42
|
+
|
|
43
|
+
### 📊 Throughput Scaling
|
|
44
|
+
|
|
45
|
+
```
|
|
46
|
+
Size dgen-py numpy.randint numpy.bytes Speedup (best)
|
|
47
|
+
--------------------------------------------------------------
|
|
48
|
+
1 MiB 0.39 GB/s 0.09 GB/s 0.98 GB/s 0.39x (slower)
|
|
49
|
+
10 MiB 1.48 GB/s 0.69 GB/s 0.92 GB/s 1.60x
|
|
50
|
+
100 MiB 7.26 GB/s 0.69 GB/s 0.48 GB/s 15.16x
|
|
51
|
+
500 MiB 8.86 GB/s 0.69 GB/s 0.48 GB/s 18.56x
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
**Observation**: dgen-py throughput rises with data size (from 0.39 GB/s at 1 MiB to 8.86 GB/s at 500 MiB) due to multi-threading, while numpy throughput plateaus or drops (single-threaded).
|
|
55
|
+
|
|
56
|
+
---
|
|
57
|
+
|
|
58
|
+
## Why dgen-py is Faster
|
|
59
|
+
|
|
60
|
+
### 1. **Superior RNG Algorithm**
|
|
61
|
+
- **dgen-py**: Xoshiro256++ (fastest high-quality RNG)
|
|
62
|
+
- **numpy**: MT19937 (Mersenne Twister, slower but proven), the generator behind the legacy `numpy.random.*` functions benchmarked here
|
|
63
|
+
- Xoshiro256++ provides ~2x raw speed advantage
|
|
64
|
+
|
|
65
|
+
### 2. **Multi-Threading**
|
|
66
|
+
- **dgen-py**: Rayon-based parallel generation across all cores
|
|
67
|
+
- **numpy**: Single-threaded random number generation
|
|
68
|
+
- Linear scaling on multi-core systems (12 cores = ~12x potential)
|
|
69
|
+
|
|
70
|
+
### 3. **Zero-Copy Architecture**
|
|
71
|
+
- **dgen-py**: Buffer protocol (`__getbuffer__`) for direct memory access
|
|
72
|
+
- **numpy**: Must allocate numpy array, copy data
|
|
73
|
+
- Eliminates allocation overhead and memcpy latency
|
|
74
|
+
|
|
75
|
+
### 4. **Optimized for Bulk Generation**
|
|
76
|
+
- Pre-allocated buffers
|
|
77
|
+
- Cache-friendly memory access patterns
|
|
78
|
+
- First-touch NUMA locality (on NUMA systems)
|
|
79
|
+
|
|
80
|
+
---
|
|
81
|
+
|
|
82
|
+
## When to Use Each
|
|
83
|
+
|
|
84
|
+
### Use dgen-py when:
|
|
85
|
+
✅ Generating **large datasets** (100+ MiB)
|
|
86
|
+
✅ Need **maximum throughput** (AI/ML data generation)
|
|
87
|
+
✅ Working with **binary data** (files, network buffers)
|
|
88
|
+
✅ Have **multi-core system** to leverage parallelism
|
|
89
|
+
✅ Need **zero-copy integration** with other tools
|
|
90
|
+
|
|
91
|
+
### Use numpy.random when:
|
|
92
|
+
✅ Generating **small arrays** (<10 MiB)
|
|
93
|
+
✅ Need **statistical distributions** (normal, poisson, etc.)
|
|
94
|
+
✅ Need **specific random seeds** for reproducibility
|
|
95
|
+
✅ Integration with **numpy-centric workflow**
|
|
96
|
+
✅ Need **element-wise operations** on random data
|
|
97
|
+
|
|
98
|
+
---
|
|
99
|
+
|
|
100
|
+
## Zero-Copy Verification
|
|
101
|
+
|
|
102
|
+
### Memory Access Pattern
|
|
103
|
+
|
|
104
|
+
```python
|
|
105
|
+
import dgen_py
|
|
106
|
+
import numpy as np
|
|
107
|
+
|
|
108
|
+
# Generate data (Rust allocation)
|
|
109
|
+
data = dgen_py.generate_data(100 * 1024 * 1024) # 100 MiB
|
|
110
|
+
|
|
111
|
+
# Create memoryview (zero-copy, <2 µs)
|
|
112
|
+
view = memoryview(data)
|
|
113
|
+
|
|
114
|
+
# Create numpy array (zero-copy, <10 µs)
|
|
115
|
+
arr = np.frombuffer(view, dtype=np.uint8)
|
|
116
|
+
|
|
117
|
+
# All three share THE SAME memory:
|
|
118
|
+
assert len(data) == len(view) == len(arr) # 104,857,600 bytes
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
**Memory Overhead**:
|
|
122
|
+
- **With copy**: 300 MiB (3 allocations)
|
|
123
|
+
- **Zero-copy**: 100 MiB (1 allocation)
|
|
124
|
+
- **Savings**: ~67% memory reduction (200 of 300 MiB)
|
|
125
|
+
|
|
126
|
+
**Performance Overhead**:
|
|
127
|
+
- Memoryview creation: **~1 µs**
|
|
128
|
+
- Numpy array creation: **~8 µs**
|
|
129
|
+
- **Total**: <10 µs (negligible compared to generation time)
|
|
130
|
+
|
|
131
|
+
---
|
|
132
|
+
|
|
133
|
+
## Comparison to Other Tools
|
|
134
|
+
|
|
135
|
+
### Throughput Comparison (500 MiB test)
|
|
136
|
+
|
|
137
|
+
| Tool | Throughput | Notes |
|
|
138
|
+
|-------------------------|------------|---------------------------------|
|
|
139
|
+
| **dgen-py** | 8.86 GB/s | Multi-threaded, zero-copy |
|
|
140
|
+
| numpy.random.bytes | 0.48 GB/s | Single-threaded |
|
|
141
|
+
| numpy.random.randint | 0.69 GB/s | Single-threaded + array overhead|
|
|
142
|
+
| dd if=/dev/urandom | ~0.05 GB/s | Kernel RNG (cryptographic) |
|
|
143
|
+
| Rust (native) | 12.5 GB/s | Direct benchmark, no Python overhead |
|
|
144
|
+
|
|
145
|
+
**Note**: dgen-py achieves **70% of native Rust performance** through Python, demonstrating the effectiveness of zero-copy design.
|
|
146
|
+
|
|
147
|
+
---
|
|
148
|
+
|
|
149
|
+
## Technical Details
|
|
150
|
+
|
|
151
|
+
### Test Configuration
|
|
152
|
+
|
|
153
|
+
```python
|
|
154
|
+
# Benchmark parameters
|
|
155
|
+
sizes = [n * 1024 * 1024 for n in (1, 10, 100, 500)]  # 1, 10, 100, 500 MiB
|
|
156
|
+
runs_per_size = 5
|
|
157
|
+
numa_mode = "auto" # Auto-detect NUMA topology
|
|
158
|
+
max_threads = None # Use all available cores (12)
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
### System Configuration
|
|
162
|
+
|
|
163
|
+
- **CPU**: 12 cores (UMA, single NUMA node)
|
|
164
|
+
- **Memory**: Standard system allocator
|
|
165
|
+
- **Python**: 3.12 with uv virtual environment
|
|
166
|
+
- **Compiler**: rustc 1.91+ with LTO enabled
|
|
167
|
+
- **Build**: `maturin develop --release`
|
|
168
|
+
|
|
169
|
+
### Benchmark Script
|
|
170
|
+
|
|
171
|
+
Run the benchmark yourself:
|
|
172
|
+
|
|
173
|
+
```bash
|
|
174
|
+
cd dgen-rs
|
|
175
|
+
python python/examples/benchmark_vs_numpy.py
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
---
|
|
179
|
+
|
|
180
|
+
## Recommendations for Production
|
|
181
|
+
|
|
182
|
+
### For AI/ML Training Data Generation
|
|
183
|
+
|
|
184
|
+
```python
|
|
185
|
+
import dgen_py
|
|
186
|
+
import numpy as np
|
|
187
|
+
|
|
188
|
+
# Generate 1 GB of random data
|
|
189
|
+
data = dgen_py.generate_data(1024 * 1024 * 1024, numa_mode="auto")
|
|
190
|
+
|
|
191
|
+
# Zero-copy conversion to numpy for processing
|
|
192
|
+
view = memoryview(data)
|
|
193
|
+
arr = np.frombuffer(view, dtype=np.float32) # Reinterpret as float32
|
|
194
|
+
|
|
195
|
+
# Reshape for model input (e.g., 256x256 RGB images)
|
|
196
|
+
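images = arr[: arr.size // (256 * 256 * 3) * (256 * 256 * 3)].reshape(-1, 256, 256, 3)  # trim the tail so the length divides into whole images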
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
**Expected Performance**: ~8-10 GB/s on typical workstation
|
|
200
|
+
|
|
201
|
+
### For Storage Benchmarking
|
|
202
|
+
|
|
203
|
+
```python
|
|
204
|
+
import dgen_py
|
|
205
|
+
|
|
206
|
+
# Generate incompressible data for realistic I/O testing
|
|
207
|
+
data = dgen_py.generate_data(
|
|
208
|
+
size=10 * 1024**3, # 10 GB
|
|
209
|
+
compress_ratio=1.0, # Incompressible
|
|
210
|
+
dedup_ratio=1.0, # No deduplication
|
|
211
|
+
numa_mode="force" # Force NUMA optimizations
|
|
212
|
+
)
|
|
213
|
+
|
|
214
|
+
# Write to storage (data is bytes-like, works with file I/O)
|
|
215
|
+
with open('/mnt/storage/testfile.bin', 'wb') as f:
|
|
216
|
+
f.write(memoryview(data)) # Zero-copy write via the buffer protocol (bytes(data) would copy all 10 GB)
|
|
217
|
+
```
|
|
218
|
+
|
|
219
|
+
---
|
|
220
|
+
|
|
221
|
+
## Future Optimizations
|
|
222
|
+
|
|
223
|
+
Potential improvements for even better performance:
|
|
224
|
+
|
|
225
|
+
1. **SIMD Instructions**: AVX-512 for 2-4x speedup on modern CPUs
|
|
226
|
+
2. **GPU Generation**: CUDA/ROCm for 100+ GB/s on high-end GPUs
|
|
227
|
+
3. **Async I/O Integration**: Direct-to-disk generation without intermediate buffers
|
|
228
|
+
4. **Custom Allocators**: jemalloc/mimalloc for better multi-threaded allocation
|
|
229
|
+
|
|
230
|
+
---
|
|
231
|
+
|
|
232
|
+
## Conclusion
|
|
233
|
+
|
|
234
|
+
dgen-py delivers **production-grade performance** for random data generation:
|
|
235
|
+
|
|
236
|
+
- ✅ **7-18x faster** than numpy for bulk generation
|
|
237
|
+
- ✅ **True zero-copy** via Python buffer protocol
|
|
238
|
+
- ✅ **Multi-threaded** scaling on modern CPUs
|
|
239
|
+
- ✅ **Competitive with native Rust** (70% of raw performance)
|
|
240
|
+
|
|
241
|
+
For AI/ML workloads requiring large amounts of random data, dgen-py provides a **significant performance advantage** over traditional numpy-based approaches.
|
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
# Python Examples
|
|
2
|
+
|
|
3
|
+
Performance benchmark scripts to find optimal dgen-py settings for your system.
|
|
4
|
+
|
|
5
|
+
## Quick Start
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
# Create virtual environment with uv
|
|
9
|
+
uv venv --python 3.12
|
|
10
|
+
source .venv/bin/activate # or: .venv\Scripts\activate on Windows
|
|
11
|
+
|
|
12
|
+
# Build and install dgen-py
|
|
13
|
+
maturin develop --release
|
|
14
|
+
|
|
15
|
+
# Run quick 30-second test
|
|
16
|
+
python python/examples/quick_perf_test.py
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
## Scripts
|
|
20
|
+
|
|
21
|
+
### 1. quick_perf_test.py - Fast Optimization (30 seconds)
|
|
22
|
+
|
|
23
|
+
Quick test to find the best settings for your system.
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
python python/examples/quick_perf_test.py
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
**Tests:**
|
|
30
|
+
- Default (auto-detect)
|
|
31
|
+
- Force NUMA mode
|
|
32
|
+
- NUMA disabled
|
|
33
|
+
- Half thread count
|
|
34
|
+
- Single thread baseline
|
|
35
|
+
|
|
36
|
+
**Output:** Ranked results with recommended configuration
|
|
37
|
+
|
|
38
|
+
---
|
|
39
|
+
|
|
40
|
+
### 2. benchmark_cpu_numa.py - Comprehensive Benchmark (5-10 minutes)
|
|
41
|
+
|
|
42
|
+
Deep performance analysis with 4 benchmark suites.
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
python python/examples/benchmark_cpu_numa.py
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
**Benchmark Suites:**
|
|
49
|
+
|
|
50
|
+
1. **Thread Scaling** - Test 1, 2, 4, 8, and 16 threads, plus all cores
|
|
51
|
+
2. **NUMA Modes** - Compare auto/force/disabled
|
|
52
|
+
3. **Compression Impact** - Test 1x, 2x, 3x, 5x compression ratios
|
|
53
|
+
4. **Optimal Config** - Find best thread count + NUMA mode combination
|
|
54
|
+
|
|
55
|
+
**Output:**
|
|
56
|
+
- Per-suite performance charts (text-based)
|
|
57
|
+
- Detailed throughput tables
|
|
58
|
+
- Optimal configuration recommendations
|
|
59
|
+
- CSV results in `benchmark_results/`
|
|
60
|
+
|
|
61
|
+
---
|
|
62
|
+
|
|
63
|
+
## Example Output
|
|
64
|
+
|
|
65
|
+
### quick_perf_test.py
|
|
66
|
+
```
|
|
67
|
+
dgen-py Quick Performance Test
|
|
68
|
+
==================================================
|
|
69
|
+
|
|
70
|
+
System: 1 NUMA node(s), 12 CPUs
|
|
71
|
+
→ Single-socket system (UMA)
|
|
72
|
+
|
|
73
|
+
Running tests...
|
|
74
|
+
--------------------------------------------------
|
|
75
|
+
|
|
76
|
+
1. Default (auto-detect)... 1.05 GB/s
|
|
77
|
+
2. Force NUMA... 1.04 GB/s
|
|
78
|
+
3. NUMA disabled... 1.08 GB/s
|
|
79
|
+
4. Half threads (6)... 1.12 GB/s
|
|
80
|
+
5. Single thread (baseline)... 0.73 GB/s
|
|
81
|
+
|
|
82
|
+
==================================================
|
|
83
|
+
RESULTS (fastest to slowest):
|
|
84
|
+
==================================================
|
|
85
|
+
★ 1. Half threads (6) 1.12 GB/s
|
|
86
|
+
2. NUMA disabled 1.08 GB/s
|
|
87
|
+
3. Default (auto) 1.05 GB/s
|
|
88
|
+
|
|
89
|
+
==================================================
|
|
90
|
+
RECOMMENDATION: Half threads (6)
|
|
91
|
+
Throughput: 1.12 GB/s
|
|
92
|
+
Code: dgen_py.generate_data(size, max_threads=6)
|
|
93
|
+
==================================================
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
---
|
|
97
|
+
|
|
98
|
+
## System Requirements
|
|
99
|
+
|
|
100
|
+
- **Python**: 3.8+ (3.12 recommended via uv)
|
|
101
|
+
- **dgen-py**: Built from source with `maturin develop --release`
|
|
102
|
+
- **OS**: Linux (best performance), macOS, Windows
|
|
103
|
+
|
|
104
|
+
## Performance Tips
|
|
105
|
+
|
|
106
|
+
### UMA Systems (Cloud VMs, Workstations)
|
|
107
|
+
- Use `numa_mode="disabled"` to avoid detection overhead
|
|
108
|
+
- Experiment with thread counts (often half or 3/4 of cores is optimal)
|
|
109
|
+
- Single-socket systems won't benefit from NUMA optimizations
|
|
110
|
+
|
|
111
|
+
### NUMA Systems (Bare Metal, Multi-Socket)
|
|
112
|
+
- Use `numa_mode="auto"` (default) for intelligent detection
|
|
113
|
+
- Use `numa_mode="force"` to force optimizations
|
|
114
|
+
- Expect **30-50% throughput improvement** from thread pinning + first-touch
|
|
115
|
+
- Run benchmarks on actual hardware to measure gains
|
|
116
|
+
|
|
117
|
+
### General
|
|
118
|
+
- Always test on your target hardware
|
|
119
|
+
- Compression ratio affects optimal thread count
|
|
120
|
+
- I/O-bound workloads may benefit from more threads
|
|
121
|
+
- CPU-bound workloads may saturate with fewer threads
|
|
122
|
+
|
|
123
|
+
---
|
|
124
|
+
|
|
125
|
+
## Interpreting Results
|
|
126
|
+
|
|
127
|
+
### Throughput (GB/s)
|
|
128
|
+
- **< 1 GB/s**: Single thread or small dataset
|
|
129
|
+
- **1-5 GB/s**: Good multi-threaded UMA performance
|
|
130
|
+
- **5-10 GB/s**: Excellent UMA or good NUMA performance
|
|
131
|
+
- **10-20 GB/s**: Excellent NUMA with optimizations
|
|
132
|
+
|
|
133
|
+
### Scaling Efficiency
|
|
134
|
+
```python
|
|
135
|
+
efficiency = (throughput_N_threads / throughput_1_thread) / N
|
|
136
|
+
```
|
|
137
|
+
- **> 0.8**: Excellent scaling (near-linear)
|
|
138
|
+
- **0.5-0.8**: Good scaling
|
|
139
|
+
- **< 0.5**: Poor scaling (reduce thread count)
|
|
140
|
+
|
|
141
|
+
### NUMA vs UMA
|
|
142
|
+
- **UMA systems**: Auto, Force, and Disabled should all show similar performance
|
|
143
|
+
- **NUMA systems**: Force should outperform Disabled by 30-50%
|
|
144
|
+
|
|
145
|
+
---
|
|
146
|
+
|
|
147
|
+
## Development
|
|
148
|
+
|
|
149
|
+
To modify these scripts:
|
|
150
|
+
|
|
151
|
+
```bash
|
|
152
|
+
# Edit the scripts
|
|
153
|
+
vim python/examples/quick_perf_test.py
|
|
154
|
+
|
|
155
|
+
# Rebuild if you changed Rust code
|
|
156
|
+
maturin develop --release
|
|
157
|
+
|
|
158
|
+
# Re-run tests
|
|
159
|
+
python python/examples/quick_perf_test.py
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
---
|
|
163
|
+
|
|
164
|
+
## Troubleshooting
|
|
165
|
+
|
|
166
|
+
### ModuleNotFoundError: No module named 'dgen_py'
|
|
167
|
+
|
|
168
|
+
Run `maturin develop --release` to build and install the package.
|
|
169
|
+
|
|
170
|
+
### ImportError: cannot import name 'generate_data'
|
|
171
|
+
|
|
172
|
+
Your dgen-py installation is outdated. Rebuild:
|
|
173
|
+
```bash
|
|
174
|
+
maturin develop --release --force
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
### Low Performance on NUMA Systems
|
|
178
|
+
|
|
179
|
+
1. Verify NUMA detection: `python -c "import dgen_py; print(dgen_py.get_system_info())"`
|
|
180
|
+
2. Check `num_nodes` > 1
|
|
181
|
+
3. Try `numa_mode="force"`
|
|
182
|
+
4. Ensure running on actual NUMA hardware (not VM)
|
|
183
|
+
|
|
184
|
+
### Performance Varies Between Runs
|
|
185
|
+
|
|
186
|
+
- Normal variation: ±5%
|
|
187
|
+
- Large variation: System is busy, close background apps
|
|
188
|
+
- Benchmark script runs warmup iterations to stabilize
|
|
189
|
+
|
|
190
|
+
---
|
|
191
|
+
|
|
192
|
+
## Contributing
|
|
193
|
+
|
|
194
|
+
Found optimal settings for your hardware? Share them!
|
|
195
|
+
|
|
196
|
+
Create an issue or PR with:
|
|
197
|
+
- Hardware specs (CPU, sockets, NUMA topology)
|
|
198
|
+
- Benchmark results
|
|
199
|
+
- Optimal configuration
|
|
200
|
+
|
|
201
|
+
This helps us improve auto-detection and recommendations.
|