dgen-py 0.1.2__cp310-cp310-manylinux_2_24_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dgen_py/__init__.py +167 -0
- dgen_py/__init__.pyi +61 -0
- dgen_py/_dgen_rs.cpython-310-x86_64-linux-gnu.so +0 -0
- dgen_py/docs/PERFORMANCE.md +241 -0
- dgen_py/examples/README.md +201 -0
- dgen_py/examples/benchmark_cpu_numa.py +299 -0
- dgen_py/examples/benchmark_vs_numpy.py +146 -0
- dgen_py/examples/demo.py +107 -0
- dgen_py/examples/quick_perf_test.py +107 -0
- dgen_py/examples/zero_copy_demo.py +97 -0
- dgen_py-0.1.2.dist-info/METADATA +271 -0
- dgen_py-0.1.2.dist-info/RECORD +14 -0
- dgen_py-0.1.2.dist-info/WHEEL +4 -0
- dgen_py-0.1.2.dist-info/licenses/LICENSE +39 -0
|
@@ -0,0 +1,299 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Performance Benchmark: CPU and NUMA Configuration
|
|
4
|
+
==================================================
|
|
5
|
+
|
|
6
|
+
This script helps you find the optimal CPU and NUMA settings for your system.
|
|
7
|
+
Run this to discover:
|
|
8
|
+
- Optimal thread count for your workload
|
|
9
|
+
- Whether NUMA optimizations help (bare metal) or hurt (cloud VM)
|
|
10
|
+
- Baseline single-core performance
|
|
11
|
+
- Memory bandwidth limits
|
|
12
|
+
|
|
13
|
+
Usage:
|
|
14
|
+
python benchmark_cpu_numa.py
|
|
15
|
+
|
|
16
|
+
Requirements:
|
|
17
|
+
pip install dgen-py
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
import time
|
|
21
|
+
import sys
|
|
22
|
+
import os
|
|
23
|
+
from typing import Optional, List, Tuple
|
|
24
|
+
|
|
25
|
+
try:
|
|
26
|
+
import dgen_py
|
|
27
|
+
except ImportError:
|
|
28
|
+
print("ERROR: dgen-py not installed")
|
|
29
|
+
print("Install with: pip install dgen-py")
|
|
30
|
+
print("Or build from source: cd dgen-rs && maturin develop --release")
|
|
31
|
+
sys.exit(1)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class PerformanceBenchmark:
    """Comprehensive CPU and NUMA performance benchmark.

    Generates data through ``dgen_py.generate_data`` under a matrix of
    thread-count / NUMA-mode / compression settings, records the measured
    throughput of every run in ``self.results``, and finally prints a ranked
    summary with a configuration recommendation.
    """

    def __init__(self, size_mb: int = 100):
        """Create a benchmark that generates ``size_mb`` MiB per test run."""
        self.size = size_mb * 1024 * 1024  # payload size in bytes per run
        self.results = []  # one dict per completed test; see run_test()

    def run_test(
        self,
        name: str,
        dedup_ratio: float = 1.0,
        compress_ratio: float = 1.0,
        numa_mode: str = "auto",
        max_threads: Optional[int] = None,
        iterations: int = 3
    ) -> Tuple[float, float]:
        """Run one configuration ``iterations`` times; return (GB/s, avg seconds).

        Wall-clock times are averaged over all iterations. Each completed
        test is also appended to ``self.results`` so ``print_summary`` can
        rank configurations later.

        Raises:
            ValueError: if ``iterations`` < 1 (averaging would otherwise
                divide by zero).
        """
        if iterations < 1:
            # Fix: the original divided by len(times) unconditionally, which
            # raised ZeroDivisionError for iterations == 0.
            raise ValueError("iterations must be >= 1")

        times = []
        for _ in range(iterations):
            start = time.perf_counter()
            data = dgen_py.generate_data(
                self.size,
                dedup_ratio=dedup_ratio,
                compress_ratio=compress_ratio,
                numa_mode=numa_mode,
                max_threads=max_threads
            )
            elapsed = time.perf_counter() - start
            times.append(elapsed)

            # Sanity-check that the generator produced the requested size.
            if len(data) != self.size:
                print(f"WARNING: Expected {self.size} bytes, got {len(data)}")

        avg_time = sum(times) / len(times)
        throughput = (self.size / avg_time) / 1e9  # GB/s (decimal gigabytes)

        self.results.append({
            'name': name,
            'throughput': throughput,
            'time': avg_time,
            'threads': max_threads or 'all',
            'numa_mode': numa_mode,
            'dedup': dedup_ratio,
            'compress': compress_ratio
        })

        return throughput, avg_time

    def print_result(self, name: str, throughput: float, time_sec: float,
                     threads: Optional[int] = None, numa_mode: str = "auto"):
        """Print a single test result on one aligned line."""
        threads_str = f"{threads} threads" if threads else "all cores"
        print(f" {name:40s} {throughput:8.2f} GB/s ({time_sec:.3f}s, {threads_str}, numa={numa_mode})")

    def benchmark_thread_scaling(self):
        """Test performance with different thread counts.

        Prints per-count throughput plus speedup/efficiency relative to the
        single-threaded baseline.
        """
        print("\n" + "="*80)
        print("THREAD SCALING BENCHMARK")
        print("="*80)
        print("\nTesting different thread counts to find optimal configuration...")
        print("(All tests: incompressible data, no dedup, NUMA=auto)\n")

        # Get CPU count for intelligent thread selection
        import multiprocessing
        cpu_count = multiprocessing.cpu_count()

        # Fix: only test thread counts the machine actually has; the
        # original always tested 2 and 4 even on 1-2 core machines.
        thread_counts: List[Optional[int]] = [
            n for n in (1, 2, 4, 8, 16) if n <= cpu_count
        ]
        if not thread_counts:
            thread_counts = [1]
        thread_counts.append(None)  # None means "all cores"

        baseline_throughput = None

        for threads in thread_counts:
            name = f"Threads: {threads if threads else cpu_count} ({'baseline' if threads == 1 else 'parallel'})"
            throughput, elapsed = self.run_test(name, max_threads=threads)
            self.print_result(name, throughput, elapsed, threads)

            if threads == 1:
                baseline_throughput = throughput
            elif baseline_throughput:
                speedup = throughput / baseline_throughput
                # Efficiency = speedup per core, as a percentage.
                efficiency = (speedup / (threads or cpu_count)) * 100
                print(f" └─> Speedup: {speedup:.2f}x, Efficiency: {efficiency:.1f}%")

    def benchmark_numa_modes(self):
        """Test the three NUMA configurations (auto / force / disabled)."""
        print("\n" + "="*80)
        print("NUMA MODE BENCHMARK")
        print("="*80)
        print("\nTesting NUMA optimization modes...")
        print("(All tests: incompressible data, no dedup, all cores)\n")

        # Check system NUMA topology; get_system_info() returns a falsy
        # value when the NUMA feature was not compiled in.
        numa_info = dgen_py.get_system_info()
        if numa_info:
            num_nodes = numa_info['num_nodes']
            print(f"System: {num_nodes} NUMA node(s) detected")
            if num_nodes > 1:
                print(f" Multi-socket NUMA system - optimizations should help!")
            else:
                print(f" UMA system (single socket) - optimizations add minimal overhead")
            print()
        else:
            print("NUMA detection not available (NUMA feature not compiled in)\n")

        modes = [
            ("Auto (default)", "auto"),
            ("Force NUMA", "force"),
            ("Disabled", "disabled")
        ]

        for name, mode in modes:
            test_name = f"NUMA mode: {name}"
            throughput, elapsed = self.run_test(test_name, numa_mode=mode)
            self.print_result(test_name, throughput, elapsed, numa_mode=mode)

    def benchmark_compression_impact(self):
        """Test how the requested compression ratio affects throughput."""
        print("\n" + "="*80)
        print("COMPRESSION IMPACT BENCHMARK")
        print("="*80)
        print("\nTesting how compression ratio affects throughput...")
        print("(All tests: no dedup, NUMA=auto, all cores)\n")

        compress_ratios = [1, 2, 3, 5]

        baseline = None
        for ratio in compress_ratios:
            name = f"Compression ratio: {ratio}:1 ({'incompressible' if ratio == 1 else 'compressible'})"
            throughput, elapsed = self.run_test(name, compress_ratio=ratio)
            self.print_result(name, throughput, elapsed)

            if ratio == 1:
                baseline = throughput
            else:
                # Ratio 1 always runs first, so baseline is set here.
                slowdown = baseline / throughput if throughput > 0 else 0
                print(f" └─> {slowdown:.2f}x slower than incompressible (more back-refs to copy)")

    def benchmark_optimal_config(self):
        """Run a small config matrix and report the fastest combination."""
        print("\n" + "="*80)
        print("OPTIMAL CONFIGURATION FINDER")
        print("="*80)
        print("\nTesting combinations to find best performance...\n")

        import multiprocessing
        cpu_count = multiprocessing.cpu_count()

        configs = [
            ("All cores + Auto NUMA", None, "auto"),
            ("All cores + Force NUMA", None, "force"),
            ("All cores + Disabled NUMA", None, "disabled"),
            ("Half cores + Auto NUMA", cpu_count // 2, "auto"),
        ]

        best_throughput = 0
        best_config = None

        for name, threads, numa_mode in configs:
            test_name = f"Config: {name}"
            throughput, elapsed = self.run_test(test_name, max_threads=threads, numa_mode=numa_mode)
            self.print_result(test_name, throughput, elapsed, threads, numa_mode)

            if throughput > best_throughput:
                best_throughput = throughput
                best_config = (name, threads, numa_mode)

        # Fix: the original indexed best_config unconditionally and would
        # raise TypeError if every measured throughput was zero.
        if best_config is None:
            print("\nNo configuration produced a measurable throughput")
            return

        print(f"\n{'='*80}")
        print(f"WINNER: {best_config[0]}")
        print(f" Throughput: {best_throughput:.2f} GB/s")
        print(f" Config: max_threads={best_config[1]}, numa_mode='{best_config[2]}'")
        print(f"{'='*80}")

    def print_summary(self):
        """Print the top-5 configurations and a system recommendation."""
        print("\n" + "="*80)
        print("BENCHMARK SUMMARY")
        print("="*80)

        if not self.results:
            print("No results to summarize")
            return

        # Sort by throughput, fastest first.
        sorted_results = sorted(self.results, key=lambda x: x['throughput'], reverse=True)

        print(f"\nTop 5 configurations (out of {len(self.results)} tested):\n")
        print(f"{'Rank':<6} {'Configuration':<40} {'Throughput':<12} {'Settings'}")
        print("-" * 80)

        for i, result in enumerate(sorted_results[:5], 1):
            config = result['name']
            throughput = f"{result['throughput']:.2f} GB/s"
            settings = f"threads={result['threads']}, numa={result['numa_mode']}"
            print(f"{i:<6} {config:<40} {throughput:<12} {settings}")

        # System recommendations
        print(f"\n{'='*80}")
        print("RECOMMENDATIONS FOR YOUR SYSTEM")
        print(f"{'='*80}\n")

        best = sorted_results[0]
        print(f"For maximum performance on this system:")
        print(f" dgen_py.generate_data(size,")
        print(f" max_threads={repr(best['threads'])},")
        print(f" numa_mode='{best['numa_mode']}')")
        print(f" Expected: ~{best['throughput']:.1f} GB/s")

        # Check if NUMA helps (only meaningful when the feature exists).
        numa_info = dgen_py.get_system_info()
        if numa_info and numa_info['num_nodes'] == 1:
            print("\nNOTE: This is a UMA system (single NUMA node).")
            print(" NUMA optimizations have minimal impact here.")
            print(" On multi-socket bare metal servers, expect 30-50% improvement!")
|
|
254
|
+
|
|
255
|
+
def main():
    """Run comprehensive benchmark suite.

    Prints system information, then runs every benchmark group
    (thread scaling, NUMA modes, compression impact, optimal-config
    search) on a 100 MiB payload, ending with a ranked summary.
    """

    print("""
╔══════════════════════════════════════════════════════════════════════════════╗
║                     dgen-py Performance Benchmark Suite                      ║
║                     CPU and NUMA Configuration Optimizer                     ║
╚══════════════════════════════════════════════════════════════════════════════╝
""")

    # Check system info.
    # get_system_info() is falsy when the NUMA feature is not compiled in.
    numa_info = dgen_py.get_system_info()
    if numa_info:
        print(f"System Information:")
        print(f" NUMA nodes: {numa_info['num_nodes']}")
        print(f" Physical cores: {numa_info['physical_cores']}")
        print(f" Logical CPUs: {numa_info['logical_cpus']}")
        print(f" Deployment: {numa_info['deployment_type']}")
    else:
        # Fall back to a plain CPU count when NUMA info is unavailable.
        import multiprocessing
        print(f"System Information:")
        print(f" CPUs: {multiprocessing.cpu_count()}")
        print(f" NUMA: Not available (feature not compiled in)")

    print(f"\nBenchmark Configuration:")
    print(f" Data size per test: 100 MiB")
    print(f" Iterations per config: 3")
    print(f" Total time: ~1-2 minutes")

    # Run benchmarks (100 MiB matches the configuration printed above).
    benchmark = PerformanceBenchmark(size_mb=100)

    benchmark.benchmark_thread_scaling()
    benchmark.benchmark_numa_modes()
    benchmark.benchmark_compression_impact()
    benchmark.benchmark_optimal_config()
    benchmark.print_summary()

    print(f"\n{'='*80}")
    print("Benchmark complete! Save these results for your production configuration.")
    print(f"{'='*80}\n")


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Benchmark: dgen-py vs Numpy Random
|
|
4
|
+
===================================
|
|
5
|
+
|
|
6
|
+
Compare dgen-py data generation to numpy's random number generation.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import dgen_py
|
|
10
|
+
import numpy as np
|
|
11
|
+
import time
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def benchmark_dgen(size, runs=5):
    """Time dgen-py generation of *size* bytes, averaged over *runs* runs.

    Returns a ``(avg_seconds, gb_per_second)`` pair. The memoryview
    conversion happens inside the timed region on purpose, so the cost of
    exposing the buffer is included in the measurement.
    """
    samples = []
    for _ in range(runs):
        t0 = time.perf_counter()
        payload = dgen_py.generate_data(size)
        # Convert to memoryview to ensure zero-copy is used
        memoryview(payload)
        samples.append(time.perf_counter() - t0)

    mean = sum(samples) / len(samples)
    return mean, size / mean / 1e9
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def benchmark_numpy_randint(size, runs=5):
    """Time numpy's legacy random.randint producing *size* uint8 values.

    Returns ``(avg_seconds, gb_per_second)`` averaged over *runs* runs.
    """
    durations = []
    for _ in range(runs):
        begin = time.perf_counter()
        np.random.randint(0, 256, size, dtype=np.uint8)
        durations.append(time.perf_counter() - begin)

    mean = sum(durations) / len(durations)
    return mean, size / mean / 1e9
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def benchmark_numpy_bytes(size, runs=5):
    """Time numpy's legacy random.bytes producing *size* bytes.

    Returns ``(avg_seconds, gb_per_second)`` averaged over *runs* runs.
    """
    durations = []
    for _ in range(runs):
        begin = time.perf_counter()
        np.random.bytes(size)
        durations.append(time.perf_counter() - begin)

    mean = sum(durations) / len(durations)
    return mean, size / mean / 1e9
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def main():
    """Benchmark dgen-py against numpy RNG APIs and print a comparison table."""
    print("=" * 70)
    print("BENCHMARK: dgen-py vs Numpy Random Number Generation")
    print("=" * 70)

    # (size in bytes, human-readable label)
    sizes = [
        (1 * 1024 * 1024, "1 MiB"),
        (10 * 1024 * 1024, "10 MiB"),
        (100 * 1024 * 1024, "100 MiB"),
        (500 * 1024 * 1024, "500 MiB"),
    ]

    print("\nRunning benchmarks (5 runs each, averaged)...\n")

    results = []

    for size, label in sizes:
        print(f"Testing {label}...")

        # dgen-py
        dgen_time, dgen_gbps = benchmark_dgen(size)

        # numpy randint
        numpy_randint_time, numpy_randint_gbps = benchmark_numpy_randint(size)

        # numpy bytes
        numpy_bytes_time, numpy_bytes_gbps = benchmark_numpy_bytes(size)

        results.append({
            'size': size,
            'label': label,
            'dgen_time': dgen_time,
            'dgen_gbps': dgen_gbps,
            'numpy_randint_time': numpy_randint_time,
            'numpy_randint_gbps': numpy_randint_gbps,
            'numpy_bytes_time': numpy_bytes_time,
            'numpy_bytes_gbps': numpy_bytes_gbps,
        })

    # Print results table
    print("\n" + "=" * 70)
    print("RESULTS")
    print("=" * 70)
    print(f"\n{'Size':<10} {'Method':<20} {'Time (ms)':<12} {'Throughput':<15} {'vs dgen':<10}")
    print("-" * 70)

    for r in results:
        # dgen-py row (the baseline the speedups are relative to)
        print(f"{r['label']:<10} {'dgen-py':<20} {r['dgen_time']*1000:>10.1f} ms {r['dgen_gbps']:>10.2f} GB/s {'baseline':>10}")

        # numpy randint (ratio > 1 means dgen-py was faster)
        speedup_randint = r['dgen_gbps'] / r['numpy_randint_gbps']
        print(f"{'':<10} {'numpy.random.randint':<20} {r['numpy_randint_time']*1000:>10.1f} ms {r['numpy_randint_gbps']:>10.2f} GB/s {speedup_randint:>9.2f}x")

        # numpy bytes
        speedup_bytes = r['dgen_gbps'] / r['numpy_bytes_gbps']
        print(f"{'':<10} {'numpy.random.bytes':<20} {r['numpy_bytes_time']*1000:>10.1f} ms {r['numpy_bytes_gbps']:>10.2f} GB/s {speedup_bytes:>9.2f}x")

        print()

    # Summary
    print("=" * 70)
    print("SUMMARY")
    print("=" * 70)

    # Average speedups across all tested sizes
    avg_speedup_randint = sum(r['dgen_gbps'] / r['numpy_randint_gbps'] for r in results) / len(results)
    avg_speedup_bytes = sum(r['dgen_gbps'] / r['numpy_bytes_gbps'] for r in results) / len(results)

    print(f"\ndgen-py average performance:")
    print(f" vs numpy.random.randint: {avg_speedup_randint:.2f}x faster")
    print(f" vs numpy.random.bytes: {avg_speedup_bytes:.2f}x faster")

    # Find best dgen performance
    best = max(results, key=lambda r: r['dgen_gbps'])
    print(f"\nPeak dgen-py throughput: {best['dgen_gbps']:.2f} GB/s ({best['label']})")

    print("\n" + "=" * 70)
    print("✓ Zero-copy implementation delivers competitive performance!")
    print("=" * 70)

    # Technical notes
    print("\nNOTES:")
    print("- dgen-py uses Xoshiro256++ RNG (faster than Numpy's MT19937)")
    print("- dgen-py leverages multi-threading via Rayon")
    print("- dgen-py provides zero-copy access via buffer protocol")
    print("- Numpy's random.bytes is single-threaded")
    print("- Numpy's random.randint has array creation overhead")


if __name__ == "__main__":
    main()
|
dgen_py/examples/demo.py
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Example: Generate data and compress it to verify compression ratio
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import dgen_py
|
|
7
|
+
import sys
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def test_compression_ratio():
    """Generate data at a requested compression ratio and verify with zstd.

    Generates 100 MiB targeting 3:1 compressibility, compresses it with
    zstandard level 3, and prints the achieved vs requested ratio.
    Exits the process if the optional ``zstandard`` package is missing.
    """
    try:
        import zstandard as zstd
    except ImportError:
        print("Error: zstandard package required")
        print("Install with: pip install zstandard")
        sys.exit(1)

    size = 100 * 1024 * 1024  # 100 MiB
    target_compress_ratio = 3.0  # 3:1 compression

    print(f"Generating {size / (1024**2):.1f} MiB with {target_compress_ratio}:1 compression ratio...")
    payload = dgen_py.generate_data(size, compress_ratio=target_compress_ratio)

    print(f"Generated {len(payload) / (1024**2):.1f} MiB")
    print("Compressing with zstd...")
    packed = zstd.ZstdCompressor(level=3).compress(payload)

    achieved = len(payload) / len(packed)
    deviation_pct = abs(achieved - target_compress_ratio) / target_compress_ratio * 100

    print(f"\nResults:")
    print(f" Original: {len(payload) / (1024**2):.2f} MiB")
    print(f" Compressed: {len(packed) / (1024**2):.2f} MiB")
    print(f" Ratio: {achieved:.2f}:1")
    print(f" Target: {target_compress_ratio:.2f}:1")
    print(f" Delta: {deviation_pct:.1f}%")
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def test_streaming():
    """Generate a large dataset using the streaming API.

    Streams 1 GiB through ``dgen_py.StreamingGenerator`` into a reusable
    4 MiB buffer, printing progress every 100 chunks and a final summary.
    """
    size = 1024 * 1024 * 1024  # 1 GiB
    chunk_size = 4 * 1024 * 1024  # 4 MiB chunks

    print(f"\nGenerating {size / (1024**3):.1f} GiB using streaming API...")

    gen = dgen_py.StreamingGenerator(
        size=size,
        dedup_ratio=2.0,
        compress_ratio=2.0,
        numa_mode='force'
    )

    # Reusable buffer: fill_chunk writes into it in place.
    buf = bytearray(chunk_size)
    total = 0
    chunks = 0

    while not gen.is_complete():
        nbytes = gen.fill_chunk(buf)
        if nbytes == 0:
            # Defensive stop: generator reported no progress.
            break

        total += nbytes
        chunks += 1

        if chunks % 100 == 0:
            pct = (total / size) * 100
            print(f" Progress: {total / (1024**3):.2f} GiB ({pct:.1f}%)")

    print(f"\nCompleted:")
    print(f" Total: {total / (1024**3):.3f} GiB")
    print(f" Chunks: {chunks}")
    # Fix: the original divided by `chunks` unconditionally, raising
    # ZeroDivisionError if the generator produced no chunks at all.
    if chunks:
        print(f" Avg chunk size: {total / chunks / (1024**2):.2f} MiB")
    else:
        print(" Avg chunk size: n/a (no chunks produced)")
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def show_system_info():
    """Print the NUMA/CPU topology reported by dgen_py, if available."""
    print("\nSystem Information:")
    print("-" * 50)

    info = dgen_py.get_system_info()
    if not info:
        # get_system_info() is falsy when NUMA support is unavailable.
        print(" NUMA info not available on this platform")
        return

    print(f" NUMA nodes: {info['num_nodes']}")
    print(f" Physical cores: {info['physical_cores']}")
    print(f" Logical CPUs: {info['logical_cpus']}")
    print(f" UMA system: {info['is_uma']}")
    print(f" Deployment: {info['deployment_type']}")
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
if __name__ == '__main__':
    # Print topology first so the test output can be read in context.
    show_system_info()

    print("\n" + "=" * 50)
    print("Test 1: Compression Ratio Validation")
    print("=" * 50)
    test_compression_ratio()

    print("\n" + "=" * 50)
    print("Test 2: Streaming Generation")
    print("=" * 50)
    test_streaming()

    print("\n✓ All tests completed!")
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Quick Performance Test
|
|
4
|
+
======================
|
|
5
|
+
|
|
6
|
+
Fast 30-second test to find optimal settings for your system.
|
|
7
|
+
|
|
8
|
+
Usage:
|
|
9
|
+
python quick_perf_test.py
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import time
|
|
13
|
+
import sys
|
|
14
|
+
|
|
15
|
+
try:
|
|
16
|
+
import dgen_py
|
|
17
|
+
except ImportError:
|
|
18
|
+
print("ERROR: dgen-py not installed")
|
|
19
|
+
print("Run: pip install dgen-py")
|
|
20
|
+
sys.exit(1)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def test_config(name, size, **kwargs):
    """Time one generate_data call and return (GB/s, elapsed seconds).

    *name* labels the configuration for callers; *kwargs* are forwarded
    verbatim to ``dgen_py.generate_data``.
    """
    begin = time.perf_counter()
    dgen_py.generate_data(size, **kwargs)
    duration = time.perf_counter() - begin
    return (size / duration) / 1e9, duration
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def main():
    """Run a quick five-configuration sweep and print a recommendation."""
    print("dgen-py Quick Performance Test")
    print("=" * 50)

    # System info (falsy when the NUMA feature is not compiled in)
    info = dgen_py.get_system_info()
    if info:
        print(f"\nSystem: {info['num_nodes']} NUMA node(s), {info['logical_cpus']} CPUs")
        if info['num_nodes'] > 1:
            print(" → Multi-socket system (NUMA optimizations should help!)")
        else:
            print(" → Single-socket system (UMA)")

    size = 100 * 1024 * 1024  # 100 MiB
    print(f"\nTest size: 100 MiB per run")
    print(f"\nRunning tests...")
    print("-" * 50)

    # Each entry: (label, throughput GB/s, kwargs used for generate_data)
    results = []

    # Test 1: Default (auto-detect everything)
    print("\n1. Default (auto-detect)...", end=" ", flush=True)
    tp, t = test_config("default", size)
    print(f"{tp:.2f} GB/s")
    results.append(("Default (auto)", tp, {}))

    # Test 2: Force NUMA
    print("2. Force NUMA...", end=" ", flush=True)
    tp, t = test_config("force_numa", size, numa_mode="force")
    print(f"{tp:.2f} GB/s")
    results.append(("Force NUMA", tp, {"numa_mode": "force"}))

    # Test 3: NUMA disabled
    print("3. NUMA disabled...", end=" ", flush=True)
    tp, t = test_config("disabled_numa", size, numa_mode="disabled")
    print(f"{tp:.2f} GB/s")
    results.append(("NUMA disabled", tp, {"numa_mode": "disabled"}))

    # Test 4: Half threads
    import multiprocessing
    half_threads = multiprocessing.cpu_count() // 2
    print(f"4. Half threads ({half_threads})...", end=" ", flush=True)
    tp, t = test_config("half_threads", size, max_threads=half_threads)
    print(f"{tp:.2f} GB/s")
    results.append((f"Half threads ({half_threads})", tp, {"max_threads": half_threads}))

    # Test 5: Single thread (baseline)
    print("5. Single thread (baseline)...", end=" ", flush=True)
    tp, t = test_config("single", size, max_threads=1)
    print(f"{tp:.2f} GB/s")
    results.append(("Single thread", tp, {"max_threads": 1}))

    # Find best: sort fastest-first in place
    results.sort(key=lambda x: x[1], reverse=True)

    print("\n" + "=" * 50)
    print("RESULTS (fastest to slowest):")
    print("=" * 50)
    for i, (name, tp, config) in enumerate(results, 1):
        star = "★" if i == 1 else " "
        print(f"{star} {i}. {name:25s} {tp:8.2f} GB/s")

    # Recommendation: results[0] is the fastest after the sort above
    best_name, best_tp, best_config = results[0]
    print("\n" + "=" * 50)
    print(f"RECOMMENDATION: {best_name}")
    print(f" Throughput: {best_tp:.2f} GB/s")
    if best_config:
        print(f" Code: dgen_py.generate_data(size, {', '.join(f'{k}={repr(v)}' for k, v in best_config.items())})")
    else:
        # Empty kwargs dict means the library defaults won.
        print(f" Code: dgen_py.generate_data(size) # defaults are optimal!")
    print("=" * 50)


if __name__ == "__main__":
    main()
|