dgen-py 0.1.2__cp310-cp310-manylinux_2_24_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,299 @@
+ #!/usr/bin/env python3
+ """
+ Performance Benchmark: CPU and NUMA Configuration
+ ==================================================
+
+ This script helps you find the optimal CPU and NUMA settings for your system.
+ Run this to discover:
+ - Optimal thread count for your workload
+ - Whether NUMA optimizations help (bare metal) or hurt (cloud VM)
+ - Baseline single-core performance
+ - Memory bandwidth limits
+
+ Usage:
+     python benchmark_cpu_numa.py
+
+ Requirements:
+     pip install dgen-py
+ """
+
+ import time
+ import sys
+ from typing import Optional, Tuple
+
+ try:
+     import dgen_py
+ except ImportError:
+     print("ERROR: dgen-py not installed")
+     print("Install with: pip install dgen-py")
+     print("Or build from source: cd dgen-rs && maturin develop --release")
+     sys.exit(1)
+
+
+ class PerformanceBenchmark:
+     """Comprehensive CPU and NUMA performance benchmark"""
+
+     def __init__(self, size_mb: int = 100):
+         self.size = size_mb * 1024 * 1024
+         self.results = []
+
+     def run_test(
+         self,
+         name: str,
+         dedup_ratio: float = 1.0,
+         compress_ratio: float = 1.0,
+         numa_mode: str = "auto",
+         max_threads: Optional[int] = None,
+         iterations: int = 3
+     ) -> Tuple[float, float]:
+         """Run a single test configuration multiple times and return avg throughput"""
+
+         times = []
+         for i in range(iterations):
+             start = time.perf_counter()
+             data = dgen_py.generate_data(
+                 self.size,
+                 dedup_ratio=dedup_ratio,
+                 compress_ratio=compress_ratio,
+                 numa_mode=numa_mode,
+                 max_threads=max_threads
+             )
+             elapsed = time.perf_counter() - start
+             times.append(elapsed)
+
+             # Verify size
+             if len(data) != self.size:
+                 print(f"WARNING: Expected {self.size} bytes, got {len(data)}")
+
+         avg_time = sum(times) / len(times)
+         throughput = (self.size / avg_time) / 1e9  # GB/s
+
+         self.results.append({
+             'name': name,
+             'throughput': throughput,
+             'time': avg_time,
+             'threads': max_threads or 'all',
+             'numa_mode': numa_mode,
+             'dedup': dedup_ratio,
+             'compress': compress_ratio
+         })
+
+         return throughput, avg_time
+
+     def print_result(self, name: str, throughput: float, time_sec: float,
+                      threads: Optional[int] = None, numa_mode: str = "auto"):
+         """Print a single test result"""
+         threads_str = f"{threads} threads" if threads else "all cores"
+         print(f" {name:40s} {throughput:8.2f} GB/s ({time_sec:.3f}s, {threads_str}, numa={numa_mode})")
+
+     def benchmark_thread_scaling(self):
+         """Test performance with different thread counts"""
+         print("\n" + "="*80)
+         print("THREAD SCALING BENCHMARK")
+         print("="*80)
+         print("\nTesting different thread counts to find optimal configuration...")
+         print("(All tests: incompressible data, no dedup, NUMA=auto)\n")
+
+         # Get CPU count for intelligent thread selection
+         import multiprocessing
+         cpu_count = multiprocessing.cpu_count()
+
+         thread_counts = [1, 2, 4]
+         if cpu_count >= 8:
+             thread_counts.append(8)
+         if cpu_count >= 16:
+             thread_counts.append(16)
+         thread_counts.append(None)  # All cores
+
+         baseline_throughput = None
+
+         for threads in thread_counts:
+             name = f"Threads: {threads if threads else cpu_count} ({'baseline' if threads == 1 else 'parallel'})"
+             throughput, elapsed = self.run_test(name, max_threads=threads)
+             self.print_result(name, throughput, elapsed, threads)
+
+             if threads == 1:
+                 baseline_throughput = throughput
+             elif baseline_throughput:
+                 speedup = throughput / baseline_throughput
+                 efficiency = (speedup / (threads or cpu_count)) * 100
+                 print(f" └─> Speedup: {speedup:.2f}x, Efficiency: {efficiency:.1f}%")
+
+     def benchmark_numa_modes(self):
+         """Test different NUMA configurations"""
+         print("\n" + "="*80)
+         print("NUMA MODE BENCHMARK")
+         print("="*80)
+         print("\nTesting NUMA optimization modes...")
+         print("(All tests: incompressible data, no dedup, all cores)\n")
+
+         # Check system NUMA topology
+         numa_info = dgen_py.get_system_info()
+         if numa_info:
+             num_nodes = numa_info['num_nodes']
+             print(f"System: {num_nodes} NUMA node(s) detected")
+             if num_nodes > 1:
+                 print(f" Multi-socket NUMA system - optimizations should help!")
+             else:
+                 print(f" UMA system (single socket) - optimizations add minimal overhead")
+             print()
+         else:
+             print("NUMA detection not available (NUMA feature not compiled in)\n")
+
+         modes = [
+             ("Auto (default)", "auto"),
+             ("Force NUMA", "force"),
+             ("Disabled", "disabled")
+         ]
+
+         for name, mode in modes:
+             test_name = f"NUMA mode: {name}"
+             throughput, elapsed = self.run_test(test_name, numa_mode=mode)
+             self.print_result(test_name, throughput, elapsed, numa_mode=mode)
+
+     def benchmark_compression_impact(self):
+         """Test performance with different compression ratios"""
+         print("\n" + "="*80)
+         print("COMPRESSION IMPACT BENCHMARK")
+         print("="*80)
+         print("\nTesting how compression ratio affects throughput...")
+         print("(All tests: no dedup, NUMA=auto, all cores)\n")
+
+         compress_ratios = [1, 2, 3, 5]
+
+         baseline = None
+         for ratio in compress_ratios:
+             name = f"Compression ratio: {ratio}:1 ({'incompressible' if ratio == 1 else 'compressible'})"
+             throughput, elapsed = self.run_test(name, compress_ratio=ratio)
+             self.print_result(name, throughput, elapsed)
+
+             if ratio == 1:
+                 baseline = throughput
+             else:
+                 slowdown = baseline / throughput if throughput > 0 else 0
+                 print(f" └─> {slowdown:.2f}x slower than incompressible (more back-refs to copy)")
+
+     def benchmark_optimal_config(self):
+         """Find optimal configuration for this system"""
+         print("\n" + "="*80)
+         print("OPTIMAL CONFIGURATION FINDER")
+         print("="*80)
+         print("\nTesting combinations to find best performance...\n")
+
+         import multiprocessing
+         cpu_count = multiprocessing.cpu_count()
+
+         configs = [
+             ("All cores + Auto NUMA", None, "auto"),
+             ("All cores + Force NUMA", None, "force"),
+             ("All cores + Disabled NUMA", None, "disabled"),
+             ("Half cores + Auto NUMA", cpu_count // 2, "auto"),
+         ]
+
+         best_throughput = 0
+         best_config = None
+
+         for name, threads, numa_mode in configs:
+             test_name = f"Config: {name}"
+             throughput, elapsed = self.run_test(test_name, max_threads=threads, numa_mode=numa_mode)
+             self.print_result(test_name, throughput, elapsed, threads, numa_mode)
+
+             if throughput > best_throughput:
+                 best_throughput = throughput
+                 best_config = (name, threads, numa_mode)
+
+         print(f"\n{'='*80}")
+         print(f"WINNER: {best_config[0]}")
+         print(f" Throughput: {best_throughput:.2f} GB/s")
+         print(f" Config: max_threads={best_config[1]}, numa_mode='{best_config[2]}'")
+         print(f"{'='*80}")
+
+     def print_summary(self):
+         """Print overall benchmark summary"""
+         print("\n" + "="*80)
+         print("BENCHMARK SUMMARY")
+         print("="*80)
+
+         if not self.results:
+             print("No results to summarize")
+             return
+
+         # Sort by throughput
+         sorted_results = sorted(self.results, key=lambda x: x['throughput'], reverse=True)
+
+         print(f"\nTop 5 configurations (out of {len(self.results)} tested):\n")
+         print(f"{'Rank':<6} {'Configuration':<40} {'Throughput':<12} {'Settings'}")
+         print("-" * 80)
+
+         for i, result in enumerate(sorted_results[:5], 1):
+             config = result['name']
+             throughput = f"{result['throughput']:.2f} GB/s"
+             settings = f"threads={result['threads']}, numa={result['numa_mode']}"
+             print(f"{i:<6} {config:<40} {throughput:<12} {settings}")
+
+         # System recommendations
+         print(f"\n{'='*80}")
+         print("RECOMMENDATIONS FOR YOUR SYSTEM")
+         print(f"{'='*80}\n")
+
+         best = sorted_results[0]
+         # 'all' is only a display label; the API expects max_threads=None for all cores
+         threads_arg = None if best['threads'] == 'all' else best['threads']
+         print(f"For maximum performance on this system:")
+         print(f" dgen_py.generate_data(size,")
+         print(f" max_threads={threads_arg!r},")
+         print(f" numa_mode='{best['numa_mode']}')")
+         print(f" Expected: ~{best['throughput']:.1f} GB/s")
+
+         # Check if NUMA helps
+         numa_info = dgen_py.get_system_info()
+         if numa_info and numa_info['num_nodes'] == 1:
+             print("\nNOTE: This is a UMA system (single NUMA node).")
+             print(" NUMA optimizations have minimal impact here.")
+             print(" On multi-socket bare metal servers, expect 30-50% improvement!")
+
+
+ def main():
+     """Run comprehensive benchmark suite"""
+
+     print("""
+ ╔══════════════════════════════════════════════════════════════════════════════╗
+ ║                     dgen-py Performance Benchmark Suite                       ║
+ ║                    CPU and NUMA Configuration Optimizer                       ║
+ ╚══════════════════════════════════════════════════════════════════════════════╝
+ """)
+
+     # Check system info
+     numa_info = dgen_py.get_system_info()
+     if numa_info:
+         print(f"System Information:")
+         print(f" NUMA nodes: {numa_info['num_nodes']}")
+         print(f" Physical cores: {numa_info['physical_cores']}")
+         print(f" Logical CPUs: {numa_info['logical_cpus']}")
+         print(f" Deployment: {numa_info['deployment_type']}")
+     else:
+         import multiprocessing
+         print(f"System Information:")
+         print(f" CPUs: {multiprocessing.cpu_count()}")
+         print(f" NUMA: Not available (feature not compiled in)")
+
+     print(f"\nBenchmark Configuration:")
+     print(f" Data size per test: 100 MiB")
+     print(f" Iterations per config: 3")
+     print(f" Total time: ~1-2 minutes")
+
+     # Run benchmarks
+     benchmark = PerformanceBenchmark(size_mb=100)
+
+     benchmark.benchmark_thread_scaling()
+     benchmark.benchmark_numa_modes()
+     benchmark.benchmark_compression_impact()
+     benchmark.benchmark_optimal_config()
+     benchmark.print_summary()
+
+     print(f"\n{'='*80}")
+     print("Benchmark complete! Save these results for your production configuration.")
+     print(f"{'='*80}\n")
+
+
+ if __name__ == "__main__":
+     main()
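
The recommendation printed by this benchmark maps directly onto a `generate_data` call. The short sketch below shows one way to apply a measured configuration in application code; it uses only the keyword arguments exercised by the script (`numa_mode`, `max_threads`), while the 256 MiB size and the fallback logic are illustrative rather than part of the package.

# Sketch: apply a benchmarked configuration (size and fallback values are illustrative)
import dgen_py

size = 256 * 1024 * 1024  # 256 MiB of synthetic data

info = dgen_py.get_system_info()
# Force NUMA placement only when more than one node is reported; otherwise let the library decide.
numa_mode = "force" if info and info["num_nodes"] > 1 else "auto"

data = dgen_py.generate_data(size, numa_mode=numa_mode, max_threads=None)  # None = all cores
assert len(data) == size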
@@ -0,0 +1,146 @@
+ #!/usr/bin/env python3
+ """
+ Benchmark: dgen-py vs Numpy Random
+ ===================================
+
+ Compare dgen-py data generation to numpy's random number generation.
+ """
+
+ import dgen_py
+ import numpy as np
+ import time
+
+
+ def benchmark_dgen(size, runs=5):
+     """Benchmark dgen-py generation"""
+     times = []
+     for _ in range(runs):
+         start = time.perf_counter()
+         data = dgen_py.generate_data(size)
+         # Convert to memoryview to ensure zero-copy is used
+         view = memoryview(data)
+         elapsed = time.perf_counter() - start
+         times.append(elapsed)
+
+     avg_time = sum(times) / len(times)
+     return avg_time, size / avg_time / 1e9
+
+
+ def benchmark_numpy_randint(size, runs=5):
+     """Benchmark numpy random.randint (uint8)"""
+     times = []
+     for _ in range(runs):
+         start = time.perf_counter()
+         arr = np.random.randint(0, 256, size, dtype=np.uint8)
+         elapsed = time.perf_counter() - start
+         times.append(elapsed)
+
+     avg_time = sum(times) / len(times)
+     return avg_time, size / avg_time / 1e9
+
+
+ def benchmark_numpy_bytes(size, runs=5):
+     """Benchmark numpy random.bytes"""
+     times = []
+     for _ in range(runs):
+         start = time.perf_counter()
+         data = np.random.bytes(size)
+         elapsed = time.perf_counter() - start
+         times.append(elapsed)
+
+     avg_time = sum(times) / len(times)
+     return avg_time, size / avg_time / 1e9
+
+
+ def main():
+     print("=" * 70)
+     print("BENCHMARK: dgen-py vs Numpy Random Number Generation")
+     print("=" * 70)
+
+     sizes = [
+         (1 * 1024 * 1024, "1 MiB"),
+         (10 * 1024 * 1024, "10 MiB"),
+         (100 * 1024 * 1024, "100 MiB"),
+         (500 * 1024 * 1024, "500 MiB"),
+     ]
+
+     print("\nRunning benchmarks (5 runs each, averaged)...\n")
+
+     results = []
+
+     for size, label in sizes:
+         print(f"Testing {label}...")
+
+         # dgen-py
+         dgen_time, dgen_gbps = benchmark_dgen(size)
+
+         # numpy randint
+         numpy_randint_time, numpy_randint_gbps = benchmark_numpy_randint(size)
+
+         # numpy bytes
+         numpy_bytes_time, numpy_bytes_gbps = benchmark_numpy_bytes(size)
+
+         results.append({
+             'size': size,
+             'label': label,
+             'dgen_time': dgen_time,
+             'dgen_gbps': dgen_gbps,
+             'numpy_randint_time': numpy_randint_time,
+             'numpy_randint_gbps': numpy_randint_gbps,
+             'numpy_bytes_time': numpy_bytes_time,
+             'numpy_bytes_gbps': numpy_bytes_gbps,
+         })
+
+     # Print results table
+     print("\n" + "=" * 70)
+     print("RESULTS")
+     print("=" * 70)
+     print(f"\n{'Size':<10} {'Method':<20} {'Time (ms)':<12} {'Throughput':<15} {'vs dgen':<10}")
+     print("-" * 70)
+
+     for r in results:
+         # dgen-py
+         print(f"{r['label']:<10} {'dgen-py':<20} {r['dgen_time']*1000:>10.1f} ms {r['dgen_gbps']:>10.2f} GB/s {'baseline':>10}")
+
+         # numpy randint
+         speedup_randint = r['dgen_gbps'] / r['numpy_randint_gbps']
+         print(f"{'':<10} {'numpy.random.randint':<20} {r['numpy_randint_time']*1000:>10.1f} ms {r['numpy_randint_gbps']:>10.2f} GB/s {speedup_randint:>9.2f}x")
+
+         # numpy bytes
+         speedup_bytes = r['dgen_gbps'] / r['numpy_bytes_gbps']
+         print(f"{'':<10} {'numpy.random.bytes':<20} {r['numpy_bytes_time']*1000:>10.1f} ms {r['numpy_bytes_gbps']:>10.2f} GB/s {speedup_bytes:>9.2f}x")
+
+         print()
+
+     # Summary
+     print("=" * 70)
+     print("SUMMARY")
+     print("=" * 70)
+
+     # Average speedups
+     avg_speedup_randint = sum(r['dgen_gbps'] / r['numpy_randint_gbps'] for r in results) / len(results)
+     avg_speedup_bytes = sum(r['dgen_gbps'] / r['numpy_bytes_gbps'] for r in results) / len(results)
+
+     print(f"\ndgen-py average performance:")
+     print(f" vs numpy.random.randint: {avg_speedup_randint:.2f}x faster")
+     print(f" vs numpy.random.bytes: {avg_speedup_bytes:.2f}x faster")
+
+     # Find best dgen performance
+     best = max(results, key=lambda r: r['dgen_gbps'])
+     print(f"\nPeak dgen-py throughput: {best['dgen_gbps']:.2f} GB/s ({best['label']})")
+
+     print("\n" + "=" * 70)
+     print("✓ Zero-copy implementation delivers competitive performance!")
+     print("=" * 70)
+
+     # Technical notes
+     print("\nNOTES:")
+     print("- dgen-py uses Xoshiro256++ RNG (faster than Numpy's MT19937)")
+     print("- dgen-py leverages multi-threading via Rayon")
+     print("- dgen-py provides zero-copy access via buffer protocol")
+     print("- Numpy's random.bytes is single-threaded")
+     print("- Numpy's random.randint has array creation overhead")
+
+
+ if __name__ == "__main__":
+     main()
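
The NOTES above attribute part of the gap to zero-copy access through the buffer protocol. A minimal sketch of consuming the generated buffer without copying it, assuming only that the returned object exposes the buffer protocol as stated, looks like this (`memoryview` and `numpy.frombuffer` both wrap existing memory rather than duplicating it):

import dgen_py
import numpy as np

data = dgen_py.generate_data(64 * 1024 * 1024)  # 64 MiB

view = memoryview(data)                     # zero-copy view of the buffer
arr = np.frombuffer(data, dtype=np.uint8)   # zero-copy NumPy array over the same memory

print(view.nbytes, arr.size)   # both report 67108864
print(arr.base is not None)    # True: the array borrows the memory instead of owning a copy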
@@ -0,0 +1,107 @@
+ #!/usr/bin/env python3
+ """
+ Example: Generate data and compress it to verify compression ratio
+ """
+
+ import dgen_py
+ import sys
+
+
+ def test_compression_ratio():
+     """Generate data with specific compression ratio and verify"""
+     try:
+         import zstandard as zstd
+     except ImportError:
+         print("Error: zstandard package required")
+         print("Install with: pip install zstandard")
+         sys.exit(1)
+
+     size = 100 * 1024 * 1024  # 100 MiB
+     target_compress_ratio = 3.0  # 3:1 compression
+
+     print(f"Generating {size / (1024**2):.1f} MiB with {target_compress_ratio}:1 compression ratio...")
+
+     data = dgen_py.generate_data(size, compress_ratio=target_compress_ratio)
+
+     print(f"Generated {len(data) / (1024**2):.1f} MiB")
+     print("Compressing with zstd...")
+
+     compressor = zstd.ZstdCompressor(level=3)
+     compressed = compressor.compress(data)
+
+     actual_ratio = len(data) / len(compressed)
+
+     print(f"\nResults:")
+     print(f" Original: {len(data) / (1024**2):.2f} MiB")
+     print(f" Compressed: {len(compressed) / (1024**2):.2f} MiB")
+     print(f" Ratio: {actual_ratio:.2f}:1")
+     print(f" Target: {target_compress_ratio:.2f}:1")
+     print(f" Delta: {abs(actual_ratio - target_compress_ratio) / target_compress_ratio * 100:.1f}%")
+
+
+ def test_streaming():
+     """Generate large dataset using streaming API"""
+     size = 1024 * 1024 * 1024  # 1 GiB
+     chunk_size = 4 * 1024 * 1024  # 4 MiB chunks
+
+     print(f"\nGenerating {size / (1024**3):.1f} GiB using streaming API...")
+
+     gen = dgen_py.StreamingGenerator(
+         size=size,
+         dedup_ratio=2.0,
+         compress_ratio=2.0,
+         numa_mode='force'
+     )
+
+     buf = bytearray(chunk_size)
+     total = 0
+     chunks = 0
+
+     while not gen.is_complete():
+         nbytes = gen.fill_chunk(buf)
+         if nbytes == 0:
+             break
+
+         total += nbytes
+         chunks += 1
+
+         if chunks % 100 == 0:
+             pct = (total / size) * 100
+             print(f" Progress: {total / (1024**3):.2f} GiB ({pct:.1f}%)")
+
+     print(f"\nCompleted:")
+     print(f" Total: {total / (1024**3):.3f} GiB")
+     print(f" Chunks: {chunks}")
+     print(f" Avg chunk size: {total / chunks / (1024**2):.2f} MiB")
+
+
+ def show_system_info():
+     """Display NUMA topology information"""
+     print("\nSystem Information:")
+     print("-" * 50)
+
+     info = dgen_py.get_system_info()
+     if info:
+         print(f" NUMA nodes: {info['num_nodes']}")
+         print(f" Physical cores: {info['physical_cores']}")
+         print(f" Logical CPUs: {info['logical_cpus']}")
+         print(f" UMA system: {info['is_uma']}")
+         print(f" Deployment: {info['deployment_type']}")
+     else:
+         print(" NUMA info not available on this platform")
+
+
+ if __name__ == '__main__':
+     show_system_info()
+
+     print("\n" + "=" * 50)
+     print("Test 1: Compression Ratio Validation")
+     print("=" * 50)
+     test_compression_ratio()
+
+     print("\n" + "=" * 50)
+     print("Test 2: Streaming Generation")
+     print("=" * 50)
+     test_streaming()
+
+     print("\n✓ All tests completed!")
@@ -0,0 +1,107 @@
+ #!/usr/bin/env python3
+ """
+ Quick Performance Test
+ ======================
+
+ Fast 30-second test to find optimal settings for your system.
+
+ Usage:
+     python quick_perf_test.py
+ """
+
+ import time
+ import sys
+
+ try:
+     import dgen_py
+ except ImportError:
+     print("ERROR: dgen-py not installed")
+     print("Run: pip install dgen-py")
+     sys.exit(1)
+
+
+ def test_config(name, size, **kwargs):
+     """Test a configuration and return throughput"""
+     start = time.perf_counter()
+     data = dgen_py.generate_data(size, **kwargs)
+     elapsed = time.perf_counter() - start
+     throughput = (size / elapsed) / 1e9
+     return throughput, elapsed
+
+
+ def main():
+     print("dgen-py Quick Performance Test")
+     print("=" * 50)
+
+     # System info
+     info = dgen_py.get_system_info()
+     if info:
+         print(f"\nSystem: {info['num_nodes']} NUMA node(s), {info['logical_cpus']} CPUs")
+         if info['num_nodes'] > 1:
+             print(" → Multi-socket system (NUMA optimizations should help!)")
+         else:
+             print(" → Single-socket system (UMA)")
+
+     size = 100 * 1024 * 1024  # 100 MiB
+     print(f"\nTest size: 100 MiB per run")
+     print(f"\nRunning tests...")
+     print("-" * 50)
+
+     results = []
+
+     # Test 1: Default (auto-detect everything)
+     print("\n1. Default (auto-detect)...", end=" ", flush=True)
+     tp, t = test_config("default", size)
+     print(f"{tp:.2f} GB/s")
+     results.append(("Default (auto)", tp, {}))
+
+     # Test 2: Force NUMA
+     print("2. Force NUMA...", end=" ", flush=True)
+     tp, t = test_config("force_numa", size, numa_mode="force")
+     print(f"{tp:.2f} GB/s")
+     results.append(("Force NUMA", tp, {"numa_mode": "force"}))
+
+     # Test 3: NUMA disabled
+     print("3. NUMA disabled...", end=" ", flush=True)
+     tp, t = test_config("disabled_numa", size, numa_mode="disabled")
+     print(f"{tp:.2f} GB/s")
+     results.append(("NUMA disabled", tp, {"numa_mode": "disabled"}))
+
+     # Test 4: Half threads
+     import multiprocessing
+     half_threads = multiprocessing.cpu_count() // 2
+     print(f"4. Half threads ({half_threads})...", end=" ", flush=True)
+     tp, t = test_config("half_threads", size, max_threads=half_threads)
+     print(f"{tp:.2f} GB/s")
+     results.append((f"Half threads ({half_threads})", tp, {"max_threads": half_threads}))
+
+     # Test 5: Single thread (baseline)
+     print("5. Single thread (baseline)...", end=" ", flush=True)
+     tp, t = test_config("single", size, max_threads=1)
+     print(f"{tp:.2f} GB/s")
+     results.append(("Single thread", tp, {"max_threads": 1}))
+
+     # Find best
+     results.sort(key=lambda x: x[1], reverse=True)
+
+     print("\n" + "=" * 50)
+     print("RESULTS (fastest to slowest):")
+     print("=" * 50)
+     for i, (name, tp, config) in enumerate(results, 1):
+         star = "★" if i == 1 else " "
+         print(f"{star} {i}. {name:25s} {tp:8.2f} GB/s")
+
+     # Recommendation
+     best_name, best_tp, best_config = results[0]
+     print("\n" + "=" * 50)
+     print(f"RECOMMENDATION: {best_name}")
+     print(f" Throughput: {best_tp:.2f} GB/s")
+     if best_config:
+         print(f" Code: dgen_py.generate_data(size, {', '.join(f'{k}={repr(v)}' for k, v in best_config.items())})")
+     else:
+         print(f" Code: dgen_py.generate_data(size) # defaults are optimal!")
+     print("=" * 50)
+
+
+ if __name__ == "__main__":
+     main()
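
Each configuration in this quick test is timed once, so results on a busy machine can be noisy; the full benchmark script earlier in this package averages three iterations per configuration. A small wrapper in the same spirit, illustrative only and using nothing beyond the `generate_data` call already shown, reports the median of several runs:

import statistics
import time

import dgen_py

def median_throughput(size, repeats=5, **kwargs):
    """Median GB/s over several runs; less sensitive to one-off noise than a single run."""
    samples = []
    for _ in range(repeats):
        start = time.perf_counter()
        dgen_py.generate_data(size, **kwargs)
        samples.append(size / (time.perf_counter() - start) / 1e9)
    return statistics.median(samples)

print(f"Median: {median_throughput(100 * 1024 * 1024, numa_mode='auto'):.2f} GB/s")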