wings-quantum 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,783 @@
+"""cuStateVec-based circuit evaluators."""
+
+import concurrent.futures
+import logging
+from typing import TYPE_CHECKING, Any, Optional
+
+import numpy as np
+from numpy.typing import NDArray
+
+from ..compat import HAS_CUSTATEVEC, get_compute_type, get_cuda_dtype
+from ..types import ComplexArray, FloatArray, ParameterArray
+
+# Conditional imports
+if HAS_CUSTATEVEC:
+    import cupy as cp
+    from cuquantum.bindings import custatevec as cusv
+else:
+    cp = None
+    cusv = None
+
+if TYPE_CHECKING:
+    from ..config import OptimizerConfig
+
+logger = logging.getLogger(__name__)
+
+__all__ = [
+    "CuStateVecSimulator",
+    "CuStateVecEvaluator",
+    "BatchedCuStateVecEvaluator",
+    "MultiGPUBatchEvaluator",
+]
+
+
+class CuStateVecSimulator:
+    """
+    High-performance statevector simulator using NVIDIA cuStateVec.
+
+    This provides the fastest possible GPU simulation by:
+    1. Direct GPU memory management with CuPy
+    2. Optimized gate application via cuStateVec
+    3. Minimal Python overhead
+    4. Double precision for high accuracy
+
+    Parameters
+    ----------
+    n_qubits : int
+        Number of qubits
+    precision : str
+        'double' or 'single' precision
+    device_id : int
+        GPU device ID (default: 0)
+
+    For VQC optimization, this can be 5-20x faster than Qiskit Aer GPU.
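+
+    Examples
+    --------
+    A minimal usage sketch (assumes a CUDA-capable GPU with cuQuantum
+    installed; the state is all zeros until ``reset_state`` is called):
+
+    >>> sim = CuStateVecSimulator(n_qubits=3)
+    >>> sim.reset_state()
+    >>> sim.apply_ry(np.pi / 2, 0)
+    >>> sim.apply_cnot(0, 1)
+    >>> psi = sim.get_statevector_cpu()
+    >>> sim.destroy()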
+ """
55
+
56
+ def __init__(
57
+ self,
58
+ n_qubits: int,
59
+ precision: str = "double",
60
+ device_id: int = 0,
61
+ ) -> None:
62
+ self.n_qubits = n_qubits
63
+ self.n_states = 2**n_qubits
64
+ self.device_id = device_id
65
+ self.handle = None
66
+
67
+ # Set precision FIRST - before any GPU operations
68
+ if precision == "double":
69
+ self.dtype = cp.complex128
70
+ self.cuda_dtype = get_cuda_dtype()
71
+ self.compute_type = get_compute_type()
72
+ else:
73
+ self.dtype = cp.complex64
74
+ self.cuda_dtype = cusv.cudaDataType.CUDA_C_32F
75
+ self.compute_type = cusv.ComputeType.COMPUTE_32F
76
+
77
+ # Initialize on the specified GPU device
78
+ with cp.cuda.Device(device_id):
79
+ # Initialize cuStateVec handle
80
+ self.handle = cusv.create()
81
+
82
+ # Pre-allocate GPU memory for statevector
83
+ self.d_sv = cp.zeros(self.n_states, dtype=self.dtype)
84
+
85
+ # Pre-allocate workspace (reused across operations)
86
+ self._workspace_size = 0
87
+ self._d_workspace = None
88
+
89
+ # Pre-compute and cache gate matrices on GPU
90
+ self._precompute_gates()
91
+
92
+ # Statistics
93
+ self.n_gate_applications = 0
94
+ self.n_resets = 0
95
+
96
+ def _precompute_gates(self):
97
+ """Pre-compute and cache gate matrices on GPU."""
98
+ x_np = np.array([[0, 1], [1, 0]], dtype=np.complex128)
99
+ self.x_gate = cp.asarray(x_np)
100
+
101
+ # Pre-allocate RY matrix on GPU (will be updated in-place)
102
+ self._ry_matrix = cp.zeros((2, 2), dtype=self.dtype)
103
+
104
+ def _get_ry_matrix(self, theta: float) -> "cp.ndarray":
105
+ """Compute RY(theta) matrix on GPU - optimized."""
+        c = np.cos(theta / 2)
+        s = np.sin(theta / 2)
+        self._ry_matrix[0, 0] = c
+        self._ry_matrix[0, 1] = -s
+        self._ry_matrix[1, 0] = s
+        self._ry_matrix[1, 1] = c
+        return self._ry_matrix
+
+    def reset_state(self) -> None:
+        """Reset to |0...0⟩ state."""
+        with cp.cuda.Device(self.device_id):
+            self.d_sv.fill(0)
+            self.d_sv[0] = 1.0 + 0j
+        self.n_resets += 1
+
+    def apply_x(self, target: int) -> None:
+        """Apply Pauli-X gate to target qubit."""
+        targets = np.array([target], dtype=np.int32)
+
+        with cp.cuda.Device(self.device_id):
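+            # Positional arguments mirror the cuquantum-python
+            # custatevec.apply_matrix binding (as understood here):
+            # handle, sv ptr, sv dtype, n_index_bits, matrix ptr,
+            # matrix dtype, layout, adjoint, targets, n_targets,
+            # controls, control_bit_values, n_controls, compute type,
+            # workspace ptr, workspace size (zeros where unused).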
+            cusv.apply_matrix(
+                self.handle,
+                self.d_sv.data.ptr,
+                self.cuda_dtype,
+                self.n_qubits,
+                self.x_gate.data.ptr,
+                self.cuda_dtype,
+                cusv.MatrixLayout.ROW,
+                0,
+                targets.ctypes.data,
+                1,
+                0,
+                0,
+                0,
+                self.compute_type,
+                0,
+                0,
+            )
+        self.n_gate_applications += 1
+
+    def apply_ry(self, theta: float, target: int) -> None:
+        """Apply RY(theta) gate - optimized to minimize transfers."""
+        c = np.cos(theta / 2)
+        s = np.sin(theta / 2)
+
+        targets = np.array([target], dtype=np.int32)
+
+        with cp.cuda.Device(self.device_id):
+            # Build matrix on CPU and copy once
+            ry_cpu = np.array([[c, -s], [s, c]], dtype=np.complex128)
+            cp.copyto(self._ry_matrix, cp.asarray(ry_cpu))
+
+            cusv.apply_matrix(
+                self.handle,
+                self.d_sv.data.ptr,
+                self.cuda_dtype,
+                self.n_qubits,
+                self._ry_matrix.data.ptr,
+                self.cuda_dtype,
+                cusv.MatrixLayout.ROW,
+                0,
+                targets.ctypes.data,
+                1,
+                0,
+                0,
+                0,
+                self.compute_type,
+                0,
+                0,
+            )
+        self.n_gate_applications += 1
+
+    def apply_cnot(self, control: int, target: int) -> None:
+        """Apply CNOT gate."""
+        targets = np.array([target], dtype=np.int32)
+        controls = np.array([control], dtype=np.int32)
+        control_bits = np.array([1], dtype=np.int32)
+
+        with cp.cuda.Device(self.device_id):
+            cusv.apply_matrix(
+                self.handle,
+                self.d_sv.data.ptr,
+                self.cuda_dtype,
+                self.n_qubits,
+                self.x_gate.data.ptr,
+                self.cuda_dtype,
+                cusv.MatrixLayout.ROW,
+                0,
+                targets.ctypes.data,
+                1,
+                controls.ctypes.data,
+                control_bits.ctypes.data,
+                1,
+                self.compute_type,
+                0,
+                0,
+            )
+        self.n_gate_applications += 1
+
+    def get_statevector_gpu(self) -> "cp.ndarray":
+        """Return current statevector (stays on GPU)."""
+        with cp.cuda.Device(self.device_id):
+            return self.d_sv.copy()
+
+    def get_statevector_cpu(self) -> ComplexArray:
+        """Return statevector copied to CPU."""
+        with cp.cuda.Device(self.device_id):
+            return cp.asnumpy(self.d_sv)
+
+    def compute_overlap_gpu(self, target_conj: "cp.ndarray") -> complex:
+        """Compute ⟨target|current⟩ entirely on GPU."""
+        with cp.cuda.Device(self.device_id):
+            # ``target_conj`` is already conjugated, so use a plain dot
+            # product; ``cp.vdot`` would conjugate its first argument again.
+            return complex(cp.dot(target_conj, self.d_sv))
+
+    def compute_fidelity_gpu(self, target_conj: "cp.ndarray") -> float:
+        """Compute |⟨target|current⟩|² on GPU."""
+        with cp.cuda.Device(self.device_id):
+            # As above: ``target_conj`` is pre-conjugated, so ``cp.dot``
+            # (not ``cp.vdot``) gives ⟨target|current⟩.
+            overlap = cp.dot(target_conj, self.d_sv)
+            fidelity = float(overlap.real**2 + overlap.imag**2)
+        return fidelity
+
+    def get_stats(self) -> dict[str, Any]:
+        """Return simulator statistics."""
+        return {
+            "n_qubits": self.n_qubits,
+            "n_states": self.n_states,
+            "device_id": self.device_id,
+            "n_gate_applications": self.n_gate_applications,
+            "n_resets": self.n_resets,
+            "gates_per_circuit": (
+                self.n_gate_applications / self.n_resets if self.n_resets > 0 else 0
+            ),
+        }
+
+    def destroy(self) -> None:
+        """Clean up cuStateVec resources."""
+        if self.handle is not None:
+            with cp.cuda.Device(self.device_id):
+                cusv.destroy(self.handle)
+            self.handle = None
+
+    def __del__(self):
+        """Destructor to ensure cleanup."""
+        self.destroy()
+
+
+class CuStateVecEvaluator:
+    """
+    High-performance circuit evaluator using cuStateVec.
+
+    Implements the default ansatz directly with cuStateVec gates,
+    avoiding Qiskit overhead entirely.
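+
+    Examples
+    --------
+    A minimal sketch (assumes a valid ``OptimizerConfig`` and a normalized
+    target statevector of length ``2**config.n_qubits``):
+
+    >>> evaluator = CuStateVecEvaluator(config, target)
+    >>> fidelity = evaluator.compute_fidelity(params)
+    >>> evaluator.cleanup()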
+ """
259
+
260
+ def __init__(
261
+ self,
262
+ config: "OptimizerConfig",
263
+ target: np.ndarray,
264
+ device_id: int = 0,
265
+ ):
266
+ if not HAS_CUSTATEVEC:
267
+ raise RuntimeError("cuStateVec not available")
268
+
269
+ self.config = config
270
+ self.n_qubits = config.n_qubits
271
+ self.n_params = config.n_params
272
+ self.ansatz = config.ansatz
273
+ self.device_id = device_id
274
+
275
+ # Check if ansatz has custatevec-native implementation
276
+ self._has_native_cusv = hasattr(self.ansatz, "apply_custatevec") if self.ansatz else False
277
+
278
+ # Initialize cuStateVec simulator
279
+ precision = "double" if config.gpu_precision == "double" else "single"
280
+ self.simulator = CuStateVecSimulator(
281
+ config.n_qubits,
282
+ precision=precision,
283
+ device_id=device_id,
284
+ )
285
+
286
+ target_reversed = self._bit_reverse_statevector(target, config.n_qubits)
287
+
288
+ with cp.cuda.Device(device_id):
289
+ self.target_gpu = cp.array(target_reversed, dtype=cp.complex128)
290
+ self.target_conj_gpu = cp.conj(self.target_gpu)
291
+
292
+ # Also keep original target for plotting reference
293
+ self._target_original = target
294
+
295
+ # Statistics
296
+ self.n_circuits_evaluated = 0
297
+ self.n_fidelity_computations = 0
298
+
299
+ def _bit_reverse_statevector(self, sv: np.ndarray, n_qubits: int) -> np.ndarray:
300
+ """Reverse bit order of statevector indices to match cuStateVec convention."""
+        n_states = len(sv)
+        result = np.zeros_like(sv)
+        for i in range(n_states):
+            reversed_i = int(format(i, f"0{n_qubits}b")[::-1], 2)
+            result[reversed_i] = sv[i]
+        return result
+
+    def apply_ansatz(self, params: np.ndarray) -> None:
+        """Apply the ansatz circuit to the simulator."""
+        if self._has_native_cusv:
+            self.ansatz.apply_custatevec(self.simulator, params)
+        else:
+            self._apply_default_ansatz(params)
+        self.n_circuits_evaluated += 1
+
+    def _apply_default_ansatz(self, params: np.ndarray) -> None:
+        """Apply default ansatz (Ollitrault-style) directly with cuStateVec."""
+        n = self.n_qubits
+        D2 = n
+
+        self.simulator.reset_state()
+        self.simulator.apply_x(n - 1)
+
+        for i in range(n):
+            self.simulator.apply_ry(float(params[i]), i)
+
+        for d in range(D2 - 1):
+            for i in range(n - 1):
+                self.simulator.apply_cnot(i, i + 1)
+            for i in range(n):
+                param_idx = n + n * d + i
+                self.simulator.apply_ry(float(params[param_idx]), i)
+
+    def get_statevector(self, params: np.ndarray) -> np.ndarray:
+        """Apply ansatz and return statevector on CPU."""
+        self.apply_ansatz(params)
+        return self.simulator.get_statevector_cpu()
+
+    def get_statevector_gpu(self, params: np.ndarray) -> "cp.ndarray":
+        """Apply ansatz and return statevector on GPU."""
+        self.apply_ansatz(params)
+        return self.simulator.get_statevector_gpu()
+
+    def compute_fidelity(self, params: np.ndarray) -> float:
+        """Compute fidelity entirely on GPU."""
+        self.apply_ansatz(params)
+        fidelity = self.simulator.compute_fidelity_gpu(self.target_conj_gpu)
+        self.n_fidelity_computations += 1
+        return fidelity
+
+    def compute_fidelity_from_statevector(self, psi_gpu: "cp.ndarray") -> float:
+        """Compute fidelity from GPU statevector."""
+        with cp.cuda.Device(self.device_id):
+            # ``target_conj_gpu`` is already conjugated, so avoid ``cp.vdot``
+            # (it would conjugate its first argument a second time).
+            overlap = cp.dot(self.target_conj_gpu, psi_gpu)
+        return float(overlap.real**2 + overlap.imag**2)
+
+    def get_statevector_qiskit_order(self, params: np.ndarray) -> np.ndarray:
+        """Get statevector in Qiskit ordering (bit-reversed from cuStateVec)."""
+        self.apply_ansatz(params)
+        sv_cusv = self.simulator.get_statevector_cpu()
+        # Convert from cuStateVec convention back to Qiskit convention
+        return self._bit_reverse_statevector(sv_cusv, self.n_qubits)
+
+    def get_stats(self) -> dict:
+        """Return evaluator statistics."""
+        sim_stats = self.simulator.get_stats()
+        return {
+            **sim_stats,
+            "n_circuits_evaluated": self.n_circuits_evaluated,
+            "n_fidelity_computations": self.n_fidelity_computations,
+        }
+
+    def cleanup(self) -> None:
+        """Release GPU resources."""
+        if self.simulator is not None:
+            self.simulator.destroy()
+            self.simulator = None
+
+
+class BatchedCuStateVecEvaluator:
+    """
+    Batched evaluation using multiple cuStateVec simulators on a single GPU.
+
+    For population-based methods (CMA-ES, etc.), we can evaluate
+    multiple circuits by:
+    1. Using multiple simulators with reused pre-allocated GPU memory
+    2. Round-robin distribution across simulators
+    3. Batched fidelity computations
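+
+    Examples
+    --------
+    A minimal sketch (assumes a valid ``OptimizerConfig``, a normalized
+    target statevector, and a ``(batch_size, n_params)`` array):
+
+    >>> evaluator = BatchedCuStateVecEvaluator(config, target, n_simulators=4)
+    >>> fidelities = evaluator.evaluate_batch(params_batch)
+    >>> gradient = evaluator.compute_gradient_batched(params)
+    >>> evaluator.cleanup()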
+ """
389
+
390
+ def __init__(
391
+ self,
392
+ config: "OptimizerConfig",
393
+ target: np.ndarray,
394
+ n_simulators: int = 4,
395
+ device_id: int = 0,
396
+ ):
397
+ if not HAS_CUSTATEVEC:
398
+ raise RuntimeError("cuStateVec not available")
399
+
400
+ self.config = config
401
+ self.n_qubits = config.n_qubits
402
+ self.n_params = config.n_params
403
+ self.n_simulators = n_simulators
404
+ self.device_id = device_id
405
+
406
+ precision = "double" if config.gpu_precision == "double" else "single"
407
+ self.simulators = [
408
+ CuStateVecSimulator(config.n_qubits, precision=precision, device_id=device_id)
409
+ for _ in range(n_simulators)
410
+ ]
411
+
412
+ # Bit-reverse target to match cuStateVec convention
413
+ target_reversed = self._bit_reverse_statevector(target, config.n_qubits)
414
+
415
+ with cp.cuda.Device(device_id):
416
+ self.target_gpu = cp.array(target_reversed, dtype=cp.complex128)
417
+ self.target_conj_gpu = cp.conj(self.target_gpu)
418
+ self.batch_fidelities = cp.zeros(config.custatevec_batch_size, dtype=cp.float64)
419
+
420
+ self.n_batches = 0
421
+ self.n_circuits_total = 0
422
+
423
+ def _bit_reverse_statevector(self, sv: np.ndarray, n_qubits: int) -> np.ndarray:
424
+ """Reverse bit order of statevector indices to match cuStateVec convention."""
425
+ n_states = len(sv)
426
+ result = np.zeros_like(sv)
427
+ for i in range(n_states):
428
+ reversed_i = int(format(i, f"0{n_qubits}b")[::-1], 2)
429
+ result[reversed_i] = sv[i]
430
+ return result
431
+
432
+ def _apply_ansatz_to_simulator(
433
+ self,
434
+ simulator: CuStateVecSimulator,
435
+ params: ParameterArray,
436
+ ) -> None:
437
+ """Apply ansatz to a specific simulator."""
438
+ n = self.n_qubits
439
+ D2 = n
440
+
441
+ simulator.reset_state()
442
+ simulator.apply_x(n - 1)
443
+
444
+ for i in range(n):
445
+ simulator.apply_ry(float(params[i]), i)
446
+
447
+ for d in range(D2 - 1):
448
+ for i in range(n - 1):
449
+ simulator.apply_cnot(i, i + 1)
450
+ for i in range(n):
451
+ param_idx = n + n * d + i
452
+ simulator.apply_ry(float(params[param_idx]), i)
453
+
454
+ def get_statevector_qiskit_order(self, params: np.ndarray) -> np.ndarray:
455
+ """Get statevector in Qiskit ordering (bit-reversed from cuStateVec)."""
456
+ self._apply_ansatz_to_simulator(self.simulators[0], params)
457
+ sv_cusv = self.simulators[0].get_statevector_cpu()
458
+ return self._bit_reverse_statevector(sv_cusv, self.n_qubits)
459
+
460
+ def evaluate_batch(self, params_batch: NDArray[np.float64]) -> FloatArray:
461
+ """
462
+ Evaluate fidelities for a batch of parameter sets.
463
+
464
+ Uses round-robin distribution across simulators.
465
+ """
466
+ batch_size = len(params_batch)
467
+ fidelities = np.zeros(batch_size, dtype=np.float64)
468
+
469
+ for i, params in enumerate(params_batch):
470
+ sim_idx = i % self.n_simulators
471
+ simulator = self.simulators[sim_idx]
472
+
473
+ self._apply_ansatz_to_simulator(simulator, params)
474
+ fidelities[i] = simulator.compute_fidelity_gpu(self.target_conj_gpu)
475
+
476
+ self.n_batches += 1
477
+ self.n_circuits_total += batch_size
478
+
479
+ return fidelities
480
+
481
+ def evaluate_batch_chunked(
482
+ self,
483
+ params_batch: NDArray[np.float64],
484
+ chunk_size: Optional[int] = None,
485
+ ) -> FloatArray:
486
+ """Evaluate large batch in chunks for memory efficiency."""
487
+ if chunk_size is None:
488
+ chunk_size = self.config.custatevec_batch_size
489
+
490
+ batch_size = len(params_batch)
491
+ fidelities = np.zeros(batch_size, dtype=np.float64)
492
+
493
+ for start in range(0, batch_size, chunk_size):
494
+ end = min(start + chunk_size, batch_size)
495
+ fidelities[start:end] = self.evaluate_batch(params_batch[start:end])
496
+
497
+ return fidelities
498
+
499
+ def compute_gradient_batched(self, params: ParameterArray) -> FloatArray:
500
+ """
501
+ Compute gradient using batched parameter-shift evaluation.
502
+
503
+ All 2*n_params shifted circuits evaluated in batch.
504
+ """
+        shift = np.pi / 2
+        n_params = self.n_params
+
+        params_shifted = np.zeros((2 * n_params, n_params), dtype=np.float64)
+
+        for i in range(n_params):
+            params_shifted[2 * i] = params.copy()
+            params_shifted[2 * i, i] += shift
+
+            params_shifted[2 * i + 1] = params.copy()
+            params_shifted[2 * i + 1, i] -= shift
+
+        fidelities = self.evaluate_batch(params_shifted)
+
+        gradient = np.zeros(n_params, dtype=np.float64)
+        for i in range(n_params):
+            gradient[i] = (fidelities[2 * i] - fidelities[2 * i + 1]) / 2
+
+        # Negate: optimizers minimize the cost -fidelity
+        return -gradient
+
+    def get_stats(self) -> dict:
+        """Return batch evaluator statistics."""
+        total_gate_apps = sum(s.n_gate_applications for s in self.simulators)
+        return {
+            "n_simulators": self.n_simulators,
+            "device_id": self.device_id,
+            "n_batches": self.n_batches,
+            "n_circuits_total": self.n_circuits_total,
+            "total_gate_applications": total_gate_apps,
+            "avg_circuits_per_batch": (
+                self.n_circuits_total / self.n_batches if self.n_batches > 0 else 0
+            ),
+        }
+
+    def cleanup(self) -> None:
+        """Release all GPU resources."""
+        for sim in self.simulators:
+            sim.destroy()
+        self.simulators = []
+
+
+class MultiGPUBatchEvaluator:
+    """
+    Distribute circuit evaluations across multiple GPUs.
+
+    Each GPU gets its own set of CuStateVecSimulators, and work is
+    distributed across GPUs in chunks.
+
+    Parameters
+    ----------
+    config : OptimizerConfig
+        Optimizer configuration
+    target : np.ndarray
+        Target wavefunction
+    device_ids : list of int, optional
+        GPU device IDs to use. If None, auto-detects all available GPUs.
+    simulators_per_gpu : int
+        Number of simulators per GPU for round-robin evaluation
+
+    Example
+    -------
+    >>> evaluator = MultiGPUBatchEvaluator(
+    ...     config, target,
+    ...     device_ids=[0, 1, 2, 3],
+    ...     simulators_per_gpu=2,
+    ... )
+    >>> fidelities = evaluator.evaluate_batch_parallel(params_batch)
+    """
+
+    def __init__(
+        self,
+        config: "OptimizerConfig",
+        target: np.ndarray,
+        device_ids: Optional[list[int]] = None,
+        simulators_per_gpu: int = 2,
+    ):
+        if not HAS_CUSTATEVEC:
+            raise RuntimeError("cuStateVec not available")
+
+        self.config = config
+        self.n_qubits = config.n_qubits
+        self.n_params = config.n_params
+
+        # Auto-detect GPUs if not specified
+        if device_ids is None:
+            device_ids = list(range(cp.cuda.runtime.getDeviceCount()))
+
+        self.device_ids = device_ids
+        self.n_gpus = len(device_ids)
+        self.simulators_per_gpu = simulators_per_gpu
+
+        if self.n_gpus == 0:
+            raise RuntimeError("No GPUs available")
+
+        logger.info(f"Initializing Multi-GPU evaluator with {self.n_gpus} GPUs: {device_ids}")
+
+        precision = "double" if config.gpu_precision == "double" else "single"
+
+        # Bit-reverse target to match cuStateVec convention
+        target_reversed = self._bit_reverse_statevector(target, config.n_qubits)
+
+        # Create simulators and target arrays on each GPU
+        self.simulators: list[list[CuStateVecSimulator]] = []
+        self.target_conj_gpu: list[cp.ndarray] = []
+
+        for gpu_id in device_ids:
+            with cp.cuda.Device(gpu_id):
+                # Create simulators for this GPU
+                gpu_sims = [
+                    CuStateVecSimulator(self.n_qubits, precision, device_id=gpu_id)
+                    for _ in range(simulators_per_gpu)
+                ]
+                self.simulators.append(gpu_sims)
+
+                # Copy target to this GPU
+                target_gpu = cp.array(target_reversed, dtype=cp.complex128)
+                self.target_conj_gpu.append(cp.conj(target_gpu))
+
+        self.total_simulators = self.n_gpus * simulators_per_gpu
+
+        # Statistics
+        self.n_batches = 0
+        self.n_circuits_evaluated = 0
+
+    def _bit_reverse_statevector(self, sv: np.ndarray, n_qubits: int) -> np.ndarray:
+        """Reverse bit order of statevector indices."""
+        n_states = len(sv)
+        result = np.zeros_like(sv)
+        for i in range(n_states):
+            reversed_i = int(format(i, f"0{n_qubits}b")[::-1], 2)
+            result[reversed_i] = sv[i]
+        return result
+
+    def _apply_ansatz_to_simulator(
+        self,
+        simulator: CuStateVecSimulator,
+        params: np.ndarray,
+    ) -> None:
+        """Apply default ansatz to a simulator."""
+        n = self.n_qubits
+        D2 = n
+
+        simulator.reset_state()
+        simulator.apply_x(n - 1)
+
+        for i in range(n):
+            simulator.apply_ry(float(params[i]), i)
+
+        for d in range(D2 - 1):
+            for i in range(n - 1):
+                simulator.apply_cnot(i, i + 1)
+            for i in range(n):
+                param_idx = n + n * d + i
+                simulator.apply_ry(float(params[param_idx]), i)
+
+    def evaluate_batch(self, params_batch: np.ndarray) -> np.ndarray:
+        """
+        Evaluate a batch distributed across GPUs sequentially.
+
+        For parallel execution, use evaluate_batch_parallel().
+        """
+        batch_size = len(params_batch)
+        fidelities = np.zeros(batch_size, dtype=np.float64)
+
+        # Split batch across GPUs
+        chunk_size = (batch_size + self.n_gpus - 1) // self.n_gpus
+
+        for gpu_idx, gpu_id in enumerate(self.device_ids):
+            start = gpu_idx * chunk_size
+            end = min(start + chunk_size, batch_size)
+
+            if start >= batch_size:
+                break
+
+            with cp.cuda.Device(gpu_id):
+                gpu_sims = self.simulators[gpu_idx]
+                target_conj = self.target_conj_gpu[gpu_idx]
+
+                for i, params in enumerate(params_batch[start:end]):
+                    sim_idx = i % len(gpu_sims)
+                    sim = gpu_sims[sim_idx]
+
+                    self._apply_ansatz_to_simulator(sim, params)
+                    fidelities[start + i] = sim.compute_fidelity_gpu(target_conj)
+
+        self.n_batches += 1
+        self.n_circuits_evaluated += batch_size
+        return fidelities
+
+    def evaluate_batch_parallel(self, params_batch: np.ndarray) -> np.ndarray:
+        """
+        Evaluate a batch with parallel GPU execution using threads.
+
+        Each GPU processes its chunk in a separate thread for maximum throughput.
+        """
+        batch_size = len(params_batch)
+        fidelities = np.zeros(batch_size, dtype=np.float64)
+        chunk_size = (batch_size + self.n_gpus - 1) // self.n_gpus
+
+        def process_gpu_chunk(gpu_idx: int) -> None:
+            gpu_id = self.device_ids[gpu_idx]
+            start = gpu_idx * chunk_size
+            end = min(start + chunk_size, batch_size)
+
+            if start >= batch_size:
+                return
+
+            with cp.cuda.Device(gpu_id):
+                gpu_sims = self.simulators[gpu_idx]
+                target_conj = self.target_conj_gpu[gpu_idx]
+
+                for i, params in enumerate(params_batch[start:end]):
+                    sim_idx = i % len(gpu_sims)
+                    sim = gpu_sims[sim_idx]
+
+                    self._apply_ansatz_to_simulator(sim, params)
+                    fidelities[start + i] = sim.compute_fidelity_gpu(target_conj)
+
+        # Process all GPUs in parallel using a thread pool
+        with concurrent.futures.ThreadPoolExecutor(max_workers=self.n_gpus) as executor:
+            futures = [executor.submit(process_gpu_chunk, i) for i in range(self.n_gpus)]
+            concurrent.futures.wait(futures)
+
+        # Re-raise the first worker exception, if any
+        for future in futures:
+            if future.exception() is not None:
+                raise future.exception()
+
+        self.n_batches += 1
+        self.n_circuits_evaluated += batch_size
+        return fidelities
+
+    def compute_gradient_parallel(self, params: np.ndarray) -> np.ndarray:
+        """Compute gradient using parallel multi-GPU evaluation."""
+        shift = np.pi / 2
+        n_params = self.n_params
+
+        # Build all shifted parameter sets
+        params_shifted = np.zeros((2 * n_params, n_params), dtype=np.float64)
+
+        for i in range(n_params):
+            params_shifted[2 * i] = params.copy()
+            params_shifted[2 * i, i] += shift
+
+            params_shifted[2 * i + 1] = params.copy()
+            params_shifted[2 * i + 1, i] -= shift
+
+        # Parallel evaluation across GPUs
+        fidelities = self.evaluate_batch_parallel(params_shifted)
+
+        # Compute gradients (parameter-shift rule, as above)
+        gradient = np.zeros(n_params, dtype=np.float64)
+        for i in range(n_params):
+            gradient[i] = (fidelities[2 * i] - fidelities[2 * i + 1]) / 2
+
+        # Negate: optimizers minimize the cost -fidelity
+        return -gradient
+
+    def get_stats(self) -> dict:
+        """Return multi-GPU evaluator statistics."""
+        total_gate_apps = sum(
+            sim.n_gate_applications for gpu_sims in self.simulators for sim in gpu_sims
+        )
+        return {
+            "n_gpus": self.n_gpus,
+            "device_ids": self.device_ids,
+            "simulators_per_gpu": self.simulators_per_gpu,
+            "total_simulators": self.total_simulators,
+            "n_batches": self.n_batches,
+            "n_circuits_evaluated": self.n_circuits_evaluated,
+            "total_gate_applications": total_gate_apps,
+        }
+
+    def cleanup(self) -> None:
+        """Release all GPU resources."""
+        for gpu_sims in self.simulators:
+            for sim in gpu_sims:
+                sim.destroy()
+        self.simulators = []
+        self.target_conj_gpu = []