pyopencl-2025.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.

Potentially problematic release: this version of pyopencl might be problematic.

Files changed (43)
  1. pyopencl/.libs/libOpenCL-b54a1ea0.so.1.0.0 +0 -0
  2. pyopencl/__init__.py +2410 -0
  3. pyopencl/_cl.cpython-313-x86_64-linux-gnu.so +0 -0
  4. pyopencl/_cluda.py +54 -0
  5. pyopencl/_mymako.py +14 -0
  6. pyopencl/algorithm.py +1449 -0
  7. pyopencl/array.py +3362 -0
  8. pyopencl/bitonic_sort.py +242 -0
  9. pyopencl/bitonic_sort_templates.py +594 -0
  10. pyopencl/cache.py +535 -0
  11. pyopencl/capture_call.py +177 -0
  12. pyopencl/characterize/__init__.py +456 -0
  13. pyopencl/characterize/performance.py +237 -0
  14. pyopencl/cl/pyopencl-airy.cl +324 -0
  15. pyopencl/cl/pyopencl-bessel-j-complex.cl +238 -0
  16. pyopencl/cl/pyopencl-bessel-j.cl +1084 -0
  17. pyopencl/cl/pyopencl-bessel-y.cl +435 -0
  18. pyopencl/cl/pyopencl-complex.h +303 -0
  19. pyopencl/cl/pyopencl-eval-tbl.cl +120 -0
  20. pyopencl/cl/pyopencl-hankel-complex.cl +444 -0
  21. pyopencl/cl/pyopencl-random123/array.h +325 -0
  22. pyopencl/cl/pyopencl-random123/openclfeatures.h +93 -0
  23. pyopencl/cl/pyopencl-random123/philox.cl +486 -0
  24. pyopencl/cl/pyopencl-random123/threefry.cl +864 -0
  25. pyopencl/clmath.py +280 -0
  26. pyopencl/clrandom.py +409 -0
  27. pyopencl/cltypes.py +137 -0
  28. pyopencl/compyte/.gitignore +21 -0
  29. pyopencl/compyte/__init__.py +0 -0
  30. pyopencl/compyte/array.py +214 -0
  31. pyopencl/compyte/dtypes.py +290 -0
  32. pyopencl/compyte/pyproject.toml +54 -0
  33. pyopencl/elementwise.py +1171 -0
  34. pyopencl/invoker.py +421 -0
  35. pyopencl/ipython_ext.py +68 -0
  36. pyopencl/reduction.py +786 -0
  37. pyopencl/scan.py +1915 -0
  38. pyopencl/tools.py +1527 -0
  39. pyopencl/version.py +9 -0
  40. pyopencl-2025.1.dist-info/METADATA +108 -0
  41. pyopencl-2025.1.dist-info/RECORD +43 -0
  42. pyopencl-2025.1.dist-info/WHEEL +6 -0
  43. pyopencl-2025.1.dist-info/licenses/LICENSE +104 -0
pyopencl/characterize/__init__.py
@@ -0,0 +1,456 @@
+ __copyright__ = "Copyright (C) 2009 Andreas Kloeckner"
+
+ __license__ = """
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+ """
+
+ from typing import Dict, Optional, Tuple
+
+ from pytools import memoize
+
+ import pyopencl as cl
+
+
+ class CLCharacterizationWarning(UserWarning):
+     pass
+
+
+ @memoize
+ def has_double_support(dev):
+     for ext in dev.extensions.split(" "):
+         if ext == "cl_khr_fp64":
+             return True
+     return False
+
+
+ def has_amd_double_support(dev):
+     """Fix to allow incomplete amd double support in low end boards"""
+
+     for ext in dev.extensions.split(" "):
+         if ext == "cl_amd_fp64":
+             return True
+     return False
+
+
+ def reasonable_work_group_size_multiple(dev, ctx=None):
+     try:
+         return dev.warp_size_nv
+     except Exception:
+         pass
+
+     if ctx is None:
+         ctx = cl.Context([dev])
+     prg = cl.Program(ctx, """
+         __kernel void knl(__global float *a)
+         {
+             a[get_global_id(0)] = 0;
+         }
+         """)
+     prg.build()
+     return prg.knl.get_work_group_info(
+             cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
+             dev)
+
+
+ def nv_compute_capability(dev):
+     """If *dev* is an Nvidia GPU :class:`pyopencl.Device`, return a tuple
+     *(major, minor)* indicating the device's compute capability.
+     """
+
+     try:
+         return (dev.compute_capability_major_nv,
+                 dev.compute_capability_minor_nv)
+     except Exception:
+         return None
+
+
+ def usable_local_mem_size(dev, nargs=None):
+     """Return an estimate of the usable local memory size.
+     :arg nargs: Number of 32-bit arguments passed.
+     """
+
+     usable_local_mem_size = dev.local_mem_size
+
+     nv_compute_cap = nv_compute_capability(dev)
+
+     if (nv_compute_cap is not None
+             and nv_compute_cap < (2, 0)):
+         # pre-Fermi use local mem for parameter passing
+         if nargs is None:
+             # assume maximum
+             usable_local_mem_size -= 256
+         else:
+             usable_local_mem_size -= 4*nargs
+
+     return usable_local_mem_size
+
+
+ def simultaneous_work_items_on_local_access(dev):
+     """Return the number of work items that access local
+     memory simultaneously and thereby may conflict with
+     each other.
+     """
+     nv_compute_cap = nv_compute_capability(dev)
+
+     if nv_compute_cap is not None:
+         if nv_compute_cap < (2, 0):
+             return 16
+         else:
+             if nv_compute_cap >= (3, 0):
+                 from warnings import warn
+                 warn(
+                     f"Wildly guessing conflicting local access size on '{dev}'",
+                     CLCharacterizationWarning, stacklevel=2)
+
+             return 32
+
+     if dev.type & cl.device_type.GPU:
+         from warnings import warn
+         warn(
+             f"Wildly guessing conflicting local access size on '{dev}'",
+             CLCharacterizationWarning, stacklevel=2)
+         return 16
+     elif dev.type & cl.device_type.CPU:
+         return 1
+     else:
+         from warnings import warn
+         warn(
+             f"Wildly guessing conflicting local access size on '{dev}'",
+             CLCharacterizationWarning, stacklevel=2)
+         return 16
+
+
+ def local_memory_access_granularity(dev):
+     """Return the number of bytes per bank in local memory."""
+     return 4
+
+
+ def local_memory_bank_count(dev):
+     """Return the number of banks present in local memory.
+     """
+     nv_compute_cap = nv_compute_capability(dev)
+
+     if nv_compute_cap is not None:
+         if nv_compute_cap < (2, 0):
+             return 16
+         else:
+             if nv_compute_cap >= (3, 0):
+                 from warnings import warn
+                 warn(
+                     f"Wildly guessing local memory bank count on '{dev}'",
+                     CLCharacterizationWarning, stacklevel=2)
+
+             return 32
+
+     if dev.type & cl.device_type.GPU:
+         from warnings import warn
+         warn(
+             f"Wildly guessing local memory bank count on '{dev}'",
+             CLCharacterizationWarning, stacklevel=2)
+         return 16
+     elif dev.type & cl.device_type.CPU:
+         if dev.local_mem_type == cl.device_local_mem_type.GLOBAL:
+             raise RuntimeError("asking for a bank count is "
+                     "meaningless for cache-based lmem")
+
+     from warnings import warn
+     warn(
+         f"Wildly guessing conflicting local access size on '{dev}'",
+         CLCharacterizationWarning, stacklevel=2)
+     return 16
+
+
+ def why_not_local_access_conflict_free(dev, itemsize,
+         array_shape, array_stored_shape=None):
+     """
+     :param itemsize: size of accessed data in bytes
+     :param array_shape: array dimensions, fastest-moving last
+         (C order)
+
+     :returns: a tuple (multiplicity, explanation), where *multiplicity*
+         is the number of work items that will conflict on a bank when accessing
+         local memory. *explanation* is a string detailing the found conflict.
+     """
+     # FIXME: Treat 64-bit access on NV CC 2.x + correctly
+
+     if array_stored_shape is None:
+         array_stored_shape = array_shape
+
+     rank = len(array_shape)
+
+     array_shape = array_shape[::-1]
+     array_stored_shape = array_stored_shape[::-1]
+
+     gran = local_memory_access_granularity(dev)
+     if itemsize != gran:
+         from warnings import warn
+         warn(
+             f"Local conflict info might be inaccurate for itemsize != {gran}",
+             CLCharacterizationWarning, stacklevel=2)
+
+     sim_wi = simultaneous_work_items_on_local_access(dev)
+     bank_count = local_memory_bank_count(dev)
+
+     conflicts = []
+
+     for work_item_axis in range(rank):
+
+         bank_accesses = {}
+         for work_item_id in range(sim_wi):
+             addr = 0
+             addr_mult = itemsize
+
+             idx = []
+             left_over_idx = work_item_id
+             for axis, (ax_size, ax_stor_size) in enumerate(
+                     zip(array_shape, array_stored_shape)):
+
+                 if axis >= work_item_axis:
+                     left_over_idx, ax_idx = divmod(left_over_idx, ax_size)
+                     addr += addr_mult*ax_idx
+                     idx.append(ax_idx)
+                 else:
+                     idx.append(0)
+
+                 addr_mult *= ax_stor_size
+
+             if left_over_idx:
+                 # out-of-bounds, assume not taking place
+                 continue
+
+             bank = (addr // gran) % bank_count
+             bank_accesses.setdefault(bank, []).append(
+                     "w.item {} -> {}".format(work_item_id, idx[::-1]))
+
+         conflict_multiplicity = max(
+                 len(acc) for acc in bank_accesses.values())
+
+         if conflict_multiplicity > 1:
+             for bank, acc in bank_accesses.items():
+                 if len(acc) == conflict_multiplicity:
+                     conflicts.append(
+                             (conflict_multiplicity,
+                                 "%dx conflict on axis %d (from right, 0-based): "
+                                 "%s access bank %d" % (
+                                     conflict_multiplicity,
+                                     work_item_axis,
+                                     ", ".join(acc), bank)))
+
+     if conflicts:
+         return max(conflicts)
+     else:
+         return 1, None
+
+
+ def get_fast_inaccurate_build_options(dev):
+     """Return a list of flags valid on device *dev* that enable fast, but
+     potentially inaccurate floating point math.
+     """
+     result = ["-cl-mad-enable", "-cl-fast-relaxed-math",
+         "-cl-no-signed-zeros", ]
+     if dev.vendor.startswith("Advanced Micro") or dev.vendor.startswith("NVIDIA"):
+         result.append("-cl-strict-aliasing")
+     return result
+
+
+ def get_simd_group_size(dev, type_size):
+     """Return an estimate of how many work items will be executed across SIMD
+     lanes. This returns the size of what Nvidia calls a warp and what AMD calls
+     a wavefront.
+
+     Only refers to implicit SIMD.
+
+     :arg type_size: number of bytes in vector entry type.
+     """
+     try:
+         return dev.warp_size_nv
+     except Exception:
+         pass
+
+     lc_plat_vendor = dev.platform.vendor.lower()
+     lc_dev_vendor = dev.vendor.lower()
+     if "nvidia" in lc_plat_vendor or "nvidia" in lc_dev_vendor:
+         return 32
+
+     if ("advanced micro" in lc_plat_vendor or "ati" in lc_plat_vendor
+             or "advanced micro" in lc_dev_vendor or "ati" in lc_dev_vendor):
+         if dev.type & cl.device_type.GPU:
+             # Tomasz Rybak says, in response to reduction misbehaving on the AMD
+             # 'Loveland' APU:
+             #
+             # Like in CUDA reduction bug (related to Fermi) it again seems
+             # to be related to too eager concurrency when reducing results.
+             # According to http://oscarbg.blogspot.com/2009/10/news-from-web.html
+             # "Actually the wavefront size is only 64 for the highend cards(48XX,
+             # 58XX, 57XX), but 32 for the middleend cards and 16 for the lowend
+             # cards."
+             # IMO we should use PREFERRED_WORK_GROUP_SIZE_MULTIPLE to get
+             # non_sync_size. At the same size we lose SIMD CPU optimisation,
+             # but I do not know for now how to fix those two at the same time.
+             # Attached patch fixes problem on Loveland, not breaking anything on
+             # NVIDIA ION.
+
+             # This is therefore our best guess as to the SIMD group size.
+
+             return reasonable_work_group_size_multiple(dev)
+         elif dev.type & cl.device_type.CPU:
+             return 1
+         else:
+             raise RuntimeError("unexpected AMD device type")
+
+     if dev.type & cl.device_type.CPU:
+         # implicit assumption: Impl. will vectorize
+         return 1
+
+     return None
+
+
+ def get_pocl_version(
+         platform: cl.Platform,
+         fallback_value: Optional[Tuple[int, int]] = None
+         ) -> Optional[Tuple[int, int]]:
+     if platform.name != "Portable Computing Language":
+         return None
+
+     import re
+     version = platform.version
+     ver_match = re.match(
+             r"^OpenCL [0-9.]+ [Pp]o[Cc][Ll] ([0-9]+)\.([0-9]+)", version)
+
+     if ver_match is None:
+         from warnings import warn
+         warn(f"PoCL version number did not have expected format: '{version}'",
+              stacklevel=2)
+         return fallback_value
+     else:
+         return (int(ver_match.group(1)), int(ver_match.group(2)))
+
+
+ _CHECK_FOR_POCL_ARG_COUNT_BUG_CACHE: Dict[cl.Device, bool] = {}
+
+
+ def _check_for_pocl_arg_count_bug(
+         dev: cl.Device,
+         ctx: Optional[cl.Context] = None) -> bool:
+     try:
+         return _CHECK_FOR_POCL_ARG_COUNT_BUG_CACHE[dev]
+     except KeyError:
+         pass
+
+     if ctx is None:
+         build_ctx = cl.Context([dev])
+     else:
+         build_ctx = ctx
+
+     prg = cl.Program(build_ctx, """
+         struct two_things
+         {
+             long a;
+             long b;
+         };
+
+         __kernel void test_knl(struct two_things x)
+         {
+         }
+         """).build()
+
+     result = prg.test_knl.num_args == 2
+     _CHECK_FOR_POCL_ARG_COUNT_BUG_CACHE[dev] = result
+
+     return result
+
+
+ def has_struct_arg_count_bug(dev, ctx=None):
+     """Checks whether the device is expected to have the
+     `argument counting bug <https://github.com/pocl/pocl/issues/197>`__.
+     """
+
+     if dev.platform.name == "Apple" and dev.type & cl.device_type.CPU:
+         return "apple"
+     if dev.platform.name == "Portable Computing Language":
+         pocl_version = get_pocl_version(dev.platform, fallback_value=(0, 14))
+         if pocl_version <= (0, 13):
+             return "pocl"
+         elif pocl_version <= (0, 14) and _check_for_pocl_arg_count_bug(dev, ctx):
+             return "pocl"
+
+     return False
+
+
+ # {{{ SVM capabilities
+
+ def _may_have_svm(dev):
+     has_svm = (dev.platform._get_cl_version() >= (2, 0)
+             and cl.get_cl_header_version() >= (2, 0))
+
+     if dev.platform.name == "Portable Computing Language":
+         has_svm = (
+                 get_pocl_version(dev.platform) >= (1, 0)
+                 and cl.get_cl_header_version() >= (2, 0))
+
+     return has_svm
+
+
+ def has_coarse_grain_buffer_svm(dev):
+     return (_may_have_svm(dev)
+             and bool(dev.svm_capabilities
+                 & cl.device_svm_capabilities.COARSE_GRAIN_BUFFER))
+
+
+ def has_fine_grain_buffer_svm(dev):
+     return (_may_have_svm(dev)
+             and bool(dev.svm_capabilities
+                 & cl.device_svm_capabilities.FINE_GRAIN_BUFFER))
+
+
+ def has_fine_grain_system_svm(dev):
+     return (_may_have_svm(dev)
+             and bool(dev.svm_capabilities
+                 & cl.device_svm_capabilities.FINE_GRAIN_SYSTEM))
+
+
+ def has_fine_grain_buffer_svm_atomics(dev):
+     return has_fine_grain_buffer_svm(dev) and bool(dev.svm_capabilities
+             & cl.device_svm_capabilities.ATOMICS)
+
+
+ def has_fine_grain_system_svm_atomics(dev):
+     return has_fine_grain_system_svm(dev) and bool(dev.svm_capabilities
+             & cl.device_svm_capabilities.ATOMICS)
+
+ # }}}
+
+
+ def has_src_build_cache(dev: cl.Device) -> Optional[bool]:
+     """
+     Return *True* if *dev* has internal support for caching builds from source,
+     *False* if it doesn't, and *None* if unknown.
+     """
+     if dev.platform.name == "Portable Computing Language":
+         return True
+
+     if nv_compute_capability(dev) is not None:
+         return True
+
+     if dev.platform.name == "AMD Accelerated Parallel Processing":
+         return False
+
+     return None
+
+ # vim: foldmethod=marker
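
The functions added in pyopencl/characterize/__init__.py above can be exercised with a short sketch along these lines. This is illustrative only and not part of the packaged files; it assumes pyopencl is installed and that at least one OpenCL platform and device is available at runtime.

    import pyopencl as cl
    from pyopencl import characterize

    # Pick the first available device (assumes at least one platform exists).
    dev = cl.get_platforms()[0].get_devices()[0]

    print("double support:", characterize.has_double_support(dev))
    print("NV compute capability:", characterize.nv_compute_capability(dev))
    print("usable local mem (bytes):", characterize.usable_local_mem_size(dev))
    print("SIMD group size (float):", characterize.get_simd_group_size(dev, 4))
    print("fast-math flags:", characterize.get_fast_inaccurate_build_options(dev))
    print("coarse-grain buffer SVM:", characterize.has_coarse_grain_buffer_svm(dev))

All of these helpers degrade gracefully on non-Nvidia hardware: the NV-specific queries return None and the remaining functions fall back to vendor-string heuristics, as shown in the source above.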
pyopencl/characterize/performance.py
@@ -0,0 +1,237 @@
+ __copyright__ = "Copyright (C) 2009 Andreas Kloeckner"
+
+ __license__ = """
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+ """
+
+ import numpy as np
+
+ import pyopencl as cl
+
+
+ # {{{ timing helpers
+
+ class Timer:
+     def __init__(self, queue):
+         self.queue = queue
+
+     def start(self):
+         pass
+
+     def stop(self):
+         pass
+
+     def add_event(self, evt):
+         pass
+
+     def get_elapsed(self):
+         pass
+
+
+ class WallTimer(Timer):
+     def start(self):
+         from time import time
+         self.queue.finish()
+         self.start_time = time()
+
+     def stop(self):
+         from time import time
+         self.queue.finish()
+         self.end_time = time()
+
+     def get_elapsed(self):
+         return self.end_time-self.start_time
+
+
+ def _get_time(queue, f, timer_factory=None, desired_duration=0.1,
+         warmup_rounds=3):
+
+     if timer_factory is None:
+         timer_factory = WallTimer
+
+     count = 1
+
+     while True:
+         timer = timer_factory(queue)
+
+         for _i in range(warmup_rounds):
+             f()
+         warmup_rounds = 0
+
+         timer.start()
+         for _i in range(count):
+             timer.add_event(f())
+         timer.stop()
+
+         elapsed = timer.get_elapsed()
+         if elapsed < desired_duration:
+             if elapsed == 0:
+                 count *= 5
+             else:
+                 new_count = int(desired_duration/elapsed)
+
+                 new_count = max(2*count, new_count)
+                 new_count = min(10*count, new_count)
+                 count = new_count
+
+         else:
+             return elapsed/count
+
+ # }}}
+
+
+ # {{{ transfer measurements
+
+ class HostDeviceTransferBase:
+     def __init__(self, queue, block_size):
+         self.queue = queue
+         self.host_buf = np.empty(block_size, dtype=np.uint8)
+         self.dev_buf = cl.Buffer(queue.context, cl.mem_flags.READ_WRITE, block_size)
+
+
+ class HostToDeviceTransfer(HostDeviceTransferBase):
+     def do(self):
+         return cl.enqueue_copy(self.queue, self.dev_buf, self.host_buf)
+
+
+ class DeviceToHostTransfer(HostDeviceTransferBase):
+     def do(self):
+         return cl.enqueue_copy(self.queue, self.host_buf, self.dev_buf)
+
+
+ class DeviceToDeviceTransfer:
+     def __init__(self, queue, block_size):
+         self.queue = queue
+         mf = cl.mem_flags
+         self.dev_buf_1 = cl.Buffer(queue.context, mf.READ_WRITE, block_size)
+         self.dev_buf_2 = cl.Buffer(queue.context, mf.READ_WRITE, block_size)
+
+     def do(self):
+         return cl.enqueue_copy(self.queue, self.dev_buf_2, self.dev_buf_1)
+
+
+ def transfer_latency(queue, transfer_type, timer_factory=None):
+     transfer = transfer_type(queue, 1)
+     return _get_time(queue, transfer.do, timer_factory=timer_factory)
+
+
+ def transfer_bandwidth(queue, transfer_type, block_size, timer_factory=None):
+     """Measures one-sided bandwidth."""
+
+     transfer = transfer_type(queue, block_size)
+     return block_size/_get_time(queue, transfer.do, timer_factory=timer_factory)
+
+ # }}}
+
+
+ def get_profiling_overhead(ctx, timer_factory=None):
+     no_prof_queue = cl.CommandQueue(ctx)
+     transfer = DeviceToDeviceTransfer(no_prof_queue, 1)
+     no_prof_time = _get_time(no_prof_queue, transfer.do, timer_factory=timer_factory)
+
+     prof_queue = cl.CommandQueue(ctx,
+             properties=cl.command_queue_properties.PROFILING_ENABLE)
+     transfer = DeviceToDeviceTransfer(prof_queue, 1)
+     prof_time = _get_time(prof_queue, transfer.do, timer_factory=timer_factory)
+
+     return prof_time - no_prof_time, prof_time
+
+
+ def get_empty_kernel_time(queue, timer_factory=None):
+     prg = cl.Program(queue.context, """
+         __kernel void empty()
+         { }
+         """).build()
+
+     knl = prg.empty
+
+     def f():
+         knl(queue, (1,), None)
+
+     return _get_time(queue, f, timer_factory=timer_factory)
+
+
+ def _get_full_machine_kernel_rate(queue, src, args, name="benchmark",
+         timer_factory=None):
+     prg = cl.Program(queue.context, src).build()
+
+     knl = getattr(prg, name)
+
+     dev = queue.device
+     global_size = 4 * dev.max_compute_units
+
+     def f():
+         knl(queue, (global_size,), None, *args)
+
+     rates = []
+     num_dips = 0
+
+     while True:
+         elapsed = _get_time(queue, f, timer_factory=timer_factory)
+         rate = global_size/elapsed
+
+         keep_trying = not rates
+
+         if rates and rate > 1.05*max(rates):  # big improvement
+             keep_trying = True
+             num_dips = 0
+
+         if rates and rate < 0.9*max(rates) and num_dips < 3:  # big dip
+             keep_trying = True
+             num_dips += 1
+
+         if keep_trying:
+             global_size *= 2
+             rates.append(rate)
+         else:
+             rates.append(rate)
+             return max(rates)
+
+
+ def get_add_rate(queue, type="float", timer_factory=None):
+     return 50*10*_get_full_machine_kernel_rate(queue, """
+         typedef %(op_t)s op_t;
+         __kernel void benchmark()
+         {
+             local op_t tgt[1024];
+             op_t val = get_global_id(0);
+
+             for (int i = 0; i < 10; ++i)
+             {
+                 val += val; val += val; val += val; val += val; val += val;
+                 val += val; val += val; val += val; val += val; val += val;
+
+                 val += val; val += val; val += val; val += val; val += val;
+                 val += val; val += val; val += val; val += val; val += val;
+
+                 val += val; val += val; val += val; val += val; val += val;
+                 val += val; val += val; val += val; val += val; val += val;
+
+                 val += val; val += val; val += val; val += val; val += val;
+                 val += val; val += val; val += val; val += val; val += val;
+
+                 val += val; val += val; val += val; val += val; val += val;
+                 val += val; val += val; val += val; val += val; val += val;
+             }
+             tgt[get_local_id(0)] = val;
+         }
+         """ % {"op_t": type}, ())
+
+
+ # vim: foldmethod=marker:filetype=pyopencl
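
For completeness, a minimal sketch of how the measurement helpers added in pyopencl/characterize/performance.py might be driven. It is illustrative only and not part of the packaged files; it assumes a working OpenCL context can be created on the machine running it.

    import pyopencl as cl
    from pyopencl.characterize import performance as perf

    ctx = cl.create_some_context()   # assumes an OpenCL device is available
    queue = cl.CommandQueue(ctx)

    # Latency and bandwidth of host<->device copies, using the transfer classes above.
    lat = perf.transfer_latency(queue, perf.HostToDeviceTransfer)
    bw = perf.transfer_bandwidth(queue, perf.DeviceToHostTransfer, 16 * 1024**2)
    print(f"H->D latency: {lat*1e6:.1f} us, D->H bandwidth: {bw/1e9:.2f} GB/s")

    # Launch overhead of an empty kernel and the cost of enabling profiling.
    print("empty kernel time:", perf.get_empty_kernel_time(queue))
    print("profiling overhead (delta, total):", perf.get_profiling_overhead(ctx))

Note that these helpers time with a wall-clock barrier (queue.finish) by default, via the WallTimer class above; a different Timer subclass can be supplied through the timer_factory arguments.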