pyopencl 2024.2.7__cp311-cp311-macosx_11_0_arm64.whl → 2025.1__cp311-cp311-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pyopencl might be problematic.

Files changed (40)
  1. pyopencl/__init__.py +127 -122
  2. pyopencl/_cl.cpython-311-darwin.so +0 -0
  3. pyopencl/_mymako.py +3 -3
  4. pyopencl/algorithm.py +10 -7
  5. pyopencl/array.py +58 -123
  6. pyopencl/bitonic_sort.py +3 -1
  7. pyopencl/bitonic_sort_templates.py +1 -1
  8. pyopencl/cache.py +23 -22
  9. pyopencl/capture_call.py +5 -4
  10. pyopencl/clrandom.py +1 -0
  11. pyopencl/cltypes.py +2 -2
  12. pyopencl/compyte/dtypes.py +4 -4
  13. pyopencl/compyte/pyproject.toml +54 -0
  14. pyopencl/elementwise.py +9 -2
  15. pyopencl/invoker.py +11 -9
  16. pyopencl/ipython_ext.py +1 -1
  17. pyopencl/reduction.py +16 -10
  18. pyopencl/scan.py +38 -22
  19. pyopencl/tools.py +23 -13
  20. pyopencl/version.py +1 -1
  21. {pyopencl-2024.2.7.dist-info → pyopencl-2025.1.dist-info}/METADATA +11 -8
  22. pyopencl-2025.1.dist-info/RECORD +42 -0
  23. {pyopencl-2024.2.7.dist-info → pyopencl-2025.1.dist-info}/WHEEL +1 -1
  24. pyopencl/compyte/.git +0 -1
  25. pyopencl/compyte/ndarray/Makefile +0 -31
  26. pyopencl/compyte/ndarray/__init__.py +0 -0
  27. pyopencl/compyte/ndarray/gen_elemwise.py +0 -1907
  28. pyopencl/compyte/ndarray/gen_reduction.py +0 -1511
  29. pyopencl/compyte/ndarray/gpu_ndarray.h +0 -35
  30. pyopencl/compyte/ndarray/pygpu_language.h +0 -207
  31. pyopencl/compyte/ndarray/pygpu_language_cuda.cu +0 -622
  32. pyopencl/compyte/ndarray/pygpu_language_opencl.cpp +0 -317
  33. pyopencl/compyte/ndarray/pygpu_ndarray.cpp +0 -1546
  34. pyopencl/compyte/ndarray/pygpu_ndarray.h +0 -71
  35. pyopencl/compyte/ndarray/pygpu_ndarray_object.h +0 -232
  36. pyopencl/compyte/ndarray/setup_opencl.py +0 -101
  37. pyopencl/compyte/ndarray/test_gpu_elemwise.py +0 -411
  38. pyopencl/compyte/ndarray/test_gpu_ndarray.py +0 -487
  39. pyopencl-2024.2.7.dist-info/RECORD +0 -56
  40. {pyopencl-2024.2.7.dist-info → pyopencl-2025.1.dist-info}/licenses/LICENSE +0 -0
pyopencl/compyte/ndarray/test_gpu_elemwise.py (deleted)
@@ -1,411 +0,0 @@
- # TODO: test other dtype
- from functools import reduce
-
- import numpy
- import pygpu_ndarray as gpu_ndarray
- import theano
-
- from .gen_elemwise import MyGpuNdArray, elemwise_collapses
- from .test_gpu_ndarray import (dtypes_all, enable_double, gen_gpu_nd_array,
-                                product)
-
-
- def rand(shape, dtype):
-     r = numpy.random.randn(*shape) * 10
-     if dtype.startswith("u"):
-         r = numpy.absolute(r)
-     return r.astype(dtype)
-
-
- # numpy.allclose seam to have problem with int8...
- def all_close(x, y):
-     return (numpy.allclose(x, y) or
-             numpy.absolute(x - y).max() == 0)
-
-
- def test_elemwise_collapse():
-     """ Test collapsing under many broadcast and strided pattern """
-
-     for dtype1 in ["int16", "float32", "int8"]:
-         for dtype2 in ["int16", "float32", "int8"]:
-
-             for shape1_, shape2_, expected in [
-                 # 1d to test this special case
-                 ((40,), (40,), 0),
-                 ((40,), (1,), 1),
-                 # No broadcastable dimensions
-                 ((4, 5, 6, 9), (4, 5, 6, 9), 0),
-                 # All inputs have one(and the same) broadcastable dimension
-                 ((1, 4, 5, 9), (1, 4, 5, 9), 0),
-                 ((4, 1, 5, 9), (4, 1, 5, 9), 0),
-                 ((4, 5, 1, 9), (4, 5, 1, 9), 0),
-                 ((4, 5, 9, 1), (4, 5, 9, 1), 0),
-                 # One inputs have one broadcastable dimension
-                 ((1, 5, 6, 9), (4, 5, 6, 9), 2),
-                 ((4, 1, 6, 9), (4, 5, 6, 9), 3),
-                 ((4, 5, 1, 9), (4, 5, 6, 9), 3),
-                 ((4, 5, 6, 1), (4, 5, 6, 9), 2),
-                 # One inputs have two broadcastable dimension
-                 ((1, 1, 6, 9), (4, 5, 6, 9), 2),
-                 ((1, 5, 1, 9), (4, 5, 6, 9), 4),
-                 ((1, 5, 6, 1), (4, 5, 6, 9), 3),
-                 ((4, 1, 1, 9), (4, 5, 6, 9), 3),
-                 ((4, 1, 6, 1), (4, 5, 6, 9), 4),
-                 ((4, 5, 1, 1), (4, 5, 6, 9), 2),
-                 # One inputs have tree broadcastable dimension
-                 ((1, 1, 1, 9), (4, 5, 6, 9), 2),
-                 ((1, 1, 6, 1), (4, 5, 6, 9), 3),
-                 ((1, 5, 1, 1), (4, 5, 6, 9), 3),
-                 ((4, 1, 1, 1), (4, 5, 6, 9), 2),
-                 # One scalar
-                 ((1, 1, 1, 1), (4, 5, 6, 9), 1),
-                 # One scalar, the other 1 broadcast dims
-                 ((1, 1, 1, 1), (4, 5, 6, 1), 1),
-             ]:
-                 scalar_cpu = rand((1,) * len(shape1_), dtype=dtype1)
-                 scalar_gpu = gpu_ndarray.GpuNdArrayObject(scalar_cpu)
-                 scalar_gpu1 = MyGpuNdArray(scalar_gpu)
-                 for shape1, shape2 in [(shape1_, shape2_), (shape2_, shape1_)]:
-                     a_cpu = rand(shape1, dtype=dtype1)
-                     a = gpu_ndarray.GpuNdArrayObject(a_cpu)
-                     a1 = MyGpuNdArray(a)
-
-                     b_cpu = rand(shape2, dtype=dtype2)
-                     b = gpu_ndarray.GpuNdArrayObject(b_cpu)
-                     b1 = MyGpuNdArray(b)
-
-                     assert len(shape1) == len(shape2)
-                     o_shape = []
-                     for i in range(len(shape1)):
-                         o_shape.append(max(shape1[i], shape2[i]))
-                     o = gpu_ndarray.empty(o_shape, dtype=(a_cpu + b_cpu).dtype)
-
-                     # 1.1 Check direct collapse
-                     nd_collaps, info = elemwise_collapses([a, b], [o])
-                     assert nd_collaps == expected, (shape1, shape2,
-                                                     nd_collaps, expected, info)
-
-                     # 1.2 Check computation are still valid
-                     f = MyGpuNdArray.gen_fct(theano.tensor.add, [a1, b1],
-                                              len(shape1))
-                     out = f([a1, b1])
-                     out2 = f([a1, b1], out=out)
-                     assert out is out2
-                     assert numpy.allclose(numpy.asarray(f([a1, b1])),
-                                           a_cpu + b_cpu)
-                     assert numpy.allclose(numpy.asarray(
-                         MyGpuNdArray.adds(a1, b1)), a_cpu + b_cpu)
-                     assert numpy.allclose(numpy.asarray(
-                         MyGpuNdArray.add(a1, b1)), a_cpu + b_cpu)
-                     assert MyGpuNdArray.add(a1, b1, out=out2) is out2
-
-                     # 1.3 Check work without collaping
-                     f = MyGpuNdArray.gen_fct(theano.tensor.add, [a1, b1],
-                                              len(shape1), collapse=False)
-                     out = f([a1, b1])
-                     out2 = f([a1, b1], out=out)
-                     assert out is out2
-                     assert numpy.allclose(numpy.asarray(f([a1, b1])),
-                                           a_cpu + b_cpu)
-                     assert numpy.allclose(numpy.asarray(MyGpuNdArray.adds(
-                         a1, b1)), a_cpu + b_cpu)
-                     assert numpy.allclose(numpy.asarray(MyGpuNdArray.add(
-                         a1, b1)), a_cpu + b_cpu)
-                     assert MyGpuNdArray.add(a1, b1, out=out2) is out2
-
-                     # 2.1 What if we add a scalar?
-                     nd_collaps, info = elemwise_collapses(
-                         [a, b, scalar_gpu], [o])
-                     if expected == 0:
-                         expected2 = 1
-                     else:
-                         expected2 = expected
-                     assert nd_collaps == expected2, (shape1, shape2,
-                                                      nd_collaps, expected,
-                                                      info)
-                     # 2.2 Check computation
-                     assert numpy.allclose(numpy.asarray(MyGpuNdArray.adds(
-                         a1, b1, scalar_gpu1)),
-                         a_cpu + b_cpu + scalar_cpu)
-
-                     # 3.1 What if one of the dimensions is strided?
-                     broadcast = any([True for i in a.shape + b.shape
-                                      if i == 1])
-                     if expected == 0:
-                         expected2 = 2
-                     else:
-                         expected2 = expected
-
-                     if len(shape1_) != 4:
-                         continue
-
-                     if a.shape[0] != 1:
-                         shape = list(shape1)
-                         shape[0] *= 2
-                         c_cpu = rand(shape, dtype='float32')
-                         c = gpu_ndarray.GpuNdArrayObject(c_cpu)[::2]
-                         c1 = MyGpuNdArray(c)
-
-                         err = ("strided", c.shape, shape2,
-                                nd_collaps, expected, info)
-                         nd_collaps, info = elemwise_collapses([c, b], [o])
-                         if broadcast:
-                             assert nd_collaps >= expected, err
-                         else:
-                             assert nd_collaps == expected2, err
-                         assert numpy.allclose(numpy.asarray(
-                             MyGpuNdArray.adds(c1, b1)),
-                             numpy.asarray(c) + b_cpu)
-
-                     if a.shape[1] != 1:
-                         shape = list(shape1)
-                         shape[1] *= 2
-                         c_cpu = rand(shape, dtype='float32')
-                         c = gpu_ndarray.GpuNdArrayObject(c_cpu)[::, ::2]
-                         c1 = MyGpuNdArray(c)
-
-                         err = ("strided", c.shape, shape2,
-                                nd_collaps, expected, info)
-                         nd_collaps, info = elemwise_collapses([c, b], [o])
-                         if broadcast:
-                             assert nd_collaps >= expected, err
-                         else:
-                             assert nd_collaps == expected2, err
-                             pass
-                         assert numpy.allclose(numpy.asarray(
-                             MyGpuNdArray.adds(c1, b1)),
-                             numpy.asarray(c) + b_cpu)
-
-                     if a.shape[2] != 1:
-                         shape = list(shape1)
-                         shape[2] *= 2
-                         c_cpu = rand(shape, dtype='float32')
-                         c = gpu_ndarray.GpuNdArrayObject(c_cpu)[::, ::, ::2]
-                         c1 = MyGpuNdArray(c)
-
-                         err = ("strided", c.shape, shape2,
-                                nd_collaps, expected, info)
-                         nd_collaps, info = elemwise_collapses([c, b], [o])
-                         if broadcast:
-                             assert nd_collaps >= expected, err
-                         else:
-                             assert nd_collaps == expected2, err
-                             pass
-                         assert numpy.allclose(numpy.asarray(
-                             MyGpuNdArray.adds(c1, b1)),
-                             numpy.asarray(c) + b_cpu)
-
-                     if a.shape[3] != 1:
-                         shape = list(shape1)
-                         shape[3] *= 2
-                         c_cpu = rand(shape, dtype='float32')
-                         c = gpu_ndarray.GpuNdArrayObject(c_cpu)[::, ::,
-                                                                 ::, ::2]
-                         c1 = MyGpuNdArray(c)
-
-                         err = ("strided", c.shape, shape2,
-                                nd_collaps, expected, info)
-                         nd_collaps, info = elemwise_collapses([c, b], [o])
-                         if broadcast:
-                             assert nd_collaps >= expected, err
-                         else:
-                             assert nd_collaps == 1, err
-                             pass
-                         assert numpy.allclose(numpy.asarray(
-                             MyGpuNdArray.adds(c1, b1)),
-                             numpy.asarray(c) + b_cpu)
-
-
- def test_elemwise_mixed_dtype():
-     to_cpu = numpy.asarray
-
-     for dtype1 in ["int16", "float32", "int8"]:
-         for dtype2 in ["int16", "float32", "int8"]:
-             dtypeo = str((numpy.zeros(1, dtype=dtype1) +
-                           numpy.zeros(1, dtype=dtype2)).dtype)
-             #print "dtypes", dtype1, dtype2, "o dtype", dtypeo
-
-             #print " Test inside a wrapping python object 2 inputs"
-             for shape in [(500,), (50, 5), (5, 6, 7)]:
-                 input_vals = [rand(shape, dtype) for dtype in [dtype1, dtype2]]
-                 del dtype
-                 gpu_vals = [gpu_ndarray.GpuNdArrayObject(i)
-                             for i in input_vals]
-                 assert all([numpy.allclose(to_cpu(ig), i)
-                             for ig, i in zip(gpu_vals, input_vals)])
-
-                 gpu_vals = [MyGpuNdArray(x) for x in gpu_vals]
-                 out = gpu_vals[0] + gpu_vals[1]
-                 assert numpy.allclose(to_cpu(out),
-                                       input_vals[0] + input_vals[1])
-                 out = gpu_vals[0] - gpu_vals[1]
-                 assert numpy.allclose(to_cpu(out),
-                                       input_vals[0] - input_vals[1])
-                 out = gpu_vals[0] * gpu_vals[1]
-                 assert all_close(to_cpu(out),
-                                  input_vals[0] * input_vals[1])
-                 if dtypeo.startswith("float"):
-                     # TODO: execute for all dtype
-                     out = gpu_vals[0] / gpu_vals[1]
-                     assert numpy.allclose(to_cpu(out),
-                                           input_vals[0] / input_vals[1])
-
-             nb_in = 4
-             #print " Test inside a wrapping python object %d inputs"%nb_in
-             for shape in [(500,), (50, 5), (5, 6, 7)]:
-                 input_vals = [rand(shape, dtype)
-                               for dtype in [dtype1, dtype2, dtype1, dtype2]]
-                 gpu_vals = [gpu_ndarray.GpuNdArrayObject(i)
-                             for i in input_vals]
-                 assert all([numpy.allclose(to_cpu(ig), i)
-                             for ig, i in zip(gpu_vals, input_vals)])
-
-                 gpu_vals = [MyGpuNdArray(x) for x in gpu_vals]
-                 out = MyGpuNdArray.adds(*gpu_vals)
-                 assert numpy.allclose(to_cpu(out),
-                                       reduce(numpy.add, input_vals))
-
-                 out = MyGpuNdArray.multiplys(*gpu_vals)
-                 assert all_close(to_cpu(out),
-                                  reduce(numpy.multiply, input_vals))
-
-             #print " Test broadcasting"
-             for shapes in [((1, 5), (4, 5)), ((33, 10), (33, 1)),
-                            ((33, 1, 5), (33, 10, 1)),
-                            ((33, 1, 5), (33, 10, 1), ((1, 10, 5))),
-                            ]:
-                 input_vals = [rand(shape, dtype) for shape, dtype
-                               in zip(shapes, [dtype1, dtype2])]
-                 gpu_vals = [gpu_ndarray.GpuNdArrayObject(i)
-                             for i in input_vals]
-                 assert all([numpy.allclose(to_cpu(ig), i)
-                             for ig, i in zip(gpu_vals, input_vals)])
-
-                 gpu_vals = [MyGpuNdArray(x) for x in gpu_vals]
-                 out = MyGpuNdArray.adds(*gpu_vals)
-                 assert numpy.allclose(to_cpu(out),
-                                       reduce(numpy.add, input_vals))
-
-                 out = MyGpuNdArray.multiplys(*gpu_vals)
-                 assert all_close(to_cpu(out),
-                                  reduce(numpy.multiply, input_vals))
-
-
- def test_sum():
-     to_cpu = numpy.asarray
-     dtypes = list(dtypes_all)
-     # I remove *int8 as currently the output have the same dtype
-     # And this cause overflow
-     dtypes.remove("int8")
-     dtypes.remove("uint8")
-     # I need to find how pycuda handle complexe in c.
-     # I probably just need to add an header.
-     dtypes.remove("complex64")
-     if enable_double:
-         dtypes.remove("complex128")
-     for shape in [
-         # need something bigger then 32, 1024 or 4096.
-         # Those are corner case.
-
-         # 1d, take only a few seconds on a GTX470
-         (0,), (5,), (31,), (32,), (33,),
-         (1023,), (1024,), (1025,),
-         (4095,), (4096,), (4097,),
-         (32 * 1024 - 1,), (32 * 1024,), (32 * 1024 + 1,),
-
-         # 2d, take 2 minutes on a GTX 470
-         (0, 0), (1, 0), (0, 1,), (5, 4),
-         (31, 31), (31, 32), (31, 33),
-         (32, 31), (32, 32), (32, 33),
-         (33, 31), (33, 32), (33, 33),
-         (1024, 32), (1025, 32),
-         (1024, 33), (1025, 33),
-         (4096, 32), (32, 4096), (4096, 33), (33, 4096),
-         (4097, 32), (32, 4097), (4097, 33), (33, 4097),
-
-         # 3d, take 2 minutes on a GTX 470
-         (0, 0, 0), (0, 1, 0), (0, 0, 1),
-         (5, 4, 3), (5, 4, 3), (5, 4, 3),
-         (4096, 2, 33), (2, 4096, 33), (33, 2, 4096),
-         (4097, 2, 33), (2, 4097, 33), (33, 2, 4097),
-         (4096, 33, 2), (33, 4096, 2), (2, 33, 4096),
-         (4097, 33, 2), (33, 4097, 2), (2, 33, 4097),
-
-         # 4d, take 1 minutes on a GTX 470
-         (0, 0, 0, 0), (1, 0, 0, 0), (0, 1, 0, 0),
-         (0, 0, 1, 0), (0, 0, 0, 1),
-         (5, 4, 3, 2),
-         (1024, 32, 2, 3), (3, 1024, 32, 2), (2, 3, 1024, 32),
-         (1024, 2, 32, 3), (3, 1024, 2, 32), (1024, 3, 2, 32),
-         (1025, 33, 2, 3), (3, 1025, 33, 2), (2, 3, 1025, 33),
-         (1025, 2, 33, 3), (3, 1025, 2, 33), (1025, 3, 2, 33),
-         (4100, 4, 3, 2), (4, 4100, 3, 2),
-         (4, 3, 4100, 2), (4, 3, 2, 4100),
-
-         # 5d, work only if c contiguous
-         (5, 4, 3, 10, 11),
-     ]:
-
-         for dtype, off_o, off_i, sliced, order in product(
-                 *([dtypes] +
-                   [[False, True]] +
-                   [[False, True]] +
-                   [[-1, 2, -2, 1]] +
-                   [['f', 'c']])):
-
-             cpu_val, gpu_val = gen_gpu_nd_array(shape, dtype, off_o,
-                                                 off_i, sliced, order)
-
-             if len(shape) > 4 and not (gpu_val.flags["C_CONTIGUOUS"] or
-                                        gpu_val.flags["F_CONTIGUOUS"]):
-                 continue
-             gpu_val = MyGpuNdArray(gpu_val)
-             cpu_sum = cpu_val.sum()
-             # print dtype, shape, off_o, off_i, sliced, order
-             # print (cpu_val.strides,
-             #        cpu_val.flags["C_CONTIGUOUS"],
-             #        cpu_val.flags["F_CONTIGUOUS"])
-             # print (gpu_val.strides,
-             #        gpu_val.flags["C_CONTIGUOUS"],
-             #        gpu_val.flags["F_CONTIGUOUS"])
-             gpu_sum = to_cpu(gpu_val.sum())
-
-             def get_rtol(orig, after_reduction):
-                 if after_reduction.size == 0:
-                     return 0
-                 if orig.size // after_reduction.size > 500000:
-                     rtols = {"float32": 4.3e-5}
-                 elif orig.size // after_reduction.size > 100000:
-                     rtols = {"float32": 3e-5}
-                 elif orig.size // after_reduction.size > 50000:
-                     rtols = {"float32": 2e-5}
-                 else:
-                     rtols = {"float32": 1e-5}
-                 if dtype in rtols:
-                     rtol = rtols[dtype]
-                 else:
-                     rtol = 1e-8
-                 return rtol
-             rtol = get_rtol(gpu_val, gpu_sum)
-             cpu_sum = cpu_sum.astype(dtype)
-             if not (dtype.endswith("int16") and numpy.prod(shape) > 20000):
-                 assert (numpy.allclose(cpu_sum, gpu_sum, rtol=rtol) or
-                         cpu_sum == gpu_sum), (
-                     dtype, shape, cpu_sum, gpu_sum,
-                     (cpu_sum - gpu_sum) / cpu_sum)
-
-             # Test pattern 10 and 01
-             # Test pattern 100, 010 and 001
-             if len(shape) in [2, 3]:
-                 for axis in range(len(shape)):
-                     gpu_sum = to_cpu(gpu_val.sum(axis=[axis]))
-                     cpu_sum = cpu_val.sum(axis=axis)
-                     rtol = get_rtol(gpu_val, gpu_sum)
-                     if cpu_sum.size > 0:
-                         argmax = numpy.absolute(cpu_sum - gpu_sum).argmax()
-                         cpu_max = cpu_sum.flatten()[argmax]
-                         gpu_max = gpu_sum.flatten()[argmax]
-                         assert numpy.allclose(cpu_sum, gpu_sum), (
-                             "axis=%d" % axis, dtype, shape, cpu_sum.shape,
-                             cpu_sum, gpu_sum,
-                             cpu_max, gpu_max, (cpu_max - gpu_max) / cpu_max)