pyopencl 2025.2.7__cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pyopencl might be problematic. Click here for more details.

Files changed (47)
  1. pyopencl/.libs/libOpenCL-83a5a7fd.so.1.0.0 +0 -0
  2. pyopencl/__init__.py +1995 -0
  3. pyopencl/_cl.cpython-314-x86_64-linux-gnu.so +0 -0
  4. pyopencl/_cl.pyi +2009 -0
  5. pyopencl/_cluda.py +57 -0
  6. pyopencl/_monkeypatch.py +1104 -0
  7. pyopencl/_mymako.py +17 -0
  8. pyopencl/algorithm.py +1454 -0
  9. pyopencl/array.py +3530 -0
  10. pyopencl/bitonic_sort.py +245 -0
  11. pyopencl/bitonic_sort_templates.py +597 -0
  12. pyopencl/cache.py +535 -0
  13. pyopencl/capture_call.py +200 -0
  14. pyopencl/characterize/__init__.py +461 -0
  15. pyopencl/characterize/performance.py +240 -0
  16. pyopencl/cl/pyopencl-airy.cl +324 -0
  17. pyopencl/cl/pyopencl-bessel-j-complex.cl +238 -0
  18. pyopencl/cl/pyopencl-bessel-j.cl +1084 -0
  19. pyopencl/cl/pyopencl-bessel-y.cl +435 -0
  20. pyopencl/cl/pyopencl-complex.h +303 -0
  21. pyopencl/cl/pyopencl-eval-tbl.cl +120 -0
  22. pyopencl/cl/pyopencl-hankel-complex.cl +444 -0
  23. pyopencl/cl/pyopencl-random123/array.h +325 -0
  24. pyopencl/cl/pyopencl-random123/openclfeatures.h +93 -0
  25. pyopencl/cl/pyopencl-random123/philox.cl +486 -0
  26. pyopencl/cl/pyopencl-random123/threefry.cl +864 -0
  27. pyopencl/clmath.py +281 -0
  28. pyopencl/clrandom.py +412 -0
  29. pyopencl/cltypes.py +217 -0
  30. pyopencl/compyte/.gitignore +21 -0
  31. pyopencl/compyte/__init__.py +0 -0
  32. pyopencl/compyte/array.py +211 -0
  33. pyopencl/compyte/dtypes.py +314 -0
  34. pyopencl/compyte/pyproject.toml +49 -0
  35. pyopencl/elementwise.py +1288 -0
  36. pyopencl/invoker.py +417 -0
  37. pyopencl/ipython_ext.py +70 -0
  38. pyopencl/py.typed +0 -0
  39. pyopencl/reduction.py +815 -0
  40. pyopencl/scan.py +1921 -0
  41. pyopencl/tools.py +1680 -0
  42. pyopencl/typing.py +61 -0
  43. pyopencl/version.py +11 -0
  44. pyopencl-2025.2.7.dist-info/METADATA +108 -0
  45. pyopencl-2025.2.7.dist-info/RECORD +47 -0
  46. pyopencl-2025.2.7.dist-info/WHEEL +6 -0
  47. pyopencl-2025.2.7.dist-info/licenses/LICENSE +104 -0
@@ -0,0 +1,597 @@
1
+ from __future__ import annotations
2
+
3
+
4
+ __copyright__ = """
5
+ Copyright (c) 2011, Eric Bainville
6
+ Copyright (c) 2015, Ilya Efimoff
7
+ All rights reserved.
8
+ """
9
+
10
+ __license__ = """
11
+ Redistribution and use in source and binary forms, with or without
12
+ modification, are permitted provided that the following conditions are met:
13
+
14
+ 1. Redistributions of source code must retain the above copyright notice, this
15
+ list of conditions and the following disclaimer.
16
+
17
+ 2. Redistributions in binary form must reproduce the above copyright notice,
18
+ this list of conditions and the following disclaimer in the documentation
19
+ and/or other materials provided with the distribution.
20
+
21
+ 3. Neither the name of the copyright holder nor the names of its contributors
22
+ may be used to endorse or promote products derived from this software without
23
+ specific prior written permission.
24
+
25
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
26
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
27
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
28
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
29
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
31
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32
+ HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33
+ LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
34
+ OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35
+ """
36
+
37
# Module-level factor, currently 1.  The note on the (disabled) C4 kernel in
# this file says it must be raised to 4 if that kernel is re-enabled, and C4's
# own comment sizes its local scratch as 4*WG.
# NOTE(review): the code that reads this constant is outside this file --
# presumably it scales the local-memory allocation; confirm against the caller.
LOCAL_MEM_FACTOR = 1
38
+
39
+
40
+ # {{{ defines
41
+
42
# Mako template for the OpenCL preamble shared by the kernel templates below.
#
# Template variables referenced: dtype, idxtype, inc, dir, argsort, NS, nsize,
# dsize.
#
# What it emits:
#   * an fp64 extension pragma when dtype == "double" (only for OpenCL C < 1.2),
#   * typedefs data_t / idx_t,
#   * getKey / getValue / makeData accessors (switchable via CONFIG_USE_VALUE),
#   * inc and its right-shifted variants hinc (>>1), qinc (>>2), einc (>>3),
#     plus dir, nsize and dsize as preprocessor constants,
#   * compare-exchange macros ORDER / ORDERV and the B2V..B16V networks built on
#     them, in a key+index flavour when `argsort` is true and key-only otherwise.
#
# The ORDER/ORDERV macros read a variable named `reverse`, which must therefore
# be in scope wherever they are expanded.
# NOTE(review): ${NS} sits at the end of each continued macro line; presumably
# it renders to a line-continuation backslash -- confirm against the caller.
defines = """//CL//

% if dtype == "double":
#if __OPENCL_C_VERSION__ < 120
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#endif
% endif

typedef ${dtype} data_t;
typedef ${idxtype} idx_t;
#if CONFIG_USE_VALUE
#define getKey(a) ((a).x)
#define getValue(a) ((a).y)
#define makeData(k,v) ((${dtype}2)((k),(v)))
#else
#define getKey(a) (a)
#define getValue(a) (0)
#define makeData(k,v) (k)
#endif

#ifndef BLOCK_FACTOR
#define BLOCK_FACTOR 1
#endif

#define inc ${inc}
#define hinc ${inc>>1} //Half inc
#define qinc ${inc>>2} //Quarter inc
#define einc ${inc>>3} //Eighth of inc
#define dir ${dir}

% if argsort:
#define ORDER(a,b,ay,by) { bool swap = reverse ^ (getKey(a)<getKey(b));${NS}
data_t auxa = a; data_t auxb = b;${NS}
idx_t auya = ay; idx_t auyb = by;${NS}
a = (swap)?auxb:auxa; b = (swap)?auxa:auxb;${NS}
ay = (swap)?auyb:auya; by = (swap)?auya:auyb;}
#define ORDERV(x,y,a,b) { bool swap = reverse ^ (getKey(x[a])<getKey(x[b]));${NS}
data_t auxa = x[a]; data_t auxb = x[b];${NS}
idx_t auya = y[a]; idx_t auyb = y[b];${NS}
x[a] = (swap)?auxb:auxa; x[b] = (swap)?auxa:auxb;${NS}
y[a] = (swap)?auyb:auya; y[b] = (swap)?auya:auyb;}
#define B2V(x,y,a) { ORDERV(x,y,a,a+1) }
#define B4V(x,y,a) { for (int i4=0;i4<2;i4++) { ORDERV(x,y,a+i4,a+i4+2) } B2V(x,y,a) B2V(x,y,a+2) }
#define B8V(x,y,a) { for (int i8=0;i8<4;i8++) { ORDERV(x,y,a+i8,a+i8+4) } B4V(x,y,a) B4V(x,y,a+4) }
#define B16V(x,y,a) { for (int i16=0;i16<8;i16++) { ORDERV(x,y,a+i16,a+i16+8) } B8V(x,y,a) B8V(x,y,a+8) }
% else:
#define ORDER(a,b) { bool swap = reverse ^ (getKey(a)<getKey(b)); data_t auxa = a; data_t auxb = b; a = (swap)?auxb:auxa; b = (swap)?auxa:auxb; }
#define ORDERV(x,a,b) { bool swap = reverse ^ (getKey(x[a])<getKey(x[b]));${NS}
data_t auxa = x[a]; data_t auxb = x[b];${NS}
x[a] = (swap)?auxb:auxa; x[b] = (swap)?auxa:auxb; }
#define B2V(x,a) { ORDERV(x,a,a+1) }
#define B4V(x,a) { for (int i4=0;i4<2;i4++) { ORDERV(x,a+i4,a+i4+2) } B2V(x,a) B2V(x,a+2) }
#define B8V(x,a) { for (int i8=0;i8<4;i8++) { ORDERV(x,a+i8,a+i8+4) } B4V(x,a) B4V(x,a+4) }
#define B16V(x,a) { for (int i16=0;i16<8;i16++) { ORDERV(x,a+i16,a+i16+8) } B8V(x,a) B8V(x,a+8) }
% endif
#define nsize ${nsize} //Total next dimensions sizes sum. (Block size)
#define dsize ${dsize} //Dimension size
"""  # noqa: E501
100
+
101
+ # }}}
102
+
103
+
104
+ # {{{ B2
105
+
106
# Mako template for a kernel named `run` that does a single compare-exchange
# per work-item (the kernel's own header says N/2 threads).
#
# Each work-item derives its element index `i` by inserting a zero bit at the
# `inc` position of its thread index, offsets the `data` pointer (and `index`
# when `argsort` is true) by i*nsize + offset, loads the two elements that are
# inc*nsize apart, applies ORDER, and stores them back.
#
# Relies on macros that are not defined in this string: inc, dir, nsize, dsize
# and ORDER (the `defines` template in this file provides macros with those
# names).
# NOTE(review): the local `gi` is computed but only referenced from a
# commented-out expression, so it is currently unused.
ParallelBitonic_B2 = """//CL//
// N/2 threads
//ParallelBitonic_B2
__kernel void run(__global data_t * data\\
% if argsort:
, __global idx_t * index)
% else:
)
% endif
{
int t = get_global_id(0) % (dsize>>1); // thread index
int gt = get_global_id(0) / (dsize>>1);
int low = t & (inc - 1); // low order bits (below INC)
int i = (t<<1) - low; // insert 0 at position INC
int gi = i/dsize; // block index
bool reverse = ((dir & i) == 0);// ^ (gi%2); // asc/desc order

int offset = (gt/nsize)*nsize*dsize+(gt%nsize);
data += i*nsize + offset; // translate to first value
% if argsort:
index += i*nsize + offset; // translate to first value
% endif

// Load data
data_t x0 = data[ 0];
data_t x1 = data[inc*nsize];
% if argsort:
// Load index
idx_t i0 = index[ 0];
idx_t i1 = index[inc*nsize];
% endif

// Sort
% if argsort:
ORDER(x0,x1,i0,i1)
% else:
ORDER(x0,x1)
% endif

// Store data
data[0 ] = x0;
data[inc*nsize] = x1;
% if argsort:
// Store index
index[ 0] = i0;
index[inc*nsize] = i1;
% endif
}
"""
155
+
156
+ # }}}
157
+
158
+
159
+ # {{{ B4
160
+
161
# Mako template for a kernel named `run` that handles four elements per
# work-item (the kernel's own header says N/4 threads).
#
# The element index `i` is the thread index with two zero bits inserted at the
# `hinc` position.  Four values spaced hinc*nsize apart are loaded into
# x0..x3 (and i0..i3 when `argsort` is true), passed through four ORDER
# compare-exchanges -- (0,2), (1,3), (0,1), (2,3) -- and stored back to the
# same locations.
#
# Relies on macros that are not defined in this string: hinc, dir, nsize,
# dsize and ORDER.
ParallelBitonic_B4 = """//CL//
// N/4 threads
//ParallelBitonic_B4
__kernel void run(__global data_t * data\\
% if argsort:
, __global idx_t * index)
% else:
)
% endif
{
int t = get_global_id(0) % (dsize>>2); // thread index
int gt = get_global_id(0) / (dsize>>2);
int low = t & (hinc - 1); // low order bits (below INC)
int i = ((t - low) << 2) + low; // insert 00 at position INC
bool reverse = ((dir & i) == 0); // asc/desc order
int offset = (gt/nsize)*nsize*dsize+(gt%nsize);
data += i*nsize + offset; // translate to first value
% if argsort:
index += i*nsize + offset; // translate to first value
% endif

// Load data
data_t x0 = data[ 0];
data_t x1 = data[ hinc*nsize];
data_t x2 = data[2*hinc*nsize];
data_t x3 = data[3*hinc*nsize];
% if argsort:
// Load index
idx_t i0 = index[ 0];
idx_t i1 = index[ hinc*nsize];
idx_t i2 = index[2*hinc*nsize];
idx_t i3 = index[3*hinc*nsize];
% endif

// Sort
% if argsort:
ORDER(x0,x2,i0,i2)
ORDER(x1,x3,i1,i3)
ORDER(x0,x1,i0,i1)
ORDER(x2,x3,i2,i3)
% else:
ORDER(x0,x2)
ORDER(x1,x3)
ORDER(x0,x1)
ORDER(x2,x3)
% endif

// Store data
data[ 0] = x0;
data[ hinc*nsize] = x1;
data[2*hinc*nsize] = x2;
data[3*hinc*nsize] = x3;
% if argsort:
// Store index
index[ 0] = i0;
index[ hinc*nsize] = i1;
index[2*hinc*nsize] = i2;
index[3*hinc*nsize] = i3;
% endif
}
"""
222
+
223
+ # }}}
224
+
225
+
226
+ # {{{ B8
227
+
228
# Mako template for a kernel named `run` that handles eight elements per
# work-item (the kernel's own header says N/8 threads).
#
# The element index `i` is the thread index with three zero bits inserted at
# the `qinc` position.  Eight values spaced qinc*nsize apart are loaded into
# the private array x[8] (and y[8] for indices when `argsort` is true), run
# through the B8V macro, and stored back to the same locations.
#
# Relies on macros that are not defined in this string: qinc, dir, nsize,
# dsize and B8V.
ParallelBitonic_B8 = """//CL//
// N/8 threads
//ParallelBitonic_B8
__kernel void run(__global data_t * data\\
% if argsort:
, __global idx_t * index)
% else:
)
% endif
{
int t = get_global_id(0) % (dsize>>3); // thread index
int gt = get_global_id(0) / (dsize>>3);
int low = t & (qinc - 1); // low order bits (below INC)
int i = ((t - low) << 3) + low; // insert 000 at position INC
bool reverse = ((dir & i) == 0); // asc/desc order
int offset = (gt/nsize)*nsize*dsize+(gt%nsize);

data += i*nsize + offset; // translate to first value
% if argsort:
index += i*nsize + offset; // translate to first value
% endif

// Load
data_t x[8];
% if argsort:
idx_t y[8];
% endif
for (int k=0;k<8;k++) x[k] = data[k*qinc*nsize];
% if argsort:
for (int k=0;k<8;k++) y[k] = index[k*qinc*nsize];
% endif

// Sort
% if argsort:
B8V(x,y,0)
% else:
B8V(x,0)
% endif

// Store
for (int k=0;k<8;k++) data[k*qinc*nsize] = x[k];
% if argsort:
for (int k=0;k<8;k++) index[k*qinc*nsize] = y[k];
% endif
}
"""
274
+
275
+ # }}}
276
+
277
+
278
+ # {{{ B16
279
+
280
# Mako template for a kernel named `run` that handles sixteen elements per
# work-item (the kernel's own header says N/16 threads).
#
# The element index `i` is the thread index with four zero bits inserted at
# the `einc` position.  Sixteen values spaced einc*nsize apart are loaded into
# the private array x[16] (and y[16] for indices when `argsort` is true), run
# through the B16V macro, and stored back to the same locations.
#
# Relies on macros that are not defined in this string: einc, dir, nsize,
# dsize and B16V.
ParallelBitonic_B16 = """//CL//
// N/16 threads
//ParallelBitonic_B16
__kernel void run(__global data_t * data\\
% if argsort:
, __global idx_t * index)
% else:
)
% endif
{
int t = get_global_id(0) % (dsize>>4); // thread index
int gt = get_global_id(0) / (dsize>>4);
int low = t & (einc - 1); // low order bits (below INC)
int i = ((t - low) << 4) + low; // insert 0000 at position INC
bool reverse = ((dir & i) == 0); // asc/desc order
int offset = (gt/nsize)*nsize*dsize+(gt%nsize);

data += i*nsize + offset; // translate to first value
% if argsort:
index += i*nsize + offset; // translate to first value
% endif

// Load
data_t x[16];
% if argsort:
idx_t y[16];
% endif
for (int k=0;k<16;k++) x[k] = data[k*einc*nsize];
% if argsort:
for (int k=0;k<16;k++) y[k] = index[k*einc*nsize];
% endif

// Sort
% if argsort:
B16V(x,y,0)
% else:
B16V(x,0)
% endif

// Store
for (int k=0;k<16;k++) data[k*einc*nsize] = x[k];
% if argsort:
for (int k=0;k<16;k++) index[k*einc*nsize] = y[k];
% endif
}
"""
326
+
327
+ # }}}
328
+
329
+
330
+ # {{{ C4
331
+
332
+ # IF YOU RE-ENABLE THIS, YOU NEED TO ADJUST LOCAL_MEM_FACTOR TO 4
333
+
334
# Mako template for a kernel named `run` that chains several 4-element passes
# through __local scratch (`aux`, plus `auy` for indices when `argsort` is
# true).
#
# Three phases, as labelled inside the kernel:
#   1. first iteration: read from global `data`/`index`, write to local,
#   2. internal iterations: local in, local out, with `linc` shifted right by
#      two each round and a barrier between the reads and the writes,
#   3. final iteration (INC=1): read local, write back to global.
#
# Local-memory slots are addressed modulo 4*get_local_size(0) via `wgBits`.
# Unlike the B2/B4/B8/B16 templates in this file, the indexing here does not
# involve nsize, dsize or a per-batch offset.
#
# Relies on macros that are not defined in this string: hinc, dir and B4V.
ParallelBitonic_C4 = """//CL//
//ParallelBitonic_C4
__kernel void run\\
% if argsort:
(__global data_t * data, __global idx_t * index, __local data_t * aux, __local idx_t * auy)
% else:
(__global data_t * data, __local data_t * aux)
% endif
{
int t = get_global_id(0); // thread index
int wgBits = 4*get_local_size(0) - 1; // bit mask to get index in local memory AUX (size is 4*WG)
int linc,low,i;
bool reverse;
data_t x[4];
% if argsort:
idx_t y[4];
% endif

// First iteration, global input, local output
linc = hinc;
low = t & (linc - 1); // low order bits (below INC)
i = ((t - low) << 2) + low; // insert 00 at position INC
reverse = ((dir & i) == 0); // asc/desc order
for (int k=0;k<4;k++) x[k] = data[i+k*linc];
% if argsort:
for (int k=0;k<4;k++) y[k] = index[i+k*linc];
B4V(x,y,0);
for (int k=0;k<4;k++) auy[(i+k*linc) & wgBits] = y[k];
% else:
B4V(x,0);
% endif
for (int k=0;k<4;k++) aux[(i+k*linc) & wgBits] = x[k];
barrier(CLK_LOCAL_MEM_FENCE);

// Internal iterations, local input and output
for ( ;linc>1;linc>>=2)
{
low = t & (linc - 1); // low order bits (below INC)
i = ((t - low) << 2) + low; // insert 00 at position INC
reverse = ((dir & i) == 0); // asc/desc order
for (int k=0;k<4;k++) x[k] = aux[(i+k*linc) & wgBits];
% if argsort:
for (int k=0;k<4;k++) y[k] = auy[(i+k*linc) & wgBits];
B4V(x,y,0);
barrier(CLK_LOCAL_MEM_FENCE);
for (int k=0;k<4;k++) auy[(i+k*linc) & wgBits] = y[k];
% else:
B4V(x,0);
barrier(CLK_LOCAL_MEM_FENCE);
% endif
for (int k=0;k<4;k++) aux[(i+k*linc) & wgBits] = x[k];
barrier(CLK_LOCAL_MEM_FENCE);
}

// Final iteration, local input, global output, INC=1
i = t << 2;
reverse = ((dir & i) == 0); // asc/desc order
for (int k=0;k<4;k++) x[k] = aux[(i+k) & wgBits];
% if argsort:
for (int k=0;k<4;k++) y[k] = auy[(i+k) & wgBits];
B4V(x,y,0);
for (int k=0;k<4;k++) index[i+k] = y[k];
% else:
B4V(x,0);
% endif
for (int k=0;k<4;k++) data[i+k] = x[k];
}
"""  # noqa: E501
402
+
403
+ # }}}
404
+
405
+
406
+ # {{{ local merge
407
+
408
# Kernel source (no Mako control lines) for a kernel named `run` that sorts
# one work-group-sized block of `in` into `out` using __local scratch `aux`.
#
# Each work-item copies in[i] into aux[i].  Then, for run lengths 1, 2, 4, ...
# below the work-group size, it searches the sibling run by halving steps
# (`pinc`) to count how many sibling elements precede its own element, derives
# the destination slot inside the merged run of length 2*length, and writes
# its element there.  Barriers separate the reads from the writes in every
# round.  Ties are broken by position (`j < i`).
#
# There is no index/argsort variant of this kernel.
# Relies on data_t and getKey, which are not defined in this string.
ParallelMerge_Local = """//CL//
// N threads, WG is workgroup size. Sort WG input blocks in each workgroup.
__kernel void run(__global const data_t * in,__global data_t * out,__local data_t * aux)
{
int i = get_local_id(0); // index in workgroup
int wg = get_local_size(0); // workgroup size = block size, power of 2

// Move IN, OUT to block start
int offset = get_group_id(0) * wg;
in += offset; out += offset;

// Load block in AUX[WG]
aux[i] = in[i];
barrier(CLK_LOCAL_MEM_FENCE); // make sure AUX is entirely up to date

// Now we will merge sub-sequences of length 1,2,...,WG/2
for (int length=1;length<wg;length<<=1)
{
data_t iData = aux[i];
data_t iKey = getKey(iData);
int ii = i & (length-1); // index in our sequence in 0..length-1
int sibling = (i - ii) ^ length; // beginning of the sibling sequence
int pos = 0;
for (int pinc=length;pinc>0;pinc>>=1) // increment for dichotomic search
{
int j = sibling+pos+pinc-1;
data_t jKey = getKey(aux[j]);
bool smaller = (jKey < iKey) || ( jKey == iKey && j < i );
pos += (smaller)?pinc:0;
pos = min(pos,length);
}
int bits = 2*length-1; // mask for destination
int dest = ((ii + pos) & bits) | (i & ~bits); // destination index in merged sequence
barrier(CLK_LOCAL_MEM_FENCE);
aux[dest] = iData;
barrier(CLK_LOCAL_MEM_FENCE);
}

// Write output
out[i] = aux[i];
}
"""  # noqa: E501
450
+
451
+ # }}}
452
+
453
+
454
+ # {{{ local bitonic
455
+
456
# Kernel source (no Mako control lines) for a kernel named `run` that sorts
# one work-group-sized block of `in` into `out` with a bitonic network held in
# __local scratch `aux`.
#
# Each work-item copies in[i] into aux[i].  For every run length 1, 2, 4, ...
# below the work-group size, and every comparison distance `pinc` from that
# length down to 1, it compares its element with its partner's, decides
# whether to take the partner's value, and writes the result back to aux[i].
# Barriers separate the reads from the writes in every step.
#
# The partner index is `i ^ pinc`: XOR keeps it inside the work-group's
# power-of-two range and makes partners mutual, which the `j < i` terms in
# `smaller` and `swap` depend on.  (`i + pinc` would run past the end of `aux`
# and leave those terms permanently false.)
#
# Keys are held as data_t, matching ParallelMerge_Local and
# ParallelBitonic_Local_Optim, so non-integer keys are not truncated.
#
# There is no index/argsort variant of this kernel.
# Relies on data_t and getKey, which are not defined in this string.
ParallelBitonic_Local = """//CL//
// N threads, WG is workgroup size. Sort WG input blocks in each workgroup.
__kernel void run(__global const data_t * in,__global data_t * out,__local data_t * aux)
{
int i = get_local_id(0); // index in workgroup
int wg = get_local_size(0); // workgroup size = block size, power of 2

// Move IN, OUT to block start
int offset = get_group_id(0) * wg;
in += offset; out += offset;

// Load block in AUX[WG]
aux[i] = in[i];
barrier(CLK_LOCAL_MEM_FENCE); // make sure AUX is entirely up to date

// Loop on sorted sequence length
for (int length=1;length<wg;length<<=1)
{
bool direction = ((i & (length<<1)) != 0); // direction of sort: 0=asc, 1=desc
// Loop on comparison distance (between keys)
for (int pinc=length;pinc>0;pinc>>=1)
{
int j = i ^ pinc; // sibling to compare
data_t iData = aux[i];
data_t iKey = getKey(iData);
data_t jData = aux[j];
data_t jKey = getKey(jData);
bool smaller = (jKey < iKey) || ( jKey == iKey && j < i );
bool swap = smaller ^ (j < i) ^ direction;
barrier(CLK_LOCAL_MEM_FENCE);
aux[i] = (swap)?jData:iData;
barrier(CLK_LOCAL_MEM_FENCE);
}
}

// Write output
out[i] = aux[i];
}
"""
495
+
496
+ # }}}
497
+
498
+
499
+ # {{{ A
500
+
501
# Kernel source (no Mako control lines) for a single global compare-exchange
# pass: one work-item per element, partner index `i ^ inc`.
#
# Each work-item reads its own element and its partner's from `in`, decides
# from the key comparison, the relative position and the `dir` bit whether it
# should end up with the partner's value, and writes the chosen value to
# out[i].
#
# The result goes to a separate `out` buffer because:
#   * `in` is declared `const`, so storing through it does not compile, and
#   * an in-place store would race with the partner work-item, which reads the
#     same slot without any synchronization.
# NOTE(review): this adds a second kernel argument; no caller is visible in
# this file -- confirm nothing enqueues this kernel with one argument.
#
# Keys are held as data_t, matching ParallelMerge_Local and
# ParallelBitonic_Local_Optim, so non-integer keys are not truncated.
#
# Unlike the other templates in this file the kernel is named
# `ParallelBitonic_A`, not `run`.
# Relies on data_t, getKey, inc and dir, which are not defined in this string.
ParallelBitonic_A = """//CL//
__kernel void ParallelBitonic_A(__global const data_t * in,__global data_t * out)
{
int i = get_global_id(0); // thread index
int j = i ^ inc; // sibling to compare

// Load values at I and J
data_t iData = in[i];
data_t iKey = getKey(iData);
data_t jData = in[j];
data_t jKey = getKey(jData);

// Compare
bool smaller = (jKey < iKey) || ( jKey == iKey && j < i );
bool swap = smaller ^ (j < i) ^ ((dir & i) != 0);

// Store
out[i] = (swap)?jData:iData;
}
"""
521
+
522
+ # }}}
523
+
524
+
525
+ # {{{ local optim
526
+
527
# Mako template for a kernel named `run` that sorts within a work-group using
# __local scratch (`aux`, plus `auy` for indices when `argsort` is true).
#
# Each work-item loads data[t*nsize] (after offsetting `data` by the per-batch
# `offset`) and keeps it in the private variable `iData`, mirroring it into
# aux[i].  Three nested loops follow: block size `pwg` (1 up to and including
# the work-group size), run length `length` (below pwg) and comparison distance
# `pinc` (from `length` down to 1).  In each innermost step the work-item reads
# its partner's value at index `ii ^ pinc` inside its block, conditionally
# replaces `iData` (and `iidx`) with it, and publishes the result to local
# memory between two barriers.  The final value is written back from `iData`
# / `iidx` to data[t*nsize] / index[t*nsize].
#
# Relies on macros that are not defined in this string: nsize, dsize and
# getKey, plus the data_t / idx_t typedefs.
ParallelBitonic_Local_Optim = """//CL//
__kernel void run\\
% if argsort:
(__global data_t * data, __global idx_t * index, __local data_t * aux, __local idx_t * auy)
% else:
(__global data_t * data, __local data_t * aux)
% endif
{
int t = get_global_id(0) % dsize; // thread index
int gt = get_global_id(0) / dsize;
int offset = (gt/nsize)*nsize*dsize+(gt%nsize);

int i = get_local_id(0); // index in workgroup
int wg = get_local_size(0); // workgroup size = block size, power of 2

// Move IN, OUT to block start
//int offset = get_group_id(0) * wg;
data += offset;
// Load block in AUX[WG]
data_t iData = data[t*nsize];
aux[i] = iData;
% if argsort:
index += offset;
// Load block in AUY[WG]
idx_t iidx = index[t*nsize];
auy[i] = iidx;
% endif
barrier(CLK_LOCAL_MEM_FENCE); // make sure AUX is entirely up to date

// Loop on sorted sequence length
for (int pwg=1;pwg<=wg;pwg<<=1){
int loffset = pwg*(i/pwg);
int ii = i%pwg;
for (int length=1;length<pwg;length<<=1){
bool direction = ii & (length<<1); // direction of sort: 0=asc, 1=desc
// Loop on comparison distance (between keys)
for (int pinc=length;pinc>0;pinc>>=1){
int j = ii ^ pinc; // sibling to compare
data_t jData = aux[loffset+j];
% if argsort:
idx_t jidx = auy[loffset+j];
% endif
data_t iKey = getKey(iData);
data_t jKey = getKey(jData);
bool smaller = (jKey < iKey) || ( jKey == iKey && j < ii );
bool swap = smaller ^ (ii>j) ^ direction;
iData = (swap)?jData:iData; // update iData
% if argsort:
iidx = (swap)?jidx:iidx; // update iidx
% endif
barrier(CLK_LOCAL_MEM_FENCE);
aux[loffset+ii] = iData;
% if argsort:
auy[loffset+ii] = iidx;
% endif
barrier(CLK_LOCAL_MEM_FENCE);
}
}
}

// Write output
data[t*nsize] = iData;
% if argsort:
index[t*nsize] = iidx;
% endif
}
"""  # noqa: E501
594
+
595
+ # }}}
596
+
597
+ # vim: filetype=pyopencl:fdm=marker