pyopencl 2024.3__cp39-cp39-musllinux_1_2_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pyopencl might be problematic. Click here for more details.

Files changed (43) hide show
  1. pyopencl/.libs/libOpenCL-1ef0e16e.so.1.0.0 +0 -0
  2. pyopencl/__init__.py +2410 -0
  3. pyopencl/_cl.cpython-39-x86_64-linux-gnu.so +0 -0
  4. pyopencl/_cluda.py +54 -0
  5. pyopencl/_mymako.py +14 -0
  6. pyopencl/algorithm.py +1449 -0
  7. pyopencl/array.py +3437 -0
  8. pyopencl/bitonic_sort.py +242 -0
  9. pyopencl/bitonic_sort_templates.py +594 -0
  10. pyopencl/cache.py +535 -0
  11. pyopencl/capture_call.py +177 -0
  12. pyopencl/characterize/__init__.py +456 -0
  13. pyopencl/characterize/performance.py +237 -0
  14. pyopencl/cl/pyopencl-airy.cl +324 -0
  15. pyopencl/cl/pyopencl-bessel-j-complex.cl +238 -0
  16. pyopencl/cl/pyopencl-bessel-j.cl +1084 -0
  17. pyopencl/cl/pyopencl-bessel-y.cl +435 -0
  18. pyopencl/cl/pyopencl-complex.h +303 -0
  19. pyopencl/cl/pyopencl-eval-tbl.cl +120 -0
  20. pyopencl/cl/pyopencl-hankel-complex.cl +444 -0
  21. pyopencl/cl/pyopencl-random123/array.h +325 -0
  22. pyopencl/cl/pyopencl-random123/openclfeatures.h +93 -0
  23. pyopencl/cl/pyopencl-random123/philox.cl +486 -0
  24. pyopencl/cl/pyopencl-random123/threefry.cl +864 -0
  25. pyopencl/clmath.py +280 -0
  26. pyopencl/clrandom.py +409 -0
  27. pyopencl/cltypes.py +137 -0
  28. pyopencl/compyte/.gitignore +21 -0
  29. pyopencl/compyte/__init__.py +0 -0
  30. pyopencl/compyte/array.py +214 -0
  31. pyopencl/compyte/dtypes.py +290 -0
  32. pyopencl/compyte/pyproject.toml +54 -0
  33. pyopencl/elementwise.py +1171 -0
  34. pyopencl/invoker.py +421 -0
  35. pyopencl/ipython_ext.py +68 -0
  36. pyopencl/reduction.py +786 -0
  37. pyopencl/scan.py +1915 -0
  38. pyopencl/tools.py +1527 -0
  39. pyopencl/version.py +9 -0
  40. pyopencl-2024.3.dist-info/METADATA +108 -0
  41. pyopencl-2024.3.dist-info/RECORD +43 -0
  42. pyopencl-2024.3.dist-info/WHEEL +5 -0
  43. pyopencl-2024.3.dist-info/licenses/LICENSE +104 -0
@@ -0,0 +1,594 @@
1
+ __copyright__ = """
2
+ Copyright (c) 2011, Eric Bainville
3
+ Copyright (c) 2015, Ilya Efimoff
4
+ All rights reserved.
5
+ """
6
+
7
+ __license__ = """
8
+ Redistribution and use in source and binary forms, with or without
9
+ modification, are permitted provided that the following conditions are met:
10
+
11
+ 1. Redistributions of source code must retain the above copyright notice, this
12
+ list of conditions and the following disclaimer.
13
+
14
+ 2. Redistributions in binary form must reproduce the above copyright notice,
15
+ this list of conditions and the following disclaimer in the documentation
16
+ and/or other materials provided with the distribution.
17
+
18
+ 3. Neither the name of the copyright holder nor the names of its contributors
19
+ may be used to endorse or promote products derived from this software without
20
+ specific prior written permission.
21
+
22
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
23
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
24
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
25
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
26
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
28
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29
+ HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30
+ LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31
+ OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32
+ """
33
+
34
# Stays at 1 for the templates as shipped.  The note in front of
# ParallelBitonic_C4 says it must be changed to 4 if that template is
# re-enabled.  How the factor is consumed is not visible in this file.
LOCAL_MEM_FACTOR = 1
35
+
36
+
37
# {{{ defines

# Preamble template for the kernel templates in this module.  The "${...}"
# placeholders and "% if" / "% else" / "% endif" lines are template syntax
# (presumably Mako -- confirm against the rendering code), so the string has
# to be rendered with dtype, idxtype, inc, dir, argsort, NS, nsize and dsize
# before it is valid OpenCL C.  After rendering it provides:
#   * the data_t / idx_t typedefs and the getKey / getValue / makeData
#     accessors (selected by CONFIG_USE_VALUE),
#   * inc plus its right-shifted variants hinc (>>1), qinc (>>2), einc (>>3),
#     and dir,
#   * the ORDER / ORDERV compare-exchange macros, which exchange the pair when
#     `reverse ^ (key(a) < key(b))` is true; `reverse` is not defined here and
#     must exist in the scope where the macro is expanded,
#   * B2V, B4V, B8V, B16V, each built from ORDERV and the next smaller macro,
#   * nsize and dsize.
# With argsort true, the macros take extra arguments and move an index value
# together with every data value.
# ${NS} ends every non-final line of the multi-line macros; presumably it
# renders to a backslash line continuation -- confirm against the caller.
defines = """//CL//

% if dtype == "double":
#if __OPENCL_C_VERSION__ < 120
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#endif
% endif

typedef ${dtype} data_t;
typedef ${idxtype} idx_t;
#if CONFIG_USE_VALUE
#define getKey(a) ((a).x)
#define getValue(a) ((a).y)
#define makeData(k,v) ((${dtype}2)((k),(v)))
#else
#define getKey(a) (a)
#define getValue(a) (0)
#define makeData(k,v) (k)
#endif

#ifndef BLOCK_FACTOR
#define BLOCK_FACTOR 1
#endif

#define inc ${inc}
#define hinc ${inc>>1} //Half inc
#define qinc ${inc>>2} //Quarter inc
#define einc ${inc>>3} //Eighth of inc
#define dir ${dir}

% if argsort:
#define ORDER(a,b,ay,by) { bool swap = reverse ^ (getKey(a)<getKey(b));${NS}
data_t auxa = a; data_t auxb = b;${NS}
idx_t auya = ay; idx_t auyb = by;${NS}
a = (swap)?auxb:auxa; b = (swap)?auxa:auxb;${NS}
ay = (swap)?auyb:auya; by = (swap)?auya:auyb;}
#define ORDERV(x,y,a,b) { bool swap = reverse ^ (getKey(x[a])<getKey(x[b]));${NS}
data_t auxa = x[a]; data_t auxb = x[b];${NS}
idx_t auya = y[a]; idx_t auyb = y[b];${NS}
x[a] = (swap)?auxb:auxa; x[b] = (swap)?auxa:auxb;${NS}
y[a] = (swap)?auyb:auya; y[b] = (swap)?auya:auyb;}
#define B2V(x,y,a) { ORDERV(x,y,a,a+1) }
#define B4V(x,y,a) { for (int i4=0;i4<2;i4++) { ORDERV(x,y,a+i4,a+i4+2) } B2V(x,y,a) B2V(x,y,a+2) }
#define B8V(x,y,a) { for (int i8=0;i8<4;i8++) { ORDERV(x,y,a+i8,a+i8+4) } B4V(x,y,a) B4V(x,y,a+4) }
#define B16V(x,y,a) { for (int i16=0;i16<8;i16++) { ORDERV(x,y,a+i16,a+i16+8) } B8V(x,y,a) B8V(x,y,a+8) }
% else:
#define ORDER(a,b) { bool swap = reverse ^ (getKey(a)<getKey(b)); data_t auxa = a; data_t auxb = b; a = (swap)?auxb:auxa; b = (swap)?auxa:auxb; }
#define ORDERV(x,a,b) { bool swap = reverse ^ (getKey(x[a])<getKey(x[b]));${NS}
data_t auxa = x[a]; data_t auxb = x[b];${NS}
x[a] = (swap)?auxb:auxa; x[b] = (swap)?auxa:auxb; }
#define B2V(x,a) { ORDERV(x,a,a+1) }
#define B4V(x,a) { for (int i4=0;i4<2;i4++) { ORDERV(x,a+i4,a+i4+2) } B2V(x,a) B2V(x,a+2) }
#define B8V(x,a) { for (int i8=0;i8<4;i8++) { ORDERV(x,a+i8,a+i8+4) } B4V(x,a) B4V(x,a+4) }
#define B16V(x,a) { for (int i16=0;i16<8;i16++) { ORDERV(x,a+i16,a+i16+8) } B8V(x,a) B8V(x,a+8) }
% endif
#define nsize ${nsize} //Total next dimensions sizes sum. (Block size)
#define dsize ${dsize} //Dimension size
""" # noqa: E501

# }}}
99
+
100
+
101
# {{{ B2

# Template for a kernel named "run" in which each work-item handles two
# elements: it loads data[0] and data[inc*nsize] (relative to its own start
# position), applies ORDER to the pair and stores both values back in place.
# When the template variable `argsort` is true, an `index` buffer is carried
# through the same exchange.
# The string is not self-contained: data_t, idx_t, inc, dir, nsize, dsize and
# ORDER are expected to come from a preamble such as `defines` -- presumably
# the two are concatenated before rendering; confirm against the caller.
# The former `int gi = i/dsize;` local was never read (its only use was in a
# commented-out fragment on the `reverse` line), so both were dropped.
ParallelBitonic_B2 = """//CL//
// N/2 threads
//ParallelBitonic_B2
__kernel void run(__global data_t * data\\
% if argsort:
, __global idx_t * index)
% else:
)
% endif
{
int t = get_global_id(0) % (dsize>>1); // thread index
int gt = get_global_id(0) / (dsize>>1);
int low = t & (inc - 1); // low order bits (below INC)
int i = (t<<1) - low; // insert 0 at position INC
bool reverse = ((dir & i) == 0); // asc/desc order

int offset = (gt/nsize)*nsize*dsize+(gt%nsize);
data += i*nsize + offset; // translate to first value
% if argsort:
index += i*nsize + offset; // translate to first value
% endif

// Load data
data_t x0 = data[ 0];
data_t x1 = data[inc*nsize];
% if argsort:
// Load index
idx_t i0 = index[ 0];
idx_t i1 = index[inc*nsize];
% endif

// Sort
% if argsort:
ORDER(x0,x1,i0,i1)
% else:
ORDER(x0,x1)
% endif

// Store data
data[0 ] = x0;
data[inc*nsize] = x1;
% if argsort:
// Store index
index[ 0] = i0;
index[inc*nsize] = i1;
% endif
}
"""

# }}}
154
+
155
+
156
# {{{ B4

# Template for a kernel named "run" in which each work-item handles four
# elements: it loads data[0], data[hinc*nsize], data[2*hinc*nsize] and
# data[3*hinc*nsize] (relative to its own start position), applies ORDER to
# the pairs (0,2), (1,3), (0,1), (2,3) and stores the four values back in
# place.  When the template variable `argsort` is true, an `index` buffer is
# carried through the same exchanges.
# The string is not self-contained: data_t, idx_t, hinc, dir, nsize, dsize and
# ORDER are expected to come from a preamble such as `defines` -- presumably
# the two are concatenated before rendering; confirm against the caller.
ParallelBitonic_B4 = """//CL//
// N/4 threads
//ParallelBitonic_B4
__kernel void run(__global data_t * data\\
% if argsort:
, __global idx_t * index)
% else:
)
% endif
{
int t = get_global_id(0) % (dsize>>2); // thread index
int gt = get_global_id(0) / (dsize>>2);
int low = t & (hinc - 1); // low order bits (below INC)
int i = ((t - low) << 2) + low; // insert 00 at position INC
bool reverse = ((dir & i) == 0); // asc/desc order
int offset = (gt/nsize)*nsize*dsize+(gt%nsize);
data += i*nsize + offset; // translate to first value
% if argsort:
index += i*nsize + offset; // translate to first value
% endif

// Load data
data_t x0 = data[ 0];
data_t x1 = data[ hinc*nsize];
data_t x2 = data[2*hinc*nsize];
data_t x3 = data[3*hinc*nsize];
% if argsort:
// Load index
idx_t i0 = index[ 0];
idx_t i1 = index[ hinc*nsize];
idx_t i2 = index[2*hinc*nsize];
idx_t i3 = index[3*hinc*nsize];
% endif

// Sort
% if argsort:
ORDER(x0,x2,i0,i2)
ORDER(x1,x3,i1,i3)
ORDER(x0,x1,i0,i1)
ORDER(x2,x3,i2,i3)
% else:
ORDER(x0,x2)
ORDER(x1,x3)
ORDER(x0,x1)
ORDER(x2,x3)
% endif

// Store data
data[ 0] = x0;
data[ hinc*nsize] = x1;
data[2*hinc*nsize] = x2;
data[3*hinc*nsize] = x3;
% if argsort:
// Store index
index[ 0] = i0;
index[ hinc*nsize] = i1;
index[2*hinc*nsize] = i2;
index[3*hinc*nsize] = i3;
% endif
}
"""

# }}}
221
+
222
+
223
# {{{ B8

# Template for a kernel named "run" in which each work-item handles eight
# elements spaced qinc*nsize apart: they are copied into an 8-element array
# declared inside the kernel, reordered with the B8V macro and written back to
# the positions they were read from.  When the template variable `argsort` is
# true, an `index` buffer is copied into a second array and reordered in
# lockstep.
# The string is not self-contained: data_t, idx_t, qinc, dir, nsize, dsize and
# B8V are expected to come from a preamble such as `defines` -- presumably the
# two are concatenated before rendering; confirm against the caller.
ParallelBitonic_B8 = """//CL//
// N/8 threads
//ParallelBitonic_B8
__kernel void run(__global data_t * data\\
% if argsort:
, __global idx_t * index)
% else:
)
% endif
{
int t = get_global_id(0) % (dsize>>3); // thread index
int gt = get_global_id(0) / (dsize>>3);
int low = t & (qinc - 1); // low order bits (below INC)
int i = ((t - low) << 3) + low; // insert 000 at position INC
bool reverse = ((dir & i) == 0); // asc/desc order
int offset = (gt/nsize)*nsize*dsize+(gt%nsize);

data += i*nsize + offset; // translate to first value
% if argsort:
index += i*nsize + offset; // translate to first value
% endif

// Load
data_t x[8];
% if argsort:
idx_t y[8];
% endif
for (int k=0;k<8;k++) x[k] = data[k*qinc*nsize];
% if argsort:
for (int k=0;k<8;k++) y[k] = index[k*qinc*nsize];
% endif

// Sort
% if argsort:
B8V(x,y,0)
% else:
B8V(x,0)
% endif

// Store
for (int k=0;k<8;k++) data[k*qinc*nsize] = x[k];
% if argsort:
for (int k=0;k<8;k++) index[k*qinc*nsize] = y[k];
% endif
}
"""

# }}}
273
+
274
+
275
# {{{ B16

# Template for a kernel named "run" in which each work-item handles sixteen
# elements spaced einc*nsize apart: they are copied into a 16-element array
# declared inside the kernel, reordered with the B16V macro and written back
# to the positions they were read from.  When the template variable `argsort`
# is true, an `index` buffer is copied into a second array and reordered in
# lockstep.
# The string is not self-contained: data_t, idx_t, einc, dir, nsize, dsize and
# B16V are expected to come from a preamble such as `defines` -- presumably
# the two are concatenated before rendering; confirm against the caller.
ParallelBitonic_B16 = """//CL//
// N/16 threads
//ParallelBitonic_B16
__kernel void run(__global data_t * data\\
% if argsort:
, __global idx_t * index)
% else:
)
% endif
{
int t = get_global_id(0) % (dsize>>4); // thread index
int gt = get_global_id(0) / (dsize>>4);
int low = t & (einc - 1); // low order bits (below INC)
int i = ((t - low) << 4) + low; // insert 0000 at position INC
bool reverse = ((dir & i) == 0); // asc/desc order
int offset = (gt/nsize)*nsize*dsize+(gt%nsize);

data += i*nsize + offset; // translate to first value
% if argsort:
index += i*nsize + offset; // translate to first value
% endif

// Load
data_t x[16];
% if argsort:
idx_t y[16];
% endif
for (int k=0;k<16;k++) x[k] = data[k*einc*nsize];
% if argsort:
for (int k=0;k<16;k++) y[k] = index[k*einc*nsize];
% endif

// Sort
% if argsort:
B16V(x,y,0)
% else:
B16V(x,0)
% endif

// Store
for (int k=0;k<16;k++) data[k*einc*nsize] = x[k];
% if argsort:
for (int k=0;k<16;k++) index[k*einc*nsize] = y[k];
% endif
}
"""

# }}}
325
+
326
+
327
# {{{ C4

# IF YOU RE-ENABLE THIS, YOU NEED TO ADJUST LOCAL_MEM_FACTOR TO 4

# Template for a kernel named "run" that handles four elements per work-item
# and chains several B4V rounds through __local buffers:
#   1. first round: read from `data` (and `index` when `argsort` is true),
#      apply B4V, write to aux (and auy);
#   2. loop: read from aux/auy, apply B4V, write back, shifting linc right by
#      two bits per pass while linc > 1;
#   3. last round (i = t << 2): read from aux/auy, apply B4V, write to
#      `data` / `index`.
# Local buffers are addressed through the mask wgBits =
# 4*get_local_size(0) - 1, i.e. they hold four elements per work-item;
# presumably that is the reason for the LOCAL_MEM_FACTOR note above.
# Unlike the B2..B16 templates, this kernel indexes with the raw global id and
# does not use nsize / dsize / offset.
# data_t, idx_t, hinc, dir and B4V are expected to come from a preamble such
# as `defines` -- confirm against the caller.
ParallelBitonic_C4 = """//CL//
//ParallelBitonic_C4
__kernel void run\\
% if argsort:
(__global data_t * data, __global idx_t * index, __local data_t * aux, __local idx_t * auy)
% else:
(__global data_t * data, __local data_t * aux)
% endif
{
int t = get_global_id(0); // thread index
int wgBits = 4*get_local_size(0) - 1; // bit mask to get index in local memory AUX (size is 4*WG)
int linc,low,i;
bool reverse;
data_t x[4];
% if argsort:
idx_t y[4];
% endif

// First iteration, global input, local output
linc = hinc;
low = t & (linc - 1); // low order bits (below INC)
i = ((t - low) << 2) + low; // insert 00 at position INC
reverse = ((dir & i) == 0); // asc/desc order
for (int k=0;k<4;k++) x[k] = data[i+k*linc];
% if argsort:
for (int k=0;k<4;k++) y[k] = index[i+k*linc];
B4V(x,y,0);
for (int k=0;k<4;k++) auy[(i+k*linc) & wgBits] = y[k];
% else:
B4V(x,0);
% endif
for (int k=0;k<4;k++) aux[(i+k*linc) & wgBits] = x[k];
barrier(CLK_LOCAL_MEM_FENCE);

// Internal iterations, local input and output
for ( ;linc>1;linc>>=2)
{
low = t & (linc - 1); // low order bits (below INC)
i = ((t - low) << 2) + low; // insert 00 at position INC
reverse = ((dir & i) == 0); // asc/desc order
for (int k=0;k<4;k++) x[k] = aux[(i+k*linc) & wgBits];
% if argsort:
for (int k=0;k<4;k++) y[k] = auy[(i+k*linc) & wgBits];
B4V(x,y,0);
barrier(CLK_LOCAL_MEM_FENCE);
for (int k=0;k<4;k++) auy[(i+k*linc) & wgBits] = y[k];
% else:
B4V(x,0);
barrier(CLK_LOCAL_MEM_FENCE);
% endif
for (int k=0;k<4;k++) aux[(i+k*linc) & wgBits] = x[k];
barrier(CLK_LOCAL_MEM_FENCE);
}

// Final iteration, local input, global output, INC=1
i = t << 2;
reverse = ((dir & i) == 0); // asc/desc order
for (int k=0;k<4;k++) x[k] = aux[(i+k) & wgBits];
% if argsort:
for (int k=0;k<4;k++) y[k] = auy[(i+k) & wgBits];
B4V(x,y,0);
for (int k=0;k<4;k++) index[i+k] = y[k];
% else:
B4V(x,0);
% endif
for (int k=0;k<4;k++) data[i+k] = x[k];
}
""" # noqa: E501

# }}}
401
+
402
+
403
# {{{ local merge

# Template for a kernel named "run" that sorts one work-group-sized block per
# work-group by repeated merging in __local memory.  Each work-item copies
# in[i] to aux[i]; then, for run lengths 1, 2, 4, ... below the work-group
# size, it counts how many elements of the sibling run precede its own element
# (a search with halving steps; equal keys are ordered by position via
# `j < i`) and writes its element to the resulting slot of the merged run.
# The final aux[i] is written to out[i].
# Unlike the ParallelBitonic_B* templates there is no `argsort` branch here.
# data_t and getKey are not defined in this string; presumably they come from
# a preamble such as `defines` -- confirm against the caller.
ParallelMerge_Local = """//CL//
// N threads, WG is workgroup size. Sort WG input blocks in each workgroup.
__kernel void run(__global const data_t * in,__global data_t * out,__local data_t * aux)
{
int i = get_local_id(0); // index in workgroup
int wg = get_local_size(0); // workgroup size = block size, power of 2

// Move IN, OUT to block start
int offset = get_group_id(0) * wg;
in += offset; out += offset;

// Load block in AUX[WG]
aux[i] = in[i];
barrier(CLK_LOCAL_MEM_FENCE); // make sure AUX is entirely up to date

// Now we will merge sub-sequences of length 1,2,...,WG/2
for (int length=1;length<wg;length<<=1)
{
data_t iData = aux[i];
data_t iKey = getKey(iData);
int ii = i & (length-1); // index in our sequence in 0..length-1
int sibling = (i - ii) ^ length; // beginning of the sibling sequence
int pos = 0;
for (int pinc=length;pinc>0;pinc>>=1) // increment for dichotomic search
{
int j = sibling+pos+pinc-1;
data_t jKey = getKey(aux[j]);
bool smaller = (jKey < iKey) || ( jKey == iKey && j < i );
pos += (smaller)?pinc:0;
pos = min(pos,length);
}
int bits = 2*length-1; // mask for destination
int dest = ((ii + pos) & bits) | (i & ~bits); // destination index in merged sequence
barrier(CLK_LOCAL_MEM_FENCE);
aux[dest] = iData;
barrier(CLK_LOCAL_MEM_FENCE);
}

// Write output
out[i] = aux[i];
}
""" # noqa: E501

# }}}
449
+
450
+
451
# {{{ local bitonic

# Template for a kernel named "run" that bitonic-sorts one work-group-sized
# block per work-group in __local memory.  Each work-item copies in[i] to
# aux[i]; for every sorted-run length (1, 2, 4, ... below the work-group size)
# and every comparison distance pinc (length, length/2, ..., 1) it compares
# its element with the partner at index i ^ pinc and keeps either its own or
# the partner's value.  The final aux[i] is written to out[i].
#
# Two defects of the previous text are fixed here:
#   * the partner was computed as `i + pinc`, which indexes past the end of
#     aux for the highest work-items and never pairs two work-items with each
#     other; XOR (as ParallelBitonic_Local_Optim and ParallelBitonic_A use)
#     yields a symmetric partner that stays inside the block;
#   * keys were stored in `uint`, which converts (and may truncate) keys of
#     any other data_t; they are now data_t, as in ParallelMerge_Local and
#     ParallelBitonic_Local_Optim.
#
# There is no `argsort` branch here.  data_t and getKey are not defined in
# this string; presumably they come from a preamble such as `defines` --
# confirm against the caller.
ParallelBitonic_Local = """//CL//
// N threads, WG is workgroup size. Sort WG input blocks in each workgroup.
__kernel void run(__global const data_t * in,__global data_t * out,__local data_t * aux)
{
int i = get_local_id(0); // index in workgroup
int wg = get_local_size(0); // workgroup size = block size, power of 2

// Move IN, OUT to block start
int offset = get_group_id(0) * wg;
in += offset; out += offset;

// Load block in AUX[WG]
aux[i] = in[i];
barrier(CLK_LOCAL_MEM_FENCE); // make sure AUX is entirely up to date

// Loop on sorted sequence length
for (int length=1;length<wg;length<<=1)
{
bool direction = ((i & (length<<1)) != 0); // direction of sort: 0=asc, 1=desc
// Loop on comparison distance (between keys)
for (int pinc=length;pinc>0;pinc>>=1)
{
int j = i ^ pinc; // sibling to compare
data_t iData = aux[i];
data_t iKey = getKey(iData);
data_t jData = aux[j];
data_t jKey = getKey(jData);
bool smaller = (jKey < iKey) || ( jKey == iKey && j < i );
bool swap = smaller ^ (j < i) ^ direction;
barrier(CLK_LOCAL_MEM_FENCE);
aux[i] = (swap)?jData:iData;
barrier(CLK_LOCAL_MEM_FENCE);
}
}

// Write output
out[i] = aux[i];
}
"""

# }}}
494
+
495
+
496
# {{{ A

# Template for a kernel named "ParallelBitonic_A" that performs a single
# compare-exchange step: work-item i compares in[i] with its partner
# in[i ^ inc] and writes whichever of the two belongs at position i to out[i].
#
# Two defects of the previous text are fixed here:
#   * the result was stored through `in`, which is declared
#     `__global const`, so the kernel could not compile; writing in place
#     would also let one work-item overwrite a value its partner has not read
#     yet.  A separate `out` buffer is therefore taken as second argument.
#     NOTE(review): this changes the kernel's argument list; no caller is
#     visible in this file -- confirm none exists elsewhere.
#   * keys were stored in `uint`, which converts (and may truncate) keys of
#     any other data_t; they are now data_t.
#
# data_t, getKey, inc and dir are not defined in this string; presumably they
# come from a preamble such as `defines` -- confirm against the caller.
ParallelBitonic_A = """//CL//
__kernel void ParallelBitonic_A(__global const data_t * in,__global data_t * out)
{
int i = get_global_id(0); // thread index
int j = i ^ inc; // sibling to compare

// Load values at I and J
data_t iData = in[i];
data_t iKey = getKey(iData);
data_t jData = in[j];
data_t jKey = getKey(jData);

// Compare
bool smaller = (jKey < iKey) || ( jKey == iKey && j < i );
bool swap = smaller ^ (j < i) ^ ((dir & i) != 0);

// Store
out[i] = (swap)?jData:iData;
}
"""

# }}}
520
+
521
+
522
# {{{ local optim

# Template for a kernel named "run" that bitonic-sorts through __local memory.
# Each work-item advances `data` by `offset`, loads data[t*nsize] (with
# t = global id % dsize) into aux[get_local_id(0)] and keeps its current
# element in iData.  For sub-block sizes pwg = 1, 2, 4, ... up to the
# work-group size, it repeatedly compares iData with the partner at index
# ii ^ pinc inside its pwg-sized sub-block and keeps either value, publishing
# the result to aux between two barriers.  The final iData is written back to
# data[t*nsize].
# When the template variable `argsort` is true, index / auy / iidx mirror
# data / aux / iData.
# data_t, idx_t, getKey, nsize and dsize are not defined in this string;
# presumably they come from a preamble such as `defines` -- confirm against
# the caller.
ParallelBitonic_Local_Optim = """//CL//
__kernel void run\\
% if argsort:
(__global data_t * data, __global idx_t * index, __local data_t * aux, __local idx_t * auy)
% else:
(__global data_t * data, __local data_t * aux)
% endif
{
int t = get_global_id(0) % dsize; // thread index
int gt = get_global_id(0) / dsize;
int offset = (gt/nsize)*nsize*dsize+(gt%nsize);

int i = get_local_id(0); // index in workgroup
int wg = get_local_size(0); // workgroup size = block size, power of 2

// Move IN, OUT to block start
//int offset = get_group_id(0) * wg;
data += offset;
// Load block in AUX[WG]
data_t iData = data[t*nsize];
aux[i] = iData;
% if argsort:
index += offset;
// Load block in AUY[WG]
idx_t iidx = index[t*nsize];
auy[i] = iidx;
% endif
barrier(CLK_LOCAL_MEM_FENCE); // make sure AUX is entirely up to date

// Loop on sorted sequence length
for (int pwg=1;pwg<=wg;pwg<<=1){
int loffset = pwg*(i/pwg);
int ii = i%pwg;
for (int length=1;length<pwg;length<<=1){
bool direction = ii & (length<<1); // direction of sort: 0=asc, 1=desc
// Loop on comparison distance (between keys)
for (int pinc=length;pinc>0;pinc>>=1){
int j = ii ^ pinc; // sibling to compare
data_t jData = aux[loffset+j];
% if argsort:
idx_t jidx = auy[loffset+j];
% endif
data_t iKey = getKey(iData);
data_t jKey = getKey(jData);
bool smaller = (jKey < iKey) || ( jKey == iKey && j < ii );
bool swap = smaller ^ (ii>j) ^ direction;
iData = (swap)?jData:iData; // update iData
% if argsort:
iidx = (swap)?jidx:iidx; // update iidx
% endif
barrier(CLK_LOCAL_MEM_FENCE);
aux[loffset+ii] = iData;
% if argsort:
auy[loffset+ii] = iidx;
% endif
barrier(CLK_LOCAL_MEM_FENCE);
}
}
}

// Write output
data[t*nsize] = iData;
% if argsort:
index[t*nsize] = iidx;
% endif
}
""" # noqa: E501

# }}}
593
+
594
+ # vim: filetype=pyopencl:fdm=marker