pyopencl 2024.1__cp311-cp311-win_amd64.whl → 2024.2__cp311-cp311-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pyopencl might be problematic. Click here for more details.

Files changed (107) hide show
  1. pyopencl/__init__.py +82 -80
  2. pyopencl/_cl.cp311-win_amd64.pyd +0 -0
  3. pyopencl/algorithm.py +8 -10
  4. pyopencl/array.py +16 -12
  5. pyopencl/bitonic_sort.py +5 -4
  6. pyopencl/cache.py +22 -22
  7. pyopencl/capture_call.py +4 -3
  8. pyopencl/characterize/__init__.py +4 -2
  9. pyopencl/characterize/performance.py +2 -1
  10. pyopencl/clmath.py +2 -1
  11. pyopencl/clrandom.py +5 -369
  12. pyopencl/cltypes.py +4 -1
  13. pyopencl/compyte/dtypes.py +1 -1
  14. pyopencl/compyte/ndarray/gen_elemwise.py +6 -5
  15. pyopencl/compyte/ndarray/gen_reduction.py +6 -6
  16. pyopencl/compyte/ndarray/setup_opencl.py +3 -2
  17. pyopencl/compyte/ndarray/test_gpu_elemwise.py +5 -4
  18. pyopencl/compyte/ndarray/test_gpu_ndarray.py +0 -1
  19. pyopencl/elementwise.py +4 -6
  20. pyopencl/invoker.py +15 -9
  21. pyopencl/ipython_ext.py +1 -1
  22. pyopencl/reduction.py +5 -5
  23. pyopencl/scan.py +17 -21
  24. pyopencl/tools.py +13 -16
  25. pyopencl/version.py +1 -1
  26. pyopencl-2024.2.data/data/CITATION.cff +74 -0
  27. {pyopencl-2024.1.dist-info → pyopencl-2024.2.data/data}/LICENSE +0 -23
  28. pyopencl-2024.2.data/data/Makefile.in +21 -0
  29. pyopencl-2024.2.data/data/README.rst +70 -0
  30. pyopencl-2024.2.data/data/README_SETUP.txt +34 -0
  31. pyopencl-2024.2.data/data/aksetup_helper.py +1013 -0
  32. pyopencl-2024.2.data/data/configure.py +6 -0
  33. pyopencl-2024.2.data/data/contrib/cldis.py +91 -0
  34. pyopencl-2024.2.data/data/contrib/fortran-to-opencl/README +29 -0
  35. pyopencl-2024.2.data/data/contrib/fortran-to-opencl/translate.py +1441 -0
  36. pyopencl-2024.2.data/data/contrib/pyopencl.vim +84 -0
  37. pyopencl-2024.2.data/data/doc/Makefile +23 -0
  38. pyopencl-2024.2.data/data/doc/algorithm.rst +214 -0
  39. pyopencl-2024.2.data/data/doc/array.rst +305 -0
  40. pyopencl-2024.2.data/data/doc/conf.py +26 -0
  41. pyopencl-2024.2.data/data/doc/howto.rst +105 -0
  42. pyopencl-2024.2.data/data/doc/index.rst +137 -0
  43. pyopencl-2024.2.data/data/doc/make_constants.py +561 -0
  44. pyopencl-2024.2.data/data/doc/misc.rst +885 -0
  45. pyopencl-2024.2.data/data/doc/runtime.rst +51 -0
  46. pyopencl-2024.2.data/data/doc/runtime_const.rst +30 -0
  47. pyopencl-2024.2.data/data/doc/runtime_gl.rst +78 -0
  48. pyopencl-2024.2.data/data/doc/runtime_memory.rst +527 -0
  49. pyopencl-2024.2.data/data/doc/runtime_platform.rst +184 -0
  50. pyopencl-2024.2.data/data/doc/runtime_program.rst +364 -0
  51. pyopencl-2024.2.data/data/doc/runtime_queue.rst +182 -0
  52. pyopencl-2024.2.data/data/doc/subst.rst +36 -0
  53. pyopencl-2024.2.data/data/doc/tools.rst +4 -0
  54. pyopencl-2024.2.data/data/doc/types.rst +42 -0
  55. pyopencl-2024.2.data/data/examples/black-hole-accretion.py +2227 -0
  56. pyopencl-2024.2.data/data/examples/demo-struct-reduce.py +75 -0
  57. pyopencl-2024.2.data/data/examples/demo.py +39 -0
  58. pyopencl-2024.2.data/data/examples/demo_array.py +32 -0
  59. pyopencl-2024.2.data/data/examples/demo_array_svm.py +37 -0
  60. pyopencl-2024.2.data/data/examples/demo_elementwise.py +34 -0
  61. pyopencl-2024.2.data/data/examples/demo_elementwise_complex.py +53 -0
  62. pyopencl-2024.2.data/data/examples/demo_mandelbrot.py +183 -0
  63. pyopencl-2024.2.data/data/examples/demo_meta_codepy.py +56 -0
  64. pyopencl-2024.2.data/data/examples/demo_meta_template.py +55 -0
  65. pyopencl-2024.2.data/data/examples/dump-performance.py +38 -0
  66. pyopencl-2024.2.data/data/examples/dump-properties.py +86 -0
  67. pyopencl-2024.2.data/data/examples/gl_interop_demo.py +84 -0
  68. pyopencl-2024.2.data/data/examples/gl_particle_animation.py +218 -0
  69. pyopencl-2024.2.data/data/examples/ipython-demo.ipynb +203 -0
  70. pyopencl-2024.2.data/data/examples/median-filter.py +99 -0
  71. pyopencl-2024.2.data/data/examples/n-body.py +1070 -0
  72. pyopencl-2024.2.data/data/examples/narray.py +37 -0
  73. pyopencl-2024.2.data/data/examples/noisyImage.jpg +0 -0
  74. pyopencl-2024.2.data/data/examples/pi-monte-carlo.py +1166 -0
  75. pyopencl-2024.2.data/data/examples/svm.py +82 -0
  76. pyopencl-2024.2.data/data/examples/transpose.py +229 -0
  77. pyopencl-2024.2.data/data/pytest.ini +3 -0
  78. pyopencl-2024.2.data/data/src/bitlog.cpp +51 -0
  79. pyopencl-2024.2.data/data/src/bitlog.hpp +83 -0
  80. pyopencl-2024.2.data/data/src/clinfo_ext.h +134 -0
  81. pyopencl-2024.2.data/data/src/mempool.hpp +444 -0
  82. pyopencl-2024.2.data/data/src/pyopencl_ext.h +77 -0
  83. pyopencl-2024.2.data/data/src/tools.hpp +90 -0
  84. pyopencl-2024.2.data/data/src/wrap_cl.cpp +61 -0
  85. pyopencl-2024.2.data/data/src/wrap_cl.hpp +5853 -0
  86. pyopencl-2024.2.data/data/src/wrap_cl_part_1.cpp +369 -0
  87. pyopencl-2024.2.data/data/src/wrap_cl_part_2.cpp +702 -0
  88. pyopencl-2024.2.data/data/src/wrap_constants.cpp +1274 -0
  89. pyopencl-2024.2.data/data/src/wrap_helpers.hpp +213 -0
  90. pyopencl-2024.2.data/data/src/wrap_mempool.cpp +731 -0
  91. pyopencl-2024.2.data/data/test/add-vectors-32.spv +0 -0
  92. pyopencl-2024.2.data/data/test/add-vectors-64.spv +0 -0
  93. pyopencl-2024.2.data/data/test/empty-header.h +1 -0
  94. pyopencl-2024.2.data/data/test/test_algorithm.py +1180 -0
  95. pyopencl-2024.2.data/data/test/test_array.py +2392 -0
  96. pyopencl-2024.2.data/data/test/test_arrays_in_structs.py +100 -0
  97. pyopencl-2024.2.data/data/test/test_clmath.py +529 -0
  98. pyopencl-2024.2.data/data/test/test_clrandom.py +75 -0
  99. pyopencl-2024.2.data/data/test/test_enqueue_copy.py +271 -0
  100. pyopencl-2024.2.data/data/test/test_wrapper.py +1554 -0
  101. pyopencl-2024.2.dist-info/LICENSE +282 -0
  102. {pyopencl-2024.1.dist-info → pyopencl-2024.2.dist-info}/METADATA +12 -12
  103. pyopencl-2024.2.dist-info/RECORD +122 -0
  104. {pyopencl-2024.1.dist-info → pyopencl-2024.2.dist-info}/WHEEL +1 -1
  105. pyopencl/cl/pyopencl-ranluxcl.cl +0 -957
  106. pyopencl-2024.1.dist-info/RECORD +0 -48
  107. {pyopencl-2024.1.dist-info → pyopencl-2024.2.dist-info}/top_level.txt +0 -0
@@ -1,957 +0,0 @@
1
- /* RanluxCL is deprecated in PyOpenCL and will be removed in the 2018.x
2
- * versions of the package. */
3
-
4
- #ifndef RANLUXCL_CL
5
- #define RANLUXCL_CL
6
-
7
- /**** RANLUXCL v1.3.1 MODIFIED *************************************************
8
-
9
- Implements the RANLUX generator of Martin Luscher, based on the Fortran 77
10
- implementation by Fred James. This OpenCL code is a complete implementation
11
- which should perfectly replicate the numbers generated by the original Fortran
12
- 77 implementation (if using the legacy initialization routine).
13
-
14
- ***** QUICK USAGE DESCRIPTION **************************************************
15
-
16
- 1. Create an OpenCL buffer with room for at least 28 32-bit variables (112 byte)
17
- per work-item. I.e., in C/C++: size_t buffSize = numWorkitems * 112;
18
-
19
- 2. Pass the buffer and an unsigned integer seed <ins> to a kernel that launches
20
- the ranluxcl_initialization function. The seed <ins> can be any unsigned 32-bit
21
- integer, and must be different on different OpenCL devices/NDRanges to ensure
22
- different sequences. As long as the number of work-items on each device/NDRange
23
- is less than 2^32 = 4294967296 all sequences will be different.
24
- An example initialization kernel would be:
25
- #include "ranluxcl.cl"
26
- kernel void Kernel_Ranluxcl_Init(private uint ins,
27
- global ranluxcl_state_t *ranluxcltab)
28
- {
29
- ranluxcl_initialization(ins, ranluxcltab);
30
- }
31
-
32
- 3. Now the generator is ready for use. Remember to download the seeds first,
33
- and upload them again when done. Example kernel that downloads seeds, generates
34
- a float4 where each component is uniformly distributed between 0 and 1, end
35
- points not included, then uploads the seeds again:
36
- #include "ranluxcl.cl"
37
- kernel void Kernel_Example(global ranluxcl_state_t *ranluxcltab)
38
- {
39
- //ranluxclstate stores the state of the generator.
40
- ranluxcl_state_t ranluxclstate;
41
-
42
- //Download state into ranluxclstate struct.
43
- ranluxcl_download_seed(&ranluxclstate, ranluxcltab);
44
-
45
- //Generate a float4 with each component on (0,1),
46
- //end points not included. We can call ranluxcl as many
47
- //times as we like until we upload the state again.
48
- float4 randomnr = ranluxcl32(&ranluxclstate);
49
-
50
- //Upload state again so that we don't get the same
51
- //numbers over again the next time we use ranluxcl.
52
- ranluxcl_upload_seed(&ranluxclstate, ranluxcltab);
53
- }
54
-
55
- ***** MACROS *******************************************************************
56
-
57
- The following macros can optionally be defined:
58
-
59
- RANLUXCL_LUX:
60
- Sets the luxury level of the generator. Should be 0-4, or if it is 24 or larger
61
- it sets the p-value of the generator (generally not needed). If this macro is
62
- not set then lux=4 is the default (highest quality). For many applications the
63
- high quality of lux=4 may not be needed. Indeed if two values (each value
64
- having 24 random bits) are glued together to form a 48-bit value the generator
65
- passes all tests in the TestU01 suite already with lux=2. See
66
- "TestU01: A C Library for Empirical Testing of Random Number Generators" by
67
- PIERRE L'ECUYER and RICHARD SIMARD. SWB(224, 10, 24)[24, l] is RANLUX with
68
- two values glued together to create 48-bit numbers, and we see that it passes
69
- all tests already at luxury value 2.
70
-
71
- RANLUXCL_NO_WARMUP:
72
- Turns off the warmup functionality in ranluxcl_initialization. This macro
73
- should generally not be used, since the generators will initially be correlated
74
- if it is defined. The only advantage is that the numbers generated will exactly
75
- correspond to those of the original Fortran 77 implementation.
76
-
77
- RANLUXCL_SUPPORT_DOUBLE:
78
- Enables double precision functions. Please enable the OpenCL double precision
79
- extension yourself, usually by "#pragma OPENCL EXTENSION cl_khr_fp64 : enable".
80
-
81
- RANLUXCL_USE_LEGACY_INITIALIZATION
82
- Uses exactly the same initialization routine as in the original Fortran 77 code,
83
- leading to the same sequences. If using legacy initialization there are some
84
- restrictions on what the seed <ins> can be, and it may also be necessary to
85
- define RANLUXCL_MAXWORKITEMS if several sequences are to be run in parallel.
86
-
87
- RANLUXCL_MAXWORKITEMS:
88
- When RANLUXCL_USE_LEGACY_INITIALIZATION is defined we may need this macro.
89
- If several OpenCL NDRanges will be running in parallel and the parallel
90
- sequences should be different then this macro should have a value equal to or
91
- larger than the
92
- largest number of work-items in any of the parallel runs. The default is to
93
- use the current global size, so if all NDRanges are of the same size this need
94
- not be defined.
95
- Each parallel instance must also have different seeds <ins>. For example if
96
- we are launching 5120 work-items on GPU1 and 10240 work-items on GPU2 we would
97
- use different seeds for the two generators, and RANLUXCL_MAXWORKITEMS must be
98
- defined to be at least 10240. If GPU1 and GPU2 had the same number of work-items
99
- this would not be necessary.
100
- An underestimate of the highest permissible seed <ins> is given by the
101
- smallest of:
102
- (<maxins> = 10^9 / <numWorkitems>) or (<maxins> = 10^9 / RANLUXCL_MAXWORKITEMS).
103
- Please make sure that <ins> is never higher than this since it could cause
104
- undetected problems. For example with 10240 work-items the highest permissible
105
- <ins> is about 100 000.
106
- Again note that this is only relevant when using the legacy initialization
107
- function enabled by RANLUXCL_USE_LEGACY_INITIALIZATION. When not using the
108
- legacy initialization this macro is effectively set to a very high value of
109
- 2^32-1.
110
-
111
- ***** FUNCTIONS: INITIALIZATION ************************************************
112
-
113
- The initialization function is defined as:
114
- void ranluxcl_initialization(uint ins, global ranluxcl_state_t *ranluxcltab)
115
- Run once at the very beginning. ranluxcltab should be a buffer with space for
116
- 112 byte per work-item in the NDRange. <ins> is the seed to the generator.
117
- For a given <ins> each work-item in the NDRange will generate a different
118
- sequence. If more than one NDRange is used in parallel then <ins> must be
119
- different for each NDRange to avoid identical sequences.
120
-
121
- ***** FUNCTIONS: SEED UPLOAD/DOWNLOAD ******************************************
122
-
123
- The following two functions should be launched at the beginning and end of a
124
- kernel that uses ranluxcl to generate numbers, respectively:
125
-
126
- void ranluxcl_download_seed(ranluxcl_state_t *rst,
127
- global ranluxcl_state_t *ranluxcltab)
128
- Run at the beginning of a kernel to download ranluxcl state data
129
-
130
- void ranluxcl_upload_seed(ranluxcl_state_t *rst,
131
- global ranluxcl_state_t *ranluxcltab)
132
- Run at the end of a kernel to upload state data
133
-
134
- ***** FUNCTIONS: GENERATION AND SYNCHRONIZATION ********************************
135
-
136
- float4 ranluxcl32(ranluxcl_state_t *rst)
137
- Run to generate a pseudo-random float4 where each component is a number between
138
- 0 and 1, end points not included (meaning the number will never be exactly 0 or
139
- 1).
140
-
141
- double4 ranluxcl64(ranluxcl_state_t *rst)
142
- Double precision version of the above function. The preprocessor macro
143
- RANLUXCL_SUPPORT_DOUBLE must be defined for this function to be available.
144
- This function "glues" together two single-precision numbers to make one double
145
- precision number. Most of the work is still done in single precision, so the
146
- performance will be roughly halved regardless of the double precision
147
- performance of the hardware.
148
-
149
- float4 ranluxcl32norm(ranluxcl_state_t *rst)
150
- Run to generate a pseudo-random float4 where each component is normally
151
- distributed with mean 0 and standard deviation 1.
152
-
153
- double4 ranluxcl64norm(ranluxcl_state_t *rst)
154
- Double precision version of the above function. The preprocessor macro
155
- RANLUXCL_SUPPORT_DOUBLE must be defined for this function to be available.
156
-
157
- void ranluxcl_synchronize(ranluxcl_state_t *rst)
158
- Run to synchronize execution in case different work-items have made a different
159
- number of calls to ranluxcl. On SIMD machines this could lead to inefficient
160
- execution. ranluxcl_synchronize allows us to make sure all generators are
161
- SIMD-friendly again. Not needed if all work-items always call ranluxcl the same
162
- number of times.
163
-
164
- ***** PERFORMANCE **************************************************************
165
-
166
- For luxury setting 4, performance on AMD Cypress should be ~4.5*10^9 pseudo-
167
- random values per second, when not downloading values to host memory (i.e. the
168
- values are just generated, but not used for anything in particular).
169
-
170
- ***** DESCRIPTION OF THE IMPLEMENTATION ****************************************
171
-
172
- This code closely follows the original Fortran 77 code (see credit section).
173
- Here the differences (and similarities) between RANLUXCL (this implementation)
174
- and the original RANLUX are discussed.
175
-
176
- The Fortran 77 implementation uses a simple LCG to initialize the generator, and
177
- so the same approach is taken here. If RANLUXCL is initialized with <ins> = 0 as
178
- seed, the first work-item behaves like the original RANLUX with seed equal 1,
179
- the second work-item as if with seed equal 2 and so on. If <ins> = 1 then the
180
- first work-item behaves like the original RANLUX with seed equal to
181
- <numWorkitems> + 1, and so on for higher <ins> so that we never have overlapping
182
- sequences. This is why the RANLUXCL_MAXWORKITEMS macro must be set if we have
183
- different NDRanges with a different number of work-items.
184
-
185
- RANLUX is based on chaos theory, and what we are actually doing when selecting
186
- a luxury value is setting how many values to skip over (causing decorrelation).
187
- The number of values to skip is controlled by the so-called p-value of the
188
- generator. After generating 24 values we skip p - 24 values until again
189
- generating 24 values.
190
-
191
- This implementation is somewhat modified from the original fortran
192
- implementation by F. James. Because of the way the OpenCL code is optimized with
193
- 4-component 32-bit float vectors, it is most convenient to always throw away
194
- some multiple of 24 values (i.e. p is always a multiple of 24).
195
-
196
- However, there might be some resonances if we always throw away a multiple of
197
- the seeds table size. Therefore the implementation is slightly more intricate
198
- where p can be a multiple of 4 instead, at a cost to performance (only about 10%
199
- lower than the cleaner 24 values approach on AMD Cypress). These two approaches
200
- are termed planar and planar shift respectively. The idea for the planar
201
- approach comes from the following paper:
202
- Vadim Demchik, Pseudo-random number generators for Monte Carlo simulations on
203
- Graphics Processing Units, arXiv:1003.1898v1 [hep-lat]
204
-
205
- Below the p-values for the original reference implementation are listed along
206
- with those of the planar shift implementation. Suggested values for the planar
207
- approach are also presented. When this function is called with RANLUXCL_LUX
208
- set to 0-4, the planar shift values are used. To use the pure planar approach
209
- (for some extra performance with likely undetectable quality decrease), set lux
210
- equal to the specific p-value.
211
-
212
- Luxury setting (RANLUXCL_LUX): 0 1 2 3 4
213
- Original fortran77 implementation by F. James: 24 48 97 223 389
214
- Planar (suggested): 24 48 120 240 408
215
- Planar shift: 24 48 100 224 404
216
-
217
- Note that levels 0 and 1 are the same as in the original implementation for both
218
- planar and planar shift. Level 4 of planar shift where p=404 is the same as
219
- chosen for luxury level 1 by Martin Luescher for his v3 version of RANLUX.
220
- Therefore if it is considered important to only use "official" values, luxury
221
- settings 0, 1 or 4 of planar shift should be used. It is however unlikely that
222
- the other values are bad, they just haven't been as extensively used and tested
223
- by others.
224
-
225
- Variable names are generally the same as in the fortran77 implementation,
226
- however because of the way the generator is implemented, the i24 and j24
227
- variables are no longer needed.
228
-
229
- ***** CREDIT *******************************************************************
230
-
231
- I have been told by Fred James (the coder) that the original Fortran 77
232
- implementation (which is the subject of the second paper below) is free to use
233
- and share. Therefore I am using the MIT license (below). But most importantly
234
- please always remember to give credit to the two articles by Martin Luscher and
235
- Fred James, describing the generator and the Fortran 77 implementation on which
236
- this implementation is based, respectively:
237
-
238
- Martin Luescher, A portable high-quality random number generator for lattice
239
- field theory simulations, Computer Physics Communications 79 (1994) 100-110
240
-
241
- F. James, RANLUX: A Fortran implementation of the high-quality pseudorandom
242
- number generator of Luescher, Computer Physics Communications 79 (1994) 111-114
243
-
244
- ***** LICENSE ******************************************************************
245
-
246
- Copyright (c) 2011 Ivar Ursin Nikolaisen
247
-
248
- Permission is hereby granted, free of charge, to any person obtaining a copy of
249
- this software and associated documentation files (the "Software"), to deal in
250
- the Software without restriction, including without limitation the rights to
251
- use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
252
- the Software, and to permit persons to whom the Software is furnished to do so,
253
- subject to the following conditions:
254
-
255
- The above copyright notice and this permission notice shall be included in all
256
- copies or substantial portions of the Software.
257
-
258
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
259
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
260
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
261
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
262
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
263
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
264
- SOFTWARE.
265
-
266
- *******************************************************************************/
267
-
268
- typedef struct{
269
- float
270
- s01, s02, s03, s04,
271
- s05, s06, s07, s08,
272
- s09, s10, s11, s12,
273
- s13, s14, s15, s16,
274
- s17, s18, s19, s20,
275
- s21, s22, s23, s24;
276
- float carry;
277
- float dummy; //Causes struct to be a multiple of 128 bits
278
- int in24;
279
- int stepnr;
280
- } ranluxcl_state_t;
281
-
282
- //Initial prototypes make Apple's compiler happy
283
- void ranluxcl_download_seed(ranluxcl_state_t *, global ranluxcl_state_t *);
284
- void ranluxcl_upload_seed(ranluxcl_state_t *, global ranluxcl_state_t *);
285
- float ranluxcl_os(float, float, float *, float *);
286
- float4 ranluxcl32(ranluxcl_state_t *);
287
- void ranluxcl_synchronize(ranluxcl_state_t *);
288
- void ranluxcl_initialization(uint, global ranluxcl_state_t *);
289
- float4 ranluxcl32norm(ranluxcl_state_t *);
290
-
291
- #ifdef RANLUXCL_SUPPORT_DOUBLE
292
- double4 ranluxcl64(ranluxcl_state_t *);
293
- double4 ranluxcl64norm(ranluxcl_state_t *);
294
- #endif
295
-
296
- #define RANLUXCL_TWOM24 0.000000059604644775f
297
- #define RANLUXCL_TWOM12 0.000244140625f
298
-
299
- #ifdef RANLUXCL_LUX
300
- #if RANLUXCL_LUX < 0
301
- #error ranluxcl: lux must be zero or positive.
302
- #endif
303
- #else
304
- #define RANLUXCL_LUX 4 //Default to high quality
305
- #endif //RANLUXCL_LUX
306
-
307
- //Here the luxury values are defined
308
- #if RANLUXCL_LUX == 0
309
- #define RANLUXCL_NSKIP 0
310
- #elif RANLUXCL_LUX == 1
311
- #define RANLUXCL_NSKIP 24
312
- #elif RANLUXCL_LUX == 2
313
- #define RANLUXCL_NSKIP 76
314
- #elif RANLUXCL_LUX == 3
315
- #define RANLUXCL_NSKIP 200
316
- #elif RANLUXCL_LUX == 4
317
- #define RANLUXCL_NSKIP 380
318
- #else
319
- #define RANLUXCL_NSKIP (RANLUXCL_LUX - 24)
320
- #endif //RANLUXCL_LUX == 0
321
-
322
- //Check that nskip is a permissible value
323
- #if RANLUXCL_NSKIP % 4 != 0
324
- #error nskip must be divisible by 4!
325
- #endif
326
- #if RANLUXCL_NSKIP < 24 && RANLUXCL_NSKIP != 0
327
- #error nskip must be either 0 or >= 24!
328
- #endif
329
- #if RANLUXCL_NSKIP < 0
330
- #error nskip is negative!
331
- #endif
332
-
333
- //Check if planar scheme is recovered
334
- #if RANLUXCL_NSKIP % 24 == 0
335
- #define RANLUXCL_PLANAR
336
- #endif
337
-
338
- //Check if we will skip at all
339
- #if RANLUXCL_NSKIP == 0
340
- #define RANLUXCL_NOSKIP
341
- #endif
342
-
343
- //Single-value global size and id
344
- #define RANLUXCL_NUMWORKITEMS \
345
- (get_global_size(0) * get_global_size(1) * get_global_size(2))
346
- #define RANLUXCL_MYID \
347
- (get_global_id(0) + get_global_id(1) * get_global_size(0) + \
348
- get_global_id(2) * get_global_size(0) * get_global_size(1))
349
-
350
- void ranluxcl_download_seed(ranluxcl_state_t *rst,
351
- global ranluxcl_state_t *ranluxcltab)
352
- {
353
- (*rst) = ranluxcltab[RANLUXCL_MYID];
354
- }
355
-
356
- void ranluxcl_upload_seed(ranluxcl_state_t *rst,
357
- global ranluxcl_state_t *ranluxcltab)
358
- {
359
- ranluxcltab[RANLUXCL_MYID] = (*rst);
360
- }
361
-
362
- /*
363
- * Performs one "step" (generates a single value or skip). Only used internally,
364
- * not intended to be called from user code.
365
- */
366
- float ranluxcl_os(float sj24m1, float sj24, float *si24, float *carry)
367
- {
368
- float uni, out;
369
- uni = sj24 - (*si24) - (*carry);
370
- if(uni < 0.0f){
371
- uni += 1.0f;
372
- (*carry) = RANLUXCL_TWOM24;
373
- } else (*carry) = 0.0f;
374
- out = ((*si24) = uni);
375
-
376
- if(uni < RANLUXCL_TWOM12){
377
- out += RANLUXCL_TWOM24 * sj24m1;
378
- if(out == 0.0f) out = RANLUXCL_TWOM24 * RANLUXCL_TWOM24;
379
- }
380
- return out;
381
- }
382
-
383
- /*
384
- * Return a float4 where each component is a uniformly distributed pseudo-
385
- * random value between 0 and 1, end points not included.
386
- */
387
- float4 ranluxcl32(ranluxcl_state_t *rst)
388
- {
389
- float4 out;
390
-
391
- if(rst->stepnr == 0){
392
- out.x = ranluxcl_os(rst->s09, rst->s10, &(rst->s24), &(rst->carry));
393
- out.y = ranluxcl_os(rst->s08, rst->s09, &(rst->s23), &(rst->carry));
394
- out.z = ranluxcl_os(rst->s07, rst->s08, &(rst->s22), &(rst->carry));
395
- out.w = ranluxcl_os(rst->s06, rst->s07, &(rst->s21), &(rst->carry));
396
- rst->stepnr += 4;
397
- }
398
-
399
- else if(rst->stepnr == 4){
400
- out.x = ranluxcl_os(rst->s05, rst->s06, &(rst->s20), &(rst->carry));
401
- out.y = ranluxcl_os(rst->s04, rst->s05, &(rst->s19), &(rst->carry));
402
- out.z = ranluxcl_os(rst->s03, rst->s04, &(rst->s18), &(rst->carry));
403
- out.w = ranluxcl_os(rst->s02, rst->s03, &(rst->s17), &(rst->carry));
404
- rst->stepnr += 4;
405
- }
406
-
407
- else if(rst->stepnr == 8){
408
- out.x = ranluxcl_os(rst->s01, rst->s02, &(rst->s16), &(rst->carry));
409
- out.y = ranluxcl_os(rst->s24, rst->s01, &(rst->s15), &(rst->carry));
410
- out.z = ranluxcl_os(rst->s23, rst->s24, &(rst->s14), &(rst->carry));
411
- out.w = ranluxcl_os(rst->s22, rst->s23, &(rst->s13), &(rst->carry));
412
- rst->stepnr += 4;
413
- }
414
-
415
- else if(rst->stepnr == 12){
416
- out.x = ranluxcl_os(rst->s21, rst->s22, &(rst->s12), &(rst->carry));
417
- out.y = ranluxcl_os(rst->s20, rst->s21, &(rst->s11), &(rst->carry));
418
- out.z = ranluxcl_os(rst->s19, rst->s20, &(rst->s10), &(rst->carry));
419
- out.w = ranluxcl_os(rst->s18, rst->s19, &(rst->s09), &(rst->carry));
420
- rst->stepnr += 4;
421
- }
422
-
423
- else if(rst->stepnr == 16){
424
- out.x = ranluxcl_os(rst->s17, rst->s18, &(rst->s08), &(rst->carry));
425
- out.y = ranluxcl_os(rst->s16, rst->s17, &(rst->s07), &(rst->carry));
426
- out.z = ranluxcl_os(rst->s15, rst->s16, &(rst->s06), &(rst->carry));
427
- out.w = ranluxcl_os(rst->s14, rst->s15, &(rst->s05), &(rst->carry));
428
- rst->stepnr += 4;
429
- }
430
-
431
- else if(rst->stepnr == 20){
432
- out.x = ranluxcl_os(rst->s13, rst->s14, &(rst->s04), &(rst->carry));
433
- out.y = ranluxcl_os(rst->s12, rst->s13, &(rst->s03), &(rst->carry));
434
- out.z = ranluxcl_os(rst->s11, rst->s12, &(rst->s02), &(rst->carry));
435
- out.w = ranluxcl_os(rst->s10, rst->s11, &(rst->s01), &(rst->carry));
436
- rst->stepnr = 0;
437
-
438
- // The below preprocessor directives are here to recover the simpler planar
439
- // scheme when nskip is a multiple of 24. For the most general planar shift
440
- // approach, just ignore all #if's below.
441
- #ifndef RANLUXCL_PLANAR
442
- }
443
-
444
- (*&(rst->in24)) += 4;
445
- if((*&(rst->in24)) == 24){
446
- (*&(rst->in24)) = 0;
447
- #endif //RANLUXCL_PLANAR
448
-
449
- int initialskips = (rst->stepnr) ? (24 - rst->stepnr) : 0;
450
- int bulkskips = ((RANLUXCL_NSKIP - initialskips)/24) * 24;
451
- int remainingskips = RANLUXCL_NSKIP - initialskips - bulkskips;
452
-
453
- //We know there won't be any initial skips in the planar scheme
454
- #ifndef RANLUXCL_PLANAR
455
- //Do initial skips (lack of breaks in switch is intentional).
456
- switch(initialskips){
457
- case(20):
458
- ranluxcl_os(rst->s05, rst->s06, &(rst->s20), &(rst->carry));
459
- ranluxcl_os(rst->s04, rst->s05, &(rst->s19), &(rst->carry));
460
- ranluxcl_os(rst->s03, rst->s04, &(rst->s18), &(rst->carry));
461
- ranluxcl_os(rst->s02, rst->s03, &(rst->s17), &(rst->carry));
462
- case(16):
463
- ranluxcl_os(rst->s01, rst->s02, &(rst->s16), &(rst->carry));
464
- ranluxcl_os(rst->s24, rst->s01, &(rst->s15), &(rst->carry));
465
- ranluxcl_os(rst->s23, rst->s24, &(rst->s14), &(rst->carry));
466
- ranluxcl_os(rst->s22, rst->s23, &(rst->s13), &(rst->carry));
467
- case(12):
468
- ranluxcl_os(rst->s21, rst->s22, &(rst->s12), &(rst->carry));
469
- ranluxcl_os(rst->s20, rst->s21, &(rst->s11), &(rst->carry));
470
- ranluxcl_os(rst->s19, rst->s20, &(rst->s10), &(rst->carry));
471
- ranluxcl_os(rst->s18, rst->s19, &(rst->s09), &(rst->carry));
472
- case(8):
473
- ranluxcl_os(rst->s17, rst->s18, &(rst->s08), &(rst->carry));
474
- ranluxcl_os(rst->s16, rst->s17, &(rst->s07), &(rst->carry));
475
- ranluxcl_os(rst->s15, rst->s16, &(rst->s06), &(rst->carry));
476
- ranluxcl_os(rst->s14, rst->s15, &(rst->s05), &(rst->carry));
477
- case(4):
478
- ranluxcl_os(rst->s13, rst->s14, &(rst->s04), &(rst->carry));
479
- ranluxcl_os(rst->s12, rst->s13, &(rst->s03), &(rst->carry));
480
- ranluxcl_os(rst->s11, rst->s12, &(rst->s02), &(rst->carry));
481
- ranluxcl_os(rst->s10, rst->s11, &(rst->s01), &(rst->carry));
482
- }
483
- #endif //RANLUXCL_PLANAR
484
-
485
- //Also check if we will ever need to skip at all
486
- #ifndef RANLUXCL_NOSKIP
487
- for(int i=0; i<bulkskips/24; i++){
488
- ranluxcl_os(rst->s09, rst->s10, &(rst->s24), &(rst->carry));
489
- ranluxcl_os(rst->s08, rst->s09, &(rst->s23), &(rst->carry));
490
- ranluxcl_os(rst->s07, rst->s08, &(rst->s22), &(rst->carry));
491
- ranluxcl_os(rst->s06, rst->s07, &(rst->s21), &(rst->carry));
492
- ranluxcl_os(rst->s05, rst->s06, &(rst->s20), &(rst->carry));
493
- ranluxcl_os(rst->s04, rst->s05, &(rst->s19), &(rst->carry));
494
- ranluxcl_os(rst->s03, rst->s04, &(rst->s18), &(rst->carry));
495
- ranluxcl_os(rst->s02, rst->s03, &(rst->s17), &(rst->carry));
496
- ranluxcl_os(rst->s01, rst->s02, &(rst->s16), &(rst->carry));
497
- ranluxcl_os(rst->s24, rst->s01, &(rst->s15), &(rst->carry));
498
- ranluxcl_os(rst->s23, rst->s24, &(rst->s14), &(rst->carry));
499
- ranluxcl_os(rst->s22, rst->s23, &(rst->s13), &(rst->carry));
500
- ranluxcl_os(rst->s21, rst->s22, &(rst->s12), &(rst->carry));
501
- ranluxcl_os(rst->s20, rst->s21, &(rst->s11), &(rst->carry));
502
- ranluxcl_os(rst->s19, rst->s20, &(rst->s10), &(rst->carry));
503
- ranluxcl_os(rst->s18, rst->s19, &(rst->s09), &(rst->carry));
504
- ranluxcl_os(rst->s17, rst->s18, &(rst->s08), &(rst->carry));
505
- ranluxcl_os(rst->s16, rst->s17, &(rst->s07), &(rst->carry));
506
- ranluxcl_os(rst->s15, rst->s16, &(rst->s06), &(rst->carry));
507
- ranluxcl_os(rst->s14, rst->s15, &(rst->s05), &(rst->carry));
508
- ranluxcl_os(rst->s13, rst->s14, &(rst->s04), &(rst->carry));
509
- ranluxcl_os(rst->s12, rst->s13, &(rst->s03), &(rst->carry));
510
- ranluxcl_os(rst->s11, rst->s12, &(rst->s02), &(rst->carry));
511
- ranluxcl_os(rst->s10, rst->s11, &(rst->s01), &(rst->carry));
512
- }
513
- #endif //RANLUXCL_NOSKIP
514
-
515
- //There also won't be any remaining skips in the planar scheme
516
- #ifndef RANLUXCL_PLANAR
517
- //Do remaining skips
518
- if(remainingskips){
519
- ranluxcl_os(rst->s09, rst->s10, &(rst->s24), &(rst->carry));
520
- ranluxcl_os(rst->s08, rst->s09, &(rst->s23), &(rst->carry));
521
- ranluxcl_os(rst->s07, rst->s08, &(rst->s22), &(rst->carry));
522
- ranluxcl_os(rst->s06, rst->s07, &(rst->s21), &(rst->carry));
523
-
524
- if(remainingskips > 4){
525
- ranluxcl_os(rst->s05, rst->s06, &(rst->s20), &(rst->carry));
526
- ranluxcl_os(rst->s04, rst->s05, &(rst->s19), &(rst->carry));
527
- ranluxcl_os(rst->s03, rst->s04, &(rst->s18), &(rst->carry));
528
- ranluxcl_os(rst->s02, rst->s03, &(rst->s17), &(rst->carry));
529
- }
530
-
531
- if(remainingskips > 8){
532
- ranluxcl_os(rst->s01, rst->s02, &(rst->s16), &(rst->carry));
533
- ranluxcl_os(rst->s24, rst->s01, &(rst->s15), &(rst->carry));
534
- ranluxcl_os(rst->s23, rst->s24, &(rst->s14), &(rst->carry));
535
- ranluxcl_os(rst->s22, rst->s23, &(rst->s13), &(rst->carry));
536
- }
537
-
538
- if(remainingskips > 12){
539
- ranluxcl_os(rst->s21, rst->s22, &(rst->s12), &(rst->carry));
540
- ranluxcl_os(rst->s20, rst->s21, &(rst->s11), &(rst->carry));
541
- ranluxcl_os(rst->s19, rst->s20, &(rst->s10), &(rst->carry));
542
- ranluxcl_os(rst->s18, rst->s19, &(rst->s09), &(rst->carry));
543
- }
544
-
545
- if(remainingskips > 16){
546
- ranluxcl_os(rst->s17, rst->s18, &(rst->s08), &(rst->carry));
547
- ranluxcl_os(rst->s16, rst->s17, &(rst->s07), &(rst->carry));
548
- ranluxcl_os(rst->s15, rst->s16, &(rst->s06), &(rst->carry));
549
- ranluxcl_os(rst->s14, rst->s15, &(rst->s05), &(rst->carry));
550
- }
551
- }
552
- #endif //RANLUXCL_PLANAR
553
-
554
- // Initial skips brought stepnr down to 0. The bulk skips did only
555
- // full cycles. Therefore stepnr is now equal to remainingskips.
556
- rst->stepnr = remainingskips;
557
- }
558
-
559
- return out;
560
- }
561
-
562
- /*
563
- * Perform the necessary operations to set the generator to the "beginning",
564
- * i.e., ready to generate 24 numbers before the next skipping sequence. This
565
- * is useful if different work-items have called ranluxcl a different number
566
- * of times. Since that would lead to out of sync execution on different work-
567
- * items it could be rather inefficient on SIMD architectures (like current
568
- * GPUs). This function thus allows us to resynchronize execution across work-
569
- * items. rst is the generator state to advance; it is only touched through the ranluxcl32 calls below, whose results are discarded.
570
- */
571
- void ranluxcl_synchronize(ranluxcl_state_t *rst)
572
- {
573
- // Make as many ranluxcl32 calls as needed so that stepnr == 0 at the end. The tests are sequential ifs, not else-ifs: each one re-reads the stepnr left by the previous call, so entering at 4 cascades through all five.
574
- if(rst->stepnr == 4) // presumably each ranluxcl32 call advances stepnr by 4 (its body is outside this view - confirm)
575
- ranluxcl32(rst);
576
- if(rst->stepnr == 8)
577
- ranluxcl32(rst);
578
- if(rst->stepnr == 12)
579
- ranluxcl32(rst);
580
- if(rst->stepnr == 16)
581
- ranluxcl32(rst);
582
- if(rst->stepnr == 20) // last step of the cascade; stepnr == 0 on entry matches no test, so no call is made
583
- ranluxcl32(rst);
584
- }
585
-
586
- /*
587
- * Uses a 64-bit xorshift PRNG by George Marsaglia to initialize the generator.
588
- *
589
- * This function can be used instead of ranluxcl_initialization if manual
590
- * control of the seed of each generator is desired. x must be different for
591
- * every call of this function, and ranluxcltab should point to the specific
592
- * entry in the table to be initialized. Compare this to ranluxcl_initialization
593
- * where ins needs only be unique for each NDRange, and ranluxcltab points
594
- * to the base address of the table for the entire NDRange. Also note that
595
- * depending on what you are doing the ranluxcl_upload_seed and
596
- * ranluxcl_download_seed functions may not do what you want, so make sure
597
- * you know what you are doing! Note that x == 0 stays 0 under the xorshift steps, so all 24 state words start at zero; the doubled warm-up in the body exists for that case.
598
- */
599
-
600
- void ranluxcl_init(ulong x, global ranluxcl_state_t *ranluxcltab)
601
- {
602
- ranluxcl_state_t rst;
603
-
604
- #define RANLUXCL_POW2_24 16777216 // 2^24, scales a 24-bit integer field into [0,1)
605
- #define RANLUXCL_56 0x00FFFFFFFFFFFFFF // mask: low 56 bits
606
- #define RANLUXCL_48 0x0000FFFFFFFFFFFF // mask: low 48 bits
607
- #define RANLUXCL_40 0x000000FFFFFFFFFF // mask: low 40 bits
608
- #define RANLUXCL_32 0x00000000FFFFFFFF // mask: low 32 bits
609
- #define RANLUXCL_24 0x0000000000FFFFFF // mask: low 24 bits
610
- #define RANLUXCL_16 0x000000000000FFFF // mask: low 16 bits
611
- #define RANLUXCL_8 0x00000000000000FF // mask: low 8 bits
612
-
613
- ulong x1, x2, x3;
614
-
615
- //Logical shifts used so that all 64 bits of output are used (24 bits
616
- //per float), to be certain that all initial states are different. Each line below is one xorshift step (shifts 13, 7, 17); three steps give 3*64 = 192 bits, cut into eight consecutive 24-bit fields.
617
- x^=(x<<13);x^=(x>>7);x^=(x<<17);x1=x;
618
- x^=(x<<13);x^=(x>>7);x^=(x<<17);x2=x;
619
- x^=(x<<13);x^=(x>>7);x^=(x<<17);x3=x;
620
- rst.s01 = (float) (x1 >> 40) // x1 bits 63..40
621
- / (float)RANLUXCL_POW2_24;
622
- rst.s02 = (float) ((x1 & RANLUXCL_40) >> 16) // x1 bits 39..16
623
- / (float)RANLUXCL_POW2_24;
624
- rst.s03 = (float)(((x1 & RANLUXCL_16) << 8) + (x2 >> 56)) // x1 bits 15..0 followed by x2 bits 63..56
625
- / (float)RANLUXCL_POW2_24;
626
- rst.s04 = (float) ((x2 & RANLUXCL_56) >> 32) // x2 bits 55..32
627
- / (float)RANLUXCL_POW2_24;
628
- rst.s05 = (float) ((x2 & RANLUXCL_32) >> 8) // x2 bits 31..8
629
- / (float)RANLUXCL_POW2_24;
630
- rst.s06 = (float)(((x2 & RANLUXCL_8 ) << 16) + (x3 >> 48)) // x2 bits 7..0 followed by x3 bits 63..48
631
- / (float)RANLUXCL_POW2_24;
632
- rst.s07 = (float) ((x3 & RANLUXCL_48) >> 24) // x3 bits 47..24
633
- / (float)RANLUXCL_POW2_24;
634
- rst.s08 = (float) (x3 & RANLUXCL_24) // x3 bits 23..0
635
- / (float)RANLUXCL_POW2_24;
636
-
637
- x^=(x<<13);x^=(x>>7);x^=(x<<17);x1=x; // three more xorshift steps; s09..s16 use the same slicing as s01..s08
638
- x^=(x<<13);x^=(x>>7);x^=(x<<17);x2=x;
639
- x^=(x<<13);x^=(x>>7);x^=(x<<17);x3=x;
640
- rst.s09 = (float) (x1 >> 40)
641
- / (float)RANLUXCL_POW2_24;
642
- rst.s10 = (float) ((x1 & RANLUXCL_40) >> 16)
643
- / (float)RANLUXCL_POW2_24;
644
- rst.s11 = (float)(((x1 & RANLUXCL_16) << 8) + (x2 >> 56))
645
- / (float)RANLUXCL_POW2_24;
646
- rst.s12 = (float) ((x2 & RANLUXCL_56) >> 32)
647
- / (float)RANLUXCL_POW2_24;
648
- rst.s13 = (float) ((x2 & RANLUXCL_32) >> 8)
649
- / (float)RANLUXCL_POW2_24;
650
- rst.s14 = (float)(((x2 & RANLUXCL_8 ) << 16) + (x3 >> 48))
651
- / (float)RANLUXCL_POW2_24;
652
- rst.s15 = (float) ((x3 & RANLUXCL_48) >> 24)
653
- / (float)RANLUXCL_POW2_24;
654
- rst.s16 = (float) (x3 & RANLUXCL_24)
655
- / (float)RANLUXCL_POW2_24;
656
-
657
- x^=(x<<13);x^=(x>>7);x^=(x<<17);x1=x; // final three xorshift steps; s17..s24 use the same slicing again
658
- x^=(x<<13);x^=(x>>7);x^=(x<<17);x2=x;
659
- x^=(x<<13);x^=(x>>7);x^=(x<<17);x3=x;
660
- rst.s17 = (float) (x1 >> 40)
661
- / (float)RANLUXCL_POW2_24;
662
- rst.s18 = (float) ((x1 & RANLUXCL_40) >> 16)
663
- / (float)RANLUXCL_POW2_24;
664
- rst.s19 = (float)(((x1 & RANLUXCL_16) << 8) + (x2 >> 56))
665
- / (float)RANLUXCL_POW2_24;
666
- rst.s20 = (float) ((x2 & RANLUXCL_56) >> 32)
667
- / (float)RANLUXCL_POW2_24;
668
- rst.s21 = (float) ((x2 & RANLUXCL_32) >> 8)
669
- / (float)RANLUXCL_POW2_24;
670
- rst.s22 = (float)(((x2 & RANLUXCL_8 ) << 16) + (x3 >> 48))
671
- / (float)RANLUXCL_POW2_24;
672
- rst.s23 = (float) ((x3 & RANLUXCL_48) >> 24)
673
- / (float)RANLUXCL_POW2_24;
674
- rst.s24 = (float) (x3 & RANLUXCL_24)
675
- / (float)RANLUXCL_POW2_24;
676
-
677
- #undef RANLUXCL_POW2_24
678
- #undef RANLUXCL_56
679
- #undef RANLUXCL_48
680
- #undef RANLUXCL_40
681
- #undef RANLUXCL_32
682
- #undef RANLUXCL_24
683
- #undef RANLUXCL_16
684
- #undef RANLUXCL_8
685
-
686
- rst.in24 = 0;
687
- rst.stepnr = 0;
688
- rst.carry = 0.0f;
689
- if(rst.s24 == 0.0f) // NOTE(review): carry is seeded only when s24 is exactly zero, same rule as ranluxcl_init_legacy; RANLUXCL_TWOM24 is defined outside this view (presumably 2^-24 - confirm)
690
- rst.carry = RANLUXCL_TWOM24;
691
-
692
- #ifndef RANLUXCL_NO_WARMUP
693
- //Warming up the generator, ensuring there are no initial correlations.
694
- //16 is a "magic number". It is the number of times we must generate
695
- //a batch of 24 numbers to ensure complete decorrelation, however it
696
- //seems like it is necessary to double this for the special case when
697
- //the generator is initialized to all zeros.
698
- for(int i=0; i<16 * 2; i++){ // 32 passes; each pass makes 24 ranluxcl_os calls, one per state word as the output argument
699
- ranluxcl_os(rst.s09, rst.s10, &(rst.s24), &(rst.carry));
700
- ranluxcl_os(rst.s08, rst.s09, &(rst.s23), &(rst.carry));
701
- ranluxcl_os(rst.s07, rst.s08, &(rst.s22), &(rst.carry));
702
- ranluxcl_os(rst.s06, rst.s07, &(rst.s21), &(rst.carry));
703
- ranluxcl_os(rst.s05, rst.s06, &(rst.s20), &(rst.carry));
704
- ranluxcl_os(rst.s04, rst.s05, &(rst.s19), &(rst.carry));
705
- ranluxcl_os(rst.s03, rst.s04, &(rst.s18), &(rst.carry));
706
- ranluxcl_os(rst.s02, rst.s03, &(rst.s17), &(rst.carry));
707
- ranluxcl_os(rst.s01, rst.s02, &(rst.s16), &(rst.carry));
708
- ranluxcl_os(rst.s24, rst.s01, &(rst.s15), &(rst.carry));
709
- ranluxcl_os(rst.s23, rst.s24, &(rst.s14), &(rst.carry));
710
- ranluxcl_os(rst.s22, rst.s23, &(rst.s13), &(rst.carry));
711
- ranluxcl_os(rst.s21, rst.s22, &(rst.s12), &(rst.carry));
712
- ranluxcl_os(rst.s20, rst.s21, &(rst.s11), &(rst.carry));
713
- ranluxcl_os(rst.s19, rst.s20, &(rst.s10), &(rst.carry));
714
- ranluxcl_os(rst.s18, rst.s19, &(rst.s09), &(rst.carry));
715
- ranluxcl_os(rst.s17, rst.s18, &(rst.s08), &(rst.carry));
716
- ranluxcl_os(rst.s16, rst.s17, &(rst.s07), &(rst.carry));
717
- ranluxcl_os(rst.s15, rst.s16, &(rst.s06), &(rst.carry));
718
- ranluxcl_os(rst.s14, rst.s15, &(rst.s05), &(rst.carry));
719
- ranluxcl_os(rst.s13, rst.s14, &(rst.s04), &(rst.carry));
720
- ranluxcl_os(rst.s12, rst.s13, &(rst.s03), &(rst.carry));
721
- ranluxcl_os(rst.s11, rst.s12, &(rst.s02), &(rst.carry));
722
- ranluxcl_os(rst.s10, rst.s11, &(rst.s01), &(rst.carry));
723
- }
724
- #endif //RANLUXCL_NO_WARMUP
725
-
726
- //Upload the state: a plain struct copy into the single entry ranluxcltab points at (ranluxcl_init_legacy goes through ranluxcl_upload_seed instead)
727
- *ranluxcltab = rst;
728
- }
729
-
730
- void ranluxcl_init_legacy(uint ins, global ranluxcl_state_t *ranluxcltab)
731
- {
732
- //Using legacy initialization from original Fortran 77 implementation: a per-work-item integer seed js drives a linear congruential generator whose low 24 bits fill the 24 state words.
733
-
734
- //ins is scaled so that if the user makes another call somewhere else
735
- //with ins + 1 there should be no overlap. Also adding one
736
- //allows us to use ins = 0.
737
- int k, maxWorkitems;
738
- ranluxcl_state_t rst;
739
-
740
- #ifdef RANLUXCL_MAXWORKITEMS
741
- maxWorkitems = RANLUXCL_MAXWORKITEMS;
742
- #else
743
- maxWorkitems = RANLUXCL_NUMWORKITEMS;
744
- #endif //RANLUXCL_MAXWORKITEMS
745
-
746
- int scaledins = ins * maxWorkitems + 1; // NOTE(review): the uint * int product is narrowed to int, so a large ins can wrap; the js < 1 clamp below catches only non-positive results - confirm this is acceptable
747
-
748
- int js = scaledins + RANLUXCL_MYID; // RANLUXCL_MYID is defined outside this view (presumably this work-item's index - confirm)
749
-
750
- //Make sure js is not too small (should really be an error)
751
- if(js < 1)
752
- js = 1;
753
-
754
- #define IC 2147483563 // LCG modulus; equals 40014*53668 + 12211
755
- #define ITWO24 16777216 // 2^24
756
-
757
- k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; // one step js = 40014*js mod IC, split as q=53668, r=12211 so intermediates stay within int range for js in [1, IC)
758
- rst.s01=(js%ITWO24)*RANLUXCL_TWOM24; // low 24 bits of js, scaled by RANLUXCL_TWOM24 (defined outside this view; presumably 2^-24)
759
- k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
760
- rst.s02=(js%ITWO24)*RANLUXCL_TWOM24;
761
- k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
762
- rst.s03=(js%ITWO24)*RANLUXCL_TWOM24;
763
- k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
764
- rst.s04=(js%ITWO24)*RANLUXCL_TWOM24;
765
- k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
766
- rst.s05=(js%ITWO24)*RANLUXCL_TWOM24;
767
- k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
768
- rst.s06=(js%ITWO24)*RANLUXCL_TWOM24;
769
- k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
770
- rst.s07=(js%ITWO24)*RANLUXCL_TWOM24;
771
- k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
772
- rst.s08=(js%ITWO24)*RANLUXCL_TWOM24;
773
- k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
774
- rst.s09=(js%ITWO24)*RANLUXCL_TWOM24;
775
- k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
776
- rst.s10=(js%ITWO24)*RANLUXCL_TWOM24;
777
- k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
778
- rst.s11=(js%ITWO24)*RANLUXCL_TWOM24;
779
- k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
780
- rst.s12=(js%ITWO24)*RANLUXCL_TWOM24;
781
- k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
782
- rst.s13=(js%ITWO24)*RANLUXCL_TWOM24;
783
- k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
784
- rst.s14=(js%ITWO24)*RANLUXCL_TWOM24;
785
- k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
786
- rst.s15=(js%ITWO24)*RANLUXCL_TWOM24;
787
- k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
788
- rst.s16=(js%ITWO24)*RANLUXCL_TWOM24;
789
- k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
790
- rst.s17=(js%ITWO24)*RANLUXCL_TWOM24;
791
- k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
792
- rst.s18=(js%ITWO24)*RANLUXCL_TWOM24;
793
- k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
794
- rst.s19=(js%ITWO24)*RANLUXCL_TWOM24;
795
- k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
796
- rst.s20=(js%ITWO24)*RANLUXCL_TWOM24;
797
- k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
798
- rst.s21=(js%ITWO24)*RANLUXCL_TWOM24;
799
- k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
800
- rst.s22=(js%ITWO24)*RANLUXCL_TWOM24;
801
- k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
802
- rst.s23=(js%ITWO24)*RANLUXCL_TWOM24;
803
- k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
804
- rst.s24=(js%ITWO24)*RANLUXCL_TWOM24;
805
-
806
- #undef IC
807
- #undef ITWO24
808
-
809
- rst.in24 = 0;
810
- rst.stepnr = 0;
811
- rst.carry = 0.0f;
812
- if(rst.s24 == 0.0f) // NOTE(review): carry is seeded only when s24 is exactly zero; the rationale is not visible here (presumably inherited from the reference implementation - confirm)
813
- rst.carry = RANLUXCL_TWOM24;
814
-
815
- #ifndef RANLUXCL_NO_WARMUP
816
- //Warming up the generator, ensuring there are no initial correlations.
817
- //16 is a "magic number". It is the number of times we must generate
818
- //a batch of 24 numbers to ensure complete decorrelation.
819
- for(int i=0; i<16; i++){ // 16 passes here (ranluxcl_init uses 16 * 2); each pass makes 24 ranluxcl_os calls, one per state word as the output argument
820
- ranluxcl_os(rst.s09, rst.s10, &(rst.s24), &(rst.carry));
821
- ranluxcl_os(rst.s08, rst.s09, &(rst.s23), &(rst.carry));
822
- ranluxcl_os(rst.s07, rst.s08, &(rst.s22), &(rst.carry));
823
- ranluxcl_os(rst.s06, rst.s07, &(rst.s21), &(rst.carry));
824
- ranluxcl_os(rst.s05, rst.s06, &(rst.s20), &(rst.carry));
825
- ranluxcl_os(rst.s04, rst.s05, &(rst.s19), &(rst.carry));
826
- ranluxcl_os(rst.s03, rst.s04, &(rst.s18), &(rst.carry));
827
- ranluxcl_os(rst.s02, rst.s03, &(rst.s17), &(rst.carry));
828
- ranluxcl_os(rst.s01, rst.s02, &(rst.s16), &(rst.carry));
829
- ranluxcl_os(rst.s24, rst.s01, &(rst.s15), &(rst.carry));
830
- ranluxcl_os(rst.s23, rst.s24, &(rst.s14), &(rst.carry));
831
- ranluxcl_os(rst.s22, rst.s23, &(rst.s13), &(rst.carry));
832
- ranluxcl_os(rst.s21, rst.s22, &(rst.s12), &(rst.carry));
833
- ranluxcl_os(rst.s20, rst.s21, &(rst.s11), &(rst.carry));
834
- ranluxcl_os(rst.s19, rst.s20, &(rst.s10), &(rst.carry));
835
- ranluxcl_os(rst.s18, rst.s19, &(rst.s09), &(rst.carry));
836
- ranluxcl_os(rst.s17, rst.s18, &(rst.s08), &(rst.carry));
837
- ranluxcl_os(rst.s16, rst.s17, &(rst.s07), &(rst.carry));
838
- ranluxcl_os(rst.s15, rst.s16, &(rst.s06), &(rst.carry));
839
- ranluxcl_os(rst.s14, rst.s15, &(rst.s05), &(rst.carry));
840
- ranluxcl_os(rst.s13, rst.s14, &(rst.s04), &(rst.carry));
841
- ranluxcl_os(rst.s12, rst.s13, &(rst.s03), &(rst.carry));
842
- ranluxcl_os(rst.s11, rst.s12, &(rst.s02), &(rst.carry));
843
- ranluxcl_os(rst.s10, rst.s11, &(rst.s01), &(rst.carry));
844
- }
845
- #endif //RANLUXCL_NO_WARMUP
846
-
847
- //Upload the state via ranluxcl_upload_seed (defined outside this view; presumably it selects this work-item's table entry, since ranluxcltab is passed unmodified - confirm)
848
- ranluxcl_upload_seed(&rst, ranluxcltab);
849
- }
850
-
851
- void ranluxcl_initialization(uint ins, global ranluxcl_state_t *ranluxcltab)
852
- { // Seeds the calling work-item's generator from ins: dispatches at compile time to either the legacy initializer or ranluxcl_init.
853
- #ifdef RANLUXCL_USE_LEGACY_INITIALIZATION
854
- ranluxcl_init_legacy(ins, ranluxcltab); // receives the table pointer unmodified
855
-
856
- #else // Not RANLUXCL_USE_LEGACY_INITIALIZATION
857
-
858
- // We scale ins by 2^32. As long as we never use more than (2^32)-1
859
- // work-items per NDRange the initial states should never be the same.
860
-
861
- ulong x = (ulong)RANLUXCL_MYID + (ulong)ins * ((ulong)UINT_MAX + 1); // ins lands in the upper 32 bits of x, RANLUXCL_MYID is added below it; ins == 0 with RANLUXCL_MYID == 0 gives x == 0
862
- ranluxcl_init(x, ranluxcltab + RANLUXCL_MYID); // ranluxcl_init writes exactly the entry it is handed, so the table is offset by RANLUXCL_MYID here
863
-
864
- #endif // RANLUXCL_USE_LEGACY_INITIALIZATION
865
- }
866
-
867
- float4 ranluxcl32norm(ranluxcl_state_t *rst)
868
- {
869
- //Returns a vector where each component is a normally
870
- //distributed PRN centered on 0, with standard deviation 1. Uses the Box-Muller transform on one ranluxcl32 draw: the pairs (U.x, U.y) and (U.z, U.w) each produce two outputs.
871
-
872
- //Roll our own since M_PI_F does not exist in OpenCL 1.0.
873
- #define RANLUXCL_PI_F 3.1415926535f
874
-
875
- float4 U = ranluxcl32(rst); // one call supplies all four uniforms and advances rst
876
-
877
- float4 Z;
878
- float R, phi;
879
-
880
- R = sqrt(-2 * log(U.x)); // radius from U.x; log(0) would be -infinity, so this relies on ranluxcl32 not returning 0 (presumably guaranteed by its never-zero handling - confirm)
881
- phi = 2 * RANLUXCL_PI_F * U.y; // angle from U.y
882
- Z.x = R * cos(phi);
883
- Z.y = R * sin(phi);
884
-
885
- R = sqrt(-2 * log(U.z)); // second pair: radius from U.z, angle from U.w
886
- phi = 2 * RANLUXCL_PI_F * U.w;
887
- Z.z = R * cos(phi);
888
- Z.w = R * sin(phi);
889
-
890
- return Z;
891
-
892
- #undef RANLUXCL_PI_F
893
- }
894
-
895
- #ifdef RANLUXCL_SUPPORT_DOUBLE
896
- double4 ranluxcl64(ranluxcl_state_t *rst)
897
- { // Builds four doubles from two ranluxcl32 draws (eight floats): each double is hi + lo / 2^24.
898
- double4 out;
899
- float4 randvec;
900
-
901
- //We know this value is caused by the never-zero part
902
- //of the original algorithm, but we want to allow zero for
903
- //the most significant bits in the double precision result.
904
- randvec = ranluxcl32(rst);
905
- if(randvec.x == RANLUXCL_TWOM24 * RANLUXCL_TWOM24) // only .x and .z (the high parts) are reset to zero; .y and .w are used as returned
906
- randvec.x = 0.0f;
907
- if(randvec.z == RANLUXCL_TWOM24 * RANLUXCL_TWOM24)
908
- randvec.z = 0.0f;
909
-
910
- out.x = (double)(randvec.x) + (double)(randvec.y) / 16777216; // .x is the high part, .y scaled by 1/2^24 is the low part
911
- out.y = (double)(randvec.z) + (double)(randvec.w) / 16777216;
912
-
913
- randvec = ranluxcl32(rst); // second draw fills out.z and out.w the same way
914
- if(randvec.x == RANLUXCL_TWOM24 * RANLUXCL_TWOM24)
915
- randvec.x = 0.0f;
916
- if(randvec.z == RANLUXCL_TWOM24 * RANLUXCL_TWOM24)
917
- randvec.z = 0.0f;
918
-
919
- out.z = (double)(randvec.x) + (double)(randvec.y) / 16777216;
920
- out.w = (double)(randvec.z) + (double)(randvec.w) / 16777216;
921
-
922
- return out;
923
- }
924
-
925
- double4 ranluxcl64norm(ranluxcl_state_t *rst)
926
- {
927
- //Returns a vector where each component is a normally
928
- //distributed PRN centered on 0, with standard deviation
929
- //1. Double-precision counterpart of ranluxcl32norm: the Box-Muller transform applied to one ranluxcl64 draw.
930
-
931
- double4 U = ranluxcl64(rst); // four uniform doubles; advances rst
932
-
933
- double4 Z;
934
- double R, phi;
935
-
936
- R = sqrt(-2 * log(U.x)); // radius from U.x; ranluxcl64 may zero the high part of a component, so a nonzero U.x depends on its low part being nonzero (presumably guaranteed by ranluxcl32 - confirm)
937
- phi = 2 * M_PI * U.y; // angle from U.y
938
- Z.x = R * cos(phi);
939
- Z.y = R * sin(phi);
940
-
941
- R = sqrt(-2 * log(U.z)); // second pair: radius from U.z, angle from U.w
942
- phi = 2 * M_PI * U.w;
943
- Z.z = R * cos(phi);
944
- Z.w = R * sin(phi);
945
-
946
- return Z;
947
- }
948
- #endif //RANLUXCL_SUPPORT_DOUBLE
949
-
950
- #undef RANLUXCL_TWOM24
951
- #undef RANLUXCL_TWOM12
952
- #undef RANLUXCL_NUMWORKITEMS
953
- #undef RANLUXCL_MYID
954
- #undef RANLUXCL_PLANAR
955
- #undef RANLUXCL_NOSKIP
956
-
957
- #endif //RANLUXCL_CL