PyPI - pyopencl - Versions diffs - 2024.1__cp311-cp311-win_amd64.whl → 2024.2__cp311-cp311-win_amd64.whl - Mend

pyopencl 2024.1__cp311-cp311-win_amd64.whl → 2024.2__cp311-cp311-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of pyopencl might be problematic. Click here for more details.

Files changed (107) hide show

pyopencl/__init__.py +82 -80
pyopencl/_cl.cp311-win_amd64.pyd +0 -0
pyopencl/algorithm.py +8 -10
pyopencl/array.py +16 -12
pyopencl/bitonic_sort.py +5 -4
pyopencl/cache.py +22 -22
pyopencl/capture_call.py +4 -3
pyopencl/characterize/__init__.py +4 -2
pyopencl/characterize/performance.py +2 -1
pyopencl/clmath.py +2 -1
pyopencl/clrandom.py +5 -369
pyopencl/cltypes.py +4 -1
pyopencl/compyte/dtypes.py +1 -1
pyopencl/compyte/ndarray/gen_elemwise.py +6 -5
pyopencl/compyte/ndarray/gen_reduction.py +6 -6
pyopencl/compyte/ndarray/setup_opencl.py +3 -2
pyopencl/compyte/ndarray/test_gpu_elemwise.py +5 -4
pyopencl/compyte/ndarray/test_gpu_ndarray.py +0 -1
pyopencl/elementwise.py +4 -6
pyopencl/invoker.py +15 -9
pyopencl/ipython_ext.py +1 -1
pyopencl/reduction.py +5 -5
pyopencl/scan.py +17 -21
pyopencl/tools.py +13 -16
pyopencl/version.py +1 -1
pyopencl-2024.2.data/data/CITATION.cff +74 -0
{pyopencl-2024.1.dist-info → pyopencl-2024.2.data/data}/LICENSE +0 -23
pyopencl-2024.2.data/data/Makefile.in +21 -0
pyopencl-2024.2.data/data/README.rst +70 -0
pyopencl-2024.2.data/data/README_SETUP.txt +34 -0
pyopencl-2024.2.data/data/aksetup_helper.py +1013 -0
pyopencl-2024.2.data/data/configure.py +6 -0
pyopencl-2024.2.data/data/contrib/cldis.py +91 -0
pyopencl-2024.2.data/data/contrib/fortran-to-opencl/README +29 -0
pyopencl-2024.2.data/data/contrib/fortran-to-opencl/translate.py +1441 -0
pyopencl-2024.2.data/data/contrib/pyopencl.vim +84 -0
pyopencl-2024.2.data/data/doc/Makefile +23 -0
pyopencl-2024.2.data/data/doc/algorithm.rst +214 -0
pyopencl-2024.2.data/data/doc/array.rst +305 -0
pyopencl-2024.2.data/data/doc/conf.py +26 -0
pyopencl-2024.2.data/data/doc/howto.rst +105 -0
pyopencl-2024.2.data/data/doc/index.rst +137 -0
pyopencl-2024.2.data/data/doc/make_constants.py +561 -0
pyopencl-2024.2.data/data/doc/misc.rst +885 -0
pyopencl-2024.2.data/data/doc/runtime.rst +51 -0
pyopencl-2024.2.data/data/doc/runtime_const.rst +30 -0
pyopencl-2024.2.data/data/doc/runtime_gl.rst +78 -0
pyopencl-2024.2.data/data/doc/runtime_memory.rst +527 -0
pyopencl-2024.2.data/data/doc/runtime_platform.rst +184 -0
pyopencl-2024.2.data/data/doc/runtime_program.rst +364 -0
pyopencl-2024.2.data/data/doc/runtime_queue.rst +182 -0
pyopencl-2024.2.data/data/doc/subst.rst +36 -0
pyopencl-2024.2.data/data/doc/tools.rst +4 -0
pyopencl-2024.2.data/data/doc/types.rst +42 -0
pyopencl-2024.2.data/data/examples/black-hole-accretion.py +2227 -0
pyopencl-2024.2.data/data/examples/demo-struct-reduce.py +75 -0
pyopencl-2024.2.data/data/examples/demo.py +39 -0
pyopencl-2024.2.data/data/examples/demo_array.py +32 -0
pyopencl-2024.2.data/data/examples/demo_array_svm.py +37 -0
pyopencl-2024.2.data/data/examples/demo_elementwise.py +34 -0
pyopencl-2024.2.data/data/examples/demo_elementwise_complex.py +53 -0
pyopencl-2024.2.data/data/examples/demo_mandelbrot.py +183 -0
pyopencl-2024.2.data/data/examples/demo_meta_codepy.py +56 -0
pyopencl-2024.2.data/data/examples/demo_meta_template.py +55 -0
pyopencl-2024.2.data/data/examples/dump-performance.py +38 -0
pyopencl-2024.2.data/data/examples/dump-properties.py +86 -0
pyopencl-2024.2.data/data/examples/gl_interop_demo.py +84 -0
pyopencl-2024.2.data/data/examples/gl_particle_animation.py +218 -0
pyopencl-2024.2.data/data/examples/ipython-demo.ipynb +203 -0
pyopencl-2024.2.data/data/examples/median-filter.py +99 -0
pyopencl-2024.2.data/data/examples/n-body.py +1070 -0
pyopencl-2024.2.data/data/examples/narray.py +37 -0
pyopencl-2024.2.data/data/examples/noisyImage.jpg +0 -0
pyopencl-2024.2.data/data/examples/pi-monte-carlo.py +1166 -0
pyopencl-2024.2.data/data/examples/svm.py +82 -0
pyopencl-2024.2.data/data/examples/transpose.py +229 -0
pyopencl-2024.2.data/data/pytest.ini +3 -0
pyopencl-2024.2.data/data/src/bitlog.cpp +51 -0
pyopencl-2024.2.data/data/src/bitlog.hpp +83 -0
pyopencl-2024.2.data/data/src/clinfo_ext.h +134 -0
pyopencl-2024.2.data/data/src/mempool.hpp +444 -0
pyopencl-2024.2.data/data/src/pyopencl_ext.h +77 -0
pyopencl-2024.2.data/data/src/tools.hpp +90 -0
pyopencl-2024.2.data/data/src/wrap_cl.cpp +61 -0
pyopencl-2024.2.data/data/src/wrap_cl.hpp +5853 -0
pyopencl-2024.2.data/data/src/wrap_cl_part_1.cpp +369 -0
pyopencl-2024.2.data/data/src/wrap_cl_part_2.cpp +702 -0
pyopencl-2024.2.data/data/src/wrap_constants.cpp +1274 -0
pyopencl-2024.2.data/data/src/wrap_helpers.hpp +213 -0
pyopencl-2024.2.data/data/src/wrap_mempool.cpp +731 -0
pyopencl-2024.2.data/data/test/add-vectors-32.spv +0 -0
pyopencl-2024.2.data/data/test/add-vectors-64.spv +0 -0
pyopencl-2024.2.data/data/test/empty-header.h +1 -0
pyopencl-2024.2.data/data/test/test_algorithm.py +1180 -0
pyopencl-2024.2.data/data/test/test_array.py +2392 -0
pyopencl-2024.2.data/data/test/test_arrays_in_structs.py +100 -0
pyopencl-2024.2.data/data/test/test_clmath.py +529 -0
pyopencl-2024.2.data/data/test/test_clrandom.py +75 -0
pyopencl-2024.2.data/data/test/test_enqueue_copy.py +271 -0
pyopencl-2024.2.data/data/test/test_wrapper.py +1554 -0
pyopencl-2024.2.dist-info/LICENSE +282 -0
{pyopencl-2024.1.dist-info → pyopencl-2024.2.dist-info}/METADATA +12 -12
pyopencl-2024.2.dist-info/RECORD +122 -0
{pyopencl-2024.1.dist-info → pyopencl-2024.2.dist-info}/WHEEL +1 -1
pyopencl/cl/pyopencl-ranluxcl.cl +0 -957
pyopencl-2024.1.dist-info/RECORD +0 -48
{pyopencl-2024.1.dist-info → pyopencl-2024.2.dist-info}/top_level.txt +0 -0

pyopencl/cl/pyopencl-ranluxcl.cl DELETED Viewed

@@ -1,957 +0,0 @@
-/* RanluxCL is deprecated in PyOpenCL and will be removed in the 2018.x
- * versions of the package. */
-#ifndef RANLUXCL_CL
-#define RANLUXCL_CL
-/**** RANLUXCL v1.3.1 MODIFIED *************************************************
-Implements the RANLUX generator of Martin Luscher, based on the Fortran 77
-implementation by Fred James. This OpenCL code is a complete implementation
-which should perfectly replicate the numbers generated by the original Fortran
-77 implementation (if using the legacy initialization routine).
-***** QUICK USAGE DESCRIPTION **************************************************
-1. Create an OpenCL buffer with room for at least 28 32-bit variables (112 byte)
-per work-item. I.e., in C/C++: size_t buffSize = numWorkitems * 112;
-2. Pass the buffer and an unsigned integer seed <ins> to a kernel that launches
-the ranluxcl_initialization function. The seed <ins> can be any unsigned 32-bit
-integer, and must be different on different OpenCL devices/NDRanges to ensure
-different sequences. As long as the number of work-items on each device/NDRange
-is less than 2^32 = 4294967296 all sequences will be different.
-An example initialization kernel would be:
-	#include "ranluxcl.cl"
-	kernel void Kernel_Ranluxcl_Init(private uint ins,
-		global ranluxcl_state_t *ranluxcltab)
-	{
-		ranluxcl_initialization(ins, ranluxcltab);
-	}
-3. Now the generator is ready for use. Remember to download the seeds first,
-and upload them again when done. Example kernel that downloads seeds, generates
-a float4 where each component is uniformly distributed between 0 and 1, end
-points not included, then uploads the seeds again:
-	#include "ranluxcl.cl"
-	kernel void Kernel_Example(global ranluxcl_state_t *ranluxcltab)
-	{
-		//ranluxclstate stores the state of the generator.
-		ranluxcl_state_t ranluxclstate;
-		//Download state into ranluxclstate struct.
-		ranluxcl_download_seed(&ranluxclstate, ranluxcltab);
-		//Generate a float4 with each component on (0,1),
-		//end points not included. We can call ranluxcl as many
-		//times as we like until we upload the state again.
-		float4 randomnr = ranluxcl32(&ranluxclstate);
-		//Upload state again so that we don't get the same
-		//numbers over again the next time we use ranluxcl.
-		ranluxcl_upload_seed(&ranluxclstate, ranluxcltab);
-	}
-***** MACROS *******************************************************************
-The following macros can optionally be defined:
-RANLUXCL_LUX:
-Sets the luxury level of the generator. Should be 0-4, or if it is 24 or larger
-it sets the p-value of the generator (generally not needed). If this macro is
-not set then lux=4 is the default (highest quality). For many applications the
-high quality of lux=4 may not be needed. Indeed if two values (each value
-having 24 random bits) are glued together to form a 48-bit value the generator
-passes all tests in the TestU01 suite already with lux=2. See
-"TestU01: A C Library for Empirical Testing of Random Number Generators" by
-PIERRE L'ECUYER and RICHARD SIMARD. SWB(224, 10, 24)[24, l] is RANLUX with
-two values glued together to create 48-bit numbers, and we see that it passes
-all tests already at luxury value 2.
-RANLUXCL_NO_WARMUP:
-Turns off the warmup functionality in ranluxcl_initialization. This macro
-should generally not be used, since the generators will initially be correlated
-if it is defined. The only advantage is that the numbers generated will exactly
-correspond to those of the original Fortran 77 implementation.
-RANLUXCL_SUPPORT_DOUBLE:
-Enables double precision functions. Please enable the OpenCL double precision
-extension yourself, usually by "#pragma OPENCL EXTENSION cl_khr_fp64 : enable".
-RANLUXCL_USE_LEGACY_INITIALIZATION
-Uses exactly the same initialization routine as in the original Fortran 77 code,
-leading to the same sequences. If using legacy initialization there are some
-restrictions on what the seed <ins> can be, and it may also be necessary to
-define RANLUXCL_MAXWORKITEMS if several sequences are to be run in parallel.
-RANLUXCL_MAXWORKITEMS:
-When RANLUXCL_USE_LEGACY_INITIALIZATION is defined we may need this macro.
-If several OpenCL NDRanges will be running in parallel and the parallel
-sequences should be different then this macro should have a value equal or
-larger than the
-largest number of work-items in any of the parallel runs. The default is to
-use the current global size, so if all NDRanges are of the same size this need
-not be defined.
-	Each parallel instance must also have different seeds <ins>. For example if
-we are launching 5120 work-items on GPU1 and 10240 work-items on GPU2 we would
-use different seeds for the two generators, and RANLUXCL_MAXWORKITEMS must be
-defined to be at least 10240. If GPU1 and GPU2 had the same number of work-items
-this would not be necessary.
-	An underestimate of the highest permissible seed <ins> is given by the
-smallest of:
-(<maxins> = 10^9 / <numWorkitems>) or (<maxins> = 10^9 / RANLUXCL_MAXWORKITEMS).
-Please make sure that <ins> is never higher than this since it could cause
-undetected problems. For example with 10240 work-items the highest permissible
-<ins> is about 100 000.
-	Again note that this is only relevant when using the legacy initialization
-function enabled by RANLUXCL_USE_LEGACY_INITIALIZATION. When not using the
-legacy initialization this macro is effectively set to a very high value of
-2^32-1.
-***** FUNCTIONS: INITIALIZATION ************************************************
-The initialization function is defined as:
-void ranluxcl_initialization(uint ins, global ranluxcl_state_t *ranluxcltab)
-Run once at the very beginning. ranluxcltab should be a buffer with space for
-112 byte per work-item in the NDRange. <ins> is the seed to the generator.
-For a given <ins> each work-item in the NDRange will generate a different
-sequence. If more than one NDRange is used in parallel then <ins> must be
-different for each NDRange to avoid identical sequences.
-***** FUNCTIONS: SEED UPLOAD/DOWNLOAD ******************************************
-The following two functions should be launched at the beginning and end of a
-kernel that uses ranluxcl to generate numbers, respectively:
-void ranluxcl_download_seed(ranluxcl_state_t *rst,
-	global ranluxcl_state_t *ranluxcltab)
-Run at the beginning of a kernel to download ranluxcl state data
-void ranluxcl_upload_seed(ranluxcl_state_t *rst,
-	global ranluxcl_state_t *ranluxcltab)
-Run at the end of a kernel to upload state data
-***** FUNCTIONS: GENERATION AND SYNCHRONIZATION ********************************
-float4 ranluxcl32(ranluxcl_state_t *rst)
-Run to generate a pseudo-random float4 where each component is a number between
-0 and 1, end points not included (meaning the number will never be exactly 0 or
-1).
-double4 ranluxcl64(ranluxcl_state_t *rst)
-Double precision version of the above function. The preprocessor macro
-RANLUXCL_SUPPORT_DOUBLE must be defined for this function to be available.
-This function "glues" together two single-precision numbers to make one double
-precision number. Most of the work is still done in single precision, so the
-performance will be roughly halved regardless of the double precision
-performance of the hardware.
-float4 ranluxcl32norm(ranluxcl_state_t *rst)
-Run to generate a pseudo-random float4 where each component is normally
-distributed with mean 0 and standard deviation 1.
-double4 ranluxcl64norm(ranluxcl_state_t *rst)
-Double precision version of the above function. The preprocessor macro
-RANLUXCL_SUPPORT_DOUBLE must be defined for this function to be available.
-void ranluxcl_synchronize(ranluxcl_state_t *rst)
-Run to synchronize execution in case different work-items have made a different
-number of calls to ranluxcl. On SIMD machines this could lead to inefficient
-execution. ranluxcl_synchronize allows us to make sure all generators are
-SIMD-friendly again. Not needed if all work-items always call ranluxcl the same
-number of times.
-***** PERFORMANCE **************************************************************
-For luxury setting 4, performance on AMD Cypress should be ~4.5*10^9 pseudo-
-random values per second, when not downloading values to host memory (i.e. the
-values are just generated, but not used for anything in particular).
-***** DESCRIPTION OF THE IMPLEMENTATION ****************************************
-This code closely follows the original Fortran 77 code (see credit section).
-Here the differences (and similarities) between RANLUXCL (this implementation)
-and the original RANLUX are discussed.
-The Fortran 77 implementation uses a simple LCG to initialize the generator, and
-so the same approach is taken here. If RANLUXCL is initialized with <ins> = 0 as
-seed, the first work-item behaves like the original RANLUX with seed equal 1,
-the second work-item as if with seed equal 2 and so on. If <ins> = 1 then the
-first work-item behaves like the original RANLUX with seed equal to
-<numWorkitems> + 1, and so on for higher <ins> so that we never have overlapping
-sequences. This is why the RANLUXCL_MAXWORKITEMS macro must be set if we have
-different NDRanges with a different number of work-items.
-RANLUX is based on chaos theory, and what we are actually doing when selecting
-a luxury value is setting how many values to skip over (causing decorrelation).
-The number of values to skip is controlled by the so-called p-value of the
-generator. After generating 24 values we skip p - 24 values until again
-generating 24 values.
-This implementation is somewhat modified from the original fortran
-implementation by F. James. Because of the way the OpenCL code is optimized with
-4-component 32-bit float vectors, it is most convenient to always throw away
-some multiple of 24 values (i.e. p is always a multiple of 24).
-However, there might be some resonances if we always throw away a multiple of
-the seeds table size. Therefore the implementation is slightly more intricate
-where p can be a multiple of 4 instead, at a cost to performance (only about 10%
-lower than the cleaner 24 values approach on AMD Cypress). These two approaches
-are termed planar and planar shift respectively. The idea for the planar
-approach comes from the following paper:
-Vadim Demchik, Pseudo-random number generators for Monte Carlo simulations on
-Graphics Processing Units, arXiv:1003.1898v1 [hep-lat]
-Below the p-values for the original reference implementation are listed along
-with those of the planar shift implementation. Suggested values for the planar
-approach are also presented. When this function is called with RANLUXCL_LUX
-set to 0-4, the planar shift values are used. To use the pure planar approach
-(for some extra performance with likely undetectable quality decrease), set lux
-equal to the specific p-value.
-Luxury setting (RANLUXCL_LUX):                   0   1   2   3   4
-Original fortran77 implementation by F. James:  24  48  97  223 389
-Planar (suggested):                             24  48  120 240 408
-Planar shift:                                   24  48  100 224 404
-Note that levels 0 and 1 are the same as in the original implementation for both
-planar and planar shift. Level 4 of planar shift where p=404 is the same as
-chosen for luxury level 1 by Martin Luescher for his v3 version of RANLUX.
-Therefore if it is considered important to only use "official" values, luxury
-settings 0, 1 or 4 of planar shift should be used. It is however unlikely that
-the other values are bad, they just haven't been as extensively used and tested
-by others.
-Variable names are generally the same as in the fortran77 implementation,
-however because of the way the generator is implemented, the i24 and j24
-variables are no longer needed.
-***** CREDIT *******************************************************************
-I have been told by Fred James (the coder) that the original Fortran 77
-implementation (which is the subject of the second paper below) is free to use
-and share. Therefore I am using the MIT license (below). But most importantly
-please always remember to give credit to the two articles by Martin Luscher and
-Fred James, describing the generator and the Fortran 77 implementation on which
-this implementation is based, respectively:
-Martin Luescher, A portable high-quality random number generator for lattice
-field theory simulations, Computer Physics Communications 79 (1994) 100-110
-F. James, RANLUX: A Fortran implementation of the high-quality pseudorandom
-number generator of Luescher, Computer Physics Communications 79 (1994) 111-114
-***** LICENSE ******************************************************************
-Copyright (c) 2011 Ivar Ursin Nikolaisen
-Permission is hereby granted, free of charge, to any person obtaining a copy of
-this software and associated documentation files (the "Software"), to deal in
-the Software without restriction, including without limitation the rights to
-use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
-the Software, and to permit persons to whom the Software is furnished to do so,
-subject to the following conditions:
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-*******************************************************************************/
-typedef struct{ //Per-work-item generator state: 28 32-bit fields (112 bytes)
-	float //The 24 entries of the seeds table
-		s01, s02, s03, s04,
-		s05, s06, s07, s08,
-		s09, s10, s11, s12,
-		s13, s14, s15, s16,
-		s17, s18, s19, s20,
-		s21, s22, s23, s24;
-	float carry; //Borrow from the last step: 0 or RANLUXCL_TWOM24 (set in ranluxcl_os)
-	float dummy; //Causes struct to be a multiple of 128 bits
-	int in24; //Values generated since the last skip; advanced by 4 and reset at 24 (planar shift scheme only)
-	int stepnr; //Position in the 24-step cycle (0, 4, ..., 20); selects the next group of four steps in ranluxcl32
-} ranluxcl_state_t;
-//Initial prototypes makes Apple's compiler happy
-void ranluxcl_download_seed(ranluxcl_state_t *, global ranluxcl_state_t *);
-void ranluxcl_upload_seed(ranluxcl_state_t *, global ranluxcl_state_t *);
-float ranluxcl_os(float, float, float *, float *);
-float4 ranluxcl32(ranluxcl_state_t *);
-void ranluxcl_synchronize(ranluxcl_state_t *);
-void ranluxcl_initialization(uint, global ranluxcl_state_t *);
-float4 ranluxcl32norm(ranluxcl_state_t *);
-#ifdef RANLUXCL_SUPPORT_DOUBLE
-double4 ranluxcl64(ranluxcl_state_t *);
-double4 ranluxcl64norm(ranluxcl_state_t *);
-#endif
-#define RANLUXCL_TWOM24 0.000000059604644775f //2^-24
-#define RANLUXCL_TWOM12 0.000244140625f //2^-12
-#ifdef RANLUXCL_LUX
-#if RANLUXCL_LUX < 0
-#error ranluxcl: lux must be zero or positive.
-#endif
-#else
-#define RANLUXCL_LUX 4 //Default to high quality
-#endif //RANLUXCL_LUX
-//Here the luxury values are defined: for lux 0-4, nskip = p - 24 with p the planar shift p-value; otherwise lux is itself taken as p
-#if RANLUXCL_LUX == 0
-#define RANLUXCL_NSKIP 0
-#elif RANLUXCL_LUX == 1
-#define RANLUXCL_NSKIP 24
-#elif RANLUXCL_LUX == 2
-#define RANLUXCL_NSKIP 76
-#elif RANLUXCL_LUX == 3
-#define RANLUXCL_NSKIP 200
-#elif RANLUXCL_LUX == 4
-#define RANLUXCL_NSKIP 380
-#else
-#define RANLUXCL_NSKIP (RANLUXCL_LUX - 24)
-#endif //RANLUXCL_LUX == 0
-//Check that nskip is a permissible value
-#if RANLUXCL_NSKIP % 4 != 0
-#error nskip must be divisible by 4!
-#endif
-#if RANLUXCL_NSKIP < 24 && RANLUXCL_NSKIP != 0
-#error nskip must be either 0 or >= 24!
-#endif
-#if RANLUXCL_NSKIP < 0
-#error nskip is negative!
-#endif
-//Check if planar scheme is recovered
-#if RANLUXCL_NSKIP % 24 == 0
-#define RANLUXCL_PLANAR
-#endif
-//Check if we will skip at all
-#if RANLUXCL_NSKIP == 0
-#define RANLUXCL_NOSKIP
-#endif
-//Single-value global size and id: the NDRange dimensions 0-2 are flattened so that RANLUXCL_MYID can index the state table
-#define RANLUXCL_NUMWORKITEMS \
-	(get_global_size(0) * get_global_size(1) * get_global_size(2))
-#define RANLUXCL_MYID \
-	(get_global_id(0) + get_global_id(1) * get_global_size(0) + \
-	 get_global_id(2) * get_global_size(0) * get_global_size(1))
-void ranluxcl_download_seed(ranluxcl_state_t *rst, //Destination: state struct to fill
-	global ranluxcl_state_t *ranluxcltab) //Source: base of the state table in global memory
-{
-	(*rst) = ranluxcltab[RANLUXCL_MYID]; //Copy the entry at this work-item's flattened global id into *rst
-}
-void ranluxcl_upload_seed(ranluxcl_state_t *rst, //Source: state struct to store
-	global ranluxcl_state_t *ranluxcltab) //Destination: base of the state table in global memory
-{
-	ranluxcltab[RANLUXCL_MYID] = (*rst); //Write *rst to the entry at this work-item's flattened global id
-}
-/*
- * Performs one "step" (generates a single value or skip). Only used internally,
- * not intended to be called from user code.
- *
- * Computes uni = sj24 - *si24 - *carry and adds 1.0f if the result is
- * negative. uni is stored back into *si24, and *carry is set to
- * RANLUXCL_TWOM24 when the 1.0f was added, otherwise to 0. Returns uni; when
- * uni < RANLUXCL_TWOM12 the return value additionally gets
- * RANLUXCL_TWOM24 * sj24m1 added and is replaced by RANLUXCL_TWOM24^2 if it
- * would be exactly zero.
- */
-float ranluxcl_os(float sj24m1, float sj24, float *si24, float *carry)
-{
-	float uni, out;
-	uni = sj24 - (*si24) - (*carry); //Subtract the table entry and the pending borrow
-	if(uni < 0.0f){
-		uni += 1.0f;
-		(*carry) = RANLUXCL_TWOM24; //Borrow 2^-24 on the next step
-	} else (*carry) = 0.0f;
-	out = ((*si24) = uni); //The new value replaces the table entry
-	if(uni < RANLUXCL_TWOM12){
-		out += RANLUXCL_TWOM24 * sj24m1; //Small value: pad with a scaled-down neighbouring entry
-		if(out == 0.0f) out = RANLUXCL_TWOM24 * RANLUXCL_TWOM24; //Never return exactly 0
-	}
-	return out;
-}
-/*
- * Return a float4 where each component is a uniformly distributed pseudo-
- * random value between 0 and 1, end points not included.
- *
- * rst->stepnr (0, 4, ..., 20) selects which four table entries are advanced
- * by this call. In the planar scheme (RANLUXCL_PLANAR defined) the skipping
- * code sits inside the stepnr == 20 branch; in the planar shift scheme that
- * branch is closed early by the preprocessor and the skipping code runs
- * whenever rst->in24 reaches 24. Skipped values are produced with the same
- * ranluxcl_os steps but their results are discarded.
- */
-float4 ranluxcl32(ranluxcl_state_t *rst)
-{
-	float4 out;
-	if(rst->stepnr == 0){
-		out.x = ranluxcl_os(rst->s09, rst->s10, &(rst->s24), &(rst->carry));
-		out.y = ranluxcl_os(rst->s08, rst->s09, &(rst->s23), &(rst->carry));
-		out.z = ranluxcl_os(rst->s07, rst->s08, &(rst->s22), &(rst->carry));
-		out.w = ranluxcl_os(rst->s06, rst->s07, &(rst->s21), &(rst->carry));
-		rst->stepnr += 4;
-	}
-	else if(rst->stepnr == 4){
-		out.x = ranluxcl_os(rst->s05, rst->s06, &(rst->s20), &(rst->carry));
-		out.y = ranluxcl_os(rst->s04, rst->s05, &(rst->s19), &(rst->carry));
-		out.z = ranluxcl_os(rst->s03, rst->s04, &(rst->s18), &(rst->carry));
-		out.w = ranluxcl_os(rst->s02, rst->s03, &(rst->s17), &(rst->carry));
-		rst->stepnr += 4;
-	}
-	else if(rst->stepnr == 8){
-		out.x = ranluxcl_os(rst->s01, rst->s02, &(rst->s16), &(rst->carry));
-		out.y = ranluxcl_os(rst->s24, rst->s01, &(rst->s15), &(rst->carry));
-		out.z = ranluxcl_os(rst->s23, rst->s24, &(rst->s14), &(rst->carry));
-		out.w = ranluxcl_os(rst->s22, rst->s23, &(rst->s13), &(rst->carry));
-		rst->stepnr += 4;
-	}
-	else if(rst->stepnr == 12){
-		out.x = ranluxcl_os(rst->s21, rst->s22, &(rst->s12), &(rst->carry));
-		out.y = ranluxcl_os(rst->s20, rst->s21, &(rst->s11), &(rst->carry));
-		out.z = ranluxcl_os(rst->s19, rst->s20, &(rst->s10), &(rst->carry));
-		out.w = ranluxcl_os(rst->s18, rst->s19, &(rst->s09), &(rst->carry));
-		rst->stepnr += 4;
-	}
-	else if(rst->stepnr == 16){
-		out.x = ranluxcl_os(rst->s17, rst->s18, &(rst->s08), &(rst->carry));
-		out.y = ranluxcl_os(rst->s16, rst->s17, &(rst->s07), &(rst->carry));
-		out.z = ranluxcl_os(rst->s15, rst->s16, &(rst->s06), &(rst->carry));
-		out.w = ranluxcl_os(rst->s14, rst->s15, &(rst->s05), &(rst->carry));
-		rst->stepnr += 4;
-	}
-	else if(rst->stepnr == 20){
-		out.x = ranluxcl_os(rst->s13, rst->s14, &(rst->s04), &(rst->carry));
-		out.y = ranluxcl_os(rst->s12, rst->s13, &(rst->s03), &(rst->carry));
-		out.z = ranluxcl_os(rst->s11, rst->s12, &(rst->s02), &(rst->carry));
-		out.w = ranluxcl_os(rst->s10, rst->s11, &(rst->s01), &(rst->carry));
-		rst->stepnr = 0;
-// The below preprocessor directives are here to recover the simpler planar
-// scheme when nskip is a multiple of 24. For the most general planar shift
-// approach, just ignore all #if's below.
-#ifndef RANLUXCL_PLANAR
-	}
-	(*&(rst->in24)) += 4;
-	if((*&(rst->in24)) == 24){
-		(*&(rst->in24)) = 0;
-#endif //RANLUXCL_PLANAR
-		int initialskips = (rst->stepnr) ? (24 - rst->stepnr) : 0; //Steps needed to finish the current 24-step cycle
-		int bulkskips = ((RANLUXCL_NSKIP - initialskips)/24) * 24; //Steps covered by whole 24-step cycles
-		int remainingskips = RANLUXCL_NSKIP - initialskips - bulkskips; //Steps left over after the whole cycles
-//We know there won't be any initial skips in the planar scheme
-#ifndef RANLUXCL_PLANAR
-		//Do initial skips (lack of breaks in switch is intentional).
-		switch(initialskips){
-			case(20):
-				ranluxcl_os(rst->s05, rst->s06, &(rst->s20), &(rst->carry));
-				ranluxcl_os(rst->s04, rst->s05, &(rst->s19), &(rst->carry));
-				ranluxcl_os(rst->s03, rst->s04, &(rst->s18), &(rst->carry));
-				ranluxcl_os(rst->s02, rst->s03, &(rst->s17), &(rst->carry));
-			case(16):
-				ranluxcl_os(rst->s01, rst->s02, &(rst->s16), &(rst->carry));
-				ranluxcl_os(rst->s24, rst->s01, &(rst->s15), &(rst->carry));
-				ranluxcl_os(rst->s23, rst->s24, &(rst->s14), &(rst->carry));
-				ranluxcl_os(rst->s22, rst->s23, &(rst->s13), &(rst->carry));
-			case(12):
-				ranluxcl_os(rst->s21, rst->s22, &(rst->s12), &(rst->carry));
-				ranluxcl_os(rst->s20, rst->s21, &(rst->s11), &(rst->carry));
-				ranluxcl_os(rst->s19, rst->s20, &(rst->s10), &(rst->carry));
-				ranluxcl_os(rst->s18, rst->s19, &(rst->s09), &(rst->carry));
-			case(8):
-				ranluxcl_os(rst->s17, rst->s18, &(rst->s08), &(rst->carry));
-				ranluxcl_os(rst->s16, rst->s17, &(rst->s07), &(rst->carry));
-				ranluxcl_os(rst->s15, rst->s16, &(rst->s06), &(rst->carry));
-				ranluxcl_os(rst->s14, rst->s15, &(rst->s05), &(rst->carry));
-			case(4):
-				ranluxcl_os(rst->s13, rst->s14, &(rst->s04), &(rst->carry));
-				ranluxcl_os(rst->s12, rst->s13, &(rst->s03), &(rst->carry));
-				ranluxcl_os(rst->s11, rst->s12, &(rst->s02), &(rst->carry));
-				ranluxcl_os(rst->s10, rst->s11, &(rst->s01), &(rst->carry));
-		}
-#endif //RANLUXCL_PLANAR
-//Also check if we will ever need to skip at all
-#ifndef RANLUXCL_NOSKIP
-		for(int i=0; i<bulkskips/24; i++){
-			ranluxcl_os(rst->s09, rst->s10, &(rst->s24), &(rst->carry));
-			ranluxcl_os(rst->s08, rst->s09, &(rst->s23), &(rst->carry));
-			ranluxcl_os(rst->s07, rst->s08, &(rst->s22), &(rst->carry));
-			ranluxcl_os(rst->s06, rst->s07, &(rst->s21), &(rst->carry));
-			ranluxcl_os(rst->s05, rst->s06, &(rst->s20), &(rst->carry));
-			ranluxcl_os(rst->s04, rst->s05, &(rst->s19), &(rst->carry));
-			ranluxcl_os(rst->s03, rst->s04, &(rst->s18), &(rst->carry));
-			ranluxcl_os(rst->s02, rst->s03, &(rst->s17), &(rst->carry));
-			ranluxcl_os(rst->s01, rst->s02, &(rst->s16), &(rst->carry));
-			ranluxcl_os(rst->s24, rst->s01, &(rst->s15), &(rst->carry));
-			ranluxcl_os(rst->s23, rst->s24, &(rst->s14), &(rst->carry));
-			ranluxcl_os(rst->s22, rst->s23, &(rst->s13), &(rst->carry));
-			ranluxcl_os(rst->s21, rst->s22, &(rst->s12), &(rst->carry));
-			ranluxcl_os(rst->s20, rst->s21, &(rst->s11), &(rst->carry));
-			ranluxcl_os(rst->s19, rst->s20, &(rst->s10), &(rst->carry));
-			ranluxcl_os(rst->s18, rst->s19, &(rst->s09), &(rst->carry));
-			ranluxcl_os(rst->s17, rst->s18, &(rst->s08), &(rst->carry));
-			ranluxcl_os(rst->s16, rst->s17, &(rst->s07), &(rst->carry));
-			ranluxcl_os(rst->s15, rst->s16, &(rst->s06), &(rst->carry));
-			ranluxcl_os(rst->s14, rst->s15, &(rst->s05), &(rst->carry));
-			ranluxcl_os(rst->s13, rst->s14, &(rst->s04), &(rst->carry));
-			ranluxcl_os(rst->s12, rst->s13, &(rst->s03), &(rst->carry));
-			ranluxcl_os(rst->s11, rst->s12, &(rst->s02), &(rst->carry));
-			ranluxcl_os(rst->s10, rst->s11, &(rst->s01), &(rst->carry));
-		}
-#endif //RANLUXCL_NOSKIP
-//There also won't be any remaining skips in the planar scheme
-#ifndef RANLUXCL_PLANAR
-		//Do remaining skips
-		if(remainingskips){
-			ranluxcl_os(rst->s09, rst->s10, &(rst->s24), &(rst->carry));
-			ranluxcl_os(rst->s08, rst->s09, &(rst->s23), &(rst->carry));
-			ranluxcl_os(rst->s07, rst->s08, &(rst->s22), &(rst->carry));
-			ranluxcl_os(rst->s06, rst->s07, &(rst->s21), &(rst->carry));
-			if(remainingskips > 4){
-				ranluxcl_os(rst->s05, rst->s06, &(rst->s20), &(rst->carry));
-				ranluxcl_os(rst->s04, rst->s05, &(rst->s19), &(rst->carry));
-				ranluxcl_os(rst->s03, rst->s04, &(rst->s18), &(rst->carry));
-				ranluxcl_os(rst->s02, rst->s03, &(rst->s17), &(rst->carry));
-			}
-			if(remainingskips > 8){
-				ranluxcl_os(rst->s01, rst->s02, &(rst->s16), &(rst->carry));
-				ranluxcl_os(rst->s24, rst->s01, &(rst->s15), &(rst->carry));
-				ranluxcl_os(rst->s23, rst->s24, &(rst->s14), &(rst->carry));
-				ranluxcl_os(rst->s22, rst->s23, &(rst->s13), &(rst->carry));
-			}
-			if(remainingskips > 12){
-				ranluxcl_os(rst->s21, rst->s22, &(rst->s12), &(rst->carry));
-				ranluxcl_os(rst->s20, rst->s21, &(rst->s11), &(rst->carry));
-				ranluxcl_os(rst->s19, rst->s20, &(rst->s10), &(rst->carry));
-				ranluxcl_os(rst->s18, rst->s19, &(rst->s09), &(rst->carry));
-			}
-			if(remainingskips > 16){
-				ranluxcl_os(rst->s17, rst->s18, &(rst->s08), &(rst->carry));
-				ranluxcl_os(rst->s16, rst->s17, &(rst->s07), &(rst->carry));
-				ranluxcl_os(rst->s15, rst->s16, &(rst->s06), &(rst->carry));
-				ranluxcl_os(rst->s14, rst->s15, &(rst->s05), &(rst->carry));
-			}
-		}
-#endif //RANLUXCL_PLANAR
-		// Initial skips brought stepnr down to 0. The bulk skips did only
-		// full cycles. Therefore stepnr is now equal to remainingskips.
-		rst->stepnr = remainingskips;
-	}
-	return out;
-}
-/*
- * Perform the necessary operations to set the generator to the "beginning",
- * i.e., ready to generate 24 numbers before the next skipping sequence. This
- * is useful if different work-items have called ranluxcl a different number
- * of times. Since that would lead to out of sync execution on different work-
- * items it could be rather inefficient on SIMD architectures (like current
- * GPUs). This function thus allows us to resynchronize execution across work-
- * items.
- *
- * The checks below cascade: a call to ranluxcl32 with stepnr < 20 advances
- * stepnr by 4, so once one check matches, every later check matches as well.
- * The generated values are discarded.
- */
-void ranluxcl_synchronize(ranluxcl_state_t *rst)
-{
-	// Do necessary number of calls to ranluxcl so that stepnr == 0 at the end.
-	if(rst->stepnr == 4)
-		ranluxcl32(rst);
-	if(rst->stepnr == 8)
-		ranluxcl32(rst);
-	if(rst->stepnr == 12)
-		ranluxcl32(rst);
-	if(rst->stepnr == 16)
-		ranluxcl32(rst);
-	if(rst->stepnr == 20)
-		ranluxcl32(rst);
-}
-/*
- * Uses a 64-bit xorshift PRNG by George Marsaglia to initialize the generator.
- *
- * This function can be used instead of ranluxcl_initialization if manual
- * control of the seed of each generator is desired. x must be unique for each
- * time this function is called, and *ranluxcltab should point to the specific
- * entry in the table to be initialized. Compare this to ranluxcl_initialization
- * where ins needs only be unique for each NDRange, and *ranluxcltab points
- * to the base address of the table for the entire NDRange. Also note that
- * depending on what you are doing the ranluxcl_upload_seed and
- * ranluxcl_download_seed functions may not do what you want, so make sure
- * you know what you are doing!
- *
- * Outline of the body:
- *   1. x is advanced nine times with the xorshift step (shifts 13, 7, 17).
- *      Every three consecutive outputs (192 bits) are cut into eight 24-bit
- *      fields, each converted to float and divided by 2^24, giving the
- *      seeds s01..s08, s09..s16 and s17..s24.
- *   2. in24 and stepnr are set to 0. carry is set to 0.0f, or to
- *      RANLUXCL_TWOM24 when s24 came out as exactly 0.0f.
- *   3. Unless RANLUXCL_NO_WARMUP is defined, the state is run through
- *      16 * 2 passes of 24 ranluxcl_os calls.
- *   4. The finished state is stored to *ranluxcltab.
- *
- * Note that x == 0 is left unchanged by the xorshift step, so it yields
- * all-zero seeds; this is the special case the warm-up comment refers to.
- *
- * x           - 64-bit starting value for the xorshift sequence.
- * ranluxcltab - the single table entry that receives the new state.
- */
-void ranluxcl_init(ulong x, global ranluxcl_state_t *ranluxcltab)
-{
-	ranluxcl_state_t rst; // state is assembled here and stored at the end
-	#define RANLUXCL_POW2_24 16777216 // 2^24, scales a 24-bit field to [0, 1)
-	#define RANLUXCL_56 0x00FFFFFFFFFFFFFF // RANLUXCL_<n>: mask of the low n bits
-	#define RANLUXCL_48 0x0000FFFFFFFFFFFF
-	#define RANLUXCL_40 0x000000FFFFFFFFFF
-	#define RANLUXCL_32 0x00000000FFFFFFFF
-	#define RANLUXCL_24 0x0000000000FFFFFF
-	#define RANLUXCL_16 0x000000000000FFFF
-	#define RANLUXCL_8  0x00000000000000FF
-	ulong x1, x2, x3; // three consecutive xorshift outputs per group of 8 seeds
-	//Logical shifts used so that all 64 bits of output are used (24 bits
-	//per float), to be certain that all initial states are different.
-	x^=(x<<13);x^=(x>>7);x^=(x<<17);x1=x; // one xorshift step per line
-	x^=(x<<13);x^=(x>>7);x^=(x<<17);x2=x;
-	x^=(x<<13);x^=(x>>7);x^=(x<<17);x3=x;
-	rst.s01 = (float)  (x1 >> 40) // x1 bits 63..40
-		/ (float)RANLUXCL_POW2_24;
-	rst.s02 = (float) ((x1 & RANLUXCL_40) >> 16) // x1 bits 39..16
-		/ (float)RANLUXCL_POW2_24;
-	rst.s03 = (float)(((x1 & RANLUXCL_16) << 8) + (x2 >> 56)) // x1 bits 15..0, then x2 bits 63..56
-		/ (float)RANLUXCL_POW2_24;
-	rst.s04 = (float) ((x2 & RANLUXCL_56) >> 32) // x2 bits 55..32
-		/ (float)RANLUXCL_POW2_24;
-	rst.s05 = (float) ((x2 & RANLUXCL_32) >> 8) // x2 bits 31..8
-		/ (float)RANLUXCL_POW2_24;
-	rst.s06 = (float)(((x2 & RANLUXCL_8 ) << 16) + (x3 >> 48)) // x2 bits 7..0, then x3 bits 63..48
-		/ (float)RANLUXCL_POW2_24;
-	rst.s07 = (float) ((x3 & RANLUXCL_48) >> 24) // x3 bits 47..24
-		/ (float)RANLUXCL_POW2_24;
-	rst.s08 = (float)  (x3 & RANLUXCL_24) // x3 bits 23..0
-		/ (float)RANLUXCL_POW2_24;
-	x^=(x<<13);x^=(x>>7);x^=(x<<17);x1=x; // next 192 bits -> s09..s16, same bit layout
-	x^=(x<<13);x^=(x>>7);x^=(x<<17);x2=x;
-	x^=(x<<13);x^=(x>>7);x^=(x<<17);x3=x;
-	rst.s09 = (float)  (x1 >> 40)
-		/ (float)RANLUXCL_POW2_24;
-	rst.s10 = (float) ((x1 & RANLUXCL_40) >> 16)
-		/ (float)RANLUXCL_POW2_24;
-	rst.s11 = (float)(((x1 & RANLUXCL_16) << 8) + (x2 >> 56))
-		/ (float)RANLUXCL_POW2_24;
-	rst.s12 = (float) ((x2 & RANLUXCL_56) >> 32)
-		/ (float)RANLUXCL_POW2_24;
-	rst.s13 = (float) ((x2 & RANLUXCL_32) >> 8)
-		/ (float)RANLUXCL_POW2_24;
-	rst.s14 = (float)(((x2 & RANLUXCL_8 ) << 16) + (x3 >> 48))
-		/ (float)RANLUXCL_POW2_24;
-	rst.s15 = (float) ((x3 & RANLUXCL_48) >> 24)
-		/ (float)RANLUXCL_POW2_24;
-	rst.s16 = (float)  (x3 & RANLUXCL_24)
-		/ (float)RANLUXCL_POW2_24;
-	x^=(x<<13);x^=(x>>7);x^=(x<<17);x1=x; // last 192 bits -> s17..s24, same bit layout
-	x^=(x<<13);x^=(x>>7);x^=(x<<17);x2=x;
-	x^=(x<<13);x^=(x>>7);x^=(x<<17);x3=x;
-	rst.s17 = (float)  (x1 >> 40)
-		/ (float)RANLUXCL_POW2_24;
-	rst.s18 = (float) ((x1 & RANLUXCL_40) >> 16)
-		/ (float)RANLUXCL_POW2_24;
-	rst.s19 = (float)(((x1 & RANLUXCL_16) << 8) + (x2 >> 56))
-		/ (float)RANLUXCL_POW2_24;
-	rst.s20 = (float) ((x2 & RANLUXCL_56) >> 32)
-		/ (float)RANLUXCL_POW2_24;
-	rst.s21 = (float) ((x2 & RANLUXCL_32) >> 8)
-		/ (float)RANLUXCL_POW2_24;
-	rst.s22 = (float)(((x2 & RANLUXCL_8 ) << 16) + (x3 >> 48))
-		/ (float)RANLUXCL_POW2_24;
-	rst.s23 = (float) ((x3 & RANLUXCL_48) >> 24)
-		/ (float)RANLUXCL_POW2_24;
-	rst.s24 = (float)  (x3 & RANLUXCL_24)
-		/ (float)RANLUXCL_POW2_24;
-	#undef RANLUXCL_POW2_24
-	#undef RANLUXCL_56
-	#undef RANLUXCL_48
-	#undef RANLUXCL_40
-	#undef RANLUXCL_32
-	#undef RANLUXCL_24
-	#undef RANLUXCL_16
-	#undef RANLUXCL_8
-	rst.in24 = 0;
-	rst.stepnr = 0;
-	rst.carry = 0.0f;
-	if(rst.s24 == 0.0f) // a zero s24 starts with a carry of RANLUXCL_TWOM24 instead of 0
-		rst.carry = RANLUXCL_TWOM24;
-	#ifndef RANLUXCL_NO_WARMUP
-	//Warming up the generator, ensuring there are no initial correlations.
-	//16 is a "magic number". It is the number of times we must generate
-	//a batch of 24 numbers to ensure complete decorrelation, however it
-	//seems like it is necessary to double this for the special case when
-	//the generator is initialized to all zeros.
-	for(int i=0; i<16 * 2; i++){ // each pass updates s24 down to s01 once
-		ranluxcl_os(rst.s09, rst.s10, &(rst.s24), &(rst.carry));
-		ranluxcl_os(rst.s08, rst.s09, &(rst.s23), &(rst.carry));
-		ranluxcl_os(rst.s07, rst.s08, &(rst.s22), &(rst.carry));
-		ranluxcl_os(rst.s06, rst.s07, &(rst.s21), &(rst.carry));
-		ranluxcl_os(rst.s05, rst.s06, &(rst.s20), &(rst.carry));
-		ranluxcl_os(rst.s04, rst.s05, &(rst.s19), &(rst.carry));
-		ranluxcl_os(rst.s03, rst.s04, &(rst.s18), &(rst.carry));
-		ranluxcl_os(rst.s02, rst.s03, &(rst.s17), &(rst.carry));
-		ranluxcl_os(rst.s01, rst.s02, &(rst.s16), &(rst.carry));
-		ranluxcl_os(rst.s24, rst.s01, &(rst.s15), &(rst.carry));
-		ranluxcl_os(rst.s23, rst.s24, &(rst.s14), &(rst.carry));
-		ranluxcl_os(rst.s22, rst.s23, &(rst.s13), &(rst.carry));
-		ranluxcl_os(rst.s21, rst.s22, &(rst.s12), &(rst.carry));
-		ranluxcl_os(rst.s20, rst.s21, &(rst.s11), &(rst.carry));
-		ranluxcl_os(rst.s19, rst.s20, &(rst.s10), &(rst.carry));
-		ranluxcl_os(rst.s18, rst.s19, &(rst.s09), &(rst.carry));
-		ranluxcl_os(rst.s17, rst.s18, &(rst.s08), &(rst.carry));
-		ranluxcl_os(rst.s16, rst.s17, &(rst.s07), &(rst.carry));
-		ranluxcl_os(rst.s15, rst.s16, &(rst.s06), &(rst.carry));
-		ranluxcl_os(rst.s14, rst.s15, &(rst.s05), &(rst.carry));
-		ranluxcl_os(rst.s13, rst.s14, &(rst.s04), &(rst.carry));
-		ranluxcl_os(rst.s12, rst.s13, &(rst.s03), &(rst.carry));
-		ranluxcl_os(rst.s11, rst.s12, &(rst.s02), &(rst.carry));
-		ranluxcl_os(rst.s10, rst.s11, &(rst.s01), &(rst.carry));
-	}
-	#endif //RANLUXCL_NO_WARMUP
-	//Upload the state
-	*ranluxcltab = rst; // direct store; no RANLUXCL_MYID offset is applied here
-}
-void ranluxcl_init_legacy(uint ins, global ranluxcl_state_t *ranluxcltab)
-{	/*
-	 * Legacy seeding path. An integer js is derived from ins, the work-item
-	 * count (RANLUXCL_MAXWORKITEMS if defined, else RANLUXCL_NUMWORKITEMS)
-	 * and RANLUXCL_MYID. js is then advanced 24 times; after each advance
-	 * (js % 2^24) * RANLUXCL_TWOM24 is stored in the next seed s01..s24.
-	 * Each advance yields a value congruent to 40014*js modulo IC, written
-	 * with the quotient 53668 and remainder 12211 of IC / 40014
-	 * (40014*53668 + 12211 == 2147483563), presumably to keep the
-	 * intermediates inside the int range - TODO confirm.
-	 * Afterwards in24, stepnr and carry are reset, the state is warmed up
-	 * (unless RANLUXCL_NO_WARMUP is defined) and handed to
-	 * ranluxcl_upload_seed together with ranluxcltab.
-	 *
-	 * ins         - user seed; scaled by the work-item count below.
-	 * ranluxcltab - state table, passed unchanged to ranluxcl_upload_seed.
-	 */
-	//Using legacy initialization from original Fortran 77 implementation
-	//ins is scaled so that if the user makes another call somewhere else
-	//with ins + 1 there should be no overlap. Also adding one
-	//allows us to use ins = 0.
-	int k, maxWorkitems;
-	ranluxcl_state_t rst;
-	#ifdef RANLUXCL_MAXWORKITEMS
-	maxWorkitems = RANLUXCL_MAXWORKITEMS;
-	#else
-	maxWorkitems = RANLUXCL_NUMWORKITEMS;
-	#endif //RANLUXCL_MAXWORKITEMS
-	int scaledins = ins * maxWorkitems + 1; // NOTE(review): no overflow guard; confirm the supported range of ins
-	int js = scaledins + RANLUXCL_MYID; // per-work-item starting value
-	//Make sure js is not too small (should really be an error)
-	if(js < 1)
-		js = 1;
-	#define IC 2147483563 // modulus of the recurrence below
-	#define ITWO24 16777216 // 2^24
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; // advance js (see comment at top of body)
-		rst.s01=(js%ITWO24)*RANLUXCL_TWOM24; // js modulo 2^24, scaled by RANLUXCL_TWOM24
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
-		rst.s02=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
-		rst.s03=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
-		rst.s04=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
-		rst.s05=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
-		rst.s06=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
-		rst.s07=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
-		rst.s08=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
-		rst.s09=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
-		rst.s10=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
-		rst.s11=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
-		rst.s12=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
-		rst.s13=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
-		rst.s14=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
-		rst.s15=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
-		rst.s16=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
-		rst.s17=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
-		rst.s18=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
-		rst.s19=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
-		rst.s20=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
-		rst.s21=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
-		rst.s22=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
-		rst.s23=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
-		rst.s24=(js%ITWO24)*RANLUXCL_TWOM24;
-	#undef IC
-	#undef ITWO24
-	rst.in24 = 0;
-	rst.stepnr = 0;
-	rst.carry = 0.0f;
-	if(rst.s24 == 0.0f) // a zero s24 starts with a carry of RANLUXCL_TWOM24 instead of 0
-		rst.carry = RANLUXCL_TWOM24;
-	#ifndef RANLUXCL_NO_WARMUP
-	//Warming up the generator, ensuring there are no initial correlations.
-	//16 is a "magic number". It is the number of times we must generate
-	//a batch of 24 numbers to ensure complete decorrelation.
-	for(int i=0; i<16; i++){ // each pass updates s24 down to s01 once
-		ranluxcl_os(rst.s09, rst.s10, &(rst.s24), &(rst.carry));
-		ranluxcl_os(rst.s08, rst.s09, &(rst.s23), &(rst.carry));
-		ranluxcl_os(rst.s07, rst.s08, &(rst.s22), &(rst.carry));
-		ranluxcl_os(rst.s06, rst.s07, &(rst.s21), &(rst.carry));
-		ranluxcl_os(rst.s05, rst.s06, &(rst.s20), &(rst.carry));
-		ranluxcl_os(rst.s04, rst.s05, &(rst.s19), &(rst.carry));
-		ranluxcl_os(rst.s03, rst.s04, &(rst.s18), &(rst.carry));
-		ranluxcl_os(rst.s02, rst.s03, &(rst.s17), &(rst.carry));
-		ranluxcl_os(rst.s01, rst.s02, &(rst.s16), &(rst.carry));
-		ranluxcl_os(rst.s24, rst.s01, &(rst.s15), &(rst.carry));
-		ranluxcl_os(rst.s23, rst.s24, &(rst.s14), &(rst.carry));
-		ranluxcl_os(rst.s22, rst.s23, &(rst.s13), &(rst.carry));
-		ranluxcl_os(rst.s21, rst.s22, &(rst.s12), &(rst.carry));
-		ranluxcl_os(rst.s20, rst.s21, &(rst.s11), &(rst.carry));
-		ranluxcl_os(rst.s19, rst.s20, &(rst.s10), &(rst.carry));
-		ranluxcl_os(rst.s18, rst.s19, &(rst.s09), &(rst.carry));
-		ranluxcl_os(rst.s17, rst.s18, &(rst.s08), &(rst.carry));
-		ranluxcl_os(rst.s16, rst.s17, &(rst.s07), &(rst.carry));
-		ranluxcl_os(rst.s15, rst.s16, &(rst.s06), &(rst.carry));
-		ranluxcl_os(rst.s14, rst.s15, &(rst.s05), &(rst.carry));
-		ranluxcl_os(rst.s13, rst.s14, &(rst.s04), &(rst.carry));
-		ranluxcl_os(rst.s12, rst.s13, &(rst.s03), &(rst.carry));
-		ranluxcl_os(rst.s11, rst.s12, &(rst.s02), &(rst.carry));
-		ranluxcl_os(rst.s10, rst.s11, &(rst.s01), &(rst.carry));
-	}
-	#endif //RANLUXCL_NO_WARMUP
-	//Upload the state
-	ranluxcl_upload_seed(&rst, ranluxcltab);
-}
-void ranluxcl_initialization(uint ins, global ranluxcl_state_t *ranluxcltab)
-{
-	// Seeding entry point. The initialization routine is selected at compile
-	// time: the xorshift-based ranluxcl_init by default, or the legacy routine
-	// when RANLUXCL_USE_LEGACY_INITIALIZATION is defined.
-	#ifndef RANLUXCL_USE_LEGACY_INITIALIZATION
-	// ins is scaled by 2^32 and the work-item id is added on top. As long as
-	// we never use more than (2^32)-1 work-items per NDRange the initial
-	// states should never be the same.
-	const ulong ins_stride = (ulong)UINT_MAX + 1;
-	const ulong seed = (ulong)RANLUXCL_MYID + (ulong)ins * ins_stride;
-	// ranluxcl_init is handed the table entry belonging to this work-item.
-	ranluxcl_init(seed, ranluxcltab + RANLUXCL_MYID);
-	#else // RANLUXCL_USE_LEGACY_INITIALIZATION
-	// The legacy routine receives ins and the table pointer unchanged.
-	ranluxcl_init_legacy(ins, ranluxcltab);
-	#endif // RANLUXCL_USE_LEGACY_INITIALIZATION
-}
-float4 ranluxcl32norm(ranluxcl_state_t *rst)
-{
-	// Produces four normally distributed PRNs (centered on 0, standard
-	// deviation 1) from one ranluxcl32 draw. The x/y lanes of the draw give
-	// the first two results, the z/w lanes the last two: the first lane of
-	// each pair sets a radius, the second an angle.
-	// M_PI_F does not exist in OpenCL 1.0, hence the local constant.
-	#define RANLUXCL_PI_F 3.1415926535f
-	const float4 uniform = ranluxcl32(rst);
-	const float radius_xy = sqrt(-2 * log(uniform.x));
-	const float angle_xy = 2 * RANLUXCL_PI_F * uniform.y;
-	const float radius_zw = sqrt(-2 * log(uniform.z));
-	const float angle_zw = 2 * RANLUXCL_PI_F * uniform.w;
-	#undef RANLUXCL_PI_F
-	return (float4)(radius_xy * cos(angle_xy),
-	                radius_xy * sin(angle_xy),
-	                radius_zw * cos(angle_zw),
-	                radius_zw * sin(angle_zw));
-}
-#ifdef RANLUXCL_SUPPORT_DOUBLE
-double4 ranluxcl64(ranluxcl_state_t *rst)
-{
-	// Assembles four doubles from two ranluxcl32 draws. Each double is
-	// hi + lo / 2^24, where hi comes from the x or z lane of a draw and lo
-	// from the y or w lane next to it.
-	double4 result;
-	float4 draw;
-	// First draw -> result.x and result.y.
-	draw = ranluxcl32(rst);
-	// A hi lane equal to RANLUXCL_TWOM24 * RANLUXCL_TWOM24 is known to be
-	// caused by the never-zero part of the original algorithm. It is mapped
-	// to zero so that the most significant bits of the double precision
-	// result are allowed to be zero.
-	draw.x = (draw.x == RANLUXCL_TWOM24 * RANLUXCL_TWOM24) ? 0.0f : draw.x;
-	draw.z = (draw.z == RANLUXCL_TWOM24 * RANLUXCL_TWOM24) ? 0.0f : draw.z;
-	result.x = (double)(draw.x) + (double)(draw.y) / 16777216;
-	result.y = (double)(draw.z) + (double)(draw.w) / 16777216;
-	// Second draw -> result.z and result.w, handled the same way.
-	draw = ranluxcl32(rst);
-	draw.x = (draw.x == RANLUXCL_TWOM24 * RANLUXCL_TWOM24) ? 0.0f : draw.x;
-	draw.z = (draw.z == RANLUXCL_TWOM24 * RANLUXCL_TWOM24) ? 0.0f : draw.z;
-	result.z = (double)(draw.x) + (double)(draw.y) / 16777216;
-	result.w = (double)(draw.z) + (double)(draw.w) / 16777216;
-	return result;
-}
-double4 ranluxcl64norm(ranluxcl_state_t *rst)
-{
-	// Double precision counterpart of ranluxcl32norm: produces four normally
-	// distributed PRNs (centered on 0, standard deviation 1) from one
-	// ranluxcl64 draw. The x/y lanes give the first two results and the z/w
-	// lanes the last two; the first lane of each pair sets a radius, the
-	// second an angle.
-	const double4 uniform = ranluxcl64(rst);
-	const double radius_xy = sqrt(-2 * log(uniform.x));
-	const double angle_xy = 2 * M_PI * uniform.y;
-	const double radius_zw = sqrt(-2 * log(uniform.z));
-	const double angle_zw = 2 * M_PI * uniform.w;
-	return (double4)(radius_xy * cos(angle_xy),
-	                 radius_xy * sin(angle_xy),
-	                 radius_zw * cos(angle_zw),
-	                 radius_zw * sin(angle_zw));
-}
-#endif //RANLUXCL_SUPPORT_DOUBLE
-#undef RANLUXCL_TWOM24
-#undef RANLUXCL_TWOM12
-#undef RANLUXCL_NUMWORKITEMS
-#undef RANLUXCL_MYID
-#undef RANLUXCL_PLANAR
-#undef RANLUXCL_NOSKIP
-#endif //RANLUXCL_CL