pyopencl 2024.2.2__cp38-cp38-win_amd64.whl → 2024.2.4__cp38-cp38-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pyopencl might be problematic. Click here for more details.
- pyopencl/__init__.py +16 -4
- pyopencl/_cl.cp38-win_amd64.pyd +0 -0
- pyopencl/algorithm.py +3 -1
- pyopencl/bitonic_sort.py +2 -0
- pyopencl/characterize/__init__.py +23 -0
- pyopencl/compyte/.git +1 -0
- pyopencl/compyte/.github/workflows/autopush.yml +21 -0
- pyopencl/compyte/.github/workflows/ci.yml +30 -0
- pyopencl/compyte/.gitignore +21 -0
- pyopencl/compyte/ndarray/Makefile +31 -0
- pyopencl/compyte/ndarray/gpu_ndarray.h +35 -0
- pyopencl/compyte/ndarray/pygpu_language.h +207 -0
- pyopencl/compyte/ndarray/pygpu_language_cuda.cu +622 -0
- pyopencl/compyte/ndarray/pygpu_language_opencl.cpp +317 -0
- pyopencl/compyte/ndarray/pygpu_ndarray.cpp +1546 -0
- pyopencl/compyte/ndarray/pygpu_ndarray.h +71 -0
- pyopencl/compyte/ndarray/pygpu_ndarray_object.h +232 -0
- pyopencl/compyte/setup.cfg +9 -0
- pyopencl/tools.py +60 -56
- pyopencl/version.py +7 -3
- {pyopencl-2024.2.2.dist-info → pyopencl-2024.2.4.dist-info}/METADATA +105 -105
- pyopencl-2024.2.4.dist-info/RECORD +59 -0
- {pyopencl-2024.2.2.dist-info → pyopencl-2024.2.4.dist-info}/WHEEL +1 -1
- pyopencl-2024.2.2.data/data/CITATION.cff +0 -74
- pyopencl-2024.2.2.data/data/CMakeLists.txt +0 -83
- pyopencl-2024.2.2.data/data/Makefile.in +0 -21
- pyopencl-2024.2.2.data/data/README.rst +0 -70
- pyopencl-2024.2.2.data/data/README_SETUP.txt +0 -34
- pyopencl-2024.2.2.data/data/aksetup_helper.py +0 -1013
- pyopencl-2024.2.2.data/data/configure.py +0 -6
- pyopencl-2024.2.2.data/data/contrib/cldis.py +0 -91
- pyopencl-2024.2.2.data/data/contrib/fortran-to-opencl/README +0 -29
- pyopencl-2024.2.2.data/data/contrib/fortran-to-opencl/translate.py +0 -1441
- pyopencl-2024.2.2.data/data/contrib/pyopencl.vim +0 -84
- pyopencl-2024.2.2.data/data/doc/Makefile +0 -23
- pyopencl-2024.2.2.data/data/doc/algorithm.rst +0 -214
- pyopencl-2024.2.2.data/data/doc/array.rst +0 -305
- pyopencl-2024.2.2.data/data/doc/conf.py +0 -26
- pyopencl-2024.2.2.data/data/doc/howto.rst +0 -105
- pyopencl-2024.2.2.data/data/doc/index.rst +0 -137
- pyopencl-2024.2.2.data/data/doc/make_constants.py +0 -561
- pyopencl-2024.2.2.data/data/doc/misc.rst +0 -885
- pyopencl-2024.2.2.data/data/doc/runtime.rst +0 -51
- pyopencl-2024.2.2.data/data/doc/runtime_const.rst +0 -30
- pyopencl-2024.2.2.data/data/doc/runtime_gl.rst +0 -78
- pyopencl-2024.2.2.data/data/doc/runtime_memory.rst +0 -527
- pyopencl-2024.2.2.data/data/doc/runtime_platform.rst +0 -184
- pyopencl-2024.2.2.data/data/doc/runtime_program.rst +0 -364
- pyopencl-2024.2.2.data/data/doc/runtime_queue.rst +0 -182
- pyopencl-2024.2.2.data/data/doc/subst.rst +0 -36
- pyopencl-2024.2.2.data/data/doc/tools.rst +0 -4
- pyopencl-2024.2.2.data/data/doc/types.rst +0 -42
- pyopencl-2024.2.2.data/data/examples/black-hole-accretion.py +0 -2227
- pyopencl-2024.2.2.data/data/examples/demo-struct-reduce.py +0 -75
- pyopencl-2024.2.2.data/data/examples/demo.py +0 -39
- pyopencl-2024.2.2.data/data/examples/demo_array.py +0 -32
- pyopencl-2024.2.2.data/data/examples/demo_array_svm.py +0 -37
- pyopencl-2024.2.2.data/data/examples/demo_elementwise.py +0 -34
- pyopencl-2024.2.2.data/data/examples/demo_elementwise_complex.py +0 -53
- pyopencl-2024.2.2.data/data/examples/demo_mandelbrot.py +0 -183
- pyopencl-2024.2.2.data/data/examples/demo_meta_codepy.py +0 -56
- pyopencl-2024.2.2.data/data/examples/demo_meta_template.py +0 -55
- pyopencl-2024.2.2.data/data/examples/dump-performance.py +0 -38
- pyopencl-2024.2.2.data/data/examples/dump-properties.py +0 -86
- pyopencl-2024.2.2.data/data/examples/gl_interop_demo.py +0 -84
- pyopencl-2024.2.2.data/data/examples/gl_particle_animation.py +0 -218
- pyopencl-2024.2.2.data/data/examples/ipython-demo.ipynb +0 -203
- pyopencl-2024.2.2.data/data/examples/median-filter.py +0 -99
- pyopencl-2024.2.2.data/data/examples/n-body.py +0 -1070
- pyopencl-2024.2.2.data/data/examples/narray.py +0 -37
- pyopencl-2024.2.2.data/data/examples/noisyImage.jpg +0 -0
- pyopencl-2024.2.2.data/data/examples/pi-monte-carlo.py +0 -1166
- pyopencl-2024.2.2.data/data/examples/svm.py +0 -82
- pyopencl-2024.2.2.data/data/examples/transpose.py +0 -229
- pyopencl-2024.2.2.data/data/pytest.ini +0 -3
- pyopencl-2024.2.2.data/data/src/bitlog.cpp +0 -51
- pyopencl-2024.2.2.data/data/src/bitlog.hpp +0 -83
- pyopencl-2024.2.2.data/data/src/clinfo_ext.h +0 -134
- pyopencl-2024.2.2.data/data/src/mempool.hpp +0 -444
- pyopencl-2024.2.2.data/data/src/pyopencl_ext.h +0 -77
- pyopencl-2024.2.2.data/data/src/tools.hpp +0 -90
- pyopencl-2024.2.2.data/data/src/wrap_cl.cpp +0 -61
- pyopencl-2024.2.2.data/data/src/wrap_cl.hpp +0 -5853
- pyopencl-2024.2.2.data/data/src/wrap_cl_part_1.cpp +0 -369
- pyopencl-2024.2.2.data/data/src/wrap_cl_part_2.cpp +0 -702
- pyopencl-2024.2.2.data/data/src/wrap_constants.cpp +0 -1274
- pyopencl-2024.2.2.data/data/src/wrap_helpers.hpp +0 -213
- pyopencl-2024.2.2.data/data/src/wrap_mempool.cpp +0 -738
- pyopencl-2024.2.2.data/data/test/add-vectors-32.spv +0 -0
- pyopencl-2024.2.2.data/data/test/add-vectors-64.spv +0 -0
- pyopencl-2024.2.2.data/data/test/empty-header.h +0 -1
- pyopencl-2024.2.2.data/data/test/test_algorithm.py +0 -1180
- pyopencl-2024.2.2.data/data/test/test_array.py +0 -2392
- pyopencl-2024.2.2.data/data/test/test_arrays_in_structs.py +0 -100
- pyopencl-2024.2.2.data/data/test/test_clmath.py +0 -529
- pyopencl-2024.2.2.data/data/test/test_clrandom.py +0 -75
- pyopencl-2024.2.2.data/data/test/test_enqueue_copy.py +0 -271
- pyopencl-2024.2.2.data/data/test/test_wrapper.py +0 -1565
- pyopencl-2024.2.2.dist-info/LICENSE +0 -282
- pyopencl-2024.2.2.dist-info/RECORD +0 -123
- pyopencl-2024.2.2.dist-info/top_level.txt +0 -1
- {pyopencl-2024.2.2.data/data → pyopencl-2024.2.4.dist-info/licenses}/LICENSE +0 -0
|
@@ -1,1166 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
|
-
|
|
3
|
-
#
|
|
4
|
-
# Pi-by-MonteCarlo using PyCUDA/PyOpenCL
|
|
5
|
-
#
|
|
6
|
-
# performs an estimation of Pi using Monte Carlo method
|
|
7
|
-
# a large amount of iterations is divided and distributed to compute units
|
|
8
|
-
# a lot of options are provided to perform scalabilty tests
|
|
9
|
-
#
|
|
10
|
-
# use -h for complete set of options
|
|
11
|
-
#
|
|
12
|
-
# CC BY-NC-SA 2011 : Emmanuel QUEMENER <emmanuel.quemener@gmail.com>
|
|
13
|
-
# Cecill v2 : Emmanuel QUEMENER <emmanuel.quemener@gmail.com>
|
|
14
|
-
#
|
|
15
|
-
|
|
16
|
-
# Thanks to Andreas Klockner for PyCUDA:
|
|
17
|
-
# http://mathema.tician.de/software/pycuda
|
|
18
|
-
# Thanks to Andreas Klockner for PyOpenCL:
|
|
19
|
-
# http://mathema.tician.de/software/pyopencl
|
|
20
|
-
#
|
|
21
|
-
|
|
22
|
-
# 2013-01-01 : problems with launch timeout
|
|
23
|
-
# http://stackoverflow.com/questions/497685/how-do-you-get-around-the-maximum-cuda-run-time
|
|
24
|
-
# Option "Interactive" "0" in /etc/X11/xorg.conf
|
|
25
|
-
|
|
26
|
-
import getopt
|
|
27
|
-
import itertools
|
|
28
|
-
import sys
|
|
29
|
-
import time
|
|
30
|
-
from socket import gethostname
|
|
31
|
-
|
|
32
|
-
# Common tools
|
|
33
|
-
import numpy
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
class PenStacle:
    """Five-figure summary ("pentacle") of a dataset.

    Holds average, median, standard deviation, minimum and maximum of the
    values handed to the constructor (computed with numpy).
    """

    Avg = 0
    Med = 0
    Std = 0
    Min = 0
    Max = 0

    def __init__(self, Data):
        # Compute every statistic once, up front.
        self.Min = numpy.min(Data)
        self.Max = numpy.max(Data)
        self.Std = numpy.std(Data)
        self.Med = numpy.median(Data)
        self.Avg = numpy.average(Data)

    def display(self):
        """Print the five statistics on one space-separated line."""
        stats = (self.Avg, self.Med, self.Std, self.Min, self.Max)
        print("%s %s %s %s %s" % stats)
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
class Experience:
    """Metrology record for one benchmark experiment on one device.

    Stores the device identification and the iteration count, and can turn a
    series of measured durations into duration/rate statistics.
    """

    DeviceStyle = ""
    DeviceId = 0
    AvgD = 0
    MedD = 0
    StdD = 0
    MinD = 0
    MaxD = 0
    AvgR = 0
    MedR = 0
    StdR = 0
    MinR = 0
    MaxR = 0

    def __init__(self, DeviceStyle, DeviceId, Iterations):
        self.DeviceStyle = DeviceStyle
        self.DeviceId = DeviceId
        # BUG FIX: the original read `self.Iterations` as a bare expression,
        # which discarded the constructor argument; store it so Metrology
        # can actually use it.
        self.Iterations = Iterations

    def Metrology(self, Data):
        """Print duration and rate statistics for the measured `Data` durations."""
        Duration = PenStacle(Data)
        # BUG FIX: the original referenced a module-global `Iterations`
        # (NameError unless the script's __main__ happened to define it);
        # use the per-instance value instead.
        Rate = PenStacle(self.Iterations / Data)
        print("Duration %s" % Duration)
        print("Rate %s" % Rate)
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
def DictionariesAPI():
    """Return the lookup tables mapping option strings to kernel #define codes.

    Returns a (Marsaglia, Computing, Test) tuple of dicts used to build the
    -DTRNG / -DTYPE / -DTEST compiler options for the GPU kernels.
    """
    rng_codes = {"CONG": 0, "SHR3": 1, "MWC": 2, "KISS": 3}
    type_codes = {"INT32": 0, "INT64": 1, "FP32": 2, "FP64": 3}
    branch_codes = {True: 1, False: 0}
    return (rng_codes, type_codes, branch_codes)
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
# find prime factors of a number
|
|
92
|
-
# Get for WWW :
|
|
93
|
-
# http://pythonism.wordpress.com/2008/05/17/looking-at-factorisation-in-python/
|
|
94
|
-
def PrimeFactors(x):
    """Return the prime factorisation of x as a numpy array (ascending order).

    Trial division from the web-sourced recipe the original cites; for x <= 1
    the returned array is empty.
    """
    factorlist = numpy.array([]).astype("uint32")
    loop = 2
    while loop <= x:
        if x % loop == 0:
            # BUG FIX: the original used `x /= loop` (true division), which
            # turns x into a float in Python 3; for large inputs the float
            # representation drifts and `x % loop` becomes unreliable.
            # Floor division keeps x an exact integer.
            x //= loop
            factorlist = numpy.append(factorlist, [loop])
        else:
            loop += 1
    return factorlist
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
# Try to find the best thread number in Hybrid approach (Blocks&Threads)
|
|
108
|
-
# output is thread number
|
|
109
|
-
def BestThreadsNumber(jobs):
    """Heuristic thread count for the hybrid (blocks & threads) launch.

    Multiplies prime factors of `jobs`, alternating between the smallest and
    largest remaining ones, and stops once threads*threads exceeds jobs or
    threads passes 512.  Returns at least 1.
    """
    factors = PrimeFactors(jobs)
    # Interleave factors with their reverse, i.e. smallest, largest,
    # second-smallest, second-largest, ... — same order the original
    # obtained via a 2-row matrix transpose.
    interleaved = itertools.chain.from_iterable(zip(factors, factors[::-1]))
    threads = 1
    for factor in interleaved:
        threads *= factor
        if threads * threads > jobs or threads > 512:
            break
    return int(threads)
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
# Predicted Amdahl Law (Reduced with s=1-p)
|
|
121
|
-
def AmdahlR(N, T1, p):
    """Reduced Amdahl law: predicted time on N units with serial part 1 - p."""
    serial_fraction = 1 - p
    return T1 * (serial_fraction + p / N)
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
# Predicted Amdahl Law
|
|
126
|
-
def Amdahl(N, T1, s, p):
    """Amdahl law: predicted time on N units with serial s and parallel p fractions."""
    return (s + p / N) * T1
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
# Predicted Mylq Law with first order
|
|
131
|
-
def Mylq(N, T1, s, c, p):
    """First-order Mylq law: Amdahl term plus a linear overhead c*N."""
    amdahl_term = T1 * (s + p / N)
    return amdahl_term + c * N
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
# Predicted Mylq Law with second order
|
|
136
|
-
def Mylq2(N, T1, s, c1, c2, p):
    """Second-order Mylq law: Amdahl term plus linear and quadratic overheads."""
    overhead = c1 * N + c2 * N * N
    return T1 * (s + p / N) + overhead
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
def KernelCodeCuda():
    """Return the CUDA C source for the Monte Carlo Pi kernels.

    The source expects the caller to inject, via compiler options,
    -DTRNG=<0..3> (RNG choice: CONG/SHR3/MWC/KISS), -DTYPE=<0..3>
    (arithmetic: INT32/INT64/FP32/FP64) and optionally -DTEST (1 selects the
    if/then counting branch; anything else selects the ternary branch).
    Three entry points are provided: MainLoopBlocks (one job per block),
    MainLoopThreads (one job per thread) and MainLoopHybrid (grid*block).

    NOTE(review): MainLoop declares (seed_w, seed_z) but the three __global__
    wrappers pass (seed_z, seed_w) — seeds appear swapped relative to the
    signature; harmless for RNG quality but worth confirming against the
    OpenCL variant, whose MainLoop declares (seed_z, seed_w).
    """
    KERNEL_CODE_CUDA = """
#define TCONG 0
#define TSHR3 1
#define TMWC 2
#define TKISS 3

#define TINT32 0
#define TINT64 1
#define TFP32 2
#define TFP64 3

#define IFTHEN 1

// Marsaglia RNG very simple implementation

#define znew ((z=36969*(z&65535)+(z>>16))<<16)
#define wnew ((w=18000*(w&65535)+(w>>16))&65535)
#define MWC (znew+wnew)
#define SHR3 (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
#define CONG (jcong=69069*jcong+1234567)
#define KISS ((MWC^CONG)+SHR3)

#define MWCfp MWC * 2.328306435454494e-10f
#define KISSfp KISS * 2.328306435454494e-10f
#define SHR3fp SHR3 * 2.328306435454494e-10f
#define CONGfp CONG * 2.328306435454494e-10f

__device__ ulong MainLoop(ulong iterations,uint seed_w,uint seed_z,size_t work)
{

#if TRNG == TCONG
    uint jcong=seed_z+work;
#elif TRNG == TSHR3
    uint jsr=seed_w+work;
#elif TRNG == TMWC
    uint z=seed_z+work;
    uint w=seed_w+work;
#elif TRNG == TKISS
    uint jcong=seed_z+work;
    uint jsr=seed_w+work;
    uint z=seed_z-work;
    uint w=seed_w-work;
#endif

    ulong total=0;

    for (ulong i=0;i<iterations;i++) {

#if TYPE == TINT32
#define THEONE 1073741824
#if TRNG == TCONG
        uint x=CONG>>17 ;
        uint y=CONG>>17 ;
#elif TRNG == TSHR3
        uint x=SHR3>>17 ;
        uint y=SHR3>>17 ;
#elif TRNG == TMWC
        uint x=MWC>>17 ;
        uint y=MWC>>17 ;
#elif TRNG == TKISS
        uint x=KISS>>17 ;
        uint y=KISS>>17 ;
#endif
#elif TYPE == TINT64
#define THEONE 4611686018427387904
#if TRNG == TCONG
        ulong x=(ulong)(CONG>>1) ;
        ulong y=(ulong)(CONG>>1) ;
#elif TRNG == TSHR3
        ulong x=(ulong)(SHR3>>1) ;
        ulong y=(ulong)(SHR3>>1) ;
#elif TRNG == TMWC
        ulong x=(ulong)(MWC>>1) ;
        ulong y=(ulong)(MWC>>1) ;
#elif TRNG == TKISS
        ulong x=(ulong)(KISS>>1) ;
        ulong y=(ulong)(KISS>>1) ;
#endif
#elif TYPE == TFP32
#define THEONE 1.0f
#if TRNG == TCONG
        float x=CONGfp ;
        float y=CONGfp ;
#elif TRNG == TSHR3
        float x=SHR3fp ;
        float y=SHR3fp ;
#elif TRNG == TMWC
        float x=MWCfp ;
        float y=MWCfp ;
#elif TRNG == TKISS
        float x=KISSfp ;
        float y=KISSfp ;
#endif
#elif TYPE == TFP64
#define THEONE 1.0f
#if TRNG == TCONG
        double x=(double)CONGfp ;
        double y=(double)CONGfp ;
#elif TRNG == TSHR3
        double x=(double)SHR3fp ;
        double y=(double)SHR3fp ;
#elif TRNG == TMWC
        double x=(double)MWCfp ;
        double y=(double)MWCfp ;
#elif TRNG == TKISS
        double x=(double)KISSfp ;
        double y=(double)KISSfp ;
#endif
#endif

#if TEST == IFTHEN
        if ((x*x+y*y) <=THEONE) {
            total+=1;
        }
#else
        ulong inside=((x*x+y*y) <= THEONE) ? 1:0;
        total+=inside;
#endif
    }

    return(total);
}

__global__ void MainLoopBlocks(ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
    ulong total=MainLoop(iterations,seed_z,seed_w,blockIdx.x);
    s[blockIdx.x]=total;
    __syncthreads();

}

__global__ void MainLoopThreads(ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
    ulong total=MainLoop(iterations,seed_z,seed_w,threadIdx.x);
    s[threadIdx.x]=total;
    __syncthreads();

}

__global__ void MainLoopHybrid(ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
    ulong total=MainLoop(iterations,seed_z,seed_w,blockDim.x*blockIdx.x+threadIdx.x);
    s[blockDim.x*blockIdx.x+threadIdx.x]=total;
    __syncthreads();
}

"""
    return KERNEL_CODE_CUDA
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
def KernelCodeOpenCL():
    """Return the OpenCL C source for the Monte Carlo Pi kernels.

    Same RNG/type machinery as the CUDA variant: the caller injects
    -DTRNG, -DTYPE and -DTEST through the build options.  Entry points:
    MainLoopGlobal (one job per global work-item), MainLoopLocal (one job
    per local work-item) and MainLoopHybrid (global id over the full NDRange).
    The TFP64 branch enables the cl_khr_fp64 extension.

    NOTE(review): here MainLoop declares (seed_z, seed_w) — the opposite
    order of the CUDA variant — while the kernels also pass (seed_z, seed_w);
    confirm which ordering is intended before unifying the two sources.
    """
    KERNEL_CODE_OPENCL = """
#define TCONG 0
#define TSHR3 1
#define TMWC 2
#define TKISS 3

#define TINT32 0
#define TINT64 1
#define TFP32 2
#define TFP64 3

#define IFTHEN 1

// Marsaglia RNG very simple implementation
#define znew ((z=36969*(z&65535)+(z>>16))<<16)
#define wnew ((w=18000*(w&65535)+(w>>16))&65535)

#define MWC (znew+wnew)
#define SHR3 (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
#define CONG (jcong=69069*jcong+1234567)
#define KISS ((MWC^CONG)+SHR3)

#define MWCfp MWC * 2.328306435454494e-10f
#define KISSfp KISS * 2.328306435454494e-10f
#define CONGfp CONG * 2.328306435454494e-10f
#define SHR3fp SHR3 * 2.328306435454494e-10f

ulong MainLoop(ulong iterations,uint seed_z,uint seed_w,size_t work)
{

#if TRNG == TCONG
    uint jcong=seed_z+work;
#elif TRNG == TSHR3
    uint jsr=seed_w+work;
#elif TRNG == TMWC
    uint z=seed_z+work;
    uint w=seed_w+work;
#elif TRNG == TKISS
    uint jcong=seed_z+work;
    uint jsr=seed_w+work;
    uint z=seed_z-work;
    uint w=seed_w-work;
#endif

    ulong total=0;

    for (ulong i=0;i<iterations;i++) {

#if TYPE == TINT32
#define THEONE 1073741824
#if TRNG == TCONG
        uint x=CONG>>17 ;
        uint y=CONG>>17 ;
#elif TRNG == TSHR3
        uint x=SHR3>>17 ;
        uint y=SHR3>>17 ;
#elif TRNG == TMWC
        uint x=MWC>>17 ;
        uint y=MWC>>17 ;
#elif TRNG == TKISS
        uint x=KISS>>17 ;
        uint y=KISS>>17 ;
#endif
#elif TYPE == TINT64
#define THEONE 4611686018427387904
#if TRNG == TCONG
        ulong x=(ulong)(CONG>>1) ;
        ulong y=(ulong)(CONG>>1) ;
#elif TRNG == TSHR3
        ulong x=(ulong)(SHR3>>1) ;
        ulong y=(ulong)(SHR3>>1) ;
#elif TRNG == TMWC
        ulong x=(ulong)(MWC>>1) ;
        ulong y=(ulong)(MWC>>1) ;
#elif TRNG == TKISS
        ulong x=(ulong)(KISS>>1) ;
        ulong y=(ulong)(KISS>>1) ;
#endif
#elif TYPE == TFP32
#define THEONE 1.0f
#if TRNG == TCONG
        float x=CONGfp ;
        float y=CONGfp ;
#elif TRNG == TSHR3
        float x=SHR3fp ;
        float y=SHR3fp ;
#elif TRNG == TMWC
        float x=MWCfp ;
        float y=MWCfp ;
#elif TRNG == TKISS
        float x=KISSfp ;
        float y=KISSfp ;
#endif
#elif TYPE == TFP64
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#define THEONE 1.0f
#if TRNG == TCONG
        double x=(double)CONGfp ;
        double y=(double)CONGfp ;
#elif TRNG == TSHR3
        double x=(double)SHR3fp ;
        double y=(double)SHR3fp ;
#elif TRNG == TMWC
        double x=(double)MWCfp ;
        double y=(double)MWCfp ;
#elif TRNG == TKISS
        double x=(double)KISSfp ;
        double y=(double)KISSfp ;
#endif
#endif

#if TEST == IFTHEN
        if ((x*x+y*y) <= THEONE) {
            total+=1;
        }
#else
        ulong inside=((x*x+y*y) <= THEONE) ? 1:0;
        total+=inside;
#endif
    }

    return(total);
}

__kernel void MainLoopGlobal(
__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
    ulong total=MainLoop(iterations,seed_z,seed_w,get_global_id(0));
    barrier(CLK_GLOBAL_MEM_FENCE);
    s[get_global_id(0)]=total;
}

__kernel void MainLoopLocal(
__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
    ulong total=MainLoop(iterations,seed_z,seed_w,get_local_id(0));
    barrier(CLK_LOCAL_MEM_FENCE);
    s[get_local_id(0)]=total;
}

__kernel void MainLoopHybrid(
__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
    ulong total=MainLoop(iterations,seed_z,seed_w,get_global_id(0));
    barrier(CLK_GLOBAL_MEM_FENCE || CLK_LOCAL_MEM_FENCE);
    s[get_global_id(0)]=total;
}

"""
    return KERNEL_CODE_OPENCL
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
def MetropolisCuda(InputCU):
    """Run the Monte Carlo Pi estimation on a CUDA device via PyCUDA.

    InputCU is a dict with keys "Iterations", "Steps", "Blocks", "Threads",
    "Device" (ordinal of the CUDA device to use), "RNG", "ValueType" and
    "Seeds".  Returns a dict with the hit count "Inside", the actually
    performed "NewIterations" (iterations rounded up to a multiple of
    blocks*threads) and the per-step "Duration" array.
    """

    print("Inside ", InputCU)

    # Unpack the experiment parameters.
    iterations = InputCU["Iterations"]
    steps = InputCU["Steps"]
    blocks = InputCU["Blocks"]
    threads = InputCU["Threads"]
    Device = InputCU["Device"]
    RNG = InputCU["RNG"]
    ValueType = InputCU["ValueType"]
    Seeds = InputCU["Seeds"]

    # Option-string -> #define code tables shared with the kernel sources.
    Marsaglia, Computing, Test = DictionariesAPI()

    try:
        # For PyCUDA import
        import pycuda.driver as cuda
        from pycuda.compiler import SourceModule

        cuda.init()
        for Id in range(cuda.Device.count()):
            if Id == Device:
                XPU = cuda.Device(Id)
                print("GPU selected %s" % XPU.name())
        # NOTE(review): bare `print` is a no-op in Python 3 (Python 2 leftover).
        print

    except ImportError:
        # NOTE(review): best-effort — if the import fails, `cuda` and `XPU`
        # are undefined and the statements below raise NameError.
        print("Platform does not seem to support CUDA")

    # One output slot per job; InOut copies the buffer both ways per launch.
    circle = numpy.zeros(blocks * threads).astype(numpy.uint64)
    circleCU = cuda.InOut(circle)
    # circleCU = cuda.mem_alloc(circle.size*circle.dtype.itemize)
    # cuda.memcpy_htod(circleCU, circle)

    Context = XPU.make_context()

    try:
        # Compile the kernel with RNG and value-type selected via #defines.
        mod = SourceModule(
            KernelCodeCuda(),
            options=[
                "--compiler-options",
                "-DTRNG=%i -DTYPE=%s" % (Marsaglia[RNG], Computing[ValueType]),
            ],
        )
        # mod = SourceModule(KernelCodeCuda(),nvcc='nvcc',keep=True)
        # Needed to set the compiler via ccbin for CUDA9 implementation
        # mod = SourceModule(KernelCodeCuda(),options=['-ccbin','clang-3.9','--compiler-options','-DTRNG=%i' % Marsaglia[RNG],'-DTYPE=%s' % Computing[ValueType],'-DTEST=%s' % Test[TestType]],keep=True) # noqa: E501
    except Exception:
        # NOTE(review): `mod` stays undefined here; the get_function calls
        # below would then raise NameError.
        print("Compilation seems to break")

    MetropolisBlocksCU = mod.get_function("MainLoopBlocks")  # noqa: F841
    MetropolisThreadsCU = mod.get_function("MainLoopThreads")  # noqa: F841
    MetropolisHybridCU = mod.get_function("MainLoopHybrid")

    MyDuration = numpy.zeros(steps)

    jobs = blocks * threads

    # Round iterations per job up so every requested sample is covered.
    iterationsCU = numpy.uint64(iterations / jobs)
    if iterations % jobs != 0:
        iterationsCU += numpy.uint64(1)

    for i in range(steps):
        start_time = time.time()

        try:
            MetropolisHybridCU(
                circleCU,
                numpy.uint64(iterationsCU),
                numpy.uint32(Seeds[0]),
                numpy.uint32(Seeds[1]),
                grid=(blocks, 1),
                block=(threads, 1, 1),
            )
        except Exception:
            print("Crash during CUDA call")

        elapsed = time.time() - start_time
        print(
            "(Blocks/Threads)=(%i,%i) method done in %.2f s..."
            % (blocks, threads, elapsed)
        )

        MyDuration[i] = elapsed

    OutputCU = {
        "Inside": sum(circle),
        "NewIterations": numpy.uint64(iterationsCU * jobs),
        "Duration": MyDuration,
    }
    print(OutputCU)
    Context.pop()

    Context.detach()

    return OutputCU
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
def MetropolisOpenCL(InputCL):
    """Run the Monte Carlo Pi estimation on an OpenCL device via PyOpenCL.

    InputCL is a dict with keys "Iterations", "Steps", "Blocks", "Threads",
    "Device" (flat ordinal across all platforms/devices), "RNG", "ValueType",
    "IfThen" and "Seeds".  Returns a dict with the hit count "Inside", the
    actually performed "NewIterations" (rounded up to a multiple of
    blocks*threads) and the per-step "Duration" array.
    """

    import pyopencl as cl

    # Unpack the experiment parameters.
    iterations = InputCL["Iterations"]
    steps = InputCL["Steps"]
    blocks = InputCL["Blocks"]
    threads = InputCL["Threads"]
    Device = InputCL["Device"]
    RNG = InputCL["RNG"]
    ValueType = InputCL["ValueType"]
    TestType = InputCL["IfThen"]
    Seeds = InputCL["Seeds"]

    Marsaglia, Computing, Test = DictionariesAPI()

    # Initialise the variables with the proper casts.
    Id = 0
    HasXPU = False
    # Scan every device of every platform; the requested ordinal counts
    # across all of them.
    for platform in cl.get_platforms():
        for device in platform.get_devices():
            if Id == Device:
                XPU = device
                print("CPU/GPU selected: ", device.name.lstrip())
                HasXPU = True
            Id += 1
            # print(Id)

    if not HasXPU:
        print("No XPU #%i found in all of %i devices, sorry..." % (Device, Id - 1))
        sys.exit()

    # Create the context and the command queue for execution.
    try:
        ctx = cl.Context(devices=[XPU])
        queue = cl.CommandQueue(
            ctx, properties=cl.command_queue_properties.PROFILING_ENABLE
        )
    except Exception:
        # NOTE(review): best-effort — `ctx`/`queue` stay undefined on failure
        # and the buffer creation below raises NameError.
        print("Crash during context creation")

    # Grab the available memory flags for the buffers.
    mf = cl.mem_flags

    # One output slot per job, copied back after each launch.
    circle = numpy.zeros(blocks * threads).astype(numpy.uint64)
    circleCL = cl.Buffer(ctx, mf.WRITE_ONLY | mf.COPY_HOST_PTR, hostbuf=circle)

    # Build the kernel with RNG/type/test selected through #defines.
    MetropolisCL = cl.Program(ctx, KernelCodeOpenCL()).build(
        options="-cl-mad-enable -cl-fast-relaxed-math -DTRNG=%i -DTYPE=%s -DTEST=%s"
        % (Marsaglia[RNG], Computing[ValueType], Test[TestType])
    )

    MyDuration = numpy.zeros(steps)

    jobs = blocks * threads

    # Round iterations per job up so every requested sample is covered.
    iterationsCL = numpy.uint64(iterations / jobs)
    if iterations % jobs != 0:
        iterationsCL += 1

    for i in range(steps):
        start_time = time.time()
        if threads == 1:
            # No local work-group size: let the runtime pick it.
            CLLaunch = MetropolisCL.MainLoopGlobal(
                queue,
                (blocks,),
                None,
                circleCL,
                numpy.uint64(iterationsCL),
                numpy.uint32(Seeds[0]),
                numpy.uint32(Seeds[1]),
            )
        else:
            CLLaunch = MetropolisCL.MainLoopHybrid(
                queue,
                (jobs,),
                (threads,),
                circleCL,
                numpy.uint64(iterationsCL),
                numpy.uint32(Seeds[0]),
                numpy.uint32(Seeds[1]),
            )

        CLLaunch.wait()
        cl.enqueue_copy(queue, circle, circleCL).wait()

        elapsed = time.time() - start_time
        print(
            "(Blocks/Threads)=(%i,%i) method done in %.2f s..."
            % (blocks, threads, elapsed)
        )

        # Elapsed method based on CLLaunch doesn't work for Beignet OpenCL
        # elapsed = 1e-9*(CLLaunch.profile.end - CLLaunch.profile.start)

        # print circle,numpy.mean(circle),numpy.median(circle),numpy.std(circle)
        MyDuration[i] = elapsed
        # AllPi=4./numpy.float32(iterationsCL)*circle.astype(numpy.float32)
        # MyPi[i]=numpy.median(AllPi)
        # print MyPi[i],numpy.std(AllPi),MyDuration[i]

    circleCL.release()

    OutputCL = {
        "Inside": sum(circle),
        "NewIterations": numpy.uint64(iterationsCL * jobs),
        "Duration": MyDuration,
    }
    # print(OutputCL)
    return OutputCL
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
def FitAndPrint(N, D, Curves):
    """Fit the measured durations D over job counts N against scaling laws.

    Tries, each in its own best-effort block, the Amdahl, reduced Amdahl,
    Mylq (1st order) and Mylq2 (2nd order) models via scipy's curve_fit,
    printing the coefficients normalised so the T1 term equals D[0].
    If `Curves` is true, also plots the measurements and the Amdahl/Mylq
    fit curves with matplotlib.
    """

    import matplotlib.pyplot as plt
    from scipy.optimize import curve_fit

    try:
        coeffs_Amdahl, matcov_Amdahl = curve_fit(Amdahl, N, D)

        # Keep the fitted curve before normalising the coefficients:
        # it is reused by the plotting section below.
        D_Amdahl = Amdahl(N, coeffs_Amdahl[0], coeffs_Amdahl[1], coeffs_Amdahl[2])
        coeffs_Amdahl[1] = coeffs_Amdahl[1] * coeffs_Amdahl[0] / D[0]
        coeffs_Amdahl[2] = coeffs_Amdahl[2] * coeffs_Amdahl[0] / D[0]
        coeffs_Amdahl[0] = D[0]
        print(
            "Amdahl Normalized: T=%.2f(%.6f+%.6f/N)"
            % (coeffs_Amdahl[0], coeffs_Amdahl[1], coeffs_Amdahl[2])
        )
    except Exception:
        print("Impossible to fit for Amdahl law : only %i elements" % len(D))

    try:
        coeffs_AmdahlR, matcov_AmdahlR = curve_fit(AmdahlR, N, D)

        # D_AmdahlR = AmdahlR(N, coeffs_AmdahlR[0], coeffs_AmdahlR[1])
        coeffs_AmdahlR[1] = coeffs_AmdahlR[1] * coeffs_AmdahlR[0] / D[0]
        coeffs_AmdahlR[0] = D[0]
        print(
            "Amdahl Reduced Normalized: T=%.2f(%.6f+%.6f/N)"
            % (coeffs_AmdahlR[0], 1 - coeffs_AmdahlR[1], coeffs_AmdahlR[1])
        )

    except Exception:
        print("Impossible to fit for Reduced Amdahl law : only %i elements" % len(D))

    try:
        coeffs_Mylq, matcov_Mylq = curve_fit(Mylq, N, D)

        coeffs_Mylq[1] = coeffs_Mylq[1] * coeffs_Mylq[0] / D[0]
        # coeffs_Mylq[2]=coeffs_Mylq[2]*coeffs_Mylq[0]/D[0]
        coeffs_Mylq[3] = coeffs_Mylq[3] * coeffs_Mylq[0] / D[0]
        coeffs_Mylq[0] = D[0]
        print(
            "Mylq Normalized : T=%.2f(%.6f+%.6f/N)+%.6f*N"
            % (coeffs_Mylq[0], coeffs_Mylq[1], coeffs_Mylq[3], coeffs_Mylq[2])
        )
        # NOTE(review): unlike D_Amdahl this curve is computed from the
        # already-normalised coefficients — confirm that is intended.
        D_Mylq = Mylq(N, coeffs_Mylq[0], coeffs_Mylq[1], coeffs_Mylq[2],
                      coeffs_Mylq[3])
    except Exception:
        print("Impossible to fit for Mylq law : only %i elements" % len(D))

    try:
        coeffs_Mylq2, matcov_Mylq2 = curve_fit(Mylq2, N, D)

        coeffs_Mylq2[1] = coeffs_Mylq2[1] * coeffs_Mylq2[0] / D[0]
        # coeffs_Mylq2[2]=coeffs_Mylq2[2]*coeffs_Mylq2[0]/D[0]
        # coeffs_Mylq2[3]=coeffs_Mylq2[3]*coeffs_Mylq2[0]/D[0]
        coeffs_Mylq2[4] = coeffs_Mylq2[4] * coeffs_Mylq2[0] / D[0]
        coeffs_Mylq2[0] = D[0]
        print(
            "Mylq 2nd order Normalized: T=%.2f(%.6f+%.6f/N)+%.6f*N+%.6f*N^2"
            % (
                coeffs_Mylq2[0],
                coeffs_Mylq2[1],
                coeffs_Mylq2[4],
                coeffs_Mylq2[2],
                coeffs_Mylq2[3],
            )
        )

    except Exception:
        print("Impossible to fit for 2nd order Mylq law : only %i elements" % len(D))

    if Curves:
        plt.xlabel("Number of Threads/work Items")
        plt.ylabel("Total Elapsed Time")

        # NOTE(review): this local `Experience` shadows the module-level
        # Experience class within this function.
        (Experience,) = plt.plot(N, D, "ro")
        try:
            # D_Amdahl / D_Mylq only exist when the matching fit succeeded.
            (pAmdahl,) = plt.plot(N, D_Amdahl, label="Loi de Amdahl")
            (pMylq,) = plt.plot(N, D_Mylq, label="Loi de Mylq")
        except Exception:
            print("Fit curves seem not to be available")

        plt.legend()
        plt.show()
|
|
739
|
-
|
|
740
|
-
|
|
741
|
-
if __name__ == "__main__":
|
|
742
|
-
|
|
743
|
-
# Set defaults values
|
|
744
|
-
|
|
745
|
-
# GPU style can be Cuda (Nvidia implementation) or OpenCL
|
|
746
|
-
GpuStyle = "OpenCL"
|
|
747
|
-
# Iterations is integer
|
|
748
|
-
Iterations = 1000000000
|
|
749
|
-
# BlocksBlocks in first number of Blocks to explore
|
|
750
|
-
BlocksBegin = 1024
|
|
751
|
-
# BlocksEnd is last number of Blocks to explore
|
|
752
|
-
BlocksEnd = 1024
|
|
753
|
-
# BlocksStep is the step of Blocks to explore
|
|
754
|
-
BlocksStep = 1
|
|
755
|
-
# ThreadsBlocks in first number of Blocks to explore
|
|
756
|
-
ThreadsBegin = 1
|
|
757
|
-
# ThreadsEnd is last number of Blocks to explore
|
|
758
|
-
ThreadsEnd = 1
|
|
759
|
-
# ThreadsStep is the step of Blocks to explore
|
|
760
|
-
ThreadsStep = 1
|
|
761
|
-
# Redo is the times to redo the test to improve metrology
|
|
762
|
-
Redo = 1
|
|
763
|
-
# OutMetrology is method for duration estimation : False is GPU inside
|
|
764
|
-
OutMetrology = False
|
|
765
|
-
Metrology = "InMetro"
|
|
766
|
-
# Curves is True to print the curves
|
|
767
|
-
Curves = False
|
|
768
|
-
# Fit is True to print the curves
|
|
769
|
-
Fit = False
|
|
770
|
-
# Inside based on If
|
|
771
|
-
IfThen = False
|
|
772
|
-
# Marsaglia RNG
|
|
773
|
-
RNG = "MWC"
|
|
774
|
-
# Value type : INT32, INT64, FP32, FP64
|
|
775
|
-
ValueType = "FP32"
|
|
776
|
-
# Seeds for RNG
|
|
777
|
-
Seeds = 110271, 101008
|
|
778
|
-
|
|
779
|
-
HowToUse = "%s -o (Out of Core Metrology) -c (Print Curves) -k (Case On IfThen) -d <DeviceId> -g <CUDA/OpenCL> -i <Iterations> -b <BlocksBegin> -e <BlocksEnd> -s <BlocksStep> -f <ThreadsFirst> -l <ThreadsLast> -t <ThreadssTep> -r <RedoToImproveStats> -m <SHR3/CONG/MWC/KISS> -v <INT32/INT64/FP32/FP64>" # noqa: E501
|
|
780
|
-
|
|
781
|
-
try:
|
|
782
|
-
opts, args = getopt.getopt(
|
|
783
|
-
sys.argv[1:],
|
|
784
|
-
"hockg:i:b:e:s:f:l:t:r:d:m:v:",
|
|
785
|
-
[
|
|
786
|
-
"gpustyle=",
|
|
787
|
-
"iterations=",
|
|
788
|
-
"blocksBegin=",
|
|
789
|
-
"blocksEnd=",
|
|
790
|
-
"blocksStep=",
|
|
791
|
-
"threadsFirst=",
|
|
792
|
-
"threadsLast=",
|
|
793
|
-
"threadssTep=",
|
|
794
|
-
"redo=",
|
|
795
|
-
"device=",
|
|
796
|
-
"marsaglia=",
|
|
797
|
-
"valuetype=",
|
|
798
|
-
],
|
|
799
|
-
)
|
|
800
|
-
except getopt.GetoptError:
|
|
801
|
-
print(HowToUse % sys.argv[0])
|
|
802
|
-
sys.exit(2)
|
|
803
|
-
|
|
804
|
-
# List of Devices
|
|
805
|
-
Devices = []
|
|
806
|
-
Alu = {}
|
|
807
|
-
|
|
808
|
-
for opt, arg in opts:
|
|
809
|
-
if opt == "-h":
|
|
810
|
-
print(HowToUse % sys.argv[0])
|
|
811
|
-
|
|
812
|
-
print("\nInformations about devices detected under OpenCL API:")
|
|
813
|
-
# For PyOpenCL import
|
|
814
|
-
try:
|
|
815
|
-
import pyopencl as cl
|
|
816
|
-
|
|
817
|
-
Id = 0
|
|
818
|
-
for platform in cl.get_platforms():
|
|
819
|
-
for device in platform.get_devices():
|
|
820
|
-
# deviceType=cl.device_type.to_string(device.type)
|
|
821
|
-
deviceType = "xPU"
|
|
822
|
-
print(
|
|
823
|
-
"Device #%i from %s of type %s : %s"
|
|
824
|
-
% (
|
|
825
|
-
Id,
|
|
826
|
-
platform.vendor.lstrip(),
|
|
827
|
-
deviceType,
|
|
828
|
-
device.name.lstrip(),
|
|
829
|
-
)
|
|
830
|
-
)
|
|
831
|
-
Id = Id + 1
|
|
832
|
-
|
|
833
|
-
except Exception:
|
|
834
|
-
print("Your platform does not seem to support OpenCL")
|
|
835
|
-
|
|
836
|
-
print("\nInformations about devices detected under CUDA API:")
|
|
837
|
-
# For PyCUDA import
|
|
838
|
-
try:
|
|
839
|
-
import pycuda.driver as cuda
|
|
840
|
-
|
|
841
|
-
cuda.init()
|
|
842
|
-
for Id in range(cuda.Device.count()):
|
|
843
|
-
device = cuda.Device(Id)
|
|
844
|
-
print("Device #%i of type GPU : %s" % (Id, device.name()))
|
|
845
|
-
print
|
|
846
|
-
except Exception:
|
|
847
|
-
print("Your platform does not seem to support CUDA")
|
|
848
|
-
|
|
849
|
-
sys.exit()
|
|
850
|
-
|
|
851
|
-
elif opt == "-o":
|
|
852
|
-
OutMetrology = True
|
|
853
|
-
Metrology = "OutMetro"
|
|
854
|
-
elif opt == "-c":
|
|
855
|
-
Curves = True
|
|
856
|
-
elif opt == "-k":
|
|
857
|
-
IfThen = True
|
|
858
|
-
elif opt in ("-d", "--device"):
|
|
859
|
-
Devices.append(int(arg))
|
|
860
|
-
elif opt in ("-g", "--gpustyle"):
|
|
861
|
-
GpuStyle = arg
|
|
862
|
-
elif opt in ("-m", "--marsaglia"):
|
|
863
|
-
RNG = arg
|
|
864
|
-
elif opt in ("-v", "--valuetype"):
|
|
865
|
-
ValueType = arg
|
|
866
|
-
elif opt in ("-i", "--iterations"):
|
|
867
|
-
Iterations = numpy.uint64(arg)
|
|
868
|
-
elif opt in ("-b", "--blocksbegin"):
|
|
869
|
-
BlocksBegin = int(arg)
|
|
870
|
-
BlocksEnd = BlocksBegin
|
|
871
|
-
elif opt in ("-e", "--blocksend"):
|
|
872
|
-
BlocksEnd = int(arg)
|
|
873
|
-
elif opt in ("-s", "--blocksstep"):
|
|
874
|
-
BlocksStep = int(arg)
|
|
875
|
-
elif opt in ("-f", "--threadsfirst"):
|
|
876
|
-
ThreadsBegin = int(arg)
|
|
877
|
-
ThreadsEnd = ThreadsBegin
|
|
878
|
-
elif opt in ("-l", "--threadslast"):
|
|
879
|
-
ThreadsEnd = int(arg)
|
|
880
|
-
elif opt in ("-t", "--threadsstep"):
|
|
881
|
-
ThreadsStep = int(arg)
|
|
882
|
-
elif opt in ("-r", "--redo"):
|
|
883
|
-
Redo = int(arg)
|
|
884
|
-
|
|
885
|
-
# If no device has been specified, take the first one!
|
|
886
|
-
if len(Devices) == 0:
|
|
887
|
-
Devices.append(0)
|
|
888
|
-
|
|
889
|
-
print("Devices Identification : %s" % Devices)
|
|
890
|
-
print("GpuStyle used : %s" % GpuStyle)
|
|
891
|
-
print("Iterations : %s" % Iterations)
|
|
892
|
-
print("Number of Blocks on begin : %s" % BlocksBegin)
|
|
893
|
-
print("Number of Blocks on end : %s" % BlocksEnd)
|
|
894
|
-
print("Step on Blocks : %s" % BlocksStep)
|
|
895
|
-
print("Number of Threads on begin : %s" % ThreadsBegin)
|
|
896
|
-
print("Number of Threads on end : %s" % ThreadsEnd)
|
|
897
|
-
print("Step on Threads : %s" % ThreadsStep)
|
|
898
|
-
print("Number of redo : %s" % Redo)
|
|
899
|
-
print("Metrology done out of XPU : %r" % OutMetrology)
|
|
900
|
-
print("Type of Marsaglia RNG used : %s" % RNG)
|
|
901
|
-
print("Type of variable : %s" % ValueType)
|
|
902
|
-
|
|
903
|
-
if GpuStyle == "CUDA":
|
|
904
|
-
try:
|
|
905
|
-
# For PyCUDA import
|
|
906
|
-
import pycuda.driver as cuda
|
|
907
|
-
|
|
908
|
-
cuda.init()
|
|
909
|
-
for Id in range(cuda.Device.count()):
|
|
910
|
-
device = cuda.Device(Id)
|
|
911
|
-
print("Device #%i of type GPU : %s" % (Id, device.name()))
|
|
912
|
-
if Id in Devices:
|
|
913
|
-
Alu[Id] = "GPU"
|
|
914
|
-
|
|
915
|
-
except ImportError:
|
|
916
|
-
print("Platform does not seem to support CUDA")
|
|
917
|
-
|
|
918
|
-
if GpuStyle == "OpenCL":
|
|
919
|
-
try:
|
|
920
|
-
# For PyOpenCL import
|
|
921
|
-
import pyopencl as cl
|
|
922
|
-
|
|
923
|
-
Id = 0
|
|
924
|
-
for platform in cl.get_platforms():
|
|
925
|
-
for device in platform.get_devices():
|
|
926
|
-
# deviceType=cl.device_type.to_string(device.type)
|
|
927
|
-
deviceType = "xPU"
|
|
928
|
-
print(
|
|
929
|
-
"Device #%i from %s of type %s : %s"
|
|
930
|
-
% (
|
|
931
|
-
Id,
|
|
932
|
-
platform.vendor.lstrip().rstrip(),
|
|
933
|
-
deviceType,
|
|
934
|
-
device.name.lstrip().rstrip(),
|
|
935
|
-
)
|
|
936
|
-
)
|
|
937
|
-
|
|
938
|
-
if Id in Devices:
|
|
939
|
-
# Set the Alu as detected Device Type
|
|
940
|
-
Alu[Id] = deviceType
|
|
941
|
-
Id = Id + 1
|
|
942
|
-
except ImportError:
|
|
943
|
-
print("Platform does not seem to support OpenCL")
|
|
944
|
-
|
|
945
|
-
# print(Devices,Alu)
|
|
946
|
-
|
|
947
|
-
BlocksList = range(BlocksBegin, BlocksEnd + BlocksStep, BlocksStep)
|
|
948
|
-
ThreadsList = range(ThreadsBegin, ThreadsEnd + ThreadsStep, ThreadsStep)
|
|
949
|
-
|
|
950
|
-
ExploredJobs = numpy.array([]).astype(numpy.uint32)
|
|
951
|
-
ExploredBlocks = numpy.array([]).astype(numpy.uint32)
|
|
952
|
-
ExploredThreads = numpy.array([]).astype(numpy.uint32)
|
|
953
|
-
avgD = numpy.array([]).astype(numpy.float32)
|
|
954
|
-
medD = numpy.array([]).astype(numpy.float32)
|
|
955
|
-
stdD = numpy.array([]).astype(numpy.float32)
|
|
956
|
-
minD = numpy.array([]).astype(numpy.float32)
|
|
957
|
-
maxD = numpy.array([]).astype(numpy.float32)
|
|
958
|
-
avgR = numpy.array([]).astype(numpy.float32)
|
|
959
|
-
medR = numpy.array([]).astype(numpy.float32)
|
|
960
|
-
stdR = numpy.array([]).astype(numpy.float32)
|
|
961
|
-
minR = numpy.array([]).astype(numpy.float32)
|
|
962
|
-
maxR = numpy.array([]).astype(numpy.float32)
|
|
963
|
-
|
|
964
|
-
for Blocks, Threads in itertools.product(BlocksList, ThreadsList):
|
|
965
|
-
|
|
966
|
-
# print Blocks,Threads
|
|
967
|
-
circle = numpy.zeros(Blocks * Threads).astype(numpy.uint64)
|
|
968
|
-
ExploredJobs = numpy.append(ExploredJobs, Blocks * Threads)
|
|
969
|
-
ExploredBlocks = numpy.append(ExploredBlocks, Blocks)
|
|
970
|
-
ExploredThreads = numpy.append(ExploredThreads, Threads)
|
|
971
|
-
|
|
972
|
-
if OutMetrology:
|
|
973
|
-
DurationItem = numpy.array([]).astype(numpy.float32)
|
|
974
|
-
Duration = numpy.array([]).astype(numpy.float32)
|
|
975
|
-
Rate = numpy.array([]).astype(numpy.float32)
|
|
976
|
-
for i in range(Redo):
|
|
977
|
-
start = time.time()
|
|
978
|
-
if GpuStyle == "CUDA":
|
|
979
|
-
try:
|
|
980
|
-
InputCU = {}
|
|
981
|
-
InputCU["Iterations"] = Iterations
|
|
982
|
-
InputCU["Steps"] = 1
|
|
983
|
-
InputCU["Blocks"] = Blocks
|
|
984
|
-
InputCU["Threads"] = Threads
|
|
985
|
-
InputCU["Device"] = Devices[0]
|
|
986
|
-
InputCU["RNG"] = RNG
|
|
987
|
-
InputCU["Seeds"] = Seeds
|
|
988
|
-
InputCU["ValueType"] = ValueType
|
|
989
|
-
InputCU["IfThen"] = IfThen
|
|
990
|
-
OutputCU = MetropolisCuda(InputCU)
|
|
991
|
-
Inside = OutputCU["Circle"]
|
|
992
|
-
NewIterations = OutputCU["NewIterations"]
|
|
993
|
-
Duration = OutputCU["Duration"]
|
|
994
|
-
except Exception:
|
|
995
|
-
print(
|
|
996
|
-
"Problem with (%i,%i) // computations on Cuda"
|
|
997
|
-
% (Blocks, Threads)
|
|
998
|
-
)
|
|
999
|
-
elif GpuStyle == "OpenCL":
|
|
1000
|
-
try:
|
|
1001
|
-
InputCL = {}
|
|
1002
|
-
InputCL["Iterations"] = Iterations
|
|
1003
|
-
InputCL["Steps"] = 1
|
|
1004
|
-
InputCL["Blocks"] = Blocks
|
|
1005
|
-
InputCL["Threads"] = Threads
|
|
1006
|
-
InputCL["Device"] = Devices[0]
|
|
1007
|
-
InputCL["RNG"] = RNG
|
|
1008
|
-
InputCL["Seeds"] = Seeds
|
|
1009
|
-
InputCL["ValueType"] = ValueType
|
|
1010
|
-
InputCL["IfThen"] = IfThen
|
|
1011
|
-
OutputCL = MetropolisOpenCL(InputCL)
|
|
1012
|
-
Inside = OutputCL["Circle"]
|
|
1013
|
-
NewIterations = OutputCL["NewIterations"]
|
|
1014
|
-
Duration = OutputCL["Duration"]
|
|
1015
|
-
except Exception:
|
|
1016
|
-
print(
|
|
1017
|
-
"Problem with (%i,%i) // computations on OpenCL"
|
|
1018
|
-
% (Blocks, Threads)
|
|
1019
|
-
)
|
|
1020
|
-
Duration = numpy.append(Duration, time.time() - start)
|
|
1021
|
-
Rate = numpy.append(Rate, NewIterations / Duration[-1])
|
|
1022
|
-
else:
|
|
1023
|
-
if GpuStyle == "CUDA":
|
|
1024
|
-
try:
|
|
1025
|
-
InputCU = {}
|
|
1026
|
-
InputCU["Iterations"] = Iterations
|
|
1027
|
-
InputCU["Steps"] = Redo
|
|
1028
|
-
InputCU["Blocks"] = Blocks
|
|
1029
|
-
InputCU["Threads"] = Threads
|
|
1030
|
-
InputCU["Device"] = Devices[0]
|
|
1031
|
-
InputCU["RNG"] = RNG
|
|
1032
|
-
InputCU["Seeds"] = Seeds
|
|
1033
|
-
InputCU["ValueType"] = ValueType
|
|
1034
|
-
InputCU["IfThen"] = IfThen
|
|
1035
|
-
OutputCU = MetropolisCuda(InputCU)
|
|
1036
|
-
Inside = OutputCU["Inside"]
|
|
1037
|
-
NewIterations = OutputCU["NewIterations"]
|
|
1038
|
-
Duration = OutputCU["Duration"]
|
|
1039
|
-
pycuda.context.pop() # noqa: F821
|
|
1040
|
-
except Exception:
|
|
1041
|
-
print(
|
|
1042
|
-
"Problem with (%i,%i) // computations on Cuda"
|
|
1043
|
-
% (Blocks, Threads)
|
|
1044
|
-
)
|
|
1045
|
-
elif GpuStyle == "OpenCL":
|
|
1046
|
-
try:
|
|
1047
|
-
InputCL = {}
|
|
1048
|
-
InputCL["Iterations"] = Iterations
|
|
1049
|
-
InputCL["Steps"] = Redo
|
|
1050
|
-
InputCL["Blocks"] = Blocks
|
|
1051
|
-
InputCL["Threads"] = Threads
|
|
1052
|
-
InputCL["Device"] = Devices[0]
|
|
1053
|
-
InputCL["RNG"] = RNG
|
|
1054
|
-
InputCL["Seeds"] = Seeds
|
|
1055
|
-
InputCL["ValueType"] = ValueType
|
|
1056
|
-
InputCL["IfThen"] = IfThen
|
|
1057
|
-
OutputCL = MetropolisOpenCL(InputCL)
|
|
1058
|
-
Inside = OutputCL["Inside"]
|
|
1059
|
-
NewIterations = OutputCL["NewIterations"]
|
|
1060
|
-
Duration = OutputCL["Duration"]
|
|
1061
|
-
except Exception:
|
|
1062
|
-
print(
|
|
1063
|
-
"Problem with (%i,%i) // computations on OpenCL"
|
|
1064
|
-
% (Blocks, Threads)
|
|
1065
|
-
)
|
|
1066
|
-
Rate = NewIterations / Duration[-1]
|
|
1067
|
-
print(
|
|
1068
|
-
"Itops %i\nLogItops %.2f "
|
|
1069
|
-
% (int(Rate), numpy.log(Rate) / numpy.log(10))
|
|
1070
|
-
)
|
|
1071
|
-
print("Pi estimation %.8f" % (4.0 / NewIterations * Inside))
|
|
1072
|
-
|
|
1073
|
-
avgD = numpy.append(avgD, numpy.average(Duration))
|
|
1074
|
-
medD = numpy.append(medD, numpy.median(Duration))
|
|
1075
|
-
stdD = numpy.append(stdD, numpy.std(Duration))
|
|
1076
|
-
minD = numpy.append(minD, numpy.min(Duration))
|
|
1077
|
-
maxD = numpy.append(maxD, numpy.max(Duration))
|
|
1078
|
-
avgR = numpy.append(avgR, numpy.average(Rate))
|
|
1079
|
-
medR = numpy.append(medR, numpy.median(Rate))
|
|
1080
|
-
stdR = numpy.append(stdR, numpy.std(Rate))
|
|
1081
|
-
minR = numpy.append(minR, numpy.min(Rate))
|
|
1082
|
-
maxR = numpy.append(maxR, numpy.max(Rate))
|
|
1083
|
-
|
|
1084
|
-
print(
|
|
1085
|
-
"%.2f %.2f %.2f %.2f %.2f %i %i %i %i %i"
|
|
1086
|
-
% (
|
|
1087
|
-
avgD[-1],
|
|
1088
|
-
medD[-1],
|
|
1089
|
-
stdD[-1],
|
|
1090
|
-
minD[-1],
|
|
1091
|
-
maxD[-1],
|
|
1092
|
-
avgR[-1],
|
|
1093
|
-
medR[-1],
|
|
1094
|
-
stdR[-1],
|
|
1095
|
-
minR[-1],
|
|
1096
|
-
maxR[-1],
|
|
1097
|
-
)
|
|
1098
|
-
)
|
|
1099
|
-
|
|
1100
|
-
numpy.savez(
|
|
1101
|
-
"Pi_%s_%s_%s_%s_%s_%s_%s_%s_%.8i_Device%i_%s_%s"
|
|
1102
|
-
% (
|
|
1103
|
-
ValueType,
|
|
1104
|
-
RNG,
|
|
1105
|
-
Alu[Devices[0]],
|
|
1106
|
-
GpuStyle,
|
|
1107
|
-
BlocksBegin,
|
|
1108
|
-
BlocksEnd,
|
|
1109
|
-
ThreadsBegin,
|
|
1110
|
-
ThreadsEnd,
|
|
1111
|
-
Iterations,
|
|
1112
|
-
Devices[0],
|
|
1113
|
-
Metrology,
|
|
1114
|
-
gethostname(),
|
|
1115
|
-
),
|
|
1116
|
-
(
|
|
1117
|
-
ExploredBlocks,
|
|
1118
|
-
ExploredThreads,
|
|
1119
|
-
avgD,
|
|
1120
|
-
medD,
|
|
1121
|
-
stdD,
|
|
1122
|
-
minD,
|
|
1123
|
-
maxD,
|
|
1124
|
-
avgR,
|
|
1125
|
-
medR,
|
|
1126
|
-
stdR,
|
|
1127
|
-
minR,
|
|
1128
|
-
maxR,
|
|
1129
|
-
),
|
|
1130
|
-
)
|
|
1131
|
-
ToSave = [
|
|
1132
|
-
ExploredBlocks,
|
|
1133
|
-
ExploredThreads,
|
|
1134
|
-
avgD,
|
|
1135
|
-
medD,
|
|
1136
|
-
stdD,
|
|
1137
|
-
minD,
|
|
1138
|
-
maxD,
|
|
1139
|
-
avgR,
|
|
1140
|
-
medR,
|
|
1141
|
-
stdR,
|
|
1142
|
-
minR,
|
|
1143
|
-
maxR,
|
|
1144
|
-
]
|
|
1145
|
-
numpy.savetxt(
|
|
1146
|
-
"Pi_%s_%s_%s_%s_%s_%s_%s_%i_%.8i_Device%i_%s_%s"
|
|
1147
|
-
% (
|
|
1148
|
-
ValueType,
|
|
1149
|
-
RNG,
|
|
1150
|
-
Alu[Devices[0]],
|
|
1151
|
-
GpuStyle,
|
|
1152
|
-
BlocksBegin,
|
|
1153
|
-
BlocksEnd,
|
|
1154
|
-
ThreadsBegin,
|
|
1155
|
-
ThreadsEnd,
|
|
1156
|
-
Iterations,
|
|
1157
|
-
Devices[0],
|
|
1158
|
-
Metrology,
|
|
1159
|
-
gethostname(),
|
|
1160
|
-
),
|
|
1161
|
-
numpy.transpose(ToSave),
|
|
1162
|
-
fmt="%i %i %e %e %e %e %e %i %i %i %i %i",
|
|
1163
|
-
)
|
|
1164
|
-
|
|
1165
|
-
if Fit:
|
|
1166
|
-
FitAndPrint(ExploredJobs, median, Curves) # noqa: F821, E501 # FIXME: undefined var 'median'
|