pyopencl 2024.2.2__cp310-cp310-macosx_11_0_arm64.whl → 2024.2.4__cp310-cp310-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pyopencl might be problematic; see the package registry's advisory for more details.

Files changed (102)
  1. pyopencl/__init__.py +16 -4
  2. pyopencl/_cl.cpython-310-darwin.so +0 -0
  3. pyopencl/algorithm.py +3 -1
  4. pyopencl/bitonic_sort.py +2 -0
  5. pyopencl/characterize/__init__.py +23 -0
  6. pyopencl/compyte/.git +1 -0
  7. pyopencl/compyte/.github/workflows/autopush.yml +21 -0
  8. pyopencl/compyte/.github/workflows/ci.yml +30 -0
  9. pyopencl/compyte/.gitignore +21 -0
  10. pyopencl/compyte/ndarray/Makefile +31 -0
  11. pyopencl/compyte/ndarray/gpu_ndarray.h +35 -0
  12. pyopencl/compyte/ndarray/pygpu_language.h +207 -0
  13. pyopencl/compyte/ndarray/pygpu_language_cuda.cu +622 -0
  14. pyopencl/compyte/ndarray/pygpu_language_opencl.cpp +317 -0
  15. pyopencl/compyte/ndarray/pygpu_ndarray.cpp +1546 -0
  16. pyopencl/compyte/ndarray/pygpu_ndarray.h +71 -0
  17. pyopencl/compyte/ndarray/pygpu_ndarray_object.h +232 -0
  18. pyopencl/compyte/setup.cfg +9 -0
  19. pyopencl/tools.py +60 -56
  20. pyopencl/version.py +7 -3
  21. {pyopencl-2024.2.2.dist-info → pyopencl-2024.2.4.dist-info}/METADATA +14 -14
  22. pyopencl-2024.2.4.dist-info/RECORD +59 -0
  23. {pyopencl-2024.2.2.dist-info → pyopencl-2024.2.4.dist-info}/WHEEL +1 -1
  24. pyopencl-2024.2.2.data/data/CITATION.cff +0 -74
  25. pyopencl-2024.2.2.data/data/CMakeLists.txt +0 -83
  26. pyopencl-2024.2.2.data/data/Makefile.in +0 -21
  27. pyopencl-2024.2.2.data/data/README.rst +0 -70
  28. pyopencl-2024.2.2.data/data/README_SETUP.txt +0 -34
  29. pyopencl-2024.2.2.data/data/aksetup_helper.py +0 -1013
  30. pyopencl-2024.2.2.data/data/configure.py +0 -6
  31. pyopencl-2024.2.2.data/data/contrib/cldis.py +0 -91
  32. pyopencl-2024.2.2.data/data/contrib/fortran-to-opencl/README +0 -29
  33. pyopencl-2024.2.2.data/data/contrib/fortran-to-opencl/translate.py +0 -1441
  34. pyopencl-2024.2.2.data/data/contrib/pyopencl.vim +0 -84
  35. pyopencl-2024.2.2.data/data/doc/Makefile +0 -23
  36. pyopencl-2024.2.2.data/data/doc/algorithm.rst +0 -214
  37. pyopencl-2024.2.2.data/data/doc/array.rst +0 -305
  38. pyopencl-2024.2.2.data/data/doc/conf.py +0 -26
  39. pyopencl-2024.2.2.data/data/doc/howto.rst +0 -105
  40. pyopencl-2024.2.2.data/data/doc/index.rst +0 -137
  41. pyopencl-2024.2.2.data/data/doc/make_constants.py +0 -561
  42. pyopencl-2024.2.2.data/data/doc/misc.rst +0 -885
  43. pyopencl-2024.2.2.data/data/doc/runtime.rst +0 -51
  44. pyopencl-2024.2.2.data/data/doc/runtime_const.rst +0 -30
  45. pyopencl-2024.2.2.data/data/doc/runtime_gl.rst +0 -78
  46. pyopencl-2024.2.2.data/data/doc/runtime_memory.rst +0 -527
  47. pyopencl-2024.2.2.data/data/doc/runtime_platform.rst +0 -184
  48. pyopencl-2024.2.2.data/data/doc/runtime_program.rst +0 -364
  49. pyopencl-2024.2.2.data/data/doc/runtime_queue.rst +0 -182
  50. pyopencl-2024.2.2.data/data/doc/subst.rst +0 -36
  51. pyopencl-2024.2.2.data/data/doc/tools.rst +0 -4
  52. pyopencl-2024.2.2.data/data/doc/types.rst +0 -42
  53. pyopencl-2024.2.2.data/data/examples/black-hole-accretion.py +0 -2227
  54. pyopencl-2024.2.2.data/data/examples/demo-struct-reduce.py +0 -75
  55. pyopencl-2024.2.2.data/data/examples/demo.py +0 -39
  56. pyopencl-2024.2.2.data/data/examples/demo_array.py +0 -32
  57. pyopencl-2024.2.2.data/data/examples/demo_array_svm.py +0 -37
  58. pyopencl-2024.2.2.data/data/examples/demo_elementwise.py +0 -34
  59. pyopencl-2024.2.2.data/data/examples/demo_elementwise_complex.py +0 -53
  60. pyopencl-2024.2.2.data/data/examples/demo_mandelbrot.py +0 -183
  61. pyopencl-2024.2.2.data/data/examples/demo_meta_codepy.py +0 -56
  62. pyopencl-2024.2.2.data/data/examples/demo_meta_template.py +0 -55
  63. pyopencl-2024.2.2.data/data/examples/dump-performance.py +0 -38
  64. pyopencl-2024.2.2.data/data/examples/dump-properties.py +0 -86
  65. pyopencl-2024.2.2.data/data/examples/gl_interop_demo.py +0 -84
  66. pyopencl-2024.2.2.data/data/examples/gl_particle_animation.py +0 -218
  67. pyopencl-2024.2.2.data/data/examples/ipython-demo.ipynb +0 -203
  68. pyopencl-2024.2.2.data/data/examples/median-filter.py +0 -99
  69. pyopencl-2024.2.2.data/data/examples/n-body.py +0 -1070
  70. pyopencl-2024.2.2.data/data/examples/narray.py +0 -37
  71. pyopencl-2024.2.2.data/data/examples/noisyImage.jpg +0 -0
  72. pyopencl-2024.2.2.data/data/examples/pi-monte-carlo.py +0 -1166
  73. pyopencl-2024.2.2.data/data/examples/svm.py +0 -82
  74. pyopencl-2024.2.2.data/data/examples/transpose.py +0 -229
  75. pyopencl-2024.2.2.data/data/pytest.ini +0 -3
  76. pyopencl-2024.2.2.data/data/src/bitlog.cpp +0 -51
  77. pyopencl-2024.2.2.data/data/src/bitlog.hpp +0 -83
  78. pyopencl-2024.2.2.data/data/src/clinfo_ext.h +0 -134
  79. pyopencl-2024.2.2.data/data/src/mempool.hpp +0 -444
  80. pyopencl-2024.2.2.data/data/src/pyopencl_ext.h +0 -77
  81. pyopencl-2024.2.2.data/data/src/tools.hpp +0 -90
  82. pyopencl-2024.2.2.data/data/src/wrap_cl.cpp +0 -61
  83. pyopencl-2024.2.2.data/data/src/wrap_cl.hpp +0 -5853
  84. pyopencl-2024.2.2.data/data/src/wrap_cl_part_1.cpp +0 -369
  85. pyopencl-2024.2.2.data/data/src/wrap_cl_part_2.cpp +0 -702
  86. pyopencl-2024.2.2.data/data/src/wrap_constants.cpp +0 -1274
  87. pyopencl-2024.2.2.data/data/src/wrap_helpers.hpp +0 -213
  88. pyopencl-2024.2.2.data/data/src/wrap_mempool.cpp +0 -738
  89. pyopencl-2024.2.2.data/data/test/add-vectors-32.spv +0 -0
  90. pyopencl-2024.2.2.data/data/test/add-vectors-64.spv +0 -0
  91. pyopencl-2024.2.2.data/data/test/empty-header.h +0 -1
  92. pyopencl-2024.2.2.data/data/test/test_algorithm.py +0 -1180
  93. pyopencl-2024.2.2.data/data/test/test_array.py +0 -2392
  94. pyopencl-2024.2.2.data/data/test/test_arrays_in_structs.py +0 -100
  95. pyopencl-2024.2.2.data/data/test/test_clmath.py +0 -529
  96. pyopencl-2024.2.2.data/data/test/test_clrandom.py +0 -75
  97. pyopencl-2024.2.2.data/data/test/test_enqueue_copy.py +0 -271
  98. pyopencl-2024.2.2.data/data/test/test_wrapper.py +0 -1565
  99. pyopencl-2024.2.2.dist-info/LICENSE +0 -282
  100. pyopencl-2024.2.2.dist-info/RECORD +0 -123
  101. pyopencl-2024.2.2.dist-info/top_level.txt +0 -1
  102. {pyopencl-2024.2.2.data/data → pyopencl-2024.2.4.dist-info/licenses}/LICENSE +0 -0
@@ -1,1166 +0,0 @@
1
- #!/usr/bin/env python3
2
-
3
- #
4
- # Pi-by-MonteCarlo using PyCUDA/PyOpenCL
5
- #
6
- # performs an estimation of Pi using Monte Carlo method
7
- # a large amount of iterations is divided and distributed to compute units
8
- # a lot of options are provided to perform scalabilty tests
9
- #
10
- # use -h for complete set of options
11
- #
12
- # CC BY-NC-SA 2011 : Emmanuel QUEMENER <emmanuel.quemener@gmail.com>
13
- # Cecill v2 : Emmanuel QUEMENER <emmanuel.quemener@gmail.com>
14
- #
15
-
16
- # Thanks to Andreas Klockner for PyCUDA:
17
- # http://mathema.tician.de/software/pycuda
18
- # Thanks to Andreas Klockner for PyOpenCL:
19
- # http://mathema.tician.de/software/pyopencl
20
- #
21
-
22
- # 2013-01-01 : problems with launch timeout
23
- # http://stackoverflow.com/questions/497685/how-do-you-get-around-the-maximum-cuda-run-time
24
- # Option "Interactive" "0" in /etc/X11/xorg.conf
25
-
26
- import getopt
27
- import itertools
28
- import sys
29
- import time
30
- from socket import gethostname
31
-
32
- # Common tools
33
- import numpy
34
-
35
-
36
- class PenStacle:
37
- """Pentacle of Statistics from data"""
38
-
39
- Avg = 0
40
- Med = 0
41
- Std = 0
42
- Min = 0
43
- Max = 0
44
-
45
- def __init__(self, Data):
46
- self.Avg = numpy.average(Data)
47
- self.Med = numpy.median(Data)
48
- self.Std = numpy.std(Data)
49
- self.Max = numpy.max(Data)
50
- self.Min = numpy.min(Data)
51
-
52
- def display(self):
53
- print("%s %s %s %s %s" % (self.Avg, self.Med, self.Std, self.Min, self.Max))
54
-
55
-
56
- class Experience:
57
- """Metrology for experiences"""
58
-
59
- DeviceStyle = ""
60
- DeviceId = 0
61
- AvgD = 0
62
- MedD = 0
63
- StdD = 0
64
- MinD = 0
65
- MaxD = 0
66
- AvgR = 0
67
- MedR = 0
68
- StdR = 0
69
- MinR = 0
70
- MaxR = 0
71
-
72
- def __init__(self, DeviceStyle, DeviceId, Iterations):
73
- self.DeviceStyle = DeviceStyle
74
- self.DeviceId = DeviceId
75
- self.Iterations
76
-
77
- def Metrology(self, Data):
78
- Duration = PenStacle(Data)
79
- Rate = PenStacle(Iterations / Data)
80
- print("Duration %s" % Duration)
81
- print("Rate %s" % Rate)
82
-
83
-
84
- def DictionariesAPI():
85
- Marsaglia = {"CONG": 0, "SHR3": 1, "MWC": 2, "KISS": 3}
86
- Computing = {"INT32": 0, "INT64": 1, "FP32": 2, "FP64": 3}
87
- Test = {True: 1, False: 0}
88
- return (Marsaglia, Computing, Test)
89
-
90
-
91
- # find prime factors of a number
92
- # Get for WWW :
93
- # http://pythonism.wordpress.com/2008/05/17/looking-at-factorisation-in-python/
94
- def PrimeFactors(x):
95
-
96
- factorlist = numpy.array([]).astype("uint32")
97
- loop = 2
98
- while loop <= x:
99
- if x % loop == 0:
100
- x /= loop
101
- factorlist = numpy.append(factorlist, [loop])
102
- else:
103
- loop += 1
104
- return factorlist
105
-
106
-
107
- # Try to find the best thread number in Hybrid approach (Blocks&Threads)
108
- # output is thread number
109
- def BestThreadsNumber(jobs):
110
- factors = PrimeFactors(jobs)
111
- matrix = numpy.append([factors], [factors[::-1]], axis=0)
112
- threads = 1
113
- for factor in matrix.transpose().ravel():
114
- threads = threads * factor
115
- if threads * threads > jobs or threads > 512:
116
- break
117
- return int(threads)
118
-
119
-
120
- # Predicted Amdahl Law (Reduced with s=1-p)
121
- def AmdahlR(N, T1, p):
122
- return T1 * (1 - p + p / N)
123
-
124
-
125
- # Predicted Amdahl Law
126
- def Amdahl(N, T1, s, p):
127
- return T1 * (s + p / N)
128
-
129
-
130
- # Predicted Mylq Law with first order
131
- def Mylq(N, T1, s, c, p):
132
- return T1 * (s + p / N) + c * N
133
-
134
-
135
- # Predicted Mylq Law with second order
136
- def Mylq2(N, T1, s, c1, c2, p):
137
- return T1 * (s + p / N) + c1 * N + c2 * N * N
138
-
139
-
140
- def KernelCodeCuda():
141
- KERNEL_CODE_CUDA = """
142
- #define TCONG 0
143
- #define TSHR3 1
144
- #define TMWC 2
145
- #define TKISS 3
146
-
147
- #define TINT32 0
148
- #define TINT64 1
149
- #define TFP32 2
150
- #define TFP64 3
151
-
152
- #define IFTHEN 1
153
-
154
- // Marsaglia RNG very simple implementation
155
-
156
- #define znew ((z=36969*(z&65535)+(z>>16))<<16)
157
- #define wnew ((w=18000*(w&65535)+(w>>16))&65535)
158
- #define MWC (znew+wnew)
159
- #define SHR3 (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
160
- #define CONG (jcong=69069*jcong+1234567)
161
- #define KISS ((MWC^CONG)+SHR3)
162
-
163
- #define MWCfp MWC * 2.328306435454494e-10f
164
- #define KISSfp KISS * 2.328306435454494e-10f
165
- #define SHR3fp SHR3 * 2.328306435454494e-10f
166
- #define CONGfp CONG * 2.328306435454494e-10f
167
-
168
- __device__ ulong MainLoop(ulong iterations,uint seed_w,uint seed_z,size_t work)
169
- {
170
-
171
- #if TRNG == TCONG
172
- uint jcong=seed_z+work;
173
- #elif TRNG == TSHR3
174
- uint jsr=seed_w+work;
175
- #elif TRNG == TMWC
176
- uint z=seed_z+work;
177
- uint w=seed_w+work;
178
- #elif TRNG == TKISS
179
- uint jcong=seed_z+work;
180
- uint jsr=seed_w+work;
181
- uint z=seed_z-work;
182
- uint w=seed_w-work;
183
- #endif
184
-
185
- ulong total=0;
186
-
187
- for (ulong i=0;i<iterations;i++) {
188
-
189
- #if TYPE == TINT32
190
- #define THEONE 1073741824
191
- #if TRNG == TCONG
192
- uint x=CONG>>17 ;
193
- uint y=CONG>>17 ;
194
- #elif TRNG == TSHR3
195
- uint x=SHR3>>17 ;
196
- uint y=SHR3>>17 ;
197
- #elif TRNG == TMWC
198
- uint x=MWC>>17 ;
199
- uint y=MWC>>17 ;
200
- #elif TRNG == TKISS
201
- uint x=KISS>>17 ;
202
- uint y=KISS>>17 ;
203
- #endif
204
- #elif TYPE == TINT64
205
- #define THEONE 4611686018427387904
206
- #if TRNG == TCONG
207
- ulong x=(ulong)(CONG>>1) ;
208
- ulong y=(ulong)(CONG>>1) ;
209
- #elif TRNG == TSHR3
210
- ulong x=(ulong)(SHR3>>1) ;
211
- ulong y=(ulong)(SHR3>>1) ;
212
- #elif TRNG == TMWC
213
- ulong x=(ulong)(MWC>>1) ;
214
- ulong y=(ulong)(MWC>>1) ;
215
- #elif TRNG == TKISS
216
- ulong x=(ulong)(KISS>>1) ;
217
- ulong y=(ulong)(KISS>>1) ;
218
- #endif
219
- #elif TYPE == TFP32
220
- #define THEONE 1.0f
221
- #if TRNG == TCONG
222
- float x=CONGfp ;
223
- float y=CONGfp ;
224
- #elif TRNG == TSHR3
225
- float x=SHR3fp ;
226
- float y=SHR3fp ;
227
- #elif TRNG == TMWC
228
- float x=MWCfp ;
229
- float y=MWCfp ;
230
- #elif TRNG == TKISS
231
- float x=KISSfp ;
232
- float y=KISSfp ;
233
- #endif
234
- #elif TYPE == TFP64
235
- #define THEONE 1.0f
236
- #if TRNG == TCONG
237
- double x=(double)CONGfp ;
238
- double y=(double)CONGfp ;
239
- #elif TRNG == TSHR3
240
- double x=(double)SHR3fp ;
241
- double y=(double)SHR3fp ;
242
- #elif TRNG == TMWC
243
- double x=(double)MWCfp ;
244
- double y=(double)MWCfp ;
245
- #elif TRNG == TKISS
246
- double x=(double)KISSfp ;
247
- double y=(double)KISSfp ;
248
- #endif
249
- #endif
250
-
251
- #if TEST == IFTHEN
252
- if ((x*x+y*y) <=THEONE) {
253
- total+=1;
254
- }
255
- #else
256
- ulong inside=((x*x+y*y) <= THEONE) ? 1:0;
257
- total+=inside;
258
- #endif
259
- }
260
-
261
- return(total);
262
- }
263
-
264
- __global__ void MainLoopBlocks(ulong *s,ulong iterations,uint seed_w,uint seed_z)
265
- {
266
- ulong total=MainLoop(iterations,seed_z,seed_w,blockIdx.x);
267
- s[blockIdx.x]=total;
268
- __syncthreads();
269
-
270
- }
271
-
272
- __global__ void MainLoopThreads(ulong *s,ulong iterations,uint seed_w,uint seed_z)
273
- {
274
- ulong total=MainLoop(iterations,seed_z,seed_w,threadIdx.x);
275
- s[threadIdx.x]=total;
276
- __syncthreads();
277
-
278
- }
279
-
280
- __global__ void MainLoopHybrid(ulong *s,ulong iterations,uint seed_w,uint seed_z)
281
- {
282
- ulong total=MainLoop(iterations,seed_z,seed_w,blockDim.x*blockIdx.x+threadIdx.x);
283
- s[blockDim.x*blockIdx.x+threadIdx.x]=total;
284
- __syncthreads();
285
- }
286
-
287
- """
288
- return KERNEL_CODE_CUDA
289
-
290
-
291
- def KernelCodeOpenCL():
292
- KERNEL_CODE_OPENCL = """
293
- #define TCONG 0
294
- #define TSHR3 1
295
- #define TMWC 2
296
- #define TKISS 3
297
-
298
- #define TINT32 0
299
- #define TINT64 1
300
- #define TFP32 2
301
- #define TFP64 3
302
-
303
- #define IFTHEN 1
304
-
305
- // Marsaglia RNG very simple implementation
306
- #define znew ((z=36969*(z&65535)+(z>>16))<<16)
307
- #define wnew ((w=18000*(w&65535)+(w>>16))&65535)
308
-
309
- #define MWC (znew+wnew)
310
- #define SHR3 (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
311
- #define CONG (jcong=69069*jcong+1234567)
312
- #define KISS ((MWC^CONG)+SHR3)
313
-
314
- #define MWCfp MWC * 2.328306435454494e-10f
315
- #define KISSfp KISS * 2.328306435454494e-10f
316
- #define CONGfp CONG * 2.328306435454494e-10f
317
- #define SHR3fp SHR3 * 2.328306435454494e-10f
318
-
319
- ulong MainLoop(ulong iterations,uint seed_z,uint seed_w,size_t work)
320
- {
321
-
322
- #if TRNG == TCONG
323
- uint jcong=seed_z+work;
324
- #elif TRNG == TSHR3
325
- uint jsr=seed_w+work;
326
- #elif TRNG == TMWC
327
- uint z=seed_z+work;
328
- uint w=seed_w+work;
329
- #elif TRNG == TKISS
330
- uint jcong=seed_z+work;
331
- uint jsr=seed_w+work;
332
- uint z=seed_z-work;
333
- uint w=seed_w-work;
334
- #endif
335
-
336
- ulong total=0;
337
-
338
- for (ulong i=0;i<iterations;i++) {
339
-
340
- #if TYPE == TINT32
341
- #define THEONE 1073741824
342
- #if TRNG == TCONG
343
- uint x=CONG>>17 ;
344
- uint y=CONG>>17 ;
345
- #elif TRNG == TSHR3
346
- uint x=SHR3>>17 ;
347
- uint y=SHR3>>17 ;
348
- #elif TRNG == TMWC
349
- uint x=MWC>>17 ;
350
- uint y=MWC>>17 ;
351
- #elif TRNG == TKISS
352
- uint x=KISS>>17 ;
353
- uint y=KISS>>17 ;
354
- #endif
355
- #elif TYPE == TINT64
356
- #define THEONE 4611686018427387904
357
- #if TRNG == TCONG
358
- ulong x=(ulong)(CONG>>1) ;
359
- ulong y=(ulong)(CONG>>1) ;
360
- #elif TRNG == TSHR3
361
- ulong x=(ulong)(SHR3>>1) ;
362
- ulong y=(ulong)(SHR3>>1) ;
363
- #elif TRNG == TMWC
364
- ulong x=(ulong)(MWC>>1) ;
365
- ulong y=(ulong)(MWC>>1) ;
366
- #elif TRNG == TKISS
367
- ulong x=(ulong)(KISS>>1) ;
368
- ulong y=(ulong)(KISS>>1) ;
369
- #endif
370
- #elif TYPE == TFP32
371
- #define THEONE 1.0f
372
- #if TRNG == TCONG
373
- float x=CONGfp ;
374
- float y=CONGfp ;
375
- #elif TRNG == TSHR3
376
- float x=SHR3fp ;
377
- float y=SHR3fp ;
378
- #elif TRNG == TMWC
379
- float x=MWCfp ;
380
- float y=MWCfp ;
381
- #elif TRNG == TKISS
382
- float x=KISSfp ;
383
- float y=KISSfp ;
384
- #endif
385
- #elif TYPE == TFP64
386
- #pragma OPENCL EXTENSION cl_khr_fp64: enable
387
- #define THEONE 1.0f
388
- #if TRNG == TCONG
389
- double x=(double)CONGfp ;
390
- double y=(double)CONGfp ;
391
- #elif TRNG == TSHR3
392
- double x=(double)SHR3fp ;
393
- double y=(double)SHR3fp ;
394
- #elif TRNG == TMWC
395
- double x=(double)MWCfp ;
396
- double y=(double)MWCfp ;
397
- #elif TRNG == TKISS
398
- double x=(double)KISSfp ;
399
- double y=(double)KISSfp ;
400
- #endif
401
- #endif
402
-
403
- #if TEST == IFTHEN
404
- if ((x*x+y*y) <= THEONE) {
405
- total+=1;
406
- }
407
- #else
408
- ulong inside=((x*x+y*y) <= THEONE) ? 1:0;
409
- total+=inside;
410
- #endif
411
- }
412
-
413
- return(total);
414
- }
415
-
416
- __kernel void MainLoopGlobal(
417
- __global ulong *s,ulong iterations,uint seed_w,uint seed_z)
418
- {
419
- ulong total=MainLoop(iterations,seed_z,seed_w,get_global_id(0));
420
- barrier(CLK_GLOBAL_MEM_FENCE);
421
- s[get_global_id(0)]=total;
422
- }
423
-
424
- __kernel void MainLoopLocal(
425
- __global ulong *s,ulong iterations,uint seed_w,uint seed_z)
426
- {
427
- ulong total=MainLoop(iterations,seed_z,seed_w,get_local_id(0));
428
- barrier(CLK_LOCAL_MEM_FENCE);
429
- s[get_local_id(0)]=total;
430
- }
431
-
432
- __kernel void MainLoopHybrid(
433
- __global ulong *s,ulong iterations,uint seed_w,uint seed_z)
434
- {
435
- ulong total=MainLoop(iterations,seed_z,seed_w,get_global_id(0));
436
- barrier(CLK_GLOBAL_MEM_FENCE || CLK_LOCAL_MEM_FENCE);
437
- s[get_global_id(0)]=total;
438
- }
439
-
440
- """
441
- return KERNEL_CODE_OPENCL
442
-
443
-
444
- def MetropolisCuda(InputCU):
445
-
446
- print("Inside ", InputCU)
447
-
448
- iterations = InputCU["Iterations"]
449
- steps = InputCU["Steps"]
450
- blocks = InputCU["Blocks"]
451
- threads = InputCU["Threads"]
452
- Device = InputCU["Device"]
453
- RNG = InputCU["RNG"]
454
- ValueType = InputCU["ValueType"]
455
- Seeds = InputCU["Seeds"]
456
-
457
- Marsaglia, Computing, Test = DictionariesAPI()
458
-
459
- try:
460
- # For PyCUDA import
461
- import pycuda.driver as cuda
462
- from pycuda.compiler import SourceModule
463
-
464
- cuda.init()
465
- for Id in range(cuda.Device.count()):
466
- if Id == Device:
467
- XPU = cuda.Device(Id)
468
- print("GPU selected %s" % XPU.name())
469
- print
470
-
471
- except ImportError:
472
- print("Platform does not seem to support CUDA")
473
-
474
- circle = numpy.zeros(blocks * threads).astype(numpy.uint64)
475
- circleCU = cuda.InOut(circle)
476
- # circleCU = cuda.mem_alloc(circle.size*circle.dtype.itemize)
477
- # cuda.memcpy_htod(circleCU, circle)
478
-
479
- Context = XPU.make_context()
480
-
481
- try:
482
- mod = SourceModule(
483
- KernelCodeCuda(),
484
- options=[
485
- "--compiler-options",
486
- "-DTRNG=%i -DTYPE=%s" % (Marsaglia[RNG], Computing[ValueType]),
487
- ],
488
- )
489
- # mod = SourceModule(KernelCodeCuda(),nvcc='nvcc',keep=True)
490
- # Needed to set the compiler via ccbin for CUDA9 implementation
491
- # mod = SourceModule(KernelCodeCuda(),options=['-ccbin','clang-3.9','--compiler-options','-DTRNG=%i' % Marsaglia[RNG],'-DTYPE=%s' % Computing[ValueType],'-DTEST=%s' % Test[TestType]],keep=True) # noqa: E501
492
- except Exception:
493
- print("Compilation seems to break")
494
-
495
- MetropolisBlocksCU = mod.get_function("MainLoopBlocks") # noqa: F841
496
- MetropolisThreadsCU = mod.get_function("MainLoopThreads") # noqa: F841
497
- MetropolisHybridCU = mod.get_function("MainLoopHybrid")
498
-
499
- MyDuration = numpy.zeros(steps)
500
-
501
- jobs = blocks * threads
502
-
503
- iterationsCU = numpy.uint64(iterations / jobs)
504
- if iterations % jobs != 0:
505
- iterationsCU += numpy.uint64(1)
506
-
507
- for i in range(steps):
508
- start_time = time.time()
509
-
510
- try:
511
- MetropolisHybridCU(
512
- circleCU,
513
- numpy.uint64(iterationsCU),
514
- numpy.uint32(Seeds[0]),
515
- numpy.uint32(Seeds[1]),
516
- grid=(blocks, 1),
517
- block=(threads, 1, 1),
518
- )
519
- except Exception:
520
- print("Crash during CUDA call")
521
-
522
- elapsed = time.time() - start_time
523
- print(
524
- "(Blocks/Threads)=(%i,%i) method done in %.2f s..."
525
- % (blocks, threads, elapsed)
526
- )
527
-
528
- MyDuration[i] = elapsed
529
-
530
- OutputCU = {
531
- "Inside": sum(circle),
532
- "NewIterations": numpy.uint64(iterationsCU * jobs),
533
- "Duration": MyDuration,
534
- }
535
- print(OutputCU)
536
- Context.pop()
537
-
538
- Context.detach()
539
-
540
- return OutputCU
541
-
542
-
543
- def MetropolisOpenCL(InputCL):
544
-
545
- import pyopencl as cl
546
-
547
- iterations = InputCL["Iterations"]
548
- steps = InputCL["Steps"]
549
- blocks = InputCL["Blocks"]
550
- threads = InputCL["Threads"]
551
- Device = InputCL["Device"]
552
- RNG = InputCL["RNG"]
553
- ValueType = InputCL["ValueType"]
554
- TestType = InputCL["IfThen"]
555
- Seeds = InputCL["Seeds"]
556
-
557
- Marsaglia, Computing, Test = DictionariesAPI()
558
-
559
- # Initialisation des variables en les CASTant correctement
560
- Id = 0
561
- HasXPU = False
562
- for platform in cl.get_platforms():
563
- for device in platform.get_devices():
564
- if Id == Device:
565
- XPU = device
566
- print("CPU/GPU selected: ", device.name.lstrip())
567
- HasXPU = True
568
- Id += 1
569
- # print(Id)
570
-
571
- if not HasXPU:
572
- print("No XPU #%i found in all of %i devices, sorry..." % (Device, Id - 1))
573
- sys.exit()
574
-
575
- # Je cree le contexte et la queue pour son execution
576
- try:
577
- ctx = cl.Context(devices=[XPU])
578
- queue = cl.CommandQueue(
579
- ctx, properties=cl.command_queue_properties.PROFILING_ENABLE
580
- )
581
- except Exception:
582
- print("Crash during context creation")
583
-
584
- # Je recupere les flag possibles pour les buffers
585
- mf = cl.mem_flags
586
-
587
- circle = numpy.zeros(blocks * threads).astype(numpy.uint64)
588
- circleCL = cl.Buffer(ctx, mf.WRITE_ONLY | mf.COPY_HOST_PTR, hostbuf=circle)
589
-
590
- MetropolisCL = cl.Program(ctx, KernelCodeOpenCL()).build(
591
- options="-cl-mad-enable -cl-fast-relaxed-math -DTRNG=%i -DTYPE=%s -DTEST=%s"
592
- % (Marsaglia[RNG], Computing[ValueType], Test[TestType])
593
- )
594
-
595
- MyDuration = numpy.zeros(steps)
596
-
597
- jobs = blocks * threads
598
-
599
- iterationsCL = numpy.uint64(iterations / jobs)
600
- if iterations % jobs != 0:
601
- iterationsCL += 1
602
-
603
- for i in range(steps):
604
- start_time = time.time()
605
- if threads == 1:
606
- CLLaunch = MetropolisCL.MainLoopGlobal(
607
- queue,
608
- (blocks,),
609
- None,
610
- circleCL,
611
- numpy.uint64(iterationsCL),
612
- numpy.uint32(Seeds[0]),
613
- numpy.uint32(Seeds[1]),
614
- )
615
- else:
616
- CLLaunch = MetropolisCL.MainLoopHybrid(
617
- queue,
618
- (jobs,),
619
- (threads,),
620
- circleCL,
621
- numpy.uint64(iterationsCL),
622
- numpy.uint32(Seeds[0]),
623
- numpy.uint32(Seeds[1]),
624
- )
625
-
626
- CLLaunch.wait()
627
- cl.enqueue_copy(queue, circle, circleCL).wait()
628
-
629
- elapsed = time.time() - start_time
630
- print(
631
- "(Blocks/Threads)=(%i,%i) method done in %.2f s..."
632
- % (blocks, threads, elapsed)
633
- )
634
-
635
- # Elapsed method based on CLLaunch doesn't work for Beignet OpenCL
636
- # elapsed = 1e-9*(CLLaunch.profile.end - CLLaunch.profile.start)
637
-
638
- # print circle,numpy.mean(circle),numpy.median(circle),numpy.std(circle)
639
- MyDuration[i] = elapsed
640
- # AllPi=4./numpy.float32(iterationsCL)*circle.astype(numpy.float32)
641
- # MyPi[i]=numpy.median(AllPi)
642
- # print MyPi[i],numpy.std(AllPi),MyDuration[i]
643
-
644
- circleCL.release()
645
-
646
- OutputCL = {
647
- "Inside": sum(circle),
648
- "NewIterations": numpy.uint64(iterationsCL * jobs),
649
- "Duration": MyDuration,
650
- }
651
- # print(OutputCL)
652
- return OutputCL
653
-
654
-
655
- def FitAndPrint(N, D, Curves):
656
-
657
- import matplotlib.pyplot as plt
658
- from scipy.optimize import curve_fit
659
-
660
- try:
661
- coeffs_Amdahl, matcov_Amdahl = curve_fit(Amdahl, N, D)
662
-
663
- D_Amdahl = Amdahl(N, coeffs_Amdahl[0], coeffs_Amdahl[1], coeffs_Amdahl[2])
664
- coeffs_Amdahl[1] = coeffs_Amdahl[1] * coeffs_Amdahl[0] / D[0]
665
- coeffs_Amdahl[2] = coeffs_Amdahl[2] * coeffs_Amdahl[0] / D[0]
666
- coeffs_Amdahl[0] = D[0]
667
- print(
668
- "Amdahl Normalized: T=%.2f(%.6f+%.6f/N)"
669
- % (coeffs_Amdahl[0], coeffs_Amdahl[1], coeffs_Amdahl[2])
670
- )
671
- except Exception:
672
- print("Impossible to fit for Amdahl law : only %i elements" % len(D))
673
-
674
- try:
675
- coeffs_AmdahlR, matcov_AmdahlR = curve_fit(AmdahlR, N, D)
676
-
677
- # D_AmdahlR = AmdahlR(N, coeffs_AmdahlR[0], coeffs_AmdahlR[1])
678
- coeffs_AmdahlR[1] = coeffs_AmdahlR[1] * coeffs_AmdahlR[0] / D[0]
679
- coeffs_AmdahlR[0] = D[0]
680
- print(
681
- "Amdahl Reduced Normalized: T=%.2f(%.6f+%.6f/N)"
682
- % (coeffs_AmdahlR[0], 1 - coeffs_AmdahlR[1], coeffs_AmdahlR[1])
683
- )
684
-
685
- except Exception:
686
- print("Impossible to fit for Reduced Amdahl law : only %i elements" % len(D))
687
-
688
- try:
689
- coeffs_Mylq, matcov_Mylq = curve_fit(Mylq, N, D)
690
-
691
- coeffs_Mylq[1] = coeffs_Mylq[1] * coeffs_Mylq[0] / D[0]
692
- # coeffs_Mylq[2]=coeffs_Mylq[2]*coeffs_Mylq[0]/D[0]
693
- coeffs_Mylq[3] = coeffs_Mylq[3] * coeffs_Mylq[0] / D[0]
694
- coeffs_Mylq[0] = D[0]
695
- print(
696
- "Mylq Normalized : T=%.2f(%.6f+%.6f/N)+%.6f*N"
697
- % (coeffs_Mylq[0], coeffs_Mylq[1], coeffs_Mylq[3], coeffs_Mylq[2])
698
- )
699
- D_Mylq = Mylq(N, coeffs_Mylq[0], coeffs_Mylq[1], coeffs_Mylq[2],
700
- coeffs_Mylq[3])
701
- except Exception:
702
- print("Impossible to fit for Mylq law : only %i elements" % len(D))
703
-
704
- try:
705
- coeffs_Mylq2, matcov_Mylq2 = curve_fit(Mylq2, N, D)
706
-
707
- coeffs_Mylq2[1] = coeffs_Mylq2[1] * coeffs_Mylq2[0] / D[0]
708
- # coeffs_Mylq2[2]=coeffs_Mylq2[2]*coeffs_Mylq2[0]/D[0]
709
- # coeffs_Mylq2[3]=coeffs_Mylq2[3]*coeffs_Mylq2[0]/D[0]
710
- coeffs_Mylq2[4] = coeffs_Mylq2[4] * coeffs_Mylq2[0] / D[0]
711
- coeffs_Mylq2[0] = D[0]
712
- print(
713
- "Mylq 2nd order Normalized: T=%.2f(%.6f+%.6f/N)+%.6f*N+%.6f*N^2"
714
- % (
715
- coeffs_Mylq2[0],
716
- coeffs_Mylq2[1],
717
- coeffs_Mylq2[4],
718
- coeffs_Mylq2[2],
719
- coeffs_Mylq2[3],
720
- )
721
- )
722
-
723
- except Exception:
724
- print("Impossible to fit for 2nd order Mylq law : only %i elements" % len(D))
725
-
726
- if Curves:
727
- plt.xlabel("Number of Threads/work Items")
728
- plt.ylabel("Total Elapsed Time")
729
-
730
- (Experience,) = plt.plot(N, D, "ro")
731
- try:
732
- (pAmdahl,) = plt.plot(N, D_Amdahl, label="Loi de Amdahl")
733
- (pMylq,) = plt.plot(N, D_Mylq, label="Loi de Mylq")
734
- except Exception:
735
- print("Fit curves seem not to be available")
736
-
737
- plt.legend()
738
- plt.show()
739
-
740
-
741
- if __name__ == "__main__":
742
-
743
- # Set defaults values
744
-
745
- # GPU style can be Cuda (Nvidia implementation) or OpenCL
746
- GpuStyle = "OpenCL"
747
- # Iterations is integer
748
- Iterations = 1000000000
749
- # BlocksBlocks in first number of Blocks to explore
750
- BlocksBegin = 1024
751
- # BlocksEnd is last number of Blocks to explore
752
- BlocksEnd = 1024
753
- # BlocksStep is the step of Blocks to explore
754
- BlocksStep = 1
755
- # ThreadsBlocks in first number of Blocks to explore
756
- ThreadsBegin = 1
757
- # ThreadsEnd is last number of Blocks to explore
758
- ThreadsEnd = 1
759
- # ThreadsStep is the step of Blocks to explore
760
- ThreadsStep = 1
761
- # Redo is the times to redo the test to improve metrology
762
- Redo = 1
763
- # OutMetrology is method for duration estimation : False is GPU inside
764
- OutMetrology = False
765
- Metrology = "InMetro"
766
- # Curves is True to print the curves
767
- Curves = False
768
- # Fit is True to print the curves
769
- Fit = False
770
- # Inside based on If
771
- IfThen = False
772
- # Marsaglia RNG
773
- RNG = "MWC"
774
- # Value type : INT32, INT64, FP32, FP64
775
- ValueType = "FP32"
776
- # Seeds for RNG
777
- Seeds = 110271, 101008
778
-
779
- HowToUse = "%s -o (Out of Core Metrology) -c (Print Curves) -k (Case On IfThen) -d <DeviceId> -g <CUDA/OpenCL> -i <Iterations> -b <BlocksBegin> -e <BlocksEnd> -s <BlocksStep> -f <ThreadsFirst> -l <ThreadsLast> -t <ThreadssTep> -r <RedoToImproveStats> -m <SHR3/CONG/MWC/KISS> -v <INT32/INT64/FP32/FP64>" # noqa: E501
780
-
781
- try:
782
- opts, args = getopt.getopt(
783
- sys.argv[1:],
784
- "hockg:i:b:e:s:f:l:t:r:d:m:v:",
785
- [
786
- "gpustyle=",
787
- "iterations=",
788
- "blocksBegin=",
789
- "blocksEnd=",
790
- "blocksStep=",
791
- "threadsFirst=",
792
- "threadsLast=",
793
- "threadssTep=",
794
- "redo=",
795
- "device=",
796
- "marsaglia=",
797
- "valuetype=",
798
- ],
799
- )
800
- except getopt.GetoptError:
801
- print(HowToUse % sys.argv[0])
802
- sys.exit(2)
803
-
804
- # List of Devices
805
- Devices = []
806
- Alu = {}
807
-
808
- for opt, arg in opts:
809
- if opt == "-h":
810
- print(HowToUse % sys.argv[0])
811
-
812
- print("\nInformations about devices detected under OpenCL API:")
813
- # For PyOpenCL import
814
- try:
815
- import pyopencl as cl
816
-
817
- Id = 0
818
- for platform in cl.get_platforms():
819
- for device in platform.get_devices():
820
- # deviceType=cl.device_type.to_string(device.type)
821
- deviceType = "xPU"
822
- print(
823
- "Device #%i from %s of type %s : %s"
824
- % (
825
- Id,
826
- platform.vendor.lstrip(),
827
- deviceType,
828
- device.name.lstrip(),
829
- )
830
- )
831
- Id = Id + 1
832
-
833
- except Exception:
834
- print("Your platform does not seem to support OpenCL")
835
-
836
- print("\nInformations about devices detected under CUDA API:")
837
- # For PyCUDA import
838
- try:
839
- import pycuda.driver as cuda
840
-
841
- cuda.init()
842
- for Id in range(cuda.Device.count()):
843
- device = cuda.Device(Id)
844
- print("Device #%i of type GPU : %s" % (Id, device.name()))
845
- print
846
- except Exception:
847
- print("Your platform does not seem to support CUDA")
848
-
849
- sys.exit()
850
-
851
- elif opt == "-o":
852
- OutMetrology = True
853
- Metrology = "OutMetro"
854
- elif opt == "-c":
855
- Curves = True
856
- elif opt == "-k":
857
- IfThen = True
858
- elif opt in ("-d", "--device"):
859
- Devices.append(int(arg))
860
- elif opt in ("-g", "--gpustyle"):
861
- GpuStyle = arg
862
- elif opt in ("-m", "--marsaglia"):
863
- RNG = arg
864
- elif opt in ("-v", "--valuetype"):
865
- ValueType = arg
866
- elif opt in ("-i", "--iterations"):
867
- Iterations = numpy.uint64(arg)
868
- elif opt in ("-b", "--blocksbegin"):
869
- BlocksBegin = int(arg)
870
- BlocksEnd = BlocksBegin
871
- elif opt in ("-e", "--blocksend"):
872
- BlocksEnd = int(arg)
873
- elif opt in ("-s", "--blocksstep"):
874
- BlocksStep = int(arg)
875
- elif opt in ("-f", "--threadsfirst"):
876
- ThreadsBegin = int(arg)
877
- ThreadsEnd = ThreadsBegin
878
- elif opt in ("-l", "--threadslast"):
879
- ThreadsEnd = int(arg)
880
- elif opt in ("-t", "--threadsstep"):
881
- ThreadsStep = int(arg)
882
- elif opt in ("-r", "--redo"):
883
- Redo = int(arg)
884
-
885
- # If no device has been specified, take the first one!
886
- if len(Devices) == 0:
887
- Devices.append(0)
888
-
889
- print("Devices Identification : %s" % Devices)
890
- print("GpuStyle used : %s" % GpuStyle)
891
- print("Iterations : %s" % Iterations)
892
- print("Number of Blocks on begin : %s" % BlocksBegin)
893
- print("Number of Blocks on end : %s" % BlocksEnd)
894
- print("Step on Blocks : %s" % BlocksStep)
895
- print("Number of Threads on begin : %s" % ThreadsBegin)
896
- print("Number of Threads on end : %s" % ThreadsEnd)
897
- print("Step on Threads : %s" % ThreadsStep)
898
- print("Number of redo : %s" % Redo)
899
- print("Metrology done out of XPU : %r" % OutMetrology)
900
- print("Type of Marsaglia RNG used : %s" % RNG)
901
- print("Type of variable : %s" % ValueType)
902
-
903
- if GpuStyle == "CUDA":
904
- try:
905
- # For PyCUDA import
906
- import pycuda.driver as cuda
907
-
908
- cuda.init()
909
- for Id in range(cuda.Device.count()):
910
- device = cuda.Device(Id)
911
- print("Device #%i of type GPU : %s" % (Id, device.name()))
912
- if Id in Devices:
913
- Alu[Id] = "GPU"
914
-
915
- except ImportError:
916
- print("Platform does not seem to support CUDA")
917
-
918
- if GpuStyle == "OpenCL":
919
- try:
920
- # For PyOpenCL import
921
- import pyopencl as cl
922
-
923
- Id = 0
924
- for platform in cl.get_platforms():
925
- for device in platform.get_devices():
926
- # deviceType=cl.device_type.to_string(device.type)
927
- deviceType = "xPU"
928
- print(
929
- "Device #%i from %s of type %s : %s"
930
- % (
931
- Id,
932
- platform.vendor.lstrip().rstrip(),
933
- deviceType,
934
- device.name.lstrip().rstrip(),
935
- )
936
- )
937
-
938
- if Id in Devices:
939
- # Set the Alu as detected Device Type
940
- Alu[Id] = deviceType
941
- Id = Id + 1
942
- except ImportError:
943
- print("Platform does not seem to support OpenCL")
944
-
945
- # print(Devices,Alu)
946
-
947
- BlocksList = range(BlocksBegin, BlocksEnd + BlocksStep, BlocksStep)
948
- ThreadsList = range(ThreadsBegin, ThreadsEnd + ThreadsStep, ThreadsStep)
949
-
950
- ExploredJobs = numpy.array([]).astype(numpy.uint32)
951
- ExploredBlocks = numpy.array([]).astype(numpy.uint32)
952
- ExploredThreads = numpy.array([]).astype(numpy.uint32)
953
- avgD = numpy.array([]).astype(numpy.float32)
954
- medD = numpy.array([]).astype(numpy.float32)
955
- stdD = numpy.array([]).astype(numpy.float32)
956
- minD = numpy.array([]).astype(numpy.float32)
957
- maxD = numpy.array([]).astype(numpy.float32)
958
- avgR = numpy.array([]).astype(numpy.float32)
959
- medR = numpy.array([]).astype(numpy.float32)
960
- stdR = numpy.array([]).astype(numpy.float32)
961
- minR = numpy.array([]).astype(numpy.float32)
962
- maxR = numpy.array([]).astype(numpy.float32)
963
-
964
- for Blocks, Threads in itertools.product(BlocksList, ThreadsList):
965
-
966
- # print Blocks,Threads
967
- circle = numpy.zeros(Blocks * Threads).astype(numpy.uint64)
968
- ExploredJobs = numpy.append(ExploredJobs, Blocks * Threads)
969
- ExploredBlocks = numpy.append(ExploredBlocks, Blocks)
970
- ExploredThreads = numpy.append(ExploredThreads, Threads)
971
-
972
- if OutMetrology:
973
- DurationItem = numpy.array([]).astype(numpy.float32)
974
- Duration = numpy.array([]).astype(numpy.float32)
975
- Rate = numpy.array([]).astype(numpy.float32)
976
- for i in range(Redo):
977
- start = time.time()
978
- if GpuStyle == "CUDA":
979
- try:
980
- InputCU = {}
981
- InputCU["Iterations"] = Iterations
982
- InputCU["Steps"] = 1
983
- InputCU["Blocks"] = Blocks
984
- InputCU["Threads"] = Threads
985
- InputCU["Device"] = Devices[0]
986
- InputCU["RNG"] = RNG
987
- InputCU["Seeds"] = Seeds
988
- InputCU["ValueType"] = ValueType
989
- InputCU["IfThen"] = IfThen
990
- OutputCU = MetropolisCuda(InputCU)
991
- Inside = OutputCU["Circle"]
992
- NewIterations = OutputCU["NewIterations"]
993
- Duration = OutputCU["Duration"]
994
- except Exception:
995
- print(
996
- "Problem with (%i,%i) // computations on Cuda"
997
- % (Blocks, Threads)
998
- )
999
- elif GpuStyle == "OpenCL":
1000
- try:
1001
- InputCL = {}
1002
- InputCL["Iterations"] = Iterations
1003
- InputCL["Steps"] = 1
1004
- InputCL["Blocks"] = Blocks
1005
- InputCL["Threads"] = Threads
1006
- InputCL["Device"] = Devices[0]
1007
- InputCL["RNG"] = RNG
1008
- InputCL["Seeds"] = Seeds
1009
- InputCL["ValueType"] = ValueType
1010
- InputCL["IfThen"] = IfThen
1011
- OutputCL = MetropolisOpenCL(InputCL)
1012
- Inside = OutputCL["Circle"]
1013
- NewIterations = OutputCL["NewIterations"]
1014
- Duration = OutputCL["Duration"]
1015
- except Exception:
1016
- print(
1017
- "Problem with (%i,%i) // computations on OpenCL"
1018
- % (Blocks, Threads)
1019
- )
1020
- Duration = numpy.append(Duration, time.time() - start)
1021
- Rate = numpy.append(Rate, NewIterations / Duration[-1])
1022
- else:
1023
- if GpuStyle == "CUDA":
1024
- try:
1025
- InputCU = {}
1026
- InputCU["Iterations"] = Iterations
1027
- InputCU["Steps"] = Redo
1028
- InputCU["Blocks"] = Blocks
1029
- InputCU["Threads"] = Threads
1030
- InputCU["Device"] = Devices[0]
1031
- InputCU["RNG"] = RNG
1032
- InputCU["Seeds"] = Seeds
1033
- InputCU["ValueType"] = ValueType
1034
- InputCU["IfThen"] = IfThen
1035
- OutputCU = MetropolisCuda(InputCU)
1036
- Inside = OutputCU["Inside"]
1037
- NewIterations = OutputCU["NewIterations"]
1038
- Duration = OutputCU["Duration"]
1039
- pycuda.context.pop() # noqa: F821
1040
- except Exception:
1041
- print(
1042
- "Problem with (%i,%i) // computations on Cuda"
1043
- % (Blocks, Threads)
1044
- )
1045
- elif GpuStyle == "OpenCL":
1046
- try:
1047
- InputCL = {}
1048
- InputCL["Iterations"] = Iterations
1049
- InputCL["Steps"] = Redo
1050
- InputCL["Blocks"] = Blocks
1051
- InputCL["Threads"] = Threads
1052
- InputCL["Device"] = Devices[0]
1053
- InputCL["RNG"] = RNG
1054
- InputCL["Seeds"] = Seeds
1055
- InputCL["ValueType"] = ValueType
1056
- InputCL["IfThen"] = IfThen
1057
- OutputCL = MetropolisOpenCL(InputCL)
1058
- Inside = OutputCL["Inside"]
1059
- NewIterations = OutputCL["NewIterations"]
1060
- Duration = OutputCL["Duration"]
1061
- except Exception:
1062
- print(
1063
- "Problem with (%i,%i) // computations on OpenCL"
1064
- % (Blocks, Threads)
1065
- )
1066
- Rate = NewIterations / Duration[-1]
1067
- print(
1068
- "Itops %i\nLogItops %.2f "
1069
- % (int(Rate), numpy.log(Rate) / numpy.log(10))
1070
- )
1071
- print("Pi estimation %.8f" % (4.0 / NewIterations * Inside))
1072
-
1073
- avgD = numpy.append(avgD, numpy.average(Duration))
1074
- medD = numpy.append(medD, numpy.median(Duration))
1075
- stdD = numpy.append(stdD, numpy.std(Duration))
1076
- minD = numpy.append(minD, numpy.min(Duration))
1077
- maxD = numpy.append(maxD, numpy.max(Duration))
1078
- avgR = numpy.append(avgR, numpy.average(Rate))
1079
- medR = numpy.append(medR, numpy.median(Rate))
1080
- stdR = numpy.append(stdR, numpy.std(Rate))
1081
- minR = numpy.append(minR, numpy.min(Rate))
1082
- maxR = numpy.append(maxR, numpy.max(Rate))
1083
-
1084
- print(
1085
- "%.2f %.2f %.2f %.2f %.2f %i %i %i %i %i"
1086
- % (
1087
- avgD[-1],
1088
- medD[-1],
1089
- stdD[-1],
1090
- minD[-1],
1091
- maxD[-1],
1092
- avgR[-1],
1093
- medR[-1],
1094
- stdR[-1],
1095
- minR[-1],
1096
- maxR[-1],
1097
- )
1098
- )
1099
-
1100
- numpy.savez(
1101
- "Pi_%s_%s_%s_%s_%s_%s_%s_%s_%.8i_Device%i_%s_%s"
1102
- % (
1103
- ValueType,
1104
- RNG,
1105
- Alu[Devices[0]],
1106
- GpuStyle,
1107
- BlocksBegin,
1108
- BlocksEnd,
1109
- ThreadsBegin,
1110
- ThreadsEnd,
1111
- Iterations,
1112
- Devices[0],
1113
- Metrology,
1114
- gethostname(),
1115
- ),
1116
- (
1117
- ExploredBlocks,
1118
- ExploredThreads,
1119
- avgD,
1120
- medD,
1121
- stdD,
1122
- minD,
1123
- maxD,
1124
- avgR,
1125
- medR,
1126
- stdR,
1127
- minR,
1128
- maxR,
1129
- ),
1130
- )
1131
- ToSave = [
1132
- ExploredBlocks,
1133
- ExploredThreads,
1134
- avgD,
1135
- medD,
1136
- stdD,
1137
- minD,
1138
- maxD,
1139
- avgR,
1140
- medR,
1141
- stdR,
1142
- minR,
1143
- maxR,
1144
- ]
1145
- numpy.savetxt(
1146
- "Pi_%s_%s_%s_%s_%s_%s_%s_%i_%.8i_Device%i_%s_%s"
1147
- % (
1148
- ValueType,
1149
- RNG,
1150
- Alu[Devices[0]],
1151
- GpuStyle,
1152
- BlocksBegin,
1153
- BlocksEnd,
1154
- ThreadsBegin,
1155
- ThreadsEnd,
1156
- Iterations,
1157
- Devices[0],
1158
- Metrology,
1159
- gethostname(),
1160
- ),
1161
- numpy.transpose(ToSave),
1162
- fmt="%i %i %e %e %e %e %e %i %i %i %i %i",
1163
- )
1164
-
1165
- if Fit:
1166
- FitAndPrint(ExploredJobs, median, Curves) # noqa: F821, E501 # FIXME: undefined var 'median'