pyopencl 2024.2.7__cp312-cp312-win_amd64.whl → 2024.3__cp312-cp312-win_amd64.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.

This version of pyopencl might be problematic.

Files changed (38)
  1. pyopencl/__init__.py +127 -122
  2. pyopencl/_cl.cp312-win_amd64.pyd +0 -0
  3. pyopencl/_mymako.py +3 -3
  4. pyopencl/algorithm.py +10 -7
  5. pyopencl/array.py +50 -40
  6. pyopencl/bitonic_sort.py +3 -1
  7. pyopencl/bitonic_sort_templates.py +1 -1
  8. pyopencl/cache.py +23 -22
  9. pyopencl/capture_call.py +5 -4
  10. pyopencl/clrandom.py +1 -0
  11. pyopencl/compyte/dtypes.py +4 -4
  12. pyopencl/compyte/pyproject.toml +54 -0
  13. pyopencl/elementwise.py +9 -2
  14. pyopencl/invoker.py +11 -9
  15. pyopencl/ipython_ext.py +1 -1
  16. pyopencl/reduction.py +16 -10
  17. pyopencl/scan.py +38 -22
  18. pyopencl/tools.py +23 -13
  19. {pyopencl-2024.2.7.dist-info → pyopencl-2024.3.dist-info}/METADATA +11 -8
  20. pyopencl-2024.3.dist-info/RECORD +42 -0
  21. {pyopencl-2024.2.7.dist-info → pyopencl-2024.3.dist-info}/WHEEL +1 -1
  22. pyopencl/compyte/.git +0 -1
  23. pyopencl/compyte/ndarray/Makefile +0 -31
  24. pyopencl/compyte/ndarray/__init__.py +0 -0
  25. pyopencl/compyte/ndarray/gen_elemwise.py +0 -1907
  26. pyopencl/compyte/ndarray/gen_reduction.py +0 -1511
  27. pyopencl/compyte/ndarray/gpu_ndarray.h +0 -35
  28. pyopencl/compyte/ndarray/pygpu_language.h +0 -207
  29. pyopencl/compyte/ndarray/pygpu_language_cuda.cu +0 -622
  30. pyopencl/compyte/ndarray/pygpu_language_opencl.cpp +0 -317
  31. pyopencl/compyte/ndarray/pygpu_ndarray.cpp +0 -1546
  32. pyopencl/compyte/ndarray/pygpu_ndarray.h +0 -71
  33. pyopencl/compyte/ndarray/pygpu_ndarray_object.h +0 -232
  34. pyopencl/compyte/ndarray/setup_opencl.py +0 -101
  35. pyopencl/compyte/ndarray/test_gpu_elemwise.py +0 -411
  36. pyopencl/compyte/ndarray/test_gpu_ndarray.py +0 -487
  37. pyopencl-2024.2.7.dist-info/RECORD +0 -56
  38. {pyopencl-2024.2.7.dist-info → pyopencl-2024.3.dist-info}/licenses/LICENSE +0 -0
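
Items 23–36 above show that the entire pyopencl/compyte/ndarray source tree was dropped in 2024.3; the deleted pygpu_language_cuda.cu is reproduced below. As a minimal, hypothetical sketch (the helper has_compyte_ndarray is illustrative and not part of pyopencl's API), downstream code that still imported the removed pyopencl.compyte.ndarray package could detect its absence and report the installed pyopencl version:

    # Hedged sketch: guard for code that relied on the legacy compyte.ndarray sources.
    from importlib.metadata import PackageNotFoundError, version

    def has_compyte_ndarray() -> bool:
        """Return True if the legacy pyopencl.compyte.ndarray package is importable."""
        try:
            import pyopencl.compyte.ndarray  # noqa: F401  -- removed in pyopencl 2024.3
        except ImportError:
            return False
        return True

    try:
        print("pyopencl:", version("pyopencl"))
    except PackageNotFoundError:
        print("pyopencl is not installed")

    if not has_compyte_ndarray():
        print("legacy compyte.ndarray sources are not present")

If the removed sources were genuinely needed, pinning pyopencl<2024.3 is the straightforward alternative; otherwise pyopencl.array remains the supported array interface.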
pyopencl/compyte/ndarray/pygpu_language_cuda.cu (item 29 above, deleted in full)
@@ -1,622 +0,0 @@
- #include <pygpu_ndarray_object.h>
- #include <pygpu_language.h>
-
- #include <cublas.h>
-
- #ifdef __DEVICE_EMULATION__
- #define NUM_VECTOR_OP_BLOCKS 4096
- #define NUM_VECTOR_OP_THREADS_PER_BLOCK 1 //This prevents printf from getting tangled up
- #else
- #define NUM_VECTOR_OP_BLOCKS 4096 //Max number of blocks to launch. Should be read from device properties. (#10)
- #define NUM_VECTOR_OP_THREADS_PER_BLOCK 256 //Should be read from device properties. (#10)
- #endif
-
- #if 0
- // Do not wait after every kernel & transfer.
- #define CNDA_THREAD_SYNC
- #else
- // This is useful for using normal profiling tools
- #define CNDA_THREAD_SYNC cudaThreadSynchronize();
- #endif
-
- #ifndef SHARED_SIZE
- #define SHARED_SIZE (16*1024)
- #endif
-
-
- char *
- cublasGetErrorString(cublasStatus err)
- {
-     if (err == CUBLAS_STATUS_NOT_INITIALIZED) {
-         return "CUBLAS_STATUS_NOT_INITIALIZED";
-     } else if (err == CUBLAS_STATUS_ALLOC_FAILED){
-         return "CUBLAS_STATUS_ALLOC_FAILED";
-     } else if (err == CUBLAS_STATUS_INVALID_VALUE){
-         return "CUBLAS_STATUS_INVALID_VALUE";
-     } else if (err == CUBLAS_STATUS_MAPPING_ERROR){
-         return "CUBLAS_STATUS_MAPPING_ERROR";
-     } else if (err == CUBLAS_STATUS_EXECUTION_FAILED){
-         return "CUBLAS_STATUS_EXECUTION_FAILED";
-     } else if (err == CUBLAS_STATUS_INTERNAL_ERROR){
-         return "CUBLAS_STATUS_INTERNAL_ERROR";
-     } else {
-         return "UNKNOW ERROR";
-     }
-
- }
-
- /////////////////////////
- // Alloc and Free
- /////////////////////////
- void * device_malloc(size_t size)
- {
-     void * rval=NULL;
-     cudaError_t err = cudaMalloc(&rval, size);
-     if (cudaSuccess != err){
- #if COMPUTE_GPU_MEM_USED
-         fprintf(stderr, "Error allocating %li bytes of device memory (%s). %d already allocated\n",
-                 (long)size, cudaGetErrorString(err),_allocated_size);
- #else
-         fprintf(stderr, "Error allocating %li bytes of device memory (%s).\n",
-                 (long)size, cudaGetErrorString(err));
- #endif
-         PyErr_Format(PyExc_MemoryError, "Error allocating %li bytes of device memory (%s).",
-                 (long)size, cudaGetErrorString(err));
-         return NULL;
-     }
-     _outstanding_mallocs[0] += (rval != NULL);
- #if COMPUTE_GPU_MEM_USED
-     for(int i=0;i<TABLE_SIZE;i++){
-         if(NULL==_alloc_size_table[i].ptr){
-             _alloc_size_table[i].ptr=rval;
-             _alloc_size_table[i].size=size;
-             break;
-         }
-     }
-     _allocated_size += size;
-     DPRINTF("allocated %li bytes of device memory (%s). %d already allocated, ptr: %p\n",
-             (long)size, cudaGetErrorString(err),_allocated_size,rval);
- #else
-     DPRINTF("allocated %li bytes of device memory (%s). ptr: %p\n",
-             (long)size, cudaGetErrorString(err),rval);
-
- #endif
-
-     if(ALLOC_MEMSET){
-         //We init them to nan to make sure we catch more debug case.
-         cudaMemset(rval, 0xFF, size);
-         //printf("MEMSET\n");
-     }
-     return rval;
- }
-
- int device_free(void *ptr)
- {
-     // if there is no gpu context, the call to cudaFree will fail; skip it entirely
-     /*if(!g_gpu_context_active){
-         return 0;
-     }*/
-     cudaError_t err = cudaFree(ptr);
-     if (cudaSuccess != err){
- #if COMPUTE_GPU_MEM_USED
-         fprintf(stderr, "Error freeing device pointer %p (%s).%d byte already allocated\n",
-                 ptr, cudaGetErrorString(err), _allocated_size);
- #else
-         fprintf(stderr, "Error freeing device pointer %p (%s).\n",
-                 ptr, cudaGetErrorString(err));
- #endif
-         PyErr_Format(PyExc_MemoryError, "error freeing device pointer %p (%s)",
-                 ptr, cudaGetErrorString(err));
-         return -1;
-     }
-     _outstanding_mallocs[0] -= (ptr != NULL);
- #if COMPUTE_GPU_MEM_USED
-     int i=0;
-     size_t total_freed = 0;
-     for(;i<TABLE_SIZE;i++)
-         if(_alloc_size_table[i].ptr==ptr){
-             _allocated_size -= _alloc_size_table[i].size;
-             total_freed += _alloc_size_table[i].size;
-             _alloc_size_table[i].ptr=0;
-             _alloc_size_table[i].size=0;
-
-             break;
-         }
-     if(i==TABLE_SIZE)
-         printf("Unallocated unknow size!\n");
-     DPRINTF("freed %li bytes of device memory (%s). %d already allocated, ptr=%p\n", (long)total_freed, cudaGetErrorString(err),_allocated_size,ptr);
- #endif
-     return 0;
- }
- //make the rightmost coords change fastest
- //TODO: why does a downward for-loop not work????
- //TODO: skip the last division (when d == 0)
- #define decl_k_elemwise_unary_rowmajor(name, F, DTYPE) \
- __global__ void name (unsigned int numEls, \
-         unsigned int nd, \
-         const ssize_t * dim, \
-         const DTYPE * a_data, const ssize_t * a_str, \
-         DTYPE * z_data, const ssize_t * z_str) \
- { \
-     const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; \
-     const unsigned int numThreads = blockDim.x * gridDim.x; \
- \
-     for (unsigned int i = idx; i < numEls; i += numThreads) { \
-         unsigned int ii = i; \
-         const DTYPE * a_i = a_data; \
-         DTYPE * z_i = z_data; \
-         for (unsigned int _d = 0; _d < nd; ++_d) { \
-             unsigned int d = nd - _d-1; \
-             /* i_d used to be unsigned, but their is a bug in nvcc 3.0. making it signed fix the bug.*/\
-             int i_d = ii % dim[d]; /* i_d is our position in the d'th dimension */ \
-             ii = ii / dim[d]; \
-             a_i += i_d * (a_str[d]/sizeof(DTYPE)); /* increment our a and z pointers by i_d elements */ \
-             z_i += i_d * (z_str[d]/sizeof(DTYPE)); \
-         } \
-         z_i[0] = F(a_i[0]); \
-     } \
- }
-
- template<typename T> __device__ T unary_copy(T a) { return a; }
- decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_copy_float, unary_copy<float>, float)
- decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_copy_double, unary_copy<double>, double)
- decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_copy_uint8, unary_copy<uint8_t>, uint8_t)
- decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_copy_int8, unary_copy<int8_t>, int8_t)
- decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_copy_uint16, unary_copy<uint16_t>, uint16_t)
- decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_copy_int16, unary_copy<int16_t>, int16_t)
- decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_copy_uint32, unary_copy<uint32_t>, uint32_t)
- decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_copy_int32, unary_copy<int32_t>, int32_t)
- decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_copy_uint64, unary_copy<uint64_t>, uint64_t)
- decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_copy_int64, unary_copy<int64_t>, int64_t)
- decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_copy_complex64, unary_copy<npy_complex64>, npy_complex64)
- decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_copy_complex128, unary_copy<npy_complex128>, npy_complex128)
-
- //template<typename T> __device__ T unary_exp(T a) { return exp(a); }
- //decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_exp, unary_exp<float>)
-
- template<typename T>
- static __global__ void k_copy_1d(const int N, const T * x, const ssize_t sx, T * y, const ssize_t sy)
- {
-     for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < N; i += gridDim.x*blockDim.x) {
-         y[i*sy] = x[i*sx];
-     }
- }
-
- //copy from other into self
- //don't allocated memory
- int
- PyGpuNdArray_CopyFromPyGpuNdArray(PyGpuNdArrayObject * self, PyGpuNdArrayObject * other, bool unbroadcast)
- {
-     DPRINTF("PyGpuNdArray_CopyFromPyGpuNdArray start nd=%d\n", PyGpuNdArray_NDIM(self));
-     assert(PyGpuNdArray_TYPE(self) == PyGpuNdArray_TYPE(other));
-     assert(PyGpuNdArray_ISWRITEABLE(self));
-     //standard elemwise size checks
-     if (PyGpuNdArray_NDIM(self) == -1) {
-         PyErr_SetString(PyExc_TypeError, "can't copy into un-initialized PyGpuNdArrayObject");
-         return -1;
-     }
-     if (PyGpuNdArray_NDIM(self) != PyGpuNdArray_NDIM(other)) {
-         PyErr_Format(PyExc_NotImplementedError, "PyGpuNdArray_CopyFromPyGpuNdArray: need same number of dims. destination nd=%d, source nd=%d. No broadcasting implemented.", PyGpuNdArray_NDIM(self), PyGpuNdArray_NDIM(other));
-         return -1;
-     }
-     //standard elemwise dim checks (also compute total size)
-     unsigned int size = 1;
-     unsigned int size_source = 1;
-     for (int i = 0; i< PyGpuNdArray_NDIM(self); ++i) {
-         if ((PyGpuNdArray_DIMS(self)[i] != PyGpuNdArray_DIMS(other)[i])
-             && (1!=PyGpuNdArray_DIMS(other)[i] || !unbroadcast) ) {
-             PyErr_Format(PyExc_ValueError, "need same dimensions for dim %d, destination=%ld, source=%ld",
-                     i, PyGpuNdArray_DIMS(self)[i], PyGpuNdArray_DIMS(other)[i]);
-             return -1;
-         }
-         size *= (unsigned int) PyGpuNdArray_DIMS(self)[i];
-         size_source *= (unsigned int) PyGpuNdArray_DIMS(other)[i];
-     }
-     if (0 == size) {
-         return 0; //nothing to copy, we're done.
-     }
-
-     //cublas don't support negative stride
-     bool pos_stride = true;
-     for (int i = 0; i < PyGpuNdArray_NDIM(other); ++i)
-         if (PyGpuNdArray_STRIDE(other,i)<0)
-             pos_stride = false;
-
-     void * other_data = PyGpuNdArray_DATA(other) + PyGpuNdArray_OFFSET(other);
-     void * self_data = PyGpuNdArray_DATA(self) + PyGpuNdArray_OFFSET(self);
-
-     //Try to transfer with cublas(we suppose it is faster)
-     if (PyGpuNdArray_ISCONTIGUOUS(self) && PyGpuNdArray_ISCONTIGUOUS(other) &&
-             size == size_source && PyGpuNdArray_TYPE(self) == NPY_FLOAT32 &&
-             pos_stride
-             ) {
-         cublasScopy(size, (float*) other_data, 1, (float*) self_data, 1);
-         CNDA_THREAD_SYNC;
-         if (CUBLAS_STATUS_SUCCESS != cublasGetError()) {
-             PyErr_SetString(PyExc_RuntimeError, "Error copying memory");
-             return -1;
-         }
-
-         DPRINTF("PyGpuNdArray_CopyFromPyGpuNdArray: cublasScopy end\n");
-         return 0;
-     }
-     if (PyGpuNdArray_ISCONTIGUOUS(self) && PyGpuNdArray_ISCONTIGUOUS(other) &&
-             size == size_source && PyGpuNdArray_TYPE(self) == NPY_FLOAT64 &&
-             pos_stride) {
-         cublasDcopy(size, (double*) other_data, 1, (double*) self_data, 1);
-         CNDA_THREAD_SYNC;
-         if (CUBLAS_STATUS_SUCCESS != cublasGetError()) {
-             PyErr_SetString(PyExc_RuntimeError, "Error copying memory");
-             return -1;
-         }
-         DPRINTF("PyGpuNdArray_CopyFromPyGpuNdArray cublasDcopy end\n");
-         return 0;
-     }
-
-     //TODO: rewrite these copy operations to be more efficient
-     //      See, for example the transpose example in the cuda_sdk.
-     switch (PyGpuNdArray_NDIM(self)) {
-     case 0: // scalar
-         {
-             // THIS CASE SHOULD NEVER HAPPEN BECAUSE SCALARS ARE ALWAYS C CONTIGUOUS
-             assert(0);
-         }; break;
-     case 1: // vector
-         {
-             assert(PyGpuNdArray_ISALIGNED(self));
-             assert(PyGpuNdArray_ISALIGNED(other));
-             DPRINTF("PyGpuNdArray_CopyFromPyGpuNdArray: Copying non-contiguous vector\n");
-             unsigned int n_blocks = min(size, (unsigned int)NUM_VECTOR_OP_BLOCKS);
-             unsigned int n_threads = min(ceil_intdiv(size, n_blocks), (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
-
-             if (PyGpuNdArray_TYPE(self) == NPY_FLOAT32) {
-                 const int elsize = sizeof(float);
-                 k_copy_1d<<<n_blocks, n_threads>>>(size,
-                         (float*)other_data,
-                         PyGpuNdArray_STRIDES(other)[0]/elsize,
-                         (float*)self_data,
-                         PyGpuNdArray_STRIDES(self)[0]/elsize);
-             } else if (PyGpuNdArray_TYPE(self) == NPY_FLOAT64) {
-                 const int elsize = sizeof(double);
-                 k_copy_1d<<<n_blocks, n_threads>>>(size,
-                         (double*)other_data,
-                         PyGpuNdArray_STRIDES(other)[0]/elsize,
-                         (double*)self_data,
-                         PyGpuNdArray_STRIDES(self)[0]/elsize);
-             } else if (PyGpuNdArray_TYPE(self) == NPY_INT8) {
-                 const int elsize = sizeof(int8_t);
-                 k_copy_1d<<<n_blocks, n_threads>>>(size,
-                         (int8_t*)other_data,
-                         PyGpuNdArray_STRIDES(other)[0]/elsize,
-                         (int8_t*)self_data,
-                         PyGpuNdArray_STRIDES(self)[0]/elsize);
-             } else if (PyGpuNdArray_TYPE(self) == NPY_INT16) {
-                 const int elsize = sizeof(int16_t);
-                 k_copy_1d<<<n_blocks, n_threads>>>(size,
-                         (int16_t*)other_data,
-                         PyGpuNdArray_STRIDES(other)[0]/elsize,
-                         (int16_t*)self_data,
-                         PyGpuNdArray_STRIDES(self)[0]/elsize);
-             } else if (PyGpuNdArray_TYPE(self) == NPY_INT32) {
-                 const int elsize = sizeof(int32_t);
-                 k_copy_1d<<<n_blocks, n_threads>>>(size,
-                         (int32_t*)other_data,
-                         PyGpuNdArray_STRIDES(other)[0]/elsize,
-                         (int32_t*)self_data,
-                         PyGpuNdArray_STRIDES(self)[0]/elsize);
-             } else if (PyGpuNdArray_TYPE(self) == NPY_INT64) {
-                 const int elsize = sizeof(int64_t);
-                 k_copy_1d<<<n_blocks, n_threads>>>(size,
-                         (int64_t*)other_data,
-                         PyGpuNdArray_STRIDES(other)[0]/elsize,
-                         (int64_t*)self_data,
-                         PyGpuNdArray_STRIDES(self)[0]/elsize);
-             } else if (PyGpuNdArray_TYPE(self) == NPY_UINT8) {
-                 const int elsize = sizeof(uint8_t);
-                 k_copy_1d<<<n_blocks, n_threads>>>(size,
-                         (uint8_t*)other_data,
-                         PyGpuNdArray_STRIDES(other)[0]/elsize,
-                         (uint8_t*)self_data,
-                         PyGpuNdArray_STRIDES(self)[0]/elsize);
-             } else if (PyGpuNdArray_TYPE(self) == NPY_UINT16) {
-                 const int elsize = sizeof(uint16_t);
-                 k_copy_1d<<<n_blocks, n_threads>>>(size,
-                         (uint16_t*)other_data,
-                         PyGpuNdArray_STRIDES(other)[0]/elsize,
-                         (uint16_t*)self_data,
-                         PyGpuNdArray_STRIDES(self)[0]/elsize);
-             } else if (PyGpuNdArray_TYPE(self) == NPY_UINT32) {
-                 const int elsize = sizeof(uint32_t);
-                 k_copy_1d<<<n_blocks, n_threads>>>(size,
-                         (uint32_t*)other_data,
-                         PyGpuNdArray_STRIDES(other)[0]/elsize,
-                         (uint32_t*)self_data,
-                         PyGpuNdArray_STRIDES(self)[0]/elsize);
-             } else if (PyGpuNdArray_TYPE(self) == NPY_UINT64) {
-                 const int elsize = sizeof(uint64_t);
-                 k_copy_1d<<<n_blocks, n_threads>>>(size,
-                         (uint64_t*)other_data,
-                         PyGpuNdArray_STRIDES(other)[0]/elsize,
-                         (uint64_t*)self_data,
-                         PyGpuNdArray_STRIDES(self)[0]/elsize);
-             } else if (PyGpuNdArray_TYPE(self) == NPY_COMPLEX64) {
-                 const int elsize = sizeof(npy_complex64);
-                 k_copy_1d<<<n_blocks, n_threads>>>(size,
-                         (npy_complex64*)other_data,
-                         PyGpuNdArray_STRIDES(other)[0]/elsize,
-                         (npy_complex64*)self_data,
-                         PyGpuNdArray_STRIDES(self)[0]/elsize);
-             } else if (PyGpuNdArray_TYPE(self) == NPY_COMPLEX128) {
-                 const int elsize = sizeof(npy_complex128);
-                 k_copy_1d<<<n_blocks, n_threads>>>(size,
-                         (npy_complex128*)other_data,
-                         PyGpuNdArray_STRIDES(other)[0]/elsize,
-                         (npy_complex128*)self_data,
-                         PyGpuNdArray_STRIDES(self)[0]/elsize);
-             } else {
-                 PyErr_Format(PyExc_NotImplementedError, "PyGpuNdArray_CopyFromPyGpuNdArray: Don't implement copy for this dtype\n");
-                 return -1;
-             }
-
-             CNDA_THREAD_SYNC;
-             cudaError_t err = cudaGetLastError();
-             if( cudaSuccess != err) {
-                 PyErr_Format(PyExc_RuntimeError, "Cuda error: %s: %s. (n_blocks=%i, n_threads_per_block=%i)\n", "k_copy_1d", cudaGetErrorString(err), n_blocks, n_threads);
-                 return -1;
-             }
-         }; break;
-     default:
-         {
-             assert (cudaSuccess == cudaGetLastError());
-             assert(PyGpuNdArray_ISALIGNED(self));
-             assert(PyGpuNdArray_ISALIGNED(other));
-
-             DPRINTF("PyGpuNdArray_CopyFromPyGpuNdArray: Copying with default version unbroadcast=%d\n", unbroadcast);
-             // Identigy the dim of the output memory.
-             PyGpuNdArrayObject * cuda_dims = other;
-             if(unbroadcast)
-                 cuda_dims = self;
-
-             // Move the dim and strides information on the gpu memory
-             int ndim = PyGpuNdArray_NDIM(other);
-             void * strides_dev = device_malloc(sizeof(ssize_t)*ndim*3);
-             ssize_t * strides_dev_p = (ssize_t *) strides_dev;
-             cudaError_t err = cudaMemcpy(strides_dev, PyGpuNdArray_DIMS(cuda_dims), ndim*sizeof(ssize_t),cudaMemcpyHostToDevice);
-             if (err != cudaSuccess){
-                 PyErr_Format(PyExc_RuntimeError, "Cuda error when copying memory1: %s", cudaGetErrorString(err));
-                 return -1;
-             }
-             err = cudaMemcpy((void*)(strides_dev_p+ndim), PyGpuNdArray_STRIDES(other), ndim*sizeof(ssize_t),cudaMemcpyHostToDevice);
-             if (err != cudaSuccess){
-                 PyErr_Format(PyExc_RuntimeError, "Cuda error when copying memory2: %s", cudaGetErrorString(err));
-                 return -1;
-             }
-             err = cudaMemcpy((void*)(strides_dev_p+(ndim*2)), PyGpuNdArray_STRIDES(self), ndim*sizeof(ssize_t), cudaMemcpyHostToDevice);
-             if (err != cudaSuccess){
-                 PyErr_Format(PyExc_RuntimeError, "Cuda error when copying memory3: %s", cudaGetErrorString(err));
-                 return -1;
-             }
-             void * strides_host = malloc(sizeof(ssize_t)*ndim*3);
-             err = cudaMemcpy(strides_host, strides_dev, ndim*3*sizeof(ssize_t),cudaMemcpyDeviceToHost);
-             if (err != cudaSuccess){
-                 PyErr_Format(PyExc_RuntimeError, "Cuda error when copying memory4: %s", cudaGetErrorString(err));
-                 return -1;
-             }
- #ifdef DEBUG
-             for(int i=0;i<3*ndim;i++)
-                 DPRINTF(" %ld", ((ssize_t *)strides_host)[i]);
-             DPRINTF("\n");
- #endif
-             CNDA_THREAD_SYNC;
-             if(cudaSuccess != cudaGetLastError()){
-                 PyErr_Format(PyExc_NotImplementedError, "PyGpuNdArray_CopyFromPyGpuNdArray: error before copy\n");
-                 return -1;
-             }
-
-             // call worker routine
-             unsigned int n_blocks = min(size, (unsigned int)NUM_VECTOR_OP_BLOCKS);
-             unsigned int threads_per_block = min(ceil_intdiv(size, n_blocks), (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
-
-             if ( PyGpuNdArray_TYPE(self) == NPY_FLOAT32) {
-                 k_elemwise_unary_rowmajor_copy_float<<<n_blocks, threads_per_block>>>(
-                         size,
-                         (unsigned int)ndim,
-                         strides_dev_p,
-                         (const float*)other_data,
-                         strides_dev_p+ndim,
-                         (float*) self_data,
-                         strides_dev_p+(ndim*2));
-             } else if ( PyGpuNdArray_TYPE(self) == NPY_FLOAT64) {
-                 k_elemwise_unary_rowmajor_copy_double<<<n_blocks, threads_per_block>>>(
-                         size,
-                         (unsigned int)ndim,
-                         strides_dev_p,
-                         (const double*)other_data,
-                         strides_dev_p+ndim,
-                         (double*) self_data,
-                         strides_dev_p+(ndim*2));
-             } else if ( PyGpuNdArray_TYPE(self) == NPY_INT8) {
-                 k_elemwise_unary_rowmajor_copy_int8<<<n_blocks, threads_per_block>>>(
-                         size,
-                         (unsigned int)ndim,
-                         strides_dev_p,
-                         (const int8_t*)other_data,
-                         strides_dev_p+ndim,
-                         (int8_t*) self_data,
-                         strides_dev_p+(ndim*2));
-             } else if ( PyGpuNdArray_TYPE(self) == NPY_INT16) {
-                 k_elemwise_unary_rowmajor_copy_int16<<<n_blocks, threads_per_block>>>(
-                         size,
-                         (unsigned int)ndim,
-                         strides_dev_p,
-                         (const int16_t*)other_data,
-                         strides_dev_p+ndim,
-                         (int16_t*) self_data,
-                         strides_dev_p+(ndim*2));
-             } else if ( PyGpuNdArray_TYPE(self) == NPY_INT32) {
-                 k_elemwise_unary_rowmajor_copy_int32<<<n_blocks, threads_per_block>>>(
-                         size,
-                         (unsigned int)ndim,
-                         strides_dev_p,
-                         (const int32_t*)other_data,
-                         strides_dev_p+ndim,
-                         (int32_t*) self_data,
-                         strides_dev_p+(ndim*2));
-             } else if ( PyGpuNdArray_TYPE(self) == NPY_INT64) {
-                 k_elemwise_unary_rowmajor_copy_int64<<<n_blocks, threads_per_block>>>(
-                         size,
-                         (unsigned int)ndim,
-                         strides_dev_p,
-                         (const int64_t*)other_data,
-                         strides_dev_p+ndim,
-                         (int64_t*) self_data,
-                         strides_dev_p+(ndim*2));
-             } else if ( PyGpuNdArray_TYPE(self) == NPY_UINT8) {
-                 k_elemwise_unary_rowmajor_copy_uint8<<<n_blocks, threads_per_block>>>(
-                         size,
-                         (unsigned int)ndim,
-                         strides_dev_p,
-                         (const uint8_t*)other_data,
-                         strides_dev_p+ndim,
-                         (uint8_t*) self_data,
-                         strides_dev_p+(ndim*2));
-             } else if ( PyGpuNdArray_TYPE(self) == NPY_UINT16) {
-                 k_elemwise_unary_rowmajor_copy_uint16<<<n_blocks, threads_per_block>>>(
-                         size,
-                         (unsigned int)ndim,
-                         strides_dev_p,
-                         (const uint16_t*)other_data,
-                         strides_dev_p+ndim,
-                         (uint16_t*) self_data,
-                         strides_dev_p+(ndim*2));
-             } else if ( PyGpuNdArray_TYPE(self) == NPY_UINT32) {
-                 k_elemwise_unary_rowmajor_copy_uint32<<<n_blocks, threads_per_block>>>(
-                         size,
-                         (unsigned int)ndim,
-                         strides_dev_p,
-                         (const uint32_t*)other_data,
-                         strides_dev_p+ndim,
-                         (uint32_t*) self_data,
-                         strides_dev_p+(ndim*2));
-             } else if ( PyGpuNdArray_TYPE(self) == NPY_UINT64) {
-                 k_elemwise_unary_rowmajor_copy_uint64<<<n_blocks, threads_per_block>>>(
-                         size,
-                         (unsigned int)ndim,
-                         strides_dev_p,
-                         (const uint64_t*)other_data,
-                         strides_dev_p+ndim,
-                         (uint64_t*) self_data,
-                         strides_dev_p+(ndim*2));
-             } else if ( PyGpuNdArray_TYPE(self) == NPY_COMPLEX64) {
-                 k_elemwise_unary_rowmajor_copy_complex64<<<n_blocks, threads_per_block>>>(
-                         size,
-                         (unsigned int)ndim,
-                         strides_dev_p,
-                         (const npy_complex64*)other_data,
-                         strides_dev_p+ndim,
-                         (npy_complex64*) self_data,
-                         strides_dev_p+(ndim*2));
-             } else if ( PyGpuNdArray_TYPE(self) == NPY_COMPLEX128) {
-                 k_elemwise_unary_rowmajor_copy_complex128<<<n_blocks, threads_per_block>>>(
-                         size,
-                         (unsigned int)ndim,
-                         strides_dev_p,
-                         (const npy_complex128*)other_data,
-                         strides_dev_p+ndim,
-                         (npy_complex128*) self_data,
-                         strides_dev_p+(ndim*2));
-             } else {
-                 PyErr_Format(PyExc_NotImplementedError, "PyGpuNdArray_CopyFromPyGpuNdArray: Don't implement copy for this dtype\n");
-                 return -1;
-             }
-             CNDA_THREAD_SYNC;
-             err = cudaGetLastError();
-             if( cudaSuccess != err) {
-                 PyErr_Format(PyExc_RuntimeError, "Cuda error: %s: %s. (n_blocks=%i, n_threads_per_block=%i)\n", "k_elemwise_unary_rowmajor_copy", cudaGetErrorString(err), n_blocks, threads_per_block);
-                 return -1;
-             }
-             device_free(strides_dev);
-             free(strides_host);
-         }
-     };
-     // Set flags
-     if (false && PyGpuNdArray_NDIM(self) == 0) {
-         //Numpy 1.4.1 is not consistent here
-         //When we create a new numpy ndarray of 0 dim, it is not f contiguous
-         //But when we take a subtensor that is of 0 dim, it is f contiguous!
-         //We make as them for now...
-         PyGpuNdArray_FLAGS(self) &= ~NPY_F_CONTIGUOUS;
-         PyGpuNdArray_FLAGS(self) |= NPY_C_CONTIGUOUS;
-     } else {
-         if (PyGpuNdArray_is_c_contiguous(self)) {
-             PyGpuNdArray_FLAGS(self) |= NPY_C_CONTIGUOUS;
-         } else {
-             PyGpuNdArray_FLAGS(self) &= ~NPY_C_CONTIGUOUS;
-         }
-         if (PyGpuNdArray_is_f_contiguous(self)) {
-             PyGpuNdArray_FLAGS(self) |= NPY_F_CONTIGUOUS;
-         } else {
-             PyGpuNdArray_FLAGS(self) &= ~NPY_F_CONTIGUOUS;
-         }
-     }
-
-     DPRINTF("PyGpuNdArray_CopyFromPyGpuNdArray end\n");
-     return 0;
- }
-
- int PyGpuMemcpy(void * dst, const void * src, int dev_offset, size_t bytes,
-                 PyGpuTransfert direction){
-     DPRINTF("PyGpuMemcpy: start\n");
-     cudaMemcpyKind dir;
-     const char * ssrc;
-     const char * ddst;
-     if (direction == PyGpuDeviceToHost){
-         dir = cudaMemcpyDeviceToHost;
-         ssrc = (char*)src+dev_offset;
-         ddst = (char*)dst;
-     } else if (direction == PyGpuHostToDevice) {
-         dir = cudaMemcpyHostToDevice;
-         ssrc = (char*)src;
-         ddst = (char*)dst + dev_offset;
-     } else {
-         PyErr_Format(PyExc_ValueError,
-                 "PyGpuMemcpy: Received wrong direction %d!\n",
-                 direction);
-         return -1;
-     }
-     cudaError_t err = cudaMemcpy((void*)ddst, (void*)ssrc, bytes, dir);
-     CNDA_THREAD_SYNC;
-     if (cudaSuccess != err) {
-         PyErr_Format(PyExc_RuntimeError, "PyGpuMemcpy: cudaMemcpy: error copying data to host (%s)",
-                 cudaGetErrorString(err));
-         return -1;
-     }
-     DPRINTF("PyGpuMemcpy: end\n");
-     return 0;
- }
-
- int PyGpuMemset(void * dst, int data, size_t bytes){
-     DPRINTF("PyGpuMemset: start\n");
-     cudaError_t err = cudaMemset(dst, data, bytes);
-     CNDA_THREAD_SYNC;
-     if (cudaSuccess != err) {
-         PyErr_Format(PyExc_MemoryError, "PyGpuMemset: Error memsetting %ld bytes of device memory(%s). %p",
-                 bytes, cudaGetErrorString(err), PyGpuNdArray_DATA(dst));
-         DPRINTF("PyGpuMemset: end error\n");
-         return -1;
-     }
-     DPRINTF("PyGpuMemset: end\n");
-     return 0;
- }
-
- /*
-   Local Variables:
-   mode:c++
-   c-basic-offset:4
-   c-file-style:"stroustrup"
-   c-file-offsets:((innamespace . 0)(inline-open . 0))
-   indent-tabs-mode:nil
-   fill-column:79
-   End:
- */
- // vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:textwidth=79 :