cuslines 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,86 @@
1
+ /* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
2
+ *
3
+ * Redistribution and use in source and binary forms, with or without
4
+ * modification, are permitted provided that the following conditions are met:
5
+ *
6
+ * 1. Redistributions of source code must retain the above copyright notice, this
7
+ * list of conditions and the following disclaimer.
8
+ *
9
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
10
+ * this list of conditions and the following disclaimer in the documentation
11
+ * and/or other materials provided with the distribution.
12
+ *
13
+ * 3. Neither the name of the copyright holder nor the names of its
14
+ * contributors may be used to endorse or promote products derived from
15
+ * this software without specific prior written permission.
16
+ *
17
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
21
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
23
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
24
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
25
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
+ */
28
+
29
+ #ifndef __CUDA_MACRO_H__
30
+ #define __CUDA_MACRO_H__
31
+
32
/*
 * CHECK_CUDA(call): evaluate a CUDA runtime call once and abort on failure.
 *
 * `call` must yield a cudaError_t. If the result is not cudaSuccess, the
 * file, line and cudaGetErrorString() text are printed to stderr and the
 * process terminates with exit(EXIT_FAILURE).
 *
 * The macro expands to a brace-enclosed block (not do/while), so existing
 * call sites written with or without a trailing semicolon keep compiling.
 * The internal status variable has a deliberately unusual name: with a plain
 * `err`, a call such as CHECK_CUDA(err) would expand to
 * `cudaError_t err = err;` and test an uninitialised value.
 */
#define CHECK_CUDA(call) {                                                    \
    const cudaError_t check_cuda_status_ = (call);                            \
    if (cudaSuccess != check_cuda_status_) {                                  \
        fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n",         \
                __FILE__, __LINE__, cudaGetErrorString(check_cuda_status_));  \
        exit(EXIT_FAILURE);                                                   \
    }}
39
+
40
/*
 * CHECK_ERROR(errorMessage): query cudaGetLastError() and abort on failure.
 *
 * `errorMessage` is a C string that is printed ahead of the file, line and
 * cudaGetErrorString() text when the last recorded CUDA error is not
 * cudaSuccess; the process then terminates with exit(EXIT_FAILURE).
 *
 * Expands to a brace-enclosed block, so call sites with or without a
 * trailing semicolon keep compiling. The internal status variable uses an
 * unusual name so that it cannot capture an identifier used inside
 * `errorMessage`.
 */
#define CHECK_ERROR(errorMessage) {                                           \
    const cudaError_t check_error_status_ = cudaGetLastError();               \
    if (cudaSuccess != check_error_status_) {                                 \
        fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n",     \
                (errorMessage), __FILE__, __LINE__,                           \
                cudaGetErrorString(check_error_status_));                     \
        exit(EXIT_FAILURE);                                                   \
    }}
47
+
48
/*
 * CUDA_MEM_ADVISE(devPtr, count, advice, device): checked cudaMemAdvise().
 *
 * For CUDART_VERSION >= 13000 the integer `device` is wrapped in a
 * cudaMemLocation of type cudaMemLocationTypeDevice before the call; for
 * older runtimes `device` is forwarded unchanged. In both cases the call is
 * routed through CHECK_CUDA, so a failure aborts the process.
 *
 * Both variants expand to a single brace-enclosed block. The temporary
 * location therefore stays local to the expansion: the macro can be used
 * several times in one scope, and as the body of an unbraced `if`, without
 * redeclaration errors or partially guarded statements.
 *
 * NOTE(review): the >= 13000 variant always tags the location as a device;
 * confirm that no caller passes cudaCpuDeviceId to target host memory.
 */
#if CUDART_VERSION >= 13000
#define CUDA_MEM_ADVISE(devPtr, count, advice, device) {                      \
    cudaMemLocation cuda_mem_advise_loc_;                                     \
    cuda_mem_advise_loc_.type = cudaMemLocationTypeDevice;                    \
    cuda_mem_advise_loc_.id = (device);                                       \
    CHECK_CUDA(cudaMemAdvise((devPtr), (count), (advice),                     \
                             cuda_mem_advise_loc_))                           \
}
#else
#define CUDA_MEM_ADVISE(devPtr, count, advice, device)                        \
    CHECK_CUDA(cudaMemAdvise((devPtr), (count), (advice), (device)))
#endif
58
+
59
+
60
/*
 * Optional NVTX range markers.
 *
 * With USE_NVTX defined, START_RANGE(name, cid) pushes an NVTX range whose
 * ASCII message is `name` and whose ARGB colour is colors4[cid modulo
 * num_colors4]; END_RANGE pops the most recently pushed range.
 * Without USE_NVTX both macros expand to nothing.
 */
#ifdef USE_NVTX
#include "nvToolsExt.h"

/* Colour palette cycled through by START_RANGE. */
const uint32_t colors4[] = {0x0000ff00, 0x000000ff, 0x00ffff00, 0x00ff00ff, 0x0000ffff, 0x00ff0000, 0x00ffffff};
const int num_colors4 = sizeof(colors4)/sizeof(colors4[0]);

/*
 * The colour index is normalised into [0, num_colors4) even for a negative
 * `cid`: a plain `%` keeps the sign of the dividend and would index before
 * the start of colors4.
 */
#define START_RANGE(name,cid) { \
    int color_id = (cid); \
    color_id = ((color_id % num_colors4) + num_colors4) % num_colors4; \
    nvtxEventAttributes_t eventAttrib = {0}; \
    eventAttrib.version = NVTX_VERSION; \
    eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \
    eventAttrib.colorType = NVTX_COLOR_ARGB; \
    eventAttrib.color = colors4[color_id]; \
    eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \
    eventAttrib.message.ascii = (name); \
    nvtxRangePushEx(&eventAttrib); \
}
#define END_RANGE { \
    nvtxRangePop(); \
}
#else
#define START_RANGE(name,cid)
#define END_RANGE
#endif
85
+
86
+ #endif
@@ -0,0 +1,171 @@
1
+ /* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
2
+ *
3
+ * Redistribution and use in source and binary forms, with or without
4
+ * modification, are permitted provided that the following conditions are met:
5
+ *
6
+ * 1. Redistributions of source code must retain the above copyright notice, this
7
+ * list of conditions and the following disclaimer.
8
+ *
9
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
10
+ * this list of conditions and the following disclaimer in the documentation
11
+ * and/or other materials provided with the distribution.
12
+ *
13
+ * 3. Neither the name of the copyright holder nor the names of its
14
+ * contributors may be used to endorse or promote products derived from
15
+ * this software without specific prior written permission.
16
+ *
17
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
21
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
23
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
24
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
25
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
+ */
28
+
29
+ #ifndef __CUWSORT_H__
30
+ #define __CUWSORT_H__
31
+
32
/*
 * Type-generic minimum / maximum used by the warp sorting network.
 * Each argument is evaluated twice, so do not pass expressions with side
 * effects. When the two arguments compare equal, the second one is returned.
 */
#define __CWSORT_MIN(a,b) ( ((a) < (b)) ? (a) : (b) )
#define __CWSORT_MAX(a,b) ( ((a) > (b)) ? (a) : (b) )
34
+
35
+ namespace cuwsort {
36
+
37
/*
 * Partner-lane tables for the warp sorting network, one table per group size.
 *
 * swapN[i][g] is the lane (relative to the start of an N-lane group) that
 * lane g exchanges its element with in step i of warp_sort(). The number of
 * rows per table (15, 10, 6, 3, 1) is the number of steps warp_sort() runs
 * for N = 32, 16, 8, 4, 2. An entry equal to its own column index means the
 * lane keeps its element in that step.
 */

// 32-lane groups: 15 steps.
__device__ __constant__
int swap32[15][32] = {
    {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15},
    { 8,  9, 10, 11, 12, 13, 14, 15,  0,  1,  2,  3,  4,  5,  6,  7, 24, 25, 26, 27, 28, 29, 30, 31, 16, 17, 18, 19, 20, 21, 22, 23},
    { 4,  5,  6,  7,  0,  1,  2,  3, 16, 17, 18, 19, 20, 21, 22, 23,  8,  9, 10, 11, 12, 13, 14, 15, 28, 29, 30, 31, 24, 25, 26, 27},
    { 2,  3,  0,  1,  4,  5,  6,  7, 12, 13, 14, 15,  8,  9, 10, 11, 20, 21, 22, 23, 16, 17, 18, 19, 24, 25, 26, 27, 30, 31, 28, 29},
    { 1,  0,  2,  3, 16, 17, 18, 19,  8,  9, 10, 11, 24, 25, 26, 27,  4,  5,  6,  7, 20, 21, 22, 23, 12, 13, 14, 15, 28, 29, 31, 30},
    { 0,  1,  2,  3,  8,  9, 10, 11,  4,  5,  6,  7, 16, 17, 18, 19, 12, 13, 14, 15, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31},
    { 0,  1,  2,  3,  6,  7,  4,  5, 10, 11,  8,  9, 14, 15, 12, 13, 18, 19, 16, 17, 22, 23, 20, 21, 26, 27, 24, 25, 28, 29, 30, 31},
    { 0,  1, 16, 17,  4,  5, 20, 21,  8,  9, 24, 25, 12, 13, 28, 29,  2,  3, 18, 19,  6,  7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31},
    { 0,  1,  8,  9,  4,  5, 12, 13,  2,  3, 16, 17,  6,  7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29, 18, 19, 26, 27, 22, 23, 30, 31},
    { 0,  1,  4,  5,  2,  3,  8,  9,  6,  7, 12, 13, 10, 11, 16, 17, 14, 15, 20, 21, 18, 19, 24, 25, 22, 23, 28, 29, 26, 27, 30, 31},
    { 0,  1,  3,  2,  5,  4,  7,  6,  9,  8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 30, 31},
    { 0, 16,  2, 18,  4, 20,  6, 22,  8, 24, 10, 26, 12, 28, 14, 30,  1, 17,  3, 19,  5, 21,  7, 23,  9, 25, 11, 27, 13, 29, 15, 31},
    { 0,  8,  2, 10,  4, 12,  6, 14,  1, 16,  3, 18,  5, 20,  7, 22,  9, 24, 11, 26, 13, 28, 15, 30, 17, 25, 19, 27, 21, 29, 23, 31},
    { 0,  4,  2,  6,  1,  8,  3, 10,  5, 12,  7, 14,  9, 16, 11, 18, 13, 20, 15, 22, 17, 24, 19, 26, 21, 28, 23, 30, 25, 29, 27, 31},
    { 0,  2,  1,  4,  3,  6,  5,  8,  7, 10,  9, 12, 11, 14, 13, 16, 15, 18, 17, 20, 19, 22, 21, 24, 23, 26, 25, 28, 27, 30, 29, 31}
};

// 16-lane groups: 10 steps.
__device__ __constant__
int swap16[10][16] = {
    { 8,  9, 10, 11, 12, 13, 14, 15,  0,  1,  2,  3,  4,  5,  6,  7},
    { 4,  5,  6,  7,  0,  1,  2,  3, 12, 13, 14, 15,  8,  9, 10, 11},
    { 2,  3,  0,  1,  8,  9, 10, 11,  4,  5,  6,  7, 14, 15, 12, 13},
    { 1,  0,  2,  3,  6,  7,  4,  5, 10, 11,  8,  9, 12, 13, 15, 14},
    { 0,  1,  8,  9,  4,  5, 12, 13,  2,  3, 10, 11,  6,  7, 14, 15},
    { 0,  1,  4,  5,  2,  3,  8,  9,  6,  7, 12, 13, 10, 11, 14, 15},
    { 0,  1,  3,  2,  5,  4,  7,  6,  9,  8, 11, 10, 13, 12, 14, 15},
    { 0,  8,  2, 10,  4, 12,  6, 14,  1,  9,  3, 11,  5, 13,  7, 15},
    { 0,  4,  2,  6,  1,  8,  3, 10,  5, 12,  7, 14,  9, 13, 11, 15},
    { 0,  2,  1,  4,  3,  6,  5,  8,  7, 10,  9, 12, 11, 14, 13, 15}
};

// 8-lane groups: 6 steps.
__device__ __constant__
int swap8[6][8] = {
    { 4,  5,  6,  7,  0,  1,  2,  3},
    { 2,  3,  0,  1,  6,  7,  4,  5},
    { 1,  0,  4,  5,  2,  3,  7,  6},
    { 0,  1,  3,  2,  5,  4,  6,  7},
    { 0,  4,  2,  6,  1,  5,  3,  7},
    { 0,  2,  1,  4,  3,  6,  5,  7}
};

// 4-lane groups: 3 steps.
__device__ __constant__
int swap4[3][4] = {
    { 2,  3,  0,  1},
    { 1,  0,  3,  2},
    { 0,  2,  1,  3}
};

// 2-lane groups: a single exchange.
__device__ __constant__
int swap2[1][2] = {
    { 1,  0}
};
81
+
82
/*
 * Returns a pointer to the first entry of the partner-lane table that
 * belongs to group size GSIZE (swap2 / swap4 / swap8 / swap16 / swap32),
 * or nullptr when no table exists for that size. The selection is resolved
 * at compile time.
 */
template<int GSIZE>
__device__ __forceinline__ const int* get_swap_ptr() {
    if constexpr (GSIZE == 32) {
        return &swap32[0][0];
    } else if constexpr (GSIZE == 16) {
        return &swap16[0][0];
    } else if constexpr (GSIZE == 8) {
        return &swap8[0][0];
    } else if constexpr (GSIZE == 4) {
        return &swap4[0][0];
    } else if constexpr (GSIZE == 2) {
        return &swap2[0][0];
    } else {
        return nullptr;
    }
}
91
+
92
/*
 * Compile-time floor(log2(X)) for X >= 1, exposed as STATIC_LOG2<X>::value.
 * The recursion halves X until it reaches the STATIC_LOG2<1> base case.
 */
template<int X>
struct STATIC_LOG2 {
    // X <= 0 would never reach the base case (X/2 stays at 0); reject it
    // with a readable diagnostic instead of an incomplete-type error.
    static_assert(X >= 1, "STATIC_LOG2 requires a positive argument");
    enum {value = 1+STATIC_LOG2<X/2>::value};
};

// Base case: log2(1) == 0.
template<>
struct STATIC_LOG2<1> {
    enum {value = 0};
};
101
+
102
// Values accepted by the DIRECTION template parameter of warp_sort().
enum {
    WSORT_DIR_DEC = 0,
    WSORT_DIR_INC = 1
};
103
+
104
/*
 * Key-only sort across groups of GSIZE consecutive lanes of a warp.
 *
 * Every lane contributes one key `v`. The function runs the exchange steps
 * stored in the swap<GSIZE> table: in each step a lane reads its partner's
 * key with __shfl_sync and keeps either the smaller or the larger of the two.
 * With DIRECTION == WSORT_DIR_INC the lower-numbered lane of a pair keeps the
 * smaller key; with WSORT_DIR_DEC it keeps the larger one.
 *
 * Template parameters:
 *   WSIZE      width of the (virtual) warp; only used to validate GSIZE.
 *   GSIZE      lanes per sorted group, a power of two with GSIZE <= WSIZE
 *              and GSIZE <= 32.
 *   DIRECTION  WSORT_DIR_INC or WSORT_DIR_DEC.
 *   KEY_T      key type; must be usable with __shfl_sync, `<` and `>`.
 *
 * Returns the key this lane holds after the last step.
 *
 * All GSIZE lanes of a group must call this function together, because each
 * shuffle names the whole group in its participation mask.
 */
template<int WSIZE,
         int GSIZE, // power-of-2 <= WSIZE
         int DIRECTION,
         typename KEY_T>
__device__ KEY_T warp_sort(KEY_T v) {

    // Tables exist only for 2..32 (GSIZE == 1 runs zero steps); larger or
    // non-power-of-two sizes previously indexed past the step-count table
    // and dereferenced a null table pointer.
    static_assert(GSIZE >= 1 && GSIZE <= 32 && (GSIZE & (GSIZE-1)) == 0,
                  "warp_sort: GSIZE must be a power of two in [1, 32]");
    static_assert(GSIZE <= WSIZE,
                  "warp_sort: GSIZE must not exceed WSIZE");

    // Number of exchange steps: 0, 1, 3, 6, 10, 15 for log2(GSIZE) = 0..5.
    constexpr int LOG2_GSIZE = STATIC_LOG2<GSIZE>::value;
    constexpr int NSWAP = LOG2_GSIZE*(LOG2_GSIZE+1)/2;

    // Physical lane inside the 32-thread hardware warp. The shuffle mask has
    // to be expressed in physical lanes (also when WSIZE < 32), and the
    // linear thread index includes threadIdx.z.
    const int tid  = (threadIdx.z*blockDim.y + threadIdx.y)*blockDim.x + threadIdx.x;
    const int lane = tid % 32;

    // Mask of the GSIZE lanes forming this lane's group.
    const unsigned int WMASK = ((1ull << GSIZE)-1) << (lane & (~(GSIZE-1)));

    // Position of this lane inside its group.
    const int gid = lane % GSIZE;

    const int (*net)[GSIZE] = reinterpret_cast<const int (*)[GSIZE]>(get_swap_ptr<GSIZE>());

    #pragma unroll
    for(int i = 0; i < NSWAP; i++) {
        const int srclane = net[i][gid];
        const KEY_T a = __shfl_sync(WMASK, v, srclane, GSIZE);
        // Explicit parentheses: compare the "am I the lower lane" flag with
        // the requested direction.
        v = ((gid < srclane) == DIRECTION) ? __CWSORT_MIN(a, v) : __CWSORT_MAX(a, v);
    }
    return v;
}
129
+
130
/*
 * Key/value sort across groups of GSIZE consecutive lanes of a warp.
 *
 * Every lane contributes one key k[0] and one value v[0]. The function runs
 * the exchange steps stored in the swap<GSIZE> table: in each step a lane
 * reads its partner's key and value with __shfl_sync and adopts the pair
 * when the partner's key is strictly smaller (or strictly larger, depending
 * on the lane's role), so pairs with equal keys are left where they are.
 * With DIRECTION == WSORT_DIR_INC the lower-numbered lane of a pair ends up
 * with the smaller key; with WSORT_DIR_DEC it ends up with the larger one.
 *
 * Template parameters:
 *   WSIZE      width of the (virtual) warp; only used to validate GSIZE.
 *   GSIZE      lanes per sorted group, a power of two with GSIZE <= WSIZE
 *              and GSIZE <= 32.
 *   DIRECTION  WSORT_DIR_INC or WSORT_DIR_DEC.
 *   KEY_T      key type; must be usable with __shfl_sync, `<` and `>`.
 *   VAL_T      value type; must be usable with __shfl_sync.
 *
 * Parameters:
 *   k  in/out: this lane's key; only k[0] is read and written.
 *   v  in/out: this lane's value; only v[0] is read and written.
 *
 * All GSIZE lanes of a group must call this function together, because each
 * shuffle names the whole group in its participation mask.
 */
template<int WSIZE,
         int GSIZE, // power-of-2 <= WSIZE
         int DIRECTION,
         typename KEY_T,
         typename VAL_T>
__device__ void warp_sort(KEY_T *__restrict__ k, VAL_T *__restrict__ v) {

    // Tables exist only for 2..32 (GSIZE == 1 runs zero steps); larger or
    // non-power-of-two sizes previously indexed past the step-count table
    // and dereferenced a null table pointer.
    static_assert(GSIZE >= 1 && GSIZE <= 32 && (GSIZE & (GSIZE-1)) == 0,
                  "warp_sort: GSIZE must be a power of two in [1, 32]");
    static_assert(GSIZE <= WSIZE,
                  "warp_sort: GSIZE must not exceed WSIZE");

    // Number of exchange steps: 0, 1, 3, 6, 10, 15 for log2(GSIZE) = 0..5.
    constexpr int LOG2_GSIZE = STATIC_LOG2<GSIZE>::value;
    constexpr int NSWAP = LOG2_GSIZE*(LOG2_GSIZE+1)/2;

    // Physical lane inside the 32-thread hardware warp. The shuffle mask has
    // to be expressed in physical lanes (also when WSIZE < 32), and the
    // linear thread index includes threadIdx.z.
    const int tid  = (threadIdx.z*blockDim.y + threadIdx.y)*blockDim.x + threadIdx.x;
    const int lane = tid % 32;

    // Mask of the GSIZE lanes forming this lane's group.
    const unsigned int WMASK = ((1ull << GSIZE)-1) << (lane & (~(GSIZE-1)));

    // Position of this lane inside its group.
    const int gid = lane % GSIZE;

    const int (*net)[GSIZE] = reinterpret_cast<const int (*)[GSIZE]>(get_swap_ptr<GSIZE>());

    #pragma unroll
    for(int i = 0; i < NSWAP; i++) {
        const int srclane = net[i][gid];

        // Both shuffles are executed by every lane of the group before any
        // lane decides whether to adopt the partner's pair.
        const KEY_T a = __shfl_sync(WMASK, k[0], srclane, GSIZE);
        const VAL_T b = __shfl_sync(WMASK, v[0], srclane, GSIZE);

        // True when this lane keeps the smaller key of the pair.
        const bool keep_min = ((gid < srclane) == DIRECTION);

        if (keep_min) {
            if (a < k[0]) {
                k[0] = a;
                v[0] = b;
            }
        } else {
            if (a > k[0]) {
                k[0] = a;
                v[0] = b;
            }
        }
    }
    return;
}
169
+
170
+ }
171
+ #endif