ksgpu 1.0.2__tar.gz → 1.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ksgpu-1.0.2 → ksgpu-1.2.0}/Makefile +85 -33
- ksgpu-1.2.0/PKG-INFO +7 -0
- ksgpu-1.2.0/include/ksgpu/Array.hpp +815 -0
- {ksgpu-1.0.2 → ksgpu-1.2.0}/include/ksgpu/CpuThreadPool.hpp +6 -6
- ksgpu-1.2.0/include/ksgpu/Dtype.hpp +191 -0
- ksgpu-1.2.0/include/ksgpu/KernelTimer.hpp +114 -0
- {ksgpu-1.0.2 → ksgpu-1.2.0}/include/ksgpu/constexpr_functions.hpp +5 -0
- ksgpu-1.2.0/include/ksgpu/cuda_utils.hpp +274 -0
- ksgpu-1.2.0/include/ksgpu/device_fp16.hpp +145 -0
- ksgpu-1.2.0/include/ksgpu/device_transposes.hpp +137 -0
- {ksgpu-1.0.2 → ksgpu-1.2.0}/include/ksgpu/mem_utils.hpp +27 -34
- ksgpu-1.2.0/include/ksgpu/pybind11.hpp +68 -0
- ksgpu-1.2.0/include/ksgpu/pybind11_utils.hpp +69 -0
- ksgpu-1.2.0/include/ksgpu/rand_utils.hpp +155 -0
- {ksgpu-1.0.2 → ksgpu-1.2.0}/include/ksgpu/string_utils.hpp +27 -12
- ksgpu-1.2.0/include/ksgpu/test_utils.hpp +56 -0
- {ksgpu-1.0.2 → ksgpu-1.2.0}/include/ksgpu/time_utils.hpp +8 -2
- ksgpu-1.2.0/include/ksgpu/xassert.hpp +177 -0
- ksgpu-1.2.0/include/ksgpu.hpp +54 -0
- ksgpu-1.2.0/ksgpu/CudaStreamWrapper.py +232 -0
- ksgpu-1.2.0/ksgpu/__init__.py +60 -0
- ksgpu-1.2.0/ksgpu/pybind11_injections.py +210 -0
- ksgpu-1.2.0/ksgpu/tests.py +43 -0
- ksgpu-1.2.0/ksgpu/utils.py +86 -0
- {ksgpu-1.0.2 → ksgpu-1.2.0}/pyproject.toml +4 -4
- {ksgpu-1.0.2 → ksgpu-1.2.0}/src_bin/reverse-engineer-mma.cu +317 -320
- {ksgpu-1.0.2 → ksgpu-1.2.0}/src_bin/scratch.cu +5 -6
- ksgpu-1.2.0/src_bin/show-devices.cu +49 -0
- ksgpu-1.2.0/src_bin/test-array.cu +548 -0
- ksgpu-1.2.0/src_bin/test-device-transpose-kernels.cu +84 -0
- {ksgpu-1.0.2 → ksgpu-1.2.0}/src_bin/test-memcpy-kernels.cu +13 -11
- {ksgpu-1.0.2 → ksgpu-1.2.0}/src_bin/test-sparse-mma.cu +55 -56
- {ksgpu-1.0.2 → ksgpu-1.2.0}/src_bin/time-atomic-add.cu +53 -49
- {ksgpu-1.0.2 → ksgpu-1.2.0}/src_bin/time-fma.cu +39 -41
- ksgpu-1.2.0/src_bin/time-global-memory.cu +101 -0
- {ksgpu-1.0.2 → ksgpu-1.2.0}/src_bin/time-l2-cache.cu +23 -25
- ksgpu-1.2.0/src_bin/time-local-transpose.cu +139 -0
- {ksgpu-1.0.2 → ksgpu-1.2.0}/src_bin/time-memcpy-kernels.cu +37 -29
- ksgpu-1.2.0/src_bin/time-shared-memory.cu +97 -0
- {ksgpu-1.0.2 → ksgpu-1.2.0}/src_bin/time-tensor-cores.cu +117 -104
- ksgpu-1.2.0/src_bin/time-warp-shuffle.cu +172 -0
- ksgpu-1.2.0/src_lib/Array.cpp +1239 -0
- ksgpu-1.0.2/src_lib/CpuThreadPool.cu → ksgpu-1.2.0/src_lib/CpuThreadPool.cpp +37 -36
- ksgpu-1.2.0/src_lib/Dtype.cpp +204 -0
- ksgpu-1.2.0/src_lib/assert_arrays_equal.cpp +310 -0
- ksgpu-1.2.0/src_lib/cuda_utils.cpp +135 -0
- ksgpu-1.2.0/src_lib/mem_utils.cpp +513 -0
- {ksgpu-1.0.2 → ksgpu-1.2.0}/src_lib/memcpy_kernels.cu +20 -20
- ksgpu-1.2.0/src_lib/rand_utils.cpp +208 -0
- ksgpu-1.0.2/src_lib/string_utils.cu → ksgpu-1.2.0/src_lib/string_utils.cpp +24 -24
- ksgpu-1.2.0/src_lib/test_utils.cu +291 -0
- ksgpu-1.2.0/src_pybind11/ksgpu_pybind11.cpp +589 -0
- ksgpu-1.2.0/src_pybind11/pybind11_utils.cpp +730 -0
- ksgpu-1.0.2/PKG-INFO +0 -4
- ksgpu-1.0.2/include/ksgpu/Array.hpp +0 -676
- ksgpu-1.0.2/include/ksgpu/Barrier.hpp +0 -44
- ksgpu-1.0.2/include/ksgpu/CudaStreamPool.hpp +0 -119
- ksgpu-1.0.2/include/ksgpu/ThreadSafeRingBuffer.hpp +0 -133
- ksgpu-1.0.2/include/ksgpu/complex_type_traits.hpp +0 -48
- ksgpu-1.0.2/include/ksgpu/cuda_utils.hpp +0 -199
- ksgpu-1.0.2/include/ksgpu/pybind11.hpp +0 -89
- ksgpu-1.0.2/include/ksgpu/pybind11_utils.hpp +0 -109
- ksgpu-1.0.2/include/ksgpu/rand_utils.hpp +0 -167
- ksgpu-1.0.2/include/ksgpu/test_utils.hpp +0 -94
- ksgpu-1.0.2/include/ksgpu/xassert.hpp +0 -84
- ksgpu-1.0.2/ksgpu/__init__.py +0 -64
- ksgpu-1.0.2/src_bin/show-devices.cu +0 -41
- ksgpu-1.0.2/src_bin/test-array.cu +0 -474
- ksgpu-1.0.2/src_bin/time-local-transpose.cu +0 -135
- ksgpu-1.0.2/src_bin/time-shared-memory.cu +0 -87
- ksgpu-1.0.2/src_bin/time-warp-shuffle.cu +0 -105
- ksgpu-1.0.2/src_lib/Array.cu +0 -408
- ksgpu-1.0.2/src_lib/Barrier.cu +0 -72
- ksgpu-1.0.2/src_lib/CudaStreamPool.cu +0 -196
- ksgpu-1.0.2/src_lib/cuda_utils.cu +0 -171
- ksgpu-1.0.2/src_lib/mem_utils.cu +0 -391
- ksgpu-1.0.2/src_lib/rand_utils.cu +0 -73
- ksgpu-1.0.2/src_lib/test_utils.cu +0 -410
- ksgpu-1.0.2/src_pybind11/ksgpu_pybind11.cu +0 -197
- ksgpu-1.0.2/src_pybind11/pybind11_utils.cu +0 -430
- {ksgpu-1.0.2 → ksgpu-1.2.0}/include/ksgpu/device_mma.hpp +0 -0
- {ksgpu-1.0.2 → ksgpu-1.2.0}/include/ksgpu/dlpack.h +0 -0
- {ksgpu-1.0.2 → ksgpu-1.2.0}/include/ksgpu/memcpy_kernels.hpp +0 -0
- {ksgpu-1.0.2 → ksgpu-1.2.0}/makefile_helper.py +0 -0
|
@@ -1,8 +1,18 @@
|
|
|
1
1
|
# This Makefile will be invoked by the python build system (e.g. via 'pip install'),
|
|
2
2
|
# but you can also build individual targets by invoking 'make' directly.
|
|
3
|
+
#
|
|
4
|
+
# Overridable variables (set either as environment variables or in a gitignored file 'config.mk')
|
|
5
|
+
# NOTE: Conda users normally need none of these (found via $CONDA_PREFIX)
|
|
6
|
+
#
|
|
7
|
+
# NVCC_OPT e.g. '-O0 -g' (default is '-O3')
|
|
8
|
+
# NVCC e.g. '/usr/local/bin/nvcc' (the nvcc program; its flags are NVCCFLAGS)
|
|
9
|
+
# NVCCFLAGS e.g. '-std=c++20 ...' (all of nvcc's flags; usually tweak NVCC_OPT instead)
|
|
10
|
+
# NVCC_ARCH e.g. '-gencode arch=compute_90,code=sm_90' (default targets sm_80/86/89)
|
|
11
|
+
# NVCC_DEPFLAGS e.g. '-MMD -MP' (the default)
|
|
12
|
+
# PYTHON e.g. 'python3.11' (default is 'python3')
|
|
3
13
|
|
|
4
14
|
# Disable built-in rules and variables (must be first).
|
|
5
|
-
MAKEFLAGS += --no-builtin-rules
|
|
15
|
+
MAKEFLAGS += --no-builtin-rules
|
|
6
16
|
MAKEFLAGS += --no-builtin-variables
|
|
7
17
|
|
|
8
18
|
# Default target 'all' must be first target in Makefile.
|
|
@@ -16,14 +26,26 @@ all: bin lib build_wheel build_sdist
|
|
|
16
26
|
|
|
17
27
|
####################################################################################################
|
|
18
28
|
#
|
|
19
|
-
# Variables encoding configuration: PYTHON, NVCC, NVCC_ARCH,
|
|
29
|
+
# Variables encoding configuration: PYTHON, NVCC, NVCCFLAGS, NVCC_OPT, NVCC_ARCH,
|
|
30
|
+
# NVCC_DEPFLAGS. These can be overridden from the environment or a gitignored
|
|
31
|
+
# 'config.mk' (see the overridable-variables list at the top of this file), so
|
|
32
|
+
# you normally don't need to edit the defaults below.
|
|
20
33
|
#
|
|
21
|
-
# FIXME some day I'll
|
|
22
|
-
# For now, if you want to change the defaults, just edit the Makefile.
|
|
34
|
+
# FIXME some day I'll replace the env/config.mk mechanism with a proper configure script.
|
|
23
35
|
|
|
36
|
+
# Optional per-machine overrides (gitignored). Included before the '?=' defaults
|
|
37
|
+
# below so its assignments take effect.
|
|
38
|
+
-include config.mk
|
|
24
39
|
|
|
25
40
|
PYTHON ?= python3
|
|
26
|
-
|
|
41
|
+
|
|
42
|
+
# NVCC is the nvcc program alone; its flags live in NVCCFLAGS. Keeping them
|
|
43
|
+
# separate means you can point NVCC at a different toolkit (NVCC=/path/to/nvcc)
|
|
44
|
+
# without retyping the flags, and tweak the optimization level via NVCC_OPT
|
|
45
|
+
# (a debug build is just NVCC_OPT='-O0 -g') without retyping the whole command.
|
|
46
|
+
NVCC ?= nvcc
|
|
47
|
+
NVCC_OPT ?= -O3
|
|
48
|
+
NVCCFLAGS ?= -std=c++17 -m64 $(NVCC_OPT) --compiler-options -Wall,-fPIC
|
|
27
49
|
|
|
28
50
|
# Extra nvcc flags needed to build Makefile dependencies
|
|
29
51
|
# -MMD create dep file, omitting "system" headers
|
|
@@ -39,6 +61,15 @@ DEFAULT_NVCC_ARCH += -gencode arch=compute_89,code=sm_89
|
|
|
39
61
|
# DEFAULT_NVCC_ARCH += -gencode arch=compute_90,code=sm_90
|
|
40
62
|
NVCC_ARCH ?= $(DEFAULT_NVCC_ARCH)
|
|
41
63
|
|
|
64
|
+
# If building inside a conda env, add -L and an RPATH to $CONDA_PREFIX/lib on
|
|
65
|
+
# the shared-library link lines. ksgpu currently has no conda-lib link deps, but keeping this
|
|
66
|
+
# scaffolding here matches pirate's Makefile and makes the shared libs
|
|
67
|
+
# self-locating if a future ksgpu source file grows a conda dep.
|
|
68
|
+
ifneq ($(CONDA_PREFIX),)
|
|
69
|
+
CONDA_LIBFLAGS = -L$(CONDA_PREFIX)/lib
|
|
70
|
+
CONDA_RPATHFLAGS = -Xcompiler '"-Wl,-rpath=$(CONDA_PREFIX)/lib"'
|
|
71
|
+
endif
|
|
72
|
+
|
|
42
73
|
|
|
43
74
|
####################################################################################################
|
|
44
75
|
#
|
|
@@ -65,28 +96,37 @@ KSGPU_LIB := lib/libksgpu.so
|
|
|
65
96
|
KSGPU_PYEXT = ksgpu/ksgpu_pybind11$(PYEXT_SUFFIX)
|
|
66
97
|
|
|
67
98
|
# These get compiled into lib/libksgpu.so
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
src_lib/CpuThreadPool.cu \
|
|
72
|
-
src_lib/CudaStreamPool.cu \
|
|
73
|
-
src_lib/cuda_utils.cu \
|
|
74
|
-
src_lib/mem_utils.cu \
|
|
99
|
+
# Note: some files are .cpp (compiled via nvcc forwarding to host compiler)
|
|
100
|
+
# and some are .cu (require nvcc's CUDA frontend for device code).
|
|
101
|
+
LIB_CU_SRCFILES := \
|
|
75
102
|
src_lib/memcpy_kernels.cu \
|
|
76
|
-
src_lib/rand_utils.cu \
|
|
77
|
-
src_lib/string_utils.cu \
|
|
78
103
|
src_lib/test_utils.cu
|
|
79
104
|
|
|
105
|
+
LIB_CPP_SRCFILES := \
|
|
106
|
+
src_lib/Array.cpp \
|
|
107
|
+
src_lib/CpuThreadPool.cpp \
|
|
108
|
+
src_lib/Dtype.cpp \
|
|
109
|
+
src_lib/assert_arrays_equal.cpp \
|
|
110
|
+
src_lib/cuda_utils.cpp \
|
|
111
|
+
src_lib/mem_utils.cpp \
|
|
112
|
+
src_lib/rand_utils.cpp \
|
|
113
|
+
src_lib/string_utils.cpp
|
|
114
|
+
|
|
115
|
+
LIB_SRCFILES := $(LIB_CU_SRCFILES) $(LIB_CPP_SRCFILES)
|
|
116
|
+
|
|
80
117
|
# These get compiled into ksgpu/ksgpu_pybind11....so
|
|
81
|
-
|
|
82
|
-
src_pybind11/ksgpu_pybind11.
|
|
83
|
-
src_pybind11/pybind11_utils.
|
|
118
|
+
PYEXT_CPP_SRCFILES := \
|
|
119
|
+
src_pybind11/ksgpu_pybind11.cpp \
|
|
120
|
+
src_pybind11/pybind11_utils.cpp
|
|
121
|
+
|
|
122
|
+
PYEXT_SRCFILES := $(PYEXT_CPP_SRCFILES)
|
|
84
123
|
|
|
85
124
|
# These are in 1-1 correspondence with executables in bin/
|
|
86
125
|
# For example, 'src_bin/time-atomic-add.cu' gets compiled to 'bin/time-atomic-add'.
|
|
87
126
|
BIN_SRCFILES := \
|
|
88
127
|
src_bin/time-atomic-add.cu \
|
|
89
128
|
src_bin/time-fma.cu \
|
|
129
|
+
src_bin/time-global-memory.cu \
|
|
90
130
|
src_bin/time-l2-cache.cu \
|
|
91
131
|
src_bin/time-local-transpose.cu \
|
|
92
132
|
src_bin/time-memcpy-kernels.cu \
|
|
@@ -96,6 +136,7 @@ BIN_SRCFILES := \
|
|
|
96
136
|
src_bin/scratch.cu \
|
|
97
137
|
src_bin/reverse-engineer-mma.cu \
|
|
98
138
|
src_bin/test-array.cu \
|
|
139
|
+
src_bin/test-device-transpose-kernels.cu \
|
|
99
140
|
src_bin/test-memcpy-kernels.cu \
|
|
100
141
|
src_bin/test-sparse-mma.cu \
|
|
101
142
|
src_bin/show-devices.cu
|
|
@@ -103,20 +144,25 @@ BIN_SRCFILES := \
|
|
|
103
144
|
# Must list all python source files here.
|
|
104
145
|
# (Otherwise they won't show up in 'pip install' or pypi.)
|
|
105
146
|
PYFILES := \
|
|
106
|
-
ksgpu/__init__.py
|
|
147
|
+
ksgpu/__init__.py \
|
|
148
|
+
ksgpu/CudaStreamWrapper.py \
|
|
149
|
+
ksgpu/pybind11_injections.py \
|
|
150
|
+
ksgpu/tests.py \
|
|
151
|
+
ksgpu/utils.py
|
|
107
152
|
|
|
108
153
|
# Must list all header files here.
|
|
109
154
|
# (Otherwise they won't show up in 'pip install' or pypi.)
|
|
110
155
|
HFILES := \
|
|
156
|
+
include/ksgpu.hpp \
|
|
111
157
|
include/ksgpu/Array.hpp \
|
|
112
|
-
include/ksgpu/Barrier.hpp \
|
|
113
158
|
include/ksgpu/CpuThreadPool.hpp \
|
|
114
|
-
include/ksgpu/
|
|
115
|
-
include/ksgpu/
|
|
116
|
-
include/ksgpu/complex_type_traits.hpp \
|
|
159
|
+
include/ksgpu/Dtype.hpp \
|
|
160
|
+
include/ksgpu/KernelTimer.hpp \
|
|
117
161
|
include/ksgpu/constexpr_functions.hpp \
|
|
118
162
|
include/ksgpu/cuda_utils.hpp \
|
|
163
|
+
include/ksgpu/device_fp16.hpp \
|
|
119
164
|
include/ksgpu/device_mma.hpp \
|
|
165
|
+
include/ksgpu/device_transposes.hpp \
|
|
120
166
|
include/ksgpu/mem_utils.hpp \
|
|
121
167
|
include/ksgpu/memcpy_kernels.hpp \
|
|
122
168
|
include/ksgpu/rand_utils.hpp \
|
|
@@ -142,13 +188,14 @@ CLEAN_RMDIRS := bin lib ksgpu/__pycache__
|
|
|
142
188
|
####################################################################################################
|
|
143
189
|
|
|
144
190
|
|
|
145
|
-
LIB_OFILES := $(
|
|
146
|
-
PYEXT_OFILES := $(
|
|
191
|
+
LIB_OFILES := $(LIB_CU_SRCFILES:%.cu=%.o) $(LIB_CPP_SRCFILES:%.cpp=%.o)
|
|
192
|
+
PYEXT_OFILES := $(PYEXT_CPP_SRCFILES:%.cpp=%.o)
|
|
147
193
|
BIN_XFILES := $(BIN_SRCFILES:src_bin/%.cu=bin/%)
|
|
148
194
|
|
|
149
195
|
# Must include all .d files, or build will break!
|
|
150
196
|
ALL_SRCFILES := $(LIB_SRCFILES) $(PYEXT_SRCFILES) $(BIN_SRCFILES)
|
|
151
|
-
DEPFILES := $(
|
|
197
|
+
DEPFILES := $(LIB_CU_SRCFILES:%.cu=%.d) $(LIB_CPP_SRCFILES:%.cpp=%.d)
|
|
198
|
+
DEPFILES += $(PYEXT_CPP_SRCFILES:%.cpp=%.d) $(BIN_SRCFILES:%.cu=%.d)
|
|
152
199
|
|
|
153
200
|
SDIST_FILES := pyproject.toml Makefile makefile_helper.py
|
|
154
201
|
SDIST_FILES += $(PYFILES) $(ALL_SRCFILES) $(HFILES)
|
|
@@ -172,23 +219,28 @@ ksgpu/include:
|
|
|
172
219
|
ksgpu/lib:
|
|
173
220
|
ln -s ../lib $@
|
|
174
221
|
|
|
175
|
-
# Build object files in src_lib/ or src_bin/
|
|
222
|
+
# Build object files in src_lib/ or src_bin/ from .cu files
|
|
176
223
|
%.o: %.cu %.d
|
|
177
|
-
$(NVCC) $(NVCC_ARCH) $(NVCC_DEPFLAGS) -c -o $@ $<
|
|
224
|
+
$(NVCC) $(NVCCFLAGS) $(NVCC_ARCH) $(NVCC_DEPFLAGS) -c -o $@ $<
|
|
225
|
+
|
|
226
|
+
# Build object files in src_lib/ from .cpp files
|
|
227
|
+
# Note: nvcc forwards .cpp files to the host compiler (no CUDA frontend processing).
|
|
228
|
+
%.o: %.cpp %.d
|
|
229
|
+
$(NVCC) $(NVCCFLAGS) $(NVCC_ARCH) $(NVCC_DEPFLAGS) -c -o $@ $<
|
|
178
230
|
|
|
179
|
-
# Build object files in src_pybind11/ with special flags.
|
|
180
|
-
src_pybind11/%.o: src_pybind11/%.
|
|
181
|
-
$(NVCC) $(NVCC_ARCH) $(NVCC_DEPFLAGS) -I$(PYTHON_INCDIR) -I$(NUMPY_INCDIR) -I$(PYBIND11_INCDIR) -c -o $@ $<
|
|
231
|
+
# Build object files in src_pybind11/ from .cpp files, with special flags.
|
|
232
|
+
src_pybind11/%.o: src_pybind11/%.cpp src_pybind11/%.d
|
|
233
|
+
$(NVCC) $(NVCCFLAGS) $(NVCC_ARCH) $(NVCC_DEPFLAGS) -I$(PYTHON_INCDIR) -I$(NUMPY_INCDIR) -I$(PYBIND11_INCDIR) -c -o $@ $<
|
|
182
234
|
|
|
183
235
|
# Build the C++ library (lib/libksgpu.so)
|
|
184
236
|
$(KSGPU_LIB): $(LIB_OFILES)
|
|
185
237
|
@mkdir -p lib
|
|
186
|
-
$(NVCC) $(NVCC_ARCH) -shared -o $@ $^
|
|
238
|
+
$(NVCC) $(NVCCFLAGS) $(NVCC_ARCH) -shared -o $@ $^ $(CONDA_LIBFLAGS) $(CONDA_RPATHFLAGS)
|
|
187
239
|
|
|
188
240
|
# Build binaries (bin/*)
|
|
189
241
|
bin/%: src_bin/%.o $(KSGPU_LIB)
|
|
190
242
|
@mkdir -p bin/
|
|
191
|
-
$(NVCC) $(NVCC_ARCH) -o $@ $^
|
|
243
|
+
$(NVCC) $(NVCCFLAGS) $(NVCC_ARCH) -o $@ $^
|
|
192
244
|
|
|
193
245
|
# Build the python extension (ksgpu/ksgpu_pybind11...so)
|
|
194
246
|
# We want it to automatically pull in the C++ library ksgpu/lib/libksgpu.so.
|
|
@@ -203,7 +255,7 @@ bin/%: src_bin/%.o $(KSGPU_LIB)
|
|
|
203
255
|
# - Makefile line should look like: nvcc -Xcompiler '"-Wl,-rpath=\\$$ORIGIN/lib"'
|
|
204
256
|
|
|
205
257
|
$(KSGPU_PYEXT): $(PYEXT_OFILES) $(KSGPU_LIB) ksgpu/lib
|
|
206
|
-
$(NVCC) $(NVCC_ARCH) -shared -o $@ $(PYEXT_OFILES) -lksgpu -Lksgpu/lib -Xcompiler '"-Wl,-rpath=\\$$ORIGIN/lib"'
|
|
258
|
+
$(NVCC) $(NVCCFLAGS) $(NVCC_ARCH) -shared -o $@ $(PYEXT_OFILES) -lksgpu -Lksgpu/lib $(CONDA_LIBFLAGS) -Xcompiler '"-Wl,-rpath=\\$$ORIGIN/lib"' $(CONDA_RPATHFLAGS)
|
|
207
259
|
|
|
208
260
|
# Needed by pip/pipmake: list of all files that go into the (non-editable) wheel.
|
|
209
261
|
wheel_files.txt: Makefile ksgpu/include ksgpu/lib
|