ksgpu 1.0.2__tar.gz → 1.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84) hide show
  1. {ksgpu-1.0.2 → ksgpu-1.2.0}/Makefile +85 -33
  2. ksgpu-1.2.0/PKG-INFO +7 -0
  3. ksgpu-1.2.0/include/ksgpu/Array.hpp +815 -0
  4. {ksgpu-1.0.2 → ksgpu-1.2.0}/include/ksgpu/CpuThreadPool.hpp +6 -6
  5. ksgpu-1.2.0/include/ksgpu/Dtype.hpp +191 -0
  6. ksgpu-1.2.0/include/ksgpu/KernelTimer.hpp +114 -0
  7. {ksgpu-1.0.2 → ksgpu-1.2.0}/include/ksgpu/constexpr_functions.hpp +5 -0
  8. ksgpu-1.2.0/include/ksgpu/cuda_utils.hpp +274 -0
  9. ksgpu-1.2.0/include/ksgpu/device_fp16.hpp +145 -0
  10. ksgpu-1.2.0/include/ksgpu/device_transposes.hpp +137 -0
  11. {ksgpu-1.0.2 → ksgpu-1.2.0}/include/ksgpu/mem_utils.hpp +27 -34
  12. ksgpu-1.2.0/include/ksgpu/pybind11.hpp +68 -0
  13. ksgpu-1.2.0/include/ksgpu/pybind11_utils.hpp +69 -0
  14. ksgpu-1.2.0/include/ksgpu/rand_utils.hpp +155 -0
  15. {ksgpu-1.0.2 → ksgpu-1.2.0}/include/ksgpu/string_utils.hpp +27 -12
  16. ksgpu-1.2.0/include/ksgpu/test_utils.hpp +56 -0
  17. {ksgpu-1.0.2 → ksgpu-1.2.0}/include/ksgpu/time_utils.hpp +8 -2
  18. ksgpu-1.2.0/include/ksgpu/xassert.hpp +177 -0
  19. ksgpu-1.2.0/include/ksgpu.hpp +54 -0
  20. ksgpu-1.2.0/ksgpu/CudaStreamWrapper.py +232 -0
  21. ksgpu-1.2.0/ksgpu/__init__.py +60 -0
  22. ksgpu-1.2.0/ksgpu/pybind11_injections.py +210 -0
  23. ksgpu-1.2.0/ksgpu/tests.py +43 -0
  24. ksgpu-1.2.0/ksgpu/utils.py +86 -0
  25. {ksgpu-1.0.2 → ksgpu-1.2.0}/pyproject.toml +4 -4
  26. {ksgpu-1.0.2 → ksgpu-1.2.0}/src_bin/reverse-engineer-mma.cu +317 -320
  27. {ksgpu-1.0.2 → ksgpu-1.2.0}/src_bin/scratch.cu +5 -6
  28. ksgpu-1.2.0/src_bin/show-devices.cu +49 -0
  29. ksgpu-1.2.0/src_bin/test-array.cu +548 -0
  30. ksgpu-1.2.0/src_bin/test-device-transpose-kernels.cu +84 -0
  31. {ksgpu-1.0.2 → ksgpu-1.2.0}/src_bin/test-memcpy-kernels.cu +13 -11
  32. {ksgpu-1.0.2 → ksgpu-1.2.0}/src_bin/test-sparse-mma.cu +55 -56
  33. {ksgpu-1.0.2 → ksgpu-1.2.0}/src_bin/time-atomic-add.cu +53 -49
  34. {ksgpu-1.0.2 → ksgpu-1.2.0}/src_bin/time-fma.cu +39 -41
  35. ksgpu-1.2.0/src_bin/time-global-memory.cu +101 -0
  36. {ksgpu-1.0.2 → ksgpu-1.2.0}/src_bin/time-l2-cache.cu +23 -25
  37. ksgpu-1.2.0/src_bin/time-local-transpose.cu +139 -0
  38. {ksgpu-1.0.2 → ksgpu-1.2.0}/src_bin/time-memcpy-kernels.cu +37 -29
  39. ksgpu-1.2.0/src_bin/time-shared-memory.cu +97 -0
  40. {ksgpu-1.0.2 → ksgpu-1.2.0}/src_bin/time-tensor-cores.cu +117 -104
  41. ksgpu-1.2.0/src_bin/time-warp-shuffle.cu +172 -0
  42. ksgpu-1.2.0/src_lib/Array.cpp +1239 -0
  43. ksgpu-1.0.2/src_lib/CpuThreadPool.cu → ksgpu-1.2.0/src_lib/CpuThreadPool.cpp +37 -36
  44. ksgpu-1.2.0/src_lib/Dtype.cpp +204 -0
  45. ksgpu-1.2.0/src_lib/assert_arrays_equal.cpp +310 -0
  46. ksgpu-1.2.0/src_lib/cuda_utils.cpp +135 -0
  47. ksgpu-1.2.0/src_lib/mem_utils.cpp +513 -0
  48. {ksgpu-1.0.2 → ksgpu-1.2.0}/src_lib/memcpy_kernels.cu +20 -20
  49. ksgpu-1.2.0/src_lib/rand_utils.cpp +208 -0
  50. ksgpu-1.0.2/src_lib/string_utils.cu → ksgpu-1.2.0/src_lib/string_utils.cpp +24 -24
  51. ksgpu-1.2.0/src_lib/test_utils.cu +291 -0
  52. ksgpu-1.2.0/src_pybind11/ksgpu_pybind11.cpp +589 -0
  53. ksgpu-1.2.0/src_pybind11/pybind11_utils.cpp +730 -0
  54. ksgpu-1.0.2/PKG-INFO +0 -4
  55. ksgpu-1.0.2/include/ksgpu/Array.hpp +0 -676
  56. ksgpu-1.0.2/include/ksgpu/Barrier.hpp +0 -44
  57. ksgpu-1.0.2/include/ksgpu/CudaStreamPool.hpp +0 -119
  58. ksgpu-1.0.2/include/ksgpu/ThreadSafeRingBuffer.hpp +0 -133
  59. ksgpu-1.0.2/include/ksgpu/complex_type_traits.hpp +0 -48
  60. ksgpu-1.0.2/include/ksgpu/cuda_utils.hpp +0 -199
  61. ksgpu-1.0.2/include/ksgpu/pybind11.hpp +0 -89
  62. ksgpu-1.0.2/include/ksgpu/pybind11_utils.hpp +0 -109
  63. ksgpu-1.0.2/include/ksgpu/rand_utils.hpp +0 -167
  64. ksgpu-1.0.2/include/ksgpu/test_utils.hpp +0 -94
  65. ksgpu-1.0.2/include/ksgpu/xassert.hpp +0 -84
  66. ksgpu-1.0.2/ksgpu/__init__.py +0 -64
  67. ksgpu-1.0.2/src_bin/show-devices.cu +0 -41
  68. ksgpu-1.0.2/src_bin/test-array.cu +0 -474
  69. ksgpu-1.0.2/src_bin/time-local-transpose.cu +0 -135
  70. ksgpu-1.0.2/src_bin/time-shared-memory.cu +0 -87
  71. ksgpu-1.0.2/src_bin/time-warp-shuffle.cu +0 -105
  72. ksgpu-1.0.2/src_lib/Array.cu +0 -408
  73. ksgpu-1.0.2/src_lib/Barrier.cu +0 -72
  74. ksgpu-1.0.2/src_lib/CudaStreamPool.cu +0 -196
  75. ksgpu-1.0.2/src_lib/cuda_utils.cu +0 -171
  76. ksgpu-1.0.2/src_lib/mem_utils.cu +0 -391
  77. ksgpu-1.0.2/src_lib/rand_utils.cu +0 -73
  78. ksgpu-1.0.2/src_lib/test_utils.cu +0 -410
  79. ksgpu-1.0.2/src_pybind11/ksgpu_pybind11.cu +0 -197
  80. ksgpu-1.0.2/src_pybind11/pybind11_utils.cu +0 -430
  81. {ksgpu-1.0.2 → ksgpu-1.2.0}/include/ksgpu/device_mma.hpp +0 -0
  82. {ksgpu-1.0.2 → ksgpu-1.2.0}/include/ksgpu/dlpack.h +0 -0
  83. {ksgpu-1.0.2 → ksgpu-1.2.0}/include/ksgpu/memcpy_kernels.hpp +0 -0
  84. {ksgpu-1.0.2 → ksgpu-1.2.0}/makefile_helper.py +0 -0
@@ -1,8 +1,18 @@
1
1
  # This Makefile will be invoked by the python build system (e.g. via 'pip install'),
2
2
  # but you can also build individual targets by invoking 'make' directly.
3
+ #
4
+ # Overridable variables (either as env variables, or in a gitignored file 'config.mk')
5
+ # NOTE: Conda users normally need none of these (found via $CONDA_PREFIX)
6
+ #
7
+ # NVCC_OPT e.g. '-O0 -g' (default is '-O3')
8
+ # NVCC e.g. '/usr/local/bin/nvcc' (the nvcc program; its flags are NVCCFLAGS)
9
+ # NVCCFLAGS e.g. '-std=c++20 ...' (all of nvcc's flags; usually tweak NVCC_OPT instead)
10
+ # NVCC_ARCH e.g. '-gencode arch=compute_90,code=sm_90' (default targets sm_80/86/89)
11
+ # NVCC_DEPFLAGS e.g. '-MMD -MP' (the default)
12
+ # PYTHON e.g. 'python3.11' (default is 'python3')
3
13
 
4
14
  # Disable built-in rules and variables (must be first).
5
- MAKEFLAGS += --no-builtin-rules
15
+ MAKEFLAGS += --no-builtin-rules
6
16
  MAKEFLAGS += --no-builtin-variables
7
17
 
8
18
  # Default target 'all' must be first target in Makefile.
@@ -16,14 +26,26 @@ all: bin lib build_wheel build_sdist
16
26
 
17
27
  ####################################################################################################
18
28
  #
19
- # Variables encoding configuration: PYTHON, NVCC, NVCC_ARCH, NVCC_DEPFLAGS.
29
+ # Variables encoding configuration: PYTHON, NVCC, NVCCFLAGS, NVCC_OPT, NVCC_ARCH,
30
+ # NVCC_DEPFLAGS. These can be overridden from the environment or a gitignored
31
+ # 'config.mk' (see the overridable-variables list at the top of this file), so
32
+ # you normally don't need to edit the defaults below.
20
33
  #
21
- # FIXME some day I'll define a configure-script mechanism for setting these variables.
22
- # For now, if you want to change the defaults, just edit the Makefile.
34
+ # FIXME some day I'll replace the env/config.mk mechanism with a proper configure script.
23
35
 
36
+ # Optional per-machine overrides (gitignored). Included before the '?=' defaults
37
+ # below so its assignments take effect.
38
+ -include config.mk
24
39
 
25
40
  PYTHON ?= python3
26
- NVCC ?= nvcc -std=c++17 -m64 -O3 --compiler-options -Wall,-fPIC
41
+
42
+ # NVCC is the nvcc program alone; its flags live in NVCCFLAGS. Keeping them
43
+ # separate means you can point NVCC at a different toolkit (NVCC=/path/to/nvcc)
44
+ # without retyping the flags, and tweak the optimization level via NVCC_OPT
45
+ # (a debug build is just NVCC_OPT='-O0 -g') without retyping the whole command.
46
+ NVCC ?= nvcc
47
+ NVCC_OPT ?= -O3
48
+ NVCCFLAGS ?= -std=c++17 -m64 $(NVCC_OPT) --compiler-options -Wall,-fPIC
27
49
 
28
50
  # Extra nvcc flags needed to build Makefile dependencies
29
51
  # -MMD create dep file, omitting "system" headers
@@ -39,6 +61,15 @@ DEFAULT_NVCC_ARCH += -gencode arch=compute_89,code=sm_89
39
61
  # DEFAULT_NVCC_ARCH += -gencode arch=compute_90,code=sm_90
40
62
  NVCC_ARCH ?= $(DEFAULT_NVCC_ARCH)
41
63
 
64
+ # If building inside a conda env, add -L and an RPATH to $CONDA_PREFIX/lib on
65
+ # each shared-library link line. ksgpu currently has no conda-lib link deps, but keeping this
66
+ # scaffolding here matches pirate's Makefile and makes the shared libs
67
+ # self-locating if a future ksgpu source file grows a conda dep.
68
+ ifneq ($(CONDA_PREFIX),)
69
+ CONDA_LIBFLAGS = -L$(CONDA_PREFIX)/lib
70
+ CONDA_RPATHFLAGS = -Xcompiler '"-Wl,-rpath=$(CONDA_PREFIX)/lib"'
71
+ endif
72
+
42
73
 
43
74
  ####################################################################################################
44
75
  #
@@ -65,28 +96,37 @@ KSGPU_LIB := lib/libksgpu.so
65
96
  KSGPU_PYEXT = ksgpu/ksgpu_pybind11$(PYEXT_SUFFIX)
66
97
 
67
98
  # These get compiled into lib/libksgpu.so
68
- LIB_SRCFILES := \
69
- src_lib/Array.cu \
70
- src_lib/Barrier.cu \
71
- src_lib/CpuThreadPool.cu \
72
- src_lib/CudaStreamPool.cu \
73
- src_lib/cuda_utils.cu \
74
- src_lib/mem_utils.cu \
99
+ # Note: some files are .cpp (compiled via nvcc forwarding to host compiler)
100
+ # and some are .cu (require nvcc's CUDA frontend for device code).
101
+ LIB_CU_SRCFILES := \
75
102
  src_lib/memcpy_kernels.cu \
76
- src_lib/rand_utils.cu \
77
- src_lib/string_utils.cu \
78
103
  src_lib/test_utils.cu
79
104
 
105
+ LIB_CPP_SRCFILES := \
106
+ src_lib/Array.cpp \
107
+ src_lib/CpuThreadPool.cpp \
108
+ src_lib/Dtype.cpp \
109
+ src_lib/assert_arrays_equal.cpp \
110
+ src_lib/cuda_utils.cpp \
111
+ src_lib/mem_utils.cpp \
112
+ src_lib/rand_utils.cpp \
113
+ src_lib/string_utils.cpp
114
+
115
+ LIB_SRCFILES := $(LIB_CU_SRCFILES) $(LIB_CPP_SRCFILES)
116
+
80
117
  # These get compiled into ksgpu/ksgpu_pybind11....so
81
- PYEXT_SRCFILES := \
82
- src_pybind11/ksgpu_pybind11.cu \
83
- src_pybind11/pybind11_utils.cu
118
+ PYEXT_CPP_SRCFILES := \
119
+ src_pybind11/ksgpu_pybind11.cpp \
120
+ src_pybind11/pybind11_utils.cpp
121
+
122
+ PYEXT_SRCFILES := $(PYEXT_CPP_SRCFILES)
84
123
 
85
124
  # These are in 1-1 correspondence with executables in bin/
86
125
  # For example, 'src_bin/time-atomic-add.cu' gets compiled to 'bin/time-atomic-add'.
87
126
  BIN_SRCFILES := \
88
127
  src_bin/time-atomic-add.cu \
89
128
  src_bin/time-fma.cu \
129
+ src_bin/time-global-memory.cu \
90
130
  src_bin/time-l2-cache.cu \
91
131
  src_bin/time-local-transpose.cu \
92
132
  src_bin/time-memcpy-kernels.cu \
@@ -96,6 +136,7 @@ BIN_SRCFILES := \
96
136
  src_bin/scratch.cu \
97
137
  src_bin/reverse-engineer-mma.cu \
98
138
  src_bin/test-array.cu \
139
+ src_bin/test-device-transpose-kernels.cu \
99
140
  src_bin/test-memcpy-kernels.cu \
100
141
  src_bin/test-sparse-mma.cu \
101
142
  src_bin/show-devices.cu
@@ -103,20 +144,25 @@ BIN_SRCFILES := \
103
144
  # Must list all python source files here.
104
145
  # (Otherwise they won't show up in 'pip install' or pypi.)
105
146
  PYFILES := \
106
- ksgpu/__init__.py
147
+ ksgpu/__init__.py \
148
+ ksgpu/CudaStreamWrapper.py \
149
+ ksgpu/pybind11_injections.py \
150
+ ksgpu/tests.py \
151
+ ksgpu/utils.py
107
152
 
108
153
  # Must list all header files here.
109
154
  # (Otherwise they won't show up in 'pip install' or pypi.)
110
155
  HFILES := \
156
+ include/ksgpu.hpp \
111
157
  include/ksgpu/Array.hpp \
112
- include/ksgpu/Barrier.hpp \
113
158
  include/ksgpu/CpuThreadPool.hpp \
114
- include/ksgpu/CudaStreamPool.hpp \
115
- include/ksgpu/ThreadSafeRingBuffer.hpp \
116
- include/ksgpu/complex_type_traits.hpp \
159
+ include/ksgpu/Dtype.hpp \
160
+ include/ksgpu/KernelTimer.hpp \
117
161
  include/ksgpu/constexpr_functions.hpp \
118
162
  include/ksgpu/cuda_utils.hpp \
163
+ include/ksgpu/device_fp16.hpp \
119
164
  include/ksgpu/device_mma.hpp \
165
+ include/ksgpu/device_transposes.hpp \
120
166
  include/ksgpu/mem_utils.hpp \
121
167
  include/ksgpu/memcpy_kernels.hpp \
122
168
  include/ksgpu/rand_utils.hpp \
@@ -142,13 +188,14 @@ CLEAN_RMDIRS := bin lib ksgpu/__pycache__
142
188
  ####################################################################################################
143
189
 
144
190
 
145
- LIB_OFILES := $(LIB_SRCFILES:%.cu=%.o)
146
- PYEXT_OFILES := $(PYEXT_SRCFILES:%.cu=%.o)
191
+ LIB_OFILES := $(LIB_CU_SRCFILES:%.cu=%.o) $(LIB_CPP_SRCFILES:%.cpp=%.o)
192
+ PYEXT_OFILES := $(PYEXT_CPP_SRCFILES:%.cpp=%.o)
147
193
  BIN_XFILES := $(BIN_SRCFILES:src_bin/%.cu=bin/%)
148
194
 
149
195
  # Must include all .d files, or build will break!
150
196
  ALL_SRCFILES := $(LIB_SRCFILES) $(PYEXT_SRCFILES) $(BIN_SRCFILES)
151
- DEPFILES := $(ALL_SRCFILES:%.cu=%.d)
197
+ DEPFILES := $(LIB_CU_SRCFILES:%.cu=%.d) $(LIB_CPP_SRCFILES:%.cpp=%.d)
198
+ DEPFILES += $(PYEXT_CPP_SRCFILES:%.cpp=%.d) $(BIN_SRCFILES:%.cu=%.d)
152
199
 
153
200
  SDIST_FILES := pyproject.toml Makefile makefile_helper.py
154
201
  SDIST_FILES += $(PYFILES) $(ALL_SRCFILES) $(HFILES)
@@ -172,23 +219,28 @@ ksgpu/include:
172
219
  ksgpu/lib:
173
220
  ln -s ../lib $@
174
221
 
175
- # Build object files in src_lib/ or src_bin/
222
+ # Build object files in src_lib/ or src_bin/ from .cu files
176
223
  %.o: %.cu %.d
177
- $(NVCC) $(NVCC_ARCH) $(NVCC_DEPFLAGS) -c -o $@ $<
224
+ $(NVCC) $(NVCCFLAGS) $(NVCC_ARCH) $(NVCC_DEPFLAGS) -c -o $@ $<
225
+
226
+ # Build object files in src_lib/ from .cpp files
227
+ # Note: nvcc forwards .cpp files to the host compiler (no CUDA frontend processing).
228
+ %.o: %.cpp %.d
229
+ $(NVCC) $(NVCCFLAGS) $(NVCC_ARCH) $(NVCC_DEPFLAGS) -c -o $@ $<
178
230
 
179
- # Build object files in src_pybind11/ with special flags.
180
- src_pybind11/%.o: src_pybind11/%.cu src_pybind11/%.d
181
- $(NVCC) $(NVCC_ARCH) $(NVCC_DEPFLAGS) -I$(PYTHON_INCDIR) -I$(NUMPY_INCDIR) -I$(PYBIND11_INCDIR) -c -o $@ $<
231
+ # Build object files in src_pybind11/ from .cpp files, with special flags.
232
+ src_pybind11/%.o: src_pybind11/%.cpp src_pybind11/%.d
233
+ $(NVCC) $(NVCCFLAGS) $(NVCC_ARCH) $(NVCC_DEPFLAGS) -I$(PYTHON_INCDIR) -I$(NUMPY_INCDIR) -I$(PYBIND11_INCDIR) -c -o $@ $<
182
234
 
183
235
  # Build the C++ library (lib/libksgpu.so)
184
236
  $(KSGPU_LIB): $(LIB_OFILES)
185
237
  @mkdir -p lib
186
- $(NVCC) $(NVCC_ARCH) -shared -o $@ $^
238
+ $(NVCC) $(NVCCFLAGS) $(NVCC_ARCH) -shared -o $@ $^ $(CONDA_LIBFLAGS) $(CONDA_RPATHFLAGS)
187
239
 
188
240
  # Build binaries (bin/*)
189
241
  bin/%: src_bin/%.o $(KSGPU_LIB)
190
242
  @mkdir -p bin/
191
- $(NVCC) $(NVCC_ARCH) -o $@ $^
243
+ $(NVCC) $(NVCCFLAGS) $(NVCC_ARCH) -o $@ $^
192
244
 
193
245
  # Build the python extension (ksgpu/ksgpu_pybind11...so)
194
246
  # We want it to automatically pull in the C++ library ksgpu/lib/libksgpu.so.
@@ -203,7 +255,7 @@ bin/%: src_bin/%.o $(KSGPU_LIB)
203
255
  # - Makefile line should look like: nvcc -Xcompiler '"-Wl,-rpath=\\$$ORIGIN/lib"'
204
256
 
205
257
  $(KSGPU_PYEXT): $(PYEXT_OFILES) $(KSGPU_LIB) ksgpu/lib
206
- $(NVCC) $(NVCC_ARCH) -shared -o $@ $(PYEXT_OFILES) -lksgpu -Lksgpu/lib -Xcompiler '"-Wl,-rpath=\\$$ORIGIN/lib"'
258
+ $(NVCC) $(NVCCFLAGS) $(NVCC_ARCH) -shared -o $@ $(PYEXT_OFILES) -lksgpu -Lksgpu/lib $(CONDA_LIBFLAGS) -Xcompiler '"-Wl,-rpath=\\$$ORIGIN/lib"' $(CONDA_RPATHFLAGS)
207
259
 
208
260
  # Needed by pip/pipmake: list of all files that go into the (non-editable) wheel.
209
261
  wheel_files.txt: Makefile ksgpu/include ksgpu/lib
ksgpu-1.2.0/PKG-INFO ADDED
@@ -0,0 +1,7 @@
1
+ Metadata-Version: 2.2
2
+ Name: ksgpu
3
+ Version: 1.2.0
4
+ Requires-Python: >=3.8
5
+ Requires-Dist: pybind11
6
+ Requires-Dist: numpy
7
+ Requires-Dist: editables