mlx 0.30.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (599)
  1. checksums.yaml +7 -0
  2. data/ext/mlx/extconf.rb +94 -0
  3. data/ext/mlx/native.cpp +8027 -0
  4. data/lib/mlx/core.rb +1678 -0
  5. data/lib/mlx/distributed_utils/common.rb +116 -0
  6. data/lib/mlx/distributed_utils/config.rb +600 -0
  7. data/lib/mlx/distributed_utils/launch.rb +490 -0
  8. data/lib/mlx/extension.rb +24 -0
  9. data/lib/mlx/nn/base.rb +388 -0
  10. data/lib/mlx/nn/init.rb +140 -0
  11. data/lib/mlx/nn/layers/activations.rb +336 -0
  12. data/lib/mlx/nn/layers/base.rb +6 -0
  13. data/lib/mlx/nn/layers/containers.rb +20 -0
  14. data/lib/mlx/nn/layers/convolution.rb +120 -0
  15. data/lib/mlx/nn/layers/convolution_transpose.rb +114 -0
  16. data/lib/mlx/nn/layers/distributed.rb +309 -0
  17. data/lib/mlx/nn/layers/dropout.rb +75 -0
  18. data/lib/mlx/nn/layers/embedding.rb +28 -0
  19. data/lib/mlx/nn/layers/linear.rb +79 -0
  20. data/lib/mlx/nn/layers/normalization.rb +216 -0
  21. data/lib/mlx/nn/layers/pooling.rb +167 -0
  22. data/lib/mlx/nn/layers/positional_encoding.rb +126 -0
  23. data/lib/mlx/nn/layers/quantized.rb +215 -0
  24. data/lib/mlx/nn/layers/recurrent.rb +135 -0
  25. data/lib/mlx/nn/layers/transformer.rb +330 -0
  26. data/lib/mlx/nn/layers/upsample.rb +97 -0
  27. data/lib/mlx/nn/layers.rb +18 -0
  28. data/lib/mlx/nn/losses.rb +251 -0
  29. data/lib/mlx/nn/utils.rb +167 -0
  30. data/lib/mlx/nn.rb +12 -0
  31. data/lib/mlx/optimizers/optimizers.rb +808 -0
  32. data/lib/mlx/optimizers/schedulers.rb +62 -0
  33. data/lib/mlx/optimizers.rb +9 -0
  34. data/lib/mlx/utils.rb +171 -0
  35. data/lib/mlx/version.rb +5 -0
  36. data/lib/mlx.rb +64 -0
  37. data/mlx/CMakeLists.txt +449 -0
  38. data/mlx/cmake/FindCUDNN.cmake +177 -0
  39. data/mlx/cmake/FindNCCL.cmake +54 -0
  40. data/mlx/cmake/Findnvpl.cmake +3 -0
  41. data/mlx/cmake/extension.cmake +50 -0
  42. data/mlx/mlx/3rdparty/.clang-format +2 -0
  43. data/mlx/mlx/3rdparty/pocketfft.h +3581 -0
  44. data/mlx/mlx/CMakeLists.txt +107 -0
  45. data/mlx/mlx/allocator.h +75 -0
  46. data/mlx/mlx/api.h +29 -0
  47. data/mlx/mlx/array.cpp +354 -0
  48. data/mlx/mlx/array.h +647 -0
  49. data/mlx/mlx/backend/common/CMakeLists.txt +9 -0
  50. data/mlx/mlx/backend/common/binary.h +97 -0
  51. data/mlx/mlx/backend/common/broadcasting.cpp +24 -0
  52. data/mlx/mlx/backend/common/broadcasting.h +11 -0
  53. data/mlx/mlx/backend/common/buffer_cache.h +158 -0
  54. data/mlx/mlx/backend/common/common.cpp +305 -0
  55. data/mlx/mlx/backend/common/compiled.cpp +243 -0
  56. data/mlx/mlx/backend/common/compiled.h +77 -0
  57. data/mlx/mlx/backend/common/copy.h +50 -0
  58. data/mlx/mlx/backend/common/hadamard.h +109 -0
  59. data/mlx/mlx/backend/common/load.cpp +57 -0
  60. data/mlx/mlx/backend/common/matmul.h +67 -0
  61. data/mlx/mlx/backend/common/reduce.cpp +154 -0
  62. data/mlx/mlx/backend/common/reduce.h +59 -0
  63. data/mlx/mlx/backend/common/slicing.cpp +71 -0
  64. data/mlx/mlx/backend/common/slicing.h +20 -0
  65. data/mlx/mlx/backend/common/ternary.h +85 -0
  66. data/mlx/mlx/backend/common/unary.h +29 -0
  67. data/mlx/mlx/backend/common/utils.cpp +231 -0
  68. data/mlx/mlx/backend/common/utils.h +205 -0
  69. data/mlx/mlx/backend/cpu/CMakeLists.txt +88 -0
  70. data/mlx/mlx/backend/cpu/arange.h +28 -0
  71. data/mlx/mlx/backend/cpu/arg_reduce.cpp +124 -0
  72. data/mlx/mlx/backend/cpu/binary.cpp +269 -0
  73. data/mlx/mlx/backend/cpu/binary.h +517 -0
  74. data/mlx/mlx/backend/cpu/binary_ops.h +98 -0
  75. data/mlx/mlx/backend/cpu/binary_two.h +166 -0
  76. data/mlx/mlx/backend/cpu/cholesky.cpp +85 -0
  77. data/mlx/mlx/backend/cpu/compiled.cpp +357 -0
  78. data/mlx/mlx/backend/cpu/compiled_preamble.h +12 -0
  79. data/mlx/mlx/backend/cpu/conv.cpp +1351 -0
  80. data/mlx/mlx/backend/cpu/copy.cpp +386 -0
  81. data/mlx/mlx/backend/cpu/copy.h +36 -0
  82. data/mlx/mlx/backend/cpu/device_info.cpp +113 -0
  83. data/mlx/mlx/backend/cpu/device_info.h +28 -0
  84. data/mlx/mlx/backend/cpu/distributed.cpp +103 -0
  85. data/mlx/mlx/backend/cpu/eig.cpp +281 -0
  86. data/mlx/mlx/backend/cpu/eigh.cpp +241 -0
  87. data/mlx/mlx/backend/cpu/encoder.cpp +16 -0
  88. data/mlx/mlx/backend/cpu/encoder.h +67 -0
  89. data/mlx/mlx/backend/cpu/eval.cpp +40 -0
  90. data/mlx/mlx/backend/cpu/eval.h +12 -0
  91. data/mlx/mlx/backend/cpu/fft.cpp +120 -0
  92. data/mlx/mlx/backend/cpu/gemm.h +26 -0
  93. data/mlx/mlx/backend/cpu/gemms/bnns.cpp +214 -0
  94. data/mlx/mlx/backend/cpu/gemms/cblas.cpp +134 -0
  95. data/mlx/mlx/backend/cpu/gemms/simd_bf16.cpp +45 -0
  96. data/mlx/mlx/backend/cpu/gemms/simd_fp16.cpp +45 -0
  97. data/mlx/mlx/backend/cpu/gemms/simd_gemm.h +139 -0
  98. data/mlx/mlx/backend/cpu/hadamard.cpp +121 -0
  99. data/mlx/mlx/backend/cpu/indexing.cpp +854 -0
  100. data/mlx/mlx/backend/cpu/inverse.cpp +160 -0
  101. data/mlx/mlx/backend/cpu/jit_compiler.cpp +166 -0
  102. data/mlx/mlx/backend/cpu/jit_compiler.h +20 -0
  103. data/mlx/mlx/backend/cpu/lapack.h +80 -0
  104. data/mlx/mlx/backend/cpu/logsumexp.cpp +139 -0
  105. data/mlx/mlx/backend/cpu/luf.cpp +120 -0
  106. data/mlx/mlx/backend/cpu/make_compiled_preamble.ps1 +38 -0
  107. data/mlx/mlx/backend/cpu/make_compiled_preamble.sh +41 -0
  108. data/mlx/mlx/backend/cpu/masked_mm.cpp +608 -0
  109. data/mlx/mlx/backend/cpu/matmul.cpp +166 -0
  110. data/mlx/mlx/backend/cpu/primitives.cpp +478 -0
  111. data/mlx/mlx/backend/cpu/qrf.cpp +147 -0
  112. data/mlx/mlx/backend/cpu/quantized.cpp +1370 -0
  113. data/mlx/mlx/backend/cpu/reduce.cpp +587 -0
  114. data/mlx/mlx/backend/cpu/scan.cpp +338 -0
  115. data/mlx/mlx/backend/cpu/select.cpp +95 -0
  116. data/mlx/mlx/backend/cpu/simd/accelerate_fp16_simd.h +56 -0
  117. data/mlx/mlx/backend/cpu/simd/accelerate_simd.h +329 -0
  118. data/mlx/mlx/backend/cpu/simd/base_simd.h +319 -0
  119. data/mlx/mlx/backend/cpu/simd/math.h +193 -0
  120. data/mlx/mlx/backend/cpu/simd/neon_fp16_simd.h +212 -0
  121. data/mlx/mlx/backend/cpu/simd/simd.h +4 -0
  122. data/mlx/mlx/backend/cpu/simd/type.h +11 -0
  123. data/mlx/mlx/backend/cpu/slicing.h +21 -0
  124. data/mlx/mlx/backend/cpu/softmax.cpp +170 -0
  125. data/mlx/mlx/backend/cpu/sort.cpp +481 -0
  126. data/mlx/mlx/backend/cpu/svd.cpp +289 -0
  127. data/mlx/mlx/backend/cpu/ternary.h +154 -0
  128. data/mlx/mlx/backend/cpu/threefry.cpp +31 -0
  129. data/mlx/mlx/backend/cpu/threefry.h +21 -0
  130. data/mlx/mlx/backend/cpu/unary.cpp +238 -0
  131. data/mlx/mlx/backend/cpu/unary.h +281 -0
  132. data/mlx/mlx/backend/cpu/unary_ops.h +175 -0
  133. data/mlx/mlx/backend/cuda/CMakeLists.txt +265 -0
  134. data/mlx/mlx/backend/cuda/allocator.cpp +451 -0
  135. data/mlx/mlx/backend/cuda/allocator.h +94 -0
  136. data/mlx/mlx/backend/cuda/arange.cu +68 -0
  137. data/mlx/mlx/backend/cuda/arg_reduce.cu +189 -0
  138. data/mlx/mlx/backend/cuda/bin2h.cmake +150 -0
  139. data/mlx/mlx/backend/cuda/binary/CMakeLists.txt +21 -0
  140. data/mlx/mlx/backend/cuda/binary/add.cu +7 -0
  141. data/mlx/mlx/backend/cuda/binary/arctan2.cu +7 -0
  142. data/mlx/mlx/backend/cuda/binary/binary.cuh +383 -0
  143. data/mlx/mlx/backend/cuda/binary/bitwise_binary.cu +27 -0
  144. data/mlx/mlx/backend/cuda/binary/divide.cu +7 -0
  145. data/mlx/mlx/backend/cuda/binary/equal.cu +15 -0
  146. data/mlx/mlx/backend/cuda/binary/greater.cu +7 -0
  147. data/mlx/mlx/backend/cuda/binary/greater_equal.cu +7 -0
  148. data/mlx/mlx/backend/cuda/binary/less.cu +7 -0
  149. data/mlx/mlx/backend/cuda/binary/less_equal.cu +7 -0
  150. data/mlx/mlx/backend/cuda/binary/log_add_exp.cu +7 -0
  151. data/mlx/mlx/backend/cuda/binary/logical_and.cu +7 -0
  152. data/mlx/mlx/backend/cuda/binary/logical_or.cu +7 -0
  153. data/mlx/mlx/backend/cuda/binary/maximum.cu +7 -0
  154. data/mlx/mlx/backend/cuda/binary/minimum.cu +7 -0
  155. data/mlx/mlx/backend/cuda/binary/multiply.cu +7 -0
  156. data/mlx/mlx/backend/cuda/binary/not_equal.cu +7 -0
  157. data/mlx/mlx/backend/cuda/binary/power.cu +7 -0
  158. data/mlx/mlx/backend/cuda/binary/remainder.cu +7 -0
  159. data/mlx/mlx/backend/cuda/binary/subtract.cu +7 -0
  160. data/mlx/mlx/backend/cuda/binary_two.cu +412 -0
  161. data/mlx/mlx/backend/cuda/compiled.cpp +357 -0
  162. data/mlx/mlx/backend/cuda/conv/conv.h +126 -0
  163. data/mlx/mlx/backend/cuda/conv/gemm_conv.cu +217 -0
  164. data/mlx/mlx/backend/cuda/conv/gemm_grouped_conv.cu +231 -0
  165. data/mlx/mlx/backend/cuda/conv.cpp +403 -0
  166. data/mlx/mlx/backend/cuda/copy/copy.cuh +55 -0
  167. data/mlx/mlx/backend/cuda/copy/copy_contiguous.cu +88 -0
  168. data/mlx/mlx/backend/cuda/copy/copy_general.cu +171 -0
  169. data/mlx/mlx/backend/cuda/copy/copy_general_dynamic.cu +118 -0
  170. data/mlx/mlx/backend/cuda/copy/copy_general_input.cu +229 -0
  171. data/mlx/mlx/backend/cuda/copy.cu +132 -0
  172. data/mlx/mlx/backend/cuda/cublas_utils.cpp +222 -0
  173. data/mlx/mlx/backend/cuda/cublas_utils.h +95 -0
  174. data/mlx/mlx/backend/cuda/cuda.h +21 -0
  175. data/mlx/mlx/backend/cuda/cuda_utils.h +90 -0
  176. data/mlx/mlx/backend/cuda/cudnn_utils.cpp +133 -0
  177. data/mlx/mlx/backend/cuda/cudnn_utils.h +187 -0
  178. data/mlx/mlx/backend/cuda/custom_kernel.cpp +379 -0
  179. data/mlx/mlx/backend/cuda/cutlass_utils.cuh +46 -0
  180. data/mlx/mlx/backend/cuda/delayload.cpp +80 -0
  181. data/mlx/mlx/backend/cuda/device/atomic_ops.cuh +63 -0
  182. data/mlx/mlx/backend/cuda/device/binary_ops.cuh +300 -0
  183. data/mlx/mlx/backend/cuda/device/cast_op.cuh +118 -0
  184. data/mlx/mlx/backend/cuda/device/complex.cuh +60 -0
  185. data/mlx/mlx/backend/cuda/device/config.h +12 -0
  186. data/mlx/mlx/backend/cuda/device/fp16_math.cuh +96 -0
  187. data/mlx/mlx/backend/cuda/device/gather.cuh +53 -0
  188. data/mlx/mlx/backend/cuda/device/gather_axis.cuh +65 -0
  189. data/mlx/mlx/backend/cuda/device/indexing.cuh +30 -0
  190. data/mlx/mlx/backend/cuda/device/scatter.cuh +68 -0
  191. data/mlx/mlx/backend/cuda/device/scatter_axis.cuh +67 -0
  192. data/mlx/mlx/backend/cuda/device/scatter_ops.cuh +44 -0
  193. data/mlx/mlx/backend/cuda/device/ternary_ops.cuh +13 -0
  194. data/mlx/mlx/backend/cuda/device/unary_ops.cuh +350 -0
  195. data/mlx/mlx/backend/cuda/device/utils.cuh +464 -0
  196. data/mlx/mlx/backend/cuda/device.cpp +522 -0
  197. data/mlx/mlx/backend/cuda/device.h +195 -0
  198. data/mlx/mlx/backend/cuda/device_info.cpp +232 -0
  199. data/mlx/mlx/backend/cuda/distributed.cu +121 -0
  200. data/mlx/mlx/backend/cuda/eval.cpp +66 -0
  201. data/mlx/mlx/backend/cuda/event.cu +415 -0
  202. data/mlx/mlx/backend/cuda/event.h +79 -0
  203. data/mlx/mlx/backend/cuda/fence.cpp +42 -0
  204. data/mlx/mlx/backend/cuda/gemms/cublas_gemm.cpp +233 -0
  205. data/mlx/mlx/backend/cuda/gemms/cublas_gemm.h +114 -0
  206. data/mlx/mlx/backend/cuda/gemms/cublas_gemm_batched_12_0.cpp +77 -0
  207. data/mlx/mlx/backend/cuda/gemms/cublas_gemm_batched_12_9.cu +329 -0
  208. data/mlx/mlx/backend/cuda/gemms/gemv.cu +327 -0
  209. data/mlx/mlx/backend/cuda/gemms/gemv.h +34 -0
  210. data/mlx/mlx/backend/cuda/gemms/grouped_gemm.h +25 -0
  211. data/mlx/mlx/backend/cuda/gemms/grouped_gemm_unaligned.cu +358 -0
  212. data/mlx/mlx/backend/cuda/indexing.cpp +434 -0
  213. data/mlx/mlx/backend/cuda/jit_module.cpp +443 -0
  214. data/mlx/mlx/backend/cuda/jit_module.h +120 -0
  215. data/mlx/mlx/backend/cuda/kernel_utils.cu +52 -0
  216. data/mlx/mlx/backend/cuda/kernel_utils.cuh +148 -0
  217. data/mlx/mlx/backend/cuda/layer_norm.cu +417 -0
  218. data/mlx/mlx/backend/cuda/load.cpp +60 -0
  219. data/mlx/mlx/backend/cuda/logsumexp.cu +161 -0
  220. data/mlx/mlx/backend/cuda/lru_cache.h +190 -0
  221. data/mlx/mlx/backend/cuda/matmul.cpp +373 -0
  222. data/mlx/mlx/backend/cuda/no_cuda.cpp +47 -0
  223. data/mlx/mlx/backend/cuda/primitives.cpp +46 -0
  224. data/mlx/mlx/backend/cuda/quantized/affine_quantize.cu +329 -0
  225. data/mlx/mlx/backend/cuda/quantized/convert_fp8.cu +19 -0
  226. data/mlx/mlx/backend/cuda/quantized/cublas_qqmm.cpp +206 -0
  227. data/mlx/mlx/backend/cuda/quantized/cublas_qqmm.h +88 -0
  228. data/mlx/mlx/backend/cuda/quantized/cuda_fp4.h +100 -0
  229. data/mlx/mlx/backend/cuda/quantized/fp_quantize.cu +496 -0
  230. data/mlx/mlx/backend/cuda/quantized/mxfp8_quantize.cuh +32 -0
  231. data/mlx/mlx/backend/cuda/quantized/no_qqmm_impl.cpp +26 -0
  232. data/mlx/mlx/backend/cuda/quantized/nvfp4_quantize.cuh +334 -0
  233. data/mlx/mlx/backend/cuda/quantized/qmv.cu +304 -0
  234. data/mlx/mlx/backend/cuda/quantized/qmv.h +21 -0
  235. data/mlx/mlx/backend/cuda/quantized/qqmm.cpp +158 -0
  236. data/mlx/mlx/backend/cuda/quantized/qqmm_impl.cpp +50 -0
  237. data/mlx/mlx/backend/cuda/quantized/qqmm_impl.h +26 -0
  238. data/mlx/mlx/backend/cuda/quantized/qqmm_utils.cu +227 -0
  239. data/mlx/mlx/backend/cuda/quantized/qqmm_utils.h +30 -0
  240. data/mlx/mlx/backend/cuda/quantized/quantized.cpp +85 -0
  241. data/mlx/mlx/backend/cuda/quantized/quantized.h +53 -0
  242. data/mlx/mlx/backend/cuda/quantized/quantized_utils.cuh +88 -0
  243. data/mlx/mlx/backend/cuda/quantized/quantized_utils.h +50 -0
  244. data/mlx/mlx/backend/cuda/random.cu +202 -0
  245. data/mlx/mlx/backend/cuda/reduce/all_reduce.cu +159 -0
  246. data/mlx/mlx/backend/cuda/reduce/col_reduce.cu +510 -0
  247. data/mlx/mlx/backend/cuda/reduce/init_reduce.cu +50 -0
  248. data/mlx/mlx/backend/cuda/reduce/reduce.cuh +71 -0
  249. data/mlx/mlx/backend/cuda/reduce/reduce_ops.cuh +211 -0
  250. data/mlx/mlx/backend/cuda/reduce/reduce_utils.cuh +145 -0
  251. data/mlx/mlx/backend/cuda/reduce/row_reduce.cu +361 -0
  252. data/mlx/mlx/backend/cuda/reduce.cu +73 -0
  253. data/mlx/mlx/backend/cuda/rms_norm.cu +536 -0
  254. data/mlx/mlx/backend/cuda/rope.cu +429 -0
  255. data/mlx/mlx/backend/cuda/scaled_dot_product_attention.cpp +681 -0
  256. data/mlx/mlx/backend/cuda/scaled_dot_product_attention.cu +796 -0
  257. data/mlx/mlx/backend/cuda/scan.cu +468 -0
  258. data/mlx/mlx/backend/cuda/slicing.cpp +111 -0
  259. data/mlx/mlx/backend/cuda/softmax.cu +162 -0
  260. data/mlx/mlx/backend/cuda/sort.cu +1076 -0
  261. data/mlx/mlx/backend/cuda/steel/defines.cuh +9 -0
  262. data/mlx/mlx/backend/cuda/steel/gemm.cuh +101 -0
  263. data/mlx/mlx/backend/cuda/steel/mma.cuh +117 -0
  264. data/mlx/mlx/backend/cuda/steel/tiles.cuh +450 -0
  265. data/mlx/mlx/backend/cuda/steel/utils.cuh +89 -0
  266. data/mlx/mlx/backend/cuda/ternary.cu +271 -0
  267. data/mlx/mlx/backend/cuda/unary/CMakeLists.txt +34 -0
  268. data/mlx/mlx/backend/cuda/unary/abs.cu +7 -0
  269. data/mlx/mlx/backend/cuda/unary/arccos.cu +7 -0
  270. data/mlx/mlx/backend/cuda/unary/arccosh.cu +7 -0
  271. data/mlx/mlx/backend/cuda/unary/arcsin.cu +7 -0
  272. data/mlx/mlx/backend/cuda/unary/arcsinh.cu +7 -0
  273. data/mlx/mlx/backend/cuda/unary/arctan.cu +7 -0
  274. data/mlx/mlx/backend/cuda/unary/arctanh.cu +7 -0
  275. data/mlx/mlx/backend/cuda/unary/bitwise_invert.cu +7 -0
  276. data/mlx/mlx/backend/cuda/unary/ceil.cu +7 -0
  277. data/mlx/mlx/backend/cuda/unary/conjugate.cu +7 -0
  278. data/mlx/mlx/backend/cuda/unary/cos.cu +7 -0
  279. data/mlx/mlx/backend/cuda/unary/cosh.cu +7 -0
  280. data/mlx/mlx/backend/cuda/unary/erf.cu +7 -0
  281. data/mlx/mlx/backend/cuda/unary/erf_inv.cu +7 -0
  282. data/mlx/mlx/backend/cuda/unary/exp.cu +7 -0
  283. data/mlx/mlx/backend/cuda/unary/expm1.cu +7 -0
  284. data/mlx/mlx/backend/cuda/unary/floor.cu +7 -0
  285. data/mlx/mlx/backend/cuda/unary/imag.cu +7 -0
  286. data/mlx/mlx/backend/cuda/unary/log.cu +21 -0
  287. data/mlx/mlx/backend/cuda/unary/log1p.cu +7 -0
  288. data/mlx/mlx/backend/cuda/unary/logical_not.cu +7 -0
  289. data/mlx/mlx/backend/cuda/unary/negative.cu +7 -0
  290. data/mlx/mlx/backend/cuda/unary/real.cu +7 -0
  291. data/mlx/mlx/backend/cuda/unary/round.cu +18 -0
  292. data/mlx/mlx/backend/cuda/unary/sigmoid.cu +7 -0
  293. data/mlx/mlx/backend/cuda/unary/sign.cu +7 -0
  294. data/mlx/mlx/backend/cuda/unary/sin.cu +7 -0
  295. data/mlx/mlx/backend/cuda/unary/sinh.cu +7 -0
  296. data/mlx/mlx/backend/cuda/unary/sqrt.cu +15 -0
  297. data/mlx/mlx/backend/cuda/unary/square.cu +7 -0
  298. data/mlx/mlx/backend/cuda/unary/tan.cu +7 -0
  299. data/mlx/mlx/backend/cuda/unary/tanh.cu +7 -0
  300. data/mlx/mlx/backend/cuda/unary/unary.cuh +224 -0
  301. data/mlx/mlx/backend/cuda/utils.cpp +116 -0
  302. data/mlx/mlx/backend/cuda/utils.h +49 -0
  303. data/mlx/mlx/backend/cuda/vector_types.cuh +48 -0
  304. data/mlx/mlx/backend/cuda/worker.cpp +79 -0
  305. data/mlx/mlx/backend/cuda/worker.h +55 -0
  306. data/mlx/mlx/backend/gpu/CMakeLists.txt +5 -0
  307. data/mlx/mlx/backend/gpu/copy.cpp +89 -0
  308. data/mlx/mlx/backend/gpu/copy.h +57 -0
  309. data/mlx/mlx/backend/gpu/device_info.h +36 -0
  310. data/mlx/mlx/backend/gpu/eval.h +18 -0
  311. data/mlx/mlx/backend/gpu/primitives.cpp +307 -0
  312. data/mlx/mlx/backend/gpu/slicing.cpp +44 -0
  313. data/mlx/mlx/backend/gpu/slicing.h +36 -0
  314. data/mlx/mlx/backend/metal/CMakeLists.txt +144 -0
  315. data/mlx/mlx/backend/metal/allocator.cpp +279 -0
  316. data/mlx/mlx/backend/metal/allocator.h +79 -0
  317. data/mlx/mlx/backend/metal/binary.cpp +257 -0
  318. data/mlx/mlx/backend/metal/binary.h +33 -0
  319. data/mlx/mlx/backend/metal/compiled.cpp +471 -0
  320. data/mlx/mlx/backend/metal/conv.cpp +1118 -0
  321. data/mlx/mlx/backend/metal/copy.cpp +235 -0
  322. data/mlx/mlx/backend/metal/custom_kernel.cpp +430 -0
  323. data/mlx/mlx/backend/metal/device.cpp +816 -0
  324. data/mlx/mlx/backend/metal/device.h +289 -0
  325. data/mlx/mlx/backend/metal/device_info.cpp +58 -0
  326. data/mlx/mlx/backend/metal/distributed.cpp +38 -0
  327. data/mlx/mlx/backend/metal/eval.cpp +97 -0
  328. data/mlx/mlx/backend/metal/event.cpp +62 -0
  329. data/mlx/mlx/backend/metal/fence.cpp +162 -0
  330. data/mlx/mlx/backend/metal/fft.cpp +807 -0
  331. data/mlx/mlx/backend/metal/hadamard.cpp +198 -0
  332. data/mlx/mlx/backend/metal/indexing.cpp +727 -0
  333. data/mlx/mlx/backend/metal/jit/includes.h +58 -0
  334. data/mlx/mlx/backend/metal/jit/indexing.h +76 -0
  335. data/mlx/mlx/backend/metal/jit_kernels.cpp +1118 -0
  336. data/mlx/mlx/backend/metal/kernels/CMakeLists.txt +193 -0
  337. data/mlx/mlx/backend/metal/kernels/arange.h +9 -0
  338. data/mlx/mlx/backend/metal/kernels/arange.metal +20 -0
  339. data/mlx/mlx/backend/metal/kernels/arg_reduce.metal +182 -0
  340. data/mlx/mlx/backend/metal/kernels/atomic.h +345 -0
  341. data/mlx/mlx/backend/metal/kernels/bf16.h +16 -0
  342. data/mlx/mlx/backend/metal/kernels/bf16_math.h +380 -0
  343. data/mlx/mlx/backend/metal/kernels/binary.h +199 -0
  344. data/mlx/mlx/backend/metal/kernels/binary.metal +109 -0
  345. data/mlx/mlx/backend/metal/kernels/binary_ops.h +330 -0
  346. data/mlx/mlx/backend/metal/kernels/binary_two.h +244 -0
  347. data/mlx/mlx/backend/metal/kernels/binary_two.metal +54 -0
  348. data/mlx/mlx/backend/metal/kernels/cexpf.h +134 -0
  349. data/mlx/mlx/backend/metal/kernels/complex.h +173 -0
  350. data/mlx/mlx/backend/metal/kernels/conv.metal +701 -0
  351. data/mlx/mlx/backend/metal/kernels/copy.h +276 -0
  352. data/mlx/mlx/backend/metal/kernels/copy.metal +75 -0
  353. data/mlx/mlx/backend/metal/kernels/defines.h +24 -0
  354. data/mlx/mlx/backend/metal/kernels/erf.h +69 -0
  355. data/mlx/mlx/backend/metal/kernels/expm1f.h +90 -0
  356. data/mlx/mlx/backend/metal/kernels/fence.metal +52 -0
  357. data/mlx/mlx/backend/metal/kernels/fft/radix.h +328 -0
  358. data/mlx/mlx/backend/metal/kernels/fft/readwrite.h +624 -0
  359. data/mlx/mlx/backend/metal/kernels/fft.h +486 -0
  360. data/mlx/mlx/backend/metal/kernels/fft.metal +67 -0
  361. data/mlx/mlx/backend/metal/kernels/fp4.h +48 -0
  362. data/mlx/mlx/backend/metal/kernels/fp8.h +80 -0
  363. data/mlx/mlx/backend/metal/kernels/fp_quantized.h +1850 -0
  364. data/mlx/mlx/backend/metal/kernels/fp_quantized.metal +153 -0
  365. data/mlx/mlx/backend/metal/kernels/fp_quantized_nax.h +1044 -0
  366. data/mlx/mlx/backend/metal/kernels/fp_quantized_nax.metal +79 -0
  367. data/mlx/mlx/backend/metal/kernels/gemv.metal +868 -0
  368. data/mlx/mlx/backend/metal/kernels/gemv_masked.h +827 -0
  369. data/mlx/mlx/backend/metal/kernels/gemv_masked.metal +76 -0
  370. data/mlx/mlx/backend/metal/kernels/hadamard.h +182 -0
  371. data/mlx/mlx/backend/metal/kernels/indexing/gather.h +51 -0
  372. data/mlx/mlx/backend/metal/kernels/indexing/gather_axis.h +44 -0
  373. data/mlx/mlx/backend/metal/kernels/indexing/gather_front.h +24 -0
  374. data/mlx/mlx/backend/metal/kernels/indexing/indexing.h +23 -0
  375. data/mlx/mlx/backend/metal/kernels/indexing/masked_scatter.h +41 -0
  376. data/mlx/mlx/backend/metal/kernels/indexing/scatter.h +59 -0
  377. data/mlx/mlx/backend/metal/kernels/indexing/scatter_axis.h +52 -0
  378. data/mlx/mlx/backend/metal/kernels/layer_norm.metal +433 -0
  379. data/mlx/mlx/backend/metal/kernels/logging.h +26 -0
  380. data/mlx/mlx/backend/metal/kernels/logsumexp.h +140 -0
  381. data/mlx/mlx/backend/metal/kernels/logsumexp.metal +18 -0
  382. data/mlx/mlx/backend/metal/kernels/quantized.h +2508 -0
  383. data/mlx/mlx/backend/metal/kernels/quantized.metal +144 -0
  384. data/mlx/mlx/backend/metal/kernels/quantized_nax.h +1705 -0
  385. data/mlx/mlx/backend/metal/kernels/quantized_nax.metal +106 -0
  386. data/mlx/mlx/backend/metal/kernels/quantized_utils.h +90 -0
  387. data/mlx/mlx/backend/metal/kernels/random.metal +103 -0
  388. data/mlx/mlx/backend/metal/kernels/reduce.h +5 -0
  389. data/mlx/mlx/backend/metal/kernels/reduce.metal +169 -0
  390. data/mlx/mlx/backend/metal/kernels/reduce_utils.h +6 -0
  391. data/mlx/mlx/backend/metal/kernels/reduction/ops.h +275 -0
  392. data/mlx/mlx/backend/metal/kernels/reduction/reduce_all.h +66 -0
  393. data/mlx/mlx/backend/metal/kernels/reduction/reduce_col.h +398 -0
  394. data/mlx/mlx/backend/metal/kernels/reduction/reduce_init.h +8 -0
  395. data/mlx/mlx/backend/metal/kernels/reduction/reduce_row.h +369 -0
  396. data/mlx/mlx/backend/metal/kernels/rms_norm.metal +391 -0
  397. data/mlx/mlx/backend/metal/kernels/rope.metal +229 -0
  398. data/mlx/mlx/backend/metal/kernels/scaled_dot_product_attention.metal +44 -0
  399. data/mlx/mlx/backend/metal/kernels/scan.h +514 -0
  400. data/mlx/mlx/backend/metal/kernels/scan.metal +109 -0
  401. data/mlx/mlx/backend/metal/kernels/sdpa_vector.h +394 -0
  402. data/mlx/mlx/backend/metal/kernels/softmax.h +190 -0
  403. data/mlx/mlx/backend/metal/kernels/softmax.metal +24 -0
  404. data/mlx/mlx/backend/metal/kernels/sort.h +719 -0
  405. data/mlx/mlx/backend/metal/kernels/sort.metal +80 -0
  406. data/mlx/mlx/backend/metal/kernels/steel/attn/attn.h +296 -0
  407. data/mlx/mlx/backend/metal/kernels/steel/attn/kernels/steel_attention.h +471 -0
  408. data/mlx/mlx/backend/metal/kernels/steel/attn/kernels/steel_attention.metal +27 -0
  409. data/mlx/mlx/backend/metal/kernels/steel/attn/kernels/steel_attention_nax.h +481 -0
  410. data/mlx/mlx/backend/metal/kernels/steel/attn/kernels/steel_attention_nax.metal +28 -0
  411. data/mlx/mlx/backend/metal/kernels/steel/attn/loader.h +264 -0
  412. data/mlx/mlx/backend/metal/kernels/steel/attn/mma.h +750 -0
  413. data/mlx/mlx/backend/metal/kernels/steel/attn/nax.h +1076 -0
  414. data/mlx/mlx/backend/metal/kernels/steel/attn/params.h +44 -0
  415. data/mlx/mlx/backend/metal/kernels/steel/attn/transforms.h +71 -0
  416. data/mlx/mlx/backend/metal/kernels/steel/conv/conv.h +13 -0
  417. data/mlx/mlx/backend/metal/kernels/steel/conv/kernels/steel_conv.h +176 -0
  418. data/mlx/mlx/backend/metal/kernels/steel/conv/kernels/steel_conv.metal +56 -0
  419. data/mlx/mlx/backend/metal/kernels/steel/conv/kernels/steel_conv_general.h +225 -0
  420. data/mlx/mlx/backend/metal/kernels/steel/conv/kernels/steel_conv_general.metal +47 -0
  421. data/mlx/mlx/backend/metal/kernels/steel/conv/loader.h +6 -0
  422. data/mlx/mlx/backend/metal/kernels/steel/conv/loaders/loader_channel_l.h +451 -0
  423. data/mlx/mlx/backend/metal/kernels/steel/conv/loaders/loader_channel_n.h +319 -0
  424. data/mlx/mlx/backend/metal/kernels/steel/conv/loaders/loader_general.h +381 -0
  425. data/mlx/mlx/backend/metal/kernels/steel/conv/params.h +62 -0
  426. data/mlx/mlx/backend/metal/kernels/steel/defines.h +7 -0
  427. data/mlx/mlx/backend/metal/kernels/steel/gemm/gemm.h +295 -0
  428. data/mlx/mlx/backend/metal/kernels/steel/gemm/gemm_nax.h +157 -0
  429. data/mlx/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_fused.h +346 -0
  430. data/mlx/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_fused.metal +34 -0
  431. data/mlx/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_fused_nax.h +219 -0
  432. data/mlx/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_fused_nax.metal +30 -0
  433. data/mlx/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_gather.h +459 -0
  434. data/mlx/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_gather.metal +59 -0
  435. data/mlx/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_gather_nax.h +143 -0
  436. data/mlx/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_gather_nax.metal +37 -0
  437. data/mlx/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_masked.h +719 -0
  438. data/mlx/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_masked.metal +76 -0
  439. data/mlx/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_segmented.h +266 -0
  440. data/mlx/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_segmented.metal +43 -0
  441. data/mlx/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_splitk.h +227 -0
  442. data/mlx/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_splitk.metal +76 -0
  443. data/mlx/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_splitk_nax.h +152 -0
  444. data/mlx/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_splitk_nax.metal +30 -0
  445. data/mlx/mlx/backend/metal/kernels/steel/gemm/loader.h +137 -0
  446. data/mlx/mlx/backend/metal/kernels/steel/gemm/mma.h +1146 -0
  447. data/mlx/mlx/backend/metal/kernels/steel/gemm/nax.h +1084 -0
  448. data/mlx/mlx/backend/metal/kernels/steel/gemm/params.h +65 -0
  449. data/mlx/mlx/backend/metal/kernels/steel/gemm/transforms.h +72 -0
  450. data/mlx/mlx/backend/metal/kernels/steel/utils/integral_constant.h +134 -0
  451. data/mlx/mlx/backend/metal/kernels/steel/utils/type_traits.h +55 -0
  452. data/mlx/mlx/backend/metal/kernels/steel/utils.h +42 -0
  453. data/mlx/mlx/backend/metal/kernels/ternary.h +145 -0
  454. data/mlx/mlx/backend/metal/kernels/ternary.metal +48 -0
  455. data/mlx/mlx/backend/metal/kernels/ternary_ops.h +10 -0
  456. data/mlx/mlx/backend/metal/kernels/unary.h +63 -0
  457. data/mlx/mlx/backend/metal/kernels/unary.metal +115 -0
  458. data/mlx/mlx/backend/metal/kernels/unary_ops.h +454 -0
  459. data/mlx/mlx/backend/metal/kernels/utils.h +445 -0
  460. data/mlx/mlx/backend/metal/kernels.h +375 -0
  461. data/mlx/mlx/backend/metal/logsumexp.cpp +95 -0
  462. data/mlx/mlx/backend/metal/make_compiled_preamble.sh +120 -0
  463. data/mlx/mlx/backend/metal/matmul.cpp +2572 -0
  464. data/mlx/mlx/backend/metal/matmul.h +144 -0
  465. data/mlx/mlx/backend/metal/metal.cpp +50 -0
  466. data/mlx/mlx/backend/metal/metal.h +25 -0
  467. data/mlx/mlx/backend/metal/no_metal.cpp +42 -0
  468. data/mlx/mlx/backend/metal/nojit_kernels.cpp +414 -0
  469. data/mlx/mlx/backend/metal/normalization.cpp +433 -0
  470. data/mlx/mlx/backend/metal/primitives.cpp +242 -0
  471. data/mlx/mlx/backend/metal/quantized.cpp +1651 -0
  472. data/mlx/mlx/backend/metal/reduce.cpp +1038 -0
  473. data/mlx/mlx/backend/metal/reduce.h +41 -0
  474. data/mlx/mlx/backend/metal/resident.cpp +100 -0
  475. data/mlx/mlx/backend/metal/resident.h +32 -0
  476. data/mlx/mlx/backend/metal/rope.cpp +165 -0
  477. data/mlx/mlx/backend/metal/scaled_dot_product_attention.cpp +798 -0
  478. data/mlx/mlx/backend/metal/scan.cpp +145 -0
  479. data/mlx/mlx/backend/metal/scan.h +17 -0
  480. data/mlx/mlx/backend/metal/slicing.cpp +99 -0
  481. data/mlx/mlx/backend/metal/softmax.cpp +87 -0
  482. data/mlx/mlx/backend/metal/sort.cpp +368 -0
  483. data/mlx/mlx/backend/metal/ternary.cpp +160 -0
  484. data/mlx/mlx/backend/metal/ternary.h +21 -0
  485. data/mlx/mlx/backend/metal/unary.cpp +161 -0
  486. data/mlx/mlx/backend/metal/unary.h +21 -0
  487. data/mlx/mlx/backend/metal/utils.cpp +77 -0
  488. data/mlx/mlx/backend/metal/utils.h +99 -0
  489. data/mlx/mlx/backend/no_cpu/CMakeLists.txt +7 -0
  490. data/mlx/mlx/backend/no_cpu/compiled.cpp +24 -0
  491. data/mlx/mlx/backend/no_cpu/device_info.cpp +22 -0
  492. data/mlx/mlx/backend/no_cpu/primitives.cpp +146 -0
  493. data/mlx/mlx/backend/no_gpu/CMakeLists.txt +8 -0
  494. data/mlx/mlx/backend/no_gpu/allocator.cpp +134 -0
  495. data/mlx/mlx/backend/no_gpu/apple_memory.h +16 -0
  496. data/mlx/mlx/backend/no_gpu/device_info.cpp +22 -0
  497. data/mlx/mlx/backend/no_gpu/eval.cpp +24 -0
  498. data/mlx/mlx/backend/no_gpu/event.cpp +53 -0
  499. data/mlx/mlx/backend/no_gpu/fence.cpp +54 -0
  500. data/mlx/mlx/backend/no_gpu/linux_memory.h +22 -0
  501. data/mlx/mlx/backend/no_gpu/primitives.cpp +185 -0
  502. data/mlx/mlx/compile.cpp +1243 -0
  503. data/mlx/mlx/compile.h +45 -0
  504. data/mlx/mlx/compile_impl.h +70 -0
  505. data/mlx/mlx/device.cpp +72 -0
  506. data/mlx/mlx/device.h +56 -0
  507. data/mlx/mlx/distributed/CMakeLists.txt +14 -0
  508. data/mlx/mlx/distributed/distributed.cpp +197 -0
  509. data/mlx/mlx/distributed/distributed.h +61 -0
  510. data/mlx/mlx/distributed/distributed_impl.h +59 -0
  511. data/mlx/mlx/distributed/jaccl/CMakeLists.txt +12 -0
  512. data/mlx/mlx/distributed/jaccl/jaccl.cpp +178 -0
  513. data/mlx/mlx/distributed/jaccl/jaccl.h +12 -0
  514. data/mlx/mlx/distributed/jaccl/mesh.cpp +451 -0
  515. data/mlx/mlx/distributed/jaccl/mesh.h +122 -0
  516. data/mlx/mlx/distributed/jaccl/no_jaccl.cpp +20 -0
  517. data/mlx/mlx/distributed/jaccl/ring.cpp +692 -0
  518. data/mlx/mlx/distributed/jaccl/ring.h +178 -0
  519. data/mlx/mlx/distributed/jaccl/utils.cpp +329 -0
  520. data/mlx/mlx/distributed/jaccl/utils.h +342 -0
  521. data/mlx/mlx/distributed/mpi/CMakeLists.txt +5 -0
  522. data/mlx/mlx/distributed/mpi/mpi.cpp +501 -0
  523. data/mlx/mlx/distributed/mpi/mpi.h +12 -0
  524. data/mlx/mlx/distributed/mpi/mpi_declarations.h +28 -0
  525. data/mlx/mlx/distributed/mpi/no_mpi.cpp +20 -0
  526. data/mlx/mlx/distributed/nccl/CMakeLists.txt +26 -0
  527. data/mlx/mlx/distributed/nccl/nccl.cpp +443 -0
  528. data/mlx/mlx/distributed/nccl/nccl.h +12 -0
  529. data/mlx/mlx/distributed/nccl/nccl_stub/CMakeLists.txt +1 -0
  530. data/mlx/mlx/distributed/nccl/nccl_stub/nccl_stubs.cpp +54 -0
  531. data/mlx/mlx/distributed/nccl/no_nccl.cpp +20 -0
  532. data/mlx/mlx/distributed/ops.cpp +186 -0
  533. data/mlx/mlx/distributed/ops.h +57 -0
  534. data/mlx/mlx/distributed/primitives.cpp +95 -0
  535. data/mlx/mlx/distributed/primitives.h +156 -0
  536. data/mlx/mlx/distributed/reduction_ops.h +38 -0
  537. data/mlx/mlx/distributed/ring/CMakeLists.txt +5 -0
  538. data/mlx/mlx/distributed/ring/no_ring.cpp +20 -0
  539. data/mlx/mlx/distributed/ring/ring.cpp +870 -0
  540. data/mlx/mlx/distributed/ring/ring.h +12 -0
  541. data/mlx/mlx/distributed/utils.cpp +206 -0
  542. data/mlx/mlx/distributed/utils.h +67 -0
  543. data/mlx/mlx/dtype.cpp +197 -0
  544. data/mlx/mlx/dtype.h +116 -0
  545. data/mlx/mlx/dtype_utils.cpp +42 -0
  546. data/mlx/mlx/dtype_utils.h +119 -0
  547. data/mlx/mlx/einsum.cpp +941 -0
  548. data/mlx/mlx/einsum.h +23 -0
  549. data/mlx/mlx/event.h +58 -0
  550. data/mlx/mlx/export.cpp +1130 -0
  551. data/mlx/mlx/export.h +137 -0
  552. data/mlx/mlx/export_impl.h +99 -0
  553. data/mlx/mlx/fast.cpp +941 -0
  554. data/mlx/mlx/fast.h +103 -0
  555. data/mlx/mlx/fast_primitives.h +427 -0
  556. data/mlx/mlx/fence.h +39 -0
  557. data/mlx/mlx/fft.cpp +262 -0
  558. data/mlx/mlx/fft.h +159 -0
  559. data/mlx/mlx/graph_utils.cpp +175 -0
  560. data/mlx/mlx/graph_utils.h +67 -0
  561. data/mlx/mlx/io/CMakeLists.txt +25 -0
  562. data/mlx/mlx/io/gguf.cpp +470 -0
  563. data/mlx/mlx/io/gguf.h +20 -0
  564. data/mlx/mlx/io/gguf_quants.cpp +164 -0
  565. data/mlx/mlx/io/load.cpp +397 -0
  566. data/mlx/mlx/io/load.h +175 -0
  567. data/mlx/mlx/io/no_gguf.cpp +20 -0
  568. data/mlx/mlx/io/no_safetensors.cpp +37 -0
  569. data/mlx/mlx/io/safetensors.cpp +234 -0
  570. data/mlx/mlx/io.h +61 -0
  571. data/mlx/mlx/linalg.cpp +708 -0
  572. data/mlx/mlx/linalg.h +115 -0
  573. data/mlx/mlx/memory.h +80 -0
  574. data/mlx/mlx/mlx.h +25 -0
  575. data/mlx/mlx/ops.cpp +6094 -0
  576. data/mlx/mlx/ops.h +1610 -0
  577. data/mlx/mlx/primitives.cpp +5850 -0
  578. data/mlx/mlx/primitives.h +2525 -0
  579. data/mlx/mlx/random.cpp +492 -0
  580. data/mlx/mlx/random.h +283 -0
  581. data/mlx/mlx/scheduler.cpp +73 -0
  582. data/mlx/mlx/scheduler.h +189 -0
  583. data/mlx/mlx/small_vector.h +540 -0
  584. data/mlx/mlx/stream.h +42 -0
  585. data/mlx/mlx/threadpool.h +133 -0
  586. data/mlx/mlx/transforms.cpp +1065 -0
  587. data/mlx/mlx/transforms.h +231 -0
  588. data/mlx/mlx/transforms_impl.h +88 -0
  589. data/mlx/mlx/types/bf16.h +187 -0
  590. data/mlx/mlx/types/complex.h +113 -0
  591. data/mlx/mlx/types/fp16.h +234 -0
  592. data/mlx/mlx/types/half_types.h +58 -0
  593. data/mlx/mlx/types/limits.h +70 -0
  594. data/mlx/mlx/utils.cpp +302 -0
  595. data/mlx/mlx/utils.h +174 -0
  596. data/mlx/mlx/version.cpp +11 -0
  597. data/mlx/mlx/version.h +22 -0
  598. data/mlx/mlx.pc.in +52 -0
  599. metadata +643 -0
@@ -0,0 +1,719 @@
+ // Copyright © 2023-2024 Apple Inc.
+
+ #define MLX_MTL_CONST static constant constexpr const
+ #define MLX_MTL_LOOP_UNROLL _Pragma("clang loop unroll(full)")
+
+ using namespace metal;
+
+ // Based on GPU merge sort algorithm at
+ // https://github.com/NVIDIA/cccl/tree/main/cub/cub
+
+ ///////////////////////////////////////////////////////////////////////////////
+ // Thread-level sort
+ ///////////////////////////////////////////////////////////////////////////////
+
+ template <typename T>
+ METAL_FUNC void thread_swap(thread T& a, thread T& b) {
+   T w = a;
+   a = b;
+   b = w;
+ }
+
+ template <typename T, typename = void>
+ struct Init {
+   static constexpr constant T v = Limits<T>::max;
+ };
+
+ template <typename T>
+ struct Init<T, metal::enable_if_t<metal::is_floating_point_v<T>>> {
+   static constexpr constant T v = metal::numeric_limits<T>::quiet_NaN();
+ };
+
+ template <typename T>
+ struct LessThan {
+   static constexpr constant T init = Init<T>::v;
+   METAL_FUNC bool operator()(T a, T b) const {
+     if constexpr (
+         metal::is_floating_point_v<T> || metal::is_same_v<T, complex64_t>) {
+       bool an = isnan(a);
+       bool bn = isnan(b);
+       if (an | bn) {
+         return (!an) & bn;
+       }
+     }
+     return a < b;
+   }
+ };
+
+ template <
+     typename ValT,
+     typename IdxT,
+     bool ARG_SORT,
+     short N_PER_THREAD,
+     typename CompareOp>
+ struct ThreadSort {
+   static METAL_FUNC void sort(
+       thread ValT (&vals)[N_PER_THREAD],
+       thread IdxT (&idxs)[N_PER_THREAD]) {
+     CompareOp op;
+     MLX_MTL_LOOP_UNROLL
+     for (short i = 0; i < N_PER_THREAD; ++i) {
+       MLX_MTL_LOOP_UNROLL
+       for (short j = i & 1; j < N_PER_THREAD - 1; j += 2) {
+         if (op(vals[j + 1], vals[j])) {
+           thread_swap(vals[j + 1], vals[j]);
+           if (ARG_SORT) {
+             thread_swap(idxs[j + 1], idxs[j]);
+           }
+         }
+       }
+     }
+   }
+ };
+
+ ///////////////////////////////////////////////////////////////////////////////
+ // Threadgroup-level sort
+ ///////////////////////////////////////////////////////////////////////////////
+
+ template <
+     typename ValT,
+     typename IdxT,
+     bool ARG_SORT,
+     short BLOCK_THREADS,
+     short N_PER_THREAD,
+     typename CompareOp>
+ struct BlockMergeSort {
+   using thread_sort_t =
+       ThreadSort<ValT, IdxT, ARG_SORT, N_PER_THREAD, CompareOp>;
+   static METAL_FUNC int merge_partition(
+       const threadgroup ValT* As,
+       const threadgroup ValT* Bs,
+       short A_sz,
+       short B_sz,
+       short sort_md) {
+     CompareOp op;
+
+     short A_st = max(0, sort_md - B_sz);
+     short A_ed = min(sort_md, A_sz);
+
+     while (A_st < A_ed) {
+       short md = A_st + (A_ed - A_st) / 2;
+       auto a = As[md];
+       auto b = Bs[sort_md - 1 - md];
+
+       if (op(b, a)) {
+         A_ed = md;
+       } else {
+         A_st = md + 1;
+       }
+     }
+
+     return A_ed;
+   }
+
+   static METAL_FUNC void merge_step(
+       const threadgroup ValT* As,
+       const threadgroup ValT* Bs,
+       const threadgroup IdxT* As_idx,
+       const threadgroup IdxT* Bs_idx,
+       short A_sz,
+       short B_sz,
+       thread ValT (&vals)[N_PER_THREAD],
+       thread IdxT (&idxs)[N_PER_THREAD]) {
+     CompareOp op;
+     short a_idx = 0;
+     short b_idx = 0;
+
+     for (int i = 0; i < N_PER_THREAD; ++i) {
+       auto a = (a_idx < A_sz) ? As[a_idx] : ValT(CompareOp::init);
+       auto b = (b_idx < B_sz) ? Bs[b_idx] : ValT(CompareOp::init);
+       bool pred = (b_idx < B_sz) && (a_idx >= A_sz || op(b, a));
+
+       vals[i] = pred ? b : a;
+       if (ARG_SORT) {
+         if (pred) {
+           idxs[i] = Bs_idx[b_idx];
+         } else {
+           idxs[i] = (a_idx < A_sz) ? As_idx[a_idx] : IdxT(0);
+         }
+       }
+
+       b_idx += short(pred);
+       a_idx += short(!pred);
+     }
+   }
+
+   static METAL_FUNC void sort(
+       threadgroup ValT* tgp_vals [[threadgroup(0)]],
+       threadgroup IdxT* tgp_idxs [[threadgroup(1)]],
+       int size_sorted_axis,
+       uint3 lid [[thread_position_in_threadgroup]]) {
+     // Get thread location
+     int idx = lid.x * N_PER_THREAD;
+
+     // Load from shared memory
+     thread ValT thread_vals[N_PER_THREAD];
+     thread IdxT thread_idxs[N_PER_THREAD];
+     for (int i = 0; i < N_PER_THREAD; ++i) {
+       thread_vals[i] = tgp_vals[idx + i];
+       if (ARG_SORT) {
+         thread_idxs[i] = tgp_idxs[idx + i];
+       }
+     }
+
+     // Per thread sort
+     if (idx < size_sorted_axis) {
+       thread_sort_t::sort(thread_vals, thread_idxs);
+     }
+
+     // Do merges using threadgroup memory
+     for (int merge_threads = 2; merge_threads <= BLOCK_THREADS;
+          merge_threads *= 2) {
+       // Update threadgroup memory
+       threadgroup_barrier(mem_flags::mem_threadgroup);
+       for (int i = 0; i < N_PER_THREAD; ++i) {
+         tgp_vals[idx + i] = thread_vals[i];
+         if (ARG_SORT) {
+           tgp_idxs[idx + i] = thread_idxs[i];
+         }
+       }
+       threadgroup_barrier(mem_flags::mem_threadgroup);
+
+       // Find location in merge step
+       int merge_group = lid.x / merge_threads;
+       int merge_lane = lid.x % merge_threads;
+
+       int sort_sz = N_PER_THREAD * merge_threads;
+       int sort_st = N_PER_THREAD * merge_threads * merge_group;
+
+       // As = tgp_vals[A_st:A_ed] is sorted
+       // Bs = tgp_vals[B_st:B_ed] is sorted
+       int A_st = sort_st;
+       int A_ed = sort_st + sort_sz / 2;
+       int B_st = sort_st + sort_sz / 2;
+       int B_ed = sort_st + sort_sz;
+
+       const threadgroup ValT* As = tgp_vals + A_st;
+       const threadgroup ValT* Bs = tgp_vals + B_st;
+       int A_sz = A_ed - A_st;
+       int B_sz = B_ed - B_st;
+
+       // Find a partition of merge elements
+       // Ci = merge(As[partition:], Bs[sort_md - partition:])
+       // of size N_PER_THREAD for each merge lane i
+       // C = [Ci] is sorted
+       int sort_md = N_PER_THREAD * merge_lane;
+       int partition = merge_partition(As, Bs, A_sz, B_sz, sort_md);
+
+       As += partition;
+       Bs += sort_md - partition;
+
+       A_sz -= partition;
+       B_sz -= sort_md - partition;
+
+       const threadgroup IdxT* As_idx =
+           ARG_SORT ? tgp_idxs + A_st + partition : nullptr;
+       const threadgroup IdxT* Bs_idx =
+           ARG_SORT ? tgp_idxs + B_st + sort_md - partition : nullptr;
+
+       // Merge starting at the partition and store results in thread registers
+       merge_step(As, Bs, As_idx, Bs_idx, A_sz, B_sz, thread_vals, thread_idxs);
+     }
+
+     // Write out to shared memory
+     threadgroup_barrier(mem_flags::mem_threadgroup);
+     for (int i = 0; i < N_PER_THREAD; ++i) {
+       tgp_vals[idx + i] = thread_vals[i];
+       if (ARG_SORT) {
+         tgp_idxs[idx + i] = thread_idxs[i];
+       }
+     }
+   }
+ };
+
+ ///////////////////////////////////////////////////////////////////////////////
+ // Kernel sort
+ ///////////////////////////////////////////////////////////////////////////////
+
+ template <
+     typename T,
+     typename U,
+     bool ARG_SORT,
+     short BLOCK_THREADS,
+     short N_PER_THREAD,
+     typename CompareOp = LessThan<T>>
+ struct KernelMergeSort {
+   using ValT = T;
+   using IdxT = uint;
+   using block_merge_sort_t = BlockMergeSort<
+       ValT,
+       IdxT,
+       ARG_SORT,
+       BLOCK_THREADS,
+       N_PER_THREAD,
+       CompareOp>;
+
+   MLX_MTL_CONST short N_PER_BLOCK = BLOCK_THREADS * N_PER_THREAD;
+
+   static METAL_FUNC void block_sort(
+       const device T* inp,
+       device U* out,
+       const constant int& size_sorted_axis,
+       const constant int& in_stride_sorted_axis,
+       const constant int& out_stride_sorted_axis,
+       const constant int& in_stride_segment_axis,
+       const constant int& out_stride_segment_axis,
+       threadgroup ValT* tgp_vals,
+       threadgroup IdxT* tgp_idxs,
+       uint3 tid [[threadgroup_position_in_grid]],
+       uint3 lid [[thread_position_in_threadgroup]]) {
+     // tid.y tells us the segment index
+     inp += tid.y * in_stride_segment_axis;
+     out += tid.y * out_stride_segment_axis;
+
+     // Copy into threadgroup memory
+     for (short i = lid.x; i < N_PER_BLOCK; i += BLOCK_THREADS) {
+       tgp_vals[i] = i < size_sorted_axis ? inp[i * in_stride_sorted_axis]
+                                          : ValT(CompareOp::init);
+       if (ARG_SORT) {
+         tgp_idxs[i] = i;
+       }
+     }
+
+     // Sort elements within the block
+     threadgroup_barrier(mem_flags::mem_threadgroup);
+
+     block_merge_sort_t::sort(tgp_vals, tgp_idxs, size_sorted_axis, lid);
+
+     threadgroup_barrier(mem_flags::mem_threadgroup);
+
+     // Write output
+     for (int i = lid.x; i < size_sorted_axis; i += BLOCK_THREADS) {
+       if (ARG_SORT) {
+         out[i * out_stride_sorted_axis] = tgp_idxs[i];
+       } else {
+         out[i * out_stride_sorted_axis] = tgp_vals[i];
+       }
+     }
+   }
+ };
+
+ template <
+     typename T,
+     typename U,
+     bool ARG_SORT,
+     short BLOCK_THREADS,
+     short N_PER_THREAD>
+ [[kernel, max_total_threads_per_threadgroup(BLOCK_THREADS)]] void block_sort(
+     const device T* inp [[buffer(0)]],
+     device U* out [[buffer(1)]],
+     const constant int& size_sorted_axis [[buffer(2)]],
+     const constant int& in_stride_sorted_axis [[buffer(3)]],
+     const constant int& out_stride_sorted_axis [[buffer(4)]],
+     const constant int& in_stride_segment_axis [[buffer(5)]],
+     const constant int& out_stride_segment_axis [[buffer(6)]],
+     uint3 tid [[threadgroup_position_in_grid]],
+     uint3 lid [[thread_position_in_threadgroup]]) {
+   using sort_kernel =
+       KernelMergeSort<T, U, ARG_SORT, BLOCK_THREADS, N_PER_THREAD>;
+   using ValT = typename sort_kernel::ValT;
+   using IdxT = typename sort_kernel::IdxT;
+
+   if (ARG_SORT) {
+     threadgroup ValT tgp_vals[sort_kernel::N_PER_BLOCK];
+     threadgroup IdxT tgp_idxs[sort_kernel::N_PER_BLOCK];
+     sort_kernel::block_sort(
+         inp,
+         out,
+         size_sorted_axis,
+         in_stride_sorted_axis,
+         out_stride_sorted_axis,
+         in_stride_segment_axis,
+         out_stride_segment_axis,
+         tgp_vals,
+         tgp_idxs,
+         tid,
+         lid);
+   } else {
+     threadgroup ValT tgp_vals[sort_kernel::N_PER_BLOCK];
+     sort_kernel::block_sort(
+         inp,
+         out,
+         size_sorted_axis,
+         in_stride_sorted_axis,
+         out_stride_sorted_axis,
+         in_stride_segment_axis,
+         out_stride_segment_axis,
+         tgp_vals,
+         nullptr,
+         tid,
+         lid);
+   }
+ }
+
+ constant constexpr const int zero_helper = 0;
+
+ template <
+     typename T,
+     typename U,
+     bool ARG_SORT,
+     short BLOCK_THREADS,
+     short N_PER_THREAD>
+ [[kernel, max_total_threads_per_threadgroup(BLOCK_THREADS)]] void block_sort_nc(
+     const device T* inp [[buffer(0)]],
+     device U* out [[buffer(1)]],
+     const constant int& size_sorted_axis [[buffer(2)]],
+     const constant int& in_stride_sorted_axis [[buffer(3)]],
+     const constant int& out_stride_sorted_axis [[buffer(4)]],
+     const constant int& nc_dim [[buffer(5)]],
+     const constant int* nc_shape [[buffer(6)]],
+     const constant int64_t* in_nc_strides [[buffer(7)]],
+     const constant int64_t* out_nc_strides [[buffer(8)]],
+     uint3 tid [[threadgroup_position_in_grid]],
+     uint3 lid [[thread_position_in_threadgroup]]) {
+   using sort_kernel =
+       KernelMergeSort<T, U, ARG_SORT, BLOCK_THREADS, N_PER_THREAD>;
+   using ValT = typename sort_kernel::ValT;
+   using IdxT = typename sort_kernel::IdxT;
+
+   auto in_block_idx = elem_to_loc(tid.y, nc_shape, in_nc_strides, nc_dim);
+   auto out_block_idx = elem_to_loc(tid.y, nc_shape, out_nc_strides, nc_dim);
+   inp += in_block_idx;
+   out += out_block_idx;
+
+   if (ARG_SORT) {
+     threadgroup ValT tgp_vals[sort_kernel::N_PER_BLOCK];
+     threadgroup IdxT tgp_idxs[sort_kernel::N_PER_BLOCK];
+     sort_kernel::block_sort(
+         inp,
+         out,
+         size_sorted_axis,
+         in_stride_sorted_axis,
+         out_stride_sorted_axis,
+         zero_helper,
+         zero_helper,
+         tgp_vals,
+         tgp_idxs,
+         tid,
+         lid);
+   } else {
+     threadgroup ValT tgp_vals[sort_kernel::N_PER_BLOCK];
+     sort_kernel::block_sort(
+         inp,
+         out,
+         size_sorted_axis,
+         in_stride_sorted_axis,
+         out_stride_sorted_axis,
+         zero_helper,
+         zero_helper,
+         tgp_vals,
+         nullptr,
+         tid,
+         lid);
+   }
+ }
+
+ template <
+     typename ValT,
+     typename IdxT,
+     bool ARG_SORT,
+     short BLOCK_THREADS,
+     short N_PER_THREAD,
+     typename CompareOp = LessThan<ValT>>
+ struct KernelMultiBlockMergeSort {
+   using block_merge_sort_t = BlockMergeSort<
+       ValT,
+       IdxT,
+       ARG_SORT,
+       BLOCK_THREADS,
+       N_PER_THREAD,
+       CompareOp>;
+
+   MLX_MTL_CONST short N_PER_BLOCK = BLOCK_THREADS * N_PER_THREAD;
+
+   static METAL_FUNC void block_sort(
+       const device ValT* inp,
+       device ValT* out_vals,
+       device IdxT* out_idxs,
+       const constant int& size_sorted_axis,
+       const constant int& stride_sorted_axis,
+       threadgroup ValT* tgp_vals,
+       threadgroup IdxT* tgp_idxs,
+       uint3 tid [[threadgroup_position_in_grid]],
+       uint3 lid [[thread_position_in_threadgroup]]) {
+     // tid.y tells us the segment index
+     int base_idx = tid.x * N_PER_BLOCK;
+
+     // Copy into threadgroup memory
+     for (short i = lid.x; i < N_PER_BLOCK; i += BLOCK_THREADS) {
+       int idx = base_idx + i;
+       tgp_vals[i] = idx < size_sorted_axis ? inp[idx * stride_sorted_axis]
+                                            : ValT(CompareOp::init);
+       tgp_idxs[i] = idx;
+     }
+
+     // Sort elements within the block
+     threadgroup_barrier(mem_flags::mem_threadgroup);
+
+     block_merge_sort_t::sort(tgp_vals, tgp_idxs, size_sorted_axis, lid);
+
+     threadgroup_barrier(mem_flags::mem_threadgroup);
+
+     // Write output
+     for (int i = lid.x; i < N_PER_BLOCK; i += BLOCK_THREADS) {
+       int idx = base_idx + i;
+       if (idx < size_sorted_axis) {
+         out_vals[idx] = tgp_vals[i];
+         out_idxs[idx] = tgp_idxs[i];
+       }
+     }
+   }
+
+   static METAL_FUNC int merge_partition(
+       const device ValT* As,
+       const device ValT* Bs,
+       int A_sz,
+       int B_sz,
+       int sort_md) {
+     CompareOp op;
+
+     int A_st = max(0, sort_md - B_sz);
+     int A_ed = min(sort_md, A_sz);
+
+     while (A_st < A_ed) {
+       int md = A_st + (A_ed - A_st) / 2;
+       auto a = As[md];
+       auto b = Bs[sort_md - 1 - md];
+
+       if (op(b, a)) {
+         A_ed = md;
+       } else {
+         A_st = md + 1;
+       }
+     }
+
+     return A_ed;
+   }
+ };
+
+ template <
+     typename ValT,
+     typename IdxT,
+     bool ARG_SORT,
+     short BLOCK_THREADS,
+     short N_PER_THREAD>
+ [[kernel, max_total_threads_per_threadgroup(BLOCK_THREADS)]] void mb_block_sort(
+     const device ValT* inp [[buffer(0)]],
+     device ValT* out_vals [[buffer(1)]],
+     device IdxT* out_idxs [[buffer(2)]],
+     const constant int& size_sorted_axis [[buffer(3)]],
+     const constant int& stride_sorted_axis [[buffer(4)]],
+     const constant int& nc_dim [[buffer(5)]],
+     const constant int* nc_shape [[buffer(6)]],
+     const constant int64_t* nc_strides [[buffer(7)]],
+     uint3 tid [[threadgroup_position_in_grid]],
+     uint3 lid [[thread_position_in_threadgroup]]) {
+   using sort_kernel = KernelMultiBlockMergeSort<
+       ValT,
+       IdxT,
+       ARG_SORT,
+       BLOCK_THREADS,
+       N_PER_THREAD>;
+
+   auto block_idx = elem_to_loc(tid.y, nc_shape, nc_strides, nc_dim);
+   inp += block_idx;
+   out_vals += tid.y * size_sorted_axis;
+   out_idxs += tid.y * size_sorted_axis;
+
+   threadgroup ValT tgp_vals[sort_kernel::N_PER_BLOCK];
+   threadgroup IdxT tgp_idxs[sort_kernel::N_PER_BLOCK];
+
+   sort_kernel::block_sort(
+       inp,
+       out_vals,
+       out_idxs,
+       size_sorted_axis,
+       stride_sorted_axis,
+       tgp_vals,
+       tgp_idxs,
+       tid,
+       lid);
+ }
+
+ template <
+     typename ValT,
+     typename IdxT,
+     bool ARG_SORT,
+     short BLOCK_THREADS,
+     short N_PER_THREAD>
+ [[kernel]] void mb_block_partition(
+     device IdxT* block_partitions [[buffer(0)]],
+     const device ValT* dev_vals [[buffer(1)]],
+     const device IdxT* dev_idxs [[buffer(2)]],
+     const constant int& size_sorted_axis [[buffer(3)]],
+     const constant int& merge_tiles [[buffer(4)]],
+     const constant int& n_blocks [[buffer(5)]],
+     uint3 tid [[threadgroup_position_in_grid]],
+     uint3 lid [[thread_position_in_threadgroup]],
+     uint3 tgp_dims [[threads_per_threadgroup]]) {
+   using sort_kernel = KernelMultiBlockMergeSort<
+       ValT,
+       IdxT,
+       ARG_SORT,
+       BLOCK_THREADS,
+       N_PER_THREAD>;
+
+   block_partitions += tid.y * tgp_dims.x;
+   dev_vals += tid.y * size_sorted_axis;
+   dev_idxs += tid.y * size_sorted_axis;
+
+   for (int i = lid.x; i <= n_blocks; i += tgp_dims.x) {
+     // Find location in merge step
+     int merge_group = i / merge_tiles;
+     int merge_lane = i % merge_tiles;
+
+     int sort_sz = sort_kernel::N_PER_BLOCK * merge_tiles;
+     int sort_st = sort_kernel::N_PER_BLOCK * merge_tiles * merge_group;
+
+     int A_st = min(size_sorted_axis, sort_st);
+     int A_ed = min(size_sorted_axis, sort_st + sort_sz / 2);
+     int B_st = A_ed;
+     int B_ed = min(size_sorted_axis, B_st + sort_sz / 2);
+
+     int partition_at = min(B_ed - A_st, sort_kernel::N_PER_BLOCK * merge_lane);
+     int partition = sort_kernel::merge_partition(
+         dev_vals + A_st,
+         dev_vals + B_st,
+         A_ed - A_st,
+         B_ed - B_st,
+         partition_at);
+
+     block_partitions[i] = A_st + partition;
+   }
+ }
+
+ template <
+     typename ValT,
+     typename IdxT,
+     bool ARG_SORT,
+     short BLOCK_THREADS,
+     short N_PER_THREAD,
+     typename CompareOp = LessThan<ValT>>
+ [[kernel, max_total_threads_per_threadgroup(BLOCK_THREADS)]] void
+ mb_block_merge(
+     const device IdxT* block_partitions [[buffer(0)]],
+     const device ValT* dev_vals_in [[buffer(1)]],
+     const device IdxT* dev_idxs_in [[buffer(2)]],
+     device ValT* dev_vals_out [[buffer(3)]],
+     device IdxT* dev_idxs_out [[buffer(4)]],
+     const constant int& size_sorted_axis [[buffer(5)]],
+     const constant int& merge_tiles [[buffer(6)]],
+     const constant int& num_tiles [[buffer(7)]],
+     uint3 tid [[threadgroup_position_in_grid]],
+     uint3 lid [[thread_position_in_threadgroup]]) {
+   using sort_kernel = KernelMultiBlockMergeSort<
+       ValT,
+       IdxT,
+       ARG_SORT,
+       BLOCK_THREADS,
+       N_PER_THREAD,
+       CompareOp>;
+
+   using block_sort_t = typename sort_kernel::block_merge_sort_t;
+
+   block_partitions += tid.y * (num_tiles + 1);
+   dev_vals_in += tid.y * size_sorted_axis;
+   dev_idxs_in += tid.y * size_sorted_axis;
+   dev_vals_out += tid.y * size_sorted_axis;
+   dev_idxs_out += tid.y * size_sorted_axis;
+
+   int block_idx = tid.x;
+   int merge_group = block_idx / merge_tiles;
+   int sort_st = sort_kernel::N_PER_BLOCK * merge_tiles * merge_group;
+   int sort_sz = sort_kernel::N_PER_BLOCK * merge_tiles;
+   int sort_md = sort_kernel::N_PER_BLOCK * block_idx - sort_st;
+
+   int A_st = block_partitions[block_idx + 0];
+   int A_ed = block_partitions[block_idx + 1];
+   int B_st = min(size_sorted_axis, 2 * sort_st + sort_sz / 2 + sort_md - A_st);
+   int B_ed = min(
+       size_sorted_axis,
+       2 * sort_st + sort_sz / 2 + sort_md + sort_kernel::N_PER_BLOCK - A_ed);
+
+   if ((block_idx % merge_tiles) == merge_tiles - 1) {
+     A_ed = min(size_sorted_axis, sort_st + sort_sz / 2);
+     B_ed = min(size_sorted_axis, sort_st + sort_sz);
+   }
+
+   int A_sz = A_ed - A_st;
+   int B_sz = B_ed - B_st;
+
+   // Load from global memory
+   thread ValT thread_vals[N_PER_THREAD];
+   thread IdxT thread_idxs[N_PER_THREAD];
+   for (int i = 0; i < N_PER_THREAD; i++) {
+     int idx = BLOCK_THREADS * i + lid.x;
+     if (idx < (A_sz + B_sz)) {
+       thread_vals[i] = (idx < A_sz) ? dev_vals_in[A_st + idx]
+                                     : dev_vals_in[B_st + idx - A_sz];
+       thread_idxs[i] = (idx < A_sz) ? dev_idxs_in[A_st + idx]
+                                     : dev_idxs_in[B_st + idx - A_sz];
+     } else {
+       thread_vals[i] = CompareOp::init;
+       thread_idxs[i] = 0;
+     }
+   }
+
+   // Write to shared memory
+   threadgroup ValT tgp_vals[sort_kernel::N_PER_BLOCK];
+   threadgroup IdxT tgp_idxs[sort_kernel::N_PER_BLOCK];
+   threadgroup_barrier(mem_flags::mem_threadgroup);
+   for (int i = 0; i < N_PER_THREAD; i++) {
+     int idx = BLOCK_THREADS * i + lid.x;
+     tgp_vals[idx] = thread_vals[i];
+     tgp_idxs[idx] = thread_idxs[i];
+   }
+   threadgroup_barrier(mem_flags::mem_threadgroup);
+
+   // Merge
+   int sort_md_local = min(A_sz + B_sz, N_PER_THREAD * int(lid.x));
+
+   int A_st_local = block_sort_t::merge_partition(
+       tgp_vals, tgp_vals + A_sz, A_sz, B_sz, sort_md_local);
+   int A_ed_local = A_sz;
+
+   int B_st_local = sort_md_local - A_st_local;
+   int B_ed_local = B_sz;
+
+   int A_sz_local = A_ed_local - A_st_local;
+   int B_sz_local = B_ed_local - B_st_local;
+
+   // Do merge
+   block_sort_t::merge_step(
+       tgp_vals + A_st_local,
+       tgp_vals + A_ed_local + B_st_local,
+       tgp_idxs + A_st_local,
+       tgp_idxs + A_ed_local + B_st_local,
+       A_sz_local,
+       B_sz_local,
+       thread_vals,
+       thread_idxs);
+
+   threadgroup_barrier(mem_flags::mem_threadgroup);
+   for (int i = 0; i < N_PER_THREAD; ++i) {
+     int idx = lid.x * N_PER_THREAD;
+     tgp_vals[idx + i] = thread_vals[i];
+     tgp_idxs[idx + i] = thread_idxs[i];
+   }
+
+   threadgroup_barrier(mem_flags::mem_threadgroup);
+   // Write output
+   int base_idx = tid.x * sort_kernel::N_PER_BLOCK;
+   for (int i = lid.x; i < sort_kernel::N_PER_BLOCK; i += BLOCK_THREADS) {
+     int idx = base_idx + i;
+     if (idx < size_sorted_axis) {
+       dev_vals_out[idx] = tgp_vals[i];
+       dev_idxs_out[idx] = tgp_idxs[i];
+     }
+   }
+ }
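
The hunk above is a Metal merge-sort kernel header: an odd-even sort inside each thread (ThreadSort), a threadgroup-wide merge (BlockMergeSort), and multi-threadgroup variants (mb_block_sort, mb_block_partition, mb_block_merge) whose cooperation hinges on the "merge path" search in merge_partition. As a reading aid, here is a minimal host-side C++ sketch of that partition search plus the sequential merge from merge_step; it is not part of the gem, and the sample arrays and the chunk constant (standing in for N_PER_THREAD) are made up for illustration.

// Host-side sketch of the merge-path partition used by the kernels above.
// merge_partition(A, B, diag) returns how many elements of A fall in the
// first `diag` elements of merge(A, B); each lane can then merge its own
// fixed-size chunk independently, so no serial merge across lanes is needed.
#include <algorithm>
#include <cstdio>
#include <vector>

int merge_partition(const std::vector<int>& A, const std::vector<int>& B, int diag) {
  int a_start = std::max(0, diag - static_cast<int>(B.size()));
  int a_end = std::min(diag, static_cast<int>(A.size()));
  while (a_start < a_end) {
    int mid = a_start + (a_end - a_start) / 2;
    if (B[diag - 1 - mid] < A[mid]) { // same comparison as op(b, a) in the kernel
      a_end = mid;
    } else {
      a_start = mid + 1;
    }
  }
  return a_end;
}

int main() {
  std::vector<int> A = {1, 3, 5, 7, 9, 11};
  std::vector<int> B = {2, 4, 6, 8, 10, 12};
  const int chunk = 4; // per-lane work, analogous to N_PER_THREAD
  const int total = static_cast<int>(A.size() + B.size());

  std::vector<int> C;
  for (int diag = 0; diag < total; diag += chunk) {
    // Each lane starts its private merge at (a_idx, b_idx) on the merge path.
    int a_idx = merge_partition(A, B, diag);
    int b_idx = diag - a_idx;
    for (int k = 0; k < chunk && a_idx + b_idx < total; ++k) {
      // Mirror of merge_step: take from B only when it is strictly smaller.
      bool take_b = b_idx < static_cast<int>(B.size()) &&
          (a_idx >= static_cast<int>(A.size()) || B[b_idx] < A[a_idx]);
      C.push_back(take_b ? B[b_idx++] : A[a_idx++]);
    }
  }

  for (int v : C) {
    std::printf("%d ", v); // prints 1 2 3 ... 12
  }
  std::printf("\n");
  return 0;
}

Because every lane can find its starting point with one binary search, the kernels assign each thread (and, in the multi-block path, each threadgroup) a fixed N_PER_THREAD or N_PER_BLOCK slice of the output and merge it independently; mb_block_partition precomputes exactly these split points in global memory before mb_block_merge runs.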