llama_cpp 0.15.4 → 0.16.0

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (147)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +10 -0
  3. data/ext/llama_cpp/extconf.rb +1 -2
  4. data/ext/llama_cpp/llama_cpp.cpp +15 -3
  5. data/lib/llama_cpp/version.rb +2 -2
  6. data/sig/llama_cpp.rbs +13 -1
  7. data/vendor/tmp/llama.cpp/Makefile +62 -35
  8. data/vendor/tmp/llama.cpp/ggml-alloc.c +4 -4
  9. data/vendor/tmp/llama.cpp/ggml-backend.c +5 -5
  10. data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
  11. data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu +47 -0
  12. data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu +34 -0
  13. data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +103 -0
  14. data/vendor/tmp/llama.cpp/ggml-cuda/binbcast.cu +280 -0
  15. data/vendor/tmp/llama.cpp/ggml-cuda/clamp.cu +34 -0
  16. data/vendor/tmp/llama.cpp/ggml-cuda/concat.cu +196 -0
  17. data/vendor/tmp/llama.cpp/ggml-cuda/convert.cu +686 -0
  18. data/vendor/tmp/llama.cpp/ggml-cuda/cpy.cu +490 -0
  19. data/vendor/tmp/llama.cpp/ggml-cuda/diagmask.cu +40 -0
  20. data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +662 -0
  21. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +319 -0
  22. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f32.cu +312 -0
  23. data/vendor/tmp/llama.cpp/ggml-cuda/fattn.cu +345 -0
  24. data/vendor/tmp/llama.cpp/ggml-cuda/getrows.cu +178 -0
  25. data/vendor/tmp/llama.cpp/ggml-cuda/im2col.cu +104 -0
  26. data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +1564 -0
  27. data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +404 -0
  28. data/vendor/tmp/llama.cpp/ggml-cuda/norm.cu +221 -0
  29. data/vendor/tmp/llama.cpp/ggml-cuda/pad.cu +49 -0
  30. data/vendor/tmp/llama.cpp/ggml-cuda/pool2d.cu +94 -0
  31. data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +45 -0
  32. data/vendor/tmp/llama.cpp/ggml-cuda/rope.cu +271 -0
  33. data/vendor/tmp/llama.cpp/ggml-cuda/scale.cu +31 -0
  34. data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +205 -0
  35. data/vendor/tmp/llama.cpp/ggml-cuda/sumrows.cu +40 -0
  36. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +5 -0
  37. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +5 -0
  38. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +5 -0
  39. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +5 -0
  40. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +5 -0
  41. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +5 -0
  42. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +5 -0
  43. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +5 -0
  44. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +5 -0
  45. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +5 -0
  46. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +5 -0
  47. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +5 -0
  48. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +5 -0
  49. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +5 -0
  50. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +5 -0
  51. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +5 -0
  52. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +5 -0
  53. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +5 -0
  54. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +5 -0
  55. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +5 -0
  56. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +5 -0
  57. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +5 -0
  58. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +5 -0
  59. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +5 -0
  60. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +5 -0
  61. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +5 -0
  62. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +5 -0
  63. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +5 -0
  64. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +5 -0
  65. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +5 -0
  66. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +5 -0
  67. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +5 -0
  68. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +5 -0
  69. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +5 -0
  70. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +5 -0
  71. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +5 -0
  72. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +5 -0
  73. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +5 -0
  74. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +5 -0
  75. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +5 -0
  76. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +5 -0
  77. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +5 -0
  78. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +5 -0
  79. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +5 -0
  80. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +5 -0
  81. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +5 -0
  82. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +5 -0
  83. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +5 -0
  84. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +5 -0
  85. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +5 -0
  86. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +5 -0
  87. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +5 -0
  88. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +5 -0
  89. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +5 -0
  90. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +5 -0
  91. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +5 -0
  92. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +5 -0
  93. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +5 -0
  94. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +5 -0
  95. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +5 -0
  96. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +5 -0
  97. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +5 -0
  98. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +5 -0
  99. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +5 -0
  100. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +5 -0
  101. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +5 -0
  102. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +5 -0
  103. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +5 -0
  104. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +5 -0
  105. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +5 -0
  106. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +5 -0
  107. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +5 -0
  108. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +5 -0
  109. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +5 -0
  110. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +5 -0
  111. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +5 -0
  112. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +5 -0
  113. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +5 -0
  114. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +5 -0
  115. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +5 -0
  116. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +5 -0
  117. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +5 -0
  118. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +5 -0
  119. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +5 -0
  120. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +5 -0
  121. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +5 -0
  122. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +10 -0
  123. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +9 -0
  124. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +10 -0
  125. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +10 -0
  126. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +8 -0
  127. data/vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu +47 -0
  128. data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +266 -0
  129. data/vendor/tmp/llama.cpp/ggml-cuda/upscale.cu +51 -0
  130. data/vendor/tmp/llama.cpp/ggml-cuda.cu +8 -6
  131. data/vendor/tmp/llama.cpp/ggml-kompute.cpp +21 -6
  132. data/vendor/tmp/llama.cpp/ggml-metal.h +1 -1
  133. data/vendor/tmp/llama.cpp/ggml-metal.m +34 -24
  134. data/vendor/tmp/llama.cpp/ggml-metal.metal +83 -59
  135. data/vendor/tmp/llama.cpp/ggml-rpc.cpp +2 -2
  136. data/vendor/tmp/llama.cpp/ggml-sycl.cpp +7 -67
  137. data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +99301 -39793
  138. data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +456 -329
  139. data/vendor/tmp/llama.cpp/ggml.c +178 -330
  140. data/vendor/tmp/llama.cpp/ggml.h +9 -28
  141. data/vendor/tmp/llama.cpp/llama.cpp +242 -426
  142. data/vendor/tmp/llama.cpp/llama.h +17 -43
  143. metadata +121 -6
  144. data/vendor/tmp/llama.cpp/ggml-mpi.c +0 -216
  145. data/vendor/tmp/llama.cpp/ggml-mpi.h +0 -39
  146. data/vendor/tmp/llama.cpp/ggml-opencl.cpp +0 -2305
  147. data/vendor/tmp/llama.cpp/ggml-opencl.h +0 -36
data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu
@@ -0,0 +1,404 @@
+ #include "mmvq.cuh"
+ #include "vecdotq.cuh"
+
+ typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs);
+
+ template <int ncols_y, int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
+ #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
+ // tell the compiler to use as many registers as it wants, see nwarps definition below
+ __launch_bounds__((ncols_y <= 4 ? 4 : 2)*WARP_SIZE, 1)
+ #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
+ static __global__ void mul_mat_vec_q(
+     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+     const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst) {
+
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3))
+     constexpr int nwarps = 1;
+     constexpr int rows_per_cuda_block = 1;
+ #else
+     constexpr int nwarps = ncols_y <= 4 ? 4 : 2;
+     constexpr int rows_per_cuda_block = ncols_y == 1 ? 1 : 2;
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3))
+
+     const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
+     const int row0 = rows_per_cuda_block*blockIdx.x;
+     const int blocks_per_row_x = ncols_x / qk;
+     const int blocks_per_col_y = nrows_y / QK8_1;
+     constexpr int blocks_per_iter = vdr * nwarps*WARP_SIZE / qi;
+
+     // partial sum for each thread
+     float tmp[ncols_y][rows_per_cuda_block] = {0.0f};
+
+     const block_q_t * x = (const block_q_t *) vx;
+     const block_q8_1 * y = (const block_q8_1 *) vy;
+
+     for (int kbx = tid / (qi/vdr); kbx < blocks_per_row_x; kbx += blocks_per_iter) {
+         const int kby = kbx * (qk/QK8_1); // y block index that aligns with kbx
+
+         // x block quant index when casting the quants to int
+         const int kqs = vdr * (tid % (qi/vdr));
+
+ #pragma unroll
+         for (int j = 0; j < ncols_y; ++j) {
+ #pragma unroll
+             for (int i = 0; i < rows_per_cuda_block; ++i) {
+                 tmp[j][i] += vec_dot_q_cuda(
+                     &x[kbx + (row0 + i)*blocks_per_row_x], &y[j*blocks_per_col_y + kby], kqs);
+             }
+         }
+     }
+
+     __shared__ float tmp_shared[nwarps-1 > 0 ? nwarps-1 : 1][ncols_y][rows_per_cuda_block][WARP_SIZE];
+     if (threadIdx.y > 0) {
+ #pragma unroll
+         for (int j = 0; j < ncols_y; ++j) {
+ #pragma unroll
+             for (int i = 0; i < rows_per_cuda_block; ++i) {
+                 tmp_shared[threadIdx.y-1][j][i][threadIdx.x] = tmp[j][i];
+             }
+         }
+     }
+     __syncthreads();
+     if (threadIdx.y > 0) {
+         return;
+     }
+
+     // sum up partial sums and write back result
+ #pragma unroll
+     for (int j = 0; j < ncols_y; ++j) {
+ #pragma unroll
+         for (int i = 0; i < rows_per_cuda_block; ++i) {
+ #pragma unroll
+             for (int l = 0; l < nwarps-1; ++l) {
+                 tmp[j][i] += tmp_shared[l][j][i][threadIdx.x];
+             }
+             tmp[j][i] = warp_reduce_sum(tmp[j][i]);
+         }
+
+         if (threadIdx.x < rows_per_cuda_block) {
+             dst[j*nrows_dst + row0 + threadIdx.x] = tmp[j][threadIdx.x];
+         }
+     }
+ }
+
+ template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot>
+ static void mul_mat_vec_q_cuda(
+     const void * vx, const void * vy, float * dst,
+     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
+
+     GGML_ASSERT(ncols_x % qk == 0);
+     GGML_ASSERT(ncols_y <= MMVQ_MAX_BATCH_SIZE);
+
+     int id = ggml_cuda_get_device();
+
+     int64_t nwarps = 1;
+     int64_t rows_per_cuda_block = 1;
+
+     if (ggml_cuda_info().devices[id].cc < CC_RDNA2) { // NVIDIA and AMD older than RDNA2
+         switch(ncols_y) {
+             case 1:
+                 nwarps = 4;
+                 rows_per_cuda_block = 1;
+                 break;
+             case 2:
+             case 3:
+             case 4:
+                 nwarps = 4;
+                 rows_per_cuda_block = 2;
+                 break;
+             case 5:
+             case 6:
+             case 7:
+             case 8:
+                 nwarps = 2;
+                 rows_per_cuda_block = 2;
+                 break;
+             default:
+                 GGML_ASSERT(false);
+                 break;
+         }
+     }
+     const int64_t nblocks = (nrows_x + rows_per_cuda_block - 1) / rows_per_cuda_block;
+     const dim3 block_nums(nblocks, 1, 1);
+     const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+     switch (ncols_y) {
+         case 1:
+             mul_mat_vec_q<1, qk, qi, block_q_t, vdr, vec_dot>
+                 <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
+             break;
+         case 2:
+             mul_mat_vec_q<2, qk, qi, block_q_t, vdr, vec_dot>
+                 <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
+             break;
+         case 3:
+             mul_mat_vec_q<3, qk, qi, block_q_t, vdr, vec_dot>
+                 <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
+             break;
+         case 4:
+             mul_mat_vec_q<4, qk, qi, block_q_t, vdr, vec_dot>
+                 <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
+             break;
+         case 5:
+             mul_mat_vec_q<5, qk, qi, block_q_t, vdr, vec_dot>
+                 <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
+             break;
+         case 6:
+             mul_mat_vec_q<6, qk, qi, block_q_t, vdr, vec_dot>
+                 <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
+             break;
+         case 7:
+             mul_mat_vec_q<7, qk, qi, block_q_t, vdr, vec_dot>
+                 <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
+             break;
+         case 8:
+             mul_mat_vec_q<8, qk, qi, block_q_t, vdr, vec_dot>
+                 <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
+             break;
+         default:
+             GGML_ASSERT(false);
+             break;
+     }
+ }
+
+ static void mul_mat_vec_q4_0_q8_1_cuda(
+     const void * vx, const void * vy, float * dst,
+     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
+
+     mul_mat_vec_q_cuda<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>
+         (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
+ }
+
+ static void mul_mat_vec_q4_1_q8_1_cuda(
+     const void * vx, const void * vy, float * dst,
+     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
+
+     mul_mat_vec_q_cuda<QK4_1, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>
+         (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
+ }
+
+ static void mul_mat_vec_q5_0_q8_1_cuda(
+     const void * vx, const void * vy, float * dst,
+     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
+
+     mul_mat_vec_q_cuda<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>
+         (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
+ }
+
+ static void mul_mat_vec_q5_1_q8_1_cuda(
+     const void * vx, const void * vy, float * dst,
+     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
+
+     mul_mat_vec_q_cuda<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>
+         (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
+ }
+
+ static void mul_mat_vec_q8_0_q8_1_cuda(
+     const void * vx, const void * vy, float * dst,
+     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
+
+     mul_mat_vec_q_cuda<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>
+         (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
+ }
+
+ static void mul_mat_vec_q2_K_q8_1_cuda(
+     const void * vx, const void * vy, float * dst,
+     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
+
+     mul_mat_vec_q_cuda<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>
+         (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
+ }
+
+ static void mul_mat_vec_q3_K_q8_1_cuda(
+     const void * vx, const void * vy, float * dst,
+     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
+
+     mul_mat_vec_q_cuda<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>
+         (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
+ }
+
+ static void mul_mat_vec_q4_K_q8_1_cuda(
+     const void * vx, const void * vy, float * dst,
+     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
+
+     mul_mat_vec_q_cuda<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>
+         (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
+ }
+
+ static void mul_mat_vec_q5_K_q8_1_cuda(
+     const void * vx, const void * vy, float * dst,
+     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
+
+     mul_mat_vec_q_cuda<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>
+         (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
+ }
+
+ static void mul_mat_vec_q6_K_q8_1_cuda(
+     const void * vx, const void * vy, float * dst,
+     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
+
+     mul_mat_vec_q_cuda<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>
+         (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
+ }
+
+ static void mul_mat_vec_iq2_xxs_q8_1_cuda(
+     const void * vx, const void * vy, float * dst,
+     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
+
+     mul_mat_vec_q_cuda<QK_K, QI2_XXS, block_iq2_xxs, 1, vec_dot_iq2_xxs_q8_1>
+         (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
+ }
+
+ static void mul_mat_vec_iq2_xs_q8_1_cuda(
+     const void * vx, const void * vy, float * dst,
+     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
+
+     mul_mat_vec_q_cuda<QK_K, QI2_XS, block_iq2_xs, 1, vec_dot_iq2_xs_q8_1>
+         (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
+ }
+
+ static void mul_mat_vec_iq2_s_q8_1_cuda(
+     const void * vx, const void * vy, float * dst,
+     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
+
+     mul_mat_vec_q_cuda<QK_K, QI2_S, block_iq2_s, 1, vec_dot_iq2_s_q8_1>
+         (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
+ }
+
+ static void mul_mat_vec_iq3_xxs_q8_1_cuda(
+     const void * vx, const void * vy, float * dst,
+     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
+
+     mul_mat_vec_q_cuda<QK_K, QI3_XXS, block_iq3_xxs, 1, vec_dot_iq3_xxs_q8_1>
+         (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
+ }
+
+ static void mul_mat_vec_iq1_s_q8_1_cuda(
+     const void * vx, const void * vy, float * dst,
+     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
+
+     mul_mat_vec_q_cuda<QK_K, QI1_S, block_iq1_s, 1, vec_dot_iq1_s_q8_1>
+         (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
+ }
+
+ static void mul_mat_vec_iq1_m_q8_1_cuda(
+     const void * vx, const void * vy, float * dst,
+     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
+
+     mul_mat_vec_q_cuda<QK_K, QI1_S, block_iq1_m, 1, vec_dot_iq1_m_q8_1>
+         (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
+ }
+
+ static void mul_mat_vec_iq4_nl_q8_1_cuda(
+     const void * vx, const void * vy, float * dst,
+     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
+
+     mul_mat_vec_q_cuda<QK4_NL, QI4_NL, block_iq4_nl, VDR_Q4_0_Q8_1_MMVQ, vec_dot_iq4_nl_q8_1>
+         (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
+ }
+
+ static void mul_mat_vec_iq4_xs_q8_1_cuda(
+     const void * vx, const void * vy, float * dst,
+     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
+
+     mul_mat_vec_q_cuda<QK_K, QI4_XS, block_iq4_xs, 1, vec_dot_iq4_xs_q8_1>
+         (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
+ }
+
+ static void mul_mat_vec_iq3_s_q8_1_cuda(
+     const void * vx, const void * vy, float * dst,
+     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
+
+     mul_mat_vec_q_cuda<QK_K, QI3_XS, block_iq3_s, 1, vec_dot_iq3_s_q8_1>
+         (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
+ }
+
+ void ggml_cuda_op_mul_mat_vec_q(
+     ggml_backend_cuda_context & ctx,
+     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
+     const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
+     const int64_t src1_padded_row_size, cudaStream_t stream) {
+
+     const int64_t ne00 = src0->ne[0];
+     const int64_t row_diff = row_high - row_low;
+
+     const int64_t ne10 = src1->ne[0];
+     GGML_ASSERT(ne10 % QK8_1 == 0);
+
+     const int64_t ne0 = dst->ne[0];
+
+     int id = ggml_cuda_get_device();
+
+     // the main device has a larger memory buffer to hold the results from all GPUs
+     // nrows_dst == nrows of the matrix that the kernel writes into
+     const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff;
+
+     switch (src0->type) {
+         case GGML_TYPE_Q4_0:
+             mul_mat_vec_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+             break;
+         case GGML_TYPE_Q4_1:
+             mul_mat_vec_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+             break;
+         case GGML_TYPE_Q5_0:
+             mul_mat_vec_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+             break;
+         case GGML_TYPE_Q5_1:
+             mul_mat_vec_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+             break;
+         case GGML_TYPE_Q8_0:
+             mul_mat_vec_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+             break;
+         case GGML_TYPE_Q2_K:
+             mul_mat_vec_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+             break;
+         case GGML_TYPE_Q3_K:
+             mul_mat_vec_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+             break;
+         case GGML_TYPE_Q4_K:
+             mul_mat_vec_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+             break;
+         case GGML_TYPE_Q5_K:
+             mul_mat_vec_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+             break;
+         case GGML_TYPE_Q6_K:
+             mul_mat_vec_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+             break;
+         case GGML_TYPE_IQ2_XXS:
+             mul_mat_vec_iq2_xxs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+             break;
+         case GGML_TYPE_IQ2_XS:
+             mul_mat_vec_iq2_xs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+             break;
+         case GGML_TYPE_IQ2_S:
+             mul_mat_vec_iq2_s_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+             break;
+         case GGML_TYPE_IQ3_XXS:
+             mul_mat_vec_iq3_xxs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+             break;
+         case GGML_TYPE_IQ1_S:
+             mul_mat_vec_iq1_s_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+             break;
+         case GGML_TYPE_IQ1_M:
+             mul_mat_vec_iq1_m_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+             break;
+         case GGML_TYPE_IQ4_NL:
+             mul_mat_vec_iq4_nl_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+             break;
+         case GGML_TYPE_IQ4_XS:
+             mul_mat_vec_iq4_xs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+             break;
+         case GGML_TYPE_IQ3_S:
+             mul_mat_vec_iq3_s_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+             break;
+         default:
+             GGML_ASSERT(false);
+             break;
+     }
+
+     GGML_UNUSED(src1);
+     GGML_UNUSED(dst);
+     GGML_UNUSED(src1_ddf_i);
+     GGML_UNUSED(src1_ncols);
+     GGML_UNUSED(src1_padded_row_size);
+ }
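
Note on the new ggml-cuda/mmvq.cu above: mul_mat_vec_q_cuda picks its launch geometry from the number of y columns, trading warps per block against rows per block. A minimal host-side sketch of that sizing logic in plain C++ (WARP_SIZE = 32 is an assumption matching NVIDIA hardware, and the name launch_geometry is ours, not part of the package):

    #include <cstdio>

    // Mirrors the non-RDNA2/3 sizing switch in mul_mat_vec_q_cuda.
    static void launch_geometry(int nrows_x, int ncols_y) {
        const int warp_size = 32;            // assumption: NVIDIA warp size
        int nwarps = 1;
        int rows_per_block = 1;
        switch (ncols_y) {
            case 1:                 nwarps = 4; rows_per_block = 1; break;
            case 2: case 3: case 4: nwarps = 4; rows_per_block = 2; break;
            case 5: case 6: case 7:
            case 8:                 nwarps = 2; rows_per_block = 2; break;
            default: return;                 // larger batches are asserted out
        }
        // one block per rows_per_block output rows, rounded up
        const int nblocks = (nrows_x + rows_per_block - 1) / rows_per_block;
        printf("ncols_y=%d -> grid=(%d,1,1) block=(%d,%d,1)\n",
               ncols_y, nblocks, warp_size, nwarps);
    }

    int main() {
        for (int ncols_y = 1; ncols_y <= 8; ++ncols_y) {
            launch_geometry(4096, ncols_y);  // 4096 rows is an arbitrary example
        }
        return 0;
    }

For ncols_y = 1 this gives each output row four cooperating warps; for larger batches two rows share a block, so the loaded quant blocks are reused across output columns.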
data/vendor/tmp/llama.cpp/ggml-cuda/norm.cu
@@ -0,0 +1,221 @@
+ #include "norm.cuh"
+
+ template <int block_size>
+ static __global__ void norm_f32(const float * x, float * dst, const int ncols, const float eps) {
+     const int row = blockIdx.x*blockDim.y + threadIdx.y;
+     const int tid = threadIdx.x;
+
+     float2 mean_var = make_float2(0.f, 0.f);
+
+     for (int col = tid; col < ncols; col += block_size) {
+         const float xi = x[row*ncols + col];
+         mean_var.x += xi;
+         mean_var.y += xi * xi;
+     }
+
+     // sum up partial sums
+     mean_var = warp_reduce_sum(mean_var);
+     if (block_size > WARP_SIZE) {
+         __shared__ float2 s_sum[32];
+         int warp_id = threadIdx.x / WARP_SIZE;
+         int lane_id = threadIdx.x % WARP_SIZE;
+         if (lane_id == 0) {
+             s_sum[warp_id] = mean_var;
+         }
+         __syncthreads();
+         mean_var = s_sum[lane_id];
+         mean_var = warp_reduce_sum(mean_var);
+     }
+
+     const float mean = mean_var.x / ncols;
+     const float var = mean_var.y / ncols - mean * mean;
+     const float inv_std = rsqrtf(var + eps);
+
+     for (int col = tid; col < ncols; col += block_size) {
+         dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_std;
+     }
+ }
+
+ template <int block_size>
+ static __global__ void group_norm_f32(const float * x, float * dst, const int group_size, const int ne_elements, const float eps) {
+     // blockIdx.x: num_groups idx
+     // threadIdx.x: block_size idx
+     int start = blockIdx.x * group_size;
+     int end = start + group_size;
+
+     start += threadIdx.x;
+
+     if (end >= ne_elements) {
+         end = ne_elements;
+     }
+
+     float tmp = 0.0f; // partial sum for thread in warp
+
+     for (int j = start; j < end; j += block_size) {
+         tmp += x[j];
+     }
+
+     tmp = warp_reduce_sum(tmp);
+     if (block_size > WARP_SIZE) {
+         __shared__ float s_sum[32];
+         int warp_id = threadIdx.x / WARP_SIZE;
+         int lane_id = threadIdx.x % WARP_SIZE;
+         if (lane_id == 0) {
+             s_sum[warp_id] = tmp;
+         }
+         __syncthreads();
+         tmp = s_sum[lane_id];
+         tmp = warp_reduce_sum(tmp);
+     }
+
+     float mean = tmp / group_size;
+     tmp = 0.0f;
+
+     for (int j = start; j < end; j += block_size) {
+         float xi = x[j] - mean;
+         dst[j] = xi;
+         tmp += xi * xi;
+     }
+
+     tmp = warp_reduce_sum(tmp);
+     if (block_size > WARP_SIZE) {
+         __shared__ float s_sum[32];
+         int warp_id = threadIdx.x / WARP_SIZE;
+         int lane_id = threadIdx.x % WARP_SIZE;
+         if (lane_id == 0) {
+             s_sum[warp_id] = tmp;
+         }
+         __syncthreads();
+         tmp = s_sum[lane_id];
+         tmp = warp_reduce_sum(tmp);
+     }
+
+     float variance = tmp / group_size;
+     float scale = rsqrtf(variance + eps);
+     for (int j = start; j < end; j += block_size) {
+         dst[j] *= scale;
+     }
+ }
+
+ template <int block_size>
+ static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
+     const int row = blockIdx.x*blockDim.y + threadIdx.y;
+     const int tid = threadIdx.x;
+
+     float tmp = 0.0f; // partial sum for thread in warp
+
+     for (int col = tid; col < ncols; col += block_size) {
+         const float xi = x[row*ncols + col];
+         tmp += xi * xi;
+     }
+
+     // sum up partial sums
+     tmp = warp_reduce_sum(tmp);
+     if (block_size > WARP_SIZE) {
+         __shared__ float s_sum[32];
+         int warp_id = threadIdx.x / WARP_SIZE;
+         int lane_id = threadIdx.x % WARP_SIZE;
+         if (lane_id == 0) {
+             s_sum[warp_id] = tmp;
+         }
+         __syncthreads();
+         tmp = s_sum[lane_id];
+         tmp = warp_reduce_sum(tmp);
+     }
+
+     const float mean = tmp / ncols;
+     const float scale = rsqrtf(mean + eps);
+
+     for (int col = tid; col < ncols; col += block_size) {
+         dst[row*ncols + col] = scale * x[row*ncols + col];
+     }
+ }
+
+ static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
+     GGML_ASSERT(ncols % WARP_SIZE == 0);
+     if (ncols < 1024) {
+         const dim3 block_dims(WARP_SIZE, 1, 1);
+         norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
+     } else {
+         const dim3 block_dims(1024, 1, 1);
+         norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
+     }
+ }
+
+ static void group_norm_f32_cuda(const float * x, float * dst, const int num_groups, const int group_size, const int ne_elements, cudaStream_t stream) {
+     static const float eps = 1e-6f;
+     if (group_size < 1024) {
+         const dim3 block_dims(WARP_SIZE, 1, 1);
+         group_norm_f32<WARP_SIZE><<<num_groups, block_dims, 0, stream>>>(x, dst, group_size, ne_elements, eps);
+     } else {
+         const dim3 block_dims(1024, 1, 1);
+         group_norm_f32<1024><<<num_groups, block_dims, 0, stream>>>(x, dst, group_size, ne_elements, eps);
+     }
+ }
+
+ static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
+     GGML_ASSERT(ncols % WARP_SIZE == 0);
+     if (ncols < 1024) {
+         const dim3 block_dims(WARP_SIZE, 1, 1);
+         rms_norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
+     } else {
+         const dim3 block_dims(1024, 1, 1);
+         rms_norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
+     }
+ }
+
+ void ggml_cuda_op_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+     const ggml_tensor * src0 = dst->src[0];
+     const float * src0_d = (const float *)src0->data;
+     float * dst_d = (float *)dst->data;
+     cudaStream_t stream = ctx.stream();
+
+     GGML_ASSERT(ggml_is_contiguous(src0));
+
+     GGML_ASSERT(src0->type == GGML_TYPE_F32);
+     GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+     const int64_t ne00 = src0->ne[0];
+     const int64_t nrows = ggml_nrows(src0);
+
+     float eps;
+     memcpy(&eps, dst->op_params, sizeof(float));
+
+     norm_f32_cuda(src0_d, dst_d, ne00, nrows, eps, stream);
+ }
+
+ void ggml_cuda_op_group_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+     const ggml_tensor * src0 = dst->src[0];
+     const float * src0_d = (const float *)src0->data;
+     float * dst_d = (float *)dst->data;
+     cudaStream_t stream = ctx.stream();
+
+     GGML_ASSERT(ggml_is_contiguous(src0));
+
+     GGML_ASSERT(src0->type == GGML_TYPE_F32);
+     GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+     int num_groups = dst->op_params[0];
+     int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
+     group_norm_f32_cuda(src0_d, dst_d, num_groups * src0->ne[3], group_size, ggml_nelements(src0), stream);
+ }
+
+ void ggml_cuda_op_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+     const ggml_tensor * src0 = dst->src[0];
+     const float * src0_d = (const float *)src0->data;
+     float * dst_d = (float *)dst->data;
+     cudaStream_t stream = ctx.stream();
+
+     GGML_ASSERT(ggml_is_contiguous(src0));
+
+     GGML_ASSERT(src0->type == GGML_TYPE_F32);
+     GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+     const int64_t ne00 = src0->ne[0];
+     const int64_t nrows = ggml_nrows(src0);
+
+     float eps;
+     memcpy(&eps, dst->op_params, sizeof(float));
+
+     rms_norm_f32_cuda(src0_d, dst_d, ne00, nrows, eps, stream);
+ }
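
For orientation: rms_norm_f32 above computes, per row, dst = x * rsqrt(mean(x^2) + eps), with no mean subtraction, which is what distinguishes it from norm_f32. A scalar CPU reference of the same arithmetic (a sketch; the name rms_norm_ref is ours):

    #include <cmath>
    #include <cstdio>
    #include <vector>

    // Scalar reference for one row: dst[i] = x[i] / sqrt(mean(x^2) + eps)
    static void rms_norm_ref(const float * x, float * dst, int ncols, float eps) {
        float sum_sq = 0.0f;
        for (int i = 0; i < ncols; ++i) {
            sum_sq += x[i] * x[i];
        }
        const float scale = 1.0f / std::sqrt(sum_sq / ncols + eps);
        for (int i = 0; i < ncols; ++i) {
            dst[i] = scale * x[i];
        }
    }

    int main() {
        std::vector<float> x = {1.0f, -2.0f, 3.0f, -4.0f};
        std::vector<float> y(x.size());
        rms_norm_ref(x.data(), y.data(), (int) x.size(), 1e-5f);
        for (float v : y) {
            printf("%f\n", v);
        }
        return 0;
    }

The CUDA kernel parallelizes the same two passes: a strided read accumulating sum(x^2) with a warp reduction (plus a shared-memory step when block_size is 1024), then a strided write applying the scale.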
data/vendor/tmp/llama.cpp/ggml-cuda/pad.cu
@@ -0,0 +1,49 @@
+ #include "pad.cuh"
+
+ static __global__ void pad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02, const int ne03) {
+     // blockIdx.z: idx of ne2*ne3, aka ne02*ne03
+     // blockIdx.y: idx of ne1
+     // blockIdx.x: idx of ne0 / BLOCK_SIZE
+     int nidx = threadIdx.x + blockIdx.x * blockDim.x;
+     if (nidx >= ne0) {
+         return;
+     }
+
+     // operation
+     int offset_dst =
+         nidx +
+         blockIdx.y * ne0 +
+         blockIdx.z * ne0 * gridDim.y;
+     if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02*ne03) {
+         int offset_src =
+             nidx +
+             blockIdx.y * ne00 +
+             blockIdx.z * ne00 * ne01;
+         dst[offset_dst] = x[offset_src];
+     } else {
+         dst[offset_dst] = 0.0f;
+     }
+ }
+
+ static void pad_f32_cuda(const float * x, float * dst,
+     const int ne00, const int ne01, const int ne02, const int ne03,
+     const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) {
+     int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
+     dim3 gridDim(num_blocks, ne1, ne2*ne3);
+     pad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02, ne03);
+ }
+
+ void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+     const ggml_tensor * src0 = dst->src[0];
+     const float * src0_d = (const float *)src0->data;
+     float * dst_d = (float *)dst->data;
+     cudaStream_t stream = ctx.stream();
+
+     GGML_ASSERT(src0->type == GGML_TYPE_F32);
+     GGML_ASSERT(dst->type == GGML_TYPE_F32);
+     GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
+
+     pad_f32_cuda(src0_d, dst_d,
+         src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
+         dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream);
+ }
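
The pad kernel's semantics in one place: destination elements whose indices fall inside the source extent are copied, everything else is zero-filled. A CPU sketch over 3D tensors (the name pad_ref is ours; ggml's ne ordering, innermost dimension first, is kept):

    #include <cstdio>
    #include <vector>

    // Copy x (ne00 x ne01 x ne02) into the corner of dst (ne0 x ne1 x ne2)
    // and zero-fill the rest, mirroring what pad_f32 does per element.
    static void pad_ref(const float * x, float * dst,
                        int ne00, int ne01, int ne02,
                        int ne0,  int ne1,  int ne2) {
        for (int i2 = 0; i2 < ne2; ++i2) {
            for (int i1 = 0; i1 < ne1; ++i1) {
                for (int i0 = 0; i0 < ne0; ++i0) {
                    const bool inside = i0 < ne00 && i1 < ne01 && i2 < ne02;
                    dst[(i2*ne1 + i1)*ne0 + i0] =
                        inside ? x[(i2*ne01 + i1)*ne00 + i0] : 0.0f;
                }
            }
        }
    }

    int main() {
        const float x[2*2] = {1, 2, 3, 4};   // 2x2x1 source
        std::vector<float> dst(3*3, 0.0f);   // padded to 3x3x1
        pad_ref(x, dst.data(), 2, 2, 1, 3, 3, 1);
        for (int i1 = 0; i1 < 3; ++i1) {
            for (int i0 = 0; i0 < 3; ++i0) {
                printf("%g ", dst[i1*3 + i0]);
            }
            printf("\n");
        }
        return 0;
    }

On the GPU the same predicate is evaluated per thread, with one thread per destination element along ne0 and the ne1 and ne2*ne3 extents mapped to the grid's y and z dimensions.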