llama_cpp 0.16.2 → 0.17.0

Files changed (177)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +18 -0
  3. data/README.md +7 -12
  4. data/ext/llama_cpp/extconf.rb +2 -43
  5. data/ext/llama_cpp/llama_cpp.cpp +8 -0
  6. data/lib/llama_cpp/version.rb +3 -3
  7. data/sig/llama_cpp.rbs +3 -0
  8. metadata +2 -171
  9. data/vendor/include/.gitkeep +0 -0
  10. data/vendor/lib/.gitkeep +0 -0
  11. data/vendor/tmp/llama.cpp/LICENSE +0 -21
  12. data/vendor/tmp/llama.cpp/Makefile +0 -1124
  13. data/vendor/tmp/llama.cpp/ggml-alloc.c +0 -1041
  14. data/vendor/tmp/llama.cpp/ggml-alloc.h +0 -76
  15. data/vendor/tmp/llama.cpp/ggml-backend-impl.h +0 -153
  16. data/vendor/tmp/llama.cpp/ggml-backend.c +0 -2225
  17. data/vendor/tmp/llama.cpp/ggml-backend.h +0 -236
  18. data/vendor/tmp/llama.cpp/ggml-blas.cpp +0 -363
  19. data/vendor/tmp/llama.cpp/ggml-blas.h +0 -23
  20. data/vendor/tmp/llama.cpp/ggml-common.h +0 -1805
  21. data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu +0 -47
  22. data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu +0 -34
  23. data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +0 -104
  24. data/vendor/tmp/llama.cpp/ggml-cuda/binbcast.cu +0 -280
  25. data/vendor/tmp/llama.cpp/ggml-cuda/clamp.cu +0 -34
  26. data/vendor/tmp/llama.cpp/ggml-cuda/concat.cu +0 -196
  27. data/vendor/tmp/llama.cpp/ggml-cuda/convert.cu +0 -686
  28. data/vendor/tmp/llama.cpp/ggml-cuda/cpy.cu +0 -490
  29. data/vendor/tmp/llama.cpp/ggml-cuda/diagmask.cu +0 -40
  30. data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +0 -674
  31. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +0 -319
  32. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f32.cu +0 -312
  33. data/vendor/tmp/llama.cpp/ggml-cuda/fattn.cu +0 -345
  34. data/vendor/tmp/llama.cpp/ggml-cuda/getrows.cu +0 -178
  35. data/vendor/tmp/llama.cpp/ggml-cuda/im2col.cu +0 -104
  36. data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +0 -88
  37. data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +0 -419
  38. data/vendor/tmp/llama.cpp/ggml-cuda/norm.cu +0 -221
  39. data/vendor/tmp/llama.cpp/ggml-cuda/pad.cu +0 -49
  40. data/vendor/tmp/llama.cpp/ggml-cuda/pool2d.cu +0 -94
  41. data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +0 -112
  42. data/vendor/tmp/llama.cpp/ggml-cuda/rope.cu +0 -271
  43. data/vendor/tmp/llama.cpp/ggml-cuda/scale.cu +0 -31
  44. data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +0 -206
  45. data/vendor/tmp/llama.cpp/ggml-cuda/sumrows.cu +0 -40
  46. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
  47. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
  48. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
  49. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
  50. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
  51. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
  52. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
  53. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
  54. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
  55. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
  56. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
  57. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
  58. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
  59. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
  60. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
  61. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
  62. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
  63. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
  64. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
  65. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
  66. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
  67. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
  68. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
  69. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
  70. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
  71. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
  72. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
  73. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
  74. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
  75. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
  76. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
  77. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
  78. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
  79. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
  80. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
  81. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
  82. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
  83. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
  84. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
  85. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
  86. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
  87. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
  88. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
  89. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
  90. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
  91. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
  92. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
  93. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
  94. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
  95. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
  96. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
  97. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
  98. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
  99. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
  100. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
  101. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
  102. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
  103. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
  104. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
  105. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
  106. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
  107. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
  108. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
  109. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
  110. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
  111. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
  112. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
  113. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
  114. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
  115. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
  116. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
  117. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
  118. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
  119. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
  120. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
  121. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
  122. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
  123. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
  124. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
  125. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
  126. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
  127. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
  128. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
  129. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
  130. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
  131. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
  132. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +0 -10
  133. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +0 -9
  134. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +0 -10
  135. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +0 -10
  136. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +0 -8
  137. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +0 -5
  138. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +0 -5
  139. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +0 -5
  140. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +0 -5
  141. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +0 -5
  142. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +0 -5
  143. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +0 -5
  144. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +0 -5
  145. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +0 -5
  146. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +0 -5
  147. data/vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu +0 -47
  148. data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +0 -314
  149. data/vendor/tmp/llama.cpp/ggml-cuda/upscale.cu +0 -51
  150. data/vendor/tmp/llama.cpp/ggml-cuda.cu +0 -3069
  151. data/vendor/tmp/llama.cpp/ggml-cuda.h +0 -44
  152. data/vendor/tmp/llama.cpp/ggml-impl.h +0 -651
  153. data/vendor/tmp/llama.cpp/ggml-kompute.cpp +0 -2038
  154. data/vendor/tmp/llama.cpp/ggml-kompute.h +0 -46
  155. data/vendor/tmp/llama.cpp/ggml-metal.h +0 -66
  156. data/vendor/tmp/llama.cpp/ggml-metal.m +0 -3273
  157. data/vendor/tmp/llama.cpp/ggml-metal.metal +0 -6540
  158. data/vendor/tmp/llama.cpp/ggml-quants.c +0 -14994
  159. data/vendor/tmp/llama.cpp/ggml-quants.h +0 -133
  160. data/vendor/tmp/llama.cpp/ggml-rpc.cpp +0 -1178
  161. data/vendor/tmp/llama.cpp/ggml-rpc.h +0 -24
  162. data/vendor/tmp/llama.cpp/ggml-sycl.cpp +0 -6351
  163. data/vendor/tmp/llama.cpp/ggml-sycl.h +0 -40
  164. data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +0 -144508
  165. data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +0 -7183
  166. data/vendor/tmp/llama.cpp/ggml-vulkan.h +0 -29
  167. data/vendor/tmp/llama.cpp/ggml.c +0 -22506
  168. data/vendor/tmp/llama.cpp/ggml.h +0 -2458
  169. data/vendor/tmp/llama.cpp/llama.cpp +0 -18985
  170. data/vendor/tmp/llama.cpp/llama.h +0 -1147
  171. data/vendor/tmp/llama.cpp/scripts/get-flags.mk +0 -38
  172. data/vendor/tmp/llama.cpp/sgemm.cpp +0 -1032
  173. data/vendor/tmp/llama.cpp/sgemm.h +0 -14
  174. data/vendor/tmp/llama.cpp/unicode-data.cpp +0 -7033
  175. data/vendor/tmp/llama.cpp/unicode-data.h +0 -20
  176. data/vendor/tmp/llama.cpp/unicode.cpp +0 -810
  177. data/vendor/tmp/llama.cpp/unicode.h +0 -63
data/vendor/tmp/llama.cpp/ggml-cuda/norm.cu
@@ -1,221 +0,0 @@
- #include "norm.cuh"
-
- template <int block_size>
- static __global__ void norm_f32(const float * x, float * dst, const int ncols, const float eps) {
-     const int row = blockIdx.x*blockDim.y + threadIdx.y;
-     const int tid = threadIdx.x;
-
-     float2 mean_var = make_float2(0.f, 0.f);
-
-     for (int col = tid; col < ncols; col += block_size) {
-         const float xi = x[row*ncols + col];
-         mean_var.x += xi;
-         mean_var.y += xi * xi;
-     }
-
-     // sum up partial sums
-     mean_var = warp_reduce_sum(mean_var);
-     if (block_size > WARP_SIZE) {
-         __shared__ float2 s_sum[32];
-         int warp_id = threadIdx.x / WARP_SIZE;
-         int lane_id = threadIdx.x % WARP_SIZE;
-         if (lane_id == 0) {
-             s_sum[warp_id] = mean_var;
-         }
-         __syncthreads();
-         mean_var = s_sum[lane_id];
-         mean_var = warp_reduce_sum(mean_var);
-     }
-
-     const float mean = mean_var.x / ncols;
-     const float var = mean_var.y / ncols - mean * mean;
-     const float inv_std = rsqrtf(var + eps);
-
-     for (int col = tid; col < ncols; col += block_size) {
-         dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_std;
-     }
- }
-
- template <int block_size>
- static __global__ void group_norm_f32(const float * x, float * dst, const int group_size, const int ne_elements, const float eps) {
-     // blockIdx.x: num_groups idx
-     // threadIdx.x: block_size idx
-     int start = blockIdx.x * group_size;
-     int end = start + group_size;
-
-     start += threadIdx.x;
-
-     if (end >= ne_elements) {
-         end = ne_elements;
-     }
-
-     float tmp = 0.0f; // partial sum for thread in warp
-
-     for (int j = start; j < end; j += block_size) {
-         tmp += x[j];
-     }
-
-     tmp = warp_reduce_sum(tmp);
-     if (block_size > WARP_SIZE) {
-         __shared__ float s_sum[32];
-         int warp_id = threadIdx.x / WARP_SIZE;
-         int lane_id = threadIdx.x % WARP_SIZE;
-         if (lane_id == 0) {
-             s_sum[warp_id] = tmp;
-         }
-         __syncthreads();
-         tmp = s_sum[lane_id];
-         tmp = warp_reduce_sum(tmp);
-     }
-
-     float mean = tmp / group_size;
-     tmp = 0.0f;
-
-     for (int j = start; j < end; j += block_size) {
-         float xi = x[j] - mean;
-         dst[j] = xi;
-         tmp += xi * xi;
-     }
-
-     tmp = warp_reduce_sum(tmp);
-     if (block_size > WARP_SIZE) {
-         __shared__ float s_sum[32];
-         int warp_id = threadIdx.x / WARP_SIZE;
-         int lane_id = threadIdx.x % WARP_SIZE;
-         if (lane_id == 0) {
-             s_sum[warp_id] = tmp;
-         }
-         __syncthreads();
-         tmp = s_sum[lane_id];
-         tmp = warp_reduce_sum(tmp);
-     }
-
-     float variance = tmp / group_size;
-     float scale = rsqrtf(variance + eps);
-     for (int j = start; j < end; j += block_size) {
-         dst[j] *= scale;
-     }
- }
-
- template <int block_size>
- static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
-     const int row = blockIdx.x*blockDim.y + threadIdx.y;
-     const int tid = threadIdx.x;
-
-     float tmp = 0.0f; // partial sum for thread in warp
-
-     for (int col = tid; col < ncols; col += block_size) {
-         const float xi = x[row*ncols + col];
-         tmp += xi * xi;
-     }
-
-     // sum up partial sums
-     tmp = warp_reduce_sum(tmp);
-     if (block_size > WARP_SIZE) {
-         __shared__ float s_sum[32];
-         int warp_id = threadIdx.x / WARP_SIZE;
-         int lane_id = threadIdx.x % WARP_SIZE;
-         if (lane_id == 0) {
-             s_sum[warp_id] = tmp;
-         }
-         __syncthreads();
-         tmp = s_sum[lane_id];
-         tmp = warp_reduce_sum(tmp);
-     }
-
-     const float mean = tmp / ncols;
-     const float scale = rsqrtf(mean + eps);
-
-     for (int col = tid; col < ncols; col += block_size) {
-         dst[row*ncols + col] = scale * x[row*ncols + col];
-     }
- }
-
- static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
-     GGML_ASSERT(ncols % WARP_SIZE == 0);
-     if (ncols < 1024) {
-         const dim3 block_dims(WARP_SIZE, 1, 1);
-         norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
-     } else {
-         const dim3 block_dims(1024, 1, 1);
-         norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
-     }
- }
-
- static void group_norm_f32_cuda(const float * x, float * dst, const int num_groups, const int group_size, const int ne_elements, cudaStream_t stream) {
-     static const float eps = 1e-6f;
-     if (group_size < 1024) {
-         const dim3 block_dims(WARP_SIZE, 1, 1);
-         group_norm_f32<WARP_SIZE><<<num_groups, block_dims, 0, stream>>>(x, dst, group_size, ne_elements, eps);
-     } else {
-         const dim3 block_dims(1024, 1, 1);
-         group_norm_f32<1024><<<num_groups, block_dims, 0, stream>>>(x, dst, group_size, ne_elements, eps);
-     }
- }
-
- static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
-     GGML_ASSERT(ncols % WARP_SIZE == 0);
-     if (ncols < 1024) {
-         const dim3 block_dims(WARP_SIZE, 1, 1);
-         rms_norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
-     } else {
-         const dim3 block_dims(1024, 1, 1);
-         rms_norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
-     }
- }
-
- void ggml_cuda_op_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-     const ggml_tensor * src0 = dst->src[0];
-     const float * src0_d = (const float *)src0->data;
-     float * dst_d = (float *)dst->data;
-     cudaStream_t stream = ctx.stream();
-
-     GGML_ASSERT(ggml_is_contiguous(src0));
-
-     GGML_ASSERT(src0->type == GGML_TYPE_F32);
-     GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-     const int64_t ne00 = src0->ne[0];
-     const int64_t nrows = ggml_nrows(src0);
-
-     float eps;
-     memcpy(&eps, dst->op_params, sizeof(float));
-
-     norm_f32_cuda(src0_d, dst_d, ne00, nrows, eps, stream);
- }
-
- void ggml_cuda_op_group_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-     const ggml_tensor * src0 = dst->src[0];
-     const float * src0_d = (const float *)src0->data;
-     float * dst_d = (float *)dst->data;
-     cudaStream_t stream = ctx.stream();
-
-     GGML_ASSERT(ggml_is_contiguous(src0));
-
-     GGML_ASSERT(src0->type == GGML_TYPE_F32);
-     GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-     int num_groups = dst->op_params[0];
-     int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
-     group_norm_f32_cuda(src0_d, dst_d, num_groups * src0->ne[3], group_size, ggml_nelements(src0), stream);
- }
-
- void ggml_cuda_op_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-     const ggml_tensor * src0 = dst->src[0];
-     const float * src0_d = (const float *)src0->data;
-     float * dst_d = (float *)dst->data;
-     cudaStream_t stream = ctx.stream();
-
-     GGML_ASSERT(ggml_is_contiguous(src0));
-
-     GGML_ASSERT(src0->type == GGML_TYPE_F32);
-     GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-     const int64_t ne00 = src0->ne[0];
-     const int64_t nrows = ggml_nrows(src0);
-
-     float eps;
-     memcpy(&eps, dst->op_params, sizeof(float));
-
-     rms_norm_f32_cuda(src0_d, dst_d, ne00, nrows, eps, stream);
- }
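
Note: 0.17.0 stops bundling the llama.cpp sources, so kernels like the ones above are presumably expected to come from an externally built llama.cpp/ggml rather than from the gem itself. For readers skimming the diff, here is a minimal single-threaded C++ sketch of the per-row math that the removed norm_f32 and rms_norm_f32 kernels implement; the _ref names are illustrative and not part of the gem or of llama.cpp.

    // Illustrative reference only: per-row layer norm and RMS norm, matching the
    // arithmetic of the removed norm_f32 / rms_norm_f32 kernels (without the
    // warp/block reductions, which only parallelize the same sums).
    #include <cmath>

    static void norm_f32_ref(const float * x, float * dst, int ncols, float eps) {
        float sum = 0.0f, sumsq = 0.0f;
        for (int c = 0; c < ncols; ++c) { sum += x[c]; sumsq += x[c] * x[c]; }
        const float mean    = sum / ncols;
        const float var     = sumsq / ncols - mean * mean;
        const float inv_std = 1.0f / std::sqrt(var + eps);
        for (int c = 0; c < ncols; ++c) dst[c] = (x[c] - mean) * inv_std;
    }

    static void rms_norm_f32_ref(const float * x, float * dst, int ncols, float eps) {
        float sumsq = 0.0f;
        for (int c = 0; c < ncols; ++c) sumsq += x[c] * x[c];
        const float scale = 1.0f / std::sqrt(sumsq / ncols + eps);
        for (int c = 0; c < ncols; ++c) dst[c] = scale * x[c];
    }
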
data/vendor/tmp/llama.cpp/ggml-cuda/pad.cu
@@ -1,49 +0,0 @@
- #include "pad.cuh"
-
- static __global__ void pad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02, const int ne03) {
-     // blockIdx.z: idx of ne2*ne3, aka ne02*ne03
-     // blockIdx.y: idx of ne1
-     // blockIDx.x: idx of ne0 / BLOCK_SIZE
-     int nidx = threadIdx.x + blockIdx.x * blockDim.x;
-     if (nidx >= ne0) {
-         return;
-     }
-
-     // operation
-     int offset_dst =
-         nidx +
-         blockIdx.y * ne0 +
-         blockIdx.z * ne0 * gridDim.y;
-     if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02*ne03) {
-         int offset_src =
-             nidx +
-             blockIdx.y * ne00 +
-             blockIdx.z * ne00 * ne01;
-         dst[offset_dst] = x[offset_src];
-     } else {
-         dst[offset_dst] = 0.0f;
-     }
- }
-
- static void pad_f32_cuda(const float * x, float * dst,
-         const int ne00, const int ne01, const int ne02, const int ne03,
-         const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) {
-     int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
-     dim3 gridDim(num_blocks, ne1, ne2*ne3);
-     pad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02, ne03);
- }
-
- void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-     const ggml_tensor * src0 = dst->src[0];
-     const float * src0_d = (const float *)src0->data;
-     float * dst_d = (float *)dst->data;
-     cudaStream_t stream = ctx.stream();
-
-     GGML_ASSERT(src0->type == GGML_TYPE_F32);
-     GGML_ASSERT(dst->type == GGML_TYPE_F32);
-     GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
-
-     pad_f32_cuda(src0_d, dst_d,
-         src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
-         dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream);
- }
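
Note: the removed pad_f32 kernel copies the source tensor into a larger destination and zero-fills everything outside the source extent. A host-side sketch of the same indexing, for reference only (the _ref name is illustrative):

    // Illustrative reference only: pad a ne00 x ne01 x ne02 source into a
    // ne0 x ne1 x ne2 destination, zero-filling outside the source extent,
    // using the same row-major offsets as the removed pad_f32 kernel.
    static void pad_f32_ref(const float * x, float * dst,
                            int ne00, int ne01, int ne02,
                            int ne0,  int ne1,  int ne2) {
        for (int i2 = 0; i2 < ne2; ++i2) {
            for (int i1 = 0; i1 < ne1; ++i1) {
                for (int i0 = 0; i0 < ne0; ++i0) {
                    const bool inside = i0 < ne00 && i1 < ne01 && i2 < ne02;
                    dst[(i2*ne1 + i1)*ne0 + i0] =
                        inside ? x[(i2*ne01 + i1)*ne00 + i0] : 0.0f;
                }
            }
        }
    }
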
data/vendor/tmp/llama.cpp/ggml-cuda/pool2d.cu
@@ -1,94 +0,0 @@
- #include "pool2d.cuh"
-
- template <typename Ti, typename To>
- static __global__ void pool2d_nchw_kernel(
-         const int ih, const int iw, const int oh, const int ow,
-         const int kh, const int kw, const int sh, const int sw,
-         const int ph, const int pw, const int parallel_elements,
-         const Ti* src, To* dst, const enum ggml_op_pool op) {
-     int idx = threadIdx.x + blockIdx.x * blockDim.x;
-     if (idx >= parallel_elements) {
-         return;
-     }
-
-     const int I_HW = ih * iw;
-     const int O_HW = oh * ow;
-     const int nc = idx / O_HW;
-     const int cur_oh = idx % O_HW / ow;
-     const int cur_ow = idx % O_HW % ow;
-     const Ti* i_ptr = src + nc * I_HW;
-     To* o_ptr = dst + nc * O_HW;
-     const int start_h = cur_oh * sh - ph;
-     const int bh = max(0, start_h);
-     const int eh = min(ih, start_h + kh);
-     const int start_w = cur_ow * sw - pw;
-     const int bw = max(0, start_w);
-     const int ew = min(iw, start_w + kw);
-     const To scale = 1. / (kh * kw);
-     To res = 0;
-
-     switch (op) {
-         case GGML_OP_POOL_AVG: res = 0; break;
-         case GGML_OP_POOL_MAX: res = -FLT_MAX; break;
-         default: assert(false);
-     }
-
-     for (int i = bh; i < eh; i += 1) {
-         for (int j = bw; j < ew; j += 1) {
- #if __CUDA_ARCH__ >= 350
-             Ti cur = __ldg(i_ptr + i * iw + j);
- #else
-             Ti cur = i_ptr[i * iw + j];
- #endif
-             switch (op) {
-                 case GGML_OP_POOL_AVG: res += cur * scale; break;
-                 case GGML_OP_POOL_MAX: res = max(res, (To)cur); break;
-                 default: assert(false);
-             }
-         }
-     }
-     o_ptr[cur_oh * ow + cur_ow] = res;
- }
-
- static void pool2d_nchw_kernel_f32_f32_cuda(
-         const int ih, const int iw, const int oh, const int ow,
-         const int kh, const int kw, const int sh, const int sw,
-         const int ph, const int pw, const int parallel_elements,
-         const float * src, float * dst, const enum ggml_op_pool op,
-         cudaStream_t stream) {
-
-     const int num_blocks = (parallel_elements + CUDA_POOL2D_BLOCK_SIZE - 1) / CUDA_POOL2D_BLOCK_SIZE;
-     dim3 block_nums(num_blocks);
-     pool2d_nchw_kernel<<<block_nums, CUDA_POOL2D_BLOCK_SIZE, 0, stream>>>(ih, iw, oh, ow, kh, kw, sh, sw, ph, pw, parallel_elements, src, dst, op);
- }
-
- void ggml_cuda_op_pool2d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-     const ggml_tensor * src0 = dst->src[0];
-     const float * src0_d = (const float *)src0->data;
-     float * dst_d = (float *)dst->data;
-     cudaStream_t stream = ctx.stream();
-
-     GGML_ASSERT(src0->type == GGML_TYPE_F32);
-     GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-     const int32_t * opts = (const int32_t *)dst->op_params;
-     enum ggml_op_pool op = static_cast<ggml_op_pool>(opts[0]);
-     const int k0 = opts[1];
-     const int k1 = opts[2];
-     const int s0 = opts[3];
-     const int s1 = opts[4];
-     const int p0 = opts[5];
-     const int p1 = opts[6];
-
-     const int64_t IH = src0->ne[1];
-     const int64_t IW = src0->ne[0];
-
-     const int64_t N = dst->ne[3];
-     const int64_t OC = dst->ne[2];
-     const int64_t OH = dst->ne[1];
-     const int64_t OW = dst->ne[0];
-
-     const int parallel_elements = N * OC * OH * OW;
-
-     pool2d_nchw_kernel_f32_f32_cuda(IH, IW, OH, OW, k1, k0, s1, s0, p1, p0, parallel_elements, src0_d, dst_d, op, stream);
- }
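
Note: the removed pool2d kernel runs average or max pooling over NCHW data, one output element per thread. A single-plane host sketch of the same arithmetic, for reference only (names are illustrative); as in the kernel, the average divides by kh*kw even when part of the window falls in the padding.

    // Illustrative reference only: avg/max pooling over one H x W plane with the
    // same window clamping and scaling as the removed pool2d_nchw_kernel.
    #include <algorithm>
    #include <cfloat>

    enum class pool_op_ref { avg, max };

    static void pool2d_plane_ref(const float * src, float * dst,
                                 int ih, int iw, int oh, int ow,
                                 int kh, int kw, int sh, int sw,
                                 int ph, int pw, pool_op_ref op) {
        for (int oy = 0; oy < oh; ++oy) {
            for (int ox = 0; ox < ow; ++ox) {
                float res = (op == pool_op_ref::max) ? -FLT_MAX : 0.0f;
                const int h0 = std::max(0, oy*sh - ph), h1 = std::min(ih, oy*sh - ph + kh);
                const int w0 = std::max(0, ox*sw - pw), w1 = std::min(iw, ox*sw - pw + kw);
                for (int y = h0; y < h1; ++y) {
                    for (int x = w0; x < w1; ++x) {
                        const float v = src[y*iw + x];
                        res = (op == pool_op_ref::max) ? std::max(res, v)
                                                       : res + v / (kh * kw);
                    }
                }
                dst[oy*ow + ox] = res;
            }
        }
    }
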
data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu
@@ -1,112 +0,0 @@
- #include "quantize.cuh"
- #include <cstdint>
-
- static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int64_t kx, const int64_t kx0_padded) {
-     const int64_t ix0 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
-
-     if (ix0 >= kx0_padded) {
-         return;
-     }
-
-     const int64_t ix1 = blockIdx.y;
-
-     const int64_t i_padded = ix1*kx0_padded + ix0;
-
-     block_q8_1 * y = (block_q8_1 *) vy;
-
-     const int64_t ib = i_padded / QK8_1; // block index
-     const int64_t iqs = i_padded % QK8_1; // quant index
-
-     const float xi = ix0 < kx ? x[ix1*kx + ix0] : 0.0f;
-     float amax = fabsf(xi);
-     float sum = xi;
-
-     amax = warp_reduce_max(amax);
-     sum = warp_reduce_sum(sum);
-
-     const float d = amax / 127;
-     const int8_t q = amax == 0.0f ? 0 : roundf(xi / d);
-
-     y[ib].qs[iqs] = q;
-
-     if (iqs > 0) {
-         return;
-     }
-
-     reinterpret_cast<half&>(y[ib].ds.x) = d;
-     reinterpret_cast<half&>(y[ib].ds.y) = sum;
- }
-
- template <bool need_sum>
- static __global__ void quantize_mmq_q8_1(
-     const float * __restrict__ x, void * __restrict__ vy, const int64_t kx0, const int64_t kx1, const int64_t kx0_padded) {
-
-     const int64_t ix0 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
-
-     if (ix0 >= kx0_padded) {
-         return;
-     }
-
-     const int64_t ix1 = kx1*blockIdx.z + blockIdx.y;
-
-     block_q8_1_mmq * y = (block_q8_1_mmq *) vy;
-
-     const int64_t ib0 = blockIdx.z*(gridDim.y*gridDim.x*blockDim.x/(4*QK8_1)); // first block of channel
-     const int64_t ib = ib0 + (ix0 / (4*QK8_1))*kx1 + blockIdx.y; // block index in channel
-     const int64_t iqs = ix0 % (4*QK8_1); // quant index in block
-
-     const float xi = ix0 < kx0 ? x[ix1*kx0 + ix0] : 0.0f;
-     float amax = fabsf(xi);
-
-     amax = warp_reduce_max(amax);
-
-     float sum;
-     if (need_sum) {
-         sum = warp_reduce_sum(xi);
-     }
-
-     const float d = amax / 127;
-     const int8_t q = amax == 0.0f ? 0 : roundf(xi / d);
-
-     y[ib].qs[iqs] = q;
-
-     if (iqs % QK8_1 != 0) {
-         return;
-     }
-
-     if (need_sum) {
-         y[ib].ds[iqs/QK8_1] = make_half2(d, sum);
-     } else {
-         ((float *) y[ib].ds)[iqs/QK8_1] = d;
-     }
- }
-
- void quantize_row_q8_1_cuda(
-     const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels,
-     const int64_t kx0_padded, const ggml_type type_x, cudaStream_t stream) {
-
-     GGML_ASSERT(kx0_padded % QK8_1 == 0);
-
-     const int64_t block_num_x = (kx0_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
-     const dim3 num_blocks(block_num_x, kx1*channels, 1);
-     const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1);
-     quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx0_padded);
-
-     GGML_UNUSED(type_x);
- }
-
- void quantize_mmq_q8_1_cuda(
-     const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels,
-     const int64_t kx0_padded, const ggml_type type_x, cudaStream_t stream) {
-
-     GGML_ASSERT(kx0_padded % (4*QK8_1) == 0);
-
-     const int64_t block_num_x = (kx0_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
-     const dim3 num_blocks(block_num_x, kx1, channels);
-     const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1);
-     if (mmq_need_sum(type_x)) {
-         quantize_mmq_q8_1<true><<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx1, kx0_padded);
-     } else {
-         quantize_mmq_q8_1<false><<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx1, kx0_padded);
-     }
- }
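
Note: the removed quantize kernels pack f32 activations into Q8_1 blocks (8-bit values plus a per-block scale d = amax/127 and the block sum). A host-side sketch of the per-block math, for reference only; the block layout here is simplified (the real block_q8_1 stores d and the sum as half precision), the _ref names are illustrative, and the block size is assumed to match ggml's QK8_1 of 32.

    // Illustrative reference only: quantize one block of floats to 8-bit values
    // with the same scale and rounding as the removed quantize_q8_1 kernel.
    #include <cmath>
    #include <cstdint>

    constexpr int QK8_1_REF = 32; // assumed block size, mirroring ggml's QK8_1

    struct block_q8_1_ref {
        float  d;             // per-block scale
        float  s;             // sum of the original floats in the block
        int8_t qs[QK8_1_REF]; // quantized values
    };

    static block_q8_1_ref quantize_block_q8_1_ref(const float * x) {
        float amax = 0.0f, sum = 0.0f;
        for (int i = 0; i < QK8_1_REF; ++i) {
            amax = std::fmax(amax, std::fabs(x[i]));
            sum += x[i];
        }
        block_q8_1_ref out{};
        out.d = amax / 127.0f;
        out.s = sum;
        for (int i = 0; i < QK8_1_REF; ++i) {
            out.qs[i] = amax == 0.0f ? 0 : (int8_t)std::round(x[i] / out.d);
        }
        return out;
    }
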