llama_cpp 0.16.2 → 0.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (177) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +18 -0
  3. data/README.md +7 -12
  4. data/ext/llama_cpp/extconf.rb +2 -43
  5. data/ext/llama_cpp/llama_cpp.cpp +8 -0
  6. data/lib/llama_cpp/version.rb +3 -3
  7. data/sig/llama_cpp.rbs +3 -0
  8. metadata +2 -171
  9. data/vendor/include/.gitkeep +0 -0
  10. data/vendor/lib/.gitkeep +0 -0
  11. data/vendor/tmp/llama.cpp/LICENSE +0 -21
  12. data/vendor/tmp/llama.cpp/Makefile +0 -1124
  13. data/vendor/tmp/llama.cpp/ggml-alloc.c +0 -1041
  14. data/vendor/tmp/llama.cpp/ggml-alloc.h +0 -76
  15. data/vendor/tmp/llama.cpp/ggml-backend-impl.h +0 -153
  16. data/vendor/tmp/llama.cpp/ggml-backend.c +0 -2225
  17. data/vendor/tmp/llama.cpp/ggml-backend.h +0 -236
  18. data/vendor/tmp/llama.cpp/ggml-blas.cpp +0 -363
  19. data/vendor/tmp/llama.cpp/ggml-blas.h +0 -23
  20. data/vendor/tmp/llama.cpp/ggml-common.h +0 -1805
  21. data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu +0 -47
  22. data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu +0 -34
  23. data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +0 -104
  24. data/vendor/tmp/llama.cpp/ggml-cuda/binbcast.cu +0 -280
  25. data/vendor/tmp/llama.cpp/ggml-cuda/clamp.cu +0 -34
  26. data/vendor/tmp/llama.cpp/ggml-cuda/concat.cu +0 -196
  27. data/vendor/tmp/llama.cpp/ggml-cuda/convert.cu +0 -686
  28. data/vendor/tmp/llama.cpp/ggml-cuda/cpy.cu +0 -490
  29. data/vendor/tmp/llama.cpp/ggml-cuda/diagmask.cu +0 -40
  30. data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +0 -674
  31. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +0 -319
  32. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f32.cu +0 -312
  33. data/vendor/tmp/llama.cpp/ggml-cuda/fattn.cu +0 -345
  34. data/vendor/tmp/llama.cpp/ggml-cuda/getrows.cu +0 -178
  35. data/vendor/tmp/llama.cpp/ggml-cuda/im2col.cu +0 -104
  36. data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +0 -88
  37. data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +0 -419
  38. data/vendor/tmp/llama.cpp/ggml-cuda/norm.cu +0 -221
  39. data/vendor/tmp/llama.cpp/ggml-cuda/pad.cu +0 -49
  40. data/vendor/tmp/llama.cpp/ggml-cuda/pool2d.cu +0 -94
  41. data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +0 -112
  42. data/vendor/tmp/llama.cpp/ggml-cuda/rope.cu +0 -271
  43. data/vendor/tmp/llama.cpp/ggml-cuda/scale.cu +0 -31
  44. data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +0 -206
  45. data/vendor/tmp/llama.cpp/ggml-cuda/sumrows.cu +0 -40
  46. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
  47. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
  48. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
  49. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
  50. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
  51. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
  52. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
  53. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
  54. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
  55. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
  56. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
  57. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
  58. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
  59. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
  60. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
  61. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
  62. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
  63. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
  64. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
  65. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
  66. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
  67. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
  68. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
  69. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
  70. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
  71. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
  72. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
  73. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
  74. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
  75. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
  76. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
  77. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
  78. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
  79. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
  80. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
  81. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
  82. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
  83. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
  84. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
  85. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
  86. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
  87. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
  88. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
  89. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
  90. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
  91. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
  92. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
  93. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
  94. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
  95. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
  96. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
  97. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
  98. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
  99. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
  100. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
  101. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
  102. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
  103. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
  104. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
  105. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
  106. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
  107. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
  108. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
  109. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
  110. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
  111. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
  112. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
  113. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
  114. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
  115. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
  116. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
  117. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
  118. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
  119. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
  120. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
  121. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
  122. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
  123. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
  124. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
  125. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
  126. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
  127. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
  128. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
  129. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
  130. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
  131. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
  132. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +0 -10
  133. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +0 -9
  134. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +0 -10
  135. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +0 -10
  136. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +0 -8
  137. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +0 -5
  138. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +0 -5
  139. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +0 -5
  140. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +0 -5
  141. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +0 -5
  142. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +0 -5
  143. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +0 -5
  144. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +0 -5
  145. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +0 -5
  146. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +0 -5
  147. data/vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu +0 -47
  148. data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +0 -314
  149. data/vendor/tmp/llama.cpp/ggml-cuda/upscale.cu +0 -51
  150. data/vendor/tmp/llama.cpp/ggml-cuda.cu +0 -3069
  151. data/vendor/tmp/llama.cpp/ggml-cuda.h +0 -44
  152. data/vendor/tmp/llama.cpp/ggml-impl.h +0 -651
  153. data/vendor/tmp/llama.cpp/ggml-kompute.cpp +0 -2038
  154. data/vendor/tmp/llama.cpp/ggml-kompute.h +0 -46
  155. data/vendor/tmp/llama.cpp/ggml-metal.h +0 -66
  156. data/vendor/tmp/llama.cpp/ggml-metal.m +0 -3273
  157. data/vendor/tmp/llama.cpp/ggml-metal.metal +0 -6540
  158. data/vendor/tmp/llama.cpp/ggml-quants.c +0 -14994
  159. data/vendor/tmp/llama.cpp/ggml-quants.h +0 -133
  160. data/vendor/tmp/llama.cpp/ggml-rpc.cpp +0 -1178
  161. data/vendor/tmp/llama.cpp/ggml-rpc.h +0 -24
  162. data/vendor/tmp/llama.cpp/ggml-sycl.cpp +0 -6351
  163. data/vendor/tmp/llama.cpp/ggml-sycl.h +0 -40
  164. data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +0 -144508
  165. data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +0 -7183
  166. data/vendor/tmp/llama.cpp/ggml-vulkan.h +0 -29
  167. data/vendor/tmp/llama.cpp/ggml.c +0 -22506
  168. data/vendor/tmp/llama.cpp/ggml.h +0 -2458
  169. data/vendor/tmp/llama.cpp/llama.cpp +0 -18985
  170. data/vendor/tmp/llama.cpp/llama.h +0 -1147
  171. data/vendor/tmp/llama.cpp/scripts/get-flags.mk +0 -38
  172. data/vendor/tmp/llama.cpp/sgemm.cpp +0 -1032
  173. data/vendor/tmp/llama.cpp/sgemm.h +0 -14
  174. data/vendor/tmp/llama.cpp/unicode-data.cpp +0 -7033
  175. data/vendor/tmp/llama.cpp/unicode-data.h +0 -20
  176. data/vendor/tmp/llama.cpp/unicode.cpp +0 -810
  177. data/vendor/tmp/llama.cpp/unicode.h +0 -63
@@ -1,271 +0,0 @@
1
- #include "rope.cuh"
2
-
3
- struct rope_corr_dims {
4
- float v[2];
5
- };
6
-
7
- static __device__ float rope_yarn_ramp(const float low, const float high, const int i0) {
8
- const float y = (i0 / 2 - low) / max(0.001f, high - low);
9
- return 1.0f - min(1.0f, max(0.0f, y));
10
- }
11
-
12
- // YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
13
- // MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
14
- static __device__ void rope_yarn(
15
- float theta_extrap, float freq_scale, rope_corr_dims corr_dims, int64_t i0, float ext_factor, float mscale,
16
- float * cos_theta, float * sin_theta) {
17
- // Get n-d rotational scaling corrected for extrapolation
18
- float theta_interp = freq_scale * theta_extrap;
19
- float theta = theta_interp;
20
- if (ext_factor != 0.0f) {
21
- float ramp_mix = rope_yarn_ramp(corr_dims.v[0], corr_dims.v[1], i0) * ext_factor;
22
- theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
23
-
24
- // Get n-d magnitude scaling corrected for interpolation
25
- mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale);
26
- }
27
- *cos_theta = cosf(theta) * mscale;
28
- *sin_theta = sinf(theta) * mscale;
29
- }
30
-
31
- template<typename T, bool has_ff>
32
- static __global__ void rope_norm(
33
- const T * x, T * dst, int ne0, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
34
- float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors) {
35
- const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y);
36
-
37
- if (i0 >= ne0) {
38
- return;
39
- }
40
-
41
- const int row = blockDim.x*blockIdx.x + threadIdx.x;
42
-
43
- if (i0 >= n_dims) {
44
- const int i = row*ne0 + i0;
45
-
46
- dst[i + 0] = x[i + 0];
47
- dst[i + 1] = x[i + 1];
48
-
49
- return;
50
- }
51
-
52
- const int i = row*ne0 + i0;
53
- const int i2 = row/p_delta_rows;
54
-
55
- const float theta_base = pos[i2]*powf(theta_scale, i0/2.0f);
56
-
57
- const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
58
-
59
- float cos_theta;
60
- float sin_theta;
61
-
62
- rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);
63
-
64
- const float x0 = x[i + 0];
65
- const float x1 = x[i + 1];
66
-
67
- dst[i + 0] = x0*cos_theta - x1*sin_theta;
68
- dst[i + 1] = x0*sin_theta + x1*cos_theta;
69
- }
70
-
71
- template<typename T, bool has_ff>
72
- static __global__ void rope_neox(
73
- const T * x, T * dst, int ne0, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
74
- float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors) {
75
- const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y);
76
-
77
- if (i0 >= ne0) {
78
- return;
79
- }
80
-
81
- const int row = blockDim.x*blockIdx.x + threadIdx.x;
82
-
83
- if (i0 >= n_dims) {
84
- const int i = row*ne0 + i0;
85
-
86
- dst[i + 0] = x[i + 0];
87
- dst[i + 1] = x[i + 1];
88
-
89
- return;
90
- }
91
-
92
- const int i = row*ne0 + i0/2;
93
- const int i2 = row/p_delta_rows;
94
-
95
- const float theta_base = pos[i2]*powf(theta_scale, i0/2.0f);
96
-
97
- const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
98
-
99
- float cos_theta;
100
- float sin_theta;
101
-
102
- rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);
103
-
104
- const float x0 = x[i + 0];
105
- const float x1 = x[i + n_dims/2];
106
-
107
- dst[i + 0] = x0*cos_theta - x1*sin_theta;
108
- dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
109
- }
110
-
111
- template<typename T>
112
- static void rope_norm_cuda(
113
- const T * x, T * dst, int ne0, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
114
- float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream) {
115
- GGML_ASSERT(ne0 % 2 == 0);
116
- const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
117
- const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
118
- const dim3 block_nums(nr, n_blocks_x, 1);
119
-
120
- const float theta_scale = powf(freq_base, -2.0f/n_dims);
121
-
122
- if (freq_factors == nullptr) {
123
- rope_norm<T, false><<<block_nums, block_dims, 0, stream>>>(
124
- x, dst, ne0, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
125
- theta_scale, freq_factors
126
- );
127
- } else {
128
- rope_norm<T, true><<<block_nums, block_dims, 0, stream>>>(
129
- x, dst, ne0, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
130
- theta_scale, freq_factors
131
- );
132
- }
133
- }
134
-
135
- template<typename T>
136
- static void rope_neox_cuda(
137
- const T * x, T * dst, int ne0, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
138
- float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream) {
139
- GGML_ASSERT(ne0 % 2 == 0);
140
- const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
141
- const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
142
- const dim3 block_nums(nr, n_blocks_x, 1);
143
-
144
- const float theta_scale = powf(freq_base, -2.0f/n_dims);
145
-
146
- if (freq_factors == nullptr) {
147
- rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(
148
- x, dst, ne0, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
149
- theta_scale, freq_factors
150
- );
151
- } else {
152
- rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(
153
- x, dst, ne0, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
154
- theta_scale, freq_factors
155
- );
156
- }
157
- }
158
-
159
- static void rope_norm_cuda_f16(
160
- const half * x, half * dst, int ne0, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
161
- float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream) {
162
-
163
- rope_norm_cuda<half>(x, dst, ne0, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream);
164
- }
165
-
166
- static void rope_norm_cuda_f32(
167
- const float * x, float * dst, int ne0, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
168
- float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream) {
169
-
170
- rope_norm_cuda<float>(x, dst, ne0, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream);
171
- }
172
-
173
- static void rope_neox_cuda_f16(
174
- const half * x, half * dst, int ne0, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
175
- float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream) {
176
-
177
- rope_neox_cuda<half>(x, dst, ne0, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream);
178
- }
179
-
180
- static void rope_neox_cuda_f32(
181
- const float * x, float * dst, int ne0, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
182
- float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream
183
- ) {
184
-
185
- rope_neox_cuda<float>(x, dst, ne0, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream);
186
- }
187
-
188
- void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
189
- const ggml_tensor * src0 = dst->src[0];
190
- const ggml_tensor * src1 = dst->src[1];
191
- const ggml_tensor * src2 = dst->src[2];
192
-
193
- const float * src0_d = (const float *)src0->data;
194
- const float * src1_d = (const float *)src1->data;
195
-
196
- float * dst_d = (float *)dst->data;
197
- cudaStream_t stream = ctx.stream();
198
-
199
- GGML_ASSERT(ggml_is_contiguous(src0));
200
- GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
201
- GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
202
- GGML_ASSERT(src0->type == dst->type);
203
-
204
- const int64_t ne00 = src0->ne[0];
205
- const int64_t ne01 = src0->ne[1];
206
- const int64_t nr = ggml_nrows(src0);
207
-
208
- //const int n_past = ((int32_t *) dst->op_params)[0];
209
- const int n_dims = ((int32_t *) dst->op_params)[1];
210
- const int mode = ((int32_t *) dst->op_params)[2];
211
- //const int n_ctx = ((int32_t *) dst->op_params)[3];
212
- const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
213
-
214
- // RoPE alteration for extended context
215
- float freq_base;
216
- float freq_scale;
217
- float ext_factor;
218
- float attn_factor;
219
- float beta_fast;
220
- float beta_slow;
221
-
222
- memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
223
- memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
224
- memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
225
- memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
226
- memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
227
- memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
228
-
229
- const bool is_neox = mode & 2;
230
-
231
- const int32_t * pos = (const int32_t *) src1_d;
232
-
233
- const float * freq_factors = nullptr;
234
- if (src2 != nullptr) {
235
- freq_factors = (const float *) src2->data;
236
- }
237
-
238
- rope_corr_dims corr_dims;
239
- ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims.v);
240
-
241
- // compute
242
- if (is_neox) {
243
- if (src0->type == GGML_TYPE_F32) {
244
- rope_neox_cuda_f32(
245
- (const float *)src0_d, (float *)dst_d, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
246
- attn_factor, corr_dims, freq_factors, stream
247
- );
248
- } else if (src0->type == GGML_TYPE_F16) {
249
- rope_neox_cuda_f16(
250
- (const half *)src0_d, (half *)dst_d, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
251
- attn_factor, corr_dims, freq_factors, stream
252
- );
253
- } else {
254
- GGML_ASSERT(false);
255
- }
256
- } else {
257
- if (src0->type == GGML_TYPE_F32) {
258
- rope_norm_cuda_f32(
259
- (const float *)src0_d, (float *)dst_d, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
260
- attn_factor, corr_dims, freq_factors, stream
261
- );
262
- } else if (src0->type == GGML_TYPE_F16) {
263
- rope_norm_cuda_f16(
264
- (const half *)src0_d, (half *)dst_d, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
265
- attn_factor, corr_dims, freq_factors, stream
266
- );
267
- } else {
268
- GGML_ASSERT(false);
269
- }
270
- }
271
- }
@@ -1,31 +0,0 @@
1
- #include "scale.cuh"
2
-
3
- static __global__ void scale_f32(const float * x, float * dst, const float scale, const int k) {
4
- const int i = blockDim.x*blockIdx.x + threadIdx.x;
5
-
6
- if (i >= k) {
7
- return;
8
- }
9
-
10
- dst[i] = scale * x[i];
11
- }
12
-
13
- static void scale_f32_cuda(const float * x, float * dst, const float scale, const int k, cudaStream_t stream) {
14
- const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
15
- scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
16
- }
17
-
18
- void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
19
- const ggml_tensor * src0 = dst->src[0];
20
- const float * src0_d = (const float *)src0->data;
21
- float * dst_d = (float *)dst->data;
22
- cudaStream_t stream = ctx.stream();
23
-
24
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
25
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
26
-
27
- float scale;
28
- memcpy(&scale, dst->op_params, sizeof(float));
29
-
30
- scale_f32_cuda(src0_d, dst_d, scale, ggml_nelements(src0), stream);
31
- }
@@ -1,206 +0,0 @@
1
- #include "common.cuh"
2
- #include "softmax.cuh"
3
-
4
- template <typename T>
5
- static __device__ __forceinline__ float t2f32(T val) {
6
- return (float) val;
7
- }
8
-
9
- template <>
10
- __device__ float __forceinline__ t2f32<half>(half val) {
11
- return __half2float(val);
12
- }
13
-
14
- template <bool vals_smem, int ncols_template, int block_size_template, typename T>
15
- static __global__ void soft_max_f32(const float * x, const T * mask, float * dst, const int ncols_par, const int nrows_y, const float scale, const float max_bias, const float m0, const float m1, uint32_t n_head_log2) {
16
- const int ncols = ncols_template == 0 ? ncols_par : ncols_template;
17
-
18
- const int tid = threadIdx.x;
19
- const int rowx = blockIdx.x;
20
- const int rowy = rowx % nrows_y; // broadcast the mask in the row dimension
21
-
22
- const int block_size = block_size_template == 0 ? blockDim.x : block_size_template;
23
-
24
- const int warp_id = threadIdx.x / WARP_SIZE;
25
- const int lane_id = threadIdx.x % WARP_SIZE;
26
-
27
- const float slope = get_alibi_slope(max_bias, rowx/nrows_y, n_head_log2, m0, m1);
28
-
29
- extern __shared__ float data_soft_max_f32[];
30
- float * buf_iw = data_soft_max_f32; // shared memory buffer for inter-warp communication
31
- // shared memory buffer to cache values between iterations:
32
- float * vals = vals_smem ? buf_iw + WARP_SIZE : dst + (int64_t)rowx*ncols;
33
-
34
- float max_val = -INFINITY;
35
-
36
- #pragma unroll
37
- for (int col0 = 0; col0 < ncols; col0 += block_size) {
38
- const int col = col0 + tid;
39
-
40
- if (ncols_template == 0 && col >= ncols) {
41
- break;
42
- }
43
-
44
- const int64_t ix = (int64_t)rowx*ncols + col;
45
- const int64_t iy = (int64_t)rowy*ncols + col;
46
-
47
- const float val = x[ix]*scale + (mask ? slope*t2f32(mask[iy]) : 0.0f);
48
-
49
- vals[col] = val;
50
- max_val = max(max_val, val);
51
- }
52
-
53
- // find the max value in the block
54
- max_val = warp_reduce_max(max_val);
55
- if (block_size > WARP_SIZE) {
56
- if (warp_id == 0) {
57
- buf_iw[lane_id] = -INFINITY;
58
- }
59
- __syncthreads();
60
-
61
- if (lane_id == 0) {
62
- buf_iw[warp_id] = max_val;
63
- }
64
- __syncthreads();
65
-
66
- max_val = buf_iw[lane_id];
67
- max_val = warp_reduce_max(max_val);
68
- }
69
-
70
- float tmp = 0.0f; // partial sum
71
-
72
- #pragma unroll
73
- for (int col0 = 0; col0 < ncols; col0 += block_size) {
74
- const int col = col0 + tid;
75
-
76
- if (ncols_template == 0 && col >= ncols) {
77
- break;
78
- }
79
-
80
- const float val = expf(vals[col] - max_val);
81
- tmp += val;
82
- vals[col] = val;
83
- }
84
-
85
- // find the sum of exps in the block
86
- tmp = warp_reduce_sum(tmp);
87
- if (block_size > WARP_SIZE) {
88
- __syncthreads();
89
- if (warp_id == 0) {
90
- buf_iw[lane_id] = 0.0f;
91
- }
92
- __syncthreads();
93
-
94
- if (lane_id == 0) {
95
- buf_iw[warp_id] = tmp;
96
- }
97
- __syncthreads();
98
-
99
- tmp = buf_iw[lane_id];
100
- tmp = warp_reduce_sum(tmp);
101
- }
102
-
103
- const float inv_sum = 1.0f / tmp;
104
-
105
- #pragma unroll
106
- for (int col0 = 0; col0 < ncols; col0 += block_size) {
107
- const int col = col0 + tid;
108
-
109
- if (ncols_template == 0 && col >= ncols) {
110
- return;
111
- }
112
-
113
- const int64_t idst = (int64_t)rowx*ncols + col;
114
- dst[idst] = vals[col] * inv_sum;
115
- }
116
- }
117
-
118
- template<typename T>
119
- static void soft_max_f32_cuda(const float * x, const T * mask, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, const float max_bias, cudaStream_t stream) {
120
- int nth = WARP_SIZE;
121
- while (nth < ncols_x && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2;
122
- const dim3 block_dims(nth, 1, 1);
123
- const dim3 block_nums(nrows_x, 1, 1);
124
- const size_t shmem = (GGML_PAD(ncols_x, WARP_SIZE) + WARP_SIZE)*sizeof(float);
125
- static_assert(CUDA_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted.");
126
-
127
- const uint32_t n_head = nrows_x/nrows_y;
128
- const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
129
-
130
- const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
131
- const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
132
-
133
- // FIXME: this limit could be raised by ~2-4x on Ampere or newer
134
- if (shmem < ggml_cuda_info().devices[ggml_cuda_get_device()].smpb) {
135
- switch (ncols_x) {
136
- case 32:
137
- soft_max_f32<true, 32, 32><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
138
- break;
139
- case 64:
140
- soft_max_f32<true, 64, 64><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
141
- break;
142
- case 128:
143
- soft_max_f32<true, 128, 128><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
144
- break;
145
- case 256:
146
- soft_max_f32<true, 256, 256><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
147
- break;
148
- case 512:
149
- soft_max_f32<true, 512, 512><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
150
- break;
151
- case 1024:
152
- soft_max_f32<true, 1024, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
153
- break;
154
- case 2048:
155
- soft_max_f32<true, 2048, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
156
- break;
157
- case 4096:
158
- soft_max_f32<true, 4096, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
159
- break;
160
- default:
161
- soft_max_f32<true, 0, 0><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
162
- break;
163
- }
164
- } else {
165
- const size_t shmem_low = WARP_SIZE*sizeof(float);
166
- soft_max_f32<false, 0, 0><<<block_nums, block_dims, shmem_low, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
167
- }
168
- }
169
-
170
- void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
171
- const ggml_tensor * src0 = dst->src[0];
172
- const ggml_tensor * src1 = dst->src[1];
173
-
174
- const float * src0_d = (const float *)src0->data;
175
- const void * src1_d = src1 ? (const void *)src1->data : nullptr;
176
-
177
- float * dst_d = (float *)dst->data;
178
- cudaStream_t stream = ctx.stream();
179
-
180
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
181
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
182
-
183
- GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F16 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional
184
-
185
- const int64_t ne00 = src0->ne[0];
186
- const int64_t nrows_x = ggml_nrows(src0);
187
- const int64_t nrows_y = src0->ne[1];
188
-
189
- float scale = 1.0f;
190
- float max_bias = 0.0f;
191
-
192
- memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
193
- memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
194
-
195
- const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16);
196
-
197
- if (use_f16) {
198
- const half * src1_dd = (const half *)src1_d;
199
-
200
- soft_max_f32_cuda(src0_d, src1_dd, dst_d, ne00, nrows_x, nrows_y, scale, max_bias, stream);
201
- } else {
202
- const float * src1_dd = (const float *)src1_d;
203
-
204
- soft_max_f32_cuda(src0_d, src1_dd, dst_d, ne00, nrows_x, nrows_y, scale, max_bias, stream);
205
- }
206
- }
@@ -1,40 +0,0 @@
1
- #include "sumrows.cuh"
2
-
3
- static __global__ void k_sum_rows_f32(const float * x, float * dst, const int ncols) {
4
- const int row = blockIdx.x;
5
- const int col = threadIdx.x;
6
-
7
- float sum = 0.0f;
8
- for (int i = col; i < ncols; i += blockDim.x) {
9
- sum += x[row * ncols + i];
10
- }
11
-
12
- sum = warp_reduce_sum(sum);
13
-
14
- if (col == 0) {
15
- dst[row] = sum;
16
- }
17
- }
18
-
19
- static void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
20
- const dim3 block_dims(WARP_SIZE, 1, 1);
21
- const dim3 block_nums(nrows, 1, 1);
22
- k_sum_rows_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
23
- }
24
-
25
- void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
26
- const ggml_tensor * src0 = dst->src[0];
27
- const float * src0_d = (const float *)src0->data;
28
- float * dst_d = (float *)dst->data;
29
- cudaStream_t stream = ctx.stream();
30
-
31
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
32
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
33
- GGML_ASSERT(ggml_is_contiguous(src0));
34
-
35
-
36
- const int64_t ncols = src0->ne[0];
37
- const int64_t nrows = ggml_nrows(src0);
38
-
39
- sum_rows_f32_cuda(src0_d, dst_d, ncols, nrows, stream);
40
- }
@@ -1,5 +0,0 @@
1
- // This file has been autogenerated by generate_cu_files.py, do not edit manually.
2
-
3
- #include "../fattn-vec-f16.cuh"
4
-
5
- DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16);
@@ -1,5 +0,0 @@
1
- // This file has been autogenerated by generate_cu_files.py, do not edit manually.
2
-
3
- #include "../fattn-vec-f16.cuh"
4
-
5
- DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_0);
@@ -1,5 +0,0 @@
1
- // This file has been autogenerated by generate_cu_files.py, do not edit manually.
2
-
3
- #include "../fattn-vec-f16.cuh"
4
-
5
- DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_1);
@@ -1,5 +0,0 @@
1
- // This file has been autogenerated by generate_cu_files.py, do not edit manually.
2
-
3
- #include "../fattn-vec-f16.cuh"
4
-
5
- DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_0);
@@ -1,5 +0,0 @@
1
- // This file has been autogenerated by generate_cu_files.py, do not edit manually.
2
-
3
- #include "../fattn-vec-f16.cuh"
4
-
5
- DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_1);
@@ -1,5 +0,0 @@
1
- // This file has been autogenerated by generate_cu_files.py, do not edit manually.
2
-
3
- #include "../fattn-vec-f16.cuh"
4
-
5
- DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q8_0);
@@ -1,5 +0,0 @@
1
- // This file has been autogenerated by generate_cu_files.py, do not edit manually.
2
-
3
- #include "../fattn-vec-f16.cuh"
4
-
5
- DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_F16);
@@ -1,5 +0,0 @@
1
- // This file has been autogenerated by generate_cu_files.py, do not edit manually.
2
-
3
- #include "../fattn-vec-f16.cuh"
4
-
5
- DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0);
@@ -1,5 +0,0 @@
1
- // This file has been autogenerated by generate_cu_files.py, do not edit manually.
2
-
3
- #include "../fattn-vec-f16.cuh"
4
-
5
- DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1);
@@ -1,5 +0,0 @@
1
- // This file has been autogenerated by generate_cu_files.py, do not edit manually.
2
-
3
- #include "../fattn-vec-f16.cuh"
4
-
5
- DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_0);
@@ -1,5 +0,0 @@
1
- // This file has been autogenerated by generate_cu_files.py, do not edit manually.
2
-
3
- #include "../fattn-vec-f16.cuh"
4
-
5
- DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_1);
@@ -1,5 +0,0 @@
1
- // This file has been autogenerated by generate_cu_files.py, do not edit manually.
2
-
3
- #include "../fattn-vec-f16.cuh"
4
-
5
- DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0);
@@ -1,5 +0,0 @@
1
- // This file has been autogenerated by generate_cu_files.py, do not edit manually.
2
-
3
- #include "../fattn-vec-f16.cuh"
4
-
5
- DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_F16);
@@ -1,5 +0,0 @@
1
- // This file has been autogenerated by generate_cu_files.py, do not edit manually.
2
-
3
- #include "../fattn-vec-f16.cuh"
4
-
5
- DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0);
@@ -1,5 +0,0 @@
1
- // This file has been autogenerated by generate_cu_files.py, do not edit manually.
2
-
3
- #include "../fattn-vec-f16.cuh"
4
-
5
- DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_1);
@@ -1,5 +0,0 @@
1
- // This file has been autogenerated by generate_cu_files.py, do not edit manually.
2
-
3
- #include "../fattn-vec-f16.cuh"
4
-
5
- DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_0);
@@ -1,5 +0,0 @@
1
- // This file has been autogenerated by generate_cu_files.py, do not edit manually.
2
-
3
- #include "../fattn-vec-f16.cuh"
4
-
5
- DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_1);
@@ -1,5 +0,0 @@
1
- // This file has been autogenerated by generate_cu_files.py, do not edit manually.
2
-
3
- #include "../fattn-vec-f16.cuh"
4
-
5
- DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q8_0);