llama_cpp 0.16.2 → 0.17.0

Files changed (177)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +18 -0
  3. data/README.md +7 -12
  4. data/ext/llama_cpp/extconf.rb +2 -43
  5. data/ext/llama_cpp/llama_cpp.cpp +8 -0
  6. data/lib/llama_cpp/version.rb +3 -3
  7. data/sig/llama_cpp.rbs +3 -0
  8. metadata +2 -171
  9. data/vendor/include/.gitkeep +0 -0
  10. data/vendor/lib/.gitkeep +0 -0
  11. data/vendor/tmp/llama.cpp/LICENSE +0 -21
  12. data/vendor/tmp/llama.cpp/Makefile +0 -1124
  13. data/vendor/tmp/llama.cpp/ggml-alloc.c +0 -1041
  14. data/vendor/tmp/llama.cpp/ggml-alloc.h +0 -76
  15. data/vendor/tmp/llama.cpp/ggml-backend-impl.h +0 -153
  16. data/vendor/tmp/llama.cpp/ggml-backend.c +0 -2225
  17. data/vendor/tmp/llama.cpp/ggml-backend.h +0 -236
  18. data/vendor/tmp/llama.cpp/ggml-blas.cpp +0 -363
  19. data/vendor/tmp/llama.cpp/ggml-blas.h +0 -23
  20. data/vendor/tmp/llama.cpp/ggml-common.h +0 -1805
  21. data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu +0 -47
  22. data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu +0 -34
  23. data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +0 -104
  24. data/vendor/tmp/llama.cpp/ggml-cuda/binbcast.cu +0 -280
  25. data/vendor/tmp/llama.cpp/ggml-cuda/clamp.cu +0 -34
  26. data/vendor/tmp/llama.cpp/ggml-cuda/concat.cu +0 -196
  27. data/vendor/tmp/llama.cpp/ggml-cuda/convert.cu +0 -686
  28. data/vendor/tmp/llama.cpp/ggml-cuda/cpy.cu +0 -490
  29. data/vendor/tmp/llama.cpp/ggml-cuda/diagmask.cu +0 -40
  30. data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +0 -674
  31. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +0 -319
  32. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f32.cu +0 -312
  33. data/vendor/tmp/llama.cpp/ggml-cuda/fattn.cu +0 -345
  34. data/vendor/tmp/llama.cpp/ggml-cuda/getrows.cu +0 -178
  35. data/vendor/tmp/llama.cpp/ggml-cuda/im2col.cu +0 -104
  36. data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +0 -88
  37. data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +0 -419
  38. data/vendor/tmp/llama.cpp/ggml-cuda/norm.cu +0 -221
  39. data/vendor/tmp/llama.cpp/ggml-cuda/pad.cu +0 -49
  40. data/vendor/tmp/llama.cpp/ggml-cuda/pool2d.cu +0 -94
  41. data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +0 -112
  42. data/vendor/tmp/llama.cpp/ggml-cuda/rope.cu +0 -271
  43. data/vendor/tmp/llama.cpp/ggml-cuda/scale.cu +0 -31
  44. data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +0 -206
  45. data/vendor/tmp/llama.cpp/ggml-cuda/sumrows.cu +0 -40
  46. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
  47. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
  48. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
  49. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
  50. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
  51. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
  52. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
  53. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
  54. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
  55. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
  56. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
  57. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
  58. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
  59. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
  60. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
  61. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
  62. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
  63. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
  64. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
  65. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
  66. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
  67. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
  68. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
  69. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
  70. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
  71. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
  72. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
  73. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
  74. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
  75. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
  76. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
  77. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
  78. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
  79. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
  80. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
  81. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
  82. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
  83. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
  84. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
  85. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
  86. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
  87. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
  88. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
  89. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
  90. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
  91. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
  92. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
  93. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
  94. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
  95. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
  96. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
  97. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
  98. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
  99. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
  100. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
  101. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
  102. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
  103. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
  104. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
  105. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
  106. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
  107. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
  108. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
  109. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
  110. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
  111. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
  112. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
  113. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
  114. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
  115. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
  116. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
  117. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
  118. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
  119. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
  120. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
  121. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
  122. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
  123. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
  124. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
  125. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
  126. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
  127. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
  128. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
  129. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
  130. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
  131. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
  132. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +0 -10
  133. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +0 -9
  134. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +0 -10
  135. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +0 -10
  136. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +0 -8
  137. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +0 -5
  138. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +0 -5
  139. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +0 -5
  140. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +0 -5
  141. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +0 -5
  142. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +0 -5
  143. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +0 -5
  144. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +0 -5
  145. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +0 -5
  146. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +0 -5
  147. data/vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu +0 -47
  148. data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +0 -314
  149. data/vendor/tmp/llama.cpp/ggml-cuda/upscale.cu +0 -51
  150. data/vendor/tmp/llama.cpp/ggml-cuda.cu +0 -3069
  151. data/vendor/tmp/llama.cpp/ggml-cuda.h +0 -44
  152. data/vendor/tmp/llama.cpp/ggml-impl.h +0 -651
  153. data/vendor/tmp/llama.cpp/ggml-kompute.cpp +0 -2038
  154. data/vendor/tmp/llama.cpp/ggml-kompute.h +0 -46
  155. data/vendor/tmp/llama.cpp/ggml-metal.h +0 -66
  156. data/vendor/tmp/llama.cpp/ggml-metal.m +0 -3273
  157. data/vendor/tmp/llama.cpp/ggml-metal.metal +0 -6540
  158. data/vendor/tmp/llama.cpp/ggml-quants.c +0 -14994
  159. data/vendor/tmp/llama.cpp/ggml-quants.h +0 -133
  160. data/vendor/tmp/llama.cpp/ggml-rpc.cpp +0 -1178
  161. data/vendor/tmp/llama.cpp/ggml-rpc.h +0 -24
  162. data/vendor/tmp/llama.cpp/ggml-sycl.cpp +0 -6351
  163. data/vendor/tmp/llama.cpp/ggml-sycl.h +0 -40
  164. data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +0 -144508
  165. data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +0 -7183
  166. data/vendor/tmp/llama.cpp/ggml-vulkan.h +0 -29
  167. data/vendor/tmp/llama.cpp/ggml.c +0 -22506
  168. data/vendor/tmp/llama.cpp/ggml.h +0 -2458
  169. data/vendor/tmp/llama.cpp/llama.cpp +0 -18985
  170. data/vendor/tmp/llama.cpp/llama.h +0 -1147
  171. data/vendor/tmp/llama.cpp/scripts/get-flags.mk +0 -38
  172. data/vendor/tmp/llama.cpp/sgemm.cpp +0 -1032
  173. data/vendor/tmp/llama.cpp/sgemm.h +0 -14
  174. data/vendor/tmp/llama.cpp/unicode-data.cpp +0 -7033
  175. data/vendor/tmp/llama.cpp/unicode-data.h +0 -20
  176. data/vendor/tmp/llama.cpp/unicode.cpp +0 -810
  177. data/vendor/tmp/llama.cpp/unicode.h +0 -63
data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu
@@ -1,47 +0,0 @@
- #include "acc.cuh"
-
- static __global__ void acc_f32(const float * x, const float * y, float * dst, const int ne,
- const int ne10, const int ne11, const int ne12,
- const int nb1, const int nb2, int offset) {
- const int i = blockDim.x * blockIdx.x + threadIdx.x;
- if (i >= ne) {
- return;
- }
- int src1_idx = i - offset;
- int oz = src1_idx / nb2;
- int oy = (src1_idx - (oz * nb2)) / nb1;
- int ox = src1_idx % nb1;
- if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) {
- dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11];
- } else {
- dst[i] = x[i];
- }
- }
-
- static void acc_f32_cuda(const float * x, const float * y, float * dst, const int n_elements,
- const int ne10, const int ne11, const int ne12,
- const int nb1, const int nb2, const int offset, cudaStream_t stream) {
- int num_blocks = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE;
- acc_f32<<<num_blocks, CUDA_ACC_BLOCK_SIZE, 0, stream>>>(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset);
- }
-
- void ggml_cuda_op_acc(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
- const ggml_tensor * src0 = dst->src[0];
- const ggml_tensor * src1 = dst->src[1];
- const float * src0_d = (const float *)src0->data;
- const float * src1_d = (const float *)src1->data;
- float * dst_d = (float *)dst->data;
- cudaStream_t stream = ctx.stream();
-
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
- GGML_ASSERT(src1->type == GGML_TYPE_F32);
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
- GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported
-
- int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
- int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
- // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused
- int offset = dst->op_params[3] / 4; // offset in bytes
-
- acc_f32_cuda(src0_d, src1_d, dst_d, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, stream);
- }
data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu
@@ -1,34 +0,0 @@
- #include "arange.cuh"
-
- static __global__ void arange_f32(float * dst, const int ne0, const float start, const float step) {
- // blockIDx.x: idx of ne0 / BLOCK_SIZE
- int nidx = threadIdx.x + blockIdx.x * blockDim.x;
- if (nidx >= ne0) {
- return;
- }
- dst[nidx] = start + step * nidx;
- }
-
- static void arange_f32_cuda(float * dst, const int ne0, const float start, const float step, cudaStream_t stream) {
- int num_blocks = (ne0 + CUDA_ARANGE_BLOCK_SIZE - 1) / CUDA_ARANGE_BLOCK_SIZE;
- arange_f32<<<num_blocks, CUDA_ARANGE_BLOCK_SIZE, 0, stream>>>(dst, ne0, start, step);
- }
-
- void ggml_cuda_op_arange(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
- float * dst_d = (float *)dst->data;
- cudaStream_t stream = ctx.stream();
-
- GGML_ASSERT(dst->type == GGML_TYPE_F32);
-
- float start;
- float stop;
- float step;
- memcpy(&start, (float *)dst->op_params + 0, sizeof(float));
- memcpy(&stop, (float *)dst->op_params + 1, sizeof(float));
- memcpy(&step, (float *)dst->op_params + 2, sizeof(float));
-
- int64_t steps = (int64_t)ceil((stop - start) / step);
- GGML_ASSERT(ggml_nelements(dst) == steps);
-
- arange_f32_cuda(dst_d, dst->ne[0], start, step, stream);
- }
data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu
@@ -1,104 +0,0 @@
- #include "argsort.cuh"
-
- template<typename T>
- static inline __device__ void ggml_cuda_swap(T & a, T & b) {
- T tmp = a;
- a = b;
- b = tmp;
- }
-
- template<ggml_sort_order order>
- static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int ncols, int ncols_pad) {
- // bitonic sort
- int col = threadIdx.x;
- int row = blockIdx.y;
-
- if (col >= ncols_pad) {
- return;
- }
-
- const float * x_row = x + row * ncols;
- extern __shared__ int dst_row[];
-
- // initialize indices
- dst_row[col] = col;
-
- __syncthreads();
-
- for (int k = 2; k <= ncols_pad; k *= 2) {
- for (int j = k / 2; j > 0; j /= 2) {
- int ixj = col ^ j;
- if (ixj > col) {
- if ((col & k) == 0) {
- if (dst_row[col] >= ncols ||
- (dst_row[ixj] < ncols && (order == GGML_SORT_ORDER_ASC ?
- x_row[dst_row[col]] > x_row[dst_row[ixj]] :
- x_row[dst_row[col]] < x_row[dst_row[ixj]]))
- ) {
- ggml_cuda_swap(dst_row[col], dst_row[ixj]);
- }
- } else {
- if (dst_row[ixj] >= ncols ||
- (dst_row[col] < ncols && (order == GGML_SORT_ORDER_ASC ?
- x_row[dst_row[col]] < x_row[dst_row[ixj]] :
- x_row[dst_row[col]] > x_row[dst_row[ixj]]))
- ) {
- ggml_cuda_swap(dst_row[col], dst_row[ixj]);
- }
- }
- }
- __syncthreads();
- }
- }
-
- // copy the result to dst without the padding
- if (col < ncols) {
- dst[row * ncols + col] = dst_row[col];
- }
- }
-
- static int next_power_of_2(int x) {
- int n = 1;
- while (n < x) {
- n *= 2;
- }
- return n;
- }
-
- static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) {
- // bitonic sort requires ncols to be power of 2
- const int ncols_pad = next_power_of_2(ncols);
-
- const dim3 block_dims(ncols_pad, 1, 1);
- const dim3 block_nums(1, nrows, 1);
- const size_t shared_mem = ncols_pad * sizeof(int);
-
- // FIXME: this limit could be raised by ~2-4x on Ampere or newer
- GGML_ASSERT(shared_mem <= ggml_cuda_info().devices[ggml_cuda_get_device()].smpb);
-
- if (order == GGML_SORT_ORDER_ASC) {
- k_argsort_f32_i32<GGML_SORT_ORDER_ASC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
- } else if (order == GGML_SORT_ORDER_DESC) {
- k_argsort_f32_i32<GGML_SORT_ORDER_DESC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
- } else {
- GGML_ASSERT(false);
- }
- }
-
- void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
- const ggml_tensor * src0 = dst->src[0];
- const float * src0_d = (const float *)src0->data;
- float * dst_d = (float *)dst->data;
- cudaStream_t stream = ctx.stream();
-
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
- GGML_ASSERT( dst->type == GGML_TYPE_I32);
- GGML_ASSERT(ggml_is_contiguous(src0));
-
- const int64_t ncols = src0->ne[0];
- const int64_t nrows = ggml_nrows(src0);
-
- enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
-
- argsort_f32_i32_cuda(src0_d, (int *)dst_d, ncols, nrows, order, stream);
- }
data/vendor/tmp/llama.cpp/ggml-cuda/binbcast.cu
@@ -1,280 +0,0 @@
- #include "binbcast.cuh"
-
- static __device__ __forceinline__ float op_repeat(const float a, const float b) {
- return b;
- GGML_UNUSED(a);
- }
-
- static __device__ __forceinline__ float op_add(const float a, const float b) {
- return a + b;
- }
-
- static __device__ __forceinline__ float op_mul(const float a, const float b) {
- return a * b;
- }
-
- static __device__ __forceinline__ float op_div(const float a, const float b) {
- return a / b;
- }
-
- template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
- static __global__ void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
- int ne0, int ne1, int ne2, int ne3,
- int ne10, int ne11, int ne12, int ne13,
- /*int s0, */ int s1, int s2, int s3,
- /*int s00,*/ int s01, int s02, int s03,
- /*int s10,*/ int s11, int s12, int s13) {
- const int i0s = blockDim.x*blockIdx.x + threadIdx.x;
- const int i1 = (blockDim.y*blockIdx.y + threadIdx.y);
- const int i2 = (blockDim.z*blockIdx.z + threadIdx.z) / ne3;
- const int i3 = (blockDim.z*blockIdx.z + threadIdx.z) % ne3;
-
- if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
- return;
- }
-
- const int i11 = i1 % ne11;
- const int i12 = i2 % ne12;
- const int i13 = i3 % ne13;
-
- const size_t i_src0 = i3*s03 + i2*s02 + i1*s01;
- const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
- const size_t i_dst = i3*s3 + i2*s2 + i1*s1;
-
- const src0_t * src0_row = src0 + i_src0;
- const src1_t * src1_row = src1 + i_src1;
- dst_t * dst_row = dst + i_dst;
-
- for (int i0 = i0s; i0 < ne0; i0 += blockDim.x*gridDim.x) {
- const int i10 = i0 % ne10;
- dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
- }
- }
-
- template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
- static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst,
- int ne0, int ne1, int ne2, int ne3,
- int ne10, int ne11, int ne12, int ne13,
- /*int s0, */ int s1, int s2, int s3,
- /*int s00,*/ int s01, int s02, int s03,
- /*int s10,*/ int s11, int s12, int s13) {
-
- const int i = blockDim.x*blockIdx.x + threadIdx.x;
-
- const int i3 = i/(ne2*ne1*ne0);
- const int i2 = (i/(ne1*ne0)) % ne2;
- const int i1 = (i/ne0) % ne1;
- const int i0 = i % ne0;
-
- if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
- return;
- }
-
- const int i11 = i1 % ne11;
- const int i12 = i2 % ne12;
- const int i13 = i3 % ne13;
-
- const size_t i_src0 = i3*s03 + i2*s02 + i1*s01;
- const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
- const size_t i_dst = i3*s3 + i2*s2 + i1*s1;
-
- const src0_t * src0_row = src0 + i_src0;
- const src1_t * src1_row = src1 + i_src1;
- dst_t * dst_row = dst + i_dst;
-
- const int i10 = i0 % ne10;
- dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
- }
-
- template<float (*bin_op)(const float, const float)>
- struct bin_bcast_cuda {
- template<typename src0_t, typename src1_t, typename dst_t>
- void operator()(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst,
- const src0_t * src0_dd, const src1_t * src1_dd, dst_t * dst_dd,
- cudaStream_t stream) {
-
- GGML_TENSOR_BINARY_OP_LOCALS
-
- int nr0 = ne10/ne0;
- int nr1 = ne11/ne1;
- int nr2 = ne12/ne2;
- int nr3 = ne13/ne3;
-
- int nr[4] = { nr0, nr1, nr2, nr3 };
-
- // collapse dimensions until first broadcast dimension
- int64_t cne[] = {ne0, ne1, ne2, ne3};
- int64_t cne0[] = {ne00, ne01, ne02, ne03};
- int64_t cne1[] = {ne10, ne11, ne12, ne13};
-
- size_t cnb[] = {nb0, nb1, nb2, nb3};
- size_t cnb0[] = {nb00, nb01, nb02, nb03};
- size_t cnb1[] = {nb10, nb11, nb12, nb13};
-
- auto collapse = [](int64_t cne[]) {
- cne[0] *= cne[1];
- cne[1] = cne[2];
- cne[2] = cne[3];
- cne[3] = 1;
- };
-
- auto collapse_nb = [](size_t cnb[], const int64_t cne[]) {
- cnb[1] *= cne[1];
- cnb[2] *= cne[2];
- cnb[3] *= cne[3];
- };
-
- if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ggml_is_contiguous(dst)) {
- for (int i = 0; i < 4; i++) {
- if (nr[i] != 1) {
- break;
- }
- if (i > 0) {
- collapse_nb(cnb, cne);
- collapse_nb(cnb0, cne0);
- collapse_nb(cnb1, cne1);
- collapse(cne);
- collapse(cne0);
- collapse(cne1);
- }
- }
- }
-
- {
- int64_t ne0 = cne[0];
- int64_t ne1 = cne[1];
- int64_t ne2 = cne[2];
- int64_t ne3 = cne[3];
-
- //int64_t ne00 = cne0[0]; GGML_UNUSED(ne00);
- //int64_t ne01 = cne0[1]; GGML_UNUSED(ne01);
- //int64_t ne02 = cne0[2]; GGML_UNUSED(ne02);
- //int64_t ne03 = cne0[3]; GGML_UNUSED(ne03);
-
- int64_t ne10 = cne1[0];
- int64_t ne11 = cne1[1];
- int64_t ne12 = cne1[2];
- int64_t ne13 = cne1[3];
-
- size_t nb0 = cnb[0];
- size_t nb1 = cnb[1];
- size_t nb2 = cnb[2];
- size_t nb3 = cnb[3];
-
- size_t nb00 = cnb0[0];
- size_t nb01 = cnb0[1];
- size_t nb02 = cnb0[2];
- size_t nb03 = cnb0[3];
-
- size_t nb10 = cnb1[0];
- size_t nb11 = cnb1[1];
- size_t nb12 = cnb1[2];
- size_t nb13 = cnb1[3];
-
- size_t s0 = nb0 / sizeof(dst_t);
- size_t s1 = nb1 / sizeof(dst_t);
- size_t s2 = nb2 / sizeof(dst_t);
- size_t s3 = nb3 / sizeof(dst_t);
-
- size_t s10 = nb10 / sizeof(src1_t);
- size_t s11 = nb11 / sizeof(src1_t);
- size_t s12 = nb12 / sizeof(src1_t);
- size_t s13 = nb13 / sizeof(src1_t);
-
- size_t s00 = nb00 / sizeof(src0_t);
- size_t s01 = nb01 / sizeof(src0_t);
- size_t s02 = nb02 / sizeof(src0_t);
- size_t s03 = nb03 / sizeof(src0_t);
-
- GGML_ASSERT(nb0 % sizeof(dst_t) == 0);
- GGML_ASSERT(nb1 % sizeof(dst_t) == 0);
- GGML_ASSERT(nb2 % sizeof(dst_t) == 0);
- GGML_ASSERT(nb3 % sizeof(dst_t) == 0);
-
- GGML_ASSERT(nb00 % sizeof(src0_t) == 0);
- GGML_ASSERT(nb01 % sizeof(src0_t) == 0);
- GGML_ASSERT(nb02 % sizeof(src0_t) == 0);
- GGML_ASSERT(nb03 % sizeof(src0_t) == 0);
-
- GGML_ASSERT(nb10 % sizeof(src1_t) == 0);
- GGML_ASSERT(nb11 % sizeof(src1_t) == 0);
- GGML_ASSERT(nb12 % sizeof(src1_t) == 0);
- GGML_ASSERT(nb13 % sizeof(src1_t) == 0);
-
- GGML_ASSERT(s0 == 1);
- GGML_ASSERT(s00 == 1);
- GGML_ASSERT(s10 == 1);
-
- const int block_size = 128;
-
- int64_t hne0 = std::max(ne0/2LL, 1LL);
-
- dim3 block_dims;
- block_dims.x = std::min<unsigned int>(hne0, block_size);
- block_dims.y = std::min<unsigned int>(ne1, block_size / block_dims.x);
- block_dims.z = std::min(std::min<unsigned int>(ne2*ne3, block_size / block_dims.x / block_dims.y), 64U);
-
- dim3 block_nums(
- (hne0 + block_dims.x - 1) / block_dims.x,
- (ne1 + block_dims.y - 1) / block_dims.y,
- (ne2*ne3 + block_dims.z - 1) / block_dims.z
- );
-
- if (block_nums.z > 65535) {
- // this is the maximum number of blocks in z dimension, fallback to 1D grid kernel
- int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size;
- k_bin_bcast_unravel<bin_op><<<block_num, block_size, 0, stream>>>(
- src0_dd, src1_dd, dst_dd,
- ne0, ne1, ne2, ne3,
- ne10, ne11, ne12, ne13,
- /* s0, */ s1, s2, s3,
- /* s00, */ s01, s02, s03,
- /* s10, */ s11, s12, s13);
- } else {
- k_bin_bcast<bin_op><<<block_nums, block_dims, 0, stream>>>(
- src0_dd, src1_dd, dst_dd,
- ne0, ne1, ne2, ne3,
- ne10, ne11, ne12, ne13,
- /* s0, */ s1, s2, s3,
- /* s00, */ s01, s02, s03,
- /* s10, */ s11, s12, s13);
- }
- }
- }
- };
-
- template<class op>
- static void ggml_cuda_op_bin_bcast(
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
- const void * src0_dd, const void * src1_dd, void * dst_dd, cudaStream_t stream) {
-
- GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
- if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
- op()(src0, src1, dst, (const float *)src0_dd, (const float *)src1_dd, (float *)dst_dd, stream);
- } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
- op()(src0, src1, dst, (const half *) src0_dd, (const float *)src1_dd, (half *) dst_dd, stream);
- } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
- op()(src0, src1, dst, (const half *) src0_dd, (const float *)src1_dd, (float *)dst_dd, stream);
- } else {
- fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
- ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
- GGML_ASSERT(false);
- }
- }
-
- void ggml_cuda_op_repeat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
- ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_repeat>>(dst, dst->src[0], dst, nullptr, dst->src[0]->data, dst->data, ctx.stream());
- }
-
- void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
- ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_add>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
- }
-
- void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
- ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_mul>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
- }
-
- void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
- ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_div>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
- }
data/vendor/tmp/llama.cpp/ggml-cuda/clamp.cu
@@ -1,34 +0,0 @@
- #include "clamp.cuh"
-
- static __global__ void clamp_f32(const float * x, float * dst, const float min, const float max, const int k) {
- const int i = blockDim.x*blockIdx.x + threadIdx.x;
-
- if (i >= k) {
- return;
- }
-
- dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
- }
-
- static void clamp_f32_cuda(const float * x, float * dst, const float min, const float max, const int k, cudaStream_t stream) {
- const int num_blocks = (k + CUDA_CLAMP_BLOCK_SIZE - 1) / CUDA_CLAMP_BLOCK_SIZE;
- clamp_f32<<<num_blocks, CUDA_CLAMP_BLOCK_SIZE, 0, stream>>>(x, dst, min, max, k);
- }
-
-
- void ggml_cuda_op_clamp(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
- const ggml_tensor * src0 = dst->src[0];
- const float * src0_d = (const float *)src0->data;
- float * dst_d = (float *)dst->data;
- cudaStream_t stream = ctx.stream();
-
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
- float min;
- float max;
- memcpy(&min, dst->op_params, sizeof(float));
- memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
-
- clamp_f32_cuda(src0_d, dst_d, min, max, ggml_nelements(src0), stream);
- }
data/vendor/tmp/llama.cpp/ggml-cuda/concat.cu
@@ -1,196 +0,0 @@
- #include "concat.cuh"
-
- // contiguous kernels
- static __global__ void concat_f32_dim0(const float * x, const float * y, float * dst, const int ne0, const int ne00) {
- int nidx = threadIdx.x + blockIdx.x * blockDim.x;
- if (nidx >= ne0) {
- return;
- }
-
- int offset_dst =
- nidx +
- blockIdx.y * ne0 +
- blockIdx.z * ne0 * gridDim.y;
-
- if (nidx < ne00) { // src0
- int offset_src =
- nidx +
- blockIdx.y * ne00 +
- blockIdx.z * ne00 * gridDim.y;
- dst[offset_dst] = x[offset_src];
- } else {
- int offset_src =
- (nidx - ne00) +
- blockIdx.y * (ne0 - ne00) +
- blockIdx.z * (ne0 - ne00) * gridDim.y;
- dst[offset_dst] = y[offset_src];
- }
- }
-
- static __global__ void concat_f32_dim1(const float * x, const float * y, float * dst, const int ne0, const int ne01) {
- int nidx = threadIdx.x + blockIdx.x * blockDim.x;
- if (nidx >= ne0) {
- return;
- }
-
- int offset_dst =
- nidx +
- blockIdx.y * ne0 +
- blockIdx.z * ne0 * gridDim.y;
-
- if (blockIdx.y < ne01) { // src0
- int offset_src =
- nidx +
- blockIdx.y * ne0 +
- blockIdx.z * ne0 * ne01;
- dst[offset_dst] = x[offset_src];
- } else {
- int offset_src =
- nidx +
- (blockIdx.y - ne01) * ne0 +
- blockIdx.z * ne0 * (gridDim.y - ne01);
- dst[offset_dst] = y[offset_src];
- }
- }
-
- static __global__ void concat_f32_dim2(const float * x, const float * y, float * dst, const int ne0, const int ne02) {
- int nidx = threadIdx.x + blockIdx.x * blockDim.x;
- if (nidx >= ne0) {
- return;
- }
-
- int offset_dst =
- nidx +
- blockIdx.y * ne0 +
- blockIdx.z * ne0 * gridDim.y;
-
- if (blockIdx.z < ne02) { // src0
- int offset_src =
- nidx +
- blockIdx.y * ne0 +
- blockIdx.z * ne0 * gridDim.y;
- dst[offset_dst] = x[offset_src];
- } else {
- int offset_src =
- nidx +
- blockIdx.y * ne0 +
- (blockIdx.z - ne02) * ne0 * gridDim.y;
- dst[offset_dst] = y[offset_src];
- }
- }
-
- static void concat_f32_cuda(const float * x, const float * y, float * dst, int ne00, int ne01, int ne02, int ne0, int ne1, int ne2, int dim, cudaStream_t stream) {
- int num_blocks = (ne0 + CUDA_CONCAT_BLOCK_SIZE - 1) / CUDA_CONCAT_BLOCK_SIZE;
- dim3 gridDim(num_blocks, ne1, ne2);
- if (dim == 0) {
- concat_f32_dim0<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne00);
- return;
- }
- if (dim == 1) {
- concat_f32_dim1<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne01);
- return;
- }
- concat_f32_dim2<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne02);
- }
-
- // non-contiguous kernel (slow)
- static __global__ void concat_f32_non_cont(
- const char * src0,
- const char * src1,
- char * dst,
- int64_t ne00,
- int64_t ne01,
- int64_t ne02,
- int64_t ne03,
- uint64_t nb00,
- uint64_t nb01,
- uint64_t nb02,
- uint64_t nb03,
- int64_t /*ne10*/,
- int64_t /*ne11*/,
- int64_t /*ne12*/,
- int64_t /*ne13*/,
- uint64_t nb10,
- uint64_t nb11,
- uint64_t nb12,
- uint64_t nb13,
- int64_t ne0,
- int64_t /*ne1*/,
- int64_t /*ne2*/,
- int64_t /*ne3*/,
- uint64_t nb0,
- uint64_t nb1,
- uint64_t nb2,
- uint64_t nb3,
- int32_t dim) {
- const int64_t i3 = blockIdx.z;
- const int64_t i2 = blockIdx.y;
- const int64_t i1 = blockIdx.x;
-
- int64_t o[4] = {0, 0, 0, 0};
- o[dim] = dim == 0 ? ne00 : (dim == 1 ? ne01 : (dim == 2 ? ne02 : ne03));
-
- const float * x;
-
- for (int i0 = threadIdx.x; i0 < ne0; i0 += blockDim.x) {
- if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
- x = (const float *)(src0 + (i3 )*nb03 + (i2 )*nb02 + (i1 )*nb01 + (i0 )*nb00);
- } else {
- x = (const float *)(src1 + (i3 - o[3])*nb13 + (i2 - o[2])*nb12 + (i1 - o[1])*nb11 + (i0 - o[0])*nb10);
- }
-
- float * y = (float *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
-
- *y = *x;
- }
- }
-
-
- void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
- const ggml_tensor * src0 = dst->src[0];
- const ggml_tensor * src1 = dst->src[1];
-
- cudaStream_t stream = ctx.stream();
-
- const int32_t dim = ((int32_t *) dst->op_params)[0];
-
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
- GGML_ASSERT(src1->type == GGML_TYPE_F32);
- GGML_ASSERT(dst->type == GGML_TYPE_F32);
-
- if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
- const float * src0_d = (const float *)src0->data;
- const float * src1_d = (const float *)src1->data;
-
- float * dst_d = (float *)dst->data;
-
- if (dim != 3) {
- for (int i3 = 0; i3 < dst->ne[3]; i3++) {
- concat_f32_cuda(
- src0_d + i3 * (src0->nb[3] / 4),
- src1_d + i3 * (src1->nb[3] / 4),
- dst_d + i3 * ( dst->nb[3] / 4),
- src0->ne[0], src0->ne[1], src0->ne[2],
- dst->ne[0], dst->ne[1], dst->ne[2], dim, stream);
- }
- } else {
- const size_t size0 = ggml_nbytes(src0);
- const size_t size1 = ggml_nbytes(src1);
-
- CUDA_CHECK(cudaMemcpyAsync(dst_d, src0_d, size0, cudaMemcpyDeviceToDevice, stream));
- CUDA_CHECK(cudaMemcpyAsync(dst_d + size0/4, src1_d, size1, cudaMemcpyDeviceToDevice, stream));
- }
- } else {
- dim3 grid_dim(dst->ne[1], dst->ne[2], dst->ne[3]);
- concat_f32_non_cont<<<grid_dim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(
- (const char *)src0->data,
- (const char *)src1->data,
- ( char *)dst->data,
- src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
- src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
- src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
- src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3],
- dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
- dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3], dim);
- }
- }