llama_cpp 0.16.2 → 0.17.0

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only.
Files changed (177)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +18 -0
  3. data/README.md +7 -12
  4. data/ext/llama_cpp/extconf.rb +2 -43
  5. data/ext/llama_cpp/llama_cpp.cpp +8 -0
  6. data/lib/llama_cpp/version.rb +3 -3
  7. data/sig/llama_cpp.rbs +3 -0
  8. metadata +2 -171
  9. data/vendor/include/.gitkeep +0 -0
  10. data/vendor/lib/.gitkeep +0 -0
  11. data/vendor/tmp/llama.cpp/LICENSE +0 -21
  12. data/vendor/tmp/llama.cpp/Makefile +0 -1124
  13. data/vendor/tmp/llama.cpp/ggml-alloc.c +0 -1041
  14. data/vendor/tmp/llama.cpp/ggml-alloc.h +0 -76
  15. data/vendor/tmp/llama.cpp/ggml-backend-impl.h +0 -153
  16. data/vendor/tmp/llama.cpp/ggml-backend.c +0 -2225
  17. data/vendor/tmp/llama.cpp/ggml-backend.h +0 -236
  18. data/vendor/tmp/llama.cpp/ggml-blas.cpp +0 -363
  19. data/vendor/tmp/llama.cpp/ggml-blas.h +0 -23
  20. data/vendor/tmp/llama.cpp/ggml-common.h +0 -1805
  21. data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu +0 -47
  22. data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu +0 -34
  23. data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +0 -104
  24. data/vendor/tmp/llama.cpp/ggml-cuda/binbcast.cu +0 -280
  25. data/vendor/tmp/llama.cpp/ggml-cuda/clamp.cu +0 -34
  26. data/vendor/tmp/llama.cpp/ggml-cuda/concat.cu +0 -196
  27. data/vendor/tmp/llama.cpp/ggml-cuda/convert.cu +0 -686
  28. data/vendor/tmp/llama.cpp/ggml-cuda/cpy.cu +0 -490
  29. data/vendor/tmp/llama.cpp/ggml-cuda/diagmask.cu +0 -40
  30. data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +0 -674
  31. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +0 -319
  32. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f32.cu +0 -312
  33. data/vendor/tmp/llama.cpp/ggml-cuda/fattn.cu +0 -345
  34. data/vendor/tmp/llama.cpp/ggml-cuda/getrows.cu +0 -178
  35. data/vendor/tmp/llama.cpp/ggml-cuda/im2col.cu +0 -104
  36. data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +0 -88
  37. data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +0 -419
  38. data/vendor/tmp/llama.cpp/ggml-cuda/norm.cu +0 -221
  39. data/vendor/tmp/llama.cpp/ggml-cuda/pad.cu +0 -49
  40. data/vendor/tmp/llama.cpp/ggml-cuda/pool2d.cu +0 -94
  41. data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +0 -112
  42. data/vendor/tmp/llama.cpp/ggml-cuda/rope.cu +0 -271
  43. data/vendor/tmp/llama.cpp/ggml-cuda/scale.cu +0 -31
  44. data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +0 -206
  45. data/vendor/tmp/llama.cpp/ggml-cuda/sumrows.cu +0 -40
  46. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
  47. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
  48. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
  49. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
  50. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
  51. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
  52. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
  53. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
  54. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
  55. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
  56. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
  57. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
  58. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
  59. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
  60. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
  61. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
  62. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
  63. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
  64. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
  65. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
  66. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
  67. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
  68. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
  69. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
  70. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
  71. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
  72. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
  73. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
  74. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
  75. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
  76. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
  77. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
  78. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
  79. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
  80. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
  81. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
  82. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
  83. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
  84. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
  85. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
  86. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
  87. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
  88. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
  89. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
  90. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
  91. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
  92. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
  93. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
  94. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
  95. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
  96. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
  97. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
  98. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
  99. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
  100. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
  101. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
  102. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
  103. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
  104. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
  105. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
  106. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
  107. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
  108. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
  109. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
  110. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
  111. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
  112. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
  113. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
  114. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
  115. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
  116. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
  117. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
  118. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
  119. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
  120. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
  121. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
  122. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
  123. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
  124. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
  125. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
  126. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
  127. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
  128. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
  129. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
  130. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
  131. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
  132. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +0 -10
  133. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +0 -9
  134. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +0 -10
  135. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +0 -10
  136. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +0 -8
  137. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +0 -5
  138. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +0 -5
  139. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +0 -5
  140. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +0 -5
  141. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +0 -5
  142. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +0 -5
  143. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +0 -5
  144. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +0 -5
  145. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +0 -5
  146. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +0 -5
  147. data/vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu +0 -47
  148. data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +0 -314
  149. data/vendor/tmp/llama.cpp/ggml-cuda/upscale.cu +0 -51
  150. data/vendor/tmp/llama.cpp/ggml-cuda.cu +0 -3069
  151. data/vendor/tmp/llama.cpp/ggml-cuda.h +0 -44
  152. data/vendor/tmp/llama.cpp/ggml-impl.h +0 -651
  153. data/vendor/tmp/llama.cpp/ggml-kompute.cpp +0 -2038
  154. data/vendor/tmp/llama.cpp/ggml-kompute.h +0 -46
  155. data/vendor/tmp/llama.cpp/ggml-metal.h +0 -66
  156. data/vendor/tmp/llama.cpp/ggml-metal.m +0 -3273
  157. data/vendor/tmp/llama.cpp/ggml-metal.metal +0 -6540
  158. data/vendor/tmp/llama.cpp/ggml-quants.c +0 -14994
  159. data/vendor/tmp/llama.cpp/ggml-quants.h +0 -133
  160. data/vendor/tmp/llama.cpp/ggml-rpc.cpp +0 -1178
  161. data/vendor/tmp/llama.cpp/ggml-rpc.h +0 -24
  162. data/vendor/tmp/llama.cpp/ggml-sycl.cpp +0 -6351
  163. data/vendor/tmp/llama.cpp/ggml-sycl.h +0 -40
  164. data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +0 -144508
  165. data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +0 -7183
  166. data/vendor/tmp/llama.cpp/ggml-vulkan.h +0 -29
  167. data/vendor/tmp/llama.cpp/ggml.c +0 -22506
  168. data/vendor/tmp/llama.cpp/ggml.h +0 -2458
  169. data/vendor/tmp/llama.cpp/llama.cpp +0 -18985
  170. data/vendor/tmp/llama.cpp/llama.h +0 -1147
  171. data/vendor/tmp/llama.cpp/scripts/get-flags.mk +0 -38
  172. data/vendor/tmp/llama.cpp/sgemm.cpp +0 -1032
  173. data/vendor/tmp/llama.cpp/sgemm.h +0 -14
  174. data/vendor/tmp/llama.cpp/unicode-data.cpp +0 -7033
  175. data/vendor/tmp/llama.cpp/unicode-data.h +0 -20
  176. data/vendor/tmp/llama.cpp/unicode.cpp +0 -810
  177. data/vendor/tmp/llama.cpp/unicode.h +0 -63
data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu
@@ -1,47 +0,0 @@
-#include "acc.cuh"
-
-static __global__ void acc_f32(const float * x, const float * y, float * dst, const int ne,
-    const int ne10, const int ne11, const int ne12,
-    const int nb1, const int nb2, int offset) {
-    const int i = blockDim.x * blockIdx.x + threadIdx.x;
-    if (i >= ne) {
-        return;
-    }
-    int src1_idx = i - offset;
-    int oz = src1_idx / nb2;
-    int oy = (src1_idx - (oz * nb2)) / nb1;
-    int ox = src1_idx % nb1;
-    if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) {
-        dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11];
-    } else {
-        dst[i] = x[i];
-    }
-}
-
-static void acc_f32_cuda(const float * x, const float * y, float * dst, const int n_elements,
-    const int ne10, const int ne11, const int ne12,
-    const int nb1, const int nb2, const int offset, cudaStream_t stream) {
-    int num_blocks = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE;
-    acc_f32<<<num_blocks, CUDA_ACC_BLOCK_SIZE, 0, stream>>>(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset);
-}
-
-void ggml_cuda_op_acc(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const ggml_tensor * src1 = dst->src[1];
-    const float * src0_d = (const float *)src0->data;
-    const float * src1_d = (const float *)src1->data;
-    float * dst_d = (float *)dst->data;
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported
-
-    int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
-    int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
-    // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused
-    int offset = dst->op_params[3] / 4; // offset in bytes
-
-    acc_f32_cuda(src0_d, src1_d, dst_d, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, stream);
-}
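The removed acc_f32 kernel copies x into dst and adds y over a sub-view located by offset and the strides nb1/nb2, all pre-converted from bytes to float elements in ggml_cuda_op_acc. As orientation, here is a host-side C++ sketch of the same index arithmetic; the main harness and the sizes in it are illustrative, not part of the gem:

#include <cstdio>
#include <vector>

// CPU rendering of the acc_f32 index math: nb1/nb2/offset are in float
// elements, matching ggml_cuda_op_acc's division of op_params by 4.
static void acc_f32_cpu(const float * x, const float * y, float * dst, int ne,
                        int ne10, int ne11, int ne12, int nb1, int nb2, int offset) {
    for (int i = 0; i < ne; ++i) {
        int src1_idx = i - offset;
        int oz = src1_idx / nb2;
        int oy = (src1_idx - oz * nb2) / nb1;
        int ox = src1_idx % nb1;
        if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) {
            dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11]; // inside the view
        } else {
            dst[i] = x[i]; // outside the view: plain copy of src0
        }
    }
}

int main() {
    std::vector<float> x(8, 1.0f), y(2, 10.0f), dst(8);
    // add a 2-element row of y starting at element 3 of an 8-element x
    acc_f32_cpu(x.data(), y.data(), dst.data(), 8, 2, 1, 1, 2, 8, 3);
    for (float v : dst) printf("%g ", v); // 1 1 1 11 11 1 1 1
}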
data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu
@@ -1,34 +0,0 @@
-#include "arange.cuh"
-
-static __global__ void arange_f32(float * dst, const int ne0, const float start, const float step) {
-    // blockIDx.x: idx of ne0 / BLOCK_SIZE
-    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
-    if (nidx >= ne0) {
-        return;
-    }
-    dst[nidx] = start + step * nidx;
-}
-
-static void arange_f32_cuda(float * dst, const int ne0, const float start, const float step, cudaStream_t stream) {
-    int num_blocks = (ne0 + CUDA_ARANGE_BLOCK_SIZE - 1) / CUDA_ARANGE_BLOCK_SIZE;
-    arange_f32<<<num_blocks, CUDA_ARANGE_BLOCK_SIZE, 0, stream>>>(dst, ne0, start, step);
-}
-
-void ggml_cuda_op_arange(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    float * dst_d = (float *)dst->data;
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-
-    float start;
-    float stop;
-    float step;
-    memcpy(&start, (float *)dst->op_params + 0, sizeof(float));
-    memcpy(&stop, (float *)dst->op_params + 1, sizeof(float));
-    memcpy(&step, (float *)dst->op_params + 2, sizeof(float));
-
-    int64_t steps = (int64_t)ceil((stop - start) / step);
-    GGML_ASSERT(ggml_nelements(dst) == steps);
-
-    arange_f32_cuda(dst_d, dst->ne[0], start, step, stream);
-}
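All of these launch helpers size their grids with the same round-up division, num_blocks = (n + BLOCK - 1) / BLOCK, so the last block may be only partially full and the nidx >= ne0 guard in the kernel discards the excess threads. A worked C++ example; the BLOCK value here is illustrative, the real constants are CUDA_ARANGE_BLOCK_SIZE and friends from the removed headers:

#include <cstdio>

constexpr int BLOCK = 256; // illustrative block size

// ceil(n / BLOCK) in integer arithmetic, as used by every *_cuda helper here
static int num_blocks(int n) {
    return (n + BLOCK - 1) / BLOCK;
}

int main() {
    printf("%d\n", num_blocks(1000)); // 4: three full blocks plus one partial
    printf("%d\n", num_blocks(1024)); // 4: exact fit, no partial block
    printf("%d\n", num_blocks(1));    // 1: a single, mostly idle block
}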
data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu
@@ -1,104 +0,0 @@
-#include "argsort.cuh"
-
-template<typename T>
-static inline __device__ void ggml_cuda_swap(T & a, T & b) {
-    T tmp = a;
-    a = b;
-    b = tmp;
-}
-
-template<ggml_sort_order order>
-static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int ncols, int ncols_pad) {
-    // bitonic sort
-    int col = threadIdx.x;
-    int row = blockIdx.y;
-
-    if (col >= ncols_pad) {
-        return;
-    }
-
-    const float * x_row = x + row * ncols;
-    extern __shared__ int dst_row[];
-
-    // initialize indices
-    dst_row[col] = col;
-
-    __syncthreads();
-
-    for (int k = 2; k <= ncols_pad; k *= 2) {
-        for (int j = k / 2; j > 0; j /= 2) {
-            int ixj = col ^ j;
-            if (ixj > col) {
-                if ((col & k) == 0) {
-                    if (dst_row[col] >= ncols ||
-                        (dst_row[ixj] < ncols && (order == GGML_SORT_ORDER_ASC ?
-                            x_row[dst_row[col]] > x_row[dst_row[ixj]] :
-                            x_row[dst_row[col]] < x_row[dst_row[ixj]]))
-                    ) {
-                        ggml_cuda_swap(dst_row[col], dst_row[ixj]);
-                    }
-                } else {
-                    if (dst_row[ixj] >= ncols ||
-                        (dst_row[col] < ncols && (order == GGML_SORT_ORDER_ASC ?
-                            x_row[dst_row[col]] < x_row[dst_row[ixj]] :
-                            x_row[dst_row[col]] > x_row[dst_row[ixj]]))
-                    ) {
-                        ggml_cuda_swap(dst_row[col], dst_row[ixj]);
-                    }
-                }
-            }
-            __syncthreads();
-        }
-    }
-
-    // copy the result to dst without the padding
-    if (col < ncols) {
-        dst[row * ncols + col] = dst_row[col];
-    }
-}
-
-static int next_power_of_2(int x) {
-    int n = 1;
-    while (n < x) {
-        n *= 2;
-    }
-    return n;
-}
-
-static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) {
-    // bitonic sort requires ncols to be power of 2
-    const int ncols_pad = next_power_of_2(ncols);
-
-    const dim3 block_dims(ncols_pad, 1, 1);
-    const dim3 block_nums(1, nrows, 1);
-    const size_t shared_mem = ncols_pad * sizeof(int);
-
-    // FIXME: this limit could be raised by ~2-4x on Ampere or newer
-    GGML_ASSERT(shared_mem <= ggml_cuda_info().devices[ggml_cuda_get_device()].smpb);
-
-    if (order == GGML_SORT_ORDER_ASC) {
-        k_argsort_f32_i32<GGML_SORT_ORDER_ASC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
-    } else if (order == GGML_SORT_ORDER_DESC) {
-        k_argsort_f32_i32<GGML_SORT_ORDER_DESC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
-    } else {
-        GGML_ASSERT(false);
-    }
-}
-
-void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const float * src0_d = (const float *)src0->data;
-    float * dst_d = (float *)dst->data;
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_I32);
-    GGML_ASSERT(ggml_is_contiguous(src0));
-
-    const int64_t ncols = src0->ne[0];
-    const int64_t nrows = ggml_nrows(src0);
-
-    enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
-
-    argsort_f32_i32_cuda(src0_d, (int *)dst_d, ncols, nrows, order, stream);
-}
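The padding step is the key detail in this file: a bitonic network compares elements at XOR-distance j, which only tiles cleanly when the row length is a power of two, so argsort_f32_i32_cuda pads ncols up and the comparison logic makes padded indices (>= ncols) lose every comparison, sinking them past the real data. A small C++ illustration of the padding cost; the sample sizes are made up:

#include <cstdio>

// Same rule as the removed argsort_f32_i32_cuda helper.
static int next_power_of_2(int x) {
    int n = 1;
    while (n < x) {
        n *= 2;
    }
    return n;
}

int main() {
    // ncols = 5000 pads to 8192, so each row needs 8192 * sizeof(int)
    // = 32 KiB of shared memory -- the quantity checked against smpb.
    printf("%d -> %d\n", 5000, next_power_of_2(5000)); // 5000 -> 8192
    printf("%d -> %d\n", 4096, next_power_of_2(4096)); // 4096 -> 4096 (no waste)
}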
data/vendor/tmp/llama.cpp/ggml-cuda/binbcast.cu
@@ -1,280 +0,0 @@
-#include "binbcast.cuh"
-
-static __device__ __forceinline__ float op_repeat(const float a, const float b) {
-    return b;
-    GGML_UNUSED(a);
-}
-
-static __device__ __forceinline__ float op_add(const float a, const float b) {
-    return a + b;
-}
-
-static __device__ __forceinline__ float op_mul(const float a, const float b) {
-    return a * b;
-}
-
-static __device__ __forceinline__ float op_div(const float a, const float b) {
-    return a / b;
-}
-
-template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
-static __global__ void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
-        int ne0, int ne1, int ne2, int ne3,
-        int ne10, int ne11, int ne12, int ne13,
-        /*int s0, */ int s1, int s2, int s3,
-        /*int s00,*/ int s01, int s02, int s03,
-        /*int s10,*/ int s11, int s12, int s13) {
-    const int i0s = blockDim.x*blockIdx.x + threadIdx.x;
-    const int i1 = (blockDim.y*blockIdx.y + threadIdx.y);
-    const int i2 = (blockDim.z*blockIdx.z + threadIdx.z) / ne3;
-    const int i3 = (blockDim.z*blockIdx.z + threadIdx.z) % ne3;
-
-    if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
-        return;
-    }
-
-    const int i11 = i1 % ne11;
-    const int i12 = i2 % ne12;
-    const int i13 = i3 % ne13;
-
-    const size_t i_src0 = i3*s03 + i2*s02 + i1*s01;
-    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
-    const size_t i_dst = i3*s3 + i2*s2 + i1*s1;
-
-    const src0_t * src0_row = src0 + i_src0;
-    const src1_t * src1_row = src1 + i_src1;
-    dst_t * dst_row = dst + i_dst;
-
-    for (int i0 = i0s; i0 < ne0; i0 += blockDim.x*gridDim.x) {
-        const int i10 = i0 % ne10;
-        dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
-    }
-}
-
-template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
-static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst,
-        int ne0, int ne1, int ne2, int ne3,
-        int ne10, int ne11, int ne12, int ne13,
-        /*int s0, */ int s1, int s2, int s3,
-        /*int s00,*/ int s01, int s02, int s03,
-        /*int s10,*/ int s11, int s12, int s13) {
-
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-
-    const int i3 = i/(ne2*ne1*ne0);
-    const int i2 = (i/(ne1*ne0)) % ne2;
-    const int i1 = (i/ne0) % ne1;
-    const int i0 = i % ne0;
-
-    if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
-        return;
-    }
-
-    const int i11 = i1 % ne11;
-    const int i12 = i2 % ne12;
-    const int i13 = i3 % ne13;
-
-    const size_t i_src0 = i3*s03 + i2*s02 + i1*s01;
-    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
-    const size_t i_dst = i3*s3 + i2*s2 + i1*s1;
-
-    const src0_t * src0_row = src0 + i_src0;
-    const src1_t * src1_row = src1 + i_src1;
-    dst_t * dst_row = dst + i_dst;
-
-    const int i10 = i0 % ne10;
-    dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
-}
-
-template<float (*bin_op)(const float, const float)>
-struct bin_bcast_cuda {
-    template<typename src0_t, typename src1_t, typename dst_t>
-    void operator()(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst,
-            const src0_t * src0_dd, const src1_t * src1_dd, dst_t * dst_dd,
-            cudaStream_t stream) {
-
-        GGML_TENSOR_BINARY_OP_LOCALS
-
-        int nr0 = ne10/ne0;
-        int nr1 = ne11/ne1;
-        int nr2 = ne12/ne2;
-        int nr3 = ne13/ne3;
-
-        int nr[4] = { nr0, nr1, nr2, nr3 };
-
-        // collapse dimensions until first broadcast dimension
-        int64_t cne[] = {ne0, ne1, ne2, ne3};
-        int64_t cne0[] = {ne00, ne01, ne02, ne03};
-        int64_t cne1[] = {ne10, ne11, ne12, ne13};
-
-        size_t cnb[] = {nb0, nb1, nb2, nb3};
-        size_t cnb0[] = {nb00, nb01, nb02, nb03};
-        size_t cnb1[] = {nb10, nb11, nb12, nb13};
-
-        auto collapse = [](int64_t cne[]) {
-            cne[0] *= cne[1];
-            cne[1] = cne[2];
-            cne[2] = cne[3];
-            cne[3] = 1;
-        };
-
-        auto collapse_nb = [](size_t cnb[], const int64_t cne[]) {
-            cnb[1] *= cne[1];
-            cnb[2] *= cne[2];
-            cnb[3] *= cne[3];
-        };
-
-        if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ggml_is_contiguous(dst)) {
-            for (int i = 0; i < 4; i++) {
-                if (nr[i] != 1) {
-                    break;
-                }
-                if (i > 0) {
-                    collapse_nb(cnb, cne);
-                    collapse_nb(cnb0, cne0);
-                    collapse_nb(cnb1, cne1);
-                    collapse(cne);
-                    collapse(cne0);
-                    collapse(cne1);
-                }
-            }
-        }
-
-        {
-            int64_t ne0 = cne[0];
-            int64_t ne1 = cne[1];
-            int64_t ne2 = cne[2];
-            int64_t ne3 = cne[3];
-
-            //int64_t ne00 = cne0[0]; GGML_UNUSED(ne00);
-            //int64_t ne01 = cne0[1]; GGML_UNUSED(ne01);
-            //int64_t ne02 = cne0[2]; GGML_UNUSED(ne02);
-            //int64_t ne03 = cne0[3]; GGML_UNUSED(ne03);
-
-            int64_t ne10 = cne1[0];
-            int64_t ne11 = cne1[1];
-            int64_t ne12 = cne1[2];
-            int64_t ne13 = cne1[3];
-
-            size_t nb0 = cnb[0];
-            size_t nb1 = cnb[1];
-            size_t nb2 = cnb[2];
-            size_t nb3 = cnb[3];
-
-            size_t nb00 = cnb0[0];
-            size_t nb01 = cnb0[1];
-            size_t nb02 = cnb0[2];
-            size_t nb03 = cnb0[3];
-
-            size_t nb10 = cnb1[0];
-            size_t nb11 = cnb1[1];
-            size_t nb12 = cnb1[2];
-            size_t nb13 = cnb1[3];
-
-            size_t s0 = nb0 / sizeof(dst_t);
-            size_t s1 = nb1 / sizeof(dst_t);
-            size_t s2 = nb2 / sizeof(dst_t);
-            size_t s3 = nb3 / sizeof(dst_t);
-
-            size_t s10 = nb10 / sizeof(src1_t);
-            size_t s11 = nb11 / sizeof(src1_t);
-            size_t s12 = nb12 / sizeof(src1_t);
-            size_t s13 = nb13 / sizeof(src1_t);
-
-            size_t s00 = nb00 / sizeof(src0_t);
-            size_t s01 = nb01 / sizeof(src0_t);
-            size_t s02 = nb02 / sizeof(src0_t);
-            size_t s03 = nb03 / sizeof(src0_t);
-
-            GGML_ASSERT(nb0 % sizeof(dst_t) == 0);
-            GGML_ASSERT(nb1 % sizeof(dst_t) == 0);
-            GGML_ASSERT(nb2 % sizeof(dst_t) == 0);
-            GGML_ASSERT(nb3 % sizeof(dst_t) == 0);
-
-            GGML_ASSERT(nb00 % sizeof(src0_t) == 0);
-            GGML_ASSERT(nb01 % sizeof(src0_t) == 0);
-            GGML_ASSERT(nb02 % sizeof(src0_t) == 0);
-            GGML_ASSERT(nb03 % sizeof(src0_t) == 0);
-
-            GGML_ASSERT(nb10 % sizeof(src1_t) == 0);
-            GGML_ASSERT(nb11 % sizeof(src1_t) == 0);
-            GGML_ASSERT(nb12 % sizeof(src1_t) == 0);
-            GGML_ASSERT(nb13 % sizeof(src1_t) == 0);
-
-            GGML_ASSERT(s0 == 1);
-            GGML_ASSERT(s00 == 1);
-            GGML_ASSERT(s10 == 1);
-
-            const int block_size = 128;
-
-            int64_t hne0 = std::max(ne0/2LL, 1LL);
-
-            dim3 block_dims;
-            block_dims.x = std::min<unsigned int>(hne0, block_size);
-            block_dims.y = std::min<unsigned int>(ne1, block_size / block_dims.x);
-            block_dims.z = std::min(std::min<unsigned int>(ne2*ne3, block_size / block_dims.x / block_dims.y), 64U);
-
-            dim3 block_nums(
-                (hne0 + block_dims.x - 1) / block_dims.x,
-                (ne1 + block_dims.y - 1) / block_dims.y,
-                (ne2*ne3 + block_dims.z - 1) / block_dims.z
-            );
-
-            if (block_nums.z > 65535) {
-                // this is the maximum number of blocks in z dimension, fallback to 1D grid kernel
-                int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size;
-                k_bin_bcast_unravel<bin_op><<<block_num, block_size, 0, stream>>>(
-                    src0_dd, src1_dd, dst_dd,
-                    ne0, ne1, ne2, ne3,
-                    ne10, ne11, ne12, ne13,
-                    /* s0, */ s1, s2, s3,
-                    /* s00, */ s01, s02, s03,
-                    /* s10, */ s11, s12, s13);
-            } else {
-                k_bin_bcast<bin_op><<<block_nums, block_dims, 0, stream>>>(
-                    src0_dd, src1_dd, dst_dd,
-                    ne0, ne1, ne2, ne3,
-                    ne10, ne11, ne12, ne13,
-                    /* s0, */ s1, s2, s3,
-                    /* s00, */ s01, s02, s03,
-                    /* s10, */ s11, s12, s13);
-            }
-        }
-    }
-};
-
-template<class op>
-static void ggml_cuda_op_bin_bcast(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const void * src0_dd, const void * src1_dd, void * dst_dd, cudaStream_t stream) {
-
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
-    if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-        op()(src0, src1, dst, (const float *)src0_dd, (const float *)src1_dd, (float *)dst_dd, stream);
-    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
-        op()(src0, src1, dst, (const half *) src0_dd, (const float *)src1_dd, (half *) dst_dd, stream);
-    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
-        op()(src0, src1, dst, (const half *) src0_dd, (const float *)src1_dd, (float *)dst_dd, stream);
-    } else {
-        fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
-            ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
-        GGML_ASSERT(false);
-    }
-}
-
-void ggml_cuda_op_repeat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_repeat>>(dst, dst->src[0], dst, nullptr, dst->src[0]->data, dst->data, ctx.stream());
-}
-
-void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_add>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
-}
-
-void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_mul>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
-}
-
-void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_div>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
-}
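The broadcast rule in k_bin_bcast is NumPy-style repetition: each destination coordinate is reduced modulo the corresponding src1 extent, so a src1 with extent 1 in some dimension is reused across all of dst. A host-side C++ sketch of that modulo indexing, with made-up shapes:

#include <cstdio>
#include <vector>

// Host-side sketch of the broadcast rule used by k_bin_bcast: each dst
// coordinate is reduced modulo src1's extent, so a 4x1 src1 is repeated
// across every row of a 4x3 dst. Shapes are illustrative.
int main() {
    const int ne0 = 4, ne1 = 3;   // dst/src0: 4 x 3
    const int ne10 = 4, ne11 = 1; // src1: 4 x 1, broadcast along dim 1
    std::vector<float> a(ne0 * ne1, 1.0f), b(ne10 * ne11), c(ne0 * ne1);
    for (int i = 0; i < ne10; ++i) b[i] = float(i);

    for (int i1 = 0; i1 < ne1; ++i1) {
        const int i11 = i1 % ne11; // the modulo that implements broadcasting
        for (int i0 = 0; i0 < ne0; ++i0) {
            const int i10 = i0 % ne10;
            c[i1 * ne0 + i0] = a[i1 * ne0 + i0] + b[i11 * ne10 + i10];
        }
    }
    for (float v : c) printf("%g ", v); // each row prints 1 2 3 4
}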
data/vendor/tmp/llama.cpp/ggml-cuda/clamp.cu
@@ -1,34 +0,0 @@
-#include "clamp.cuh"
-
-static __global__ void clamp_f32(const float * x, float * dst, const float min, const float max, const int k) {
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (i >= k) {
-        return;
-    }
-
-    dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
-}
-
-static void clamp_f32_cuda(const float * x, float * dst, const float min, const float max, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_CLAMP_BLOCK_SIZE - 1) / CUDA_CLAMP_BLOCK_SIZE;
-    clamp_f32<<<num_blocks, CUDA_CLAMP_BLOCK_SIZE, 0, stream>>>(x, dst, min, max, k);
-}
-
-
-void ggml_cuda_op_clamp(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const float * src0_d = (const float *)src0->data;
-    float * dst_d = (float *)dst->data;
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    float min;
-    float max;
-    memcpy(&min, dst->op_params, sizeof(float));
-    memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
-
-    clamp_f32_cuda(src0_d, dst_d, min, max, ggml_nelements(src0), stream);
-}
data/vendor/tmp/llama.cpp/ggml-cuda/concat.cu
@@ -1,196 +0,0 @@
-#include "concat.cuh"
-
-// contiguous kernels
-static __global__ void concat_f32_dim0(const float * x, const float * y, float * dst, const int ne0, const int ne00) {
-    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
-    if (nidx >= ne0) {
-        return;
-    }
-
-    int offset_dst =
-        nidx +
-        blockIdx.y * ne0 +
-        blockIdx.z * ne0 * gridDim.y;
-
-    if (nidx < ne00) { // src0
-        int offset_src =
-            nidx +
-            blockIdx.y * ne00 +
-            blockIdx.z * ne00 * gridDim.y;
-        dst[offset_dst] = x[offset_src];
-    } else {
-        int offset_src =
-            (nidx - ne00) +
-            blockIdx.y * (ne0 - ne00) +
-            blockIdx.z * (ne0 - ne00) * gridDim.y;
-        dst[offset_dst] = y[offset_src];
-    }
-}
-
-static __global__ void concat_f32_dim1(const float * x, const float * y, float * dst, const int ne0, const int ne01) {
-    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
-    if (nidx >= ne0) {
-        return;
-    }
-
-    int offset_dst =
-        nidx +
-        blockIdx.y * ne0 +
-        blockIdx.z * ne0 * gridDim.y;
-
-    if (blockIdx.y < ne01) { // src0
-        int offset_src =
-            nidx +
-            blockIdx.y * ne0 +
-            blockIdx.z * ne0 * ne01;
-        dst[offset_dst] = x[offset_src];
-    } else {
-        int offset_src =
-            nidx +
-            (blockIdx.y - ne01) * ne0 +
-            blockIdx.z * ne0 * (gridDim.y - ne01);
-        dst[offset_dst] = y[offset_src];
-    }
-}
-
-static __global__ void concat_f32_dim2(const float * x, const float * y, float * dst, const int ne0, const int ne02) {
-    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
-    if (nidx >= ne0) {
-        return;
-    }
-
-    int offset_dst =
-        nidx +
-        blockIdx.y * ne0 +
-        blockIdx.z * ne0 * gridDim.y;
-
-    if (blockIdx.z < ne02) { // src0
-        int offset_src =
-            nidx +
-            blockIdx.y * ne0 +
-            blockIdx.z * ne0 * gridDim.y;
-        dst[offset_dst] = x[offset_src];
-    } else {
-        int offset_src =
-            nidx +
-            blockIdx.y * ne0 +
-            (blockIdx.z - ne02) * ne0 * gridDim.y;
-        dst[offset_dst] = y[offset_src];
-    }
-}
-
-static void concat_f32_cuda(const float * x, const float * y, float * dst, int ne00, int ne01, int ne02, int ne0, int ne1, int ne2, int dim, cudaStream_t stream) {
-    int num_blocks = (ne0 + CUDA_CONCAT_BLOCK_SIZE - 1) / CUDA_CONCAT_BLOCK_SIZE;
-    dim3 gridDim(num_blocks, ne1, ne2);
-    if (dim == 0) {
-        concat_f32_dim0<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne00);
-        return;
-    }
-    if (dim == 1) {
-        concat_f32_dim1<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne01);
-        return;
-    }
-    concat_f32_dim2<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne02);
-}
-
-// non-contiguous kernel (slow)
-static __global__ void concat_f32_non_cont(
-        const char * src0,
-        const char * src1,
-        char * dst,
-        int64_t ne00,
-        int64_t ne01,
-        int64_t ne02,
-        int64_t ne03,
-        uint64_t nb00,
-        uint64_t nb01,
-        uint64_t nb02,
-        uint64_t nb03,
-        int64_t /*ne10*/,
-        int64_t /*ne11*/,
-        int64_t /*ne12*/,
-        int64_t /*ne13*/,
-        uint64_t nb10,
-        uint64_t nb11,
-        uint64_t nb12,
-        uint64_t nb13,
-        int64_t ne0,
-        int64_t /*ne1*/,
-        int64_t /*ne2*/,
-        int64_t /*ne3*/,
-        uint64_t nb0,
-        uint64_t nb1,
-        uint64_t nb2,
-        uint64_t nb3,
-        int32_t dim) {
-    const int64_t i3 = blockIdx.z;
-    const int64_t i2 = blockIdx.y;
-    const int64_t i1 = blockIdx.x;
-
-    int64_t o[4] = {0, 0, 0, 0};
-    o[dim] = dim == 0 ? ne00 : (dim == 1 ? ne01 : (dim == 2 ? ne02 : ne03));
-
-    const float * x;
-
-    for (int i0 = threadIdx.x; i0 < ne0; i0 += blockDim.x) {
-        if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
-            x = (const float *)(src0 + (i3 )*nb03 + (i2 )*nb02 + (i1 )*nb01 + (i0 )*nb00);
-        } else {
-            x = (const float *)(src1 + (i3 - o[3])*nb13 + (i2 - o[2])*nb12 + (i1 - o[1])*nb11 + (i0 - o[0])*nb10);
-        }
-
-        float * y = (float *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
-
-        *y = *x;
-    }
-}
-
-
-void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const ggml_tensor * src1 = dst->src[1];
-
-    cudaStream_t stream = ctx.stream();
-
-    const int32_t dim = ((int32_t *) dst->op_params)[0];
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-
-    if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
-        const float * src0_d = (const float *)src0->data;
-        const float * src1_d = (const float *)src1->data;
-
-        float * dst_d = (float *)dst->data;
-
-        if (dim != 3) {
-            for (int i3 = 0; i3 < dst->ne[3]; i3++) {
-                concat_f32_cuda(
-                    src0_d + i3 * (src0->nb[3] / 4),
-                    src1_d + i3 * (src1->nb[3] / 4),
-                    dst_d + i3 * ( dst->nb[3] / 4),
-                    src0->ne[0], src0->ne[1], src0->ne[2],
-                    dst->ne[0], dst->ne[1], dst->ne[2], dim, stream);
-            }
-        } else {
-            const size_t size0 = ggml_nbytes(src0);
-            const size_t size1 = ggml_nbytes(src1);
-
-            CUDA_CHECK(cudaMemcpyAsync(dst_d, src0_d, size0, cudaMemcpyDeviceToDevice, stream));
-            CUDA_CHECK(cudaMemcpyAsync(dst_d + size0/4, src1_d, size1, cudaMemcpyDeviceToDevice, stream));
-        }
-    } else {
-        dim3 grid_dim(dst->ne[1], dst->ne[2], dst->ne[3]);
-        concat_f32_non_cont<<<grid_dim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(
-            (const char *)src0->data,
-            (const char *)src1->data,
-            ( char *)dst->data,
-            src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
-            src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
-            src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
-            src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3],
-            dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
-            dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3], dim);
-    }
-}
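One detail worth noting in this removed file: for contiguous inputs, concatenation along the outermost dimension (dim == 3) degenerates into two back-to-back copies, which is why ggml_cuda_op_concat uses plain cudaMemcpyAsync there instead of a kernel. A host-side C++ analogue of that fast path; the toy sizes are illustrative:

#include <cstdio>
#include <cstring>
#include <vector>

// Host analogue of the dim == 3 fast path above: for contiguous tensors,
// concatenation along the outermost dimension is just two consecutive
// copies into dst (the removed code uses cudaMemcpyAsync instead).
int main() {
    std::vector<float> src0 = {1, 2, 3, 4};
    std::vector<float> src1 = {5, 6};
    std::vector<float> dst(src0.size() + src1.size());

    const size_t size0 = src0.size() * sizeof(float); // ggml_nbytes(src0)
    std::memcpy(dst.data(), src0.data(), size0);
    // dst.data() + size0/4 mirrors the dst_d + size0/4 pointer math:
    // a float pointer advanced by size0 bytes.
    std::memcpy(dst.data() + size0 / 4, src1.data(), src1.size() * sizeof(float));

    for (float v : dst) printf("%g ", v); // 1 2 3 4 5 6
}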