llama_cpp 0.15.4 → 0.16.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (147):
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +10 -0
  3. data/ext/llama_cpp/extconf.rb +1 -2
  4. data/ext/llama_cpp/llama_cpp.cpp +15 -3
  5. data/lib/llama_cpp/version.rb +2 -2
  6. data/sig/llama_cpp.rbs +13 -1
  7. data/vendor/tmp/llama.cpp/Makefile +62 -35
  8. data/vendor/tmp/llama.cpp/ggml-alloc.c +4 -4
  9. data/vendor/tmp/llama.cpp/ggml-backend.c +5 -5
  10. data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
  11. data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu +47 -0
  12. data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu +34 -0
  13. data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +103 -0
  14. data/vendor/tmp/llama.cpp/ggml-cuda/binbcast.cu +280 -0
  15. data/vendor/tmp/llama.cpp/ggml-cuda/clamp.cu +34 -0
  16. data/vendor/tmp/llama.cpp/ggml-cuda/concat.cu +196 -0
  17. data/vendor/tmp/llama.cpp/ggml-cuda/convert.cu +686 -0
  18. data/vendor/tmp/llama.cpp/ggml-cuda/cpy.cu +490 -0
  19. data/vendor/tmp/llama.cpp/ggml-cuda/diagmask.cu +40 -0
  20. data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +662 -0
  21. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +319 -0
  22. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f32.cu +312 -0
  23. data/vendor/tmp/llama.cpp/ggml-cuda/fattn.cu +345 -0
  24. data/vendor/tmp/llama.cpp/ggml-cuda/getrows.cu +178 -0
  25. data/vendor/tmp/llama.cpp/ggml-cuda/im2col.cu +104 -0
  26. data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +1564 -0
  27. data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +404 -0
  28. data/vendor/tmp/llama.cpp/ggml-cuda/norm.cu +221 -0
  29. data/vendor/tmp/llama.cpp/ggml-cuda/pad.cu +49 -0
  30. data/vendor/tmp/llama.cpp/ggml-cuda/pool2d.cu +94 -0
  31. data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +45 -0
  32. data/vendor/tmp/llama.cpp/ggml-cuda/rope.cu +271 -0
  33. data/vendor/tmp/llama.cpp/ggml-cuda/scale.cu +31 -0
  34. data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +205 -0
  35. data/vendor/tmp/llama.cpp/ggml-cuda/sumrows.cu +40 -0
  36. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +5 -0
  37. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +5 -0
  38. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +5 -0
  39. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +5 -0
  40. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +5 -0
  41. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +5 -0
  42. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +5 -0
  43. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +5 -0
  44. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +5 -0
  45. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +5 -0
  46. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +5 -0
  47. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +5 -0
  48. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +5 -0
  49. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +5 -0
  50. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +5 -0
  51. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +5 -0
  52. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +5 -0
  53. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +5 -0
  54. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +5 -0
  55. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +5 -0
  56. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +5 -0
  57. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +5 -0
  58. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +5 -0
  59. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +5 -0
  60. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +5 -0
  61. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +5 -0
  62. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +5 -0
  63. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +5 -0
  64. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +5 -0
  65. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +5 -0
  66. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +5 -0
  67. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +5 -0
  68. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +5 -0
  69. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +5 -0
  70. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +5 -0
  71. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +5 -0
  72. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +5 -0
  73. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +5 -0
  74. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +5 -0
  75. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +5 -0
  76. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +5 -0
  77. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +5 -0
  78. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +5 -0
  79. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +5 -0
  80. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +5 -0
  81. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +5 -0
  82. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +5 -0
  83. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +5 -0
  84. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +5 -0
  85. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +5 -0
  86. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +5 -0
  87. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +5 -0
  88. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +5 -0
  89. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +5 -0
  90. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +5 -0
  91. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +5 -0
  92. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +5 -0
  93. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +5 -0
  94. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +5 -0
  95. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +5 -0
  96. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +5 -0
  97. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +5 -0
  98. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +5 -0
  99. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +5 -0
  100. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +5 -0
  101. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +5 -0
  102. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +5 -0
  103. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +5 -0
  104. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +5 -0
  105. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +5 -0
  106. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +5 -0
  107. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +5 -0
  108. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +5 -0
  109. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +5 -0
  110. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +5 -0
  111. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +5 -0
  112. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +5 -0
  113. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +5 -0
  114. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +5 -0
  115. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +5 -0
  116. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +5 -0
  117. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +5 -0
  118. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +5 -0
  119. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +5 -0
  120. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +5 -0
  121. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +5 -0
  122. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +10 -0
  123. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +9 -0
  124. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +10 -0
  125. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +10 -0
  126. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +8 -0
  127. data/vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu +47 -0
  128. data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +266 -0
  129. data/vendor/tmp/llama.cpp/ggml-cuda/upscale.cu +51 -0
  130. data/vendor/tmp/llama.cpp/ggml-cuda.cu +8 -6
  131. data/vendor/tmp/llama.cpp/ggml-kompute.cpp +21 -6
  132. data/vendor/tmp/llama.cpp/ggml-metal.h +1 -1
  133. data/vendor/tmp/llama.cpp/ggml-metal.m +34 -24
  134. data/vendor/tmp/llama.cpp/ggml-metal.metal +83 -59
  135. data/vendor/tmp/llama.cpp/ggml-rpc.cpp +2 -2
  136. data/vendor/tmp/llama.cpp/ggml-sycl.cpp +7 -67
  137. data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +99301 -39793
  138. data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +456 -329
  139. data/vendor/tmp/llama.cpp/ggml.c +178 -330
  140. data/vendor/tmp/llama.cpp/ggml.h +9 -28
  141. data/vendor/tmp/llama.cpp/llama.cpp +242 -426
  142. data/vendor/tmp/llama.cpp/llama.h +17 -43
  143. metadata +121 -6
  144. data/vendor/tmp/llama.cpp/ggml-mpi.c +0 -216
  145. data/vendor/tmp/llama.cpp/ggml-mpi.h +0 -39
  146. data/vendor/tmp/llama.cpp/ggml-opencl.cpp +0 -2305
  147. data/vendor/tmp/llama.cpp/ggml-opencl.h +0 -36
@@ -0,0 +1,280 @@
1
+ #include "binbcast.cuh"
2
+
3
// "Repeat" element operator for the broadcast kernels: the first operand is
// ignored and the second operand is returned unchanged.
static __device__ __forceinline__ float op_repeat(const float a, const float b) {
    GGML_UNUSED(a); // only b contributes to the result
    return b;
}
7
+
8
// Addition element operator for the broadcast kernels: returns a + b.
static __device__ __forceinline__ float op_add(const float a, const float b) {
    const float sum = a + b;
    return sum;
}
11
+
12
// Multiplication element operator for the broadcast kernels: returns a * b.
static __device__ __forceinline__ float op_mul(const float a, const float b) {
    const float product = a * b;
    return product;
}
15
+
16
// Division element operator for the broadcast kernels: returns a / b.
// No special handling of b == 0 is performed here.
static __device__ __forceinline__ float op_div(const float a, const float b) {
    const float quotient = a / b;
    return quotient;
}
19
+
20
// Broadcasting element-wise kernel: dst[i] = bin_op(src0[i], src1[i mod src1 extents]).
//
// Thread mapping (3D launch):
//   x -> starting index along dim 0; each thread then advances by the total
//        number of threads along x until ne0 is reached
//   y -> i1
//   z -> combined index for dims 2 and 3: quotient by ne3 is i2, remainder is i3
//
// Parameters:
//   ne0..ne3    extents of dst
//   ne10..ne13  extents of src1; src1 indices are taken modulo these, which is
//               what makes src1 repeat along a dimension
//   s1..s3, s01..s03, s11..s13
//               strides of dst / src0 / src1 for dims 1..3, in elements of the
//               respective pointer type (they are added to typed pointers).
//               The dim-0 stride is not passed; dim 0 is indexed directly.
//   src0        may be null, in which case 0.0f is used as the first operand
template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
static __global__ void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
        int ne0, int ne1, int ne2, int ne3,
        int ne10, int ne11, int ne12, int ne13,
        /*int s0, */ int s1,  int s2,  int s3,
        /*int s00,*/ int s01, int s02, int s03,
        /*int s10,*/ int s11, int s12, int s13) {
    const int i0s = blockDim.x*blockIdx.x + threadIdx.x;
    const int i1  = (blockDim.y*blockIdx.y + threadIdx.y);
    const int i2  = (blockDim.z*blockIdx.z + threadIdx.z) / ne3;
    const int i3  = (blockDim.z*blockIdx.z + threadIdx.z) % ne3;

    if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
        return;
    }

    // wrap the src1 indices so that src1 is repeated along broadcast dimensions
    const int i11 = i1 % ne11;
    const int i12 = i2 % ne12;
    const int i13 = i3 % ne13;

    // Widen before multiplying: index*stride can exceed INT_MAX for large tensors,
    // and an int product would overflow before being converted to size_t.
    const size_t i_src0 = (size_t)i3*s03  + (size_t)i2*s02  + (size_t)i1*s01;
    const size_t i_src1 = (size_t)i13*s13 + (size_t)i12*s12 + (size_t)i11*s11;
    const size_t i_dst  = (size_t)i3*s3   + (size_t)i2*s2   + (size_t)i1*s1;

    const src0_t * src0_row = src0 + i_src0;
    const src1_t * src1_row = src1 + i_src1;
    dst_t * dst_row = dst + i_dst;

    for (int i0 = i0s; i0 < ne0; i0 += blockDim.x*gridDim.x) {
        const int i10 = i0 % ne10;
        dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
    }
}
53
+
54
// Broadcasting element-wise kernel for a 1D launch: each thread handles exactly
// one dst element, whose 4D coordinates are recovered from the flat thread index.
//
// Parameters have the same meaning as in k_bin_bcast:
//   ne0..ne3    extents of dst
//   ne10..ne13  extents of src1 (src1 indices are taken modulo these)
//   s*, s0*, s1*  strides of dst / src0 / src1 for dims 1..3, in elements
//   src0        may be null, in which case 0.0f is used as the first operand
//
// The flat index and the extent products are evaluated in 64 bits: with 32-bit
// arithmetic both ne2*ne1*ne0 and the index itself overflow once dst holds more
// than INT_MAX elements.
template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst,
        int ne0, int ne1, int ne2, int ne3,
        int ne10, int ne11, int ne12, int ne13,
        /*int s0, */ int s1,  int s2,  int s3,
        /*int s00,*/ int s01, int s02, int s03,
        /*int s10,*/ int s11, int s12, int s13) {

    const int64_t i = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;

    // unravel the flat index into (i3, i2, i1, i0), dim 0 varying fastest
    const int64_t i3 = i / ((int64_t)ne2*ne1*ne0);
    const int64_t i2 = (i / ((int64_t)ne1*ne0)) % ne2;
    const int64_t i1 = (i / ne0) % ne1;
    const int64_t i0 = i % ne0;

    // threads past the end of dst (tail of the last block) have i3 >= ne3
    if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
        return;
    }

    // wrap the src1 indices so that src1 is repeated along broadcast dimensions
    const int64_t i11 = i1 % ne11;
    const int64_t i12 = i2 % ne12;
    const int64_t i13 = i3 % ne13;

    // widen before multiplying so that index*stride cannot overflow int
    const size_t i_src0 = (size_t)i3*s03  + (size_t)i2*s02  + (size_t)i1*s01;
    const size_t i_src1 = (size_t)i13*s13 + (size_t)i12*s12 + (size_t)i11*s11;
    const size_t i_dst  = (size_t)i3*s3   + (size_t)i2*s2   + (size_t)i1*s1;

    const src0_t * src0_row = src0 + i_src0;
    const src1_t * src1_row = src1 + i_src1;
    dst_t * dst_row = dst + i_dst;

    const int64_t i10 = i0 % ne10;
    dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
}
88
+
89
// Host-side functor that launches the broadcasting binary kernels for the
// element operator bin_op.
//
// operator() takes the three tensors (for their extents and byte strides), the
// typed device pointers to use for them, and the stream to launch on. src0_dd
// is forwarded to the kernels as-is, so it may be null.
template<float (*bin_op)(const float, const float)>
struct bin_bcast_cuda {
    template<typename src0_t, typename src1_t, typename dst_t>
    void operator()(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst,
            const src0_t * src0_dd, const src1_t * src1_dd, dst_t * dst_dd,
            cudaStream_t stream) {

        // brings the ne* / nb* locals used below into scope
        GGML_TENSOR_BINARY_OP_LOCALS

        // per-dimension ratio of src1 extent to dst extent; a ratio of 1 is
        // treated below as "this dimension is not broadcast"
        const int nr[4] = { (int) (ne10/ne0), (int) (ne11/ne1), (int) (ne12/ne2), (int) (ne13/ne3) };

        // working copies of extents and byte strides; these get collapsed below
        int64_t cne [] = {ne0,  ne1,  ne2,  ne3 };
        int64_t cne0[] = {ne00, ne01, ne02, ne03};
        int64_t cne1[] = {ne10, ne11, ne12, ne13};

        size_t cnb [] = {nb0,  nb1,  nb2,  nb3 };
        size_t cnb0[] = {nb00, nb01, nb02, nb03};
        size_t cnb1[] = {nb10, nb11, nb12, nb13};

        // merge dim 1 into dim 0 and shift the remaining extents down
        auto fold_extents = [](int64_t ext[]) {
            ext[0] *= ext[1];
            ext[1]  = ext[2];
            ext[2]  = ext[3];
            ext[3]  = 1;
        };

        // stride update that accompanies fold_extents; must see the extents
        // as they were BEFORE the fold
        auto fold_strides = [](size_t strides[], const int64_t ext[]) {
            strides[1] *= ext[1];
            strides[2] *= ext[2];
            strides[3] *= ext[3];
        };

        // collapse dimensions until first broadcast dimension
        const bool all_contiguous = ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ggml_is_contiguous(dst);
        if (all_contiguous && nr[0] == 1) {
            for (int d = 1; d < 4 && nr[d] == 1; ++d) {
                fold_strides(cnb,  cne);
                fold_extents(cne);

                fold_strides(cnb0, cne0);
                fold_extents(cne0);

                fold_strides(cnb1, cne1);
                fold_extents(cne1);
            }
        }

        {
            // from here on the (possibly collapsed) values shadow the originals
            int64_t ne0 = cne[0];
            int64_t ne1 = cne[1];
            int64_t ne2 = cne[2];
            int64_t ne3 = cne[3];

            //int64_t ne00 = cne0[0]; GGML_UNUSED(ne00);
            //int64_t ne01 = cne0[1]; GGML_UNUSED(ne01);
            //int64_t ne02 = cne0[2]; GGML_UNUSED(ne02);
            //int64_t ne03 = cne0[3]; GGML_UNUSED(ne03);

            int64_t ne10 = cne1[0];
            int64_t ne11 = cne1[1];
            int64_t ne12 = cne1[2];
            int64_t ne13 = cne1[3];

            // byte strides
            size_t nb0  = cnb[0],  nb1  = cnb[1],  nb2  = cnb[2],  nb3  = cnb[3];
            size_t nb00 = cnb0[0], nb01 = cnb0[1], nb02 = cnb0[2], nb03 = cnb0[3];
            size_t nb10 = cnb1[0], nb11 = cnb1[1], nb12 = cnb1[2], nb13 = cnb1[3];

            // element strides, as expected by the kernels
            size_t s0  = nb0  / sizeof(dst_t);
            size_t s1  = nb1  / sizeof(dst_t);
            size_t s2  = nb2  / sizeof(dst_t);
            size_t s3  = nb3  / sizeof(dst_t);

            size_t s10 = nb10 / sizeof(src1_t);
            size_t s11 = nb11 / sizeof(src1_t);
            size_t s12 = nb12 / sizeof(src1_t);
            size_t s13 = nb13 / sizeof(src1_t);

            size_t s00 = nb00 / sizeof(src0_t);
            size_t s01 = nb01 / sizeof(src0_t);
            size_t s02 = nb02 / sizeof(src0_t);
            size_t s03 = nb03 / sizeof(src0_t);

            // every byte stride must be a whole number of elements
            GGML_ASSERT(nb0 % sizeof(dst_t) == 0);
            GGML_ASSERT(nb1 % sizeof(dst_t) == 0);
            GGML_ASSERT(nb2 % sizeof(dst_t) == 0);
            GGML_ASSERT(nb3 % sizeof(dst_t) == 0);

            GGML_ASSERT(nb00 % sizeof(src0_t) == 0);
            GGML_ASSERT(nb01 % sizeof(src0_t) == 0);
            GGML_ASSERT(nb02 % sizeof(src0_t) == 0);
            GGML_ASSERT(nb03 % sizeof(src0_t) == 0);

            GGML_ASSERT(nb10 % sizeof(src1_t) == 0);
            GGML_ASSERT(nb11 % sizeof(src1_t) == 0);
            GGML_ASSERT(nb12 % sizeof(src1_t) == 0);
            GGML_ASSERT(nb13 % sizeof(src1_t) == 0);

            // the kernels index dim 0 directly, so it has to be densely packed
            GGML_ASSERT(s0 == 1);
            GGML_ASSERT(s00 == 1);
            GGML_ASSERT(s10 == 1);

            const int block_size = 128;

            // launch half as many x threads as dim-0 elements (at least one)
            int64_t hne0 = std::max(ne0/2LL, 1LL);

            const unsigned int threads_x = std::min<unsigned int>(hne0, block_size);
            const unsigned int threads_y = std::min<unsigned int>(ne1, block_size / threads_x);
            const unsigned int threads_z = std::min(std::min<unsigned int>(ne2*ne3, block_size / threads_x / threads_y), 64U);

            const dim3 block_dims(threads_x, threads_y, threads_z);
            const dim3 block_nums(
                (hne0    + threads_x - 1) / threads_x,
                (ne1     + threads_y - 1) / threads_y,
                (ne2*ne3 + threads_z - 1) / threads_z
            );

            if (block_nums.z <= 65535) {
                k_bin_bcast<bin_op><<<block_nums, block_dims, 0, stream>>>(
                    src0_dd, src1_dd, dst_dd,
                    ne0, ne1, ne2, ne3,
                    ne10, ne11, ne12, ne13,
                    /* s0, */ s1, s2, s3,
                    /* s00, */ s01, s02, s03,
                    /* s10, */ s11, s12, s13);
            } else {
                // this is the maximum number of blocks in z dimension, fallback to 1D grid kernel
                int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size;
                k_bin_bcast_unravel<bin_op><<<block_num, block_size, 0, stream>>>(
                    src0_dd, src1_dd, dst_dd,
                    ne0, ne1, ne2, ne3,
                    ne10, ne11, ne12, ne13,
                    /* s0, */ s1, s2, s3,
                    /* s00, */ s01, s02, s03,
                    /* s10, */ s11, s12, s13);
            }
        }
    }
};
245
+
246
// Selects the typed instantiation of the launcher `op` from the tensor types.
//
// src1 must be F32. Supported (src0, dst) type pairs are (F32, F32),
// (F16, F16) and (F16, F32); any other combination prints a diagnostic to
// stderr and aborts via GGML_ASSERT.
template<class op>
static void ggml_cuda_op_bin_bcast(
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const void * src0_dd, const void * src1_dd, void * dst_dd, cudaStream_t stream) {

    GGML_ASSERT(src1->type == GGML_TYPE_F32);

    const bool src0_is_f32 = src0->type == GGML_TYPE_F32;
    const bool src0_is_f16 = src0->type == GGML_TYPE_F16;
    const bool dst_is_f32  = dst->type  == GGML_TYPE_F32;
    const bool dst_is_f16  = dst->type  == GGML_TYPE_F16;

    if (src0_is_f32 && dst_is_f32) {
        op()(src0, src1, dst, (const float *)src0_dd, (const float *)src1_dd, (float *)dst_dd, stream);
        return;
    }
    if (src0_is_f16 && dst_is_f16) {
        op()(src0, src1, dst, (const half *) src0_dd, (const float *)src1_dd, (half *) dst_dd, stream);
        return;
    }
    if (src0_is_f16 && dst_is_f32) {
        op()(src0, src1, dst, (const half *) src0_dd, (const float *)src1_dd, (float *)dst_dd, stream);
        return;
    }

    // no kernel instantiation for this combination of types
    fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
        ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
    GGML_ASSERT(false);
}
265
+
266
// REPEAT: fills dst by broadcasting dst->src[0] over it.
// dst itself is passed in the src0 tensor slot with a null src0 data pointer,
// and the tensor being repeated goes in the src1 slot.
void ggml_cuda_op_repeat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * repeated = dst->src[0];
    cudaStream_t stream = ctx.stream();

    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_repeat>>(
        dst, repeated, dst,
        nullptr, repeated->data, dst->data,
        stream);
}
269
+
270
// ADD: dst = src[0] + src[1], with src[1] broadcast over src[0].
void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * lhs = dst->src[0];
    const ggml_tensor * rhs = dst->src[1];
    cudaStream_t stream = ctx.stream();

    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_add>>(
        lhs, rhs, dst,
        lhs->data, rhs->data, dst->data,
        stream);
}
273
+
274
// MUL: dst = src[0] * src[1], with src[1] broadcast over src[0].
void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * lhs = dst->src[0];
    const ggml_tensor * rhs = dst->src[1];
    cudaStream_t stream = ctx.stream();

    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_mul>>(
        lhs, rhs, dst,
        lhs->data, rhs->data, dst->data,
        stream);
}
277
+
278
// DIV: dst = src[0] / src[1], with src[1] broadcast over src[0].
void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * lhs = dst->src[0];
    const ggml_tensor * rhs = dst->src[1];
    cudaStream_t stream = ctx.stream();

    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_div>>(
        lhs, rhs, dst,
        lhs->data, rhs->data, dst->data,
        stream);
}
@@ -0,0 +1,34 @@
1
+ #include "clamp.cuh"
2
+
3
// Clamps each of the first k elements of x into [min, max] and writes the
// result to dst. One thread per element (1D launch); threads with an index
// of k or more do nothing.
static __global__ void clamp_f32(const float * x, float * dst, const float min, const float max, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;

    if (i < k) {
        const float v = x[i];

        float clamped = v;
        if (v < min) {
            clamped = min;
        } else if (v > max) {
            clamped = max;
        }

        dst[i] = clamped;
    }
}
12
+
13
// Launches clamp_f32 on `stream` with enough blocks of CUDA_CLAMP_BLOCK_SIZE
// threads to cover all k elements.
static void clamp_f32_cuda(const float * x, float * dst, const float min, const float max, const int k, cudaStream_t stream) {
    const int threads_per_block = CUDA_CLAMP_BLOCK_SIZE;
    const int num_blocks        = (k + CUDA_CLAMP_BLOCK_SIZE - 1) / CUDA_CLAMP_BLOCK_SIZE; // ceil(k / block size)

    clamp_f32<<<num_blocks, threads_per_block, 0, stream>>>(x, dst, min, max, k);
}
17
+
18
+
19
// CLAMP: dst = clamp(src[0], lo, hi) element-wise on the context's stream.
// Both tensors must be F32. The two bounds are read as consecutive floats from
// the start of dst->op_params (lower bound first).
void ggml_cuda_op_clamp(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];

    const float * src0_d = (const float *)src0->data;
    float       * dst_d  = (float *)dst->data;

    cudaStream_t stream = ctx.stream();

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

    // bounds[0] = lower bound, bounds[1] = upper bound
    float bounds[2];
    memcpy(bounds, dst->op_params, sizeof(bounds));

    clamp_f32_cuda(src0_d, dst_d, bounds[0], bounds[1], ggml_nelements(src0), stream);
}
@@ -0,0 +1,196 @@
1
+ #include "concat.cuh"
2
+
3
+ // contiguous kernels
4
// Concatenation along dim 0 for densely packed float data.
//
// Launch layout: x covers the ne0 elements of a dst row, blockIdx.y selects the
// row and blockIdx.z the plane (gridDim.y is used as the number of rows per
// plane). The first ne00 elements of each dst row come from x, the remaining
// ne0 - ne00 elements from y.
static __global__ void concat_f32_dim0(const float * x, const float * y, float * dst, const int ne0, const int ne00) {
    const int nidx = threadIdx.x + blockIdx.x * blockDim.x;
    if (nidx >= ne0) {
        return;
    }

    // kept unsigned so the offset arithmetic matches the built-in index types
    const unsigned int row   = blockIdx.y;
    const unsigned int plane = blockIdx.z;
    const unsigned int rows  = gridDim.y;

    const int offset_dst = nidx + row * ne0 + plane * ne0 * rows;

    const float * src;
    int offset_src;
    if (nidx < ne00) { // src0
        src        = x;
        offset_src = nidx + row * ne00 + plane * ne00 * rows;
    } else {           // src1: rows are ne0 - ne00 elements wide
        src        = y;
        offset_src = (nidx - ne00) + row * (ne0 - ne00) + plane * (ne0 - ne00) * rows;
    }

    dst[offset_dst] = src[offset_src];
}
29
+
30
// Concatenation along dim 1 for densely packed float data.
//
// Launch layout: x covers the ne0 elements of a row, blockIdx.y selects the dst
// row and blockIdx.z the plane (gridDim.y is used as the number of dst rows per
// plane). Rows with index below ne01 are copied from x, the remaining
// gridDim.y - ne01 rows of each plane from y.
static __global__ void concat_f32_dim1(const float * x, const float * y, float * dst, const int ne0, const int ne01) {
    const int nidx = threadIdx.x + blockIdx.x * blockDim.x;
    if (nidx >= ne0) {
        return;
    }

    // kept unsigned so the offset arithmetic matches the built-in index types
    const unsigned int row   = blockIdx.y;
    const unsigned int plane = blockIdx.z;
    const unsigned int rows  = gridDim.y;

    const int offset_dst = nidx + row * ne0 + plane * ne0 * rows;

    const float * src;
    int offset_src;
    if (row < ne01) { // src0: planes hold ne01 rows
        src        = x;
        offset_src = nidx + row * ne0 + plane * ne0 * ne01;
    } else {          // src1: planes hold rows - ne01 rows
        src        = y;
        offset_src = nidx + (row - ne01) * ne0 + plane * ne0 * (rows - ne01);
    }

    dst[offset_dst] = src[offset_src];
}
55
+
56
// Concatenation along dim 2 for densely packed float data.
//
// Launch layout: x covers the ne0 elements of a row, blockIdx.y selects the row
// and blockIdx.z the dst plane (gridDim.y is used as the number of rows per
// plane). Planes with index below ne02 are copied from x, the later planes
// from y.
static __global__ void concat_f32_dim2(const float * x, const float * y, float * dst, const int ne0, const int ne02) {
    const int nidx = threadIdx.x + blockIdx.x * blockDim.x;
    if (nidx >= ne0) {
        return;
    }

    // kept unsigned so the offset arithmetic matches the built-in index types
    const unsigned int row   = blockIdx.y;
    const unsigned int plane = blockIdx.z;
    const unsigned int rows  = gridDim.y;

    const int offset_dst = nidx + row * ne0 + plane * ne0 * rows;

    const float * src;
    int offset_src;
    if (plane < ne02) { // src0: same layout as dst
        src        = x;
        offset_src = nidx + row * ne0 + plane * ne0 * rows;
    } else {            // src1: plane index restarts at 0
        src        = y;
        offset_src = nidx + row * ne0 + (plane - ne02) * ne0 * rows;
    }

    dst[offset_dst] = src[offset_src];
}
81
+
82
// Launches the contiguous concat kernel for `dim` on `stream`.
// The grid is (ceil(ne0 / CUDA_CONCAT_BLOCK_SIZE), ne1, ne2). dim 0 and dim 1
// have dedicated kernels; every other value of dim uses the dim-2 kernel.
static void concat_f32_cuda(const float * x, const float * y, float * dst, int ne00, int ne01, int ne02, int ne0, int ne1, int ne2, int dim, cudaStream_t stream) {
    const int blocks_x = (ne0 + CUDA_CONCAT_BLOCK_SIZE - 1) / CUDA_CONCAT_BLOCK_SIZE;
    const dim3 grid(blocks_x, ne1, ne2);

    switch (dim) {
        case 0:
            concat_f32_dim0<<<grid, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne00);
            break;
        case 1:
            concat_f32_dim1<<<grid, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne01);
            break;
        default:
            concat_f32_dim2<<<grid, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne02);
            break;
    }
}
95
+
96
+ // non-contiguous kernel (slow)
97
// Concatenation kernel that addresses all three tensors through their byte
// strides, so no particular memory layout is required.
//
// Launch layout: one block per dst row, with blockIdx.x = i1, blockIdx.y = i2,
// blockIdx.z = i3; the threads of the block step through the ne0 elements of
// that row in increments of blockDim.x.
//
// A dst element whose coordinates all lie inside src0's extents is read from
// src0. Every other element is read from src1, at the dst coordinates shifted
// back by src0's extent along `dim`.
// All nb* parameters are byte strides; the unnamed parameters are unused.
static __global__ void concat_f32_non_cont(
        const char * src0,
        const char * src1,
              char * dst,
           int64_t   ne00,
           int64_t   ne01,
           int64_t   ne02,
           int64_t   ne03,
          uint64_t   nb00,
          uint64_t   nb01,
          uint64_t   nb02,
          uint64_t   nb03,
           int64_t /*ne10*/,
           int64_t /*ne11*/,
           int64_t /*ne12*/,
           int64_t /*ne13*/,
          uint64_t   nb10,
          uint64_t   nb11,
          uint64_t   nb12,
          uint64_t   nb13,
           int64_t   ne0,
           int64_t /*ne1*/,
           int64_t /*ne2*/,
           int64_t /*ne3*/,
          uint64_t   nb0,
          uint64_t   nb1,
          uint64_t   nb2,
          uint64_t   nb3,
           int32_t   dim) {
    const int64_t i3 = blockIdx.z;
    const int64_t i2 = blockIdx.y;
    const int64_t i1 = blockIdx.x;

    // extent of src0 along the concatenation dimension
    int64_t src0_extent;
    switch (dim) {
        case 0:  src0_extent = ne00; break;
        case 1:  src0_extent = ne01; break;
        case 2:  src0_extent = ne02; break;
        default: src0_extent = ne03; break;
    }

    // per-dimension shift applied to dst coordinates when reading from src1
    int64_t o[4] = {0, 0, 0, 0};
    o[dim] = src0_extent;

    // the row-level part of the "inside src0" test is the same for every i0
    const bool row_in_src0 = i1 < ne01 && i2 < ne02 && i3 < ne03;

    for (int i0 = threadIdx.x; i0 < ne0; i0 += blockDim.x) {
        const float * in;
        if (i0 < ne00 && row_in_src0) {
            in = (const float *)(src0 + (i3       )*nb03 + (i2       )*nb02 + (i1       )*nb01 + (i0       )*nb00);
        } else {
            in = (const float *)(src1 + (i3 - o[3])*nb13 + (i2 - o[2])*nb12 + (i1 - o[1])*nb11 + (i0 - o[0])*nb10);
        }

        float * out = (float *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);

        *out = *in;
    }
}
147
+
148
+
149
// CONCAT: joins src[0] and src[1] along the dimension stored in
// dst->op_params[0]. All three tensors must be F32.
//
// Three strategies are used:
//   - a source is not contiguous: the stride-based kernel, one launch
//   - both contiguous, dim == 3:  two device-to-device copies, src0 then src1
//   - both contiguous, dim  < 3:  one contiguous-kernel launch per index along
//                                 dim 3 of dst
void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const ggml_tensor * src1 = dst->src[1];

    cudaStream_t stream = ctx.stream();

    const int32_t dim = ((int32_t *) dst->op_params)[0];

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type  == GGML_TYPE_F32);

    const bool sources_contiguous = ggml_is_contiguous(src0) && ggml_is_contiguous(src1);

    if (!sources_contiguous) {
        // one block per dst row
        dim3 grid_dim(dst->ne[1], dst->ne[2], dst->ne[3]);
        concat_f32_non_cont<<<grid_dim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(
                (const char *)src0->data,
                (const char *)src1->data,
                (      char *)dst->data,
                src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
                src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
                src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
                src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3],
                dst->ne[0],  dst->ne[1],  dst->ne[2],  dst->ne[3],
                dst->nb[0],  dst->nb[1],  dst->nb[2],  dst->nb[3], dim);
        return;
    }

    const float * src0_d = (const float *)src0->data;
    const float * src1_d = (const float *)src1->data;

    float * dst_d = (float *)dst->data;

    if (dim == 3) {
        // src1 is placed directly after src0 in dst
        const size_t size0 = ggml_nbytes(src0);
        const size_t size1 = ggml_nbytes(src1);

        CUDA_CHECK(cudaMemcpyAsync(dst_d,                       src0_d, size0, cudaMemcpyDeviceToDevice, stream));
        CUDA_CHECK(cudaMemcpyAsync(dst_d + size0/sizeof(float), src1_d, size1, cudaMemcpyDeviceToDevice, stream));
        return;
    }

    // nb[3] is a byte stride; the pointers are float *, hence the division
    const size_t src0_step = src0->nb[3] / sizeof(float);
    const size_t src1_step = src1->nb[3] / sizeof(float);
    const size_t dst_step  = dst->nb[3]  / sizeof(float);

    for (int slice = 0; slice < dst->ne[3]; slice++) {
        concat_f32_cuda(
                src0_d + slice * src0_step,
                src1_d + slice * src1_step,
                dst_d  + slice * dst_step,
                src0->ne[0], src0->ne[1], src0->ne[2],
                dst->ne[0],  dst->ne[1],  dst->ne[2], dim, stream);
    }
}