llama_cpp 0.16.2 → 0.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (177)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +18 -0
  3. data/README.md +7 -12
  4. data/ext/llama_cpp/extconf.rb +2 -43
  5. data/ext/llama_cpp/llama_cpp.cpp +8 -0
  6. data/lib/llama_cpp/version.rb +3 -3
  7. data/sig/llama_cpp.rbs +3 -0
  8. metadata +2 -171
  9. data/vendor/include/.gitkeep +0 -0
  10. data/vendor/lib/.gitkeep +0 -0
  11. data/vendor/tmp/llama.cpp/LICENSE +0 -21
  12. data/vendor/tmp/llama.cpp/Makefile +0 -1124
  13. data/vendor/tmp/llama.cpp/ggml-alloc.c +0 -1041
  14. data/vendor/tmp/llama.cpp/ggml-alloc.h +0 -76
  15. data/vendor/tmp/llama.cpp/ggml-backend-impl.h +0 -153
  16. data/vendor/tmp/llama.cpp/ggml-backend.c +0 -2225
  17. data/vendor/tmp/llama.cpp/ggml-backend.h +0 -236
  18. data/vendor/tmp/llama.cpp/ggml-blas.cpp +0 -363
  19. data/vendor/tmp/llama.cpp/ggml-blas.h +0 -23
  20. data/vendor/tmp/llama.cpp/ggml-common.h +0 -1805
  21. data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu +0 -47
  22. data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu +0 -34
  23. data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +0 -104
  24. data/vendor/tmp/llama.cpp/ggml-cuda/binbcast.cu +0 -280
  25. data/vendor/tmp/llama.cpp/ggml-cuda/clamp.cu +0 -34
  26. data/vendor/tmp/llama.cpp/ggml-cuda/concat.cu +0 -196
  27. data/vendor/tmp/llama.cpp/ggml-cuda/convert.cu +0 -686
  28. data/vendor/tmp/llama.cpp/ggml-cuda/cpy.cu +0 -490
  29. data/vendor/tmp/llama.cpp/ggml-cuda/diagmask.cu +0 -40
  30. data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +0 -674
  31. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +0 -319
  32. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f32.cu +0 -312
  33. data/vendor/tmp/llama.cpp/ggml-cuda/fattn.cu +0 -345
  34. data/vendor/tmp/llama.cpp/ggml-cuda/getrows.cu +0 -178
  35. data/vendor/tmp/llama.cpp/ggml-cuda/im2col.cu +0 -104
  36. data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +0 -88
  37. data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +0 -419
  38. data/vendor/tmp/llama.cpp/ggml-cuda/norm.cu +0 -221
  39. data/vendor/tmp/llama.cpp/ggml-cuda/pad.cu +0 -49
  40. data/vendor/tmp/llama.cpp/ggml-cuda/pool2d.cu +0 -94
  41. data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +0 -112
  42. data/vendor/tmp/llama.cpp/ggml-cuda/rope.cu +0 -271
  43. data/vendor/tmp/llama.cpp/ggml-cuda/scale.cu +0 -31
  44. data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +0 -206
  45. data/vendor/tmp/llama.cpp/ggml-cuda/sumrows.cu +0 -40
  46. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
  47. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
  48. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
  49. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
  50. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
  51. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
  52. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
  53. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
  54. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
  55. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
  56. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
  57. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
  58. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
  59. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
  60. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
  61. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
  62. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
  63. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
  64. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
  65. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
  66. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
  67. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
  68. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
  69. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
  70. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
  71. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
  72. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
  73. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
  74. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
  75. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
  76. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
  77. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
  78. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
  79. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
  80. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
  81. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
  82. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
  83. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
  84. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
  85. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
  86. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
  87. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
  88. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
  89. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
  90. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
  91. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
  92. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
  93. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
  94. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
  95. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
  96. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
  97. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
  98. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
  99. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
  100. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
  101. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
  102. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
  103. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
  104. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
  105. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
  106. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
  107. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
  108. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
  109. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
  110. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
  111. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
  112. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
  113. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
  114. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
  115. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
  116. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
  117. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
  118. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
  119. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
  120. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
  121. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
  122. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
  123. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
  124. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
  125. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
  126. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
  127. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
  128. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
  129. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
  130. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
  131. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
  132. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +0 -10
  133. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +0 -9
  134. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +0 -10
  135. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +0 -10
  136. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +0 -8
  137. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +0 -5
  138. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +0 -5
  139. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +0 -5
  140. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +0 -5
  141. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +0 -5
  142. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +0 -5
  143. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +0 -5
  144. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +0 -5
  145. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +0 -5
  146. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +0 -5
  147. data/vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu +0 -47
  148. data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +0 -314
  149. data/vendor/tmp/llama.cpp/ggml-cuda/upscale.cu +0 -51
  150. data/vendor/tmp/llama.cpp/ggml-cuda.cu +0 -3069
  151. data/vendor/tmp/llama.cpp/ggml-cuda.h +0 -44
  152. data/vendor/tmp/llama.cpp/ggml-impl.h +0 -651
  153. data/vendor/tmp/llama.cpp/ggml-kompute.cpp +0 -2038
  154. data/vendor/tmp/llama.cpp/ggml-kompute.h +0 -46
  155. data/vendor/tmp/llama.cpp/ggml-metal.h +0 -66
  156. data/vendor/tmp/llama.cpp/ggml-metal.m +0 -3273
  157. data/vendor/tmp/llama.cpp/ggml-metal.metal +0 -6540
  158. data/vendor/tmp/llama.cpp/ggml-quants.c +0 -14994
  159. data/vendor/tmp/llama.cpp/ggml-quants.h +0 -133
  160. data/vendor/tmp/llama.cpp/ggml-rpc.cpp +0 -1178
  161. data/vendor/tmp/llama.cpp/ggml-rpc.h +0 -24
  162. data/vendor/tmp/llama.cpp/ggml-sycl.cpp +0 -6351
  163. data/vendor/tmp/llama.cpp/ggml-sycl.h +0 -40
  164. data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +0 -144508
  165. data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +0 -7183
  166. data/vendor/tmp/llama.cpp/ggml-vulkan.h +0 -29
  167. data/vendor/tmp/llama.cpp/ggml.c +0 -22506
  168. data/vendor/tmp/llama.cpp/ggml.h +0 -2458
  169. data/vendor/tmp/llama.cpp/llama.cpp +0 -18985
  170. data/vendor/tmp/llama.cpp/llama.h +0 -1147
  171. data/vendor/tmp/llama.cpp/scripts/get-flags.mk +0 -38
  172. data/vendor/tmp/llama.cpp/sgemm.cpp +0 -1032
  173. data/vendor/tmp/llama.cpp/sgemm.h +0 -14
  174. data/vendor/tmp/llama.cpp/unicode-data.cpp +0 -7033
  175. data/vendor/tmp/llama.cpp/unicode-data.h +0 -20
  176. data/vendor/tmp/llama.cpp/unicode.cpp +0 -810
  177. data/vendor/tmp/llama.cpp/unicode.h +0 -63
@@ -1,47 +0,0 @@
1
- #include "tsembd.cuh"
2
-
3
- static __global__ void timestep_embedding_f32(const float * timesteps, float * dst, const int nb1, const int dim, const int max_period) {
4
- // blockIDx.y: idx of timesteps->ne[0]
5
- // blockIDx.x: idx of ((dim + 1) / 2) / BLOCK_SIZE
6
- int i = blockIdx.y;
7
- int j = threadIdx.x + blockIdx.x * blockDim.x;
8
- float * embed_data = (float *)((char *)dst + i*nb1);
9
-
10
- if (dim % 2 != 0 && j == ((dim + 1) / 2)) {
11
- embed_data[dim] = 0.f;
12
- }
13
-
14
- int half = dim / 2;
15
- if (j >= half) {
16
- return;
17
- }
18
-
19
- float timestep = timesteps[i];
20
- float freq = (float)expf(-logf(max_period) * j / half);
21
- float arg = timestep * freq;
22
- embed_data[j] = cosf(arg);
23
- embed_data[j + half] = sinf(arg);
24
- }
25
-
26
- static void timestep_embedding_f32_cuda(const float * x, float * dst, const int ne00, const int nb1,
27
- const int dim, const int max_period, cudaStream_t stream) {
28
- int half_ceil = (dim + 1) / 2;
29
- int num_blocks = (half_ceil + CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE - 1) / CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE;
30
- dim3 gridDim(num_blocks, ne00, 1);
31
- timestep_embedding_f32<<<gridDim, CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE, 0, stream>>>(x, dst, nb1, dim, max_period);
32
- }
33
-
34
- void ggml_cuda_op_timestep_embedding(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
35
- const ggml_tensor * src0 = dst->src[0];
36
- const float * src0_d = (const float *)src0->data;
37
- float * dst_d = (float *)dst->data;
38
- cudaStream_t stream = ctx.stream();
39
-
40
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
41
- GGML_ASSERT(dst->type == GGML_TYPE_F32);
42
-
43
- const int dim = dst->op_params[0];
44
- const int max_period = dst->op_params[1];
45
-
46
- timestep_embedding_f32_cuda(src0_d, dst_d, src0->ne[0], dst->nb[1], dim, max_period, stream);
47
- }
@@ -1,314 +0,0 @@
1
- #include "unary.cuh"
2
-
3
- static __global__ void gelu_f32(const float * x, float * dst, const int k) {
4
- const float GELU_COEF_A = 0.044715f;
5
- const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
6
- const int i = blockDim.x*blockIdx.x + threadIdx.x;
7
-
8
- if (i >= k) {
9
- return;
10
- }
11
-
12
- float xi = x[i];
13
- dst[i] = 0.5f*xi*(1.0f + tanhf(SQRT_2_OVER_PI*xi*(1.0f + GELU_COEF_A*xi*xi)));
14
- }
15
-
16
- static __global__ void gelu_quick_f32(const float * x, float * dst, int k) {
17
- const float GELU_QUICK_COEF = -1.702f;
18
- const int i = blockDim.x*blockIdx.x + threadIdx.x;
19
- if (i >= k) {
20
- return;
21
- }
22
- dst[i] = x[i] * (1.0f / (1.0f + expf(GELU_QUICK_COEF * x[i])));
23
- }
24
-
25
- static __global__ void silu_f32(const float * x, float * dst, const int k) {
26
- const int i = blockDim.x*blockIdx.x + threadIdx.x;
27
-
28
- if (i >= k) {
29
- return;
30
- }
31
- dst[i] = x[i] / (1.0f + expf(-x[i]));
32
- }
33
-
34
- static __global__ void tanh_f32(const float * x, float * dst, int k) {
35
- const int i = blockDim.x*blockIdx.x + threadIdx.x;
36
- if (i >= k) {
37
- return;
38
- }
39
- dst[i] = tanhf(x[i]);
40
- }
41
-
42
- static __global__ void relu_f32(const float * x, float * dst, const int k) {
43
- const int i = blockDim.x*blockIdx.x + threadIdx.x;
44
-
45
- if (i >= k) {
46
- return;
47
- }
48
- dst[i] = fmaxf(x[i], 0);
49
- }
50
-
51
- static __global__ void sigmoid_f32(const float * x, float * dst, const int k) {
52
- const int i = blockDim.x*blockIdx.x + threadIdx.x;
53
-
54
- if (i >= k) {
55
- return;
56
- }
57
- dst[i] = 1.0f / (1.0f + expf(-x[i]));
58
- }
59
-
60
- static __global__ void hardsigmoid_f32(const float * x, float * dst, const int k) {
61
- const int i = blockDim.x*blockIdx.x + threadIdx.x;
62
-
63
- if (i >= k) {
64
- return;
65
- }
66
- dst[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f));
67
- }
68
-
69
- static __global__ void hardswish_f32(const float * x, float * dst, const int k) {
70
- const int i = blockDim.x*blockIdx.x + threadIdx.x;
71
-
72
- if (i >= k) {
73
- return;
74
- }
75
- dst[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f));
76
- }
77
-
78
- static __global__ void leaky_relu_f32(const float * x, float * dst, const int k, const float negative_slope) {
79
- const int i = blockDim.x*blockIdx.x + threadIdx.x;
80
- if (i >= k) {
81
- return;
82
- }
83
- dst[i] = fmaxf(x[i], 0) + fminf(x[i], 0.0f) * negative_slope;
84
- }
85
-
86
- static __global__ void sqr_f32(const float * x, float * dst, const int k) {
87
- const int i = blockDim.x*blockIdx.x + threadIdx.x;
88
-
89
- if (i >= k) {
90
- return;
91
- }
92
- dst[i] = x[i] * x[i];
93
- }
94
-
95
- static __global__ void sqrt_f32(const float * x, float * dst, const int k) {
96
- const int i = blockDim.x*blockIdx.x + threadIdx.x;
97
-
98
- if (i >= k) {
99
- return;
100
- }
101
- dst[i] = sqrtf(x[i]);
102
- }
103
-
104
- static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
105
- const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
106
- gelu_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
107
- }
108
-
109
- static void gelu_quick_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
110
- const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
111
- gelu_quick_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
112
- }
113
-
114
- static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
115
- const int num_blocks = (k + CUDA_SILU_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE;
116
- silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
117
- }
118
-
119
- static void tanh_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
120
- const int num_blocks = (k + CUDA_TANH_BLOCK_SIZE - 1) / CUDA_TANH_BLOCK_SIZE;
121
- tanh_f32<<<num_blocks, CUDA_TANH_BLOCK_SIZE, 0, stream>>>(x, dst, k);
122
- }
123
-
124
- static void relu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
125
- const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
126
- relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
127
- }
128
-
129
- static void sigmoid_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
130
- const int num_blocks = (k + CUDA_SIGMOID_BLOCK_SIZE - 1) / CUDA_SIGMOID_BLOCK_SIZE;
131
- sigmoid_f32<<<num_blocks, CUDA_SIGMOID_BLOCK_SIZE, 0, stream>>>(x, dst, k);
132
- }
133
-
134
- static void hardsigmoid_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
135
- const int num_blocks = (k + CUDA_HARDSIGMOID_BLOCK_SIZE - 1) / CUDA_HARDSIGMOID_BLOCK_SIZE;
136
- hardsigmoid_f32<<<num_blocks, CUDA_HARDSIGMOID_BLOCK_SIZE, 0, stream>>>(x, dst, k);
137
- }
138
-
139
- static void hardswish_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
140
- const int num_blocks = (k + CUDA_HARDSWISH_BLOCK_SIZE - 1) / CUDA_HARDSWISH_BLOCK_SIZE;
141
- hardswish_f32<<<num_blocks, CUDA_HARDSWISH_BLOCK_SIZE, 0, stream>>>(x, dst, k);
142
- }
143
-
144
- static void leaky_relu_f32_cuda(const float * x, float * dst, const int k, const float negative_slope, cudaStream_t stream) {
145
- const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
146
- leaky_relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k, negative_slope);
147
- }
148
-
149
- static void sqr_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
150
- const int num_blocks = (k + CUDA_SQR_BLOCK_SIZE - 1) / CUDA_SQR_BLOCK_SIZE;
151
- sqr_f32<<<num_blocks, CUDA_SQR_BLOCK_SIZE, 0, stream>>>(x, dst, k);
152
- }
153
-
154
- static void sqrt_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
155
- const int num_blocks = (k + CUDA_SQRT_BLOCK_SIZE - 1) / CUDA_SQRT_BLOCK_SIZE;
156
- sqrt_f32<<<num_blocks, CUDA_SQRT_BLOCK_SIZE, 0, stream>>>(x, dst, k);
157
- }
158
-
159
- void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
160
- const ggml_tensor * src0 = dst->src[0];
161
- const float * src0_d = (const float *)src0->data;
162
- float * dst_d = (float *)dst->data;
163
- cudaStream_t stream = ctx.stream();
164
-
165
- GGML_ASSERT(ggml_is_contiguous(src0));
166
-
167
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
168
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
169
-
170
- gelu_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
171
- }
172
-
173
- void ggml_cuda_op_silu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
174
- const ggml_tensor * src0 = dst->src[0];
175
- const float * src0_d = (const float *)src0->data;
176
- float * dst_d = (float *)dst->data;
177
- cudaStream_t stream = ctx.stream();
178
-
179
- GGML_ASSERT(ggml_is_contiguous(src0));
180
-
181
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
182
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
183
-
184
- silu_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
185
- }
186
-
187
- void ggml_cuda_op_gelu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
188
- const ggml_tensor * src0 = dst->src[0];
189
- const float * src0_d = (const float *)src0->data;
190
- float * dst_d = (float *)dst->data;
191
- cudaStream_t stream = ctx.stream();
192
-
193
- GGML_ASSERT(ggml_is_contiguous(src0));
194
-
195
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
196
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
197
-
198
- gelu_quick_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
199
- }
200
-
201
- void ggml_cuda_op_tanh(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
202
- const ggml_tensor * src0 = dst->src[0];
203
- const float * src0_d = (const float *)src0->data;
204
- float * dst_d = (float *)dst->data;
205
- cudaStream_t stream = ctx.stream();
206
-
207
- GGML_ASSERT(ggml_is_contiguous(src0));
208
-
209
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
210
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
211
-
212
- tanh_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
213
- }
214
-
215
- void ggml_cuda_op_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
216
- const ggml_tensor * src0 = dst->src[0];
217
- const float * src0_d = (const float *)src0->data;
218
- float * dst_d = (float *)dst->data;
219
- cudaStream_t stream = ctx.stream();
220
-
221
- GGML_ASSERT(ggml_is_contiguous(src0));
222
-
223
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
224
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
225
-
226
- relu_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
227
- }
228
-
229
- void ggml_cuda_op_sigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
230
- const ggml_tensor * src0 = dst->src[0];
231
- const float * src0_d = (const float *)src0->data;
232
- float * dst_d = (float *)dst->data;
233
- cudaStream_t stream = ctx.stream();
234
-
235
- GGML_ASSERT(ggml_is_contiguous(src0));
236
-
237
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
238
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
239
-
240
- sigmoid_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
241
- }
242
-
243
- void ggml_cuda_op_hardsigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
244
- const ggml_tensor * src0 = dst->src[0];
245
- const float * src0_d = (const float *)src0->data;
246
- float * dst_d = (float *)dst->data;
247
- cudaStream_t stream = ctx.stream();
248
-
249
- GGML_ASSERT(ggml_is_contiguous(src0));
250
-
251
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
252
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
253
-
254
- hardsigmoid_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
255
- }
256
-
257
- void ggml_cuda_op_hardswish(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
258
- const ggml_tensor * src0 = dst->src[0];
259
- const float * src0_d = (const float *)src0->data;
260
- float * dst_d = (float *)dst->data;
261
- cudaStream_t stream = ctx.stream();
262
-
263
- GGML_ASSERT(ggml_is_contiguous(src0));
264
-
265
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
266
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
267
-
268
- hardswish_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
269
- }
270
-
271
- void ggml_cuda_op_leaky_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
272
- const ggml_tensor * src0 = dst->src[0];
273
- const float * src0_d = (const float *)src0->data;
274
- float * dst_d = (float *)dst->data;
275
- cudaStream_t stream = ctx.stream();
276
-
277
- GGML_ASSERT(ggml_is_contiguous(src0));
278
-
279
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
280
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
281
-
282
- float negative_slope;
283
- memcpy(&negative_slope, dst->op_params, sizeof(float));
284
-
285
- leaky_relu_f32_cuda(src0_d, dst_d, ggml_nelements(src0), negative_slope, stream);
286
- }
287
-
288
- void ggml_cuda_op_sqr(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
289
- const ggml_tensor * src0 = dst->src[0];
290
- const float * src0_d = (const float *)src0->data;
291
- float * dst_d = (float *)dst->data;
292
- cudaStream_t stream = ctx.stream();
293
-
294
- GGML_ASSERT(ggml_is_contiguous(src0));
295
-
296
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
297
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
298
-
299
- sqr_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
300
- }
301
-
302
- void ggml_cuda_op_sqrt(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
303
- const ggml_tensor * src0 = dst->src[0];
304
- const float * src0_d = (const float *)src0->data;
305
- float * dst_d = (float *)dst->data;
306
- cudaStream_t stream = ctx.stream();
307
-
308
- GGML_ASSERT(ggml_is_contiguous(src0));
309
-
310
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
311
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
312
-
313
- sqrt_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
314
- }
@@ -1,51 +0,0 @@
1
- #include "upscale.cuh"
2
-
3
- static __global__ void upscale_f32(const float * x, float * dst,
4
- const int nb00, const int nb01, const int nb02, const int nb03,
5
- const int ne10, const int ne11, const int ne12, const int ne13,
6
- const float sf0, const float sf1, const float sf2, const float sf3) {
7
- int index = threadIdx.x + blockIdx.x * blockDim.x;
8
- if (index >= ne10 * ne11 * ne12 * ne13) {
9
- return;
10
- }
11
-
12
- int i10 = index % ne10;
13
- int i11 = (index / ne10) % ne11;
14
- int i12 = (index / (ne10 * ne11)) % ne12;
15
- int i13 = (index / (ne10 * ne11 * ne12)) % ne13;
16
-
17
- int i00 = i10 / sf0;
18
- int i01 = i11 / sf1;
19
- int i02 = i12 / sf2;
20
- int i03 = i13 / sf3;
21
-
22
- dst[index] = *(float *)((char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00);
23
- }
24
-
25
- static void upscale_f32_cuda(const float * x, float * dst,
26
- const int nb00, const int nb01, const int nb02, const int nb03,
27
- const int ne10, const int ne11, const int ne12, const int ne13,
28
- const float sf0, const float sf1, const float sf2, const float sf3,
29
- cudaStream_t stream) {
30
- int dst_size = ne10 * ne11 * ne12 * ne13;
31
- int num_blocks = (dst_size + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE;
32
-
33
- upscale_f32<<<num_blocks, CUDA_UPSCALE_BLOCK_SIZE,0,stream>>>(x, dst, nb00, nb01, nb02, nb03, ne10, ne11, ne12, ne13, sf0, sf1, sf2, sf3);
34
- }
35
-
36
- void ggml_cuda_op_upscale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
37
- const ggml_tensor * src0 = dst->src[0];
38
- const float * src0_d = (const float *)src0->data;
39
- float * dst_d = (float *)dst->data;
40
- cudaStream_t stream = ctx.stream();
41
-
42
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
43
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
44
-
45
- const float sf0 = (float)dst->ne[0]/src0->ne[0];
46
- const float sf1 = (float)dst->ne[1]/src0->ne[1];
47
- const float sf2 = (float)dst->ne[2]/src0->ne[2];
48
- const float sf3 = (float)dst->ne[3]/src0->ne[3];
49
-
50
- upscale_f32_cuda(src0_d, dst_d, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3, stream);
51
- }