llama_cpp 0.15.4 → 0.16.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (147) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +10 -0
  3. data/ext/llama_cpp/extconf.rb +1 -2
  4. data/ext/llama_cpp/llama_cpp.cpp +15 -3
  5. data/lib/llama_cpp/version.rb +2 -2
  6. data/sig/llama_cpp.rbs +13 -1
  7. data/vendor/tmp/llama.cpp/Makefile +62 -35
  8. data/vendor/tmp/llama.cpp/ggml-alloc.c +4 -4
  9. data/vendor/tmp/llama.cpp/ggml-backend.c +5 -5
  10. data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
  11. data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu +47 -0
  12. data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu +34 -0
  13. data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +103 -0
  14. data/vendor/tmp/llama.cpp/ggml-cuda/binbcast.cu +280 -0
  15. data/vendor/tmp/llama.cpp/ggml-cuda/clamp.cu +34 -0
  16. data/vendor/tmp/llama.cpp/ggml-cuda/concat.cu +196 -0
  17. data/vendor/tmp/llama.cpp/ggml-cuda/convert.cu +686 -0
  18. data/vendor/tmp/llama.cpp/ggml-cuda/cpy.cu +490 -0
  19. data/vendor/tmp/llama.cpp/ggml-cuda/diagmask.cu +40 -0
  20. data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +662 -0
  21. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +319 -0
  22. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f32.cu +312 -0
  23. data/vendor/tmp/llama.cpp/ggml-cuda/fattn.cu +345 -0
  24. data/vendor/tmp/llama.cpp/ggml-cuda/getrows.cu +178 -0
  25. data/vendor/tmp/llama.cpp/ggml-cuda/im2col.cu +104 -0
  26. data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +1564 -0
  27. data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +404 -0
  28. data/vendor/tmp/llama.cpp/ggml-cuda/norm.cu +221 -0
  29. data/vendor/tmp/llama.cpp/ggml-cuda/pad.cu +49 -0
  30. data/vendor/tmp/llama.cpp/ggml-cuda/pool2d.cu +94 -0
  31. data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +45 -0
  32. data/vendor/tmp/llama.cpp/ggml-cuda/rope.cu +271 -0
  33. data/vendor/tmp/llama.cpp/ggml-cuda/scale.cu +31 -0
  34. data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +205 -0
  35. data/vendor/tmp/llama.cpp/ggml-cuda/sumrows.cu +40 -0
  36. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +5 -0
  37. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +5 -0
  38. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +5 -0
  39. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +5 -0
  40. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +5 -0
  41. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +5 -0
  42. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +5 -0
  43. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +5 -0
  44. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +5 -0
  45. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +5 -0
  46. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +5 -0
  47. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +5 -0
  48. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +5 -0
  49. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +5 -0
  50. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +5 -0
  51. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +5 -0
  52. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +5 -0
  53. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +5 -0
  54. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +5 -0
  55. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +5 -0
  56. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +5 -0
  57. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +5 -0
  58. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +5 -0
  59. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +5 -0
  60. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +5 -0
  61. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +5 -0
  62. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +5 -0
  63. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +5 -0
  64. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +5 -0
  65. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +5 -0
  66. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +5 -0
  67. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +5 -0
  68. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +5 -0
  69. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +5 -0
  70. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +5 -0
  71. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +5 -0
  72. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +5 -0
  73. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +5 -0
  74. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +5 -0
  75. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +5 -0
  76. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +5 -0
  77. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +5 -0
  78. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +5 -0
  79. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +5 -0
  80. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +5 -0
  81. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +5 -0
  82. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +5 -0
  83. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +5 -0
  84. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +5 -0
  85. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +5 -0
  86. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +5 -0
  87. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +5 -0
  88. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +5 -0
  89. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +5 -0
  90. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +5 -0
  91. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +5 -0
  92. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +5 -0
  93. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +5 -0
  94. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +5 -0
  95. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +5 -0
  96. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +5 -0
  97. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +5 -0
  98. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +5 -0
  99. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +5 -0
  100. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +5 -0
  101. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +5 -0
  102. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +5 -0
  103. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +5 -0
  104. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +5 -0
  105. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +5 -0
  106. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +5 -0
  107. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +5 -0
  108. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +5 -0
  109. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +5 -0
  110. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +5 -0
  111. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +5 -0
  112. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +5 -0
  113. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +5 -0
  114. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +5 -0
  115. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +5 -0
  116. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +5 -0
  117. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +5 -0
  118. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +5 -0
  119. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +5 -0
  120. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +5 -0
  121. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +5 -0
  122. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +10 -0
  123. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +9 -0
  124. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +10 -0
  125. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +10 -0
  126. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +8 -0
  127. data/vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu +47 -0
  128. data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +266 -0
  129. data/vendor/tmp/llama.cpp/ggml-cuda/upscale.cu +51 -0
  130. data/vendor/tmp/llama.cpp/ggml-cuda.cu +8 -6
  131. data/vendor/tmp/llama.cpp/ggml-kompute.cpp +21 -6
  132. data/vendor/tmp/llama.cpp/ggml-metal.h +1 -1
  133. data/vendor/tmp/llama.cpp/ggml-metal.m +34 -24
  134. data/vendor/tmp/llama.cpp/ggml-metal.metal +83 -59
  135. data/vendor/tmp/llama.cpp/ggml-rpc.cpp +2 -2
  136. data/vendor/tmp/llama.cpp/ggml-sycl.cpp +7 -67
  137. data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +99301 -39793
  138. data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +456 -329
  139. data/vendor/tmp/llama.cpp/ggml.c +178 -330
  140. data/vendor/tmp/llama.cpp/ggml.h +9 -28
  141. data/vendor/tmp/llama.cpp/llama.cpp +242 -426
  142. data/vendor/tmp/llama.cpp/llama.h +17 -43
  143. metadata +121 -6
  144. data/vendor/tmp/llama.cpp/ggml-mpi.c +0 -216
  145. data/vendor/tmp/llama.cpp/ggml-mpi.h +0 -39
  146. data/vendor/tmp/llama.cpp/ggml-opencl.cpp +0 -2305
  147. data/vendor/tmp/llama.cpp/ggml-opencl.h +0 -36
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 167132898a0cb63faaf4fd7583d9b988992ba7c5ec0f5602d5a158f04e0cdfa0
4
- data.tar.gz: 8a65658eb93b9cf80d5ede554b15968c495f045c32e57cc96ed732c56330d25f
3
+ metadata.gz: 5b79658bc49026edcbd896cac4a1d904060622f2311876afbdba773021399ad1
4
+ data.tar.gz: 064fa60e433863e6919f0c0acbd238cf5d5712058cb834a139a5e5cf798d095e
5
5
  SHA512:
6
- metadata.gz: 9625ac088c4d5c50cc51bbbcbc744cb7041766ccbb7a42a9cd1b80b29ebe64414d39875dea5d61a87025e239ad78be2a2ea4d3f85a187684321e409fc01a40fd
7
- data.tar.gz: 6f68445f10765a4eb1124ed1cfd2afb7544d146823efad27b2b6955bb0ee822ae8b0f9cccb68777c8cb211f665a0e2531eba04a4240399af1101a5dbcd645ae9
6
+ metadata.gz: 3248ba69cd0eefcc8b36bdcb03fe13a86da826f4a97a4c61bc62632c2f646647dfaac2b906dd2cb672740c30046e9f588d8e9687b6b8e4bc0a5fc03134d62ec5
7
+ data.tar.gz: 91164427363b01f805ae3be98a8f44d7aba0e7c437db7daa2b396bf3329398189613036ac4cb4f5d471194edb02485e32529ca1b9c140144332a0e34107d3666
data/CHANGELOG.md CHANGED
@@ -1,3 +1,13 @@
1
+ ## [[0.16.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.15.4...v0.16.0)] - 2024-06-08
2
+
3
+ **Breaking Changes**
4
+
5
+ - Bump llama.cpp from b3056 to b3091.
6
+ - Rename `type` method to `token_attr` in `Model`.
7
+ - Add constants for token attribute types.
8
+ - Remove `--with-clblast` and `--with-mpi` config options.
9
+ - Add `--with-no-openmp` config option.
10
+
1
11
  ## [[0.15.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.15.3...v0.15.4)] - 2024-06-01
2
12
 
3
13
  - Bump llama.cpp from b2988 to b3056.
data/ext/llama_cpp/extconf.rb CHANGED
@@ -17,10 +17,9 @@ make_envs << ' LLAMA_OPENBLAS=1' if with_config('openblas')
17
17
  make_envs << ' LLAMA_BLIS=1' if with_config('blis')
18
18
  make_envs << ' LLAMA_CUBLAS=1' if with_config('cublas') # Deprecated, use --with-cuda instead
19
19
  make_envs << ' LLAMA_CUDA=1' if with_config('cuda')
20
- make_envs << ' LLAMA_CLBLAST=1' if with_config('clblast')
21
20
  make_envs << ' LLAMA_HIPBLAS=1' if with_config('hipblas')
22
- make_envs << ' LLAMA_MPI=1' if with_config('mpi')
23
21
  make_envs << ' LLAMA_VULKAN=1' if with_config('vulkan')
22
+ make_envs << ' LLAMA_NO_OPENMP=1' if with_config('no-openmp')
24
23
 
25
24
  make_envs << ' LLAMA_METAL_EMBED_LIBRARY=1' if RUBY_PLATFORM.match?(/darwin/)
26
25
 
data/ext/llama_cpp/llama_cpp.cpp CHANGED
@@ -1523,7 +1523,7 @@ public:
1523
1523
  rb_define_method(rb_cLLaMAModel, "n_params", RUBY_METHOD_FUNC(_llama_model_get_model_n_params), 0);
1524
1524
  rb_define_method(rb_cLLaMAModel, "text", RUBY_METHOD_FUNC(_llama_model_get_text), 1);
1525
1525
  rb_define_method(rb_cLLaMAModel, "score", RUBY_METHOD_FUNC(_llama_model_get_score), 1);
1526
- rb_define_method(rb_cLLaMAModel, "type", RUBY_METHOD_FUNC(_llama_model_get_type), 1);
1526
+ rb_define_method(rb_cLLaMAModel, "token_attr", RUBY_METHOD_FUNC(_llama_model_get_token_attr), 1);
1527
1527
  rb_define_method(rb_cLLaMAModel, "token_bos", RUBY_METHOD_FUNC(_llama_model_token_bos), 0);
1528
1528
  rb_define_method(rb_cLLaMAModel, "token_eos", RUBY_METHOD_FUNC(_llama_model_token_eos), 0);
1529
1529
  rb_define_method(rb_cLLaMAModel, "token_cls", RUBY_METHOD_FUNC(_llama_model_token_cls), 0);
@@ -1778,10 +1778,10 @@ private:
1778
1778
  return DBL2NUM(score);
1779
1779
  }
1780
1780
 
1781
- static VALUE _llama_model_get_type(VALUE self, VALUE token_) {
1781
+ static VALUE _llama_model_get_token_attr(VALUE self, VALUE token_) {
1782
1782
  LLaMAModelWrapper* ptr = get_llama_model(self);
1783
1783
  const llama_token token = NUM2INT(token_);
1784
- const int type = llama_token_get_type(ptr->model, token);
1784
+ const llama_token_attr type = llama_token_get_attr(ptr->model, token);
1785
1785
  return INT2NUM(type);
1786
1786
  }
1787
1787
 
@@ -3503,6 +3503,18 @@ extern "C" void Init_llama_cpp(void) {
3503
3503
  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_UNUSED", INT2NUM(LLAMA_TOKEN_TYPE_UNUSED));
3504
3504
  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_BYTE", INT2NUM(LLAMA_TOKEN_TYPE_BYTE));
3505
3505
 
3506
+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_UNDEFINED", INT2NUM(LLAMA_TOKEN_ATTR_UNDEFINED));
3507
+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_UNKNOWN", INT2NUM(LLAMA_TOKEN_ATTR_UNKNOWN));
3508
+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_UNUSED", INT2NUM(LLAMA_TOKEN_ATTR_UNUSED));
3509
+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_NORMAL", INT2NUM(LLAMA_TOKEN_ATTR_NORMAL));
3510
+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_CONTROL", INT2NUM(LLAMA_TOKEN_ATTR_CONTROL));
3511
+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_USER_DEFINED", INT2NUM(LLAMA_TOKEN_ATTR_USER_DEFINED));
3512
+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_BYTE", INT2NUM(LLAMA_TOKEN_ATTR_BYTE));
3513
+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_NORMALIZED", INT2NUM(LLAMA_TOKEN_ATTR_NORMALIZED));
3514
+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_LSTRIP", INT2NUM(LLAMA_TOKEN_ATTR_LSTRIP));
3515
+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_RSTRIP", INT2NUM(LLAMA_TOKEN_ATTR_RSTRIP));
3516
+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_SINGLE_WORD", INT2NUM(LLAMA_TOKEN_ATTR_SINGLE_WORD));
3517
+
3506
3518
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_ALL_F32", INT2NUM(LLAMA_FTYPE_ALL_F32));
3507
3519
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_F16", INT2NUM(LLAMA_FTYPE_MOSTLY_F16));
3508
3520
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_0", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_0));
data/lib/llama_cpp/version.rb CHANGED
@@ -3,8 +3,8 @@
3
3
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
4
4
  module LLaMACpp
5
5
  # The version of llama_cpp.rb you install.
6
- VERSION = '0.15.4'
6
+ VERSION = '0.16.0'
7
7
 
8
8
  # The version of llama.cpp bundled with llama_cpp.rb.
9
- LLAMA_CPP_VERSION = 'b3056'
9
+ LLAMA_CPP_VERSION = 'b3091'
10
10
  end
data/sig/llama_cpp.rbs CHANGED
@@ -32,6 +32,18 @@ module LLaMACpp
32
32
  LLAMA_VOCAB_PRE_TYPE_DBRX: Integer
33
33
  LLAMA_VOCAB_PRE_TYPE_SMAUG: Integer
34
34
 
35
+ LLAMA_TOKEN_ATTR_UNDEFINED: Integer
36
+ LLAMA_TOKEN_ATTR_UNKNOWN: Integer
37
+ LLAMA_TOKEN_ATTR_UNUSED: Integer
38
+ LLAMA_TOKEN_ATTR_NORMAL: Integer
39
+ LLAMA_TOKEN_ATTR_CONTROL: Integer
40
+ LLAMA_TOKEN_ATTR_USER_DEFINED: Integer
41
+ LLAMA_TOKEN_ATTR_BYTE: Integer
42
+ LLAMA_TOKEN_ATTR_NORMALIZED: Integer
43
+ LLAMA_TOKEN_ATTR_LSTRIP: Integer
44
+ LLAMA_TOKEN_ATTR_RSTRIP: Integer
45
+ LLAMA_TOKEN_ATTR_SINGLE_WORD: Integer
46
+
35
47
  LLAMA_FTYPE_ALL_F32: Integer
36
48
  LLAMA_FTYPE_MOSTLY_F16: Integer
37
49
  LLAMA_FTYPE_MOSTLY_Q4_0: Integer
@@ -147,7 +159,7 @@ module LLaMACpp
147
159
  def n_params: () -> Integer
148
160
  def text: (Integer) -> String
149
161
  def score: (Integer) -> Float
150
- def type: (Integer) -> Integer
162
+ def token_attr: (Integer) -> Integer
151
163
  def token_bos: () -> Integer
152
164
  def token_eos: () -> Integer
153
165
  def token_cls: () -> Integer
data/vendor/tmp/llama.cpp/Makefile CHANGED
@@ -1,7 +1,7 @@
1
1
  # Define the default target now so that it is always the first target
2
2
  BUILD_TARGETS = \
3
3
  main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
4
- simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama beam-search \
4
+ simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \
5
5
  retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
6
6
 
7
7
  # Binaries only useful for tests
@@ -57,6 +57,8 @@ ifeq ($(UNAME_S),Darwin)
57
57
  LLAMA_METAL := 1
58
58
  endif
59
59
 
60
+ LLAMA_NO_OPENMP := 1
61
+
60
62
  ifneq ($(UNAME_P),arm)
61
63
  SYSCTL_M := $(shell sysctl -n hw.optional.arm64 2>/dev/null)
62
64
  ifeq ($(SYSCTL_M),1)
@@ -67,6 +69,10 @@ ifeq ($(UNAME_S),Darwin)
67
69
  endif
68
70
  endif
69
71
 
72
+ ifdef LLAMA_RPC
73
+ BUILD_TARGETS += rpc-server
74
+ endif
75
+
70
76
  default: $(BUILD_TARGETS)
71
77
 
72
78
  test: $(TEST_TARGETS)
@@ -135,12 +141,16 @@ MK_NVCCFLAGS = -std=c++11
135
141
  ifdef LLAMA_FAST
136
142
  MK_CFLAGS += -Ofast
137
143
  HOST_CXXFLAGS += -Ofast
144
+ ifndef LLAMA_DEBUG
138
145
  MK_NVCCFLAGS += -O3
146
+ endif # LLAMA_DEBUG
139
147
  else
140
148
  MK_CFLAGS += -O3
141
149
  MK_CXXFLAGS += -O3
150
+ ifndef LLAMA_DEBUG
142
151
  MK_NVCCFLAGS += -O3
143
- endif
152
+ endif # LLAMA_DEBUG
153
+ endif # LLAMA_FAST
144
154
 
145
155
  ifndef LLAMA_NO_CCACHE
146
156
  CCACHE := $(shell which ccache)
@@ -201,9 +211,10 @@ ifdef LLAMA_SCHED_MAX_COPIES
201
211
  endif
202
212
 
203
213
  ifdef LLAMA_DEBUG
204
- MK_CFLAGS += -O0 -g
205
- MK_CXXFLAGS += -O0 -g
206
- MK_LDFLAGS += -g
214
+ MK_CFLAGS += -O0 -g
215
+ MK_CXXFLAGS += -O0 -g
216
+ MK_LDFLAGS += -g
217
+ MK_NVCCFLAGS += -O0 -g
207
218
 
208
219
  ifeq ($(UNAME_S),Linux)
209
220
  MK_CPPFLAGS += -D_GLIBCXX_ASSERTIONS
@@ -402,6 +413,12 @@ ifndef LLAMA_NO_ACCELERATE
402
413
  endif
403
414
  endif # LLAMA_NO_ACCELERATE
404
415
 
416
+ ifndef LLAMA_NO_OPENMP
417
+ MK_CPPFLAGS += -DGGML_USE_OPENMP
418
+ MK_CFLAGS += -fopenmp
419
+ MK_CXXFLAGS += -fopenmp
420
+ endif # LLAMA_NO_OPENMP
421
+
405
422
  ifdef LLAMA_OPENBLAS
406
423
  MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas)
407
424
  MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas)
@@ -418,11 +435,25 @@ ifdef LLAMA_BLIS
418
435
  MK_LDFLAGS += -lblis -L/usr/local/lib
419
436
  endif # LLAMA_BLIS
420
437
 
438
+ ifdef LLAMA_RPC
439
+ MK_CPPFLAGS += -DGGML_USE_RPC
440
+ OBJS += ggml-rpc.o
441
+ endif # LLAMA_RPC
442
+
421
443
  ifdef LLAMA_CUBLAS
422
444
  # LLAMA_CUBLAS is deprecated and will be removed in the future
423
445
  LLAMA_CUDA := 1
424
446
  endif
425
447
 
448
+ OBJS_CUDA_TEMP_INST = $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-wmma*.cu))
449
+ ifdef LLAMA_CUDA_FA_ALL_QUANTS
450
+ OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*.cu))
451
+ else
452
+ OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu))
453
+ OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu))
454
+ OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*f16-f16.cu))
455
+ endif # LLAMA_CUDA_FA_ALL_QUANTS
456
+
426
457
  ifdef LLAMA_CUDA
427
458
  ifneq ('', '$(wildcard /opt/cuda)')
428
459
  CUDA_PATH ?= /opt/cuda
@@ -433,6 +464,7 @@ ifdef LLAMA_CUDA
433
464
  MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
434
465
  OBJS += ggml-cuda.o
435
466
  OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
467
+ OBJS += $(OBJS_CUDA_TEMP_INST)
436
468
  MK_NVCCFLAGS += -use_fast_math
437
469
  ifdef LLAMA_FATAL_WARNINGS
438
470
  MK_NVCCFLAGS += -Werror all-warnings
@@ -495,7 +527,10 @@ ifdef LLAMA_CUDA_NO_PEER_COPY
495
527
  endif # LLAMA_CUDA_NO_PEER_COPY
496
528
  ifdef LLAMA_CUDA_CCBIN
497
529
  MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
498
- endif
530
+ endif # LLAMA_CUDA_CCBIN
531
+ ifdef LLAMA_CUDA_FA_ALL_QUANTS
532
+ MK_NVCCFLAGS += -DGGML_CUDA_FA_ALL_QUANTS
533
+ endif # LLAMA_CUDA_FA_ALL_QUANTS
499
534
 
500
535
  ifdef JETSON_EOL_MODULE_DETECT
501
536
  define NVCC_COMPILE
@@ -507,30 +542,13 @@ define NVCC_COMPILE
507
542
  endef # NVCC_COMPILE
508
543
  endif # JETSON_EOL_MODULE_DETECT
509
544
 
510
- ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
545
+ ggml-cuda/%.o: ggml-cuda/%.cu ggml.h ggml-common.h ggml-cuda/common.cuh
511
546
  $(NVCC_COMPILE)
512
547
 
513
548
  ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
514
549
  $(NVCC_COMPILE)
515
550
  endif # LLAMA_CUDA
516
551
 
517
- ifdef LLAMA_CLBLAST
518
- MK_CPPFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags-only-I clblast OpenCL)
519
- MK_CFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
520
- MK_CXXFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
521
-
522
- # Mac provides OpenCL as a framework
523
- ifeq ($(UNAME_S),Darwin)
524
- MK_LDFLAGS += -lclblast -framework OpenCL
525
- else
526
- MK_LDFLAGS += $(shell pkg-config --libs clblast OpenCL)
527
- endif
528
- OBJS += ggml-opencl.o
529
-
530
- ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
531
- $(CXX) $(CXXFLAGS) -c $< -o $@
532
- endif # LLAMA_CLBLAST
533
-
534
552
  ifdef LLAMA_VULKAN
535
553
  MK_CPPFLAGS += -DGGML_USE_VULKAN
536
554
  MK_LDFLAGS += -lvulkan
@@ -573,6 +591,7 @@ ifdef LLAMA_HIP_UMA
573
591
  MK_CPPFLAGS += -DGGML_HIP_UMA
574
592
  endif # LLAMA_HIP_UMA
575
593
  MK_LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
594
+ MK_LDFLAGS += -L$(ROCM_PATH)/lib64 -Wl,-rpath=$(ROCM_PATH)/lib64
576
595
  MK_LDFLAGS += -lhipblas -lamdhip64 -lrocblas
577
596
  HIPFLAGS += $(addprefix --offload-arch=,$(AMDGPU_TARGETS))
578
597
  HIPFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
@@ -586,11 +605,12 @@ ifdef LLAMA_CUDA_NO_PEER_COPY
586
605
  endif # LLAMA_CUDA_NO_PEER_COPY
587
606
  OBJS += ggml-cuda.o
588
607
  OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
608
+ OBJS += $(OBJS_CUDA_TEMP_INST)
589
609
 
590
610
  ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
591
611
  $(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
592
612
 
593
- ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
613
+ ggml-cuda/%.o: ggml-cuda/%.cu ggml.h ggml-common.h ggml-cuda/common.cuh
594
614
  $(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
595
615
 
596
616
  endif # LLAMA_HIPBLAS
@@ -628,11 +648,26 @@ ggml-metal-embed.o: ggml-metal.metal ggml-common.h
628
648
  endif
629
649
  endif # LLAMA_METAL
630
650
 
651
+ OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
652
+ COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
653
+ COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
654
+
631
655
  ifndef LLAMA_NO_LLAMAFILE
632
656
  sgemm.o: sgemm.cpp sgemm.h ggml.h
633
657
  $(CXX) $(CXXFLAGS) -c $< -o $@
634
658
  endif
635
659
 
660
+ ifdef LLAMA_RPC
661
+ ggml-rpc.o: ggml-rpc.cpp ggml-rpc.h
662
+ $(CXX) $(CXXFLAGS) -c $< -o $@
663
+
664
+ rpc-server.o: examples/rpc/rpc-server.cpp ggml-rpc.h
665
+ $(CXX) $(CXXFLAGS) -c $< -o $@
666
+
667
+ rpc-server: rpc-server.o ggml.o llama.o $(COMMON_DEPS) $(OBJS)
668
+ $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
669
+ endif # LLAMA_RPC
670
+
636
671
  GF_CC := $(CC)
637
672
  include scripts/get-flags.mk
638
673
 
@@ -712,14 +747,9 @@ unicode.o: unicode.cpp unicode.h
712
747
  unicode-data.o: unicode-data.cpp unicode-data.h
713
748
  $(CXX) $(CXXFLAGS) -c $< -o $@
714
749
 
715
- OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
716
-
717
750
  llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
718
751
  $(CXX) $(CXXFLAGS) -c $< -o $@
719
752
 
720
- COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
721
- COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
722
-
723
753
  common.o: common/common.cpp $(COMMON_H_DEPS)
724
754
  $(CXX) $(CXXFLAGS) -c $< -o $@
725
755
 
@@ -754,6 +784,7 @@ lib: llama.o ggml.o $(OBJS)
754
784
  clean:
755
785
  rm -vrf *.o tests/*.o *.so *.a *.dll *.dylib benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
756
786
  rm -vrf ggml-cuda/*.o
787
+ rm -vrf ggml-cuda/template-instances/*.o
757
788
 
758
789
  #
759
790
  # Examples
@@ -821,7 +852,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
821
852
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
822
853
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
823
854
 
824
- server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
855
+ server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/colorthemes.css.hpp examples/server/style.css.hpp examples/server/theme-beeninorder.css.hpp examples/server/theme-ketivah.css.hpp examples/server/theme-mangotango.css.hpp examples/server/theme-playground.css.hpp examples/server/theme-polarnight.css.hpp examples/server/theme-snowstorm.css.hpp examples/server/index.html.hpp examples/server/index-new.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/system-prompts.js.hpp examples/server/prompt-formats.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
825
856
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
826
857
  $(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
827
858
 
@@ -871,10 +902,6 @@ baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) tra
871
902
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
872
903
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
873
904
 
874
- beam-search: examples/beam-search/beam-search.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
875
- $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
876
- $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
877
-
878
905
  finetune: examples/finetune/finetune.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
879
906
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
880
907
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
data/vendor/tmp/llama.cpp/ggml-alloc.c CHANGED
@@ -377,7 +377,7 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
377
377
  galloc->bufts = calloc(n_bufs, sizeof(ggml_backend_buffer_type_t));
378
378
  GGML_ASSERT(galloc->bufts != NULL);
379
379
 
380
- galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t) * n_bufs);
380
+ galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t));
381
381
  GGML_ASSERT(galloc->buffers != NULL);
382
382
 
383
383
  galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
@@ -750,7 +750,7 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
750
750
  // this tensor was allocated without ggml-backend
751
751
  return;
752
752
  }
753
- ggml_backend_view_init(galloc->buffers[buffer_id], tensor);
753
+ ggml_backend_view_init(tensor);
754
754
  }
755
755
  } else {
756
756
  if (tensor->data == NULL) {
@@ -899,12 +899,12 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
899
899
  if (t->view_src == NULL) {
900
900
  ggml_tallocr_alloc(&tallocr, t);
901
901
  } else if (t->buffer == NULL) {
902
- ggml_backend_view_init(buffer, t);
902
+ ggml_backend_view_init(t);
903
903
  }
904
904
  } else {
905
905
  if (t->view_src != NULL && t->buffer == NULL) {
906
906
  // view of a pre-allocated tensor
907
- ggml_backend_view_init(buffer, t);
907
+ ggml_backend_view_init(t);
908
908
  }
909
909
  }
910
910
  }
@@ -151,7 +151,7 @@ void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer) {
151
151
  bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst) {
152
152
  ggml_backend_buffer_t dst_buf = dst->view_src ? dst->view_src->buffer : dst->buffer;
153
153
  if (dst_buf->iface.cpy_tensor) {
154
- return src->buffer->iface.cpy_tensor(dst_buf, src, dst);
154
+ return dst_buf->iface.cpy_tensor(dst_buf, src, dst);
155
155
  }
156
156
  return false;
157
157
  }
@@ -1887,15 +1887,15 @@ ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched,
1887
1887
 
1888
1888
  // utils
1889
1889
 
1890
- void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
1890
+ void ggml_backend_view_init(struct ggml_tensor * tensor) {
1891
1891
  GGML_ASSERT(tensor->buffer == NULL);
1892
1892
  GGML_ASSERT(tensor->view_src != NULL);
1893
1893
  GGML_ASSERT(tensor->view_src->buffer != NULL);
1894
1894
  GGML_ASSERT(tensor->view_src->data != NULL);
1895
1895
 
1896
- tensor->buffer = buffer;
1896
+ tensor->buffer = tensor->view_src->buffer;
1897
1897
  tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
1898
- ggml_backend_buffer_init_tensor(buffer, tensor);
1898
+ ggml_backend_buffer_init_tensor(tensor->buffer, tensor);
1899
1899
  }
1900
1900
 
1901
1901
  void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) {
@@ -1954,7 +1954,7 @@ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_te
1954
1954
  struct ggml_tensor * dst = node_copies[id];
1955
1955
  if (dst->view_src != NULL) {
1956
1956
  graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
1957
- ggml_backend_view_init(dst->view_src->buffer, dst);
1957
+ ggml_backend_view_init(dst);
1958
1958
  }
1959
1959
  else {
1960
1960
  ggml_backend_tensor_copy(src, dst);
@@ -225,7 +225,7 @@ extern "C" {
225
225
 
226
226
  // Tensor initialization
227
227
  GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
228
- GGML_API void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
228
+ GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);
229
229
 
230
230
 
231
231
  #ifdef __cplusplus
@@ -0,0 +1,47 @@
1
+ #include "acc.cuh"
2
+
3
// Accumulate kernel: writes dst[i] = x[i], adding an element of y when the
// flat index i falls inside the window described by (offset, nb1, nb2) and
// the extents (ne10, ne11, ne12).
//
// One thread handles one flat index i = blockDim.x * blockIdx.x + threadIdx.x;
// threads with i >= ne do nothing.
//
//   x, dst            read / written at flat index i
//   y                 read at ox + oy*ne10 + oz*ne10*ne11
//   ne                number of elements to process
//   ne10, ne11, ne12  extents bounding the (ox, oy, oz) coordinates
//   nb1, nb2          strides used to split (i - offset) into (ox, oy, oz)
//   offset            subtracted from i before the split
static __global__ void acc_f32(const float * x, const float * y, float * dst, const int ne,
    const int ne10, const int ne11, const int ne12,
    const int nb1, const int nb2, int offset) {
    const int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i >= ne) {
        return;
    }

    // position of this element relative to the start of the window
    const int rel = i - offset;
    const int oz  = rel / nb2;
    const int oy  = (rel - (oz * nb2)) / nb1;
    const int ox  = rel % nb1;

    // rel < 0 means the element lies before the window; the remaining checks
    // reject coordinates outside the (ne10, ne11, ne12) extents
    const bool inside = rel >= 0 && ox < ne10 && oy < ne11 && oz < ne12;

    if (inside) {
        dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11];
        return;
    }
    dst[i] = x[i];
}
20
+
21
// Host-side launcher for acc_f32.
// Uses blocks of CUDA_ACC_BLOCK_SIZE threads and enough blocks (rounded up)
// to give every one of the n_elements indices its own thread; the kernel is
// enqueued on `stream` with no dynamic shared memory.
static void acc_f32_cuda(const float * x, const float * y, float * dst, const int n_elements,
    const int ne10, const int ne11, const int ne12,
    const int nb1, const int nb2, const int offset, cudaStream_t stream) {
    // ceil(n_elements / CUDA_ACC_BLOCK_SIZE)
    const int grid_size = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE;

    acc_f32<<<grid_size, CUDA_ACC_BLOCK_SIZE, 0, stream>>>(
        x, y, dst, n_elements,
        ne10, ne11, ne12,
        nb1, nb2, offset);
}
27
+
28
// CUDA implementation of the "acc" op for float32 tensors.
//
// Reads the two operands from dst->src[0] and dst->src[1], takes the strides
// and offset from dst->op_params, and launches acc_f32 on the context's stream.
// All three tensors must be F32 and dst must have ne[3] == 1.
void ggml_cuda_op_acc(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const ggml_tensor * src1 = dst->src[1];

    const float * src0_d = (const float *) src0->data;
    const float * src1_d = (const float *) src1->data;
    float       * dst_d  = (float *) dst->data;

    cudaStream_t stream = ctx.stream();

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported

    // op_params[0..3] are divided by 4 (the size of a float32 in bytes), so the
    // values handed to the kernel are counted in float elements.
    // op_params[2] (nb3) is not needed for the 3D case and is left unread.
    const int nb1    = dst->op_params[0] / 4;
    const int nb2    = dst->op_params[1] / 4;
    const int offset = dst->op_params[3] / 4;

    acc_f32_cuda(src0_d, src1_d, dst_d, ggml_nelements(dst),
                 src1->ne[0], src1->ne[1], src1->ne[2],
                 nb1, nb2, offset, stream);
}
@@ -0,0 +1,34 @@
1
+ #include "arange.cuh"
2
+
3
// Fills dst[i] = start + step * i for every i in [0, ne0).
// One thread per output element; the flat index is
// threadIdx.x + blockIdx.x * blockDim.x and threads past ne0 write nothing.
static __global__ void arange_f32(float * dst, const int ne0, const float start, const float step) {
    const int i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < ne0) {
        dst[i] = start + step * i;
    }
}
11
+
12
// Host-side launcher for arange_f32.
// Launches ceil(ne0 / CUDA_ARANGE_BLOCK_SIZE) blocks of CUDA_ARANGE_BLOCK_SIZE
// threads on `stream`, with no dynamic shared memory.
static void arange_f32_cuda(float * dst, const int ne0, const float start, const float step, cudaStream_t stream) {
    const int grid_size = (ne0 + CUDA_ARANGE_BLOCK_SIZE - 1) / CUDA_ARANGE_BLOCK_SIZE;
    arange_f32<<<grid_size, CUDA_ARANGE_BLOCK_SIZE, 0, stream>>>(dst, ne0, start, step);
}
16
+
17
// CUDA implementation of the "arange" op.
//
// dst->op_params holds three consecutive floats: start, stop and step.
// The destination must be F32 and its element count must equal
// ceil((stop - start) / step); dst->ne[0] values are then generated on the
// context's stream.
void ggml_cuda_op_arange(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    float * dst_d = (float *) dst->data;
    cudaStream_t stream = ctx.stream();

    GGML_ASSERT(dst->type == GGML_TYPE_F32);

    // copy the parameters out with memcpy rather than dereferencing the
    // reinterpreted pointer directly
    const float * params = (const float *) dst->op_params;

    float start;
    float stop;
    float step;
    memcpy(&start, params + 0, sizeof(float));
    memcpy(&stop,  params + 1, sizeof(float));
    memcpy(&step,  params + 2, sizeof(float));

    // sanity check: the tensor was sized for exactly this many steps
    const int64_t steps = (int64_t) ceil((stop - start) / step);
    GGML_ASSERT(ggml_nelements(dst) == steps);

    arange_f32_cuda(dst_d, dst->ne[0], start, step, stream);
}
@@ -0,0 +1,103 @@
1
+ #include "argsort.cuh"
2
+
3
// Exchanges the values of a and b (device-side helper).
template<typename T>
static inline __device__ void ggml_cuda_swap(T & a, T & b) {
    T held = a;
    a = b;
    b = held;
}
9
+
10
// Bitonic argsort of one row of x.
//
// Thread layout: threadIdx.x selects the (padded) column, blockIdx.y selects
// the row. The kernel expects ncols_pad * sizeof(int) bytes of dynamic shared
// memory, which holds the index permutation for the row being sorted.
//
//   x          input values, row r starts at x + r * ncols
//   dst        output indices, row r starts at dst + r * ncols
//   ncols      number of real columns per row
//   ncols_pad  number of columns the sort runs over; the loop structure
//              (k doubling up to ncols_pad) assumes a power of two
//
// Indices >= ncols are padding: they are never used to read x and are always
// ordered after real indices, so the first ncols slots end up holding the
// sorted real indices.
template<ggml_sort_order order>
static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int ncols, int ncols_pad) {
    const int col = threadIdx.x;
    const int row = blockIdx.y;

    if (col >= ncols_pad) {
        return;
    }

    const float * x_row = x + row * ncols;
    extern __shared__ int dst_row[];

    // start from the identity permutation
    dst_row[col] = col;

    __syncthreads();

    for (int k = 2; k <= ncols_pad; k *= 2) {
        for (int j = k / 2; j > 0; j /= 2) {
            const int partner = col ^ j;

            // only the lower-indexed thread of each (col, partner) pair works
            if (partner > col) {
                // (col & k) == 0 selects the direction of this sub-sequence;
                // `lead` is the entry that is supposed to come first in it
                const bool forward = (col & k) == 0;
                const int  lead    = forward ? dst_row[col]     : dst_row[partner];
                const int  trail   = forward ? dst_row[partner] : dst_row[col];

                // padding in the lead slot always gets moved; otherwise compare
                // values, but only when the trailing entry is a real column
                bool out_of_order = lead >= ncols;
                if (!out_of_order && trail < ncols) {
                    out_of_order = order == GGML_SORT_ORDER_ASC
                        ? x_row[lead] > x_row[trail]
                        : x_row[lead] < x_row[trail];
                }

                if (out_of_order) {
                    ggml_cuda_swap(dst_row[col], dst_row[partner]);
                }
            }
            __syncthreads();
        }
    }

    // copy the result to dst without the padding
    if (col < ncols) {
        dst[row * ncols + col] = dst_row[col];
    }
}
59
+
60
// Returns the smallest power of two that is >= x (1 for any x <= 1).
static int next_power_of_2(int x) {
    int result = 1;
    for (; result < x; result *= 2) {
        // keep doubling until x is covered
    }
    return result;
}
67
+
68
// Host-side launcher for k_argsort_f32_i32.
//
//   x, dst   input values / output indices, both laid out as nrows rows of ncols
//   ncols    number of columns per row
//   nrows    number of rows; each row is sorted by its own thread block
//   order    GGML_SORT_ORDER_ASC or GGML_SORT_ORDER_DESC (anything else asserts)
//   stream   stream the kernel is enqueued on
//
// Each row is sorted by a single block of ncols_pad threads using
// ncols_pad * sizeof(int) bytes of dynamic shared memory, so both the thread
// count and the shared memory size must fit the per-block limits.
static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) {
    // bitonic sort requires ncols to be power of 2
    const int ncols_pad = next_power_of_2(ncols);

    // One thread per padded column inside a single block: a thread block cannot
    // exceed 1024 threads, so fail here with a clear message instead of issuing
    // a launch with an invalid configuration.
    GGML_ASSERT(ncols_pad <= 1024);

    const dim3 block_dims(ncols_pad, 1, 1);
    const dim3 block_nums(1, nrows, 1);
    const size_t shared_mem = ncols_pad * sizeof(int);

    GGML_ASSERT(shared_mem <= ggml_cuda_info().devices[ggml_cuda_get_device()].smpb);

    if (order == GGML_SORT_ORDER_ASC) {
        k_argsort_f32_i32<GGML_SORT_ORDER_ASC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
    } else if (order == GGML_SORT_ORDER_DESC) {
        k_argsort_f32_i32<GGML_SORT_ORDER_DESC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
    } else {
        GGML_ASSERT(false);
    }
}
86
+
87
// CUDA implementation of the "argsort" op.
//
// Sorts every row of the contiguous F32 tensor dst->src[0] and writes the
// resulting column indices into the I32 tensor dst. The sort direction is
// read from dst->op_params[0]; the work is enqueued on the context's stream.
void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];

    const float * values  = (const float *) src0->data;
    int         * indices = (int *) dst->data;

    cudaStream_t stream = ctx.stream();

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_I32);
    GGML_ASSERT(ggml_is_contiguous(src0));

    // row length comes from the first dimension, row count from ggml_nrows
    const int64_t ncols = src0->ne[0];
    const int64_t nrows = ggml_nrows(src0);

    const enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];

    argsort_f32_i32_cuda(values, indices, ncols, nrows, order, stream);
}