llama_cpp 0.15.4 → 0.16.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (147) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +10 -0
  3. data/ext/llama_cpp/extconf.rb +1 -2
  4. data/ext/llama_cpp/llama_cpp.cpp +15 -3
  5. data/lib/llama_cpp/version.rb +2 -2
  6. data/sig/llama_cpp.rbs +13 -1
  7. data/vendor/tmp/llama.cpp/Makefile +62 -35
  8. data/vendor/tmp/llama.cpp/ggml-alloc.c +4 -4
  9. data/vendor/tmp/llama.cpp/ggml-backend.c +5 -5
  10. data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
  11. data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu +47 -0
  12. data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu +34 -0
  13. data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +103 -0
  14. data/vendor/tmp/llama.cpp/ggml-cuda/binbcast.cu +280 -0
  15. data/vendor/tmp/llama.cpp/ggml-cuda/clamp.cu +34 -0
  16. data/vendor/tmp/llama.cpp/ggml-cuda/concat.cu +196 -0
  17. data/vendor/tmp/llama.cpp/ggml-cuda/convert.cu +686 -0
  18. data/vendor/tmp/llama.cpp/ggml-cuda/cpy.cu +490 -0
  19. data/vendor/tmp/llama.cpp/ggml-cuda/diagmask.cu +40 -0
  20. data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +662 -0
  21. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +319 -0
  22. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f32.cu +312 -0
  23. data/vendor/tmp/llama.cpp/ggml-cuda/fattn.cu +345 -0
  24. data/vendor/tmp/llama.cpp/ggml-cuda/getrows.cu +178 -0
  25. data/vendor/tmp/llama.cpp/ggml-cuda/im2col.cu +104 -0
  26. data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +1564 -0
  27. data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +404 -0
  28. data/vendor/tmp/llama.cpp/ggml-cuda/norm.cu +221 -0
  29. data/vendor/tmp/llama.cpp/ggml-cuda/pad.cu +49 -0
  30. data/vendor/tmp/llama.cpp/ggml-cuda/pool2d.cu +94 -0
  31. data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +45 -0
  32. data/vendor/tmp/llama.cpp/ggml-cuda/rope.cu +271 -0
  33. data/vendor/tmp/llama.cpp/ggml-cuda/scale.cu +31 -0
  34. data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +205 -0
  35. data/vendor/tmp/llama.cpp/ggml-cuda/sumrows.cu +40 -0
  36. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +5 -0
  37. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +5 -0
  38. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +5 -0
  39. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +5 -0
  40. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +5 -0
  41. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +5 -0
  42. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +5 -0
  43. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +5 -0
  44. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +5 -0
  45. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +5 -0
  46. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +5 -0
  47. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +5 -0
  48. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +5 -0
  49. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +5 -0
  50. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +5 -0
  51. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +5 -0
  52. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +5 -0
  53. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +5 -0
  54. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +5 -0
  55. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +5 -0
  56. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +5 -0
  57. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +5 -0
  58. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +5 -0
  59. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +5 -0
  60. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +5 -0
  61. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +5 -0
  62. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +5 -0
  63. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +5 -0
  64. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +5 -0
  65. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +5 -0
  66. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +5 -0
  67. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +5 -0
  68. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +5 -0
  69. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +5 -0
  70. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +5 -0
  71. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +5 -0
  72. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +5 -0
  73. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +5 -0
  74. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +5 -0
  75. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +5 -0
  76. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +5 -0
  77. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +5 -0
  78. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +5 -0
  79. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +5 -0
  80. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +5 -0
  81. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +5 -0
  82. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +5 -0
  83. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +5 -0
  84. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +5 -0
  85. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +5 -0
  86. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +5 -0
  87. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +5 -0
  88. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +5 -0
  89. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +5 -0
  90. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +5 -0
  91. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +5 -0
  92. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +5 -0
  93. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +5 -0
  94. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +5 -0
  95. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +5 -0
  96. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +5 -0
  97. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +5 -0
  98. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +5 -0
  99. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +5 -0
  100. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +5 -0
  101. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +5 -0
  102. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +5 -0
  103. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +5 -0
  104. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +5 -0
  105. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +5 -0
  106. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +5 -0
  107. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +5 -0
  108. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +5 -0
  109. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +5 -0
  110. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +5 -0
  111. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +5 -0
  112. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +5 -0
  113. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +5 -0
  114. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +5 -0
  115. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +5 -0
  116. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +5 -0
  117. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +5 -0
  118. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +5 -0
  119. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +5 -0
  120. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +5 -0
  121. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +5 -0
  122. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +10 -0
  123. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +9 -0
  124. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +10 -0
  125. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +10 -0
  126. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +8 -0
  127. data/vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu +47 -0
  128. data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +266 -0
  129. data/vendor/tmp/llama.cpp/ggml-cuda/upscale.cu +51 -0
  130. data/vendor/tmp/llama.cpp/ggml-cuda.cu +8 -6
  131. data/vendor/tmp/llama.cpp/ggml-kompute.cpp +21 -6
  132. data/vendor/tmp/llama.cpp/ggml-metal.h +1 -1
  133. data/vendor/tmp/llama.cpp/ggml-metal.m +34 -24
  134. data/vendor/tmp/llama.cpp/ggml-metal.metal +83 -59
  135. data/vendor/tmp/llama.cpp/ggml-rpc.cpp +2 -2
  136. data/vendor/tmp/llama.cpp/ggml-sycl.cpp +7 -67
  137. data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +99301 -39793
  138. data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +456 -329
  139. data/vendor/tmp/llama.cpp/ggml.c +178 -330
  140. data/vendor/tmp/llama.cpp/ggml.h +9 -28
  141. data/vendor/tmp/llama.cpp/llama.cpp +242 -426
  142. data/vendor/tmp/llama.cpp/llama.h +17 -43
  143. metadata +121 -6
  144. data/vendor/tmp/llama.cpp/ggml-mpi.c +0 -216
  145. data/vendor/tmp/llama.cpp/ggml-mpi.h +0 -39
  146. data/vendor/tmp/llama.cpp/ggml-opencl.cpp +0 -2305
  147. data/vendor/tmp/llama.cpp/ggml-opencl.h +0 -36
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 167132898a0cb63faaf4fd7583d9b988992ba7c5ec0f5602d5a158f04e0cdfa0
4
- data.tar.gz: 8a65658eb93b9cf80d5ede554b15968c495f045c32e57cc96ed732c56330d25f
3
+ metadata.gz: 5b79658bc49026edcbd896cac4a1d904060622f2311876afbdba773021399ad1
4
+ data.tar.gz: 064fa60e433863e6919f0c0acbd238cf5d5712058cb834a139a5e5cf798d095e
5
5
  SHA512:
6
- metadata.gz: 9625ac088c4d5c50cc51bbbcbc744cb7041766ccbb7a42a9cd1b80b29ebe64414d39875dea5d61a87025e239ad78be2a2ea4d3f85a187684321e409fc01a40fd
7
- data.tar.gz: 6f68445f10765a4eb1124ed1cfd2afb7544d146823efad27b2b6955bb0ee822ae8b0f9cccb68777c8cb211f665a0e2531eba04a4240399af1101a5dbcd645ae9
6
+ metadata.gz: 3248ba69cd0eefcc8b36bdcb03fe13a86da826f4a97a4c61bc62632c2f646647dfaac2b906dd2cb672740c30046e9f588d8e9687b6b8e4bc0a5fc03134d62ec5
7
+ data.tar.gz: 91164427363b01f805ae3be98a8f44d7aba0e7c437db7daa2b396bf3329398189613036ac4cb4f5d471194edb02485e32529ca1b9c140144332a0e34107d3666
data/CHANGELOG.md CHANGED
@@ -1,3 +1,13 @@
1
+ ## [[0.16.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.15.4...v0.16.0)] - 2024-06-08
2
+
3
+ **Breaking Changes**
4
+
5
+ - Bump llama.cpp from b3056 to b3091.
6
+ - Rename `type` method to `token_attr` in `Model`.
7
+ - Add constants for token attribute types.
8
+ - Remove `--with-clblast` and `--with-mpi` config options.
9
+ - Add `--with-no-openmp` config option.
10
+
1
11
  ## [[0.15.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.15.3...v0.15.4)] - 2024-06-01
2
12
 
3
13
  - Bump llama.cpp from b2988 to b3056.
@@ -17,10 +17,9 @@ make_envs << ' LLAMA_OPENBLAS=1' if with_config('openblas')
17
17
  make_envs << ' LLAMA_BLIS=1' if with_config('blis')
18
18
  make_envs << ' LLAMA_CUBLAS=1' if with_config('cublas') # Deprecated, use --with-cuda instead
19
19
  make_envs << ' LLAMA_CUDA=1' if with_config('cuda')
20
- make_envs << ' LLAMA_CLBLAST=1' if with_config('clblast')
21
20
  make_envs << ' LLAMA_HIPBLAS=1' if with_config('hipblas')
22
- make_envs << ' LLAMA_MPI=1' if with_config('mpi')
23
21
  make_envs << ' LLAMA_VULKAN=1' if with_config('vulkan')
22
+ make_envs << ' LLAMA_NO_OPENMP=1' if with_config('no-openmp')
24
23
 
25
24
  make_envs << ' LLAMA_METAL_EMBED_LIBRARY=1' if RUBY_PLATFORM.match?(/darwin/)
26
25
 
@@ -1523,7 +1523,7 @@ public:
1523
1523
  rb_define_method(rb_cLLaMAModel, "n_params", RUBY_METHOD_FUNC(_llama_model_get_model_n_params), 0);
1524
1524
  rb_define_method(rb_cLLaMAModel, "text", RUBY_METHOD_FUNC(_llama_model_get_text), 1);
1525
1525
  rb_define_method(rb_cLLaMAModel, "score", RUBY_METHOD_FUNC(_llama_model_get_score), 1);
1526
- rb_define_method(rb_cLLaMAModel, "type", RUBY_METHOD_FUNC(_llama_model_get_type), 1);
1526
+ rb_define_method(rb_cLLaMAModel, "token_attr", RUBY_METHOD_FUNC(_llama_model_get_token_attr), 1);
1527
1527
  rb_define_method(rb_cLLaMAModel, "token_bos", RUBY_METHOD_FUNC(_llama_model_token_bos), 0);
1528
1528
  rb_define_method(rb_cLLaMAModel, "token_eos", RUBY_METHOD_FUNC(_llama_model_token_eos), 0);
1529
1529
  rb_define_method(rb_cLLaMAModel, "token_cls", RUBY_METHOD_FUNC(_llama_model_token_cls), 0);
@@ -1778,10 +1778,10 @@ private:
1778
1778
  return DBL2NUM(score);
1779
1779
  }
1780
1780
 
1781
- static VALUE _llama_model_get_type(VALUE self, VALUE token_) {
1781
+ static VALUE _llama_model_get_token_attr(VALUE self, VALUE token_) {
1782
1782
  LLaMAModelWrapper* ptr = get_llama_model(self);
1783
1783
  const llama_token token = NUM2INT(token_);
1784
- const int type = llama_token_get_type(ptr->model, token);
1784
+ const llama_token_attr type = llama_token_get_attr(ptr->model, token);
1785
1785
  return INT2NUM(type);
1786
1786
  }
1787
1787
 
@@ -3503,6 +3503,18 @@ extern "C" void Init_llama_cpp(void) {
3503
3503
  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_UNUSED", INT2NUM(LLAMA_TOKEN_TYPE_UNUSED));
3504
3504
  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_BYTE", INT2NUM(LLAMA_TOKEN_TYPE_BYTE));
3505
3505
 
3506
+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_UNDEFINED", INT2NUM(LLAMA_TOKEN_ATTR_UNDEFINED));
3507
+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_UNKNOWN", INT2NUM(LLAMA_TOKEN_ATTR_UNKNOWN));
3508
+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_UNUSED", INT2NUM(LLAMA_TOKEN_ATTR_UNUSED));
3509
+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_NORMAL", INT2NUM(LLAMA_TOKEN_ATTR_NORMAL));
3510
+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_CONTROL", INT2NUM(LLAMA_TOKEN_ATTR_CONTROL));
3511
+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_USER_DEFINED", INT2NUM(LLAMA_TOKEN_ATTR_USER_DEFINED));
3512
+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_BYTE", INT2NUM(LLAMA_TOKEN_ATTR_BYTE));
3513
+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_NORMALIZED", INT2NUM(LLAMA_TOKEN_ATTR_NORMALIZED));
3514
+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_LSTRIP", INT2NUM(LLAMA_TOKEN_ATTR_LSTRIP));
3515
+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_RSTRIP", INT2NUM(LLAMA_TOKEN_ATTR_RSTRIP));
3516
+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_SINGLE_WORD", INT2NUM(LLAMA_TOKEN_ATTR_SINGLE_WORD));
3517
+
3506
3518
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_ALL_F32", INT2NUM(LLAMA_FTYPE_ALL_F32));
3507
3519
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_F16", INT2NUM(LLAMA_FTYPE_MOSTLY_F16));
3508
3520
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_0", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_0));
@@ -3,8 +3,8 @@
3
3
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
4
4
  module LLaMACpp
5
5
  # The version of llama_cpp.rb you install.
6
- VERSION = '0.15.4'
6
+ VERSION = '0.16.0'
7
7
 
8
8
  # The version of llama.cpp bundled with llama_cpp.rb.
9
- LLAMA_CPP_VERSION = 'b3056'
9
+ LLAMA_CPP_VERSION = 'b3091'
10
10
  end
data/sig/llama_cpp.rbs CHANGED
@@ -32,6 +32,18 @@ module LLaMACpp
32
32
  LLAMA_VOCAB_PRE_TYPE_DBRX: Integer
33
33
  LLAMA_VOCAB_PRE_TYPE_SMAUG: Integer
34
34
 
35
+ LLAMA_TOKEN_ATTR_UNDEFINED: Integer
36
+ LLAMA_TOKEN_ATTR_UNKNOWN: Integer
37
+ LLAMA_TOKEN_ATTR_UNUSED: Integer
38
+ LLAMA_TOKEN_ATTR_NORMAL: Integer
39
+ LLAMA_TOKEN_ATTR_CONTROL: Integer
40
+ LLAMA_TOKEN_ATTR_USER_DEFINED: Integer
41
+ LLAMA_TOKEN_ATTR_BYTE: Integer
42
+ LLAMA_TOKEN_ATTR_NORMALIZED: Integer
43
+ LLAMA_TOKEN_ATTR_LSTRIP: Integer
44
+ LLAMA_TOKEN_ATTR_RSTRIP: Integer
45
+ LLAMA_TOKEN_ATTR_SINGLE_WORD: Integer
46
+
35
47
  LLAMA_FTYPE_ALL_F32: Integer
36
48
  LLAMA_FTYPE_MOSTLY_F16: Integer
37
49
  LLAMA_FTYPE_MOSTLY_Q4_0: Integer
@@ -147,7 +159,7 @@ module LLaMACpp
147
159
  def n_params: () -> Integer
148
160
  def text: (Integer) -> String
149
161
  def score: (Integer) -> Float
150
- def type: (Integer) -> Integer
162
+ def token_attr: (Integer) -> Integer
151
163
  def token_bos: () -> Integer
152
164
  def token_eos: () -> Integer
153
165
  def token_cls: () -> Integer
@@ -1,7 +1,7 @@
1
1
  # Define the default target now so that it is always the first target
2
2
  BUILD_TARGETS = \
3
3
  main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
4
- simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama beam-search \
4
+ simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \
5
5
  retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
6
6
 
7
7
  # Binaries only useful for tests
@@ -57,6 +57,8 @@ ifeq ($(UNAME_S),Darwin)
57
57
  LLAMA_METAL := 1
58
58
  endif
59
59
 
60
+ LLAMA_NO_OPENMP := 1
61
+
60
62
  ifneq ($(UNAME_P),arm)
61
63
  SYSCTL_M := $(shell sysctl -n hw.optional.arm64 2>/dev/null)
62
64
  ifeq ($(SYSCTL_M),1)
@@ -67,6 +69,10 @@ ifeq ($(UNAME_S),Darwin)
67
69
  endif
68
70
  endif
69
71
 
72
+ ifdef LLAMA_RPC
73
+ BUILD_TARGETS += rpc-server
74
+ endif
75
+
70
76
  default: $(BUILD_TARGETS)
71
77
 
72
78
  test: $(TEST_TARGETS)
@@ -135,12 +141,16 @@ MK_NVCCFLAGS = -std=c++11
135
141
  ifdef LLAMA_FAST
136
142
  MK_CFLAGS += -Ofast
137
143
  HOST_CXXFLAGS += -Ofast
144
+ ifndef LLAMA_DEBUG
138
145
  MK_NVCCFLAGS += -O3
146
+ endif # LLAMA_DEBUG
139
147
  else
140
148
  MK_CFLAGS += -O3
141
149
  MK_CXXFLAGS += -O3
150
+ ifndef LLAMA_DEBUG
142
151
  MK_NVCCFLAGS += -O3
143
- endif
152
+ endif # LLAMA_DEBUG
153
+ endif # LLAMA_FAST
144
154
 
145
155
  ifndef LLAMA_NO_CCACHE
146
156
  CCACHE := $(shell which ccache)
@@ -201,9 +211,10 @@ ifdef LLAMA_SCHED_MAX_COPIES
201
211
  endif
202
212
 
203
213
  ifdef LLAMA_DEBUG
204
- MK_CFLAGS += -O0 -g
205
- MK_CXXFLAGS += -O0 -g
206
- MK_LDFLAGS += -g
214
+ MK_CFLAGS += -O0 -g
215
+ MK_CXXFLAGS += -O0 -g
216
+ MK_LDFLAGS += -g
217
+ MK_NVCCFLAGS += -O0 -g
207
218
 
208
219
  ifeq ($(UNAME_S),Linux)
209
220
  MK_CPPFLAGS += -D_GLIBCXX_ASSERTIONS
@@ -402,6 +413,12 @@ ifndef LLAMA_NO_ACCELERATE
402
413
  endif
403
414
  endif # LLAMA_NO_ACCELERATE
404
415
 
416
+ ifndef LLAMA_NO_OPENMP
417
+ MK_CPPFLAGS += -DGGML_USE_OPENMP
418
+ MK_CFLAGS += -fopenmp
419
+ MK_CXXFLAGS += -fopenmp
420
+ endif # LLAMA_NO_OPENMP
421
+
405
422
  ifdef LLAMA_OPENBLAS
406
423
  MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas)
407
424
  MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas)
@@ -418,11 +435,25 @@ ifdef LLAMA_BLIS
418
435
  MK_LDFLAGS += -lblis -L/usr/local/lib
419
436
  endif # LLAMA_BLIS
420
437
 
438
+ ifdef LLAMA_RPC
439
+ MK_CPPFLAGS += -DGGML_USE_RPC
440
+ OBJS += ggml-rpc.o
441
+ endif # LLAMA_RPC
442
+
421
443
  ifdef LLAMA_CUBLAS
422
444
  # LLAMA_CUBLAS is deprecated and will be removed in the future
423
445
  LLAMA_CUDA := 1
424
446
  endif
425
447
 
448
+ OBJS_CUDA_TEMP_INST = $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-wmma*.cu))
449
+ ifdef LLAMA_CUDA_FA_ALL_QUANTS
450
+ OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*.cu))
451
+ else
452
+ OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu))
453
+ OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu))
454
+ OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*f16-f16.cu))
455
+ endif # LLAMA_CUDA_FA_ALL_QUANTS
456
+
426
457
  ifdef LLAMA_CUDA
427
458
  ifneq ('', '$(wildcard /opt/cuda)')
428
459
  CUDA_PATH ?= /opt/cuda
@@ -433,6 +464,7 @@ ifdef LLAMA_CUDA
433
464
  MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
434
465
  OBJS += ggml-cuda.o
435
466
  OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
467
+ OBJS += $(OBJS_CUDA_TEMP_INST)
436
468
  MK_NVCCFLAGS += -use_fast_math
437
469
  ifdef LLAMA_FATAL_WARNINGS
438
470
  MK_NVCCFLAGS += -Werror all-warnings
@@ -495,7 +527,10 @@ ifdef LLAMA_CUDA_NO_PEER_COPY
495
527
  endif # LLAMA_CUDA_NO_PEER_COPY
496
528
  ifdef LLAMA_CUDA_CCBIN
497
529
  MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
498
- endif
530
+ endif # LLAMA_CUDA_CCBIN
531
+ ifdef LLAMA_CUDA_FA_ALL_QUANTS
532
+ MK_NVCCFLAGS += -DGGML_CUDA_FA_ALL_QUANTS
533
+ endif # LLAMA_CUDA_FA_ALL_QUANTS
499
534
 
500
535
  ifdef JETSON_EOL_MODULE_DETECT
501
536
  define NVCC_COMPILE
@@ -507,30 +542,13 @@ define NVCC_COMPILE
507
542
  endef # NVCC_COMPILE
508
543
  endif # JETSON_EOL_MODULE_DETECT
509
544
 
510
- ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
545
+ ggml-cuda/%.o: ggml-cuda/%.cu ggml.h ggml-common.h ggml-cuda/common.cuh
511
546
  $(NVCC_COMPILE)
512
547
 
513
548
  ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
514
549
  $(NVCC_COMPILE)
515
550
  endif # LLAMA_CUDA
516
551
 
517
- ifdef LLAMA_CLBLAST
518
- MK_CPPFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags-only-I clblast OpenCL)
519
- MK_CFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
520
- MK_CXXFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
521
-
522
- # Mac provides OpenCL as a framework
523
- ifeq ($(UNAME_S),Darwin)
524
- MK_LDFLAGS += -lclblast -framework OpenCL
525
- else
526
- MK_LDFLAGS += $(shell pkg-config --libs clblast OpenCL)
527
- endif
528
- OBJS += ggml-opencl.o
529
-
530
- ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
531
- $(CXX) $(CXXFLAGS) -c $< -o $@
532
- endif # LLAMA_CLBLAST
533
-
534
552
  ifdef LLAMA_VULKAN
535
553
  MK_CPPFLAGS += -DGGML_USE_VULKAN
536
554
  MK_LDFLAGS += -lvulkan
@@ -573,6 +591,7 @@ ifdef LLAMA_HIP_UMA
573
591
  MK_CPPFLAGS += -DGGML_HIP_UMA
574
592
  endif # LLAMA_HIP_UMA
575
593
  MK_LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
594
+ MK_LDFLAGS += -L$(ROCM_PATH)/lib64 -Wl,-rpath=$(ROCM_PATH)/lib64
576
595
  MK_LDFLAGS += -lhipblas -lamdhip64 -lrocblas
577
596
  HIPFLAGS += $(addprefix --offload-arch=,$(AMDGPU_TARGETS))
578
597
  HIPFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
@@ -586,11 +605,12 @@ ifdef LLAMA_CUDA_NO_PEER_COPY
586
605
  endif # LLAMA_CUDA_NO_PEER_COPY
587
606
  OBJS += ggml-cuda.o
588
607
  OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
608
+ OBJS += $(OBJS_CUDA_TEMP_INST)
589
609
 
590
610
  ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
591
611
  $(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
592
612
 
593
- ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
613
+ ggml-cuda/%.o: ggml-cuda/%.cu ggml.h ggml-common.h ggml-cuda/common.cuh
594
614
  $(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
595
615
 
596
616
  endif # LLAMA_HIPBLAS
@@ -628,11 +648,26 @@ ggml-metal-embed.o: ggml-metal.metal ggml-common.h
628
648
  endif
629
649
  endif # LLAMA_METAL
630
650
 
651
+ OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
652
+ COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
653
+ COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
654
+
631
655
  ifndef LLAMA_NO_LLAMAFILE
632
656
  sgemm.o: sgemm.cpp sgemm.h ggml.h
633
657
  $(CXX) $(CXXFLAGS) -c $< -o $@
634
658
  endif
635
659
 
660
+ ifdef LLAMA_RPC
661
+ ggml-rpc.o: ggml-rpc.cpp ggml-rpc.h
662
+ $(CXX) $(CXXFLAGS) -c $< -o $@
663
+
664
+ rpc-server.o: examples/rpc/rpc-server.cpp ggml-rpc.h
665
+ $(CXX) $(CXXFLAGS) -c $< -o $@
666
+
667
+ rpc-server: rpc-server.o ggml.o llama.o $(COMMON_DEPS) $(OBJS)
668
+ $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
669
+ endif # LLAMA_RPC
670
+
636
671
  GF_CC := $(CC)
637
672
  include scripts/get-flags.mk
638
673
 
@@ -712,14 +747,9 @@ unicode.o: unicode.cpp unicode.h
712
747
  unicode-data.o: unicode-data.cpp unicode-data.h
713
748
  $(CXX) $(CXXFLAGS) -c $< -o $@
714
749
 
715
- OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
716
-
717
750
  llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
718
751
  $(CXX) $(CXXFLAGS) -c $< -o $@
719
752
 
720
- COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
721
- COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
722
-
723
753
  common.o: common/common.cpp $(COMMON_H_DEPS)
724
754
  $(CXX) $(CXXFLAGS) -c $< -o $@
725
755
 
@@ -754,6 +784,7 @@ lib: llama.o ggml.o $(OBJS)
754
784
  clean:
755
785
  rm -vrf *.o tests/*.o *.so *.a *.dll *.dylib benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
756
786
  rm -vrf ggml-cuda/*.o
787
+ rm -vrf ggml-cuda/template-instances/*.o
757
788
 
758
789
  #
759
790
  # Examples
@@ -821,7 +852,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
821
852
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
822
853
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
823
854
 
824
- server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
855
+ server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/colorthemes.css.hpp examples/server/style.css.hpp examples/server/theme-beeninorder.css.hpp examples/server/theme-ketivah.css.hpp examples/server/theme-mangotango.css.hpp examples/server/theme-playground.css.hpp examples/server/theme-polarnight.css.hpp examples/server/theme-snowstorm.css.hpp examples/server/index.html.hpp examples/server/index-new.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/system-prompts.js.hpp examples/server/prompt-formats.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
825
856
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
826
857
  $(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
827
858
 
@@ -871,10 +902,6 @@ baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) tra
871
902
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
872
903
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
873
904
 
874
- beam-search: examples/beam-search/beam-search.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
875
- $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
876
- $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
877
-
878
905
  finetune: examples/finetune/finetune.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
879
906
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
880
907
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -377,7 +377,7 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
377
377
  galloc->bufts = calloc(n_bufs, sizeof(ggml_backend_buffer_type_t));
378
378
  GGML_ASSERT(galloc->bufts != NULL);
379
379
 
380
- galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t) * n_bufs);
380
+ galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t));
381
381
  GGML_ASSERT(galloc->buffers != NULL);
382
382
 
383
383
  galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
@@ -750,7 +750,7 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
750
750
  // this tensor was allocated without ggml-backend
751
751
  return;
752
752
  }
753
- ggml_backend_view_init(galloc->buffers[buffer_id], tensor);
753
+ ggml_backend_view_init(tensor);
754
754
  }
755
755
  } else {
756
756
  if (tensor->data == NULL) {
@@ -899,12 +899,12 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
899
899
  if (t->view_src == NULL) {
900
900
  ggml_tallocr_alloc(&tallocr, t);
901
901
  } else if (t->buffer == NULL) {
902
- ggml_backend_view_init(buffer, t);
902
+ ggml_backend_view_init(t);
903
903
  }
904
904
  } else {
905
905
  if (t->view_src != NULL && t->buffer == NULL) {
906
906
  // view of a pre-allocated tensor
907
- ggml_backend_view_init(buffer, t);
907
+ ggml_backend_view_init(t);
908
908
  }
909
909
  }
910
910
  }
@@ -151,7 +151,7 @@ void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer) {
151
151
  bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst) {
152
152
  ggml_backend_buffer_t dst_buf = dst->view_src ? dst->view_src->buffer : dst->buffer;
153
153
  if (dst_buf->iface.cpy_tensor) {
154
- return src->buffer->iface.cpy_tensor(dst_buf, src, dst);
154
+ return dst_buf->iface.cpy_tensor(dst_buf, src, dst);
155
155
  }
156
156
  return false;
157
157
  }
@@ -1887,15 +1887,15 @@ ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched,
1887
1887
 
1888
1888
  // utils
1889
1889
 
1890
- void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
1890
+ void ggml_backend_view_init(struct ggml_tensor * tensor) {
1891
1891
  GGML_ASSERT(tensor->buffer == NULL);
1892
1892
  GGML_ASSERT(tensor->view_src != NULL);
1893
1893
  GGML_ASSERT(tensor->view_src->buffer != NULL);
1894
1894
  GGML_ASSERT(tensor->view_src->data != NULL);
1895
1895
 
1896
- tensor->buffer = buffer;
1896
+ tensor->buffer = tensor->view_src->buffer;
1897
1897
  tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
1898
- ggml_backend_buffer_init_tensor(buffer, tensor);
1898
+ ggml_backend_buffer_init_tensor(tensor->buffer, tensor);
1899
1899
  }
1900
1900
 
1901
1901
  void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) {
@@ -1954,7 +1954,7 @@ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_te
1954
1954
  struct ggml_tensor * dst = node_copies[id];
1955
1955
  if (dst->view_src != NULL) {
1956
1956
  graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
1957
- ggml_backend_view_init(dst->view_src->buffer, dst);
1957
+ ggml_backend_view_init(dst);
1958
1958
  }
1959
1959
  else {
1960
1960
  ggml_backend_tensor_copy(src, dst);
@@ -225,7 +225,7 @@ extern "C" {
225
225
 
226
226
  // Tensor initialization
227
227
  GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
228
- GGML_API void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
228
+ GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);
229
229
 
230
230
 
231
231
  #ifdef __cplusplus
@@ -0,0 +1,47 @@
1
+ #include "acc.cuh"
2
+
3
// Kernel for GGML_OP_ACC: dst = x everywhere, with y added on the sub-range
// that y occupies inside x (a view described by offset/nb1/nb2).
// NOTE: despite the ggml "nb" naming, nb1/nb2 and offset here are strides in
// float *elements*, not bytes — the host wrapper divides the byte values from
// op_params by 4 before launching (see ggml_cuda_op_acc in this file).
// Launched 1-D with one thread per element of dst and a tail guard.
static __global__ void acc_f32(const float * x, const float * y, float * dst, const int ne,
    const int ne10, const int ne11, const int ne12,
    const int nb1, const int nb2, int offset) {
    const int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i >= ne) {
        return;
    }
    // flat position of this element relative to the start of the y view
    int src1_idx = i - offset;
    // decompose into (ox, oy, oz) coordinates within y using the element strides
    int oz = src1_idx / nb2;
    int oy = (src1_idx - (oz * nb2)) / nb1;
    int ox = src1_idx % nb1;
    if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) {
        // inside the view: accumulate the matching element of y
        dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11];
    } else {
        // outside the view (before the offset or past y's extents): plain copy
        dst[i] = x[i];
    }
}
20
+
21
// Host-side launcher for acc_f32: one thread per output element,
// grid sized by ceiling division so the tail is covered.
static void acc_f32_cuda(const float * x, const float * y, float * dst, const int n_elements,
                         const int ne10, const int ne11, const int ne12,
                         const int nb1, const int nb2, const int offset, cudaStream_t stream) {
    const int n_blocks = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE;
    acc_f32<<<n_blocks, CUDA_ACC_BLOCK_SIZE, 0, stream>>>(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset);
}
27
+
28
// CUDA implementation of GGML_OP_ACC: dst = src0 with src1 accumulated into
// the view of src0 described by dst->op_params (byte strides nb1/nb2 and a
// byte offset). F32 tensors only, at most 3-D (ne[3] must be 1).
void ggml_cuda_op_acc(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const ggml_tensor * src1 = dst->src[1];
    const float * src0_d = (const float *)src0->data;
    const float * src1_d = (const float *)src1->data;
    float * dst_d = (float *)dst->data;
    cudaStream_t stream = ctx.stream();

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported

    // op_params hold byte strides/offset; convert to float-element units
    // (divide by 4) because the kernel indexes in elements
    int nb1 = dst->op_params[0] / 4; // stride in elements (4 bytes per float32)
    int nb2 = dst->op_params[1] / 4; // stride in elements (4 bytes per float32)
    // int nb3 = dst->op_params[2] / 4; // unused
    int offset = dst->op_params[3] / 4; // offset in elements (op_params[3] is in bytes)

    acc_f32_cuda(src0_d, src1_d, dst_d, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, stream);
}
@@ -0,0 +1,34 @@
1
+ #include "arange.cuh"
2
+
3
// Kernel for GGML_OP_ARANGE: dst[i] = start + i * step for i in [0, ne0).
// Launched 1-D, one thread per output element, with a tail guard.
static __global__ void arange_f32(float * dst, const int ne0, const float start, const float step) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < ne0) {
        dst[i] = start + step * i;
    }
}
11
+
12
// Host-side launcher for arange_f32; grid sized by ceiling division.
static void arange_f32_cuda(float * dst, const int ne0, const float start, const float step, cudaStream_t stream) {
    const int n_blocks = (ne0 + CUDA_ARANGE_BLOCK_SIZE - 1) / CUDA_ARANGE_BLOCK_SIZE;
    arange_f32<<<n_blocks, CUDA_ARANGE_BLOCK_SIZE, 0, stream>>>(dst, ne0, start, step);
}
16
+
17
// CUDA implementation of GGML_OP_ARANGE: fills dst (F32) with the sequence
// start, start+step, ... stopping before `stop`. The graph planner must have
// sized dst to exactly ceil((stop - start) / step) elements.
void ggml_cuda_op_arange(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    float * dst_d = (float *)dst->data;
    cudaStream_t stream = ctx.stream();

    GGML_ASSERT(dst->type == GGML_TYPE_F32);

    // op_params carry the three float parameters bit-for-bit;
    // memcpy avoids strict-aliasing issues when reading them back
    float start, stop, step;
    memcpy(&start, (float *)dst->op_params + 0, sizeof(float));
    memcpy(&stop,  (float *)dst->op_params + 1, sizeof(float));
    memcpy(&step,  (float *)dst->op_params + 2, sizeof(float));

    // sanity-check that the tensor size matches the requested range
    const int64_t steps = (int64_t)ceil((stop - start) / step);
    GGML_ASSERT(ggml_nelements(dst) == steps);

    arange_f32_cuda(dst_d, dst->ne[0], start, step, stream);
}
@@ -0,0 +1,103 @@
1
+ #include "argsort.cuh"
2
+
3
// Exchange two values via a temporary (device-callable swap helper).
template<typename T>
static inline __device__ void ggml_cuda_swap(T & a, T & b) {
    T t = a;
    a = b;
    b = t;
}
9
+
10
// Argsort of one row of x per block using a bitonic sorting network over the
// row's column indices, held in dynamic shared memory (ncols_pad ints).
// Preconditions (set up by argsort_f32_i32_cuda): blockDim.x == ncols_pad,
// ncols_pad is the next power of two >= ncols, gridDim.y == nrows.
// Padding indices (>= ncols) compare as "always last", so after the sort the
// first ncols entries are the valid permutation.
template<ggml_sort_order order>
static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int ncols, int ncols_pad) {
    // bitonic sort
    int col = threadIdx.x;
    int row = blockIdx.y;

    // NOTE(review): the launch config uses blockDim.x == ncols_pad, so this
    // guard never fires; if it could, the __syncthreads() calls below would be
    // reached by only part of the block (undefined behavior) — keep the launch
    // invariant in mind when changing the wrapper.
    if (col >= ncols_pad) {
        return;
    }

    const float * x_row = x + row * ncols;
    extern __shared__ int dst_row[];

    // initialize indices
    dst_row[col] = col;

    __syncthreads();

    // standard bitonic network: k is the size of the bitonic sequences being
    // merged, j is the compare-exchange distance within each merge step
    for (int k = 2; k <= ncols_pad; k *= 2) {
        for (int j = k / 2; j > 0; j /= 2) {
            int ixj = col ^ j;
            if (ixj > col) {
                // (col & k) selects the direction of this half of the network;
                // out-of-range padding indices always lose so they sink to the end
                if ((col & k) == 0) {
                    if (dst_row[col] >= ncols ||
                        (dst_row[ixj] < ncols && (order == GGML_SORT_ORDER_ASC ?
                            x_row[dst_row[col]] > x_row[dst_row[ixj]] :
                            x_row[dst_row[col]] < x_row[dst_row[ixj]]))
                    ) {
                        ggml_cuda_swap(dst_row[col], dst_row[ixj]);
                    }
                } else {
                    if (dst_row[ixj] >= ncols ||
                        (dst_row[col] < ncols && (order == GGML_SORT_ORDER_ASC ?
                            x_row[dst_row[col]] < x_row[dst_row[ixj]] :
                            x_row[dst_row[col]] > x_row[dst_row[ixj]]))
                    ) {
                        ggml_cuda_swap(dst_row[col], dst_row[ixj]);
                    }
                }
            }
            // barrier between compare-exchange rounds: all lanes must see the
            // shared-memory swaps of the previous round
            __syncthreads();
        }
    }

    // copy the result to dst without the padding
    if (col < ncols) {
        dst[row * ncols + col] = dst_row[col];
    }
}
59
+
60
// Smallest power of two that is >= x (returns 1 for x <= 1).
static int next_power_of_2(int x) {
    int p = 1;
    while (p < x) {
        p <<= 1;
    }
    return p;
}
67
+
68
// Host-side launcher for k_argsort_f32_i32: one block per row, one thread per
// padded column, row indices kept in dynamic shared memory.
static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) {
    // the bitonic network needs a power-of-2 number of lanes; pad up
    const int ncols_pad = next_power_of_2(ncols);

    const dim3 block_dims(ncols_pad, 1, 1);
    const dim3 block_nums(1, nrows, 1);
    const size_t shared_mem = ncols_pad * sizeof(int);

    // the padded row of indices must fit in shared memory on this device
    GGML_ASSERT(shared_mem <= ggml_cuda_info().devices[ggml_cuda_get_device()].smpb);

    switch (order) {
        case GGML_SORT_ORDER_ASC:
            k_argsort_f32_i32<GGML_SORT_ORDER_ASC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
            break;
        case GGML_SORT_ORDER_DESC:
            k_argsort_f32_i32<GGML_SORT_ORDER_DESC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
            break;
        default:
            GGML_ASSERT(false);
    }
}
86
+
87
// CUDA implementation of GGML_OP_ARGSORT: for each row of src0 (F32,
// contiguous) writes into dst (I32) the permutation of column indices that
// sorts the row in the order requested by dst->op_params[0].
void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    cudaStream_t stream = ctx.stream();

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_I32);
    GGML_ASSERT(ggml_is_contiguous(src0));

    const int64_t ncols = src0->ne[0];
    const int64_t nrows = ggml_nrows(src0);

    const enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];

    argsort_f32_i32_cuda((const float *)src0->data, (int *)dst->data, ncols, nrows, order, stream);
}