llama_cpp 0.15.3 → 0.16.0

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (149)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +16 -0
  3. data/ext/llama_cpp/extconf.rb +1 -2
  4. data/ext/llama_cpp/llama_cpp.cpp +27 -3
  5. data/lib/llama_cpp/version.rb +2 -2
  6. data/sig/llama_cpp.rbs +15 -1
  7. data/vendor/tmp/llama.cpp/Makefile +66 -36
  8. data/vendor/tmp/llama.cpp/ggml-alloc.c +4 -4
  9. data/vendor/tmp/llama.cpp/ggml-backend.c +5 -5
  10. data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
  11. data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu +47 -0
  12. data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu +34 -0
  13. data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +103 -0
  14. data/vendor/tmp/llama.cpp/ggml-cuda/binbcast.cu +280 -0
  15. data/vendor/tmp/llama.cpp/ggml-cuda/clamp.cu +34 -0
  16. data/vendor/tmp/llama.cpp/ggml-cuda/concat.cu +196 -0
  17. data/vendor/tmp/llama.cpp/ggml-cuda/convert.cu +686 -0
  18. data/vendor/tmp/llama.cpp/ggml-cuda/cpy.cu +490 -0
  19. data/vendor/tmp/llama.cpp/ggml-cuda/diagmask.cu +40 -0
  20. data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +662 -0
  21. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +319 -0
  22. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f32.cu +312 -0
  23. data/vendor/tmp/llama.cpp/ggml-cuda/fattn.cu +345 -0
  24. data/vendor/tmp/llama.cpp/ggml-cuda/getrows.cu +178 -0
  25. data/vendor/tmp/llama.cpp/ggml-cuda/im2col.cu +104 -0
  26. data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +1564 -0
  27. data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +404 -0
  28. data/vendor/tmp/llama.cpp/ggml-cuda/norm.cu +221 -0
  29. data/vendor/tmp/llama.cpp/ggml-cuda/pad.cu +49 -0
  30. data/vendor/tmp/llama.cpp/ggml-cuda/pool2d.cu +94 -0
  31. data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +45 -0
  32. data/vendor/tmp/llama.cpp/ggml-cuda/rope.cu +271 -0
  33. data/vendor/tmp/llama.cpp/ggml-cuda/scale.cu +31 -0
  34. data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +205 -0
  35. data/vendor/tmp/llama.cpp/ggml-cuda/sumrows.cu +40 -0
  36. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +5 -0
  37. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +5 -0
  38. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +5 -0
  39. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +5 -0
  40. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +5 -0
  41. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +5 -0
  42. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +5 -0
  43. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +5 -0
  44. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +5 -0
  45. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +5 -0
  46. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +5 -0
  47. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +5 -0
  48. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +5 -0
  49. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +5 -0
  50. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +5 -0
  51. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +5 -0
  52. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +5 -0
  53. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +5 -0
  54. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +5 -0
  55. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +5 -0
  56. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +5 -0
  57. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +5 -0
  58. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +5 -0
  59. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +5 -0
  60. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +5 -0
  61. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +5 -0
  62. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +5 -0
  63. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +5 -0
  64. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +5 -0
  65. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +5 -0
  66. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +5 -0
  67. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +5 -0
  68. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +5 -0
  69. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +5 -0
  70. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +5 -0
  71. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +5 -0
  72. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +5 -0
  73. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +5 -0
  74. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +5 -0
  75. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +5 -0
  76. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +5 -0
  77. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +5 -0
  78. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +5 -0
  79. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +5 -0
  80. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +5 -0
  81. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +5 -0
  82. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +5 -0
  83. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +5 -0
  84. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +5 -0
  85. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +5 -0
  86. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +5 -0
  87. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +5 -0
  88. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +5 -0
  89. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +5 -0
  90. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +5 -0
  91. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +5 -0
  92. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +5 -0
  93. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +5 -0
  94. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +5 -0
  95. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +5 -0
  96. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +5 -0
  97. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +5 -0
  98. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +5 -0
  99. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +5 -0
  100. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +5 -0
  101. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +5 -0
  102. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +5 -0
  103. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +5 -0
  104. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +5 -0
  105. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +5 -0
  106. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +5 -0
  107. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +5 -0
  108. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +5 -0
  109. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +5 -0
  110. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +5 -0
  111. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +5 -0
  112. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +5 -0
  113. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +5 -0
  114. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +5 -0
  115. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +5 -0
  116. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +5 -0
  117. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +5 -0
  118. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +5 -0
  119. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +5 -0
  120. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +5 -0
  121. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +5 -0
  122. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +10 -0
  123. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +9 -0
  124. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +10 -0
  125. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +10 -0
  126. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +8 -0
  127. data/vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu +47 -0
  128. data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +266 -0
  129. data/vendor/tmp/llama.cpp/ggml-cuda/upscale.cu +51 -0
  130. data/vendor/tmp/llama.cpp/ggml-cuda.cu +35 -16
  131. data/vendor/tmp/llama.cpp/ggml-impl.h +4 -0
  132. data/vendor/tmp/llama.cpp/ggml-kompute.cpp +21 -7
  133. data/vendor/tmp/llama.cpp/ggml-metal.h +1 -1
  134. data/vendor/tmp/llama.cpp/ggml-metal.m +99 -35
  135. data/vendor/tmp/llama.cpp/ggml-metal.metal +146 -80
  136. data/vendor/tmp/llama.cpp/ggml-quants.c +101 -11
  137. data/vendor/tmp/llama.cpp/ggml-rpc.cpp +75 -58
  138. data/vendor/tmp/llama.cpp/ggml-sycl.cpp +345 -227
  139. data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +99301 -39793
  140. data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +458 -329
  141. data/vendor/tmp/llama.cpp/ggml.c +301 -409
  142. data/vendor/tmp/llama.cpp/ggml.h +19 -23
  143. data/vendor/tmp/llama.cpp/llama.cpp +855 -651
  144. data/vendor/tmp/llama.cpp/llama.h +28 -48
  145. metadata +121 -6
  146. data/vendor/tmp/llama.cpp/ggml-mpi.c +0 -216
  147. data/vendor/tmp/llama.cpp/ggml-mpi.h +0 -39
  148. data/vendor/tmp/llama.cpp/ggml-opencl.cpp +0 -2305
  149. data/vendor/tmp/llama.cpp/ggml-opencl.h +0 -36
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: d0a9cdf86695522e27b1e8d3ed485dfa6ab3a4fc23d9bd9e44bf8c3cb483c347
- data.tar.gz: 5d97cec87f9b1df94f85f9e18dc46a1b8a4ec593c17d04e4bee0da3d28c34211
+ metadata.gz: 5b79658bc49026edcbd896cac4a1d904060622f2311876afbdba773021399ad1
+ data.tar.gz: 064fa60e433863e6919f0c0acbd238cf5d5712058cb834a139a5e5cf798d095e
  SHA512:
- metadata.gz: 71f26009b872db64d0d0d416153b5fbd6afb598617b701cb6342d099542c962f410bccddf80b77928bfd8ab8f017a749fbc1d2ed488139d806ef0e3cf75a0e42
- data.tar.gz: 808c03f6664af65cadfea23071d0b55d459c119189346762ea9632156f7f35b8d1f0e594b356726fc26abdb1c81a3bce9d697b9ca2d6324c454a31f2a442f0d7
+ metadata.gz: 3248ba69cd0eefcc8b36bdcb03fe13a86da826f4a97a4c61bc62632c2f646647dfaac2b906dd2cb672740c30046e9f588d8e9687b6b8e4bc0a5fc03134d62ec5
+ data.tar.gz: 91164427363b01f805ae3be98a8f44d7aba0e7c437db7daa2b396bf3329398189613036ac4cb4f5d471194edb02485e32529ca1b9c140144332a0e34107d3666
data/CHANGELOG.md CHANGED
@@ -1,3 +1,19 @@
+ ## [[0.16.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.15.4...v0.16.0)] - 2024-06-08
+
+ **Breaking Changes**
+
+ - Bump llama.cpp from b3056 to b3091.
+ - Rename `type` method to `token_attr` in `Model`.
+ - Add constants for token attribute types.
+ - Remove `--with-clblast` and `--with-mpi` config options.
+ - Add `--with-no-openmp` config option.
+
+ ## [[0.15.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.15.3...v0.15.4)] - 2024-06-01
+
+ - Bump llama.cpp from b2988 to b3056.
+ - Add LLAMA_VOCAB_PRE_TYPE_SMAUG constant.
+ - Add `token_is_control?` method to `Model`.
+
  ## [[0.15.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.15.2...v0.15.3)] - 2024-05-25

  - Bump llama.cpp from b2917 to b2988.
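For readers upgrading from 0.15.x, here is a minimal, hypothetical sketch of the renamed token API described in the CHANGELOG entries above. The model path is a placeholder, and the Model/ModelParams construction follows the gem's documented usage rather than anything in this diff; only `token_attr`, `token_is_control?`, and the `LLAMA_TOKEN_ATTR_*` constants are taken from the changes below.

require 'llama_cpp'

# Placeholder model path; ModelParams/Model construction is assumed from the gem's README.
model = LLaMACpp::Model.new(model_path: '/path/to/model.gguf', params: LLaMACpp::ModelParams.new)

bos = model.token_bos
attrs = model.token_attr(bos)                              # formerly model.type(bos) in 0.15.x
puts((attrs & LLaMACpp::LLAMA_TOKEN_ATTR_CONTROL) != 0)    # attribute constants added in 0.16.0
puts(model.token_is_control?(bos))                         # predicate added in 0.15.4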
data/ext/llama_cpp/extconf.rb CHANGED
@@ -17,10 +17,9 @@ make_envs << ' LLAMA_OPENBLAS=1' if with_config('openblas')
  make_envs << ' LLAMA_BLIS=1' if with_config('blis')
  make_envs << ' LLAMA_CUBLAS=1' if with_config('cublas') # Deprecated, use --with-cuda instead
  make_envs << ' LLAMA_CUDA=1' if with_config('cuda')
- make_envs << ' LLAMA_CLBLAST=1' if with_config('clblast')
  make_envs << ' LLAMA_HIPBLAS=1' if with_config('hipblas')
- make_envs << ' LLAMA_MPI=1' if with_config('mpi')
  make_envs << ' LLAMA_VULKAN=1' if with_config('vulkan')
+ make_envs << ' LLAMA_NO_OPENMP=1' if with_config('no-openmp')

  make_envs << ' LLAMA_METAL_EMBED_LIBRARY=1' if RUBY_PLATFORM.match?(/darwin/)

data/ext/llama_cpp/llama_cpp.cpp CHANGED
@@ -1523,7 +1523,7 @@ public:
  rb_define_method(rb_cLLaMAModel, "n_params", RUBY_METHOD_FUNC(_llama_model_get_model_n_params), 0);
  rb_define_method(rb_cLLaMAModel, "text", RUBY_METHOD_FUNC(_llama_model_get_text), 1);
  rb_define_method(rb_cLLaMAModel, "score", RUBY_METHOD_FUNC(_llama_model_get_score), 1);
- rb_define_method(rb_cLLaMAModel, "type", RUBY_METHOD_FUNC(_llama_model_get_type), 1);
+ rb_define_method(rb_cLLaMAModel, "token_attr", RUBY_METHOD_FUNC(_llama_model_get_token_attr), 1);
  rb_define_method(rb_cLLaMAModel, "token_bos", RUBY_METHOD_FUNC(_llama_model_token_bos), 0);
  rb_define_method(rb_cLLaMAModel, "token_eos", RUBY_METHOD_FUNC(_llama_model_token_eos), 0);
  rb_define_method(rb_cLLaMAModel, "token_cls", RUBY_METHOD_FUNC(_llama_model_token_cls), 0);
@@ -1536,6 +1536,7 @@ public:
  rb_define_method(rb_cLLaMAModel, "token_suffix", RUBY_METHOD_FUNC(_llama_model_token_suffix), 0);
  rb_define_method(rb_cLLaMAModel, "token_eot", RUBY_METHOD_FUNC(_llama_model_token_eot), 0);
  rb_define_method(rb_cLLaMAModel, "token_is_eog?", RUBY_METHOD_FUNC(_llama_model_token_is_eog), 1);
+ rb_define_method(rb_cLLaMAModel, "token_is_control?", RUBY_METHOD_FUNC(_llama_model_token_is_control), 1);
  }

  private:
@@ -1777,10 +1778,10 @@ private:
  return DBL2NUM(score);
  }

- static VALUE _llama_model_get_type(VALUE self, VALUE token_) {
+ static VALUE _llama_model_get_token_attr(VALUE self, VALUE token_) {
  LLaMAModelWrapper* ptr = get_llama_model(self);
  const llama_token token = NUM2INT(token_);
- const int type = llama_token_get_type(ptr->model, token);
+ const llama_token_attr type = llama_token_get_attr(ptr->model, token);
  return INT2NUM(type);
  }

@@ -1848,6 +1849,16 @@ private:
  LLaMAModelWrapper* ptr = get_llama_model(self);
  return llama_token_is_eog(ptr->model, token) ? Qtrue : Qfalse;
  }
+
+ static VALUE _llama_model_token_is_control(VALUE self, VALUE token_) {
+ if (!RB_INTEGER_TYPE_P(token_)) {
+ rb_raise(rb_eArgError, "token must be an integer");
+ return Qnil;
+ }
+ const llama_token token = NUM2INT(token_);
+ LLaMAModelWrapper* ptr = get_llama_model(self);
+ return llama_token_is_control(ptr->model, token) ? Qtrue : Qfalse;
+ }
  };

  const rb_data_type_t RbLLaMAModel::llama_model_type = {
@@ -3482,6 +3493,7 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_QWEN2", INT2NUM(LLAMA_VOCAB_PRE_TYPE_QWEN2));
  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_OLMO", INT2NUM(LLAMA_VOCAB_PRE_TYPE_OLMO));
  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_DBRX", INT2NUM(LLAMA_VOCAB_PRE_TYPE_DBRX));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_SMAUG", INT2NUM(LLAMA_VOCAB_PRE_TYPE_SMAUG));

  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_UNDEFINED", INT2NUM(LLAMA_TOKEN_TYPE_UNDEFINED));
  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_NORMAL", INT2NUM(LLAMA_TOKEN_TYPE_NORMAL));
@@ -3491,6 +3503,18 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_UNUSED", INT2NUM(LLAMA_TOKEN_TYPE_UNUSED));
  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_BYTE", INT2NUM(LLAMA_TOKEN_TYPE_BYTE));

+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_UNDEFINED", INT2NUM(LLAMA_TOKEN_ATTR_UNDEFINED));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_UNKNOWN", INT2NUM(LLAMA_TOKEN_ATTR_UNKNOWN));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_UNUSED", INT2NUM(LLAMA_TOKEN_ATTR_UNUSED));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_NORMAL", INT2NUM(LLAMA_TOKEN_ATTR_NORMAL));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_CONTROL", INT2NUM(LLAMA_TOKEN_ATTR_CONTROL));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_USER_DEFINED", INT2NUM(LLAMA_TOKEN_ATTR_USER_DEFINED));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_BYTE", INT2NUM(LLAMA_TOKEN_ATTR_BYTE));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_NORMALIZED", INT2NUM(LLAMA_TOKEN_ATTR_NORMALIZED));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_LSTRIP", INT2NUM(LLAMA_TOKEN_ATTR_LSTRIP));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_RSTRIP", INT2NUM(LLAMA_TOKEN_ATTR_RSTRIP));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_SINGLE_WORD", INT2NUM(LLAMA_TOKEN_ATTR_SINGLE_WORD));
+
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_ALL_F32", INT2NUM(LLAMA_FTYPE_ALL_F32));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_F16", INT2NUM(LLAMA_FTYPE_MOSTLY_F16));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_0", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_0));
data/lib/llama_cpp/version.rb CHANGED
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.15.3'
+ VERSION = '0.16.0'

  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'b2988'
+ LLAMA_CPP_VERSION = 'b3091'
  end
data/sig/llama_cpp.rbs CHANGED
@@ -30,6 +30,19 @@ module LLaMACpp
  LLAMA_VOCAB_PRE_TYPE_QWEN2: Integer
  LLAMA_VOCAB_PRE_TYPE_OLMO: Integer
  LLAMA_VOCAB_PRE_TYPE_DBRX: Integer
+ LLAMA_VOCAB_PRE_TYPE_SMAUG: Integer
+
+ LLAMA_TOKEN_ATTR_UNDEFINED: Integer
+ LLAMA_TOKEN_ATTR_UNKNOWN: Integer
+ LLAMA_TOKEN_ATTR_UNUSED: Integer
+ LLAMA_TOKEN_ATTR_NORMAL: Integer
+ LLAMA_TOKEN_ATTR_CONTROL: Integer
+ LLAMA_TOKEN_ATTR_USER_DEFINED: Integer
+ LLAMA_TOKEN_ATTR_BYTE: Integer
+ LLAMA_TOKEN_ATTR_NORMALIZED: Integer
+ LLAMA_TOKEN_ATTR_LSTRIP: Integer
+ LLAMA_TOKEN_ATTR_RSTRIP: Integer
+ LLAMA_TOKEN_ATTR_SINGLE_WORD: Integer

  LLAMA_FTYPE_ALL_F32: Integer
  LLAMA_FTYPE_MOSTLY_F16: Integer
@@ -146,7 +159,7 @@ module LLaMACpp
  def n_params: () -> Integer
  def text: (Integer) -> String
  def score: (Integer) -> Float
- def type: (Integer) -> Integer
+ def token_attr: (Integer) -> Integer
  def token_bos: () -> Integer
  def token_eos: () -> Integer
  def token_cls: () -> Integer
@@ -159,6 +172,7 @@ module LLaMACpp
  def token_suffix: () -> Integer
  def token_eot: () -> Integer
  def token_is_eog?: (Integer) -> bool
+ def token_is_control?: (Integer) -> bool
  end

  class Timings
data/vendor/tmp/llama.cpp/Makefile CHANGED
@@ -1,7 +1,7 @@
  # Define the default target now so that it is always the first target
  BUILD_TARGETS = \
  main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
- simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama beam-search \
+ simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \
  retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o

  # Binaries only useful for tests
@@ -57,6 +57,8 @@ ifeq ($(UNAME_S),Darwin)
  LLAMA_METAL := 1
  endif

+ LLAMA_NO_OPENMP := 1
+
  ifneq ($(UNAME_P),arm)
  SYSCTL_M := $(shell sysctl -n hw.optional.arm64 2>/dev/null)
  ifeq ($(SYSCTL_M),1)
@@ -67,6 +69,10 @@ ifeq ($(UNAME_S),Darwin)
  endif
  endif

+ ifdef LLAMA_RPC
+ BUILD_TARGETS += rpc-server
+ endif
+
  default: $(BUILD_TARGETS)

  test: $(TEST_TARGETS)
@@ -135,12 +141,16 @@ MK_NVCCFLAGS = -std=c++11
  ifdef LLAMA_FAST
  MK_CFLAGS += -Ofast
  HOST_CXXFLAGS += -Ofast
+ ifndef LLAMA_DEBUG
  MK_NVCCFLAGS += -O3
+ endif # LLAMA_DEBUG
  else
  MK_CFLAGS += -O3
  MK_CXXFLAGS += -O3
+ ifndef LLAMA_DEBUG
  MK_NVCCFLAGS += -O3
- endif
+ endif # LLAMA_DEBUG
+ endif # LLAMA_FAST

  ifndef LLAMA_NO_CCACHE
  CCACHE := $(shell which ccache)
@@ -201,9 +211,10 @@ ifdef LLAMA_SCHED_MAX_COPIES
  endif

  ifdef LLAMA_DEBUG
- MK_CFLAGS += -O0 -g
- MK_CXXFLAGS += -O0 -g
- MK_LDFLAGS += -g
+ MK_CFLAGS += -O0 -g
+ MK_CXXFLAGS += -O0 -g
+ MK_LDFLAGS += -g
+ MK_NVCCFLAGS += -O0 -g

  ifeq ($(UNAME_S),Linux)
  MK_CPPFLAGS += -D_GLIBCXX_ASSERTIONS
@@ -402,6 +413,12 @@ ifndef LLAMA_NO_ACCELERATE
  endif
  endif # LLAMA_NO_ACCELERATE

+ ifndef LLAMA_NO_OPENMP
+ MK_CPPFLAGS += -DGGML_USE_OPENMP
+ MK_CFLAGS += -fopenmp
+ MK_CXXFLAGS += -fopenmp
+ endif # LLAMA_NO_OPENMP
+
  ifdef LLAMA_OPENBLAS
  MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas)
  MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas)
@@ -418,11 +435,25 @@ ifdef LLAMA_BLIS
  MK_LDFLAGS += -lblis -L/usr/local/lib
  endif # LLAMA_BLIS

+ ifdef LLAMA_RPC
+ MK_CPPFLAGS += -DGGML_USE_RPC
+ OBJS += ggml-rpc.o
+ endif # LLAMA_RPC
+
  ifdef LLAMA_CUBLAS
  # LLAMA_CUBLAS is deprecated and will be removed in the future
  LLAMA_CUDA := 1
  endif

+ OBJS_CUDA_TEMP_INST = $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-wmma*.cu))
+ ifdef LLAMA_CUDA_FA_ALL_QUANTS
+ OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*.cu))
+ else
+ OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu))
+ OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu))
+ OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*f16-f16.cu))
+ endif # LLAMA_CUDA_FA_ALL_QUANTS
+
  ifdef LLAMA_CUDA
  ifneq ('', '$(wildcard /opt/cuda)')
  CUDA_PATH ?= /opt/cuda
@@ -433,6 +464,7 @@ ifdef LLAMA_CUDA
  MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
  OBJS += ggml-cuda.o
  OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
+ OBJS += $(OBJS_CUDA_TEMP_INST)
  MK_NVCCFLAGS += -use_fast_math
  ifdef LLAMA_FATAL_WARNINGS
  MK_NVCCFLAGS += -Werror all-warnings
@@ -443,6 +475,9 @@ endif # JETSON_EOL_MODULE_DETECT
  ifdef LLAMA_DEBUG
  MK_NVCCFLAGS += -lineinfo
  endif # LLAMA_DEBUG
+ ifdef LLAMA_CUDA_DEBUG
+ MK_NVCCFLAGS += --device-debug
+ endif # LLAMA_CUDA_DEBUG
  ifdef LLAMA_CUDA_NVCC
  NVCC = $(CCACHE) $(LLAMA_CUDA_NVCC)
  else
@@ -492,7 +527,10 @@ ifdef LLAMA_CUDA_NO_PEER_COPY
  endif # LLAMA_CUDA_NO_PEER_COPY
  ifdef LLAMA_CUDA_CCBIN
  MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
- endif
+ endif # LLAMA_CUDA_CCBIN
+ ifdef LLAMA_CUDA_FA_ALL_QUANTS
+ MK_NVCCFLAGS += -DGGML_CUDA_FA_ALL_QUANTS
+ endif # LLAMA_CUDA_FA_ALL_QUANTS

  ifdef JETSON_EOL_MODULE_DETECT
  define NVCC_COMPILE
@@ -504,30 +542,13 @@ define NVCC_COMPILE
  endef # NVCC_COMPILE
  endif # JETSON_EOL_MODULE_DETECT

- ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
+ ggml-cuda/%.o: ggml-cuda/%.cu ggml.h ggml-common.h ggml-cuda/common.cuh
  $(NVCC_COMPILE)

  ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
  $(NVCC_COMPILE)
  endif # LLAMA_CUDA

- ifdef LLAMA_CLBLAST
- MK_CPPFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags-only-I clblast OpenCL)
- MK_CFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
- MK_CXXFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
-
- # Mac provides OpenCL as a framework
- ifeq ($(UNAME_S),Darwin)
- MK_LDFLAGS += -lclblast -framework OpenCL
- else
- MK_LDFLAGS += $(shell pkg-config --libs clblast OpenCL)
- endif
- OBJS += ggml-opencl.o
-
- ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
- $(CXX) $(CXXFLAGS) -c $< -o $@
- endif # LLAMA_CLBLAST
-
  ifdef LLAMA_VULKAN
  MK_CPPFLAGS += -DGGML_USE_VULKAN
  MK_LDFLAGS += -lvulkan
@@ -570,6 +591,7 @@ ifdef LLAMA_HIP_UMA
  MK_CPPFLAGS += -DGGML_HIP_UMA
  endif # LLAMA_HIP_UMA
  MK_LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
+ MK_LDFLAGS += -L$(ROCM_PATH)/lib64 -Wl,-rpath=$(ROCM_PATH)/lib64
  MK_LDFLAGS += -lhipblas -lamdhip64 -lrocblas
  HIPFLAGS += $(addprefix --offload-arch=,$(AMDGPU_TARGETS))
  HIPFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
@@ -583,11 +605,12 @@ ifdef LLAMA_CUDA_NO_PEER_COPY
  endif # LLAMA_CUDA_NO_PEER_COPY
  OBJS += ggml-cuda.o
  OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
+ OBJS += $(OBJS_CUDA_TEMP_INST)

  ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
  $(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<

- ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
+ ggml-cuda/%.o: ggml-cuda/%.cu ggml.h ggml-common.h ggml-cuda/common.cuh
  $(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<

  endif # LLAMA_HIPBLAS
@@ -625,11 +648,26 @@ ggml-metal-embed.o: ggml-metal.metal ggml-common.h
  endif
  endif # LLAMA_METAL

+ OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
+ COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
+ COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
+
  ifndef LLAMA_NO_LLAMAFILE
  sgemm.o: sgemm.cpp sgemm.h ggml.h
  $(CXX) $(CXXFLAGS) -c $< -o $@
  endif

+ ifdef LLAMA_RPC
+ ggml-rpc.o: ggml-rpc.cpp ggml-rpc.h
+ $(CXX) $(CXXFLAGS) -c $< -o $@
+
+ rpc-server.o: examples/rpc/rpc-server.cpp ggml-rpc.h
+ $(CXX) $(CXXFLAGS) -c $< -o $@
+
+ rpc-server: rpc-server.o ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+ $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
+ endif # LLAMA_RPC
+
  GF_CC := $(CC)
  include scripts/get-flags.mk

@@ -709,14 +747,9 @@ unicode.o: unicode.cpp unicode.h
  unicode-data.o: unicode-data.cpp unicode-data.h
  $(CXX) $(CXXFLAGS) -c $< -o $@

- OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
-
  llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
  $(CXX) $(CXXFLAGS) -c $< -o $@

- COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
- COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
-
  common.o: common/common.cpp $(COMMON_H_DEPS)
  $(CXX) $(CXXFLAGS) -c $< -o $@

@@ -749,8 +782,9 @@ lib: llama.o ggml.o $(OBJS)
  ar rcs libllama.a $^

  clean:
- rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
+ rm -vrf *.o tests/*.o *.so *.a *.dll *.dylib benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
  rm -vrf ggml-cuda/*.o
+ rm -vrf ggml-cuda/template-instances/*.o

  #
  # Examples
@@ -818,7 +852,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

- server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+ server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/colorthemes.css.hpp examples/server/style.css.hpp examples/server/theme-beeninorder.css.hpp examples/server/theme-ketivah.css.hpp examples/server/theme-mangotango.css.hpp examples/server/theme-playground.css.hpp examples/server/theme-polarnight.css.hpp examples/server/theme-snowstorm.css.hpp examples/server/index.html.hpp examples/server/index-new.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/system-prompts.js.hpp examples/server/prompt-formats.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)

@@ -868,10 +902,6 @@ baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) tra
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

- beam-search: examples/beam-search/beam-search.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
- $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
- $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
  finetune: examples/finetune/finetune.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
data/vendor/tmp/llama.cpp/ggml-alloc.c CHANGED
@@ -377,7 +377,7 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
  galloc->bufts = calloc(n_bufs, sizeof(ggml_backend_buffer_type_t));
  GGML_ASSERT(galloc->bufts != NULL);

- galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t) * n_bufs);
+ galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t));
  GGML_ASSERT(galloc->buffers != NULL);

  galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
@@ -750,7 +750,7 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
  // this tensor was allocated without ggml-backend
  return;
  }
- ggml_backend_view_init(galloc->buffers[buffer_id], tensor);
+ ggml_backend_view_init(tensor);
  }
  } else {
  if (tensor->data == NULL) {
@@ -899,12 +899,12 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
  if (t->view_src == NULL) {
  ggml_tallocr_alloc(&tallocr, t);
  } else if (t->buffer == NULL) {
- ggml_backend_view_init(buffer, t);
+ ggml_backend_view_init(t);
  }
  } else {
  if (t->view_src != NULL && t->buffer == NULL) {
  // view of a pre-allocated tensor
- ggml_backend_view_init(buffer, t);
+ ggml_backend_view_init(t);
  }
  }
  }
data/vendor/tmp/llama.cpp/ggml-backend.c CHANGED
@@ -151,7 +151,7 @@ void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer) {
  bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst) {
  ggml_backend_buffer_t dst_buf = dst->view_src ? dst->view_src->buffer : dst->buffer;
  if (dst_buf->iface.cpy_tensor) {
- return src->buffer->iface.cpy_tensor(dst_buf, src, dst);
+ return dst_buf->iface.cpy_tensor(dst_buf, src, dst);
  }
  return false;
  }
@@ -1887,15 +1887,15 @@ ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched,

  // utils

- void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
+ void ggml_backend_view_init(struct ggml_tensor * tensor) {
  GGML_ASSERT(tensor->buffer == NULL);
  GGML_ASSERT(tensor->view_src != NULL);
  GGML_ASSERT(tensor->view_src->buffer != NULL);
  GGML_ASSERT(tensor->view_src->data != NULL);

- tensor->buffer = buffer;
+ tensor->buffer = tensor->view_src->buffer;
  tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
- ggml_backend_buffer_init_tensor(buffer, tensor);
+ ggml_backend_buffer_init_tensor(tensor->buffer, tensor);
  }

  void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) {
@@ -1954,7 +1954,7 @@ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_te
  struct ggml_tensor * dst = node_copies[id];
  if (dst->view_src != NULL) {
  graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
- ggml_backend_view_init(dst->view_src->buffer, dst);
+ ggml_backend_view_init(dst);
  }
  else {
  ggml_backend_tensor_copy(src, dst);
data/vendor/tmp/llama.cpp/ggml-backend.h CHANGED
@@ -225,7 +225,7 @@ extern "C" {

  // Tensor initialization
  GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
- GGML_API void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+ GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);


  #ifdef __cplusplus
data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu ADDED
@@ -0,0 +1,47 @@
+ #include "acc.cuh"
+
+ static __global__ void acc_f32(const float * x, const float * y, float * dst, const int ne,
+ const int ne10, const int ne11, const int ne12,
+ const int nb1, const int nb2, int offset) {
+ const int i = blockDim.x * blockIdx.x + threadIdx.x;
+ if (i >= ne) {
+ return;
+ }
+ int src1_idx = i - offset;
+ int oz = src1_idx / nb2;
+ int oy = (src1_idx - (oz * nb2)) / nb1;
+ int ox = src1_idx % nb1;
+ if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) {
+ dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11];
+ } else {
+ dst[i] = x[i];
+ }
+ }
+
+ static void acc_f32_cuda(const float * x, const float * y, float * dst, const int n_elements,
+ const int ne10, const int ne11, const int ne12,
+ const int nb1, const int nb2, const int offset, cudaStream_t stream) {
+ int num_blocks = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE;
+ acc_f32<<<num_blocks, CUDA_ACC_BLOCK_SIZE, 0, stream>>>(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset);
+ }
+
+ void ggml_cuda_op_acc(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+ const ggml_tensor * src0 = dst->src[0];
+ const ggml_tensor * src1 = dst->src[1];
+ const float * src0_d = (const float *)src0->data;
+ const float * src1_d = (const float *)src1->data;
+ float * dst_d = (float *)dst->data;
+ cudaStream_t stream = ctx.stream();
+
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
+ GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported
+
+ int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
+ int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
+ // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused
+ int offset = dst->op_params[3] / 4; // offset in bytes
+
+ acc_f32_cuda(src0_d, src1_d, dst_d, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, stream);
+ }
data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu ADDED
@@ -0,0 +1,34 @@
+ #include "arange.cuh"
+
+ static __global__ void arange_f32(float * dst, const int ne0, const float start, const float step) {
+ // blockIDx.x: idx of ne0 / BLOCK_SIZE
+ int nidx = threadIdx.x + blockIdx.x * blockDim.x;
+ if (nidx >= ne0) {
+ return;
+ }
+ dst[nidx] = start + step * nidx;
+ }
+
+ static void arange_f32_cuda(float * dst, const int ne0, const float start, const float step, cudaStream_t stream) {
+ int num_blocks = (ne0 + CUDA_ARANGE_BLOCK_SIZE - 1) / CUDA_ARANGE_BLOCK_SIZE;
+ arange_f32<<<num_blocks, CUDA_ARANGE_BLOCK_SIZE, 0, stream>>>(dst, ne0, start, step);
+ }
+
+ void ggml_cuda_op_arange(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+ float * dst_d = (float *)dst->data;
+ cudaStream_t stream = ctx.stream();
+
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+ float start;
+ float stop;
+ float step;
+ memcpy(&start, (float *)dst->op_params + 0, sizeof(float));
+ memcpy(&stop, (float *)dst->op_params + 1, sizeof(float));
+ memcpy(&step, (float *)dst->op_params + 2, sizeof(float));
+
+ int64_t steps = (int64_t)ceil((stop - start) / step);
+ GGML_ASSERT(ggml_nelements(dst) == steps);
+
+ arange_f32_cuda(dst_d, dst->ne[0], start, step, stream);
+ }