llama_cpp 0.15.3 → 0.16.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/ext/llama_cpp/extconf.rb +1 -2
- data/ext/llama_cpp/llama_cpp.cpp +27 -3
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +15 -1
- data/vendor/tmp/llama.cpp/Makefile +66 -36
- data/vendor/tmp/llama.cpp/ggml-alloc.c +4 -4
- data/vendor/tmp/llama.cpp/ggml-backend.c +5 -5
- data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu +47 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu +34 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +103 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/binbcast.cu +280 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/clamp.cu +34 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/concat.cu +196 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/convert.cu +686 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/cpy.cu +490 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/diagmask.cu +40 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +662 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +319 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f32.cu +312 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn.cu +345 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/getrows.cu +178 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/im2col.cu +104 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +1564 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +404 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/norm.cu +221 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/pad.cu +49 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/pool2d.cu +94 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +45 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/rope.cu +271 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/scale.cu +31 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +205 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/sumrows.cu +40 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +10 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +9 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +10 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +10 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +8 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu +47 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +266 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/upscale.cu +51 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +35 -16
- data/vendor/tmp/llama.cpp/ggml-impl.h +4 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +21 -7
- data/vendor/tmp/llama.cpp/ggml-metal.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +99 -35
- data/vendor/tmp/llama.cpp/ggml-metal.metal +146 -80
- data/vendor/tmp/llama.cpp/ggml-quants.c +101 -11
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +75 -58
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +345 -227
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +99301 -39793
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +458 -329
- data/vendor/tmp/llama.cpp/ggml.c +301 -409
- data/vendor/tmp/llama.cpp/ggml.h +19 -23
- data/vendor/tmp/llama.cpp/llama.cpp +855 -651
- data/vendor/tmp/llama.cpp/llama.h +28 -48
- metadata +121 -6
- data/vendor/tmp/llama.cpp/ggml-mpi.c +0 -216
- data/vendor/tmp/llama.cpp/ggml-mpi.h +0 -39
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +0 -2305
- data/vendor/tmp/llama.cpp/ggml-opencl.h +0 -36
checksums.yaml
CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 5b79658bc49026edcbd896cac4a1d904060622f2311876afbdba773021399ad1
+  data.tar.gz: 064fa60e433863e6919f0c0acbd238cf5d5712058cb834a139a5e5cf798d095e
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 3248ba69cd0eefcc8b36bdcb03fe13a86da826f4a97a4c61bc62632c2f646647dfaac2b906dd2cb672740c30046e9f588d8e9687b6b8e4bc0a5fc03134d62ec5
+  data.tar.gz: 91164427363b01f805ae3be98a8f44d7aba0e7c437db7daa2b396bf3329398189613036ac4cb4f5d471194edb02485e32529ca1b9c140144332a0e34107d3666
data/CHANGELOG.md
CHANGED

@@ -1,3 +1,19 @@
+## [[0.16.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.15.4...v0.16.0)] - 2024-06-08
+
+**Breaking Changes**
+
+- Bump llama.cpp from b3056 to b3091.
+- Rename `type` method to `token_attr` in `Model`.
+- Add constants for token attribute types.
+- Remove `--with-clblast` and `--with-mpi` config options.
+- Add `--with-no-openmp` config option.
+
+## [[0.15.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.15.3...v0.15.4)] - 2024-06-01
+
+- Bump llama.cpp from b2988 to b3056.
+- Add LLAMA_VOCAB_PRE_TYPE_SMAUG constant.
+- Add `token_is_control?` method to `Model`.
+
 ## [[0.15.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.15.2...v0.15.3)] - 2024-05-25
 
 - Bump llama.cpp from b2917 to b2988.
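The renamed `token_attr` method and the `token_is_control?` predicate from these entries can be exercised directly from Ruby. A minimal sketch (the GGUF path is a placeholder, the `ModelParams`-based constructor follows the 0.15.x README, and treating the attribute constants as bit flags mirrors llama.cpp's `llama_token_attr` enum rather than anything stated in this diff):

```ruby
require 'llama_cpp'

# Hypothetical model path; any local GGUF file would do.
model = LLaMACpp::Model.new(model_path: 'models/model.gguf', params: LLaMACpp::ModelParams.new)

bos  = model.token_bos
attr = model.token_attr(bos) # replaces the old Model#type

# Assumed bit-flag check against the new constants.
puts 'BOS carries the CONTROL attribute' if (attr & LLaMACpp::LLAMA_TOKEN_ATTR_CONTROL) != 0

# token_is_control? (added in 0.15.4) answers the same question directly.
puts model.token_is_control?(bos)
```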
data/ext/llama_cpp/extconf.rb
CHANGED

@@ -17,10 +17,9 @@ make_envs << ' LLAMA_OPENBLAS=1' if with_config('openblas')
 make_envs << ' LLAMA_BLIS=1' if with_config('blis')
 make_envs << ' LLAMA_CUBLAS=1' if with_config('cublas') # Deprecated, use --with-cuda instead
 make_envs << ' LLAMA_CUDA=1' if with_config('cuda')
-make_envs << ' LLAMA_CLBLAST=1' if with_config('clblast')
 make_envs << ' LLAMA_HIPBLAS=1' if with_config('hipblas')
-make_envs << ' LLAMA_MPI=1' if with_config('mpi')
 make_envs << ' LLAMA_VULKAN=1' if with_config('vulkan')
+make_envs << ' LLAMA_NO_OPENMP=1' if with_config('no-openmp')
 
 make_envs << ' LLAMA_METAL_EMBED_LIBRARY=1' if RUBY_PLATFORM.match?(/darwin/)
 
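With these lines removed, `--with-clblast` and `--with-mpi` no longer have any effect at install time; OpenMP is enabled by default and can be turned off via the new option, e.g. `gem install llama_cpp -- --with-no-openmp` (the install command illustrates the gem's usual `--with-*` convention and is not part of this diff).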
data/ext/llama_cpp/llama_cpp.cpp
CHANGED

@@ -1523,7 +1523,7 @@ public:
     rb_define_method(rb_cLLaMAModel, "n_params", RUBY_METHOD_FUNC(_llama_model_get_model_n_params), 0);
     rb_define_method(rb_cLLaMAModel, "text", RUBY_METHOD_FUNC(_llama_model_get_text), 1);
     rb_define_method(rb_cLLaMAModel, "score", RUBY_METHOD_FUNC(_llama_model_get_score), 1);
-    rb_define_method(rb_cLLaMAModel, "
+    rb_define_method(rb_cLLaMAModel, "token_attr", RUBY_METHOD_FUNC(_llama_model_get_token_attr), 1);
     rb_define_method(rb_cLLaMAModel, "token_bos", RUBY_METHOD_FUNC(_llama_model_token_bos), 0);
     rb_define_method(rb_cLLaMAModel, "token_eos", RUBY_METHOD_FUNC(_llama_model_token_eos), 0);
     rb_define_method(rb_cLLaMAModel, "token_cls", RUBY_METHOD_FUNC(_llama_model_token_cls), 0);
@@ -1536,6 +1536,7 @@ public:
     rb_define_method(rb_cLLaMAModel, "token_suffix", RUBY_METHOD_FUNC(_llama_model_token_suffix), 0);
     rb_define_method(rb_cLLaMAModel, "token_eot", RUBY_METHOD_FUNC(_llama_model_token_eot), 0);
     rb_define_method(rb_cLLaMAModel, "token_is_eog?", RUBY_METHOD_FUNC(_llama_model_token_is_eog), 1);
+    rb_define_method(rb_cLLaMAModel, "token_is_control?", RUBY_METHOD_FUNC(_llama_model_token_is_control), 1);
   }
 
 private:
@@ -1777,10 +1778,10 @@ private:
     return DBL2NUM(score);
   }
 
-  static VALUE
+  static VALUE _llama_model_get_token_attr(VALUE self, VALUE token_) {
     LLaMAModelWrapper* ptr = get_llama_model(self);
     const llama_token token = NUM2INT(token_);
-    const
+    const llama_token_attr type = llama_token_get_attr(ptr->model, token);
     return INT2NUM(type);
   }
 
@@ -1848,6 +1849,16 @@ private:
     LLaMAModelWrapper* ptr = get_llama_model(self);
     return llama_token_is_eog(ptr->model, token) ? Qtrue : Qfalse;
   }
+
+  static VALUE _llama_model_token_is_control(VALUE self, VALUE token_) {
+    if (!RB_INTEGER_TYPE_P(token_)) {
+      rb_raise(rb_eArgError, "token must be an integer");
+      return Qnil;
+    }
+    const llama_token token = NUM2INT(token_);
+    LLaMAModelWrapper* ptr = get_llama_model(self);
+    return llama_token_is_control(ptr->model, token) ? Qtrue : Qfalse;
+  }
 };
 
 const rb_data_type_t RbLLaMAModel::llama_model_type = {
@@ -3482,6 +3493,7 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_QWEN2", INT2NUM(LLAMA_VOCAB_PRE_TYPE_QWEN2));
   rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_OLMO", INT2NUM(LLAMA_VOCAB_PRE_TYPE_OLMO));
   rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_DBRX", INT2NUM(LLAMA_VOCAB_PRE_TYPE_DBRX));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_SMAUG", INT2NUM(LLAMA_VOCAB_PRE_TYPE_SMAUG));
 
   rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_UNDEFINED", INT2NUM(LLAMA_TOKEN_TYPE_UNDEFINED));
   rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_NORMAL", INT2NUM(LLAMA_TOKEN_TYPE_NORMAL));
@@ -3491,6 +3503,18 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_UNUSED", INT2NUM(LLAMA_TOKEN_TYPE_UNUSED));
   rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_BYTE", INT2NUM(LLAMA_TOKEN_TYPE_BYTE));
 
+  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_UNDEFINED", INT2NUM(LLAMA_TOKEN_ATTR_UNDEFINED));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_UNKNOWN", INT2NUM(LLAMA_TOKEN_ATTR_UNKNOWN));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_UNUSED", INT2NUM(LLAMA_TOKEN_ATTR_UNUSED));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_NORMAL", INT2NUM(LLAMA_TOKEN_ATTR_NORMAL));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_CONTROL", INT2NUM(LLAMA_TOKEN_ATTR_CONTROL));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_USER_DEFINED", INT2NUM(LLAMA_TOKEN_ATTR_USER_DEFINED));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_BYTE", INT2NUM(LLAMA_TOKEN_ATTR_BYTE));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_NORMALIZED", INT2NUM(LLAMA_TOKEN_ATTR_NORMALIZED));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_LSTRIP", INT2NUM(LLAMA_TOKEN_ATTR_LSTRIP));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_RSTRIP", INT2NUM(LLAMA_TOKEN_ATTR_RSTRIP));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_SINGLE_WORD", INT2NUM(LLAMA_TOKEN_ATTR_SINGLE_WORD));
+
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_ALL_F32", INT2NUM(LLAMA_FTYPE_ALL_F32));
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_F16", INT2NUM(LLAMA_FTYPE_MOSTLY_F16));
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_0", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_0));
data/lib/llama_cpp/version.rb
CHANGED

@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.
+  VERSION = '0.16.0'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = '
+  LLAMA_CPP_VERSION = 'b3091'
 end
data/sig/llama_cpp.rbs
CHANGED

@@ -30,6 +30,19 @@ module LLaMACpp
   LLAMA_VOCAB_PRE_TYPE_QWEN2: Integer
   LLAMA_VOCAB_PRE_TYPE_OLMO: Integer
   LLAMA_VOCAB_PRE_TYPE_DBRX: Integer
+  LLAMA_VOCAB_PRE_TYPE_SMAUG: Integer
+
+  LLAMA_TOKEN_ATTR_UNDEFINED: Integer
+  LLAMA_TOKEN_ATTR_UNKNOWN: Integer
+  LLAMA_TOKEN_ATTR_UNUSED: Integer
+  LLAMA_TOKEN_ATTR_NORMAL: Integer
+  LLAMA_TOKEN_ATTR_CONTROL: Integer
+  LLAMA_TOKEN_ATTR_USER_DEFINED: Integer
+  LLAMA_TOKEN_ATTR_BYTE: Integer
+  LLAMA_TOKEN_ATTR_NORMALIZED: Integer
+  LLAMA_TOKEN_ATTR_LSTRIP: Integer
+  LLAMA_TOKEN_ATTR_RSTRIP: Integer
+  LLAMA_TOKEN_ATTR_SINGLE_WORD: Integer
 
   LLAMA_FTYPE_ALL_F32: Integer
   LLAMA_FTYPE_MOSTLY_F16: Integer
@@ -146,7 +159,7 @@ module LLaMACpp
     def n_params: () -> Integer
     def text: (Integer) -> String
     def score: (Integer) -> Float
-    def
+    def token_attr: (Integer) -> Integer
     def token_bos: () -> Integer
     def token_eos: () -> Integer
     def token_cls: () -> Integer
@@ -159,6 +172,7 @@ module LLaMACpp
     def token_suffix: () -> Integer
     def token_eot: () -> Integer
     def token_is_eog?: (Integer) -> bool
+    def token_is_control?: (Integer) -> bool
   end
 
   class Timings
data/vendor/tmp/llama.cpp/Makefile
CHANGED

@@ -1,7 +1,7 @@
 # Define the default target now so that it is always the first target
 BUILD_TARGETS = \
 	main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
-	simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama
+	simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \
 	retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
 
 # Binaries only useful for tests
@@ -57,6 +57,8 @@ ifeq ($(UNAME_S),Darwin)
 	LLAMA_METAL := 1
 endif
 
+	LLAMA_NO_OPENMP := 1
+
 ifneq ($(UNAME_P),arm)
 	SYSCTL_M := $(shell sysctl -n hw.optional.arm64 2>/dev/null)
 	ifeq ($(SYSCTL_M),1)
@@ -67,6 +69,10 @@ ifeq ($(UNAME_S),Darwin)
 	endif
 endif
 
+ifdef LLAMA_RPC
+	BUILD_TARGETS += rpc-server
+endif
+
 default: $(BUILD_TARGETS)
 
 test: $(TEST_TARGETS)
@@ -135,12 +141,16 @@ MK_NVCCFLAGS = -std=c++11
 ifdef LLAMA_FAST
 MK_CFLAGS     += -Ofast
 HOST_CXXFLAGS += -Ofast
+ifndef LLAMA_DEBUG
 MK_NVCCFLAGS  += -O3
+endif # LLAMA_DEBUG
 else
 MK_CFLAGS     += -O3
 MK_CXXFLAGS   += -O3
+ifndef LLAMA_DEBUG
 MK_NVCCFLAGS  += -O3
-endif
+endif # LLAMA_DEBUG
+endif # LLAMA_FAST
 
 ifndef LLAMA_NO_CCACHE
 CCACHE := $(shell which ccache)
@@ -201,9 +211,10 @@ ifdef LLAMA_SCHED_MAX_COPIES
 endif
 
 ifdef LLAMA_DEBUG
-	MK_CFLAGS
-	MK_CXXFLAGS
-	MK_LDFLAGS
+	MK_CFLAGS    += -O0 -g
+	MK_CXXFLAGS  += -O0 -g
+	MK_LDFLAGS   += -g
+	MK_NVCCFLAGS += -O0 -g
 
 	ifeq ($(UNAME_S),Linux)
 		MK_CPPFLAGS += -D_GLIBCXX_ASSERTIONS
@@ -402,6 +413,12 @@ ifndef LLAMA_NO_ACCELERATE
 	endif
 endif # LLAMA_NO_ACCELERATE
 
+ifndef LLAMA_NO_OPENMP
+	MK_CPPFLAGS += -DGGML_USE_OPENMP
+	MK_CFLAGS   += -fopenmp
+	MK_CXXFLAGS += -fopenmp
+endif # LLAMA_NO_OPENMP
+
 ifdef LLAMA_OPENBLAS
 	MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas)
 	MK_CFLAGS   += $(shell pkg-config --cflags-only-other openblas)
@@ -418,11 +435,25 @@ ifdef LLAMA_BLIS
 	MK_LDFLAGS  += -lblis -L/usr/local/lib
 endif # LLAMA_BLIS
 
+ifdef LLAMA_RPC
+	MK_CPPFLAGS += -DGGML_USE_RPC
+	OBJS        += ggml-rpc.o
+endif # LLAMA_RPC
+
 ifdef LLAMA_CUBLAS
 # LLAMA_CUBLAS is deprecated and will be removed in the future
 	LLAMA_CUDA := 1
 endif
 
+OBJS_CUDA_TEMP_INST = $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-wmma*.cu))
+ifdef LLAMA_CUDA_FA_ALL_QUANTS
+	OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*.cu))
+else
+	OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu))
+	OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu))
+	OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*f16-f16.cu))
+endif # LLAMA_CUDA_FA_ALL_QUANTS
+
 ifdef LLAMA_CUDA
 	ifneq ('', '$(wildcard /opt/cuda)')
 		CUDA_PATH ?= /opt/cuda
@@ -433,6 +464,7 @@ ifdef LLAMA_CUDA
 	MK_LDFLAGS   += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
 	OBJS         += ggml-cuda.o
 	OBJS         += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
+	OBJS         += $(OBJS_CUDA_TEMP_INST)
 	MK_NVCCFLAGS += -use_fast_math
 ifdef LLAMA_FATAL_WARNINGS
 	MK_NVCCFLAGS += -Werror all-warnings
@@ -443,6 +475,9 @@ endif # JETSON_EOL_MODULE_DETECT
 ifdef LLAMA_DEBUG
 	MK_NVCCFLAGS += -lineinfo
 endif # LLAMA_DEBUG
+ifdef LLAMA_CUDA_DEBUG
+	MK_NVCCFLAGS += --device-debug
+endif # LLAMA_CUDA_DEBUG
 ifdef LLAMA_CUDA_NVCC
 	NVCC = $(CCACHE) $(LLAMA_CUDA_NVCC)
 else
@@ -492,7 +527,10 @@ ifdef LLAMA_CUDA_NO_PEER_COPY
 endif # LLAMA_CUDA_NO_PEER_COPY
 ifdef LLAMA_CUDA_CCBIN
 	MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
-endif
+endif # LLAMA_CUDA_CCBIN
+ifdef LLAMA_CUDA_FA_ALL_QUANTS
+	MK_NVCCFLAGS += -DGGML_CUDA_FA_ALL_QUANTS
+endif # LLAMA_CUDA_FA_ALL_QUANTS
 
 ifdef JETSON_EOL_MODULE_DETECT
 define NVCC_COMPILE
@@ -504,30 +542,13 @@ define NVCC_COMPILE
 endef # NVCC_COMPILE
 endif # JETSON_EOL_MODULE_DETECT
 
-ggml-cuda/%.o: ggml-cuda/%.cu ggml
+ggml-cuda/%.o: ggml-cuda/%.cu ggml.h ggml-common.h ggml-cuda/common.cuh
 	$(NVCC_COMPILE)
 
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
 	$(NVCC_COMPILE)
 endif # LLAMA_CUDA
 
-ifdef LLAMA_CLBLAST
-	MK_CPPFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags-only-I clblast OpenCL)
-	MK_CFLAGS   += $(shell pkg-config --cflags-only-other clblast OpenCL)
-	MK_CXXFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
-
-	# Mac provides OpenCL as a framework
-	ifeq ($(UNAME_S),Darwin)
-		MK_LDFLAGS += -lclblast -framework OpenCL
-	else
-		MK_LDFLAGS += $(shell pkg-config --libs clblast OpenCL)
-	endif
-	OBJS += ggml-opencl.o
-
-ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-endif # LLAMA_CLBLAST
-
 ifdef LLAMA_VULKAN
 	MK_CPPFLAGS  += -DGGML_USE_VULKAN
 	MK_LDFLAGS += -lvulkan
@@ -570,6 +591,7 @@ ifdef LLAMA_HIP_UMA
 	MK_CPPFLAGS += -DGGML_HIP_UMA
 endif # LLAMA_HIP_UMA
 	MK_LDFLAGS  += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
+	MK_LDFLAGS  += -L$(ROCM_PATH)/lib64 -Wl,-rpath=$(ROCM_PATH)/lib64
 	MK_LDFLAGS  += -lhipblas -lamdhip64 -lrocblas
 	HIPFLAGS    += $(addprefix --offload-arch=,$(AMDGPU_TARGETS))
 	HIPFLAGS    += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
@@ -583,11 +605,12 @@ ifdef LLAMA_CUDA_NO_PEER_COPY
 endif # LLAMA_CUDA_NO_PEER_COPY
 	OBJS += ggml-cuda.o
 	OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
+	OBJS += $(OBJS_CUDA_TEMP_INST)
 
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
 	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
 
-ggml-cuda/%.o: ggml-cuda/%.cu ggml
+ggml-cuda/%.o: ggml-cuda/%.cu ggml.h ggml-common.h ggml-cuda/common.cuh
 	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
 
 endif # LLAMA_HIPBLAS
@@ -625,11 +648,26 @@ ggml-metal-embed.o: ggml-metal.metal ggml-common.h
 endif
 endif # LLAMA_METAL
 
+OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
+COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
+COMMON_DEPS   = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
+
 ifndef LLAMA_NO_LLAMAFILE
 sgemm.o: sgemm.cpp sgemm.h ggml.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 endif
 
+ifdef LLAMA_RPC
+ggml-rpc.o: ggml-rpc.cpp ggml-rpc.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+rpc-server.o: examples/rpc/rpc-server.cpp ggml-rpc.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+rpc-server: rpc-server.o ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
+endif # LLAMA_RPC
+
 GF_CC := $(CC)
 include scripts/get-flags.mk
 
@@ -709,14 +747,9 @@ unicode.o: unicode.cpp unicode.h
 unicode-data.o: unicode-data.cpp unicode-data.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
-OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
-
 llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
-COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
-COMMON_DEPS   = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
-
 common.o: common/common.cpp $(COMMON_H_DEPS)
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
@@ -749,8 +782,9 @@ lib: llama.o ggml.o $(OBJS)
 	ar rcs libllama.a $^
 
 clean:
-	rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
+	rm -vrf *.o tests/*.o *.so *.a *.dll *.dylib benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
 	rm -vrf ggml-cuda/*.o
+	rm -vrf ggml-cuda/template-instances/*.o
 
 #
 # Examples
@@ -818,7 +852,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/colorthemes.css.hpp examples/server/style.css.hpp examples/server/theme-beeninorder.css.hpp examples/server/theme-ketivah.css.hpp examples/server/theme-mangotango.css.hpp examples/server/theme-playground.css.hpp examples/server/theme-polarnight.css.hpp examples/server/theme-snowstorm.css.hpp examples/server/index.html.hpp examples/server/index-new.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/system-prompts.js.hpp examples/server/prompt-formats.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
 
@@ -868,10 +902,6 @@ baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) tra
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-beam-search: examples/beam-search/beam-search.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
 finetune: examples/finetune/finetune.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
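Together with the new `ggml-cuda/template-instances/*.cu` files listed at the top of this diff, the Makefile now always compiles the FlashAttention WMMA instances but, by default, only the f16/f16, q4_0/q4_0 and q8_0/q8_0 vector instances; building with `LLAMA_CUDA_FA_ALL_QUANTS=1` adds the remaining quantization combinations and passes `-DGGML_CUDA_FA_ALL_QUANTS` to nvcc.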
data/vendor/tmp/llama.cpp/ggml-alloc.c
CHANGED

@@ -377,7 +377,7 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
    galloc->bufts = calloc(n_bufs, sizeof(ggml_backend_buffer_type_t));
    GGML_ASSERT(galloc->bufts != NULL);
 
-    galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t)
+    galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t));
    GGML_ASSERT(galloc->buffers != NULL);
 
    galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
@@ -750,7 +750,7 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
            // this tensor was allocated without ggml-backend
            return;
        }
-        ggml_backend_view_init(
+        ggml_backend_view_init(tensor);
    }
 } else {
    if (tensor->data == NULL) {
@@ -899,12 +899,12 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
            if (t->view_src == NULL) {
                ggml_tallocr_alloc(&tallocr, t);
            } else if (t->buffer == NULL) {
-                ggml_backend_view_init(
+                ggml_backend_view_init(t);
            }
        } else {
            if (t->view_src != NULL && t->buffer == NULL) {
                // view of a pre-allocated tensor
-                ggml_backend_view_init(
+                ggml_backend_view_init(t);
            }
        }
    }
data/vendor/tmp/llama.cpp/ggml-backend.c
CHANGED

@@ -151,7 +151,7 @@ void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer) {
 bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst) {
    ggml_backend_buffer_t dst_buf = dst->view_src ? dst->view_src->buffer : dst->buffer;
    if (dst_buf->iface.cpy_tensor) {
-        return
+        return dst_buf->iface.cpy_tensor(dst_buf, src, dst);
    }
    return false;
 }
@@ -1887,15 +1887,15 @@ ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched,
 
 // utils
 
-void ggml_backend_view_init(
+void ggml_backend_view_init(struct ggml_tensor * tensor) {
    GGML_ASSERT(tensor->buffer == NULL);
    GGML_ASSERT(tensor->view_src != NULL);
    GGML_ASSERT(tensor->view_src->buffer != NULL);
    GGML_ASSERT(tensor->view_src->data != NULL);
 
-    tensor->buffer = buffer;
+    tensor->buffer = tensor->view_src->buffer;
    tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
-    ggml_backend_buffer_init_tensor(buffer, tensor);
+    ggml_backend_buffer_init_tensor(tensor->buffer, tensor);
 }
 
 void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) {
@@ -1954,7 +1954,7 @@ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_te
    struct ggml_tensor * dst = node_copies[id];
    if (dst->view_src != NULL) {
        graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
-        ggml_backend_view_init(dst
+        ggml_backend_view_init(dst);
    }
    else {
        ggml_backend_tensor_copy(src, dst);
data/vendor/tmp/llama.cpp/ggml-backend.h
CHANGED

@@ -225,7 +225,7 @@ extern "C" {
 
    // Tensor initialization
    GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
-    GGML_API void ggml_backend_view_init(
+    GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);
 
 
 #ifdef __cplusplus
data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu
ADDED

@@ -0,0 +1,47 @@
+#include "acc.cuh"
+
+static __global__ void acc_f32(const float * x, const float * y, float * dst, const int ne,
+    const int ne10, const int ne11, const int ne12,
+    const int nb1, const int nb2, int offset) {
+    const int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i >= ne) {
+        return;
+    }
+    int src1_idx = i - offset;
+    int oz = src1_idx / nb2;
+    int oy = (src1_idx - (oz * nb2)) / nb1;
+    int ox = src1_idx % nb1;
+    if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) {
+        dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11];
+    } else {
+        dst[i] = x[i];
+    }
+}
+
+static void acc_f32_cuda(const float * x, const float * y, float * dst, const int n_elements,
+    const int ne10, const int ne11, const int ne12,
+    const int nb1, const int nb2, const int offset, cudaStream_t stream) {
+    int num_blocks = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE;
+    acc_f32<<<num_blocks, CUDA_ACC_BLOCK_SIZE, 0, stream>>>(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset);
+}
+
+void ggml_cuda_op_acc(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+    const float * src0_d = (const float *)src0->data;
+    const float * src1_d = (const float *)src1->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported
+
+    int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
+    int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
+    // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused
+    int offset = dst->op_params[3] / 4; // offset in bytes
+
+    acc_f32_cuda(src0_d, src1_d, dst_d, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, stream);
+}
data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu
ADDED

@@ -0,0 +1,34 @@
+#include "arange.cuh"
+
+static __global__ void arange_f32(float * dst, const int ne0, const float start, const float step) {
+    // blockIDx.x: idx of ne0 / BLOCK_SIZE
+    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (nidx >= ne0) {
+        return;
+    }
+    dst[nidx] = start + step * nidx;
+}
+
+static void arange_f32_cuda(float * dst, const int ne0, const float start, const float step, cudaStream_t stream) {
+    int num_blocks = (ne0 + CUDA_ARANGE_BLOCK_SIZE - 1) / CUDA_ARANGE_BLOCK_SIZE;
+    arange_f32<<<num_blocks, CUDA_ARANGE_BLOCK_SIZE, 0, stream>>>(dst, ne0, start, step);
+}
+
+void ggml_cuda_op_arange(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    float start;
+    float stop;
+    float step;
+    memcpy(&start, (float *)dst->op_params + 0, sizeof(float));
+    memcpy(&stop,  (float *)dst->op_params + 1, sizeof(float));
+    memcpy(&step,  (float *)dst->op_params + 2, sizeof(float));
+
+    int64_t steps = (int64_t)ceil((stop - start) / step);
+    GGML_ASSERT(ggml_nelements(dst) == steps);
+
+    arange_f32_cuda(dst_d, dst->ne[0], start, step, stream);
+}