llama_cpp 0.15.3 → 0.16.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/ext/llama_cpp/extconf.rb +1 -2
- data/ext/llama_cpp/llama_cpp.cpp +27 -3
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +15 -1
- data/vendor/tmp/llama.cpp/Makefile +66 -36
- data/vendor/tmp/llama.cpp/ggml-alloc.c +4 -4
- data/vendor/tmp/llama.cpp/ggml-backend.c +5 -5
- data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu +47 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu +34 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +103 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/binbcast.cu +280 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/clamp.cu +34 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/concat.cu +196 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/convert.cu +686 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/cpy.cu +490 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/diagmask.cu +40 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +662 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +319 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f32.cu +312 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn.cu +345 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/getrows.cu +178 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/im2col.cu +104 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +1564 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +404 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/norm.cu +221 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/pad.cu +49 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/pool2d.cu +94 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +45 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/rope.cu +271 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/scale.cu +31 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +205 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/sumrows.cu +40 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +10 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +9 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +10 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +10 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +8 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu +47 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +266 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/upscale.cu +51 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +35 -16
- data/vendor/tmp/llama.cpp/ggml-impl.h +4 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +21 -7
- data/vendor/tmp/llama.cpp/ggml-metal.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +99 -35
- data/vendor/tmp/llama.cpp/ggml-metal.metal +146 -80
- data/vendor/tmp/llama.cpp/ggml-quants.c +101 -11
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +75 -58
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +345 -227
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +99301 -39793
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +458 -329
- data/vendor/tmp/llama.cpp/ggml.c +301 -409
- data/vendor/tmp/llama.cpp/ggml.h +19 -23
- data/vendor/tmp/llama.cpp/llama.cpp +855 -651
- data/vendor/tmp/llama.cpp/llama.h +28 -48
- metadata +121 -6
- data/vendor/tmp/llama.cpp/ggml-mpi.c +0 -216
- data/vendor/tmp/llama.cpp/ggml-mpi.h +0 -39
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +0 -2305
- data/vendor/tmp/llama.cpp/ggml-opencl.h +0 -36
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5b79658bc49026edcbd896cac4a1d904060622f2311876afbdba773021399ad1
|
4
|
+
data.tar.gz: 064fa60e433863e6919f0c0acbd238cf5d5712058cb834a139a5e5cf798d095e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3248ba69cd0eefcc8b36bdcb03fe13a86da826f4a97a4c61bc62632c2f646647dfaac2b906dd2cb672740c30046e9f588d8e9687b6b8e4bc0a5fc03134d62ec5
|
7
|
+
data.tar.gz: 91164427363b01f805ae3be98a8f44d7aba0e7c437db7daa2b396bf3329398189613036ac4cb4f5d471194edb02485e32529ca1b9c140144332a0e34107d3666
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,19 @@
|
|
1
|
+
## [[0.16.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.15.4...v0.16.0)] - 2024-06-08
|
2
|
+
|
3
|
+
**Breaking Changes**
|
4
|
+
|
5
|
+
- Bump llama.cpp from b3056 to b3091.
|
6
|
+
- Rename `type` method to `token_attr` in `Model`.
|
7
|
+
- Add constants for token attribute types.
|
8
|
+
- Remove `--with-clblast` and `--with-mpi` config options.
|
9
|
+
- Add `--with-no-openmp` config option.
|
10
|
+
|
11
|
+
## [[0.15.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.15.3...v0.15.4)] - 2024-06-01
|
12
|
+
|
13
|
+
- Bump llama.cpp from b2988 to b3056.
|
14
|
+
- Add LLAMA_VOCAB_PRE_TYPE_SMAUG constant.
|
15
|
+
- Add `token_is_control?` method to `Model`.
|
16
|
+
|
1
17
|
## [[0.15.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.15.2...v0.15.3)] - 2024-05-25
|
2
18
|
|
3
19
|
- Bump llama.cpp from b2917 to b2988.
|
data/ext/llama_cpp/extconf.rb
CHANGED
@@ -17,10 +17,9 @@ make_envs << ' LLAMA_OPENBLAS=1' if with_config('openblas')
|
|
17
17
|
make_envs << ' LLAMA_BLIS=1' if with_config('blis')
|
18
18
|
make_envs << ' LLAMA_CUBLAS=1' if with_config('cublas') # Deprecated, use --with-cuda instead
|
19
19
|
make_envs << ' LLAMA_CUDA=1' if with_config('cuda')
|
20
|
-
make_envs << ' LLAMA_CLBLAST=1' if with_config('clblast')
|
21
20
|
make_envs << ' LLAMA_HIPBLAS=1' if with_config('hipblas')
|
22
|
-
make_envs << ' LLAMA_MPI=1' if with_config('mpi')
|
23
21
|
make_envs << ' LLAMA_VULKAN=1' if with_config('vulkan')
|
22
|
+
make_envs << ' LLAMA_NO_OPENMP=1' if with_config('no-openmp')
|
24
23
|
|
25
24
|
make_envs << ' LLAMA_METAL_EMBED_LIBRARY=1' if RUBY_PLATFORM.match?(/darwin/)
|
26
25
|
|
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -1523,7 +1523,7 @@ public:
|
|
1523
1523
|
rb_define_method(rb_cLLaMAModel, "n_params", RUBY_METHOD_FUNC(_llama_model_get_model_n_params), 0);
|
1524
1524
|
rb_define_method(rb_cLLaMAModel, "text", RUBY_METHOD_FUNC(_llama_model_get_text), 1);
|
1525
1525
|
rb_define_method(rb_cLLaMAModel, "score", RUBY_METHOD_FUNC(_llama_model_get_score), 1);
|
1526
|
-
rb_define_method(rb_cLLaMAModel, "type", RUBY_METHOD_FUNC(_llama_model_get_type), 1);
|
1526
|
+
rb_define_method(rb_cLLaMAModel, "token_attr", RUBY_METHOD_FUNC(_llama_model_get_token_attr), 1);
|
1527
1527
|
rb_define_method(rb_cLLaMAModel, "token_bos", RUBY_METHOD_FUNC(_llama_model_token_bos), 0);
|
1528
1528
|
rb_define_method(rb_cLLaMAModel, "token_eos", RUBY_METHOD_FUNC(_llama_model_token_eos), 0);
|
1529
1529
|
rb_define_method(rb_cLLaMAModel, "token_cls", RUBY_METHOD_FUNC(_llama_model_token_cls), 0);
|
@@ -1536,6 +1536,7 @@ public:
|
|
1536
1536
|
rb_define_method(rb_cLLaMAModel, "token_suffix", RUBY_METHOD_FUNC(_llama_model_token_suffix), 0);
|
1537
1537
|
rb_define_method(rb_cLLaMAModel, "token_eot", RUBY_METHOD_FUNC(_llama_model_token_eot), 0);
|
1538
1538
|
rb_define_method(rb_cLLaMAModel, "token_is_eog?", RUBY_METHOD_FUNC(_llama_model_token_is_eog), 1);
|
1539
|
+
rb_define_method(rb_cLLaMAModel, "token_is_control?", RUBY_METHOD_FUNC(_llama_model_token_is_control), 1);
|
1539
1540
|
}
|
1540
1541
|
|
1541
1542
|
private:
|
@@ -1777,10 +1778,10 @@ private:
|
|
1777
1778
|
return DBL2NUM(score);
|
1778
1779
|
}
|
1779
1780
|
|
1780
|
-
static VALUE _llama_model_get_type(VALUE self, VALUE token_) {
|
1781
|
+
static VALUE _llama_model_get_token_attr(VALUE self, VALUE token_) {
|
1781
1782
|
LLaMAModelWrapper* ptr = get_llama_model(self);
|
1782
1783
|
const llama_token token = NUM2INT(token_);
|
1783
|
-
const int type = llama_token_get_type(ptr->model, token);
|
1784
|
+
const llama_token_attr type = llama_token_get_attr(ptr->model, token);
|
1784
1785
|
return INT2NUM(type);
|
1785
1786
|
}
|
1786
1787
|
|
@@ -1848,6 +1849,16 @@ private:
|
|
1848
1849
|
LLaMAModelWrapper* ptr = get_llama_model(self);
|
1849
1850
|
return llama_token_is_eog(ptr->model, token) ? Qtrue : Qfalse;
|
1850
1851
|
}
|
1852
|
+
|
1853
|
+
static VALUE _llama_model_token_is_control(VALUE self, VALUE token_) {
|
1854
|
+
if (!RB_INTEGER_TYPE_P(token_)) {
|
1855
|
+
rb_raise(rb_eArgError, "token must be an integer");
|
1856
|
+
return Qnil;
|
1857
|
+
}
|
1858
|
+
const llama_token token = NUM2INT(token_);
|
1859
|
+
LLaMAModelWrapper* ptr = get_llama_model(self);
|
1860
|
+
return llama_token_is_control(ptr->model, token) ? Qtrue : Qfalse;
|
1861
|
+
}
|
1851
1862
|
};
|
1852
1863
|
|
1853
1864
|
const rb_data_type_t RbLLaMAModel::llama_model_type = {
|
@@ -3482,6 +3493,7 @@ extern "C" void Init_llama_cpp(void) {
|
|
3482
3493
|
rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_QWEN2", INT2NUM(LLAMA_VOCAB_PRE_TYPE_QWEN2));
|
3483
3494
|
rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_OLMO", INT2NUM(LLAMA_VOCAB_PRE_TYPE_OLMO));
|
3484
3495
|
rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_DBRX", INT2NUM(LLAMA_VOCAB_PRE_TYPE_DBRX));
|
3496
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_SMAUG", INT2NUM(LLAMA_VOCAB_PRE_TYPE_SMAUG));
|
3485
3497
|
|
3486
3498
|
rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_UNDEFINED", INT2NUM(LLAMA_TOKEN_TYPE_UNDEFINED));
|
3487
3499
|
rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_NORMAL", INT2NUM(LLAMA_TOKEN_TYPE_NORMAL));
|
@@ -3491,6 +3503,18 @@ extern "C" void Init_llama_cpp(void) {
|
|
3491
3503
|
rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_UNUSED", INT2NUM(LLAMA_TOKEN_TYPE_UNUSED));
|
3492
3504
|
rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_BYTE", INT2NUM(LLAMA_TOKEN_TYPE_BYTE));
|
3493
3505
|
|
3506
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_UNDEFINED", INT2NUM(LLAMA_TOKEN_ATTR_UNDEFINED));
|
3507
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_UNKNOWN", INT2NUM(LLAMA_TOKEN_ATTR_UNKNOWN));
|
3508
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_UNUSED", INT2NUM(LLAMA_TOKEN_ATTR_UNUSED));
|
3509
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_NORMAL", INT2NUM(LLAMA_TOKEN_ATTR_NORMAL));
|
3510
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_CONTROL", INT2NUM(LLAMA_TOKEN_ATTR_CONTROL));
|
3511
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_USER_DEFINED", INT2NUM(LLAMA_TOKEN_ATTR_USER_DEFINED));
|
3512
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_BYTE", INT2NUM(LLAMA_TOKEN_ATTR_BYTE));
|
3513
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_NORMALIZED", INT2NUM(LLAMA_TOKEN_ATTR_NORMALIZED));
|
3514
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_LSTRIP", INT2NUM(LLAMA_TOKEN_ATTR_LSTRIP));
|
3515
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_RSTRIP", INT2NUM(LLAMA_TOKEN_ATTR_RSTRIP));
|
3516
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_SINGLE_WORD", INT2NUM(LLAMA_TOKEN_ATTR_SINGLE_WORD));
|
3517
|
+
|
3494
3518
|
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_ALL_F32", INT2NUM(LLAMA_FTYPE_ALL_F32));
|
3495
3519
|
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_F16", INT2NUM(LLAMA_FTYPE_MOSTLY_F16));
|
3496
3520
|
rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_0", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_0));
|
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
|
|
3
3
|
# llama_cpp.rb provides Ruby bindings for the llama.cpp.
|
4
4
|
module LLaMACpp
|
5
5
|
# The version of llama_cpp.rb you install.
|
6
|
-
VERSION = '0.15.3'
|
6
|
+
VERSION = '0.16.0'
|
7
7
|
|
8
8
|
# The version of llama.cpp bundled with llama_cpp.rb.
|
9
|
-
LLAMA_CPP_VERSION = 'b2988'
|
9
|
+
LLAMA_CPP_VERSION = 'b3091'
|
10
10
|
end
|
data/sig/llama_cpp.rbs
CHANGED
@@ -30,6 +30,19 @@ module LLaMACpp
|
|
30
30
|
LLAMA_VOCAB_PRE_TYPE_QWEN2: Integer
|
31
31
|
LLAMA_VOCAB_PRE_TYPE_OLMO: Integer
|
32
32
|
LLAMA_VOCAB_PRE_TYPE_DBRX: Integer
|
33
|
+
LLAMA_VOCAB_PRE_TYPE_SMAUG: Integer
|
34
|
+
|
35
|
+
LLAMA_TOKEN_ATTR_UNDEFINED: Integer
|
36
|
+
LLAMA_TOKEN_ATTR_UNKNOWN: Integer
|
37
|
+
LLAMA_TOKEN_ATTR_UNUSED: Integer
|
38
|
+
LLAMA_TOKEN_ATTR_NORMAL: Integer
|
39
|
+
LLAMA_TOKEN_ATTR_CONTROL: Integer
|
40
|
+
LLAMA_TOKEN_ATTR_USER_DEFINED: Integer
|
41
|
+
LLAMA_TOKEN_ATTR_BYTE: Integer
|
42
|
+
LLAMA_TOKEN_ATTR_NORMALIZED: Integer
|
43
|
+
LLAMA_TOKEN_ATTR_LSTRIP: Integer
|
44
|
+
LLAMA_TOKEN_ATTR_RSTRIP: Integer
|
45
|
+
LLAMA_TOKEN_ATTR_SINGLE_WORD: Integer
|
33
46
|
|
34
47
|
LLAMA_FTYPE_ALL_F32: Integer
|
35
48
|
LLAMA_FTYPE_MOSTLY_F16: Integer
|
@@ -146,7 +159,7 @@ module LLaMACpp
|
|
146
159
|
def n_params: () -> Integer
|
147
160
|
def text: (Integer) -> String
|
148
161
|
def score: (Integer) -> Float
|
149
|
-
def type: (Integer) -> Integer
|
162
|
+
def token_attr: (Integer) -> Integer
|
150
163
|
def token_bos: () -> Integer
|
151
164
|
def token_eos: () -> Integer
|
152
165
|
def token_cls: () -> Integer
|
@@ -159,6 +172,7 @@ module LLaMACpp
|
|
159
172
|
def token_suffix: () -> Integer
|
160
173
|
def token_eot: () -> Integer
|
161
174
|
def token_is_eog?: (Integer) -> bool
|
175
|
+
def token_is_control?: (Integer) -> bool
|
162
176
|
end
|
163
177
|
|
164
178
|
class Timings
|
data/vendor/tmp/llama.cpp/Makefile
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
# Define the default target now so that it is always the first target
|
2
2
|
BUILD_TARGETS = \
|
3
3
|
main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
|
4
|
-
simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama
|
4
|
+
simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \
|
5
5
|
retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
|
6
6
|
|
7
7
|
# Binaries only useful for tests
|
@@ -57,6 +57,8 @@ ifeq ($(UNAME_S),Darwin)
|
|
57
57
|
LLAMA_METAL := 1
|
58
58
|
endif
|
59
59
|
|
60
|
+
LLAMA_NO_OPENMP := 1
|
61
|
+
|
60
62
|
ifneq ($(UNAME_P),arm)
|
61
63
|
SYSCTL_M := $(shell sysctl -n hw.optional.arm64 2>/dev/null)
|
62
64
|
ifeq ($(SYSCTL_M),1)
|
@@ -67,6 +69,10 @@ ifeq ($(UNAME_S),Darwin)
|
|
67
69
|
endif
|
68
70
|
endif
|
69
71
|
|
72
|
+
ifdef LLAMA_RPC
|
73
|
+
BUILD_TARGETS += rpc-server
|
74
|
+
endif
|
75
|
+
|
70
76
|
default: $(BUILD_TARGETS)
|
71
77
|
|
72
78
|
test: $(TEST_TARGETS)
|
@@ -135,12 +141,16 @@ MK_NVCCFLAGS = -std=c++11
|
|
135
141
|
ifdef LLAMA_FAST
|
136
142
|
MK_CFLAGS += -Ofast
|
137
143
|
HOST_CXXFLAGS += -Ofast
|
144
|
+
ifndef LLAMA_DEBUG
|
138
145
|
MK_NVCCFLAGS += -O3
|
146
|
+
endif # LLAMA_DEBUG
|
139
147
|
else
|
140
148
|
MK_CFLAGS += -O3
|
141
149
|
MK_CXXFLAGS += -O3
|
150
|
+
ifndef LLAMA_DEBUG
|
142
151
|
MK_NVCCFLAGS += -O3
|
143
|
-
endif
|
152
|
+
endif # LLAMA_DEBUG
|
153
|
+
endif # LLAMA_FAST
|
144
154
|
|
145
155
|
ifndef LLAMA_NO_CCACHE
|
146
156
|
CCACHE := $(shell which ccache)
|
@@ -201,9 +211,10 @@ ifdef LLAMA_SCHED_MAX_COPIES
|
|
201
211
|
endif
|
202
212
|
|
203
213
|
ifdef LLAMA_DEBUG
|
204
|
-
MK_CFLAGS += -O0 -g
|
205
|
-
MK_CXXFLAGS += -O0 -g
|
206
|
-
MK_LDFLAGS += -g
|
214
|
+
MK_CFLAGS += -O0 -g
|
215
|
+
MK_CXXFLAGS += -O0 -g
|
216
|
+
MK_LDFLAGS += -g
|
217
|
+
MK_NVCCFLAGS += -O0 -g
|
207
218
|
|
208
219
|
ifeq ($(UNAME_S),Linux)
|
209
220
|
MK_CPPFLAGS += -D_GLIBCXX_ASSERTIONS
|
@@ -402,6 +413,12 @@ ifndef LLAMA_NO_ACCELERATE
|
|
402
413
|
endif
|
403
414
|
endif # LLAMA_NO_ACCELERATE
|
404
415
|
|
416
|
+
ifndef LLAMA_NO_OPENMP
|
417
|
+
MK_CPPFLAGS += -DGGML_USE_OPENMP
|
418
|
+
MK_CFLAGS += -fopenmp
|
419
|
+
MK_CXXFLAGS += -fopenmp
|
420
|
+
endif # LLAMA_NO_OPENMP
|
421
|
+
|
405
422
|
ifdef LLAMA_OPENBLAS
|
406
423
|
MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas)
|
407
424
|
MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas)
|
@@ -418,11 +435,25 @@ ifdef LLAMA_BLIS
|
|
418
435
|
MK_LDFLAGS += -lblis -L/usr/local/lib
|
419
436
|
endif # LLAMA_BLIS
|
420
437
|
|
438
|
+
ifdef LLAMA_RPC
|
439
|
+
MK_CPPFLAGS += -DGGML_USE_RPC
|
440
|
+
OBJS += ggml-rpc.o
|
441
|
+
endif # LLAMA_RPC
|
442
|
+
|
421
443
|
ifdef LLAMA_CUBLAS
|
422
444
|
# LLAMA_CUBLAS is deprecated and will be removed in the future
|
423
445
|
LLAMA_CUDA := 1
|
424
446
|
endif
|
425
447
|
|
448
|
+
OBJS_CUDA_TEMP_INST = $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-wmma*.cu))
|
449
|
+
ifdef LLAMA_CUDA_FA_ALL_QUANTS
|
450
|
+
OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*.cu))
|
451
|
+
else
|
452
|
+
OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu))
|
453
|
+
OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu))
|
454
|
+
OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*f16-f16.cu))
|
455
|
+
endif # LLAMA_CUDA_FA_ALL_QUANTS
|
456
|
+
|
426
457
|
ifdef LLAMA_CUDA
|
427
458
|
ifneq ('', '$(wildcard /opt/cuda)')
|
428
459
|
CUDA_PATH ?= /opt/cuda
|
@@ -433,6 +464,7 @@ ifdef LLAMA_CUDA
|
|
433
464
|
MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
|
434
465
|
OBJS += ggml-cuda.o
|
435
466
|
OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
|
467
|
+
OBJS += $(OBJS_CUDA_TEMP_INST)
|
436
468
|
MK_NVCCFLAGS += -use_fast_math
|
437
469
|
ifdef LLAMA_FATAL_WARNINGS
|
438
470
|
MK_NVCCFLAGS += -Werror all-warnings
|
@@ -443,6 +475,9 @@ endif # JETSON_EOL_MODULE_DETECT
|
|
443
475
|
ifdef LLAMA_DEBUG
|
444
476
|
MK_NVCCFLAGS += -lineinfo
|
445
477
|
endif # LLAMA_DEBUG
|
478
|
+
ifdef LLAMA_CUDA_DEBUG
|
479
|
+
MK_NVCCFLAGS += --device-debug
|
480
|
+
endif # LLAMA_CUDA_DEBUG
|
446
481
|
ifdef LLAMA_CUDA_NVCC
|
447
482
|
NVCC = $(CCACHE) $(LLAMA_CUDA_NVCC)
|
448
483
|
else
|
@@ -492,7 +527,10 @@ ifdef LLAMA_CUDA_NO_PEER_COPY
|
|
492
527
|
endif # LLAMA_CUDA_NO_PEER_COPY
|
493
528
|
ifdef LLAMA_CUDA_CCBIN
|
494
529
|
MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
|
495
|
-
endif
|
530
|
+
endif # LLAMA_CUDA_CCBIN
|
531
|
+
ifdef LLAMA_CUDA_FA_ALL_QUANTS
|
532
|
+
MK_NVCCFLAGS += -DGGML_CUDA_FA_ALL_QUANTS
|
533
|
+
endif # LLAMA_CUDA_FA_ALL_QUANTS
|
496
534
|
|
497
535
|
ifdef JETSON_EOL_MODULE_DETECT
|
498
536
|
define NVCC_COMPILE
|
@@ -504,30 +542,13 @@ define NVCC_COMPILE
|
|
504
542
|
endef # NVCC_COMPILE
|
505
543
|
endif # JETSON_EOL_MODULE_DETECT
|
506
544
|
|
507
|
-
ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
|
545
|
+
ggml-cuda/%.o: ggml-cuda/%.cu ggml.h ggml-common.h ggml-cuda/common.cuh
|
508
546
|
$(NVCC_COMPILE)
|
509
547
|
|
510
548
|
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
|
511
549
|
$(NVCC_COMPILE)
|
512
550
|
endif # LLAMA_CUDA
|
513
551
|
|
514
|
-
ifdef LLAMA_CLBLAST
|
515
|
-
MK_CPPFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags-only-I clblast OpenCL)
|
516
|
-
MK_CFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
|
517
|
-
MK_CXXFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
|
518
|
-
|
519
|
-
# Mac provides OpenCL as a framework
|
520
|
-
ifeq ($(UNAME_S),Darwin)
|
521
|
-
MK_LDFLAGS += -lclblast -framework OpenCL
|
522
|
-
else
|
523
|
-
MK_LDFLAGS += $(shell pkg-config --libs clblast OpenCL)
|
524
|
-
endif
|
525
|
-
OBJS += ggml-opencl.o
|
526
|
-
|
527
|
-
ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
|
528
|
-
$(CXX) $(CXXFLAGS) -c $< -o $@
|
529
|
-
endif # LLAMA_CLBLAST
|
530
|
-
|
531
552
|
ifdef LLAMA_VULKAN
|
532
553
|
MK_CPPFLAGS += -DGGML_USE_VULKAN
|
533
554
|
MK_LDFLAGS += -lvulkan
|
@@ -570,6 +591,7 @@ ifdef LLAMA_HIP_UMA
|
|
570
591
|
MK_CPPFLAGS += -DGGML_HIP_UMA
|
571
592
|
endif # LLAMA_HIP_UMA
|
572
593
|
MK_LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
|
594
|
+
MK_LDFLAGS += -L$(ROCM_PATH)/lib64 -Wl,-rpath=$(ROCM_PATH)/lib64
|
573
595
|
MK_LDFLAGS += -lhipblas -lamdhip64 -lrocblas
|
574
596
|
HIPFLAGS += $(addprefix --offload-arch=,$(AMDGPU_TARGETS))
|
575
597
|
HIPFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
|
@@ -583,11 +605,12 @@ ifdef LLAMA_CUDA_NO_PEER_COPY
|
|
583
605
|
endif # LLAMA_CUDA_NO_PEER_COPY
|
584
606
|
OBJS += ggml-cuda.o
|
585
607
|
OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
|
608
|
+
OBJS += $(OBJS_CUDA_TEMP_INST)
|
586
609
|
|
587
610
|
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
|
588
611
|
$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
|
589
612
|
|
590
|
-
ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
|
613
|
+
ggml-cuda/%.o: ggml-cuda/%.cu ggml.h ggml-common.h ggml-cuda/common.cuh
|
591
614
|
$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
|
592
615
|
|
593
616
|
endif # LLAMA_HIPBLAS
|
@@ -625,11 +648,26 @@ ggml-metal-embed.o: ggml-metal.metal ggml-common.h
|
|
625
648
|
endif
|
626
649
|
endif # LLAMA_METAL
|
627
650
|
|
651
|
+
OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
|
652
|
+
COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
|
653
|
+
COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
|
654
|
+
|
628
655
|
ifndef LLAMA_NO_LLAMAFILE
|
629
656
|
sgemm.o: sgemm.cpp sgemm.h ggml.h
|
630
657
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
631
658
|
endif
|
632
659
|
|
660
|
+
ifdef LLAMA_RPC
|
661
|
+
ggml-rpc.o: ggml-rpc.cpp ggml-rpc.h
|
662
|
+
$(CXX) $(CXXFLAGS) -c $< -o $@
|
663
|
+
|
664
|
+
rpc-server.o: examples/rpc/rpc-server.cpp ggml-rpc.h
|
665
|
+
$(CXX) $(CXXFLAGS) -c $< -o $@
|
666
|
+
|
667
|
+
rpc-server: rpc-server.o ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
668
|
+
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
669
|
+
endif # LLAMA_RPC
|
670
|
+
|
633
671
|
GF_CC := $(CC)
|
634
672
|
include scripts/get-flags.mk
|
635
673
|
|
@@ -709,14 +747,9 @@ unicode.o: unicode.cpp unicode.h
|
|
709
747
|
unicode-data.o: unicode-data.cpp unicode-data.h
|
710
748
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
711
749
|
|
712
|
-
OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
|
713
|
-
|
714
750
|
llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
|
715
751
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
716
752
|
|
717
|
-
COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
|
718
|
-
COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
|
719
|
-
|
720
753
|
common.o: common/common.cpp $(COMMON_H_DEPS)
|
721
754
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
722
755
|
|
@@ -749,8 +782,9 @@ lib: llama.o ggml.o $(OBJS)
|
|
749
782
|
ar rcs libllama.a $^
|
750
783
|
|
751
784
|
clean:
|
752
|
-
rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
|
785
|
+
rm -vrf *.o tests/*.o *.so *.a *.dll *.dylib benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
|
753
786
|
rm -vrf ggml-cuda/*.o
|
787
|
+
rm -vrf ggml-cuda/template-instances/*.o
|
754
788
|
|
755
789
|
#
|
756
790
|
# Examples
|
@@ -818,7 +852,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
|
|
818
852
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
819
853
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
820
854
|
|
821
|
-
server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
855
|
+
server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/colorthemes.css.hpp examples/server/style.css.hpp examples/server/theme-beeninorder.css.hpp examples/server/theme-ketivah.css.hpp examples/server/theme-mangotango.css.hpp examples/server/theme-playground.css.hpp examples/server/theme-polarnight.css.hpp examples/server/theme-snowstorm.css.hpp examples/server/index.html.hpp examples/server/index-new.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/system-prompts.js.hpp examples/server/prompt-formats.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
822
856
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
823
857
|
$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
|
824
858
|
|
@@ -868,10 +902,6 @@ baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) tra
|
|
868
902
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
869
903
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
870
904
|
|
871
|
-
beam-search: examples/beam-search/beam-search.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
872
|
-
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
873
|
-
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
874
|
-
|
875
905
|
finetune: examples/finetune/finetune.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
|
876
906
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
877
907
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
@@ -377,7 +377,7 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
|
|
377
377
|
galloc->bufts = calloc(n_bufs, sizeof(ggml_backend_buffer_type_t));
|
378
378
|
GGML_ASSERT(galloc->bufts != NULL);
|
379
379
|
|
380
|
-
galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t)
|
380
|
+
galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t));
|
381
381
|
GGML_ASSERT(galloc->buffers != NULL);
|
382
382
|
|
383
383
|
galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
|
@@ -750,7 +750,7 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
|
|
750
750
|
// this tensor was allocated without ggml-backend
|
751
751
|
return;
|
752
752
|
}
|
753
|
-
ggml_backend_view_init(
|
753
|
+
ggml_backend_view_init(tensor);
|
754
754
|
}
|
755
755
|
} else {
|
756
756
|
if (tensor->data == NULL) {
|
@@ -899,12 +899,12 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
|
|
899
899
|
if (t->view_src == NULL) {
|
900
900
|
ggml_tallocr_alloc(&tallocr, t);
|
901
901
|
} else if (t->buffer == NULL) {
|
902
|
-
ggml_backend_view_init(
|
902
|
+
ggml_backend_view_init(t);
|
903
903
|
}
|
904
904
|
} else {
|
905
905
|
if (t->view_src != NULL && t->buffer == NULL) {
|
906
906
|
// view of a pre-allocated tensor
|
907
|
-
ggml_backend_view_init(
|
907
|
+
ggml_backend_view_init(t);
|
908
908
|
}
|
909
909
|
}
|
910
910
|
}
|
@@ -151,7 +151,7 @@ void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer) {
|
|
151
151
|
bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst) {
|
152
152
|
ggml_backend_buffer_t dst_buf = dst->view_src ? dst->view_src->buffer : dst->buffer;
|
153
153
|
if (dst_buf->iface.cpy_tensor) {
|
154
|
-
return
|
154
|
+
return dst_buf->iface.cpy_tensor(dst_buf, src, dst);
|
155
155
|
}
|
156
156
|
return false;
|
157
157
|
}
|
@@ -1887,15 +1887,15 @@ ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched,
|
|
1887
1887
|
|
1888
1888
|
// utils
|
1889
1889
|
|
1890
|
-
void ggml_backend_view_init(
|
1890
|
+
void ggml_backend_view_init(struct ggml_tensor * tensor) {
|
1891
1891
|
GGML_ASSERT(tensor->buffer == NULL);
|
1892
1892
|
GGML_ASSERT(tensor->view_src != NULL);
|
1893
1893
|
GGML_ASSERT(tensor->view_src->buffer != NULL);
|
1894
1894
|
GGML_ASSERT(tensor->view_src->data != NULL);
|
1895
1895
|
|
1896
|
-
tensor->buffer = buffer;
|
1896
|
+
tensor->buffer = tensor->view_src->buffer;
|
1897
1897
|
tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
|
1898
|
-
ggml_backend_buffer_init_tensor(buffer, tensor);
|
1898
|
+
ggml_backend_buffer_init_tensor(tensor->buffer, tensor);
|
1899
1899
|
}
|
1900
1900
|
|
1901
1901
|
void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) {
|
@@ -1954,7 +1954,7 @@ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_te
|
|
1954
1954
|
struct ggml_tensor * dst = node_copies[id];
|
1955
1955
|
if (dst->view_src != NULL) {
|
1956
1956
|
graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
|
1957
|
-
ggml_backend_view_init(dst
|
1957
|
+
ggml_backend_view_init(dst);
|
1958
1958
|
}
|
1959
1959
|
else {
|
1960
1960
|
ggml_backend_tensor_copy(src, dst);
|
@@ -225,7 +225,7 @@ extern "C" {
|
|
225
225
|
|
226
226
|
// Tensor initialization
|
227
227
|
GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
|
228
|
-
GGML_API void ggml_backend_view_init(
|
228
|
+
GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);
|
229
229
|
|
230
230
|
|
231
231
|
#ifdef __cplusplus
|
@@ -0,0 +1,47 @@
|
|
1
|
+
#include "acc.cuh"
|
2
|
+
|
3
|
+
static __global__ void acc_f32(const float * x, const float * y, float * dst, const int ne,
|
4
|
+
const int ne10, const int ne11, const int ne12,
|
5
|
+
const int nb1, const int nb2, int offset) {
|
6
|
+
const int i = blockDim.x * blockIdx.x + threadIdx.x;
|
7
|
+
if (i >= ne) {
|
8
|
+
return;
|
9
|
+
}
|
10
|
+
int src1_idx = i - offset;
|
11
|
+
int oz = src1_idx / nb2;
|
12
|
+
int oy = (src1_idx - (oz * nb2)) / nb1;
|
13
|
+
int ox = src1_idx % nb1;
|
14
|
+
if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) {
|
15
|
+
dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11];
|
16
|
+
} else {
|
17
|
+
dst[i] = x[i];
|
18
|
+
}
|
19
|
+
}
|
20
|
+
|
21
|
+
static void acc_f32_cuda(const float * x, const float * y, float * dst, const int n_elements,
|
22
|
+
const int ne10, const int ne11, const int ne12,
|
23
|
+
const int nb1, const int nb2, const int offset, cudaStream_t stream) {
|
24
|
+
int num_blocks = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE;
|
25
|
+
acc_f32<<<num_blocks, CUDA_ACC_BLOCK_SIZE, 0, stream>>>(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset);
|
26
|
+
}
|
27
|
+
|
28
|
+
void ggml_cuda_op_acc(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
29
|
+
const ggml_tensor * src0 = dst->src[0];
|
30
|
+
const ggml_tensor * src1 = dst->src[1];
|
31
|
+
const float * src0_d = (const float *)src0->data;
|
32
|
+
const float * src1_d = (const float *)src1->data;
|
33
|
+
float * dst_d = (float *)dst->data;
|
34
|
+
cudaStream_t stream = ctx.stream();
|
35
|
+
|
36
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
37
|
+
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
38
|
+
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
39
|
+
GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported
|
40
|
+
|
41
|
+
int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
|
42
|
+
int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
|
43
|
+
// int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused
|
44
|
+
int offset = dst->op_params[3] / 4; // offset in bytes
|
45
|
+
|
46
|
+
acc_f32_cuda(src0_d, src1_d, dst_d, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, stream);
|
47
|
+
}
|
@@ -0,0 +1,34 @@
|
|
1
|
+
#include "arange.cuh"
|
2
|
+
|
3
|
+
static __global__ void arange_f32(float * dst, const int ne0, const float start, const float step) {
|
4
|
+
// blockIDx.x: idx of ne0 / BLOCK_SIZE
|
5
|
+
int nidx = threadIdx.x + blockIdx.x * blockDim.x;
|
6
|
+
if (nidx >= ne0) {
|
7
|
+
return;
|
8
|
+
}
|
9
|
+
dst[nidx] = start + step * nidx;
|
10
|
+
}
|
11
|
+
|
12
|
+
static void arange_f32_cuda(float * dst, const int ne0, const float start, const float step, cudaStream_t stream) {
|
13
|
+
int num_blocks = (ne0 + CUDA_ARANGE_BLOCK_SIZE - 1) / CUDA_ARANGE_BLOCK_SIZE;
|
14
|
+
arange_f32<<<num_blocks, CUDA_ARANGE_BLOCK_SIZE, 0, stream>>>(dst, ne0, start, step);
|
15
|
+
}
|
16
|
+
|
17
|
+
void ggml_cuda_op_arange(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
18
|
+
float * dst_d = (float *)dst->data;
|
19
|
+
cudaStream_t stream = ctx.stream();
|
20
|
+
|
21
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
22
|
+
|
23
|
+
float start;
|
24
|
+
float stop;
|
25
|
+
float step;
|
26
|
+
memcpy(&start, (float *)dst->op_params + 0, sizeof(float));
|
27
|
+
memcpy(&stop, (float *)dst->op_params + 1, sizeof(float));
|
28
|
+
memcpy(&step, (float *)dst->op_params + 2, sizeof(float));
|
29
|
+
|
30
|
+
int64_t steps = (int64_t)ceil((stop - start) / step);
|
31
|
+
GGML_ASSERT(ggml_nelements(dst) == steps);
|
32
|
+
|
33
|
+
arange_f32_cuda(dst_d, dst->ne[0], start, step, stream);
|
34
|
+
}
|