llama_cpp 0.15.4 → 0.16.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/ext/llama_cpp/extconf.rb +1 -2
- data/ext/llama_cpp/llama_cpp.cpp +15 -3
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +13 -1
- data/vendor/tmp/llama.cpp/Makefile +62 -35
- data/vendor/tmp/llama.cpp/ggml-alloc.c +4 -4
- data/vendor/tmp/llama.cpp/ggml-backend.c +5 -5
- data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu +47 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu +34 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +103 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/binbcast.cu +280 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/clamp.cu +34 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/concat.cu +196 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/convert.cu +686 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/cpy.cu +490 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/diagmask.cu +40 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +662 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +319 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f32.cu +312 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn.cu +345 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/getrows.cu +178 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/im2col.cu +104 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +1564 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +404 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/norm.cu +221 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/pad.cu +49 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/pool2d.cu +94 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +45 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/rope.cu +271 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/scale.cu +31 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +205 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/sumrows.cu +40 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +10 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +9 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +10 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +10 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +8 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu +47 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +266 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/upscale.cu +51 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +8 -6
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +21 -6
- data/vendor/tmp/llama.cpp/ggml-metal.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +34 -24
- data/vendor/tmp/llama.cpp/ggml-metal.metal +83 -59
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +2 -2
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +7 -67
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +99301 -39793
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +456 -329
- data/vendor/tmp/llama.cpp/ggml.c +178 -330
- data/vendor/tmp/llama.cpp/ggml.h +9 -28
- data/vendor/tmp/llama.cpp/llama.cpp +242 -426
- data/vendor/tmp/llama.cpp/llama.h +17 -43
- metadata +121 -6
- data/vendor/tmp/llama.cpp/ggml-mpi.c +0 -216
- data/vendor/tmp/llama.cpp/ggml-mpi.h +0 -39
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +0 -2305
- data/vendor/tmp/llama.cpp/ggml-opencl.h +0 -36
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 5b79658bc49026edcbd896cac4a1d904060622f2311876afbdba773021399ad1
+  data.tar.gz: 064fa60e433863e6919f0c0acbd238cf5d5712058cb834a139a5e5cf798d095e
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 3248ba69cd0eefcc8b36bdcb03fe13a86da826f4a97a4c61bc62632c2f646647dfaac2b906dd2cb672740c30046e9f588d8e9687b6b8e4bc0a5fc03134d62ec5
+  data.tar.gz: 91164427363b01f805ae3be98a8f44d7aba0e7c437db7daa2b396bf3329398189613036ac4cb4f5d471194edb02485e32529ca1b9c140144332a0e34107d3666
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,13 @@
+## [[0.16.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.15.4...v0.16.0)] - 2024-06-08
+
+**Breaking Changes**
+
+- Bump llama.cpp from b3056 to b3091.
+- Rename `type` method to `token_attr` in `Model`.
+- Add constants for token attribute types.
+- Remove `--with-clblast` and `--with-mpi` config options.
+- Add `--with-no-openmp` config option.
+
 ## [[0.15.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.15.3...v0.15.4)] - 2024-06-01
 
 - Bump llama.cpp from b2988 to b3056.
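The user-visible breaking change here is the `Model#type` → `Model#token_attr` rename. A minimal migration sketch in Ruby, assuming the model-loading API shown in the gem's README and llama.cpp's bit-flag semantics for token attributes (the model path is a placeholder, not part of this diff):

    require 'llama_cpp'

    # Hypothetical model path; replace with a real GGUF file.
    model = LLaMACpp::Model.new(model_path: 'path/to/model.gguf',
                                params: LLaMACpp::ModelParams.new)

    # 0.15.x: model.type(token_id)
    # 0.16.0: model.token_attr(token_id) returns a bit flag of LLAMA_TOKEN_ATTR_* values
    attr = model.token_attr(model.token_bos)
    puts 'BOS is a control token' if (attr & LLaMACpp::LLAMA_TOKEN_ATTR_CONTROL) != 0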
data/ext/llama_cpp/extconf.rb
CHANGED
@@ -17,10 +17,9 @@ make_envs << ' LLAMA_OPENBLAS=1' if with_config('openblas')
 make_envs << ' LLAMA_BLIS=1' if with_config('blis')
 make_envs << ' LLAMA_CUBLAS=1' if with_config('cublas') # Deprecated, use --with-cuda instead
 make_envs << ' LLAMA_CUDA=1' if with_config('cuda')
-make_envs << ' LLAMA_CLBLAST=1' if with_config('clblast')
 make_envs << ' LLAMA_HIPBLAS=1' if with_config('hipblas')
-make_envs << ' LLAMA_MPI=1' if with_config('mpi')
 make_envs << ' LLAMA_VULKAN=1' if with_config('vulkan')
+make_envs << ' LLAMA_NO_OPENMP=1' if with_config('no-openmp')
 
 make_envs << ' LLAMA_METAL_EMBED_LIBRARY=1' if RUBY_PLATFORM.match?(/darwin/)
 
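As with the other backend switches in this file, the new flag is passed to extconf.rb at install time via RubyGems build arguments, e.g. `gem install llama_cpp -- --with-no-openmp` (a usage note based on the usual `with_config` convention, not shown in this diff). The removed `--with-clblast` and `--with-mpi` flags are simply ignored now that the corresponding upstream backends are gone.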
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -1523,7 +1523,7 @@ public:
     rb_define_method(rb_cLLaMAModel, "n_params", RUBY_METHOD_FUNC(_llama_model_get_model_n_params), 0);
     rb_define_method(rb_cLLaMAModel, "text", RUBY_METHOD_FUNC(_llama_model_get_text), 1);
     rb_define_method(rb_cLLaMAModel, "score", RUBY_METHOD_FUNC(_llama_model_get_score), 1);
-    rb_define_method(rb_cLLaMAModel, "
+    rb_define_method(rb_cLLaMAModel, "token_attr", RUBY_METHOD_FUNC(_llama_model_get_token_attr), 1);
     rb_define_method(rb_cLLaMAModel, "token_bos", RUBY_METHOD_FUNC(_llama_model_token_bos), 0);
     rb_define_method(rb_cLLaMAModel, "token_eos", RUBY_METHOD_FUNC(_llama_model_token_eos), 0);
     rb_define_method(rb_cLLaMAModel, "token_cls", RUBY_METHOD_FUNC(_llama_model_token_cls), 0);
@@ -1778,10 +1778,10 @@ private:
     return DBL2NUM(score);
   }
 
-  static VALUE
+  static VALUE _llama_model_get_token_attr(VALUE self, VALUE token_) {
     LLaMAModelWrapper* ptr = get_llama_model(self);
     const llama_token token = NUM2INT(token_);
-    const
+    const llama_token_attr type = llama_token_get_attr(ptr->model, token);
     return INT2NUM(type);
   }
 
@@ -3503,6 +3503,18 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_UNUSED", INT2NUM(LLAMA_TOKEN_TYPE_UNUSED));
   rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_BYTE", INT2NUM(LLAMA_TOKEN_TYPE_BYTE));
 
+  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_UNDEFINED", INT2NUM(LLAMA_TOKEN_ATTR_UNDEFINED));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_UNKNOWN", INT2NUM(LLAMA_TOKEN_ATTR_UNKNOWN));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_UNUSED", INT2NUM(LLAMA_TOKEN_ATTR_UNUSED));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_NORMAL", INT2NUM(LLAMA_TOKEN_ATTR_NORMAL));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_CONTROL", INT2NUM(LLAMA_TOKEN_ATTR_CONTROL));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_USER_DEFINED", INT2NUM(LLAMA_TOKEN_ATTR_USER_DEFINED));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_BYTE", INT2NUM(LLAMA_TOKEN_ATTR_BYTE));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_NORMALIZED", INT2NUM(LLAMA_TOKEN_ATTR_NORMALIZED));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_LSTRIP", INT2NUM(LLAMA_TOKEN_ATTR_LSTRIP));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_RSTRIP", INT2NUM(LLAMA_TOKEN_ATTR_RSTRIP));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_SINGLE_WORD", INT2NUM(LLAMA_TOKEN_ATTR_SINGLE_WORD));
+
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_ALL_F32", INT2NUM(LLAMA_FTYPE_ALL_F32));
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_F16", INT2NUM(LLAMA_FTYPE_MOSTLY_F16));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_0", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_0));
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.15.4'
+  VERSION = '0.16.0'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'b3056'
+  LLAMA_CPP_VERSION = 'b3091'
 end
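After upgrading, the bundled upstream build can be confirmed at runtime from these two constants; a trivial sketch:

    require 'llama_cpp'

    puts LLaMACpp::VERSION           # => "0.16.0"
    puts LLaMACpp::LLAMA_CPP_VERSION # => "b3091"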
data/sig/llama_cpp.rbs
CHANGED
@@ -32,6 +32,18 @@ module LLaMACpp
   LLAMA_VOCAB_PRE_TYPE_DBRX: Integer
   LLAMA_VOCAB_PRE_TYPE_SMAUG: Integer
 
+  LLAMA_TOKEN_ATTR_UNDEFINED: Integer
+  LLAMA_TOKEN_ATTR_UNKNOWN: Integer
+  LLAMA_TOKEN_ATTR_UNUSED: Integer
+  LLAMA_TOKEN_ATTR_NORMAL: Integer
+  LLAMA_TOKEN_ATTR_CONTROL: Integer
+  LLAMA_TOKEN_ATTR_USER_DEFINED: Integer
+  LLAMA_TOKEN_ATTR_BYTE: Integer
+  LLAMA_TOKEN_ATTR_NORMALIZED: Integer
+  LLAMA_TOKEN_ATTR_LSTRIP: Integer
+  LLAMA_TOKEN_ATTR_RSTRIP: Integer
+  LLAMA_TOKEN_ATTR_SINGLE_WORD: Integer
+
   LLAMA_FTYPE_ALL_F32: Integer
   LLAMA_FTYPE_MOSTLY_F16: Integer
   LLAMA_FTYPE_MOSTLY_Q4_0: Integer
@@ -147,7 +159,7 @@ module LLaMACpp
     def n_params: () -> Integer
     def text: (Integer) -> String
     def score: (Integer) -> Float
-    def
+    def token_attr: (Integer) -> Integer
     def token_bos: () -> Integer
     def token_eos: () -> Integer
     def token_cls: () -> Integer
data/vendor/tmp/llama.cpp/Makefile
CHANGED
@@ -1,7 +1,7 @@
 # Define the default target now so that it is always the first target
 BUILD_TARGETS = \
     main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
-    simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama
+    simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \
     retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
 
 # Binaries only useful for tests
@@ -57,6 +57,8 @@ ifeq ($(UNAME_S),Darwin)
     LLAMA_METAL := 1
 endif
 
+LLAMA_NO_OPENMP := 1
+
 ifneq ($(UNAME_P),arm)
     SYSCTL_M := $(shell sysctl -n hw.optional.arm64 2>/dev/null)
     ifeq ($(SYSCTL_M),1)
@@ -67,6 +69,10 @@ ifeq ($(UNAME_S),Darwin)
 endif
 endif
 
+ifdef LLAMA_RPC
+    BUILD_TARGETS += rpc-server
+endif
+
 default: $(BUILD_TARGETS)
 
 test: $(TEST_TARGETS)
@@ -135,12 +141,16 @@ MK_NVCCFLAGS = -std=c++11
 ifdef LLAMA_FAST
 MK_CFLAGS     += -Ofast
 HOST_CXXFLAGS += -Ofast
+ifndef LLAMA_DEBUG
 MK_NVCCFLAGS  += -O3
+endif # LLAMA_DEBUG
 else
 MK_CFLAGS     += -O3
 MK_CXXFLAGS   += -O3
+ifndef LLAMA_DEBUG
 MK_NVCCFLAGS  += -O3
-endif
+endif # LLAMA_DEBUG
+endif # LLAMA_FAST
 
 ifndef LLAMA_NO_CCACHE
 CCACHE := $(shell which ccache)
@@ -201,9 +211,10 @@ ifdef LLAMA_SCHED_MAX_COPIES
 endif
 
 ifdef LLAMA_DEBUG
-    MK_CFLAGS
-    MK_CXXFLAGS
-    MK_LDFLAGS
+    MK_CFLAGS    += -O0 -g
+    MK_CXXFLAGS  += -O0 -g
+    MK_LDFLAGS   += -g
+    MK_NVCCFLAGS += -O0 -g
 
     ifeq ($(UNAME_S),Linux)
         MK_CPPFLAGS += -D_GLIBCXX_ASSERTIONS
@@ -402,6 +413,12 @@ ifndef LLAMA_NO_ACCELERATE
     endif
 endif # LLAMA_NO_ACCELERATE
 
+ifndef LLAMA_NO_OPENMP
+    MK_CPPFLAGS += -DGGML_USE_OPENMP
+    MK_CFLAGS   += -fopenmp
+    MK_CXXFLAGS += -fopenmp
+endif # LLAMA_NO_OPENMP
+
 ifdef LLAMA_OPENBLAS
     MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas)
     MK_CFLAGS   += $(shell pkg-config --cflags-only-other openblas)
@@ -418,11 +435,25 @@ ifdef LLAMA_BLIS
     MK_LDFLAGS += -lblis -L/usr/local/lib
 endif # LLAMA_BLIS
 
+ifdef LLAMA_RPC
+    MK_CPPFLAGS += -DGGML_USE_RPC
+    OBJS        += ggml-rpc.o
+endif # LLAMA_RPC
+
 ifdef LLAMA_CUBLAS
 # LLAMA_CUBLAS is deprecated and will be removed in the future
     LLAMA_CUDA := 1
 endif
 
+OBJS_CUDA_TEMP_INST = $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-wmma*.cu))
+ifdef LLAMA_CUDA_FA_ALL_QUANTS
+    OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*.cu))
+else
+    OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu))
+    OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu))
+    OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*f16-f16.cu))
+endif # LLAMA_CUDA_FA_ALL_QUANTS
+
 ifdef LLAMA_CUDA
     ifneq ('', '$(wildcard /opt/cuda)')
         CUDA_PATH ?= /opt/cuda
@@ -433,6 +464,7 @@ ifdef LLAMA_CUDA
     MK_LDFLAGS   += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
     OBJS         += ggml-cuda.o
     OBJS         += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
+    OBJS         += $(OBJS_CUDA_TEMP_INST)
     MK_NVCCFLAGS += -use_fast_math
 ifdef LLAMA_FATAL_WARNINGS
     MK_NVCCFLAGS += -Werror all-warnings
@@ -495,7 +527,10 @@ ifdef LLAMA_CUDA_NO_PEER_COPY
 endif # LLAMA_CUDA_NO_PEER_COPY
 ifdef LLAMA_CUDA_CCBIN
     MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
-endif
+endif # LLAMA_CUDA_CCBIN
+ifdef LLAMA_CUDA_FA_ALL_QUANTS
+    MK_NVCCFLAGS += -DGGML_CUDA_FA_ALL_QUANTS
+endif # LLAMA_CUDA_FA_ALL_QUANTS
 
 ifdef JETSON_EOL_MODULE_DETECT
 define NVCC_COMPILE
@@ -507,30 +542,13 @@ define NVCC_COMPILE
 endef # NVCC_COMPILE
 endif # JETSON_EOL_MODULE_DETECT
 
-ggml-cuda/%.o: ggml-cuda/%.cu ggml
+ggml-cuda/%.o: ggml-cuda/%.cu ggml.h ggml-common.h ggml-cuda/common.cuh
     $(NVCC_COMPILE)
 
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
     $(NVCC_COMPILE)
 endif # LLAMA_CUDA
 
-ifdef LLAMA_CLBLAST
-    MK_CPPFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags-only-I clblast OpenCL)
-    MK_CFLAGS   += $(shell pkg-config --cflags-only-other clblast OpenCL)
-    MK_CXXFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
-
-    # Mac provides OpenCL as a framework
-    ifeq ($(UNAME_S),Darwin)
-        MK_LDFLAGS += -lclblast -framework OpenCL
-    else
-        MK_LDFLAGS += $(shell pkg-config --libs clblast OpenCL)
-    endif
-    OBJS += ggml-opencl.o
-
-ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
-    $(CXX) $(CXXFLAGS) -c $< -o $@
-endif # LLAMA_CLBLAST
-
 ifdef LLAMA_VULKAN
     MK_CPPFLAGS += -DGGML_USE_VULKAN
     MK_LDFLAGS  += -lvulkan
@@ -573,6 +591,7 @@ ifdef LLAMA_HIP_UMA
     MK_CPPFLAGS += -DGGML_HIP_UMA
 endif # LLAMA_HIP_UMA
     MK_LDFLAGS  += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
+    MK_LDFLAGS  += -L$(ROCM_PATH)/lib64 -Wl,-rpath=$(ROCM_PATH)/lib64
     MK_LDFLAGS  += -lhipblas -lamdhip64 -lrocblas
     HIPFLAGS    += $(addprefix --offload-arch=,$(AMDGPU_TARGETS))
     HIPFLAGS    += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
@@ -586,11 +605,12 @@ ifdef LLAMA_CUDA_NO_PEER_COPY
 endif # LLAMA_CUDA_NO_PEER_COPY
     OBJS += ggml-cuda.o
     OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
+    OBJS += $(OBJS_CUDA_TEMP_INST)
 
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
     $(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
 
-ggml-cuda/%.o: ggml-cuda/%.cu ggml
+ggml-cuda/%.o: ggml-cuda/%.cu ggml.h ggml-common.h ggml-cuda/common.cuh
    $(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
 
 endif # LLAMA_HIPBLAS
@@ -628,11 +648,26 @@ ggml-metal-embed.o: ggml-metal.metal ggml-common.h
 endif
 endif # LLAMA_METAL
 
+OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
+COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
+COMMON_DEPS   = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
+
 ifndef LLAMA_NO_LLAMAFILE
 sgemm.o: sgemm.cpp sgemm.h ggml.h
     $(CXX) $(CXXFLAGS) -c $< -o $@
 endif
 
+ifdef LLAMA_RPC
+ggml-rpc.o: ggml-rpc.cpp ggml-rpc.h
+    $(CXX) $(CXXFLAGS) -c $< -o $@
+
+rpc-server.o: examples/rpc/rpc-server.cpp ggml-rpc.h
+    $(CXX) $(CXXFLAGS) -c $< -o $@
+
+rpc-server: rpc-server.o ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+    $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
+endif # LLAMA_RPC
+
 GF_CC := $(CC)
 include scripts/get-flags.mk
 
@@ -712,14 +747,9 @@ unicode.o: unicode.cpp unicode.h
 unicode-data.o: unicode-data.cpp unicode-data.h
     $(CXX) $(CXXFLAGS) -c $< -o $@
 
-OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
-
 llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
     $(CXX) $(CXXFLAGS) -c $< -o $@
 
-COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
-COMMON_DEPS   = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
-
 common.o: common/common.cpp $(COMMON_H_DEPS)
     $(CXX) $(CXXFLAGS) -c $< -o $@
 
@@ -754,6 +784,7 @@ lib: llama.o ggml.o $(OBJS)
 clean:
     rm -vrf *.o tests/*.o *.so *.a *.dll *.dylib benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
     rm -vrf ggml-cuda/*.o
+    rm -vrf ggml-cuda/template-instances/*.o
 
 #
 # Examples
@@ -821,7 +852,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
     $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
     $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/colorthemes.css.hpp examples/server/style.css.hpp examples/server/theme-beeninorder.css.hpp examples/server/theme-ketivah.css.hpp examples/server/theme-mangotango.css.hpp examples/server/theme-playground.css.hpp examples/server/theme-polarnight.css.hpp examples/server/theme-snowstorm.css.hpp examples/server/index.html.hpp examples/server/index-new.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/system-prompts.js.hpp examples/server/prompt-formats.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
     $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
     $(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
 
@@ -871,10 +902,6 @@ baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) tra
     $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
     $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-beam-search: examples/beam-search/beam-search.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-    $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-    $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
 finetune: examples/finetune/finetune.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
     $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
    $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
data/vendor/tmp/llama.cpp/ggml-alloc.c
CHANGED
@@ -377,7 +377,7 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
     galloc->bufts = calloc(n_bufs, sizeof(ggml_backend_buffer_type_t));
     GGML_ASSERT(galloc->bufts != NULL);
 
-    galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t)
+    galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t));
     GGML_ASSERT(galloc->buffers != NULL);
 
     galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
@@ -750,7 +750,7 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
                 // this tensor was allocated without ggml-backend
                 return;
             }
-            ggml_backend_view_init(
+            ggml_backend_view_init(tensor);
         }
     } else {
         if (tensor->data == NULL) {
@@ -899,12 +899,12 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
             if (t->view_src == NULL) {
                 ggml_tallocr_alloc(&tallocr, t);
             } else if (t->buffer == NULL) {
-                ggml_backend_view_init(
+                ggml_backend_view_init(t);
             }
         } else {
             if (t->view_src != NULL && t->buffer == NULL) {
                 // view of a pre-allocated tensor
-                ggml_backend_view_init(
+                ggml_backend_view_init(t);
             }
         }
     }
data/vendor/tmp/llama.cpp/ggml-backend.c
CHANGED
@@ -151,7 +151,7 @@ void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer) {
 bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst) {
     ggml_backend_buffer_t dst_buf = dst->view_src ? dst->view_src->buffer : dst->buffer;
     if (dst_buf->iface.cpy_tensor) {
-        return
+        return dst_buf->iface.cpy_tensor(dst_buf, src, dst);
     }
     return false;
 }
@@ -1887,15 +1887,15 @@ ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched,
 
 // utils
 
-void ggml_backend_view_init(
+void ggml_backend_view_init(struct ggml_tensor * tensor) {
     GGML_ASSERT(tensor->buffer == NULL);
     GGML_ASSERT(tensor->view_src != NULL);
     GGML_ASSERT(tensor->view_src->buffer != NULL);
     GGML_ASSERT(tensor->view_src->data != NULL);
 
-    tensor->buffer = buffer;
+    tensor->buffer = tensor->view_src->buffer;
     tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
-    ggml_backend_buffer_init_tensor(buffer, tensor);
+    ggml_backend_buffer_init_tensor(tensor->buffer, tensor);
 }
 
 void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) {
@@ -1954,7 +1954,7 @@ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_te
     struct ggml_tensor * dst = node_copies[id];
     if (dst->view_src != NULL) {
         graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
-        ggml_backend_view_init(dst
+        ggml_backend_view_init(dst);
     }
     else {
         ggml_backend_tensor_copy(src, dst);
data/vendor/tmp/llama.cpp/ggml-backend.h
CHANGED
@@ -225,7 +225,7 @@ extern "C" {
 
     // Tensor initialization
     GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
-    GGML_API void ggml_backend_view_init(
+    GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);
 
 
 #ifdef __cplusplus
data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu
ADDED
@@ -0,0 +1,47 @@
+#include "acc.cuh"
+
+static __global__ void acc_f32(const float * x, const float * y, float * dst, const int ne,
+    const int ne10, const int ne11, const int ne12,
+    const int nb1, const int nb2, int offset) {
+    const int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i >= ne) {
+        return;
+    }
+    int src1_idx = i - offset;
+    int oz = src1_idx / nb2;
+    int oy = (src1_idx - (oz * nb2)) / nb1;
+    int ox = src1_idx % nb1;
+    if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) {
+        dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11];
+    } else {
+        dst[i] = x[i];
+    }
+}
+
+static void acc_f32_cuda(const float * x, const float * y, float * dst, const int n_elements,
+    const int ne10, const int ne11, const int ne12,
+    const int nb1, const int nb2, const int offset, cudaStream_t stream) {
+    int num_blocks = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE;
+    acc_f32<<<num_blocks, CUDA_ACC_BLOCK_SIZE, 0, stream>>>(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset);
+}
+
+void ggml_cuda_op_acc(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+    const float * src0_d = (const float *)src0->data;
+    const float * src1_d = (const float *)src1->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported
+
+    int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
+    int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
+    // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused
+    int offset = dst->op_params[3] / 4; // offset in bytes
+
+    acc_f32_cuda(src0_d, src1_d, dst_d, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, stream);
+}
data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu
ADDED
@@ -0,0 +1,34 @@
+#include "arange.cuh"
+
+static __global__ void arange_f32(float * dst, const int ne0, const float start, const float step) {
+    // blockIDx.x: idx of ne0 / BLOCK_SIZE
+    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (nidx >= ne0) {
+        return;
+    }
+    dst[nidx] = start + step * nidx;
+}
+
+static void arange_f32_cuda(float * dst, const int ne0, const float start, const float step, cudaStream_t stream) {
+    int num_blocks = (ne0 + CUDA_ARANGE_BLOCK_SIZE - 1) / CUDA_ARANGE_BLOCK_SIZE;
+    arange_f32<<<num_blocks, CUDA_ARANGE_BLOCK_SIZE, 0, stream>>>(dst, ne0, start, step);
+}
+
+void ggml_cuda_op_arange(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    float start;
+    float stop;
+    float step;
+    memcpy(&start, (float *)dst->op_params + 0, sizeof(float));
+    memcpy(&stop,  (float *)dst->op_params + 1, sizeof(float));
+    memcpy(&step,  (float *)dst->op_params + 2, sizeof(float));
+
+    int64_t steps = (int64_t)ceil((stop - start) / step);
+    GGML_ASSERT(ggml_nelements(dst) == steps);
+
+    arange_f32_cuda(dst_d, dst->ne[0], start, step, stream);
+}
data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu
ADDED
@@ -0,0 +1,103 @@
+#include "argsort.cuh"
+
+template<typename T>
+static inline __device__ void ggml_cuda_swap(T & a, T & b) {
+    T tmp = a;
+    a = b;
+    b = tmp;
+}
+
+template<ggml_sort_order order>
+static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int ncols, int ncols_pad) {
+    // bitonic sort
+    int col = threadIdx.x;
+    int row = blockIdx.y;
+
+    if (col >= ncols_pad) {
+        return;
+    }
+
+    const float * x_row = x + row * ncols;
+    extern __shared__ int dst_row[];
+
+    // initialize indices
+    dst_row[col] = col;
+
+    __syncthreads();
+
+    for (int k = 2; k <= ncols_pad; k *= 2) {
+        for (int j = k / 2; j > 0; j /= 2) {
+            int ixj = col ^ j;
+            if (ixj > col) {
+                if ((col & k) == 0) {
+                    if (dst_row[col] >= ncols ||
+                        (dst_row[ixj] < ncols && (order == GGML_SORT_ORDER_ASC ?
+                            x_row[dst_row[col]] > x_row[dst_row[ixj]] :
+                            x_row[dst_row[col]] < x_row[dst_row[ixj]]))
+                    ) {
+                        ggml_cuda_swap(dst_row[col], dst_row[ixj]);
+                    }
+                } else {
+                    if (dst_row[ixj] >= ncols ||
+                        (dst_row[col] < ncols && (order == GGML_SORT_ORDER_ASC ?
+                            x_row[dst_row[col]] < x_row[dst_row[ixj]] :
+                            x_row[dst_row[col]] > x_row[dst_row[ixj]]))
+                    ) {
+                        ggml_cuda_swap(dst_row[col], dst_row[ixj]);
+                    }
+                }
+            }
+            __syncthreads();
+        }
+    }
+
+    // copy the result to dst without the padding
+    if (col < ncols) {
+        dst[row * ncols + col] = dst_row[col];
+    }
+}
+
+static int next_power_of_2(int x) {
+    int n = 1;
+    while (n < x) {
+        n *= 2;
+    }
+    return n;
+}
+
+static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) {
+    // bitonic sort requires ncols to be power of 2
+    const int ncols_pad = next_power_of_2(ncols);
+
+    const dim3 block_dims(ncols_pad, 1, 1);
+    const dim3 block_nums(1, nrows, 1);
+    const size_t shared_mem = ncols_pad * sizeof(int);
+
+    GGML_ASSERT(shared_mem <= ggml_cuda_info().devices[ggml_cuda_get_device()].smpb);
+
+    if (order == GGML_SORT_ORDER_ASC) {
+        k_argsort_f32_i32<GGML_SORT_ORDER_ASC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
+    } else if (order == GGML_SORT_ORDER_DESC) {
+        k_argsort_f32_i32<GGML_SORT_ORDER_DESC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
+    } else {
+        GGML_ASSERT(false);
+    }
+}
+
+void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const float * src0_d = (const float *)src0->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_I32);
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    const int64_t ncols = src0->ne[0];
+    const int64_t nrows = ggml_nrows(src0);
+
+    enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
+
+    argsort_f32_i32_cuda(src0_d, (int *)dst_d, ncols, nrows, order, stream);
+}