llama_cpp 0.15.4 → 0.16.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/ext/llama_cpp/extconf.rb +1 -2
- data/ext/llama_cpp/llama_cpp.cpp +15 -3
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +13 -1
- data/vendor/tmp/llama.cpp/Makefile +62 -35
- data/vendor/tmp/llama.cpp/ggml-alloc.c +4 -4
- data/vendor/tmp/llama.cpp/ggml-backend.c +5 -5
- data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu +47 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu +34 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +103 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/binbcast.cu +280 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/clamp.cu +34 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/concat.cu +196 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/convert.cu +686 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/cpy.cu +490 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/diagmask.cu +40 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +662 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +319 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f32.cu +312 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn.cu +345 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/getrows.cu +178 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/im2col.cu +104 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +1564 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +404 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/norm.cu +221 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/pad.cu +49 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/pool2d.cu +94 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +45 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/rope.cu +271 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/scale.cu +31 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +205 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/sumrows.cu +40 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +10 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +9 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +10 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +10 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +8 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu +47 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +266 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/upscale.cu +51 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +8 -6
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +21 -6
- data/vendor/tmp/llama.cpp/ggml-metal.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +34 -24
- data/vendor/tmp/llama.cpp/ggml-metal.metal +83 -59
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +2 -2
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +7 -67
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +99301 -39793
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +456 -329
- data/vendor/tmp/llama.cpp/ggml.c +178 -330
- data/vendor/tmp/llama.cpp/ggml.h +9 -28
- data/vendor/tmp/llama.cpp/llama.cpp +242 -426
- data/vendor/tmp/llama.cpp/llama.h +17 -43
- metadata +121 -6
- data/vendor/tmp/llama.cpp/ggml-mpi.c +0 -216
- data/vendor/tmp/llama.cpp/ggml-mpi.h +0 -39
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +0 -2305
- data/vendor/tmp/llama.cpp/ggml-opencl.h +0 -36
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 5b79658bc49026edcbd896cac4a1d904060622f2311876afbdba773021399ad1
+  data.tar.gz: 064fa60e433863e6919f0c0acbd238cf5d5712058cb834a139a5e5cf798d095e
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 3248ba69cd0eefcc8b36bdcb03fe13a86da826f4a97a4c61bc62632c2f646647dfaac2b906dd2cb672740c30046e9f588d8e9687b6b8e4bc0a5fc03134d62ec5
+  data.tar.gz: 91164427363b01f805ae3be98a8f44d7aba0e7c437db7daa2b396bf3329398189613036ac4cb4f5d471194edb02485e32529ca1b9c140144332a0e34107d3666
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,13 @@
+## [[0.16.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.15.4...v0.16.0)] - 2024-06-08
+
+**Breaking Changes**
+
+- Bump llama.cpp from b3056 to b3091.
+- Rename `type` method to `token_attr` in `Model`.
+- Add constants for token attribute types.
+- Remove `--with-clblast` and `--with-mpi` config options.
+- Add `--with-no-openmp` config option.
+
 ## [[0.15.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.15.3...v0.15.4)] - 2024-06-01

 - Bump llama.cpp from b2988 to b3056.
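For orientation, here is a minimal Ruby sketch of the renamed API described in the changelog above. The model path and the ModelParams-based initialization are assumptions used only for illustration and are not taken from this diff; the `token_attr` call and the `LLAMA_TOKEN_ATTR_*` constants are the ones introduced in 0.16.0.

require 'llama_cpp'

# Hypothetical model setup; replace the path with a real GGUF file.
params = LLaMACpp::ModelParams.new
model  = LLaMACpp::Model.new(model_path: '/path/to/model.gguf', params: params)

# 0.16.0: Model#type is now Model#token_attr and returns an Integer that can be
# compared against the new LLAMA_TOKEN_ATTR_* constants.
bos_attr = model.token_attr(model.token_bos)
puts bos_attr == LLaMACpp::LLAMA_TOKEN_ATTR_CONTROL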
data/ext/llama_cpp/extconf.rb
CHANGED
@@ -17,10 +17,9 @@ make_envs << ' LLAMA_OPENBLAS=1' if with_config('openblas')
 make_envs << ' LLAMA_BLIS=1' if with_config('blis')
 make_envs << ' LLAMA_CUBLAS=1' if with_config('cublas') # Deprecated, use --with-cuda instead
 make_envs << ' LLAMA_CUDA=1' if with_config('cuda')
-make_envs << ' LLAMA_CLBLAST=1' if with_config('clblast')
 make_envs << ' LLAMA_HIPBLAS=1' if with_config('hipblas')
-make_envs << ' LLAMA_MPI=1' if with_config('mpi')
 make_envs << ' LLAMA_VULKAN=1' if with_config('vulkan')
+make_envs << ' LLAMA_NO_OPENMP=1' if with_config('no-openmp')

 make_envs << ' LLAMA_METAL_EMBED_LIBRARY=1' if RUBY_PLATFORM.match?(/darwin/)

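In practical terms, build flags forwarded through RubyGems change accordingly: passing `--with-clblast` or `--with-mpi` to extconf.rb is no longer recognized, while `gem install llama_cpp -- --with-no-openmp` sets `LLAMA_NO_OPENMP=1` so the bundled llama.cpp is built without OpenMP. (The `gem install ... --` form is the standard RubyGems way of forwarding options to extconf.rb, not something shown in this diff.)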
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -1523,7 +1523,7 @@ public:
     rb_define_method(rb_cLLaMAModel, "n_params", RUBY_METHOD_FUNC(_llama_model_get_model_n_params), 0);
     rb_define_method(rb_cLLaMAModel, "text", RUBY_METHOD_FUNC(_llama_model_get_text), 1);
     rb_define_method(rb_cLLaMAModel, "score", RUBY_METHOD_FUNC(_llama_model_get_score), 1);
-    rb_define_method(rb_cLLaMAModel, "
+    rb_define_method(rb_cLLaMAModel, "token_attr", RUBY_METHOD_FUNC(_llama_model_get_token_attr), 1);
     rb_define_method(rb_cLLaMAModel, "token_bos", RUBY_METHOD_FUNC(_llama_model_token_bos), 0);
     rb_define_method(rb_cLLaMAModel, "token_eos", RUBY_METHOD_FUNC(_llama_model_token_eos), 0);
     rb_define_method(rb_cLLaMAModel, "token_cls", RUBY_METHOD_FUNC(_llama_model_token_cls), 0);
@@ -1778,10 +1778,10 @@ private:
     return DBL2NUM(score);
   }

-  static VALUE
+  static VALUE _llama_model_get_token_attr(VALUE self, VALUE token_) {
     LLaMAModelWrapper* ptr = get_llama_model(self);
     const llama_token token = NUM2INT(token_);
-    const
+    const llama_token_attr type = llama_token_get_attr(ptr->model, token);
     return INT2NUM(type);
   }

@@ -3503,6 +3503,18 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_UNUSED", INT2NUM(LLAMA_TOKEN_TYPE_UNUSED));
   rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_BYTE", INT2NUM(LLAMA_TOKEN_TYPE_BYTE));

+  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_UNDEFINED", INT2NUM(LLAMA_TOKEN_ATTR_UNDEFINED));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_UNKNOWN", INT2NUM(LLAMA_TOKEN_ATTR_UNKNOWN));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_UNUSED", INT2NUM(LLAMA_TOKEN_ATTR_UNUSED));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_NORMAL", INT2NUM(LLAMA_TOKEN_ATTR_NORMAL));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_CONTROL", INT2NUM(LLAMA_TOKEN_ATTR_CONTROL));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_USER_DEFINED", INT2NUM(LLAMA_TOKEN_ATTR_USER_DEFINED));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_BYTE", INT2NUM(LLAMA_TOKEN_ATTR_BYTE));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_NORMALIZED", INT2NUM(LLAMA_TOKEN_ATTR_NORMALIZED));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_LSTRIP", INT2NUM(LLAMA_TOKEN_ATTR_LSTRIP));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_RSTRIP", INT2NUM(LLAMA_TOKEN_ATTR_RSTRIP));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_ATTR_SINGLE_WORD", INT2NUM(LLAMA_TOKEN_ATTR_SINGLE_WORD));
+
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_ALL_F32", INT2NUM(LLAMA_FTYPE_ALL_F32));
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_F16", INT2NUM(LLAMA_FTYPE_MOSTLY_F16));
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_0", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_0));
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.15.4'
+  VERSION = '0.16.0'

   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'b3056'
+  LLAMA_CPP_VERSION = 'b3091'
 end
data/sig/llama_cpp.rbs
CHANGED
@@ -32,6 +32,18 @@ module LLaMACpp
   LLAMA_VOCAB_PRE_TYPE_DBRX: Integer
   LLAMA_VOCAB_PRE_TYPE_SMAUG: Integer

+  LLAMA_TOKEN_ATTR_UNDEFINED: Integer
+  LLAMA_TOKEN_ATTR_UNKNOWN: Integer
+  LLAMA_TOKEN_ATTR_UNUSED: Integer
+  LLAMA_TOKEN_ATTR_NORMAL: Integer
+  LLAMA_TOKEN_ATTR_CONTROL: Integer
+  LLAMA_TOKEN_ATTR_USER_DEFINED: Integer
+  LLAMA_TOKEN_ATTR_BYTE: Integer
+  LLAMA_TOKEN_ATTR_NORMALIZED: Integer
+  LLAMA_TOKEN_ATTR_LSTRIP: Integer
+  LLAMA_TOKEN_ATTR_RSTRIP: Integer
+  LLAMA_TOKEN_ATTR_SINGLE_WORD: Integer
+
   LLAMA_FTYPE_ALL_F32: Integer
   LLAMA_FTYPE_MOSTLY_F16: Integer
   LLAMA_FTYPE_MOSTLY_Q4_0: Integer
@@ -147,7 +159,7 @@ module LLaMACpp
     def n_params: () -> Integer
     def text: (Integer) -> String
     def score: (Integer) -> Float
-    def
+    def token_attr: (Integer) -> Integer
     def token_bos: () -> Integer
     def token_eos: () -> Integer
     def token_cls: () -> Integer
data/vendor/tmp/llama.cpp/Makefile
CHANGED
@@ -1,7 +1,7 @@
 # Define the default target now so that it is always the first target
 BUILD_TARGETS = \
 main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
-simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama
+simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \
 retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o

 # Binaries only useful for tests
@@ -57,6 +57,8 @@ ifeq ($(UNAME_S),Darwin)
 LLAMA_METAL := 1
 endif

+LLAMA_NO_OPENMP := 1
+
 ifneq ($(UNAME_P),arm)
 SYSCTL_M := $(shell sysctl -n hw.optional.arm64 2>/dev/null)
 ifeq ($(SYSCTL_M),1)
@@ -67,6 +69,10 @@ ifeq ($(UNAME_S),Darwin)
 endif
 endif

+ifdef LLAMA_RPC
+BUILD_TARGETS += rpc-server
+endif
+
 default: $(BUILD_TARGETS)

 test: $(TEST_TARGETS)
@@ -135,12 +141,16 @@ MK_NVCCFLAGS = -std=c++11
 ifdef LLAMA_FAST
 MK_CFLAGS += -Ofast
 HOST_CXXFLAGS += -Ofast
+ifndef LLAMA_DEBUG
 MK_NVCCFLAGS += -O3
+endif # LLAMA_DEBUG
 else
 MK_CFLAGS += -O3
 MK_CXXFLAGS += -O3
+ifndef LLAMA_DEBUG
 MK_NVCCFLAGS += -O3
-endif
+endif # LLAMA_DEBUG
+endif # LLAMA_FAST

 ifndef LLAMA_NO_CCACHE
 CCACHE := $(shell which ccache)
@@ -201,9 +211,10 @@ ifdef LLAMA_SCHED_MAX_COPIES
 endif

 ifdef LLAMA_DEBUG
-MK_CFLAGS
-MK_CXXFLAGS
-MK_LDFLAGS
+MK_CFLAGS += -O0 -g
+MK_CXXFLAGS += -O0 -g
+MK_LDFLAGS += -g
+MK_NVCCFLAGS += -O0 -g

 ifeq ($(UNAME_S),Linux)
 MK_CPPFLAGS += -D_GLIBCXX_ASSERTIONS
@@ -402,6 +413,12 @@ ifndef LLAMA_NO_ACCELERATE
 endif
 endif # LLAMA_NO_ACCELERATE

+ifndef LLAMA_NO_OPENMP
+MK_CPPFLAGS += -DGGML_USE_OPENMP
+MK_CFLAGS += -fopenmp
+MK_CXXFLAGS += -fopenmp
+endif # LLAMA_NO_OPENMP
+
 ifdef LLAMA_OPENBLAS
 MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas)
 MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas)
@@ -418,11 +435,25 @@ ifdef LLAMA_BLIS
 MK_LDFLAGS += -lblis -L/usr/local/lib
 endif # LLAMA_BLIS

+ifdef LLAMA_RPC
+MK_CPPFLAGS += -DGGML_USE_RPC
+OBJS += ggml-rpc.o
+endif # LLAMA_RPC
+
 ifdef LLAMA_CUBLAS
 # LLAMA_CUBLAS is deprecated and will be removed in the future
 LLAMA_CUDA := 1
 endif

+OBJS_CUDA_TEMP_INST = $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-wmma*.cu))
+ifdef LLAMA_CUDA_FA_ALL_QUANTS
+OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*.cu))
+else
+OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu))
+OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu))
+OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*f16-f16.cu))
+endif # LLAMA_CUDA_FA_ALL_QUANTS
+
 ifdef LLAMA_CUDA
 ifneq ('', '$(wildcard /opt/cuda)')
 CUDA_PATH ?= /opt/cuda
@@ -433,6 +464,7 @@ ifdef LLAMA_CUDA
 MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
 OBJS += ggml-cuda.o
 OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
+OBJS += $(OBJS_CUDA_TEMP_INST)
 MK_NVCCFLAGS += -use_fast_math
 ifdef LLAMA_FATAL_WARNINGS
 MK_NVCCFLAGS += -Werror all-warnings
@@ -495,7 +527,10 @@ ifdef LLAMA_CUDA_NO_PEER_COPY
 endif # LLAMA_CUDA_NO_PEER_COPY
 ifdef LLAMA_CUDA_CCBIN
 MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
-endif
+endif # LLAMA_CUDA_CCBIN
+ifdef LLAMA_CUDA_FA_ALL_QUANTS
+MK_NVCCFLAGS += -DGGML_CUDA_FA_ALL_QUANTS
+endif # LLAMA_CUDA_FA_ALL_QUANTS

 ifdef JETSON_EOL_MODULE_DETECT
 define NVCC_COMPILE
@@ -507,30 +542,13 @@ define NVCC_COMPILE
 endef # NVCC_COMPILE
 endif # JETSON_EOL_MODULE_DETECT

-ggml-cuda/%.o: ggml-cuda/%.cu ggml
+ggml-cuda/%.o: ggml-cuda/%.cu ggml.h ggml-common.h ggml-cuda/common.cuh
 	$(NVCC_COMPILE)

 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
 	$(NVCC_COMPILE)
 endif # LLAMA_CUDA

-ifdef LLAMA_CLBLAST
-MK_CPPFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags-only-I clblast OpenCL)
-MK_CFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
-MK_CXXFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
-
-# Mac provides OpenCL as a framework
-ifeq ($(UNAME_S),Darwin)
-MK_LDFLAGS += -lclblast -framework OpenCL
-else
-MK_LDFLAGS += $(shell pkg-config --libs clblast OpenCL)
-endif
-OBJS += ggml-opencl.o
-
-ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-endif # LLAMA_CLBLAST
-
 ifdef LLAMA_VULKAN
 MK_CPPFLAGS += -DGGML_USE_VULKAN
 MK_LDFLAGS += -lvulkan
@@ -573,6 +591,7 @@ ifdef LLAMA_HIP_UMA
 MK_CPPFLAGS += -DGGML_HIP_UMA
 endif # LLAMA_HIP_UMA
 MK_LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
+MK_LDFLAGS += -L$(ROCM_PATH)/lib64 -Wl,-rpath=$(ROCM_PATH)/lib64
 MK_LDFLAGS += -lhipblas -lamdhip64 -lrocblas
 HIPFLAGS += $(addprefix --offload-arch=,$(AMDGPU_TARGETS))
 HIPFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
@@ -586,11 +605,12 @@ ifdef LLAMA_CUDA_NO_PEER_COPY
 endif # LLAMA_CUDA_NO_PEER_COPY
 OBJS += ggml-cuda.o
 OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
+OBJS += $(OBJS_CUDA_TEMP_INST)

 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
 	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<

-ggml-cuda/%.o: ggml-cuda/%.cu ggml
+ggml-cuda/%.o: ggml-cuda/%.cu ggml.h ggml-common.h ggml-cuda/common.cuh
 	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<

 endif # LLAMA_HIPBLAS
@@ -628,11 +648,26 @@ ggml-metal-embed.o: ggml-metal.metal ggml-common.h
 endif
 endif # LLAMA_METAL

+OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
+COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
+COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
+
 ifndef LLAMA_NO_LLAMAFILE
 sgemm.o: sgemm.cpp sgemm.h ggml.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 endif

+ifdef LLAMA_RPC
+ggml-rpc.o: ggml-rpc.cpp ggml-rpc.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+rpc-server.o: examples/rpc/rpc-server.cpp ggml-rpc.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+rpc-server: rpc-server.o ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
+endif # LLAMA_RPC
+
 GF_CC := $(CC)
 include scripts/get-flags.mk

@@ -712,14 +747,9 @@ unicode.o: unicode.cpp unicode.h
 unicode-data.o: unicode-data.cpp unicode-data.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

-OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
-
 llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

-COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
-COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
-
 common.o: common/common.cpp $(COMMON_H_DEPS)
 	$(CXX) $(CXXFLAGS) -c $< -o $@

@@ -754,6 +784,7 @@ lib: llama.o ggml.o $(OBJS)
 clean:
 	rm -vrf *.o tests/*.o *.so *.a *.dll *.dylib benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
 	rm -vrf ggml-cuda/*.o
+	rm -vrf ggml-cuda/template-instances/*.o

 #
 # Examples
@@ -821,7 +852,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/colorthemes.css.hpp examples/server/style.css.hpp examples/server/theme-beeninorder.css.hpp examples/server/theme-ketivah.css.hpp examples/server/theme-mangotango.css.hpp examples/server/theme-playground.css.hpp examples/server/theme-polarnight.css.hpp examples/server/theme-snowstorm.css.hpp examples/server/index.html.hpp examples/server/index-new.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/system-prompts.js.hpp examples/server/prompt-formats.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)

@@ -871,10 +902,6 @@ baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) tra
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-beam-search: examples/beam-search/beam-search.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
 finetune: examples/finetune/finetune.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
data/vendor/tmp/llama.cpp/ggml-alloc.c
CHANGED
@@ -377,7 +377,7 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
     galloc->bufts = calloc(n_bufs, sizeof(ggml_backend_buffer_type_t));
     GGML_ASSERT(galloc->bufts != NULL);

-    galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t)
+    galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t));
     GGML_ASSERT(galloc->buffers != NULL);

     galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
@@ -750,7 +750,7 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
             // this tensor was allocated without ggml-backend
             return;
         }
-        ggml_backend_view_init(
+        ggml_backend_view_init(tensor);
     }
 } else {
     if (tensor->data == NULL) {
@@ -899,12 +899,12 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
             if (t->view_src == NULL) {
                 ggml_tallocr_alloc(&tallocr, t);
             } else if (t->buffer == NULL) {
-                ggml_backend_view_init(
+                ggml_backend_view_init(t);
             }
         } else {
             if (t->view_src != NULL && t->buffer == NULL) {
                 // view of a pre-allocated tensor
-                ggml_backend_view_init(
+                ggml_backend_view_init(t);
             }
         }
     }
data/vendor/tmp/llama.cpp/ggml-backend.c
CHANGED
@@ -151,7 +151,7 @@ void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer) {
 bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst) {
     ggml_backend_buffer_t dst_buf = dst->view_src ? dst->view_src->buffer : dst->buffer;
     if (dst_buf->iface.cpy_tensor) {
-        return
+        return dst_buf->iface.cpy_tensor(dst_buf, src, dst);
     }
     return false;
 }
@@ -1887,15 +1887,15 @@ ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched,

 // utils

-void ggml_backend_view_init(
+void ggml_backend_view_init(struct ggml_tensor * tensor) {
     GGML_ASSERT(tensor->buffer == NULL);
     GGML_ASSERT(tensor->view_src != NULL);
     GGML_ASSERT(tensor->view_src->buffer != NULL);
     GGML_ASSERT(tensor->view_src->data != NULL);

-    tensor->buffer = buffer;
+    tensor->buffer = tensor->view_src->buffer;
     tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
-    ggml_backend_buffer_init_tensor(buffer, tensor);
+    ggml_backend_buffer_init_tensor(tensor->buffer, tensor);
 }

 void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) {
@@ -1954,7 +1954,7 @@ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_te
     struct ggml_tensor * dst = node_copies[id];
     if (dst->view_src != NULL) {
         graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
-        ggml_backend_view_init(dst
+        ggml_backend_view_init(dst);
     }
     else {
         ggml_backend_tensor_copy(src, dst);
data/vendor/tmp/llama.cpp/ggml-backend.h
CHANGED
@@ -225,7 +225,7 @@ extern "C" {

     // Tensor initialization
     GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
-    GGML_API void ggml_backend_view_init(
+    GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);


 #ifdef __cplusplus
data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu
ADDED
@@ -0,0 +1,47 @@
+#include "acc.cuh"
+
+static __global__ void acc_f32(const float * x, const float * y, float * dst, const int ne,
+    const int ne10, const int ne11, const int ne12,
+    const int nb1, const int nb2, int offset) {
+    const int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i >= ne) {
+        return;
+    }
+    int src1_idx = i - offset;
+    int oz = src1_idx / nb2;
+    int oy = (src1_idx - (oz * nb2)) / nb1;
+    int ox = src1_idx % nb1;
+    if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) {
+        dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11];
+    } else {
+        dst[i] = x[i];
+    }
+}
+
+static void acc_f32_cuda(const float * x, const float * y, float * dst, const int n_elements,
+    const int ne10, const int ne11, const int ne12,
+    const int nb1, const int nb2, const int offset, cudaStream_t stream) {
+    int num_blocks = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE;
+    acc_f32<<<num_blocks, CUDA_ACC_BLOCK_SIZE, 0, stream>>>(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset);
+}
+
+void ggml_cuda_op_acc(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+    const float * src0_d = (const float *)src0->data;
+    const float * src1_d = (const float *)src1->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported
+
+    int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
+    int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
+    // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused
+    int offset = dst->op_params[3] / 4; // offset in bytes
+
+    acc_f32_cuda(src0_d, src1_d, dst_d, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, stream);
+}
data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu
ADDED
@@ -0,0 +1,34 @@
+#include "arange.cuh"
+
+static __global__ void arange_f32(float * dst, const int ne0, const float start, const float step) {
+    // blockIDx.x: idx of ne0 / BLOCK_SIZE
+    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (nidx >= ne0) {
+        return;
+    }
+    dst[nidx] = start + step * nidx;
+}
+
+static void arange_f32_cuda(float * dst, const int ne0, const float start, const float step, cudaStream_t stream) {
+    int num_blocks = (ne0 + CUDA_ARANGE_BLOCK_SIZE - 1) / CUDA_ARANGE_BLOCK_SIZE;
+    arange_f32<<<num_blocks, CUDA_ARANGE_BLOCK_SIZE, 0, stream>>>(dst, ne0, start, step);
+}
+
+void ggml_cuda_op_arange(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    float start;
+    float stop;
+    float step;
+    memcpy(&start, (float *)dst->op_params + 0, sizeof(float));
+    memcpy(&stop,  (float *)dst->op_params + 1, sizeof(float));
+    memcpy(&step,  (float *)dst->op_params + 2, sizeof(float));
+
+    int64_t steps = (int64_t)ceil((stop - start) / step);
+    GGML_ASSERT(ggml_nelements(dst) == steps);
+
+    arange_f32_cuda(dst_d, dst->ne[0], start, step, stream);
+}
data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu
ADDED
@@ -0,0 +1,103 @@
+#include "argsort.cuh"
+
+template<typename T>
+static inline __device__ void ggml_cuda_swap(T & a, T & b) {
+    T tmp = a;
+    a = b;
+    b = tmp;
+}
+
+template<ggml_sort_order order>
+static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int ncols, int ncols_pad) {
+    // bitonic sort
+    int col = threadIdx.x;
+    int row = blockIdx.y;
+
+    if (col >= ncols_pad) {
+        return;
+    }
+
+    const float * x_row = x + row * ncols;
+    extern __shared__ int dst_row[];
+
+    // initialize indices
+    dst_row[col] = col;
+
+    __syncthreads();
+
+    for (int k = 2; k <= ncols_pad; k *= 2) {
+        for (int j = k / 2; j > 0; j /= 2) {
+            int ixj = col ^ j;
+            if (ixj > col) {
+                if ((col & k) == 0) {
+                    if (dst_row[col] >= ncols ||
+                        (dst_row[ixj] < ncols && (order == GGML_SORT_ORDER_ASC ?
+                            x_row[dst_row[col]] > x_row[dst_row[ixj]] :
+                            x_row[dst_row[col]] < x_row[dst_row[ixj]]))
+                    ) {
+                        ggml_cuda_swap(dst_row[col], dst_row[ixj]);
+                    }
+                } else {
+                    if (dst_row[ixj] >= ncols ||
+                        (dst_row[col] < ncols && (order == GGML_SORT_ORDER_ASC ?
+                            x_row[dst_row[col]] < x_row[dst_row[ixj]] :
+                            x_row[dst_row[col]] > x_row[dst_row[ixj]]))
+                    ) {
+                        ggml_cuda_swap(dst_row[col], dst_row[ixj]);
+                    }
+                }
+            }
+            __syncthreads();
+        }
+    }
+
+    // copy the result to dst without the padding
+    if (col < ncols) {
+        dst[row * ncols + col] = dst_row[col];
+    }
+}
+
+static int next_power_of_2(int x) {
+    int n = 1;
+    while (n < x) {
+        n *= 2;
+    }
+    return n;
+}
+
+static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) {
+    // bitonic sort requires ncols to be power of 2
+    const int ncols_pad = next_power_of_2(ncols);
+
+    const dim3 block_dims(ncols_pad, 1, 1);
+    const dim3 block_nums(1, nrows, 1);
+    const size_t shared_mem = ncols_pad * sizeof(int);
+
+    GGML_ASSERT(shared_mem <= ggml_cuda_info().devices[ggml_cuda_get_device()].smpb);
+
+    if (order == GGML_SORT_ORDER_ASC) {
+        k_argsort_f32_i32<GGML_SORT_ORDER_ASC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
+    } else if (order == GGML_SORT_ORDER_DESC) {
+        k_argsort_f32_i32<GGML_SORT_ORDER_DESC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
+    } else {
+        GGML_ASSERT(false);
+    }
+}
+
+void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const float * src0_d = (const float *)src0->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_I32);
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    const int64_t ncols = src0->ne[0];
+    const int64_t nrows = ggml_nrows(src0);
+
+    enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
+
+    argsort_f32_i32_cuda(src0_d, (int *)dst_d, ncols, nrows, order, stream);
+}