llama_cpp 0.12.0 → 0.12.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/ext/llama_cpp/llama_cpp.cpp +14 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -0
- data/vendor/tmp/llama.cpp/Makefile +8 -2
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-backend.c +7 -3
- data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +758 -39
- data/vendor/tmp/llama.cpp/ggml-metal.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +86 -7
- data/vendor/tmp/llama.cpp/ggml-metal.metal +692 -8
- data/vendor/tmp/llama.cpp/ggml-quants.c +635 -1
- data/vendor/tmp/llama.cpp/ggml-quants.h +25 -1
- data/vendor/tmp/llama.cpp/ggml.c +91 -52
- data/vendor/tmp/llama.cpp/ggml.h +14 -11
- data/vendor/tmp/llama.cpp/llama.cpp +79 -30
- data/vendor/tmp/llama.cpp/llama.h +14 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 13381408318e71cc1fc55c40ee9be6e62ad9e3ad6a8ce39279bb8040614e9b3b
+  data.tar.gz: 6456734b18865a7811f08d0d9d599771f574f4b59bd5b54a964ece7428115907
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 1014349771d7aa3c318027de11603e96d5482e4bd5b1bcf0fd4874040245daf44c4cfb801077a698846459a7619ca9e01e0afc3507fc7bd519e7ba68a000a15d
+  data.tar.gz: 1315ca8954397edb0db93347a10762e35f829377ef3dba0ea9cf6c67f986972ac8e75b46c410a3ceceefc0474f2abbe6f441e56a60e789ef1d2617fc15cfb29e
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,9 @@
+## [[0.12.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.0...v0.12.1)] - 2024-01-13
+
+- Bump bundled llama.cpp from b1768 to b1833.
+- Add model file type constants.
+- Add `kv_cache_seq_div` method to `Context`.
+
 ## [[0.12.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.11.1...v0.12.0)] - 2024-01-11
 
 - Add `get_one` singleton method to `Batch`.
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -2026,6 +2026,7 @@ public:
     rb_define_method(rb_cLLaMAContext, "kv_cache_seq_cp", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_cp), 4);
     rb_define_method(rb_cLLaMAContext, "kv_cache_seq_keep", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_keep), 1);
     rb_define_method(rb_cLLaMAContext, "kv_cache_seq_shift", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_shift), 4);
+    rb_define_method(rb_cLLaMAContext, "kv_cache_seq_div", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_div), 4);
     rb_define_method(rb_cLLaMAContext, "set_rng_seed", RUBY_METHOD_FUNC(_llama_context_set_rng_seed), 1);
     rb_define_method(rb_cLLaMAContext, "load_session_file", RUBY_METHOD_FUNC(_llama_context_load_session_file), -1);
     rb_define_method(rb_cLLaMAContext, "save_session_file", RUBY_METHOD_FUNC(_llama_context_save_session_file), -1);
@@ -2378,6 +2379,16 @@ private:
     return Qnil;
   }
 
+  static VALUE _llama_context_kv_cache_seq_div(VALUE self, VALUE seq_id, VALUE p0, VALUE p1, VALUE d) {
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eArgError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+    llama_kv_cache_seq_div(ptr->ctx, NUM2INT(seq_id), NUM2INT(p0), NUM2INT(p1), NUM2INT(d));
+    return Qnil;
+  }
+
   static VALUE _llama_context_set_rng_seed(VALUE self, VALUE seed_) {
     LLaMAContextWrapper* ptr = get_llama_context(self);
     if (ptr->ctx == NULL) {
@@ -3209,6 +3220,9 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_S));
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_K_M", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_M));
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q6_K", INT2NUM(LLAMA_FTYPE_MOSTLY_Q6_K));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ2_XXS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ2_XXS));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ2_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ2_XS));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q2_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q2_K_S));
 
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_GUESSED", INT2NUM(LLAMA_FTYPE_GUESSED));
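For reference, a minimal Ruby sketch of the new binding in use. The model path is hypothetical, the constructor keywords follow the gem's documented Model/Context API, and the semantics follow llama.cpp's llama_kv_cache_seq_div, which integer-divides the cached token positions of a sequence in [p0, p1) by d (as used by Self-Extend style context extension):

require 'llama_cpp'

# Build a model and context (hypothetical model file).
model   = LLaMACpp::Model.new(model_path: 'model.gguf', params: LLaMACpp::ModelParams.new)
context = LLaMACpp::Context.new(model: model, params: LLaMACpp::ContextParams.new)

# After decoding tokens into sequence 0, compress the positions of
# cache cells 0...512 by a factor of 2; this mirrors
# llama_kv_cache_seq_div(ctx, 0, 0, 512, 2) in C.
context.kv_cache_seq_div(0, 0, 512, 2)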
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.12.0'
+  VERSION = '0.12.1'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'b1768'
+  LLAMA_CPP_VERSION = 'b1833'
 end
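After updating the gem, the two constants can be read back to confirm the installed and bundled versions (a trivial sketch):

require 'llama_cpp'

puts LLaMACpp::VERSION            # => "0.12.1"
puts LLaMACpp::LLAMA_CPP_VERSION  # => "b1833"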
data/sig/llama_cpp.rbs
CHANGED
@@ -22,6 +22,9 @@ module LLaMACpp
   LLAMA_FTYPE_MOSTLY_Q5_K_S: Integer
   LLAMA_FTYPE_MOSTLY_Q5_K_M: Integer
   LLAMA_FTYPE_MOSTLY_Q6_K: Integer
+  LLAMA_FTYPE_MOSTLY_IQ2_XXS: Integer
+  LLAMA_FTYPE_MOSTLY_IQ2_XS: Integer
+  LLAMA_FTYPE_MOSTLY_Q2_K_S: Integer
 
   LLAMA_KV_OVERRIDE_INT: Integer
   LLAMA_KV_OVERRIDE_FLOAT: Integer
@@ -193,6 +196,7 @@ module LLaMACpp
   def kv_cache_seq_cp: (Integer, Integer, Integer, Integer) -> void
   def kv_cache_seq_keep: (Integer) -> void
   def kv_cache_seq_shift: (Integer, Integer, Integer, Integer) -> void
+  def kv_cache_seq_div: (Integer, Integer, Integer, Integer) -> void
   def set_rng_seed: (Integer) -> void
   def load_session_file: (session_path: String) -> void
   def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
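The three new file-type constants are plain Integers mirroring llama.cpp's llama_ftype enum values. A small hypothetical helper shows one way they might be used, e.g. to label a model's quantization format in logs:

require 'llama_cpp'

# Hypothetical lookup table for the constants added in 0.12.1.
NEW_FTYPE_LABELS = {
  LLaMACpp::LLAMA_FTYPE_MOSTLY_IQ2_XXS => 'IQ2_XXS',
  LLaMACpp::LLAMA_FTYPE_MOSTLY_IQ2_XS  => 'IQ2_XS',
  LLaMACpp::LLAMA_FTYPE_MOSTLY_Q2_K_S  => 'Q2_K_S'
}.freeze

def ftype_label(ftype)
  NEW_FTYPE_LABELS.fetch(ftype) { "unknown ftype (#{ftype})" }
end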
data/vendor/tmp/llama.cpp/Makefile
CHANGED
@@ -1,8 +1,8 @@
 # Define the default target now so that it is always the first target
 BUILD_TARGETS = \
-	main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
+	main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
 	simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \
-	speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup tests/test-c.o
+	speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey tests/test-c.o
 
 # Binaries only useful for tests
 TEST_TARGETS = \
@@ -620,6 +620,9 @@ quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.o ggml.
 perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
+imatrix: examples/imatrix/imatrix.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
 embedding: examples/embedding/embedding.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
@@ -671,6 +674,9 @@ lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 lookup: examples/lookup/lookup.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
+passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
 ifdef LLAMA_METAL
 metal: examples/metal/metal.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
data/vendor/tmp/llama.cpp/ggml-backend-impl.h
CHANGED
@@ -90,7 +90,7 @@ extern "C" {
         void (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
 
         // compute graph without a plan
-        void (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
+        bool (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
 
         // check if the backend supports an operation
         bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
data/vendor/tmp/llama.cpp/ggml-backend.c
CHANGED
@@ -195,11 +195,14 @@ void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     ggml_backend_synchronize(backend);
 }
 
-void ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
-    backend->iface.graph_compute(backend, cgraph);
+bool ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    if (!backend->iface.graph_compute(backend, cgraph)) {
+        return false;
+    }
 
     // TODO: optional sync
     ggml_backend_synchronize(backend);
+    return true;
 }
 
 bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
@@ -597,7 +600,7 @@ static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     GGML_UNUSED(backend);
 }
 
-static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
 
     struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
@@ -611,6 +614,7 @@ static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     cplan.work_data = cpu_ctx->work_data;
 
     ggml_graph_compute(cgraph, &cplan);
+    return true;
 }
 
 static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
data/vendor/tmp/llama.cpp/ggml-backend.h
CHANGED
@@ -58,7 +58,7 @@ extern "C" {
 
     GGML_API void ggml_backend_graph_plan_free    (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
    GGML_API void ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-    GGML_API void ggml_backend_graph_compute      (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+    GGML_API bool ggml_backend_graph_compute      (ggml_backend_t backend, struct ggml_cgraph * cgraph);
     GGML_API bool ggml_backend_supports_op        (ggml_backend_t backend, const struct ggml_tensor * op);
 
     // tensor copy between different backends