llama_cpp 0.12.0 → 0.12.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/ext/llama_cpp/llama_cpp.cpp +14 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -0
- data/vendor/tmp/llama.cpp/Makefile +8 -2
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-backend.c +7 -3
- data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +758 -39
- data/vendor/tmp/llama.cpp/ggml-metal.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +86 -7
- data/vendor/tmp/llama.cpp/ggml-metal.metal +692 -8
- data/vendor/tmp/llama.cpp/ggml-quants.c +635 -1
- data/vendor/tmp/llama.cpp/ggml-quants.h +25 -1
- data/vendor/tmp/llama.cpp/ggml.c +91 -52
- data/vendor/tmp/llama.cpp/ggml.h +14 -11
- data/vendor/tmp/llama.cpp/llama.cpp +79 -30
- data/vendor/tmp/llama.cpp/llama.h +14 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 13381408318e71cc1fc55c40ee9be6e62ad9e3ad6a8ce39279bb8040614e9b3b
+  data.tar.gz: 6456734b18865a7811f08d0d9d599771f574f4b59bd5b54a964ece7428115907
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 1014349771d7aa3c318027de11603e96d5482e4bd5b1bcf0fd4874040245daf44c4cfb801077a698846459a7619ca9e01e0afc3507fc7bd519e7ba68a000a15d
+  data.tar.gz: 1315ca8954397edb0db93347a10762e35f829377ef3dba0ea9cf6c67f986972ac8e75b46c410a3ceceefc0474f2abbe6f441e56a60e789ef1d2617fc15cfb29e
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,9 @@
+## [[0.12.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.0...v0.12.1)] - 2024-01-13
+
+- Bump bundled llama.cpp from b1768 to b1833.
+- Add model file type constants.
+- Add `kv_cache_seq_div` method to `Context`.
+
 ## [[0.12.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.11.1...v0.12.0)] - 2024-01-11
 
 - Add `get_one` singleton method to `Batch`.
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -2026,6 +2026,7 @@ public:
     rb_define_method(rb_cLLaMAContext, "kv_cache_seq_cp", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_cp), 4);
     rb_define_method(rb_cLLaMAContext, "kv_cache_seq_keep", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_keep), 1);
     rb_define_method(rb_cLLaMAContext, "kv_cache_seq_shift", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_shift), 4);
+    rb_define_method(rb_cLLaMAContext, "kv_cache_seq_div", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_div), 4);
     rb_define_method(rb_cLLaMAContext, "set_rng_seed", RUBY_METHOD_FUNC(_llama_context_set_rng_seed), 1);
     rb_define_method(rb_cLLaMAContext, "load_session_file", RUBY_METHOD_FUNC(_llama_context_load_session_file), -1);
     rb_define_method(rb_cLLaMAContext, "save_session_file", RUBY_METHOD_FUNC(_llama_context_save_session_file), -1);
@@ -2378,6 +2379,16 @@ private:
     return Qnil;
   }
 
+  static VALUE _llama_context_kv_cache_seq_div(VALUE self, VALUE seq_id, VALUE p0, VALUE p1, VALUE d) {
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eArgError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+    llama_kv_cache_seq_div(ptr->ctx, NUM2INT(seq_id), NUM2INT(p0), NUM2INT(p1), NUM2INT(d));
+    return Qnil;
+  }
+
   static VALUE _llama_context_set_rng_seed(VALUE self, VALUE seed_) {
     LLaMAContextWrapper* ptr = get_llama_context(self);
     if (ptr->ctx == NULL) {
@@ -3209,6 +3220,9 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_S));
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_K_M", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_M));
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q6_K", INT2NUM(LLAMA_FTYPE_MOSTLY_Q6_K));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ2_XXS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ2_XXS));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ2_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ2_XS));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q2_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q2_K_S));
 
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_GUESSED", INT2NUM(LLAMA_FTYPE_GUESSED));
 
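The binding above forwards `kv_cache_seq_div` to llama.cpp's `llama_kv_cache_seq_div`, taking a sequence id, a position range `[p0, p1)`, and a divisor `d`. A minimal Ruby sketch of how the new method might be called; the model path and the `ModelParams`/`ContextParams` setup are illustrative assumptions, not part of this diff:

```ruby
require 'llama_cpp'

# Assumed setup (not shown in this diff): load a model and build a context.
model = LLaMACpp::Model.new(model_path: 'model.gguf', params: LLaMACpp::ModelParams.new)
context = LLaMACpp::Context.new(model: model, params: LLaMACpp::ContextParams.new)

# ... decode some tokens for sequence 0 ...

# Integer-divide the cached positions of sequence 0 in the range [0, 128) by 2,
# mirroring llama_kv_cache_seq_div(ctx, seq_id, p0, p1, d).
context.kv_cache_seq_div(0, 0, 128, 2)
```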
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.12.0'
+  VERSION = '0.12.1'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'b1768'
+  LLAMA_CPP_VERSION = 'b1833'
 end
data/sig/llama_cpp.rbs
CHANGED
@@ -22,6 +22,9 @@ module LLaMACpp
   LLAMA_FTYPE_MOSTLY_Q5_K_S: Integer
   LLAMA_FTYPE_MOSTLY_Q5_K_M: Integer
   LLAMA_FTYPE_MOSTLY_Q6_K: Integer
+  LLAMA_FTYPE_MOSTLY_IQ2_XXS: Integer
+  LLAMA_FTYPE_MOSTLY_IQ2_XS: Integer
+  LLAMA_FTYPE_MOSTLY_Q2_K_S: Integer
 
   LLAMA_KV_OVERRIDE_INT: Integer
   LLAMA_KV_OVERRIDE_FLOAT: Integer
@@ -193,6 +196,7 @@ module LLaMACpp
     def kv_cache_seq_cp: (Integer, Integer, Integer, Integer) -> void
     def kv_cache_seq_keep: (Integer) -> void
     def kv_cache_seq_shift: (Integer, Integer, Integer, Integer) -> void
+    def kv_cache_seq_div: (Integer, Integer, Integer, Integer) -> void
     def set_rng_seed: (Integer) -> void
     def load_session_file: (session_path: String) -> void
     def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
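The three new file-type constants added above are plain Integer ids on the `LLaMACpp` module. A hedged sketch of passing one of them to quantization, assuming `ModelQuantizeParams` and `LLaMACpp.model_quantize` behave as in earlier releases of the gem (the file paths are placeholders):

```ruby
require 'llama_cpp'

# Assumption: ModelQuantizeParams#ftype= and LLaMACpp.model_quantize work as in
# prior llama_cpp.rb versions; paths below are placeholders.
params = LLaMACpp::ModelQuantizeParams.new
params.ftype = LLaMACpp::LLAMA_FTYPE_MOSTLY_Q2_K_S

LLaMACpp.model_quantize(input_path: 'model-f16.gguf', output_path: 'model-q2_k_s.gguf', params: params)
```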
data/vendor/tmp/llama.cpp/Makefile
CHANGED
@@ -1,8 +1,8 @@
 # Define the default target now so that it is always the first target
 BUILD_TARGETS = \
-	main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
+	main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
 	simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \
-	speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup tests/test-c.o
+	speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey tests/test-c.o
 
 # Binaries only useful for tests
 TEST_TARGETS = \
@@ -620,6 +620,9 @@ quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.o ggml.
 perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
+imatrix: examples/imatrix/imatrix.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
 embedding: examples/embedding/embedding.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
@@ -671,6 +674,9 @@ lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS
 lookup: examples/lookup/lookup.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
+passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
 ifdef LLAMA_METAL
 metal: examples/metal/metal.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
data/vendor/tmp/llama.cpp/ggml-backend-impl.h
CHANGED
@@ -90,7 +90,7 @@ extern "C" {
         void (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
 
         // compute graph without a plan
-        void (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
+        bool (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
 
         // check if the backend supports an operation
         bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
data/vendor/tmp/llama.cpp/ggml-backend.c
CHANGED
@@ -195,11 +195,14 @@ void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_
     ggml_backend_synchronize(backend);
 }
 
-void ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
-    backend->iface.graph_compute(backend, cgraph);
+bool ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    if (!backend->iface.graph_compute(backend, cgraph)) {
+        return false;
+    }
 
     // TODO: optional sync
     ggml_backend_synchronize(backend);
+    return true;
 }
 
 bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
@@ -597,7 +600,7 @@ static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_bac
     GGML_UNUSED(backend);
 }
 
-static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
 
     struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
@@ -611,6 +614,7 @@ static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_c
     cplan.work_data = cpu_ctx->work_data;
 
     ggml_graph_compute(cgraph, &cplan);
+    return true;
 }
 
 static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
data/vendor/tmp/llama.cpp/ggml-backend.h
CHANGED
@@ -58,7 +58,7 @@ extern "C" {
 
     GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
    GGML_API void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-    GGML_API void ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+    GGML_API bool ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
     GGML_API bool ggml_backend_supports_op (ggml_backend_t backend, const struct ggml_tensor * op);
 
     // tensor copy between different backends