llama_cpp 0.13.0 → 0.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/ext/llama_cpp/llama_cpp.cpp +59 -26
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +6 -4
- data/vendor/tmp/llama.cpp/Makefile +2 -3
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +4 -3
- data/vendor/tmp/llama.cpp/ggml-backend.c +18 -21
- data/vendor/tmp/llama.cpp/ggml-backend.h +16 -15
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +949 -168
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +2 -2
- data/vendor/tmp/llama.cpp/ggml-metal.m +63 -7
- data/vendor/tmp/llama.cpp/ggml-metal.metal +120 -75
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +2 -2
- data/vendor/tmp/llama.cpp/ggml-quants.c +178 -133
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +3432 -1118
- data/vendor/tmp/llama.cpp/ggml-sycl.h +5 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +39336 -43461
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +1327 -773
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +1 -0
- data/vendor/tmp/llama.cpp/ggml.c +227 -15
- data/vendor/tmp/llama.cpp/ggml.h +30 -4
- data/vendor/tmp/llama.cpp/llama.cpp +631 -211
- data/vendor/tmp/llama.cpp/llama.h +28 -10
- metadata +2 -2
|
@@ -129,6 +129,7 @@ extern "C" {
|
|
|
129
129
|
};
|
|
130
130
|
|
|
131
131
|
enum llama_pooling_type {
|
|
132
|
+
LLAMA_POOLING_TYPE_UNSPECIFIED = -1,
|
|
132
133
|
LLAMA_POOLING_TYPE_NONE = 0,
|
|
133
134
|
LLAMA_POOLING_TYPE_MEAN = 1,
|
|
134
135
|
LLAMA_POOLING_TYPE_CLS = 2,
|
|
@@ -162,7 +163,7 @@ extern "C" {
|
|
|
162
163
|
// - embd : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
|
|
163
164
|
// - pos : the positions of the respective token in the sequence
|
|
164
165
|
// - seq_id : the sequence to which the respective token belongs
|
|
165
|
-
// - logits : if zero, the logits for the respective token will not be output
|
|
166
|
+
// - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output
|
|
166
167
|
//
|
|
167
168
|
typedef struct llama_batch {
|
|
168
169
|
int32_t n_tokens;
|
|
@@ -172,7 +173,7 @@ extern "C" {
|
|
|
172
173
|
llama_pos * pos;
|
|
173
174
|
int32_t * n_seq_id;
|
|
174
175
|
llama_seq_id ** seq_id;
|
|
175
|
-
int8_t * logits;
|
|
176
|
+
int8_t * logits; // TODO: rename this to "output"
|
|
176
177
|
|
|
177
178
|
// NOTE: helpers for smooth API transition - can be deprecated in the future
|
|
178
179
|
// for future-proof code, use the above fields instead and ignore everything below
|
|
@@ -236,7 +237,10 @@ extern "C" {
|
|
|
236
237
|
uint32_t n_batch; // prompt processing maximum batch size
|
|
237
238
|
uint32_t n_threads; // number of threads to use for generation
|
|
238
239
|
uint32_t n_threads_batch; // number of threads to use for batch processing
|
|
239
|
-
|
|
240
|
+
|
|
241
|
+
enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
|
|
242
|
+
enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
|
|
243
|
+
// (ignored if no pooling layer)
|
|
240
244
|
|
|
241
245
|
// ref: https://github.com/ggerganov/llama.cpp/pull/2054
|
|
242
246
|
float rope_freq_base; // RoPE base frequency, 0 = from model
|
|
@@ -255,10 +259,15 @@ extern "C" {
|
|
|
255
259
|
enum ggml_type type_v; // data type for V cache
|
|
256
260
|
|
|
257
261
|
// Keep the booleans together to avoid misalignment during copy-by-value.
|
|
258
|
-
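bool logits_all; // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)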
|
|
259
|
-
bool embedding; // embedding mode only
|
|
262
|
+
bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
|
|
263
|
+
bool embeddings; // if true, extract embeddings (together with logits)
|
|
260
264
|
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
|
|
261
|
-
|
|
265
|
+
|
|
266
|
+
// Abort callback
|
|
267
|
+
// if it returns true, execution of llama_decode() will be aborted
|
|
268
|
+
// currently works only with CPU execution
|
|
269
|
+
ggml_abort_callback abort_callback;
|
|
270
|
+
void * abort_callback_data;
|
|
262
271
|
};
|
|
263
272
|
|
|
264
273
|
// model quantization parameters
|
|
@@ -632,7 +641,10 @@ extern "C" {
|
|
|
632
641
|
// n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
|
|
633
642
|
LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch);
|
|
634
643
|
|
|
635
|
-
// Token logits obtained from the last call to llama_eval()
|
|
644
|
+
// Set abort callback
|
|
645
|
+
LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data);
|
|
646
|
+
|
|
647
|
+
// Token logits obtained from the last call to llama_decode()
|
|
636
648
|
// The logits for the last token are stored in the last row
|
|
637
649
|
// Logits for which llama_batch.logits[i] == 0 are undefined
|
|
638
650
|
// Rows: n_tokens provided with llama_batch
|
|
@@ -643,14 +655,20 @@ extern "C" {
|
|
|
643
655
|
// llama_get_logits(ctx) + i*n_vocab
|
|
644
656
|
LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);
|
|
645
657
|
|
|
646
|
-
// Get the embeddings for the input
|
|
647
|
-
// shape: [n_embd] (1-dimensional)
|
|
658
|
+
// Get all output token embeddings
|
|
659
|
+
// shape: [n_tokens*n_embd] (1-dimensional)
|
|
648
660
|
LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
|
|
649
661
|
|
|
650
|
-
// Get the embeddings for the ith sequence
|
|
662
|
+
// Get the embeddings for the ith token
|
|
651
663
|
// llama_get_embeddings(ctx) + i*n_embd
|
|
664
|
+
// shape: [n_embd] (1-dimensional)
|
|
652
665
|
LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
|
|
653
666
|
|
|
667
|
+
// Get the embeddings for a sequence id
|
|
668
|
+
// Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE
|
|
669
|
+
// shape: [n_embd] (1-dimensional)
|
|
670
|
+
LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);
|
|
671
|
+
|
|
654
672
|
//
|
|
655
673
|
// Vocab
|
|
656
674
|
//
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: llama_cpp
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.13.0
|
|
4
|
+
version: 0.14.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- yoshoku
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2024-03-02 00:00:00.000000000 Z
|
|
11
|
+
date: 2024-03-09 00:00:00.000000000 Z
|
|
12
12
|
dependencies: []
|
|
13
13
|
description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
|
|
14
14
|
email:
|