llama_cpp 0.15.3 → 0.16.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/ext/llama_cpp/extconf.rb +1 -2
- data/ext/llama_cpp/llama_cpp.cpp +27 -3
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +15 -1
- data/vendor/tmp/llama.cpp/Makefile +66 -36
- data/vendor/tmp/llama.cpp/ggml-alloc.c +4 -4
- data/vendor/tmp/llama.cpp/ggml-backend.c +5 -5
- data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu +47 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu +34 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +103 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/binbcast.cu +280 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/clamp.cu +34 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/concat.cu +196 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/convert.cu +686 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/cpy.cu +490 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/diagmask.cu +40 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +662 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +319 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f32.cu +312 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn.cu +345 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/getrows.cu +178 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/im2col.cu +104 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +1564 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +404 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/norm.cu +221 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/pad.cu +49 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/pool2d.cu +94 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +45 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/rope.cu +271 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/scale.cu +31 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +205 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/sumrows.cu +40 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +10 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +9 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +10 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +10 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +8 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu +47 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +266 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/upscale.cu +51 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +35 -16
- data/vendor/tmp/llama.cpp/ggml-impl.h +4 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +21 -7
- data/vendor/tmp/llama.cpp/ggml-metal.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +99 -35
- data/vendor/tmp/llama.cpp/ggml-metal.metal +146 -80
- data/vendor/tmp/llama.cpp/ggml-quants.c +101 -11
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +75 -58
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +345 -227
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +99301 -39793
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +458 -329
- data/vendor/tmp/llama.cpp/ggml.c +301 -409
- data/vendor/tmp/llama.cpp/ggml.h +19 -23
- data/vendor/tmp/llama.cpp/llama.cpp +855 -651
- data/vendor/tmp/llama.cpp/llama.h +28 -48
- metadata +121 -6
- data/vendor/tmp/llama.cpp/ggml-mpi.c +0 -216
- data/vendor/tmp/llama.cpp/ggml-mpi.h +0 -39
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +0 -2305
- data/vendor/tmp/llama.cpp/ggml-opencl.h +0 -36
data/vendor/tmp/llama.cpp/llama.h CHANGED

@@ -85,6 +85,7 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_QWEN2          = 11,
         LLAMA_VOCAB_PRE_TYPE_OLMO           = 12,
         LLAMA_VOCAB_PRE_TYPE_DBRX           = 13,
+        LLAMA_VOCAB_PRE_TYPE_SMAUG          = 14,
     };

     // note: these values should be synchronized with ggml_rope
@@ -96,7 +97,7 @@ extern "C" {
         LLAMA_ROPE_TYPE_GLM  = 4,
     };

-    enum llama_token_type {
+    enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file
         LLAMA_TOKEN_TYPE_UNDEFINED = 0,
         LLAMA_TOKEN_TYPE_NORMAL    = 1,
         LLAMA_TOKEN_TYPE_UNKNOWN   = 2,
@@ -106,6 +107,20 @@ extern "C" {
         LLAMA_TOKEN_TYPE_BYTE      = 6,
     };

+    enum llama_token_attr {
+        LLAMA_TOKEN_ATTR_UNDEFINED    = 0,
+        LLAMA_TOKEN_ATTR_UNKNOWN      = 1 << 0,
+        LLAMA_TOKEN_ATTR_UNUSED       = 1 << 1,
+        LLAMA_TOKEN_ATTR_NORMAL       = 1 << 2,
+        LLAMA_TOKEN_ATTR_CONTROL      = 1 << 3, // SPECIAL?
+        LLAMA_TOKEN_ATTR_USER_DEFINED = 1 << 4,
+        LLAMA_TOKEN_ATTR_BYTE         = 1 << 5,
+        LLAMA_TOKEN_ATTR_NORMALIZED   = 1 << 6,
+        LLAMA_TOKEN_ATTR_LSTRIP       = 1 << 7,
+        LLAMA_TOKEN_ATTR_RSTRIP       = 1 << 8,
+        LLAMA_TOKEN_ATTR_SINGLE_WORD  = 1 << 9,
+    };
+
     // model file types
     enum llama_ftype {
         LLAMA_FTYPE_ALL_F32 = 0,
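The new `llama_token_attr` values are bit flags (in contrast to `llama_token_type`, whose values are mutually exclusive), so a single token can now carry several attributes at once. A minimal standalone sketch, not part of this diff, of how the flags compose and are tested:

```c
#include <stdio.h>

/* local mirror of the bit-flag enum added to llama.h above */
enum llama_token_attr {
    LLAMA_TOKEN_ATTR_UNDEFINED    = 0,
    LLAMA_TOKEN_ATTR_UNKNOWN      = 1 << 0,
    LLAMA_TOKEN_ATTR_UNUSED       = 1 << 1,
    LLAMA_TOKEN_ATTR_NORMAL       = 1 << 2,
    LLAMA_TOKEN_ATTR_CONTROL      = 1 << 3,
    LLAMA_TOKEN_ATTR_USER_DEFINED = 1 << 4,
    LLAMA_TOKEN_ATTR_BYTE         = 1 << 5,
    LLAMA_TOKEN_ATTR_NORMALIZED   = 1 << 6,
    LLAMA_TOKEN_ATTR_LSTRIP       = 1 << 7,
    LLAMA_TOKEN_ATTR_RSTRIP       = 1 << 8,
    LLAMA_TOKEN_ATTR_SINGLE_WORD  = 1 << 9,
};

int main(void) {
    /* a hypothetical user-defined token that strips whitespace on both sides */
    unsigned attr = LLAMA_TOKEN_ATTR_USER_DEFINED
                  | LLAMA_TOKEN_ATTR_LSTRIP
                  | LLAMA_TOKEN_ATTR_RSTRIP;

    if (attr & LLAMA_TOKEN_ATTR_CONTROL) {
        printf("control token\n");                    /* not printed */
    }
    if (attr & (LLAMA_TOKEN_ATTR_LSTRIP | LLAMA_TOKEN_ATTR_RSTRIP)) {
        printf("token strips adjacent whitespace\n"); /* printed */
    }
    return 0;
}
```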
@@ -264,6 +279,8 @@ extern "C" {
         bool check_tensors; // validate model tensor data
     };

+    // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
+    // https://github.com/ggerganov/llama.cpp/pull/7544
     struct llama_context_params {
         uint32_t seed;              // RNG seed, -1 for random
         uint32_t n_ctx;             // text context, 0 = from model
@@ -290,14 +307,14 @@ extern "C" {
         ggml_backend_sched_eval_callback cb_eval;
         void * cb_eval_user_data;

-        enum ggml_type type_k; // data type for K cache
-        enum ggml_type type_v; // data type for V cache
+        enum ggml_type type_k; // data type for K cache [EXPERIMENTAL]
+        enum ggml_type type_v; // data type for V cache [EXPERIMENTAL]

         // Keep the booleans together to avoid misalignment during copy-by-value.
         bool logits_all;  // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
         bool embeddings;  // if true, extract embeddings (together with logits)
         bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
-        bool flash_attn;  // whether to use flash attention
+        bool flash_attn;  // whether to use flash attention [EXPERIMENTAL]

         // Abort callback
         // if it returns true, execution of llama_decode() will be aborted
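The `type_k`/`type_v` and `flash_attn` fields are now flagged [EXPERIMENTAL]. As a hedged illustration, assuming the `llama_context_default_params()` and `llama_new_context_with_model()` API from this same header, opting in looks roughly like:

```c
#include "llama.h"

// sketch: quantize the K cache and enable flash attention, both
// marked [EXPERIMENTAL] in this release; per the NOTE above,
// non-default values may crash or misbehave in some configurations
struct llama_context * make_ctx(struct llama_model * model) {
    struct llama_context_params cparams = llama_context_default_params();

    cparams.type_k     = GGML_TYPE_Q8_0; // K cache data type [EXPERIMENTAL]
    cparams.type_v     = GGML_TYPE_F16;  // V cache data type [EXPERIMENTAL]
    cparams.flash_attn = true;           // [EXPERIMENTAL]

    return llama_new_context_with_model(model, cparams);
}
```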
@@ -421,8 +438,8 @@ extern "C" {

     LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx);

-    LLAMA_API enum llama_vocab_type llama_vocab_type (const struct llama_model
-    LLAMA_API enum llama_rope_type  llama_rope_type  (const struct llama_model
+    LLAMA_API enum llama_vocab_type llama_vocab_type (const struct llama_model * model);
+    LLAMA_API enum llama_rope_type  llama_rope_type  (const struct llama_model * model);

     LLAMA_API int32_t llama_n_vocab    (const struct llama_model * model);
     LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
@@ -818,11 +835,14 @@ extern "C" {

     LLAMA_API float llama_token_get_score(const struct llama_model * model, llama_token token);

-    LLAMA_API enum
+    LLAMA_API enum llama_token_attr llama_token_get_attr(const struct llama_model * model, llama_token token);

     // Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.)
     LLAMA_API bool llama_token_is_eog(const struct llama_model * model, llama_token token);

+    // Identify if Token Id is a control token or a render-able token
+    LLAMA_API bool llama_token_is_control(const struct llama_model * model, llama_token token);
+
     // Special tokens
     LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
     LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
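Together, `llama_token_get_attr` and the new `llama_token_is_control` let callers classify tokens without the old `llama_token_type`. A small sketch (the helper is hypothetical, not part of the gem) of filtering out non-render-able tokens:

```c
#include "llama.h"
#include <stdbool.h>

// sketch: render a token only if it is not a control token and
// is not flagged UNKNOWN by the new attribute accessor
static bool should_render(const struct llama_model * model, llama_token tok) {
    if (llama_token_is_control(model, tok)) {
        return false; // e.g. BOS/EOS and other special tokens
    }
    enum llama_token_attr attr = llama_token_get_attr(model, tok);
    return (attr & LLAMA_TOKEN_ATTR_UNKNOWN) == 0;
}
```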
@@ -1036,49 +1056,9 @@ extern "C" {
                            llama_token   token);

     //
-    //
+    // Model split
     //

-    struct llama_beam_view {
-        const llama_token * tokens;
-
-        size_t n_tokens;
-        float  p;        // Cumulative beam probability (renormalized relative to all beams)
-        bool   eob;      // Callback should set this to true when a beam is at end-of-beam.
-    };
-
-    // Passed to beam_search_callback function.
-    // Whenever 0 < common_prefix_length, this number of tokens should be copied from any of the beams
-    // (e.g. beams[0]) as they will be removed (shifted) from all beams in all subsequent callbacks.
-    // These pointers are valid only during the synchronous callback, so should not be saved.
-    struct llama_beams_state {
-        struct llama_beam_view * beam_views;
-
-        size_t n_beams;               // Number of elements in beam_views[].
-        size_t common_prefix_length;  // Current max length of prefix tokens shared by all beams.
-        bool   last_call;             // True iff this is the last callback invocation.
-    };
-
-    // Type of pointer to the beam_search_callback function.
-    // void* callback_data is any custom data passed to llama_beam_search, that is subsequently
-    // passed back to beam_search_callback. This avoids having to use global variables in the callback.
-    typedef void (*llama_beam_search_callback_fn_t)(void * callback_data, struct llama_beams_state);
-
-    /// @details Deterministically returns entire sentence constructed by a beam search.
-    /// @param ctx Pointer to the llama_context.
-    /// @param callback Invoked for each iteration of the beam_search loop, passing in beams_state.
-    /// @param callback_data A pointer that is simply passed back to callback.
-    /// @param n_beams Number of beams to use.
-    /// @param n_past Number of tokens already evaluated.
-    /// @param n_predict Maximum number of tokens to predict. EOS may occur earlier.
-    LLAMA_API void llama_beam_search(
-                   struct llama_context * ctx,
-        llama_beam_search_callback_fn_t   callback,
-                                  void * callback_data,
-                                  size_t n_beams,
-                                 int32_t n_past,
-                                 int32_t n_predict);
-
     /// @details Build a split GGUF final path for this chunk.
     ///          llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf"
     //  Returns the split_path length.
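A brief usage sketch of the split-path helper documented above; the exact parameter list (buffer, buffer size, path prefix, split index, split count) is assumed from the llama.h of this release:

```c
#include "llama.h"
#include <stdio.h>

int main(void) {
    char split_path[256];

    // chunk 2 of 4, as in the comment above:
    // "/models/ggml-model-q4_0" -> "/models/ggml-model-q4_0-00002-of-00004.gguf"
    int n = llama_split_path(split_path, sizeof(split_path),
                             "/models/ggml-model-q4_0", 2, 4);

    printf("%d: %s\n", n, split_path);
    return 0;
}
```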
metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.15.3
+  version: 0.16.0
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2024-
+date: 2024-06-08 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -45,16 +45,131 @@ files:
 - vendor/tmp/llama.cpp/ggml-common.h
 - vendor/tmp/llama.cpp/ggml-cuda.cu
 - vendor/tmp/llama.cpp/ggml-cuda.h
+- vendor/tmp/llama.cpp/ggml-cuda/acc.cu
+- vendor/tmp/llama.cpp/ggml-cuda/arange.cu
+- vendor/tmp/llama.cpp/ggml-cuda/argsort.cu
+- vendor/tmp/llama.cpp/ggml-cuda/binbcast.cu
+- vendor/tmp/llama.cpp/ggml-cuda/clamp.cu
+- vendor/tmp/llama.cpp/ggml-cuda/concat.cu
+- vendor/tmp/llama.cpp/ggml-cuda/convert.cu
+- vendor/tmp/llama.cpp/ggml-cuda/cpy.cu
+- vendor/tmp/llama.cpp/ggml-cuda/diagmask.cu
+- vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu
+- vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu
+- vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f32.cu
+- vendor/tmp/llama.cpp/ggml-cuda/fattn.cu
+- vendor/tmp/llama.cpp/ggml-cuda/getrows.cu
+- vendor/tmp/llama.cpp/ggml-cuda/im2col.cu
+- vendor/tmp/llama.cpp/ggml-cuda/mmq.cu
+- vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu
+- vendor/tmp/llama.cpp/ggml-cuda/norm.cu
+- vendor/tmp/llama.cpp/ggml-cuda/pad.cu
+- vendor/tmp/llama.cpp/ggml-cuda/pool2d.cu
+- vendor/tmp/llama.cpp/ggml-cuda/quantize.cu
+- vendor/tmp/llama.cpp/ggml-cuda/rope.cu
+- vendor/tmp/llama.cpp/ggml-cuda/scale.cu
+- vendor/tmp/llama.cpp/ggml-cuda/softmax.cu
+- vendor/tmp/llama.cpp/ggml-cuda/sumrows.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu
+- vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu
+- vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu
+- vendor/tmp/llama.cpp/ggml-cuda/unary.cu
+- vendor/tmp/llama.cpp/ggml-cuda/upscale.cu
 - vendor/tmp/llama.cpp/ggml-impl.h
 - vendor/tmp/llama.cpp/ggml-kompute.cpp
 - vendor/tmp/llama.cpp/ggml-kompute.h
 - vendor/tmp/llama.cpp/ggml-metal.h
 - vendor/tmp/llama.cpp/ggml-metal.m
 - vendor/tmp/llama.cpp/ggml-metal.metal
-- vendor/tmp/llama.cpp/ggml-mpi.c
-- vendor/tmp/llama.cpp/ggml-mpi.h
-- vendor/tmp/llama.cpp/ggml-opencl.cpp
-- vendor/tmp/llama.cpp/ggml-opencl.h
 - vendor/tmp/llama.cpp/ggml-quants.c
 - vendor/tmp/llama.cpp/ggml-quants.h
 - vendor/tmp/llama.cpp/ggml-rpc.cpp
data/vendor/tmp/llama.cpp/ggml-mpi.c DELETED

@@ -1,216 +0,0 @@
-#include "ggml-mpi.h"
-
-#include "ggml.h"
-
-#include <mpi.h>
-
-#include <stdio.h>
-#include <stdlib.h>
-
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
-
-#define UNUSED GGML_UNUSED
-
-struct ggml_mpi_context {
-    int rank;
-    int size;
-};
-
-void ggml_mpi_backend_init(void) {
-    MPI_Init(NULL, NULL);
-}
-
-void ggml_mpi_backend_free(void) {
-    MPI_Finalize();
-}
-
-struct ggml_mpi_context * ggml_mpi_init(void) {
-    struct ggml_mpi_context * ctx = calloc(1, sizeof(struct ggml_mpi_context));
-
-    MPI_Comm_rank(MPI_COMM_WORLD, &ctx->rank);
-    MPI_Comm_size(MPI_COMM_WORLD, &ctx->size);
-
-    return ctx;
-}
-
-void ggml_mpi_free(struct ggml_mpi_context * ctx) {
-    free(ctx);
-}
-
-int ggml_mpi_rank(struct ggml_mpi_context * ctx) {
-    return ctx->rank;
-}
-
-void ggml_mpi_eval_init(
-        struct ggml_mpi_context * ctx_mpi,
-                            int * n_tokens,
-                            int * n_past,
-                            int * n_threads) {
-    UNUSED(ctx_mpi);
-
-    // synchronize the worker node parameters with the root node
-    MPI_Barrier(MPI_COMM_WORLD);
-
-    MPI_Bcast(n_tokens, 1, MPI_INT, 0, MPI_COMM_WORLD);
-    MPI_Bcast(n_past, 1, MPI_INT, 0, MPI_COMM_WORLD);
-    MPI_Bcast(n_threads, 1, MPI_INT, 0, MPI_COMM_WORLD);
-}
-
-static int ggml_graph_get_node_idx(struct ggml_cgraph * gf, const char * name) {
-    struct ggml_tensor * t = ggml_graph_get_tensor(gf, name);
-    if (t == NULL) {
-        fprintf(stderr, "%s: tensor %s not found\n", __func__, name);
-        return -1;
-    }
-
-    for (int i = 0; i < gf->n_nodes; i++) {
-        if (gf->nodes[i] == t) {
-            return i;
-        }
-    }
-
-    fprintf(stderr, "%s: tensor %s not found in graph (should not happen)\n", __func__, name);
-    return -1;
-}
-
-static void ggml_mpi_tensor_send(struct ggml_tensor * t, int mpi_rank_dst) {
-    MPI_Datatype mpi_type;
-
-    switch (t->type) {
-        case GGML_TYPE_I32: mpi_type = MPI_INT32_T; break;
-        case GGML_TYPE_F32: mpi_type = MPI_FLOAT; break;
-        default: GGML_ASSERT(false && "not implemented");
-    }
-
-    const int retval = MPI_Send(t->data, ggml_nelements(t), mpi_type, mpi_rank_dst, 0, MPI_COMM_WORLD);
-    GGML_ASSERT(retval == MPI_SUCCESS);
-}
-
-static void ggml_mpi_tensor_recv(struct ggml_tensor * t, int mpi_rank_src) {
-    MPI_Datatype mpi_type;
-
-    switch (t->type) {
-        case GGML_TYPE_I32: mpi_type = MPI_INT32_T; break;
-        case GGML_TYPE_F32: mpi_type = MPI_FLOAT; break;
-        default: GGML_ASSERT(false && "not implemented");
-    }
-
-    MPI_Status status; UNUSED(status);
-
-    const int retval = MPI_Recv(t->data, ggml_nelements(t), mpi_type, mpi_rank_src, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
-    GGML_ASSERT(retval == MPI_SUCCESS);
-}
-
-// TODO: there are many improvements that can be done to this implementation
-void ggml_mpi_graph_compute_pre(
-        struct ggml_mpi_context * ctx_mpi,
-             struct ggml_cgraph * gf,
-                              int n_layers) {
-    const int mpi_rank = ctx_mpi->rank;
-    const int mpi_size = ctx_mpi->size;
-
-    struct ggml_tensor * inp_tokens = ggml_graph_get_tensor(gf, "inp_tokens");
-    if (inp_tokens == NULL) {
-        fprintf(stderr, "%s: tensor 'inp_tokens' not found\n", __func__);
-        return;
-    }
-
-    struct ggml_tensor * inp0 = ggml_graph_get_tensor(gf, "layer_inp_0");
-    if (inp0 == NULL) {
-        fprintf(stderr, "%s: tensor 'inp0' not found\n", __func__);
-        return;
-    }
-
-    GGML_ASSERT(inp0 == gf->nodes[0]);
-
-    // distribute the compute graph into slices across the MPI nodes
-    //
-    // the main node (0) processes the last layers + the remainder of the compute graph
-    // and is responsible to pass the input tokens to the first node (1)
-    //
-    // node 1:   [(  0) * n_per_node, (  1) * n_per_node)
-    // node 2:   [(  1) * n_per_node, (  2) * n_per_node)
-    // ...
-    // node n-1: [(n-2) * n_per_node, (n-1) * n_per_node)
-    // node 0:   [(n-1) * n_per_node,            n_nodes)
-    //
-    if (mpi_rank > 0) {
-        if (mpi_rank == 1) {
-            // the first node (1) receives the input tokens from the main node (0)
-            ggml_mpi_tensor_recv(inp_tokens, 0);
-        } else {
-            // recv input data for each node into the "inp0" tensor (i.e. the first node in the compute graph)
-            ggml_mpi_tensor_recv(inp0, mpi_rank - 1);
-        }
-    } else if (mpi_size > 1) {
-        // node 0 sends the input tokens to node 1
-        ggml_mpi_tensor_send(inp_tokens, 1);
-
-        // recv the output data from the last node
-        ggml_mpi_tensor_recv(inp0, mpi_size - 1);
-    }
-
-    {
-        const int n_per_node = (n_layers + (mpi_size - 1)) / mpi_size;
-
-        const int mpi_idx = mpi_rank > 0 ? mpi_rank - 1 : mpi_size - 1;
-
-        const int il0 =               (mpi_idx + 0) * n_per_node;
-        const int il1 = MIN(n_layers, (mpi_idx + 1) * n_per_node);
-
-        char name_l0[GGML_MAX_NAME];
-        char name_l1[GGML_MAX_NAME];
-
-        snprintf(name_l0, sizeof(name_l0), "layer_inp_%d", il0);
-        snprintf(name_l1, sizeof(name_l1), "layer_inp_%d", il1);
-
-        const int idx_l0 =                ggml_graph_get_node_idx(gf, name_l0);
-        const int idx_l1 = mpi_rank > 0 ? ggml_graph_get_node_idx(gf, name_l1) + 1 : gf->n_nodes;
-
-        if (idx_l0 < 0 || idx_l1 < 0) {
-            fprintf(stderr, "%s: layer input nodes not found\n", __func__);
-            return;
-        }
-
-        // attach the input data to all nodes that need it
-        // TODO: not great - should be able to do this without modifying the compute graph (see next TODO below)
-        for (int i = idx_l0; i < idx_l1; i++) {
-            if (gf->nodes[i]->src[0] == gf->nodes[idx_l0]) {
-                gf->nodes[i]->src[0] = inp0;
-            }
-            if (gf->nodes[i]->src[1] == gf->nodes[idx_l0]) {
-                gf->nodes[i]->src[1] = inp0;
-            }
-        }
-
-        // TODO: instead of rearranging the nodes, we should be able to execute a subset of the compute graph
-        for (int i = 1; i < idx_l1 - idx_l0; i++) {
-            gf->nodes[i] = gf->nodes[idx_l0 + i];
-            gf->grads[i] = gf->grads[idx_l0 + i];
-        }
-
-        // the first node performs the "get_rows" operation, the rest of the nodes get the data from the previous node
-        if (mpi_idx != 0) {
-            gf->nodes[0]->op = GGML_OP_NONE;
-        }
-
-        gf->n_nodes = idx_l1 - idx_l0;
-
-        //fprintf(stderr, "%s: node %d: processing %d nodes [%d, %d)\n", __func__, mpi_rank, gf->n_nodes, il0, il1);
-    }
-}
-
-void ggml_mpi_graph_compute_post(
-        struct ggml_mpi_context * ctx_mpi,
-             struct ggml_cgraph * gf,
-                              int n_layers) {
-    UNUSED(n_layers);
-
-    const int mpi_rank = ctx_mpi->rank;
-    const int mpi_size = ctx_mpi->size;
-
-    // send the output data to the next node
-    if (mpi_rank > 0) {
-        ggml_mpi_tensor_send(gf->nodes[gf->n_nodes - 1], (mpi_rank + 1) % mpi_size);
-    }
-}
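The layer slicing performed by the deleted `ggml_mpi_graph_compute_pre` is easy to check in isolation. A standalone sketch reproducing its arithmetic (ceil division; ranks 1..n-1 take the leading slices, rank 0 the trailing remainder):

```c
#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

int main(void) {
    const int n_layers = 32; // example values, not from the diff
    const int mpi_size = 3;

    // ceil(n_layers / mpi_size), as in the deleted code
    const int n_per_node = (n_layers + (mpi_size - 1)) / mpi_size;

    for (int mpi_rank = 0; mpi_rank < mpi_size; mpi_rank++) {
        // rank 0 wraps around to take the last slice
        const int mpi_idx = mpi_rank > 0 ? mpi_rank - 1 : mpi_size - 1;
        const int il0 =               (mpi_idx + 0) * n_per_node;
        const int il1 = MIN(n_layers, (mpi_idx + 1) * n_per_node);
        printf("rank %d: layers [%2d, %2d)\n", mpi_rank, il0, il1);
    }
    // prints: rank 0: [22, 32), rank 1: [ 0, 11), rank 2: [11, 22)
    return 0;
}
```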
data/vendor/tmp/llama.cpp/ggml-mpi.h DELETED

@@ -1,39 +0,0 @@
-#pragma once
-
-struct ggml_context;
-struct ggml_tensor;
-struct ggml_cgraph;
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct ggml_mpi_context;
-
-void ggml_mpi_backend_init(void);
-void ggml_mpi_backend_free(void);
-
-struct ggml_mpi_context * ggml_mpi_init(void);
-void ggml_mpi_free(struct ggml_mpi_context * ctx);
-
-int ggml_mpi_rank(struct ggml_mpi_context * ctx);
-
-void ggml_mpi_eval_init(
-        struct ggml_mpi_context * ctx_mpi,
-                            int * n_tokens,
-                            int * n_past,
-                            int * n_threads);
-
-void ggml_mpi_graph_compute_pre(
-        struct ggml_mpi_context * ctx_mpi,
-             struct ggml_cgraph * gf,
-                              int n_layers);
-
-void ggml_mpi_graph_compute_post(
-        struct ggml_mpi_context * ctx_mpi,
-             struct ggml_cgraph * gf,
-                              int n_layers);
-
-#ifdef __cplusplus
-}
-#endif