RubyGems - llama_cpp - Versions diffs - 0.6.0 → 0.7.1 - Mend

llama_cpp 0.6.0 → 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +12 -0
data/ext/llama_cpp/extconf.rb +1 -1
data/ext/llama_cpp/llama_cpp.cpp +49 -3
data/ext/llama_cpp/src/ggml-alloc.c +62 -107
data/ext/llama_cpp/src/ggml-alloc.h +11 -5
data/ext/llama_cpp/src/ggml-backend.c +385 -0
data/ext/llama_cpp/src/ggml-backend.h +143 -0
data/ext/llama_cpp/src/ggml-cuda.cu +622 -150
data/ext/llama_cpp/src/ggml-cuda.h +4 -0
data/ext/llama_cpp/src/ggml-metal.h +18 -1
data/ext/llama_cpp/src/ggml-metal.m +358 -131
data/ext/llama_cpp/src/ggml-metal.metal +137 -47
data/ext/llama_cpp/src/ggml-opencl.cpp +136 -68
data/ext/llama_cpp/src/ggml.c +812 -365
data/ext/llama_cpp/src/ggml.h +25 -7
data/ext/llama_cpp/src/k_quants.c +744 -2
data/ext/llama_cpp/src/k_quants.h +5 -5
data/ext/llama_cpp/src/llama.cpp +2387 -421
data/ext/llama_cpp/src/llama.h +22 -6
data/ext/llama_cpp/src/unicode.h +462 -0
data/lib/llama_cpp/version.rb +2 -2
data/lib/llama_cpp.rb +1 -1
data/sig/llama_cpp.rbs +5 -0
metadata +5 -2

data/ext/llama_cpp/src/k_quants.h CHANGED Viewed

@@ -29,7 +29,7 @@
 // 2-bit quantization
 // weight is represented as x = a * q + b
-// 16 blocks of 16 elemenets each
+// 16 blocks of 16 elements each
 // Effectively 2.5625 bits per weight
 typedef struct {
     uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
@@ -41,7 +41,7 @@ static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "w
 // 3-bit quantization
 // weight is represented as x = a * q
-// 16 blocks of 16 elemenets each
+// 16 blocks of 16 elements each
 // Effectively 3.4375 bits per weight
 #ifdef GGML_QKK_64
 typedef struct {
@@ -62,7 +62,7 @@ static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 +
 #endif
 // 4-bit quantization
-// 16 blocks of 32 elements each
+// 8 blocks of 32 elements each
 // weight is represented as x = a * q + b
 // Effectively 4.5 bits per weight
 #ifdef GGML_QKK_64
@@ -83,7 +83,7 @@ static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/
 #endif
 // 5-bit quantization
-// 16 blocks of 32 elements each
+// 8 blocks of 32 elements each
 // weight is represented as x = a * q + b
 // Effectively 5.5 bits per weight
 #ifdef GGML_QKK_64
@@ -107,7 +107,7 @@ static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/
 // 6-bit quantization
 // weight is represented as x = a * q
-// 16 blocks of 16 elemenets each
+// 16 blocks of 16 elements each
 // Effectively 6.5625 bits per weight
 typedef struct {
     uint8_t ql[QK_K/2];      // quants, lower 4 bits