llama_cpp 0.6.0 → 0.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +49 -3
- data/ext/llama_cpp/src/ggml-alloc.c +62 -107
- data/ext/llama_cpp/src/ggml-alloc.h +11 -5
- data/ext/llama_cpp/src/ggml-backend.c +385 -0
- data/ext/llama_cpp/src/ggml-backend.h +143 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +622 -150
- data/ext/llama_cpp/src/ggml-cuda.h +4 -0
- data/ext/llama_cpp/src/ggml-metal.h +18 -1
- data/ext/llama_cpp/src/ggml-metal.m +358 -131
- data/ext/llama_cpp/src/ggml-metal.metal +137 -47
- data/ext/llama_cpp/src/ggml-opencl.cpp +136 -68
- data/ext/llama_cpp/src/ggml.c +812 -365
- data/ext/llama_cpp/src/ggml.h +25 -7
- data/ext/llama_cpp/src/k_quants.c +744 -2
- data/ext/llama_cpp/src/k_quants.h +5 -5
- data/ext/llama_cpp/src/llama.cpp +2387 -421
- data/ext/llama_cpp/src/llama.h +22 -6
- data/ext/llama_cpp/src/unicode.h +462 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +5 -0
- metadata +5 -2
@@ -29,7 +29,7 @@
|
|
29
29
|
|
30
30
|
// 2-bit quantization
|
31
31
|
// weight is represented as x = a * q + b
|
32
|
-
// 16 blocks of 16
|
32
|
+
// 16 blocks of 16 elements each
|
33
33
|
// Effectively 2.625 bits per weight
|
34
34
|
typedef struct {
|
35
35
|
uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
|
@@ -41,7 +41,7 @@ static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "w
|
|
41
41
|
|
42
42
|
// 3-bit quantization
|
43
43
|
// weight is represented as x = a * q
|
44
|
-
// 16 blocks of 16
|
44
|
+
// 16 blocks of 16 elements each
|
45
45
|
// Effectively 3.4375 bits per weight
|
46
46
|
#ifdef GGML_QKK_64
|
47
47
|
typedef struct {
|
@@ -62,7 +62,7 @@ static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 +
|
|
62
62
|
#endif
|
63
63
|
|
64
64
|
// 4-bit quantization
|
65
|
-
//
|
65
|
+
// 8 blocks of 32 elements each
|
66
66
|
// weight is represented as x = a * q + b
|
67
67
|
// Effectively 4.5 bits per weight
|
68
68
|
#ifdef GGML_QKK_64
|
@@ -83,7 +83,7 @@ static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/
|
|
83
83
|
#endif
|
84
84
|
|
85
85
|
// 5-bit quantization
|
86
|
-
//
|
86
|
+
// 8 blocks of 32 elements each
|
87
87
|
// weight is represented as x = a * q + b
|
88
88
|
// Effectively 5.5 bits per weight
|
89
89
|
#ifdef GGML_QKK_64
|
@@ -107,7 +107,7 @@ static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/
|
|
107
107
|
|
108
108
|
// 6-bit quantization
|
109
109
|
// weight is represented as x = a * q
|
110
|
-
// 16 blocks of 16
|
110
|
+
// 16 blocks of 16 elements each
|
111
111
|
// Effectively 6.5625 bits per weight
|
112
112
|
typedef struct {
|
113
113
|
uint8_t ql[QK_K/2]; // quants, lower 4 bits
|