llama_cpp 0.7.0 → 0.8.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +41 -21
- data/ext/llama_cpp/src/ggml-alloc.c +62 -107
- data/ext/llama_cpp/src/ggml-alloc.h +11 -5
- data/ext/llama_cpp/src/ggml-backend.c +385 -0
- data/ext/llama_cpp/src/ggml-backend.h +143 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +500 -78
- data/ext/llama_cpp/src/ggml-cuda.h +4 -0
- data/ext/llama_cpp/src/ggml-metal.h +18 -1
- data/ext/llama_cpp/src/ggml-metal.m +396 -127
- data/ext/llama_cpp/src/ggml-metal.metal +290 -46
- data/ext/llama_cpp/src/ggml-opencl.cpp +47 -71
- data/ext/llama_cpp/src/ggml.c +71 -55
- data/ext/llama_cpp/src/ggml.h +15 -9
- data/ext/llama_cpp/src/k_quants.c +12 -20
- data/ext/llama_cpp/src/k_quants.h +5 -5
- data/ext/llama_cpp/src/llama.cpp +1851 -250
- data/ext/llama_cpp/src/llama.h +18 -12
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -4
- metadata +5 -3
@@ -29,7 +29,7 @@
|
|
29
29
|
|
30
30
|
// 2-bit quantization
|
31
31
|
// weight is represented as x = a * q + b
|
32
|
-
// 16 blocks of 16
|
32
|
+
// 16 blocks of 16 elements each
|
33
33
|
// Effectively 2.625 bits per weight
|
34
34
|
typedef struct {
|
35
35
|
uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
|
@@ -41,7 +41,7 @@ static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "w
|
|
41
41
|
|
42
42
|
// 3-bit quantization
|
43
43
|
// weight is represented as x = a * q
|
44
|
-
// 16 blocks of 16
|
44
|
+
// 16 blocks of 16 elements each
|
45
45
|
// Effectively 3.4375 bits per weight
|
46
46
|
#ifdef GGML_QKK_64
|
47
47
|
typedef struct {
|
@@ -62,7 +62,7 @@ static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 +
|
|
62
62
|
#endif
|
63
63
|
|
64
64
|
// 4-bit quantization
|
65
|
-
//
|
65
|
+
// 8 blocks of 32 elements each
|
66
66
|
// weight is represented as x = a * q + b
|
67
67
|
// Effectively 4.5 bits per weight
|
68
68
|
#ifdef GGML_QKK_64
|
@@ -83,7 +83,7 @@ static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/
|
|
83
83
|
#endif
|
84
84
|
|
85
85
|
// 5-bit quantization
|
86
|
-
//
|
86
|
+
// 8 blocks of 32 elements each
|
87
87
|
// weight is represented as x = a * q + b
|
88
88
|
// Effectively 5.5 bits per weight
|
89
89
|
#ifdef GGML_QKK_64
|
@@ -107,7 +107,7 @@ static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/
|
|
107
107
|
|
108
108
|
// 6-bit quantization
|
109
109
|
// weight is represented as x = a * q
|
110
|
-
// 16 blocks of 16
|
110
|
+
// 16 blocks of 16 elements each
|
111
111
|
// Effectively 6.5625 bits per weight
|
112
112
|
typedef struct {
|
113
113
|
uint8_t ql[QK_K/2]; // quants, lower 4 bits
|