llama_cpp 0.15.2 → 0.15.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/ext/llama_cpp/llama_cpp.cpp +61 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +6 -0
- data/vendor/tmp/llama.cpp/Makefile +8 -16
- data/vendor/tmp/llama.cpp/ggml-common.h +0 -54
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +99 -40
- data/vendor/tmp/llama.cpp/ggml-cuda.h +1 -0
- data/vendor/tmp/llama.cpp/ggml-impl.h +44 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +4 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +133 -81
- data/vendor/tmp/llama.cpp/ggml-metal.metal +91 -434
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +4 -1
- data/vendor/tmp/llama.cpp/ggml-quants.c +1962 -2443
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +248 -108
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +375 -657
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +9351 -5627
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +204 -225
- data/vendor/tmp/llama.cpp/ggml.c +498 -836
- data/vendor/tmp/llama.cpp/ggml.h +57 -30
- data/vendor/tmp/llama.cpp/llama.cpp +1477 -859
- data/vendor/tmp/llama.cpp/llama.h +21 -8
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 167132898a0cb63faaf4fd7583d9b988992ba7c5ec0f5602d5a158f04e0cdfa0
+  data.tar.gz: 8a65658eb93b9cf80d5ede554b15968c495f045c32e57cc96ed732c56330d25f
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 9625ac088c4d5c50cc51bbbcbc744cb7041766ccbb7a42a9cd1b80b29ebe64414d39875dea5d61a87025e239ad78be2a2ea4d3f85a187684321e409fc01a40fd
+  data.tar.gz: 6f68445f10765a4eb1124ed1cfd2afb7544d146823efad27b2b6955bb0ee822ae8b0f9cccb68777c8cb211f665a0e2531eba04a4240399af1101a5dbcd645ae9
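These digests cover the metadata.gz and data.tar.gz members inside the .gem archive (itself a tar file), not the .gem file as a whole. A minimal Ruby sketch for checking the published SHA256 of data.tar.gz against a downloaded gem; the gem filename is a placeholder:

require 'digest'
require 'rubygems/package'

EXPECTED = '8a65658eb93b9cf80d5ede554b15968c495f045c32e57cc96ed732c56330d25f'

# Walk the outer tar archive and hash its data.tar.gz member.
File.open('llama_cpp-0.15.4.gem', 'rb') do |io|
  Gem::Package::TarReader.new(io) do |tar|
    tar.each do |entry|
      next unless entry.full_name == 'data.tar.gz'
      actual = Digest::SHA256.hexdigest(entry.read)
      puts(actual == EXPECTED ? 'data.tar.gz: OK' : 'data.tar.gz: MISMATCH')
    end
  end
end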
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,17 @@
+## [[0.15.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.15.3...v0.15.4)] - 2024-06-01
+
+- Bump llama.cpp from b2988 to b3056.
+- Add LLAMA_VOCAB_PRE_TYPE_SMAUG constant.
+- Add `token_is_control?` method to `Model`.
+
+## [[0.15.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.15.2...v0.15.3)] - 2024-05-25
+
+- Bump llama.cpp from b2917 to b2988.
+- Add constants for pre-tokenization types.
+- Add `n_threads` method to `Context`.
+- Add `n_threads_batch` method to `Context`.
+- Add `set_n_threads` method to `Context`.
+
 ## [[0.15.2](https://github.com/yoshoku/llama_cpp.rb/compare/v0.15.1...v0.15.2)] - 2024-05-18
 
 - Bump llama.cpp from b2839 to b2917.
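Taken together, the 0.15.3 and 0.15.4 entries add a small amount of new API surface. A hedged usage sketch, following the gem's documented Model/Context construction; the model path and thread counts are placeholders:

require 'llama_cpp'

model = LLaMACpp::Model.new(model_path: '/path/to/model.gguf',
                            params: LLaMACpp::ModelParams.new)
context = LLaMACpp::Context.new(model: model, params: LLaMACpp::ContextParams.new)

# 0.15.3: read and adjust the thread counts used for single-token and batch decoding.
context.set_n_threads(n_threads: 4, n_threads_batch: 8)
context.n_threads        # => 4
context.n_threads_batch  # => 8

# 0.15.4: ask the model whether a token id is a control token.
model.token_is_control?(model.token_bos)  # => true or false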
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -1536,6 +1536,7 @@ public:
     rb_define_method(rb_cLLaMAModel, "token_suffix", RUBY_METHOD_FUNC(_llama_model_token_suffix), 0);
     rb_define_method(rb_cLLaMAModel, "token_eot", RUBY_METHOD_FUNC(_llama_model_token_eot), 0);
     rb_define_method(rb_cLLaMAModel, "token_is_eog?", RUBY_METHOD_FUNC(_llama_model_token_is_eog), 1);
+    rb_define_method(rb_cLLaMAModel, "token_is_control?", RUBY_METHOD_FUNC(_llama_model_token_is_control), 1);
   }
 
 private:
@@ -1848,6 +1849,16 @@ private:
     LLaMAModelWrapper* ptr = get_llama_model(self);
     return llama_token_is_eog(ptr->model, token) ? Qtrue : Qfalse;
   }
+
+  static VALUE _llama_model_token_is_control(VALUE self, VALUE token_) {
+    if (!RB_INTEGER_TYPE_P(token_)) {
+      rb_raise(rb_eArgError, "token must be an integer");
+      return Qnil;
+    }
+    const llama_token token = NUM2INT(token_);
+    LLaMAModelWrapper* ptr = get_llama_model(self);
+    return llama_token_is_control(ptr->model, token) ? Qtrue : Qfalse;
+  }
 };
 
 const rb_data_type_t RbLLaMAModel::llama_model_type = {
@@ -2122,10 +2133,13 @@ public:
     rb_define_method(rb_cLLaMAContext, "embeddings", RUBY_METHOD_FUNC(_llama_context_embeddings), 0);
     rb_define_method(rb_cLLaMAContext, "embeddings_ith", RUBY_METHOD_FUNC(_llama_context_embeddings_ith), 1);
     rb_define_method(rb_cLLaMAContext, "embeddings_seq", RUBY_METHOD_FUNC(_llama_context_embeddings_seq), 1);
+    rb_define_method(rb_cLLaMAContext, "set_n_threads", RUBY_METHOD_FUNC(_llama_context_set_n_threads), -1);
     rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
     rb_define_method(rb_cLLaMAContext, "n_batch", RUBY_METHOD_FUNC(_llama_context_n_batch), 0);
     rb_define_method(rb_cLLaMAContext, "n_ubatch", RUBY_METHOD_FUNC(_llama_context_n_ubatch), 0);
     rb_define_method(rb_cLLaMAContext, "n_seq_max", RUBY_METHOD_FUNC(_llama_context_n_seq_max), 0);
+    rb_define_method(rb_cLLaMAContext, "n_threads", RUBY_METHOD_FUNC(_llama_context_n_threads), 0);
+    rb_define_method(rb_cLLaMAContext, "n_threads_batch", RUBY_METHOD_FUNC(_llama_context_n_threads_batch), 0);
     rb_define_method(rb_cLLaMAContext, "timings", RUBY_METHOD_FUNC(_llama_context_get_timings), 0);
     rb_define_method(rb_cLLaMAContext, "print_timings", RUBY_METHOD_FUNC(_llama_context_print_timings), 0);
     rb_define_method(rb_cLLaMAContext, "reset_timings", RUBY_METHOD_FUNC(_llama_context_reset_timings), 0);
@@ -2343,6 +2357,33 @@ private:
     return output;
   }
 
+  static VALUE _llama_context_set_n_threads(int argc, VALUE* argv, VALUE self) {
+    VALUE kw_args = Qnil;
+    ID kw_table[2] = { rb_intern("n_threads"), rb_intern("n_threads_batch") };
+    VALUE kw_values[2] = { Qundef, Qundef };
+    rb_scan_args(argc, argv, ":", &kw_args);
+    rb_get_kwargs(kw_args, kw_table, 2, 0, kw_values);
+
+    VALUE n_threads = kw_values[0];
+    if (!RB_INTEGER_TYPE_P(n_threads)) {
+      rb_raise(rb_eArgError, "n_threads must be an integer");
+      return Qnil;
+    }
+    VALUE n_threads_batch = kw_values[1];
+    if (!RB_INTEGER_TYPE_P(n_threads_batch)) {
+      rb_raise(rb_eArgError, "n_threads_batch must be an integer");
+      return Qnil;
+    }
+
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eArgError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+    llama_set_n_threads(ptr->ctx, NUM2UINT(n_threads), NUM2UINT(n_threads_batch));
+    return Qnil;
+  }
+
   static VALUE _llama_context_n_ctx(VALUE self) {
     LLaMAContextWrapper* ptr = get_llama_context(self);
     if (ptr->ctx == NULL) {
@@ -2379,6 +2420,24 @@ private:
     return UINT2NUM(llama_n_seq_max(ptr->ctx));
   }
 
+  static VALUE _llama_context_n_threads(VALUE self) {
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+    return UINT2NUM(llama_n_threads(ptr->ctx));
+  }
+
+  static VALUE _llama_context_n_threads_batch(VALUE self) {
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+    return UINT2NUM(llama_n_threads_batch(ptr->ctx));
+  }
+
   static VALUE _llama_context_get_timings(VALUE self) {
     LLaMAContextWrapper* ptr = get_llama_context(self);
     if (ptr->ctx == NULL) {
@@ -3430,9 +3489,11 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_GPT2", INT2NUM(LLAMA_VOCAB_PRE_TYPE_GPT2));
   rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_REFACT", INT2NUM(LLAMA_VOCAB_PRE_TYPE_REFACT));
   rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_COMMAND_R", INT2NUM(LLAMA_VOCAB_PRE_TYPE_COMMAND_R));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_STABLELM2", INT2NUM(LLAMA_VOCAB_PRE_TYPE_STABLELM2));
   rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_QWEN2", INT2NUM(LLAMA_VOCAB_PRE_TYPE_QWEN2));
   rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_OLMO", INT2NUM(LLAMA_VOCAB_PRE_TYPE_OLMO));
   rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_DBRX", INT2NUM(LLAMA_VOCAB_PRE_TYPE_DBRX));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_SMAUG", INT2NUM(LLAMA_VOCAB_PRE_TYPE_SMAUG));
 
   rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_UNDEFINED", INT2NUM(LLAMA_TOKEN_TYPE_UNDEFINED));
   rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_NORMAL", INT2NUM(LLAMA_TOKEN_TYPE_NORMAL));
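Both new bindings validate their arguments before delegating to llama.cpp (llama_token_is_control, llama_set_n_threads). A minimal sketch of scanning a vocabulary with the new token_is_control? method; the model path is a placeholder:

require 'llama_cpp'

model = LLaMACpp::Model.new(model_path: '/path/to/model.gguf',
                            params: LLaMACpp::ModelParams.new)

# Collect every id the vocabulary marks as a control token.
control_ids = (0...model.n_vocab).select { |id| model.token_is_control?(id) }
puts control_ids.size

# Non-integer arguments raise ArgumentError ("token must be an integer"):
# model.token_is_control?('bos')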
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.15.2'
+  VERSION = '0.15.4'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'b2917'
+  LLAMA_CPP_VERSION = 'b3056'
 end
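After upgrading, the two constants report the new gem and bundled llama.cpp versions:

require 'llama_cpp'

puts LLaMACpp::VERSION            # => "0.15.4"
puts LLaMACpp::LLAMA_CPP_VERSION  # => "b3056"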
data/sig/llama_cpp.rbs
CHANGED
@@ -26,9 +26,11 @@ module LLaMACpp
   LLAMA_VOCAB_PRE_TYPE_GPT2: Integer
   LLAMA_VOCAB_PRE_TYPE_REFACT: Integer
   LLAMA_VOCAB_PRE_TYPE_COMMAND_R: Integer
+  LLAMA_VOCAB_PRE_TYPE_STABLELM2: Integer
   LLAMA_VOCAB_PRE_TYPE_QWEN2: Integer
   LLAMA_VOCAB_PRE_TYPE_OLMO: Integer
   LLAMA_VOCAB_PRE_TYPE_DBRX: Integer
+  LLAMA_VOCAB_PRE_TYPE_SMAUG: Integer
 
   LLAMA_FTYPE_ALL_F32: Integer
   LLAMA_FTYPE_MOSTLY_F16: Integer
@@ -158,6 +160,7 @@ module LLaMACpp
     def token_suffix: () -> Integer
     def token_eot: () -> Integer
     def token_is_eog?: (Integer) -> bool
+    def token_is_control?: (Integer) -> bool
   end
 
   class Timings
@@ -241,10 +244,13 @@ module LLaMACpp
     def embeddings_seq: (Integer) -> Array[Float]
     def decode: (::LLaMACpp::Batch) -> void
     def logits: () -> Array[Float]
+    def set_n_threads: (n_threads: Integer, n_threads_batch: Integer) -> void
     def n_ctx: () -> Integer
     def n_batch: () -> Integer
     def n_ubatch: () -> Integer
     def n_seq_max: () -> Integer
+    def n_threads: () -> Integer
+    def n_threads_batch: () -> Integer
     def timings: () -> ::LLaMACpp::Timings
     def print_timings: () -> void
     def reset_timings: () -> void
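Per these signatures, set_n_threads takes two required keyword arguments and returns nothing, while the two readers return integers, so RBS-checked code like the following sketch should type-check (the context variable is assumed to be a LLaMACpp::Context):

# set_n_threads: (n_threads: Integer, n_threads_batch: Integer) -> void
context.set_n_threads(n_threads: 8, n_threads_batch: 12)

# n_threads: () -> Integer / n_threads_batch: () -> Integer
total = context.n_threads + context.n_threads_batch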
data/vendor/tmp/llama.cpp/Makefile
CHANGED
@@ -381,15 +381,16 @@ ifneq ($(filter ppc64le%,$(UNAME_M)),)
     CUDA_POWER_ARCH = 1
 endif
 
+ifneq ($(filter loongarch64%,$(UNAME_M)),)
+    MK_CFLAGS += -mlasx
+    MK_CXXFLAGS += -mlasx
+endif
+
 else
 MK_CFLAGS += -march=rv64gcv -mabi=lp64d
 MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
 endif
 
-ifdef LLAMA_QKK_64
-    MK_CPPFLAGS += -DGGML_QKK_64
-endif
-
 ifndef LLAMA_NO_ACCELERATE
     # Mac OS - include Accelerate framework.
     # `-framework Accelerate` works both with Apple Silicon and Mac Intel
@@ -401,13 +402,6 @@ ifndef LLAMA_NO_ACCELERATE
     endif
 endif # LLAMA_NO_ACCELERATE
 
-ifdef LLAMA_MPI
-    MK_CPPFLAGS += -DGGML_USE_MPI
-    MK_CFLAGS += -Wno-cast-qual
-    MK_CXXFLAGS += -Wno-cast-qual
-    OBJS += ggml-mpi.o
-endif # LLAMA_MPI
-
 ifdef LLAMA_OPENBLAS
     MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas)
     MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas)
@@ -449,6 +443,9 @@ endif # JETSON_EOL_MODULE_DETECT
 ifdef LLAMA_DEBUG
     MK_NVCCFLAGS += -lineinfo
 endif # LLAMA_DEBUG
+ifdef LLAMA_CUDA_DEBUG
+    MK_NVCCFLAGS += --device-debug
+endif # LLAMA_CUDA_DEBUG
 ifdef LLAMA_CUDA_NVCC
     NVCC = $(CCACHE) $(LLAMA_CUDA_NVCC)
 else
@@ -631,11 +628,6 @@ ggml-metal-embed.o: ggml-metal.metal ggml-common.h
 endif
 endif # LLAMA_METAL
 
-ifdef LLAMA_MPI
-ggml-mpi.o: ggml-mpi.c ggml-mpi.h
-	$(CC) $(CFLAGS) -c $< -o $@
-endif # LLAMA_MPI
-
 ifndef LLAMA_NO_LLAMAFILE
 sgemm.o: sgemm.cpp sgemm.h ggml.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
data/vendor/tmp/llama.cpp/ggml-common.h
CHANGED
@@ -65,13 +65,8 @@ typedef sycl::half2 ggml_half2;
 // QK = number of values after dequantization
 // QK_K = super-block size
 
-#ifdef GGML_QKK_64
-#define QK_K 64
-#define K_SCALE_SIZE 4
-#else
 #define QK_K 256
 #define K_SCALE_SIZE 12
-#endif // GGML_QKK_64
 
 #if defined(GGML_COMMON_DECL_CUDA) || defined(GGML_COMMON_DECL_HIP) || defined(GGML_COMMON_DECL_SYCL)
 // QR = QK / number of values before dequantization
@@ -131,13 +126,8 @@ typedef sycl::half2 ggml_half2;
 #define QI4_NL (QK4_NL / (4*QR4_NL))
 #define QR4_NL 2
 
-#if QK_K == 64
-#define QI4_XS QI4_NL
-#define QR4_XS QR4_NL
-#else
 #define QI4_XS (QK_K / (4*QR4_XS))
 #define QR4_XS 8
-#endif
 
 #endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP
 
@@ -228,15 +218,6 @@ static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_half) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
 // weight is represented as x = a * q
 // 16 blocks of 16 elements each
 // Effectively 3.4375 bits per weight
-#ifdef GGML_QKK_64
-typedef struct {
-    uint8_t hmask[QK_K/8]; // quants - high bit
-    uint8_t qs[QK_K/4];    // quants - low 2 bits
-    uint8_t scales[2];
-    ggml_half d;           // super-block scale
-} block_q3_K;
-static_assert(sizeof(block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding");
-#else
 typedef struct {
     uint8_t hmask[QK_K/8]; // quants - high bit
     uint8_t qs[QK_K/4];    // quants - low 2 bits
@@ -244,20 +225,11 @@ typedef struct {
     ggml_half d;           // super-block scale
 } block_q3_K;
 static_assert(sizeof(block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding");
-#endif
 
 // 4-bit quantization
 // 8 blocks of 32 elements each
 // weight is represented as x = a * q + b
 // Effectively 4.5 bits per weight
-#ifdef GGML_QKK_64
-typedef struct {
-    ggml_half d[2];     // super-block scales/mins
-    uint8_t scales[2];  // 4-bit block scales/mins
-    uint8_t qs[QK_K/2]; // 4-bit quants
-} block_q4_K;
-static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + QK_K/2 + 2, "wrong q4_K block size/padding");
-#else
 typedef struct {
     union {
         struct {
@@ -270,21 +242,11 @@ typedef struct {
     uint8_t qs[QK_K/2]; // 4-bit quants
 } block_q4_K;
 static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");
-#endif
 
 // 5-bit quantization
 // 8 blocks of 32 elements each
 // weight is represented as x = a * q + b
 // Effectively 5.5 bits per weight
-#ifdef GGML_QKK_64
-typedef struct {
-    ggml_half d;             // super-block scale
-    int8_t scales[QK_K/16];  // 8-bit block scales
-    uint8_t qh[QK_K/8];      // quants, high bit
-    uint8_t qs[QK_K/2];      // quants, low 4 bits
-} block_q5_K;
-static_assert(sizeof(block_q5_K) == sizeof(ggml_half) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
-#else
 typedef struct {
     union {
         struct {
@@ -298,7 +260,6 @@ typedef struct {
     uint8_t qs[QK_K/2]; // quants, low 4 bits
 } block_q5_K;
 static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
-#endif
 
 // 6-bit quantization
 // weight is represented as x = a * q
@@ -356,11 +317,7 @@ typedef struct {
 static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_half) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
 
 // 3.4375 bpw
-#if QK_K == 64
-#define IQ3S_N_SCALE 2
-#else
 #define IQ3S_N_SCALE QK_K/64
-#endif
 typedef struct {
     ggml_half d;
     uint8_t qs[QK_K/4];
@@ -381,16 +338,9 @@ static_assert(sizeof(block_iq1_s) == sizeof(ggml_half) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding");
 typedef struct {
     uint8_t qs[QK_K/8];  // grid index, low 8 bits
     uint8_t qh[QK_K/16]; // grid index, high 3 bits + grid shift bit (for two groups of 8)
-#if QK_K == 64
-    ggml_half d;
-#endif
     uint8_t scales[QK_K/32]; // 3-bit block scales (4-bit if QK_K == 64)
 } block_iq1_m;
-#if QK_K == 64
-static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32 + sizeof(ggml_half), "wrong iq1_m block size/padding");
-#else
 static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32, "wrong iq1_m block size/padding");
-#endif
 
 // Used by IQ1_M quants
 typedef union {
@@ -406,9 +356,6 @@ typedef struct {
 } block_iq4_nl;
 static_assert(sizeof(block_iq4_nl) == sizeof(ggml_half) + QK4_NL/2, "wrong iq4_nl block size/padding");
 
-#if QK_K == 64
-#define block_iq4_xs block_iq4_nl
-#else
 typedef struct {
     ggml_half d;
     uint16_t scales_h;
@@ -416,7 +363,6 @@ typedef struct {
     uint8_t qs[QK_K/2];
 } block_iq4_xs;
 static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
-#endif
 
 #endif // GGML_COMMON_DECL
 #endif // GGML_COMMON_DECL
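The "bits per weight" figures in the comments above follow directly from the struct sizes pinned down by the static_asserts. A quick Ruby check of that arithmetic, assuming QK_K = 256, K_SCALE_SIZE = 12, and a 2-byte ggml_half:

qk_k = 256
k_scale_size = 12
half = 2 # sizeof(ggml_half)

# Bytes per super-block of 256 weights, taken from the static_asserts.
sizes = {
  'q3_K'   => half + qk_k / 4 + qk_k / 8 + 12,               # 110 bytes
  'q4_K'   => 2 * half + k_scale_size + qk_k / 2,            # 144 bytes
  'q5_K'   => 2 * half + k_scale_size + qk_k / 2 + qk_k / 8, # 176 bytes
  'iq1_m'  => qk_k / 8 + qk_k / 16 + qk_k / 32,              # 56 bytes
  'iq4_xs' => half + 2 + qk_k / 64 + qk_k / 2                # 136 bytes (2 = sizeof(uint16_t))
}
sizes.each { |name, bytes| puts format('%-6s %.4f bpw', name, bytes * 8.0 / qk_k) }
# q3_K 3.4375, q4_K 4.5, q5_K 5.5 match the "Effectively ... bits per weight"
# comments; iq1_m works out to 1.75 bpw and iq4_xs to 4.25 bpw.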