llama_cpp 0.15.2 → 0.15.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/ext/llama_cpp/llama_cpp.cpp +61 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +6 -0
- data/vendor/tmp/llama.cpp/Makefile +8 -16
- data/vendor/tmp/llama.cpp/ggml-common.h +0 -54
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +99 -40
- data/vendor/tmp/llama.cpp/ggml-cuda.h +1 -0
- data/vendor/tmp/llama.cpp/ggml-impl.h +44 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +4 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +133 -81
- data/vendor/tmp/llama.cpp/ggml-metal.metal +91 -434
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +4 -1
- data/vendor/tmp/llama.cpp/ggml-quants.c +1962 -2443
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +248 -108
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +375 -657
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +9351 -5627
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +204 -225
- data/vendor/tmp/llama.cpp/ggml.c +498 -836
- data/vendor/tmp/llama.cpp/ggml.h +57 -30
- data/vendor/tmp/llama.cpp/llama.cpp +1477 -859
- data/vendor/tmp/llama.cpp/llama.h +21 -8
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 167132898a0cb63faaf4fd7583d9b988992ba7c5ec0f5602d5a158f04e0cdfa0
|
4
|
+
data.tar.gz: 8a65658eb93b9cf80d5ede554b15968c495f045c32e57cc96ed732c56330d25f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9625ac088c4d5c50cc51bbbcbc744cb7041766ccbb7a42a9cd1b80b29ebe64414d39875dea5d61a87025e239ad78be2a2ea4d3f85a187684321e409fc01a40fd
|
7
|
+
data.tar.gz: 6f68445f10765a4eb1124ed1cfd2afb7544d146823efad27b2b6955bb0ee822ae8b0f9cccb68777c8cb211f665a0e2531eba04a4240399af1101a5dbcd645ae9
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,17 @@
|
|
1
|
+
## [[0.15.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.15.3...v0.15.4)] - 2024-06-01
|
2
|
+
|
3
|
+
- Bump llama.cpp from b2988 to b3056.
|
4
|
+
- Add LLAMA_VOCAB_PRE_TYPE_SMAUG constant.
|
5
|
+
- Add `token_is_control?` method to `Model`.
|
6
|
+
|
7
|
+
## [[0.15.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.15.2...v0.15.3)] - 2024-05-25
|
8
|
+
|
9
|
+
- Bump llama.cpp from b2917 to b2988.
|
10
|
+
- Add constants for pre-tokenization types.
|
11
|
+
- Add `n_threads` method to `Context`.
|
12
|
+
- Add `n_threads_batch` method to `Context`.
|
13
|
+
- Add `set_n_threads` method to `Context`.
|
14
|
+
|
1
15
|
## [[0.15.2](https://github.com/yoshoku/llama_cpp.rb/compare/v0.15.1...v0.15.2)] - 2024-05-18
|
2
16
|
|
3
17
|
- Bump llama.cpp from b2839 to b2917.
|
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -1536,6 +1536,7 @@ public:
|
|
1536
1536
|
rb_define_method(rb_cLLaMAModel, "token_suffix", RUBY_METHOD_FUNC(_llama_model_token_suffix), 0);
|
1537
1537
|
rb_define_method(rb_cLLaMAModel, "token_eot", RUBY_METHOD_FUNC(_llama_model_token_eot), 0);
|
1538
1538
|
rb_define_method(rb_cLLaMAModel, "token_is_eog?", RUBY_METHOD_FUNC(_llama_model_token_is_eog), 1);
|
1539
|
+
rb_define_method(rb_cLLaMAModel, "token_is_control?", RUBY_METHOD_FUNC(_llama_model_token_is_control), 1);
|
1539
1540
|
}
|
1540
1541
|
|
1541
1542
|
private:
|
@@ -1848,6 +1849,16 @@ private:
|
|
1848
1849
|
LLaMAModelWrapper* ptr = get_llama_model(self);
|
1849
1850
|
return llama_token_is_eog(ptr->model, token) ? Qtrue : Qfalse;
|
1850
1851
|
}
|
1852
|
+
|
1853
|
+
static VALUE _llama_model_token_is_control(VALUE self, VALUE token_) {
|
1854
|
+
if (!RB_INTEGER_TYPE_P(token_)) {
|
1855
|
+
rb_raise(rb_eArgError, "token must be an integer");
|
1856
|
+
return Qnil;
|
1857
|
+
}
|
1858
|
+
const llama_token token = NUM2INT(token_);
|
1859
|
+
LLaMAModelWrapper* ptr = get_llama_model(self);
|
1860
|
+
return llama_token_is_control(ptr->model, token) ? Qtrue : Qfalse;
|
1861
|
+
}
|
1851
1862
|
};
|
1852
1863
|
|
1853
1864
|
const rb_data_type_t RbLLaMAModel::llama_model_type = {
|
@@ -2122,10 +2133,13 @@ public:
|
|
2122
2133
|
rb_define_method(rb_cLLaMAContext, "embeddings", RUBY_METHOD_FUNC(_llama_context_embeddings), 0);
|
2123
2134
|
rb_define_method(rb_cLLaMAContext, "embeddings_ith", RUBY_METHOD_FUNC(_llama_context_embeddings_ith), 1);
|
2124
2135
|
rb_define_method(rb_cLLaMAContext, "embeddings_seq", RUBY_METHOD_FUNC(_llama_context_embeddings_seq), 1);
|
2136
|
+
rb_define_method(rb_cLLaMAContext, "set_n_threads", RUBY_METHOD_FUNC(_llama_context_set_n_threads), -1);
|
2125
2137
|
rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
|
2126
2138
|
rb_define_method(rb_cLLaMAContext, "n_batch", RUBY_METHOD_FUNC(_llama_context_n_batch), 0);
|
2127
2139
|
rb_define_method(rb_cLLaMAContext, "n_ubatch", RUBY_METHOD_FUNC(_llama_context_n_ubatch), 0);
|
2128
2140
|
rb_define_method(rb_cLLaMAContext, "n_seq_max", RUBY_METHOD_FUNC(_llama_context_n_seq_max), 0);
|
2141
|
+
rb_define_method(rb_cLLaMAContext, "n_threads", RUBY_METHOD_FUNC(_llama_context_n_threads), 0);
|
2142
|
+
rb_define_method(rb_cLLaMAContext, "n_threads_batch", RUBY_METHOD_FUNC(_llama_context_n_threads_batch), 0);
|
2129
2143
|
rb_define_method(rb_cLLaMAContext, "timings", RUBY_METHOD_FUNC(_llama_context_get_timings), 0);
|
2130
2144
|
rb_define_method(rb_cLLaMAContext, "print_timings", RUBY_METHOD_FUNC(_llama_context_print_timings), 0);
|
2131
2145
|
rb_define_method(rb_cLLaMAContext, "reset_timings", RUBY_METHOD_FUNC(_llama_context_reset_timings), 0);
|
@@ -2343,6 +2357,33 @@ private:
|
|
2343
2357
|
return output;
|
2344
2358
|
}
|
2345
2359
|
|
2360
|
+
static VALUE _llama_context_set_n_threads(int argc, VALUE* argv, VALUE self) {
|
2361
|
+
VALUE kw_args = Qnil;
|
2362
|
+
ID kw_table[2] = { rb_intern("n_threads"), rb_intern("n_threads_batch") };
|
2363
|
+
VALUE kw_values[2] = { Qundef, Qundef };
|
2364
|
+
rb_scan_args(argc, argv, ":", &kw_args);
|
2365
|
+
rb_get_kwargs(kw_args, kw_table, 2, 0, kw_values);
|
2366
|
+
|
2367
|
+
VALUE n_threads = kw_values[0];
|
2368
|
+
if (!RB_INTEGER_TYPE_P(n_threads)) {
|
2369
|
+
rb_raise(rb_eArgError, "n_threads must be an integer");
|
2370
|
+
return Qnil;
|
2371
|
+
}
|
2372
|
+
VALUE n_threads_batch = kw_values[1];
|
2373
|
+
if (!RB_INTEGER_TYPE_P(n_threads_batch)) {
|
2374
|
+
rb_raise(rb_eArgError, "n_threads_batch must be an integer");
|
2375
|
+
return Qnil;
|
2376
|
+
}
|
2377
|
+
|
2378
|
+
LLaMAContextWrapper* ptr = get_llama_context(self);
|
2379
|
+
if (ptr->ctx == NULL) {
|
2380
|
+
rb_raise(rb_eArgError, "LLaMA context is not initialized");
|
2381
|
+
return Qnil;
|
2382
|
+
}
|
2383
|
+
llama_set_n_threads(ptr->ctx, NUM2UINT(n_threads), NUM2UINT(n_threads_batch));
|
2384
|
+
return Qnil;
|
2385
|
+
}
|
2386
|
+
|
2346
2387
|
static VALUE _llama_context_n_ctx(VALUE self) {
|
2347
2388
|
LLaMAContextWrapper* ptr = get_llama_context(self);
|
2348
2389
|
if (ptr->ctx == NULL) {
|
@@ -2379,6 +2420,24 @@ private:
|
|
2379
2420
|
return UINT2NUM(llama_n_seq_max(ptr->ctx));
|
2380
2421
|
}
|
2381
2422
|
|
2423
|
+
static VALUE _llama_context_n_threads(VALUE self) {
|
2424
|
+
LLaMAContextWrapper* ptr = get_llama_context(self);
|
2425
|
+
if (ptr->ctx == NULL) {
|
2426
|
+
rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
|
2427
|
+
return Qnil;
|
2428
|
+
}
|
2429
|
+
return UINT2NUM(llama_n_threads(ptr->ctx));
|
2430
|
+
}
|
2431
|
+
|
2432
|
+
static VALUE _llama_context_n_threads_batch(VALUE self) {
|
2433
|
+
LLaMAContextWrapper* ptr = get_llama_context(self);
|
2434
|
+
if (ptr->ctx == NULL) {
|
2435
|
+
rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
|
2436
|
+
return Qnil;
|
2437
|
+
}
|
2438
|
+
return UINT2NUM(llama_n_threads_batch(ptr->ctx));
|
2439
|
+
}
|
2440
|
+
|
2382
2441
|
static VALUE _llama_context_get_timings(VALUE self) {
|
2383
2442
|
LLaMAContextWrapper* ptr = get_llama_context(self);
|
2384
2443
|
if (ptr->ctx == NULL) {
|
@@ -3430,9 +3489,11 @@ extern "C" void Init_llama_cpp(void) {
|
|
3430
3489
|
rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_GPT2", INT2NUM(LLAMA_VOCAB_PRE_TYPE_GPT2));
|
3431
3490
|
rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_REFACT", INT2NUM(LLAMA_VOCAB_PRE_TYPE_REFACT));
|
3432
3491
|
rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_COMMAND_R", INT2NUM(LLAMA_VOCAB_PRE_TYPE_COMMAND_R));
|
3492
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_STABLELM2", INT2NUM(LLAMA_VOCAB_PRE_TYPE_STABLELM2));
|
3433
3493
|
rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_QWEN2", INT2NUM(LLAMA_VOCAB_PRE_TYPE_QWEN2));
|
3434
3494
|
rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_OLMO", INT2NUM(LLAMA_VOCAB_PRE_TYPE_OLMO));
|
3435
3495
|
rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_DBRX", INT2NUM(LLAMA_VOCAB_PRE_TYPE_DBRX));
|
3496
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_SMAUG", INT2NUM(LLAMA_VOCAB_PRE_TYPE_SMAUG));
|
3436
3497
|
|
3437
3498
|
rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_UNDEFINED", INT2NUM(LLAMA_TOKEN_TYPE_UNDEFINED));
|
3438
3499
|
rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_NORMAL", INT2NUM(LLAMA_TOKEN_TYPE_NORMAL));
|
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
|
|
3
3
|
# llama_cpp.rb provides Ruby bindings for the llama.cpp.
|
4
4
|
module LLaMACpp
|
5
5
|
# The version of llama_cpp.rb you install.
|
6
|
-
VERSION = '0.15.3'
|
6
|
+
VERSION = '0.15.4'
|
7
7
|
|
8
8
|
# The version of llama.cpp bundled with llama_cpp.rb.
|
9
|
-
LLAMA_CPP_VERSION = 'b2988'
|
9
|
+
LLAMA_CPP_VERSION = 'b3056'
|
10
10
|
end
|
data/sig/llama_cpp.rbs
CHANGED
@@ -26,9 +26,11 @@ module LLaMACpp
|
|
26
26
|
LLAMA_VOCAB_PRE_TYPE_GPT2: Integer
|
27
27
|
LLAMA_VOCAB_PRE_TYPE_REFACT: Integer
|
28
28
|
LLAMA_VOCAB_PRE_TYPE_COMMAND_R: Integer
|
29
|
+
LLAMA_VOCAB_PRE_TYPE_STABLELM2: Integer
|
29
30
|
LLAMA_VOCAB_PRE_TYPE_QWEN2: Integer
|
30
31
|
LLAMA_VOCAB_PRE_TYPE_OLMO: Integer
|
31
32
|
LLAMA_VOCAB_PRE_TYPE_DBRX: Integer
|
33
|
+
LLAMA_VOCAB_PRE_TYPE_SMAUG: Integer
|
32
34
|
|
33
35
|
LLAMA_FTYPE_ALL_F32: Integer
|
34
36
|
LLAMA_FTYPE_MOSTLY_F16: Integer
|
@@ -158,6 +160,7 @@ module LLaMACpp
|
|
158
160
|
def token_suffix: () -> Integer
|
159
161
|
def token_eot: () -> Integer
|
160
162
|
def token_is_eog?: (Integer) -> bool
|
163
|
+
def token_is_control?: (Integer) -> bool
|
161
164
|
end
|
162
165
|
|
163
166
|
class Timings
|
@@ -241,10 +244,13 @@ module LLaMACpp
|
|
241
244
|
def embeddings_seq: (Integer) -> Array[Float]
|
242
245
|
def decode: (::LLaMACpp::Batch) -> void
|
243
246
|
def logits: () -> Array[Float]
|
247
|
+
def set_n_threads: (n_threads: Integer, n_threads_batch: Integer) -> void
|
244
248
|
def n_ctx: () -> Integer
|
245
249
|
def n_batch: () -> Integer
|
246
250
|
def n_ubatch: () -> Integer
|
247
251
|
def n_seq_max: () -> Integer
|
252
|
+
def n_threads: () -> Integer
|
253
|
+
def n_threads_batch: () -> Integer
|
248
254
|
def timings: () -> ::LLaMACpp::Timings
|
249
255
|
def print_timings: () -> void
|
250
256
|
def reset_timings: () -> void
|
@@ -381,15 +381,16 @@ ifneq ($(filter ppc64le%,$(UNAME_M)),)
|
|
381
381
|
CUDA_POWER_ARCH = 1
|
382
382
|
endif
|
383
383
|
|
384
|
+
ifneq ($(filter loongarch64%,$(UNAME_M)),)
|
385
|
+
MK_CFLAGS += -mlasx
|
386
|
+
MK_CXXFLAGS += -mlasx
|
387
|
+
endif
|
388
|
+
|
384
389
|
else
|
385
390
|
MK_CFLAGS += -march=rv64gcv -mabi=lp64d
|
386
391
|
MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
|
387
392
|
endif
|
388
393
|
|
389
|
-
ifdef LLAMA_QKK_64
|
390
|
-
MK_CPPFLAGS += -DGGML_QKK_64
|
391
|
-
endif
|
392
|
-
|
393
394
|
ifndef LLAMA_NO_ACCELERATE
|
394
395
|
# Mac OS - include Accelerate framework.
|
395
396
|
# `-framework Accelerate` works both with Apple Silicon and Mac Intel
|
@@ -401,13 +402,6 @@ ifndef LLAMA_NO_ACCELERATE
|
|
401
402
|
endif
|
402
403
|
endif # LLAMA_NO_ACCELERATE
|
403
404
|
|
404
|
-
ifdef LLAMA_MPI
|
405
|
-
MK_CPPFLAGS += -DGGML_USE_MPI
|
406
|
-
MK_CFLAGS += -Wno-cast-qual
|
407
|
-
MK_CXXFLAGS += -Wno-cast-qual
|
408
|
-
OBJS += ggml-mpi.o
|
409
|
-
endif # LLAMA_MPI
|
410
|
-
|
411
405
|
ifdef LLAMA_OPENBLAS
|
412
406
|
MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas)
|
413
407
|
MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas)
|
@@ -449,6 +443,9 @@ endif # JETSON_EOL_MODULE_DETECT
|
|
449
443
|
ifdef LLAMA_DEBUG
|
450
444
|
MK_NVCCFLAGS += -lineinfo
|
451
445
|
endif # LLAMA_DEBUG
|
446
|
+
ifdef LLAMA_CUDA_DEBUG
|
447
|
+
MK_NVCCFLAGS += --device-debug
|
448
|
+
endif # LLAMA_CUDA_DEBUG
|
452
449
|
ifdef LLAMA_CUDA_NVCC
|
453
450
|
NVCC = $(CCACHE) $(LLAMA_CUDA_NVCC)
|
454
451
|
else
|
@@ -631,11 +628,6 @@ ggml-metal-embed.o: ggml-metal.metal ggml-common.h
|
|
631
628
|
endif
|
632
629
|
endif # LLAMA_METAL
|
633
630
|
|
634
|
-
ifdef LLAMA_MPI
|
635
|
-
ggml-mpi.o: ggml-mpi.c ggml-mpi.h
|
636
|
-
$(CC) $(CFLAGS) -c $< -o $@
|
637
|
-
endif # LLAMA_MPI
|
638
|
-
|
639
631
|
ifndef LLAMA_NO_LLAMAFILE
|
640
632
|
sgemm.o: sgemm.cpp sgemm.h ggml.h
|
641
633
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
@@ -65,13 +65,8 @@ typedef sycl::half2 ggml_half2;
|
|
65
65
|
// QK = number of values after dequantization
|
66
66
|
// QK_K = super-block size
|
67
67
|
|
68
|
-
#ifdef GGML_QKK_64
|
69
|
-
#define QK_K 64
|
70
|
-
#define K_SCALE_SIZE 4
|
71
|
-
#else
|
72
68
|
#define QK_K 256
|
73
69
|
#define K_SCALE_SIZE 12
|
74
|
-
#endif // GGML_QKK_64
|
75
70
|
|
76
71
|
#if defined(GGML_COMMON_DECL_CUDA) || defined(GGML_COMMON_DECL_HIP) || defined(GGML_COMMON_DECL_SYCL)
|
77
72
|
// QR = QK / number of values before dequantization
|
@@ -131,13 +126,8 @@ typedef sycl::half2 ggml_half2;
|
|
131
126
|
#define QI4_NL (QK4_NL / (4*QR4_NL))
|
132
127
|
#define QR4_NL 2
|
133
128
|
|
134
|
-
#if QK_K == 64
|
135
|
-
#define QI4_XS QI4_NL
|
136
|
-
#define QR4_XS QR4_NL
|
137
|
-
#else
|
138
129
|
#define QI4_XS (QK_K / (4*QR4_XS))
|
139
130
|
#define QR4_XS 8
|
140
|
-
#endif
|
141
131
|
|
142
132
|
#endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP
|
143
133
|
|
@@ -228,15 +218,6 @@ static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_half) + QK_K/16 + QK_K/4, "wro
|
|
228
218
|
// weight is represented as x = a * q
|
229
219
|
// 16 blocks of 16 elements each
|
230
220
|
// Effectively 3.4375 bits per weight
|
231
|
-
#ifdef GGML_QKK_64
|
232
|
-
typedef struct {
|
233
|
-
uint8_t hmask[QK_K/8]; // quants - high bit
|
234
|
-
uint8_t qs[QK_K/4]; // quants - low 2 bits
|
235
|
-
uint8_t scales[2];
|
236
|
-
ggml_half d; // super-block scale
|
237
|
-
} block_q3_K;
|
238
|
-
static_assert(sizeof(block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding");
|
239
|
-
#else
|
240
221
|
typedef struct {
|
241
222
|
uint8_t hmask[QK_K/8]; // quants - high bit
|
242
223
|
uint8_t qs[QK_K/4]; // quants - low 2 bits
|
@@ -244,20 +225,11 @@ typedef struct {
|
|
244
225
|
ggml_half d; // super-block scale
|
245
226
|
} block_q3_K;
|
246
227
|
static_assert(sizeof(block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding");
|
247
|
-
#endif
|
248
228
|
|
249
229
|
// 4-bit quantization
|
250
230
|
// 8 blocks of 32 elements each
|
251
231
|
// weight is represented as x = a * q + b
|
252
232
|
// Effectively 4.5 bits per weight
|
253
|
-
#ifdef GGML_QKK_64
|
254
|
-
typedef struct {
|
255
|
-
ggml_half d[2]; // super-block scales/mins
|
256
|
-
uint8_t scales[2]; // 4-bit block scales/mins
|
257
|
-
uint8_t qs[QK_K/2]; // 4--bit quants
|
258
|
-
} block_q4_K;
|
259
|
-
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + QK_K/2 + 2, "wrong q4_K block size/padding");
|
260
|
-
#else
|
261
233
|
typedef struct {
|
262
234
|
union {
|
263
235
|
struct {
|
@@ -270,21 +242,11 @@ typedef struct {
|
|
270
242
|
uint8_t qs[QK_K/2]; // 4--bit quants
|
271
243
|
} block_q4_K;
|
272
244
|
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");
|
273
|
-
#endif
|
274
245
|
|
275
246
|
// 5-bit quantization
|
276
247
|
// 8 blocks of 32 elements each
|
277
248
|
// weight is represented as x = a * q + b
|
278
249
|
// Effectively 5.5 bits per weight
|
279
|
-
#ifdef GGML_QKK_64
|
280
|
-
typedef struct {
|
281
|
-
ggml_half d; // super-block scale
|
282
|
-
int8_t scales[QK_K/16]; // 8-bit block scales
|
283
|
-
uint8_t qh[QK_K/8]; // quants, high bit
|
284
|
-
uint8_t qs[QK_K/2]; // quants, low 4 bits
|
285
|
-
} block_q5_K;
|
286
|
-
static_assert(sizeof(block_q5_K) == sizeof(ggml_half) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
|
287
|
-
#else
|
288
250
|
typedef struct {
|
289
251
|
union {
|
290
252
|
struct {
|
@@ -298,7 +260,6 @@ typedef struct {
|
|
298
260
|
uint8_t qs[QK_K/2]; // quants, low 4 bits
|
299
261
|
} block_q5_K;
|
300
262
|
static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
|
301
|
-
#endif
|
302
263
|
|
303
264
|
// 6-bit quantization
|
304
265
|
// weight is represented as x = a * q
|
@@ -356,11 +317,7 @@ typedef struct {
|
|
356
317
|
static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_half) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
|
357
318
|
|
358
319
|
// 3.4375 bpw
|
359
|
-
#if QK_K == 64
|
360
|
-
#define IQ3S_N_SCALE 2
|
361
|
-
#else
|
362
320
|
#define IQ3S_N_SCALE QK_K/64
|
363
|
-
#endif
|
364
321
|
typedef struct {
|
365
322
|
ggml_half d;
|
366
323
|
uint8_t qs[QK_K/4];
|
@@ -381,16 +338,9 @@ static_assert(sizeof(block_iq1_s) == sizeof(ggml_half) + QK_K/8 + QK_K/16, "wron
|
|
381
338
|
typedef struct {
|
382
339
|
uint8_t qs[QK_K/8]; // grid index, low 8 bits
|
383
340
|
uint8_t qh[QK_K/16]; // grid index, high 3 bits + grid shift bit (for two groups of 8)
|
384
|
-
#if QK_K == 64
|
385
|
-
ggml_half d;
|
386
|
-
#endif
|
387
341
|
uint8_t scales[QK_K/32]; // 3-bit block scales (4-bit if QK_K == 64)
|
388
342
|
} block_iq1_m;
|
389
|
-
#if QK_K == 64
|
390
|
-
static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32 + sizeof(ggml_half), "wrong iq1_m block size/padding");
|
391
|
-
#else
|
392
343
|
static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32, "wrong iq1_m block size/padding");
|
393
|
-
#endif
|
394
344
|
|
395
345
|
// Used by IQ1_M quants
|
396
346
|
typedef union {
|
@@ -406,9 +356,6 @@ typedef struct {
|
|
406
356
|
} block_iq4_nl;
|
407
357
|
static_assert(sizeof(block_iq4_nl) == sizeof(ggml_half) + QK4_NL/2, "wrong iq4_nl block size/padding");
|
408
358
|
|
409
|
-
#if QK_K == 64
|
410
|
-
#define block_iq4_xs block_iq4_nl
|
411
|
-
#else
|
412
359
|
typedef struct {
|
413
360
|
ggml_half d;
|
414
361
|
uint16_t scales_h;
|
@@ -416,7 +363,6 @@ typedef struct {
|
|
416
363
|
uint8_t qs[QK_K/2];
|
417
364
|
} block_iq4_xs;
|
418
365
|
static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
|
419
|
-
#endif
|
420
366
|
|
421
367
|
#endif // GGML_COMMON_DECL
|
422
368
|
#endif // GGML_COMMON_DECL
|