llama_cpp 0.15.1 → 0.15.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/ext/llama_cpp/llama_cpp.cpp +49 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -0
- data/vendor/tmp/llama.cpp/Makefile +9 -20
- data/vendor/tmp/llama.cpp/ggml-backend.c +2 -3
- data/vendor/tmp/llama.cpp/ggml-common.h +0 -54
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +87 -37
- data/vendor/tmp/llama.cpp/ggml-cuda.h +1 -0
- data/vendor/tmp/llama.cpp/ggml-impl.h +47 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +13 -3
- data/vendor/tmp/llama.cpp/ggml-metal.m +177 -190
- data/vendor/tmp/llama.cpp/ggml-metal.metal +97 -505
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +4 -1
- data/vendor/tmp/llama.cpp/ggml-quants.c +3660 -2057
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +1155 -0
- data/vendor/tmp/llama.cpp/ggml-rpc.h +24 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +60 -639
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +9351 -5627
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +203 -224
- data/vendor/tmp/llama.cpp/ggml.c +1168 -1470
- data/vendor/tmp/llama.cpp/ggml.h +67 -44
- data/vendor/tmp/llama.cpp/llama.cpp +1371 -944
- data/vendor/tmp/llama.cpp/llama.h +13 -3
- data/vendor/tmp/llama.cpp/unicode-data.cpp +6969 -2169
- data/vendor/tmp/llama.cpp/unicode-data.h +15 -12
- data/vendor/tmp/llama.cpp/unicode.cpp +89 -111
- data/vendor/tmp/llama.cpp/unicode.h +44 -12
- metadata +5 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d0a9cdf86695522e27b1e8d3ed485dfa6ab3a4fc23d9bd9e44bf8c3cb483c347
|
4
|
+
data.tar.gz: 5d97cec87f9b1df94f85f9e18dc46a1b8a4ec593c17d04e4bee0da3d28c34211
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 71f26009b872db64d0d0d416153b5fbd6afb598617b701cb6342d099542c962f410bccddf80b77928bfd8ab8f017a749fbc1d2ed488139d806ef0e3cf75a0e42
|
7
|
+
data.tar.gz: 808c03f6664af65cadfea23071d0b55d459c119189346762ea9632156f7f35b8d1f0e594b356726fc26abdb1c81a3bce9d697b9ca2d6324c454a31f2a442f0d7
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,17 @@
|
|
1
|
+
## [[0.15.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.15.2...v0.15.3)] - 2024-05-25
|
2
|
+
|
3
|
+
- Bump llama.cpp from b2917 to b2988.
|
4
|
+
- Add constants for pre-tokenization types.
|
5
|
+
- Add `n_threads` method to `Context`.
|
6
|
+
- Add `n_threads_batch` method to `Context`.
|
7
|
+
- Add `set_n_threads` method to `Context`.
|
8
|
+
|
9
|
+
## [[0.15.2](https://github.com/yoshoku/llama_cpp.rb/compare/v0.15.1...v0.15.2)] - 2024-05-18
|
10
|
+
|
11
|
+
- Bump llama.cpp from b2839 to b2917.
|
12
|
+
|
13
|
+
Implementation binding for rpc_servers in llama_model_params has been skipped.
|
14
|
+
|
1
15
|
## [[0.15.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.15.0...v0.15.1)] - 2024-05-11
|
2
16
|
|
3
17
|
- Bump llama.cpp from b2781 to b2839.
|
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -2122,10 +2122,13 @@ public:
|
|
2122
2122
|
rb_define_method(rb_cLLaMAContext, "embeddings", RUBY_METHOD_FUNC(_llama_context_embeddings), 0);
|
2123
2123
|
rb_define_method(rb_cLLaMAContext, "embeddings_ith", RUBY_METHOD_FUNC(_llama_context_embeddings_ith), 1);
|
2124
2124
|
rb_define_method(rb_cLLaMAContext, "embeddings_seq", RUBY_METHOD_FUNC(_llama_context_embeddings_seq), 1);
|
2125
|
+
rb_define_method(rb_cLLaMAContext, "set_n_threads", RUBY_METHOD_FUNC(_llama_context_set_n_threads), -1);
|
2125
2126
|
rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
|
2126
2127
|
rb_define_method(rb_cLLaMAContext, "n_batch", RUBY_METHOD_FUNC(_llama_context_n_batch), 0);
|
2127
2128
|
rb_define_method(rb_cLLaMAContext, "n_ubatch", RUBY_METHOD_FUNC(_llama_context_n_ubatch), 0);
|
2128
2129
|
rb_define_method(rb_cLLaMAContext, "n_seq_max", RUBY_METHOD_FUNC(_llama_context_n_seq_max), 0);
|
2130
|
+
rb_define_method(rb_cLLaMAContext, "n_threads", RUBY_METHOD_FUNC(_llama_context_n_threads), 0);
|
2131
|
+
rb_define_method(rb_cLLaMAContext, "n_threads_batch", RUBY_METHOD_FUNC(_llama_context_n_threads_batch), 0);
|
2129
2132
|
rb_define_method(rb_cLLaMAContext, "timings", RUBY_METHOD_FUNC(_llama_context_get_timings), 0);
|
2130
2133
|
rb_define_method(rb_cLLaMAContext, "print_timings", RUBY_METHOD_FUNC(_llama_context_print_timings), 0);
|
2131
2134
|
rb_define_method(rb_cLLaMAContext, "reset_timings", RUBY_METHOD_FUNC(_llama_context_reset_timings), 0);
|
@@ -2343,6 +2346,33 @@ private:
|
|
2343
2346
|
return output;
|
2344
2347
|
}
|
2345
2348
|
|
2349
|
+
static VALUE _llama_context_set_n_threads(int argc, VALUE* argv, VALUE self) {
|
2350
|
+
VALUE kw_args = Qnil;
|
2351
|
+
ID kw_table[2] = { rb_intern("n_threads"), rb_intern("n_threads_batch") };
|
2352
|
+
VALUE kw_values[2] = { Qundef, Qundef };
|
2353
|
+
rb_scan_args(argc, argv, ":", &kw_args);
|
2354
|
+
rb_get_kwargs(kw_args, kw_table, 2, 0, kw_values);
|
2355
|
+
|
2356
|
+
VALUE n_threads = kw_values[0];
|
2357
|
+
if (!RB_INTEGER_TYPE_P(n_threads)) {
|
2358
|
+
rb_raise(rb_eArgError, "n_threads must be an integer");
|
2359
|
+
return Qnil;
|
2360
|
+
}
|
2361
|
+
VALUE n_threads_batch = kw_values[1];
|
2362
|
+
if (!RB_INTEGER_TYPE_P(n_threads_batch)) {
|
2363
|
+
rb_raise(rb_eArgError, "n_threads_batch must be an integer");
|
2364
|
+
return Qnil;
|
2365
|
+
}
|
2366
|
+
|
2367
|
+
LLaMAContextWrapper* ptr = get_llama_context(self);
|
2368
|
+
if (ptr->ctx == NULL) {
|
2369
|
+
rb_raise(rb_eArgError, "LLaMA context is not initialized");
|
2370
|
+
return Qnil;
|
2371
|
+
}
|
2372
|
+
llama_set_n_threads(ptr->ctx, NUM2UINT(n_threads), NUM2UINT(n_threads_batch));
|
2373
|
+
return Qnil;
|
2374
|
+
}
|
2375
|
+
|
2346
2376
|
static VALUE _llama_context_n_ctx(VALUE self) {
|
2347
2377
|
LLaMAContextWrapper* ptr = get_llama_context(self);
|
2348
2378
|
if (ptr->ctx == NULL) {
|
@@ -2379,6 +2409,24 @@ private:
|
|
2379
2409
|
return UINT2NUM(llama_n_seq_max(ptr->ctx));
|
2380
2410
|
}
|
2381
2411
|
|
2412
|
+
static VALUE _llama_context_n_threads(VALUE self) {
|
2413
|
+
LLaMAContextWrapper* ptr = get_llama_context(self);
|
2414
|
+
if (ptr->ctx == NULL) {
|
2415
|
+
rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
|
2416
|
+
return Qnil;
|
2417
|
+
}
|
2418
|
+
return UINT2NUM(llama_n_threads(ptr->ctx));
|
2419
|
+
}
|
2420
|
+
|
2421
|
+
static VALUE _llama_context_n_threads_batch(VALUE self) {
|
2422
|
+
LLaMAContextWrapper* ptr = get_llama_context(self);
|
2423
|
+
if (ptr->ctx == NULL) {
|
2424
|
+
rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
|
2425
|
+
return Qnil;
|
2426
|
+
}
|
2427
|
+
return UINT2NUM(llama_n_threads_batch(ptr->ctx));
|
2428
|
+
}
|
2429
|
+
|
2382
2430
|
static VALUE _llama_context_get_timings(VALUE self) {
|
2383
2431
|
LLaMAContextWrapper* ptr = get_llama_context(self);
|
2384
2432
|
if (ptr->ctx == NULL) {
|
@@ -3430,6 +3478,7 @@ extern "C" void Init_llama_cpp(void) {
|
|
3430
3478
|
rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_GPT2", INT2NUM(LLAMA_VOCAB_PRE_TYPE_GPT2));
|
3431
3479
|
rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_REFACT", INT2NUM(LLAMA_VOCAB_PRE_TYPE_REFACT));
|
3432
3480
|
rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_COMMAND_R", INT2NUM(LLAMA_VOCAB_PRE_TYPE_COMMAND_R));
|
3481
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_STABLELM2", INT2NUM(LLAMA_VOCAB_PRE_TYPE_STABLELM2));
|
3433
3482
|
rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_QWEN2", INT2NUM(LLAMA_VOCAB_PRE_TYPE_QWEN2));
|
3434
3483
|
rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_OLMO", INT2NUM(LLAMA_VOCAB_PRE_TYPE_OLMO));
|
3435
3484
|
rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_DBRX", INT2NUM(LLAMA_VOCAB_PRE_TYPE_DBRX));
|
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
|
|
3
3
|
# llama_cpp.rb provides Ruby bindings for the llama.cpp.
|
4
4
|
module LLaMACpp
|
5
5
|
# The version of llama_cpp.rb you install.
|
6
|
-
VERSION = '0.15.1'
|
6
|
+
VERSION = '0.15.3'
|
7
7
|
|
8
8
|
# The version of llama.cpp bundled with llama_cpp.rb.
|
9
|
-
LLAMA_CPP_VERSION = 'b2839'
|
9
|
+
LLAMA_CPP_VERSION = 'b2988'
|
10
10
|
end
|
data/sig/llama_cpp.rbs
CHANGED
@@ -26,6 +26,7 @@ module LLaMACpp
|
|
26
26
|
LLAMA_VOCAB_PRE_TYPE_GPT2: Integer
|
27
27
|
LLAMA_VOCAB_PRE_TYPE_REFACT: Integer
|
28
28
|
LLAMA_VOCAB_PRE_TYPE_COMMAND_R: Integer
|
29
|
+
LLAMA_VOCAB_PRE_TYPE_STABLELM2: Integer
|
29
30
|
LLAMA_VOCAB_PRE_TYPE_QWEN2: Integer
|
30
31
|
LLAMA_VOCAB_PRE_TYPE_OLMO: Integer
|
31
32
|
LLAMA_VOCAB_PRE_TYPE_DBRX: Integer
|
@@ -241,10 +242,13 @@ module LLaMACpp
|
|
241
242
|
def embeddings_seq: (Integer) -> Array[Float]
|
242
243
|
def decode: (::LLaMACpp::Batch) -> void
|
243
244
|
def logits: () -> Array[Float]
|
245
|
+
def set_n_threads: (n_threads: Integer, n_threads_batch: Integer) -> void
|
244
246
|
def n_ctx: () -> Integer
|
245
247
|
def n_batch: () -> Integer
|
246
248
|
def n_ubatch: () -> Integer
|
247
249
|
def n_seq_max: () -> Integer
|
250
|
+
def n_threads: () -> Integer
|
251
|
+
def n_threads_batch: () -> Integer
|
248
252
|
def timings: () -> ::LLaMACpp::Timings
|
249
253
|
def print_timings: () -> void
|
250
254
|
def reset_timings: () -> void
|
@@ -381,15 +381,16 @@ ifneq ($(filter ppc64le%,$(UNAME_M)),)
|
|
381
381
|
CUDA_POWER_ARCH = 1
|
382
382
|
endif
|
383
383
|
|
384
|
+
ifneq ($(filter loongarch64%,$(UNAME_M)),)
|
385
|
+
MK_CFLAGS += -mlasx
|
386
|
+
MK_CXXFLAGS += -mlasx
|
387
|
+
endif
|
388
|
+
|
384
389
|
else
|
385
390
|
MK_CFLAGS += -march=rv64gcv -mabi=lp64d
|
386
391
|
MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
|
387
392
|
endif
|
388
393
|
|
389
|
-
ifdef LLAMA_QKK_64
|
390
|
-
MK_CPPFLAGS += -DGGML_QKK_64
|
391
|
-
endif
|
392
|
-
|
393
394
|
ifndef LLAMA_NO_ACCELERATE
|
394
395
|
# Mac OS - include Accelerate framework.
|
395
396
|
# `-framework Accelerate` works both with Apple Silicon and Mac Intel
|
@@ -401,13 +402,6 @@ ifndef LLAMA_NO_ACCELERATE
|
|
401
402
|
endif
|
402
403
|
endif # LLAMA_NO_ACCELERATE
|
403
404
|
|
404
|
-
ifdef LLAMA_MPI
|
405
|
-
MK_CPPFLAGS += -DGGML_USE_MPI
|
406
|
-
MK_CFLAGS += -Wno-cast-qual
|
407
|
-
MK_CXXFLAGS += -Wno-cast-qual
|
408
|
-
OBJS += ggml-mpi.o
|
409
|
-
endif # LLAMA_MPI
|
410
|
-
|
411
405
|
ifdef LLAMA_OPENBLAS
|
412
406
|
MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas)
|
413
407
|
MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas)
|
@@ -562,10 +556,10 @@ endif # LLAMA_VULKAN
|
|
562
556
|
ifdef LLAMA_HIPBLAS
|
563
557
|
ifeq ($(wildcard /opt/rocm),)
|
564
558
|
ROCM_PATH ?= /usr
|
565
|
-
|
559
|
+
AMDGPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
|
566
560
|
else
|
567
561
|
ROCM_PATH ?= /opt/rocm
|
568
|
-
|
562
|
+
AMDGPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
|
569
563
|
endif
|
570
564
|
HIPCC ?= $(CCACHE) $(ROCM_PATH)/bin/hipcc
|
571
565
|
LLAMA_CUDA_DMMV_X ?= 32
|
@@ -577,7 +571,7 @@ ifdef LLAMA_HIP_UMA
|
|
577
571
|
endif # LLAMA_HIP_UMA
|
578
572
|
MK_LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
|
579
573
|
MK_LDFLAGS += -lhipblas -lamdhip64 -lrocblas
|
580
|
-
HIPFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS))
|
574
|
+
HIPFLAGS += $(addprefix --offload-arch=,$(AMDGPU_TARGETS))
|
581
575
|
HIPFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
|
582
576
|
HIPFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
|
583
577
|
HIPFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
|
@@ -631,11 +625,6 @@ ggml-metal-embed.o: ggml-metal.metal ggml-common.h
|
|
631
625
|
endif
|
632
626
|
endif # LLAMA_METAL
|
633
627
|
|
634
|
-
ifdef LLAMA_MPI
|
635
|
-
ggml-mpi.o: ggml-mpi.c ggml-mpi.h
|
636
|
-
$(CC) $(CFLAGS) -c $< -o $@
|
637
|
-
endif # LLAMA_MPI
|
638
|
-
|
639
628
|
ifndef LLAMA_NO_LLAMAFILE
|
640
629
|
sgemm.o: sgemm.cpp sgemm.h ggml.h
|
641
630
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
@@ -760,7 +749,7 @@ lib: llama.o ggml.o $(OBJS)
|
|
760
749
|
ar rcs libllama.a $^
|
761
750
|
|
762
751
|
clean:
|
763
|
-
rm -vrf *.o tests/*.o *.so *.a *.dll
|
752
|
+
rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
|
764
753
|
rm -vrf ggml-cuda/*.o
|
765
754
|
|
766
755
|
#
|
@@ -1182,9 +1182,9 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
|
|
1182
1182
|
static char * fmt_size(size_t size) {
|
1183
1183
|
static char buffer[128];
|
1184
1184
|
if (size >= 1024*1024) {
|
1185
|
-
|
1185
|
+
snprintf(buffer, sizeof(buffer), "%zuM", size/1024/1024);
|
1186
1186
|
} else {
|
1187
|
-
|
1187
|
+
snprintf(buffer, sizeof(buffer), "%zuK", size/1024);
|
1188
1188
|
}
|
1189
1189
|
return buffer;
|
1190
1190
|
}
|
@@ -1895,7 +1895,6 @@ void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * t
|
|
1895
1895
|
|
1896
1896
|
tensor->buffer = buffer;
|
1897
1897
|
tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
|
1898
|
-
tensor->backend = tensor->view_src->backend;
|
1899
1898
|
ggml_backend_buffer_init_tensor(buffer, tensor);
|
1900
1899
|
}
|
1901
1900
|
|
@@ -65,13 +65,8 @@ typedef sycl::half2 ggml_half2;
|
|
65
65
|
// QK = number of values after dequantization
|
66
66
|
// QK_K = super-block size
|
67
67
|
|
68
|
-
#ifdef GGML_QKK_64
|
69
|
-
#define QK_K 64
|
70
|
-
#define K_SCALE_SIZE 4
|
71
|
-
#else
|
72
68
|
#define QK_K 256
|
73
69
|
#define K_SCALE_SIZE 12
|
74
|
-
#endif // GGML_QKK_64
|
75
70
|
|
76
71
|
#if defined(GGML_COMMON_DECL_CUDA) || defined(GGML_COMMON_DECL_HIP) || defined(GGML_COMMON_DECL_SYCL)
|
77
72
|
// QR = QK / number of values before dequantization
|
@@ -131,13 +126,8 @@ typedef sycl::half2 ggml_half2;
|
|
131
126
|
#define QI4_NL (QK4_NL / (4*QR4_NL))
|
132
127
|
#define QR4_NL 2
|
133
128
|
|
134
|
-
#if QK_K == 64
|
135
|
-
#define QI4_XS QI4_NL
|
136
|
-
#define QR4_XS QR4_NL
|
137
|
-
#else
|
138
129
|
#define QI4_XS (QK_K / (4*QR4_XS))
|
139
130
|
#define QR4_XS 8
|
140
|
-
#endif
|
141
131
|
|
142
132
|
#endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP
|
143
133
|
|
@@ -228,15 +218,6 @@ static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_half) + QK_K/16 + QK_K/4, "wro
|
|
228
218
|
// weight is represented as x = a * q
|
229
219
|
// 16 blocks of 16 elements each
|
230
220
|
// Effectively 3.4375 bits per weight
|
231
|
-
#ifdef GGML_QKK_64
|
232
|
-
typedef struct {
|
233
|
-
uint8_t hmask[QK_K/8]; // quants - high bit
|
234
|
-
uint8_t qs[QK_K/4]; // quants - low 2 bits
|
235
|
-
uint8_t scales[2];
|
236
|
-
ggml_half d; // super-block scale
|
237
|
-
} block_q3_K;
|
238
|
-
static_assert(sizeof(block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding");
|
239
|
-
#else
|
240
221
|
typedef struct {
|
241
222
|
uint8_t hmask[QK_K/8]; // quants - high bit
|
242
223
|
uint8_t qs[QK_K/4]; // quants - low 2 bits
|
@@ -244,20 +225,11 @@ typedef struct {
|
|
244
225
|
ggml_half d; // super-block scale
|
245
226
|
} block_q3_K;
|
246
227
|
static_assert(sizeof(block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding");
|
247
|
-
#endif
|
248
228
|
|
249
229
|
// 4-bit quantization
|
250
230
|
// 8 blocks of 32 elements each
|
251
231
|
// weight is represented as x = a * q + b
|
252
232
|
// Effectively 4.5 bits per weight
|
253
|
-
#ifdef GGML_QKK_64
|
254
|
-
typedef struct {
|
255
|
-
ggml_half d[2]; // super-block scales/mins
|
256
|
-
uint8_t scales[2]; // 4-bit block scales/mins
|
257
|
-
uint8_t qs[QK_K/2]; // 4--bit quants
|
258
|
-
} block_q4_K;
|
259
|
-
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + QK_K/2 + 2, "wrong q4_K block size/padding");
|
260
|
-
#else
|
261
233
|
typedef struct {
|
262
234
|
union {
|
263
235
|
struct {
|
@@ -270,21 +242,11 @@ typedef struct {
|
|
270
242
|
uint8_t qs[QK_K/2]; // 4--bit quants
|
271
243
|
} block_q4_K;
|
272
244
|
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");
|
273
|
-
#endif
|
274
245
|
|
275
246
|
// 5-bit quantization
|
276
247
|
// 8 blocks of 32 elements each
|
277
248
|
// weight is represented as x = a * q + b
|
278
249
|
// Effectively 5.5 bits per weight
|
279
|
-
#ifdef GGML_QKK_64
|
280
|
-
typedef struct {
|
281
|
-
ggml_half d; // super-block scale
|
282
|
-
int8_t scales[QK_K/16]; // 8-bit block scales
|
283
|
-
uint8_t qh[QK_K/8]; // quants, high bit
|
284
|
-
uint8_t qs[QK_K/2]; // quants, low 4 bits
|
285
|
-
} block_q5_K;
|
286
|
-
static_assert(sizeof(block_q5_K) == sizeof(ggml_half) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
|
287
|
-
#else
|
288
250
|
typedef struct {
|
289
251
|
union {
|
290
252
|
struct {
|
@@ -298,7 +260,6 @@ typedef struct {
|
|
298
260
|
uint8_t qs[QK_K/2]; // quants, low 4 bits
|
299
261
|
} block_q5_K;
|
300
262
|
static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
|
301
|
-
#endif
|
302
263
|
|
303
264
|
// 6-bit quantization
|
304
265
|
// weight is represented as x = a * q
|
@@ -356,11 +317,7 @@ typedef struct {
|
|
356
317
|
static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_half) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
|
357
318
|
|
358
319
|
// 3.4375 bpw
|
359
|
-
#if QK_K == 64
|
360
|
-
#define IQ3S_N_SCALE 2
|
361
|
-
#else
|
362
320
|
#define IQ3S_N_SCALE QK_K/64
|
363
|
-
#endif
|
364
321
|
typedef struct {
|
365
322
|
ggml_half d;
|
366
323
|
uint8_t qs[QK_K/4];
|
@@ -381,16 +338,9 @@ static_assert(sizeof(block_iq1_s) == sizeof(ggml_half) + QK_K/8 + QK_K/16, "wron
|
|
381
338
|
typedef struct {
|
382
339
|
uint8_t qs[QK_K/8]; // grid index, low 8 bits
|
383
340
|
uint8_t qh[QK_K/16]; // grid index, high 3 bits + grid shift bit (for two groups of 8)
|
384
|
-
#if QK_K == 64
|
385
|
-
ggml_half d;
|
386
|
-
#endif
|
387
341
|
uint8_t scales[QK_K/32]; // 3-bit block scales (4-bit if QK_K == 64)
|
388
342
|
} block_iq1_m;
|
389
|
-
#if QK_K == 64
|
390
|
-
static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32 + sizeof(ggml_half), "wrong iq1_m block size/padding");
|
391
|
-
#else
|
392
343
|
static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32, "wrong iq1_m block size/padding");
|
393
|
-
#endif
|
394
344
|
|
395
345
|
// Used by IQ1_M quants
|
396
346
|
typedef union {
|
@@ -406,9 +356,6 @@ typedef struct {
|
|
406
356
|
} block_iq4_nl;
|
407
357
|
static_assert(sizeof(block_iq4_nl) == sizeof(ggml_half) + QK4_NL/2, "wrong iq4_nl block size/padding");
|
408
358
|
|
409
|
-
#if QK_K == 64
|
410
|
-
#define block_iq4_xs block_iq4_nl
|
411
|
-
#else
|
412
359
|
typedef struct {
|
413
360
|
ggml_half d;
|
414
361
|
uint16_t scales_h;
|
@@ -416,7 +363,6 @@ typedef struct {
|
|
416
363
|
uint8_t qs[QK_K/2];
|
417
364
|
} block_iq4_xs;
|
418
365
|
static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
|
419
|
-
#endif
|
420
366
|
|
421
367
|
#endif // GGML_COMMON_DECL
|
422
368
|
#endif // GGML_COMMON_DECL
|