llama_cpp 0.15.2 → 0.15.4

This diff shows the changes between publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 30dd4c29b86098faf7c78de5fa8e57021b631bb5eb3d14c93f63f1d186383ab8
- data.tar.gz: b011d891f1cd725f84821428a8db24004b52c9614e785f493f721f7abde71029
+ metadata.gz: 167132898a0cb63faaf4fd7583d9b988992ba7c5ec0f5602d5a158f04e0cdfa0
+ data.tar.gz: 8a65658eb93b9cf80d5ede554b15968c495f045c32e57cc96ed732c56330d25f
  SHA512:
- metadata.gz: 6c1628f93762747688f802db8593946e8581c869f63c610669b45759f644b3d19b061825b788e328b6b984977112837586ed398b6118a8f8e5f0c7f6fd0eb2dd
- data.tar.gz: 2f8c3d9f1e6c0f6db7e0682995c8d34179d5405d32784bf00f04a3408cb5bf4c95557bfa1692026f8d3dc9e672d6b15dec5d33cbd76ddc1d94e5ec964a9d0409
+ metadata.gz: 9625ac088c4d5c50cc51bbbcbc744cb7041766ccbb7a42a9cd1b80b29ebe64414d39875dea5d61a87025e239ad78be2a2ea4d3f85a187684321e409fc01a40fd
+ data.tar.gz: 6f68445f10765a4eb1124ed1cfd2afb7544d146823efad27b2b6955bb0ee822ae8b0f9cccb68777c8cb211f665a0e2531eba04a4240399af1101a5dbcd645ae9
data/CHANGELOG.md CHANGED
@@ -1,3 +1,17 @@
+ ## [[0.15.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.15.3...v0.15.4)] - 2024-06-01
+
+ - Bump llama.cpp from b2988 to b3056.
+ - Add LLAMA_VOCAB_PRE_TYPE_SMAUG constant.
+ - Add `token_is_control?` method to `Model`.
+
+ ## [[0.15.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.15.2...v0.15.3)] - 2024-05-25
+
+ - Bump llama.cpp from b2917 to b2988.
+ - Add constants for pre-tokenization types.
+ - Add `n_threads` method to `Context`.
+ - Add `n_threads_batch` method to `Context`.
+ - Add `set_n_threads` method to `Context`.
+
  ## [[0.15.2](https://github.com/yoshoku/llama_cpp.rb/compare/v0.15.1...v0.15.2)] - 2024-05-18

  - Bump llama.cpp from b2839 to b2917.
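For reference, the `token_is_control?` predicate noted in the 0.15.4 entry above follows the same calling convention as the existing `token_is_eog?`: it takes a token id (Integer) and returns a boolean. A minimal usage sketch, assuming the gem is installed; the model path is a placeholder, not part of this release:

```ruby
require 'llama_cpp'

# '/path/to/model.gguf' is a placeholder; any local GGUF model works here.
model_params = LLaMACpp::ModelParams.new
model = LLaMACpp::Model.new(model_path: '/path/to/model.gguf', params: model_params)

bos = model.token_bos
# Control tokens such as BOS/EOS are expected to return true here.
puts model.token_is_control?(bos)
```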
@@ -1536,6 +1536,7 @@ public:
  rb_define_method(rb_cLLaMAModel, "token_suffix", RUBY_METHOD_FUNC(_llama_model_token_suffix), 0);
  rb_define_method(rb_cLLaMAModel, "token_eot", RUBY_METHOD_FUNC(_llama_model_token_eot), 0);
  rb_define_method(rb_cLLaMAModel, "token_is_eog?", RUBY_METHOD_FUNC(_llama_model_token_is_eog), 1);
+ rb_define_method(rb_cLLaMAModel, "token_is_control?", RUBY_METHOD_FUNC(_llama_model_token_is_control), 1);
  }

  private:
@@ -1848,6 +1849,16 @@ private:
  LLaMAModelWrapper* ptr = get_llama_model(self);
  return llama_token_is_eog(ptr->model, token) ? Qtrue : Qfalse;
  }
+
+ static VALUE _llama_model_token_is_control(VALUE self, VALUE token_) {
+ if (!RB_INTEGER_TYPE_P(token_)) {
+ rb_raise(rb_eArgError, "token must be an integer");
+ return Qnil;
+ }
+ const llama_token token = NUM2INT(token_);
+ LLaMAModelWrapper* ptr = get_llama_model(self);
+ return llama_token_is_control(ptr->model, token) ? Qtrue : Qfalse;
+ }
  };

  const rb_data_type_t RbLLaMAModel::llama_model_type = {
@@ -2122,10 +2133,13 @@ public:
  rb_define_method(rb_cLLaMAContext, "embeddings", RUBY_METHOD_FUNC(_llama_context_embeddings), 0);
  rb_define_method(rb_cLLaMAContext, "embeddings_ith", RUBY_METHOD_FUNC(_llama_context_embeddings_ith), 1);
  rb_define_method(rb_cLLaMAContext, "embeddings_seq", RUBY_METHOD_FUNC(_llama_context_embeddings_seq), 1);
+ rb_define_method(rb_cLLaMAContext, "set_n_threads", RUBY_METHOD_FUNC(_llama_context_set_n_threads), -1);
  rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
  rb_define_method(rb_cLLaMAContext, "n_batch", RUBY_METHOD_FUNC(_llama_context_n_batch), 0);
  rb_define_method(rb_cLLaMAContext, "n_ubatch", RUBY_METHOD_FUNC(_llama_context_n_ubatch), 0);
  rb_define_method(rb_cLLaMAContext, "n_seq_max", RUBY_METHOD_FUNC(_llama_context_n_seq_max), 0);
+ rb_define_method(rb_cLLaMAContext, "n_threads", RUBY_METHOD_FUNC(_llama_context_n_threads), 0);
+ rb_define_method(rb_cLLaMAContext, "n_threads_batch", RUBY_METHOD_FUNC(_llama_context_n_threads_batch), 0);
  rb_define_method(rb_cLLaMAContext, "timings", RUBY_METHOD_FUNC(_llama_context_get_timings), 0);
  rb_define_method(rb_cLLaMAContext, "print_timings", RUBY_METHOD_FUNC(_llama_context_print_timings), 0);
  rb_define_method(rb_cLLaMAContext, "reset_timings", RUBY_METHOD_FUNC(_llama_context_reset_timings), 0);
@@ -2343,6 +2357,33 @@ private:
  return output;
  }

+ static VALUE _llama_context_set_n_threads(int argc, VALUE* argv, VALUE self) {
+ VALUE kw_args = Qnil;
+ ID kw_table[2] = { rb_intern("n_threads"), rb_intern("n_threads_batch") };
+ VALUE kw_values[2] = { Qundef, Qundef };
+ rb_scan_args(argc, argv, ":", &kw_args);
+ rb_get_kwargs(kw_args, kw_table, 2, 0, kw_values);
+
+ VALUE n_threads = kw_values[0];
+ if (!RB_INTEGER_TYPE_P(n_threads)) {
+ rb_raise(rb_eArgError, "n_threads must be an integer");
+ return Qnil;
+ }
+ VALUE n_threads_batch = kw_values[1];
+ if (!RB_INTEGER_TYPE_P(n_threads_batch)) {
+ rb_raise(rb_eArgError, "n_threads_batch must be an integer");
+ return Qnil;
+ }
+
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eArgError, "LLaMA context is not initialized");
+ return Qnil;
+ }
+ llama_set_n_threads(ptr->ctx, NUM2UINT(n_threads), NUM2UINT(n_threads_batch));
+ return Qnil;
+ }
+
  static VALUE _llama_context_n_ctx(VALUE self) {
  LLaMAContextWrapper* ptr = get_llama_context(self);
  if (ptr->ctx == NULL) {
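Since `rb_get_kwargs` above is called with two required keywords, both `n_threads:` and `n_threads_batch:` must be supplied from Ruby. A usage sketch, assuming a model and context built as in the gem's README; the model path and thread counts are illustrative only:

```ruby
require 'llama_cpp'

# Placeholder model path; any local GGUF file works for this sketch.
model = LLaMACpp::Model.new(model_path: '/path/to/model.gguf', params: LLaMACpp::ModelParams.new)
context = LLaMACpp::Context.new(model: model, params: LLaMACpp::ContextParams.new)

# Both keyword arguments are required by the binding; the counts here are illustrative.
context.set_n_threads(n_threads: 4, n_threads_batch: 8)

context.n_threads       # => 4
context.n_threads_batch # => 8
```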
@@ -2379,6 +2420,24 @@ private:
  return UINT2NUM(llama_n_seq_max(ptr->ctx));
  }

+ static VALUE _llama_context_n_threads(VALUE self) {
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+ return Qnil;
+ }
+ return UINT2NUM(llama_n_threads(ptr->ctx));
+ }
+
+ static VALUE _llama_context_n_threads_batch(VALUE self) {
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+ return Qnil;
+ }
+ return UINT2NUM(llama_n_threads_batch(ptr->ctx));
+ }
+
  static VALUE _llama_context_get_timings(VALUE self) {
  LLaMAContextWrapper* ptr = get_llama_context(self);
  if (ptr->ctx == NULL) {
@@ -3430,9 +3489,11 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_GPT2", INT2NUM(LLAMA_VOCAB_PRE_TYPE_GPT2));
  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_REFACT", INT2NUM(LLAMA_VOCAB_PRE_TYPE_REFACT));
  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_COMMAND_R", INT2NUM(LLAMA_VOCAB_PRE_TYPE_COMMAND_R));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_STABLELM2", INT2NUM(LLAMA_VOCAB_PRE_TYPE_STABLELM2));
  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_QWEN2", INT2NUM(LLAMA_VOCAB_PRE_TYPE_QWEN2));
  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_OLMO", INT2NUM(LLAMA_VOCAB_PRE_TYPE_OLMO));
  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_DBRX", INT2NUM(LLAMA_VOCAB_PRE_TYPE_DBRX));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_SMAUG", INT2NUM(LLAMA_VOCAB_PRE_TYPE_SMAUG));

  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_UNDEFINED", INT2NUM(LLAMA_TOKEN_TYPE_UNDEFINED));
  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_NORMAL", INT2NUM(LLAMA_TOKEN_TYPE_NORMAL));
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.15.2'
+ VERSION = '0.15.4'

  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'b2917'
+ LLAMA_CPP_VERSION = 'b3056'
  end
data/sig/llama_cpp.rbs CHANGED
@@ -26,9 +26,11 @@ module LLaMACpp
  LLAMA_VOCAB_PRE_TYPE_GPT2: Integer
  LLAMA_VOCAB_PRE_TYPE_REFACT: Integer
  LLAMA_VOCAB_PRE_TYPE_COMMAND_R: Integer
+ LLAMA_VOCAB_PRE_TYPE_STABLELM2: Integer
  LLAMA_VOCAB_PRE_TYPE_QWEN2: Integer
  LLAMA_VOCAB_PRE_TYPE_OLMO: Integer
  LLAMA_VOCAB_PRE_TYPE_DBRX: Integer
+ LLAMA_VOCAB_PRE_TYPE_SMAUG: Integer

  LLAMA_FTYPE_ALL_F32: Integer
  LLAMA_FTYPE_MOSTLY_F16: Integer
@@ -158,6 +160,7 @@ module LLaMACpp
  def token_suffix: () -> Integer
  def token_eot: () -> Integer
  def token_is_eog?: (Integer) -> bool
+ def token_is_control?: (Integer) -> bool
  end

  class Timings
@@ -241,10 +244,13 @@ module LLaMACpp
  def embeddings_seq: (Integer) -> Array[Float]
  def decode: (::LLaMACpp::Batch) -> void
  def logits: () -> Array[Float]
+ def set_n_threads: (n_threads: Integer, n_threads_batch: Integer) -> void
  def n_ctx: () -> Integer
  def n_batch: () -> Integer
  def n_ubatch: () -> Integer
  def n_seq_max: () -> Integer
+ def n_threads: () -> Integer
+ def n_threads_batch: () -> Integer
  def timings: () -> ::LLaMACpp::Timings
  def print_timings: () -> void
  def reset_timings: () -> void
@@ -381,15 +381,16 @@ ifneq ($(filter ppc64le%,$(UNAME_M)),)
  CUDA_POWER_ARCH = 1
  endif

+ ifneq ($(filter loongarch64%,$(UNAME_M)),)
+ MK_CFLAGS += -mlasx
+ MK_CXXFLAGS += -mlasx
+ endif
+
  else
  MK_CFLAGS += -march=rv64gcv -mabi=lp64d
  MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
  endif

- ifdef LLAMA_QKK_64
- MK_CPPFLAGS += -DGGML_QKK_64
- endif
-
  ifndef LLAMA_NO_ACCELERATE
  # Mac OS - include Accelerate framework.
  # `-framework Accelerate` works both with Apple Silicon and Mac Intel
@@ -401,13 +402,6 @@ ifndef LLAMA_NO_ACCELERATE
  endif
  endif # LLAMA_NO_ACCELERATE

- ifdef LLAMA_MPI
- MK_CPPFLAGS += -DGGML_USE_MPI
- MK_CFLAGS += -Wno-cast-qual
- MK_CXXFLAGS += -Wno-cast-qual
- OBJS += ggml-mpi.o
- endif # LLAMA_MPI
-
  ifdef LLAMA_OPENBLAS
  MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas)
  MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas)
@@ -449,6 +443,9 @@ endif # JETSON_EOL_MODULE_DETECT
  ifdef LLAMA_DEBUG
  MK_NVCCFLAGS += -lineinfo
  endif # LLAMA_DEBUG
+ ifdef LLAMA_CUDA_DEBUG
+ MK_NVCCFLAGS += --device-debug
+ endif # LLAMA_CUDA_DEBUG
  ifdef LLAMA_CUDA_NVCC
  NVCC = $(CCACHE) $(LLAMA_CUDA_NVCC)
  else
@@ -631,11 +628,6 @@ ggml-metal-embed.o: ggml-metal.metal ggml-common.h
  endif
  endif # LLAMA_METAL

- ifdef LLAMA_MPI
- ggml-mpi.o: ggml-mpi.c ggml-mpi.h
- $(CC) $(CFLAGS) -c $< -o $@
- endif # LLAMA_MPI
-
  ifndef LLAMA_NO_LLAMAFILE
  sgemm.o: sgemm.cpp sgemm.h ggml.h
  $(CXX) $(CXXFLAGS) -c $< -o $@
@@ -65,13 +65,8 @@ typedef sycl::half2 ggml_half2;
  // QK = number of values after dequantization
  // QK_K = super-block size

- #ifdef GGML_QKK_64
- #define QK_K 64
- #define K_SCALE_SIZE 4
- #else
  #define QK_K 256
  #define K_SCALE_SIZE 12
- #endif // GGML_QKK_64

  #if defined(GGML_COMMON_DECL_CUDA) || defined(GGML_COMMON_DECL_HIP) || defined(GGML_COMMON_DECL_SYCL)
  // QR = QK / number of values before dequantization
@@ -131,13 +126,8 @@ typedef sycl::half2 ggml_half2;
  #define QI4_NL (QK4_NL / (4*QR4_NL))
  #define QR4_NL 2

- #if QK_K == 64
- #define QI4_XS QI4_NL
- #define QR4_XS QR4_NL
- #else
  #define QI4_XS (QK_K / (4*QR4_XS))
  #define QR4_XS 8
- #endif

  #endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP

@@ -228,15 +218,6 @@ static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_half) + QK_K/16 + QK_K/4, "wro
  // weight is represented as x = a * q
  // 16 blocks of 16 elements each
  // Effectively 3.4375 bits per weight
- #ifdef GGML_QKK_64
- typedef struct {
- uint8_t hmask[QK_K/8]; // quants - high bit
- uint8_t qs[QK_K/4]; // quants - low 2 bits
- uint8_t scales[2];
- ggml_half d; // super-block scale
- } block_q3_K;
- static_assert(sizeof(block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding");
- #else
  typedef struct {
  uint8_t hmask[QK_K/8]; // quants - high bit
  uint8_t qs[QK_K/4]; // quants - low 2 bits
@@ -244,20 +225,11 @@ typedef struct {
  ggml_half d; // super-block scale
  } block_q3_K;
  static_assert(sizeof(block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding");
- #endif

  // 4-bit quantization
  // 8 blocks of 32 elements each
  // weight is represented as x = a * q + b
  // Effectively 4.5 bits per weight
- #ifdef GGML_QKK_64
- typedef struct {
- ggml_half d[2]; // super-block scales/mins
- uint8_t scales[2]; // 4-bit block scales/mins
- uint8_t qs[QK_K/2]; // 4--bit quants
- } block_q4_K;
- static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + QK_K/2 + 2, "wrong q4_K block size/padding");
- #else
  typedef struct {
  union {
  struct {
@@ -270,21 +242,11 @@ typedef struct {
  uint8_t qs[QK_K/2]; // 4--bit quants
  } block_q4_K;
  static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");
- #endif

  // 5-bit quantization
  // 8 blocks of 32 elements each
  // weight is represented as x = a * q + b
  // Effectively 5.5 bits per weight
- #ifdef GGML_QKK_64
- typedef struct {
- ggml_half d; // super-block scale
- int8_t scales[QK_K/16]; // 8-bit block scales
- uint8_t qh[QK_K/8]; // quants, high bit
- uint8_t qs[QK_K/2]; // quants, low 4 bits
- } block_q5_K;
- static_assert(sizeof(block_q5_K) == sizeof(ggml_half) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
- #else
  typedef struct {
  union {
  struct {
@@ -298,7 +260,6 @@ typedef struct {
  uint8_t qs[QK_K/2]; // quants, low 4 bits
  } block_q5_K;
  static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
- #endif

  // 6-bit quantization
  // weight is represented as x = a * q
@@ -356,11 +317,7 @@ typedef struct {
  static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_half) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");

  // 3.4375 bpw
- #if QK_K == 64
- #define IQ3S_N_SCALE 2
- #else
  #define IQ3S_N_SCALE QK_K/64
- #endif
  typedef struct {
  ggml_half d;
  uint8_t qs[QK_K/4];
@@ -381,16 +338,9 @@ static_assert(sizeof(block_iq1_s) == sizeof(ggml_half) + QK_K/8 + QK_K/16, "wron
  typedef struct {
  uint8_t qs[QK_K/8]; // grid index, low 8 bits
  uint8_t qh[QK_K/16]; // grid index, high 3 bits + grid shift bit (for two groups of 8)
- #if QK_K == 64
- ggml_half d;
- #endif
  uint8_t scales[QK_K/32]; // 3-bit block scales (4-bit if QK_K == 64)
  } block_iq1_m;
- #if QK_K == 64
- static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32 + sizeof(ggml_half), "wrong iq1_m block size/padding");
- #else
  static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32, "wrong iq1_m block size/padding");
- #endif

  // Used by IQ1_M quants
  typedef union {
@@ -406,9 +356,6 @@ typedef struct {
  } block_iq4_nl;
  static_assert(sizeof(block_iq4_nl) == sizeof(ggml_half) + QK4_NL/2, "wrong iq4_nl block size/padding");

- #if QK_K == 64
- #define block_iq4_xs block_iq4_nl
- #else
  typedef struct {
  ggml_half d;
  uint16_t scales_h;
@@ -416,7 +363,6 @@ typedef struct {
  uint8_t qs[QK_K/2];
  } block_iq4_xs;
  static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
- #endif

  #endif // GGML_COMMON_DECL
  #endif // GGML_COMMON_DECL