llama_cpp 0.15.1 → 0.15.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ce6d72aeb5fb9aff775d44284bf934e164f8470973619507ef6e6eb1ac0bec4d
4
- data.tar.gz: 7c1ae823c90f957219b3edbc20f091b65a50caa984c1a6f4d137a46c376b2f0c
3
+ metadata.gz: d0a9cdf86695522e27b1e8d3ed485dfa6ab3a4fc23d9bd9e44bf8c3cb483c347
4
+ data.tar.gz: 5d97cec87f9b1df94f85f9e18dc46a1b8a4ec593c17d04e4bee0da3d28c34211
5
5
  SHA512:
6
- metadata.gz: d23cb6a63b7734df2547c5e61a699fa206878c747e274e004c829b77335a7cc7434e92168a55d8ab0a617b11eddb5d45d5057a91b92e848735fd9e852b2476cd
7
- data.tar.gz: f54b09de3cc60de81be977e9706a9beb3bf28e7740a19a57f6add543fe10cd6dc4101cbbe22dd5b62870c78a1ad4d10f57dd29b7c3e3e12b950e6575cf67b0c7
6
+ metadata.gz: 71f26009b872db64d0d0d416153b5fbd6afb598617b701cb6342d099542c962f410bccddf80b77928bfd8ab8f017a749fbc1d2ed488139d806ef0e3cf75a0e42
7
+ data.tar.gz: 808c03f6664af65cadfea23071d0b55d459c119189346762ea9632156f7f35b8d1f0e594b356726fc26abdb1c81a3bce9d697b9ca2d6324c454a31f2a442f0d7
data/CHANGELOG.md CHANGED
@@ -1,3 +1,17 @@
1
+ ## [[0.15.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.15.2...v0.15.3)] - 2024-05-25
2
+
3
+ - Bump llama.cpp from b2917 to b2988.
4
+ - Add constants for pre-tokenization types.
5
+ - Add `n_threads` method to `Context`.
6
+ - Add `n_threads_batch` method to `Context`.
7
+ - Add `set_n_threads` method to `Context`.
8
+
9
+ ## [[0.15.2](https://github.com/yoshoku/llama_cpp.rb/compare/v0.15.1...v0.15.2)] - 2024-05-18
10
+
11
+ - Bump llama.cpp from b2839 to b2917.
12
+
13
+ Implementation of the binding for rpc_servers in llama_model_params has been skipped.
14
+
1
15
  ## [[0.15.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.15.0...v0.15.1)] - 2024-05-11
2
16
 
3
17
  - Bump llama.cpp from b2781 to b2839.
@@ -2122,10 +2122,13 @@ public:
2122
2122
  rb_define_method(rb_cLLaMAContext, "embeddings", RUBY_METHOD_FUNC(_llama_context_embeddings), 0);
2123
2123
  rb_define_method(rb_cLLaMAContext, "embeddings_ith", RUBY_METHOD_FUNC(_llama_context_embeddings_ith), 1);
2124
2124
  rb_define_method(rb_cLLaMAContext, "embeddings_seq", RUBY_METHOD_FUNC(_llama_context_embeddings_seq), 1);
2125
+ rb_define_method(rb_cLLaMAContext, "set_n_threads", RUBY_METHOD_FUNC(_llama_context_set_n_threads), -1);
2125
2126
  rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
2126
2127
  rb_define_method(rb_cLLaMAContext, "n_batch", RUBY_METHOD_FUNC(_llama_context_n_batch), 0);
2127
2128
  rb_define_method(rb_cLLaMAContext, "n_ubatch", RUBY_METHOD_FUNC(_llama_context_n_ubatch), 0);
2128
2129
  rb_define_method(rb_cLLaMAContext, "n_seq_max", RUBY_METHOD_FUNC(_llama_context_n_seq_max), 0);
2130
+ rb_define_method(rb_cLLaMAContext, "n_threads", RUBY_METHOD_FUNC(_llama_context_n_threads), 0);
2131
+ rb_define_method(rb_cLLaMAContext, "n_threads_batch", RUBY_METHOD_FUNC(_llama_context_n_threads_batch), 0);
2129
2132
  rb_define_method(rb_cLLaMAContext, "timings", RUBY_METHOD_FUNC(_llama_context_get_timings), 0);
2130
2133
  rb_define_method(rb_cLLaMAContext, "print_timings", RUBY_METHOD_FUNC(_llama_context_print_timings), 0);
2131
2134
  rb_define_method(rb_cLLaMAContext, "reset_timings", RUBY_METHOD_FUNC(_llama_context_reset_timings), 0);
@@ -2343,6 +2346,33 @@ private:
2343
2346
  return output;
2344
2347
  }
2345
2348
 
2349
+ static VALUE _llama_context_set_n_threads(int argc, VALUE* argv, VALUE self) {
2350
+ VALUE kw_args = Qnil;
2351
+ ID kw_table[2] = { rb_intern("n_threads"), rb_intern("n_threads_batch") };
2352
+ VALUE kw_values[2] = { Qundef, Qundef };
2353
+ rb_scan_args(argc, argv, ":", &kw_args);
2354
+ rb_get_kwargs(kw_args, kw_table, 2, 0, kw_values);
2355
+
2356
+ VALUE n_threads = kw_values[0];
2357
+ if (!RB_INTEGER_TYPE_P(n_threads)) {
2358
+ rb_raise(rb_eArgError, "n_threads must be an integer");
2359
+ return Qnil;
2360
+ }
2361
+ VALUE n_threads_batch = kw_values[1];
2362
+ if (!RB_INTEGER_TYPE_P(n_threads_batch)) {
2363
+ rb_raise(rb_eArgError, "n_threads_batch must be an integer");
2364
+ return Qnil;
2365
+ }
2366
+
2367
+ LLaMAContextWrapper* ptr = get_llama_context(self);
2368
+ if (ptr->ctx == NULL) {
2369
+ rb_raise(rb_eArgError, "LLaMA context is not initialized");
2370
+ return Qnil;
2371
+ }
2372
+ llama_set_n_threads(ptr->ctx, NUM2UINT(n_threads), NUM2UINT(n_threads_batch));
2373
+ return Qnil;
2374
+ }
2375
+
2346
2376
  static VALUE _llama_context_n_ctx(VALUE self) {
2347
2377
  LLaMAContextWrapper* ptr = get_llama_context(self);
2348
2378
  if (ptr->ctx == NULL) {
@@ -2379,6 +2409,24 @@ private:
2379
2409
  return UINT2NUM(llama_n_seq_max(ptr->ctx));
2380
2410
  }
2381
2411
 
2412
+ static VALUE _llama_context_n_threads(VALUE self) {
2413
+ LLaMAContextWrapper* ptr = get_llama_context(self);
2414
+ if (ptr->ctx == NULL) {
2415
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
2416
+ return Qnil;
2417
+ }
2418
+ return UINT2NUM(llama_n_threads(ptr->ctx));
2419
+ }
2420
+
2421
+ static VALUE _llama_context_n_threads_batch(VALUE self) {
2422
+ LLaMAContextWrapper* ptr = get_llama_context(self);
2423
+ if (ptr->ctx == NULL) {
2424
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
2425
+ return Qnil;
2426
+ }
2427
+ return UINT2NUM(llama_n_threads_batch(ptr->ctx));
2428
+ }
2429
+
2382
2430
  static VALUE _llama_context_get_timings(VALUE self) {
2383
2431
  LLaMAContextWrapper* ptr = get_llama_context(self);
2384
2432
  if (ptr->ctx == NULL) {
@@ -3430,6 +3478,7 @@ extern "C" void Init_llama_cpp(void) {
3430
3478
  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_GPT2", INT2NUM(LLAMA_VOCAB_PRE_TYPE_GPT2));
3431
3479
  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_REFACT", INT2NUM(LLAMA_VOCAB_PRE_TYPE_REFACT));
3432
3480
  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_COMMAND_R", INT2NUM(LLAMA_VOCAB_PRE_TYPE_COMMAND_R));
3481
+ rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_STABLELM2", INT2NUM(LLAMA_VOCAB_PRE_TYPE_STABLELM2));
3433
3482
  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_QWEN2", INT2NUM(LLAMA_VOCAB_PRE_TYPE_QWEN2));
3434
3483
  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_OLMO", INT2NUM(LLAMA_VOCAB_PRE_TYPE_OLMO));
3435
3484
  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_DBRX", INT2NUM(LLAMA_VOCAB_PRE_TYPE_DBRX));
@@ -3,8 +3,8 @@
3
3
  # llama_cpp.rb provides Ruby bindings for llama.cpp.
4
4
  module LLaMACpp
5
5
  # The version of llama_cpp.rb you install.
6
- VERSION = '0.15.1'
6
+ VERSION = '0.15.3'
7
7
 
8
8
  # The version of llama.cpp bundled with llama_cpp.rb.
9
- LLAMA_CPP_VERSION = 'b2839'
9
+ LLAMA_CPP_VERSION = 'b2988'
10
10
  end
data/sig/llama_cpp.rbs CHANGED
@@ -26,6 +26,7 @@ module LLaMACpp
26
26
  LLAMA_VOCAB_PRE_TYPE_GPT2: Integer
27
27
  LLAMA_VOCAB_PRE_TYPE_REFACT: Integer
28
28
  LLAMA_VOCAB_PRE_TYPE_COMMAND_R: Integer
29
+ LLAMA_VOCAB_PRE_TYPE_STABLELM2: Integer
29
30
  LLAMA_VOCAB_PRE_TYPE_QWEN2: Integer
30
31
  LLAMA_VOCAB_PRE_TYPE_OLMO: Integer
31
32
  LLAMA_VOCAB_PRE_TYPE_DBRX: Integer
@@ -241,10 +242,13 @@ module LLaMACpp
241
242
  def embeddings_seq: (Integer) -> Array[Float]
242
243
  def decode: (::LLaMACpp::Batch) -> void
243
244
  def logits: () -> Array[Float]
245
+ def set_n_threads: (n_threads: Integer, n_threads_batch: Integer) -> void
244
246
  def n_ctx: () -> Integer
245
247
  def n_batch: () -> Integer
246
248
  def n_ubatch: () -> Integer
247
249
  def n_seq_max: () -> Integer
250
+ def n_threads: () -> Integer
251
+ def n_threads_batch: () -> Integer
248
252
  def timings: () -> ::LLaMACpp::Timings
249
253
  def print_timings: () -> void
250
254
  def reset_timings: () -> void
@@ -381,15 +381,16 @@ ifneq ($(filter ppc64le%,$(UNAME_M)),)
381
381
  CUDA_POWER_ARCH = 1
382
382
  endif
383
383
 
384
+ ifneq ($(filter loongarch64%,$(UNAME_M)),)
385
+ MK_CFLAGS += -mlasx
386
+ MK_CXXFLAGS += -mlasx
387
+ endif
388
+
384
389
  else
385
390
  MK_CFLAGS += -march=rv64gcv -mabi=lp64d
386
391
  MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
387
392
  endif
388
393
 
389
- ifdef LLAMA_QKK_64
390
- MK_CPPFLAGS += -DGGML_QKK_64
391
- endif
392
-
393
394
  ifndef LLAMA_NO_ACCELERATE
394
395
  # Mac OS - include Accelerate framework.
395
396
  # `-framework Accelerate` works both with Apple Silicon and Mac Intel
@@ -401,13 +402,6 @@ ifndef LLAMA_NO_ACCELERATE
401
402
  endif
402
403
  endif # LLAMA_NO_ACCELERATE
403
404
 
404
- ifdef LLAMA_MPI
405
- MK_CPPFLAGS += -DGGML_USE_MPI
406
- MK_CFLAGS += -Wno-cast-qual
407
- MK_CXXFLAGS += -Wno-cast-qual
408
- OBJS += ggml-mpi.o
409
- endif # LLAMA_MPI
410
-
411
405
  ifdef LLAMA_OPENBLAS
412
406
  MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas)
413
407
  MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas)
@@ -562,10 +556,10 @@ endif # LLAMA_VULKAN
562
556
  ifdef LLAMA_HIPBLAS
563
557
  ifeq ($(wildcard /opt/rocm),)
564
558
  ROCM_PATH ?= /usr
565
- GPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
559
+ AMDGPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
566
560
  else
567
561
  ROCM_PATH ?= /opt/rocm
568
- GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
562
+ AMDGPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
569
563
  endif
570
564
  HIPCC ?= $(CCACHE) $(ROCM_PATH)/bin/hipcc
571
565
  LLAMA_CUDA_DMMV_X ?= 32
@@ -577,7 +571,7 @@ ifdef LLAMA_HIP_UMA
577
571
  endif # LLAMA_HIP_UMA
578
572
  MK_LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
579
573
  MK_LDFLAGS += -lhipblas -lamdhip64 -lrocblas
580
- HIPFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS))
574
+ HIPFLAGS += $(addprefix --offload-arch=,$(AMDGPU_TARGETS))
581
575
  HIPFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
582
576
  HIPFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
583
577
  HIPFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
@@ -631,11 +625,6 @@ ggml-metal-embed.o: ggml-metal.metal ggml-common.h
631
625
  endif
632
626
  endif # LLAMA_METAL
633
627
 
634
- ifdef LLAMA_MPI
635
- ggml-mpi.o: ggml-mpi.c ggml-mpi.h
636
- $(CC) $(CFLAGS) -c $< -o $@
637
- endif # LLAMA_MPI
638
-
639
628
  ifndef LLAMA_NO_LLAMAFILE
640
629
  sgemm.o: sgemm.cpp sgemm.h ggml.h
641
630
  $(CXX) $(CXXFLAGS) -c $< -o $@
@@ -760,7 +749,7 @@ lib: llama.o ggml.o $(OBJS)
760
749
  ar rcs libllama.a $^
761
750
 
762
751
  clean:
763
- rm -vrf *.o tests/*.o *.so *.a *.dll *.dylib benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
752
+ rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
764
753
  rm -vrf ggml-cuda/*.o
765
754
 
766
755
  #
@@ -1182,9 +1182,9 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
1182
1182
  static char * fmt_size(size_t size) {
1183
1183
  static char buffer[128];
1184
1184
  if (size >= 1024*1024) {
1185
- sprintf(buffer, "%zuM", size/1024/1024);
1185
+ snprintf(buffer, sizeof(buffer), "%zuM", size/1024/1024);
1186
1186
  } else {
1187
- sprintf(buffer, "%zuK", size/1024);
1187
+ snprintf(buffer, sizeof(buffer), "%zuK", size/1024);
1188
1188
  }
1189
1189
  return buffer;
1190
1190
  }
@@ -1895,7 +1895,6 @@ void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * t
1895
1895
 
1896
1896
  tensor->buffer = buffer;
1897
1897
  tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
1898
- tensor->backend = tensor->view_src->backend;
1899
1898
  ggml_backend_buffer_init_tensor(buffer, tensor);
1900
1899
  }
1901
1900
 
@@ -65,13 +65,8 @@ typedef sycl::half2 ggml_half2;
65
65
  // QK = number of values after dequantization
66
66
  // QK_K = super-block size
67
67
 
68
- #ifdef GGML_QKK_64
69
- #define QK_K 64
70
- #define K_SCALE_SIZE 4
71
- #else
72
68
  #define QK_K 256
73
69
  #define K_SCALE_SIZE 12
74
- #endif // GGML_QKK_64
75
70
 
76
71
  #if defined(GGML_COMMON_DECL_CUDA) || defined(GGML_COMMON_DECL_HIP) || defined(GGML_COMMON_DECL_SYCL)
77
72
  // QR = QK / number of values before dequantization
@@ -131,13 +126,8 @@ typedef sycl::half2 ggml_half2;
131
126
  #define QI4_NL (QK4_NL / (4*QR4_NL))
132
127
  #define QR4_NL 2
133
128
 
134
- #if QK_K == 64
135
- #define QI4_XS QI4_NL
136
- #define QR4_XS QR4_NL
137
- #else
138
129
  #define QI4_XS (QK_K / (4*QR4_XS))
139
130
  #define QR4_XS 8
140
- #endif
141
131
 
142
132
  #endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP
143
133
 
@@ -228,15 +218,6 @@ static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_half) + QK_K/16 + QK_K/4, "wro
228
218
  // weight is represented as x = a * q
229
219
  // 16 blocks of 16 elements each
230
220
  // Effectively 3.4375 bits per weight
231
- #ifdef GGML_QKK_64
232
- typedef struct {
233
- uint8_t hmask[QK_K/8]; // quants - high bit
234
- uint8_t qs[QK_K/4]; // quants - low 2 bits
235
- uint8_t scales[2];
236
- ggml_half d; // super-block scale
237
- } block_q3_K;
238
- static_assert(sizeof(block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding");
239
- #else
240
221
  typedef struct {
241
222
  uint8_t hmask[QK_K/8]; // quants - high bit
242
223
  uint8_t qs[QK_K/4]; // quants - low 2 bits
@@ -244,20 +225,11 @@ typedef struct {
244
225
  ggml_half d; // super-block scale
245
226
  } block_q3_K;
246
227
  static_assert(sizeof(block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding");
247
- #endif
248
228
 
249
229
  // 4-bit quantization
250
230
  // 8 blocks of 32 elements each
251
231
  // weight is represented as x = a * q + b
252
232
  // Effectively 4.5 bits per weight
253
- #ifdef GGML_QKK_64
254
- typedef struct {
255
- ggml_half d[2]; // super-block scales/mins
256
- uint8_t scales[2]; // 4-bit block scales/mins
257
- uint8_t qs[QK_K/2]; // 4--bit quants
258
- } block_q4_K;
259
- static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + QK_K/2 + 2, "wrong q4_K block size/padding");
260
- #else
261
233
  typedef struct {
262
234
  union {
263
235
  struct {
@@ -270,21 +242,11 @@ typedef struct {
270
242
  uint8_t qs[QK_K/2]; // 4--bit quants
271
243
  } block_q4_K;
272
244
  static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");
273
- #endif
274
245
 
275
246
  // 5-bit quantization
276
247
  // 8 blocks of 32 elements each
277
248
  // weight is represented as x = a * q + b
278
249
  // Effectively 5.5 bits per weight
279
- #ifdef GGML_QKK_64
280
- typedef struct {
281
- ggml_half d; // super-block scale
282
- int8_t scales[QK_K/16]; // 8-bit block scales
283
- uint8_t qh[QK_K/8]; // quants, high bit
284
- uint8_t qs[QK_K/2]; // quants, low 4 bits
285
- } block_q5_K;
286
- static_assert(sizeof(block_q5_K) == sizeof(ggml_half) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
287
- #else
288
250
  typedef struct {
289
251
  union {
290
252
  struct {
@@ -298,7 +260,6 @@ typedef struct {
298
260
  uint8_t qs[QK_K/2]; // quants, low 4 bits
299
261
  } block_q5_K;
300
262
  static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
301
- #endif
302
263
 
303
264
  // 6-bit quantization
304
265
  // weight is represented as x = a * q
@@ -356,11 +317,7 @@ typedef struct {
356
317
  static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_half) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
357
318
 
358
319
  // 3.4375 bpw
359
- #if QK_K == 64
360
- #define IQ3S_N_SCALE 2
361
- #else
362
320
  #define IQ3S_N_SCALE QK_K/64
363
- #endif
364
321
  typedef struct {
365
322
  ggml_half d;
366
323
  uint8_t qs[QK_K/4];
@@ -381,16 +338,9 @@ static_assert(sizeof(block_iq1_s) == sizeof(ggml_half) + QK_K/8 + QK_K/16, "wron
381
338
  typedef struct {
382
339
  uint8_t qs[QK_K/8]; // grid index, low 8 bits
383
340
  uint8_t qh[QK_K/16]; // grid index, high 3 bits + grid shift bit (for two groups of 8)
384
- #if QK_K == 64
385
- ggml_half d;
386
- #endif
387
341
  uint8_t scales[QK_K/32]; // 3-bit block scales (4-bit if QK_K == 64)
388
342
  } block_iq1_m;
389
- #if QK_K == 64
390
- static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32 + sizeof(ggml_half), "wrong iq1_m block size/padding");
391
- #else
392
343
  static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32, "wrong iq1_m block size/padding");
393
- #endif
394
344
 
395
345
  // Used by IQ1_M quants
396
346
  typedef union {
@@ -406,9 +356,6 @@ typedef struct {
406
356
  } block_iq4_nl;
407
357
  static_assert(sizeof(block_iq4_nl) == sizeof(ggml_half) + QK4_NL/2, "wrong iq4_nl block size/padding");
408
358
 
409
- #if QK_K == 64
410
- #define block_iq4_xs block_iq4_nl
411
- #else
412
359
  typedef struct {
413
360
  ggml_half d;
414
361
  uint16_t scales_h;
@@ -416,7 +363,6 @@ typedef struct {
416
363
  uint8_t qs[QK_K/2];
417
364
  } block_iq4_xs;
418
365
  static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
419
- #endif
420
366
 
421
367
  #endif // GGML_COMMON_DECL
422
368
  #endif // GGML_COMMON_DECL