@fugood/llama.node 1.3.8 → 1.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. package/lib/binding.js +25 -18
  2. package/lib/binding.ts +19 -1
  3. package/lib/index.js +3 -3
  4. package/lib/index.ts +1 -1
  5. package/package.json +17 -17
  6. package/scripts/llama.cpp.patch +53 -4
  7. package/src/LlamaCompletionWorker.cpp +2 -2
  8. package/src/LlamaContext.cpp +6 -1
  9. package/src/llama.cpp/common/arg.cpp +1 -1
  10. package/src/llama.cpp/common/chat-parser.cpp +968 -0
  11. package/src/llama.cpp/common/chat.cpp +0 -952
  12. package/src/llama.cpp/common/json-schema-to-grammar.cpp +2 -2
  13. package/src/llama.cpp/ggml/CMakeLists.txt +1 -0
  14. package/src/llama.cpp/ggml/include/ggml-rpc.h +1 -1
  15. package/src/llama.cpp/ggml/src/CMakeLists.txt +11 -4
  16. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +336 -3
  17. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp +11 -8
  18. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +22 -0
  19. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +2 -1
  20. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +234 -1
  21. package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +6 -0
  22. package/src/llama.cpp/src/CMakeLists.txt +1 -0
  23. package/src/llama.cpp/src/llama-arch.cpp +48 -3
  24. package/src/llama.cpp/src/llama-arch.h +2 -0
  25. package/src/llama.cpp/src/llama-context.cpp +6 -2
  26. package/src/llama.cpp/src/llama-hparams.h +1 -1
  27. package/src/llama.cpp/src/llama-model.cpp +102 -5
  28. package/src/llama.cpp/src/llama-model.h +4 -0
  29. package/src/llama.cpp/src/llama-quant.cpp +13 -5
  30. package/src/llama.cpp/src/models/lfm2.cpp +5 -3
  31. package/src/llama.cpp/src/models/models.h +51 -1
  32. package/src/llama.cpp/src/models/qwen3next.cpp +1042 -0
package/src/llama.cpp/common/json-schema-to-grammar.cpp
@@ -268,10 +268,10 @@ static bool is_reserved_name(const std::string & name) {
  }

  std::regex INVALID_RULE_CHARS_RE("[^a-zA-Z0-9-]+");
- std::regex GRAMMAR_LITERAL_ESCAPE_RE("[\r\n\"]");
+ std::regex GRAMMAR_LITERAL_ESCAPE_RE("[\r\n\"\\\\]");
  std::regex GRAMMAR_RANGE_LITERAL_ESCAPE_RE("[\r\n\"\\]\\-\\\\]");
  std::unordered_map<char, std::string> GRAMMAR_LITERAL_ESCAPES = {
- {'\r', "\\r"}, {'\n', "\\n"}, {'"', "\\\""}, {'-', "\\-"}, {']', "\\]"}
+ {'\r', "\\r"}, {'\n', "\\n"}, {'"', "\\\""}, {'-', "\\-"}, {']', "\\]"}, {'\\', "\\\\"}
  };

  std::unordered_set<char> NON_LITERAL_SET = {'|', '.', '(', ')', '[', ']', '{', '}', '*', '+', '?'};
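
Note: the json-schema-to-grammar.cpp change above adds the backslash to both the literal-escape regex and the escape map, so schema string constants containing a backslash are emitted as valid GBNF instead of leaking a bare backslash into the grammar. A minimal standalone sketch of the effect (illustrative helper, not the library's own escaping code):

    #include <iostream>
    #include <string>
    #include <unordered_map>

    // Same escape set as the patched GRAMMAR_LITERAL_ESCAPES above,
    // reduced to the characters relevant here.
    static std::string escape_gbnf_literal(const std::string & in) {
        static const std::unordered_map<char, std::string> escapes = {
            {'\r', "\\r"}, {'\n', "\\n"}, {'"', "\\\""}, {'\\', "\\\\"},
        };
        std::string out;
        for (char c : in) {
            auto it = escapes.find(c);
            out += (it != escapes.end()) ? it->second : std::string(1, c);
        }
        return out;
    }

    int main() {
        // A constant like C:\temp now becomes "C:\\temp" in the generated rule.
        std::cout << '"' << escape_gbnf_literal("C:\\temp") << '"' << '\n';
    }
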
package/src/llama.cpp/ggml/CMakeLists.txt
@@ -183,6 +183,7 @@ endif()
  # ggml core
  set(GGML_SCHED_MAX_COPIES "4" CACHE STRING "ggml: max input copies for pipeline parallelism")
  option(GGML_CPU "ggml: enable CPU backend" ON)
+ option(GGML_SCHED_NO_REALLOC "ggml: disallow reallocations in ggml-alloc (for debugging)" OFF)

  # 3rd party libs / backends
  option(GGML_ACCELERATE "ggml: enable Accelerate framework" ON)
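
Note: GGML_SCHED_NO_REALLOC is a debugging switch; a later hunk in ggml/src/CMakeLists.txt forwards it to the compiler as a preprocessor definition on ggml-base. A hypothetical sketch of how such a guard is typically consumed (not the actual ggml-alloc code), assuming the build is configured with -DGGML_SCHED_NO_REALLOC=ON:

    #include <cstddef>
    #include <cstdio>
    #include <cstdlib>

    // Illustrative only: with the definition present, a scheduler buffer that
    // would normally grow silently aborts instead, which makes unexpected
    // reallocations easy to catch while debugging.
    static void sched_buffer_grow(size_t cur_size, size_t needed_size) {
    #ifdef GGML_SCHED_NO_REALLOC
        std::fprintf(stderr, "sched: realloc %zu -> %zu requested but disallowed\n", cur_size, needed_size);
        std::abort();
    #else
        std::fprintf(stderr, "sched: reallocating %zu -> %zu\n", cur_size, needed_size);
        // ... grow the buffer as usual ...
    #endif
    }
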
package/src/llama.cpp/ggml/include/ggml-rpc.h
@@ -8,7 +8,7 @@ extern "C" {
  #endif

  #define RPC_PROTO_MAJOR_VERSION 3
- #define RPC_PROTO_MINOR_VERSION 0
+ #define RPC_PROTO_MINOR_VERSION 5
  #define RPC_PROTO_PATCH_VERSION 0
  #define GGML_RPC_MAX_SERVERS 16

package/src/llama.cpp/ggml/src/CMakeLists.txt
@@ -221,6 +221,10 @@ if (GGML_BACKEND_DL)
  target_compile_definitions(ggml-base PUBLIC GGML_BACKEND_DL)
  endif()

+ if (GGML_SCHED_NO_REALLOC)
+ target_compile_definitions(ggml-base PUBLIC GGML_SCHED_NO_REALLOC)
+ endif()
+
  add_library(ggml
  ggml-backend-reg.cpp)
  add_library(ggml::ggml ALIAS ggml)
@@ -270,10 +274,13 @@ function(ggml_add_backend_library backend)
  endif()

  # Set versioning properties for all backend libraries
- set_target_properties(${backend} PROPERTIES
- VERSION ${GGML_VERSION}
- SOVERSION ${GGML_VERSION_MAJOR}
- )
+ # Building a MODULE library with a version is not supported on macOS (https://gitlab.kitware.com/cmake/cmake/-/issues/20782)
+ if (NOT (APPLE AND GGML_BACKEND_DL))
+ set_target_properties(${backend} PROPERTIES
+ VERSION ${GGML_VERSION}
+ SOVERSION ${GGML_VERSION_MAJOR}
+ )
+ endif()

  if(NOT GGML_AVAILABLE_BACKENDS)
  set(GGML_AVAILABLE_BACKENDS "${backend}"
package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp
@@ -497,6 +497,140 @@ void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
  ggml_gemv_iq4_nl_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc);
  }

+ void ggml_gemv_q4_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+ constexpr int qk = QK_K;
+ const int nb = n / qk;
+
+ constexpr int ncols_interleaved = 8;
+ constexpr int blocklen = 8;
+
+ assert(n % qk == 0);
+ assert(nr % 4 == 0);
+ assert(nc % ncols_interleaved == 0);
+
+ UNUSED(nb);
+ UNUSED(ncols_interleaved);
+ UNUSED(blocklen);
+
+ #if defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
+ constexpr int col_groups = ncols_interleaved / 4; // 0123 and 4567
+ const uint8x16_t m4b = vdupq_n_u8(0x0f);
+
+ // 1x8 tile = 2 x 4
+ float32x4_t acc_f32[col_groups];
+
+ const block_q8_K * GGML_RESTRICT q8_ptr = (const block_q8_K *) vy;
+
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
+ const block_q4_Kx8 * GGML_RESTRICT q4_ptr = (const block_q4_Kx8 *) vx + (x * nb);
+
+ for (int i = 0; i < col_groups; i++) {
+ acc_f32[i] = vdupq_n_f32(0);
+ }
+
+ for (int b = 0; b < nb; b++) {
+ float32x4_t q4_d_0 = vcvt_f32_f16(vld1_f16((const __fp16 *) q4_ptr[b].d)); // d0 d1 d2 d3
+ float32x4_t q4_d_1 = vcvt_f32_f16(vld1_f16((const __fp16 *) q4_ptr[b].d + 4)); // d4 d5 d6 d7
+ float32x4_t q8_d = vdupq_n_f32(q8_ptr[b].d);
+ float32x4_t sb_scale_0123 = vmulq_f32(q4_d_0, q8_d);
+ float32x4_t sb_scale_4567 = vmulq_f32(q4_d_1, q8_d);
+ float32x4_t q4_dmin_0 = vcvt_f32_f16(vld1_f16((const __fp16 *) q4_ptr[b].dmin)); // dmin 0..3
+ float32x4_t q4_dmin_1 = vcvt_f32_f16(vld1_f16((const __fp16 *) q4_ptr[b].dmin + 4)); // dmin 4..7
+ float32x4_t sb_min_0123 = vmulq_f32(q4_dmin_0, q8_d);
+ float32x4_t sb_min_4567 = vmulq_f32(q4_dmin_1, q8_d);
+
+ // interleaved bias_acc: [0]->r0 0123, [1]->r0 4567
+ int32x4_t bias_acc[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+ int32x4_t acc_lo[col_groups];
+ int32x4_t acc_hi[col_groups];
+
+ // Each bsum is 16 elements, pairwise add leaves us with the 8 bsums of the entire block
+ const int16x8_t bsums = vpaddq_s16(vld1q_s16(q8_ptr[b].bsums), vld1q_s16(q8_ptr[b].bsums + 8));
+ int16_t bsums_arr[8];
+ vst1q_s16(bsums_arr, bsums);
+ for (int sb = 0; sb < QK_K / 64; sb++) {
+ for (int i = 0; i < col_groups; i++) {
+ acc_lo[i] = vdupq_n_s32(0);
+ acc_hi[i] = vdupq_n_s32(0);
+ }
+ // Need scales for the low and high nibbles
+ // 2 * 12 = 24 bytes per subblock, 4 sbs -> 4 * 24 = 96 bytes total
+ int16x8_t q4sb_mins[2];
+ int16x8_t q4sb_scales[2];
+ for (int i = 0; i < 2; i++) {
+ int8_t aux_q4sb[8];
+ const int offset = sb * 24 + i * 12;
+ decode_q4_Kx8_scales_mins(&q4_ptr[b].scales[offset], &q4sb_mins[i], aux_q4sb);
+ q4sb_scales[i] = vmovl_s8(vld1_s8(aux_q4sb));
+ }
+
+ int8x16_t q8_qs[64 / 16];
+ for (int i = 0; i < 64 / 16; i++) {
+ q8_qs[i] = vld1q_s8(q8_ptr[b].qs + sb * 64 + i * 16);
+ }
+
+ for (int c = 0; c < col_groups; c++) {
+ uint8x16_t q4_cols[8];
+ for (int i = 0; i < 8; i++) {
+ q4_cols[i] = vld1q_u8(q4_ptr[b].qs + sb * QK_K + i * 32 + 16 * c);
+ }
+
+ acc_lo[c] = vdotq_laneq_s32(acc_lo[c], vreinterpretq_s8_u8(vandq_u8(q4_cols[0], m4b)), q8_qs[0], 0);
+ acc_lo[c] = vdotq_laneq_s32(acc_lo[c], vreinterpretq_s8_u8(vandq_u8(q4_cols[1], m4b)), q8_qs[0], 1);
+ acc_lo[c] = vdotq_laneq_s32(acc_lo[c], vreinterpretq_s8_u8(vandq_u8(q4_cols[2], m4b)), q8_qs[0], 2);
+ acc_lo[c] = vdotq_laneq_s32(acc_lo[c], vreinterpretq_s8_u8(vandq_u8(q4_cols[3], m4b)), q8_qs[0], 3);
+ acc_lo[c] = vdotq_laneq_s32(acc_lo[c], vreinterpretq_s8_u8(vandq_u8(q4_cols[4], m4b)), q8_qs[1], 0);
+ acc_lo[c] = vdotq_laneq_s32(acc_lo[c], vreinterpretq_s8_u8(vandq_u8(q4_cols[5], m4b)), q8_qs[1], 1);
+ acc_lo[c] = vdotq_laneq_s32(acc_lo[c], vreinterpretq_s8_u8(vandq_u8(q4_cols[6], m4b)), q8_qs[1], 2);
+ acc_lo[c] = vdotq_laneq_s32(acc_lo[c], vreinterpretq_s8_u8(vandq_u8(q4_cols[7], m4b)), q8_qs[1], 3);
+
+ acc_hi[c] = vdotq_laneq_s32(acc_hi[c], vreinterpretq_s8_u8(vshrq_n_u8(q4_cols[0], 4)), q8_qs[2], 0);
+ acc_hi[c] = vdotq_laneq_s32(acc_hi[c], vreinterpretq_s8_u8(vshrq_n_u8(q4_cols[1], 4)), q8_qs[2], 1);
+ acc_hi[c] = vdotq_laneq_s32(acc_hi[c], vreinterpretq_s8_u8(vshrq_n_u8(q4_cols[2], 4)), q8_qs[2], 2);
+ acc_hi[c] = vdotq_laneq_s32(acc_hi[c], vreinterpretq_s8_u8(vshrq_n_u8(q4_cols[3], 4)), q8_qs[2], 3);
+ acc_hi[c] = vdotq_laneq_s32(acc_hi[c], vreinterpretq_s8_u8(vshrq_n_u8(q4_cols[4], 4)), q8_qs[3], 0);
+ acc_hi[c] = vdotq_laneq_s32(acc_hi[c], vreinterpretq_s8_u8(vshrq_n_u8(q4_cols[5], 4)), q8_qs[3], 1);
+ acc_hi[c] = vdotq_laneq_s32(acc_hi[c], vreinterpretq_s8_u8(vshrq_n_u8(q4_cols[6], 4)), q8_qs[3], 2);
+ acc_hi[c] = vdotq_laneq_s32(acc_hi[c], vreinterpretq_s8_u8(vshrq_n_u8(q4_cols[7], 4)), q8_qs[3], 3);
+ }
+
+ // Scales
+ // row c0123 blk0 and blk1
+ const int16x4_t sc_0123_lo = vget_low_s16(q4sb_scales[0]);
+ const int16x4_t sc_0123_hi = vget_low_s16(q4sb_scales[1]);
+ const float32x4_t sumf_0123 = vcvtq_f32_s32(vaddq_s32(vmulq_s32(vmovl_s16(sc_0123_lo), acc_lo[0]),
+ vmulq_s32(vmovl_s16(sc_0123_hi), acc_hi[0])));
+ acc_f32[0] = vfmaq_f32(acc_f32[0], sb_scale_0123, sumf_0123);
+ // row c4567 blk0 and blk1
+ const int16x4_t sc_4567_lo = vget_high_s16(q4sb_scales[0]);
+ const int16x4_t sc_4567_hi = vget_high_s16(q4sb_scales[1]);
+ const float32x4_t sumf_4567 = vcvtq_f32_s32(vaddq_s32(vmulq_s32(vmovl_s16(sc_4567_lo), acc_lo[1]),
+ vmulq_s32(vmovl_s16(sc_4567_hi), acc_hi[1])));
+ acc_f32[1] = vfmaq_f32(acc_f32[1], sb_scale_4567, sumf_4567);
+
+ // Bias Correction
+ const int16x4_t bsums_vec_lo = vdup_n_s16(bsums_arr[2 * sb + 0]);
+ const int16x4_t bsums_vec_hi = vdup_n_s16(bsums_arr[2 * sb + 1]);
+
+ bias_acc[0] = vmlal_s16(bias_acc[0], bsums_vec_lo, vget_low_s16(q4sb_mins[0]));
+ bias_acc[0] = vmlal_s16(bias_acc[0], bsums_vec_hi, vget_low_s16(q4sb_mins[1]));
+ bias_acc[1] = vmlal_s16(bias_acc[1], bsums_vec_lo, vget_high_s16(q4sb_mins[0]));
+ bias_acc[1] = vmlal_s16(bias_acc[1], bsums_vec_hi, vget_high_s16(q4sb_mins[1]));
+ } // for sb
+
+ acc_f32[0] = vmlsq_f32(acc_f32[0], vcvtq_f32_s32(bias_acc[0]), sb_min_0123);
+ acc_f32[1] = vmlsq_f32(acc_f32[1], vcvtq_f32_s32(bias_acc[1]), sb_min_4567);
+ } // for b
+
+ int base = x * ncols_interleaved;
+ vst1q_f32(s + base, acc_f32[0]);
+ vst1q_f32(s + base + 4, acc_f32[1]);
+ } // for x
+ return;
+ #endif // #if defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
+ ggml_gemv_q4_K_8x4_q8_K_generic(n, s, bs, vx, vy, nr, nc);
+ }
+
  void ggml_gemv_q4_K_8x8_q8_K(int n,
  float * GGML_RESTRICT s,
  size_t bs,
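
Note: per 32-wide sub-block of each QK_K super-block, the NEON kernel above accumulates scale * sum(q4*q8) with vdotq_laneq_s32 and subtracts the min * sum(q8) bias with vmlsq_f32 at the end of the super-block; both terms also carry the q4_K super-block scale (d or dmin) and the q8_K super-block scale. A hedged scalar sketch of that per-sub-block math (illustrative names, not the interleaved block_q4_Kx8 layout):

    #include <cstdint>

    // Contribution of one 32-element sub-block to a q4_K x q8_K dot product.
    static float q4k_subblock_dot(float d4, float dmin4,   // q4_K super-block scale / min scale
                                  float d8,                // q8_K super-block scale
                                  int scale, int min,      // 6-bit sub-block scale and min
                                  const int8_t * q4,       // 32 unpacked 4-bit quants (0..15)
                                  const int8_t * q8,       // 32 int8 quants
                                  int32_t bsum) {          // sum of the 32 q8 quants
        int32_t isum = 0;
        for (int j = 0; j < 32; ++j) {
            isum += (int32_t) q4[j] * (int32_t) q8[j];
        }
        return d4 * d8 * (float) scale * (float) isum - dmin4 * d8 * (float) min * (float) bsum;
    }
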
package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp (continued)
@@ -518,7 +652,7 @@ void ggml_gemv_q4_K_8x8_q8_K(int n,
  UNUSED(ncols_interleaved);
  UNUSED(blocklen);

- #if defined(__aarch64__) && defined(__ARM_NEON)
+ #if defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
  constexpr int col_pairs = ncols_interleaved / 2;
  const uint8x16_t m4b = vdupq_n_u8(0x0f);

@@ -615,7 +749,6 @@ void ggml_gemv_q4_K_8x8_q8_K(int n,
  float32x4_t sb_scale = p == 0 ? sb_scale_0 : sb_scale_1;

  // 0123 or 4567
- // TODO: Single superblock mul at the end of the superblock
  float32x4_t sumf_0 =
  vcvtq_f32_s32(vmulq_s32(vmovl_s16(group_scales_lo), vpaddq_s32(acc_lo[p], acc_lo[p + 1])));
  acc_f32[i] = vfmaq_f32(acc_f32[i], sb_scale, sumf_0);
@@ -649,7 +782,7 @@ void ggml_gemv_q4_K_8x8_q8_K(int n,
  vst1q_f32(s + base + 4, acc_f32[1]);
  } // for x
  return;
- #endif // defined(__aarch64__) && defined(__ARM_NEON)
+ #endif // defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
  ggml_gemv_q4_K_8x8_q8_K_generic(n, s, bs, vx, vy, nr, nc);
  }

@@ -2069,6 +2202,206 @@ void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
  ggml_gemm_iq4_nl_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc);
  }

+ void ggml_gemm_q4_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+ constexpr int qk = QK_K;
+ const int nb = n / qk;
+
+ constexpr int ncols_interleaved = 8;
+ constexpr int blocklen = 4;
+
+ assert(n % qk == 0);
+ assert(nr % 4 == 0);
+ assert(nc % ncols_interleaved == 0);
+
+ UNUSED(nb);
+ UNUSED(ncols_interleaved);
+ UNUSED(blocklen);
+
+ #if defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
+ constexpr int q8_k_blocklen = 4;
+ constexpr int acc_size = 2 * 4; // 2 row pairs × 4 col pairs
+ const uint8x16_t m4b = vdupq_n_u8(0x0f);
+
+ // 8 accumulators: 2 row pairs × 4 col pairs
+ float32x4_t acc_f32[acc_size];
+
+ for (int y = 0; y < nr / q8_k_blocklen; y++) {
+ const block_q8_Kx4 * GGML_RESTRICT q8_ptr = (const block_q8_Kx4 *) vy + (y * nb);
+
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
+ const block_q4_Kx8 * GGML_RESTRICT q4_ptr = (const block_q4_Kx8 *) vx + (x * nb);
+
+ for (int i = 0; i < acc_size; i++) {
+ acc_f32[i] = vdupq_n_f32(0);
+ }
+
+ for (int b = 0; b < nb; b++) {
+ // d4 0 1 2 3, 4 5 6 7
+ float32x4_t q4_d_0123 = vcvt_f32_f16(vld1_f16((const __fp16 *) q4_ptr[b].d));
+ float32x4_t q4_d_4567 = vcvt_f32_f16(vld1_f16((const __fp16 *) q4_ptr[b].d + 4));
+ // d8 0 1 2 3
+ float32x4_t q8_d_0123 = vld1q_f32(q8_ptr[b].d);
+ // mins
+ float32x4_t q4_dmin_0123 = vcvt_f32_f16(vld1_f16((const __fp16 *) q4_ptr[b].dmin));
+ float32x4_t q4_dmin_4567 = vcvt_f32_f16(vld1_f16((const __fp16 *) q4_ptr[b].dmin + 4));
+
+ // Precomputation of scales and mins
+ float32x4_t sbd_scale_0123[q8_k_blocklen];
+ float32x4_t sbd_scale_4567[q8_k_blocklen];
+ float32x4_t sbd_min_0123[q8_k_blocklen];
+ float32x4_t sbd_min_4567[q8_k_blocklen];
+
+ sbd_scale_0123[0] = vmulq_laneq_f32(q4_d_0123, q8_d_0123, 0);
+ sbd_scale_4567[0] = vmulq_laneq_f32(q4_d_4567, q8_d_0123, 0);
+ sbd_min_0123[0] = vmulq_laneq_f32(q4_dmin_0123, q8_d_0123, 0);
+ sbd_min_4567[0] = vmulq_laneq_f32(q4_dmin_4567, q8_d_0123, 0);
+
+ sbd_scale_0123[1] = vmulq_laneq_f32(q4_d_0123, q8_d_0123, 1);
+ sbd_scale_4567[1] = vmulq_laneq_f32(q4_d_4567, q8_d_0123, 1);
+ sbd_min_0123[1] = vmulq_laneq_f32(q4_dmin_0123, q8_d_0123, 1);
+ sbd_min_4567[1] = vmulq_laneq_f32(q4_dmin_4567, q8_d_0123, 1);
+
+ sbd_scale_0123[2] = vmulq_laneq_f32(q4_d_0123, q8_d_0123, 2);
+ sbd_scale_4567[2] = vmulq_laneq_f32(q4_d_4567, q8_d_0123, 2);
+ sbd_min_0123[2] = vmulq_laneq_f32(q4_dmin_0123, q8_d_0123, 2);
+ sbd_min_4567[2] = vmulq_laneq_f32(q4_dmin_4567, q8_d_0123, 2);
+
+ sbd_scale_0123[3] = vmulq_laneq_f32(q4_d_0123, q8_d_0123, 3);
+ sbd_scale_4567[3] = vmulq_laneq_f32(q4_d_4567, q8_d_0123, 3);
+ sbd_min_0123[3] = vmulq_laneq_f32(q4_dmin_0123, q8_d_0123, 3);
+ sbd_min_4567[3] = vmulq_laneq_f32(q4_dmin_4567, q8_d_0123, 3);
+
+ // Precomputation of bsums, each vpaddq calcs all the bsums for each row
+ const int16x8_t bsums[q8_k_blocklen] = {
+ vpaddq_s16(vld1q_s16(q8_ptr[b].bsums + 16 * 0), vld1q_s16(q8_ptr[b].bsums + 16 * 0 + 8)),
+ vpaddq_s16(vld1q_s16(q8_ptr[b].bsums + 16 * 1), vld1q_s16(q8_ptr[b].bsums + 16 * 1 + 8)),
+ vpaddq_s16(vld1q_s16(q8_ptr[b].bsums + 16 * 2), vld1q_s16(q8_ptr[b].bsums + 16 * 2 + 8)),
+ vpaddq_s16(vld1q_s16(q8_ptr[b].bsums + 16 * 3), vld1q_s16(q8_ptr[b].bsums + 16 * 3 + 8)),
+ };
+ int16_t bsums_arr[QK_K / 64][8];
+ for (int q8_row = 0; q8_row < 4; q8_row++) {
+ vst1q_s16(bsums_arr[q8_row], bsums[q8_row]);
+ }
+
+ // interleaved bias_acc: [0]->r0 0123, [1]->r1 0123, .., [4]->r0 4567, [5]->r1 4567 ..
+ int32x4_t bias_acc[acc_size];
+ for (int i = 0; i < acc_size; i++) {
+ bias_acc[i] = vdupq_n_s32(0);
+ }
+
+ for (int sb = 0; sb < QK_K / 64; sb++) {
+ // Int accumulators for qs vecdot (4 row x 2 col quartets)
+ int32x4_t acc_lo[acc_size];
+ int32x4_t acc_hi[acc_size];
+ for (int i = 0; i < acc_size; i++) {
+ acc_lo[i] = vdupq_n_s32(0);
+ acc_hi[i] = vdupq_n_s32(0);
+ }
+ // Need scales for the low and high nibbles
+ // 2 * 12 = 24 bytes per subblock, 4 sbs -> 4 * 24 = 96 bytes total
+ int16x8_t q4sb_scales[2];
+ int16x8_t q4sb_mins[2];
+ for (int i = 0; i < 2; i++) {
+ int8_t aux_q4sb[8];
+ const int offset = sb * 24 + i * 12;
+ decode_q4_Kx8_scales_mins(&q4_ptr[b].scales[offset], &q4sb_mins[i], aux_q4sb);
+ q4sb_scales[i] = vmovl_s8(vld1_s8(aux_q4sb));
+ }
+
+ constexpr int reads_per_sb = 8; // 8 * 16 bytes each => 32 qs * 4 rows
+ for (int k = 0; k < reads_per_sb; k++) {
+ const int8x16_t q8_blk0 = vld1q_s8(q8_ptr[b].qs + sb * 256 + 16 * k);
+ const int8x16_t q8_blk1 = vld1q_s8(q8_ptr[b].qs + sb * 256 + 16 * k + 128);
+
+ // 0..3 & 32..35
+ const uint8x16_t q4_0123 = vld1q_u8(q4_ptr[b].qs + sb * QK_K + 32 * k);
+ const uint8x16_t q4_4567 = vld1q_u8(q4_ptr[b].qs + sb * QK_K + 32 * k + 16);
+
+ const int8x16_t q4_0123_lo = vreinterpretq_s8_u8(vandq_u8(q4_0123, m4b));
+ const int8x16_t q4_0123_hi = vreinterpretq_s8_u8(vshrq_n_u8(q4_0123, 4));
+
+ acc_lo[0] = vdotq_laneq_s32(acc_lo[0], q4_0123_lo, q8_blk0, 0); // 0..3 r0 c0123
+ acc_lo[1] = vdotq_laneq_s32(acc_lo[1], q4_0123_lo, q8_blk0, 1); // 0..3 r1 c0123
+ acc_lo[2] = vdotq_laneq_s32(acc_lo[2], q4_0123_lo, q8_blk0, 2); // 0..3 r2 c0123
+ acc_lo[3] = vdotq_laneq_s32(acc_lo[3], q4_0123_lo, q8_blk0, 3); // 0..3 r3 c0123
+
+ acc_hi[0] = vdotq_laneq_s32(acc_hi[0], q4_0123_hi, q8_blk1, 0); // 32..35 r0 c0123
+ acc_hi[1] = vdotq_laneq_s32(acc_hi[1], q4_0123_hi, q8_blk1, 1); // 32..35 r1 c0123
+ acc_hi[2] = vdotq_laneq_s32(acc_hi[2], q4_0123_hi, q8_blk1, 2); // 32..35 r2 c0123
+ acc_hi[3] = vdotq_laneq_s32(acc_hi[3], q4_0123_hi, q8_blk1, 3); // 32..35 r3 c0123
+
+ const int8x16_t q4_4567_lo = vreinterpretq_s8_u8(vandq_u8(q4_4567, m4b));
+ const int8x16_t q4_4567_hi = vreinterpretq_s8_u8(vshrq_n_u8(q4_4567, 4));
+
+ acc_lo[4] = vdotq_laneq_s32(acc_lo[4], q4_4567_lo, q8_blk0, 0); // 0..3 r0 c4567
+ acc_lo[5] = vdotq_laneq_s32(acc_lo[5], q4_4567_lo, q8_blk0, 1); // 0..3 r1 c4567
+ acc_lo[6] = vdotq_laneq_s32(acc_lo[6], q4_4567_lo, q8_blk0, 2); // 0..3 r2 c4567
+ acc_lo[7] = vdotq_laneq_s32(acc_lo[7], q4_4567_lo, q8_blk0, 3); // 0..3 r3 c4567
+
+ acc_hi[4] = vdotq_laneq_s32(acc_hi[4], q4_4567_hi, q8_blk1, 0); // 32..35 r0 c4567
+ acc_hi[5] = vdotq_laneq_s32(acc_hi[5], q4_4567_hi, q8_blk1, 1); // 32..35 r1 c4567
+ acc_hi[6] = vdotq_laneq_s32(acc_hi[6], q4_4567_hi, q8_blk1, 2); // 32..35 r2 c4567
+ acc_hi[7] = vdotq_laneq_s32(acc_hi[7], q4_4567_hi, q8_blk1, 3); // 32..35 r3 c4567
+ }
+
+ // Scale and bias application
+ // acc is stored interleaved to match output layout
+ const int16x4_t sc_0123_lo = vget_low_s16(q4sb_scales[0]);
+ const int16x4_t sc_4567_lo = vget_high_s16(q4sb_scales[0]);
+ const int16x4_t sc_0123_hi = vget_low_s16(q4sb_scales[1]);
+ const int16x4_t sc_4567_hi = vget_high_s16(q4sb_scales[1]);
+ for (int row = 0; row < q8_k_blocklen; row++) {
+ // Bias correction
+ // row c0123 blk0 and blk1
+ const float32x4_t sumf_0123 =
+ vcvtq_f32_s32(vaddq_s32(vmulq_s32(vmovl_s16(sc_0123_lo), acc_lo[row]),
+ vmulq_s32(vmovl_s16(sc_0123_hi), acc_hi[row])));
+ acc_f32[2 * row] = vfmaq_f32(acc_f32[2 * row], sbd_scale_0123[row], sumf_0123);
+
+ // row c4567 blk0 and blk1
+ const float32x4_t sumf_4567 =
+ vcvtq_f32_s32(vaddq_s32(vmulq_s32(vmovl_s16(sc_4567_lo), acc_lo[row + 4]),
+ vmulq_s32(vmovl_s16(sc_4567_hi), acc_hi[row + 4])));
+ acc_f32[2 * row + 1] = vfmaq_f32(acc_f32[2 * row + 1], sbd_scale_4567[row], sumf_4567);
+
+ // Bias
+ const int16x4_t bsums_vec_lo = vdup_n_s16(bsums_arr[sb][row * 2]);
+ const int16x4_t bsums_vec_hi = vdup_n_s16(bsums_arr[sb][row * 2 + 1]);
+
+ // row c0123 blk0 and blk1
+ bias_acc[2 * row] = vmlal_s16(bias_acc[2 * row], bsums_vec_lo, vget_low_s16(q4sb_mins[0]));
+ bias_acc[2 * row] = vmlal_s16(bias_acc[2 * row], bsums_vec_hi, vget_low_s16(q4sb_mins[1]));
+
+ // row c4567 blk0 and blk1
+ bias_acc[2 * row + 1] =
+ vmlal_s16(bias_acc[2 * row + 1], bsums_vec_lo, vget_high_s16(q4sb_mins[0]));
+ bias_acc[2 * row + 1] =
+ vmlal_s16(bias_acc[2 * row + 1], bsums_vec_hi, vget_high_s16(q4sb_mins[1]));
+ }
+ } // for sb
+
+ for (int row = 0; row < q8_k_blocklen; row++) {
+ acc_f32[2 * row] = vmlsq_f32(acc_f32[2 * row], vcvtq_f32_s32(bias_acc[2 * row]), sbd_min_0123[row]);
+ acc_f32[2 * row + 1] =
+ vmlsq_f32(acc_f32[2 * row + 1], vcvtq_f32_s32(bias_acc[2 * row + 1]), sbd_min_4567[row]);
+ }
+ } // for b
+
+ for (int i = 0; i < q8_k_blocklen; i++) {
+ int row = y * q8_k_blocklen + i;
+ for (int j = 0; j < 2; j++) {
+ int col = x * ncols_interleaved + j * 4;
+ int offset = row * bs + col;
+ vst1q_f32(s + offset, acc_f32[2 * i + j]);
+ }
+ }
+ } // for x
+ } // for y
+ return;
+ #endif // defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
+ ggml_gemm_q4_K_8x4_q8_K_generic(n, s, bs, vx, vy, nr, nc);
+ }
+
  void ggml_gemm_q4_K_8x8_q8_K(int n,
  float * GGML_RESTRICT s,
  size_t bs,
package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp
@@ -1,20 +1,23 @@
  #include "ggml-backend-impl.h"

  #if defined(__riscv) && __riscv_xlen == 64
- #include <sys/auxv.h>
-
- //https://github.com/torvalds/linux/blob/master/arch/riscv/include/uapi/asm/hwcap.h#L24
- #ifndef COMPAT_HWCAP_ISA_V
- #define COMPAT_HWCAP_ISA_V (1 << ('V' - 'A'))
- #endif
+ #include <asm/hwprobe.h>
+ #include <asm/unistd.h>
+ #include <unistd.h>

  struct riscv64_features {
  bool has_rvv = false;

  riscv64_features() {
- uint32_t hwcap = getauxval(AT_HWCAP);
+ struct riscv_hwprobe probe;
+ probe.key = RISCV_HWPROBE_KEY_IMA_EXT_0;
+ probe.value = 0;
+
+ int ret = syscall(__NR_riscv_hwprobe, &probe, 1, 0, NULL, 0);

- has_rvv = !!(hwcap & COMPAT_HWCAP_ISA_V);
+ if (0 == ret) {
+ has_rvv = !!(probe.value & RISCV_HWPROBE_IMA_V);
+ }
  }
  };
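
Note: RVV detection now goes through the riscv_hwprobe(2) syscall instead of AT_HWCAP bits. A standalone probe that mirrors the logic above (same constants and syscall as in the diff; builds only on riscv64 Linux with kernel headers that provide asm/hwprobe.h):

    #include <asm/hwprobe.h>
    #include <asm/unistd.h>
    #include <unistd.h>
    #include <cstdio>

    int main() {
        struct riscv_hwprobe probe = {};
        probe.key = RISCV_HWPROBE_KEY_IMA_EXT_0;

        // riscv_hwprobe(pairs, pair_count, cpusetsize, cpus, flags)
        if (syscall(__NR_riscv_hwprobe, &probe, 1, 0, NULL, 0) == 0) {
            std::printf("RVV: %s\n", (probe.value & RISCV_HWPROBE_IMA_V) ? "yes" : "no");
        } else {
            std::printf("riscv_hwprobe not available\n");
        }
        return 0;
    }
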
package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h
@@ -33,10 +33,12 @@
  // repack.cpp
  #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
  #define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
+ #define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
  #define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
  #define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
  #define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
  #define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
+ #define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
  #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
  #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
  #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
@@ -44,12 +46,14 @@
  #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
  #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
  #define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
+ #define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
  #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
  #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
  #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
  #define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
  #elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) || defined(_M_ARM64)
  // repack.cpp
+ #define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
  #define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
  #define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
  #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
@@ -58,11 +62,14 @@
  #elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
  // repack.cpp
  #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
+ #define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
  #define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
  #define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
+ #define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
  #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
  #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
  #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
+ #define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
  #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
  #elif defined(__POWERPC__) || defined(__powerpc__)
  // ref: https://github.com/ggml-org/llama.cpp/pull/14146#issuecomment-2972561679
@@ -74,10 +81,12 @@
  // repack.cpp
  #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
  #define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
+ #define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
  #define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
  #define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
  #define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
  #define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
+ #define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
  #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
  #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
  #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
@@ -85,6 +94,7 @@
  #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
  #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
  #define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
+ #define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
  #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
  #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
  #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
@@ -99,10 +109,12 @@
  // repack.cpp
  #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
  #define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
+ #define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
  #define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
  #define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
  #define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
  #define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
+ #define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
  #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
  #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
  #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
@@ -110,6 +122,7 @@
  #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
  #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
  #define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
+ #define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
  #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
  #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
  #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
@@ -132,15 +145,18 @@
  // repack.cpp
  #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
  #define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
+ #define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
  #define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
  #define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
  #define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
+ #define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
  #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
  #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
  #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
  #define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
  #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
  #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
+ #define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
  #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
  #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
  #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
@@ -161,10 +177,12 @@
  // repack.cpp
  #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
  #define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
+ #define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
  #define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
  #define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
  #define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
  #define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
+ #define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
  #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
  #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
  #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
@@ -172,6 +190,7 @@
  #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
  #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
  #define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
+ #define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
  #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
  #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
  #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
@@ -194,10 +213,12 @@
  // repack.cpp
  #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
  #define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
+ #define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
  #define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
  #define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
  #define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
  #define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
+ #define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
  #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
  #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
  #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
@@ -205,6 +226,7 @@
  #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
  #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
  #define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
+ #define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
  #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
  #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
  #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp
@@ -9766,7 +9766,8 @@ static void ggml_compute_forward_solve_tri_f32(const struct ggml_compute_params
  }

  const float diag = A_batch[i00 * n + i00];
- GGML_ASSERT(diag != 0.0f && "Zero diagonal in triangular matrix");
+ assert(diag != 0.0f && "Zero diagonal in triangular matrix");
+
  X_batch[i00 * k + i01] = (B_batch[i00 * k + i01] - sum) / diag;
  }
  }