@fugood/llama.node 1.3.8 → 1.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.js +25 -18
- package/lib/binding.ts +19 -1
- package/lib/index.js +3 -3
- package/lib/index.ts +1 -1
- package/package.json +17 -17
- package/scripts/llama.cpp.patch +53 -4
- package/src/LlamaCompletionWorker.cpp +2 -2
- package/src/LlamaContext.cpp +6 -1
- package/src/llama.cpp/common/arg.cpp +1 -1
- package/src/llama.cpp/common/chat-parser.cpp +968 -0
- package/src/llama.cpp/common/chat.cpp +0 -952
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +2 -2
- package/src/llama.cpp/ggml/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +1 -1
- package/src/llama.cpp/ggml/src/CMakeLists.txt +11 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +336 -3
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp +11 -8
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +22 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +234 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +6 -0
- package/src/llama.cpp/src/CMakeLists.txt +1 -0
- package/src/llama.cpp/src/llama-arch.cpp +48 -3
- package/src/llama.cpp/src/llama-arch.h +2 -0
- package/src/llama.cpp/src/llama-context.cpp +6 -2
- package/src/llama.cpp/src/llama-hparams.h +1 -1
- package/src/llama.cpp/src/llama-model.cpp +102 -5
- package/src/llama.cpp/src/llama-model.h +4 -0
- package/src/llama.cpp/src/llama-quant.cpp +13 -5
- package/src/llama.cpp/src/models/lfm2.cpp +5 -3
- package/src/llama.cpp/src/models/models.h +51 -1
- package/src/llama.cpp/src/models/qwen3next.cpp +1042 -0
package/src/llama.cpp/common/json-schema-to-grammar.cpp

```diff
@@ -268,10 +268,10 @@ static bool is_reserved_name(const std::string & name) {
 }
 
 std::regex INVALID_RULE_CHARS_RE("[^a-zA-Z0-9-]+");
-std::regex GRAMMAR_LITERAL_ESCAPE_RE("[\r\n\"]");
+std::regex GRAMMAR_LITERAL_ESCAPE_RE("[\r\n\"\\\\]");
 std::regex GRAMMAR_RANGE_LITERAL_ESCAPE_RE("[\r\n\"\\]\\-\\\\]");
 std::unordered_map<char, std::string> GRAMMAR_LITERAL_ESCAPES = {
-    {'\r', "\\r"}, {'\n', "\\n"}, {'"', "\\\""}, {'-', "\\-"}, {']', "\\]"}
+    {'\r', "\\r"}, {'\n', "\\n"}, {'"', "\\\""}, {'-', "\\-"}, {']', "\\]"}, {'\\', "\\\\"}
 };
 
 std::unordered_set<char> NON_LITERAL_SET = {'|', '.', '(', ')', '[', ']', '{', '}', '*', '+', '?'};
```
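The change adds the backslash to both the escape regex and the escape map, so a `\` inside a JSON-schema string literal is emitted as `\\` in the generated GBNF rather than passing through unescaped. A minimal standalone sketch of the escaping rule (illustrative only, not the library's actual helper):

```cpp
#include <iostream>
#include <string>
#include <unordered_map>

// Hypothetical helper mirroring the escape table in the hunk above.
static std::string escape_grammar_literal(const std::string & in) {
    static const std::unordered_map<char, std::string> escapes = {
        {'\r', "\\r"}, {'\n', "\\n"}, {'"', "\\\""}, {'\\', "\\\\"},
    };
    std::string out;
    for (char c : in) {
        auto it = escapes.find(c);
        out += (it != escapes.end()) ? it->second : std::string(1, c);
    }
    return out;
}

int main() {
    // A backslash in a schema "const" (e.g. a Windows path) must survive as \\ in GBNF.
    std::cout << '"' << escape_grammar_literal("C:\\models\\x") << "\"\n"; // prints "C:\\models\\x"
}
```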
package/src/llama.cpp/ggml/CMakeLists.txt

```diff
@@ -183,6 +183,7 @@ endif()
 # ggml core
 set(GGML_SCHED_MAX_COPIES "4" CACHE STRING "ggml: max input copies for pipeline parallelism")
 option(GGML_CPU "ggml: enable CPU backend" ON)
+option(GGML_SCHED_NO_REALLOC "ggml: disallow reallocations in ggml-alloc (for debugging)" OFF)
 
 # 3rd party libs / backends
 option(GGML_ACCELERATE "ggml: enable Accelerate framework" ON)
```
package/src/llama.cpp/ggml/src/CMakeLists.txt

```diff
@@ -221,6 +221,10 @@ if (GGML_BACKEND_DL)
     target_compile_definitions(ggml-base PUBLIC GGML_BACKEND_DL)
 endif()
 
+if (GGML_SCHED_NO_REALLOC)
+    target_compile_definitions(ggml-base PUBLIC GGML_SCHED_NO_REALLOC)
+endif()
+
 add_library(ggml
             ggml-backend-reg.cpp)
 add_library(ggml::ggml ALIAS ggml)
@@ -270,10 +274,13 @@ function(ggml_add_backend_library backend)
     endif()
 
     # Set versioning properties for all backend libraries
-    set_target_properties(${backend} PROPERTIES
-        VERSION ${GGML_VERSION}
-        SOVERSION ${GGML_VERSION_MAJOR}
-    )
+    # Building a MODULE library with a version is not supported on macOS (https://gitlab.kitware.com/cmake/cmake/-/issues/20782)
+    if (NOT (APPLE AND GGML_BACKEND_DL))
+        set_target_properties(${backend} PROPERTIES
+            VERSION ${GGML_VERSION}
+            SOVERSION ${GGML_VERSION_MAJOR}
+        )
+    endif()
 
     if(NOT GGML_AVAILABLE_BACKENDS)
         set(GGML_AVAILABLE_BACKENDS "${backend}"
```
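The new option follows the usual CMake flow: `option(GGML_SCHED_NO_REALLOC ...)` becomes a compile definition on `ggml-base`, which downstream translation units can test with the preprocessor. A hedged sketch of that pattern (the function below is made up for illustration; it is not ggml-alloc's real code):

```cpp
#include <cassert>
#include <cstddef>
#include <cstdio>

// Illustrative only: shows how a target_compile_definitions() symbol gates a debug check.
static void on_realloc_request(std::size_t old_size, std::size_t new_size) {
#ifdef GGML_SCHED_NO_REALLOC
    (void) old_size;
    (void) new_size;
    // Built with -DGGML_SCHED_NO_REALLOC: treat any reallocation request as a bug.
    assert(!"buffer reallocation requested while GGML_SCHED_NO_REALLOC is set");
#else
    std::printf("growing buffer: %zu -> %zu bytes\n", old_size, new_size);
#endif
}

int main() {
    on_realloc_request(1024, 4096);
}
```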
package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp

```diff
@@ -497,6 +497,140 @@ void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
     ggml_gemv_iq4_nl_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc);
 }
 
+void ggml_gemv_q4_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+    constexpr int qk = QK_K;
+    const int nb = n / qk;
+
+    constexpr int ncols_interleaved = 8;
+    constexpr int blocklen = 8;
+
+    assert(n % qk == 0);
+    assert(nr % 4 == 0);
+    assert(nc % ncols_interleaved == 0);
+
+    UNUSED(nb);
+    UNUSED(ncols_interleaved);
+    UNUSED(blocklen);
+
+#if defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
+    constexpr int col_groups = ncols_interleaved / 4; // 0123 and 4567
+    const uint8x16_t m4b = vdupq_n_u8(0x0f);
+
+    // 1x8 tile = 2 x 4
+    float32x4_t acc_f32[col_groups];
+
+    const block_q8_K * GGML_RESTRICT q8_ptr = (const block_q8_K *) vy;
+
+    for (int x = 0; x < nc / ncols_interleaved; x++) {
+        const block_q4_Kx8 * GGML_RESTRICT q4_ptr = (const block_q4_Kx8 *) vx + (x * nb);
+
+        for (int i = 0; i < col_groups; i++) {
+            acc_f32[i] = vdupq_n_f32(0);
+        }
+
+        for (int b = 0; b < nb; b++) {
+            float32x4_t q4_d_0 = vcvt_f32_f16(vld1_f16((const __fp16 *) q4_ptr[b].d)); // d0 d1 d2 d3
+            float32x4_t q4_d_1 = vcvt_f32_f16(vld1_f16((const __fp16 *) q4_ptr[b].d + 4)); // d4 d5 d6 d7
+            float32x4_t q8_d = vdupq_n_f32(q8_ptr[b].d);
+            float32x4_t sb_scale_0123 = vmulq_f32(q4_d_0, q8_d);
+            float32x4_t sb_scale_4567 = vmulq_f32(q4_d_1, q8_d);
+            float32x4_t q4_dmin_0 = vcvt_f32_f16(vld1_f16((const __fp16 *) q4_ptr[b].dmin)); // dmin 0..3
+            float32x4_t q4_dmin_1 = vcvt_f32_f16(vld1_f16((const __fp16 *) q4_ptr[b].dmin + 4)); // dmin 4..7
+            float32x4_t sb_min_0123 = vmulq_f32(q4_dmin_0, q8_d);
+            float32x4_t sb_min_4567 = vmulq_f32(q4_dmin_1, q8_d);
+
+            // interleaved bias_acc: [0]->r0 0123, [1]->r0 4567
+            int32x4_t bias_acc[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+            int32x4_t acc_lo[col_groups];
+            int32x4_t acc_hi[col_groups];
+
+            // Each bsum is 16 elements, pairwise add leaves us with the 8 bsums of the entire block
+            const int16x8_t bsums = vpaddq_s16(vld1q_s16(q8_ptr[b].bsums), vld1q_s16(q8_ptr[b].bsums + 8));
+            int16_t bsums_arr[8];
+            vst1q_s16(bsums_arr, bsums);
+            for (int sb = 0; sb < QK_K / 64; sb++) {
+                for (int i = 0; i < col_groups; i++) {
+                    acc_lo[i] = vdupq_n_s32(0);
+                    acc_hi[i] = vdupq_n_s32(0);
+                }
+                // Need scales for the low and high nibbles
+                // 2 * 12 = 24 bytes per subblock, 4 sbs -> 4 * 24 = 96 bytes total
+                int16x8_t q4sb_mins[2];
+                int16x8_t q4sb_scales[2];
+                for (int i = 0; i < 2; i++) {
+                    int8_t aux_q4sb[8];
+                    const int offset = sb * 24 + i * 12;
+                    decode_q4_Kx8_scales_mins(&q4_ptr[b].scales[offset], &q4sb_mins[i], aux_q4sb);
+                    q4sb_scales[i] = vmovl_s8(vld1_s8(aux_q4sb));
+                }
+
+                int8x16_t q8_qs[64 / 16];
+                for (int i = 0; i < 64 / 16; i++) {
+                    q8_qs[i] = vld1q_s8(q8_ptr[b].qs + sb * 64 + i * 16);
+                }
+
+                for (int c = 0; c < col_groups; c++) {
+                    uint8x16_t q4_cols[8];
+                    for (int i = 0; i < 8; i++) {
+                        q4_cols[i] = vld1q_u8(q4_ptr[b].qs + sb * QK_K + i * 32 + 16 * c);
+                    }
+
+                    acc_lo[c] = vdotq_laneq_s32(acc_lo[c], vreinterpretq_s8_u8(vandq_u8(q4_cols[0], m4b)), q8_qs[0], 0);
+                    acc_lo[c] = vdotq_laneq_s32(acc_lo[c], vreinterpretq_s8_u8(vandq_u8(q4_cols[1], m4b)), q8_qs[0], 1);
+                    acc_lo[c] = vdotq_laneq_s32(acc_lo[c], vreinterpretq_s8_u8(vandq_u8(q4_cols[2], m4b)), q8_qs[0], 2);
+                    acc_lo[c] = vdotq_laneq_s32(acc_lo[c], vreinterpretq_s8_u8(vandq_u8(q4_cols[3], m4b)), q8_qs[0], 3);
+                    acc_lo[c] = vdotq_laneq_s32(acc_lo[c], vreinterpretq_s8_u8(vandq_u8(q4_cols[4], m4b)), q8_qs[1], 0);
+                    acc_lo[c] = vdotq_laneq_s32(acc_lo[c], vreinterpretq_s8_u8(vandq_u8(q4_cols[5], m4b)), q8_qs[1], 1);
+                    acc_lo[c] = vdotq_laneq_s32(acc_lo[c], vreinterpretq_s8_u8(vandq_u8(q4_cols[6], m4b)), q8_qs[1], 2);
+                    acc_lo[c] = vdotq_laneq_s32(acc_lo[c], vreinterpretq_s8_u8(vandq_u8(q4_cols[7], m4b)), q8_qs[1], 3);
+
+                    acc_hi[c] = vdotq_laneq_s32(acc_hi[c], vreinterpretq_s8_u8(vshrq_n_u8(q4_cols[0], 4)), q8_qs[2], 0);
+                    acc_hi[c] = vdotq_laneq_s32(acc_hi[c], vreinterpretq_s8_u8(vshrq_n_u8(q4_cols[1], 4)), q8_qs[2], 1);
+                    acc_hi[c] = vdotq_laneq_s32(acc_hi[c], vreinterpretq_s8_u8(vshrq_n_u8(q4_cols[2], 4)), q8_qs[2], 2);
+                    acc_hi[c] = vdotq_laneq_s32(acc_hi[c], vreinterpretq_s8_u8(vshrq_n_u8(q4_cols[3], 4)), q8_qs[2], 3);
+                    acc_hi[c] = vdotq_laneq_s32(acc_hi[c], vreinterpretq_s8_u8(vshrq_n_u8(q4_cols[4], 4)), q8_qs[3], 0);
+                    acc_hi[c] = vdotq_laneq_s32(acc_hi[c], vreinterpretq_s8_u8(vshrq_n_u8(q4_cols[5], 4)), q8_qs[3], 1);
+                    acc_hi[c] = vdotq_laneq_s32(acc_hi[c], vreinterpretq_s8_u8(vshrq_n_u8(q4_cols[6], 4)), q8_qs[3], 2);
+                    acc_hi[c] = vdotq_laneq_s32(acc_hi[c], vreinterpretq_s8_u8(vshrq_n_u8(q4_cols[7], 4)), q8_qs[3], 3);
+                }
+
+                // Scales
+                // row c0123 blk0 and blk1
+                const int16x4_t sc_0123_lo = vget_low_s16(q4sb_scales[0]);
+                const int16x4_t sc_0123_hi = vget_low_s16(q4sb_scales[1]);
+                const float32x4_t sumf_0123 = vcvtq_f32_s32(vaddq_s32(vmulq_s32(vmovl_s16(sc_0123_lo), acc_lo[0]),
+                                                                      vmulq_s32(vmovl_s16(sc_0123_hi), acc_hi[0])));
+                acc_f32[0] = vfmaq_f32(acc_f32[0], sb_scale_0123, sumf_0123);
+                // row c4567 blk0 and blk1
+                const int16x4_t sc_4567_lo = vget_high_s16(q4sb_scales[0]);
+                const int16x4_t sc_4567_hi = vget_high_s16(q4sb_scales[1]);
+                const float32x4_t sumf_4567 = vcvtq_f32_s32(vaddq_s32(vmulq_s32(vmovl_s16(sc_4567_lo), acc_lo[1]),
+                                                                      vmulq_s32(vmovl_s16(sc_4567_hi), acc_hi[1])));
+                acc_f32[1] = vfmaq_f32(acc_f32[1], sb_scale_4567, sumf_4567);
+
+                // Bias Correction
+                const int16x4_t bsums_vec_lo = vdup_n_s16(bsums_arr[2 * sb + 0]);
+                const int16x4_t bsums_vec_hi = vdup_n_s16(bsums_arr[2 * sb + 1]);
+
+                bias_acc[0] = vmlal_s16(bias_acc[0], bsums_vec_lo, vget_low_s16(q4sb_mins[0]));
+                bias_acc[0] = vmlal_s16(bias_acc[0], bsums_vec_hi, vget_low_s16(q4sb_mins[1]));
+                bias_acc[1] = vmlal_s16(bias_acc[1], bsums_vec_lo, vget_high_s16(q4sb_mins[0]));
+                bias_acc[1] = vmlal_s16(bias_acc[1], bsums_vec_hi, vget_high_s16(q4sb_mins[1]));
+            } // for sb
+
+            acc_f32[0] = vmlsq_f32(acc_f32[0], vcvtq_f32_s32(bias_acc[0]), sb_min_0123);
+            acc_f32[1] = vmlsq_f32(acc_f32[1], vcvtq_f32_s32(bias_acc[1]), sb_min_4567);
+        } // for b
+
+        int base = x * ncols_interleaved;
+        vst1q_f32(s + base, acc_f32[0]);
+        vst1q_f32(s + base + 4, acc_f32[1]);
+    } // for x
+    return;
+#endif // #if defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
+    ggml_gemv_q4_K_8x4_q8_K_generic(n, s, bs, vx, vy, nr, nc);
+}
+
 void ggml_gemv_q4_K_8x8_q8_K(int n,
                              float * GGML_RESTRICT s,
                              size_t bs,
```
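The new `ggml_gemv_q4_K_8x4_q8_K` path computes, per superblock, an integer dot product for each 32-element subblock, scales it by the decoded subblock scale, and subtracts a bias built from the Q8 per-subblock sums times the subblock mins; the fp16 super-scales `d`/`dmin` and the Q8 scale are applied on top. A simplified scalar model of that arithmetic (assumed flat layout, not the real `block_q4_Kx8`/`block_q8_K` memory format):

```cpp
#include <cstdint>
#include <cstdio>

// One column's superblock: 256 4-bit weights in 8 subblocks of 32, each subblock
// with its own scale/min, plus fp16-derived super-scales d and dmin.
struct Q4SuperBlock {
    float   d, dmin;
    uint8_t scales[8], mins[8];
    uint8_t q[256];            // 4-bit values stored one per byte for clarity
};

// Activation superblock: 256 int8 values, one float scale, per-subblock sums.
struct Q8SuperBlock {
    float   d;
    int8_t  a[256];
    int16_t bsum[8];           // sum of the 32 int8 values of each subblock
};

static float dot_superblock(const Q4SuperBlock & w, const Q8SuperBlock & x) {
    float acc  = 0.0f;         // sum of scale[sb] * <q, a> over subblocks
    float bias = 0.0f;         // sum of min[sb] * bsum[sb] over subblocks
    for (int sb = 0; sb < 8; sb++) {
        int32_t idot = 0;
        for (int i = 0; i < 32; i++) {
            idot += int32_t(w.q[sb * 32 + i]) * int32_t(x.a[sb * 32 + i]);
        }
        acc  += float(w.scales[sb]) * float(idot);
        bias += float(w.mins[sb]) * float(x.bsum[sb]);
    }
    // Same shape as the kernel's result: (d * d8) * acc  -  (dmin * d8) * bias
    return w.d * x.d * acc - w.dmin * x.d * bias;
}

int main() {
    Q4SuperBlock w = {};
    Q8SuperBlock x = {};
    w.d = 1.0f; w.dmin = 1.0f; x.d = 1.0f;
    std::printf("%f\n", dot_superblock(w, x)); // 0 for all-zero blocks
}
```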
package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp

```diff
@@ -518,7 +652,7 @@ void ggml_gemv_q4_K_8x8_q8_K(int n,
     UNUSED(ncols_interleaved);
     UNUSED(blocklen);
 
-#if defined(__aarch64__) && defined(__ARM_NEON)
+#if defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
     constexpr int col_pairs = ncols_interleaved / 2;
     const uint8x16_t m4b = vdupq_n_u8(0x0f);
 
@@ -615,7 +749,6 @@ void ggml_gemv_q4_K_8x8_q8_K(int n,
                 float32x4_t sb_scale = p == 0 ? sb_scale_0 : sb_scale_1;
 
                 // 0123 or 4567
-                // TODO: Single superblock mul at the end of the superblock
                 float32x4_t sumf_0 =
                     vcvtq_f32_s32(vmulq_s32(vmovl_s16(group_scales_lo), vpaddq_s32(acc_lo[p], acc_lo[p + 1])));
                 acc_f32[i] = vfmaq_f32(acc_f32[i], sb_scale, sumf_0);
@@ -649,7 +782,7 @@ void ggml_gemv_q4_K_8x8_q8_K(int n,
         vst1q_f32(s + base + 4, acc_f32[1]);
     } // for x
     return;
-#endif // defined(__aarch64__) && defined(__ARM_NEON)
+#endif // defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
     ggml_gemv_q4_K_8x8_q8_K_generic(n, s, bs, vx, vy, nr, nc);
 }
 
```
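The tightened guards matter because `vdotq_s32`/`vdotq_laneq_s32` only exist when the compiler targets the Armv8 dot-product extension; with plain `__ARM_NEON` the intrinsics would fail to compile, so the existing 8x8 kernel now also falls back to the generic path on such targets. A small sketch of the same guard-plus-fallback pattern (an illustrative function, not one of the kernels above):

```cpp
#include <cstdint>
#include <cstdio>
#if defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
#include <arm_neon.h>
#endif

// 16-element int8 dot product: NEON dotprod path when available, scalar otherwise.
static int32_t dot16(const int8_t * a, const int8_t * b) {
#if defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
    int32x4_t acc = vdotq_s32(vdupq_n_s32(0), vld1q_s8(a), vld1q_s8(b));
    return vaddvq_s32(acc);
#else
    int32_t acc = 0;
    for (int i = 0; i < 16; i++) acc += int32_t(a[i]) * int32_t(b[i]);
    return acc;
#endif
}

int main() {
    int8_t a[16], b[16];
    for (int i = 0; i < 16; i++) { a[i] = int8_t(i); b[i] = 1; }
    std::printf("%d\n", dot16(a, b)); // 120
}
```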
package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp

```diff
@@ -2069,6 +2202,206 @@ void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
     ggml_gemm_iq4_nl_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc);
 }
 
+void ggml_gemm_q4_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+    constexpr int qk = QK_K;
+    const int nb = n / qk;
+
+    constexpr int ncols_interleaved = 8;
+    constexpr int blocklen = 4;
+
+    assert(n % qk == 0);
+    assert(nr % 4 == 0);
+    assert(nc % ncols_interleaved == 0);
+
+    UNUSED(nb);
+    UNUSED(ncols_interleaved);
+    UNUSED(blocklen);
+
+#if defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
+    constexpr int q8_k_blocklen = 4;
+    constexpr int acc_size = 2 * 4; // 2 row pairs × 4 col pairs
+    const uint8x16_t m4b = vdupq_n_u8(0x0f);
+
+    // 8 accumulators: 2 row pairs × 4 col pairs
+    float32x4_t acc_f32[acc_size];
+
+    for (int y = 0; y < nr / q8_k_blocklen; y++) {
+        const block_q8_Kx4 * GGML_RESTRICT q8_ptr = (const block_q8_Kx4 *) vy + (y * nb);
+
+        for (int x = 0; x < nc / ncols_interleaved; x++) {
+            const block_q4_Kx8 * GGML_RESTRICT q4_ptr = (const block_q4_Kx8 *) vx + (x * nb);
+
+            for (int i = 0; i < acc_size; i++) {
+                acc_f32[i] = vdupq_n_f32(0);
+            }
+
+            for (int b = 0; b < nb; b++) {
+                // d4 0 1 2 3, 4 5 6 7
+                float32x4_t q4_d_0123 = vcvt_f32_f16(vld1_f16((const __fp16 *) q4_ptr[b].d));
+                float32x4_t q4_d_4567 = vcvt_f32_f16(vld1_f16((const __fp16 *) q4_ptr[b].d + 4));
+                // d8 0 1 2 3
+                float32x4_t q8_d_0123 = vld1q_f32(q8_ptr[b].d);
+                // mins
+                float32x4_t q4_dmin_0123 = vcvt_f32_f16(vld1_f16((const __fp16 *) q4_ptr[b].dmin));
+                float32x4_t q4_dmin_4567 = vcvt_f32_f16(vld1_f16((const __fp16 *) q4_ptr[b].dmin + 4));
+
+                // Precomputation of scales and mins
+                float32x4_t sbd_scale_0123[q8_k_blocklen];
+                float32x4_t sbd_scale_4567[q8_k_blocklen];
+                float32x4_t sbd_min_0123[q8_k_blocklen];
+                float32x4_t sbd_min_4567[q8_k_blocklen];
+
+                sbd_scale_0123[0] = vmulq_laneq_f32(q4_d_0123, q8_d_0123, 0);
+                sbd_scale_4567[0] = vmulq_laneq_f32(q4_d_4567, q8_d_0123, 0);
+                sbd_min_0123[0] = vmulq_laneq_f32(q4_dmin_0123, q8_d_0123, 0);
+                sbd_min_4567[0] = vmulq_laneq_f32(q4_dmin_4567, q8_d_0123, 0);
+
+                sbd_scale_0123[1] = vmulq_laneq_f32(q4_d_0123, q8_d_0123, 1);
+                sbd_scale_4567[1] = vmulq_laneq_f32(q4_d_4567, q8_d_0123, 1);
+                sbd_min_0123[1] = vmulq_laneq_f32(q4_dmin_0123, q8_d_0123, 1);
+                sbd_min_4567[1] = vmulq_laneq_f32(q4_dmin_4567, q8_d_0123, 1);
+
+                sbd_scale_0123[2] = vmulq_laneq_f32(q4_d_0123, q8_d_0123, 2);
+                sbd_scale_4567[2] = vmulq_laneq_f32(q4_d_4567, q8_d_0123, 2);
+                sbd_min_0123[2] = vmulq_laneq_f32(q4_dmin_0123, q8_d_0123, 2);
+                sbd_min_4567[2] = vmulq_laneq_f32(q4_dmin_4567, q8_d_0123, 2);
+
+                sbd_scale_0123[3] = vmulq_laneq_f32(q4_d_0123, q8_d_0123, 3);
+                sbd_scale_4567[3] = vmulq_laneq_f32(q4_d_4567, q8_d_0123, 3);
+                sbd_min_0123[3] = vmulq_laneq_f32(q4_dmin_0123, q8_d_0123, 3);
+                sbd_min_4567[3] = vmulq_laneq_f32(q4_dmin_4567, q8_d_0123, 3);
+
+                // Precomputation of bsums, each vpaddq calcs all the bsums for each row
+                const int16x8_t bsums[q8_k_blocklen] = {
+                    vpaddq_s16(vld1q_s16(q8_ptr[b].bsums + 16 * 0), vld1q_s16(q8_ptr[b].bsums + 16 * 0 + 8)),
+                    vpaddq_s16(vld1q_s16(q8_ptr[b].bsums + 16 * 1), vld1q_s16(q8_ptr[b].bsums + 16 * 1 + 8)),
+                    vpaddq_s16(vld1q_s16(q8_ptr[b].bsums + 16 * 2), vld1q_s16(q8_ptr[b].bsums + 16 * 2 + 8)),
+                    vpaddq_s16(vld1q_s16(q8_ptr[b].bsums + 16 * 3), vld1q_s16(q8_ptr[b].bsums + 16 * 3 + 8)),
+                };
+                int16_t bsums_arr[QK_K / 64][8];
+                for (int q8_row = 0; q8_row < 4; q8_row++) {
+                    vst1q_s16(bsums_arr[q8_row], bsums[q8_row]);
+                }
+
+                // interleaved bias_acc: [0]->r0 0123, [1]->r1 0123, .., [4]->r0 4567, [5]->r1 4567 ..
+                int32x4_t bias_acc[acc_size];
+                for (int i = 0; i < acc_size; i++) {
+                    bias_acc[i] = vdupq_n_s32(0);
+                }
+
+                for (int sb = 0; sb < QK_K / 64; sb++) {
+                    // Int accumulators for qs vecdot (4 row x 2 col quartets)
+                    int32x4_t acc_lo[acc_size];
+                    int32x4_t acc_hi[acc_size];
+                    for (int i = 0; i < acc_size; i++) {
+                        acc_lo[i] = vdupq_n_s32(0);
+                        acc_hi[i] = vdupq_n_s32(0);
+                    }
+                    // Need scales for the low and high nibbles
+                    // 2 * 12 = 24 bytes per subblock, 4 sbs -> 4 * 24 = 96 bytes total
+                    int16x8_t q4sb_scales[2];
+                    int16x8_t q4sb_mins[2];
+                    for (int i = 0; i < 2; i++) {
+                        int8_t aux_q4sb[8];
+                        const int offset = sb * 24 + i * 12;
+                        decode_q4_Kx8_scales_mins(&q4_ptr[b].scales[offset], &q4sb_mins[i], aux_q4sb);
+                        q4sb_scales[i] = vmovl_s8(vld1_s8(aux_q4sb));
+                    }
+
+                    constexpr int reads_per_sb = 8; // 8 * 16 bytes each => 32 qs * 4 rows
+                    for (int k = 0; k < reads_per_sb; k++) {
+                        const int8x16_t q8_blk0 = vld1q_s8(q8_ptr[b].qs + sb * 256 + 16 * k);
+                        const int8x16_t q8_blk1 = vld1q_s8(q8_ptr[b].qs + sb * 256 + 16 * k + 128);
+
+                        // 0..3 & 32..35
+                        const uint8x16_t q4_0123 = vld1q_u8(q4_ptr[b].qs + sb * QK_K + 32 * k);
+                        const uint8x16_t q4_4567 = vld1q_u8(q4_ptr[b].qs + sb * QK_K + 32 * k + 16);
+
+                        const int8x16_t q4_0123_lo = vreinterpretq_s8_u8(vandq_u8(q4_0123, m4b));
+                        const int8x16_t q4_0123_hi = vreinterpretq_s8_u8(vshrq_n_u8(q4_0123, 4));
+
+                        acc_lo[0] = vdotq_laneq_s32(acc_lo[0], q4_0123_lo, q8_blk0, 0); // 0..3 r0 c0123
+                        acc_lo[1] = vdotq_laneq_s32(acc_lo[1], q4_0123_lo, q8_blk0, 1); // 0..3 r1 c0123
+                        acc_lo[2] = vdotq_laneq_s32(acc_lo[2], q4_0123_lo, q8_blk0, 2); // 0..3 r2 c0123
+                        acc_lo[3] = vdotq_laneq_s32(acc_lo[3], q4_0123_lo, q8_blk0, 3); // 0..3 r3 c0123
+
+                        acc_hi[0] = vdotq_laneq_s32(acc_hi[0], q4_0123_hi, q8_blk1, 0); // 32..35 r0 c0123
+                        acc_hi[1] = vdotq_laneq_s32(acc_hi[1], q4_0123_hi, q8_blk1, 1); // 32..35 r1 c0123
+                        acc_hi[2] = vdotq_laneq_s32(acc_hi[2], q4_0123_hi, q8_blk1, 2); // 32..35 r2 c0123
+                        acc_hi[3] = vdotq_laneq_s32(acc_hi[3], q4_0123_hi, q8_blk1, 3); // 32..35 r3 c0123
+
+                        const int8x16_t q4_4567_lo = vreinterpretq_s8_u8(vandq_u8(q4_4567, m4b));
+                        const int8x16_t q4_4567_hi = vreinterpretq_s8_u8(vshrq_n_u8(q4_4567, 4));
+
+                        acc_lo[4] = vdotq_laneq_s32(acc_lo[4], q4_4567_lo, q8_blk0, 0); // 0..3 r0 c4567
+                        acc_lo[5] = vdotq_laneq_s32(acc_lo[5], q4_4567_lo, q8_blk0, 1); // 0..3 r1 c4567
+                        acc_lo[6] = vdotq_laneq_s32(acc_lo[6], q4_4567_lo, q8_blk0, 2); // 0..3 r2 c4567
+                        acc_lo[7] = vdotq_laneq_s32(acc_lo[7], q4_4567_lo, q8_blk0, 3); // 0..3 r3 c4567
+
+                        acc_hi[4] = vdotq_laneq_s32(acc_hi[4], q4_4567_hi, q8_blk1, 0); // 32..35 r0 c4567
+                        acc_hi[5] = vdotq_laneq_s32(acc_hi[5], q4_4567_hi, q8_blk1, 1); // 32..35 r1 c4567
+                        acc_hi[6] = vdotq_laneq_s32(acc_hi[6], q4_4567_hi, q8_blk1, 2); // 32..35 r2 c4567
+                        acc_hi[7] = vdotq_laneq_s32(acc_hi[7], q4_4567_hi, q8_blk1, 3); // 32..35 r3 c4567
+                    }
+
+                    // Scale and bias application
+                    // acc is stored interleaved to match output layout
+                    const int16x4_t sc_0123_lo = vget_low_s16(q4sb_scales[0]);
+                    const int16x4_t sc_4567_lo = vget_high_s16(q4sb_scales[0]);
+                    const int16x4_t sc_0123_hi = vget_low_s16(q4sb_scales[1]);
+                    const int16x4_t sc_4567_hi = vget_high_s16(q4sb_scales[1]);
+                    for (int row = 0; row < q8_k_blocklen; row++) {
+                        // Bias correction
+                        // row c0123 blk0 and blk1
+                        const float32x4_t sumf_0123 =
+                            vcvtq_f32_s32(vaddq_s32(vmulq_s32(vmovl_s16(sc_0123_lo), acc_lo[row]),
+                                                    vmulq_s32(vmovl_s16(sc_0123_hi), acc_hi[row])));
+                        acc_f32[2 * row] = vfmaq_f32(acc_f32[2 * row], sbd_scale_0123[row], sumf_0123);
+
+                        // row c4567 blk0 and blk1
+                        const float32x4_t sumf_4567 =
+                            vcvtq_f32_s32(vaddq_s32(vmulq_s32(vmovl_s16(sc_4567_lo), acc_lo[row + 4]),
+                                                    vmulq_s32(vmovl_s16(sc_4567_hi), acc_hi[row + 4])));
+                        acc_f32[2 * row + 1] = vfmaq_f32(acc_f32[2 * row + 1], sbd_scale_4567[row], sumf_4567);
+
+                        // Bias
+                        const int16x4_t bsums_vec_lo = vdup_n_s16(bsums_arr[sb][row * 2]);
+                        const int16x4_t bsums_vec_hi = vdup_n_s16(bsums_arr[sb][row * 2 + 1]);
+
+                        // row c0123 blk0 and blk1
+                        bias_acc[2 * row] = vmlal_s16(bias_acc[2 * row], bsums_vec_lo, vget_low_s16(q4sb_mins[0]));
+                        bias_acc[2 * row] = vmlal_s16(bias_acc[2 * row], bsums_vec_hi, vget_low_s16(q4sb_mins[1]));
+
+                        // row c4567 blk0 and blk1
+                        bias_acc[2 * row + 1] =
+                            vmlal_s16(bias_acc[2 * row + 1], bsums_vec_lo, vget_high_s16(q4sb_mins[0]));
+                        bias_acc[2 * row + 1] =
+                            vmlal_s16(bias_acc[2 * row + 1], bsums_vec_hi, vget_high_s16(q4sb_mins[1]));
+                    }
+                } // for sb
+
+                for (int row = 0; row < q8_k_blocklen; row++) {
+                    acc_f32[2 * row] = vmlsq_f32(acc_f32[2 * row], vcvtq_f32_s32(bias_acc[2 * row]), sbd_min_0123[row]);
+                    acc_f32[2 * row + 1] =
+                        vmlsq_f32(acc_f32[2 * row + 1], vcvtq_f32_s32(bias_acc[2 * row + 1]), sbd_min_4567[row]);
+                }
+            } // for b
+
+            for (int i = 0; i < q8_k_blocklen; i++) {
+                int row = y * q8_k_blocklen + i;
+                for (int j = 0; j < 2; j++) {
+                    int col = x * ncols_interleaved + j * 4;
+                    int offset = row * bs + col;
+                    vst1q_f32(s + offset, acc_f32[2 * i + j]);
+                }
+            }
+        } // for x
+    } // for y
+    return;
+#endif // defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
+    ggml_gemm_q4_K_8x4_q8_K_generic(n, s, bs, vx, vy, nr, nc);
+}
+
 void ggml_gemm_q4_K_8x8_q8_K(int n,
                              float * GGML_RESTRICT s,
                              size_t bs,
```
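Structurally, `ggml_gemm_q4_K_8x4_q8_K` walks 4-row strips of quantized activations (`y`) against 8-column interleaved weight groups (`x`), accumulates over superblocks (`b`), and finally writes a 4x8 tile into the row-major output at `row * bs + col`. A scalar loop-structure sketch of that tiling, with the quantization stripped out (hypothetical plain-float inputs, not the kernel's real data layout):

```cpp
#include <cstdio>
#include <vector>

// nr x nc output tiled as 4x8 blocks; bs is the output's leading dimension (bs >= nc).
static void gemm_4x8_tiles(int nr, int nc, int bs, int nk,
                           const std::vector<float> & A,   // nr x nk activations
                           const std::vector<float> & W,   // nk x nc weights
                           std::vector<float> & S) {       // nr x bs output
    for (int y = 0; y < nr / 4; y++) {
        for (int x = 0; x < nc / 8; x++) {
            float tile[4][8] = {};
            for (int k = 0; k < nk; k++) {                 // plays the role of "for b"
                for (int r = 0; r < 4; r++) {
                    for (int c = 0; c < 8; c++) {
                        tile[r][c] += A[(y * 4 + r) * nk + k] * W[k * nc + x * 8 + c];
                    }
                }
            }
            for (int r = 0; r < 4; r++) {
                for (int c = 0; c < 8; c++) {
                    S[(y * 4 + r) * bs + x * 8 + c] = tile[r][c];  // row * bs + col
                }
            }
        }
    }
}

int main() {
    const int nr = 4, nc = 8, bs = 8, nk = 2;
    std::vector<float> A(nr * nk, 1.0f), W(nk * nc, 1.0f), S(nr * bs, 0.0f);
    gemm_4x8_tiles(nr, nc, bs, nk, A, W, S);
    std::printf("%f\n", S[0]); // 2.0
}
```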
package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp

```diff
@@ -1,20 +1,23 @@
 #include "ggml-backend-impl.h"
 
 #if defined(__riscv) && __riscv_xlen == 64
-#include <
-
-
-#ifndef COMPAT_HWCAP_ISA_V
-#define COMPAT_HWCAP_ISA_V (1 << ('V' - 'A'))
-#endif
+#include <asm/hwprobe.h>
+#include <asm/unistd.h>
+#include <unistd.h>
 
 struct riscv64_features {
     bool has_rvv = false;
 
     riscv64_features() {
-
+        struct riscv_hwprobe probe;
+        probe.key = RISCV_HWPROBE_KEY_IMA_EXT_0;
+        probe.value = 0;
+
+        int ret = syscall(__NR_riscv_hwprobe, &probe, 1, 0, NULL, 0);
 
-
+        if (0 == ret) {
+            has_rvv = !!(probe.value & RISCV_HWPROBE_IMA_V);
+        }
     }
 };
 
```
package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h

```diff
@@ -33,10 +33,12 @@
 // repack.cpp
 #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
 #define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
+#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
 #define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
 #define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
 #define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
 #define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
+#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
 #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
 #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
 #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
@@ -44,12 +46,14 @@
 #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
 #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
 #define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
+#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
 #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
 #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
 #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
 #define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
 #elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) || defined(_M_ARM64)
 // repack.cpp
+#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
 #define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
 #define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
 #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
@@ -58,11 +62,14 @@
 #elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
 // repack.cpp
 #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
+#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
 #define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
 #define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
+#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
 #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
 #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
 #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
+#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
 #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
 #elif defined(__POWERPC__) || defined(__powerpc__)
 // ref: https://github.com/ggml-org/llama.cpp/pull/14146#issuecomment-2972561679
@@ -74,10 +81,12 @@
 // repack.cpp
 #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
 #define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
+#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
 #define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
 #define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
 #define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
 #define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
+#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
 #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
 #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
 #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
@@ -85,6 +94,7 @@
 #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
 #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
 #define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
+#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
 #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
 #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
 #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
@@ -99,10 +109,12 @@
 // repack.cpp
 #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
 #define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
+#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
 #define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
 #define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
 #define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
 #define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
+#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
 #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
 #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
 #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
@@ -110,6 +122,7 @@
 #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
 #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
 #define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
+#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
 #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
 #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
 #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
@@ -132,15 +145,18 @@
 // repack.cpp
 #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
 #define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
+#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
 #define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
 #define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
 #define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
+#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
 #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
 #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
 #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
 #define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
 #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
 #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
+#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
 #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
 #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
 #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
@@ -161,10 +177,12 @@
 // repack.cpp
 #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
 #define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
+#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
 #define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
 #define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
 #define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
 #define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
+#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
 #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
 #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
 #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
@@ -172,6 +190,7 @@
 #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
 #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
 #define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
+#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
 #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
 #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
 #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
@@ -194,10 +213,12 @@
 // repack.cpp
 #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
 #define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
+#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
 #define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
 #define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
 #define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
 #define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
+#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
 #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
 #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
 #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
@@ -205,6 +226,7 @@
 #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
 #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
 #define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
+#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
 #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
 #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
 #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
```
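`arch-fallback.h` maps each missing specialized kernel to its `_generic` implementation by renaming the generic symbol at preprocessing time, so the new q8_K 4x4 quantize and q4_K 8x4 gemv/gemm entry points always resolve to something on every architecture. A tiny illustration of that macro pattern (made-up names, not the ggml symbols):

```cpp
#include <cstdio>

#if !defined(HAVE_FAST_KERNEL)
#define my_kernel_generic my_kernel   // fallback: the generic becomes the public name
#endif

void my_kernel_generic(int n) {       // on fallback platforms this *is* my_kernel
    std::printf("generic kernel, n=%d\n", n);
}

#if defined(HAVE_FAST_KERNEL)
void my_kernel(int n) {               // optimized definition provided only where available
    std::printf("optimized kernel, n=%d\n", n);
}
#endif

int main() {
    my_kernel(8);                     // call sites never need to know which variant they got
}
```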
@@ -9766,7 +9766,8 @@ static void ggml_compute_forward_solve_tri_f32(const struct ggml_compute_params
|
|
|
9766
9766
|
}
|
|
9767
9767
|
|
|
9768
9768
|
const float diag = A_batch[i00 * n + i00];
|
|
9769
|
-
|
|
9769
|
+
assert(diag != 0.0f && "Zero diagonal in triangular matrix");
|
|
9770
|
+
|
|
9770
9771
|
X_batch[i00 * k + i01] = (B_batch[i00 * k + i01] - sum) / diag;
|
|
9771
9772
|
}
|
|
9772
9773
|
}
|