@fugood/llama.node 1.3.1 → 1.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +4 -3
- package/package.json +14 -14
- package/scripts/llama.cpp.patch +6 -6
- package/src/llama.cpp/CMakeLists.txt +4 -0
- package/src/llama.cpp/common/CMakeLists.txt +6 -37
- package/src/llama.cpp/common/arg.cpp +7 -0
- package/src/llama.cpp/common/common.cpp +1 -5
- package/src/llama.cpp/common/common.h +2 -1
- package/src/llama.cpp/common/download.cpp +47 -29
- package/src/llama.cpp/common/log.cpp +6 -0
- package/src/llama.cpp/common/log.h +2 -0
- package/src/llama.cpp/ggml/include/ggml.h +71 -0
- package/src/llama.cpp/ggml/src/CMakeLists.txt +16 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +34 -11
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +428 -26
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +50 -16
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +283 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +235 -34
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +289 -317
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +4 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +95 -42
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +16 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +2 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +17 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +10 -0
- package/src/llama.cpp/src/CMakeLists.txt +6 -0
- package/src/llama.cpp/src/llama-arch.cpp +32 -0
- package/src/llama.cpp/src/llama-arch.h +2 -0
- package/src/llama.cpp/src/llama-graph.cpp +2 -1
- package/src/llama.cpp/src/llama-memory-recurrent.cpp +4 -3
- package/src/llama.cpp/src/llama-model.cpp +102 -0
- package/src/llama.cpp/src/llama-model.h +2 -0
- package/src/llama.cpp/src/llama-sampling.cpp +10 -5
- package/src/llama.cpp/src/llama-vocab.cpp +16 -1
- package/src/llama.cpp/src/llama-vocab.h +1 -0
- package/src/llama.cpp/src/models/afmoe.cpp +187 -0
- package/src/llama.cpp/src/models/ernie4-5.cpp +4 -5
- package/src/llama.cpp/src/models/models.h +4 -0
- package/src/llama.cpp/src/models/openai-moe-iswa.cpp +2 -1
- package/src/llama.cpp/src/unicode.cpp +77 -0
package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp

@@ -5,10 +5,13 @@
 #include <assert.h>
 #include <atomic>
 #include <cfloat>
+#include <cmath>
+#include <algorithm>
 #include <stdexcept>
 #include <stdint.h>
 #include <string.h>
 #include <string>
+#include <vector>
 #if defined(__linux__)
 #include <asm/hwcap.h>
 #include <sys/auxv.h>
@@ -38,8 +41,9 @@
 
 struct ggml_kleidiai_context {
     cpu_feature features;
-    ggml_kleidiai_kernels * kernels;
-} static ctx = { CPU_FEATURE_NONE, NULL };
+    ggml_kleidiai_kernels * kernels_q4;
+    ggml_kleidiai_kernels * kernels_q8;
+} static ctx = { CPU_FEATURE_NONE, NULL, NULL };
 
 static const char* cpu_feature_to_string(cpu_feature f) {
     switch (f) {
@@ -73,10 +77,14 @@ static void init_kleidiai_context(void) {
     if (sme_enabled != 0) {
         ctx.features |= ggml_cpu_has_sme() ? CPU_FEATURE_SME : CPU_FEATURE_NONE;
     }
-    ctx.kernels = ggml_kleidiai_select_kernels_q4_0(ctx.features);
+    ctx.kernels_q4 = ggml_kleidiai_select_kernels_q4_0(ctx.features);
+    ctx.kernels_q8 = ggml_kleidiai_select_kernels_q8_0(ctx.features);
 #ifndef NDEBUG
-    if (ctx.kernels) {
-        GGML_LOG_DEBUG("kleidiai: using kernel with CPU feature %s\n", cpu_feature_to_string(ctx.kernels->required_cpu));
+    if (ctx.kernels_q4) {
+        GGML_LOG_DEBUG("kleidiai: using q4 kernel with CPU feature %s\n", cpu_feature_to_string(ctx.kernels_q4->required_cpu));
+    }
+    if (ctx.kernels_q8) {
+        GGML_LOG_DEBUG("kleidiai: using q8 kernel with CPU feature %s\n", cpu_feature_to_string(ctx.kernels_q8->required_cpu));
     }
 #endif
 }
@@ -130,6 +138,9 @@ class tensor_traits : public ggml::cpu::tensor_traits {
         if (kernels->rhs_type == GGML_TYPE_Q4_0) {
             if (!lhs_info->packed_size_ex) return false;
             size = lhs_info->packed_size_ex(m, k, QK4_0, mr, kr, sr);
+        } else if (kernels->rhs_type == GGML_TYPE_Q8_0) {
+            if (!lhs_info->packed_size_ex) return false;
+            size = lhs_info->packed_size_ex(m, k, QK8_0, mr, kr, sr);
         } else if (kernels->rhs_type == GGML_TYPE_F16) {
             if (!lhs_info->packed_size_ex || !kernels->rhs_info.packed_size_ex) return false;
             const int64_t lhs_batch_size0 = op->src[1]->ne[2];
@@ -149,11 +160,13 @@ class tensor_traits : public ggml::cpu::tensor_traits {
         if (dst->op == GGML_OP_MUL_MAT) {
             if (dst->src[0]->type == GGML_TYPE_Q4_0) {
                 return compute_forward_q4_0(params, dst);
+            } else if (dst->src[0]->type == GGML_TYPE_Q8_0) {
+                return compute_forward_q8_0(params, dst);
             } else if (dst->src[0]->type == GGML_TYPE_F16) {
                 return compute_forward_fp16(params, dst);
             }
         } else if (dst->op == GGML_OP_GET_ROWS) {
-            if (dst->src[0]->type == GGML_TYPE_Q4_0) {
+            if (dst->src[0]->type == GGML_TYPE_Q4_0 || dst->src[0]->type == GGML_TYPE_Q8_0) {
                 return compute_forward_get_rows(params, dst);
             }
         }
@@ -400,19 +413,120 @@ class tensor_traits : public ggml::cpu::tensor_traits {
         return true;
     }
 
-    bool compute_forward_get_rows(struct ggml_compute_params * params, struct ggml_tensor * dst) {
-        GGML_ASSERT(dst->src[0]->type == GGML_TYPE_Q4_0);
-        if (!ctx.kernels) {
+    bool compute_forward_q8_0(struct ggml_compute_params * params, struct ggml_tensor * dst) {
+        GGML_ASSERT(dst->src[0]->type == GGML_TYPE_Q8_0);
+
+        const ggml_tensor * src0 = dst->src[0];
+        const ggml_tensor * src1 = dst->src[1];
+
+        GGML_TENSOR_BINARY_OP_LOCALS
+
+        ggml_kleidiai_kernels *kernels = ggml_kleidiai_select_kernels(ctx.features, dst);
+        if (!kernels) {
             return false;
         }
 
+        bool is_gemv = src1->ne[1] == 1;
+        kernel_info * kernel = is_gemv ? &kernels->gemv : &kernels->gemm;
+        lhs_packing_info * lhs_info = is_gemv ? &kernels->gemv_lhs_info : &kernels->gemm_lhs_info;
+
+        if (!kernel || !lhs_info->get_packed_offset_ex || !lhs_info->pack_func_ex ||
+            !kernel->get_rhs_packed_offset_ex || !kernel->run_kernel_ex || !kernel->get_dst_offset) {
+            return false;
+        }
+
+        const int ith = params->ith;
+        const int nth_raw = params->nth;
+        const int nth = nth_raw > 0 ? nth_raw : 1;
+
+        const size_t k = ne00;
+        const size_t m = ne11;
+        const size_t n = ne01;
+
+        size_t mr = kernel->get_mr();
+        size_t kr = kernel->get_kr();
+        size_t sr = kernel->get_sr();
+
+        const uint8_t * lhs = static_cast<const uint8_t *>(src1->data);
+        uint8_t * lhs_packed = static_cast<uint8_t *>(params->wdata);
+        const uint8_t * rhs_packed = static_cast<const uint8_t *>(src0->data);
+
+        const size_t n_step = kernel->get_n_step();
+        const size_t num_n_per_thread = kai_roundup(kai_roundup(n, nth) / nth, n_step);
+        const size_t n_start = ith * num_n_per_thread;
+
+        size_t n_to_process = 0;
+        if (n_start < n) {
+            n_to_process = num_n_per_thread;
+            if ((n_start + n_to_process) > n) {
+                n_to_process = n - n_start;
+            }
+        }
+
+        const size_t num_m_per_thread = kai_roundup(m, mr * nth) / nth;
+        const size_t m_start = ith * num_m_per_thread;
+        size_t m_to_process = num_m_per_thread;
+        if ((m_start + m_to_process) > m) {
+            m_to_process = m - m_start;
+        }
+
+        if (m_start < m) {
+            const size_t src_stride = src1->nb[1];
+            const float * src_ptr = reinterpret_cast<const float *>(lhs + lhs_info->get_offset(m_start, dst->src[1]->nb[1]));
+            const size_t lhs_packed_offset = lhs_info->get_packed_offset_ex(m_start, k, 0, mr, kr, sr);
+            void * lhs_packed_ptr = static_cast<void *>(lhs_packed + lhs_packed_offset);
+
+            lhs_info->pack_func_ex(m_to_process, k, 0, mr, kr, sr, 0, src_ptr, src_stride, lhs_packed_ptr);
+        }
+
+        ggml_barrier(params->threadpool);
+
+        const size_t dst_stride = dst->nb[1];
+        const size_t lhs_packed_offset = lhs_info->get_packed_offset_ex(0, k, 0, mr, kr, sr);
+        const size_t rhs_packed_offset = kernel->get_rhs_packed_offset_ex(n_start, k, 0);
+        const size_t dst_offset = kernel->get_dst_offset(0, n_start, dst_stride);
+        const void * rhs_ptr = static_cast<const void *>(rhs_packed + rhs_packed_offset);
+        const void * lhs_ptr = static_cast<const void *>(lhs_packed + lhs_packed_offset);
+        float * dst_ptr = reinterpret_cast<float *>(static_cast<uint8_t *>(dst->data) + dst_offset);
+
+        if (n_to_process > 0) {
+            kernel->run_kernel_ex(m, n_to_process, k, 0, lhs_ptr, rhs_ptr, dst_ptr, dst_stride,
+                                  sizeof(float), -FLT_MAX, FLT_MAX);
+        }
+
+        return true;
+    }
+
+    bool compute_forward_get_rows(struct ggml_compute_params * params, struct ggml_tensor * dst) {
         const ggml_tensor * src0 = dst->src[0];
         const ggml_tensor * src1 = dst->src[1];
 
         GGML_TENSOR_BINARY_OP_LOCALS
 
-        rhs_packing_info * rhs_info = &ctx.kernels->rhs_info;
-        kernel_info * kernel = &ctx.kernels->gemm;
+        ggml_kleidiai_kernels * kernels = nullptr;
+        size_t block_len = 0;
+        size_t num_bytes_multiplier = 0;
+
+        if (dst->src[0]->type == GGML_TYPE_Q4_0) {
+            if (!ctx.kernels_q4) {
+                return false;
+            }
+            kernels = ctx.kernels_q4;
+            block_len = QK4_0;
+            num_bytes_multiplier = sizeof(uint16_t);
+        } else if (dst->src[0]->type == GGML_TYPE_Q8_0) {
+            if (!ctx.kernels_q8) {
+                return false;
+            }
+            kernels = ctx.kernels_q8;
+            block_len = QK8_0;
+            num_bytes_multiplier = sizeof(float);
+        } else {
+            return false;
+        }
+
+        rhs_packing_info * rhs_info = &kernels->rhs_info;
+        kernel_info * kernel = &kernels->gemm;
         if (!rhs_info->to_float || !kernel->get_nr) {
             return false;
         }
@@ -423,8 +537,7 @@ class tensor_traits : public ggml::cpu::tensor_traits {
         const size_t block_rows = kernel->get_nr();
         const size_t kr = kernel->get_kr();
 
-        const size_t num_bytes_multiplier = sizeof(uint16_t);
-        const size_t packed_stride = rhs_info->packed_stride(nc, block_rows, kr, QK4_0);
+        const size_t packed_stride = rhs_info->packed_stride(nc, block_rows, kr, block_len);
 
         const int ith = params->ith;
         const int nth = params->nth;
@@ -439,7 +552,7 @@ class tensor_traits : public ggml::cpu::tensor_traits {
             GGML_ASSERT(row_idx >= 0 && row_idx < src0->ne[1]);
 
             float *out = (float *)((char *)dst->data + i * nb1);
-            rhs_info->to_float(src0->data, row_idx, nc, out, block_rows, packed_stride, kr, QK4_0, num_bytes_multiplier);
+            rhs_info->to_float(src0->data, row_idx, nc, out, block_rows, packed_stride, kr, block_len, num_bytes_multiplier);
         }
 
         return true;
@@ -447,21 +560,91 @@ class tensor_traits : public ggml::cpu::tensor_traits {
 
 public:
     int repack(struct ggml_tensor * tensor, const void * data, size_t data_size) {
-        GGML_ASSERT(tensor->type == GGML_TYPE_Q4_0);
-        GGML_ASSERT(ctx.kernels);
         const size_t n = tensor->ne[1];
         const size_t k = tensor->ne[0];
-        size_t nr = ctx.kernels->gemm.get_nr();
-        size_t kr = ctx.kernels->gemm.get_kr();
-        size_t sr = ctx.kernels->gemm.get_sr();
 
-        struct kai_rhs_pack_qs4cxs1s0_param params;
-        params.lhs_zero_point = 1;
-        params.rhs_zero_point = 8;
-
+        if (tensor->type == GGML_TYPE_Q4_0) {
+            if (!ctx.kernels_q4) {
+                return -1;
+            }
+            size_t nr = ctx.kernels_q4->gemm.get_nr();
+            size_t kr = ctx.kernels_q4->gemm.get_kr();
+            size_t sr = ctx.kernels_q4->gemm.get_sr();
+
+            struct kai_rhs_pack_qs4cxs1s0_param params;
+            params.lhs_zero_point = 1;
+            params.rhs_zero_point = 8;
+            ctx.kernels_q4->rhs_info.pack_func_ex(1, n, k, nr, kr, sr, QK4_0, 0,
+                                                  static_cast<const uint8_t *>(data),
+                                                  nullptr, nullptr, tensor->data, 0, &params);
+            GGML_UNUSED(data_size);
+            return 0;
+        } else if (tensor->type == GGML_TYPE_Q8_0) {
+            if (!ctx.kernels_q8) {
+                return -1;
+            }
+
+            const size_t row_stride = tensor->nb[1];
+            const size_t k_blocks = (k + QK8_0 - 1) / QK8_0;
+
+            std::vector<int8_t> qdata(n * k, 0);
+            std::vector<float> scales(n, 0.0f);
+
+            for (size_t row = 0; row < n; ++row) {
+                const auto * row_blocks = reinterpret_cast<const block_q8_0 *>(
+                    static_cast<const uint8_t *>(data) + row * row_stride);
+
+                float max_abs = 0.0f;
+                for (size_t block = 0; block < k_blocks; ++block) {
+                    const block_q8_0 & blk = row_blocks[block];
+                    const float d = GGML_FP16_TO_FP32(blk.d);
+                    for (size_t l = 0; l < QK8_0; ++l) {
+                        const size_t linear_idx = block * QK8_0 + l;
+                        if (linear_idx >= k) {
+                            break;
+                        }
+                        const float value = d * blk.qs[l];
+                        max_abs = std::max(max_abs, std::fabs(value));
+                    }
+                }
+
+                float scale = max_abs > 0.0f ? max_abs / 127.0f : 0.0f;
+                scales[row] = scale;
+                const float inv_scale = scale > 0.0f ? 1.0f / scale : 0.0f;
+
+                for (size_t block = 0; block < k_blocks; ++block) {
+                    const block_q8_0 & blk = row_blocks[block];
+                    const float d = GGML_FP16_TO_FP32(blk.d);
+                    for (size_t l = 0; l < QK8_0; ++l) {
+                        const size_t linear_idx = block * QK8_0 + l;
+                        if (linear_idx >= k) {
+                            break;
+                        }
+                        const float value = d * blk.qs[l];
+                        int32_t q = scale > 0.0f ? static_cast<int32_t>(std::lround(value * inv_scale)) : 0;
+                        q = std::clamp(q, -127, 127);
+                        qdata[row * k + linear_idx] = static_cast<int8_t>(q);
+                    }
+                }
+            }
+
+            size_t nr = ctx.kernels_q8->gemm.get_nr();
+            size_t kr = ctx.kernels_q8->gemm.get_kr();
+            size_t sr = ctx.kernels_q8->gemm.get_sr();
+
+            struct kai_rhs_pack_qsi8cx_params params;
+            params.lhs_zero_point = 1;
+            params.scale_multiplier = 1.0f;
+
+            ctx.kernels_q8->rhs_info.pack_func_ex(1, n, k, nr, kr, sr, 0, 0,
+                                                  qdata.data(), nullptr, scales.data(),
+                                                  tensor->data, 0, &params);
+            GGML_UNUSED(data_size);
+            return 0;
+        }
 
-        return 0;
         GGML_UNUSED(data_size);
+        return -1;
     }
 };
 
@@ -518,27 +701,45 @@ static size_t ggml_backend_cpu_kleidiai_buffer_type_get_alignment(ggml_backend_b
 }
 
 static size_t ggml_backend_cpu_kleidiai_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor) {
-    GGML_ASSERT(tensor->type == GGML_TYPE_Q4_0);
-    GGML_ASSERT(ctx.kernels);
+    GGML_UNUSED(buft);
 
-    const size_t n = tensor->ne[1];
-    const size_t k = tensor->ne[0];
-    const size_t nr = ctx.kernels->gemm.get_nr();
-    const size_t kr = ctx.kernels->gemm.get_kr();
+    const size_t n = tensor->ne[1];
+    const size_t k = tensor->ne[0];
+
+    ggml_kleidiai_kernels * kernels = nullptr;
+    size_t block_len = 0;
+
+    if (tensor->type == GGML_TYPE_Q4_0) {
+        GGML_ASSERT(ctx.kernels_q4);
+        kernels = ctx.kernels_q4;
+        block_len = QK4_0;
+    } else if (tensor->type == GGML_TYPE_Q8_0) {
+        GGML_ASSERT(ctx.kernels_q8);
+        kernels = ctx.kernels_q8;
+        block_len = QK8_0;
+    } else {
+        return 0;
+    }
 
-
+    const size_t nr = kernels->gemm.get_nr();
+    const size_t kr = kernels->gemm.get_kr();
+    const size_t packed = kernels->rhs_info.packed_size_ex(n, k, nr, kr, block_len);
+    const size_t raw = ggml_nbytes(tensor);
 
-    GGML_UNUSED(buft);
+    return packed > raw ? packed : raw;
 }
 
 namespace ggml::cpu::kleidiai {
 class extra_buffer_type : ggml::cpu::extra_buffer_type {
     bool supports_op(ggml_backend_dev_t, const struct ggml_tensor * op) override {
         if ((op->op == GGML_OP_MUL_MAT || op->op == GGML_OP_GET_ROWS) &&
-            op->src[0]->type == GGML_TYPE_Q4_0 &&
+            (op->src[0]->type == GGML_TYPE_Q4_0 || op->src[0]->type == GGML_TYPE_Q8_0) &&
            op->src[0]->buffer &&
             (ggml_n_dims(op->src[0]) == 2) &&
-            op->src[0]->buffer->buft == ggml_backend_cpu_kleidiai_buffer_type()
+            op->src[0]->buffer->buft == ggml_backend_cpu_kleidiai_buffer_type()) {
+            if (((op->src[0]->type == GGML_TYPE_Q4_0) ? ctx.kernels_q4 : ctx.kernels_q8) == nullptr) {
+                return false;
+            }
             if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
                 return false;
             }
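Note on the Q8_0 repack() branch above: ggml's block_q8_0 stores an fp16 scale per 32-value block, while the pack call here is handed a single scales[row] entry per output row, so the branch first dequantizes each row, derives one per-row scale from the row's maximum absolute value, and re-quantizes to int8 in [-127, 127]. The following is a minimal standalone sketch of that row-wise re-quantization, not the package's code: BlockQ8 and requantize_row are illustrative stand-ins (the real code uses block_q8_0 with an fp16 scale decoded via GGML_FP16_TO_FP32), and QK8_0 = 32 matches ggml.

// Standalone sketch of the row-wise Q8_0 re-quantization (simplified types). C++17.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

constexpr size_t QK8_0 = 32;

struct BlockQ8 {              // simplified block: scale stored directly as float
    float  d;                 // per-block scale
    int8_t qs[QK8_0];         // per-block quantized values
};

// Convert one row of k values from per-block scales to one per-row scale + int8 data.
static void requantize_row(const BlockQ8 * blocks, size_t k, int8_t * out_q, float & out_scale) {
    const size_t k_blocks = (k + QK8_0 - 1) / QK8_0;

    // Pass 1: largest absolute dequantized value in the row.
    float max_abs = 0.0f;
    for (size_t b = 0; b < k_blocks; ++b) {
        for (size_t l = 0; l < QK8_0 && b * QK8_0 + l < k; ++l) {
            max_abs = std::max(max_abs, std::fabs(blocks[b].d * blocks[b].qs[l]));
        }
    }

    // Pass 2: re-quantize against the row-wide scale, clamped to [-127, 127].
    out_scale = max_abs > 0.0f ? max_abs / 127.0f : 0.0f;
    const float inv = out_scale > 0.0f ? 1.0f / out_scale : 0.0f;
    for (size_t b = 0; b < k_blocks; ++b) {
        for (size_t l = 0; l < QK8_0 && b * QK8_0 + l < k; ++l) {
            const int32_t q = std::clamp(
                static_cast<int32_t>(std::lround(blocks[b].d * blocks[b].qs[l] * inv)), -127, 127);
            out_q[b * QK8_0 + l] = static_cast<int8_t>(q);
        }
    }
}

int main() {
    // One row of k = 64 values in two blocks with different per-block scales.
    const size_t k = 64;
    std::vector<BlockQ8> row(2);
    row[0].d = 0.10f;
    row[1].d = 0.02f;
    for (size_t l = 0; l < QK8_0; ++l) {
        row[0].qs[l] = static_cast<int8_t>(l);
        row[1].qs[l] = static_cast<int8_t>(-static_cast<int>(l));
    }

    std::vector<int8_t> q(k);
    float scale = 0.0f;
    requantize_row(row.data(), k, q.data(), scale);
    std::printf("row scale = %f, q[31] = %d, q[63] = %d\n", scale, q[31], q[63]);
    return 0;
}

Compiled as C++17, the example prints the derived row scale and two re-quantized values; the second block's entries end up expressed against the first block's larger scale, which is exactly the conversion the new repack branch performs before packing.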