@fugood/llama.node 1.4.7 → 1.4.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +8 -0
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +23 -24
- package/src/LlamaContext.cpp +4 -2
- package/src/llama.cpp/common/CMakeLists.txt +2 -0
- package/src/llama.cpp/common/arg.cpp +470 -223
- package/src/llama.cpp/common/arg.h +43 -2
- package/src/llama.cpp/common/chat-peg-parser.cpp +16 -2
- package/src/llama.cpp/common/chat.cpp +140 -0
- package/src/llama.cpp/common/common.cpp +130 -67
- package/src/llama.cpp/common/common.h +44 -17
- package/src/llama.cpp/common/console.cpp +98 -18
- package/src/llama.cpp/common/console.h +30 -8
- package/src/llama.cpp/common/download.cpp +69 -25
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +132 -3
- package/src/llama.cpp/common/json-schema-to-grammar.h +20 -0
- package/src/llama.cpp/common/log.cpp +5 -0
- package/src/llama.cpp/common/log.h +1 -0
- package/src/llama.cpp/common/peg-parser.cpp +1 -1
- package/src/llama.cpp/common/preset.cpp +206 -0
- package/src/llama.cpp/common/preset.h +32 -0
- package/src/llama.cpp/common/sampling.cpp +67 -54
- package/src/llama.cpp/common/sampling.h +8 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +4 -0
- package/src/llama.cpp/ggml/include/ggml-alloc.h +9 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +1 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
- package/src/llama.cpp/ggml/include/ggml.h +7 -8
- package/src/llama.cpp/ggml/src/CMakeLists.txt +3 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +285 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +28 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +111 -45
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +288 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +41 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +125 -22
- package/src/llama.cpp/include/llama.h +18 -1
- package/src/llama.cpp/src/llama-arch.cpp +1890 -2248
- package/src/llama.cpp/src/llama-arch.h +9 -2
- package/src/llama.cpp/src/llama-batch.cpp +12 -2
- package/src/llama.cpp/src/llama-batch.h +4 -2
- package/src/llama.cpp/src/llama-context.cpp +93 -23
- package/src/llama.cpp/src/llama-context.h +8 -2
- package/src/llama.cpp/src/llama-graph.cpp +84 -16
- package/src/llama.cpp/src/llama-graph.h +17 -4
- package/src/llama.cpp/src/llama-hparams.cpp +6 -0
- package/src/llama.cpp/src/llama-hparams.h +5 -1
- package/src/llama.cpp/src/llama-impl.cpp +4 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +90 -42
- package/src/llama.cpp/src/llama-kv-cache.h +19 -2
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +1 -1
- package/src/llama.cpp/src/llama-mmap.cpp +123 -28
- package/src/llama.cpp/src/llama-mmap.h +5 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +58 -13
- package/src/llama.cpp/src/llama-model-loader.h +2 -0
- package/src/llama.cpp/src/llama-model.cpp +110 -49
- package/src/llama.cpp/src/llama-model.h +1 -0
- package/src/llama.cpp/src/llama-quant.cpp +1 -1
- package/src/llama.cpp/src/llama-sampling.cpp +16 -0
- package/src/llama.cpp/src/llama-vocab.cpp +2 -1
- package/src/llama.cpp/src/llama.cpp +665 -1
- package/src/llama.cpp/src/models/deepseek2.cpp +9 -5
- package/src/llama.cpp/src/models/glm4-moe.cpp +28 -11
- package/src/llama.cpp/src/models/glm4.cpp +27 -4
- package/src/llama.cpp/src/models/models.h +5 -5
- package/src/llama.cpp/src/models/nemotron-h.cpp +35 -6
- package/src/llama.cpp/src/models/qwen2.cpp +12 -3
- package/src/llama.cpp/src/models/qwen3next.cpp +81 -266
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c

@@ -81,6 +81,11 @@ struct ggml_arm_arch_features_type {
 } ggml_arm_arch_features = { 0 };
 #endif

+#if defined(__riscv)
+struct ggml_riscv_arch_features_type {
+    int rvv_vlen;
+} ggml_riscv_arch_features = { 0 };
+#endif

 #if defined(_WIN32)

@@ -187,6 +192,9 @@ typedef void * thread_ret_t;

 typedef pthread_t ggml_thread_t;

+#define GGML_THREADPOOL_N_THREADS_MASK (0xffffU)
+#define GGML_THREADPOOL_N_THREADS_BITS (16)
+
 #if defined(__APPLE__)
 #include <unistd.h>
 #include <mach/mach.h>
@@ -449,7 +457,7 @@ struct ggml_threadpool {
     struct ggml_cplan * cplan;

     // synchronization primitives
-    atomic_int n_graph; //
+    atomic_int n_graph; // updated when there is work to be done (i.e each graph) holds graph and active thread counts.
     atomic_int GGML_CACHE_ALIGN n_barrier;
     atomic_int GGML_CACHE_ALIGN n_barrier_passed;
     atomic_int GGML_CACHE_ALIGN current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.
@@ -457,12 +465,10 @@ struct ggml_threadpool {
     // these are atomic as an annotation for thread-sanitizer
     atomic_bool stop; // Used for stopping the threadpool altogether
     atomic_bool pause; // Used for pausing the threadpool or individual threads
-    atomic_int
+    atomic_int abort; // Used for aborting processing of a graph

     struct ggml_compute_state * workers; // per thread state
-    int
-    atomic_int n_threads_cur; // number of threads used in the current graph
-
+    int n_threads; // Number of threads in the pool
     int32_t prio; // Scheduling priority
     uint32_t poll; // Polling level (0 - no polling)

@@ -539,7 +545,7 @@ struct ggml_state {
 static struct ggml_state g_state = {0};

 void ggml_barrier(struct ggml_threadpool * tp) {
-    int n_threads = atomic_load_explicit(&tp->
+    int n_threads = atomic_load_explicit(&tp->n_graph, memory_order_relaxed) & GGML_THREADPOOL_N_THREADS_MASK;
     if (n_threads == 1) {
         return;
     }
@@ -556,7 +562,7 @@ void ggml_barrier(struct ggml_threadpool * tp) {
         // last thread
         atomic_store_explicit(&tp->n_barrier, 0, memory_order_relaxed);

-        // exit barrier (
+        // exit barrier (full seq-cst fence)
         atomic_fetch_add_explicit(&tp->n_barrier_passed, 1, memory_order_seq_cst);
         return;
     }
@@ -702,6 +708,15 @@ static void ggml_init_arm_arch_features(void) {}
 #endif
 #endif // __ARM_ARCH

+#if defined(__riscv) && defined(__riscv_v_intrinsic)
+#include <riscv_vector.h>
+static void ggml_init_riscv_arch_features(void) {
+    ggml_riscv_arch_features.rvv_vlen = __riscv_vlenb();
+}
+#else
+static void ggml_init_riscv_arch_features(void) {}
+#endif
+
 struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) {
     GGML_ASSERT(!ggml_get_no_alloc(ctx));

@@ -2628,7 +2643,7 @@ static void ggml_thread_cpumask_next(const bool * global_mask, bool * local_mask
 void ggml_threadpool_free(struct ggml_threadpool* threadpool) {
     if (!threadpool) return;

-    const int n_threads = threadpool->
+    const int n_threads = threadpool->n_threads;

 #ifndef GGML_USE_OPENMP
     struct ggml_compute_state* workers = threadpool->workers;
@@ -2704,7 +2719,7 @@ struct ggml_cplan ggml_graph_plan(
         //GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %d\n", n_threads);
     }
     if (n_threads <= 0) {
-        n_threads = threadpool ? threadpool->
+        n_threads = threadpool ? threadpool->n_threads : GGML_DEFAULT_N_THREADS;
     }

 #if defined(__EMSCRIPTEN__) && !defined(__EMSCRIPTEN_PTHREADS__)
@@ -2912,12 +2927,14 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {

     struct ggml_compute_params params = {
         /*.ith =*/ state->ith,
-        /*.nth =*/ atomic_load_explicit(&tp->
+        /*.nth =*/ atomic_load_explicit(&tp->n_graph, memory_order_relaxed) & GGML_THREADPOOL_N_THREADS_MASK,
         /*.wsize =*/ cplan->work_size,
         /*.wdata =*/ cplan->work_data,
         /*.threadpool=*/ tp,
     };

+    GGML_PRINT_DEBUG("thread #%d compute-start cplan %p last-graph %d \n", state->ith, cplan, state->last_graph);
+
     for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) {
         struct ggml_tensor * node = cgraph->nodes[node_n];

@@ -2939,6 +2956,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
         }
     }

+    GGML_PRINT_DEBUG("thread #%d compute-done cplan %p last-graph %d \n", state->ith, cplan, state->last_graph);
+
     ggml_barrier(state->threadpool);

     return 0;
@@ -2946,27 +2965,23 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {

 #ifndef GGML_USE_OPENMP

-// check if thread is active
-static inline bool ggml_graph_compute_thread_active(struct ggml_compute_state * state) {
-    struct ggml_threadpool * threadpool = state->threadpool;
-    int n_threads = atomic_load_explicit(&threadpool->n_threads_cur, memory_order_relaxed);
-    return (state->ith < n_threads);
-}
-
 // check if thread is ready to proceed (exit from polling or sleeping)
+// returns true if loops should exit, sets state->pending to indicate new work
 static inline bool ggml_graph_compute_thread_ready(struct ggml_compute_state * state) {
     struct ggml_threadpool * threadpool = state->threadpool;

     if (state->pending || threadpool->stop || threadpool->pause) { return true; }

     // check for new graph/work
-    int
-
-
-    state->
+    int n_graph = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed);
+    int n_threads = n_graph & GGML_THREADPOOL_N_THREADS_MASK;
+    if (n_graph != state->last_graph) {
+        state->pending = (state->ith < n_threads);
+        state->last_graph = n_graph;
+        return true;
     }

-    return
+    return false;
 }

 // sync thread state after polling
@@ -2983,11 +2998,6 @@ static inline void ggml_graph_compute_thread_sync(struct ggml_compute_state * st
 static inline bool ggml_graph_compute_poll_for_work(struct ggml_compute_state * state) {
     struct ggml_threadpool * threadpool = state->threadpool;

-    // Skip polling for unused threads
-    if (!ggml_graph_compute_thread_active(state)) {
-        return state->pending;
-    }
-
     // This seems to make 0 ... 100 a decent range for polling level across modern processors.
     // Perhaps, we can adjust it dynamically based on load and things.
     const uint64_t n_rounds = 1024UL * 128 * threadpool->poll;
@@ -3049,7 +3059,6 @@ static thread_ret_t ggml_graph_compute_secondary_thread(void* data) {
         ggml_graph_compute_check_for_work(state);
         if (state->pending) {
             state->pending = false;
-
             ggml_graph_compute_thread(state);
         }
     }
@@ -3064,14 +3073,15 @@ static void ggml_graph_compute_kickoff(struct ggml_threadpool * threadpool, int

     ggml_mutex_lock(&threadpool->mutex);

-
+    // Update the number of active threads and the graph count
+    int n_graph = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed) >> GGML_THREADPOOL_N_THREADS_BITS;
+    n_graph = ((n_graph + 1) << GGML_THREADPOOL_N_THREADS_BITS) | (n_threads & GGML_THREADPOOL_N_THREADS_MASK);

-
-    atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
+    GGML_PRINT_DEBUG("compute-kickoff: n_threads %d n_graph %d\n", n_threads, n_graph);

     // Indicate the graph is ready to be processed
     // We need the full seq-cst fence here because of the polling threads (used in thread_sync)
-
+    atomic_store_explicit(&threadpool->n_graph, n_graph, memory_order_seq_cst);

     if (threadpool->pause) {
         // Update main thread prio and affinity to match the threadpool settings
@@ -3109,8 +3119,7 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
     threadpool->pause = tpp->paused;
     threadpool->abort = -1;
     threadpool->workers = NULL;
-    threadpool->
-    threadpool->n_threads_cur = tpp->n_threads;
+    threadpool->n_threads = tpp->n_threads;
     threadpool->poll = tpp->poll;
     threadpool->prio = tpp->prio;
     threadpool->ec = GGML_STATUS_SUCCESS;
@@ -3205,7 +3214,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
         {
             // update the number of threads from the actual number of threads that we got from OpenMP
             n_threads = omp_get_num_threads();
-            atomic_store_explicit(&threadpool->
+            atomic_store_explicit(&threadpool->n_graph, n_threads, memory_order_relaxed);
         }

         // Apply thread CPU mask and priority
@@ -3218,13 +3227,13 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
             ggml_graph_compute_thread(&threadpool->workers[ith]);
         }
     } else {
-        atomic_store_explicit(&threadpool->
+        atomic_store_explicit(&threadpool->n_graph, 1, memory_order_relaxed);
         ggml_graph_compute_thread(&threadpool->workers[0]);
     }
 #else
-    if (n_threads > threadpool->
-        GGML_LOG_WARN("cplan requested more threads (%d) than available (%d)\n", n_threads, threadpool->
-        n_threads = threadpool->
+    if (n_threads > threadpool->n_threads) {
+        GGML_LOG_WARN("cplan requested more threads (%d) than available (%d)\n", n_threads, threadpool->n_threads);
+        n_threads = threadpool->n_threads;
     }

     // Kick all threads to start the new graph
@@ -3311,13 +3320,33 @@ void ggml_cpu_fp16_to_fp32(const ggml_fp16_t * x, float * y, int64_t n) {
         __m128 y_vec = _mm_cvtph_ps(x_vec);
         _mm_storeu_ps(y + i, y_vec);
     }
-
-
-
-
-
-
+
+#elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfhmin)
+    // calculate step size
+    const int epr = __riscv_vsetvlmax_e16m2();
+    const int step = epr * 2;
+    const int np = (n & ~(step - 1));
+
+    // unroll by 2
+    for (; i < np; i += step) {
+        vfloat16m2_t ax0 = __riscv_vle16_v_f16m2((const _Float16*)x + i, epr);
+        vfloat32m4_t ay0 = __riscv_vfwcvt_f_f_v_f32m4(ax0, epr);
+        __riscv_vse32_v_f32m4(y + i, ay0, epr);
+
+        vfloat16m2_t ax1 = __riscv_vle16_v_f16m2((const _Float16*)x + i + epr, epr);
+        vfloat32m4_t ay1 = __riscv_vfwcvt_f_f_v_f32m4(ax1, epr);
+        __riscv_vse32_v_f32m4(y + i + epr, ay1, epr);
+    }
+
+    // leftovers
+    int vl;
+    for (i = np; i < n; i += vl) {
+        vl = __riscv_vsetvl_e16m2(n - i);
+        vfloat16m2_t ax0 = __riscv_vle16_v_f16m2((const _Float16*)x + i, vl);
+        vfloat32m4_t ay0 = __riscv_vfwcvt_f_f_v_f32m4(ax0, vl);
+        __riscv_vse32_v_f32m4(y + i, ay0, vl);
     }
+
 #endif

     for (; i < n; ++i) {
@@ -3362,6 +3391,31 @@ void ggml_cpu_bf16_to_fp32(const ggml_bf16_t * x, float * y, int64_t n) {
                         (const __m128i *)(x + i))),
                 16)));
     }
+#elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfbfmin)
+    // calculate step size
+    const int epr = __riscv_vsetvlmax_e16m2();
+    const int step = epr * 2;
+    const int np = (n & ~(step - 1));
+
+    // unroll by 2
+    for (; i < np; i += step) {
+        vbfloat16m2_t ax0 = __riscv_vle16_v_bf16m2((const __bf16*)x + i, epr);
+        vfloat32m4_t ay0 = __riscv_vfwcvtbf16_f_f_v_f32m4(ax0, epr);
+        __riscv_vse32_v_f32m4(y + i, ay0, epr);
+
+        vbfloat16m2_t ax1 = __riscv_vle16_v_bf16m2((const __bf16*)x + i + epr, epr);
+        vfloat32m4_t ay1 = __riscv_vfwcvtbf16_f_f_v_f32m4(ax1, epr);
+        __riscv_vse32_v_f32m4(y + i + epr, ay1, epr);
+    }
+
+    // leftovers
+    int vl;
+    for (i = np; i < n; i += vl) {
+        vl = __riscv_vsetvl_e16m2(n - i);
+        vbfloat16m2_t ax0 = __riscv_vle16_v_bf16m2((const __bf16*)x + i, vl);
+        vfloat32m4_t ay0 = __riscv_vfwcvtbf16_f_f_v_f32m4(ax0, vl);
+        __riscv_vse32_v_f32m4(y + i, ay0, vl);
+    }
 #endif
     for (; i < n; i++) {
         y[i] = GGML_BF16_TO_FP32(x[i]);
@@ -3464,6 +3518,14 @@ int ggml_cpu_has_riscv_v(void) {
 #endif
 }

+int ggml_cpu_get_rvv_vlen(void) {
+#if defined(__riscv) && defined(__riscv_v_intrinsic)
+    return ggml_riscv_arch_features.rvv_vlen;
+#else
+    return 0;
+#endif
+}
+
 int ggml_cpu_has_f16c(void) {
 #if defined(__F16C__)
     return 1;
@@ -3630,6 +3692,10 @@ void ggml_cpu_init(void) {
         ggml_init_arm_arch_features();
 #endif

+#if defined(__riscv)
+        ggml_init_riscv_arch_features();
+#endif
+
         is_first_call = false;
     }

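Note: the threadpool rework above drops the separate n_threads_cur field and folds the active-thread count and a per-kickoff graph sequence number into the single n_graph atomic: the low 16 bits carry the thread count, the upper bits carry the graph counter. A minimal standalone sketch of that packing, reusing the constants from the diff (pack_graph is an illustrative helper, not a ggml function):

    #include <stdint.h>
    #include <stdio.h>

    /* same constants as in the diff: low 16 bits = active threads,
     * remaining bits = graph sequence number */
    #define GGML_THREADPOOL_N_THREADS_MASK (0xffffU)
    #define GGML_THREADPOOL_N_THREADS_BITS (16)

    /* illustrative helper mirroring ggml_graph_compute_kickoff's update */
    static uint32_t pack_graph(uint32_t prev, int n_threads) {
        uint32_t seq = prev >> GGML_THREADPOOL_N_THREADS_BITS;
        return ((seq + 1) << GGML_THREADPOOL_N_THREADS_BITS) |
               ((uint32_t) n_threads & GGML_THREADPOOL_N_THREADS_MASK);
    }

    int main(void) {
        uint32_t n_graph = 0;
        n_graph = pack_graph(n_graph, 8);   /* kickoff graph #1 with 8 threads */
        n_graph = pack_graph(n_graph, 4);   /* kickoff graph #2 with 4 threads */

        int n_threads = n_graph & GGML_THREADPOOL_N_THREADS_MASK;   /* -> 4 */
        int seq       = n_graph >> GGML_THREADPOOL_N_THREADS_BITS;  /* -> 2 */
        printf("threads=%d graph=%d\n", n_threads, seq);
        return 0;
    }

Workers compare the loaded n_graph against their last_graph to detect new work, and mask out the low bits to learn how many threads participate in the current graph, so a single seq-cst store in the kickoff path publishes both facts at once.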
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp

@@ -583,6 +583,10 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
         if (ggml_cpu_has_riscv_v()) {
             features.push_back({ "RISCV_V", "1" });
         }
+        if (ggml_cpu_get_rvv_vlen() > 0) {
+            static std::string rvv_vlen = std::to_string(ggml_cpu_get_rvv_vlen());
+            features.push_back({ "RVV_VLEN", rvv_vlen.c_str() });
+        }
         if (ggml_cpu_has_vsx()) {
             features.push_back({ "VSX", "1" });
         }
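Note: with this change the CPU backend's feature list reports the runtime RVV vector length alongside the existing RISCV_V flag. A hedged sketch of checking the same values directly (it assumes ggml_cpu_get_rvv_vlen() ends up declared next to ggml_cpu_has_riscv_v() in ggml-cpu.h, which is the +1 line header change in this release; on non-RISC-V builds it returns 0):

    #include <stdio.h>
    #include "ggml-cpu.h"

    int main(void) {
        printf("RISCV_V  : %d\n", ggml_cpu_has_riscv_v());
        printf("RVV_VLEN : %d\n", ggml_cpu_get_rvv_vlen());
        /* the Q4_0 8x8 repack gate added in repack.cpp additionally requires
         * ggml_cpu_get_rvv_vlen() >= QK4_0 */
        return 0;
    }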
package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp

@@ -692,6 +692,100 @@ void ggml_gemv_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
     }
 }

+void ggml_gemv_q8_0_4x4_q8_0_generic(int n,
+                                     float * GGML_RESTRICT s,
+                                     size_t bs,
+                                     const void * GGML_RESTRICT vx,
+                                     const void * GGML_RESTRICT vy,
+                                     int nr,
+                                     int nc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+    const int ncols_interleaved = 4;
+    const int blocklen = 4;
+
+    assert(nr == 1);
+    assert(n % qk == 0);
+    assert(nc % ncols_interleaved == 0);
+
+    UNUSED(bs);
+    UNUSED(nr);
+
+    float sumf[4];
+    int sumi;
+
+    const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+    for (int x = 0; x < nc / ncols_interleaved; x++) {
+        const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
+
+        for (int j = 0; j < ncols_interleaved; j++) {
+            sumf[j] = 0.0;
+        }
+        for (int l = 0; l < nb; l++) {
+            for (int k = 0; k < (qk / blocklen); k++) {
+                for (int j = 0; j < ncols_interleaved; j++) {
+                    sumi = 0;
+                    for (int i = 0; i < blocklen; ++i) {
+                        const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
+                        sumi += v0 * a_ptr[l].qs[k * blocklen + i];
+                    }
+                    sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
+                }
+            }
+        }
+        for (int j = 0; j < ncols_interleaved; j++) {
+            s[x * ncols_interleaved + j] = sumf[j];
+        }
+    }
+}
+
+void ggml_gemv_q8_0_4x8_q8_0_generic(int n,
+                                     float * GGML_RESTRICT s,
+                                     size_t bs,
+                                     const void * GGML_RESTRICT vx,
+                                     const void * GGML_RESTRICT vy,
+                                     int nr,
+                                     int nc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+    const int ncols_interleaved = 4;
+    const int blocklen = 8;
+
+    assert(nr == 1);
+    assert(n % qk == 0);
+    assert(nc % ncols_interleaved == 0);
+
+    UNUSED(bs);
+    UNUSED(nr);
+
+    float sumf[4];
+    int sumi;
+
+    const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+    for (int x = 0; x < nc / ncols_interleaved; x++) {
+        const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
+
+        for (int j = 0; j < ncols_interleaved; j++) {
+            sumf[j] = 0.0;
+        }
+        for (int l = 0; l < nb; l++) {
+            for (int k = 0; k < (qk / blocklen); k++) {
+                for (int j = 0; j < ncols_interleaved; j++) {
+                    sumi = 0;
+                    for (int i = 0; i < blocklen; ++i) {
+                        const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
+                        sumi += v0 * a_ptr[l].qs[k * blocklen + i];
+                    }
+                    sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
+                }
+            }
+        }
+        for (int j = 0; j < ncols_interleaved; j++) {
+            s[x * ncols_interleaved + j] = sumf[j];
+        }
+    }
+}
+
 void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
     const int qk = QK8_0;
     const int nb = n / qk;
@@ -1219,8 +1313,129 @@ void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
     }
 }

+void ggml_gemm_q8_0_4x4_q8_0_generic(int n,
+                                     float * GGML_RESTRICT s,
+                                     size_t bs,
+                                     const void * GGML_RESTRICT vx,
+                                     const void * GGML_RESTRICT vy,
+                                     int nr,
+                                     int nc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+    const int ncols_interleaved = 4;
+    const int blocklen = 4;
+
+    assert(n % qk == 0);
+    assert(nr % 4 == 0);
+    assert(nc % ncols_interleaved == 0);
+
+    float sumf[4][4];
+    int sumi;
+
+    for (int y = 0; y < nr / 4; y++) {
+        const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
+        for (int x = 0; x < nc / ncols_interleaved; x++) {
+            const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
+            for (int m = 0; m < 4; m++) {
+                for (int j = 0; j < ncols_interleaved; j++) {
+                    sumf[m][j] = 0.0;
+                }
+            }
+            for (int l = 0; l < nb; l++) {
+                for (int k = 0; k < (qk / blocklen); k++) {
+                    for (int m = 0; m < 4; m++) {
+                        for (int j = 0; j < ncols_interleaved; j++) {
+                            sumi = 0;
+                            for (int i = 0; i < blocklen; ++i) {
+                                const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
+                                sumi += v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i];
+                            }
+                            sumf[m][j] +=
+                                sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
+                        }
+                    }
+                }
+            }
+            for (int m = 0; m < 4; m++) {
+                for (int j = 0; j < ncols_interleaved; j++) {
+                    s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
+                }
+            }
+        }
+    }
+}
+
+void ggml_gemm_q8_0_4x8_q8_0_generic(int n,
+                                     float * GGML_RESTRICT s,
+                                     size_t bs,
+                                     const void * GGML_RESTRICT vx,
+                                     const void * GGML_RESTRICT vy,
+                                     int nr,
+                                     int nc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+    const int ncols_interleaved = 4;
+    const int blocklen = 8;
+
+    assert(n % qk == 0);
+    assert(nr % 4 == 0);
+    assert(nc % ncols_interleaved == 0);
+
+    float sumf[4][4];
+    int sumi;
+
+    for (int y = 0; y < nr / 4; y++) {
+        const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
+        for (int x = 0; x < nc / ncols_interleaved; x++) {
+            const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
+            for (int m = 0; m < 4; m++) {
+                for (int j = 0; j < ncols_interleaved; j++) {
+                    sumf[m][j] = 0.0;
+                }
+            }
+            for (int l = 0; l < nb; l++) {
+                for (int k = 0; k < (qk / blocklen); k++) {
+                    for (int m = 0; m < 4; m++) {
+                        for (int j = 0; j < ncols_interleaved; j++) {
+                            sumi = 0;
+                            for (int i = 0; i < blocklen; ++i) {
+                                const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
+                                sumi += v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i];
+                            }
+                            sumf[m][j] +=
+                                sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
+                        }
+                    }
+                }
+            }
+            for (int m = 0; m < 4; m++) {
+                for (int j = 0; j < ncols_interleaved; j++) {
+                    s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
+                }
+            }
+        }
+    }
+}
+
 } // extern "C"

+static block_q8_0x4 make_block_q8_0x4(block_q8_0 * in, unsigned int blck_size_interleave) {
+    block_q8_0x4 out;
+
+    for (int i = 0; i < 4; i++) {
+        out.d[i] = in[i].d;
+    }
+
+    const int end = QK8_0 * 4 / blck_size_interleave;
+    for (int i = 0; i < end; ++i) {
+        int src_id = i % 4;
+        int src_offset = (i / 4) * blck_size_interleave;
+        int dst_offset = i * blck_size_interleave;
+        memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], blck_size_interleave);
+    }
+    return out;
+}
+
 static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
     block_q4_0x4 out;

@@ -1534,6 +1749,38 @@ static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block
     GGML_UNUSED(data_size);
 }

+static int repack_q8_0_to_q8_0_4_bl(struct ggml_tensor * t,
+                                    int interleave_block,
+                                    const void * GGML_RESTRICT data,
+                                    size_t data_size) {
+    GGML_ASSERT(t->type == GGML_TYPE_Q8_0);
+    GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
+    constexpr int nrows_interleaved = 4;
+
+    block_q8_0x4 * dst = (block_q8_0x4 *) t->data;
+    const block_q8_0 * src = (const block_q8_0 *) data;
+    block_q8_0 dst_tmp[4];
+    int nrow = ggml_nrows(t);
+    int nblocks = t->ne[0] / QK8_0;
+
+    GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q8_0));
+
+    if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
+        return -1;
+    }
+
+    for (int b = 0; b < nrow; b += nrows_interleaved) {
+        for (int64_t x = 0; x < nblocks; x++) {
+            for (int i = 0; i < nrows_interleaved; i++) {
+                dst_tmp[i] = src[x + i * nblocks];
+            }
+            *dst++ = make_block_q8_0x4(dst_tmp, interleave_block);
+        }
+        src += nrows_interleaved * nblocks;
+    }
+    return 0;
+}
+
 static block_iq4_nlx4 make_block_iq4_nlx4(block_iq4_nl * in, unsigned int blck_size_interleave) {
     block_iq4_nlx4 out;

@@ -1702,6 +1949,14 @@ template <> int repack<block_iq4_nl, 8, 8>(struct ggml_tensor * t, const void *
     return repack_iq4_nl_to_iq4_nl_8_bl(t, 8, data, data_size);
 }

+template <> int repack<block_q8_0, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
+    return repack_q8_0_to_q8_0_4_bl(t, 4, data, data_size);
+}
+
+template <> int repack<block_q8_0, 8, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
+    return repack_q8_0_to_q8_0_4_bl(t, 8, data, data_size);
+}
+
 // gemv
 template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
 void gemv(int, float *, size_t, const void *, const void *, int, int);
@@ -1738,6 +1993,14 @@ template <> void gemv<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size
     ggml_gemv_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
 }

+template <> void gemv<block_q8_0, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+    ggml_gemv_q8_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
+}
+
+template <> void gemv<block_q8_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+    ggml_gemv_q8_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
+}
+
 // gemm
 template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
 void gemm(int, float *, size_t, const void *, const void *, int, int);
@@ -1774,6 +2037,14 @@ template <> void gemm<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size
     ggml_gemm_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
 }

+template <> void gemm<block_q8_0, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+    ggml_gemm_q8_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
+}
+
+template <> void gemm<block_q8_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+    ggml_gemm_q8_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
+}
+
 class tensor_traits_base : public ggml::cpu::tensor_traits {
   public:
     virtual int repack(struct ggml_tensor * t, const void * data, size_t data_size) = 0;
@@ -2168,8 +2439,13 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
     static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0> iq4_nl_4x4_q8_0;
     static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0> iq4_nl_8x8_q8_0;

+    // instance for Q8_0
+    static const ggml::cpu::repack::tensor_traits<block_q8_0, 4, 4, GGML_TYPE_Q8_0> q8_0_4x4_q8_0;
+    static const ggml::cpu::repack::tensor_traits<block_q8_0, 8, 4, GGML_TYPE_Q8_0> q8_0_4x8_q8_0;
+
     if (cur->type == GGML_TYPE_Q4_0) {
-        if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)
+        if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)
+            || (ggml_cpu_has_riscv_v() && (ggml_cpu_get_rvv_vlen() >= QK4_0))) {
             if (cur->ne[1] % 8 == 0) {
                 return &q4_0_8x8_q8_0;
             }
@@ -2217,6 +2493,17 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
                 return &iq4_nl_4x4_q8_0;
             }
         }
+    } else if (cur->type == GGML_TYPE_Q8_0) {
+        if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
+            if (cur->ne[1] % 4 == 0) {
+                return &q8_0_4x8_q8_0;
+            }
+        }
+        if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
+            if (cur->ne[1] % 4 == 0) {
+                return &q8_0_4x4_q8_0;
+            }
+        }
     }

     return nullptr;