@fugood/llama.node 1.4.7 → 1.4.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. package/lib/binding.ts +8 -0
  2. package/package.json +15 -15
  3. package/scripts/llama.cpp.patch +23 -24
  4. package/src/LlamaContext.cpp +4 -2
  5. package/src/llama.cpp/common/CMakeLists.txt +2 -0
  6. package/src/llama.cpp/common/arg.cpp +470 -223
  7. package/src/llama.cpp/common/arg.h +43 -2
  8. package/src/llama.cpp/common/chat-peg-parser.cpp +16 -2
  9. package/src/llama.cpp/common/chat.cpp +140 -0
  10. package/src/llama.cpp/common/common.cpp +130 -67
  11. package/src/llama.cpp/common/common.h +44 -17
  12. package/src/llama.cpp/common/console.cpp +98 -18
  13. package/src/llama.cpp/common/console.h +30 -8
  14. package/src/llama.cpp/common/download.cpp +69 -25
  15. package/src/llama.cpp/common/json-schema-to-grammar.cpp +132 -3
  16. package/src/llama.cpp/common/json-schema-to-grammar.h +20 -0
  17. package/src/llama.cpp/common/log.cpp +5 -0
  18. package/src/llama.cpp/common/log.h +1 -0
  19. package/src/llama.cpp/common/peg-parser.cpp +1 -1
  20. package/src/llama.cpp/common/preset.cpp +206 -0
  21. package/src/llama.cpp/common/preset.h +32 -0
  22. package/src/llama.cpp/common/sampling.cpp +67 -54
  23. package/src/llama.cpp/common/sampling.h +8 -0
  24. package/src/llama.cpp/ggml/CMakeLists.txt +4 -0
  25. package/src/llama.cpp/ggml/include/ggml-alloc.h +9 -0
  26. package/src/llama.cpp/ggml/include/ggml-backend.h +1 -0
  27. package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
  28. package/src/llama.cpp/ggml/include/ggml.h +7 -8
  29. package/src/llama.cpp/ggml/src/CMakeLists.txt +3 -0
  30. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +4 -0
  31. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +285 -0
  32. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +28 -0
  33. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +111 -45
  34. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
  35. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +288 -1
  36. package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +8 -0
  37. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +41 -1
  38. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +125 -22
  39. package/src/llama.cpp/include/llama.h +18 -1
  40. package/src/llama.cpp/src/llama-arch.cpp +1890 -2248
  41. package/src/llama.cpp/src/llama-arch.h +9 -2
  42. package/src/llama.cpp/src/llama-batch.cpp +12 -2
  43. package/src/llama.cpp/src/llama-batch.h +4 -2
  44. package/src/llama.cpp/src/llama-context.cpp +93 -23
  45. package/src/llama.cpp/src/llama-context.h +8 -2
  46. package/src/llama.cpp/src/llama-graph.cpp +84 -16
  47. package/src/llama.cpp/src/llama-graph.h +17 -4
  48. package/src/llama.cpp/src/llama-hparams.cpp +6 -0
  49. package/src/llama.cpp/src/llama-hparams.h +5 -1
  50. package/src/llama.cpp/src/llama-impl.cpp +4 -0
  51. package/src/llama.cpp/src/llama-kv-cache.cpp +90 -42
  52. package/src/llama.cpp/src/llama-kv-cache.h +19 -2
  53. package/src/llama.cpp/src/llama-memory-hybrid.cpp +1 -1
  54. package/src/llama.cpp/src/llama-mmap.cpp +123 -28
  55. package/src/llama.cpp/src/llama-mmap.h +5 -1
  56. package/src/llama.cpp/src/llama-model-loader.cpp +58 -13
  57. package/src/llama.cpp/src/llama-model-loader.h +2 -0
  58. package/src/llama.cpp/src/llama-model.cpp +110 -49
  59. package/src/llama.cpp/src/llama-model.h +1 -0
  60. package/src/llama.cpp/src/llama-quant.cpp +1 -1
  61. package/src/llama.cpp/src/llama-sampling.cpp +16 -0
  62. package/src/llama.cpp/src/llama-vocab.cpp +2 -1
  63. package/src/llama.cpp/src/llama.cpp +665 -1
  64. package/src/llama.cpp/src/models/deepseek2.cpp +9 -5
  65. package/src/llama.cpp/src/models/glm4-moe.cpp +28 -11
  66. package/src/llama.cpp/src/models/glm4.cpp +27 -4
  67. package/src/llama.cpp/src/models/models.h +5 -5
  68. package/src/llama.cpp/src/models/nemotron-h.cpp +35 -6
  69. package/src/llama.cpp/src/models/qwen2.cpp +12 -3
  70. package/src/llama.cpp/src/models/qwen3next.cpp +81 -266
@@ -81,6 +81,11 @@ struct ggml_arm_arch_features_type {
 } ggml_arm_arch_features = { 0 };
 #endif
 
+#if defined(__riscv)
+struct ggml_riscv_arch_features_type {
+    int rvv_vlen;
+} ggml_riscv_arch_features = { 0 };
+#endif
 
 #if defined(_WIN32)
 
@@ -187,6 +192,9 @@ typedef void * thread_ret_t;
 
 typedef pthread_t ggml_thread_t;
 
+#define GGML_THREADPOOL_N_THREADS_MASK (0xffffU)
+#define GGML_THREADPOOL_N_THREADS_BITS (16)
+
 #if defined(__APPLE__)
 #include <unistd.h>
 #include <mach/mach.h>
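The two macros above let a single atomic value carry both a per-graph sequence number (upper bits) and the number of threads active for that graph (low 16 bits); the threadpool hunks below read and write n_graph with exactly this encoding. A minimal sketch of the encoding, with illustrative helper names (pack_n_graph and unpack_n_threads are not part of the patch):

    // Illustrative only: how the kickoff/barrier code below packs and unpacks n_graph.
    static inline int pack_n_graph(int prev_n_graph, int n_threads) {
        int seq = (prev_n_graph >> GGML_THREADPOOL_N_THREADS_BITS) + 1;  // bump the graph counter
        return (seq << GGML_THREADPOOL_N_THREADS_BITS) |
               (n_threads & GGML_THREADPOOL_N_THREADS_MASK);             // thread count in the low 16 bits
    }

    static inline int unpack_n_threads(int n_graph) {
        return n_graph & GGML_THREADPOOL_N_THREADS_MASK;                 // what ggml_barrier() and the workers read
    }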
@@ -449,7 +457,7 @@ struct ggml_threadpool {
     struct ggml_cplan * cplan;
 
     // synchronization primitives
-    atomic_int n_graph; // incremented when there is work to be done (i.e each graph)
+    atomic_int n_graph; // updated when there is work to be done (i.e each graph) holds graph and active thread counts.
     atomic_int GGML_CACHE_ALIGN n_barrier;
     atomic_int GGML_CACHE_ALIGN n_barrier_passed;
     atomic_int GGML_CACHE_ALIGN current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.
@@ -457,12 +465,10 @@ struct ggml_threadpool {
     // these are atomic as an annotation for thread-sanitizer
     atomic_bool stop;  // Used for stopping the threadpool altogether
     atomic_bool pause; // Used for pausing the threadpool or individual threads
-    atomic_int abort;  // Used for aborting processing of a graph
+    atomic_int abort;  // Used for aborting processing of a graph
 
     struct ggml_compute_state * workers; // per thread state
-    int n_threads_max;        // number of threads in the pool
-    atomic_int n_threads_cur; // number of threads used in the current graph
-
+    int n_threads;            // Number of threads in the pool
     int32_t prio;             // Scheduling priority
     uint32_t poll;            // Polling level (0 - no polling)
 
@@ -539,7 +545,7 @@ struct ggml_state {
 static struct ggml_state g_state = {0};
 
 void ggml_barrier(struct ggml_threadpool * tp) {
-    int n_threads = atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed);
+    int n_threads = atomic_load_explicit(&tp->n_graph, memory_order_relaxed) & GGML_THREADPOOL_N_THREADS_MASK;
     if (n_threads == 1) {
         return;
     }
@@ -556,7 +562,7 @@ void ggml_barrier(struct ggml_threadpool * tp) {
         // last thread
         atomic_store_explicit(&tp->n_barrier, 0, memory_order_relaxed);
 
-        // exit barrier (fill seq-cst fence)
+        // exit barrier (full seq-cst fence)
         atomic_fetch_add_explicit(&tp->n_barrier_passed, 1, memory_order_seq_cst);
         return;
     }
@@ -702,6 +708,15 @@ static void ggml_init_arm_arch_features(void) {}
 #endif
 #endif // __ARM_ARCH
 
+#if defined(__riscv) && defined(__riscv_v_intrinsic)
+#include <riscv_vector.h>
+static void ggml_init_riscv_arch_features(void) {
+    ggml_riscv_arch_features.rvv_vlen = __riscv_vlenb();
+}
+#else
+static void ggml_init_riscv_arch_features(void) {}
+#endif
+
 struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) {
     GGML_ASSERT(!ggml_get_no_alloc(ctx));
 
@@ -2628,7 +2643,7 @@ static void ggml_thread_cpumask_next(const bool * global_mask, bool * local_mask
 void ggml_threadpool_free(struct ggml_threadpool* threadpool) {
     if (!threadpool) return;
 
-    const int n_threads = threadpool->n_threads_max;
+    const int n_threads = threadpool->n_threads;
 
 #ifndef GGML_USE_OPENMP
     struct ggml_compute_state* workers = threadpool->workers;
@@ -2704,7 +2719,7 @@ struct ggml_cplan ggml_graph_plan(
         //GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %d\n", n_threads);
     }
     if (n_threads <= 0) {
-        n_threads = threadpool ? threadpool->n_threads_max : GGML_DEFAULT_N_THREADS;
+        n_threads = threadpool ? threadpool->n_threads : GGML_DEFAULT_N_THREADS;
     }
 
 #if defined(__EMSCRIPTEN__) && !defined(__EMSCRIPTEN_PTHREADS__)
@@ -2912,12 +2927,14 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 
     struct ggml_compute_params params = {
         /*.ith       =*/ state->ith,
-        /*.nth       =*/ atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed),
+        /*.nth       =*/ atomic_load_explicit(&tp->n_graph, memory_order_relaxed) & GGML_THREADPOOL_N_THREADS_MASK,
         /*.wsize     =*/ cplan->work_size,
         /*.wdata     =*/ cplan->work_data,
         /*.threadpool=*/ tp,
     };
 
+    GGML_PRINT_DEBUG("thread #%d compute-start cplan %p last-graph %d \n", state->ith, cplan, state->last_graph);
+
     for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) {
         struct ggml_tensor * node = cgraph->nodes[node_n];
 
@@ -2939,6 +2956,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
         }
     }
 
+    GGML_PRINT_DEBUG("thread #%d compute-done cplan %p last-graph %d \n", state->ith, cplan, state->last_graph);
+
     ggml_barrier(state->threadpool);
 
     return 0;
@@ -2946,27 +2965,23 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 
 #ifndef GGML_USE_OPENMP
 
-// check if thread is active
-static inline bool ggml_graph_compute_thread_active(struct ggml_compute_state * state) {
-    struct ggml_threadpool * threadpool = state->threadpool;
-    int n_threads = atomic_load_explicit(&threadpool->n_threads_cur, memory_order_relaxed);
-    return (state->ith < n_threads);
-}
-
 // check if thread is ready to proceed (exit from polling or sleeping)
+// returns true if loops should exit, sets state->pending to indicate new work
 static inline bool ggml_graph_compute_thread_ready(struct ggml_compute_state * state) {
     struct ggml_threadpool * threadpool = state->threadpool;
 
     if (state->pending || threadpool->stop || threadpool->pause) { return true; }
 
     // check for new graph/work
-    int new_graph = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed);
-    if (new_graph != state->last_graph) {
-        state->pending = ggml_graph_compute_thread_active(state);
-        state->last_graph = new_graph;
+    int n_graph   = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed);
+    int n_threads = n_graph & GGML_THREADPOOL_N_THREADS_MASK;
+    if (n_graph != state->last_graph) {
+        state->pending    = (state->ith < n_threads);
+        state->last_graph = n_graph;
+        return true;
     }
 
-    return state->pending;
+    return false;
 }
 
 // sync thread state after polling
@@ -2983,11 +2998,6 @@ static inline void ggml_graph_compute_thread_sync(struct ggml_compute_state * st
 static inline bool ggml_graph_compute_poll_for_work(struct ggml_compute_state * state) {
     struct ggml_threadpool * threadpool = state->threadpool;
 
-    // Skip polling for unused threads
-    if (!ggml_graph_compute_thread_active(state)) {
-        return state->pending;
-    }
-
     // This seems to make 0 ... 100 a decent range for polling level across modern processors.
     // Perhaps, we can adjust it dynamically based on load and things.
     const uint64_t n_rounds = 1024UL * 128 * threadpool->poll;
@@ -3049,7 +3059,6 @@ static thread_ret_t ggml_graph_compute_secondary_thread(void* data) {
         ggml_graph_compute_check_for_work(state);
         if (state->pending) {
             state->pending = false;
-
             ggml_graph_compute_thread(state);
         }
     }
@@ -3064,14 +3073,15 @@ static void ggml_graph_compute_kickoff(struct ggml_threadpool * threadpool, int
 
     ggml_mutex_lock(&threadpool->mutex);
 
-    GGML_PRINT_DEBUG("threadpool: n_threads_cur %d n_threads %d\n", threadpool->n_threads_cur, n_threads);
+    // Update the number of active threads and the graph count
+    int n_graph = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed) >> GGML_THREADPOOL_N_THREADS_BITS;
+    n_graph = ((n_graph + 1) << GGML_THREADPOOL_N_THREADS_BITS) | (n_threads & GGML_THREADPOOL_N_THREADS_MASK);
 
-    // Update the number of active threads
-    atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
+    GGML_PRINT_DEBUG("compute-kickoff: n_threads %d n_graph %d\n", n_threads, n_graph);
 
     // Indicate the graph is ready to be processed
     // We need the full seq-cst fence here because of the polling threads (used in thread_sync)
-    atomic_fetch_add_explicit(&threadpool->n_graph, 1, memory_order_seq_cst);
+    atomic_store_explicit(&threadpool->n_graph, n_graph, memory_order_seq_cst);
 
     if (threadpool->pause) {
         // Update main thread prio and affinity to match the threadpool settings
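As a worked example of the kickoff encoding above (the numbers are illustrative): if the previous n_graph value carried sequence number 5 and the next graph will run on 8 threads, the store publishes ((5 + 1) << 16) | 8 = 0x00060008. A worker notices the new graph because the whole value differs from its last_graph, and it reads the active thread count for that graph from the low 16 bits.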
@@ -3109,8 +3119,7 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
     threadpool->pause = tpp->paused;
     threadpool->abort = -1;
     threadpool->workers = NULL;
-    threadpool->n_threads_max = tpp->n_threads;
-    threadpool->n_threads_cur = tpp->n_threads;
+    threadpool->n_threads = tpp->n_threads;
     threadpool->poll = tpp->poll;
     threadpool->prio = tpp->prio;
     threadpool->ec = GGML_STATUS_SUCCESS;
@@ -3205,7 +3214,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
            {
                // update the number of threads from the actual number of threads that we got from OpenMP
                n_threads = omp_get_num_threads();
-                atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
+                atomic_store_explicit(&threadpool->n_graph, n_threads, memory_order_relaxed);
            }
 
            // Apply thread CPU mask and priority
@@ -3218,13 +3227,13 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
            ggml_graph_compute_thread(&threadpool->workers[ith]);
        }
    } else {
-        atomic_store_explicit(&threadpool->n_threads_cur, 1, memory_order_relaxed);
+        atomic_store_explicit(&threadpool->n_graph, 1, memory_order_relaxed);
        ggml_graph_compute_thread(&threadpool->workers[0]);
    }
 #else
-    if (n_threads > threadpool->n_threads_max) {
-        GGML_LOG_WARN("cplan requested more threads (%d) than available (%d)\n", n_threads, threadpool->n_threads_max);
-        n_threads = threadpool->n_threads_max;
+    if (n_threads > threadpool->n_threads) {
+        GGML_LOG_WARN("cplan requested more threads (%d) than available (%d)\n", n_threads, threadpool->n_threads);
+        n_threads = threadpool->n_threads;
    }
 
    // Kick all threads to start the new graph
@@ -3311,13 +3320,33 @@ void ggml_cpu_fp16_to_fp32(const ggml_fp16_t * x, float * y, int64_t n) {
         __m128 y_vec = _mm_cvtph_ps(x_vec);
         _mm_storeu_ps(y + i, y_vec);
     }
-#elif defined(__riscv_zvfh)
-    for (int vl; i < n; i += vl) {
-        vl = __riscv_vsetvl_e16m1(n - i);
-        vfloat16m1_t vx = __riscv_vle16_v_f16m1((_Float16 *)&x[i], vl);
-        vfloat32m2_t vy = __riscv_vfwcvt_f_f_v_f32m2(vx, vl);
-        __riscv_vse32_v_f32m2(&y[i], vy, vl);
+
+#elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfhmin)
+    // calculate step size
+    const int epr = __riscv_vsetvlmax_e16m2();
+    const int step = epr * 2;
+    const int np = (n & ~(step - 1));
+
+    // unroll by 2
+    for (; i < np; i += step) {
+        vfloat16m2_t ax0 = __riscv_vle16_v_f16m2((const _Float16*)x + i, epr);
+        vfloat32m4_t ay0 = __riscv_vfwcvt_f_f_v_f32m4(ax0, epr);
+        __riscv_vse32_v_f32m4(y + i, ay0, epr);
+
+        vfloat16m2_t ax1 = __riscv_vle16_v_f16m2((const _Float16*)x + i + epr, epr);
+        vfloat32m4_t ay1 = __riscv_vfwcvt_f_f_v_f32m4(ax1, epr);
+        __riscv_vse32_v_f32m4(y + i + epr, ay1, epr);
+    }
+
+    // leftovers
+    int vl;
+    for (i = np; i < n; i += vl) {
+        vl = __riscv_vsetvl_e16m2(n - i);
+        vfloat16m2_t ax0 = __riscv_vle16_v_f16m2((const _Float16*)x + i, vl);
+        vfloat32m4_t ay0 = __riscv_vfwcvt_f_f_v_f32m4(ax0, vl);
+        __riscv_vse32_v_f32m4(y + i, ay0, vl);
     }
+
 #endif
 
     for (; i < n; ++i) {
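For context on the step math above: __riscv_vsetvlmax_e16m2() returns the number of 16-bit elements one m2 register group holds, so on a core with VLEN = 256 bits it yields epr = 32 and step = 64, and np = n & ~(step - 1) rounds n down to a multiple of 64 (valid because step is a power of two on RVV). With n = 1000, the unrolled main loop converts 960 elements and the strip-mined tail loop handles the remaining 40. The same pattern is repeated for the bf16 path in the next hunk.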
@@ -3362,6 +3391,31 @@ void ggml_cpu_bf16_to_fp32(const ggml_bf16_t * x, float * y, int64_t n) {
                                 (const __m128i *)(x + i))),
                             16)));
     }
+#elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfbfmin)
+    // calculate step size
+    const int epr = __riscv_vsetvlmax_e16m2();
+    const int step = epr * 2;
+    const int np = (n & ~(step - 1));
+
+    // unroll by 2
+    for (; i < np; i += step) {
+        vbfloat16m2_t ax0 = __riscv_vle16_v_bf16m2((const __bf16*)x + i, epr);
+        vfloat32m4_t ay0 = __riscv_vfwcvtbf16_f_f_v_f32m4(ax0, epr);
+        __riscv_vse32_v_f32m4(y + i, ay0, epr);
+
+        vbfloat16m2_t ax1 = __riscv_vle16_v_bf16m2((const __bf16*)x + i + epr, epr);
+        vfloat32m4_t ay1 = __riscv_vfwcvtbf16_f_f_v_f32m4(ax1, epr);
+        __riscv_vse32_v_f32m4(y + i + epr, ay1, epr);
+    }
+
+    // leftovers
+    int vl;
+    for (i = np; i < n; i += vl) {
+        vl = __riscv_vsetvl_e16m2(n - i);
+        vbfloat16m2_t ax0 = __riscv_vle16_v_bf16m2((const __bf16*)x + i, vl);
+        vfloat32m4_t ay0 = __riscv_vfwcvtbf16_f_f_v_f32m4(ax0, vl);
+        __riscv_vse32_v_f32m4(y + i, ay0, vl);
+    }
 #endif
     for (; i < n; i++) {
         y[i] = GGML_BF16_TO_FP32(x[i]);
@@ -3464,6 +3518,14 @@ int ggml_cpu_has_riscv_v(void) {
 #endif
 }
 
+int ggml_cpu_get_rvv_vlen(void) {
+#if defined(__riscv) && defined(__riscv_v_intrinsic)
+    return ggml_riscv_arch_features.rvv_vlen;
+#else
+    return 0;
+#endif
+}
+
 int ggml_cpu_has_f16c(void) {
 #if defined(__F16C__)
     return 1;
@@ -3630,6 +3692,10 @@ void ggml_cpu_init(void) {
     ggml_init_arm_arch_features();
 #endif
 
+#if defined(__riscv)
+    ggml_init_riscv_arch_features();
+#endif
+
     is_first_call = false;
 }
 
@@ -583,6 +583,10 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
        if (ggml_cpu_has_riscv_v()) {
            features.push_back({ "RISCV_V", "1" });
        }
+        if (ggml_cpu_get_rvv_vlen() > 0) {
+            static std::string rvv_vlen = std::to_string(ggml_cpu_get_rvv_vlen());
+            features.push_back({ "RVV_VLEN", rvv_vlen.c_str() });
+        }
        if (ggml_cpu_has_vsx()) {
            features.push_back({ "VSX", "1" });
        }
@@ -692,6 +692,100 @@ void ggml_gemv_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
     }
 }
 
+void ggml_gemv_q8_0_4x4_q8_0_generic(int n,
+                                     float * GGML_RESTRICT s,
+                                     size_t bs,
+                                     const void * GGML_RESTRICT vx,
+                                     const void * GGML_RESTRICT vy,
+                                     int nr,
+                                     int nc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+    const int ncols_interleaved = 4;
+    const int blocklen = 4;
+
+    assert(nr == 1);
+    assert(n % qk == 0);
+    assert(nc % ncols_interleaved == 0);
+
+    UNUSED(bs);
+    UNUSED(nr);
+
+    float sumf[4];
+    int sumi;
+
+    const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+    for (int x = 0; x < nc / ncols_interleaved; x++) {
+        const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
+
+        for (int j = 0; j < ncols_interleaved; j++) {
+            sumf[j] = 0.0;
+        }
+        for (int l = 0; l < nb; l++) {
+            for (int k = 0; k < (qk / blocklen); k++) {
+                for (int j = 0; j < ncols_interleaved; j++) {
+                    sumi = 0;
+                    for (int i = 0; i < blocklen; ++i) {
+                        const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
+                        sumi += v0 * a_ptr[l].qs[k * blocklen + i];
+                    }
+                    sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
+                }
+            }
+        }
+        for (int j = 0; j < ncols_interleaved; j++) {
+            s[x * ncols_interleaved + j] = sumf[j];
+        }
+    }
+}
+
+void ggml_gemv_q8_0_4x8_q8_0_generic(int n,
+                                     float * GGML_RESTRICT s,
+                                     size_t bs,
+                                     const void * GGML_RESTRICT vx,
+                                     const void * GGML_RESTRICT vy,
+                                     int nr,
+                                     int nc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+    const int ncols_interleaved = 4;
+    const int blocklen = 8;
+
+    assert(nr == 1);
+    assert(n % qk == 0);
+    assert(nc % ncols_interleaved == 0);
+
+    UNUSED(bs);
+    UNUSED(nr);
+
+    float sumf[4];
+    int sumi;
+
+    const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+    for (int x = 0; x < nc / ncols_interleaved; x++) {
+        const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
+
+        for (int j = 0; j < ncols_interleaved; j++) {
+            sumf[j] = 0.0;
+        }
+        for (int l = 0; l < nb; l++) {
+            for (int k = 0; k < (qk / blocklen); k++) {
+                for (int j = 0; j < ncols_interleaved; j++) {
+                    sumi = 0;
+                    for (int i = 0; i < blocklen; ++i) {
+                        const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
+                        sumi += v0 * a_ptr[l].qs[k * blocklen + i];
+                    }
+                    sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
+                }
+            }
+        }
+        for (int j = 0; j < ncols_interleaved; j++) {
+            s[x * ncols_interleaved + j] = sumf[j];
+        }
+    }
+}
+
 void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
     const int qk = QK8_0;
     const int nb = n / qk;
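Each generic kernel added above is the scalar reference for the interleaved Q8_0 path: for output column j of a 4-column group it accumulates, over the QK8_0-sized blocks l, the product of the two fp16 block scales (b_ptr[l].d[j] and a_ptr[l].d) with the integer dot product of the 32 quantized values of that block. The 4x4 and 4x8 variants differ only in blocklen, i.e. how many quant bytes are stored contiguously per column in the interleaved block.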
@@ -1219,8 +1313,129 @@ void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
     }
 }
 
+void ggml_gemm_q8_0_4x4_q8_0_generic(int n,
+                                     float * GGML_RESTRICT s,
+                                     size_t bs,
+                                     const void * GGML_RESTRICT vx,
+                                     const void * GGML_RESTRICT vy,
+                                     int nr,
+                                     int nc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+    const int ncols_interleaved = 4;
+    const int blocklen = 4;
+
+    assert(n % qk == 0);
+    assert(nr % 4 == 0);
+    assert(nc % ncols_interleaved == 0);
+
+    float sumf[4][4];
+    int sumi;
+
+    for (int y = 0; y < nr / 4; y++) {
+        const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
+        for (int x = 0; x < nc / ncols_interleaved; x++) {
+            const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
+            for (int m = 0; m < 4; m++) {
+                for (int j = 0; j < ncols_interleaved; j++) {
+                    sumf[m][j] = 0.0;
+                }
+            }
+            for (int l = 0; l < nb; l++) {
+                for (int k = 0; k < (qk / blocklen); k++) {
+                    for (int m = 0; m < 4; m++) {
+                        for (int j = 0; j < ncols_interleaved; j++) {
+                            sumi = 0;
+                            for (int i = 0; i < blocklen; ++i) {
+                                const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
+                                sumi += v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i];
+                            }
+                            sumf[m][j] +=
+                                sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
+                        }
+                    }
+                }
+            }
+            for (int m = 0; m < 4; m++) {
+                for (int j = 0; j < ncols_interleaved; j++) {
+                    s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
+                }
+            }
+        }
+    }
+}
+
+void ggml_gemm_q8_0_4x8_q8_0_generic(int n,
+                                     float * GGML_RESTRICT s,
+                                     size_t bs,
+                                     const void * GGML_RESTRICT vx,
+                                     const void * GGML_RESTRICT vy,
+                                     int nr,
+                                     int nc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+    const int ncols_interleaved = 4;
+    const int blocklen = 8;
+
+    assert(n % qk == 0);
+    assert(nr % 4 == 0);
+    assert(nc % ncols_interleaved == 0);
+
+    float sumf[4][4];
+    int sumi;
+
+    for (int y = 0; y < nr / 4; y++) {
+        const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
+        for (int x = 0; x < nc / ncols_interleaved; x++) {
+            const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
+            for (int m = 0; m < 4; m++) {
+                for (int j = 0; j < ncols_interleaved; j++) {
+                    sumf[m][j] = 0.0;
+                }
+            }
+            for (int l = 0; l < nb; l++) {
+                for (int k = 0; k < (qk / blocklen); k++) {
+                    for (int m = 0; m < 4; m++) {
+                        for (int j = 0; j < ncols_interleaved; j++) {
+                            sumi = 0;
+                            for (int i = 0; i < blocklen; ++i) {
+                                const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
+                                sumi += v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i];
+                            }
+                            sumf[m][j] +=
+                                sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
+                        }
+                    }
+                }
+            }
+            for (int m = 0; m < 4; m++) {
+                for (int j = 0; j < ncols_interleaved; j++) {
+                    s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
+                }
+            }
+        }
+    }
+}
+
 } // extern "C"
 
+static block_q8_0x4 make_block_q8_0x4(block_q8_0 * in, unsigned int blck_size_interleave) {
+    block_q8_0x4 out;
+
+    for (int i = 0; i < 4; i++) {
+        out.d[i] = in[i].d;
+    }
+
+    const int end = QK8_0 * 4 / blck_size_interleave;
+    for (int i = 0; i < end; ++i) {
+        int src_id = i % 4;
+        int src_offset = (i / 4) * blck_size_interleave;
+        int dst_offset = i * blck_size_interleave;
+        memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], blck_size_interleave);
+    }
+    return out;
+}
+
 static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
     block_q4_0x4 out;
 
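make_block_q8_0x4 above interleaves four consecutive Q8_0 rows chunk by chunk: iteration i copies blck_size_interleave bytes from row i % 4 at offset (i / 4) * blck_size_interleave. As a worked example with blck_size_interleave = 4, out.qs holds r0[0..3], r1[0..3], r2[0..3], r3[0..3], r0[4..7], r1[4..7], and so on, which is exactly the layout the q8_0 4x4 kernels above index via k * ncols_interleaved * blocklen + j * blocklen + i (and likewise for the 8-byte interleave used by the 4x8 kernels).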
@@ -1534,6 +1749,38 @@ static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block
     GGML_UNUSED(data_size);
 }
 
+static int repack_q8_0_to_q8_0_4_bl(struct ggml_tensor * t,
+                                    int interleave_block,
+                                    const void * GGML_RESTRICT data,
+                                    size_t data_size) {
+    GGML_ASSERT(t->type == GGML_TYPE_Q8_0);
+    GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
+    constexpr int nrows_interleaved = 4;
+
+    block_q8_0x4 * dst = (block_q8_0x4 *) t->data;
+    const block_q8_0 * src = (const block_q8_0 *) data;
+    block_q8_0 dst_tmp[4];
+    int nrow = ggml_nrows(t);
+    int nblocks = t->ne[0] / QK8_0;
+
+    GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q8_0));
+
+    if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
+        return -1;
+    }
+
+    for (int b = 0; b < nrow; b += nrows_interleaved) {
+        for (int64_t x = 0; x < nblocks; x++) {
+            for (int i = 0; i < nrows_interleaved; i++) {
+                dst_tmp[i] = src[x + i * nblocks];
+            }
+            *dst++ = make_block_q8_0x4(dst_tmp, interleave_block);
+        }
+        src += nrows_interleaved * nblocks;
+    }
+    return 0;
+}
+
 static block_iq4_nlx4 make_block_iq4_nlx4(block_iq4_nl * in, unsigned int blck_size_interleave) {
     block_iq4_nlx4 out;
 
@@ -1702,6 +1949,14 @@ template <> int repack<block_iq4_nl, 8, 8>(struct ggml_tensor * t, const void *
     return repack_iq4_nl_to_iq4_nl_8_bl(t, 8, data, data_size);
 }
 
+template <> int repack<block_q8_0, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
+    return repack_q8_0_to_q8_0_4_bl(t, 4, data, data_size);
+}
+
+template <> int repack<block_q8_0, 8, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
+    return repack_q8_0_to_q8_0_4_bl(t, 8, data, data_size);
+}
+
 // gemv
 template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
 void gemv(int, float *, size_t, const void *, const void *, int, int);
@@ -1738,6 +1993,14 @@ template <> void gemv<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size
     ggml_gemv_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
 }
 
+template <> void gemv<block_q8_0, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+    ggml_gemv_q8_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
+}
+
+template <> void gemv<block_q8_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+    ggml_gemv_q8_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
+}
+
 // gemm
 template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
 void gemm(int, float *, size_t, const void *, const void *, int, int);
@@ -1774,6 +2037,14 @@ template <> void gemm<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size
     ggml_gemm_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
 }
 
+template <> void gemm<block_q8_0, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+    ggml_gemm_q8_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
+}
+
+template <> void gemm<block_q8_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+    ggml_gemm_q8_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
+}
+
 class tensor_traits_base : public ggml::cpu::tensor_traits {
   public:
     virtual int repack(struct ggml_tensor * t, const void * data, size_t data_size) = 0;
@@ -2168,8 +2439,13 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
     static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0> iq4_nl_4x4_q8_0;
     static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0> iq4_nl_8x8_q8_0;
 
+    // instance for Q8_0
+    static const ggml::cpu::repack::tensor_traits<block_q8_0, 4, 4, GGML_TYPE_Q8_0> q8_0_4x4_q8_0;
+    static const ggml::cpu::repack::tensor_traits<block_q8_0, 8, 4, GGML_TYPE_Q8_0> q8_0_4x8_q8_0;
+
     if (cur->type == GGML_TYPE_Q4_0) {
-        if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) {
+        if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)
+            || (ggml_cpu_has_riscv_v() && (ggml_cpu_get_rvv_vlen() >= QK4_0))) {
            if (cur->ne[1] % 8 == 0) {
                return &q4_0_8x8_q8_0;
            }
@@ -2217,6 +2493,17 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
                return &iq4_nl_4x4_q8_0;
            }
        }
+    } else if (cur->type == GGML_TYPE_Q8_0) {
+        if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
+            if (cur->ne[1] % 4 == 0) {
+                return &q8_0_4x8_q8_0;
+            }
+        }
+        if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
+            if (cur->ne[1] % 4 == 0) {
+                return &q8_0_4x4_q8_0;
+            }
+        }
    }
 
    return nullptr;
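Net effect of this last hunk: Q8_0 weight tensors whose second dimension is a multiple of 4 get repacked into the 4x8 interleaved layout when NEON and int8 matmul (i8mm) are available, fall back to the 4x4 layout when NEON and dotprod are available, and are left unpacked otherwise; together with the earlier Q4_0 change, the 8x8 Q4_0 path is additionally enabled on RISC-V when ggml_cpu_get_rvv_vlen() >= QK4_0.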