@fugood/llama.node 1.3.4 → 1.3.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. package/CMakeLists.txt +21 -1
  2. package/lib/binding.js +1 -1
  3. package/lib/binding.ts +47 -15
  4. package/lib/index.js +26 -2
  5. package/lib/index.ts +42 -10
  6. package/package.json +15 -14
  7. package/scripts/llama.cpp.patch +31 -10
  8. package/src/LlamaContext.cpp +46 -0
  9. package/src/LlamaContext.h +2 -0
  10. package/src/llama.cpp/common/CMakeLists.txt +2 -0
  11. package/src/llama.cpp/common/chat-parser-xml-toolcall.cpp +861 -0
  12. package/src/llama.cpp/common/chat-parser-xml-toolcall.h +45 -0
  13. package/src/llama.cpp/common/chat-parser.h +10 -0
  14. package/src/llama.cpp/common/chat.cpp +461 -87
  15. package/src/llama.cpp/common/chat.h +6 -0
  16. package/src/llama.cpp/common/common.cpp +8 -1
  17. package/src/llama.cpp/common/common.h +12 -5
  18. package/src/llama.cpp/common/json-partial.cpp +19 -2
  19. package/src/llama.cpp/common/json-schema-to-grammar.cpp +2 -0
  20. package/src/llama.cpp/common/json-schema-to-grammar.h +2 -0
  21. package/src/llama.cpp/common/sampling.cpp +60 -6
  22. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +31 -38
  23. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +6 -6
  24. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +15 -5
  25. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +2 -3
  26. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +16 -14
  27. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +49 -48
  28. package/src/llama.cpp/src/llama-grammar.cpp +17 -9
  29. package/src/llama.cpp/src/llama-impl.cpp +3 -3
  30. package/src/llama.cpp/src/llama-sampling.cpp +3 -6
  31. package/src/llama.cpp/src/llama-vocab.cpp +1 -0
@@ -2,17 +2,15 @@
 
  #pragma once
 
+ #include "ggml-opt.h"
+ #include "llama-cpp.h"
+
  #include <set>
  #include <sstream>
  #include <string>
  #include <string_view>
  #include <vector>
  #include <map>
- #include <sstream>
- #include <cmath>
-
- #include "ggml-opt.h"
- #include "llama-cpp.h"
 
  #ifdef _WIN32
  #define DIRECTORY_SEPARATOR '\\'
@@ -30,6 +28,15 @@
 
  #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
 
+ struct common_time_meas {
+ common_time_meas(int64_t & t_acc, bool disable = false);
+ ~common_time_meas();
+
+ const int64_t t_start_us;
+
+ int64_t & t_acc;
+ };
+
  struct common_adapter_lora_info {
  std::string path;
  float scale;
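
The `common_time_meas` struct added to common.h above is only declared here; its definition lives in common.cpp, which is not shown in this diff excerpt. A rough sketch of the likely behaviour, inferred from the members above and from how `common_sampler::tm()` uses it later in this diff (accumulate elapsed microseconds into a caller-owned counter, do nothing when perf measurement is disabled), with `now_us()` standing in for `ggml_time_us()`:

```cpp
#include <chrono>
#include <cstdint>
#include <cstdio>

// stand-in for ggml_time_us(); an assumption, not part of the diff
static int64_t now_us() {
    using namespace std::chrono;
    return duration_cast<microseconds>(steady_clock::now().time_since_epoch()).count();
}

// sketch of the RAII timer declared in common.h: the destructor adds the
// elapsed time to the accumulator passed by reference, unless disabled
struct common_time_meas_sketch {
    common_time_meas_sketch(int64_t & t_acc, bool disable = false)
        : t_start_us(disable ? -1 : now_us()), t_acc(t_acc) {}
    ~common_time_meas_sketch() {
        if (t_start_us >= 0) {
            t_acc += now_us() - t_start_us;
        }
    }
    const int64_t t_start_us;
    int64_t & t_acc;
};

int main() {
    int64_t t_total_us = 0;
    {
        common_time_meas_sketch tm(t_total_us); // measures this scope
        volatile double x = 0;
        for (int i = 0; i < 1000000; ++i) { x += i; }
    }
    std::printf("accumulated: %lld us\n", (long long) t_total_us);
}
```

The RAII shape means any early return from a measured function is still counted.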
@@ -297,8 +297,25 @@ bool common_json_parse(
  it = temptative_end;
  return true;
  }
- // TODO: handle unclosed top-level primitive if the stack was empty but we got an error (e.g. "tru", "\"", etc...)
- // fprintf(stderr, "Closing: TODO\n");
+ // handle unclosed top-level primitive
+ if (err_loc.position != 0 && !healing_marker.empty() && err_loc.stack.empty()) {
+ std::string str(it, temptative_end);
+ const auto & magic_seed = out.healing_marker.marker = healing_marker;
+ if (can_parse(str + "\"")) {
+ // Was inside an string
+ str += (out.healing_marker.json_dump_marker = magic_seed) + "\"";
+ } else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"")) {
+ // Was inside an string after an escape
+ str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"";
+ } else {
+ // TODO: handle more unclosed top-level primitive if the stack was empty but we got an error (e.g. "tru", "\"", etc...)
+ // fprintf(stderr, "Closing: TODO\n");
+ return false;
+ }
+ out.json = json::parse(str);
+ it = temptative_end;
+ return true;
+ }
  return false;
  }
  out.json = json::parse(it, end);
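
The new branch in common_json_parse above heals a truncated top-level string the same way values nested inside objects or arrays are already healed: the healing marker plus a closing quote is appended so the fragment becomes valid JSON, and the recorded marker position lets the caller trim the artificial tail later. A standalone illustration of the idea (not the library code itself; the marker string here is made up), using nlohmann::json directly:

```cpp
#include <nlohmann/json.hpp>
#include <iostream>
#include <string>

int main() {
    // a truncated top-level string, e.g. cut off mid-stream
    std::string partial = "\"Hel";
    // hypothetical healing marker; the real one is generated per parse
    const std::string marker = "$llama.cpp.marker$";

    // close the string by appending the marker and a quote, then parse
    nlohmann::json healed = nlohmann::json::parse(partial + marker + "\"");

    // the consumer can trim everything from the marker onwards
    std::string value = healed.get<std::string>();
    std::cout << value.substr(0, value.find(marker)) << "\n"; // prints: Hel
}
```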
@@ -303,6 +303,8 @@ static std::string format_literal(const std::string & literal) {
303
303
  return "\"" + escaped + "\"";
304
304
  }
305
305
 
306
+ std::string gbnf_format_literal(const std::string & literal) { return format_literal(literal); }
307
+
306
308
  class SchemaConverter {
307
309
  private:
308
310
  friend std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options);
@@ -18,4 +18,6 @@ struct common_grammar_options {
18
18
  bool dotall = false;
19
19
  };
20
20
 
21
+ std::string gbnf_format_literal(const std::string & literal);
22
+
21
23
  std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options = {});
@@ -3,9 +3,10 @@
  #include "common.h"
  #include "log.h"
 
+ #include <algorithm>
  #include <cmath>
+ #include <cstring>
  #include <unordered_map>
- #include <algorithm>
 
  // the ring buffer works similarly to std::deque, but with a fixed capacity
  // TODO: deduplicate with llama-impl.h
@@ -112,6 +113,13 @@ struct common_sampler {
 
  llama_token_data_array cur_p;
 
+ void reset() {
+ prev.clear();
+
+ llama_sampler_reset(grmr);
+ llama_sampler_reset(chain);
+ }
+
  void set_logits(struct llama_context * ctx, int idx) {
  const auto * logits = llama_get_logits_ith(ctx, idx);
 
@@ -128,6 +136,12 @@ struct common_sampler {
 
  cur_p = { cur.data(), cur.size(), -1, false };
  }
+
+ common_time_meas tm() {
+ return common_time_meas(t_total_us, params.no_perf);
+ }
+
+ mutable int64_t t_total_us = 0;
  };
 
  std::string common_params_sampling::print() const {
@@ -298,6 +312,8 @@ void common_sampler_free(struct common_sampler * gsmpl) {
  }
 
  void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
+ const auto tm = gsmpl->tm();
+
  if (accept_grammar) {
  llama_sampler_accept(gsmpl->grmr, token);
  }
@@ -308,9 +324,7 @@ void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, boo
  }
 
  void common_sampler_reset(struct common_sampler * gsmpl) {
- llama_sampler_reset(gsmpl->grmr);
-
- llama_sampler_reset(gsmpl->chain);
+ gsmpl->reset();
  }
 
  struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
@@ -327,16 +341,54 @@ struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
  void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl) {
  // TODO: measure grammar performance
 
+ const double t_sampling_ms = gsmpl ? 1e-3*gsmpl->t_total_us : 0;
+
+ llama_perf_sampler_data data_smpl;
+ llama_perf_context_data data_ctx;
+
+ memset(&data_smpl, 0, sizeof(data_smpl));
+ memset(&data_ctx, 0, sizeof(data_ctx));
+
  if (gsmpl) {
- llama_perf_sampler_print(gsmpl->chain);
+ auto & data = data_smpl;
+
+ data = llama_perf_sampler(gsmpl->chain);
+
+ // note: the sampling time includes the samplers time + extra time spent in common/sampling
+ LOG_INF("%s: sampling time = %10.2f ms\n", __func__, t_sampling_ms);
+ LOG_INF("%s: samplers time = %10.2f ms / %5d tokens\n", __func__, data.t_sample_ms, data.n_sample);
  }
+
  if (ctx) {
- llama_perf_context_print(ctx);
+ auto & data = data_ctx;
+
+ data = llama_perf_context(ctx);
+
+ const double t_end_ms = 1e-3 * ggml_time_us();
+
+ const double t_total_ms = t_end_ms - data.t_start_ms;
+ const double t_unacc_ms = t_total_ms - (t_sampling_ms + data.t_p_eval_ms + data.t_eval_ms);
+ const double t_unacc_pc = 100.0 * t_unacc_ms / t_total_ms;
+
+ LOG_INF("%s: load time = %10.2f ms\n", __func__, data.t_load_ms);
+ LOG_INF("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
+ __func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval);
+ LOG_INF("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
+ __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
+ LOG_INF("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));
+ LOG_INF("%s: unaccounted time = %10.2f ms / %5.1f %% (total - sampling - prompt eval - eval) / (total)\n", __func__, t_unacc_ms, t_unacc_pc);
+ LOG_INF("%s: graphs reused = %10d\n", __func__, data.n_reused);
+
  llama_memory_breakdown_print(ctx);
  }
  }
 
  llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
+ llama_synchronize(ctx);
+
+ // start measuring sampling time after the llama_context synchronization in order to not measure any ongoing async operations
+ const auto tm = gsmpl->tm();
+
  gsmpl->set_logits(ctx, idx);
 
  auto & grmr = gsmpl->grmr;
@@ -428,6 +480,8 @@ uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
  // helpers
 
  llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl, bool do_sort) {
+ const auto tm = gsmpl->tm();
+
  auto * res = &gsmpl->cur_p;
 
  if (do_sort && !res->sorted) {
@@ -145,26 +145,27 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
 
  include(CheckCXXSourceRuns)
 
- function(check_arm_feature tag code)
+ macro(check_arm_feature tag feature code)
  set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
  set(CMAKE_REQUIRED_FLAGS "${ARM_NATIVE_FLAG}+${tag}")
  check_cxx_source_runs("${code}" GGML_MACHINE_SUPPORTS_${tag})
  if (GGML_MACHINE_SUPPORTS_${tag})
- set(ARM_NATIVE_FLAG_FIX "${ARM_NATIVE_FLAG_FIX}+${tag}" PARENT_SCOPE)
+ set(ARM_NATIVE_FLAG_FIX "${ARM_NATIVE_FLAG_FIX}+${tag}")
  else()
  set(CMAKE_REQUIRED_FLAGS "${ARM_NATIVE_FLAG}+no${tag}")
  check_cxx_source_compiles("int main() { return 0; }" GGML_MACHINE_SUPPORTS_no${tag})
  if (GGML_MACHINE_SUPPORTS_no${tag})
- set(ARM_NATIVE_FLAG_FIX "${ARM_NATIVE_FLAG_FIX}+no${tag}" PARENT_SCOPE)
+ set(ARM_NATIVE_FLAG_FIX "${ARM_NATIVE_FLAG_FIX}+no${tag}")
+ list(APPEND ARCH_FLAGS -U__ARM_FEATURE_${feature})
  endif()
  endif()
  set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})
- endfunction()
+ endmacro()
 
- check_arm_feature(dotprod "#include <arm_neon.h>\nint main() { int8x16_t _a, _b; volatile int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }")
- check_arm_feature(i8mm "#include <arm_neon.h>\nint main() { int8x16_t _a, _b; volatile int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }")
- check_arm_feature(sve "#include <arm_sve.h>\nint main() { svfloat32_t _a, _b; volatile svfloat32_t _c = svadd_f32_z(svptrue_b8(), _a, _b); return 0; }")
- check_arm_feature(sme "#include <arm_sme.h>\n__arm_locally_streaming int main() { __asm__ volatile(\"smstart; smstop;\"); return 0; }")
+ check_arm_feature(dotprod DOTPROD "#include <arm_neon.h>\nint main() { int8x16_t _a, _b; volatile int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }")
+ check_arm_feature(i8mm MATMUL_INT8 "#include <arm_neon.h>\nint main() { int8x16_t _a, _b; volatile int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }")
+ check_arm_feature(sve SVE "#include <arm_sve.h>\nint main() { svfloat32_t _a, _b; volatile svfloat32_t _c = svadd_f32_z(svptrue_b8(), _a, _b); return 0; }")
+ check_arm_feature(sme SME "#include <arm_sme.h>\n__arm_locally_streaming int main() { __asm__ volatile(\"smstart; smstop;\"); return 0; }")
 
  list(APPEND ARCH_FLAGS "${ARM_NATIVE_FLAG}${ARM_NATIVE_FLAG_FIX}")
  else()
@@ -216,35 +217,27 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
  endif()
  endif()
 
- # show enabled features
- if (CMAKE_HOST_SYSTEM_NAME STREQUAL "Windows")
- set(FEAT_INPUT_FILE "NUL")
- else()
- set(FEAT_INPUT_FILE "/dev/null")
- endif()
+ message(STATUS "Checking for ARM features using flags:")
+ foreach(flag IN LISTS ARCH_FLAGS)
+ message(STATUS " ${flag}")
+ endforeach()
 
- execute_process(
- COMMAND ${CMAKE_C_COMPILER} ${ARCH_FLAGS} -dM -E -
- INPUT_FILE ${FEAT_INPUT_FILE}
- OUTPUT_VARIABLE ARM_FEATURE
- RESULT_VARIABLE ARM_FEATURE_RESULT
- )
- if (ARM_FEATURE_RESULT)
- message(WARNING "Failed to get ARM features")
- else()
- foreach(feature DOTPROD SVE MATMUL_INT8 FMA FP16_VECTOR_ARITHMETIC SME)
- string(FIND "${ARM_FEATURE}" "__ARM_FEATURE_${feature} 1" feature_pos)
- if (NOT ${feature_pos} EQUAL -1)
- # Special handling for MATMUL_INT8 when machine doesn't support i8mm
- if ("${feature}" STREQUAL "MATMUL_INT8" AND GGML_MACHINE_SUPPORTS_noi8mm)
- message(STATUS "ARM feature ${feature} detected but unsetting due to machine not supporting i8mm")
- list(APPEND ARCH_FLAGS -U__ARM_FEATURE_MATMUL_INT8)
- else()
- message(STATUS "ARM feature ${feature} enabled")
- endif()
- endif()
- endforeach()
- endif()
+ include(CheckCXXSourceCompiles)
+ set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
+ set(CMAKE_REQUIRED_FLAGS "${ARCH_FLAGS}")
+ foreach(feature DOTPROD SVE MATMUL_INT8 FMA FP16_VECTOR_ARITHMETIC SME)
+ set(ARM_FEATURE "HAVE_${feature}")
+ check_cxx_source_compiles(
+ "
+ #if !defined(__ARM_FEATURE_${feature})
+ # error \"Feature ${feature} is not defined\"
+ #endif
+ int main() { return 0; }
+ "
+ ${ARM_FEATURE}
+ )
+ endforeach()
+ set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})
  endif()
  elseif (GGML_SYSTEM_ARCH STREQUAL "x86")
  message(STATUS "x86 detected")
@@ -399,9 +392,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
  string(REGEX REPLACE "POWER *([0-9]+)" "\\1" EXTRACTED_NUMBER "${MATCHED_STRING}")
 
  if (EXTRACTED_NUMBER GREATER_EQUAL 10)
- list(APPEND ARCH_FLAGS -mcpu=power10 -mpowerpc64)
+ list(APPEND ARCH_FLAGS -mcpu=power10)
  elseif (EXTRACTED_NUMBER EQUAL 9)
- list(APPEND ARCH_FLAGS -mcpu=power9 -mpowerpc64)
+ list(APPEND ARCH_FLAGS -mcpu=power9)
  elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
  list(APPEND ARCH_FLAGS -mcpu=powerpc64le -mtune=native)
  else()
@@ -646,7 +646,7 @@ static void gemm_q4_b32_8x8_q8_0_lut_avx(int n, float * GGML_RESTRICT s, size_t
  __m256i requiredOrder = _mm256_set_epi32(3, 2, 1, 0, 7, 6, 5, 4);
  int64_t xstart = 0;
  int anr = nr - nr%16; // Used to align nr with boundary of 16
- #ifdef __AVX512F__
+ #if defined(__AVX512BW__) && defined(__AVX512DQ__)
  int anc = nc - nc%16; // Used to align nc with boundary of 16
  // Mask to mask out nibbles from packed bytes expanded to 512 bit length
  const __m512i m4bexpanded = _mm512_set1_epi8(0x0F);
@@ -1041,7 +1041,7 @@ static void gemm_q4_b32_8x8_q8_0_lut_avx(int n, float * GGML_RESTRICT s, size_t
  xstart = anc/8;
  y = 0;
  }
- #endif // __AVX512F__
+ #endif // __AVX512BW__ && __AVX512DQ__
 
  // Take group of four block_q8_0x4 structures at each pass of the loop and perform dot product operation
 
@@ -1989,7 +1989,7 @@ void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
  __m256i requiredOrder = _mm256_set_epi32(3, 2, 1, 0, 7, 6, 5, 4);
  int64_t xstart = 0;
  int anr = nr - nr % 16;; // Used to align nr with boundary of 16
- #ifdef __AVX512F__
+ #if defined(__AVX512BW__) && defined(__AVX512DQ__)
  int anc = nc - nc % 16; // Used to align nc with boundary of 16
  // Mask to mask out nibbles from packed bytes expanded to 512 bit length
  const __m512i m4bexpanded = _mm512_set1_epi8(0x0F);
@@ -2727,7 +2727,7 @@ void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
  xstart = anc/8;
  y = 0;
  }
- #endif //AVX512F
+ #endif // __AVX512BW__ && __AVX512DQ__
 
  // Take group of four block_q8_Kx4 structures at each pass of the loop and perform dot product operation
  for (; y < anr / 4; y += 4) {
@@ -3467,7 +3467,7 @@ void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
  __m256i scalesmask2 = _mm256_castsi128_si256(scalesmask2_sse);
  scalesmask2 = _mm256_permute2f128_si256(scalesmask2, scalesmask2, 0);
 
- #ifdef __AVX512F__
+ #if defined(__AVX512BW__) && defined(__AVX512DQ__)
 
  int anc = nc - nc % 16; // Used to align nc with boundary of 16
 
@@ -4947,7 +4947,7 @@ void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
  y = 0;
  }
 
- #endif //AVX512F
+ #endif // __AVX512BW__ && __AVX512DQ__
 
  // Take group of four block_q8_Kx4 structures at each pass of the loop and perform dot product operation
  for (; y < anr / 4; y += 4) {
@@ -39,7 +39,7 @@
 
  #include "kernels.h"
 
- #define NELEMS(x) sizeof(x) / sizeof(*x)
+ #define NELEMS(x) (sizeof(x) / sizeof(*x))
 
  template<size_t(*Fn)(size_t,size_t,size_t)>
  static inline size_t kernel_offs_fn3(size_t a, size_t b, size_t c) {
@@ -635,6 +635,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
  },
  #endif
  #endif
+ { /* Sentinel */ }
  };
 
  static ggml_kleidiai_kernels gemm_gemv_kernels_q8[] = {
@@ -803,6 +804,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels_q8[] = {
  /* .op_type = */ GGML_TYPE_F32,
  },
  #endif
+ { /* Sentinel */ }
  };
 
  ggml_kleidiai_kernels * ggml_kleidiai_select_kernels(cpu_feature cpu_features, const ggml_tensor * tensor) {
@@ -810,7 +812,7 @@ ggml_kleidiai_kernels * ggml_kleidiai_select_kernels(cpu_feature cpu_features, c
 
  if (tensor->op == GGML_OP_MUL_MAT && tensor->src[0] != nullptr && tensor->src[1] != nullptr) {
  #if defined(__ARM_FEATURE_SME) || defined(__ARM_FEATURE_DOTPROD) || defined(__ARM_FEATURE_MATMUL_INT8)
- for (size_t i = 0; i < NELEMS(gemm_gemv_kernels); ++i) {
+ for (size_t i = 0; i < NELEMS(gemm_gemv_kernels) - 1; ++i) {
  if ((cpu_features & gemm_gemv_kernels[i].required_cpu) == gemm_gemv_kernels[i].required_cpu &&
  gemm_gemv_kernels[i].lhs_type == tensor->src[1]->type &&
  gemm_gemv_kernels[i].rhs_type == tensor->src[0]->type &&
@@ -820,7 +822,7 @@ ggml_kleidiai_kernels * ggml_kleidiai_select_kernels(cpu_feature cpu_features, c
  }
  }
  if (!kernel) {
- for (size_t i = 0; i < NELEMS(gemm_gemv_kernels_q8); ++i) {
+ for (size_t i = 0; i < NELEMS(gemm_gemv_kernels_q8) - 1; ++i) {
  if ((cpu_features & gemm_gemv_kernels_q8[i].required_cpu) == gemm_gemv_kernels_q8[i].required_cpu &&
  gemm_gemv_kernels_q8[i].lhs_type == tensor->src[1]->type &&
  gemm_gemv_kernels_q8[i].rhs_type == tensor->src[0]->type &&
@@ -830,6 +832,10 @@ ggml_kleidiai_kernels * ggml_kleidiai_select_kernels(cpu_feature cpu_features, c
  }
  }
  }
+ #else
+ GGML_UNUSED(gemm_gemv_kernels);
+ GGML_UNUSED(gemm_gemv_kernels_q8);
+ GGML_UNUSED(cpu_features);
  #endif
  }
 
@@ -840,12 +846,14 @@ ggml_kleidiai_kernels * ggml_kleidiai_select_kernels_q4_0(cpu_feature features)
  ggml_kleidiai_kernels * kernels = nullptr;
 
  #if defined(__ARM_FEATURE_SME) || defined(__ARM_FEATURE_DOTPROD) || defined(__ARM_FEATURE_MATMUL_INT8)
- for (size_t i = 0; i < NELEMS(gemm_gemv_kernels); ++i) {
+ for (size_t i = 0; i < NELEMS(gemm_gemv_kernels) - 1; ++i) {
  if ((features & gemm_gemv_kernels[i].required_cpu) == gemm_gemv_kernels[i].required_cpu) {
  kernels = &gemm_gemv_kernels[i];
  break;
  }
  }
+ #else
+ GGML_UNUSED(features);
  #endif
 
  return kernels;
@@ -855,12 +863,14 @@ ggml_kleidiai_kernels * ggml_kleidiai_select_kernels_q8_0(cpu_feature features)
  ggml_kleidiai_kernels * kernels = nullptr;
 
  #if defined(__ARM_FEATURE_SME) || defined(__ARM_FEATURE_DOTPROD) || defined(__ARM_FEATURE_MATMUL_INT8)
- for (size_t i = 0; i < NELEMS(gemm_gemv_kernels_q8); ++i) {
+ for (size_t i = 0; i < NELEMS(gemm_gemv_kernels_q8) - 1; ++i) {
  if ((features & gemm_gemv_kernels_q8[i].required_cpu) == gemm_gemv_kernels_q8[i].required_cpu) {
  kernels = &gemm_gemv_kernels_q8[i];
  break;
  }
  }
+ #else
+ GGML_UNUSED(features);
  #endif
 
  return kernels;
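
The `{ /* Sentinel */ }` entries added above, together with the switch to `NELEMS(...) - 1` in the loops, keep `gemm_gemv_kernels` and `gemm_gemv_kernels_q8` from becoming zero-sized arrays when every feature-gated initializer is compiled out. A minimal sketch of the same pattern with a hypothetical table:

```cpp
#include <cstddef>
#include <cstdio>

#define NELEMS(x) (sizeof(x) / sizeof(*x))

struct kernel_info {
    int          required_cpu; // zero in the sentinel entry
    const char * name;
};

// hypothetical table mirroring the gemm_gemv_kernels pattern: entries are added
// conditionally by the preprocessor, and a trailing value-initialized sentinel
// keeps the array non-empty even when every #if block is compiled out
static kernel_info kernels[] = {
#if defined(__ARM_FEATURE_DOTPROD)
    { 1, "dotprod kernel" },
#endif
    { /* sentinel */ }
};

int main() {
    // iterate over NELEMS(...) - 1 so the sentinel is never selected
    for (size_t i = 0; i < NELEMS(kernels) - 1; ++i) {
        std::printf("candidate: %s\n", kernels[i].name);
    }
    return 0;
}
```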
@@ -9696,13 +9696,12 @@ static void ggml_compute_forward_solve_tri_f32(const struct ggml_compute_params
  for (int64_t i00 = 0; i00 < n; ++i00) {
  float sum = 0.0f;
  for (int64_t t = 0; t < i00; ++t) {
- sum += A_batch[i00 * n + t] * X_batch[i01 * n + t];
+ sum += A_batch[i00 * n + t] * X_batch[t * k + i01];
  }
 
  const float diag = A_batch[i00 * n + i00];
  GGML_ASSERT(diag != 0.0f && "Zero diagonal in triangular matrix");
-
- X_batch[i01 * n + i00] = (B_batch[i00 * k + i01] - sum) / diag;
+ X_batch[i00 * k + i01] = (B_batch[i00 * k + i01] - sum) / diag;
  }
  }
  }
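
The index fix above makes the forward substitution read and write `X` with the same row-major `[row * k + col]` layout as `B`. A small self-contained sketch of that indexing (hypothetical helper, not the ggml code), solving L·x = b for one column:

```cpp
#include <cassert>
#include <cstdio>

// forward substitution solving L * X = B for one right-hand-side column i01,
// with every matrix stored row-major: L is n x n, B and X are n x k.
// indexing X as X[row * k + col] matches B, which is what the fix above does.
static void solve_lower_tri(const float * L, const float * B, float * X,
                            int n, int k, int i01) {
    for (int i00 = 0; i00 < n; ++i00) {
        float sum = 0.0f;
        for (int t = 0; t < i00; ++t) {
            sum += L[i00 * n + t] * X[t * k + i01];
        }
        const float diag = L[i00 * n + i00];
        assert(diag != 0.0f);
        X[i00 * k + i01] = (B[i00 * k + i01] - sum) / diag;
    }
}

int main() {
    // L = [[2, 0], [1, 3]], b = [4, 7]^T  =>  x = [2, 5/3]
    const float L[] = { 2, 0,
                        1, 3 };
    const float B[] = { 4,
                        7 };
    float X[2] = { 0, 0 };
    solve_lower_tri(L, B, X, /*n=*/2, /*k=*/1, /*i01=*/0);
    std::printf("x = (%g, %g)\n", X[0], X[1]); // prints: x = (2, 1.66667)
}
```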
@@ -160,18 +160,18 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
  #define GGML_F32xt svfloat32_t
  #define GGML_F32xt_ZERO svdup_n_f32(0.0f)
  #define GGML_F32xt_SET1(x) svdup_n_f32(x)
- #define GGML_F32xt_LOAD_IMPL(pg, a, ...) svld1_f32(pg, a)
- #define GGML_F32xt_LOAD(...) GGML_F32xt_LOAD_IMPL(DEFAULT_PG, __VA_ARGS__)
- #define GGML_F32xt_STORE_IMPL(pg,a,b) svst1_f32(pg, a, b)
- #define GGML_F32xt_STORE(...) GGML_F32xt_STORE_IMPL(DEFAULT_PG, __VA_ARGS__)
+ #define GGML_F32xt_LOAD_IMPL(pg, a) svld1_f32(pg, a)
+ #define GGML_F32xt_LOAD(a) GGML_F32xt_LOAD_IMPL(DEFAULT_PG, a)
+ #define GGML_F32xt_STORE_IMPL(pg, a, b) svst1_f32(pg, a, b)
+ #define GGML_F32xt_STORE(a, b) GGML_F32xt_STORE_IMPL(DEFAULT_PG, a, b)
  #define GGML_F32xt_FMA_IMPL(pg, a, b, c) svmad_f32_m(pg, b, c, a)
- #define GGML_F32xt_FMA(...) GGML_F32xt_FMA_IMPL(DEFAULT_PG, __VA_ARGS__)
+ #define GGML_F32xt_FMA(a, b, c) GGML_F32xt_FMA_IMPL(DEFAULT_PG, a, b, c)
  #define GGML_F32xt_ADD_IMPL(pg, a, b) svadd_f32_m(pg, a, b)
- #define GGML_F32xt_ADD(...) GGML_F32xt_ADD_IMPL(DEFAULT_PG, __VA_ARGS__)
+ #define GGML_F32xt_ADD(a, b) GGML_F32xt_ADD_IMPL(DEFAULT_PG, a, b)
  #define GGML_F32xt_MUL_IMPL(pg, a, b) svmul_f32_m(pg, a, b)
- #define GGML_F32xt_MUL(...) GGML_F32xt_MUL_IMPL(DEFAULT_PG, __VA_ARGS__)
+ #define GGML_F32xt_MUL(a, b) GGML_F32xt_MUL_IMPL(DEFAULT_PG, a, b)
  #define GGML_F32xt_REDUCE_ONE_IMPL(pg, a) svaddv(pg, a)
- #define GGML_F32xt_REDUCE_ONE(...) GGML_F32xt_REDUCE_ONE_IMPL(DEFAULT_PG, __VA_ARGS__)
+ #define GGML_F32xt_REDUCE_ONE(a) GGML_F32xt_REDUCE_ONE_IMPL(DEFAULT_PG, a)
  #define GGML_F32xt_REDUCE_IMPL(pg, res, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8) \
  { \
  sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum2); \
@@ -183,7 +183,8 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
  sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum5); \
  (res) = (ggml_float) GGML_F32xt_REDUCE_ONE(sum1); \
  }
- #define GGML_F32xt_REDUCE(...) GGML_F32xt_REDUCE_IMPL(DEFAULT_PG, __VA_ARGS__)
+ #define GGML_F32xt_REDUCE(res, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8) \
+ GGML_F32xt_REDUCE_IMPL(DEFAULT_PG, res, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8)
 
  #define GGML_F32_VEC GGML_F32xt
  #define GGML_F32_VEC_ZERO GGML_F32xt_ZERO
@@ -206,11 +207,11 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
  #define GGML_F32Cxt_STORE(dst_ptr, src_vec) svst1_f16(DEFAULT_PG16, (__fp16 *)(dst_ptr), (src_vec))
 
  #define GGML_F32Cxt_FMA_IMPL(pg, a, b, c) svmad_f16_x(pg, b, c, a)
- #define GGML_F32Cxt_FMA(...) GGML_F32Cxt_FMA_IMPL(DEFAULT_PG16, __VA_ARGS__)
+ #define GGML_F32Cxt_FMA(a, b, c) GGML_F32Cxt_FMA_IMPL(DEFAULT_PG16, a, b, c)
  #define GGML_F32Cxt_ADD_IMPL(pg, a, b) svadd_f16_x(pg, a, b)
- #define GGML_F32Cxt_ADD(...) GGML_F32Cxt_ADD_IMPL(DEFAULT_PG16, __VA_ARGS__)
+ #define GGML_F32Cxt_ADD(a, b) GGML_F32Cxt_ADD_IMPL(DEFAULT_PG16, a, b)
  #define GGML_F32Cxt_MUL_IMPL(pg, a, b) svmul_f16_x(pg, a, b)
- #define GGML_F32Cxt_MUL(...) GGML_F32Cxt_MUL_IMPL(DEFAULT_PG16, __VA_ARGS__)
+ #define GGML_F32Cxt_MUL(a, b) GGML_F32Cxt_MUL_IMPL(DEFAULT_PG16, a, b)
  #define GGML_F32Cxt_REDUCE GGML_F16xt_REDUCE_MIXED
 
  #define GGML_F16x_VEC GGML_F32Cxt
@@ -224,7 +225,7 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
  #define GGML_F16x_VEC_REDUCE GGML_F32Cxt_REDUCE
 
  #define GGML_F16xt_REDUCE_ONE_IMPL(pg, a) svaddv_f16(pg, a)
- #define GGML_F16xt_REDUCE_ONE(...) GGML_F16xt_REDUCE_ONE_IMPL(DEFAULT_PG16, __VA_ARGS__)
+ #define GGML_F16xt_REDUCE_ONE(a) GGML_F16xt_REDUCE_ONE_IMPL(DEFAULT_PG16, a)
 
  #define GGML_F16xt_REDUCE_MIXED_IMPL(pg16, res, sum1, sum2, sum3, sum4) \
  { \
@@ -234,7 +235,8 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
  __fp16 sum_f16 = svaddv_f16(pg16, sum1); \
  (res) = (ggml_float) sum_f16; \
  }
- #define GGML_F16xt_REDUCE_MIXED(...) GGML_F16xt_REDUCE_MIXED_IMPL(DEFAULT_PG16, __VA_ARGS__)
+ #define GGML_F16xt_REDUCE_MIXED(res, sum1, sum2, sum3, sum4) \
+ GGML_F16xt_REDUCE_MIXED_IMPL(DEFAULT_PG16, res, sum1, sum2, sum3, sum4)
 
  // F16 NEON
 
@@ -698,60 +698,61 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
  }
 
  inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float v) {
- #if defined(GGML_SIMD)
- #if defined(__ARM_FEATURE_SVE)
- const int sve_register_length = svcntb() * 8;
- const int ggml_f16_epr = sve_register_length / 16;
- const int ggml_f16_step = 2 * ggml_f16_epr;
-
- GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v);
- const int np = (n & ~(ggml_f16_step - 1));
- svfloat16_t ay1, ay2;
-
- for (int i = 0; i < np; i += ggml_f16_step) {
- ay1 = GGML_F16x_VEC_LOAD(y + i + 0*ggml_f16_epr, 0);
- ay1 = GGML_F16x_VEC_MUL(ay1, vx);
- GGML_F16x_VEC_STORE(y + i + 0*ggml_f16_epr, ay1, 0);
-
- ay2 = GGML_F16x_VEC_LOAD(y + i + 1*ggml_f16_epr, 1);
- ay2 = GGML_F16x_VEC_MUL(ay2, vx);
- GGML_F16x_VEC_STORE(y + i + 1*ggml_f16_epr, ay2, 1);
- }
- // leftovers
- // maximum number of leftover elements will be less that ggmlF_16x_epr. Apply predicated svmad on available elements only
- if (np < n) {
- svbool_t pg = svwhilelt_b16(np, n);
- svfloat16_t hy = svld1_f16(pg, (__fp16 *)(y + np));
- svfloat16_t out = svmul_f16_m(pg, hy, vx);
- svst1_f16(pg, (__fp16 *)(y + np), out);
- }
- #elif defined(__riscv_v_intrinsic)
- // todo: RVV impl
- // scalar
- for (int i = 0; i < n; ++i) {
- y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
- }
- #else
- const int np = (n & ~(GGML_F16_STEP - 1));
+ #if defined(GGML_SIMD) && defined(__ARM_FEATURE_SVE)
+ const int sve_register_length = svcntb() * 8;
+ const int ggml_f16_epr = sve_register_length / 16;
+ const int ggml_f16_step = 2 * ggml_f16_epr;
+
+ GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v);
+ const int np = (n & ~(ggml_f16_step - 1));
+ svfloat16_t ay1, ay2;
+
+ for (int i = 0; i < np; i += ggml_f16_step) {
+ ay1 = GGML_F16x_VEC_LOAD(y + i + 0*ggml_f16_epr, 0);
+ ay1 = GGML_F16x_VEC_MUL(ay1, vx);
+ GGML_F16x_VEC_STORE(y + i + 0*ggml_f16_epr, ay1, 0);
+
+ ay2 = GGML_F16x_VEC_LOAD(y + i + 1*ggml_f16_epr, 1);
+ ay2 = GGML_F16x_VEC_MUL(ay2, vx);
+ GGML_F16x_VEC_STORE(y + i + 1*ggml_f16_epr, ay2, 1);
+ }
+ // leftovers
+ // maximum number of leftover elements will be less that ggmlF_16x_epr. Apply predicated svmad on available elements only
+ if (np < n) {
+ svbool_t pg = svwhilelt_b16(np, n);
+ svfloat16_t hy = svld1_f16(pg, (__fp16 *)(y + np));
+ svfloat16_t out = svmul_f16_m(pg, hy, vx);
+ svst1_f16(pg, (__fp16 *)(y + np), out);
+ }
+ #elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfh)
+ for (int i = 0, vl; i < n; i += vl) {
+ vl = __riscv_vsetvl_e16m2(n - i);
+ vfloat16m2_t vy = __riscv_vle16_v_f16m2((_Float16 *)&y[i], vl);
+ vfloat32m4_t vy32 = __riscv_vfwcvt_f_f_v_f32m4(vy, vl);
+ vy32 = __riscv_vfmul_vf_f32m4(vy32, v, vl);
+ vy = __riscv_vfncvt_f_f_w_f16m2(vy32, vl);
+ __riscv_vse16_v_f16m2((_Float16 *)&y[i], vy, vl);
+ }
+ #elif defined(GGML_SIMD)
+ const int np = (n & ~(GGML_F16_STEP - 1));
 
- GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
+ GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
 
- GGML_F16_VEC ay[GGML_F16_ARR];
+ GGML_F16_VEC ay[GGML_F16_ARR];
 
- for (int i = 0; i < np; i += GGML_F16_STEP) {
- for (int j = 0; j < GGML_F16_ARR; j++) {
- ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
- ay[j] = GGML_F16_VEC_MUL(ay[j], vx);
+ for (int i = 0; i < np; i += GGML_F16_STEP) {
+ for (int j = 0; j < GGML_F16_ARR; j++) {
+ ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
+ ay[j] = GGML_F16_VEC_MUL(ay[j], vx);
 
- GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
- }
+ GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
  }
+ }
 
- // leftovers
- for (int i = np; i < n; ++i) {
- y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
- }
- #endif
+ // leftovers
+ for (int i = np; i < n; ++i) {
+ y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
+ }
  #else
  // scalar
  for (int i = 0; i < n; ++i) {