@fugood/llama.node 1.0.5 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -247,6 +247,7 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
    params.cache_type_v = kv_cache_type_from_str(
        get_option<std::string>(options, "cache_type_v", "f16").c_str());
    params.ctx_shift = get_option<bool>(options, "ctx_shift", true);
+   params.kv_unified = get_option<bool>(options, "kv_unified", true);

    params.use_mlock = get_option<bool>(options, "use_mlock", false);
    params.use_mmap = get_option<bool>(options, "use_mmap", true);
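The context constructor now reads a kv_unified option (default true) alongside the existing cache and mmap flags. A minimal sketch of a JS-side options object matching the get_option<> reads above; how such an object reaches the native LlamaContext (e.g. through the package's load function) is an assumption, not shown in this diff:

// Sketch only: key names mirror the native get_option<> calls above; the
// surrounding load call is an assumption.
const contextOptions = {
  cache_type_v: "f16", // KV cache value type
  ctx_shift: true,     // allow context shifting
  kv_unified: true,    // new in 1.1.0: unified KV cache toggle
  use_mlock: false,
  use_mmap: true,
};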
@@ -904,9 +905,27 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
  // guide_tokens
  std::vector<llama_token> guide_tokens;
  if (options.Has("guide_tokens")) {
-   auto guide_tokens_array = options.Get("guide_tokens").As<Napi::Array>();
-   for (size_t i = 0; i < guide_tokens_array.Length(); i++) {
-     guide_tokens.push_back(guide_tokens_array.Get(i).ToNumber().Int32Value());
+   auto guide_tokens_value = options.Get("guide_tokens");
+   if (guide_tokens_value.IsArray()) {
+     auto guide_tokens_array = guide_tokens_value.As<Napi::Array>();
+     for (size_t i = 0; i < guide_tokens_array.Length(); i++) {
+       guide_tokens.push_back(guide_tokens_array.Get(i).ToNumber().Int32Value());
+     }
+   } else if (guide_tokens_value.IsTypedArray()) {
+     auto guide_tokens_typed_array = guide_tokens_value.As<Napi::TypedArray>();
+     if (guide_tokens_typed_array.TypedArrayType() == napi_int32_array) {
+       auto guide_tokens_int32_array = guide_tokens_value.As<Napi::Int32Array>();
+       size_t length = guide_tokens_int32_array.ElementLength();
+       const int32_t* data = guide_tokens_int32_array.Data();
+       guide_tokens.resize(length);
+       memcpy(guide_tokens.data(), data, length * sizeof(int32_t));
+     } else {
+       Napi::TypeError::New(env, "guide_tokens must be Array<number> or Int32Array").ThrowAsJavaScriptException();
+       return env.Undefined();
+     }
+   } else {
+     Napi::TypeError::New(env, "guide_tokens must be Array<number> or Int32Array").ThrowAsJavaScriptException();
+     return env.Undefined();
    }
  }
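Completion now accepts guide_tokens either as a plain number array or as an Int32Array (copied with memcpy, no per-element conversion); any other value throws the TypeError shown above. A hedged sketch of both call shapes, assuming a completion(options) method on the JS-side context object:

// Sketch: the completion() wrapper and its option names other than
// guide_tokens are assumptions; the accepted types follow from the
// IsArray()/IsTypedArray() checks above.
async function runWithGuideTokens(ctx: { completion(opts: object): Promise<unknown> }) {
  // Plain array path (Napi::Array branch)
  await ctx.completion({ prompt: "...", guide_tokens: [1, 2, 3] });

  // Typed-array path (napi_int32_array branch)
  await ctx.completion({ prompt: "...", guide_tokens: new Int32Array([1, 2, 3]) });

  // Anything else now throws:
  // "guide_tokens must be Array<number> or Int32Array"
}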
@@ -1291,14 +1310,16 @@ tts_type LlamaContext::getTTSType(Napi::Env env, nlohmann::json speaker) {
    return OUTETTS_V0_2;
  }

- // initVocoder(path: string): boolean
+ // initVocoder(params?: object): boolean
  Napi::Value LlamaContext::InitVocoder(const Napi::CallbackInfo &info) {
    Napi::Env env = info.Env();
    if (info.Length() < 1 || !info[0].IsObject()) {
-     Napi::TypeError::New(env, "Object is expected for vocoder path")
+     Napi::TypeError::New(env, "Object is expected for vocoder options")
          .ThrowAsJavaScriptException();
    }
-   auto vocoder_path = info[0].As<Napi::Object>().Get("path").ToString().Utf8Value();
+   auto options = info[0].As<Napi::Object>();
+   auto vocoder_path = options.Get("path").ToString().Utf8Value();
+   auto n_batch = get_option<int32_t>(options, "n_batch", _sess->params().n_batch);
    if (vocoder_path.empty()) {
      Napi::TypeError::New(env, "vocoder path is required")
          .ThrowAsJavaScriptException();
@@ -1314,6 +1335,7 @@ Napi::Value LlamaContext::InitVocoder(const Napi::CallbackInfo &info) {
    _vocoder.params.model.path = vocoder_path;
    _vocoder.params.embedding = true;
    _vocoder.params.ctx_shift = false;
+   _vocoder.params.n_batch = n_batch;
    _vocoder.params.n_ubatch = _vocoder.params.n_batch;
    common_init_result result = common_init_from_params(_vocoder.params);
    if (result.model == nullptr || result.context == nullptr) {
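initVocoder now takes an options object and forwards an optional n_batch (defaulting to the main context's batch size) into the vocoder params, with n_ubatch following n_batch. A sketch of the call, assuming the JS method mirrors the native signature initVocoder(params?: object): boolean shown above:

// Sketch: the "path" and "n_batch" keys are read by the native code above;
// the declared ctx shape is an assumption for illustration.
declare const ctx: { initVocoder(params: { path: string; n_batch?: number }): boolean };

const ok = ctx.initVocoder({
  path: "./vocoder.gguf", // required; an empty path throws a TypeError
  n_batch: 512,           // optional; falls back to the main context's n_batch
});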
@@ -1342,7 +1364,7 @@ Napi::Value LlamaContext::IsVocoderEnabled(const Napi::CallbackInfo &info) {
    return Napi::Boolean::New(env, _has_vocoder);
  }

- // getFormattedAudioCompletion(speaker: string|null, text: string): string
+ // getFormattedAudioCompletion(speaker: string|null, text: string): object
  Napi::Value
  LlamaContext::GetFormattedAudioCompletion(const Napi::CallbackInfo &info) {
    Napi::Env env = info.Env();
@@ -1369,9 +1391,16 @@ LlamaContext::GetFormattedAudioCompletion(const Napi::CallbackInfo &info) {
      audio_text = audio_text_from_speaker(speaker, type);
      audio_data = audio_data_from_speaker(speaker, type);
    }
-   return Napi::String::New(env, "<|im_start|>\n" + audio_text +
-                                     process_text(text, type) +
-                                     "<|text_end|>\n" + audio_data + "\n");
+   std::string prompt = "<|im_start|>\n" + audio_text +
+                        process_text(text, type) +
+                        "<|text_end|>\n" + audio_data + "\n";
+   Napi::Object result = Napi::Object::New(env);
+   result.Set("prompt", prompt);
+   const char *grammar = get_tts_grammar(type);
+   if (grammar != nullptr) {
+     result.Set("grammar", grammar);
+   }
+   return result;
  }

  // getAudioCompletionGuideTokens(text: string): Int32Array
@@ -1412,6 +1441,10 @@ LlamaContext::GetAudioCompletionGuideTokens(const Napi::CallbackInfo &info) {
    if (tmp.size() > 0) {
      result.push_back(tmp[0]);
    }
+
+   // Add Audio End, forcing stop generation
+   result.push_back(common_tokenize(vocab, "<|audio_end|>", false, true)[0]);
+
    auto tokens = Napi::Int32Array::New(env, result.size());
    memcpy(tokens.Data(), result.data(), result.size() * sizeof(int32_t));
    return tokens;
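getFormattedAudioCompletion now returns an object carrying a prompt and, when the TTS type provides one, a grammar, and the guide tokens are terminated by an explicit <|audio_end|> token. A sketch of consuming the new shape end to end; method names come from the native comments in this diff, while the declared types and completion() option names are assumptions:

// Sketch under assumed wrapper types; only the native method names are
// confirmed by the comments in this diff.
declare const ctx: {
  getFormattedAudioCompletion(speaker: string | null, text: string): { prompt: string; grammar?: string };
  getAudioCompletionGuideTokens(text: string): Int32Array;
  completion(opts: { prompt: string; grammar?: string; guide_tokens?: Int32Array }): Promise<unknown>;
};

async function speak(text: string) {
  const { prompt, grammar } = ctx.getFormattedAudioCompletion(null, text);
  const guideTokens = ctx.getAudioCompletionGuideTokens(text); // now ends with <|audio_end|>
  await ctx.completion({ prompt, grammar, guide_tokens: guideTokens });
}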
@@ -1446,7 +1479,7 @@ Napi::Value LlamaContext::DecodeAudioTokens(const Napi::CallbackInfo &info) {
          .ThrowAsJavaScriptException();
      return env.Undefined();
    }
-   if (type == OUTETTS_V0_3 || type == OUTETTS_V0_2) {
+   if (type == OUTETTS_V0_1 || type == OUTETTS_V0_2 || type == OUTETTS_V0_3) {
      tokens.erase(
          std::remove_if(tokens.begin(), tokens.end(),
                         [](llama_token t) { return t < 151672 || t > 155772; }),
@@ -1612,7 +1612,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
      [](common_params & params, const std::string & value) {
          params.antiprompt.emplace_back(value);
      }
- ).set_examples({LLAMA_EXAMPLE_MAIN}));
+ ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
  add_opt(common_arg(
      {"-sp", "--special"},
      string_format("special tokens output enabled (default: %s)", params.special ? "true" : "false"),
@@ -2655,6 +2655,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
          params.i_chunk = value;
      }
  ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
+ add_opt(common_arg(
+     {"--show-statistics"},
+     string_format("show imatrix statistics and then exit (default: %s)", params.show_statistics ? "true" : "false"),
+     [](common_params & params) {
+         params.show_statistics = true;
+     }
+ ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
  add_opt(common_arg(
      {"--parse-special"},
      string_format("prase special tokens (chat, tool, etc) (default: %s)", params.parse_special ? "true" : "false"),
@@ -433,9 +433,10 @@ struct common_params {
    int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
    int32_t i_chunk = 0; // start processing from this chunk

-   bool process_output = false; // collect data for the output tensor
-   bool compute_ppl = true; // whether to compute perplexity
-   bool parse_special = false; // whether to parse special tokens during imatrix tokenization
+   bool process_output = false; // collect data for the output tensor
+   bool compute_ppl = true; // whether to compute perplexity
+   bool show_statistics = false; // show imatrix statistics per tensor
+   bool parse_special = false; // whether to parse special tokens during imatrix tokenization

    // cvector-generator params
    int n_pca_batch = 100;
@@ -131,7 +131,7 @@ option(GGML_RVV "ggml: enable rvv" ON)
  option(GGML_RV_ZFH "ggml: enable riscv zfh" OFF)
  option(GGML_XTHEADVECTOR "ggml: enable xtheadvector" OFF)
  option(GGML_VXE "ggml: enable vxe" ON)
- option(GGML_NNPA "ggml: enable nnpa" ON)
+ option(GGML_NNPA "ggml: enable nnpa" OFF) # temp disabled by default, see: https://github.com/ggml-org/llama.cpp/issues/14877

  option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
  set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
@@ -174,6 +174,8 @@ option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental,
  option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON)
  option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF)
  option(GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 "ggml: enable rocWMMA FlashAttention on GFX12" OFF)
+ option(GGML_MUSA_GRAPHS "ggml: use MUSA graph, experimental, unstable" OFF)
+ option(GGML_MUSA_MUDNN_COPY "ggml: enable muDNN for accelerated copy" OFF)
  option(GGML_VULKAN "ggml: use Vulkan" OFF)
  option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
  option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF)
@@ -70,10 +70,12 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
    if (GGML_OPENMP)
      find_package(OpenMP)
      if (OpenMP_FOUND)
+       set(GGML_OPENMP_ENABLED "ON" CACHE INTERNAL "")
        target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_OPENMP)

        target_link_libraries(${GGML_CPU_NAME} PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
      else()
+       set(GGML_OPENMP_ENABLED "OFF" CACHE INTERNAL "")
        message(WARNING "OpenMP not found")
      endif()
    endif()
@@ -456,6 +458,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
      list(APPEND ARCH_FLAGS -march=z16)
    elseif (${S390X_M} MATCHES "9175|9176")
      # NOTE: Only available from GCC 15.1.0 onwards. Any z17 machine with compile issues must first verify their GCC version.
+     # binutils must also be updated to the latest for the -march=z17 flag to work. Otherwise, use -march=arch15.
      message(STATUS "z17 target")
      list(APPEND ARCH_FLAGS -march=z17)
    else()
@@ -494,9 +497,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)

    # Fetch KleidiAI sources:
    include(FetchContent)
-   set(KLEIDIAI_COMMIT_TAG "v1.9.0")
+   set(KLEIDIAI_COMMIT_TAG "v1.11.0")
    set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
-   set(KLEIDIAI_ARCHIVE_MD5 "2a8e1bb55d201557553545536489a017")
+   set(KLEIDIAI_ARCHIVE_MD5 "3fe9e5ab964c375c53839296eb71eaa2")

    if (POLICY CMP0135)
      cmake_policy(SET CMP0135 NEW)
@@ -544,7 +544,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
    __m128 max4 = __lsx_vfmax_s( lasx_extractf128( max_abs, 1 ), lasx_extractf128( max_abs, 0) );
    max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vpickod_d((__m128i) max4, (__m128i)max4 ) );
    __m128 tmp = max4;
-   max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vextrins_w((__m128i)tmp, (__m128i)max4, 0x10 ));
+   max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vextrins_w((__m128i)tmp, (__m128i)max4, 0x1 ));
    const float max_scalar = ((v4f32)max4)[0];

    // Quantize these floats
@@ -22,9 +22,94 @@

  #include "kai_common.h"

+ #include "simd-mappings.h"
+
  #include "kernels.h"

  #define NELEMS(x) sizeof(x) / sizeof(*x)
+
+ static const size_t INT4_PER_BYTE = 2;
+ static const size_t INT4_BITS = 4;
+ static const int Q4_0_ZERO_POINT = 8;
+ const size_t INT4_PER_UINT16 = 4;
+
+ static void dequantize_row_qsi4c32pscalef16(
+     const void *packed_data,
+     int32_t row_idx,
+     int64_t nc,
+     float *out,
+     size_t nr_pack,
+     size_t packed_row_stride,
+     size_t kr,
+     size_t bl,
+     size_t num_bytes_multiplier
+ ) {
+     size_t group_idx = row_idx / nr_pack;
+     size_t row_in_group = row_idx % nr_pack;
+     const uint8_t *packed_group = (const uint8_t *)packed_data + group_idx * packed_row_stride;
+     size_t num_blocks = nc / bl;
+     const uint8_t *block_ptr = packed_group;
+
+     for (size_t b = 0; b < num_blocks; ++b) {
+         uint16_t scale_f16 = *((const uint16_t *)(block_ptr + row_in_group * num_bytes_multiplier));
+         float scale = GGML_CPU_FP16_TO_FP32(scale_f16);
+
+         const uint8_t *segment_ptr = block_ptr + nr_pack * num_bytes_multiplier;
+         size_t num_segments = bl / kr;
+         size_t num_bytes_per_segment = kr / INT4_PER_BYTE;
+
+         for (size_t s = 0; s < num_segments; ++s) {
+             const uint8_t *seg_base = segment_ptr + s * nr_pack * num_bytes_per_segment;
+             const uint8_t *qbytes = seg_base + row_in_group * num_bytes_per_segment;
+             for (size_t k = 0; k < num_bytes_per_segment; ++k) {
+                 uint8_t byte = qbytes[k] ^ 0x88;
+                 int x0 = (byte & 0x0F) - Q4_0_ZERO_POINT;
+                 int x1 = (byte >> INT4_BITS) - Q4_0_ZERO_POINT;
+                 out[b * bl + s * num_bytes_per_segment + k] = x0 * scale;
+                 out[b * bl + s * num_bytes_per_segment + k + bl/2] = x1 * scale;
+             }
+         }
+         block_ptr += nr_pack * num_bytes_multiplier + num_segments * nr_pack * num_bytes_per_segment;
+     }
+ }
+
+ static void dequantize_row_qsi4c32ps1s0scalef16(
+     const void *packed_data,
+     int32_t row_idx,
+     int64_t k,
+     float *out,
+     size_t nr,
+     size_t packed_row_stride,
+     size_t kr,
+     size_t bl,
+     size_t num_bytes_multiplier
+ ) {
+     const size_t num_blocks = k / bl;
+     const size_t bl4 = bl / INT4_PER_UINT16;
+
+     size_t group_idx = row_idx / nr;
+     size_t row_in_group = row_idx % nr;
+
+     const uint8_t *packed_group = (const uint8_t *)packed_data + group_idx * packed_row_stride;
+     const uint16_t *qdata = (const uint16_t *)packed_group;
+     const uint16_t *scales = (const uint16_t *)(packed_group + packed_row_stride - (nr * num_blocks * num_bytes_multiplier));
+
+     for (size_t block_idx = 0; block_idx < num_blocks; ++block_idx) {
+         uint16_t scale_f16 = scales[row_in_group + block_idx * nr];
+         float scale = GGML_CPU_FP16_TO_FP32(scale_f16);
+
+         for (size_t bl4_idx = 0; bl4_idx < bl4; ++bl4_idx) {
+             uint16_t q = qdata[(block_idx * bl4 + bl4_idx) * nr + row_in_group];
+
+             for (size_t qidx = 0; qidx < INT4_PER_UINT16; ++qidx) {
+                 int v = ((q >> (qidx * 4)) & 0xF) - Q4_0_ZERO_POINT;
+                 out[block_idx * bl + bl4_idx * INT4_BITS + qidx] = v * scale;
+             }
+         }
+     }
+     GGML_UNUSED(kr);
+ }
+
  static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
  #if defined(__ARM_FEATURE_SME)
    {
@@ -63,8 +148,10 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
        /* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32_neon,
    },
    /* .rhs_info = */ {
-       /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon,
-       /* .pack_func = */ kai_run_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon,
+       /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon,
+       /* .packed_stride = */ kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon,
+       /* .pack_func = */ kai_run_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon,
+       /* .to_float = */ dequantize_row_qsi4c32ps1s0scalef16,
    },
    /* .required_cpu = */ CPU_FEATURE_SME,
    /* .lhs_type = */ GGML_TYPE_F32,
@@ -107,8 +194,10 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
        /* .pack_func = */ kai_run_lhs_pack_bf16p2vlx2_f32_sme,
    },
    /* .rhs_info = */ {
-       /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme,
-       /* .pack_func = */ kai_run_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme,
+       /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme,
+       /* .packed_stride = */ NULL,
+       /* .pack_func = */ kai_run_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme,
+       /* .to_float = */ NULL,
    },
    /* .required_cpu = */ CPU_FEATURE_SME,
    /* .lhs_type = */ GGML_TYPE_F32,
@@ -154,8 +243,10 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
        /* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32,
    },
    /* .rhs_info = */ {
-       /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
-       /* .pack_func = */ kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
+       /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
+       /* .packed_stride = */ kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
+       /* .pack_func = */ kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
+       /* .to_float = */ dequantize_row_qsi4c32pscalef16,
    },
    /* .required_cpu = */ CPU_FEATURE_DOTPROD,
    /* .lhs_type = */ GGML_TYPE_F32,
@@ -200,8 +291,10 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
        /* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32,
    },
    /* .rhs_info = */ {
-       /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
-       /* .pack_func = */ kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
+       /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
+       /* .packed_stride = */ kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
+       /* .pack_func = */ kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
+       /* .to_float = */ dequantize_row_qsi4c32pscalef16,
    },
    /* .required_cpu = */ CPU_FEATURE_DOTPROD | CPU_FEATURE_I8MM,
    /* .lhs_type = */ GGML_TYPE_F32,
@@ -247,8 +340,10 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
        /* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32,
    },
    /* .rhs_info = */ {
-       /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
-       /* .pack_func = */ kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
+       /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
+       /* .packed_stride = */ kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
+       /* .pack_func = */ kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
+       /* .to_float = */ dequantize_row_qsi4c32pscalef16,
    },
    /* .required_cpu = */ CPU_FEATURE_DOTPROD | CPU_FEATURE_I8MM,
    /* .lhs_type = */ GGML_TYPE_F32,
@@ -293,8 +388,10 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
        /* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32,
    },
    /* .rhs_info = */ {
-       /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
-       /* .pack_func = */ kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
+       /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
+       /* .packed_stride = */ kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
+       /* .pack_func = */ kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
+       /* .to_float = */ dequantize_row_qsi4c32pscalef16,
    },
    /* .required_cpu = */ CPU_FEATURE_DOTPROD,
    /* .lhs_type = */ GGML_TYPE_F32,
@@ -71,12 +71,15 @@ struct rhs_packing_info {
        std::function<size_t(size_t n, size_t k, size_t nr, size_t kr, size_t bl)>,
        std::function<size_t(size_t n, size_t k)>
    > packed_size;
+   size_t (*packed_stride)(size_t k, size_t nr, size_t kr, size_t bl);
    std::variant<
        std::function<void(size_t num_groups, size_t n, size_t k, size_t nr, size_t kr, size_t sr, size_t bl, const uint8_t* rhs,
                           const float* bias, void* rhs_packed, size_t extra_bytes, const struct kai_rhs_pack_qs4cxs1s0_param* params)>,
        std::function<void(size_t num_groups, size_t n, size_t k, size_t nr, size_t kr, size_t sr, size_t rhs_stride, const void* rhs,
                           const void* bias, const void* scale, void* rhs_packed, size_t extra_bytes, const void* params)>
    > pack_func;
+   void (*to_float)(const void *packed_data, int32_t row_idx, int64_t nc, float *out, size_t nr_pack, size_t packed_row_stride,
+                    size_t kr, size_t bl, size_t num_bytes_multiplier);
  };

  struct ggml_kleidiai_kernels {
@@ -40,6 +40,17 @@ struct ggml_kleidiai_context {
    ggml_kleidiai_kernels * kernels;
  } static ctx = { CPU_FEATURE_NONE, NULL };

+ static const char* cpu_feature_to_string(cpu_feature f) {
+     switch (f) {
+         case CPU_FEATURE_NONE: return "NONE";
+         case CPU_FEATURE_DOTPROD: return "DOTPROD";
+         case CPU_FEATURE_I8MM: return "I8MM";
+         case CPU_FEATURE_SVE: return "SVE";
+         case CPU_FEATURE_SME: return "SME";
+         default: return "UNKNOWN";
+     }
+ }
+
  static void init_kleidiai_context(void) {

      ggml_critical_section_start();
@@ -62,6 +73,11 @@ static void init_kleidiai_context(void) {
          ctx.features |= ggml_cpu_has_sme() ? CPU_FEATURE_SME : CPU_FEATURE_NONE;
      }
      ctx.kernels = ggml_kleidiai_select_kernels_q4_0(ctx.features);
+ #ifndef NDEBUG
+     if (ctx.kernels) {
+         GGML_LOG_DEBUG("kleidiai: using kernel with CPU feature %s\n", cpu_feature_to_string(ctx.kernels->required_cpu));
+     }
+ #endif
  }
  ggml_critical_section_end();
}
@@ -102,6 +118,9 @@ static void transpose_f32kxn_f16nxk(size_t n, size_t k, float * dst, const uint1

  class tensor_traits : public ggml::cpu::tensor_traits {
      bool work_size(int /* n_threads */, const struct ggml_tensor * op, size_t & size) override {
+         if (op->op != GGML_OP_MUL_MAT) {
+             return false;
+         }
          ggml_kleidiai_kernels *kernels = ggml_kleidiai_select_kernels(ctx.features, op);
          GGML_ASSERT(kernels);
          kernel_info * kernel = op->src[1]->ne[1] == 1 ? &kernels->gemv : &kernels->gemm;
@@ -135,6 +154,10 @@ class tensor_traits : public ggml::cpu::tensor_traits {
        } else if (dst->src[0]->type == GGML_TYPE_F16) {
            return compute_forward_kv_cache(params, dst);
        }
+   } else if (dst->op == GGML_OP_GET_ROWS) {
+       if (dst->src[0]->type == GGML_TYPE_Q4_0) {
+           return compute_forward_get_rows(params, dst);
+       }
    }
    return false;
  }
@@ -270,6 +293,8 @@ class tensor_traits : public ggml::cpu::tensor_traits {
  }

  bool compute_forward_q4_0(struct ggml_compute_params * params, struct ggml_tensor * dst) {
+     GGML_ASSERT(dst->src[0]->type == GGML_TYPE_Q4_0);
+
      const ggml_tensor * src0 = dst->src[0];
      const ggml_tensor * src1 = dst->src[1];

@@ -342,8 +367,49 @@ class tensor_traits : public ggml::cpu::tensor_traits {
      return true;
  }

+ bool compute_forward_get_rows(struct ggml_compute_params * params, struct ggml_tensor * dst) {
+     GGML_ASSERT(dst->src[0]->type == GGML_TYPE_Q4_0);
+     GGML_ASSERT(ctx.kernels);
+
+     const ggml_tensor * src0 = dst->src[0];
+     const ggml_tensor * src1 = dst->src[1];
+
+     GGML_TENSOR_BINARY_OP_LOCALS
+
+     rhs_packing_info * rhs_info = &ctx.kernels->rhs_info;
+     kernel_info * kernel = &ctx.kernels->gemm;
+
+     const int64_t nc = ne00;
+     const int64_t nr = ggml_nelements(src1);
+
+     const size_t block_rows = kernel->get_nr();
+     const size_t kr = kernel->get_kr();
+
+     const size_t num_bytes_multiplier = sizeof(uint16_t);
+     const size_t packed_stride = rhs_info->packed_stride(nc, block_rows, kr, QK4_0);
+
+     const int ith = params->ith;
+     const int nth = params->nth;
+
+     const int dr = (nr + nth - 1) / nth;
+     const int ir0 = dr * ith;
+     const int ir1 = MIN(ir0 + dr, nr);
+
+     for (int64_t i = ir0; i < ir1; ++i) {
+         GGML_ASSERT(src1->type == GGML_TYPE_I32);
+         int64_t row_idx = ((const int32_t *)src1->data)[i];
+         GGML_ASSERT(row_idx >= 0 && row_idx < src0->ne[1]);
+
+         float *out = (float *)((char *)dst->data + i * nb1);
+         rhs_info->to_float(src0->data, row_idx, nc, out, block_rows, packed_stride, kr, QK4_0, num_bytes_multiplier);
+     }
+
+     return true;
+ }
+
  public:
  int repack(struct ggml_tensor * tensor, const void * data, size_t data_size) {
+     GGML_ASSERT(tensor->type == GGML_TYPE_Q4_0);
      GGML_ASSERT(ctx.kernels);
      const size_t n = tensor->ne[1];
      const size_t k = tensor->ne[0];
@@ -351,17 +417,12 @@ public:
      size_t kr = ctx.kernels->gemm.get_kr();
      size_t sr = ctx.kernels->gemm.get_sr();

-     #ifndef NDEBUG
-     const size_t repacked_size = variant_call<size_t>(ctx.kernels->rhs_info.packed_size, n, k, nr, kr, QK4_0);
-     GGML_ASSERT(repacked_size <= data_size && "repacked size larger than the packed size!");
-     #endif
      struct kai_rhs_pack_qs4cxs1s0_param params;
      params.lhs_zero_point = 1;
      params.rhs_zero_point = 8;
      variant_call<void>(ctx.kernels->rhs_info.pack_func, 1, n, k, nr, kr, sr, QK4_0, (const uint8_t*)data, nullptr, tensor->data, 0, &params);

      return 0;
-
      GGML_UNUSED(data_size);
  }
  };
@@ -375,8 +436,8 @@ static ggml::cpu::tensor_traits * get_tensor_traits(ggml_backend_buffer_t, struc
  static enum ggml_status ggml_backend_cpu_kleidiai_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
      tensor->extra = (void *) ggml::cpu::kleidiai::get_tensor_traits(buffer, tensor);

-     GGML_UNUSED(buffer);
      return GGML_STATUS_SUCCESS;
+     GGML_UNUSED(buffer);
  }

  static void ggml_backend_cpu_kleidiai_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
@@ -418,18 +479,35 @@ static size_t ggml_backend_cpu_kleidiai_buffer_type_get_alignment(ggml_backend_b
      GGML_UNUSED(buft);
  }

+ static size_t ggml_backend_cpu_kleidiai_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor) {
+     GGML_ASSERT(tensor->type == GGML_TYPE_Q4_0);
+     GGML_ASSERT(ctx.kernels);
+
+     const size_t n = tensor->ne[1];
+     const size_t k = tensor->ne[0];
+     const size_t nr = ctx.kernels->gemm.get_nr();
+     const size_t kr = ctx.kernels->gemm.get_kr();
+
+     return variant_call<size_t>(ctx.kernels->rhs_info.packed_size, n, k, nr, kr, QK4_0);
+
+     GGML_UNUSED(buft);
+ }
+
  namespace ggml::cpu::kleidiai {
  class extra_buffer_type : ggml::cpu::extra_buffer_type {
      bool supports_op(ggml_backend_dev_t, const struct ggml_tensor * op) override {
-         if (op->op == GGML_OP_MUL_MAT &&
+         if ((op->op == GGML_OP_MUL_MAT || op->op == GGML_OP_GET_ROWS) &&
              op->src[0]->type == GGML_TYPE_Q4_0 &&
              op->src[0]->buffer &&
              (ggml_n_dims(op->src[0]) == 2) &&
              op->src[0]->buffer->buft == ggml_backend_cpu_kleidiai_buffer_type() && ctx.kernels) {
+             if (op->op == GGML_OP_GET_ROWS && op->src[1]->ne[0] != 8) {
+                 return false;
+             }
              if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
                  return false;
              }
-             if (op->src[1]->type == GGML_TYPE_F32 &&
+             if ((op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == GGML_TYPE_I32) &&
                  ggml_ne(op->src[1], 2) == 1 && ggml_ne(op->src[1], 3) == 1) {
                  return true;
              }
@@ -438,7 +516,7 @@ class extra_buffer_type : ggml::cpu::extra_buffer_type {
      }

      ggml::cpu::tensor_traits * get_tensor_traits(const struct ggml_tensor * op) override {
-         if (op->op == GGML_OP_MUL_MAT) {
+         if (op->op == GGML_OP_MUL_MAT || op->op == GGML_OP_GET_ROWS) {
              if (op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_cpu_kleidiai_buffer_type()) {
                  return (ggml::cpu::tensor_traits *) op->src[0]->extra;
              }
@@ -469,7 +547,7 @@ ggml_backend_buffer_type_t ggml_backend_cpu_kleidiai_buffer_type(void) {
              /* .alloc_buffer   = */ ggml_backend_cpu_kleidiai_buffer_type_alloc_buffer,
              /* .get_alignment  = */ ggml_backend_cpu_kleidiai_buffer_type_get_alignment,
              /* .get_max_size   = */ nullptr, // defaults to SIZE_MAX
-             /* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes
+             /* .get_alloc_size = */ ggml_backend_cpu_kleidiai_buffer_type_get_alloc_size,
              /* .is_host        = */ nullptr,
          },
          /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
@@ -14,7 +14,6 @@
  #include <cmath>
  #include <cstring>
  #include <cassert>
- #include <cstdlib> // for qsort
  #include <cstdio> // for GGML_ASSERT

  #include "repack.h"
@@ -956,6 +956,7 @@ extern "C" {
      // in the order they have appeared in the batch.
      // Rows: number of tokens for which llama_batch.logits[i] != 0
      // Cols: n_vocab
+     // TODO: deprecate in favor of llama_get_logits_ith() (ref: https://github.com/ggml-org/llama.cpp/pull/14853#issuecomment-3113143522)
      LLAMA_API float * llama_get_logits(struct llama_context * ctx);

      // Logits for the ith token. For positive indices, Equivalent to:
@@ -970,6 +971,7 @@ extern "C" {
      // in the order they have appeared in the batch.
      // shape: [n_outputs*n_embd]
      // Otherwise, returns NULL.
+     // TODO: deprecate in favor of llama_get_embeddings_ith() (ref: https://github.com/ggml-org/llama.cpp/pull/14853#issuecomment-3113143522)
      LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);

      // Get the embeddings for the ith token. For positive indices, Equivalent to:
@@ -1933,12 +1933,6 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
          { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
      }
  },
- {
-     LLM_ARCH_UNKNOWN,
-     {
-         { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
-     },
- },
  {
      LLM_ARCH_DREAM,
      {
@@ -1956,6 +1950,12 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
          { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
      },
  },
+ {
+     LLM_ARCH_UNKNOWN,
+     {
+         { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+     },
+ },
};

static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
@@ -718,10 +718,9 @@ int32_t llm_chat_apply_template(
          }

          ss << message->content << "<|im_end|>";
-
-         if (add_ass) {
-             ss << "<|im_assistant|>assistant<|im_middle|>";
-         }
+     }
+     if (add_ass) {
+         ss << "<|im_assistant|>assistant<|im_middle|>";
      }
  } else {
      // template not supported