cui-llama.rn 1.3.4 → 1.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/cpp/common.cpp CHANGED
@@ -1105,7 +1105,7 @@ struct lm_ggml_threadpool_params lm_ggml_threadpool_params_from_cpu_params(const
 #define CURL_MAX_RETRY 3
 #define CURL_RETRY_DELAY_SECONDS 2
 
-static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_attempts, int retry_delay_seconds) {
+static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds) {
     int remaining_attempts = max_attempts;
 
     while (remaining_attempts > 0) {
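The signature cleanup above is purely stylistic; the helper still retries curl_easy_perform a fixed number of times with a delay between attempts. A minimal self-contained sketch of that retry pattern, with simplified logging and a fixed delay rather than the package's exact implementation:

    #include <chrono>
    #include <cstdio>
    #include <string>
    #include <thread>

    #include <curl/curl.h>

    // Retry a prepared CURL handle up to max_attempts times, sleeping between attempts.
    static bool curl_retry_sketch(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds) {
        int remaining_attempts = max_attempts;
        while (remaining_attempts > 0) {
            std::fprintf(stderr, "downloading %s (attempts left: %d)\n", url.c_str(), remaining_attempts);
            if (curl_easy_perform(curl) == CURLE_OK) {
                return true;
            }
            remaining_attempts--;
            if (remaining_attempts > 0) {
                std::this_thread::sleep_for(std::chrono::seconds(retry_delay_seconds));
            }
        }
        std::fprintf(stderr, "failed to download %s after %d attempts\n", url.c_str(), max_attempts);
        return false;
    }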
@@ -1129,7 +1129,6 @@ static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_
 }
 
 static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
-
     // Initialize libcurl
     std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
     if (!curl) {
@@ -1202,11 +1201,13 @@ static bool common_download_file(const std::string & url, const std::string & pa
         std::string etag;
         std::string last_modified;
     };
+
     common_load_model_from_url_headers headers;
+
     {
         typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
         auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
-            common_load_model_from_url_headers *headers = (common_load_model_from_url_headers *) userdata;
+            common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
 
             static std::regex header_regex("([^:]+): (.*)\r\n");
             static std::regex etag_regex("ETag", std::regex_constants::icase);
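For readers unfamiliar with this part of common_download_file: the lambda above is installed as a libcurl header callback and picks the ETag and Last-Modified values out of the response headers with std::regex. A hedged, self-contained sketch of the same idea (the struct and function names here are invented for illustration):

    #include <regex>
    #include <string>

    #include <curl/curl.h>

    struct url_headers {
        std::string etag;
        std::string last_modified;
    };

    // Parses one "Key: Value\r\n" header line per call and records ETag / Last-Modified.
    static size_t header_callback_sketch(char * buffer, size_t /*size*/, size_t n_items, void * userdata) {
        url_headers * headers = (url_headers *) userdata;

        static std::regex header_regex("([^:]+): (.*)\r\n");
        static std::regex etag_regex("ETag", std::regex_constants::icase);
        static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase);

        std::string header(buffer, n_items);
        std::smatch match;
        if (std::regex_match(header, match, header_regex)) {
            const std::string & key   = match[1];
            const std::string & value = match[2];
            if (std::regex_match(key, etag_regex)) {
                headers->etag = value;
            } else if (std::regex_match(key, last_modified_regex)) {
                headers->last_modified = value;
            }
        }
        return n_items; // tell libcurl the whole line was consumed
    }

    // Installed on the handle roughly like this:
    //   curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, header_callback_sketch);
    //   curl_easy_setopt(curl, CURLOPT_HEADERDATA, &headers);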
@@ -1790,7 +1791,9 @@ void common_embd_normalize(const float * inp, float * out, int n, int embd_norm)
             break;
         case 0: // max absolute
             for (int i = 0; i < n; i++) {
-                if (sum < std::abs(inp[i])) sum = std::abs(inp[i]);
+                if (sum < std::abs(inp[i])) {
+                    sum = std::abs(inp[i]);
+                }
             }
             sum /= 32760.0; // make an int16 range
             break;
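For context, the "max absolute" branch above rescales the embedding so its largest component maps to roughly the int16 range. A standalone sketch of just that branch (the real common_embd_normalize also handles the other embd_norm values):

    #include <cmath>

    static void embd_normalize_max_abs(const float * inp, float * out, int n) {
        double sum = 0.0;
        for (int i = 0; i < n; i++) {
            if (sum < std::abs(inp[i])) {
                sum = std::abs(inp[i]);
            }
        }
        sum /= 32760.0; // make an int16 range

        const float norm = sum > 0.0 ? (float) (1.0 / sum) : 0.0f;
        for (int i = 0; i < n; i++) {
            out[i] = inp[i] * norm;
        }
    }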
package/cpp/common.h CHANGED
@@ -91,6 +91,7 @@ enum llama_example {
     LLAMA_EXAMPLE_LLAVA,
     LLAMA_EXAMPLE_LOOKUP,
     LLAMA_EXAMPLE_PARALLEL,
+    LLAMA_EXAMPLE_TTS,
 
     LLAMA_EXAMPLE_COUNT,
 };
@@ -170,6 +171,7 @@ struct common_params_sampling {
 
 struct common_params_speculative {
     std::vector<lm_ggml_backend_dev_t> devices; // devices to use for offloading
+
     int32_t n_ctx = 0; // draft context size
     int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
     int32_t n_min = 5; // minimum number of draft tokens to use for speculative decoding
@@ -183,6 +185,14 @@ struct common_params_speculative {
     std::string model = ""; // draft model for speculative decoding // NOLINT
 };
 
+struct common_params_vocoder {
+    std::string hf_repo = ""; // HF repo // NOLINT
+    std::string hf_file = ""; // HF file // NOLINT
+
+    std::string model = ""; // model path // NOLINT
+    std::string model_url = ""; // model url to download // NOLINT
+};
+
 struct common_params {
 
     void * progress_callback_user_data = nullptr;
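The new common_params_vocoder block mirrors the other model-source fields: the vocoder can come from a local path or be downloaded from Hugging Face. An illustrative snippet using only the field names declared above (the path and repo values are made-up placeholders):

    common_params params;

    // either point at a local GGUF file ...
    params.vocoder.model = "models/vocoder.gguf";            // hypothetical path

    // ... or let the loader fetch it from Hugging Face
    params.vocoder.hf_repo = "example-org/example-vocoder";  // hypothetical repo
    params.vocoder.hf_file = "vocoder-q8_0.gguf";            // hypothetical file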
@@ -229,8 +239,9 @@ struct common_params {
     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
     enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
 
-    struct common_params_sampling sampling;
+    struct common_params_sampling    sampling;
     struct common_params_speculative speculative;
+    struct common_params_vocoder     vocoder;
 
     std::string model = ""; // model path // NOLINT
     std::string model_alias = ""; // model alias // NOLINT
@@ -611,7 +622,8 @@ void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_si
 // Embedding utils
 //
 
-void common_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);
+// TODO: repace embd_norm with an enum
+void common_embd_normalize(const float * inp, float * out, int n, int embd_norm);
 
 float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);
 
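Note that dropping the `= 2` default from common_embd_normalize means existing callers must now spell the norm out; passing 2 keeps the previous (Euclidean) behaviour. Illustrative call site, with made-up variable names:

    // 1.3.4: common_embd_normalize(emb, emb_out, n_embd);   // implicit Euclidean norm
    // 1.3.5: the norm argument is mandatory
    common_embd_normalize(emb, emb_out, n_embd, 2);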
package/cpp/ggml-alloc.c CHANGED
@@ -534,7 +534,6 @@ static void lm_ggml_gallocr_allocate_node(lm_ggml_gallocr_t galloc, struct lm_gg
             size_t offset = lm_ggml_dyn_tallocr_alloc(alloc, size, node);
             hn->buffer_id = buffer_id;
             hn->offset = offset;
-            return;
         }
     }
 
@@ -66,6 +66,26 @@
 #include "ggml-kompute.h"
 #endif
 
+// disable C++17 deprecation warning for std::codecvt_utf8
+#if defined(__clang__)
+#    pragma clang diagnostic push
+#    pragma clang diagnostic ignored "-Wdeprecated-declarations"
+#endif
+
+static std::wstring utf8_to_utf16(const std::string & str) {
+    std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
+    return converter.from_bytes(str);
+}
+
+static std::string utf16_to_utf8(const std::wstring & str) {
+    std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
+    return converter.to_bytes(str);
+}
+
+#if defined(__clang__)
+#    pragma clang diagnostic pop
+#endif
+
 #ifdef _WIN32
 
 using dl_handle = std::remove_pointer_t<HMODULE>;
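These helpers are thin wrappers over std::wstring_convert with std::codecvt_utf8_utf16, which still works but has been deprecated since C++17 (hence the clang pragma). A minimal self-contained usage sketch, independent of this package:

    #include <codecvt>
    #include <cstdio>
    #include <locale>
    #include <string>

    int main() {
        std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;

        std::wstring wide   = converter.from_bytes("ggml-backend"); // UTF-8  -> UTF-16
        std::string  narrow = converter.to_bytes(wide);             // UTF-16 -> UTF-8

        std::printf("%s (%zu wide chars)\n", narrow.c_str(), wide.size());
        return 0;
    }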
@@ -88,11 +108,6 @@ static dl_handle * dl_load_library(const std::wstring & path) {
     return handle;
 }
 
-static dl_handle * dl_load_library(const std::string & path) {
-    std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
-    return dl_load_library(converter.from_bytes(path));
-}
-
 static void * dl_get_sym(dl_handle * handle, const char * name) {
     DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
     SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
@@ -114,8 +129,8 @@ struct dl_handle_deleter {
     }
 };
 
-static void * dl_load_library(const std::string & path) {
-    dl_handle * handle = dlopen(path.c_str(), RTLD_NOW | RTLD_LOCAL);
+static void * dl_load_library(const std::wstring & path) {
+    dl_handle * handle = dlopen(utf16_to_utf8(path).c_str(), RTLD_NOW | RTLD_LOCAL);
 
     return handle;
 }
@@ -202,11 +217,11 @@ struct lm_ggml_backend_registry {
         devices.push_back(device);
     }
 
-    lm_ggml_backend_reg_t load_backend(const char * path, bool silent) {
+    lm_ggml_backend_reg_t load_backend(const std::wstring & path, bool silent) {
         dl_handle_ptr handle { dl_load_library(path) };
         if (!handle) {
             if (!silent) {
-                LM_GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path);
+                LM_GGML_LOG_ERROR("%s: failed to load %s\n", __func__, utf16_to_utf8(path).c_str());
             }
             return nullptr;
         }
@@ -214,7 +229,7 @@ struct lm_ggml_backend_registry {
         auto score_fn = (lm_ggml_backend_score_t) dl_get_sym(handle.get(), "lm_ggml_backend_score");
         if (score_fn && score_fn() == 0) {
             if (!silent) {
-                LM_GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, path);
+                LM_GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, utf16_to_utf8(path).c_str());
             }
             return nullptr;
         }
@@ -222,7 +237,7 @@ struct lm_ggml_backend_registry {
         auto backend_init_fn = (lm_ggml_backend_init_t) dl_get_sym(handle.get(), "lm_ggml_backend_init");
         if (!backend_init_fn) {
             if (!silent) {
-                LM_GGML_LOG_ERROR("%s: failed to find lm_ggml_backend_init in %s\n", __func__, path);
+                LM_GGML_LOG_ERROR("%s: failed to find lm_ggml_backend_init in %s\n", __func__, utf16_to_utf8(path).c_str());
             }
             return nullptr;
         }
@@ -231,16 +246,16 @@ struct lm_ggml_backend_registry {
         if (!reg || reg->api_version != LM_GGML_BACKEND_API_VERSION) {
             if (!silent) {
                 if (!reg) {
-                    LM_GGML_LOG_ERROR("%s: failed to initialize backend from %s: lm_ggml_backend_init returned NULL\n", __func__, path);
+                    LM_GGML_LOG_ERROR("%s: failed to initialize backend from %s: lm_ggml_backend_init returned NULL\n", __func__, utf16_to_utf8(path).c_str());
                 } else {
                     LM_GGML_LOG_ERROR("%s: failed to initialize backend from %s: incompatible API version (backend: %d, current: %d)\n",
-                        __func__, path, reg->api_version, LM_GGML_BACKEND_API_VERSION);
+                        __func__, utf16_to_utf8(path).c_str(), reg->api_version, LM_GGML_BACKEND_API_VERSION);
                 }
             }
             return nullptr;
         }
 
-        LM_GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, lm_ggml_backend_reg_name(reg), path);
+        LM_GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, lm_ggml_backend_reg_name(reg), utf16_to_utf8(path).c_str());
 
         register_backend(reg, std::move(handle));
 
@@ -376,14 +391,14 @@ lm_ggml_backend_t lm_ggml_backend_init_best(void) {
 
 // Dynamic loading
 lm_ggml_backend_reg_t lm_ggml_backend_load(const char * path) {
-    return get_reg().load_backend(path, false);
+    return get_reg().load_backend(utf8_to_utf16(path), false);
 }
 
 void lm_ggml_backend_unload(lm_ggml_backend_reg_t reg) {
     get_reg().unload_backend(reg, true);
 }
 
-static std::string get_executable_path() {
+static std::wstring get_executable_path() {
 #if defined(__APPLE__)
     // get executable path
     std::vector<char> path;
@@ -401,13 +416,17 @@ static std::string get_executable_path() {
     if (last_slash != std::string::npos) {
         base_path = base_path.substr(0, last_slash);
     }
-    return base_path + "/";
-#elif defined(__linux__)
+    return utf8_to_utf16(base_path + "/");
+#elif defined(__linux__) || defined(__FreeBSD__)
     std::string base_path = ".";
     std::vector<char> path(1024);
     while (true) {
         // get executable path
+#    if defined(__linux__)
         ssize_t len = readlink("/proc/self/exe", path.data(), path.size());
+#    elif defined(__FreeBSD__)
+        ssize_t len = readlink("/proc/curproc/file", path.data(), path.size());
+#    endif
         if (len == -1) {
             break;
         }
@@ -423,57 +442,63 @@ static std::string get_executable_path() {
         path.resize(path.size() * 2);
     }
 
-    return base_path + "/";
+    return utf8_to_utf16(base_path + "/");
 #elif defined(_WIN32)
-    std::vector<char> path(MAX_PATH);
-    DWORD len = GetModuleFileNameA(NULL, path.data(), path.size());
+    std::vector<wchar_t> path(MAX_PATH);
+    DWORD len = GetModuleFileNameW(NULL, path.data(), path.size());
     if (len == 0) {
-        return "";
+        return {};
     }
-    std::string base_path(path.data(), len);
+    std::wstring base_path(path.data(), len);
     // remove executable name
     auto last_slash = base_path.find_last_of('\\');
     if (last_slash != std::string::npos) {
         base_path = base_path.substr(0, last_slash);
     }
-    return base_path + "\\";
+    return base_path + L"\\";
+#else
+    return {};
 #endif
 }
 
-static std::string backend_filename_prefix() {
+static std::wstring backend_filename_prefix() {
 #ifdef _WIN32
-    return "ggml-";
+    return L"ggml-";
 #else
-    return "libggml-";
+    return L"libggml-";
 #endif
 }
 
-static std::string backend_filename_suffix() {
+static std::wstring backend_filename_suffix() {
 #ifdef _WIN32
-    return ".dll";
+    return L".dll";
 #else
-    return ".so";
+    return L".so";
+#endif
+}
+
+static std::wstring path_separator() {
+#ifdef _WIN32
+    return L"\\";
+#else
+    return L"/";
 #endif
 }
 
 static lm_ggml_backend_reg_t lm_ggml_backend_load_best(const char * name, bool silent, const char * user_search_path) {
     // enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths
     // TODO: search system paths
-    std::string file_prefix = backend_filename_prefix() + name + "-";
-    std::vector<std::string> search_paths;
+    std::wstring file_prefix = backend_filename_prefix() + utf8_to_utf16(name) + L"-";
+    std::vector<std::wstring> search_paths;
     if (user_search_path == nullptr) {
-        search_paths.push_back("./");
+        search_paths.push_back(L"." + path_separator());
         search_paths.push_back(get_executable_path());
     } else {
-#if defined(_WIN32)
-        search_paths.push_back(std::string(user_search_path) + "\\");
-#else
-        search_paths.push_back(std::string(user_search_path) + "/");
-#endif
+        search_paths.push_back(utf8_to_utf16(user_search_path) + path_separator());
     }
 
     int best_score = 0;
-    std::string best_path;
+    std::wstring best_path;
 
     namespace fs = std::filesystem;
     for (const auto & search_path : search_paths) {
@@ -483,27 +508,27 @@ static lm_ggml_backend_reg_t lm_ggml_backend_load_best(const char * name, bool s
         fs::directory_iterator dir_it(search_path, fs::directory_options::skip_permission_denied);
         for (const auto & entry : dir_it) {
             if (entry.is_regular_file()) {
-                std::string filename = entry.path().filename().string();
-                std::string ext = entry.path().extension().string();
+                std::wstring filename = entry.path().filename().wstring();
+                std::wstring ext = entry.path().extension().wstring();
                 if (filename.find(file_prefix) == 0 && ext == backend_filename_suffix()) {
-                    dl_handle_ptr handle { dl_load_library(entry.path().c_str()) };
+                    dl_handle_ptr handle { dl_load_library(entry.path().wstring()) };
                     if (!handle && !silent) {
-                        LM_GGML_LOG_ERROR("%s: failed to load %s\n", __func__, entry.path().string().c_str());
+                        LM_GGML_LOG_ERROR("%s: failed to load %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str());
                     }
                     if (handle) {
                         auto score_fn = (lm_ggml_backend_score_t) dl_get_sym(handle.get(), "lm_ggml_backend_score");
                         if (score_fn) {
                             int s = score_fn();
 #ifndef NDEBUG
-                            LM_GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, entry.path().string().c_str(), s);
+                            LM_GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str(), s);
 #endif
                             if (s > best_score) {
                                 best_score = s;
-                                best_path = entry.path().string();
+                                best_path = entry.path().wstring();
                             }
                         } else {
                             if (!silent) {
-                                LM_GGML_LOG_INFO("%s: failed to find lm_ggml_backend_score in %s\n", __func__, entry.path().string().c_str());
+                                LM_GGML_LOG_INFO("%s: failed to find lm_ggml_backend_score in %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str());
                             }
                         }
                     }
@@ -515,15 +540,15 @@ static lm_ggml_backend_reg_t lm_ggml_backend_load_best(const char * name, bool s
     if (best_score == 0) {
         // try to load the base backend
         for (const auto & search_path : search_paths) {
-            std::string path = search_path + backend_filename_prefix() + name + backend_filename_suffix();
+            std::wstring path = search_path + backend_filename_prefix() + utf8_to_utf16(name) + backend_filename_suffix();
             if (fs::exists(path)) {
-                return get_reg().load_backend(path.c_str(), silent);
+                return get_reg().load_backend(path, silent);
             }
         }
         return nullptr;
     }
 
-    return get_reg().load_backend(best_path.c_str(), silent);
+    return get_reg().load_backend(best_path, silent);
 }
 
 void lm_ggml_backend_load_all() {
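With paths handled as std::wstring end to end, the discovery step in lm_ggml_backend_load_best amounts to scanning each search path for files named <prefix><name>-*<suffix>. A rough self-contained sketch of that enumeration step (the helper name and parameters are invented for illustration):

    #include <filesystem>
    #include <string>
    #include <vector>

    // List regular files in `dir` whose name starts with prefix + name + L"-" and ends in `suffix`,
    // e.g. prefix = L"libggml-", name = L"cpu", suffix = L".so".
    static std::vector<std::filesystem::path> list_backend_candidates(
            const std::filesystem::path & dir,
            const std::wstring & prefix,
            const std::wstring & name,
            const std::wstring & suffix) {
        namespace fs = std::filesystem;
        std::vector<fs::path> out;
        if (!fs::is_directory(dir)) {
            return out;
        }
        const std::wstring file_prefix = prefix + name + L"-";
        for (const auto & entry : fs::directory_iterator(dir, fs::directory_options::skip_permission_denied)) {
            if (!entry.is_regular_file()) {
                continue;
            }
            const std::wstring filename = entry.path().filename().wstring();
            const std::wstring ext      = entry.path().extension().wstring();
            if (filename.find(file_prefix) == 0 && ext == suffix) {
                out.push_back(entry.path());
            }
        }
        return out;
    }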
@@ -564,21 +564,21 @@ static void lm_ggml_gemv_q4_0_4x4_q8_0(int n, float * LM_GGML_RESTRICT s, size_t
 
 #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
     if (lm_ggml_cpu_has_neon() && lm_ggml_cpu_has_dotprod()) {
-        const block_q4_0x4 * b_ptr = (const block_q4_0x4 *)vx;
+        const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx;
 
         for (int c = 0; c < nc; c += ncols_interleaved) {
-            const block_q8_0 * a_ptr = (const block_q8_0 *)vy;
+            const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
             float32x4_t acc = vdupq_n_f32(0);
             for (int b = 0; b < nb; b++) {
-                int8x16_t b0 = vld1q_s8((const int8_t *)b_ptr->qs);
-                int8x16_t b1 = vld1q_s8((const int8_t *)b_ptr->qs + 16);
-                int8x16_t b2 = vld1q_s8((const int8_t *)b_ptr->qs + 32);
-                int8x16_t b3 = vld1q_s8((const int8_t *)b_ptr->qs + 48);
-                float16x4_t bd = vld1_f16((const __fp16 *)b_ptr->d);
+                int8x16_t b0 = vld1q_s8((const int8_t *) b_ptr->qs);
+                int8x16_t b1 = vld1q_s8((const int8_t *) b_ptr->qs + 16);
+                int8x16_t b2 = vld1q_s8((const int8_t *) b_ptr->qs + 32);
+                int8x16_t b3 = vld1q_s8((const int8_t *) b_ptr->qs + 48);
+                float16x4_t bd = vld1_f16((const __fp16 *) b_ptr->d);
 
                 int8x16_t a0 = vld1q_s8(a_ptr->qs);
                 int8x16_t a1 = vld1q_s8(a_ptr->qs + qk/2);
-                float16x4_t ad = vld1_dup_f16((const __fp16 *)&a_ptr->d);
+                float16x4_t ad = vld1_dup_f16((const __fp16 *) &a_ptr->d);
 
                 int32x4_t ret = vdupq_n_s32(0);
 
@@ -647,72 +647,52 @@ static void lm_ggml_gemv_q4_0_4x8_q8_0(int n, float * LM_GGML_RESTRICT s, size_t
     UNUSED(ncols_interleaved);
     UNUSED(blocklen);
 
-#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
-    if (lm_ggml_cpu_has_neon() && lm_ggml_cpu_has_matmul_int8()) {
-        const void * b_ptr = vx;
-        const void * a_ptr = vy;
-        float * res_ptr = s;
+#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
+    if (lm_ggml_cpu_has_neon() && lm_ggml_cpu_has_dotprod()) {
+        const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx;
 
-        __asm__ __volatile__(
-            "movi v2.16b, #0x4\n"
-            "movi v1.16b, #0xf0\n"
-            "add %x[b_ptr], %x[b_ptr], #0x8\n"
-            "1:" // Column loop
-            "add x23, %x[a_ptr], #0x2\n"
-            "movi v0.16b, #0x0\n"
-            "mov x22, %x[nb]\n"
-            "2:" // Block loop
-            "ldr q31, [%x[b_ptr], #0x0]\n"
-            "ldr q30, [%x[b_ptr], #0x10]\n"
-            "mov x21, x23\n"
-            "movi v29.4s, #0x0\n"
-            "ldr q28, [%x[b_ptr], #0x20]\n"
-            "ldr q27, [%x[b_ptr], #0x30]\n"
-            "movi v26.4s, #0x0\n"
-            "sub x20, x23, #0x2\n"
-            "ld1r { v25.8h }, [x20]\n"
-            "ldr q24, [%x[b_ptr], #-0x8]\n"
-            "sub x22, x22, #0x1\n"
-            "add x23, x23, #0x22\n"
-            "ld1r { v23.2d }, [x21], #0x8\n"
-            "sshl v22.16b, v31.16b, v2.16b\n"
-            "sshl v16.16b, v30.16b, v2.16b\n"
-            "add %x[b_ptr], %x[b_ptr], #0x48\n"
-            "ld1r { v21.2d }, [x21], #0x8\n"
-            "sshl v20.16b, v28.16b, v2.16b\n"
-            "sshl v19.16b, v27.16b, v2.16b\n"
-            "ld1r { v18.2d }, [x21], #0x8\n"
-            "ld1r { v17.2d }, [x21], #0x8\n"
-            "and v31.16b, v31.16b, v1.16b\n"
-            "and v30.16b, v30.16b, v1.16b\n"
-            ".inst 0x4e9796dd // sdot v29.4s, v22.16b, v23.16b\n"
-            ".inst 0x4e97961a // sdot v26.4s, v16.16b, v23.16b\n"
-            "and v28.16b, v28.16b, v1.16b\n"
-            "and v27.16b, v27.16b, v1.16b\n"
-            "fcvtl v25.4s, v25.4h\n"
-            "fcvtl v16.4s, v24.4h\n"
-            ".inst 0x4e95969d // sdot v29.4s, v20.16b, v21.16b\n"
-            ".inst 0x4e95967a // sdot v26.4s, v19.16b, v21.16b\n"
-            "fmul v16.4s, v16.4s, v25.4s\n"
-            ".inst 0x4e9297fd // sdot v29.4s, v31.16b, v18.16b\n"
-            ".inst 0x4e9297da // sdot v26.4s, v30.16b, v18.16b\n"
-            ".inst 0x4e91979d // sdot v29.4s, v28.16b, v17.16b\n"
-            ".inst 0x4e91977a // sdot v26.4s, v27.16b, v17.16b\n"
-            "addp v29.4s, v29.4s, v26.4s\n"
-            "scvtf v29.4s, v29.4s, #0x4\n"
-            "fmla v0.4s, v29.4s, v16.4s\n"
-            "cbnz x22, 2b\n"
-            "sub %x[nc], %x[nc], #0x4\n"
-            "str q0, [%x[res_ptr], #0x0]\n"
-            "add %x[res_ptr], %x[res_ptr], #0x10\n"
-            "cbnz %x[nc], 1b\n"
-            : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [nc] "+&r" (nc)
-            : [a_ptr] "r" (a_ptr), [nb] "r" (nb)
-            : "memory", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23"
-        );
+        for (int c = 0; c < nc; c += ncols_interleaved) {
+            const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+            float32x4_t acc = vdupq_n_f32(0);
+            for (int b = 0; b < nb; b++) {
+                int8x16_t b0 = vld1q_s8((const int8_t *) b_ptr->qs);
+                int8x16_t b1 = vld1q_s8((const int8_t *) b_ptr->qs + 16);
+                int8x16_t b2 = vld1q_s8((const int8_t *) b_ptr->qs + 32);
+                int8x16_t b3 = vld1q_s8((const int8_t *) b_ptr->qs + 48);
+                float16x4_t bd = vld1_f16((const __fp16 *) b_ptr->d);
+
+                int8x16_t a0 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs);
+                int8x16_t a1 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 1);
+                int8x16_t a2 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 2);
+                int8x16_t a3 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 3);
+                float16x4_t ad = vld1_dup_f16((const __fp16 *) &a_ptr->d);
+
+                int32x4_t ret0 = vdupq_n_s32(0);
+                int32x4_t ret1 = vdupq_n_s32(0);
+
+                ret0 = vdotq_s32(ret0, b0 << 4, a0);
+                ret1 = vdotq_s32(ret1, b1 << 4, a0);
+                ret0 = vdotq_s32(ret0, b2 << 4, a1);
+                ret1 = vdotq_s32(ret1, b3 << 4, a1);
+
+                ret0 = vdotq_s32(ret0, b0 & 0xf0U, a2);
+                ret1 = vdotq_s32(ret1, b1 & 0xf0U, a2);
+                ret0 = vdotq_s32(ret0, b2 & 0xf0U, a3);
+                ret1 = vdotq_s32(ret1, b3 & 0xf0U, a3);
+
+                int32x4_t ret = vpaddq_s32(ret0, ret1);
+
+                acc = vfmaq_f32(acc, vcvtq_n_f32_s32(ret, 4),
+                                vmulq_f32(vcvt_f32_f16(ad), vcvt_f32_f16(bd)));
+                a_ptr++;
+                b_ptr++;
+            }
+            vst1q_f32(s, acc);
+            s += ncols_interleaved;
+        }
         return;
     }
-#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
+#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
     float sumf[4];
     int sumi;
 
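The rewritten kernel above swaps the hand-written assembly for NEON dot-product intrinsics. As a rough standalone illustration of the core primitive (not this kernel), vdotq_s32 accumulates four independent 4-element int8 dot products into the four int32 lanes of its accumulator; building it needs an AArch64 compiler with the dotprod feature, e.g. -march=armv8.2-a+dotprod:

    #include <arm_neon.h>
    #include <cstdio>

    int main() {
        // 16 int8 values = four groups of four.
        const int8_t a_bytes[16] = { 1, 2, 3, 4,  5, 6, 7, 8,  1, 1, 1, 1,  2, 2, 2, 2 };
        const int8_t b_bytes[16] = { 1, 1, 1, 1,  2, 2, 2, 2,  3, 3, 3, 3,  4, 4, 4, 4 };

        int8x16_t a = vld1q_s8(a_bytes);
        int8x16_t b = vld1q_s8(b_bytes);

        // Lane i of acc receives the dot product of group i of a and b.
        int32x4_t acc = vdupq_n_s32(0);
        acc = vdotq_s32(acc, a, b);

        int32_t out[4];
        vst1q_s32(out, acc);
        std::printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]); // 10 52 12 32
        return 0;
    }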
package/cpp/ggml-cpu.c CHANGED
@@ -985,7 +985,7 @@ inline static void __wasm_f16x4_store(lm_ggml_fp16_t * p, v128_t x) {
 #define LM_GGML_F16_STEP 32
 #define LM_GGML_F16_EPR 4
 
-static inline __m128 __sse_f16x4_load(lm_ggml_fp16_t *x) {
+static inline __m128 __sse_f16x4_load(const lm_ggml_fp16_t * x) {
     float tmp[4];
 
     tmp[0] = LM_GGML_FP16_TO_FP32(x[0]);
@@ -996,7 +996,7 @@ static inline __m128 __sse_f16x4_load(lm_ggml_fp16_t *x) {
     return _mm_loadu_ps(tmp);
 }
 
-static inline void __sse_f16x4_store(lm_ggml_fp16_t *x, __m128 y) {
+static inline void __sse_f16x4_store(lm_ggml_fp16_t * x, __m128 y) {
     float arr[4];
 
     _mm_storeu_ps(arr, y);
@@ -7418,14 +7418,14 @@ static void lm_ggml_compute_forward_mul_mat(
     if (src1_cont) {
         for (int64_t i13 = 0; i13 < ne13; i13++)
             for (int64_t i12 = 0; i12 < ne12; i12++)
-                if (!llamafile_sgemm(ne01, ne11, ne00/lm_ggml_blck_size(src0->type),
+                if (!llamafile_sgemm(params,
+                                     ne01, ne11, ne00/lm_ggml_blck_size(src0->type),
                                      (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
                                      nb01/lm_ggml_type_size(src0->type),
                                      (const char *)src1->data + i12*nb12 + i13*nb13,
                                      nb11/lm_ggml_type_size(src1->type),
                                      (char *)dst->data + i12*nb2 + i13*nb3,
                                      nb1/lm_ggml_type_size(dst->type),
-                                     ith, nth,
                                      src0->type,
                                      src1->type,
                                      dst->type))
@@ -7470,14 +7470,14 @@ UseGgmlGemm1:;
 
     for (int64_t i13 = 0; i13 < ne13; i13++)
         for (int64_t i12 = 0; i12 < ne12; i12++)
-            if (!llamafile_sgemm(ne01, ne11, ne00/lm_ggml_blck_size(src0->type),
+            if (!llamafile_sgemm(params,
+                                 ne01, ne11, ne00/lm_ggml_blck_size(src0->type),
                                  (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
                                  nb01/lm_ggml_type_size(src0->type),
                                  (const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size,
                                  row_size/lm_ggml_type_size(vec_dot_type),
                                  (char *)dst->data + i12*nb2 + i13*nb3,
                                  nb1/lm_ggml_type_size(dst->type),
-                                 ith, nth,
                                  src0->type,
                                  vec_dot_type,
                                  dst->type))
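llamafile_sgemm now receives the compute params up front instead of separate ith/nth arguments, so the thread index and thread count travel with the rest of the per-call state. A toy sketch of that style of interleaved row partitioning (the struct below is a hypothetical stand-in, not ggml's lm_ggml_compute_params):

    #include <cstdint>
    #include <cstdio>

    // Hypothetical mirror of the two threading fields the kernels read.
    struct compute_params_sketch {
        int ith; // index of this thread
        int nth; // total number of threads
    };

    // Each thread handles an interleaved slice of the rows.
    static void process_rows(const compute_params_sketch & params, int64_t n_rows) {
        for (int64_t row = params.ith; row < n_rows; row += params.nth) {
            std::printf("thread %d handles row %lld\n", params.ith, (long long) row);
        }
    }

    int main() {
        process_rows({ 0, 2 }, 5); // rows 0, 2, 4
        process_rows({ 1, 2 }, 5); // rows 1, 3
        return 0;
    }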
package/cpp/ggml-cpu.cpp CHANGED
@@ -393,8 +393,11 @@ static bool lm_ggml_backend_cpu_device_supports_op(lm_ggml_backend_dev_t dev, co
     switch (op->op) {
         case LM_GGML_OP_CPY:
             return
+                op->type != LM_GGML_TYPE_IQ3_XXS &&
+                op->type != LM_GGML_TYPE_IQ3_S &&
                 op->type != LM_GGML_TYPE_IQ2_XXS &&
                 op->type != LM_GGML_TYPE_IQ2_XS &&
+                op->type != LM_GGML_TYPE_IQ2_S &&
                 op->type != LM_GGML_TYPE_IQ1_S &&
                 op->type != LM_GGML_TYPE_IQ1_M; // missing type_traits.from_float
         case LM_GGML_OP_MUL_MAT:
@@ -518,6 +521,12 @@ static lm_ggml_backend_feature * lm_ggml_backend_cpu_get_features(lm_ggml_backen
     if (lm_ggml_cpu_has_sve()) {
         features.push_back({ "SVE", "1" });
     }
+    if (lm_ggml_cpu_has_dotprod()) {
+        features.push_back({ "DOTPROD", "1" });
+    }
+    if (lm_ggml_cpu_has_matmul_int8()) {
+        features.push_back({ "MATMUL_INT8", "1" });
+    }
     if (lm_ggml_cpu_get_sve_cnt() > 0) {
         static std::string sve_cnt = std::to_string(lm_ggml_cpu_get_sve_cnt());
         features.push_back({ "SVE_CNT", sve_cnt.c_str() });
package/cpp/ggml-impl.h CHANGED
@@ -551,6 +551,22 @@ static inline lm_ggml_bf16_t lm_ggml_compute_fp32_to_bf16(float s) {
 #define LM_GGML_FP32_TO_BF16(x) lm_ggml_compute_fp32_to_bf16(x)
 #define LM_GGML_BF16_TO_FP32(x) lm_ggml_compute_bf16_to_fp32(x)
 
+// expose GGUF internals for test code
+
+LM_GGML_API size_t lm_gguf_type_size(enum lm_gguf_type type);
+
+LM_GGML_API struct lm_gguf_context * lm_gguf_init_from_file_impl(FILE * file, struct lm_gguf_init_params params);
+
+struct lm_gguf_buf {
+    void * data;
+    size_t size;
+    size_t offset;
+};
+LM_GGML_API struct lm_gguf_buf lm_gguf_buf_init(size_t size);
+LM_GGML_API void lm_gguf_buf_free(struct lm_gguf_buf buf);
+
+LM_GGML_API void lm_gguf_write_to_buf(const struct lm_gguf_context * ctx, struct lm_gguf_buf * buf, bool only_meta);
+
 #ifdef __cplusplus
 }
 #endif
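The declarations above expose just enough of the GGUF writer for tests to serialize a context into memory. A hedged usage sketch built only from those declarations; it assumes the header path below is where they live in this package, that `ctx` comes from lm_gguf_init_from_file_impl (or another loader), and that `offset` tracks the number of bytes written:

    #include <cstdio>

    #include "ggml-impl.h" // assumed include path for the lm_gguf_* declarations above

    static void print_gguf_meta_size(const struct lm_gguf_context * ctx) {
        struct lm_gguf_buf buf = lm_gguf_buf_init(16 * 1024);  // initial capacity, grown as needed
        lm_gguf_write_to_buf(ctx, &buf, /*only_meta =*/ true); // serialize metadata only
        std::printf("GGUF metadata: %zu bytes\n", buf.offset); // assumption: offset == bytes written
        lm_gguf_buf_free(buf);
    }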