cui-llama.rn 1.3.4 → 1.3.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. package/android/src/main/CMakeLists.txt +14 -8
  2. package/android/src/main/jni.cpp +38 -37
  3. package/cpp/common.cpp +50 -30
  4. package/cpp/common.h +32 -13
  5. package/cpp/ggml-alloc.c +0 -1
  6. package/cpp/ggml-backend-reg.cpp +79 -49
  7. package/cpp/ggml-backend.cpp +5 -2
  8. package/cpp/ggml-cpp.h +1 -0
  9. package/cpp/ggml-cpu-aarch64.cpp +57 -72
  10. package/cpp/ggml-cpu-quants.c +5 -1
  11. package/cpp/ggml-cpu.c +6 -6
  12. package/cpp/ggml-cpu.cpp +9 -0
  13. package/cpp/ggml-impl.h +11 -0
  14. package/cpp/ggml-metal.m +2 -2
  15. package/cpp/ggml.c +129 -1388
  16. package/cpp/ggml.h +29 -152
  17. package/cpp/gguf.cpp +1325 -0
  18. package/cpp/gguf.h +202 -0
  19. package/cpp/llama-adapter.cpp +346 -0
  20. package/cpp/llama-adapter.h +73 -0
  21. package/cpp/llama-arch.cpp +1434 -0
  22. package/cpp/llama-arch.h +395 -0
  23. package/cpp/llama-batch.cpp +368 -0
  24. package/cpp/llama-batch.h +88 -0
  25. package/cpp/llama-chat.cpp +567 -0
  26. package/cpp/llama-chat.h +51 -0
  27. package/cpp/llama-context.cpp +1771 -0
  28. package/cpp/llama-context.h +128 -0
  29. package/cpp/llama-cparams.cpp +1 -0
  30. package/cpp/llama-cparams.h +37 -0
  31. package/cpp/llama-cpp.h +30 -0
  32. package/cpp/llama-grammar.cpp +16 -15
  33. package/cpp/llama-grammar.h +5 -6
  34. package/cpp/llama-hparams.cpp +71 -0
  35. package/cpp/llama-hparams.h +140 -0
  36. package/cpp/llama-impl.cpp +167 -0
  37. package/cpp/llama-impl.h +16 -136
  38. package/cpp/llama-kv-cache.cpp +718 -0
  39. package/cpp/llama-kv-cache.h +218 -0
  40. package/cpp/llama-mmap.cpp +589 -0
  41. package/cpp/llama-mmap.h +67 -0
  42. package/cpp/llama-model-loader.cpp +1011 -0
  43. package/cpp/llama-model-loader.h +158 -0
  44. package/cpp/llama-model.cpp +2202 -0
  45. package/cpp/llama-model.h +391 -0
  46. package/cpp/llama-sampling.cpp +117 -4
  47. package/cpp/llama-vocab.cpp +26 -29
  48. package/cpp/llama-vocab.h +14 -2
  49. package/cpp/llama.cpp +8839 -19131
  50. package/cpp/llama.cpp.rej +23 -0
  51. package/cpp/llama.h +31 -9
  52. package/cpp/rn-llama.hpp +39 -37
  53. package/cpp/sgemm.cpp +1091 -378
  54. package/cpp/sgemm.h +2 -2
  55. package/cpp/unicode.cpp +6 -0
  56. package/package.json +1 -1
package/cpp/ggml-backend-reg.cpp CHANGED
@@ -66,6 +66,26 @@
66
66
  #include "ggml-kompute.h"
67
67
  #endif
68
68
 
69
+ // disable C++17 deprecation warning for std::codecvt_utf8
70
+ #if defined(__clang__)
71
+ # pragma clang diagnostic push
72
+ # pragma clang diagnostic ignored "-Wdeprecated-declarations"
73
+ #endif
74
+
75
+ static std::wstring utf8_to_utf16(const std::string & str) {
76
+ std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
77
+ return converter.from_bytes(str);
78
+ }
79
+
80
+ static std::string utf16_to_utf8(const std::wstring & str) {
81
+ std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
82
+ return converter.to_bytes(str);
83
+ }
84
+
85
+ #if defined(__clang__)
86
+ # pragma clang diagnostic pop
87
+ #endif
88
+
69
89
  #ifdef _WIN32
70
90
 
71
91
  using dl_handle = std::remove_pointer_t<HMODULE>;
@@ -88,11 +108,6 @@ static dl_handle * dl_load_library(const std::wstring & path) {
88
108
  return handle;
89
109
  }
90
110
 
91
- static dl_handle * dl_load_library(const std::string & path) {
92
- std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
93
- return dl_load_library(converter.from_bytes(path));
94
- }
95
-
96
111
  static void * dl_get_sym(dl_handle * handle, const char * name) {
97
112
  DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
98
113
  SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
@@ -114,8 +129,8 @@ struct dl_handle_deleter {
114
129
  }
115
130
  };
116
131
 
117
- static void * dl_load_library(const std::string & path) {
118
- dl_handle * handle = dlopen(path.c_str(), RTLD_NOW | RTLD_LOCAL);
132
+ static void * dl_load_library(const std::wstring & path) {
133
+ dl_handle * handle = dlopen(utf16_to_utf8(path).c_str(), RTLD_NOW | RTLD_LOCAL);
119
134
 
120
135
  return handle;
121
136
  }
@@ -202,11 +217,11 @@ struct lm_ggml_backend_registry {
202
217
  devices.push_back(device);
203
218
  }
204
219
 
205
- lm_ggml_backend_reg_t load_backend(const char * path, bool silent) {
220
+ lm_ggml_backend_reg_t load_backend(const std::wstring & path, bool silent) {
206
221
  dl_handle_ptr handle { dl_load_library(path) };
207
222
  if (!handle) {
208
223
  if (!silent) {
209
- LM_GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path);
224
+ LM_GGML_LOG_ERROR("%s: failed to load %s\n", __func__, utf16_to_utf8(path).c_str());
210
225
  }
211
226
  return nullptr;
212
227
  }
@@ -214,7 +229,7 @@ struct lm_ggml_backend_registry {
214
229
  auto score_fn = (lm_ggml_backend_score_t) dl_get_sym(handle.get(), "lm_ggml_backend_score");
215
230
  if (score_fn && score_fn() == 0) {
216
231
  if (!silent) {
217
- LM_GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, path);
232
+ LM_GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, utf16_to_utf8(path).c_str());
218
233
  }
219
234
  return nullptr;
220
235
  }
@@ -222,7 +237,7 @@ struct lm_ggml_backend_registry {
222
237
  auto backend_init_fn = (lm_ggml_backend_init_t) dl_get_sym(handle.get(), "lm_ggml_backend_init");
223
238
  if (!backend_init_fn) {
224
239
  if (!silent) {
225
- LM_GGML_LOG_ERROR("%s: failed to find lm_ggml_backend_init in %s\n", __func__, path);
240
+ LM_GGML_LOG_ERROR("%s: failed to find lm_ggml_backend_init in %s\n", __func__, utf16_to_utf8(path).c_str());
226
241
  }
227
242
  return nullptr;
228
243
  }
@@ -231,16 +246,16 @@ struct lm_ggml_backend_registry {
231
246
  if (!reg || reg->api_version != LM_GGML_BACKEND_API_VERSION) {
232
247
  if (!silent) {
233
248
  if (!reg) {
234
- LM_GGML_LOG_ERROR("%s: failed to initialize backend from %s: lm_ggml_backend_init returned NULL\n", __func__, path);
249
+ LM_GGML_LOG_ERROR("%s: failed to initialize backend from %s: lm_ggml_backend_init returned NULL\n", __func__, utf16_to_utf8(path).c_str());
235
250
  } else {
236
251
  LM_GGML_LOG_ERROR("%s: failed to initialize backend from %s: incompatible API version (backend: %d, current: %d)\n",
237
- __func__, path, reg->api_version, LM_GGML_BACKEND_API_VERSION);
252
+ __func__, utf16_to_utf8(path).c_str(), reg->api_version, LM_GGML_BACKEND_API_VERSION);
238
253
  }
239
254
  }
240
255
  return nullptr;
241
256
  }
242
257
 
243
- LM_GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, lm_ggml_backend_reg_name(reg), path);
258
+ LM_GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, lm_ggml_backend_reg_name(reg), utf16_to_utf8(path).c_str());
244
259
 
245
260
  register_backend(reg, std::move(handle));
246
261
 
@@ -376,14 +391,14 @@ lm_ggml_backend_t lm_ggml_backend_init_best(void) {
376
391
 
377
392
  // Dynamic loading
378
393
  lm_ggml_backend_reg_t lm_ggml_backend_load(const char * path) {
379
- return get_reg().load_backend(path, false);
394
+ return get_reg().load_backend(utf8_to_utf16(path), false);
380
395
  }
381
396
 
382
397
  void lm_ggml_backend_unload(lm_ggml_backend_reg_t reg) {
383
398
  get_reg().unload_backend(reg, true);
384
399
  }
385
400
 
386
- static std::string get_executable_path() {
401
+ static std::wstring get_executable_path() {
387
402
  #if defined(__APPLE__)
388
403
  // get executable path
389
404
  std::vector<char> path;
@@ -401,13 +416,17 @@ static std::string get_executable_path() {
401
416
  if (last_slash != std::string::npos) {
402
417
  base_path = base_path.substr(0, last_slash);
403
418
  }
404
- return base_path + "/";
405
- #elif defined(__linux__)
419
+ return utf8_to_utf16(base_path + "/");
420
+ #elif defined(__linux__) || defined(__FreeBSD__)
406
421
  std::string base_path = ".";
407
422
  std::vector<char> path(1024);
408
423
  while (true) {
409
424
  // get executable path
425
+ # if defined(__linux__)
410
426
  ssize_t len = readlink("/proc/self/exe", path.data(), path.size());
427
+ # elif defined(__FreeBSD__)
428
+ ssize_t len = readlink("/proc/curproc/file", path.data(), path.size());
429
+ # endif
411
430
  if (len == -1) {
412
431
  break;
413
432
  }
@@ -423,57 +442,63 @@ static std::string get_executable_path() {
423
442
  path.resize(path.size() * 2);
424
443
  }
425
444
 
426
- return base_path + "/";
445
+ return utf8_to_utf16(base_path + "/");
427
446
  #elif defined(_WIN32)
428
- std::vector<char> path(MAX_PATH);
429
- DWORD len = GetModuleFileNameA(NULL, path.data(), path.size());
447
+ std::vector<wchar_t> path(MAX_PATH);
448
+ DWORD len = GetModuleFileNameW(NULL, path.data(), path.size());
430
449
  if (len == 0) {
431
- return "";
450
+ return {};
432
451
  }
433
- std::string base_path(path.data(), len);
452
+ std::wstring base_path(path.data(), len);
434
453
  // remove executable name
435
454
  auto last_slash = base_path.find_last_of('\\');
436
455
  if (last_slash != std::string::npos) {
437
456
  base_path = base_path.substr(0, last_slash);
438
457
  }
439
- return base_path + "\\";
458
+ return base_path + L"\\";
459
+ #else
460
+ return {};
440
461
  #endif
441
462
  }
442
463
 
443
- static std::string backend_filename_prefix() {
464
+ static std::wstring backend_filename_prefix() {
444
465
  #ifdef _WIN32
445
- return "ggml-";
466
+ return L"ggml-";
446
467
  #else
447
- return "libggml-";
468
+ return L"libggml-";
448
469
  #endif
449
470
  }
450
471
 
451
- static std::string backend_filename_suffix() {
472
+ static std::wstring backend_filename_suffix() {
452
473
  #ifdef _WIN32
453
- return ".dll";
474
+ return L".dll";
454
475
  #else
455
- return ".so";
476
+ return L".so";
477
+ #endif
478
+ }
479
+
480
+ static std::wstring path_separator() {
481
+ #ifdef _WIN32
482
+ return L"\\";
483
+ #else
484
+ return L"/";
456
485
  #endif
457
486
  }
458
487
 
459
488
  static lm_ggml_backend_reg_t lm_ggml_backend_load_best(const char * name, bool silent, const char * user_search_path) {
460
489
  // enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths
461
490
  // TODO: search system paths
462
- std::string file_prefix = backend_filename_prefix() + name + "-";
463
- std::vector<std::string> search_paths;
491
+ std::wstring file_prefix = backend_filename_prefix() + utf8_to_utf16(name) + L"-";
492
+ std::vector<std::wstring> search_paths;
464
493
  if (user_search_path == nullptr) {
465
- search_paths.push_back("./");
494
+ search_paths.push_back(L"." + path_separator());
466
495
  search_paths.push_back(get_executable_path());
467
496
  } else {
468
- #if defined(_WIN32)
469
- search_paths.push_back(std::string(user_search_path) + "\\");
470
- #else
471
- search_paths.push_back(std::string(user_search_path) + "/");
472
- #endif
497
+ search_paths.push_back(utf8_to_utf16(user_search_path) + path_separator());
473
498
  }
474
499
 
475
500
  int best_score = 0;
476
- std::string best_path;
501
+ std::wstring best_path;
477
502
 
478
503
  namespace fs = std::filesystem;
479
504
  for (const auto & search_path : search_paths) {
@@ -483,27 +508,27 @@ static lm_ggml_backend_reg_t lm_ggml_backend_load_best(const char * name, bool s
483
508
  fs::directory_iterator dir_it(search_path, fs::directory_options::skip_permission_denied);
484
509
  for (const auto & entry : dir_it) {
485
510
  if (entry.is_regular_file()) {
486
- std::string filename = entry.path().filename().string();
487
- std::string ext = entry.path().extension().string();
511
+ std::wstring filename = entry.path().filename().wstring();
512
+ std::wstring ext = entry.path().extension().wstring();
488
513
  if (filename.find(file_prefix) == 0 && ext == backend_filename_suffix()) {
489
- dl_handle_ptr handle { dl_load_library(entry.path().c_str()) };
514
+ dl_handle_ptr handle { dl_load_library(entry.path().wstring()) };
490
515
  if (!handle && !silent) {
491
- LM_GGML_LOG_ERROR("%s: failed to load %s\n", __func__, entry.path().string().c_str());
516
+ LM_GGML_LOG_ERROR("%s: failed to load %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str());
492
517
  }
493
518
  if (handle) {
494
519
  auto score_fn = (lm_ggml_backend_score_t) dl_get_sym(handle.get(), "lm_ggml_backend_score");
495
520
  if (score_fn) {
496
521
  int s = score_fn();
497
522
  #ifndef NDEBUG
498
- LM_GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, entry.path().string().c_str(), s);
523
+ LM_GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str(), s);
499
524
  #endif
500
525
  if (s > best_score) {
501
526
  best_score = s;
502
- best_path = entry.path().string();
527
+ best_path = entry.path().wstring();
503
528
  }
504
529
  } else {
505
530
  if (!silent) {
506
- LM_GGML_LOG_INFO("%s: failed to find lm_ggml_backend_score in %s\n", __func__, entry.path().string().c_str());
531
+ LM_GGML_LOG_INFO("%s: failed to find lm_ggml_backend_score in %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str());
507
532
  }
508
533
  }
509
534
  }
@@ -515,15 +540,15 @@ static lm_ggml_backend_reg_t lm_ggml_backend_load_best(const char * name, bool s
515
540
  if (best_score == 0) {
516
541
  // try to load the base backend
517
542
  for (const auto & search_path : search_paths) {
518
- std::string path = search_path + backend_filename_prefix() + name + backend_filename_suffix();
543
+ std::wstring path = search_path + backend_filename_prefix() + utf8_to_utf16(name) + backend_filename_suffix();
519
544
  if (fs::exists(path)) {
520
- return get_reg().load_backend(path.c_str(), silent);
545
+ return get_reg().load_backend(path, silent);
521
546
  }
522
547
  }
523
548
  return nullptr;
524
549
  }
525
550
 
526
- return get_reg().load_backend(best_path.c_str(), silent);
551
+ return get_reg().load_backend(best_path, silent);
527
552
  }
528
553
 
529
554
  void lm_ggml_backend_load_all() {
@@ -549,4 +574,9 @@ void lm_ggml_backend_load_all_from_path(const char * dir_path) {
549
574
  lm_ggml_backend_load_best("opencl", silent, dir_path);
550
575
  lm_ggml_backend_load_best("musa", silent, dir_path);
551
576
  lm_ggml_backend_load_best("cpu", silent, dir_path);
577
+ // check the environment variable LM_GGML_BACKEND_PATH to load an out-of-tree backend
578
+ const char * backend_path = std::getenv("LM_GGML_BACKEND_PATH");
579
+ if (backend_path) {
580
+ lm_ggml_backend_load(backend_path);
581
+ }
552
582
  }
package/cpp/ggml-backend.cpp CHANGED
@@ -764,7 +764,7 @@ static int lm_ggml_backend_sched_backend_id_from_cur(lm_ggml_backend_sched_t sch
764
764
  if (tensor->op != LM_GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == LM_GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
765
765
  int src_backend_id = lm_ggml_backend_sched_backend_from_buffer(sched, src, tensor);
766
766
  // check if a backend with higher prio wants to offload the op
767
- if (src_backend_id == sched->n_backends - 1) {
767
+ if (src_backend_id == sched->n_backends - 1 && lm_ggml_backend_buffer_is_host(src->buffer)) {
768
768
  for (int b = 0; b < src_backend_id; b++) {
769
769
  if (lm_ggml_backend_supports_op(sched->backends[b], tensor) && lm_ggml_backend_offload_op(sched->backends[b], tensor)) {
770
770
  SET_CAUSE(tensor, "1.off");
@@ -795,9 +795,12 @@ static void lm_ggml_backend_sched_print_assignments(lm_ggml_backend_sched_t sche
795
795
  for (int i = 0; i < graph->n_nodes; i++) {
796
796
  if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
797
797
  lm_ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
798
- LM_GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs: ", cur_split, lm_ggml_backend_name(split_backend),
798
+ LM_GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs", cur_split, lm_ggml_backend_name(split_backend),
799
799
  sched->splits[cur_split].n_inputs);
800
800
  for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
801
+ if (j == 0) {
802
+ LM_GGML_LOG_DEBUG(": ");
803
+ }
801
804
  LM_GGML_LOG_DEBUG("[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
802
805
  fmt_size(lm_ggml_nbytes(sched->splits[cur_split].inputs[j])));
803
806
  }
package/cpp/ggml-cpp.h CHANGED
@@ -7,6 +7,7 @@
7
7
  #include "ggml.h"
8
8
  #include "ggml-alloc.h"
9
9
  #include "ggml-backend.h"
10
+ #include "gguf.h"
10
11
  #include <memory>
11
12
 
12
13
  // Smart pointers for ggml types
package/cpp/ggml-cpu-aarch64.cpp CHANGED
@@ -194,9 +194,12 @@ static inline __m256i sum_i16_pairs_int32x8(const __m256i x) {
194
194
  }
195
195
 
196
196
  static inline __m256i mul_sum_us8_pairs_int32x8(const __m256i ax, const __m256i sy) {
197
- #if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
197
+ #if defined(__AVX512VNNI__) && defined(__AVX512VL__)
198
198
  const __m256i zero = _mm256_setzero_si256();
199
199
  return _mm256_dpbusd_epi32(zero, ax, sy);
200
+ #elif defined(__AVXVNNI__)
201
+ const __m256i zero = _mm256_setzero_si256();
202
+ return _mm256_dpbusd_avx_epi32(zero, ax, sy);
200
203
  #else
201
204
  // Perform multiplication and create 16-bit values
202
205
  const __m256i dot = _mm256_maddubs_epi16(ax, sy);
@@ -564,21 +567,21 @@ static void lm_ggml_gemv_q4_0_4x4_q8_0(int n, float * LM_GGML_RESTRICT s, size_t
564
567
 
565
568
  #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
566
569
  if (lm_ggml_cpu_has_neon() && lm_ggml_cpu_has_dotprod()) {
567
- const block_q4_0x4 * b_ptr = (const block_q4_0x4 *)vx;
570
+ const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx;
568
571
 
569
572
  for (int c = 0; c < nc; c += ncols_interleaved) {
570
- const block_q8_0 * a_ptr = (const block_q8_0 *)vy;
573
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
571
574
  float32x4_t acc = vdupq_n_f32(0);
572
575
  for (int b = 0; b < nb; b++) {
573
- int8x16_t b0 = vld1q_s8((const int8_t *)b_ptr->qs);
574
- int8x16_t b1 = vld1q_s8((const int8_t *)b_ptr->qs + 16);
575
- int8x16_t b2 = vld1q_s8((const int8_t *)b_ptr->qs + 32);
576
- int8x16_t b3 = vld1q_s8((const int8_t *)b_ptr->qs + 48);
577
- float16x4_t bd = vld1_f16((const __fp16 *)b_ptr->d);
576
+ int8x16_t b0 = vld1q_s8((const int8_t *) b_ptr->qs);
577
+ int8x16_t b1 = vld1q_s8((const int8_t *) b_ptr->qs + 16);
578
+ int8x16_t b2 = vld1q_s8((const int8_t *) b_ptr->qs + 32);
579
+ int8x16_t b3 = vld1q_s8((const int8_t *) b_ptr->qs + 48);
580
+ float16x4_t bd = vld1_f16((const __fp16 *) b_ptr->d);
578
581
 
579
582
  int8x16_t a0 = vld1q_s8(a_ptr->qs);
580
583
  int8x16_t a1 = vld1q_s8(a_ptr->qs + qk/2);
581
- float16x4_t ad = vld1_dup_f16((const __fp16 *)&a_ptr->d);
584
+ float16x4_t ad = vld1_dup_f16((const __fp16 *) &a_ptr->d);
582
585
 
583
586
  int32x4_t ret = vdupq_n_s32(0);
584
587
 
@@ -647,72 +650,52 @@ static void lm_ggml_gemv_q4_0_4x8_q8_0(int n, float * LM_GGML_RESTRICT s, size_t
647
650
  UNUSED(ncols_interleaved);
648
651
  UNUSED(blocklen);
649
652
 
650
- #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
651
- if (lm_ggml_cpu_has_neon() && lm_ggml_cpu_has_matmul_int8()) {
652
- const void * b_ptr = vx;
653
- const void * a_ptr = vy;
654
- float * res_ptr = s;
653
+ #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
654
+ if (lm_ggml_cpu_has_neon() && lm_ggml_cpu_has_dotprod()) {
655
+ const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx;
655
656
 
656
- __asm__ __volatile__(
657
- "movi v2.16b, #0x4\n"
658
- "movi v1.16b, #0xf0\n"
659
- "add %x[b_ptr], %x[b_ptr], #0x8\n"
660
- "1:" // Column loop
661
- "add x23, %x[a_ptr], #0x2\n"
662
- "movi v0.16b, #0x0\n"
663
- "mov x22, %x[nb]\n"
664
- "2:" // Block loop
665
- "ldr q31, [%x[b_ptr], #0x0]\n"
666
- "ldr q30, [%x[b_ptr], #0x10]\n"
667
- "mov x21, x23\n"
668
- "movi v29.4s, #0x0\n"
669
- "ldr q28, [%x[b_ptr], #0x20]\n"
670
- "ldr q27, [%x[b_ptr], #0x30]\n"
671
- "movi v26.4s, #0x0\n"
672
- "sub x20, x23, #0x2\n"
673
- "ld1r { v25.8h }, [x20]\n"
674
- "ldr q24, [%x[b_ptr], #-0x8]\n"
675
- "sub x22, x22, #0x1\n"
676
- "add x23, x23, #0x22\n"
677
- "ld1r { v23.2d }, [x21], #0x8\n"
678
- "sshl v22.16b, v31.16b, v2.16b\n"
679
- "sshl v16.16b, v30.16b, v2.16b\n"
680
- "add %x[b_ptr], %x[b_ptr], #0x48\n"
681
- "ld1r { v21.2d }, [x21], #0x8\n"
682
- "sshl v20.16b, v28.16b, v2.16b\n"
683
- "sshl v19.16b, v27.16b, v2.16b\n"
684
- "ld1r { v18.2d }, [x21], #0x8\n"
685
- "ld1r { v17.2d }, [x21], #0x8\n"
686
- "and v31.16b, v31.16b, v1.16b\n"
687
- "and v30.16b, v30.16b, v1.16b\n"
688
- ".inst 0x4e9796dd // sdot v29.4s, v22.16b, v23.16b\n"
689
- ".inst 0x4e97961a // sdot v26.4s, v16.16b, v23.16b\n"
690
- "and v28.16b, v28.16b, v1.16b\n"
691
- "and v27.16b, v27.16b, v1.16b\n"
692
- "fcvtl v25.4s, v25.4h\n"
693
- "fcvtl v16.4s, v24.4h\n"
694
- ".inst 0x4e95969d // sdot v29.4s, v20.16b, v21.16b\n"
695
- ".inst 0x4e95967a // sdot v26.4s, v19.16b, v21.16b\n"
696
- "fmul v16.4s, v16.4s, v25.4s\n"
697
- ".inst 0x4e9297fd // sdot v29.4s, v31.16b, v18.16b\n"
698
- ".inst 0x4e9297da // sdot v26.4s, v30.16b, v18.16b\n"
699
- ".inst 0x4e91979d // sdot v29.4s, v28.16b, v17.16b\n"
700
- ".inst 0x4e91977a // sdot v26.4s, v27.16b, v17.16b\n"
701
- "addp v29.4s, v29.4s, v26.4s\n"
702
- "scvtf v29.4s, v29.4s, #0x4\n"
703
- "fmla v0.4s, v29.4s, v16.4s\n"
704
- "cbnz x22, 2b\n"
705
- "sub %x[nc], %x[nc], #0x4\n"
706
- "str q0, [%x[res_ptr], #0x0]\n"
707
- "add %x[res_ptr], %x[res_ptr], #0x10\n"
708
- "cbnz %x[nc], 1b\n"
709
- : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [nc] "+&r" (nc)
710
- : [a_ptr] "r" (a_ptr), [nb] "r" (nb)
711
- : "memory", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23"
712
- );
657
+ for (int c = 0; c < nc; c += ncols_interleaved) {
658
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
659
+ float32x4_t acc = vdupq_n_f32(0);
660
+ for (int b = 0; b < nb; b++) {
661
+ int8x16_t b0 = vld1q_s8((const int8_t *) b_ptr->qs);
662
+ int8x16_t b1 = vld1q_s8((const int8_t *) b_ptr->qs + 16);
663
+ int8x16_t b2 = vld1q_s8((const int8_t *) b_ptr->qs + 32);
664
+ int8x16_t b3 = vld1q_s8((const int8_t *) b_ptr->qs + 48);
665
+ float16x4_t bd = vld1_f16((const __fp16 *) b_ptr->d);
666
+
667
+ int8x16_t a0 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs);
668
+ int8x16_t a1 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 1);
669
+ int8x16_t a2 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 2);
670
+ int8x16_t a3 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 3);
671
+ float16x4_t ad = vld1_dup_f16((const __fp16 *) &a_ptr->d);
672
+
673
+ int32x4_t ret0 = vdupq_n_s32(0);
674
+ int32x4_t ret1 = vdupq_n_s32(0);
675
+
676
+ ret0 = vdotq_s32(ret0, b0 << 4, a0);
677
+ ret1 = vdotq_s32(ret1, b1 << 4, a0);
678
+ ret0 = vdotq_s32(ret0, b2 << 4, a1);
679
+ ret1 = vdotq_s32(ret1, b3 << 4, a1);
680
+
681
+ ret0 = vdotq_s32(ret0, b0 & 0xf0U, a2);
682
+ ret1 = vdotq_s32(ret1, b1 & 0xf0U, a2);
683
+ ret0 = vdotq_s32(ret0, b2 & 0xf0U, a3);
684
+ ret1 = vdotq_s32(ret1, b3 & 0xf0U, a3);
685
+
686
+ int32x4_t ret = vpaddq_s32(ret0, ret1);
687
+
688
+ acc = vfmaq_f32(acc, vcvtq_n_f32_s32(ret, 4),
689
+ vmulq_f32(vcvt_f32_f16(ad), vcvt_f32_f16(bd)));
690
+ a_ptr++;
691
+ b_ptr++;
692
+ }
693
+ vst1q_f32(s, acc);
694
+ s += ncols_interleaved;
695
+ }
713
696
  return;
714
697
  }
715
- #endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
698
+ #endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
716
699
  float sumf[4];
717
700
  int sumi;
718
701
 
@@ -4186,6 +4169,8 @@ static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_aarch64_buffer_type_alloc_bu
4186
4169
  buffer->buft = buft;
4187
4170
  buffer->iface.init_tensor = lm_ggml_backend_cpu_aarch64_buffer_init_tensor;
4188
4171
  buffer->iface.set_tensor = lm_ggml_backend_cpu_aarch64_buffer_set_tensor;
4172
+ buffer->iface.get_tensor = nullptr;
4173
+ buffer->iface.cpy_tensor = nullptr;
4189
4174
  return buffer;
4190
4175
  }
4191
4176
 
package/cpp/ggml-cpu-quants.c CHANGED
@@ -103,10 +103,14 @@ static inline __m256 sum_i16_pairs_float(const __m256i x) {
103
103
  }
104
104
 
105
105
  static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
106
- #if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
106
+ #if defined(__AVX512VNNI__) && defined(__AVX512VL__)
107
107
  const __m256i zero = _mm256_setzero_si256();
108
108
  const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy);
109
109
  return _mm256_cvtepi32_ps(summed_pairs);
110
+ #elif defined(__AVXVNNI__)
111
+ const __m256i zero = _mm256_setzero_si256();
112
+ const __m256i summed_pairs = _mm256_dpbusd_avx_epi32(zero, ax, sy);
113
+ return _mm256_cvtepi32_ps(summed_pairs);
110
114
  #else
111
115
  // Perform multiplication and create 16-bit values
112
116
  const __m256i dot = _mm256_maddubs_epi16(ax, sy);
package/cpp/ggml-cpu.c CHANGED
@@ -985,7 +985,7 @@ inline static void __wasm_f16x4_store(lm_ggml_fp16_t * p, v128_t x) {
985
985
  #define LM_GGML_F16_STEP 32
986
986
  #define LM_GGML_F16_EPR 4
987
987
 
988
- static inline __m128 __sse_f16x4_load(lm_ggml_fp16_t *x) {
988
+ static inline __m128 __sse_f16x4_load(const lm_ggml_fp16_t * x) {
989
989
  float tmp[4];
990
990
 
991
991
  tmp[0] = LM_GGML_FP16_TO_FP32(x[0]);
@@ -996,7 +996,7 @@ static inline __m128 __sse_f16x4_load(lm_ggml_fp16_t *x) {
996
996
  return _mm_loadu_ps(tmp);
997
997
  }
998
998
 
999
- static inline void __sse_f16x4_store(lm_ggml_fp16_t *x, __m128 y) {
999
+ static inline void __sse_f16x4_store(lm_ggml_fp16_t * x, __m128 y) {
1000
1000
  float arr[4];
1001
1001
 
1002
1002
  _mm_storeu_ps(arr, y);
@@ -7418,14 +7418,14 @@ static void lm_ggml_compute_forward_mul_mat(
7418
7418
  if (src1_cont) {
7419
7419
  for (int64_t i13 = 0; i13 < ne13; i13++)
7420
7420
  for (int64_t i12 = 0; i12 < ne12; i12++)
7421
- if (!llamafile_sgemm(ne01, ne11, ne00/lm_ggml_blck_size(src0->type),
7421
+ if (!llamafile_sgemm(params,
7422
+ ne01, ne11, ne00/lm_ggml_blck_size(src0->type),
7422
7423
  (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
7423
7424
  nb01/lm_ggml_type_size(src0->type),
7424
7425
  (const char *)src1->data + i12*nb12 + i13*nb13,
7425
7426
  nb11/lm_ggml_type_size(src1->type),
7426
7427
  (char *)dst->data + i12*nb2 + i13*nb3,
7427
7428
  nb1/lm_ggml_type_size(dst->type),
7428
- ith, nth,
7429
7429
  src0->type,
7430
7430
  src1->type,
7431
7431
  dst->type))
@@ -7470,14 +7470,14 @@ UseGgmlGemm1:;
7470
7470
 
7471
7471
  for (int64_t i13 = 0; i13 < ne13; i13++)
7472
7472
  for (int64_t i12 = 0; i12 < ne12; i12++)
7473
- if (!llamafile_sgemm(ne01, ne11, ne00/lm_ggml_blck_size(src0->type),
7473
+ if (!llamafile_sgemm(params,
7474
+ ne01, ne11, ne00/lm_ggml_blck_size(src0->type),
7474
7475
  (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
7475
7476
  nb01/lm_ggml_type_size(src0->type),
7476
7477
  (const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size,
7477
7478
  row_size/lm_ggml_type_size(vec_dot_type),
7478
7479
  (char *)dst->data + i12*nb2 + i13*nb3,
7479
7480
  nb1/lm_ggml_type_size(dst->type),
7480
- ith, nth,
7481
7481
  src0->type,
7482
7482
  vec_dot_type,
7483
7483
  dst->type))
package/cpp/ggml-cpu.cpp CHANGED
@@ -393,8 +393,11 @@ static bool lm_ggml_backend_cpu_device_supports_op(lm_ggml_backend_dev_t dev, co
393
393
  switch (op->op) {
394
394
  case LM_GGML_OP_CPY:
395
395
  return
396
+ op->type != LM_GGML_TYPE_IQ3_XXS &&
397
+ op->type != LM_GGML_TYPE_IQ3_S &&
396
398
  op->type != LM_GGML_TYPE_IQ2_XXS &&
397
399
  op->type != LM_GGML_TYPE_IQ2_XS &&
400
+ op->type != LM_GGML_TYPE_IQ2_S &&
398
401
  op->type != LM_GGML_TYPE_IQ1_S &&
399
402
  op->type != LM_GGML_TYPE_IQ1_M; // missing type_traits.from_float
400
403
  case LM_GGML_OP_MUL_MAT:
@@ -518,6 +521,12 @@ static lm_ggml_backend_feature * lm_ggml_backend_cpu_get_features(lm_ggml_backen
518
521
  if (lm_ggml_cpu_has_sve()) {
519
522
  features.push_back({ "SVE", "1" });
520
523
  }
524
+ if (lm_ggml_cpu_has_dotprod()) {
525
+ features.push_back({ "DOTPROD", "1" });
526
+ }
527
+ if (lm_ggml_cpu_has_matmul_int8()) {
528
+ features.push_back({ "MATMUL_INT8", "1" });
529
+ }
521
530
  if (lm_ggml_cpu_get_sve_cnt() > 0) {
522
531
  static std::string sve_cnt = std::to_string(lm_ggml_cpu_get_sve_cnt());
523
532
  features.push_back({ "SVE_CNT", sve_cnt.c_str() });
package/cpp/ggml-impl.h CHANGED
@@ -3,6 +3,8 @@
3
3
  // GGML internal header
4
4
 
5
5
  #include "ggml.h"
6
+ #include "gguf.h"
7
+
6
8
  #include <assert.h>
7
9
  #include <math.h>
8
10
  #include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
@@ -554,3 +556,12 @@ static inline lm_ggml_bf16_t lm_ggml_compute_fp32_to_bf16(float s) {
554
556
  #ifdef __cplusplus
555
557
  }
556
558
  #endif
559
+
560
+ #ifdef __cplusplus
561
+ #include <vector>
562
+
563
+ // expose GGUF internals for test code
564
+ LM_GGML_API size_t lm_gguf_type_size(enum lm_gguf_type type);
565
+ LM_GGML_API struct lm_gguf_context * lm_gguf_init_from_file_impl(FILE * file, struct lm_gguf_init_params params);
566
+ LM_GGML_API void lm_gguf_write_to_buf(const struct lm_gguf_context * ctx, std::vector<int8_t> & buf, bool only_meta);
567
+ #endif // __cplusplus
package/cpp/ggml-metal.m CHANGED
@@ -2067,8 +2067,8 @@ static void lm_ggml_metal_encode_node(
2067
2067
  LM_GGML_ASSERT(ne12 % ne02 == 0);
2068
2068
  LM_GGML_ASSERT(ne13 % ne03 == 0);
2069
2069
 
2070
- const uint r2 = ne12/ne02;
2071
- const uint r3 = ne13/ne03;
2070
+ const uint32_t r2 = ne12/ne02;
2071
+ const uint32_t r3 = ne13/ne03;
2072
2072
 
2073
2073
  // find the break-even point where the matrix-matrix kernel becomes more efficient compared
2074
2074
  // to the matrix-vector kernel