cui-llama.rn 1.3.4 → 1.3.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/CMakeLists.txt +14 -8
- package/android/src/main/jni.cpp +38 -37
- package/cpp/common.cpp +50 -30
- package/cpp/common.h +32 -13
- package/cpp/ggml-alloc.c +0 -1
- package/cpp/ggml-backend-reg.cpp +79 -49
- package/cpp/ggml-backend.cpp +5 -2
- package/cpp/ggml-cpp.h +1 -0
- package/cpp/ggml-cpu-aarch64.cpp +57 -72
- package/cpp/ggml-cpu-quants.c +5 -1
- package/cpp/ggml-cpu.c +6 -6
- package/cpp/ggml-cpu.cpp +9 -0
- package/cpp/ggml-impl.h +11 -0
- package/cpp/ggml-metal.m +2 -2
- package/cpp/ggml.c +129 -1388
- package/cpp/ggml.h +29 -152
- package/cpp/gguf.cpp +1325 -0
- package/cpp/gguf.h +202 -0
- package/cpp/llama-adapter.cpp +346 -0
- package/cpp/llama-adapter.h +73 -0
- package/cpp/llama-arch.cpp +1434 -0
- package/cpp/llama-arch.h +395 -0
- package/cpp/llama-batch.cpp +368 -0
- package/cpp/llama-batch.h +88 -0
- package/cpp/llama-chat.cpp +567 -0
- package/cpp/llama-chat.h +51 -0
- package/cpp/llama-context.cpp +1771 -0
- package/cpp/llama-context.h +128 -0
- package/cpp/llama-cparams.cpp +1 -0
- package/cpp/llama-cparams.h +37 -0
- package/cpp/llama-cpp.h +30 -0
- package/cpp/llama-grammar.cpp +16 -15
- package/cpp/llama-grammar.h +5 -6
- package/cpp/llama-hparams.cpp +71 -0
- package/cpp/llama-hparams.h +140 -0
- package/cpp/llama-impl.cpp +167 -0
- package/cpp/llama-impl.h +16 -136
- package/cpp/llama-kv-cache.cpp +718 -0
- package/cpp/llama-kv-cache.h +218 -0
- package/cpp/llama-mmap.cpp +589 -0
- package/cpp/llama-mmap.h +67 -0
- package/cpp/llama-model-loader.cpp +1011 -0
- package/cpp/llama-model-loader.h +158 -0
- package/cpp/llama-model.cpp +2202 -0
- package/cpp/llama-model.h +391 -0
- package/cpp/llama-sampling.cpp +117 -4
- package/cpp/llama-vocab.cpp +26 -29
- package/cpp/llama-vocab.h +14 -2
- package/cpp/llama.cpp +8839 -19131
- package/cpp/llama.cpp.rej +23 -0
- package/cpp/llama.h +31 -9
- package/cpp/rn-llama.hpp +39 -37
- package/cpp/sgemm.cpp +1091 -378
- package/cpp/sgemm.h +2 -2
- package/cpp/unicode.cpp +6 -0
- package/package.json +1 -1
package/cpp/ggml-backend-reg.cpp
CHANGED
@@ -66,6 +66,26 @@
|
|
66
66
|
#include "ggml-kompute.h"
|
67
67
|
#endif
|
68
68
|
|
69
|
+
// disable C++17 deprecation warning for std::codecvt_utf8
|
70
|
+
#if defined(__clang__)
|
71
|
+
# pragma clang diagnostic push
|
72
|
+
# pragma clang diagnostic ignored "-Wdeprecated-declarations"
|
73
|
+
#endif
|
74
|
+
|
75
|
+
static std::wstring utf8_to_utf16(const std::string & str) {
|
76
|
+
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
|
77
|
+
return converter.from_bytes(str);
|
78
|
+
}
|
79
|
+
|
80
|
+
static std::string utf16_to_utf8(const std::wstring & str) {
|
81
|
+
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
|
82
|
+
return converter.to_bytes(str);
|
83
|
+
}
|
84
|
+
|
85
|
+
#if defined(__clang__)
|
86
|
+
# pragma clang diagnostic pop
|
87
|
+
#endif
|
88
|
+
|
69
89
|
#ifdef _WIN32
|
70
90
|
|
71
91
|
using dl_handle = std::remove_pointer_t<HMODULE>;
|
@@ -88,11 +108,6 @@ static dl_handle * dl_load_library(const std::wstring & path) {
|
|
88
108
|
return handle;
|
89
109
|
}
|
90
110
|
|
91
|
-
static dl_handle * dl_load_library(const std::string & path) {
|
92
|
-
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
|
93
|
-
return dl_load_library(converter.from_bytes(path));
|
94
|
-
}
|
95
|
-
|
96
111
|
static void * dl_get_sym(dl_handle * handle, const char * name) {
|
97
112
|
DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
|
98
113
|
SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
|
@@ -114,8 +129,8 @@ struct dl_handle_deleter {
|
|
114
129
|
}
|
115
130
|
};
|
116
131
|
|
117
|
-
static void * dl_load_library(const std::
|
118
|
-
dl_handle * handle = dlopen(path.c_str(), RTLD_NOW | RTLD_LOCAL);
|
132
|
+
static void * dl_load_library(const std::wstring & path) {
|
133
|
+
dl_handle * handle = dlopen(utf16_to_utf8(path).c_str(), RTLD_NOW | RTLD_LOCAL);
|
119
134
|
|
120
135
|
return handle;
|
121
136
|
}
|
@@ -202,11 +217,11 @@ struct lm_ggml_backend_registry {
|
|
202
217
|
devices.push_back(device);
|
203
218
|
}
|
204
219
|
|
205
|
-
lm_ggml_backend_reg_t load_backend(const
|
220
|
+
lm_ggml_backend_reg_t load_backend(const std::wstring & path, bool silent) {
|
206
221
|
dl_handle_ptr handle { dl_load_library(path) };
|
207
222
|
if (!handle) {
|
208
223
|
if (!silent) {
|
209
|
-
LM_GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path);
|
224
|
+
LM_GGML_LOG_ERROR("%s: failed to load %s\n", __func__, utf16_to_utf8(path).c_str());
|
210
225
|
}
|
211
226
|
return nullptr;
|
212
227
|
}
|
@@ -214,7 +229,7 @@ struct lm_ggml_backend_registry {
|
|
214
229
|
auto score_fn = (lm_ggml_backend_score_t) dl_get_sym(handle.get(), "lm_ggml_backend_score");
|
215
230
|
if (score_fn && score_fn() == 0) {
|
216
231
|
if (!silent) {
|
217
|
-
LM_GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, path);
|
232
|
+
LM_GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, utf16_to_utf8(path).c_str());
|
218
233
|
}
|
219
234
|
return nullptr;
|
220
235
|
}
|
@@ -222,7 +237,7 @@ struct lm_ggml_backend_registry {
|
|
222
237
|
auto backend_init_fn = (lm_ggml_backend_init_t) dl_get_sym(handle.get(), "lm_ggml_backend_init");
|
223
238
|
if (!backend_init_fn) {
|
224
239
|
if (!silent) {
|
225
|
-
LM_GGML_LOG_ERROR("%s: failed to find lm_ggml_backend_init in %s\n", __func__, path);
|
240
|
+
LM_GGML_LOG_ERROR("%s: failed to find lm_ggml_backend_init in %s\n", __func__, utf16_to_utf8(path).c_str());
|
226
241
|
}
|
227
242
|
return nullptr;
|
228
243
|
}
|
@@ -231,16 +246,16 @@ struct lm_ggml_backend_registry {
|
|
231
246
|
if (!reg || reg->api_version != LM_GGML_BACKEND_API_VERSION) {
|
232
247
|
if (!silent) {
|
233
248
|
if (!reg) {
|
234
|
-
LM_GGML_LOG_ERROR("%s: failed to initialize backend from %s: lm_ggml_backend_init returned NULL\n", __func__, path);
|
249
|
+
LM_GGML_LOG_ERROR("%s: failed to initialize backend from %s: lm_ggml_backend_init returned NULL\n", __func__, utf16_to_utf8(path).c_str());
|
235
250
|
} else {
|
236
251
|
LM_GGML_LOG_ERROR("%s: failed to initialize backend from %s: incompatible API version (backend: %d, current: %d)\n",
|
237
|
-
__func__, path, reg->api_version, LM_GGML_BACKEND_API_VERSION);
|
252
|
+
__func__, utf16_to_utf8(path).c_str(), reg->api_version, LM_GGML_BACKEND_API_VERSION);
|
238
253
|
}
|
239
254
|
}
|
240
255
|
return nullptr;
|
241
256
|
}
|
242
257
|
|
243
|
-
LM_GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, lm_ggml_backend_reg_name(reg), path);
|
258
|
+
LM_GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, lm_ggml_backend_reg_name(reg), utf16_to_utf8(path).c_str());
|
244
259
|
|
245
260
|
register_backend(reg, std::move(handle));
|
246
261
|
|
@@ -376,14 +391,14 @@ lm_ggml_backend_t lm_ggml_backend_init_best(void) {
|
|
376
391
|
|
377
392
|
// Dynamic loading
|
378
393
|
lm_ggml_backend_reg_t lm_ggml_backend_load(const char * path) {
|
379
|
-
return get_reg().load_backend(path, false);
|
394
|
+
return get_reg().load_backend(utf8_to_utf16(path), false);
|
380
395
|
}
|
381
396
|
|
382
397
|
void lm_ggml_backend_unload(lm_ggml_backend_reg_t reg) {
|
383
398
|
get_reg().unload_backend(reg, true);
|
384
399
|
}
|
385
400
|
|
386
|
-
static std::
|
401
|
+
static std::wstring get_executable_path() {
|
387
402
|
#if defined(__APPLE__)
|
388
403
|
// get executable path
|
389
404
|
std::vector<char> path;
|
@@ -401,13 +416,17 @@ static std::string get_executable_path() {
|
|
401
416
|
if (last_slash != std::string::npos) {
|
402
417
|
base_path = base_path.substr(0, last_slash);
|
403
418
|
}
|
404
|
-
return base_path + "/";
|
405
|
-
#elif defined(__linux__)
|
419
|
+
return utf8_to_utf16(base_path + "/");
|
420
|
+
#elif defined(__linux__) || defined(__FreeBSD__)
|
406
421
|
std::string base_path = ".";
|
407
422
|
std::vector<char> path(1024);
|
408
423
|
while (true) {
|
409
424
|
// get executable path
|
425
|
+
# if defined(__linux__)
|
410
426
|
ssize_t len = readlink("/proc/self/exe", path.data(), path.size());
|
427
|
+
# elif defined(__FreeBSD__)
|
428
|
+
ssize_t len = readlink("/proc/curproc/file", path.data(), path.size());
|
429
|
+
# endif
|
411
430
|
if (len == -1) {
|
412
431
|
break;
|
413
432
|
}
|
@@ -423,57 +442,63 @@ static std::string get_executable_path() {
|
|
423
442
|
path.resize(path.size() * 2);
|
424
443
|
}
|
425
444
|
|
426
|
-
return base_path + "/";
|
445
|
+
return utf8_to_utf16(base_path + "/");
|
427
446
|
#elif defined(_WIN32)
|
428
|
-
std::vector<
|
429
|
-
DWORD len =
|
447
|
+
std::vector<wchar_t> path(MAX_PATH);
|
448
|
+
DWORD len = GetModuleFileNameW(NULL, path.data(), path.size());
|
430
449
|
if (len == 0) {
|
431
|
-
return
|
450
|
+
return {};
|
432
451
|
}
|
433
|
-
std::
|
452
|
+
std::wstring base_path(path.data(), len);
|
434
453
|
// remove executable name
|
435
454
|
auto last_slash = base_path.find_last_of('\\');
|
436
455
|
if (last_slash != std::string::npos) {
|
437
456
|
base_path = base_path.substr(0, last_slash);
|
438
457
|
}
|
439
|
-
return base_path + "\\";
|
458
|
+
return base_path + L"\\";
|
459
|
+
#else
|
460
|
+
return {};
|
440
461
|
#endif
|
441
462
|
}
|
442
463
|
|
443
|
-
static std::
|
464
|
+
static std::wstring backend_filename_prefix() {
|
444
465
|
#ifdef _WIN32
|
445
|
-
return "ggml-";
|
466
|
+
return L"ggml-";
|
446
467
|
#else
|
447
|
-
return "libggml-";
|
468
|
+
return L"libggml-";
|
448
469
|
#endif
|
449
470
|
}
|
450
471
|
|
451
|
-
static std::
|
472
|
+
static std::wstring backend_filename_suffix() {
|
452
473
|
#ifdef _WIN32
|
453
|
-
return ".dll";
|
474
|
+
return L".dll";
|
454
475
|
#else
|
455
|
-
return ".so";
|
476
|
+
return L".so";
|
477
|
+
#endif
|
478
|
+
}
|
479
|
+
|
480
|
+
static std::wstring path_separator() {
|
481
|
+
#ifdef _WIN32
|
482
|
+
return L"\\";
|
483
|
+
#else
|
484
|
+
return L"/";
|
456
485
|
#endif
|
457
486
|
}
|
458
487
|
|
459
488
|
static lm_ggml_backend_reg_t lm_ggml_backend_load_best(const char * name, bool silent, const char * user_search_path) {
|
460
489
|
// enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths
|
461
490
|
// TODO: search system paths
|
462
|
-
std::
|
463
|
-
std::vector<std::
|
491
|
+
std::wstring file_prefix = backend_filename_prefix() + utf8_to_utf16(name) + L"-";
|
492
|
+
std::vector<std::wstring> search_paths;
|
464
493
|
if (user_search_path == nullptr) {
|
465
|
-
search_paths.push_back("
|
494
|
+
search_paths.push_back(L"." + path_separator());
|
466
495
|
search_paths.push_back(get_executable_path());
|
467
496
|
} else {
|
468
|
-
|
469
|
-
search_paths.push_back(std::string(user_search_path) + "\\");
|
470
|
-
#else
|
471
|
-
search_paths.push_back(std::string(user_search_path) + "/");
|
472
|
-
#endif
|
497
|
+
search_paths.push_back(utf8_to_utf16(user_search_path) + path_separator());
|
473
498
|
}
|
474
499
|
|
475
500
|
int best_score = 0;
|
476
|
-
std::
|
501
|
+
std::wstring best_path;
|
477
502
|
|
478
503
|
namespace fs = std::filesystem;
|
479
504
|
for (const auto & search_path : search_paths) {
|
@@ -483,27 +508,27 @@ static lm_ggml_backend_reg_t lm_ggml_backend_load_best(const char * name, bool s
|
|
483
508
|
fs::directory_iterator dir_it(search_path, fs::directory_options::skip_permission_denied);
|
484
509
|
for (const auto & entry : dir_it) {
|
485
510
|
if (entry.is_regular_file()) {
|
486
|
-
std::
|
487
|
-
std::
|
511
|
+
std::wstring filename = entry.path().filename().wstring();
|
512
|
+
std::wstring ext = entry.path().extension().wstring();
|
488
513
|
if (filename.find(file_prefix) == 0 && ext == backend_filename_suffix()) {
|
489
|
-
dl_handle_ptr handle { dl_load_library(entry.path().
|
514
|
+
dl_handle_ptr handle { dl_load_library(entry.path().wstring()) };
|
490
515
|
if (!handle && !silent) {
|
491
|
-
LM_GGML_LOG_ERROR("%s: failed to load %s\n", __func__, entry.path().
|
516
|
+
LM_GGML_LOG_ERROR("%s: failed to load %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str());
|
492
517
|
}
|
493
518
|
if (handle) {
|
494
519
|
auto score_fn = (lm_ggml_backend_score_t) dl_get_sym(handle.get(), "lm_ggml_backend_score");
|
495
520
|
if (score_fn) {
|
496
521
|
int s = score_fn();
|
497
522
|
#ifndef NDEBUG
|
498
|
-
LM_GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, entry.path().
|
523
|
+
LM_GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str(), s);
|
499
524
|
#endif
|
500
525
|
if (s > best_score) {
|
501
526
|
best_score = s;
|
502
|
-
best_path = entry.path().
|
527
|
+
best_path = entry.path().wstring();
|
503
528
|
}
|
504
529
|
} else {
|
505
530
|
if (!silent) {
|
506
|
-
LM_GGML_LOG_INFO("%s: failed to find lm_ggml_backend_score in %s\n", __func__, entry.path().
|
531
|
+
LM_GGML_LOG_INFO("%s: failed to find lm_ggml_backend_score in %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str());
|
507
532
|
}
|
508
533
|
}
|
509
534
|
}
|
@@ -515,15 +540,15 @@ static lm_ggml_backend_reg_t lm_ggml_backend_load_best(const char * name, bool s
|
|
515
540
|
if (best_score == 0) {
|
516
541
|
// try to load the base backend
|
517
542
|
for (const auto & search_path : search_paths) {
|
518
|
-
std::
|
543
|
+
std::wstring path = search_path + backend_filename_prefix() + utf8_to_utf16(name) + backend_filename_suffix();
|
519
544
|
if (fs::exists(path)) {
|
520
|
-
return get_reg().load_backend(path
|
545
|
+
return get_reg().load_backend(path, silent);
|
521
546
|
}
|
522
547
|
}
|
523
548
|
return nullptr;
|
524
549
|
}
|
525
550
|
|
526
|
-
return get_reg().load_backend(best_path
|
551
|
+
return get_reg().load_backend(best_path, silent);
|
527
552
|
}
|
528
553
|
|
529
554
|
void lm_ggml_backend_load_all() {
|
@@ -549,4 +574,9 @@ void lm_ggml_backend_load_all_from_path(const char * dir_path) {
|
|
549
574
|
lm_ggml_backend_load_best("opencl", silent, dir_path);
|
550
575
|
lm_ggml_backend_load_best("musa", silent, dir_path);
|
551
576
|
lm_ggml_backend_load_best("cpu", silent, dir_path);
|
577
|
+
// check the environment variable LM_GGML_BACKEND_PATH to load an out-of-tree backend
|
578
|
+
const char * backend_path = std::getenv("LM_GGML_BACKEND_PATH");
|
579
|
+
if (backend_path) {
|
580
|
+
lm_ggml_backend_load(backend_path);
|
581
|
+
}
|
552
582
|
}
|
package/cpp/ggml-backend.cpp
CHANGED
@@ -764,7 +764,7 @@ static int lm_ggml_backend_sched_backend_id_from_cur(lm_ggml_backend_sched_t sch
|
|
764
764
|
if (tensor->op != LM_GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == LM_GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
|
765
765
|
int src_backend_id = lm_ggml_backend_sched_backend_from_buffer(sched, src, tensor);
|
766
766
|
// check if a backend with higher prio wants to offload the op
|
767
|
-
if (src_backend_id == sched->n_backends - 1) {
|
767
|
+
if (src_backend_id == sched->n_backends - 1 && lm_ggml_backend_buffer_is_host(src->buffer)) {
|
768
768
|
for (int b = 0; b < src_backend_id; b++) {
|
769
769
|
if (lm_ggml_backend_supports_op(sched->backends[b], tensor) && lm_ggml_backend_offload_op(sched->backends[b], tensor)) {
|
770
770
|
SET_CAUSE(tensor, "1.off");
|
@@ -795,9 +795,12 @@ static void lm_ggml_backend_sched_print_assignments(lm_ggml_backend_sched_t sche
|
|
795
795
|
for (int i = 0; i < graph->n_nodes; i++) {
|
796
796
|
if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
|
797
797
|
lm_ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
|
798
|
-
LM_GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs
|
798
|
+
LM_GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs", cur_split, lm_ggml_backend_name(split_backend),
|
799
799
|
sched->splits[cur_split].n_inputs);
|
800
800
|
for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
|
801
|
+
if (j == 0) {
|
802
|
+
LM_GGML_LOG_DEBUG(": ");
|
803
|
+
}
|
801
804
|
LM_GGML_LOG_DEBUG("[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
|
802
805
|
fmt_size(lm_ggml_nbytes(sched->splits[cur_split].inputs[j])));
|
803
806
|
}
|
package/cpp/ggml-cpp.h
CHANGED
package/cpp/ggml-cpu-aarch64.cpp
CHANGED
@@ -194,9 +194,12 @@ static inline __m256i sum_i16_pairs_int32x8(const __m256i x) {
|
|
194
194
|
}
|
195
195
|
|
196
196
|
static inline __m256i mul_sum_us8_pairs_int32x8(const __m256i ax, const __m256i sy) {
|
197
|
-
#if defined(
|
197
|
+
#if defined(__AVX512VNNI__) && defined(__AVX512VL__)
|
198
198
|
const __m256i zero = _mm256_setzero_si256();
|
199
199
|
return _mm256_dpbusd_epi32(zero, ax, sy);
|
200
|
+
#elif defined(__AVXVNNI__)
|
201
|
+
const __m256i zero = _mm256_setzero_si256();
|
202
|
+
return _mm256_dpbusd_avx_epi32(zero, ax, sy);
|
200
203
|
#else
|
201
204
|
// Perform multiplication and create 16-bit values
|
202
205
|
const __m256i dot = _mm256_maddubs_epi16(ax, sy);
|
@@ -564,21 +567,21 @@ static void lm_ggml_gemv_q4_0_4x4_q8_0(int n, float * LM_GGML_RESTRICT s, size_t
|
|
564
567
|
|
565
568
|
#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
|
566
569
|
if (lm_ggml_cpu_has_neon() && lm_ggml_cpu_has_dotprod()) {
|
567
|
-
const block_q4_0x4 * b_ptr = (const block_q4_0x4 *)vx;
|
570
|
+
const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx;
|
568
571
|
|
569
572
|
for (int c = 0; c < nc; c += ncols_interleaved) {
|
570
|
-
const block_q8_0 * a_ptr = (const block_q8_0 *)vy;
|
573
|
+
const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
571
574
|
float32x4_t acc = vdupq_n_f32(0);
|
572
575
|
for (int b = 0; b < nb; b++) {
|
573
|
-
int8x16_t b0 = vld1q_s8((const int8_t *)b_ptr->qs);
|
574
|
-
int8x16_t b1 = vld1q_s8((const int8_t *)b_ptr->qs + 16);
|
575
|
-
int8x16_t b2 = vld1q_s8((const int8_t *)b_ptr->qs + 32);
|
576
|
-
int8x16_t b3 = vld1q_s8((const int8_t *)b_ptr->qs + 48);
|
577
|
-
float16x4_t bd = vld1_f16((const __fp16 *)b_ptr->d);
|
576
|
+
int8x16_t b0 = vld1q_s8((const int8_t *) b_ptr->qs);
|
577
|
+
int8x16_t b1 = vld1q_s8((const int8_t *) b_ptr->qs + 16);
|
578
|
+
int8x16_t b2 = vld1q_s8((const int8_t *) b_ptr->qs + 32);
|
579
|
+
int8x16_t b3 = vld1q_s8((const int8_t *) b_ptr->qs + 48);
|
580
|
+
float16x4_t bd = vld1_f16((const __fp16 *) b_ptr->d);
|
578
581
|
|
579
582
|
int8x16_t a0 = vld1q_s8(a_ptr->qs);
|
580
583
|
int8x16_t a1 = vld1q_s8(a_ptr->qs + qk/2);
|
581
|
-
float16x4_t ad = vld1_dup_f16((const __fp16 *)&a_ptr->d);
|
584
|
+
float16x4_t ad = vld1_dup_f16((const __fp16 *) &a_ptr->d);
|
582
585
|
|
583
586
|
int32x4_t ret = vdupq_n_s32(0);
|
584
587
|
|
@@ -647,72 +650,52 @@ static void lm_ggml_gemv_q4_0_4x8_q8_0(int n, float * LM_GGML_RESTRICT s, size_t
|
|
647
650
|
UNUSED(ncols_interleaved);
|
648
651
|
UNUSED(blocklen);
|
649
652
|
|
650
|
-
#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(
|
651
|
-
if (lm_ggml_cpu_has_neon() &&
|
652
|
-
const
|
653
|
-
const void * a_ptr = vy;
|
654
|
-
float * res_ptr = s;
|
653
|
+
#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
|
654
|
+
if (lm_ggml_cpu_has_neon() && lm_ggml_cpu_has_dotprod()) {
|
655
|
+
const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx;
|
655
656
|
|
656
|
-
|
657
|
-
|
658
|
-
|
659
|
-
|
660
|
-
|
661
|
-
|
662
|
-
|
663
|
-
|
664
|
-
|
665
|
-
|
666
|
-
|
667
|
-
|
668
|
-
|
669
|
-
|
670
|
-
|
671
|
-
|
672
|
-
|
673
|
-
|
674
|
-
|
675
|
-
|
676
|
-
|
677
|
-
|
678
|
-
|
679
|
-
|
680
|
-
|
681
|
-
|
682
|
-
|
683
|
-
|
684
|
-
|
685
|
-
|
686
|
-
|
687
|
-
|
688
|
-
|
689
|
-
|
690
|
-
|
691
|
-
|
692
|
-
|
693
|
-
|
694
|
-
|
695
|
-
".inst 0x4e95967a // sdot v26.4s, v19.16b, v21.16b\n"
|
696
|
-
"fmul v16.4s, v16.4s, v25.4s\n"
|
697
|
-
".inst 0x4e9297fd // sdot v29.4s, v31.16b, v18.16b\n"
|
698
|
-
".inst 0x4e9297da // sdot v26.4s, v30.16b, v18.16b\n"
|
699
|
-
".inst 0x4e91979d // sdot v29.4s, v28.16b, v17.16b\n"
|
700
|
-
".inst 0x4e91977a // sdot v26.4s, v27.16b, v17.16b\n"
|
701
|
-
"addp v29.4s, v29.4s, v26.4s\n"
|
702
|
-
"scvtf v29.4s, v29.4s, #0x4\n"
|
703
|
-
"fmla v0.4s, v29.4s, v16.4s\n"
|
704
|
-
"cbnz x22, 2b\n"
|
705
|
-
"sub %x[nc], %x[nc], #0x4\n"
|
706
|
-
"str q0, [%x[res_ptr], #0x0]\n"
|
707
|
-
"add %x[res_ptr], %x[res_ptr], #0x10\n"
|
708
|
-
"cbnz %x[nc], 1b\n"
|
709
|
-
: [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [nc] "+&r" (nc)
|
710
|
-
: [a_ptr] "r" (a_ptr), [nb] "r" (nb)
|
711
|
-
: "memory", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23"
|
712
|
-
);
|
657
|
+
for (int c = 0; c < nc; c += ncols_interleaved) {
|
658
|
+
const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
659
|
+
float32x4_t acc = vdupq_n_f32(0);
|
660
|
+
for (int b = 0; b < nb; b++) {
|
661
|
+
int8x16_t b0 = vld1q_s8((const int8_t *) b_ptr->qs);
|
662
|
+
int8x16_t b1 = vld1q_s8((const int8_t *) b_ptr->qs + 16);
|
663
|
+
int8x16_t b2 = vld1q_s8((const int8_t *) b_ptr->qs + 32);
|
664
|
+
int8x16_t b3 = vld1q_s8((const int8_t *) b_ptr->qs + 48);
|
665
|
+
float16x4_t bd = vld1_f16((const __fp16 *) b_ptr->d);
|
666
|
+
|
667
|
+
int8x16_t a0 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs);
|
668
|
+
int8x16_t a1 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 1);
|
669
|
+
int8x16_t a2 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 2);
|
670
|
+
int8x16_t a3 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 3);
|
671
|
+
float16x4_t ad = vld1_dup_f16((const __fp16 *) &a_ptr->d);
|
672
|
+
|
673
|
+
int32x4_t ret0 = vdupq_n_s32(0);
|
674
|
+
int32x4_t ret1 = vdupq_n_s32(0);
|
675
|
+
|
676
|
+
ret0 = vdotq_s32(ret0, b0 << 4, a0);
|
677
|
+
ret1 = vdotq_s32(ret1, b1 << 4, a0);
|
678
|
+
ret0 = vdotq_s32(ret0, b2 << 4, a1);
|
679
|
+
ret1 = vdotq_s32(ret1, b3 << 4, a1);
|
680
|
+
|
681
|
+
ret0 = vdotq_s32(ret0, b0 & 0xf0U, a2);
|
682
|
+
ret1 = vdotq_s32(ret1, b1 & 0xf0U, a2);
|
683
|
+
ret0 = vdotq_s32(ret0, b2 & 0xf0U, a3);
|
684
|
+
ret1 = vdotq_s32(ret1, b3 & 0xf0U, a3);
|
685
|
+
|
686
|
+
int32x4_t ret = vpaddq_s32(ret0, ret1);
|
687
|
+
|
688
|
+
acc = vfmaq_f32(acc, vcvtq_n_f32_s32(ret, 4),
|
689
|
+
vmulq_f32(vcvt_f32_f16(ad), vcvt_f32_f16(bd)));
|
690
|
+
a_ptr++;
|
691
|
+
b_ptr++;
|
692
|
+
}
|
693
|
+
vst1q_f32(s, acc);
|
694
|
+
s += ncols_interleaved;
|
695
|
+
}
|
713
696
|
return;
|
714
697
|
}
|
715
|
-
#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(
|
698
|
+
#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
|
716
699
|
float sumf[4];
|
717
700
|
int sumi;
|
718
701
|
|
@@ -4186,6 +4169,8 @@ static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_aarch64_buffer_type_alloc_bu
|
|
4186
4169
|
buffer->buft = buft;
|
4187
4170
|
buffer->iface.init_tensor = lm_ggml_backend_cpu_aarch64_buffer_init_tensor;
|
4188
4171
|
buffer->iface.set_tensor = lm_ggml_backend_cpu_aarch64_buffer_set_tensor;
|
4172
|
+
buffer->iface.get_tensor = nullptr;
|
4173
|
+
buffer->iface.cpy_tensor = nullptr;
|
4189
4174
|
return buffer;
|
4190
4175
|
}
|
4191
4176
|
|
package/cpp/ggml-cpu-quants.c
CHANGED
@@ -103,10 +103,14 @@ static inline __m256 sum_i16_pairs_float(const __m256i x) {
|
|
103
103
|
}
|
104
104
|
|
105
105
|
static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
|
106
|
-
#if defined(
|
106
|
+
#if defined(__AVX512VNNI__) && defined(__AVX512VL__)
|
107
107
|
const __m256i zero = _mm256_setzero_si256();
|
108
108
|
const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy);
|
109
109
|
return _mm256_cvtepi32_ps(summed_pairs);
|
110
|
+
#elif defined(__AVXVNNI__)
|
111
|
+
const __m256i zero = _mm256_setzero_si256();
|
112
|
+
const __m256i summed_pairs = _mm256_dpbusd_avx_epi32(zero, ax, sy);
|
113
|
+
return _mm256_cvtepi32_ps(summed_pairs);
|
110
114
|
#else
|
111
115
|
// Perform multiplication and create 16-bit values
|
112
116
|
const __m256i dot = _mm256_maddubs_epi16(ax, sy);
|
package/cpp/ggml-cpu.c
CHANGED
@@ -985,7 +985,7 @@ inline static void __wasm_f16x4_store(lm_ggml_fp16_t * p, v128_t x) {
|
|
985
985
|
#define LM_GGML_F16_STEP 32
|
986
986
|
#define LM_GGML_F16_EPR 4
|
987
987
|
|
988
|
-
static inline __m128 __sse_f16x4_load(lm_ggml_fp16_t *x) {
|
988
|
+
static inline __m128 __sse_f16x4_load(const lm_ggml_fp16_t * x) {
|
989
989
|
float tmp[4];
|
990
990
|
|
991
991
|
tmp[0] = LM_GGML_FP16_TO_FP32(x[0]);
|
@@ -996,7 +996,7 @@ static inline __m128 __sse_f16x4_load(lm_ggml_fp16_t *x) {
|
|
996
996
|
return _mm_loadu_ps(tmp);
|
997
997
|
}
|
998
998
|
|
999
|
-
static inline void __sse_f16x4_store(lm_ggml_fp16_t *x, __m128 y) {
|
999
|
+
static inline void __sse_f16x4_store(lm_ggml_fp16_t * x, __m128 y) {
|
1000
1000
|
float arr[4];
|
1001
1001
|
|
1002
1002
|
_mm_storeu_ps(arr, y);
|
@@ -7418,14 +7418,14 @@ static void lm_ggml_compute_forward_mul_mat(
|
|
7418
7418
|
if (src1_cont) {
|
7419
7419
|
for (int64_t i13 = 0; i13 < ne13; i13++)
|
7420
7420
|
for (int64_t i12 = 0; i12 < ne12; i12++)
|
7421
|
-
if (!llamafile_sgemm(
|
7421
|
+
if (!llamafile_sgemm(params,
|
7422
|
+
ne01, ne11, ne00/lm_ggml_blck_size(src0->type),
|
7422
7423
|
(const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
|
7423
7424
|
nb01/lm_ggml_type_size(src0->type),
|
7424
7425
|
(const char *)src1->data + i12*nb12 + i13*nb13,
|
7425
7426
|
nb11/lm_ggml_type_size(src1->type),
|
7426
7427
|
(char *)dst->data + i12*nb2 + i13*nb3,
|
7427
7428
|
nb1/lm_ggml_type_size(dst->type),
|
7428
|
-
ith, nth,
|
7429
7429
|
src0->type,
|
7430
7430
|
src1->type,
|
7431
7431
|
dst->type))
|
@@ -7470,14 +7470,14 @@ UseGgmlGemm1:;
|
|
7470
7470
|
|
7471
7471
|
for (int64_t i13 = 0; i13 < ne13; i13++)
|
7472
7472
|
for (int64_t i12 = 0; i12 < ne12; i12++)
|
7473
|
-
if (!llamafile_sgemm(
|
7473
|
+
if (!llamafile_sgemm(params,
|
7474
|
+
ne01, ne11, ne00/lm_ggml_blck_size(src0->type),
|
7474
7475
|
(const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
|
7475
7476
|
nb01/lm_ggml_type_size(src0->type),
|
7476
7477
|
(const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size,
|
7477
7478
|
row_size/lm_ggml_type_size(vec_dot_type),
|
7478
7479
|
(char *)dst->data + i12*nb2 + i13*nb3,
|
7479
7480
|
nb1/lm_ggml_type_size(dst->type),
|
7480
|
-
ith, nth,
|
7481
7481
|
src0->type,
|
7482
7482
|
vec_dot_type,
|
7483
7483
|
dst->type))
|
package/cpp/ggml-cpu.cpp
CHANGED
@@ -393,8 +393,11 @@ static bool lm_ggml_backend_cpu_device_supports_op(lm_ggml_backend_dev_t dev, co
|
|
393
393
|
switch (op->op) {
|
394
394
|
case LM_GGML_OP_CPY:
|
395
395
|
return
|
396
|
+
op->type != LM_GGML_TYPE_IQ3_XXS &&
|
397
|
+
op->type != LM_GGML_TYPE_IQ3_S &&
|
396
398
|
op->type != LM_GGML_TYPE_IQ2_XXS &&
|
397
399
|
op->type != LM_GGML_TYPE_IQ2_XS &&
|
400
|
+
op->type != LM_GGML_TYPE_IQ2_S &&
|
398
401
|
op->type != LM_GGML_TYPE_IQ1_S &&
|
399
402
|
op->type != LM_GGML_TYPE_IQ1_M; // missing type_traits.from_float
|
400
403
|
case LM_GGML_OP_MUL_MAT:
|
@@ -518,6 +521,12 @@ static lm_ggml_backend_feature * lm_ggml_backend_cpu_get_features(lm_ggml_backen
|
|
518
521
|
if (lm_ggml_cpu_has_sve()) {
|
519
522
|
features.push_back({ "SVE", "1" });
|
520
523
|
}
|
524
|
+
if (lm_ggml_cpu_has_dotprod()) {
|
525
|
+
features.push_back({ "DOTPROD", "1" });
|
526
|
+
}
|
527
|
+
if (lm_ggml_cpu_has_matmul_int8()) {
|
528
|
+
features.push_back({ "MATMUL_INT8", "1" });
|
529
|
+
}
|
521
530
|
if (lm_ggml_cpu_get_sve_cnt() > 0) {
|
522
531
|
static std::string sve_cnt = std::to_string(lm_ggml_cpu_get_sve_cnt());
|
523
532
|
features.push_back({ "SVE_CNT", sve_cnt.c_str() });
|
package/cpp/ggml-impl.h
CHANGED
@@ -3,6 +3,8 @@
|
|
3
3
|
// GGML internal header
|
4
4
|
|
5
5
|
#include "ggml.h"
|
6
|
+
#include "gguf.h"
|
7
|
+
|
6
8
|
#include <assert.h>
|
7
9
|
#include <math.h>
|
8
10
|
#include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
|
@@ -554,3 +556,12 @@ static inline lm_ggml_bf16_t lm_ggml_compute_fp32_to_bf16(float s) {
|
|
554
556
|
#ifdef __cplusplus
|
555
557
|
}
|
556
558
|
#endif
|
559
|
+
|
560
|
+
#ifdef __cplusplus
|
561
|
+
#include <vector>
|
562
|
+
|
563
|
+
// expose GGUF internals for test code
|
564
|
+
LM_GGML_API size_t lm_gguf_type_size(enum lm_gguf_type type);
|
565
|
+
LM_GGML_API struct lm_gguf_context * lm_gguf_init_from_file_impl(FILE * file, struct lm_gguf_init_params params);
|
566
|
+
LM_GGML_API void lm_gguf_write_to_buf(const struct lm_gguf_context * ctx, std::vector<int8_t> & buf, bool only_meta);
|
567
|
+
#endif // __cplusplus
|
package/cpp/ggml-metal.m
CHANGED
@@ -2067,8 +2067,8 @@ static void lm_ggml_metal_encode_node(
|
|
2067
2067
|
LM_GGML_ASSERT(ne12 % ne02 == 0);
|
2068
2068
|
LM_GGML_ASSERT(ne13 % ne03 == 0);
|
2069
2069
|
|
2070
|
-
const
|
2071
|
-
const
|
2070
|
+
const uint32_t r2 = ne12/ne02;
|
2071
|
+
const uint32_t r3 = ne13/ne03;
|
2072
2072
|
|
2073
2073
|
// find the break-even point where the matrix-matrix kernel becomes more efficient compared
|
2074
2074
|
// to the matrix-vector kernel
|