cui-llama.rn 1.3.4 → 1.3.5
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- package/cpp/common.cpp +7 -4
- package/cpp/common.h +14 -2
- package/cpp/ggml-alloc.c +0 -1
- package/cpp/ggml-backend-reg.cpp +74 -49
- package/cpp/ggml-cpu-aarch64.cpp +51 -71
- package/cpp/ggml-cpu.c +6 -6
- package/cpp/ggml-cpu.cpp +9 -0
- package/cpp/ggml-impl.h +16 -0
- package/cpp/ggml.c +153 -136
- package/cpp/ggml.h +29 -12
- package/cpp/llama-grammar.cpp +15 -15
- package/cpp/llama-grammar.h +2 -5
- package/cpp/llama-vocab.cpp +5 -1
- package/cpp/llama-vocab.h +1 -1
- package/cpp/llama.cpp +992 -300
- package/cpp/llama.h +0 -3
- package/cpp/sgemm.cpp +265 -258
- package/cpp/sgemm.h +2 -2
- package/package.json +1 -1
package/cpp/common.cpp
CHANGED
@@ -1105,7 +1105,7 @@ struct lm_ggml_threadpool_params lm_ggml_threadpool_params_from_cpu_params(const
 #define CURL_MAX_RETRY 3
 #define CURL_RETRY_DELAY_SECONDS 2
 
-static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_attempts, int retry_delay_seconds) {
+static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds) {
     int remaining_attempts = max_attempts;
 
     while (remaining_attempts > 0) {
@@ -1129,7 +1129,6 @@ static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_
 }
 
 static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
-
     // Initialize libcurl
     std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
     if (!curl) {
@@ -1202,11 +1201,13 @@ static bool common_download_file(const std::string & url, const std::string & pa
         std::string etag;
         std::string last_modified;
     };
+
     common_load_model_from_url_headers headers;
+
     {
         typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
         auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
-            common_load_model_from_url_headers *headers = (common_load_model_from_url_headers *) userdata;
+            common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
 
             static std::regex header_regex("([^:]+): (.*)\r\n");
             static std::regex etag_regex("ETag", std::regex_constants::icase);
@@ -1790,7 +1791,9 @@ void common_embd_normalize(const float * inp, float * out, int n, int embd_norm)
             break;
         case 0: // max absolute
             for (int i = 0; i < n; i++) {
-                if (sum < std::abs(inp[i])) sum = std::abs(inp[i]);
+                if (sum < std::abs(inp[i])) {
+                    sum = std::abs(inp[i]);
+                }
             }
             sum /= 32760.0; // make an int16 range
             break;
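The common_embd_normalize hunk above only adds braces around the single-statement if in the "max absolute" branch; behaviour is unchanged. As a rough standalone illustration of what that branch computes (a sketch, not the package's code: the 32760 scale comes from the hunk, while the trailing rescale by 1/sum, the helper name and main() are assumptions about the surrounding function):

    #include <cmath>
    #include <cstdio>

    // "max absolute" normalization (embd_norm == 0): scale the embedding so the
    // largest |value| lands near the int16 range, as in the hunk above.
    static void embd_normalize_max_abs(const float * inp, float * out, int n) {
        double sum = 0.0;
        for (int i = 0; i < n; i++) {
            if (sum < std::abs(inp[i])) {
                sum = std::abs(inp[i]);
            }
        }
        sum /= 32760.0; // make an int16 range
        const double norm = sum > 0.0 ? 1.0 / sum : 0.0;
        for (int i = 0; i < n; i++) {
            out[i] = (float) (inp[i] * norm);
        }
    }

    int main() {
        const float emb[4] = { 0.1f, -2.0f, 0.5f, 1.0f };
        float scaled[4];
        embd_normalize_max_abs(emb, scaled, 4);
        std::printf("%f\n", scaled[1]); // the largest-magnitude element maps to about -32760
        return 0;
    }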
package/cpp/common.h
CHANGED
@@ -91,6 +91,7 @@ enum llama_example {
     LLAMA_EXAMPLE_LLAVA,
     LLAMA_EXAMPLE_LOOKUP,
     LLAMA_EXAMPLE_PARALLEL,
+    LLAMA_EXAMPLE_TTS,
 
     LLAMA_EXAMPLE_COUNT,
 };
@@ -170,6 +171,7 @@ struct common_params_sampling {
 
 struct common_params_speculative {
     std::vector<lm_ggml_backend_dev_t> devices; // devices to use for offloading
+
     int32_t n_ctx = 0; // draft context size
     int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
     int32_t n_min = 5; // minimum number of draft tokens to use for speculative decoding
@@ -183,6 +185,14 @@
     std::string model = ""; // draft model for speculative decoding // NOLINT
 };
 
+struct common_params_vocoder {
+    std::string hf_repo = ""; // HF repo // NOLINT
+    std::string hf_file = ""; // HF file // NOLINT
+
+    std::string model     = ""; // model path             // NOLINT
+    std::string model_url = ""; // model url to download  // NOLINT
+};
+
 struct common_params {
 
     void * progress_callback_user_data = nullptr;
@@ -229,8 +239,9 @@
     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
     enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
 
-    struct common_params_sampling sampling;
+    struct common_params_sampling    sampling;
     struct common_params_speculative speculative;
+    struct common_params_vocoder     vocoder;
 
     std::string model = ""; // model path // NOLINT
     std::string model_alias = ""; // model alias // NOLINT
@@ -611,7 +622,8 @@ void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_si
 // Embedding utils
 //
 
-void common_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);
+// TODO: repace embd_norm with an enum
+void common_embd_normalize(const float * inp, float * out, int n, int embd_norm);
 
 float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);
 
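The new common_params_vocoder block mirrors the model / model_url / HF fields used elsewhere in common_params, and common_params now carries it as the vocoder member. A hypothetical usage sketch (the field names come from the struct above; the repo and file values are placeholders, and the tie-in with the new LLAMA_EXAMPLE_TTS entry is an assumption, not code from the package):

    #include "common.h"

    // Hypothetical helper: point the vocoder at an HF-hosted GGUF, or at a local
    // path / direct URL, the same way the main model fields are used.
    static void configure_vocoder(common_params & params) {
        params.vocoder.hf_repo = "example-org/example-vocoder"; // placeholder repo
        params.vocoder.hf_file = "vocoder.gguf";                // placeholder file

        // alternatively:
        // params.vocoder.model     = "/path/to/vocoder.gguf";
        // params.vocoder.model_url = "https://example.com/vocoder.gguf";
    }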
package/cpp/ggml-alloc.c
CHANGED
package/cpp/ggml-backend-reg.cpp
CHANGED
@@ -66,6 +66,26 @@
 #include "ggml-kompute.h"
 #endif
 
+// disable C++17 deprecation warning for std::codecvt_utf8
+#if defined(__clang__)
+#    pragma clang diagnostic push
+#    pragma clang diagnostic ignored "-Wdeprecated-declarations"
+#endif
+
+static std::wstring utf8_to_utf16(const std::string & str) {
+    std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
+    return converter.from_bytes(str);
+}
+
+static std::string utf16_to_utf8(const std::wstring & str) {
+    std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
+    return converter.to_bytes(str);
+}
+
+#if defined(__clang__)
+#    pragma clang diagnostic pop
+#endif
+
 #ifdef _WIN32
 
 using dl_handle = std::remove_pointer_t<HMODULE>;
@@ -88,11 +108,6 @@ static dl_handle * dl_load_library(const std::wstring & path) {
     return handle;
 }
 
-static dl_handle * dl_load_library(const std::string & path) {
-    std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
-    return dl_load_library(converter.from_bytes(path));
-}
-
 static void * dl_get_sym(dl_handle * handle, const char * name) {
     DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
     SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
@@ -114,8 +129,8 @@ struct dl_handle_deleter {
     }
 };
 
-static void * dl_load_library(const std::string & path) {
-    dl_handle * handle = dlopen(path.c_str(), RTLD_NOW | RTLD_LOCAL);
+static void * dl_load_library(const std::wstring & path) {
+    dl_handle * handle = dlopen(utf16_to_utf8(path).c_str(), RTLD_NOW | RTLD_LOCAL);
 
     return handle;
 }
@@ -202,11 +217,11 @@ struct lm_ggml_backend_registry {
         devices.push_back(device);
     }
 
-    lm_ggml_backend_reg_t load_backend(const char * path, bool silent) {
+    lm_ggml_backend_reg_t load_backend(const std::wstring & path, bool silent) {
         dl_handle_ptr handle { dl_load_library(path) };
         if (!handle) {
             if (!silent) {
-                LM_GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path);
+                LM_GGML_LOG_ERROR("%s: failed to load %s\n", __func__, utf16_to_utf8(path).c_str());
             }
             return nullptr;
         }
@@ -214,7 +229,7 @@ struct lm_ggml_backend_registry {
         auto score_fn = (lm_ggml_backend_score_t) dl_get_sym(handle.get(), "lm_ggml_backend_score");
         if (score_fn && score_fn() == 0) {
             if (!silent) {
-                LM_GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, path);
+                LM_GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, utf16_to_utf8(path).c_str());
             }
             return nullptr;
         }
@@ -222,7 +237,7 @@ struct lm_ggml_backend_registry {
         auto backend_init_fn = (lm_ggml_backend_init_t) dl_get_sym(handle.get(), "lm_ggml_backend_init");
         if (!backend_init_fn) {
             if (!silent) {
-                LM_GGML_LOG_ERROR("%s: failed to find lm_ggml_backend_init in %s\n", __func__, path);
+                LM_GGML_LOG_ERROR("%s: failed to find lm_ggml_backend_init in %s\n", __func__, utf16_to_utf8(path).c_str());
             }
             return nullptr;
         }
@@ -231,16 +246,16 @@ struct lm_ggml_backend_registry {
         if (!reg || reg->api_version != LM_GGML_BACKEND_API_VERSION) {
             if (!silent) {
                 if (!reg) {
-                    LM_GGML_LOG_ERROR("%s: failed to initialize backend from %s: lm_ggml_backend_init returned NULL\n", __func__, path);
+                    LM_GGML_LOG_ERROR("%s: failed to initialize backend from %s: lm_ggml_backend_init returned NULL\n", __func__, utf16_to_utf8(path).c_str());
                 } else {
                     LM_GGML_LOG_ERROR("%s: failed to initialize backend from %s: incompatible API version (backend: %d, current: %d)\n",
-                        __func__, path, reg->api_version, LM_GGML_BACKEND_API_VERSION);
+                        __func__, utf16_to_utf8(path).c_str(), reg->api_version, LM_GGML_BACKEND_API_VERSION);
                 }
             }
             return nullptr;
         }
 
-        LM_GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, lm_ggml_backend_reg_name(reg), path);
+        LM_GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, lm_ggml_backend_reg_name(reg), utf16_to_utf8(path).c_str());
 
         register_backend(reg, std::move(handle));
 
@@ -376,14 +391,14 @@ lm_ggml_backend_t lm_ggml_backend_init_best(void) {
 
 // Dynamic loading
 lm_ggml_backend_reg_t lm_ggml_backend_load(const char * path) {
-    return get_reg().load_backend(path, false);
+    return get_reg().load_backend(utf8_to_utf16(path), false);
 }
 
 void lm_ggml_backend_unload(lm_ggml_backend_reg_t reg) {
     get_reg().unload_backend(reg, true);
 }
 
-static std::string get_executable_path() {
+static std::wstring get_executable_path() {
 #if defined(__APPLE__)
     // get executable path
     std::vector<char> path;
@@ -401,13 +416,17 @@ static std::string get_executable_path() {
     if (last_slash != std::string::npos) {
         base_path = base_path.substr(0, last_slash);
     }
-    return base_path + "/";
-#elif defined(__linux__)
+    return utf8_to_utf16(base_path + "/");
+#elif defined(__linux__) || defined(__FreeBSD__)
     std::string base_path = ".";
     std::vector<char> path(1024);
     while (true) {
         // get executable path
+#    if defined(__linux__)
         ssize_t len = readlink("/proc/self/exe", path.data(), path.size());
+#    elif defined(__FreeBSD__)
+        ssize_t len = readlink("/proc/curproc/file", path.data(), path.size());
+#    endif
         if (len == -1) {
             break;
         }
@@ -423,57 +442,63 @@
         path.resize(path.size() * 2);
     }
 
-    return base_path + "/";
+    return utf8_to_utf16(base_path + "/");
 #elif defined(_WIN32)
-    std::vector<char> path(MAX_PATH);
-    DWORD len = GetModuleFileNameA(NULL, path.data(), path.size());
+    std::vector<wchar_t> path(MAX_PATH);
+    DWORD len = GetModuleFileNameW(NULL, path.data(), path.size());
     if (len == 0) {
-        return "";
+        return {};
     }
-    std::string base_path(path.data(), len);
+    std::wstring base_path(path.data(), len);
     // remove executable name
     auto last_slash = base_path.find_last_of('\\');
     if (last_slash != std::string::npos) {
         base_path = base_path.substr(0, last_slash);
     }
-    return base_path + "\\";
+    return base_path + L"\\";
+#else
+    return {};
 #endif
 }
 
-static std::string backend_filename_prefix() {
+static std::wstring backend_filename_prefix() {
 #ifdef _WIN32
-    return "ggml-";
+    return L"ggml-";
 #else
-    return "libggml-";
+    return L"libggml-";
 #endif
 }
 
-static std::string backend_filename_suffix() {
+static std::wstring backend_filename_suffix() {
 #ifdef _WIN32
-    return ".dll";
+    return L".dll";
 #else
-    return ".so";
+    return L".so";
+#endif
+}
+
+static std::wstring path_separator() {
+#ifdef _WIN32
+    return L"\\";
+#else
+    return L"/";
 #endif
 }
 
 static lm_ggml_backend_reg_t lm_ggml_backend_load_best(const char * name, bool silent, const char * user_search_path) {
     // enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths
     // TODO: search system paths
-    std::string file_prefix = backend_filename_prefix() + name + "-";
-    std::vector<std::string> search_paths;
+    std::wstring file_prefix = backend_filename_prefix() + utf8_to_utf16(name) + L"-";
+    std::vector<std::wstring> search_paths;
     if (user_search_path == nullptr) {
-        search_paths.push_back("./");
+        search_paths.push_back(L"." + path_separator());
         search_paths.push_back(get_executable_path());
     } else {
-#if defined(_WIN32)
-        search_paths.push_back(std::string(user_search_path) + "\\");
-#else
-        search_paths.push_back(std::string(user_search_path) + "/");
-#endif
+        search_paths.push_back(utf8_to_utf16(user_search_path) + path_separator());
     }
 
     int best_score = 0;
-    std::string best_path;
+    std::wstring best_path;
 
     namespace fs = std::filesystem;
     for (const auto & search_path : search_paths) {
@@ -483,27 +508,27 @@ static lm_ggml_backend_reg_t lm_ggml_backend_load_best(const char * name, bool s
         fs::directory_iterator dir_it(search_path, fs::directory_options::skip_permission_denied);
         for (const auto & entry : dir_it) {
             if (entry.is_regular_file()) {
-                std::string filename = entry.path().filename().string();
-                std::string ext = entry.path().extension().string();
+                std::wstring filename = entry.path().filename().wstring();
+                std::wstring ext = entry.path().extension().wstring();
                 if (filename.find(file_prefix) == 0 && ext == backend_filename_suffix()) {
-                    dl_handle_ptr handle { dl_load_library(entry.path().c_str()) };
+                    dl_handle_ptr handle { dl_load_library(entry.path().wstring()) };
                     if (!handle && !silent) {
-                        LM_GGML_LOG_ERROR("%s: failed to load %s\n", __func__, entry.path().string().c_str());
+                        LM_GGML_LOG_ERROR("%s: failed to load %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str());
                     }
                     if (handle) {
                         auto score_fn = (lm_ggml_backend_score_t) dl_get_sym(handle.get(), "lm_ggml_backend_score");
                         if (score_fn) {
                             int s = score_fn();
 #ifndef NDEBUG
-                            LM_GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, entry.path().string().c_str(), s);
+                            LM_GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str(), s);
 #endif
                             if (s > best_score) {
                                 best_score = s;
-                                best_path = entry.path().string();
+                                best_path = entry.path().wstring();
                             }
                         } else {
                             if (!silent) {
-                                LM_GGML_LOG_INFO("%s: failed to find lm_ggml_backend_score in %s\n", __func__, entry.path().string().c_str());
+                                LM_GGML_LOG_INFO("%s: failed to find lm_ggml_backend_score in %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str());
                             }
                         }
                     }
@@ -515,15 +540,15 @@ static lm_ggml_backend_reg_t lm_ggml_backend_load_best(const char * name, bool s
     if (best_score == 0) {
         // try to load the base backend
        for (const auto & search_path : search_paths) {
-            std::string path = search_path + backend_filename_prefix() + name + backend_filename_suffix();
+            std::wstring path = search_path + backend_filename_prefix() + utf8_to_utf16(name) + backend_filename_suffix();
            if (fs::exists(path)) {
-                return get_reg().load_backend(path.c_str(), silent);
+                return get_reg().load_backend(path, silent);
            }
        }
        return nullptr;
    }
 
-    return get_reg().load_backend(best_path.c_str(), silent);
+    return get_reg().load_backend(best_path, silent);
 }
 
 void lm_ggml_backend_load_all() {
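All of the path handling above now flows through std::wstring internally and converts at the boundaries with the two std::wstring_convert helpers added at the top of the file. A minimal round trip of that conversion pattern, lifted from those helpers (the std::codecvt_utf8_utf16 facet is deprecated in C++17, which is why the hunk wraps it in clang diagnostic pragmas; the main() and the sample path are illustrative only):

    #include <codecvt>
    #include <iostream>
    #include <locale>
    #include <string>

    // Same conversion pattern as the utf8_to_utf16 / utf16_to_utf8 helpers above.
    static std::wstring utf8_to_utf16(const std::string & str) {
        std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
        return converter.from_bytes(str);
    }

    static std::string utf16_to_utf8(const std::wstring & str) {
        std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
        return converter.to_bytes(str);
    }

    int main() {
        const std::string path = "ggml-cpu-ユーザー.dll";   // non-ASCII path component
        const std::wstring wide = utf8_to_utf16(path);     // what the registry now stores internally
        std::cout << utf16_to_utf8(wide) << "\n";          // converted back only for logging
        return 0;
    }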
package/cpp/ggml-cpu-aarch64.cpp
CHANGED
@@ -564,21 +564,21 @@ static void lm_ggml_gemv_q4_0_4x4_q8_0(int n, float * LM_GGML_RESTRICT s, size_t
 
 #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
     if (lm_ggml_cpu_has_neon() && lm_ggml_cpu_has_dotprod()) {
-        const block_q4_0x4 * b_ptr = (const block_q4_0x4 *)vx;
+        const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx;
 
         for (int c = 0; c < nc; c += ncols_interleaved) {
-            const block_q8_0 * a_ptr = (const block_q8_0 *)vy;
+            const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
             float32x4_t acc = vdupq_n_f32(0);
             for (int b = 0; b < nb; b++) {
-                int8x16_t b0 = vld1q_s8((const int8_t *)b_ptr->qs);
-                int8x16_t b1 = vld1q_s8((const int8_t *)b_ptr->qs + 16);
-                int8x16_t b2 = vld1q_s8((const int8_t *)b_ptr->qs + 32);
-                int8x16_t b3 = vld1q_s8((const int8_t *)b_ptr->qs + 48);
-                float16x4_t bd = vld1_f16((const __fp16 *)b_ptr->d);
+                int8x16_t b0 = vld1q_s8((const int8_t *) b_ptr->qs);
+                int8x16_t b1 = vld1q_s8((const int8_t *) b_ptr->qs + 16);
+                int8x16_t b2 = vld1q_s8((const int8_t *) b_ptr->qs + 32);
+                int8x16_t b3 = vld1q_s8((const int8_t *) b_ptr->qs + 48);
+                float16x4_t bd = vld1_f16((const __fp16 *) b_ptr->d);
 
                 int8x16_t a0 = vld1q_s8(a_ptr->qs);
                 int8x16_t a1 = vld1q_s8(a_ptr->qs + qk/2);
-                float16x4_t ad = vld1_dup_f16((const __fp16 *)&a_ptr->d);
+                float16x4_t ad = vld1_dup_f16((const __fp16 *) &a_ptr->d);
 
                 int32x4_t ret = vdupq_n_s32(0);
 
@@ -647,72 +647,52 @@ static void lm_ggml_gemv_q4_0_4x8_q8_0(int n, float * LM_GGML_RESTRICT s, size_t
     UNUSED(ncols_interleaved);
     UNUSED(blocklen);
 
-#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
-    if (lm_ggml_cpu_has_neon() && lm_ggml_cpu_has_matmul_int8()) {
-        const void * b_ptr = vx;
-        const void * a_ptr = vy;
-        float * res_ptr = s;
+#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
+    if (lm_ggml_cpu_has_neon() && lm_ggml_cpu_has_dotprod()) {
+        const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx;
 
-            [... body of the removed inline NEON assembly not preserved in this extraction ...]
-            ".inst 0x4e95967a // sdot v26.4s, v19.16b, v21.16b\n"
-            "fmul v16.4s, v16.4s, v25.4s\n"
-            ".inst 0x4e9297fd // sdot v29.4s, v31.16b, v18.16b\n"
-            ".inst 0x4e9297da // sdot v26.4s, v30.16b, v18.16b\n"
-            ".inst 0x4e91979d // sdot v29.4s, v28.16b, v17.16b\n"
-            ".inst 0x4e91977a // sdot v26.4s, v27.16b, v17.16b\n"
-            "addp v29.4s, v29.4s, v26.4s\n"
-            "scvtf v29.4s, v29.4s, #0x4\n"
-            "fmla v0.4s, v29.4s, v16.4s\n"
-            "cbnz x22, 2b\n"
-            "sub %x[nc], %x[nc], #0x4\n"
-            "str q0, [%x[res_ptr], #0x0]\n"
-            "add %x[res_ptr], %x[res_ptr], #0x10\n"
-            "cbnz %x[nc], 1b\n"
-            : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [nc] "+&r" (nc)
-            : [a_ptr] "r" (a_ptr), [nb] "r" (nb)
-            : "memory", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23"
-        );
+        for (int c = 0; c < nc; c += ncols_interleaved) {
+            const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+            float32x4_t acc = vdupq_n_f32(0);
+            for (int b = 0; b < nb; b++) {
+                int8x16_t b0 = vld1q_s8((const int8_t *) b_ptr->qs);
+                int8x16_t b1 = vld1q_s8((const int8_t *) b_ptr->qs + 16);
+                int8x16_t b2 = vld1q_s8((const int8_t *) b_ptr->qs + 32);
+                int8x16_t b3 = vld1q_s8((const int8_t *) b_ptr->qs + 48);
+                float16x4_t bd = vld1_f16((const __fp16 *) b_ptr->d);
+
+                int8x16_t a0 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs);
+                int8x16_t a1 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 1);
+                int8x16_t a2 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 2);
+                int8x16_t a3 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 3);
+                float16x4_t ad = vld1_dup_f16((const __fp16 *) &a_ptr->d);
+
+                int32x4_t ret0 = vdupq_n_s32(0);
+                int32x4_t ret1 = vdupq_n_s32(0);
+
+                ret0 = vdotq_s32(ret0, b0 << 4, a0);
+                ret1 = vdotq_s32(ret1, b1 << 4, a0);
+                ret0 = vdotq_s32(ret0, b2 << 4, a1);
+                ret1 = vdotq_s32(ret1, b3 << 4, a1);
+
+                ret0 = vdotq_s32(ret0, b0 & 0xf0U, a2);
+                ret1 = vdotq_s32(ret1, b1 & 0xf0U, a2);
+                ret0 = vdotq_s32(ret0, b2 & 0xf0U, a3);
+                ret1 = vdotq_s32(ret1, b3 & 0xf0U, a3);
+
+                int32x4_t ret = vpaddq_s32(ret0, ret1);
+
+                acc = vfmaq_f32(acc, vcvtq_n_f32_s32(ret, 4),
+                                vmulq_f32(vcvt_f32_f16(ad), vcvt_f32_f16(bd)));
+                a_ptr++;
+                b_ptr++;
+            }
+            vst1q_f32(s, acc);
+            s += ncols_interleaved;
+        }
         return;
     }
-#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
+#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
     float sumf[4];
     int sumi;
 
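The rewritten lm_ggml_gemv_q4_0_4x8_q8_0 above swaps hand-written NEON assembly for dot-product intrinsics. A much-reduced sketch of the core primitive it builds on (an illustration of vdotq_s32 accumulation under the same feature guard, not the package's kernel; the scalar fallback exists only so the sketch compiles off aarch64):

    #include <cstdint>
    #include <cstdio>

    #if defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
    #include <arm_neon.h>

    // Dot product of two 16-byte int8 vectors using the sdot-backed intrinsic.
    static int32_t dot_s8x16(const int8_t * a, const int8_t * b) {
        int8x16_t va  = vld1q_s8(a);
        int8x16_t vb  = vld1q_s8(b);
        int32x4_t acc = vdupq_n_s32(0);
        acc = vdotq_s32(acc, va, vb); // four partial sums of 4-element dot products
        return vaddvq_s32(acc);       // horizontal add down to a single int32
    }
    #else
    static int32_t dot_s8x16(const int8_t * a, const int8_t * b) {
        int32_t acc = 0;
        for (int i = 0; i < 16; i++) {
            acc += (int32_t) a[i] * b[i];
        }
        return acc;
    }
    #endif

    int main() {
        int8_t a[16], b[16];
        for (int i = 0; i < 16; i++) { a[i] = (int8_t) i; b[i] = 2; }
        std::printf("%d\n", dot_s8x16(a, b)); // 2 * (0 + 1 + ... + 15) = 240
        return 0;
    }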
package/cpp/ggml-cpu.c
CHANGED
@@ -985,7 +985,7 @@ inline static void __wasm_f16x4_store(lm_ggml_fp16_t * p, v128_t x) {
 #define LM_GGML_F16_STEP 32
 #define LM_GGML_F16_EPR 4
 
-static inline __m128 __sse_f16x4_load(lm_ggml_fp16_t *x) {
+static inline __m128 __sse_f16x4_load(const lm_ggml_fp16_t * x) {
     float tmp[4];
 
     tmp[0] = LM_GGML_FP16_TO_FP32(x[0]);
@@ -996,7 +996,7 @@ static inline __m128 __sse_f16x4_load(lm_ggml_fp16_t *x) {
     return _mm_loadu_ps(tmp);
 }
 
-static inline void __sse_f16x4_store(lm_ggml_fp16_t *x, __m128 y) {
+static inline void __sse_f16x4_store(lm_ggml_fp16_t * x, __m128 y) {
     float arr[4];
 
     _mm_storeu_ps(arr, y);
@@ -7418,14 +7418,14 @@ static void lm_ggml_compute_forward_mul_mat(
     if (src1_cont) {
         for (int64_t i13 = 0; i13 < ne13; i13++)
             for (int64_t i12 = 0; i12 < ne12; i12++)
-                if (!llamafile_sgemm(ne01, ne11, ne00/lm_ggml_blck_size(src0->type),
+                if (!llamafile_sgemm(params,
+                                     ne01, ne11, ne00/lm_ggml_blck_size(src0->type),
                                      (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
                                      nb01/lm_ggml_type_size(src0->type),
                                      (const char *)src1->data + i12*nb12 + i13*nb13,
                                      nb11/lm_ggml_type_size(src1->type),
                                      (char *)dst->data + i12*nb2 + i13*nb3,
                                      nb1/lm_ggml_type_size(dst->type),
-                                     ith, nth,
                                      src0->type,
                                      src1->type,
                                      dst->type))
@@ -7470,14 +7470,14 @@ UseGgmlGemm1:;
 
     for (int64_t i13 = 0; i13 < ne13; i13++)
         for (int64_t i12 = 0; i12 < ne12; i12++)
-            if (!llamafile_sgemm(ne01, ne11, ne00/lm_ggml_blck_size(src0->type),
+            if (!llamafile_sgemm(params,
+                                 ne01, ne11, ne00/lm_ggml_blck_size(src0->type),
                                  (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
                                  nb01/lm_ggml_type_size(src0->type),
                                  (const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size,
                                  row_size/lm_ggml_type_size(vec_dot_type),
                                  (char *)dst->data + i12*nb2 + i13*nb3,
                                  nb1/lm_ggml_type_size(dst->type),
-                                 ith, nth,
                                  src0->type,
                                  vec_dot_type,
                                  dst->type))
package/cpp/ggml-cpu.cpp
CHANGED
@@ -393,8 +393,11 @@ static bool lm_ggml_backend_cpu_device_supports_op(lm_ggml_backend_dev_t dev, co
     switch (op->op) {
         case LM_GGML_OP_CPY:
             return
+                op->type != LM_GGML_TYPE_IQ3_XXS &&
+                op->type != LM_GGML_TYPE_IQ3_S &&
                 op->type != LM_GGML_TYPE_IQ2_XXS &&
                 op->type != LM_GGML_TYPE_IQ2_XS &&
+                op->type != LM_GGML_TYPE_IQ2_S &&
                 op->type != LM_GGML_TYPE_IQ1_S &&
                 op->type != LM_GGML_TYPE_IQ1_M; // missing type_traits.from_float
         case LM_GGML_OP_MUL_MAT:
@@ -518,6 +521,12 @@ static lm_ggml_backend_feature * lm_ggml_backend_cpu_get_features(lm_ggml_backen
     if (lm_ggml_cpu_has_sve()) {
         features.push_back({ "SVE", "1" });
     }
+    if (lm_ggml_cpu_has_dotprod()) {
+        features.push_back({ "DOTPROD", "1" });
+    }
+    if (lm_ggml_cpu_has_matmul_int8()) {
+        features.push_back({ "MATMUL_INT8", "1" });
+    }
     if (lm_ggml_cpu_get_sve_cnt() > 0) {
         static std::string sve_cnt = std::to_string(lm_ggml_cpu_get_sve_cnt());
         features.push_back({ "SVE_CNT", sve_cnt.c_str() });
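The CPU backend's feature list now also reports DOTPROD and MATMUL_INT8. A hedged sketch of scanning such a name/value feature array (the pair layout follows the push_back calls above; the struct name, the terminating null entry and main() are assumptions for illustration, not the ggml API):

    #include <cstdio>
    #include <cstring>

    // Mirrors the { name, value } pairs pushed in the hunk above (hypothetical type).
    struct feature_pair {
        const char * name;
        const char * value;
    };

    // Scan a null-terminated feature array for a given capability.
    static bool has_feature(const feature_pair * features, const char * name) {
        for (const feature_pair * f = features; f && f->name; f++) {
            if (std::strcmp(f->name, name) == 0) {
                return true;
            }
        }
        return false;
    }

    int main() {
        const feature_pair features[] = {
            { "SVE", "1" }, { "DOTPROD", "1" }, { "MATMUL_INT8", "1" }, { nullptr, nullptr },
        };
        std::printf("DOTPROD: %d\n", has_feature(features, "DOTPROD") ? 1 : 0); // prints 1
        return 0;
    }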
package/cpp/ggml-impl.h
CHANGED
@@ -551,6 +551,22 @@ static inline lm_ggml_bf16_t lm_ggml_compute_fp32_to_bf16(float s) {
 #define LM_GGML_FP32_TO_BF16(x) lm_ggml_compute_fp32_to_bf16(x)
 #define LM_GGML_BF16_TO_FP32(x) lm_ggml_compute_bf16_to_fp32(x)
 
+// expose GGUF internals for test code
+
+LM_GGML_API size_t lm_gguf_type_size(enum lm_gguf_type type);
+
+LM_GGML_API struct lm_gguf_context * lm_gguf_init_from_file_impl(FILE * file, struct lm_gguf_init_params params);
+
+struct lm_gguf_buf {
+    void * data;
+    size_t size;
+    size_t offset;
+};
+LM_GGML_API struct lm_gguf_buf lm_gguf_buf_init(size_t size);
+LM_GGML_API void lm_gguf_buf_free(struct lm_gguf_buf buf);
+
+LM_GGML_API void lm_gguf_write_to_buf(const struct lm_gguf_context * ctx, struct lm_gguf_buf * buf, bool only_meta);
+
 #ifdef __cplusplus
 }
 #endif
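The block above exposes GGUF internals "for test code". A hypothetical C++ usage sketch based only on those declarations (it assumes an already-initialized lm_gguf_context, that the buffer grows as needed, and that buf.offset is the number of bytes written; none of that is stated in the hunk):

    #include "ggml-impl.h"

    // Hypothetical test-style helper: serialize GGUF metadata into a memory
    // buffer via the newly exposed internals, then release the buffer.
    static size_t gguf_meta_size(const struct lm_gguf_context * ctx) {
        struct lm_gguf_buf buf = lm_gguf_buf_init(16 * 1024); // initial capacity (assumed to grow)
        lm_gguf_write_to_buf(ctx, &buf, /*only_meta =*/ true);
        const size_t n = buf.offset;                          // bytes written (assumed semantics)
        lm_gguf_buf_free(buf);
        return n;
    }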