@fugood/llama.node 1.0.5 → 1.1.0
- package/CMakeLists.txt +3 -3
- package/lib/binding.js +7 -17
- package/lib/binding.ts +116 -32
- package/lib/index.js +7 -9
- package/lib/index.ts +34 -25
- package/package.json +17 -14
- package/src/LlamaCompletionWorker.cpp +5 -4
- package/src/LlamaContext.cpp +44 -11
- package/src/llama.cpp/common/arg.cpp +8 -1
- package/src/llama.cpp/common/common.h +4 -3
- package/src/llama.cpp/ggml/CMakeLists.txt +3 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +5 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +109 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +3 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +88 -10
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +0 -1
- package/src/llama.cpp/include/llama.h +2 -0
- package/src/llama.cpp/src/llama-arch.cpp +6 -6
- package/src/llama.cpp/src/llama-chat.cpp +3 -4
- package/src/llama.cpp/src/llama-context.cpp +49 -14
- package/src/llama.cpp/src/llama-context.h +13 -0
- package/src/llama.cpp/src/llama-memory-recurrent.cpp +15 -0
- package/src/llama.cpp/src/llama-model.cpp +19 -2
- package/src/tts_utils.cpp +12 -0
- package/src/tts_utils.h +40 -1
package/src/LlamaContext.cpp
CHANGED
@@ -247,6 +247,7 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
   params.cache_type_v = kv_cache_type_from_str(
       get_option<std::string>(options, "cache_type_v", "f16").c_str());
   params.ctx_shift = get_option<bool>(options, "ctx_shift", true);
+  params.kv_unified = get_option<bool>(options, "kv_unified", true);
 
   params.use_mlock = get_option<bool>(options, "use_mlock", false);
   params.use_mmap = get_option<bool>(options, "use_mmap", true);
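The new `kv_unified` option (default true) is read straight off the context options object. A minimal sketch of setting it from JavaScript, assuming the package's `loadModel` entry point and that these option keys map 1:1 to the `get_option` calls above:

import { loadModel } from '@fugood/llama.node'

// 'model.gguf' is a placeholder path; kv_unified toggles the unified KV cache.
const context = await loadModel({
  model: 'model.gguf',
  cache_type_v: 'f16',
  ctx_shift: true,
  kv_unified: true, // new in 1.1.0
  use_mlock: false,
  use_mmap: true,
})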
@@ -904,9 +905,27 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
   // guide_tokens
   std::vector<llama_token> guide_tokens;
   if (options.Has("guide_tokens")) {
-    auto
-
-
+    auto guide_tokens_value = options.Get("guide_tokens");
+    if (guide_tokens_value.IsArray()) {
+      auto guide_tokens_array = guide_tokens_value.As<Napi::Array>();
+      for (size_t i = 0; i < guide_tokens_array.Length(); i++) {
+        guide_tokens.push_back(guide_tokens_array.Get(i).ToNumber().Int32Value());
+      }
+    } else if (guide_tokens_value.IsTypedArray()) {
+      auto guide_tokens_typed_array = guide_tokens_value.As<Napi::TypedArray>();
+      if (guide_tokens_typed_array.TypedArrayType() == napi_int32_array) {
+        auto guide_tokens_int32_array = guide_tokens_value.As<Napi::Int32Array>();
+        size_t length = guide_tokens_int32_array.ElementLength();
+        const int32_t* data = guide_tokens_int32_array.Data();
+        guide_tokens.resize(length);
+        memcpy(guide_tokens.data(), data, length * sizeof(int32_t));
+      } else {
+        Napi::TypeError::New(env, "guide_tokens must be Array<number> or Int32Array").ThrowAsJavaScriptException();
+        return env.Undefined();
+      }
+    } else {
+      Napi::TypeError::New(env, "guide_tokens must be Array<number> or Int32Array").ThrowAsJavaScriptException();
+      return env.Undefined();
     }
   }
 
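Completion now accepts `guide_tokens` either as a plain `Array<number>` or as an `Int32Array`; the typed-array path is bulk-copied with a single memcpy instead of converting element by element. A hedged sketch of both call shapes (the `context.completion` call name and surrounding options are assumptions):

// Plain array: each element goes through ToNumber().Int32Value().
await context.completion({ prompt, guide_tokens: [1, 2, 3] })

// Int32Array: copied wholesale; other TypedArray kinds throw a TypeError.
await context.completion({ prompt, guide_tokens: new Int32Array([1, 2, 3]) })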
@@ -1291,14 +1310,16 @@ tts_type LlamaContext::getTTSType(Napi::Env env, nlohmann::json speaker) {
   return OUTETTS_V0_2;
 }
 
-// initVocoder(
+// initVocoder(params?: object): boolean
 Napi::Value LlamaContext::InitVocoder(const Napi::CallbackInfo &info) {
   Napi::Env env = info.Env();
   if (info.Length() < 1 || !info[0].IsObject()) {
-    Napi::TypeError::New(env, "Object is expected for vocoder
+    Napi::TypeError::New(env, "Object is expected for vocoder options")
         .ThrowAsJavaScriptException();
   }
-  auto
+  auto options = info[0].As<Napi::Object>();
+  auto vocoder_path = options.Get("path").ToString().Utf8Value();
+  auto n_batch = get_option<int32_t>(options, "n_batch", _sess->params().n_batch);
   if (vocoder_path.empty()) {
     Napi::TypeError::New(env, "vocoder path is required")
         .ThrowAsJavaScriptException();
@@ -1314,6 +1335,7 @@ Napi::Value LlamaContext::InitVocoder(const Napi::CallbackInfo &info) {
   _vocoder.params.model.path = vocoder_path;
   _vocoder.params.embedding = true;
   _vocoder.params.ctx_shift = false;
+  _vocoder.params.n_batch = n_batch;
   _vocoder.params.n_ubatch = _vocoder.params.n_batch;
   common_init_result result = common_init_from_params(_vocoder.params);
   if (result.model == nullptr || result.context == nullptr) {
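initVocoder now documents its signature and reads an optional `n_batch`, defaulting to the main session's batch size instead of a fixed value. A sketch, assuming the method is exposed on the context under that name:

// 'path' and 'n_batch' are the option keys InitVocoder reads above.
const ok = context.initVocoder({
  path: 'vocoder.gguf', // placeholder path; required
  n_batch: 512,         // optional; falls back to _sess->params().n_batch
})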
@@ -1342,7 +1364,7 @@ Napi::Value LlamaContext::IsVocoderEnabled(const Napi::CallbackInfo &info) {
   return Napi::Boolean::New(env, _has_vocoder);
 }
 
-// getFormattedAudioCompletion(speaker: string|null, text: string):
+// getFormattedAudioCompletion(speaker: string|null, text: string): object
 Napi::Value
 LlamaContext::GetFormattedAudioCompletion(const Napi::CallbackInfo &info) {
   Napi::Env env = info.Env();
@@ -1369,9 +1391,16 @@ LlamaContext::GetFormattedAudioCompletion(const Napi::CallbackInfo &info) {
     audio_text = audio_text_from_speaker(speaker, type);
     audio_data = audio_data_from_speaker(speaker, type);
   }
-
-
-
+  std::string prompt = "<|im_start|>\n" + audio_text +
+                       process_text(text, type) +
+                       "<|text_end|>\n" + audio_data + "\n";
+  Napi::Object result = Napi::Object::New(env);
+  result.Set("prompt", prompt);
+  const char *grammar = get_tts_grammar(type);
+  if (grammar != nullptr) {
+    result.Set("grammar", grammar);
+  }
+  return result;
 }
 
 // getAudioCompletionGuideTokens(text: string): Int32Array
@@ -1412,6 +1441,10 @@ LlamaContext::GetAudioCompletionGuideTokens(const Napi::CallbackInfo &info) {
   if (tmp.size() > 0) {
     result.push_back(tmp[0]);
   }
+
+  // Add Audio End, forcing stop generation
+  result.push_back(common_tokenize(vocab, "<|audio_end|>", false, true)[0]);
+
   auto tokens = Napi::Int32Array::New(env, result.size());
   memcpy(tokens.Data(), result.data(), result.size() * sizeof(int32_t));
   return tokens;
@@ -1446,7 +1479,7 @@ Napi::Value LlamaContext::DecodeAudioTokens(const Napi::CallbackInfo &info) {
         .ThrowAsJavaScriptException();
     return env.Undefined();
   }
-  if (type ==
+  if (type == OUTETTS_V0_1 || type == OUTETTS_V0_2 || type == OUTETTS_V0_3) {
     tokens.erase(
         std::remove_if(tokens.begin(), tokens.end(),
                        [](llama_token t) { return t < 151672 || t > 155772; }),
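Together these changes reshape the TTS flow: getFormattedAudioCompletion returns `{ prompt, grammar? }` instead of a bare string, the guide tokens now end with `<|audio_end|>` so generation stops on its own, and decodeAudioTokens filters OuteTTS v0.x tokens to the 151672..155772 range before decoding. A hedged end-to-end sketch (method names are taken from the signature comments in this file; the shape of the completion result is an assumption):

const { prompt, grammar } = context.getFormattedAudioCompletion(null, 'Hello world')
const guide_tokens = context.getAudioCompletionGuideTokens('Hello world') // Int32Array

const result = await context.completion({ prompt, grammar, guide_tokens })

// 'tokens' as the name of the generated-token field is an assumption.
const audio = context.decodeAudioTokens(result.tokens)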
package/src/llama.cpp/common/arg.cpp
CHANGED

@@ -1612,7 +1612,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.antiprompt.emplace_back(value);
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"-sp", "--special"},
         string_format("special tokens output enabled (default: %s)", params.special ? "true" : "false"),
@@ -2655,6 +2655,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.i_chunk = value;
         }
     ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
+    add_opt(common_arg(
+        {"--show-statistics"},
+        string_format("show imatrix statistics and then exit (default: %s)", params.show_statistics ? "true" : "false"),
+        [](common_params & params) {
+            params.show_statistics = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
     add_opt(common_arg(
         {"--parse-special"},
         string_format("prase special tokens (chat, tool, etc) (default: %s)", params.parse_special ? "true" : "false"),
package/src/llama.cpp/common/common.h
CHANGED

@@ -433,9 +433,10 @@ struct common_params {
     int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
     int32_t i_chunk = 0; // start processing from this chunk
 
-    bool process_output
-    bool compute_ppl
-    bool
+    bool process_output = false; // collect data for the output tensor
+    bool compute_ppl = true; // whether to compute perplexity
+    bool show_statistics = false; // show imatrix statistics per tensor
+    bool parse_special = false; // whether to parse special tokens during imatrix tokenization
 
     // cvector-generator params
     int n_pca_batch = 100;
package/src/llama.cpp/ggml/CMakeLists.txt
CHANGED

@@ -131,7 +131,7 @@ option(GGML_RVV "ggml: enable rvv" ON)
 option(GGML_RV_ZFH "ggml: enable riscv zfh" OFF)
 option(GGML_XTHEADVECTOR "ggml: enable xtheadvector" OFF)
 option(GGML_VXE "ggml: enable vxe" ON)
-option(GGML_NNPA "ggml: enable nnpa"
+option(GGML_NNPA "ggml: enable nnpa" OFF) # temp disabled by default, see: https://github.com/ggml-org/llama.cpp/issues/14877
 
 option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
 set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
@@ -174,6 +174,8 @@ option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental,
 option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON)
 option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF)
 option(GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 "ggml: enable rocWMMA FlashAttention on GFX12" OFF)
+option(GGML_MUSA_GRAPHS "ggml: use MUSA graph, experimental, unstable" OFF)
+option(GGML_MUSA_MUDNN_COPY "ggml: enable muDNN for accelerated copy" OFF)
 option(GGML_VULKAN "ggml: use Vulkan" OFF)
 option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
 option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF)
package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
CHANGED

@@ -70,10 +70,12 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
     if (GGML_OPENMP)
         find_package(OpenMP)
         if (OpenMP_FOUND)
+            set(GGML_OPENMP_ENABLED "ON" CACHE INTERNAL "")
             target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_OPENMP)
 
             target_link_libraries(${GGML_CPU_NAME} PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
         else()
+            set(GGML_OPENMP_ENABLED "OFF" CACHE INTERNAL "")
             message(WARNING "OpenMP not found")
         endif()
     endif()
@@ -456,6 +458,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
             list(APPEND ARCH_FLAGS -march=z16)
         elseif (${S390X_M} MATCHES "9175|9176")
             # NOTE: Only available from GCC 15.1.0 onwards. Any z17 machine with compile issues must first verify their GCC version.
+            # binutils must also be updated to the latest for the -march=z17 flag to work. Otherwise, use -march=arch15.
            message(STATUS "z17 target")
            list(APPEND ARCH_FLAGS -march=z17)
        else()
@@ -494,9 +497,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
 
     # Fetch KleidiAI sources:
     include(FetchContent)
-    set(KLEIDIAI_COMMIT_TAG "v1.
+    set(KLEIDIAI_COMMIT_TAG "v1.11.0")
     set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
-    set(KLEIDIAI_ARCHIVE_MD5 "
+    set(KLEIDIAI_ARCHIVE_MD5 "3fe9e5ab964c375c53839296eb71eaa2")
 
     if (POLICY CMP0135)
         cmake_policy(SET CMP0135 NEW)
package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c
CHANGED

@@ -544,7 +544,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
     __m128 max4 = __lsx_vfmax_s( lasx_extractf128( max_abs, 1 ), lasx_extractf128( max_abs, 0) );
     max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vpickod_d((__m128i) max4, (__m128i)max4 ) );
     __m128 tmp = max4;
-    max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vextrins_w((__m128i)tmp, (__m128i)max4,
+    max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vextrins_w((__m128i)tmp, (__m128i)max4, 0x1 ));
     const float max_scalar = ((v4f32)max4)[0];
 
     // Quantize these floats
package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp
CHANGED

@@ -22,9 +22,94 @@
 
 #include "kai_common.h"
 
+#include "simd-mappings.h"
+
 #include "kernels.h"
 
 #define NELEMS(x) sizeof(x) / sizeof(*x)
+
+static const size_t INT4_PER_BYTE = 2;
+static const size_t INT4_BITS = 4;
+static const int Q4_0_ZERO_POINT = 8;
+const size_t INT4_PER_UINT16 = 4;
+
+static void dequantize_row_qsi4c32pscalef16(
+    const void *packed_data,
+    int32_t row_idx,
+    int64_t nc,
+    float *out,
+    size_t nr_pack,
+    size_t packed_row_stride,
+    size_t kr,
+    size_t bl,
+    size_t num_bytes_multiplier
+) {
+    size_t group_idx = row_idx / nr_pack;
+    size_t row_in_group = row_idx % nr_pack;
+    const uint8_t *packed_group = (const uint8_t *)packed_data + group_idx * packed_row_stride;
+    size_t num_blocks = nc / bl;
+    const uint8_t *block_ptr = packed_group;
+
+    for (size_t b = 0; b < num_blocks; ++b) {
+        uint16_t scale_f16 = *((const uint16_t *)(block_ptr + row_in_group * num_bytes_multiplier));
+        float scale = GGML_CPU_FP16_TO_FP32(scale_f16);
+
+        const uint8_t *segment_ptr = block_ptr + nr_pack * num_bytes_multiplier;
+        size_t num_segments = bl / kr;
+        size_t num_bytes_per_segment = kr / INT4_PER_BYTE;
+
+        for (size_t s = 0; s < num_segments; ++s) {
+            const uint8_t *seg_base = segment_ptr + s * nr_pack * num_bytes_per_segment;
+            const uint8_t *qbytes = seg_base + row_in_group * num_bytes_per_segment;
+            for (size_t k = 0; k < num_bytes_per_segment; ++k) {
+                uint8_t byte = qbytes[k] ^ 0x88;
+                int x0 = (byte & 0x0F) - Q4_0_ZERO_POINT;
+                int x1 = (byte >> INT4_BITS) - Q4_0_ZERO_POINT;
+                out[b * bl + s * num_bytes_per_segment + k] = x0 * scale;
+                out[b * bl + s * num_bytes_per_segment + k + bl/2] = x1 * scale;
+            }
+        }
+        block_ptr += nr_pack * num_bytes_multiplier + num_segments * nr_pack * num_bytes_per_segment;
+    }
+}
+
+static void dequantize_row_qsi4c32ps1s0scalef16(
+    const void *packed_data,
+    int32_t row_idx,
+    int64_t k,
+    float *out,
+    size_t nr,
+    size_t packed_row_stride,
+    size_t kr,
+    size_t bl,
+    size_t num_bytes_multiplier
+) {
+    const size_t num_blocks = k / bl;
+    const size_t bl4 = bl / INT4_PER_UINT16;
+
+    size_t group_idx = row_idx / nr;
+    size_t row_in_group = row_idx % nr;
+
+    const uint8_t *packed_group = (const uint8_t *)packed_data + group_idx * packed_row_stride;
+    const uint16_t *qdata = (const uint16_t *)packed_group;
+    const uint16_t *scales = (const uint16_t *)(packed_group + packed_row_stride - (nr * num_blocks * num_bytes_multiplier));
+
+    for (size_t block_idx = 0; block_idx < num_blocks; ++block_idx) {
+        uint16_t scale_f16 = scales[row_in_group + block_idx * nr];
+        float scale = GGML_CPU_FP16_TO_FP32(scale_f16);
+
+        for (size_t bl4_idx = 0; bl4_idx < bl4; ++bl4_idx) {
+            uint16_t q = qdata[(block_idx * bl4 + bl4_idx) * nr + row_in_group];
+
+            for (size_t qidx = 0; qidx < INT4_PER_UINT16; ++qidx) {
+                int v = ((q >> (qidx * 4)) & 0xF) - Q4_0_ZERO_POINT;
+                out[block_idx * bl + bl4_idx * INT4_BITS + qidx] = v * scale;
+            }
+        }
+    }
+    GGML_UNUSED(kr);
+}
+
 static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
 #if defined(__ARM_FEATURE_SME)
     {
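The two dequantize_row_* helpers added above undo KleidiAI's RHS packing so Q4_0 rows can be read back as floats (they back the new `to_float` slot and the GET_ROWS path later in this diff). The core nibble decode is small enough to restate; a TypeScript sketch of the same arithmetic on one packed byte:

// Mirrors the inner loop of dequantize_row_qsi4c32pscalef16: XOR 0x88
// undoes the offset encoding, then each nibble is re-centred on the
// Q4_0 zero point (8) and multiplied by the block's f16 scale.
function decodeQ4Byte(raw: number, scale: number): [lo: number, hi: number] {
  const b = raw ^ 0x88
  const lo = ((b & 0x0f) - 8) * scale
  const hi = ((b >> 4) - 8) * scale
  return [lo, hi]
}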
@@ -63,8 +148,10 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
             /* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32_neon,
         },
         /* .rhs_info = */ {
-            /* .packed_size
-            /* .
+            /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon,
+            /* .packed_stride = */ kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon,
+            /* .pack_func = */ kai_run_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon,
+            /* .to_float = */ dequantize_row_qsi4c32ps1s0scalef16,
         },
         /* .required_cpu = */ CPU_FEATURE_SME,
         /* .lhs_type = */ GGML_TYPE_F32,
@@ -107,8 +194,10 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
             /* .pack_func = */ kai_run_lhs_pack_bf16p2vlx2_f32_sme,
         },
         /* .rhs_info = */ {
-            /* .packed_size
-            /* .
+            /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme,
+            /* .packed_stride = */ NULL,
+            /* .pack_func = */ kai_run_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme,
+            /* .to_float = */ NULL,
         },
         /* .required_cpu = */ CPU_FEATURE_SME,
         /* .lhs_type = */ GGML_TYPE_F32,
@@ -154,8 +243,10 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
             /* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32,
         },
         /* .rhs_info = */ {
-            /* .packed_size
-            /* .
+            /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
+            /* .packed_stride = */ kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
+            /* .pack_func = */ kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
+            /* .to_float = */ dequantize_row_qsi4c32pscalef16,
         },
         /* .required_cpu = */ CPU_FEATURE_DOTPROD,
         /* .lhs_type = */ GGML_TYPE_F32,
@@ -200,8 +291,10 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
             /* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32,
         },
         /* .rhs_info = */ {
-            /* .packed_size
-            /* .
+            /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
+            /* .packed_stride = */ kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
+            /* .pack_func = */ kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
+            /* .to_float = */ dequantize_row_qsi4c32pscalef16,
         },
         /* .required_cpu = */ CPU_FEATURE_DOTPROD | CPU_FEATURE_I8MM,
         /* .lhs_type = */ GGML_TYPE_F32,
@@ -247,8 +340,10 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
             /* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32,
         },
         /* .rhs_info = */ {
-            /* .packed_size
-            /* .
+            /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
+            /* .packed_stride = */ kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
+            /* .pack_func = */ kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
+            /* .to_float = */ dequantize_row_qsi4c32pscalef16,
         },
         /* .required_cpu = */ CPU_FEATURE_DOTPROD | CPU_FEATURE_I8MM,
         /* .lhs_type = */ GGML_TYPE_F32,
@@ -293,8 +388,10 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
             /* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32,
         },
         /* .rhs_info = */ {
-            /* .packed_size
-            /* .
+            /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
+            /* .packed_stride = */ kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
+            /* .pack_func = */ kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
+            /* .to_float = */ dequantize_row_qsi4c32pscalef16,
         },
         /* .required_cpu = */ CPU_FEATURE_DOTPROD,
         /* .lhs_type = */ GGML_TYPE_F32,
package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h
CHANGED

@@ -71,12 +71,15 @@ struct rhs_packing_info {
         std::function<size_t(size_t n, size_t k, size_t nr, size_t kr, size_t bl)>,
         std::function<size_t(size_t n, size_t k)>
     > packed_size;
+    size_t (*packed_stride)(size_t k, size_t nr, size_t kr, size_t bl);
     std::variant<
         std::function<void(size_t num_groups, size_t n, size_t k, size_t nr, size_t kr, size_t sr, size_t bl, const uint8_t* rhs,
                            const float* bias, void* rhs_packed, size_t extra_bytes, const struct kai_rhs_pack_qs4cxs1s0_param* params)>,
         std::function<void(size_t num_groups, size_t n, size_t k, size_t nr, size_t kr, size_t sr, size_t rhs_stride, const void* rhs,
                            const void* bias, const void* scale, void* rhs_packed, size_t extra_bytes, const void* params)>
     > pack_func;
+    void (*to_float)(const void *packed_data, int32_t row_idx, int64_t nc, float *out, size_t nr_pack, size_t packed_row_stride,
+                     size_t kr, size_t bl, size_t num_bytes_multiplier);
 };
 
 struct ggml_kleidiai_kernels {
package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp
CHANGED

@@ -40,6 +40,17 @@ struct ggml_kleidiai_context {
     ggml_kleidiai_kernels * kernels;
 } static ctx = { CPU_FEATURE_NONE, NULL };
 
+static const char* cpu_feature_to_string(cpu_feature f) {
+    switch (f) {
+        case CPU_FEATURE_NONE: return "NONE";
+        case CPU_FEATURE_DOTPROD: return "DOTPROD";
+        case CPU_FEATURE_I8MM: return "I8MM";
+        case CPU_FEATURE_SVE: return "SVE";
+        case CPU_FEATURE_SME: return "SME";
+        default: return "UNKNOWN";
+    }
+}
+
 static void init_kleidiai_context(void) {
 
     ggml_critical_section_start();
@@ -62,6 +73,11 @@ static void init_kleidiai_context(void) {
         ctx.features |= ggml_cpu_has_sme() ? CPU_FEATURE_SME : CPU_FEATURE_NONE;
         }
         ctx.kernels = ggml_kleidiai_select_kernels_q4_0(ctx.features);
+#ifndef NDEBUG
+        if (ctx.kernels) {
+            GGML_LOG_DEBUG("kleidiai: using kernel with CPU feature %s\n", cpu_feature_to_string(ctx.kernels->required_cpu));
+        }
+#endif
     }
     ggml_critical_section_end();
 }
@@ -102,6 +118,9 @@ static void transpose_f32kxn_f16nxk(size_t n, size_t k, float * dst, const uint1
 
 class tensor_traits : public ggml::cpu::tensor_traits {
     bool work_size(int /* n_threads */, const struct ggml_tensor * op, size_t & size) override {
+        if (op->op != GGML_OP_MUL_MAT) {
+            return false;
+        }
         ggml_kleidiai_kernels *kernels = ggml_kleidiai_select_kernels(ctx.features, op);
         GGML_ASSERT(kernels);
         kernel_info * kernel = op->src[1]->ne[1] == 1 ? &kernels->gemv : &kernels->gemm;
@@ -135,6 +154,10 @@ class tensor_traits : public ggml::cpu::tensor_traits {
             } else if (dst->src[0]->type == GGML_TYPE_F16) {
                 return compute_forward_kv_cache(params, dst);
             }
+        } else if (dst->op == GGML_OP_GET_ROWS) {
+            if (dst->src[0]->type == GGML_TYPE_Q4_0) {
+                return compute_forward_get_rows(params, dst);
+            }
         }
         return false;
     }
@@ -270,6 +293,8 @@ class tensor_traits : public ggml::cpu::tensor_traits {
     }
 
     bool compute_forward_q4_0(struct ggml_compute_params * params, struct ggml_tensor * dst) {
+        GGML_ASSERT(dst->src[0]->type == GGML_TYPE_Q4_0);
+
         const ggml_tensor * src0 = dst->src[0];
         const ggml_tensor * src1 = dst->src[1];
 
@@ -342,8 +367,49 @@ class tensor_traits : public ggml::cpu::tensor_traits {
         return true;
     }
 
+    bool compute_forward_get_rows(struct ggml_compute_params * params, struct ggml_tensor * dst) {
+        GGML_ASSERT(dst->src[0]->type == GGML_TYPE_Q4_0);
+        GGML_ASSERT(ctx.kernels);
+
+        const ggml_tensor * src0 = dst->src[0];
+        const ggml_tensor * src1 = dst->src[1];
+
+        GGML_TENSOR_BINARY_OP_LOCALS
+
+        rhs_packing_info * rhs_info = &ctx.kernels->rhs_info;
+        kernel_info * kernel = &ctx.kernels->gemm;
+
+        const int64_t nc = ne00;
+        const int64_t nr = ggml_nelements(src1);
+
+        const size_t block_rows = kernel->get_nr();
+        const size_t kr = kernel->get_kr();
+
+        const size_t num_bytes_multiplier = sizeof(uint16_t);
+        const size_t packed_stride = rhs_info->packed_stride(nc, block_rows, kr, QK4_0);
+
+        const int ith = params->ith;
+        const int nth = params->nth;
+
+        const int dr = (nr + nth - 1) / nth;
+        const int ir0 = dr * ith;
+        const int ir1 = MIN(ir0 + dr, nr);
+
+        for (int64_t i = ir0; i < ir1; ++i) {
+            GGML_ASSERT(src1->type == GGML_TYPE_I32);
+            int64_t row_idx = ((const int32_t *)src1->data)[i];
+            GGML_ASSERT(row_idx >= 0 && row_idx < src0->ne[1]);
+
+            float *out = (float *)((char *)dst->data + i * nb1);
+            rhs_info->to_float(src0->data, row_idx, nc, out, block_rows, packed_stride, kr, QK4_0, num_bytes_multiplier);
+        }
+
+        return true;
+    }
+
 public:
     int repack(struct ggml_tensor * tensor, const void * data, size_t data_size) {
+        GGML_ASSERT(tensor->type == GGML_TYPE_Q4_0);
         GGML_ASSERT(ctx.kernels);
         const size_t n = tensor->ne[1];
         const size_t k = tensor->ne[0];
@@ -351,17 +417,12 @@ public:
         size_t kr = ctx.kernels->gemm.get_kr();
         size_t sr = ctx.kernels->gemm.get_sr();
 
-#ifndef NDEBUG
-        const size_t repacked_size = variant_call<size_t>(ctx.kernels->rhs_info.packed_size, n, k, nr, kr, QK4_0);
-        GGML_ASSERT(repacked_size <= data_size && "repacked size larger than the packed size!");
-#endif
         struct kai_rhs_pack_qs4cxs1s0_param params;
         params.lhs_zero_point = 1;
         params.rhs_zero_point = 8;
         variant_call<void>(ctx.kernels->rhs_info.pack_func, 1, n, k, nr, kr, sr, QK4_0, (const uint8_t*)data, nullptr, tensor->data, 0, &params);
 
         return 0;
-
         GGML_UNUSED(data_size);
     }
 };
@@ -375,8 +436,8 @@ static ggml::cpu::tensor_traits * get_tensor_traits(ggml_backend_buffer_t, struc
 static enum ggml_status ggml_backend_cpu_kleidiai_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
     tensor->extra = (void *) ggml::cpu::kleidiai::get_tensor_traits(buffer, tensor);
 
-    GGML_UNUSED(buffer);
     return GGML_STATUS_SUCCESS;
+    GGML_UNUSED(buffer);
 }
 
 static void ggml_backend_cpu_kleidiai_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
@@ -418,18 +479,35 @@ static size_t ggml_backend_cpu_kleidiai_buffer_type_get_alignment(ggml_backend_b
     GGML_UNUSED(buft);
 }
 
+static size_t ggml_backend_cpu_kleidiai_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor) {
+    GGML_ASSERT(tensor->type == GGML_TYPE_Q4_0);
+    GGML_ASSERT(ctx.kernels);
+
+    const size_t n = tensor->ne[1];
+    const size_t k = tensor->ne[0];
+    const size_t nr = ctx.kernels->gemm.get_nr();
+    const size_t kr = ctx.kernels->gemm.get_kr();
+
+    return variant_call<size_t>(ctx.kernels->rhs_info.packed_size, n, k, nr, kr, QK4_0);
+
+    GGML_UNUSED(buft);
+}
+
 namespace ggml::cpu::kleidiai {
 class extra_buffer_type : ggml::cpu::extra_buffer_type {
     bool supports_op(ggml_backend_dev_t, const struct ggml_tensor * op) override {
-        if (op->op == GGML_OP_MUL_MAT &&
+        if ((op->op == GGML_OP_MUL_MAT || op->op == GGML_OP_GET_ROWS) &&
             op->src[0]->type == GGML_TYPE_Q4_0 &&
             op->src[0]->buffer &&
             (ggml_n_dims(op->src[0]) == 2) &&
             op->src[0]->buffer->buft == ggml_backend_cpu_kleidiai_buffer_type() && ctx.kernels) {
+            if (op->op == GGML_OP_GET_ROWS && op->src[1]->ne[0] != 8) {
+                return false;
+            }
             if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
                 return false;
             }
-            if (op->src[1]->type == GGML_TYPE_F32 &&
+            if ((op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == GGML_TYPE_I32) &&
                 ggml_ne(op->src[1], 2) == 1 && ggml_ne(op->src[1], 3) == 1) {
                 return true;
             }
@@ -438,7 +516,7 @@ class extra_buffer_type : ggml::cpu::extra_buffer_type {
     }
 
     ggml::cpu::tensor_traits * get_tensor_traits(const struct ggml_tensor * op) override {
-        if (op->op == GGML_OP_MUL_MAT) {
+        if (op->op == GGML_OP_MUL_MAT || op->op == GGML_OP_GET_ROWS) {
             if (op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_cpu_kleidiai_buffer_type()) {
                 return (ggml::cpu::tensor_traits *) op->src[0]->extra;
             }
@@ -469,7 +547,7 @@ ggml_backend_buffer_type_t ggml_backend_cpu_kleidiai_buffer_type(void) {
             /* .alloc_buffer = */ ggml_backend_cpu_kleidiai_buffer_type_alloc_buffer,
             /* .get_alignment = */ ggml_backend_cpu_kleidiai_buffer_type_get_alignment,
             /* .get_max_size = */ nullptr, // defaults to SIZE_MAX
-            /* .get_alloc_size = */
+            /* .get_alloc_size = */ ggml_backend_cpu_kleidiai_buffer_type_get_alloc_size,
             /* .is_host = */ nullptr,
         },
         /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
package/src/llama.cpp/include/llama.h
CHANGED

@@ -956,6 +956,7 @@ extern "C" {
     // in the order they have appeared in the batch.
     // Rows: number of tokens for which llama_batch.logits[i] != 0
     // Cols: n_vocab
+    // TODO: deprecate in favor of llama_get_logits_ith() (ref: https://github.com/ggml-org/llama.cpp/pull/14853#issuecomment-3113143522)
     LLAMA_API float * llama_get_logits(struct llama_context * ctx);
 
     // Logits for the ith token. For positive indices, Equivalent to:
@@ -970,6 +971,7 @@ extern "C" {
     // in the order they have appeared in the batch.
     // shape: [n_outputs*n_embd]
     // Otherwise, returns NULL.
+    // TODO: deprecate in favor of llama_get_embeddings_ith() (ref: https://github.com/ggml-org/llama.cpp/pull/14853#issuecomment-3113143522)
     LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
 
     // Get the embeddings for the ith token. For positive indices, Equivalent to:
package/src/llama.cpp/src/llama-arch.cpp
CHANGED

@@ -1933,12 +1933,6 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
         }
     },
-    {
-        LLM_ARCH_UNKNOWN,
-        {
-            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
-        },
-    },
     {
         LLM_ARCH_DREAM,
         {
@@ -1956,6 +1950,12 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_UNKNOWN,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+        },
+    },
 };
 
 static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
package/src/llama.cpp/src/llama-chat.cpp
CHANGED

@@ -718,10 +718,9 @@ int32_t llm_chat_apply_template(
         }
 
         ss << message->content << "<|im_end|>";
-
-
-
-        }
+        }
+        if (add_ass) {
+            ss << "<|im_assistant|>assistant<|im_middle|>";
         }
     } else {
         // template not supported