@fugood/llama.node 1.0.6 → 1.1.1
This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- package/CMakeLists.txt +3 -3
- package/lib/binding.ts +117 -32
- package/lib/index.js +7 -9
- package/lib/index.ts +34 -25
- package/package.json +17 -14
- package/src/LlamaCompletionWorker.cpp +24 -6
- package/src/LlamaContext.cpp +38 -8
- package/src/llama.cpp/common/arg.cpp +8 -1
- package/src/llama.cpp/common/common.h +4 -3
- package/src/llama.cpp/ggml/CMakeLists.txt +3 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +5 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +109 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +3 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +88 -10
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +0 -1
- package/src/llama.cpp/include/llama.h +2 -0
- package/src/llama.cpp/src/llama-arch.cpp +6 -6
- package/src/llama.cpp/src/llama-chat.cpp +3 -4
- package/src/llama.cpp/src/llama-context.cpp +49 -14
- package/src/llama.cpp/src/llama-context.h +13 -0
- package/src/llama.cpp/src/llama-memory-recurrent.cpp +15 -0
- package/src/llama.cpp/src/llama-model.cpp +19 -2
- package/src/tts_utils.cpp +12 -0
- package/src/tts_utils.h +40 -1
package/src/llama.cpp/src/llama-context.cpp
CHANGED

@@ -105,7 +105,7 @@ llama_context::llama_context(

 {
     const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
-
+    supports_set_rows = LLAMA_SET_ROWS ? (atoi(LLAMA_SET_ROWS) != 0) : false;

     if (!supports_set_rows && !cparams.kv_unified) {
         LLAMA_LOG_WARN("%s: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache\n", __func__);
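The replacement line is the usual getenv/atoi idiom for boolean env flags: unset falls back to the default (false here), "0" means off, and any other integer means on. A minimal self-contained sketch of that idiom (the env_flag helper is invented for illustration):

```cpp
#include <cstdio>
#include <cstdlib>

// Invented helper mirroring the pattern above: unset -> default,
// "0" -> false, any other integer -> true.
static bool env_flag(const char * name, bool def) {
    const char * v = std::getenv(name);
    return v ? (std::atoi(v) != 0) : def;
}

int main() {
    const bool supports_set_rows = env_flag("LLAMA_SET_ROWS", false);
    std::printf("supports_set_rows = %d\n", supports_set_rows);
}
```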
@@ -508,12 +508,16 @@ enum llama_pooling_type llama_context::pooling_type() const {
 }

 float * llama_context::get_logits() {
+    output_reorder();
+
     return logits;
 }

 float * llama_context::get_logits_ith(int32_t i) {
     int64_t j = -1;

+    output_reorder();
+
     try {
         if (logits == nullptr) {
             throw std::runtime_error("no logits");
@@ -550,12 +554,16 @@ float * llama_context::get_logits_ith(int32_t i) {
 }

 float * llama_context::get_embeddings() {
+    output_reorder();
+
     return embd;
 }

 float * llama_context::get_embeddings_ith(int32_t i) {
     int64_t j = -1;

+    output_reorder();
+
     try {
         if (embd == nullptr) {
             throw std::runtime_error("no embeddings");
@@ -891,6 +899,12 @@ int llama_context::encode(const llama_batch & batch_inp) {
         }
     }

+    if (!supports_set_rows) {
+        // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
+        // overlap with device computation.
+        ggml_backend_sched_reset(sched.get());
+    }
+
     // TODO: hacky solution
     if (model.arch == LLM_ARCH_T5 && t_embd) {
         //cross.t_embd = t_embd;
@@ -970,6 +984,7 @@ int llama_context::decode(const llama_batch & batch_inp) {

     // TODO: this clear of the buffer can easily be forgotten - need something better
     embd_seq.clear();
+    output_swaps.clear();

     bool did_optimize = false;

@@ -1189,9 +1204,6 @@ int llama_context::decode(const llama_batch & batch_inp) {
         // make the outputs have the same order they had in the user-provided batch
         // note: this is mostly relevant for recurrent models atm
         if (!sorted_output) {
-            const uint32_t n_vocab = model.vocab.n_tokens();
-            const uint64_t n_embd = model.hparams.n_embd;
-
             GGML_ASSERT((size_t) n_outputs == out_ids.size());

             // TODO: is there something more efficient which also minimizes swaps?
@@ -1207,16 +1219,9 @@ int llama_context::decode(const llama_batch & batch_inp) {
                     continue;
                 }
                 std::swap(out_ids[i], out_ids[j_min]);
-                if (logits_size > 0) {
-                    for (uint32_t k = 0; k < n_vocab; k++) {
-                        std::swap(logits[i*n_vocab + k], logits[j_min*n_vocab + k]);
-                    }
-                }
-                if (embd_size > 0) {
-                    for (uint32_t k = 0; k < n_embd; k++) {
-                        std::swap(embd[i*n_embd + k], embd[j_min*n_embd + k]);
-                    }
-                }
+
+                // remember the swaps and apply them lazily upon logits/embeddings access
+                output_swaps.push_back({ i, j_min });
             }

             std::fill(output_ids.begin(), output_ids.end(), -1);
@@ -1230,6 +1235,12 @@ int llama_context::decode(const llama_batch & batch_inp) {
     // wait for the computation to finish (automatically done when obtaining the model output)
     //synchronize();

+    if (!supports_set_rows) {
+        // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
+        // overlap with device computation.
+        ggml_backend_sched_reset(sched.get());
+    }
+
     return 0;
 }

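Note the ordering in both the encode() and decode() hunks: the scheduler reset now happens before the backend sync rather than after it, so the CPU-side bookkeeping overlaps with work still running on the device. A toy illustration of that overlap pattern, with std::async standing in for the device queue (all names and timings invented):

```cpp
#include <chrono>
#include <cstdio>
#include <future>
#include <thread>

// Toy illustration of the reset-before-sync ordering: start the "device"
// work asynchronously, do the CPU-only cleanup while it runs, and block
// on the result only at the end.
int main() {
    using namespace std::chrono_literals;

    auto device = std::async(std::launch::async, [] {
        std::this_thread::sleep_for(50ms); // stand-in for graph compute
        return 42;
    });

    std::this_thread::sleep_for(10ms);     // stand-in for the sched reset
    std::puts("reset finished while the device is still busy");

    std::printf("device result: %d\n", device.get()); // the actual sync
}
```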
@@ -1307,6 +1318,30 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
     return n_outputs_max;
 }

+void llama_context::output_reorder() {
+    const uint32_t n_vocab = model.vocab.n_tokens();
+    const uint64_t n_embd = model.hparams.n_embd;
+
+    for (uint32_t s = 0; s < output_swaps.size(); ++s) {
+        const uint32_t i0 = output_swaps[s].i0;
+        const uint32_t i1 = output_swaps[s].i1;
+
+        if (logits_size > 0) {
+            for (uint32_t k = 0; k < n_vocab; k++) {
+                std::swap(logits[i0*n_vocab + k], logits[i1*n_vocab + k]);
+            }
+        }
+
+        if (embd_size > 0) {
+            for (uint32_t k = 0; k < n_embd; k++) {
+                std::swap(embd[i0*n_embd + k], embd[i1*n_embd + k]);
+            }
+        }
+    }
+
+    output_swaps.clear();
+}
+
 //
 // graph
 //
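Taken together with the decode() hunks above, the reorder is now deferred: decode() only records swap pairs in output_swaps, and output_reorder() replays them once when logits or embeddings are first accessed. A compact, self-contained model of that deferred-reorder pattern (the buffers and the outputs struct are invented; only the shape of the trick mirrors the diff):

```cpp
#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

// Toy model of the deferred-reorder pattern: the decode step records
// which output rows traded places; the swaps are replayed once, on
// first access to the buffer.
struct swap_info { uint32_t i0, i1; };

struct outputs {
    std::vector<float>     logits; // n_outputs x n_vocab, row-major
    uint32_t               n_vocab;
    std::vector<swap_info> swaps;  // analogue of output_swaps

    // analogue of llama_context::output_reorder()
    void reorder() {
        for (const auto & s : swaps) {
            for (uint32_t k = 0; k < n_vocab; k++) {
                std::swap(logits[s.i0*n_vocab + k], logits[s.i1*n_vocab + k]);
            }
        }
        swaps.clear(); // replay once; subsequent accesses are free
    }

    float * get_logits() { reorder(); return logits.data(); }
};

int main() {
    // three output rows of width 2; the decode step recorded that rows 0 and 2 swapped
    outputs out { {0, 0, 1, 1, 2, 2}, 2, { {0, 2} } };

    const float * l = out.get_logits();
    std::printf("row 0 now starts with %.0f\n", l[0]); // prints 2
}
```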
package/src/llama.cpp/src/llama-context.h
CHANGED

@@ -181,6 +181,8 @@ private:
     // Returns max number of outputs for which space was reserved.
     uint32_t output_reserve(int32_t n_outputs);

+    void output_reorder();
+
     //
     // graph
     //
@@ -250,6 +252,13 @@ private:

     std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers

+    struct swap_info {
+        uint32_t i0;
+        uint32_t i1;
+    };
+
+    std::vector<swap_info> output_swaps;
+
     ggml_backend_sched_ptr sched;

     ggml_backend_t backend_cpu = nullptr;
@@ -278,6 +287,10 @@ private:

     bool has_evaluated_once = false;

+    // env: LLAMA_SET_ROWS (temporary)
+    // ref: https://github.com/ggml-org/llama.cpp/pull/14285
+    bool supports_set_rows = false;
+
     // perf
     mutable int64_t t_start_us = 0;
     mutable int64_t t_load_us = 0;
package/src/llama.cpp/src/llama-memory-recurrent.cpp
CHANGED

@@ -768,6 +768,8 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::
     // Iterate and write all the keys first, each row is a cell
     // Get whole range at a time
     for (uint32_t il = 0; il < n_layer; ++il) {
+        // skip null layers (read_data will handle this by checking "r_l" and "s_l" for null)
+        if (r_l[il] == nullptr) continue;

         // Write key type
         const int32_t r_type_i = (int32_t)r_l[il]->type;
@@ -787,6 +789,8 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::

     if (!s_trans) {
         for (uint32_t il = 0; il < n_layer; ++il) {
+            // skip null layers (read_data will handle this by checking "r_l" and "s_l" for null)
+            if (s_l[il] == nullptr) continue;

             // Write value type
             const int32_t s_type_i = (int32_t)s_l[il]->type;
@@ -807,6 +811,9 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::
         // When v is transposed, we also need the element size and get the element ranges from each row
         const uint32_t mem_size = size;
         for (uint32_t il = 0; il < n_layer; ++il) {
+            // skip null layers (read_data will handle this by checking "r_l" and "s_l" for null)
+            if (s_l[il] == nullptr) continue;
+
             const uint32_t n_embd_s = hparams.n_embd_s();

             // Write value type
@@ -951,6 +958,8 @@ bool llama_memory_recurrent::state_read_data(llama_io_read_i & io, uint32_t cell

     // For each layer, read the keys for each cell, one row is one cell, read as one contiguous block
     for (uint32_t il = 0; il < n_layer; ++il) {
+        // skip null layers
+        if (r_l[il] == nullptr) continue;

         // Read type of key
         int32_t r_type_i_ref;
@@ -978,11 +987,14 @@ bool llama_memory_recurrent::state_read_data(llama_io_read_i & io, uint32_t cell

     if (!s_trans) {
         for (uint32_t il = 0; il < n_layer; ++il) {
+            // skip null layers
+            if (s_l[il] == nullptr) continue;

             // Read type of value
             int32_t s_type_i_ref;
             io.read_to(&s_type_i_ref, sizeof(s_type_i_ref));
             const int32_t s_type_i = (int32_t)s_l[il]->type;
+
             if (s_type_i != s_type_i_ref) {
                 LLAMA_LOG_ERROR("%s: mismatched s type (%d != %d, layer %d)\n", __func__, s_type_i, s_type_i_ref, il);
                 return false;
@@ -1005,6 +1017,9 @@ bool llama_memory_recurrent::state_read_data(llama_io_read_i & io, uint32_t cell
     } else {
         // For each layer, read the values for each cell (transposed)
         for (uint32_t il = 0; il < n_layer; ++il) {
+            // skip null layers
+            if (s_l[il] == nullptr) continue;
+
             const uint32_t n_embd_s = hparams.n_embd_s();

             // Read type of value
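All six llama-memory-recurrent.cpp hunks apply the same guard: serialization walks the per-layer r_l/s_l arrays and now skips null entries (layers without recurrent state), and the reader performs the identical checks so writer and reader stay aligned on the stream. A toy writer demonstrating the pattern (tensor, write_layers and the data are invented):

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Toy version of the null-layer guard: the serializer walks a per-layer
// array and skips absent layers; a reader applying the same check stays
// in sync with the writer.
struct tensor { int32_t type; };

static void write_layers(const std::vector<tensor *> & r_l) {
    for (size_t il = 0; il < r_l.size(); ++il) {
        // skip null layers, as the diff does for r_l/s_l
        if (r_l[il] == nullptr) continue;
        std::printf("layer %zu: type %d\n", il, (int) r_l[il]->type);
    }
}

int main() {
    tensor t0 { 1 }, t2 { 7 };
    std::vector<tensor *> r_l = { &t0, nullptr, &t2 }; // layer 1 has no state
    write_layers(r_l); // prints layers 0 and 2 only
}
```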
package/src/llama.cpp/src/llama-model.cpp
CHANGED

@@ -646,6 +646,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
             ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);

+            // MiniCPM uses rope by default, unlike Granite which uses it as a switch
+            hparams.rope_finetuned = true;
+
             switch (hparams.n_layer) {
                 case 52: type = LLM_TYPE_1B; break;
                 case 40: type = LLM_TYPE_2B; break;
@@ -1544,7 +1547,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT, hparams.token_shift_count, false);

             switch (hparams.n_layer) {
-                case 12:
+                case 12:
+                    switch (hparams.n_embd) {
+                        case 768: type = LLM_TYPE_190M; break;
+                        default: type = LLM_TYPE_UNKNOWN;
+                    } break;
                 case 24:
                     switch (hparams.n_embd) {
                         case 1024: type = LLM_TYPE_450M; break;
@@ -1557,7 +1564,17 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                         case 3584: type = LLM_TYPE_7B; break;
                         default: type = LLM_TYPE_UNKNOWN;
                     } break;
-                case 32:
+                case 32:
+                    switch (hparams.n_embd) {
+                        case 2560: type = LLM_TYPE_2_9B; break;
+                        case 4096: type = LLM_TYPE_7B; break;
+                        default: type = LLM_TYPE_UNKNOWN;
+                    } break;
+                case 61:
+                    switch (hparams.n_embd) {
+                        case 4096: type = LLM_TYPE_14B; break;
+                        default: type = LLM_TYPE_UNKNOWN;
+                    } break;
                 default: type = LLM_TYPE_UNKNOWN;
             }
         } break;
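Both llama-model.cpp hunks refine size detection: where several model variants share a layer count, the embedding width now disambiguates them. A hedged sketch of the same two-level dispatch, with an invented detect_type helper reusing the (n_layer, n_embd) pairs added above:

```cpp
#include <cstdint>
#include <cstdio>

// Invented helper mirroring the two-level switch above: n_layer picks
// the outer case, n_embd disambiguates variants sharing a layer count.
static const char * detect_type(uint32_t n_layer, uint32_t n_embd) {
    switch (n_layer) {
        case 32:
            switch (n_embd) {
                case 2560: return "2.9B";
                case 4096: return "7B";
                default:   return "unknown";
            }
        case 61:
            switch (n_embd) {
                case 4096: return "14B";
                default:   return "unknown";
            }
        default:
            return "unknown";
    }
}

int main() {
    std::printf("%s\n", detect_type(61, 4096)); // prints: 14B
}
```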
package/src/tts_utils.cpp
CHANGED

@@ -357,3 +357,15 @@ std::vector<float> embd_to_audio(const float *embd, const int n_codes,

   return audio;
 }
+
+const char *get_tts_grammar(const tts_type type) {
+  switch (type) {
+    case OUTETTS_V0_1:
+      return OUTETTS_V1_GRAMMAR;
+    case OUTETTS_V0_2:
+    case OUTETTS_V0_3:
+      return OUTETTS_V2_GRAMMAR;
+    default:
+      return nullptr;
+  }
+}
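A hypothetical caller of the new function: request the grammar for the detected OuteTTS version and fall back to unconstrained sampling when it returns nullptr. The enum and a stubbed get_tts_grammar are inlined so the sketch compiles standalone; the real declarations live in tts_utils.h:

```cpp
#include <cstdio>

// Stubs standing in for tts_utils.h so this compiles on its own.
enum tts_type { UNKNOWN = -1, OUTETTS_V0_1 = 1, OUTETTS_V0_2 = 2, OUTETTS_V0_3 = 3 };

static const char *get_tts_grammar(const tts_type type) {
    switch (type) {
        case OUTETTS_V0_1: return "...V1 grammar...";
        case OUTETTS_V0_2:
        case OUTETTS_V0_3: return "...V2 grammar...";
        default:           return nullptr;
    }
}

int main() {
    const tts_type type = OUTETTS_V0_3;
    if (const char *grammar = get_tts_grammar(type)) {
        std::printf("constraining sampler with grammar:\n%s\n", grammar);
    } else {
        std::printf("no grammar for this model; sampling unconstrained\n");
    }
}
```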
package/src/tts_utils.h
CHANGED

@@ -8,7 +8,7 @@

 #include <nlohmann/json.hpp>

-enum tts_type { UNKNOWN = -1,
+enum tts_type { UNKNOWN = -1, OUTETTS_V0_1 = 1, OUTETTS_V0_2 = 2, OUTETTS_V0_3 = 3 };

 static std::string anyascii_string(const std::string &input);

@@ -20,6 +20,8 @@ std::string process_text(const std::string &text, const tts_type tts_type);
 std::vector<float> embd_to_audio(const float *embd, const int n_codes,
                                  const int n_embd, const int n_thread);

+const char *get_tts_grammar(const tts_type type);
+
 // the default speaker profile is from:
 // https://github.com/edwko/OuteTTS/blob/main/outetts/version/v1/default_speakers/en_male_1.json
 static const char *DEFAULT_AUDIO_TEXT =
@@ -62,3 +64,40 @@ and<|t_0.15|><|code_start|><|1285|><|987|><|303|><|1037|><|730|><|1164|><|502|><
 it<|t_0.09|><|code_start|><|848|><|1366|><|395|><|1601|><|1513|><|593|><|1302|><|code_end|>
 looks<|t_0.27|><|code_start|><|1281|><|1266|><|1755|><|572|><|248|><|1751|><|1257|><|695|><|1380|><|457|><|659|><|585|><|1315|><|1105|><|1776|><|736|><|24|><|736|><|654|><|1027|><|code_end|>
 lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|1481|><|1721|><|1123|><|438|><|1246|><|1251|><|795|><|659|><|1381|><|1658|><|217|><|1772|><|562|><|952|><|107|><|1129|><|1112|><|467|><|550|><|1079|><|840|><|1615|><|1469|><|1380|><|168|><|917|><|836|><|1827|><|437|><|583|><|67|><|595|><|1087|><|1646|><|1493|><|1677|><|code_end|>)";
+
+static const char *OUTETTS_V1_GRAMMAR = R"(
+root ::= NL? wordAudioBlock+ audioEnd NL eos?
+wordAudioBlock ::= WORD codeBlock NL
+codeBlock ::= TIME CODE{1,144}
+eos ::= "<|im_end|>"
+codeStart ::= "<|code_start|>"
+codeEnd ::= "<|code_end|>"
+audioEnd ::= "<|audio_end|>"
+WORD ::= [A-Za-z]+
+NL ::= "\n"
+TIME ::= "<|t_" DECIMAL "|>"
+CODE ::= "<|" DIGITS "|>"
+DIGITS ::= [0-9]+
+DECIMAL ::= [0-9]+ "." [0-9]+
+)";
+
+static const char *OUTETTS_V2_GRAMMAR = R"(
+root ::= NL? content+ audioEnd NL eos?
+content ::= wordAudioBlock | emotionBlock
+wordAudioBlock ::= WORD punch* codeBlock space NL
+codeBlock ::= TIME CODE{1,144}
+emotionBlock ::= emotionStart TEXT emotionEnd space NL
+TEXT ::= [A-Za-z0-9 .,?!]+
+eos ::= "<|im_end|>"
+emotionStart ::= "<|emotion_start|>"
+emotionEnd ::= "<|emotion_end|>"
+audioEnd ::= "<|audio_end|>"
+space ::= "<|space|>"
+WORD ::= [A-Za-z]+
+NL ::= "\n"
+TIME ::= "<|t_" DECIMAL "|>"
+CODE ::= "<|" DIGITS "|>"
+DIGITS ::= [0-9]+
+DECIMAL ::= [0-9]+ "." [0-9]+
+punch ::= "<|" [a-z_]+ "|>"
+)";
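Reading OUTETTS_V2_GRAMMAR top-down: a valid completion is one or more word or emotion blocks, each ending in <|space|> and a newline, followed by <|audio_end|>. For example, the following string (word timings and codes invented) should be accepted, parsing as two wordAudioBlocks plus the trailer:

```
hello<|t_0.21|><|101|><|1492|><|space|>
world<|t_0.35|><|87|><|space|>
<|audio_end|>
<|im_end|>
```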