@fugood/llama.node 1.0.6 → 1.1.0

@@ -105,7 +105,7 @@ llama_context::llama_context(
 
     {
         const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
-        const bool supports_set_rows = LLAMA_SET_ROWS ? (atoi(LLAMA_SET_ROWS) != 0) : false;
+        supports_set_rows = LLAMA_SET_ROWS ? (atoi(LLAMA_SET_ROWS) != 0) : false;
 
         if (!supports_set_rows && !cparams.kv_unified) {
             LLAMA_LOG_WARN("%s: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache\n", __func__);
@@ -508,12 +508,16 @@ enum llama_pooling_type llama_context::pooling_type() const {
 }
 
 float * llama_context::get_logits() {
+    output_reorder();
+
     return logits;
 }
 
 float * llama_context::get_logits_ith(int32_t i) {
     int64_t j = -1;
 
+    output_reorder();
+
     try {
         if (logits == nullptr) {
             throw std::runtime_error("no logits");
@@ -550,12 +554,16 @@ float * llama_context::get_logits_ith(int32_t i) {
 }
 
 float * llama_context::get_embeddings() {
+    output_reorder();
+
     return embd;
 }
 
 float * llama_context::get_embeddings_ith(int32_t i) {
     int64_t j = -1;
 
+    output_reorder();
+
     try {
         if (embd == nullptr) {
             throw std::runtime_error("no embeddings");
@@ -891,6 +899,12 @@ int llama_context::encode(const llama_batch & batch_inp) {
         }
     }
 
+    if (!supports_set_rows) {
+        // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
+        // overlap with device computation.
+        ggml_backend_sched_reset(sched.get());
+    }
+
     // TODO: hacky solution
     if (model.arch == LLM_ARCH_T5 && t_embd) {
         //cross.t_embd = t_embd;
@@ -970,6 +984,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
 
     // TODO: this clear of the buffer can easily be forgotten - need something better
     embd_seq.clear();
+    output_swaps.clear();
 
     bool did_optimize = false;
 
@@ -1189,9 +1204,6 @@ int llama_context::decode(const llama_batch & batch_inp) {
     // make the outputs have the same order they had in the user-provided batch
     // note: this is mostly relevant for recurrent models atm
     if (!sorted_output) {
-        const uint32_t n_vocab = model.vocab.n_tokens();
-        const uint64_t n_embd = model.hparams.n_embd;
-
         GGML_ASSERT((size_t) n_outputs == out_ids.size());
 
         // TODO: is there something more efficient which also minimizes swaps?
@@ -1207,16 +1219,9 @@ int llama_context::decode(const llama_batch & batch_inp) {
                 continue;
            }
            std::swap(out_ids[i], out_ids[j_min]);
-            if (logits_size > 0) {
-                for (uint32_t k = 0; k < n_vocab; k++) {
-                    std::swap(logits[i*n_vocab + k], logits[j_min*n_vocab + k]);
-                }
-            }
-            if (embd_size > 0) {
-                for (uint32_t k = 0; k < n_embd; k++) {
-                    std::swap(embd[i*n_embd + k], embd[j_min*n_embd + k]);
-                }
-            }
+
+            // remember the swaps and apply them lazily upon logits/embeddings access
+            output_swaps.push_back({ i, j_min });
        }
 
        std::fill(output_ids.begin(), output_ids.end(), -1);
@@ -1230,6 +1235,12 @@ int llama_context::decode(const llama_batch & batch_inp) {
     // wait for the computation to finish (automatically done when obtaining the model output)
     //synchronize();
 
+    if (!supports_set_rows) {
+        // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
+        // overlap with device computation.
+        ggml_backend_sched_reset(sched.get());
+    }
+
     return 0;
 }
 
@@ -1307,6 +1318,30 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
     return n_outputs_max;
 }
 
+void llama_context::output_reorder() {
+    const uint32_t n_vocab = model.vocab.n_tokens();
+    const uint64_t n_embd = model.hparams.n_embd;
+
+    for (uint32_t s = 0; s < output_swaps.size(); ++s) {
+        const uint32_t i0 = output_swaps[s].i0;
+        const uint32_t i1 = output_swaps[s].i1;
+
+        if (logits_size > 0) {
+            for (uint32_t k = 0; k < n_vocab; k++) {
+                std::swap(logits[i0*n_vocab + k], logits[i1*n_vocab + k]);
+            }
+        }
+
+        if (embd_size > 0) {
+            for (uint32_t k = 0; k < n_embd; k++) {
+                std::swap(embd[i0*n_embd + k], embd[i1*n_embd + k]);
+            }
+        }
+    }
+
+    output_swaps.clear();
+}
+
 //
 // graph
 //
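
Taken together, the llama_context hunks above switch output reordering from eager to lazy: decode() now only records which output rows were swapped back into user order (output_swaps), and the swaps are applied on first access via output_reorder(), which the get_logits*/get_embeddings* accessors call. A self-contained sketch of that record-then-apply-lazily pattern follows; swap_info and apply_pending_swaps here are illustrative names, not the library API:

    // Sketch: record row swaps during decoding, apply them once on first access.
    #include <cstddef>
    #include <cstdint>
    #include <utility>
    #include <vector>

    struct swap_info { uint32_t i0, i1; };

    // rows is an n_rows x row_width buffer (e.g. logits); swaps were recorded while sorting outputs
    static void apply_pending_swaps(std::vector<float> & rows, size_t row_width,
                                    std::vector<swap_info> & swaps) {
        for (const auto & s : swaps) {
            for (size_t k = 0; k < row_width; ++k) {
                std::swap(rows[s.i0*row_width + k], rows[s.i1*row_width + k]);
            }
        }
        swaps.clear(); // applied exactly once; later accesses see already-reordered rows
    }

The deferral moves the per-row copying (on the order of n_outputs x n_vocab elements for logits) out of decode() and into the first accessor call, and clearing the swap list keeps repeated get_logits()/get_embeddings() calls idempotent.
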
@@ -181,6 +181,8 @@ private:
     // Returns max number of outputs for which space was reserved.
     uint32_t output_reserve(int32_t n_outputs);
 
+    void output_reorder();
+
     //
     // graph
     //
@@ -250,6 +252,13 @@ private:
 
     std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers
 
+    struct swap_info {
+        uint32_t i0;
+        uint32_t i1;
+    };
+
+    std::vector<swap_info> output_swaps;
+
     ggml_backend_sched_ptr sched;
 
     ggml_backend_t backend_cpu = nullptr;
@@ -278,6 +287,10 @@ private:
 
     bool has_evaluated_once = false;
 
+    // env: LLAMA_SET_ROWS (temporary)
+    // ref: https://github.com/ggml-org/llama.cpp/pull/14285
+    bool supports_set_rows = false;
+
     // perf
     mutable int64_t t_start_us = 0;
     mutable int64_t t_load_us = 0;
@@ -768,6 +768,8 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::
     // Iterate and write all the keys first, each row is a cell
     // Get whole range at a time
     for (uint32_t il = 0; il < n_layer; ++il) {
+        // skip null layers (read_data will handle this by checking "r_l" and "s_l" for null)
+        if (r_l[il] == nullptr) continue;
 
         // Write key type
         const int32_t r_type_i = (int32_t)r_l[il]->type;
@@ -787,6 +789,8 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::
 
     if (!s_trans) {
         for (uint32_t il = 0; il < n_layer; ++il) {
+            // skip null layers (read_data will handle this by checking "r_l" and "s_l" for null)
+            if (s_l[il] == nullptr) continue;
 
             // Write value type
             const int32_t s_type_i = (int32_t)s_l[il]->type;
@@ -807,6 +811,9 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::
         // When v is transposed, we also need the element size and get the element ranges from each row
        const uint32_t mem_size = size;
        for (uint32_t il = 0; il < n_layer; ++il) {
+            // skip null layers (read_data will handle this by checking "r_l" and "s_l" for null)
+            if (s_l[il] == nullptr) continue;
+
            const uint32_t n_embd_s = hparams.n_embd_s();
 
            // Write value type
@@ -951,6 +958,8 @@ bool llama_memory_recurrent::state_read_data(llama_io_read_i & io, uint32_t cell
 
     // For each layer, read the keys for each cell, one row is one cell, read as one contiguous block
     for (uint32_t il = 0; il < n_layer; ++il) {
+        // skip null layers
+        if (r_l[il] == nullptr) continue;
 
         // Read type of key
         int32_t r_type_i_ref;
@@ -978,11 +987,14 @@ bool llama_memory_recurrent::state_read_data(llama_io_read_i & io, uint32_t cell
 
     if (!s_trans) {
         for (uint32_t il = 0; il < n_layer; ++il) {
+            // skip null layers
+            if (s_l[il] == nullptr) continue;
 
             // Read type of value
             int32_t s_type_i_ref;
             io.read_to(&s_type_i_ref, sizeof(s_type_i_ref));
             const int32_t s_type_i = (int32_t)s_l[il]->type;
+
             if (s_type_i != s_type_i_ref) {
                 LLAMA_LOG_ERROR("%s: mismatched s type (%d != %d, layer %d)\n", __func__, s_type_i, s_type_i_ref, il);
                 return false;
@@ -1005,6 +1017,9 @@ bool llama_memory_recurrent::state_read_data(llama_io_read_i & io, uint32_t cell
     } else {
         // For each layer, read the values for each cell (transposed)
         for (uint32_t il = 0; il < n_layer; ++il) {
+            // skip null layers
+            if (s_l[il] == nullptr) continue;
+
             const uint32_t n_embd_s = hparams.n_embd_s();
 
             // Read type of value
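
The recurrent-memory hunks add the same guard on both sides of the state format: layers whose r_l/s_l tensors are null are skipped when the state is written, and the reader skips them too so the stream stays aligned. A standalone sketch of that required symmetry; tensor_stub, write_layers and read_layers are illustrative stand-ins, not the llama.cpp I/O interface:

    // Sketch: writer and reader must share the skip predicate, or later fields are read out of order.
    #include <cstdint>
    #include <vector>

    struct tensor_stub { int32_t type; };

    template <typename IO>
    void write_layers(IO & io, const std::vector<tensor_stub *> & layers) {
        for (const tensor_stub * t : layers) {
            if (t == nullptr) continue;           // skip null layers
            io.write(&t->type, sizeof(t->type));  // only present layers hit the stream
        }
    }

    template <typename IO>
    bool read_layers(IO & io, const std::vector<tensor_stub *> & layers) {
        for (const tensor_stub * t : layers) {
            if (t == nullptr) continue;            // must mirror the writer's skip
            int32_t type_ref;
            io.read_to(&type_ref, sizeof(type_ref));
            if (type_ref != t->type) return false; // layout mismatch
        }
        return true;
    }
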
@@ -646,6 +646,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
             ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
 
+            // MiniCPM uses rope by default, unlike Granite which uses it as a switch
+            hparams.rope_finetuned = true;
+
             switch (hparams.n_layer) {
                 case 52: type = LLM_TYPE_1B; break;
                 case 40: type = LLM_TYPE_2B; break;
@@ -1544,7 +1547,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT, hparams.token_shift_count, false);
 
             switch (hparams.n_layer) {
-                case 12: type = LLM_TYPE_190M; break;
+                case 12:
+                    switch (hparams.n_embd) {
+                        case 768: type = LLM_TYPE_190M; break;
+                        default: type = LLM_TYPE_UNKNOWN;
+                    } break;
                 case 24:
                     switch (hparams.n_embd) {
                         case 1024: type = LLM_TYPE_450M; break;
@@ -1557,7 +1564,17 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                         case 3584: type = LLM_TYPE_7B; break;
                         default: type = LLM_TYPE_UNKNOWN;
                     } break;
-                case 32: type = LLM_TYPE_2_9B; break; // RWKV-7-World
+                case 32:
+                    switch (hparams.n_embd) {
+                        case 2560: type = LLM_TYPE_2_9B; break;
+                        case 4096: type = LLM_TYPE_7B; break;
+                        default: type = LLM_TYPE_UNKNOWN;
+                    } break;
+                case 61:
+                    switch (hparams.n_embd) {
+                        case 4096: type = LLM_TYPE_14B; break;
+                        default: type = LLM_TYPE_UNKNOWN;
+                    } break;
                 default: type = LLM_TYPE_UNKNOWN;
             }
         } break;
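
In the last two hunks above, the layer count alone no longer identifies the model size, so the switch now also keys on hparams.n_embd (32 layers map to 2.9B at n_embd 2560 but to 7B at 4096, and 61 layers with n_embd 4096 map to 14B). A compact sketch of the same two-key lookup, with an illustrative enum and function name rather than the library's:

    // Sketch of disambiguating the model size by (n_layer, n_embd); names are illustrative.
    #include <cstdint>

    enum class llm_size { s190M, s2_9B, s7B, s14B, unknown };

    static llm_size size_of(uint32_t n_layer, uint32_t n_embd) {
        switch (n_layer) {
            case 12: return n_embd ==  768 ? llm_size::s190M : llm_size::unknown;
            case 32: return n_embd == 2560 ? llm_size::s2_9B
                          : n_embd == 4096 ? llm_size::s7B   : llm_size::unknown;
            case 61: return n_embd == 4096 ? llm_size::s14B  : llm_size::unknown;
            default: return llm_size::unknown;
        }
    }
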
package/src/tts_utils.cpp CHANGED
@@ -357,3 +357,15 @@ std::vector<float> embd_to_audio(const float *embd, const int n_codes,
 
     return audio;
 }
+
+const char *get_tts_grammar(const tts_type type) {
+    switch (type) {
+        case OUTETTS_V0_1:
+            return OUTETTS_V1_GRAMMAR;
+        case OUTETTS_V0_2:
+        case OUTETTS_V0_3:
+            return OUTETTS_V2_GRAMMAR;
+        default:
+            return nullptr;
+    }
+}
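
The new get_tts_grammar() helper maps the detected OuteTTS version to one of the GBNF grammar strings added in tts_utils.h below, returning nullptr when no grammar applies. A hedged usage sketch; only get_tts_grammar() and tts_type come from this package, the surrounding function is illustrative:

    #include <cstdio>
    #include "tts_utils.h"

    // Pick the grammar for a detected OuteTTS version; fall back to unconstrained output.
    void report_grammar(tts_type type) {
        const char * grammar = get_tts_grammar(type);
        if (grammar == nullptr) {
            std::printf("no grammar for this TTS model type\n");
            return;
        }
        // The returned string is GBNF (OUTETTS_V1_GRAMMAR or OUTETTS_V2_GRAMMAR) intended for
        // grammar-constrained sampling of the speech-code tokens; the sampler wiring is omitted here.
        std::printf("using grammar:\n%s\n", grammar);
    }
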
package/src/tts_utils.h CHANGED
@@ -8,7 +8,7 @@
 
 #include <nlohmann/json.hpp>
 
-enum tts_type { UNKNOWN = -1, OUTETTS_V0_2 = 1, OUTETTS_V0_3 = 2 };
+enum tts_type { UNKNOWN = -1, OUTETTS_V0_1 = 1, OUTETTS_V0_2 = 2, OUTETTS_V0_3 = 3 };
 
 static std::string anyascii_string(const std::string &input);
 
@@ -20,6 +20,8 @@ std::string process_text(const std::string &text, const tts_type tts_type);
 std::vector<float> embd_to_audio(const float *embd, const int n_codes,
                                  const int n_embd, const int n_thread);
 
+const char *get_tts_grammar(const tts_type type);
+
 // the default speaker profile is from:
 // https://github.com/edwko/OuteTTS/blob/main/outetts/version/v1/default_speakers/en_male_1.json
 static const char *DEFAULT_AUDIO_TEXT =
@@ -62,3 +64,40 @@ and<|t_0.15|><|code_start|><|1285|><|987|><|303|><|1037|><|730|><|1164|><|502|><
 it<|t_0.09|><|code_start|><|848|><|1366|><|395|><|1601|><|1513|><|593|><|1302|><|code_end|>
 looks<|t_0.27|><|code_start|><|1281|><|1266|><|1755|><|572|><|248|><|1751|><|1257|><|695|><|1380|><|457|><|659|><|585|><|1315|><|1105|><|1776|><|736|><|24|><|736|><|654|><|1027|><|code_end|>
 lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|1481|><|1721|><|1123|><|438|><|1246|><|1251|><|795|><|659|><|1381|><|1658|><|217|><|1772|><|562|><|952|><|107|><|1129|><|1112|><|467|><|550|><|1079|><|840|><|1615|><|1469|><|1380|><|168|><|917|><|836|><|1827|><|437|><|583|><|67|><|595|><|1087|><|1646|><|1493|><|1677|><|code_end|>)";
+
+static const char *OUTETTS_V1_GRAMMAR = R"(
+root ::= NL? wordAudioBlock+ audioEnd NL eos?
+wordAudioBlock ::= WORD codeBlock NL
+codeBlock ::= TIME CODE{1,144}
+eos ::= "<|im_end|>"
+codeStart ::= "<|code_start|>"
+codeEnd ::= "<|code_end|>"
+audioEnd ::= "<|audio_end|>"
+WORD ::= [A-Za-z]+
+NL ::= "\n"
+TIME ::= "<|t_" DECIMAL "|>"
+CODE ::= "<|" DIGITS "|>"
+DIGITS ::= [0-9]+
+DECIMAL ::= [0-9]+ "." [0-9]+
+)";
+
+static const char *OUTETTS_V2_GRAMMAR = R"(
+root ::= NL? content+ audioEnd NL eos?
+content ::= wordAudioBlock | emotionBlock
+wordAudioBlock ::= WORD punch* codeBlock space NL
+codeBlock ::= TIME CODE{1,144}
+emotionBlock ::= emotionStart TEXT emotionEnd space NL
+TEXT ::= [A-Za-z0-9 .,?!]+
+eos ::= "<|im_end|>"
+emotionStart ::= "<|emotion_start|>"
+emotionEnd ::= "<|emotion_end|>"
+audioEnd ::= "<|audio_end|>"
+space ::= "<|space|>"
+WORD ::= [A-Za-z]+
+NL ::= "\n"
+TIME ::= "<|t_" DECIMAL "|>"
+CODE ::= "<|" DIGITS "|>"
+DIGITS ::= [0-9]+
+DECIMAL ::= [0-9]+ "." [0-9]+
+punch ::= "<|" [a-z_]+ "|>"
+)";