@fugood/llama.node 0.4.7 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98) hide show
  1. package/CMakeLists.txt +4 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/lib/binding.ts +66 -6
  11. package/lib/index.js +59 -17
  12. package/lib/index.ts +74 -23
  13. package/package.json +1 -1
  14. package/src/DecodeAudioTokenWorker.cpp +40 -0
  15. package/src/DecodeAudioTokenWorker.h +22 -0
  16. package/src/EmbeddingWorker.cpp +7 -5
  17. package/src/LlamaCompletionWorker.cpp +68 -54
  18. package/src/LlamaCompletionWorker.h +7 -8
  19. package/src/LlamaContext.cpp +551 -235
  20. package/src/LlamaContext.h +26 -4
  21. package/src/LoadSessionWorker.cpp +4 -2
  22. package/src/SaveSessionWorker.cpp +10 -6
  23. package/src/TokenizeWorker.cpp +23 -14
  24. package/src/TokenizeWorker.h +2 -2
  25. package/src/addons.cc +8 -11
  26. package/src/common.hpp +129 -126
  27. package/src/llama.cpp/.github/workflows/build.yml +2 -2
  28. package/src/llama.cpp/.github/workflows/release.yml +152 -129
  29. package/src/llama.cpp/.github/workflows/winget.yml +42 -0
  30. package/src/llama.cpp/common/arg.cpp +14 -13
  31. package/src/llama.cpp/common/common.cpp +4 -75
  32. package/src/llama.cpp/common/common.h +7 -12
  33. package/src/llama.cpp/examples/lookahead/lookahead.cpp +0 -13
  34. package/src/llama.cpp/examples/lookup/lookup.cpp +0 -11
  35. package/src/llama.cpp/examples/parallel/parallel.cpp +0 -9
  36. package/src/llama.cpp/examples/retrieval/retrieval.cpp +6 -6
  37. package/src/llama.cpp/examples/simple/simple.cpp +1 -1
  38. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +2 -2
  39. package/src/llama.cpp/examples/sycl/run-llama2.sh +4 -4
  40. package/src/llama.cpp/examples/sycl/run-llama3.sh +28 -0
  41. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  42. package/src/llama.cpp/examples/sycl/win-run-llama3.bat +9 -0
  43. package/src/llama.cpp/ggml/include/ggml-opt.h +2 -0
  44. package/src/llama.cpp/ggml/include/ggml.h +11 -0
  45. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +274 -0
  46. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +27 -0
  47. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +18 -2
  48. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1 -0
  49. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +107 -0
  50. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +16 -0
  51. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +8 -2
  52. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +315 -155
  53. package/src/llama.cpp/ggml/src/ggml-opt.cpp +5 -0
  54. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +43 -12
  55. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +171 -112
  56. package/src/llama.cpp/ggml/src/ggml.c +64 -18
  57. package/src/llama.cpp/include/llama.h +24 -124
  58. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +5 -1
  59. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +5 -1
  60. package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
  61. package/src/llama.cpp/src/llama-batch.cpp +3 -1
  62. package/src/llama.cpp/src/llama-context.cpp +60 -110
  63. package/src/llama.cpp/src/llama-graph.cpp +137 -233
  64. package/src/llama.cpp/src/llama-graph.h +49 -7
  65. package/src/llama.cpp/src/llama-hparams.cpp +17 -1
  66. package/src/llama.cpp/src/llama-hparams.h +34 -5
  67. package/src/llama.cpp/src/llama-kv-cache.cpp +654 -321
  68. package/src/llama.cpp/src/llama-kv-cache.h +201 -85
  69. package/src/llama.cpp/src/llama-memory.h +3 -2
  70. package/src/llama.cpp/src/llama-model.cpp +273 -94
  71. package/src/llama.cpp/src/llama-model.h +4 -1
  72. package/src/llama.cpp/tests/test-arg-parser.cpp +1 -1
  73. package/src/llama.cpp/tools/llama-bench/llama-bench.cpp +1 -0
  74. package/src/llama.cpp/tools/mtmd/CMakeLists.txt +13 -2
  75. package/src/llama.cpp/tools/mtmd/clip-impl.h +108 -11
  76. package/src/llama.cpp/tools/mtmd/clip.cpp +466 -88
  77. package/src/llama.cpp/tools/mtmd/clip.h +6 -4
  78. package/src/llama.cpp/tools/mtmd/miniaudio.h +93468 -0
  79. package/src/llama.cpp/tools/mtmd/mtmd-audio.cpp +855 -0
  80. package/src/llama.cpp/tools/mtmd/mtmd-audio.h +62 -0
  81. package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +21 -14
  82. package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +36 -49
  83. package/src/llama.cpp/tools/mtmd/mtmd.cpp +362 -98
  84. package/src/llama.cpp/tools/mtmd/mtmd.h +52 -21
  85. package/src/llama.cpp/tools/run/run.cpp +2 -2
  86. package/src/llama.cpp/tools/server/server.cpp +158 -47
  87. package/src/llama.cpp/tools/server/utils.hpp +71 -43
  88. package/src/llama.cpp/tools/tts/tts.cpp +4 -2
  89. package/src/tts_utils.cpp +342 -0
  90. package/src/tts_utils.h +62 -0
  91. package/bin/win32/arm64/llama-node.node +0 -0
  92. package/bin/win32/arm64/node.lib +0 -0
  93. package/bin/win32/x64/llama-node.node +0 -0
  94. package/bin/win32/x64/node.lib +0 -0
  95. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  96. package/bin/win32-vulkan/arm64/node.lib +0 -0
  97. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  98. package/bin/win32-vulkan/x64/node.lib +0 -0
@@ -536,6 +536,7 @@ static bool server_sent_event(httplib::DataSink & sink, const char * event, cons
536
536
  // OAI utils
537
537
  //
538
538
 
539
+ // used by /completions endpoint
539
540
  static json oaicompat_completion_params_parse(const json & body) {
540
541
  json llama_params;
541
542
 
@@ -580,13 +581,19 @@ static json oaicompat_completion_params_parse(const json & body) {
580
581
  return llama_params;
581
582
  }
582
583
 
583
- static json oaicompat_completion_params_parse(
584
+ struct oaicompat_parser_options {
585
+ bool use_jinja;
586
+ bool prefill_assistant;
587
+ common_reasoning_format reasoning_format;
588
+ common_chat_templates * tmpls;
589
+ bool allow_image;
590
+ bool allow_audio;
591
+ };
592
+
593
+ // used by /chat/completions endpoint
594
+ static json oaicompat_chat_params_parse(
584
595
  const json & body, /* openai api json semantics */
585
- bool use_jinja,
586
- bool prefill_assistant,
587
- common_reasoning_format reasoning_format,
588
- const struct common_chat_templates * tmpls,
589
- bool allow_non_text,
596
+ const oaicompat_parser_options & opt,
590
597
  std::vector<raw_buffer> & out_files)
591
598
  {
592
599
  json llama_params;
@@ -598,11 +605,11 @@ static json oaicompat_completion_params_parse(
598
605
  if (stream) {
599
606
  throw std::runtime_error("Cannot use tools with stream");
600
607
  }
601
- if (!use_jinja) {
608
+ if (!opt.use_jinja) {
602
609
  throw std::runtime_error("tools param requires --jinja flag");
603
610
  }
604
611
  }
605
- if (!use_jinja) {
612
+ if (!opt.use_jinja) {
606
613
  if (body.contains("tool_choice") && !body.at("tool_choice").is_null()) {
607
614
  throw std::runtime_error("Unsupported param: tool_choice");
608
615
  }
@@ -667,12 +674,12 @@ static json oaicompat_completion_params_parse(
667
674
 
668
675
  for (auto & p : content) {
669
676
  std::string type = json_value(p, "type", std::string());
670
- json image_url = json_value(p, "image_url", json::object());
671
677
  if (type == "image_url") {
672
- if (!allow_non_text) {
673
- throw std::runtime_error("image input is not supported by this server");
678
+ if (!opt.allow_image) {
679
+ throw std::runtime_error("image input is not supported - hint: if this is unexpected, you may need to provide the mmproj");
674
680
  }
675
681
 
682
+ json image_url = json_value(p, "image_url", json::object());
676
683
  std::string url = json_value(image_url, "url", std::string());
677
684
  if (string_starts_with(url, "http")) {
678
685
  // download remote image
@@ -710,8 +717,31 @@ static json oaicompat_completion_params_parse(
710
717
 
711
718
  // replace this chunk with a marker
712
719
  p["type"] = "text";
713
- p["text"] = MTMD_DEFAULT_IMAGE_MARKER;
720
+ p["text"] = mtmd_default_marker();
714
721
  p.erase("image_url");
722
+
723
+ } else if (type == "input_audio") {
724
+ if (!opt.allow_audio) {
725
+ throw std::runtime_error("audio input is not supported - hint: if this is unexpected, you may need to provide the mmproj");
726
+ }
727
+
728
+ json input_audio = json_value(p, "input_audio", json::object());
729
+ std::string data = json_value(input_audio, "data", std::string());
730
+ std::string format = json_value(input_audio, "format", std::string());
731
+ // while we also support flac, we don't allow it here so that we match the OAI spec
732
+ if (format != "wav" && format != "mp3") {
733
+ throw std::runtime_error("input_audio.format must be either 'wav' or 'mp3'");
734
+ }
735
+ auto decoded_data = base64_decode(data); // expected to be base64 encoded
736
+ out_files.push_back(decoded_data);
737
+
738
+ // replace this chunk with a marker
739
+ p["type"] = "text";
740
+ p["text"] = mtmd_default_marker();
741
+ p.erase("input_audio");
742
+
743
+ } else if (type != "text") {
744
+ throw std::runtime_error("unsupported content[].type");
715
745
  }
716
746
  }
717
747
  }
@@ -723,9 +753,9 @@ static json oaicompat_completion_params_parse(
723
753
  inputs.json_schema = json_schema.is_null() ? "" : json_schema.dump();
724
754
  inputs.grammar = grammar;
725
755
  inputs.add_generation_prompt = json_value(body, "add_generation_prompt", true);
726
- inputs.use_jinja = use_jinja;
756
+ inputs.use_jinja = opt.use_jinja;
727
757
  inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", false);
728
- inputs.extract_reasoning = reasoning_format != COMMON_REASONING_FORMAT_NONE;
758
+ inputs.extract_reasoning = opt.reasoning_format != COMMON_REASONING_FORMAT_NONE;
729
759
  inputs.add_generation_prompt = json_value(body, "add_generation_prompt", true);
730
760
  if (!inputs.tools.empty() && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE && body.contains("grammar")) {
731
761
  throw std::runtime_error("Cannot use custom grammar constraints with tools.");
@@ -733,7 +763,7 @@ static json oaicompat_completion_params_parse(
733
763
 
734
764
  // if the assistant message appears at the end of list, we do not add end-of-turn token
735
765
  // for ex. this can be useful to modify the reasoning process in reasoning models
736
- bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant" && prefill_assistant;
766
+ bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant" && opt.prefill_assistant;
737
767
  common_chat_msg last_message;
738
768
  if (prefill_assistant_message) {
739
769
  last_message = inputs.messages.back();
@@ -749,7 +779,7 @@ static json oaicompat_completion_params_parse(
749
779
  }
750
780
 
751
781
  // Apply chat template to the list of messages
752
- auto chat_params = common_chat_templates_apply(tmpls, inputs);
782
+ auto chat_params = common_chat_templates_apply(opt.tmpls, inputs);
753
783
 
754
784
  /* Append assistant prefilled message */
755
785
  if (prefill_assistant_message) {
@@ -1040,7 +1070,7 @@ struct server_tokens {
1040
1070
  private: // disallow accessing these members directly, risking out-of-sync
1041
1071
 
1042
1072
  // map a **start** position in tokens to the image chunk
1043
- std::unordered_map<llama_pos, mtmd::input_chunk_ptr> map_pos_to_image;
1073
+ std::unordered_map<llama_pos, mtmd::input_chunk_ptr> map_pos_to_media;
1044
1074
 
1045
1075
  // list of tokens
1046
1076
  // it can include LLAMA_TOKEN_NULL, which is used to indicate a token that is not a text token
@@ -1051,7 +1081,7 @@ private: // disallow accessing these members directly, risking out-of-sync
1051
1081
  // for ex. with input of 5 text tokens and 2 images:
1052
1082
  // [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1]
1053
1083
  // pos 0 1 2 3 4 5 6 7 8 9
1054
- // map_pos_to_image will contain: {5, img0}, {8, img1}
1084
+ // map_pos_to_media will contain: {5, img0}, {8, img1}
1055
1085
 
1056
1086
  public:
1057
1087
  server_tokens() = default;
@@ -1090,15 +1120,15 @@ public:
1090
1120
  }
1091
1121
  oss << "\n";
1092
1122
  oss << "image pos: ";
1093
- for (const auto & it : map_pos_to_image) {
1123
+ for (const auto & it : map_pos_to_media) {
1094
1124
  oss << it.first << ", ";
1095
1125
  }
1096
1126
  return oss.str();
1097
1127
  }
1098
1128
 
1099
1129
  const mtmd::input_chunk_ptr & find_chunk(llama_pos pos) const {
1100
- auto it = map_pos_to_image.find(pos);
1101
- if (it != map_pos_to_image.end()) {
1130
+ auto it = map_pos_to_media.find(pos);
1131
+ if (it != map_pos_to_media.end()) {
1102
1132
  return it->second;
1103
1133
  } else {
1104
1134
  throw std::runtime_error("Chunk not found");
@@ -1115,16 +1145,15 @@ public:
1115
1145
  // will create a copy of the chunk if it contains non-text data
1116
1146
  void push_back(const mtmd_input_chunk * chunk) {
1117
1147
  auto type = mtmd_input_chunk_get_type(chunk);
1118
- if (type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
1148
+ if (type == MTMD_INPUT_CHUNK_TYPE_IMAGE || type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
1119
1149
  GGML_ASSERT(has_mtmd);
1120
- auto img_tokens = mtmd_input_chunk_get_tokens_image(chunk);
1121
- const int n_pos = mtmd_image_tokens_get_n_pos(img_tokens);
1150
+ const int n_pos = mtmd_input_chunk_get_n_pos(chunk);
1122
1151
  llama_pos start_pos = tokens.size();
1123
1152
  for (int i = 0; i < n_pos; ++i) {
1124
1153
  tokens.emplace_back(LLAMA_TOKEN_NULL);
1125
1154
  }
1126
1155
  mtmd::input_chunk_ptr new_chunk(mtmd_input_chunk_copy(chunk));
1127
- map_pos_to_image[start_pos] = std::move(new_chunk);
1156
+ map_pos_to_media[start_pos] = std::move(new_chunk);
1128
1157
  } else if (type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
1129
1158
  size_t n_tokens;
1130
1159
  auto text_tokens = mtmd_input_chunk_get_tokens_text(chunk, &n_tokens);
@@ -1169,6 +1198,9 @@ public:
1169
1198
  void keep_first(size_t n) {
1170
1199
  GGML_ASSERT(n <= tokens.size());
1171
1200
  if (has_mtmd) {
1201
+ if (n == tokens.size()) {
1202
+ return; // nothing to do
1203
+ }
1172
1204
  // we throw an error if we try to remove a token in the middle of an image
1173
1205
  // for ex. with input of 5 text tokens and 2 images:
1174
1206
  // [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1]
@@ -1183,10 +1215,10 @@ public:
1183
1215
  }
1184
1216
  }
1185
1217
  // remove all image chunks that are not used anymore
1186
- for (auto it = map_pos_to_image.begin(); it != map_pos_to_image.end(); ) {
1218
+ for (auto it = map_pos_to_media.begin(); it != map_pos_to_media.end(); ) {
1187
1219
  llama_pos pos = it->first;
1188
1220
  if (pos >= (llama_pos)n) {
1189
- it = map_pos_to_image.erase(it);
1221
+ it = map_pos_to_media.erase(it);
1190
1222
  } else {
1191
1223
  ++it;
1192
1224
  }
@@ -1217,14 +1249,12 @@ public:
1217
1249
  const auto & a_chunk = find_chunk(i);
1218
1250
  const auto & b_chunk = b.find_chunk(i);
1219
1251
  GGML_ASSERT(a_chunk && b_chunk);
1220
- const auto * a_img = mtmd_input_chunk_get_tokens_image(a_chunk.get());
1221
- const auto * b_img = mtmd_input_chunk_get_tokens_image(b_chunk.get());
1222
- std::string ai_id = mtmd_image_tokens_get_id(a_img);
1223
- std::string bi_id = mtmd_image_tokens_get_id(b_img);
1224
- size_t a_pos = mtmd_image_tokens_get_n_pos(a_img);
1225
- size_t b_pos = mtmd_image_tokens_get_n_pos(b_img);
1252
+ std::string ai_id = mtmd_input_chunk_get_id(a_chunk.get());
1253
+ std::string bi_id = mtmd_input_chunk_get_id(b_chunk.get());
1254
+ size_t a_pos = mtmd_input_chunk_get_n_pos(a_chunk.get());
1255
+ size_t b_pos = mtmd_input_chunk_get_n_pos(b_chunk.get());
1226
1256
  if (ai_id == bi_id && a_pos == b_pos) {
1227
- GGML_ASSERT(a_pos > 0 && "Invalid image token"); // should never happen
1257
+ GGML_ASSERT(a_pos > 0 && "Invalid media chunk"); // should never happen
1228
1258
  i += a_pos - 1; // will be +1 by the for loop
1229
1259
  continue;
1230
1260
  } else {
@@ -1250,8 +1280,7 @@ public:
1250
1280
  if (t == LLAMA_TOKEN_NULL) {
1251
1281
  try {
1252
1282
  const auto & chunk = find_chunk(i);
1253
- const auto * img_tokens = mtmd_input_chunk_get_tokens_image(chunk.get());
1254
- size_t n_pos = mtmd_image_tokens_get_n_pos(img_tokens);
1283
+ size_t n_pos = mtmd_input_chunk_get_n_pos(chunk.get());
1255
1284
  i += n_pos - 1; // will be +1 by the for loop
1256
1285
  } catch (const std::exception & e) {
1257
1286
  return false;
@@ -1270,22 +1299,21 @@ public:
1270
1299
  llama_pos n_past,
1271
1300
  int32_t seq_id,
1272
1301
  llama_pos & n_pos_out) {
1273
- auto it = map_pos_to_image.find(n_past);
1274
- if (it == map_pos_to_image.end()) {
1275
- throw std::runtime_error("Chunk not found");
1276
- }
1277
- SRV_INF("%s\n", "processing image...");
1302
+ auto & chunk = find_chunk(n_past);
1303
+ const char * name = mtmd_input_chunk_get_type(chunk.get()) == MTMD_INPUT_CHUNK_TYPE_IMAGE
1304
+ ? "image" : "audio";
1305
+ SRV_INF("processing %s...\n", name);
1278
1306
  int32_t n_batch = llama_n_batch(ctx);
1279
1307
  int64_t t0 = ggml_time_ms();
1280
1308
  llama_pos new_n_past = n_past;
1281
1309
  int32_t result = mtmd_helper_eval_chunk_single(mctx, ctx,
1282
- it->second.get(), // chunk
1310
+ chunk.get(),
1283
1311
  n_past,
1284
1312
  seq_id,
1285
1313
  n_batch,
1286
1314
  true, // logits last
1287
1315
  &new_n_past);
1288
- SRV_INF("image processed in %" PRId64 " ms\n", ggml_time_ms() - t0);
1316
+ SRV_INF("%s processed in %" PRId64 " ms\n", name, ggml_time_ms() - t0);
1289
1317
  if (result != 0) {
1290
1318
  LOG_ERR("mtmd_helper_eval failed with status %d", result);
1291
1319
  n_pos_out = n_past;
@@ -579,6 +579,8 @@ int main(int argc, char ** argv) {
579
579
 
580
580
  params.model = params.vocoder.model;
581
581
  params.embedding = true;
582
+ params.ctx_shift = false; // silence warning
583
+ params.n_ubatch = params.n_batch;
582
584
 
583
585
  common_init_result llama_init_cts = common_init_from_params(params);
584
586
 
@@ -1020,8 +1022,8 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14
1020
1022
  }
1021
1023
  GGML_ASSERT(batch.n_tokens == n_codes);
1022
1024
 
1023
- if (llama_decode(ctx_cts, batch) != 0) {
1024
- LOG_ERR("%s: llama_decode() failed\n", __func__);
1025
+ if (llama_encode(ctx_cts, batch) != 0) {
1026
+ LOG_ERR("%s: llama_encode() failed\n", __func__);
1025
1027
  return 1;
1026
1028
  }
1027
1029
 
@@ -0,0 +1,342 @@
1
+ #include "tts_utils.h"
2
+
3
+ using json = nlohmann::json;
4
+
5
+ std::string audio_text_from_speaker(json speaker,
6
+ const tts_type type = OUTETTS_V0_2) {
7
+ std::string audio_text = "<|text_start|>";
8
+
9
+ if (type == OUTETTS_V0_2 || type == OUTETTS_V0_3) {
10
+ std::string separator =
11
+ (type == OUTETTS_V0_3) ? "<|space|>" : "<|text_sep|>";
12
+ for (const auto &word : speaker["words"]) {
13
+ audio_text += word["word"].get<std::string>() + separator;
14
+ }
15
+ }
16
+
17
+ return audio_text;
18
+ }
19
+
20
+ std::string audio_data_from_speaker(json speaker,
21
+ const tts_type type = OUTETTS_V0_2) {
22
+ std::string audio_data = "<|audio_start|>\n";
23
+
24
+ if (type == OUTETTS_V0_2 || type == OUTETTS_V0_3) {
25
+ std::string code_start = (type == OUTETTS_V0_3) ? "" : "<|code_start|>";
26
+ std::string code_end =
27
+ (type == OUTETTS_V0_3) ? "<|space|>" : "<|code_end|>";
28
+ for (const auto &word : speaker["words"]) {
29
+ std::string word_text = word["word"].get<std::string>();
30
+ double duration = word["duration"].get<double>();
31
+ std::vector<int> codes = word["codes"].get<std::vector<int>>();
32
+
33
+ // Create the audio output entry
34
+ std::ostringstream word_entry;
35
+ word_entry << word_text << "<|t_" << std::fixed << std::setprecision(2)
36
+ << duration << "|>" + code_start;
37
+ for (const auto &Code : codes) {
38
+ word_entry << "<|" << Code << "|>";
39
+ }
40
+ word_entry << code_end << "\n";
41
+ audio_data += word_entry.str();
42
+ }
43
+ }
44
+
45
+ return audio_data;
46
+ }
47
+
48
// English names for 0..19, also used to spell out decimal digits.
static const std::map<int, std::string> ones = {
    {0, "zero"},     {1, "one"},        {2, "two"},       {3, "three"},
    {4, "four"},     {5, "five"},       {6, "six"},       {7, "seven"},
    {8, "eight"},    {9, "nine"},       {10, "ten"},      {11, "eleven"},
    {12, "twelve"},  {13, "thirteen"},  {14, "fourteen"}, {15, "fifteen"},
    {16, "sixteen"}, {17, "seventeen"}, {18, "eighteen"}, {19, "nineteen"}};

// English names for the multiples of ten from 20 to 90, keyed by tens digit.
static const std::map<int, std::string> tens = {
    {2, "twenty"}, {3, "thirty"},  {4, "forty"},  {5, "fifty"},
    {6, "sixty"},  {7, "seventy"}, {8, "eighty"}, {9, "ninety"}};

// Convert a number less than 1000 to words.
//
// Returns an empty string for num <= 0. The result never carries leading or
// trailing whitespace (e.g. 100 -> "one hundred", 342 -> "three hundred
// forty-two"). Values >= 2000 make the `ones` lookup throw std::out_of_range.
std::string convert_less_than_thousand(int num) {
  std::string result;

  if (num >= 100) {
    result += ones.at(num / 100) + " hundred";
    num %= 100;
    if (num > 0) {
      // separator only when a tens/ones part follows
      result += ' ';
    }
  }

  if (num >= 20) {
    result += tens.at(num / 10);
    if (num % 10 > 0) {
      result += "-" + ones.at(num % 10);
    }
  } else if (num > 0) {
    result += ones.at(num);
  }

  return result;
}

// Spell out a decimal number given as text, e.g. "1100.50" ->
// "one thousand one hundred point five zero".
//
// The part before the first '.' is parsed as a signed 64-bit integer, so
// values up to the quintillions are supported. Digits after the '.' are read
// out one by one. Groups are joined by exactly one space and the result has
// no trailing whitespace.
//
// Negative integer parts produce no words for the integer part (unchanged
// from the previous behaviour). If parsing fails or the value does not fit
// into 64 bits, a single space " " is returned so that callers simply drop
// the number.
std::string number_to_words(const std::string &number_str) {
  try {
    const size_t decimal_pos = number_str.find('.');
    const std::string integer_part = number_str.substr(0, decimal_pos);

    // stoll instead of stoi: anything above INT_MAX used to hit the catch
    // block and vanish from the output.
    long long int_number = std::stoll(integer_part);
    std::string result;

    // Appends a group of words, inserting a single space between groups.
    const auto append_group = [&result](const std::string &words) {
      if (!result.empty()) {
        result += ' ';
      }
      result += words;
    };

    if (int_number == 0) {
      result = "zero";
    } else {
      struct scale_t {
        long long value;
        const char *name;
      };
      // Largest first; 10^18 is the biggest power of 1000 that fits.
      static const scale_t scales[] = {
          {1000000000000000000LL, "quintillion"},
          {1000000000000000LL, "quadrillion"},
          {1000000000000LL, "trillion"},
          {1000000000LL, "billion"},
          {1000000LL, "million"},
          {1000LL, "thousand"},
      };

      for (const scale_t &scale : scales) {
        if (int_number >= scale.value) {
          // Always < 1000: higher scales have already been taken out.
          const int group = static_cast<int>(int_number / scale.value);
          append_group(convert_less_than_thousand(group) + " " + scale.name);
          int_number %= scale.value;
        }
      }

      if (int_number > 0) {
        append_group(convert_less_than_thousand(static_cast<int>(int_number)));
      }
    }

    // Handle decimal part: read each digit individually
    if (decimal_pos != std::string::npos) {
      result += " point";
      const std::string decimal_part = number_str.substr(decimal_pos + 1);
      for (const char digit : decimal_part) {
        // non-digit characters are not in `ones` and end up in the catch block
        result += " " + ones.at(digit - '0');
      }
    }

    return result;
  } catch (const std::exception &e) {
    // Skip if fails
    return " ";
  }
}
129
+
130
+ std::string replace_numbers_with_words(const std::string &input_text) {
131
+ std::regex number_pattern(R"(\d+(\.\d+)?)");
132
+ std::string result;
133
+ auto it = std::sregex_iterator(input_text.begin(), input_text.end(),
134
+ number_pattern);
135
+ auto end = std::sregex_iterator();
136
+
137
+ size_t last_pos = 0;
138
+ for (std::sregex_iterator i = it; i != end; ++i) {
139
+ const std::smatch &match = *i;
140
+ result.append(input_text, last_pos, match.position() - last_pos);
141
+ result.append(number_to_words(match.str()));
142
+ last_pos = match.position() + match.length();
143
+ }
144
+ result.append(input_text, last_pos);
145
+
146
+ return result;
147
+ }
148
+
149
+ // Based on:
150
+ // https://github.com/edwko/OuteTTS/blob/a613e79c489d8256dd657ea9168d78de75895d82/outetts/version/v1/prompt_processor.py#L39
151
+ std::string process_text(const std::string &text,
152
+ const tts_type tts_type = OUTETTS_V0_2) {
153
+
154
+ // For now I skipped text romanization as I am unsure how to handle
155
+ // uroman and MeCab implementations in C++
156
+ // maybe something like https://github.com/anyascii/anyascii/ could work.
157
+ // currently only English would be supported in this function
158
+
159
+ std::string processed_text = replace_numbers_with_words(text);
160
+
161
+ std::transform(processed_text.begin(), processed_text.end(),
162
+ processed_text.begin(), ::tolower);
163
+
164
+ std::regex special_chars(R"([-_/,\.\\])");
165
+ processed_text = std::regex_replace(processed_text, special_chars, " ");
166
+
167
+ std::regex non_alpha(R"([^a-z\s])");
168
+ processed_text = std::regex_replace(processed_text, non_alpha, "");
169
+
170
+ std::regex multiple_spaces(R"(\s+)");
171
+ processed_text = std::regex_replace(processed_text, multiple_spaces, " ");
172
+
173
+ processed_text =
174
+ std::regex_replace(processed_text, std::regex(R"(^\s+|\s+$)"), "");
175
+
176
+ /*
177
+ Replace spaces with the separator token same as in line 365
178
+
179
+ for (auto & c : prompt_user) {
180
+ if (c == ' ') {
181
+ prompt_clean += "<|text_sep|>";
182
+ */
183
+ std::string separator =
184
+ (tts_type == OUTETTS_V0_3) ? "<|space|>" : "<|text_sep|>";
185
+ processed_text =
186
+ std::regex_replace(processed_text, std::regex(R"(\s)"), separator);
187
+
188
+ return processed_text;
189
+ }
190
+
191
// Writes a Hann window of `length` samples into `output`.
//
// periodic == true  : w[i] = 0.5 * (1 - cos(2*pi*i / length))
// periodic == false : w[i] = 0.5 * (1 - cos(2*pi*i / (length - 1)))
//
// A non-periodic window of length 1 has a zero denominator; it is defined as
// the single value 1 instead of evaluating 0/0 (which produced NaN).
// Nothing is written when length <= 0.
void fill_hann_window(int length, bool periodic, float *output) {
  const int denom = periodic ? length : length - 1;

  if (denom <= 0) {
    // Only writes for the non-periodic, length == 1 case.
    for (int i = 0; i < length; i++) {
      output[i] = 1.0f;
    }
    return;
  }

  for (int i = 0; i < length; i++) {
    output[i] = 0.5 * (1.0 - cosf((2.0 * M_PI * i) / denom));
  }
}
200
+
201
// Computes the complex exponential e^(i * 2*pi*k/N).
// The cosine is stored in *real and the sine in *imag.
void twiddle(float *real, float *imag, int k, int N) {
  // phase is rounded to float before the trigonometric calls
  const float theta = 2 * M_PI * k / N;

  *real = cos(theta);
  *imag = sin(theta);
}
206
+
207
+ void irfft(int n, const float *inp_cplx, float *out_real) {
208
+ int N = n / 2 + 1;
209
+
210
+ std::vector<float> real_input(N);
211
+ std::vector<float> imag_input(N);
212
+ for (int i = 0; i < N; ++i) {
213
+ real_input[i] = inp_cplx[2 * i];
214
+ imag_input[i] = inp_cplx[2 * i + 1];
215
+ }
216
+
217
+ std::vector<float> real_output(n);
218
+ std::vector<float> imag_output(n);
219
+
220
+ for (int k = 0; k < n; ++k) {
221
+ real_output[k] = 0.0f;
222
+ imag_output[k] = 0.0f;
223
+ for (int m = 0; m < N; ++m) {
224
+ float twiddle_real;
225
+ float twiddle_imag;
226
+
227
+ twiddle(&twiddle_real, &twiddle_imag, k * m, n);
228
+
229
+ real_output[k] +=
230
+ real_input[m] * twiddle_real - imag_input[m] * twiddle_imag;
231
+ imag_output[k] +=
232
+ real_input[m] * twiddle_imag + imag_input[m] * twiddle_real;
233
+ }
234
+ }
235
+
236
+ for (int i = 0; i < n; ++i) {
237
+ out_real[i] = real_output[i] / N;
238
+ }
239
+ }
240
+
241
// Overlap-add of consecutive frames into a single signal.
//
// data   : frames stored back to back, n_win samples each
// n_out  : length of the padded signal the frames are summed into
// n_win  : frame length
// n_hop  : distance between the starts of consecutive frames
// n_pad  : frame f starts at padded position f * n_hop - n_pad; samples that
//          fall outside [0, n_out) are dropped
// output : overwritten with the result, n_out - 2 * n_pad samples long
//
// `output` is reset before accumulation, so any previous content is ignored.
// The loops stop as soon as all samples of `data` have been consumed.
void fold(const std::vector<float> &data, int64_t n_out, int64_t n_win,
          int64_t n_hop, int64_t n_pad, std::vector<float> &output) {
  // assign (not resize) so stale values of a reused vector are cleared
  output.assign(n_out, 0.0f);

  const int64_t n_data = static_cast<int64_t>(data.size());

  int64_t col_idx = 0;
  for (int64_t w_col = 0; w_col < n_out && col_idx < n_data; ++w_col) {
    const int64_t start = w_col * n_hop - n_pad;
    const int64_t end = start + n_win;

    for (int64_t w_im = start; w_im < end && col_idx < n_data;
         ++w_im, ++col_idx) {
      if (w_im >= 0 && w_im < n_out) {
        output[w_im] += data[col_idx];
      }
    }
  }

  output.resize(n_out - 2 * n_pad);
}
265
+
266
+ std::vector<float> embd_to_audio(const float *embd, const int n_codes,
267
+ const int n_embd, const int n_thread) {
268
+ const int n_fft = 1280;
269
+ const int n_hop = 320;
270
+ const int n_win = 1280;
271
+ const int n_pad = (n_win - n_hop) / 2;
272
+ const int n_out = (n_codes - 1) * n_hop + n_win;
273
+
274
+ std::vector<float> hann(n_fft);
275
+
276
+ fill_hann_window(hann.size(), true, hann.data());
277
+
278
+ int n_spec = n_embd * n_codes;
279
+
280
+ std::vector<float> E(n_spec);
281
+ std::vector<float> S(n_spec);
282
+ std::vector<float> ST(n_spec);
283
+
284
+ for (int l = 0; l < n_codes; ++l) {
285
+ for (int k = 0; k < n_embd; ++k) {
286
+ E[k * n_codes + l] = embd[l * n_embd + k];
287
+ }
288
+ }
289
+
290
+ for (int k = 0; k < n_embd / 2; ++k) {
291
+ for (int l = 0; l < n_codes; ++l) {
292
+ float mag = E[(k)*n_codes + l];
293
+ float phi = E[(k + n_embd / 2) * n_codes + l];
294
+
295
+ mag = exp(mag);
296
+
297
+ if (mag > 1e2) {
298
+ mag = 1e2;
299
+ }
300
+ S[2 * (k * n_codes + l) + 0] = mag * cosf(phi);
301
+ S[2 * (k * n_codes + l) + 1] = mag * sinf(phi);
302
+ }
303
+ }
304
+
305
+ for (int l = 0; l < n_codes; ++l) {
306
+ for (int k = 0; k < n_embd / 2; ++k) {
307
+ ST[l * n_embd + 2 * k + 0] = S[2 * (k * n_codes + l) + 0];
308
+ ST[l * n_embd + 2 * k + 1] = S[2 * (k * n_codes + l) + 1];
309
+ }
310
+ }
311
+
312
+ std::vector<float> res(n_codes * n_fft);
313
+ std::vector<float> hann2(n_codes * n_fft);
314
+
315
+ std::vector<std::thread> workers(n_thread);
316
+ for (int i = 0; i < n_thread; ++i) {
317
+ workers[i] = std::thread([&, i]() {
318
+ for (int l = i; l < n_codes; l += n_thread) {
319
+ irfft(n_fft, ST.data() + l * n_embd, res.data() + l * n_fft);
320
+ for (int j = 0; j < n_fft; ++j) {
321
+ res[l * n_fft + j] *= hann[j];
322
+ hann2[l * n_fft + j] = hann[j] * hann[j];
323
+ }
324
+ }
325
+ });
326
+ }
327
+ for (int i = 0; i < n_thread; ++i) {
328
+ workers[i].join();
329
+ }
330
+
331
+ std::vector<float> audio;
332
+ std::vector<float> env;
333
+
334
+ fold(res, n_out, n_win, n_hop, n_pad, audio);
335
+ fold(hann2, n_out, n_win, n_hop, n_pad, env); // TODO: can be done once
336
+
337
+ for (size_t i = 0; i < audio.size(); ++i) {
338
+ audio[i] /= env[i];
339
+ }
340
+
341
+ return audio;
342
+ }