npm - @fugood/llama.node - Versions diffs - 0.4.7 → 0.6.0 - Mend

@fugood/llama.node 0.4.7 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (98) hide show

package/CMakeLists.txt +4 -0
package/bin/darwin/arm64/llama-node.node +0 -0
package/bin/darwin/x64/llama-node.node +0 -0
package/bin/linux/arm64/llama-node.node +0 -0
package/bin/linux/x64/llama-node.node +0 -0
package/bin/linux-cuda/arm64/llama-node.node +0 -0
package/bin/linux-cuda/x64/llama-node.node +0 -0
package/bin/linux-vulkan/arm64/llama-node.node +0 -0
package/bin/linux-vulkan/x64/llama-node.node +0 -0
package/lib/binding.ts +66 -6
package/lib/index.js +59 -17
package/lib/index.ts +74 -23
package/package.json +1 -1
package/src/DecodeAudioTokenWorker.cpp +40 -0
package/src/DecodeAudioTokenWorker.h +22 -0
package/src/EmbeddingWorker.cpp +7 -5
package/src/LlamaCompletionWorker.cpp +68 -54
package/src/LlamaCompletionWorker.h +7 -8
package/src/LlamaContext.cpp +551 -235
package/src/LlamaContext.h +26 -4
package/src/LoadSessionWorker.cpp +4 -2
package/src/SaveSessionWorker.cpp +10 -6
package/src/TokenizeWorker.cpp +23 -14
package/src/TokenizeWorker.h +2 -2
package/src/addons.cc +8 -11
package/src/common.hpp +129 -126
package/src/llama.cpp/.github/workflows/build.yml +2 -2
package/src/llama.cpp/.github/workflows/release.yml +152 -129
package/src/llama.cpp/.github/workflows/winget.yml +42 -0
package/src/llama.cpp/common/arg.cpp +14 -13
package/src/llama.cpp/common/common.cpp +4 -75
package/src/llama.cpp/common/common.h +7 -12
package/src/llama.cpp/examples/lookahead/lookahead.cpp +0 -13
package/src/llama.cpp/examples/lookup/lookup.cpp +0 -11
package/src/llama.cpp/examples/parallel/parallel.cpp +0 -9
package/src/llama.cpp/examples/retrieval/retrieval.cpp +6 -6
package/src/llama.cpp/examples/simple/simple.cpp +1 -1
package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +2 -2
package/src/llama.cpp/examples/sycl/run-llama2.sh +4 -4
package/src/llama.cpp/examples/sycl/run-llama3.sh +28 -0
package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
package/src/llama.cpp/examples/sycl/win-run-llama3.bat +9 -0
package/src/llama.cpp/ggml/include/ggml-opt.h +2 -0
package/src/llama.cpp/ggml/include/ggml.h +11 -0
package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +274 -0
package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +27 -0
package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +18 -2
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1 -0
package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +107 -0
package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +16 -0
package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +8 -2
package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +315 -155
package/src/llama.cpp/ggml/src/ggml-opt.cpp +5 -0
package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +43 -12
package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +171 -112
package/src/llama.cpp/ggml/src/ggml.c +64 -18
package/src/llama.cpp/include/llama.h +24 -124
package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +5 -1
package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +5 -1
package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
package/src/llama.cpp/src/llama-batch.cpp +3 -1
package/src/llama.cpp/src/llama-context.cpp +60 -110
package/src/llama.cpp/src/llama-graph.cpp +137 -233
package/src/llama.cpp/src/llama-graph.h +49 -7
package/src/llama.cpp/src/llama-hparams.cpp +17 -1
package/src/llama.cpp/src/llama-hparams.h +34 -5
package/src/llama.cpp/src/llama-kv-cache.cpp +654 -321
package/src/llama.cpp/src/llama-kv-cache.h +201 -85
package/src/llama.cpp/src/llama-memory.h +3 -2
package/src/llama.cpp/src/llama-model.cpp +273 -94
package/src/llama.cpp/src/llama-model.h +4 -1
package/src/llama.cpp/tests/test-arg-parser.cpp +1 -1
package/src/llama.cpp/tools/llama-bench/llama-bench.cpp +1 -0
package/src/llama.cpp/tools/mtmd/CMakeLists.txt +13 -2
package/src/llama.cpp/tools/mtmd/clip-impl.h +108 -11
package/src/llama.cpp/tools/mtmd/clip.cpp +466 -88
package/src/llama.cpp/tools/mtmd/clip.h +6 -4
package/src/llama.cpp/tools/mtmd/miniaudio.h +93468 -0
package/src/llama.cpp/tools/mtmd/mtmd-audio.cpp +855 -0
package/src/llama.cpp/tools/mtmd/mtmd-audio.h +62 -0
package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +21 -14
package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +36 -49
package/src/llama.cpp/tools/mtmd/mtmd.cpp +362 -98
package/src/llama.cpp/tools/mtmd/mtmd.h +52 -21
package/src/llama.cpp/tools/run/run.cpp +2 -2
package/src/llama.cpp/tools/server/server.cpp +158 -47
package/src/llama.cpp/tools/server/utils.hpp +71 -43
package/src/llama.cpp/tools/tts/tts.cpp +4 -2
package/src/tts_utils.cpp +342 -0
package/src/tts_utils.h +62 -0
package/bin/win32/arm64/llama-node.node +0 -0
package/bin/win32/arm64/node.lib +0 -0
package/bin/win32/x64/llama-node.node +0 -0
package/bin/win32/x64/node.lib +0 -0
package/bin/win32-vulkan/arm64/llama-node.node +0 -0
package/bin/win32-vulkan/arm64/node.lib +0 -0
package/bin/win32-vulkan/x64/llama-node.node +0 -0
package/bin/win32-vulkan/x64/node.lib +0 -0

package/src/llama.cpp/tools/server/utils.hpp CHANGED Viewed

@@ -536,6 +536,7 @@ static bool server_sent_event(httplib::DataSink & sink, const char * event, cons
 // OAI utils
 //
+// used by /completions endpoint
 static json oaicompat_completion_params_parse(const json & body) {
     json llama_params;
@@ -580,13 +581,19 @@ static json oaicompat_completion_params_parse(const json & body) {
     return llama_params;
 }
-static json oaicompat_completion_params_parse(
+struct oaicompat_parser_options {
+    bool use_jinja;
+    bool prefill_assistant;
+    common_reasoning_format reasoning_format;
+    common_chat_templates * tmpls;
+    bool allow_image;
+    bool allow_audio;
+};
+// used by /chat/completions endpoint
+static json oaicompat_chat_params_parse(
     const json & body, /* openai api json semantics */
-    bool use_jinja,
-    bool prefill_assistant,
-    common_reasoning_format reasoning_format,
-    const struct common_chat_templates * tmpls,
-    bool allow_non_text,
+    const oaicompat_parser_options & opt,
     std::vector<raw_buffer> & out_files)
 {
     json llama_params;
@@ -598,11 +605,11 @@ static json oaicompat_completion_params_parse(
         if (stream) {
             throw std::runtime_error("Cannot use tools with stream");
         }
-        if (!use_jinja) {
+        if (!opt.use_jinja) {
             throw std::runtime_error("tools param requires --jinja flag");
         }
     }
-    if (!use_jinja) {
+    if (!opt.use_jinja) {
         if (body.contains("tool_choice") && !body.at("tool_choice").is_null()) {
             throw std::runtime_error("Unsupported param: tool_choice");
         }
@@ -667,12 +674,12 @@ static json oaicompat_completion_params_parse(
         for (auto & p : content) {
             std::string type      = json_value(p, "type", std::string());
-            json        image_url = json_value(p, "image_url", json::object());
             if (type == "image_url") {
-                if (!allow_non_text) {
-                    throw std::runtime_error("image input is not supported by this server");
+                if (!opt.allow_image) {
+                    throw std::runtime_error("image input is not supported - hint: if this is unexpected, you may need to provide the mmproj");
                 }
+                json image_url  = json_value(p, "image_url", json::object());
                 std::string url = json_value(image_url, "url", std::string());
                 if (string_starts_with(url, "http")) {
                     // download remote image
@@ -710,8 +717,31 @@ static json oaicompat_completion_params_parse(
                 // replace this chunk with a marker
                 p["type"] = "text";
-                p["text"] = MTMD_DEFAULT_IMAGE_MARKER;
+                p["text"] = mtmd_default_marker();
                 p.erase("image_url");
+            } else if (type == "input_audio") {
+                if (!opt.allow_audio) {
+                    throw std::runtime_error("audio input is not supported - hint: if this is unexpected, you may need to provide the mmproj");
+                }
+                json input_audio   = json_value(p, "input_audio", json::object());
+                std::string data   = json_value(input_audio, "data", std::string());
+                std::string format = json_value(input_audio, "format", std::string());
+                // while we also support flac, we don't allow it here so that we match the OAI spec
+                if (format != "wav" && format != "mp3") {
+                    throw std::runtime_error("input_audio.format must be either 'wav' or 'mp3'");
+                }
+                auto decoded_data = base64_decode(data); // expected to be base64 encoded
+                out_files.push_back(decoded_data);
+                // replace this chunk with a marker
+                p["type"] = "text";
+                p["text"] = mtmd_default_marker();
+                p.erase("input_audio");
+            } else if (type != "text") {
+                throw std::runtime_error("unsupported content[].type");
             }
         }
     }
@@ -723,9 +753,9 @@ static json oaicompat_completion_params_parse(
     inputs.json_schema           = json_schema.is_null() ? "" : json_schema.dump();
     inputs.grammar               = grammar;
     inputs.add_generation_prompt = json_value(body, "add_generation_prompt", true);
-    inputs.use_jinja             = use_jinja;
+    inputs.use_jinja             = opt.use_jinja;
     inputs.parallel_tool_calls   = json_value(body, "parallel_tool_calls", false);
-    inputs.extract_reasoning     = reasoning_format != COMMON_REASONING_FORMAT_NONE;
+    inputs.extract_reasoning     = opt.reasoning_format != COMMON_REASONING_FORMAT_NONE;
     inputs.add_generation_prompt = json_value(body, "add_generation_prompt", true);
     if (!inputs.tools.empty() && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE && body.contains("grammar")) {
         throw std::runtime_error("Cannot use custom grammar constraints with tools.");
@@ -733,7 +763,7 @@ static json oaicompat_completion_params_parse(
     // if the assistant message appears at the end of list, we do not add end-of-turn token
     // for ex. this can be useful to modify the reasoning process in reasoning models
-    bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant" && prefill_assistant;
+    bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant" && opt.prefill_assistant;
     common_chat_msg last_message;
     if (prefill_assistant_message) {
         last_message = inputs.messages.back();
@@ -749,7 +779,7 @@ static json oaicompat_completion_params_parse(
     }
     // Apply chat template to the list of messages
-    auto chat_params = common_chat_templates_apply(tmpls, inputs);
+    auto chat_params = common_chat_templates_apply(opt.tmpls, inputs);
     /* Append assistant prefilled message */
     if (prefill_assistant_message) {
@@ -1040,7 +1070,7 @@ struct server_tokens {
 private: // disallow accessing these members directly, risking out-of-sync
     // map a **start** position in tokens to the image chunk
-    std::unordered_map<llama_pos, mtmd::input_chunk_ptr> map_pos_to_image;
+    std::unordered_map<llama_pos, mtmd::input_chunk_ptr> map_pos_to_media;
     // list of tokens
     // it can include LLAMA_TOKEN_NULL, which is used to indicate a token that is not a text token
@@ -1051,7 +1081,7 @@ private: // disallow accessing these members directly, risking out-of-sync
     // for ex. with input of 5 text tokens and 2 images:
     //      [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1]
     // pos  0   1   2   3   4   5      6      7      8      9
-    // map_pos_to_image will contain: {5, img0}, {8, img1}
+    // map_pos_to_media will contain: {5, img0}, {8, img1}
 public:
     server_tokens() = default;
@@ -1090,15 +1120,15 @@ public:
         }
         oss << "\n";
         oss << "image pos: ";
-        for (const auto & it : map_pos_to_image) {
+        for (const auto & it : map_pos_to_media) {
             oss << it.first << ", ";
         }
         return oss.str();
     }
     const mtmd::input_chunk_ptr & find_chunk(llama_pos pos) const {
-        auto it = map_pos_to_image.find(pos);
-        if (it != map_pos_to_image.end()) {
+        auto it = map_pos_to_media.find(pos);
+        if (it != map_pos_to_media.end()) {
             return it->second;
         } else {
             throw std::runtime_error("Chunk not found");
@@ -1115,16 +1145,15 @@ public:
     // will create a copy of the chunk if it contains non-text data
     void push_back(const mtmd_input_chunk * chunk) {
         auto type = mtmd_input_chunk_get_type(chunk);
-        if (type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
+        if (type == MTMD_INPUT_CHUNK_TYPE_IMAGE || type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
             GGML_ASSERT(has_mtmd);
-            auto img_tokens = mtmd_input_chunk_get_tokens_image(chunk);
-            const int n_pos = mtmd_image_tokens_get_n_pos(img_tokens);
+            const int n_pos = mtmd_input_chunk_get_n_pos(chunk);
             llama_pos start_pos = tokens.size();
             for (int i = 0; i < n_pos; ++i) {
                 tokens.emplace_back(LLAMA_TOKEN_NULL);
             }
             mtmd::input_chunk_ptr new_chunk(mtmd_input_chunk_copy(chunk));
-            map_pos_to_image[start_pos] = std::move(new_chunk);
+            map_pos_to_media[start_pos] = std::move(new_chunk);
         } else if (type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
             size_t n_tokens;
             auto text_tokens = mtmd_input_chunk_get_tokens_text(chunk, &n_tokens);
@@ -1169,6 +1198,9 @@ public:
     void keep_first(size_t n) {
         GGML_ASSERT(n <= tokens.size());
         if (has_mtmd) {
+            if (n == tokens.size()) {
+                return; // nothing to do
+            }
             // we throw an error if we try to remove a token in the middle of an image
             // for ex. with input of 5 text tokens and 2 images:
             //    [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1]
@@ -1183,10 +1215,10 @@ public:
                 }
             }
             // remove all image chunks that are not used anymore
-            for (auto it = map_pos_to_image.begin(); it != map_pos_to_image.end(); ) {
+            for (auto it = map_pos_to_media.begin(); it != map_pos_to_media.end(); ) {
                 llama_pos pos = it->first;
                 if (pos >= (llama_pos)n) {
-                    it = map_pos_to_image.erase(it);
+                    it = map_pos_to_media.erase(it);
                 } else {
                     ++it;
                 }
@@ -1217,14 +1249,12 @@ public:
                 const auto & a_chunk =   find_chunk(i);
                 const auto & b_chunk = b.find_chunk(i);
                 GGML_ASSERT(a_chunk && b_chunk);
-                const auto * a_img = mtmd_input_chunk_get_tokens_image(a_chunk.get());
-                const auto * b_img = mtmd_input_chunk_get_tokens_image(b_chunk.get());
-                std::string ai_id  = mtmd_image_tokens_get_id(a_img);
-                std::string bi_id  = mtmd_image_tokens_get_id(b_img);
-                size_t a_pos       = mtmd_image_tokens_get_n_pos(a_img);
-                size_t b_pos       = mtmd_image_tokens_get_n_pos(b_img);
+                std::string ai_id  = mtmd_input_chunk_get_id(a_chunk.get());
+                std::string bi_id  = mtmd_input_chunk_get_id(b_chunk.get());
+                size_t a_pos       = mtmd_input_chunk_get_n_pos(a_chunk.get());
+                size_t b_pos       = mtmd_input_chunk_get_n_pos(b_chunk.get());
                 if (ai_id == bi_id && a_pos == b_pos) {
-                    GGML_ASSERT(a_pos > 0 && "Invalid image token"); // should never happen
+                    GGML_ASSERT(a_pos > 0 && "Invalid media chunk"); // should never happen
                     i += a_pos - 1; // will be +1 by the for loop
                     continue;
                 } else {
@@ -1250,8 +1280,7 @@ public:
             if (t == LLAMA_TOKEN_NULL) {
                 try {
                     const auto & chunk = find_chunk(i);
-                    const auto * img_tokens = mtmd_input_chunk_get_tokens_image(chunk.get());
-                    size_t n_pos = mtmd_image_tokens_get_n_pos(img_tokens);
+                    size_t n_pos = mtmd_input_chunk_get_n_pos(chunk.get());
                     i += n_pos - 1; // will be +1 by the for loop
                 } catch (const std::exception & e) {
                     return false;
@@ -1270,22 +1299,21 @@ public:
                 llama_pos n_past,
                 int32_t seq_id,
                 llama_pos & n_pos_out) {
-        auto it = map_pos_to_image.find(n_past);
-        if (it == map_pos_to_image.end()) {
-            throw std::runtime_error("Chunk not found");
-        }
-        SRV_INF("%s\n", "processing image...");
+        auto & chunk = find_chunk(n_past);
+        const char * name = mtmd_input_chunk_get_type(chunk.get()) == MTMD_INPUT_CHUNK_TYPE_IMAGE
+                            ? "image" : "audio";
+        SRV_INF("processing %s...\n", name);
         int32_t n_batch = llama_n_batch(ctx);
         int64_t t0 = ggml_time_ms();
         llama_pos new_n_past = n_past;
         int32_t result = mtmd_helper_eval_chunk_single(mctx, ctx,
-            it->second.get(), // chunk
+            chunk.get(),
             n_past,
             seq_id,
             n_batch,
             true, // logits last
             &new_n_past);
-        SRV_INF("image processed in %" PRId64 " ms\n", ggml_time_ms() - t0);
+        SRV_INF("%s processed in %" PRId64 " ms\n", name, ggml_time_ms() - t0);
         if (result != 0) {
             LOG_ERR("mtmd_helper_eval failed with status %d", result);
             n_pos_out = n_past;

package/src/llama.cpp/tools/tts/tts.cpp CHANGED Viewed

@@ -579,6 +579,8 @@ int main(int argc, char ** argv) {
     params.model = params.vocoder.model;
     params.embedding = true;
+    params.ctx_shift = false; // silence warning
+    params.n_ubatch = params.n_batch;
     common_init_result llama_init_cts = common_init_from_params(params);
@@ -1020,8 +1022,8 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14
     }
     GGML_ASSERT(batch.n_tokens == n_codes);
-    if (llama_decode(ctx_cts, batch) != 0) {
-        LOG_ERR("%s: llama_decode() failed\n", __func__);
+    if (llama_encode(ctx_cts, batch) != 0) {
+        LOG_ERR("%s: llama_encode() failed\n", __func__);
         return 1;
     }

package/src/tts_utils.cpp ADDED Viewed

@@ -0,0 +1,342 @@
+#include "tts_utils.h"
+using json = nlohmann::json;
+// Builds the text section of a speaker prompt: the "<|text_start|>" marker
+// followed by every entry of speaker["words"], each one terminated by the
+// separator token of the selected TTS version. For any tts_type other than
+// OUTETTS_V0_2 / OUTETTS_V0_3 only the start marker is returned.
+std::string audio_text_from_speaker(json speaker,
+                                    const tts_type type = OUTETTS_V0_2) {
+  std::string prompt = "<|text_start|>";
+  const bool is_v0_3 = (type == OUTETTS_V0_3);
+  if (!is_v0_3 && type != OUTETTS_V0_2) {
+    return prompt;
+  }
+  const std::string word_sep = is_v0_3 ? "<|space|>" : "<|text_sep|>";
+  for (const auto &entry : speaker["words"]) {
+    prompt += entry["word"].get<std::string>() + word_sep;
+  }
+  return prompt;
+}
+// Builds the audio section of a speaker prompt. After the "<|audio_start|>"
+// line, each entry of speaker["words"] becomes one line of the form
+//   <word><|t_D.DD|>[<|code_start|>]<|c0|><|c1|>...<end token>
+// where D.DD is the entry's "duration" printed with two decimals and the
+// opening / closing tokens depend on the TTS version. For any tts_type other
+// than OUTETTS_V0_2 / OUTETTS_V0_3 only the start line is returned.
+std::string audio_data_from_speaker(json speaker,
+                                    const tts_type type = OUTETTS_V0_2) {
+  std::string prompt = "<|audio_start|>\n";
+  const bool is_v0_3 = (type == OUTETTS_V0_3);
+  if (!is_v0_3 && type != OUTETTS_V0_2) {
+    return prompt;
+  }
+  // v0.3 has no opening token and closes every word with "<|space|>".
+  const std::string open_token = is_v0_3 ? "" : "<|code_start|>";
+  const std::string close_token = is_v0_3 ? "<|space|>" : "<|code_end|>";
+  for (const auto &entry : speaker["words"]) {
+    const std::string text = entry["word"].get<std::string>();
+    const double duration = entry["duration"].get<double>();
+    const std::vector<int> code_ids = entry["codes"].get<std::vector<int>>();
+    std::ostringstream line;
+    line << text << "<|t_" << std::fixed << std::setprecision(2) << duration
+         << "|>" << open_token;
+    for (const int id : code_ids) {
+      line << "<|" << id << "|>";
+    }
+    line << close_token << "\n";
+    prompt += line.str();
+  }
+  return prompt;
+}
+// English names for the integers 0 through 19, keyed by value.
+static const std::map<int, std::string> ones = {
+    {0, "zero"},     {1, "one"},        {2, "two"},       {3, "three"},
+    {4, "four"},     {5, "five"},       {6, "six"},       {7, "seven"},
+    {8, "eight"},    {9, "nine"},       {10, "ten"},      {11, "eleven"},
+    {12, "twelve"},  {13, "thirteen"},  {14, "fourteen"}, {15, "fifteen"},
+    {16, "sixteen"}, {17, "seventeen"}, {18, "eighteen"}, {19, "nineteen"}};
+// English names for the multiples of ten from 20 to 90, keyed by the tens
+// digit (2 through 9).
+static const std::map<int, std::string> tens = {
+    {2, "twenty"}, {3, "thirty"},  {4, "forty"},  {5, "fifty"},
+    {6, "sixty"},  {7, "seventy"}, {8, "eighty"}, {9, "ninety"}};
+// Convert a number less than 1000 to words, e.g. 342 -> "three hundred
+// forty-two" and 100 -> "one hundred".
+//
+// Words are separated by exactly one space and the result never ends with a
+// space. Returns an empty string when num <= 0. Callers must keep num below
+// 1000: 1000..1999 produce a "ten hundred"-style phrase and anything from
+// 2000 upwards makes the table lookup throw std::out_of_range.
+std::string convert_less_than_thousand(int num) {
+  std::string result;
+  if (num >= 100) {
+    result += ones.at(num / 100) + " hundred";
+    num %= 100;
+    // Only add the separating space when more words follow, so that exact
+    // hundreds do not end with a dangling space.
+    if (num > 0) {
+      result += " ";
+    }
+  }
+  if (num >= 20) {
+    result += tens.at(num / 10);
+    if (num % 10 > 0) {
+      result += "-" + ones.at(num % 10);
+    }
+  } else if (num > 0) {
+    result += ones.at(num);
+  }
+  return result;
+}
+// Spells out a decimal number given as text, e.g. "1000" -> "one thousand"
+// and "12.5" -> "twelve point five".
+//
+// The part before the first '.' is parsed with std::stoi and spelled in
+// billion / million / thousand groups; every character after the '.' is
+// spelled individually after the word "point". Words are separated by exactly
+// one space and the result has no trailing space.
+//
+// Any failure (an integer part that is empty, non-numeric or does not fit in
+// an int, or a non-digit character after the '.') is swallowed and reported
+// as a single space so that the caller simply drops the number.
+std::string number_to_words(const std::string &number_str) {
+  try {
+    const size_t decimal_pos = number_str.find('.');
+    const std::string integer_part = number_str.substr(0, decimal_pos);
+    int int_number = std::stoi(integer_part);
+    std::string result;
+    // Appends "<count in words><scale>" to result, inserting a single space
+    // between consecutive groups.
+    const auto append_group = [&result](int count, const char *scale) {
+      std::string words = convert_less_than_thousand(count);
+      // Defensive trim in case the helper's output ends with a space.
+      while (!words.empty() && words.back() == ' ') {
+        words.pop_back();
+      }
+      if (!result.empty()) {
+        result += " ";
+      }
+      result += words;
+      result += scale;
+    };
+    if (int_number == 0) {
+      result = "zero";
+    } else {
+      if (int_number >= 1000000000) {
+        append_group(int_number / 1000000000, " billion");
+        int_number %= 1000000000;
+      }
+      if (int_number >= 1000000) {
+        append_group(int_number / 1000000, " million");
+        int_number %= 1000000;
+      }
+      if (int_number >= 1000) {
+        append_group(int_number / 1000, " thousand");
+        int_number %= 1000;
+      }
+      if (int_number > 0) {
+        append_group(int_number, "");
+      }
+    }
+    // Handle decimal part
+    if (decimal_pos != std::string::npos) {
+      result += " point";
+      const std::string decimal_part = number_str.substr(decimal_pos + 1);
+      for (const char digit : decimal_part) {
+        // at() throws for anything that is not '0'..'9'; handled below.
+        result += " " + ones.at(digit - '0');
+      }
+    }
+    return result;
+  } catch (const std::exception &) {
+    // Skip if fails
+    return " ";
+  }
+}
+// Returns a copy of input_text in which every run of digits (optionally
+// followed by ".digits") is replaced by the output of number_to_words for
+// that match. Text between matches is copied through unchanged.
+std::string replace_numbers_with_words(const std::string &input_text) {
+  const std::regex number_pattern(R"(\d+(\.\d+)?)");
+  const std::sregex_iterator no_more_matches;
+  std::string rewritten;
+  size_t cursor = 0; // first character of input_text not yet copied
+  std::sregex_iterator hit(input_text.begin(), input_text.end(),
+                           number_pattern);
+  while (hit != no_more_matches) {
+    const std::smatch &found = *hit;
+    // Copy the literal text that precedes this number, then the spelled form.
+    rewritten.append(input_text, cursor, found.position() - cursor);
+    rewritten.append(number_to_words(found.str()));
+    cursor = found.position() + found.length();
+    ++hit;
+  }
+  // Tail after the last match (or the whole text when nothing matched).
+  rewritten.append(input_text, cursor);
+  return rewritten;
+}
+// Based on:
+// https://github.com/edwko/OuteTTS/blob/a613e79c489d8256dd657ea9168d78de75895d82/outetts/version/v1/prompt_processor.py#L39
+//
+// Normalizes free text into the token-separated form used in TTS prompts:
+//   1. numbers are spelled out via replace_numbers_with_words,
+//   2. the text is lower-cased,
+//   3. '-', '_', '/', ',', '.' and '\' become spaces,
+//   4. everything that is not a-z or whitespace is removed,
+//   5. whitespace runs are collapsed and the ends are trimmed,
+//   6. each remaining whitespace character is replaced by "<|space|>" for
+//      OUTETTS_V0_3 or "<|text_sep|>" otherwise.
+std::string process_text(const std::string &text,
+                         const tts_type tts_type = OUTETTS_V0_2) {
+  // For now I skipped text romanization as I am unsure how to handle
+  // uroman and MeCab implementations in C++
+  // maybe something like https://github.com/anyascii/anyascii/ could work.
+  // currently only English would be supported in this function
+  std::string processed_text = replace_numbers_with_words(text);
+  // tolower() must not be handed a negative value, which is what a plain
+  // char holds for non-ASCII bytes (e.g. UTF-8) where char is signed; go
+  // through unsigned char to keep the call well defined.
+  std::transform(processed_text.begin(), processed_text.end(),
+                 processed_text.begin(), [](unsigned char c) {
+                   return static_cast<char>(::tolower(c));
+                 });
+  std::regex special_chars(R"([-_/,\.\\])");
+  processed_text = std::regex_replace(processed_text, special_chars, " ");
+  std::regex non_alpha(R"([^a-z\s])");
+  processed_text = std::regex_replace(processed_text, non_alpha, "");
+  std::regex multiple_spaces(R"(\s+)");
+  processed_text = std::regex_replace(processed_text, multiple_spaces, " ");
+  processed_text =
+      std::regex_replace(processed_text, std::regex(R"(^\s+|\s+$)"), "");
+  /*
+      Replace spaces with the separator token. The original note refers to
+      an equivalent loop elsewhere ("line 365"):
+      for (auto & c : prompt_user) {
+      if (c == ' ') {
+          prompt_clean += "<|text_sep|>";
+  */
+  std::string separator =
+      (tts_type == OUTETTS_V0_3) ? "<|space|>" : "<|text_sep|>";
+  processed_text =
+      std::regex_replace(processed_text, std::regex(R"(\s)"), separator);
+  return processed_text;
+}
+// Writes `length` Hann window coefficients into `output`:
+//   output[i] = 0.5 * (1 - cos(2*pi*i / D))
+// with D = length for a periodic window and D = length - 1 otherwise.
+// `output` must have room for `length` floats.
+void fill_hann_window(int length, bool periodic, float *output) {
+  const int offset = periodic ? 0 : -1;
+  for (int idx = 0; idx < length; idx++) {
+    output[idx] = 0.5 * (1.0 - cosf((2.0 * M_PI * idx) / (length + offset)));
+  }
+}
+// Stores cos(2*pi*k/N) in *real and sin(2*pi*k/N) in *imag.
+void twiddle(float *real, float *imag, int k, int N) {
+  const float theta = 2 * M_PI * k / N;
+  *real = cos(theta);
+  *imag = sin(theta);
+}
+// Reconstructs n real samples from n/2 + 1 complex bins by direct summation
+// (two nested loops, no fast algorithm).
+//
+// n        - number of real samples to produce.
+// inp_cplx - n/2 + 1 bins stored interleaved as (re, im) pairs, i.e.
+//            2 * (n/2 + 1) floats are read.
+// out_real - receives n floats; sample k is the real part of
+//            sum_m bin[m] * (cos + i*sin)(2*pi*k*m/n), divided by the bin
+//            count n/2 + 1.
+void irfft(int n, const float *inp_cplx, float *out_real) {
+  const int N = n / 2 + 1;
+  // De-interleave into private copies first, so the input is fully read
+  // before any output sample is written.
+  std::vector<float> real_input(N);
+  std::vector<float> imag_input(N);
+  for (int i = 0; i < N; ++i) {
+    real_input[i] = inp_cplx[2 * i];
+    imag_input[i] = inp_cplx[2 * i + 1];
+  }
+  for (int k = 0; k < n; ++k) {
+    // Only the real part of each sample is ever written out, so the
+    // imaginary part of the sum is not accumulated at all.
+    float real_sum = 0.0f;
+    for (int m = 0; m < N; ++m) {
+      float twiddle_real;
+      float twiddle_imag;
+      twiddle(&twiddle_real, &twiddle_imag, k * m, n);
+      real_sum += real_input[m] * twiddle_real - imag_input[m] * twiddle_imag;
+    }
+    out_real[k] = real_sum / N;
+  }
+}
+// Sums consecutive windows of `data` into a single signal.
+//
+// `data` is read sequentially as windows of n_win samples. Window number w
+// is added into output positions [w * n_hop - n_pad, w * n_hop - n_pad +
+// n_win); positions outside [0, n_out) and reads past the end of `data` are
+// skipped, but the read cursor still advances. `output` is first resized to
+// n_out (new elements are zero) and finally resized to n_out - 2 * n_pad.
+void fold(const std::vector<float> &data, int64_t n_out, int64_t n_win,
+          int64_t n_hop, int64_t n_pad, std::vector<float> &output) {
+  output.resize(n_out, 0.0f);
+  const int64_t n_samples = (int64_t)data.size();
+  int64_t src = 0; // next sample of `data` to consume
+  for (int64_t frame = 0; frame < n_out; ++frame) {
+    const int64_t first = frame * n_hop - n_pad;
+    for (int64_t k = 0; k < n_win; ++k, ++src) {
+      const int64_t dst = first + k;
+      if (dst < 0 || dst >= n_out || src >= n_samples) {
+        continue;
+      }
+      output[dst] += data[src];
+    }
+  }
+  output.resize(n_out - 2 * n_pad);
+}
+// Converts vocoder output embeddings into audio samples.
+//
+// embd     - n_codes rows of n_embd floats (row l starts at embd[l * n_embd]).
+//            The first n_embd / 2 values of a row are exponentiated and used
+//            as magnitudes (capped at 1e2), the second half as phases.
+// n_codes  - number of rows / frames.
+// n_embd   - floats per row.
+// n_thread - worker threads used for the per-frame irfft; values below 1 are
+//            treated as 1 and values above n_codes as n_codes.
+//
+// Each frame is turned into n_fft = 1280 samples with irfft, multiplied by a
+// periodic Hann window, combined with hop 320 via fold, and divided by the
+// folded squared-window envelope. Returns an empty vector when n_codes or
+// n_embd is not positive.
+//
+// NOTE(review): irfft reads 2 * (n_fft / 2 + 1) = 1282 floats per frame, so
+// this appears to require n_embd >= 1282 - confirm against the caller.
+std::vector<float> embd_to_audio(const float *embd, const int n_codes,
+                                 const int n_embd, const int n_thread) {
+  const int n_fft = 1280;
+  const int n_hop = 320;
+  const int n_win = 1280;
+  const int n_pad = (n_win - n_hop) / 2;
+  // Nothing to synthesize; this also keeps every vector size below
+  // non-negative.
+  if (n_codes <= 0 || n_embd <= 0) {
+    return {};
+  }
+  const int n_out = (n_codes - 1) * n_hop + n_win;
+  std::vector<float> hann(n_fft);
+  fill_hann_window(hann.size(), true, hann.data());
+  int n_spec = n_embd * n_codes;
+  std::vector<float> E(n_spec);
+  std::vector<float> S(n_spec);
+  std::vector<float> ST(n_spec);
+  // Transpose: E is indexed [k][l] while embd is indexed [l][k].
+  for (int l = 0; l < n_codes; ++l) {
+    for (int k = 0; k < n_embd; ++k) {
+      E[k * n_codes + l] = embd[l * n_embd + k];
+    }
+  }
+  // Magnitude / phase -> interleaved (re, im).
+  for (int k = 0; k < n_embd / 2; ++k) {
+    for (int l = 0; l < n_codes; ++l) {
+      float mag = E[(k)*n_codes + l];
+      float phi = E[(k + n_embd / 2) * n_codes + l];
+      mag = exp(mag);
+      if (mag > 1e2) {
+        mag = 1e2;
+      }
+      S[2 * (k * n_codes + l) + 0] = mag * cosf(phi);
+      S[2 * (k * n_codes + l) + 1] = mag * sinf(phi);
+    }
+  }
+  // Transpose back so that each frame's bins are contiguous for irfft.
+  for (int l = 0; l < n_codes; ++l) {
+    for (int k = 0; k < n_embd / 2; ++k) {
+      ST[l * n_embd + 2 * k + 0] = S[2 * (k * n_codes + l) + 0];
+      ST[l * n_embd + 2 * k + 1] = S[2 * (k * n_codes + l) + 1];
+    }
+  }
+  std::vector<float> res(n_codes * n_fft);
+  std::vector<float> hann2(n_codes * n_fft);
+  // With zero workers no frame would be processed and the final division
+  // would yield 0/0; a negative count would turn into a huge vector size.
+  // Frames are independent, so the worker count does not affect the result.
+  const int n_workers = std::max(1, std::min(n_thread, n_codes));
+  std::vector<std::thread> workers(n_workers);
+  for (int i = 0; i < n_workers; ++i) {
+    // Worker i handles frames i, i + n_workers, i + 2 * n_workers, ...
+    workers[i] = std::thread([&, i]() {
+      for (int l = i; l < n_codes; l += n_workers) {
+        irfft(n_fft, ST.data() + l * n_embd, res.data() + l * n_fft);
+        for (int j = 0; j < n_fft; ++j) {
+          res[l * n_fft + j] *= hann[j];
+          hann2[l * n_fft + j] = hann[j] * hann[j];
+        }
+      }
+    });
+  }
+  for (int i = 0; i < n_workers; ++i) {
+    workers[i].join();
+  }
+  std::vector<float> audio;
+  std::vector<float> env;
+  fold(res, n_out, n_win, n_hop, n_pad, audio);
+  fold(hann2, n_out, n_win, n_hop, n_pad, env); // TODO: can be done once
+  for (size_t i = 0; i < audio.size(); ++i) {
+    audio[i] /= env[i];
+  }
+  return audio;
+}