@fugood/llama.node 1.1.11 → 1.2.0-rc.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +5 -8
- package/lib/binding.ts +18 -1
- package/lib/index.js +2 -2
- package/lib/index.ts +2 -2
- package/package.json +20 -16
- package/src/DecodeAudioTokenWorker.cpp +23 -26
- package/src/DecodeAudioTokenWorker.h +6 -8
- package/src/DetokenizeWorker.cpp +5 -8
- package/src/DetokenizeWorker.h +6 -5
- package/src/DisposeWorker.cpp +23 -3
- package/src/DisposeWorker.h +4 -2
- package/src/EmbeddingWorker.cpp +9 -35
- package/src/EmbeddingWorker.h +3 -2
- package/src/LlamaCompletionWorker.cpp +217 -315
- package/src/LlamaCompletionWorker.h +6 -12
- package/src/LlamaContext.cpp +166 -396
- package/src/LlamaContext.h +8 -13
- package/src/LoadSessionWorker.cpp +22 -19
- package/src/LoadSessionWorker.h +3 -2
- package/src/RerankWorker.h +3 -2
- package/src/SaveSessionWorker.cpp +22 -19
- package/src/SaveSessionWorker.h +3 -2
- package/src/TokenizeWorker.cpp +38 -35
- package/src/TokenizeWorker.h +12 -3
- package/src/common.hpp +0 -458
- package/src/llama.cpp/common/arg.cpp +50 -30
- package/src/llama.cpp/common/chat.cpp +111 -1
- package/src/llama.cpp/common/chat.h +3 -0
- package/src/llama.cpp/common/common.h +1 -1
- package/src/llama.cpp/common/log.cpp +53 -2
- package/src/llama.cpp/common/log.h +10 -4
- package/src/llama.cpp/common/sampling.cpp +23 -2
- package/src/llama.cpp/common/sampling.h +3 -1
- package/src/llama.cpp/common/speculative.cpp +1 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +3 -2
- package/src/llama.cpp/ggml/include/ggml-backend.h +3 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +0 -1
- package/src/llama.cpp/ggml/include/ggml.h +50 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +14 -13
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +210 -96
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +0 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +11 -37
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +3 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +4 -9
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +218 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +41 -37
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +150 -28
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +320 -73
- package/src/llama.cpp/include/llama.h +5 -6
- package/src/llama.cpp/src/llama-adapter.cpp +33 -0
- package/src/llama.cpp/src/llama-adapter.h +3 -0
- package/src/llama.cpp/src/llama-arch.cpp +27 -4
- package/src/llama.cpp/src/llama-arch.h +2 -0
- package/src/llama.cpp/src/llama-context.cpp +62 -56
- package/src/llama.cpp/src/llama-context.h +1 -1
- package/src/llama.cpp/src/llama-graph.cpp +54 -9
- package/src/llama.cpp/src/llama-graph.h +8 -0
- package/src/llama.cpp/src/llama-hparams.cpp +37 -0
- package/src/llama.cpp/src/llama-hparams.h +9 -3
- package/src/llama.cpp/src/llama-kv-cache.cpp +1 -23
- package/src/llama.cpp/src/llama-kv-cache.h +1 -0
- package/src/llama.cpp/src/llama-model.cpp +159 -1
- package/src/llama.cpp/src/llama-model.h +0 -1
- package/src/llama.cpp/src/llama-sampling.cpp +226 -126
- package/src/anyascii.c +0 -22223
- package/src/anyascii.h +0 -42
- package/src/tts_utils.cpp +0 -371
- package/src/tts_utils.h +0 -103
package/src/anyascii.h
DELETED
|
@@ -1,42 +0,0 @@
|
|
|
1
|
-
/*
|
|
2
|
-
ISC License
|
|
3
|
-
|
|
4
|
-
Copyright (c) 2020-2023, Hunter WB <hunterwb.com>
|
|
5
|
-
|
|
6
|
-
Permission to use, copy, modify, and/or distribute this software for any
|
|
7
|
-
purpose with or without fee is hereby granted, provided that the above
|
|
8
|
-
copyright notice and this permission notice appear in all copies.
|
|
9
|
-
|
|
10
|
-
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
|
11
|
-
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
|
12
|
-
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
|
13
|
-
ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
|
14
|
-
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
|
15
|
-
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
|
16
|
-
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
|
17
|
-
*/
|
|
18
|
-
|
|
19
|
-
#ifndef ANYASCII_H
|
|
20
|
-
#define ANYASCII_H
|
|
21
|
-
|
|
22
|
-
#ifdef __cplusplus
|
|
23
|
-
extern "C" {
|
|
24
|
-
#endif
|
|
25
|
-
|
|
26
|
-
#include <stddef.h>
|
|
27
|
-
#include <stdint.h>
|
|
28
|
-
|
|
29
|
-
/**
|
|
30
|
-
* Gets the ASCII transliteration of a Unicode code point
|
|
31
|
-
*
|
|
32
|
-
* @param utf32 A Unicode code point
|
|
33
|
-
* @param ascii A pointer for the result to be written to; not null-terminated
|
|
34
|
-
* @return The number of chars in *ascii
|
|
35
|
-
*/
|
|
36
|
-
size_t anyascii(uint_least32_t utf32, const char **ascii);
|
|
37
|
-
|
|
38
|
-
#ifdef __cplusplus
|
|
39
|
-
}
|
|
40
|
-
#endif
|
|
41
|
-
|
|
42
|
-
#endif
|
package/src/tts_utils.cpp
DELETED
|
@@ -1,371 +0,0 @@
|
|
|
1
|
-
#include "tts_utils.h"
|
|
2
|
-
#include "anyascii.h"
|
|
3
|
-
#include <codecvt>
|
|
4
|
-
|
|
5
|
-
using json = nlohmann::json;
|
|
6
|
-
|
|
7
|
-
static std::string anyascii_string(const std::string &input) {
|
|
8
|
-
std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
|
|
9
|
-
auto wstr = converter.from_bytes(input);
|
|
10
|
-
std::string output;
|
|
11
|
-
for (char32_t c : wstr) {
|
|
12
|
-
const char *r;
|
|
13
|
-
size_t rlen = anyascii(c, &r);
|
|
14
|
-
output.append(r, rlen);
|
|
15
|
-
}
|
|
16
|
-
return output;
|
|
17
|
-
}
|
|
18
|
-
|
|
19
|
-
// Builds the text half of a speaker prompt. The result always starts with
// "<|text_start|>"; for OUTETTS_V0_2 / OUTETTS_V0_3 every entry of
// speaker["words"] then contributes its "word" string followed by the
// separator token of that type. Any other type yields only the start token.
std::string audio_text_from_speaker(json speaker,
                                    const tts_type type = OUTETTS_V0_2) {
  std::string audio_text = "<|text_start|>";

  const bool is_v0_3 = (type == OUTETTS_V0_3);
  if (type != OUTETTS_V0_2 && !is_v0_3) {
    return audio_text;
  }

  const std::string separator = is_v0_3 ? "<|space|>" : "<|text_sep|>";
  for (const auto &entry : speaker["words"]) {
    audio_text += entry["word"].get<std::string>();
    audio_text += separator;
  }
  return audio_text;
}
|
|
33
|
-
|
|
34
|
-
// Builds the audio half of a speaker prompt. The result always starts with
// "<|audio_start|>\n"; for OUTETTS_V0_2 / OUTETTS_V0_3 each entry of
// speaker["words"] then adds one line of the form
//   <word><|t_D.DD|>[<|code_start|>]<|c1|><|c2|>...<end-token>\n
// where the duration is printed with two decimals, "<|code_start|>" is only
// emitted for v0.2, and the end token is "<|space|>" (v0.3) or
// "<|code_end|>" (v0.2). Any other type yields only the start line.
std::string audio_data_from_speaker(json speaker,
                                    const tts_type type = OUTETTS_V0_2) {
  std::string audio_data = "<|audio_start|>\n";

  const bool is_v0_3 = (type == OUTETTS_V0_3);
  if (type != OUTETTS_V0_2 && !is_v0_3) {
    return audio_data;
  }

  const std::string code_start = is_v0_3 ? "" : "<|code_start|>";
  const std::string code_end = is_v0_3 ? "<|space|>" : "<|code_end|>";

  for (const auto &entry : speaker["words"]) {
    const std::string text = entry["word"].get<std::string>();
    const double duration_value = entry["duration"].get<double>();
    const std::vector<int> codes = entry["codes"].get<std::vector<int>>();

    // Assemble one line per word, then append it to the running output.
    std::ostringstream line;
    line << text;
    line << "<|t_" << std::fixed << std::setprecision(2) << duration_value
         << "|>";
    line << code_start;
    for (const int code : codes) {
      line << "<|" << code << "|>";
    }
    line << code_end << "\n";

    audio_data += line.str();
  }
  return audio_data;
}
|
|
61
|
-
|
|
62
|
-
// English names of the numbers 0-19, keyed by value.
static const std::map<int, std::string> ones = {
    {0, "zero"},     {1, "one"},        {2, "two"},       {3, "three"},
    {4, "four"},     {5, "five"},       {6, "six"},       {7, "seven"},
    {8, "eight"},    {9, "nine"},       {10, "ten"},      {11, "eleven"},
    {12, "twelve"},  {13, "thirteen"},  {14, "fourteen"}, {15, "fifteen"},
    {16, "sixteen"}, {17, "seventeen"}, {18, "eighteen"}, {19, "nineteen"}};

// English names of the multiples of ten 20-90, keyed by the tens digit.
static const std::map<int, std::string> tens = {
    {2, "twenty"}, {3, "thirty"},  {4, "forty"},  {5, "fifty"},
    {6, "sixty"},  {7, "seventy"}, {8, "eighty"}, {9, "ninety"}};

// Convert a number less than 1000 to words.
// Returns an empty string for 0. The result never has leading or trailing
// whitespace, so callers can join groups with a single space.
std::string convert_less_than_thousand(int num) {
  std::string result;

  if (num >= 100) {
    result += ones.at(num / 100) + " hundred";
    num %= 100;
    if (num > 0) {
      // Only separate when something follows ("one hundred five").
      result += ' ';
    }
  }

  if (num >= 20) {
    result += tens.at(num / 10);
    if (num % 10 > 0) {
      result += "-" + ones.at(num % 10);
    }
  } else if (num > 0) {
    result += ones.at(num);
  }

  return result;
}

// Spells out a decimal number given as text, e.g. "1234.5" ->
// "one thousand two hundred thirty-four point five".
//
// The part before the first '.' is parsed as a signed 64-bit integer, so
// values up to the quintillions are supported. Digits after the '.' are read
// out one by one. A negative integer part produces no integer words (only
// the optional " point ..." suffix).
//
// Returns a single space " " when the text cannot be converted (no leading
// integer, value out of range, or a non-digit in the fractional part).
std::string number_to_words(const std::string &number_str) {
  try {
    const size_t decimal_pos = number_str.find('.');
    const std::string integer_part = number_str.substr(0, decimal_pos);

    // 64-bit parse: std::stoi would reject anything above INT_MAX.
    long long remaining = std::stoll(integer_part);
    std::string result;

    if (remaining == 0) {
      result = "zero";
    } else {
      struct Scale {
        long long value;
        const char *name;
      };
      // Largest first, so each group is peeled off in reading order.
      static const Scale scales[] = {
          {1000000000000000000LL, "quintillion"},
          {1000000000000000LL, "quadrillion"},
          {1000000000000LL, "trillion"},
          {1000000000LL, "billion"},
          {1000000LL, "million"},
          {1000LL, "thousand"},
      };

      // Joins groups with exactly one space and no trailing whitespace.
      auto append_group = [&result](const std::string &words) {
        if (!result.empty()) {
          result += ' ';
        }
        result += words;
      };

      for (const Scale &scale : scales) {
        if (remaining >= scale.value) {
          const int group = static_cast<int>(remaining / scale.value);
          append_group(convert_less_than_thousand(group) + " " + scale.name);
          remaining %= scale.value;
        }
      }

      if (remaining > 0) {
        append_group(convert_less_than_thousand(static_cast<int>(remaining)));
      }
    }

    // Handle decimal part: read every fractional digit individually.
    if (decimal_pos != std::string::npos) {
      result += " point";
      const std::string decimal_part = number_str.substr(decimal_pos + 1);
      for (const char digit : decimal_part) {
        if (digit < '0' || digit > '9') {
          // Without this check ':'..'C' would index ones[10..19].
          return " ";
        }
        result += " " + ones.at(digit - '0');
      }
    }

    return result;
  } catch (const std::exception &) {
    // Skip if fails
    return " ";
  }
}
|
|
143
|
-
|
|
144
|
-
std::string replace_numbers_with_words(const std::string &input_text) {
|
|
145
|
-
std::regex number_pattern(R"(\d+(\.\d+)?)");
|
|
146
|
-
std::string result;
|
|
147
|
-
auto it = std::sregex_iterator(input_text.begin(), input_text.end(),
|
|
148
|
-
number_pattern);
|
|
149
|
-
auto end = std::sregex_iterator();
|
|
150
|
-
|
|
151
|
-
size_t last_pos = 0;
|
|
152
|
-
for (std::sregex_iterator i = it; i != end; ++i) {
|
|
153
|
-
const std::smatch &match = *i;
|
|
154
|
-
result.append(input_text, last_pos, match.position() - last_pos);
|
|
155
|
-
result.append(number_to_words(match.str()));
|
|
156
|
-
last_pos = match.position() + match.length();
|
|
157
|
-
}
|
|
158
|
-
result.append(input_text, last_pos);
|
|
159
|
-
|
|
160
|
-
return result;
|
|
161
|
-
}
|
|
162
|
-
|
|
163
|
-
std::string process_text(const std::string &text,
|
|
164
|
-
const tts_type tts_type = OUTETTS_V0_2) {
|
|
165
|
-
std::string processed_text = replace_numbers_with_words(text);
|
|
166
|
-
|
|
167
|
-
if (tts_type == OUTETTS_V0_2 || tts_type == OUTETTS_V0_3) {
|
|
168
|
-
processed_text = anyascii_string(processed_text);
|
|
169
|
-
|
|
170
|
-
std::regex dashes(R"([—–-])");
|
|
171
|
-
processed_text = std::regex_replace(processed_text, dashes, " ");
|
|
172
|
-
}
|
|
173
|
-
|
|
174
|
-
std::transform(processed_text.begin(), processed_text.end(),
|
|
175
|
-
processed_text.begin(), ::tolower);
|
|
176
|
-
|
|
177
|
-
std::regex special_chars(R"([-_/,\.\\])");
|
|
178
|
-
processed_text = std::regex_replace(processed_text, special_chars, " ");
|
|
179
|
-
|
|
180
|
-
std::regex non_alpha(R"([^a-z\s])");
|
|
181
|
-
processed_text = std::regex_replace(processed_text, non_alpha, "");
|
|
182
|
-
|
|
183
|
-
std::regex multiple_spaces(R"(\s+)");
|
|
184
|
-
processed_text = std::regex_replace(processed_text, multiple_spaces, " ");
|
|
185
|
-
|
|
186
|
-
processed_text =
|
|
187
|
-
std::regex_replace(processed_text, std::regex(R"(^\s+|\s+$)"), "");
|
|
188
|
-
|
|
189
|
-
/*
|
|
190
|
-
Replace spaces with the separator token same as in line 365
|
|
191
|
-
|
|
192
|
-
for (auto & c : prompt_user) {
|
|
193
|
-
if (c == ' ') {
|
|
194
|
-
prompt_clean += "<|text_sep|>";
|
|
195
|
-
*/
|
|
196
|
-
std::string separator =
|
|
197
|
-
(tts_type == OUTETTS_V0_3) ? "<|space|>" : "<|text_sep|>";
|
|
198
|
-
processed_text =
|
|
199
|
-
std::regex_replace(processed_text, std::regex(R"(\s)"), separator);
|
|
200
|
-
|
|
201
|
-
return processed_text;
|
|
202
|
-
}
|
|
203
|
-
|
|
204
|
-
// M_PI is not part of standard C/C++. Supply it only when the math headers
// did not, instead of keying on _WIN32: this avoids redefining the macro on
// Windows toolchains that already provide it and also covers other
// platforms that omit it.
#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif
|
|
207
|
-
|
|
208
|
-
// Fills output[0..length) with a Hann window:
//   output[i] = 0.5 * (1 - cos(2*pi*i / D))
// where D = length when periodic is true and D = length - 1 otherwise.
//
// @param length    number of samples to write
// @param periodic  selects the denominator D as described above
// @param output    destination buffer holding at least `length` floats
void fill_hann_window(int length, bool periodic, float *output) {
  if (length == 1 && !periodic) {
    // D would be 0 here and the formula evaluates 0/0 (NaN). A one-sample
    // symmetric window is just its peak value.
    output[0] = 1.0f;
    return;
  }

  const int denominator = periodic ? length : length - 1;
  for (int i = 0; i < length; i++) {
    output[i] = 0.5 * (1.0 - cosf((2.0 * M_PI * i) / denominator));
  }
}
|
|
217
|
-
|
|
218
|
-
// Writes the cosine and sine of the angle 2*pi*k/N to *real and *imag
// respectively.
void twiddle(float *real, float *imag, int k, int N) {
  const float theta = 2 * M_PI * k / N;
  *real = cos(theta);
  *imag = sin(theta);
}
|
|
223
|
-
|
|
224
|
-
void irfft(int n, const float *inp_cplx, float *out_real) {
|
|
225
|
-
int N = n / 2 + 1;
|
|
226
|
-
|
|
227
|
-
std::vector<float> real_input(N);
|
|
228
|
-
std::vector<float> imag_input(N);
|
|
229
|
-
for (int i = 0; i < N; ++i) {
|
|
230
|
-
real_input[i] = inp_cplx[2 * i];
|
|
231
|
-
imag_input[i] = inp_cplx[2 * i + 1];
|
|
232
|
-
}
|
|
233
|
-
|
|
234
|
-
std::vector<float> real_output(n);
|
|
235
|
-
std::vector<float> imag_output(n);
|
|
236
|
-
|
|
237
|
-
for (int k = 0; k < n; ++k) {
|
|
238
|
-
real_output[k] = 0.0f;
|
|
239
|
-
imag_output[k] = 0.0f;
|
|
240
|
-
for (int m = 0; m < N; ++m) {
|
|
241
|
-
float twiddle_real;
|
|
242
|
-
float twiddle_imag;
|
|
243
|
-
|
|
244
|
-
twiddle(&twiddle_real, &twiddle_imag, k * m, n);
|
|
245
|
-
|
|
246
|
-
real_output[k] +=
|
|
247
|
-
real_input[m] * twiddle_real - imag_input[m] * twiddle_imag;
|
|
248
|
-
imag_output[k] +=
|
|
249
|
-
real_input[m] * twiddle_imag + imag_input[m] * twiddle_real;
|
|
250
|
-
}
|
|
251
|
-
}
|
|
252
|
-
|
|
253
|
-
for (int i = 0; i < n; ++i) {
|
|
254
|
-
out_real[i] = real_output[i] / N;
|
|
255
|
-
}
|
|
256
|
-
}
|
|
257
|
-
|
|
258
|
-
// Overlap-adds consecutive frames of n_win samples from `data` into
// `output`.
//
// Frame c (samples data[c*n_win .. (c+1)*n_win)) is added at output
// positions starting at c*n_hop - n_pad; samples that fall outside
// [0, n_out) are dropped. The accumulation buffer has n_out samples and is
// truncated to n_out - 2*n_pad samples before returning.
//
// @param data    concatenated frames, n_win samples each
// @param n_out   length of the accumulation buffer
// @param n_win   frame length
// @param n_hop   distance between the starts of consecutive frames
// @param n_pad   offset subtracted from every frame start; 2*n_pad samples
//                are cut from the end of the result
// @param output  receives the result; previous contents are discarded
void fold(const std::vector<float> &data, int64_t n_out, int64_t n_win,
          int64_t n_hop, int64_t n_pad, std::vector<float> &output) {
  // assign(), not resize(): resize() keeps existing elements, so a reused
  // output vector would have the new frames added on top of stale values.
  output.assign(n_out, 0.0f);

  const int64_t n_data = static_cast<int64_t>(data.size());

  // col_idx advances by one per visited sample and never goes back, so once
  // it reaches n_data no later column can contribute: stop there.
  int64_t col_idx = 0;
  for (int64_t w_col = 0; w_col < n_out && col_idx < n_data; ++w_col) {
    const int64_t start = w_col * n_hop - n_pad;
    const int64_t end = start + n_win;

    for (int64_t w_im = start; w_im < end; ++w_im) {
      if (w_im >= 0 && w_im < n_out && col_idx < n_data) {
        output[w_im] += data[col_idx];
      }
      col_idx++;
    }
  }

  output.resize(n_out - 2 * n_pad);
}
|
|
282
|
-
|
|
283
|
-
std::vector<float> embd_to_audio(const float *embd, const int n_codes,
|
|
284
|
-
const int n_embd, const int n_thread) {
|
|
285
|
-
const int n_fft = 1280;
|
|
286
|
-
const int n_hop = 320;
|
|
287
|
-
const int n_win = 1280;
|
|
288
|
-
const int n_pad = (n_win - n_hop) / 2;
|
|
289
|
-
const int n_out = (n_codes - 1) * n_hop + n_win;
|
|
290
|
-
|
|
291
|
-
std::vector<float> hann(n_fft);
|
|
292
|
-
|
|
293
|
-
fill_hann_window(hann.size(), true, hann.data());
|
|
294
|
-
|
|
295
|
-
int n_spec = n_embd * n_codes;
|
|
296
|
-
|
|
297
|
-
std::vector<float> E(n_spec);
|
|
298
|
-
std::vector<float> S(n_spec);
|
|
299
|
-
std::vector<float> ST(n_spec);
|
|
300
|
-
|
|
301
|
-
for (int l = 0; l < n_codes; ++l) {
|
|
302
|
-
for (int k = 0; k < n_embd; ++k) {
|
|
303
|
-
E[k * n_codes + l] = embd[l * n_embd + k];
|
|
304
|
-
}
|
|
305
|
-
}
|
|
306
|
-
|
|
307
|
-
for (int k = 0; k < n_embd / 2; ++k) {
|
|
308
|
-
for (int l = 0; l < n_codes; ++l) {
|
|
309
|
-
float mag = E[(k)*n_codes + l];
|
|
310
|
-
float phi = E[(k + n_embd / 2) * n_codes + l];
|
|
311
|
-
|
|
312
|
-
mag = exp(mag);
|
|
313
|
-
|
|
314
|
-
if (mag > 1e2) {
|
|
315
|
-
mag = 1e2;
|
|
316
|
-
}
|
|
317
|
-
S[2 * (k * n_codes + l) + 0] = mag * cosf(phi);
|
|
318
|
-
S[2 * (k * n_codes + l) + 1] = mag * sinf(phi);
|
|
319
|
-
}
|
|
320
|
-
}
|
|
321
|
-
|
|
322
|
-
for (int l = 0; l < n_codes; ++l) {
|
|
323
|
-
for (int k = 0; k < n_embd / 2; ++k) {
|
|
324
|
-
ST[l * n_embd + 2 * k + 0] = S[2 * (k * n_codes + l) + 0];
|
|
325
|
-
ST[l * n_embd + 2 * k + 1] = S[2 * (k * n_codes + l) + 1];
|
|
326
|
-
}
|
|
327
|
-
}
|
|
328
|
-
|
|
329
|
-
std::vector<float> res(n_codes * n_fft);
|
|
330
|
-
std::vector<float> hann2(n_codes * n_fft);
|
|
331
|
-
|
|
332
|
-
std::vector<std::thread> workers(n_thread);
|
|
333
|
-
for (int i = 0; i < n_thread; ++i) {
|
|
334
|
-
workers[i] = std::thread([&, i]() {
|
|
335
|
-
for (int l = i; l < n_codes; l += n_thread) {
|
|
336
|
-
irfft(n_fft, ST.data() + l * n_embd, res.data() + l * n_fft);
|
|
337
|
-
for (int j = 0; j < n_fft; ++j) {
|
|
338
|
-
res[l * n_fft + j] *= hann[j];
|
|
339
|
-
hann2[l * n_fft + j] = hann[j] * hann[j];
|
|
340
|
-
}
|
|
341
|
-
}
|
|
342
|
-
});
|
|
343
|
-
}
|
|
344
|
-
for (int i = 0; i < n_thread; ++i) {
|
|
345
|
-
workers[i].join();
|
|
346
|
-
}
|
|
347
|
-
|
|
348
|
-
std::vector<float> audio;
|
|
349
|
-
std::vector<float> env;
|
|
350
|
-
|
|
351
|
-
fold(res, n_out, n_win, n_hop, n_pad, audio);
|
|
352
|
-
fold(hann2, n_out, n_win, n_hop, n_pad, env); // TODO: can be done once
|
|
353
|
-
|
|
354
|
-
for (size_t i = 0; i < audio.size(); ++i) {
|
|
355
|
-
audio[i] /= env[i];
|
|
356
|
-
}
|
|
357
|
-
|
|
358
|
-
return audio;
|
|
359
|
-
}
|
|
360
|
-
|
|
361
|
-
const char *get_tts_grammar(const tts_type type) {
|
|
362
|
-
switch (type) {
|
|
363
|
-
case OUTETTS_V0_1:
|
|
364
|
-
return OUTETTS_V1_GRAMMAR;
|
|
365
|
-
case OUTETTS_V0_2:
|
|
366
|
-
case OUTETTS_V0_3:
|
|
367
|
-
return OUTETTS_V2_GRAMMAR;
|
|
368
|
-
default:
|
|
369
|
-
return nullptr;
|
|
370
|
-
}
|
|
371
|
-
}
|
package/src/tts_utils.h
DELETED
|
@@ -1,103 +0,0 @@
|
|
|
1
|
-
#pragma once
|
|
2
|
-
|
|
3
|
-
#include <regex>
|
|
4
|
-
#include <sstream>
|
|
5
|
-
#include <string>
|
|
6
|
-
#include <thread>
|
|
7
|
-
#include <vector>
|
|
8
|
-
|
|
9
|
-
#include <nlohmann/json.hpp>
|
|
10
|
-
|
|
11
|
-
// Identifiers for the TTS variants distinguished by the helpers declared in
// this header.
enum tts_type {
  UNKNOWN = -1,
  OUTETTS_V0_1 = 1,
  OUTETTS_V0_2 = 2,
  OUTETTS_V0_3 = 3
};
|
|
12
|
-
|
|
13
|
-
static std::string anyascii_string(const std::string &input);
|
|
14
|
-
|
|
15
|
-
std::string audio_text_from_speaker(nlohmann::json speaker,
|
|
16
|
-
const tts_type type);
|
|
17
|
-
std::string audio_data_from_speaker(nlohmann::json speaker,
|
|
18
|
-
const tts_type type);
|
|
19
|
-
std::string process_text(const std::string &text, const tts_type tts_type);
|
|
20
|
-
std::vector<float> embd_to_audio(const float *embd, const int n_codes,
|
|
21
|
-
const int n_embd, const int n_thread);
|
|
22
|
-
|
|
23
|
-
const char *get_tts_grammar(const tts_type type);
|
|
24
|
-
|
|
25
|
-
// the default speaker profile is from:
|
|
26
|
-
// https://github.com/edwko/OuteTTS/blob/main/outetts/version/v1/default_speakers/en_male_1.json
|
|
27
|
-
static const char *DEFAULT_AUDIO_TEXT =
|
|
28
|
-
"<|text_start|>the<|text_sep|>overall<|text_sep|>package<|text_sep|>from<|"
|
|
29
|
-
"text_sep|>just<|text_sep|>two<|text_sep|>people<|text_sep|>is<|text_sep|>"
|
|
30
|
-
"pretty<|text_sep|>remarkable<|text_sep|>sure<|text_sep|>i<|text_sep|>have<"
|
|
31
|
-
"|text_sep|>some<|text_sep|>critiques<|text_sep|>about<|text_sep|>some<|"
|
|
32
|
-
"text_sep|>of<|text_sep|>the<|text_sep|>gameplay<|text_sep|>aspects<|text_"
|
|
33
|
-
"sep|>but<|text_sep|>its<|text_sep|>still<|text_sep|>really<|text_sep|>"
|
|
34
|
-
"enjoyable<|text_sep|>and<|text_sep|>it<|text_sep|>looks<|text_sep|>lovely<"
|
|
35
|
-
"|text_sep|>";
|
|
36
|
-
static const char *DEFAULT_AUDIO_DATA = R"(<|audio_start|>
|
|
37
|
-
the<|t_0.08|><|code_start|><|257|><|740|><|636|><|913|><|788|><|1703|><|code_end|>
|
|
38
|
-
overall<|t_0.36|><|code_start|><|127|><|201|><|191|><|774|><|700|><|532|><|1056|><|557|><|798|><|298|><|1741|><|747|><|1662|><|1617|><|1702|><|1527|><|368|><|1588|><|1049|><|1008|><|1625|><|747|><|1576|><|728|><|1019|><|1696|><|1765|><|code_end|>
|
|
39
|
-
package<|t_0.56|><|code_start|><|935|><|584|><|1319|><|627|><|1016|><|1491|><|1344|><|1117|><|1526|><|1040|><|239|><|1435|><|951|><|498|><|723|><|1180|><|535|><|789|><|1649|><|1637|><|78|><|465|><|1668|><|901|><|595|><|1675|><|117|><|1009|><|1667|><|320|><|840|><|79|><|507|><|1762|><|1508|><|1228|><|1768|><|802|><|1450|><|1457|><|232|><|639|><|code_end|>
|
|
40
|
-
from<|t_0.19|><|code_start|><|604|><|782|><|1682|><|872|><|1532|><|1600|><|1036|><|1761|><|647|><|1554|><|1371|><|653|><|1595|><|950|><|code_end|>
|
|
41
|
-
just<|t_0.25|><|code_start|><|1782|><|1670|><|317|><|786|><|1748|><|631|><|599|><|1155|><|1364|><|1524|><|36|><|1591|><|889|><|1535|><|541|><|440|><|1532|><|50|><|870|><|code_end|>
|
|
42
|
-
two<|t_0.24|><|code_start|><|1681|><|1510|><|673|><|799|><|805|><|1342|><|330|><|519|><|62|><|640|><|1138|><|565|><|1552|><|1497|><|1552|><|572|><|1715|><|1732|><|code_end|>
|
|
43
|
-
people<|t_0.39|><|code_start|><|593|><|274|><|136|><|740|><|691|><|633|><|1484|><|1061|><|1138|><|1485|><|344|><|428|><|397|><|1562|><|645|><|917|><|1035|><|1449|><|1669|><|487|><|442|><|1484|><|1329|><|1832|><|1704|><|600|><|761|><|653|><|269|><|code_end|>
|
|
44
|
-
is<|t_0.16|><|code_start|><|566|><|583|><|1755|><|646|><|1337|><|709|><|802|><|1008|><|485|><|1583|><|652|><|10|><|code_end|>
|
|
45
|
-
pretty<|t_0.32|><|code_start|><|1818|><|1747|><|692|><|733|><|1010|><|534|><|406|><|1697|><|1053|><|1521|><|1355|><|1274|><|816|><|1398|><|211|><|1218|><|817|><|1472|><|1703|><|686|><|13|><|822|><|445|><|1068|><|code_end|>
|
|
46
|
-
remarkable<|t_0.68|><|code_start|><|230|><|1048|><|1705|><|355|><|706|><|1149|><|1535|><|1787|><|1356|><|1396|><|835|><|1583|><|486|><|1249|><|286|><|937|><|1076|><|1150|><|614|><|42|><|1058|><|705|><|681|><|798|><|934|><|490|><|514|><|1399|><|572|><|1446|><|1703|><|1346|><|1040|><|1426|><|1304|><|664|><|171|><|1530|><|625|><|64|><|1708|><|1830|><|1030|><|443|><|1509|><|1063|><|1605|><|1785|><|721|><|1440|><|923|><|code_end|>
|
|
47
|
-
sure<|t_0.36|><|code_start|><|792|><|1780|><|923|><|1640|><|265|><|261|><|1525|><|567|><|1491|><|1250|><|1730|><|362|><|919|><|1766|><|543|><|1|><|333|><|113|><|970|><|252|><|1606|><|133|><|302|><|1810|><|1046|><|1190|><|1675|><|code_end|>
|
|
48
|
-
i<|t_0.08|><|code_start|><|123|><|439|><|1074|><|705|><|1799|><|637|><|code_end|>
|
|
49
|
-
have<|t_0.16|><|code_start|><|1509|><|599|><|518|><|1170|><|552|><|1029|><|1267|><|864|><|419|><|143|><|1061|><|0|><|code_end|>
|
|
50
|
-
some<|t_0.16|><|code_start|><|619|><|400|><|1270|><|62|><|1370|><|1832|><|917|><|1661|><|167|><|269|><|1366|><|1508|><|code_end|>
|
|
51
|
-
critiques<|t_0.60|><|code_start|><|559|><|584|><|1163|><|1129|><|1313|><|1728|><|721|><|1146|><|1093|><|577|><|928|><|27|><|630|><|1080|><|1346|><|1337|><|320|><|1382|><|1175|><|1682|><|1556|><|990|><|1683|><|860|><|1721|><|110|><|786|><|376|><|1085|><|756|><|1523|><|234|><|1334|><|1506|><|1578|><|659|><|612|><|1108|><|1466|><|1647|><|308|><|1470|><|746|><|556|><|1061|><|code_end|>
|
|
52
|
-
about<|t_0.29|><|code_start|><|26|><|1649|><|545|><|1367|><|1263|><|1728|><|450|><|859|><|1434|><|497|><|1220|><|1285|><|179|><|755|><|1154|><|779|><|179|><|1229|><|1213|><|922|><|1774|><|1408|><|code_end|>
|
|
53
|
-
some<|t_0.23|><|code_start|><|986|><|28|><|1649|><|778|><|858|><|1519|><|1|><|18|><|26|><|1042|><|1174|><|1309|><|1499|><|1712|><|1692|><|1516|><|1574|><|code_end|>
|
|
54
|
-
of<|t_0.07|><|code_start|><|197|><|716|><|1039|><|1662|><|64|><|code_end|>
|
|
55
|
-
the<|t_0.08|><|code_start|><|1811|><|1568|><|569|><|886|><|1025|><|1374|><|code_end|>
|
|
56
|
-
gameplay<|t_0.48|><|code_start|><|1269|><|1092|><|933|><|1362|><|1762|><|1700|><|1675|><|215|><|781|><|1086|><|461|><|838|><|1022|><|759|><|649|><|1416|><|1004|><|551|><|909|><|787|><|343|><|830|><|1391|><|1040|><|1622|><|1779|><|1360|><|1231|><|1187|><|1317|><|76|><|997|><|989|><|978|><|737|><|189|><|code_end|>
|
|
57
|
-
aspects<|t_0.56|><|code_start|><|1423|><|797|><|1316|><|1222|><|147|><|719|><|1347|><|386|><|1390|><|1558|><|154|><|440|><|634|><|592|><|1097|><|1718|><|712|><|763|><|1118|><|1721|><|1311|><|868|><|580|><|362|><|1435|><|868|><|247|><|221|><|886|><|1145|><|1274|><|1284|><|457|><|1043|><|1459|><|1818|><|62|><|599|><|1035|><|62|><|1649|><|778|><|code_end|>
|
|
58
|
-
but<|t_0.20|><|code_start|><|780|><|1825|><|1681|><|1007|><|861|><|710|><|702|><|939|><|1669|><|1491|><|613|><|1739|><|823|><|1469|><|648|><|code_end|>
|
|
59
|
-
its<|t_0.09|><|code_start|><|92|><|688|><|1623|><|962|><|1670|><|527|><|599|><|code_end|>
|
|
60
|
-
still<|t_0.27|><|code_start|><|636|><|10|><|1217|><|344|><|713|><|957|><|823|><|154|><|1649|><|1286|><|508|><|214|><|1760|><|1250|><|456|><|1352|><|1368|><|921|><|615|><|5|><|code_end|>
|
|
61
|
-
really<|t_0.36|><|code_start|><|55|><|420|><|1008|><|1659|><|27|><|644|><|1266|><|617|><|761|><|1712|><|109|><|1465|><|1587|><|503|><|1541|><|619|><|197|><|1019|><|817|><|269|><|377|><|362|><|1381|><|507|><|1488|><|4|><|1695|><|code_end|>
|
|
62
|
-
enjoyable<|t_0.49|><|code_start|><|678|><|501|><|864|><|319|><|288|><|1472|><|1341|><|686|><|562|><|1463|><|619|><|1563|><|471|><|911|><|730|><|1811|><|1006|><|520|><|861|><|1274|><|125|><|1431|><|638|><|621|><|153|><|876|><|1770|><|437|><|987|><|1653|><|1109|><|898|><|1285|><|80|><|593|><|1709|><|843|><|code_end|>
|
|
63
|
-
and<|t_0.15|><|code_start|><|1285|><|987|><|303|><|1037|><|730|><|1164|><|502|><|120|><|1737|><|1655|><|1318|><|code_end|>
|
|
64
|
-
it<|t_0.09|><|code_start|><|848|><|1366|><|395|><|1601|><|1513|><|593|><|1302|><|code_end|>
|
|
65
|
-
looks<|t_0.27|><|code_start|><|1281|><|1266|><|1755|><|572|><|248|><|1751|><|1257|><|695|><|1380|><|457|><|659|><|585|><|1315|><|1105|><|1776|><|736|><|24|><|736|><|654|><|1027|><|code_end|>
|
|
66
|
-
lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|1481|><|1721|><|1123|><|438|><|1246|><|1251|><|795|><|659|><|1381|><|1658|><|217|><|1772|><|562|><|952|><|107|><|1129|><|1112|><|467|><|550|><|1079|><|840|><|1615|><|1469|><|1380|><|168|><|917|><|836|><|1827|><|437|><|583|><|67|><|595|><|1087|><|1646|><|1493|><|1677|><|code_end|>)";
|
|
67
|
-
|
|
68
|
-
static const char *OUTETTS_V1_GRAMMAR = R"(
|
|
69
|
-
root ::= NL? wordAudioBlock+ audioEnd NL eos?
|
|
70
|
-
wordAudioBlock ::= WORD codeBlock NL
|
|
71
|
-
codeBlock ::= TIME CODE*
|
|
72
|
-
eos ::= "<|im_end|>"
|
|
73
|
-
codeStart ::= "<|code_start|>"
|
|
74
|
-
codeEnd ::= "<|code_end|>"
|
|
75
|
-
audioEnd ::= "<|audio_end|>"
|
|
76
|
-
WORD ::= [A-Za-z]+
|
|
77
|
-
NL ::= "\n"
|
|
78
|
-
TIME ::= "<|t_" DECIMAL "|>"
|
|
79
|
-
CODE ::= "<|" DIGITS "|>"
|
|
80
|
-
DIGITS ::= [0-9]+
|
|
81
|
-
DECIMAL ::= [0-9]+ "." [0-9]+
|
|
82
|
-
)";
|
|
83
|
-
|
|
84
|
-
static const char *OUTETTS_V2_GRAMMAR = R"(
|
|
85
|
-
root ::= NL? content+ audioEnd NL eos?
|
|
86
|
-
content ::= wordAudioBlock | emotionBlock
|
|
87
|
-
wordAudioBlock ::= WORD punch* codeBlock space NL
|
|
88
|
-
codeBlock ::= TIME CODE*
|
|
89
|
-
emotionBlock ::= emotionStart TEXT emotionEnd space NL
|
|
90
|
-
TEXT ::= [A-Za-z0-9 .,?!]+
|
|
91
|
-
eos ::= "<|im_end|>"
|
|
92
|
-
emotionStart ::= "<|emotion_start|>"
|
|
93
|
-
emotionEnd ::= "<|emotion_end|>"
|
|
94
|
-
audioEnd ::= "<|audio_end|>"
|
|
95
|
-
space ::= "<|space|>"
|
|
96
|
-
WORD ::= [A-Za-z]+
|
|
97
|
-
NL ::= [\n]
|
|
98
|
-
TIME ::= "<|t_" DECIMAL "|>"
|
|
99
|
-
CODE ::= "<|" DIGITS "|>"
|
|
100
|
-
DIGITS ::= [0-9]+
|
|
101
|
-
DECIMAL ::= [0-9]+ "." [0-9]+
|
|
102
|
-
punch ::= "<|" [a-z_]+ "|>"
|
|
103
|
-
)";
|