llama-cpp-capacitor 0.0.6 → 0.0.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/CMakeLists.txt +9 -9
- package/cpp/LICENSE +21 -0
- package/cpp/README.md +4 -0
- package/cpp/anyascii.c +22223 -0
- package/cpp/anyascii.h +42 -0
- package/cpp/chat-parser.cpp +393 -0
- package/cpp/chat-parser.h +120 -0
- package/cpp/chat.cpp +2315 -0
- package/cpp/chat.h +221 -0
- package/cpp/common.cpp +1619 -0
- package/cpp/common.h +744 -0
- package/cpp/ggml-alloc.c +1028 -0
- package/cpp/ggml-alloc.h +76 -0
- package/cpp/ggml-backend-impl.h +255 -0
- package/cpp/ggml-backend-reg.cpp +600 -0
- package/cpp/ggml-backend.cpp +2118 -0
- package/cpp/ggml-backend.h +354 -0
- package/cpp/ggml-common.h +1878 -0
- package/cpp/ggml-cpp.h +39 -0
- package/cpp/ggml-cpu/amx/amx.cpp +221 -0
- package/cpp/ggml-cpu/amx/amx.h +8 -0
- package/cpp/ggml-cpu/amx/common.h +91 -0
- package/cpp/ggml-cpu/amx/mmq.cpp +2512 -0
- package/cpp/ggml-cpu/amx/mmq.h +10 -0
- package/cpp/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
- package/cpp/ggml-cpu/arch/arm/quants.c +3650 -0
- package/cpp/ggml-cpu/arch/arm/repack.cpp +1891 -0
- package/cpp/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
- package/cpp/ggml-cpu/arch/x86/quants.c +3820 -0
- package/cpp/ggml-cpu/arch/x86/repack.cpp +6307 -0
- package/cpp/ggml-cpu/arch-fallback.h +215 -0
- package/cpp/ggml-cpu/binary-ops.cpp +158 -0
- package/cpp/ggml-cpu/binary-ops.h +16 -0
- package/cpp/ggml-cpu/common.h +73 -0
- package/cpp/ggml-cpu/ggml-cpu-impl.h +525 -0
- package/cpp/ggml-cpu/ggml-cpu.c +3578 -0
- package/cpp/ggml-cpu/ggml-cpu.cpp +672 -0
- package/cpp/ggml-cpu/ops.cpp +10587 -0
- package/cpp/ggml-cpu/ops.h +114 -0
- package/cpp/ggml-cpu/quants.c +1193 -0
- package/cpp/ggml-cpu/quants.h +97 -0
- package/cpp/ggml-cpu/repack.cpp +1982 -0
- package/cpp/ggml-cpu/repack.h +120 -0
- package/cpp/ggml-cpu/simd-mappings.h +1184 -0
- package/cpp/ggml-cpu/traits.cpp +36 -0
- package/cpp/ggml-cpu/traits.h +38 -0
- package/cpp/ggml-cpu/unary-ops.cpp +186 -0
- package/cpp/ggml-cpu/unary-ops.h +28 -0
- package/cpp/ggml-cpu/vec.cpp +348 -0
- package/cpp/ggml-cpu/vec.h +1121 -0
- package/cpp/ggml-cpu.h +145 -0
- package/cpp/ggml-impl.h +622 -0
- package/cpp/ggml-metal-impl.h +688 -0
- package/cpp/ggml-metal.h +66 -0
- package/cpp/ggml-metal.m +6833 -0
- package/cpp/ggml-opt.cpp +1093 -0
- package/cpp/ggml-opt.h +256 -0
- package/cpp/ggml-quants.c +5324 -0
- package/cpp/ggml-quants.h +106 -0
- package/cpp/ggml-threading.cpp +12 -0
- package/cpp/ggml-threading.h +14 -0
- package/cpp/ggml.c +7108 -0
- package/cpp/ggml.h +2492 -0
- package/cpp/gguf.cpp +1358 -0
- package/cpp/gguf.h +202 -0
- package/cpp/json-partial.cpp +256 -0
- package/cpp/json-partial.h +38 -0
- package/cpp/json-schema-to-grammar.cpp +985 -0
- package/cpp/json-schema-to-grammar.h +21 -0
- package/cpp/llama-adapter.cpp +388 -0
- package/cpp/llama-adapter.h +76 -0
- package/cpp/llama-arch.cpp +2355 -0
- package/cpp/llama-arch.h +499 -0
- package/cpp/llama-batch.cpp +875 -0
- package/cpp/llama-batch.h +160 -0
- package/cpp/llama-chat.cpp +783 -0
- package/cpp/llama-chat.h +65 -0
- package/cpp/llama-context.cpp +2748 -0
- package/cpp/llama-context.h +306 -0
- package/cpp/llama-cparams.cpp +5 -0
- package/cpp/llama-cparams.h +41 -0
- package/cpp/llama-cpp.h +30 -0
- package/cpp/llama-grammar.cpp +1229 -0
- package/cpp/llama-grammar.h +173 -0
- package/cpp/llama-graph.cpp +1891 -0
- package/cpp/llama-graph.h +810 -0
- package/cpp/llama-hparams.cpp +180 -0
- package/cpp/llama-hparams.h +233 -0
- package/cpp/llama-impl.cpp +167 -0
- package/cpp/llama-impl.h +61 -0
- package/cpp/llama-io.cpp +15 -0
- package/cpp/llama-io.h +35 -0
- package/cpp/llama-kv-cache-iswa.cpp +318 -0
- package/cpp/llama-kv-cache-iswa.h +135 -0
- package/cpp/llama-kv-cache.cpp +2059 -0
- package/cpp/llama-kv-cache.h +374 -0
- package/cpp/llama-kv-cells.h +491 -0
- package/cpp/llama-memory-hybrid.cpp +258 -0
- package/cpp/llama-memory-hybrid.h +137 -0
- package/cpp/llama-memory-recurrent.cpp +1146 -0
- package/cpp/llama-memory-recurrent.h +179 -0
- package/cpp/llama-memory.cpp +59 -0
- package/cpp/llama-memory.h +119 -0
- package/cpp/llama-mmap.cpp +600 -0
- package/cpp/llama-mmap.h +68 -0
- package/cpp/llama-model-loader.cpp +1164 -0
- package/cpp/llama-model-loader.h +170 -0
- package/cpp/llama-model-saver.cpp +282 -0
- package/cpp/llama-model-saver.h +37 -0
- package/cpp/llama-model.cpp +19042 -0
- package/cpp/llama-model.h +491 -0
- package/cpp/llama-sampling.cpp +2575 -0
- package/cpp/llama-sampling.h +32 -0
- package/cpp/llama-vocab.cpp +3792 -0
- package/cpp/llama-vocab.h +176 -0
- package/cpp/llama.cpp +358 -0
- package/cpp/llama.h +1373 -0
- package/cpp/log.cpp +427 -0
- package/cpp/log.h +103 -0
- package/cpp/minja/chat-template.hpp +550 -0
- package/cpp/minja/minja.hpp +3009 -0
- package/cpp/nlohmann/json.hpp +25526 -0
- package/cpp/nlohmann/json_fwd.hpp +187 -0
- package/cpp/regex-partial.cpp +204 -0
- package/cpp/regex-partial.h +56 -0
- package/cpp/rn-completion.cpp +681 -0
- package/cpp/rn-completion.h +116 -0
- package/cpp/rn-llama.cpp +345 -0
- package/cpp/rn-llama.h +149 -0
- package/cpp/rn-mtmd.hpp +602 -0
- package/cpp/rn-tts.cpp +591 -0
- package/cpp/rn-tts.h +59 -0
- package/cpp/sampling.cpp +579 -0
- package/cpp/sampling.h +107 -0
- package/cpp/tools/mtmd/clip-impl.h +473 -0
- package/cpp/tools/mtmd/clip.cpp +4322 -0
- package/cpp/tools/mtmd/clip.h +106 -0
- package/cpp/tools/mtmd/miniaudio/miniaudio.h +93468 -0
- package/cpp/tools/mtmd/mtmd-audio.cpp +769 -0
- package/cpp/tools/mtmd/mtmd-audio.h +47 -0
- package/cpp/tools/mtmd/mtmd-helper.cpp +460 -0
- package/cpp/tools/mtmd/mtmd-helper.h +91 -0
- package/cpp/tools/mtmd/mtmd.cpp +1066 -0
- package/cpp/tools/mtmd/mtmd.h +298 -0
- package/cpp/tools/mtmd/stb/stb_image.h +7988 -0
- package/cpp/unicode-data.cpp +7034 -0
- package/cpp/unicode-data.h +20 -0
- package/cpp/unicode.cpp +1061 -0
- package/cpp/unicode.h +68 -0
- package/package.json +2 -1
package/cpp/rn-tts.cpp
ADDED
|
@@ -0,0 +1,591 @@
|
|
|
1
|
+
#include "rn-tts.h"
#include "rn-llama.h"
#include "anyascii.h"
#include "common.h"
#include <algorithm>
#include <cctype>
#include <cmath>
#include <codecvt>
#include <cstdint>
#include <iomanip>
#include <locale>
#include <map>
#include <regex>
#include <sstream>
#include <stdexcept>
#include <string>
#include <thread>
#include <vector>
|
|
13
|
+
|
|
14
|
+
namespace rnllama {
|
|
15
|
+
|
|
16
|
+
// Constants definitions
|
|
17
|
+
// Transcript half of the built-in reference speaker: the words of the sample
// utterance joined by "<|text_sep|>". getFormattedAudioCompletion() uses it as
// the prompt prefix when no speaker JSON is supplied.
const std::string default_audio_text = "<|text_start|>the<|text_sep|>overall<|text_sep|>package<|text_sep|>from<|text_sep|>just<|text_sep|>two<|text_sep|>people<|text_sep|>is<|text_sep|>pretty<|text_sep|>remarkable<|text_sep|>sure<|text_sep|>i<|text_sep|>have<|text_sep|>some<|text_sep|>critiques<|text_sep|>about<|text_sep|>some<|text_sep|>of<|text_sep|>the<|text_sep|>gameplay<|text_sep|>aspects<|text_sep|>but<|text_sep|>its<|text_sep|>still<|text_sep|>really<|text_sep|>enjoyable<|text_sep|>and<|text_sep|>it<|text_sep|>looks<|text_sep|>lovely<|text_sep|>";
|
|
18
|
+
|
|
19
|
+
const std::string default_audio_data = R"(<|audio_start|>
|
|
20
|
+
the<|t_0.08|><|code_start|><|257|><|740|><|636|><|913|><|788|><|1703|><|code_end|>
|
|
21
|
+
overall<|t_0.36|><|code_start|><|127|><|201|><|191|><|774|><|700|><|532|><|1056|><|557|><|798|><|298|><|1741|><|747|><|1662|><|1617|><|1702|><|1527|><|368|><|1588|><|1049|><|1008|><|1625|><|747|><|1576|><|728|><|1019|><|1696|><|1765|><|code_end|>
|
|
22
|
+
package<|t_0.56|><|code_start|><|935|><|584|><|1319|><|627|><|1016|><|1491|><|1344|><|1117|><|1526|><|1040|><|239|><|1435|><|951|><|498|><|723|><|1180|><|535|><|789|><|1649|><|1637|><|78|><|465|><|1668|><|901|><|595|><|1675|><|117|><|1009|><|1667|><|320|><|840|><|79|><|507|><|1762|><|1508|><|1228|><|1768|><|802|><|1450|><|1457|><|232|><|639|><|code_end|>
|
|
23
|
+
from<|t_0.19|><|code_start|><|604|><|782|><|1682|><|872|><|1532|><|1600|><|1036|><|1761|><|647|><|1554|><|1371|><|653|><|1595|><|950|><|code_end|>
|
|
24
|
+
just<|t_0.25|><|code_start|><|1782|><|1670|><|317|><|786|><|1748|><|631|><|599|><|1155|><|1364|><|1524|><|36|><|1591|><|889|><|1535|><|541|><|440|><|1532|><|50|><|870|><|code_end|>
|
|
25
|
+
two<|t_0.24|><|code_start|><|1681|><|1510|><|673|><|799|><|805|><|1342|><|330|><|519|><|62|><|640|><|1138|><|565|><|1552|><|1497|><|1552|><|572|><|1715|><|1732|><|code_end|>
|
|
26
|
+
people<|t_0.39|><|code_start|><|593|><|274|><|136|><|740|><|691|><|633|><|1484|><|1061|><|1138|><|1485|><|344|><|428|><|397|><|1562|><|645|><|917|><|1035|><|1449|><|1669|><|487|><|442|><|1484|><|1329|><|1832|><|1704|><|600|><|761|><|653|><|269|><|code_end|>
|
|
27
|
+
is<|t_0.16|><|code_start|><|566|><|583|><|1755|><|646|><|1337|><|709|><|802|><|1008|><|485|><|1583|><|652|><|10|><|code_end|>
|
|
28
|
+
pretty<|t_0.32|><|code_start|><|1818|><|1747|><|692|><|733|><|1010|><|534|><|406|><|1697|><|1053|><|1521|><|1355|><|1274|><|816|><|1398|><|211|><|1218|><|817|><|1472|><|1703|><|686|><|13|><|822|><|445|><|1068|><|code_end|>
|
|
29
|
+
remarkable<|t_0.68|><|code_start|><|230|><|1048|><|1705|><|355|><|706|><|1149|><|1535|><|1787|><|1356|><|1396|><|835|><|1583|><|486|><|1249|><|286|><|937|><|1076|><|1150|><|614|><|42|><|1058|><|705|><|681|><|798|><|934|><|490|><|514|><|1399|><|572|><|1446|><|1703|><|1346|><|1040|><|1426|><|1304|><|664|><|171|><|1530|><|625|><|64|><|1708|><|1830|><|1030|><|443|><|1509|><|1063|><|1605|><|1785|><|721|><|1440|><|923|><|code_end|>
|
|
30
|
+
sure<|t_0.36|><|code_start|><|792|><|1780|><|923|><|1640|><|265|><|261|><|1525|><|567|><|1491|><|1250|><|1730|><|362|><|919|><|1766|><|543|><|1|><|333|><|113|><|970|><|252|><|1606|><|133|><|302|><|1810|><|1046|><|1190|><|1675|><|code_end|>
|
|
31
|
+
i<|t_0.08|><|code_start|><|123|><|439|><|1074|><|705|><|1799|><|637|><|code_end|>
|
|
32
|
+
have<|t_0.16|><|code_start|><|1509|><|599|><|518|><|1170|><|552|><|1029|><|1267|><|864|><|419|><|143|><|1061|><|0|><|code_end|>
|
|
33
|
+
some<|t_0.16|><|code_start|><|619|><|400|><|1270|><|62|><|1370|><|1832|><|917|><|1661|><|167|><|269|><|1366|><|1508|><|code_end|>
|
|
34
|
+
critiques<|t_0.60|><|code_start|><|559|><|584|><|1163|><|1129|><|1313|><|1728|><|721|><|1146|><|1093|><|577|><|928|><|27|><|630|><|1080|><|1346|><|1337|><|320|><|1382|><|1175|><|1682|><|1556|><|990|><|1683|><|860|><|1721|><|110|><|786|><|376|><|1085|><|756|><|1523|><|234|><|1334|><|1506|><|1578|><|659|><|612|><|1108|><|1466|><|1647|><|308|><|1470|><|746|><|556|><|1061|><|code_end|>
|
|
35
|
+
about<|t_0.29|><|code_start|><|26|><|1649|><|545|><|1367|><|1263|><|1728|><|450|><|859|><|1434|><|497|><|1220|><|1285|><|179|><|755|><|1154|><|779|><|179|><|1229|><|1213|><|922|><|1774|><|1408|><|code_end|>
|
|
36
|
+
some<|t_0.23|><|code_start|><|986|><|28|><|1649|><|778|><|858|><|1519|><|1|><|18|><|26|><|1042|><|1174|><|1309|><|1499|><|1712|><|1692|><|1516|><|1574|><|code_end|>
|
|
37
|
+
of<|t_0.07|><|code_start|><|197|><|716|><|1039|><|1662|><|64|><|code_end|>
|
|
38
|
+
the<|t_0.08|><|code_start|><|1811|><|1568|><|569|><|886|><|1025|><|1374|><|code_end|>
|
|
39
|
+
gameplay<|t_0.48|><|code_start|><|1269|><|1092|><|933|><|1362|><|1762|><|1700|><|1675|><|215|><|781|><|1086|><|461|><|838|><|1022|><|759|><|649|><|1416|><|1004|><|551|><|909|><|787|><|343|><|830|><|1391|><|1040|><|1622|><|1779|><|1360|><|1231|><|1187|><|1317|><|76|><|997|><|989|><|978|><|737|><|189|><|code_end|>
|
|
40
|
+
aspects<|t_0.56|><|code_start|><|1423|><|797|><|1316|><|1222|><|147|><|719|><|1347|><|386|><|1390|><|1558|><|154|><|440|><|634|><|592|><|1097|><|1718|><|712|><|763|><|1118|><|1721|><|1311|><|868|><|580|><|362|><|1435|><|868|><|247|><|221|><|886|><|1145|><|1274|><|1284|><|457|><|1043|><|1459|><|1818|><|62|><|599|><|1035|><|62|><|1649|><|778|><|code_end|>
|
|
41
|
+
but<|t_0.20|><|code_start|><|780|><|1825|><|1681|><|1007|><|861|><|710|><|702|><|939|><|1669|><|1491|><|613|><|1739|><|823|><|1469|><|648|><|code_end|>
|
|
42
|
+
its<|t_0.09|><|code_start|><|92|><|688|><|1623|><|962|><|1670|><|527|><|599|><|code_end|>
|
|
43
|
+
still<|t_0.27|><|code_start|><|636|><|10|><|1217|><|344|><|713|><|957|><|823|><|154|><|1649|><|1286|><|508|><|214|><|1760|><|1250|><|456|><|1352|><|1368|><|921|><|615|><|5|><|code_end|>
|
|
44
|
+
really<|t_0.36|><|code_start|><|55|><|420|><|1008|><|1659|><|27|><|644|><|1266|><|617|><|761|><|1712|><|109|><|1465|><|1587|><|503|><|1541|><|619|><|197|><|1019|><|817|><|269|><|377|><|362|><|1381|><|507|><|1488|><|4|><|1695|><|code_end|>
|
|
45
|
+
enjoyable<|t_0.49|><|code_start|><|678|><|501|><|864|><|319|><|288|><|1472|><|1341|><|686|><|562|><|1463|><|619|><|1563|><|471|><|911|><|730|><|1811|><|1006|><|520|><|861|><|1274|><|125|><|1431|><|638|><|621|><|153|><|876|><|1770|><|437|><|987|><|1653|><|1109|><|898|><|1285|><|80|><|593|><|1709|><|843|><|code_end|>
|
|
46
|
+
and<|t_0.15|><|code_start|><|1285|><|987|><|303|><|1037|><|730|><|1164|><|502|><|120|><|1737|><|1655|><|1318|><|code_end|>
|
|
47
|
+
it<|t_0.09|><|code_start|><|848|><|1366|><|395|><|1601|><|1513|><|593|><|1302|><|code_end|>
|
|
48
|
+
looks<|t_0.27|><|code_start|><|1281|><|1266|><|1755|><|572|><|248|><|1751|><|1257|><|695|><|1380|><|457|><|659|><|585|><|1315|><|1105|><|1776|><|736|><|24|><|736|><|654|><|1027|><|code_end|>
|
|
49
|
+
lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|1481|><|1721|><|1123|><|438|><|1246|><|1251|><|795|><|659|><|1381|><|1658|><|217|><|1772|><|562|><|952|><|107|><|1129|><|1112|><|467|><|550|><|1079|><|840|><|1615|><|1469|><|1380|><|168|><|917|><|836|><|1827|><|437|><|583|><|67|><|595|><|1087|><|1646|><|1493|><|1677|><|code_end|>)";
|
|
50
|
+
|
|
51
|
+
// Grammar text returned by getFormattedAudioCompletion() for OUTETTS_V0_1.
// It describes the expected completion: one "WORD <|t_x.xx|> <|code|>..." line
// per word, terminated by "<|audio_end|>" and an optional "<|im_end|>".
// NOTE(review): the syntax looks like llama.cpp GBNF - confirm against the consumer.
// NOTE(review): codeStart/codeEnd are declared but not referenced by any rule.
const char *OUTETTS_V1_GRAMMAR = R"(
|
|
52
|
+
root ::= NL? wordAudioBlock+ audioEnd NL eos?
|
|
53
|
+
wordAudioBlock ::= WORD codeBlock NL
|
|
54
|
+
codeBlock ::= TIME CODE*
|
|
55
|
+
eos ::= "<|im_end|>"
|
|
56
|
+
codeStart ::= "<|code_start|>"
|
|
57
|
+
codeEnd ::= "<|code_end|>"
|
|
58
|
+
audioEnd ::= "<|audio_end|>"
|
|
59
|
+
WORD ::= [A-Za-z]+
|
|
60
|
+
NL ::= [\n]
|
|
61
|
+
TIME ::= "<|t_" DECIMAL "|>"
|
|
62
|
+
CODE ::= "<|" DIGITS "|>"
|
|
63
|
+
DIGITS ::= [0-9]+
|
|
64
|
+
DECIMAL ::= [0-9]+ "." [0-9]+
|
|
65
|
+
)";
|
|
66
|
+
|
|
67
|
+
// Grammar text returned by getFormattedAudioCompletion() for OUTETTS_V0_2 and
// OUTETTS_V0_3. Compared with the V1 grammar it additionally allows emotion
// blocks ("<|emotion_start|>...<|emotion_end|>"), optional "<|name|>" markers
// after a word, and requires a "<|space|>" token before each newline.
// NOTE(review): the syntax looks like llama.cpp GBNF - confirm against the consumer.
const char *OUTETTS_V2_GRAMMAR = R"(
|
|
68
|
+
root ::= NL? content+ audioEnd NL eos?
|
|
69
|
+
content ::= wordAudioBlock | emotionBlock
|
|
70
|
+
wordAudioBlock ::= WORD punch* codeBlock space NL
|
|
71
|
+
codeBlock ::= TIME CODE*
|
|
72
|
+
emotionBlock ::= emotionStart TEXT emotionEnd space NL
|
|
73
|
+
TEXT ::= [A-Za-z0-9 .,?!]+
|
|
74
|
+
eos ::= "<|im_end|>"
|
|
75
|
+
emotionStart ::= "<|emotion_start|>"
|
|
76
|
+
emotionEnd ::= "<|emotion_end|>"
|
|
77
|
+
audioEnd ::= "<|audio_end|>"
|
|
78
|
+
space ::= "<|space|>"
|
|
79
|
+
WORD ::= [A-Za-z]+
|
|
80
|
+
NL ::= [\n]
|
|
81
|
+
TIME ::= "<|t_" DECIMAL "|>"
|
|
82
|
+
CODE ::= "<|" DIGITS "|>"
|
|
83
|
+
DIGITS ::= [0-9]+
|
|
84
|
+
DECIMAL ::= [0-9]+ "." [0-9]+
|
|
85
|
+
punch ::= "<|" [a-z_]+ "|>"
|
|
86
|
+
)";
|
|
87
|
+
|
|
88
|
+
// Number conversion maps and functions
|
|
89
|
+
// Spelled-out names for 0..19.
static const std::map<int, std::string> ones = {
    {0, "zero"}, {1, "one"}, {2, "two"}, {3, "three"}, {4, "four"},
    {5, "five"}, {6, "six"}, {7, "seven"}, {8, "eight"}, {9, "nine"},
    {10, "ten"}, {11, "eleven"}, {12, "twelve"}, {13, "thirteen"}, {14, "fourteen"},
    {15, "fifteen"}, {16, "sixteen"}, {17, "seventeen"}, {18, "eighteen"}, {19, "nineteen"}
};

// Spelled-out names for the multiples of ten, keyed by the tens digit (2..9).
static const std::map<int, std::string> tens = {
    {2, "twenty"}, {3, "thirty"}, {4, "forty"}, {5, "fifty"},
    {6, "sixty"}, {7, "seventy"}, {8, "eighty"}, {9, "ninety"}
};

// Convert a number in [0, 999] to words, e.g. 342 -> "three hundred forty-two".
// Returns an empty string for 0 so callers can skip empty groups.
// The result never carries leading or trailing whitespace.
static std::string convert_less_than_thousand(int num) {
    std::string result;

    if (num >= 100) {
        result += ones.at(num / 100) + " hundred";
        num %= 100;
        if (num > 0) {
            // Only add the separator when something follows the hundreds.
            result += " ";
        }
    }

    if (num >= 20) {
        result += tens.at(num / 10);
        if (num % 10 > 0) {
            result += "-" + ones.at(num % 10);
        }
    } else if (num > 0) {
        result += ones.at(num);
    }

    return result;
}

// Spell out a decimal number given as text.
//
// @param number_str  Digits with an optional fractional part, e.g. "42" or "3.14".
// @return The number in English words ("forty-two", "three point one four").
//         The integer part is read as a whole number; fractional digits are
//         read one by one. Returns " " (a single space) when the text cannot
//         be parsed or the integer part does not fit in a long long, so that
//         callers substituting the result into running text simply drop it.
std::string number_to_words(const std::string & number_str) {
    struct scale_name {
        long long    value;
        const char * name;
    };
    // Largest scale first; covers the whole positive range of long long.
    static const scale_name scales[] = {
        {1000000000000000000LL, "quintillion"},
        {1000000000000000LL,    "quadrillion"},
        {1000000000000LL,       "trillion"},
        {1000000000LL,          "billion"},
        {1000000LL,             "million"},
        {1000LL,                "thousand"},
    };

    try {
        const size_t decimal_pos = number_str.find('.');
        const std::string integer_part = number_str.substr(0, decimal_pos);

        // stoll instead of stoi: values above INT_MAX used to be discarded.
        long long remaining = std::stoll(integer_part);
        std::string result;

        // Joins groups with exactly one space (no trailing/double spaces).
        auto append_group = [&result](const std::string & words) {
            if (!result.empty()) {
                result += " ";
            }
            result += words;
        };

        if (remaining == 0) {
            result = "zero";
        } else {
            for (const scale_name & scale : scales) {
                if (remaining >= scale.value) {
                    const int group = static_cast<int>(remaining / scale.value);
                    append_group(convert_less_than_thousand(group) + " " + scale.name);
                    remaining %= scale.value;
                }
            }
            if (remaining > 0) {
                append_group(convert_less_than_thousand(static_cast<int>(remaining)));
            }
        }

        // Handle decimal part: each digit is read individually.
        if (decimal_pos != std::string::npos) {
            result += " point";
            const std::string decimal_part = number_str.substr(decimal_pos + 1);
            for (const char digit : decimal_part) {
                // at() throws for anything that is not a digit -> handled below.
                result += " " + ones.at(digit - '0');
            }
        }

        return result;
    } catch (const std::exception &) {
        // Skip if fails
        return " ";
    }
}
|
|
171
|
+
|
|
172
|
+
std::string replace_numbers_with_words(const std::string & input_text) {
|
|
173
|
+
std::regex number_pattern(R"(\d+(\.\d+)?)");
|
|
174
|
+
std::string result;
|
|
175
|
+
auto it = std::sregex_iterator(input_text.begin(), input_text.end(), number_pattern);
|
|
176
|
+
auto end = std::sregex_iterator();
|
|
177
|
+
|
|
178
|
+
size_t last_pos = 0;
|
|
179
|
+
for (std::sregex_iterator i = it; i != end; ++i) {
|
|
180
|
+
const std::smatch& match = *i;
|
|
181
|
+
result.append(input_text, last_pos, match.position() - last_pos);
|
|
182
|
+
result.append(number_to_words(match.str()));
|
|
183
|
+
last_pos = match.position() + match.length();
|
|
184
|
+
}
|
|
185
|
+
result.append(input_text, last_pos);
|
|
186
|
+
|
|
187
|
+
return result;
|
|
188
|
+
}
|
|
189
|
+
|
|
190
|
+
static std::string anyascii_string(const std::string &input) {
|
|
191
|
+
std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
|
|
192
|
+
auto wstr = converter.from_bytes(input);
|
|
193
|
+
std::string output;
|
|
194
|
+
for (char32_t c : wstr) {
|
|
195
|
+
const char *r;
|
|
196
|
+
size_t rlen = anyascii(c, &r);
|
|
197
|
+
output.append(r, rlen);
|
|
198
|
+
}
|
|
199
|
+
return output;
|
|
200
|
+
}
|
|
201
|
+
|
|
202
|
+
std::string process_text(const std::string & text, const tts_type tts_type) {
|
|
203
|
+
std::string processed_text = replace_numbers_with_words(text);
|
|
204
|
+
|
|
205
|
+
if (tts_type == OUTETTS_V0_2 || tts_type == OUTETTS_V0_3) {
|
|
206
|
+
processed_text = anyascii_string(processed_text);
|
|
207
|
+
|
|
208
|
+
std::regex dashes(R"([—–-])");
|
|
209
|
+
processed_text = std::regex_replace(processed_text, dashes, " ");
|
|
210
|
+
}
|
|
211
|
+
|
|
212
|
+
std::transform(processed_text.begin(), processed_text.end(),
|
|
213
|
+
processed_text.begin(), ::tolower);
|
|
214
|
+
|
|
215
|
+
std::regex special_chars(R"([-_/,\.\\])");
|
|
216
|
+
processed_text = std::regex_replace(processed_text, special_chars, " ");
|
|
217
|
+
|
|
218
|
+
std::regex non_alpha(R"([^a-z\s])");
|
|
219
|
+
processed_text = std::regex_replace(processed_text, non_alpha, "");
|
|
220
|
+
|
|
221
|
+
std::regex multiple_spaces(R"(\s+)");
|
|
222
|
+
processed_text = std::regex_replace(processed_text, multiple_spaces, " ");
|
|
223
|
+
|
|
224
|
+
processed_text = std::regex_replace(processed_text, std::regex(R"(^\s+|\s+$)"), "");
|
|
225
|
+
|
|
226
|
+
std::string separator = (tts_type == OUTETTS_V0_3) ? "<|space|>" : "<|text_sep|>";
|
|
227
|
+
processed_text = std::regex_replace(processed_text, std::regex(R"(\s)"), separator);
|
|
228
|
+
|
|
229
|
+
return processed_text;
|
|
230
|
+
}
|
|
231
|
+
|
|
232
|
+
// Build the transcript part of the prompt from a speaker profile.
//
// @param speaker  Speaker JSON; for OUTETTS_V0_2 / OUTETTS_V0_3 its "words"
//                 array is read and each entry must have a string "word".
// @param type     Model flavour; selects the separator. For any other type
//                 only the "<|text_start|>" marker is returned.
// @return "<|text_start|>" followed by every word and a trailing separator.
// @throws nlohmann json exception when an entry lacks "word" or it is not a
//         string (checked access replaces the unchecked operator[], whose
//         behaviour is undefined for a missing key on a const value).
std::string audio_text_from_speaker(json speaker, const tts_type type) {
    std::string audio_text = "<|text_start|>";

    if (type != OUTETTS_V0_2 && type != OUTETTS_V0_3) {
        return audio_text;
    }

    const std::string separator = (type == OUTETTS_V0_3) ? "<|space|>" : "<|text_sep|>";
    for (const auto &word : speaker["words"]) {
        audio_text += word.at("word").get<std::string>();
        audio_text += separator;
    }

    return audio_text;
}
|
|
244
|
+
|
|
245
|
+
// Build the audio-code part of the prompt from a speaker profile.
//
// For OUTETTS_V0_2 / OUTETTS_V0_3 every entry of speaker["words"] becomes one
// line: word, "<|t_D|>" with the duration printed with two decimals, the
// codes as "<|N|>" tokens and a terminator. V0_2 wraps the codes in
// "<|code_start|>" / "<|code_end|>"; V0_3 has no start marker and ends the
// line with "<|space|>".
//
// @param speaker  Speaker JSON; each word needs "word" (string), "duration"
//                 (number) and "codes" (array of integers).
// @param type     Model flavour. For any other type only the
//                 "<|audio_start|>\n" header is returned.
// @return The header followed by one line per word.
// @throws nlohmann json exception when a field is missing or has the wrong
//         type (checked access replaces the unchecked operator[], whose
//         behaviour is undefined for a missing key on a const value).
std::string audio_data_from_speaker(json speaker, const tts_type type) {
    std::string audio_data = "<|audio_start|>\n";

    if (type != OUTETTS_V0_2 && type != OUTETTS_V0_3) {
        return audio_data;
    }

    const std::string code_start = (type == OUTETTS_V0_3) ? "" : "<|code_start|>";
    const std::string code_end = (type == OUTETTS_V0_3) ? "<|space|>" : "<|code_end|>";

    for (const auto &word : speaker["words"]) {
        const std::string word_text = word.at("word").get<std::string>();
        const double duration = word.at("duration").get<double>();
        const std::vector<int> codes = word.at("codes").get<std::vector<int>>();

        // Create the audio output entry
        std::ostringstream word_entry;
        word_entry << word_text << "<|t_" << std::fixed << std::setprecision(2)
                   << duration << "|>" << code_start;
        for (const int code : codes) {
            word_entry << "<|" << code << "|>";
        }
        word_entry << code_end << "\n";

        audio_data += word_entry.str();
    }

    return audio_data;
}
|
|
270
|
+
|
|
271
|
+
// Constructor and destructor implementations
|
|
272
|
+
// Load the vocoder model and create its context.
//
// @param vocoder_model_path  Path stored in the model parameters.
// @param batch_size          Batch size to use when positive; otherwise the
//                            default of common_params is kept. The micro
//                            batch size is always set equal to the batch size.
// @throws std::runtime_error when the model or the context could not be created.
llama_rn_context_tts::llama_rn_context_tts(const std::string &vocoder_model_path, int batch_size) {
    common_params vocoder_params;
    vocoder_params.model.path = vocoder_model_path;
    vocoder_params.embedding  = true;
    vocoder_params.ctx_shift  = false;
    vocoder_params.n_batch    = (batch_size > 0) ? batch_size : vocoder_params.n_batch;
    vocoder_params.n_ubatch   = vocoder_params.n_batch;

    // init_result owns the model/context; `model` and `ctx` are plain views.
    init_result = common_init_from_params(vocoder_params);
    params      = vocoder_params;
    model       = init_result.model.get();
    ctx         = init_result.context.get();

    const bool loaded = (model != nullptr) && (ctx != nullptr);
    if (!loaded) {
        LOG_ERROR("Failed to load vocoder model: %s", vocoder_model_path.c_str());
        throw std::runtime_error("Failed to load vocoder model");
    }

    // Will be determined when used
    type = UNKNOWN;
}
|
|
293
|
+
|
|
294
|
+
// Reset the non-owning members. Nothing is freed here: the model and context
// are released by init_result when it is destroyed.
llama_rn_context_tts::~llama_rn_context_tts() {
    type  = UNKNOWN;
    ctx   = nullptr;
    model = nullptr;
}
|
|
300
|
+
|
|
301
|
+
void llama_rn_context_tts::setGuideTokens(const std::vector<llama_token> &tokens) {
|
|
302
|
+
guide_tokens = tokens;
|
|
303
|
+
}
|
|
304
|
+
|
|
305
|
+
// Audio processing functions - FFT and related utilities
|
|
306
|
+
// Fill `output[0..length)` with a Hann window.
//
// @param length    Number of samples to write; nothing is written when <= 0.
// @param periodic  true: period is `length` (w[i] = 0.5*(1 - cos(2*pi*i/length)));
//                  false: symmetric window with period `length - 1`.
// @param output    Destination buffer with room for `length` floats.
//
// A symmetric window of length 1 is defined as {1.0}; the formula would
// otherwise divide by zero and produce NaN.
static void fill_hann_window(int length, bool periodic, float * output) {
    // Local constant instead of M_PI, which is not part of standard C++.
    constexpr double kPi = 3.14159265358979323846;

    const int period = periodic ? length : length - 1;

    for (int i = 0; i < length; i++) {
        if (period == 0) {
            output[i] = 1.0f;
        } else {
            output[i] = 0.5 * (1.0 - cosf((2.0 * kPi * i) / period));
        }
    }
}
|
|
315
|
+
|
|
316
|
+
// Compute the complex exponential exp(i * 2*pi * k / N).
//
// @param real  Receives cos(2*pi*k/N).
// @param imag  Receives sin(2*pi*k/N).
// @param k     Numerator index; reduced modulo N before the angle is formed so
//              large products such as k*m do not lose phase precision.
// @param N     Period of the transform.
static void twiddle(float * real, float * imag, int k, int N) {
    constexpr double kPi = 3.14159265358979323846;

    // exp(i*2*pi*k/N) is periodic in k with period N, so the reduction is exact.
    const int k_mod = (N != 0) ? (k % N) : k;
    const double angle = 2.0 * kPi * k_mod / N;

    *real = static_cast<float>(std::cos(angle));
    *imag = static_cast<float>(std::sin(angle));
}

// Direct (O(n^2)) inverse transform of a one-sided spectrum.
//
// @param n         Number of real output samples.
// @param inp_cplx  n/2 + 1 complex bins stored interleaved (re, im).
// @param out_real  Receives n samples:
//                  out[k] = (1/(n/2+1)) * sum_m Re(X[m] * exp(i*2*pi*k*m/n)).
//
// Only the real part is accumulated; the imaginary part of the sum was never
// used by the result. The scaling by n/2 + 1 is kept exactly as before.
static void irfft(int n, const float * inp_cplx, float * out_real) {
    if (n <= 0) {
        return;
    }

    const int N = n / 2 + 1;

    // Accumulate into a scratch buffer so the routine stays correct even if
    // the output buffer overlaps the input.
    std::vector<float> real_output(n, 0.0f);

    for (int k = 0; k < n; ++k) {
        float acc = 0.0f;
        for (int m = 0; m < N; ++m) {
            float twiddle_real;
            float twiddle_imag;

            // 64-bit product: k*m can exceed the int range for large n.
            const int km = static_cast<int>((static_cast<int64_t>(k) * m) % n);
            twiddle(&twiddle_real, &twiddle_imag, km, n);

            acc += inp_cplx[2 * m] * twiddle_real - inp_cplx[2 * m + 1] * twiddle_imag;
        }
        real_output[k] = acc;
    }

    for (int i = 0; i < n; ++i) {
        out_real[i] = real_output[i] / N;
    }
}
|
|
353
|
+
|
|
354
|
+
static void fold(const std::vector<float> & data, int64_t n_out, int64_t n_win, int64_t n_hop, int64_t n_pad, std::vector<float> & output) {
|
|
355
|
+
int64_t output_height = n_out;
|
|
356
|
+
int64_t kernel_w = n_win;
|
|
357
|
+
int64_t stride_w = n_hop;
|
|
358
|
+
int64_t width = n_out;
|
|
359
|
+
|
|
360
|
+
output.resize(width, 0.0f);
|
|
361
|
+
|
|
362
|
+
int64_t col_idx = 0;
|
|
363
|
+
for (int64_t w_col = 0; w_col < width; ++w_col) {
|
|
364
|
+
int64_t start = w_col * stride_w - n_pad;
|
|
365
|
+
int64_t end = start + kernel_w;
|
|
366
|
+
|
|
367
|
+
for (int64_t w_im = start; w_im < end; ++w_im) {
|
|
368
|
+
if (w_im >= 0 && w_im < output_height && col_idx < (int64_t) data.size()) {
|
|
369
|
+
output[w_im] += data[col_idx];
|
|
370
|
+
}
|
|
371
|
+
col_idx++;
|
|
372
|
+
}
|
|
373
|
+
}
|
|
374
|
+
|
|
375
|
+
output.resize(n_out - 2 * n_pad);
|
|
376
|
+
}
|
|
377
|
+
|
|
378
|
+
std::vector<float> embd_to_audio(
|
|
379
|
+
const float * embd,
|
|
380
|
+
const int n_codes,
|
|
381
|
+
const int n_embd,
|
|
382
|
+
const int n_thread) {
|
|
383
|
+
const int n_fft = 1280;
|
|
384
|
+
const int n_hop = 320;
|
|
385
|
+
const int n_win = 1280;
|
|
386
|
+
const int n_pad = (n_win - n_hop)/2;
|
|
387
|
+
const int n_out = (n_codes - 1)*n_hop + n_win;
|
|
388
|
+
|
|
389
|
+
std::vector<float> hann(n_fft);
|
|
390
|
+
|
|
391
|
+
fill_hann_window(hann.size(), true, hann.data());
|
|
392
|
+
|
|
393
|
+
int n_spec = n_embd*n_codes;
|
|
394
|
+
|
|
395
|
+
std::vector<float> E (n_spec);
|
|
396
|
+
std::vector<float> S (n_spec);
|
|
397
|
+
std::vector<float> ST(n_spec);
|
|
398
|
+
|
|
399
|
+
for (int l = 0; l < n_codes; ++l) {
|
|
400
|
+
for (int k = 0; k < n_embd; ++k) {
|
|
401
|
+
E[k*n_codes + l] = embd[l*n_embd + k];
|
|
402
|
+
}
|
|
403
|
+
}
|
|
404
|
+
|
|
405
|
+
for (int k = 0; k < n_embd/2; ++k) {
|
|
406
|
+
for (int l = 0; l < n_codes; ++l) {
|
|
407
|
+
float mag = E[(k )*n_codes + l];
|
|
408
|
+
float phi = E[(k + n_embd/2)*n_codes + l];
|
|
409
|
+
|
|
410
|
+
mag = exp(mag);
|
|
411
|
+
|
|
412
|
+
if (mag > 1e2) {
|
|
413
|
+
mag = 1e2;
|
|
414
|
+
}
|
|
415
|
+
S[2*(k*n_codes + l) + 0] = mag*cosf(phi);
|
|
416
|
+
S[2*(k*n_codes + l) + 1] = mag*sinf(phi);
|
|
417
|
+
}
|
|
418
|
+
}
|
|
419
|
+
|
|
420
|
+
for (int l = 0; l < n_codes; ++l) {
|
|
421
|
+
for (int k = 0; k < n_embd/2; ++k) {
|
|
422
|
+
ST[l*n_embd + 2*k + 0] = S[2*(k*n_codes + l) + 0];
|
|
423
|
+
ST[l*n_embd + 2*k + 1] = S[2*(k*n_codes + l) + 1];
|
|
424
|
+
}
|
|
425
|
+
}
|
|
426
|
+
|
|
427
|
+
std::vector<float> res (n_codes*n_fft);
|
|
428
|
+
std::vector<float> hann2(n_codes*n_fft);
|
|
429
|
+
|
|
430
|
+
std::vector<std::thread> workers(n_thread);
|
|
431
|
+
for (int i = 0; i < n_thread; ++i) {
|
|
432
|
+
workers[i] = std::thread([&, i]() {
|
|
433
|
+
for (int l = i; l < n_codes; l += n_thread) {
|
|
434
|
+
irfft(n_fft, ST.data() + l*n_embd, res.data() + l*n_fft);
|
|
435
|
+
for (int j = 0; j < n_fft; ++j) {
|
|
436
|
+
res [l*n_fft + j] *= hann[j];
|
|
437
|
+
hann2[l*n_fft + j] = hann[j] * hann[j];
|
|
438
|
+
}
|
|
439
|
+
}
|
|
440
|
+
});
|
|
441
|
+
}
|
|
442
|
+
for (int i = 0; i < n_thread; ++i) {
|
|
443
|
+
workers[i].join();
|
|
444
|
+
}
|
|
445
|
+
|
|
446
|
+
std::vector<float> audio;
|
|
447
|
+
std::vector<float> env;
|
|
448
|
+
|
|
449
|
+
fold(res, n_out, n_win, n_hop, n_pad, audio);
|
|
450
|
+
fold(hann2, n_out, n_win, n_hop, n_pad, env);
|
|
451
|
+
|
|
452
|
+
for (size_t i = 0; i < audio.size(); ++i) {
|
|
453
|
+
audio[i] /= env[i];
|
|
454
|
+
}
|
|
455
|
+
|
|
456
|
+
return audio;
|
|
457
|
+
}
|
|
458
|
+
|
|
459
|
+
// Forward declarations from rn-llama.h
|
|
460
|
+
// Declared here, defined in another translation unit.
extern bool rnllama_verbose;
|
|
461
|
+
// printf-style logging entry point: `level` is a label such as "ERROR",
// `function`/`line` identify the call site, `format` and the variadic
// arguments form the message.
void log(const char *level, const char *function, int line, const char *format, ...);
|
|
462
|
+
|
|
463
|
+
// Convenience wrappers that pass the level label and the call site to log().
// `##__VA_ARGS__` drops the trailing comma when no extra arguments are given.
#define LOG_ERROR(MSG, ...) log("ERROR", __func__, __LINE__, MSG, ##__VA_ARGS__)
|
|
464
|
+
#define LOG_WARNING(MSG, ...) log("WARNING", __func__, __LINE__, MSG, ##__VA_ARGS__)
|
|
465
|
+
#define LOG_INFO(MSG, ...) log("INFO", __func__, __LINE__, MSG, ##__VA_ARGS__)
|
|
466
|
+
|
|
467
|
+
// TTS member functions
|
|
468
|
+
// Work out which OuteTTS flavour to use.
//
// Checked in this order:
//   1. "version" in the speaker object: "0.2" or "0.3" decide immediately;
//      any other value is logged and ignored;
//   2. the `type` member, when it is not UNKNOWN;
//   3. the main model's chat template equal to "outetts-0.3";
//   4. the main model's name containing "OuteTTS 0.1";
//   5. otherwise OUTETTS_V0_2.
//
// @param main_ctx  Context of the text-to-codes model; its model is inspected.
// @param speaker   Speaker profile, may be a non-object when absent.
// @return The detected tts_type.
tts_type llama_rn_context_tts::getTTSType(llama_rn_context* main_ctx, json speaker) {
    const bool has_version = speaker.is_object() && speaker.contains("version");
    if (has_version) {
        const std::string version = speaker["version"].get<std::string>();
        if (version == "0.2") {
            return OUTETTS_V0_2;
        }
        if (version == "0.3") {
            return OUTETTS_V0_3;
        }
        // Unknown version: report it and fall back to model-based detection.
        LOG_ERROR("Unsupported speaker version '%s'\n", version.c_str());
    }

    if (type != UNKNOWN) {
        return type;
    }

    const char *chat_template = llama_model_chat_template(main_ctx->model, nullptr);
    if (chat_template != nullptr && std::string(chat_template) == "outetts-0.3") {
        return OUTETTS_V0_3;
    }

    const bool is_v0_1 = main_ctx->model->name.find("OuteTTS 0.1") != std::string::npos;
    return is_v0_1 ? OUTETTS_V0_1 : OUTETTS_V0_2;
}
|
|
491
|
+
|
|
492
|
+
// Build the TTS prompt and pick the grammar for a completion request.
//
// The prompt is
//   "<|im_start|>\n" + speaker transcript + processed text + "<|text_end|>\n"
//   + speaker audio codes + "\n".
// The speaker parts come from `speaker_json_str` when it is non-empty, and
// from the built-in default speaker otherwise. For OUTETTS_V0_3 the default
// speaker is rewritten to the 0.3 token layout ("<|space|>" separators, no
// "<|code_start|>").
//
// @param main_ctx          Context of the text-to-codes model (used for type detection).
// @param speaker_json_str  Speaker profile as JSON text, or empty for the default.
// @param text_to_speak     Text to synthesise.
// @return The prompt and the matching grammar; {"", nullptr} when the TTS
//         type is UNKNOWN.
// @throws nlohmann json exception when `speaker_json_str` is not valid JSON or
//         lacks the fields the speaker helpers read.
llama_rn_audio_completion_result llama_rn_context_tts::getFormattedAudioCompletion(llama_rn_context* main_ctx, const std::string &speaker_json_str, const std::string &text_to_speak) {
    const bool has_speaker = !speaker_json_str.empty();

    json speaker = has_speaker ? json::parse(speaker_json_str) : json::object();
    const tts_type detected_type = getTTSType(main_ctx, speaker);
    if (detected_type == UNKNOWN) {
        LOG_ERROR("Unknown TTS version");
        return {"", nullptr};
    }

    std::string audio_text;
    std::string audio_data;

    if (has_speaker) {
        audio_text = audio_text_from_speaker(speaker, detected_type);
        audio_data = audio_data_from_speaker(speaker, detected_type);
    } else {
        // Only touch the (large) default speaker when it is actually used;
        // previously it was copied and regex-rewritten and then overwritten.
        audio_text = default_audio_text;
        audio_data = default_audio_data;

        if (detected_type == OUTETTS_V0_3) {
            audio_text = std::regex_replace(audio_text, std::regex(R"(<\|text_sep\|>)"), "<|space|>");
            audio_data = std::regex_replace(audio_data, std::regex(R"(<\|code_start\|>)"), "");
            audio_data = std::regex_replace(audio_data, std::regex(R"(<\|code_end\|>)"), "<|space|>");
        }
    }

    std::string prompt = "<|im_start|>\n" + audio_text + process_text(text_to_speak, detected_type) + "<|text_end|>\n" + audio_data + "\n";

    if (detected_type == OUTETTS_V0_1) {
        return {prompt, OUTETTS_V1_GRAMMAR};
    }
    if (detected_type == OUTETTS_V0_2 || detected_type == OUTETTS_V0_3) {
        return {prompt, OUTETTS_V2_GRAMMAR};
    }
    return {prompt, nullptr};
}
|
|
524
|
+
|
|
525
|
+
/**
 * Produce the guide-token sequence for a piece of text.
 *
 * The text is normalized with process_text(), split on the word delimiter of
 * the detected TTS format, and the FIRST token of each word is collected. The
 * sequence starts with the token for "\n" and ends with the token for
 * "<|audio_end|>".
 *
 * Pieces that tokenize to nothing (for example an empty word produced by two
 * adjacent delimiters) are skipped instead of being indexed.
 *
 * @param main_ctx       Context whose model supplies the vocabulary.
 * @param text_to_speak  Raw text to derive guide tokens from.
 * @return Guide tokens in speaking order.
 */
std::vector<llama_token> llama_rn_context_tts::getAudioCompletionGuideTokens(llama_rn_context* main_ctx, const std::string &text_to_speak) {
    const llama_vocab * vocab = llama_model_get_vocab(main_ctx->model);
    const tts_type detected_type = getTTSType(main_ctx);
    const std::string clean_text = process_text(text_to_speak, detected_type);

    const std::string delimiter = (detected_type == OUTETTS_V0_3 ? "<|space|>" : "<|text_sep|>");

    std::vector<llama_token> result;

    // Appends the first token of `piece`, if tokenization yields any token at all.
    const auto push_first_token = [&result, vocab](const std::string &piece) {
        const auto piece_tokens = common_tokenize(vocab, piece, false, true);
        if (!piece_tokens.empty()) {
            result.push_back(piece_tokens.front());
        }
    };

    // First token is always a newline, as it was not previously added.
    push_first_token("\n");

    size_t start = 0;
    for (size_t end = clean_text.find(delimiter);
         end != std::string::npos;
         end = clean_text.find(delimiter, start)) {
        push_first_token(clean_text.substr(start, end - start));
        start = end + delimiter.length();
    }

    // Whatever follows the last delimiter (the whole text when none was found).
    push_first_token(clean_text.substr(start));

    // Audio end marker, forcing generation to stop.
    push_first_token("<|audio_end|>");

    return result;
}
|
|
559
|
+
|
|
560
|
+
/**
 * Convert generated tokens into audio samples using this object's `ctx`/`model`.
 *
 * Only OUTETTS_V0_2 and OUTETTS_V0_3 are handled. Tokens outside the id range
 * [151672, 155772] are dropped and the rest are rebased to start at 0
 * (presumably that range is the audio-code section of the vocabulary - confirm
 * against the model). The codes are encoded in one batch and the resulting
 * embeddings are handed to embd_to_audio().
 *
 * The batch allocated with llama_batch_init() is released on every exit path.
 *
 * @param main_ctx  Context used for format detection and for the thread count
 *                  (params.cpuparams.n_threads).
 * @param tokens    Generated tokens, possibly mixed with non-audio tokens.
 * @return Audio samples, or an empty vector on any failure.
 */
std::vector<float> llama_rn_context_tts::decodeAudioTokens(llama_rn_context* main_ctx, const std::vector<llama_token> &tokens) {
    const tts_type detected_type = getTTSType(main_ctx);
    if (detected_type != OUTETTS_V0_3 && detected_type != OUTETTS_V0_2) {
        LOG_ERROR("Unsupported audio tokens");
        return std::vector<float>();
    }

    const llama_token first_audio_token = 151672;
    const llama_token last_audio_token = 155772;

    // Keep only audio tokens, rebased so the first audio id maps to code 0.
    std::vector<llama_token> tokens_audio;
    tokens_audio.reserve(tokens.size());
    for (const llama_token token : tokens) {
        if (token >= first_audio_token && token <= last_audio_token) {
            tokens_audio.push_back(token - first_audio_token);
        }
    }
    if (tokens_audio.empty()) {
        LOG_ERROR("No audio tokens to decode");
        return std::vector<float>();
    }

    const int n_codes = static_cast<int>(tokens_audio.size());
    llama_batch batch = llama_batch_init(n_codes, 0, 1);

    // Frees the batch when this function returns, whichever path is taken.
    struct batch_releaser {
        llama_batch &owned;
        ~batch_releaser() { llama_batch_free(owned); }
    } release_batch{batch};

    for (size_t i = 0; i < tokens_audio.size(); ++i) {
        llama_batch_add(&batch, tokens_audio[i], i, { 0 }, true);
    }
    if (batch.n_tokens != n_codes) {
        LOG_ERROR("batch.n_tokens != n_codes: %d != %d", batch.n_tokens, n_codes);
        return std::vector<float>();
    }

    if (llama_encode(ctx, batch) != 0) {
        LOG_ERROR("llama_encode() failed");
        return std::vector<float>();
    }
    llama_synchronize(ctx);

    const int n_embd = llama_model_n_embd(model);
    const float * embd = llama_get_embeddings(ctx);
    if (embd == nullptr) {
        LOG_ERROR("llama_get_embeddings() returned null");
        return std::vector<float>();
    }
    return embd_to_audio(embd, n_codes, n_embd, main_ctx->params.cpuparams.n_threads);
}
|
|
590
|
+
|
|
591
|
+
}
|
package/cpp/rn-tts.h
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
#ifndef RNTTS_H
|
|
2
|
+
#define RNTTS_H
|
|
3
|
+
|
|
4
|
+
#include <vector>
|
|
5
|
+
#include <string>
|
|
6
|
+
#include "llama.h"
|
|
7
|
+
#include "nlohmann/json.hpp"
|
|
8
|
+
#include "common.h"
|
|
9
|
+
|
|
10
|
+
using json = nlohmann::ordered_json;
|
|
11
|
+
|
|
12
|
+
namespace rnllama {
|
|
13
|
+
|
|
14
|
+
// Forward declarations
|
|
15
|
+
struct llama_rn_context;
|
|
16
|
+
|
|
17
|
+
// Identifies the OuteTTS format in use. UNKNOWN (-1) means "not determined";
// the versioned enumerators count up from 0 in release order.
enum tts_type {
    UNKNOWN = -1,
    OUTETTS_V0_1,
    OUTETTS_V0_2,
    OUTETTS_V0_3,
};
|
|
24
|
+
|
|
25
|
+
// Result of preparing an audio completion request.
//   prompt  - the fully formatted prompt text (empty when preparation failed).
//   grammar - grammar text to constrain generation with, or nullptr when no
//             grammar applies. Defaults to nullptr so a default-constructed
//             result never carries an indeterminate pointer.
struct llama_rn_audio_completion_result {
    std::string prompt;
    const char *grammar = nullptr;
};
|
|
30
|
+
|
|
31
|
+
// TTS context for TTS-specific functionality
|
|
32
|
+
struct llama_rn_context_tts {
|
|
33
|
+
// TTS state fields
|
|
34
|
+
std::vector<llama_token> audio_tokens;
|
|
35
|
+
std::vector<llama_token> guide_tokens;
|
|
36
|
+
bool next_token_uses_guide_token = true;
|
|
37
|
+
|
|
38
|
+
// Vocoder fields (from llama_rn_context_vocoder)
|
|
39
|
+
common_init_result init_result;
|
|
40
|
+
common_params params;
|
|
41
|
+
llama_model *model = nullptr;
|
|
42
|
+
llama_context *ctx = nullptr;
|
|
43
|
+
tts_type type = UNKNOWN;
|
|
44
|
+
|
|
45
|
+
// Constructor and destructor
|
|
46
|
+
llama_rn_context_tts(const std::string &vocoder_model_path, int batch_size = -1);
|
|
47
|
+
~llama_rn_context_tts();
|
|
48
|
+
|
|
49
|
+
// TTS utility methods
|
|
50
|
+
tts_type getTTSType(llama_rn_context* main_ctx, json speaker = nullptr);
|
|
51
|
+
llama_rn_audio_completion_result getFormattedAudioCompletion(llama_rn_context* main_ctx, const std::string &speaker_json_str, const std::string &text_to_speak);
|
|
52
|
+
std::vector<llama_token> getAudioCompletionGuideTokens(llama_rn_context* main_ctx, const std::string &text_to_speak);
|
|
53
|
+
std::vector<float> decodeAudioTokens(llama_rn_context* main_ctx, const std::vector<llama_token> &tokens);
|
|
54
|
+
void setGuideTokens(const std::vector<llama_token> &tokens);
|
|
55
|
+
};
|
|
56
|
+
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
#endif /* RNTTS_H */
|