@livekit/agents 0.4.6 → 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +17 -0
- package/dist/audio.cjs +77 -0
- package/dist/audio.cjs.map +1 -0
- package/dist/audio.js +48 -37
- package/dist/audio.js.map +1 -1
- package/dist/cli.cjs +131 -0
- package/dist/cli.cjs.map +1 -0
- package/dist/cli.js +96 -122
- package/dist/cli.js.map +1 -1
- package/dist/generator.cjs +36 -0
- package/dist/generator.cjs.map +1 -0
- package/dist/generator.js +8 -22
- package/dist/generator.js.map +1 -1
- package/dist/http_server.cjs +72 -0
- package/dist/http_server.cjs.map +1 -0
- package/dist/http_server.d.ts +1 -1
- package/dist/http_server.js +44 -47
- package/dist/http_server.js.map +1 -1
- package/dist/index.cjs +78 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.js +26 -28
- package/dist/index.js.map +1 -1
- package/dist/ipc/job_executor.cjs +33 -0
- package/dist/ipc/job_executor.cjs.map +1 -0
- package/dist/ipc/job_executor.js +7 -4
- package/dist/ipc/job_executor.js.map +1 -1
- package/dist/ipc/job_main.cjs +147 -0
- package/dist/ipc/job_main.cjs.map +1 -0
- package/dist/ipc/job_main.d.ts +1 -1
- package/dist/ipc/job_main.js +103 -103
- package/dist/ipc/job_main.js.map +1 -1
- package/dist/ipc/message.cjs +17 -0
- package/dist/ipc/message.cjs.map +1 -0
- package/dist/ipc/message.js +0 -1
- package/dist/ipc/message.js.map +1 -1
- package/dist/ipc/proc_job_executor.cjs +174 -0
- package/dist/ipc/proc_job_executor.cjs.map +1 -0
- package/dist/ipc/proc_job_executor.js +130 -126
- package/dist/ipc/proc_job_executor.js.map +1 -1
- package/dist/ipc/proc_pool.cjs +126 -0
- package/dist/ipc/proc_pool.cjs.map +1 -0
- package/dist/ipc/proc_pool.js +93 -96
- package/dist/ipc/proc_pool.js.map +1 -1
- package/dist/job.cjs +230 -0
- package/dist/job.cjs.map +1 -0
- package/dist/job.d.ts +6 -1
- package/dist/job.d.ts.map +1 -1
- package/dist/job.js +195 -198
- package/dist/job.js.map +1 -1
- package/dist/llm/chat_context.cjs +131 -0
- package/dist/llm/chat_context.cjs.map +1 -0
- package/dist/llm/chat_context.js +98 -86
- package/dist/llm/chat_context.js.map +1 -1
- package/dist/llm/function_context.cjs +103 -0
- package/dist/llm/function_context.cjs.map +1 -0
- package/dist/llm/function_context.js +72 -81
- package/dist/llm/function_context.js.map +1 -1
- package/dist/llm/function_context.test.cjs +218 -0
- package/dist/llm/function_context.test.cjs.map +1 -0
- package/dist/llm/function_context.test.js +209 -210
- package/dist/llm/function_context.test.js.map +1 -1
- package/dist/llm/index.cjs +43 -0
- package/dist/llm/index.cjs.map +1 -0
- package/dist/llm/index.js +22 -6
- package/dist/llm/index.js.map +1 -1
- package/dist/llm/llm.cjs +76 -0
- package/dist/llm/llm.cjs.map +1 -0
- package/dist/llm/llm.js +48 -42
- package/dist/llm/llm.js.map +1 -1
- package/dist/log.cjs +57 -0
- package/dist/log.cjs.map +1 -0
- package/dist/log.js +27 -26
- package/dist/log.js.map +1 -1
- package/dist/multimodal/agent_playout.cjs +228 -0
- package/dist/multimodal/agent_playout.cjs.map +1 -0
- package/dist/multimodal/agent_playout.d.ts +1 -1
- package/dist/multimodal/agent_playout.js +193 -180
- package/dist/multimodal/agent_playout.js.map +1 -1
- package/dist/multimodal/index.cjs +25 -0
- package/dist/multimodal/index.cjs.map +1 -0
- package/dist/multimodal/index.js +2 -5
- package/dist/multimodal/index.js.map +1 -1
- package/dist/multimodal/multimodal_agent.cjs +404 -0
- package/dist/multimodal/multimodal_agent.cjs.map +1 -0
- package/dist/multimodal/multimodal_agent.d.ts +1 -1
- package/dist/multimodal/multimodal_agent.js +351 -330
- package/dist/multimodal/multimodal_agent.js.map +1 -1
- package/dist/pipeline/agent_output.cjs +172 -0
- package/dist/pipeline/agent_output.cjs.map +1 -0
- package/dist/pipeline/agent_output.js +136 -138
- package/dist/pipeline/agent_output.js.map +1 -1
- package/dist/pipeline/agent_playout.cjs +169 -0
- package/dist/pipeline/agent_playout.cjs.map +1 -0
- package/dist/pipeline/agent_playout.js +126 -136
- package/dist/pipeline/agent_playout.js.map +1 -1
- package/dist/pipeline/human_input.cjs +158 -0
- package/dist/pipeline/human_input.cjs.map +1 -0
- package/dist/pipeline/human_input.js +124 -125
- package/dist/pipeline/human_input.js.map +1 -1
- package/dist/pipeline/index.cjs +31 -0
- package/dist/pipeline/index.cjs.map +1 -0
- package/dist/pipeline/index.js +8 -4
- package/dist/pipeline/index.js.map +1 -1
- package/dist/pipeline/pipeline_agent.cjs +642 -0
- package/dist/pipeline/pipeline_agent.cjs.map +1 -0
- package/dist/pipeline/pipeline_agent.js +595 -651
- package/dist/pipeline/pipeline_agent.js.map +1 -1
- package/dist/pipeline/speech_handle.cjs +128 -0
- package/dist/pipeline/speech_handle.cjs.map +1 -0
- package/dist/pipeline/speech_handle.js +102 -100
- package/dist/pipeline/speech_handle.js.map +1 -1
- package/dist/plugin.cjs +46 -0
- package/dist/plugin.cjs.map +1 -0
- package/dist/plugin.js +20 -20
- package/dist/plugin.js.map +1 -1
- package/dist/stt/index.cjs +38 -0
- package/dist/stt/index.cjs.map +1 -0
- package/dist/stt/index.js +13 -5
- package/dist/stt/index.js.map +1 -1
- package/dist/stt/stream_adapter.cjs +87 -0
- package/dist/stt/stream_adapter.cjs.map +1 -0
- package/dist/stt/stream_adapter.js +58 -55
- package/dist/stt/stream_adapter.js.map +1 -1
- package/dist/stt/stt.cjs +98 -0
- package/dist/stt/stt.cjs.map +1 -0
- package/dist/stt/stt.js +63 -98
- package/dist/stt/stt.js.map +1 -1
- package/dist/tokenize/basic/basic.cjs +98 -0
- package/dist/tokenize/basic/basic.cjs.map +1 -0
- package/dist/tokenize/basic/basic.d.ts +1 -1
- package/dist/tokenize/basic/basic.d.ts.map +1 -1
- package/dist/tokenize/basic/basic.js +56 -45
- package/dist/tokenize/basic/basic.js.map +1 -1
- package/dist/tokenize/basic/hyphenator.cjs +425 -0
- package/dist/tokenize/basic/hyphenator.cjs.map +1 -0
- package/dist/tokenize/basic/hyphenator.js +66 -82
- package/dist/tokenize/basic/hyphenator.js.map +1 -1
- package/dist/tokenize/basic/index.cjs +35 -0
- package/dist/tokenize/basic/index.cjs.map +1 -0
- package/dist/tokenize/basic/index.js +7 -4
- package/dist/tokenize/basic/index.js.map +1 -1
- package/dist/tokenize/basic/paragraph.cjs +57 -0
- package/dist/tokenize/basic/paragraph.cjs.map +1 -0
- package/dist/tokenize/basic/paragraph.js +30 -35
- package/dist/tokenize/basic/paragraph.js.map +1 -1
- package/dist/tokenize/basic/sentence.cjs +89 -0
- package/dist/tokenize/basic/sentence.cjs.map +1 -0
- package/dist/tokenize/basic/sentence.d.ts.map +1 -1
- package/dist/tokenize/basic/sentence.js +62 -57
- package/dist/tokenize/basic/sentence.js.map +1 -1
- package/dist/tokenize/basic/word.cjs +44 -0
- package/dist/tokenize/basic/word.cjs.map +1 -0
- package/dist/tokenize/basic/word.js +17 -20
- package/dist/tokenize/basic/word.js.map +1 -1
- package/dist/tokenize/index.cjs +55 -0
- package/dist/tokenize/index.cjs.map +1 -0
- package/dist/tokenize/index.js +18 -7
- package/dist/tokenize/index.js.map +1 -1
- package/dist/tokenize/token_stream.cjs +164 -0
- package/dist/tokenize/token_stream.cjs.map +1 -0
- package/dist/tokenize/token_stream.js +133 -139
- package/dist/tokenize/token_stream.js.map +1 -1
- package/dist/tokenize/tokenizer.cjs +184 -0
- package/dist/tokenize/tokenizer.cjs.map +1 -0
- package/dist/tokenize/tokenizer.js +138 -99
- package/dist/tokenize/tokenizer.js.map +1 -1
- package/dist/tokenize/tokenizer.test.cjs +220 -0
- package/dist/tokenize/tokenizer.test.cjs.map +1 -0
- package/dist/tokenize/tokenizer.test.d.ts +2 -0
- package/dist/tokenize/tokenizer.test.d.ts.map +1 -0
- package/dist/tokenize/tokenizer.test.js +219 -0
- package/dist/tokenize/tokenizer.test.js.map +1 -0
- package/dist/transcription.cjs +131 -0
- package/dist/transcription.cjs.map +1 -0
- package/dist/transcription.js +99 -96
- package/dist/transcription.js.map +1 -1
- package/dist/tts/index.cjs +38 -0
- package/dist/tts/index.cjs.map +1 -0
- package/dist/tts/index.js +13 -5
- package/dist/tts/index.js.map +1 -1
- package/dist/tts/stream_adapter.cjs +78 -0
- package/dist/tts/stream_adapter.cjs.map +1 -0
- package/dist/tts/stream_adapter.js +50 -47
- package/dist/tts/stream_adapter.js.map +1 -1
- package/dist/tts/tts.cjs +127 -0
- package/dist/tts/tts.cjs.map +1 -0
- package/dist/tts/tts.js +90 -120
- package/dist/tts/tts.js.map +1 -1
- package/dist/utils.cjs +284 -0
- package/dist/utils.cjs.map +1 -0
- package/dist/utils.js +242 -247
- package/dist/utils.js.map +1 -1
- package/dist/vad.cjs +92 -0
- package/dist/vad.cjs.map +1 -0
- package/dist/vad.js +57 -52
- package/dist/vad.js.map +1 -1
- package/dist/version.cjs +29 -0
- package/dist/version.cjs.map +1 -0
- package/dist/version.js +4 -4
- package/dist/version.js.map +1 -1
- package/dist/worker.cjs +577 -0
- package/dist/worker.cjs.map +1 -0
- package/dist/worker.d.ts +1 -1
- package/dist/worker.d.ts.map +1 -1
- package/dist/worker.js +512 -484
- package/dist/worker.js.map +1 -1
- package/package.json +18 -8
- package/src/ipc/job_main.ts +66 -64
- package/src/job.ts +3 -2
- package/src/pipeline/pipeline_agent.ts +23 -23
- package/src/tokenize/basic/basic.ts +1 -1
- package/src/tokenize/basic/sentence.ts +14 -8
- package/src/tokenize/tokenizer.test.ts +255 -0
- package/src/worker.ts +1 -0
|
@@ -1,60 +1,65 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
1
|
+
const splitSentences = (text, minLength = 20) => {
|
|
2
|
+
const alphabets = /([A-Za-z])/g;
|
|
3
|
+
const prefixes = /(Mr|St|Mrs|Ms|Dr)[.]/g;
|
|
4
|
+
const suffixes = /(Inc|Ltd|Jr|Sr|Co)/g;
|
|
5
|
+
const starters = /(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)/g;
|
|
6
|
+
const acronyms = /([A-Z][.][A-Z][.](?:[A-Z][.])?)/g;
|
|
7
|
+
const websites = /[.](com|net|org|io|gov|edu|me)/g;
|
|
8
|
+
const digits = /([0-9])/g;
|
|
9
|
+
const dots = /\.{2,}/g;
|
|
10
|
+
text = text.replaceAll("\n", " ");
|
|
11
|
+
text = text.replaceAll(prefixes, "$1<prd>");
|
|
12
|
+
text = text.replaceAll(websites, "<prd>$2");
|
|
13
|
+
text = text.replaceAll(new RegExp(`${digits.source}[.]${digits.source}`, "g"), "$1<prd>$2");
|
|
14
|
+
text = text.replaceAll(dots, (match) => "<prd>".repeat(match.length));
|
|
15
|
+
text = text.replaceAll("Ph.D.", "Ph<prd>D<prd>");
|
|
16
|
+
text = text.replaceAll(new RegExp(`\\s${alphabets.source}[.] `, "g"), " $1<prd> ");
|
|
17
|
+
text = text.replaceAll(new RegExp(`${acronyms.source} ${starters.source}`, "g"), "$1<stop> $2");
|
|
18
|
+
text = text.replaceAll(
|
|
19
|
+
new RegExp(`${alphabets.source}[.]${alphabets.source}[.]${alphabets.source}[.]`, "g"),
|
|
20
|
+
"$1<prd>$2<prd>$3<prd>"
|
|
21
|
+
);
|
|
22
|
+
text = text.replaceAll(
|
|
23
|
+
new RegExp(`${alphabets.source}[.]${alphabets.source}[.]`, "g"),
|
|
24
|
+
"$1<prd>$2<prd>"
|
|
25
|
+
);
|
|
26
|
+
text = text.replaceAll(
|
|
27
|
+
new RegExp(` ${suffixes.source}[.] ${starters.source}`, "g"),
|
|
28
|
+
"$1<stop> $2"
|
|
29
|
+
);
|
|
30
|
+
text = text.replaceAll(new RegExp(` ${suffixes.source}[.]`, "g"), "$1<prd>");
|
|
31
|
+
text = text.replaceAll(new RegExp(` ${alphabets.source}[.]`, "g"), "$1<prd>");
|
|
32
|
+
text = text.replaceAll(".\u201D", "\u201D.");
|
|
33
|
+
text = text.replaceAll('."', '".');
|
|
34
|
+
text = text.replaceAll('!"', '"!');
|
|
35
|
+
text = text.replaceAll('?"', '"?');
|
|
36
|
+
text = text.replaceAll(".", ".<stop>");
|
|
37
|
+
text = text.replaceAll("?", "?<stop>");
|
|
38
|
+
text = text.replaceAll("!", "!<stop>");
|
|
39
|
+
text = text.replaceAll("<prd>", ".");
|
|
40
|
+
const split = text.split("<stop>");
|
|
41
|
+
text = text.replaceAll("<stop>", "");
|
|
42
|
+
const sentences = [];
|
|
43
|
+
let buf = "";
|
|
44
|
+
let start = 0;
|
|
45
|
+
let end = 0;
|
|
46
|
+
for (const match of split) {
|
|
47
|
+
const sentence = match.trim();
|
|
48
|
+
if (!sentence) continue;
|
|
49
|
+
buf += " " + sentence;
|
|
50
|
+
end += match.length;
|
|
51
|
+
if (buf.length > minLength) {
|
|
52
|
+
sentences.push([buf.slice(1), start, end]);
|
|
53
|
+
start = end;
|
|
54
|
+
buf = "";
|
|
54
55
|
}
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
56
|
+
}
|
|
57
|
+
if (buf) {
|
|
58
|
+
sentences.push([buf.slice(1), start, text.length - 1]);
|
|
59
|
+
}
|
|
60
|
+
return sentences;
|
|
61
|
+
};
|
|
62
|
+
export {
|
|
63
|
+
splitSentences
|
|
59
64
|
};
|
|
60
65
|
//# sourceMappingURL=sentence.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"
|
|
1
|
+
{"version":3,"sources":["../../../src/tokenize/basic/sentence.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\n\n/**\n * Split the text into sentences.\n */\nexport const splitSentences = (text: string, minLength = 20): [string, number, number][] => {\n const alphabets = /([A-Za-z])/g;\n const prefixes = /(Mr|St|Mrs|Ms|Dr)[.]/g;\n const suffixes = /(Inc|Ltd|Jr|Sr|Co)/g;\n const starters =\n /(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\\s|She\\s|It\\s|They\\s|Their\\s|Our\\s|We\\s|But\\s|However\\s|That\\s|This\\s|Wherever)/g;\n const acronyms = /([A-Z][.][A-Z][.](?:[A-Z][.])?)/g;\n const websites = /[.](com|net|org|io|gov|edu|me)/g;\n const digits = /([0-9])/g;\n const dots = /\\.{2,}/g;\n\n text = text.replaceAll('\\n', ' ');\n text = text.replaceAll(prefixes, '$1<prd>');\n text = text.replaceAll(websites, '<prd>$2');\n text = text.replaceAll(new RegExp(`${digits.source}[.]${digits.source}`, 'g'), '$1<prd>$2');\n text = text.replaceAll(dots, (match) => '<prd>'.repeat(match.length));\n text = text.replaceAll('Ph.D.', 'Ph<prd>D<prd>');\n text = text.replaceAll(new RegExp(`\\\\s${alphabets.source}[.] `, 'g'), ' $1<prd> ');\n text = text.replaceAll(new RegExp(`${acronyms.source} ${starters.source}`, 'g'), '$1<stop> $2');\n text = text.replaceAll(\n new RegExp(`${alphabets.source}[.]${alphabets.source}[.]${alphabets.source}[.]`, 'g'),\n '$1<prd>$2<prd>$3<prd>',\n );\n text = text.replaceAll(\n new RegExp(`${alphabets.source}[.]${alphabets.source}[.]`, 'g'),\n '$1<prd>$2<prd>',\n );\n text = text.replaceAll(\n new RegExp(` ${suffixes.source}[.] 
${starters.source}`, 'g'),\n '$1<stop> $2',\n );\n text = text.replaceAll(new RegExp(` ${suffixes.source}[.]`, 'g'), '$1<prd>');\n text = text.replaceAll(new RegExp(` ${alphabets.source}[.]`, 'g'), '$1<prd>');\n text = text.replaceAll('.”', '”.');\n text = text.replaceAll('.\"', '\".');\n text = text.replaceAll('!\"', '\"!');\n text = text.replaceAll('?\"', '\"?');\n text = text.replaceAll('.', '.<stop>');\n text = text.replaceAll('?', '?<stop>');\n text = text.replaceAll('!', '!<stop>');\n text = text.replaceAll('<prd>', '.');\n\n const split = text.split('<stop>');\n text = text.replaceAll('<stop>', '');\n\n const sentences: [string, number, number][] = [];\n let buf = '';\n let start = 0;\n let end = 0;\n for (const match of split) {\n const sentence = match.trim();\n if (!sentence) continue;\n\n buf += ' ' + sentence;\n end += match.length;\n if (buf.length > minLength) {\n sentences.push([buf.slice(1), start, end]);\n start = end;\n buf = '';\n }\n }\n\n if (buf) {\n sentences.push([buf.slice(1), start, text.length - 1]);\n }\n\n return 
sentences;\n};\n"],"mappings":"AAOO,MAAM,iBAAiB,CAAC,MAAc,YAAY,OAAmC;AAC1F,QAAM,YAAY;AAClB,QAAM,WAAW;AACjB,QAAM,WAAW;AACjB,QAAM,WACJ;AACF,QAAM,WAAW;AACjB,QAAM,WAAW;AACjB,QAAM,SAAS;AACf,QAAM,OAAO;AAEb,SAAO,KAAK,WAAW,MAAM,GAAG;AAChC,SAAO,KAAK,WAAW,UAAU,SAAS;AAC1C,SAAO,KAAK,WAAW,UAAU,SAAS;AAC1C,SAAO,KAAK,WAAW,IAAI,OAAO,GAAG,OAAO,MAAM,MAAM,OAAO,MAAM,IAAI,GAAG,GAAG,WAAW;AAC1F,SAAO,KAAK,WAAW,MAAM,CAAC,UAAU,QAAQ,OAAO,MAAM,MAAM,CAAC;AACpE,SAAO,KAAK,WAAW,SAAS,eAAe;AAC/C,SAAO,KAAK,WAAW,IAAI,OAAO,MAAM,UAAU,MAAM,QAAQ,GAAG,GAAG,WAAW;AACjF,SAAO,KAAK,WAAW,IAAI,OAAO,GAAG,SAAS,MAAM,IAAI,SAAS,MAAM,IAAI,GAAG,GAAG,aAAa;AAC9F,SAAO,KAAK;AAAA,IACV,IAAI,OAAO,GAAG,UAAU,MAAM,MAAM,UAAU,MAAM,MAAM,UAAU,MAAM,OAAO,GAAG;AAAA,IACpF;AAAA,EACF;AACA,SAAO,KAAK;AAAA,IACV,IAAI,OAAO,GAAG,UAAU,MAAM,MAAM,UAAU,MAAM,OAAO,GAAG;AAAA,IAC9D;AAAA,EACF;AACA,SAAO,KAAK;AAAA,IACV,IAAI,OAAO,IAAI,SAAS,MAAM,OAAO,SAAS,MAAM,IAAI,GAAG;AAAA,IAC3D;AAAA,EACF;AACA,SAAO,KAAK,WAAW,IAAI,OAAO,IAAI,SAAS,MAAM,OAAO,GAAG,GAAG,SAAS;AAC3E,SAAO,KAAK,WAAW,IAAI,OAAO,IAAI,UAAU,MAAM,OAAO,GAAG,GAAG,SAAS;AAC5E,SAAO,KAAK,WAAW,WAAM,SAAI;AACjC,SAAO,KAAK,WAAW,MAAM,IAAI;AACjC,SAAO,KAAK,WAAW,MAAM,IAAI;AACjC,SAAO,KAAK,WAAW,MAAM,IAAI;AACjC,SAAO,KAAK,WAAW,KAAK,SAAS;AACrC,SAAO,KAAK,WAAW,KAAK,SAAS;AACrC,SAAO,KAAK,WAAW,KAAK,SAAS;AACrC,SAAO,KAAK,WAAW,SAAS,GAAG;AAEnC,QAAM,QAAQ,KAAK,MAAM,QAAQ;AACjC,SAAO,KAAK,WAAW,UAAU,EAAE;AAEnC,QAAM,YAAwC,CAAC;AAC/C,MAAI,MAAM;AACV,MAAI,QAAQ;AACZ,MAAI,MAAM;AACV,aAAW,SAAS,OAAO;AACzB,UAAM,WAAW,MAAM,KAAK;AAC5B,QAAI,CAAC,SAAU;AAEf,WAAO,MAAM;AACb,WAAO,MAAM;AACb,QAAI,IAAI,SAAS,WAAW;AAC1B,gBAAU,KAAK,CAAC,IAAI,MAAM,CAAC,GAAG,OAAO,GAAG,CAAC;AACzC,cAAQ;AACR,YAAM;AAAA,IACR;AAAA,EACF;AAEA,MAAI,KAAK;AACP,cAAU,KAAK,CAAC,IAAI,MAAM,CAAC,GAAG,OAAO,KAAK,SAAS,CAAC,CAAC;AAAA,EACvD;AAEA,SAAO;AACT;","names":[]}
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __defProp = Object.defineProperty;
|
|
3
|
+
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
|
|
4
|
+
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
5
|
+
var __hasOwnProp = Object.prototype.hasOwnProperty;
|
|
6
|
+
var __export = (target, all) => {
|
|
7
|
+
for (var name in all)
|
|
8
|
+
__defProp(target, name, { get: all[name], enumerable: true });
|
|
9
|
+
};
|
|
10
|
+
var __copyProps = (to, from, except, desc) => {
|
|
11
|
+
if (from && typeof from === "object" || typeof from === "function") {
|
|
12
|
+
for (let key of __getOwnPropNames(from))
|
|
13
|
+
if (!__hasOwnProp.call(to, key) && key !== except)
|
|
14
|
+
__defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
|
|
15
|
+
}
|
|
16
|
+
return to;
|
|
17
|
+
};
|
|
18
|
+
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
|
|
19
|
+
var word_exports = {};
|
|
20
|
+
__export(word_exports, {
|
|
21
|
+
splitWords: () => splitWords
|
|
22
|
+
});
|
|
23
|
+
module.exports = __toCommonJS(word_exports);
|
|
24
|
+
var import_tokenizer = require("../tokenizer.cjs");
|
|
25
|
+
const splitWords = (text, ignorePunctuation = true) => {
|
|
26
|
+
const re = /\S+/g;
|
|
27
|
+
const words = [];
|
|
28
|
+
let arr;
|
|
29
|
+
while ((arr = re.exec(text)) !== null) {
|
|
30
|
+
let word = arr[0];
|
|
31
|
+
const start = arr.index;
|
|
32
|
+
const end = start + word.length;
|
|
33
|
+
if (ignorePunctuation) {
|
|
34
|
+
word = word.replace(new RegExp(`[${import_tokenizer.PUNCTUATIONS.join("")}]`, "g"), "");
|
|
35
|
+
}
|
|
36
|
+
words.push([word, start, end]);
|
|
37
|
+
}
|
|
38
|
+
return words;
|
|
39
|
+
};
|
|
40
|
+
// Annotate the CommonJS export names for ESM import in node:
|
|
41
|
+
0 && (module.exports = {
|
|
42
|
+
splitWords
|
|
43
|
+
});
|
|
44
|
+
//# sourceMappingURL=word.cjs.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../../../src/tokenize/basic/word.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport { PUNCTUATIONS } from '../tokenizer.js';\n\n/**\n * Split the text into words.\n */\nexport const splitWords = (text: string, ignorePunctuation = true): [string, number, number][] => {\n const re = /\\S+/g;\n const words: [string, number, number][] = [];\n\n let arr;\n while ((arr = re.exec(text)) !== null) {\n let word = arr[0];\n const start = arr.index;\n const end = start + word.length;\n\n if (ignorePunctuation) {\n word = word.replace(new RegExp(`[${PUNCTUATIONS.join('')}]`, 'g'), '');\n }\n\n words.push([word, start, end]);\n }\n\n return words;\n};\n"],"mappings":";;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAGA,uBAA6B;AAKtB,MAAM,aAAa,CAAC,MAAc,oBAAoB,SAAqC;AAChG,QAAM,KAAK;AACX,QAAM,QAAoC,CAAC;AAE3C,MAAI;AACJ,UAAQ,MAAM,GAAG,KAAK,IAAI,OAAO,MAAM;AACrC,QAAI,OAAO,IAAI,CAAC;AAChB,UAAM,QAAQ,IAAI;AAClB,UAAM,MAAM,QAAQ,KAAK;AAEzB,QAAI,mBAAmB;AACrB,aAAO,KAAK,QAAQ,IAAI,OAAO,IAAI,8BAAa,KAAK,EAAE,CAAC,KAAK,GAAG,GAAG,EAAE;AAAA,IACvE;AAEA,UAAM,KAAK,CAAC,MAAM,OAAO,GAAG,CAAC;AAAA,EAC/B;AAEA,SAAO;AACT;","names":[]}
|
|
@@ -1,23 +1,20 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
const
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
while ((arr = re.exec(text)) !== null) {
|
|
13
|
-
let word = arr[0];
|
|
14
|
-
const start = arr.index;
|
|
15
|
-
const end = start + word.length;
|
|
16
|
-
if (ignorePunctuation) {
|
|
17
|
-
word = word.replace(new RegExp(`[${PUNCTUATIONS.join('')}]`, 'g'), '');
|
|
18
|
-
}
|
|
19
|
-
words.push([word, start, end]);
|
|
1
|
+
import { PUNCTUATIONS } from "../tokenizer.js";
|
|
2
|
+
const splitWords = (text, ignorePunctuation = true) => {
|
|
3
|
+
const re = /\S+/g;
|
|
4
|
+
const words = [];
|
|
5
|
+
let arr;
|
|
6
|
+
while ((arr = re.exec(text)) !== null) {
|
|
7
|
+
let word = arr[0];
|
|
8
|
+
const start = arr.index;
|
|
9
|
+
const end = start + word.length;
|
|
10
|
+
if (ignorePunctuation) {
|
|
11
|
+
word = word.replace(new RegExp(`[${PUNCTUATIONS.join("")}]`, "g"), "");
|
|
20
12
|
}
|
|
21
|
-
|
|
13
|
+
words.push([word, start, end]);
|
|
14
|
+
}
|
|
15
|
+
return words;
|
|
16
|
+
};
|
|
17
|
+
export {
|
|
18
|
+
splitWords
|
|
22
19
|
};
|
|
23
20
|
//# sourceMappingURL=word.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"
|
|
1
|
+
{"version":3,"sources":["../../../src/tokenize/basic/word.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport { PUNCTUATIONS } from '../tokenizer.js';\n\n/**\n * Split the text into words.\n */\nexport const splitWords = (text: string, ignorePunctuation = true): [string, number, number][] => {\n const re = /\\S+/g;\n const words: [string, number, number][] = [];\n\n let arr;\n while ((arr = re.exec(text)) !== null) {\n let word = arr[0];\n const start = arr.index;\n const end = start + word.length;\n\n if (ignorePunctuation) {\n word = word.replace(new RegExp(`[${PUNCTUATIONS.join('')}]`, 'g'), '');\n }\n\n words.push([word, start, end]);\n }\n\n return words;\n};\n"],"mappings":"AAGA,SAAS,oBAAoB;AAKtB,MAAM,aAAa,CAAC,MAAc,oBAAoB,SAAqC;AAChG,QAAM,KAAK;AACX,QAAM,QAAoC,CAAC;AAE3C,MAAI;AACJ,UAAQ,MAAM,GAAG,KAAK,IAAI,OAAO,MAAM;AACrC,QAAI,OAAO,IAAI,CAAC;AAChB,UAAM,QAAQ,IAAI;AAClB,UAAM,MAAM,QAAQ,KAAK;AAEzB,QAAI,mBAAmB;AACrB,aAAO,KAAK,QAAQ,IAAI,OAAO,IAAI,aAAa,KAAK,EAAE,CAAC,KAAK,GAAG,GAAG,EAAE;AAAA,IACvE;AAEA,UAAM,KAAK,CAAC,MAAM,OAAO,GAAG,CAAC;AAAA,EAC/B;AAEA,SAAO;AACT;","names":[]}
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __create = Object.create;
|
|
3
|
+
var __defProp = Object.defineProperty;
|
|
4
|
+
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
|
|
5
|
+
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
6
|
+
var __getProtoOf = Object.getPrototypeOf;
|
|
7
|
+
var __hasOwnProp = Object.prototype.hasOwnProperty;
|
|
8
|
+
var __export = (target, all) => {
|
|
9
|
+
for (var name in all)
|
|
10
|
+
__defProp(target, name, { get: all[name], enumerable: true });
|
|
11
|
+
};
|
|
12
|
+
var __copyProps = (to, from, except, desc) => {
|
|
13
|
+
if (from && typeof from === "object" || typeof from === "function") {
|
|
14
|
+
for (let key of __getOwnPropNames(from))
|
|
15
|
+
if (!__hasOwnProp.call(to, key) && key !== except)
|
|
16
|
+
__defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
|
|
17
|
+
}
|
|
18
|
+
return to;
|
|
19
|
+
};
|
|
20
|
+
var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
|
|
21
|
+
// If the importer is in node compatibility mode or this is not an ESM
|
|
22
|
+
// file that has been converted to a CommonJS file using a Babel-
|
|
23
|
+
// compatible transform (i.e. "__esModule" has not been set), then set
|
|
24
|
+
// "default" to the CommonJS "module.exports" for node compatibility.
|
|
25
|
+
isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
|
|
26
|
+
mod
|
|
27
|
+
));
|
|
28
|
+
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
|
|
29
|
+
var tokenize_exports = {};
|
|
30
|
+
__export(tokenize_exports, {
|
|
31
|
+
BufferedSentenceStream: () => import_token_stream.BufferedSentenceStream,
|
|
32
|
+
BufferedTokenStream: () => import_token_stream.BufferedTokenStream,
|
|
33
|
+
BufferedWordStream: () => import_token_stream.BufferedWordStream,
|
|
34
|
+
SentenceStream: () => import_tokenizer.SentenceStream,
|
|
35
|
+
SentenceTokenizer: () => import_tokenizer.SentenceTokenizer,
|
|
36
|
+
WordStream: () => import_tokenizer.WordStream,
|
|
37
|
+
WordTokenizer: () => import_tokenizer.WordTokenizer,
|
|
38
|
+
basic: () => basic
|
|
39
|
+
});
|
|
40
|
+
module.exports = __toCommonJS(tokenize_exports);
|
|
41
|
+
var basic = __toESM(require("./basic/index.cjs"), 1);
|
|
42
|
+
var import_tokenizer = require("./tokenizer.cjs");
|
|
43
|
+
var import_token_stream = require("./token_stream.cjs");
|
|
44
|
+
// Annotate the CommonJS export names for ESM import in node:
|
|
45
|
+
0 && (module.exports = {
|
|
46
|
+
BufferedSentenceStream,
|
|
47
|
+
BufferedTokenStream,
|
|
48
|
+
BufferedWordStream,
|
|
49
|
+
SentenceStream,
|
|
50
|
+
SentenceTokenizer,
|
|
51
|
+
WordStream,
|
|
52
|
+
WordTokenizer,
|
|
53
|
+
basic
|
|
54
|
+
});
|
|
55
|
+
//# sourceMappingURL=index.cjs.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../../src/tokenize/index.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport * as basic from './basic/index.js';\n\nexport {\n type TokenData,\n SentenceTokenizer,\n SentenceStream,\n WordTokenizer,\n WordStream,\n} from './tokenizer.js';\n\nexport { BufferedSentenceStream, BufferedTokenStream, BufferedWordStream } from './token_stream.js';\n\nexport { basic };\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAGA,YAAuB;AAEvB,uBAMO;AAEP,0BAAgF;","names":[]}
|
package/dist/tokenize/index.js
CHANGED
|
@@ -1,8 +1,19 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
1
|
+
import * as basic from "./basic/index.js";
|
|
2
|
+
import {
|
|
3
|
+
SentenceTokenizer,
|
|
4
|
+
SentenceStream,
|
|
5
|
+
WordTokenizer,
|
|
6
|
+
WordStream
|
|
7
|
+
} from "./tokenizer.js";
|
|
8
|
+
import { BufferedSentenceStream, BufferedTokenStream, BufferedWordStream } from "./token_stream.js";
|
|
9
|
+
export {
|
|
10
|
+
BufferedSentenceStream,
|
|
11
|
+
BufferedTokenStream,
|
|
12
|
+
BufferedWordStream,
|
|
13
|
+
SentenceStream,
|
|
14
|
+
SentenceTokenizer,
|
|
15
|
+
WordStream,
|
|
16
|
+
WordTokenizer,
|
|
17
|
+
basic
|
|
18
|
+
};
|
|
8
19
|
//# sourceMappingURL=index.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"
|
|
1
|
+
{"version":3,"sources":["../../src/tokenize/index.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport * as basic from './basic/index.js';\n\nexport {\n type TokenData,\n SentenceTokenizer,\n SentenceStream,\n WordTokenizer,\n WordStream,\n} from './tokenizer.js';\n\nexport { BufferedSentenceStream, BufferedTokenStream, BufferedWordStream } from './token_stream.js';\n\nexport { basic };\n"],"mappings":"AAGA,YAAY,WAAW;AAEvB;AAAA,EAEE;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,OACK;AAEP,SAAS,wBAAwB,qBAAqB,0BAA0B;","names":[]}
|
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __defProp = Object.defineProperty;
|
|
3
|
+
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
|
|
4
|
+
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
5
|
+
var __hasOwnProp = Object.prototype.hasOwnProperty;
|
|
6
|
+
var __export = (target, all) => {
|
|
7
|
+
for (var name in all)
|
|
8
|
+
__defProp(target, name, { get: all[name], enumerable: true });
|
|
9
|
+
};
|
|
10
|
+
var __copyProps = (to, from, except, desc) => {
|
|
11
|
+
if (from && typeof from === "object" || typeof from === "function") {
|
|
12
|
+
for (let key of __getOwnPropNames(from))
|
|
13
|
+
if (!__hasOwnProp.call(to, key) && key !== except)
|
|
14
|
+
__defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
|
|
15
|
+
}
|
|
16
|
+
return to;
|
|
17
|
+
};
|
|
18
|
+
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
|
|
19
|
+
var token_stream_exports = {};
|
|
20
|
+
__export(token_stream_exports, {
|
|
21
|
+
BufferedSentenceStream: () => BufferedSentenceStream,
|
|
22
|
+
BufferedTokenStream: () => BufferedTokenStream,
|
|
23
|
+
BufferedWordStream: () => BufferedWordStream
|
|
24
|
+
});
|
|
25
|
+
module.exports = __toCommonJS(token_stream_exports);
|
|
26
|
+
var import_node_crypto = require("node:crypto");
|
|
27
|
+
var import_utils = require("../utils.cjs");
|
|
28
|
+
var import_tokenizer = require("./tokenizer.cjs");
|
|
29
|
+
class BufferedTokenStream {
  /** Queue of emitted { token, segmentId } items, drained via the async iterator. */
  queue = new import_utils.AsyncIterableQueue();
  /** Set once close() has run; pushText/flush/endInput throw afterwards. */
  closed = false;
  #func;
  #minTokenLength;
  #minContextLength;
  #inBuf = "";
  #outBuf = "";
  #currentSegmentId;
  /**
   * @param func - tokenizer; returns either an array of strings or an array of
   *   [text, start, end] tuples (start/end are offsets into the tokenized input)
   * @param minTokenLength - minimum accumulated text length before a token is emitted
   * @param minContextLength - minimum buffered input length before tokenizing runs
   */
  constructor(func, minTokenLength, minContextLength) {
    this.#func = func;
    this.#minTokenLength = minTokenLength;
    this.#minContextLength = minContextLength;
    this.#currentSegmentId = (0, import_node_crypto.randomUUID)();
  }
  /** Push a string of text into the token stream */
  pushText(text) {
    if (this.closed) {
      throw new Error("Stream is closed");
    }
    this.#inBuf += text;
    // wait for enough context before attempting to tokenize
    if (this.#inBuf.length < this.#minContextLength) return;
    while (true) {
      const tokens = this.#func(this.#inBuf);
      // keep the last (possibly incomplete) token buffered for the next push
      if (tokens.length <= 1) break;
      if (this.#outBuf) this.#outBuf += " ";
      const tok = tokens.shift();
      let tokText = tok;
      // tuple form [text, start, end]: extract the text component
      if (tok.length > 1 && typeof tok[1] === "number") {
        tokText = tok[0];
      }
      this.#outBuf += tokText;
      if (this.#outBuf.length >= this.#minTokenLength) {
        this.queue.put({ token: this.#outBuf, segmentId: this.#currentSegmentId });
        this.#outBuf = "";
      }
      if (typeof tok !== "string") {
        // tuple form: drop everything up to the token's end offset
        this.#inBuf = this.#inBuf.slice(tok[2]);
      } else {
        // string form: drop the consumed token plus any leading whitespace
        this.#inBuf = this.#inBuf.slice(Math.max(0, this.#inBuf.indexOf(tok)) + tok.length).trimStart();
      }
    }
  }
  /** Flush the stream, causing it to process all pending text */
  flush() {
    if (this.closed) {
      throw new Error("Stream is closed");
    }
    if (this.#inBuf || this.#outBuf) {
      const tokens = this.#func(this.#inBuf);
      // FIX: was `if (tokens)`, which is always truthy for an array, so the
      // guard never fired and a stray " " separator could be appended to
      // #outBuf even when the tokenizer produced no tokens.
      if (tokens && tokens.length > 0) {
        if (this.#outBuf) this.#outBuf += " ";
        if (typeof tokens[0] !== "string") {
          // tuple form: join the text components
          this.#outBuf += tokens.map((tok) => tok[0]).join(" ");
        } else {
          this.#outBuf += tokens.join(" ");
        }
      }
      if (this.#outBuf) {
        this.queue.put({ token: this.#outBuf, segmentId: this.#currentSegmentId });
      }
      // a flush ends the current segment; subsequent tokens get a fresh id
      this.#currentSegmentId = (0, import_node_crypto.randomUUID)();
    }
    this.#inBuf = "";
    this.#outBuf = "";
  }
  /** Mark the input as ended and forbid additional pushes */
  endInput() {
    if (this.closed) {
      throw new Error("Stream is closed");
    }
    this.flush();
    this.close();
  }
  next() {
    return this.queue.next();
  }
  /** Close both the input and output of the token stream */
  close() {
    this.queue.close();
    this.closed = true;
  }
  [Symbol.asyncIterator]() {
    return this;
  }
}
|
|
116
|
+
class BufferedSentenceStream extends import_tokenizer.SentenceStream {
  #stream;
  /**
   * @param func - sentence tokenizer passed through to BufferedTokenStream
   * @param minTokenLength - minimum emitted token length
   * @param minContextLength - minimum buffered input before tokenizing
   */
  constructor(func, minTokenLength, minContextLength) {
    super();
    this.#stream = new BufferedTokenStream(func, minTokenLength, minContextLength);
  }
  /** Push a string of text into the sentence stream */
  pushText(text) {
    this.#stream.pushText(text);
  }
  /** Flush the stream, causing it to process all pending text */
  flush() {
    this.#stream.flush();
  }
  /**
   * Mark the input as ended and forbid additional pushes.
   * FIX(consistency): this forwarding method was missing, unlike in
   * BufferedWordStream, so ending input was never relayed to the wrapped
   * BufferedTokenStream — its pending text went unflushed and its queue
   * was never closed.
   */
  endInput() {
    this.#stream.endInput();
  }
  close() {
    super.close();
    this.#stream.close();
  }
  next() {
    return this.#stream.next();
  }
}
|
|
136
|
+
class BufferedWordStream extends import_tokenizer.WordStream {
  #stream;
  /**
   * @param func - word tokenizer passed through to BufferedTokenStream
   * @param minTokenLength - minimum emitted token length
   * @param minContextLength - minimum buffered input before tokenizing
   */
  constructor(func, minTokenLength, minContextLength) {
    super();
    this.#stream = new BufferedTokenStream(func, minTokenLength, minContextLength);
  }
  /** Push a string of text into the word stream */
  pushText(text) {
    this.#stream.pushText(text);
  }
  /** Flush the stream, causing it to process all pending text */
  flush() {
    this.#stream.flush();
  }
  /** Mark the input as ended and forbid additional pushes */
  endInput() {
    this.#stream.endInput();
  }
  close() {
    // FIX(consistency): also close the base WordStream, matching
    // BufferedSentenceStream.close(); previously only the wrapped
    // BufferedTokenStream was closed here.
    super.close();
    this.#stream.close();
  }
  next() {
    return this.#stream.next();
  }
}
|
|
158
|
+
// Annotate the CommonJS export names for ESM import in node:
// (the `0 &&` makes this dead code at runtime; it exists only so static
// analysis can detect the named exports for `import { ... }` from ESM)
0 && (module.exports = {
  BufferedSentenceStream,
  BufferedTokenStream,
  BufferedWordStream
});
//# sourceMappingURL=token_stream.cjs.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../../src/tokenize/token_stream.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport { randomUUID } from 'node:crypto';\nimport { AsyncIterableQueue } from '../utils.js';\nimport type { TokenData } from './tokenizer.js';\nimport { SentenceStream, WordStream } from './tokenizer.js';\n\ntype TokenizeFunc = (x: string) => string[] | [string, number, number][];\n\nexport class BufferedTokenStream implements AsyncIterableIterator<TokenData> {\n protected queue = new AsyncIterableQueue<TokenData>();\n protected closed = false;\n\n #func: TokenizeFunc;\n #minTokenLength: number;\n #minContextLength: number;\n #bufTokens: string[] = [];\n #inBuf = '';\n #outBuf = '';\n #currentSegmentId: string;\n\n constructor(func: TokenizeFunc, minTokenLength: number, minContextLength: number) {\n this.#func = func;\n this.#minTokenLength = minTokenLength;\n this.#minContextLength = minContextLength;\n\n this.#currentSegmentId = randomUUID();\n }\n\n /** Push a string of text into the token stream */\n pushText(text: string) {\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n\n this.#inBuf += text;\n if (this.#inBuf.length < this.#minContextLength) return;\n\n while (true) {\n const tokens = this.#func(this.#inBuf);\n if (tokens.length <= 1) break;\n\n if (this.#outBuf) this.#outBuf += ' ';\n\n const tok = tokens.shift()!;\n let tokText = tok as string;\n if (tok.length > 1 && typeof tok[1] === 'number') {\n tokText = tok[0];\n }\n\n this.#outBuf += tokText;\n if (this.#outBuf.length >= this.#minTokenLength) {\n this.queue.put({ token: this.#outBuf, segmentId: this.#currentSegmentId });\n this.#outBuf = '';\n }\n\n if (typeof tok! 
!== 'string') {\n this.#inBuf = this.#inBuf.slice(tok![2]);\n } else {\n this.#inBuf = this.#inBuf\n .slice(Math.max(0, this.#inBuf.indexOf(tok)) + tok.length)\n .trimStart();\n }\n }\n }\n\n /** Flush the stream, causing it to process all pending text */\n flush() {\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n\n if (this.#inBuf || this.#outBuf) {\n const tokens = this.#func(this.#inBuf);\n if (tokens) {\n if (this.#outBuf) this.#outBuf += ' ';\n\n if (typeof tokens[0] !== 'string') {\n this.#outBuf += tokens.map((tok) => tok[0]).join(' ');\n } else {\n this.#outBuf += tokens.join(' ');\n }\n }\n\n if (this.#outBuf) {\n this.queue.put({ token: this.#outBuf, segmentId: this.#currentSegmentId });\n }\n\n this.#currentSegmentId = randomUUID();\n }\n\n this.#inBuf = '';\n this.#outBuf = '';\n }\n\n /** Mark the input as ended and forbid additional pushes */\n endInput() {\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.flush();\n this.close();\n }\n\n next(): Promise<IteratorResult<TokenData>> {\n return this.queue.next();\n }\n\n /** Close both the input and output of the token stream */\n close() {\n this.queue.close();\n this.closed = true;\n }\n\n [Symbol.asyncIterator](): BufferedTokenStream {\n return this;\n }\n}\n\nexport class BufferedSentenceStream extends SentenceStream {\n #stream: BufferedTokenStream;\n\n constructor(func: TokenizeFunc, minTokenLength: number, minContextLength: number) {\n super();\n this.#stream = new BufferedTokenStream(func, minTokenLength, minContextLength);\n }\n\n pushText(text: string) {\n this.#stream.pushText(text);\n }\n\n flush() {\n this.#stream.flush();\n }\n\n close() {\n super.close();\n this.#stream.close();\n }\n\n next(): Promise<IteratorResult<TokenData>> {\n return this.#stream.next();\n }\n}\n\nexport class BufferedWordStream extends WordStream {\n #stream: BufferedTokenStream;\n\n constructor(func: TokenizeFunc, minTokenLength: number, minContextLength: number) {\n 
super();\n this.#stream = new BufferedTokenStream(func, minTokenLength, minContextLength);\n }\n\n pushText(text: string) {\n this.#stream.pushText(text);\n }\n\n flush() {\n this.#stream.flush();\n }\n\n endInput() {\n this.#stream.endInput();\n }\n\n close() {\n this.#stream.close();\n }\n\n next(): Promise<IteratorResult<TokenData>> {\n return this.#stream.next();\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAGA,yBAA2B;AAC3B,mBAAmC;AAEnC,uBAA2C;AAIpC,MAAM,oBAAgE;AAAA,EACjE,QAAQ,IAAI,gCAA8B;AAAA,EAC1C,SAAS;AAAA,EAEnB;AAAA,EACA;AAAA,EACA;AAAA,EACA,aAAuB,CAAC;AAAA,EACxB,SAAS;AAAA,EACT,UAAU;AAAA,EACV;AAAA,EAEA,YAAY,MAAoB,gBAAwB,kBAA0B;AAChF,SAAK,QAAQ;AACb,SAAK,kBAAkB;AACvB,SAAK,oBAAoB;AAEzB,SAAK,wBAAoB,+BAAW;AAAA,EACtC;AAAA;AAAA,EAGA,SAAS,MAAc;AACrB,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AAEA,SAAK,UAAU;AACf,QAAI,KAAK,OAAO,SAAS,KAAK,kBAAmB;AAEjD,WAAO,MAAM;AACX,YAAM,SAAS,KAAK,MAAM,KAAK,MAAM;AACrC,UAAI,OAAO,UAAU,EAAG;AAExB,UAAI,KAAK,QAAS,MAAK,WAAW;AAElC,YAAM,MAAM,OAAO,MAAM;AACzB,UAAI,UAAU;AACd,UAAI,IAAI,SAAS,KAAK,OAAO,IAAI,CAAC,MAAM,UAAU;AAChD,kBAAU,IAAI,CAAC;AAAA,MACjB;AAEA,WAAK,WAAW;AAChB,UAAI,KAAK,QAAQ,UAAU,KAAK,iBAAiB;AAC/C,aAAK,MAAM,IAAI,EAAE,OAAO,KAAK,SAAS,WAAW,KAAK,kBAAkB,CAAC;AACzE,aAAK,UAAU;AAAA,MACjB;AAEA,UAAI,OAAO,QAAS,UAAU;AAC5B,aAAK,SAAS,KAAK,OAAO,MAAM,IAAK,CAAC,CAAC;AAAA,MACzC,OAAO;AACL,aAAK,SAAS,KAAK,OAChB,MAAM,KAAK,IAAI,GAAG,KAAK,OAAO,QAAQ,GAAG,CAAC,IAAI,IAAI,MAAM,EACxD,UAAU;AAAA,MACf;AAAA,IACF;AAAA,EACF;AAAA;AAAA,EAGA,QAAQ;AACN,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AAEA,QAAI,KAAK,UAAU,KAAK,SAAS;AAC/B,YAAM,SAAS,KAAK,MAAM,KAAK,MAAM;AACrC,UAAI,QAAQ;AACV,YAAI,KAAK,QAAS,MAAK,WAAW;AAElC,YAAI,OAAO,OAAO,CAAC,MAAM,UAAU;AACjC,eAAK,WAAW,OAAO,IAAI,CAAC,QAAQ,IAAI,CAAC,CAAC,EAAE,KAAK,GAAG;AAAA,QACtD,OAAO;AACL,eAAK,WAAW,OAAO,KAAK,GAAG;AAAA,QACjC;AAAA,MACF;AAEA,UAAI,KAAK,SAAS;AAChB,aAAK,MAAM,IAAI,EAAE,OAAO,KAAK,SAAS,WAAW,KAAK,kBAAkB,CAAC;AAAA,MAC3E;AAEA,WAAK,wBAAoB,+BAAW;AAAA,IACtC;AAEA,SAAK,SAAS;AACd,SAAK,UA
AU;AAAA,EACjB;AAAA;AAAA,EAGA,WAAW;AACT,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,MAAM;AACX,SAAK,MAAM;AAAA,EACb;AAAA,EAEA,OAA2C;AACzC,WAAO,KAAK,MAAM,KAAK;AAAA,EACzB;AAAA;AAAA,EAGA,QAAQ;AACN,SAAK,MAAM,MAAM;AACjB,SAAK,SAAS;AAAA,EAChB;AAAA,EAEA,CAAC,OAAO,aAAa,IAAyB;AAC5C,WAAO;AAAA,EACT;AACF;AAEO,MAAM,+BAA+B,gCAAe;AAAA,EACzD;AAAA,EAEA,YAAY,MAAoB,gBAAwB,kBAA0B;AAChF,UAAM;AACN,SAAK,UAAU,IAAI,oBAAoB,MAAM,gBAAgB,gBAAgB;AAAA,EAC/E;AAAA,EAEA,SAAS,MAAc;AACrB,SAAK,QAAQ,SAAS,IAAI;AAAA,EAC5B;AAAA,EAEA,QAAQ;AACN,SAAK,QAAQ,MAAM;AAAA,EACrB;AAAA,EAEA,QAAQ;AACN,UAAM,MAAM;AACZ,SAAK,QAAQ,MAAM;AAAA,EACrB;AAAA,EAEA,OAA2C;AACzC,WAAO,KAAK,QAAQ,KAAK;AAAA,EAC3B;AACF;AAEO,MAAM,2BAA2B,4BAAW;AAAA,EACjD;AAAA,EAEA,YAAY,MAAoB,gBAAwB,kBAA0B;AAChF,UAAM;AACN,SAAK,UAAU,IAAI,oBAAoB,MAAM,gBAAgB,gBAAgB;AAAA,EAC/E;AAAA,EAEA,SAAS,MAAc;AACrB,SAAK,QAAQ,SAAS,IAAI;AAAA,EAC5B;AAAA,EAEA,QAAQ;AACN,SAAK,QAAQ,MAAM;AAAA,EACrB;AAAA,EAEA,WAAW;AACT,SAAK,QAAQ,SAAS;AAAA,EACxB;AAAA,EAEA,QAAQ;AACN,SAAK,QAAQ,MAAM;AAAA,EACrB;AAAA,EAEA,OAA2C;AACzC,WAAO,KAAK,QAAQ,KAAK;AAAA,EAC3B;AACF;","names":[]}
|