@livekit/agents 0.6.3 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +6 -1
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +3 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +3 -0
- package/dist/index.js.map +1 -1
- package/dist/inference_runner.cjs +38 -0
- package/dist/inference_runner.cjs.map +1 -0
- package/dist/inference_runner.d.ts +11 -0
- package/dist/inference_runner.d.ts.map +1 -0
- package/dist/inference_runner.js +14 -0
- package/dist/inference_runner.js.map +1 -0
- package/dist/ipc/index.cjs +23 -0
- package/dist/ipc/index.cjs.map +1 -0
- package/dist/ipc/index.d.ts +2 -0
- package/dist/ipc/index.d.ts.map +1 -0
- package/dist/ipc/index.js +2 -0
- package/dist/ipc/index.js.map +1 -0
- package/dist/ipc/inference_executor.cjs +17 -0
- package/dist/ipc/inference_executor.cjs.map +1 -0
- package/dist/ipc/inference_executor.d.ts +4 -0
- package/dist/ipc/inference_executor.d.ts.map +1 -0
- package/dist/ipc/inference_executor.js +1 -0
- package/dist/ipc/inference_executor.js.map +1 -0
- package/dist/ipc/inference_proc_executor.cjs +97 -0
- package/dist/ipc/inference_proc_executor.cjs.map +1 -0
- package/dist/ipc/inference_proc_executor.d.ts +23 -0
- package/dist/ipc/inference_proc_executor.d.ts.map +1 -0
- package/dist/ipc/inference_proc_executor.js +72 -0
- package/dist/ipc/inference_proc_executor.js.map +1 -0
- package/dist/ipc/inference_proc_lazy_main.cjs +90 -0
- package/dist/ipc/inference_proc_lazy_main.cjs.map +1 -0
- package/dist/ipc/inference_proc_lazy_main.d.ts +2 -0
- package/dist/ipc/inference_proc_lazy_main.d.ts.map +1 -0
- package/dist/ipc/inference_proc_lazy_main.js +67 -0
- package/dist/ipc/inference_proc_lazy_main.js.map +1 -0
- package/dist/ipc/job_executor.cjs +8 -7
- package/dist/ipc/job_executor.cjs.map +1 -1
- package/dist/ipc/job_executor.d.ts +14 -15
- package/dist/ipc/job_executor.d.ts.map +1 -1
- package/dist/ipc/job_executor.js +7 -6
- package/dist/ipc/job_executor.js.map +1 -1
- package/dist/ipc/job_proc_executor.cjs +108 -0
- package/dist/ipc/job_proc_executor.cjs.map +1 -0
- package/dist/ipc/job_proc_executor.d.ts +19 -0
- package/dist/ipc/job_proc_executor.d.ts.map +1 -0
- package/dist/ipc/job_proc_executor.js +83 -0
- package/dist/ipc/job_proc_executor.js.map +1 -0
- package/dist/ipc/{job_main.cjs → job_proc_lazy_main.cjs} +41 -36
- package/dist/ipc/job_proc_lazy_main.cjs.map +1 -0
- package/dist/ipc/job_proc_lazy_main.d.ts +2 -0
- package/dist/ipc/job_proc_lazy_main.d.ts.map +1 -0
- package/dist/ipc/{job_main.js → job_proc_lazy_main.js} +41 -11
- package/dist/ipc/job_proc_lazy_main.js.map +1 -0
- package/dist/ipc/message.cjs.map +1 -1
- package/dist/ipc/message.d.ts +17 -0
- package/dist/ipc/message.d.ts.map +1 -1
- package/dist/ipc/proc_pool.cjs +30 -4
- package/dist/ipc/proc_pool.cjs.map +1 -1
- package/dist/ipc/proc_pool.d.ts +5 -1
- package/dist/ipc/proc_pool.d.ts.map +1 -1
- package/dist/ipc/proc_pool.js +30 -4
- package/dist/ipc/proc_pool.js.map +1 -1
- package/dist/ipc/{proc_job_executor.cjs → supervised_proc.cjs} +58 -46
- package/dist/ipc/supervised_proc.cjs.map +1 -0
- package/dist/ipc/supervised_proc.d.ts +30 -0
- package/dist/ipc/supervised_proc.d.ts.map +1 -0
- package/dist/ipc/{proc_job_executor.js → supervised_proc.js} +54 -32
- package/dist/ipc/supervised_proc.js.map +1 -0
- package/dist/job.cjs +18 -1
- package/dist/job.cjs.map +1 -1
- package/dist/job.d.ts +9 -1
- package/dist/job.d.ts.map +1 -1
- package/dist/job.js +17 -1
- package/dist/job.js.map +1 -1
- package/dist/metrics/base.cjs +2 -2
- package/dist/metrics/base.cjs.map +1 -1
- package/dist/metrics/base.d.ts +1 -1
- package/dist/metrics/base.d.ts.map +1 -1
- package/dist/metrics/base.js +2 -2
- package/dist/metrics/base.js.map +1 -1
- package/dist/multimodal/agent_playout.cjs +13 -14
- package/dist/multimodal/agent_playout.cjs.map +1 -1
- package/dist/multimodal/agent_playout.d.ts +4 -4
- package/dist/multimodal/agent_playout.d.ts.map +1 -1
- package/dist/multimodal/agent_playout.js +13 -14
- package/dist/multimodal/agent_playout.js.map +1 -1
- package/dist/multimodal/multimodal_agent.cjs +12 -8
- package/dist/multimodal/multimodal_agent.cjs.map +1 -1
- package/dist/multimodal/multimodal_agent.d.ts.map +1 -1
- package/dist/multimodal/multimodal_agent.js +13 -9
- package/dist/multimodal/multimodal_agent.js.map +1 -1
- package/dist/pipeline/agent_output.cjs +20 -4
- package/dist/pipeline/agent_output.cjs.map +1 -1
- package/dist/pipeline/agent_output.d.ts +4 -2
- package/dist/pipeline/agent_output.d.ts.map +1 -1
- package/dist/pipeline/agent_output.js +20 -4
- package/dist/pipeline/agent_output.js.map +1 -1
- package/dist/pipeline/agent_playout.cjs +9 -3
- package/dist/pipeline/agent_playout.cjs.map +1 -1
- package/dist/pipeline/agent_playout.d.ts +4 -2
- package/dist/pipeline/agent_playout.d.ts.map +1 -1
- package/dist/pipeline/agent_playout.js +9 -3
- package/dist/pipeline/agent_playout.js.map +1 -1
- package/dist/pipeline/human_input.cjs +6 -0
- package/dist/pipeline/human_input.cjs.map +1 -1
- package/dist/pipeline/human_input.d.ts +3 -1
- package/dist/pipeline/human_input.d.ts.map +1 -1
- package/dist/pipeline/human_input.js +6 -0
- package/dist/pipeline/human_input.js.map +1 -1
- package/dist/pipeline/pipeline_agent.cjs +79 -12
- package/dist/pipeline/pipeline_agent.cjs.map +1 -1
- package/dist/pipeline/pipeline_agent.d.ts +8 -0
- package/dist/pipeline/pipeline_agent.d.ts.map +1 -1
- package/dist/pipeline/pipeline_agent.js +79 -12
- package/dist/pipeline/pipeline_agent.js.map +1 -1
- package/dist/stt/stream_adapter.cjs +16 -4
- package/dist/stt/stream_adapter.cjs.map +1 -1
- package/dist/stt/stream_adapter.d.ts.map +1 -1
- package/dist/stt/stream_adapter.js +16 -4
- package/dist/stt/stream_adapter.js.map +1 -1
- package/dist/tokenize/basic/basic.cjs +2 -0
- package/dist/tokenize/basic/basic.cjs.map +1 -1
- package/dist/tokenize/basic/basic.d.ts +2 -0
- package/dist/tokenize/basic/basic.d.ts.map +1 -1
- package/dist/tokenize/basic/basic.js +1 -0
- package/dist/tokenize/basic/basic.js.map +1 -1
- package/dist/tokenize/basic/index.cjs +2 -0
- package/dist/tokenize/basic/index.cjs.map +1 -1
- package/dist/tokenize/basic/index.d.ts +1 -1
- package/dist/tokenize/basic/index.d.ts.map +1 -1
- package/dist/tokenize/basic/index.js +8 -1
- package/dist/tokenize/basic/index.js.map +1 -1
- package/dist/tokenize/token_stream.cjs +5 -3
- package/dist/tokenize/token_stream.cjs.map +1 -1
- package/dist/tokenize/token_stream.d.ts.map +1 -1
- package/dist/tokenize/token_stream.js +5 -3
- package/dist/tokenize/token_stream.js.map +1 -1
- package/dist/transcription.cjs +203 -86
- package/dist/transcription.cjs.map +1 -1
- package/dist/transcription.d.ts +24 -17
- package/dist/transcription.d.ts.map +1 -1
- package/dist/transcription.js +201 -85
- package/dist/transcription.js.map +1 -1
- package/dist/worker.cjs +42 -9
- package/dist/worker.cjs.map +1 -1
- package/dist/worker.d.ts +5 -1
- package/dist/worker.d.ts.map +1 -1
- package/dist/worker.js +42 -9
- package/dist/worker.js.map +1 -1
- package/package.json +3 -3
- package/src/index.ts +3 -1
- package/src/inference_runner.ts +19 -0
- package/src/ipc/index.ts +5 -0
- package/src/ipc/inference_executor.ts +7 -0
- package/src/ipc/inference_proc_executor.ts +93 -0
- package/src/ipc/inference_proc_lazy_main.ts +86 -0
- package/src/ipc/job_executor.ts +15 -17
- package/src/ipc/job_proc_executor.ts +112 -0
- package/src/ipc/{job_main.ts → job_proc_lazy_main.ts} +44 -14
- package/src/ipc/message.ts +14 -1
- package/src/ipc/proc_pool.ts +33 -3
- package/src/ipc/{proc_job_executor.ts → supervised_proc.ts} +80 -30
- package/src/job.ts +21 -0
- package/src/metrics/base.ts +7 -10
- package/src/multimodal/agent_playout.ts +14 -16
- package/src/multimodal/multimodal_agent.ts +13 -9
- package/src/pipeline/agent_output.ts +34 -5
- package/src/pipeline/agent_playout.ts +10 -1
- package/src/pipeline/human_input.ts +8 -0
- package/src/pipeline/pipeline_agent.ts +96 -11
- package/src/stt/stream_adapter.ts +17 -5
- package/src/tokenize/basic/basic.ts +2 -0
- package/src/tokenize/basic/index.ts +7 -1
- package/src/tokenize/token_stream.ts +6 -3
- package/src/transcription.ts +270 -96
- package/src/worker.ts +42 -5
- package/dist/ipc/job_main.cjs.map +0 -1
- package/dist/ipc/job_main.d.ts +0 -8
- package/dist/ipc/job_main.d.ts.map +0 -1
- package/dist/ipc/job_main.js.map +0 -1
- package/dist/ipc/proc_job_executor.cjs.map +0 -1
- package/dist/ipc/proc_job_executor.d.ts +0 -15
- package/dist/ipc/proc_job_executor.d.ts.map +0 -1
- package/dist/ipc/proc_job_executor.js.map +0 -1
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../../src/stt/stream_adapter.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport type { AudioFrame } from '@livekit/rtc-node';\nimport type { VAD, VADStream } from '../vad.js';\nimport { VADEventType } from '../vad.js';\nimport type { SpeechEvent } from './stt.js';\nimport { STT, SpeechEventType, SpeechStream } from './stt.js';\n\nexport class StreamAdapter extends STT {\n #stt: STT;\n #vad: VAD;\n label: string;\n\n constructor(stt: STT, vad: VAD) {\n super({ streaming: true, interimResults: false });\n this.#stt = stt;\n this.#vad = vad;\n this.label = `stt.StreamAdapter<${this.#stt.label}>`;\n\n this.#stt.on(SpeechEventType.METRICS_COLLECTED, (metrics) => {\n this.emit(SpeechEventType.METRICS_COLLECTED, metrics);\n });\n }\n\n _recognize(frame: AudioFrame): Promise<SpeechEvent> {\n return this.#stt.recognize(frame);\n }\n\n stream(): StreamAdapterWrapper {\n return new StreamAdapterWrapper(this.#stt, this.#vad);\n }\n}\n\nexport class StreamAdapterWrapper extends SpeechStream {\n #stt: STT;\n #vadStream: VADStream;\n label: string;\n\n constructor(stt: STT, vad: VAD) {\n super(stt);\n this.#stt = stt;\n this.#vadStream = vad.stream();\n this.label = `stt.StreamAdapterWrapper<${this.#stt.label}>`;\n\n this.#run();\n }\n\n async monitorMetrics() {\n return; // do nothing\n }\n\n async #run() {\n const forwardInput = async () => {\n for await (const input of this.input) {\n if (input === SpeechStream.FLUSH_SENTINEL) {\n this.#vadStream.flush();\n } else {\n this.#vadStream.pushFrame(input);\n }\n }\n this.#vadStream.endInput();\n };\n\n const recognize = async () => {\n for await (const ev of this.#vadStream) {\n switch (ev.type) {\n case VADEventType.START_OF_SPEECH:\n this.output.put({ type: SpeechEventType.START_OF_SPEECH });\n break;\n case VADEventType.END_OF_SPEECH:\n this.output.put({ type: SpeechEventType.END_OF_SPEECH });\n\n const event = await this.#stt.recognize(ev.frames);\n
|
|
1
|
+
{"version":3,"sources":["../../src/stt/stream_adapter.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport type { AudioFrame } from '@livekit/rtc-node';\nimport { log } from '../log.js';\nimport type { VAD, VADStream } from '../vad.js';\nimport { VADEventType } from '../vad.js';\nimport type { SpeechEvent } from './stt.js';\nimport { STT, SpeechEventType, SpeechStream } from './stt.js';\n\nexport class StreamAdapter extends STT {\n #stt: STT;\n #vad: VAD;\n label: string;\n\n constructor(stt: STT, vad: VAD) {\n super({ streaming: true, interimResults: false });\n this.#stt = stt;\n this.#vad = vad;\n this.label = `stt.StreamAdapter<${this.#stt.label}>`;\n\n this.#stt.on(SpeechEventType.METRICS_COLLECTED, (metrics) => {\n this.emit(SpeechEventType.METRICS_COLLECTED, metrics);\n });\n }\n\n _recognize(frame: AudioFrame): Promise<SpeechEvent> {\n return this.#stt.recognize(frame);\n }\n\n stream(): StreamAdapterWrapper {\n return new StreamAdapterWrapper(this.#stt, this.#vad);\n }\n}\n\nexport class StreamAdapterWrapper extends SpeechStream {\n #stt: STT;\n #vadStream: VADStream;\n label: string;\n\n constructor(stt: STT, vad: VAD) {\n super(stt);\n this.#stt = stt;\n this.#vadStream = vad.stream();\n this.label = `stt.StreamAdapterWrapper<${this.#stt.label}>`;\n\n this.#run();\n }\n\n async monitorMetrics() {\n return; // do nothing\n }\n\n async #run() {\n const forwardInput = async () => {\n for await (const input of this.input) {\n if (input === SpeechStream.FLUSH_SENTINEL) {\n this.#vadStream.flush();\n } else {\n this.#vadStream.pushFrame(input);\n }\n }\n this.#vadStream.endInput();\n };\n\n const recognize = async () => {\n for await (const ev of this.#vadStream) {\n switch (ev.type) {\n case VADEventType.START_OF_SPEECH:\n this.output.put({ type: SpeechEventType.START_OF_SPEECH });\n break;\n case VADEventType.END_OF_SPEECH:\n this.output.put({ type: SpeechEventType.END_OF_SPEECH });\n\n try {\n const event = await this.#stt.recognize(ev.frames);\n if (!event.alternatives![0].text) {\n continue;\n }\n\n this.output.put(event);\n break;\n } catch (error) {\n let logger = log();\n if (error instanceof Error) {\n logger = logger.child({ error: error.message });\n } else {\n logger = logger.child({ error });\n }\n logger.error(`${this.label}: provider recognize task failed`);\n continue;\n }\n }\n }\n };\n\n Promise.all([forwardInput(), recognize()]);\n }\n}\n"],"mappings":"AAIA,SAAS,WAAW;AAEpB,SAAS,oBAAoB;AAE7B,SAAS,KAAK,iBAAiB,oBAAoB;AAE5C,MAAM,sBAAsB,IAAI;AAAA,EACrC;AAAA,EACA;AAAA,EACA;AAAA,EAEA,YAAY,KAAU,KAAU;AAC9B,UAAM,EAAE,WAAW,MAAM,gBAAgB,MAAM,CAAC;AAChD,SAAK,OAAO;AACZ,SAAK,OAAO;AACZ,SAAK,QAAQ,qBAAqB,KAAK,KAAK,KAAK;AAEjD,SAAK,KAAK,GAAG,gBAAgB,mBAAmB,CAAC,YAAY;AAC3D,WAAK,KAAK,gBAAgB,mBAAmB,OAAO;AAAA,IACtD,CAAC;AAAA,EACH;AAAA,EAEA,WAAW,OAAyC;AAClD,WAAO,KAAK,KAAK,UAAU,KAAK;AAAA,EAClC;AAAA,EAEA,SAA+B;AAC7B,WAAO,IAAI,qBAAqB,KAAK,MAAM,KAAK,IAAI;AAAA,EACtD;AACF;AAEO,MAAM,6BAA6B,aAAa;AAAA,EACrD;AAAA,EACA;AAAA,EACA;AAAA,EAEA,YAAY,KAAU,KAAU;AAC9B,UAAM,GAAG;AACT,SAAK,OAAO;AACZ,SAAK,aAAa,IAAI,OAAO;AAC7B,SAAK,QAAQ,4BAA4B,KAAK,KAAK,KAAK;AAExD,SAAK,KAAK;AAAA,EACZ;AAAA,EAEA,MAAM,iBAAiB;AACrB;AAAA,EACF;AAAA,EAEA,MAAM,OAAO;AACX,UAAM,eAAe,YAAY;AAC/B,uBAAiB,SAAS,KAAK,OAAO;AACpC,YAAI,UAAU,aAAa,gBAAgB;AACzC,eAAK,WAAW,MAAM;AAAA,QACxB,OAAO;AACL,eAAK,WAAW,UAAU,KAAK;AAAA,QACjC;AAAA,MACF;AACA,WAAK,WAAW,SAAS;AAAA,IAC3B;AAEA,UAAM,YAAY,YAAY;AAC5B,uBAAiB,MAAM,KAAK,YAAY;AACtC,gBAAQ,GAAG,MAAM;AAAA,UACf,KAAK,aAAa;AAChB,iBAAK,OAAO,IAAI,EAAE,MAAM,gBAAgB,gBAAgB,CAAC;AACzD;AAAA,UACF,KAAK,aAAa;AAChB,iBAAK,OAAO,IAAI,EAAE,MAAM,gBAAgB,cAAc,CAAC;AAEvD,gBAAI;AACF,oBAAM,QAAQ,MAAM,KAAK,KAAK,UAAU,GAAG,MAAM;AACjD,kBAAI,CAAC,MAAM,aAAc,CAAC,EAAE,MAAM;AAChC;AAAA,cACF;AAEA,mBAAK,OAAO,IAAI,KAAK;AACrB;AAAA,YACF,SAAS,OAAO;AACd,kBAAI,SAAS,IAAI;AACjB,kBAAI,iBAAiB,OAAO;AAC1B,yBAAS,OAAO,MAAM,EAAE,OAAO,MAAM,QAAQ,CAAC;AAAA,cAChD,OAAO;AACL,yBAAS,OAAO,MAAM,EAAE,MAAM,CAAC;AAAA,cACjC;AACA,qBAAO,MAAM,GAAG,KAAK,KAAK,kCAAkC;AAC5D;AAAA,YACF;AAAA,QACJ;AAAA,MACF;AAAA,IACF;AAEA,YAAQ,IAAI,CAAC,aAAa,GAAG,UAAU,CAAC,CAAC;AAAA,EAC3C;AACF;","names":[]}
|
|
@@ -31,6 +31,7 @@ __export(basic_exports, {
|
|
|
31
31
|
SentenceTokenizer: () => SentenceTokenizer,
|
|
32
32
|
WordTokenizer: () => WordTokenizer,
|
|
33
33
|
hyphenateWord: () => hyphenateWord,
|
|
34
|
+
splitWords: () => import_word.splitWords,
|
|
34
35
|
tokenizeParagraphs: () => tokenizeParagraphs
|
|
35
36
|
});
|
|
36
37
|
module.exports = __toCommonJS(basic_exports);
|
|
@@ -93,6 +94,7 @@ const tokenizeParagraphs = (text) => {
|
|
|
93
94
|
SentenceTokenizer,
|
|
94
95
|
WordTokenizer,
|
|
95
96
|
hyphenateWord,
|
|
97
|
+
splitWords,
|
|
96
98
|
tokenizeParagraphs
|
|
97
99
|
});
|
|
98
100
|
//# sourceMappingURL=basic.cjs.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../../../src/tokenize/basic/basic.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport { BufferedSentenceStream, BufferedWordStream } from '../token_stream.js';\nimport * as tokenizer from '../tokenizer.js';\nimport { hyphenator } from './hyphenator.js';\nimport { splitParagraphs } from './paragraph.js';\nimport { splitSentences } from './sentence.js';\nimport { splitWords } from './word.js';\n\ninterface TokenizerOptions {\n language: string;\n minSentenceLength: number;\n streamContextLength: number;\n}\n\nexport class SentenceTokenizer extends tokenizer.SentenceTokenizer {\n #config: TokenizerOptions;\n\n constructor(language = 'en-US', minSentenceLength = 20, streamContextLength = 10) {\n super();\n this.#config = {\n language,\n minSentenceLength,\n streamContextLength,\n };\n }\n\n // eslint-disable-next-line @typescript-eslint/no-unused-vars\n tokenize(text: string, language?: string): string[] {\n return splitSentences(text, this.#config.minSentenceLength).map((tok) => tok[0]);\n }\n\n // eslint-disable-next-line @typescript-eslint/no-unused-vars\n stream(language?: string): tokenizer.SentenceStream {\n return new BufferedSentenceStream(\n (text: string) => splitSentences(text, this.#config.minSentenceLength),\n this.#config.minSentenceLength,\n this.#config.streamContextLength,\n );\n }\n}\n\nexport class WordTokenizer extends tokenizer.WordTokenizer {\n #ignorePunctuation: boolean;\n\n constructor(ignorePunctuation = true) {\n super();\n this.#ignorePunctuation = ignorePunctuation;\n }\n\n // eslint-disable-next-line @typescript-eslint/no-unused-vars\n tokenize(text: string, language?: string): string[] {\n return splitWords(text, this.#ignorePunctuation).map((tok) => tok[0]);\n }\n\n // eslint-disable-next-line @typescript-eslint/no-unused-vars\n stream(language?: string): tokenizer.WordStream {\n return new BufferedWordStream(\n (text: string) => splitWords(text, this.#ignorePunctuation),\n 1,\n 1,\n );\n }\n}\n\nexport const hyphenateWord = (word: string): string[] => {\n return hyphenator.hyphenateWord(word);\n};\n\nexport const tokenizeParagraphs = (text: string): string[] => {\n return splitParagraphs(text).map((tok) => tok[0]);\n};\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAGA,0BAA2D;AAC3D,gBAA2B;AAC3B,wBAA2B;AAC3B,uBAAgC;AAChC,sBAA+B;AAC/B,kBAA2B;AAQpB,MAAM,0BAA0B,UAAU,kBAAkB;AAAA,EACjE;AAAA,EAEA,YAAY,WAAW,SAAS,oBAAoB,IAAI,sBAAsB,IAAI;AAChF,UAAM;AACN,SAAK,UAAU;AAAA,MACb;AAAA,MACA;AAAA,MACA;AAAA,IACF;AAAA,EACF;AAAA;AAAA,EAGA,SAAS,MAAc,UAA6B;AAClD,eAAO,gCAAe,MAAM,KAAK,QAAQ,iBAAiB,EAAE,IAAI,CAAC,QAAQ,IAAI,CAAC,CAAC;AAAA,EACjF;AAAA;AAAA,EAGA,OAAO,UAA6C;AAClD,WAAO,IAAI;AAAA,MACT,CAAC,aAAiB,gCAAe,MAAM,KAAK,QAAQ,iBAAiB;AAAA,MACrE,KAAK,QAAQ;AAAA,MACb,KAAK,QAAQ;AAAA,IACf;AAAA,EACF;AACF;AAEO,MAAM,sBAAsB,UAAU,cAAc;AAAA,EACzD;AAAA,EAEA,YAAY,oBAAoB,MAAM;AACpC,UAAM;AACN,SAAK,qBAAqB;AAAA,EAC5B;AAAA;AAAA,EAGA,SAAS,MAAc,UAA6B;AAClD,eAAO,wBAAW,MAAM,KAAK,kBAAkB,EAAE,IAAI,CAAC,QAAQ,IAAI,CAAC,CAAC;AAAA,EACtE;AAAA;AAAA,EAGA,OAAO,UAAyC;AAC9C,WAAO,IAAI;AAAA,MACT,CAAC,aAAiB,wBAAW,MAAM,KAAK,kBAAkB;AAAA,MAC1D;AAAA,MACA;AAAA,IACF;AAAA,EACF;AACF;AAEO,MAAM,gBAAgB,CAAC,SAA2B;AACvD,SAAO,6BAAW,cAAc,IAAI;AACtC;
|
|
1
|
+
{"version":3,"sources":["../../../src/tokenize/basic/basic.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport { BufferedSentenceStream, BufferedWordStream } from '../token_stream.js';\nimport * as tokenizer from '../tokenizer.js';\nimport { hyphenator } from './hyphenator.js';\nimport { splitParagraphs } from './paragraph.js';\nimport { splitSentences } from './sentence.js';\nimport { splitWords } from './word.js';\n\ninterface TokenizerOptions {\n language: string;\n minSentenceLength: number;\n streamContextLength: number;\n}\n\nexport class SentenceTokenizer extends tokenizer.SentenceTokenizer {\n #config: TokenizerOptions;\n\n constructor(language = 'en-US', minSentenceLength = 20, streamContextLength = 10) {\n super();\n this.#config = {\n language,\n minSentenceLength,\n streamContextLength,\n };\n }\n\n // eslint-disable-next-line @typescript-eslint/no-unused-vars\n tokenize(text: string, language?: string): string[] {\n return splitSentences(text, this.#config.minSentenceLength).map((tok) => tok[0]);\n }\n\n // eslint-disable-next-line @typescript-eslint/no-unused-vars\n stream(language?: string): tokenizer.SentenceStream {\n return new BufferedSentenceStream(\n (text: string) => splitSentences(text, this.#config.minSentenceLength),\n this.#config.minSentenceLength,\n this.#config.streamContextLength,\n );\n }\n}\n\nexport class WordTokenizer extends tokenizer.WordTokenizer {\n #ignorePunctuation: boolean;\n\n constructor(ignorePunctuation = true) {\n super();\n this.#ignorePunctuation = ignorePunctuation;\n }\n\n // eslint-disable-next-line @typescript-eslint/no-unused-vars\n tokenize(text: string, language?: string): string[] {\n return splitWords(text, this.#ignorePunctuation).map((tok) => tok[0]);\n }\n\n // eslint-disable-next-line @typescript-eslint/no-unused-vars\n stream(language?: string): tokenizer.WordStream {\n return new BufferedWordStream(\n (text: string) => splitWords(text, this.#ignorePunctuation),\n 1,\n 1,\n );\n }\n}\n\nexport const hyphenateWord = (word: string): string[] => {\n return hyphenator.hyphenateWord(word);\n};\n\nexport { splitWords };\n\nexport const tokenizeParagraphs = (text: string): string[] => {\n return splitParagraphs(text).map((tok) => tok[0]);\n};\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAGA,0BAA2D;AAC3D,gBAA2B;AAC3B,wBAA2B;AAC3B,uBAAgC;AAChC,sBAA+B;AAC/B,kBAA2B;AAQpB,MAAM,0BAA0B,UAAU,kBAAkB;AAAA,EACjE;AAAA,EAEA,YAAY,WAAW,SAAS,oBAAoB,IAAI,sBAAsB,IAAI;AAChF,UAAM;AACN,SAAK,UAAU;AAAA,MACb;AAAA,MACA;AAAA,MACA;AAAA,IACF;AAAA,EACF;AAAA;AAAA,EAGA,SAAS,MAAc,UAA6B;AAClD,eAAO,gCAAe,MAAM,KAAK,QAAQ,iBAAiB,EAAE,IAAI,CAAC,QAAQ,IAAI,CAAC,CAAC;AAAA,EACjF;AAAA;AAAA,EAGA,OAAO,UAA6C;AAClD,WAAO,IAAI;AAAA,MACT,CAAC,aAAiB,gCAAe,MAAM,KAAK,QAAQ,iBAAiB;AAAA,MACrE,KAAK,QAAQ;AAAA,MACb,KAAK,QAAQ;AAAA,IACf;AAAA,EACF;AACF;AAEO,MAAM,sBAAsB,UAAU,cAAc;AAAA,EACzD;AAAA,EAEA,YAAY,oBAAoB,MAAM;AACpC,UAAM;AACN,SAAK,qBAAqB;AAAA,EAC5B;AAAA;AAAA,EAGA,SAAS,MAAc,UAA6B;AAClD,eAAO,wBAAW,MAAM,KAAK,kBAAkB,EAAE,IAAI,CAAC,QAAQ,IAAI,CAAC,CAAC;AAAA,EACtE;AAAA;AAAA,EAGA,OAAO,UAAyC;AAC9C,WAAO,IAAI;AAAA,MACT,CAAC,aAAiB,wBAAW,MAAM,KAAK,kBAAkB;AAAA,MAC1D;AAAA,MACA;AAAA,IACF;AAAA,EACF;AACF;AAEO,MAAM,gBAAgB,CAAC,SAA2B;AACvD,SAAO,6BAAW,cAAc,IAAI;AACtC;AAIO,MAAM,qBAAqB,CAAC,SAA2B;AAC5D,aAAO,kCAAgB,IAAI,EAAE,IAAI,CAAC,QAAQ,IAAI,CAAC,CAAC;AAClD;","names":[]}
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import * as tokenizer from '../tokenizer.js';
|
|
2
|
+
import { splitWords } from './word.js';
|
|
2
3
|
export declare class SentenceTokenizer extends tokenizer.SentenceTokenizer {
|
|
3
4
|
#private;
|
|
4
5
|
constructor(language?: string, minSentenceLength?: number, streamContextLength?: number);
|
|
@@ -12,5 +13,6 @@ export declare class WordTokenizer extends tokenizer.WordTokenizer {
|
|
|
12
13
|
stream(language?: string): tokenizer.WordStream;
|
|
13
14
|
}
|
|
14
15
|
export declare const hyphenateWord: (word: string) => string[];
|
|
16
|
+
export { splitWords };
|
|
15
17
|
export declare const tokenizeParagraphs: (text: string) => string[];
|
|
16
18
|
//# sourceMappingURL=basic.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"basic.d.ts","sourceRoot":"","sources":["../../../src/tokenize/basic/basic.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,SAAS,MAAM,iBAAiB,CAAC;
|
|
1
|
+
{"version":3,"file":"basic.d.ts","sourceRoot":"","sources":["../../../src/tokenize/basic/basic.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,SAAS,MAAM,iBAAiB,CAAC;AAI7C,OAAO,EAAE,UAAU,EAAE,MAAM,WAAW,CAAC;AAQvC,qBAAa,iBAAkB,SAAQ,SAAS,CAAC,iBAAiB;;gBAGpD,QAAQ,SAAU,EAAE,iBAAiB,SAAK,EAAE,mBAAmB,SAAK;IAUhF,QAAQ,CAAC,IAAI,EAAE,MAAM,EAAE,QAAQ,CAAC,EAAE,MAAM,GAAG,MAAM,EAAE;IAKnD,MAAM,CAAC,QAAQ,CAAC,EAAE,MAAM,GAAG,SAAS,CAAC,cAAc;CAOpD;AAED,qBAAa,aAAc,SAAQ,SAAS,CAAC,aAAa;;gBAG5C,iBAAiB,UAAO;IAMpC,QAAQ,CAAC,IAAI,EAAE,MAAM,EAAE,QAAQ,CAAC,EAAE,MAAM,GAAG,MAAM,EAAE;IAKnD,MAAM,CAAC,QAAQ,CAAC,EAAE,MAAM,GAAG,SAAS,CAAC,UAAU;CAOhD;AAED,eAAO,MAAM,aAAa,SAAU,MAAM,KAAG,MAAM,EAElD,CAAC;AAEF,OAAO,EAAE,UAAU,EAAE,CAAC;AAEtB,eAAO,MAAM,kBAAkB,SAAU,MAAM,KAAG,MAAM,EAEvD,CAAC"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../../../src/tokenize/basic/basic.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport { BufferedSentenceStream, BufferedWordStream } from '../token_stream.js';\nimport * as tokenizer from '../tokenizer.js';\nimport { hyphenator } from './hyphenator.js';\nimport { splitParagraphs } from './paragraph.js';\nimport { splitSentences } from './sentence.js';\nimport { splitWords } from './word.js';\n\ninterface TokenizerOptions {\n language: string;\n minSentenceLength: number;\n streamContextLength: number;\n}\n\nexport class SentenceTokenizer extends tokenizer.SentenceTokenizer {\n #config: TokenizerOptions;\n\n constructor(language = 'en-US', minSentenceLength = 20, streamContextLength = 10) {\n super();\n this.#config = {\n language,\n minSentenceLength,\n streamContextLength,\n };\n }\n\n // eslint-disable-next-line @typescript-eslint/no-unused-vars\n tokenize(text: string, language?: string): string[] {\n return splitSentences(text, this.#config.minSentenceLength).map((tok) => tok[0]);\n }\n\n // eslint-disable-next-line @typescript-eslint/no-unused-vars\n stream(language?: string): tokenizer.SentenceStream {\n return new BufferedSentenceStream(\n (text: string) => splitSentences(text, this.#config.minSentenceLength),\n this.#config.minSentenceLength,\n this.#config.streamContextLength,\n );\n }\n}\n\nexport class WordTokenizer extends tokenizer.WordTokenizer {\n #ignorePunctuation: boolean;\n\n constructor(ignorePunctuation = true) {\n super();\n this.#ignorePunctuation = ignorePunctuation;\n }\n\n // eslint-disable-next-line @typescript-eslint/no-unused-vars\n tokenize(text: string, language?: string): string[] {\n return splitWords(text, this.#ignorePunctuation).map((tok) => tok[0]);\n }\n\n // eslint-disable-next-line @typescript-eslint/no-unused-vars\n stream(language?: string): tokenizer.WordStream {\n return new BufferedWordStream(\n (text: string) => splitWords(text, this.#ignorePunctuation),\n 1,\n 1,\n );\n }\n}\n\nexport const hyphenateWord = (word: string): string[] => {\n return hyphenator.hyphenateWord(word);\n};\n\nexport const tokenizeParagraphs = (text: string): string[] => {\n return splitParagraphs(text).map((tok) => tok[0]);\n};\n"],"mappings":"AAGA,SAAS,wBAAwB,0BAA0B;AAC3D,YAAY,eAAe;AAC3B,SAAS,kBAAkB;AAC3B,SAAS,uBAAuB;AAChC,SAAS,sBAAsB;AAC/B,SAAS,kBAAkB;AAQpB,MAAM,0BAA0B,UAAU,kBAAkB;AAAA,EACjE;AAAA,EAEA,YAAY,WAAW,SAAS,oBAAoB,IAAI,sBAAsB,IAAI;AAChF,UAAM;AACN,SAAK,UAAU;AAAA,MACb;AAAA,MACA;AAAA,MACA;AAAA,IACF;AAAA,EACF;AAAA;AAAA,EAGA,SAAS,MAAc,UAA6B;AAClD,WAAO,eAAe,MAAM,KAAK,QAAQ,iBAAiB,EAAE,IAAI,CAAC,QAAQ,IAAI,CAAC,CAAC;AAAA,EACjF;AAAA;AAAA,EAGA,OAAO,UAA6C;AAClD,WAAO,IAAI;AAAA,MACT,CAAC,SAAiB,eAAe,MAAM,KAAK,QAAQ,iBAAiB;AAAA,MACrE,KAAK,QAAQ;AAAA,MACb,KAAK,QAAQ;AAAA,IACf;AAAA,EACF;AACF;AAEO,MAAM,sBAAsB,UAAU,cAAc;AAAA,EACzD;AAAA,EAEA,YAAY,oBAAoB,MAAM;AACpC,UAAM;AACN,SAAK,qBAAqB;AAAA,EAC5B;AAAA;AAAA,EAGA,SAAS,MAAc,UAA6B;AAClD,WAAO,WAAW,MAAM,KAAK,kBAAkB,EAAE,IAAI,CAAC,QAAQ,IAAI,CAAC,CAAC;AAAA,EACtE;AAAA;AAAA,EAGA,OAAO,UAAyC;AAC9C,WAAO,IAAI;AAAA,MACT,CAAC,SAAiB,WAAW,MAAM,KAAK,kBAAkB;AAAA,MAC1D;AAAA,MACA;AAAA,IACF;AAAA,EACF;AACF;AAEO,MAAM,gBAAgB,CAAC,SAA2B;AACvD,SAAO,WAAW,cAAc,IAAI;AACtC;
|
|
1
|
+
{"version":3,"sources":["../../../src/tokenize/basic/basic.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport { BufferedSentenceStream, BufferedWordStream } from '../token_stream.js';\nimport * as tokenizer from '../tokenizer.js';\nimport { hyphenator } from './hyphenator.js';\nimport { splitParagraphs } from './paragraph.js';\nimport { splitSentences } from './sentence.js';\nimport { splitWords } from './word.js';\n\ninterface TokenizerOptions {\n language: string;\n minSentenceLength: number;\n streamContextLength: number;\n}\n\nexport class SentenceTokenizer extends tokenizer.SentenceTokenizer {\n #config: TokenizerOptions;\n\n constructor(language = 'en-US', minSentenceLength = 20, streamContextLength = 10) {\n super();\n this.#config = {\n language,\n minSentenceLength,\n streamContextLength,\n };\n }\n\n // eslint-disable-next-line @typescript-eslint/no-unused-vars\n tokenize(text: string, language?: string): string[] {\n return splitSentences(text, this.#config.minSentenceLength).map((tok) => tok[0]);\n }\n\n // eslint-disable-next-line @typescript-eslint/no-unused-vars\n stream(language?: string): tokenizer.SentenceStream {\n return new BufferedSentenceStream(\n (text: string) => splitSentences(text, this.#config.minSentenceLength),\n this.#config.minSentenceLength,\n this.#config.streamContextLength,\n );\n }\n}\n\nexport class WordTokenizer extends tokenizer.WordTokenizer {\n #ignorePunctuation: boolean;\n\n constructor(ignorePunctuation = true) {\n super();\n this.#ignorePunctuation = ignorePunctuation;\n }\n\n // eslint-disable-next-line @typescript-eslint/no-unused-vars\n tokenize(text: string, language?: string): string[] {\n return splitWords(text, this.#ignorePunctuation).map((tok) => tok[0]);\n }\n\n // eslint-disable-next-line @typescript-eslint/no-unused-vars\n stream(language?: string): tokenizer.WordStream {\n return new BufferedWordStream(\n (text: string) => splitWords(text, this.#ignorePunctuation),\n 1,\n 1,\n );\n }\n}\n\nexport const hyphenateWord = (word: string): string[] => {\n return hyphenator.hyphenateWord(word);\n};\n\nexport { splitWords };\n\nexport const tokenizeParagraphs = (text: string): string[] => {\n return splitParagraphs(text).map((tok) => tok[0]);\n};\n"],"mappings":"AAGA,SAAS,wBAAwB,0BAA0B;AAC3D,YAAY,eAAe;AAC3B,SAAS,kBAAkB;AAC3B,SAAS,uBAAuB;AAChC,SAAS,sBAAsB;AAC/B,SAAS,kBAAkB;AAQpB,MAAM,0BAA0B,UAAU,kBAAkB;AAAA,EACjE;AAAA,EAEA,YAAY,WAAW,SAAS,oBAAoB,IAAI,sBAAsB,IAAI;AAChF,UAAM;AACN,SAAK,UAAU;AAAA,MACb;AAAA,MACA;AAAA,MACA;AAAA,IACF;AAAA,EACF;AAAA;AAAA,EAGA,SAAS,MAAc,UAA6B;AAClD,WAAO,eAAe,MAAM,KAAK,QAAQ,iBAAiB,EAAE,IAAI,CAAC,QAAQ,IAAI,CAAC,CAAC;AAAA,EACjF;AAAA;AAAA,EAGA,OAAO,UAA6C;AAClD,WAAO,IAAI;AAAA,MACT,CAAC,SAAiB,eAAe,MAAM,KAAK,QAAQ,iBAAiB;AAAA,MACrE,KAAK,QAAQ;AAAA,MACb,KAAK,QAAQ;AAAA,IACf;AAAA,EACF;AACF;AAEO,MAAM,sBAAsB,UAAU,cAAc;AAAA,EACzD;AAAA,EAEA,YAAY,oBAAoB,MAAM;AACpC,UAAM;AACN,SAAK,qBAAqB;AAAA,EAC5B;AAAA;AAAA,EAGA,SAAS,MAAc,UAA6B;AAClD,WAAO,WAAW,MAAM,KAAK,kBAAkB,EAAE,IAAI,CAAC,QAAQ,IAAI,CAAC,CAAC;AAAA,EACtE;AAAA;AAAA,EAGA,OAAO,UAAyC;AAC9C,WAAO,IAAI;AAAA,MACT,CAAC,SAAiB,WAAW,MAAM,KAAK,kBAAkB;AAAA,MAC1D;AAAA,MACA;AAAA,IACF;AAAA,EACF;AACF;AAEO,MAAM,gBAAgB,CAAC,SAA2B;AACvD,SAAO,WAAW,cAAc,IAAI;AACtC;AAIO,MAAM,qBAAqB,CAAC,SAA2B;AAC5D,SAAO,gBAAgB,IAAI,EAAE,IAAI,CAAC,QAAQ,IAAI,CAAC,CAAC;AAClD;","names":[]}
|
|
@@ -21,6 +21,7 @@ __export(basic_exports, {
|
|
|
21
21
|
SentenceTokenizer: () => import_basic.SentenceTokenizer,
|
|
22
22
|
WordTokenizer: () => import_basic.WordTokenizer,
|
|
23
23
|
hyphenateWord: () => import_basic.hyphenateWord,
|
|
24
|
+
splitWords: () => import_basic.splitWords,
|
|
24
25
|
tokenizeParagraphs: () => import_basic.tokenizeParagraphs
|
|
25
26
|
});
|
|
26
27
|
module.exports = __toCommonJS(basic_exports);
|
|
@@ -30,6 +31,7 @@ var import_basic = require("./basic.cjs");
|
|
|
30
31
|
SentenceTokenizer,
|
|
31
32
|
WordTokenizer,
|
|
32
33
|
hyphenateWord,
|
|
34
|
+
splitWords,
|
|
33
35
|
tokenizeParagraphs
|
|
34
36
|
});
|
|
35
37
|
//# sourceMappingURL=index.cjs.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../../../src/tokenize/basic/index.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\n\nexport {
|
|
1
|
+
{"version":3,"sources":["../../../src/tokenize/basic/index.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\n\nexport {\n SentenceTokenizer,\n WordTokenizer,\n tokenizeParagraphs,\n hyphenateWord,\n splitWords,\n} from './basic.js';\n"],"mappings":";;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAIA,mBAMO;","names":[]}
|
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
export { SentenceTokenizer, WordTokenizer, tokenizeParagraphs, hyphenateWord } from './basic.js';
|
|
1
|
+
export { SentenceTokenizer, WordTokenizer, tokenizeParagraphs, hyphenateWord, splitWords, } from './basic.js';
|
|
2
2
|
//# sourceMappingURL=index.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/tokenize/basic/index.ts"],"names":[],"mappings":"AAIA,OAAO,
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/tokenize/basic/index.ts"],"names":[],"mappings":"AAIA,OAAO,EACL,iBAAiB,EACjB,aAAa,EACb,kBAAkB,EAClB,aAAa,EACb,UAAU,GACX,MAAM,YAAY,CAAC"}
|
|
@@ -1,8 +1,15 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import {
|
|
2
|
+
SentenceTokenizer,
|
|
3
|
+
WordTokenizer,
|
|
4
|
+
tokenizeParagraphs,
|
|
5
|
+
hyphenateWord,
|
|
6
|
+
splitWords
|
|
7
|
+
} from "./basic.js";
|
|
2
8
|
export {
|
|
3
9
|
SentenceTokenizer,
|
|
4
10
|
WordTokenizer,
|
|
5
11
|
hyphenateWord,
|
|
12
|
+
splitWords,
|
|
6
13
|
tokenizeParagraphs
|
|
7
14
|
};
|
|
8
15
|
//# sourceMappingURL=index.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../../../src/tokenize/basic/index.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\n\nexport {
|
|
1
|
+
{"version":3,"sources":["../../../src/tokenize/basic/index.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\n\nexport {\n SentenceTokenizer,\n WordTokenizer,\n tokenizeParagraphs,\n hyphenateWord,\n splitWords,\n} from './basic.js';\n"],"mappings":"AAIA;AAAA,EACE;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,OACK;","names":[]}
|
|
@@ -54,9 +54,11 @@ class BufferedTokenStream {
|
|
|
54
54
|
if (tokens.length <= 1) break;
|
|
55
55
|
if (this.#outBuf) this.#outBuf += " ";
|
|
56
56
|
const tok = tokens.shift();
|
|
57
|
-
let tokText
|
|
58
|
-
if (
|
|
57
|
+
let tokText;
|
|
58
|
+
if (Array.isArray(tok)) {
|
|
59
59
|
tokText = tok[0];
|
|
60
|
+
} else {
|
|
61
|
+
tokText = tok;
|
|
60
62
|
}
|
|
61
63
|
this.#outBuf += tokText;
|
|
62
64
|
if (this.#outBuf.length >= this.#minTokenLength) {
|
|
@@ -79,7 +81,7 @@ class BufferedTokenStream {
|
|
|
79
81
|
const tokens = this.#func(this.#inBuf);
|
|
80
82
|
if (tokens) {
|
|
81
83
|
if (this.#outBuf) this.#outBuf += " ";
|
|
82
|
-
if (
|
|
84
|
+
if (Array.isArray(tokens[0])) {
|
|
83
85
|
this.#outBuf += tokens.map((tok) => tok[0]).join(" ");
|
|
84
86
|
} else {
|
|
85
87
|
this.#outBuf += tokens.join(" ");
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../../src/tokenize/token_stream.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport { randomUUID } from 'node:crypto';\nimport { AsyncIterableQueue } from '../utils.js';\nimport type { TokenData } from './tokenizer.js';\nimport { SentenceStream, WordStream } from './tokenizer.js';\n\ntype TokenizeFunc = (x: string) => string[] | [string, number, number][];\n\nexport class BufferedTokenStream implements AsyncIterableIterator<TokenData> {\n protected queue = new AsyncIterableQueue<TokenData>();\n protected closed = false;\n\n #func: TokenizeFunc;\n #minTokenLength: number;\n #minContextLength: number;\n #bufTokens: string[] = [];\n #inBuf = '';\n #outBuf = '';\n #currentSegmentId: string;\n\n constructor(func: TokenizeFunc, minTokenLength: number, minContextLength: number) {\n this.#func = func;\n this.#minTokenLength = minTokenLength;\n this.#minContextLength = minContextLength;\n\n this.#currentSegmentId = randomUUID();\n }\n\n /** Push a string of text into the token stream */\n pushText(text: string) {\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n\n this.#inBuf += text;\n if (this.#inBuf.length < this.#minContextLength) return;\n\n while (true) {\n const tokens = this.#func(this.#inBuf);\n if (tokens.length <= 1) break;\n\n if (this.#outBuf) this.#outBuf += ' ';\n\n const tok = tokens.shift()!;\n let tokText
|
|
1
|
+
{"version":3,"sources":["../../src/tokenize/token_stream.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport { randomUUID } from 'node:crypto';\nimport { AsyncIterableQueue } from '../utils.js';\nimport type { TokenData } from './tokenizer.js';\nimport { SentenceStream, WordStream } from './tokenizer.js';\n\ntype TokenizeFunc = (x: string) => string[] | [string, number, number][];\n\nexport class BufferedTokenStream implements AsyncIterableIterator<TokenData> {\n protected queue = new AsyncIterableQueue<TokenData>();\n protected closed = false;\n\n #func: TokenizeFunc;\n #minTokenLength: number;\n #minContextLength: number;\n #bufTokens: string[] = [];\n #inBuf = '';\n #outBuf = '';\n #currentSegmentId: string;\n\n constructor(func: TokenizeFunc, minTokenLength: number, minContextLength: number) {\n this.#func = func;\n this.#minTokenLength = minTokenLength;\n this.#minContextLength = minContextLength;\n\n this.#currentSegmentId = randomUUID();\n }\n\n /** Push a string of text into the token stream */\n pushText(text: string) {\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n\n this.#inBuf += text;\n if (this.#inBuf.length < this.#minContextLength) return;\n\n while (true) {\n const tokens = this.#func(this.#inBuf);\n if (tokens.length <= 1) break;\n\n if (this.#outBuf) this.#outBuf += ' ';\n\n const tok = tokens.shift()!;\n let tokText: string;\n if (Array.isArray(tok)) {\n tokText = tok[0];\n } else {\n tokText = tok;\n }\n\n this.#outBuf += tokText;\n\n if (this.#outBuf.length >= this.#minTokenLength) {\n this.queue.put({ token: this.#outBuf, segmentId: this.#currentSegmentId });\n this.#outBuf = '';\n }\n\n if (typeof tok! !== 'string') {\n this.#inBuf = this.#inBuf.slice(tok![2]);\n } else {\n this.#inBuf = this.#inBuf\n .slice(Math.max(0, this.#inBuf.indexOf(tok)) + tok.length)\n .trimStart();\n }\n }\n }\n\n /** Flush the stream, causing it to process all pending text */\n flush() {\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n\n if (this.#inBuf || this.#outBuf) {\n const tokens = this.#func(this.#inBuf);\n if (tokens) {\n if (this.#outBuf) this.#outBuf += ' ';\n\n if (Array.isArray(tokens[0])) {\n this.#outBuf += tokens.map((tok) => tok[0]).join(' ');\n } else {\n this.#outBuf += tokens.join(' ');\n }\n }\n\n if (this.#outBuf) {\n this.queue.put({ token: this.#outBuf, segmentId: this.#currentSegmentId });\n }\n\n this.#currentSegmentId = randomUUID();\n }\n\n this.#inBuf = '';\n this.#outBuf = '';\n }\n\n /** Mark the input as ended and forbid additional pushes */\n endInput() {\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.flush();\n this.close();\n }\n\n next(): Promise<IteratorResult<TokenData>> {\n return this.queue.next();\n }\n\n /** Close both the input and output of the token stream */\n close() {\n this.queue.close();\n this.closed = true;\n }\n\n [Symbol.asyncIterator](): BufferedTokenStream {\n return this;\n }\n}\n\nexport class BufferedSentenceStream extends SentenceStream {\n #stream: BufferedTokenStream;\n\n constructor(func: TokenizeFunc, minTokenLength: number, minContextLength: number) {\n super();\n this.#stream = new BufferedTokenStream(func, minTokenLength, minContextLength);\n }\n\n pushText(text: string) {\n this.#stream.pushText(text);\n }\n\n flush() {\n this.#stream.flush();\n }\n\n close() {\n super.close();\n this.#stream.close();\n }\n\n next(): Promise<IteratorResult<TokenData>> {\n return this.#stream.next();\n }\n}\n\nexport class BufferedWordStream extends WordStream {\n #stream: BufferedTokenStream;\n\n constructor(func: TokenizeFunc, minTokenLength: number, minContextLength: number) {\n super();\n this.#stream = new BufferedTokenStream(func, minTokenLength, minContextLength);\n }\n\n pushText(text: string) {\n this.#stream.pushText(text);\n }\n\n flush() {\n this.#stream.flush();\n }\n\n endInput() {\n this.#stream.endInput();\n }\n\n close() {\n this.#stream.close();\n }\n\n next(): Promise<IteratorResult<TokenData>> {\n return this.#stream.next();\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAGA,yBAA2B;AAC3B,mBAAmC;AAEnC,uBAA2C;AAIpC,MAAM,oBAAgE;AAAA,EACjE,QAAQ,IAAI,gCAA8B;AAAA,EAC1C,SAAS;AAAA,EAEnB;AAAA,EACA;AAAA,EACA;AAAA,EACA,aAAuB,CAAC;AAAA,EACxB,SAAS;AAAA,EACT,UAAU;AAAA,EACV;AAAA,EAEA,YAAY,MAAoB,gBAAwB,kBAA0B;AAChF,SAAK,QAAQ;AACb,SAAK,kBAAkB;AACvB,SAAK,oBAAoB;AAEzB,SAAK,wBAAoB,+BAAW;AAAA,EACtC;AAAA;AAAA,EAGA,SAAS,MAAc;AACrB,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AAEA,SAAK,UAAU;AACf,QAAI,KAAK,OAAO,SAAS,KAAK,kBAAmB;AAEjD,WAAO,MAAM;AACX,YAAM,SAAS,KAAK,MAAM,KAAK,MAAM;AACrC,UAAI,OAAO,UAAU,EAAG;AAExB,UAAI,KAAK,QAAS,MAAK,WAAW;AAElC,YAAM,MAAM,OAAO,MAAM;AACzB,UAAI;AACJ,UAAI,MAAM,QAAQ,GAAG,GAAG;AACtB,kBAAU,IAAI,CAAC;AAAA,MACjB,OAAO;AACL,kBAAU;AAAA,MACZ;AAEA,WAAK,WAAW;AAEhB,UAAI,KAAK,QAAQ,UAAU,KAAK,iBAAiB;AAC/C,aAAK,MAAM,IAAI,EAAE,OAAO,KAAK,SAAS,WAAW,KAAK,kBAAkB,CAAC;AACzE,aAAK,UAAU;AAAA,MACjB;AAEA,UAAI,OAAO,QAAS,UAAU;AAC5B,aAAK,SAAS,KAAK,OAAO,MAAM,IAAK,CAAC,CAAC;AAAA,MACzC,OAAO;AACL,aAAK,SAAS,KAAK,OAChB,MAAM,KAAK,IAAI,GAAG,KAAK,OAAO,QAAQ,GAAG,CAAC,IAAI,IAAI,MAAM,EACxD,UAAU;AAAA,MACf;AAAA,IACF;AAAA,EACF;AAAA;AAAA,EAGA,QAAQ;AACN,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AAEA,QAAI,KAAK,UAAU,KAAK,SAAS;AAC/B,YAAM,SAAS,KAAK,MAAM,KAAK,MAAM;AACrC,UAAI,QAAQ;AACV,YAAI,KAAK,QAAS,MAAK,WAAW;AAElC,YAAI,MAAM,QAAQ,OAAO,CAAC,CAAC,GAAG;AAC5B,eAAK,WAAW,OAAO,IAAI,CAAC,QAAQ,IAAI,CAAC,CAAC,EAAE,KAAK,GAAG;AAAA,QACtD,OAAO;AACL,eAAK,WAAW,OAAO,KAAK,GAAG;AAAA,QACjC;AAAA,MACF;AAEA,UAAI,KAAK,SAAS;AAChB,aAAK,MAAM,IAAI,EAAE,OAAO,KAAK,SAAS,WAAW,KAAK,kBAAkB,CAAC;AAAA,MAC3E;AAEA,WAAK,wBAAoB,+BAAW;AAAA,IACtC;AAEA,SAAK,SAAS;AACd,SAAK,UAAU;AAAA,EACjB;AAAA;AAAA,EAGA,WAAW;AACT,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,MAAM;AACX,SAAK,MAAM;AAAA,EACb;AAAA,EAEA,OAA2C;AACzC,WAAO,KAAK,MAAM,KAAK;AAAA,EACzB;AAAA;AAAA,EAGA,QAAQ;AACN,SAAK,MAAM,MAAM;AACjB,SAAK,SAAS;AAAA,EAChB;AAAA,EAEA,CAAC,OAAO,aAAa,IAAyB;AAC5C,WAAO;AAAA,EACT;AACF;AAEO,MAAM,+BAA+B,gCAAe;AAAA,EACzD;AAAA,EAEA,YAAY,MAAoB,gBAAwB,kBAA0B;AAChF,UAAM;AACN,SAAK,UAAU,IAAI,oBAAoB,MAAM,gBAAgB,gBAAgB;AAAA,EAC/E;AAAA,EAEA,SAAS,MAAc;AACrB,SAAK,QAAQ,SAAS,IAAI;AAAA,EAC5B;AAAA,EAEA,QAAQ;AACN,SAAK,QAAQ,MAAM;AAAA,EACrB;AAAA,EAEA,QAAQ;AACN,UAAM,MAAM;AACZ,SAAK,QAAQ,MAAM;AAAA,EACrB;AAAA,EAEA,OAA2C;AACzC,WAAO,KAAK,QAAQ,KAAK;AAAA,EAC3B;AACF;AAEO,MAAM,2BAA2B,4BAAW;AAAA,EACjD;AAAA,EAEA,YAAY,MAAoB,gBAAwB,kBAA0B;AAChF,UAAM;AACN,SAAK,UAAU,IAAI,oBAAoB,MAAM,gBAAgB,gBAAgB;AAAA,EAC/E;AAAA,EAEA,SAAS,MAAc;AACrB,SAAK,QAAQ,SAAS,IAAI;AAAA,EAC5B;AAAA,EAEA,QAAQ;AACN,SAAK,QAAQ,MAAM;AAAA,EACrB;AAAA,EAEA,WAAW;AACT,SAAK,QAAQ,SAAS;AAAA,EACxB;AAAA,EAEA,QAAQ;AACN,SAAK,QAAQ,MAAM;AAAA,EACrB;AAAA,EAEA,OAA2C;AACzC,WAAO,KAAK,QAAQ,KAAK;AAAA,EAC3B;AACF;","names":[]}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"token_stream.d.ts","sourceRoot":"","sources":["../../src/tokenize/token_stream.ts"],"names":[],"mappings":"AAIA,OAAO,EAAE,kBAAkB,EAAE,MAAM,aAAa,CAAC;AACjD,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,gBAAgB,CAAC;AAChD,OAAO,EAAE,cAAc,EAAE,UAAU,EAAE,MAAM,gBAAgB,CAAC;AAE5D,KAAK,YAAY,GAAG,CAAC,CAAC,EAAE,MAAM,KAAK,MAAM,EAAE,GAAG,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,CAAC,EAAE,CAAC;AAEzE,qBAAa,mBAAoB,YAAW,qBAAqB,CAAC,SAAS,CAAC;;IAC1E,SAAS,CAAC,KAAK,gCAAuC;IACtD,SAAS,CAAC,MAAM,UAAS;gBAUb,IAAI,EAAE,YAAY,EAAE,cAAc,EAAE,MAAM,EAAE,gBAAgB,EAAE,MAAM;IAQhF,kDAAkD;IAClD,QAAQ,CAAC,IAAI,EAAE,MAAM;
|
|
1
|
+
{"version":3,"file":"token_stream.d.ts","sourceRoot":"","sources":["../../src/tokenize/token_stream.ts"],"names":[],"mappings":"AAIA,OAAO,EAAE,kBAAkB,EAAE,MAAM,aAAa,CAAC;AACjD,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,gBAAgB,CAAC;AAChD,OAAO,EAAE,cAAc,EAAE,UAAU,EAAE,MAAM,gBAAgB,CAAC;AAE5D,KAAK,YAAY,GAAG,CAAC,CAAC,EAAE,MAAM,KAAK,MAAM,EAAE,GAAG,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,CAAC,EAAE,CAAC;AAEzE,qBAAa,mBAAoB,YAAW,qBAAqB,CAAC,SAAS,CAAC;;IAC1E,SAAS,CAAC,KAAK,gCAAuC;IACtD,SAAS,CAAC,MAAM,UAAS;gBAUb,IAAI,EAAE,YAAY,EAAE,cAAc,EAAE,MAAM,EAAE,gBAAgB,EAAE,MAAM;IAQhF,kDAAkD;IAClD,QAAQ,CAAC,IAAI,EAAE,MAAM;IAuCrB,+DAA+D;IAC/D,KAAK;IA4BL,2DAA2D;IAC3D,QAAQ;IAQR,IAAI,IAAI,OAAO,CAAC,cAAc,CAAC,SAAS,CAAC,CAAC;IAI1C,0DAA0D;IAC1D,KAAK;IAKL,CAAC,MAAM,CAAC,aAAa,CAAC,IAAI,mBAAmB;CAG9C;AAED,qBAAa,sBAAuB,SAAQ,cAAc;;gBAG5C,IAAI,EAAE,YAAY,EAAE,cAAc,EAAE,MAAM,EAAE,gBAAgB,EAAE,MAAM;IAKhF,QAAQ,CAAC,IAAI,EAAE,MAAM;IAIrB,KAAK;IAIL,KAAK;IAKL,IAAI,IAAI,OAAO,CAAC,cAAc,CAAC,SAAS,CAAC,CAAC;CAG3C;AAED,qBAAa,kBAAmB,SAAQ,UAAU;;gBAGpC,IAAI,EAAE,YAAY,EAAE,cAAc,EAAE,MAAM,EAAE,gBAAgB,EAAE,MAAM;IAKhF,QAAQ,CAAC,IAAI,EAAE,MAAM;IAIrB,KAAK;IAIL,QAAQ;IAIR,KAAK;IAIL,IAAI,IAAI,OAAO,CAAC,cAAc,CAAC,SAAS,CAAC,CAAC;CAG3C"}
|
|
@@ -29,9 +29,11 @@ class BufferedTokenStream {
|
|
|
29
29
|
if (tokens.length <= 1) break;
|
|
30
30
|
if (this.#outBuf) this.#outBuf += " ";
|
|
31
31
|
const tok = tokens.shift();
|
|
32
|
-
let tokText
|
|
33
|
-
if (
|
|
32
|
+
let tokText;
|
|
33
|
+
if (Array.isArray(tok)) {
|
|
34
34
|
tokText = tok[0];
|
|
35
|
+
} else {
|
|
36
|
+
tokText = tok;
|
|
35
37
|
}
|
|
36
38
|
this.#outBuf += tokText;
|
|
37
39
|
if (this.#outBuf.length >= this.#minTokenLength) {
|
|
@@ -54,7 +56,7 @@ class BufferedTokenStream {
|
|
|
54
56
|
const tokens = this.#func(this.#inBuf);
|
|
55
57
|
if (tokens) {
|
|
56
58
|
if (this.#outBuf) this.#outBuf += " ";
|
|
57
|
-
if (
|
|
59
|
+
if (Array.isArray(tokens[0])) {
|
|
58
60
|
this.#outBuf += tokens.map((tok) => tok[0]).join(" ");
|
|
59
61
|
} else {
|
|
60
62
|
this.#outBuf += tokens.join(" ");
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../../src/tokenize/token_stream.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport { randomUUID } from 'node:crypto';\nimport { AsyncIterableQueue } from '../utils.js';\nimport type { TokenData } from './tokenizer.js';\nimport { SentenceStream, WordStream } from './tokenizer.js';\n\ntype TokenizeFunc = (x: string) => string[] | [string, number, number][];\n\nexport class BufferedTokenStream implements AsyncIterableIterator<TokenData> {\n protected queue = new AsyncIterableQueue<TokenData>();\n protected closed = false;\n\n #func: TokenizeFunc;\n #minTokenLength: number;\n #minContextLength: number;\n #bufTokens: string[] = [];\n #inBuf = '';\n #outBuf = '';\n #currentSegmentId: string;\n\n constructor(func: TokenizeFunc, minTokenLength: number, minContextLength: number) {\n this.#func = func;\n this.#minTokenLength = minTokenLength;\n this.#minContextLength = minContextLength;\n\n this.#currentSegmentId = randomUUID();\n }\n\n /** Push a string of text into the token stream */\n pushText(text: string) {\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n\n this.#inBuf += text;\n if (this.#inBuf.length < this.#minContextLength) return;\n\n while (true) {\n const tokens = this.#func(this.#inBuf);\n if (tokens.length <= 1) break;\n\n if (this.#outBuf) this.#outBuf += ' ';\n\n const tok = tokens.shift()!;\n let tokText
|
|
1
|
+
{"version":3,"sources":["../../src/tokenize/token_stream.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport { randomUUID } from 'node:crypto';\nimport { AsyncIterableQueue } from '../utils.js';\nimport type { TokenData } from './tokenizer.js';\nimport { SentenceStream, WordStream } from './tokenizer.js';\n\ntype TokenizeFunc = (x: string) => string[] | [string, number, number][];\n\nexport class BufferedTokenStream implements AsyncIterableIterator<TokenData> {\n protected queue = new AsyncIterableQueue<TokenData>();\n protected closed = false;\n\n #func: TokenizeFunc;\n #minTokenLength: number;\n #minContextLength: number;\n #bufTokens: string[] = [];\n #inBuf = '';\n #outBuf = '';\n #currentSegmentId: string;\n\n constructor(func: TokenizeFunc, minTokenLength: number, minContextLength: number) {\n this.#func = func;\n this.#minTokenLength = minTokenLength;\n this.#minContextLength = minContextLength;\n\n this.#currentSegmentId = randomUUID();\n }\n\n /** Push a string of text into the token stream */\n pushText(text: string) {\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n\n this.#inBuf += text;\n if (this.#inBuf.length < this.#minContextLength) return;\n\n while (true) {\n const tokens = this.#func(this.#inBuf);\n if (tokens.length <= 1) break;\n\n if (this.#outBuf) this.#outBuf += ' ';\n\n const tok = tokens.shift()!;\n let tokText: string;\n if (Array.isArray(tok)) {\n tokText = tok[0];\n } else {\n tokText = tok;\n }\n\n this.#outBuf += tokText;\n\n if (this.#outBuf.length >= this.#minTokenLength) {\n this.queue.put({ token: this.#outBuf, segmentId: this.#currentSegmentId });\n this.#outBuf = '';\n }\n\n if (typeof tok! !== 'string') {\n this.#inBuf = this.#inBuf.slice(tok![2]);\n } else {\n this.#inBuf = this.#inBuf\n .slice(Math.max(0, this.#inBuf.indexOf(tok)) + tok.length)\n .trimStart();\n }\n }\n }\n\n /** Flush the stream, causing it to process all pending text */\n flush() {\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n\n if (this.#inBuf || this.#outBuf) {\n const tokens = this.#func(this.#inBuf);\n if (tokens) {\n if (this.#outBuf) this.#outBuf += ' ';\n\n if (Array.isArray(tokens[0])) {\n this.#outBuf += tokens.map((tok) => tok[0]).join(' ');\n } else {\n this.#outBuf += tokens.join(' ');\n }\n }\n\n if (this.#outBuf) {\n this.queue.put({ token: this.#outBuf, segmentId: this.#currentSegmentId });\n }\n\n this.#currentSegmentId = randomUUID();\n }\n\n this.#inBuf = '';\n this.#outBuf = '';\n }\n\n /** Mark the input as ended and forbid additional pushes */\n endInput() {\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.flush();\n this.close();\n }\n\n next(): Promise<IteratorResult<TokenData>> {\n return this.queue.next();\n }\n\n /** Close both the input and output of the token stream */\n close() {\n this.queue.close();\n this.closed = true;\n }\n\n [Symbol.asyncIterator](): BufferedTokenStream {\n return this;\n }\n}\n\nexport class BufferedSentenceStream extends SentenceStream {\n #stream: BufferedTokenStream;\n\n constructor(func: TokenizeFunc, minTokenLength: number, minContextLength: number) {\n super();\n this.#stream = new BufferedTokenStream(func, minTokenLength, minContextLength);\n }\n\n pushText(text: string) {\n this.#stream.pushText(text);\n }\n\n flush() {\n this.#stream.flush();\n }\n\n close() {\n super.close();\n this.#stream.close();\n }\n\n next(): Promise<IteratorResult<TokenData>> {\n return this.#stream.next();\n }\n}\n\nexport class BufferedWordStream extends WordStream {\n #stream: BufferedTokenStream;\n\n constructor(func: TokenizeFunc, minTokenLength: number, minContextLength: number) {\n super();\n this.#stream = new BufferedTokenStream(func, minTokenLength, minContextLength);\n }\n\n pushText(text: string) {\n this.#stream.pushText(text);\n }\n\n flush() {\n this.#stream.flush();\n }\n\n endInput() {\n this.#stream.endInput();\n }\n\n close() {\n this.#stream.close();\n }\n\n next(): Promise<IteratorResult<TokenData>> {\n return this.#stream.next();\n }\n}\n"],"mappings":"AAGA,SAAS,kBAAkB;AAC3B,SAAS,0BAA0B;AAEnC,SAAS,gBAAgB,kBAAkB;AAIpC,MAAM,oBAAgE;AAAA,EACjE,QAAQ,IAAI,mBAA8B;AAAA,EAC1C,SAAS;AAAA,EAEnB;AAAA,EACA;AAAA,EACA;AAAA,EACA,aAAuB,CAAC;AAAA,EACxB,SAAS;AAAA,EACT,UAAU;AAAA,EACV;AAAA,EAEA,YAAY,MAAoB,gBAAwB,kBAA0B;AAChF,SAAK,QAAQ;AACb,SAAK,kBAAkB;AACvB,SAAK,oBAAoB;AAEzB,SAAK,oBAAoB,WAAW;AAAA,EACtC;AAAA;AAAA,EAGA,SAAS,MAAc;AACrB,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AAEA,SAAK,UAAU;AACf,QAAI,KAAK,OAAO,SAAS,KAAK,kBAAmB;AAEjD,WAAO,MAAM;AACX,YAAM,SAAS,KAAK,MAAM,KAAK,MAAM;AACrC,UAAI,OAAO,UAAU,EAAG;AAExB,UAAI,KAAK,QAAS,MAAK,WAAW;AAElC,YAAM,MAAM,OAAO,MAAM;AACzB,UAAI;AACJ,UAAI,MAAM,QAAQ,GAAG,GAAG;AACtB,kBAAU,IAAI,CAAC;AAAA,MACjB,OAAO;AACL,kBAAU;AAAA,MACZ;AAEA,WAAK,WAAW;AAEhB,UAAI,KAAK,QAAQ,UAAU,KAAK,iBAAiB;AAC/C,aAAK,MAAM,IAAI,EAAE,OAAO,KAAK,SAAS,WAAW,KAAK,kBAAkB,CAAC;AACzE,aAAK,UAAU;AAAA,MACjB;AAEA,UAAI,OAAO,QAAS,UAAU;AAC5B,aAAK,SAAS,KAAK,OAAO,MAAM,IAAK,CAAC,CAAC;AAAA,MACzC,OAAO;AACL,aAAK,SAAS,KAAK,OAChB,MAAM,KAAK,IAAI,GAAG,KAAK,OAAO,QAAQ,GAAG,CAAC,IAAI,IAAI,MAAM,EACxD,UAAU;AAAA,MACf;AAAA,IACF;AAAA,EACF;AAAA;AAAA,EAGA,QAAQ;AACN,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AAEA,QAAI,KAAK,UAAU,KAAK,SAAS;AAC/B,YAAM,SAAS,KAAK,MAAM,KAAK,MAAM;AACrC,UAAI,QAAQ;AACV,YAAI,KAAK,QAAS,MAAK,WAAW;AAElC,YAAI,MAAM,QAAQ,OAAO,CAAC,CAAC,GAAG;AAC5B,eAAK,WAAW,OAAO,IAAI,CAAC,QAAQ,IAAI,CAAC,CAAC,EAAE,KAAK,GAAG;AAAA,QACtD,OAAO;AACL,eAAK,WAAW,OAAO,KAAK,GAAG;AAAA,QACjC;AAAA,MACF;AAEA,UAAI,KAAK,SAAS;AAChB,aAAK,MAAM,IAAI,EAAE,OAAO,KAAK,SAAS,WAAW,KAAK,kBAAkB,CAAC;AAAA,MAC3E;AAEA,WAAK,oBAAoB,WAAW;AAAA,IACtC;AAEA,SAAK,SAAS;AACd,SAAK,UAAU;AAAA,EACjB;AAAA;AAAA,EAGA,WAAW;AACT,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,MAAM;AACX,SAAK,MAAM;AAAA,EACb;AAAA,EAEA,OAA2C;AACzC,WAAO,KAAK,MAAM,KAAK;AAAA,EACzB;AAAA;AAAA,EAGA,QAAQ;AACN,SAAK,MAAM,MAAM;AACjB,SAAK,SAAS;AAAA,EAChB;AAAA,EAEA,CAAC,OAAO,aAAa,IAAyB;AAC5C,WAAO;AAAA,EACT;AACF;AAEO,MAAM,+BAA+B,eAAe;AAAA,EACzD;AAAA,EAEA,YAAY,MAAoB,gBAAwB,kBAA0B;AAChF,UAAM;AACN,SAAK,UAAU,IAAI,oBAAoB,MAAM,gBAAgB,gBAAgB;AAAA,EAC/E;AAAA,EAEA,SAAS,MAAc;AACrB,SAAK,QAAQ,SAAS,IAAI;AAAA,EAC5B;AAAA,EAEA,QAAQ;AACN,SAAK,QAAQ,MAAM;AAAA,EACrB;AAAA,EAEA,QAAQ;AACN,UAAM,MAAM;AACZ,SAAK,QAAQ,MAAM;AAAA,EACrB;AAAA,EAEA,OAA2C;AACzC,WAAO,KAAK,QAAQ,KAAK;AAAA,EAC3B;AACF;AAEO,MAAM,2BAA2B,WAAW;AAAA,EACjD;AAAA,EAEA,YAAY,MAAoB,gBAAwB,kBAA0B;AAChF,UAAM;AACN,SAAK,UAAU,IAAI,oBAAoB,MAAM,gBAAgB,gBAAgB;AAAA,EAC/E;AAAA,EAEA,SAAS,MAAc;AACrB,SAAK,QAAQ,SAAS,IAAI;AAAA,EAC5B;AAAA,EAEA,QAAQ;AACN,SAAK,QAAQ,MAAM;AAAA,EACrB;AAAA,EAEA,WAAW;AACT,SAAK,QAAQ,SAAS;AAAA,EACxB;AAAA,EAEA,QAAQ;AACN,SAAK,QAAQ,MAAM;AAAA,EACrB;AAAA,EAEA,OAA2C;AACzC,WAAO,KAAK,QAAQ,KAAK;AAAA,EAC3B;AACF;","names":[]}
|
package/dist/transcription.cjs
CHANGED
|
@@ -18,114 +18,231 @@ var __copyProps = (to, from, except, desc) => {
|
|
|
18
18
|
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
|
|
19
19
|
var transcription_exports = {};
|
|
20
20
|
__export(transcription_exports, {
|
|
21
|
-
|
|
21
|
+
TextAudioSynchronizer: () => TextAudioSynchronizer,
|
|
22
|
+
defaultTextSyncOptions: () => defaultTextSyncOptions
|
|
22
23
|
});
|
|
23
24
|
module.exports = __toCommonJS(transcription_exports);
|
|
24
|
-
var
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
25
|
+
var import_protocol = require("@livekit/protocol");
|
|
26
|
+
var import_rtc_node = require("@livekit/rtc-node");
|
|
27
|
+
var import_node_crypto = require("node:crypto");
|
|
28
|
+
var import_node_events = require("node:events");
|
|
29
|
+
var import_tokenize = require("./tokenize/index.cjs");
|
|
30
|
+
var import_utils = require("./utils.cjs");
|
|
31
|
+
const STANDARD_SPEECH_RATE = 3830;
|
|
32
|
+
const defaultTextSyncOptions = {
|
|
33
|
+
language: "",
|
|
34
|
+
speed: 1,
|
|
35
|
+
newSentenceDelay: 400,
|
|
36
|
+
sentenceTokenizer: new import_tokenize.basic.SentenceTokenizer(),
|
|
37
|
+
hyphenateWord: import_tokenize.basic.hyphenateWord,
|
|
38
|
+
splitWords: import_tokenize.basic.splitWords
|
|
39
|
+
};
|
|
40
|
+
class TextAudioSynchronizer extends import_node_events.EventEmitter {
|
|
41
|
+
#opts;
|
|
42
|
+
#speed;
|
|
43
|
+
#closed = false;
|
|
44
|
+
#interrupted = false;
|
|
45
|
+
#closeFut = new import_utils.Future();
|
|
46
|
+
#playingSegIndex = -1;
|
|
47
|
+
#finishedSegIndex = -1;
|
|
48
|
+
#textQChanged = new import_utils.AsyncIterableQueue();
|
|
49
|
+
#textQ = [];
|
|
50
|
+
#audioQChanged = new import_utils.AsyncIterableQueue();
|
|
51
|
+
#audioQ = [];
|
|
52
|
+
#playedText = "";
|
|
53
|
+
#task;
|
|
54
|
+
#audioData;
|
|
55
|
+
#textData;
|
|
56
|
+
constructor(opts) {
|
|
57
|
+
super();
|
|
58
|
+
this.#opts = opts;
|
|
59
|
+
this.#speed = opts.speed * STANDARD_SPEECH_RATE;
|
|
55
60
|
}
|
|
56
61
|
pushAudio(frame) {
|
|
57
|
-
this.#
|
|
62
|
+
this.#checkNotClosed();
|
|
63
|
+
if (!this.#audioData) {
|
|
64
|
+
this.#audioData = { pushedDuration: 0, done: false };
|
|
65
|
+
this.#audioQ.push(this.#audioData);
|
|
66
|
+
this.#audioQChanged.put(1);
|
|
67
|
+
}
|
|
68
|
+
this.#audioData.pushedDuration += frame.samplesPerChannel / frame.sampleRate;
|
|
58
69
|
}
|
|
59
70
|
pushText(text) {
|
|
60
|
-
this.#
|
|
71
|
+
this.#checkNotClosed();
|
|
72
|
+
if (!this.#textData) {
|
|
73
|
+
this.#textData = {
|
|
74
|
+
sentenceStream: this.#opts.sentenceTokenizer.stream(),
|
|
75
|
+
pushedText: "",
|
|
76
|
+
done: false,
|
|
77
|
+
forwardedHyphens: 0,
|
|
78
|
+
forwardedSentences: 0
|
|
79
|
+
};
|
|
80
|
+
this.#textQ.push(this.#textData);
|
|
81
|
+
this.#textQChanged.put(1);
|
|
82
|
+
}
|
|
83
|
+
this.#textData.pushedText += text;
|
|
84
|
+
this.#textData.sentenceStream.pushText(text);
|
|
61
85
|
}
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
86
|
+
markAudioSegmentEnd() {
|
|
87
|
+
this.#checkNotClosed();
|
|
88
|
+
if (!this.#audioData) {
|
|
89
|
+
this.pushAudio(new import_rtc_node.AudioFrame(new Int16Array(), 24e3, 1, 0));
|
|
90
|
+
}
|
|
91
|
+
this.#audioData.done = true;
|
|
92
|
+
this.#audioData = void 0;
|
|
67
93
|
}
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
this.#
|
|
94
|
+
markTextSegmentEnd() {
|
|
95
|
+
var _a, _b;
|
|
96
|
+
this.#checkNotClosed();
|
|
97
|
+
if (!this.#textData) {
|
|
98
|
+
this.pushText("");
|
|
99
|
+
}
|
|
100
|
+
this.#textData.done = true;
|
|
101
|
+
(_a = this.#textData) == null ? void 0 : _a.sentenceStream.flush();
|
|
102
|
+
(_b = this.#textData) == null ? void 0 : _b.sentenceStream.close();
|
|
103
|
+
this.#textData = void 0;
|
|
71
104
|
}
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
}
|
|
105
|
+
segmentPlayoutStarted() {
|
|
106
|
+
this.#checkNotClosed();
|
|
107
|
+
this.#playingSegIndex++;
|
|
108
|
+
if (!this.#task) {
|
|
109
|
+
this.#task = this.#mainLoop();
|
|
78
110
|
}
|
|
79
111
|
}
|
|
80
|
-
|
|
81
|
-
|
|
112
|
+
segmentPlayoutFinished() {
|
|
113
|
+
this.#checkNotClosed();
|
|
114
|
+
this.#finishedSegIndex++;
|
|
82
115
|
}
|
|
83
|
-
|
|
84
|
-
this.#
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
}
|
|
116
|
+
get playedText() {
|
|
117
|
+
return this.#playedText;
|
|
118
|
+
}
|
|
119
|
+
async close(interrupt) {
|
|
120
|
+
if (this.#closed) {
|
|
121
|
+
return;
|
|
122
|
+
}
|
|
123
|
+
this.#closed = true;
|
|
124
|
+
this.#interrupted = interrupt;
|
|
125
|
+
this.#closeFut.resolve();
|
|
126
|
+
for (const textData of this.#textQ) {
|
|
127
|
+
textData == null ? void 0 : textData.sentenceStream.close();
|
|
96
128
|
}
|
|
97
|
-
|
|
98
|
-
|
|
129
|
+
this.#textQ.push(void 0);
|
|
130
|
+
this.#audioQ.push(void 0);
|
|
131
|
+
this.#textQChanged.put(1);
|
|
132
|
+
this.#audioQChanged.put(1);
|
|
133
|
+
await this.#task;
|
|
134
|
+
}
|
|
135
|
+
async #mainLoop() {
|
|
136
|
+
let segIndex = 0;
|
|
137
|
+
let qDone = false;
|
|
138
|
+
while (!qDone) {
|
|
139
|
+
await this.#textQChanged.next();
|
|
140
|
+
await this.#audioQChanged.next();
|
|
141
|
+
while (this.#textQ.length && this.#audioQ.length) {
|
|
142
|
+
const textData = this.#textQ.pop();
|
|
143
|
+
const audioData = this.#audioQ.pop();
|
|
144
|
+
if (!(textData && audioData)) {
|
|
145
|
+
qDone = true;
|
|
146
|
+
break;
|
|
147
|
+
}
|
|
148
|
+
while (!this.#closed) {
|
|
149
|
+
if (this.#playingSegIndex >= segIndex) break;
|
|
150
|
+
await this.#sleepIfNotClosed(125);
|
|
151
|
+
}
|
|
152
|
+
const sentenceStream = textData.sentenceStream;
|
|
153
|
+
const forwardStartTime = Date.now();
|
|
154
|
+
for await (const ev of sentenceStream) {
|
|
155
|
+
await this.#syncSentence(segIndex, forwardStartTime, textData, audioData, ev.token);
|
|
156
|
+
}
|
|
157
|
+
segIndex++;
|
|
158
|
+
}
|
|
99
159
|
}
|
|
100
160
|
}
|
|
101
|
-
async #
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
161
|
+
async #syncSentence(segIndex, segStartTime, textData, audioData, sentence) {
|
|
162
|
+
let realSpeed;
|
|
163
|
+
if (audioData.pushedDuration > 0 && audioData.done) {
|
|
164
|
+
realSpeed = this.#calcHyphens(textData.pushedText).length / audioData.pushedDuration;
|
|
165
|
+
}
|
|
166
|
+
const segId = "SG_" + (0, import_node_crypto.randomUUID)();
|
|
167
|
+
const words = this.#opts.splitWords(sentence);
|
|
168
|
+
const processedWords = [];
|
|
169
|
+
const ogText = this.#playedText;
|
|
170
|
+
for (const [word, _, end] of words) {
|
|
171
|
+
if (segIndex <= this.#finishedSegIndex) break;
|
|
172
|
+
if (this.#interrupted) return;
|
|
173
|
+
const wordHyphens = this.#opts.hyphenateWord(word).length;
|
|
174
|
+
processedWords.push(word);
|
|
175
|
+
const elapsed = Date.now() - segStartTime;
|
|
176
|
+
const text = sentence.slice(0, end);
|
|
177
|
+
let speed = this.#speed;
|
|
178
|
+
let delay;
|
|
179
|
+
if (realSpeed) {
|
|
180
|
+
speed = realSpeed;
|
|
181
|
+
const estimatedPausesMs = textData.forwardedSentences * this.#opts.newSentenceDelay;
|
|
182
|
+
const hyphPauses = estimatedPausesMs * speed;
|
|
183
|
+
const targetHyphens = Math.round(speed * elapsed);
|
|
184
|
+
const dt = targetHyphens - textData.forwardedHyphens - hyphPauses;
|
|
185
|
+
const toWaitHyphens = Math.max(0, wordHyphens - dt);
|
|
186
|
+
delay = toWaitHyphens / speed;
|
|
187
|
+
} else {
|
|
188
|
+
delay = wordHyphens / speed;
|
|
189
|
+
}
|
|
190
|
+
const firstDelay = Math.min(delay / 2, 2 / speed);
|
|
191
|
+
await this.#sleepIfNotClosed(firstDelay * 1e6);
|
|
192
|
+
this.emit(
|
|
193
|
+
"textUpdated",
|
|
194
|
+
new import_protocol.TranscriptionSegment({
|
|
195
|
+
id: segId,
|
|
196
|
+
text,
|
|
112
197
|
startTime: BigInt(0),
|
|
113
198
|
endTime: BigInt(0),
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
199
|
+
final: false,
|
|
200
|
+
language: this.#opts.language
|
|
201
|
+
})
|
|
202
|
+
);
|
|
203
|
+
this.#playedText = `${ogText} ${text}`;
|
|
204
|
+
await this.#sleepIfNotClosed((delay - firstDelay) * 1e6);
|
|
205
|
+
textData.forwardedHyphens += wordHyphens;
|
|
206
|
+
}
|
|
207
|
+
this.emit(
|
|
208
|
+
"textUpdated",
|
|
209
|
+
new import_protocol.TranscriptionSegment({
|
|
210
|
+
id: segId,
|
|
211
|
+
text: sentence,
|
|
212
|
+
startTime: BigInt(0),
|
|
213
|
+
endTime: BigInt(0),
|
|
214
|
+
final: true,
|
|
215
|
+
language: this.#opts.language
|
|
216
|
+
})
|
|
217
|
+
);
|
|
218
|
+
this.#playedText = `${ogText} ${sentence}`;
|
|
219
|
+
await this.#sleepIfNotClosed(this.#opts.newSentenceDelay);
|
|
220
|
+
textData.forwardedSentences++;
|
|
118
221
|
}
|
|
119
|
-
async
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
222
|
+
async #sleepIfNotClosed(delay) {
|
|
223
|
+
await Promise.race([
|
|
224
|
+
this.#closeFut.await,
|
|
225
|
+
new Promise((resolve) => setTimeout(resolve, delay))
|
|
226
|
+
]);
|
|
227
|
+
}
|
|
228
|
+
#calcHyphens(text) {
|
|
229
|
+
const hyphens = [];
|
|
230
|
+
const words = this.#opts.splitWords(text);
|
|
231
|
+
for (const word of words) {
|
|
232
|
+
const n = this.#opts.hyphenateWord(word[0]);
|
|
233
|
+
hyphens.push(...n);
|
|
234
|
+
}
|
|
235
|
+
return hyphens;
|
|
236
|
+
}
|
|
237
|
+
#checkNotClosed() {
|
|
238
|
+
if (this.#closed) {
|
|
239
|
+
throw new Error("TextAudioSynchronizer is closed");
|
|
123
240
|
}
|
|
124
|
-
await this.#publishTranscription(true);
|
|
125
241
|
}
|
|
126
242
|
}
|
|
127
243
|
// Annotate the CommonJS export names for ESM import in node:
|
|
128
244
|
0 && (module.exports = {
|
|
129
|
-
|
|
245
|
+
TextAudioSynchronizer,
|
|
246
|
+
defaultTextSyncOptions
|
|
130
247
|
});
|
|
131
248
|
//# sourceMappingURL=transcription.cjs.map
|