@livekit/agents 0.4.5 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (209) hide show
  1. package/README.md +17 -0
  2. package/dist/audio.cjs +77 -0
  3. package/dist/audio.cjs.map +1 -0
  4. package/dist/audio.js +48 -37
  5. package/dist/audio.js.map +1 -1
  6. package/dist/cli.cjs +131 -0
  7. package/dist/cli.cjs.map +1 -0
  8. package/dist/cli.js +96 -122
  9. package/dist/cli.js.map +1 -1
  10. package/dist/generator.cjs +36 -0
  11. package/dist/generator.cjs.map +1 -0
  12. package/dist/generator.js +8 -22
  13. package/dist/generator.js.map +1 -1
  14. package/dist/http_server.cjs +72 -0
  15. package/dist/http_server.cjs.map +1 -0
  16. package/dist/http_server.d.ts +1 -1
  17. package/dist/http_server.js +44 -47
  18. package/dist/http_server.js.map +1 -1
  19. package/dist/index.cjs +78 -0
  20. package/dist/index.cjs.map +1 -0
  21. package/dist/index.js +26 -28
  22. package/dist/index.js.map +1 -1
  23. package/dist/ipc/job_executor.cjs +33 -0
  24. package/dist/ipc/job_executor.cjs.map +1 -0
  25. package/dist/ipc/job_executor.js +7 -4
  26. package/dist/ipc/job_executor.js.map +1 -1
  27. package/dist/ipc/job_main.cjs +147 -0
  28. package/dist/ipc/job_main.cjs.map +1 -0
  29. package/dist/ipc/job_main.d.ts +1 -1
  30. package/dist/ipc/job_main.js +103 -103
  31. package/dist/ipc/job_main.js.map +1 -1
  32. package/dist/ipc/message.cjs +17 -0
  33. package/dist/ipc/message.cjs.map +1 -0
  34. package/dist/ipc/message.js +0 -1
  35. package/dist/ipc/message.js.map +1 -1
  36. package/dist/ipc/proc_job_executor.cjs +174 -0
  37. package/dist/ipc/proc_job_executor.cjs.map +1 -0
  38. package/dist/ipc/proc_job_executor.js +130 -126
  39. package/dist/ipc/proc_job_executor.js.map +1 -1
  40. package/dist/ipc/proc_pool.cjs +126 -0
  41. package/dist/ipc/proc_pool.cjs.map +1 -0
  42. package/dist/ipc/proc_pool.js +93 -96
  43. package/dist/ipc/proc_pool.js.map +1 -1
  44. package/dist/job.cjs +230 -0
  45. package/dist/job.cjs.map +1 -0
  46. package/dist/job.js +195 -198
  47. package/dist/job.js.map +1 -1
  48. package/dist/llm/chat_context.cjs +131 -0
  49. package/dist/llm/chat_context.cjs.map +1 -0
  50. package/dist/llm/chat_context.js +98 -86
  51. package/dist/llm/chat_context.js.map +1 -1
  52. package/dist/llm/function_context.cjs +103 -0
  53. package/dist/llm/function_context.cjs.map +1 -0
  54. package/dist/llm/function_context.js +72 -81
  55. package/dist/llm/function_context.js.map +1 -1
  56. package/dist/llm/function_context.test.cjs +218 -0
  57. package/dist/llm/function_context.test.cjs.map +1 -0
  58. package/dist/llm/function_context.test.js +209 -210
  59. package/dist/llm/function_context.test.js.map +1 -1
  60. package/dist/llm/index.cjs +43 -0
  61. package/dist/llm/index.cjs.map +1 -0
  62. package/dist/llm/index.js +22 -6
  63. package/dist/llm/index.js.map +1 -1
  64. package/dist/llm/llm.cjs +76 -0
  65. package/dist/llm/llm.cjs.map +1 -0
  66. package/dist/llm/llm.js +48 -42
  67. package/dist/llm/llm.js.map +1 -1
  68. package/dist/log.cjs +57 -0
  69. package/dist/log.cjs.map +1 -0
  70. package/dist/log.js +27 -26
  71. package/dist/log.js.map +1 -1
  72. package/dist/multimodal/agent_playout.cjs +228 -0
  73. package/dist/multimodal/agent_playout.cjs.map +1 -0
  74. package/dist/multimodal/agent_playout.d.ts +1 -1
  75. package/dist/multimodal/agent_playout.js +193 -180
  76. package/dist/multimodal/agent_playout.js.map +1 -1
  77. package/dist/multimodal/index.cjs +25 -0
  78. package/dist/multimodal/index.cjs.map +1 -0
  79. package/dist/multimodal/index.js +2 -5
  80. package/dist/multimodal/index.js.map +1 -1
  81. package/dist/multimodal/multimodal_agent.cjs +404 -0
  82. package/dist/multimodal/multimodal_agent.cjs.map +1 -0
  83. package/dist/multimodal/multimodal_agent.d.ts +2 -2
  84. package/dist/multimodal/multimodal_agent.d.ts.map +1 -1
  85. package/dist/multimodal/multimodal_agent.js +351 -303
  86. package/dist/multimodal/multimodal_agent.js.map +1 -1
  87. package/dist/pipeline/agent_output.cjs +172 -0
  88. package/dist/pipeline/agent_output.cjs.map +1 -0
  89. package/dist/pipeline/agent_output.js +136 -138
  90. package/dist/pipeline/agent_output.js.map +1 -1
  91. package/dist/pipeline/agent_playout.cjs +169 -0
  92. package/dist/pipeline/agent_playout.cjs.map +1 -0
  93. package/dist/pipeline/agent_playout.js +126 -136
  94. package/dist/pipeline/agent_playout.js.map +1 -1
  95. package/dist/pipeline/human_input.cjs +158 -0
  96. package/dist/pipeline/human_input.cjs.map +1 -0
  97. package/dist/pipeline/human_input.js +124 -125
  98. package/dist/pipeline/human_input.js.map +1 -1
  99. package/dist/pipeline/index.cjs +31 -0
  100. package/dist/pipeline/index.cjs.map +1 -0
  101. package/dist/pipeline/index.js +8 -4
  102. package/dist/pipeline/index.js.map +1 -1
  103. package/dist/pipeline/pipeline_agent.cjs +642 -0
  104. package/dist/pipeline/pipeline_agent.cjs.map +1 -0
  105. package/dist/pipeline/pipeline_agent.d.ts +1 -0
  106. package/dist/pipeline/pipeline_agent.d.ts.map +1 -1
  107. package/dist/pipeline/pipeline_agent.js +595 -650
  108. package/dist/pipeline/pipeline_agent.js.map +1 -1
  109. package/dist/pipeline/speech_handle.cjs +128 -0
  110. package/dist/pipeline/speech_handle.cjs.map +1 -0
  111. package/dist/pipeline/speech_handle.js +102 -100
  112. package/dist/pipeline/speech_handle.js.map +1 -1
  113. package/dist/plugin.cjs +46 -0
  114. package/dist/plugin.cjs.map +1 -0
  115. package/dist/plugin.js +20 -20
  116. package/dist/plugin.js.map +1 -1
  117. package/dist/stt/index.cjs +38 -0
  118. package/dist/stt/index.cjs.map +1 -0
  119. package/dist/stt/index.js +13 -5
  120. package/dist/stt/index.js.map +1 -1
  121. package/dist/stt/stream_adapter.cjs +87 -0
  122. package/dist/stt/stream_adapter.cjs.map +1 -0
  123. package/dist/stt/stream_adapter.js +58 -55
  124. package/dist/stt/stream_adapter.js.map +1 -1
  125. package/dist/stt/stt.cjs +98 -0
  126. package/dist/stt/stt.cjs.map +1 -0
  127. package/dist/stt/stt.js +63 -98
  128. package/dist/stt/stt.js.map +1 -1
  129. package/dist/tokenize/basic/basic.cjs +98 -0
  130. package/dist/tokenize/basic/basic.cjs.map +1 -0
  131. package/dist/tokenize/basic/basic.js +56 -45
  132. package/dist/tokenize/basic/basic.js.map +1 -1
  133. package/dist/tokenize/basic/hyphenator.cjs +425 -0
  134. package/dist/tokenize/basic/hyphenator.cjs.map +1 -0
  135. package/dist/tokenize/basic/hyphenator.js +66 -82
  136. package/dist/tokenize/basic/hyphenator.js.map +1 -1
  137. package/dist/tokenize/basic/index.cjs +35 -0
  138. package/dist/tokenize/basic/index.cjs.map +1 -0
  139. package/dist/tokenize/basic/index.js +7 -4
  140. package/dist/tokenize/basic/index.js.map +1 -1
  141. package/dist/tokenize/basic/paragraph.cjs +57 -0
  142. package/dist/tokenize/basic/paragraph.cjs.map +1 -0
  143. package/dist/tokenize/basic/paragraph.js +30 -35
  144. package/dist/tokenize/basic/paragraph.js.map +1 -1
  145. package/dist/tokenize/basic/sentence.cjs +83 -0
  146. package/dist/tokenize/basic/sentence.cjs.map +1 -0
  147. package/dist/tokenize/basic/sentence.js +56 -57
  148. package/dist/tokenize/basic/sentence.js.map +1 -1
  149. package/dist/tokenize/basic/word.cjs +44 -0
  150. package/dist/tokenize/basic/word.cjs.map +1 -0
  151. package/dist/tokenize/basic/word.js +17 -20
  152. package/dist/tokenize/basic/word.js.map +1 -1
  153. package/dist/tokenize/index.cjs +55 -0
  154. package/dist/tokenize/index.cjs.map +1 -0
  155. package/dist/tokenize/index.js +18 -7
  156. package/dist/tokenize/index.js.map +1 -1
  157. package/dist/tokenize/token_stream.cjs +164 -0
  158. package/dist/tokenize/token_stream.cjs.map +1 -0
  159. package/dist/tokenize/token_stream.js +133 -139
  160. package/dist/tokenize/token_stream.js.map +1 -1
  161. package/dist/tokenize/tokenizer.cjs +184 -0
  162. package/dist/tokenize/tokenizer.cjs.map +1 -0
  163. package/dist/tokenize/tokenizer.js +138 -99
  164. package/dist/tokenize/tokenizer.js.map +1 -1
  165. package/dist/transcription.cjs +131 -0
  166. package/dist/transcription.cjs.map +1 -0
  167. package/dist/transcription.d.ts +2 -0
  168. package/dist/transcription.d.ts.map +1 -1
  169. package/dist/transcription.js +99 -93
  170. package/dist/transcription.js.map +1 -1
  171. package/dist/tts/index.cjs +38 -0
  172. package/dist/tts/index.cjs.map +1 -0
  173. package/dist/tts/index.js +13 -5
  174. package/dist/tts/index.js.map +1 -1
  175. package/dist/tts/stream_adapter.cjs +78 -0
  176. package/dist/tts/stream_adapter.cjs.map +1 -0
  177. package/dist/tts/stream_adapter.js +50 -47
  178. package/dist/tts/stream_adapter.js.map +1 -1
  179. package/dist/tts/tts.cjs +127 -0
  180. package/dist/tts/tts.cjs.map +1 -0
  181. package/dist/tts/tts.js +90 -120
  182. package/dist/tts/tts.js.map +1 -1
  183. package/dist/utils.cjs +284 -0
  184. package/dist/utils.cjs.map +1 -0
  185. package/dist/utils.js +242 -247
  186. package/dist/utils.js.map +1 -1
  187. package/dist/vad.cjs +92 -0
  188. package/dist/vad.cjs.map +1 -0
  189. package/dist/vad.js +57 -52
  190. package/dist/vad.js.map +1 -1
  191. package/dist/version.cjs +29 -0
  192. package/dist/version.cjs.map +1 -0
  193. package/dist/version.js +4 -4
  194. package/dist/version.js.map +1 -1
  195. package/dist/worker.cjs +576 -0
  196. package/dist/worker.cjs.map +1 -0
  197. package/dist/worker.d.ts +1 -1
  198. package/dist/worker.js +511 -484
  199. package/dist/worker.js.map +1 -1
  200. package/package.json +23 -7
  201. package/src/ipc/job_main.ts +66 -64
  202. package/src/multimodal/multimodal_agent.ts +29 -2
  203. package/src/pipeline/pipeline_agent.ts +25 -24
  204. package/src/transcription.ts +5 -0
  205. package/.turbo/turbo-build.log +0 -4
  206. package/CHANGELOG.md +0 -165
  207. package/api-extractor.json +0 -20
  208. package/tsconfig.json +0 -16
  209. package/tsconfig.tsbuildinfo +0 -1
@@ -1,60 +1,59 @@
1
- // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
2
- //
3
- // SPDX-License-Identifier: Apache-2.0
4
- /**
5
- * Split the text into sentences.
6
- */
7
- export const splitSentences = (text, minLength = 20) => {
8
- const alphabets = /([A-Za-z])/g;
9
- const prefixes = /(Mr|St|Mrs|Ms|Dr)[.]/g;
10
- const suffixes = /(Inc|Ltd|Jr|Sr|Co)/g;
11
- const starters = /(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)/g;
12
- const acronyms = /([A-Z][.][A-Z][.](?:[A-Z][.])?)/g;
13
- const websites = /[.](com|net|org|io|gov|edu|me)/g;
14
- const digits = /([0-9])/g;
15
- const dots = /\.{2,}/g;
16
- text = text.replaceAll('\n', ' ');
17
- text = text.replaceAll(prefixes, '$1<prd>');
18
- text = text.replaceAll(websites, '<prd>$2');
19
- text = text.replaceAll(new RegExp(`${digits}[.]${digits}`, 'g'), '$1<prd>$2');
20
- text = text.replaceAll(dots, (match) => '<prd>'.repeat(match.length));
21
- text = text.replaceAll('Ph.D.', 'Ph<prd>D<prd>');
22
- text = text.replaceAll(new RegExp(`\s${alphabets}[.]`, 'g'), '$1<prd>');
23
- text = text.replaceAll(new RegExp(`${acronyms} ${starters}`, 'g'), '$1<stop> $2');
24
- text = text.replaceAll(new RegExp(`${alphabets}[.]${alphabets}[.]${alphabets}[.]`, 'g'), '$1<prd>$2<prd>$3<prd>');
25
- text = text.replaceAll(new RegExp(`${alphabets}[.]${alphabets}[.]`, 'g'), '$1<prd>$2<prd>');
26
- text = text.replaceAll(new RegExp(` ${suffixes}[.] ${starters}`, 'g'), '$1<stop> $2');
27
- text = text.replaceAll(new RegExp(` ${suffixes}[.]`, 'g'), '$1<prd>');
28
- text = text.replaceAll(new RegExp(` ${alphabets}[.]`, 'g'), '$1<prd>');
29
- text = text.replaceAll('.”', '”.');
30
- text = text.replaceAll('."', '".');
31
- text = text.replaceAll('!"', '"!');
32
- text = text.replaceAll('?"', '"?');
33
- text = text.replaceAll('.', '.<stop>');
34
- text = text.replaceAll('?', '?<stop>');
35
- text = text.replaceAll('!', '!<stop>');
36
- text = text.replaceAll('<prd>', '.');
37
- const split = text.split('<stop>');
38
- text = text.replaceAll('<stop>', '');
39
- const sentences = [];
40
- let buf = '';
41
- let start = 0;
42
- let end = 0;
43
- for (const match of split) {
44
- const sentence = match.trim();
45
- if (!sentence)
46
- continue;
47
- buf += ' ' + sentence;
48
- end += match.length;
49
- if (buf.length > minLength) {
50
- sentences.push([buf.slice(1), start, end]);
51
- start = end;
52
- buf = '';
53
- }
1
+ const splitSentences = (text, minLength = 20) => {
2
+ const alphabets = /([A-Za-z])/g;
3
+ const prefixes = /(Mr|St|Mrs|Ms|Dr)[.]/g;
4
+ const suffixes = /(Inc|Ltd|Jr|Sr|Co)/g;
5
+ const starters = /(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)/g;
6
+ const acronyms = /([A-Z][.][A-Z][.](?:[A-Z][.])?)/g;
7
+ const websites = /[.](com|net|org|io|gov|edu|me)/g;
8
+ const digits = /([0-9])/g;
9
+ const dots = /\.{2,}/g;
10
+ text = text.replaceAll("\n", " ");
11
+ text = text.replaceAll(prefixes, "$1<prd>");
12
+ text = text.replaceAll(websites, "<prd>$2");
13
+ text = text.replaceAll(new RegExp(`${digits}[.]${digits}`, "g"), "$1<prd>$2");
14
+ text = text.replaceAll(dots, (match) => "<prd>".repeat(match.length));
15
+ text = text.replaceAll("Ph.D.", "Ph<prd>D<prd>");
16
+ text = text.replaceAll(new RegExp(`s${alphabets}[.]`, "g"), "$1<prd>");
17
+ text = text.replaceAll(new RegExp(`${acronyms} ${starters}`, "g"), "$1<stop> $2");
18
+ text = text.replaceAll(
19
+ new RegExp(`${alphabets}[.]${alphabets}[.]${alphabets}[.]`, "g"),
20
+ "$1<prd>$2<prd>$3<prd>"
21
+ );
22
+ text = text.replaceAll(new RegExp(`${alphabets}[.]${alphabets}[.]`, "g"), "$1<prd>$2<prd>");
23
+ text = text.replaceAll(new RegExp(` ${suffixes}[.] ${starters}`, "g"), "$1<stop> $2");
24
+ text = text.replaceAll(new RegExp(` ${suffixes}[.]`, "g"), "$1<prd>");
25
+ text = text.replaceAll(new RegExp(` ${alphabets}[.]`, "g"), "$1<prd>");
26
+ text = text.replaceAll(".\u201D", "\u201D.");
27
+ text = text.replaceAll('."', '".');
28
+ text = text.replaceAll('!"', '"!');
29
+ text = text.replaceAll('?"', '"?');
30
+ text = text.replaceAll(".", ".<stop>");
31
+ text = text.replaceAll("?", "?<stop>");
32
+ text = text.replaceAll("!", "!<stop>");
33
+ text = text.replaceAll("<prd>", ".");
34
+ const split = text.split("<stop>");
35
+ text = text.replaceAll("<stop>", "");
36
+ const sentences = [];
37
+ let buf = "";
38
+ let start = 0;
39
+ let end = 0;
40
+ for (const match of split) {
41
+ const sentence = match.trim();
42
+ if (!sentence) continue;
43
+ buf += " " + sentence;
44
+ end += match.length;
45
+ if (buf.length > minLength) {
46
+ sentences.push([buf.slice(1), start, end]);
47
+ start = end;
48
+ buf = "";
54
49
  }
55
- if (buf) {
56
- sentences.push([buf.slice(1), start, text.length - 1]);
57
- }
58
- return sentences;
50
+ }
51
+ if (buf) {
52
+ sentences.push([buf.slice(1), start, text.length - 1]);
53
+ }
54
+ return sentences;
55
+ };
56
+ export {
57
+ splitSentences
59
58
  };
60
59
  //# sourceMappingURL=sentence.js.map
@@ -1 +1 @@
1
- {"version":3,"file":"sentence.js","sourceRoot":"","sources":["../../../src/tokenize/basic/sentence.ts"],"names":[],"mappings":"AAAA,6CAA6C;AAC7C,EAAE;AACF,sCAAsC;AAEtC;;GAEG;AACH,MAAM,CAAC,MAAM,cAAc,GAAG,CAAC,IAAY,EAAE,SAAS,GAAG,EAAE,EAA8B,EAAE;IACzF,MAAM,SAAS,GAAG,aAAa,CAAC;IAChC,MAAM,QAAQ,GAAG,uBAAuB,CAAC;IACzC,MAAM,QAAQ,GAAG,qBAAqB,CAAC;IACvC,MAAM,QAAQ,GACZ,mHAAmH,CAAC;IACtH,MAAM,QAAQ,GAAG,kCAAkC,CAAC;IACpD,MAAM,QAAQ,GAAG,iCAAiC,CAAC;IACnD,MAAM,MAAM,GAAG,UAAU,CAAC;IAC1B,MAAM,IAAI,GAAG,SAAS,CAAC;IAEvB,IAAI,GAAG,IAAI,CAAC,UAAU,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;IAClC,IAAI,GAAG,IAAI,CAAC,UAAU,CAAC,QAAQ,EAAE,SAAS,CAAC,CAAC;IAC5C,IAAI,GAAG,IAAI,CAAC,UAAU,CAAC,QAAQ,EAAE,SAAS,CAAC,CAAC;IAC5C,IAAI,GAAG,IAAI,CAAC,UAAU,CAAC,IAAI,MAAM,CAAC,GAAG,MAAM,MAAM,MAAM,EAAE,EAAE,GAAG,CAAC,EAAE,WAAW,CAAC,CAAC;IAC9E,IAAI,GAAG,IAAI,CAAC,UAAU,CAAC,IAAI,EAAE,CAAC,KAAK,EAAE,EAAE,CAAC,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC;IACtE,IAAI,GAAG,IAAI,CAAC,UAAU,CAAC,OAAO,EAAE,eAAe,CAAC,CAAC;IACjD,IAAI,GAAG,IAAI,CAAC,UAAU,CAAC,IAAI,MAAM,CAAC,KAAK,SAAS,KAAK,EAAE,GAAG,CAAC,EAAE,SAAS,CAAC,CAAC;IACxE,IAAI,GAAG,IAAI,CAAC,UAAU,CAAC,IAAI,MAAM,CAAC,GAAG,QAAQ,IAAI,QAAQ,EAAE,EAAE,GAAG,CAAC,EAAE,aAAa,CAAC,CAAC;IAClF,IAAI,GAAG,IAAI,CAAC,UAAU,CACpB,IAAI,MAAM,CAAC,GAAG,SAAS,MAAM,SAAS,MAAM,SAAS,KAAK,EAAE,GAAG,CAAC,EAChE,uBAAuB,CACxB,CAAC;IACF,IAAI,GAAG,IAAI,CAAC,UAAU,CAAC,IAAI,MAAM,CAAC,GAAG,SAAS,MAAM,SAAS,KAAK,EAAE,GAAG,CAAC,EAAE,gBAAgB,CAAC,CAAC;IAC5F,IAAI,GAAG,IAAI,CAAC,UAAU,CAAC,IAAI,MAAM,CAAC,IAAI,QAAQ,OAAO,QAAQ,EAAE,EAAE,GAAG,CAAC,EAAE,aAAa,CAAC,CAAC;IACtF,IAAI,GAAG,IAAI,CAAC,UAAU,CAAC,IAAI,MAAM,CAAC,IAAI,QAAQ,KAAK,EAAE,GAAG,CAAC,EAAE,SAAS,CAAC,CAAC;IACtE,IAAI,GAAG,IAAI,CAAC,UAAU,CAAC,IAAI,MAAM,CAAC,IAAI,SAAS,KAAK,EAAE,GAAG,CAAC,EAAE,SAAS,CAAC,CAAC;IACvE,IAAI,GAAG,IAAI,CAAC,UAAU,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC;IACnC,IAAI,GAAG,IAAI,CAAC,UAAU,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC;IACnC,IAAI,GAAG,IAAI,CAAC,UAAU,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC;IACnC,IAAI,GAAG,IAAI,CAAC,UAAU,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC;IACnC,IAAI,GAAG,IAAI,CAAC,UAAU,CAAC,GAAG,EAAE,SAAS,CAAC,CAAC;IACvC,IAAI,GAAG,IAAI,CAAC,UAAU,CAAC,GAAG,EAAE,SAAS,CAAC,CAAC;IACvC,IAAI,GAAG,IAAI,CAAC,UAAU,CAAC,GAAG,EAAE,SAAS,CAAC,CAAC;IACvC,IAAI,GAAG,IAAI,CAAC,UAAU,CAAC,OAAO,EAAE,GAAG,CAAC,CAAC;IAErC,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC;IACnC,IAAI,GAAG,IAAI,CAAC,UAAU,CAAC,QAAQ,EAAE,EAAE,CAAC,CAAC;IAErC,MAAM,SAAS,GAA+B,EAAE,CAAC;IACjD,IAAI,GAAG,GAAG,EAAE,CAAC;IACb,IAAI,KAAK,GAAG,CAAC,CAAC;IACd,IAAI,GAAG,GAAG,CAAC,CAAC;IACZ,KAAK,MAAM,KAAK,IAAI,KAAK,EAAE,CAAC;QAC1B,MAAM,QAAQ,GAAG,KAAK,CAAC,IAAI,EAAE,CAAC;QAC9B,IAAI,CAAC,QAAQ;YAAE,SAAS;QAExB,GAAG,IAAI,GAAG,GAAG,QAAQ,CAAC;QACtB,GAAG,IAAI,KAAK,CAAC,MAAM,CAAC;QACpB,IAAI,GAAG,CAAC,MAAM,GAAG,SAAS,EAAE,CAAC;YAC3B,SAAS,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,KAAK,EAAE,GAAG,CAAC,CAAC,CAAC;YAC3C,KAAK,GAAG,GAAG,CAAC;YACZ,GAAG,GAAG,EAAE,CAAC;QACX,CAAC;IACH,CAAC;IAED,IAAI,GAAG,EAAE,CAAC;QACR,SAAS,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,KAAK,EAAE,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC;IACzD,CAAC;IAED,OAAO,SAAS,CAAC;AACnB,CAAC,CAAC"}
1
+ {"version":3,"sources":["../../../src/tokenize/basic/sentence.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\n\n/**\n * Split the text into sentences.\n */\nexport const splitSentences = (text: string, minLength = 20): [string, number, number][] => {\n const alphabets = /([A-Za-z])/g;\n const prefixes = /(Mr|St|Mrs|Ms|Dr)[.]/g;\n const suffixes = /(Inc|Ltd|Jr|Sr|Co)/g;\n const starters =\n /(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\\s|She\\s|It\\s|They\\s|Their\\s|Our\\s|We\\s|But\\s|However\\s|That\\s|This\\s|Wherever)/g;\n const acronyms = /([A-Z][.][A-Z][.](?:[A-Z][.])?)/g;\n const websites = /[.](com|net|org|io|gov|edu|me)/g;\n const digits = /([0-9])/g;\n const dots = /\\.{2,}/g;\n\n text = text.replaceAll('\\n', ' ');\n text = text.replaceAll(prefixes, '$1<prd>');\n text = text.replaceAll(websites, '<prd>$2');\n text = text.replaceAll(new RegExp(`${digits}[.]${digits}`, 'g'), '$1<prd>$2');\n text = text.replaceAll(dots, (match) => '<prd>'.repeat(match.length));\n text = text.replaceAll('Ph.D.', 'Ph<prd>D<prd>');\n text = text.replaceAll(new RegExp(`\\s${alphabets}[.]`, 'g'), '$1<prd>');\n text = text.replaceAll(new RegExp(`${acronyms} ${starters}`, 'g'), '$1<stop> $2');\n text = text.replaceAll(\n new RegExp(`${alphabets}[.]${alphabets}[.]${alphabets}[.]`, 'g'),\n '$1<prd>$2<prd>$3<prd>',\n );\n text = text.replaceAll(new RegExp(`${alphabets}[.]${alphabets}[.]`, 'g'), '$1<prd>$2<prd>');\n text = text.replaceAll(new RegExp(` ${suffixes}[.] ${starters}`, 'g'), '$1<stop> $2');\n text = text.replaceAll(new RegExp(` ${suffixes}[.]`, 'g'), '$1<prd>');\n text = text.replaceAll(new RegExp(` ${alphabets}[.]`, 'g'), '$1<prd>');\n text = text.replaceAll('.”', '”.');\n text = text.replaceAll('.\"', '\".');\n text = text.replaceAll('!\"', '\"!');\n text = text.replaceAll('?\"', '\"?');\n text = text.replaceAll('.', '.<stop>');\n text = text.replaceAll('?', '?<stop>');\n text = text.replaceAll('!', '!<stop>');\n text = text.replaceAll('<prd>', '.');\n\n const split = text.split('<stop>');\n text = text.replaceAll('<stop>', '');\n\n const sentences: [string, number, number][] = [];\n let buf = '';\n let start = 0;\n let end = 0;\n for (const match of split) {\n const sentence = match.trim();\n if (!sentence) continue;\n\n buf += ' ' + sentence;\n end += match.length;\n if (buf.length > minLength) {\n sentences.push([buf.slice(1), start, end]);\n start = end;\n buf = '';\n }\n }\n\n if (buf) {\n sentences.push([buf.slice(1), start, text.length - 1]);\n }\n\n return sentences;\n};\n"],"mappings":"AAOO,MAAM,iBAAiB,CAAC,MAAc,YAAY,OAAmC;AAC1F,QAAM,YAAY;AAClB,QAAM,WAAW;AACjB,QAAM,WAAW;AACjB,QAAM,WACJ;AACF,QAAM,WAAW;AACjB,QAAM,WAAW;AACjB,QAAM,SAAS;AACf,QAAM,OAAO;AAEb,SAAO,KAAK,WAAW,MAAM,GAAG;AAChC,SAAO,KAAK,WAAW,UAAU,SAAS;AAC1C,SAAO,KAAK,WAAW,UAAU,SAAS;AAC1C,SAAO,KAAK,WAAW,IAAI,OAAO,GAAG,MAAM,MAAM,MAAM,IAAI,GAAG,GAAG,WAAW;AAC5E,SAAO,KAAK,WAAW,MAAM,CAAC,UAAU,QAAQ,OAAO,MAAM,MAAM,CAAC;AACpE,SAAO,KAAK,WAAW,SAAS,eAAe;AAC/C,SAAO,KAAK,WAAW,IAAI,OAAO,IAAK,SAAS,OAAO,GAAG,GAAG,SAAS;AACtE,SAAO,KAAK,WAAW,IAAI,OAAO,GAAG,QAAQ,IAAI,QAAQ,IAAI,GAAG,GAAG,aAAa;AAChF,SAAO,KAAK;AAAA,IACV,IAAI,OAAO,GAAG,SAAS,MAAM,SAAS,MAAM,SAAS,OAAO,GAAG;AAAA,IAC/D;AAAA,EACF;AACA,SAAO,KAAK,WAAW,IAAI,OAAO,GAAG,SAAS,MAAM,SAAS,OAAO,GAAG,GAAG,gBAAgB;AAC1F,SAAO,KAAK,WAAW,IAAI,OAAO,IAAI,QAAQ,OAAO,QAAQ,IAAI,GAAG,GAAG,aAAa;AACpF,SAAO,KAAK,WAAW,IAAI,OAAO,IAAI,QAAQ,OAAO,GAAG,GAAG,SAAS;AACpE,SAAO,KAAK,WAAW,IAAI,OAAO,IAAI,SAAS,OAAO,GAAG,GAAG,SAAS;AACrE,SAAO,KAAK,WAAW,WAAM,SAAI;AACjC,SAAO,KAAK,WAAW,MAAM,IAAI;AACjC,SAAO,KAAK,WAAW,MAAM,IAAI;AACjC,SAAO,KAAK,WAAW,MAAM,IAAI;AACjC,SAAO,KAAK,WAAW,KAAK,SAAS;AACrC,SAAO,KAAK,WAAW,KAAK,SAAS;AACrC,SAAO,KAAK,WAAW,KAAK,SAAS;AACrC,SAAO,KAAK,WAAW,SAAS,GAAG;AAEnC,QAAM,QAAQ,KAAK,MAAM,QAAQ;AACjC,SAAO,KAAK,WAAW,UAAU,EAAE;AAEnC,QAAM,YAAwC,CAAC;AAC/C,MAAI,MAAM;AACV,MAAI,QAAQ;AACZ,MAAI,MAAM;AACV,aAAW,SAAS,OAAO;AACzB,UAAM,WAAW,MAAM,KAAK;AAC5B,QAAI,CAAC,SAAU;AAEf,WAAO,MAAM;AACb,WAAO,MAAM;AACb,QAAI,IAAI,SAAS,WAAW;AAC1B,gBAAU,KAAK,CAAC,IAAI,MAAM,CAAC,GAAG,OAAO,GAAG,CAAC;AACzC,cAAQ;AACR,YAAM;AAAA,IACR;AAAA,EACF;AAEA,MAAI,KAAK;AACP,cAAU,KAAK,CAAC,IAAI,MAAM,CAAC,GAAG,OAAO,KAAK,SAAS,CAAC,CAAC;AAAA,EACvD;AAEA,SAAO;AACT;","names":[]}
@@ -0,0 +1,44 @@
1
+ "use strict";
2
+ var __defProp = Object.defineProperty;
3
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
4
+ var __getOwnPropNames = Object.getOwnPropertyNames;
5
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
6
+ var __export = (target, all) => {
7
+ for (var name in all)
8
+ __defProp(target, name, { get: all[name], enumerable: true });
9
+ };
10
+ var __copyProps = (to, from, except, desc) => {
11
+ if (from && typeof from === "object" || typeof from === "function") {
12
+ for (let key of __getOwnPropNames(from))
13
+ if (!__hasOwnProp.call(to, key) && key !== except)
14
+ __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
15
+ }
16
+ return to;
17
+ };
18
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
19
+ var word_exports = {};
20
+ __export(word_exports, {
21
+ splitWords: () => splitWords
22
+ });
23
+ module.exports = __toCommonJS(word_exports);
24
+ var import_tokenizer = require("../tokenizer.cjs");
25
+ const splitWords = (text, ignorePunctuation = true) => {
26
+ const re = /\S+/g;
27
+ const words = [];
28
+ let arr;
29
+ while ((arr = re.exec(text)) !== null) {
30
+ let word = arr[0];
31
+ const start = arr.index;
32
+ const end = start + word.length;
33
+ if (ignorePunctuation) {
34
+ word = word.replace(new RegExp(`[${import_tokenizer.PUNCTUATIONS.join("")}]`, "g"), "");
35
+ }
36
+ words.push([word, start, end]);
37
+ }
38
+ return words;
39
+ };
40
+ // Annotate the CommonJS export names for ESM import in node:
41
+ 0 && (module.exports = {
42
+ splitWords
43
+ });
44
+ //# sourceMappingURL=word.cjs.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../../../src/tokenize/basic/word.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport { PUNCTUATIONS } from '../tokenizer.js';\n\n/**\n * Split the text into words.\n */\nexport const splitWords = (text: string, ignorePunctuation = true): [string, number, number][] => {\n const re = /\\S+/g;\n const words: [string, number, number][] = [];\n\n let arr;\n while ((arr = re.exec(text)) !== null) {\n let word = arr[0];\n const start = arr.index;\n const end = start + word.length;\n\n if (ignorePunctuation) {\n word = word.replace(new RegExp(`[${PUNCTUATIONS.join('')}]`, 'g'), '');\n }\n\n words.push([word, start, end]);\n }\n\n return words;\n};\n"],"mappings":";;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAGA,uBAA6B;AAKtB,MAAM,aAAa,CAAC,MAAc,oBAAoB,SAAqC;AAChG,QAAM,KAAK;AACX,QAAM,QAAoC,CAAC;AAE3C,MAAI;AACJ,UAAQ,MAAM,GAAG,KAAK,IAAI,OAAO,MAAM;AACrC,QAAI,OAAO,IAAI,CAAC;AAChB,UAAM,QAAQ,IAAI;AAClB,UAAM,MAAM,QAAQ,KAAK;AAEzB,QAAI,mBAAmB;AACrB,aAAO,KAAK,QAAQ,IAAI,OAAO,IAAI,8BAAa,KAAK,EAAE,CAAC,KAAK,GAAG,GAAG,EAAE;AAAA,IACvE;AAEA,UAAM,KAAK,CAAC,MAAM,OAAO,GAAG,CAAC;AAAA,EAC/B;AAEA,SAAO;AACT;","names":[]}
@@ -1,23 +1,20 @@
1
- // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
2
- //
3
- // SPDX-License-Identifier: Apache-2.0
4
- import { PUNCTUATIONS } from '../tokenizer.js';
5
- /**
6
- * Split the text into words.
7
- */
8
- export const splitWords = (text, ignorePunctuation = true) => {
9
- const re = /\S+/g;
10
- const words = [];
11
- let arr;
12
- while ((arr = re.exec(text)) !== null) {
13
- let word = arr[0];
14
- const start = arr.index;
15
- const end = start + word.length;
16
- if (ignorePunctuation) {
17
- word = word.replace(new RegExp(`[${PUNCTUATIONS.join('')}]`, 'g'), '');
18
- }
19
- words.push([word, start, end]);
1
+ import { PUNCTUATIONS } from "../tokenizer.js";
2
+ const splitWords = (text, ignorePunctuation = true) => {
3
+ const re = /\S+/g;
4
+ const words = [];
5
+ let arr;
6
+ while ((arr = re.exec(text)) !== null) {
7
+ let word = arr[0];
8
+ const start = arr.index;
9
+ const end = start + word.length;
10
+ if (ignorePunctuation) {
11
+ word = word.replace(new RegExp(`[${PUNCTUATIONS.join("")}]`, "g"), "");
20
12
  }
21
- return words;
13
+ words.push([word, start, end]);
14
+ }
15
+ return words;
16
+ };
17
+ export {
18
+ splitWords
22
19
  };
23
20
  //# sourceMappingURL=word.js.map
@@ -1 +1 @@
1
- {"version":3,"file":"word.js","sourceRoot":"","sources":["../../../src/tokenize/basic/word.ts"],"names":[],"mappings":"AAAA,6CAA6C;AAC7C,EAAE;AACF,sCAAsC;AACtC,OAAO,EAAE,YAAY,EAAE,MAAM,iBAAiB,CAAC;AAE/C;;GAEG;AACH,MAAM,CAAC,MAAM,UAAU,GAAG,CAAC,IAAY,EAAE,iBAAiB,GAAG,IAAI,EAA8B,EAAE;IAC/F,MAAM,EAAE,GAAG,MAAM,CAAC;IAClB,MAAM,KAAK,GAA+B,EAAE,CAAC;IAE7C,IAAI,GAAG,CAAC;IACR,OAAO,CAAC,GAAG,GAAG,EAAE,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;QACtC,IAAI,IAAI,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC;QAClB,MAAM,KAAK,GAAG,GAAG,CAAC,KAAK,CAAC;QACxB,MAAM,GAAG,GAAG,KAAK,GAAG,IAAI,CAAC,MAAM,CAAC;QAEhC,IAAI,iBAAiB,EAAE,CAAC;YACtB,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,IAAI,MAAM,CAAC,IAAI,YAAY,CAAC,IAAI,CAAC,EAAE,CAAC,GAAG,EAAE,GAAG,CAAC,EAAE,EAAE,CAAC,CAAC;QACzE,CAAC;QAED,KAAK,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,KAAK,EAAE,GAAG,CAAC,CAAC,CAAC;IACjC,CAAC;IAED,OAAO,KAAK,CAAC;AACf,CAAC,CAAC"}
1
+ {"version":3,"sources":["../../../src/tokenize/basic/word.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport { PUNCTUATIONS } from '../tokenizer.js';\n\n/**\n * Split the text into words.\n */\nexport const splitWords = (text: string, ignorePunctuation = true): [string, number, number][] => {\n const re = /\\S+/g;\n const words: [string, number, number][] = [];\n\n let arr;\n while ((arr = re.exec(text)) !== null) {\n let word = arr[0];\n const start = arr.index;\n const end = start + word.length;\n\n if (ignorePunctuation) {\n word = word.replace(new RegExp(`[${PUNCTUATIONS.join('')}]`, 'g'), '');\n }\n\n words.push([word, start, end]);\n }\n\n return words;\n};\n"],"mappings":"AAGA,SAAS,oBAAoB;AAKtB,MAAM,aAAa,CAAC,MAAc,oBAAoB,SAAqC;AAChG,QAAM,KAAK;AACX,QAAM,QAAoC,CAAC;AAE3C,MAAI;AACJ,UAAQ,MAAM,GAAG,KAAK,IAAI,OAAO,MAAM;AACrC,QAAI,OAAO,IAAI,CAAC;AAChB,UAAM,QAAQ,IAAI;AAClB,UAAM,MAAM,QAAQ,KAAK;AAEzB,QAAI,mBAAmB;AACrB,aAAO,KAAK,QAAQ,IAAI,OAAO,IAAI,aAAa,KAAK,EAAE,CAAC,KAAK,GAAG,GAAG,EAAE;AAAA,IACvE;AAEA,UAAM,KAAK,CAAC,MAAM,OAAO,GAAG,CAAC;AAAA,EAC/B;AAEA,SAAO;AACT;","names":[]}
@@ -0,0 +1,55 @@
1
+ "use strict";
2
+ var __create = Object.create;
3
+ var __defProp = Object.defineProperty;
4
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
5
+ var __getOwnPropNames = Object.getOwnPropertyNames;
6
+ var __getProtoOf = Object.getPrototypeOf;
7
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
8
+ var __export = (target, all) => {
9
+ for (var name in all)
10
+ __defProp(target, name, { get: all[name], enumerable: true });
11
+ };
12
+ var __copyProps = (to, from, except, desc) => {
13
+ if (from && typeof from === "object" || typeof from === "function") {
14
+ for (let key of __getOwnPropNames(from))
15
+ if (!__hasOwnProp.call(to, key) && key !== except)
16
+ __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
17
+ }
18
+ return to;
19
+ };
20
+ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
21
+ // If the importer is in node compatibility mode or this is not an ESM
22
+ // file that has been converted to a CommonJS file using a Babel-
23
+ // compatible transform (i.e. "__esModule" has not been set), then set
24
+ // "default" to the CommonJS "module.exports" for node compatibility.
25
+ isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
26
+ mod
27
+ ));
28
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
29
+ var tokenize_exports = {};
30
+ __export(tokenize_exports, {
31
+ BufferedSentenceStream: () => import_token_stream.BufferedSentenceStream,
32
+ BufferedTokenStream: () => import_token_stream.BufferedTokenStream,
33
+ BufferedWordStream: () => import_token_stream.BufferedWordStream,
34
+ SentenceStream: () => import_tokenizer.SentenceStream,
35
+ SentenceTokenizer: () => import_tokenizer.SentenceTokenizer,
36
+ WordStream: () => import_tokenizer.WordStream,
37
+ WordTokenizer: () => import_tokenizer.WordTokenizer,
38
+ basic: () => basic
39
+ });
40
+ module.exports = __toCommonJS(tokenize_exports);
41
+ var basic = __toESM(require("./basic/index.cjs"), 1);
42
+ var import_tokenizer = require("./tokenizer.cjs");
43
+ var import_token_stream = require("./token_stream.cjs");
44
+ // Annotate the CommonJS export names for ESM import in node:
45
+ 0 && (module.exports = {
46
+ BufferedSentenceStream,
47
+ BufferedTokenStream,
48
+ BufferedWordStream,
49
+ SentenceStream,
50
+ SentenceTokenizer,
51
+ WordStream,
52
+ WordTokenizer,
53
+ basic
54
+ });
55
+ //# sourceMappingURL=index.cjs.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../../src/tokenize/index.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport * as basic from './basic/index.js';\n\nexport {\n type TokenData,\n SentenceTokenizer,\n SentenceStream,\n WordTokenizer,\n WordStream,\n} from './tokenizer.js';\n\nexport { BufferedSentenceStream, BufferedTokenStream, BufferedWordStream } from './token_stream.js';\n\nexport { basic };\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAGA,YAAuB;AAEvB,uBAMO;AAEP,0BAAgF;","names":[]}
@@ -1,8 +1,19 @@
1
- // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
2
- //
3
- // SPDX-License-Identifier: Apache-2.0
4
- import * as basic from './basic/index.js';
5
- export { SentenceTokenizer, SentenceStream, WordTokenizer, WordStream, } from './tokenizer.js';
6
- export { BufferedSentenceStream, BufferedTokenStream, BufferedWordStream } from './token_stream.js';
7
- export { basic };
1
+ import * as basic from "./basic/index.js";
2
+ import {
3
+ SentenceTokenizer,
4
+ SentenceStream,
5
+ WordTokenizer,
6
+ WordStream
7
+ } from "./tokenizer.js";
8
+ import { BufferedSentenceStream, BufferedTokenStream, BufferedWordStream } from "./token_stream.js";
9
+ export {
10
+ BufferedSentenceStream,
11
+ BufferedTokenStream,
12
+ BufferedWordStream,
13
+ SentenceStream,
14
+ SentenceTokenizer,
15
+ WordStream,
16
+ WordTokenizer,
17
+ basic
18
+ };
8
19
  //# sourceMappingURL=index.js.map
@@ -1 +1 @@
1
- {"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/tokenize/index.ts"],"names":[],"mappings":"AAAA,6CAA6C;AAC7C,EAAE;AACF,sCAAsC;AACtC,OAAO,KAAK,KAAK,MAAM,kBAAkB,CAAC;AAE1C,OAAO,EAEL,iBAAiB,EACjB,cAAc,EACd,aAAa,EACb,UAAU,GACX,MAAM,gBAAgB,CAAC;AAExB,OAAO,EAAE,sBAAsB,EAAE,mBAAmB,EAAE,kBAAkB,EAAE,MAAM,mBAAmB,CAAC;AAEpG,OAAO,EAAE,KAAK,EAAE,CAAC"}
1
+ {"version":3,"sources":["../../src/tokenize/index.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport * as basic from './basic/index.js';\n\nexport {\n type TokenData,\n SentenceTokenizer,\n SentenceStream,\n WordTokenizer,\n WordStream,\n} from './tokenizer.js';\n\nexport { BufferedSentenceStream, BufferedTokenStream, BufferedWordStream } from './token_stream.js';\n\nexport { basic };\n"],"mappings":"AAGA,YAAY,WAAW;AAEvB;AAAA,EAEE;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,OACK;AAEP,SAAS,wBAAwB,qBAAqB,0BAA0B;","names":[]}
@@ -0,0 +1,164 @@
1
+ "use strict";
2
+ var __defProp = Object.defineProperty;
3
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
4
+ var __getOwnPropNames = Object.getOwnPropertyNames;
5
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
6
+ var __export = (target, all) => {
7
+ for (var name in all)
8
+ __defProp(target, name, { get: all[name], enumerable: true });
9
+ };
10
+ var __copyProps = (to, from, except, desc) => {
11
+ if (from && typeof from === "object" || typeof from === "function") {
12
+ for (let key of __getOwnPropNames(from))
13
+ if (!__hasOwnProp.call(to, key) && key !== except)
14
+ __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
15
+ }
16
+ return to;
17
+ };
18
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
19
+ var token_stream_exports = {};
20
+ __export(token_stream_exports, {
21
+ BufferedSentenceStream: () => BufferedSentenceStream,
22
+ BufferedTokenStream: () => BufferedTokenStream,
23
+ BufferedWordStream: () => BufferedWordStream
24
+ });
25
+ module.exports = __toCommonJS(token_stream_exports);
26
+ var import_node_crypto = require("node:crypto");
27
+ var import_utils = require("../utils.cjs");
28
+ var import_tokenizer = require("./tokenizer.cjs");
29
+ class BufferedTokenStream {
30
+ queue = new import_utils.AsyncIterableQueue();
31
+ closed = false;
32
+ #func;
33
+ #minTokenLength;
34
+ #minContextLength;
35
+ #bufTokens = [];
36
+ #inBuf = "";
37
+ #outBuf = "";
38
+ #currentSegmentId;
39
+ constructor(func, minTokenLength, minContextLength) {
40
+ this.#func = func;
41
+ this.#minTokenLength = minTokenLength;
42
+ this.#minContextLength = minContextLength;
43
+ this.#currentSegmentId = (0, import_node_crypto.randomUUID)();
44
+ }
45
+ /** Push a string of text into the token stream */
46
+ pushText(text) {
47
+ if (this.closed) {
48
+ throw new Error("Stream is closed");
49
+ }
50
+ this.#inBuf += text;
51
+ if (this.#inBuf.length < this.#minContextLength) return;
52
+ while (true) {
53
+ const tokens = this.#func(this.#inBuf);
54
+ if (tokens.length <= 1) break;
55
+ if (this.#outBuf) this.#outBuf += " ";
56
+ const tok = tokens.shift();
57
+ let tokText = tok;
58
+ if (tok.length > 1 && typeof tok[1] === "number") {
59
+ tokText = tok[0];
60
+ }
61
+ this.#outBuf += tokText;
62
+ if (this.#outBuf.length >= this.#minTokenLength) {
63
+ this.queue.put({ token: this.#outBuf, segmentId: this.#currentSegmentId });
64
+ this.#outBuf = "";
65
+ }
66
+ if (typeof tok !== "string") {
67
+ this.#inBuf = this.#inBuf.slice(tok[2]);
68
+ } else {
69
+ this.#inBuf = this.#inBuf.slice(Math.max(0, this.#inBuf.indexOf(tok)) + tok.length).trimStart();
70
+ }
71
+ }
72
+ }
73
+ /** Flush the stream, causing it to process all pending text */
74
+ flush() {
75
+ if (this.closed) {
76
+ throw new Error("Stream is closed");
77
+ }
78
+ if (this.#inBuf || this.#outBuf) {
79
+ const tokens = this.#func(this.#inBuf);
80
+ if (tokens) {
81
+ if (this.#outBuf) this.#outBuf += " ";
82
+ if (typeof tokens[0] !== "string") {
83
+ this.#outBuf += tokens.map((tok) => tok[0]).join(" ");
84
+ } else {
85
+ this.#outBuf += tokens.join(" ");
86
+ }
87
+ }
88
+ if (this.#outBuf) {
89
+ this.queue.put({ token: this.#outBuf, segmentId: this.#currentSegmentId });
90
+ }
91
+ this.#currentSegmentId = (0, import_node_crypto.randomUUID)();
92
+ }
93
+ this.#inBuf = "";
94
+ this.#outBuf = "";
95
+ }
96
+ /** Mark the input as ended and forbid additional pushes */
97
+ endInput() {
98
+ if (this.closed) {
99
+ throw new Error("Stream is closed");
100
+ }
101
+ this.flush();
102
+ this.close();
103
+ }
104
+ next() {
105
+ return this.queue.next();
106
+ }
107
+ /** Close both the input and output of the token stream */
108
+ close() {
109
+ this.queue.close();
110
+ this.closed = true;
111
+ }
112
+ [Symbol.asyncIterator]() {
113
+ return this;
114
+ }
115
+ }
116
+ class BufferedSentenceStream extends import_tokenizer.SentenceStream {
117
+ #stream;
118
+ constructor(func, minTokenLength, minContextLength) {
119
+ super();
120
+ this.#stream = new BufferedTokenStream(func, minTokenLength, minContextLength);
121
+ }
122
+ pushText(text) {
123
+ this.#stream.pushText(text);
124
+ }
125
+ flush() {
126
+ this.#stream.flush();
127
+ }
128
+ close() {
129
+ super.close();
130
+ this.#stream.close();
131
+ }
132
+ next() {
133
+ return this.#stream.next();
134
+ }
135
+ }
136
+ class BufferedWordStream extends import_tokenizer.WordStream {
137
+ #stream;
138
+ constructor(func, minTokenLength, minContextLength) {
139
+ super();
140
+ this.#stream = new BufferedTokenStream(func, minTokenLength, minContextLength);
141
+ }
142
+ pushText(text) {
143
+ this.#stream.pushText(text);
144
+ }
145
+ flush() {
146
+ this.#stream.flush();
147
+ }
148
+ endInput() {
149
+ this.#stream.endInput();
150
+ }
151
+ close() {
152
+ this.#stream.close();
153
+ }
154
+ next() {
155
+ return this.#stream.next();
156
+ }
157
+ }
158
+ // Annotate the CommonJS export names for ESM import in node:
159
+ 0 && (module.exports = {
160
+ BufferedSentenceStream,
161
+ BufferedTokenStream,
162
+ BufferedWordStream
163
+ });
164
+ //# sourceMappingURL=token_stream.cjs.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../../src/tokenize/token_stream.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport { randomUUID } from 'node:crypto';\nimport { AsyncIterableQueue } from '../utils.js';\nimport type { TokenData } from './tokenizer.js';\nimport { SentenceStream, WordStream } from './tokenizer.js';\n\ntype TokenizeFunc = (x: string) => string[] | [string, number, number][];\n\nexport class BufferedTokenStream implements AsyncIterableIterator<TokenData> {\n protected queue = new AsyncIterableQueue<TokenData>();\n protected closed = false;\n\n #func: TokenizeFunc;\n #minTokenLength: number;\n #minContextLength: number;\n #bufTokens: string[] = [];\n #inBuf = '';\n #outBuf = '';\n #currentSegmentId: string;\n\n constructor(func: TokenizeFunc, minTokenLength: number, minContextLength: number) {\n this.#func = func;\n this.#minTokenLength = minTokenLength;\n this.#minContextLength = minContextLength;\n\n this.#currentSegmentId = randomUUID();\n }\n\n /** Push a string of text into the token stream */\n pushText(text: string) {\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n\n this.#inBuf += text;\n if (this.#inBuf.length < this.#minContextLength) return;\n\n while (true) {\n const tokens = this.#func(this.#inBuf);\n if (tokens.length <= 1) break;\n\n if (this.#outBuf) this.#outBuf += ' ';\n\n const tok = tokens.shift()!;\n let tokText = tok as string;\n if (tok.length > 1 && typeof tok[1] === 'number') {\n tokText = tok[0];\n }\n\n this.#outBuf += tokText;\n if (this.#outBuf.length >= this.#minTokenLength) {\n this.queue.put({ token: this.#outBuf, segmentId: this.#currentSegmentId });\n this.#outBuf = '';\n }\n\n if (typeof tok! !== 'string') {\n this.#inBuf = this.#inBuf.slice(tok![2]);\n } else {\n this.#inBuf = this.#inBuf\n .slice(Math.max(0, this.#inBuf.indexOf(tok)) + tok.length)\n .trimStart();\n }\n }\n }\n\n /** Flush the stream, causing it to process all pending text */\n flush() {\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n\n if (this.#inBuf || this.#outBuf) {\n const tokens = this.#func(this.#inBuf);\n if (tokens) {\n if (this.#outBuf) this.#outBuf += ' ';\n\n if (typeof tokens[0] !== 'string') {\n this.#outBuf += tokens.map((tok) => tok[0]).join(' ');\n } else {\n this.#outBuf += tokens.join(' ');\n }\n }\n\n if (this.#outBuf) {\n this.queue.put({ token: this.#outBuf, segmentId: this.#currentSegmentId });\n }\n\n this.#currentSegmentId = randomUUID();\n }\n\n this.#inBuf = '';\n this.#outBuf = '';\n }\n\n /** Mark the input as ended and forbid additional pushes */\n endInput() {\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.flush();\n this.close();\n }\n\n next(): Promise<IteratorResult<TokenData>> {\n return this.queue.next();\n }\n\n /** Close both the input and output of the token stream */\n close() {\n this.queue.close();\n this.closed = true;\n }\n\n [Symbol.asyncIterator](): BufferedTokenStream {\n return this;\n }\n}\n\nexport class BufferedSentenceStream extends SentenceStream {\n #stream: BufferedTokenStream;\n\n constructor(func: TokenizeFunc, minTokenLength: number, minContextLength: number) {\n super();\n this.#stream = new BufferedTokenStream(func, minTokenLength, minContextLength);\n }\n\n pushText(text: string) {\n this.#stream.pushText(text);\n }\n\n flush() {\n this.#stream.flush();\n }\n\n close() {\n super.close();\n this.#stream.close();\n }\n\n next(): Promise<IteratorResult<TokenData>> {\n return this.#stream.next();\n }\n}\n\nexport class BufferedWordStream extends WordStream {\n #stream: BufferedTokenStream;\n\n constructor(func: TokenizeFunc, minTokenLength: number, minContextLength: number) {\n super();\n this.#stream = new BufferedTokenStream(func, minTokenLength, minContextLength);\n }\n\n pushText(text: string) {\n this.#stream.pushText(text);\n }\n\n flush() {\n this.#stream.flush();\n }\n\n endInput() {\n this.#stream.endInput();\n }\n\n close() {\n this.#stream.close();\n }\n\n next(): Promise<IteratorResult<TokenData>> {\n return this.#stream.next();\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAGA,yBAA2B;AAC3B,mBAAmC;AAEnC,uBAA2C;AAIpC,MAAM,oBAAgE;AAAA,EACjE,QAAQ,IAAI,gCAA8B;AAAA,EAC1C,SAAS;AAAA,EAEnB;AAAA,EACA;AAAA,EACA;AAAA,EACA,aAAuB,CAAC;AAAA,EACxB,SAAS;AAAA,EACT,UAAU;AAAA,EACV;AAAA,EAEA,YAAY,MAAoB,gBAAwB,kBAA0B;AAChF,SAAK,QAAQ;AACb,SAAK,kBAAkB;AACvB,SAAK,oBAAoB;AAEzB,SAAK,wBAAoB,+BAAW;AAAA,EACtC;AAAA;AAAA,EAGA,SAAS,MAAc;AACrB,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AAEA,SAAK,UAAU;AACf,QAAI,KAAK,OAAO,SAAS,KAAK,kBAAmB;AAEjD,WAAO,MAAM;AACX,YAAM,SAAS,KAAK,MAAM,KAAK,MAAM;AACrC,UAAI,OAAO,UAAU,EAAG;AAExB,UAAI,KAAK,QAAS,MAAK,WAAW;AAElC,YAAM,MAAM,OAAO,MAAM;AACzB,UAAI,UAAU;AACd,UAAI,IAAI,SAAS,KAAK,OAAO,IAAI,CAAC,MAAM,UAAU;AAChD,kBAAU,IAAI,CAAC;AAAA,MACjB;AAEA,WAAK,WAAW;AAChB,UAAI,KAAK,QAAQ,UAAU,KAAK,iBAAiB;AAC/C,aAAK,MAAM,IAAI,EAAE,OAAO,KAAK,SAAS,WAAW,KAAK,kBAAkB,CAAC;AACzE,aAAK,UAAU;AAAA,MACjB;AAEA,UAAI,OAAO,QAAS,UAAU;AAC5B,aAAK,SAAS,KAAK,OAAO,MAAM,IAAK,CAAC,CAAC;AAAA,MACzC,OAAO;AACL,aAAK,SAAS,KAAK,OAChB,MAAM,KAAK,IAAI,GAAG,KAAK,OAAO,QAAQ,GAAG,CAAC,IAAI,IAAI,MAAM,EACxD,UAAU;AAAA,MACf;AAAA,IACF;AAAA,EACF;AAAA;AAAA,EAGA,QAAQ;AACN,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AAEA,QAAI,KAAK,UAAU,KAAK,SAAS;AAC/B,YAAM,SAAS,KAAK,MAAM,KAAK,MAAM;AACrC,UAAI,QAAQ;AACV,YAAI,KAAK,QAAS,MAAK,WAAW;AAElC,YAAI,OAAO,OAAO,CAAC,MAAM,UAAU;AACjC,eAAK,WAAW,OAAO,IAAI,CAAC,QAAQ,IAAI,CAAC,CAAC,EAAE,KAAK,GAAG;AAAA,QACtD,OAAO;AACL,eAAK,WAAW,OAAO,KAAK,GAAG;AAAA,QACjC;AAAA,MACF;AAEA,UAAI,KAAK,SAAS;AAChB,aAAK,MAAM,IAAI,EAAE,OAAO,KAAK,SAAS,WAAW,KAAK,kBAAkB,CAAC;AAAA,MAC3E;AAEA,WAAK,wBAAoB,+BAAW;AAAA,IACtC;AAEA,SAAK,SAAS;AACd,SAAK,UAAU;AAAA,EACjB;AAAA;AAAA,EAGA,WAAW;AACT,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,MAAM;AACX,SAAK,MAAM;AAAA,EACb;AAAA,EAEA,OAA2C;AACzC,WAAO,KAAK,MAAM,KAAK;AAAA,EACzB;AAAA;AAAA,EAGA,QAAQ;AACN,SAAK,MAAM,MAAM;AACjB,SAAK,SAAS;AAAA,EAChB;AAAA,EAEA,CAAC,OAAO,aAAa,IAAyB;AAC5C,WAAO;AAAA,EACT;AACF;AAEO,MAAM,+BAA+B,gCAAe;AAAA,EACzD;AAAA,EAEA,YAAY,MAAoB,gBAAwB,kBAA0B;AAChF,UAAM;AACN,SAAK,UAAU,IAAI,oBAAoB,MAAM,gBAAgB,gBAAgB;AAAA,EAC/E;AAAA,EAEA,SAAS,MAAc;AACrB,SAAK,QAAQ,SAAS,IAAI;AAAA,EAC5B;AAAA,EAEA,QAAQ;AACN,SAAK,QAAQ,MAAM;AAAA,EACrB;AAAA,EAEA,QAAQ;AACN,UAAM,MAAM;AACZ,SAAK,QAAQ,MAAM;AAAA,EACrB;AAAA,EAEA,OAA2C;AACzC,WAAO,KAAK,QAAQ,KAAK;AAAA,EAC3B;AACF;AAEO,MAAM,2BAA2B,4BAAW;AAAA,EACjD;AAAA,EAEA,YAAY,MAAoB,gBAAwB,kBAA0B;AAChF,UAAM;AACN,SAAK,UAAU,IAAI,oBAAoB,MAAM,gBAAgB,gBAAgB;AAAA,EAC/E;AAAA,EAEA,SAAS,MAAc;AACrB,SAAK,QAAQ,SAAS,IAAI;AAAA,EAC5B;AAAA,EAEA,QAAQ;AACN,SAAK,QAAQ,MAAM;AAAA,EACrB;AAAA,EAEA,WAAW;AACT,SAAK,QAAQ,SAAS;AAAA,EACxB;AAAA,EAEA,QAAQ;AACN,SAAK,QAAQ,MAAM;AAAA,EACrB;AAAA,EAEA,OAA2C;AACzC,WAAO,KAAK,QAAQ,KAAK;AAAA,EAC3B;AACF;","names":[]}