@livekit/agents 0.5.0 → 0.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. package/dist/job.cjs +2 -2
  2. package/dist/job.cjs.map +1 -1
  3. package/dist/job.d.ts +6 -1
  4. package/dist/job.d.ts.map +1 -1
  5. package/dist/job.js +2 -2
  6. package/dist/job.js.map +1 -1
  7. package/dist/pipeline/agent_output.cjs +9 -2
  8. package/dist/pipeline/agent_output.cjs.map +1 -1
  9. package/dist/pipeline/agent_output.d.ts +1 -0
  10. package/dist/pipeline/agent_output.d.ts.map +1 -1
  11. package/dist/pipeline/agent_output.js +9 -2
  12. package/dist/pipeline/agent_output.js.map +1 -1
  13. package/dist/pipeline/pipeline_agent.cjs +2 -4
  14. package/dist/pipeline/pipeline_agent.cjs.map +1 -1
  15. package/dist/pipeline/pipeline_agent.d.ts.map +1 -1
  16. package/dist/pipeline/pipeline_agent.js +2 -4
  17. package/dist/pipeline/pipeline_agent.js.map +1 -1
  18. package/dist/tokenize/basic/basic.cjs +1 -1
  19. package/dist/tokenize/basic/basic.cjs.map +1 -1
  20. package/dist/tokenize/basic/basic.d.ts +1 -1
  21. package/dist/tokenize/basic/basic.d.ts.map +1 -1
  22. package/dist/tokenize/basic/basic.js +1 -1
  23. package/dist/tokenize/basic/basic.js.map +1 -1
  24. package/dist/tokenize/basic/sentence.cjs +14 -8
  25. package/dist/tokenize/basic/sentence.cjs.map +1 -1
  26. package/dist/tokenize/basic/sentence.d.ts.map +1 -1
  27. package/dist/tokenize/basic/sentence.js +14 -8
  28. package/dist/tokenize/basic/sentence.js.map +1 -1
  29. package/dist/tokenize/tokenizer.test.cjs +220 -0
  30. package/dist/tokenize/tokenizer.test.cjs.map +1 -0
  31. package/dist/tokenize/tokenizer.test.d.ts +2 -0
  32. package/dist/tokenize/tokenizer.test.d.ts.map +1 -0
  33. package/dist/tokenize/tokenizer.test.js +219 -0
  34. package/dist/tokenize/tokenizer.test.js.map +1 -0
  35. package/dist/worker.cjs +2 -1
  36. package/dist/worker.cjs.map +1 -1
  37. package/dist/worker.d.ts.map +1 -1
  38. package/dist/worker.js +2 -1
  39. package/dist/worker.js.map +1 -1
  40. package/package.json +1 -1
  41. package/src/job.ts +3 -2
  42. package/src/pipeline/agent_output.ts +14 -7
  43. package/src/pipeline/pipeline_agent.ts +2 -6
  44. package/src/tokenize/basic/basic.ts +1 -1
  45. package/src/tokenize/basic/sentence.ts +14 -8
  46. package/src/tokenize/tokenizer.test.ts +255 -0
  47. package/src/worker.ts +1 -0
@@ -1 +1 @@
1
- {"version":3,"sources":["../../../src/tokenize/basic/sentence.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\n\n/**\n * Split the text into sentences.\n */\nexport const splitSentences = (text: string, minLength = 20): [string, number, number][] => {\n const alphabets = /([A-Za-z])/g;\n const prefixes = /(Mr|St|Mrs|Ms|Dr)[.]/g;\n const suffixes = /(Inc|Ltd|Jr|Sr|Co)/g;\n const starters =\n /(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\\s|She\\s|It\\s|They\\s|Their\\s|Our\\s|We\\s|But\\s|However\\s|That\\s|This\\s|Wherever)/g;\n const acronyms = /([A-Z][.][A-Z][.](?:[A-Z][.])?)/g;\n const websites = /[.](com|net|org|io|gov|edu|me)/g;\n const digits = /([0-9])/g;\n const dots = /\\.{2,}/g;\n\n text = text.replaceAll('\\n', ' ');\n text = text.replaceAll(prefixes, '$1<prd>');\n text = text.replaceAll(websites, '<prd>$2');\n text = text.replaceAll(new RegExp(`${digits}[.]${digits}`, 'g'), '$1<prd>$2');\n text = text.replaceAll(dots, (match) => '<prd>'.repeat(match.length));\n text = text.replaceAll('Ph.D.', 'Ph<prd>D<prd>');\n text = text.replaceAll(new RegExp(`\\s${alphabets}[.]`, 'g'), '$1<prd>');\n text = text.replaceAll(new RegExp(`${acronyms} ${starters}`, 'g'), '$1<stop> $2');\n text = text.replaceAll(\n new RegExp(`${alphabets}[.]${alphabets}[.]${alphabets}[.]`, 'g'),\n '$1<prd>$2<prd>$3<prd>',\n );\n text = text.replaceAll(new RegExp(`${alphabets}[.]${alphabets}[.]`, 'g'), '$1<prd>$2<prd>');\n text = text.replaceAll(new RegExp(` ${suffixes}[.] ${starters}`, 'g'), '$1<stop> $2');\n text = text.replaceAll(new RegExp(` ${suffixes}[.]`, 'g'), '$1<prd>');\n text = text.replaceAll(new RegExp(` ${alphabets}[.]`, 'g'), '$1<prd>');\n text = text.replaceAll('.”', '”.');\n text = text.replaceAll('.\"', '\".');\n text = text.replaceAll('!\"', '\"!');\n text = text.replaceAll('?\"', '\"?');\n text = text.replaceAll('.', '.<stop>');\n text = text.replaceAll('?', '?<stop>');\n text = text.replaceAll('!', '!<stop>');\n text = text.replaceAll('<prd>', '.');\n\n const split = text.split('<stop>');\n text = text.replaceAll('<stop>', '');\n\n const sentences: [string, number, number][] = [];\n let buf = '';\n let start = 0;\n let end = 0;\n for (const match of split) {\n const sentence = match.trim();\n if (!sentence) continue;\n\n buf += ' ' + sentence;\n end += match.length;\n if (buf.length > minLength) {\n sentences.push([buf.slice(1), start, end]);\n start = end;\n buf = '';\n }\n }\n\n if (buf) {\n sentences.push([buf.slice(1), start, text.length - 1]);\n }\n\n return sentences;\n};\n"],"mappings":";;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAOO,MAAM,iBAAiB,CAAC,MAAc,YAAY,OAAmC;AAC1F,QAAM,YAAY;AAClB,QAAM,WAAW;AACjB,QAAM,WAAW;AACjB,QAAM,WACJ;AACF,QAAM,WAAW;AACjB,QAAM,WAAW;AACjB,QAAM,SAAS;AACf,QAAM,OAAO;AAEb,SAAO,KAAK,WAAW,MAAM,GAAG;AAChC,SAAO,KAAK,WAAW,UAAU,SAAS;AAC1C,SAAO,KAAK,WAAW,UAAU,SAAS;AAC1C,SAAO,KAAK,WAAW,IAAI,OAAO,GAAG,MAAM,MAAM,MAAM,IAAI,GAAG,GAAG,WAAW;AAC5E,SAAO,KAAK,WAAW,MAAM,CAAC,UAAU,QAAQ,OAAO,MAAM,MAAM,CAAC;AACpE,SAAO,KAAK,WAAW,SAAS,eAAe;AAC/C,SAAO,KAAK,WAAW,IAAI,OAAO,IAAK,SAAS,OAAO,GAAG,GAAG,SAAS;AACtE,SAAO,KAAK,WAAW,IAAI,OAAO,GAAG,QAAQ,IAAI,QAAQ,IAAI,GAAG,GAAG,aAAa;AAChF,SAAO,KAAK;AAAA,IACV,IAAI,OAAO,GAAG,SAAS,MAAM,SAAS,MAAM,SAAS,OAAO,GAAG;AAAA,IAC/D;AAAA,EACF;AACA,SAAO,KAAK,WAAW,IAAI,OAAO,GAAG,SAAS,MAAM,SAAS,OAAO,GAAG,GAAG,gBAAgB;AAC1F,SAAO,KAAK,WAAW,IAAI,OAAO,IAAI,QAAQ,OAAO,QAAQ,IAAI,GAAG,GAAG,aAAa;AACpF,SAAO,KAAK,WAAW,IAAI,OAAO,IAAI,QAAQ,OAAO,GAAG,GAAG,SAAS;AACpE,SAAO,KAAK,WAAW,IAAI,OAAO,IAAI,SAAS,OAAO,GAAG,GAAG,SAAS;AACrE,SAAO,KAAK,WAAW,WAAM,SAAI;AACjC,SAAO,KAAK,WAAW,MAAM,IAAI;AACjC,SAAO,KAAK,WAAW,MAAM,IAAI;AACjC,SAAO,KAAK,WAAW,MAAM,IAAI;AACjC,SAAO,KAAK,WAAW,KAAK,SAAS;AACrC,SAAO,KAAK,WAAW,KAAK,SAAS;AACrC,SAAO,KAAK,WAAW,KAAK,SAAS;AACrC,SAAO,KAAK,WAAW,SAAS,GAAG;AAEnC,QAAM,QAAQ,KAAK,MAAM,QAAQ;AACjC,SAAO,KAAK,WAAW,UAAU,EAAE;AAEnC,QAAM,YAAwC,CAAC;AAC/C,MAAI,MAAM;AACV,MAAI,QAAQ;AACZ,MAAI,MAAM;AACV,aAAW,SAAS,OAAO;AACzB,UAAM,WAAW,MAAM,KAAK;AAC5B,QAAI,CAAC,SAAU;AAEf,WAAO,MAAM;AACb,WAAO,MAAM;AACb,QAAI,IAAI,SAAS,WAAW;AAC1B,gBAAU,KAAK,CAAC,IAAI,MAAM,CAAC,GAAG,OAAO,GAAG,CAAC;AACzC,cAAQ;AACR,YAAM;AAAA,IACR;AAAA,EACF;AAEA,MAAI,KAAK;AACP,cAAU,KAAK,CAAC,IAAI,MAAM,CAAC,GAAG,OAAO,KAAK,SAAS,CAAC,CAAC;AAAA,EACvD;AAEA,SAAO;AACT;","names":[]}
1
+ {"version":3,"sources":["../../../src/tokenize/basic/sentence.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\n\n/**\n * Split the text into sentences.\n */\nexport const splitSentences = (text: string, minLength = 20): [string, number, number][] => {\n const alphabets = /([A-Za-z])/g;\n const prefixes = /(Mr|St|Mrs|Ms|Dr)[.]/g;\n const suffixes = /(Inc|Ltd|Jr|Sr|Co)/g;\n const starters =\n /(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\\s|She\\s|It\\s|They\\s|Their\\s|Our\\s|We\\s|But\\s|However\\s|That\\s|This\\s|Wherever)/g;\n const acronyms = /([A-Z][.][A-Z][.](?:[A-Z][.])?)/g;\n const websites = /[.](com|net|org|io|gov|edu|me)/g;\n const digits = /([0-9])/g;\n const dots = /\\.{2,}/g;\n\n text = text.replaceAll('\\n', ' ');\n text = text.replaceAll(prefixes, '$1<prd>');\n text = text.replaceAll(websites, '<prd>$2');\n text = text.replaceAll(new RegExp(`${digits.source}[.]${digits.source}`, 'g'), '$1<prd>$2');\n text = text.replaceAll(dots, (match) => '<prd>'.repeat(match.length));\n text = text.replaceAll('Ph.D.', 'Ph<prd>D<prd>');\n text = text.replaceAll(new RegExp(`\\\\s${alphabets.source}[.] `, 'g'), ' $1<prd> ');\n text = text.replaceAll(new RegExp(`${acronyms.source} ${starters.source}`, 'g'), '$1<stop> $2');\n text = text.replaceAll(\n new RegExp(`${alphabets.source}[.]${alphabets.source}[.]${alphabets.source}[.]`, 'g'),\n '$1<prd>$2<prd>$3<prd>',\n );\n text = text.replaceAll(\n new RegExp(`${alphabets.source}[.]${alphabets.source}[.]`, 'g'),\n '$1<prd>$2<prd>',\n );\n text = text.replaceAll(\n new RegExp(` ${suffixes.source}[.] ${starters.source}`, 'g'),\n '$1<stop> $2',\n );\n text = text.replaceAll(new RegExp(` ${suffixes.source}[.]`, 'g'), '$1<prd>');\n text = text.replaceAll(new RegExp(` ${alphabets.source}[.]`, 'g'), '$1<prd>');\n text = text.replaceAll('.”', '”.');\n text = text.replaceAll('.\"', '\".');\n text = text.replaceAll('!\"', '\"!');\n text = text.replaceAll('?\"', '\"?');\n text = text.replaceAll('.', '.<stop>');\n text = text.replaceAll('?', '?<stop>');\n text = text.replaceAll('!', '!<stop>');\n text = text.replaceAll('<prd>', '.');\n\n const split = text.split('<stop>');\n text = text.replaceAll('<stop>', '');\n\n const sentences: [string, number, number][] = [];\n let buf = '';\n let start = 0;\n let end = 0;\n for (const match of split) {\n const sentence = match.trim();\n if (!sentence) continue;\n\n buf += ' ' + sentence;\n end += match.length;\n if (buf.length > minLength) {\n sentences.push([buf.slice(1), start, end]);\n start = end;\n buf = '';\n }\n }\n\n if (buf) {\n sentences.push([buf.slice(1), start, text.length - 1]);\n }\n\n return sentences;\n};\n"],"mappings":";;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAOO,MAAM,iBAAiB,CAAC,MAAc,YAAY,OAAmC;AAC1F,QAAM,YAAY;AAClB,QAAM,WAAW;AACjB,QAAM,WAAW;AACjB,QAAM,WACJ;AACF,QAAM,WAAW;AACjB,QAAM,WAAW;AACjB,QAAM,SAAS;AACf,QAAM,OAAO;AAEb,SAAO,KAAK,WAAW,MAAM,GAAG;AAChC,SAAO,KAAK,WAAW,UAAU,SAAS;AAC1C,SAAO,KAAK,WAAW,UAAU,SAAS;AAC1C,SAAO,KAAK,WAAW,IAAI,OAAO,GAAG,OAAO,MAAM,MAAM,OAAO,MAAM,IAAI,GAAG,GAAG,WAAW;AAC1F,SAAO,KAAK,WAAW,MAAM,CAAC,UAAU,QAAQ,OAAO,MAAM,MAAM,CAAC;AACpE,SAAO,KAAK,WAAW,SAAS,eAAe;AAC/C,SAAO,KAAK,WAAW,IAAI,OAAO,MAAM,UAAU,MAAM,QAAQ,GAAG,GAAG,WAAW;AACjF,SAAO,KAAK,WAAW,IAAI,OAAO,GAAG,SAAS,MAAM,IAAI,SAAS,MAAM,IAAI,GAAG,GAAG,aAAa;AAC9F,SAAO,KAAK;AAAA,IACV,IAAI,OAAO,GAAG,UAAU,MAAM,MAAM,UAAU,MAAM,MAAM,UAAU,MAAM,OAAO,GAAG;AAAA,IACpF;AAAA,EACF;AACA,SAAO,KAAK;AAAA,IACV,IAAI,OAAO,GAAG,UAAU,MAAM,MAAM,UAAU,MAAM,OAAO,GAAG;AAAA,IAC9D;AAAA,EACF;AACA,SAAO,KAAK;AAAA,IACV,IAAI,OAAO,IAAI,SAAS,MAAM,OAAO,SAAS,MAAM,IAAI,GAAG;AAAA,IAC3D;AAAA,EACF;AACA,SAAO,KAAK,WAAW,IAAI,OAAO,IAAI,SAAS,MAAM,OAAO,GAAG,GAAG,SAAS;AAC3E,SAAO,KAAK,WAAW,IAAI,OAAO,IAAI,UAAU,MAAM,OAAO,GAAG,GAAG,SAAS;AAC5E,SAAO,KAAK,WAAW,WAAM,SAAI;AACjC,SAAO,KAAK,WAAW,MAAM,IAAI;AACjC,SAAO,KAAK,WAAW,MAAM,IAAI;AACjC,SAAO,KAAK,WAAW,MAAM,IAAI;AACjC,SAAO,KAAK,WAAW,KAAK,SAAS;AACrC,SAAO,KAAK,WAAW,KAAK,SAAS;AACrC,SAAO,KAAK,WAAW,KAAK,SAAS;AACrC,SAAO,KAAK,WAAW,SAAS,GAAG;AAEnC,QAAM,QAAQ,KAAK,MAAM,QAAQ;AACjC,SAAO,KAAK,WAAW,UAAU,EAAE;AAEnC,QAAM,YAAwC,CAAC;AAC/C,MAAI,MAAM;AACV,MAAI,QAAQ;AACZ,MAAI,MAAM;AACV,aAAW,SAAS,OAAO;AACzB,UAAM,WAAW,MAAM,KAAK;AAC5B,QAAI,CAAC,SAAU;AAEf,WAAO,MAAM;AACb,WAAO,MAAM;AACb,QAAI,IAAI,SAAS,WAAW;AAC1B,gBAAU,KAAK,CAAC,IAAI,MAAM,CAAC,GAAG,OAAO,GAAG,CAAC;AACzC,cAAQ;AACR,YAAM;AAAA,IACR;AAAA,EACF;AAEA,MAAI,KAAK;AACP,cAAU,KAAK,CAAC,IAAI,MAAM,CAAC,GAAG,OAAO,KAAK,SAAS,CAAC,CAAC;AAAA,EACvD;AAEA,SAAO;AACT;","names":[]}
@@ -1 +1 @@
1
- {"version":3,"file":"sentence.d.ts","sourceRoot":"","sources":["../../../src/tokenize/basic/sentence.ts"],"names":[],"mappings":"AAIA;;GAEG;AACH,eAAO,MAAM,cAAc,SAAU,MAAM,yBAAmB,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,CAAC,EA6DrF,CAAC"}
1
+ {"version":3,"file":"sentence.d.ts","sourceRoot":"","sources":["../../../src/tokenize/basic/sentence.ts"],"names":[],"mappings":"AAIA;;GAEG;AACH,eAAO,MAAM,cAAc,SAAU,MAAM,yBAAmB,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,CAAC,EAmErF,CAAC"}
@@ -10,19 +10,25 @@ const splitSentences = (text, minLength = 20) => {
10
10
  text = text.replaceAll("\n", " ");
11
11
  text = text.replaceAll(prefixes, "$1<prd>");
12
12
  text = text.replaceAll(websites, "<prd>$2");
13
- text = text.replaceAll(new RegExp(`${digits}[.]${digits}`, "g"), "$1<prd>$2");
13
+ text = text.replaceAll(new RegExp(`${digits.source}[.]${digits.source}`, "g"), "$1<prd>$2");
14
14
  text = text.replaceAll(dots, (match) => "<prd>".repeat(match.length));
15
15
  text = text.replaceAll("Ph.D.", "Ph<prd>D<prd>");
16
- text = text.replaceAll(new RegExp(`s${alphabets}[.]`, "g"), "$1<prd>");
17
- text = text.replaceAll(new RegExp(`${acronyms} ${starters}`, "g"), "$1<stop> $2");
16
+ text = text.replaceAll(new RegExp(`\\s${alphabets.source}[.] `, "g"), " $1<prd> ");
17
+ text = text.replaceAll(new RegExp(`${acronyms.source} ${starters.source}`, "g"), "$1<stop> $2");
18
18
  text = text.replaceAll(
19
- new RegExp(`${alphabets}[.]${alphabets}[.]${alphabets}[.]`, "g"),
19
+ new RegExp(`${alphabets.source}[.]${alphabets.source}[.]${alphabets.source}[.]`, "g"),
20
20
  "$1<prd>$2<prd>$3<prd>"
21
21
  );
22
- text = text.replaceAll(new RegExp(`${alphabets}[.]${alphabets}[.]`, "g"), "$1<prd>$2<prd>");
23
- text = text.replaceAll(new RegExp(` ${suffixes}[.] ${starters}`, "g"), "$1<stop> $2");
24
- text = text.replaceAll(new RegExp(` ${suffixes}[.]`, "g"), "$1<prd>");
25
- text = text.replaceAll(new RegExp(` ${alphabets}[.]`, "g"), "$1<prd>");
22
+ text = text.replaceAll(
23
+ new RegExp(`${alphabets.source}[.]${alphabets.source}[.]`, "g"),
24
+ "$1<prd>$2<prd>"
25
+ );
26
+ text = text.replaceAll(
27
+ new RegExp(` ${suffixes.source}[.] ${starters.source}`, "g"),
28
+ "$1<stop> $2"
29
+ );
30
+ text = text.replaceAll(new RegExp(` ${suffixes.source}[.]`, "g"), "$1<prd>");
31
+ text = text.replaceAll(new RegExp(` ${alphabets.source}[.]`, "g"), "$1<prd>");
26
32
  text = text.replaceAll(".\u201D", "\u201D.");
27
33
  text = text.replaceAll('."', '".');
28
34
  text = text.replaceAll('!"', '"!');
@@ -1 +1 @@
1
- {"version":3,"sources":["../../../src/tokenize/basic/sentence.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\n\n/**\n * Split the text into sentences.\n */\nexport const splitSentences = (text: string, minLength = 20): [string, number, number][] => {\n const alphabets = /([A-Za-z])/g;\n const prefixes = /(Mr|St|Mrs|Ms|Dr)[.]/g;\n const suffixes = /(Inc|Ltd|Jr|Sr|Co)/g;\n const starters =\n /(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\\s|She\\s|It\\s|They\\s|Their\\s|Our\\s|We\\s|But\\s|However\\s|That\\s|This\\s|Wherever)/g;\n const acronyms = /([A-Z][.][A-Z][.](?:[A-Z][.])?)/g;\n const websites = /[.](com|net|org|io|gov|edu|me)/g;\n const digits = /([0-9])/g;\n const dots = /\\.{2,}/g;\n\n text = text.replaceAll('\\n', ' ');\n text = text.replaceAll(prefixes, '$1<prd>');\n text = text.replaceAll(websites, '<prd>$2');\n text = text.replaceAll(new RegExp(`${digits}[.]${digits}`, 'g'), '$1<prd>$2');\n text = text.replaceAll(dots, (match) => '<prd>'.repeat(match.length));\n text = text.replaceAll('Ph.D.', 'Ph<prd>D<prd>');\n text = text.replaceAll(new RegExp(`\\s${alphabets}[.]`, 'g'), '$1<prd>');\n text = text.replaceAll(new RegExp(`${acronyms} ${starters}`, 'g'), '$1<stop> $2');\n text = text.replaceAll(\n new RegExp(`${alphabets}[.]${alphabets}[.]${alphabets}[.]`, 'g'),\n '$1<prd>$2<prd>$3<prd>',\n );\n text = text.replaceAll(new RegExp(`${alphabets}[.]${alphabets}[.]`, 'g'), '$1<prd>$2<prd>');\n text = text.replaceAll(new RegExp(` ${suffixes}[.] ${starters}`, 'g'), '$1<stop> $2');\n text = text.replaceAll(new RegExp(` ${suffixes}[.]`, 'g'), '$1<prd>');\n text = text.replaceAll(new RegExp(` ${alphabets}[.]`, 'g'), '$1<prd>');\n text = text.replaceAll('.”', '”.');\n text = text.replaceAll('.\"', '\".');\n text = text.replaceAll('!\"', '\"!');\n text = text.replaceAll('?\"', '\"?');\n text = text.replaceAll('.', '.<stop>');\n text = text.replaceAll('?', '?<stop>');\n text = text.replaceAll('!', '!<stop>');\n text = text.replaceAll('<prd>', '.');\n\n const split = text.split('<stop>');\n text = text.replaceAll('<stop>', '');\n\n const sentences: [string, number, number][] = [];\n let buf = '';\n let start = 0;\n let end = 0;\n for (const match of split) {\n const sentence = match.trim();\n if (!sentence) continue;\n\n buf += ' ' + sentence;\n end += match.length;\n if (buf.length > minLength) {\n sentences.push([buf.slice(1), start, end]);\n start = end;\n buf = '';\n }\n }\n\n if (buf) {\n sentences.push([buf.slice(1), start, text.length - 1]);\n }\n\n return sentences;\n};\n"],"mappings":"AAOO,MAAM,iBAAiB,CAAC,MAAc,YAAY,OAAmC;AAC1F,QAAM,YAAY;AAClB,QAAM,WAAW;AACjB,QAAM,WAAW;AACjB,QAAM,WACJ;AACF,QAAM,WAAW;AACjB,QAAM,WAAW;AACjB,QAAM,SAAS;AACf,QAAM,OAAO;AAEb,SAAO,KAAK,WAAW,MAAM,GAAG;AAChC,SAAO,KAAK,WAAW,UAAU,SAAS;AAC1C,SAAO,KAAK,WAAW,UAAU,SAAS;AAC1C,SAAO,KAAK,WAAW,IAAI,OAAO,GAAG,MAAM,MAAM,MAAM,IAAI,GAAG,GAAG,WAAW;AAC5E,SAAO,KAAK,WAAW,MAAM,CAAC,UAAU,QAAQ,OAAO,MAAM,MAAM,CAAC;AACpE,SAAO,KAAK,WAAW,SAAS,eAAe;AAC/C,SAAO,KAAK,WAAW,IAAI,OAAO,IAAK,SAAS,OAAO,GAAG,GAAG,SAAS;AACtE,SAAO,KAAK,WAAW,IAAI,OAAO,GAAG,QAAQ,IAAI,QAAQ,IAAI,GAAG,GAAG,aAAa;AAChF,SAAO,KAAK;AAAA,IACV,IAAI,OAAO,GAAG,SAAS,MAAM,SAAS,MAAM,SAAS,OAAO,GAAG;AAAA,IAC/D;AAAA,EACF;AACA,SAAO,KAAK,WAAW,IAAI,OAAO,GAAG,SAAS,MAAM,SAAS,OAAO,GAAG,GAAG,gBAAgB;AAC1F,SAAO,KAAK,WAAW,IAAI,OAAO,IAAI,QAAQ,OAAO,QAAQ,IAAI,GAAG,GAAG,aAAa;AACpF,SAAO,KAAK,WAAW,IAAI,OAAO,IAAI,QAAQ,OAAO,GAAG,GAAG,SAAS;AACpE,SAAO,KAAK,WAAW,IAAI,OAAO,IAAI,SAAS,OAAO,GAAG,GAAG,SAAS;AACrE,SAAO,KAAK,WAAW,WAAM,SAAI;AACjC,SAAO,KAAK,WAAW,MAAM,IAAI;AACjC,SAAO,KAAK,WAAW,MAAM,IAAI;AACjC,SAAO,KAAK,WAAW,MAAM,IAAI;AACjC,SAAO,KAAK,WAAW,KAAK,SAAS;AACrC,SAAO,KAAK,WAAW,KAAK,SAAS;AACrC,SAAO,KAAK,WAAW,KAAK,SAAS;AACrC,SAAO,KAAK,WAAW,SAAS,GAAG;AAEnC,QAAM,QAAQ,KAAK,MAAM,QAAQ;AACjC,SAAO,KAAK,WAAW,UAAU,EAAE;AAEnC,QAAM,YAAwC,CAAC;AAC/C,MAAI,MAAM;AACV,MAAI,QAAQ;AACZ,MAAI,MAAM;AACV,aAAW,SAAS,OAAO;AACzB,UAAM,WAAW,MAAM,KAAK;AAC5B,QAAI,CAAC,SAAU;AAEf,WAAO,MAAM;AACb,WAAO,MAAM;AACb,QAAI,IAAI,SAAS,WAAW;AAC1B,gBAAU,KAAK,CAAC,IAAI,MAAM,CAAC,GAAG,OAAO,GAAG,CAAC;AACzC,cAAQ;AACR,YAAM;AAAA,IACR;AAAA,EACF;AAEA,MAAI,KAAK;AACP,cAAU,KAAK,CAAC,IAAI,MAAM,CAAC,GAAG,OAAO,KAAK,SAAS,CAAC,CAAC;AAAA,EACvD;AAEA,SAAO;AACT;","names":[]}
1
+ {"version":3,"sources":["../../../src/tokenize/basic/sentence.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\n\n/**\n * Split the text into sentences.\n */\nexport const splitSentences = (text: string, minLength = 20): [string, number, number][] => {\n const alphabets = /([A-Za-z])/g;\n const prefixes = /(Mr|St|Mrs|Ms|Dr)[.]/g;\n const suffixes = /(Inc|Ltd|Jr|Sr|Co)/g;\n const starters =\n /(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\\s|She\\s|It\\s|They\\s|Their\\s|Our\\s|We\\s|But\\s|However\\s|That\\s|This\\s|Wherever)/g;\n const acronyms = /([A-Z][.][A-Z][.](?:[A-Z][.])?)/g;\n const websites = /[.](com|net|org|io|gov|edu|me)/g;\n const digits = /([0-9])/g;\n const dots = /\\.{2,}/g;\n\n text = text.replaceAll('\\n', ' ');\n text = text.replaceAll(prefixes, '$1<prd>');\n text = text.replaceAll(websites, '<prd>$2');\n text = text.replaceAll(new RegExp(`${digits.source}[.]${digits.source}`, 'g'), '$1<prd>$2');\n text = text.replaceAll(dots, (match) => '<prd>'.repeat(match.length));\n text = text.replaceAll('Ph.D.', 'Ph<prd>D<prd>');\n text = text.replaceAll(new RegExp(`\\\\s${alphabets.source}[.] `, 'g'), ' $1<prd> ');\n text = text.replaceAll(new RegExp(`${acronyms.source} ${starters.source}`, 'g'), '$1<stop> $2');\n text = text.replaceAll(\n new RegExp(`${alphabets.source}[.]${alphabets.source}[.]${alphabets.source}[.]`, 'g'),\n '$1<prd>$2<prd>$3<prd>',\n );\n text = text.replaceAll(\n new RegExp(`${alphabets.source}[.]${alphabets.source}[.]`, 'g'),\n '$1<prd>$2<prd>',\n );\n text = text.replaceAll(\n new RegExp(` ${suffixes.source}[.] ${starters.source}`, 'g'),\n '$1<stop> $2',\n );\n text = text.replaceAll(new RegExp(` ${suffixes.source}[.]`, 'g'), '$1<prd>');\n text = text.replaceAll(new RegExp(` ${alphabets.source}[.]`, 'g'), '$1<prd>');\n text = text.replaceAll('.”', '”.');\n text = text.replaceAll('.\"', '\".');\n text = text.replaceAll('!\"', '\"!');\n text = text.replaceAll('?\"', '\"?');\n text = text.replaceAll('.', '.<stop>');\n text = text.replaceAll('?', '?<stop>');\n text = text.replaceAll('!', '!<stop>');\n text = text.replaceAll('<prd>', '.');\n\n const split = text.split('<stop>');\n text = text.replaceAll('<stop>', '');\n\n const sentences: [string, number, number][] = [];\n let buf = '';\n let start = 0;\n let end = 0;\n for (const match of split) {\n const sentence = match.trim();\n if (!sentence) continue;\n\n buf += ' ' + sentence;\n end += match.length;\n if (buf.length > minLength) {\n sentences.push([buf.slice(1), start, end]);\n start = end;\n buf = '';\n }\n }\n\n if (buf) {\n sentences.push([buf.slice(1), start, text.length - 1]);\n }\n\n return sentences;\n};\n"],"mappings":"AAOO,MAAM,iBAAiB,CAAC,MAAc,YAAY,OAAmC;AAC1F,QAAM,YAAY;AAClB,QAAM,WAAW;AACjB,QAAM,WAAW;AACjB,QAAM,WACJ;AACF,QAAM,WAAW;AACjB,QAAM,WAAW;AACjB,QAAM,SAAS;AACf,QAAM,OAAO;AAEb,SAAO,KAAK,WAAW,MAAM,GAAG;AAChC,SAAO,KAAK,WAAW,UAAU,SAAS;AAC1C,SAAO,KAAK,WAAW,UAAU,SAAS;AAC1C,SAAO,KAAK,WAAW,IAAI,OAAO,GAAG,OAAO,MAAM,MAAM,OAAO,MAAM,IAAI,GAAG,GAAG,WAAW;AAC1F,SAAO,KAAK,WAAW,MAAM,CAAC,UAAU,QAAQ,OAAO,MAAM,MAAM,CAAC;AACpE,SAAO,KAAK,WAAW,SAAS,eAAe;AAC/C,SAAO,KAAK,WAAW,IAAI,OAAO,MAAM,UAAU,MAAM,QAAQ,GAAG,GAAG,WAAW;AACjF,SAAO,KAAK,WAAW,IAAI,OAAO,GAAG,SAAS,MAAM,IAAI,SAAS,MAAM,IAAI,GAAG,GAAG,aAAa;AAC9F,SAAO,KAAK;AAAA,IACV,IAAI,OAAO,GAAG,UAAU,MAAM,MAAM,UAAU,MAAM,MAAM,UAAU,MAAM,OAAO,GAAG;AAAA,IACpF;AAAA,EACF;AACA,SAAO,KAAK;AAAA,IACV,IAAI,OAAO,GAAG,UAAU,MAAM,MAAM,UAAU,MAAM,OAAO,GAAG;AAAA,IAC9D;AAAA,EACF;AACA,SAAO,KAAK;AAAA,IACV,IAAI,OAAO,IAAI,SAAS,MAAM,OAAO,SAAS,MAAM,IAAI,GAAG;AAAA,IAC3D;AAAA,EACF;AACA,SAAO,KAAK,WAAW,IAAI,OAAO,IAAI,SAAS,MAAM,OAAO,GAAG,GAAG,SAAS;AAC3E,SAAO,KAAK,WAAW,IAAI,OAAO,IAAI,UAAU,MAAM,OAAO,GAAG,GAAG,SAAS;AAC5E,SAAO,KAAK,WAAW,WAAM,SAAI;AACjC,SAAO,KAAK,WAAW,MAAM,IAAI;AACjC,SAAO,KAAK,WAAW,MAAM,IAAI;AACjC,SAAO,KAAK,WAAW,MAAM,IAAI;AACjC,SAAO,KAAK,WAAW,KAAK,SAAS;AACrC,SAAO,KAAK,WAAW,KAAK,SAAS;AACrC,SAAO,KAAK,WAAW,KAAK,SAAS;AACrC,SAAO,KAAK,WAAW,SAAS,GAAG;AAEnC,QAAM,QAAQ,KAAK,MAAM,QAAQ;AACjC,SAAO,KAAK,WAAW,UAAU,EAAE;AAEnC,QAAM,YAAwC,CAAC;AAC/C,MAAI,MAAM;AACV,MAAI,QAAQ;AACZ,MAAI,MAAM;AACV,aAAW,SAAS,OAAO;AACzB,UAAM,WAAW,MAAM,KAAK;AAC5B,QAAI,CAAC,SAAU;AAEf,WAAO,MAAM;AACb,WAAO,MAAM;AACb,QAAI,IAAI,SAAS,WAAW;AAC1B,gBAAU,KAAK,CAAC,IAAI,MAAM,CAAC,GAAG,OAAO,GAAG,CAAC;AACzC,cAAQ;AACR,YAAM;AAAA,IACR;AAAA,EACF;AAEA,MAAI,KAAK;AACP,cAAU,KAAK,CAAC,IAAI,MAAM,CAAC,GAAG,OAAO,KAAK,SAAS,CAAC,CAAC;AAAA,EACvD;AAEA,SAAO;AACT;","names":[]}
@@ -0,0 +1,220 @@
1
+ "use strict";
2
+ var import_vitest = require("vitest");
3
+ var import_basic = require("./basic/index.cjs");
4
+ var import_paragraph = require("./basic/paragraph.cjs");
5
+ const TEXT = "Hi! LiveKit is a platform for live audio and video applications and services. R.T.C stands for Real-Time Communication... again R.T.C. Mr. Theo is testing the sentence tokenizer. This is a test. Another test. A short sentence. A longer sentence that is longer than the previous sentence. f(x) = x * 2.54 + 42. Hey! Hi! Hello! ";
6
+ const EXPECTED_MIN_20 = [
7
+ "Hi! LiveKit is a platform for live audio and video applications and services.",
8
+ "R.T.C stands for Real-Time Communication... again R.T.C.",
9
+ "Mr. Theo is testing the sentence tokenizer.",
10
+ "This is a test. Another test.",
11
+ "A short sentence. A longer sentence that is longer than the previous sentence.",
12
+ "f(x) = x * 2.54 + 42.",
13
+ "Hey! Hi! Hello!"
14
+ ];
15
+ const WORDS_TEXT = "This is a test. Blabla another test! multiple consecutive spaces: done";
16
+ const WORDS_EXPECTED = [
17
+ "This",
18
+ "is",
19
+ "a",
20
+ "test",
21
+ "Blabla",
22
+ "another",
23
+ "test",
24
+ "multiple",
25
+ "consecutive",
26
+ "spaces",
27
+ "done"
28
+ ];
29
+ const WORDS_PUNCT_TEXT = 'This is <phoneme alphabet="cmu-arpabet" ph="AE K CH UW AH L IY">actually</phoneme> tricky to handle.';
30
+ const WORDS_PUNCT_EXPECTED = [
31
+ "This",
32
+ "is",
33
+ "<phoneme",
34
+ 'alphabet="cmu-arpabet"',
35
+ 'ph="AE',
36
+ "K",
37
+ "CH",
38
+ "UW",
39
+ "AH",
40
+ "L",
41
+ 'IY">actually</phoneme>',
42
+ "tricky",
43
+ "to",
44
+ "handle."
45
+ ];
46
+ const HYPHENATOR_TEXT = ["Segment", "expected", "communication", "window", "welcome", "bedroom"];
47
+ const HYPHENATOR_EXPECTED = [
48
+ ["Seg", "ment"],
49
+ ["ex", "pect", "ed"],
50
+ ["com", "mu", "ni", "ca", "tion"],
51
+ ["win", "dow"],
52
+ ["wel", "come"],
53
+ ["bed", "room"]
54
+ ];
55
+ const PARAGRAPH_TEST_CASES = [
56
+ ["Single paragraph.", [["Single paragraph.", 0, 17]]],
57
+ [
58
+ "Paragraph 1.\n\nParagraph 2.",
59
+ [
60
+ ["Paragraph 1.", 0, 12],
61
+ ["Paragraph 2.", 14, 26]
62
+ ]
63
+ ],
64
+ [
65
+ "Para 1.\n\nPara 2.\n\nPara 3.",
66
+ [
67
+ ["Para 1.", 0, 7],
68
+ ["Para 2.", 9, 16],
69
+ ["Para 3.", 18, 25]
70
+ ]
71
+ ],
72
+ ["\n\nParagraph with leading newlines.", [["Paragraph with leading newlines.", 2, 34]]],
73
+ ["Paragraph with trailing newlines.\n\n", [["Paragraph with trailing newlines.", 0, 33]]],
74
+ [
75
+ "\n\n Paragraph with leading and trailing spaces. \n\n",
76
+ [["Paragraph with leading and trailing spaces.", 4, 47]]
77
+ ],
78
+ [
79
+ "Para 1.\n\n\n\nPara 2.",
80
+ // Multiple newlines between paragraphs
81
+ [
82
+ ["Para 1.", 0, 7],
83
+ ["Para 2.", 11, 18]
84
+ ]
85
+ ],
86
+ [
87
+ "Para 1.\n \n \nPara 2.",
88
+ // Newlines with spaces between paragraphs
89
+ [
90
+ ["Para 1.", 0, 7],
91
+ ["Para 2.", 12, 19]
92
+ ]
93
+ ],
94
+ [
95
+ "",
96
+ // Empty string
97
+ []
98
+ ],
99
+ [
100
+ "\n\n\n",
101
+ // Only newlines
102
+ []
103
+ ],
104
+ [
105
+ "Line 1\nLine 2\nLine 3",
106
+ // Single paragraph with newlines
107
+ [["Line 1\nLine 2\nLine 3", 0, 20]]
108
+ ]
109
+ ];
110
+ (0, import_vitest.describe)("tokenizer", () => {
111
+ (0, import_vitest.describe)("SentenceTokenizer", () => {
112
+ const tokenizer = new import_basic.SentenceTokenizer();
113
+ (0, import_vitest.it)("should tokenize sentences correctly", () => {
114
+ (0, import_vitest.expect)(tokenizer.tokenize(TEXT).every((x, i) => EXPECTED_MIN_20[i] === x)).toBeTruthy();
115
+ });
116
+ (0, import_vitest.it)("should stream tokenize sentences correctly", async () => {
117
+ const pattern = [1, 2, 4];
118
+ let text = TEXT;
119
+ const chunks = [];
120
+ const patternIter = Array(Math.ceil(text.length / pattern.reduce((sum, num) => sum + num, 0))).fill(pattern).flat()[Symbol.iterator]();
121
+ for (const size of patternIter) {
122
+ if (!text) break;
123
+ chunks.push(text.slice(void 0, size));
124
+ text = text.slice(size);
125
+ }
126
+ const stream = tokenizer.stream();
127
+ for (const chunk of chunks) {
128
+ stream.pushText(chunk);
129
+ }
130
+ stream.endInput();
131
+ stream.close();
132
+ for (const x of EXPECTED_MIN_20) {
133
+ await stream.next().then((value) => {
134
+ if (value.value) {
135
+ (0, import_vitest.expect)(value.value.token).toStrictEqual(x);
136
+ }
137
+ });
138
+ }
139
+ });
140
+ });
141
+ (0, import_vitest.describe)("WordTokenizer", () => {
142
+ const tokenizer = new import_basic.WordTokenizer();
143
+ (0, import_vitest.it)("should tokenize words correctly", () => {
144
+ (0, import_vitest.expect)(tokenizer.tokenize(WORDS_TEXT).every((x, i) => WORDS_EXPECTED[i] === x)).toBeTruthy();
145
+ });
146
+ (0, import_vitest.it)("should stream tokenize words correctly", async () => {
147
+ const pattern = [1, 2, 4];
148
+ let text = WORDS_TEXT;
149
+ const chunks = [];
150
+ const patternIter = Array(Math.ceil(text.length / pattern.reduce((sum, num) => sum + num, 0))).fill(pattern).flat()[Symbol.iterator]();
151
+ for (const size of patternIter) {
152
+ if (!text) break;
153
+ chunks.push(text.slice(void 0, size));
154
+ text = text.slice(size);
155
+ }
156
+ const stream = tokenizer.stream();
157
+ for (const chunk of chunks) {
158
+ stream.pushText(chunk);
159
+ }
160
+ stream.endInput();
161
+ stream.close();
162
+ for (const x of WORDS_EXPECTED) {
163
+ await stream.next().then((value) => {
164
+ if (value.value) {
165
+ (0, import_vitest.expect)(value.value.token).toStrictEqual(x);
166
+ }
167
+ });
168
+ }
169
+ });
170
+ (0, import_vitest.describe)("punctuation handling", () => {
171
+ const tokenizerPunct = new import_basic.WordTokenizer(false);
172
+ (0, import_vitest.it)("should tokenize words correctly", () => {
173
+ (0, import_vitest.expect)(
174
+ tokenizerPunct.tokenize(WORDS_PUNCT_TEXT).every((x, i) => WORDS_PUNCT_EXPECTED[i] === x)
175
+ ).toBeTruthy();
176
+ });
177
+ (0, import_vitest.it)("should stream tokenize words correctly", async () => {
178
+ const pattern = [1, 2, 4];
179
+ let text = WORDS_PUNCT_TEXT;
180
+ const chunks = [];
181
+ const patternIter = Array(
182
+ Math.ceil(text.length / pattern.reduce((sum, num) => sum + num, 0))
183
+ ).fill(pattern).flat()[Symbol.iterator]();
184
+ for (const size of patternIter) {
185
+ if (!text) break;
186
+ chunks.push(text.slice(void 0, size));
187
+ text = text.slice(size);
188
+ }
189
+ const stream = tokenizerPunct.stream();
190
+ for (const chunk of chunks) {
191
+ stream.pushText(chunk);
192
+ }
193
+ stream.endInput();
194
+ stream.close();
195
+ for (const x of WORDS_PUNCT_EXPECTED) {
196
+ await stream.next().then((value) => {
197
+ if (value.value) {
198
+ (0, import_vitest.expect)(value.value.token).toStrictEqual(x);
199
+ }
200
+ });
201
+ }
202
+ });
203
+ });
204
+ });
205
+ (0, import_vitest.describe)("hyphenateWord", () => {
206
+ (0, import_vitest.it)("should hyphenate correctly", () => {
207
+ HYPHENATOR_TEXT.forEach((x, i) => {
208
+ (0, import_vitest.expect)((0, import_basic.hyphenateWord)(x)).toStrictEqual(HYPHENATOR_EXPECTED[i]);
209
+ });
210
+ });
211
+ });
212
+ (0, import_vitest.describe)("splitParagraphs", () => {
213
+ (0, import_vitest.it)("should tokenize paragraphs correctly", () => {
214
+ PARAGRAPH_TEST_CASES.forEach(([a, b]) => {
215
+ (0, import_vitest.expect)((0, import_paragraph.splitParagraphs)(a)).toStrictEqual(b);
216
+ });
217
+ });
218
+ });
219
+ });
220
+ //# sourceMappingURL=tokenizer.test.cjs.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../../src/tokenize/tokenizer.test.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport { describe, expect, it } from 'vitest';\nimport { SentenceTokenizer, WordTokenizer, hyphenateWord } from './basic/index.js';\nimport { splitParagraphs } from './basic/paragraph.js';\n\nconst TEXT =\n 'Hi! ' +\n 'LiveKit is a platform for live audio and video applications and services. ' +\n 'R.T.C stands for Real-Time Communication... again R.T.C. ' +\n 'Mr. Theo is testing the sentence tokenizer. ' +\n 'This is a test. Another test. ' +\n 'A short sentence. ' +\n 'A longer sentence that is longer than the previous sentence. ' +\n 'f(x) = x * 2.54 + 42. ' +\n 'Hey! Hi! Hello! ';\n\nconst EXPECTED_MIN_20 = [\n 'Hi! LiveKit is a platform for live audio and video applications and services.',\n 'R.T.C stands for Real-Time Communication... again R.T.C.',\n 'Mr. Theo is testing the sentence tokenizer.',\n 'This is a test. Another test.',\n 'A short sentence. A longer sentence that is longer than the previous sentence.',\n 'f(x) = x * 2.54 + 42.',\n 'Hey! Hi! Hello!',\n];\n\nconst WORDS_TEXT = 'This is a test. Blabla another test! multiple consecutive spaces: done';\nconst WORDS_EXPECTED = [\n 'This',\n 'is',\n 'a',\n 'test',\n 'Blabla',\n 'another',\n 'test',\n 'multiple',\n 'consecutive',\n 'spaces',\n 'done',\n];\n\nconst WORDS_PUNCT_TEXT =\n 'This is <phoneme alphabet=\"cmu-arpabet\" ph=\"AE K CH UW AH L IY\">actually</phoneme> tricky to handle.';\nconst WORDS_PUNCT_EXPECTED = [\n 'This',\n 'is',\n '<phoneme',\n 'alphabet=\"cmu-arpabet\"',\n 'ph=\"AE',\n 'K',\n 'CH',\n 'UW',\n 'AH',\n 'L',\n 'IY\">actually</phoneme>',\n 'tricky',\n 'to',\n 'handle.',\n];\n\nconst HYPHENATOR_TEXT = ['Segment', 'expected', 'communication', 'window', 'welcome', 'bedroom'];\nconst HYPHENATOR_EXPECTED = [\n ['Seg', 'ment'],\n ['ex', 'pect', 'ed'],\n ['com', 'mu', 'ni', 'ca', 'tion'],\n ['win', 'dow'],\n ['wel', 'come'],\n ['bed', 'room'],\n];\n\nconst PARAGRAPH_TEST_CASES: [string, [string, number, number][]][] = [\n ['Single paragraph.', [['Single paragraph.', 0, 17]]],\n [\n 'Paragraph 1.\\n\\nParagraph 2.',\n [\n ['Paragraph 1.', 0, 12],\n ['Paragraph 2.', 14, 26],\n ],\n ],\n [\n 'Para 1.\\n\\nPara 2.\\n\\nPara 3.',\n [\n ['Para 1.', 0, 7],\n ['Para 2.', 9, 16],\n ['Para 3.', 18, 25],\n ],\n ],\n ['\\n\\nParagraph with leading newlines.', [['Paragraph with leading newlines.', 2, 34]]],\n ['Paragraph with trailing newlines.\\n\\n', [['Paragraph with trailing newlines.', 0, 33]]],\n [\n '\\n\\n Paragraph with leading and trailing spaces. \\n\\n',\n [['Paragraph with leading and trailing spaces.', 4, 47]],\n ],\n [\n 'Para 1.\\n\\n\\n\\nPara 2.', // Multiple newlines between paragraphs\n [\n ['Para 1.', 0, 7],\n ['Para 2.', 11, 18],\n ],\n ],\n [\n 'Para 1.\\n \\n \\nPara 2.', // Newlines with spaces between paragraphs\n [\n ['Para 1.', 0, 7],\n ['Para 2.', 12, 19],\n ],\n ],\n [\n '', // Empty string\n [],\n ],\n [\n '\\n\\n\\n', // Only newlines\n [],\n ],\n [\n 'Line 1\\nLine 2\\nLine 3', // Single paragraph with newlines\n [['Line 1\\nLine 2\\nLine 3', 0, 20]],\n ],\n];\n\ndescribe('tokenizer', () => {\n describe('SentenceTokenizer', () => {\n const tokenizer = new SentenceTokenizer();\n\n it('should tokenize sentences correctly', () => {\n expect(tokenizer.tokenize(TEXT).every((x, i) => EXPECTED_MIN_20[i] === x)).toBeTruthy();\n });\n\n it('should stream tokenize sentences correctly', async () => {\n const pattern = [1, 2, 4];\n let text = TEXT;\n const chunks = [];\n const patternIter = Array(Math.ceil(text.length / pattern.reduce((sum, num) => sum + num, 0)))\n .fill(pattern)\n .flat()\n [Symbol.iterator]();\n\n for (const size of patternIter) {\n if (!text) break;\n chunks.push(text.slice(undefined, size));\n text = text.slice(size);\n }\n const stream = tokenizer.stream();\n for (const chunk of chunks) {\n stream.pushText(chunk);\n }\n stream.endInput();\n stream.close();\n\n for (const x of EXPECTED_MIN_20) {\n await stream.next().then((value) => {\n if (value.value) {\n expect(value.value.token).toStrictEqual(x);\n }\n });\n }\n });\n });\n describe('WordTokenizer', () => {\n const tokenizer = new WordTokenizer();\n\n it('should tokenize words correctly', () => {\n expect(tokenizer.tokenize(WORDS_TEXT).every((x, i) => WORDS_EXPECTED[i] === x)).toBeTruthy();\n });\n\n it('should stream tokenize words correctly', async () => {\n const pattern = [1, 2, 4];\n let text = WORDS_TEXT;\n const chunks = [];\n const patternIter = Array(Math.ceil(text.length / pattern.reduce((sum, num) => sum + num, 0)))\n .fill(pattern)\n .flat()\n [Symbol.iterator]();\n\n for (const size of patternIter) {\n if (!text) break;\n chunks.push(text.slice(undefined, size));\n text = text.slice(size);\n }\n const stream = tokenizer.stream();\n for (const chunk of chunks) {\n stream.pushText(chunk);\n }\n stream.endInput();\n stream.close();\n\n for (const x of WORDS_EXPECTED) {\n await stream.next().then((value) => {\n if (value.value) {\n expect(value.value.token).toStrictEqual(x);\n }\n });\n }\n });\n\n describe('punctuation handling', () => {\n const tokenizerPunct = new WordTokenizer(false);\n\n it('should tokenize words correctly', () => {\n expect(\n tokenizerPunct.tokenize(WORDS_PUNCT_TEXT).every((x, i) => WORDS_PUNCT_EXPECTED[i] === x),\n ).toBeTruthy();\n });\n\n it('should stream tokenize words correctly', async () => {\n const pattern = [1, 2, 4];\n let text = WORDS_PUNCT_TEXT;\n const chunks = [];\n const patternIter = Array(\n Math.ceil(text.length / pattern.reduce((sum, num) => sum + num, 0)),\n )\n .fill(pattern)\n .flat()\n [Symbol.iterator]();\n\n for (const size of patternIter) {\n if (!text) break;\n chunks.push(text.slice(undefined, size));\n text = text.slice(size);\n }\n const stream = tokenizerPunct.stream();\n for (const chunk of chunks) {\n stream.pushText(chunk);\n }\n stream.endInput();\n stream.close();\n\n for (const x of WORDS_PUNCT_EXPECTED) {\n await stream.next().then((value) => {\n if (value.value) {\n expect(value.value.token).toStrictEqual(x);\n }\n });\n }\n });\n });\n });\n describe('hyphenateWord', () => {\n it('should hyphenate correctly', () => {\n HYPHENATOR_TEXT.forEach((x, i) => {\n expect(hyphenateWord(x)).toStrictEqual(HYPHENATOR_EXPECTED[i]);\n });\n });\n });\n describe('splitParagraphs', () => {\n it('should tokenize paragraphs correctly', () => {\n PARAGRAPH_TEST_CASES.forEach(([a, b]) => {\n expect(splitParagraphs(a)).toStrictEqual(b);\n });\n });\n });\n});\n"],"mappings":";AAGA,oBAAqC;AACrC,mBAAgE;AAChE,uBAAgC;AAEhC,MAAM,OACJ;AAUF,MAAM,kBAAkB;AAAA,EACtB;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AAEA,MAAM,aAAa;AACnB,MAAM,iBAAiB;AAAA,EACrB;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AAEA,MAAM,mBACJ;AACF,MAAM,uBAAuB;AAAA,EAC3B;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AAEA,MAAM,kBAAkB,CAAC,WAAW,YAAY,iBAAiB,UAAU,WAAW,SAAS;AAC/F,MAAM,sBAAsB;AAAA,EAC1B,CAAC,OAAO,MAAM;AAAA,EACd,CAAC,MAAM,QAAQ,IAAI;AAAA,EACnB,CAAC,OAAO,MAAM,MAAM,MAAM,MAAM;AAAA,EAChC,CAAC,OAAO,KAAK;AAAA,EACb,CAAC,OAAO,MAAM;AAAA,EACd,CAAC,OAAO,MAAM;AAChB;AAEA,MAAM,uBAA+D;AAAA,EACnE,CAAC,qBAAqB,CAAC,CAAC,qBAAqB,GAAG,EAAE,CAAC,CAAC;AAAA,EACpD;AAAA,IACE;AAAA,IACA;AAAA,MACE,CAAC,gBAAgB,GAAG,EAAE;AAAA,MACtB,CAAC,gBAAgB,IAAI,EAAE;AAAA,IACzB;AAAA,EACF;AAAA,EACA;AAAA,IACE;AAAA,IACA;AAAA,MACE,CAAC,WAAW,GAAG,CAAC;AAAA,MAChB,CAAC,WAAW,GAAG,EAAE;AAAA,MACjB,CAAC,WAAW,IAAI,EAAE;AAAA,IACpB;AAAA,EACF;AAAA,EACA,CAAC,wCAAwC,CAAC,CAAC,oCAAoC,GAAG,EAAE,CAAC,CAAC;AAAA,EACtF,CAAC,yCAAyC,CAAC,CAAC,qCAAqC,GAAG,EAAE,CAAC,CAAC;AAAA,EACxF;AAAA,IACE;AAAA,IACA,CAAC,CAAC,+CAA+C,GAAG,EAAE,CAAC;AAAA,EACzD;AAAA,EACA;AAAA,IACE;AAAA;AAAA,IACA;AAAA,MACE,CAAC,WAAW,GAAG,CAAC;AAAA,MAChB,CAAC,WAAW,IAAI,EAAE;AAAA,IACpB;AAAA,EACF;AAAA,EACA;AAAA,IACE;AAAA;AAAA,IACA;AAAA,MACE,CAAC,WAAW,GAAG,CAAC;AAAA,MAChB,CAAC,WAAW,IAAI,EAAE;AAAA,IACpB;AAAA,EACF;AAAA,EACA;AAAA,IACE;AAAA;AAAA,IACA,CAAC;AAAA,EACH;AAAA,EACA;AAAA,IACE;AAAA;AAAA,IACA,CAAC;AAAA,EACH;AAAA,EACA;AAAA,IACE;AAAA;AAAA,IACA,CAAC,CAAC,0BAA0B,GAAG,EAAE,CAAC;AAAA,EACpC;AACF;AAAA,IAEA,wBAAS,aAAa,MAAM;AAC1B,8BAAS,qBAAqB,MAAM;AAClC,UAAM,YAAY,IAAI,+BAAkB;AAExC,0BAAG,uCAAuC,MAAM;AAC9C,gCAAO,UAAU,SAAS,IAAI,EAAE,MAAM,CAAC,GAAG,MAAM,gBAAgB,CAAC,MAAM,CAAC,CAAC,EAAE,WAAW;AAAA,IACxF,CAAC;AAED,0BAAG,8CAA8C,YAAY;AAC3D,YAAM,UAAU,CAAC,GAAG,GAAG,CAAC;AACxB,UAAI,OAAO;AACX,YAAM,SAAS,CAAC;AAChB,YAAM,cAAc,MAAM,KAAK,KAAK,KAAK,SAAS,QAAQ,OAAO,CAAC,KAAK,QAAQ,MAAM,KAAK,CAAC,CAAC,CAAC,EAC1F,KAAK,OAAO,EACZ,KAAK,EACL,OAAO,QAAQ,EAAE;AAEpB,iBAAW,QAAQ,aAAa;AAC9B,YAAI,CAAC,KAAM;AACX,eAAO,KAAK,KAAK,MAAM,QAAW,IAAI,CAAC;AACvC,eAAO,KAAK,MAAM,IAAI;AAAA,MACxB;AACA,YAAM,SAAS,UAAU,OAAO;AAChC,iBAAW,SAAS,QAAQ;AAC1B,eAAO,SAAS,KAAK;AAAA,MACvB;AACA,aAAO,SAAS;AAChB,aAAO,MAAM;AAEb,iBAAW,KAAK,iBAAiB;AAC/B,cAAM,OAAO,KAAK,EAAE,KAAK,CAAC,UAAU;AAClC,cAAI,MAAM,OAAO;AACf,sCAAO,MAAM,MAAM,KAAK,EAAE,cAAc,CAAC;AAAA,UAC3C;AAAA,QACF,CAAC;AAAA,MACH;AAAA,IACF,CAAC;AAAA,EACH,CAAC;AACD,8BAAS,iBAAiB,MAAM;AAC9B,UAAM,YAAY,IAAI,2BAAc;AAEpC,0BAAG,mCAAmC,MAAM;AAC1C,gCAAO,UAAU,SAAS,UAAU,EAAE,MAAM,CAAC,GAAG,MAAM,eAAe,CAAC,MAAM,CAAC,CAAC,EAAE,WAAW;AAAA,IAC7F,CAAC;AAED,0BAAG,0CAA0C,YAAY;AACvD,YAAM,UAAU,CAAC,GAAG,GAAG,CAAC;AACxB,UAAI,OAAO;AACX,YAAM,SAAS,CAAC;AAChB,YAAM,cAAc,MAAM,KAAK,KAAK,KAAK,SAAS,QAAQ,OAAO,CAAC,KAAK,QAAQ,MAAM,KAAK,CAAC,CAAC,CAAC,EAC1F,KAAK,OAAO,EACZ,KAAK,EACL,OAAO,QAAQ,EAAE;AAEpB,iBAAW,QAAQ,aAAa;AAC9B,YAAI,CAAC,KAAM;AACX,eAAO,KAAK,KAAK,MAAM,QAAW,IAAI,CAAC;AACvC,eAAO,KAAK,MAAM,IAAI;AAAA,MACxB;AACA,YAAM,SAAS,UAAU,OAAO;AAChC,iBAAW,SAAS,QAAQ;AAC1B,eAAO,SAAS,KAAK;AAAA,MACvB;AACA,aAAO,SAAS;AAChB,aAAO,MAAM;AAEb,iBAAW,KAAK,gBAAgB;AAC9B,cAAM,OAAO,KAAK,EAAE,KAAK,CAAC,UAAU;AAClC,cAAI,MAAM,OAAO;AACf,sCAAO,MAAM,MAAM,KAAK,EAAE,cAAc,CAAC;AAAA,UAC3C;AAAA,QACF,CAAC;AAAA,MACH;AAAA,IACF,CAAC;AAED,gCAAS,wBAAwB,MAAM;AACrC,YAAM,iBAAiB,IAAI,2BAAc,KAAK;AAE9C,4BAAG,mCAAmC,MAAM;AAC1C;AAAA,UACE,eAAe,SAAS,gBAAgB,EAAE,MAAM,CAAC,GAAG,MAAM,qBAAqB,CAAC,MAAM,CAAC;AAAA,QACzF,EAAE,WAAW;AAAA,MACf,CAAC;AAED,4BAAG,0CAA0C,YAAY;AACvD,cAAM,UAAU,CAAC,GAAG,GAAG,CAAC;AACxB,YAAI,OAAO;AACX,cAAM,SAAS,CAAC;AAChB,cAAM,cAAc;AAAA,UAClB,KAAK,KAAK,KAAK,SAAS,QAAQ,OAAO,CAAC,KAAK,QAAQ,MAAM,KAAK,CAAC,CAAC;AAAA,QACpE,EACG,KAAK,OAAO,EACZ,KAAK,EACL,OAAO,QAAQ,EAAE;AAEpB,mBAAW,QAAQ,aAAa;AAC9B,cAAI,CAAC,KAAM;AACX,iBAAO,KAAK,KAAK,MAAM,QAAW,IAAI,CAAC;AACvC,iBAAO,KAAK,MAAM,IAAI;AAAA,QACxB;AACA,cAAM,SAAS,eAAe,OAAO;AACrC,mBAAW,SAAS,QAAQ;AAC1B,iBAAO,SAAS,KAAK;AAAA,QACvB;AACA,eAAO,SAAS;AAChB,eAAO,MAAM;AAEb,mBAAW,KAAK,sBAAsB;AACpC,gBAAM,OAAO,KAAK,EAAE,KAAK,CAAC,UAAU;AAClC,gBAAI,MAAM,OAAO;AACf,wCAAO,MAAM,MAAM,KAAK,EAAE,cAAc,CAAC;AAAA,YAC3C;AAAA,UACF,CAAC;AAAA,QACH;AAAA,MACF,CAAC;AAAA,IACH,CAAC;AAAA,EACH,CAAC;AACD,8BAAS,iBAAiB,MAAM;AAC9B,0BAAG,8BAA8B,MAAM;AACrC,sBAAgB,QAAQ,CAAC,GAAG,MAAM;AAChC,sCAAO,4BAAc,CAAC,CAAC,EAAE,cAAc,oBAAoB,CAAC,CAAC;AAAA,MAC/D,CAAC;AAAA,IACH,CAAC;AAAA,EACH,CAAC;AACD,8BAAS,mBAAmB,MAAM;AAChC,0BAAG,wCAAwC,MAAM;AAC/C,2BAAqB,QAAQ,CAAC,CAAC,GAAG,CAAC,MAAM;AACvC,sCAAO,kCAAgB,CAAC,CAAC,EAAE,cAAc,CAAC;AAAA,MAC5C,CAAC;AAAA,IACH,CAAC;AAAA,EACH,CAAC;AACH,CAAC;","names":[]}
@@ -0,0 +1,2 @@
1
+ export {};
2
+ //# sourceMappingURL=tokenizer.test.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"tokenizer.test.d.ts","sourceRoot":"","sources":["../../src/tokenize/tokenizer.test.ts"],"names":[],"mappings":""}
@@ -0,0 +1,219 @@
1
+ import { describe, expect, it } from "vitest";
2
+ import { SentenceTokenizer, WordTokenizer, hyphenateWord } from "./basic/index.js";
3
+ import { splitParagraphs } from "./basic/paragraph.js";
4
+ const TEXT = "Hi! LiveKit is a platform for live audio and video applications and services. R.T.C stands for Real-Time Communication... again R.T.C. Mr. Theo is testing the sentence tokenizer. This is a test. Another test. A short sentence. A longer sentence that is longer than the previous sentence. f(x) = x * 2.54 + 42. Hey! Hi! Hello! ";
5
+ const EXPECTED_MIN_20 = [
6
+ "Hi! LiveKit is a platform for live audio and video applications and services.",
7
+ "R.T.C stands for Real-Time Communication... again R.T.C.",
8
+ "Mr. Theo is testing the sentence tokenizer.",
9
+ "This is a test. Another test.",
10
+ "A short sentence. A longer sentence that is longer than the previous sentence.",
11
+ "f(x) = x * 2.54 + 42.",
12
+ "Hey! Hi! Hello!"
13
+ ];
14
+ const WORDS_TEXT = "This is a test. Blabla another test! multiple consecutive spaces: done";
15
+ const WORDS_EXPECTED = [
16
+ "This",
17
+ "is",
18
+ "a",
19
+ "test",
20
+ "Blabla",
21
+ "another",
22
+ "test",
23
+ "multiple",
24
+ "consecutive",
25
+ "spaces",
26
+ "done"
27
+ ];
28
+ const WORDS_PUNCT_TEXT = 'This is <phoneme alphabet="cmu-arpabet" ph="AE K CH UW AH L IY">actually</phoneme> tricky to handle.';
29
+ const WORDS_PUNCT_EXPECTED = [
30
+ "This",
31
+ "is",
32
+ "<phoneme",
33
+ 'alphabet="cmu-arpabet"',
34
+ 'ph="AE',
35
+ "K",
36
+ "CH",
37
+ "UW",
38
+ "AH",
39
+ "L",
40
+ 'IY">actually</phoneme>',
41
+ "tricky",
42
+ "to",
43
+ "handle."
44
+ ];
45
+ const HYPHENATOR_TEXT = ["Segment", "expected", "communication", "window", "welcome", "bedroom"];
46
+ const HYPHENATOR_EXPECTED = [
47
+ ["Seg", "ment"],
48
+ ["ex", "pect", "ed"],
49
+ ["com", "mu", "ni", "ca", "tion"],
50
+ ["win", "dow"],
51
+ ["wel", "come"],
52
+ ["bed", "room"]
53
+ ];
54
+ const PARAGRAPH_TEST_CASES = [
55
+ ["Single paragraph.", [["Single paragraph.", 0, 17]]],
56
+ [
57
+ "Paragraph 1.\n\nParagraph 2.",
58
+ [
59
+ ["Paragraph 1.", 0, 12],
60
+ ["Paragraph 2.", 14, 26]
61
+ ]
62
+ ],
63
+ [
64
+ "Para 1.\n\nPara 2.\n\nPara 3.",
65
+ [
66
+ ["Para 1.", 0, 7],
67
+ ["Para 2.", 9, 16],
68
+ ["Para 3.", 18, 25]
69
+ ]
70
+ ],
71
+ ["\n\nParagraph with leading newlines.", [["Paragraph with leading newlines.", 2, 34]]],
72
+ ["Paragraph with trailing newlines.\n\n", [["Paragraph with trailing newlines.", 0, 33]]],
73
+ [
74
+ "\n\n Paragraph with leading and trailing spaces. \n\n",
75
+ [["Paragraph with leading and trailing spaces.", 4, 47]]
76
+ ],
77
+ [
78
+ "Para 1.\n\n\n\nPara 2.",
79
+ // Multiple newlines between paragraphs
80
+ [
81
+ ["Para 1.", 0, 7],
82
+ ["Para 2.", 11, 18]
83
+ ]
84
+ ],
85
+ [
86
+ "Para 1.\n \n \nPara 2.",
87
+ // Newlines with spaces between paragraphs
88
+ [
89
+ ["Para 1.", 0, 7],
90
+ ["Para 2.", 12, 19]
91
+ ]
92
+ ],
93
+ [
94
+ "",
95
+ // Empty string
96
+ []
97
+ ],
98
+ [
99
+ "\n\n\n",
100
+ // Only newlines
101
+ []
102
+ ],
103
+ [
104
+ "Line 1\nLine 2\nLine 3",
105
+ // Single paragraph with newlines
106
+ [["Line 1\nLine 2\nLine 3", 0, 20]]
107
+ ]
108
+ ];
109
+ describe("tokenizer", () => {
110
+ describe("SentenceTokenizer", () => {
111
+ const tokenizer = new SentenceTokenizer();
112
+ it("should tokenize sentences correctly", () => {
113
+ expect(tokenizer.tokenize(TEXT).every((x, i) => EXPECTED_MIN_20[i] === x)).toBeTruthy();
114
+ });
115
+ it("should stream tokenize sentences correctly", async () => {
116
+ const pattern = [1, 2, 4];
117
+ let text = TEXT;
118
+ const chunks = [];
119
+ const patternIter = Array(Math.ceil(text.length / pattern.reduce((sum, num) => sum + num, 0))).fill(pattern).flat()[Symbol.iterator]();
120
+ for (const size of patternIter) {
121
+ if (!text) break;
122
+ chunks.push(text.slice(void 0, size));
123
+ text = text.slice(size);
124
+ }
125
+ const stream = tokenizer.stream();
126
+ for (const chunk of chunks) {
127
+ stream.pushText(chunk);
128
+ }
129
+ stream.endInput();
130
+ stream.close();
131
+ for (const x of EXPECTED_MIN_20) {
132
+ await stream.next().then((value) => {
133
+ if (value.value) {
134
+ expect(value.value.token).toStrictEqual(x);
135
+ }
136
+ });
137
+ }
138
+ });
139
+ });
140
+ describe("WordTokenizer", () => {
141
+ const tokenizer = new WordTokenizer();
142
+ it("should tokenize words correctly", () => {
143
+ expect(tokenizer.tokenize(WORDS_TEXT).every((x, i) => WORDS_EXPECTED[i] === x)).toBeTruthy();
144
+ });
145
+ it("should stream tokenize words correctly", async () => {
146
+ const pattern = [1, 2, 4];
147
+ let text = WORDS_TEXT;
148
+ const chunks = [];
149
+ const patternIter = Array(Math.ceil(text.length / pattern.reduce((sum, num) => sum + num, 0))).fill(pattern).flat()[Symbol.iterator]();
150
+ for (const size of patternIter) {
151
+ if (!text) break;
152
+ chunks.push(text.slice(void 0, size));
153
+ text = text.slice(size);
154
+ }
155
+ const stream = tokenizer.stream();
156
+ for (const chunk of chunks) {
157
+ stream.pushText(chunk);
158
+ }
159
+ stream.endInput();
160
+ stream.close();
161
+ for (const x of WORDS_EXPECTED) {
162
+ await stream.next().then((value) => {
163
+ if (value.value) {
164
+ expect(value.value.token).toStrictEqual(x);
165
+ }
166
+ });
167
+ }
168
+ });
169
+ describe("punctuation handling", () => {
170
+ const tokenizerPunct = new WordTokenizer(false);
171
+ it("should tokenize words correctly", () => {
172
+ expect(
173
+ tokenizerPunct.tokenize(WORDS_PUNCT_TEXT).every((x, i) => WORDS_PUNCT_EXPECTED[i] === x)
174
+ ).toBeTruthy();
175
+ });
176
+ it("should stream tokenize words correctly", async () => {
177
+ const pattern = [1, 2, 4];
178
+ let text = WORDS_PUNCT_TEXT;
179
+ const chunks = [];
180
+ const patternIter = Array(
181
+ Math.ceil(text.length / pattern.reduce((sum, num) => sum + num, 0))
182
+ ).fill(pattern).flat()[Symbol.iterator]();
183
+ for (const size of patternIter) {
184
+ if (!text) break;
185
+ chunks.push(text.slice(void 0, size));
186
+ text = text.slice(size);
187
+ }
188
+ const stream = tokenizerPunct.stream();
189
+ for (const chunk of chunks) {
190
+ stream.pushText(chunk);
191
+ }
192
+ stream.endInput();
193
+ stream.close();
194
+ for (const x of WORDS_PUNCT_EXPECTED) {
195
+ await stream.next().then((value) => {
196
+ if (value.value) {
197
+ expect(value.value.token).toStrictEqual(x);
198
+ }
199
+ });
200
+ }
201
+ });
202
+ });
203
+ });
204
+ describe("hyphenateWord", () => {
205
+ it("should hyphenate correctly", () => {
206
+ HYPHENATOR_TEXT.forEach((x, i) => {
207
+ expect(hyphenateWord(x)).toStrictEqual(HYPHENATOR_EXPECTED[i]);
208
+ });
209
+ });
210
+ });
211
+ describe("splitParagraphs", () => {
212
+ it("should tokenize paragraphs correctly", () => {
213
+ PARAGRAPH_TEST_CASES.forEach(([a, b]) => {
214
+ expect(splitParagraphs(a)).toStrictEqual(b);
215
+ });
216
+ });
217
+ });
218
+ });
219
+ //# sourceMappingURL=tokenizer.test.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../../src/tokenize/tokenizer.test.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport { describe, expect, it } from 'vitest';\nimport { SentenceTokenizer, WordTokenizer, hyphenateWord } from './basic/index.js';\nimport { splitParagraphs } from './basic/paragraph.js';\n\nconst TEXT =\n 'Hi! ' +\n 'LiveKit is a platform for live audio and video applications and services. ' +\n 'R.T.C stands for Real-Time Communication... again R.T.C. ' +\n 'Mr. Theo is testing the sentence tokenizer. ' +\n 'This is a test. Another test. ' +\n 'A short sentence. ' +\n 'A longer sentence that is longer than the previous sentence. ' +\n 'f(x) = x * 2.54 + 42. ' +\n 'Hey! Hi! Hello! ';\n\nconst EXPECTED_MIN_20 = [\n 'Hi! LiveKit is a platform for live audio and video applications and services.',\n 'R.T.C stands for Real-Time Communication... again R.T.C.',\n 'Mr. Theo is testing the sentence tokenizer.',\n 'This is a test. Another test.',\n 'A short sentence. A longer sentence that is longer than the previous sentence.',\n 'f(x) = x * 2.54 + 42.',\n 'Hey! Hi! Hello!',\n];\n\nconst WORDS_TEXT = 'This is a test. Blabla another test! multiple consecutive spaces: done';\nconst WORDS_EXPECTED = [\n 'This',\n 'is',\n 'a',\n 'test',\n 'Blabla',\n 'another',\n 'test',\n 'multiple',\n 'consecutive',\n 'spaces',\n 'done',\n];\n\nconst WORDS_PUNCT_TEXT =\n 'This is <phoneme alphabet=\"cmu-arpabet\" ph=\"AE K CH UW AH L IY\">actually</phoneme> tricky to handle.';\nconst WORDS_PUNCT_EXPECTED = [\n 'This',\n 'is',\n '<phoneme',\n 'alphabet=\"cmu-arpabet\"',\n 'ph=\"AE',\n 'K',\n 'CH',\n 'UW',\n 'AH',\n 'L',\n 'IY\">actually</phoneme>',\n 'tricky',\n 'to',\n 'handle.',\n];\n\nconst HYPHENATOR_TEXT = ['Segment', 'expected', 'communication', 'window', 'welcome', 'bedroom'];\nconst HYPHENATOR_EXPECTED = [\n ['Seg', 'ment'],\n ['ex', 'pect', 'ed'],\n ['com', 'mu', 'ni', 'ca', 'tion'],\n ['win', 'dow'],\n ['wel', 'come'],\n ['bed', 'room'],\n];\n\nconst PARAGRAPH_TEST_CASES: [string, [string, number, number][]][] = [\n ['Single paragraph.', [['Single paragraph.', 0, 17]]],\n [\n 'Paragraph 1.\\n\\nParagraph 2.',\n [\n ['Paragraph 1.', 0, 12],\n ['Paragraph 2.', 14, 26],\n ],\n ],\n [\n 'Para 1.\\n\\nPara 2.\\n\\nPara 3.',\n [\n ['Para 1.', 0, 7],\n ['Para 2.', 9, 16],\n ['Para 3.', 18, 25],\n ],\n ],\n ['\\n\\nParagraph with leading newlines.', [['Paragraph with leading newlines.', 2, 34]]],\n ['Paragraph with trailing newlines.\\n\\n', [['Paragraph with trailing newlines.', 0, 33]]],\n [\n '\\n\\n Paragraph with leading and trailing spaces. \\n\\n',\n [['Paragraph with leading and trailing spaces.', 4, 47]],\n ],\n [\n 'Para 1.\\n\\n\\n\\nPara 2.', // Multiple newlines between paragraphs\n [\n ['Para 1.', 0, 7],\n ['Para 2.', 11, 18],\n ],\n ],\n [\n 'Para 1.\\n \\n \\nPara 2.', // Newlines with spaces between paragraphs\n [\n ['Para 1.', 0, 7],\n ['Para 2.', 12, 19],\n ],\n ],\n [\n '', // Empty string\n [],\n ],\n [\n '\\n\\n\\n', // Only newlines\n [],\n ],\n [\n 'Line 1\\nLine 2\\nLine 3', // Single paragraph with newlines\n [['Line 1\\nLine 2\\nLine 3', 0, 20]],\n ],\n];\n\ndescribe('tokenizer', () => {\n describe('SentenceTokenizer', () => {\n const tokenizer = new SentenceTokenizer();\n\n it('should tokenize sentences correctly', () => {\n expect(tokenizer.tokenize(TEXT).every((x, i) => EXPECTED_MIN_20[i] === x)).toBeTruthy();\n });\n\n it('should stream tokenize sentences correctly', async () => {\n const pattern = [1, 2, 4];\n let text = TEXT;\n const chunks = [];\n const patternIter = Array(Math.ceil(text.length / pattern.reduce((sum, num) => sum + num, 0)))\n .fill(pattern)\n .flat()\n [Symbol.iterator]();\n\n for (const size of patternIter) {\n if (!text) break;\n chunks.push(text.slice(undefined, size));\n text = text.slice(size);\n }\n const stream = tokenizer.stream();\n for (const chunk of chunks) {\n stream.pushText(chunk);\n }\n stream.endInput();\n stream.close();\n\n for (const x of EXPECTED_MIN_20) {\n await stream.next().then((value) => {\n if (value.value) {\n expect(value.value.token).toStrictEqual(x);\n }\n });\n }\n });\n });\n describe('WordTokenizer', () => {\n const tokenizer = new WordTokenizer();\n\n it('should tokenize words correctly', () => {\n expect(tokenizer.tokenize(WORDS_TEXT).every((x, i) => WORDS_EXPECTED[i] === x)).toBeTruthy();\n });\n\n it('should stream tokenize words correctly', async () => {\n const pattern = [1, 2, 4];\n let text = WORDS_TEXT;\n const chunks = [];\n const patternIter = Array(Math.ceil(text.length / pattern.reduce((sum, num) => sum + num, 0)))\n .fill(pattern)\n .flat()\n [Symbol.iterator]();\n\n for (const size of patternIter) {\n if (!text) break;\n chunks.push(text.slice(undefined, size));\n text = text.slice(size);\n }\n const stream = tokenizer.stream();\n for (const chunk of chunks) {\n stream.pushText(chunk);\n }\n stream.endInput();\n stream.close();\n\n for (const x of WORDS_EXPECTED) {\n await stream.next().then((value) => {\n if (value.value) {\n expect(value.value.token).toStrictEqual(x);\n }\n });\n }\n });\n\n describe('punctuation handling', () => {\n const tokenizerPunct = new WordTokenizer(false);\n\n it('should tokenize words correctly', () => {\n expect(\n tokenizerPunct.tokenize(WORDS_PUNCT_TEXT).every((x, i) => WORDS_PUNCT_EXPECTED[i] === x),\n ).toBeTruthy();\n });\n\n it('should stream tokenize words correctly', async () => {\n const pattern = [1, 2, 4];\n let text = WORDS_PUNCT_TEXT;\n const chunks = [];\n const patternIter = Array(\n Math.ceil(text.length / pattern.reduce((sum, num) => sum + num, 0)),\n )\n .fill(pattern)\n .flat()\n [Symbol.iterator]();\n\n for (const size of patternIter) {\n if (!text) break;\n chunks.push(text.slice(undefined, size));\n text = text.slice(size);\n }\n const stream = tokenizerPunct.stream();\n for (const chunk of chunks) {\n stream.pushText(chunk);\n }\n stream.endInput();\n stream.close();\n\n for (const x of WORDS_PUNCT_EXPECTED) {\n await stream.next().then((value) => {\n if (value.value) {\n expect(value.value.token).toStrictEqual(x);\n }\n });\n }\n });\n });\n });\n describe('hyphenateWord', () => {\n it('should hyphenate correctly', () => {\n HYPHENATOR_TEXT.forEach((x, i) => {\n expect(hyphenateWord(x)).toStrictEqual(HYPHENATOR_EXPECTED[i]);\n });\n });\n });\n describe('splitParagraphs', () => {\n it('should tokenize paragraphs correctly', () => {\n PARAGRAPH_TEST_CASES.forEach(([a, b]) => {\n expect(splitParagraphs(a)).toStrictEqual(b);\n });\n });\n });\n});\n"],"mappings":"AAGA,SAAS,UAAU,QAAQ,UAAU;AACrC,SAAS,mBAAmB,eAAe,qBAAqB;AAChE,SAAS,uBAAuB;AAEhC,MAAM,OACJ;AAUF,MAAM,kBAAkB;AAAA,EACtB;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AAEA,MAAM,aAAa;AACnB,MAAM,iBAAiB;AAAA,EACrB;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AAEA,MAAM,mBACJ;AACF,MAAM,uBAAuB;AAAA,EAC3B;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AAEA,MAAM,kBAAkB,CAAC,WAAW,YAAY,iBAAiB,UAAU,WAAW,SAAS;AAC/F,MAAM,sBAAsB;AAAA,EAC1B,CAAC,OAAO,MAAM;AAAA,EACd,CAAC,MAAM,QAAQ,IAAI;AAAA,EACnB,CAAC,OAAO,MAAM,MAAM,MAAM,MAAM;AAAA,EAChC,CAAC,OAAO,KAAK;AAAA,EACb,CAAC,OAAO,MAAM;AAAA,EACd,CAAC,OAAO,MAAM;AAChB;AAEA,MAAM,uBAA+D;AAAA,EACnE,CAAC,qBAAqB,CAAC,CAAC,qBAAqB,GAAG,EAAE,CAAC,CAAC;AAAA,EACpD;AAAA,IACE;AAAA,IACA;AAAA,MACE,CAAC,gBAAgB,GAAG,EAAE;AAAA,MACtB,CAAC,gBAAgB,IAAI,EAAE;AAAA,IACzB;AAAA,EACF;AAAA,EACA;AAAA,IACE;AAAA,IACA;AAAA,MACE,CAAC,WAAW,GAAG,CAAC;AAAA,MAChB,CAAC,WAAW,GAAG,EAAE;AAAA,MACjB,CAAC,WAAW,IAAI,EAAE;AAAA,IACpB;AAAA,EACF;AAAA,EACA,CAAC,wCAAwC,CAAC,CAAC,oCAAoC,GAAG,EAAE,CAAC,CAAC;AAAA,EACtF,CAAC,yCAAyC,CAAC,CAAC,qCAAqC,GAAG,EAAE,CAAC,CAAC;AAAA,EACxF;AAAA,IACE;AAAA,IACA,CAAC,CAAC,+CAA+C,GAAG,EAAE,CAAC;AAAA,EACzD;AAAA,EACA;AAAA,IACE;AAAA;AAAA,IACA;AAAA,MACE,CAAC,WAAW,GAAG,CAAC;AAAA,MAChB,CAAC,WAAW,IAAI,EAAE;AAAA,IACpB;AAAA,EACF;AAAA,EACA;AAAA,IACE;AAAA;AAAA,IACA;AAAA,MACE,CAAC,WAAW,GAAG,CAAC;AAAA,MAChB,CAAC,WAAW,IAAI,EAAE;AAAA,IACpB;AAAA,EACF;AAAA,EACA;AAAA,IACE;AAAA;AAAA,IACA,CAAC;AAAA,EACH;AAAA,EACA;AAAA,IACE;AAAA;AAAA,IACA,CAAC;AAAA,EACH;AAAA,EACA;AAAA,IACE;AAAA;AAAA,IACA,CAAC,CAAC,0BAA0B,GAAG,EAAE,CAAC;AAAA,EACpC;AACF;AAEA,SAAS,aAAa,MAAM;AAC1B,WAAS,qBAAqB,MAAM;AAClC,UAAM,YAAY,IAAI,kBAAkB;AAExC,OAAG,uCAAuC,MAAM;AAC9C,aAAO,UAAU,SAAS,IAAI,EAAE,MAAM,CAAC,GAAG,MAAM,gBAAgB,CAAC,MAAM,CAAC,CAAC,EAAE,WAAW;AAAA,IACxF,CAAC;AAED,OAAG,8CAA8C,YAAY;AAC3D,YAAM,UAAU,CAAC,GAAG,GAAG,CAAC;AACxB,UAAI,OAAO;AACX,YAAM,SAAS,CAAC;AAChB,YAAM,cAAc,MAAM,KAAK,KAAK,KAAK,SAAS,QAAQ,OAAO,CAAC,KAAK,QAAQ,MAAM,KAAK,CAAC,CAAC,CAAC,EAC1F,KAAK,OAAO,EACZ,KAAK,EACL,OAAO,QAAQ,EAAE;AAEpB,iBAAW,QAAQ,aAAa;AAC9B,YAAI,CAAC,KAAM;AACX,eAAO,KAAK,KAAK,MAAM,QAAW,IAAI,CAAC;AACvC,eAAO,KAAK,MAAM,IAAI;AAAA,MACxB;AACA,YAAM,SAAS,UAAU,OAAO;AAChC,iBAAW,SAAS,QAAQ;AAC1B,eAAO,SAAS,KAAK;AAAA,MACvB;AACA,aAAO,SAAS;AAChB,aAAO,MAAM;AAEb,iBAAW,KAAK,iBAAiB;AAC/B,cAAM,OAAO,KAAK,EAAE,KAAK,CAAC,UAAU;AAClC,cAAI,MAAM,OAAO;AACf,mBAAO,MAAM,MAAM,KAAK,EAAE,cAAc,CAAC;AAAA,UAC3C;AAAA,QACF,CAAC;AAAA,MACH;AAAA,IACF,CAAC;AAAA,EACH,CAAC;AACD,WAAS,iBAAiB,MAAM;AAC9B,UAAM,YAAY,IAAI,cAAc;AAEpC,OAAG,mCAAmC,MAAM;AAC1C,aAAO,UAAU,SAAS,UAAU,EAAE,MAAM,CAAC,GAAG,MAAM,eAAe,CAAC,MAAM,CAAC,CAAC,EAAE,WAAW;AAAA,IAC7F,CAAC;AAED,OAAG,0CAA0C,YAAY;AACvD,YAAM,UAAU,CAAC,GAAG,GAAG,CAAC;AACxB,UAAI,OAAO;AACX,YAAM,SAAS,CAAC;AAChB,YAAM,cAAc,MAAM,KAAK,KAAK,KAAK,SAAS,QAAQ,OAAO,CAAC,KAAK,QAAQ,MAAM,KAAK,CAAC,CAAC,CAAC,EAC1F,KAAK,OAAO,EACZ,KAAK,EACL,OAAO,QAAQ,EAAE;AAEpB,iBAAW,QAAQ,aAAa;AAC9B,YAAI,CAAC,KAAM;AACX,eAAO,KAAK,KAAK,MAAM,QAAW,IAAI,CAAC;AACvC,eAAO,KAAK,MAAM,IAAI;AAAA,MACxB;AACA,YAAM,SAAS,UAAU,OAAO;AAChC,iBAAW,SAAS,QAAQ;AAC1B,eAAO,SAAS,KAAK;AAAA,MACvB;AACA,aAAO,SAAS;AAChB,aAAO,MAAM;AAEb,iBAAW,KAAK,gBAAgB;AAC9B,cAAM,OAAO,KAAK,EAAE,KAAK,CAAC,UAAU;AAClC,cAAI,MAAM,OAAO;AACf,mBAAO,MAAM,MAAM,KAAK,EAAE,cAAc,CAAC;AAAA,UAC3C;AAAA,QACF,CAAC;AAAA,MACH;AAAA,IACF,CAAC;AAED,aAAS,wBAAwB,MAAM;AACrC,YAAM,iBAAiB,IAAI,cAAc,KAAK;AAE9C,SAAG,mCAAmC,MAAM;AAC1C;AAAA,UACE,eAAe,SAAS,gBAAgB,EAAE,MAAM,CAAC,GAAG,MAAM,qBAAqB,CAAC,MAAM,CAAC;AAAA,QACzF,EAAE,WAAW;AAAA,MACf,CAAC;AAED,SAAG,0CAA0C,YAAY;AACvD,cAAM,UAAU,CAAC,GAAG,GAAG,CAAC;AACxB,YAAI,OAAO;AACX,cAAM,SAAS,CAAC;AAChB,cAAM,cAAc;AAAA,UAClB,KAAK,KAAK,KAAK,SAAS,QAAQ,OAAO,CAAC,KAAK,QAAQ,MAAM,KAAK,CAAC,CAAC;AAAA,QACpE,EACG,KAAK,OAAO,EACZ,KAAK,EACL,OAAO,QAAQ,EAAE;AAEpB,mBAAW,QAAQ,aAAa;AAC9B,cAAI,CAAC,KAAM;AACX,iBAAO,KAAK,KAAK,MAAM,QAAW,IAAI,CAAC;AACvC,iBAAO,KAAK,MAAM,IAAI;AAAA,QACxB;AACA,cAAM,SAAS,eAAe,OAAO;AACrC,mBAAW,SAAS,QAAQ;AAC1B,iBAAO,SAAS,KAAK;AAAA,QACvB;AACA,eAAO,SAAS;AAChB,eAAO,MAAM;AAEb,mBAAW,KAAK,sBAAsB;AACpC,gBAAM,OAAO,KAAK,EAAE,KAAK,CAAC,UAAU;AAClC,gBAAI,MAAM,OAAO;AACf,qBAAO,MAAM,MAAM,KAAK,EAAE,cAAc,CAAC;AAAA,YAC3C;AAAA,UACF,CAAC;AAAA,QACH;AAAA,MACF,CAAC;AAAA,IACH,CAAC;AAAA,EACH,CAAC;AACD,WAAS,iBAAiB,MAAM;AAC9B,OAAG,8BAA8B,MAAM;AACrC,sBAAgB,QAAQ,CAAC,GAAG,MAAM;AAChC,eAAO,cAAc,CAAC,CAAC,EAAE,cAAc,oBAAoB,CAAC,CAAC;AAAA,MAC/D,CAAC;AAAA,IACH,CAAC;AAAA,EACH,CAAC;AACD,WAAS,mBAAmB,MAAM;AAChC,OAAG,wCAAwC,MAAM;AAC/C,2BAAqB,QAAQ,CAAC,CAAC,GAAG,CAAC,MAAM;AACvC,eAAO,gBAAgB,CAAC,CAAC,EAAE,cAAc,CAAC;AAAA,MAC5C,CAAC;AAAA,IACH,CAAC;AAAA,EACH,CAAC;AACH,CAAC;","names":[]}
package/dist/worker.cjs CHANGED
@@ -500,7 +500,8 @@ class Worker {
500
500
  available: true,
501
501
  participantIdentity: args.identity,
502
502
  participantName: args.name,
503
- participantMetadata: args.metadata
503
+ participantMetadata: args.metadata,
504
+ participantAttributes: args.attributes
504
505
  }
505
506
  }
506
507
  })