@supertone/supertone 0.1.1 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +119 -69
- package/custom_test/realtime_tts_player.ts +177 -12
- package/custom_test/test_pronunciation_dictionary.ts +227 -0
- package/custom_test/test_real_api.ts +1677 -162
- package/custom_test/test_text_utils_chunk_text_punctuation.ts +55 -0
- package/dist/commonjs/lib/config.d.ts +2 -2
- package/dist/commonjs/lib/config.d.ts.map +1 -1
- package/dist/commonjs/lib/config.js +2 -2
- package/dist/commonjs/lib/config.js.map +1 -1
- package/dist/commonjs/lib/custom_utils/index.d.ts +1 -0
- package/dist/commonjs/lib/custom_utils/index.d.ts.map +1 -1
- package/dist/commonjs/lib/custom_utils/index.js +5 -1
- package/dist/commonjs/lib/custom_utils/index.js.map +1 -1
- package/dist/commonjs/lib/custom_utils/pronunciation_utils.d.ts +24 -0
- package/dist/commonjs/lib/custom_utils/pronunciation_utils.d.ts.map +1 -0
- package/dist/commonjs/lib/custom_utils/pronunciation_utils.js +145 -0
- package/dist/commonjs/lib/custom_utils/pronunciation_utils.js.map +1 -0
- package/dist/commonjs/lib/custom_utils/text_utils.d.ts +8 -1
- package/dist/commonjs/lib/custom_utils/text_utils.d.ts.map +1 -1
- package/dist/commonjs/lib/custom_utils/text_utils.js +125 -7
- package/dist/commonjs/lib/custom_utils/text_utils.js.map +1 -1
- package/dist/commonjs/models/apiconverttexttospeechusingcharacterrequest.d.ts +92 -1
- package/dist/commonjs/models/apiconverttexttospeechusingcharacterrequest.d.ts.map +1 -1
- package/dist/commonjs/models/apiconverttexttospeechusingcharacterrequest.js +48 -3
- package/dist/commonjs/models/apiconverttexttospeechusingcharacterrequest.js.map +1 -1
- package/dist/commonjs/models/predictttsdurationusingcharacterrequest.d.ts +92 -1
- package/dist/commonjs/models/predictttsdurationusingcharacterrequest.d.ts.map +1 -1
- package/dist/commonjs/models/predictttsdurationusingcharacterrequest.js +46 -3
- package/dist/commonjs/models/predictttsdurationusingcharacterrequest.js.map +1 -1
- package/dist/commonjs/sdk/texttospeech.d.ts +17 -6
- package/dist/commonjs/sdk/texttospeech.d.ts.map +1 -1
- package/dist/commonjs/sdk/texttospeech.js +48 -25
- package/dist/commonjs/sdk/texttospeech.js.map +1 -1
- package/dist/esm/lib/config.d.ts +2 -2
- package/dist/esm/lib/config.d.ts.map +1 -1
- package/dist/esm/lib/config.js +2 -2
- package/dist/esm/lib/config.js.map +1 -1
- package/dist/esm/lib/custom_utils/index.d.ts +1 -0
- package/dist/esm/lib/custom_utils/index.d.ts.map +1 -1
- package/dist/esm/lib/custom_utils/index.js +2 -0
- package/dist/esm/lib/custom_utils/index.js.map +1 -1
- package/dist/esm/lib/custom_utils/pronunciation_utils.d.ts +24 -0
- package/dist/esm/lib/custom_utils/pronunciation_utils.d.ts.map +1 -0
- package/dist/esm/lib/custom_utils/pronunciation_utils.js +140 -0
- package/dist/esm/lib/custom_utils/pronunciation_utils.js.map +1 -0
- package/dist/esm/lib/custom_utils/text_utils.d.ts +8 -1
- package/dist/esm/lib/custom_utils/text_utils.d.ts.map +1 -1
- package/dist/esm/lib/custom_utils/text_utils.js +125 -7
- package/dist/esm/lib/custom_utils/text_utils.js.map +1 -1
- package/dist/esm/models/apiconverttexttospeechusingcharacterrequest.d.ts +92 -1
- package/dist/esm/models/apiconverttexttospeechusingcharacterrequest.d.ts.map +1 -1
- package/dist/esm/models/apiconverttexttospeechusingcharacterrequest.js +47 -2
- package/dist/esm/models/apiconverttexttospeechusingcharacterrequest.js.map +1 -1
- package/dist/esm/models/predictttsdurationusingcharacterrequest.d.ts +92 -1
- package/dist/esm/models/predictttsdurationusingcharacterrequest.d.ts.map +1 -1
- package/dist/esm/models/predictttsdurationusingcharacterrequest.js +45 -2
- package/dist/esm/models/predictttsdurationusingcharacterrequest.js.map +1 -1
- package/dist/esm/sdk/texttospeech.d.ts +17 -6
- package/dist/esm/sdk/texttospeech.d.ts.map +1 -1
- package/dist/esm/sdk/texttospeech.js +49 -26
- package/dist/esm/sdk/texttospeech.js.map +1 -1
- package/examples/custom_voices/create_cloned_voice.ts +4 -3
- package/examples/custom_voices/delete_custom_voice.ts +2 -7
- package/examples/custom_voices/edit_custom_voice.ts +2 -6
- package/examples/custom_voices/get_custom_voice.ts +2 -7
- package/examples/custom_voices/list_custom_voices.ts +2 -7
- package/examples/custom_voices/search_custom_voices.ts +2 -6
- package/examples/text_to_speech/create_speech.ts +3 -8
- package/examples/text_to_speech/create_speech_long_text.ts +3 -7
- package/examples/text_to_speech/create_speech_with_phonemes.ts +3 -7
- package/examples/text_to_speech/create_speech_with_voice_settings.ts +3 -8
- package/examples/text_to_speech/predict_duration.ts +3 -7
- package/examples/text_to_speech/stream_speech.ts +3 -7
- package/examples/text_to_speech/stream_speech_long_text.ts +3 -7
- package/examples/text_to_speech/stream_speech_with_phonemes.ts +3 -7
- package/examples/text_to_speech/stream_speech_with_voice_settings.ts +3 -7
- package/examples/usage/get_credit_balance.ts +2 -6
- package/examples/usage/get_usage.ts +2 -6
- package/examples/usage/get_voice_usage.ts +2 -7
- package/examples/voices/get_voice.ts +2 -6
- package/examples/voices/list_voices.ts +2 -6
- package/examples/voices/search_voices.ts +2 -7
- package/jsr.json +1 -1
- package/openapi.json +101 -9
- package/package.json +1 -1
- package/src/lib/config.ts +41 -41
- package/src/lib/custom_utils/index.ts +7 -0
- package/src/lib/custom_utils/pronunciation_utils.ts +193 -0
- package/src/lib/custom_utils/text_utils.ts +138 -7
- package/src/models/apiconverttexttospeechusingcharacterrequest.ts +62 -3
- package/src/models/predictttsdurationusingcharacterrequest.ts +64 -3
- package/src/sdk/texttospeech.ts +99 -68
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* Smoke test for multilingual sentence punctuation splitting in chunkText().
|
|
4
|
+
*
|
|
5
|
+
* Run:
|
|
6
|
+
* npx ts-node custom_test/test_text_utils_chunk_text_punctuation.ts
|
|
7
|
+
* # or after build:
|
|
8
|
+
* node dist/custom_test/test_text_utils_chunk_text_punctuation.js
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
import { chunkText } from "../src/lib/custom_utils/text_utils.js";
|
|
12
|
+
|
|
13
|
+
/**
 * Run `chunkText` on `text` and throw if the chunks differ from `expectedChunks`.
 *
 * Chunk lists are compared through their JSON serialization, so both the
 * chunk contents and their order must match.
 *
 * @param text - Input passed to `chunkText`
 * @param expectedChunks - Chunks the call is expected to produce, in order
 * @param maxLength - Maximum chunk length passed to `chunkText`
 * @throws Error describing the input, the expected chunks and the actual chunks
 */
function assertSplits(
  text: string,
  expectedChunks: string[],
  maxLength: number
): void {
  const actualJson = JSON.stringify(chunkText(text, maxLength));
  const expectedJson = JSON.stringify(expectedChunks);

  if (actualJson === expectedJson) {
    return;
  }

  throw new Error(
    `\ntext=${JSON.stringify(text)}\nexpected=${expectedJson}\ngot=${actualJson}`
  );
}
|
|
27
|
+
|
|
28
|
+
/**
 * Smoke-test entry point: checks one short sample per script family and
 * logs a success line once every sample has split as expected.
 * The first failing sample aborts the run via the error thrown by `assertSplits`.
 */
function main(): void {
  // Each row is [input text, expected chunks, maxLength].
  const cases: Array<[string, string[], number]> = [
    // English / many EU languages
    ["Hello. World!", ["Hello. ", "World!"], 8],
    // Korean (mostly ASCII punctuation in practice, plus ellipsis)
    ["안...반가… 네.", ["안...", "반가… ", "네."], 4],
    // Japanese
    ["こんにちは。元気ですか?はい!", ["こんにちは。", "元気ですか?", "はい!"], 6],
    // Arabic (short samples to avoid max_length merge issues)
    ["مر؟ نعم۔", ["مر؟ ", "نعم۔"], 5],
    // Hindi
    ["हाँ। नहीं॥", ["हाँ। ", "नहीं॥"], 6],
    // Greek question mark (U+037E)
    ["Γεια;Καλά.", ["Γεια;", "Καλά."], 5],
  ];

  for (const [text, expectedChunks, maxLength] of cases) {
    assertSplits(text, expectedChunks, maxLength);
  }

  console.log("OK: chunkText punctuation smoke test passed");
}

main();
|
|
55
|
+
|
|
@@ -31,8 +31,8 @@ export declare function serverURLFromOptions(options: SDKOptions): URL | null;
|
|
|
31
31
|
export declare const SDK_METADATA: {
|
|
32
32
|
readonly language: "typescript";
|
|
33
33
|
readonly openapiDocVersion: "0.8.69";
|
|
34
|
-
readonly sdkVersion: "0.1.
|
|
34
|
+
readonly sdkVersion: "0.1.3";
|
|
35
35
|
readonly genVersion: "2.686.7";
|
|
36
|
-
readonly userAgent: "speakeasy-sdk/typescript 0.1.
|
|
36
|
+
readonly userAgent: "speakeasy-sdk/typescript 0.1.2 2.686.7 0.8.69 @supertone/supertone";
|
|
37
37
|
};
|
|
38
38
|
//# sourceMappingURL=config.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"config.d.ts","sourceRoot":"","sources":["../../../src/lib/config.ts"],"names":[],"mappings":"AAIA,OAAO,EAAE,UAAU,EAAE,MAAM,WAAW,CAAC;AACvC,OAAO,EAAE,MAAM,EAAE,MAAM,aAAa,CAAC;AACrC,OAAO,EAAE,WAAW,EAAE,MAAM,cAAc,CAAC;AAG3C;;GAEG;AACH,eAAO,MAAM,UAAU,uCAKb,CAAC;AAEX,MAAM,MAAM,UAAU,GAAG;
|
|
1
|
+
{"version":3,"file":"config.d.ts","sourceRoot":"","sources":["../../../src/lib/config.ts"],"names":[],"mappings":"AAIA,OAAO,EAAE,UAAU,EAAE,MAAM,WAAW,CAAC;AACvC,OAAO,EAAE,MAAM,EAAE,MAAM,aAAa,CAAC;AACrC,OAAO,EAAE,WAAW,EAAE,MAAM,cAAc,CAAC;AAG3C;;GAEG;AACH,eAAO,MAAM,UAAU,uCAKb,CAAC;AAEX,MAAM,MAAM,UAAU,GAAG;IACxB,MAAM,CAAC,EAAE,MAAM,GAAG,CAAC,MAAM,OAAO,CAAC,MAAM,CAAC,CAAC,GAAG,SAAS,CAAC;IAEtD,UAAU,CAAC,EAAE,UAAU,CAAC;IACxB;;OAEG;IACH,SAAS,CAAC,EAAE,MAAM,GAAG,SAAS,CAAC;IAC/B;;OAEG;IACH,SAAS,CAAC,EAAE,MAAM,GAAG,SAAS,CAAC;IAC/B;;OAEG;IACH,SAAS,CAAC,EAAE,MAAM,GAAG,SAAS,CAAC;IAC/B;;OAEG;IACH,WAAW,CAAC,EAAE,WAAW,CAAC;IAC1B,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,WAAW,CAAC,EAAE,MAAM,CAAC;CACrB,CAAC;AAEF,wBAAgB,oBAAoB,CAAC,OAAO,EAAE,UAAU,GAAG,GAAG,GAAG,IAAI,CAepE;AAED,eAAO,MAAM,YAAY;;;;;;CAOf,CAAC"}
|
|
@@ -31,8 +31,8 @@ function serverURLFromOptions(options) {
|
|
|
31
31
|
exports.SDK_METADATA = {
|
|
32
32
|
language: "typescript",
|
|
33
33
|
openapiDocVersion: "0.8.69",
|
|
34
|
-
sdkVersion: "0.1.
|
|
34
|
+
sdkVersion: "0.1.3",
|
|
35
35
|
genVersion: "2.686.7",
|
|
36
|
-
userAgent: "speakeasy-sdk/typescript 0.1.
|
|
36
|
+
userAgent: "speakeasy-sdk/typescript 0.1.2 2.686.7 0.8.69 @supertone/supertone",
|
|
37
37
|
};
|
|
38
38
|
//# sourceMappingURL=config.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"config.js","sourceRoot":"","sources":["../../../src/lib/config.ts"],"names":[],"mappings":";AAAA;;GAEG;;;AAyCH,oDAeC;AAnDD,qCAA8C;AAE9C;;GAEG;AACU,QAAA,UAAU,GAAG;
|
|
1
|
+
{"version":3,"file":"config.js","sourceRoot":"","sources":["../../../src/lib/config.ts"],"names":[],"mappings":";AAAA;;GAEG;;;AAyCH,oDAeC;AAnDD,qCAA8C;AAE9C;;GAEG;AACU,QAAA,UAAU,GAAG;IACzB;;OAEG;IACH,0BAA0B;CACjB,CAAC;AA0BX,SAAgB,oBAAoB,CAAC,OAAmB;IACvD,IAAI,SAAS,GAAG,OAAO,CAAC,SAAS,CAAC;IAElC,MAAM,MAAM,GAAW,EAAE,CAAC;IAE1B,IAAI,CAAC,SAAS,EAAE,CAAC;QAChB,MAAM,SAAS,GAAG,OAAO,CAAC,SAAS,IAAI,CAAC,CAAC;QACzC,IAAI,SAAS,GAAG,CAAC,IAAI,SAAS,IAAI,kBAAU,CAAC,MAAM,EAAE,CAAC;YACrD,MAAM,IAAI,KAAK,CAAC,wBAAwB,SAAS,EAAE,CAAC,CAAC;QACtD,CAAC;QACD,SAAS,GAAG,kBAAU,CAAC,SAAS,CAAC,IAAI,EAAE,CAAC;IACzC,CAAC;IAED,MAAM,CAAC,GAAG,IAAA,mBAAU,EAAC,SAAS,CAAC,CAAC,MAAM,CAAC,CAAC;IACxC,OAAO,IAAI,GAAG,CAAC,CAAC,CAAC,CAAC;AACnB,CAAC;AAEY,QAAA,YAAY,GAAG;IAC3B,QAAQ,EAAE,YAAY;IACtB,iBAAiB,EAAE,QAAQ;IAC3B,UAAU,EAAE,OAAO;IACnB,UAAU,EAAE,SAAS;IACrB,SAAS,EACR,oEAAoE;CAC5D,CAAC"}
|
|
@@ -6,6 +6,7 @@
|
|
|
6
6
|
*/
|
|
7
7
|
export * from "./constants.js";
|
|
8
8
|
export { chunkText, extractAudioFromNdjson } from "./text_utils.js";
|
|
9
|
+
export { applyPronunciationDictionary, PronunciationDictionaryValidationError, type PronunciationDictionaryEntry, } from "./pronunciation_utils.js";
|
|
9
10
|
export { mergeWavBinary, mergeMp3Binary, removeWavHeader, removeMp3Header, detectAudioFormat, extractAudioFromResponse, extractAudioFromResponses, } from "./audio_utils.js";
|
|
10
11
|
export { mergePhonemeData, adjustPhonemeTiming, createEmptyPhonemeDict, type PhonemeData, } from "./phoneme_utils.js";
|
|
11
12
|
//# sourceMappingURL=index.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../../src/lib/custom_utils/index.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAGH,cAAc,gBAAgB,CAAC;AAG/B,OAAO,EAAE,SAAS,EAAE,sBAAsB,EAAE,MAAM,iBAAiB,CAAC;AAGpE,OAAO,EACN,cAAc,EACd,cAAc,EACd,eAAe,EACf,eAAe,EACf,iBAAiB,EACjB,wBAAwB,EACxB,yBAAyB,GACzB,MAAM,kBAAkB,CAAC;AAG1B,OAAO,EACN,gBAAgB,EAChB,mBAAmB,EACnB,sBAAsB,EACtB,KAAK,WAAW,GAChB,MAAM,oBAAoB,CAAC"}
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../../src/lib/custom_utils/index.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAGH,cAAc,gBAAgB,CAAC;AAG/B,OAAO,EAAE,SAAS,EAAE,sBAAsB,EAAE,MAAM,iBAAiB,CAAC;AAGpE,OAAO,EACN,4BAA4B,EAC5B,sCAAsC,EACtC,KAAK,4BAA4B,GACjC,MAAM,0BAA0B,CAAC;AAGlC,OAAO,EACN,cAAc,EACd,cAAc,EACd,eAAe,EACf,eAAe,EACf,iBAAiB,EACjB,wBAAwB,EACxB,yBAAyB,GACzB,MAAM,kBAAkB,CAAC;AAG1B,OAAO,EACN,gBAAgB,EAChB,mBAAmB,EACnB,sBAAsB,EACtB,KAAK,WAAW,GAChB,MAAM,oBAAoB,CAAC"}
|
|
@@ -20,13 +20,17 @@ var __exportStar = (this && this.__exportStar) || function(m, exports) {
|
|
|
20
20
|
for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
|
|
21
21
|
};
|
|
22
22
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
23
|
-
exports.createEmptyPhonemeDict = exports.adjustPhonemeTiming = exports.mergePhonemeData = exports.extractAudioFromResponses = exports.extractAudioFromResponse = exports.detectAudioFormat = exports.removeMp3Header = exports.removeWavHeader = exports.mergeMp3Binary = exports.mergeWavBinary = exports.extractAudioFromNdjson = exports.chunkText = void 0;
|
|
23
|
+
exports.createEmptyPhonemeDict = exports.adjustPhonemeTiming = exports.mergePhonemeData = exports.extractAudioFromResponses = exports.extractAudioFromResponse = exports.detectAudioFormat = exports.removeMp3Header = exports.removeWavHeader = exports.mergeMp3Binary = exports.mergeWavBinary = exports.PronunciationDictionaryValidationError = exports.applyPronunciationDictionary = exports.extractAudioFromNdjson = exports.chunkText = void 0;
|
|
24
24
|
// Export all constants
|
|
25
25
|
__exportStar(require("./constants.js"), exports);
|
|
26
26
|
// Export text utilities
|
|
27
27
|
var text_utils_js_1 = require("./text_utils.js");
|
|
28
28
|
Object.defineProperty(exports, "chunkText", { enumerable: true, get: function () { return text_utils_js_1.chunkText; } });
|
|
29
29
|
Object.defineProperty(exports, "extractAudioFromNdjson", { enumerable: true, get: function () { return text_utils_js_1.extractAudioFromNdjson; } });
|
|
30
|
+
// Export pronunciation utilities
|
|
31
|
+
var pronunciation_utils_js_1 = require("./pronunciation_utils.js");
|
|
32
|
+
Object.defineProperty(exports, "applyPronunciationDictionary", { enumerable: true, get: function () { return pronunciation_utils_js_1.applyPronunciationDictionary; } });
|
|
33
|
+
Object.defineProperty(exports, "PronunciationDictionaryValidationError", { enumerable: true, get: function () { return pronunciation_utils_js_1.PronunciationDictionaryValidationError; } });
|
|
30
34
|
// Export audio utilities
|
|
31
35
|
var audio_utils_js_1 = require("./audio_utils.js");
|
|
32
36
|
Object.defineProperty(exports, "mergeWavBinary", { enumerable: true, get: function () { return audio_utils_js_1.mergeWavBinary; } });
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../../../src/lib/custom_utils/index.ts"],"names":[],"mappings":";AAAA;;;;;GAKG;;;;;;;;;;;;;;;;;AAEH,uBAAuB;AACvB,iDAA+B;AAE/B,wBAAwB;AACxB,iDAAoE;AAA3D,0GAAA,SAAS,OAAA;AAAE,uHAAA,sBAAsB,OAAA;AAE1C,yBAAyB;AACzB,mDAQ0B;AAPzB,gHAAA,cAAc,OAAA;AACd,gHAAA,cAAc,OAAA;AACd,iHAAA,eAAe,OAAA;AACf,iHAAA,eAAe,OAAA;AACf,mHAAA,iBAAiB,OAAA;AACjB,0HAAA,wBAAwB,OAAA;AACxB,2HAAA,yBAAyB,OAAA;AAG1B,2BAA2B;AAC3B,uDAK4B;AAJ3B,oHAAA,gBAAgB,OAAA;AAChB,uHAAA,mBAAmB,OAAA;AACnB,0HAAA,sBAAsB,OAAA"}
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../../../src/lib/custom_utils/index.ts"],"names":[],"mappings":";AAAA;;;;;GAKG;;;;;;;;;;;;;;;;;AAEH,uBAAuB;AACvB,iDAA+B;AAE/B,wBAAwB;AACxB,iDAAoE;AAA3D,0GAAA,SAAS,OAAA;AAAE,uHAAA,sBAAsB,OAAA;AAE1C,iCAAiC;AACjC,mEAIkC;AAHjC,sIAAA,4BAA4B,OAAA;AAC5B,gJAAA,sCAAsC,OAAA;AAIvC,yBAAyB;AACzB,mDAQ0B;AAPzB,gHAAA,cAAc,OAAA;AACd,gHAAA,cAAc,OAAA;AACd,iHAAA,eAAe,OAAA;AACf,iHAAA,eAAe,OAAA;AACf,mHAAA,iBAAiB,OAAA;AACjB,0HAAA,wBAAwB,OAAA;AACxB,2HAAA,yBAAyB,OAAA;AAG1B,2BAA2B;AAC3B,uDAK4B;AAJ3B,oHAAA,gBAAgB,OAAA;AAChB,uHAAA,mBAAmB,OAAA;AACnB,0HAAA,sBAAsB,OAAA"}
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Pronunciation dictionary substitution utilities.
|
|
3
|
+
*
|
|
4
|
+
* Mirrors the Python implementation policy:
|
|
5
|
+
* - Apply rules in input order
|
|
6
|
+
* - partial_match=false: word-boundary exact matches only
|
|
7
|
+
* - partial_match=true: substring matches (no boundaries)
|
|
8
|
+
* - No re-substitution: replaced segments are shielded via opaque tokens
|
|
9
|
+
*
|
|
10
|
+
* Validation:
|
|
11
|
+
* - pronunciation_dictionary omitted/undefined/null -> return original text
|
|
12
|
+
* - pronunciation_dictionary must be an array of objects
|
|
13
|
+
* - each object must have: text (string, non-empty), pronunciation (string, non-empty), partial_match (boolean)
|
|
14
|
+
*/
|
|
15
|
+
export declare class PronunciationDictionaryValidationError extends Error {
|
|
16
|
+
constructor(message: string);
|
|
17
|
+
}
|
|
18
|
+
export type PronunciationDictionaryEntry = {
|
|
19
|
+
text: string;
|
|
20
|
+
pronunciation: string;
|
|
21
|
+
partial_match: boolean;
|
|
22
|
+
};
|
|
23
|
+
export declare function applyPronunciationDictionary(text: string, pronunciation_dictionary?: unknown): string;
|
|
24
|
+
//# sourceMappingURL=pronunciation_utils.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"pronunciation_utils.d.ts","sourceRoot":"","sources":["../../../../src/lib/custom_utils/pronunciation_utils.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;GAaG;AAEH,qBAAa,sCAAuC,SAAQ,KAAK;gBACnD,OAAO,EAAE,MAAM;CAI5B;AAED,MAAM,MAAM,4BAA4B,GAAG;IACzC,IAAI,EAAE,MAAM,CAAC;IACb,aAAa,EAAE,MAAM,CAAC;IACtB,aAAa,EAAE,OAAO,CAAC;CACxB,CAAC;AAEF,wBAAgB,4BAA4B,CAC1C,IAAI,EAAE,MAAM,EACZ,wBAAwB,CAAC,EAAE,OAAO,GACjC,MAAM,CAuER"}
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Pronunciation dictionary substitution utilities.
|
|
4
|
+
*
|
|
5
|
+
* Mirrors the Python implementation policy:
|
|
6
|
+
* - Apply rules in input order
|
|
7
|
+
* - partial_match=false: word-boundary exact matches only
|
|
8
|
+
* - partial_match=true: substring matches (no boundaries)
|
|
9
|
+
* - No re-substitution: replaced segments are shielded via opaque tokens
|
|
10
|
+
*
|
|
11
|
+
* Validation:
|
|
12
|
+
* - pronunciation_dictionary omitted/undefined/null -> return original text
|
|
13
|
+
* - pronunciation_dictionary must be an array of objects
|
|
14
|
+
* - each object must have: text (string, non-empty), pronunciation (string, non-empty), partial_match (boolean)
|
|
15
|
+
*/
|
|
16
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
17
|
+
exports.PronunciationDictionaryValidationError = void 0;
|
|
18
|
+
exports.applyPronunciationDictionary = applyPronunciationDictionary;
|
|
19
|
+
/**
 * Error raised when the text or a pronunciation dictionary entry fails validation.
 * Instances carry `name === "PronunciationDictionaryValidationError"` so callers
 * can tell them apart from other errors by name as well as by `instanceof`.
 */
class PronunciationDictionaryValidationError extends Error {
    /**
     * @param message - Human-readable description of the validation failure
     */
    constructor(message) {
        super(message);
        // Override the inherited "Error" name with this class's own name.
        Object.assign(this, { name: "PronunciationDictionaryValidationError" });
    }
}
|
|
25
|
+
exports.PronunciationDictionaryValidationError = PronunciationDictionaryValidationError;
|
|
26
|
+
/**
 * Apply a pronunciation dictionary to `text`.
 *
 * Rules are applied in input order. A rule with `partial_match: true` replaces
 * every occurrence of its `text` as a plain substring; a rule with
 * `partial_match: false` replaces only occurrences that are not adjacent to a
 * letter, number or underscore on either side.
 *
 * Text produced by an earlier rule is never matched by a later rule. This is
 * tracked out of band: the working text is kept as a list of segments, each
 * flagged as raw (still eligible for matching) or replaced (final). Nothing is
 * ever inserted into the text being searched, so a rule such as "0" or "PD"
 * cannot hit an internal placeholder and corrupt an earlier replacement.
 *
 * @param text - Input text
 * @param pronunciation_dictionary - Array of `{ text, pronunciation, partial_match }`
 *   objects; `null`, `undefined` or an empty array return `text` unchanged
 * @returns The text with all matching rules applied
 * @throws PronunciationDictionaryValidationError if `text` is not a string, the
 *   dictionary is not an array, or any entry is malformed
 */
function applyPronunciationDictionary(text, pronunciation_dictionary) {
    // Match Python behavior: return early for null, undefined, or empty array
    if (pronunciation_dictionary == null ||
        (Array.isArray(pronunciation_dictionary) &&
            pronunciation_dictionary.length === 0)) {
        return text;
    }
    if (typeof text !== "string") {
        throw new PronunciationDictionaryValidationError(`\`text\` must be string, got ${typeof text}`);
    }
    if (!Array.isArray(pronunciation_dictionary)) {
        throw new PronunciationDictionaryValidationError("`pronunciation_dictionary` must be an array of objects");
    }
    let segments = [{ value: text, replaced: false }];
    for (let idx = 0; idx < pronunciation_dictionary.length; idx++) {
        // Entries are validated lazily, in order, so the first malformed entry throws.
        const entry = validateEntry(pronunciation_dictionary[idx], idx);
        const nextSegments = [];
        for (const segment of segments) {
            if (segment.replaced) {
                // Already-substituted text is final: later rules must not see it.
                nextSegments.push(segment);
                continue;
            }
            nextSegments.push(...substituteInRawText(segment.value, entry));
        }
        segments = nextSegments;
    }
    return segments.map((segment) => segment.value).join("");
}
/**
 * Apply one validated rule to one raw (not yet substituted) piece of text.
 *
 * @returns The piece split into raw and replaced segments, in original order
 */
function substituteInRawText(raw, entry) {
    const ranges = entry.partial_match
        ? findSubstringRanges(raw, entry.text)
        : findWholeWordRanges(raw, entry.text);
    if (ranges.length === 0) {
        return [{ value: raw, replaced: false }];
    }
    const pieces = [];
    let cursor = 0;
    for (const [start, end] of ranges) {
        if (start > cursor) {
            pieces.push({ value: raw.slice(cursor, start), replaced: false });
        }
        pieces.push({ value: entry.pronunciation, replaced: true });
        cursor = end;
    }
    if (cursor < raw.length) {
        pieces.push({ value: raw.slice(cursor), replaced: false });
    }
    return pieces;
}
/**
 * Find every non-overlapping occurrence of `needle` in `raw`, scanning left to right.
 *
 * @returns Array of `[start, end)` index pairs
 */
function findSubstringRanges(raw, needle) {
    const ranges = [];
    let start = raw.indexOf(needle);
    while (start !== -1) {
        const end = start + needle.length;
        ranges.push([start, end]);
        start = raw.indexOf(needle, end);
    }
    return ranges;
}
/**
 * Find every occurrence of `word` in `raw` that stands on its own, i.e. is not
 * directly preceded or followed by a letter, number or underscore.
 *
 * JS `\w` is ASCII-only, so "word character" is spelled out with Unicode
 * property escapes. The left boundary is captured instead of using lookbehind,
 * and its length is added back to get the real start of the word. The start
 * and end of `raw` count as boundaries.
 *
 * @returns Array of `[start, end)` index pairs
 */
function findWholeWordRanges(raw, word) {
    // Must not include surrounding [] because it is embedded into other [].
    const WORD_CHARS = "\\p{L}\\p{N}_";
    const pattern = `(^|[^${WORD_CHARS}])(${escapeRegExp(word)})(?=[^${WORD_CHARS}]|$)`;
    const ranges = [];
    for (const match of raw.matchAll(new RegExp(pattern, "gu"))) {
        const start = match.index + match[1].length;
        ranges.push([start, match.index + match[0].length]);
    }
    return ranges;
}
/**
 * Validate one raw dictionary entry and return it in normalized form.
 *
 * @param raw - Candidate entry taken from the caller's array
 * @param idx - Position of the entry, used in error messages
 * @returns `{ text, pronunciation, partial_match }` with checked types
 * @throws PronunciationDictionaryValidationError if the entry is not a plain
 *   object, lacks a required field, has a field of the wrong type, or has an
 *   empty `text` or `pronunciation`
 */
function validateEntry(raw, idx) {
    if (raw == null || typeof raw !== "object" || Array.isArray(raw)) {
        throw new PronunciationDictionaryValidationError(`pronunciation_dictionary[${idx}] must be an object, got ${raw === null ? "null" : Array.isArray(raw) ? "array" : typeof raw}`);
    }
    const obj = raw;
    // Report all missing fields at once rather than one per call.
    const missing = [];
    if (!("text" in obj))
        missing.push("text");
    if (!("pronunciation" in obj))
        missing.push("pronunciation");
    if (!("partial_match" in obj))
        missing.push("partial_match");
    if (missing.length) {
        throw new PronunciationDictionaryValidationError(`pronunciation_dictionary[${idx}] missing required field(s): ${missing.join(", ")}`);
    }
    const src = obj["text"];
    const dst = obj["pronunciation"];
    const partial = obj["partial_match"];
    if (typeof src !== "string") {
        throw new PronunciationDictionaryValidationError(`pronunciation_dictionary[${idx}].text must be string, got ${typeof src}`);
    }
    if (typeof dst !== "string") {
        throw new PronunciationDictionaryValidationError(`pronunciation_dictionary[${idx}].pronunciation must be string, got ${typeof dst}`);
    }
    if (typeof partial !== "boolean") {
        throw new PronunciationDictionaryValidationError(`pronunciation_dictionary[${idx}].partial_match must be boolean, got ${typeof partial}`);
    }
    if (src === "") {
        throw new PronunciationDictionaryValidationError(`pronunciation_dictionary[${idx}].text must not be empty`);
    }
    if (dst === "") {
        throw new PronunciationDictionaryValidationError(`pronunciation_dictionary[${idx}].pronunciation must not be empty`);
    }
    return { text: src, pronunciation: dst, partial_match: partial };
}
/**
 * Escape regular-expression syntax characters in `s` so it matches literally
 * when embedded in a pattern.
 */
function escapeRegExp(s) {
    return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}
|
|
122
|
+
/**
 * Build a placeholder token for rule `idx` that does not occur in `working`
 * and is not already a key of `existing`.
 *
 * The token is wrapped in Private Use Area markers (U+E000 ... U+E001) to
 * minimize collision with typical text. The plain `PD<idx>` form is tried
 * first; if it is taken, a random hex suffix is appended until a free token
 * is found.
 *
 * @param idx - Index of the dictionary rule the token stands for
 * @param working - Text the token will be inserted into
 * @param existing - Map whose keys are the tokens already in use
 * @returns A token absent from both `working` and `existing`
 */
function makeUniqueToken(idx, working, existing) {
    const isTaken = (candidate) => working.includes(candidate) || existing.has(candidate);
    let candidate = `\uE000PD${idx}\uE001`;
    while (isTaken(candidate)) {
        candidate = `\uE000PD${idx}_${safeRandomHex()}\uE001`;
    }
    return candidate;
}
|
|
134
|
+
/**
 * Produce a random hexadecimal string for use as a uniqueness suffix.
 *
 * Uses `crypto.randomUUID()` with the dashes removed when the global `crypto`
 * object offers it. Otherwise falls back to two `Math.random()` fragments plus
 * the current timestamp, all in base 16. The fallback is not cryptographically
 * strong, which is acceptable because the value only needs to be unlikely to
 * repeat.
 *
 * @returns A non-empty string of lowercase hex digits
 */
function safeRandomHex() {
    const cryptoApi = globalThis.crypto;
    const hasRandomUuid = Boolean(cryptoApi) && typeof cryptoApi.randomUUID === "function";
    if (!hasRandomUuid) {
        const randomFragment = () => Math.random().toString(16).slice(2);
        return randomFragment() + randomFragment() + Date.now().toString(16);
    }
    return String(cryptoApi.randomUUID()).split("-").join("");
}
|
|
145
|
+
//# sourceMappingURL=pronunciation_utils.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"pronunciation_utils.js","sourceRoot":"","sources":["../../../../src/lib/custom_utils/pronunciation_utils.ts"],"names":[],"mappings":";AAAA;;;;;;;;;;;;;GAaG;;;AAeH,oEA0EC;AAvFD,MAAa,sCAAuC,SAAQ,KAAK;IAC/D,YAAY,OAAe;QACzB,KAAK,CAAC,OAAO,CAAC,CAAC;QACf,IAAI,CAAC,IAAI,GAAG,wCAAwC,CAAC;IACvD,CAAC;CACF;AALD,wFAKC;AAQD,SAAgB,4BAA4B,CAC1C,IAAY,EACZ,wBAAkC;IAElC,0EAA0E;IAC1E,IACE,wBAAwB,IAAI,IAAI;QAChC,CAAC,KAAK,CAAC,OAAO,CAAC,wBAAwB,CAAC;YACtC,wBAAwB,CAAC,MAAM,KAAK,CAAC,CAAC,EACxC,CAAC;QACD,OAAO,IAAI,CAAC;IACd,CAAC;IAED,IAAI,OAAO,IAAI,KAAK,QAAQ,EAAE,CAAC;QAC7B,MAAM,IAAI,sCAAsC,CAC9C,gCAAgC,OAAO,IAAI,EAAE,CAC9C,CAAC;IACJ,CAAC;IAED,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,wBAAwB,CAAC,EAAE,CAAC;QAC7C,MAAM,IAAI,sCAAsC,CAC9C,wDAAwD,CACzD,CAAC;IACJ,CAAC;IAED,2BAA2B;IAC3B,mDAAmD;IACnD,mDAAmD;IACnD,MAAM,oBAAoB,GAAG,IAAI,GAAG,EAAkB,CAAC;IACvD,IAAI,OAAO,GAAG,IAAI,CAAC;IAEnB,KAAK,IAAI,GAAG,GAAG,CAAC,EAAE,GAAG,GAAG,wBAAwB,CAAC,MAAM,EAAE,GAAG,EAAE,EAAE,CAAC;QAC/D,MAAM,KAAK,GAAG,aAAa,CAAC,wBAAwB,CAAC,GAAG,CAAC,EAAE,GAAG,CAAC,CAAC;QAChE,MAAM,GAAG,GAAG,KAAK,CAAC,IAAI,CAAC;QACvB,MAAM,GAAG,GAAG,KAAK,CAAC,aAAa,CAAC;QAChC,MAAM,OAAO,GAAG,KAAK,CAAC,aAAa,CAAC;QAEpC,MAAM,KAAK,GAAG,eAAe,CAAC,GAAG,EAAE,OAAO,EAAE,oBAAoB,CAAC,CAAC;QAElE,IAAI,OAAO,EAAE,CAAC;YACZ,MAAM,EAAE,GAAG,IAAI,MAAM,CAAC,YAAY,CAAC,GAAG,CAAC,EAAE,GAAG,CAAC,CAAC;YAC9C,MAAM,UAAU,GAAG,OAAO,CAAC,OAAO,CAAC,EAAE,EAAE,KAAK,CAAC,CAAC;YAC9C,IAAI,UAAU,KAAK,OAAO;gBAAE,SAAS,CAAC,iBAAiB;YACvD,oBAAoB,CAAC,GAAG,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC;YACrC,OAAO,GAAG,UAAU,CAAC;YACrB,SAAS;QACX,CAAC;QAED,gEAAgE;QAChE,6FAA6F;QAC7F,4DAA4D;QAC5D,EAAE;QACF,wFAAwF;QACxF,EAAE;QACF,uDAAuD;QACvD,4FAA4F;QAC5F,MAAM,UAAU,GAAG,eAAe,CAAC;QACnC,MAAM,MAAM,GAAG,YAAY,CAAC,GAAG,CAAC,CAAC;QACjC,MAAM,OAAO,GAAG,QAAQ,UAAU,MAAM,MAAM,SAAS,UAAU,MAAM,CAAC;QACxE,MAAM,EAAE,GAAG,IAAI,MAAM,CAAC,OAAO,EAAE,IAAI,CAAC,CAAC;QAErC,8CAA8C;QAC9C,MAAM,UAAU,GAAG,OAAO,CAAC,OAAO,CAAC,EAAE,EAAE,KAAK,KAAK,EAAE,CAAC,CAAC;QACrD,IAAI,UAAU,KAAK,OAAO;YAAE,SAAS,CAAC,iBAAiB;QACvD,oBAAoB,CAAC,GAAG,CAA
C,KAAK,EAAE,GAAG,CAAC,CAAC;QACrC,OAAO,GAAG,UAAU,CAAC;IACvB,CAAC;IAED,qCAAqC;IACrC,KAAK,MAAM,CAAC,KAAK,EAAE,IAAI,CAAC,IAAI,oBAAoB,CAAC,OAAO,EAAE,EAAE,CAAC;QAC3D,OAAO,GAAG,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAC5C,CAAC;IAED,OAAO,OAAO,CAAC;AACjB,CAAC;AAED,SAAS,aAAa,CAAC,GAAY,EAAE,GAAW;IAC9C,IAAI,GAAG,IAAI,IAAI,IAAI,OAAO,GAAG,KAAK,QAAQ,IAAI,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC;QACjE,MAAM,IAAI,sCAAsC,CAC9C,4BAA4B,GAAG,4BAC7B,GAAG,KAAK,IAAI,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,OAAO,GAChE,EAAE,CACH,CAAC;IACJ,CAAC;IAED,MAAM,GAAG,GAAG,GAA8B,CAAC;IAC3C,MAAM,OAAO,GAAa,EAAE,CAAC;IAC7B,IAAI,CAAC,CAAC,MAAM,IAAI,GAAG,CAAC;QAAE,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;IAC3C,IAAI,CAAC,CAAC,eAAe,IAAI,GAAG,CAAC;QAAE,OAAO,CAAC,IAAI,CAAC,eAAe,CAAC,CAAC;IAC7D,IAAI,CAAC,CAAC,eAAe,IAAI,GAAG,CAAC;QAAE,OAAO,CAAC,IAAI,CAAC,eAAe,CAAC,CAAC;IAC7D,IAAI,OAAO,CAAC,MAAM,EAAE,CAAC;QACnB,MAAM,IAAI,sCAAsC,CAC9C,4BAA4B,GAAG,gCAAgC,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CACpF,CAAC;IACJ,CAAC;IAED,MAAM,GAAG,GAAG,GAAG,CAAC,MAAM,CAAC,CAAC;IACxB,MAAM,GAAG,GAAG,GAAG,CAAC,eAAe,CAAC,CAAC;IACjC,MAAM,OAAO,GAAG,GAAG,CAAC,eAAe,CAAC,CAAC;IAErC,IAAI,OAAO,GAAG,KAAK,QAAQ,EAAE,CAAC;QAC5B,MAAM,IAAI,sCAAsC,CAC9C,4BAA4B,GAAG,8BAA8B,OAAO,GAAG,EAAE,CAC1E,CAAC;IACJ,CAAC;IACD,IAAI,OAAO,GAAG,KAAK,QAAQ,EAAE,CAAC;QAC5B,MAAM,IAAI,sCAAsC,CAC9C,4BAA4B,GAAG,uCAAuC,OAAO,GAAG,EAAE,CACnF,CAAC;IACJ,CAAC;IACD,IAAI,OAAO,OAAO,KAAK,SAAS,EAAE,CAAC;QACjC,MAAM,IAAI,sCAAsC,CAC9C,4BAA4B,GAAG,wCAAwC,OAAO,OAAO,EAAE,CACxF,CAAC;IACJ,CAAC;IAED,IAAI,GAAG,KAAK,EAAE,EAAE,CAAC;QACf,MAAM,IAAI,sCAAsC,CAC9C,4BAA4B,GAAG,0BAA0B,CAC1D,CAAC;IACJ,CAAC;IACD,IAAI,GAAG,KAAK,EAAE,EAAE,CAAC;QACf,MAAM,IAAI,sCAAsC,CAC9C,4BAA4B,GAAG,mCAAmC,CACnE,CAAC;IACJ,CAAC;IAED,OAAO,EAAE,IAAI,EAAE,GAAG,EAAE,aAAa,EAAE,GAAG,EAAE,aAAa,EAAE,OAAO,EAAE,CAAC;AACnE,CAAC;AAED,SAAS,YAAY,CAAC,CAAS;IAC7B,OAAO,CAAC,CAAC,OAAO,CAAC,qBAAqB,EAAE,MAAM,CAAC,CAAC;AAClD,CAAC;AAED,SAAS,eAAe,CACtB,GAAW,EACX,OAAe,EACf,QAA6B;IAE7B
,oEAAoE;IACpE,MAAM,IAAI,GAAG,WAAW,GAAG,QAAQ,CAAC;IACpC,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,IAAI,CAAC;QAAE,OAAO,IAAI,CAAC;IAEhE,OAAO,IAAI,EAAE,CAAC;QACZ,MAAM,MAAM,GAAG,aAAa,EAAE,CAAC;QAC/B,MAAM,KAAK,GAAG,WAAW,GAAG,IAAI,MAAM,QAAQ,CAAC;QAC/C,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,KAAK,CAAC;YAAE,OAAO,KAAK,CAAC;IACrE,CAAC;AACH,CAAC;AAED,SAAS,aAAa;IACpB,sEAAsE;IACtE,MAAM,CAAC,GAAI,UAAkB,CAAC,MAAM,CAAC;IACrC,IAAI,CAAC,IAAI,OAAO,CAAC,CAAC,UAAU,KAAK,UAAU,EAAE,CAAC;QAC5C,OAAO,MAAM,CAAC,CAAC,CAAC,UAAU,EAAE,CAAC,CAAC,OAAO,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC;IAClD,CAAC;IACD,gFAAgF;IAChF,OAAO,CACL,IAAI,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC;QACnC,IAAI,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC;QACnC,IAAI,CAAC,GAAG,EAAE,CAAC,QAAQ,CAAC,EAAE,CAAC,CACxB,CAAC;AACJ,CAAC"}
|
|
@@ -12,9 +12,16 @@
|
|
|
12
12
|
* It handles various punctuation patterns and provides graceful fallback to
|
|
13
13
|
* word/character boundaries when necessary.
|
|
14
14
|
*
|
|
15
|
+
* Chunking Strategy:
|
|
16
|
+
* 1. First, split by sentence boundaries (multilingual punctuation)
|
|
17
|
+
* 2. Merge sentences into chunks up to maxLength
|
|
18
|
+
* 3. If a sentence exceeds maxLength:
|
|
19
|
+
* - For text with spaces: split by words
|
|
20
|
+
* - For text without spaces (Japanese, etc.): split by characters
|
|
21
|
+
*
|
|
15
22
|
* @param text - Input text to be segmented
|
|
16
23
|
* @param maxLength - Maximum length of each chunk
|
|
17
|
-
* @returns Array of text chunks
|
|
24
|
+
* @returns Array of text chunks, each guaranteed to be <= maxLength
|
|
18
25
|
*/
|
|
19
26
|
export declare function chunkText(text: string, maxLength?: number): string[];
|
|
20
27
|
/**
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"text_utils.d.ts","sourceRoot":"","sources":["../../../../src/lib/custom_utils/text_utils.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;
|
|
1
|
+
{"version":3,"file":"text_utils.d.ts","sourceRoot":"","sources":["../../../../src/lib/custom_utils/text_utils.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAkHH;;;;;;;;;;;;;;;;;;GAkBG;AACH,wBAAgB,SAAS,CACxB,IAAI,EAAE,MAAM,EACZ,SAAS,GAAE,MAAgC,GACzC,MAAM,EAAE,CAyCV;AAED;;;;;;;;GAQG;AACH,wBAAgB,sBAAsB,CAAC,SAAS,EAAE,MAAM,GAAG,UAAU,CA8BpE"}
|
|
@@ -9,6 +9,103 @@ Object.defineProperty(exports, "__esModule", { value: true });
|
|
|
9
9
|
exports.chunkText = chunkText;
|
|
10
10
|
exports.extractAudioFromNdjson = extractAudioFromNdjson;
|
|
11
11
|
const constants_js_1 = require("./constants.js");
|
|
12
|
+
/**
 * Sentence-ending punctuation used to find chunk boundaries in multilingual text.
 *
 * Supported languages: English, Korean, Japanese, Bulgarian, Czech, Danish,
 * Greek, Spanish, Estonian, Finnish, Hungarian, Italian, Dutch, Polish,
 * Portuguese, Romanian, Arabic, German, French, Hindi, Indonesian, Russian,
 * Vietnamese, Chinese, Thai, and more.
 *
 * Every non-ASCII mark is written as a \u escape on purpose: fullwidth and
 * halfwidth forms fold to their ASCII / ideographic counterparts under Unicode
 * compatibility normalization, so literal glyphs can silently turn into
 * duplicates of the ASCII marks when the file passes through such a step.
 *
 * Punctuation groups:
 * - ASCII basics: . ! ? ; :
 * - Ellipsis: U+2026 (horizontal ellipsis), U+2025 (two dot leader)
 * - CJK ideographic: U+3002 (full stop), U+3001 (comma)
 * - CJK fullwidth: U+FF01 (!), U+FF1F (?), U+FF1B (;), U+FF1A (:)
 * - CJK halfwidth: U+FF61 (full stop), U+FF64 (comma)
 * - Arabic/Urdu: U+061F (question mark), U+061B (semicolon), U+06D4 (full stop), U+060C (comma)
 * - Devanagari (Hindi/Sanskrit): U+0964 (danda), U+0965 (double danda)
 * - Greek question mark: U+037E
 */
const SENTENCE_PUNCTUATION = ".!?;:" +
    "\u2026\u2025" +
    "\u3002\u3001" +
    "\uFF01\uFF1F\uFF1B\uFF1A" +
    "\uFF61\uFF64" +
    "\u061F\u061B\u06D4\u060C" +
    "\u0964\u0965" +
    "\u037E";
// The capture group keeps each delimiter (plus trailing whitespace) in the
// result of String.prototype.split, so no characters are lost when splitting.
const SENTENCE_SPLIT_PATTERN = new RegExp(`([${SENTENCE_PUNCTUATION}]+\\s*)`, "u");
|
|
30
|
+
/**
 * Report whether the text contains any whitespace character, which decides
 * whether it can be broken on word boundaries.
 *
 * @param text - Text to inspect
 * @returns true when at least one whitespace character is present
 */
function hasSpaces(text) {
    const firstWhitespace = /\s/.exec(text);
    return firstWhitespace !== null;
}
|
|
39
|
+
/**
 * Break text on whitespace boundaries into chunks no longer than maxLength.
 * Intended for languages that separate words with spaces (English, Korean, etc.).
 *
 * Whitespace runs are kept as separate tokens while a chunk is being built and
 * are trimmed from both ends of every chunk that is emitted. A single word
 * longer than maxLength is handed to splitByCharacters.
 *
 * @param text - Text to split
 * @param maxLength - Maximum length of each chunk
 * @returns Array of text chunks
 */
function splitByWords(text, maxLength) {
    const pieces = [];
    let pending = "";
    // Emit the pending buffer without surrounding whitespace; a buffer that is
    // empty or whitespace-only is dropped.
    const flushPending = () => {
        const trimmed = pending.trim();
        if (trimmed) {
            pieces.push(trimmed);
        }
    };
    // The capture group makes split() return the whitespace runs as tokens too.
    for (const token of text.split(/(\s+)/)) {
        if (pending.length + token.length <= maxLength) {
            pending += token;
            continue;
        }
        // Token does not fit: close the current chunk and start over with it.
        flushPending();
        const bareToken = token.trim();
        if (bareToken.length > maxLength) {
            // A single word longer than the limit has to be cut by characters.
            pieces.push(...splitByCharacters(bareToken, maxLength));
            pending = "";
        }
        else {
            pending = token;
        }
    }
    flushPending();
    return pieces;
}
|
|
75
|
+
/**
 * Split text into consecutive slices of at most maxLength UTF-16 code units.
 * Used for text without whitespace (Japanese, Chinese, etc.) and for single
 * words that are longer than maxLength.
 *
 * A slice never ends between the two halves of a surrogate pair (emoji, rare
 * CJK ideographs) unless maxLength is 1, where keeping the pair together would
 * break the length limit.
 *
 * @param text - Text to split
 * @param maxLength - Maximum length of each chunk; fractional values are rounded down
 * @returns Array of text chunks (empty array for empty text)
 * @throws RangeError if maxLength is not a number >= 1
 */
function splitByCharacters(text, maxLength) {
    // A step of 0, a negative step or NaN would loop forever or drop the text.
    if (!(maxLength >= 1)) {
        throw new RangeError(`maxLength must be a number >= 1, got ${maxLength}`);
    }
    const step = Math.floor(maxLength);
    const chunks = [];
    let start = 0;
    while (start < text.length) {
        let end = Math.min(start + step, text.length);
        if (end < text.length && end - start > 1) {
            const lastUnit = text.charCodeAt(end - 1);
            const nextUnit = text.charCodeAt(end);
            const endsOnHighSurrogate = lastUnit >= 0xd800 && lastUnit <= 0xdbff;
            const nextIsLowSurrogate = nextUnit >= 0xdc00 && nextUnit <= 0xdfff;
            // Back off one unit so the whole pair moves to the next chunk.
            if (endsOnHighSurrogate && nextIsLowSurrogate) {
                end -= 1;
            }
        }
        chunks.push(text.slice(start, end));
        start = end;
    }
    return chunks;
}
|
|
90
|
+
/**
 * Reduce one chunk to pieces that respect maxLength.
 * A chunk that already fits is returned unchanged as a one-element array.
 * Otherwise chunks containing whitespace are split on word boundaries and
 * chunks without whitespace are split by characters.
 *
 * @param chunk - Text chunk to split
 * @param maxLength - Maximum length of each chunk
 * @returns Array of text chunks produced by the selected splitter
 */
function splitOversizedChunk(chunk, maxLength) {
    const alreadyFits = chunk.length <= maxLength;
    if (alreadyFits) {
        return [chunk];
    }
    // Word-based splitting needs whitespace; otherwise fall back to
    // character-based splitting (Japanese, Chinese, etc.).
    const splitter = hasSpaces(chunk) ? splitByWords : splitByCharacters;
    return splitter(chunk, maxLength);
}
|
|
12
109
|
/**
|
|
13
110
|
* Split input text into sentence chunks suitable for TTS processing.
|
|
14
111
|
*
|
|
@@ -17,33 +114,54 @@ const constants_js_1 = require("./constants.js");
|
|
|
17
114
|
* It handles various punctuation patterns and provides graceful fallback to
|
|
18
115
|
* word/character boundaries when necessary.
|
|
19
116
|
*
|
|
117
|
+
* Chunking Strategy:
|
|
118
|
+
* 1. First, split by sentence boundaries (multilingual punctuation)
|
|
119
|
+
* 2. Merge sentences into chunks up to maxLength
|
|
120
|
+
* 3. If a sentence exceeds maxLength:
|
|
121
|
+
* - For text with spaces: split by words
|
|
122
|
+
* - For text without spaces (Japanese, etc.): split by characters
|
|
123
|
+
*
|
|
20
124
|
* @param text - Input text to be segmented
|
|
21
125
|
* @param maxLength - Maximum length of each chunk
|
|
22
|
-
* @returns Array of text chunks
|
|
126
|
+
* @returns Array of text chunks, each guaranteed to be <= maxLength
|
|
23
127
|
*/
|
|
24
128
|
function chunkText(text, maxLength = constants_js_1.DEFAULT_MAX_TEXT_LENGTH) {
    // Fast path: the whole text already fits into a single chunk.
    if (text.length <= maxLength) {
        return [text];
    }
    // Throws RangeError for a limit that is not a number >= 1 (0, negative,
    // NaN, null): the word/character fallbacks cannot make progress with such
    // a limit and would either never terminate or silently drop text.
    if (!(maxLength >= 1)) {
        throw new RangeError(`maxLength must be a number >= 1, got ${maxLength}`);
    }
    // Step 1: Split by sentence boundaries (multilingual punctuation).
    // The pattern has a capture group, so delimiters stay in the array as
    // separate entries and nothing is lost when pieces are concatenated again.
    const sentences = text.split(SENTENCE_SPLIT_PATTERN);
    // Step 2: Greedily merge consecutive pieces into chunks up to maxLength.
    const preliminaryChunks = [];
    let currentChunk = "";
    for (const sentence of sentences) {
        if (currentChunk.length + sentence.length <= maxLength) {
            currentChunk += sentence;
        }
        else {
            if (currentChunk) {
                preliminaryChunks.push(currentChunk);
            }
            // May itself exceed maxLength; handled in step 3.
            currentChunk = sentence;
        }
    }
    if (currentChunk) {
        preliminaryChunks.push(currentChunk);
    }
    // Step 3: Split chunks that are still oversized (by words or characters).
    const finalChunks = [];
    for (const chunk of preliminaryChunks) {
        if (chunk.length <= maxLength) {
            finalChunks.push(chunk);
        }
        else {
            finalChunks.push(...splitOversizedChunk(chunk, maxLength));
        }
    }
    // Filter out empty chunks
    return finalChunks.filter((chunk) => chunk.length > 0);
}
|
|
48
166
|
/**
|
|
49
167
|
* Extract audio data from NDJSON response.
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"text_utils.js","sourceRoot":"","sources":["../../../../src/lib/custom_utils/text_utils.ts"],"names":[],"mappings":";AAAA;;;;;GAKG;;
|
|
1
|
+
{"version":3,"file":"text_utils.js","sourceRoot":"","sources":["../../../../src/lib/custom_utils/text_utils.ts"],"names":[],"mappings":";AAAA;;;;;GAKG;;AAqIH,8BA4CC;AAWD,wDA8BC;AAxND,iDAAyD;AAEzD;;;;;;;;;;;;;;;GAeG;AACH,MAAM,oBAAoB,GAAG,4BAA4B,CAAC;AAC1D,MAAM,sBAAsB,GAAG,IAAI,MAAM,CACxC,KAAK,oBAAoB,SAAS,EAClC,GAAG,CACH,CAAC;AAEF;;;;;GAKG;AACH,SAAS,SAAS,CAAC,IAAY;IAC9B,OAAO,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AACxB,CAAC;AAED;;;;;;;GAOG;AACH,SAAS,YAAY,CAAC,IAAY,EAAE,SAAiB;IACpD,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;IAClC,MAAM,MAAM,GAAa,EAAE,CAAC;IAC5B,IAAI,YAAY,GAAG,EAAE,CAAC;IAEtB,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QAC1B,IAAI,YAAY,CAAC,MAAM,GAAG,IAAI,CAAC,MAAM,IAAI,SAAS,EAAE,CAAC;YACpD,YAAY,IAAI,IAAI,CAAC;QACtB,CAAC;aAAM,CAAC;YACP,IAAI,YAAY,CAAC,IAAI,EAAE,EAAE,CAAC;gBACzB,MAAM,CAAC,IAAI,CAAC,YAAY,CAAC,IAAI,EAAE,CAAC,CAAC;YAClC,CAAC;YACD,0DAA0D;YAC1D,IAAI,IAAI,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,SAAS,EAAE,CAAC;gBACpC,MAAM,UAAU,GAAG,iBAAiB,CAAC,IAAI,CAAC,IAAI,EAAE,EAAE,SAAS,CAAC,CAAC;gBAC7D,MAAM,CAAC,IAAI,CAAC,GAAG,UAAU,CAAC,CAAC;gBAC3B,YAAY,GAAG,EAAE,CAAC;YACnB,CAAC;iBAAM,CAAC;gBACP,YAAY,GAAG,IAAI,CAAC;YACrB,CAAC;QACF,CAAC;IACF,CAAC;IAED,IAAI,YAAY,CAAC,IAAI,EAAE,EAAE,CAAC;QACzB,MAAM,CAAC,IAAI,CAAC,YAAY,CAAC,IAAI,EAAE,CAAC,CAAC;IAClC,CAAC;IAED,OAAO,MAAM,CAAC;AACf,CAAC;AAED;;;;;;;GAOG;AACH,SAAS,iBAAiB,CAAC,IAAY,EAAE,SAAiB;IACzD,MAAM,MAAM,GAAa,EAAE,CAAC;IAE5B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC,IAAI,SAAS,EAAE,CAAC;QACjD,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,GAAG,SAAS,CAAC,CAAC,CAAC;IAC3C,CAAC;IAED,OAAO,MAAM,CAAC;AACf,CAAC;AAED;;;;;;;GAOG;AACH,SAAS,mBAAmB,CAAC,KAAa,EAAE,SAAiB;IAC5D,IAAI,KAAK,CAAC,MAAM,IAAI,SAAS,EAAE,CAAC;QAC/B,OAAO,CAAC,KAAK,CAAC,CAAC;IAChB,CAAC;IAED,2DAA2D;IAC3D,IAAI,SAAS,CAAC,KAAK,CAAC,EAAE,CAAC;QACtB,OAAO,YAAY,CAAC,KAAK,EAAE,SAAS,CAAC,CAAC;IACvC,CAAC;IAED,qEAAqE;IACrE,OAAO,iBAAiB,CAAC,KAAK,EAAE,SAAS,CAAC,CAAC;AAC5C,CAAC;AAED;;;;;;;;;;;;;;;;;;GAkBG;AACH,SAAgB,SAAS,CACxB,IAAY,EACZ,YAAoB,sCAAuB;IAE3C,IAAI
,IAAI,CAAC,MAAM,IAAI,SAAS,EAAE,CAAC;QAC9B,OAAO,CAAC,IAAI,CAAC,CAAC;IACf,CAAC;IAED,kEAAkE;IAClE,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,sBAAsB,CAAC,CAAC;IAErD,MAAM,iBAAiB,GAAa,EAAE,CAAC;IACvC,IAAI,YAAY,GAAG,EAAE,CAAC;IAEtB,sDAAsD;IACtD,KAAK,MAAM,QAAQ,IAAI,SAAS,EAAE,CAAC;QAClC,IAAI,YAAY,CAAC,MAAM,GAAG,QAAQ,CAAC,MAAM,IAAI,SAAS,EAAE,CAAC;YACxD,YAAY,IAAI,QAAQ,CAAC;QAC1B,CAAC;aAAM,CAAC;YACP,IAAI,YAAY,EAAE,CAAC;gBAClB,iBAAiB,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;YACtC,CAAC;YACD,YAAY,GAAG,QAAQ,CAAC;QACzB,CAAC;IACF,CAAC;IAED,IAAI,YAAY,EAAE,CAAC;QAClB,iBAAiB,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;IACtC,CAAC;IAED,iEAAiE;IACjE,MAAM,WAAW,GAAa,EAAE,CAAC;IACjC,KAAK,MAAM,KAAK,IAAI,iBAAiB,EAAE,CAAC;QACvC,IAAI,KAAK,CAAC,MAAM,IAAI,SAAS,EAAE,CAAC;YAC/B,WAAW,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QACzB,CAAC;aAAM,CAAC;YACP,iDAAiD;YACjD,MAAM,SAAS,GAAG,mBAAmB,CAAC,KAAK,EAAE,SAAS,CAAC,CAAC;YACxD,WAAW,CAAC,IAAI,CAAC,GAAG,SAAS,CAAC,CAAC;QAChC,CAAC;IACF,CAAC;IAED,0BAA0B;IAC1B,OAAO,WAAW,CAAC,MAAM,CAAC,CAAC,KAAK,EAAE,EAAE,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;AACxD,CAAC;AAED;;;;;;;;GAQG;AACH,SAAgB,sBAAsB,CAAC,SAAiB;IACvD,qCAAqC;IACrC,IAAI,CAAC;QACJ,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC;QACnC,IAAI,IAAI,CAAC,YAAY,EAAE,CAAC;YACvB,OAAO,kBAAkB,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;QAC9C,CAAC;IACF,CAAC;IAAC,OAAO,CAAC,EAAE,CAAC;QACZ,8CAA8C;IAC/C,CAAC;IAED,kCAAkC;IAClC,MAAM,KAAK,GAAG,SAAS,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IAC3C,MAAM,WAAW,GAAiB,EAAE,CAAC;IAErC,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QAC1B,IAAI,IAAI,CAAC,IAAI,EAAE,EAAE,CAAC;YACjB,IAAI,CAAC;gBACJ,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;gBAC9B,IAAI,IAAI,CAAC,YAAY,EAAE,CAAC;oBACvB,WAAW,CAAC,IAAI,CAAC,kBAAkB,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC,CAAC;gBACzD,CAAC;YACF,CAAC;YAAC,OAAO,CAAC,EAAE,CAAC;gBACZ,SAAS;YACV,CAAC;QACF,CAAC;IACF,CAAC;IAED,mBAAmB;IACnB,OAAO,gBAAgB,CAAC,WAAW,CAAC,CAAC;AACtC,CAAC;AAED;;;;;GAKG;AACH,SAAS,kBAAkB,CAAC,MAAc;IACzC,+CAA+C;IAC/C,IAAI,OAAO,MAAM,KAAK,WAAW,EAAE,CAAC;QACnC,UAAU;QACV,OAAO,IAAI,UAAU,CAAC,MAAM,CAAC,IAAI,CAAC,MAAM,EAAE,QAAQ
,CAAC,CAAC,CAAC;IACtD,CAAC;SAAM,CAAC;QACP,UAAU;QACV,MAAM,YAAY,GAAG,IAAI,CAAC,MAAM,CAAC,CAAC;QAClC,MAAM,KAAK,GAAG,IAAI,UAAU,CAAC,YAAY,CAAC,MAAM,CAAC,CAAC;QAClD,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,YAAY,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YAC9C,KAAK,CAAC,CAAC,CAAC,GAAG,YAAY,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC;QACvC,CAAC;QACD,OAAO,KAAK,CAAC;IACd,CAAC;AACF,CAAC;AAED;;;;;GAKG;AACH,SAAS,gBAAgB,CAAC,MAAoB;IAC7C,MAAM,WAAW,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,GAAG,EAAE,EAAE,CAAC,GAAG,GAAG,GAAG,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;IACrE,MAAM,MAAM,GAAG,IAAI,UAAU,CAAC,WAAW,CAAC,CAAC;IAC3C,IAAI,MAAM,GAAG,CAAC,CAAC;IACf,KAAK,MAAM,GAAG,IAAI,MAAM,EAAE,CAAC;QAC1B,MAAM,CAAC,GAAG,CAAC,GAAG,EAAE,MAAM,CAAC,CAAC;QACxB,MAAM,IAAI,GAAG,CAAC,MAAM,CAAC;IACtB,CAAC;IACD,OAAO,MAAM,CAAC;AACf,CAAC"}
|