@supertone/supertone 0.1.1 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92) hide show
  1. package/README.md +119 -69
  2. package/custom_test/realtime_tts_player.ts +177 -12
  3. package/custom_test/test_pronunciation_dictionary.ts +227 -0
  4. package/custom_test/test_real_api.ts +1677 -162
  5. package/custom_test/test_text_utils_chunk_text_punctuation.ts +55 -0
  6. package/dist/commonjs/lib/config.d.ts +2 -2
  7. package/dist/commonjs/lib/config.d.ts.map +1 -1
  8. package/dist/commonjs/lib/config.js +2 -2
  9. package/dist/commonjs/lib/config.js.map +1 -1
  10. package/dist/commonjs/lib/custom_utils/index.d.ts +1 -0
  11. package/dist/commonjs/lib/custom_utils/index.d.ts.map +1 -1
  12. package/dist/commonjs/lib/custom_utils/index.js +5 -1
  13. package/dist/commonjs/lib/custom_utils/index.js.map +1 -1
  14. package/dist/commonjs/lib/custom_utils/pronunciation_utils.d.ts +24 -0
  15. package/dist/commonjs/lib/custom_utils/pronunciation_utils.d.ts.map +1 -0
  16. package/dist/commonjs/lib/custom_utils/pronunciation_utils.js +145 -0
  17. package/dist/commonjs/lib/custom_utils/pronunciation_utils.js.map +1 -0
  18. package/dist/commonjs/lib/custom_utils/text_utils.d.ts +8 -1
  19. package/dist/commonjs/lib/custom_utils/text_utils.d.ts.map +1 -1
  20. package/dist/commonjs/lib/custom_utils/text_utils.js +125 -7
  21. package/dist/commonjs/lib/custom_utils/text_utils.js.map +1 -1
  22. package/dist/commonjs/models/apiconverttexttospeechusingcharacterrequest.d.ts +92 -1
  23. package/dist/commonjs/models/apiconverttexttospeechusingcharacterrequest.d.ts.map +1 -1
  24. package/dist/commonjs/models/apiconverttexttospeechusingcharacterrequest.js +48 -3
  25. package/dist/commonjs/models/apiconverttexttospeechusingcharacterrequest.js.map +1 -1
  26. package/dist/commonjs/models/predictttsdurationusingcharacterrequest.d.ts +92 -1
  27. package/dist/commonjs/models/predictttsdurationusingcharacterrequest.d.ts.map +1 -1
  28. package/dist/commonjs/models/predictttsdurationusingcharacterrequest.js +46 -3
  29. package/dist/commonjs/models/predictttsdurationusingcharacterrequest.js.map +1 -1
  30. package/dist/commonjs/sdk/texttospeech.d.ts +17 -6
  31. package/dist/commonjs/sdk/texttospeech.d.ts.map +1 -1
  32. package/dist/commonjs/sdk/texttospeech.js +48 -25
  33. package/dist/commonjs/sdk/texttospeech.js.map +1 -1
  34. package/dist/esm/lib/config.d.ts +2 -2
  35. package/dist/esm/lib/config.d.ts.map +1 -1
  36. package/dist/esm/lib/config.js +2 -2
  37. package/dist/esm/lib/config.js.map +1 -1
  38. package/dist/esm/lib/custom_utils/index.d.ts +1 -0
  39. package/dist/esm/lib/custom_utils/index.d.ts.map +1 -1
  40. package/dist/esm/lib/custom_utils/index.js +2 -0
  41. package/dist/esm/lib/custom_utils/index.js.map +1 -1
  42. package/dist/esm/lib/custom_utils/pronunciation_utils.d.ts +24 -0
  43. package/dist/esm/lib/custom_utils/pronunciation_utils.d.ts.map +1 -0
  44. package/dist/esm/lib/custom_utils/pronunciation_utils.js +140 -0
  45. package/dist/esm/lib/custom_utils/pronunciation_utils.js.map +1 -0
  46. package/dist/esm/lib/custom_utils/text_utils.d.ts +8 -1
  47. package/dist/esm/lib/custom_utils/text_utils.d.ts.map +1 -1
  48. package/dist/esm/lib/custom_utils/text_utils.js +125 -7
  49. package/dist/esm/lib/custom_utils/text_utils.js.map +1 -1
  50. package/dist/esm/models/apiconverttexttospeechusingcharacterrequest.d.ts +92 -1
  51. package/dist/esm/models/apiconverttexttospeechusingcharacterrequest.d.ts.map +1 -1
  52. package/dist/esm/models/apiconverttexttospeechusingcharacterrequest.js +47 -2
  53. package/dist/esm/models/apiconverttexttospeechusingcharacterrequest.js.map +1 -1
  54. package/dist/esm/models/predictttsdurationusingcharacterrequest.d.ts +92 -1
  55. package/dist/esm/models/predictttsdurationusingcharacterrequest.d.ts.map +1 -1
  56. package/dist/esm/models/predictttsdurationusingcharacterrequest.js +45 -2
  57. package/dist/esm/models/predictttsdurationusingcharacterrequest.js.map +1 -1
  58. package/dist/esm/sdk/texttospeech.d.ts +17 -6
  59. package/dist/esm/sdk/texttospeech.d.ts.map +1 -1
  60. package/dist/esm/sdk/texttospeech.js +49 -26
  61. package/dist/esm/sdk/texttospeech.js.map +1 -1
  62. package/examples/custom_voices/create_cloned_voice.ts +4 -3
  63. package/examples/custom_voices/delete_custom_voice.ts +2 -7
  64. package/examples/custom_voices/edit_custom_voice.ts +2 -6
  65. package/examples/custom_voices/get_custom_voice.ts +2 -7
  66. package/examples/custom_voices/list_custom_voices.ts +2 -7
  67. package/examples/custom_voices/search_custom_voices.ts +2 -6
  68. package/examples/text_to_speech/create_speech.ts +3 -8
  69. package/examples/text_to_speech/create_speech_long_text.ts +3 -7
  70. package/examples/text_to_speech/create_speech_with_phonemes.ts +3 -7
  71. package/examples/text_to_speech/create_speech_with_voice_settings.ts +3 -8
  72. package/examples/text_to_speech/predict_duration.ts +3 -7
  73. package/examples/text_to_speech/stream_speech.ts +3 -7
  74. package/examples/text_to_speech/stream_speech_long_text.ts +3 -7
  75. package/examples/text_to_speech/stream_speech_with_phonemes.ts +3 -7
  76. package/examples/text_to_speech/stream_speech_with_voice_settings.ts +3 -7
  77. package/examples/usage/get_credit_balance.ts +2 -6
  78. package/examples/usage/get_usage.ts +2 -6
  79. package/examples/usage/get_voice_usage.ts +2 -7
  80. package/examples/voices/get_voice.ts +2 -6
  81. package/examples/voices/list_voices.ts +2 -6
  82. package/examples/voices/search_voices.ts +2 -7
  83. package/jsr.json +1 -1
  84. package/openapi.json +101 -9
  85. package/package.json +1 -1
  86. package/src/lib/config.ts +41 -41
  87. package/src/lib/custom_utils/index.ts +7 -0
  88. package/src/lib/custom_utils/pronunciation_utils.ts +193 -0
  89. package/src/lib/custom_utils/text_utils.ts +138 -7
  90. package/src/models/apiconverttexttospeechusingcharacterrequest.ts +62 -3
  91. package/src/models/predictttsdurationusingcharacterrequest.ts +64 -3
  92. package/src/sdk/texttospeech.ts +99 -68
@@ -127,19 +127,25 @@ async function extractAudioData(response: any): Promise<Uint8Array> {
127
127
  console.log(` šŸ” Debug - has audioBase64: ${"audioBase64" in result}`);
128
128
  console.log(` šŸ” Debug - has getReader: ${"getReader" in result}`);
129
129
  }
130
-
130
+
131
131
  // Check for capital-case Result (SDK internal structure)
132
- if (!result || (typeof result === "object" && Object.keys(result).length === 0)) {
132
+ if (
133
+ !result ||
134
+ (typeof result === "object" && Object.keys(result).length === 0)
135
+ ) {
133
136
  console.log(` šŸ’” Checking SDK internal Result field...`);
134
137
  if ((response as any).Result) {
135
138
  result = (response as any).Result;
136
139
  console.log(` āœ… Found Result (capital R) - using that instead`);
137
140
  }
138
141
  }
139
-
142
+
140
143
  // Debug response headers
141
144
  if (response.headers) {
142
- console.log(` šŸ” Debug - response headers:`, JSON.stringify(response.headers, null, 2));
145
+ console.log(
146
+ ` šŸ” Debug - response headers:`,
147
+ JSON.stringify(response.headers, null, 2)
148
+ );
143
149
  }
144
150
 
145
151
  if (result instanceof Uint8Array) {
@@ -198,7 +204,7 @@ async function extractAudioData(response: any): Promise<Uint8Array> {
198
204
  return bytes;
199
205
  }
200
206
  }
201
-
207
+
202
208
  // Handle empty object case - this might happen when the SDK doesn't properly parse audio responses
203
209
  if (
204
210
  typeof result === "object" &&
@@ -207,22 +213,25 @@ async function extractAudioData(response: any): Promise<Uint8Array> {
207
213
  ) {
208
214
  console.log(` āš ļø Warning: Empty result object detected`);
209
215
  console.log(` šŸ’” This might be a parsing issue with the SDK`);
210
- console.log(` šŸ’” Check if the response was actually a stream but got parsed as an empty object`);
211
-
216
+ console.log(
217
+ ` šŸ’” Check if the response was actually a stream but got parsed as an empty object`
218
+ );
219
+
212
220
  throw new Error(
213
221
  `Empty result object - SDK may have failed to parse audio stream response. ` +
214
- `This usually happens when audio/* content-type responses are not properly handled.`
222
+ `This usually happens when audio/* content-type responses are not properly handled.`
215
223
  );
216
224
  }
217
225
 
218
226
  // Enhanced error message with debug info
219
- const errorDetails = typeof result === "object" && result !== null
220
- ? `constructor: ${result.constructor.name}, keys: [${Object.keys(result).join(", ")}]`
221
- : `value: ${result}`;
222
-
223
- throw new Error(
224
- `Unsupported result type: ${typeof result}, ${errorDetails}`
225
- );
227
+ const errorDetails =
228
+ typeof result === "object" && result !== null
229
+ ? `constructor: ${result.constructor.name}, keys: [${Object.keys(
230
+ result
231
+ ).join(", ")}]`
232
+ : `value: ${result}`;
233
+
234
+ throw new Error(`Unsupported result type: ${typeof result}, ${errorDetails}`);
226
235
  }
227
236
 
228
237
  /**
@@ -928,12 +937,15 @@ async function testCreateSpeechLongText(
928
937
  }
929
938
 
930
939
  /**
931
- * Test TTS streaming with long text
940
+ * Test TTS with long text WITHOUT punctuation (word-based chunking)
941
+ * This tests the word-based splitting fallback when sentences exceed 300 chars
932
942
  */
933
- async function testStreamSpeechLongText(
943
+ async function testCreateSpeechLongSentenceNoPunctuation(
934
944
  voiceId: string | null
935
945
  ): Promise<[boolean, any]> {
936
- console.log("šŸ“” Long Text Streaming TTS Test");
946
+ console.log(
947
+ "šŸ“œ Long Sentence WITHOUT Punctuation Test (Word-based chunking)"
948
+ );
937
949
 
938
950
  if (!voiceId) {
939
951
  console.log(" āš ļø No voice ID available");
@@ -945,44 +957,64 @@ async function testStreamSpeechLongText(
945
957
  const models = await import("../src/models/index.js");
946
958
  const client = new Supertone({ apiKey: API_KEY });
947
959
 
948
- const longText = `
949
- Hello! This is a long text streaming test.
950
- The SDK automatically chunks and streams the audio in real-time.
951
- This enables efficient processing of longer content without waiting for complete generation.
952
- `
953
- .trim()
954
- .repeat(3);
960
+ // Long text without punctuation - forces word-based splitting
961
+ // This is a single continuous sentence with no periods or other punctuation marks
962
+ const longSentenceNoPunctuation =
963
+ "This is a very long sentence without any punctuation marks that is designed to test the word based chunking feature of the SDK when a sentence exceeds the maximum character limit of three hundred characters the system should automatically split this text by word boundaries rather than sentence boundaries to ensure proper processing and this behavior is critical for handling user generated content that may not follow standard punctuation conventions such as chat messages or informal text inputs that users commonly provide in real world applications where grammatically correct sentences are not always guaranteed";
955
964
 
956
- console.log(` šŸ” Streaming long text with voice '${voiceId}'...`);
957
- console.log(` Text length: ${longText.length} characters`);
965
+ const actualLength = longSentenceNoPunctuation.length;
966
+ console.log(
967
+ ` šŸ“ Text length: ${actualLength} characters (single sentence, no punctuation)`
968
+ );
969
+ console.log(` šŸ”§ Expected behavior: Word-based chunking`);
958
970
  console.log(" āš ļø This test consumes credits!");
959
971
 
960
- const response = await client.textToSpeech.streamSpeech({
972
+ const response = await client.textToSpeech.createSpeech({
961
973
  voiceId,
962
974
  apiConvertTextToSpeechUsingCharacterRequest: {
963
- text: longText,
975
+ text: longSentenceNoPunctuation,
964
976
  language: models.APIConvertTextToSpeechUsingCharacterRequestLanguage.En,
965
977
  outputFormat:
966
978
  models.APIConvertTextToSpeechUsingCharacterRequestOutputFormat.Wav,
979
+ style: "neutral",
980
+ model: "sona_speech_1",
967
981
  },
968
982
  });
969
983
 
970
- console.log(` āœ… Stream started successfully`);
984
+ if (response.result) {
985
+ const audioData = await extractAudioData(response);
986
+
987
+ console.log(
988
+ ` āœ… Word-based chunking TTS success: ${audioData.length} bytes`
989
+ );
990
+ console.log(
991
+ ` šŸŽÆ Long sentence without punctuation processed correctly!`
992
+ );
993
+
994
+ const outputFile = "test_word_chunking_speech_output.wav";
995
+ fs.writeFileSync(outputFile, audioData);
996
+ console.log(` šŸ’¾ Audio saved: ${outputFile}`);
997
+
998
+ const estimatedChunks = Math.ceil(actualLength / 300);
999
+ console.log(` šŸ“Š Estimated chunks: ${estimatedChunks}`);
1000
+ }
971
1001
 
972
1002
  return [true, response];
973
1003
  } catch (e: any) {
974
- console.error(` āŒ Error: ${e.message || e}`);
1004
+ logDetailedError(e, "Long sentence word-based chunking");
975
1005
  return [false, e];
976
1006
  }
977
1007
  }
978
1008
 
979
1009
  /**
980
- * Test TTS with voice settings
1010
+ * Test TTS with Japanese text (character-based chunking)
1011
+ * Japanese doesn't use spaces, AND this test uses NO punctuation marks (。!? etc.)
1012
+ * to ensure the SDK uses character-based splitting
981
1013
  */
982
- async function testCreateSpeechWithVoiceSettings(
1014
+ async function testCreateSpeechJapaneseNoSpaces(
983
1015
  voiceId: string | null
984
1016
  ): Promise<[boolean, any]> {
985
- console.log("šŸŽ›ļø TTS with Voice Settings Test");
1017
+ console.log("šŸ‡ÆšŸ‡µ Japanese Text Test (Character-based chunking)");
986
1018
 
987
1019
  if (!voiceId) {
988
1020
  console.log(" āš ļø No voice ID available");
@@ -994,58 +1026,72 @@ async function testCreateSpeechWithVoiceSettings(
994
1026
  const models = await import("../src/models/index.js");
995
1027
  const client = new Supertone({ apiKey: API_KEY });
996
1028
 
997
- const voiceSettings = {
998
- pitchShift: 0.95,
999
- pitchVariance: 1.1,
1000
- speed: 0.9,
1001
- };
1002
-
1029
+ // Long Japanese text WITHOUT spaces AND WITHOUT punctuation - forces character-based splitting
1030
+ // This text intentionally has NO punctuation marks (。!? etc.) to test pure character-based chunking
1031
+ // Text length: ~450 characters (exceeds 300 char limit)
1032
+ const longJapaneseText =
1033
+ "ę—„ęœ¬čŖžć®ćƒ†ć‚­ć‚¹ćƒˆćÆé€šåøøć‚¹ćƒšćƒ¼ć‚¹ć‚’å«ć¾ćŖć„ćŸć‚ē‰¹åˆ„ćŖå‡¦ē†ćŒåæ…č¦ć§ć™" +
1034
+ "ć“ć®ćƒ†ć‚¹ćƒˆćÆäø‰ē™¾ę–‡å­—ć‚’č¶…ćˆć‚‹é•·ć„ę—„ęœ¬čŖžćƒ†ć‚­ć‚¹ćƒˆćŒę­£ć—ćå‡¦ē†ć•ć‚Œć‚‹ć“ćØć‚’ē¢ŗčŖć—ć¾ć™" +
1035
+ "č‡Ŗē„¶čØ€čŖžå‡¦ē†ęŠ€č”“ć®ē™ŗå±•ć«ć‚ˆć‚ŠéŸ³å£°åˆęˆć®å“č³ŖćÆå¤§å¹…ć«å‘äøŠć—ć¾ć—ćŸ" +
1036
+ "ē‰¹ć«ćƒ‡ć‚£ćƒ¼ćƒ—ćƒ©ćƒ¼ćƒ‹ćƒ³ć‚°ć‚’ę“»ē”Øć—ćŸęœ€ę–°ć®ćƒ†ć‚­ć‚¹ćƒˆéŸ³å£°å¤‰ę›ć‚·ć‚¹ćƒ†ćƒ ćÆäŗŗé–“ć®ē™ŗč©±ć«éžåøøć«čæ‘ć„č‡Ŗē„¶ćŖéŸ³å£°ć‚’ē”Ÿęˆć§ćć¾ć™" +
1037
+ "ć‚¹ćƒšćƒ¼ć‚¹ćŒćŖć„čØ€čŖžć§ćÆę–‡å­—å˜ä½ć§ć®åˆ†å‰²ćŒåæ…č¦ć§ć‚ć‚Šć“ć®SDKćÆćć®ć‚ˆć†ćŖēŠ¶ę³ć‚’č‡Ŗå‹•ēš„ć«ę¤œå‡ŗć—ć¦é©åˆ‡ć«å‡¦ē†ć—ć¾ć™" +
1038
+ "ć“ć‚Œć«ć‚ˆć‚Šę—„ęœ¬čŖžäø­å›½čŖžéŸ“å›½čŖžćŖć©ć®ć‚¢ć‚øć‚¢čØ€čŖžć§ć‚‚å•é”ŒćŖćé•·ć„ćƒ†ć‚­ć‚¹ćƒˆć‚’éŸ³å£°ć«å¤‰ę›ć™ć‚‹ć“ćØćŒć§ćć¾ć™" +
1039
+ "éŸ³å£°åˆęˆęŠ€č”“ćÆč¦–č¦šéšœå®³č€…ć®ćŸć‚ć®ć‚¢ć‚Æć‚»ć‚·ćƒ“ćƒŖćƒ†ć‚£ćƒ„ćƒ¼ćƒ«ć‹ć‚‰åÆ¾č©±åž‹AIć‚¢ć‚·ć‚¹ć‚æćƒ³ćƒˆć¾ć§å¹…åŗƒć„ē”Øé€”ć§ę“»ē”Øć•ć‚Œć¦ć„ć¾ć™" +
1040
+ "ć•ć‚‰ć«ćƒŖć‚¢ćƒ«ć‚æć‚¤ćƒ ć‚¹ćƒˆćƒŖćƒ¼ćƒŸćƒ³ć‚°ęŠ€č”“ćØēµ„ćæåˆć‚ć›ć‚‹ć“ćØć§å¾…ć”ę™‚é–“ć‚’å¤§å¹…ć«ēŸ­ēø®ć—å„Ŗć‚ŒćŸćƒ¦ćƒ¼ć‚¶ćƒ¼ä½“éØ“ć‚’ęä¾›ć™ć‚‹ć“ćØćŒć§ćć¾ć™" +
1041
+ "ęœ€ę–°ć®éŸ³å£°åˆęˆęŠ€č”“ćÆę„Ÿęƒ…ć‚„ęŠ‘ęšć‚‚č‡Ŗē„¶ć«č”Øē¾ć§ćć‚‹ć‚ˆć†ć«ćŖć‚Šć¾ć—ćŸ";
1042
+
1043
+ const actualLength = longJapaneseText.length;
1003
1044
  console.log(
1004
- ` šŸ” TTS conversion with voice settings using voice '${voiceId}'...`
1045
+ ` šŸ“ Text length: ${actualLength} characters (Japanese, no spaces, no punctuation)`
1005
1046
  );
1006
1047
  console.log(
1007
- ` Settings: pitchShift=${voiceSettings.pitchShift}, speed=${voiceSettings.speed}`
1048
+ ` šŸ”§ Expected behavior: Character-based chunking (300 chars per chunk)`
1008
1049
  );
1009
1050
  console.log(" āš ļø This test consumes credits!");
1010
1051
 
1011
1052
  const response = await client.textToSpeech.createSpeech({
1012
1053
  voiceId,
1013
1054
  apiConvertTextToSpeechUsingCharacterRequest: {
1014
- text: "Hello world! This is a voice settings test. You can hear the adjusted pitch and speed.",
1015
- language: models.APIConvertTextToSpeechUsingCharacterRequestLanguage.En,
1055
+ text: longJapaneseText,
1056
+ language: models.APIConvertTextToSpeechUsingCharacterRequestLanguage.Ja,
1016
1057
  outputFormat:
1017
1058
  models.APIConvertTextToSpeechUsingCharacterRequestOutputFormat.Wav,
1018
1059
  style: "neutral",
1019
1060
  model: "sona_speech_1",
1020
- voiceSettings,
1021
- includePhonemes: false,
1022
1061
  },
1023
1062
  });
1024
1063
 
1025
- console.log(` āœ… TTS with voice settings success`);
1026
-
1027
1064
  if (response.result) {
1028
- const outputFile = "test_voice_settings_speech_output.wav";
1029
1065
  const audioData = await extractAudioData(response);
1030
1066
 
1067
+ console.log(
1068
+ ` āœ… Character-based chunking TTS success: ${audioData.length} bytes`
1069
+ );
1070
+ console.log(` šŸŽÆ Japanese text without spaces processed correctly!`);
1071
+
1072
+ const outputFile = "test_japanese_char_chunking_speech_output.wav";
1031
1073
  fs.writeFileSync(outputFile, audioData);
1032
- console.log(` šŸ’¾ Voice settings audio file saved: ${outputFile}`);
1074
+ console.log(` šŸ’¾ Audio saved: ${outputFile}`);
1075
+
1076
+ const estimatedChunks = Math.ceil(actualLength / 300);
1077
+ console.log(` šŸ“Š Estimated chunks: ${estimatedChunks}`);
1033
1078
  }
1034
1079
 
1035
1080
  return [true, response];
1036
1081
  } catch (e: any) {
1037
- console.error(` āŒ Error: ${e.message || e}`);
1082
+ logDetailedError(e, "Japanese character-based chunking");
1038
1083
  return [false, e];
1039
1084
  }
1040
1085
  }
1041
1086
 
1042
1087
  /**
1043
- * Test TTS with phoneme information
1088
+ * Test TTS with Arabic text and Arabic punctuation marks (؟ ؛ ۔)
1089
+ * This tests multilingual sentence punctuation support added in fix/text_utils
1044
1090
  */
1045
- async function testCreateSpeechWithPhonemes(
1091
+ async function testCreateSpeechArabicPunctuation(
1046
1092
  voiceId: string | null
1047
1093
  ): Promise<[boolean, any]> {
1048
- console.log("šŸ”¤ TTS with Phoneme Information Test");
1094
+ console.log("šŸ‡øšŸ‡¦ Arabic Text with Arabic Punctuation Test");
1049
1095
 
1050
1096
  if (!voiceId) {
1051
1097
  console.log(" āš ļø No voice ID available");
@@ -1057,81 +1103,71 @@ async function testCreateSpeechWithPhonemes(
1057
1103
  const models = await import("../src/models/index.js");
1058
1104
  const client = new Supertone({ apiKey: API_KEY });
1059
1105
 
1106
+ // Arabic text with Arabic punctuation marks (؟ ؛ ۔ ،)
1107
+ // Text length: ~350 characters (exceeds 300 char limit)
1108
+ const arabicText =
1109
+ "Ł…Ų±Ų­ŲØŲ§ ŲØŁƒŁ… في Ų§Ų®ŲŖŲØŲ§Ų± ŲŖŁ‚Ł†ŁŠŲ© ŲŖŲ­ŁˆŁŠŁ„ النص ؄لى ŁƒŁ„Ų§Ł…ŲŸ " +
1110
+ "هذا النظام ŁŠŲÆŲ¹Ł… اللغة Ų§Ł„Ų¹Ų±ŲØŁŠŲ© ŲØŲ“ŁƒŁ„ ŁƒŲ§Ł…Ł„Ų› " +
1111
+ "ŁŠŁ…ŁƒŁ†Ł‡ التعرف على علامات Ų§Ł„ŲŖŲ±Ł‚ŁŠŁ… Ų§Ł„Ų¹Ų±ŲØŁŠŲ© Ł…Ų«Ł„ علامة الاستفهام ŁˆŲ¹Ł„Ų§Ł…Ų© الفاصلة Ų§Ł„Ł…Ł†Ł‚ŁˆŲ·Ų©Ū” " +
1112
+ "ŲŖŁ‚Ł†ŁŠŲ© Ų§Ł„Ų°ŁƒŲ§Ų” Ų§Ł„Ų§ŲµŲ·Ł†Ų§Ų¹ŁŠ تتطور ŲØŲ³Ų±Ų¹Ų© كبيرة، " +
1113
+ "ŁˆŲ§Ł„Ų¢Ł† ŁŠŁ…ŁƒŁ†Ł†Ų§ ŲŖŲ­ŁˆŁŠŁ„ Ų§Ł„Ł†ŲµŁˆŲµ Ų§Ł„Ų·ŁˆŁŠŁ„Ų© ؄لى ŁƒŁ„Ų§Ł… طبيعي؟ " +
1114
+ "هذا الاختبار ŁŠŲŖŲ­Ł‚Ł‚ من أن النظام ŁŠŁ‚Ų³Ł… النص ŲØŲ“ŁƒŁ„ صحيح عند علامات Ų§Ł„ŲŖŲ±Ł‚ŁŠŁ… Ų§Ł„Ų¹Ų±ŲØŁŠŲ©Ų› " +
1115
+ "نأمل أن ŁŠŲ¹Ł…Ł„ ŁƒŁ„ ؓيؔ ŲØŲ“ŁƒŁ„ Ł…Ų«Ų§Ł„ŁŠŪ”";
1116
+
1117
+ const actualLength = arabicText.length;
1060
1118
  console.log(
1061
- ` šŸ” TTS conversion with phonemes using voice '${voiceId}'...`
1119
+ ` šŸ“ Text length: ${actualLength} characters (Arabic with Arabic punctuation)`
1062
1120
  );
1121
+ console.log(` šŸ”§ Expected behavior: Sentence-based chunking with Arabic punctuation (؟ Ų› Ū”)`);
1063
1122
  console.log(" āš ļø This test consumes credits!");
1064
1123
 
1124
+ if (actualLength <= 300) {
1125
+ console.log(` āŒ Text length ${actualLength} is <= 300, test may not trigger chunking`);
1126
+ }
1127
+
1065
1128
  const response = await client.textToSpeech.createSpeech({
1066
1129
  voiceId,
1067
1130
  apiConvertTextToSpeechUsingCharacterRequest: {
1068
- text: "Hello world! This is a phoneme timing test.",
1069
- language: models.APIConvertTextToSpeechUsingCharacterRequestLanguage.En,
1131
+ text: arabicText,
1132
+ language: models.APIConvertTextToSpeechUsingCharacterRequestLanguage.Ar,
1070
1133
  outputFormat:
1071
1134
  models.APIConvertTextToSpeechUsingCharacterRequestOutputFormat.Wav,
1072
1135
  style: "neutral",
1073
- model: "sona_speech_1",
1074
- includePhonemes: true,
1136
+ model: models.APIConvertTextToSpeechUsingCharacterRequestModel.SonaSpeech2,
1075
1137
  },
1076
1138
  });
1077
1139
 
1078
- console.log(` āœ… TTS with phonemes success`);
1079
-
1080
1140
  if (response.result) {
1081
- const outputFile = "test_phoneme_speech_output.wav";
1141
+ const audioData = await extractAudioData(response);
1082
1142
 
1083
- // Check if response is JSON with phonemes data
1084
- if (
1085
- typeof response.result === "object" &&
1086
- "audioBase64" in response.result
1087
- ) {
1088
- const audioData = await extractAudioData(response);
1089
- fs.writeFileSync(outputFile, audioData);
1090
- console.log(` šŸ’¾ Phoneme audio file saved: ${outputFile}`);
1143
+ console.log(
1144
+ ` āœ… Arabic punctuation chunking TTS success: ${audioData.length} bytes`
1145
+ );
1146
+ console.log(` šŸŽÆ Arabic text with Arabic punctuation processed correctly!`);
1091
1147
 
1092
- // Display phoneme information as JSON
1093
- const phonemes = (response.result as any).phonemes;
1094
- if (phonemes) {
1095
- console.log(` šŸ“Š Phoneme data (JSON):`);
1096
- console.log(JSON.stringify(phonemes, null, 2));
1097
- console.log(` šŸ“ˆ Summary:`);
1098
- console.log(` Symbols count: ${phonemes.symbols?.length || 0}`);
1099
- console.log(
1100
- ` Durations count: ${phonemes.durations_seconds?.length || 0}`
1101
- );
1102
- console.log(
1103
- ` Start times count: ${
1104
- phonemes.start_times_seconds?.length || 0
1105
- }`
1106
- );
1107
- if (phonemes.symbols && phonemes.symbols.length > 0) {
1108
- console.log(
1109
- ` First 5 symbols: ${phonemes.symbols.slice(0, 5).join(", ")}`
1110
- );
1111
- }
1112
- }
1113
- } else {
1114
- // Binary audio without phonemes
1115
- const audioData = await extractAudioData(response);
1116
- fs.writeFileSync(outputFile, audioData);
1117
- console.log(` šŸ’¾ Phoneme audio file saved: ${outputFile}`);
1118
- }
1148
+ const outputFile = "test_arabic_punctuation_speech_output.wav";
1149
+ fs.writeFileSync(outputFile, audioData);
1150
+ console.log(` šŸ’¾ Audio saved: ${outputFile}`);
1151
+
1152
+ const estimatedChunks = Math.ceil(actualLength / 300);
1153
+ console.log(` šŸ“Š Estimated chunks: ${estimatedChunks}`);
1119
1154
  }
1120
1155
 
1121
1156
  return [true, response];
1122
1157
  } catch (e: any) {
1123
- console.error(` āŒ Error: ${e.message || e}`);
1158
+ logDetailedError(e, "Arabic punctuation chunking");
1124
1159
  return [false, e];
1125
1160
  }
1126
1161
  }
1127
1162
 
1128
1163
  /**
1129
- * Test TTS streaming with phonemes
1164
+ * Test TTS with Hindi text and Devanagari punctuation marks (। ॥)
1165
+ * This tests multilingual sentence punctuation support added in fix/text_utils
1130
1166
  */
1131
- async function testStreamSpeechWithPhonemes(
1167
+ async function testCreateSpeechHindiPunctuation(
1132
1168
  voiceId: string | null
1133
1169
  ): Promise<[boolean, any]> {
1134
- console.log("šŸ“” TTS Streaming with Phonemes Test");
1170
+ console.log("šŸ‡®šŸ‡³ Hindi Text with Devanagari Punctuation Test");
1135
1171
 
1136
1172
  if (!voiceId) {
1137
1173
  console.log(" āš ļø No voice ID available");
@@ -1143,38 +1179,71 @@ async function testStreamSpeechWithPhonemes(
1143
1179
  const models = await import("../src/models/index.js");
1144
1180
  const client = new Supertone({ apiKey: API_KEY });
1145
1181
 
1182
+ // Hindi text with Devanagari punctuation marks (। ॥)
1183
+ // Text length: ~380 characters (exceeds 300 char limit)
1184
+ const hindiText =
1185
+ "ą¤Øą¤®ą¤øą„ą¤¤ą„‡ और ą¤øą„ą¤µą¤¾ą¤—ą¤¤ ą¤¹ą„ˆ आपका इस ą¤Ŗą¤°ą„€ą¤•ą„ą¤·ą¤£ ą¤®ą„‡ą¤‚ą„¤ " +
1186
+ "यह ą¤Ŗą„ą¤°ą¤£ą¤¾ą¤²ą„€ ą¤¹ą¤æą¤‚ą¤¦ą„€ भाषा का ą¤Ŗą„‚ą¤°ą„ą¤£ ą¤øą¤®ą¤°ą„ą¤„ą¤Ø ą¤•ą¤°ą¤¤ą„€ ą¤¹ą„ˆą„¤ " +
1187
+ "ą¤¦ą„‡ą¤µą¤Øą¤¾ą¤—ą¤°ą„€ लिपि ą¤®ą„‡ą¤‚ ą¤Ŗą„‚ą¤°ą„ą¤£ विराम और ą¤¦ą„‹ą¤¹ą¤°ą¤¾ दंऔ ą¤œą„ˆą¤øą„‡ विराम ą¤šą¤æą¤¹ą„ą¤Ø ą¤¹ą„‹ą¤¤ą„‡ ą¤¹ą„ˆą¤‚ą„„ " +
1188
+ "ą¤•ą„ƒą¤¤ą„ą¤°ą¤æą¤® ą¤¬ą„ą¤¦ą„ą¤§ą¤æą¤®ą¤¤ą„ą¤¤ą¤¾ ą¤•ą„€ ą¤¤ą¤•ą¤Øą„€ą¤• ą¤¬ą¤¹ą„ą¤¤ ą¤¤ą„‡ą¤œą„€ ą¤øą„‡ विकसित ą¤¹ą„‹ ą¤°ą¤¹ą„€ ą¤¹ą„ˆą„¤ " +
1189
+ "अब हम ą¤²ą¤‚ą¤¬ą„‡ ą¤Ŗą¤¾ą¤ ą„‹ą¤‚ ą¤•ą„‹ ą¤øą„ą¤µą¤¾ą¤­ą¤¾ą¤µą¤æą¤• ą¤µą¤¾ą¤£ą„€ ą¤®ą„‡ą¤‚ बदल ą¤øą¤•ą¤¤ą„‡ ą¤¹ą„ˆą¤‚ą„¤ " +
1190
+ "यह ą¤Ŗą¤°ą„€ą¤•ą„ą¤·ą¤£ ą¤œą¤¾ą¤‚ą¤šą¤¤ą¤¾ ą¤¹ą„ˆ कि ą¤øą¤æą¤øą„ą¤Ÿą¤® ą¤¹ą¤æą¤‚ą¤¦ą„€ विराम ą¤šą¤æą¤¹ą„ą¤Øą„‹ą¤‚ पर ą¤øą¤¹ą„€ ढंग ą¤øą„‡ पाठ ą¤•ą„‹ विभाजित करता ą¤¹ą„ˆą„¤ " +
1191
+ "ą¤¹ą¤®ą„‡ą¤‚ आशा ą¤¹ą„ˆ कि सब ą¤•ą„ą¤› ą¤ ą„€ą¤• ą¤øą„‡ काम ą¤•ą¤°ą„‡ą¤—ą¤¾ą„„";
1192
+
1193
+ const actualLength = hindiText.length;
1146
1194
  console.log(
1147
- ` šŸ” Streaming speech with phonemes for voice '${voiceId}'...`
1195
+ ` šŸ“ Text length: ${actualLength} characters (Hindi with Devanagari punctuation)`
1148
1196
  );
1197
+ console.log(` šŸ”§ Expected behavior: Sentence-based chunking with Devanagari punctuation (ą„¤ ą„„)`);
1149
1198
  console.log(" āš ļø This test consumes credits!");
1150
1199
 
1151
- const response = await client.textToSpeech.streamSpeech({
1200
+ if (actualLength <= 300) {
1201
+ console.log(` āŒ Text length ${actualLength} is <= 300, test may not trigger chunking`);
1202
+ }
1203
+
1204
+ const response = await client.textToSpeech.createSpeech({
1152
1205
  voiceId,
1153
1206
  apiConvertTextToSpeechUsingCharacterRequest: {
1154
- text: "Streaming with phoneme timing information.",
1155
- language: models.APIConvertTextToSpeechUsingCharacterRequestLanguage.En,
1207
+ text: hindiText,
1208
+ language: models.APIConvertTextToSpeechUsingCharacterRequestLanguage.Hi,
1156
1209
  outputFormat:
1157
1210
  models.APIConvertTextToSpeechUsingCharacterRequestOutputFormat.Wav,
1158
- includePhonemes: true,
1211
+ style: "neutral",
1212
+ model: models.APIConvertTextToSpeechUsingCharacterRequestModel.SonaSpeech2,
1159
1213
  },
1160
1214
  });
1161
1215
 
1162
- console.log(` āœ… Stream with phonemes started successfully`);
1216
+ if (response.result) {
1217
+ const audioData = await extractAudioData(response);
1218
+
1219
+ console.log(
1220
+ ` āœ… Hindi punctuation chunking TTS success: ${audioData.length} bytes`
1221
+ );
1222
+ console.log(` šŸŽÆ Hindi text with Devanagari punctuation processed correctly!`);
1223
+
1224
+ const outputFile = "test_hindi_punctuation_speech_output.wav";
1225
+ fs.writeFileSync(outputFile, audioData);
1226
+ console.log(` šŸ’¾ Audio saved: ${outputFile}`);
1227
+
1228
+ const estimatedChunks = Math.ceil(actualLength / 300);
1229
+ console.log(` šŸ“Š Estimated chunks: ${estimatedChunks}`);
1230
+ }
1163
1231
 
1164
1232
  return [true, response];
1165
1233
  } catch (e: any) {
1166
- console.error(` āŒ Error: ${e.message || e}`);
1234
+ logDetailedError(e, "Hindi punctuation chunking");
1167
1235
  return [false, e];
1168
1236
  }
1169
1237
  }
1170
1238
 
1171
1239
  /**
1172
- * Test duration prediction with voice settings
1240
+ * Test TTS with ellipsis punctuation marks (… ‥)
1241
+ * This tests multilingual sentence punctuation support added in fix/text_utils
1173
1242
  */
1174
- async function testPredictDurationWithVoiceSettings(
1243
+ async function testCreateSpeechEllipsisPunctuation(
1175
1244
  voiceId: string | null
1176
1245
  ): Promise<[boolean, any]> {
1177
- console.log("ā±ļø Duration Prediction with Voice Settings Test");
1246
+ console.log("ā³ Text with Ellipsis Punctuation Test (…  )");
1178
1247
 
1179
1248
  if (!voiceId) {
1180
1249
  console.log(" āš ļø No voice ID available");
@@ -1186,40 +1255,71 @@ async function testPredictDurationWithVoiceSettings(
1186
1255
  const models = await import("../src/models/index.js");
1187
1256
  const client = new Supertone({ apiKey: API_KEY });
1188
1257
 
1189
- const voiceSettings = {
1190
- speed: 0.8,
1191
- };
1192
-
1258
+ // Text with ellipsis punctuation marks (… ‥)
1259
+ // Text length: ~380 characters (exceeds 300 char limit)
1260
+ const ellipsisText =
1261
+ "Sometimes we need to pause and think… " +
1262
+ "The ellipsis character is used to indicate a trailing thought or a pause in speech… " +
1263
+ "This test verifies that the text chunking system correctly handles Unicode ellipsis characters  " +
1264
+ "There are actually multiple types of ellipsis in Unicode… " +
1265
+ "The horizontal ellipsis U+2026 and the two dot leader U+2025 are both supported  " +
1266
+ "When processing long texts the SDK should split at these punctuation marks… " +
1267
+ "This ensures natural pauses in the generated speech output  " +
1268
+ "Let us verify that everything works correctly…";
1269
+
1270
+ const actualLength = ellipsisText.length;
1193
1271
  console.log(
1194
- ` šŸ” Predicting duration with voice settings for voice '${voiceId}'...`
1272
+ ` šŸ“ Text length: ${actualLength} characters (with ellipsis punctuation)`
1195
1273
  );
1196
- console.log(` Settings: speed=${voiceSettings.speed}`);
1274
+ console.log(` šŸ”§ Expected behavior: Sentence-based chunking with ellipsis (…  )`);
1275
+ console.log(" āš ļø This test consumes credits!");
1197
1276
 
1198
- const response = await client.textToSpeech.predictDuration({
1277
+ if (actualLength <= 300) {
1278
+ console.log(` āŒ Text length ${actualLength} is <= 300, test may not trigger chunking`);
1279
+ }
1280
+
1281
+ const response = await client.textToSpeech.createSpeech({
1199
1282
  voiceId,
1200
- predictTTSDurationUsingCharacterRequest: {
1201
- text: "This is a duration test with adjusted speed.",
1202
- language: models.PredictTTSDurationUsingCharacterRequestLanguage.En,
1203
- voiceSettings,
1283
+ apiConvertTextToSpeechUsingCharacterRequest: {
1284
+ text: ellipsisText,
1285
+ language: models.APIConvertTextToSpeechUsingCharacterRequestLanguage.En,
1286
+ outputFormat:
1287
+ models.APIConvertTextToSpeechUsingCharacterRequestOutputFormat.Wav,
1288
+ style: "neutral",
1289
+ model: models.APIConvertTextToSpeechUsingCharacterRequestModel.SonaSpeech1,
1204
1290
  },
1205
1291
  });
1206
1292
 
1207
- console.log(` āœ… Predicted duration: ${response.duration}s`);
1293
+ if (response.result) {
1294
+ const audioData = await extractAudioData(response);
1295
+
1296
+ console.log(
1297
+ ` āœ… Ellipsis punctuation chunking TTS success: ${audioData.length} bytes`
1298
+ );
1299
+ console.log(` šŸŽÆ Text with ellipsis punctuation processed correctly!`);
1300
+
1301
+ const outputFile = "test_ellipsis_punctuation_speech_output.wav";
1302
+ fs.writeFileSync(outputFile, audioData);
1303
+ console.log(` šŸ’¾ Audio saved: ${outputFile}`);
1304
+
1305
+ const estimatedChunks = Math.ceil(actualLength / 300);
1306
+ console.log(` šŸ“Š Estimated chunks: ${estimatedChunks}`);
1307
+ }
1208
1308
 
1209
1309
  return [true, response];
1210
1310
  } catch (e: any) {
1211
- console.error(` āŒ Error: ${e.message || e}`);
1311
+ logDetailedError(e, "Ellipsis punctuation chunking");
1212
1312
  return [false, e];
1213
1313
  }
1214
1314
  }
1215
1315
 
1216
1316
  /**
1217
- * Test TTS streaming with voice settings
1317
+ * Test TTS streaming with long text
1218
1318
  */
1219
- async function testStreamSpeechWithVoiceSettings(
1319
+ async function testStreamSpeechLongText(
1220
1320
  voiceId: string | null
1221
1321
  ): Promise<[boolean, any]> {
1222
- console.log("šŸ“” TTS Streaming with Voice Settings Test");
1322
+ console.log("šŸ“” Long Text Streaming TTS Test");
1223
1323
 
1224
1324
  if (!voiceId) {
1225
1325
  console.log(" āš ļø No voice ID available");
@@ -1231,31 +1331,29 @@ async function testStreamSpeechWithVoiceSettings(
1231
1331
  const models = await import("../src/models/index.js");
1232
1332
  const client = new Supertone({ apiKey: API_KEY });
1233
1333
 
1234
- const voiceSettings = {
1235
- pitchShift: 1.05,
1236
- speed: 1.1,
1237
- };
1334
+ const longText = `
1335
+ Hello! This is a long text streaming test.
1336
+ The SDK automatically chunks and streams the audio in real-time.
1337
+ This enables efficient processing of longer content without waiting for complete generation.
1338
+ `
1339
+ .trim()
1340
+ .repeat(3);
1238
1341
 
1239
- console.log(
1240
- ` šŸ” Streaming speech with voice settings for voice '${voiceId}'...`
1241
- );
1242
- console.log(
1243
- ` Settings: pitchShift=${voiceSettings.pitchShift}, speed=${voiceSettings.speed}`
1244
- );
1342
+ console.log(` šŸ” Streaming long text with voice '${voiceId}'...`);
1343
+ console.log(` Text length: ${longText.length} characters`);
1245
1344
  console.log(" āš ļø This test consumes credits!");
1246
1345
 
1247
1346
  const response = await client.textToSpeech.streamSpeech({
1248
1347
  voiceId,
1249
1348
  apiConvertTextToSpeechUsingCharacterRequest: {
1250
- text: "Streaming with adjusted voice settings.",
1349
+ text: longText,
1251
1350
  language: models.APIConvertTextToSpeechUsingCharacterRequestLanguage.En,
1252
1351
  outputFormat:
1253
1352
  models.APIConvertTextToSpeechUsingCharacterRequestOutputFormat.Wav,
1254
- voiceSettings,
1255
1353
  },
1256
1354
  });
1257
1355
 
1258
- console.log(` āœ… Stream with voice settings started successfully`);
1356
+ console.log(` āœ… Stream started successfully`);
1259
1357
 
1260
1358
  return [true, response];
1261
1359
  } catch (e: any) {
@@ -1265,12 +1363,12 @@ async function testStreamSpeechWithVoiceSettings(
1265
1363
  }
1266
1364
 
1267
1365
  /**
1268
- * Test MP3 format TTS
1366
+ * Test TTS with voice settings
1269
1367
  */
1270
- async function testCreateSpeechMp3(
1368
+ async function testCreateSpeechWithVoiceSettings(
1271
1369
  voiceId: string | null
1272
1370
  ): Promise<[boolean, any]> {
1273
- console.log("šŸŽ¤ MP3 Format TTS Test");
1371
+ console.log("šŸŽ›ļø TTS with Voice Settings Test");
1274
1372
 
1275
1373
  if (!voiceId) {
1276
1374
  console.log(" āš ļø No voice ID available");
@@ -1282,39 +1380,1021 @@ async function testCreateSpeechMp3(
1282
1380
  const models = await import("../src/models/index.js");
1283
1381
  const client = new Supertone({ apiKey: API_KEY });
1284
1382
 
1285
- console.log(` šŸ” MP3 TTS conversion with voice '${voiceId}'...`);
1383
+ const voiceSettings = {
1384
+ pitchShift: 0.95,
1385
+ pitchVariance: 1.1,
1386
+ speed: 0.9,
1387
+ };
1388
+
1389
+ console.log(
1390
+ ` šŸ” TTS conversion with voice settings using voice '${voiceId}'...`
1391
+ );
1392
+ console.log(
1393
+ ` Settings: pitchShift=${voiceSettings.pitchShift}, speed=${voiceSettings.speed}`
1394
+ );
1286
1395
  console.log(" āš ļø This test consumes credits!");
1287
1396
 
1288
1397
  const response = await client.textToSpeech.createSpeech({
1289
1398
  voiceId,
1290
1399
  apiConvertTextToSpeechUsingCharacterRequest: {
1291
- text: "Hello! This is an MP3 format SDK test. Let's verify if it works correctly.",
1400
+ text: "Hello world! This is a voice settings test. You can hear the adjusted pitch and speed.",
1292
1401
  language: models.APIConvertTextToSpeechUsingCharacterRequestLanguage.En,
1293
1402
  outputFormat:
1294
- models.APIConvertTextToSpeechUsingCharacterRequestOutputFormat.Mp3,
1403
+ models.APIConvertTextToSpeechUsingCharacterRequestOutputFormat.Wav,
1295
1404
  style: "neutral",
1296
1405
  model: "sona_speech_1",
1406
+ voiceSettings,
1407
+ includePhonemes: false,
1297
1408
  },
1298
1409
  });
1299
1410
 
1300
- console.log(` āœ… MP3 TTS conversion success`);
1411
+ console.log(` āœ… TTS with voice settings success`);
1301
1412
 
1302
1413
  if (response.result) {
1303
- const outputFile = "test_create_speech_output.mp3";
1414
+ const outputFile = "test_voice_settings_speech_output.wav";
1304
1415
  const audioData = await extractAudioData(response);
1305
1416
 
1306
1417
  fs.writeFileSync(outputFile, audioData);
1307
- console.log(` šŸ’¾ MP3 audio file saved: ${outputFile}`);
1418
+ console.log(` šŸ’¾ Voice settings audio file saved: ${outputFile}`);
1419
+ }
1308
1420
 
1309
- // Verify MP3 header
1310
- const header = audioData.slice(0, 10);
1311
- if (header[0] === 0x49 && header[1] === 0x44 && header[2] === 0x33) {
1312
- console.log(` āœ… Valid MP3 file generated (ID3 tag)`);
1313
- } else if (
1314
- (header[0] === 0xff && header[1] === 0xfb) ||
1315
- (header[0] === 0xff && header[1] === 0xfa)
1316
- ) {
1317
- console.log(` āœ… Valid MP3 file generated (MPEG frame)`);
1421
+ return [true, response];
1422
+ } catch (e: any) {
1423
+ console.error(` āŒ Error: ${e.message || e}`);
1424
+ return [false, e];
1425
+ }
1426
+ }
1427
+
1428
+ /**
1429
+ * Test TTS with phoneme information
1430
+ */
1431
+ async function testCreateSpeechWithPhonemes(
1432
+ voiceId: string | null
1433
+ ): Promise<[boolean, any]> {
1434
+ console.log("šŸ”¤ TTS with Phoneme Information Test");
1435
+
1436
+ if (!voiceId) {
1437
+ console.log(" āš ļø No voice ID available");
1438
+ return [false, null];
1439
+ }
1440
+
1441
+ try {
1442
+ const { Supertone } = await import("../src/index.js");
1443
+ const models = await import("../src/models/index.js");
1444
+ const client = new Supertone({ apiKey: API_KEY });
1445
+
1446
+ console.log(
1447
+ ` šŸ” TTS conversion with phonemes using voice '${voiceId}'...`
1448
+ );
1449
+ console.log(" āš ļø This test consumes credits!");
1450
+
1451
+ const response = await client.textToSpeech.createSpeech({
1452
+ voiceId,
1453
+ apiConvertTextToSpeechUsingCharacterRequest: {
1454
+ text: "Hello world! This is a phoneme timing test.",
1455
+ language: models.APIConvertTextToSpeechUsingCharacterRequestLanguage.En,
1456
+ outputFormat:
1457
+ models.APIConvertTextToSpeechUsingCharacterRequestOutputFormat.Wav,
1458
+ style: "neutral",
1459
+ model: "sona_speech_1",
1460
+ includePhonemes: true,
1461
+ },
1462
+ });
1463
+
1464
+ console.log(` āœ… TTS with phonemes success`);
1465
+
1466
+ if (response.result) {
1467
+ const outputFile = "test_phoneme_speech_output.wav";
1468
+
1469
+ // Check if response is JSON with phonemes data
1470
+ if (
1471
+ typeof response.result === "object" &&
1472
+ "audioBase64" in response.result
1473
+ ) {
1474
+ const audioData = await extractAudioData(response);
1475
+ fs.writeFileSync(outputFile, audioData);
1476
+ console.log(` šŸ’¾ Phoneme audio file saved: ${outputFile}`);
1477
+
1478
+ // Display phoneme information as JSON
1479
+ const phonemes = (response.result as any).phonemes;
1480
+ if (phonemes) {
1481
+ console.log(` šŸ“Š Phoneme data (JSON):`);
1482
+ console.log(JSON.stringify(phonemes, null, 2));
1483
+ console.log(` šŸ“ˆ Summary:`);
1484
+ console.log(` Symbols count: ${phonemes.symbols?.length || 0}`);
1485
+ console.log(
1486
+ ` Durations count: ${phonemes.durations_seconds?.length || 0}`
1487
+ );
1488
+ console.log(
1489
+ ` Start times count: ${
1490
+ phonemes.start_times_seconds?.length || 0
1491
+ }`
1492
+ );
1493
+ if (phonemes.symbols && phonemes.symbols.length > 0) {
1494
+ console.log(
1495
+ ` First 5 symbols: ${phonemes.symbols.slice(0, 5).join(", ")}`
1496
+ );
1497
+ }
1498
+ }
1499
+ } else {
1500
+ // Binary audio without phonemes
1501
+ const audioData = await extractAudioData(response);
1502
+ fs.writeFileSync(outputFile, audioData);
1503
+ console.log(` šŸ’¾ Phoneme audio file saved: ${outputFile}`);
1504
+ }
1505
+ }
1506
+
1507
+ return [true, response];
1508
+ } catch (e: any) {
1509
+ console.error(` āŒ Error: ${e.message || e}`);
1510
+ return [false, e];
1511
+ }
1512
+ }
1513
+
1514
+ /**
1515
+ * Test TTS streaming with phonemes
1516
+ */
1517
+ async function testStreamSpeechWithPhonemes(
1518
+ voiceId: string | null
1519
+ ): Promise<[boolean, any]> {
1520
+ console.log("šŸ“” TTS Streaming with Phonemes Test");
1521
+
1522
+ if (!voiceId) {
1523
+ console.log(" āš ļø No voice ID available");
1524
+ return [false, null];
1525
+ }
1526
+
1527
+ try {
1528
+ const { Supertone } = await import("../src/index.js");
1529
+ const models = await import("../src/models/index.js");
1530
+ const client = new Supertone({ apiKey: API_KEY });
1531
+
1532
+ console.log(
1533
+ ` šŸ” Streaming speech with phonemes for voice '${voiceId}'...`
1534
+ );
1535
+ console.log(" āš ļø This test consumes credits!");
1536
+
1537
+ const response = await client.textToSpeech.streamSpeech({
1538
+ voiceId,
1539
+ apiConvertTextToSpeechUsingCharacterRequest: {
1540
+ text: "Streaming with phoneme timing information.",
1541
+ language: models.APIConvertTextToSpeechUsingCharacterRequestLanguage.En,
1542
+ outputFormat:
1543
+ models.APIConvertTextToSpeechUsingCharacterRequestOutputFormat.Wav,
1544
+ includePhonemes: true,
1545
+ },
1546
+ });
1547
+
1548
+ console.log(` āœ… Stream with phonemes started successfully`);
1549
+
1550
+ return [true, response];
1551
+ } catch (e: any) {
1552
+ console.error(` āŒ Error: ${e.message || e}`);
1553
+ return [false, e];
1554
+ }
1555
+ }
1556
+
1557
+ // =============================================================================
1558
+ // Model & Language Compatibility Tests
1559
+ // =============================================================================
1560
+
1561
+ /**
1562
+ * Model-Language compatibility matrix
1563
+ * - sona_speech_1: ko, en, ja
1564
+ * - sona_speech_2: all languages (23 languages)
1565
+ * - supertonic_api_1: ko, en, ja, es, pt
1566
+ */
1567
+ const MODEL_LANGUAGE_MATRIX = {
1568
+ sona_speech_1: ["ko", "en", "ja"],
1569
+ sona_speech_2: [
1570
+ "en",
1571
+ "ko",
1572
+ "ja",
1573
+ "bg",
1574
+ "cs",
1575
+ "da",
1576
+ "el",
1577
+ "es",
1578
+ "et",
1579
+ "fi",
1580
+ "hu",
1581
+ "it",
1582
+ "nl",
1583
+ "pl",
1584
+ "pt",
1585
+ "ro",
1586
+ "ar",
1587
+ "de",
1588
+ "fr",
1589
+ "hi",
1590
+ "id",
1591
+ "ru",
1592
+ "vi",
1593
+ ],
1594
+ supertonic_api_1: ["ko", "en", "ja", "es", "pt"],
1595
+ } as const;
1596
+
1597
+ /**
1598
+ * Test TTS with sona_speech_2 model
1599
+ */
1600
+ async function testCreateSpeechWithSonaSpeech2(
1601
+ voiceId: string | null
1602
+ ): Promise<[boolean, any]> {
1603
+ console.log("šŸ¤– TTS with sona_speech_2 Model Test");
1604
+
1605
+ if (!voiceId) {
1606
+ console.log(" āš ļø No voice ID available");
1607
+ return [false, null];
1608
+ }
1609
+
1610
+ try {
1611
+ const { Supertone } = await import("../src/index.js");
1612
+ const models = await import("../src/models/index.js");
1613
+ const client = new Supertone({ apiKey: API_KEY });
1614
+
1615
+ const testText =
1616
+ "Hello! Testing sona_speech_2 model for text-to-speech conversion.";
1617
+ console.log(` šŸ” Creating speech with sona_speech_2 model`);
1618
+ console.log(` Voice ID: ${voiceId}`);
1619
+ console.log(` Model: sona_speech_2`);
1620
+ console.log(" āš ļø This test consumes credits!");
1621
+
1622
+ const response = await client.textToSpeech.createSpeech({
1623
+ voiceId,
1624
+ apiConvertTextToSpeechUsingCharacterRequest: {
1625
+ text: testText,
1626
+ language: models.APIConvertTextToSpeechUsingCharacterRequestLanguage.En,
1627
+ outputFormat:
1628
+ models.APIConvertTextToSpeechUsingCharacterRequestOutputFormat.Wav,
1629
+ model:
1630
+ models.APIConvertTextToSpeechUsingCharacterRequestModel.SonaSpeech2,
1631
+ },
1632
+ });
1633
+
1634
+ console.log(` āœ… sona_speech_2 TTS success`);
1635
+
1636
+ if (response.result) {
1637
+ const audioData = await extractAudioData(response);
1638
+ const outputFile = "test_sona_speech_2_output.wav";
1639
+ fs.writeFileSync(outputFile, audioData);
1640
+ console.log(
1641
+ ` šŸ’¾ Audio saved: ${outputFile} (${audioData.length} bytes)`
1642
+ );
1643
+ }
1644
+
1645
+ return [true, response];
1646
+ } catch (e: any) {
1647
+ logDetailedError(e, "sona_speech_2 TTS");
1648
+ return [false, e];
1649
+ }
1650
+ }
1651
+
1652
+ /**
1653
+ * Test TTS with supertonic_api_1 model
1654
+ */
1655
+ async function testCreateSpeechWithSupertonicApi1(
1656
+ voiceId: string | null
1657
+ ): Promise<[boolean, any]> {
1658
+ console.log("šŸ¤– TTS with supertonic_api_1 Model Test");
1659
+
1660
+ if (!voiceId) {
1661
+ console.log(" āš ļø No voice ID available");
1662
+ return [false, null];
1663
+ }
1664
+
1665
+ try {
1666
+ const { Supertone } = await import("../src/index.js");
1667
+ const models = await import("../src/models/index.js");
1668
+ const client = new Supertone({ apiKey: API_KEY });
1669
+
1670
+ const testText =
1671
+ "Hello! Testing supertonic_api_1 model for text-to-speech conversion.";
1672
+ console.log(` šŸ” Creating speech with supertonic_api_1 model`);
1673
+ console.log(` Voice ID: ${voiceId}`);
1674
+ console.log(` Model: supertonic_api_1`);
1675
+ console.log(" āš ļø This test consumes credits!");
1676
+
1677
+ const response = await client.textToSpeech.createSpeech({
1678
+ voiceId,
1679
+ apiConvertTextToSpeechUsingCharacterRequest: {
1680
+ text: testText,
1681
+ language: models.APIConvertTextToSpeechUsingCharacterRequestLanguage.En,
1682
+ outputFormat:
1683
+ models.APIConvertTextToSpeechUsingCharacterRequestOutputFormat.Wav,
1684
+ model:
1685
+ models.APIConvertTextToSpeechUsingCharacterRequestModel
1686
+ .SupertonicApi1,
1687
+ },
1688
+ });
1689
+
1690
+ console.log(` āœ… supertonic_api_1 TTS success`);
1691
+
1692
+ if (response.result) {
1693
+ const audioData = await extractAudioData(response);
1694
+ const outputFile = "test_supertonic_api_1_output.wav";
1695
+ fs.writeFileSync(outputFile, audioData);
1696
+ console.log(
1697
+ ` šŸ’¾ Audio saved: ${outputFile} (${audioData.length} bytes)`
1698
+ );
1699
+ }
1700
+
1701
+ return [true, response];
1702
+ } catch (e: any) {
1703
+ logDetailedError(e, "supertonic_api_1 TTS");
1704
+ return [false, e];
1705
+ }
1706
+ }
1707
+
1708
+ /**
1709
+ * Test TTS with unsupported model (should fail with validation error)
1710
+ */
1711
+ async function testCreateSpeechWithUnsupportedModel(
1712
+ voiceId: string | null
1713
+ ): Promise<[boolean, any]> {
1714
+ console.log("🚫 TTS with Unsupported Model Test (Expected to Fail)");
1715
+
1716
+ if (!voiceId) {
1717
+ console.log(" āš ļø No voice ID available");
1718
+ return [false, null];
1719
+ }
1720
+
1721
+ try {
1722
+ const { Supertone } = await import("../src/index.js");
1723
+ const models = await import("../src/models/index.js");
1724
+ const client = new Supertone({ apiKey: API_KEY });
1725
+
1726
+ const testText = "This should fail with unsupported model.";
1727
+ console.log(
1728
+ ` šŸ” Attempting TTS with unsupported model: 'invalid_model_xyz'`
1729
+ );
1730
+
1731
+ // Using type assertion to bypass TypeScript validation for testing
1732
+ const response = await client.textToSpeech.createSpeech({
1733
+ voiceId,
1734
+ apiConvertTextToSpeechUsingCharacterRequest: {
1735
+ text: testText,
1736
+ language: models.APIConvertTextToSpeechUsingCharacterRequestLanguage.En,
1737
+ outputFormat:
1738
+ models.APIConvertTextToSpeechUsingCharacterRequestOutputFormat.Wav,
1739
+ model: "invalid_model_xyz" as any, // Intentionally invalid model
1740
+ },
1741
+ });
1742
+
1743
+ // If we reach here, the test failed (should have thrown an error)
1744
+ console.log(` āŒ Expected error but got success - this is unexpected!`);
1745
+ return [false, response];
1746
+ } catch (e: any) {
1747
+ // Expected to fail - this is the success case for this test
1748
+ console.log(` āœ… Correctly rejected unsupported model`);
1749
+ console.log(` šŸ“‹ Error type: ${e.constructor?.name || typeof e}`);
1750
+ console.log(` šŸ“‹ Error message: ${e.message?.substring(0, 100) || e}`);
1751
+ return [true, e];
1752
+ }
1753
+ }
1754
+
1755
+ /**
1756
+ * Test prediction with sona_speech_2 model
1757
+ */
1758
+ async function testPredictDurationWithSonaSpeech2(
1759
+ voiceId: string | null
1760
+ ): Promise<[boolean, any]> {
1761
+ console.log("ā±ļø Duration Prediction with sona_speech_2 Model Test");
1762
+
1763
+ if (!voiceId) {
1764
+ console.log(" āš ļø No voice ID available");
1765
+ return [false, null];
1766
+ }
1767
+
1768
+ try {
1769
+ const { Supertone } = await import("../src/index.js");
1770
+ const models = await import("../src/models/index.js");
1771
+ const client = new Supertone({ apiKey: API_KEY });
1772
+
1773
+ const testText = "Testing duration prediction with sona_speech_2 model.";
1774
+ console.log(` šŸ” Predicting duration with sona_speech_2 model`);
1775
+
1776
+ const response = await client.textToSpeech.predictDuration({
1777
+ voiceId,
1778
+ predictTTSDurationUsingCharacterRequest: {
1779
+ text: testText,
1780
+ language: models.PredictTTSDurationUsingCharacterRequestLanguage.En,
1781
+ model: models.PredictTTSDurationUsingCharacterRequestModel.SonaSpeech2,
1782
+ },
1783
+ });
1784
+
1785
+ console.log(
1786
+ ` āœ… sona_speech_2 duration prediction: ${response.duration}s`
1787
+ );
1788
+ return [true, response];
1789
+ } catch (e: any) {
1790
+ logDetailedError(e, "sona_speech_2 duration prediction");
1791
+ return [false, e];
1792
+ }
1793
+ }
1794
+
1795
+ /**
1796
+ * Test prediction with supertonic_api_1 model
1797
+ */
1798
+ async function testPredictDurationWithSupertonicApi1(
1799
+ voiceId: string | null
1800
+ ): Promise<[boolean, any]> {
1801
+ console.log("ā±ļø Duration Prediction with supertonic_api_1 Model Test");
1802
+
1803
+ if (!voiceId) {
1804
+ console.log(" āš ļø No voice ID available");
1805
+ return [false, null];
1806
+ }
1807
+
1808
+ try {
1809
+ const { Supertone } = await import("../src/index.js");
1810
+ const models = await import("../src/models/index.js");
1811
+ const client = new Supertone({ apiKey: API_KEY });
1812
+
1813
+ const testText = "Testing duration prediction with supertonic_api_1 model.";
1814
+ console.log(` šŸ” Predicting duration with supertonic_api_1 model`);
1815
+
1816
+ const response = await client.textToSpeech.predictDuration({
1817
+ voiceId,
1818
+ predictTTSDurationUsingCharacterRequest: {
1819
+ text: testText,
1820
+ language: models.PredictTTSDurationUsingCharacterRequestLanguage.En,
1821
+ model:
1822
+ models.PredictTTSDurationUsingCharacterRequestModel.SupertonicApi1,
1823
+ },
1824
+ });
1825
+
1826
+ console.log(
1827
+ ` āœ… supertonic_api_1 duration prediction: ${response.duration}s`
1828
+ );
1829
+ return [true, response];
1830
+ } catch (e: any) {
1831
+ logDetailedError(e, "supertonic_api_1 duration prediction");
1832
+ return [false, e];
1833
+ }
1834
+ }
1835
+
1836
+ /**
1837
+ * Test prediction with unsupported model (should fail with validation error)
1838
+ */
1839
+ async function testPredictDurationWithUnsupportedModel(
1840
+ voiceId: string | null
1841
+ ): Promise<[boolean, any]> {
1842
+ console.log(
1843
+ "🚫 Duration Prediction with Unsupported Model Test (Expected to Fail)"
1844
+ );
1845
+
1846
+ if (!voiceId) {
1847
+ console.log(" āš ļø No voice ID available");
1848
+ return [false, null];
1849
+ }
1850
+
1851
+ try {
1852
+ const { Supertone } = await import("../src/index.js");
1853
+ const models = await import("../src/models/index.js");
1854
+ const client = new Supertone({ apiKey: API_KEY });
1855
+
1856
+ const testText = "This should fail with unsupported model.";
1857
+ console.log(
1858
+ ` šŸ” Attempting prediction with unsupported model: 'invalid_model_xyz'`
1859
+ );
1860
+
1861
+ const response = await client.textToSpeech.predictDuration({
1862
+ voiceId,
1863
+ predictTTSDurationUsingCharacterRequest: {
1864
+ text: testText,
1865
+ language: models.PredictTTSDurationUsingCharacterRequestLanguage.En,
1866
+ model: "invalid_model_xyz" as any, // Intentionally invalid model
1867
+ },
1868
+ });
1869
+
1870
+ console.log(` āŒ Expected error but got success - this is unexpected!`);
1871
+ return [false, response];
1872
+ } catch (e: any) {
1873
+ console.log(` āœ… Correctly rejected unsupported model`);
1874
+ console.log(` šŸ“‹ Error type: ${e.constructor?.name || typeof e}`);
1875
+ console.log(` šŸ“‹ Error message: ${e.message?.substring(0, 100) || e}`);
1876
+ return [true, e];
1877
+ }
1878
+ }
1879
+
1880
+ // =============================================================================
1881
+ // Multilingual Tests per Model
1882
+ // =============================================================================
1883
+
1884
+ /**
1885
+ * Test TTS multilingual support with sona_speech_1 (supports: ko, en, ja)
1886
+ */
1887
+ async function testMultilingualSonaSpeech1(
1888
+ voiceId: string | null
1889
+ ): Promise<[boolean, any]> {
1890
+ console.log("šŸŒ Multilingual Test - sona_speech_1 (ko, en, ja)");
1891
+
1892
+ if (!voiceId) {
1893
+ console.log(" āš ļø No voice ID available");
1894
+ return [false, null];
1895
+ }
1896
+
1897
+ const testCases = [
1898
+ {
1899
+ lang: "ko" as const,
1900
+ text: "안녕하세요, 소나 스피치 원 모델입니다.",
1901
+ label: "Korean",
1902
+ },
1903
+ {
1904
+ lang: "en" as const,
1905
+ text: "Hello, this is sona_speech_1 model.",
1906
+ label: "English",
1907
+ },
1908
+ {
1909
+ lang: "ja" as const,
1910
+ text: "こんにちは、ソナスピーチワンモデルです。",
1911
+ label: "Japanese",
1912
+ },
1913
+ ];
1914
+
1915
+ try {
1916
+ const { Supertone } = await import("../src/index.js");
1917
+ const models = await import("../src/models/index.js");
1918
+ const client = new Supertone({ apiKey: API_KEY });
1919
+
1920
+ let allPassed = true;
1921
+ const results: any[] = [];
1922
+
1923
+ for (const tc of testCases) {
1924
+ console.log(` šŸ” Testing ${tc.label} (${tc.lang})...`);
1925
+
1926
+ try {
1927
+ const langEnum =
1928
+ models.APIConvertTextToSpeechUsingCharacterRequestLanguage[
1929
+ (tc.lang.charAt(0).toUpperCase() +
1930
+ tc.lang.slice(
1931
+ 1
1932
+ )) as keyof typeof models.APIConvertTextToSpeechUsingCharacterRequestLanguage
1933
+ ];
1934
+
1935
+ const response = await client.textToSpeech.createSpeech({
1936
+ voiceId,
1937
+ apiConvertTextToSpeechUsingCharacterRequest: {
1938
+ text: tc.text,
1939
+ language: langEnum,
1940
+ outputFormat:
1941
+ models.APIConvertTextToSpeechUsingCharacterRequestOutputFormat
1942
+ .Wav,
1943
+ model:
1944
+ models.APIConvertTextToSpeechUsingCharacterRequestModel
1945
+ .SonaSpeech1,
1946
+ },
1947
+ });
1948
+
1949
+ console.log(` āœ… ${tc.label} success`);
1950
+ results.push({ lang: tc.lang, success: true });
1951
+ } catch (e: any) {
1952
+ console.log(
1953
+ ` āŒ ${tc.label} failed: ${e.message?.substring(0, 50)}`
1954
+ );
1955
+ results.push({ lang: tc.lang, success: false, error: e.message });
1956
+ allPassed = false;
1957
+ }
1958
+ }
1959
+
1960
+ console.log(
1961
+ ` šŸ“Š Result: ${results.filter((r) => r.success).length}/${
1962
+ testCases.length
1963
+ } languages passed`
1964
+ );
1965
+ return [allPassed, results];
1966
+ } catch (e: any) {
1967
+ logDetailedError(e, "sona_speech_1 multilingual");
1968
+ return [false, e];
1969
+ }
1970
+ }
1971
+
1972
+ /**
1973
+ * Test TTS multilingual support with sona_speech_2 (supports all languages)
1974
+ */
1975
+ async function testMultilingualSonaSpeech2(
1976
+ voiceId: string | null
1977
+ ): Promise<[boolean, any]> {
1978
+ console.log("šŸŒ Multilingual Test - sona_speech_2 (all languages sample)");
1979
+
1980
+ if (!voiceId) {
1981
+ console.log(" āš ļø No voice ID available");
1982
+ return [false, null];
1983
+ }
1984
+
1985
+ // Test a diverse subset of languages
1986
+ const testCases = [
1987
+ { lang: "Ko" as const, text: "안녕하세요.", label: "Korean" },
1988
+ { lang: "En" as const, text: "Hello.", label: "English" },
1989
+ { lang: "Ja" as const, text: "こんにちは。", label: "Japanese" },
1990
+ { lang: "Es" as const, text: "Hola.", label: "Spanish" },
1991
+ { lang: "Fr" as const, text: "Bonjour.", label: "French" },
1992
+ { lang: "De" as const, text: "Hallo.", label: "German" },
1993
+ { lang: "Ar" as const, text: "مرحبا.", label: "Arabic" },
1994
+ { lang: "Hi" as const, text: "नमस्ते।", label: "Hindi" },
1995
+ ];
1996
+
1997
+ try {
1998
+ const { Supertone } = await import("../src/index.js");
1999
+ const models = await import("../src/models/index.js");
2000
+ const client = new Supertone({ apiKey: API_KEY });
2001
+
2002
+ let allPassed = true;
2003
+ const results: any[] = [];
2004
+
2005
+ for (const tc of testCases) {
2006
+ console.log(` šŸ” Testing ${tc.label} (${tc.lang})...`);
2007
+
2008
+ try {
2009
+ const langEnum =
2010
+ models.APIConvertTextToSpeechUsingCharacterRequestLanguage[tc.lang];
2011
+
2012
+ const response = await client.textToSpeech.createSpeech({
2013
+ voiceId,
2014
+ apiConvertTextToSpeechUsingCharacterRequest: {
2015
+ text: tc.text,
2016
+ language: langEnum,
2017
+ outputFormat:
2018
+ models.APIConvertTextToSpeechUsingCharacterRequestOutputFormat
2019
+ .Wav,
2020
+ model:
2021
+ models.APIConvertTextToSpeechUsingCharacterRequestModel
2022
+ .SonaSpeech2,
2023
+ },
2024
+ });
2025
+
2026
+ console.log(` āœ… ${tc.label} success`);
2027
+ results.push({ lang: tc.lang, success: true });
2028
+ } catch (e: any) {
2029
+ console.log(
2030
+ ` āŒ ${tc.label} failed: ${e.message?.substring(0, 50)}`
2031
+ );
2032
+ results.push({ lang: tc.lang, success: false, error: e.message });
2033
+ allPassed = false;
2034
+ }
2035
+ }
2036
+
2037
+ console.log(
2038
+ ` šŸ“Š Result: ${results.filter((r) => r.success).length}/${
2039
+ testCases.length
2040
+ } languages passed`
2041
+ );
2042
+ return [allPassed, results];
2043
+ } catch (e: any) {
2044
+ logDetailedError(e, "sona_speech_2 multilingual");
2045
+ return [false, e];
2046
+ }
2047
+ }
2048
+
2049
+ /**
2050
+ * Test TTS multilingual support with supertonic_api_1 (supports: ko, en, ja, es, pt)
2051
+ */
2052
+ async function testMultilingualSupertonicApi1(
2053
+ voiceId: string | null
2054
+ ): Promise<[boolean, any]> {
2055
+ console.log("šŸŒ Multilingual Test - supertonic_api_1 (ko, en, ja, es, pt)");
2056
+
2057
+ if (!voiceId) {
2058
+ console.log(" āš ļø No voice ID available");
2059
+ return [false, null];
2060
+ }
2061
+
2062
+ const testCases = [
2063
+ {
2064
+ lang: "Ko" as const,
2065
+ text: "안녕하세요, 슈퍼토닉 API 원 모델입니다.",
2066
+ label: "Korean",
2067
+ },
2068
+ {
2069
+ lang: "En" as const,
2070
+ text: "Hello, this is supertonic_api_1 model.",
2071
+ label: "English",
2072
+ },
2073
+ {
2074
+ lang: "Ja" as const,
2075
+ text: "こんにちは、スーパートニックAPIワンです。",
2076
+ label: "Japanese",
2077
+ },
2078
+ {
2079
+ lang: "Es" as const,
2080
+ text: "Hola, este es el modelo supertonic_api_1.",
2081
+ label: "Spanish",
2082
+ },
2083
+ {
2084
+ lang: "Pt" as const,
2085
+ text: "Olá, este é o modelo supertonic_api_1.",
2086
+ label: "Portuguese",
2087
+ },
2088
+ ];
2089
+
2090
+ try {
2091
+ const { Supertone } = await import("../src/index.js");
2092
+ const models = await import("../src/models/index.js");
2093
+ const client = new Supertone({ apiKey: API_KEY });
2094
+
2095
+ let allPassed = true;
2096
+ const results: any[] = [];
2097
+
2098
+ for (const tc of testCases) {
2099
+ console.log(` šŸ” Testing ${tc.label} (${tc.lang})...`);
2100
+
2101
+ try {
2102
+ const langEnum =
2103
+ models.APIConvertTextToSpeechUsingCharacterRequestLanguage[tc.lang];
2104
+
2105
+ const response = await client.textToSpeech.createSpeech({
2106
+ voiceId,
2107
+ apiConvertTextToSpeechUsingCharacterRequest: {
2108
+ text: tc.text,
2109
+ language: langEnum,
2110
+ outputFormat:
2111
+ models.APIConvertTextToSpeechUsingCharacterRequestOutputFormat
2112
+ .Wav,
2113
+ model:
2114
+ models.APIConvertTextToSpeechUsingCharacterRequestModel
2115
+ .SupertonicApi1,
2116
+ },
2117
+ });
2118
+
2119
+ console.log(` āœ… ${tc.label} success`);
2120
+ results.push({ lang: tc.lang, success: true });
2121
+ } catch (e: any) {
2122
+ console.log(
2123
+ ` āŒ ${tc.label} failed: ${e.message?.substring(0, 50)}`
2124
+ );
2125
+ results.push({ lang: tc.lang, success: false, error: e.message });
2126
+ allPassed = false;
2127
+ }
2128
+ }
2129
+
2130
+ console.log(
2131
+ ` šŸ“Š Result: ${results.filter((r) => r.success).length}/${
2132
+ testCases.length
2133
+ } languages passed`
2134
+ );
2135
+ return [allPassed, results];
2136
+ } catch (e: any) {
2137
+ logDetailedError(e, "supertonic_api_1 multilingual");
2138
+ return [false, e];
2139
+ }
2140
+ }
2141
+
2142
+ /**
2143
+ * Test unsupported language for sona_speech_1 (should fail with French)
2144
+ */
2145
+ async function testUnsupportedLanguageSonaSpeech1(
2146
+ voiceId: string | null
2147
+ ): Promise<[boolean, any]> {
2148
+ console.log(
2149
+ "🚫 Unsupported Language Test - sona_speech_1 with French (Expected to Fail)"
2150
+ );
2151
+
2152
+ if (!voiceId) {
2153
+ console.log(" āš ļø No voice ID available");
2154
+ return [false, null];
2155
+ }
2156
+
2157
+ try {
2158
+ const { Supertone } = await import("../src/index.js");
2159
+ const models = await import("../src/models/index.js");
2160
+ const client = new Supertone({ apiKey: API_KEY });
2161
+
2162
+ console.log(` šŸ” Attempting sona_speech_1 with French (unsupported)`);
2163
+
2164
+ const response = await client.textToSpeech.createSpeech({
2165
+ voiceId,
2166
+ apiConvertTextToSpeechUsingCharacterRequest: {
2167
+ text: "Bonjour, ceci est un test.",
2168
+ language: models.APIConvertTextToSpeechUsingCharacterRequestLanguage.Fr, // French - not supported by sona_speech_1
2169
+ outputFormat:
2170
+ models.APIConvertTextToSpeechUsingCharacterRequestOutputFormat.Wav,
2171
+ model:
2172
+ models.APIConvertTextToSpeechUsingCharacterRequestModel.SonaSpeech1,
2173
+ },
2174
+ });
2175
+
2176
+ // If we reach here, the API didn't reject - may need server-side validation
2177
+ console.log(
2178
+ ` āš ļø API accepted the request - server-side validation may not enforce language restriction`
2179
+ );
2180
+ console.log(
2181
+ ` šŸ“‹ Note: Language restriction may be enforced at API level, not SDK level`
2182
+ );
2183
+ return [
2184
+ true,
2185
+ { note: "API accepted - language restriction may be server-side" },
2186
+ ];
2187
+ } catch (e: any) {
2188
+ console.log(
2189
+ ` āœ… Correctly rejected unsupported language for sona_speech_1`
2190
+ );
2191
+ console.log(` šŸ“‹ Error: ${e.message?.substring(0, 100)}`);
2192
+ return [true, e];
2193
+ }
2194
+ }
2195
+
2196
/**
 * Test unsupported language for supertonic_api_1 (German is expected to be rejected).
 *
 * The outcome of the API call itself is treated leniently: the test passes
 * both when the request throws (logged as a correct rejection) and when it is
 * accepted (logged as a warning, since the restriction may be enforced
 * elsewhere). The test fails when no voice ID is available, or when the SDK
 * cannot be loaded or the client cannot be constructed. Such setup failures
 * must not be mistaken for a language rejection.
 *
 * @param voiceId - Voice to synthesize with; a falsy value fails the test up front.
 * @returns `[passed, detail]`. `detail` is the thrown value when the request
 *   was rejected, a note object when it was accepted, the setup error when the
 *   SDK could not be prepared, or `null` when no voice ID was supplied.
 */
async function testUnsupportedLanguageSupertonicApi1(
  voiceId: string | null
): Promise<[boolean, any]> {
  console.log(
    "🚫 Unsupported Language Test - supertonic_api_1 with German (Expected to Fail)"
  );

  if (!voiceId) {
    console.log(" āš ļø No voice ID available");
    return [false, null];
  }

  // Caught values are `unknown`; narrow before reading a message from them.
  const describe = (err: unknown): string =>
    err instanceof Error ? err.message : String(err);

  try {
    const { Supertone } = await import("../src/index.js");
    const models = await import("../src/models/index.js");
    const client = new Supertone({ apiKey: API_KEY });

    console.log(` šŸ” Attempting supertonic_api_1 with German (unsupported)`);

    // Only the request itself is allowed to count as a "rejection". Import or
    // construction errors fall through to the outer catch and fail the test.
    try {
      await client.textToSpeech.createSpeech({
        voiceId,
        apiConvertTextToSpeechUsingCharacterRequest: {
          text: "Hallo, das ist ein Test.",
          language: models.APIConvertTextToSpeechUsingCharacterRequestLanguage.De, // German - not supported by supertonic_api_1
          outputFormat:
            models.APIConvertTextToSpeechUsingCharacterRequestOutputFormat.Wav,
          model:
            models.APIConvertTextToSpeechUsingCharacterRequestModel
              .SupertonicApi1,
        },
      });
    } catch (e: unknown) {
      console.log(
        ` āœ… Correctly rejected unsupported language for supertonic_api_1`
      );
      // Keep the log short; error payloads can be long.
      console.log(` šŸ“‹ Error: ${describe(e).substring(0, 100)}`);
      return [true, e];
    }

    // If we reach here, the API didn't reject - may need server-side validation
    console.log(
      ` āš ļø API accepted the request - server-side validation may not enforce language restriction`
    );
    console.log(
      ` šŸ“‹ Note: Language restriction may be enforced at API level, not SDK level`
    );
    return [
      true,
      { note: "API accepted - language restriction may be server-side" },
    ];
  } catch (e: unknown) {
    // SDK import or client construction failed: this is a broken test setup,
    // not evidence that the language was rejected.
    console.error(` āŒ Error: ${describe(e)}`);
    return [false, e];
  }
}
2250
+
2251
/**
 * Duration prediction test that sends custom voice settings (speed only).
 *
 * @param voiceId - Voice to predict for; a falsy value fails the test up front.
 * @returns `[true, response]` when the prediction call resolves,
 *   `[false, error]` when anything in the attempt throws, and
 *   `[false, null]` when no voice ID was supplied.
 */
async function testPredictDurationWithVoiceSettings(
  voiceId: string | null
): Promise<[boolean, any]> {
  console.log("ā±ļø Duration Prediction with Voice Settings Test");

  if (!voiceId) {
    console.log(" āš ļø No voice ID available");
    return [false, null];
  }

  try {
    const sdk = await import("../src/index.js");
    const { PredictTTSDurationUsingCharacterRequestLanguage: Language } =
      await import("../src/models/index.js");
    const supertone = new sdk.Supertone({ apiKey: API_KEY });

    // Slower-than-default speed; logged first so the run output shows what was sent.
    const speed = 0.8;

    console.log(
      ` šŸ” Predicting duration with voice settings for voice '${voiceId}'...`
    );
    console.log(` Settings: speed=${speed}`);

    const prediction = await supertone.textToSpeech.predictDuration({
      voiceId,
      predictTTSDurationUsingCharacterRequest: {
        text: "This is a duration test with adjusted speed.",
        language: Language.En,
        voiceSettings: { speed },
      },
    });

    console.log(` āœ… Predicted duration: ${prediction.duration}s`);

    return [true, prediction];
  } catch (e: any) {
    const detail = e.message || e;
    console.error(` āŒ Error: ${detail}`);
    return [false, e];
  }
}
2295
+
2296
/**
 * Streaming TTS test that sends custom voice settings (pitch shift and speed).
 *
 * Only the start of the stream is checked: the function returns as soon as
 * `streamSpeech` resolves and does not read the returned stream itself.
 *
 * @param voiceId - Voice to synthesize with; a falsy value fails the test up front.
 * @returns `[true, response]` when the streaming call resolves,
 *   `[false, error]` when anything in the attempt throws, and
 *   `[false, null]` when no voice ID was supplied.
 */
async function testStreamSpeechWithVoiceSettings(
  voiceId: string | null
): Promise<[boolean, any]> {
  console.log("šŸ“” TTS Streaming with Voice Settings Test");

  if (!voiceId) {
    console.log(" āš ļø No voice ID available");
    return [false, null];
  }

  try {
    const sdk = await import("../src/index.js");
    const {
      APIConvertTextToSpeechUsingCharacterRequestLanguage: Language,
      APIConvertTextToSpeechUsingCharacterRequestOutputFormat: OutputFormat,
    } = await import("../src/models/index.js");
    const supertone = new sdk.Supertone({ apiKey: API_KEY });

    const voiceSettings = {
      pitchShift: 1.05,
      speed: 1.1,
    };
    const { pitchShift, speed } = voiceSettings;

    console.log(
      ` šŸ” Streaming speech with voice settings for voice '${voiceId}'...`
    );
    console.log(` Settings: pitchShift=${pitchShift}, speed=${speed}`);
    console.log(" āš ļø This test consumes credits!");

    const stream = await supertone.textToSpeech.streamSpeech({
      voiceId,
      apiConvertTextToSpeechUsingCharacterRequest: {
        text: "Streaming with adjusted voice settings.",
        language: Language.En,
        outputFormat: OutputFormat.Wav,
        voiceSettings,
      },
    });

    console.log(` āœ… Stream with voice settings started successfully`);

    return [true, stream];
  } catch (e: any) {
    const detail = e.message || e;
    console.error(` āŒ Error: ${detail}`);
    return [false, e];
  }
}
2346
+
2347
+ /**
2348
+ * Test MP3 format TTS
2349
+ */
2350
+ async function testCreateSpeechMp3(
2351
+ voiceId: string | null
2352
+ ): Promise<[boolean, any]> {
2353
+ console.log("šŸŽ¤ MP3 Format TTS Test");
2354
+
2355
+ if (!voiceId) {
2356
+ console.log(" āš ļø No voice ID available");
2357
+ return [false, null];
2358
+ }
2359
+
2360
+ try {
2361
+ const { Supertone } = await import("../src/index.js");
2362
+ const models = await import("../src/models/index.js");
2363
+ const client = new Supertone({ apiKey: API_KEY });
2364
+
2365
+ console.log(` šŸ” MP3 TTS conversion with voice '${voiceId}'...`);
2366
+ console.log(" āš ļø This test consumes credits!");
2367
+
2368
+ const response = await client.textToSpeech.createSpeech({
2369
+ voiceId,
2370
+ apiConvertTextToSpeechUsingCharacterRequest: {
2371
+ text: "Hello! This is an MP3 format SDK test. Let's verify if it works correctly.",
2372
+ language: models.APIConvertTextToSpeechUsingCharacterRequestLanguage.En,
2373
+ outputFormat:
2374
+ models.APIConvertTextToSpeechUsingCharacterRequestOutputFormat.Mp3,
2375
+ style: "neutral",
2376
+ model: "sona_speech_1",
2377
+ },
2378
+ });
2379
+
2380
+ console.log(` āœ… MP3 TTS conversion success`);
2381
+
2382
+ if (response.result) {
2383
+ const outputFile = "test_create_speech_output.mp3";
2384
+ const audioData = await extractAudioData(response);
2385
+
2386
+ fs.writeFileSync(outputFile, audioData);
2387
+ console.log(` šŸ’¾ MP3 audio file saved: ${outputFile}`);
2388
+
2389
+ // Verify MP3 header
2390
+ const header = audioData.slice(0, 10);
2391
+ if (header[0] === 0x49 && header[1] === 0x44 && header[2] === 0x33) {
2392
+ console.log(` āœ… Valid MP3 file generated (ID3 tag)`);
2393
+ } else if (
2394
+ (header[0] === 0xff && header[1] === 0xfb) ||
2395
+ (header[0] === 0xff && header[1] === 0xfa)
2396
+ ) {
2397
+ console.log(` āœ… Valid MP3 file generated (MPEG frame)`);
1318
2398
  } else {
1319
2399
  console.log(
1320
2400
  ` šŸ“„ MP3 header: ${Array.from(header.slice(0, 10))
@@ -1536,6 +2616,304 @@ async function testCreateSpeechWithChunking(
1536
2616
  }
1537
2617
  }
1538
2618
 
2619
+ // =============================================================================
2620
+ // Pronunciation Dictionary Tests
2621
+ // =============================================================================
2622
+
2623
/**
 * Basic createSpeech test with a pronunciation dictionary that mixes
 * partial_match=false and partial_match=true entries.
 *
 * The dictionary is passed through the second (options) argument of
 * `createSpeech`. When the response carries a result, the audio is written to
 * `test_pronunciation_dictionary_output.wav` in the working directory.
 *
 * @param voiceId - Voice to synthesize with; a falsy value fails the test up front.
 * @returns `[true, response]` when the call resolves, `[false, error]` when
 *   anything in the attempt throws, and `[false, null]` when no voice ID was
 *   supplied.
 */
async function testCreateSpeechWithPronunciationDictionary(
  voiceId: string | null
): Promise<[boolean, any]> {
  console.log("šŸ“– TTS with Pronunciation Dictionary Test");

  if (!voiceId) {
    console.log(" āš ļø No voice ID available");
    return [false, null];
  }

  try {
    const sdk = await import("../src/index.js");
    const {
      APIConvertTextToSpeechUsingCharacterRequestLanguage: Language,
      APIConvertTextToSpeechUsingCharacterRequestOutputFormat: OutputFormat,
    } = await import("../src/models/index.js");
    const supertone = new sdk.Supertone({ apiKey: API_KEY });

    // Sentence containing abbreviations and special terms.
    const testText =
      "The CEO of OpenAI announced that GPT models are improving. Dr. Smith from MIT said AI research is accelerating.";

    // partial_match=false entries (word boundary match). "AI" is declared
    // here, so the "AI" inside "OpenAI" is presumably meant to stay untouched
    // (the matching itself happens inside the SDK, not in this test).
    const wordBoundaryTerms: ReadonlyArray<readonly [string, string]> = [
      ["CEO", "Chief Executive Officer"],
      ["MIT", "Massachusetts Institute of Technology"],
      ["AI", "Artificial Intelligence"],
    ];
    // partial_match=true entries (substring match).
    const substringTerms: ReadonlyArray<readonly [string, string]> = [
      ["GPT", "Generative Pre-trained Transformer"],
      ["Dr.", "Doctor"],
    ];
    const pronunciationDictionary = [
      ...wordBoundaryTerms.map(([text, pronunciation]) => ({
        text,
        pronunciation,
        partial_match: false,
      })),
      ...substringTerms.map(([text, pronunciation]) => ({
        text,
        pronunciation,
        partial_match: true,
      })),
    ];

    console.log(` šŸ” Original text: "${testText}"`);
    console.log(` šŸ“– Pronunciation dictionary entries: ${pronunciationDictionary.length}`);
    console.log(` - partial_match=false: CEO, MIT, AI (word boundary match)`);
    console.log(` - partial_match=true: GPT, Dr. (substring match)`);
    console.log(" āš ļø This test consumes credits!");

    const speech = await supertone.textToSpeech.createSpeech(
      {
        voiceId,
        apiConvertTextToSpeechUsingCharacterRequest: {
          text: testText,
          language: Language.En,
          outputFormat: OutputFormat.Wav,
          style: "neutral",
          model: "sona_speech_1",
        },
      },
      { pronunciationDictionary }
    );

    console.log(` āœ… TTS with pronunciation dictionary success`);

    if (speech.result) {
      const outputFile = "test_pronunciation_dictionary_output.wav";
      const audioBytes = await extractAudioData(speech);
      fs.writeFileSync(outputFile, audioBytes);
      console.log(` šŸ’¾ Audio saved: ${outputFile} (${audioBytes.length} bytes)`);
    }

    return [true, speech];
  } catch (e: any) {
    logDetailedError(e, "Pronunciation dictionary TTS");
    return [false, e];
  }
}
2694
+
2695
/**
 * createSpeech test where a short input is paired with a pronunciation
 * dictionary whose replacements are much longer than the matched terms. The
 * intent, per the log output, is that the expanded text passes 300 characters
 * and the SDK chunks it automatically.
 *
 * Chunking is not observed here: the success messages are printed whenever
 * the call resolves. When the response carries a result, the audio is written
 * to `test_pronunciation_dictionary_long_text_output.wav`.
 *
 * @param voiceId - Voice to synthesize with; a falsy value fails the test up front.
 * @returns `[true, response]` when the call resolves, `[false, error]` when
 *   anything in the attempt throws, and `[false, null]` when no voice ID was
 *   supplied.
 */
async function testCreateSpeechWithPronunciationDictionaryLongText(
  voiceId: string | null
): Promise<[boolean, any]> {
  console.log("šŸ“– TTS with Pronunciation Dictionary + Long Text Chunking Test");

  if (!voiceId) {
    console.log(" āš ļø No voice ID available");
    return [false, null];
  }

  try {
    const sdk = await import("../src/index.js");
    const {
      APIConvertTextToSpeechUsingCharacterRequestLanguage: Language,
      APIConvertTextToSpeechUsingCharacterRequestOutputFormat: OutputFormat,
    } = await import("../src/models/index.js");
    const supertone = new sdk.Supertone({ apiKey: API_KEY });

    // Short input packed with abbreviations so dictionary expansion makes it
    // much longer. The pieces are joined verbatim; the trailing space on the
    // first piece is part of the text.
    const testText = [
      "AI and ML are revolutionizing tech. The CEO of OpenAI discussed GPT advancements. ",
      "Dr. Kim from MIT explained how NLP and CV work together. AWS and GCP provide cloud AI services.",
    ].join("");

    // partial_match=false entries (word boundary match).
    const wordBoundaryTerms: ReadonlyArray<readonly [string, string]> = [
      ["AI", "Artificial Intelligence"],
      ["ML", "Machine Learning"],
      ["CEO", "Chief Executive Officer"],
      ["MIT", "Massachusetts Institute of Technology"],
      ["NLP", "Natural Language Processing"],
      ["CV", "Computer Vision"],
      ["AWS", "Amazon Web Services"],
      ["GCP", "Google Cloud Platform"],
    ];
    // partial_match=true entries (substring match).
    const substringTerms: ReadonlyArray<readonly [string, string]> = [
      ["GPT", "Generative Pre-trained Transformer"],
      ["Dr.", "Doctor"],
      ["tech", "technology"],
    ];
    const pronunciationDictionary = [
      ...wordBoundaryTerms.map(([text, pronunciation]) => ({
        text,
        pronunciation,
        partial_match: false,
      })),
      ...substringTerms.map(([text, pronunciation]) => ({
        text,
        pronunciation,
        partial_match: true,
      })),
    ];

    console.log(` šŸ” Original text length: ${testText.length} characters (under 300)`);
    console.log(` šŸ“– Pronunciation dictionary entries: ${pronunciationDictionary.length}`);
    console.log(` - partial_match=false: AI, ML, CEO, MIT, NLP, CV, AWS, GCP`);
    console.log(` - partial_match=true: GPT, Dr., tech`);
    console.log(` šŸ”§ Expected: Text will expand to 300+ chars, triggering auto-chunking`);
    console.log(" āš ļø This test consumes credits!");

    const speech = await supertone.textToSpeech.createSpeech(
      {
        voiceId,
        apiConvertTextToSpeechUsingCharacterRequest: {
          text: testText,
          language: Language.En,
          outputFormat: OutputFormat.Wav,
          style: "neutral",
          model: "sona_speech_1",
        },
      },
      { pronunciationDictionary }
    );

    console.log(` āœ… TTS with pronunciation dictionary + long text chunking success`);
    console.log(` šŸŽÆ Auto-chunking was triggered after pronunciation expansion!`);

    if (speech.result) {
      const outputFile = "test_pronunciation_dictionary_long_text_output.wav";
      const audioBytes = await extractAudioData(speech);
      fs.writeFileSync(outputFile, audioBytes);
      console.log(` šŸ’¾ Audio saved: ${outputFile} (${audioBytes.length} bytes)`);
    }

    return [true, speech];
  } catch (e: any) {
    logDetailedError(e, "Pronunciation dictionary long text TTS");
    return [false, e];
  }
}
2777
+
2778
/**
 * streamSpeech test with a pronunciation dictionary.
 *
 * The dictionary is passed through the second (options) argument of
 * `streamSpeech`. When the response carries a result, it is handed to
 * `extractAudioData` and the audio is written to
 * `test_pronunciation_dictionary_stream_output.wav`.
 *
 * @param voiceId - Voice to synthesize with; a falsy value fails the test up front.
 * @returns `[true, response]` when the call resolves, `[false, error]` when
 *   anything in the attempt throws, and `[false, null]` when no voice ID was
 *   supplied.
 */
async function testStreamSpeechWithPronunciationDictionary(
  voiceId: string | null
): Promise<[boolean, any]> {
  console.log("šŸ“” TTS Streaming with Pronunciation Dictionary Test");

  if (!voiceId) {
    console.log(" āš ļø No voice ID available");
    return [false, null];
  }

  try {
    const sdk = await import("../src/index.js");
    const {
      APIConvertTextToSpeechUsingCharacterRequestLanguage: Language,
      APIConvertTextToSpeechUsingCharacterRequestOutputFormat: OutputFormat,
    } = await import("../src/models/index.js");
    const supertone = new sdk.Supertone({ apiKey: API_KEY });

    // The pieces are joined verbatim; the trailing space on the first piece
    // is part of the text.
    const testText = [
      "The API documentation explains how to use the SDK. ",
      "Dr. Lee from NASA discussed the new AI system.",
    ].join("");

    // partial_match=false entries.
    const wordBoundaryTerms: ReadonlyArray<readonly [string, string]> = [
      ["API", "Application Programming Interface"],
      ["SDK", "Software Development Kit"],
      ["NASA", "National Aeronautics and Space Administration"],
      ["AI", "Artificial Intelligence"],
    ];
    // partial_match=true entries.
    const substringTerms: ReadonlyArray<readonly [string, string]> = [
      ["Dr.", "Doctor"],
    ];
    const pronunciationDictionary = [
      ...wordBoundaryTerms.map(([text, pronunciation]) => ({
        text,
        pronunciation,
        partial_match: false,
      })),
      ...substringTerms.map(([text, pronunciation]) => ({
        text,
        pronunciation,
        partial_match: true,
      })),
    ];

    console.log(` šŸ” Original text: "${testText}"`);
    console.log(` šŸ“– Pronunciation dictionary entries: ${pronunciationDictionary.length}`);
    console.log(" āš ļø This test consumes credits!");

    const stream = await supertone.textToSpeech.streamSpeech(
      {
        voiceId,
        apiConvertTextToSpeechUsingCharacterRequest: {
          text: testText,
          language: Language.En,
          outputFormat: OutputFormat.Wav,
        },
      },
      { pronunciationDictionary }
    );

    console.log(` āœ… Stream with pronunciation dictionary started successfully`);

    // Drain the stream into memory and persist it.
    if (stream.result) {
      const outputFile = "test_pronunciation_dictionary_stream_output.wav";
      const audioBytes = await extractAudioData(stream);
      fs.writeFileSync(outputFile, audioBytes);
      console.log(` šŸ’¾ Audio saved: ${outputFile} (${audioBytes.length} bytes)`);
    }

    return [true, stream];
  } catch (e: any) {
    logDetailedError(e, "Pronunciation dictionary streaming TTS");
    return [false, e];
  }
}
2843
+
2844
/**
 * streamSpeech test where a short input is paired with a pronunciation
 * dictionary whose replacements are much longer than the matched terms. The
 * intent, per the log output, is that the expanded text passes 300 characters
 * and the SDK chunks the stream.
 *
 * Chunking is not observed here: the success messages are printed whenever
 * the call resolves. When the response carries a result, the audio is written
 * to `test_pronunciation_dictionary_stream_long_text_output.wav`.
 *
 * @param voiceId - Voice to synthesize with; a falsy value fails the test up front.
 * @returns `[true, response]` when the call resolves, `[false, error]` when
 *   anything in the attempt throws, and `[false, null]` when no voice ID was
 *   supplied.
 */
async function testStreamSpeechWithPronunciationDictionaryLongText(
  voiceId: string | null
): Promise<[boolean, any]> {
  console.log("šŸ“” TTS Streaming with Pronunciation Dictionary + Long Text Test");

  if (!voiceId) {
    console.log(" āš ļø No voice ID available");
    return [false, null];
  }

  try {
    const sdk = await import("../src/index.js");
    const {
      APIConvertTextToSpeechUsingCharacterRequestLanguage: Language,
      APIConvertTextToSpeechUsingCharacterRequestOutputFormat: OutputFormat,
    } = await import("../src/models/index.js");
    const supertone = new sdk.Supertone({ apiKey: API_KEY });

    // Short input that grows once the dictionary replacements are applied.
    // The pieces are joined verbatim; the trailing space on the first piece
    // is part of the text.
    const testText = [
      "AI is everywhere. ML powers many apps. The CEO spoke about GPT. ",
      "Dr. Smith from MIT and UCLA collaborated on NLP research. AWS and GCP offer AI services.",
    ].join("");

    // partial_match=false entries.
    const wordBoundaryTerms: ReadonlyArray<readonly [string, string]> = [
      ["AI", "Artificial Intelligence"],
      ["ML", "Machine Learning"],
      ["CEO", "Chief Executive Officer"],
      ["MIT", "Massachusetts Institute of Technology"],
      ["UCLA", "University of California Los Angeles"],
      ["NLP", "Natural Language Processing"],
      ["AWS", "Amazon Web Services"],
      ["GCP", "Google Cloud Platform"],
    ];
    // partial_match=true entries.
    const substringTerms: ReadonlyArray<readonly [string, string]> = [
      ["GPT", "Generative Pre-trained Transformer"],
      ["Dr.", "Doctor"],
    ];
    const pronunciationDictionary = [
      ...wordBoundaryTerms.map(([text, pronunciation]) => ({
        text,
        pronunciation,
        partial_match: false,
      })),
      ...substringTerms.map(([text, pronunciation]) => ({
        text,
        pronunciation,
        partial_match: true,
      })),
    ];

    console.log(` šŸ” Original text length: ${testText.length} characters`);
    console.log(` šŸ“– Pronunciation dictionary entries: ${pronunciationDictionary.length}`);
    console.log(` šŸ”§ Expected: Text will expand to 300+ chars, triggering stream chunking`);
    console.log(" āš ļø This test consumes credits!");

    const stream = await supertone.textToSpeech.streamSpeech(
      {
        voiceId,
        apiConvertTextToSpeechUsingCharacterRequest: {
          text: testText,
          language: Language.En,
          outputFormat: OutputFormat.Wav,
        },
      },
      { pronunciationDictionary }
    );

    console.log(` āœ… Stream with pronunciation dictionary + long text started successfully`);
    console.log(` šŸŽÆ Stream chunking was triggered after pronunciation expansion!`);

    if (stream.result) {
      const outputFile = "test_pronunciation_dictionary_stream_long_text_output.wav";
      const audioBytes = await extractAudioData(stream);
      fs.writeFileSync(outputFile, audioBytes);
      console.log(` šŸ’¾ Audio saved: ${outputFile} (${audioBytes.length} bytes)`);
    }

    return [true, stream];
  } catch (e: any) {
    logDetailedError(e, "Pronunciation dictionary streaming long text TTS");
    return [false, e];
  }
}
2916
+
1539
2917
  /**
1540
2918
  * Main test execution
1541
2919
  */
@@ -1549,7 +2927,7 @@ async function main(): Promise<boolean> {
1549
2927
  console.log("");
1550
2928
 
1551
2929
  const testResults: TestResult = {};
1552
- let voiceIdForTTS: string | null = null;
2930
+ const voiceIdForTTS: string = "91992bbd4758bdcf9c9b01";
1553
2931
  let customVoiceId: string | null = null;
1554
2932
  let createdCustomVoiceId: string | null = null;
1555
2933
 
@@ -1572,9 +2950,6 @@ async function main(): Promise<boolean> {
1572
2950
 
1573
2951
  [success, result] = await testListVoices();
1574
2952
  testResults["list_voices"] = success;
1575
- if (success && result.voiceId) {
1576
- voiceIdForTTS = result.voiceId;
1577
- }
1578
2953
 
1579
2954
  [success, result] = await testSearchVoices();
1580
2955
  testResults["search_voices"] = success;
@@ -1643,6 +3018,67 @@ async function main(): Promise<boolean> {
1643
3018
  [success, result] = await testStreamSpeech(voiceIdForTTS);
1644
3019
  testResults["stream_speech"] = success;
1645
3020
 
3021
+ // 5.5 New Model Tests (sona_speech_2, supertonic_api_1)
3022
+ console.log("\nšŸ¤– New Model Tests (sona_speech_2, supertonic_api_1)");
3023
+ console.log("-".repeat(60));
3024
+ console.log("āš ļø These tests consume credits!");
3025
+ console.log("");
3026
+
3027
+ [success, result] = await testCreateSpeechWithSonaSpeech2(voiceIdForTTS);
3028
+ testResults["create_speech_sona_speech_2"] = success;
3029
+
3030
+ [success, result] = await testCreateSpeechWithSupertonicApi1(voiceIdForTTS);
3031
+ testResults["create_speech_supertonic_api_1"] = success;
3032
+
3033
+ [success, result] = await testCreateSpeechWithUnsupportedModel(
3034
+ voiceIdForTTS
3035
+ );
3036
+ testResults["create_speech_unsupported_model"] = success;
3037
+
3038
+ [success, result] = await testPredictDurationWithSonaSpeech2(voiceIdForTTS);
3039
+ testResults["predict_duration_sona_speech_2"] = success;
3040
+
3041
+ [success, result] = await testPredictDurationWithSupertonicApi1(
3042
+ voiceIdForTTS
3043
+ );
3044
+ testResults["predict_duration_supertonic_api_1"] = success;
3045
+
3046
+ [success, result] = await testPredictDurationWithUnsupportedModel(
3047
+ voiceIdForTTS
3048
+ );
3049
+ testResults["predict_duration_unsupported_model"] = success;
3050
+
3051
+ // 5.6 Multilingual Tests per Model
3052
+ console.log("\nšŸŒ Multilingual Tests per Model");
3053
+ console.log("-".repeat(60));
3054
+ console.log("āš ļø These tests consume credits!");
3055
+ console.log("");
3056
+
3057
+ [success, result] = await testMultilingualSonaSpeech1(voiceIdForTTS);
3058
+ testResults["multilingual_sona_speech_1"] = success;
3059
+
3060
+ [success, result] = await testMultilingualSonaSpeech2(voiceIdForTTS);
3061
+ testResults["multilingual_sona_speech_2"] = success;
3062
+
3063
+ [success, result] = await testMultilingualSupertonicApi1(voiceIdForTTS);
3064
+ testResults["multilingual_supertonic_api_1"] = success;
3065
+
3066
+ // 5.7 Unsupported Language Tests
3067
+ console.log("\n🚫 Unsupported Language Tests");
3068
+ console.log("-".repeat(60));
3069
+ console.log(
3070
+ "āš ļø These tests verify error handling for unsupported model-language combinations!"
3071
+ );
3072
+ console.log("");
3073
+
3074
+ [success, result] = await testUnsupportedLanguageSonaSpeech1(voiceIdForTTS);
3075
+ testResults["unsupported_lang_sona_speech_1"] = success;
3076
+
3077
+ [success, result] = await testUnsupportedLanguageSupertonicApi1(
3078
+ voiceIdForTTS
3079
+ );
3080
+ testResults["unsupported_lang_supertonic_api_1"] = success;
3081
+
1646
3082
  // 6. TTS Long Text Tests
1647
3083
  console.log("\nšŸ“œ Text-to-Speech Long Text Tests");
1648
3084
  console.log("-".repeat(60));
@@ -1652,6 +3088,29 @@ async function main(): Promise<boolean> {
1652
3088
  [success, result] = await testCreateSpeechLongText(voiceIdForTTS);
1653
3089
  testResults["create_speech_long_text"] = success;
1654
3090
 
3091
+ [success, result] = await testCreateSpeechLongSentenceNoPunctuation(
3092
+ voiceIdForTTS
3093
+ );
3094
+ testResults["create_speech_long_sentence_no_punctuation"] = success;
3095
+
3096
+ [success, result] = await testCreateSpeechJapaneseNoSpaces(voiceIdForTTS);
3097
+ testResults["create_speech_japanese_no_spaces"] = success;
3098
+
3099
+ // 6.5 Multilingual Punctuation Tests (fix/text_utils)
3100
+ console.log("\nšŸŒ Multilingual Punctuation Chunking Tests");
3101
+ console.log("-".repeat(60));
3102
+ console.log("āš ļø These tests verify multilingual sentence punctuation support!");
3103
+ console.log("");
3104
+
3105
+ [success, result] = await testCreateSpeechArabicPunctuation(voiceIdForTTS);
3106
+ testResults["create_speech_arabic_punctuation"] = success;
3107
+
3108
+ [success, result] = await testCreateSpeechHindiPunctuation(voiceIdForTTS);
3109
+ testResults["create_speech_hindi_punctuation"] = success;
3110
+
3111
+ [success, result] = await testCreateSpeechEllipsisPunctuation(voiceIdForTTS);
3112
+ testResults["create_speech_ellipsis_punctuation"] = success;
3113
+
1655
3114
  [success, result] = await testStreamSpeechLongText(voiceIdForTTS);
1656
3115
  testResults["stream_speech_long_text"] = success;
1657
3116
 
@@ -1704,6 +3163,32 @@ async function main(): Promise<boolean> {
1704
3163
 
1705
3164
  [success, result] = await testStreamSpeechLongTextMp3(voiceIdForTTS);
1706
3165
  testResults["stream_speech_long_text_mp3"] = success;
3166
+
3167
+ // 10. Pronunciation Dictionary Tests
3168
+ console.log("\nšŸ“– Pronunciation Dictionary Tests");
3169
+ console.log("-".repeat(60));
3170
+ console.log("āš ļø These tests consume credits!");
3171
+ console.log("");
3172
+
3173
+ [success, result] = await testCreateSpeechWithPronunciationDictionary(
3174
+ voiceIdForTTS
3175
+ );
3176
+ testResults["create_speech_pronunciation_dictionary"] = success;
3177
+
3178
+ [success, result] = await testCreateSpeechWithPronunciationDictionaryLongText(
3179
+ voiceIdForTTS
3180
+ );
3181
+ testResults["create_speech_pronunciation_dictionary_long_text"] = success;
3182
+
3183
+ [success, result] = await testStreamSpeechWithPronunciationDictionary(
3184
+ voiceIdForTTS
3185
+ );
3186
+ testResults["stream_speech_pronunciation_dictionary"] = success;
3187
+
3188
+ [success, result] = await testStreamSpeechWithPronunciationDictionaryLongText(
3189
+ voiceIdForTTS
3190
+ );
3191
+ testResults["stream_speech_pronunciation_dictionary_long_text"] = success;
1707
3192
  }
1708
3193
 
1709
3194
  // Results Summary
@@ -1761,6 +3246,12 @@ async function main(): Promise<boolean> {
1761
3246
  " • Text-to-Speech: predictDuration, createSpeech, streamSpeech"
1762
3247
  );
1763
3248
  console.log(" • TTS Long Text: createSpeechLongText, streamSpeechLongText");
3249
+ console.log(
3250
+ " • TTS Chunking Strategies: Word-based (no punctuation), Character-based (Japanese)"
3251
+ );
3252
+ console.log(
3253
+ " • Multilingual Punctuation: Arabic (؟ Ų› Ū”), Hindi (ą„¤ ą„„), Ellipsis (…  )"
3254
+ );
1764
3255
  console.log(
1765
3256
  " • TTS with Voice Settings: createSpeechWithVoiceSettings, predictDurationWithVoiceSettings, streamSpeechWithVoiceSettings"
1766
3257
  );
@@ -1773,6 +3264,30 @@ async function main(): Promise<boolean> {
1773
3264
  console.log(
1774
3265
  " • Custom Features: Auto-chunking in createSpeech/streamSpeech (transparent)"
1775
3266
  );
3267
+ console.log(
3268
+ " • Pronunciation Dictionary: createSpeech/streamSpeech with pronunciationDictionary option"
3269
+ );
3270
+ console.log(
3271
+ " - partial_match=false (word boundary) and partial_match=true (substring)"
3272
+ );
3273
+ console.log(
3274
+ " - Long text chunking after pronunciation expansion"
3275
+ );
3276
+ console.log("");
3277
+ console.log("šŸ¤– New Model & Language Tests:");
3278
+ console.log(
3279
+ " • New Models: sona_speech_2, supertonic_api_1 (createSpeech & predictDuration)"
3280
+ );
3281
+ console.log(
3282
+ " • Unsupported Model Validation: Error handling for invalid model names"
3283
+ );
3284
+ console.log(" • Multilingual per Model:");
3285
+ console.log(" - sona_speech_1: ko, en, ja");
3286
+ console.log(" - sona_speech_2: all 23 languages");
3287
+ console.log(" - supertonic_api_1: ko, en, ja, es, pt");
3288
+ console.log(
3289
+ " • Unsupported Language Validation: Error handling for invalid model-language combinations"
3290
+ );
1776
3291
 
1777
3292
  if (customVoiceId) {
1778
3293
  console.log("");