@supertone/supertone 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. package/FUNCTIONS.md +2 -2
  2. package/README.md +108 -44
  3. package/custom_test/realtime_tts_player.ts +64 -3
  4. package/custom_test/test_real_api.ts +1040 -105
  5. package/dist/commonjs/lib/config.d.ts +2 -2
  6. package/dist/commonjs/lib/config.d.ts.map +1 -1
  7. package/dist/commonjs/lib/config.js +2 -2
  8. package/dist/commonjs/lib/config.js.map +1 -1
  9. package/dist/commonjs/lib/custom_utils/text_utils.d.ts +8 -1
  10. package/dist/commonjs/lib/custom_utils/text_utils.d.ts.map +1 -1
  11. package/dist/commonjs/lib/custom_utils/text_utils.js +108 -7
  12. package/dist/commonjs/lib/custom_utils/text_utils.js.map +1 -1
  13. package/dist/commonjs/models/apiconverttexttospeechusingcharacterrequest.d.ts +92 -1
  14. package/dist/commonjs/models/apiconverttexttospeechusingcharacterrequest.d.ts.map +1 -1
  15. package/dist/commonjs/models/apiconverttexttospeechusingcharacterrequest.js +48 -3
  16. package/dist/commonjs/models/apiconverttexttospeechusingcharacterrequest.js.map +1 -1
  17. package/dist/commonjs/models/predictttsdurationusingcharacterrequest.d.ts +92 -1
  18. package/dist/commonjs/models/predictttsdurationusingcharacterrequest.d.ts.map +1 -1
  19. package/dist/commonjs/models/predictttsdurationusingcharacterrequest.js +46 -3
  20. package/dist/commonjs/models/predictttsdurationusingcharacterrequest.js.map +1 -1
  21. package/dist/commonjs/sdk/texttospeech.d.ts.map +1 -1
  22. package/dist/commonjs/sdk/texttospeech.js +12 -9
  23. package/dist/commonjs/sdk/texttospeech.js.map +1 -1
  24. package/dist/esm/lib/config.d.ts +2 -2
  25. package/dist/esm/lib/config.d.ts.map +1 -1
  26. package/dist/esm/lib/config.js +2 -2
  27. package/dist/esm/lib/config.js.map +1 -1
  28. package/dist/esm/lib/custom_utils/text_utils.d.ts +8 -1
  29. package/dist/esm/lib/custom_utils/text_utils.d.ts.map +1 -1
  30. package/dist/esm/lib/custom_utils/text_utils.js +108 -7
  31. package/dist/esm/lib/custom_utils/text_utils.js.map +1 -1
  32. package/dist/esm/models/apiconverttexttospeechusingcharacterrequest.d.ts +92 -1
  33. package/dist/esm/models/apiconverttexttospeechusingcharacterrequest.d.ts.map +1 -1
  34. package/dist/esm/models/apiconverttexttospeechusingcharacterrequest.js +47 -2
  35. package/dist/esm/models/apiconverttexttospeechusingcharacterrequest.js.map +1 -1
  36. package/dist/esm/models/predictttsdurationusingcharacterrequest.d.ts +92 -1
  37. package/dist/esm/models/predictttsdurationusingcharacterrequest.d.ts.map +1 -1
  38. package/dist/esm/models/predictttsdurationusingcharacterrequest.js +45 -2
  39. package/dist/esm/models/predictttsdurationusingcharacterrequest.js.map +1 -1
  40. package/dist/esm/sdk/texttospeech.d.ts.map +1 -1
  41. package/dist/esm/sdk/texttospeech.js +12 -9
  42. package/dist/esm/sdk/texttospeech.js.map +1 -1
  43. package/examples/custom_voices/create_cloned_voice.ts +4 -3
  44. package/examples/custom_voices/delete_custom_voice.ts +2 -7
  45. package/examples/custom_voices/edit_custom_voice.ts +2 -6
  46. package/examples/custom_voices/get_custom_voice.ts +2 -7
  47. package/examples/custom_voices/list_custom_voices.ts +2 -7
  48. package/examples/custom_voices/search_custom_voices.ts +2 -6
  49. package/examples/package.json +2 -2
  50. package/examples/textToSpeechCreateSpeech.example.ts +2 -2
  51. package/examples/text_to_speech/create_speech.ts +3 -8
  52. package/examples/text_to_speech/create_speech_long_text.ts +3 -7
  53. package/examples/text_to_speech/create_speech_with_phonemes.ts +3 -7
  54. package/examples/text_to_speech/create_speech_with_voice_settings.ts +3 -8
  55. package/examples/text_to_speech/predict_duration.ts +3 -7
  56. package/examples/text_to_speech/stream_speech.ts +3 -7
  57. package/examples/text_to_speech/stream_speech_long_text.ts +3 -7
  58. package/examples/text_to_speech/stream_speech_with_phonemes.ts +3 -7
  59. package/examples/text_to_speech/stream_speech_with_voice_settings.ts +3 -7
  60. package/examples/usage/get_credit_balance.ts +2 -6
  61. package/examples/usage/get_usage.ts +2 -6
  62. package/examples/usage/get_voice_usage.ts +2 -7
  63. package/examples/voices/get_voice.ts +2 -6
  64. package/examples/voices/list_voices.ts +2 -6
  65. package/examples/voices/search_voices.ts +2 -7
  66. package/jsr.json +2 -2
  67. package/openapi.json +101 -9
  68. package/package.json +26 -10
  69. package/src/lib/config.ts +3 -2
  70. package/src/lib/custom_utils/text_utils.ts +117 -7
  71. package/src/models/apiconverttexttospeechusingcharacterrequest.ts +62 -3
  72. package/src/models/predictttsdurationusingcharacterrequest.ts +64 -3
  73. package/src/sdk/texttospeech.ts +474 -465
@@ -127,19 +127,25 @@ async function extractAudioData(response: any): Promise<Uint8Array> {
127
127
  console.log(` šŸ” Debug - has audioBase64: ${"audioBase64" in result}`);
128
128
  console.log(` šŸ” Debug - has getReader: ${"getReader" in result}`);
129
129
  }
130
-
130
+
131
131
  // Check for capital-case Result (SDK internal structure)
132
- if (!result || (typeof result === "object" && Object.keys(result).length === 0)) {
132
+ if (
133
+ !result ||
134
+ (typeof result === "object" && Object.keys(result).length === 0)
135
+ ) {
133
136
  console.log(` šŸ’” Checking SDK internal Result field...`);
134
137
  if ((response as any).Result) {
135
138
  result = (response as any).Result;
136
139
  console.log(` āœ… Found Result (capital R) - using that instead`);
137
140
  }
138
141
  }
139
-
142
+
140
143
  // Debug response headers
141
144
  if (response.headers) {
142
- console.log(` šŸ” Debug - response headers:`, JSON.stringify(response.headers, null, 2));
145
+ console.log(
146
+ ` šŸ” Debug - response headers:`,
147
+ JSON.stringify(response.headers, null, 2)
148
+ );
143
149
  }
144
150
 
145
151
  if (result instanceof Uint8Array) {
@@ -198,7 +204,7 @@ async function extractAudioData(response: any): Promise<Uint8Array> {
198
204
  return bytes;
199
205
  }
200
206
  }
201
-
207
+
202
208
  // Handle empty object case - this might happen when the SDK doesn't properly parse audio responses
203
209
  if (
204
210
  typeof result === "object" &&
@@ -207,22 +213,25 @@ async function extractAudioData(response: any): Promise<Uint8Array> {
207
213
  ) {
208
214
  console.log(` āš ļø Warning: Empty result object detected`);
209
215
  console.log(` šŸ’” This might be a parsing issue with the SDK`);
210
- console.log(` šŸ’” Check if the response was actually a stream but got parsed as an empty object`);
211
-
216
+ console.log(
217
+ ` šŸ’” Check if the response was actually a stream but got parsed as an empty object`
218
+ );
219
+
212
220
  throw new Error(
213
221
  `Empty result object - SDK may have failed to parse audio stream response. ` +
214
- `This usually happens when audio/* content-type responses are not properly handled.`
222
+ `This usually happens when audio/* content-type responses are not properly handled.`
215
223
  );
216
224
  }
217
225
 
218
226
  // Enhanced error message with debug info
219
- const errorDetails = typeof result === "object" && result !== null
220
- ? `constructor: ${result.constructor.name}, keys: [${Object.keys(result).join(", ")}]`
221
- : `value: ${result}`;
222
-
223
- throw new Error(
224
- `Unsupported result type: ${typeof result}, ${errorDetails}`
225
- );
227
+ const errorDetails =
228
+ typeof result === "object" && result !== null
229
+ ? `constructor: ${result.constructor.name}, keys: [${Object.keys(
230
+ result
231
+ ).join(", ")}]`
232
+ : `value: ${result}`;
233
+
234
+ throw new Error(`Unsupported result type: ${typeof result}, ${errorDetails}`);
226
235
  }
227
236
 
228
237
  /**
@@ -927,6 +936,154 @@ async function testCreateSpeechLongText(
927
936
  }
928
937
  }
929
938
 
939
+ /**
940
+ * Test TTS with long text WITHOUT punctuation (word-based chunking)
941
+ * This tests the word-based splitting fallback when sentences exceed 300 chars
942
+ */
943
+ async function testCreateSpeechLongSentenceNoPunctuation(
944
+ voiceId: string | null
945
+ ): Promise<[boolean, any]> {
946
+ console.log(
947
+ "šŸ“œ Long Sentence WITHOUT Punctuation Test (Word-based chunking)"
948
+ );
949
+
950
+ if (!voiceId) {
951
+ console.log(" āš ļø No voice ID available");
952
+ return [false, null];
953
+ }
954
+
955
+ try {
956
+ const { Supertone } = await import("../src/index.js");
957
+ const models = await import("../src/models/index.js");
958
+ const client = new Supertone({ apiKey: API_KEY });
959
+
960
+ // Long text without punctuation - forces word-based splitting
961
+ // This is a single continuous sentence with no periods or other punctuation marks
962
+ const longSentenceNoPunctuation =
963
+ "This is a very long sentence without any punctuation marks that is designed to test the word based chunking feature of the SDK when a sentence exceeds the maximum character limit of three hundred characters the system should automatically split this text by word boundaries rather than sentence boundaries to ensure proper processing and this behavior is critical for handling user generated content that may not follow standard punctuation conventions such as chat messages or informal text inputs that users commonly provide in real world applications where grammatically correct sentences are not always guaranteed";
964
+
965
+ const actualLength = longSentenceNoPunctuation.length;
966
+ console.log(
967
+ ` šŸ“ Text length: ${actualLength} characters (single sentence, no punctuation)`
968
+ );
969
+ console.log(` šŸ”§ Expected behavior: Word-based chunking`);
970
+ console.log(" āš ļø This test consumes credits!");
971
+
972
+ const response = await client.textToSpeech.createSpeech({
973
+ voiceId,
974
+ apiConvertTextToSpeechUsingCharacterRequest: {
975
+ text: longSentenceNoPunctuation,
976
+ language: models.APIConvertTextToSpeechUsingCharacterRequestLanguage.En,
977
+ outputFormat:
978
+ models.APIConvertTextToSpeechUsingCharacterRequestOutputFormat.Wav,
979
+ style: "neutral",
980
+ model: "sona_speech_1",
981
+ },
982
+ });
983
+
984
+ if (response.result) {
985
+ const audioData = await extractAudioData(response);
986
+
987
+ console.log(
988
+ ` āœ… Word-based chunking TTS success: ${audioData.length} bytes`
989
+ );
990
+ console.log(
991
+ ` šŸŽÆ Long sentence without punctuation processed correctly!`
992
+ );
993
+
994
+ const outputFile = "test_word_chunking_speech_output.wav";
995
+ fs.writeFileSync(outputFile, audioData);
996
+ console.log(` šŸ’¾ Audio saved: ${outputFile}`);
997
+
998
+ const estimatedChunks = Math.ceil(actualLength / 300);
999
+ console.log(` šŸ“Š Estimated chunks: ${estimatedChunks}`);
1000
+ }
1001
+
1002
+ return [true, response];
1003
+ } catch (e: any) {
1004
+ logDetailedError(e, "Long sentence word-based chunking");
1005
+ return [false, e];
1006
+ }
1007
+ }
1008
+
1009
+ /**
1010
+ * Test TTS with Japanese text (character-based chunking)
1011
+ * Japanese doesn't use spaces, AND this test uses NO punctuation marks (ć€‚ļ¼ļ¼Ÿetc)
1012
+ * to ensure the SDK uses character-based splitting
1013
+ */
1014
+ async function testCreateSpeechJapaneseNoSpaces(
1015
+ voiceId: string | null
1016
+ ): Promise<[boolean, any]> {
1017
+ console.log("šŸ‡ÆšŸ‡µ Japanese Text Test (Character-based chunking)");
1018
+
1019
+ if (!voiceId) {
1020
+ console.log(" āš ļø No voice ID available");
1021
+ return [false, null];
1022
+ }
1023
+
1024
+ try {
1025
+ const { Supertone } = await import("../src/index.js");
1026
+ const models = await import("../src/models/index.js");
1027
+ const client = new Supertone({ apiKey: API_KEY });
1028
+
1029
+ // Long Japanese text WITHOUT spaces AND WITHOUT punctuation - forces character-based splitting
1030
+ // This text intentionally has NO punctuation marks (ć€‚ļ¼ļ¼Ÿetc) to test pure character-based chunking
1031
+ // Text length: ~450 characters (exceeds 300 char limit)
1032
+ const longJapaneseText =
1033
+ "ę—„ęœ¬čŖžć®ćƒ†ć‚­ć‚¹ćƒˆćÆé€šåøøć‚¹ćƒšćƒ¼ć‚¹ć‚’å«ć¾ćŖć„ćŸć‚ē‰¹åˆ„ćŖå‡¦ē†ćŒåæ…č¦ć§ć™" +
1034
+ "ć“ć®ćƒ†ć‚¹ćƒˆćÆäø‰ē™¾ę–‡å­—ć‚’č¶…ćˆć‚‹é•·ć„ę—„ęœ¬čŖžćƒ†ć‚­ć‚¹ćƒˆćŒę­£ć—ćå‡¦ē†ć•ć‚Œć‚‹ć“ćØć‚’ē¢ŗčŖć—ć¾ć™" +
1035
+ "č‡Ŗē„¶čØ€čŖžå‡¦ē†ęŠ€č”“ć®ē™ŗå±•ć«ć‚ˆć‚ŠéŸ³å£°åˆęˆć®å“č³ŖćÆå¤§å¹…ć«å‘äøŠć—ć¾ć—ćŸ" +
1036
+ "ē‰¹ć«ćƒ‡ć‚£ćƒ¼ćƒ—ćƒ©ćƒ¼ćƒ‹ćƒ³ć‚°ć‚’ę“»ē”Øć—ćŸęœ€ę–°ć®ćƒ†ć‚­ć‚¹ćƒˆéŸ³å£°å¤‰ę›ć‚·ć‚¹ćƒ†ćƒ ćÆäŗŗé–“ć®ē™ŗč©±ć«éžåøøć«čæ‘ć„č‡Ŗē„¶ćŖéŸ³å£°ć‚’ē”Ÿęˆć§ćć¾ć™" +
1037
+ "ć‚¹ćƒšćƒ¼ć‚¹ćŒćŖć„čØ€čŖžć§ćÆę–‡å­—å˜ä½ć§ć®åˆ†å‰²ćŒåæ…č¦ć§ć‚ć‚Šć“ć®SDKćÆćć®ć‚ˆć†ćŖēŠ¶ę³ć‚’č‡Ŗå‹•ēš„ć«ę¤œå‡ŗć—ć¦é©åˆ‡ć«å‡¦ē†ć—ć¾ć™" +
1038
+ "ć“ć‚Œć«ć‚ˆć‚Šę—„ęœ¬čŖžäø­å›½čŖžéŸ“å›½čŖžćŖć©ć®ć‚¢ć‚øć‚¢čØ€čŖžć§ć‚‚å•é”ŒćŖćé•·ć„ćƒ†ć‚­ć‚¹ćƒˆć‚’éŸ³å£°ć«å¤‰ę›ć™ć‚‹ć“ćØćŒć§ćć¾ć™" +
1039
+ "éŸ³å£°åˆęˆęŠ€č”“ćÆč¦–č¦šéšœå®³č€…ć®ćŸć‚ć®ć‚¢ć‚Æć‚»ć‚·ćƒ“ćƒŖćƒ†ć‚£ćƒ„ćƒ¼ćƒ«ć‹ć‚‰åÆ¾č©±åž‹AIć‚¢ć‚·ć‚¹ć‚æćƒ³ćƒˆć¾ć§å¹…åŗƒć„ē”Øé€”ć§ę“»ē”Øć•ć‚Œć¦ć„ć¾ć™" +
1040
+ "ć•ć‚‰ć«ćƒŖć‚¢ćƒ«ć‚æć‚¤ćƒ ć‚¹ćƒˆćƒŖćƒ¼ćƒŸćƒ³ć‚°ęŠ€č”“ćØēµ„ćæåˆć‚ć›ć‚‹ć“ćØć§å¾…ć”ę™‚é–“ć‚’å¤§å¹…ć«ēŸ­ēø®ć—å„Ŗć‚ŒćŸćƒ¦ćƒ¼ć‚¶ćƒ¼ä½“éØ“ć‚’ęä¾›ć™ć‚‹ć“ćØćŒć§ćć¾ć™" +
1041
+ "ęœ€ę–°ć®éŸ³å£°åˆęˆęŠ€č”“ćÆę„Ÿęƒ…ć‚„ęŠ‘ęšć‚‚č‡Ŗē„¶ć«č”Øē¾ć§ćć‚‹ć‚ˆć†ć«ćŖć‚Šć¾ć—ćŸ";
1042
+
1043
+ const actualLength = longJapaneseText.length;
1044
+ console.log(
1045
+ ` šŸ“ Text length: ${actualLength} characters (Japanese, no spaces, no punctuation)`
1046
+ );
1047
+ console.log(
1048
+ ` šŸ”§ Expected behavior: Character-based chunking (300 chars per chunk)`
1049
+ );
1050
+ console.log(" āš ļø This test consumes credits!");
1051
+
1052
+ const response = await client.textToSpeech.createSpeech({
1053
+ voiceId,
1054
+ apiConvertTextToSpeechUsingCharacterRequest: {
1055
+ text: longJapaneseText,
1056
+ language: models.APIConvertTextToSpeechUsingCharacterRequestLanguage.Ja,
1057
+ outputFormat:
1058
+ models.APIConvertTextToSpeechUsingCharacterRequestOutputFormat.Wav,
1059
+ style: "neutral",
1060
+ model: "sona_speech_1",
1061
+ },
1062
+ });
1063
+
1064
+ if (response.result) {
1065
+ const audioData = await extractAudioData(response);
1066
+
1067
+ console.log(
1068
+ ` āœ… Character-based chunking TTS success: ${audioData.length} bytes`
1069
+ );
1070
+ console.log(` šŸŽÆ Japanese text without spaces processed correctly!`);
1071
+
1072
+ const outputFile = "test_japanese_char_chunking_speech_output.wav";
1073
+ fs.writeFileSync(outputFile, audioData);
1074
+ console.log(` šŸ’¾ Audio saved: ${outputFile}`);
1075
+
1076
+ const estimatedChunks = Math.ceil(actualLength / 300);
1077
+ console.log(` šŸ“Š Estimated chunks: ${estimatedChunks}`);
1078
+ }
1079
+
1080
+ return [true, response];
1081
+ } catch (e: any) {
1082
+ logDetailedError(e, "Japanese character-based chunking");
1083
+ return [false, e];
1084
+ }
1085
+ }
1086
+
930
1087
  /**
931
1088
  * Test TTS streaming with long text
932
1089
  */
@@ -1168,13 +1325,53 @@ async function testStreamSpeechWithPhonemes(
1168
1325
  }
1169
1326
  }
1170
1327
 
1328
+ // =============================================================================
1329
+ // Model & Language Compatibility Tests
1330
+ // =============================================================================
1331
+
1171
1332
  /**
1172
- * Test duration prediction with voice settings
1333
+ * Model-Language compatibility matrix
1334
+ * - sona_speech_1: ko, en, ja
1335
+ * - sona_speech_2: all languages (23 languages)
1336
+ * - supertonic_api_1: ko, en, ja, es, pt
1173
1337
  */
1174
- async function testPredictDurationWithVoiceSettings(
1338
+ const MODEL_LANGUAGE_MATRIX = {
1339
+ sona_speech_1: ["ko", "en", "ja"],
1340
+ sona_speech_2: [
1341
+ "en",
1342
+ "ko",
1343
+ "ja",
1344
+ "bg",
1345
+ "cs",
1346
+ "da",
1347
+ "el",
1348
+ "es",
1349
+ "et",
1350
+ "fi",
1351
+ "hu",
1352
+ "it",
1353
+ "nl",
1354
+ "pl",
1355
+ "pt",
1356
+ "ro",
1357
+ "ar",
1358
+ "de",
1359
+ "fr",
1360
+ "hi",
1361
+ "id",
1362
+ "ru",
1363
+ "vi",
1364
+ ],
1365
+ supertonic_api_1: ["ko", "en", "ja", "es", "pt"],
1366
+ } as const;
1367
+
1368
+ /**
1369
+ * Test TTS with sona_speech_2 model
1370
+ */
1371
+ async function testCreateSpeechWithSonaSpeech2(
1175
1372
  voiceId: string | null
1176
1373
  ): Promise<[boolean, any]> {
1177
- console.log("ā±ļø Duration Prediction with Voice Settings Test");
1374
+ console.log("šŸ¤– TTS with sona_speech_2 Model Test");
1178
1375
 
1179
1376
  if (!voiceId) {
1180
1377
  console.log(" āš ļø No voice ID available");
@@ -1186,40 +1383,50 @@ async function testPredictDurationWithVoiceSettings(
1186
1383
  const models = await import("../src/models/index.js");
1187
1384
  const client = new Supertone({ apiKey: API_KEY });
1188
1385
 
1189
- const voiceSettings = {
1190
- speed: 0.8,
1191
- };
1192
-
1193
- console.log(
1194
- ` šŸ” Predicting duration with voice settings for voice '${voiceId}'...`
1195
- );
1196
- console.log(` Settings: speed=${voiceSettings.speed}`);
1386
+ const testText =
1387
+ "Hello! Testing sona_speech_2 model for text-to-speech conversion.";
1388
+ console.log(` šŸ” Creating speech with sona_speech_2 model`);
1389
+ console.log(` Voice ID: ${voiceId}`);
1390
+ console.log(` Model: sona_speech_2`);
1391
+ console.log(" āš ļø This test consumes credits!");
1197
1392
 
1198
- const response = await client.textToSpeech.predictDuration({
1393
+ const response = await client.textToSpeech.createSpeech({
1199
1394
  voiceId,
1200
- predictTTSDurationUsingCharacterRequest: {
1201
- text: "This is a duration test with adjusted speed.",
1202
- language: models.PredictTTSDurationUsingCharacterRequestLanguage.En,
1203
- voiceSettings,
1395
+ apiConvertTextToSpeechUsingCharacterRequest: {
1396
+ text: testText,
1397
+ language: models.APIConvertTextToSpeechUsingCharacterRequestLanguage.En,
1398
+ outputFormat:
1399
+ models.APIConvertTextToSpeechUsingCharacterRequestOutputFormat.Wav,
1400
+ model:
1401
+ models.APIConvertTextToSpeechUsingCharacterRequestModel.SonaSpeech2,
1204
1402
  },
1205
1403
  });
1206
1404
 
1207
- console.log(` āœ… Predicted duration: ${response.duration}s`);
1405
+ console.log(` āœ… sona_speech_2 TTS success`);
1406
+
1407
+ if (response.result) {
1408
+ const audioData = await extractAudioData(response);
1409
+ const outputFile = "test_sona_speech_2_output.wav";
1410
+ fs.writeFileSync(outputFile, audioData);
1411
+ console.log(
1412
+ ` šŸ’¾ Audio saved: ${outputFile} (${audioData.length} bytes)`
1413
+ );
1414
+ }
1208
1415
 
1209
1416
  return [true, response];
1210
1417
  } catch (e: any) {
1211
- console.error(` āŒ Error: ${e.message || e}`);
1418
+ logDetailedError(e, "sona_speech_2 TTS");
1212
1419
  return [false, e];
1213
1420
  }
1214
1421
  }
1215
1422
 
1216
1423
  /**
1217
- * Test TTS streaming with voice settings
1424
+ * Test TTS with supertonic_api_1 model
1218
1425
  */
1219
- async function testStreamSpeechWithVoiceSettings(
1426
+ async function testCreateSpeechWithSupertonicApi1(
1220
1427
  voiceId: string | null
1221
1428
  ): Promise<[boolean, any]> {
1222
- console.log("šŸ“” TTS Streaming with Voice Settings Test");
1429
+ console.log("šŸ¤– TTS with supertonic_api_1 Model Test");
1223
1430
 
1224
1431
  if (!voiceId) {
1225
1432
  console.log(" āš ļø No voice ID available");
@@ -1231,46 +1438,51 @@ async function testStreamSpeechWithVoiceSettings(
1231
1438
  const models = await import("../src/models/index.js");
1232
1439
  const client = new Supertone({ apiKey: API_KEY });
1233
1440
 
1234
- const voiceSettings = {
1235
- pitchShift: 1.05,
1236
- speed: 1.1,
1237
- };
1238
-
1239
- console.log(
1240
- ` šŸ” Streaming speech with voice settings for voice '${voiceId}'...`
1241
- );
1242
- console.log(
1243
- ` Settings: pitchShift=${voiceSettings.pitchShift}, speed=${voiceSettings.speed}`
1244
- );
1441
+ const testText =
1442
+ "Hello! Testing supertonic_api_1 model for text-to-speech conversion.";
1443
+ console.log(` šŸ” Creating speech with supertonic_api_1 model`);
1444
+ console.log(` Voice ID: ${voiceId}`);
1445
+ console.log(` Model: supertonic_api_1`);
1245
1446
  console.log(" āš ļø This test consumes credits!");
1246
1447
 
1247
- const response = await client.textToSpeech.streamSpeech({
1448
+ const response = await client.textToSpeech.createSpeech({
1248
1449
  voiceId,
1249
1450
  apiConvertTextToSpeechUsingCharacterRequest: {
1250
- text: "Streaming with adjusted voice settings.",
1451
+ text: testText,
1251
1452
  language: models.APIConvertTextToSpeechUsingCharacterRequestLanguage.En,
1252
1453
  outputFormat:
1253
1454
  models.APIConvertTextToSpeechUsingCharacterRequestOutputFormat.Wav,
1254
- voiceSettings,
1455
+ model:
1456
+ models.APIConvertTextToSpeechUsingCharacterRequestModel
1457
+ .SupertonicApi1,
1255
1458
  },
1256
1459
  });
1257
1460
 
1258
- console.log(` āœ… Stream with voice settings started successfully`);
1461
+ console.log(` āœ… supertonic_api_1 TTS success`);
1462
+
1463
+ if (response.result) {
1464
+ const audioData = await extractAudioData(response);
1465
+ const outputFile = "test_supertonic_api_1_output.wav";
1466
+ fs.writeFileSync(outputFile, audioData);
1467
+ console.log(
1468
+ ` šŸ’¾ Audio saved: ${outputFile} (${audioData.length} bytes)`
1469
+ );
1470
+ }
1259
1471
 
1260
1472
  return [true, response];
1261
1473
  } catch (e: any) {
1262
- console.error(` āŒ Error: ${e.message || e}`);
1474
+ logDetailedError(e, "supertonic_api_1 TTS");
1263
1475
  return [false, e];
1264
1476
  }
1265
1477
  }
1266
1478
 
1267
1479
  /**
1268
- * Test MP3 format TTS
1480
+ * Test TTS with unsupported model (should fail with validation error)
1269
1481
  */
1270
- async function testCreateSpeechMp3(
1482
+ async function testCreateSpeechWithUnsupportedModel(
1271
1483
  voiceId: string | null
1272
1484
  ): Promise<[boolean, any]> {
1273
- console.log("šŸŽ¤ MP3 Format TTS Test");
1485
+ console.log("🚫 TTS with Unsupported Model Test (Expected to Fail)");
1274
1486
 
1275
1487
  if (!voiceId) {
1276
1488
  console.log(" āš ļø No voice ID available");
@@ -1282,62 +1494,82 @@ async function testCreateSpeechMp3(
1282
1494
  const models = await import("../src/models/index.js");
1283
1495
  const client = new Supertone({ apiKey: API_KEY });
1284
1496
 
1285
- console.log(` šŸ” MP3 TTS conversion with voice '${voiceId}'...`);
1286
- console.log(" āš ļø This test consumes credits!");
1497
+ const testText = "This should fail with unsupported model.";
1498
+ console.log(
1499
+ ` šŸ” Attempting TTS with unsupported model: 'invalid_model_xyz'`
1500
+ );
1287
1501
 
1502
+ // Using type assertion to bypass TypeScript validation for testing
1288
1503
  const response = await client.textToSpeech.createSpeech({
1289
1504
  voiceId,
1290
1505
  apiConvertTextToSpeechUsingCharacterRequest: {
1291
- text: "Hello! This is an MP3 format SDK test. Let's verify if it works correctly.",
1506
+ text: testText,
1292
1507
  language: models.APIConvertTextToSpeechUsingCharacterRequestLanguage.En,
1293
1508
  outputFormat:
1294
- models.APIConvertTextToSpeechUsingCharacterRequestOutputFormat.Mp3,
1295
- style: "neutral",
1296
- model: "sona_speech_1",
1509
+ models.APIConvertTextToSpeechUsingCharacterRequestOutputFormat.Wav,
1510
+ model: "invalid_model_xyz" as any, // Intentionally invalid model
1297
1511
  },
1298
1512
  });
1299
1513
 
1300
- console.log(` āœ… MP3 TTS conversion success`);
1514
+ // If we reach here, the test failed (should have thrown an error)
1515
+ console.log(` āŒ Expected error but got success - this is unexpected!`);
1516
+ return [false, response];
1517
+ } catch (e: any) {
1518
+ // Expected to fail - this is the success case for this test
1519
+ console.log(` āœ… Correctly rejected unsupported model`);
1520
+ console.log(` šŸ“‹ Error type: ${e.constructor?.name || typeof e}`);
1521
+ console.log(` šŸ“‹ Error message: ${e.message?.substring(0, 100) || e}`);
1522
+ return [true, e];
1523
+ }
1524
+ }
1301
1525
 
1302
- if (response.result) {
1303
- const outputFile = "test_create_speech_output.mp3";
1304
- const audioData = await extractAudioData(response);
1526
+ /**
1527
+ * Test prediction with sona_speech_2 model
1528
+ */
1529
+ async function testPredictDurationWithSonaSpeech2(
1530
+ voiceId: string | null
1531
+ ): Promise<[boolean, any]> {
1532
+ console.log("ā±ļø Duration Prediction with sona_speech_2 Model Test");
1305
1533
 
1306
- fs.writeFileSync(outputFile, audioData);
1307
- console.log(` šŸ’¾ MP3 audio file saved: ${outputFile}`);
1534
+ if (!voiceId) {
1535
+ console.log(" āš ļø No voice ID available");
1536
+ return [false, null];
1537
+ }
1308
1538
 
1309
- // Verify MP3 header
1310
- const header = audioData.slice(0, 10);
1311
- if (header[0] === 0x49 && header[1] === 0x44 && header[2] === 0x33) {
1312
- console.log(` āœ… Valid MP3 file generated (ID3 tag)`);
1313
- } else if (
1314
- (header[0] === 0xff && header[1] === 0xfb) ||
1315
- (header[0] === 0xff && header[1] === 0xfa)
1316
- ) {
1317
- console.log(` āœ… Valid MP3 file generated (MPEG frame)`);
1318
- } else {
1319
- console.log(
1320
- ` šŸ“„ MP3 header: ${Array.from(header.slice(0, 10))
1321
- .map((b) => b.toString(16).padStart(2, "0"))
1322
- .join(" ")} (needs verification)`
1323
- );
1324
- }
1325
- }
1539
+ try {
1540
+ const { Supertone } = await import("../src/index.js");
1541
+ const models = await import("../src/models/index.js");
1542
+ const client = new Supertone({ apiKey: API_KEY });
1543
+
1544
+ const testText = "Testing duration prediction with sona_speech_2 model.";
1545
+ console.log(` šŸ” Predicting duration with sona_speech_2 model`);
1546
+
1547
+ const response = await client.textToSpeech.predictDuration({
1548
+ voiceId,
1549
+ predictTTSDurationUsingCharacterRequest: {
1550
+ text: testText,
1551
+ language: models.PredictTTSDurationUsingCharacterRequestLanguage.En,
1552
+ model: models.PredictTTSDurationUsingCharacterRequestModel.SonaSpeech2,
1553
+ },
1554
+ });
1326
1555
 
1556
+ console.log(
1557
+ ` āœ… sona_speech_2 duration prediction: ${response.duration}s`
1558
+ );
1327
1559
  return [true, response];
1328
1560
  } catch (e: any) {
1329
- console.error(` āŒ Error: ${e.message || e}`);
1561
+ logDetailedError(e, "sona_speech_2 duration prediction");
1330
1562
  return [false, e];
1331
1563
  }
1332
1564
  }
1333
1565
 
1334
1566
  /**
1335
- * Test MP3 format with long text
1567
+ * Test prediction with supertonic_api_1 model
1336
1568
  */
1337
- async function testCreateSpeechLongTextMp3(
1569
+ async function testPredictDurationWithSupertonicApi1(
1338
1570
  voiceId: string | null
1339
1571
  ): Promise<[boolean, any]> {
1340
- console.log("šŸ“œ Long Text MP3 Auto-Chunking TTS Test (300+ chars)");
1572
+ console.log("ā±ļø Duration Prediction with supertonic_api_1 Model Test");
1341
1573
 
1342
1574
  if (!voiceId) {
1343
1575
  console.log(" āš ļø No voice ID available");
@@ -1349,20 +1581,639 @@ async function testCreateSpeechLongTextMp3(
1349
1581
  const models = await import("../src/models/index.js");
1350
1582
  const client = new Supertone({ apiKey: API_KEY });
1351
1583
 
1352
- const longText = `
1353
- Hello! This is a very long text MP3 auto-chunking TTS test exceeding 300 characters.
1354
- The newly implemented SDK automatically divides long text into multiple chunks for processing.
1355
- Real-time streaming text-to-speech technology plays a crucial role in modern AI applications.
1356
- It is an indispensable technology especially in conversational services, live broadcasting, and real-time translation services.
1357
- Through the auto-chunking feature, long texts are naturally divided into multiple small segments for processing.
1358
- Each segment is intelligently segmented considering sentence and word boundaries, enabling natural speech generation.
1359
- Now users don't need to worry about text length or output format, as the SDK automatically handles everything in MP3 format too.
1360
- `.trim();
1584
+ const testText = "Testing duration prediction with supertonic_api_1 model.";
1585
+ console.log(` šŸ” Predicting duration with supertonic_api_1 model`);
1361
1586
 
1362
- const actualLength = longText.length;
1363
- console.log(
1364
- ` šŸ“ Test text length: ${actualLength} characters (exceeds 300)`
1365
- );
1587
+ const response = await client.textToSpeech.predictDuration({
1588
+ voiceId,
1589
+ predictTTSDurationUsingCharacterRequest: {
1590
+ text: testText,
1591
+ language: models.PredictTTSDurationUsingCharacterRequestLanguage.En,
1592
+ model:
1593
+ models.PredictTTSDurationUsingCharacterRequestModel.SupertonicApi1,
1594
+ },
1595
+ });
1596
+
1597
+ console.log(
1598
+ ` āœ… supertonic_api_1 duration prediction: ${response.duration}s`
1599
+ );
1600
+ return [true, response];
1601
+ } catch (e: any) {
1602
+ logDetailedError(e, "supertonic_api_1 duration prediction");
1603
+ return [false, e];
1604
+ }
1605
+ }
1606
+
1607
+ /**
1608
+ * Test prediction with unsupported model (should fail with validation error)
1609
+ */
1610
+ async function testPredictDurationWithUnsupportedModel(
1611
+ voiceId: string | null
1612
+ ): Promise<[boolean, any]> {
1613
+ console.log(
1614
+ "🚫 Duration Prediction with Unsupported Model Test (Expected to Fail)"
1615
+ );
1616
+
1617
+ if (!voiceId) {
1618
+ console.log(" āš ļø No voice ID available");
1619
+ return [false, null];
1620
+ }
1621
+
1622
+ try {
1623
+ const { Supertone } = await import("../src/index.js");
1624
+ const models = await import("../src/models/index.js");
1625
+ const client = new Supertone({ apiKey: API_KEY });
1626
+
1627
+ const testText = "This should fail with unsupported model.";
1628
+ console.log(
1629
+ ` šŸ” Attempting prediction with unsupported model: 'invalid_model_xyz'`
1630
+ );
1631
+
1632
+ const response = await client.textToSpeech.predictDuration({
1633
+ voiceId,
1634
+ predictTTSDurationUsingCharacterRequest: {
1635
+ text: testText,
1636
+ language: models.PredictTTSDurationUsingCharacterRequestLanguage.En,
1637
+ model: "invalid_model_xyz" as any, // Intentionally invalid model
1638
+ },
1639
+ });
1640
+
1641
+ console.log(` āŒ Expected error but got success - this is unexpected!`);
1642
+ return [false, response];
1643
+ } catch (e: any) {
1644
+ console.log(` āœ… Correctly rejected unsupported model`);
1645
+ console.log(` šŸ“‹ Error type: ${e.constructor?.name || typeof e}`);
1646
+ console.log(` šŸ“‹ Error message: ${e.message?.substring(0, 100) || e}`);
1647
+ return [true, e];
1648
+ }
1649
+ }
1650
+
1651
+ // =============================================================================
1652
+ // Multilingual Tests per Model
1653
+ // =============================================================================
1654
+
1655
+ /**
1656
+ * Test TTS multilingual support with sona_speech_1 (supports: ko, en, ja)
1657
+ */
1658
+ async function testMultilingualSonaSpeech1(
1659
+ voiceId: string | null
1660
+ ): Promise<[boolean, any]> {
1661
+ console.log("šŸŒ Multilingual Test - sona_speech_1 (ko, en, ja)");
1662
+
1663
+ if (!voiceId) {
1664
+ console.log(" āš ļø No voice ID available");
1665
+ return [false, null];
1666
+ }
1667
+
1668
+ const testCases = [
1669
+ {
1670
+ lang: "ko" as const,
1671
+ text: "ģ•ˆė…•ķ•˜ģ„øģš”, ģ†Œė‚˜ ģŠ¤ķ”¼ģ¹˜ 원 ėŖØėøģž…ė‹ˆė‹¤.",
1672
+ label: "Korean",
1673
+ },
1674
+ {
1675
+ lang: "en" as const,
1676
+ text: "Hello, this is sona_speech_1 model.",
1677
+ label: "English",
1678
+ },
1679
+ {
1680
+ lang: "ja" as const,
1681
+ text: "ć“ć‚“ć«ć”ćÆć€ć‚½ćƒŠć‚¹ćƒ”ćƒ¼ćƒćƒÆćƒ³ćƒ¢ćƒ‡ćƒ«ć§ć™ć€‚",
1682
+ label: "Japanese",
1683
+ },
1684
+ ];
1685
+
1686
+ try {
1687
+ const { Supertone } = await import("../src/index.js");
1688
+ const models = await import("../src/models/index.js");
1689
+ const client = new Supertone({ apiKey: API_KEY });
1690
+
1691
+ let allPassed = true;
1692
+ const results: any[] = [];
1693
+
1694
+ for (const tc of testCases) {
1695
+ console.log(` šŸ” Testing ${tc.label} (${tc.lang})...`);
1696
+
1697
+ try {
1698
+ const langEnum =
1699
+ models.APIConvertTextToSpeechUsingCharacterRequestLanguage[
1700
+ (tc.lang.charAt(0).toUpperCase() +
1701
+ tc.lang.slice(
1702
+ 1
1703
+ )) as keyof typeof models.APIConvertTextToSpeechUsingCharacterRequestLanguage
1704
+ ];
1705
+
1706
+ const response = await client.textToSpeech.createSpeech({
1707
+ voiceId,
1708
+ apiConvertTextToSpeechUsingCharacterRequest: {
1709
+ text: tc.text,
1710
+ language: langEnum,
1711
+ outputFormat:
1712
+ models.APIConvertTextToSpeechUsingCharacterRequestOutputFormat
1713
+ .Wav,
1714
+ model:
1715
+ models.APIConvertTextToSpeechUsingCharacterRequestModel
1716
+ .SonaSpeech1,
1717
+ },
1718
+ });
1719
+
1720
+ console.log(` āœ… ${tc.label} success`);
1721
+ results.push({ lang: tc.lang, success: true });
1722
+ } catch (e: any) {
1723
+ console.log(
1724
+ ` āŒ ${tc.label} failed: ${e.message?.substring(0, 50)}`
1725
+ );
1726
+ results.push({ lang: tc.lang, success: false, error: e.message });
1727
+ allPassed = false;
1728
+ }
1729
+ }
1730
+
1731
+ console.log(
1732
+ ` šŸ“Š Result: ${results.filter((r) => r.success).length}/${
1733
+ testCases.length
1734
+ } languages passed`
1735
+ );
1736
+ return [allPassed, results];
1737
+ } catch (e: any) {
1738
+ logDetailedError(e, "sona_speech_1 multilingual");
1739
+ return [false, e];
1740
+ }
1741
+ }
1742
+
1743
+ /**
1744
+ * Test TTS multilingual support with sona_speech_2 (supports all languages)
1745
+ */
1746
+ async function testMultilingualSonaSpeech2(
1747
+ voiceId: string | null
1748
+ ): Promise<[boolean, any]> {
1749
+ console.log("šŸŒ Multilingual Test - sona_speech_2 (all languages sample)");
1750
+
1751
+ if (!voiceId) {
1752
+ console.log(" āš ļø No voice ID available");
1753
+ return [false, null];
1754
+ }
1755
+
1756
+ // Test a diverse subset of languages
1757
+ const testCases = [
1758
+ { lang: "Ko" as const, text: "ģ•ˆė…•ķ•˜ģ„øģš”.", label: "Korean" },
1759
+ { lang: "En" as const, text: "Hello.", label: "English" },
1760
+ { lang: "Ja" as const, text: "こんにごは。", label: "Japanese" },
1761
+ { lang: "Es" as const, text: "Hola.", label: "Spanish" },
1762
+ { lang: "Fr" as const, text: "Bonjour.", label: "French" },
1763
+ { lang: "De" as const, text: "Hallo.", label: "German" },
1764
+ { lang: "Ar" as const, text: "Ł…Ų±Ų­ŲØŲ§.", label: "Arabic" },
1765
+ { lang: "Hi" as const, text: "ą¤Øą¤®ą¤øą„ą¤¤ą„‡ą„¤", label: "Hindi" },
1766
+ ];
1767
+
1768
+ try {
1769
+ const { Supertone } = await import("../src/index.js");
1770
+ const models = await import("../src/models/index.js");
1771
+ const client = new Supertone({ apiKey: API_KEY });
1772
+
1773
+ let allPassed = true;
1774
+ const results: any[] = [];
1775
+
1776
+ for (const tc of testCases) {
1777
+ console.log(` šŸ” Testing ${tc.label} (${tc.lang})...`);
1778
+
1779
+ try {
1780
+ const langEnum =
1781
+ models.APIConvertTextToSpeechUsingCharacterRequestLanguage[tc.lang];
1782
+
1783
+ const response = await client.textToSpeech.createSpeech({
1784
+ voiceId,
1785
+ apiConvertTextToSpeechUsingCharacterRequest: {
1786
+ text: tc.text,
1787
+ language: langEnum,
1788
+ outputFormat:
1789
+ models.APIConvertTextToSpeechUsingCharacterRequestOutputFormat
1790
+ .Wav,
1791
+ model:
1792
+ models.APIConvertTextToSpeechUsingCharacterRequestModel
1793
+ .SonaSpeech2,
1794
+ },
1795
+ });
1796
+
1797
+ console.log(` āœ… ${tc.label} success`);
1798
+ results.push({ lang: tc.lang, success: true });
1799
+ } catch (e: any) {
1800
+ console.log(
1801
+ ` āŒ ${tc.label} failed: ${e.message?.substring(0, 50)}`
1802
+ );
1803
+ results.push({ lang: tc.lang, success: false, error: e.message });
1804
+ allPassed = false;
1805
+ }
1806
+ }
1807
+
1808
+ console.log(
1809
+ ` šŸ“Š Result: ${results.filter((r) => r.success).length}/${
1810
+ testCases.length
1811
+ } languages passed`
1812
+ );
1813
+ return [allPassed, results];
1814
+ } catch (e: any) {
1815
+ logDetailedError(e, "sona_speech_2 multilingual");
1816
+ return [false, e];
1817
+ }
1818
+ }
1819
+
1820
+ /**
1821
+ * Test TTS multilingual support with supertonic_api_1 (supports: ko, en, ja, es, pt)
1822
+ */
1823
+ async function testMultilingualSupertonicApi1(
1824
+ voiceId: string | null
1825
+ ): Promise<[boolean, any]> {
1826
+ console.log("šŸŒ Multilingual Test - supertonic_api_1 (ko, en, ja, es, pt)");
1827
+
1828
+ if (!voiceId) {
1829
+ console.log(" āš ļø No voice ID available");
1830
+ return [false, null];
1831
+ }
1832
+
1833
+ const testCases = [
1834
+ {
1835
+ lang: "Ko" as const,
1836
+ text: "ģ•ˆė…•ķ•˜ģ„øģš”, ģŠˆķ¼ķ† ė‹‰ API 원 ėŖØėøģž…ė‹ˆė‹¤.",
1837
+ label: "Korean",
1838
+ },
1839
+ {
1840
+ lang: "En" as const,
1841
+ text: "Hello, this is supertonic_api_1 model.",
1842
+ label: "English",
1843
+ },
1844
+ {
1845
+ lang: "Ja" as const,
1846
+ text: "ć“ć‚“ć«ć”ćÆć€ć‚¹ćƒ¼ćƒ‘ćƒ¼ćƒˆćƒ‹ćƒƒć‚ÆAPIćƒÆćƒ³ć§ć™ć€‚",
1847
+ label: "Japanese",
1848
+ },
1849
+ {
1850
+ lang: "Es" as const,
1851
+ text: "Hola, este es el modelo supertonic_api_1.",
1852
+ label: "Spanish",
1853
+ },
1854
+ {
1855
+ lang: "Pt" as const,
1856
+ text: "OlĆ”, este Ć© o modelo supertonic_api_1.",
1857
+ label: "Portuguese",
1858
+ },
1859
+ ];
1860
+
1861
+ try {
1862
+ const { Supertone } = await import("../src/index.js");
1863
+ const models = await import("../src/models/index.js");
1864
+ const client = new Supertone({ apiKey: API_KEY });
1865
+
1866
+ let allPassed = true;
1867
+ const results: any[] = [];
1868
+
1869
+ for (const tc of testCases) {
1870
+ console.log(` šŸ” Testing ${tc.label} (${tc.lang})...`);
1871
+
1872
+ try {
1873
+ const langEnum =
1874
+ models.APIConvertTextToSpeechUsingCharacterRequestLanguage[tc.lang];
1875
+
1876
+ const response = await client.textToSpeech.createSpeech({
1877
+ voiceId,
1878
+ apiConvertTextToSpeechUsingCharacterRequest: {
1879
+ text: tc.text,
1880
+ language: langEnum,
1881
+ outputFormat:
1882
+ models.APIConvertTextToSpeechUsingCharacterRequestOutputFormat
1883
+ .Wav,
1884
+ model:
1885
+ models.APIConvertTextToSpeechUsingCharacterRequestModel
1886
+ .SupertonicApi1,
1887
+ },
1888
+ });
1889
+
1890
+ console.log(` āœ… ${tc.label} success`);
1891
+ results.push({ lang: tc.lang, success: true });
1892
+ } catch (e: any) {
1893
+ console.log(
1894
+ ` āŒ ${tc.label} failed: ${e.message?.substring(0, 50)}`
1895
+ );
1896
+ results.push({ lang: tc.lang, success: false, error: e.message });
1897
+ allPassed = false;
1898
+ }
1899
+ }
1900
+
1901
+ console.log(
1902
+ ` šŸ“Š Result: ${results.filter((r) => r.success).length}/${
1903
+ testCases.length
1904
+ } languages passed`
1905
+ );
1906
+ return [allPassed, results];
1907
+ } catch (e: any) {
1908
+ logDetailedError(e, "supertonic_api_1 multilingual");
1909
+ return [false, e];
1910
+ }
1911
+ }
1912
+
1913
+ /**
1914
+ * Test unsupported language for sona_speech_1 (should fail with French)
1915
+ */
1916
+ async function testUnsupportedLanguageSonaSpeech1(
1917
+ voiceId: string | null
1918
+ ): Promise<[boolean, any]> {
1919
+ console.log(
1920
+ "🚫 Unsupported Language Test - sona_speech_1 with French (Expected to Fail)"
1921
+ );
1922
+
1923
+ if (!voiceId) {
1924
+ console.log(" āš ļø No voice ID available");
1925
+ return [false, null];
1926
+ }
1927
+
1928
+ try {
1929
+ const { Supertone } = await import("../src/index.js");
1930
+ const models = await import("../src/models/index.js");
1931
+ const client = new Supertone({ apiKey: API_KEY });
1932
+
1933
+ console.log(` šŸ” Attempting sona_speech_1 with French (unsupported)`);
1934
+
1935
+ const response = await client.textToSpeech.createSpeech({
1936
+ voiceId,
1937
+ apiConvertTextToSpeechUsingCharacterRequest: {
1938
+ text: "Bonjour, ceci est un test.",
1939
+ language: models.APIConvertTextToSpeechUsingCharacterRequestLanguage.Fr, // French - not supported by sona_speech_1
1940
+ outputFormat:
1941
+ models.APIConvertTextToSpeechUsingCharacterRequestOutputFormat.Wav,
1942
+ model:
1943
+ models.APIConvertTextToSpeechUsingCharacterRequestModel.SonaSpeech1,
1944
+ },
1945
+ });
1946
+
1947
+ // If we reach here, the API didn't reject - may need server-side validation
1948
+ console.log(
1949
+ ` āš ļø API accepted the request - server-side validation may not enforce language restriction`
1950
+ );
1951
+ console.log(
1952
+ ` šŸ“‹ Note: Language restriction may be enforced at API level, not SDK level`
1953
+ );
1954
+ return [
1955
+ true,
1956
+ { note: "API accepted - language restriction may be server-side" },
1957
+ ];
1958
+ } catch (e: any) {
1959
+ console.log(
1960
+ ` āœ… Correctly rejected unsupported language for sona_speech_1`
1961
+ );
1962
+ console.log(` šŸ“‹ Error: ${e.message?.substring(0, 100)}`);
1963
+ return [true, e];
1964
+ }
1965
+ }
1966
+
1967
+ /**
1968
+ * Test unsupported language for supertonic_api_1 (should fail with German)
1969
+ */
1970
+ async function testUnsupportedLanguageSupertonicApi1(
1971
+ voiceId: string | null
1972
+ ): Promise<[boolean, any]> {
1973
+ console.log(
1974
+ "🚫 Unsupported Language Test - supertonic_api_1 with German (Expected to Fail)"
1975
+ );
1976
+
1977
+ if (!voiceId) {
1978
+ console.log(" āš ļø No voice ID available");
1979
+ return [false, null];
1980
+ }
1981
+
1982
+ try {
1983
+ const { Supertone } = await import("../src/index.js");
1984
+ const models = await import("../src/models/index.js");
1985
+ const client = new Supertone({ apiKey: API_KEY });
1986
+
1987
+ console.log(` šŸ” Attempting supertonic_api_1 with German (unsupported)`);
1988
+
1989
+ const response = await client.textToSpeech.createSpeech({
1990
+ voiceId,
1991
+ apiConvertTextToSpeechUsingCharacterRequest: {
1992
+ text: "Hallo, das ist ein Test.",
1993
+ language: models.APIConvertTextToSpeechUsingCharacterRequestLanguage.De, // German - not supported by supertonic_api_1
1994
+ outputFormat:
1995
+ models.APIConvertTextToSpeechUsingCharacterRequestOutputFormat.Wav,
1996
+ model:
1997
+ models.APIConvertTextToSpeechUsingCharacterRequestModel
1998
+ .SupertonicApi1,
1999
+ },
2000
+ });
2001
+
2002
+ // If we reach here, the API didn't reject - may need server-side validation
2003
+ console.log(
2004
+ ` āš ļø API accepted the request - server-side validation may not enforce language restriction`
2005
+ );
2006
+ console.log(
2007
+ ` šŸ“‹ Note: Language restriction may be enforced at API level, not SDK level`
2008
+ );
2009
+ return [
2010
+ true,
2011
+ { note: "API accepted - language restriction may be server-side" },
2012
+ ];
2013
+ } catch (e: any) {
2014
+ console.log(
2015
+ ` āœ… Correctly rejected unsupported language for supertonic_api_1`
2016
+ );
2017
+ console.log(` šŸ“‹ Error: ${e.message?.substring(0, 100)}`);
2018
+ return [true, e];
2019
+ }
2020
+ }
2021
+
2022
+ /**
2023
+ * Test duration prediction with voice settings
2024
+ */
2025
+ async function testPredictDurationWithVoiceSettings(
2026
+ voiceId: string | null
2027
+ ): Promise<[boolean, any]> {
2028
+ console.log("ā±ļø Duration Prediction with Voice Settings Test");
2029
+
2030
+ if (!voiceId) {
2031
+ console.log(" āš ļø No voice ID available");
2032
+ return [false, null];
2033
+ }
2034
+
2035
+ try {
2036
+ const { Supertone } = await import("../src/index.js");
2037
+ const models = await import("../src/models/index.js");
2038
+ const client = new Supertone({ apiKey: API_KEY });
2039
+
2040
+ const voiceSettings = {
2041
+ speed: 0.8,
2042
+ };
2043
+
2044
+ console.log(
2045
+ ` šŸ” Predicting duration with voice settings for voice '${voiceId}'...`
2046
+ );
2047
+ console.log(` Settings: speed=${voiceSettings.speed}`);
2048
+
2049
+ const response = await client.textToSpeech.predictDuration({
2050
+ voiceId,
2051
+ predictTTSDurationUsingCharacterRequest: {
2052
+ text: "This is a duration test with adjusted speed.",
2053
+ language: models.PredictTTSDurationUsingCharacterRequestLanguage.En,
2054
+ voiceSettings,
2055
+ },
2056
+ });
2057
+
2058
+ console.log(` āœ… Predicted duration: ${response.duration}s`);
2059
+
2060
+ return [true, response];
2061
+ } catch (e: any) {
2062
+ console.error(` āŒ Error: ${e.message || e}`);
2063
+ return [false, e];
2064
+ }
2065
+ }
2066
+
2067
+ /**
2068
+ * Test TTS streaming with voice settings
2069
+ */
2070
+ async function testStreamSpeechWithVoiceSettings(
2071
+ voiceId: string | null
2072
+ ): Promise<[boolean, any]> {
2073
+ console.log("šŸ“” TTS Streaming with Voice Settings Test");
2074
+
2075
+ if (!voiceId) {
2076
+ console.log(" āš ļø No voice ID available");
2077
+ return [false, null];
2078
+ }
2079
+
2080
+ try {
2081
+ const { Supertone } = await import("../src/index.js");
2082
+ const models = await import("../src/models/index.js");
2083
+ const client = new Supertone({ apiKey: API_KEY });
2084
+
2085
+ const voiceSettings = {
2086
+ pitchShift: 1.05,
2087
+ speed: 1.1,
2088
+ };
2089
+
2090
+ console.log(
2091
+ ` šŸ” Streaming speech with voice settings for voice '${voiceId}'...`
2092
+ );
2093
+ console.log(
2094
+ ` Settings: pitchShift=${voiceSettings.pitchShift}, speed=${voiceSettings.speed}`
2095
+ );
2096
+ console.log(" āš ļø This test consumes credits!");
2097
+
2098
+ const response = await client.textToSpeech.streamSpeech({
2099
+ voiceId,
2100
+ apiConvertTextToSpeechUsingCharacterRequest: {
2101
+ text: "Streaming with adjusted voice settings.",
2102
+ language: models.APIConvertTextToSpeechUsingCharacterRequestLanguage.En,
2103
+ outputFormat:
2104
+ models.APIConvertTextToSpeechUsingCharacterRequestOutputFormat.Wav,
2105
+ voiceSettings,
2106
+ },
2107
+ });
2108
+
2109
+ console.log(` āœ… Stream with voice settings started successfully`);
2110
+
2111
+ return [true, response];
2112
+ } catch (e: any) {
2113
+ console.error(` āŒ Error: ${e.message || e}`);
2114
+ return [false, e];
2115
+ }
2116
+ }
2117
+
2118
+ /**
2119
+ * Test MP3 format TTS
2120
+ */
2121
+ async function testCreateSpeechMp3(
2122
+ voiceId: string | null
2123
+ ): Promise<[boolean, any]> {
2124
+ console.log("šŸŽ¤ MP3 Format TTS Test");
2125
+
2126
+ if (!voiceId) {
2127
+ console.log(" āš ļø No voice ID available");
2128
+ return [false, null];
2129
+ }
2130
+
2131
+ try {
2132
+ const { Supertone } = await import("../src/index.js");
2133
+ const models = await import("../src/models/index.js");
2134
+ const client = new Supertone({ apiKey: API_KEY });
2135
+
2136
+ console.log(` šŸ” MP3 TTS conversion with voice '${voiceId}'...`);
2137
+ console.log(" āš ļø This test consumes credits!");
2138
+
2139
+ const response = await client.textToSpeech.createSpeech({
2140
+ voiceId,
2141
+ apiConvertTextToSpeechUsingCharacterRequest: {
2142
+ text: "Hello! This is an MP3 format SDK test. Let's verify if it works correctly.",
2143
+ language: models.APIConvertTextToSpeechUsingCharacterRequestLanguage.En,
2144
+ outputFormat:
2145
+ models.APIConvertTextToSpeechUsingCharacterRequestOutputFormat.Mp3,
2146
+ style: "neutral",
2147
+ model: "sona_speech_1",
2148
+ },
2149
+ });
2150
+
2151
+ console.log(` āœ… MP3 TTS conversion success`);
2152
+
2153
+ if (response.result) {
2154
+ const outputFile = "test_create_speech_output.mp3";
2155
+ const audioData = await extractAudioData(response);
2156
+
2157
+ fs.writeFileSync(outputFile, audioData);
2158
+ console.log(` šŸ’¾ MP3 audio file saved: ${outputFile}`);
2159
+
2160
+ // Verify MP3 header
2161
+ const header = audioData.slice(0, 10);
2162
+ if (header[0] === 0x49 && header[1] === 0x44 && header[2] === 0x33) {
2163
+ console.log(` āœ… Valid MP3 file generated (ID3 tag)`);
2164
+ } else if (
2165
+ (header[0] === 0xff && header[1] === 0xfb) ||
2166
+ (header[0] === 0xff && header[1] === 0xfa)
2167
+ ) {
2168
+ console.log(` āœ… Valid MP3 file generated (MPEG frame)`);
2169
+ } else {
2170
+ console.log(
2171
+ ` šŸ“„ MP3 header: ${Array.from(header.slice(0, 10))
2172
+ .map((b) => b.toString(16).padStart(2, "0"))
2173
+ .join(" ")} (needs verification)`
2174
+ );
2175
+ }
2176
+ }
2177
+
2178
+ return [true, response];
2179
+ } catch (e: any) {
2180
+ console.error(` āŒ Error: ${e.message || e}`);
2181
+ return [false, e];
2182
+ }
2183
+ }
2184
+
2185
+ /**
2186
+ * Test MP3 format with long text
2187
+ */
2188
+ async function testCreateSpeechLongTextMp3(
2189
+ voiceId: string | null
2190
+ ): Promise<[boolean, any]> {
2191
+ console.log("šŸ“œ Long Text MP3 Auto-Chunking TTS Test (300+ chars)");
2192
+
2193
+ if (!voiceId) {
2194
+ console.log(" āš ļø No voice ID available");
2195
+ return [false, null];
2196
+ }
2197
+
2198
+ try {
2199
+ const { Supertone } = await import("../src/index.js");
2200
+ const models = await import("../src/models/index.js");
2201
+ const client = new Supertone({ apiKey: API_KEY });
2202
+
2203
+ const longText = `
2204
+ Hello! This is a very long text MP3 auto-chunking TTS test exceeding 300 characters.
2205
+ The newly implemented SDK automatically divides long text into multiple chunks for processing.
2206
+ Real-time streaming text-to-speech technology plays a crucial role in modern AI applications.
2207
+ It is an indispensable technology especially in conversational services, live broadcasting, and real-time translation services.
2208
+ Through the auto-chunking feature, long texts are naturally divided into multiple small segments for processing.
2209
+ Each segment is intelligently segmented considering sentence and word boundaries, enabling natural speech generation.
2210
+ Now users don't need to worry about text length or output format, as the SDK automatically handles everything in MP3 format too.
2211
+ `.trim();
2212
+
2213
+ const actualLength = longText.length;
2214
+ console.log(
2215
+ ` šŸ“ Test text length: ${actualLength} characters (exceeds 300)`
2216
+ );
1366
2217
  console.log(` šŸ”§ Auto-chunking enabled for MP3 format`);
1367
2218
 
1368
2219
  console.log(` šŸ” Converting long text to MP3 with voice '${voiceId}'...`);
@@ -1549,7 +2400,7 @@ async function main(): Promise<boolean> {
1549
2400
  console.log("");
1550
2401
 
1551
2402
  const testResults: TestResult = {};
1552
- let voiceIdForTTS: string | null = null;
2403
+ const voiceIdForTTS: string = "91992bbd4758bdcf9c9b01";
1553
2404
  let customVoiceId: string | null = null;
1554
2405
  let createdCustomVoiceId: string | null = null;
1555
2406
 
@@ -1572,9 +2423,6 @@ async function main(): Promise<boolean> {
1572
2423
 
1573
2424
  [success, result] = await testListVoices();
1574
2425
  testResults["list_voices"] = success;
1575
- if (success && result.voiceId) {
1576
- voiceIdForTTS = result.voiceId;
1577
- }
1578
2426
 
1579
2427
  [success, result] = await testSearchVoices();
1580
2428
  testResults["search_voices"] = success;
@@ -1643,6 +2491,67 @@ async function main(): Promise<boolean> {
1643
2491
  [success, result] = await testStreamSpeech(voiceIdForTTS);
1644
2492
  testResults["stream_speech"] = success;
1645
2493
 
2494
+ // 5.5 New Model Tests (sona_speech_2, supertonic_api_1)
2495
+ console.log("\nšŸ¤– New Model Tests (sona_speech_2, supertonic_api_1)");
2496
+ console.log("-".repeat(60));
2497
+ console.log("āš ļø These tests consume credits!");
2498
+ console.log("");
2499
+
2500
+ [success, result] = await testCreateSpeechWithSonaSpeech2(voiceIdForTTS);
2501
+ testResults["create_speech_sona_speech_2"] = success;
2502
+
2503
+ [success, result] = await testCreateSpeechWithSupertonicApi1(voiceIdForTTS);
2504
+ testResults["create_speech_supertonic_api_1"] = success;
2505
+
2506
+ [success, result] = await testCreateSpeechWithUnsupportedModel(
2507
+ voiceIdForTTS
2508
+ );
2509
+ testResults["create_speech_unsupported_model"] = success;
2510
+
2511
+ [success, result] = await testPredictDurationWithSonaSpeech2(voiceIdForTTS);
2512
+ testResults["predict_duration_sona_speech_2"] = success;
2513
+
2514
+ [success, result] = await testPredictDurationWithSupertonicApi1(
2515
+ voiceIdForTTS
2516
+ );
2517
+ testResults["predict_duration_supertonic_api_1"] = success;
2518
+
2519
+ [success, result] = await testPredictDurationWithUnsupportedModel(
2520
+ voiceIdForTTS
2521
+ );
2522
+ testResults["predict_duration_unsupported_model"] = success;
2523
+
2524
+ // 5.6 Multilingual Tests per Model
2525
+ console.log("\nšŸŒ Multilingual Tests per Model");
2526
+ console.log("-".repeat(60));
2527
+ console.log("āš ļø These tests consume credits!");
2528
+ console.log("");
2529
+
2530
+ [success, result] = await testMultilingualSonaSpeech1(voiceIdForTTS);
2531
+ testResults["multilingual_sona_speech_1"] = success;
2532
+
2533
+ [success, result] = await testMultilingualSonaSpeech2(voiceIdForTTS);
2534
+ testResults["multilingual_sona_speech_2"] = success;
2535
+
2536
+ [success, result] = await testMultilingualSupertonicApi1(voiceIdForTTS);
2537
+ testResults["multilingual_supertonic_api_1"] = success;
2538
+
2539
+ // 5.7 Unsupported Language Tests
2540
+ console.log("\n🚫 Unsupported Language Tests");
2541
+ console.log("-".repeat(60));
2542
+ console.log(
2543
+ "āš ļø These tests verify error handling for unsupported model-language combinations!"
2544
+ );
2545
+ console.log("");
2546
+
2547
+ [success, result] = await testUnsupportedLanguageSonaSpeech1(voiceIdForTTS);
2548
+ testResults["unsupported_lang_sona_speech_1"] = success;
2549
+
2550
+ [success, result] = await testUnsupportedLanguageSupertonicApi1(
2551
+ voiceIdForTTS
2552
+ );
2553
+ testResults["unsupported_lang_supertonic_api_1"] = success;
2554
+
1646
2555
  // 6. TTS Long Text Tests
1647
2556
  console.log("\nšŸ“œ Text-to-Speech Long Text Tests");
1648
2557
  console.log("-".repeat(60));
@@ -1652,6 +2561,14 @@ async function main(): Promise<boolean> {
1652
2561
  [success, result] = await testCreateSpeechLongText(voiceIdForTTS);
1653
2562
  testResults["create_speech_long_text"] = success;
1654
2563
 
2564
+ [success, result] = await testCreateSpeechLongSentenceNoPunctuation(
2565
+ voiceIdForTTS
2566
+ );
2567
+ testResults["create_speech_long_sentence_no_punctuation"] = success;
2568
+
2569
+ [success, result] = await testCreateSpeechJapaneseNoSpaces(voiceIdForTTS);
2570
+ testResults["create_speech_japanese_no_spaces"] = success;
2571
+
1655
2572
  [success, result] = await testStreamSpeechLongText(voiceIdForTTS);
1656
2573
  testResults["stream_speech_long_text"] = success;
1657
2574
 
@@ -1761,6 +2678,9 @@ async function main(): Promise<boolean> {
1761
2678
  " • Text-to-Speech: predictDuration, createSpeech, streamSpeech"
1762
2679
  );
1763
2680
  console.log(" • TTS Long Text: createSpeechLongText, streamSpeechLongText");
2681
+ console.log(
2682
+ " • TTS Chunking Strategies: Word-based (no punctuation), Character-based (Japanese)"
2683
+ );
1764
2684
  console.log(
1765
2685
  " • TTS with Voice Settings: createSpeechWithVoiceSettings, predictDurationWithVoiceSettings, streamSpeechWithVoiceSettings"
1766
2686
  );
@@ -1773,6 +2693,21 @@ async function main(): Promise<boolean> {
1773
2693
  console.log(
1774
2694
  " • Custom Features: Auto-chunking in createSpeech/streamSpeech (transparent)"
1775
2695
  );
2696
+ console.log("");
2697
+ console.log("šŸ¤– New Model & Language Tests:");
2698
+ console.log(
2699
+ " • New Models: sona_speech_2, supertonic_api_1 (createSpeech & predictDuration)"
2700
+ );
2701
+ console.log(
2702
+ " • Unsupported Model Validation: Error handling for invalid model names"
2703
+ );
2704
+ console.log(" • Multilingual per Model:");
2705
+ console.log(" - sona_speech_1: ko, en, ja");
2706
+ console.log(" - sona_speech_2: all 23 languages");
2707
+ console.log(" - supertonic_api_1: ko, en, ja, es, pt");
2708
+ console.log(
2709
+ " • Unsupported Language Validation: Error handling for invalid model-language combinations"
2710
+ );
1776
2711
 
1777
2712
  if (customVoiceId) {
1778
2713
  console.log("");