@supertone/supertone 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. package/FUNCTIONS.md +2 -2
  2. package/README.md +108 -44
  3. package/custom_test/realtime_tts_player.ts +64 -3
  4. package/custom_test/test_real_api.ts +1040 -105
  5. package/dist/commonjs/lib/config.d.ts +2 -2
  6. package/dist/commonjs/lib/config.d.ts.map +1 -1
  7. package/dist/commonjs/lib/config.js +2 -2
  8. package/dist/commonjs/lib/config.js.map +1 -1
  9. package/dist/commonjs/lib/custom_utils/text_utils.d.ts +8 -1
  10. package/dist/commonjs/lib/custom_utils/text_utils.d.ts.map +1 -1
  11. package/dist/commonjs/lib/custom_utils/text_utils.js +108 -7
  12. package/dist/commonjs/lib/custom_utils/text_utils.js.map +1 -1
  13. package/dist/commonjs/models/apiconverttexttospeechusingcharacterrequest.d.ts +92 -1
  14. package/dist/commonjs/models/apiconverttexttospeechusingcharacterrequest.d.ts.map +1 -1
  15. package/dist/commonjs/models/apiconverttexttospeechusingcharacterrequest.js +48 -3
  16. package/dist/commonjs/models/apiconverttexttospeechusingcharacterrequest.js.map +1 -1
  17. package/dist/commonjs/models/predictttsdurationusingcharacterrequest.d.ts +92 -1
  18. package/dist/commonjs/models/predictttsdurationusingcharacterrequest.d.ts.map +1 -1
  19. package/dist/commonjs/models/predictttsdurationusingcharacterrequest.js +46 -3
  20. package/dist/commonjs/models/predictttsdurationusingcharacterrequest.js.map +1 -1
  21. package/dist/commonjs/sdk/texttospeech.d.ts.map +1 -1
  22. package/dist/commonjs/sdk/texttospeech.js +12 -9
  23. package/dist/commonjs/sdk/texttospeech.js.map +1 -1
  24. package/dist/esm/lib/config.d.ts +2 -2
  25. package/dist/esm/lib/config.d.ts.map +1 -1
  26. package/dist/esm/lib/config.js +2 -2
  27. package/dist/esm/lib/config.js.map +1 -1
  28. package/dist/esm/lib/custom_utils/text_utils.d.ts +8 -1
  29. package/dist/esm/lib/custom_utils/text_utils.d.ts.map +1 -1
  30. package/dist/esm/lib/custom_utils/text_utils.js +108 -7
  31. package/dist/esm/lib/custom_utils/text_utils.js.map +1 -1
  32. package/dist/esm/models/apiconverttexttospeechusingcharacterrequest.d.ts +92 -1
  33. package/dist/esm/models/apiconverttexttospeechusingcharacterrequest.d.ts.map +1 -1
  34. package/dist/esm/models/apiconverttexttospeechusingcharacterrequest.js +47 -2
  35. package/dist/esm/models/apiconverttexttospeechusingcharacterrequest.js.map +1 -1
  36. package/dist/esm/models/predictttsdurationusingcharacterrequest.d.ts +92 -1
  37. package/dist/esm/models/predictttsdurationusingcharacterrequest.d.ts.map +1 -1
  38. package/dist/esm/models/predictttsdurationusingcharacterrequest.js +45 -2
  39. package/dist/esm/models/predictttsdurationusingcharacterrequest.js.map +1 -1
  40. package/dist/esm/sdk/texttospeech.d.ts.map +1 -1
  41. package/dist/esm/sdk/texttospeech.js +12 -9
  42. package/dist/esm/sdk/texttospeech.js.map +1 -1
  43. package/examples/custom_voices/create_cloned_voice.ts +4 -3
  44. package/examples/custom_voices/delete_custom_voice.ts +2 -7
  45. package/examples/custom_voices/edit_custom_voice.ts +2 -6
  46. package/examples/custom_voices/get_custom_voice.ts +2 -7
  47. package/examples/custom_voices/list_custom_voices.ts +2 -7
  48. package/examples/custom_voices/search_custom_voices.ts +2 -6
  49. package/examples/package.json +2 -2
  50. package/examples/textToSpeechCreateSpeech.example.ts +2 -2
  51. package/examples/text_to_speech/create_speech.ts +3 -8
  52. package/examples/text_to_speech/create_speech_long_text.ts +3 -7
  53. package/examples/text_to_speech/create_speech_with_phonemes.ts +3 -7
  54. package/examples/text_to_speech/create_speech_with_voice_settings.ts +3 -8
  55. package/examples/text_to_speech/predict_duration.ts +3 -7
  56. package/examples/text_to_speech/stream_speech.ts +3 -7
  57. package/examples/text_to_speech/stream_speech_long_text.ts +3 -7
  58. package/examples/text_to_speech/stream_speech_with_phonemes.ts +3 -7
  59. package/examples/text_to_speech/stream_speech_with_voice_settings.ts +3 -7
  60. package/examples/usage/get_credit_balance.ts +2 -6
  61. package/examples/usage/get_usage.ts +2 -6
  62. package/examples/usage/get_voice_usage.ts +2 -7
  63. package/examples/voices/get_voice.ts +2 -6
  64. package/examples/voices/list_voices.ts +2 -6
  65. package/examples/voices/search_voices.ts +2 -7
  66. package/jsr.json +2 -2
  67. package/openapi.json +101 -9
  68. package/package.json +26 -10
  69. package/src/lib/config.ts +3 -2
  70. package/src/lib/custom_utils/text_utils.ts +117 -7
  71. package/src/models/apiconverttexttospeechusingcharacterrequest.ts +62 -3
  72. package/src/models/predictttsdurationusingcharacterrequest.ts +64 -3
  73. package/src/sdk/texttospeech.ts +474 -465
@@ -7,6 +7,94 @@
7
7
 
8
8
  import { DEFAULT_MAX_TEXT_LENGTH } from "./constants.js";
9
9
 
10
+ /**
11
+ * Check if text contains spaces (to determine if word-based splitting is possible)
12
+ *
13
+ * @param text - Text to check
14
+ * @returns true if text contains spaces
15
+ */
16
+ function hasSpaces(text: string): boolean {
17
+ return /\s/.test(text);
18
+ }
19
+
20
+ /**
21
+ * Split text by words, ensuring each chunk is under maxLength.
22
+ * Used for languages with spaces (English, Korean, etc.)
23
+ *
24
+ * @param text - Text to split
25
+ * @param maxLength - Maximum length of each chunk
26
+ * @returns Array of text chunks
27
+ */
28
+ function splitByWords(text: string, maxLength: number): string[] {
29
+ const words = text.split(/(\s+)/);
30
+ const chunks: string[] = [];
31
+ let currentChunk = "";
32
+
33
+ for (const word of words) {
34
+ if (currentChunk.length + word.length <= maxLength) {
35
+ currentChunk += word;
36
+ } else {
37
+ if (currentChunk.trim()) {
38
+ chunks.push(currentChunk.trim());
39
+ }
40
+ // If a single word exceeds maxLength, split by characters
41
+ if (word.trim().length > maxLength) {
42
+ const charChunks = splitByCharacters(word.trim(), maxLength);
43
+ chunks.push(...charChunks);
44
+ currentChunk = "";
45
+ } else {
46
+ currentChunk = word;
47
+ }
48
+ }
49
+ }
50
+
51
+ if (currentChunk.trim()) {
52
+ chunks.push(currentChunk.trim());
53
+ }
54
+
55
+ return chunks;
56
+ }
57
+
58
+ /**
59
+ * Split text by characters, ensuring each chunk is under maxLength.
60
+ * Used for languages without spaces (Japanese, Chinese, etc.)
61
+ *
62
+ * @param text - Text to split
63
+ * @param maxLength - Maximum length of each chunk
64
+ * @returns Array of text chunks
65
+ */
66
+ function splitByCharacters(text: string, maxLength: number): string[] {
67
+ const chunks: string[] = [];
68
+
69
+ for (let i = 0; i < text.length; i += maxLength) {
70
+ chunks.push(text.slice(i, i + maxLength));
71
+ }
72
+
73
+ return chunks;
74
+ }
75
+
76
+ /**
77
+ * Split a single chunk that exceeds maxLength into smaller chunks.
78
+ * Uses word-based splitting for texts with spaces, character-based for texts without.
79
+ *
80
+ * @param chunk - Text chunk to split
81
+ * @param maxLength - Maximum length of each chunk
82
+ * @returns Array of text chunks, all under maxLength
83
+ */
84
+ function splitOversizedChunk(chunk: string, maxLength: number): string[] {
85
+ if (chunk.length <= maxLength) {
86
+ return [chunk];
87
+ }
88
+
89
+ // Check if text has spaces (word-based splitting possible)
90
+ if (hasSpaces(chunk)) {
91
+ return splitByWords(chunk, maxLength);
92
+ }
93
+
94
+ // No spaces: use character-based splitting (Japanese, Chinese, etc.)
95
+ return splitByCharacters(chunk, maxLength);
96
+ }
97
+
10
98
  /**
11
99
  * Split input text into sentence chunks suitable for TTS processing.
12
100
  *
@@ -15,9 +103,16 @@ import { DEFAULT_MAX_TEXT_LENGTH } from "./constants.js";
15
103
  * It handles various punctuation patterns and provides graceful fallback to
16
104
  * word/character boundaries when necessary.
17
105
  *
106
+ * Chunking Strategy:
107
+ * 1. First, split by sentence boundaries (punctuation: .!?;:)
108
+ * 2. Merge sentences into chunks up to maxLength
109
+ * 3. If a sentence exceeds maxLength:
110
+ * - For text with spaces: split by words
111
+ * - For text without spaces (Japanese, etc.): split by characters
112
+ *
18
113
  * @param text - Input text to be segmented
19
114
  * @param maxLength - Maximum length of each chunk
20
- * @returns Array of text chunks
115
+ * @returns Array of text chunks, each guaranteed to be <= maxLength
21
116
  */
22
117
  export function chunkText(
23
118
  text: string,
@@ -27,28 +122,43 @@ export function chunkText(
27
122
  return [text];
28
123
  }
29
124
 
30
- // Split by sentence boundaries
31
- const sentences = text.split(/([.!?;:]+\s*)/);
125
+ // Step 1: Split by sentence boundaries (including various punctuation marks)
126
+ // Includes Western punctuation (.!?;:) and CJK punctuation (。!?;:)
127
+ const sentences = text.split(/([.!?;:。!?;:]+\s*)/);
32
128
 
33
- const chunks: string[] = [];
129
+ const preliminaryChunks: string[] = [];
34
130
  let currentChunk = "";
35
131
 
132
+ // Step 2: Merge sentences into chunks up to maxLength
36
133
  for (const sentence of sentences) {
37
134
  if (currentChunk.length + sentence.length <= maxLength) {
38
135
  currentChunk += sentence;
39
136
  } else {
40
137
  if (currentChunk) {
41
- chunks.push(currentChunk);
138
+ preliminaryChunks.push(currentChunk);
42
139
  }
43
140
  currentChunk = sentence;
44
141
  }
45
142
  }
46
143
 
47
144
  if (currentChunk) {
48
- chunks.push(currentChunk);
145
+ preliminaryChunks.push(currentChunk);
49
146
  }
50
147
 
51
- return chunks;
148
+ // Step 3: Handle oversized chunks (split by words or characters)
149
+ const finalChunks: string[] = [];
150
+ for (const chunk of preliminaryChunks) {
151
+ if (chunk.length <= maxLength) {
152
+ finalChunks.push(chunk);
153
+ } else {
154
+ // Chunk exceeds maxLength, need to split further
155
+ const subChunks = splitOversizedChunk(chunk, maxLength);
156
+ finalChunks.push(...subChunks);
157
+ }
158
+ }
159
+
160
+ // Filter out empty chunks
161
+ return finalChunks.filter((chunk) => chunk.length > 0);
52
162
  }
53
163
 
54
164
  /**
@@ -22,8 +22,26 @@ export const APIConvertTextToSpeechUsingCharacterRequestLanguage = {
22
22
  En: "en",
23
23
  Ko: "ko",
24
24
  Ja: "ja",
25
+ Bg: "bg",
26
+ Cs: "cs",
27
+ Da: "da",
28
+ El: "el",
25
29
  Es: "es",
30
+ Et: "et",
31
+ Fi: "fi",
32
+ Hu: "hu",
33
+ It: "it",
34
+ Nl: "nl",
35
+ Pl: "pl",
26
36
  Pt: "pt",
37
+ Ro: "ro",
38
+ Ar: "ar",
39
+ De: "de",
40
+ Fr: "fr",
41
+ Hi: "hi",
42
+ Id: "id",
43
+ Ru: "ru",
44
+ Vi: "vi",
27
45
  } as const;
28
46
  /**
29
47
  * The language code of the text
@@ -32,6 +50,22 @@ export type APIConvertTextToSpeechUsingCharacterRequestLanguage = ClosedEnum<
32
50
  typeof APIConvertTextToSpeechUsingCharacterRequestLanguage
33
51
  >;
34
52
 
53
+ /**
54
+ * The model type to use for the text-to-speech conversion
55
+ */
56
+ export const APIConvertTextToSpeechUsingCharacterRequestModel = {
57
+ SonaSpeech1: "sona_speech_1",
58
+ SonaSpeech2: "sona_speech_2",
59
+ SonaSpeech2t: "sona_speech_2t",
60
+ SupertonicApi1: "supertonic_api_1",
61
+ } as const;
62
+ /**
63
+ * The model type to use for the text-to-speech conversion
64
+ */
65
+ export type APIConvertTextToSpeechUsingCharacterRequestModel = ClosedEnum<
66
+ typeof APIConvertTextToSpeechUsingCharacterRequestModel
67
+ >;
68
+
35
69
  /**
36
70
  * The desired output format of the audio file (wav, mp3). Default is wav.
37
71
  */
@@ -61,7 +95,7 @@ export type APIConvertTextToSpeechUsingCharacterRequest = {
61
95
  /**
62
96
  * The model type to use for the text-to-speech conversion
63
97
  */
64
- model?: string | undefined;
98
+ model?: APIConvertTextToSpeechUsingCharacterRequestModel | undefined;
65
99
  /**
66
100
  * The desired output format of the audio file (wav, mp3). Default is wav.
67
101
  */
@@ -98,6 +132,29 @@ export namespace APIConvertTextToSpeechUsingCharacterRequestLanguage$ {
98
132
  APIConvertTextToSpeechUsingCharacterRequestLanguage$outboundSchema;
99
133
  }
100
134
 
135
+ /** @internal */
136
+ export const APIConvertTextToSpeechUsingCharacterRequestModel$inboundSchema:
137
+ z.ZodNativeEnum<typeof APIConvertTextToSpeechUsingCharacterRequestModel> = z
138
+ .nativeEnum(APIConvertTextToSpeechUsingCharacterRequestModel);
139
+
140
+ /** @internal */
141
+ export const APIConvertTextToSpeechUsingCharacterRequestModel$outboundSchema:
142
+ z.ZodNativeEnum<typeof APIConvertTextToSpeechUsingCharacterRequestModel> =
143
+ APIConvertTextToSpeechUsingCharacterRequestModel$inboundSchema;
144
+
145
+ /**
146
+ * @internal
147
+ * @deprecated This namespace will be removed in future versions. Use schemas and types that are exported directly from this module.
148
+ */
149
+ export namespace APIConvertTextToSpeechUsingCharacterRequestModel$ {
150
+ /** @deprecated use `APIConvertTextToSpeechUsingCharacterRequestModel$inboundSchema` instead. */
151
+ export const inboundSchema =
152
+ APIConvertTextToSpeechUsingCharacterRequestModel$inboundSchema;
153
+ /** @deprecated use `APIConvertTextToSpeechUsingCharacterRequestModel$outboundSchema` instead. */
154
+ export const outboundSchema =
155
+ APIConvertTextToSpeechUsingCharacterRequestModel$outboundSchema;
156
+ }
157
+
101
158
  /** @internal */
102
159
  export const APIConvertTextToSpeechUsingCharacterRequestOutputFormat$inboundSchema:
103
160
  z.ZodNativeEnum<
@@ -133,7 +190,8 @@ export const APIConvertTextToSpeechUsingCharacterRequest$inboundSchema:
133
190
  text: z.string(),
134
191
  language: APIConvertTextToSpeechUsingCharacterRequestLanguage$inboundSchema,
135
192
  style: z.string().optional(),
136
- model: z.string().default("sona_speech_1"),
193
+ model: APIConvertTextToSpeechUsingCharacterRequestModel$inboundSchema
194
+ .default("sona_speech_1"),
137
195
  output_format:
138
196
  APIConvertTextToSpeechUsingCharacterRequestOutputFormat$inboundSchema
139
197
  .default("wav"),
@@ -169,7 +227,8 @@ export const APIConvertTextToSpeechUsingCharacterRequest$outboundSchema:
169
227
  language:
170
228
  APIConvertTextToSpeechUsingCharacterRequestLanguage$outboundSchema,
171
229
  style: z.string().optional(),
172
- model: z.string().default("sona_speech_1"),
230
+ model: APIConvertTextToSpeechUsingCharacterRequestModel$outboundSchema
231
+ .default("sona_speech_1"),
173
232
  outputFormat:
174
233
  APIConvertTextToSpeechUsingCharacterRequestOutputFormat$outboundSchema
175
234
  .default("wav"),
@@ -22,8 +22,26 @@ export const PredictTTSDurationUsingCharacterRequestLanguage = {
22
22
  En: "en",
23
23
  Ko: "ko",
24
24
  Ja: "ja",
25
+ Bg: "bg",
26
+ Cs: "cs",
27
+ Da: "da",
28
+ El: "el",
25
29
  Es: "es",
30
+ Et: "et",
31
+ Fi: "fi",
32
+ Hu: "hu",
33
+ It: "it",
34
+ Nl: "nl",
35
+ Pl: "pl",
26
36
  Pt: "pt",
37
+ Ro: "ro",
38
+ Ar: "ar",
39
+ De: "de",
40
+ Fr: "fr",
41
+ Hi: "hi",
42
+ Id: "id",
43
+ Ru: "ru",
44
+ Vi: "vi",
27
45
  } as const;
28
46
  /**
29
47
  * Language code of the voice
@@ -32,6 +50,22 @@ export type PredictTTSDurationUsingCharacterRequestLanguage = ClosedEnum<
32
50
  typeof PredictTTSDurationUsingCharacterRequestLanguage
33
51
  >;
34
52
 
53
+ /**
54
+ * The model type to use for the text-to-speech conversion
55
+ */
56
+ export const PredictTTSDurationUsingCharacterRequestModel = {
57
+ SonaSpeech1: "sona_speech_1",
58
+ SonaSpeech2: "sona_speech_2",
59
+ SonaSpeech2t: "sona_speech_2t",
60
+ SupertonicApi1: "supertonic_api_1",
61
+ } as const;
62
+ /**
63
+ * The model type to use for the text-to-speech conversion
64
+ */
65
+ export type PredictTTSDurationUsingCharacterRequestModel = ClosedEnum<
66
+ typeof PredictTTSDurationUsingCharacterRequestModel
67
+ >;
68
+
35
69
  /**
36
70
  * The desired output format of the audio file (wav, mp3). Default is wav.
37
71
  */
@@ -62,7 +96,7 @@ export type PredictTTSDurationUsingCharacterRequest = {
62
96
  /**
63
97
  * The model type to use for the text-to-speech conversion
64
98
  */
65
- model?: string | undefined;
99
+ model?: PredictTTSDurationUsingCharacterRequestModel | undefined;
66
100
  /**
67
101
  * The desired output format of the audio file (wav, mp3). Default is wav.
68
102
  */
@@ -95,6 +129,29 @@ export namespace PredictTTSDurationUsingCharacterRequestLanguage$ {
95
129
  PredictTTSDurationUsingCharacterRequestLanguage$outboundSchema;
96
130
  }
97
131
 
132
+ /** @internal */
133
+ export const PredictTTSDurationUsingCharacterRequestModel$inboundSchema:
134
+ z.ZodNativeEnum<typeof PredictTTSDurationUsingCharacterRequestModel> = z
135
+ .nativeEnum(PredictTTSDurationUsingCharacterRequestModel);
136
+
137
+ /** @internal */
138
+ export const PredictTTSDurationUsingCharacterRequestModel$outboundSchema:
139
+ z.ZodNativeEnum<typeof PredictTTSDurationUsingCharacterRequestModel> =
140
+ PredictTTSDurationUsingCharacterRequestModel$inboundSchema;
141
+
142
+ /**
143
+ * @internal
144
+ * @deprecated This namespace will be removed in future versions. Use schemas and types that are exported directly from this module.
145
+ */
146
+ export namespace PredictTTSDurationUsingCharacterRequestModel$ {
147
+ /** @deprecated use `PredictTTSDurationUsingCharacterRequestModel$inboundSchema` instead. */
148
+ export const inboundSchema =
149
+ PredictTTSDurationUsingCharacterRequestModel$inboundSchema;
150
+ /** @deprecated use `PredictTTSDurationUsingCharacterRequestModel$outboundSchema` instead. */
151
+ export const outboundSchema =
152
+ PredictTTSDurationUsingCharacterRequestModel$outboundSchema;
153
+ }
154
+
98
155
  /** @internal */
99
156
  export const PredictTTSDurationUsingCharacterRequestOutputFormat$inboundSchema:
100
157
  z.ZodNativeEnum<typeof PredictTTSDurationUsingCharacterRequestOutputFormat> =
@@ -127,7 +184,9 @@ export const PredictTTSDurationUsingCharacterRequest$inboundSchema: z.ZodType<
127
184
  text: z.string(),
128
185
  language: PredictTTSDurationUsingCharacterRequestLanguage$inboundSchema,
129
186
  style: z.string().optional(),
130
- model: z.string().default("sona_speech_1"),
187
+ model: PredictTTSDurationUsingCharacterRequestModel$inboundSchema.default(
188
+ "sona_speech_1",
189
+ ),
131
190
  output_format:
132
191
  PredictTTSDurationUsingCharacterRequestOutputFormat$inboundSchema.default(
133
192
  "wav",
@@ -159,7 +218,9 @@ export const PredictTTSDurationUsingCharacterRequest$outboundSchema: z.ZodType<
159
218
  text: z.string(),
160
219
  language: PredictTTSDurationUsingCharacterRequestLanguage$outboundSchema,
161
220
  style: z.string().optional(),
162
- model: z.string().default("sona_speech_1"),
221
+ model: PredictTTSDurationUsingCharacterRequestModel$outboundSchema.default(
222
+ "sona_speech_1",
223
+ ),
163
224
  outputFormat:
164
225
  PredictTTSDurationUsingCharacterRequestOutputFormat$outboundSchema.default(
165
226
  "wav",