@supertone/supertone 0.1.1 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. package/README.md +119 -69
  2. package/custom_test/realtime_tts_player.ts +177 -12
  3. package/custom_test/test_pronunciation_dictionary.ts +227 -0
  4. package/custom_test/test_real_api.ts +1677 -162
  5. package/custom_test/test_text_utils_chunk_text_punctuation.ts +55 -0
  6. package/dist/commonjs/lib/config.d.ts +2 -2
  7. package/dist/commonjs/lib/config.d.ts.map +1 -1
  8. package/dist/commonjs/lib/config.js +2 -2
  9. package/dist/commonjs/lib/config.js.map +1 -1
  10. package/dist/commonjs/lib/custom_utils/index.d.ts +1 -0
  11. package/dist/commonjs/lib/custom_utils/index.d.ts.map +1 -1
  12. package/dist/commonjs/lib/custom_utils/index.js +5 -1
  13. package/dist/commonjs/lib/custom_utils/index.js.map +1 -1
  14. package/dist/commonjs/lib/custom_utils/pronunciation_utils.d.ts +24 -0
  15. package/dist/commonjs/lib/custom_utils/pronunciation_utils.d.ts.map +1 -0
  16. package/dist/commonjs/lib/custom_utils/pronunciation_utils.js +145 -0
  17. package/dist/commonjs/lib/custom_utils/pronunciation_utils.js.map +1 -0
  18. package/dist/commonjs/lib/custom_utils/text_utils.d.ts +8 -1
  19. package/dist/commonjs/lib/custom_utils/text_utils.d.ts.map +1 -1
  20. package/dist/commonjs/lib/custom_utils/text_utils.js +125 -7
  21. package/dist/commonjs/lib/custom_utils/text_utils.js.map +1 -1
  22. package/dist/commonjs/models/apiconverttexttospeechusingcharacterrequest.d.ts +92 -1
  23. package/dist/commonjs/models/apiconverttexttospeechusingcharacterrequest.d.ts.map +1 -1
  24. package/dist/commonjs/models/apiconverttexttospeechusingcharacterrequest.js +48 -3
  25. package/dist/commonjs/models/apiconverttexttospeechusingcharacterrequest.js.map +1 -1
  26. package/dist/commonjs/models/predictttsdurationusingcharacterrequest.d.ts +92 -1
  27. package/dist/commonjs/models/predictttsdurationusingcharacterrequest.d.ts.map +1 -1
  28. package/dist/commonjs/models/predictttsdurationusingcharacterrequest.js +46 -3
  29. package/dist/commonjs/models/predictttsdurationusingcharacterrequest.js.map +1 -1
  30. package/dist/commonjs/sdk/texttospeech.d.ts +17 -6
  31. package/dist/commonjs/sdk/texttospeech.d.ts.map +1 -1
  32. package/dist/commonjs/sdk/texttospeech.js +48 -25
  33. package/dist/commonjs/sdk/texttospeech.js.map +1 -1
  34. package/dist/esm/lib/config.d.ts +2 -2
  35. package/dist/esm/lib/config.d.ts.map +1 -1
  36. package/dist/esm/lib/config.js +2 -2
  37. package/dist/esm/lib/config.js.map +1 -1
  38. package/dist/esm/lib/custom_utils/index.d.ts +1 -0
  39. package/dist/esm/lib/custom_utils/index.d.ts.map +1 -1
  40. package/dist/esm/lib/custom_utils/index.js +2 -0
  41. package/dist/esm/lib/custom_utils/index.js.map +1 -1
  42. package/dist/esm/lib/custom_utils/pronunciation_utils.d.ts +24 -0
  43. package/dist/esm/lib/custom_utils/pronunciation_utils.d.ts.map +1 -0
  44. package/dist/esm/lib/custom_utils/pronunciation_utils.js +140 -0
  45. package/dist/esm/lib/custom_utils/pronunciation_utils.js.map +1 -0
  46. package/dist/esm/lib/custom_utils/text_utils.d.ts +8 -1
  47. package/dist/esm/lib/custom_utils/text_utils.d.ts.map +1 -1
  48. package/dist/esm/lib/custom_utils/text_utils.js +125 -7
  49. package/dist/esm/lib/custom_utils/text_utils.js.map +1 -1
  50. package/dist/esm/models/apiconverttexttospeechusingcharacterrequest.d.ts +92 -1
  51. package/dist/esm/models/apiconverttexttospeechusingcharacterrequest.d.ts.map +1 -1
  52. package/dist/esm/models/apiconverttexttospeechusingcharacterrequest.js +47 -2
  53. package/dist/esm/models/apiconverttexttospeechusingcharacterrequest.js.map +1 -1
  54. package/dist/esm/models/predictttsdurationusingcharacterrequest.d.ts +92 -1
  55. package/dist/esm/models/predictttsdurationusingcharacterrequest.d.ts.map +1 -1
  56. package/dist/esm/models/predictttsdurationusingcharacterrequest.js +45 -2
  57. package/dist/esm/models/predictttsdurationusingcharacterrequest.js.map +1 -1
  58. package/dist/esm/sdk/texttospeech.d.ts +17 -6
  59. package/dist/esm/sdk/texttospeech.d.ts.map +1 -1
  60. package/dist/esm/sdk/texttospeech.js +49 -26
  61. package/dist/esm/sdk/texttospeech.js.map +1 -1
  62. package/examples/custom_voices/create_cloned_voice.ts +4 -3
  63. package/examples/custom_voices/delete_custom_voice.ts +2 -7
  64. package/examples/custom_voices/edit_custom_voice.ts +2 -6
  65. package/examples/custom_voices/get_custom_voice.ts +2 -7
  66. package/examples/custom_voices/list_custom_voices.ts +2 -7
  67. package/examples/custom_voices/search_custom_voices.ts +2 -6
  68. package/examples/text_to_speech/create_speech.ts +3 -8
  69. package/examples/text_to_speech/create_speech_long_text.ts +3 -7
  70. package/examples/text_to_speech/create_speech_with_phonemes.ts +3 -7
  71. package/examples/text_to_speech/create_speech_with_voice_settings.ts +3 -8
  72. package/examples/text_to_speech/predict_duration.ts +3 -7
  73. package/examples/text_to_speech/stream_speech.ts +3 -7
  74. package/examples/text_to_speech/stream_speech_long_text.ts +3 -7
  75. package/examples/text_to_speech/stream_speech_with_phonemes.ts +3 -7
  76. package/examples/text_to_speech/stream_speech_with_voice_settings.ts +3 -7
  77. package/examples/usage/get_credit_balance.ts +2 -6
  78. package/examples/usage/get_usage.ts +2 -6
  79. package/examples/usage/get_voice_usage.ts +2 -7
  80. package/examples/voices/get_voice.ts +2 -6
  81. package/examples/voices/list_voices.ts +2 -6
  82. package/examples/voices/search_voices.ts +2 -7
  83. package/jsr.json +1 -1
  84. package/openapi.json +101 -9
  85. package/package.json +1 -1
  86. package/src/lib/config.ts +41 -41
  87. package/src/lib/custom_utils/index.ts +7 -0
  88. package/src/lib/custom_utils/pronunciation_utils.ts +193 -0
  89. package/src/lib/custom_utils/text_utils.ts +138 -7
  90. package/src/models/apiconverttexttospeechusingcharacterrequest.ts +62 -3
  91. package/src/models/predictttsdurationusingcharacterrequest.ts +64 -3
  92. package/src/sdk/texttospeech.ts +99 -68
package/README.md CHANGED
@@ -3,15 +3,12 @@
3
3
  ![LOGO](https://github.com/supertone-inc/supertone-ts/blob/main/images/hero-light.png?raw=true)
4
4
 
5
5
  <!-- Start Summary [summary] -->
6
+ ## Summary
6
7
 
7
- ## API & Docs
8
-
9
- The official TypesSript SDK for [Supertone API](https://www.supertone.ai/en/api)
10
-
8
+ Supertone Public API: Supertone API is a RESTful API for using our state-of-the-art AI voice models.
11
9
  <!-- End Summary [summary] -->
12
10
 
13
11
  <!-- Start SDK Installation [installation] -->
14
-
15
12
  ## SDK Installation
16
13
 
17
14
  The SDK can be installed with either [npm](https://www.npmjs.com/), [pnpm](https://pnpm.io/), [bun](https://bun.sh/) or [yarn](https://classic.yarnpkg.com/en/) package managers.
@@ -45,20 +42,16 @@ yarn add @supertone/supertone zod
45
42
 
46
43
  > [!NOTE]
47
44
  > This package is published with CommonJS and ES Modules (ESM) support.
48
-
49
45
  <!-- End SDK Installation [installation] -->
50
46
 
51
47
  <!-- Start Requirements [requirements] -->
52
-
53
48
  ## Requirements
54
49
 
55
50
  For supported JavaScript runtimes, please consult [RUNTIMES.md](RUNTIMES.md).
56
-
57
51
  <!-- End Requirements [requirements] -->
58
52
 
59
53
  <!-- Start SDK Example Usage [usage] -->
60
-
61
- ## Speech Generation Example
54
+ ## SDK Example Usage
62
55
 
63
56
  ### Example
64
57
 
@@ -84,11 +77,9 @@ async function run() {
84
77
  run();
85
78
 
86
79
  ```
87
-
88
80
  <!-- End SDK Example Usage [usage] -->
89
81
 
90
82
  <!-- Start Authentication [security] -->
91
-
92
83
  ## Authentication
93
84
 
94
85
  ### Per-Client Security Schemes
@@ -100,7 +91,6 @@ This SDK supports the following security scheme globally:
100
91
  | `apiKey` | apiKey | API key |
101
92
 
102
93
  To authenticate with the API the `apiKey` parameter must be set when initializing the SDK client instance. For example:
103
-
104
94
  ```typescript
105
95
  import { Supertone } from "@supertone/supertone";
106
96
 
@@ -123,44 +113,108 @@ async function run() {
123
113
  run();
124
114
 
125
115
  ```
126
-
127
116
  <!-- End Authentication [security] -->
128
117
 
129
- <!-- Start Available Resources and Operations [operations] -->
118
+ <!-- Start Models [models] -->
119
+
120
+ ## Models
121
+
122
+ Supertone’s Text-to-Speech API provides multiple TTS models, each with different supported languages, available voice settings, and streaming capabilities.
123
+
124
+ ### Model Overview
125
+
126
+ | Model Name | Identifier | Streaming Support (`stream_speech`) | Voice Settings Support |
127
+ |--------------------|-------------------|--------------------------------------|----------------------------------------------------------|
128
+ | **SONA Speech 1** | `sona_speech_1` | ✅ Supported | Supports **all** Voice Settings |
129
+ | **Supertonic API 1** | `supertonic_api_1` | ❌ Not supported | Supports **only** the `speed` setting (others are ignored) |
130
+ | **SONA Speech 2** | `sona_speech_2` | ❌ Not supported | Supports **pitch_shift**, **pitch_variance**, **speed** |
131
+
132
+ > [!NOTE]
133
+ > **Streaming Support**
134
+ >
135
+ > Streaming TTS using the `stream_speech` endpoint is **only available for the `sona_speech_1` model**.
136
+
137
+ ---
138
+
139
+ ### Supported Languages by Model
130
140
 
141
+ > [!NOTE]
142
+ > The set of supported input languages varies depending on the TTS model.
143
+
144
+ - **sona_speech_1**
145
+ - `en`, `ko`, `ja`
146
+
147
+ - **supertonic_api_1**
148
+ - `en`, `ko`, `ja`, `es`, `pt`
149
+
150
+ - **sona_speech_2**
151
+ - `en`, `ko`, `ja`, `bg`, `cs`, `da`, `el`, `es`, `et`, `fi`, `hu`, `it`, `nl`, `pl`, `pt`, `ro`,
152
+ `ar`, `de`, `fr`, `hi`, `id`, `ru`, `vi`
153
+
154
+ ---
155
+
156
+ ### Voice Settings (Optional)
157
+
158
+ Some TTS models support optional voice settings that allow fine control over output speech characteristics (e.g., speed, pitch, pitch variance).
159
+
160
+ > [!NOTE]
161
+ > The available Voice Settings vary depending on the TTS model.
162
+
163
+ - **sona_speech_1**
164
+ - Supports **all** available Voice Settings.
165
+
166
+ - **supertonic_api_1**
167
+ - Supports **only** the `speed` setting.
168
+ All other settings will be ignored.
169
+
170
+ - **sona_speech_2**
171
+ - Supports the following Voice Settings:
172
+ - `pitch_shift`
173
+ - `pitch_variance`
174
+ - `speed`
175
+
176
+ > All Voice Settings are optional. When omitted, each model’s default values will be applied.
177
+
178
+ <!-- End Models [models] -->
179
+
180
+ <!-- Start Available Resources and Operations [operations] -->
131
181
  ## Available Resources and Operations
132
182
 
183
+ <details open>
184
+ <summary>Available methods</summary>
185
+
133
186
  ### [customVoices](docs/sdks/customvoices/README.md)
134
187
 
135
- - [createClonedVoice](docs/sdks/customvoices/README.md#createclonedvoice) - Create cloned voice
136
- - [listCustomVoices](docs/sdks/customvoices/README.md#listcustomvoices) - Gets custom (cloned) voices
137
- - [searchCustomVoices](docs/sdks/customvoices/README.md#searchcustomvoices) - Search custom (cloned) voices
138
- - [getCustomVoice](docs/sdks/customvoices/README.md#getcustomvoice) - Get single cloned voice
139
- - [editCustomVoice](docs/sdks/customvoices/README.md#editcustomvoice) - Update cloned voice (partial update)
140
- - [deleteCustomVoice](docs/sdks/customvoices/README.md#deletecustomvoice) - Delete cloned voice
188
+ * [createClonedVoice](docs/sdks/customvoices/README.md#createclonedvoice) - Create cloned voice
189
+ * [listCustomVoices](docs/sdks/customvoices/README.md#listcustomvoices) - Gets custom (cloned) voices
190
+ * [searchCustomVoices](docs/sdks/customvoices/README.md#searchcustomvoices) - Search custom (cloned) voices
191
+ * [getCustomVoice](docs/sdks/customvoices/README.md#getcustomvoice) - Get single cloned voice
192
+ * [editCustomVoice](docs/sdks/customvoices/README.md#editcustomvoice) - Update cloned voice (partial update)
193
+ * [deleteCustomVoice](docs/sdks/customvoices/README.md#deletecustomvoice) - Delete cloned voice
194
+
141
195
 
142
196
  ### [textToSpeech](docs/sdks/texttospeech/README.md)
143
197
 
144
- - [createSpeech](docs/sdks/texttospeech/README.md#createspeech) - Convert text to speech
145
- - [streamSpeech](docs/sdks/texttospeech/README.md#streamspeech) - Convert text to speech with streaming response
146
- - [predictDuration](docs/sdks/texttospeech/README.md#predictduration) - Predict text-to-speech duration
198
+ * [createSpeech](docs/sdks/texttospeech/README.md#createspeech) - Convert text to speech
199
+ * [streamSpeech](docs/sdks/texttospeech/README.md#streamspeech) - Convert text to speech with streaming response
200
+ * [predictDuration](docs/sdks/texttospeech/README.md#predictduration) - Predict text-to-speech duration
147
201
 
148
202
  ### [usage](docs/sdks/usage/README.md)
149
203
 
150
- - [getVoiceUsage](docs/sdks/usage/README.md#getvoiceusage) - Retrieve TTS API usage data
151
- - [getUsage](docs/sdks/usage/README.md#getusage) - Retrieve advanced API usage analytics
152
- - [getCreditBalance](docs/sdks/usage/README.md#getcreditbalance) - Retrieve credit balance
204
+ * [getVoiceUsage](docs/sdks/usage/README.md#getvoiceusage) - Retrieve TTS API usage data
205
+ * [getUsage](docs/sdks/usage/README.md#getusage) - Retrieve advanced API usage analytics
206
+ * [getCreditBalance](docs/sdks/usage/README.md#getcreditbalance) - Retrieve credit balance
153
207
 
154
208
  ### [voices](docs/sdks/voices/README.md)
155
209
 
156
- - [listVoices](docs/sdks/voices/README.md#listvoices) - Gets available voices
157
- - [searchVoices](docs/sdks/voices/README.md#searchvoices) - Search voices.
158
- - [getVoice](docs/sdks/voices/README.md#getvoice) - Get voice details by ID
210
+ * [listVoices](docs/sdks/voices/README.md#listvoices) - Gets available voices
211
+ * [searchVoices](docs/sdks/voices/README.md#searchvoices) - Search voices.
212
+ * [getVoice](docs/sdks/voices/README.md#getvoice) - Get voice details by ID
159
213
 
214
+ </details>
160
215
  <!-- End Available Resources and Operations [operations] -->
161
216
 
162
217
  <!-- Start Error Handling [errors] -->
163
-
164
218
  ## Error Handling
165
219
 
166
220
  [`SupertoneError`](./src/models/errors/supertoneerror.ts) is the base class for all HTTP error responses. It has the following properties:
@@ -174,44 +228,7 @@ run();
174
228
  | `error.rawResponse` | `Response` | Raw HTTP response |
175
229
  | `error.data$` | | Optional. Some errors may contain structured data. [See Error Classes](#error-classes). |
176
230
 
177
- ### Error Classes
178
-
179
- **Primary error:**
180
-
181
- - [`SupertoneError`](./src/models/errors/supertoneerror.ts): The base class for HTTP error responses.
182
-
183
- <details><summary>Less common errors (16)</summary>
184
-
185
- <br />
186
-
187
- **Network errors:**
188
-
189
- - [`ConnectionError`](./src/models/errors/httpclienterrors.ts): HTTP client was unable to make a request to a server.
190
- - [`RequestTimeoutError`](./src/models/errors/httpclienterrors.ts): HTTP request timed out due to an AbortSignal signal.
191
- - [`RequestAbortedError`](./src/models/errors/httpclienterrors.ts): HTTP request was aborted by the client.
192
- - [`InvalidRequestError`](./src/models/errors/httpclienterrors.ts): Any input used to create a request is invalid.
193
- - [`UnexpectedClientError`](./src/models/errors/httpclienterrors.ts): Unrecognised or unexpected error.
194
-
195
- **Inherit from [`SupertoneError`](./src/models/errors/supertoneerror.ts)**:
196
-
197
- - [`UnauthorizedErrorResponse`](./src/models/errors/unauthorizederrorresponse.ts): Unauthorized: Invalid API key. Status code `401`. Applicable to 10 of 15 methods.\*
198
- - [`InternalServerErrorResponse`](./src/models/errors/internalservererrorresponse.ts): Status code `500`. Applicable to 10 of 15 methods.\*
199
- - [`NotFoundErrorResponse`](./src/models/errors/notfounderrorresponse.ts): Status code `404`. Applicable to 9 of 15 methods.\*
200
- - [`BadRequestErrorResponse`](./src/models/errors/badrequesterrorresponse.ts): Status code `400`. Applicable to 5 of 15 methods.\*
201
- - [`ForbiddenErrorResponse`](./src/models/errors/forbiddenerrorresponse.ts): Status code `403`. Applicable to 4 of 15 methods.\*
202
- - [`RequestTimeoutErrorResponse`](./src/models/errors/requesttimeouterrorresponse.ts): Status code `408`. Applicable to 4 of 15 methods.\*
203
- - [`TooManyRequestsErrorResponse`](./src/models/errors/toomanyrequestserrorresponse.ts): Status code `429`. Applicable to 4 of 15 methods.\*
204
- - [`PaymentRequiredErrorResponse`](./src/models/errors/paymentrequirederrorresponse.ts): Status code `402`. Applicable to 3 of 15 methods.\*
205
- - [`PayloadTooLargeErrorResponse`](./src/models/errors/payloadtoolargeerrorresponse.ts): Payload Too Large: File size exceeds 3MB limit. Status code `413`. Applicable to 1 of 15 methods.\*
206
- - [`UnsupportedMediaTypeErrorResponse`](./src/models/errors/unsupportedmediatypeerrorresponse.ts): Unsupported Media Type: Invalid audio file format. Status code `415`. Applicable to 1 of 15 methods.\*
207
- - [`ResponseValidationError`](./src/models/errors/responsevalidationerror.ts): Type mismatch between the data returned from the server and the structure expected by the SDK. See `error.rawValue` for the raw value and `error.pretty()` for a nicely formatted multi-line string.
208
-
209
- </details>
210
-
211
- \* Check [the method documentation](#available-resources-and-operations) to see if the error is applicable.
212
-
213
- ### Error Handling Example
214
-
231
+ ### Example
215
232
  ```typescript
216
233
  import { Supertone } from "@supertone/supertone";
217
234
  import * as errors from "@supertone/supertone/models/errors";
@@ -252,6 +269,38 @@ run();
252
269
 
253
270
  ```
254
271
 
272
+ ### Error Classes
273
+ **Primary error:**
274
+ * [`SupertoneError`](./src/models/errors/supertoneerror.ts): The base class for HTTP error responses.
275
+
276
+ <details><summary>Less common errors (16)</summary>
277
+
278
+ <br />
279
+
280
+ **Network errors:**
281
+ * [`ConnectionError`](./src/models/errors/httpclienterrors.ts): HTTP client was unable to make a request to a server.
282
+ * [`RequestTimeoutError`](./src/models/errors/httpclienterrors.ts): HTTP request timed out due to an AbortSignal signal.
283
+ * [`RequestAbortedError`](./src/models/errors/httpclienterrors.ts): HTTP request was aborted by the client.
284
+ * [`InvalidRequestError`](./src/models/errors/httpclienterrors.ts): Any input used to create a request is invalid.
285
+ * [`UnexpectedClientError`](./src/models/errors/httpclienterrors.ts): Unrecognised or unexpected error.
286
+
287
+
288
+ **Inherit from [`SupertoneError`](./src/models/errors/supertoneerror.ts)**:
289
+ * [`UnauthorizedErrorResponse`](./src/models/errors/unauthorizederrorresponse.ts): Unauthorized: Invalid API key. Status code `401`. Applicable to 10 of 15 methods.*
290
+ * [`InternalServerErrorResponse`](./src/models/errors/internalservererrorresponse.ts): Status code `500`. Applicable to 10 of 15 methods.*
291
+ * [`NotFoundErrorResponse`](./src/models/errors/notfounderrorresponse.ts): Status code `404`. Applicable to 9 of 15 methods.*
292
+ * [`BadRequestErrorResponse`](./src/models/errors/badrequesterrorresponse.ts): Status code `400`. Applicable to 5 of 15 methods.*
293
+ * [`ForbiddenErrorResponse`](./src/models/errors/forbiddenerrorresponse.ts): Status code `403`. Applicable to 4 of 15 methods.*
294
+ * [`RequestTimeoutErrorResponse`](./src/models/errors/requesttimeouterrorresponse.ts): Status code `408`. Applicable to 4 of 15 methods.*
295
+ * [`TooManyRequestsErrorResponse`](./src/models/errors/toomanyrequestserrorresponse.ts): Status code `429`. Applicable to 4 of 15 methods.*
296
+ * [`PaymentRequiredErrorResponse`](./src/models/errors/paymentrequirederrorresponse.ts): Status code `402`. Applicable to 3 of 15 methods.*
297
+ * [`PayloadTooLargeErrorResponse`](./src/models/errors/payloadtoolargeerrorresponse.ts): Payload Too Large: File size exceeds 3MB limit. Status code `413`. Applicable to 1 of 15 methods.*
298
+ * [`UnsupportedMediaTypeErrorResponse`](./src/models/errors/unsupportedmediatypeerrorresponse.ts): Unsupported Media Type: Invalid audio file format. Status code `415`. Applicable to 1 of 15 methods.*
299
+ * [`ResponseValidationError`](./src/models/errors/responsevalidationerror.ts): Type mismatch between the data returned from the server and the structure expected by the SDK. See `error.rawValue` for the raw value and `error.pretty()` for a nicely formatted multi-line string.
300
+
301
+ </details>
302
+
303
+ \* Check [the method documentation](#available-resources-and-operations) to see if the error is applicable.
255
304
  <!-- End Error Handling [errors] -->
256
305
 
257
306
  <!-- Start Additional Example Code [examples] -->
@@ -262,4 +311,5 @@ Additional example code can be found in the [examples](https://github.com/supert
262
311
 
263
312
  <!-- End Additional Example Code [examples] -->
264
313
 
314
+
265
315
  <!-- Placeholder for Future Speakeasy SDK Sections -->
package/custom_test/realtime_tts_player.ts CHANGED
@@ -6,6 +6,7 @@
6
6
  import { spawn, ChildProcess } from "child_process";
7
7
  import { Supertone } from "../src/index.js";
8
8
  import * as models from "../src/models/index.js";
9
+ import type { PronunciationDictionaryEntry } from "../src/lib/custom_utils/index.js";
9
10
  import * as dotenv from "dotenv";
10
11
  import * as path from "path";
11
12
  import { fileURLToPath } from "url";
@@ -275,10 +276,17 @@ class SimpleMpvPlayer {
275
276
  */
276
277
  async function simpleStreamingTts(
277
278
  voiceId: string,
278
- text: string
279
+ text: string,
280
+ language: models.APIConvertTextToSpeechUsingCharacterRequestLanguage = models
281
+ .APIConvertTextToSpeechUsingCharacterRequestLanguage.Ko,
282
+ pronunciationDictionary?: PronunciationDictionaryEntry[]
279
283
  ): Promise<boolean> {
280
284
  console.log(`📝 "${text.slice(0, 50)}${text.length > 50 ? "..." : ""}"`);
281
285
  console.log(`📏 Text length: ${text.length} characters`);
286
+ console.log(`🌐 Language: ${language}`);
287
+ if (pronunciationDictionary && pronunciationDictionary.length > 0) {
288
+ console.log(`📖 Pronunciation dictionary: ${pronunciationDictionary.length} entries`);
289
+ }
282
290
 
283
291
  const player = new SimpleMpvPlayer();
284
292
 
@@ -292,17 +300,20 @@ async function simpleStreamingTts(
292
300
  player.markApiCallStart();
293
301
  console.log(" ⏱️ API call started...");
294
302
 
295
- const response = await client.textToSpeech.streamSpeech({
296
- voiceId: voiceId,
297
- apiConvertTextToSpeechUsingCharacterRequest: {
298
- text: text,
299
- language: models.APIConvertTextToSpeechUsingCharacterRequestLanguage.Ko,
300
- outputFormat:
301
- models.APIConvertTextToSpeechUsingCharacterRequestOutputFormat.Wav,
302
- style: "neutral",
303
- model: "sona_speech_1",
303
+ const response = await client.textToSpeech.streamSpeech(
304
+ {
305
+ voiceId: voiceId,
306
+ apiConvertTextToSpeechUsingCharacterRequest: {
307
+ text: text,
308
+ language: language,
309
+ outputFormat:
310
+ models.APIConvertTextToSpeechUsingCharacterRequestOutputFormat.Wav,
311
+ style: "neutral",
312
+ model: "sona_speech_1",
313
+ },
304
314
  },
305
- });
315
+ pronunciationDictionary ? { pronunciationDictionary } : undefined
316
+ );
306
317
 
307
318
  // Mark API call end (response received)
308
319
  player.markApiCallEnd();
@@ -384,7 +395,7 @@ async function simpleStreamingTts(
384
395
  */
385
396
  async function simpleDemo(): Promise<void> {
386
397
  const voiceId = "91992bbd4758bdcf9c9b01";
387
- const scenarios = [
398
+ const scenarios: string[] = [
388
399
  "안녕하세요! 심플한 테스트입니다.",
389
400
 
390
401
  "실시간 텍스트 음성 변환 기술은 정말 놀랍습니다. 이 기술을 통해 긴 텍스트도 즉시 음성으로 들을 수 있게 되었습니다.",
@@ -401,6 +412,115 @@ async function simpleDemo(): Promise<void> {
401
412
  "옛날 한 작은 마을에 천재적인 재능을 가진 젊은 개발자가 살고 있었습니다. 그의 이름은 민준이였고, 어릴 때부터 컴퓨터와 프로그래밍에 남다른 관심을 보였습니다. 대학에서 컴퓨터 과학을 전공한 민준은 졸업 후 스타트업에 입사했습니다. 그곳에서 그는 인공지능과 음성 기술에 대한 깊은 지식을 쌓게 되었습니다. 어느 날, 민준은 시각 장애가 있는 친구 서연을 만났습니다. 서연은 인터넷의 수많은 정보를 텍스트로만 접할 수 있어 많은 불편함을 겪고 있었습니다. 당시의 음성 합성 기술은 로봇 같은 목소리를 내며, 긴 텍스트를 읽어주려면 모든 처리가 끝날 때까지 기다려야 했습니다. 이를 본 민준은 더 자연스럽고 빠른 음성 합성 기술을 만들기로 결심했습니다. 밤낮없이 연구에 매진한 민준은 혁신적인 아이디어를 떠올렸습니다. 긴 텍스트를 작은 단위로 나누어 실시간으로 처리하고, 첫 번째 부분이 완성되는 즉시 재생을 시작하는 스트리밍 방식이었습니다. 이 기술을 구현하기 위해 그는 최신 딥러닝 모델과 신경망 아키텍처를 연구했습니다. 수많은 시행착오를 거쳐 마침내 자연스러운 음성을 실시간으로 생성할 수 있는 시스템을 완성했습니다. 그의 기술은 문장의 문맥과 감정까지 이해하여 적절한 억양과 속도로 읽어주었습니다.",
402
413
  ];
403
414
 
415
+ // Additional test scenarios for word-based and character-based chunking
416
+ const additionalScenarios: Array<{
417
+ text: string;
418
+ label: string;
419
+ category: string;
420
+ language: models.APIConvertTextToSpeechUsingCharacterRequestLanguage;
421
+ pronunciationDictionary?: PronunciationDictionaryEntry[];
422
+ }> = [
423
+ {
424
+ // Korean text WITHOUT punctuation to test word-based chunking
425
+ // Text length: ~450 characters (exceeds 300 char limit)
426
+ text: "이것은 구두점 없이 매우 긴 문장을 테스트하는 것으로 삼백 글자를 초과하는 텍스트에서 단어 기반 분할이 올바르게 작동하는지 확인하기 위한 것입니다 이러한 경우 SDK는 문장 경계 대신 단어 경계를 사용하여 텍스트를 적절한 크기로 나누어야 하며 이는 사용자가 생성한 콘텐츠에서 흔히 발생할 수 있는 상황입니다 예를 들어 채팅 메시지나 비공식적인 텍스트 입력에서는 올바른 문법과 구두점이 항상 보장되지 않기 때문입니다 또한 실시간 스트리밍 환경에서는 사용자가 빠르게 입력하는 경우가 많아서 구두점을 생략하는 경우가 빈번하게 발생합니다 이러한 상황에서도 SDK는 안정적으로 텍스트를 처리하고 자연스러운 음성을 생성해야 합니다 따라서 단어 기반 분할 기능은 매우 중요한 역할을 담당합니다",
427
+ label:
428
+ "Long sentence without punctuation (Word-based chunking, 450+ chars)",
429
+ category: "Word-based Chunking Test",
430
+ language: models.APIConvertTextToSpeechUsingCharacterRequestLanguage.Ko,
431
+ },
432
+ {
433
+ // Japanese text WITHOUT punctuation marks (。!?etc) to test pure character-based chunking
434
+ // Text length: ~450 characters (exceeds 300 char limit)
435
+ text: "日本語のテキストは通常スペースを含まないため特別な処理が必要ですこのテストは三百文字を超える長い日本語テキストが正しく処理されることを確認します自然言語処理技術の発展により音声合成の品質は大幅に向上しました特にディープラーニングを活用した最新のテキスト音声変換システムは人間の発話に非常に近い自然な音声を生成できますスペースがない言語では文字単位での分割が必要でありこのSDKはそのような状況を自動的に検出して適切に処理しますこれにより日本語中国語韓国語などのアジア言語でも問題なく長いテキストを音声に変換することができます音声合成技術は視覚障害者のためのアクセシビリティツールから対話型AIアシスタントまで幅広い用途で活用されていますさらにリアルタイムストリーミング技術と組み合わせることで待ち時間を大幅に短縮し優れたユーザー体験を提供することができます最新の音声合成技術は感情や抑揚も自然に表現できるようになりました",
436
+ label:
437
+ "Japanese text without spaces AND punctuation (Character-based chunking, 450+ chars)",
438
+ category: "Character-based Chunking Test",
439
+ language: models.APIConvertTextToSpeechUsingCharacterRequestLanguage.Ja,
440
+ },
441
+ {
442
+ // English text with ellipsis punctuation (… ‥) - tests fix/text_utils multilingual punctuation
443
+ // Text length: ~380 characters (exceeds 300 char limit)
444
+ text: "Sometimes we need to pause and think… The ellipsis character is used to indicate a trailing thought or a pause in speech… This test verifies that the text chunking system correctly handles Unicode ellipsis characters‥ There are multiple types of ellipsis in Unicode… The horizontal ellipsis and the two dot leader are both supported‥ When processing long texts the SDK should split at these punctuation marks… This ensures natural pauses in the generated speech output‥ Let us verify everything works correctly…",
445
+ label: "Ellipsis punctuation test (… ‥) - 380+ chars",
446
+ category: "Multilingual Punctuation Test",
447
+ language: models.APIConvertTextToSpeechUsingCharacterRequestLanguage.En,
448
+ },
449
+ {
450
+ // Korean text with ellipsis (…) - tests Korean with Unicode ellipsis
451
+ // Text length: ~350 characters (exceeds 300 char limit)
452
+ text: "한국어 텍스트에서 말줄임표는 생각의 흐름을 나타냅니다… 이 테스트는 유니코드 말줄임표 문자가 올바르게 처리되는지 확인합니다… 인공지능 기술이 발전하면서 음성 합성의 품질도 크게 향상되었습니다… 특히 딥러닝을 활용한 최신 시스템은 매우 자연스러운 음성을 생성할 수 있습니다… 긴 텍스트를 처리할 때 SDK는 이러한 구두점에서 적절히 분할해야 합니다… 이를 통해 자연스러운 음성 출력이 가능해집니다… 실시간 스트리밍 기술과 결합하면 더욱 빠른 응답을 제공할 수 있습니다… 음성 합성 기술은 접근성 도구부터 AI 어시스턴트까지 다양하게 활용됩니다… 모든 것이 제대로 작동하는지 확인해 봅시다…",
453
+ label: "Korean ellipsis punctuation test (…) - 350+ chars",
454
+ category: "Multilingual Punctuation Test",
455
+ language: models.APIConvertTextToSpeechUsingCharacterRequestLanguage.Ko,
456
+ },
457
+ {
458
+ // Japanese text WITH CJK punctuation (。!?) - tests CJK punctuation splitting
459
+ // Text length: ~320 characters (exceeds 300 char limit)
460
+ text: "日本語のテキストは通常スペースを含まないため特別な処理が必要です。このテストは日本語の句読点で正しく分割されることを確認します。自然言語処理技術の発展により音声合成の品質は大幅に向上しました。特にディープラーニングを活用した最新のテキスト音声変換システムは人間の発話に非常に近い自然な音声を生成できます。スペースがない言語では句読点での分割が重要です。このSDKはそのような状況を自動的に検出して適切に処理します。リアルタイムストリーミング技術と組み合わせることで待ち時間を大幅に短縮できます。これにより日本語でも問題なく長いテキストを音声に変換することができます。音声合成技術の未来はとても明るいです。",
461
+ label: "Japanese CJK punctuation test (。) - 320+ chars",
462
+ category: "Multilingual Punctuation Test",
463
+ language: models.APIConvertTextToSpeechUsingCharacterRequestLanguage.Ja,
464
+ },
465
+ // Pronunciation Dictionary Tests
466
+ {
467
+ // Basic pronunciation dictionary test with partial_match=true/false
468
+ text: "The CEO of OpenAI announced that GPT models are improving. Dr. Smith from MIT said AI research is accelerating.",
469
+ label: "Pronunciation dictionary (partial_match=true/false)",
470
+ category: "Pronunciation Dictionary Test",
471
+ language: models.APIConvertTextToSpeechUsingCharacterRequestLanguage.En,
472
+ pronunciationDictionary: [
473
+ // partial_match=false: exact word boundary match
474
+ { text: "CEO", pronunciation: "Chief Executive Officer", partial_match: false },
475
+ { text: "MIT", pronunciation: "Massachusetts Institute of Technology", partial_match: false },
476
+ { text: "AI", pronunciation: "Artificial Intelligence", partial_match: false },
477
+ // partial_match=true: substring match
478
+ { text: "GPT", pronunciation: "Generative Pre-trained Transformer", partial_match: true },
479
+ { text: "Dr.", pronunciation: "Doctor", partial_match: true },
480
+ ],
481
+ },
482
+ {
483
+ // Pronunciation dictionary causing text expansion to exceed 300 chars (triggers chunking)
484
+ // Original text: ~190 chars, After expansion: 400+ chars
485
+ text: "AI and ML are revolutionizing tech. The CEO discussed GPT advancements. Dr. Kim from MIT explained how NLP and CV work together. AWS and GCP provide cloud AI services.",
486
+ label: "Pronunciation dictionary + Long text chunking (~190 chars -> 400+ chars)",
487
+ category: "Pronunciation Dictionary + Chunking Test",
488
+ language: models.APIConvertTextToSpeechUsingCharacterRequestLanguage.En,
489
+ pronunciationDictionary: [
490
+ // partial_match=false: exact word boundary matches
491
+ { text: "AI", pronunciation: "Artificial Intelligence", partial_match: false },
492
+ { text: "ML", pronunciation: "Machine Learning", partial_match: false },
493
+ { text: "CEO", pronunciation: "Chief Executive Officer", partial_match: false },
494
+ { text: "MIT", pronunciation: "Massachusetts Institute of Technology", partial_match: false },
495
+ { text: "NLP", pronunciation: "Natural Language Processing", partial_match: false },
496
+ { text: "CV", pronunciation: "Computer Vision", partial_match: false },
497
+ { text: "AWS", pronunciation: "Amazon Web Services", partial_match: false },
498
+ { text: "GCP", pronunciation: "Google Cloud Platform", partial_match: false },
499
+ // partial_match=true: substring matches
500
+ { text: "GPT", pronunciation: "Generative Pre-trained Transformer", partial_match: true },
501
+ { text: "Dr.", pronunciation: "Doctor", partial_match: true },
502
+ { text: "tech", pronunciation: "technology", partial_match: true },
503
+ ],
504
+ },
505
+ {
506
+ // Korean pronunciation dictionary test
507
+ text: "SK와 LG의 CEO가 AI와 ML 기술에 대해 발표했습니다. Dr. 김 박사가 MIT에서 NLP 연구 성과를 공개했습니다.",
508
+ label: "Korean pronunciation dictionary test",
509
+ category: "Pronunciation Dictionary Test",
510
+ language: models.APIConvertTextToSpeechUsingCharacterRequestLanguage.Ko,
511
+ pronunciationDictionary: [
512
+ { text: "SK", pronunciation: "에스케이", partial_match: false },
513
+ { text: "LG", pronunciation: "엘지", partial_match: false },
514
+ { text: "CEO", pronunciation: "최고경영자", partial_match: false },
515
+ { text: "AI", pronunciation: "인공지능", partial_match: false },
516
+ { text: "ML", pronunciation: "머신러닝", partial_match: false },
517
+ { text: "MIT", pronunciation: "매사추세츠 공과대학교", partial_match: false },
518
+ { text: "NLP", pronunciation: "자연어처리", partial_match: false },
519
+ { text: "Dr.", pronunciation: "닥터", partial_match: true },
520
+ ],
521
+ },
522
+ ];
523
+
404
524
  for (let i = 0; i < scenarios.length; i++) {
405
525
  console.log(`\n🔥 Scenario ${i + 1}/${scenarios.length}`);
406
526
 
@@ -429,6 +549,37 @@ async function simpleDemo(): Promise<void> {
429
549
  }
430
550
  }
431
551
 
552
+ // Run additional scenarios for chunking tests
553
+ console.log("\n" + "=".repeat(60));
554
+ console.log("🔧 Additional Chunking Test Scenarios");
555
+ console.log("=".repeat(60));
556
+
557
+ for (let i = 0; i < additionalScenarios.length; i++) {
558
+ const scenario = additionalScenarios[i];
559
+ console.log(
560
+ `\n🔬 Additional Scenario ${i + 1}/${additionalScenarios.length}`
561
+ );
562
+ console.log(`📂 Category: ${scenario.category}`);
563
+ console.log(`📝 ${scenario.label}`);
564
+ console.log("─".repeat(50));
565
+
566
+ const success = await simpleStreamingTts(
567
+ voiceId,
568
+ scenario.text,
569
+ scenario.language,
570
+ scenario.pronunciationDictionary
571
+ );
572
+
573
+ if (!success) {
574
+ console.log(`❌ Additional scenario ${i + 1} failed`);
575
+ }
576
+
577
+ if (i < additionalScenarios.length - 1) {
578
+ console.log("\n⏳ Waiting...");
579
+ await new Promise((resolve) => setTimeout(resolve, 2000));
580
+ }
581
+ }
582
+
432
583
  console.log("\n🎉 Demo completed!");
433
584
  console.log("\n📊 Tested text length ranges:");
434
585
  console.log(" • Short text: ~100 chars");
@@ -436,6 +587,20 @@ async function simpleDemo(): Promise<void> {
436
587
  console.log(" • Long text: 300~500 chars");
437
588
  console.log(" • Very long text: 500~800 chars");
438
589
  console.log(" • Extra long text: 800+ chars");
590
+ console.log("\n🔧 Chunking strategy tests:");
591
+ console.log(" • Word-based chunking: Long sentences without punctuation");
592
+ console.log(
593
+ " • Character-based chunking: Japanese/Chinese text without spaces"
594
+ );
595
+ console.log("\n🌍 Multilingual punctuation tests:");
596
+ console.log(" • Ellipsis: English (… ‥)");
597
+ console.log(" • Korean ellipsis: Korean (…)");
598
+ console.log(" • CJK punctuation: Japanese (。)");
599
+ console.log("\n📖 Pronunciation dictionary tests:");
600
+ console.log(" • partial_match=false: Word boundary matching (CEO, MIT, AI)");
601
+ console.log(" • partial_match=true: Substring matching (GPT, Dr., tech)");
602
+ console.log(" • Long text chunking: Text expansion exceeding 300 chars");
603
+ console.log(" • Korean pronunciation: SK, LG, CEO, AI, ML, MIT, NLP");
439
604
  }
440
605
 
441
606
  /**