@cartesia/cartesia-js 2.1.5 → 2.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. package/api/resources/apiStatus/client/Client.js +1 -1
  2. package/api/resources/datasets/client/Client.d.ts +2 -5
  3. package/api/resources/datasets/client/Client.js +6 -9
  4. package/api/resources/datasets/client/requests/UploadDatasetFileRequest.d.ts +0 -4
  5. package/api/resources/infill/client/Client.d.ts +9 -1
  6. package/api/resources/infill/client/Client.js +10 -2
  7. package/api/resources/voices/client/Client.d.ts +51 -41
  8. package/api/resources/voices/client/Client.js +83 -73
  9. package/api/resources/voices/types/BaseVoiceId.d.ts +1 -1
  10. package/api/resources/voices/types/CreateVoiceRequest.d.ts +1 -0
  11. package/api/resources/voices/types/LocalizeVoiceRequest.d.ts +6 -1
  12. package/dist/api/resources/apiStatus/client/Client.js +1 -1
  13. package/dist/api/resources/datasets/client/Client.d.ts +2 -5
  14. package/dist/api/resources/datasets/client/Client.js +6 -9
  15. package/dist/api/resources/datasets/client/requests/UploadDatasetFileRequest.d.ts +0 -4
  16. package/dist/api/resources/infill/client/Client.d.ts +9 -1
  17. package/dist/api/resources/infill/client/Client.js +10 -2
  18. package/dist/api/resources/voices/client/Client.d.ts +51 -41
  19. package/dist/api/resources/voices/client/Client.js +83 -73
  20. package/dist/api/resources/voices/types/BaseVoiceId.d.ts +1 -1
  21. package/dist/api/resources/voices/types/CreateVoiceRequest.d.ts +1 -0
  22. package/dist/api/resources/voices/types/LocalizeVoiceRequest.d.ts +6 -1
  23. package/dist/serialization/resources/voices/types/CreateVoiceRequest.d.ts +2 -0
  24. package/dist/serialization/resources/voices/types/CreateVoiceRequest.js +2 -0
  25. package/dist/serialization/resources/voices/types/LocalizeVoiceRequest.d.ts +3 -2
  26. package/dist/serialization/resources/voices/types/LocalizeVoiceRequest.js +3 -2
  27. package/dist/version.d.ts +1 -1
  28. package/dist/version.js +1 -1
  29. package/package.json +1 -1
  30. package/reference.md +91 -124
  31. package/serialization/resources/voices/types/CreateVoiceRequest.d.ts +2 -0
  32. package/serialization/resources/voices/types/CreateVoiceRequest.js +2 -0
  33. package/serialization/resources/voices/types/LocalizeVoiceRequest.d.ts +3 -2
  34. package/serialization/resources/voices/types/LocalizeVoiceRequest.js +3 -2
  35. package/version.d.ts +1 -1
  36. package/version.js +1 -1
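The bulk of this release is a rewrite of the generated `reference.md` plus the voices `CreateVoiceRequest`/`LocalizeVoiceRequest` serializers. The `reference.md` snippets in the diff below all assume an already-constructed `client`; for orientation, here is a minimal setup sketch (not part of the diff; the environment variable name is illustrative):

```typescript
import { CartesiaClient } from "@cartesia/cartesia-js";

// Minimal client setup assumed by the reference.md snippets below.
// The CARTESIA_API_KEY variable name is illustrative, not prescribed by the SDK.
const client = new CartesiaClient({ apiKey: process.env.CARTESIA_API_KEY ?? "" });
```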
package/reference.md CHANGED
@@ -98,7 +98,7 @@ await client.datasets.list();
 
 ```typescript
 await client.datasets.create({
-    name: "string",
+    name: "name",
 });
 ```
 
@@ -147,7 +147,7 @@ await client.datasets.create({
 <dd>
 
 ```typescript
-await client.datasets.listFiles("string");
+await client.datasets.listFiles("id");
 ```
 
 </dd>
@@ -182,70 +182,6 @@ await client.datasets.listFiles("string");
 </dl>
 </details>
 
-<details><summary><code>client.datasets.<a href="/src/api/resources/datasets/client/Client.ts">uploadFile</a>(file, id, { ...params }) -> void</code></summary>
-<dl>
-<dd>
-
-#### 🔌 Usage
-
-<dl>
-<dd>
-
-<dl>
-<dd>
-
-```typescript
-await client.datasets.uploadFile(fs.createReadStream("/path/to/your/file"), "string", {});
-```
-
-</dd>
-</dl>
-</dd>
-</dl>
-
-#### ⚙️ Parameters
-
-<dl>
-<dd>
-
-<dl>
-<dd>
-
-**file:** `File | fs.ReadStream | Blob`
-
-</dd>
-</dl>
-
-<dl>
-<dd>
-
-**id:** `string`
-
-</dd>
-</dl>
-
-<dl>
-<dd>
-
-**request:** `Cartesia.UploadDatasetFileRequest`
-
-</dd>
-</dl>
-
-<dl>
-<dd>
-
-**requestOptions:** `Datasets.RequestOptions`
-
-</dd>
-</dl>
-</dd>
-</dl>
-
-</dd>
-</dl>
-</details>
-
 ## Infill
 
 <details><summary><code>client.infill.<a href="/src/api/resources/infill/client/Client.ts">bytes</a>(leftAudio, rightAudio, { ...params }) -> stream.Readable</code></summary>
@@ -262,16 +198,24 @@ await client.datasets.uploadFile(fs.createReadStream("/path/to/your/file"), "str
 
 Generate audio that smoothly connects two existing audio segments. This is useful for inserting new speech between existing speech segments while maintaining natural transitions.
 
-The cost is 1 credit per character of the infill text plus a fixed cost of 300 credits.
+**The cost is 1 credit per character of the infill text plus a fixed cost of 300 credits.**
 
 Only the `sonic-preview` model is supported for infill at this time.
 
 At least one of `left_audio` or `right_audio` must be provided.
 
-</dd>
-</dl>
-</dd>
-</dl>
+As with all generative models, there's some inherent variability, but here's some tips we recommend to get the best results from infill:
+
+- Use longer infill transcripts
+  - This gives the model more flexibility to adapt to the rest of the audio
+- Target natural pauses in the audio when deciding where to clip
+  - This means you don't need word-level timestamps to be as precise
+- Clip right up to the start and end of the audio segment you want infilled, keeping as much silence in the left/right audio segments as possible
+  - This helps the model generate more natural transitions
+</dd>
+</dl>
+</dd>
+</dl>
 
 #### 🔌 Usage
 
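To make the new infill tips concrete, here is a sketch of a call that follows them: both clips end at natural pauses and the transcript is kept reasonably long. Only the positional `leftAudio`/`rightAudio` arguments and the `sonic-preview` model are confirmed by this diff; the request field names below are assumptions and should be checked against `Cartesia.InfillBytesRequest` in the SDK.

```typescript
import * as fs from "fs";

// Sketch only: the request field names are assumed, not taken from this diff.
// Verify them against Cartesia.InfillBytesRequest before relying on this.
const infillAudio = await client.infill.bytes(
    fs.createReadStream("/path/to/left-segment.wav"), // audio before the gap, clipped at a natural pause
    fs.createReadStream("/path/to/right-segment.wav"), // audio after the gap, clipped at a natural pause
    {
        modelId: "sonic-preview", // the only model supported for infill at this time
        language: "en",
        transcript: "a longer infill transcript gives the model more room to adapt",
        voiceId: "694f9389-aac1-45b6-b726-9d9369183238", // illustrative voice id
        outputFormatContainer: "wav",
        outputFormatSampleRate: 44100,
        outputFormatEncoding: "pcm_f32le",
    },
);
```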
@@ -359,7 +303,7 @@ await client.infill.bytes(fs.createReadStream("/path/to/your/file"), fs.createRe
 
 ```typescript
 await client.tts.bytes({
-    modelId: "sonic-english",
+    modelId: "sonic",
     transcript: "Hello, world!",
     voice: {
         mode: "id",
@@ -420,7 +364,7 @@ await client.tts.bytes({
 
 ```typescript
 const response = await client.tts.sse({
-    modelId: "sonic-english",
+    modelId: "sonic",
     transcript: "Hello, world!",
     voice: {
         mode: "id",
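Both TTS hunks above only rename the model id. For context, a complete `tts.bytes` call with the new id looks roughly like the following; the voice id and output format values are illustrative rather than taken from this diff.

```typescript
// Illustrative call with the renamed model id ("sonic-english" -> "sonic").
// Voice id and output format values are placeholders.
const audio = await client.tts.bytes({
    modelId: "sonic",
    transcript: "Hello, world!",
    voice: {
        mode: "id",
        id: "694f9389-aac1-45b6-b726-9d9369183238",
    },
    language: "en",
    outputFormat: {
        container: "wav",
        sampleRate: 44100,
        encoding: "pcm_f32le",
    },
});
```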
@@ -656,10 +600,29 @@ await client.voices.list();
 </dl>
 </details>
 
-<details><summary><code>client.voices.<a href="/src/api/resources/voices/client/Client.ts">create</a>({ ...params }) -> Cartesia.Voice</code></summary>
+<details><summary><code>client.voices.<a href="/src/api/resources/voices/client/Client.ts">clone</a>(clip, { ...params }) -> Cartesia.VoiceMetadata</code></summary>
+<dl>
+<dd>
+
+#### 📝 Description
+
+<dl>
+<dd>
+
 <dl>
 <dd>
 
+Clone a voice from an audio clip. This endpoint has two modes, stability and similarity.
+
+Similarity mode clones are more similar to the source clip, but may reproduce background noise. For these, use an audio clip about 5 seconds long.
+
+Stability mode clones are more stable, but may not sound as similar to the source clip. For these, use an audio clip 10-20 seconds long.
+
+</dd>
+</dl>
+</dd>
+</dl>
+
 #### 🔌 Usage
 
 <dl>
@@ -669,18 +632,12 @@ await client.voices.list();
 <dd>
 
 ```typescript
-await client.voices.create({
-    name: "string",
-    description: "string",
-    embedding: [
-        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-        1, 1, 1, 1, 1, 1, 1,
-    ],
+await client.voices.clone(fs.createReadStream("/path/to/your/file"), {
+    name: "A high-stability cloned voice",
+    description: "Copied from Cartesia docs",
+    mode: "stability",
     language: "en",
+    enhance: true,
 });
 ```
 
@@ -697,7 +654,15 @@ await client.voices.create({
 <dl>
 <dd>
 
-**request:** `Cartesia.CreateVoiceRequest`
+**clip:** `File | fs.ReadStream | Blob`
+
+</dd>
+</dl>
+
+<dl>
+<dd>
+
+**request:** `Cartesia.CloneVoiceRequest`
 
 </dd>
 </dl>
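The updated example above shows a stability-mode clone. A similarity-mode clone, per the description added in this diff, would use a clip of roughly 5 seconds; the file path and metadata below are illustrative.

```typescript
import * as fs from "fs";

// Similarity-mode counterpart to the stability-mode example in reference.md.
// Clip path and metadata are illustrative; use a clip of about 5 seconds.
await client.voices.clone(fs.createReadStream("/path/to/5-second-clip.wav"), {
    name: "A high-similarity cloned voice",
    description: "Cloned in similarity mode",
    mode: "similarity",
    language: "en",
    enhance: true,
});
```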
@@ -729,7 +694,7 @@ await client.voices.create({
 <dd>
 
 ```typescript
-await client.voices.delete("string");
+await client.voices.delete("id");
 ```
 
 </dd>
@@ -777,9 +742,9 @@ await client.voices.delete("string");
 <dd>
 
 ```typescript
-await client.voices.update("string", {
-    name: "string",
-    description: "string",
+await client.voices.update("id", {
+    name: "name",
+    description: "description",
 });
 ```
 
@@ -836,7 +801,7 @@ await client.voices.update("string", {
 <dd>
 
 ```typescript
-await client.voices.get("string");
+await client.voices.get("id");
 ```
 
 </dd>
@@ -871,10 +836,25 @@ await client.voices.get("string");
 </dl>
 </details>
 
-<details><summary><code>client.voices.<a href="/src/api/resources/voices/client/Client.ts">localize</a>({ ...params }) -> Cartesia.EmbeddingResponse</code></summary>
+<details><summary><code>client.voices.<a href="/src/api/resources/voices/client/Client.ts">localize</a>({ ...params }) -> Cartesia.VoiceMetadata</code></summary>
+<dl>
+<dd>
+
+#### 📝 Description
+
+<dl>
+<dd>
+
 <dl>
 <dd>
 
+Create a new voice from an existing voice localized to a new language and dialect.
+
+</dd>
+</dl>
+</dd>
+</dl>
+
 #### 🔌 Usage
 
 <dl>
@@ -885,17 +865,12 @@ await client.voices.get("string");
 
 ```typescript
 await client.voices.localize({
-    embedding: [
-        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-        1, 1, 1, 1, 1, 1, 1,
-    ],
-    language: "en",
-    originalSpeakerGender: "male",
-    dialect: "au",
+    voiceId: "694f9389-aac1-45b6-b726-9d9369183238",
+    name: "Sarah Peninsular Spanish",
+    description: "Sarah Voice in Peninsular Spanish",
+    language: "es",
+    originalSpeakerGender: "female",
+    dialect: "pe",
 });
 ```
 
@@ -947,7 +922,11 @@ await client.voices.localize({
 await client.voices.mix({
     voices: [
         {
-            id: "string",
+            id: "id",
+            weight: 1.1,
+        },
+        {
+            id: "id",
             weight: 1.1,
         },
     ],
@@ -986,7 +965,7 @@ await client.voices.mix({
 </dl>
 </details>
 
-<details><summary><code>client.voices.<a href="/src/api/resources/voices/client/Client.ts">clone</a>(clip, { ...params }) -> Cartesia.VoiceMetadata</code></summary>
+<details><summary><code>client.voices.<a href="/src/api/resources/voices/client/Client.ts">create</a>({ ...params }) -> Cartesia.VoiceMetadata</code></summary>
 <dl>
 <dd>
 
@@ -998,11 +977,7 @@ await client.voices.mix({
 <dl>
 <dd>
 
-Clone a voice from an audio clip. This endpoint has two modes, stability and similarity.
-
-Similarity mode clones are more similar to the source clip, but may reproduce background noise. For these, use an audio clip about 5 seconds long.
-
-Stability mode clones are more stable, but may not sound as similar to the source clip. For these, use an audio clip 10-20 seconds long.
+Create voice from raw features. If you'd like to clone a voice from an audio file, please use Clone Voice instead.
 
 </dd>
 </dl>
@@ -1018,12 +993,12 @@ Stability mode clones are more stable, but may not sound as similar to the sourc
 <dd>
 
 ```typescript
-await client.voices.clone(fs.createReadStream("/path/to/your/file"), {
-    name: "A high-stability cloned voice",
-    description: "Copied from Cartesia docs",
-    mode: "stability",
+await client.voices.create({
+    name: "My Custom Voice",
+    description: "A custom voice created through the API",
+    embedding: [],
     language: "en",
-    enhance: true,
+    baseVoiceId: "123e4567-e89b-12d3-a456-426614174000",
 });
 ```
 
@@ -1040,15 +1015,7 @@ await client.voices.clone(fs.createReadStream("/path/to/your/file"), {
 <dl>
 <dd>
 
-**clip:** `File | fs.ReadStream | Blob`
-
-</dd>
-</dl>
-
-<dl>
-<dd>
-
-**request:** `Cartesia.CloneVoiceRequest`
+**request:** `Cartesia.CreateVoiceRequest`
 
 </dd>
 </dl>
package/serialization/resources/voices/types/CreateVoiceRequest.d.ts CHANGED
@@ -6,6 +6,7 @@ import * as Cartesia from "../../../../api/index";
 import * as core from "../../../../core";
 import { Embedding } from "../../embedding/types/Embedding";
 import { SupportedLanguage } from "../../tts/types/SupportedLanguage";
+import { BaseVoiceId } from "./BaseVoiceId";
 export declare const CreateVoiceRequest: core.serialization.ObjectSchema<serializers.CreateVoiceRequest.Raw, Cartesia.CreateVoiceRequest>;
 export declare namespace CreateVoiceRequest {
     interface Raw {
@@ -13,5 +14,6 @@ export declare namespace CreateVoiceRequest {
         description: string;
         embedding: Embedding.Raw;
         language?: SupportedLanguage.Raw | null;
+        base_voice_id?: BaseVoiceId.Raw | null;
     }
 }
package/serialization/resources/voices/types/CreateVoiceRequest.js CHANGED
@@ -40,9 +40,11 @@ exports.CreateVoiceRequest = void 0;
 const core = __importStar(require("../../../../core"));
 const Embedding_1 = require("../../embedding/types/Embedding");
 const SupportedLanguage_1 = require("../../tts/types/SupportedLanguage");
+const BaseVoiceId_1 = require("./BaseVoiceId");
 exports.CreateVoiceRequest = core.serialization.object({
     name: core.serialization.string(),
     description: core.serialization.string(),
     embedding: Embedding_1.Embedding,
     language: SupportedLanguage_1.SupportedLanguage.optional(),
+    baseVoiceId: core.serialization.property("base_voice_id", BaseVoiceId_1.BaseVoiceId.optional()),
 });
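The two additions above mean `voices.create` now accepts an optional `baseVoiceId`, which the serializer writes to the wire as `base_voice_id`. A caller-side sketch mirroring the updated `reference.md` example (values are illustrative):

```typescript
// `baseVoiceId` is the new optional field; it serializes to `base_voice_id`.
// Values are illustrative; `embedding` is shortened to [] as in reference.md.
await client.voices.create({
    name: "My Custom Voice",
    description: "A custom voice created through the API",
    embedding: [],
    language: "en",
    baseVoiceId: "123e4567-e89b-12d3-a456-426614174000",
});
```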
package/serialization/resources/voices/types/LocalizeVoiceRequest.d.ts CHANGED
@@ -4,14 +4,15 @@
 import * as serializers from "../../../index";
 import * as Cartesia from "../../../../api/index";
 import * as core from "../../../../core";
-import { Embedding } from "../../embedding/types/Embedding";
 import { LocalizeTargetLanguage } from "./LocalizeTargetLanguage";
 import { Gender } from "./Gender";
 import { LocalizeDialect } from "./LocalizeDialect";
 export declare const LocalizeVoiceRequest: core.serialization.ObjectSchema<serializers.LocalizeVoiceRequest.Raw, Cartesia.LocalizeVoiceRequest>;
 export declare namespace LocalizeVoiceRequest {
     interface Raw {
-        embedding: Embedding.Raw;
+        voice_id: string;
+        name: string;
+        description: string;
         language: LocalizeTargetLanguage.Raw;
         original_speaker_gender: Gender.Raw;
         dialect?: LocalizeDialect.Raw | null;
package/serialization/resources/voices/types/LocalizeVoiceRequest.js CHANGED
@@ -38,12 +38,13 @@ var __importStar = (this && this.__importStar) || (function () {
 Object.defineProperty(exports, "__esModule", { value: true });
 exports.LocalizeVoiceRequest = void 0;
 const core = __importStar(require("../../../../core"));
-const Embedding_1 = require("../../embedding/types/Embedding");
 const LocalizeTargetLanguage_1 = require("./LocalizeTargetLanguage");
 const Gender_1 = require("./Gender");
 const LocalizeDialect_1 = require("./LocalizeDialect");
 exports.LocalizeVoiceRequest = core.serialization.object({
-    embedding: Embedding_1.Embedding,
+    voiceId: core.serialization.property("voice_id", core.serialization.string()),
+    name: core.serialization.string(),
+    description: core.serialization.string(),
     language: LocalizeTargetLanguage_1.LocalizeTargetLanguage,
     originalSpeakerGender: core.serialization.property("original_speaker_gender", Gender_1.Gender),
     dialect: LocalizeDialect_1.LocalizeDialect.optional(),
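Together with the `.d.ts` change above, this replaces the raw `embedding` field on `LocalizeVoiceRequest` with `voiceId`, `name`, and `description`. A sketch of the call and the approximate wire shape the new serializer produces (values mirror the updated `reference.md` example):

```typescript
// The request now names the source voice by id instead of passing an embedding.
await client.voices.localize({
    voiceId: "694f9389-aac1-45b6-b726-9d9369183238", // source voice to localize
    name: "Sarah Peninsular Spanish",
    description: "Sarah Voice in Peninsular Spanish",
    language: "es",
    originalSpeakerGender: "female",
    dialect: "pe",
});

// Approximate serialized body (snake_case keys, per the property mappings above):
// {
//   "voice_id": "694f9389-aac1-45b6-b726-9d9369183238",
//   "name": "Sarah Peninsular Spanish",
//   "description": "Sarah Voice in Peninsular Spanish",
//   "language": "es",
//   "original_speaker_gender": "female",
//   "dialect": "pe"
// }
```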
package/version.d.ts CHANGED
@@ -1 +1 @@
-export declare const SDK_VERSION = "2.1.5";
+export declare const SDK_VERSION = "2.1.7";
package/version.js CHANGED
@@ -1,4 +1,4 @@
 "use strict";
 Object.defineProperty(exports, "__esModule", { value: true });
 exports.SDK_VERSION = void 0;
-exports.SDK_VERSION = "2.1.5";
+exports.SDK_VERSION = "2.1.7";