@cartesia/cartesia-js 2.1.5 → 2.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. package/api/resources/apiStatus/client/Client.js +1 -1
  2. package/api/resources/datasets/client/Client.d.ts +2 -5
  3. package/api/resources/datasets/client/Client.js +6 -9
  4. package/api/resources/datasets/client/requests/UploadDatasetFileRequest.d.ts +0 -4
  5. package/api/resources/infill/client/Client.d.ts +9 -1
  6. package/api/resources/infill/client/Client.js +10 -2
  7. package/api/resources/voices/client/Client.d.ts +51 -41
  8. package/api/resources/voices/client/Client.js +83 -73
  9. package/api/resources/voices/types/BaseVoiceId.d.ts +1 -1
  10. package/api/resources/voices/types/CreateVoiceRequest.d.ts +1 -0
  11. package/api/resources/voices/types/LocalizeVoiceRequest.d.ts +6 -1
  12. package/dist/api/resources/apiStatus/client/Client.js +1 -1
  13. package/dist/api/resources/datasets/client/Client.d.ts +2 -5
  14. package/dist/api/resources/datasets/client/Client.js +6 -9
  15. package/dist/api/resources/datasets/client/requests/UploadDatasetFileRequest.d.ts +0 -4
  16. package/dist/api/resources/infill/client/Client.d.ts +9 -1
  17. package/dist/api/resources/infill/client/Client.js +10 -2
  18. package/dist/api/resources/voices/client/Client.d.ts +51 -41
  19. package/dist/api/resources/voices/client/Client.js +83 -73
  20. package/dist/api/resources/voices/types/BaseVoiceId.d.ts +1 -1
  21. package/dist/api/resources/voices/types/CreateVoiceRequest.d.ts +1 -0
  22. package/dist/api/resources/voices/types/LocalizeVoiceRequest.d.ts +6 -1
  23. package/dist/serialization/resources/voices/types/CreateVoiceRequest.d.ts +2 -0
  24. package/dist/serialization/resources/voices/types/CreateVoiceRequest.js +2 -0
  25. package/dist/serialization/resources/voices/types/LocalizeVoiceRequest.d.ts +3 -2
  26. package/dist/serialization/resources/voices/types/LocalizeVoiceRequest.js +3 -2
  27. package/dist/version.d.ts +1 -1
  28. package/dist/version.js +1 -1
  29. package/package.json +1 -1
  30. package/reference.md +91 -124
  31. package/serialization/resources/voices/types/CreateVoiceRequest.d.ts +2 -0
  32. package/serialization/resources/voices/types/CreateVoiceRequest.js +2 -0
  33. package/serialization/resources/voices/types/LocalizeVoiceRequest.d.ts +3 -2
  34. package/serialization/resources/voices/types/LocalizeVoiceRequest.js +3 -2
  35. package/version.d.ts +1 -1
  36. package/version.js +1 -1
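The bulk of this release is a rewrite of the generated `reference.md` plus the voices `CreateVoiceRequest`/`LocalizeVoiceRequest` serializers. The `reference.md` snippets in the diff below all assume an already-constructed `client`; for orientation, here is a minimal setup sketch (not part of the diff; the environment variable name is illustrative):

```typescript
import { CartesiaClient } from "@cartesia/cartesia-js";

// Minimal client setup assumed by the reference.md snippets below.
// The CARTESIA_API_KEY variable name is illustrative, not prescribed by the SDK.
const client = new CartesiaClient({ apiKey: process.env.CARTESIA_API_KEY ?? "" });
```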
package/reference.md CHANGED
@@ -98,7 +98,7 @@ await client.datasets.list();
 
 ```typescript
 await client.datasets.create({
-    name: "string",
+    name: "name",
 });
 ```
 
@@ -147,7 +147,7 @@ await client.datasets.create({
 <dd>
 
 ```typescript
-await client.datasets.listFiles("string");
+await client.datasets.listFiles("id");
 ```
 
 </dd>
@@ -182,70 +182,6 @@ await client.datasets.listFiles("string");
 </dl>
 </details>
 
-<details><summary><code>client.datasets.<a href="/src/api/resources/datasets/client/Client.ts">uploadFile</a>(file, id, { ...params }) -> void</code></summary>
-<dl>
-<dd>
-
-#### 🔌 Usage
-
-<dl>
-<dd>
-
-<dl>
-<dd>
-
-```typescript
-await client.datasets.uploadFile(fs.createReadStream("/path/to/your/file"), "string", {});
-```
-
-</dd>
-</dl>
-</dd>
-</dl>
-
-#### ⚙️ Parameters
-
-<dl>
-<dd>
-
-<dl>
-<dd>
-
-**file:** `File | fs.ReadStream | Blob`
-
-</dd>
-</dl>
-
-<dl>
-<dd>
-
-**id:** `string`
-
-</dd>
-</dl>
-
-<dl>
-<dd>
-
-**request:** `Cartesia.UploadDatasetFileRequest`
-
-</dd>
-</dl>
-
-<dl>
-<dd>
-
-**requestOptions:** `Datasets.RequestOptions`
-
-</dd>
-</dl>
-</dd>
-</dl>
-
-</dd>
-</dl>
-</details>
-
 ## Infill
 
 <details><summary><code>client.infill.<a href="/src/api/resources/infill/client/Client.ts">bytes</a>(leftAudio, rightAudio, { ...params }) -> stream.Readable</code></summary>
@@ -262,16 +198,24 @@ await client.datasets.uploadFile(fs.createReadStream("/path/to/your/file"), "str
 
 Generate audio that smoothly connects two existing audio segments. This is useful for inserting new speech between existing speech segments while maintaining natural transitions.
 
-The cost is 1 credit per character of the infill text plus a fixed cost of 300 credits.
+**The cost is 1 credit per character of the infill text plus a fixed cost of 300 credits.**
 
 Only the `sonic-preview` model is supported for infill at this time.
 
 At least one of `left_audio` or `right_audio` must be provided.
 
-</dd>
-</dl>
-</dd>
-</dl>
+As with all generative models, there's some inherent variability, but here's some tips we recommend to get the best results from infill:
+
+- Use longer infill transcripts
+  - This gives the model more flexibility to adapt to the rest of the audio
+- Target natural pauses in the audio when deciding where to clip
+  - This means you don't need word-level timestamps to be as precise
+- Clip right up to the start and end of the audio segment you want infilled, keeping as much silence in the left/right audio segments as possible
+  - This helps the model generate more natural transitions
+</dd>
+</dl>
+</dd>
+</dl>
 
 #### 🔌 Usage
 
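To make the new infill tips concrete, here is a sketch of a call that follows them: both clips end at natural pauses and the transcript is kept reasonably long. Only the positional `leftAudio`/`rightAudio` arguments and the `sonic-preview` model are confirmed by this diff; the request field names below are assumptions and should be checked against `Cartesia.InfillBytesRequest` in the SDK.

```typescript
import * as fs from "fs";

// Sketch only: the request field names are assumed, not taken from this diff.
// Verify them against Cartesia.InfillBytesRequest before relying on this.
const infillAudio = await client.infill.bytes(
    fs.createReadStream("/path/to/left-segment.wav"), // audio before the gap, clipped at a natural pause
    fs.createReadStream("/path/to/right-segment.wav"), // audio after the gap, clipped at a natural pause
    {
        modelId: "sonic-preview", // the only model supported for infill at this time
        language: "en",
        transcript: "a longer infill transcript gives the model more room to adapt",
        voiceId: "694f9389-aac1-45b6-b726-9d9369183238", // illustrative voice id
        outputFormatContainer: "wav",
        outputFormatSampleRate: 44100,
        outputFormatEncoding: "pcm_f32le",
    },
);
```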
@@ -359,7 +303,7 @@ await client.infill.bytes(fs.createReadStream("/path/to/your/file"), fs.createRe
 
 ```typescript
 await client.tts.bytes({
-    modelId: "sonic-english",
+    modelId: "sonic",
     transcript: "Hello, world!",
     voice: {
         mode: "id",
@@ -420,7 +364,7 @@ await client.tts.bytes({
 
 ```typescript
 const response = await client.tts.sse({
-    modelId: "sonic-english",
+    modelId: "sonic",
     transcript: "Hello, world!",
     voice: {
         mode: "id",
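Both TTS hunks above only rename the model id. For context, a complete `tts.bytes` call with the new id looks roughly like the following; the voice id and output format values are illustrative rather than taken from this diff.

```typescript
// Illustrative call with the renamed model id ("sonic-english" -> "sonic").
// Voice id and output format values are placeholders.
const audio = await client.tts.bytes({
    modelId: "sonic",
    transcript: "Hello, world!",
    voice: {
        mode: "id",
        id: "694f9389-aac1-45b6-b726-9d9369183238",
    },
    language: "en",
    outputFormat: {
        container: "wav",
        sampleRate: 44100,
        encoding: "pcm_f32le",
    },
});
```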
@@ -656,10 +600,29 @@ await client.voices.list();
 </dl>
 </details>
 
-<details><summary><code>client.voices.<a href="/src/api/resources/voices/client/Client.ts">create</a>({ ...params }) -> Cartesia.Voice</code></summary>
+<details><summary><code>client.voices.<a href="/src/api/resources/voices/client/Client.ts">clone</a>(clip, { ...params }) -> Cartesia.VoiceMetadata</code></summary>
+<dl>
+<dd>
+
+#### 📝 Description
+
+<dl>
+<dd>
+
 <dl>
 <dd>
 
+Clone a voice from an audio clip. This endpoint has two modes, stability and similarity.
+
+Similarity mode clones are more similar to the source clip, but may reproduce background noise. For these, use an audio clip about 5 seconds long.
+
+Stability mode clones are more stable, but may not sound as similar to the source clip. For these, use an audio clip 10-20 seconds long.
+
+</dd>
+</dl>
+</dd>
+</dl>
+
 #### 🔌 Usage
 
 <dl>
@@ -669,18 +632,12 @@ await client.voices.list();
 <dd>
 
 ```typescript
-await client.voices.create({
-    name: "string",
-    description: "string",
-    embedding: [
-        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-        1, 1, 1, 1, 1, 1, 1,
-    ],
+await client.voices.clone(fs.createReadStream("/path/to/your/file"), {
+    name: "A high-stability cloned voice",
+    description: "Copied from Cartesia docs",
+    mode: "stability",
     language: "en",
+    enhance: true,
 });
 ```
 
@@ -697,7 +654,15 @@ await client.voices.create({
 <dl>
 <dd>
 
-**request:** `Cartesia.CreateVoiceRequest`
+**clip:** `File | fs.ReadStream | Blob`
+
+</dd>
+</dl>
+
+<dl>
+<dd>
+
+**request:** `Cartesia.CloneVoiceRequest`
 
 </dd>
 </dl>
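The updated example above shows a stability-mode clone. A similarity-mode clone, per the description added in this diff, would use a clip of roughly 5 seconds; the file path and metadata below are illustrative.

```typescript
import * as fs from "fs";

// Similarity-mode counterpart to the stability-mode example in reference.md.
// Clip path and metadata are illustrative; use a clip of about 5 seconds.
await client.voices.clone(fs.createReadStream("/path/to/5-second-clip.wav"), {
    name: "A high-similarity cloned voice",
    description: "Cloned in similarity mode",
    mode: "similarity",
    language: "en",
    enhance: true,
});
```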
@@ -729,7 +694,7 @@ await client.voices.create({
 <dd>
 
 ```typescript
-await client.voices.delete("string");
+await client.voices.delete("id");
 ```
 
 </dd>
@@ -777,9 +742,9 @@ await client.voices.delete("string");
 <dd>
 
 ```typescript
-await client.voices.update("string", {
-    name: "string",
-    description: "string",
+await client.voices.update("id", {
+    name: "name",
+    description: "description",
 });
 ```
 
@@ -836,7 +801,7 @@ await client.voices.update("string", {
 <dd>
 
 ```typescript
-await client.voices.get("string");
+await client.voices.get("id");
 ```
 
 </dd>
@@ -871,10 +836,25 @@ await client.voices.get("string");
 </dl>
 </details>
 
-<details><summary><code>client.voices.<a href="/src/api/resources/voices/client/Client.ts">localize</a>({ ...params }) -> Cartesia.EmbeddingResponse</code></summary>
+<details><summary><code>client.voices.<a href="/src/api/resources/voices/client/Client.ts">localize</a>({ ...params }) -> Cartesia.VoiceMetadata</code></summary>
+<dl>
+<dd>
+
+#### 📝 Description
+
+<dl>
+<dd>
+
 <dl>
 <dd>
 
+Create a new voice from an existing voice localized to a new language and dialect.
+
+</dd>
+</dl>
+</dd>
+</dl>
+
 #### 🔌 Usage
 
 <dl>
@@ -885,17 +865,12 @@ await client.voices.get("string");
 
 ```typescript
 await client.voices.localize({
-    embedding: [
-        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-        1, 1, 1, 1, 1, 1, 1,
-    ],
-    language: "en",
-    originalSpeakerGender: "male",
-    dialect: "au",
+    voiceId: "694f9389-aac1-45b6-b726-9d9369183238",
+    name: "Sarah Peninsular Spanish",
+    description: "Sarah Voice in Peninsular Spanish",
+    language: "es",
+    originalSpeakerGender: "female",
+    dialect: "pe",
 });
 ```
 
@@ -947,7 +922,11 @@ await client.voices.localize({
 await client.voices.mix({
     voices: [
         {
-            id: "string",
+            id: "id",
+            weight: 1.1,
+        },
+        {
+            id: "id",
             weight: 1.1,
         },
     ],
@@ -986,7 +965,7 @@ await client.voices.mix({
 </dl>
 </details>
 
-<details><summary><code>client.voices.<a href="/src/api/resources/voices/client/Client.ts">clone</a>(clip, { ...params }) -> Cartesia.VoiceMetadata</code></summary>
+<details><summary><code>client.voices.<a href="/src/api/resources/voices/client/Client.ts">create</a>({ ...params }) -> Cartesia.VoiceMetadata</code></summary>
 <dl>
 <dd>
 
@@ -998,11 +977,7 @@ await client.voices.mix({
 <dl>
 <dd>
 
-Clone a voice from an audio clip. This endpoint has two modes, stability and similarity.
-
-Similarity mode clones are more similar to the source clip, but may reproduce background noise. For these, use an audio clip about 5 seconds long.
-
-Stability mode clones are more stable, but may not sound as similar to the source clip. For these, use an audio clip 10-20 seconds long.
+Create voice from raw features. If you'd like to clone a voice from an audio file, please use Clone Voice instead.
 
 </dd>
 </dl>
@@ -1018,12 +993,12 @@ Stability mode clones are more stable, but may not sound as similar to the sourc
 <dd>
 
 ```typescript
-await client.voices.clone(fs.createReadStream("/path/to/your/file"), {
-    name: "A high-stability cloned voice",
-    description: "Copied from Cartesia docs",
-    mode: "stability",
+await client.voices.create({
+    name: "My Custom Voice",
+    description: "A custom voice created through the API",
+    embedding: [],
     language: "en",
-    enhance: true,
+    baseVoiceId: "123e4567-e89b-12d3-a456-426614174000",
 });
 ```
 
@@ -1040,15 +1015,7 @@ await client.voices.clone(fs.createReadStream("/path/to/your/file"), {
 <dl>
 <dd>
 
-**clip:** `File | fs.ReadStream | Blob`
-
-</dd>
-</dl>
-
-<dl>
-<dd>
-
-**request:** `Cartesia.CloneVoiceRequest`
+**request:** `Cartesia.CreateVoiceRequest`
 
 </dd>
 </dl>
package/serialization/resources/voices/types/CreateVoiceRequest.d.ts CHANGED
@@ -6,6 +6,7 @@ import * as Cartesia from "../../../../api/index";
 import * as core from "../../../../core";
 import { Embedding } from "../../embedding/types/Embedding";
 import { SupportedLanguage } from "../../tts/types/SupportedLanguage";
+import { BaseVoiceId } from "./BaseVoiceId";
 export declare const CreateVoiceRequest: core.serialization.ObjectSchema<serializers.CreateVoiceRequest.Raw, Cartesia.CreateVoiceRequest>;
 export declare namespace CreateVoiceRequest {
     interface Raw {
@@ -13,5 +14,6 @@ export declare namespace CreateVoiceRequest {
         description: string;
         embedding: Embedding.Raw;
         language?: SupportedLanguage.Raw | null;
+        base_voice_id?: BaseVoiceId.Raw | null;
     }
 }
package/serialization/resources/voices/types/CreateVoiceRequest.js CHANGED
@@ -40,9 +40,11 @@ exports.CreateVoiceRequest = void 0;
 const core = __importStar(require("../../../../core"));
 const Embedding_1 = require("../../embedding/types/Embedding");
 const SupportedLanguage_1 = require("../../tts/types/SupportedLanguage");
+const BaseVoiceId_1 = require("./BaseVoiceId");
 exports.CreateVoiceRequest = core.serialization.object({
     name: core.serialization.string(),
     description: core.serialization.string(),
     embedding: Embedding_1.Embedding,
     language: SupportedLanguage_1.SupportedLanguage.optional(),
+    baseVoiceId: core.serialization.property("base_voice_id", BaseVoiceId_1.BaseVoiceId.optional()),
 });
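The two additions above mean `voices.create` now accepts an optional `baseVoiceId`, which the serializer writes to the wire as `base_voice_id`. A caller-side sketch mirroring the updated `reference.md` example (values are illustrative):

```typescript
// `baseVoiceId` is the new optional field; it serializes to `base_voice_id`.
// Values are illustrative; `embedding` is shortened to [] as in reference.md.
await client.voices.create({
    name: "My Custom Voice",
    description: "A custom voice created through the API",
    embedding: [],
    language: "en",
    baseVoiceId: "123e4567-e89b-12d3-a456-426614174000",
});
```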
package/serialization/resources/voices/types/LocalizeVoiceRequest.d.ts CHANGED
@@ -4,14 +4,15 @@
 import * as serializers from "../../../index";
 import * as Cartesia from "../../../../api/index";
 import * as core from "../../../../core";
-import { Embedding } from "../../embedding/types/Embedding";
 import { LocalizeTargetLanguage } from "./LocalizeTargetLanguage";
 import { Gender } from "./Gender";
 import { LocalizeDialect } from "./LocalizeDialect";
 export declare const LocalizeVoiceRequest: core.serialization.ObjectSchema<serializers.LocalizeVoiceRequest.Raw, Cartesia.LocalizeVoiceRequest>;
 export declare namespace LocalizeVoiceRequest {
     interface Raw {
-        embedding: Embedding.Raw;
+        voice_id: string;
+        name: string;
+        description: string;
         language: LocalizeTargetLanguage.Raw;
         original_speaker_gender: Gender.Raw;
         dialect?: LocalizeDialect.Raw | null;
package/serialization/resources/voices/types/LocalizeVoiceRequest.js CHANGED
@@ -38,12 +38,13 @@ var __importStar = (this && this.__importStar) || (function () {
 Object.defineProperty(exports, "__esModule", { value: true });
 exports.LocalizeVoiceRequest = void 0;
 const core = __importStar(require("../../../../core"));
-const Embedding_1 = require("../../embedding/types/Embedding");
 const LocalizeTargetLanguage_1 = require("./LocalizeTargetLanguage");
 const Gender_1 = require("./Gender");
 const LocalizeDialect_1 = require("./LocalizeDialect");
 exports.LocalizeVoiceRequest = core.serialization.object({
-    embedding: Embedding_1.Embedding,
+    voiceId: core.serialization.property("voice_id", core.serialization.string()),
+    name: core.serialization.string(),
+    description: core.serialization.string(),
     language: LocalizeTargetLanguage_1.LocalizeTargetLanguage,
     originalSpeakerGender: core.serialization.property("original_speaker_gender", Gender_1.Gender),
     dialect: LocalizeDialect_1.LocalizeDialect.optional(),
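Together with the `.d.ts` change above, this replaces the raw `embedding` field on `LocalizeVoiceRequest` with `voiceId`, `name`, and `description`. A sketch of the call and the approximate wire shape the new serializer produces (values mirror the updated `reference.md` example):

```typescript
// The request now names the source voice by id instead of passing an embedding.
await client.voices.localize({
    voiceId: "694f9389-aac1-45b6-b726-9d9369183238", // source voice to localize
    name: "Sarah Peninsular Spanish",
    description: "Sarah Voice in Peninsular Spanish",
    language: "es",
    originalSpeakerGender: "female",
    dialect: "pe",
});

// Approximate serialized body (snake_case keys, per the property mappings above):
// {
//   "voice_id": "694f9389-aac1-45b6-b726-9d9369183238",
//   "name": "Sarah Peninsular Spanish",
//   "description": "Sarah Voice in Peninsular Spanish",
//   "language": "es",
//   "original_speaker_gender": "female",
//   "dialect": "pe"
// }
```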
package/version.d.ts CHANGED
@@ -1 +1 @@
-export declare const SDK_VERSION = "2.1.5";
+export declare const SDK_VERSION = "2.1.7";
package/version.js CHANGED
@@ -1,4 +1,4 @@
 "use strict";
 Object.defineProperty(exports, "__esModule", { value: true });
 exports.SDK_VERSION = void 0;
-exports.SDK_VERSION = "2.1.5";
+exports.SDK_VERSION = "2.1.7";