@cartesia/cartesia-js 1.0.0-alpha.4 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. package/.turbo/turbo-build.log +49 -49
  2. package/CHANGELOG.md +23 -0
  3. package/LICENSE.md +21 -0
  4. package/README.md +102 -21
  5. package/dist/{chunk-VK7LBMVI.js → chunk-2NA5SEML.js} +2 -2
  6. package/dist/{chunk-PQ5EVEEH.js → chunk-5M33ZF3Y.js} +1 -1
  7. package/dist/{chunk-PQ6CIPFW.js → chunk-6YQ6KDIQ.js} +44 -5
  8. package/dist/{chunk-IQAXBRHU.js → chunk-ASZKHN7Q.js} +53 -29
  9. package/dist/{chunk-RO7TY474.js → chunk-BHY7MNGT.js} +11 -6
  10. package/dist/{chunk-WIFMLPT5.js → chunk-GHY2WEOK.js} +13 -0
  11. package/dist/{chunk-SGXUEFII.js → chunk-KUSVZXDT.js} +2 -2
  12. package/dist/{chunk-36JBKJUN.js → chunk-LZO6K34D.js} +20 -7
  13. package/dist/{chunk-3FL2SNIR.js → chunk-NQVZNVOU.js} +1 -1
  14. package/dist/{chunk-ISRU7PLL.js → chunk-OFH3ML4L.js} +3 -3
  15. package/dist/index.cjs +129 -39
  16. package/dist/index.d.cts +4 -4
  17. package/dist/index.d.ts +4 -4
  18. package/dist/index.js +15 -9
  19. package/dist/lib/client.js +2 -2
  20. package/dist/lib/constants.js +1 -1
  21. package/dist/lib/index.cjs +106 -33
  22. package/dist/lib/index.js +8 -8
  23. package/dist/react/index.cjs +231 -92
  24. package/dist/react/index.d.cts +4 -3
  25. package/dist/react/index.d.ts +4 -3
  26. package/dist/react/index.js +117 -64
  27. package/dist/react/utils.js +2 -2
  28. package/dist/tts/index.cjs +106 -33
  29. package/dist/tts/index.js +6 -6
  30. package/dist/tts/player.cjs +23 -5
  31. package/dist/tts/player.d.cts +6 -0
  32. package/dist/tts/player.d.ts +6 -0
  33. package/dist/tts/player.js +4 -3
  34. package/dist/tts/source.cjs +50 -4
  35. package/dist/tts/source.d.cts +16 -6
  36. package/dist/tts/source.d.ts +16 -6
  37. package/dist/tts/source.js +4 -2
  38. package/dist/tts/utils.cjs +18 -6
  39. package/dist/tts/utils.d.cts +7 -5
  40. package/dist/tts/utils.d.ts +7 -5
  41. package/dist/tts/utils.js +3 -2
  42. package/dist/tts/websocket.cjs +106 -33
  43. package/dist/tts/websocket.d.cts +20 -10
  44. package/dist/tts/websocket.d.ts +20 -10
  45. package/dist/tts/websocket.js +5 -5
  46. package/dist/types/index.d.cts +60 -4
  47. package/dist/types/index.d.ts +60 -4
  48. package/dist/voices/index.js +3 -3
  49. package/package.json +1 -1
  50. package/src/index.ts +2 -0
  51. package/src/react/index.ts +117 -62
  52. package/src/tts/player.ts +15 -8
  53. package/src/tts/source.ts +53 -7
  54. package/src/tts/utils.ts +26 -12
  55. package/src/tts/websocket.ts +42 -19
  56. package/src/types/index.ts +81 -3
package/src/tts/source.ts CHANGED
@@ -1,13 +1,30 @@
1
1
  import Emittery from "emittery";
2
- import type { SourceEventData } from "../types";
2
+ import type { Encoding, SourceEventData, TypedArray } from "../types";
3
+
4
+ type EncodingInfo = {
5
+ arrayType:
6
+ | Float32ArrayConstructor
7
+ | Int16ArrayConstructor
8
+ | Uint8ArrayConstructor;
9
+ bytesPerElement: number;
10
+ };
11
+
12
+ export const ENCODING_MAP: Record<Encoding, EncodingInfo> = {
13
+ pcm_f32le: { arrayType: Float32Array, bytesPerElement: 4 },
14
+ pcm_s16le: { arrayType: Int16Array, bytesPerElement: 2 },
15
+ pcm_alaw: { arrayType: Uint8Array, bytesPerElement: 1 },
16
+ pcm_mulaw: { arrayType: Uint8Array, bytesPerElement: 1 },
17
+ };
3
18
 
4
19
  export default class Source {
5
20
  #emitter = new Emittery<SourceEventData>();
6
- #buffer: Float32Array;
21
+ #buffer: TypedArray;
7
22
  #readIndex = 0;
8
23
  #writeIndex = 0;
9
24
  #closed = false;
10
25
  #sampleRate: number;
26
+ #encoding: Encoding;
27
+ #container: string;
11
28
 
12
29
  on = this.#emitter.on.bind(this.#emitter);
13
30
  once = this.#emitter.once.bind(this.#emitter);
@@ -20,21 +37,46 @@ export default class Source {
20
37
  * @param options - Options for the Source.
21
38
  * @param options.sampleRate - The sample rate of the audio.
22
39
  */
23
- constructor({ sampleRate }: { sampleRate: number }) {
40
+ constructor({
41
+ sampleRate,
42
+ encoding,
43
+ container,
44
+ }: { sampleRate: number; encoding: string; container: string }) {
24
45
  this.#sampleRate = sampleRate;
25
- this.#buffer = new Float32Array(1024); // Initial size, can be adjusted
46
+ this.#encoding = encoding as Encoding;
47
+ this.#container = container;
48
+ this.#buffer = this.#createBuffer(1024); // Initial size, can be adjusted
26
49
  }
27
50
 
28
51
  get sampleRate() {
29
52
  return this.#sampleRate;
30
53
  }
31
54
 
55
+ get encoding() {
56
+ return this.#encoding;
57
+ }
58
+
59
+ get container() {
60
+ return this.#container;
61
+ }
62
+
63
+ /**
64
+ * Create a new buffer for the source.
65
+ *
66
+ * @param size - The size of the buffer to create.
67
+ * @returns The new buffer as a TypedArray based on the encoding.
68
+ */
69
+ #createBuffer(size: number): TypedArray {
70
+ const { arrayType: ArrayType } = ENCODING_MAP[this.#encoding];
71
+ return new ArrayType(size);
72
+ }
73
+
32
74
  /**
33
75
  * Append audio to the buffer.
34
76
  *
35
77
  * @param src The audio to append.
36
78
  */
37
- async enqueue(src: Float32Array) {
79
+ async enqueue(src: TypedArray) {
38
80
  const requiredCapacity = this.#writeIndex + src.length;
39
81
 
40
82
  // Resize buffer if necessary
@@ -44,7 +86,7 @@ export default class Source {
44
86
  newCapacity *= 2; // Double the buffer size
45
87
  }
46
88
 
47
- const newBuffer = new Float32Array(newCapacity);
89
+ const newBuffer = this.#createBuffer(newCapacity);
48
90
  newBuffer.set(this.#buffer);
49
91
  this.#buffer = newBuffer;
50
92
  }
@@ -62,7 +104,7 @@ export default class Source {
62
104
  * @returns The number of samples read. If the source is closed, this will be
63
105
  * less than the length of the provided buffer.
64
106
  */
65
- async read(dst: Float32Array): Promise<number> {
107
+ async read(dst: TypedArray): Promise<number> {
66
108
  // Read the buffer into the provided buffer.
67
109
  const targetReadIndex = this.#readIndex + dst.length;
68
110
 
@@ -100,6 +142,10 @@ export default class Source {
100
142
  return this.#readIndex;
101
143
  }
102
144
 
145
+ get writeIndex() {
146
+ return this.#writeIndex;
147
+ }
148
+
103
149
  /**
104
150
  * Close the source. This signals that no more audio will be enqueued.
105
151
  *
package/src/tts/utils.ts CHANGED
@@ -1,25 +1,38 @@
1
1
  import base64 from "base64-js";
2
2
  import type Emittery from "emittery";
3
- import type { Chunk, EmitteryCallbacks, Sentinel } from "../types";
3
+ import type {
4
+ Chunk,
5
+ EmitteryCallbacks,
6
+ Encoding,
7
+ Sentinel,
8
+ TypedArray,
9
+ WebSocketResponse,
10
+ } from "../types";
11
+ import { ENCODING_MAP } from "./source";
4
12
 
5
13
  /**
6
- * Convert base64-encoded audio buffer(s) to a Float32Array.
14
+ * Convert base64-encoded audio buffer(s) to a TypedArray.
7
15
  *
8
16
  * @param b64 The base64-encoded audio buffer, or an array of base64-encoded
9
17
  * audio buffers.
10
- * @returns The audio buffer(s) as a Float32Array.
18
+ * @param encoding The encoding of the audio buffer(s).
19
+ * @returns The audio buffer(s) as a TypedArray.
11
20
  */
12
- export function base64ToArray(b64: Chunk[]): Float32Array {
21
+ export function base64ToArray(b64: Chunk[], encoding: string): TypedArray {
13
22
  const byteArrays = filterSentinel(b64).map((b) => base64.toByteArray(b));
23
+
24
+ const { arrayType: ArrayType, bytesPerElement } =
25
+ ENCODING_MAP[encoding as Encoding];
26
+
14
27
  const totalLength = byteArrays.reduce(
15
- (acc, arr) => acc + arr.length / Float32Array.BYTES_PER_ELEMENT,
28
+ (acc, arr) => acc + arr.length / bytesPerElement,
16
29
  0,
17
30
  );
18
- const result = new Float32Array(totalLength);
31
+ const result = new ArrayType(totalLength);
19
32
 
20
33
  let offset = 0;
21
34
  for (const arr of byteArrays) {
22
- const floats = new Float32Array(arr.buffer);
35
+ const floats = new ArrayType(arr.buffer);
23
36
  result.set(floats, offset);
24
37
  offset += floats.length;
25
38
  }
@@ -70,26 +83,27 @@ export function createMessageHandlerForContextId(
70
83
  chunk,
71
84
  message,
72
85
  }: {
73
- chunk: Chunk;
86
+ chunk?: Chunk;
74
87
  message: string;
88
+ data: WebSocketResponse;
75
89
  }) => void,
76
90
  ) {
77
91
  return (event: MessageEvent) => {
78
92
  if (typeof event.data !== "string") {
79
93
  return; // Ignore non-string messages.
80
94
  }
81
- const message = JSON.parse(event.data);
95
+ const message: WebSocketResponse = JSON.parse(event.data);
82
96
  if (message.context_id !== contextId) {
83
97
  return; // Ignore messages for other contexts.
84
98
  }
85
- let chunk: Chunk;
99
+ let chunk: Chunk | undefined;
86
100
  if (message.done) {
87
101
  // Convert the done message to a sentinel value.
88
102
  chunk = getSentinel();
89
- } else {
103
+ } else if (message.type === "chunk") {
90
104
  chunk = message.data;
91
105
  }
92
- handler({ chunk, message: event.data });
106
+ handler({ chunk, message: event.data, data: message });
93
107
  };
94
108
  }
95
109
 
package/src/tts/websocket.ts CHANGED
@@ -6,8 +6,10 @@ import { CARTESIA_VERSION, constructApiUrl } from "../lib/constants";
6
6
  import type {
7
7
  ConnectionEventData,
8
8
  EmitteryCallbacks,
9
+ StreamOptions,
9
10
  StreamRequest,
10
11
  WebSocketOptions,
12
+ WordTimestamps,
11
13
  } from "../types";
12
14
  import Source from "./source";
13
15
  import {
@@ -21,6 +23,8 @@ export default class WebSocket extends Client {
21
23
  socket?: PartySocketWebSocket;
22
24
  #isConnected = false;
23
25
  #sampleRate: number;
26
+ #container: string;
27
+ #encoding: string;
24
28
 
25
29
  /**
26
30
  * Create a new WebSocket client.
@@ -28,50 +32,58 @@ export default class WebSocket extends Client {
28
32
  * @param args - Arguments to pass to the Client constructor.
29
33
  */
30
34
  constructor(
31
- { sampleRate }: WebSocketOptions,
35
+ { sampleRate, container, encoding }: WebSocketOptions,
32
36
  ...args: ConstructorParameters<typeof Client>
33
37
  ) {
34
38
  super(...args);
35
39
 
36
40
  this.#sampleRate = sampleRate;
41
+ this.#container = container ?? "raw"; // Default to raw audio for backwards compatibility.
42
+ this.#encoding = encoding ?? "pcm_f32le"; // Default to 32-bit floating point PCM for backwards compatibility.
37
43
  }
38
44
 
39
45
  /**
40
- * Send a message over the WebSocket in order to start a stream.
46
+ * Send a message over the WebSocket to start a stream.
41
47
  *
42
- * @param inputs - Stream options.
48
+ * @param inputs - Stream options. Defined in the StreamRequest type.
43
49
  * @param options - Options for the stream.
44
50
  * @param options.timeout - The maximum time to wait for a chunk before cancelling the stream.
45
- * If `0`, the stream will not time out.
51
+ * If set to `0`, the stream will not time out.
46
52
  * @returns A Source object that can be passed to a Player to play the audio.
53
+ * @returns An Emittery instance that emits messages from the WebSocket.
54
+ * @returns An abort function that can be called to cancel the stream.
47
55
  */
48
- send(
49
- inputs: StreamRequest["inputs"],
50
- { timeout = 0 }: StreamRequest["options"] = {},
51
- ) {
56
+ send({ ...inputs }: StreamRequest, { timeout = 0 }: StreamOptions = {}) {
52
57
  if (!this.#isConnected) {
53
58
  throw new Error("Not connected to WebSocket. Call .connect() first.");
54
59
  }
55
60
 
61
+ if (!inputs.context_id) {
62
+ inputs.context_id = this.#generateId();
63
+ }
64
+ if (!inputs.output_format) {
65
+ inputs.output_format = {
66
+ container: this.#container,
67
+ encoding: this.#encoding,
68
+ sample_rate: this.#sampleRate,
69
+ };
70
+ }
71
+
56
72
  // Send audio request.
57
- const contextId = this.#generateId();
58
73
  this.socket?.send(
59
74
  JSON.stringify({
60
- context_id: contextId,
61
75
  ...inputs,
62
- output_format: {
63
- container: "raw",
64
- encoding: "pcm_f32le",
65
- sample_rate: this.#sampleRate,
66
- },
67
76
  }),
68
77
  );
69
78
 
70
79
  const emitter = new Emittery<{
71
80
  message: string;
81
+ timestamps: WordTimestamps;
72
82
  }>();
73
83
  const source = new Source({
74
84
  sampleRate: this.#sampleRate,
85
+ encoding: this.#encoding,
86
+ container: this.#container,
75
87
  });
76
88
  // Used to signal that the stream is complete, either because the
77
89
  // WebSocket has closed, or because the stream has finished.
@@ -82,19 +94,26 @@ export default class WebSocket extends Client {
82
94
  timeoutId = setTimeout(streamCompleteController.abort, timeout);
83
95
  }
84
96
  const handleMessage = createMessageHandlerForContextId(
85
- contextId,
86
- async ({ chunk, message }) => {
97
+ inputs.context_id,
98
+ async ({ chunk, message, data }) => {
87
99
  emitter.emit("message", message);
100
+ if (data.type === "timestamps") {
101
+ emitter.emit("timestamps", data.word_timestamps);
102
+ return;
103
+ }
88
104
  if (isSentinel(chunk)) {
89
105
  await source.close();
90
106
  streamCompleteController.abort();
91
107
  return;
92
108
  }
93
- await source.enqueue(base64ToArray([chunk]));
94
109
  if (timeoutId) {
95
110
  clearTimeout(timeoutId);
96
111
  timeoutId = setTimeout(streamCompleteController.abort, timeout);
97
112
  }
113
+ if (!chunk) {
114
+ return;
115
+ }
116
+ await source.enqueue(base64ToArray([chunk], this.#encoding));
98
117
  },
99
118
  );
100
119
  this.socket?.addEventListener("message", handleMessage, {
@@ -125,7 +144,11 @@ export default class WebSocket extends Client {
125
144
  }
126
145
  });
127
146
 
128
- return { source, ...getEmitteryCallbacks(emitter) };
147
+ return {
148
+ source,
149
+ ...getEmitteryCallbacks(emitter),
150
+ stop: streamCompleteController.abort.bind(streamCompleteController),
151
+ };
129
152
  }
130
153
 
131
154
  /**
package/src/types/index.ts CHANGED
@@ -14,13 +14,85 @@ export type ConnectionEventData = {
14
14
  close: never;
15
15
  };
16
16
 
17
+ export type VoiceSpecifier =
18
+ | {
19
+ mode: "id";
20
+ id: string;
21
+ }
22
+ | {
23
+ mode: "embedding";
24
+ embedding: number[];
25
+ };
26
+
27
+ export type Emotion =
28
+ | "anger"
29
+ | "sadness"
30
+ | "positivity"
31
+ | "curiosity"
32
+ | "surprise";
33
+ export type Intensity = "lowest" | "low" | "high" | "highest";
34
+ export type EmotionControl = Emotion | `${Emotion}:${Intensity}`;
35
+
36
+ export type VoiceOptions = VoiceSpecifier & {
37
+ __experimental_controls?: {
38
+ speed?: "slowest" | "slow" | "normal" | "fast" | "fastest";
39
+ emotion?: EmotionControl[];
40
+ };
41
+ };
42
+
17
43
  export type StreamRequest = {
18
- inputs: object;
19
- options: {
20
- timeout?: number;
44
+ model_id: string;
45
+ transcript: string;
46
+ voice: VoiceOptions;
47
+ output_format?: {
48
+ container: string;
49
+ encoding: string;
50
+ sample_rate: number;
21
51
  };
52
+ context_id?: string;
53
+ continue?: boolean;
54
+ duration?: number;
55
+ language?: string;
56
+ add_timestamps?: boolean;
57
+ };
58
+
59
+ export type StreamOptions = {
60
+ timeout?: number;
61
+ };
62
+
63
+ export type WebSocketBaseResponse = {
64
+ context_id: string;
65
+ status_code: number;
66
+ done: boolean;
67
+ };
68
+
69
+ export type WordTimestamps = {
70
+ words: string[];
71
+ start: number[];
72
+ end: number[];
73
+ };
74
+
75
+ export type WebSocketTimestampsResponse = WebSocketBaseResponse & {
76
+ type: "timestamps";
77
+ word_timestamps: WordTimestamps;
78
+ };
79
+
80
+ export type WebSocketChunkResponse = WebSocketBaseResponse & {
81
+ type: "chunk";
82
+ data: string;
83
+ step_time: number;
84
+ };
85
+
86
+ export type WebSocketErrorResponse = WebSocketBaseResponse & {
87
+ type: "error";
88
+ error: string;
22
89
  };
23
90
 
91
+ export type WebSocketResponse =
92
+ | WebSocketTimestampsResponse
93
+ | WebSocketChunkResponse
94
+ | WebSocketErrorResponse;
95
+
24
96
  export type EmitteryCallbacks<T> = {
25
97
  on: Emittery<T>["on"];
26
98
  off: Emittery<T>["off"];
@@ -56,6 +128,8 @@ export type CloneResponse = {
56
128
  };
57
129
 
58
130
  export type WebSocketOptions = {
131
+ container?: string;
132
+ encoding?: string;
59
133
  sampleRate: number;
60
134
  };
61
135
 
@@ -65,3 +139,7 @@ export type SourceEventData = {
65
139
  wait: never;
66
140
  read: never;
67
141
  };
142
+
143
+ export type TypedArray = Float32Array | Int16Array | Uint8Array;
144
+
145
+ export type Encoding = "pcm_f32le" | "pcm_s16le" | "pcm_alaw" | "pcm_mulaw";