@realtimex/sdk 1.2.0 → 1.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.mts CHANGED
@@ -80,16 +80,53 @@ interface Task {
80
80
  runs: TaskRun[];
81
81
  }
82
82
  interface TTSOptions {
83
+ /** Voice ID (provider-specific) */
83
84
  voice?: string;
85
+ /** Model ID (provider-specific) */
84
86
  model?: string;
87
+ /** Speech speed (0.5-2.0) */
85
88
  speed?: number;
89
+ /** TTS provider ID */
86
90
  provider?: string;
91
+ /** Language code (e.g., 'en', 'es', 'fr') - for Supertonic */
92
+ language?: string;
93
+ /** Quality level (1-20) - for Supertonic num_inference_steps */
94
+ num_inference_steps?: number;
95
+ }
96
+ interface TTSProviderConfig {
97
+ /** Available voice/speaker IDs */
98
+ voices: string[];
99
+ /** Supported languages (for multilingual providers) */
100
+ languages?: string[];
101
+ /** Speed range */
102
+ speed?: {
103
+ min: number;
104
+ max: number;
105
+ default: number;
106
+ };
107
+ /** Quality range (for providers that support it) */
108
+ quality?: {
109
+ min: number;
110
+ max: number;
111
+ default: number;
112
+ description?: string;
113
+ };
87
114
  }
88
115
  interface TTSProvider {
116
+ /** Provider ID (e.g., 'elevenlabs', 'supertonic_local') */
89
117
  id: string;
118
+ /** Display name */
90
119
  name: string;
91
- type: 'remote' | 'local';
92
- voices: string[];
120
+ /** Provider type: 'server' (remote API) or 'client' (local) */
121
+ type: 'server' | 'client';
122
+ /** Whether provider is configured and ready */
123
+ configured: boolean;
124
+ /** Whether streaming is supported */
125
+ supportsStreaming: boolean;
126
+ /** Optional note about provider requirements */
127
+ note?: string;
128
+ /** Configuration options */
129
+ config?: TTSProviderConfig;
93
130
  }
94
131
  interface TTSProvidersResponse {
95
132
  success: boolean;
@@ -97,6 +134,24 @@ interface TTSProvidersResponse {
97
134
  default: string;
98
135
  error?: string;
99
136
  }
137
+ interface TTSChunk {
138
+ /** Chunk index (0-based) */
139
+ index: number;
140
+ /** Total number of chunks */
141
+ total: number;
142
+ /** Decoded audio data (ArrayBuffer) - ready for playback */
143
+ audio: ArrayBuffer;
144
+ /** Audio MIME type */
145
+ mimeType: string;
146
+ }
147
+ interface TTSChunkEvent {
148
+ type: 'info' | 'chunk' | 'error' | 'done';
149
+ data: TTSChunk | {
150
+ totalChunks: number;
151
+ } | {
152
+ error: string;
153
+ };
154
+ }
100
155
 
101
156
  /**
102
157
  * Activities Module - HTTP Proxy to RealtimeX Main App
@@ -680,19 +735,22 @@ declare class TTSModule {
680
735
  */
681
736
  speak(text: string, options?: TTSOptions): Promise<ArrayBuffer>;
682
737
  /**
683
- * Generate speech from text (returns stream)
738
+ * Generate speech from text with streaming (yields decoded audio chunks)
739
+ * Uses SSE internally but returns decoded ArrayBuffer chunks for easy playback.
684
740
  *
685
741
  * @example
686
742
  * ```ts
687
- * const stream = await sdk.tts.speakStream("Hello world");
688
- * for await (const chunk of stream) {
689
- * // Play chunk...
743
+ * for await (const chunk of sdk.tts.speakStream("Hello world")) {
744
+ * // chunk.audio is ArrayBuffer (already decoded!)
745
+ * const blob = new Blob([chunk.audio], { type: chunk.mimeType });
746
+ * const audio = new Audio(URL.createObjectURL(blob));
747
+ * await audio.play();
690
748
  * }
691
749
  * ```
692
750
  */
693
- speakStream(text: string, options?: TTSOptions): AsyncGenerator<Uint8Array>;
751
+ speakStream(text: string, options?: TTSOptions): AsyncGenerator<TTSChunk>;
694
752
  /**
695
- * List available TTS providers
753
+ * List available TTS providers with configuration options
696
754
  */
697
755
  listProviders(): Promise<TTSProvider[]>;
698
756
  }
@@ -745,4 +803,4 @@ declare class RealtimeXSDK {
745
803
  getAppDataDir(): Promise<string>;
746
804
  }
747
805
 
748
- export { ActivitiesModule, type Activity, type Agent, ApiModule, type ChatMessage, type ChatOptions, type ChatResponse, type EmbedOptions, type EmbedResponse, LLMModule, LLMPermissionError, LLMProviderError, PermissionDeniedError, PermissionRequiredError, PortModule, type Provider, type ProvidersResponse, RealtimeXSDK, type SDKConfig, type StreamChunk, TTSModule, type TTSOptions, type TTSProvider, type TTSProvidersResponse, type Task, TaskModule, type TaskRun, type Thread, type TriggerAgentPayload, type TriggerAgentResponse, type VectorDeleteOptions, type VectorDeleteResponse, type VectorQueryOptions, type VectorQueryResponse, type VectorQueryResult, type VectorRecord, VectorStore, type VectorUpsertOptions, type VectorUpsertResponse, WebhookModule, type Workspace };
806
+ export { ActivitiesModule, type Activity, type Agent, ApiModule, type ChatMessage, type ChatOptions, type ChatResponse, type EmbedOptions, type EmbedResponse, LLMModule, LLMPermissionError, LLMProviderError, PermissionDeniedError, PermissionRequiredError, PortModule, type Provider, type ProvidersResponse, RealtimeXSDK, type SDKConfig, type StreamChunk, type TTSChunk, type TTSChunkEvent, TTSModule, type TTSOptions, type TTSProvider, type TTSProviderConfig, type TTSProvidersResponse, type Task, TaskModule, type TaskRun, type Thread, type TriggerAgentPayload, type TriggerAgentResponse, type VectorDeleteOptions, type VectorDeleteResponse, type VectorQueryOptions, type VectorQueryResponse, type VectorQueryResult, type VectorRecord, VectorStore, type VectorUpsertOptions, type VectorUpsertResponse, WebhookModule, type Workspace };
package/dist/index.d.ts CHANGED
@@ -80,16 +80,53 @@ interface Task {
80
80
  runs: TaskRun[];
81
81
  }
82
82
  interface TTSOptions {
83
+ /** Voice ID (provider-specific) */
83
84
  voice?: string;
85
+ /** Model ID (provider-specific) */
84
86
  model?: string;
87
+ /** Speech speed (0.5-2.0) */
85
88
  speed?: number;
89
+ /** TTS provider ID */
86
90
  provider?: string;
91
+ /** Language code (e.g., 'en', 'es', 'fr') - for Supertonic */
92
+ language?: string;
93
+ /** Quality level (1-20) - for Supertonic num_inference_steps */
94
+ num_inference_steps?: number;
95
+ }
96
+ interface TTSProviderConfig {
97
+ /** Available voice/speaker IDs */
98
+ voices: string[];
99
+ /** Supported languages (for multilingual providers) */
100
+ languages?: string[];
101
+ /** Speed range */
102
+ speed?: {
103
+ min: number;
104
+ max: number;
105
+ default: number;
106
+ };
107
+ /** Quality range (for providers that support it) */
108
+ quality?: {
109
+ min: number;
110
+ max: number;
111
+ default: number;
112
+ description?: string;
113
+ };
87
114
  }
88
115
  interface TTSProvider {
116
+ /** Provider ID (e.g., 'elevenlabs', 'supertonic_local') */
89
117
  id: string;
118
+ /** Display name */
90
119
  name: string;
91
- type: 'remote' | 'local';
92
- voices: string[];
120
+ /** Provider type: 'server' (remote API) or 'client' (local) */
121
+ type: 'server' | 'client';
122
+ /** Whether provider is configured and ready */
123
+ configured: boolean;
124
+ /** Whether streaming is supported */
125
+ supportsStreaming: boolean;
126
+ /** Optional note about provider requirements */
127
+ note?: string;
128
+ /** Configuration options */
129
+ config?: TTSProviderConfig;
93
130
  }
94
131
  interface TTSProvidersResponse {
95
132
  success: boolean;
@@ -97,6 +134,24 @@ interface TTSProvidersResponse {
97
134
  default: string;
98
135
  error?: string;
99
136
  }
137
+ interface TTSChunk {
138
+ /** Chunk index (0-based) */
139
+ index: number;
140
+ /** Total number of chunks */
141
+ total: number;
142
+ /** Decoded audio data (ArrayBuffer) - ready for playback */
143
+ audio: ArrayBuffer;
144
+ /** Audio MIME type */
145
+ mimeType: string;
146
+ }
147
+ interface TTSChunkEvent {
148
+ type: 'info' | 'chunk' | 'error' | 'done';
149
+ data: TTSChunk | {
150
+ totalChunks: number;
151
+ } | {
152
+ error: string;
153
+ };
154
+ }
100
155
 
101
156
  /**
102
157
  * Activities Module - HTTP Proxy to RealtimeX Main App
@@ -680,19 +735,22 @@ declare class TTSModule {
680
735
  */
681
736
  speak(text: string, options?: TTSOptions): Promise<ArrayBuffer>;
682
737
  /**
683
- * Generate speech from text (returns stream)
738
+ * Generate speech from text with streaming (yields decoded audio chunks)
739
+ * Uses SSE internally but returns decoded ArrayBuffer chunks for easy playback.
684
740
  *
685
741
  * @example
686
742
  * ```ts
687
- * const stream = await sdk.tts.speakStream("Hello world");
688
- * for await (const chunk of stream) {
689
- * // Play chunk...
743
+ * for await (const chunk of sdk.tts.speakStream("Hello world")) {
744
+ * // chunk.audio is ArrayBuffer (already decoded!)
745
+ * const blob = new Blob([chunk.audio], { type: chunk.mimeType });
746
+ * const audio = new Audio(URL.createObjectURL(blob));
747
+ * await audio.play();
690
748
  * }
691
749
  * ```
692
750
  */
693
- speakStream(text: string, options?: TTSOptions): AsyncGenerator<Uint8Array>;
751
+ speakStream(text: string, options?: TTSOptions): AsyncGenerator<TTSChunk>;
694
752
  /**
695
- * List available TTS providers
753
+ * List available TTS providers with configuration options
696
754
  */
697
755
  listProviders(): Promise<TTSProvider[]>;
698
756
  }
@@ -745,4 +803,4 @@ declare class RealtimeXSDK {
745
803
  getAppDataDir(): Promise<string>;
746
804
  }
747
805
 
748
- export { ActivitiesModule, type Activity, type Agent, ApiModule, type ChatMessage, type ChatOptions, type ChatResponse, type EmbedOptions, type EmbedResponse, LLMModule, LLMPermissionError, LLMProviderError, PermissionDeniedError, PermissionRequiredError, PortModule, type Provider, type ProvidersResponse, RealtimeXSDK, type SDKConfig, type StreamChunk, TTSModule, type TTSOptions, type TTSProvider, type TTSProvidersResponse, type Task, TaskModule, type TaskRun, type Thread, type TriggerAgentPayload, type TriggerAgentResponse, type VectorDeleteOptions, type VectorDeleteResponse, type VectorQueryOptions, type VectorQueryResponse, type VectorQueryResult, type VectorRecord, VectorStore, type VectorUpsertOptions, type VectorUpsertResponse, WebhookModule, type Workspace };
806
+ export { ActivitiesModule, type Activity, type Agent, ApiModule, type ChatMessage, type ChatOptions, type ChatResponse, type EmbedOptions, type EmbedResponse, LLMModule, LLMPermissionError, LLMProviderError, PermissionDeniedError, PermissionRequiredError, PortModule, type Provider, type ProvidersResponse, RealtimeXSDK, type SDKConfig, type StreamChunk, type TTSChunk, type TTSChunkEvent, TTSModule, type TTSOptions, type TTSProvider, type TTSProviderConfig, type TTSProvidersResponse, type Task, TaskModule, type TaskRun, type Thread, type TriggerAgentPayload, type TriggerAgentResponse, type VectorDeleteOptions, type VectorDeleteResponse, type VectorQueryOptions, type VectorQueryResponse, type VectorQueryResult, type VectorRecord, VectorStore, type VectorUpsertOptions, type VectorUpsertResponse, WebhookModule, type Workspace };
package/dist/index.js CHANGED
@@ -1034,7 +1034,7 @@ var TTSModule = class {
1034
1034
  if (!response.ok) {
1035
1035
  const data = await response.json();
1036
1036
  if (data.code === "PERMISSION_REQUIRED") {
1037
- const permission = data.permission || "tts.speak";
1037
+ const permission = data.permission || "tts.generate";
1038
1038
  const granted = await this.requestPermission(permission);
1039
1039
  if (granted) {
1040
1040
  return this.request(method, endpoint, body, isStream);
@@ -1068,35 +1068,94 @@ var TTSModule = class {
1068
1068
  });
1069
1069
  }
1070
1070
  /**
1071
- * Generate speech from text (returns stream)
1071
+ * Generate speech from text with streaming (yields decoded audio chunks)
1072
+ * Uses SSE internally but returns decoded ArrayBuffer chunks for easy playback.
1072
1073
  *
1073
1074
  * @example
1074
1075
  * ```ts
1075
- * const stream = await sdk.tts.speakStream("Hello world");
1076
- * for await (const chunk of stream) {
1077
- * // Play chunk...
1076
+ * for await (const chunk of sdk.tts.speakStream("Hello world")) {
1077
+ * // chunk.audio is ArrayBuffer (already decoded!)
1078
+ * const blob = new Blob([chunk.audio], { type: chunk.mimeType });
1079
+ * const audio = new Audio(URL.createObjectURL(blob));
1080
+ * await audio.play();
1078
1081
  * }
1079
1082
  * ```
1080
1083
  */
1081
1084
  async *speakStream(text, options = {}) {
1082
- const body = await this.request("POST", "/sdk/tts/stream", {
1083
- text,
1084
- ...options
1085
- }, true);
1086
- if (!body) throw new Error("No response body");
1087
- const reader = body.getReader();
1085
+ const response = await fetch(`${this.baseUrl}/sdk/tts/stream`, {
1086
+ method: "POST",
1087
+ headers: this.headers,
1088
+ body: JSON.stringify({ text, ...options })
1089
+ });
1090
+ if (!response.ok) {
1091
+ const data = await response.json();
1092
+ if (data.code === "PERMISSION_REQUIRED") {
1093
+ const permission = data.permission || "tts.generate";
1094
+ const granted = await this.requestPermission(permission);
1095
+ if (granted) {
1096
+ yield* this.speakStream(text, options);
1097
+ return;
1098
+ }
1099
+ throw new PermissionDeniedError(permission);
1100
+ }
1101
+ throw new Error(data.error || `Streaming failed: ${response.status}`);
1102
+ }
1103
+ const reader = response.body?.getReader();
1104
+ if (!reader) throw new Error("No response body");
1105
+ const decoder = new TextDecoder();
1106
+ let buffer = "";
1107
+ let eventType = "";
1088
1108
  try {
1089
1109
  while (true) {
1090
1110
  const { done, value } = await reader.read();
1091
1111
  if (done) break;
1092
- yield value;
1112
+ buffer += decoder.decode(value, { stream: true });
1113
+ const lines = buffer.split("\n");
1114
+ buffer = lines.pop() || "";
1115
+ for (const line of lines) {
1116
+ const trimmedLine = line.trim();
1117
+ if (!trimmedLine) continue;
1118
+ if (trimmedLine.startsWith("event:")) {
1119
+ eventType = trimmedLine.slice(6).trim();
1120
+ } else if (trimmedLine.startsWith("data:")) {
1121
+ const eventData = trimmedLine.slice(5).trim();
1122
+ if (eventType === "chunk" && eventData) {
1123
+ try {
1124
+ const parsed = JSON.parse(eventData);
1125
+ const binaryString = atob(parsed.audio);
1126
+ const bytes = new Uint8Array(binaryString.length);
1127
+ for (let i = 0; i < binaryString.length; i++) {
1128
+ bytes[i] = binaryString.charCodeAt(i);
1129
+ }
1130
+ yield {
1131
+ index: parsed.index,
1132
+ total: parsed.total,
1133
+ audio: bytes.buffer,
1134
+ mimeType: parsed.mimeType
1135
+ };
1136
+ } catch (e) {
1137
+ console.warn("[TTS SDK] Failed to parse chunk:", e);
1138
+ }
1139
+ } else if (eventType === "error" && eventData) {
1140
+ try {
1141
+ const err = JSON.parse(eventData);
1142
+ throw new Error(err.error || "TTS streaming error");
1143
+ } catch (e) {
1144
+ if (e instanceof Error && e.message !== "TTS streaming error") {
1145
+ throw e;
1146
+ }
1147
+ }
1148
+ }
1149
+ eventType = "";
1150
+ }
1151
+ }
1093
1152
  }
1094
1153
  } finally {
1095
1154
  reader.releaseLock();
1096
1155
  }
1097
1156
  }
1098
1157
  /**
1099
- * List available TTS providers
1158
+ * List available TTS providers with configuration options
1100
1159
  */
1101
1160
  async listProviders() {
1102
1161
  const data = await this.request("GET", "/sdk/tts/providers");
package/dist/index.mjs CHANGED
@@ -986,7 +986,7 @@ var TTSModule = class {
986
986
  if (!response.ok) {
987
987
  const data = await response.json();
988
988
  if (data.code === "PERMISSION_REQUIRED") {
989
- const permission = data.permission || "tts.speak";
989
+ const permission = data.permission || "tts.generate";
990
990
  const granted = await this.requestPermission(permission);
991
991
  if (granted) {
992
992
  return this.request(method, endpoint, body, isStream);
@@ -1020,35 +1020,94 @@ var TTSModule = class {
1020
1020
  });
1021
1021
  }
1022
1022
  /**
1023
- * Generate speech from text (returns stream)
1023
+ * Generate speech from text with streaming (yields decoded audio chunks)
1024
+ * Uses SSE internally but returns decoded ArrayBuffer chunks for easy playback.
1024
1025
  *
1025
1026
  * @example
1026
1027
  * ```ts
1027
- * const stream = await sdk.tts.speakStream("Hello world");
1028
- * for await (const chunk of stream) {
1029
- * // Play chunk...
1028
+ * for await (const chunk of sdk.tts.speakStream("Hello world")) {
1029
+ * // chunk.audio is ArrayBuffer (already decoded!)
1030
+ * const blob = new Blob([chunk.audio], { type: chunk.mimeType });
1031
+ * const audio = new Audio(URL.createObjectURL(blob));
1032
+ * await audio.play();
1030
1033
  * }
1031
1034
  * ```
1032
1035
  */
1033
1036
  async *speakStream(text, options = {}) {
1034
- const body = await this.request("POST", "/sdk/tts/stream", {
1035
- text,
1036
- ...options
1037
- }, true);
1038
- if (!body) throw new Error("No response body");
1039
- const reader = body.getReader();
1037
+ const response = await fetch(`${this.baseUrl}/sdk/tts/stream`, {
1038
+ method: "POST",
1039
+ headers: this.headers,
1040
+ body: JSON.stringify({ text, ...options })
1041
+ });
1042
+ if (!response.ok) {
1043
+ const data = await response.json();
1044
+ if (data.code === "PERMISSION_REQUIRED") {
1045
+ const permission = data.permission || "tts.generate";
1046
+ const granted = await this.requestPermission(permission);
1047
+ if (granted) {
1048
+ yield* this.speakStream(text, options);
1049
+ return;
1050
+ }
1051
+ throw new PermissionDeniedError(permission);
1052
+ }
1053
+ throw new Error(data.error || `Streaming failed: ${response.status}`);
1054
+ }
1055
+ const reader = response.body?.getReader();
1056
+ if (!reader) throw new Error("No response body");
1057
+ const decoder = new TextDecoder();
1058
+ let buffer = "";
1059
+ let eventType = "";
1040
1060
  try {
1041
1061
  while (true) {
1042
1062
  const { done, value } = await reader.read();
1043
1063
  if (done) break;
1044
- yield value;
1064
+ buffer += decoder.decode(value, { stream: true });
1065
+ const lines = buffer.split("\n");
1066
+ buffer = lines.pop() || "";
1067
+ for (const line of lines) {
1068
+ const trimmedLine = line.trim();
1069
+ if (!trimmedLine) continue;
1070
+ if (trimmedLine.startsWith("event:")) {
1071
+ eventType = trimmedLine.slice(6).trim();
1072
+ } else if (trimmedLine.startsWith("data:")) {
1073
+ const eventData = trimmedLine.slice(5).trim();
1074
+ if (eventType === "chunk" && eventData) {
1075
+ try {
1076
+ const parsed = JSON.parse(eventData);
1077
+ const binaryString = atob(parsed.audio);
1078
+ const bytes = new Uint8Array(binaryString.length);
1079
+ for (let i = 0; i < binaryString.length; i++) {
1080
+ bytes[i] = binaryString.charCodeAt(i);
1081
+ }
1082
+ yield {
1083
+ index: parsed.index,
1084
+ total: parsed.total,
1085
+ audio: bytes.buffer,
1086
+ mimeType: parsed.mimeType
1087
+ };
1088
+ } catch (e) {
1089
+ console.warn("[TTS SDK] Failed to parse chunk:", e);
1090
+ }
1091
+ } else if (eventType === "error" && eventData) {
1092
+ try {
1093
+ const err = JSON.parse(eventData);
1094
+ throw new Error(err.error || "TTS streaming error");
1095
+ } catch (e) {
1096
+ if (e instanceof Error && e.message !== "TTS streaming error") {
1097
+ throw e;
1098
+ }
1099
+ }
1100
+ }
1101
+ eventType = "";
1102
+ }
1103
+ }
1045
1104
  }
1046
1105
  } finally {
1047
1106
  reader.releaseLock();
1048
1107
  }
1049
1108
  }
1050
1109
  /**
1051
- * List available TTS providers
1110
+ * List available TTS providers with configuration options
1052
1111
  */
1053
1112
  async listProviders() {
1054
1113
  const data = await this.request("GET", "/sdk/tts/providers");
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@realtimex/sdk",
3
- "version": "1.2.0",
3
+ "version": "1.2.2",
4
4
  "description": "SDK for building Local Apps that integrate with RealtimeX",
5
5
  "main": "dist/index.js",
6
6
  "module": "dist/index.mjs",