@upliftai/sdk-js 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,242 @@
1
+ # @upliftai/sdk-js
2
+
3
+ Official Node.js SDK for the [UpliftAI](https://upliftai.org) API. Build Urdu voice agents, add text-to-speech to WhatsApp bots, or transcribe call center audio.
4
+
5
+ [Documentation](https://docs.upliftai.org) · [Voices](https://docs.upliftai.org/orator_voices) · [API Reference](https://docs.upliftai.org/api-reference)
6
+
7
+ ## Install
8
+
9
+ ```bash
10
+ npm install @upliftai/sdk-js
11
+ ```
12
+
13
+ ## Quick start
14
+
15
+ Generate speech and save to a file:
16
+
17
+ ```ts
18
+ import { writeFileSync } from "node:fs";
19
+ import { UpliftAI } from "@upliftai/sdk-js";
20
+
21
+ const client = new UpliftAI({
22
+ apiKey: "sk_...", // or defaults to process.env.UPLIFTAI_API_KEY
23
+ });
24
+
25
+ const { audio } = await client.tts.create({
26
+ text: "السلام علیکم، میں آپ کی کیا مدد کر سکتا ہوں؟", // "Hello, how can I help you?"
27
+ voiceId: "v_meklc281",
28
+ });
29
+
30
+ writeFileSync("hello.wav", audio);
31
+ ```
32
+
33
+ ### Options
34
+
35
+ | Option | Default | Description |
36
+ |---|---|---|
37
+ | `apiKey` | `process.env.UPLIFTAI_API_KEY` | API key |
38
+ | `timeout` | `30000` | Request timeout (ms) |
39
+ | `maxRetries` | `2` | Retries on 429 and 5xx |
40
+
41
+ ```ts
42
+ const client = new UpliftAI({ apiKey: "sk_...", timeout: 60_000 });
43
+ ```
44
+
45
+ ## Text-to-speech
46
+
47
+ ### Generate audio
48
+
49
+ Returns the full audio buffer. Best for batch/offline use.
50
+
51
+ ```ts
52
+ import { writeFileSync } from "node:fs";
53
+
54
+ const { audio, metadata } = await client.tts.create({
55
+ text: "آج موسم بہت اچھا ہے", // "The weather is great today"
56
+ voiceId: "v_meklc281",
57
+ outputFormat: "MP3_22050_128", // optional, defaults to WAV_22050_32
58
+ });
59
+
60
+ writeFileSync("output.mp3", audio);
61
+ console.log(metadata.contentType); // e.g. "audio/mpeg"
62
+ ```
63
+
64
+ ### Stream audio
65
+
66
+ Returns a `Readable` stream over HTTP. The first chunk arrives quickly, so use it for real-time playback.
67
+
68
+ ```ts
69
+ const { stream, metadata } = await client.tts.createStream({
70
+ text: "اردو میں ایک لمبا جملہ", // "A long sentence in Urdu"
71
+ voiceId: "v_meklc281",
72
+ outputFormat: "MP3_22050_64",
73
+ });
74
+
75
+ for await (const chunk of stream) {
76
+ process.stdout.write(chunk); // or pipe to speaker/file
77
+ }
78
+ ```
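+
+ The stream is a regular Node `Readable`, so you can also pipe it instead of iterating. A minimal sketch that writes chunks to disk as they arrive (same client and voice as above):
+
+ ```ts
+ import { createWriteStream } from "node:fs";
+ import { pipeline } from "node:stream/promises";
+
+ const { stream } = await client.tts.createStream({
+   text: "فائل میں محفوظ کریں", // "Save to a file"
+   voiceId: "v_meklc281",
+   outputFormat: "MP3_22050_64",
+ });
+
+ // Write audio to disk incrementally instead of buffering the whole file
+ await pipeline(stream, createWriteStream("streamed.mp3"));
+ ```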
79
+
80
+ ### Async jobs
81
+
82
+ Enqueue a job and retrieve audio later. Returns a `temporaryUrl` you can pass directly to a frontend, WhatsApp, or `<audio>` element — no auth required.
83
+
84
+ ```ts
85
+ const { mediaId, temporaryUrl } = await client.tts.enqueue({
86
+ text: "بعد میں حاصل کریں", // "Retrieve later"
87
+ voiceId: "v_meklc281",
88
+ });
89
+
90
+ // Option 1: retrieve server-side
91
+ const { stream } = await client.tts.retrieve(mediaId);
92
+
93
+ // Option 2: pass URL directly to client — no auth needed
94
+ console.log(temporaryUrl);
95
+ // https://api.upliftai.org/v1/synthesis/stream-audio/media_abc?token=eyJ...
96
+ ```
97
+
98
+ ### WebSocket (real-time)
99
+
100
+ Persistent connection for low-latency streaming. Use one connection per conversation/user session. Defaults to `PCM_22050_16` output format.
101
+
102
+ ```ts
103
+ const ws = await client.tts.connect();
104
+
105
+ // 1. Stream a sentence
106
+ const s1 = ws.stream({ text: "پہلا جملہ۔", voiceId: "v_meklc281" }); // "First sentence."
107
+ for await (const event of s1) {
108
+ if (event.type === "audio") speaker.write(event.audio);
109
+ }
110
+
111
+ // 2. User interrupts — cancel everything
112
+ ws.cancelAll(); // or cancel a specific stream with s1.cancel()
113
+
114
+ // 3. Start a new stream on the same connection
115
+ const s2 = ws.stream({ text: "نیا جواب۔", voiceId: "v_meklc281" }); // "New response."
116
+ for await (const event of s2) {
117
+ if (event.type === "audio") speaker.write(event.audio);
118
+ }
119
+
120
+ ws.close();
121
+ ```
122
+
123
+ Events: `audio_start`, `audio`, `audio_end`, `error`.
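+
+ If you want to react to the full lifecycle rather than just audio chunks, branch on `event.type`. A sketch (reusing the `speaker` placeholder from the example above):
+
+ ```ts
+ const s = ws.stream({ text: "ایونٹس کی مثال", voiceId: "v_meklc281" }); // "Events example"
+
+ for await (const event of s) {
+   switch (event.type) {
+     case "audio_start":
+       console.log("started", event.requestId);
+       break;
+     case "audio":
+       speaker.write(event.audio); // Buffer (PCM_22050_16 by default)
+       break;
+     case "audio_end":
+       console.log("finished", event.requestId);
+       break;
+     case "error":
+       console.error(event.code, event.message);
+       break;
+   }
+ }
+ ```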
124
+
125
+ #### Real-time voice agent (pseudocode)
126
+
127
+ For conversational AI, break your LLM output into sentences and stream each one as it arrives. This gives the lowest time-to-first-audio since synthesis starts before the LLM finishes generating. If you use [LiveKit](https://livekit.io), the UpliftAI plugin handles this automatically.
128
+
129
+ ```ts
130
+ const ws = await client.tts.connect();
131
+
132
+ // LLM streams tokens → your tokenizer emits complete sentences
133
+ for await (const sentence of tokenizeSentences(llmStream)) {
134
+ const stream = ws.stream({ text: sentence, voiceId: "v_meklc281" });
135
+
136
+ for await (const event of stream) {
137
+ if (event.type === "audio") player.write(event.audio);
138
+ }
139
+ }
140
+
141
+ // User interrupts mid-response
142
+ ws.cancelAll(); // stops all in-flight audio immediately
143
+
144
+ ws.close();
145
+ ```
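+
+ `tokenizeSentences` above is not part of the SDK; it stands in for whatever sentence splitter you use. A minimal sketch, assuming the LLM stream yields text deltas as strings and splitting on Urdu and Latin sentence-ending punctuation:
+
+ ```ts
+ async function* tokenizeSentences(llmStream: AsyncIterable<string>) {
+   let buffer = "";
+   for await (const delta of llmStream) {
+     buffer += delta;
+     // Emit a sentence whenever terminal punctuation appears (., !, ?, ۔, ؟)
+     let match: RegExpMatchArray | null;
+     while ((match = buffer.match(/[.!?۔؟]/)) !== null) {
+       const end = match.index! + 1;
+       const sentence = buffer.slice(0, end).trim();
+       if (sentence) yield sentence;
+       buffer = buffer.slice(end);
+     }
+   }
+   if (buffer.trim()) yield buffer.trim(); // flush any trailing text
+ }
+ ```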
146
+
147
+ We will be building a context-aware streaming solution in the future, so you won't have to worry about tokenization and sentence breaking. Stay tuned!
148
+
149
+ ### Phrase replacements
150
+
151
+ Control pronunciation of specific words and phrases. Perfect for handling:
152
+
153
+ - **Brand names** — convert English spellings to Urdu phonetics
154
+ - **Technical terms** — ensure consistent pronunciation
155
+ - **LLM outputs** — fix common misspellings from AI models
156
+ - **Regional variations** — adapt to local dialects
157
+
158
+ ```ts
159
+ const config = await client.tts.phraseReplacements.create([
160
+ { phrase: "Meezan bank", replacement: "میزان بینک" }, // English brand name → Urdu pronunciation
161
+ ]);
164
+
165
+ await client.tts.create({
166
+ text: "ہماری API بہت تیز ہے", // "Our API is very fast"
167
+ voiceId: "v_meklc281",
168
+ phraseReplacementConfigId: config.configId,
169
+ });
170
+ ```
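+
+ Per the type declarations, a config can also be fetched, listed, and updated later. For example, to cover an alternate spelling of the same brand:
+
+ ```ts
+ const configs = await client.tts.phraseReplacements.list();
+ console.log(configs.map((c) => c.configId));
+
+ await client.tts.phraseReplacements.update(config.configId, [
+   { phrase: "Meezan bank", replacement: "میزان بینک" },
+   { phrase: "Meezan Bank", replacement: "میزان بینک" }, // alternate capitalization
+ ]);
+ ```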
171
+
172
+ [Read more about phrase replacements](https://docs.upliftai.org/orator#phrase-replacement-for-perfect-pronunciation)
173
+
174
+ ## Speech-to-text
175
+
176
+ Accepts a file path, `Buffer`, or readable stream. Pass `fileName` with Buffer/stream inputs so the server can detect the audio format.
177
+
178
+ ```ts
179
+ // From file path
180
+ const { transcript } = await client.stt.transcribe({
181
+ file: "./recording.mp3",
182
+ model: "scribe",
183
+ });
184
+
185
+ // From buffer
186
+ const { transcript } = await client.stt.transcribe({
187
+ file: audioBuffer,
188
+ fileName: "recording.mp3",
189
+ model: "scribe",
190
+ language: "ur",
191
+ });
192
+ ```
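+
+ Readable streams work the same way as Buffers, which avoids loading large recordings into memory. A sketch using `createReadStream` from `node:fs`:
+
+ ```ts
+ import { createReadStream } from "node:fs";
+
+ // From a stream (fileName tells the server the audio format)
+ const { transcript } = await client.stt.transcribe({
+   file: createReadStream("./long-call.wav"),
+   fileName: "long-call.wav",
+   model: "scribe",
+ });
+ ```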
193
+
194
+ | Model | Description |
195
+ |---|---|
196
+ | `scribe` | Higher accuracy, recommended for most use cases |
197
+ | `scribe-mini` | Faster, lower cost |
198
+
199
+ ## Error handling
200
+
201
+ All errors include a `requestId` for debugging with UpliftAI support.
202
+
203
+ ```ts
204
+ import {
205
+ UpliftAIError,
206
+ UpliftAIAuthError, // 401
207
+ UpliftAIInsufficientBalanceError, // 402
208
+ UpliftAIRateLimitError, // 429
209
+ } from "@upliftai/sdk-js";
210
+
211
+ try {
212
+ await client.tts.create({ text: "...", voiceId: "..." });
213
+ } catch (err) {
214
+ if (err instanceof UpliftAIRateLimitError) {
215
+ // back off and retry
216
+ }
217
+ if (err instanceof UpliftAIError) {
218
+ console.log(err.statusCode, err.requestId);
219
+ }
220
+ }
221
+ ```
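+
+ The client already retries 429 and 5xx responses up to `maxRetries`. If you want an extra application-level retry on top of that, a minimal exponential-backoff sketch (illustrative, not part of the SDK):
+
+ ```ts
+ async function withBackoff<T>(fn: () => Promise<T>, attempts = 3): Promise<T> {
+   for (let i = 0; ; i++) {
+     try {
+       return await fn();
+     } catch (err) {
+       if (!(err instanceof UpliftAIRateLimitError) || i >= attempts - 1) throw err;
+       await new Promise((r) => setTimeout(r, 2 ** i * 1000)); // 1s, 2s, 4s, ...
+     }
+   }
+ }
+
+ const { audio } = await withBackoff(() =>
+   client.tts.create({ text: "دوبارہ کوشش", voiceId: "v_meklc281" }) // "Retry"
+ );
+ ```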
222
+
223
+ ## Output formats
224
+
225
+ | Format | Use case |
226
+ |---|---|
227
+ | `WAV_22050_32` | General purpose (default) |
228
+ | `WAV_22050_16` | General purpose, smaller files |
229
+ | `MP3_22050_128` | Web playback, high quality |
230
+ | `MP3_22050_64` | Web playback, balanced |
231
+ | `MP3_22050_32` | Web playback, low bandwidth |
232
+ | `PCM_22050_16` | Real-time streaming, WebSocket default |
233
+ | `OGG_22050_16` | Web playback, open format, streaming not supported at this time |
234
+ | `ULAW_8000_8` | Telephony (SIP, PSTN) |
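+
+ For example, a telephony integration might request μ-law audio and stream it straight to the call's media layer (`sipMedia` below is a placeholder for your telephony stack):
+
+ ```ts
+ const { stream } = await client.tts.createStream({
+   text: "کال موصول ہو گئی ہے", // "The call has been received"
+   voiceId: "v_meklc281",
+   outputFormat: "ULAW_8000_8", // 8 kHz μ-law for SIP/PSTN
+ });
+
+ for await (const chunk of stream) {
+   sipMedia.write(chunk); // placeholder: your telephony media sink
+ }
+ ```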
235
+
236
+ ## Requirements
237
+
238
+ Node.js >= 18 · TypeScript types included · ESM and CommonJS supported
239
+
240
+ ## License
241
+
242
+ MIT
@@ -0,0 +1,304 @@
1
+ import * as node_stream from 'node:stream';
2
+
3
+ interface HttpClientOptions {
4
+ baseUrl: string;
5
+ apiKey: string;
6
+ timeout?: number;
7
+ maxRetries?: number;
8
+ }
9
+ declare class HttpClient {
10
+ private baseUrl;
11
+ private apiKey;
12
+ private timeout;
13
+ private maxRetries;
14
+ constructor(options: HttpClientOptions);
15
+ private headers;
16
+ private fetchWithRetry;
17
+ private retryDelay;
18
+ postJSON<T>(path: string, body: Record<string, unknown>): Promise<{
19
+ data: T;
20
+ headers: Headers;
21
+ }>;
22
+ postJSONForBuffer(path: string, body: Record<string, unknown>): Promise<{
23
+ buffer: Buffer;
24
+ headers: Headers;
25
+ }>;
26
+ postJSONForStream(path: string, body: Record<string, unknown>): Promise<{
27
+ body: ReadableStream<Uint8Array>;
28
+ headers: Headers;
29
+ }>;
30
+ postMultipart<T>(path: string, formData: FormData): Promise<{
31
+ data: T;
32
+ headers: Headers;
33
+ }>;
34
+ get<T>(path: string): Promise<{
35
+ data: T;
36
+ headers: Headers;
37
+ }>;
38
+ getStream(path: string, query?: Record<string, string>): Promise<{
39
+ body: ReadableStream<Uint8Array>;
40
+ headers: Headers;
41
+ }>;
42
+ private throwForStatus;
43
+ private safeText;
44
+ }
45
+
46
+ type OutputFormat = 'PCM_22050_16' | 'WAV_22050_16' | 'WAV_22050_32' | 'MP3_22050_32' | 'MP3_22050_64' | 'MP3_22050_128' | 'OGG_22050_16' | 'ULAW_8000_8';
47
+ interface TTSRequest {
48
+ text: string;
49
+ voiceId: string;
50
+ outputFormat?: OutputFormat;
51
+ phraseReplacementConfigId?: string;
52
+ }
53
+ interface AudioMetadata {
54
+ requestId: string;
55
+ duration: number;
56
+ contentType: string;
57
+ sampleRate: number;
58
+ bitRate: number;
59
+ }
60
+ interface AudioResponse {
61
+ audio: Buffer;
62
+ metadata: AudioMetadata;
63
+ }
64
+ interface StreamResponse {
65
+ stream: node_stream.Readable;
66
+ metadata: AudioMetadata;
67
+ }
68
+ /**
69
+ * Result of enqueuing a TTS job. Use `mediaId` with `retrieve()` to fetch
70
+ * the audio, or pass `temporaryUrl` directly to a frontend/client (e.g.
71
+ * WhatsApp, browser audio element) without downloading first.
72
+ */
73
+ interface EnqueueResponse {
74
+ mediaId: string;
75
+ token: string;
76
+ /** Pre-signed URL to stream audio directly — no auth required. Short-lived, do not persist. */
77
+ temporaryUrl: string;
78
+ }
79
+ interface WSAudioStart {
80
+ type: 'audio_start';
81
+ requestId: string;
82
+ timestamp: number;
83
+ }
84
+ interface WSAudio {
85
+ type: 'audio';
86
+ requestId: string;
87
+ sequence: number;
88
+ audio: Buffer;
89
+ }
90
+ interface WSAudioEnd {
91
+ type: 'audio_end';
92
+ requestId: string;
93
+ timestamp: number;
94
+ }
95
+ interface WSError {
96
+ type: 'error';
97
+ requestId: string;
98
+ code: string;
99
+ message: string;
100
+ }
101
+ type TTSStreamEvent = WSAudioStart | WSAudio | WSAudioEnd | WSError;
102
+ interface TranscriptionRequestBase {
103
+ model?: 'scribe' | 'scribe-mini';
104
+ language?: 'ur';
105
+ domain?: 'phone-commerce' | 'farming';
106
+ }
107
+ interface TranscriptionRequestFromPath extends TranscriptionRequestBase {
108
+ /** Path to an audio file. Extension is used for content-type detection. */
109
+ file: string;
110
+ fileName?: never;
111
+ }
112
+ interface TranscriptionRequestFromBuffer extends TranscriptionRequestBase {
113
+ /** Audio data as a Buffer or readable stream. */
114
+ file: Buffer | NodeJS.ReadableStream;
115
+ /**
116
+ * Filename hint for content-type detection on the server (e.g. `'call.mp3'`).
117
+ * The extension tells the server what format the audio is in.
118
+ */
119
+ fileName: string;
120
+ }
121
+ type TranscriptionRequest = TranscriptionRequestFromPath | TranscriptionRequestFromBuffer;
122
+ interface TranscriptionResponse {
123
+ transcript: string;
124
+ }
125
+ interface PhraseReplacement {
126
+ phrase: string;
127
+ replacement: string;
128
+ }
129
+ interface PhraseReplacementConfig {
130
+ configId: string;
131
+ phraseReplacements: PhraseReplacement[];
132
+ }
133
+ interface UpliftAIOptions {
134
+ apiKey?: string;
135
+ baseUrl?: string;
136
+ timeout?: number;
137
+ maxRetries?: number;
138
+ }
139
+ interface TTSStream extends AsyncIterable<TTSStreamEvent> {
140
+ cancel(): Promise<void>;
141
+ requestId: string;
142
+ }
143
+ type WSReadyState = 'connecting' | 'open' | 'closing' | 'closed';
144
+ interface TTSWebSocket {
145
+ stream(request: TTSRequest & {
146
+ requestId?: string;
147
+ }): TTSStream;
148
+ cancelAll(): void;
149
+ readonly activeStreams: number;
150
+ close(): void;
151
+ readonly readyState: WSReadyState;
152
+ readonly sessionId: string;
153
+ on(event: 'error', listener: (error: Error) => void): this;
154
+ on(event: 'close', listener: (code: number, reason: string) => void): this;
155
+ }
156
+
157
+ declare class PhraseReplacements {
158
+ private http;
159
+ constructor(http: HttpClient);
160
+ create(replacements: PhraseReplacement[]): Promise<PhraseReplacementConfig>;
161
+ get(configId: string): Promise<PhraseReplacementConfig>;
162
+ list(): Promise<PhraseReplacementConfig[]>;
163
+ update(configId: string, replacements: PhraseReplacement[]): Promise<PhraseReplacementConfig>;
164
+ }
165
+
166
+ /** Text-to-speech resource. Access via `client.tts`. */
167
+ declare class TTS {
168
+ private http;
169
+ private apiKey;
170
+ private baseUrl;
171
+ private wsBaseUrl;
172
+ /** Manage phrase replacement configs for pronunciation control. */
173
+ readonly phraseReplacements: PhraseReplacements;
174
+ constructor(http: HttpClient, apiKey: string, baseUrl: string, wsBaseUrl: string);
175
+ /**
176
+ * Synthesize text and return the full audio buffer.
177
+ *
178
+ * Generates the complete audio before returning. Faster end-to-end than
179
+ * streaming, but the caller must wait for the entire file. Best for
180
+ * batch/offline use cases where latency to first byte doesn't matter.
181
+ *
182
+ * @example
183
+ * const { audio, metadata } = await client.tts.create({ text: 'سلام', voiceId: 'v_meklc281' });
184
+ * fs.writeFileSync('output.mp3', audio);
185
+ */
186
+ create(request: TTSRequest): Promise<AudioResponse>;
187
+ /**
188
+ * Synthesize text and return a readable stream of audio chunks.
189
+ *
190
+ * The first chunk arrives quickly, but total generation is slower than
191
+ * `create()`. Use this in latency-sensitive environments like live agents,
192
+ * phone calls, or real-time playback where you want audio to start playing
193
+ * immediately rather than waiting for the full file.
194
+ *
195
+ * @example
196
+ * const { stream, metadata } = await client.tts.createStream({ text: 'سلام', voiceId: 'v_meklc281' });
197
+ * for await (const chunk of stream) speaker.write(chunk);
198
+ */
199
+ createStream(request: TTSRequest): Promise<StreamResponse>;
200
+ /**
201
+ * Enqueue an async TTS job. Returns a `mediaId` to retrieve the audio later.
202
+ *
203
+ * Use for batch processing or when you don't need audio immediately.
204
+ * Poll or call `retrieve(mediaId)` when the audio is ready.
205
+ *
206
+ * @example
207
+ * const { mediaId, temporaryUrl } = await client.tts.enqueue({ text: 'سلام', voiceId: 'v_meklc281' });
208
+ * // retrieve server-side
209
+ * const audio = await client.tts.retrieve(mediaId);
210
+ * // or pass URL directly to a client/browser
211
+ * console.log(temporaryUrl);
212
+ */
213
+ enqueue(request: TTSRequest): Promise<EnqueueResponse>;
214
+ /**
215
+ * Enqueue an async TTS job with streaming retrieval.
216
+ *
217
+ * Same as `enqueue()`, but when retrieved via `retrieve(mediaId)` the audio
218
+ * streams in chunks instead of arriving as a single buffer.
219
+ *
220
+ * @example
221
+ * const { mediaId, temporaryUrl } = await client.tts.enqueueStream({ text: 'سلام', voiceId: 'v_meklc281' });
222
+ * const stream = await client.tts.retrieve(mediaId);
223
+ * for await (const chunk of stream) speaker.write(chunk);
224
+ */
225
+ enqueueStream(request: TTSRequest): Promise<EnqueueResponse>;
226
+ /**
227
+ * Retrieve audio from a previously enqueued job.
228
+ *
229
+ * Returns the audio stream along with metadata (encoding, sample rate, etc.)
230
+ * from response headers.
231
+ *
232
+ * @example
233
+ * const { stream, metadata } = await client.tts.retrieve('<mediaId from enqueue>');
234
+ * console.log(metadata.contentType); // 'audio/mpeg'
235
+ * for await (const chunk of stream) fs.appendFileSync('out.mp3', chunk);
236
+ */
237
+ retrieve(mediaId: string): Promise<StreamResponse>;
238
+ /**
239
+ * Open a persistent WebSocket connection for low-latency streaming TTS.
240
+ *
241
+ * Supports multiple concurrent streams on one connection, multiplexed by
242
+ * requestId. Use for real-time conversational AI, live agents, and
243
+ * interactive use cases. Resolves once the connection is ready.
244
+ *
245
+ * Open one connection per conversation or user session — don't share across
246
+ * unrelated contexts.
247
+ *
248
+ * @example
249
+ * const ws = await client.tts.connect();
250
+ * // Stream sentence-by-sentence as your LLM generates
251
+ * for await (const sentence of llm.streamSentences(prompt)) {
252
+ * const stream = ws.stream({ text: sentence, voiceId: 'v_meklc281' });
253
+ * for await (const event of stream) {
254
+ * if (event.type === 'audio') speaker.write(event.audio);
255
+ * }
256
+ * }
257
+ * ws.close();
258
+ */
259
+ connect(): Promise<TTSWebSocket>;
260
+ private buildTemporaryUrl;
261
+ }
262
+
263
+ /** Speech-to-text resource. Access via `client.stt`. */
264
+ declare class STT {
265
+ private http;
266
+ constructor(http: HttpClient);
267
+ /**
268
+ * Transcribe audio to text.
269
+ *
270
+ * Accepts a file path, Buffer, or readable stream as input.
271
+ *
272
+ * @example
273
+ * // From file path (extension used for content-type detection)
274
+ * const { transcript } = await client.stt.transcribe({ file: './call.mp3', model: 'scribe' });
275
+ *
276
+ * // From Buffer (pass fileName so the server knows the format)
277
+ * const { transcript } = await client.stt.transcribe({ file: audioBuffer, fileName: 'call.mp3', language: 'ur' });
278
+ */
279
+ transcribe(request: TranscriptionRequest): Promise<TranscriptionResponse>;
280
+ }
281
+
282
+ declare class UpliftAI {
283
+ readonly tts: TTS;
284
+ readonly stt: STT;
285
+ constructor(options?: UpliftAIOptions);
286
+ }
287
+
288
+ declare class UpliftAIError extends Error {
289
+ readonly statusCode?: number | undefined;
290
+ readonly code?: string | undefined;
291
+ readonly requestId?: string | undefined;
292
+ constructor(message: string, statusCode?: number | undefined, code?: string | undefined, requestId?: string | undefined);
293
+ }
294
+ declare class UpliftAIAuthError extends UpliftAIError {
295
+ constructor(message?: string, requestId?: string);
296
+ }
297
+ declare class UpliftAIInsufficientBalanceError extends UpliftAIError {
298
+ constructor(message?: string, requestId?: string);
299
+ }
300
+ declare class UpliftAIRateLimitError extends UpliftAIError {
301
+ constructor(message?: string, requestId?: string);
302
+ }
303
+
304
+ export { type AudioMetadata, type AudioResponse, type EnqueueResponse, type OutputFormat, type PhraseReplacement, type PhraseReplacementConfig, type StreamResponse, type TTSRequest, type TTSStream, type TTSStreamEvent, type TTSWebSocket, type TranscriptionRequest, type TranscriptionRequestFromBuffer, type TranscriptionRequestFromPath, type TranscriptionResponse, UpliftAI, UpliftAIAuthError, UpliftAIError, UpliftAIInsufficientBalanceError, type UpliftAIOptions, UpliftAIRateLimitError, type WSAudio, type WSAudioEnd, type WSAudioStart, type WSError, type WSReadyState, UpliftAI as default };