@upliftai/sdk-js 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,242 @@
1
+ # @upliftai/sdk-js
2
+
3
+ Official Node.js SDK for the [UpliftAI](https://upliftai.org) API. Build Urdu voice agents, add text-to-speech to WhatsApp bots, or transcribe call center audio.
4
+
5
+ [Documentation](https://docs.upliftai.org) · [Voices](https://docs.upliftai.org/orator_voices) · [API Reference](https://docs.upliftai.org/api-reference)
6
+
7
+ ## Install
8
+
9
+ ```bash
10
+ npm install @upliftai/sdk-js
11
+ ```
12
+
13
+ ## Quick start
14
+
15
+ Generate speech and save to a file:
16
+
17
+ ```ts
18
+ import { writeFileSync } from "node:fs";
19
+ import { UpliftAI } from "@upliftai/sdk-js";
20
+
21
+ const client = new UpliftAI({
22
+ apiKey: "sk_...", // or defaults to process.env.UPLIFTAI_API_KEY
23
+ });
24
+
25
+ const { audio } = await client.tts.create({
26
+ text: "السلام علیکم، میں آپ کی کیا مدد کر سکتا ہوں؟", // "Hello, how can I help you?"
27
+ voiceId: "v_meklc281",
28
+ });
29
+
30
+ writeFileSync("hello.wav", audio);
31
+ ```
32
+
33
+ ### Options
34
+
35
+ | Option | Default | Description |
36
+ |---|---|---|
37
+ | `apiKey` | `process.env.UPLIFTAI_API_KEY` | API key |
38
+ | `timeout` | `30000` | Request timeout (ms) |
39
+ | `maxRetries` | `2` | Retries on 429 and 5xx |
40
+
41
+ ```ts
42
+ const client = new UpliftAI({ apiKey: "sk_...", timeout: 60_000 });
43
+ ```
44
+
45
+ ## Text-to-speech
46
+
47
+ ### Generate audio
48
+
49
+ Returns the full audio buffer. Best for batch/offline use.
50
+
51
+ ```ts
52
+ import { writeFileSync } from "node:fs";
53
+
54
+ const { audio, metadata } = await client.tts.create({
55
+ text: "آج موسم بہت اچھا ہے", // "The weather is great today"
56
+ voiceId: "v_meklc281",
57
+ outputFormat: "MP3_22050_128", // optional, defaults to WAV_22050_32
58
+ });
59
+
60
+ writeFileSync("output.mp3", audio);
61
+ console.log(metadata.contentType); // e.g. "audio/mpeg"
62
+ ```
63
+
64
+ ### Stream audio
65
+
66
+ Returns a `Readable` stream over HTTP. The first chunk arrives quickly, so use it for real-time playback.
67
+
68
+ ```ts
69
+ const { stream, metadata } = await client.tts.createStream({
70
+ text: "اردو میں ایک لمبا جملہ", // "A long sentence in Urdu"
71
+ voiceId: "v_meklc281",
72
+ outputFormat: "MP3_22050_64",
73
+ });
74
+
75
+ for await (const chunk of stream) {
76
+ process.stdout.write(chunk); // or pipe to speaker/file
77
+ }
78
+ ```
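+
+ The stream is a regular Node `Readable`, so you can also pipe it instead of iterating. A minimal sketch that writes chunks to disk as they arrive (same client and voice as above):
+
+ ```ts
+ import { createWriteStream } from "node:fs";
+ import { pipeline } from "node:stream/promises";
+
+ const { stream } = await client.tts.createStream({
+   text: "فائل میں محفوظ کریں", // "Save to a file"
+   voiceId: "v_meklc281",
+   outputFormat: "MP3_22050_64",
+ });
+
+ // Write audio to disk incrementally instead of buffering the whole file
+ await pipeline(stream, createWriteStream("streamed.mp3"));
+ ```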
79
+
80
+ ### Async jobs
81
+
82
+ Enqueue a job and retrieve audio later. Returns a `temporaryUrl` you can pass directly to a frontend, WhatsApp, or `<audio>` element — no auth required.
83
+
84
+ ```ts
85
+ const { mediaId, temporaryUrl } = await client.tts.enqueue({
86
+ text: "بعد میں حاصل کریں", // "Retrieve later"
87
+ voiceId: "v_meklc281",
88
+ });
89
+
90
+ // Option 1: retrieve server-side
91
+ const { stream } = await client.tts.retrieve(mediaId);
92
+
93
+ // Option 2: pass URL directly to client — no auth needed
94
+ console.log(temporaryUrl);
95
+ // https://api.upliftai.org/v1/synthesis/stream-audio/media_abc?token=eyJ...
96
+ ```
97
+
98
+ ### WebSocket (real-time)
99
+
100
+ Persistent connection for low-latency streaming. Use one connection per conversation/user session. Defaults to `PCM_22050_16` output format.
101
+
102
+ ```ts
103
+ const ws = await client.tts.connect();
104
+
105
+ // 1. Stream a sentence
106
+ const s1 = ws.stream({ text: "پہلا جملہ۔", voiceId: "v_meklc281" }); // "First sentence."
107
+ for await (const event of s1) {
108
+ if (event.type === "audio") speaker.write(event.audio);
109
+ }
110
+
111
+ // 2. User interrupts — cancel everything
112
+ ws.cancelAll(); // or cancel a specific stream with s1.cancel()
113
+
114
+ // 3. Start a new stream on the same connection
115
+ const s2 = ws.stream({ text: "نیا جواب۔", voiceId: "v_meklc281" }); // "New response."
116
+ for await (const event of s2) {
117
+ if (event.type === "audio") speaker.write(event.audio);
118
+ }
119
+
120
+ ws.close();
121
+ ```
122
+
123
+ Events: `audio_start`, `audio`, `audio_end`, `error`.
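+
+ If you want to react to the full lifecycle rather than just audio chunks, branch on `event.type`. A sketch (reusing the `speaker` placeholder from the example above):
+
+ ```ts
+ const s = ws.stream({ text: "ایونٹس کی مثال", voiceId: "v_meklc281" }); // "Events example"
+
+ for await (const event of s) {
+   switch (event.type) {
+     case "audio_start":
+       console.log("started", event.requestId);
+       break;
+     case "audio":
+       speaker.write(event.audio); // Buffer (PCM_22050_16 by default)
+       break;
+     case "audio_end":
+       console.log("finished", event.requestId);
+       break;
+     case "error":
+       console.error(event.code, event.message);
+       break;
+   }
+ }
+ ```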
124
+
125
+ #### Real-time voice agent (pseudocode)
126
+
127
+ For conversational AI, break your LLM output into sentences and stream each one as it arrives. This gives the lowest time-to-first-audio since synthesis starts before the LLM finishes generating. If you use [LiveKit](https://livekit.io), the UpliftAI plugin handles this automatically.
128
+
129
+ ```ts
130
+ const ws = await client.tts.connect();
131
+
132
+ // LLM streams tokens → your tokenizer emits complete sentences
133
+ for await (const sentence of tokenizeSentences(llmStream)) {
134
+ const stream = ws.stream({ text: sentence, voiceId: "v_meklc281" });
135
+
136
+ for await (const event of stream) {
137
+ if (event.type === "audio") player.write(event.audio);
138
+ }
139
+ }
140
+
141
+ // User interrupts mid-response
142
+ ws.cancelAll(); // stops all in-flight audio immediately
143
+
144
+ ws.close();
145
+ ```
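+
+ `tokenizeSentences` above is not part of the SDK; it stands in for whatever sentence splitter you use. A minimal sketch, assuming the LLM stream yields text deltas as strings and splitting on Urdu and Latin sentence-ending punctuation:
+
+ ```ts
+ async function* tokenizeSentences(llmStream: AsyncIterable<string>) {
+   let buffer = "";
+   for await (const delta of llmStream) {
+     buffer += delta;
+     // Emit a sentence whenever terminal punctuation appears (., !, ?, ۔, ؟)
+     let match: RegExpMatchArray | null;
+     while ((match = buffer.match(/[.!?۔؟]/)) !== null) {
+       const end = match.index! + 1;
+       const sentence = buffer.slice(0, end).trim();
+       if (sentence) yield sentence;
+       buffer = buffer.slice(end);
+     }
+   }
+   if (buffer.trim()) yield buffer.trim(); // flush any trailing text
+ }
+ ```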
146
+
147
+ We will be building a context-aware streaming solution in the future, so you won't have to worry about tokenization and sentence breaking. Stay tuned!
148
+
149
+ ### Phrase replacements
150
+
151
+ Control pronunciation of specific words and phrases. Perfect for handling:
152
+
153
+ - **Brand names** — convert English spellings to Urdu phonetics
154
+ - **Technical terms** — ensure consistent pronunciation
155
+ - **LLM outputs** — fix common misspellings from AI models
156
+ - **Regional variations** — adapt to local dialects
157
+
158
+ ```ts
159
+ const config = await client.tts.phraseReplacements.create([
160
+ { phrase: "Meezan bank", replacement: "میزان بینک" }, // English brand name → Urdu pronunciation
161
+ ]);
164
+
165
+ await client.tts.create({
166
+ text: "ہماری API بہت تیز ہے", // "Our API is very fast"
167
+ voiceId: "v_meklc281",
168
+ phraseReplacementConfigId: config.configId,
169
+ });
170
+ ```
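+
+ Per the type declarations, a config can also be fetched, listed, and updated later. For example, to cover an alternate spelling of the same brand:
+
+ ```ts
+ const configs = await client.tts.phraseReplacements.list();
+ console.log(configs.map((c) => c.configId));
+
+ await client.tts.phraseReplacements.update(config.configId, [
+   { phrase: "Meezan bank", replacement: "میزان بینک" },
+   { phrase: "Meezan Bank", replacement: "میزان بینک" }, // alternate capitalization
+ ]);
+ ```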
171
+
172
+ [Read more about phrase replacements](https://docs.upliftai.org/orator#phrase-replacement-for-perfect-pronunciation)
173
+
174
+ ## Speech-to-text
175
+
176
+ Accepts a file path, `Buffer`, or readable stream. Pass `fileName` with Buffer/stream inputs so the server can detect the audio format.
177
+
178
+ ```ts
179
+ // From file path
180
+ const { transcript } = await client.stt.transcribe({
181
+ file: "./recording.mp3",
182
+ model: "scribe",
183
+ });
184
+
185
+ // From buffer
186
+ const { transcript } = await client.stt.transcribe({
187
+ file: audioBuffer,
188
+ fileName: "recording.mp3",
189
+ model: "scribe",
190
+ language: "ur",
191
+ });
192
+ ```
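+
+ Readable streams work the same way as Buffers, which avoids loading large recordings into memory. A sketch using `createReadStream` from `node:fs`:
+
+ ```ts
+ import { createReadStream } from "node:fs";
+
+ // From a stream (fileName tells the server the audio format)
+ const { transcript } = await client.stt.transcribe({
+   file: createReadStream("./long-call.wav"),
+   fileName: "long-call.wav",
+   model: "scribe",
+ });
+ ```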
193
+
194
+ | Model | Description |
195
+ |---|---|
196
+ | `scribe` | Higher accuracy, recommended for most use cases |
197
+ | `scribe-mini` | Faster, lower cost |
198
+
199
+ ## Error handling
200
+
201
+ All errors include a `requestId` for debugging with UpliftAI support.
202
+
203
+ ```ts
204
+ import {
205
+ UpliftAIError,
206
+ UpliftAIAuthError, // 401
207
+ UpliftAIInsufficientBalanceError, // 402
208
+ UpliftAIRateLimitError, // 429
209
+ } from "@upliftai/sdk-js";
210
+
211
+ try {
212
+ await client.tts.create({ text: "...", voiceId: "..." });
213
+ } catch (err) {
214
+ if (err instanceof UpliftAIRateLimitError) {
215
+ // back off and retry
216
+ }
217
+ if (err instanceof UpliftAIError) {
218
+ console.log(err.statusCode, err.requestId);
219
+ }
220
+ }
221
+ ```
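+
+ The client already retries 429 and 5xx responses up to `maxRetries`. If you want an extra application-level retry on top of that, a minimal exponential-backoff sketch (illustrative, not part of the SDK):
+
+ ```ts
+ async function withBackoff<T>(fn: () => Promise<T>, attempts = 3): Promise<T> {
+   for (let i = 0; ; i++) {
+     try {
+       return await fn();
+     } catch (err) {
+       if (!(err instanceof UpliftAIRateLimitError) || i >= attempts - 1) throw err;
+       await new Promise((r) => setTimeout(r, 2 ** i * 1000)); // 1s, 2s, 4s, ...
+     }
+   }
+ }
+
+ const { audio } = await withBackoff(() =>
+   client.tts.create({ text: "دوبارہ کوشش", voiceId: "v_meklc281" }) // "Retry"
+ );
+ ```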
222
+
223
+ ## Output formats
224
+
225
+ | Format | Use case |
226
+ |---|---|
227
+ | `WAV_22050_32` | General purpose (default) |
228
+ | `WAV_22050_16` | General purpose, smaller files |
229
+ | `MP3_22050_128` | Web playback, high quality |
230
+ | `MP3_22050_64` | Web playback, balanced |
231
+ | `MP3_22050_32` | Web playback, low bandwidth |
232
+ | `PCM_22050_16` | Real-time streaming, WebSocket default |
233
+ | `OGG_22050_16` | Web playback, open format, streaming not supported at this time |
234
+ | `ULAW_8000_8` | Telephony (SIP, PSTN) |
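+
+ For example, a telephony integration might request μ-law audio and stream it straight to the call's media layer (`sipMedia` below is a placeholder for your telephony stack):
+
+ ```ts
+ const { stream } = await client.tts.createStream({
+   text: "کال موصول ہو گئی ہے", // "The call has been received"
+   voiceId: "v_meklc281",
+   outputFormat: "ULAW_8000_8", // 8 kHz μ-law for SIP/PSTN
+ });
+
+ for await (const chunk of stream) {
+   sipMedia.write(chunk); // placeholder: your telephony media sink
+ }
+ ```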
235
+
236
+ ## Requirements
237
+
238
+ Node.js >= 18 · TypeScript types included · ESM and CommonJS supported
239
+
240
+ ## License
241
+
242
+ MIT
@@ -0,0 +1,304 @@
1
+ import * as node_stream from 'node:stream';
2
+
3
+ interface HttpClientOptions {
4
+ baseUrl: string;
5
+ apiKey: string;
6
+ timeout?: number;
7
+ maxRetries?: number;
8
+ }
9
+ declare class HttpClient {
10
+ private baseUrl;
11
+ private apiKey;
12
+ private timeout;
13
+ private maxRetries;
14
+ constructor(options: HttpClientOptions);
15
+ private headers;
16
+ private fetchWithRetry;
17
+ private retryDelay;
18
+ postJSON<T>(path: string, body: Record<string, unknown>): Promise<{
19
+ data: T;
20
+ headers: Headers;
21
+ }>;
22
+ postJSONForBuffer(path: string, body: Record<string, unknown>): Promise<{
23
+ buffer: Buffer;
24
+ headers: Headers;
25
+ }>;
26
+ postJSONForStream(path: string, body: Record<string, unknown>): Promise<{
27
+ body: ReadableStream<Uint8Array>;
28
+ headers: Headers;
29
+ }>;
30
+ postMultipart<T>(path: string, formData: FormData): Promise<{
31
+ data: T;
32
+ headers: Headers;
33
+ }>;
34
+ get<T>(path: string): Promise<{
35
+ data: T;
36
+ headers: Headers;
37
+ }>;
38
+ getStream(path: string, query?: Record<string, string>): Promise<{
39
+ body: ReadableStream<Uint8Array>;
40
+ headers: Headers;
41
+ }>;
42
+ private throwForStatus;
43
+ private safeText;
44
+ }
45
+
46
+ type OutputFormat = 'PCM_22050_16' | 'WAV_22050_16' | 'WAV_22050_32' | 'MP3_22050_32' | 'MP3_22050_64' | 'MP3_22050_128' | 'OGG_22050_16' | 'ULAW_8000_8';
47
+ interface TTSRequest {
48
+ text: string;
49
+ voiceId: string;
50
+ outputFormat?: OutputFormat;
51
+ phraseReplacementConfigId?: string;
52
+ }
53
+ interface AudioMetadata {
54
+ requestId: string;
55
+ duration: number;
56
+ contentType: string;
57
+ sampleRate: number;
58
+ bitRate: number;
59
+ }
60
+ interface AudioResponse {
61
+ audio: Buffer;
62
+ metadata: AudioMetadata;
63
+ }
64
+ interface StreamResponse {
65
+ stream: node_stream.Readable;
66
+ metadata: AudioMetadata;
67
+ }
68
+ /**
69
+ * Result of enqueuing a TTS job. Use `mediaId` with `retrieve()` to fetch
70
+ * the audio, or pass `temporaryUrl` directly to a frontend/client (e.g.
71
+ * WhatsApp, browser audio element) without downloading first.
72
+ */
73
+ interface EnqueueResponse {
74
+ mediaId: string;
75
+ token: string;
76
+ /** Pre-signed URL to stream audio directly — no auth required. Short-lived, do not persist. */
77
+ temporaryUrl: string;
78
+ }
79
+ interface WSAudioStart {
80
+ type: 'audio_start';
81
+ requestId: string;
82
+ timestamp: number;
83
+ }
84
+ interface WSAudio {
85
+ type: 'audio';
86
+ requestId: string;
87
+ sequence: number;
88
+ audio: Buffer;
89
+ }
90
+ interface WSAudioEnd {
91
+ type: 'audio_end';
92
+ requestId: string;
93
+ timestamp: number;
94
+ }
95
+ interface WSError {
96
+ type: 'error';
97
+ requestId: string;
98
+ code: string;
99
+ message: string;
100
+ }
101
+ type TTSStreamEvent = WSAudioStart | WSAudio | WSAudioEnd | WSError;
102
+ interface TranscriptionRequestBase {
103
+ model?: 'scribe' | 'scribe-mini';
104
+ language?: 'ur';
105
+ domain?: 'phone-commerce' | 'farming';
106
+ }
107
+ interface TranscriptionRequestFromPath extends TranscriptionRequestBase {
108
+ /** Path to an audio file. Extension is used for content-type detection. */
109
+ file: string;
110
+ fileName?: never;
111
+ }
112
+ interface TranscriptionRequestFromBuffer extends TranscriptionRequestBase {
113
+ /** Audio data as a Buffer or readable stream. */
114
+ file: Buffer | NodeJS.ReadableStream;
115
+ /**
116
+ * Filename hint for content-type detection on the server (e.g. `'call.mp3'`).
117
+ * The extension tells the server what format the audio is in.
118
+ */
119
+ fileName: string;
120
+ }
121
+ type TranscriptionRequest = TranscriptionRequestFromPath | TranscriptionRequestFromBuffer;
122
+ interface TranscriptionResponse {
123
+ transcript: string;
124
+ }
125
+ interface PhraseReplacement {
126
+ phrase: string;
127
+ replacement: string;
128
+ }
129
+ interface PhraseReplacementConfig {
130
+ configId: string;
131
+ phraseReplacements: PhraseReplacement[];
132
+ }
133
+ interface UpliftAIOptions {
134
+ apiKey?: string;
135
+ baseUrl?: string;
136
+ timeout?: number;
137
+ maxRetries?: number;
138
+ }
139
+ interface TTSStream extends AsyncIterable<TTSStreamEvent> {
140
+ cancel(): Promise<void>;
141
+ requestId: string;
142
+ }
143
+ type WSReadyState = 'connecting' | 'open' | 'closing' | 'closed';
144
+ interface TTSWebSocket {
145
+ stream(request: TTSRequest & {
146
+ requestId?: string;
147
+ }): TTSStream;
148
+ cancelAll(): void;
149
+ readonly activeStreams: number;
150
+ close(): void;
151
+ readonly readyState: WSReadyState;
152
+ readonly sessionId: string;
153
+ on(event: 'error', listener: (error: Error) => void): this;
154
+ on(event: 'close', listener: (code: number, reason: string) => void): this;
155
+ }
156
+
157
+ declare class PhraseReplacements {
158
+ private http;
159
+ constructor(http: HttpClient);
160
+ create(replacements: PhraseReplacement[]): Promise<PhraseReplacementConfig>;
161
+ get(configId: string): Promise<PhraseReplacementConfig>;
162
+ list(): Promise<PhraseReplacementConfig[]>;
163
+ update(configId: string, replacements: PhraseReplacement[]): Promise<PhraseReplacementConfig>;
164
+ }
165
+
166
+ /** Text-to-speech resource. Access via `client.tts`. */
167
+ declare class TTS {
168
+ private http;
169
+ private apiKey;
170
+ private baseUrl;
171
+ private wsBaseUrl;
172
+ /** Manage phrase replacement configs for pronunciation control. */
173
+ readonly phraseReplacements: PhraseReplacements;
174
+ constructor(http: HttpClient, apiKey: string, baseUrl: string, wsBaseUrl: string);
175
+ /**
176
+ * Synthesize text and return the full audio buffer.
177
+ *
178
+ * Generates the complete audio before returning. Faster end-to-end than
179
+ * streaming, but the caller must wait for the entire file. Best for
180
+ * batch/offline use cases where latency to first byte doesn't matter.
181
+ *
182
+ * @example
183
+ * const { audio, metadata } = await client.tts.create({ text: 'سلام', voiceId: 'v_meklc281' });
184
+ * fs.writeFileSync('output.mp3', audio);
185
+ */
186
+ create(request: TTSRequest): Promise<AudioResponse>;
187
+ /**
188
+ * Synthesize text and return a readable stream of audio chunks.
189
+ *
190
+ * The first chunk arrives quickly, but total generation is slower than
191
+ * `create()`. Use this in latency-sensitive environments like live agents,
192
+ * phone calls, or real-time playback where you want audio to start playing
193
+ * immediately rather than waiting for the full file.
194
+ *
195
+ * @example
196
+ * const { stream, metadata } = await client.tts.createStream({ text: 'سلام', voiceId: 'v_meklc281' });
197
+ * for await (const chunk of stream) speaker.write(chunk);
198
+ */
199
+ createStream(request: TTSRequest): Promise<StreamResponse>;
200
+ /**
201
+ * Enqueue an async TTS job. Returns a `mediaId` to retrieve the audio later.
202
+ *
203
+ * Use for batch processing or when you don't need audio immediately.
204
+ * Poll or call `retrieve(mediaId)` when the audio is ready.
205
+ *
206
+ * @example
207
+ * const { mediaId, temporaryUrl } = await client.tts.enqueue({ text: 'سلام', voiceId: 'v_meklc281' });
208
+ * // retrieve server-side
209
+ * const audio = await client.tts.retrieve(mediaId);
210
+ * // or pass URL directly to a client/browser
211
+ * console.log(temporaryUrl);
212
+ */
213
+ enqueue(request: TTSRequest): Promise<EnqueueResponse>;
214
+ /**
215
+ * Enqueue an async TTS job with streaming retrieval.
216
+ *
217
+ * Same as `enqueue()`, but when retrieved via `retrieve(mediaId)` the audio
218
+ * streams in chunks instead of arriving as a single buffer.
219
+ *
220
+ * @example
221
+ * const { mediaId, temporaryUrl } = await client.tts.enqueueStream({ text: 'سلام', voiceId: 'v_meklc281' });
222
+ * const stream = await client.tts.retrieve(mediaId);
223
+ * for await (const chunk of stream) speaker.write(chunk);
224
+ */
225
+ enqueueStream(request: TTSRequest): Promise<EnqueueResponse>;
226
+ /**
227
+ * Retrieve audio from a previously enqueued job.
228
+ *
229
+ * Returns the audio stream along with metadata (encoding, sample rate, etc.)
230
+ * from response headers.
231
+ *
232
+ * @example
233
+ * const { stream, metadata } = await client.tts.retrieve('<mediaId from enqueue>');
234
+ * console.log(metadata.contentType); // 'audio/mpeg'
235
+ * for await (const chunk of stream) fs.appendFileSync('out.mp3', chunk);
236
+ */
237
+ retrieve(mediaId: string): Promise<StreamResponse>;
238
+ /**
239
+ * Open a persistent WebSocket connection for low-latency streaming TTS.
240
+ *
241
+ * Supports multiple concurrent streams on one connection, multiplexed by
242
+ * requestId. Use for real-time conversational AI, live agents, and
243
+ * interactive use cases. Resolves once the connection is ready.
244
+ *
245
+ * Open one connection per conversation or user session — don't share across
246
+ * unrelated contexts.
247
+ *
248
+ * @example
249
+ * const ws = await client.tts.connect();
250
+ * // Stream sentence-by-sentence as your LLM generates
251
+ * for await (const sentence of llm.streamSentences(prompt)) {
252
+ * const stream = ws.stream({ text: sentence, voiceId: 'v_meklc281' });
253
+ * for await (const event of stream) {
254
+ * if (event.type === 'audio') speaker.write(event.audio);
255
+ * }
256
+ * }
257
+ * ws.close();
258
+ */
259
+ connect(): Promise<TTSWebSocket>;
260
+ private buildTemporaryUrl;
261
+ }
262
+
263
+ /** Speech-to-text resource. Access via `client.stt`. */
264
+ declare class STT {
265
+ private http;
266
+ constructor(http: HttpClient);
267
+ /**
268
+ * Transcribe audio to text.
269
+ *
270
+ * Accepts a file path, Buffer, or readable stream as input.
271
+ *
272
+ * @example
273
+ * // From file path (extension used for content-type detection)
274
+ * const { transcript } = await client.stt.transcribe({ file: './call.mp3', model: 'scribe' });
275
+ *
276
+ * // From Buffer (pass fileName so the server knows the format)
277
+ * const { transcript } = await client.stt.transcribe({ file: audioBuffer, fileName: 'call.mp3', language: 'ur' });
278
+ */
279
+ transcribe(request: TranscriptionRequest): Promise<TranscriptionResponse>;
280
+ }
281
+
282
+ declare class UpliftAI {
283
+ readonly tts: TTS;
284
+ readonly stt: STT;
285
+ constructor(options?: UpliftAIOptions);
286
+ }
287
+
288
+ declare class UpliftAIError extends Error {
289
+ readonly statusCode?: number | undefined;
290
+ readonly code?: string | undefined;
291
+ readonly requestId?: string | undefined;
292
+ constructor(message: string, statusCode?: number | undefined, code?: string | undefined, requestId?: string | undefined);
293
+ }
294
+ declare class UpliftAIAuthError extends UpliftAIError {
295
+ constructor(message?: string, requestId?: string);
296
+ }
297
+ declare class UpliftAIInsufficientBalanceError extends UpliftAIError {
298
+ constructor(message?: string, requestId?: string);
299
+ }
300
+ declare class UpliftAIRateLimitError extends UpliftAIError {
301
+ constructor(message?: string, requestId?: string);
302
+ }
303
+
304
+ export { type AudioMetadata, type AudioResponse, type EnqueueResponse, type OutputFormat, type PhraseReplacement, type PhraseReplacementConfig, type StreamResponse, type TTSRequest, type TTSStream, type TTSStreamEvent, type TTSWebSocket, type TranscriptionRequest, type TranscriptionRequestFromBuffer, type TranscriptionRequestFromPath, type TranscriptionResponse, UpliftAI, UpliftAIAuthError, UpliftAIError, UpliftAIInsufficientBalanceError, type UpliftAIOptions, UpliftAIRateLimitError, type WSAudio, type WSAudioEnd, type WSAudioStart, type WSError, type WSReadyState, UpliftAI as default };