@p8n.ai/pi-listens 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/config.ts ADDED
@@ -0,0 +1,182 @@
1
+ import { existsSync, readFileSync } from "node:fs";
2
+ import { homedir, tmpdir } from "node:os";
3
+ import { join } from "node:path";
4
+
5
/** Speech-to-text operating mode, sent as the `mode` query parameter of Sarvam's streaming STT endpoint. */
export type SttMode = "transcribe" | "translate" | "verbatim" | "translit" | "codemix";
/** Microphone capture strategy; presumably "utterance" stops on detected silence while "fixed" records a set duration — confirm against the audio runtime. */
export type RecordMode = "utterance" | "fixed";
7
+
8
/**
 * Fully resolved configuration for the pi-listens extension.
 *
 * Produced by {@link resolveConfig}, which layers built-in defaults,
 * JSON config files, and PI_LISTENS_* / SARVAM_* environment variables.
 */
export interface PiListensConfig {
  /** Sarvam API subscription key; speech features fail without it. */
  apiKey?: string;
  /** Speech-to-text model identifier (default "saaras:v3"). */
  sttModel: string;
  /** STT mode; overridden to "translate" when translateInputToEnglish is set. */
  sttMode: SttMode;
  /** `language-code` query parameter for STT; "unknown" presumably enables auto-detection — confirm with Sarvam docs. */
  sttLanguageCode: string;
  /** When true, microphone/file transcription forces STT mode "translate" (see SarvamSpeechClient.withStreamingSocket). */
  translateInputToEnglish: boolean;
  /** Text-to-speech model identifier (default "bulbul:v3"). */
  ttsModel: string;
  /** BCP-47-style target language for TTS output (default "en-IN"). */
  ttsLanguageCode: string;
  /** TTS voice name (default "shubh"). */
  ttsSpeaker: string;
  /** Optional TTS speaking pace multiplier. */
  ttsPace?: number;
  /** Optional TTS sampling temperature. */
  ttsTemperature?: number;
  /** Output sample rate for synthesized speech, in Hz. */
  ttsSampleRate: number;
  /** Container/codec for synthesized audio; raw PCM variants map to a ".raw" file extension. */
  ttsOutputCodec: "wav" | "mp3" | "linear16" | "mulaw" | "alaw" | "opus" | "flac" | "aac";
  /** Fallback cap on recording length in seconds when streamMaxSeconds is unset. */
  recordSeconds: number;
  /** PCM capture sample rate in Hz; also sent as the STT `sample_rate` parameter. */
  recordSampleRate: number;
  /** Capture strategy; semantics live in the audio runtime — see RecordMode. */
  recordMode: RecordMode;
  /** Leading-silence window in seconds; not referenced in this file — presumably consumed by the recorder, confirm in audio.ts. */
  silenceStartSeconds: number;
  /** Trailing silence, in seconds, after speech that ends a streaming microphone transcription. */
  silenceStopSeconds: number;
  /** Silence threshold: either a percentage of int16 full scale ("1%") or an absolute amplitude ("500"). */
  silenceThreshold: string;
  /** Optional override for the external record command — confirm usage in audio.ts. */
  recordCommand?: string;
  /** Optional override for the external playback command — confirm usage in audio.ts. */
  playCommand?: string;
  /** Optional override for the external PCM streaming command — confirm usage in audio.ts. */
  streamCommand?: string;
  /** Target duration of each audio chunk sent to streaming STT, in milliseconds. */
  streamChunkMs: number;
  /** Hard cap on a streaming microphone session, in seconds. */
  streamMaxSeconds: number;
  /** Directory for temporary audio files (defaults to a tmpdir subfolder). */
  audioDir: string;
  /** Whether temporary audio files are removed after use — confirm in audio.ts. */
  deleteAudio: boolean;
  /** Whether to fall back to text interaction when voice is unavailable — confirm in commands.ts. */
  textFallback: boolean;
  /** Initial value for speaking assistant replies aloud automatically. */
  autoSpeakAssistant: boolean;
  /** Character budget for auto-spoken assistant text — presumably fed to prepareSpokenText, confirm in commands.ts. */
  maxAutoSpeakChars: number;
}
38
+
39
/**
 * Built-in defaults; lowest-precedence layer in resolveConfig.
 * Every non-optional PiListensConfig field must have a value here so the
 * merged result is always complete.
 */
const DEFAULT_CONFIG: PiListensConfig = {
  sttModel: "saaras:v3",
  sttMode: "transcribe",
  sttLanguageCode: "unknown",
  translateInputToEnglish: true,
  ttsModel: "bulbul:v3",
  ttsLanguageCode: "en-IN",
  ttsSpeaker: "shubh",
  ttsPace: 1,
  ttsTemperature: 0.6,
  ttsSampleRate: 24000,
  ttsOutputCodec: "wav",
  recordSeconds: 300,
  recordSampleRate: 16000,
  streamChunkMs: 250,
  streamMaxSeconds: 300,
  recordMode: "utterance",
  silenceStartSeconds: 0.2,
  silenceStopSeconds: 3.5,
  silenceThreshold: "1%", // ~327.67 amplitude on the int16 scale; see silenceThresholdAmplitude in sarvam.ts
  audioDir: join(tmpdir(), "pi-listens"),
  deleteAudio: true,
  textFallback: true,
  autoSpeakAssistant: false,
  maxAutoSpeakChars: 900,
};
65
+
66
/** A partial config layer; undefined fields are skipped during merging. */
type RawConfig = Partial<PiListensConfig>;

/**
 * Builds the effective configuration for the given working directory.
 *
 * Layers are merged via mergeDefined so an undefined value in a later
 * source never erases an earlier one. Precedence, lowest to highest:
 *   1. DEFAULT_CONFIG
 *   2. ~/.pi/agent/pi-listens.json (legacy location)
 *   3. ~/.pi/pi-listens.json
 *   4. <cwd>/.pi/pi-listens.json
 *   5. PI_LISTENS_* / SARVAM_* environment variables
 */
export function resolveConfig(cwd: string): PiListensConfig {
  const legacyUserPath = join(homedir(), ".pi", "agent", "pi-listens.json");
  const userPath = join(homedir(), ".pi", "pi-listens.json");
  const projectPath = join(cwd, ".pi", "pi-listens.json");
  // Object spread keeps the last occurrence of each key, including explicit
  // `undefined` — mergeDefined below filters those out again.
  const fileConfig = {
    ...readJson(legacyUserPath),
    ...readJson(userPath),
    ...readJson(projectPath),
  };

  // Every parser returns undefined for unset or invalid input, so a bad
  // environment variable silently falls through to the file/default value.
  const envConfig: RawConfig = {
    apiKey: env("SARVAM_API_KEY") ?? env("SARVAM_API_SUBSCRIPTION_KEY") ?? env("PI_LISTENS_SARVAM_API_KEY"),
    sttModel: env("PI_LISTENS_STT_MODEL"),
    sttMode: parseSttMode(env("PI_LISTENS_STT_MODE")),
    sttLanguageCode: env("PI_LISTENS_STT_LANGUAGE"),
    translateInputToEnglish: parseBoolean(env("PI_LISTENS_TRANSLATE_INPUT_TO_ENGLISH")),
    ttsModel: env("PI_LISTENS_TTS_MODEL"),
    ttsLanguageCode: env("PI_LISTENS_TTS_LANGUAGE"),
    ttsSpeaker: env("PI_LISTENS_TTS_SPEAKER"),
    ttsPace: parseNumber(env("PI_LISTENS_TTS_PACE")),
    ttsTemperature: parseNumber(env("PI_LISTENS_TTS_TEMPERATURE")),
    ttsSampleRate: parseInteger(env("PI_LISTENS_TTS_SAMPLE_RATE")),
    ttsOutputCodec: parseCodec(env("PI_LISTENS_TTS_OUTPUT_CODEC")),
    recordSeconds: parseInteger(env("PI_LISTENS_RECORD_SECONDS")),
    recordSampleRate: parseInteger(env("PI_LISTENS_RECORD_SAMPLE_RATE")),
    recordMode: parseRecordMode(env("PI_LISTENS_RECORD_MODE")),
    silenceStartSeconds: parseNumber(env("PI_LISTENS_SILENCE_START_SECONDS")),
    silenceStopSeconds: parseNumber(env("PI_LISTENS_SILENCE_STOP_SECONDS")),
    silenceThreshold: env("PI_LISTENS_SILENCE_THRESHOLD"),
    recordCommand: env("PI_LISTENS_RECORD_COMMAND"),
    playCommand: env("PI_LISTENS_PLAY_COMMAND"),
    streamCommand: env("PI_LISTENS_STREAM_COMMAND"),
    streamChunkMs: parseInteger(env("PI_LISTENS_STREAM_CHUNK_MS")),
    streamMaxSeconds: parseInteger(env("PI_LISTENS_STREAM_MAX_SECONDS")),
    audioDir: env("PI_LISTENS_AUDIO_DIR"),
    deleteAudio: parseBoolean(env("PI_LISTENS_DELETE_AUDIO")),
    textFallback: parseBoolean(env("PI_LISTENS_TEXT_FALLBACK")),
    autoSpeakAssistant: parseBoolean(env("PI_LISTENS_AUTO_SPEAK")),
    maxAutoSpeakChars: parseInteger(env("PI_LISTENS_MAX_AUTO_SPEAK_CHARS")),
  };

  return mergeDefined(DEFAULT_CONFIG, fileConfig, envConfig);
}
111
+
112
+ function readJson(path: string): RawConfig {
113
+ if (!existsSync(path)) return {};
114
+ try {
115
+ const parsed = JSON.parse(readFileSync(path, "utf8")) as RawConfig;
116
+ return parsed && typeof parsed === "object" ? parsed : {};
117
+ } catch {
118
+ return {};
119
+ }
120
+ }
121
+
122
+ function env(name: string): string | undefined {
123
+ const value = process.env[name];
124
+ return value && value.trim() ? value.trim() : undefined;
125
+ }
126
+
127
+ function parseBoolean(value: string | undefined): boolean | undefined {
128
+ if (value === undefined) return undefined;
129
+ if (["1", "true", "yes", "on"].includes(value.toLowerCase())) return true;
130
+ if (["0", "false", "no", "off"].includes(value.toLowerCase())) return false;
131
+ return undefined;
132
+ }
133
+
134
+ function parseInteger(value: string | undefined): number | undefined {
135
+ if (value === undefined) return undefined;
136
+ const parsed = Number.parseInt(value, 10);
137
+ return Number.isFinite(parsed) ? parsed : undefined;
138
+ }
139
+
140
+ function parseNumber(value: string | undefined): number | undefined {
141
+ if (value === undefined) return undefined;
142
+ const parsed = Number.parseFloat(value);
143
+ return Number.isFinite(parsed) ? parsed : undefined;
144
+ }
145
+
146
+ function parseSttMode(value: string | undefined): SttMode | undefined {
147
+ if (!value) return undefined;
148
+ const allowed = new Set(["transcribe", "translate", "verbatim", "translit", "codemix"]);
149
+ return allowed.has(value) ? (value as SttMode) : undefined;
150
+ }
151
+
152
+ function parseRecordMode(value: string | undefined): RecordMode | undefined {
153
+ if (!value) return undefined;
154
+ return value === "utterance" || value === "fixed" ? value : undefined;
155
+ }
156
+
157
+ function parseCodec(value: string | undefined): PiListensConfig["ttsOutputCodec"] | undefined {
158
+ if (!value) return undefined;
159
+ const allowed = new Set(["wav", "mp3", "linear16", "mulaw", "alaw", "opus", "flac", "aac"]);
160
+ return allowed.has(value) ? (value as PiListensConfig["ttsOutputCodec"]) : undefined;
161
+ }
162
+
163
+ function mergeDefined(...configs: RawConfig[]): PiListensConfig {
164
+ const merged: Record<string, unknown> = {};
165
+ for (const config of configs) {
166
+ for (const [key, value] of Object.entries(config)) {
167
+ if (value !== undefined) merged[key] = value;
168
+ }
169
+ }
170
+ return merged as unknown as PiListensConfig;
171
+ }
172
+
173
+ export function maskSecret(value: string | undefined): string {
174
+ if (!value) return "not set";
175
+ if (value.length <= 8) return "set";
176
+ return `${value.slice(0, 4)}…${value.slice(-4)}`;
177
+ }
178
+
179
+ export function audioExtensionForCodec(codec: PiListensConfig["ttsOutputCodec"]): string {
180
+ if (codec === "linear16" || codec === "mulaw" || codec === "alaw") return "raw";
181
+ return codec;
182
+ }
package/src/index.ts ADDED
@@ -0,0 +1,84 @@
1
+ import type { ExtensionAPI } from "@earendil-works/pi-coding-agent";
2
+ import { createAudioRuntime, type AudioRuntime } from "./audio.js";
3
+ import { maskSecret, resolveConfig, type PiListensConfig } from "./config.js";
4
+ import { SarvamSpeechClient } from "./sarvam.js";
5
+ import { attachStateToServices, maybeContinueVoiceLoop, registerVoiceCommands, stopVoiceMode, type VoiceModeState } from "./commands.js";
6
+ import { registerVoiceTools, type VoiceToolServices } from "./tools.js";
7
+ import { firstTextContent } from "./text.js";
8
+
9
/**
 * Entry point for the pi-listens extension: wires Sarvam speech services,
 * the audio runtime, and voice tools/commands into the agent's event bus.
 *
 * Config and the audio runtime are rebuilt whenever the working directory
 * changes (session_start / session_tree) so project-level config files
 * take effect without a restart.
 */
export default function piListensExtension(pi: ExtensionAPI) {
  let config: PiListensConfig = resolveConfig(process.cwd());
  let audio: AudioRuntime = createAudioRuntime(config);
  let lastCwd = process.cwd();

  // Closure getter keeps the speech client pointed at the latest config
  // after reloads without recreating the client wrapper.
  const speech = new SarvamSpeechClient(() => config);
  const state: VoiceModeState = {
    enabled: false,
    autoListen: false,
    autoSpeakAssistant: config.autoSpeakAssistant,
    isListening: false,
    status: "idle",
    recordSeconds: config.recordSeconds,
    silenceStopSeconds: config.silenceStopSeconds,
  };

  const services: VoiceToolServices = {
    getConfig: () => config,
    getAudio: () => audio,
    getSpeech: () => speech,
  };
  attachStateToServices(services, state);

  // Re-resolves config for a new cwd. Config-derived state fields are only
  // refreshed while voice mode is off, so a reload cannot yank settings out
  // from under an active voice session.
  function reloadConfig(cwd: string) {
    lastCwd = cwd;
    config = resolveConfig(cwd);
    audio = createAudioRuntime(config);
    if (!state.enabled) { state.autoSpeakAssistant = config.autoSpeakAssistant; state.recordSeconds = config.recordSeconds; state.silenceStopSeconds = config.silenceStopSeconds; }
  }

  registerVoiceTools(pi, services);
  registerVoiceCommands(pi, services, state);

  // On session start: refresh config, report readiness in the status bar,
  // and warn (once) when the API key or audio binaries are missing.
  pi.on("session_start", async (_event, ctx) => {
    reloadConfig(ctx.cwd);
    const audioInfo = audio.describe();
    const ready = Boolean(config.apiKey) && audioInfo.recorder !== "missing" && audioInfo.player !== "missing";
    ctx.ui.setStatus("pi-listens", state.enabled ? "voice on" : ready ? "voice ready" : "voice setup needed");
    if (!ready) {
      ctx.ui.notify(
        [
          "pi-listens is loaded but not fully ready.",
          `Sarvam API key: ${maskSecret(config.apiKey)}`,
          `Recorder: ${audioInfo.recorder}`,
          `Player: ${audioInfo.player}`,
          "Run /voice-status or call voice_setup_check for details.",
        ].join("\n"),
        "warning",
      );
    }
  });

  pi.on("session_shutdown", async (_event, ctx) => {
    stopVoiceMode(services, state, ctx);
  });

  // Append voice-interaction guidance to the system prompt for every run.
  pi.on("before_agent_start", async (event) => {
    return {
      systemPrompt: `${event.systemPrompt}\n\nPi Listens voice guidance:\n- The user may primarily interact by speech through Sarvam AI. Text input is still possible.\n- When voice mode is active, treat it as a hands-free conversation: listen only while the voice UI/input tool is active, then pause listening while you work.\n- Use voice_output for concise spoken progress, completion, or status updates that matter to the user.\n- When you need clarification, confirmation, or any user input, prefer voice_ask with a concise spoken question instead of asking only in text.\n- Use voice_input only after the user already knows you are listening.\n- Do not speak code blocks, logs, diffs, stack traces, or long explanations; summarize them briefly and leave detail in text.`,
    };
  });

  // Track the latest assistant text so the voice loop can speak it later.
  pi.on("message_end", async (event) => {
    if (event.message.role !== "assistant") return;
    state.lastAssistantText = firstTextContent(event.message);
  });

  pi.on("agent_end", async (_event, ctx) => {
    await maybeContinueVoiceLoop(pi, services, state, ctx);
  });

  // session_tree may fire without a cwd; fall back to the last known one.
  pi.on("session_tree", async (_event, ctx) => {
    reloadConfig(ctx.cwd || lastCwd);
  });
}
package/src/sarvam.ts ADDED
@@ -0,0 +1,311 @@
1
+ import { readFile, writeFile } from "node:fs/promises";
2
+ import { setTimeout as delay } from "node:timers/promises";
3
+ import { SarvamAIClient } from "sarvamai";
4
+ import type { AudioRuntime } from "./audio.js";
5
+ import type { PiListensConfig, SttMode } from "./config.js";
6
+
7
/** Final transcription outcome assembled from streaming STT data frames. */
export interface TranscriptionResult {
  /** Merged, trimmed transcript; may be empty when no speech was recognized. */
  transcript: string;
  /** Detected language, when the service reports one. */
  languageCode?: string;
  /** Service confidence for languageCode, when reported. */
  languageProbability?: number;
  /** Server-side request id of the last data frame, useful for support. */
  requestId?: string;
}

/** Location and size of a synthesized speech file written to disk. */
export interface SynthesisResult {
  path: string;
  bytes: number;
}

// Payload of a Sarvam streaming STT frame. Field names are the service's
// snake_case wire format — do not rename.
type StreamingData = {
  transcript?: string;
  request_id?: string;
  language_code?: string;
  language_probability?: number;
  error?: string;
  code?: string;
  event_type?: string;
  signal_type?: string;
};

// Envelope around StreamingData; `type` distinguishes data/error/event
// frames (kept open as `string` for forward compatibility).
type StreamingResponse = {
  type?: "data" | "error" | "events" | string;
  data?: StreamingData;
};

// Minimal surface of the raw WebSocket wrapper returned by
// connectStreamingSocket; handlers are additive (Set-backed).
type StreamingSocket = {
  transcribe(params: { audio: string; sample_rate: number; encoding: "audio/wav" }): void;
  flush(): void;
  close(): void;
  waitForOpen(): Promise<void>;
  onMessage(handler: (message: StreamingResponse) => void): void;
  onError(handler: (error: Error) => void): void;
};
43
+
44
/**
 * Thin client over Sarvam's speech APIs.
 *
 * TTS goes through the official SarvamAIClient SDK; streaming STT uses a
 * raw WebSocket (see connectStreamingSocket) so the `mode` query parameter
 * can be controlled directly. The config getter is re-read on every call,
 * so config reloads take effect without recreating this object.
 */
export class SarvamSpeechClient {
  // SDK client is cached per API key; rebuilt when the key changes.
  private client: SarvamAIClient | null = null;
  private clientKey: string | null = null;

  constructor(private readonly getConfig: () => PiListensConfig) {}

  /**
   * Records from the microphone and streams PCM chunks to STT until the
   * caller aborts, the max duration elapses, or trailing silence follows
   * detected speech. Resolves with the merged transcript.
   */
  async transcribeMicrophone(audio: AudioRuntime, signal?: AbortSignal, options: { seconds?: number; mode?: SttMode } = {}): Promise<TranscriptionResult> {
    const config = this.getConfig();
    return this.withStreamingSocket(signal, options.mode, "pcm_s16le", async (socket, collect) => {
      // Separate controller so we can stop the recorder without requiring
      // the caller's signal to fire.
      const recorderController = new AbortController();
      const stopRecorder = () => recorderController.abort();
      signal?.addEventListener("abort", stopRecorder, { once: true });
      const startedAt = Date.now();
      let speechStarted = false;
      let lastVoiceAt = Date.now();
      let pending = Buffer.alloc(0);
      // Bytes per outgoing chunk: sampleRate * 2 bytes/sample * chunk
      // duration, floored at 1600 bytes (50 ms of 16 kHz PCM).
      const chunkBytes = Math.max(1600, Math.round(config.recordSampleRate * 2 * (config.streamChunkMs / 1000)));
      const maxSeconds = Math.max(1, Math.round(options.seconds ?? config.streamMaxSeconds ?? config.recordSeconds));

      const streamSignal = combineSignals(signal, recorderController.signal);
      try {
        for await (const chunk of audio.streamPcm(streamSignal.signal)) {
          if (signal?.aborted) throw new Error("Cancelled");
          const now = Date.now();
          // Energy-based voice activity detection on the raw PCM.
          const rms = pcm16Rms(chunk);
          if (rms > silenceThresholdAmplitude(config.silenceThreshold)) {
            speechStarted = true;
            lastVoiceAt = now;
          }

          // Buffer until we have a full chunk, then ship fixed-size pieces.
          pending = Buffer.concat([pending, chunk]);
          while (pending.byteLength >= chunkBytes) {
            const audioChunk = pending.subarray(0, chunkBytes);
            pending = pending.subarray(chunkBytes);
            // NOTE(review): the socket was opened with input_audio_codec
            // "pcm_s16le" but the frame says encoding "audio/wav" — confirm
            // against Sarvam's streaming STT docs that this is intended.
            socket.transcribe({ audio: audioChunk.toString("base64"), sample_rate: config.recordSampleRate, encoding: "audio/wav" });
          }

          // Stop on hard duration cap, or once speech has started and the
          // configured trailing silence has elapsed.
          const hitMaxDuration = now - startedAt >= maxSeconds * 1000;
          const hitTrailingSilence = speechStarted && now - lastVoiceAt >= config.silenceStopSeconds * 1000;
          if (hitMaxDuration || hitTrailingSilence) break;
        }
        // Flush any sub-chunk remainder before asking the server to finalize.
        if (pending.byteLength > 0) {
          socket.transcribe({ audio: pending.toString("base64"), sample_rate: config.recordSampleRate, encoding: "audio/wav" });
        }
        socket.flush();
        await collect();
      } finally {
        streamSignal.cleanup();
        recorderController.abort();
        signal?.removeEventListener("abort", stopRecorder);
      }
    });
  }

  /**
   * Transcribes an audio file by sending it as a single base64 frame over
   * the streaming socket, then waiting for the final transcript.
   */
  async transcribeFile(path: string, signal?: AbortSignal, options: { mode?: SttMode } = {}): Promise<TranscriptionResult> {
    const config = this.getConfig();
    const audio = await readFile(path);
    return this.withStreamingSocket(signal, options.mode, "wav", async (socket, collect) => {
      // NOTE(review): sample_rate is taken from recordSampleRate, not from
      // the file's actual rate — verify this matches the files produced by
      // the audio runtime.
      socket.transcribe({ audio: audio.toString("base64"), sample_rate: config.recordSampleRate, encoding: "audio/wav" });
      socket.flush();
      await collect();
    });
  }

  /**
   * Synthesizes speech for `text` via the SDK and writes the audio to
   * `path`. Returns the path and byte count.
   */
  async synthesizeToFile(text: string, path: string, signal?: AbortSignal): Promise<SynthesisResult> {
    const config = this.getConfig();
    const client = this.getClient(config);
    // `as never` casts bridge the SDK's literal-union parameter types to the
    // free-form strings held in config.
    const response = await client.textToSpeech.convertStream(
      {
        text,
        target_language_code: config.ttsLanguageCode as never,
        speaker: config.ttsSpeaker as never,
        model: config.ttsModel as never,
        pace: config.ttsPace,
        temperature: config.ttsTemperature,
        speech_sample_rate: config.ttsSampleRate as never,
        enable_preprocessing: true,
        output_audio_codec: config.ttsOutputCodec as never,
      },
      { abortSignal: signal },
    );
    const arrayBuffer = await response.arrayBuffer();
    const buffer = Buffer.from(arrayBuffer);
    await writeFile(path, buffer);
    return { path, bytes: buffer.byteLength };
  }

  /**
   * Opens a streaming STT socket, accumulates transcript/metadata from
   * incoming data frames, runs `streamAudio` to feed it, and returns the
   * merged result. `collect` waits briefly for trailing frames after the
   * flush; the socket is always closed on exit.
   */
  private async withStreamingSocket(
    signal: AbortSignal | undefined,
    mode: SttMode | undefined,
    inputAudioCodec: "wav" | "pcm_s16le",
    streamAudio: (
      socket: StreamingSocket,
      collect: () => Promise<void>,
    ) => Promise<void>,
  ): Promise<TranscriptionResult> {
    const config = this.getConfig();
    this.getClient(config); // validate/cache API key for TTS; STT uses raw WebSocket so the documented `mode` query parameter is preserved.
    let transcript = "";
    let requestId: string | undefined;
    let languageCode: string | undefined;
    let languageProbability: number | undefined;
    let streamError: Error | undefined;
    let lastMessageAt = Date.now();

    // translateInputToEnglish (default true) overrides configured sttMode
    // unless the caller passes an explicit mode.
    const socket = connectStreamingSocket(config, mode ?? (config.translateInputToEnglish ? "translate" : config.sttMode), inputAudioCodec);

    const closeOnAbort = () => socket.close();
    signal?.addEventListener("abort", closeOnAbort, { once: true });
    socket.onMessage((message: StreamingResponse) => {
      lastMessageAt = Date.now();
      if (message.type === "error") {
        streamError = new Error(message.data?.error ?? message.data?.code ?? "Sarvam streaming STT failed");
        return;
      }
      if (message.type !== "data") return;
      const data = message.data;
      if (!data) return;
      transcript = mergeTranscript(transcript, data.transcript ?? "");
      requestId = data.request_id ?? requestId;
      languageCode = data.language_code ?? languageCode;
      languageProbability = data.language_probability ?? languageProbability;
    });
    socket.onError((error: Error) => { streamError = error; });

    try {
      await socket.waitForOpen();
      await streamAudio(socket, async () => {
        // Heuristic settle loop: wait at most 3 s total; finish early once
        // we have a transcript and the socket has been quiet for >850 ms.
        const startedWaitingAt = Date.now();
        while (Date.now() - startedWaitingAt < 3000) {
          if (streamError) throw streamError;
          if (Date.now() - lastMessageAt > 850 && transcript.trim()) break;
          await delay(100, undefined, { signal }).catch((err) => { throw err; });
        }
      });
      if (streamError) throw streamError;
      return { transcript: transcript.trim(), languageCode, languageProbability, requestId };
    } finally {
      signal?.removeEventListener("abort", closeOnAbort);
      socket.close();
    }
  }

  /** Returns the SDK client, rebuilding it when the API key changes; throws when no key is configured. */
  private getClient(config: PiListensConfig): SarvamAIClient {
    if (!config.apiKey) {
      throw new Error("Sarvam API key is not configured. Set SARVAM_API_KEY or run with a pi-listens config file.");
    }
    if (!this.client || this.clientKey !== config.apiKey) {
      this.client = new SarvamAIClient({ apiSubscriptionKey: config.apiKey });
      this.clientKey = config.apiKey;
    }
    return this.client;
  }
}
198
+
199
+ function mergeTranscript(existing: string, incoming: string): string {
200
+ const previous = existing.trim();
201
+ const next = incoming.trim();
202
+ if (!next) return previous;
203
+ if (!previous) return next;
204
+ if (next === previous || previous.endsWith(next) || previous.includes(next)) return previous;
205
+ if (next.startsWith(previous)) return next;
206
+ return `${previous} ${next}`;
207
+ }
208
+
209
+ function pcm16Rms(buffer: Buffer): number {
210
+ let total = 0;
211
+ let count = 0;
212
+ for (let offset = 0; offset + 1 < buffer.byteLength; offset += 2) {
213
+ const sample = buffer.readInt16LE(offset);
214
+ total += sample * sample;
215
+ count++;
216
+ }
217
+ return count ? Math.sqrt(total / count) : 0;
218
+ }
219
+
220
+ function silenceThresholdAmplitude(threshold: string): number {
221
+ const trimmed = threshold.trim();
222
+ if (trimmed.endsWith("%")) {
223
+ const percent = Number.parseFloat(trimmed.slice(0, -1));
224
+ if (Number.isFinite(percent)) return 32767 * (percent / 100);
225
+ }
226
+ const numeric = Number.parseFloat(trimmed);
227
+ return Number.isFinite(numeric) ? numeric : 327;
228
+ }
229
+
230
/**
 * Opens a raw WebSocket to Sarvam's streaming STT endpoint and wraps it in
 * the StreamingSocket interface.
 *
 * The API key is passed via the WebSocket subprotocol
 * (`api-subscription-key.<key>`) rather than a header, since browser-style
 * WebSocket constructors cannot set headers. All tuning knobs travel as
 * query parameters.
 */
function connectStreamingSocket(config: PiListensConfig, mode: SttMode, inputAudioCodec: "wav" | "pcm_s16le"): StreamingSocket {
  if (!config.apiKey) {
    throw new Error("Sarvam API key is not configured. Set SARVAM_API_KEY or run with a pi-listens config file.");
  }
  const url = new URL("wss://api.sarvam.ai/speech-to-text/ws");
  url.searchParams.set("language-code", config.sttLanguageCode);
  url.searchParams.set("model", config.sttModel);
  url.searchParams.set("mode", mode);
  url.searchParams.set("input_audio_codec", inputAudioCodec);
  url.searchParams.set("sample_rate", String(config.recordSampleRate));
  // NOTE(review): these three flags mirror the documented streaming options
  // (aggressive VAD, VAD event frames, explicit flush support) — confirm
  // against current Sarvam streaming STT docs.
  url.searchParams.set("high_vad_sensitivity", "true");
  url.searchParams.set("vad_signals", "true");
  url.searchParams.set("flush_signal", "true");

  const ws = new WebSocket(url, [`api-subscription-key.${config.apiKey}`]);
  // Handlers are Sets so callers can register multiple listeners; there is
  // deliberately no removal API — the socket is short-lived.
  const messageHandlers = new Set<(message: StreamingResponse) => void>();
  const errorHandlers = new Set<(error: Error) => void>();
  ws.addEventListener("message", (event) => {
    try {
      const parsed = JSON.parse(String(event.data)) as StreamingResponse;
      for (const handler of messageHandlers) handler(parsed);
    } catch (err) {
      // A frame we cannot parse is surfaced as a stream error rather than
      // silently dropped.
      const error = err instanceof Error ? err : new Error(String(err));
      for (const handler of errorHandlers) handler(error);
    }
  });
  ws.addEventListener("error", () => {
    for (const handler of errorHandlers) handler(new Error("Sarvam streaming WebSocket error"));
  });

  return {
    // Sends one base64 audio frame in the service's wire format.
    transcribe(params) {
      ws.send(JSON.stringify({ audio: { data: params.audio, sample_rate: params.sample_rate, encoding: params.encoding } }));
    },
    // Asks the server to finalize the pending transcript.
    flush() {
      ws.send(JSON.stringify({ type: "flush" }));
    },
    // Safe to call repeatedly; only closes sockets not already closing.
    close() {
      if (ws.readyState === WebSocket.OPEN || ws.readyState === WebSocket.CONNECTING) ws.close();
    },
    // Resolves once the socket is open; rejects on error or premature close.
    waitForOpen() {
      if (ws.readyState === WebSocket.OPEN) return Promise.resolve();
      return new Promise<void>((resolve, reject) => {
        const cleanup = () => {
          ws.removeEventListener("open", onOpen);
          ws.removeEventListener("error", onError);
          ws.removeEventListener("close", onClose);
        };
        const onOpen = () => { cleanup(); resolve(); };
        const onError = () => { cleanup(); reject(new Error("Sarvam streaming WebSocket failed to open")); };
        const onClose = () => { cleanup(); reject(new Error("Sarvam streaming WebSocket closed before opening")); };
        ws.addEventListener("open", onOpen, { once: true });
        ws.addEventListener("error", onError, { once: true });
        ws.addEventListener("close", onClose, { once: true });
      });
    },
    onMessage(handler) { messageHandlers.add(handler); },
    onError(handler) { errorHandlers.add(handler); },
  };
}
290
+
291
+ type CombinedSignal = { signal?: AbortSignal; cleanup: () => void };
292
+
293
+ function combineSignals(...signals: Array<AbortSignal | undefined>): CombinedSignal {
294
+ const active = signals.filter((signal): signal is AbortSignal => Boolean(signal));
295
+ if (active.length === 0) return { signal: undefined, cleanup: () => undefined };
296
+ if (active.length === 1) return { signal: active[0], cleanup: () => undefined };
297
+ const controller = new AbortController();
298
+ const attached: AbortSignal[] = [];
299
+ const abort = () => controller.abort();
300
+ for (const signal of active) {
301
+ if (signal.aborted) { controller.abort(); break; }
302
+ signal.addEventListener("abort", abort, { once: true });
303
+ attached.push(signal);
304
+ }
305
+ return {
306
+ signal: controller.signal,
307
+ cleanup: () => {
308
+ for (const signal of attached) signal.removeEventListener("abort", abort);
309
+ },
310
+ };
311
+ }
package/src/text.ts ADDED
@@ -0,0 +1,33 @@
1
+ export function firstTextContent(message: unknown): string {
2
+ if (!message || typeof message !== "object") return "";
3
+ const content = (message as { content?: unknown }).content;
4
+ if (typeof content === "string") return content;
5
+ if (!Array.isArray(content)) return "";
6
+ return content
7
+ .map((part) => {
8
+ if (!part || typeof part !== "object") return "";
9
+ const p = part as { type?: string; text?: string };
10
+ return p.type === "text" && typeof p.text === "string" ? p.text : "";
11
+ })
12
+ .filter(Boolean)
13
+ .join("\n")
14
+ .trim();
15
+ }
16
+
17
+ export function prepareSpokenText(text: string, maxChars: number): string {
18
+ let prepared = text
19
+ .replace(/```[\s\S]*?```/g, " I am skipping a code block. ")
20
+ .replace(/`([^`]+)`/g, "$1")
21
+ .replace(/https?:\/\/\S+/g, "link")
22
+ .replace(/\s+/g, " ")
23
+ .trim();
24
+ if (prepared.length > maxChars) {
25
+ prepared = `${prepared.slice(0, Math.max(0, maxChars - 80)).trim()}… I have more details on screen.`;
26
+ }
27
+ return prepared;
28
+ }
29
+
30
+ export function conciseTranscript(transcript: string): string {
31
+ const trimmed = transcript.trim();
32
+ return trimmed.length === 0 ? "(no speech recognized)" : trimmed;
33
+ }