@openclaw/voice-call 2026.3.13 → 2026.5.2-beta.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +27 -5
- package/api.ts +16 -0
- package/cli-metadata.ts +10 -0
- package/config-api.ts +12 -0
- package/index.test.ts +943 -0
- package/index.ts +379 -149
- package/openclaw.plugin.json +384 -157
- package/package.json +35 -5
- package/runtime-api.ts +20 -0
- package/runtime-entry.ts +1 -0
- package/setup-api.ts +47 -0
- package/src/allowlist.test.ts +18 -0
- package/src/cli.ts +533 -68
- package/src/config-compat.test.ts +120 -0
- package/src/config-compat.ts +227 -0
- package/src/config.test.ts +273 -12
- package/src/config.ts +355 -72
- package/src/core-bridge.ts +2 -147
- package/src/deep-merge.test.ts +40 -0
- package/src/gateway-continue-operation.ts +200 -0
- package/src/http-headers.ts +6 -3
- package/src/manager/context.ts +6 -5
- package/src/manager/events.test.ts +243 -19
- package/src/manager/events.ts +61 -31
- package/src/manager/lifecycle.ts +53 -0
- package/src/manager/lookup.test.ts +52 -0
- package/src/manager/outbound.test.ts +528 -0
- package/src/manager/outbound.ts +163 -57
- package/src/manager/store.ts +18 -6
- package/src/manager/timers.test.ts +129 -0
- package/src/manager/timers.ts +4 -3
- package/src/manager/twiml.test.ts +13 -0
- package/src/manager/twiml.ts +8 -0
- package/src/manager.closed-loop.test.ts +30 -12
- package/src/manager.inbound-allowlist.test.ts +77 -10
- package/src/manager.notify.test.ts +344 -20
- package/src/manager.restore.test.ts +95 -8
- package/src/manager.test-harness.ts +8 -6
- package/src/manager.ts +79 -5
- package/src/media-stream.test.ts +578 -81
- package/src/media-stream.ts +235 -54
- package/src/providers/base.ts +19 -0
- package/src/providers/mock.ts +7 -1
- package/src/providers/plivo.test.ts +50 -6
- package/src/providers/plivo.ts +14 -6
- package/src/providers/shared/call-status.ts +2 -1
- package/src/providers/shared/guarded-json-api.test.ts +106 -0
- package/src/providers/shared/guarded-json-api.ts +1 -1
- package/src/providers/telnyx.test.ts +178 -6
- package/src/providers/telnyx.ts +40 -3
- package/src/providers/twilio/api.test.ts +145 -0
- package/src/providers/twilio/api.ts +67 -16
- package/src/providers/twilio/twiml-policy.ts +6 -10
- package/src/providers/twilio/webhook.ts +1 -1
- package/src/providers/twilio.test.ts +425 -25
- package/src/providers/twilio.ts +230 -77
- package/src/providers/twilio.types.ts +17 -0
- package/src/realtime-defaults.ts +3 -0
- package/src/realtime-fast-context.test.ts +88 -0
- package/src/realtime-fast-context.ts +165 -0
- package/src/realtime-transcription.runtime.ts +4 -0
- package/src/realtime-voice.runtime.ts +5 -0
- package/src/response-generator.test.ts +321 -0
- package/src/response-generator.ts +213 -53
- package/src/response-model.test.ts +71 -0
- package/src/response-model.ts +23 -0
- package/src/runtime.test.ts +429 -0
- package/src/runtime.ts +270 -24
- package/src/telephony-audio.test.ts +61 -0
- package/src/telephony-audio.ts +1 -79
- package/src/telephony-tts.test.ts +133 -12
- package/src/telephony-tts.ts +155 -2
- package/src/test-fixtures.ts +28 -7
- package/src/tts-provider-voice.test.ts +34 -0
- package/src/tts-provider-voice.ts +21 -0
- package/src/tunnel.test.ts +166 -0
- package/src/tunnel.ts +1 -1
- package/src/types.ts +24 -37
- package/src/utils.test.ts +17 -0
- package/src/voice-mapping.test.ts +34 -0
- package/src/voice-mapping.ts +3 -2
- package/src/webhook/realtime-handler.test.ts +598 -0
- package/src/webhook/realtime-handler.ts +485 -0
- package/src/webhook/stale-call-reaper.test.ts +88 -0
- package/src/webhook/stale-call-reaper.ts +5 -0
- package/src/webhook/tailscale.test.ts +214 -0
- package/src/webhook/tailscale.ts +19 -5
- package/src/webhook-exposure.test.ts +33 -0
- package/src/webhook-exposure.ts +84 -0
- package/src/webhook-security.test.ts +172 -21
- package/src/webhook-security.ts +43 -29
- package/src/webhook.hangup-once.lifecycle.test.ts +135 -0
- package/src/webhook.test.ts +1145 -27
- package/src/webhook.ts +523 -102
- package/src/webhook.types.ts +5 -0
- package/src/websocket-test-support.ts +72 -0
- package/tsconfig.json +16 -0
- package/CHANGELOG.md +0 -121
- package/src/providers/index.ts +0 -10
- package/src/providers/stt-openai-realtime.test.ts +0 -42
- package/src/providers/stt-openai-realtime.ts +0 -311
- package/src/providers/tts-openai.test.ts +0 -43
- package/src/providers/tts-openai.ts +0 -221
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
import { once } from "node:events";
|
|
2
|
+
import http from "node:http";
|
|
3
|
+
import { WebSocket } from "ws";
|
|
4
|
+
|
|
5
|
+
/**
 * Awaits `promise`, but gives up once `timeoutMs` milliseconds have elapsed.
 *
 * @param promise - The operation to wait for.
 * @param timeoutMs - Maximum time to wait, in milliseconds (default 2000).
 * @returns The value `promise` resolves with.
 * @throws An `Error` reading `Timed out after <timeoutMs>ms` when the deadline
 *   passes first; otherwise whatever `promise` rejects with.
 */
export const withTimeout = async <T>(promise: Promise<T>, timeoutMs = 2000): Promise<T> => {
  let handle: ReturnType<typeof setTimeout> | undefined;

  const deadline = new Promise<never>((_resolve, fail) => {
    handle = setTimeout(() => {
      fail(new Error(`Timed out after ${timeoutMs}ms`));
    }, timeoutMs);
  });

  try {
    const winner = await Promise.race([promise, deadline]);
    return winner;
  } finally {
    // Always release the timer so a settled race does not leave it pending.
    if (handle !== undefined) {
      clearTimeout(handle);
    }
  }
};
|
|
19
|
+
|
|
20
|
+
/**
 * Starts an HTTP server on an ephemeral loopback port and forwards every
 * `upgrade` request to `params.onUpgrade`.
 *
 * @param params.urlPath - Path appended to the returned WebSocket URL.
 * @param params.onUpgrade - Called with the request, socket and head of each upgrade.
 * @returns The `ws://127.0.0.1:<port><urlPath>` URL plus a `close` function
 *   that stops the server and rejects if closing fails.
 * @throws If the server fails to start listening, or if its bound address
 *   cannot be resolved.
 */
export const startUpgradeWsServer = async (params: {
  urlPath: string;
  onUpgrade: (
    request: http.IncomingMessage,
    socket: Parameters<http.Server["emit"]>[2],
    head: Buffer,
  ) => void;
}): Promise<{
  url: string;
  close: () => Promise<void>;
}> => {
  const server = http.createServer();
  server.on("upgrade", (request, socket, head) => {
    params.onUpgrade(request, socket, head);
  });

  await new Promise<void>((resolve, reject) => {
    // Surface bind failures as a rejection instead of an unhandled "error" event.
    const onListenError = (err: Error): void => reject(err);
    server.once("error", onListenError);
    server.listen(0, "127.0.0.1", () => {
      server.off("error", onListenError);
      resolve();
    });
  });

  const address = server.address();
  if (!address || typeof address === "string") {
    // Do not leave a listening server behind when we are about to fail.
    server.close();
    throw new Error("Failed to resolve test server address");
  }

  return {
    url: `ws://127.0.0.1:${address.port}${params.urlPath}`,
    close: async () => {
      await new Promise<void>((resolve, reject) => {
        server.close((err) => (err ? reject(err) : resolve()));
      });
    },
  };
};
|
|
54
|
+
|
|
55
|
+
/**
 * Opens a WebSocket client to `url` and resolves once it emits `open`.
 *
 * @param url - The WebSocket URL to connect to.
 * @returns The connected socket.
 * @throws If the socket errors before opening or the wait times out. In both
 *   cases the socket is terminated so a failed connect does not leak a handle.
 */
export const connectWs = async (url: string): Promise<WebSocket> => {
  const ws = new WebSocket(url);
  try {
    await withTimeout(once(ws, "open") as Promise<[unknown]>);
  } catch (err) {
    // Terminating a socket that is still connecting emits "error"; keep a
    // listener attached so that emission cannot surface as an uncaught exception.
    ws.on("error", () => {});
    ws.terminate();
    throw err;
  }
  return ws;
};
|
|
60
|
+
|
|
61
|
+
/**
 * Waits for `ws` to emit `close` and reports the close code and reason.
 *
 * @param ws - The socket expected to close.
 * @returns The close `code` and the `reason` as a string (UTF-8 decoded when
 *   the event supplied a Buffer, empty string when no reason was given).
 * @throws If the wait times out or the socket emits `error` first.
 */
export const waitForClose = async (
  ws: WebSocket,
): Promise<{
  code: number;
  reason: string;
}> => {
  const closeEvent = once(ws, "close") as Promise<[number, Buffer]>;
  const [code, reason] = (await withTimeout(closeEvent)) ?? [];

  let reasonText: string;
  if (Buffer.isBuffer(reason)) {
    reasonText = reason.toString("utf8");
  } else {
    reasonText = String(reason || "");
  }

  return { code, reason: reasonText };
};
|
package/tsconfig.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
{
|
|
2
|
+
"extends": "../tsconfig.package-boundary.base.json",
|
|
3
|
+
"compilerOptions": {
|
|
4
|
+
"rootDir": "."
|
|
5
|
+
},
|
|
6
|
+
"include": ["./*.ts", "./src/**/*.ts"],
|
|
7
|
+
"exclude": [
|
|
8
|
+
"./**/*.test.ts",
|
|
9
|
+
"./dist/**",
|
|
10
|
+
"./node_modules/**",
|
|
11
|
+
"./src/test-support/**",
|
|
12
|
+
"./src/**/*test-helpers.ts",
|
|
13
|
+
"./src/**/*test-harness.ts",
|
|
14
|
+
"./src/**/*test-support.ts"
|
|
15
|
+
]
|
|
16
|
+
}
|
package/CHANGELOG.md
DELETED
|
@@ -1,121 +0,0 @@
|
|
|
1
|
-
# Changelog
|
|
2
|
-
|
|
3
|
-
## 2026.3.13
|
|
4
|
-
|
|
5
|
-
### Changes
|
|
6
|
-
|
|
7
|
-
- Version alignment with core OpenClaw release numbers.
|
|
8
|
-
|
|
9
|
-
## 2026.3.12
|
|
10
|
-
|
|
11
|
-
### Changes
|
|
12
|
-
|
|
13
|
-
- Version alignment with core OpenClaw release numbers.
|
|
14
|
-
|
|
15
|
-
## 2026.3.11
|
|
16
|
-
|
|
17
|
-
### Changes
|
|
18
|
-
|
|
19
|
-
- Version alignment with core OpenClaw release numbers.
|
|
20
|
-
|
|
21
|
-
## 2026.3.10
|
|
22
|
-
|
|
23
|
-
### Changes
|
|
24
|
-
|
|
25
|
-
- Version alignment with core OpenClaw release numbers.
|
|
26
|
-
|
|
27
|
-
## 2026.3.9
|
|
28
|
-
|
|
29
|
-
### Changes
|
|
30
|
-
|
|
31
|
-
- Version alignment with core OpenClaw release numbers.
|
|
32
|
-
|
|
33
|
-
## 2026.3.8-beta.1
|
|
34
|
-
|
|
35
|
-
### Changes
|
|
36
|
-
|
|
37
|
-
- Version alignment with core OpenClaw release numbers.
|
|
38
|
-
|
|
39
|
-
## 2026.3.8
|
|
40
|
-
|
|
41
|
-
### Changes
|
|
42
|
-
|
|
43
|
-
- Version alignment with core OpenClaw release numbers.
|
|
44
|
-
|
|
45
|
-
## 2026.3.7
|
|
46
|
-
|
|
47
|
-
### Changes
|
|
48
|
-
|
|
49
|
-
- Version alignment with core OpenClaw release numbers.
|
|
50
|
-
|
|
51
|
-
## 2026.3.3
|
|
52
|
-
|
|
53
|
-
### Changes
|
|
54
|
-
|
|
55
|
-
- Version alignment with core OpenClaw release numbers.
|
|
56
|
-
|
|
57
|
-
## 2026.3.2
|
|
58
|
-
|
|
59
|
-
### Changes
|
|
60
|
-
|
|
61
|
-
- Version alignment with core OpenClaw release numbers.
|
|
62
|
-
|
|
63
|
-
## 2026.3.1
|
|
64
|
-
|
|
65
|
-
### Changes
|
|
66
|
-
|
|
67
|
-
- Version alignment with core OpenClaw release numbers.
|
|
68
|
-
|
|
69
|
-
## 2026.2.26
|
|
70
|
-
|
|
71
|
-
### Changes
|
|
72
|
-
|
|
73
|
-
- Version alignment with core OpenClaw release numbers.
|
|
74
|
-
|
|
75
|
-
## 2026.2.25
|
|
76
|
-
|
|
77
|
-
### Changes
|
|
78
|
-
|
|
79
|
-
- Version alignment with core OpenClaw release numbers.
|
|
80
|
-
|
|
81
|
-
## 2026.2.24
|
|
82
|
-
|
|
83
|
-
### Changes
|
|
84
|
-
|
|
85
|
-
- Version alignment with core OpenClaw release numbers.
|
|
86
|
-
|
|
87
|
-
## 2026.2.22
|
|
88
|
-
|
|
89
|
-
### Changes
|
|
90
|
-
|
|
91
|
-
- Version alignment with core OpenClaw release numbers.
|
|
92
|
-
|
|
93
|
-
## 2026.1.26
|
|
94
|
-
|
|
95
|
-
### Changes
|
|
96
|
-
|
|
97
|
-
- Breaking: voice-call TTS now uses core `messages.tts` (plugin TTS config deep‑merges with core).
|
|
98
|
-
- Telephony TTS supports OpenAI + ElevenLabs; Edge TTS is ignored for calls.
|
|
99
|
-
- Removed legacy `tts.model`/`tts.voice`/`tts.instructions` plugin fields.
|
|
100
|
-
- Ngrok free-tier bypass renamed to `tunnel.allowNgrokFreeTierLoopbackBypass` and gated to loopback + `tunnel.provider="ngrok"`.
|
|
101
|
-
|
|
102
|
-
## 0.1.0
|
|
103
|
-
|
|
104
|
-
### Highlights
|
|
105
|
-
|
|
106
|
-
- First public release of the @openclaw/voice-call plugin.
|
|
107
|
-
|
|
108
|
-
### Features
|
|
109
|
-
|
|
110
|
-
- Providers: Twilio (Programmable Voice + Media Streams), Telnyx (Call Control v2), and mock provider for local dev.
|
|
111
|
-
- Call flows: outbound notify vs. conversation modes, configurable auto‑hangup, and multi‑turn continuation.
|
|
112
|
-
- Inbound handling: policy controls (disabled/allowlist/open), allowlist matching, and inbound greeting.
|
|
113
|
-
- Webhooks: built‑in server with configurable bind/port/path plus `publicUrl` override.
|
|
114
|
-
- Exposure helpers: ngrok + Tailscale serve/funnel; dev‑only signature bypass for ngrok free tier.
|
|
115
|
-
- Streaming: OpenAI Realtime STT over media WebSocket with partial + final transcripts.
|
|
116
|
-
- Speech: OpenAI TTS (model/voice/instructions) with Twilio `<Say>` fallback.
|
|
117
|
-
- Tooling: `voice_call` tool actions for initiate/continue/speak/end/status.
|
|
118
|
-
- Gateway RPC: `voicecall.initiate|continue|speak|end|status` (+ legacy `voicecall.start`).
|
|
119
|
-
- CLI: `openclaw voicecall` commands (call/start/continue/speak/end/status/tail/expose).
|
|
120
|
-
- Observability: JSONL call logs and `voicecall tail` for live inspection.
|
|
121
|
-
- Response controls: `responseModel`, `responseSystemPrompt`, and `responseTimeoutMs` for auto‑responses.
|
package/src/providers/index.ts
DELETED
|
@@ -1,10 +0,0 @@
|
|
|
1
|
-
export type { VoiceCallProvider } from "./base.js";
|
|
2
|
-
export { MockProvider } from "./mock.js";
|
|
3
|
-
export {
|
|
4
|
-
OpenAIRealtimeSTTProvider,
|
|
5
|
-
type RealtimeSTTConfig,
|
|
6
|
-
type RealtimeSTTSession,
|
|
7
|
-
} from "./stt-openai-realtime.js";
|
|
8
|
-
export { TelnyxProvider } from "./telnyx.js";
|
|
9
|
-
export { TwilioProvider } from "./twilio.js";
|
|
10
|
-
export { PlivoProvider } from "./plivo.js";
|
|
@@ -1,42 +0,0 @@
|
|
|
1
|
-
import { describe, expect, it } from "vitest";
|
|
2
|
-
import type { RealtimeSTTConfig } from "./stt-openai-realtime.js";
|
|
3
|
-
import { OpenAIRealtimeSTTProvider } from "./stt-openai-realtime.js";
|
|
4
|
-
|
|
5
|
-
/** The private provider fields these tests inspect. */
type ProviderInternals = {
  vadThreshold: number;
  silenceDurationMs: number;
};

/**
 * Builds a provider from `config` and exposes the private VAD settings it stored.
 * The cast goes through `unknown` because the fields are declared `private`.
 */
function readProviderInternals(config: RealtimeSTTConfig): ProviderInternals {
  const internals = new OpenAIRealtimeSTTProvider(config) as unknown as Record<string, unknown>;
  const vadThreshold = internals["vadThreshold"] as number;
  const silenceDurationMs = internals["silenceDurationMs"] as number;
  return { vadThreshold, silenceDurationMs };
}
|
|
17
|
-
|
|
18
|
-
// Verifies that explicit zero values survive construction and that omitted
// values fall back to the documented defaults.
describe("OpenAIRealtimeSTTProvider constructor defaults", () => {
  it("uses vadThreshold: 0 when explicitly configured (max sensitivity)", () => {
    const { vadThreshold } = readProviderInternals({
      apiKey: "sk-test", // pragma: allowlist secret
      vadThreshold: 0,
    });

    expect(vadThreshold).toBe(0);
  });

  it("uses silenceDurationMs: 0 when explicitly configured", () => {
    const { silenceDurationMs } = readProviderInternals({
      apiKey: "sk-test", // pragma: allowlist secret
      silenceDurationMs: 0,
    });

    expect(silenceDurationMs).toBe(0);
  });

  it("falls back to defaults when values are undefined", () => {
    const { vadThreshold, silenceDurationMs } = readProviderInternals({
      apiKey: "sk-test", // pragma: allowlist secret
    });

    expect(vadThreshold).toBe(0.5);
    expect(silenceDurationMs).toBe(800);
  });
});
|
|
@@ -1,311 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* OpenAI Realtime STT Provider
|
|
3
|
-
*
|
|
4
|
-
* Uses the OpenAI Realtime API for streaming transcription with:
|
|
5
|
-
* - Direct mu-law audio support (no conversion needed)
|
|
6
|
-
* - Built-in server-side VAD for turn detection
|
|
7
|
-
* - Low-latency streaming transcription
|
|
8
|
-
* - Partial transcript callbacks for real-time UI updates
|
|
9
|
-
*/
|
|
10
|
-
|
|
11
|
-
import WebSocket from "ws";
|
|
12
|
-
|
|
13
|
-
/**
 * Options accepted by the OpenAI Realtime speech-to-text provider.
 * Only `apiKey` is required; every other field has a default.
 */
export interface RealtimeSTTConfig {
  /** OpenAI API key used to authenticate the realtime connection. */
  apiKey: string;
  /** Transcription model name. Defaults to `gpt-4o-transcribe`. */
  model?: string;
  /** Milliseconds of silence after which speech is considered finished. Defaults to 800. */
  silenceDurationMs?: number;
  /** Voice-activity-detection threshold in the range 0-1. Defaults to 0.5. */
  vadThreshold?: number;
}
|
|
26
|
-
|
|
27
|
-
/**
 * A live transcription session: audio is pushed in, transcripts come back
 * through callbacks or by awaiting the next completed transcript.
 */
export interface RealtimeSTTSession {
  /** Opens the connection to the transcription service. */
  connect(): Promise<void>;
  /** Pushes a chunk of mu-law encoded audio (8kHz, mono). */
  sendAudio(audio: Buffer): void;
  /** Resolves with the next completed transcript, i.e. once VAD reports end of speech. */
  waitForTranscript(timeoutMs?: number): Promise<string>;
  /** Registers the handler for streaming (partial) transcripts. */
  onPartial(callback: (partial: string) => void): void;
  /** Registers the handler for final transcripts. */
  onTranscript(callback: (transcript: string) => void): void;
  /** Registers the handler invoked when VAD detects the start of speech. */
  onSpeechStart(callback: () => void): void;
  /** Shuts the session down. */
  close(): void;
  /** Reports whether the session is currently connected. */
  isConnected(): boolean;
}
|
|
48
|
-
|
|
49
|
-
/**
 * Provider factory for OpenAI Realtime STT sessions.
 *
 * Resolves the configuration once at construction time; every session created
 * afterwards shares the same API key, model and VAD settings.
 */
export class OpenAIRealtimeSTTProvider {
  readonly name = "openai-realtime";
  private readonly apiKey: string;
  private readonly model: string;
  private readonly silenceDurationMs: number;
  private readonly vadThreshold: number;

  /**
   * @param config - Provider options; `apiKey` is mandatory.
   * @throws If `config.apiKey` is missing or empty.
   */
  constructor(config: RealtimeSTTConfig) {
    if (!config.apiKey) {
      throw new Error("OpenAI API key required for Realtime STT");
    }
    this.apiKey = config.apiKey;
    // A blank or whitespace-only override is treated as "not set" so it cannot
    // be sent to the API as a model name.
    this.model = config.model?.trim() || "gpt-4o-transcribe";
    // `??` rather than `||`: an explicit 0 is a valid setting and must be kept.
    this.silenceDurationMs = config.silenceDurationMs ?? 800;
    this.vadThreshold = config.vadThreshold ?? 0.5;
  }

  /**
   * Create a new realtime transcription session using this provider's settings.
   * The session is returned unconnected; call `connect()` on it to start.
   */
  createSession(): RealtimeSTTSession {
    return new OpenAIRealtimeSTTSession(
      this.apiKey,
      this.model,
      this.silenceDurationMs,
      this.vadThreshold,
    );
  }
}
|
|
81
|
-
|
|
82
|
-
/**
 * WebSocket-based session for real-time speech-to-text.
 *
 * Streams base64-encoded mu-law audio to the OpenAI Realtime transcription
 * endpoint and dispatches the resulting events to the registered callbacks.
 * When the socket closes without `close()` having been called, the session
 * retries with exponential backoff, up to `MAX_RECONNECT_ATTEMPTS` times.
 */
class OpenAIRealtimeSTTSession implements RealtimeSTTSession {
  private static readonly MAX_RECONNECT_ATTEMPTS = 5;
  private static readonly RECONNECT_DELAY_MS = 1000;
  private static readonly CONNECT_TIMEOUT_MS = 10000;

  private ws: WebSocket | null = null;
  private connected = false;
  private closed = false;
  private reconnectAttempts = 0;
  private pendingTranscript = "";
  private onTranscriptCallback: ((transcript: string) => void) | null = null;
  private onPartialCallback: ((partial: string) => void) | null = null;
  private onSpeechStartCallback: (() => void) | null = null;
  /** One-shot resolvers registered by `waitForTranscript`. */
  private readonly transcriptWaiters = new Set<(transcript: string) => void>();

  constructor(
    private readonly apiKey: string,
    private readonly model: string,
    private readonly silenceDurationMs: number,
    private readonly vadThreshold: number,
  ) {}

  /**
   * Opens the connection, resetting the closed flag and the reconnect counter.
   * Rejects if the socket errors before opening or does not open within 10 seconds.
   */
  async connect(): Promise<void> {
    this.closed = false;
    this.reconnectAttempts = 0;
    return this.doConnect();
  }

  /** Creates one socket, wires its handlers and settles when it opens or fails. */
  private async doConnect(): Promise<void> {
    return new Promise((resolve, reject) => {
      const url = "wss://api.openai.com/v1/realtime?intent=transcription";

      const socket = new WebSocket(url, {
        headers: {
          Authorization: `Bearer ${this.apiKey}`,
          "OpenAI-Beta": "realtime=v1",
        },
      });
      this.ws = socket;

      // Tracks whether THIS attempt has already resolved or rejected, so the
      // timeout cannot fire against a connection that has since opened.
      let settled = false;
      const connectTimer = setTimeout(() => {
        if (settled) {
          return;
        }
        settled = true;
        reject(new Error("Realtime STT connection timeout"));
        // Drop the half-open socket instead of letting it come alive later.
        socket.terminate();
      }, OpenAIRealtimeSTTSession.CONNECT_TIMEOUT_MS);
      const markSettled = (): void => {
        settled = true;
        clearTimeout(connectTimer);
      };

      socket.on("open", () => {
        console.log("[RealtimeSTT] WebSocket connected");
        markSettled();
        this.connected = true;
        this.reconnectAttempts = 0;

        // Configure the transcription session
        this.sendEvent({
          type: "transcription_session.update",
          session: {
            input_audio_format: "g711_ulaw",
            input_audio_transcription: {
              model: this.model,
            },
            turn_detection: {
              type: "server_vad",
              threshold: this.vadThreshold,
              prefix_padding_ms: 300,
              silence_duration_ms: this.silenceDurationMs,
            },
          },
        });

        resolve();
      });

      socket.on("message", (data: Buffer) => {
        try {
          const event = JSON.parse(data.toString());
          this.handleEvent(event);
        } catch (e) {
          console.error("[RealtimeSTT] Failed to parse event:", e);
        }
      });

      socket.on("error", (error) => {
        console.error("[RealtimeSTT] WebSocket error:", error);
        if (!this.connected) {
          markSettled();
          reject(error);
        }
      });

      socket.on("close", (code, reason) => {
        console.log(
          `[RealtimeSTT] WebSocket closed (code: ${code}, reason: ${reason?.toString() || "none"})`,
        );
        // A socket that has been replaced or released by close() must not
        // change the state of, or trigger a reconnect for, the current one.
        if (this.ws !== socket) {
          return;
        }
        this.connected = false;

        // Attempt reconnection if not intentionally closed
        if (!this.closed) {
          void this.attemptReconnect();
        }
      });
    });
  }

  /**
   * Retries the connection after a delay that doubles on every attempt
   * (1s, 2s, 4s, ...), giving up after `MAX_RECONNECT_ATTEMPTS`.
   */
  private async attemptReconnect(): Promise<void> {
    if (this.closed) {
      return;
    }

    if (this.reconnectAttempts >= OpenAIRealtimeSTTSession.MAX_RECONNECT_ATTEMPTS) {
      console.error(
        `[RealtimeSTT] Max reconnect attempts (${OpenAIRealtimeSTTSession.MAX_RECONNECT_ATTEMPTS}) reached`,
      );
      return;
    }

    this.reconnectAttempts++;
    const delay = OpenAIRealtimeSTTSession.RECONNECT_DELAY_MS * 2 ** (this.reconnectAttempts - 1);
    console.log(
      `[RealtimeSTT] Reconnecting ${this.reconnectAttempts}/${OpenAIRealtimeSTTSession.MAX_RECONNECT_ATTEMPTS} in ${delay}ms...`,
    );

    await new Promise((resolve) => setTimeout(resolve, delay));

    // close() may have been called while we were waiting.
    if (this.closed) {
      return;
    }

    try {
      await this.doConnect();
      console.log("[RealtimeSTT] Reconnected successfully");
    } catch (error) {
      console.error("[RealtimeSTT] Reconnect failed:", error);
    }
  }

  /** Routes one decoded server event to the matching callback or log line. */
  private handleEvent(event: {
    type: string;
    delta?: string;
    transcript?: string;
    error?: unknown;
  }): void {
    switch (event.type) {
      case "transcription_session.created":
      case "transcription_session.updated":
      case "input_audio_buffer.speech_stopped":
      case "input_audio_buffer.committed":
        console.log(`[RealtimeSTT] ${event.type}`);
        break;

      case "conversation.item.input_audio_transcription.delta":
        if (event.delta) {
          // Partials are cumulative: callers receive the text accumulated so far.
          this.pendingTranscript += event.delta;
          this.onPartialCallback?.(this.pendingTranscript);
        }
        break;

      case "conversation.item.input_audio_transcription.completed":
        if (event.transcript) {
          console.log(`[RealtimeSTT] Transcript: ${event.transcript}`);
          this.deliverTranscript(event.transcript);
        }
        this.pendingTranscript = "";
        break;

      case "input_audio_buffer.speech_started":
        console.log("[RealtimeSTT] Speech started");
        this.pendingTranscript = "";
        this.onSpeechStartCallback?.();
        break;

      case "error":
        console.error("[RealtimeSTT] Error:", event.error);
        break;
    }
  }

  /** Hands a final transcript to the persistent callback and to every pending waiter. */
  private deliverTranscript(transcript: string): void {
    this.onTranscriptCallback?.(transcript);

    // Copy before clearing so a waiter that registers a new wait is not
    // resolved by the transcript that just completed.
    const waiters = [...this.transcriptWaiters];
    this.transcriptWaiters.clear();
    for (const waiter of waiters) {
      waiter(transcript);
    }
  }

  /** Serializes `event` as JSON and sends it, silently skipping when the socket is not open. */
  private sendEvent(event: unknown): void {
    if (this.ws?.readyState === WebSocket.OPEN) {
      this.ws.send(JSON.stringify(event));
    }
  }

  /** Appends mu-law audio to the server-side input buffer; dropped while disconnected. */
  sendAudio(muLawData: Buffer): void {
    if (!this.connected) {
      return;
    }
    this.sendEvent({
      type: "input_audio_buffer.append",
      audio: muLawData.toString("base64"),
    });
  }

  /** Registers the handler for cumulative partial transcripts (replaces any previous one). */
  onPartial(callback: (partial: string) => void): void {
    this.onPartialCallback = callback;
  }

  /** Registers the handler for final transcripts (replaces any previous one). */
  onTranscript(callback: (transcript: string) => void): void {
    this.onTranscriptCallback = callback;
  }

  /** Registers the handler invoked when the server reports speech has started. */
  onSpeechStart(callback: () => void): void {
    this.onSpeechStartCallback = callback;
  }

  /**
   * Resolves with the next final transcript.
   *
   * Waiters are kept separate from the `onTranscript` handler, so waiting no
   * longer replaces or removes a handler registered through `onTranscript`.
   *
   * @param timeoutMs - How long to wait, in milliseconds (default 30000).
   * @throws `Error("Transcript timeout")` if no transcript completes in time.
   */
  async waitForTranscript(timeoutMs = 30000): Promise<string> {
    return new Promise((resolve, reject) => {
      let timer: ReturnType<typeof setTimeout> | undefined;

      const waiter = (transcript: string): void => {
        clearTimeout(timer);
        resolve(transcript);
      };

      timer = setTimeout(() => {
        this.transcriptWaiters.delete(waiter);
        reject(new Error("Transcript timeout"));
      }, timeoutMs);

      this.transcriptWaiters.add(waiter);
    });
  }

  /** Closes the socket and marks the session closed so no reconnect is attempted. */
  close(): void {
    this.closed = true;
    if (this.ws) {
      this.ws.close();
      this.ws = null;
    }
    this.connected = false;
  }

  /** Reports whether the socket is currently open. */
  isConnected(): boolean {
    return this.connected;
  }
}
|
|
@@ -1,43 +0,0 @@
|
|
|
1
|
-
import { describe, expect, it } from "vitest";
|
|
2
|
-
import type { OpenAITTSConfig } from "./tts-openai.js";
|
|
3
|
-
import { OpenAITTSProvider } from "./tts-openai.js";
|
|
4
|
-
|
|
5
|
-
/** The private provider fields these tests inspect. */
type ProviderInternals = {
  model: string;
  voice: string;
  speed: number;
};

/**
 * Builds a provider from `config` and returns it typed as its private fields.
 * The cast goes through `unknown` because the fields are not publicly typed.
 */
function readProviderInternals(config: OpenAITTSConfig): ProviderInternals {
  const provider = new OpenAITTSProvider(config);
  const internals = provider as unknown as ProviderInternals;
  return internals;
}
|
|
14
|
-
|
|
15
|
-
// Verifies that an explicit zero speed is kept, that an omitted speed falls
// back to 1.0, and that blank model/voice overrides resolve to the defaults.
describe("OpenAITTSProvider constructor defaults", () => {
  it("uses speed: 0 when explicitly configured", () => {
    const { speed } = readProviderInternals({
      apiKey: "sk-test", // pragma: allowlist secret
      speed: 0,
    });

    expect(speed).toBe(0);
  });

  it("falls back to speed default when undefined", () => {
    const { speed } = readProviderInternals({
      apiKey: "sk-test", // pragma: allowlist secret
    });

    expect(speed).toBe(1.0);
  });

  it("treats blank model and voice overrides as unset", () => {
    const { model, voice } = readProviderInternals({
      apiKey: "sk-test", // pragma: allowlist secret
      model: " ",
      voice: "",
    });

    expect(model).toBe("gpt-4o-mini-tts");
    expect(voice).toBe("coral");
  });
});
|