@yak-io/javascript 0.10.1 → 0.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/client.d.ts.map +1 -1
- package/dist/index.cjs +2043 -0
- package/dist/index.cjs.map +7 -0
- package/dist/index.js +2020 -16
- package/dist/index.js.map +7 -0
- package/dist/index.server.cjs +339 -0
- package/dist/index.server.cjs.map +7 -0
- package/dist/index.server.js +316 -1
- package/dist/index.server.js.map +7 -0
- package/dist/voice-session.d.ts.map +1 -1
- package/package.json +5 -3
- package/dist/client.js +0 -524
- package/dist/embed.js +0 -743
- package/dist/logger.js +0 -117
- package/dist/page-context.js +0 -71
- package/dist/server/createYakHandler.js +0 -185
- package/dist/server/index.js +0 -2
- package/dist/server/sources.js +0 -125
- package/dist/tool-name.js +0 -42
- package/dist/toolset.js +0 -119
- package/dist/types/config.js +0 -1
- package/dist/types/messaging.js +0 -1
- package/dist/types/routes.js +0 -1
- package/dist/types/tools.js +0 -1
- package/dist/version.js +0 -18
- package/dist/voice-machine.js +0 -168
- package/dist/voice-session.js +0 -520
package/dist/voice-machine.js
DELETED
|
@@ -1,168 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Pure state machine for a single voice session.
|
|
3
|
-
*
|
|
4
|
-
* The reducer has no DOM or WebRTC dependencies — it can be unit-tested by
|
|
5
|
-
* driving events through `voiceReducer` and checking the resulting state.
|
|
6
|
-
*
|
|
7
|
-
* The companion `handleRealtimeMessage` parses an OpenAI Realtime data-channel
|
|
8
|
-
* message and dispatches reducer events plus side effects (tool dispatch,
|
|
9
|
-
* sending follow-up events back over the data channel). Side effects are
|
|
10
|
-
* delegated to the injected `RealtimeMessageContext` so the function is
|
|
11
|
-
* testable with a plain in-memory mock.
|
|
12
|
-
*/
|
|
13
|
-
/**
 * State every voice session starts in.
 *
 * The object is frozen because the same instance is handed out as the starting
 * state wherever this constant is used; a caller that mutated it would
 * otherwise change the starting state for everyone else as well.
 */
export const INITIAL_VOICE_MACHINE = Object.freeze({ state: "idle" });
|
|
14
|
-
/**
 * Pure transition function of the voice state machine.
 *
 * Whenever an event does not change the state, the *same* `machine` object is
 * returned, so callers can detect "nothing happened" with a cheap identity
 * comparison. This matters for high-frequency events such as `audio_delta`,
 * which arrive repeatedly while the machine is already `speaking`.
 *
 * @param {{ state: string, errorMessage?: string }} machine - Current state.
 * @param {{ type: string, message?: string }} event - Event to apply.
 * @returns {{ state: string, errorMessage?: string }} The next state, or
 *   `machine` itself when the event is a no-op in the current state.
 */
export function voiceReducer(machine, event) {
    const current = machine.state;
    // Reuse the incoming object when we are already in the target state.
    const moveTo = (state) => (current === state ? machine : { state });
    switch (event.type) {
        case "start":
            return current === "idle" ? { state: "connecting" } : machine;
        case "connected":
            return current === "connecting" ? { state: "listening" } : machine;
        case "response_requested":
            // The assistant is generating an unprompted turn (the opening greeting).
            // Move to `thinking` so the subsequent `audio_delta` lands on `speaking`,
            // matching a normal turn; no-op if we're not idling in `listening`.
            return current === "listening" ? { state: "thinking" } : machine;
        case "speech_started":
            if (current === "idle" || current === "error") {
                return machine;
            }
            return moveTo("listening");
        case "speech_stopped":
            return current === "listening" ? { state: "thinking" } : machine;
        case "audio_delta":
            if (current === "thinking" || current === "speaking") {
                return moveTo("speaking");
            }
            return machine;
        case "audio_stopped":
            return current === "speaking" ? { state: "listening" } : machine;
        case "stop":
            return moveTo("idle");
        case "error":
            // A repeated report of the same failure is not a new transition.
            if (current === "error" && machine.errorMessage === event.message) {
                return machine;
            }
            return { state: "error", errorMessage: event.message };
        default:
            // Unknown event types leave the machine untouched.
            return machine;
    }
}
|
|
49
|
-
/**
 * Tells whether a response output item is a tool invocation.
 *
 * @param {unknown} item - One output entry; may be `null` or `undefined`.
 * @returns {boolean} `true` only when `item.type` is `"function_call"`.
 */
function isFunctionCall(item) {
    // Optional chaining keeps a missing entry from throwing.
    return item?.type === "function_call";
}
|
|
52
|
-
/**
 * Decodes the JSON-encoded argument string of a tool call.
 *
 * The result is always a plain object: empty input, malformed JSON and JSON
 * that decodes to something other than an object (`null`, a number, a string,
 * an array, ...) all yield `{}`.
 *
 * @param {string | null | undefined} raw - Serialized arguments.
 * @returns {Record<string, unknown>} The decoded arguments, or `{}`.
 */
function parseToolArgs(raw) {
    if (!raw) {
        return {};
    }
    let parsed;
    try {
        parsed = JSON.parse(raw);
    }
    catch {
        // Malformed arguments are treated as "no arguments".
        return {};
    }
    const isPlainObject = parsed !== null && typeof parsed === "object" && !Array.isArray(parsed);
    return isPlainObject ? parsed : {};
}
|
|
62
|
-
/**
 * Runs one tool call requested by the model and reports its outcome.
 *
 * Calls lacking a `call_id` or a `name`, and calls whose id `ctx` already
 * reports as dispatched, are ignored. Otherwise the id is marked as
 * dispatched, the tool is executed through `ctx.dispatchToolCall`, and two
 * payloads are sent with `ctx.sendData`: a `function_call_output` item
 * carrying the JSON-encoded result (or an `{ error }` object when execution
 * threw), followed by `response.create`.
 *
 * @param {{ call_id?: string, name?: string, arguments?: string }} call
 * @param {object} ctx - Provides `isDispatched`, `markDispatched`,
 *   `dispatchToolCall` and `sendData`.
 * @returns {Promise<void>}
 */
async function dispatchFunctionCall(call, ctx) {
    const { call_id: callId, name } = call;
    if (!(callId && name) || ctx.isDispatched(callId)) {
        return;
    }
    ctx.markDispatched(callId);
    const toolArgs = parseToolArgs(call.arguments);
    let output;
    try {
        const toolResult = await ctx.dispatchToolCall(name, toolArgs);
        output = JSON.stringify(toolResult ?? null);
    }
    catch (failure) {
        const reason = failure instanceof Error ? failure.message : "Tool execution failed";
        output = JSON.stringify({ error: reason });
    }
    const outputItem = { type: "function_call_output", call_id: callId, output };
    ctx.sendData({ type: "conversation.item.create", item: outputItem });
    ctx.sendData({ type: "response.create" });
}
|
|
87
|
-
/**
 * Converts the snake_case usage object of a response into camelCase counters.
 *
 * Only fields whose value is a number are copied; everything else is skipped.
 *
 * @param {object | null | undefined} raw - Usage payload of a response.
 * @returns {object | null} The counters that were present, or `null` when
 *   `raw` is falsy or holds no numeric counter.
 */
function extractUsage(raw) {
    if (!raw) {
        return null;
    }
    const counters = {};
    // Copies `source[from]` into `counters[to]` when it is numeric.
    const take = (source, from, to) => {
        if (source && typeof source[from] === "number") {
            counters[to] = source[from];
        }
    };
    take(raw, "input_tokens", "inputTokens");
    take(raw, "output_tokens", "outputTokens");
    const inputDetails = raw.input_token_details;
    take(inputDetails, "cached_tokens", "cachedInputTokens");
    take(inputDetails, "audio_tokens", "audioInputTokens");
    take(inputDetails, "text_tokens", "textInputTokens");
    const outputDetails = raw.output_token_details;
    take(outputDetails, "audio_tokens", "audioOutputTokens");
    take(outputDetails, "text_tokens", "textOutputTokens");
    return Object.keys(counters).length === 0 ? null : counters;
}
|
|
118
|
-
/**
 * Post-processes a finished response.
 *
 * First reports the response's token usage through the optional
 * `ctx.recordUsage` hook, then dispatches every `function_call` item found in
 * `response.output`, one after the other.
 *
 * @param {{ usage?: object, output?: object[] } | null | undefined} response
 * @param {object} ctx - Message context; `recordUsage` is optional.
 * @returns {Promise<void>}
 */
async function handleResponseDone(response, ctx) {
    const totals = extractUsage(response?.usage);
    if (totals && ctx.recordUsage) {
        try {
            ctx.recordUsage(totals);
        }
        catch {
            // recordUsage is best-effort; never let it break the session loop.
        }
    }
    const outputItems = response?.output ?? [];
    const pendingCalls = outputItems.filter(isFunctionCall);
    for (const pending of pendingCalls) {
        await dispatchFunctionCall(pending, ctx);
    }
}
|
|
133
|
-
/**
 * Parses one realtime data-channel message and reacts to it.
 *
 * Recognised server events are translated into reducer events sent through
 * `ctx.send`; `response.done` is handed to `handleResponseDone`. Anything that
 * is not valid JSON, does not decode to an object, or carries an unknown
 * `type` is ignored.
 *
 * @param {string} raw - Message text as received.
 * @param {object} ctx - Message context; `send` receives the reducer events.
 * @returns {Promise<void>}
 */
export async function handleRealtimeMessage(raw, ctx) {
    let message;
    try {
        message = JSON.parse(raw);
    }
    catch {
        return;
    }
    // `JSON.parse` also accepts `null` and bare primitives; reading `.type` on
    // `null` would throw, so only objects are processed further.
    if (message === null || typeof message !== "object") {
        return;
    }
    switch (message.type) {
        case "input_audio_buffer.speech_started":
            ctx.send({ type: "speech_started" });
            return;
        case "input_audio_buffer.speech_stopped":
            ctx.send({ type: "speech_stopped" });
            return;
        case "response.output_audio_transcript.delta":
        case "response.audio_transcript.delta":
            ctx.send({ type: "audio_delta" });
            return;
        case "output_audio_buffer.stopped":
        case "response.output_audio_buffer.stopped":
            ctx.send({ type: "audio_stopped" });
            return;
        case "response.done":
            await handleResponseDone(message.response, ctx);
            return;
        case "error":
            ctx.send({
                type: "error",
                message: message.error?.message ?? "Voice session error",
            });
            return;
        default:
            return;
    }
}
|
package/dist/voice-session.js
DELETED
|
@@ -1,520 +0,0 @@
|
|
|
1
|
-
import { logger } from "./logger.js";
|
|
2
|
-
import { extractPageContext } from "./page-context.js";
|
|
3
|
-
import { uniqueToolId } from "./tool-name.js";
|
|
4
|
-
import { handleRealtimeMessage, INITIAL_VOICE_MACHINE, voiceReducer, } from "./voice-machine.js";
|
|
5
|
-
/** Model name appended as the `model` query parameter of the SDP exchange. */
const DEFAULT_REALTIME_MODEL = "gpt-realtime";
/** Endpoint that receives the WebRTC SDP offer during negotiation. */
const REALTIME_CALLS_URL = "https://api.openai.com/v1/realtime/calls";
/** API origin used when none is configured and no local override applies. */
const DEFAULT_API_ORIGIN = "https://chat.yak.io";
|
|
8
|
-
/**
 * Picks the API origin to use when the host did not configure `apiOrigin`.
 *
 * Returns `http://localhost:3001` only when running in a window whose hostname
 * is `localhost` or `127.0.0.1` AND `window.__YAK_INTERNAL_DEV__` is defined;
 * in every other case the production origin is returned.
 *
 * @returns {string} The origin, without a trailing slash.
 */
function getDefaultApiOrigin() {
    if (typeof window === "undefined") {
        return DEFAULT_API_ORIGIN;
    }
    const { hostname } = window.location;
    const servedFromLoopback = hostname === "localhost" || hostname === "127.0.0.1";
    if (servedFromLoopback && typeof window.__YAK_INTERNAL_DEV__ !== "undefined") {
        return "http://localhost:3001";
    }
    return DEFAULT_API_ORIGIN;
}
|
|
20
|
-
/**
 * Resource record of a session that currently owns nothing.
 *
 * The same object is assigned to every session that holds no live resources,
 * so it is frozen: writing to it would otherwise leak into all of them.
 */
const EMPTY_RESOURCES = Object.freeze({
    pc: null,
    dataChannel: null,
    micStream: null,
    audioElement: null,
    voiceSessionId: null,
});
|
|
27
|
-
/**
 * Creates a fresh usage record with every counter set to zero.
 *
 * @returns {object} A new object on each call, so callers may mutate it.
 */
function emptyUsage() {
    const counterNames = [
        "inputTokens",
        "cachedInputTokens",
        "outputTokens",
        "audioInputTokens",
        "audioOutputTokens",
        "textInputTokens",
        "textOutputTokens",
        "responseCount",
    ];
    return Object.fromEntries(counterNames.map((counter) => [counter, 0]));
}
|
|
39
|
-
/**
 * Controller for one voice conversation in the browser.
 *
 * `start()` mints a session token from the configured API origin, acquires the
 * microphone, negotiates a WebRTC connection and wires its data channel to the
 * voice state machine. `stop()` / `destroy()` release everything again. State
 * changes are published to listeners registered with `onStateChange`.
 */
export class YakVoiceSession {
    config;
    machine = INITIAL_VOICE_MACHINE;
    resources = EMPTY_RESOURCES;
    dispatchedCallIds = new Set();
    listeners = new Set();
    pageHideHandler = null;
    /** Per-session token totals, accumulated from each `response.done` event. */
    usage = emptyUsage();
    /**
     * Reverse map: hashed tool id (what OpenAI calls back with) → original host
     * tool name (what `onToolCall` expects). Populated on every `start()` from
     * the resolved chat config.
     */
    toolNameById = new Map();
    /**
     * Incremented by every `start()` and every `teardown()`. A running
     * `start()` remembers the value it began with and abandons its work as
     * soon as the counter has moved on, i.e. when the session was stopped,
     * failed or destroyed while `start()` was waiting on something.
     */
    startGeneration = 0;
    /**
     * @param {object} config - Session configuration. Fields read by this
     *   class: `appId`, `apiOrigin`, `chatConfig`, `getConfig`, `onToolCall`,
     *   `onRedirect`.
     */
    constructor(config) {
        this.config = config;
        this.attachPageHide();
    }
    /**
     * Resolve the API origin lazily on each call. Environment-dependent defaults
     * (e.g. a local chat UI) may not be ready at construction time, so resolving
     * eagerly would risk baking in the production URL.
     */
    get apiOrigin() {
        return this.config.apiOrigin ?? getDefaultApiOrigin();
    }
    /** Update mutable config fields (handlers, getConfig). */
    updateConfig(patch) {
        this.config = { ...this.config, ...patch };
    }
    /** @returns The current state-machine value. */
    getState() {
        return this.machine;
    }
    /**
     * The current API origin (defaults to `https://chat.yak.io`). Useful for
     * building URLs to static assets like the brand logo.
     */
    getApiOrigin() {
        return this.apiOrigin;
    }
    /**
     * Register a listener that is called with the new state after every state
     * change.
     *
     * @returns A function that removes the listener again.
     */
    onStateChange(listener) {
        this.listeners.add(listener);
        return () => {
            this.listeners.delete(listener);
        };
    }
    /**
     * Begin a voice session. Should be invoked from a user gesture (button
     * click) so `getUserMedia` and audio playback both have transient activation.
     *
     * Does nothing unless the machine is `idle`. If the session is stopped,
     * fails or is destroyed while this method is waiting (config, token,
     * microphone, negotiation), the method releases whatever it acquired and
     * returns without touching the state again.
     */
    async start() {
        if (this.machine.state !== "idle")
            return;
        logger.debug("Voice: start() called");
        this.usage = emptyUsage();
        // Claim a generation before notifying listeners, so that even a
        // listener that stops the session synchronously is detected below.
        const generation = ++this.startGeneration;
        const superseded = () => this.startGeneration !== generation;
        this.dispatch({ type: "start" });
        let chatConfig = this.config.chatConfig;
        if (this.config.getConfig) {
            try {
                chatConfig = await this.config.getConfig();
                logger.debug("Voice: getConfig() resolved", {
                    toolCount: chatConfig?.tools?.tools.length ?? 0,
                    routeCount: chatConfig?.routes?.routes.length ?? 0,
                });
            }
            catch (err) {
                logger.warn("Voice: getConfig() failed", err);
            }
        }
        else if (chatConfig) {
            logger.debug("Voice: using static chatConfig", {
                toolCount: chatConfig.tools?.tools.length ?? 0,
                routeCount: chatConfig.routes?.routes.length ?? 0,
            });
        }
        else {
            logger.debug("Voice: no chatConfig or getConfig — only built-in tools will be available");
        }
        if (superseded())
            return;
        // Decorate host tools with hash ids and build the reverse lookup so we
        // can map id-named tool calls back to the original host name when the
        // model invokes them. Mirrors the chat-ui iframe's decoration step.
        const decoratedManifest = this.buildDecoratedManifest(chatConfig);
        logger.debug("Voice: decorated tools", {
            ids: decoratedManifest.tools.map((t) => `${t.id}=${t.name}`),
        });
        const pageContext = this.safeExtractPageContext();
        logger.debug("Voice: page context extracted", {
            url: pageContext?.url,
            title: pageContext?.title,
            textLength: pageContext?.text?.length ?? 0,
        });
        let mint;
        try {
            logger.debug("Voice: requesting ephemeral token from mint endpoint");
            mint = await this.mintToken(chatConfig, decoratedManifest, pageContext);
            logger.debug("Voice: mint succeeded", {
                voiceSessionId: mint.voiceSessionId,
                expiresAt: mint.expiresAt,
            });
        }
        catch (err) {
            // A session that was stopped meanwhile must not flip to `error`.
            if (superseded())
                return;
            await this.failWith(err instanceof Error ? err.message : "Failed to start voice session");
            return;
        }
        if (superseded())
            return;
        let micStream;
        try {
            logger.debug("Voice: requesting microphone access");
            micStream = await navigator.mediaDevices.getUserMedia({ audio: true });
            logger.debug("Voice: microphone access granted");
        }
        catch (err) {
            if (superseded())
                return;
            const name = err instanceof Error ? err.name : "";
            const message = name === "NotAllowedError" || name === "PermissionDeniedError"
                ? "Microphone permission was denied. Enable microphone access in your browser settings to use voice mode."
                : "Could not access microphone.";
            await this.failWith(message);
            return;
        }
        if (superseded()) {
            // Nobody owns this stream yet, so release the microphone here.
            for (const track of micStream.getTracks()) {
                track.stop();
            }
            return;
        }
        const pc = new RTCPeerConnection();
        const audioElement = document.createElement("audio");
        audioElement.autoplay = true;
        audioElement.style.display = "none";
        document.body.appendChild(audioElement);
        pc.ontrack = (event) => {
            logger.debug("Voice: pc.ontrack received remote audio stream");
            if (event.streams[0]) {
                audioElement.srcObject = event.streams[0];
            }
        };
        pc.oniceconnectionstatechange = () => {
            const s = pc.iceConnectionState;
            logger.debug("Voice: ICE connection state →", s);
            // Ignore connections this session has already released.
            if (this.resources.pc !== pc)
                return;
            if (s === "failed" || s === "disconnected") {
                void this.failWith(`WebRTC connection ${s}`);
            }
        };
        pc.onconnectionstatechange = () => {
            logger.debug("Voice: peer connection state →", pc.connectionState);
            // Ignore connections this session has already released.
            if (this.resources.pc !== pc)
                return;
            if (pc.connectionState === "failed") {
                void this.failWith("WebRTC connection failed");
            }
        };
        for (const track of micStream.getAudioTracks()) {
            pc.addTrack(track, micStream);
        }
        const dataChannel = pc.createDataChannel("oai-events");
        dataChannel.onmessage = (event) => {
            const raw = typeof event.data === "string" ? event.data : "";
            if (!raw)
                return;
            logger.debug("Voice: ← data channel message", raw.slice(0, 200));
            // Nobody awaits this promise, so report a rejection here instead
            // of leaving it unhandled.
            handleRealtimeMessage(raw, this.buildMessageContext()).catch((err) => {
                logger.warn("Voice: failed to handle realtime message", err);
            });
        };
        dataChannel.onopen = () => {
            logger.debug("Voice: data channel opened");
            this.dispatch({ type: "connected" });
            // Kick off an opening greeting so the assistant speaks first instead of
            // waiting silently for the user. The minted session already carries the
            // full instructions (persona, governance, language, "first turn" rule),
            // so a bare `response.create` is enough — do NOT attach response-level
            // instructions here, which would replace the session instructions.
            // Skip it when the app's voice intro is "none" (`autoGreet === false`);
            // a missing flag defaults to greeting for back-compat.
            if (mint.autoGreet !== false) {
                this.dispatch({ type: "response_requested" });
                this.sendOverDataChannel({ type: "response.create" });
            }
        };
        dataChannel.onclose = () => {
            logger.debug("Voice: data channel closed");
        };
        this.resources = {
            pc,
            dataChannel,
            micStream,
            audioElement,
            voiceSessionId: mint.voiceSessionId,
        };
        try {
            logger.debug("Voice: creating WebRTC offer");
            const offer = await pc.createOffer();
            await pc.setLocalDescription(offer);
            logger.debug("Voice: exchanging SDP with OpenAI Realtime");
            const answerSdp = await this.exchangeSdp(offer, mint.clientSecret);
            await pc.setRemoteDescription({ type: "answer", sdp: answerSdp });
            logger.debug("Voice: WebRTC negotiation complete");
        }
        catch (err) {
            // The resources were already torn down by whoever superseded us.
            if (superseded())
                return;
            await this.failWith(err instanceof Error ? err.message : "Failed to negotiate voice connection");
            return;
        }
        if (superseded())
            return;
        void this.postSessionEvent("start", mint.voiceSessionId, pageContext);
    }
    /** Stop the session and tear down all resources. */
    async stop() {
        logger.debug("Voice: stop() called");
        this.dispatch({ type: "stop" });
        await this.teardown();
    }
    /** Tear down everything and remove listeners. Call once before discarding the instance. */
    destroy() {
        void this.teardown();
        if (this.pageHideHandler) {
            window.removeEventListener("pagehide", this.pageHideHandler);
            this.pageHideHandler = null;
        }
        this.listeners.clear();
    }
    // ── Internals ───────────────────────────────────────────────────────────
    /** Build the context object that `handleRealtimeMessage` drives. */
    buildMessageContext() {
        return {
            send: (event) => this.dispatch(event),
            sendData: (payload) => this.sendOverDataChannel(payload),
            dispatchToolCall: (name, args) => this.routeToolCall(name, args),
            isDispatched: (id) => this.dispatchedCallIds.has(id),
            markDispatched: (id) => {
                this.dispatchedCallIds.add(id);
            },
            recordUsage: (usage) => this.accumulateUsage(usage),
        };
    }
    /**
     * Add the counters of one response to the running totals. Counts the
     * response itself and adds every counter in `usage` that is a number.
     */
    accumulateUsage(usage) {
        this.usage.responseCount += 1;
        const counters = [
            "inputTokens",
            "cachedInputTokens",
            "outputTokens",
            "audioInputTokens",
            "audioOutputTokens",
            "textInputTokens",
            "textOutputTokens",
        ];
        for (const counter of counters) {
            if (typeof usage[counter] === "number") {
                this.usage[counter] += usage[counter];
            }
        }
    }
    /**
     * Serialize `payload` as JSON and send it over the data channel. The
     * payload is dropped (with a warning) when the channel is missing or not
     * open; send errors are logged, never thrown.
     */
    sendOverDataChannel(payload) {
        const channel = this.resources.dataChannel;
        if (!channel || channel.readyState !== "open") {
            logger.warn("Voice data channel not ready; dropping payload");
            return;
        }
        try {
            const serialized = JSON.stringify(payload);
            logger.debug("Voice: → data channel send", serialized.slice(0, 200));
            channel.send(serialized);
        }
        catch (err) {
            logger.warn("Failed to send on voice data channel", err);
        }
    }
    /**
     * Execute a tool call coming from the model.
     *
     * @param {string} idOrName - Decorated tool id, or a raw tool name.
     * @param {object} args - Decoded tool arguments.
     * @returns The tool result that is reported back to the model.
     * @throws {Error} When `redirect` gets no string `path` or a script-style
     *   URL, or when no handler is configured for the tool.
     */
    async routeToolCall(idOrName, args) {
        // The model calls us back using the decorated id (e.g. orders_list).
        // Resolve it to the original host tool name; fall back to the raw value
        // (it might be `redirect` or some non-decorated name).
        const name = this.toolNameById.get(idOrName) ?? idOrName;
        logger.debug("Voice: tool call dispatched", { id: idOrName, name, args });
        // MCP tools execute server-side (the org token never reaches the browser).
        // The model calls back with the `mcp__…` name minted by the server; relay
        // it to the exec endpoint and feed the result back over the data channel.
        if (name.startsWith("mcp__")) {
            return await this.execMcpTool(name, args);
        }
        if (name === "redirect") {
            const path = args?.path;
            if (typeof path !== "string") {
                throw new Error("redirect tool requires a string `path` argument");
            }
            if (this.config.onRedirect) {
                this.config.onRedirect(path);
            }
            else if (typeof window !== "undefined") {
                // `path` is chosen by the model. Navigating to a `javascript:`
                // URL would run it as script in this page, so script-style
                // schemes are refused before navigating.
                let protocol = "";
                try {
                    protocol = new URL(path, document.baseURI).protocol;
                }
                catch {
                    // Not parsable here; `location.assign` below decides.
                }
                if (protocol === "javascript:" || protocol === "data:" || protocol === "vbscript:") {
                    throw new Error("redirect tool refused a script URL in `path`");
                }
                window.location.assign(path);
            }
            return { success: true, redirected: true, path };
        }
        if (this.config.onToolCall) {
            return await this.config.onToolCall(name, args);
        }
        throw new Error(`No handler configured for tool: ${name}`);
    }
    /**
     * Relay an MCP tool call to the server, which holds the org's credentials
     * and executes against the remote MCP server. The browser only ever passes
     * through the tool name, args, and the opaque result.
     *
     * Never throws: failures are returned as an `{ error }` object.
     */
    async execMcpTool(toolName, args) {
        try {
            const res = await fetch(`${this.apiOrigin}/api/voice/mcp-exec`, {
                method: "POST",
                headers: { "Content-Type": "application/json" },
                body: JSON.stringify({
                    appId: this.config.appId,
                    toolName,
                    args: args ?? {},
                    pageContext: this.safeExtractPageContext(),
                }),
            });
            if (!res.ok) {
                const body = (await res.json().catch(() => ({})));
                return { error: body.error ?? `MCP tool failed (${res.status})` };
            }
            const body = (await res.json());
            return body.result ?? {};
        }
        catch (err) {
            logger.warn("Voice: MCP tool relay failed", err);
            return { error: "The integration could not complete this request." };
        }
    }
    /**
     * Request the session token from `/api/voice/realtime-token`.
     *
     * @returns The parsed JSON response; this class reads `voiceSessionId`,
     *   `expiresAt`, `clientSecret` and `autoGreet` from it.
     * @throws {Error} When the endpoint answers with a non-2xx status.
     */
    async mintToken(chatConfig, decoratedManifest, pageContext) {
        const res = await fetch(`${this.apiOrigin}/api/voice/realtime-token`, {
            method: "POST",
            headers: { "Content-Type": "application/json" },
            body: JSON.stringify({
                appId: this.config.appId,
                pageContext,
                toolManifest: decoratedManifest,
                routeManifest: chatConfig?.routes,
            }),
        });
        if (!res.ok) {
            const body = (await res.json().catch(() => ({})));
            throw new Error(body.error ?? `Mint failed (${res.status})`);
        }
        return (await res.json());
    }
    /**
     * Decorate the host's tool manifest with readable, collision-free model-facing ids
     * and populate `this.toolNameById` for reverse lookup. Mirrors the decoration the
     * chat-ui iframe applies before sending tools to `/api/chat`. GraphQL/REST tools are
     * ordinary manifest entries here (contributed by their adapters), so no special-casing
     * is needed.
     */
    buildDecoratedManifest(chatConfig) {
        this.toolNameById.clear();
        // `redirect` is reserved up front so no host tool can take that id.
        const used = new Set(["redirect"]);
        const decoratedHostTools = (chatConfig?.tools?.tools ?? []).map((t) => {
            const id = uniqueToolId(t.name, used);
            used.add(id);
            this.toolNameById.set(id, t.name);
            return { ...t, id };
        });
        return { tools: decoratedHostTools };
    }
    /**
     * Post the local SDP offer to the realtime endpoint.
     *
     * @returns The answer SDP as text.
     * @throws {Error} When the endpoint answers with a non-2xx status.
     */
    async exchangeSdp(offer, clientSecret) {
        const sdpResponse = await fetch(`${REALTIME_CALLS_URL}?model=${DEFAULT_REALTIME_MODEL}`, {
            method: "POST",
            headers: {
                Authorization: `Bearer ${clientSecret}`,
                "Content-Type": "application/sdp",
            },
            body: offer.sdp,
        });
        if (!sdpResponse.ok) {
            const body = await sdpResponse.text().catch(() => "");
            throw new Error(`SDP exchange failed (${sdpResponse.status}): ${body}`);
        }
        return await sdpResponse.text();
    }
    /** Build the `stop` event body, including a copy of the usage totals. */
    buildStopEventBody(voiceSessionId, pageContext) {
        return {
            appId: this.config.appId,
            voiceSessionId,
            event: "stop",
            clientTimestamp: Date.now(),
            pageContext,
            usage: { ...this.usage },
        };
    }
    /**
     * Report a session lifecycle event to `/api/voice/session-event`.
     * Failures are logged and swallowed. `stop` events are sent with
     * `keepalive` set.
     */
    async postSessionEvent(event, voiceSessionId, pageContext) {
        try {
            const body = event === "stop"
                ? this.buildStopEventBody(voiceSessionId, pageContext)
                : {
                    appId: this.config.appId,
                    voiceSessionId,
                    event,
                    clientTimestamp: Date.now(),
                    pageContext,
                };
            await fetch(`${this.apiOrigin}/api/voice/session-event`, {
                method: "POST",
                headers: { "Content-Type": "application/json" },
                body: JSON.stringify(body),
                keepalive: event === "stop",
            });
        }
        catch (err) {
            logger.warn(`Failed to post voice.session.${event}`, err);
        }
    }
    /**
     * Release every resource the session holds and, if a voice session id was
     * recorded, report the `stop` event. Each release step is isolated so one
     * failure does not prevent the others.
     */
    async teardown() {
        // Invalidate any `start()` that is still waiting on something.
        this.startGeneration += 1;
        const r = this.resources;
        this.resources = EMPTY_RESOURCES;
        this.dispatchedCallIds = new Set();
        try {
            r.dataChannel?.close();
        }
        catch (err) {
            logger.warn("Error closing data channel", err);
        }
        try {
            for (const track of r.micStream?.getTracks() ?? []) {
                track.stop();
            }
        }
        catch (err) {
            logger.warn("Error stopping mic tracks", err);
        }
        try {
            for (const sender of r.pc?.getSenders() ?? []) {
                sender.track?.stop();
            }
            r.pc?.close();
        }
        catch (err) {
            logger.warn("Error closing peer connection", err);
        }
        if (r.audioElement) {
            try {
                r.audioElement.srcObject = null;
                r.audioElement.remove();
            }
            catch (err) {
                logger.warn("Error removing audio element", err);
            }
        }
        if (r.voiceSessionId) {
            await this.postSessionEvent("stop", r.voiceSessionId, this.safeExtractPageContext());
        }
    }
    /** Move the machine to `error` with `message` and release all resources. */
    async failWith(message) {
        logger.warn("Voice session error:", message);
        this.dispatch({ type: "error", message });
        await this.teardown();
    }
    /**
     * Apply `event` to the state machine and notify listeners if the reducer
     * returned a different object. A throwing listener is logged and does not
     * stop the remaining listeners.
     */
    dispatch(event) {
        const next = voiceReducer(this.machine, event);
        if (next === this.machine)
            return;
        this.machine = next;
        for (const listener of this.listeners) {
            try {
                listener(next);
            }
            catch (err) {
                logger.warn("Voice state listener threw", err);
            }
        }
    }
    /** Extract the page context, returning `undefined` if extraction throws. */
    safeExtractPageContext() {
        try {
            return extractPageContext();
        }
        catch {
            return undefined;
        }
    }
    /**
     * Install a `pagehide` listener that reports the `stop` event through
     * `navigator.sendBeacon` while a voice session id is recorded. Does
     * nothing when `window` is not defined.
     */
    attachPageHide() {
        if (typeof window === "undefined")
            return;
        this.pageHideHandler = () => {
            const r = this.resources;
            if (!r.voiceSessionId)
                return;
            const body = JSON.stringify(this.buildStopEventBody(r.voiceSessionId, undefined));
            if (navigator.sendBeacon) {
                navigator.sendBeacon(`${this.apiOrigin}/api/voice/session-event`, new Blob([body], { type: "application/json" }));
            }
        };
        window.addEventListener("pagehide", this.pageHideHandler);
    }
}
|