@navai/voice-frontend 0.1.5 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.en.md +33 -20
- package/README.es.md +33 -20
- package/README.md +36 -23
- package/dist/index.cjs +214 -7
- package/dist/index.d.cts +26 -1
- package/dist/index.d.ts +26 -1
- package/dist/index.js +214 -7
- package/package.json +1 -1
package/README.en.md
CHANGED
|
@@ -26,11 +26,12 @@ npm install react
|
|
|
26
26
|
|
|
27
27
|
This package is intentionally split by concern:
|
|
28
28
|
|
|
29
|
-
1. `src/backend.ts`
|
|
30
|
-
HTTP client for backend routes:
|
|
31
|
-
- `POST /navai/realtime/client-secret`
|
|
32
|
-
- `
|
|
33
|
-
- `
|
|
29
|
+
1. `src/backend.ts`
|
|
30
|
+
HTTP client for backend routes:
|
|
31
|
+
- `POST /navai/realtime/client-secret`
|
|
32
|
+
- `POST /navai/speech/synthesize`
|
|
33
|
+
- `GET /navai/functions`
|
|
34
|
+
- `POST /navai/functions/execute`
|
|
34
35
|
|
|
35
36
|
2. `src/runtime.ts`
|
|
36
37
|
Runtime resolver for:
|
|
@@ -68,11 +69,12 @@ Hook-driven runtime flow (`useWebVoiceAgent`):
|
|
|
68
69
|
|
|
69
70
|
1. Resolve runtime config from `moduleLoaders` + `defaultRoutes` + env/options.
|
|
70
71
|
2. Create backend client with `apiBaseUrl` or `NAVAI_API_URL`.
|
|
71
|
-
3. On `start()`:
|
|
72
|
-
- request client secret.
|
|
73
|
-
-
|
|
74
|
-
-
|
|
75
|
-
-
|
|
72
|
+
3. On `start()`:
|
|
73
|
+
- request client secret.
|
|
74
|
+
- read `speech.provider` from backend response.
|
|
75
|
+
- fetch backend function list.
|
|
76
|
+
- build Navai agent with local + backend functions.
|
|
77
|
+
- connect `RealtimeSession`.
|
|
76
78
|
4. On `stop()`:
|
|
77
79
|
- close session and reset state.
|
|
78
80
|
|
|
@@ -106,9 +108,18 @@ Useful types:
|
|
|
106
108
|
|
|
107
109
|
- `NavaiRoute`
|
|
108
110
|
- `NavaiFunctionDefinition`
|
|
109
|
-
- `NavaiFunctionsRegistry`
|
|
110
|
-
- `NavaiBackendFunctionDefinition`
|
|
111
|
-
- `
|
|
111
|
+
- `NavaiFunctionsRegistry`
|
|
112
|
+
- `NavaiBackendFunctionDefinition`
|
|
113
|
+
- `NavaiBackendSpeechConfig`
|
|
114
|
+
- `UseWebVoiceAgentOptions`
|
|
115
|
+
|
|
116
|
+
## Hybrid Speech Mode
|
|
117
|
+
|
|
118
|
+
When backend returns `speech.provider: "elevenlabs"`:
|
|
119
|
+
|
|
120
|
+
- `useWebVoiceAgent` updates the Realtime session to use `output_modalities: ["text"]`.
|
|
121
|
+
- assistant final text is sent to `backendClient.synthesizeSpeech(...)`.
|
|
122
|
+
- playback happens locally in the browser with the synthesized ElevenLabs audio.
|
|
112
123
|
|
|
113
124
|
## Tool Model and Behavior
|
|
114
125
|
|
|
@@ -234,16 +245,18 @@ For browser realtime multi-agent orchestration, `buildNavaiAgent` currently wire
|
|
|
234
245
|
2. `env.NAVAI_API_URL`.
|
|
235
246
|
3. fallback `http://localhost:3000`.
|
|
236
247
|
|
|
237
|
-
Methods:
|
|
238
|
-
|
|
239
|
-
- `createClientSecret(input?)`
|
|
240
|
-
- `
|
|
241
|
-
- `
|
|
248
|
+
Methods:
|
|
249
|
+
|
|
250
|
+
- `createClientSecret(input?)`
|
|
251
|
+
- `synthesizeSpeech({ text, ... })`
|
|
252
|
+
- `listFunctions()`
|
|
253
|
+
- `executeFunction({ functionName, payload })`
|
|
242
254
|
|
|
243
255
|
Error handling:
|
|
244
256
|
|
|
245
|
-
- network/HTTP failures throw for create/execute.
|
|
246
|
-
- function listing returns warnings and empty list on failures.
|
|
257
|
+
- network/HTTP failures throw for create/execute.
|
|
258
|
+
- function listing returns warnings and empty list on failures.
|
|
259
|
+
- `createClientSecret()` returns `{ value, expires_at, speech }`, where `speech.provider` is `openai` or `elevenlabs`.
|
|
247
260
|
|
|
248
261
|
## Generated Module Loader CLI
|
|
249
262
|
|
package/README.es.md
CHANGED
|
@@ -26,11 +26,12 @@ npm install react
|
|
|
26
26
|
|
|
27
27
|
El paquete esta separado por responsabilidades:
|
|
28
28
|
|
|
29
|
-
1. `src/backend.ts`
|
|
30
|
-
Cliente HTTP para rutas backend:
|
|
31
|
-
- `POST /navai/realtime/client-secret`
|
|
32
|
-
- `
|
|
33
|
-
- `
|
|
29
|
+
1. `src/backend.ts`
|
|
30
|
+
Cliente HTTP para rutas backend:
|
|
31
|
+
- `POST /navai/realtime/client-secret`
|
|
32
|
+
- `POST /navai/speech/synthesize`
|
|
33
|
+
- `GET /navai/functions`
|
|
34
|
+
- `POST /navai/functions/execute`
|
|
34
35
|
|
|
35
36
|
2. `src/runtime.ts`
|
|
36
37
|
Resolver de runtime para:
|
|
@@ -68,11 +69,12 @@ Flujo del hook (`useWebVoiceAgent`):
|
|
|
68
69
|
|
|
69
70
|
1. Resuelve runtime config desde `moduleLoaders` + `defaultRoutes` + env/opciones.
|
|
70
71
|
2. Crea backend client con `apiBaseUrl` o `NAVAI_API_URL`.
|
|
71
|
-
3. En `start()`:
|
|
72
|
-
- solicita client secret.
|
|
73
|
-
-
|
|
74
|
-
-
|
|
75
|
-
-
|
|
72
|
+
3. En `start()`:
|
|
73
|
+
- solicita client secret.
|
|
74
|
+
- lee `speech.provider` desde la respuesta backend.
|
|
75
|
+
- solicita listado de funciones backend.
|
|
76
|
+
- construye agente Navai con funciones locales + backend.
|
|
77
|
+
- conecta `RealtimeSession`.
|
|
76
78
|
4. En `stop()`:
|
|
77
79
|
- cierra sesion y resetea estado.
|
|
78
80
|
|
|
@@ -106,9 +108,18 @@ Tipos utiles:
|
|
|
106
108
|
|
|
107
109
|
- `NavaiRoute`
|
|
108
110
|
- `NavaiFunctionDefinition`
|
|
109
|
-
- `NavaiFunctionsRegistry`
|
|
110
|
-
- `NavaiBackendFunctionDefinition`
|
|
111
|
-
- `
|
|
111
|
+
- `NavaiFunctionsRegistry`
|
|
112
|
+
- `NavaiBackendFunctionDefinition`
|
|
113
|
+
- `NavaiBackendSpeechConfig`
|
|
114
|
+
- `UseWebVoiceAgentOptions`
|
|
115
|
+
|
|
116
|
+
## Modo de voz hibrido
|
|
117
|
+
|
|
118
|
+
Cuando el backend devuelve `speech.provider: "elevenlabs"`:
|
|
119
|
+
|
|
120
|
+
- `useWebVoiceAgent` actualiza la sesion Realtime con `output_modalities: ["text"]`.
|
|
121
|
+
- el texto final del asistente se envia a `backendClient.synthesizeSpeech(...)`.
|
|
122
|
+
- la reproduccion ocurre localmente en el navegador con el audio sintetizado por ElevenLabs.
|
|
112
123
|
|
|
113
124
|
## Modelo de Tools y Comportamiento
|
|
114
125
|
|
|
@@ -234,16 +245,18 @@ Prioridad de base URL en `createNavaiBackendClient`:
|
|
|
234
245
|
2. `env.NAVAI_API_URL`.
|
|
235
246
|
3. Fallback `http://localhost:3000`.
|
|
236
247
|
|
|
237
|
-
Metodos:
|
|
238
|
-
|
|
239
|
-
- `createClientSecret(input?)`
|
|
240
|
-
- `
|
|
241
|
-
- `
|
|
248
|
+
Metodos:
|
|
249
|
+
|
|
250
|
+
- `createClientSecret(input?)`
|
|
251
|
+
- `synthesizeSpeech({ text, ... })`
|
|
252
|
+
- `listFunctions()`
|
|
253
|
+
- `executeFunction({ functionName, payload })`
|
|
242
254
|
|
|
243
255
|
Manejo de errores:
|
|
244
256
|
|
|
245
|
-
- fallos de red/HTTP lanzan error en create/execute.
|
|
246
|
-
- el listado de funciones retorna warnings + lista vacia en fallos.
|
|
257
|
+
- fallos de red/HTTP lanzan error en create/execute.
|
|
258
|
+
- el listado de funciones retorna warnings + lista vacia en fallos.
|
|
259
|
+
- `createClientSecret()` retorna `{ value, expires_at, speech }`, donde `speech.provider` puede ser `openai` o `elevenlabs`.
|
|
247
260
|
|
|
248
261
|
## CLI Generador de Module Loaders
|
|
249
262
|
|
package/README.md
CHANGED
|
@@ -26,11 +26,12 @@ npm install react
|
|
|
26
26
|
|
|
27
27
|
This package is intentionally split by concern:
|
|
28
28
|
|
|
29
|
-
1. `src/backend.ts`
|
|
30
|
-
HTTP client for backend routes:
|
|
31
|
-
- `POST /navai/realtime/client-secret`
|
|
32
|
-
- `
|
|
33
|
-
- `
|
|
29
|
+
1. `src/backend.ts`
|
|
30
|
+
HTTP client for backend routes:
|
|
31
|
+
- `POST /navai/realtime/client-secret`
|
|
32
|
+
- `POST /navai/speech/synthesize`
|
|
33
|
+
- `GET /navai/functions`
|
|
34
|
+
- `POST /navai/functions/execute`
|
|
34
35
|
|
|
35
36
|
2. `src/runtime.ts`
|
|
36
37
|
Runtime resolver for:
|
|
@@ -68,11 +69,12 @@ Hook-driven runtime flow (`useWebVoiceAgent`):
|
|
|
68
69
|
|
|
69
70
|
1. Resolve runtime config from `moduleLoaders` + `defaultRoutes` + env/options.
|
|
70
71
|
2. Create backend client with `apiBaseUrl` or `NAVAI_API_URL`.
|
|
71
|
-
3. On `start()`:
|
|
72
|
-
- request client secret.
|
|
73
|
-
-
|
|
74
|
-
-
|
|
75
|
-
-
|
|
72
|
+
3. On `start()`:
|
|
73
|
+
- request client secret.
|
|
74
|
+
- read `speech.provider` from backend response.
|
|
75
|
+
- fetch backend function list.
|
|
76
|
+
- build Navai agent with local + backend functions.
|
|
77
|
+
- connect `RealtimeSession`.
|
|
76
78
|
4. On `stop()`:
|
|
77
79
|
- close session and reset state.
|
|
78
80
|
|
|
@@ -106,9 +108,18 @@ Useful types:
|
|
|
106
108
|
|
|
107
109
|
- `NavaiRoute`
|
|
108
110
|
- `NavaiFunctionDefinition`
|
|
109
|
-
- `NavaiFunctionsRegistry`
|
|
110
|
-
- `NavaiBackendFunctionDefinition`
|
|
111
|
-
- `
|
|
111
|
+
- `NavaiFunctionsRegistry`
|
|
112
|
+
- `NavaiBackendFunctionDefinition`
|
|
113
|
+
- `NavaiBackendSpeechConfig`
|
|
114
|
+
- `UseWebVoiceAgentOptions`
|
|
115
|
+
|
|
116
|
+
## Hybrid Speech Mode
|
|
117
|
+
|
|
118
|
+
When backend returns `speech.provider: "elevenlabs"`:
|
|
119
|
+
|
|
120
|
+
- `useWebVoiceAgent` updates the Realtime session to use `output_modalities: ["text"]`.
|
|
121
|
+
- assistant final text is sent to `backendClient.synthesizeSpeech(...)`.
|
|
122
|
+
- playback happens locally in the browser with the synthesized ElevenLabs audio.
|
|
112
123
|
|
|
113
124
|
## Tool Model and Behavior
|
|
114
125
|
|
|
@@ -247,16 +258,18 @@ For browser realtime multi-agent orchestration, `buildNavaiAgent` currently wire
|
|
|
247
258
|
2. `env.NAVAI_API_URL`.
|
|
248
259
|
3. fallback `http://localhost:3000`.
|
|
249
260
|
|
|
250
|
-
Methods:
|
|
251
|
-
|
|
252
|
-
- `createClientSecret(input?)`
|
|
253
|
-
- `
|
|
254
|
-
- `
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
-
|
|
261
|
+
Methods:
|
|
262
|
+
|
|
263
|
+
- `createClientSecret(input?)`
|
|
264
|
+
- `synthesizeSpeech({ text, ... })`
|
|
265
|
+
- `listFunctions()`
|
|
266
|
+
- `executeFunction({ functionName, payload })`
|
|
267
|
+
|
|
268
|
+
Error handling:
|
|
269
|
+
|
|
270
|
+
- network/HTTP failures throw for create/execute.
|
|
271
|
+
- function listing returns warnings and empty list on failures.
|
|
272
|
+
- `createClientSecret()` returns `{ value, expires_at, speech }`, where `speech.provider` is `openai` or `elevenlabs`.
|
|
260
273
|
|
|
261
274
|
## Generated Module Loader CLI
|
|
262
275
|
|
package/dist/index.cjs
CHANGED
|
@@ -1153,6 +1153,7 @@ var DEFAULT_API_BASE_URL = "http://localhost:3000";
|
|
|
1153
1153
|
var DEFAULT_CLIENT_SECRET_PATH = "/navai/realtime/client-secret";
|
|
1154
1154
|
var DEFAULT_FUNCTIONS_LIST_PATH = "/navai/functions";
|
|
1155
1155
|
var DEFAULT_FUNCTIONS_EXECUTE_PATH = "/navai/functions/execute";
|
|
1156
|
+
var DEFAULT_SPEECH_SYNTHESIZE_PATH = "/navai/speech/synthesize";
|
|
1156
1157
|
function readOptional(value) {
|
|
1157
1158
|
const trimmed = value?.trim();
|
|
1158
1159
|
return trimmed ? trimmed : void 0;
|
|
@@ -1165,6 +1166,12 @@ function joinUrl(baseUrl, path) {
|
|
|
1165
1166
|
function isRecord(value) {
|
|
1166
1167
|
return Boolean(value && typeof value === "object");
|
|
1167
1168
|
}
|
|
1169
|
+
function readSpeechConfig(payload) {
|
|
1170
|
+
if (isRecord(payload) && isRecord(payload.speech) && payload.speech.provider === "elevenlabs") {
|
|
1171
|
+
return { provider: "elevenlabs" };
|
|
1172
|
+
}
|
|
1173
|
+
return { provider: "openai" };
|
|
1174
|
+
}
|
|
1168
1175
|
async function readTextSafe(response) {
|
|
1169
1176
|
try {
|
|
1170
1177
|
return await response.text();
|
|
@@ -1185,6 +1192,7 @@ function createNavaiBackendClient(options = {}) {
|
|
|
1185
1192
|
const clientSecretUrl = joinUrl(apiBaseUrl, options.clientSecretPath ?? DEFAULT_CLIENT_SECRET_PATH);
|
|
1186
1193
|
const functionsListUrl = joinUrl(apiBaseUrl, options.functionsListPath ?? DEFAULT_FUNCTIONS_LIST_PATH);
|
|
1187
1194
|
const functionsExecuteUrl = joinUrl(apiBaseUrl, options.functionsExecutePath ?? DEFAULT_FUNCTIONS_EXECUTE_PATH);
|
|
1195
|
+
const speechSynthesizeUrl = joinUrl(apiBaseUrl, options.speechSynthesizePath ?? DEFAULT_SPEECH_SYNTHESIZE_PATH);
|
|
1188
1196
|
async function createClientSecret(input = {}) {
|
|
1189
1197
|
const response = await fetchImpl(clientSecretUrl, {
|
|
1190
1198
|
method: "POST",
|
|
@@ -1200,7 +1208,27 @@ function createNavaiBackendClient(options = {}) {
|
|
|
1200
1208
|
}
|
|
1201
1209
|
return {
|
|
1202
1210
|
value: payload.value,
|
|
1203
|
-
expires_at: typeof payload.expires_at === "number" ? payload.expires_at : void 0
|
|
1211
|
+
expires_at: typeof payload.expires_at === "number" ? payload.expires_at : void 0,
|
|
1212
|
+
speech: readSpeechConfig(payload)
|
|
1213
|
+
};
|
|
1214
|
+
}
|
|
1215
|
+
async function synthesizeSpeech(input) {
|
|
1216
|
+
const response = await fetchImpl(speechSynthesizeUrl, {
|
|
1217
|
+
method: "POST",
|
|
1218
|
+
headers: { "Content-Type": "application/json" },
|
|
1219
|
+
body: JSON.stringify(input)
|
|
1220
|
+
});
|
|
1221
|
+
if (!response.ok) {
|
|
1222
|
+
throw new Error(await readTextSafe(response));
|
|
1223
|
+
}
|
|
1224
|
+
const payload = await readJsonSafe(response);
|
|
1225
|
+
if (!isRecord(payload) || payload.provider !== "elevenlabs" || typeof payload.mimeType !== "string" || typeof payload.audioBase64 !== "string") {
|
|
1226
|
+
throw new Error("Invalid speech synthesis response.");
|
|
1227
|
+
}
|
|
1228
|
+
return {
|
|
1229
|
+
provider: "elevenlabs",
|
|
1230
|
+
mimeType: payload.mimeType,
|
|
1231
|
+
audioBase64: payload.audioBase64
|
|
1204
1232
|
};
|
|
1205
1233
|
}
|
|
1206
1234
|
async function listFunctions() {
|
|
@@ -1260,6 +1288,7 @@ function createNavaiBackendClient(options = {}) {
|
|
|
1260
1288
|
};
|
|
1261
1289
|
return {
|
|
1262
1290
|
createClientSecret,
|
|
1291
|
+
synthesizeSpeech,
|
|
1263
1292
|
listFunctions,
|
|
1264
1293
|
executeFunction
|
|
1265
1294
|
};
|
|
@@ -1621,9 +1650,77 @@ function debugLog2(message, details) {
|
|
|
1621
1650
|
}
|
|
1622
1651
|
console.log(`${DEBUG_PREFIX2} ${message}`, details);
|
|
1623
1652
|
}
|
|
1653
|
+
function isRecord2(value) {
|
|
1654
|
+
return Boolean(value && typeof value === "object");
|
|
1655
|
+
}
|
|
1656
|
+
function readRealtimeEventType(event) {
|
|
1657
|
+
if (!isRecord2(event) || typeof event.type !== "string") {
|
|
1658
|
+
return "";
|
|
1659
|
+
}
|
|
1660
|
+
return event.type.trim().toLowerCase();
|
|
1661
|
+
}
|
|
1662
|
+
function readAssistantTextFromResponseOutput(items) {
|
|
1663
|
+
const parts = [];
|
|
1664
|
+
for (const item of items) {
|
|
1665
|
+
if (!isRecord2(item) || item.type !== "message" || item.role !== "assistant") {
|
|
1666
|
+
continue;
|
|
1667
|
+
}
|
|
1668
|
+
const content = Array.isArray(item.content) ? item.content : [];
|
|
1669
|
+
for (const chunk of content) {
|
|
1670
|
+
if (!isRecord2(chunk)) {
|
|
1671
|
+
continue;
|
|
1672
|
+
}
|
|
1673
|
+
const text = chunk.type === "output_text" ? typeof chunk.text === "string" ? chunk.text : "" : chunk.type === "output_audio" ? typeof chunk.transcript === "string" ? chunk.transcript : "" : "";
|
|
1674
|
+
const normalized = text.trim();
|
|
1675
|
+
if (normalized) {
|
|
1676
|
+
parts.push(normalized);
|
|
1677
|
+
}
|
|
1678
|
+
}
|
|
1679
|
+
}
|
|
1680
|
+
return parts.join("\n").trim();
|
|
1681
|
+
}
|
|
1682
|
+
function extractAssistantTextFromRealtimeEvent(event) {
|
|
1683
|
+
if (!isRecord2(event)) {
|
|
1684
|
+
return null;
|
|
1685
|
+
}
|
|
1686
|
+
const eventType = readRealtimeEventType(event);
|
|
1687
|
+
if (eventType === "response.output_text.done" || eventType === "response.text.done" || eventType === "response.audio_transcript.done") {
|
|
1688
|
+
const text = typeof event.text === "string" ? event.text.trim() : typeof event.transcript === "string" ? event.transcript.trim() : "";
|
|
1689
|
+
if (!text) {
|
|
1690
|
+
return null;
|
|
1691
|
+
}
|
|
1692
|
+
const key = [
|
|
1693
|
+
eventType,
|
|
1694
|
+
typeof event.response_id === "string" ? event.response_id : "",
|
|
1695
|
+
typeof event.item_id === "string" ? event.item_id : ""
|
|
1696
|
+
].filter(Boolean).join(":");
|
|
1697
|
+
return { key: key || `${eventType}:${text}`, text };
|
|
1698
|
+
}
|
|
1699
|
+
if (eventType === "response.done" && isRecord2(event.response) && Array.isArray(event.response.output)) {
|
|
1700
|
+
const text = readAssistantTextFromResponseOutput(event.response.output);
|
|
1701
|
+
if (!text) {
|
|
1702
|
+
return null;
|
|
1703
|
+
}
|
|
1704
|
+
const responseId = typeof event.response.id === "string" ? event.response.id : "";
|
|
1705
|
+
return { key: responseId ? `response.done:${responseId}` : `response.done:${text}`, text };
|
|
1706
|
+
}
|
|
1707
|
+
return null;
|
|
1708
|
+
}
|
|
1709
|
+
function audioUrlFromSynthesis(result) {
|
|
1710
|
+
const binary = atob(result.audioBase64);
|
|
1711
|
+
const bytes = new Uint8Array(binary.length);
|
|
1712
|
+
for (let index = 0; index < binary.length; index += 1) {
|
|
1713
|
+
bytes[index] = binary.charCodeAt(index);
|
|
1714
|
+
}
|
|
1715
|
+
return URL.createObjectURL(new Blob([bytes], { type: result.mimeType }));
|
|
1716
|
+
}
|
|
1624
1717
|
function useWebVoiceAgent(options) {
|
|
1625
1718
|
const sessionRef = (0, import_react.useRef)(null);
|
|
1626
1719
|
const attachedRealtimeSessionRef = (0, import_react.useRef)(null);
|
|
1720
|
+
const speechProviderRef = (0, import_react.useRef)("openai");
|
|
1721
|
+
const spokenAssistantKeysRef = (0, import_react.useRef)(/* @__PURE__ */ new Set());
|
|
1722
|
+
const playbackGenerationRef = (0, import_react.useRef)(0);
|
|
1723
|
+
const activePlaybackRef = (0, import_react.useRef)(null);
|
|
1627
1724
|
const runtimeConfigPromise = (0, import_react.useMemo)(
|
|
1628
1725
|
() => resolveNavaiFrontendRuntimeConfig({
|
|
1629
1726
|
moduleLoaders: options.moduleLoaders,
|
|
@@ -1661,6 +1758,69 @@ function useWebVoiceAgent(options) {
|
|
|
1661
1758
|
const setAgentVoiceStateIfChanged = (0, import_react.useCallback)((next) => {
|
|
1662
1759
|
setAgentVoiceState((current) => current === next ? current : next);
|
|
1663
1760
|
}, []);
|
|
1761
|
+
const clearPlayback = (0, import_react.useCallback)(
|
|
1762
|
+
(options2) => {
|
|
1763
|
+
if (options2?.invalidate) {
|
|
1764
|
+
playbackGenerationRef.current += 1;
|
|
1765
|
+
}
|
|
1766
|
+
const active = activePlaybackRef.current;
|
|
1767
|
+
if (active) {
|
|
1768
|
+
try {
|
|
1769
|
+
active.audio.pause();
|
|
1770
|
+
active.audio.currentTime = 0;
|
|
1771
|
+
} catch {
|
|
1772
|
+
}
|
|
1773
|
+
URL.revokeObjectURL(active.url);
|
|
1774
|
+
}
|
|
1775
|
+
activePlaybackRef.current = null;
|
|
1776
|
+
if (options2?.resetState !== false) {
|
|
1777
|
+
setAgentVoiceStateIfChanged("idle");
|
|
1778
|
+
}
|
|
1779
|
+
},
|
|
1780
|
+
[setAgentVoiceStateIfChanged]
|
|
1781
|
+
);
|
|
1782
|
+
const playAssistantSpeech = (0, import_react.useCallback)(
|
|
1783
|
+
async (text) => {
|
|
1784
|
+
if (speechProviderRef.current !== "elevenlabs") {
|
|
1785
|
+
return;
|
|
1786
|
+
}
|
|
1787
|
+
const normalized = text.trim();
|
|
1788
|
+
if (!normalized) {
|
|
1789
|
+
return;
|
|
1790
|
+
}
|
|
1791
|
+
clearPlayback({ resetState: false });
|
|
1792
|
+
const generation = playbackGenerationRef.current + 1;
|
|
1793
|
+
playbackGenerationRef.current = generation;
|
|
1794
|
+
setAgentVoiceStateIfChanged("speaking");
|
|
1795
|
+
try {
|
|
1796
|
+
const synthesized = await backendClient.synthesizeSpeech({ text: normalized });
|
|
1797
|
+
if (speechProviderRef.current !== "elevenlabs" || playbackGenerationRef.current !== generation) {
|
|
1798
|
+
return;
|
|
1799
|
+
}
|
|
1800
|
+
const audio = new Audio();
|
|
1801
|
+
const url = audioUrlFromSynthesis(synthesized);
|
|
1802
|
+
audio.src = url;
|
|
1803
|
+
audio.autoplay = false;
|
|
1804
|
+
activePlaybackRef.current = { audio, url };
|
|
1805
|
+
const finish = () => {
|
|
1806
|
+
if (activePlaybackRef.current?.audio === audio) {
|
|
1807
|
+
clearPlayback({ resetState: true });
|
|
1808
|
+
} else {
|
|
1809
|
+
URL.revokeObjectURL(url);
|
|
1810
|
+
}
|
|
1811
|
+
};
|
|
1812
|
+
audio.addEventListener("ended", finish, { once: true });
|
|
1813
|
+
audio.addEventListener("error", finish, { once: true });
|
|
1814
|
+
await audio.play();
|
|
1815
|
+
} catch (playbackError) {
|
|
1816
|
+
debugLog2("assistant speech playback failed", playbackError);
|
|
1817
|
+
if (playbackGenerationRef.current === generation) {
|
|
1818
|
+
clearPlayback({ resetState: true });
|
|
1819
|
+
}
|
|
1820
|
+
}
|
|
1821
|
+
},
|
|
1822
|
+
[backendClient, clearPlayback, setAgentVoiceStateIfChanged]
|
|
1823
|
+
);
|
|
1664
1824
|
const handleSessionAudioStart = (0, import_react.useCallback)(() => {
|
|
1665
1825
|
setAgentVoiceStateIfChanged("speaking");
|
|
1666
1826
|
}, [setAgentVoiceStateIfChanged]);
|
|
@@ -1668,11 +1828,35 @@ function useWebVoiceAgent(options) {
|
|
|
1668
1828
|
setAgentVoiceStateIfChanged("idle");
|
|
1669
1829
|
}, [setAgentVoiceStateIfChanged]);
|
|
1670
1830
|
const handleSessionAudioInterrupted = (0, import_react.useCallback)(() => {
|
|
1831
|
+
clearPlayback({ invalidate: true, resetState: true });
|
|
1671
1832
|
setAgentVoiceStateIfChanged("idle");
|
|
1672
|
-
}, [setAgentVoiceStateIfChanged]);
|
|
1833
|
+
}, [clearPlayback, setAgentVoiceStateIfChanged]);
|
|
1673
1834
|
const handleSessionError = (0, import_react.useCallback)(() => {
|
|
1835
|
+
clearPlayback({ invalidate: true, resetState: true });
|
|
1674
1836
|
setAgentVoiceStateIfChanged("idle");
|
|
1675
|
-
}, [setAgentVoiceStateIfChanged]);
|
|
1837
|
+
}, [clearPlayback, setAgentVoiceStateIfChanged]);
|
|
1838
|
+
const handleTransportEvent = (0, import_react.useCallback)(
|
|
1839
|
+
(event) => {
|
|
1840
|
+
const eventType = readRealtimeEventType(event);
|
|
1841
|
+
if (!eventType) {
|
|
1842
|
+
return;
|
|
1843
|
+
}
|
|
1844
|
+
if (eventType === "input_audio_buffer.speech_started" || eventType === "conversation.item.input_audio_transcription.started") {
|
|
1845
|
+
clearPlayback({ invalidate: true, resetState: true });
|
|
1846
|
+
return;
|
|
1847
|
+
}
|
|
1848
|
+
if (speechProviderRef.current !== "elevenlabs") {
|
|
1849
|
+
return;
|
|
1850
|
+
}
|
|
1851
|
+
const assistantText = extractAssistantTextFromRealtimeEvent(event);
|
|
1852
|
+
if (!assistantText || spokenAssistantKeysRef.current.has(assistantText.key)) {
|
|
1853
|
+
return;
|
|
1854
|
+
}
|
|
1855
|
+
spokenAssistantKeysRef.current.add(assistantText.key);
|
|
1856
|
+
void playAssistantSpeech(assistantText.text);
|
|
1857
|
+
},
|
|
1858
|
+
[clearPlayback, playAssistantSpeech]
|
|
1859
|
+
);
|
|
1676
1860
|
const detachSessionAudioListeners = (0, import_react.useCallback)(() => {
|
|
1677
1861
|
const attachedSession = attachedRealtimeSessionRef.current;
|
|
1678
1862
|
if (!attachedSession) {
|
|
@@ -1681,9 +1865,16 @@ function useWebVoiceAgent(options) {
|
|
|
1681
1865
|
attachedSession.off("audio_start", handleSessionAudioStart);
|
|
1682
1866
|
attachedSession.off("audio_stopped", handleSessionAudioStopped);
|
|
1683
1867
|
attachedSession.off("audio_interrupted", handleSessionAudioInterrupted);
|
|
1868
|
+
attachedSession.off("transport_event", handleTransportEvent);
|
|
1684
1869
|
attachedSession.off("error", handleSessionError);
|
|
1685
1870
|
attachedRealtimeSessionRef.current = null;
|
|
1686
|
-
}, [
|
|
1871
|
+
}, [
|
|
1872
|
+
handleSessionAudioInterrupted,
|
|
1873
|
+
handleSessionAudioStart,
|
|
1874
|
+
handleSessionAudioStopped,
|
|
1875
|
+
handleSessionError,
|
|
1876
|
+
handleTransportEvent
|
|
1877
|
+
]);
|
|
1687
1878
|
const attachSessionAudioListeners = (0, import_react.useCallback)(
|
|
1688
1879
|
(session) => {
|
|
1689
1880
|
detachSessionAudioListeners();
|
|
@@ -1723,6 +1914,7 @@ function useWebVoiceAgent(options) {
|
|
|
1723
1914
|
session.on("history_added", (item) => {
|
|
1724
1915
|
debugLog2("session history_added", item);
|
|
1725
1916
|
});
|
|
1917
|
+
session.on("transport_event", handleTransportEvent);
|
|
1726
1918
|
session.on("error", (sessionError) => {
|
|
1727
1919
|
debugLog2("session error", sessionError);
|
|
1728
1920
|
});
|
|
@@ -1737,19 +1929,23 @@ function useWebVoiceAgent(options) {
|
|
|
1737
1929
|
handleSessionAudioInterrupted,
|
|
1738
1930
|
handleSessionAudioStart,
|
|
1739
1931
|
handleSessionAudioStopped,
|
|
1740
|
-
handleSessionError
|
|
1932
|
+
handleSessionError,
|
|
1933
|
+
handleTransportEvent
|
|
1741
1934
|
]
|
|
1742
1935
|
);
|
|
1743
1936
|
const stop = (0, import_react.useCallback)(() => {
|
|
1744
1937
|
detachSessionAudioListeners();
|
|
1938
|
+
clearPlayback({ invalidate: true, resetState: true });
|
|
1745
1939
|
try {
|
|
1746
1940
|
sessionRef.current?.close();
|
|
1747
1941
|
} finally {
|
|
1748
1942
|
sessionRef.current = null;
|
|
1943
|
+
spokenAssistantKeysRef.current.clear();
|
|
1944
|
+
speechProviderRef.current = "openai";
|
|
1749
1945
|
setStatus("idle");
|
|
1750
1946
|
setAgentVoiceStateIfChanged("idle");
|
|
1751
1947
|
}
|
|
1752
|
-
}, [detachSessionAudioListeners, setAgentVoiceStateIfChanged]);
|
|
1948
|
+
}, [clearPlayback, detachSessionAudioListeners, setAgentVoiceStateIfChanged]);
|
|
1753
1949
|
(0, import_react.useEffect)(() => {
|
|
1754
1950
|
return () => {
|
|
1755
1951
|
stop();
|
|
@@ -1777,6 +1973,9 @@ function useWebVoiceAgent(options) {
|
|
|
1777
1973
|
});
|
|
1778
1974
|
const requestPayload = runtimeConfig.modelOverride ? { model: runtimeConfig.modelOverride } : {};
|
|
1779
1975
|
const secretPayload = await backendClient.createClientSecret(requestPayload);
|
|
1976
|
+
speechProviderRef.current = secretPayload.speech.provider;
|
|
1977
|
+
spokenAssistantKeysRef.current.clear();
|
|
1978
|
+
clearPlayback({ invalidate: true, resetState: true });
|
|
1780
1979
|
const backendFunctionsResult = await backendClient.listFunctions();
|
|
1781
1980
|
const { agent, warnings } = await buildNavaiAgent({
|
|
1782
1981
|
navigate: options.navigate,
|
|
@@ -1788,7 +1987,11 @@ function useWebVoiceAgent(options) {
|
|
|
1788
1987
|
executeBackendFunction: backendClient.executeFunction
|
|
1789
1988
|
});
|
|
1790
1989
|
emitWarnings([...runtimeConfig.warnings, ...backendFunctionsResult.warnings, ...warnings]);
|
|
1791
|
-
const session = new import_realtime2.RealtimeSession(agent);
|
|
1990
|
+
const session = secretPayload.speech.provider === "elevenlabs" ? new import_realtime2.RealtimeSession(agent, {
|
|
1991
|
+
config: {
|
|
1992
|
+
outputModalities: ["text"]
|
|
1993
|
+
}
|
|
1994
|
+
}) : new import_realtime2.RealtimeSession(agent);
|
|
1792
1995
|
attachSessionAudioListeners(session);
|
|
1793
1996
|
if (runtimeConfig.modelOverride) {
|
|
1794
1997
|
await session.connect({ apiKey: secretPayload.value, model: runtimeConfig.modelOverride });
|
|
@@ -1804,6 +2007,9 @@ function useWebVoiceAgent(options) {
|
|
|
1804
2007
|
setStatus("error");
|
|
1805
2008
|
setAgentVoiceStateIfChanged("idle");
|
|
1806
2009
|
detachSessionAudioListeners();
|
|
2010
|
+
clearPlayback({ invalidate: true, resetState: true });
|
|
2011
|
+
spokenAssistantKeysRef.current.clear();
|
|
2012
|
+
speechProviderRef.current = "openai";
|
|
1807
2013
|
try {
|
|
1808
2014
|
sessionRef.current?.close();
|
|
1809
2015
|
} catch {
|
|
@@ -1813,6 +2019,7 @@ function useWebVoiceAgent(options) {
|
|
|
1813
2019
|
}, [
|
|
1814
2020
|
attachSessionAudioListeners,
|
|
1815
2021
|
backendClient,
|
|
2022
|
+
clearPlayback,
|
|
1816
2023
|
detachSessionAudioListeners,
|
|
1817
2024
|
options.navigate,
|
|
1818
2025
|
runtimeConfigPromise,
|
package/dist/index.d.cts
CHANGED
|
@@ -104,9 +104,32 @@ type CreateClientSecretInput = {
|
|
|
104
104
|
voiceTone?: string;
|
|
105
105
|
apiKey?: string;
|
|
106
106
|
};
|
|
107
|
+
type NavaiSpeechProvider = "openai" | "elevenlabs";
|
|
108
|
+
type NavaiBackendSpeechConfig = {
|
|
109
|
+
provider: NavaiSpeechProvider;
|
|
110
|
+
};
|
|
107
111
|
type CreateClientSecretOutput = {
|
|
108
112
|
value: string;
|
|
109
113
|
expires_at?: number;
|
|
114
|
+
speech: NavaiBackendSpeechConfig;
|
|
115
|
+
};
|
|
116
|
+
type SynthesizeSpeechInput = {
|
|
117
|
+
text: string;
|
|
118
|
+
voiceId?: string;
|
|
119
|
+
modelId?: string;
|
|
120
|
+
outputFormat?: string;
|
|
121
|
+
optimizeStreamingLatency?: number;
|
|
122
|
+
voiceSettings?: {
|
|
123
|
+
stability?: number;
|
|
124
|
+
similarityBoost?: number;
|
|
125
|
+
style?: number;
|
|
126
|
+
useSpeakerBoost?: boolean;
|
|
127
|
+
};
|
|
128
|
+
};
|
|
129
|
+
type SynthesizeSpeechOutput = {
|
|
130
|
+
provider: "elevenlabs";
|
|
131
|
+
mimeType: string;
|
|
132
|
+
audioBase64: string;
|
|
110
133
|
};
|
|
111
134
|
type BackendFunctionsResult = {
|
|
112
135
|
functions: NavaiBackendFunctionDefinition[];
|
|
@@ -119,9 +142,11 @@ type CreateNavaiBackendClientOptions = {
|
|
|
119
142
|
clientSecretPath?: string;
|
|
120
143
|
functionsListPath?: string;
|
|
121
144
|
functionsExecutePath?: string;
|
|
145
|
+
speechSynthesizePath?: string;
|
|
122
146
|
};
|
|
123
147
|
type NavaiBackendClient = {
|
|
124
148
|
createClientSecret: (input?: CreateClientSecretInput) => Promise<CreateClientSecretOutput>;
|
|
149
|
+
synthesizeSpeech: (input: SynthesizeSpeechInput) => Promise<SynthesizeSpeechOutput>;
|
|
125
150
|
listFunctions: () => Promise<BackendFunctionsResult>;
|
|
126
151
|
executeFunction: ExecuteNavaiBackendFunction;
|
|
127
152
|
};
|
|
@@ -253,4 +278,4 @@ type NavaiVoiceOrbDockMicIconProps = {
|
|
|
253
278
|
};
|
|
254
279
|
declare function NavaiVoiceOrbDockMicIcon({ isActive, size }: NavaiVoiceOrbDockMicIconProps): react_jsx_runtime.JSX.Element;
|
|
255
280
|
|
|
256
|
-
export { type BuildNavaiAgentOptions, type BuildNavaiAgentResult, type CreateNavaiBackendClientOptions, type ExecuteNavaiBackendFunction, type ExecuteNavaiBackendFunctionInput, type NavaiAgentModuleConfig, type NavaiBackendClient, type NavaiBackendFunctionDefinition, type NavaiFunctionContext, type NavaiFunctionDefinition, type NavaiFunctionModuleLoaders, type NavaiFunctionPayload, type NavaiFunctionsRegistry, NavaiHeroOrb, type NavaiHeroOrbProps, NavaiMiniOrbDock, type NavaiMiniOrbDockProps, type NavaiRoute, type NavaiRuntimeAgentConfig, NavaiVoiceHeroOrb, type NavaiVoiceHeroOrbProps, type NavaiVoiceOrbBaseProps, NavaiVoiceOrbDock, NavaiVoiceOrbDockMicIcon, type NavaiVoiceOrbDockProps, type NavaiVoiceOrbMessages, type NavaiVoiceOrbPlacement, type NavaiVoiceOrbRuntimeSnapshot, type NavaiVoiceOrbThemeMode, type NavaiWebVoiceAgentLike, Orb, type OrbProps, type ResolveNavaiFrontendRuntimeConfigOptions, type ResolveNavaiFrontendRuntimeConfigResult, type UseWebVoiceAgentOptions, type UseWebVoiceAgentResult, buildNavaiAgent, clampNavaiOrbDelayMs, createNavaiBackendClient, getNavaiRoutePromptLines, loadNavaiFunctions, resolveNavaiFrontendRuntimeConfig, resolveNavaiRoute, resolveNavaiVoiceOrbRuntimeSnapshot, useWebVoiceAgent };
|
|
281
|
+
export { type BuildNavaiAgentOptions, type BuildNavaiAgentResult, type CreateNavaiBackendClientOptions, type ExecuteNavaiBackendFunction, type ExecuteNavaiBackendFunctionInput, type NavaiAgentModuleConfig, type NavaiBackendClient, type NavaiBackendFunctionDefinition, type NavaiBackendSpeechConfig, type NavaiFunctionContext, type NavaiFunctionDefinition, type NavaiFunctionModuleLoaders, type NavaiFunctionPayload, type NavaiFunctionsRegistry, NavaiHeroOrb, type NavaiHeroOrbProps, NavaiMiniOrbDock, type NavaiMiniOrbDockProps, type NavaiRoute, type NavaiRuntimeAgentConfig, NavaiVoiceHeroOrb, type NavaiVoiceHeroOrbProps, type NavaiVoiceOrbBaseProps, NavaiVoiceOrbDock, NavaiVoiceOrbDockMicIcon, type NavaiVoiceOrbDockProps, type NavaiVoiceOrbMessages, type NavaiVoiceOrbPlacement, type NavaiVoiceOrbRuntimeSnapshot, type NavaiVoiceOrbThemeMode, type NavaiWebVoiceAgentLike, Orb, type OrbProps, type ResolveNavaiFrontendRuntimeConfigOptions, type ResolveNavaiFrontendRuntimeConfigResult, type UseWebVoiceAgentOptions, type UseWebVoiceAgentResult, buildNavaiAgent, clampNavaiOrbDelayMs, createNavaiBackendClient, getNavaiRoutePromptLines, loadNavaiFunctions, resolveNavaiFrontendRuntimeConfig, resolveNavaiRoute, resolveNavaiVoiceOrbRuntimeSnapshot, useWebVoiceAgent };
|
package/dist/index.d.ts
CHANGED
|
@@ -104,9 +104,32 @@ type CreateClientSecretInput = {
|
|
|
104
104
|
voiceTone?: string;
|
|
105
105
|
apiKey?: string;
|
|
106
106
|
};
|
|
107
|
+
type NavaiSpeechProvider = "openai" | "elevenlabs";
|
|
108
|
+
type NavaiBackendSpeechConfig = {
|
|
109
|
+
provider: NavaiSpeechProvider;
|
|
110
|
+
};
|
|
107
111
|
type CreateClientSecretOutput = {
|
|
108
112
|
value: string;
|
|
109
113
|
expires_at?: number;
|
|
114
|
+
speech: NavaiBackendSpeechConfig;
|
|
115
|
+
};
|
|
116
|
+
type SynthesizeSpeechInput = {
|
|
117
|
+
text: string;
|
|
118
|
+
voiceId?: string;
|
|
119
|
+
modelId?: string;
|
|
120
|
+
outputFormat?: string;
|
|
121
|
+
optimizeStreamingLatency?: number;
|
|
122
|
+
voiceSettings?: {
|
|
123
|
+
stability?: number;
|
|
124
|
+
similarityBoost?: number;
|
|
125
|
+
style?: number;
|
|
126
|
+
useSpeakerBoost?: boolean;
|
|
127
|
+
};
|
|
128
|
+
};
|
|
129
|
+
type SynthesizeSpeechOutput = {
|
|
130
|
+
provider: "elevenlabs";
|
|
131
|
+
mimeType: string;
|
|
132
|
+
audioBase64: string;
|
|
110
133
|
};
|
|
111
134
|
type BackendFunctionsResult = {
|
|
112
135
|
functions: NavaiBackendFunctionDefinition[];
|
|
@@ -119,9 +142,11 @@ type CreateNavaiBackendClientOptions = {
|
|
|
119
142
|
clientSecretPath?: string;
|
|
120
143
|
functionsListPath?: string;
|
|
121
144
|
functionsExecutePath?: string;
|
|
145
|
+
speechSynthesizePath?: string;
|
|
122
146
|
};
|
|
123
147
|
type NavaiBackendClient = {
|
|
124
148
|
createClientSecret: (input?: CreateClientSecretInput) => Promise<CreateClientSecretOutput>;
|
|
149
|
+
synthesizeSpeech: (input: SynthesizeSpeechInput) => Promise<SynthesizeSpeechOutput>;
|
|
125
150
|
listFunctions: () => Promise<BackendFunctionsResult>;
|
|
126
151
|
executeFunction: ExecuteNavaiBackendFunction;
|
|
127
152
|
};
|
|
@@ -253,4 +278,4 @@ type NavaiVoiceOrbDockMicIconProps = {
|
|
|
253
278
|
};
|
|
254
279
|
declare function NavaiVoiceOrbDockMicIcon({ isActive, size }: NavaiVoiceOrbDockMicIconProps): react_jsx_runtime.JSX.Element;
|
|
255
280
|
|
|
256
|
-
export { type BuildNavaiAgentOptions, type BuildNavaiAgentResult, type CreateNavaiBackendClientOptions, type ExecuteNavaiBackendFunction, type ExecuteNavaiBackendFunctionInput, type NavaiAgentModuleConfig, type NavaiBackendClient, type NavaiBackendFunctionDefinition, type NavaiFunctionContext, type NavaiFunctionDefinition, type NavaiFunctionModuleLoaders, type NavaiFunctionPayload, type NavaiFunctionsRegistry, NavaiHeroOrb, type NavaiHeroOrbProps, NavaiMiniOrbDock, type NavaiMiniOrbDockProps, type NavaiRoute, type NavaiRuntimeAgentConfig, NavaiVoiceHeroOrb, type NavaiVoiceHeroOrbProps, type NavaiVoiceOrbBaseProps, NavaiVoiceOrbDock, NavaiVoiceOrbDockMicIcon, type NavaiVoiceOrbDockProps, type NavaiVoiceOrbMessages, type NavaiVoiceOrbPlacement, type NavaiVoiceOrbRuntimeSnapshot, type NavaiVoiceOrbThemeMode, type NavaiWebVoiceAgentLike, Orb, type OrbProps, type ResolveNavaiFrontendRuntimeConfigOptions, type ResolveNavaiFrontendRuntimeConfigResult, type UseWebVoiceAgentOptions, type UseWebVoiceAgentResult, buildNavaiAgent, clampNavaiOrbDelayMs, createNavaiBackendClient, getNavaiRoutePromptLines, loadNavaiFunctions, resolveNavaiFrontendRuntimeConfig, resolveNavaiRoute, resolveNavaiVoiceOrbRuntimeSnapshot, useWebVoiceAgent };
|
|
281
|
+
export { type BuildNavaiAgentOptions, type BuildNavaiAgentResult, type CreateNavaiBackendClientOptions, type ExecuteNavaiBackendFunction, type ExecuteNavaiBackendFunctionInput, type NavaiAgentModuleConfig, type NavaiBackendClient, type NavaiBackendFunctionDefinition, type NavaiBackendSpeechConfig, type NavaiFunctionContext, type NavaiFunctionDefinition, type NavaiFunctionModuleLoaders, type NavaiFunctionPayload, type NavaiFunctionsRegistry, NavaiHeroOrb, type NavaiHeroOrbProps, NavaiMiniOrbDock, type NavaiMiniOrbDockProps, type NavaiRoute, type NavaiRuntimeAgentConfig, NavaiVoiceHeroOrb, type NavaiVoiceHeroOrbProps, type NavaiVoiceOrbBaseProps, NavaiVoiceOrbDock, NavaiVoiceOrbDockMicIcon, type NavaiVoiceOrbDockProps, type NavaiVoiceOrbMessages, type NavaiVoiceOrbPlacement, type NavaiVoiceOrbRuntimeSnapshot, type NavaiVoiceOrbThemeMode, type NavaiWebVoiceAgentLike, Orb, type OrbProps, type ResolveNavaiFrontendRuntimeConfigOptions, type ResolveNavaiFrontendRuntimeConfigResult, type UseWebVoiceAgentOptions, type UseWebVoiceAgentResult, buildNavaiAgent, clampNavaiOrbDelayMs, createNavaiBackendClient, getNavaiRoutePromptLines, loadNavaiFunctions, resolveNavaiFrontendRuntimeConfig, resolveNavaiRoute, resolveNavaiVoiceOrbRuntimeSnapshot, useWebVoiceAgent };
|
package/dist/index.js
CHANGED
|
@@ -577,6 +577,7 @@ var DEFAULT_API_BASE_URL = "http://localhost:3000";
|
|
|
577
577
|
var DEFAULT_CLIENT_SECRET_PATH = "/navai/realtime/client-secret";
|
|
578
578
|
var DEFAULT_FUNCTIONS_LIST_PATH = "/navai/functions";
|
|
579
579
|
var DEFAULT_FUNCTIONS_EXECUTE_PATH = "/navai/functions/execute";
|
|
580
|
+
var DEFAULT_SPEECH_SYNTHESIZE_PATH = "/navai/speech/synthesize";
|
|
580
581
|
function readOptional(value) {
|
|
581
582
|
const trimmed = value?.trim();
|
|
582
583
|
return trimmed ? trimmed : void 0;
|
|
@@ -589,6 +590,12 @@ function joinUrl(baseUrl, path) {
|
|
|
589
590
|
function isRecord(value) {
|
|
590
591
|
return Boolean(value && typeof value === "object");
|
|
591
592
|
}
|
|
593
|
+
function readSpeechConfig(payload) {
|
|
594
|
+
if (isRecord(payload) && isRecord(payload.speech) && payload.speech.provider === "elevenlabs") {
|
|
595
|
+
return { provider: "elevenlabs" };
|
|
596
|
+
}
|
|
597
|
+
return { provider: "openai" };
|
|
598
|
+
}
|
|
592
599
|
async function readTextSafe(response) {
|
|
593
600
|
try {
|
|
594
601
|
return await response.text();
|
|
@@ -609,6 +616,7 @@ function createNavaiBackendClient(options = {}) {
|
|
|
609
616
|
const clientSecretUrl = joinUrl(apiBaseUrl, options.clientSecretPath ?? DEFAULT_CLIENT_SECRET_PATH);
|
|
610
617
|
const functionsListUrl = joinUrl(apiBaseUrl, options.functionsListPath ?? DEFAULT_FUNCTIONS_LIST_PATH);
|
|
611
618
|
const functionsExecuteUrl = joinUrl(apiBaseUrl, options.functionsExecutePath ?? DEFAULT_FUNCTIONS_EXECUTE_PATH);
|
|
619
|
+
const speechSynthesizeUrl = joinUrl(apiBaseUrl, options.speechSynthesizePath ?? DEFAULT_SPEECH_SYNTHESIZE_PATH);
|
|
612
620
|
async function createClientSecret(input = {}) {
|
|
613
621
|
const response = await fetchImpl(clientSecretUrl, {
|
|
614
622
|
method: "POST",
|
|
@@ -624,7 +632,27 @@ function createNavaiBackendClient(options = {}) {
|
|
|
624
632
|
}
|
|
625
633
|
return {
|
|
626
634
|
value: payload.value,
|
|
627
|
-
expires_at: typeof payload.expires_at === "number" ? payload.expires_at : void 0
|
|
635
|
+
expires_at: typeof payload.expires_at === "number" ? payload.expires_at : void 0,
|
|
636
|
+
speech: readSpeechConfig(payload)
|
|
637
|
+
};
|
|
638
|
+
}
|
|
639
|
+
async function synthesizeSpeech(input) {
|
|
640
|
+
const response = await fetchImpl(speechSynthesizeUrl, {
|
|
641
|
+
method: "POST",
|
|
642
|
+
headers: { "Content-Type": "application/json" },
|
|
643
|
+
body: JSON.stringify(input)
|
|
644
|
+
});
|
|
645
|
+
if (!response.ok) {
|
|
646
|
+
throw new Error(await readTextSafe(response));
|
|
647
|
+
}
|
|
648
|
+
const payload = await readJsonSafe(response);
|
|
649
|
+
if (!isRecord(payload) || payload.provider !== "elevenlabs" || typeof payload.mimeType !== "string" || typeof payload.audioBase64 !== "string") {
|
|
650
|
+
throw new Error("Invalid speech synthesis response.");
|
|
651
|
+
}
|
|
652
|
+
return {
|
|
653
|
+
provider: "elevenlabs",
|
|
654
|
+
mimeType: payload.mimeType,
|
|
655
|
+
audioBase64: payload.audioBase64
|
|
628
656
|
};
|
|
629
657
|
}
|
|
630
658
|
async function listFunctions() {
|
|
@@ -684,6 +712,7 @@ function createNavaiBackendClient(options = {}) {
|
|
|
684
712
|
};
|
|
685
713
|
return {
|
|
686
714
|
createClientSecret,
|
|
715
|
+
synthesizeSpeech,
|
|
687
716
|
listFunctions,
|
|
688
717
|
executeFunction
|
|
689
718
|
};
|
|
@@ -1045,9 +1074,77 @@ function debugLog2(message, details) {
|
|
|
1045
1074
|
}
|
|
1046
1075
|
console.log(`${DEBUG_PREFIX2} ${message}`, details);
|
|
1047
1076
|
}
|
|
1077
|
+
function isRecord2(value) {
|
|
1078
|
+
return Boolean(value && typeof value === "object");
|
|
1079
|
+
}
|
|
1080
|
+
function readRealtimeEventType(event) {
|
|
1081
|
+
if (!isRecord2(event) || typeof event.type !== "string") {
|
|
1082
|
+
return "";
|
|
1083
|
+
}
|
|
1084
|
+
return event.type.trim().toLowerCase();
|
|
1085
|
+
}
|
|
1086
|
+
function readAssistantTextFromResponseOutput(items) {
|
|
1087
|
+
const parts = [];
|
|
1088
|
+
for (const item of items) {
|
|
1089
|
+
if (!isRecord2(item) || item.type !== "message" || item.role !== "assistant") {
|
|
1090
|
+
continue;
|
|
1091
|
+
}
|
|
1092
|
+
const content = Array.isArray(item.content) ? item.content : [];
|
|
1093
|
+
for (const chunk of content) {
|
|
1094
|
+
if (!isRecord2(chunk)) {
|
|
1095
|
+
continue;
|
|
1096
|
+
}
|
|
1097
|
+
const text = chunk.type === "output_text" ? typeof chunk.text === "string" ? chunk.text : "" : chunk.type === "output_audio" ? typeof chunk.transcript === "string" ? chunk.transcript : "" : "";
|
|
1098
|
+
const normalized = text.trim();
|
|
1099
|
+
if (normalized) {
|
|
1100
|
+
parts.push(normalized);
|
|
1101
|
+
}
|
|
1102
|
+
}
|
|
1103
|
+
}
|
|
1104
|
+
return parts.join("\n").trim();
|
|
1105
|
+
}
|
|
1106
|
+
function extractAssistantTextFromRealtimeEvent(event) {
|
|
1107
|
+
if (!isRecord2(event)) {
|
|
1108
|
+
return null;
|
|
1109
|
+
}
|
|
1110
|
+
const eventType = readRealtimeEventType(event);
|
|
1111
|
+
if (eventType === "response.output_text.done" || eventType === "response.text.done" || eventType === "response.audio_transcript.done") {
|
|
1112
|
+
const text = typeof event.text === "string" ? event.text.trim() : typeof event.transcript === "string" ? event.transcript.trim() : "";
|
|
1113
|
+
if (!text) {
|
|
1114
|
+
return null;
|
|
1115
|
+
}
|
|
1116
|
+
const key = [
|
|
1117
|
+
eventType,
|
|
1118
|
+
typeof event.response_id === "string" ? event.response_id : "",
|
|
1119
|
+
typeof event.item_id === "string" ? event.item_id : ""
|
|
1120
|
+
].filter(Boolean).join(":");
|
|
1121
|
+
return { key: key || `${eventType}:${text}`, text };
|
|
1122
|
+
}
|
|
1123
|
+
if (eventType === "response.done" && isRecord2(event.response) && Array.isArray(event.response.output)) {
|
|
1124
|
+
const text = readAssistantTextFromResponseOutput(event.response.output);
|
|
1125
|
+
if (!text) {
|
|
1126
|
+
return null;
|
|
1127
|
+
}
|
|
1128
|
+
const responseId = typeof event.response.id === "string" ? event.response.id : "";
|
|
1129
|
+
return { key: responseId ? `response.done:${responseId}` : `response.done:${text}`, text };
|
|
1130
|
+
}
|
|
1131
|
+
return null;
|
|
1132
|
+
}
|
|
1133
|
+
function audioUrlFromSynthesis(result) {
|
|
1134
|
+
const binary = atob(result.audioBase64);
|
|
1135
|
+
const bytes = new Uint8Array(binary.length);
|
|
1136
|
+
for (let index = 0; index < binary.length; index += 1) {
|
|
1137
|
+
bytes[index] = binary.charCodeAt(index);
|
|
1138
|
+
}
|
|
1139
|
+
return URL.createObjectURL(new Blob([bytes], { type: result.mimeType }));
|
|
1140
|
+
}
|
|
1048
1141
|
function useWebVoiceAgent(options) {
|
|
1049
1142
|
const sessionRef = useRef(null);
|
|
1050
1143
|
const attachedRealtimeSessionRef = useRef(null);
|
|
1144
|
+
const speechProviderRef = useRef("openai");
|
|
1145
|
+
const spokenAssistantKeysRef = useRef(/* @__PURE__ */ new Set());
|
|
1146
|
+
const playbackGenerationRef = useRef(0);
|
|
1147
|
+
const activePlaybackRef = useRef(null);
|
|
1051
1148
|
const runtimeConfigPromise = useMemo(
|
|
1052
1149
|
() => resolveNavaiFrontendRuntimeConfig({
|
|
1053
1150
|
moduleLoaders: options.moduleLoaders,
|
|
@@ -1085,6 +1182,69 @@ function useWebVoiceAgent(options) {
|
|
|
1085
1182
|
const setAgentVoiceStateIfChanged = useCallback((next) => {
|
|
1086
1183
|
setAgentVoiceState((current) => current === next ? current : next);
|
|
1087
1184
|
}, []);
|
|
1185
|
+
const clearPlayback = useCallback(
|
|
1186
|
+
(options2) => {
|
|
1187
|
+
if (options2?.invalidate) {
|
|
1188
|
+
playbackGenerationRef.current += 1;
|
|
1189
|
+
}
|
|
1190
|
+
const active = activePlaybackRef.current;
|
|
1191
|
+
if (active) {
|
|
1192
|
+
try {
|
|
1193
|
+
active.audio.pause();
|
|
1194
|
+
active.audio.currentTime = 0;
|
|
1195
|
+
} catch {
|
|
1196
|
+
}
|
|
1197
|
+
URL.revokeObjectURL(active.url);
|
|
1198
|
+
}
|
|
1199
|
+
activePlaybackRef.current = null;
|
|
1200
|
+
if (options2?.resetState !== false) {
|
|
1201
|
+
setAgentVoiceStateIfChanged("idle");
|
|
1202
|
+
}
|
|
1203
|
+
},
|
|
1204
|
+
[setAgentVoiceStateIfChanged]
|
|
1205
|
+
);
|
|
1206
|
+
const playAssistantSpeech = useCallback(
|
|
1207
|
+
async (text) => {
|
|
1208
|
+
if (speechProviderRef.current !== "elevenlabs") {
|
|
1209
|
+
return;
|
|
1210
|
+
}
|
|
1211
|
+
const normalized = text.trim();
|
|
1212
|
+
if (!normalized) {
|
|
1213
|
+
return;
|
|
1214
|
+
}
|
|
1215
|
+
clearPlayback({ resetState: false });
|
|
1216
|
+
const generation = playbackGenerationRef.current + 1;
|
|
1217
|
+
playbackGenerationRef.current = generation;
|
|
1218
|
+
setAgentVoiceStateIfChanged("speaking");
|
|
1219
|
+
try {
|
|
1220
|
+
const synthesized = await backendClient.synthesizeSpeech({ text: normalized });
|
|
1221
|
+
if (speechProviderRef.current !== "elevenlabs" || playbackGenerationRef.current !== generation) {
|
|
1222
|
+
return;
|
|
1223
|
+
}
|
|
1224
|
+
const audio = new Audio();
|
|
1225
|
+
const url = audioUrlFromSynthesis(synthesized);
|
|
1226
|
+
audio.src = url;
|
|
1227
|
+
audio.autoplay = false;
|
|
1228
|
+
activePlaybackRef.current = { audio, url };
|
|
1229
|
+
const finish = () => {
|
|
1230
|
+
if (activePlaybackRef.current?.audio === audio) {
|
|
1231
|
+
clearPlayback({ resetState: true });
|
|
1232
|
+
} else {
|
|
1233
|
+
URL.revokeObjectURL(url);
|
|
1234
|
+
}
|
|
1235
|
+
};
|
|
1236
|
+
audio.addEventListener("ended", finish, { once: true });
|
|
1237
|
+
audio.addEventListener("error", finish, { once: true });
|
|
1238
|
+
await audio.play();
|
|
1239
|
+
} catch (playbackError) {
|
|
1240
|
+
debugLog2("assistant speech playback failed", playbackError);
|
|
1241
|
+
if (playbackGenerationRef.current === generation) {
|
|
1242
|
+
clearPlayback({ resetState: true });
|
|
1243
|
+
}
|
|
1244
|
+
}
|
|
1245
|
+
},
|
|
1246
|
+
[backendClient, clearPlayback, setAgentVoiceStateIfChanged]
|
|
1247
|
+
);
|
|
1088
1248
|
const handleSessionAudioStart = useCallback(() => {
|
|
1089
1249
|
setAgentVoiceStateIfChanged("speaking");
|
|
1090
1250
|
}, [setAgentVoiceStateIfChanged]);
|
|
@@ -1092,11 +1252,35 @@ function useWebVoiceAgent(options) {
|
|
|
1092
1252
|
setAgentVoiceStateIfChanged("idle");
|
|
1093
1253
|
}, [setAgentVoiceStateIfChanged]);
|
|
1094
1254
|
const handleSessionAudioInterrupted = useCallback(() => {
|
|
1255
|
+
clearPlayback({ invalidate: true, resetState: true });
|
|
1095
1256
|
setAgentVoiceStateIfChanged("idle");
|
|
1096
|
-
}, [setAgentVoiceStateIfChanged]);
|
|
1257
|
+
}, [clearPlayback, setAgentVoiceStateIfChanged]);
|
|
1097
1258
|
const handleSessionError = useCallback(() => {
|
|
1259
|
+
clearPlayback({ invalidate: true, resetState: true });
|
|
1098
1260
|
setAgentVoiceStateIfChanged("idle");
|
|
1099
|
-
}, [setAgentVoiceStateIfChanged]);
|
|
1261
|
+
}, [clearPlayback, setAgentVoiceStateIfChanged]);
|
|
1262
|
+
const handleTransportEvent = useCallback(
|
|
1263
|
+
(event) => {
|
|
1264
|
+
const eventType = readRealtimeEventType(event);
|
|
1265
|
+
if (!eventType) {
|
|
1266
|
+
return;
|
|
1267
|
+
}
|
|
1268
|
+
if (eventType === "input_audio_buffer.speech_started" || eventType === "conversation.item.input_audio_transcription.started") {
|
|
1269
|
+
clearPlayback({ invalidate: true, resetState: true });
|
|
1270
|
+
return;
|
|
1271
|
+
}
|
|
1272
|
+
if (speechProviderRef.current !== "elevenlabs") {
|
|
1273
|
+
return;
|
|
1274
|
+
}
|
|
1275
|
+
const assistantText = extractAssistantTextFromRealtimeEvent(event);
|
|
1276
|
+
if (!assistantText || spokenAssistantKeysRef.current.has(assistantText.key)) {
|
|
1277
|
+
return;
|
|
1278
|
+
}
|
|
1279
|
+
spokenAssistantKeysRef.current.add(assistantText.key);
|
|
1280
|
+
void playAssistantSpeech(assistantText.text);
|
|
1281
|
+
},
|
|
1282
|
+
[clearPlayback, playAssistantSpeech]
|
|
1283
|
+
);
|
|
1100
1284
|
const detachSessionAudioListeners = useCallback(() => {
|
|
1101
1285
|
const attachedSession = attachedRealtimeSessionRef.current;
|
|
1102
1286
|
if (!attachedSession) {
|
|
@@ -1105,9 +1289,16 @@ function useWebVoiceAgent(options) {
|
|
|
1105
1289
|
attachedSession.off("audio_start", handleSessionAudioStart);
|
|
1106
1290
|
attachedSession.off("audio_stopped", handleSessionAudioStopped);
|
|
1107
1291
|
attachedSession.off("audio_interrupted", handleSessionAudioInterrupted);
|
|
1292
|
+
attachedSession.off("transport_event", handleTransportEvent);
|
|
1108
1293
|
attachedSession.off("error", handleSessionError);
|
|
1109
1294
|
attachedRealtimeSessionRef.current = null;
|
|
1110
|
-
}, [
|
|
1295
|
+
}, [
|
|
1296
|
+
handleSessionAudioInterrupted,
|
|
1297
|
+
handleSessionAudioStart,
|
|
1298
|
+
handleSessionAudioStopped,
|
|
1299
|
+
handleSessionError,
|
|
1300
|
+
handleTransportEvent
|
|
1301
|
+
]);
|
|
1111
1302
|
const attachSessionAudioListeners = useCallback(
|
|
1112
1303
|
(session) => {
|
|
1113
1304
|
detachSessionAudioListeners();
|
|
@@ -1147,6 +1338,7 @@ function useWebVoiceAgent(options) {
|
|
|
1147
1338
|
session.on("history_added", (item) => {
|
|
1148
1339
|
debugLog2("session history_added", item);
|
|
1149
1340
|
});
|
|
1341
|
+
session.on("transport_event", handleTransportEvent);
|
|
1150
1342
|
session.on("error", (sessionError) => {
|
|
1151
1343
|
debugLog2("session error", sessionError);
|
|
1152
1344
|
});
|
|
@@ -1161,19 +1353,23 @@ function useWebVoiceAgent(options) {
|
|
|
1161
1353
|
handleSessionAudioInterrupted,
|
|
1162
1354
|
handleSessionAudioStart,
|
|
1163
1355
|
handleSessionAudioStopped,
|
|
1164
|
-
handleSessionError
|
|
1356
|
+
handleSessionError,
|
|
1357
|
+
handleTransportEvent
|
|
1165
1358
|
]
|
|
1166
1359
|
);
|
|
1167
1360
|
const stop = useCallback(() => {
|
|
1168
1361
|
detachSessionAudioListeners();
|
|
1362
|
+
clearPlayback({ invalidate: true, resetState: true });
|
|
1169
1363
|
try {
|
|
1170
1364
|
sessionRef.current?.close();
|
|
1171
1365
|
} finally {
|
|
1172
1366
|
sessionRef.current = null;
|
|
1367
|
+
spokenAssistantKeysRef.current.clear();
|
|
1368
|
+
speechProviderRef.current = "openai";
|
|
1173
1369
|
setStatus("idle");
|
|
1174
1370
|
setAgentVoiceStateIfChanged("idle");
|
|
1175
1371
|
}
|
|
1176
|
-
}, [detachSessionAudioListeners, setAgentVoiceStateIfChanged]);
|
|
1372
|
+
}, [clearPlayback, detachSessionAudioListeners, setAgentVoiceStateIfChanged]);
|
|
1177
1373
|
useEffect(() => {
|
|
1178
1374
|
return () => {
|
|
1179
1375
|
stop();
|
|
@@ -1201,6 +1397,9 @@ function useWebVoiceAgent(options) {
|
|
|
1201
1397
|
});
|
|
1202
1398
|
const requestPayload = runtimeConfig.modelOverride ? { model: runtimeConfig.modelOverride } : {};
|
|
1203
1399
|
const secretPayload = await backendClient.createClientSecret(requestPayload);
|
|
1400
|
+
speechProviderRef.current = secretPayload.speech.provider;
|
|
1401
|
+
spokenAssistantKeysRef.current.clear();
|
|
1402
|
+
clearPlayback({ invalidate: true, resetState: true });
|
|
1204
1403
|
const backendFunctionsResult = await backendClient.listFunctions();
|
|
1205
1404
|
const { agent, warnings } = await buildNavaiAgent({
|
|
1206
1405
|
navigate: options.navigate,
|
|
@@ -1212,7 +1411,11 @@ function useWebVoiceAgent(options) {
|
|
|
1212
1411
|
executeBackendFunction: backendClient.executeFunction
|
|
1213
1412
|
});
|
|
1214
1413
|
emitWarnings([...runtimeConfig.warnings, ...backendFunctionsResult.warnings, ...warnings]);
|
|
1215
|
-
const session = new RealtimeSession(agent
|
|
1414
|
+
const session = secretPayload.speech.provider === "elevenlabs" ? new RealtimeSession(agent, {
|
|
1415
|
+
config: {
|
|
1416
|
+
outputModalities: ["text"]
|
|
1417
|
+
}
|
|
1418
|
+
}) : new RealtimeSession(agent);
|
|
1216
1419
|
attachSessionAudioListeners(session);
|
|
1217
1420
|
if (runtimeConfig.modelOverride) {
|
|
1218
1421
|
await session.connect({ apiKey: secretPayload.value, model: runtimeConfig.modelOverride });
|
|
@@ -1228,6 +1431,9 @@ function useWebVoiceAgent(options) {
|
|
|
1228
1431
|
setStatus("error");
|
|
1229
1432
|
setAgentVoiceStateIfChanged("idle");
|
|
1230
1433
|
detachSessionAudioListeners();
|
|
1434
|
+
clearPlayback({ invalidate: true, resetState: true });
|
|
1435
|
+
spokenAssistantKeysRef.current.clear();
|
|
1436
|
+
speechProviderRef.current = "openai";
|
|
1231
1437
|
try {
|
|
1232
1438
|
sessionRef.current?.close();
|
|
1233
1439
|
} catch {
|
|
@@ -1237,6 +1443,7 @@ function useWebVoiceAgent(options) {
|
|
|
1237
1443
|
}, [
|
|
1238
1444
|
attachSessionAudioListeners,
|
|
1239
1445
|
backendClient,
|
|
1446
|
+
clearPlayback,
|
|
1240
1447
|
detachSessionAudioListeners,
|
|
1241
1448
|
options.navigate,
|
|
1242
1449
|
runtimeConfigPromise,
|