@guidekit/core 0.1.0-beta.1 → 0.1.0-beta.3
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +1 -1
- package/dist/index.cjs +1443 -802
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +544 -127
- package/dist/index.d.ts +544 -127
- package/dist/index.js +1441 -802
- package/dist/index.js.map +1 -1
- package/package.json +37 -14
package/dist/index.d.cts
CHANGED
|
@@ -351,8 +351,14 @@ type STTConfig = {
|
|
|
351
351
|
apiKey: string;
|
|
352
352
|
model?: 'nova-2' | 'nova-3';
|
|
353
353
|
} | {
|
|
354
|
-
provider: '
|
|
354
|
+
provider: 'elevenlabs';
|
|
355
355
|
apiKey: string;
|
|
356
|
+
language?: string;
|
|
357
|
+
} | {
|
|
358
|
+
provider: 'web-speech';
|
|
359
|
+
language?: string;
|
|
360
|
+
continuous?: boolean;
|
|
361
|
+
interimResults?: boolean;
|
|
356
362
|
};
|
|
357
363
|
/** Text-to-speech provider configuration. */
|
|
358
364
|
type TTSConfig = {
|
|
@@ -360,19 +366,26 @@ type TTSConfig = {
|
|
|
360
366
|
apiKey: string;
|
|
361
367
|
voiceId?: string;
|
|
362
368
|
} | {
|
|
363
|
-
provider: '
|
|
364
|
-
|
|
365
|
-
|
|
369
|
+
provider: 'web-speech';
|
|
370
|
+
voice?: string;
|
|
371
|
+
rate?: number;
|
|
372
|
+
pitch?: number;
|
|
373
|
+
language?: string;
|
|
366
374
|
};
|
|
375
|
+
/** Transcript event emitted by any STT adapter. */
|
|
376
|
+
interface STTTranscriptEvent {
|
|
377
|
+
text: string;
|
|
378
|
+
isFinal: boolean;
|
|
379
|
+
confidence: number;
|
|
380
|
+
timestamp: number;
|
|
381
|
+
}
|
|
367
382
|
/** Large language model provider configuration. */
|
|
368
383
|
type LLMConfig = {
|
|
369
384
|
provider: 'gemini';
|
|
370
385
|
apiKey: string;
|
|
371
386
|
model?: 'gemini-2.5-flash' | 'gemini-2.5-pro';
|
|
372
387
|
} | {
|
|
373
|
-
|
|
374
|
-
apiKey: string;
|
|
375
|
-
model?: 'gpt-4o' | 'gpt-4o-mini';
|
|
388
|
+
adapter: LLMProviderAdapter;
|
|
376
389
|
};
|
|
377
390
|
/** Top-level options that control SDK behaviour. */
|
|
378
391
|
interface GuideKitOptions {
|
|
@@ -481,11 +494,27 @@ interface GuideKitProviderProps {
|
|
|
481
494
|
}>;
|
|
482
495
|
children?: unknown;
|
|
483
496
|
}
|
|
497
|
+
/** A single JSON-Schema-style property descriptor used in tool parameter maps. */
|
|
498
|
+
interface ToolParameterSchema {
|
|
499
|
+
type: string;
|
|
500
|
+
description?: string;
|
|
501
|
+
enum?: string[];
|
|
502
|
+
items?: {
|
|
503
|
+
type: string;
|
|
504
|
+
};
|
|
505
|
+
[key: string]: unknown;
|
|
506
|
+
}
|
|
484
507
|
/** Definition of a tool that can be invoked by the LLM. */
|
|
485
508
|
interface ToolDefinition {
|
|
486
509
|
name: string;
|
|
487
510
|
description: string;
|
|
488
|
-
|
|
511
|
+
/** Flat map of param name → JSON Schema property descriptor. */
|
|
512
|
+
parameters: Record<string, ToolParameterSchema>;
|
|
513
|
+
/**
|
|
514
|
+
* List of parameter names the LLM must always provide.
|
|
515
|
+
* Omit or use [] for fully optional parameters.
|
|
516
|
+
*/
|
|
517
|
+
required?: string[];
|
|
489
518
|
schemaVersion: number;
|
|
490
519
|
}
|
|
491
520
|
/** A tool invocation request returned by the LLM. */
|
|
@@ -509,6 +538,35 @@ interface LLMProviderAdapter {
|
|
|
509
538
|
formatConversation(history: ConversationTurn[]): unknown;
|
|
510
539
|
parseResponse(stream: ReadableStream): AsyncIterable<TextChunk | ToolCall>;
|
|
511
540
|
formatToolResult(callId: string, result: unknown): unknown;
|
|
541
|
+
/**
|
|
542
|
+
* Build and execute a streaming request to the provider API.
|
|
543
|
+
* Returns the raw ReadableStream for the response body.
|
|
544
|
+
*/
|
|
545
|
+
streamRequest(params: {
|
|
546
|
+
systemPrompt: string;
|
|
547
|
+
contents: unknown;
|
|
548
|
+
userMessage?: string;
|
|
549
|
+
tools?: unknown;
|
|
550
|
+
signal?: AbortSignal;
|
|
551
|
+
timeoutMs?: number;
|
|
552
|
+
}): Promise<{
|
|
553
|
+
stream: ReadableStream<Uint8Array>;
|
|
554
|
+
response: Response;
|
|
555
|
+
}>;
|
|
556
|
+
/**
|
|
557
|
+
* Check whether a parsed response chunk indicates the response was
|
|
558
|
+
* blocked by a content/safety filter.
|
|
559
|
+
*/
|
|
560
|
+
isContentFiltered(chunk: Record<string, unknown>): boolean;
|
|
561
|
+
/**
|
|
562
|
+
* Extract token usage from a parsed response chunk.
|
|
563
|
+
* Returns `null` if no usage metadata is present in this chunk.
|
|
564
|
+
*/
|
|
565
|
+
extractUsage(chunk: Record<string, unknown>): {
|
|
566
|
+
prompt: number;
|
|
567
|
+
completion: number;
|
|
568
|
+
total: number;
|
|
569
|
+
} | null;
|
|
512
570
|
}
|
|
513
571
|
/** Decoded payload of a GuideKit session token. */
|
|
514
572
|
interface TokenPayload {
|
|
@@ -529,9 +587,9 @@ interface TokenResponse {
|
|
|
529
587
|
/** Options for `createSessionToken()` on the server side. */
|
|
530
588
|
interface CreateSessionTokenOptions {
|
|
531
589
|
signingSecret: string | string[];
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
590
|
+
sttApiKey?: string;
|
|
591
|
+
ttsApiKey?: string;
|
|
592
|
+
llmApiKey?: string;
|
|
535
593
|
expiresIn?: string;
|
|
536
594
|
allowedOrigins?: string[];
|
|
537
595
|
permissions?: string[];
|
|
@@ -586,10 +644,11 @@ declare const ErrorCodes: {
|
|
|
586
644
|
readonly VAD_PACKAGE_MISSING: "VAD_PACKAGE_MISSING";
|
|
587
645
|
readonly CONTENT_FILTER_TRIGGERED: "CONTENT_FILTER_TRIGGERED";
|
|
588
646
|
readonly PRIVACY_HOOK_CANCELLED: "PRIVACY_HOOK_CANCELLED";
|
|
647
|
+
readonly UNKNOWN: "UNKNOWN";
|
|
589
648
|
};
|
|
590
649
|
/** Union of every known error code string. */
|
|
591
650
|
type ErrorCode = (typeof ErrorCodes)[keyof typeof ErrorCodes];
|
|
592
|
-
type Provider = 'deepgram' | 'elevenlabs' | 'gemini' | 'openai';
|
|
651
|
+
type Provider = 'deepgram' | 'elevenlabs' | 'gemini' | 'openai' | 'web-speech' | (string & {});
|
|
593
652
|
interface GuideKitErrorOptions {
|
|
594
653
|
code: string;
|
|
595
654
|
message: string;
|
|
@@ -830,6 +889,11 @@ declare class GuideKitCore {
|
|
|
830
889
|
private setAgentState;
|
|
831
890
|
private notifyStoreListeners;
|
|
832
891
|
private buildSnapshot;
|
|
892
|
+
/**
|
|
893
|
+
* Unified built-in tool specifications — single source of truth for both
|
|
894
|
+
* tool definitions (sent to LLM) and handler registration.
|
|
895
|
+
*/
|
|
896
|
+
private getBuiltinToolSpecs;
|
|
833
897
|
/**
|
|
834
898
|
* Register all built-in tool handlers with the ToolExecutor.
|
|
835
899
|
* Called once during init() after VisualGuidance and all subsystems are ready.
|
|
@@ -1113,110 +1177,6 @@ declare class ContextManager {
|
|
|
1113
1177
|
private log;
|
|
1114
1178
|
}
|
|
1115
1179
|
|
|
1116
|
-
interface TokenUsage$1 {
|
|
1117
|
-
prompt: number;
|
|
1118
|
-
completion: number;
|
|
1119
|
-
total: number;
|
|
1120
|
-
}
|
|
1121
|
-
/**
|
|
1122
|
-
* Adapter that translates between GuideKit's internal types and the
|
|
1123
|
-
* OpenAI Chat Completions API wire format. Handles streaming via SSE,
|
|
1124
|
-
* tool formatting, and response parsing.
|
|
1125
|
-
*/
|
|
1126
|
-
declare class OpenAIAdapter implements LLMProviderAdapter {
|
|
1127
|
-
private readonly apiKey;
|
|
1128
|
-
private readonly model;
|
|
1129
|
-
constructor(config: Extract<LLMConfig, {
|
|
1130
|
-
provider: 'openai';
|
|
1131
|
-
}>);
|
|
1132
|
-
/**
|
|
1133
|
-
* Convert GuideKit tool definitions into OpenAI's `tools` format.
|
|
1134
|
-
* Each tool is wrapped as `{ type: 'function', function: { name, description, parameters } }`.
|
|
1135
|
-
*/
|
|
1136
|
-
formatTools(tools: ToolDefinition[]): unknown;
|
|
1137
|
-
/**
|
|
1138
|
-
* Convert an array of `ConversationTurn` objects into OpenAI's messages
|
|
1139
|
-
* format with `role: 'user' | 'assistant'`.
|
|
1140
|
-
*/
|
|
1141
|
-
formatConversation(history: ConversationTurn[]): Array<{
|
|
1142
|
-
role: 'user' | 'assistant';
|
|
1143
|
-
content: string;
|
|
1144
|
-
}>;
|
|
1145
|
-
/**
|
|
1146
|
-
* Parse an OpenAI SSE streaming response into an async iterable of
|
|
1147
|
-
* `TextChunk` and `ToolCall` objects.
|
|
1148
|
-
*
|
|
1149
|
-
* The OpenAI streaming endpoint sends each chunk as a JSON object
|
|
1150
|
-
* prefixed by `data: `. The final line is `data: [DONE]`.
|
|
1151
|
-
* Text content arrives in `choices[0].delta.content` and tool calls
|
|
1152
|
-
* arrive in `choices[0].delta.tool_calls`.
|
|
1153
|
-
*/
|
|
1154
|
-
parseResponse(stream: ReadableStream<Uint8Array>): AsyncIterable<TextChunk | ToolCall>;
|
|
1155
|
-
/**
|
|
1156
|
-
* Format a tool result so it can be sent back to OpenAI as a
|
|
1157
|
-
* `tool` role message with the `tool_call_id`.
|
|
1158
|
-
*/
|
|
1159
|
-
formatToolResult(callId: string, result: unknown): {
|
|
1160
|
-
role: 'tool';
|
|
1161
|
-
tool_call_id: string;
|
|
1162
|
-
content: string;
|
|
1163
|
-
};
|
|
1164
|
-
/**
|
|
1165
|
-
* Build and execute a streaming request to the OpenAI Chat Completions API.
|
|
1166
|
-
* Returns the raw `ReadableStream` for the response body together with
|
|
1167
|
-
* the raw Response object.
|
|
1168
|
-
*/
|
|
1169
|
-
streamRequest(params: {
|
|
1170
|
-
systemPrompt: string;
|
|
1171
|
-
contents: Array<{
|
|
1172
|
-
role: string;
|
|
1173
|
-
content: string;
|
|
1174
|
-
}>;
|
|
1175
|
-
tools?: unknown;
|
|
1176
|
-
signal?: AbortSignal;
|
|
1177
|
-
timeoutMs?: number;
|
|
1178
|
-
}): Promise<{
|
|
1179
|
-
stream: ReadableStream<Uint8Array>;
|
|
1180
|
-
response: Response;
|
|
1181
|
-
}>;
|
|
1182
|
-
/**
|
|
1183
|
-
* Extract `TextChunk` and accumulate `ToolCall` data from a single parsed
|
|
1184
|
-
* OpenAI SSE JSON object.
|
|
1185
|
-
*
|
|
1186
|
-
* OpenAI tool calls arrive incrementally: the first chunk for a tool call
|
|
1187
|
-
* carries the `id` and `function.name`, while subsequent chunks append to
|
|
1188
|
-
* `function.arguments`. We accumulate these in `pendingToolCalls` and only
|
|
1189
|
-
* yield complete `ToolCall` objects when the finish_reason is 'tool_calls'
|
|
1190
|
-
* or when flushed.
|
|
1191
|
-
*/
|
|
1192
|
-
private extractChunks;
|
|
1193
|
-
/**
|
|
1194
|
-
* Flush all accumulated pending tool calls as complete `ToolCall` objects.
|
|
1195
|
-
*/
|
|
1196
|
-
private flushPendingToolCalls;
|
|
1197
|
-
/**
|
|
1198
|
-
* Extract token usage from a parsed OpenAI response chunk.
|
|
1199
|
-
* Usage data typically appears in the final chunk when `stream_options`
|
|
1200
|
-
* includes `include_usage`, or in the non-streaming response.
|
|
1201
|
-
* Returns `null` if no usage data is present.
|
|
1202
|
-
*/
|
|
1203
|
-
extractUsage(parsed: Record<string, unknown>): TokenUsage$1 | null;
|
|
1204
|
-
/**
|
|
1205
|
-
* Check whether a parsed OpenAI chunk indicates the response was
|
|
1206
|
-
* blocked by a content filter.
|
|
1207
|
-
*
|
|
1208
|
-
* OpenAI signals content filtering through:
|
|
1209
|
-
* - `choices[].finish_reason === 'content_filter'`
|
|
1210
|
-
* - `choices[].content_filter_results` with `filtered: true`
|
|
1211
|
-
*/
|
|
1212
|
-
isContentFiltered(parsed: Record<string, unknown>): boolean;
|
|
1213
|
-
/**
|
|
1214
|
-
* Translate an HTTP error response from OpenAI into the appropriate
|
|
1215
|
-
* GuideKit error class.
|
|
1216
|
-
*/
|
|
1217
|
-
private handleHttpError;
|
|
1218
|
-
}
|
|
1219
|
-
|
|
1220
1180
|
interface TokenUsage {
|
|
1221
1181
|
prompt: number;
|
|
1222
1182
|
completion: number;
|
|
@@ -1230,9 +1190,17 @@ interface TokenUsage {
|
|
|
1230
1190
|
declare class GeminiAdapter implements LLMProviderAdapter {
|
|
1231
1191
|
private readonly apiKey;
|
|
1232
1192
|
private readonly model;
|
|
1193
|
+
/**
|
|
1194
|
+
* Token usage extracted from the most recent `parseResponse` call.
|
|
1195
|
+
* Updated as each SSE chunk is parsed; the final value reflects the
|
|
1196
|
+
* cumulative usage metadata sent by Gemini (typically in the last chunk).
|
|
1197
|
+
*/
|
|
1198
|
+
private _lastUsage;
|
|
1233
1199
|
constructor(config: Extract<LLMConfig, {
|
|
1234
1200
|
provider: 'gemini';
|
|
1235
1201
|
}>);
|
|
1202
|
+
/** Token usage from the most recent parseResponse call. */
|
|
1203
|
+
get lastUsage(): TokenUsage;
|
|
1236
1204
|
/**
|
|
1237
1205
|
* Convert GuideKit tool definitions into Gemini's `functionDeclarations`
|
|
1238
1206
|
* format, wrapped inside a `tools` array.
|
|
@@ -1255,6 +1223,10 @@ declare class GeminiAdapter implements LLMProviderAdapter {
|
|
|
1255
1223
|
* The Gemini `streamGenerateContent?alt=sse` endpoint sends each chunk
|
|
1256
1224
|
* as a JSON object prefixed by `data: `. We parse line-by-line, extract
|
|
1257
1225
|
* text parts and function call parts, and yield the appropriate types.
|
|
1226
|
+
*
|
|
1227
|
+
* This method also:
|
|
1228
|
+
* - Detects content filtering and throws `ContentFilterError`.
|
|
1229
|
+
* - Tracks token usage (accessible via `lastUsage` after iteration).
|
|
1258
1230
|
*/
|
|
1259
1231
|
parseResponse(stream: ReadableStream<Uint8Array>): AsyncIterable<TextChunk | ToolCall>;
|
|
1260
1232
|
/**
|
|
@@ -1275,16 +1247,16 @@ declare class GeminiAdapter implements LLMProviderAdapter {
|
|
|
1275
1247
|
/**
|
|
1276
1248
|
* Build and execute a streaming request to the Gemini API.
|
|
1277
1249
|
* Returns the raw `ReadableStream` for the response body together with
|
|
1278
|
-
*
|
|
1250
|
+
* the raw Response object.
|
|
1251
|
+
*
|
|
1252
|
+
* Note: The Gemini API key is passed as a URL query parameter (`key=`).
|
|
1253
|
+
* This is inherent to the Gemini REST SSE endpoint design; the key is
|
|
1254
|
+
* transmitted over HTTPS so it remains encrypted in transit. (H3)
|
|
1279
1255
|
*/
|
|
1280
1256
|
streamRequest(params: {
|
|
1281
1257
|
systemPrompt: string;
|
|
1282
|
-
contents:
|
|
1283
|
-
|
|
1284
|
-
parts: Array<{
|
|
1285
|
-
text: string;
|
|
1286
|
-
}>;
|
|
1287
|
-
}>;
|
|
1258
|
+
contents: unknown;
|
|
1259
|
+
userMessage?: string;
|
|
1288
1260
|
tools?: unknown;
|
|
1289
1261
|
signal?: AbortSignal;
|
|
1290
1262
|
timeoutMs?: number;
|
|
@@ -1317,12 +1289,16 @@ declare class GeminiAdapter implements LLMProviderAdapter {
|
|
|
1317
1289
|
* High-level orchestrator that manages LLM interactions for the GuideKit SDK.
|
|
1318
1290
|
*
|
|
1319
1291
|
* Responsibilities:
|
|
1320
|
-
* - Owns the active `LLMProviderAdapter
|
|
1292
|
+
* - Owns the active `LLMProviderAdapter`.
|
|
1321
1293
|
* - Streams responses from the provider, emitting callbacks for text chunks,
|
|
1322
1294
|
* tool calls, and token usage.
|
|
1323
1295
|
* - Handles content filter retries: if the initial response is blocked, it
|
|
1324
1296
|
* retries once with a stripped-down prompt (no tools).
|
|
1325
1297
|
* - Surfaces all errors through the SDK error hierarchy.
|
|
1298
|
+
*
|
|
1299
|
+
* The orchestrator is fully adapter-agnostic: all provider-specific logic
|
|
1300
|
+
* (SSE parsing, content filter detection, usage extraction) lives in the
|
|
1301
|
+
* adapter implementations.
|
|
1326
1302
|
*/
|
|
1327
1303
|
declare class LLMOrchestrator {
|
|
1328
1304
|
private _adapter;
|
|
@@ -1363,12 +1339,22 @@ declare class LLMOrchestrator {
|
|
|
1363
1339
|
get adapter(): LLMProviderAdapter;
|
|
1364
1340
|
/**
|
|
1365
1341
|
* Execute a streaming LLM request and collect the results.
|
|
1342
|
+
*
|
|
1343
|
+
* This method is fully adapter-agnostic: it delegates streaming,
|
|
1344
|
+
* response parsing, content-filter detection, and usage extraction
|
|
1345
|
+
* entirely to the active `LLMProviderAdapter`. No provider-specific
|
|
1346
|
+
* SSE parsing lives in the orchestrator.
|
|
1366
1347
|
*/
|
|
1367
1348
|
private executeStream;
|
|
1368
1349
|
/**
|
|
1369
1350
|
* Create the appropriate adapter for the given config.
|
|
1370
|
-
*
|
|
1371
|
-
*
|
|
1351
|
+
*
|
|
1352
|
+
* Built-in providers:
|
|
1353
|
+
* - `'gemini'` — uses the bundled `GeminiAdapter`.
|
|
1354
|
+
*
|
|
1355
|
+
* Custom adapters:
|
|
1356
|
+
* - Pass `{ adapter: myAdapter }` to use any `LLMProviderAdapter`.
|
|
1357
|
+
* Example: `llm: { adapter: myCustomAdapter }`
|
|
1372
1358
|
*/
|
|
1373
1359
|
private createAdapter;
|
|
1374
1360
|
/** Convenience accessor for the current provider name. */
|
|
@@ -2183,4 +2169,435 @@ declare class TokenManager {
|
|
|
2183
2169
|
private log;
|
|
2184
2170
|
}
|
|
2185
2171
|
|
|
2186
|
-
|
|
2172
|
+
/**
|
|
2173
|
+
* Minimal type declarations for the Web Speech API SpeechRecognition
|
|
2174
|
+
* interface. These cover the subset used by this adapter. Full type
|
|
2175
|
+
* definitions are available in lib.dom.d.ts but may not be present in
|
|
2176
|
+
* all TS configurations.
|
|
2177
|
+
*/
|
|
2178
|
+
interface SpeechRecognitionEvent {
|
|
2179
|
+
readonly resultIndex: number;
|
|
2180
|
+
readonly results: SpeechRecognitionResultList;
|
|
2181
|
+
}
|
|
2182
|
+
interface SpeechRecognitionResultList {
|
|
2183
|
+
readonly length: number;
|
|
2184
|
+
item(index: number): SpeechRecognitionResult;
|
|
2185
|
+
[index: number]: SpeechRecognitionResult;
|
|
2186
|
+
}
|
|
2187
|
+
interface SpeechRecognitionResult {
|
|
2188
|
+
readonly length: number;
|
|
2189
|
+
readonly isFinal: boolean;
|
|
2190
|
+
item(index: number): SpeechRecognitionAlternative;
|
|
2191
|
+
[index: number]: SpeechRecognitionAlternative;
|
|
2192
|
+
}
|
|
2193
|
+
interface SpeechRecognitionAlternative {
|
|
2194
|
+
readonly transcript: string;
|
|
2195
|
+
readonly confidence: number;
|
|
2196
|
+
}
|
|
2197
|
+
interface SpeechRecognitionErrorEvent {
|
|
2198
|
+
readonly error: string;
|
|
2199
|
+
readonly message: string;
|
|
2200
|
+
}
|
|
2201
|
+
interface SpeechRecognitionInstance extends EventTarget {
|
|
2202
|
+
lang: string;
|
|
2203
|
+
continuous: boolean;
|
|
2204
|
+
interimResults: boolean;
|
|
2205
|
+
maxAlternatives: number;
|
|
2206
|
+
onresult: ((event: SpeechRecognitionEvent) => void) | null;
|
|
2207
|
+
onerror: ((event: SpeechRecognitionErrorEvent) => void) | null;
|
|
2208
|
+
onend: (() => void) | null;
|
|
2209
|
+
onstart: (() => void) | null;
|
|
2210
|
+
start(): void;
|
|
2211
|
+
stop(): void;
|
|
2212
|
+
abort(): void;
|
|
2213
|
+
}
|
|
2214
|
+
interface SpeechRecognitionConstructor {
|
|
2215
|
+
new (): SpeechRecognitionInstance;
|
|
2216
|
+
}
|
|
2217
|
+
declare global {
|
|
2218
|
+
var webkitSpeechRecognition: SpeechRecognitionConstructor | undefined;
|
|
2219
|
+
}
|
|
2220
|
+
interface WebSpeechSTTOptions {
|
|
2221
|
+
language?: string;
|
|
2222
|
+
continuous?: boolean;
|
|
2223
|
+
interimResults?: boolean;
|
|
2224
|
+
debug?: boolean;
|
|
2225
|
+
}
|
|
2226
|
+
declare class WebSpeechSTT {
|
|
2227
|
+
private readonly language;
|
|
2228
|
+
private readonly continuous;
|
|
2229
|
+
private readonly interimResultsEnabled;
|
|
2230
|
+
private readonly debugEnabled;
|
|
2231
|
+
private recognition;
|
|
2232
|
+
private _connected;
|
|
2233
|
+
private _suspended;
|
|
2234
|
+
/**
|
|
2235
|
+
* Whether we intentionally stopped recognition. Used to distinguish
|
|
2236
|
+
* between intentional stop and unexpected end (for auto-restart in
|
|
2237
|
+
* continuous mode).
|
|
2238
|
+
*/
|
|
2239
|
+
private _intentionalStop;
|
|
2240
|
+
/** Registered transcript callbacks. */
|
|
2241
|
+
private readonly transcriptCallbacks;
|
|
2242
|
+
constructor(options?: WebSpeechSTTOptions);
|
|
2243
|
+
/**
|
|
2244
|
+
* Check whether the Web Speech API SpeechRecognition is supported in the
|
|
2245
|
+
* current environment. Safe to call in SSR (returns false).
|
|
2246
|
+
*/
|
|
2247
|
+
static isSupported(): boolean;
|
|
2248
|
+
/** Whether recognition is currently active and connected. */
|
|
2249
|
+
get isConnected(): boolean;
|
|
2250
|
+
/**
|
|
2251
|
+
* Start speech recognition.
|
|
2252
|
+
*
|
|
2253
|
+
* Creates the SpeechRecognition instance and begins listening. Resolves
|
|
2254
|
+
* once the recognition session has started. Rejects if the API is not
|
|
2255
|
+
* supported or the browser denies permission.
|
|
2256
|
+
*/
|
|
2257
|
+
connect(): Promise<void>;
|
|
2258
|
+
/**
|
|
2259
|
+
* Send audio data. No-op for Web Speech API since it captures audio
|
|
2260
|
+
* directly from the microphone via the browser's internal pipeline.
|
|
2261
|
+
*
|
|
2262
|
+
* Provided for interface compatibility with WebSocket-based STT adapters
|
|
2263
|
+
* (DeepgramSTT, ElevenLabsSTT).
|
|
2264
|
+
*/
|
|
2265
|
+
sendAudio(_audioData: Float32Array | Int16Array): void;
|
|
2266
|
+
/**
|
|
2267
|
+
* Register a callback to receive transcript events.
|
|
2268
|
+
*
|
|
2269
|
+
* @returns An unsubscribe function. Calling it more than once is safe.
|
|
2270
|
+
*/
|
|
2271
|
+
onTranscript(callback: (event: STTTranscriptEvent) => void): () => void;
|
|
2272
|
+
/**
|
|
2273
|
+
* Gracefully stop recognition.
|
|
2274
|
+
*
|
|
2275
|
+
* Calls `stop()` on the SpeechRecognition instance which allows it to
|
|
2276
|
+
* deliver any pending final results before ending.
|
|
2277
|
+
*/
|
|
2278
|
+
close(): void;
|
|
2279
|
+
/** Force-destroy the recognition without waiting for pending results. */
|
|
2280
|
+
destroy(): void;
|
|
2281
|
+
/**
|
|
2282
|
+
* Suspend the adapter (e.g. when the device goes offline).
|
|
2283
|
+
*
|
|
2284
|
+
* Stops recognition and marks the adapter as suspended so that auto-restart
|
|
2285
|
+
* does not trigger.
|
|
2286
|
+
*/
|
|
2287
|
+
suspend(): void;
|
|
2288
|
+
/**
|
|
2289
|
+
* Resume after a prior `suspend()`. Restarts recognition if it was
|
|
2290
|
+
* running before suspension.
|
|
2291
|
+
*/
|
|
2292
|
+
resume(): void;
|
|
2293
|
+
/**
|
|
2294
|
+
* Handle SpeechRecognition result events.
|
|
2295
|
+
*
|
|
2296
|
+
* The `results` property is a SpeechRecognitionResultList containing all
|
|
2297
|
+
* results accumulated during this recognition session. We only process
|
|
2298
|
+
* results from `resultIndex` onward to avoid re-emitting old results.
|
|
2299
|
+
*/
|
|
2300
|
+
private handleResult;
|
|
2301
|
+
/**
|
|
2302
|
+
* Handle SpeechRecognition errors.
|
|
2303
|
+
*
|
|
2304
|
+
* Some errors are recoverable (e.g. `no-speech`) and some are fatal
|
|
2305
|
+
* (e.g. `not-allowed`). For recoverable errors in continuous mode,
|
|
2306
|
+
* recognition will auto-restart via the `onend` handler.
|
|
2307
|
+
*/
|
|
2308
|
+
private handleError;
|
|
2309
|
+
/**
|
|
2310
|
+
* Emit a transcript event to all registered callbacks.
|
|
2311
|
+
*
|
|
2312
|
+
* Errors thrown by individual callbacks are caught and logged so one
|
|
2313
|
+
* misbehaving subscriber does not prevent others from receiving the event.
|
|
2314
|
+
*/
|
|
2315
|
+
private emitTranscript;
|
|
2316
|
+
/**
|
|
2317
|
+
* Resolve the SpeechRecognition constructor, with the webkit-prefixed
|
|
2318
|
+
* fallback. Returns null if not available.
|
|
2319
|
+
*/
|
|
2320
|
+
private resolveSpeechRecognition;
|
|
2321
|
+
/** Reset internal state after disconnection. */
|
|
2322
|
+
private cleanup;
|
|
2323
|
+
/** Conditional debug logging. */
|
|
2324
|
+
private log;
|
|
2325
|
+
}
|
|
2326
|
+
|
|
2327
|
+
interface WebSpeechTTSOptions {
|
|
2328
|
+
voice?: string;
|
|
2329
|
+
rate?: number;
|
|
2330
|
+
pitch?: number;
|
|
2331
|
+
language?: string;
|
|
2332
|
+
debug?: boolean;
|
|
2333
|
+
}
|
|
2334
|
+
/**
|
|
2335
|
+
* Audio event compatible with the TTSAudioEvent shape used by
|
|
2336
|
+
* VoicePipeline for ElevenLabs TTS. Web Speech API does not produce
|
|
2337
|
+
* raw audio buffers, so we emit events with empty buffers and use
|
|
2338
|
+
* isFinal to signal utterance completion.
|
|
2339
|
+
*/
|
|
2340
|
+
interface WebSpeechTTSAudioEvent {
|
|
2341
|
+
audio: ArrayBuffer;
|
|
2342
|
+
isFinal: boolean;
|
|
2343
|
+
timestamp: number;
|
|
2344
|
+
}
|
|
2345
|
+
declare class WebSpeechTTS {
|
|
2346
|
+
private readonly voiceName;
|
|
2347
|
+
private readonly rate;
|
|
2348
|
+
private readonly pitch;
|
|
2349
|
+
private readonly language;
|
|
2350
|
+
private readonly debugEnabled;
|
|
2351
|
+
private _connected;
|
|
2352
|
+
private _suspended;
|
|
2353
|
+
/** Cached voice object resolved from voiceName. */
|
|
2354
|
+
private _resolvedVoice;
|
|
2355
|
+
/** Whether voices have been loaded (they load async in some browsers). */
|
|
2356
|
+
private _voicesLoaded;
|
|
2357
|
+
/** Registered audio-event callbacks. */
|
|
2358
|
+
private readonly audioCallbacks;
|
|
2359
|
+
constructor(options?: WebSpeechTTSOptions);
|
|
2360
|
+
/**
|
|
2361
|
+
* Check whether the Web Speech API SpeechSynthesis is supported in the
|
|
2362
|
+
* current environment. Safe to call in SSR (returns false).
|
|
2363
|
+
*/
|
|
2364
|
+
static isSupported(): boolean;
|
|
2365
|
+
/** Whether the adapter is connected (ready for speech). */
|
|
2366
|
+
get isConnected(): boolean;
|
|
2367
|
+
/**
|
|
2368
|
+
* Initialize the adapter.
|
|
2369
|
+
*
|
|
2370
|
+
* Loads available voices and resolves the requested voice name. Voice
|
|
2371
|
+
* loading is async in some browsers (notably Chrome) so we wait for
|
|
2372
|
+
* the `voiceschanged` event if needed.
|
|
2373
|
+
*/
|
|
2374
|
+
connect(): Promise<void>;
|
|
2375
|
+
/**
|
|
2376
|
+
* Speak the given text using the browser's speech synthesis engine.
|
|
2377
|
+
*
|
|
2378
|
+
* Returns nothing (the declared return type is `void`), so completion and
|
|
2379
|
+
* synthesis errors cannot be observed through the return value.
|
|
2380
|
+
*
|
|
2381
|
+
* Also emits audio events to registered callbacks for VoicePipeline
|
|
2382
|
+
* compatibility.
|
|
2383
|
+
*/
|
|
2384
|
+
speak(text: string): void;
|
|
2385
|
+
/**
|
|
2386
|
+
* Flush / finalize the current utterance.
|
|
2387
|
+
*
|
|
2388
|
+
* No-op for Web Speech API since each speak() call is a complete
|
|
2389
|
+
* utterance. Provided for interface compatibility with ElevenLabsTTS.
|
|
2390
|
+
*/
|
|
2391
|
+
flush(): void;
|
|
2392
|
+
/**
|
|
2393
|
+
* Register a callback to receive audio output events.
|
|
2394
|
+
*
|
|
2395
|
+
* For Web Speech API, these events have empty audio buffers and are
|
|
2396
|
+
* used to signal utterance start/end for VoicePipeline state management.
|
|
2397
|
+
*
|
|
2398
|
+
* @returns An unsubscribe function. Calling it more than once is safe.
|
|
2399
|
+
*/
|
|
2400
|
+
onAudio(callback: (event: WebSpeechTTSAudioEvent) => void): () => void;
|
|
2401
|
+
/** Stop current speech synthesis and cancel any queued utterances. */
|
|
2402
|
+
stop(): void;
|
|
2403
|
+
/** Gracefully close the adapter. */
|
|
2404
|
+
close(): void;
|
|
2405
|
+
/** Force-destroy the adapter. */
|
|
2406
|
+
destroy(): void;
|
|
2407
|
+
/**
|
|
2408
|
+
* Suspend the adapter (e.g. when the device goes offline).
|
|
2409
|
+
*
|
|
2410
|
+
* Pauses any active speech synthesis and marks the adapter as suspended.
|
|
2411
|
+
*/
|
|
2412
|
+
suspend(): void;
|
|
2413
|
+
/**
|
|
2414
|
+
* Resume after a prior `suspend()`.
|
|
2415
|
+
*/
|
|
2416
|
+
resume(): void;
|
|
2417
|
+
/**
|
|
2418
|
+
* Load available voices from the browser.
|
|
2419
|
+
*
|
|
2420
|
+
* In Chrome and some other browsers, voices load asynchronously after
|
|
2421
|
+
* the page loads. We wait for the `voiceschanged` event with a timeout.
|
|
2422
|
+
*/
|
|
2423
|
+
private loadVoices;
|
|
2424
|
+
/**
|
|
2425
|
+
* Find a voice by name (case-insensitive partial match).
|
|
2426
|
+
*/
|
|
2427
|
+
private findVoice;
|
|
2428
|
+
/**
|
|
2429
|
+
* Emit an audio event to all registered callbacks.
|
|
2430
|
+
*
|
|
2431
|
+
* Errors thrown by individual callbacks are caught and logged so one
|
|
2432
|
+
* misbehaving subscriber does not prevent others from receiving the event.
|
|
2433
|
+
*/
|
|
2434
|
+
private emitAudio;
|
|
2435
|
+
/** Reset internal state. */
|
|
2436
|
+
private cleanup;
|
|
2437
|
+
/** Conditional debug logging. */
|
|
2438
|
+
private log;
|
|
2439
|
+
}
|
|
2440
|
+
|
|
2441
|
+
type VoiceState = 'idle' | 'listening' | 'processing' | 'speaking' | 'error';
|
|
2442
|
+
interface VoicePipelineOptions {
|
|
2443
|
+
sttConfig: {
|
|
2444
|
+
provider: 'deepgram';
|
|
2445
|
+
apiKey: string;
|
|
2446
|
+
model?: 'nova-2' | 'nova-3';
|
|
2447
|
+
} | {
|
|
2448
|
+
provider: 'elevenlabs';
|
|
2449
|
+
apiKey: string;
|
|
2450
|
+
language?: string;
|
|
2451
|
+
} | {
|
|
2452
|
+
provider: 'web-speech';
|
|
2453
|
+
language?: string;
|
|
2454
|
+
continuous?: boolean;
|
|
2455
|
+
interimResults?: boolean;
|
|
2456
|
+
};
|
|
2457
|
+
ttsConfig: {
|
|
2458
|
+
provider: 'elevenlabs';
|
|
2459
|
+
apiKey: string;
|
|
2460
|
+
voiceId?: string;
|
|
2461
|
+
modelId?: string;
|
|
2462
|
+
} | {
|
|
2463
|
+
provider: 'web-speech';
|
|
2464
|
+
voice?: string;
|
|
2465
|
+
rate?: number;
|
|
2466
|
+
pitch?: number;
|
|
2467
|
+
language?: string;
|
|
2468
|
+
};
|
|
2469
|
+
debug?: boolean;
|
|
2470
|
+
}
|
|
2471
|
+
declare global {
|
|
2472
|
+
var webkitAudioContext: typeof AudioContext | undefined;
|
|
2473
|
+
}
|
|
2474
|
+
declare class VoicePipeline {
|
|
2475
|
+
private readonly _sttConfig;
|
|
2476
|
+
private readonly _ttsConfig;
|
|
2477
|
+
private readonly _debug;
|
|
2478
|
+
private _state;
|
|
2479
|
+
private _destroyed;
|
|
2480
|
+
private readonly _bus;
|
|
2481
|
+
private _audioContext;
|
|
2482
|
+
private _mediaStream;
|
|
2483
|
+
private _vad;
|
|
2484
|
+
private _stt;
|
|
2485
|
+
private _tts;
|
|
2486
|
+
private _micSourceNode;
|
|
2487
|
+
private _captureProcessor;
|
|
2488
|
+
private _isForwardingToSTT;
|
|
2489
|
+
private _playbackQueue;
|
|
2490
|
+
private _jitterBufferTimer;
|
|
2491
|
+
private _isPlaybackStarted;
|
|
2492
|
+
private _nextPlaybackTime;
|
|
2493
|
+
private _activeSourceNodes;
|
|
2494
|
+
private _lastScheduledSource;
|
|
2495
|
+
private _lastTTSEcho;
|
|
2496
|
+
private _pendingLLMAbort;
|
|
2497
|
+
private readonly _stateChangeCallbacks;
|
|
2498
|
+
private readonly _transcriptCallbacks;
|
|
2499
|
+
private _unsubVADSpeechStart;
|
|
2500
|
+
private _unsubVADSpeechEnd;
|
|
2501
|
+
private _unsubSTTTranscript;
|
|
2502
|
+
private _unsubTTSAudio;
|
|
2503
|
+
constructor(options: VoicePipelineOptions);
|
|
2504
|
+
/** Current pipeline state. */
|
|
2505
|
+
get state(): VoiceState;
|
|
2506
|
+
/**
|
|
2507
|
+
* Initialize AudioContext, VAD model, and STT/TTS connections.
|
|
2508
|
+
*
|
|
2509
|
+
* **Must be called in response to a user gesture** (click / tap) to
|
|
2510
|
+
* satisfy browser autoplay policies.
|
|
2511
|
+
*/
|
|
2512
|
+
init(): Promise<void>;
|
|
2513
|
+
/**
|
|
2514
|
+
* Start listening: activate microphone, begin VAD + STT pipeline.
|
|
2515
|
+
*
|
|
2516
|
+
* Valid from: IDLE, ERROR, SPEAKING (barge-in path calls this internally).
|
|
2517
|
+
*/
|
|
2518
|
+
startListening(): Promise<void>;
|
|
2519
|
+
/** Stop listening: deactivate mic and VAD. */
|
|
2520
|
+
stopListening(): void;
|
|
2521
|
+
/**
|
|
2522
|
+
* Process a transcript through an LLM callback and speak the response.
|
|
2523
|
+
*
|
|
2524
|
+
* @param text - The user's transcript text.
|
|
2525
|
+
* @param sendToLLM - Async callback that sends text to the LLM and returns the response.
|
|
2526
|
+
*/
|
|
2527
|
+
processTranscript(text: string, sendToLLM: (text: string) => Promise<string>): Promise<void>;
|
|
2528
|
+
/** Speak text via TTS (ElevenLabs or Web Speech API). */
|
|
2529
|
+
speak(text: string): Promise<void>;
|
|
2530
|
+
/** Stop current TTS playback immediately (barge-in). */
|
|
2531
|
+
stopSpeaking(): void;
|
|
2532
|
+
/** Subscribe to state changes. Returns an unsubscribe function. */
|
|
2533
|
+
onStateChange(callback: (state: VoiceState, previous: VoiceState) => void): () => void;
|
|
2534
|
+
/** Subscribe to transcript events. Returns an unsubscribe function. */
|
|
2535
|
+
onTranscript(callback: (text: string, isFinal: boolean) => void): () => void;
|
|
2536
|
+
/** Destroy all resources held by the pipeline. */
|
|
2537
|
+
destroy(): Promise<void>;
|
|
2538
|
+
/** NOTE(review): presumably the internal state setter that feeds `onStateChange` subscribers — signature is erased here; confirm against the implementation. */
private _setState;
|
|
2539
|
+
/**
|
|
2540
|
+
* Resolve the AudioContext constructor, with Safari webkitAudioContext
|
|
2541
|
+
* fallback. Returns null if Web Audio is not available.
|
|
2542
|
+
*/
|
|
2543
|
+
private _resolveAudioContext;
|
|
2544
|
+
/**
|
|
2545
|
+
* Pre-warm the AudioContext by playing a silent buffer.
|
|
2546
|
+
* This forces the context into the "running" state and avoids a
|
|
2547
|
+
* noticeable delay on the first real playback.
|
|
2548
|
+
*/
|
|
2549
|
+
private _prewarmAudioContext;
|
|
2550
|
+
/**
|
|
2551
|
+
* Set up a ScriptProcessorNode to capture mic audio and forward it
|
|
2552
|
+
* to the STT adapter when `_isForwardingToSTT` is true.
|
|
2553
|
+
*/
|
|
2554
|
+
private _setupMicCapture;
|
|
2555
|
+
/** Tear down the mic capture ScriptProcessorNode. */
|
|
2556
|
+
private _teardownMicCapture;
|
|
2557
|
+
/** Stop all tracks on the current MediaStream. */
|
|
2558
|
+
private _stopMicTracks;
|
|
2559
|
+
/** NOTE(review): presumably the handler invoked on VAD speech-start events — signature is erased here; confirm against the implementation. */
private _handleVADSpeechStart;
|
|
2560
|
+
/** NOTE(review): presumably the handler invoked on VAD speech-end events — signature is erased here; confirm against the implementation. */
private _handleVADSpeechEnd;
|
|
2561
|
+
/** NOTE(review): presumably the handler for transcript events coming from the STT adapter — signature is erased here; confirm against the implementation. */
private _handleTranscript;
|
|
2562
|
+
/**
|
|
2563
|
+
* Handle an audio chunk from ElevenLabs TTS.
|
|
2564
|
+
*
|
|
2565
|
+
* Implements a jitter buffer: we accumulate audio for JITTER_BUFFER_MS
|
|
2566
|
+
* before starting playback to smooth out network jitter.
|
|
2567
|
+
*/
|
|
2568
|
+
private _handleTTSAudio;
|
|
2569
|
+
/** Flush the jitter buffer and start playback. */
|
|
2570
|
+
private _flushJitterBuffer;
|
|
2571
|
+
/**
|
|
2572
|
+
* Begin playback: decode all queued chunks and schedule them.
|
|
2573
|
+
* If `onDone` is provided, it is called when the last chunk finishes playing.
|
|
2574
|
+
*/
|
|
2575
|
+
private _startPlayback;
|
|
2576
|
+
/**
|
|
2577
|
+
* Decode an audio chunk (mp3 from ElevenLabs) and schedule it for
|
|
2578
|
+
* sequential playback via AudioBufferSourceNode.
|
|
2579
|
+
*/
|
|
2580
|
+
private _decodeAndSchedule;
|
|
2581
|
+
/**
|
|
2582
|
+
* Check if VAD speech-start during SPEAKING state is likely echo from
|
|
2583
|
+
* the speaker playing TTS audio rather than genuine user speech.
|
|
2584
|
+
*
|
|
2585
|
+
* Simple heuristic: if we are still within the echo window of a recent
|
|
2586
|
+
* TTS utterance, treat it as potential echo.
|
|
2587
|
+
*/
|
|
2588
|
+
private _isEchoDetected;
|
|
2589
|
+
/**
|
|
2590
|
+
* Check if a transcript is an echo of recent TTS output.
|
|
2591
|
+
*
|
|
2592
|
+
* Uses word overlap: if intersection of words > 60% of max set size
|
|
2593
|
+
* and the transcript arrived within the echo window, discard it.
|
|
2594
|
+
*/
|
|
2595
|
+
private _isTranscriptEcho;
|
|
2596
|
+
/**
|
|
2597
|
+
* Normalize text into a set of lowercase words, stripping punctuation.
|
|
2598
|
+
*/
|
|
2599
|
+
private _normalizeWords;
|
|
2600
|
+
/** NOTE(review): presumably an internal logging helper — signature is erased here; confirm against the implementation. */
private _log;
|
|
2601
|
+
}
|
|
2602
|
+
|
|
2603
|
+
// Public export list of this declaration file. Entries prefixed with `type`
// are type-only exports; unprefixed entries are exported as values.
export { type AgentConfig, type AgentState, type AggregatedUsage, AuthenticationError, type AwarenessOptions, type AwarenessState, AwarenessSystem, type BeforeLLMCallContext, BrowserSupportError, ConfigurationError, ConnectionManager, type ConnectionManagerOptions, type ConnectionState, ContentFilterError, type ContentMap, type ContentMapEntry, type ContentMapFunction, type ContentMapInput, ContextManager, type ContextManagerOptions, type ConversationTurn, type CreateSessionTokenOptions, DOMScanner, type DOMScannerOptions, type ErrorCode, ErrorCodes, EventBus, type EventMap, type FormField, type FormSummary, GeminiAdapter, GuideKitCore, type GuideKitCoreOptions, GuideKitError, type GuideKitErrorOptions, type GuideKitErrorType, type GuideKitEvent, type GuideKitOptions, type GuideKitProviderProps, type GuideKitStore, type GuideKitTheme, type HealthCheckResult, type HealthCheckStatus, I18n, type I18nOptions, type I18nStrings, InitializationError, type InteractiveElement, type LLMConfig, LLMOrchestrator, type LLMProviderAdapter, type LocaleInput, type NavItem, NavigationController, type NavigationControllerOptions, NetworkError, type OverlayElement, type PageModel, type PageSection, PermissionError, type ProactiveOptions, type ProactiveTrigger, ProactiveTriggerEngine, type ProactiveTriggerType, type Provider, type QueuedMessage, RateLimitError, RateLimiter, type RateLimiterOptions, type RateLimiterState, type RateLimits, type Resource, ResourceExhaustedError, ResourceManager, type ResourceManagerState, type STTConfig, type ScanMetadata, type SessionState, SingletonGuard, type SpotlightState, type SupportedLocale, type TTSConfig, type TextChunk, TimeoutError, type TokenData, TokenManager, type TokenManagerOptions, type TokenPayload, type TokenResponse, type ToolCall, type ToolCallRecord, type ToolDefinition, type ToolExecutionResult, ToolExecutor, type ToolExecutorOptions, type ToolHandler, type TooltipOptions, VisualGuidance, type VisualGuidanceOptions, VoicePipeline, 
type VoicePipelineOptions, type VoiceState, WebSpeechSTT, type WebSpeechSTTOptions, WebSpeechTTS, type WebSpeechTTSAudioEvent, type WebSpeechTTSOptions, createEventBus, isGuideKitError };
|