@guidekit/core 0.1.0-beta.1 → 0.1.0-beta.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/dist/index.cjs +1639 -582
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +571 -30
- package/dist/index.d.ts +571 -30
- package/dist/index.js +1637 -583
- package/dist/index.js.map +1 -1
- package/package.json +37 -14
package/dist/index.d.ts
CHANGED
|
@@ -351,8 +351,14 @@ type STTConfig = {
|
|
|
351
351
|
apiKey: string;
|
|
352
352
|
model?: 'nova-2' | 'nova-3';
|
|
353
353
|
} | {
|
|
354
|
-
provider: '
|
|
354
|
+
provider: 'elevenlabs';
|
|
355
355
|
apiKey: string;
|
|
356
|
+
language?: string;
|
|
357
|
+
} | {
|
|
358
|
+
provider: 'web-speech';
|
|
359
|
+
language?: string;
|
|
360
|
+
continuous?: boolean;
|
|
361
|
+
interimResults?: boolean;
|
|
356
362
|
};
|
|
357
363
|
/** Text-to-speech provider configuration. */
|
|
358
364
|
type TTSConfig = {
|
|
@@ -360,19 +366,26 @@ type TTSConfig = {
|
|
|
360
366
|
apiKey: string;
|
|
361
367
|
voiceId?: string;
|
|
362
368
|
} | {
|
|
363
|
-
provider: '
|
|
364
|
-
|
|
365
|
-
|
|
369
|
+
provider: 'web-speech';
|
|
370
|
+
voice?: string;
|
|
371
|
+
rate?: number;
|
|
372
|
+
pitch?: number;
|
|
373
|
+
language?: string;
|
|
366
374
|
};
|
|
375
|
+
/** Transcript event emitted by any STT adapter. */
|
|
376
|
+
interface STTTranscriptEvent {
|
|
377
|
+
text: string;
|
|
378
|
+
isFinal: boolean;
|
|
379
|
+
confidence: number;
|
|
380
|
+
timestamp: number;
|
|
381
|
+
}
|
|
367
382
|
/** Large language model provider configuration. */
|
|
368
383
|
type LLMConfig = {
|
|
369
384
|
provider: 'gemini';
|
|
370
385
|
apiKey: string;
|
|
371
386
|
model?: 'gemini-2.5-flash' | 'gemini-2.5-pro';
|
|
372
387
|
} | {
|
|
373
|
-
|
|
374
|
-
apiKey: string;
|
|
375
|
-
model?: 'gpt-4o' | 'gpt-4o-mini';
|
|
388
|
+
adapter: LLMProviderAdapter;
|
|
376
389
|
};
|
|
377
390
|
/** Top-level options that control SDK behaviour. */
|
|
378
391
|
interface GuideKitOptions {
|
|
@@ -481,11 +494,27 @@ interface GuideKitProviderProps {
|
|
|
481
494
|
}>;
|
|
482
495
|
children?: unknown;
|
|
483
496
|
}
|
|
497
|
+
/** A single JSON-Schema-style property descriptor used in tool parameter maps. */
|
|
498
|
+
interface ToolParameterSchema {
|
|
499
|
+
type: string;
|
|
500
|
+
description?: string;
|
|
501
|
+
enum?: string[];
|
|
502
|
+
items?: {
|
|
503
|
+
type: string;
|
|
504
|
+
};
|
|
505
|
+
[key: string]: unknown;
|
|
506
|
+
}
|
|
484
507
|
/** Definition of a tool that can be invoked by the LLM. */
|
|
485
508
|
interface ToolDefinition {
|
|
486
509
|
name: string;
|
|
487
510
|
description: string;
|
|
488
|
-
|
|
511
|
+
/** Flat map of param name → JSON Schema property descriptor. */
|
|
512
|
+
parameters: Record<string, ToolParameterSchema>;
|
|
513
|
+
/**
|
|
514
|
+
* List of parameter names the LLM must always provide.
|
|
515
|
+
* Omit or use [] for fully optional parameters.
|
|
516
|
+
*/
|
|
517
|
+
required?: string[];
|
|
489
518
|
schemaVersion: number;
|
|
490
519
|
}
|
|
491
520
|
/** A tool invocation request returned by the LLM. */
|
|
@@ -509,6 +538,35 @@ interface LLMProviderAdapter {
|
|
|
509
538
|
formatConversation(history: ConversationTurn[]): unknown;
|
|
510
539
|
parseResponse(stream: ReadableStream): AsyncIterable<TextChunk | ToolCall>;
|
|
511
540
|
formatToolResult(callId: string, result: unknown): unknown;
|
|
541
|
+
/**
|
|
542
|
+
* Build and execute a streaming request to the provider API.
|
|
543
|
+
* Returns the raw ReadableStream for the response body.
|
|
544
|
+
*/
|
|
545
|
+
streamRequest(params: {
|
|
546
|
+
systemPrompt: string;
|
|
547
|
+
contents: unknown;
|
|
548
|
+
userMessage?: string;
|
|
549
|
+
tools?: unknown;
|
|
550
|
+
signal?: AbortSignal;
|
|
551
|
+
timeoutMs?: number;
|
|
552
|
+
}): Promise<{
|
|
553
|
+
stream: ReadableStream<Uint8Array>;
|
|
554
|
+
response: Response;
|
|
555
|
+
}>;
|
|
556
|
+
/**
|
|
557
|
+
* Check whether a parsed response chunk indicates the response was
|
|
558
|
+
* blocked by a content/safety filter.
|
|
559
|
+
*/
|
|
560
|
+
isContentFiltered(chunk: Record<string, unknown>): boolean;
|
|
561
|
+
/**
|
|
562
|
+
* Extract token usage from a parsed response chunk.
|
|
563
|
+
* Returns `null` if no usage metadata is present in this chunk.
|
|
564
|
+
*/
|
|
565
|
+
extractUsage(chunk: Record<string, unknown>): {
|
|
566
|
+
prompt: number;
|
|
567
|
+
completion: number;
|
|
568
|
+
total: number;
|
|
569
|
+
} | null;
|
|
512
570
|
}
|
|
513
571
|
/** Decoded payload of a GuideKit session token. */
|
|
514
572
|
interface TokenPayload {
|
|
@@ -529,9 +587,9 @@ interface TokenResponse {
|
|
|
529
587
|
/** Options for `createSessionToken()` on the server side. */
|
|
530
588
|
interface CreateSessionTokenOptions {
|
|
531
589
|
signingSecret: string | string[];
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
590
|
+
sttApiKey?: string;
|
|
591
|
+
ttsApiKey?: string;
|
|
592
|
+
llmApiKey?: string;
|
|
535
593
|
expiresIn?: string;
|
|
536
594
|
allowedOrigins?: string[];
|
|
537
595
|
permissions?: string[];
|
|
@@ -586,10 +644,11 @@ declare const ErrorCodes: {
|
|
|
586
644
|
readonly VAD_PACKAGE_MISSING: "VAD_PACKAGE_MISSING";
|
|
587
645
|
readonly CONTENT_FILTER_TRIGGERED: "CONTENT_FILTER_TRIGGERED";
|
|
588
646
|
readonly PRIVACY_HOOK_CANCELLED: "PRIVACY_HOOK_CANCELLED";
|
|
647
|
+
readonly UNKNOWN: "UNKNOWN";
|
|
589
648
|
};
|
|
590
649
|
/** Union of every known error code string. */
|
|
591
650
|
type ErrorCode = (typeof ErrorCodes)[keyof typeof ErrorCodes];
|
|
592
|
-
type Provider = 'deepgram' | 'elevenlabs' | 'gemini' | 'openai';
|
|
651
|
+
type Provider = 'deepgram' | 'elevenlabs' | 'gemini' | 'openai' | 'web-speech' | (string & {});
|
|
593
652
|
interface GuideKitErrorOptions {
|
|
594
653
|
code: string;
|
|
595
654
|
message: string;
|
|
@@ -830,6 +889,11 @@ declare class GuideKitCore {
|
|
|
830
889
|
private setAgentState;
|
|
831
890
|
private notifyStoreListeners;
|
|
832
891
|
private buildSnapshot;
|
|
892
|
+
/**
|
|
893
|
+
* Unified built-in tool specifications — single source of truth for both
|
|
894
|
+
* tool definitions (sent to LLM) and handler registration.
|
|
895
|
+
*/
|
|
896
|
+
private getBuiltinToolSpecs;
|
|
833
897
|
/**
|
|
834
898
|
* Register all built-in tool handlers with the ToolExecutor.
|
|
835
899
|
* Called once during init() after VisualGuidance and all subsystems are ready.
|
|
@@ -1118,17 +1182,35 @@ interface TokenUsage$1 {
|
|
|
1118
1182
|
completion: number;
|
|
1119
1183
|
total: number;
|
|
1120
1184
|
}
|
|
1185
|
+
/** Configuration for the OpenAI adapter (custom adapter pattern). */
|
|
1186
|
+
interface OpenAIAdapterConfig {
|
|
1187
|
+
apiKey: string;
|
|
1188
|
+
model?: 'gpt-4o' | 'gpt-4o-mini' | (string & {});
|
|
1189
|
+
}
|
|
1121
1190
|
/**
|
|
1122
1191
|
* Adapter that translates between GuideKit's internal types and the
|
|
1123
1192
|
* OpenAI Chat Completions API wire format. Handles streaming via SSE,
|
|
1124
1193
|
* tool formatting, and response parsing.
|
|
1194
|
+
*
|
|
1195
|
+
* Usage as a custom adapter:
|
|
1196
|
+
* ```ts
|
|
1197
|
+
* import { OpenAIAdapter } from '@guidekit/core';
|
|
1198
|
+
* const llmConfig = { adapter: new OpenAIAdapter({ apiKey: '...', model: 'gpt-4o' }) };
|
|
1199
|
+
* ```
|
|
1125
1200
|
*/
|
|
1126
1201
|
declare class OpenAIAdapter implements LLMProviderAdapter {
|
|
1127
1202
|
private readonly apiKey;
|
|
1128
1203
|
private readonly model;
|
|
1129
|
-
|
|
1130
|
-
|
|
1131
|
-
|
|
1204
|
+
/** Tracks whether the last extractChunks call emitted a done chunk. */
|
|
1205
|
+
private lastExtractEmittedDone;
|
|
1206
|
+
/**
|
|
1207
|
+
* Token usage extracted from the most recent `parseResponse` call.
|
|
1208
|
+
* Updated as each SSE chunk is parsed.
|
|
1209
|
+
*/
|
|
1210
|
+
private _lastUsage;
|
|
1211
|
+
constructor(config: OpenAIAdapterConfig);
|
|
1212
|
+
/** Token usage from the most recent parseResponse call. */
|
|
1213
|
+
get lastUsage(): TokenUsage$1;
|
|
1132
1214
|
/**
|
|
1133
1215
|
* Convert GuideKit tool definitions into OpenAI's `tools` format.
|
|
1134
1216
|
* Each tool is wrapped as `{ type: 'function', function: { name, description, parameters } }`.
|
|
@@ -1150,6 +1232,10 @@ declare class OpenAIAdapter implements LLMProviderAdapter {
|
|
|
1150
1232
|
* prefixed by `data: `. The final line is `data: [DONE]`.
|
|
1151
1233
|
* Text content arrives in `choices[0].delta.content` and tool calls
|
|
1152
1234
|
* arrive in `choices[0].delta.tool_calls`.
|
|
1235
|
+
*
|
|
1236
|
+
* This method also:
|
|
1237
|
+
* - Detects content filtering and throws `ContentFilterError`.
|
|
1238
|
+
* - Tracks token usage (accessible via `lastUsage` after iteration).
|
|
1153
1239
|
*/
|
|
1154
1240
|
parseResponse(stream: ReadableStream<Uint8Array>): AsyncIterable<TextChunk | ToolCall>;
|
|
1155
1241
|
/**
|
|
@@ -1168,10 +1254,8 @@ declare class OpenAIAdapter implements LLMProviderAdapter {
|
|
|
1168
1254
|
*/
|
|
1169
1255
|
streamRequest(params: {
|
|
1170
1256
|
systemPrompt: string;
|
|
1171
|
-
contents:
|
|
1172
|
-
|
|
1173
|
-
content: string;
|
|
1174
|
-
}>;
|
|
1257
|
+
contents: unknown;
|
|
1258
|
+
userMessage?: string;
|
|
1175
1259
|
tools?: unknown;
|
|
1176
1260
|
signal?: AbortSignal;
|
|
1177
1261
|
timeoutMs?: number;
|
|
@@ -1230,9 +1314,17 @@ interface TokenUsage {
|
|
|
1230
1314
|
declare class GeminiAdapter implements LLMProviderAdapter {
|
|
1231
1315
|
private readonly apiKey;
|
|
1232
1316
|
private readonly model;
|
|
1317
|
+
/**
|
|
1318
|
+
* Token usage extracted from the most recent `parseResponse` call.
|
|
1319
|
+
* Updated as each SSE chunk is parsed; the final value reflects the
|
|
1320
|
+
* cumulative usage metadata sent by Gemini (typically in the last chunk).
|
|
1321
|
+
*/
|
|
1322
|
+
private _lastUsage;
|
|
1233
1323
|
constructor(config: Extract<LLMConfig, {
|
|
1234
1324
|
provider: 'gemini';
|
|
1235
1325
|
}>);
|
|
1326
|
+
/** Token usage from the most recent parseResponse call. */
|
|
1327
|
+
get lastUsage(): TokenUsage;
|
|
1236
1328
|
/**
|
|
1237
1329
|
* Convert GuideKit tool definitions into Gemini's `functionDeclarations`
|
|
1238
1330
|
* format, wrapped inside a `tools` array.
|
|
@@ -1255,6 +1347,10 @@ declare class GeminiAdapter implements LLMProviderAdapter {
|
|
|
1255
1347
|
* The Gemini `streamGenerateContent?alt=sse` endpoint sends each chunk
|
|
1256
1348
|
* as a JSON object prefixed by `data: `. We parse line-by-line, extract
|
|
1257
1349
|
* text parts and function call parts, and yield the appropriate types.
|
|
1350
|
+
*
|
|
1351
|
+
* This method also:
|
|
1352
|
+
* - Detects content filtering and throws `ContentFilterError`.
|
|
1353
|
+
* - Tracks token usage (accessible via `lastUsage` after iteration).
|
|
1258
1354
|
*/
|
|
1259
1355
|
parseResponse(stream: ReadableStream<Uint8Array>): AsyncIterable<TextChunk | ToolCall>;
|
|
1260
1356
|
/**
|
|
@@ -1275,16 +1371,16 @@ declare class GeminiAdapter implements LLMProviderAdapter {
|
|
|
1275
1371
|
/**
|
|
1276
1372
|
* Build and execute a streaming request to the Gemini API.
|
|
1277
1373
|
* Returns the raw `ReadableStream` for the response body together with
|
|
1278
|
-
*
|
|
1374
|
+
* the raw Response object.
|
|
1375
|
+
*
|
|
1376
|
+
* Note: The Gemini API key is passed as a URL query parameter (`key=`).
|
|
1377
|
+
* This is inherent to the Gemini REST SSE endpoint design; the key is
|
|
1378
|
+
* transmitted over HTTPS so it remains encrypted in transit. (H3)
|
|
1279
1379
|
*/
|
|
1280
1380
|
streamRequest(params: {
|
|
1281
1381
|
systemPrompt: string;
|
|
1282
|
-
contents:
|
|
1283
|
-
|
|
1284
|
-
parts: Array<{
|
|
1285
|
-
text: string;
|
|
1286
|
-
}>;
|
|
1287
|
-
}>;
|
|
1382
|
+
contents: unknown;
|
|
1383
|
+
userMessage?: string;
|
|
1288
1384
|
tools?: unknown;
|
|
1289
1385
|
signal?: AbortSignal;
|
|
1290
1386
|
timeoutMs?: number;
|
|
@@ -1317,12 +1413,16 @@ declare class GeminiAdapter implements LLMProviderAdapter {
|
|
|
1317
1413
|
* High-level orchestrator that manages LLM interactions for the GuideKit SDK.
|
|
1318
1414
|
*
|
|
1319
1415
|
* Responsibilities:
|
|
1320
|
-
* - Owns the active `LLMProviderAdapter
|
|
1416
|
+
* - Owns the active `LLMProviderAdapter`.
|
|
1321
1417
|
* - Streams responses from the provider, emitting callbacks for text chunks,
|
|
1322
1418
|
* tool calls, and token usage.
|
|
1323
1419
|
* - Handles content filter retries: if the initial response is blocked, it
|
|
1324
1420
|
* retries once with a stripped-down prompt (no tools).
|
|
1325
1421
|
* - Surfaces all errors through the SDK error hierarchy.
|
|
1422
|
+
*
|
|
1423
|
+
* The orchestrator is fully adapter-agnostic: all provider-specific logic
|
|
1424
|
+
* (SSE parsing, content filter detection, usage extraction) lives in the
|
|
1425
|
+
* adapter implementations.
|
|
1326
1426
|
*/
|
|
1327
1427
|
declare class LLMOrchestrator {
|
|
1328
1428
|
private _adapter;
|
|
@@ -1363,12 +1463,22 @@ declare class LLMOrchestrator {
|
|
|
1363
1463
|
get adapter(): LLMProviderAdapter;
|
|
1364
1464
|
/**
|
|
1365
1465
|
* Execute a streaming LLM request and collect the results.
|
|
1466
|
+
*
|
|
1467
|
+
* This method is fully adapter-agnostic: it delegates streaming,
|
|
1468
|
+
* response parsing, content-filter detection, and usage extraction
|
|
1469
|
+
* entirely to the active `LLMProviderAdapter`. No provider-specific
|
|
1470
|
+
* SSE parsing lives in the orchestrator.
|
|
1366
1471
|
*/
|
|
1367
1472
|
private executeStream;
|
|
1368
1473
|
/**
|
|
1369
1474
|
* Create the appropriate adapter for the given config.
|
|
1370
|
-
*
|
|
1371
|
-
*
|
|
1475
|
+
*
|
|
1476
|
+
* Built-in providers:
|
|
1477
|
+
* - `'gemini'` — uses the bundled `GeminiAdapter`.
|
|
1478
|
+
*
|
|
1479
|
+
* Custom adapters:
|
|
1480
|
+
* - Pass `{ adapter: myAdapter }` to use any `LLMProviderAdapter`.
|
|
1481
|
+
* Example: `llm: { adapter: new OpenAIAdapter({ ... }) }`
|
|
1372
1482
|
*/
|
|
1373
1483
|
private createAdapter;
|
|
1374
1484
|
/** Convenience accessor for the current provider name. */
|
|
@@ -2183,4 +2293,435 @@ declare class TokenManager {
|
|
|
2183
2293
|
private log;
|
|
2184
2294
|
}
|
|
2185
2295
|
|
|
2186
|
-
|
|
2296
|
+
/**
|
|
2297
|
+
* Minimal type declarations for the Web Speech API SpeechRecognition
|
|
2298
|
+
* interface. These cover the subset used by this adapter. Full type
|
|
2299
|
+
* definitions are available in lib.dom.d.ts but may not be present in
|
|
2300
|
+
* all TS configurations.
|
|
2301
|
+
*/
|
|
2302
|
+
interface SpeechRecognitionEvent {
|
|
2303
|
+
readonly resultIndex: number;
|
|
2304
|
+
readonly results: SpeechRecognitionResultList;
|
|
2305
|
+
}
|
|
2306
|
+
interface SpeechRecognitionResultList {
|
|
2307
|
+
readonly length: number;
|
|
2308
|
+
item(index: number): SpeechRecognitionResult;
|
|
2309
|
+
[index: number]: SpeechRecognitionResult;
|
|
2310
|
+
}
|
|
2311
|
+
interface SpeechRecognitionResult {
|
|
2312
|
+
readonly length: number;
|
|
2313
|
+
readonly isFinal: boolean;
|
|
2314
|
+
item(index: number): SpeechRecognitionAlternative;
|
|
2315
|
+
[index: number]: SpeechRecognitionAlternative;
|
|
2316
|
+
}
|
|
2317
|
+
interface SpeechRecognitionAlternative {
|
|
2318
|
+
readonly transcript: string;
|
|
2319
|
+
readonly confidence: number;
|
|
2320
|
+
}
|
|
2321
|
+
interface SpeechRecognitionErrorEvent {
|
|
2322
|
+
readonly error: string;
|
|
2323
|
+
readonly message: string;
|
|
2324
|
+
}
|
|
2325
|
+
interface SpeechRecognitionInstance extends EventTarget {
|
|
2326
|
+
lang: string;
|
|
2327
|
+
continuous: boolean;
|
|
2328
|
+
interimResults: boolean;
|
|
2329
|
+
maxAlternatives: number;
|
|
2330
|
+
onresult: ((event: SpeechRecognitionEvent) => void) | null;
|
|
2331
|
+
onerror: ((event: SpeechRecognitionErrorEvent) => void) | null;
|
|
2332
|
+
onend: (() => void) | null;
|
|
2333
|
+
onstart: (() => void) | null;
|
|
2334
|
+
start(): void;
|
|
2335
|
+
stop(): void;
|
|
2336
|
+
abort(): void;
|
|
2337
|
+
}
|
|
2338
|
+
interface SpeechRecognitionConstructor {
|
|
2339
|
+
new (): SpeechRecognitionInstance;
|
|
2340
|
+
}
|
|
2341
|
+
declare global {
|
|
2342
|
+
var webkitSpeechRecognition: SpeechRecognitionConstructor | undefined;
|
|
2343
|
+
}
|
|
2344
|
+
interface WebSpeechSTTOptions {
|
|
2345
|
+
language?: string;
|
|
2346
|
+
continuous?: boolean;
|
|
2347
|
+
interimResults?: boolean;
|
|
2348
|
+
debug?: boolean;
|
|
2349
|
+
}
|
|
2350
|
+
declare class WebSpeechSTT {
|
|
2351
|
+
private readonly language;
|
|
2352
|
+
private readonly continuous;
|
|
2353
|
+
private readonly interimResultsEnabled;
|
|
2354
|
+
private readonly debugEnabled;
|
|
2355
|
+
private recognition;
|
|
2356
|
+
private _connected;
|
|
2357
|
+
private _suspended;
|
|
2358
|
+
/**
|
|
2359
|
+
* Whether we intentionally stopped recognition. Used to distinguish
|
|
2360
|
+
* between intentional stop and unexpected end (for auto-restart in
|
|
2361
|
+
* continuous mode).
|
|
2362
|
+
*/
|
|
2363
|
+
private _intentionalStop;
|
|
2364
|
+
/** Registered transcript callbacks. */
|
|
2365
|
+
private readonly transcriptCallbacks;
|
|
2366
|
+
constructor(options?: WebSpeechSTTOptions);
|
|
2367
|
+
/**
|
|
2368
|
+
* Check whether the Web Speech API SpeechRecognition is supported in the
|
|
2369
|
+
* current environment. Safe to call in SSR (returns false).
|
|
2370
|
+
*/
|
|
2371
|
+
static isSupported(): boolean;
|
|
2372
|
+
/** Whether recognition is currently active and connected. */
|
|
2373
|
+
get isConnected(): boolean;
|
|
2374
|
+
/**
|
|
2375
|
+
* Start speech recognition.
|
|
2376
|
+
*
|
|
2377
|
+
* Creates the SpeechRecognition instance and begins listening. Resolves
|
|
2378
|
+
* once the recognition session has started. Rejects if the API is not
|
|
2379
|
+
* supported or the browser denies permission.
|
|
2380
|
+
*/
|
|
2381
|
+
connect(): Promise<void>;
|
|
2382
|
+
/**
|
|
2383
|
+
* Send audio data. No-op for Web Speech API since it captures audio
|
|
2384
|
+
* directly from the microphone via the browser's internal pipeline.
|
|
2385
|
+
*
|
|
2386
|
+
* Provided for interface compatibility with WebSocket-based STT adapters
|
|
2387
|
+
* (DeepgramSTT, ElevenLabsSTT).
|
|
2388
|
+
*/
|
|
2389
|
+
sendAudio(_audioData: Float32Array | Int16Array): void;
|
|
2390
|
+
/**
|
|
2391
|
+
* Register a callback to receive transcript events.
|
|
2392
|
+
*
|
|
2393
|
+
* @returns An unsubscribe function. Calling it more than once is safe.
|
|
2394
|
+
*/
|
|
2395
|
+
onTranscript(callback: (event: STTTranscriptEvent) => void): () => void;
|
|
2396
|
+
/**
|
|
2397
|
+
* Gracefully stop recognition.
|
|
2398
|
+
*
|
|
2399
|
+
* Calls `stop()` on the SpeechRecognition instance which allows it to
|
|
2400
|
+
* deliver any pending final results before ending.
|
|
2401
|
+
*/
|
|
2402
|
+
close(): void;
|
|
2403
|
+
/** Force-destroy the recognition without waiting for pending results. */
|
|
2404
|
+
destroy(): void;
|
|
2405
|
+
/**
|
|
2406
|
+
* Suspend the adapter (e.g. when the device goes offline).
|
|
2407
|
+
*
|
|
2408
|
+
* Stops recognition and marks the adapter as suspended so that auto-restart
|
|
2409
|
+
* does not trigger.
|
|
2410
|
+
*/
|
|
2411
|
+
suspend(): void;
|
|
2412
|
+
/**
|
|
2413
|
+
* Resume after a prior `suspend()`. Restarts recognition if it was
|
|
2414
|
+
* running before suspension.
|
|
2415
|
+
*/
|
|
2416
|
+
resume(): void;
|
|
2417
|
+
/**
|
|
2418
|
+
* Handle SpeechRecognition result events.
|
|
2419
|
+
*
|
|
2420
|
+
* The `results` property is a SpeechRecognitionResultList containing all
|
|
2421
|
+
* results accumulated during this recognition session. We only process
|
|
2422
|
+
* results from `resultIndex` onward to avoid re-emitting old results.
|
|
2423
|
+
*/
|
|
2424
|
+
private handleResult;
|
|
2425
|
+
/**
|
|
2426
|
+
* Handle SpeechRecognition errors.
|
|
2427
|
+
*
|
|
2428
|
+
* Some errors are recoverable (e.g. `no-speech`) and some are fatal
|
|
2429
|
+
* (e.g. `not-allowed`). For recoverable errors in continuous mode,
|
|
2430
|
+
* recognition will auto-restart via the `onend` handler.
|
|
2431
|
+
*/
|
|
2432
|
+
private handleError;
|
|
2433
|
+
/**
|
|
2434
|
+
* Emit a transcript event to all registered callbacks.
|
|
2435
|
+
*
|
|
2436
|
+
* Errors thrown by individual callbacks are caught and logged so one
|
|
2437
|
+
* misbehaving subscriber does not prevent others from receiving the event.
|
|
2438
|
+
*/
|
|
2439
|
+
private emitTranscript;
|
|
2440
|
+
/**
|
|
2441
|
+
* Resolve the SpeechRecognition constructor, with the webkit-prefixed
|
|
2442
|
+
* fallback. Returns null if not available.
|
|
2443
|
+
*/
|
|
2444
|
+
private resolveSpeechRecognition;
|
|
2445
|
+
/** Reset internal state after disconnection. */
|
|
2446
|
+
private cleanup;
|
|
2447
|
+
/** Conditional debug logging. */
|
|
2448
|
+
private log;
|
|
2449
|
+
}
|
|
2450
|
+
|
|
2451
|
+
interface WebSpeechTTSOptions {
|
|
2452
|
+
voice?: string;
|
|
2453
|
+
rate?: number;
|
|
2454
|
+
pitch?: number;
|
|
2455
|
+
language?: string;
|
|
2456
|
+
debug?: boolean;
|
|
2457
|
+
}
|
|
2458
|
+
/**
|
|
2459
|
+
* Audio event compatible with the TTSAudioEvent shape used by
|
|
2460
|
+
* VoicePipeline for ElevenLabs TTS. Web Speech API does not produce
|
|
2461
|
+
* raw audio buffers, so we emit events with empty buffers and use
|
|
2462
|
+
* isFinal to signal utterance completion.
|
|
2463
|
+
*/
|
|
2464
|
+
interface WebSpeechTTSAudioEvent {
|
|
2465
|
+
audio: ArrayBuffer;
|
|
2466
|
+
isFinal: boolean;
|
|
2467
|
+
timestamp: number;
|
|
2468
|
+
}
|
|
2469
|
+
declare class WebSpeechTTS {
|
|
2470
|
+
private readonly voiceName;
|
|
2471
|
+
private readonly rate;
|
|
2472
|
+
private readonly pitch;
|
|
2473
|
+
private readonly language;
|
|
2474
|
+
private readonly debugEnabled;
|
|
2475
|
+
private _connected;
|
|
2476
|
+
private _suspended;
|
|
2477
|
+
/** Cached voice object resolved from voiceName. */
|
|
2478
|
+
private _resolvedVoice;
|
|
2479
|
+
/** Whether voices have been loaded (they load async in some browsers). */
|
|
2480
|
+
private _voicesLoaded;
|
|
2481
|
+
/** Registered audio-event callbacks. */
|
|
2482
|
+
private readonly audioCallbacks;
|
|
2483
|
+
constructor(options?: WebSpeechTTSOptions);
|
|
2484
|
+
/**
|
|
2485
|
+
* Check whether the Web Speech API SpeechSynthesis is supported in the
|
|
2486
|
+
* current environment. Safe to call in SSR (returns false).
|
|
2487
|
+
*/
|
|
2488
|
+
static isSupported(): boolean;
|
|
2489
|
+
/** Whether the adapter is connected (ready for speech). */
|
|
2490
|
+
get isConnected(): boolean;
|
|
2491
|
+
/**
|
|
2492
|
+
* Initialize the adapter.
|
|
2493
|
+
*
|
|
2494
|
+
* Loads available voices and resolves the requested voice name. Voice
|
|
2495
|
+
* loading is async in some browsers (notably Chrome) so we wait for
|
|
2496
|
+
* the `voiceschanged` event if needed.
|
|
2497
|
+
*/
|
|
2498
|
+
connect(): Promise<void>;
|
|
2499
|
+
/**
|
|
2500
|
+
* Speak the given text using the browser's speech synthesis engine.
|
|
2501
|
+
*
|
|
2502
|
+
* Declared to return `void`, not a Promise, so the call cannot be awaited.
|
|
2503
|
+
* Utterance completion is signalled via `isFinal` on emitted audio events.
|
|
2504
|
+
*
|
|
2505
|
+
* Also emits audio events to registered callbacks for VoicePipeline
|
|
2506
|
+
* compatibility.
|
|
2507
|
+
*/
|
|
2508
|
+
speak(text: string): void;
|
|
2509
|
+
/**
|
|
2510
|
+
* Flush / finalize the current utterance.
|
|
2511
|
+
*
|
|
2512
|
+
* No-op for Web Speech API since each speak() call is a complete
|
|
2513
|
+
* utterance. Provided for interface compatibility with ElevenLabsTTS.
|
|
2514
|
+
*/
|
|
2515
|
+
flush(): void;
|
|
2516
|
+
/**
|
|
2517
|
+
* Register a callback to receive audio output events.
|
|
2518
|
+
*
|
|
2519
|
+
* For Web Speech API, these events have empty audio buffers and are
|
|
2520
|
+
* used to signal utterance start/end for VoicePipeline state management.
|
|
2521
|
+
*
|
|
2522
|
+
* @returns An unsubscribe function. Calling it more than once is safe.
|
|
2523
|
+
*/
|
|
2524
|
+
onAudio(callback: (event: WebSpeechTTSAudioEvent) => void): () => void;
|
|
2525
|
+
/** Stop current speech synthesis and cancel any queued utterances. */
|
|
2526
|
+
stop(): void;
|
|
2527
|
+
/** Gracefully close the adapter. */
|
|
2528
|
+
close(): void;
|
|
2529
|
+
/** Force-destroy the adapter. */
|
|
2530
|
+
destroy(): void;
|
|
2531
|
+
/**
|
|
2532
|
+
* Suspend the adapter (e.g. when the device goes offline).
|
|
2533
|
+
*
|
|
2534
|
+
* Pauses any active speech synthesis and marks the adapter as suspended.
|
|
2535
|
+
*/
|
|
2536
|
+
suspend(): void;
|
|
2537
|
+
/**
|
|
2538
|
+
* Resume after a prior `suspend()`.
|
|
2539
|
+
*/
|
|
2540
|
+
resume(): void;
|
|
2541
|
+
/**
|
|
2542
|
+
* Load available voices from the browser.
|
|
2543
|
+
*
|
|
2544
|
+
* In Chrome and some other browsers, voices load asynchronously after
|
|
2545
|
+
* the page loads. We wait for the `voiceschanged` event with a timeout.
|
|
2546
|
+
*/
|
|
2547
|
+
private loadVoices;
|
|
2548
|
+
/**
|
|
2549
|
+
* Find a voice by name (case-insensitive partial match).
|
|
2550
|
+
*/
|
|
2551
|
+
private findVoice;
|
|
2552
|
+
/**
|
|
2553
|
+
* Emit an audio event to all registered callbacks.
|
|
2554
|
+
*
|
|
2555
|
+
* Errors thrown by individual callbacks are caught and logged so one
|
|
2556
|
+
* misbehaving subscriber does not prevent others from receiving the event.
|
|
2557
|
+
*/
|
|
2558
|
+
private emitAudio;
|
|
2559
|
+
/** Reset internal state. */
|
|
2560
|
+
private cleanup;
|
|
2561
|
+
/** Conditional debug logging. */
|
|
2562
|
+
private log;
|
|
2563
|
+
}
|
|
2564
|
+
|
|
2565
|
+
type VoiceState = 'idle' | 'listening' | 'processing' | 'speaking' | 'error';
|
|
2566
|
+
interface VoicePipelineOptions {
|
|
2567
|
+
sttConfig: {
|
|
2568
|
+
provider: 'deepgram';
|
|
2569
|
+
apiKey: string;
|
|
2570
|
+
model?: 'nova-2' | 'nova-3';
|
|
2571
|
+
} | {
|
|
2572
|
+
provider: 'elevenlabs';
|
|
2573
|
+
apiKey: string;
|
|
2574
|
+
language?: string;
|
|
2575
|
+
} | {
|
|
2576
|
+
provider: 'web-speech';
|
|
2577
|
+
language?: string;
|
|
2578
|
+
continuous?: boolean;
|
|
2579
|
+
interimResults?: boolean;
|
|
2580
|
+
};
|
|
2581
|
+
ttsConfig: {
|
|
2582
|
+
provider: 'elevenlabs';
|
|
2583
|
+
apiKey: string;
|
|
2584
|
+
voiceId?: string;
|
|
2585
|
+
modelId?: string;
|
|
2586
|
+
} | {
|
|
2587
|
+
provider: 'web-speech';
|
|
2588
|
+
voice?: string;
|
|
2589
|
+
rate?: number;
|
|
2590
|
+
pitch?: number;
|
|
2591
|
+
language?: string;
|
|
2592
|
+
};
|
|
2593
|
+
debug?: boolean;
|
|
2594
|
+
}
|
|
2595
|
+
declare global {
|
|
2596
|
+
var webkitAudioContext: typeof AudioContext | undefined;
|
|
2597
|
+
}
|
|
2598
|
+
declare class VoicePipeline {
|
|
2599
|
+
private readonly _sttConfig;
|
|
2600
|
+
private readonly _ttsConfig;
|
|
2601
|
+
private readonly _debug;
|
|
2602
|
+
private _state;
|
|
2603
|
+
private _destroyed;
|
|
2604
|
+
private readonly _bus;
|
|
2605
|
+
private _audioContext;
|
|
2606
|
+
private _mediaStream;
|
|
2607
|
+
private _vad;
|
|
2608
|
+
private _stt;
|
|
2609
|
+
private _tts;
|
|
2610
|
+
private _micSourceNode;
|
|
2611
|
+
private _captureProcessor;
|
|
2612
|
+
private _isForwardingToSTT;
|
|
2613
|
+
private _playbackQueue;
|
|
2614
|
+
private _jitterBufferTimer;
|
|
2615
|
+
private _isPlaybackStarted;
|
|
2616
|
+
private _nextPlaybackTime;
|
|
2617
|
+
private _activeSourceNodes;
|
|
2618
|
+
private _lastScheduledSource;
|
|
2619
|
+
private _lastTTSEcho;
|
|
2620
|
+
private _pendingLLMAbort;
|
|
2621
|
+
private readonly _stateChangeCallbacks;
|
|
2622
|
+
private readonly _transcriptCallbacks;
|
|
2623
|
+
private _unsubVADSpeechStart;
|
|
2624
|
+
private _unsubVADSpeechEnd;
|
|
2625
|
+
private _unsubSTTTranscript;
|
|
2626
|
+
private _unsubTTSAudio;
|
|
2627
|
+
constructor(options: VoicePipelineOptions);
|
|
2628
|
+
/** Current pipeline state. */
|
|
2629
|
+
get state(): VoiceState;
|
|
2630
|
+
/**
|
|
2631
|
+
* Initialize AudioContext, VAD model, and STT/TTS connections.
|
|
2632
|
+
*
|
|
2633
|
+
* **Must be called in response to a user gesture** (click / tap) to
|
|
2634
|
+
* satisfy browser autoplay policies.
|
|
2635
|
+
*/
|
|
2636
|
+
init(): Promise<void>;
|
|
2637
|
+
/**
|
|
2638
|
+
* Start listening: activate microphone, begin VAD + STT pipeline.
|
|
2639
|
+
*
|
|
2640
|
+
* Valid from: IDLE, ERROR, SPEAKING (barge-in path calls this internally).
|
|
2641
|
+
*/
|
|
2642
|
+
startListening(): Promise<void>;
|
|
2643
|
+
/** Stop listening: deactivate mic and VAD. */
|
|
2644
|
+
stopListening(): void;
|
|
2645
|
+
/**
|
|
2646
|
+
* Process a transcript through an LLM callback and speak the response.
|
|
2647
|
+
*
|
|
2648
|
+
* @param text - The user's transcript text.
|
|
2649
|
+
* @param sendToLLM - Async callback that sends text to the LLM and returns the response.
|
|
2650
|
+
*/
|
|
2651
|
+
processTranscript(text: string, sendToLLM: (text: string) => Promise<string>): Promise<void>;
|
|
2652
|
+
/** Speak text via TTS (ElevenLabs or Web Speech API). */
|
|
2653
|
+
speak(text: string): Promise<void>;
|
|
2654
|
+
/** Stop current TTS playback immediately (barge-in). */
|
|
2655
|
+
stopSpeaking(): void;
|
|
2656
|
+
/** Subscribe to state changes. Returns an unsubscribe function. */
|
|
2657
|
+
onStateChange(callback: (state: VoiceState, previous: VoiceState) => void): () => void;
|
|
2658
|
+
/** Subscribe to transcript events. Returns an unsubscribe function. */
|
|
2659
|
+
onTranscript(callback: (text: string, isFinal: boolean) => void): () => void;
|
|
2660
|
+
/** Destroy all resources held by the pipeline. */
|
|
2661
|
+
destroy(): Promise<void>;
|
|
2662
|
+
private _setState;
|
|
2663
|
+
/**
|
|
2664
|
+
* Resolve the AudioContext constructor, with Safari webkitAudioContext
|
|
2665
|
+
* fallback. Returns null if Web Audio is not available.
|
|
2666
|
+
*/
|
|
2667
|
+
private _resolveAudioContext;
|
|
2668
|
+
/**
|
|
2669
|
+
* Pre-warm the AudioContext by playing a silent buffer.
|
|
2670
|
+
* This forces the context into the "running" state and avoids a
|
|
2671
|
+
* noticeable delay on the first real playback.
|
|
2672
|
+
*/
|
|
2673
|
+
private _prewarmAudioContext;
|
|
2674
|
+
/**
|
|
2675
|
+
* Set up a ScriptProcessorNode to capture mic audio and forward it
|
|
2676
|
+
* to the STT adapter when `_isForwardingToSTT` is true.
|
|
2677
|
+
*/
|
|
2678
|
+
private _setupMicCapture;
|
|
2679
|
+
/** Tear down the mic capture ScriptProcessorNode. */
|
|
2680
|
+
private _teardownMicCapture;
|
|
2681
|
+
/** Stop all tracks on the current MediaStream. */
|
|
2682
|
+
private _stopMicTracks;
|
|
2683
|
+
private _handleVADSpeechStart;
|
|
2684
|
+
private _handleVADSpeechEnd;
|
|
2685
|
+
private _handleTranscript;
|
|
2686
|
+
/**
|
|
2687
|
+
* Handle an audio chunk from ElevenLabs TTS.
|
|
2688
|
+
*
|
|
2689
|
+
* Implements a jitter buffer: we accumulate audio for JITTER_BUFFER_MS
|
|
2690
|
+
* before starting playback to smooth out network jitter.
|
|
2691
|
+
*/
|
|
2692
|
+
private _handleTTSAudio;
|
|
2693
|
+
/** Flush the jitter buffer and start playback. */
|
|
2694
|
+
private _flushJitterBuffer;
|
|
2695
|
+
/**
|
|
2696
|
+
* Begin playback: decode all queued chunks and schedule them.
|
|
2697
|
+
* If `onDone` is provided, it is called when the last chunk finishes playing.
|
|
2698
|
+
*/
|
|
2699
|
+
private _startPlayback;
|
|
2700
|
+
/**
|
|
2701
|
+
* Decode an audio chunk (mp3 from ElevenLabs) and schedule it for
|
|
2702
|
+
* sequential playback via AudioBufferSourceNode.
|
|
2703
|
+
*/
|
|
2704
|
+
private _decodeAndSchedule;
|
|
2705
|
+
/**
|
|
2706
|
+
* Check if VAD speech-start during SPEAKING state is likely echo from
|
|
2707
|
+
* the speaker playing TTS audio rather than genuine user speech.
|
|
2708
|
+
*
|
|
2709
|
+
* Simple heuristic: if we are still within the echo window of a recent
|
|
2710
|
+
* TTS utterance, treat it as potential echo.
|
|
2711
|
+
*/
|
|
2712
|
+
private _isEchoDetected;
|
|
2713
|
+
/**
|
|
2714
|
+
* Check if a transcript is an echo of recent TTS output.
|
|
2715
|
+
*
|
|
2716
|
+
* Uses word overlap: if intersection of words > 60% of max set size
|
|
2717
|
+
* and the transcript arrived within the echo window, discard it.
|
|
2718
|
+
*/
|
|
2719
|
+
private _isTranscriptEcho;
|
|
2720
|
+
/**
|
|
2721
|
+
* Normalize text into a set of lowercase words, stripping punctuation.
|
|
2722
|
+
*/
|
|
2723
|
+
private _normalizeWords;
|
|
2724
|
+
private _log;
|
|
2725
|
+
}
|
|
2726
|
+
|
|
2727
|
+
export { type AgentConfig, type AgentState, type AggregatedUsage, AuthenticationError, type AwarenessOptions, type AwarenessState, AwarenessSystem, type BeforeLLMCallContext, BrowserSupportError, ConfigurationError, ConnectionManager, type ConnectionManagerOptions, type ConnectionState, ContentFilterError, type ContentMap, type ContentMapEntry, type ContentMapFunction, type ContentMapInput, ContextManager, type ContextManagerOptions, type ConversationTurn, type CreateSessionTokenOptions, DOMScanner, type DOMScannerOptions, type ErrorCode, ErrorCodes, EventBus, type EventMap, type FormField, type FormSummary, GeminiAdapter, GuideKitCore, type GuideKitCoreOptions, GuideKitError, type GuideKitErrorOptions, type GuideKitErrorType, type GuideKitEvent, type GuideKitOptions, type GuideKitProviderProps, type GuideKitStore, type GuideKitTheme, type HealthCheckResult, type HealthCheckStatus, I18n, type I18nOptions, type I18nStrings, InitializationError, type InteractiveElement, type LLMConfig, LLMOrchestrator, type LLMProviderAdapter, type LocaleInput, type NavItem, NavigationController, type NavigationControllerOptions, NetworkError, OpenAIAdapter, type OpenAIAdapterConfig, type OverlayElement, type PageModel, type PageSection, PermissionError, type ProactiveOptions, type ProactiveTrigger, ProactiveTriggerEngine, type ProactiveTriggerType, type Provider, type QueuedMessage, RateLimitError, RateLimiter, type RateLimiterOptions, type RateLimiterState, type RateLimits, type Resource, ResourceExhaustedError, ResourceManager, type ResourceManagerState, type STTConfig, type ScanMetadata, type SessionState, SingletonGuard, type SpotlightState, type SupportedLocale, type TTSConfig, type TextChunk, TimeoutError, type TokenData, TokenManager, type TokenManagerOptions, type TokenPayload, type TokenResponse, type ToolCall, type ToolCallRecord, type ToolDefinition, type ToolExecutionResult, ToolExecutor, type ToolExecutorOptions, type ToolHandler, type TooltipOptions, VisualGuidance, 
type VisualGuidanceOptions, VoicePipeline, type VoicePipelineOptions, type VoiceState, WebSpeechSTT, type WebSpeechSTTOptions, WebSpeechTTS, type WebSpeechTTSAudioEvent, type WebSpeechTTSOptions, createEventBus, isGuideKitError };
|