@guidekit/core 0.1.0-beta.1 → 0.1.0-beta.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -351,8 +351,14 @@ type STTConfig = {
351
351
  apiKey: string;
352
352
  model?: 'nova-2' | 'nova-3';
353
353
  } | {
354
- provider: 'assemblyai';
354
+ provider: 'elevenlabs';
355
355
  apiKey: string;
356
+ language?: string;
357
+ } | {
358
+ provider: 'web-speech';
359
+ language?: string;
360
+ continuous?: boolean;
361
+ interimResults?: boolean;
356
362
  };
357
363
  /** Text-to-speech provider configuration. */
358
364
  type TTSConfig = {
@@ -360,19 +366,26 @@ type TTSConfig = {
360
366
  apiKey: string;
361
367
  voiceId?: string;
362
368
  } | {
363
- provider: 'cartesia';
364
- apiKey: string;
365
- voiceId?: string;
369
+ provider: 'web-speech';
370
+ voice?: string;
371
+ rate?: number;
372
+ pitch?: number;
373
+ language?: string;
366
374
  };
375
+ /** Transcript event emitted by any STT adapter. */
376
+ interface STTTranscriptEvent {
377
+ text: string;
378
+ isFinal: boolean;
379
+ confidence: number;
380
+ timestamp: number;
381
+ }
367
382
  /** Large language model provider configuration. */
368
383
  type LLMConfig = {
369
384
  provider: 'gemini';
370
385
  apiKey: string;
371
386
  model?: 'gemini-2.5-flash' | 'gemini-2.5-pro';
372
387
  } | {
373
- provider: 'openai';
374
- apiKey: string;
375
- model?: 'gpt-4o' | 'gpt-4o-mini';
388
+ adapter: LLMProviderAdapter;
376
389
  };
377
390
  /** Top-level options that control SDK behaviour. */
378
391
  interface GuideKitOptions {
@@ -481,11 +494,27 @@ interface GuideKitProviderProps {
481
494
  }>;
482
495
  children?: unknown;
483
496
  }
497
+ /** A single JSON-Schema-style property descriptor used in tool parameter maps. */
498
+ interface ToolParameterSchema {
499
+ type: string;
500
+ description?: string;
501
+ enum?: string[];
502
+ items?: {
503
+ type: string;
504
+ };
505
+ [key: string]: unknown;
506
+ }
484
507
  /** Definition of a tool that can be invoked by the LLM. */
485
508
  interface ToolDefinition {
486
509
  name: string;
487
510
  description: string;
488
- parameters: Record<string, unknown>;
511
+ /** Flat map of param name → JSON Schema property descriptor. */
512
+ parameters: Record<string, ToolParameterSchema>;
513
+ /**
514
+ * List of parameter names the LLM must always provide.
515
+ * Omit or use `[]` when every parameter is optional.
516
+ */
517
+ required?: string[];
489
518
  schemaVersion: number;
490
519
  }
491
520
  /** A tool invocation request returned by the LLM. */
@@ -509,6 +538,35 @@ interface LLMProviderAdapter {
509
538
  formatConversation(history: ConversationTurn[]): unknown;
510
539
  parseResponse(stream: ReadableStream): AsyncIterable<TextChunk | ToolCall>;
511
540
  formatToolResult(callId: string, result: unknown): unknown;
541
+ /**
542
+ * Build and execute a streaming request to the provider API.
543
+ * Returns the raw ReadableStream for the response body.
544
+ */
545
+ streamRequest(params: {
546
+ systemPrompt: string;
547
+ contents: unknown;
548
+ userMessage?: string;
549
+ tools?: unknown;
550
+ signal?: AbortSignal;
551
+ timeoutMs?: number;
552
+ }): Promise<{
553
+ stream: ReadableStream<Uint8Array>;
554
+ response: Response;
555
+ }>;
556
+ /**
557
+ * Check whether a parsed response chunk indicates the response was
558
+ * blocked by a content/safety filter.
559
+ */
560
+ isContentFiltered(chunk: Record<string, unknown>): boolean;
561
+ /**
562
+ * Extract token usage from a parsed response chunk.
563
+ * Returns `null` if no usage metadata is present in this chunk.
564
+ */
565
+ extractUsage(chunk: Record<string, unknown>): {
566
+ prompt: number;
567
+ completion: number;
568
+ total: number;
569
+ } | null;
512
570
  }
513
571
  /** Decoded payload of a GuideKit session token. */
514
572
  interface TokenPayload {
@@ -529,9 +587,9 @@ interface TokenResponse {
529
587
  /** Options for `createSessionToken()` on the server side. */
530
588
  interface CreateSessionTokenOptions {
531
589
  signingSecret: string | string[];
532
- deepgramKey?: string;
533
- elevenlabsKey?: string;
534
- geminiKey?: string;
590
+ sttApiKey?: string;
591
+ ttsApiKey?: string;
592
+ llmApiKey?: string;
535
593
  expiresIn?: string;
536
594
  allowedOrigins?: string[];
537
595
  permissions?: string[];
@@ -586,10 +644,11 @@ declare const ErrorCodes: {
586
644
  readonly VAD_PACKAGE_MISSING: "VAD_PACKAGE_MISSING";
587
645
  readonly CONTENT_FILTER_TRIGGERED: "CONTENT_FILTER_TRIGGERED";
588
646
  readonly PRIVACY_HOOK_CANCELLED: "PRIVACY_HOOK_CANCELLED";
647
+ readonly UNKNOWN: "UNKNOWN";
589
648
  };
590
649
  /** Union of every known error code string. */
591
650
  type ErrorCode = (typeof ErrorCodes)[keyof typeof ErrorCodes];
592
- type Provider = 'deepgram' | 'elevenlabs' | 'gemini' | 'openai';
651
+ type Provider = 'deepgram' | 'elevenlabs' | 'gemini' | 'openai' | 'web-speech' | (string & {});
593
652
  interface GuideKitErrorOptions {
594
653
  code: string;
595
654
  message: string;
@@ -830,6 +889,11 @@ declare class GuideKitCore {
830
889
  private setAgentState;
831
890
  private notifyStoreListeners;
832
891
  private buildSnapshot;
892
+ /**
893
+ * Unified built-in tool specifications — single source of truth for both
894
+ * tool definitions (sent to LLM) and handler registration.
895
+ */
896
+ private getBuiltinToolSpecs;
833
897
  /**
834
898
  * Register all built-in tool handlers with the ToolExecutor.
835
899
  * Called once during init() after VisualGuidance and all subsystems are ready.
@@ -1118,17 +1182,35 @@ interface TokenUsage$1 {
1118
1182
  completion: number;
1119
1183
  total: number;
1120
1184
  }
1185
+ /** Configuration for the OpenAI adapter (custom adapter pattern). */
1186
+ interface OpenAIAdapterConfig {
1187
+ apiKey: string;
1188
+ model?: 'gpt-4o' | 'gpt-4o-mini' | (string & {});
1189
+ }
1121
1190
  /**
1122
1191
  * Adapter that translates between GuideKit's internal types and the
1123
1192
  * OpenAI Chat Completions API wire format. Handles streaming via SSE,
1124
1193
  * tool formatting, and response parsing.
1194
+ *
1195
+ * Usage as a custom adapter:
1196
+ * ```ts
1197
+ * import { OpenAIAdapter } from '@guidekit/core';
1198
+ * const llmConfig = { adapter: new OpenAIAdapter({ apiKey: '...', model: 'gpt-4o' }) };
1199
+ * ```
1125
1200
  */
1126
1201
  declare class OpenAIAdapter implements LLMProviderAdapter {
1127
1202
  private readonly apiKey;
1128
1203
  private readonly model;
1129
- constructor(config: Extract<LLMConfig, {
1130
- provider: 'openai';
1131
- }>);
1204
+ /** Tracks whether the last extractChunks call emitted a done chunk. */
1205
+ private lastExtractEmittedDone;
1206
+ /**
1207
+ * Token usage extracted from the most recent `parseResponse` call.
1208
+ * Updated as each SSE chunk is parsed.
1209
+ */
1210
+ private _lastUsage;
1211
+ constructor(config: OpenAIAdapterConfig);
1212
+ /** Token usage from the most recent parseResponse call. */
1213
+ get lastUsage(): TokenUsage$1;
1132
1214
  /**
1133
1215
  * Convert GuideKit tool definitions into OpenAI's `tools` format.
1134
1216
  * Each tool is wrapped as `{ type: 'function', function: { name, description, parameters } }`.
@@ -1150,6 +1232,10 @@ declare class OpenAIAdapter implements LLMProviderAdapter {
1150
1232
  * prefixed by `data: `. The final line is `data: [DONE]`.
1151
1233
  * Text content arrives in `choices[0].delta.content` and tool calls
1152
1234
  * arrive in `choices[0].delta.tool_calls`.
1235
+ *
1236
+ * This method also:
1237
+ * - Detects content filtering and throws `ContentFilterError`.
1238
+ * - Tracks token usage (accessible via `lastUsage` after iteration).
1153
1239
  */
1154
1240
  parseResponse(stream: ReadableStream<Uint8Array>): AsyncIterable<TextChunk | ToolCall>;
1155
1241
  /**
@@ -1168,10 +1254,8 @@ declare class OpenAIAdapter implements LLMProviderAdapter {
1168
1254
  */
1169
1255
  streamRequest(params: {
1170
1256
  systemPrompt: string;
1171
- contents: Array<{
1172
- role: string;
1173
- content: string;
1174
- }>;
1257
+ contents: unknown;
1258
+ userMessage?: string;
1175
1259
  tools?: unknown;
1176
1260
  signal?: AbortSignal;
1177
1261
  timeoutMs?: number;
@@ -1230,9 +1314,17 @@ interface TokenUsage {
1230
1314
  declare class GeminiAdapter implements LLMProviderAdapter {
1231
1315
  private readonly apiKey;
1232
1316
  private readonly model;
1317
+ /**
1318
+ * Token usage extracted from the most recent `parseResponse` call.
1319
+ * Updated as each SSE chunk is parsed; the final value reflects the
1320
+ * cumulative usage metadata sent by Gemini (typically in the last chunk).
1321
+ */
1322
+ private _lastUsage;
1233
1323
  constructor(config: Extract<LLMConfig, {
1234
1324
  provider: 'gemini';
1235
1325
  }>);
1326
+ /** Token usage from the most recent parseResponse call. */
1327
+ get lastUsage(): TokenUsage;
1236
1328
  /**
1237
1329
  * Convert GuideKit tool definitions into Gemini's `functionDeclarations`
1238
1330
  * format, wrapped inside a `tools` array.
@@ -1255,6 +1347,10 @@ declare class GeminiAdapter implements LLMProviderAdapter {
1255
1347
  * The Gemini `streamGenerateContent?alt=sse` endpoint sends each chunk
1256
1348
  * as a JSON object prefixed by `data: `. We parse line-by-line, extract
1257
1349
  * text parts and function call parts, and yield the appropriate types.
1350
+ *
1351
+ * This method also:
1352
+ * - Detects content filtering and throws `ContentFilterError`.
1353
+ * - Tracks token usage (accessible via `lastUsage` after iteration).
1258
1354
  */
1259
1355
  parseResponse(stream: ReadableStream<Uint8Array>): AsyncIterable<TextChunk | ToolCall>;
1260
1356
  /**
@@ -1275,16 +1371,16 @@ declare class GeminiAdapter implements LLMProviderAdapter {
1275
1371
  /**
1276
1372
  * Build and execute a streaming request to the Gemini API.
1277
1373
  * Returns the raw `ReadableStream` for the response body together with
1278
- * a promise that resolves to token usage extracted from the final chunk.
1374
+ * the raw Response object.
1375
+ *
1376
+ * Note: The Gemini API key is passed as a URL query parameter (`key=`).
1377
+ * This is inherent to the Gemini REST SSE endpoint design; the key is
1378
+ * transmitted over HTTPS so it remains encrypted in transit. (H3)
1279
1379
  */
1280
1380
  streamRequest(params: {
1281
1381
  systemPrompt: string;
1282
- contents: Array<{
1283
- role: string;
1284
- parts: Array<{
1285
- text: string;
1286
- }>;
1287
- }>;
1382
+ contents: unknown;
1383
+ userMessage?: string;
1288
1384
  tools?: unknown;
1289
1385
  signal?: AbortSignal;
1290
1386
  timeoutMs?: number;
@@ -1317,12 +1413,16 @@ declare class GeminiAdapter implements LLMProviderAdapter {
1317
1413
  * High-level orchestrator that manages LLM interactions for the GuideKit SDK.
1318
1414
  *
1319
1415
  * Responsibilities:
1320
- * - Owns the active `LLMProviderAdapter` (currently only `GeminiAdapter`).
1416
+ * - Owns the active `LLMProviderAdapter`.
1321
1417
  * - Streams responses from the provider, emitting callbacks for text chunks,
1322
1418
  * tool calls, and token usage.
1323
1419
  * - Handles content filter retries: if the initial response is blocked, it
1324
1420
  * retries once with a stripped-down prompt (no tools).
1325
1421
  * - Surfaces all errors through the SDK error hierarchy.
1422
+ *
1423
+ * The orchestrator is fully adapter-agnostic: all provider-specific logic
1424
+ * (SSE parsing, content filter detection, usage extraction) lives in the
1425
+ * adapter implementations.
1326
1426
  */
1327
1427
  declare class LLMOrchestrator {
1328
1428
  private _adapter;
@@ -1363,12 +1463,22 @@ declare class LLMOrchestrator {
1363
1463
  get adapter(): LLMProviderAdapter;
1364
1464
  /**
1365
1465
  * Execute a streaming LLM request and collect the results.
1466
+ *
1467
+ * This method is fully adapter-agnostic: it delegates streaming,
1468
+ * response parsing, content-filter detection, and usage extraction
1469
+ * entirely to the active `LLMProviderAdapter`. No provider-specific
1470
+ * SSE parsing lives in the orchestrator.
1366
1471
  */
1367
1472
  private executeStream;
1368
1473
  /**
1369
1474
  * Create the appropriate adapter for the given config.
1370
- * Currently only Gemini is implemented; other providers will be added
1371
- * as the SDK evolves.
1475
+ *
1476
+ * Built-in providers:
1477
+ * - `'gemini'` — uses the bundled `GeminiAdapter`.
1478
+ *
1479
+ * Custom adapters:
1480
+ * - Pass `{ adapter: myAdapter }` to use any `LLMProviderAdapter`.
1481
+ * Example: `llm: { adapter: new OpenAIAdapter({ ... }) }`
1372
1482
  */
1373
1483
  private createAdapter;
1374
1484
  /** Convenience accessor for the current provider name. */
@@ -2183,4 +2293,435 @@ declare class TokenManager {
2183
2293
  private log;
2184
2294
  }
2185
2295
 
2186
- export { type AgentConfig, type AgentState, type AggregatedUsage, AuthenticationError, type AwarenessOptions, type AwarenessState, AwarenessSystem, type BeforeLLMCallContext, BrowserSupportError, ConfigurationError, ConnectionManager, type ConnectionManagerOptions, type ConnectionState, ContentFilterError, type ContentMap, type ContentMapEntry, type ContentMapFunction, type ContentMapInput, ContextManager, type ContextManagerOptions, type ConversationTurn, type CreateSessionTokenOptions, DOMScanner, type DOMScannerOptions, type ErrorCode, ErrorCodes, EventBus, type EventMap, type FormField, type FormSummary, GeminiAdapter, GuideKitCore, type GuideKitCoreOptions, GuideKitError, type GuideKitErrorOptions, type GuideKitErrorType, type GuideKitEvent, type GuideKitOptions, type GuideKitProviderProps, type GuideKitStore, type GuideKitTheme, type HealthCheckResult, type HealthCheckStatus, I18n, type I18nOptions, type I18nStrings, InitializationError, type InteractiveElement, type LLMConfig, LLMOrchestrator, type LLMProviderAdapter, type LocaleInput, type NavItem, NavigationController, type NavigationControllerOptions, NetworkError, OpenAIAdapter, type OverlayElement, type PageModel, type PageSection, PermissionError, type ProactiveOptions, type ProactiveTrigger, ProactiveTriggerEngine, type ProactiveTriggerType, type Provider, type QueuedMessage, RateLimitError, RateLimiter, type RateLimiterOptions, type RateLimiterState, type RateLimits, type Resource, ResourceExhaustedError, ResourceManager, type ResourceManagerState, type STTConfig, type ScanMetadata, type SessionState, SingletonGuard, type SpotlightState, type SupportedLocale, type TTSConfig, type TextChunk, TimeoutError, type TokenData, TokenManager, type TokenManagerOptions, type TokenPayload, type TokenResponse, type ToolCall, type ToolCallRecord, type ToolDefinition, type ToolExecutionResult, ToolExecutor, type ToolExecutorOptions, type ToolHandler, type TooltipOptions, VisualGuidance, type 
VisualGuidanceOptions, createEventBus, isGuideKitError };
2296
+ /**
2297
+ * Minimal type declarations for the Web Speech API SpeechRecognition
2298
+ * interface. These cover the subset used by this adapter. Full type
2299
+ * definitions are available in lib.dom.d.ts but may not be present in
2300
+ * all TS configurations.
2301
+ */
2302
+ interface SpeechRecognitionEvent {
2303
+ readonly resultIndex: number;
2304
+ readonly results: SpeechRecognitionResultList;
2305
+ }
2306
+ interface SpeechRecognitionResultList {
2307
+ readonly length: number;
2308
+ item(index: number): SpeechRecognitionResult;
2309
+ [index: number]: SpeechRecognitionResult;
2310
+ }
2311
+ interface SpeechRecognitionResult {
2312
+ readonly length: number;
2313
+ readonly isFinal: boolean;
2314
+ item(index: number): SpeechRecognitionAlternative;
2315
+ [index: number]: SpeechRecognitionAlternative;
2316
+ }
2317
+ interface SpeechRecognitionAlternative {
2318
+ readonly transcript: string;
2319
+ readonly confidence: number;
2320
+ }
2321
+ interface SpeechRecognitionErrorEvent {
2322
+ readonly error: string;
2323
+ readonly message: string;
2324
+ }
2325
+ interface SpeechRecognitionInstance extends EventTarget {
2326
+ lang: string;
2327
+ continuous: boolean;
2328
+ interimResults: boolean;
2329
+ maxAlternatives: number;
2330
+ onresult: ((event: SpeechRecognitionEvent) => void) | null;
2331
+ onerror: ((event: SpeechRecognitionErrorEvent) => void) | null;
2332
+ onend: (() => void) | null;
2333
+ onstart: (() => void) | null;
2334
+ start(): void;
2335
+ stop(): void;
2336
+ abort(): void;
2337
+ }
2338
+ interface SpeechRecognitionConstructor {
2339
+ new (): SpeechRecognitionInstance;
2340
+ }
2341
+ declare global {
2342
+ var webkitSpeechRecognition: SpeechRecognitionConstructor | undefined;
2343
+ }
2344
+ interface WebSpeechSTTOptions {
2345
+ language?: string;
2346
+ continuous?: boolean;
2347
+ interimResults?: boolean;
2348
+ debug?: boolean;
2349
+ }
2350
+ declare class WebSpeechSTT {
2351
+ private readonly language;
2352
+ private readonly continuous;
2353
+ private readonly interimResultsEnabled;
2354
+ private readonly debugEnabled;
2355
+ private recognition;
2356
+ private _connected;
2357
+ private _suspended;
2358
+ /**
2359
+ * Whether we intentionally stopped recognition. Used to distinguish
2360
+ * between intentional stop and unexpected end (for auto-restart in
2361
+ * continuous mode).
2362
+ */
2363
+ private _intentionalStop;
2364
+ /** Registered transcript callbacks. */
2365
+ private readonly transcriptCallbacks;
2366
+ constructor(options?: WebSpeechSTTOptions);
2367
+ /**
2368
+ * Check whether the Web Speech API SpeechRecognition is supported in the
2369
+ * current environment. Safe to call in SSR (returns false).
2370
+ */
2371
+ static isSupported(): boolean;
2372
+ /** Whether recognition is currently active and connected. */
2373
+ get isConnected(): boolean;
2374
+ /**
2375
+ * Start speech recognition.
2376
+ *
2377
+ * Creates the SpeechRecognition instance and begins listening. Resolves
2378
+ * once the recognition session has started. Rejects if the API is not
2379
+ * supported or the browser denies permission.
2380
+ */
2381
+ connect(): Promise<void>;
2382
+ /**
2383
+ * Send audio data. No-op for Web Speech API since it captures audio
2384
+ * directly from the microphone via the browser's internal pipeline.
2385
+ *
2386
+ * Provided for interface compatibility with WebSocket-based STT adapters
2387
+ * (DeepgramSTT, ElevenLabsSTT).
2388
+ */
2389
+ sendAudio(_audioData: Float32Array | Int16Array): void;
2390
+ /**
2391
+ * Register a callback to receive transcript events.
2392
+ *
2393
+ * @returns An unsubscribe function. Calling it more than once is safe.
2394
+ */
2395
+ onTranscript(callback: (event: STTTranscriptEvent) => void): () => void;
2396
+ /**
2397
+ * Gracefully stop recognition.
2398
+ *
2399
+ * Calls `stop()` on the SpeechRecognition instance which allows it to
2400
+ * deliver any pending final results before ending.
2401
+ */
2402
+ close(): void;
2403
+ /** Force-destroy the recognition without waiting for pending results. */
2404
+ destroy(): void;
2405
+ /**
2406
+ * Suspend the adapter (e.g. when the device goes offline).
2407
+ *
2408
+ * Stops recognition and marks the adapter as suspended so that auto-restart
2409
+ * does not trigger.
2410
+ */
2411
+ suspend(): void;
2412
+ /**
2413
+ * Resume after a prior `suspend()`. Restarts recognition if it was
2414
+ * running before suspension.
2415
+ */
2416
+ resume(): void;
2417
+ /**
2418
+ * Handle SpeechRecognition result events.
2419
+ *
2420
+ * The `results` property is a SpeechRecognitionResultList containing all
2421
+ * results accumulated during this recognition session. We only process
2422
+ * results from `resultIndex` onward to avoid re-emitting old results.
2423
+ */
2424
+ private handleResult;
2425
+ /**
2426
+ * Handle SpeechRecognition errors.
2427
+ *
2428
+ * Some errors are recoverable (e.g. `no-speech`) and some are fatal
2429
+ * (e.g. `not-allowed`). For recoverable errors in continuous mode,
2430
+ * recognition will auto-restart via the `onend` handler.
2431
+ */
2432
+ private handleError;
2433
+ /**
2434
+ * Emit a transcript event to all registered callbacks.
2435
+ *
2436
+ * Errors thrown by individual callbacks are caught and logged so one
2437
+ * misbehaving subscriber does not prevent others from receiving the event.
2438
+ */
2439
+ private emitTranscript;
2440
+ /**
2441
+ * Resolve the SpeechRecognition constructor, with the webkit-prefixed
2442
+ * fallback. Returns null if not available.
2443
+ */
2444
+ private resolveSpeechRecognition;
2445
+ /** Reset internal state after disconnection. */
2446
+ private cleanup;
2447
+ /** Conditional debug logging. */
2448
+ private log;
2449
+ }
2450
+
2451
+ interface WebSpeechTTSOptions {
2452
+ voice?: string;
2453
+ rate?: number;
2454
+ pitch?: number;
2455
+ language?: string;
2456
+ debug?: boolean;
2457
+ }
2458
+ /**
2459
+ * Audio event compatible with the TTSAudioEvent shape used by
2460
+ * VoicePipeline for ElevenLabs TTS. Web Speech API does not produce
2461
+ * raw audio buffers, so we emit events with empty buffers and use
2462
+ * isFinal to signal utterance completion.
2463
+ */
2464
+ interface WebSpeechTTSAudioEvent {
2465
+ audio: ArrayBuffer;
2466
+ isFinal: boolean;
2467
+ timestamp: number;
2468
+ }
2469
+ declare class WebSpeechTTS {
2470
+ private readonly voiceName;
2471
+ private readonly rate;
2472
+ private readonly pitch;
2473
+ private readonly language;
2474
+ private readonly debugEnabled;
2475
+ private _connected;
2476
+ private _suspended;
2477
+ /** Cached voice object resolved from voiceName. */
2478
+ private _resolvedVoice;
2479
+ /** Whether voices have been loaded (they load async in some browsers). */
2480
+ private _voicesLoaded;
2481
+ /** Registered audio-event callbacks. */
2482
+ private readonly audioCallbacks;
2483
+ constructor(options?: WebSpeechTTSOptions);
2484
+ /**
2485
+ * Check whether the Web Speech API SpeechSynthesis is supported in the
2486
+ * current environment. Safe to call in SSR (returns false).
2487
+ */
2488
+ static isSupported(): boolean;
2489
+ /** Whether the adapter is connected (ready for speech). */
2490
+ get isConnected(): boolean;
2491
+ /**
2492
+ * Initialize the adapter.
2493
+ *
2494
+ * Loads available voices and resolves the requested voice name. Voice
2495
+ * loading is async in some browsers (notably Chrome) so we wait for
2496
+ * the `voiceschanged` event if needed.
2497
+ */
2498
+ connect(): Promise<void>;
2499
+ /**
2500
+ * Speak the given text using the browser's speech synthesis engine.
2501
+ *
2502
+ * Returns `void`, so the call cannot be awaited and cannot reject; utterance
2503
+ * completion is signalled by `isFinal` on the events passed to `onAudio` callbacks.
2504
+ *
2505
+ * Also emits audio events to registered callbacks for VoicePipeline
2506
+ * compatibility.
2507
+ */
2508
+ speak(text: string): void;
2509
+ /**
2510
+ * Flush / finalize the current utterance.
2511
+ *
2512
+ * No-op for Web Speech API since each speak() call is a complete
2513
+ * utterance. Provided for interface compatibility with ElevenLabsTTS.
2514
+ */
2515
+ flush(): void;
2516
+ /**
2517
+ * Register a callback to receive audio output events.
2518
+ *
2519
+ * For Web Speech API, these events have empty audio buffers and are
2520
+ * used to signal utterance start/end for VoicePipeline state management.
2521
+ *
2522
+ * @returns An unsubscribe function. Calling it more than once is safe.
2523
+ */
2524
+ onAudio(callback: (event: WebSpeechTTSAudioEvent) => void): () => void;
2525
+ /** Stop current speech synthesis and cancel any queued utterances. */
2526
+ stop(): void;
2527
+ /** Gracefully close the adapter. */
2528
+ close(): void;
2529
+ /** Force-destroy the adapter. */
2530
+ destroy(): void;
2531
+ /**
2532
+ * Suspend the adapter (e.g. when the device goes offline).
2533
+ *
2534
+ * Pauses any active speech synthesis and marks the adapter as suspended.
2535
+ */
2536
+ suspend(): void;
2537
+ /**
2538
+ * Resume after a prior `suspend()`.
2539
+ */
2540
+ resume(): void;
2541
+ /**
2542
+ * Load available voices from the browser.
2543
+ *
2544
+ * In Chrome and some other browsers, voices load asynchronously after
2545
+ * the page loads. We wait for the `voiceschanged` event with a timeout.
2546
+ */
2547
+ private loadVoices;
2548
+ /**
2549
+ * Find a voice by name (case-insensitive partial match).
2550
+ */
2551
+ private findVoice;
2552
+ /**
2553
+ * Emit an audio event to all registered callbacks.
2554
+ *
2555
+ * Errors thrown by individual callbacks are caught and logged so one
2556
+ * misbehaving subscriber does not prevent others from receiving the event.
2557
+ */
2558
+ private emitAudio;
2559
+ /** Reset internal state. */
2560
+ private cleanup;
2561
+ /** Conditional debug logging. */
2562
+ private log;
2563
+ }
2564
+
2565
+ type VoiceState = 'idle' | 'listening' | 'processing' | 'speaking' | 'error';
2566
+ interface VoicePipelineOptions {
2567
+ sttConfig: {
2568
+ provider: 'deepgram';
2569
+ apiKey: string;
2570
+ model?: 'nova-2' | 'nova-3';
2571
+ } | {
2572
+ provider: 'elevenlabs';
2573
+ apiKey: string;
2574
+ language?: string;
2575
+ } | {
2576
+ provider: 'web-speech';
2577
+ language?: string;
2578
+ continuous?: boolean;
2579
+ interimResults?: boolean;
2580
+ };
2581
+ ttsConfig: {
2582
+ provider: 'elevenlabs';
2583
+ apiKey: string;
2584
+ voiceId?: string;
2585
+ modelId?: string;
2586
+ } | {
2587
+ provider: 'web-speech';
2588
+ voice?: string;
2589
+ rate?: number;
2590
+ pitch?: number;
2591
+ language?: string;
2592
+ };
2593
+ debug?: boolean;
2594
+ }
2595
+ declare global {
2596
+ var webkitAudioContext: typeof AudioContext | undefined;
2597
+ }
2598
+ declare class VoicePipeline {
2599
+ private readonly _sttConfig;
2600
+ private readonly _ttsConfig;
2601
+ private readonly _debug;
2602
+ private _state;
2603
+ private _destroyed;
2604
+ private readonly _bus;
2605
+ private _audioContext;
2606
+ private _mediaStream;
2607
+ private _vad;
2608
+ private _stt;
2609
+ private _tts;
2610
+ private _micSourceNode;
2611
+ private _captureProcessor;
2612
+ private _isForwardingToSTT;
2613
+ private _playbackQueue;
2614
+ private _jitterBufferTimer;
2615
+ private _isPlaybackStarted;
2616
+ private _nextPlaybackTime;
2617
+ private _activeSourceNodes;
2618
+ private _lastScheduledSource;
2619
+ private _lastTTSEcho;
2620
+ private _pendingLLMAbort;
2621
+ private readonly _stateChangeCallbacks;
2622
+ private readonly _transcriptCallbacks;
2623
+ private _unsubVADSpeechStart;
2624
+ private _unsubVADSpeechEnd;
2625
+ private _unsubSTTTranscript;
2626
+ private _unsubTTSAudio;
2627
+ constructor(options: VoicePipelineOptions);
2628
+ /** Current pipeline state. */
2629
+ get state(): VoiceState;
2630
+ /**
2631
+ * Initialize AudioContext, VAD model, and STT/TTS connections.
2632
+ *
2633
+ * **Must be called in response to a user gesture** (click / tap) to
2634
+ * satisfy browser autoplay policies.
2635
+ */
2636
+ init(): Promise<void>;
2637
+ /**
2638
+ * Start listening: activate microphone, begin VAD + STT pipeline.
2639
+ *
2640
+ * Valid from: IDLE, ERROR, SPEAKING (barge-in path calls this internally).
2641
+ */
2642
+ startListening(): Promise<void>;
2643
+ /** Stop listening: deactivate mic and VAD. */
2644
+ stopListening(): void;
2645
+ /**
2646
+ * Process a transcript through an LLM callback and speak the response.
2647
+ *
2648
+ * @param text - The user's transcript text.
2649
+ * @param sendToLLM - Async callback that sends text to the LLM and returns the response.
2650
+ */
2651
+ processTranscript(text: string, sendToLLM: (text: string) => Promise<string>): Promise<void>;
2652
+ /** Speak text via TTS (ElevenLabs or Web Speech API). */
2653
+ speak(text: string): Promise<void>;
2654
+ /** Stop current TTS playback immediately (barge-in). */
2655
+ stopSpeaking(): void;
2656
+ /** Subscribe to state changes. Returns an unsubscribe function. */
2657
+ onStateChange(callback: (state: VoiceState, previous: VoiceState) => void): () => void;
2658
+ /** Subscribe to transcript events. Returns an unsubscribe function. */
2659
+ onTranscript(callback: (text: string, isFinal: boolean) => void): () => void;
2660
+ /** Destroy all resources held by the pipeline. */
2661
+ destroy(): Promise<void>;
2662
+ private _setState;
2663
+ /**
2664
+ * Resolve the AudioContext constructor, with Safari webkitAudioContext
2665
+ * fallback. Returns null if Web Audio is not available.
2666
+ */
2667
+ private _resolveAudioContext;
2668
+ /**
2669
+ * Pre-warm the AudioContext by playing a silent buffer.
2670
+ * This forces the context into the "running" state and avoids a
2671
+ * noticeable delay on the first real playback.
2672
+ */
2673
+ private _prewarmAudioContext;
2674
+ /**
2675
+ * Set up a ScriptProcessorNode to capture mic audio and forward it
2676
+ * to the STT adapter when `_isForwardingToSTT` is true.
2677
+ */
2678
+ private _setupMicCapture;
2679
+ /** Tear down the mic capture ScriptProcessorNode. */
2680
+ private _teardownMicCapture;
2681
+ /** Stop all tracks on the current MediaStream. */
2682
+ private _stopMicTracks;
2683
+ private _handleVADSpeechStart;
2684
+ private _handleVADSpeechEnd;
2685
+ private _handleTranscript;
2686
+ /**
2687
+ * Handle an audio chunk from ElevenLabs TTS.
2688
+ *
2689
+ * Implements a jitter buffer: we accumulate audio for JITTER_BUFFER_MS
2690
+ * before starting playback to smooth out network jitter.
2691
+ */
2692
+ private _handleTTSAudio;
2693
+ /** Flush the jitter buffer and start playback. */
2694
+ private _flushJitterBuffer;
2695
+ /**
2696
+ * Begin playback: decode all queued chunks and schedule them.
2697
+ * If `onDone` is provided, it is called when the last chunk finishes playing.
2698
+ */
2699
+ private _startPlayback;
2700
+ /**
2701
+ * Decode an audio chunk (mp3 from ElevenLabs) and schedule it for
2702
+ * sequential playback via AudioBufferSourceNode.
2703
+ */
2704
+ private _decodeAndSchedule;
2705
+ /**
2706
+ * Check if VAD speech-start during SPEAKING state is likely echo from
2707
+ * the speaker playing TTS audio rather than genuine user speech.
2708
+ *
2709
+ * Simple heuristic: if we are still within the echo window of a recent
2710
+ * TTS utterance, treat it as potential echo.
2711
+ */
2712
+ private _isEchoDetected;
2713
+ /**
2714
+ * Check if a transcript is an echo of recent TTS output.
2715
+ *
2716
+ * Uses word overlap: if intersection of words > 60% of max set size
2717
+ * and the transcript arrived within the echo window, discard it.
2718
+ */
2719
+ private _isTranscriptEcho;
2720
+ /**
2721
+ * Normalize text into a set of lowercase words, stripping punctuation.
2722
+ */
2723
+ private _normalizeWords;
2724
+ private _log;
2725
+ }
2726
+
2727
+ export { type AgentConfig, type AgentState, type AggregatedUsage, AuthenticationError, type AwarenessOptions, type AwarenessState, AwarenessSystem, type BeforeLLMCallContext, BrowserSupportError, ConfigurationError, ConnectionManager, type ConnectionManagerOptions, type ConnectionState, ContentFilterError, type ContentMap, type ContentMapEntry, type ContentMapFunction, type ContentMapInput, ContextManager, type ContextManagerOptions, type ConversationTurn, type CreateSessionTokenOptions, DOMScanner, type DOMScannerOptions, type ErrorCode, ErrorCodes, EventBus, type EventMap, type FormField, type FormSummary, GeminiAdapter, GuideKitCore, type GuideKitCoreOptions, GuideKitError, type GuideKitErrorOptions, type GuideKitErrorType, type GuideKitEvent, type GuideKitOptions, type GuideKitProviderProps, type GuideKitStore, type GuideKitTheme, type HealthCheckResult, type HealthCheckStatus, I18n, type I18nOptions, type I18nStrings, InitializationError, type InteractiveElement, type LLMConfig, LLMOrchestrator, type LLMProviderAdapter, type LocaleInput, type NavItem, NavigationController, type NavigationControllerOptions, NetworkError, OpenAIAdapter, type OpenAIAdapterConfig, type OverlayElement, type PageModel, type PageSection, PermissionError, type ProactiveOptions, type ProactiveTrigger, ProactiveTriggerEngine, type ProactiveTriggerType, type Provider, type QueuedMessage, RateLimitError, RateLimiter, type RateLimiterOptions, type RateLimiterState, type RateLimits, type Resource, ResourceExhaustedError, ResourceManager, type ResourceManagerState, type STTConfig, type ScanMetadata, type SessionState, SingletonGuard, type SpotlightState, type SupportedLocale, type TTSConfig, type TextChunk, TimeoutError, type TokenData, TokenManager, type TokenManagerOptions, type TokenPayload, type TokenResponse, type ToolCall, type ToolCallRecord, type ToolDefinition, type ToolExecutionResult, ToolExecutor, type ToolExecutorOptions, type ToolHandler, type TooltipOptions, VisualGuidance, 
type VisualGuidanceOptions, VoicePipeline, type VoicePipelineOptions, type VoiceState, WebSpeechSTT, type WebSpeechSTTOptions, WebSpeechTTS, type WebSpeechTTSAudioEvent, type WebSpeechTTSOptions, createEventBus, isGuideKitError };