@guidekit/core 0.1.0-beta.1 → 0.1.0-beta.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -351,8 +351,14 @@ type STTConfig = {
351
351
  apiKey: string;
352
352
  model?: 'nova-2' | 'nova-3';
353
353
  } | {
354
- provider: 'assemblyai';
354
+ provider: 'elevenlabs';
355
355
  apiKey: string;
356
+ language?: string;
357
+ } | {
358
+ provider: 'web-speech';
359
+ language?: string;
360
+ continuous?: boolean;
361
+ interimResults?: boolean;
356
362
  };
357
363
  /** Text-to-speech provider configuration. */
358
364
  type TTSConfig = {
@@ -360,19 +366,26 @@ type TTSConfig = {
360
366
  apiKey: string;
361
367
  voiceId?: string;
362
368
  } | {
363
- provider: 'cartesia';
364
- apiKey: string;
365
- voiceId?: string;
369
+ provider: 'web-speech';
370
+ voice?: string;
371
+ rate?: number;
372
+ pitch?: number;
373
+ language?: string;
366
374
  };
375
+ /** Transcript event emitted by any STT adapter. */
376
+ interface STTTranscriptEvent {
377
+ text: string;
378
+ isFinal: boolean;
379
+ confidence: number;
380
+ timestamp: number;
381
+ }
367
382
  /** Large language model provider configuration. */
368
383
  type LLMConfig = {
369
384
  provider: 'gemini';
370
385
  apiKey: string;
371
386
  model?: 'gemini-2.5-flash' | 'gemini-2.5-pro';
372
387
  } | {
373
- provider: 'openai';
374
- apiKey: string;
375
- model?: 'gpt-4o' | 'gpt-4o-mini';
388
+ adapter: LLMProviderAdapter;
376
389
  };
377
390
  /** Top-level options that control SDK behaviour. */
378
391
  interface GuideKitOptions {
@@ -481,11 +494,27 @@ interface GuideKitProviderProps {
481
494
  }>;
482
495
  children?: unknown;
483
496
  }
497
+ /** A single JSON-Schema-style property descriptor used in tool parameter maps. */
498
+ interface ToolParameterSchema {
499
+ type: string;
500
+ description?: string;
501
+ enum?: string[];
502
+ items?: {
503
+ type: string;
504
+ };
505
+ [key: string]: unknown;
506
+ }
484
507
  /** Definition of a tool that can be invoked by the LLM. */
485
508
  interface ToolDefinition {
486
509
  name: string;
487
510
  description: string;
488
- parameters: Record<string, unknown>;
511
+ /** Flat map of param name → JSON Schema property descriptor. */
512
+ parameters: Record<string, ToolParameterSchema>;
513
+ /**
514
+ * List of parameter names the LLM must always provide.
515
+ * Omit or use [] for fully optional parameters.
516
+ */
517
+ required?: string[];
489
518
  schemaVersion: number;
490
519
  }
491
520
  /** A tool invocation request returned by the LLM. */
@@ -509,6 +538,35 @@ interface LLMProviderAdapter {
509
538
  formatConversation(history: ConversationTurn[]): unknown;
510
539
  parseResponse(stream: ReadableStream): AsyncIterable<TextChunk | ToolCall>;
511
540
  formatToolResult(callId: string, result: unknown): unknown;
541
+ /**
542
+ * Build and execute a streaming request to the provider API.
543
+ * Resolves with the raw ReadableStream for the response body and the raw Response object.
544
+ */
545
+ streamRequest(params: {
546
+ systemPrompt: string;
547
+ contents: unknown;
548
+ userMessage?: string;
549
+ tools?: unknown;
550
+ signal?: AbortSignal;
551
+ timeoutMs?: number;
552
+ }): Promise<{
553
+ stream: ReadableStream<Uint8Array>;
554
+ response: Response;
555
+ }>;
556
+ /**
557
+ * Check whether a parsed response chunk indicates the response was
558
+ * blocked by a content/safety filter.
559
+ */
560
+ isContentFiltered(chunk: Record<string, unknown>): boolean;
561
+ /**
562
+ * Extract token usage from a parsed response chunk.
563
+ * Returns `null` if no usage metadata is present in this chunk.
564
+ */
565
+ extractUsage(chunk: Record<string, unknown>): {
566
+ prompt: number;
567
+ completion: number;
568
+ total: number;
569
+ } | null;
512
570
  }
513
571
  /** Decoded payload of a GuideKit session token. */
514
572
  interface TokenPayload {
@@ -529,9 +587,9 @@ interface TokenResponse {
529
587
  /** Options for `createSessionToken()` on the server side. */
530
588
  interface CreateSessionTokenOptions {
531
589
  signingSecret: string | string[];
532
- deepgramKey?: string;
533
- elevenlabsKey?: string;
534
- geminiKey?: string;
590
+ sttApiKey?: string;
591
+ ttsApiKey?: string;
592
+ llmApiKey?: string;
535
593
  expiresIn?: string;
536
594
  allowedOrigins?: string[];
537
595
  permissions?: string[];
@@ -586,10 +644,11 @@ declare const ErrorCodes: {
586
644
  readonly VAD_PACKAGE_MISSING: "VAD_PACKAGE_MISSING";
587
645
  readonly CONTENT_FILTER_TRIGGERED: "CONTENT_FILTER_TRIGGERED";
588
646
  readonly PRIVACY_HOOK_CANCELLED: "PRIVACY_HOOK_CANCELLED";
647
+ readonly UNKNOWN: "UNKNOWN";
589
648
  };
590
649
  /** Union of every known error code string. */
591
650
  type ErrorCode = (typeof ErrorCodes)[keyof typeof ErrorCodes];
592
- type Provider = 'deepgram' | 'elevenlabs' | 'gemini' | 'openai';
651
+ type Provider = 'deepgram' | 'elevenlabs' | 'gemini' | 'openai' | 'web-speech' | (string & {});
593
652
  interface GuideKitErrorOptions {
594
653
  code: string;
595
654
  message: string;
@@ -830,6 +889,11 @@ declare class GuideKitCore {
830
889
  private setAgentState;
831
890
  private notifyStoreListeners;
832
891
  private buildSnapshot;
892
+ /**
893
+ * Unified built-in tool specifications — single source of truth for both
894
+ * tool definitions (sent to LLM) and handler registration.
895
+ */
896
+ private getBuiltinToolSpecs;
833
897
  /**
834
898
  * Register all built-in tool handlers with the ToolExecutor.
835
899
  * Called once during init() after VisualGuidance and all subsystems are ready.
@@ -1113,110 +1177,6 @@ declare class ContextManager {
1113
1177
  private log;
1114
1178
  }
1115
1179
 
1116
- interface TokenUsage$1 {
1117
- prompt: number;
1118
- completion: number;
1119
- total: number;
1120
- }
1121
- /**
1122
- * Adapter that translates between GuideKit's internal types and the
1123
- * OpenAI Chat Completions API wire format. Handles streaming via SSE,
1124
- * tool formatting, and response parsing.
1125
- */
1126
- declare class OpenAIAdapter implements LLMProviderAdapter {
1127
- private readonly apiKey;
1128
- private readonly model;
1129
- constructor(config: Extract<LLMConfig, {
1130
- provider: 'openai';
1131
- }>);
1132
- /**
1133
- * Convert GuideKit tool definitions into OpenAI's `tools` format.
1134
- * Each tool is wrapped as `{ type: 'function', function: { name, description, parameters } }`.
1135
- */
1136
- formatTools(tools: ToolDefinition[]): unknown;
1137
- /**
1138
- * Convert an array of `ConversationTurn` objects into OpenAI's messages
1139
- * format with `role: 'user' | 'assistant'`.
1140
- */
1141
- formatConversation(history: ConversationTurn[]): Array<{
1142
- role: 'user' | 'assistant';
1143
- content: string;
1144
- }>;
1145
- /**
1146
- * Parse an OpenAI SSE streaming response into an async iterable of
1147
- * `TextChunk` and `ToolCall` objects.
1148
- *
1149
- * The OpenAI streaming endpoint sends each chunk as a JSON object
1150
- * prefixed by `data: `. The final line is `data: [DONE]`.
1151
- * Text content arrives in `choices[0].delta.content` and tool calls
1152
- * arrive in `choices[0].delta.tool_calls`.
1153
- */
1154
- parseResponse(stream: ReadableStream<Uint8Array>): AsyncIterable<TextChunk | ToolCall>;
1155
- /**
1156
- * Format a tool result so it can be sent back to OpenAI as a
1157
- * `tool` role message with the `tool_call_id`.
1158
- */
1159
- formatToolResult(callId: string, result: unknown): {
1160
- role: 'tool';
1161
- tool_call_id: string;
1162
- content: string;
1163
- };
1164
- /**
1165
- * Build and execute a streaming request to the OpenAI Chat Completions API.
1166
- * Returns the raw `ReadableStream` for the response body together with
1167
- * the raw Response object.
1168
- */
1169
- streamRequest(params: {
1170
- systemPrompt: string;
1171
- contents: Array<{
1172
- role: string;
1173
- content: string;
1174
- }>;
1175
- tools?: unknown;
1176
- signal?: AbortSignal;
1177
- timeoutMs?: number;
1178
- }): Promise<{
1179
- stream: ReadableStream<Uint8Array>;
1180
- response: Response;
1181
- }>;
1182
- /**
1183
- * Extract `TextChunk` and accumulate `ToolCall` data from a single parsed
1184
- * OpenAI SSE JSON object.
1185
- *
1186
- * OpenAI tool calls arrive incrementally: the first chunk for a tool call
1187
- * carries the `id` and `function.name`, while subsequent chunks append to
1188
- * `function.arguments`. We accumulate these in `pendingToolCalls` and only
1189
- * yield complete `ToolCall` objects when the finish_reason is 'tool_calls'
1190
- * or when flushed.
1191
- */
1192
- private extractChunks;
1193
- /**
1194
- * Flush all accumulated pending tool calls as complete `ToolCall` objects.
1195
- */
1196
- private flushPendingToolCalls;
1197
- /**
1198
- * Extract token usage from a parsed OpenAI response chunk.
1199
- * Usage data typically appears in the final chunk when `stream_options`
1200
- * includes `include_usage`, or in the non-streaming response.
1201
- * Returns `null` if no usage data is present.
1202
- */
1203
- extractUsage(parsed: Record<string, unknown>): TokenUsage$1 | null;
1204
- /**
1205
- * Check whether a parsed OpenAI chunk indicates the response was
1206
- * blocked by a content filter.
1207
- *
1208
- * OpenAI signals content filtering through:
1209
- * - `choices[].finish_reason === 'content_filter'`
1210
- * - `choices[].content_filter_results` with `filtered: true`
1211
- */
1212
- isContentFiltered(parsed: Record<string, unknown>): boolean;
1213
- /**
1214
- * Translate an HTTP error response from OpenAI into the appropriate
1215
- * GuideKit error class.
1216
- */
1217
- private handleHttpError;
1218
- }
1219
-
1220
1180
  interface TokenUsage {
1221
1181
  prompt: number;
1222
1182
  completion: number;
@@ -1230,9 +1190,17 @@ interface TokenUsage {
1230
1190
  declare class GeminiAdapter implements LLMProviderAdapter {
1231
1191
  private readonly apiKey;
1232
1192
  private readonly model;
1193
+ /**
1194
+ * Token usage extracted from the most recent `parseResponse` call.
1195
+ * Updated as each SSE chunk is parsed; the final value reflects the
1196
+ * cumulative usage metadata sent by Gemini (typically in the last chunk).
1197
+ */
1198
+ private _lastUsage;
1233
1199
  constructor(config: Extract<LLMConfig, {
1234
1200
  provider: 'gemini';
1235
1201
  }>);
1202
+ /** Token usage from the most recent parseResponse call. */
1203
+ get lastUsage(): TokenUsage;
1236
1204
  /**
1237
1205
  * Convert GuideKit tool definitions into Gemini's `functionDeclarations`
1238
1206
  * format, wrapped inside a `tools` array.
@@ -1255,6 +1223,10 @@ declare class GeminiAdapter implements LLMProviderAdapter {
1255
1223
  * The Gemini `streamGenerateContent?alt=sse` endpoint sends each chunk
1256
1224
  * as a JSON object prefixed by `data: `. We parse line-by-line, extract
1257
1225
  * text parts and function call parts, and yield the appropriate types.
1226
+ *
1227
+ * This method also:
1228
+ * - Detects content filtering and throws `ContentFilterError`.
1229
+ * - Tracks token usage (accessible via `lastUsage` after iteration).
1258
1230
  */
1259
1231
  parseResponse(stream: ReadableStream<Uint8Array>): AsyncIterable<TextChunk | ToolCall>;
1260
1232
  /**
@@ -1275,16 +1247,16 @@ declare class GeminiAdapter implements LLMProviderAdapter {
1275
1247
  /**
1276
1248
  * Build and execute a streaming request to the Gemini API.
1277
1249
  * Returns the raw `ReadableStream` for the response body together with
1278
- * a promise that resolves to token usage extracted from the final chunk.
1250
+ * the raw Response object.
1251
+ *
1252
+ * Note: The Gemini API key is passed as a URL query parameter (`key=`).
1253
+ * This is inherent to the Gemini REST SSE endpoint design; the key is
1254
+ * transmitted over HTTPS so it remains encrypted in transit. (H3)
1279
1255
  */
1280
1256
  streamRequest(params: {
1281
1257
  systemPrompt: string;
1282
- contents: Array<{
1283
- role: string;
1284
- parts: Array<{
1285
- text: string;
1286
- }>;
1287
- }>;
1258
+ contents: unknown;
1259
+ userMessage?: string;
1288
1260
  tools?: unknown;
1289
1261
  signal?: AbortSignal;
1290
1262
  timeoutMs?: number;
@@ -1317,12 +1289,16 @@ declare class GeminiAdapter implements LLMProviderAdapter {
1317
1289
  * High-level orchestrator that manages LLM interactions for the GuideKit SDK.
1318
1290
  *
1319
1291
  * Responsibilities:
1320
- * - Owns the active `LLMProviderAdapter` (currently only `GeminiAdapter`).
1292
+ * - Owns the active `LLMProviderAdapter`.
1321
1293
  * - Streams responses from the provider, emitting callbacks for text chunks,
1322
1294
  * tool calls, and token usage.
1323
1295
  * - Handles content filter retries: if the initial response is blocked, it
1324
1296
  * retries once with a stripped-down prompt (no tools).
1325
1297
  * - Surfaces all errors through the SDK error hierarchy.
1298
+ *
1299
+ * The orchestrator is fully adapter-agnostic: all provider-specific logic
1300
+ * (SSE parsing, content filter detection, usage extraction) lives in the
1301
+ * adapter implementations.
1326
1302
  */
1327
1303
  declare class LLMOrchestrator {
1328
1304
  private _adapter;
@@ -1363,12 +1339,22 @@ declare class LLMOrchestrator {
1363
1339
  get adapter(): LLMProviderAdapter;
1364
1340
  /**
1365
1341
  * Execute a streaming LLM request and collect the results.
1342
+ *
1343
+ * This method is fully adapter-agnostic: it delegates streaming,
1344
+ * response parsing, content-filter detection, and usage extraction
1345
+ * entirely to the active `LLMProviderAdapter`. No provider-specific
1346
+ * SSE parsing lives in the orchestrator.
1366
1347
  */
1367
1348
  private executeStream;
1368
1349
  /**
1369
1350
  * Create the appropriate adapter for the given config.
1370
- * Currently only Gemini is implemented; other providers will be added
1371
- * as the SDK evolves.
1351
+ *
1352
+ * Built-in providers:
1353
+ * - `'gemini'` — uses the bundled `GeminiAdapter`.
1354
+ *
1355
+ * Custom adapters:
1356
+ * - Pass `{ adapter: myAdapter }` to use any `LLMProviderAdapter`.
1357
+ * Example: `llm: { adapter: myCustomAdapter }`
1372
1358
  */
1373
1359
  private createAdapter;
1374
1360
  /** Convenience accessor for the current provider name. */
@@ -2183,4 +2169,435 @@ declare class TokenManager {
2183
2169
  private log;
2184
2170
  }
2185
2171
 
2186
- export { type AgentConfig, type AgentState, type AggregatedUsage, AuthenticationError, type AwarenessOptions, type AwarenessState, AwarenessSystem, type BeforeLLMCallContext, BrowserSupportError, ConfigurationError, ConnectionManager, type ConnectionManagerOptions, type ConnectionState, ContentFilterError, type ContentMap, type ContentMapEntry, type ContentMapFunction, type ContentMapInput, ContextManager, type ContextManagerOptions, type ConversationTurn, type CreateSessionTokenOptions, DOMScanner, type DOMScannerOptions, type ErrorCode, ErrorCodes, EventBus, type EventMap, type FormField, type FormSummary, GeminiAdapter, GuideKitCore, type GuideKitCoreOptions, GuideKitError, type GuideKitErrorOptions, type GuideKitErrorType, type GuideKitEvent, type GuideKitOptions, type GuideKitProviderProps, type GuideKitStore, type GuideKitTheme, type HealthCheckResult, type HealthCheckStatus, I18n, type I18nOptions, type I18nStrings, InitializationError, type InteractiveElement, type LLMConfig, LLMOrchestrator, type LLMProviderAdapter, type LocaleInput, type NavItem, NavigationController, type NavigationControllerOptions, NetworkError, OpenAIAdapter, type OverlayElement, type PageModel, type PageSection, PermissionError, type ProactiveOptions, type ProactiveTrigger, ProactiveTriggerEngine, type ProactiveTriggerType, type Provider, type QueuedMessage, RateLimitError, RateLimiter, type RateLimiterOptions, type RateLimiterState, type RateLimits, type Resource, ResourceExhaustedError, ResourceManager, type ResourceManagerState, type STTConfig, type ScanMetadata, type SessionState, SingletonGuard, type SpotlightState, type SupportedLocale, type TTSConfig, type TextChunk, TimeoutError, type TokenData, TokenManager, type TokenManagerOptions, type TokenPayload, type TokenResponse, type ToolCall, type ToolCallRecord, type ToolDefinition, type ToolExecutionResult, ToolExecutor, type ToolExecutorOptions, type ToolHandler, type TooltipOptions, VisualGuidance, type 
VisualGuidanceOptions, createEventBus, isGuideKitError };
2172
+ /**
2173
+ * Minimal type declarations for the Web Speech API SpeechRecognition
2174
+ * interface. These cover the subset used by this adapter. Full type
2175
+ * definitions are available in lib.dom.d.ts but may not be present in
2176
+ * all TS configurations.
2177
+ */
2178
+ interface SpeechRecognitionEvent {
2179
+ readonly resultIndex: number;
2180
+ readonly results: SpeechRecognitionResultList;
2181
+ }
2182
+ interface SpeechRecognitionResultList {
2183
+ readonly length: number;
2184
+ item(index: number): SpeechRecognitionResult;
2185
+ [index: number]: SpeechRecognitionResult;
2186
+ }
2187
+ interface SpeechRecognitionResult {
2188
+ readonly length: number;
2189
+ readonly isFinal: boolean;
2190
+ item(index: number): SpeechRecognitionAlternative;
2191
+ [index: number]: SpeechRecognitionAlternative;
2192
+ }
2193
+ interface SpeechRecognitionAlternative {
2194
+ readonly transcript: string;
2195
+ readonly confidence: number;
2196
+ }
2197
+ interface SpeechRecognitionErrorEvent {
2198
+ readonly error: string;
2199
+ readonly message: string;
2200
+ }
2201
+ interface SpeechRecognitionInstance extends EventTarget {
2202
+ lang: string;
2203
+ continuous: boolean;
2204
+ interimResults: boolean;
2205
+ maxAlternatives: number;
2206
+ onresult: ((event: SpeechRecognitionEvent) => void) | null;
2207
+ onerror: ((event: SpeechRecognitionErrorEvent) => void) | null;
2208
+ onend: (() => void) | null;
2209
+ onstart: (() => void) | null;
2210
+ start(): void;
2211
+ stop(): void;
2212
+ abort(): void;
2213
+ }
2214
+ interface SpeechRecognitionConstructor {
2215
+ new (): SpeechRecognitionInstance;
2216
+ }
2217
+ declare global {
2218
+ var webkitSpeechRecognition: SpeechRecognitionConstructor | undefined;
2219
+ }
2220
+ interface WebSpeechSTTOptions {
2221
+ language?: string;
2222
+ continuous?: boolean;
2223
+ interimResults?: boolean;
2224
+ debug?: boolean;
2225
+ }
2226
+ declare class WebSpeechSTT {
2227
+ private readonly language;
2228
+ private readonly continuous;
2229
+ private readonly interimResultsEnabled;
2230
+ private readonly debugEnabled;
2231
+ private recognition;
2232
+ private _connected;
2233
+ private _suspended;
2234
+ /**
2235
+ * Whether we intentionally stopped recognition. Used to distinguish
2236
+ * between intentional stop and unexpected end (for auto-restart in
2237
+ * continuous mode).
2238
+ */
2239
+ private _intentionalStop;
2240
+ /** Registered transcript callbacks. */
2241
+ private readonly transcriptCallbacks;
2242
+ constructor(options?: WebSpeechSTTOptions);
2243
+ /**
2244
+ * Check whether the Web Speech API SpeechRecognition is supported in the
2245
+ * current environment. Safe to call in SSR (returns false).
2246
+ */
2247
+ static isSupported(): boolean;
2248
+ /** Whether recognition is currently active and connected. */
2249
+ get isConnected(): boolean;
2250
+ /**
2251
+ * Start speech recognition.
2252
+ *
2253
+ * Creates the SpeechRecognition instance and begins listening. Resolves
2254
+ * once the recognition session has started. Rejects if the API is not
2255
+ * supported or the browser denies permission.
2256
+ */
2257
+ connect(): Promise<void>;
2258
+ /**
2259
+ * Send audio data. No-op for Web Speech API since it captures audio
2260
+ * directly from the microphone via the browser's internal pipeline.
2261
+ *
2262
+ * Provided for interface compatibility with WebSocket-based STT adapters
2263
+ * (DeepgramSTT, ElevenLabsSTT).
2264
+ */
2265
+ sendAudio(_audioData: Float32Array | Int16Array): void;
2266
+ /**
2267
+ * Register a callback to receive transcript events.
2268
+ *
2269
+ * @returns An unsubscribe function. Calling it more than once is safe.
2270
+ */
2271
+ onTranscript(callback: (event: STTTranscriptEvent) => void): () => void;
2272
+ /**
2273
+ * Gracefully stop recognition.
2274
+ *
2275
+ * Calls `stop()` on the SpeechRecognition instance which allows it to
2276
+ * deliver any pending final results before ending.
2277
+ */
2278
+ close(): void;
2279
+ /** Force-destroy the recognition without waiting for pending results. */
2280
+ destroy(): void;
2281
+ /**
2282
+ * Suspend the adapter (e.g. when the device goes offline).
2283
+ *
2284
+ * Stops recognition and marks the adapter as suspended so that auto-restart
2285
+ * does not trigger.
2286
+ */
2287
+ suspend(): void;
2288
+ /**
2289
+ * Resume after a prior `suspend()`. Restarts recognition if it was
2290
+ * running before suspension.
2291
+ */
2292
+ resume(): void;
2293
+ /**
2294
+ * Handle SpeechRecognition result events.
2295
+ *
2296
+ * The `results` property is a SpeechRecognitionResultList containing all
2297
+ * results accumulated during this recognition session. We only process
2298
+ * results from `resultIndex` onward to avoid re-emitting old results.
2299
+ */
2300
+ private handleResult;
2301
+ /**
2302
+ * Handle SpeechRecognition errors.
2303
+ *
2304
+ * Some errors are recoverable (e.g. `no-speech`) and some are fatal
2305
+ * (e.g. `not-allowed`). For recoverable errors in continuous mode,
2306
+ * recognition will auto-restart via the `onend` handler.
2307
+ */
2308
+ private handleError;
2309
+ /**
2310
+ * Emit a transcript event to all registered callbacks.
2311
+ *
2312
+ * Errors thrown by individual callbacks are caught and logged so one
2313
+ * misbehaving subscriber does not prevent others from receiving the event.
2314
+ */
2315
+ private emitTranscript;
2316
+ /**
2317
+ * Resolve the SpeechRecognition constructor, with the webkit-prefixed
2318
+ * fallback. Returns null if not available.
2319
+ */
2320
+ private resolveSpeechRecognition;
2321
+ /** Reset internal state after disconnection. */
2322
+ private cleanup;
2323
+ /** Conditional debug logging. */
2324
+ private log;
2325
+ }
2326
+
2327
+ interface WebSpeechTTSOptions {
2328
+ voice?: string;
2329
+ rate?: number;
2330
+ pitch?: number;
2331
+ language?: string;
2332
+ debug?: boolean;
2333
+ }
2334
+ /**
2335
+ * Audio event compatible with the TTSAudioEvent shape used by
2336
+ * VoicePipeline for ElevenLabs TTS. Web Speech API does not produce
2337
+ * raw audio buffers, so we emit events with empty buffers and use
2338
+ * isFinal to signal utterance completion.
2339
+ */
2340
+ interface WebSpeechTTSAudioEvent {
2341
+ audio: ArrayBuffer;
2342
+ isFinal: boolean;
2343
+ timestamp: number;
2344
+ }
2345
+ declare class WebSpeechTTS {
2346
+ private readonly voiceName;
2347
+ private readonly rate;
2348
+ private readonly pitch;
2349
+ private readonly language;
2350
+ private readonly debugEnabled;
2351
+ private _connected;
2352
+ private _suspended;
2353
+ /** Cached voice object resolved from voiceName. */
2354
+ private _resolvedVoice;
2355
+ /** Whether voices have been loaded (they load async in some browsers). */
2356
+ private _voicesLoaded;
2357
+ /** Registered audio-event callbacks. */
2358
+ private readonly audioCallbacks;
2359
+ constructor(options?: WebSpeechTTSOptions);
2360
+ /**
2361
+ * Check whether the Web Speech API SpeechSynthesis is supported in the
2362
+ * current environment. Safe to call in SSR (returns false).
2363
+ */
2364
+ static isSupported(): boolean;
2365
+ /** Whether the adapter is connected (ready for speech). */
2366
+ get isConnected(): boolean;
2367
+ /**
2368
+ * Initialize the adapter.
2369
+ *
2370
+ * Loads available voices and resolves the requested voice name. Voice
2371
+ * loading is async in some browsers (notably Chrome) so we wait for
2372
+ * the `voiceschanged` event if needed.
2373
+ */
2374
+ connect(): Promise<void>;
2375
+ /**
2376
+ * Speak the given text using the browser's speech synthesis engine.
2377
+ *
2378
+ * Declared to return `void` rather than a Promise, so callers cannot
2379
+ * await completion here; use the audio events described below instead.
2380
+ *
2381
+ * Also emits audio events to registered callbacks for VoicePipeline
2382
+ * compatibility.
2383
+ */
2384
+ speak(text: string): void;
2385
+ /**
2386
+ * Flush / finalize the current utterance.
2387
+ *
2388
+ * No-op for Web Speech API since each speak() call is a complete
2389
+ * utterance. Provided for interface compatibility with ElevenLabsTTS.
2390
+ */
2391
+ flush(): void;
2392
+ /**
2393
+ * Register a callback to receive audio output events.
2394
+ *
2395
+ * For Web Speech API, these events have empty audio buffers and are
2396
+ * used to signal utterance start/end for VoicePipeline state management.
2397
+ *
2398
+ * @returns An unsubscribe function. Calling it more than once is safe.
2399
+ */
2400
+ onAudio(callback: (event: WebSpeechTTSAudioEvent) => void): () => void;
2401
+ /** Stop current speech synthesis and cancel any queued utterances. */
2402
+ stop(): void;
2403
+ /** Gracefully close the adapter. */
2404
+ close(): void;
2405
+ /** Force-destroy the adapter. */
2406
+ destroy(): void;
2407
+ /**
2408
+ * Suspend the adapter (e.g. when the device goes offline).
2409
+ *
2410
+ * Pauses any active speech synthesis and marks the adapter as suspended.
2411
+ */
2412
+ suspend(): void;
2413
+ /**
2414
+ * Resume after a prior `suspend()`.
2415
+ */
2416
+ resume(): void;
2417
+ /**
2418
+ * Load available voices from the browser.
2419
+ *
2420
+ * In Chrome and some other browsers, voices load asynchronously after
2421
+ * the page loads. We wait for the `voiceschanged` event with a timeout.
2422
+ */
2423
+ private loadVoices;
2424
+ /**
2425
+ * Find a voice by name (case-insensitive partial match).
2426
+ */
2427
+ private findVoice;
2428
+ /**
2429
+ * Emit an audio event to all registered callbacks.
2430
+ *
2431
+ * Errors thrown by individual callbacks are caught and logged so one
2432
+ * misbehaving subscriber does not prevent others from receiving the event.
2433
+ */
2434
+ private emitAudio;
2435
+ /** Reset internal state. */
2436
+ private cleanup;
2437
+ /** Conditional debug logging. */
2438
+ private log;
2439
+ }
2440
+
2441
+ type VoiceState = 'idle' | 'listening' | 'processing' | 'speaking' | 'error';
2442
+ interface VoicePipelineOptions {
2443
+ sttConfig: {
2444
+ provider: 'deepgram';
2445
+ apiKey: string;
2446
+ model?: 'nova-2' | 'nova-3';
2447
+ } | {
2448
+ provider: 'elevenlabs';
2449
+ apiKey: string;
2450
+ language?: string;
2451
+ } | {
2452
+ provider: 'web-speech';
2453
+ language?: string;
2454
+ continuous?: boolean;
2455
+ interimResults?: boolean;
2456
+ };
2457
+ ttsConfig: {
2458
+ provider: 'elevenlabs';
2459
+ apiKey: string;
2460
+ voiceId?: string;
2461
+ modelId?: string;
2462
+ } | {
2463
+ provider: 'web-speech';
2464
+ voice?: string;
2465
+ rate?: number;
2466
+ pitch?: number;
2467
+ language?: string;
2468
+ };
2469
+ debug?: boolean;
2470
+ }
2471
+ declare global {
2472
+ var webkitAudioContext: typeof AudioContext | undefined;
2473
+ }
2474
+ declare class VoicePipeline {
2475
+ private readonly _sttConfig;
2476
+ private readonly _ttsConfig;
2477
+ private readonly _debug;
2478
+ private _state;
2479
+ private _destroyed;
2480
+ private readonly _bus;
2481
+ private _audioContext;
2482
+ private _mediaStream;
2483
+ private _vad;
2484
+ private _stt;
2485
+ private _tts;
2486
+ private _micSourceNode;
2487
+ private _captureProcessor;
2488
+ private _isForwardingToSTT;
2489
+ private _playbackQueue;
2490
+ private _jitterBufferTimer;
2491
+ private _isPlaybackStarted;
2492
+ private _nextPlaybackTime;
2493
+ private _activeSourceNodes;
2494
+ private _lastScheduledSource;
2495
+ private _lastTTSEcho;
2496
+ private _pendingLLMAbort;
2497
+ private readonly _stateChangeCallbacks;
2498
+ private readonly _transcriptCallbacks;
2499
+ private _unsubVADSpeechStart;
2500
+ private _unsubVADSpeechEnd;
2501
+ private _unsubSTTTranscript;
2502
+ private _unsubTTSAudio;
2503
+ constructor(options: VoicePipelineOptions);
2504
+ /** Current pipeline state. */
2505
+ get state(): VoiceState;
2506
+ /**
2507
+ * Initialize AudioContext, VAD model, and STT/TTS connections.
2508
+ *
2509
+ * **Must be called in response to a user gesture** (click / tap) to
2510
+ * satisfy browser autoplay policies.
2511
+ */
2512
+ init(): Promise<void>;
2513
+ /**
2514
+ * Start listening: activate microphone, begin VAD + STT pipeline.
2515
+ *
2516
+ * Valid from: IDLE, ERROR, SPEAKING (barge-in path calls this internally).
2517
+ */
2518
+ startListening(): Promise<void>;
2519
+ /** Stop listening: deactivate mic and VAD. */
2520
+ stopListening(): void;
2521
+ /**
2522
+ * Process a transcript through an LLM callback and speak the response.
2523
+ *
2524
+ * @param text - The user's transcript text.
2525
+ * @param sendToLLM - Async callback that sends text to the LLM and returns the response.
2526
+ */
2527
+ processTranscript(text: string, sendToLLM: (text: string) => Promise<string>): Promise<void>;
2528
+ /** Speak text via TTS (ElevenLabs or Web Speech API). */
2529
+ speak(text: string): Promise<void>;
2530
+ /** Stop current TTS playback immediately (barge-in). */
2531
+ stopSpeaking(): void;
2532
+ /** Subscribe to state changes. Returns an unsubscribe function. */
2533
+ onStateChange(callback: (state: VoiceState, previous: VoiceState) => void): () => void;
2534
+ /** Subscribe to transcript events. Returns an unsubscribe function. */
2535
+ onTranscript(callback: (text: string, isFinal: boolean) => void): () => void;
2536
+ /** Destroy all resources held by the pipeline. */
2537
+ destroy(): Promise<void>;
2538
+ private _setState;
2539
+ /**
2540
+ * Resolve the AudioContext constructor, with Safari webkitAudioContext
2541
+ * fallback. Returns null if Web Audio is not available.
2542
+ */
2543
+ private _resolveAudioContext;
2544
+ /**
2545
+ * Pre-warm the AudioContext by playing a silent buffer.
2546
+ * This forces the context into the "running" state and avoids a
2547
+ * noticeable delay on the first real playback.
2548
+ */
2549
+ private _prewarmAudioContext;
2550
+ /**
2551
+ * Set up a ScriptProcessorNode to capture mic audio and forward it
2552
+ * to the STT adapter when `_isForwardingToSTT` is true.
2553
+ */
2554
+ private _setupMicCapture;
2555
+ /** Tear down the mic capture ScriptProcessorNode. */
2556
+ private _teardownMicCapture;
2557
+ /** Stop all tracks on the current MediaStream. */
2558
+ private _stopMicTracks;
2559
+ private _handleVADSpeechStart;
2560
+ private _handleVADSpeechEnd;
2561
+ private _handleTranscript;
2562
+ /**
2563
+ * Handle an audio chunk from ElevenLabs TTS.
2564
+ *
2565
+ * Implements a jitter buffer: we accumulate audio for JITTER_BUFFER_MS
2566
+ * before starting playback to smooth out network jitter.
2567
+ */
2568
+ private _handleTTSAudio;
2569
+ /** Flush the jitter buffer and start playback. */
2570
+ private _flushJitterBuffer;
2571
+ /**
2572
+ * Begin playback: decode all queued chunks and schedule them.
2573
+ * If `onDone` is provided, it is called when the last chunk finishes playing.
2574
+ */
2575
+ private _startPlayback;
2576
+ /**
2577
+ * Decode an audio chunk (mp3 from ElevenLabs) and schedule it for
2578
+ * sequential playback via AudioBufferSourceNode.
2579
+ */
2580
+ private _decodeAndSchedule;
2581
+ /**
2582
+ * Check if VAD speech-start during SPEAKING state is likely echo from
2583
+ * the speaker playing TTS audio rather than genuine user speech.
2584
+ *
2585
+ * Simple heuristic: if we are still within the echo window of a recent
2586
+ * TTS utterance, treat it as potential echo.
2587
+ */
2588
+ private _isEchoDetected;
2589
+ /**
2590
+ * Check if a transcript is an echo of recent TTS output.
2591
+ *
2592
+ * Uses word overlap: if intersection of words > 60% of max set size
2593
+ * and the transcript arrived within the echo window, discard it.
2594
+ */
2595
+ private _isTranscriptEcho;
2596
+ /**
2597
+ * Normalize text into a set of lowercase words, stripping punctuation.
2598
+ */
2599
+ private _normalizeWords;
2600
+ private _log;
2601
+ }
2602
+
2603
+ export { type AgentConfig, type AgentState, type AggregatedUsage, AuthenticationError, type AwarenessOptions, type AwarenessState, AwarenessSystem, type BeforeLLMCallContext, BrowserSupportError, ConfigurationError, ConnectionManager, type ConnectionManagerOptions, type ConnectionState, ContentFilterError, type ContentMap, type ContentMapEntry, type ContentMapFunction, type ContentMapInput, ContextManager, type ContextManagerOptions, type ConversationTurn, type CreateSessionTokenOptions, DOMScanner, type DOMScannerOptions, type ErrorCode, ErrorCodes, EventBus, type EventMap, type FormField, type FormSummary, GeminiAdapter, GuideKitCore, type GuideKitCoreOptions, GuideKitError, type GuideKitErrorOptions, type GuideKitErrorType, type GuideKitEvent, type GuideKitOptions, type GuideKitProviderProps, type GuideKitStore, type GuideKitTheme, type HealthCheckResult, type HealthCheckStatus, I18n, type I18nOptions, type I18nStrings, InitializationError, type InteractiveElement, type LLMConfig, LLMOrchestrator, type LLMProviderAdapter, type LocaleInput, type NavItem, NavigationController, type NavigationControllerOptions, NetworkError, type OverlayElement, type PageModel, type PageSection, PermissionError, type ProactiveOptions, type ProactiveTrigger, ProactiveTriggerEngine, type ProactiveTriggerType, type Provider, type QueuedMessage, RateLimitError, RateLimiter, type RateLimiterOptions, type RateLimiterState, type RateLimits, type Resource, ResourceExhaustedError, ResourceManager, type ResourceManagerState, type STTConfig, type ScanMetadata, type SessionState, SingletonGuard, type SpotlightState, type SupportedLocale, type TTSConfig, type TextChunk, TimeoutError, type TokenData, TokenManager, type TokenManagerOptions, type TokenPayload, type TokenResponse, type ToolCall, type ToolCallRecord, type ToolDefinition, type ToolExecutionResult, ToolExecutor, type ToolExecutorOptions, type ToolHandler, type TooltipOptions, VisualGuidance, type VisualGuidanceOptions, 
VoicePipeline, type VoicePipelineOptions, type VoiceState, WebSpeechSTT, type WebSpeechSTTOptions, WebSpeechTTS, type WebSpeechTTSAudioEvent, type WebSpeechTTSOptions, createEventBus, isGuideKitError };