npm - @guidekit/core - Versions diffs - 0.1.0-beta.1 → 0.1.0-beta.3 - Mend

@guidekit/core 0.1.0-beta.1 → 0.1.0-beta.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

package/dist/index.d.cts CHANGED Viewed

@@ -351,8 +351,14 @@ type STTConfig = {
     apiKey: string;
     model?: 'nova-2' | 'nova-3';
 } | {
-    provider: 'assemblyai';
+    provider: 'elevenlabs';
     apiKey: string;
+    language?: string;
+} | {
+    provider: 'web-speech';
+    language?: string;
+    continuous?: boolean;
+    interimResults?: boolean;
 };
 /** Text-to-speech provider configuration. */
 type TTSConfig = {
@@ -360,19 +366,26 @@ type TTSConfig = {
     apiKey: string;
     voiceId?: string;
 } | {
-    provider: 'cartesia';
-    apiKey: string;
-    voiceId?: string;
+    provider: 'web-speech';
+    voice?: string;
+    rate?: number;
+    pitch?: number;
+    language?: string;
 };
+/** Transcript event emitted by any STT adapter. */
+interface STTTranscriptEvent {
+    text: string;
+    isFinal: boolean;
+    confidence: number;
+    timestamp: number;
+}
 /** Large language model provider configuration. */
 type LLMConfig = {
     provider: 'gemini';
     apiKey: string;
     model?: 'gemini-2.5-flash' | 'gemini-2.5-pro';
 } | {
-    provider: 'openai';
-    apiKey: string;
-    model?: 'gpt-4o' | 'gpt-4o-mini';
+    adapter: LLMProviderAdapter;
 };
 /** Top-level options that control SDK behaviour. */
 interface GuideKitOptions {
@@ -481,11 +494,27 @@ interface GuideKitProviderProps {
     }>;
     children?: unknown;
 }
+/** A single JSON-Schema-style property descriptor used in tool parameter maps. */
+interface ToolParameterSchema {
+    type: string;
+    description?: string;
+    enum?: string[];
+    items?: {
+        type: string;
+    };
+    [key: string]: unknown;
+}
 /** Definition of a tool that can be invoked by the LLM. */
 interface ToolDefinition {
     name: string;
     description: string;
-    parameters: Record<string, unknown>;
+    /** Flat map of param name → JSON Schema property descriptor. */
+    parameters: Record<string, ToolParameterSchema>;
+    /**
+     * Names of the entries in `parameters` that the LLM must always provide.
+     * Omit this field, or pass [], when every parameter is optional.
+     */
+    required?: string[];
     schemaVersion: number;
 }
 /** A tool invocation request returned by the LLM. */
@@ -509,6 +538,35 @@ interface LLMProviderAdapter {
     formatConversation(history: ConversationTurn[]): unknown;
     parseResponse(stream: ReadableStream): AsyncIterable<TextChunk | ToolCall>;
     formatToolResult(callId: string, result: unknown): unknown;
+    /**
+     * Build and execute a streaming request to the provider API.
+     * Returns the raw ReadableStream for the response body.
+     */
+    streamRequest(params: {
+        systemPrompt: string;
+        contents: unknown;
+        userMessage?: string;
+        tools?: unknown;
+        signal?: AbortSignal;
+        timeoutMs?: number;
+    }): Promise<{
+        stream: ReadableStream<Uint8Array>;
+        response: Response;
+    }>;
+    /**
+     * Check whether a parsed response chunk indicates the response was
+     * blocked by a content/safety filter.
+     */
+    isContentFiltered(chunk: Record<string, unknown>): boolean;
+    /**
+     * Extract token usage from a parsed response chunk.
+     * Returns `null` if no usage metadata is present in this chunk.
+     */
+    extractUsage(chunk: Record<string, unknown>): {
+        prompt: number;
+        completion: number;
+        total: number;
+    } | null;
 }
 /** Decoded payload of a GuideKit session token. */
 interface TokenPayload {
@@ -529,9 +587,9 @@ interface TokenResponse {
 /** Options for `createSessionToken()` on the server side. */
 interface CreateSessionTokenOptions {
     signingSecret: string | string[];
-    deepgramKey?: string;
-    elevenlabsKey?: string;
-    geminiKey?: string;
+    sttApiKey?: string;
+    ttsApiKey?: string;
+    llmApiKey?: string;
     expiresIn?: string;
     allowedOrigins?: string[];
     permissions?: string[];
@@ -586,10 +644,11 @@ declare const ErrorCodes: {
     readonly VAD_PACKAGE_MISSING: "VAD_PACKAGE_MISSING";
     readonly CONTENT_FILTER_TRIGGERED: "CONTENT_FILTER_TRIGGERED";
     readonly PRIVACY_HOOK_CANCELLED: "PRIVACY_HOOK_CANCELLED";
+    readonly UNKNOWN: "UNKNOWN";
 };
 /** Union of every known error code string. */
 type ErrorCode = (typeof ErrorCodes)[keyof typeof ErrorCodes];
-type Provider = 'deepgram' | 'elevenlabs' | 'gemini' | 'openai';
+type Provider = 'deepgram' | 'elevenlabs' | 'gemini' | 'openai' | 'web-speech' | (string & {});
 interface GuideKitErrorOptions {
     code: string;
     message: string;
@@ -830,6 +889,11 @@ declare class GuideKitCore {
     private setAgentState;
     private notifyStoreListeners;
     private buildSnapshot;
+    /**
+     * Unified built-in tool specifications — single source of truth for both
+     * tool definitions (sent to LLM) and handler registration.
+     */
+    private getBuiltinToolSpecs;
     /**
      * Register all built-in tool handlers with the ToolExecutor.
      * Called once during init() after VisualGuidance and all subsystems are ready.
@@ -1113,110 +1177,6 @@ declare class ContextManager {
     private log;
 }
-interface TokenUsage$1 {
-    prompt: number;
-    completion: number;
-    total: number;
-}
-/**
- * Adapter that translates between GuideKit's internal types and the
- * OpenAI Chat Completions API wire format. Handles streaming via SSE,
- * tool formatting, and response parsing.
- */
-declare class OpenAIAdapter implements LLMProviderAdapter {
-    private readonly apiKey;
-    private readonly model;
-    constructor(config: Extract<LLMConfig, {
-        provider: 'openai';
-    }>);
-    /**
-     * Convert GuideKit tool definitions into OpenAI's `tools` format.
-     * Each tool is wrapped as `{ type: 'function', function: { name, description, parameters } }`.
-     */
-    formatTools(tools: ToolDefinition[]): unknown;
-    /**
-     * Convert an array of `ConversationTurn` objects into OpenAI's messages
-     * format with `role: 'user' | 'assistant'`.
-     */
-    formatConversation(history: ConversationTurn[]): Array<{
-        role: 'user' | 'assistant';
-        content: string;
-    }>;
-    /**
-     * Parse an OpenAI SSE streaming response into an async iterable of
-     * `TextChunk` and `ToolCall` objects.
-     *
-     * The OpenAI streaming endpoint sends each chunk as a JSON object
-     * prefixed by `data: `. The final line is `data: [DONE]`.
-     * Text content arrives in `choices[0].delta.content` and tool calls
-     * arrive in `choices[0].delta.tool_calls`.
-     */
-    parseResponse(stream: ReadableStream<Uint8Array>): AsyncIterable<TextChunk | ToolCall>;
-    /**
-     * Format a tool result so it can be sent back to OpenAI as a
-     * `tool` role message with the `tool_call_id`.
-     */
-    formatToolResult(callId: string, result: unknown): {
-        role: 'tool';
-        tool_call_id: string;
-        content: string;
-    };
-    /**
-     * Build and execute a streaming request to the OpenAI Chat Completions API.
-     * Returns the raw `ReadableStream` for the response body together with
-     * the raw Response object.
-     */
-    streamRequest(params: {
-        systemPrompt: string;
-        contents: Array<{
-            role: string;
-            content: string;
-        }>;
-        tools?: unknown;
-        signal?: AbortSignal;
-        timeoutMs?: number;
-    }): Promise<{
-        stream: ReadableStream<Uint8Array>;
-        response: Response;
-    }>;
-    /**
-     * Extract `TextChunk` and accumulate `ToolCall` data from a single parsed
-     * OpenAI SSE JSON object.
-     *
-     * OpenAI tool calls arrive incrementally: the first chunk for a tool call
-     * carries the `id` and `function.name`, while subsequent chunks append to
-     * `function.arguments`. We accumulate these in `pendingToolCalls` and only
-     * yield complete `ToolCall` objects when the finish_reason is 'tool_calls'
-     * or when flushed.
-     */
-    private extractChunks;
-    /**
-     * Flush all accumulated pending tool calls as complete `ToolCall` objects.
-     */
-    private flushPendingToolCalls;
-    /**
-     * Extract token usage from a parsed OpenAI response chunk.
-     * Usage data typically appears in the final chunk when `stream_options`
-     * includes `include_usage`, or in the non-streaming response.
-     * Returns `null` if no usage data is present.
-     */
-    extractUsage(parsed: Record<string, unknown>): TokenUsage$1 | null;
-    /**
-     * Check whether a parsed OpenAI chunk indicates the response was
-     * blocked by a content filter.
-     *
-     * OpenAI signals content filtering through:
-     * - `choices[].finish_reason === 'content_filter'`
-     * - `choices[].content_filter_results` with `filtered: true`
-     */
-    isContentFiltered(parsed: Record<string, unknown>): boolean;
-    /**
-     * Translate an HTTP error response from OpenAI into the appropriate
-     * GuideKit error class.
-     */
-    private handleHttpError;
-}
 interface TokenUsage {
     prompt: number;
     completion: number;
@@ -1230,9 +1190,17 @@ interface TokenUsage {
 declare class GeminiAdapter implements LLMProviderAdapter {
     private readonly apiKey;
     private readonly model;
+    /**
+     * Token usage extracted from the most recent `parseResponse` call.
+     * Updated as each SSE chunk is parsed; the final value reflects the
+     * cumulative usage metadata sent by Gemini (typically in the last chunk).
+     */
+    private _lastUsage;
     constructor(config: Extract<LLMConfig, {
         provider: 'gemini';
     }>);
+    /** Token usage from the most recent parseResponse call. */
+    get lastUsage(): TokenUsage;
     /**
      * Convert GuideKit tool definitions into Gemini's `functionDeclarations`
      * format, wrapped inside a `tools` array.
@@ -1255,6 +1223,10 @@ declare class GeminiAdapter implements LLMProviderAdapter {
      * The Gemini `streamGenerateContent?alt=sse` endpoint sends each chunk
      * as a JSON object prefixed by `data: `. We parse line-by-line, extract
      * text parts and function call parts, and yield the appropriate types.
+     *
+     * This method also:
+     * - Detects content filtering and throws `ContentFilterError`.
+     * - Tracks token usage (accessible via `lastUsage` after iteration).
      */
     parseResponse(stream: ReadableStream<Uint8Array>): AsyncIterable<TextChunk | ToolCall>;
     /**
@@ -1275,16 +1247,16 @@ declare class GeminiAdapter implements LLMProviderAdapter {
     /**
      * Build and execute a streaming request to the Gemini API.
      * Returns the raw `ReadableStream` for the response body together with
-     * a promise that resolves to token usage extracted from the final chunk.
+     * the raw Response object.
+     *
+     * Note: The Gemini API key is passed as a URL query parameter (`key=`).
+     * This is inherent to the Gemini REST SSE endpoint design; the key is
+     * transmitted over HTTPS so it remains encrypted in transit. (H3)
      */
     streamRequest(params: {
         systemPrompt: string;
-        contents: Array<{
-            role: string;
-            parts: Array<{
-                text: string;
-            }>;
-        }>;
+        contents: unknown;
+        userMessage?: string;
         tools?: unknown;
         signal?: AbortSignal;
         timeoutMs?: number;
@@ -1317,12 +1289,16 @@ declare class GeminiAdapter implements LLMProviderAdapter {
  * High-level orchestrator that manages LLM interactions for the GuideKit SDK.
  *
  * Responsibilities:
- * - Owns the active `LLMProviderAdapter` (currently only `GeminiAdapter`).
+ * - Owns the active `LLMProviderAdapter`.
  * - Streams responses from the provider, emitting callbacks for text chunks,
  *   tool calls, and token usage.
  * - Handles content filter retries: if the initial response is blocked, it
  *   retries once with a stripped-down prompt (no tools).
  * - Surfaces all errors through the SDK error hierarchy.
+ *
+ * The orchestrator is fully adapter-agnostic: all provider-specific logic
+ * (SSE parsing, content filter detection, usage extraction) lives in the
+ * adapter implementations.
  */
 declare class LLMOrchestrator {
     private _adapter;
@@ -1363,12 +1339,22 @@ declare class LLMOrchestrator {
     get adapter(): LLMProviderAdapter;
     /**
      * Execute a streaming LLM request and collect the results.
+     *
+     * This method is fully adapter-agnostic: it delegates streaming,
+     * response parsing, content-filter detection, and usage extraction
+     * entirely to the active `LLMProviderAdapter`. No provider-specific
+     * SSE parsing lives in the orchestrator.
      */
     private executeStream;
     /**
      * Create the appropriate adapter for the given config.
-     * Currently only Gemini is implemented; other providers will be added
-     * as the SDK evolves.
+     *
+     * Built-in providers:
+     * - `'gemini'` — uses the bundled `GeminiAdapter`.
+     *
+     * Custom adapters:
+     * - Pass `{ adapter: myAdapter }` to use any `LLMProviderAdapter`.
+     *   Example: `llm: { adapter: myCustomAdapter }`
      */
     private createAdapter;
     /** Convenience accessor for the current provider name. */
@@ -2183,4 +2169,435 @@ declare class TokenManager {
     private log;
 }
-export { type AgentConfig, type AgentState, type AggregatedUsage, AuthenticationError, type AwarenessOptions, type AwarenessState, AwarenessSystem, type BeforeLLMCallContext, BrowserSupportError, ConfigurationError, ConnectionManager, type ConnectionManagerOptions, type ConnectionState, ContentFilterError, type ContentMap, type ContentMapEntry, type ContentMapFunction, type ContentMapInput, ContextManager, type ContextManagerOptions, type ConversationTurn, type CreateSessionTokenOptions, DOMScanner, type DOMScannerOptions, type ErrorCode, ErrorCodes, EventBus, type EventMap, type FormField, type FormSummary, GeminiAdapter, GuideKitCore, type GuideKitCoreOptions, GuideKitError, type GuideKitErrorOptions, type GuideKitErrorType, type GuideKitEvent, type GuideKitOptions, type GuideKitProviderProps, type GuideKitStore, type GuideKitTheme, type HealthCheckResult, type HealthCheckStatus, I18n, type I18nOptions, type I18nStrings, InitializationError, type InteractiveElement, type LLMConfig, LLMOrchestrator, type LLMProviderAdapter, type LocaleInput, type NavItem, NavigationController, type NavigationControllerOptions, NetworkError, OpenAIAdapter, type OverlayElement, type PageModel, type PageSection, PermissionError, type ProactiveOptions, type ProactiveTrigger, ProactiveTriggerEngine, type ProactiveTriggerType, type Provider, type QueuedMessage, RateLimitError, RateLimiter, type RateLimiterOptions, type RateLimiterState, type RateLimits, type Resource, ResourceExhaustedError, ResourceManager, type ResourceManagerState, type STTConfig, type ScanMetadata, type SessionState, SingletonGuard, type SpotlightState, type SupportedLocale, type TTSConfig, type TextChunk, TimeoutError, type TokenData, TokenManager, type TokenManagerOptions, type TokenPayload, type TokenResponse, type ToolCall, type ToolCallRecord, type ToolDefinition, type ToolExecutionResult, ToolExecutor, type ToolExecutorOptions, type ToolHandler, type TooltipOptions, VisualGuidance, type VisualGuidanceOptions, 
createEventBus, isGuideKitError };
+/**
+ * Minimal type declarations for the Web Speech API SpeechRecognition
+ * interface. These cover the subset used by this adapter. Full type
+ * definitions are available in lib.dom.d.ts but may not be present in
+ * all TS configurations.
+ */
+interface SpeechRecognitionEvent {
+    readonly resultIndex: number;
+    readonly results: SpeechRecognitionResultList;
+}
+interface SpeechRecognitionResultList {
+    readonly length: number;
+    item(index: number): SpeechRecognitionResult;
+    [index: number]: SpeechRecognitionResult;
+}
+interface SpeechRecognitionResult {
+    readonly length: number;
+    readonly isFinal: boolean;
+    item(index: number): SpeechRecognitionAlternative;
+    [index: number]: SpeechRecognitionAlternative;
+}
+interface SpeechRecognitionAlternative {
+    readonly transcript: string;
+    readonly confidence: number;
+}
+interface SpeechRecognitionErrorEvent {
+    readonly error: string;
+    readonly message: string;
+}
+interface SpeechRecognitionInstance extends EventTarget {
+    lang: string;
+    continuous: boolean;
+    interimResults: boolean;
+    maxAlternatives: number;
+    onresult: ((event: SpeechRecognitionEvent) => void) | null;
+    onerror: ((event: SpeechRecognitionErrorEvent) => void) | null;
+    onend: (() => void) | null;
+    onstart: (() => void) | null;
+    start(): void;
+    stop(): void;
+    abort(): void;
+}
+interface SpeechRecognitionConstructor {
+    new (): SpeechRecognitionInstance;
+}
+declare global {
+    var webkitSpeechRecognition: SpeechRecognitionConstructor | undefined;
+}
+interface WebSpeechSTTOptions {
+    language?: string;
+    continuous?: boolean;
+    interimResults?: boolean;
+    debug?: boolean;
+}
+declare class WebSpeechSTT {
+    private readonly language;
+    private readonly continuous;
+    private readonly interimResultsEnabled;
+    private readonly debugEnabled;
+    private recognition;
+    private _connected;
+    private _suspended;
+    /**
+     * Whether we intentionally stopped recognition. Used to distinguish
+     * between intentional stop and unexpected end (for auto-restart in
+     * continuous mode).
+     */
+    private _intentionalStop;
+    /** Registered transcript callbacks. */
+    private readonly transcriptCallbacks;
+    constructor(options?: WebSpeechSTTOptions);
+    /**
+     * Check whether the Web Speech API SpeechRecognition is supported in the
+     * current environment. Safe to call in SSR (returns false).
+     */
+    static isSupported(): boolean;
+    /** Whether recognition is currently active and connected. */
+    get isConnected(): boolean;
+    /**
+     * Start speech recognition.
+     *
+     * Creates the SpeechRecognition instance and begins listening. Resolves
+     * once the recognition session has started. Rejects if the API is not
+     * supported or the browser denies permission.
+     */
+    connect(): Promise<void>;
+    /**
+     * Send audio data. No-op for Web Speech API since it captures audio
+     * directly from the microphone via the browser's internal pipeline.
+     *
+     * Provided for interface compatibility with WebSocket-based STT adapters
+     * (DeepgramSTT, ElevenLabsSTT).
+     */
+    sendAudio(_audioData: Float32Array | Int16Array): void;
+    /**
+     * Register a callback to receive transcript events.
+     *
+     * @returns An unsubscribe function. Calling it more than once is safe.
+     */
+    onTranscript(callback: (event: STTTranscriptEvent) => void): () => void;
+    /**
+     * Gracefully stop recognition.
+     *
+     * Calls `stop()` on the SpeechRecognition instance which allows it to
+     * deliver any pending final results before ending.
+     */
+    close(): void;
+    /** Force-destroy the recognition without waiting for pending results. */
+    destroy(): void;
+    /**
+     * Suspend the adapter (e.g. when the device goes offline).
+     *
+     * Stops recognition and marks the adapter as suspended so that auto-restart
+     * does not trigger.
+     */
+    suspend(): void;
+    /**
+     * Resume after a prior `suspend()`. Restarts recognition if it was
+     * running before suspension.
+     */
+    resume(): void;
+    /**
+     * Handle SpeechRecognition result events.
+     *
+     * The `results` property is a SpeechRecognitionResultList containing all
+     * results accumulated during this recognition session. We only process
+     * results from `resultIndex` onward to avoid re-emitting old results.
+     */
+    private handleResult;
+    /**
+     * Handle SpeechRecognition errors.
+     *
+     * Some errors are recoverable (e.g. `no-speech`) and some are fatal
+     * (e.g. `not-allowed`). For recoverable errors in continuous mode,
+     * recognition will auto-restart via the `onend` handler.
+     */
+    private handleError;
+    /**
+     * Emit a transcript event to all registered callbacks.
+     *
+     * Errors thrown by individual callbacks are caught and logged so one
+     * misbehaving subscriber does not prevent others from receiving the event.
+     */
+    private emitTranscript;
+    /**
+     * Resolve the SpeechRecognition constructor, with the webkit-prefixed
+     * fallback. Returns null if not available.
+     */
+    private resolveSpeechRecognition;
+    /** Reset internal state after disconnection. */
+    private cleanup;
+    /** Conditional debug logging. */
+    private log;
+}
+interface WebSpeechTTSOptions {
+    voice?: string;
+    rate?: number;
+    pitch?: number;
+    language?: string;
+    debug?: boolean;
+}
+/**
+ * Audio event compatible with the TTSAudioEvent shape used by
+ * VoicePipeline for ElevenLabs TTS. Web Speech API does not produce
+ * raw audio buffers, so we emit events with empty buffers and use
+ * isFinal to signal utterance completion.
+ */
+interface WebSpeechTTSAudioEvent {
+    audio: ArrayBuffer;
+    isFinal: boolean;
+    timestamp: number;
+}
+declare class WebSpeechTTS {
+    private readonly voiceName;
+    private readonly rate;
+    private readonly pitch;
+    private readonly language;
+    private readonly debugEnabled;
+    private _connected;
+    private _suspended;
+    /** Cached voice object resolved from voiceName. */
+    private _resolvedVoice;
+    /** Whether voices have been loaded (they load async in some browsers). */
+    private _voicesLoaded;
+    /** Registered audio-event callbacks. */
+    private readonly audioCallbacks;
+    constructor(options?: WebSpeechTTSOptions);
+    /**
+     * Check whether the Web Speech API SpeechSynthesis is supported in the
+     * current environment. Safe to call in SSR (returns false).
+     */
+    static isSupported(): boolean;
+    /** Whether the adapter is connected (ready for speech). */
+    get isConnected(): boolean;
+    /**
+     * Initialize the adapter.
+     *
+     * Loads available voices and resolves the requested voice name. Voice
+     * loading is async in some browsers (notably Chrome) so we wait for
+     * the `voiceschanged` event if needed.
+     */
+    connect(): Promise<void>;
+    /**
+     * Speak the given text using the browser's speech synthesis engine.
+     *
+     * The declared return type is `void`, so completion cannot be awaited
+     * from this call. Utterance completion is instead signalled through the
+     * audio events delivered to callbacks registered with `onAudio()`
+     * (an event with `isFinal` set), which also keeps the adapter
+     * compatible with VoicePipeline.
+     */
+    speak(text: string): void;
+    /**
+     * Flush / finalize the current utterance.
+     *
+     * No-op for Web Speech API since each speak() call is a complete
+     * utterance. Provided for interface compatibility with ElevenLabsTTS.
+     */
+    flush(): void;
+    /**
+     * Register a callback to receive audio output events.
+     *
+     * For Web Speech API, these events have empty audio buffers and are
+     * used to signal utterance start/end for VoicePipeline state management.
+     *
+     * @returns An unsubscribe function. Calling it more than once is safe.
+     */
+    onAudio(callback: (event: WebSpeechTTSAudioEvent) => void): () => void;
+    /** Stop current speech synthesis and cancel any queued utterances. */
+    stop(): void;
+    /** Gracefully close the adapter. */
+    close(): void;
+    /** Force-destroy the adapter. */
+    destroy(): void;
+    /**
+     * Suspend the adapter (e.g. when the device goes offline).
+     *
+     * Pauses any active speech synthesis and marks the adapter as suspended.
+     */
+    suspend(): void;
+    /**
+     * Resume after a prior `suspend()`.
+     */
+    resume(): void;
+    /**
+     * Load available voices from the browser.
+     *
+     * In Chrome and some other browsers, voices load asynchronously after
+     * the page loads. We wait for the `voiceschanged` event with a timeout.
+     */
+    private loadVoices;
+    /**
+     * Find a voice by name (case-insensitive partial match).
+     */
+    private findVoice;
+    /**
+     * Emit an audio event to all registered callbacks.
+     *
+     * Errors thrown by individual callbacks are caught and logged so one
+     * misbehaving subscriber does not prevent others from receiving the event.
+     */
+    private emitAudio;
+    /** Reset internal state. */
+    private cleanup;
+    /** Conditional debug logging. */
+    private log;
+}
+type VoiceState = 'idle' | 'listening' | 'processing' | 'speaking' | 'error';
+interface VoicePipelineOptions {
+    sttConfig: {
+        provider: 'deepgram';
+        apiKey: string;
+        model?: 'nova-2' | 'nova-3';
+    } | {
+        provider: 'elevenlabs';
+        apiKey: string;
+        language?: string;
+    } | {
+        provider: 'web-speech';
+        language?: string;
+        continuous?: boolean;
+        interimResults?: boolean;
+    };
+    ttsConfig: {
+        provider: 'elevenlabs';
+        apiKey: string;
+        voiceId?: string;
+        modelId?: string;
+    } | {
+        provider: 'web-speech';
+        voice?: string;
+        rate?: number;
+        pitch?: number;
+        language?: string;
+    };
+    debug?: boolean;
+}
+declare global {
+    var webkitAudioContext: typeof AudioContext | undefined;
+}
+declare class VoicePipeline {
+    private readonly _sttConfig;
+    private readonly _ttsConfig;
+    private readonly _debug;
+    private _state;
+    private _destroyed;
+    private readonly _bus;
+    private _audioContext;
+    private _mediaStream;
+    private _vad;
+    private _stt;
+    private _tts;
+    private _micSourceNode;
+    private _captureProcessor;
+    private _isForwardingToSTT;
+    private _playbackQueue;
+    private _jitterBufferTimer;
+    private _isPlaybackStarted;
+    private _nextPlaybackTime;
+    private _activeSourceNodes;
+    private _lastScheduledSource;
+    private _lastTTSEcho;
+    private _pendingLLMAbort;
+    private readonly _stateChangeCallbacks;
+    private readonly _transcriptCallbacks;
+    private _unsubVADSpeechStart;
+    private _unsubVADSpeechEnd;
+    private _unsubSTTTranscript;
+    private _unsubTTSAudio;
+    constructor(options: VoicePipelineOptions);
+    /** Current pipeline state. */
+    get state(): VoiceState;
+    /**
+     * Initialize AudioContext, VAD model, and STT/TTS connections.
+     *
+     * **Must be called in response to a user gesture** (click / tap) to
+     * satisfy browser autoplay policies.
+     */
+    init(): Promise<void>;
+    /**
+     * Start listening: activate microphone, begin VAD + STT pipeline.
+     *
+     * Valid from the `idle`, `error`, and `speaking` states (barge-in path calls this internally).
+     */
+    startListening(): Promise<void>;
+    /** Stop listening: deactivate mic and VAD. */
+    stopListening(): void;
+    /**
+     * Process a transcript through an LLM callback and speak the response.
+     *
+     * @param text - The user's transcript text.
+     * @param sendToLLM - Async callback that sends text to the LLM and returns the response.
+     */
+    processTranscript(text: string, sendToLLM: (text: string) => Promise<string>): Promise<void>;
+    /** Speak text via TTS (ElevenLabs or Web Speech API). */
+    speak(text: string): Promise<void>;
+    /** Stop current TTS playback immediately (barge-in). */
+    stopSpeaking(): void;
+    /** Subscribe to state changes. Returns an unsubscribe function. */
+    onStateChange(callback: (state: VoiceState, previous: VoiceState) => void): () => void;
+    /** Subscribe to transcript events. Returns an unsubscribe function. */
+    onTranscript(callback: (text: string, isFinal: boolean) => void): () => void;
+    /** Destroy all resources held by the pipeline. */
+    destroy(): Promise<void>;
+    private _setState;
+    /**
+     * Resolve the AudioContext constructor, with Safari webkitAudioContext
+     * fallback. Returns null if Web Audio is not available.
+     */
+    private _resolveAudioContext;
+    /**
+     * Pre-warm the AudioContext by playing a silent buffer.
+     * This forces the context into the "running" state and avoids a
+     * noticeable delay on the first real playback.
+     */
+    private _prewarmAudioContext;
+    /**
+     * Set up a ScriptProcessorNode to capture mic audio and forward it
+     * to the STT adapter when `_isForwardingToSTT` is true.
+     */
+    private _setupMicCapture;
+    /** Tear down the mic capture ScriptProcessorNode. */
+    private _teardownMicCapture;
+    /** Stop all tracks on the current MediaStream. */
+    private _stopMicTracks;
+    private _handleVADSpeechStart;
+    private _handleVADSpeechEnd;
+    private _handleTranscript;
+    /**
+     * Handle an audio chunk from ElevenLabs TTS.
+     *
+     * Implements a jitter buffer: we accumulate audio for JITTER_BUFFER_MS
+     * before starting playback to smooth out network jitter.
+     */
+    private _handleTTSAudio;
+    /** Flush the jitter buffer and start playback. */
+    private _flushJitterBuffer;
+    /**
+     * Begin playback: decode all queued chunks and schedule them.
+     * If `onDone` is provided, it is called when the last chunk finishes playing.
+     */
+    private _startPlayback;
+    /**
+     * Decode an audio chunk (mp3 from ElevenLabs) and schedule it for
+     * sequential playback via AudioBufferSourceNode.
+     */
+    private _decodeAndSchedule;
+    /**
+     * Check if VAD speech-start during SPEAKING state is likely echo from
+     * the speaker playing TTS audio rather than genuine user speech.
+     *
+     * Simple heuristic: if we are still within the echo window of a recent
+     * TTS utterance, treat it as potential echo.
+     */
+    private _isEchoDetected;
+    /**
+     * Check if a transcript is an echo of recent TTS output.
+     *
+     * Uses word overlap: if intersection of words > 60% of max set size
+     * and the transcript arrived within the echo window, discard it.
+     */
+    private _isTranscriptEcho;
+    /**
+     * Normalize text into a set of lowercase words, stripping punctuation.
+     */
+    private _normalizeWords;
+    private _log;
+}
+export { type AgentConfig, type AgentState, type AggregatedUsage, AuthenticationError, type AwarenessOptions, type AwarenessState, AwarenessSystem, type BeforeLLMCallContext, BrowserSupportError, ConfigurationError, ConnectionManager, type ConnectionManagerOptions, type ConnectionState, ContentFilterError, type ContentMap, type ContentMapEntry, type ContentMapFunction, type ContentMapInput, ContextManager, type ContextManagerOptions, type ConversationTurn, type CreateSessionTokenOptions, DOMScanner, type DOMScannerOptions, type ErrorCode, ErrorCodes, EventBus, type EventMap, type FormField, type FormSummary, GeminiAdapter, GuideKitCore, type GuideKitCoreOptions, GuideKitError, type GuideKitErrorOptions, type GuideKitErrorType, type GuideKitEvent, type GuideKitOptions, type GuideKitProviderProps, type GuideKitStore, type GuideKitTheme, type HealthCheckResult, type HealthCheckStatus, I18n, type I18nOptions, type I18nStrings, InitializationError, type InteractiveElement, type LLMConfig, LLMOrchestrator, type LLMProviderAdapter, type LocaleInput, type NavItem, NavigationController, type NavigationControllerOptions, NetworkError, type OverlayElement, type PageModel, type PageSection, PermissionError, type ProactiveOptions, type ProactiveTrigger, ProactiveTriggerEngine, type ProactiveTriggerType, type Provider, type QueuedMessage, RateLimitError, RateLimiter, type RateLimiterOptions, type RateLimiterState, type RateLimits, type Resource, ResourceExhaustedError, ResourceManager, type ResourceManagerState, type STTConfig, type ScanMetadata, type SessionState, SingletonGuard, type SpotlightState, type SupportedLocale, type TTSConfig, type TextChunk, TimeoutError, type TokenData, TokenManager, type TokenManagerOptions, type TokenPayload, type TokenResponse, type ToolCall, type ToolCallRecord, type ToolDefinition, type ToolExecutionResult, ToolExecutor, type ToolExecutorOptions, type ToolHandler, type TooltipOptions, VisualGuidance, type VisualGuidanceOptions, VoicePipeline, 
type VoicePipelineOptions, type VoiceState, WebSpeechSTT, type WebSpeechSTTOptions, WebSpeechTTS, type WebSpeechTTSAudioEvent, type WebSpeechTTSOptions, createEventBus, isGuideKitError };