npm - @tekyzinc/stt-component - Versions diffs - 0.1.0 - Mend

@tekyzinc/stt-component 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

package/dist/index.d.cts ADDED Viewed

@@ -0,0 +1,223 @@
+/**
+ * Supported Whisper model sizes.
+ * String-literal union used for the `model` field of `STTConfig` and `ResolvedSTTConfig`.
+ */
+type STTModelSize = 'tiny' | 'base' | 'small' | 'medium';
+/**
+ * Supported compute backends.
+ * `'auto'` is a preference, not a concrete backend: `STTConfig.backend` documents it as
+ * "WebGPU with WASM fallback", and `STTState.backend` is typed to report only
+ * `'webgpu'`, `'wasm'`, or `null`.
+ */
+type STTBackend = 'webgpu' | 'wasm' | 'auto';
+/**
+ * Engine lifecycle states.
+ * Surfaced to consumers through `STTState.status`.
+ * NOTE(review): the allowed transitions between these states are not visible in the
+ * declarations — confirm against the implementation before relying on an ordering.
+ */
+type STTStatus = 'idle' | 'loading' | 'ready' | 'recording' | 'processing';
+/**
+ * Correction timing configuration.
+ * All fields are optional on input; `ResolvedSTTConfig.correction` carries the same
+ * shape as `Required<STTCorrectionConfig>`. Both durations are in milliseconds,
+ * whereas `STTChunkingConfig` uses seconds.
+ */
+interface STTCorrectionConfig {
+    /** Enable mid-recording Whisper correction. Default: true */
+    enabled?: boolean;
+    /** Silence duration (ms) before triggering correction. Default: 3000 */
+    pauseThreshold?: number;
+    /** Maximum interval (ms) between forced corrections. Default: 5000 */
+    forcedInterval?: number;
+}
+/**
+ * Audio chunking configuration for long-form audio.
+ * Both values are in seconds (unlike `STTCorrectionConfig`, which uses milliseconds).
+ * All fields are optional on input; `ResolvedSTTConfig.chunking` carries the same
+ * shape as `Required<STTChunkingConfig>`.
+ * NOTE(review): no relationship between stride and chunk length is enforced by these
+ * types — confirm what the worker expects (e.g. stride smaller than chunk length).
+ */
+interface STTChunkingConfig {
+    /** Chunk length in seconds for Whisper processing. Default: 30 */
+    chunkLengthS?: number;
+    /** Stride length in seconds for overlapping chunks. Default: 5 */
+    strideLengthS?: number;
+}
+/**
+ * Full engine configuration. All fields optional — sensible defaults applied.
+ * Accepted by the `STTEngine` constructor and by `resolveConfig()`, both of which
+ * also allow the whole object to be omitted.
+ */
+interface STTConfig {
+    /** Whisper model size. Default: 'tiny' */
+    model?: STTModelSize;
+    /** Compute backend preference. Default: 'auto' (WebGPU with WASM fallback) */
+    backend?: STTBackend;
+    /**
+     * Transcription language. Default: 'en'
+     * NOTE(review): typed as a plain string; the accepted format (presumably a
+     * language code like the default) is not declared here — confirm.
+     */
+    language?: string;
+    /**
+     * Model quantization dtype. Default: 'q4'
+     * NOTE(review): typed as a plain string, so unsupported values are not rejected
+     * at compile time; the supported set is not declared here — confirm.
+     */
+    dtype?: string;
+    /** Mid-recording correction settings. */
+    correction?: STTCorrectionConfig;
+    /** Audio chunking settings for long-form audio. */
+    chunking?: STTChunkingConfig;
+}
+/**
+ * Resolved configuration with all defaults applied.
+ * Same fields as `STTConfig`, but none are optional and the nested `correction` and
+ * `chunking` objects are wrapped in `Required<...>`, so every nested field is present.
+ * This is the return type of `resolveConfig()`, the type of `DEFAULT_STT_CONFIG`, and
+ * the argument type of `WorkerManager.loadModel()`.
+ * Note that `backend` is still typed as `STTBackend`, so it may hold `'auto'`.
+ */
+interface ResolvedSTTConfig {
+    model: STTModelSize;
+    backend: STTBackend;
+    language: string;
+    dtype: string;
+    correction: Required<STTCorrectionConfig>;
+    chunking: Required<STTChunkingConfig>;
+}
+/**
+ * Engine state exposed to consumers via status events.
+ * Also returned (as `Readonly<STTState>`) by `STTEngine.getState()`.
+ * - `status`: current lifecycle state (see `STTStatus`).
+ * - `isModelLoaded`: presumably true once the model has finished loading — confirm.
+ * - `error`: a plain string or `null`; note this is not the structured `STTError`
+ *   object delivered by the 'error' event.
+ */
+interface STTState {
+    status: STTStatus;
+    isModelLoaded: boolean;
+    /** Model download progress (0–100). */
+    loadProgress: number;
+    /** Active compute backend, or null if not yet determined. */
+    backend: 'webgpu' | 'wasm' | null;
+    error: string | null;
+}
+/**
+ * Structured error emitted via the 'error' event.
+ * - `code`: identifier for the failure. NOTE(review): typed as a plain string; the set
+ *   of possible codes is not declared here — confirm before switching on it.
+ * - `message`: description of the failure as a string.
+ */
+interface STTError {
+    code: string;
+    message: string;
+}
+/**
+ * Event map for the typed event emitter.
+ * Used as the type argument of `TypedEventEmitter` in `STTEngine`, so these keys are
+ * the event names accepted by `engine.on()` / `engine.off()` and each value is the
+ * listener signature for that event.
+ */
+type STTEvents = {
+    /** Streaming interim text during recording. */
+    transcript: (text: string) => void;
+    /** Whisper-corrected text replacing interim text. */
+    correction: (text: string) => void;
+    /** Actionable error (mic denied, model fail, transcription fail). */
+    error: (error: STTError) => void;
+    /** Engine state change. */
+    status: (state: STTState) => void;
+};
+/**
+ * Handle returned by audio capture — used internally.
+ * Produced by `startCapture()` and consumed by `snapshotAudio()` and `stopCapture()`.
+ * - `audioCtx`: the Web Audio context associated with the capture.
+ * - `stream`: the `MediaStream` associated with the capture.
+ * - `samples`: the collected audio as a list of `Float32Array` chunks.
+ * NOTE(review): `ScriptProcessorNode` is marked deprecated in the Web Audio API in
+ * favour of AudioWorklet — worth keeping in mind if this handle's shape is revisited.
+ */
+interface AudioCaptureHandle {
+    audioCtx: AudioContext;
+    stream: MediaStream;
+    samples: Float32Array[];
+    /** Retain reference to prevent GC from stopping audio processing. */
+    _processor: ScriptProcessorNode;
+}
+/**
+ * Default configuration values.
+ * The concrete values are not visible in this declaration; the field docs on
+ * `STTConfig`, `STTCorrectionConfig` and `STTChunkingConfig` list the documented
+ * defaults (e.g. model 'tiny', backend 'auto', language 'en', dtype 'q4').
+ * NOTE(review): `ResolvedSTTConfig` has no `readonly` fields, so the type permits
+ * mutation of this shared object — confirm whether it is frozen at runtime.
+ */
+declare const DEFAULT_STT_CONFIG: ResolvedSTTConfig;
+/**
+ * Merge user config with defaults to produce resolved config.
+ * @param config - Optional partial configuration; may be omitted entirely.
+ * @returns A `ResolvedSTTConfig` in which every field, including the nested
+ *   `correction` and `chunking` fields, is present.
+ * NOTE(review): how partially specified nested objects are merged is not visible
+ * in this declaration — confirm against the implementation.
+ */
+declare function resolveConfig(config?: STTConfig): ResolvedSTTConfig;
+/**
+ * A generic, typed event emitter.
+ *
+ * Type parameter `T` is a map of event names to listener signatures,
+ * giving consumers compile-time safety on event names and callback args.
+ *
+ * `WorkerManager` and `STTEngine` both extend this class, so the four public
+ * methods below (including `emit`) are part of their public surface as well.
+ */
+declare class TypedEventEmitter<T extends Record<string, (...args: any[]) => void>> {
+    private listeners;
+    /**
+     * Subscribe to an event.
+     * Returns `void`, so there is no unsubscribe handle: keep the listener
+     * reference and pass it to `off()` to remove it.
+     * @param event - Event name; must be a key of `T`.
+     * @param listener - Callback whose signature is `T[K]`.
+     */
+    on<K extends keyof T>(event: K, listener: T[K]): void;
+    /**
+     * Unsubscribe a specific listener. No-op if not registered.
+     * @param event - Event name the listener was registered under.
+     * @param listener - The listener to remove.
+     */
+    off<K extends keyof T>(event: K, listener: T[K]): void;
+    /**
+     * Emit an event, calling all registered listeners in insertion order.
+     * @param event - Event name; must be a key of `T`.
+     * @param args - Arguments for the listeners, typed as `Parameters<T[K]>`.
+     */
+    emit<K extends keyof T>(event: K, ...args: Parameters<T[K]>): void;
+    /**
+     * Remove all listeners, optionally for a single event.
+     * @param event - When given, only that event's listeners are removed.
+     */
+    removeAllListeners(event?: keyof T): void;
+}
+/**
+ * Start capturing raw PCM audio from the microphone.
+ * Uses ScriptProcessorNode to collect Float32Array samples directly.
+ *
+ * Takes no parameters, so this signature offers no way to pass device or
+ * constraint options.
+ * @returns A promise for the `AudioCaptureHandle` describing the active capture.
+ * NOTE(review): presumably rejects when microphone access is denied (the 'error'
+ * event docs mention "mic denied") — confirm against the implementation.
+ */
+declare function startCapture(): Promise<AudioCaptureHandle>;
+/**
+ * Copy current audio buffer without stopping capture.
+ * Returns a shallow copy of the samples array (each chunk is shared, not cloned).
+ *
+ * Because the chunks are shared, writing into a returned `Float32Array` would
+ * also change the data held by the capture handle.
+ * @param capture - Handle obtained from `startCapture()`.
+ * @returns A new array containing the chunks collected so far.
+ */
+declare function snapshotAudio(capture: AudioCaptureHandle): Float32Array[];
+/**
+ * Concatenate sample chunks and resample to 16kHz for Whisper.
+ * @param samples - Audio chunks, e.g. `AudioCaptureHandle.samples` or the result
+ *   of `snapshotAudio()`.
+ * @param nativeSr - Sample rate of the input chunks (presumably in Hz, e.g. the
+ *   capturing `AudioContext`'s rate — TODO confirm).
+ * @returns A promise for a single `Float32Array` holding the resampled audio.
+ */
+declare function resampleAudio(samples: Float32Array[], nativeSr: number): Promise<Float32Array>;
+/**
+ * Stop capturing and return resampled audio at 16kHz.
+ * @param capture - Handle obtained from `startCapture()`.
+ * @returns A promise for the captured audio as a single `Float32Array`.
+ * NOTE(review): which resources on the handle (stream tracks, audio context) are
+ * released is not visible in this declaration — confirm before reusing a handle.
+ */
+declare function stopCapture(capture: AudioCaptureHandle): Promise<Float32Array>;
+/**
+ * Events emitted by the WorkerManager.
+ * - `progress`: a numeric percentage (presumably 0–100, matching
+ *   `STTState.loadProgress` — TODO confirm).
+ * - `ready`: carries no arguments.
+ * - `result`: carries text as a string.
+ * - `error`: carries a plain string message, unlike `STTEvents.error`, which
+ *   carries a structured `STTError`.
+ */
+type WorkerManagerEvents = {
+    progress: (percent: number) => void;
+    ready: () => void;
+    result: (text: string) => void;
+    error: (message: string) => void;
+};
+/**
+ * Manages the Whisper Web Worker lifecycle.
+ * Provides typed message passing and a promise-based transcription API.
+ *
+ * Extends `TypedEventEmitter<WorkerManagerEvents>`, so `progress`, `ready`,
+ * `result` and `error` can be observed with `on()` / `off()`.
+ * NOTE(review): there is a single private `transcribeResolve` slot, which suggests
+ * only one transcription is tracked at a time — confirm before issuing
+ * overlapping `transcribe()` calls.
+ */
+declare class WorkerManager extends TypedEventEmitter<WorkerManagerEvents> {
+    private worker;
+    private transcribeResolve;
+    private modelReadyResolve;
+    private modelReadyReject;
+    /**
+     * Spawn the Web Worker. Must be called before loadModel/transcribe.
+     * @param workerUrl - Optional URL of the worker script.
+     * NOTE(review): the script used when `workerUrl` is omitted is not visible here.
+     */
+    spawn(workerUrl?: URL): void;
+    /**
+     * Load the Whisper model in the worker. Resolves when ready.
+     * @param config - Fully resolved configuration.
+     * NOTE(review): presumably rejects when loading fails (a private
+     * `modelReadyReject` slot exists) — confirm.
+     */
+    loadModel(config: ResolvedSTTConfig): Promise<void>;
+    /**
+     * Send audio to the worker for transcription. Resolves with text.
+     * @param audio - Audio samples; presumably 16kHz audio as produced by
+     *   `resampleAudio()` / `stopCapture()` — TODO confirm.
+     */
+    transcribe(audio: Float32Array): Promise<string>;
+    /**
+     * Cancel any in-flight transcription.
+     * NOTE(review): how a pending `transcribe()` promise settles after a cancel is
+     * not visible in this declaration.
+     */
+    cancel(): void;
+    /** Terminate the worker and release resources. */
+    destroy(): void;
+    private handleMessage;
+}
+/**
+ * Manages mid-recording correction timing.
+ * Two triggers: pause detection and forced interval.
+ *
+ * The orchestrator does not perform the correction itself: it invokes the
+ * zero-argument callback registered through `setCorrectionFn()`.
+ */
+declare class CorrectionOrchestrator {
+    private forcedTimer;
+    private lastCorrectionTime;
+    private correctionFn;
+    private config;
+    /**
+     * Create a new correction orchestrator with the given timing config.
+     * @param config - The resolved correction settings, i.e.
+     *   `Required<STTCorrectionConfig>` (`enabled`, `pauseThreshold`, `forcedInterval`).
+     */
+    constructor(config: ResolvedSTTConfig['correction']);
+    /**
+     * Set the function to call when a correction is triggered.
+     * @param fn - Callback taking no arguments; typed as returning `void`.
+     */
+    setCorrectionFn(fn: () => void): void;
+    /** Start the correction orchestrator (begin forced interval timer). */
+    start(): void;
+    /** Stop the orchestrator (clear all timers). */
+    stop(): void;
+    /**
+     * Called when a speech pause is detected. Triggers correction if cooldown elapsed.
+     * NOTE(review): the cooldown length is not visible here — presumably derived
+     * from the timing config; confirm against the implementation.
+     */
+    onPauseDetected(): void;
+    /** Force a correction now (resets timer). */
+    forceCorrection(): void;
+    private triggerCorrection;
+    private startForcedTimer;
+    private stopForcedTimer;
+    private restartForcedTimer;
+}
+/**
+ * Main STT engine — the public API for speech-to-text with Whisper correction.
+ *
+ * Extends `TypedEventEmitter<STTEvents>`, so `transcript`, `correction`, `error`
+ * and `status` can be observed with `on()` / `off()`.
+ *
+ * Usage:
+ * ```typescript
+ * const engine = new STTEngine({ model: 'tiny' });
+ * engine.on('transcript', (text) => console.log(text));
+ * engine.on('correction', (text) => console.log('corrected:', text));
+ * await engine.init();
+ * await engine.start();
+ * const finalText = await engine.stop();
+ * ```
+ */
+declare class STTEngine extends TypedEventEmitter<STTEvents> {
+    private config;
+    private workerManager;
+    private correctionOrchestrator;
+    private capture;
+    private state;
+    private workerUrl?;
+    /**
+     * Create a new STT engine instance.
+     * @param config - Optional configuration overrides (model, backend, language, etc.).
+     * @param workerUrl - Optional custom URL for the Whisper Web Worker script.
+     */
+    constructor(config?: STTConfig, workerUrl?: URL);
+    /**
+     * Initialize the engine: spawn worker and load model.
+     * The usage example above awaits this before calling `start()`.
+     * @returns A promise that settles when initialization has finished.
+     */
+    init(): Promise<void>;
+    /**
+     * Start recording audio and enable correction cycles.
+     * @returns A promise that settles once recording has been started.
+     */
+    start(): Promise<void>;
+    /**
+     * Stop recording, run final transcription, return text.
+     * @returns A promise for the final transcribed text.
+     */
+    stop(): Promise<string>;
+    /** Destroy the engine: terminate worker, release all resources. */
+    destroy(): void;
+    /**
+     * Get current engine state.
+     * NOTE(review): `Readonly` is a type-level restriction only; whether a copy or
+     * the live internal state object is returned is not visible here.
+     */
+    getState(): Readonly<STTState>;
+    /**
+     * Notify the correction orchestrator of a speech pause.
+     * NOTE(review): no pause detector is declared in this API, so callers
+     * presumably detect pauses themselves and report them here — confirm.
+     */
+    notifyPause(): void;
+    private performCorrection;
+    private setupWorkerListeners;
+    private updateStatus;
+    private emitError;
+}
+export { type AudioCaptureHandle, CorrectionOrchestrator, DEFAULT_STT_CONFIG, type ResolvedSTTConfig, type STTBackend, type STTChunkingConfig, type STTConfig, type STTCorrectionConfig, STTEngine, type STTError, type STTEvents, type STTModelSize, type STTState, type STTStatus, TypedEventEmitter, WorkerManager, type WorkerManagerEvents, resampleAudio, resolveConfig, snapshotAudio, startCapture, stopCapture };

package/dist/index.d.ts ADDED Viewed

@@ -0,0 +1,223 @@
+/**
+ * Supported Whisper model sizes.
+ * String-literal union used for the `model` field of `STTConfig` and `ResolvedSTTConfig`.
+ */
+type STTModelSize = 'tiny' | 'base' | 'small' | 'medium';
+/**
+ * Supported compute backends.
+ * `'auto'` is a preference, not a concrete backend: `STTConfig.backend` documents it as
+ * "WebGPU with WASM fallback", and `STTState.backend` is typed to report only
+ * `'webgpu'`, `'wasm'`, or `null`.
+ */
+type STTBackend = 'webgpu' | 'wasm' | 'auto';
+/**
+ * Engine lifecycle states.
+ * Surfaced to consumers through `STTState.status`.
+ * NOTE(review): the allowed transitions between these states are not visible in the
+ * declarations — confirm against the implementation before relying on an ordering.
+ */
+type STTStatus = 'idle' | 'loading' | 'ready' | 'recording' | 'processing';
+/**
+ * Correction timing configuration.
+ * All fields are optional on input; `ResolvedSTTConfig.correction` carries the same
+ * shape as `Required<STTCorrectionConfig>`. Both durations are in milliseconds,
+ * whereas `STTChunkingConfig` uses seconds.
+ */
+interface STTCorrectionConfig {
+    /** Enable mid-recording Whisper correction. Default: true */
+    enabled?: boolean;
+    /** Silence duration (ms) before triggering correction. Default: 3000 */
+    pauseThreshold?: number;
+    /** Maximum interval (ms) between forced corrections. Default: 5000 */
+    forcedInterval?: number;
+}
+/**
+ * Audio chunking configuration for long-form audio.
+ * Both values are in seconds (unlike `STTCorrectionConfig`, which uses milliseconds).
+ * All fields are optional on input; `ResolvedSTTConfig.chunking` carries the same
+ * shape as `Required<STTChunkingConfig>`.
+ * NOTE(review): no relationship between stride and chunk length is enforced by these
+ * types — confirm what the worker expects (e.g. stride smaller than chunk length).
+ */
+interface STTChunkingConfig {
+    /** Chunk length in seconds for Whisper processing. Default: 30 */
+    chunkLengthS?: number;
+    /** Stride length in seconds for overlapping chunks. Default: 5 */
+    strideLengthS?: number;
+}
+/**
+ * Full engine configuration. All fields optional — sensible defaults applied.
+ * Accepted by the `STTEngine` constructor and by `resolveConfig()`, both of which
+ * also allow the whole object to be omitted.
+ */
+interface STTConfig {
+    /** Whisper model size. Default: 'tiny' */
+    model?: STTModelSize;
+    /** Compute backend preference. Default: 'auto' (WebGPU with WASM fallback) */
+    backend?: STTBackend;
+    /**
+     * Transcription language. Default: 'en'
+     * NOTE(review): typed as a plain string; the accepted format (presumably a
+     * language code like the default) is not declared here — confirm.
+     */
+    language?: string;
+    /**
+     * Model quantization dtype. Default: 'q4'
+     * NOTE(review): typed as a plain string, so unsupported values are not rejected
+     * at compile time; the supported set is not declared here — confirm.
+     */
+    dtype?: string;
+    /** Mid-recording correction settings. */
+    correction?: STTCorrectionConfig;
+    /** Audio chunking settings for long-form audio. */
+    chunking?: STTChunkingConfig;
+}
+/**
+ * Resolved configuration with all defaults applied.
+ * Same fields as `STTConfig`, but none are optional and the nested `correction` and
+ * `chunking` objects are wrapped in `Required<...>`, so every nested field is present.
+ * This is the return type of `resolveConfig()`, the type of `DEFAULT_STT_CONFIG`, and
+ * the argument type of `WorkerManager.loadModel()`.
+ * Note that `backend` is still typed as `STTBackend`, so it may hold `'auto'`.
+ */
+interface ResolvedSTTConfig {
+    model: STTModelSize;
+    backend: STTBackend;
+    language: string;
+    dtype: string;
+    correction: Required<STTCorrectionConfig>;
+    chunking: Required<STTChunkingConfig>;
+}
+/**
+ * Engine state exposed to consumers via status events.
+ * Also returned (as `Readonly<STTState>`) by `STTEngine.getState()`.
+ * - `status`: current lifecycle state (see `STTStatus`).
+ * - `isModelLoaded`: presumably true once the model has finished loading — confirm.
+ * - `error`: a plain string or `null`; note this is not the structured `STTError`
+ *   object delivered by the 'error' event.
+ */
+interface STTState {
+    status: STTStatus;
+    isModelLoaded: boolean;
+    /** Model download progress (0–100). */
+    loadProgress: number;
+    /** Active compute backend, or null if not yet determined. */
+    backend: 'webgpu' | 'wasm' | null;
+    error: string | null;
+}
+/**
+ * Structured error emitted via the 'error' event.
+ * - `code`: identifier for the failure. NOTE(review): typed as a plain string; the set
+ *   of possible codes is not declared here — confirm before switching on it.
+ * - `message`: description of the failure as a string.
+ */
+interface STTError {
+    code: string;
+    message: string;
+}
+/**
+ * Event map for the typed event emitter.
+ * Used as the type argument of `TypedEventEmitter` in `STTEngine`, so these keys are
+ * the event names accepted by `engine.on()` / `engine.off()` and each value is the
+ * listener signature for that event.
+ */
+type STTEvents = {
+    /** Streaming interim text during recording. */
+    transcript: (text: string) => void;
+    /** Whisper-corrected text replacing interim text. */
+    correction: (text: string) => void;
+    /** Actionable error (mic denied, model fail, transcription fail). */
+    error: (error: STTError) => void;
+    /** Engine state change. */
+    status: (state: STTState) => void;
+};
+/**
+ * Handle returned by audio capture — used internally.
+ * Produced by `startCapture()` and consumed by `snapshotAudio()` and `stopCapture()`.
+ * - `audioCtx`: the Web Audio context associated with the capture.
+ * - `stream`: the `MediaStream` associated with the capture.
+ * - `samples`: the collected audio as a list of `Float32Array` chunks.
+ * NOTE(review): `ScriptProcessorNode` is marked deprecated in the Web Audio API in
+ * favour of AudioWorklet — worth keeping in mind if this handle's shape is revisited.
+ */
+interface AudioCaptureHandle {
+    audioCtx: AudioContext;
+    stream: MediaStream;
+    samples: Float32Array[];
+    /** Retain reference to prevent GC from stopping audio processing. */
+    _processor: ScriptProcessorNode;
+}
+/**
+ * Default configuration values.
+ * The concrete values are not visible in this declaration; the field docs on
+ * `STTConfig`, `STTCorrectionConfig` and `STTChunkingConfig` list the documented
+ * defaults (e.g. model 'tiny', backend 'auto', language 'en', dtype 'q4').
+ * NOTE(review): `ResolvedSTTConfig` has no `readonly` fields, so the type permits
+ * mutation of this shared object — confirm whether it is frozen at runtime.
+ */
+declare const DEFAULT_STT_CONFIG: ResolvedSTTConfig;
+/**
+ * Merge user config with defaults to produce resolved config.
+ * @param config - Optional partial configuration; may be omitted entirely.
+ * @returns A `ResolvedSTTConfig` in which every field, including the nested
+ *   `correction` and `chunking` fields, is present.
+ * NOTE(review): how partially specified nested objects are merged is not visible
+ * in this declaration — confirm against the implementation.
+ */
+declare function resolveConfig(config?: STTConfig): ResolvedSTTConfig;
+/**
+ * A generic, typed event emitter.
+ *
+ * Type parameter `T` is a map of event names to listener signatures,
+ * giving consumers compile-time safety on event names and callback args.
+ *
+ * `WorkerManager` and `STTEngine` both extend this class, so the four public
+ * methods below (including `emit`) are part of their public surface as well.
+ */
+declare class TypedEventEmitter<T extends Record<string, (...args: any[]) => void>> {
+    private listeners;
+    /**
+     * Subscribe to an event.
+     * Returns `void`, so there is no unsubscribe handle: keep the listener
+     * reference and pass it to `off()` to remove it.
+     * @param event - Event name; must be a key of `T`.
+     * @param listener - Callback whose signature is `T[K]`.
+     */
+    on<K extends keyof T>(event: K, listener: T[K]): void;
+    /**
+     * Unsubscribe a specific listener. No-op if not registered.
+     * @param event - Event name the listener was registered under.
+     * @param listener - The listener to remove.
+     */
+    off<K extends keyof T>(event: K, listener: T[K]): void;
+    /**
+     * Emit an event, calling all registered listeners in insertion order.
+     * @param event - Event name; must be a key of `T`.
+     * @param args - Arguments for the listeners, typed as `Parameters<T[K]>`.
+     */
+    emit<K extends keyof T>(event: K, ...args: Parameters<T[K]>): void;
+    /**
+     * Remove all listeners, optionally for a single event.
+     * @param event - When given, only that event's listeners are removed.
+     */
+    removeAllListeners(event?: keyof T): void;
+}
+/**
+ * Start capturing raw PCM audio from the microphone.
+ * Uses ScriptProcessorNode to collect Float32Array samples directly.
+ *
+ * Takes no parameters, so this signature offers no way to pass device or
+ * constraint options.
+ * @returns A promise for the `AudioCaptureHandle` describing the active capture.
+ * NOTE(review): presumably rejects when microphone access is denied (the 'error'
+ * event docs mention "mic denied") — confirm against the implementation.
+ */
+declare function startCapture(): Promise<AudioCaptureHandle>;
+/**
+ * Copy current audio buffer without stopping capture.
+ * Returns a shallow copy of the samples array (each chunk is shared, not cloned).
+ *
+ * Because the chunks are shared, writing into a returned `Float32Array` would
+ * also change the data held by the capture handle.
+ * @param capture - Handle obtained from `startCapture()`.
+ * @returns A new array containing the chunks collected so far.
+ */
+declare function snapshotAudio(capture: AudioCaptureHandle): Float32Array[];
+/**
+ * Concatenate sample chunks and resample to 16kHz for Whisper.
+ * @param samples - Audio chunks, e.g. `AudioCaptureHandle.samples` or the result
+ *   of `snapshotAudio()`.
+ * @param nativeSr - Sample rate of the input chunks (presumably in Hz, e.g. the
+ *   capturing `AudioContext`'s rate — TODO confirm).
+ * @returns A promise for a single `Float32Array` holding the resampled audio.
+ */
+declare function resampleAudio(samples: Float32Array[], nativeSr: number): Promise<Float32Array>;
+/**
+ * Stop capturing and return resampled audio at 16kHz.
+ * @param capture - Handle obtained from `startCapture()`.
+ * @returns A promise for the captured audio as a single `Float32Array`.
+ * NOTE(review): which resources on the handle (stream tracks, audio context) are
+ * released is not visible in this declaration — confirm before reusing a handle.
+ */
+declare function stopCapture(capture: AudioCaptureHandle): Promise<Float32Array>;
+/**
+ * Events emitted by the WorkerManager.
+ * - `progress`: a numeric percentage (presumably 0–100, matching
+ *   `STTState.loadProgress` — TODO confirm).
+ * - `ready`: carries no arguments.
+ * - `result`: carries text as a string.
+ * - `error`: carries a plain string message, unlike `STTEvents.error`, which
+ *   carries a structured `STTError`.
+ */
+type WorkerManagerEvents = {
+    progress: (percent: number) => void;
+    ready: () => void;
+    result: (text: string) => void;
+    error: (message: string) => void;
+};
+/**
+ * Manages the Whisper Web Worker lifecycle.
+ * Provides typed message passing and a promise-based transcription API.
+ *
+ * Extends `TypedEventEmitter<WorkerManagerEvents>`, so `progress`, `ready`,
+ * `result` and `error` can be observed with `on()` / `off()`.
+ * NOTE(review): there is a single private `transcribeResolve` slot, which suggests
+ * only one transcription is tracked at a time — confirm before issuing
+ * overlapping `transcribe()` calls.
+ */
+declare class WorkerManager extends TypedEventEmitter<WorkerManagerEvents> {
+    private worker;
+    private transcribeResolve;
+    private modelReadyResolve;
+    private modelReadyReject;
+    /**
+     * Spawn the Web Worker. Must be called before loadModel/transcribe.
+     * @param workerUrl - Optional URL of the worker script.
+     * NOTE(review): the script used when `workerUrl` is omitted is not visible here.
+     */
+    spawn(workerUrl?: URL): void;
+    /**
+     * Load the Whisper model in the worker. Resolves when ready.
+     * @param config - Fully resolved configuration.
+     * NOTE(review): presumably rejects when loading fails (a private
+     * `modelReadyReject` slot exists) — confirm.
+     */
+    loadModel(config: ResolvedSTTConfig): Promise<void>;
+    /**
+     * Send audio to the worker for transcription. Resolves with text.
+     * @param audio - Audio samples; presumably 16kHz audio as produced by
+     *   `resampleAudio()` / `stopCapture()` — TODO confirm.
+     */
+    transcribe(audio: Float32Array): Promise<string>;
+    /**
+     * Cancel any in-flight transcription.
+     * NOTE(review): how a pending `transcribe()` promise settles after a cancel is
+     * not visible in this declaration.
+     */
+    cancel(): void;
+    /** Terminate the worker and release resources. */
+    destroy(): void;
+    private handleMessage;
+}
+/**
+ * Manages mid-recording correction timing.
+ * Two triggers: pause detection and forced interval.
+ *
+ * The orchestrator does not perform the correction itself: it invokes the
+ * zero-argument callback registered through `setCorrectionFn()`.
+ */
+declare class CorrectionOrchestrator {
+    private forcedTimer;
+    private lastCorrectionTime;
+    private correctionFn;
+    private config;
+    /**
+     * Create a new correction orchestrator with the given timing config.
+     * @param config - The resolved correction settings, i.e.
+     *   `Required<STTCorrectionConfig>` (`enabled`, `pauseThreshold`, `forcedInterval`).
+     */
+    constructor(config: ResolvedSTTConfig['correction']);
+    /**
+     * Set the function to call when a correction is triggered.
+     * @param fn - Callback taking no arguments; typed as returning `void`.
+     */
+    setCorrectionFn(fn: () => void): void;
+    /** Start the correction orchestrator (begin forced interval timer). */
+    start(): void;
+    /** Stop the orchestrator (clear all timers). */
+    stop(): void;
+    /**
+     * Called when a speech pause is detected. Triggers correction if cooldown elapsed.
+     * NOTE(review): the cooldown length is not visible here — presumably derived
+     * from the timing config; confirm against the implementation.
+     */
+    onPauseDetected(): void;
+    /** Force a correction now (resets timer). */
+    forceCorrection(): void;
+    private triggerCorrection;
+    private startForcedTimer;
+    private stopForcedTimer;
+    private restartForcedTimer;
+}
+/**
+ * Main STT engine — the public API for speech-to-text with Whisper correction.
+ *
+ * Extends `TypedEventEmitter<STTEvents>`, so `transcript`, `correction`, `error`
+ * and `status` can be observed with `on()` / `off()`.
+ *
+ * Usage:
+ * ```typescript
+ * const engine = new STTEngine({ model: 'tiny' });
+ * engine.on('transcript', (text) => console.log(text));
+ * engine.on('correction', (text) => console.log('corrected:', text));
+ * await engine.init();
+ * await engine.start();
+ * const finalText = await engine.stop();
+ * ```
+ */
+declare class STTEngine extends TypedEventEmitter<STTEvents> {
+    private config;
+    private workerManager;
+    private correctionOrchestrator;
+    private capture;
+    private state;
+    private workerUrl?;
+    /**
+     * Create a new STT engine instance.
+     * @param config - Optional configuration overrides (model, backend, language, etc.).
+     * @param workerUrl - Optional custom URL for the Whisper Web Worker script.
+     */
+    constructor(config?: STTConfig, workerUrl?: URL);
+    /**
+     * Initialize the engine: spawn worker and load model.
+     * The usage example above awaits this before calling `start()`.
+     * @returns A promise that settles when initialization has finished.
+     */
+    init(): Promise<void>;
+    /**
+     * Start recording audio and enable correction cycles.
+     * @returns A promise that settles once recording has been started.
+     */
+    start(): Promise<void>;
+    /**
+     * Stop recording, run final transcription, return text.
+     * @returns A promise for the final transcribed text.
+     */
+    stop(): Promise<string>;
+    /** Destroy the engine: terminate worker, release all resources. */
+    destroy(): void;
+    /**
+     * Get current engine state.
+     * NOTE(review): `Readonly` is a type-level restriction only; whether a copy or
+     * the live internal state object is returned is not visible here.
+     */
+    getState(): Readonly<STTState>;
+    /**
+     * Notify the correction orchestrator of a speech pause.
+     * NOTE(review): no pause detector is declared in this API, so callers
+     * presumably detect pauses themselves and report them here — confirm.
+     */
+    notifyPause(): void;
+    private performCorrection;
+    private setupWorkerListeners;
+    private updateStatus;
+    private emitError;
+}
+export { type AudioCaptureHandle, CorrectionOrchestrator, DEFAULT_STT_CONFIG, type ResolvedSTTConfig, type STTBackend, type STTChunkingConfig, type STTConfig, type STTCorrectionConfig, STTEngine, type STTError, type STTEvents, type STTModelSize, type STTState, type STTStatus, TypedEventEmitter, WorkerManager, type WorkerManagerEvents, resampleAudio, resolveConfig, snapshotAudio, startCapture, stopCapture };