npm - whisper.rn - Versions diffs - 0.5.0-rc.1 → 0.5.0-rc.3 - Mend

whisper.rn 0.5.0-rc.1 → 0.5.0-rc.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (106) hide show

package/src/realtime-transcription/adapters/SimulateFileAudioStreamAdapter.ts ADDED Viewed

@@ -0,0 +1,378 @@
+import type {
+  AudioStreamInterface,
+  AudioStreamConfig,
+  AudioStreamData,
+} from '../types'
+import { WavFileReader, WavFileReaderFs } from '../../utils/WavFileReader'
+/**
+ * Construction options for SimulateFileAudioStreamAdapter.
+ */
+export interface SimulateFileOptions {
+  /** File-system implementation handed to WavFileReader. */
+  fs: WavFileReaderFs
+  /** Path of the WAV file to replay. */
+  filePath: string
+  /** Default: 1.0 (real-time), 0.5 (half speed), 2.0 (double speed) */
+  playbackSpeed?: number
+  /** Default: 100ms chunks */
+  chunkDurationMs?: number
+  /** Default: false */
+  loop?: boolean
+  /** Callback when end of file is reached */
+  onEndOfFile?: () => void
+  /** Default: noop - custom logger function */
+  logger?: (message: string) => void
+}
+/**
+ * Audio stream adapter that replays a WAV file through the AudioStreamInterface
+ * contract, emitting fixed-duration chunks on a timer.
+ *
+ * Emitted chunks carry the file's own sample rate and channel count; a mismatch
+ * with the requested AudioStreamConfig is only logged, never converted.
+ */
+export class SimulateFileAudioStreamAdapter implements AudioStreamInterface {
+  private fileReader: WavFileReader
+  private config: AudioStreamConfig | null = null
+  private options: SimulateFileOptions
+  private isInitialized = false
+  private recording = false
+  private dataCallback?: (data: AudioStreamData) => void
+  private errorCallback?: (error: string) => void
+  private statusCallback?: (isRecording: boolean) => void
+  private streamInterval?: ReturnType<typeof setInterval>
+  // Offset into the audio data of the next chunk to emit
+  private currentBytePosition = 0
+  // Wall-clock bookkeeping maintained across start/stop/seek
+  private startTime = 0
+  private pausedTime = 0
+  private hasReachedEnd = false
+  /**
+   * @param options File location, fs implementation and playback behaviour.
+   *   Unspecified fields default to 1.0x speed, 100ms chunks, no looping and a
+   *   no-op logger.
+   */
+  constructor(options: SimulateFileOptions) {
+    this.options = {
+      playbackSpeed: 1.0,
+      chunkDurationMs: 100,
+      loop: false,
+      logger: () => {},
+      ...options,
+    }
+    this.fileReader = new WavFileReader(this.options.fs, this.options.filePath)
+  }
+  /**
+   * Load the WAV file and remember the requested stream configuration.
+   * Re-initializing an already initialized adapter releases it first.
+   *
+   * @throws Error when the file reader fails or exposes no header; the error
+   *   callback is invoked with the underlying message before throwing.
+   */
+  async initialize(config: AudioStreamConfig): Promise<void> {
+    if (this.isInitialized) {
+      await this.release()
+    }
+    try {
+      this.config = config
+      // Initialize the WAV file reader
+      await this.fileReader.initialize()
+      // Validate file format matches config
+      const header = this.fileReader.getHeader()
+      if (!header) {
+        throw new Error('Failed to read WAV file header')
+      }
+      // Warn about mismatched formats but allow processing
+      if (header.sampleRate !== config.sampleRate) {
+        this.log(
+          `WAV file sample rate (${header.sampleRate}Hz) differs from config (${config.sampleRate}Hz)`,
+        )
+      }
+      if (header.channels !== config.channels) {
+        this.log(
+          `WAV file channels (${header.channels}) differs from config (${config.channels})`,
+        )
+      }
+      if (header.bitsPerSample !== config.bitsPerSample) {
+        this.log(
+          `WAV file bits per sample (${header.bitsPerSample}) differs from config (${config.bitsPerSample})`,
+        )
+      }
+      this.isInitialized = true
+      this.log(
+        `Simulate audio stream initialized: ${header.duration.toFixed(2)}s at ${
+          this.options.playbackSpeed
+        }x speed`,
+      )
+    } catch (error) {
+      const errorMessage =
+        error instanceof Error ? error.message : 'Unknown initialization error'
+      this.errorCallback?.(errorMessage)
+      throw new Error(
+        `Failed to initialize SimulateFileAudioStreamAdapter: ${errorMessage}`,
+      )
+    }
+  }
+  /**
+   * Begin (or resume) emitting chunks from the current byte position.
+   * No-op when already recording.
+   *
+   * @throws Error when the adapter has not been initialized.
+   */
+  async start(): Promise<void> {
+    if (!this.isInitialized || !this.config) {
+      throw new Error('Adapter not initialized')
+    }
+    if (this.recording) {
+      return
+    }
+    try {
+      this.recording = true
+      this.hasReachedEnd = false
+      this.startTime = Date.now() - this.pausedTime
+      this.statusCallback?.(true)
+      // Start streaming chunks
+      this.startStreaming()
+      this.log('File audio simulation started')
+    } catch (error) {
+      this.recording = false
+      this.statusCallback?.(false)
+      const errorMessage =
+        error instanceof Error ? error.message : 'Unknown start error'
+      this.errorCallback?.(errorMessage)
+      throw error
+    }
+  }
+  /**
+   * Pause emission, keeping the current byte position so that start() resumes
+   * where playback left off. No-op when not recording. Failures are reported
+   * through the error callback instead of being thrown.
+   */
+  async stop(): Promise<void> {
+    if (!this.recording) {
+      return
+    }
+    try {
+      this.recording = false
+      this.pausedTime = Date.now() - this.startTime
+      // Stop the streaming interval
+      if (this.streamInterval) {
+        clearInterval(this.streamInterval)
+        this.streamInterval = undefined
+      }
+      this.statusCallback?.(false)
+      this.log('File audio simulation stopped')
+    } catch (error) {
+      const errorMessage =
+        error instanceof Error ? error.message : 'Unknown stop error'
+      this.errorCallback?.(errorMessage)
+    }
+  }
+  /** Whether chunks are currently being emitted. */
+  isRecording(): boolean {
+    return this.recording
+  }
+  /** Register the receiver of audio chunks (replaces any previous one). */
+  onData(callback: (data: AudioStreamData) => void): void {
+    this.dataCallback = callback
+  }
+  /** Register the receiver of error messages (replaces any previous one). */
+  onError(callback: (error: string) => void): void {
+    this.errorCallback = callback
+  }
+  /** Register the receiver of recording-state changes (replaces any previous one). */
+  onStatusChange(callback: (isRecording: boolean) => void): void {
+    this.statusCallback = callback
+  }
+  /**
+   * Stop playback and rewind; initialize() must be called again before start().
+   */
+  async release(): Promise<void> {
+    await this.stop()
+    this.isInitialized = false
+    this.currentBytePosition = 0
+    this.pausedTime = 0
+    // Do not report a stale end-of-file through getStatistics() after release
+    this.hasReachedEnd = false
+    this.log('SimulateFileAudioStreamAdapter released')
+  }
+  /**
+   * (Re)create the timer that emits chunks. Any timer that is already running
+   * is cleared first, so calling this while recording applies new settings
+   * without leaving the recording state.
+   */
+  private startStreaming(): void {
+    if (!this.config || !this.isInitialized) {
+      return
+    }
+    const header = this.fileReader.getHeader()
+    if (!header) {
+      this.errorCallback?.('WAV file header not available')
+      return
+    }
+    if (this.streamInterval) {
+      clearInterval(this.streamInterval)
+      this.streamInterval = undefined
+    }
+    const chunkDurationMs = this.options.chunkDurationMs || 100
+    const bytesPerSecond =
+      header.sampleRate * header.channels * (header.bitsPerSample / 8)
+    // Size of one sample frame (all channels); chunks must be a whole number of
+    // frames so that a multi-byte sample is never split across two chunks
+    const rawFrameBytes = Math.ceil(
+      (header.channels * header.bitsPerSample) / 8,
+    )
+    const frameBytes =
+      Number.isFinite(rawFrameBytes) && rawFrameBytes > 0 ? rawFrameBytes : 1
+    const requestedBytes = Math.floor((chunkDurationMs / 1000) * bytesPerSecond)
+    const alignedBytes = Math.floor(requestedBytes / frameBytes) * frameBytes
+    // A zero-sized (or NaN) chunk would read as "no data" and be mistaken for
+    // end of file, so always request at least one frame
+    const chunkSizeBytes = alignedBytes > 0 ? alignedBytes : frameBytes
+    // Adjust interval timing based on playback speed
+    const intervalMs = chunkDurationMs / (this.options.playbackSpeed || 1.0)
+    this.streamInterval = setInterval(() => {
+      if (!this.recording) {
+        return
+      }
+      try {
+        this.streamNextChunk(chunkSizeBytes)
+      } catch (error) {
+        const errorMessage =
+          error instanceof Error ? error.message : 'Streaming error'
+        this.errorCallback?.(errorMessage)
+        // stop() reports its own failures through the error callback
+        void this.stop()
+      }
+    }, intervalMs)
+  }
+  /**
+   * Emit the next chunk, or handle end of file (loop, or notify and stop).
+   * Does nothing until a data callback has been registered.
+   */
+  private streamNextChunk(chunkSizeBytes: number): void {
+    if (!this.dataCallback || !this.config) {
+      return
+    }
+    const header = this.fileReader.getHeader()
+    if (!header) {
+      return
+    }
+    // Get the next chunk of audio data
+    const audioChunk = this.fileReader.getAudioSlice(
+      this.currentBytePosition,
+      chunkSizeBytes,
+    )
+    if (!audioChunk || audioChunk.length === 0) {
+      // End of file reached. Only loop when something has been played: a file
+      // without audio data would otherwise "loop" forever without ever ending
+      if (this.options.loop && this.currentBytePosition > 0) {
+        this.currentBytePosition = 0
+        this.startTime = Date.now()
+        this.pausedTime = 0
+        this.hasReachedEnd = false
+        this.log('Looping audio file simulation')
+        return
+      }
+      // Stop streaming due to no new buffer
+      this.log('Audio file simulation completed - no new buffer available')
+      this.hasReachedEnd = true
+      // Call the end-of-file callback if provided
+      if (this.options.onEndOfFile) {
+        this.log('Calling onEndOfFile callback')
+        this.options.onEndOfFile()
+      }
+      // resetBuffer() called from the callback clears hasReachedEnd and keeps
+      // playing from the beginning; in every other case the stream ends here
+      if (this.hasReachedEnd) {
+        void this.stop()
+      }
+      return
+    }
+    // Update position
+    this.currentBytePosition += audioChunk.length
+    // Create stream data using the original file's format
+    const streamData: AudioStreamData = {
+      data: audioChunk,
+      sampleRate: header.sampleRate,
+      channels: header.channels,
+      timestamp: Date.now(),
+    }
+    // Send the chunk
+    this.dataCallback(streamData)
+  }
+  /**
+   * Get current playback statistics.
+   * `progress` is currentTime / duration, or 0 when there is no header or the
+   * file has zero duration.
+   */
+  getStatistics() {
+    const header = this.fileReader.getHeader()
+    const currentTime = this.fileReader.byteToTime(this.currentBytePosition)
+    return {
+      filePath: this.options.filePath,
+      isRecording: this.recording,
+      currentTime,
+      totalDuration: header?.duration || 0,
+      // Guard the division so a zero-length file does not yield NaN
+      progress: header && header.duration > 0 ? currentTime / header.duration : 0,
+      playbackSpeed: this.options.playbackSpeed,
+      currentBytePosition: this.currentBytePosition,
+      totalBytes: this.fileReader.getTotalDataSize(),
+      hasReachedEnd: this.hasReachedEnd,
+      header,
+    }
+  }
+  /**
+   * Seek to a specific time position, clamped to [0, duration].
+   * Ignored when no header is available or when timeSeconds is NaN.
+   */
+  seekToTime(timeSeconds: number): void {
+    const header = this.fileReader.getHeader()
+    if (!header) {
+      return
+    }
+    if (Number.isNaN(timeSeconds)) {
+      // Clamping cannot repair NaN; it would end up as the byte position
+      this.log('Ignoring seek to NaN')
+      return
+    }
+    const clampedTime = Math.max(0, Math.min(timeSeconds, header.duration))
+    this.currentBytePosition = this.fileReader.timeToByte(clampedTime)
+    // Reset timing if we're currently playing
+    if (this.recording) {
+      this.startTime =
+        Date.now() - (clampedTime * 1000) / (this.options.playbackSpeed || 1.0)
+      this.pausedTime = 0
+    }
+    this.log(`Seeked to ${clampedTime.toFixed(2)}s`)
+  }
+  /**
+   * Set playback speed. When playing, the emit timer is re-created with the
+   * new interval; the recording state does not change.
+   *
+   * @throws Error when speed is not a finite number greater than 0.
+   */
+  setPlaybackSpeed(speed: number): void {
+    if (!Number.isFinite(speed) || speed <= 0) {
+      throw new Error('Playback speed must be greater than 0')
+    }
+    this.options.playbackSpeed = speed
+    // Swap the timer synchronously: an asynchronous stop-then-start would
+    // resume playback even if the caller invoked stop() right after this call
+    if (this.recording) {
+      this.startStreaming()
+    }
+    this.log(`Playback speed set to ${speed}x`)
+  }
+  /**
+   * Reset file buffer to beginning. When playing, streaming continues from the
+   * start of the file without leaving the recording state.
+   */
+  resetBuffer(): void {
+    this.log('Resetting file buffer to beginning')
+    // Reset position and timing
+    this.currentBytePosition = 0
+    this.startTime = Date.now()
+    this.pausedTime = 0
+    this.hasReachedEnd = false
+    // If currently playing, restart the emit timer from the beginning
+    if (this.recording) {
+      this.log('Restarting streaming from beginning')
+      this.startStreaming()
+    }
+  }
+  /**
+   * Forward a message, tagged with the adapter name, to the configured logger.
+   */
+  private log(message: string): void {
+    this.options.logger?.(`[SimulateFileAudioStreamAdapter] ${message}`)
+  }
+}

package/src/realtime-transcription/index.ts ADDED Viewed

@@ -0,0 +1,34 @@
+// Main transcriber class
+export { RealtimeTranscriber } from './RealtimeTranscriber'
+// Slice manager (for advanced use cases)
+export { SliceManager } from './SliceManager'
+export type { WavFileWriterFs } from '../utils/WavFileWriter'
+// Types and interfaces
+export type {
+  // Audio Stream types
+  AudioStreamData,
+  AudioStreamConfig,
+  AudioStreamInterface,
+  // VAD and event types
+  RealtimeVadEvent,
+  RealtimeTranscribeEvent,
+  RealtimeStatsEvent,
+  // Configuration types
+  RealtimeTranscriberDependencies,
+  RealtimeOptions,
+  RealtimeTranscriberCallbacks,
+  // Audio slice types
+  AudioSlice,
+  AudioSliceNoData,
+  MemoryUsage,
+} from './types'
+// VAD presets constant
+export { VAD_PRESETS } from './types'

package/src/realtime-transcription/types.ts ADDED Viewed

@@ -0,0 +1,277 @@
+import type { TranscribeOptions, TranscribeResult, VadOptions } from '../index'
+import type { WavFileWriterFs } from '../utils/WavFileWriter'
+// === Audio Stream Interfaces ===
+/**
+ * One chunk of audio delivered by an audio stream to its data callback.
+ */
+export interface AudioStreamData {
+  /** Audio bytes of this chunk. */
+  data: Uint8Array
+  /** Sample rate reported by the producer for this chunk. */
+  sampleRate: number
+  /** Channel count reported by the producer for this chunk. */
+  channels: number
+  /** Producer-assigned timestamp of the chunk. */
+  timestamp: number
+}
+/**
+ * Requested stream format passed to AudioStreamInterface.initialize().
+ * Every field is optional.
+ */
+export interface AudioStreamConfig {
+  /** Requested sample rate. */
+  sampleRate?: number
+  /** Requested number of channels. */
+  channels?: number
+  /** Requested bits per sample. */
+  bitsPerSample?: number
+  /** Requested buffer size (unit is defined by the stream implementation). */
+  bufferSize?: number
+  /** Implementation-specific audio source identifier. */
+  audioSource?: number
+}
+/**
+ * Contract implemented by every audio source that feeds the realtime
+ * transcriber: lifecycle methods plus callback registration.
+ */
+export interface AudioStreamInterface {
+  /** Prepare the stream with the requested configuration. */
+  initialize(config: AudioStreamConfig): Promise<void>
+  /** Begin delivering audio data. */
+  start(): Promise<void>
+  /** Stop delivering audio data. */
+  stop(): Promise<void>
+  /** Whether the stream is currently delivering audio data. */
+  isRecording(): boolean
+  /** Register the receiver of audio chunks. */
+  onData(callback: (data: AudioStreamData) => void): void
+  /** Register the receiver of error messages. */
+  onError(callback: (error: string) => void): void
+  /** Register the receiver of recording-state changes. */
+  onStatusChange(callback: (isRecording: boolean) => void): void
+  /** Release the stream. */
+  release(): Promise<void>
+}
+// === Enhanced VAD Options ===
+// Pre-defined VAD configurations for different use cases
+/**
+ * Named VAD configurations, selectable through `RealtimeOptions.vadPreset`.
+ *
+ * preset            | threshold | minSpeechMs | minSilenceMs | maxSpeechS | padMs | overlap
+ * ------------------|-----------|-------------|--------------|------------|-------|--------
+ * default           | 0.5       | 250         | 100          | 30         | 30    | 0.1
+ * sensitive         | 0.3       | 100         | 50           | 15         | 50    | 0.2
+ * very-sensitive    | 0.2       | 100         | 50           | 15         | 100   | 0.3
+ * conservative      | 0.7       | 500         | 200          | 25         | 20    | 0.05
+ * very-conservative | 0.8       | 750         | 300          | 20         | 10    | 0.05
+ * continuous        | 0.4       | 200         | 300          | 60         | 50    | 0.15
+ * meeting           | 0.45      | 300         | 150          | 45         | 75    | 0.2
+ * noisy             | 0.75      | 400         | 100          | 25         | 40    | 0.1
+ *
+ * Key Parameters:
+ * - threshold: 0.0-1.0 (lower = more sensitive)
+ * - minSpeechDurationMs: Min duration to consider speech
+ * - minSilenceDurationMs: Min silence before ending speech
+ * - maxSpeechDurationS: Max continuous speech duration
+ * - speechPadMs: Padding around detected speech
+ * - samplesOverlap: Analysis window overlap (0.0-1.0)
+ */
+export const VAD_PRESETS = {
+  // Balanced performance
+  default: {
+    threshold: 0.5,
+    minSpeechDurationMs: 250,
+    minSilenceDurationMs: 100,
+    maxSpeechDurationS: 30,
+    speechPadMs: 30,
+    samplesOverlap: 0.1,
+  },
+  // Good for quiet environments
+  sensitive: {
+    threshold: 0.3,
+    minSpeechDurationMs: 100,
+    minSilenceDurationMs: 50,
+    maxSpeechDurationS: 15,
+    speechPadMs: 50,
+    samplesOverlap: 0.2,
+  },
+  // Catches even quiet speech
+  'very-sensitive': {
+    threshold: 0.2,
+    minSpeechDurationMs: 100,
+    minSilenceDurationMs: 50,
+    maxSpeechDurationS: 15,
+    speechPadMs: 100,
+    samplesOverlap: 0.3,
+  },
+  // Avoids false positives
+  conservative: {
+    threshold: 0.7,
+    minSpeechDurationMs: 500,
+    minSilenceDurationMs: 200,
+    maxSpeechDurationS: 25,
+    speechPadMs: 20,
+    samplesOverlap: 0.05,
+  },
+  // Only clear speech
+  'very-conservative': {
+    threshold: 0.8,
+    minSpeechDurationMs: 750,
+    minSilenceDurationMs: 300,
+    maxSpeechDurationS: 20,
+    speechPadMs: 10,
+    samplesOverlap: 0.05,
+  },
+  // For presentations/lectures (longer segments)
+  continuous: {
+    threshold: 0.4,
+    minSpeechDurationMs: 200,
+    minSilenceDurationMs: 300,
+    maxSpeechDurationS: 60,
+    speechPadMs: 50,
+    samplesOverlap: 0.15,
+  },
+  // Handles multiple speakers
+  meeting: {
+    threshold: 0.45,
+    minSpeechDurationMs: 300,
+    minSilenceDurationMs: 150,
+    maxSpeechDurationS: 45,
+    speechPadMs: 75,
+    samplesOverlap: 0.2,
+  },
+  // More strict thresholds for noisy environments
+  noisy: {
+    threshold: 0.75,
+    minSpeechDurationMs: 400,
+    minSilenceDurationMs: 100,
+    maxSpeechDurationS: 25,
+    speechPadMs: 40,
+    samplesOverlap: 0.1,
+  },
+}
+/**
+ * Voice-activity event delivered to `RealtimeTranscriberCallbacks.onVad`.
+ */
+export interface RealtimeVadEvent {
+  /** Kind of voice-activity transition or state being reported. */
+  type: 'speech_start' | 'speech_end' | 'speech_continue' | 'silence'
+  /** Time of the event. */
+  timestamp: number
+  /** Time at which speech was last detected. */
+  lastSpeechDetectedTime: number
+  /** Detection confidence reported by the producer. */
+  confidence: number
+  /** Duration associated with the event. */
+  duration: number
+  /** Index of the audio slice the event belongs to. */
+  sliceIndex: number
+  /** Additional context: optional signal analysis of the audio. */
+  analysis?: {
+    averageAmplitude: number
+    peakAmplitude: number
+    spectralCentroid?: number
+    zeroCrossingRate?: number
+  }
+  /** Adaptive threshold info: threshold in effect for this event. */
+  currentThreshold?: number
+  /** Adaptive threshold info: estimated environment noise. */
+  environmentNoise?: number
+}
+/**
+ * Transcription event delivered to `RealtimeTranscriberCallbacks.onTranscribe`.
+ */
+export interface RealtimeTranscribeEvent {
+  /** Phase of the transcription lifecycle being reported. */
+  type: 'start' | 'transcribe' | 'end' | 'error'
+  /** Index of the audio slice the event refers to. */
+  sliceIndex: number
+  /** Transcription result, when one is available for this event. */
+  data?: TranscribeResult
+  /** Whether audio is still being captured. */
+  isCapturing: boolean
+  /** Processing time reported for this event. */
+  processTime: number
+  /** Recording time reported for this event. */
+  recordingTime: number
+  /** Slice memory snapshot; same shape as the exported MemoryUsage type. */
+  memoryUsage?: MemoryUsage
+  /** VAD event associated with this transcription, if any. */
+  vadEvent?: RealtimeVadEvent
+}
+/**
+ * Options controlling slicing, VAD, prompting and logging of realtime
+ * transcription. Every field is optional.
+ */
+export interface RealtimeOptions {
+  // --- Audio settings ---
+  /** default: 25 */
+  audioSliceSec?: number
+  /** default: 1 */
+  audioMinSec?: number
+  /** default: 3 */
+  maxSlicesInMemory?: number
+  // --- VAD settings ---
+  /** Extended VAD options. */
+  vadOptions?: VadOptions
+  /** Quick preset selection; one of the keys of VAD_PRESETS. */
+  vadPreset?: keyof typeof VAD_PRESETS
+  // --- Auto-slice settings ---
+  /** default: false - automatically slice when speech ends and duration thresholds are met */
+  autoSliceOnSpeechEnd?: boolean
+  /** default: 0.85 - percentage of audioSliceSec to trigger auto-slice */
+  autoSliceThreshold?: number
+  // --- Transcription settings ---
+  /** Options forwarded to transcription. */
+  transcribeOptions?: TranscribeOptions
+  // --- Prompt settings ---
+  /** Initial prompt to use for transcription */
+  initialPrompt?: string
+  /** Add transcription results from previous slices as prompt (default: true) */
+  promptPreviousSlices?: boolean
+  // --- File settings ---
+  /** Only used if fs dependency is provided */
+  audioOutputPath?: string
+  // --- Audio stream configuration ---
+  /** Configuration for the audio stream. */
+  audioStreamConfig?: AudioStreamConfig
+  // --- Logger settings ---
+  /** default: noop - custom logger function */
+  logger?: (message: string) => void
+}
+/**
+ * A slice of captured audio together with its bookkeeping flags.
+ */
+export interface AudioSlice {
+  /** Position of the slice in the sequence of slices. */
+  index: number
+  /** Audio bytes held by the slice. */
+  data: Uint8Array
+  /** Number of samples in the slice. */
+  sampleCount: number
+  /** Start time of the slice. */
+  startTime: number
+  /** End time of the slice. */
+  endTime: number
+  /** Whether the slice has been processed. */
+  isProcessed: boolean
+  /** Whether the slice has been released. */
+  isReleased: boolean
+}
+/** AudioSlice metadata without the `data` payload. */
+export interface AudioSliceNoData extends Omit<AudioSlice, 'data'> {}
+/**
+ * Snapshot of the memory held by audio slices.
+ */
+export interface MemoryUsage {
+  /** Number of slices currently kept in memory. */
+  slicesInMemory: number
+  /** Total number of samples across those slices. */
+  totalSamples: number
+  /** Estimated size in megabytes. */
+  estimatedMB: number
+}
+/**
+ * Statistics event delivered to `RealtimeTranscriberCallbacks.onStatsUpdate`.
+ */
+export interface RealtimeStatsEvent {
+  /** Time of the event. */
+  timestamp: number
+  /** What triggered the statistics update. */
+  type: 'slice_processed' | 'vad_change' | 'memory_change' | 'status_change'
+  /** State flags plus untyped statistics payloads. */
+  data: {
+    isActive: boolean
+    isTranscribing: boolean
+    vadEnabled: boolean
+    audioStats: any
+    vadStats: any
+    sliceStats: any
+  }
+}
+/**
+ * Optional callbacks a consumer can supply to observe realtime transcription.
+ */
+export interface RealtimeTranscriberCallbacks {
+  /** Receives transcription events. */
+  onTranscribe?: (event: RealtimeTranscribeEvent) => void
+  /** Receives voice-activity events. */
+  onVad?: (event: RealtimeVadEvent) => void
+  /** Receives error messages. */
+  onError?: (error: string) => void
+  /** Receives active-state changes. */
+  onStatusChange?: (isActive: boolean) => void
+  /** Receives statistics updates. */
+  onStatsUpdate?: (event: RealtimeStatsEvent) => void
+}
+// === Context Interfaces ===
+/**
+ * Minimal shape of a Whisper context required for transcription: one method
+ * that starts a transcription of raw data and returns a stop handle together
+ * with the pending result.
+ */
+export type WhisperContextLike = {
+  transcribeData: (
+    data: SharedArrayBuffer,
+    options: TranscribeOptions,
+  ) => {
+    /** Requests that the running transcription stop. */
+    stop: () => Promise<void>
+    /** Resolves with the transcription result. */
+    promise: Promise<TranscribeResult>
+  }
+}
+/**
+ * Minimal shape of a VAD context: one method that analyses raw data and
+ * resolves with the detected segments as `t0`/`t1` pairs.
+ * NOTE(review): presumably t0/t1 are segment start/end - confirm units with the
+ * VAD implementation.
+ */
+export type WhisperVadContextLike = {
+  detectSpeechData: (
+    data: SharedArrayBuffer,
+    options: VadOptions,
+  ) => Promise<{ t0: number; t1: number }[]>
+}
+/**
+ * Collaborators injected into the realtime transcriber.
+ */
+export interface RealtimeTranscriberDependencies {
+  /** Context used for transcription. */
+  whisperContext: WhisperContextLike
+  /** Optional context used for voice-activity detection. */
+  vadContext?: WhisperVadContextLike
+  /** Source of audio chunks. */
+  audioStream: AudioStreamInterface
+  /** Optional file-system implementation for WAV file writing. */
+  fs?: WavFileWriterFs
+}