npm - @wovin/tranz - Versions diffs - 0.1.36 → 0.2.0 - Mend

@wovin/tranz 0.1.36 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32) hide show

package/README.md +8 -5
package/dist/{audio.min.js → audio.js} +32 -18
package/dist/index.d.ts +3 -3
package/dist/index.d.ts.map +1 -1
package/dist/{index.min.js → index.js} +161 -29
package/dist/providers.d.ts +1 -1
package/dist/providers.d.ts.map +1 -1
package/dist/{providers.min.js → providers.js} +68 -24
package/dist/utils/audio/merge-results.d.ts +14 -12
package/dist/utils/audio/merge-results.d.ts.map +1 -1
package/dist/utils/transcription/format.d.ts +27 -0
package/dist/utils/transcription/format.d.ts.map +1 -1
package/dist/utils/transcription/providers.d.ts +30 -1
package/dist/utils/transcription/providers.d.ts.map +1 -1
package/dist/utils/transcription/transcribe.d.ts +5 -0
package/dist/utils/transcription/transcribe.d.ts.map +1 -1
package/package.json +10 -8
package/src/audio.ts +25 -0
package/src/index.ts +61 -0
package/src/providers.ts +23 -0
package/src/realtime.ts +58 -0
package/src/utils/audio/index.ts +6 -0
package/src/utils/audio/merge-results.ts +198 -0
package/src/utils/audio/split.ts +504 -0
package/src/utils/file-utils.ts +16 -0
package/src/utils/transcription/format.ts +208 -0
package/src/utils/transcription/mime-detection.ts +80 -0
package/src/utils/transcription/providers.ts +572 -0
package/src/utils/transcription/realtime.ts +821 -0
package/src/utils/transcription/runtime.ts +40 -0
package/src/utils/transcription/transcribe.ts +366 -0
/package/dist/{realtime.min.js → realtime.js} +0 -0

package/src/utils/transcription/providers.ts ADDED Viewed

@@ -0,0 +1,572 @@
+/**
+ * Transcription provider types and interfaces
+ * Defines the contract for all transcription providers
+ */
+import { spawn } from "node:child_process"
+import * as fs from "node:fs"
+import path from "node:path"
+import { getName } from "../file-utils.ts"
+import { pipeline } from "node:stream"
+import { promisify } from "node:util"
+import { PipelineSource } from "node:stream"
+/**
+ * A single transcription segment (one diarized turn, or one segment-granularity unit).
+ *
+ * Timestamps are integer milliseconds — normalized at the SDK boundary so consumers
+ * can pass straight into the wovin annotation schema (see docs/annotation-schema.md).
+ *
+ * `diarization` is the anonymous, per-recording diarization label as returned by
+ * the provider (Mistral: `"speaker_1"`, Deepgram: `0`, AssemblyAI: `"A"`, …).
+ * It is NOT a real-world speaker identity — that's a separate (future) `speakerId` field.
+ *
+ * When `mergeTranscriptionResults` joins multiple chunks, `diarization` is rewritten
+ * as `` `chunk${index}/${value}` `` because per-chunk labels are not comparable
+ * across chunks.
+ */
+export interface TranscriptSegment {
+  /** Segment start, in integer milliseconds */
+  startMs: number
+  /** Segment end, in integer milliseconds */
+  endMs: number
+  /** Transcribed text of this segment */
+  text: string
+  /** Anonymous diarization label (string or number, depending on the provider); optional */
+  diarization?: string | number
+  // future: speakerId?: string       (identified profile)
+  // future: speakerName?: string     (display name)
+}
+/**
+ * Result object returned from transcription operations
+ * Contains the transcribed text and optional provider-specific metadata
+ *
+ * The providers in this file report most failures by resolving with `text: ''`
+ * and `error` set, so callers should inspect `error` instead of relying solely
+ * on a rejected promise.
+ */
+export interface TranscriptionResult {
+  /** The transcribed text content (empty string when `error` is set) */
+  text: string
+  /** Raw response object from the provider (optional, for debugging) */
+  rawResponse?: any
+  /** Error message if transcription failed */
+  error?: string
+  /** Confidence score of the transcription (0-1) */
+  confidence?: number
+  /** Word-level data — populated only when granularity='word' or the provider returns it. Left undefined otherwise (not `[]`). */
+  words?: any[]
+  /** Segment-level data — populated when granularity='segment' (or the provider returns it). Timestamps are integer milliseconds. */
+  segments?: TranscriptSegment[]
+  /** Duration of audio in seconds */
+  duration?: number
+  /** Detected or specified language code */
+  language?: string
+  /** Model used for transcription (as returned by provider) */
+  model?: string
+}
+/**
+ * Interface that all transcription providers must implement
+ * Defines the standard contract for transcription functionality
+ */
+export interface TranscriptionProvider {
+  /** Provider name/identifier */
+  name: string
+  /** Maximum audio duration in seconds (undefined = no limit) */
+  maxAudioDurationSec?: number
+  /**
+   * Transcribe audio from the given parameters
+   *
+   * Implementations in this file signal failure in two ways: some invalid option
+   * combinations throw (rejecting the promise), while runtime failures such as a
+   * non-OK HTTP response resolve with `error` set. Callers should handle both.
+   * @param params - Transcription parameters
+   * @returns Promise resolving to transcription result
+   */
+  transcribe(params: TranscribeParams): Promise<TranscriptionResult>
+}
+/**
+ * Parameters for transcription operations
+ * Supports both common and provider-specific options
+ *
+ * Every field is optional at the type level; each provider checks (or assumes)
+ * the subset it needs, so which inputs are honoured depends on the provider.
+ */
+export interface TranscribeParams {
+  /** Path to the audio file to transcribe (the only audio input WhisperProvider reads) */
+  audioPath?: string
+  /** Audio buffer to transcribe — an in-memory alternative to `audioPath` for providers that accept it */
+  audioBuffer?: Buffer
+  /** MIME type for audioBuffer (auto-detected from the bytes if not provided) */
+  mimeType?: string
+  /** URL to audio file (e.g., IPFS gateway URL); MistralProvider forwards it as `file_url` instead of uploading */
+  audioUrl?: string
+  /** Model to use for transcription (provider-specific; the HTTP providers fall back to their own default model) */
+  model?: string
+  /** Language code for transcription (e.g., 'en', 'fr'). MistralProvider throws when it is combined with `timestampGranularity`. */
+  language?: string
+  /** API key for authentication; the HTTP providers send it as a Bearer token */
+  apiKey?: string
+  /**
+   * Enable speaker diarization.
+   * Read by every provider in this file, with different defaults: WhisperProvider
+   * defaults to `false`, MistralProvider defaults to `true`, and GreenPTProvider
+   * forwards the flag only when it is explicitly set.
+   */
+  diarize?: boolean
+  /**
+   * Timestamp granularity for transcription (Mistral-specific).
+   * MistralProvider uses 'segment' when this is omitted and no `language` is given,
+   * throws when it is combined with `language`, and throws for 'word' while
+   * diarization is enabled.
+   */
+  timestampGranularity?: 'segment' | 'word'
+  /**
+   * Context biasing terms (Voxtral/Mistral-specific).
+   * Up to `VOXTRAL_LIMITS.maxContextBiasingTerms` (100) custom-vocabulary terms
+   * passed to the Voxtral transcribe endpoint as `context_bias[]` form fields.
+   * Ignored by non-Mistral providers.
+   */
+  contextBias?: string[]
+  /** Path to model file (Whisper-specific); when omitted WhisperProvider downloads and caches a default model */
+  modelPath?: string
+  /** Output directory for results (Whisper-specific; WhisperProvider defaults to "./out") */
+  outputDir?: string
+  /**
+   * Provider configuration object
+   * NOTE(review): none of the `transcribe` implementations visible in this file read this field.
+   */
+  config?: any
+}
+/**
+ * Union type for supported provider names
+ * Each member is one of the identifiers `createProvider` accepts.
+ */
+export type ProviderName = 'whisper' | 'mistral' | 'greenpt'
+/**
+ * Build the transcription provider that corresponds to a provider name.
+ * Only the Whisper provider receives `config`; the Mistral and GreenPT providers
+ * are constructed without arguments.
+ * @param providerName - Which provider to instantiate
+ * @param config - Optional configuration, forwarded to the Whisper provider
+ * @returns A new provider instance
+ * @throws Error when `providerName` is not one of the known names
+ */
+export function createProvider(providerName: ProviderName, config?: any): TranscriptionProvider {
+  if (providerName === 'whisper') {
+    return new WhisperProvider(config)
+  }
+  if (providerName === 'mistral') {
+    return new MistralProvider()
+  }
+  if (providerName === 'greenpt') {
+    return new GreenPTProvider()
+  }
+  // Reachable only when an untyped caller passes a name outside ProviderName.
+  throw new Error(`Unknown provider: ${providerName}`)
+}
+import { detectAudioMimeType } from './mime-detection.ts'
+/**
+ * Type for whisper.cpp JSON output
+ * Deliberately loose: the parsed file is treated as an untyped record and every
+ * field access on it must be guarded by the reader.
+ */
+type WhisperJsonOutput = Record<string, any>
+/**
+ * Whisper provider for local whisper.cpp transcription
+ * Spawns the `whisper-cli` binary, reads back the JSON file it writes, and
+ * downloads/caches the ggml model files it needs on first use.
+ */
+export class WhisperProvider implements TranscriptionProvider {
+  name = 'whisper'
+  private cacheDir: string
+  static DEFAULTS = {
+    DIARIZE: false,
+    // SILDUR / SILBUF / SILTHR are not referenced anywhere in this class.
+    SILDUR: "1.3",
+    SILBUF: 0.2,
+    SILTHR: "-35dB",
+    // Model key -> file name inside `<cacheDir>/models`.
+    MODEL_KEYS: {
+      tinyd: "ggml-small.en-tdrz.bin",
+      small: "ggml-small.bin",
+      medium: "ggml-medium.bin",
+    },
+    // Model key -> download URL used when the file is not cached yet.
+    MODELS: {
+      tinyd:
+        "https://huggingface.co/akashmjn/tinydiarize-whisper.cpp/resolve/main/ggml-small.en-tdrz.bin",
+      small:
+        "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small.bin",
+      medium:
+        "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-medium.bin",
+    },
+  }
+  /** Number of trailing characters of whisper-cli output kept for error messages. */
+  private static readonly OUTPUT_TAIL_CHARS = 4000
+  /**
+   * @param config - Optional settings; `config.cacheDir` overrides the model cache location
+   */
+  constructor(config?: any) {
+    // Fall back past HOME so an unset variable does not yield a literal "undefined/" directory.
+    const homeDir = process.env.HOME ?? process.env.USERPROFILE ?? "."
+    this.cacheDir = config?.cacheDir || `${homeDir}/.cache/whisper-models`
+  }
+  /**
+   * Transcribe a local audio file with `whisper-cli`.
+   * Reads `audioPath`, `outputDir`, `diarize` and `modelPath` from `params`; other fields are ignored.
+   * @param params - Transcription parameters (`audioPath` is required)
+   * @returns The joined transcript text, or a result with `error` set when the input is
+   *          missing, the process fails, or its JSON output cannot be read
+   * @throws Error (as a rejection) when the default model cannot be downloaded
+   */
+  async transcribe(params: TranscribeParams): Promise<TranscriptionResult> {
+    const {
+      audioPath,
+      outputDir = "./out",
+      diarize = WhisperProvider.DEFAULTS.DIARIZE,
+      modelPath: providedModelPath,
+    } = params
+    // audioPath is optional in TranscribeParams but mandatory here: fail before touching the CLI.
+    if (!audioPath) {
+      return { text: "", error: "Audio path is required for Whisper provider" }
+    }
+    // Diarization needs the tinydiarize model; everything else uses the plain small model.
+    const modelKey = (diarize ? "tinyd" : "small") as keyof typeof WhisperProvider.DEFAULTS.MODEL_KEYS
+    const modelPath = providedModelPath || (await this.ensureRequestedModelIsCached(modelKey))
+    const sourceFileName = getName(audioPath)
+    const outTransPath = `${outputDir}/${sourceFileName}-transcript`
+    const outJsonPath = `${outTransPath}.json`
+    try {
+      // The CLI needs an existing directory to write into, and a transcript left over
+      // from an earlier run must never be mistaken for the result of this run.
+      fs.mkdirSync(outputDir, { recursive: true })
+      fs.rmSync(outJsonPath, { force: true })
+    } catch (error) {
+      const errorMessage = error instanceof Error ? error.message : String(error)
+      return { text: "", error: `Failed to prepare output directory: ${errorMessage}` }
+    }
+    const tdrz = diarize ? "-tdrz" : ""
+    // Build arguments array properly
+    const args = [
+      tdrz,
+      "-t",
+      "8",
+      "-oj",
+      "-ng", // TODO: consider withGPU option
+      "-f",
+      audioPath,
+      "-m",
+      modelPath,
+      "-of",
+      outTransPath,
+    ].filter((arg) => arg !== "")
+    const cmd = `whisper-cli ${args.join(" ")}`
+    console.log("spawning  ", cmd)
+    const whisperThread = spawn(`whisper-cli`, args)
+    return new Promise<TranscriptionResult>((resolveFx) => {
+      // Only the tail of the process output is retained; it is used for error reporting.
+      let outputTail = ""
+      const handleOut = (data: Buffer | string) => {
+        const str = data.toString()
+        // Echo progress lines exactly once each.
+        const isProgressLine =
+          str.includes("total time") || ["[", "main:"].some((prefix) => str.startsWith(prefix))
+        if (isProgressLine) console.log(str)
+        outputTail = (outputTail + str).slice(-WhisperProvider.OUTPUT_TAIL_CHARS)
+      }
+      whisperThread.stdout.on("data", handleOut)
+      whisperThread.stderr.on("data", handleOut)
+      whisperThread.on("close", (code, signal) => {
+        // A non-zero exit (or termination by signal, where code is null) means no usable transcript.
+        if (code !== 0) {
+          const how = code === null ? `signal ${signal}` : `code ${code}`
+          resolveFx({
+            text: "",
+            error: `whisper-cli exited with ${how}: ${outputTail.trim()}`,
+          })
+          return
+        }
+        try {
+          const trans: WhisperJsonOutput = JSON.parse(
+            fs.readFileSync(outJsonPath).toString()
+          )
+          // whisper.cpp's `-oj` output keeps the segment array at top-level `transcription`
+          // (next to `result.language`); the older `result.transcription` lookup is kept as a fallback.
+          const rawEntries = trans.transcription ?? trans.result?.transcription
+          const transcriptionArray: any[] = Array.isArray(rawEntries) ? rawEntries : []
+          const text = transcriptionArray
+            .map((entry: any) => entry.text)
+            .filter((t: any) => t)
+            .join(" ")
+            .trim()
+          const language = trans.result?.language
+          resolveFx({
+            text: text || "",
+            ...(typeof language === "string" ? { language } : {}),
+            rawResponse: trans,
+          })
+        } catch (error) {
+          const errorMessage = error instanceof Error ? error.message : String(error)
+          resolveFx({
+            text: "",
+            error: `Failed to parse transcription result: ${errorMessage}`,
+            rawResponse: undefined,
+          })
+        }
+      })
+      // Fires when the process could not be spawned at all (e.g. whisper-cli is not installed).
+      whisperThread.on("error", (err) => {
+        console.error("Whisper Error", { err, outTransPath, args })
+        resolveFx({
+          text: "",
+          error: `Whisper process error: ${err.message}`,
+        })
+      })
+    }).catch((whisperError) => {
+      console.error("Uncaught Whisper Error", whisperError)
+      return {
+        text: "",
+        error: `Uncaught error: ${whisperError instanceof Error ? whisperError.message : String(whisperError)}`,
+      }
+    })
+  }
+  /**
+   * Return the local path of the model for `modelKey`, downloading it first if needed.
+   * The download is written to a `.part` file and renamed only after it completes, so an
+   * HTTP error or an interrupted transfer can never be picked up later as a cached model.
+   * @param modelKey - Key into `DEFAULTS.MODEL_KEYS` / `DEFAULTS.MODELS`
+   * @returns Path of the cached model file
+   * @throws Error when the key is unknown or the download fails
+   */
+  private async ensureRequestedModelIsCached(
+    modelKey: keyof typeof WhisperProvider.DEFAULTS.MODEL_KEYS
+  ): Promise<string> {
+    const modelFileName = WhisperProvider.DEFAULTS.MODEL_KEYS[modelKey]
+    if (!modelFileName)
+      throw new Error(`${modelKey} not known`)
+    const cachedModelsDirPath = `${this.cacheDir}/models`
+    // `recursive: true` makes this a no-op when the directory already exists.
+    fs.mkdirSync(cachedModelsDirPath, { recursive: true })
+    const modelPath = `${cachedModelsDirPath}/${modelFileName}`
+    if (fs.existsSync(modelPath)) {
+      console.log(`Found ${modelPath} \n`)
+      return modelPath
+    }
+    const srcURL = WhisperProvider.DEFAULTS.MODELS[modelKey]
+    console.log(`
+    requested model is missing
+    Fetching ${srcURL} into ${modelPath}
+  `)
+    const data = await fetch(srcURL)
+    // Without this check a 404/500 error page would be saved and reused as the model file.
+    if (!data.ok) throw new Error(`fetch failed: ${data.status} ${data.statusText}`)
+    if (!data.body) throw new Error("fetch failed")
+    const partialPath = `${modelPath}.part`
+    const streamPipeline = promisify(pipeline)
+    try {
+      await streamPipeline(
+        data.body as unknown as PipelineSource<any>,
+        fs.createWriteStream(partialPath)
+      )
+      fs.renameSync(partialPath, modelPath)
+    } catch (error) {
+      // Drop the incomplete download so the next call starts from scratch.
+      fs.rmSync(partialPath, { force: true })
+      throw error
+    }
+    return modelPath
+  }
+}
+/**
+ * Voxtral/Mistral API limits
+ * These may need adjustment based on actual API constraints
+ *
+ * Within this file, `maxAudioDurationSec` is exposed as
+ * `MistralProvider.maxAudioDurationSec` and `maxContextBiasingTerms` is enforced in
+ * `MistralProvider.transcribe`.
+ * NOTE(review): `maxFileSizeBytes` is not enforced by any code visible in this file.
+ */
+export const VOXTRAL_LIMITS = {
+  /** Maximum audio duration in seconds (3 hours for Voxtral Transcribe 2) */
+  maxAudioDurationSec: 3 * 60 * 60, // 10800 seconds = 3 hours
+  /** Maximum context biasing words/phrases */
+  maxContextBiasingTerms: 100,
+  /** Maximum file size in bytes (1 GiB = 1,073,741,824 bytes) */
+  maxFileSizeBytes: 1024 * 1024 * 1024,
+}
+/**
+ * Mistral (Voxtral) transcription provider
+ * Sends audio (uploaded bytes or a URL) to the Mistral audio transcription endpoint and
+ * normalizes the response into a `TranscriptionResult` with millisecond segments.
+ */
+export class MistralProvider implements TranscriptionProvider {
+  name = 'mistral'
+  maxAudioDurationSec = VOXTRAL_LIMITS.maxAudioDurationSec
+  /**
+   * Upload file-name extension per MIME type. Covers the aliases a caller may pass via
+   * `params.mimeType`, so e.g. `audio/mp4` is no longer uploaded as `audio.ogg`.
+   */
+  private static readonly EXTENSION_BY_MIME = new Map<string, string>([
+    ['audio/mpeg', 'mp3'],
+    ['audio/mp3', 'mp3'],
+    ['audio/wav', 'wav'],
+    ['audio/x-wav', 'wav'],
+    ['audio/wave', 'wav'],
+    ['audio/flac', 'flac'],
+    ['audio/x-flac', 'flac'],
+    ['audio/ogg', 'ogg'],
+    ['audio/mp4', 'm4a'],
+    ['audio/x-m4a', 'm4a'],
+    ['audio/aac', 'aac'],
+    ['audio/webm', 'webm'],
+  ])
+  /** Map a MIME type (parameters such as `;codecs=` ignored) to an extension; unknown types fall back to 'ogg'. */
+  private static extensionForMimeType(mimeType: string): string {
+    const essence = (mimeType.split(';')[0] ?? '').trim().toLowerCase()
+    return MistralProvider.EXTENSION_BY_MIME.get(essence) ?? 'ogg'
+  }
+  /**
+   * Transcribe audio through the Mistral API.
+   * Input precedence is `audioUrl`, then `audioBuffer`, then `audioPath`.
+   * @param params - Transcription parameters (`apiKey` and one audio input are required)
+   * @returns The normalized result, or a result with `error` set when the API key or audio
+   *          input is missing, the API answers with a non-OK status, or no text comes back
+   * @throws Error for invalid option combinations (language + timestampGranularity,
+   *         diarize + 'word' granularity) or too many `contextBias` terms; network and
+   *         file-read failures also surface as rejections
+   */
+  async transcribe(params: TranscribeParams): Promise<TranscriptionResult> {
+    // Validate API constraints
+    if (params.language && params.timestampGranularity) {
+      throw new Error('Cannot use both language and timestampGranularity (Mistral API limitation)')
+    }
+    // Validate diarize + timestampGranularity constraint
+    const diarize = params.diarize ?? true
+    if (diarize && params.timestampGranularity === 'word') {
+      throw new Error('When diarize is set to true, the timestamp granularity must be set to ["segment"], got ["word"]')
+    }
+    // Fail locally instead of sending "Bearer undefined" and waiting for a 401.
+    if (!params.apiKey) {
+      return { text: '', error: 'API key is required for Mistral provider' }
+    }
+    const formData = new FormData()
+    if (params.audioUrl) {
+      // URL input - use file_url parameter (no file upload needed)
+      formData.append('file_url', params.audioUrl)
+    } else {
+      // File or buffer input
+      let audioBuffer: Buffer
+      let mimeType: string
+      if (params.audioBuffer) {
+        audioBuffer = params.audioBuffer
+        mimeType = params.mimeType || detectAudioMimeType(audioBuffer)
+      } else if (params.audioPath) {
+        audioBuffer = fs.readFileSync(params.audioPath)
+        mimeType = detectAudioMimeType(audioBuffer)
+      } else {
+        return { text: '', error: 'No audio input provided (audioPath, audioBuffer, or audioUrl required)' }
+      }
+      const extension = MistralProvider.extensionForMimeType(mimeType)
+      const audioBlob = new Blob([new Uint8Array(audioBuffer)], { type: mimeType })
+      formData.append('file', audioBlob, `audio.${extension}`)
+    }
+    // Add model
+    const model = params.model || 'voxtral-mini-latest'
+    formData.append('model', model)
+    // Add language if provided
+    if (params.language) {
+      formData.append('language', params.language)
+    }
+    // Add diarization - enabled by default (no extra cost)
+    // NOTE(review): with `language` set and `diarize` left at its default, `diarize=true` is sent
+    // without `timestamp_granularities` — confirm the API accepts that combination.
+    if (diarize) {
+      formData.append('diarize', 'true')
+    }
+    // Add timestamp granularity - default to 'segment' (required when diarize is true)
+    // Note: timestamp_granularities is incompatible with language parameter
+    // See: https://docs.mistral.ai/capabilities/audio_transcription
+    const timestampGranularity = params.language ? undefined : (params.timestampGranularity ?? 'segment')
+    if (timestampGranularity) {
+      formData.append('timestamp_granularities', timestampGranularity)
+    }
+    // Context biasing terms (custom vocabulary) — Voxtral-specific.
+    // Sent as repeated `context_bias[]` form fields per Mistral API spec.
+    if (params.contextBias && params.contextBias.length > 0) {
+      if (params.contextBias.length > VOXTRAL_LIMITS.maxContextBiasingTerms) {
+        throw new Error(
+          `contextBias has ${params.contextBias.length} terms; Voxtral limit is ${VOXTRAL_LIMITS.maxContextBiasingTerms}`
+        )
+      }
+      for (const term of params.contextBias) {
+        formData.append('context_bias[]', term)
+      }
+    }
+    // POST to Mistral API
+    const response = await fetch('https://api.mistral.ai/v1/audio/transcriptions', {
+      method: 'POST',
+      headers: {
+        'Authorization': `Bearer ${params.apiKey}`,
+      },
+      body: formData,
+    })
+    // Handle errors
+    if (!response.ok) {
+      const errorText = await response.text()
+      return { text: '', error: `API returned ${response.status}: ${errorText}` }
+    }
+    // Parse response
+    const result = await response.json()
+    // An absent or empty `text` is reported as a failure.
+    if (!result?.text) {
+      return { text: '', error: 'No transcription returned', rawResponse: result }
+    }
+    // Normalize segments to ms + canonical `diarization` field.
+    // Voxtral 2 returns seconds; the wovin annotation schema mandates integer ms.
+    const segments: TranscriptSegment[] | undefined = Array.isArray(result.segments) && result.segments.length > 0
+      ? result.segments.map((seg: any): TranscriptSegment => ({
+          startMs: Math.round((seg.start ?? 0) * 1000),
+          endMs: Math.round((seg.end ?? 0) * 1000),
+          text: seg.text ?? '',
+          ...(seg.speaker_id !== undefined ? { diarization: seg.speaker_id } : {}),
+        }))
+      : undefined
+    // Only surface `words` when the API actually returned them (granularity='word' path,
+    // or nested seg.words). Do not fabricate an empty array — consumers check presence.
+    let words: any[] | undefined
+    if (Array.isArray(result.words) && result.words.length > 0) {
+      words = result.words
+    } else if (Array.isArray(result.segments)) {
+      const nested = result.segments.flatMap((seg: any) => seg.words ?? [])
+      if (nested.length > 0) words = nested
+    }
+    const duration = result.usage?.prompt_audio_seconds
+    return {
+      text: result.text,
+      language: result.language ?? params.language,
+      model: result.model,
+      duration: duration,
+      ...(words ? { words } : {}),
+      ...(segments ? { segments } : {}),
+      rawResponse: result,
+    }
+  }
+}
+/**
+ * GreenPT transcription provider
+ * Uses GreenPT API for audio transcription with Deepgram-compatible response format
+ * Accepts audio either as a file path or as an in-memory buffer.
+ */
+export class GreenPTProvider implements TranscriptionProvider {
+  name = 'greenpt'
+  /**
+   * Transcribe audio through the GreenPT `/v1/listen` endpoint.
+   * `audioPath` takes precedence when both it and `audioBuffer` are given; `mimeType` is
+   * only consulted for buffer input.
+   * @param params - Transcription parameters (`apiKey` and `audioPath` or `audioBuffer` are required)
+   * @returns The transcript, or a result with `error` set on any failure — this method
+   *          catches exceptions and never rejects
+   */
+  async transcribe(params: TranscribeParams): Promise<TranscriptionResult> {
+    if (!params.apiKey) {
+      return { text: '', error: 'API key is required for GreenPT provider' }
+    }
+    try {
+      // Resolve the audio bytes: read the file when a path is given, otherwise use the buffer.
+      const audioBuffer = params.audioPath
+        ? fs.readFileSync(params.audioPath)
+        : params.audioBuffer
+      if (!audioBuffer) {
+        return { text: '', error: 'Audio input is required (audioPath or audioBuffer)' }
+      }
+      // File input is always sniffed; buffer input honours an explicit mimeType first.
+      const mimeType = params.audioPath
+        ? detectAudioMimeType(audioBuffer)
+        : params.mimeType || detectAudioMimeType(audioBuffer)
+      // Create FormData with audio Blob
+      const formData = new FormData()
+      const extension = mimeType === 'audio/mpeg'
+        ? 'mp3'
+        : mimeType === 'audio/wav'
+          ? 'wav'
+          : mimeType === 'audio/flac'
+            ? 'flac'
+            : 'ogg'
+      const audioBlob = new Blob([new Uint8Array(audioBuffer)], { type: mimeType })
+      formData.append('file', audioBlob, `audio.${extension}`)
+      // Build query parameters
+      const queryParams = new URLSearchParams()
+      const model = params.model || 'green-s-pro'
+      queryParams.append('model', model)
+      if (params.language) {
+        queryParams.append('language', params.language)
+      }
+      // Forward diarize only when the caller set it, leaving the API default otherwise.
+      if (params.diarize !== undefined) {
+        queryParams.append('diarize', String(params.diarize))
+      }
+      // Default punctuate to true
+      queryParams.append('punctuate', 'true')
+      // Build URL with query parameters
+      const url = `https://api.greenpt.ai/v1/listen?${queryParams.toString()}`
+      // Make API request
+      const response = await fetch(url, {
+        method: 'POST',
+        headers: {
+          'Authorization': `Bearer ${params.apiKey}`,
+        },
+        body: formData,
+      })
+      if (!response.ok) {
+        const errorText = await response.text()
+        return {
+          text: '',
+          error: `API returned ${response.status}: ${errorText}`,
+        }
+      }
+      const result = await response.json()
+      // Extract transcription from Deepgram-compatible response format
+      const channel = result?.results?.channels?.[0]
+      const transcript = channel?.alternatives?.[0]
+      if (!transcript) {
+        return {
+          text: '',
+          error: 'No transcription returned',
+          rawResponse: result,
+        }
+      }
+      const hasWords = Array.isArray(transcript.words) && transcript.words.length > 0
+      // Build text from transcript field, or from words if transcript is missing
+      let text = transcript.transcript
+      if (!text && hasWords) {
+        text = transcript.words.map((w: any) => w.word).join(' ')
+      }
+      // Deepgram-style responses may carry `detected_language` on the channel — presumably
+      // GreenPT does too (verify against the API); otherwise echo the requested language.
+      const language = channel?.detected_language ?? params.language
+      return {
+        text: text || '',
+        confidence: transcript.confidence,
+        // Leave `words` undefined rather than `[]`, matching the TranscriptionResult contract.
+        ...(hasWords ? { words: transcript.words } : {}),
+        ...(language ? { language } : {}),
+        duration: result?.metadata?.duration,
+        rawResponse: result,
+      }
+    } catch (error) {
+      const errorMessage = error instanceof Error ? error.message : String(error)
+      return {
+        text: '',
+        error: `Transcription failed: ${errorMessage}`,
+      }
+    }
+  }
+}