npm - streaming-sortformer-node - Versions diffs - 0.1.0 - Mend

streaming-sortformer-node 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32) hide show

package/README.md +369 -0
package/dist/Sortformer.d.ts +104 -0
package/dist/Sortformer.d.ts.map +1 -0
package/dist/Sortformer.js +221 -0
package/dist/Sortformer.js.map +1 -0
package/dist/StreamingSession.d.ts +88 -0
package/dist/StreamingSession.d.ts.map +1 -0
package/dist/StreamingSession.js +128 -0
package/dist/StreamingSession.js.map +1 -0
package/dist/binding.d.ts +8 -0
package/dist/binding.d.ts.map +1 -0
package/dist/binding.js +35 -0
package/dist/binding.js.map +1 -0
package/dist/index.d.ts +5 -0
package/dist/index.d.ts.map +1 -0
package/dist/index.js +4 -0
package/dist/index.js.map +1 -0
package/dist/presets.d.ts +45 -0
package/dist/presets.d.ts.map +1 -0
package/dist/presets.js +68 -0
package/dist/presets.js.map +1 -0
package/dist/types.d.ts +107 -0
package/dist/types.d.ts.map +1 -0
package/dist/types.js +5 -0
package/dist/types.js.map +1 -0
package/package.json +31 -0
package/src/Sortformer.ts +253 -0
package/src/StreamingSession.ts +143 -0
package/src/binding.ts +41 -0
package/src/index.ts +13 -0
package/src/presets.ts +88 -0
package/src/types.ts +121 -0

package/src/Sortformer.ts ADDED Viewed

@@ -0,0 +1,253 @@
+/**
+ * TypeScript wrapper for the native SortFormer speaker diarization model
+ */
+import type { LoadOptions, DiarizeOptions, DiarizeResult, StreamingSessionOptions, StreamingPreset } from './types.js';
+import { LATENCY_PRESETS, OFFLINE_PARAMS } from './presets.js';
+import { getBinding } from './binding.js';
+import { StreamingSession } from './StreamingSession.js';
+/**
+ * SortFormer speaker diarization model wrapper
+ *
+ * Provides a high-level TypeScript API for loading and running the native
+ * SortFormer model for streaming speaker diarization.
+ *
+ * Instances are created through the static `load()` factory (the constructor
+ * is private). Call `close()` when finished with the model.
+ *
+ * @example
+ * ```typescript
+ * const model = await Sortformer.load('./model.gguf', { threads: 4 });
+ * const result = await model.diarize(audioData, { mode: 'streaming', latency: '2s' });
+ * console.log(result.rttm);
+ * model.close();
+ * ```
+ */
+export class Sortformer {
+  /**
+   * Numeric preset identifiers handed to the native StreamingSession
+   * constructor, keyed by the public preset name.
+   */
+  private static readonly STREAMING_PRESET_IDS: Record<StreamingPreset, number> = {
+    'low': 0,  // SORTFORMER_PRESET_LOW_LATENCY
+    '2s': 1,   // SORTFORMER_PRESET_2S
+    '3s': 2,   // SORTFORMER_PRESET_3S
+    '5s': 3,   // SORTFORMER_PRESET_5S
+  };
+  /** Native SortformerModel instance obtained from the platform binding. */
+  private readonly native: any;
+  /** Set once close() has completed; guards every public operation. */
+  private closed = false;
+  /**
+   * Private constructor - use static load() method instead
+   * @param native - Native SortformerModel instance from binding
+   */
+  private constructor(native: any) {
+    this.native = native;
+  }
+  /**
+   * Own-property membership test for lookup tables.
+   *
+   * A plain `table[key]` also resolves members inherited from
+   * Object.prototype ('constructor', 'toString', ...), which would let an
+   * unchecked string from a JavaScript caller slip past an "unknown key" check.
+   */
+  private static hasOwnKey(table: object, key: PropertyKey): boolean {
+    return Object.prototype.hasOwnProperty.call(table, key);
+  }
+  /**
+   * Resolve the chunking parameters for the requested diarization mode.
+   *
+   * @param options - Caller options; only `mode` and `latency` are read
+   * @returns The four chunking fields to forward to the native call
+   * @throws Error if the mode or the latency preset is not recognised
+   */
+  private static resolveChunkParams(options?: DiarizeOptions): typeof OFFLINE_PARAMS {
+    const mode = options?.mode ?? 'offline';
+    if (mode === 'streaming') {
+      const latency = options?.latency ?? '2s';
+      if (!Sortformer.hasOwnKey(LATENCY_PRESETS, latency)) {
+        throw new Error(`Unknown latency preset: ${latency}`);
+      }
+      return LATENCY_PRESETS[latency];
+    }
+    if (mode === 'offline') {
+      return OFFLINE_PARAMS;
+    }
+    throw new Error(`Unknown diarization mode: ${mode}`);
+  }
+  /**
+   * Check that the value returned by the native diarize call has the
+   * DiarizeResult shape before it is handed to the caller.
+   *
+   * @param result - Raw value returned by the native binding
+   * @returns The same object, typed as DiarizeResult
+   * @throws Error describing the first field that fails validation
+   */
+  private static checkDiarizeResult(result: any): DiarizeResult {
+    if (!result || typeof result !== 'object') {
+      throw new Error('Native diarization returned invalid result');
+    }
+    if (typeof result.rttm !== 'string') {
+      throw new Error('Native diarization result missing rttm string');
+    }
+    if (!(result.predictions instanceof Float32Array)) {
+      throw new Error('Native diarization result predictions must be Float32Array');
+    }
+    if (!Number.isInteger(result.frameCount) || result.frameCount < 0) {
+      throw new Error('Native diarization result frameCount must be non-negative integer');
+    }
+    if (!Number.isInteger(result.speakerCount) || result.speakerCount < 1 || result.speakerCount > 4) {
+      throw new Error('Native diarization result speakerCount must be 1-4');
+    }
+    return result as DiarizeResult;
+  }
+  /**
+   * Load a SortFormer model from a GGUF file
+   *
+   * @param modelPath - Path to the GGUF model file
+   * @param options - Optional loading configuration (`threads` defaults to 4)
+   * @returns Promise resolving to a loaded Sortformer instance
+   * @throws TypeError if `modelPath` is not a non-empty string
+   * @throws Error if `threads` is not a positive integer, if the native
+   *   binding cannot be loaded, or whatever the native constructor throws
+   *   (presumably e.g. for an unreadable model file - confirm against the binding)
+   *
+   * @example
+   * ```typescript
+   * const model = await Sortformer.load('./model.gguf', { threads: 8 });
+   * ```
+   */
+  static async load(modelPath: string, options?: LoadOptions): Promise<Sortformer> {
+    if (!modelPath || typeof modelPath !== 'string') {
+      throw new TypeError('modelPath must be a non-empty string');
+    }
+    const binding = getBinding();
+    // Default to 4 threads if not specified
+    const threads = options?.threads ?? 4;
+    if (threads < 1 || !Number.isInteger(threads)) {
+      throw new Error('threads must be a positive integer');
+    }
+    const native = new binding.SortformerModel(modelPath, threads);
+    return new Sortformer(native);
+  }
+  /**
+   * Run diarization inference on audio samples
+   *
+   * Validation happens in this order: model open, audio type, audio length,
+   * `threshold`, `medianFilter`, then mode / latency preset. The native call
+   * is made only after all checks pass, and its result is validated before
+   * being returned.
+   *
+   * @param audio - Audio samples as Float32Array (16kHz mono)
+   * @param options - Optional diarization configuration
+   * @returns Promise resolving to diarization results (RTTM + predictions)
+   * @throws TypeError if `audio` is not a Float32Array
+   * @throws Error if model is closed, audio is empty, an option is out of
+   *   range, or the native result is malformed
+   *
+   * @example
+   * ```typescript
+   * const result = await model.diarize(audioData, {
+   *   mode: 'streaming',
+   *   latency: '2s',
+   *   threshold: 0.5,
+   *   medianFilter: 11
+   * });
+   * ```
+   */
+  async diarize(audio: Float32Array, options?: DiarizeOptions): Promise<DiarizeResult> {
+    if (this.closed) {
+      throw new Error('Model is closed. Cannot perform diarization.');
+    }
+    if (!(audio instanceof Float32Array)) {
+      throw new TypeError('audio must be a Float32Array');
+    }
+    if (audio.length === 0) {
+      throw new Error('audio cannot be empty');
+    }
+    const threshold = options?.threshold;
+    if (threshold !== undefined) {
+      // NaN is typeof 'number' and fails both range comparisons, so it has to
+      // be rejected explicitly or it would be forwarded to the native code.
+      if (typeof threshold !== 'number' || Number.isNaN(threshold) || threshold < 0 || threshold > 1) {
+        throw new Error('threshold must be a number between 0 and 1');
+      }
+    }
+    const medianFilter = options?.medianFilter;
+    if (medianFilter !== undefined) {
+      if (!Number.isInteger(medianFilter) || medianFilter < 1 || medianFilter % 2 === 0) {
+        throw new Error('medianFilter must be a positive odd integer');
+      }
+    }
+    // Map user-friendly options to native format
+    const { chunkLen, rightContext, fifoLen, spkcacheUpdatePeriod } =
+      Sortformer.resolveChunkParams(options);
+    const nativeOptions = {
+      threshold: threshold ?? 0.5,
+      medianFilter: medianFilter ?? 11,
+      chunkLen,
+      rightContext,
+      fifoLen,
+      spkcacheUpdatePeriod,
+    };
+    const result = await this.native.diarize(audio, nativeOptions);
+    return Sortformer.checkDiarizeResult(result);
+  }
+  /**
+   * Close the model and free native resources
+   *
+   * After calling close(), the model cannot be used for further inference.
+   * Calling close() multiple times is safe (idempotent). If the native
+   * close() throws, the model is left marked as open and the error propagates.
+   *
+   * @example
+   * ```typescript
+   * model.close();
+   * ```
+   */
+  close(): void {
+    if (this.closed) {
+      return;
+    }
+    if (this.native && typeof this.native.close === 'function') {
+      this.native.close();
+    }
+    this.closed = true;
+  }
+  /**
+   * Check if the model is closed
+   * @returns true if the model has been closed, false otherwise
+   */
+  isClosed(): boolean {
+    return this.closed;
+  }
+  /**
+   * Create a streaming session for incremental audio processing
+   *
+   * The streaming session maintains state (speaker cache, FIFO buffer)
+   * across feed() calls, enabling true real-time diarization.
+   *
+   * NOTE(review): the session is constructed from this model's native handle;
+   * whether closing the model while a session is still open is safe depends
+   * on the native binding - confirm.
+   *
+   * @param options - Optional streaming configuration (`preset` defaults to '2s')
+   * @returns A new StreamingSession instance
+   * @throws Error if model is closed or the preset name is not recognised
+   *
+   * @example
+   * ```typescript
+   * const session = model.createStreamingSession({ preset: 'low' });
+   *
+   * // Feed audio chunks as they arrive
+   * const result1 = session.feed(chunk1);
+   * const result2 = session.feed(chunk2);
+   *
+   * // Accumulate predictions
+   * const allPreds = [...result1.predictions, ...result2.predictions];
+   *
+   * session.close();
+   * ```
+   */
+  createStreamingSession(options?: StreamingSessionOptions): StreamingSession {
+    if (this.closed) {
+      throw new Error('Model is closed. Cannot create streaming session.');
+    }
+    const preset = options?.preset ?? '2s';
+    // Own-key check so inherited names such as 'constructor' are rejected
+    // instead of being forwarded to the native constructor as a preset id.
+    if (!Sortformer.hasOwnKey(Sortformer.STREAMING_PRESET_IDS, preset)) {
+      throw new Error(`Unknown preset: ${preset}`);
+    }
+    const presetNum = Sortformer.STREAMING_PRESET_IDS[preset];
+    const binding = getBinding();
+    const nativeSession = new binding.StreamingSession(this.native, presetNum);
+    return new StreamingSession(nativeSession);
+  }
+}

package/src/StreamingSession.ts ADDED Viewed

@@ -0,0 +1,143 @@
+/**
+ * TypeScript wrapper for native StreamingSession
+ */
+import type { FeedResult, StreamingPreset } from './types.js';
+/**
+ * Stateful handle for incremental (streaming) speaker diarization.
+ *
+ * A thin wrapper over the native StreamingSession object: each operation
+ * first checks that the session is still open and then forwards to the
+ * native instance. Streaming state is kept in native C code; this class
+ * itself stores only the native handle and a closed flag.
+ *
+ * @example
+ * ```typescript
+ * const session = model.createStreamingSession({ preset: '2s' });
+ *
+ * // Feed audio chunks as they arrive
+ * const result1 = session.feed(chunk1);
+ * const result2 = session.feed(chunk2);
+ *
+ * // Get total frames processed
+ * console.log(session.totalFrames);
+ *
+ * // Reset for new audio stream
+ * session.reset();
+ *
+ * // Clean up
+ * session.close();
+ * ```
+ */
+export class StreamingSession {
+  private native: any;
+  private released: boolean = false;
+  /**
+   * Wrap a native streaming session.
+   * @param native - Native StreamingSession instance from binding
+   * @internal
+   */
+  constructor(native: any) {
+    this.native = native;
+  }
+  /** Throw the shared "closed" error when the session has been closed. */
+  private ensureOpen(): void {
+    if (this.released) {
+      throw new Error('Session is closed');
+    }
+  }
+  /** Copy only the two FeedResult fields out of a raw native result. */
+  private static pick(raw: any): FeedResult {
+    const { predictions, frameCount } = raw;
+    return { predictions, frameCount };
+  }
+  /**
+   * Push audio samples into the session and receive this chunk's predictions.
+   *
+   * @param audio - Audio samples as Float32Array (16kHz mono)
+   * @returns Predictions for the new frames in this chunk
+   * @throws Error if session is closed
+   * @throws TypeError if `audio` is not a Float32Array
+   *
+   * @example
+   * ```typescript
+   * const audio = new Float32Array(48000); // 3 seconds
+   * const result = session.feed(audio);
+   * console.log(`Got ${result.frameCount} new frames`);
+   * ```
+   */
+  feed(audio: Float32Array): FeedResult {
+    this.ensureOpen();
+    const isFloat32 = audio instanceof Float32Array;
+    if (!isFloat32) {
+      throw new TypeError('audio must be a Float32Array');
+    }
+    return StreamingSession.pick(this.native.feed(audio));
+  }
+  /**
+   * Drain whatever audio is still buffered once the stream has ended.
+   *
+   * Call this when the audio stream ends to process any remaining
+   * buffered audio that hasn't been output yet due to latency buffering.
+   *
+   * @returns Final predictions for buffered audio
+   * @throws Error if session is closed
+   */
+  flush(): FeedResult {
+    this.ensureOpen();
+    return StreamingSession.pick(this.native.flush());
+  }
+  /**
+   * Start over for a new audio stream without reloading the model.
+   *
+   * Clears all internal buffers (spkcache, fifo, mel overlap) while
+   * keeping the model loaded. Use this when starting a new recording.
+   *
+   * @throws Error if session is closed
+   */
+  reset(): void {
+    this.ensureOpen();
+    this.native.reset();
+  }
+  /**
+   * Release the native session.
+   *
+   * After calling close(), the session cannot be used.
+   * Calling close() multiple times is safe (idempotent).
+   */
+  close(): void {
+    if (this.released) {
+      return;
+    }
+    const handle = this.native;
+    // Only invoke close when the handle actually exposes one.
+    if (handle && typeof handle.close === 'function') {
+      handle.close();
+    }
+    this.released = true;
+  }
+  /**
+   * Total frames output so far; reports 0 once the session is closed.
+   */
+  get totalFrames(): number {
+    return this.released ? 0 : this.native.getTotalFrames();
+  }
+  /**
+   * Whether close() has completed on this session.
+   */
+  get isClosed(): boolean {
+    return this.released;
+  }
+}

package/src/binding.ts ADDED Viewed

@@ -0,0 +1,41 @@
+import { createRequire } from 'module';
+// This file is an ES module (it uses import.meta), so build a CommonJS-style
+// `require` anchored at this module's URL for loading the platform package.
+const require = createRequire(import.meta.url);
+// Memoised result of getBinding(); stays null until a load succeeds.
+let cachedBinding: any = null;
+/**
+ * Get the native binding for the current platform
+ *
+ * Reads `process.platform` and `process.arch`, maps the pair to the matching
+ * platform-specific package and loads it with `require`. The loaded module is
+ * cached, so later calls return the same object without loading again.
+ *
+ * @returns The native module binding
+ * @throws Error if the platform/architecture pair is not supported
+ * @throws Error if the platform package cannot be loaded; the message includes
+ *   the underlying failure and the original error is attached as `cause`
+ */
+export function getBinding(): any {
+  if (cachedBinding) return cachedBinding;
+  // Supported "<platform>-<arch>" targets and the package that serves each.
+  const packagesByTarget: Record<string, string> = {
+    'darwin-arm64': '@streaming-sortformer-node/darwin-arm64',
+    'darwin-x64': '@streaming-sortformer-node/darwin-x64',
+  };
+  const platform = process.platform;
+  const arch = process.arch;
+  const target = `${platform}-${arch}`;
+  if (!Object.prototype.hasOwnProperty.call(packagesByTarget, target)) {
+    throw new Error(
+      `Unsupported platform: ${platform}-${arch}. ` +
+      `streaming-sortformer-node currently supports: darwin-arm64, darwin-x64`
+    );
+  }
+  const packageName = packagesByTarget[target];
+  try {
+    cachedBinding = require(packageName);
+    return cachedBinding;
+  } catch (e: unknown) {
+    // Keep the real reason: a missing package, an ABI mismatch and a missing
+    // shared library all need different fixes, and the generic hint alone
+    // cannot tell them apart.
+    const reason = e instanceof Error ? e.message : String(e);
+    const failure = new Error(
+      `Failed to load native binding from ${packageName}. ` +
+      `Make sure the package is installed: npm install ${packageName}` +
+      ` (underlying error: ${reason})`
+    );
+    // Assigned as a property rather than via the ES2022 options argument so
+    // the code does not depend on a newer lib target.
+    (failure as Error & { cause?: unknown }).cause = e;
+    throw failure;
+  }
+}

package/src/index.ts ADDED Viewed

@@ -0,0 +1,13 @@
+// Public entry point of the package: re-exports the two wrapper classes, the
+// option/result types, and the preset tables. The getPresetParams /
+// getOfflineParams helpers and the PresetParams type from presets.ts are not
+// re-exported here.
+export { Sortformer } from './Sortformer.js';
+export { StreamingSession } from './StreamingSession.js';
+export type {
+  LoadOptions,
+  DiarizeOptions,
+  DiarizeResult,
+  LatencyPreset,
+  DiarizeMode,
+  StreamingPreset,
+  StreamingSessionOptions,
+  FeedResult,
+} from './types.js';
+export { LATENCY_PRESETS, OFFLINE_PARAMS } from './presets.js';

package/src/presets.ts ADDED Viewed

@@ -0,0 +1,88 @@
+/**
+ * Latency presets for streaming diarization
+ * Maps preset names to their corresponding parameter configurations
+ */
+import type { LatencyPreset } from './types';
+/**
+ * Streaming latency preset parameters
+ * Each preset controls chunk processing, buffering, and speaker cache update behavior
+ *
+ * Sortformer.diarize copies these four fields, unchanged, into the options
+ * object it passes to the native diarize call.
+ */
+export interface PresetParams {
+  /** Chunk length in frames (16kHz, hop=160) */
+  chunkLen: number;
+  /** Right context frames for conformer processing */
+  rightContext: number;
+  /** FIFO buffer length in frames */
+  fifoLen: number;
+  /** Speaker cache update period in frames */
+  spkcacheUpdatePeriod: number;
+}
+/**
+ * Streaming latency presets
+ * - 'low': ~188ms latency, minimal buffering
+ * - '2s': ~2 second latency
+ * - '3s': ~3 second latency
+ * - '5s': ~5 second latency
+ *
+ * Looked up by Sortformer.diarize when mode is 'streaming', and by
+ * getPresetParams.
+ *
+ * NOTE(review): the latency figures above are the author's; they cannot be
+ * derived from the numbers in this table alone - confirm against the model docs.
+ * NOTE(review): the exported type comes from the explicit annotation, so
+ * `as const` does not make the entries readonly for consumers; the object is
+ * also not frozen at runtime.
+ */
+export const LATENCY_PRESETS: Record<LatencyPreset, PresetParams> = {
+  'low': {
+    chunkLen: 6,
+    rightContext: 7,
+    fifoLen: 188,
+    spkcacheUpdatePeriod: 144,
+  },
+  '2s': {
+    chunkLen: 15,
+    rightContext: 10,
+    fifoLen: 100,
+    spkcacheUpdatePeriod: 144,
+  },
+  '3s': {
+    chunkLen: 30,
+    rightContext: 7,
+    fifoLen: 100,
+    spkcacheUpdatePeriod: 100,
+  },
+  '5s': {
+    chunkLen: 55,
+    rightContext: 7,
+    fifoLen: 100,
+    spkcacheUpdatePeriod: 100,
+  },
+} as const;
+/**
+ * Offline mode parameters
+ * Used when mode='offline' to process entire audio at once
+ *
+ * Sortformer.diarize applies these when no mode is given, since 'offline' is
+ * its default mode.
+ */
+export const OFFLINE_PARAMS: PresetParams = {
+  chunkLen: 188,
+  rightContext: 1,
+  fifoLen: 0,
+  spkcacheUpdatePeriod: 188,
+} as const;
+/**
+ * Get preset parameters by name
+ *
+ * Returns the entry stored in LATENCY_PRESETS itself (not a copy).
+ *
+ * @param preset - Latency preset name
+ * @returns Preset parameters
+ * @throws Error if preset is not one of the keys defined on LATENCY_PRESETS
+ */
+export function getPresetParams(preset: LatencyPreset): PresetParams {
+  // Test for an own key rather than truthiness of the lookup: an unchecked
+  // string such as 'constructor' or 'toString' resolves to a truthy member
+  // inherited from Object.prototype and would otherwise be returned as if it
+  // were a preset.
+  if (!Object.prototype.hasOwnProperty.call(LATENCY_PRESETS, preset)) {
+    throw new Error(`Unknown latency preset: ${preset}`);
+  }
+  return LATENCY_PRESETS[preset];
+}
+/**
+ * Get default preset parameters for offline mode
+ *
+ * Returns the shared OFFLINE_PARAMS object itself, not a copy.
+ *
+ * @returns Offline mode parameters
+ */
+export function getOfflineParams(): PresetParams {
+  return OFFLINE_PARAMS;
+}

package/src/types.ts ADDED Viewed

@@ -0,0 +1,121 @@
+/**
+ * TypeScript type definitions for streaming-sortformer-node
+ */
+/**
+ * Diarization mode: offline processes entire audio at once,
+ * streaming processes audio in chunks with latency control.
+ *
+ * Sortformer.diarize uses 'offline' when no mode is given and throws for any
+ * value other than these two.
+ */
+export type DiarizeMode = 'offline' | 'streaming';
+/**
+ * Latency preset for streaming mode
+ * - 'low': ~188ms latency, minimal buffering
+ * - '2s': ~2 second latency
+ * - '3s': ~3 second latency
+ * - '5s': ~5 second latency
+ *
+ * These names are the keys of the LATENCY_PRESETS table in presets.ts.
+ */
+export type LatencyPreset = 'low' | '2s' | '3s' | '5s';
+/**
+ * Options for loading a SortFormer model
+ */
+export interface LoadOptions {
+  /**
+   * Number of CPU threads to use for inference.
+   * Must be a positive integer; Sortformer.load throws otherwise.
+   * @default 4
+   */
+  threads?: number;
+}
+/**
+ * Options for diarization inference
+ *
+ * All fields are optional; Sortformer.diarize validates the ones supplied and
+ * fills in the documented defaults for the rest.
+ */
+export interface DiarizeOptions {
+  /**
+   * Diarization mode: 'offline' or 'streaming'
+   * @default 'offline'
+   */
+  mode?: DiarizeMode;
+  /**
+   * Latency preset for streaming mode
+   * Only used when mode='streaming'
+   * @default '2s'
+   */
+  latency?: LatencyPreset;
+  /**
+   * Speaker activity threshold (0.0 to 1.0, both ends accepted)
+   * Frames with prediction >= threshold are considered active
+   * @default 0.5
+   */
+  threshold?: number;
+  /**
+   * Median filter window size for smoothing predictions
+   * Must be a positive odd integer
+   * @default 11
+   */
+  medianFilter?: number;
+}
+/**
+ * Result of diarization inference
+ *
+ * Sortformer.diarize checks the type and range of every field below on the
+ * object returned by the native binding before handing it to the caller.
+ */
+export interface DiarizeResult {
+  /**
+   * RTTM format output (speaker diarization segments)
+   * Format: SPEAKER <filename> <channel> <start> <duration> <conf> <spk_type> <spk_id> <score>
+   */
+  rttm: string;
+  /**
+   * Raw per-frame speaker activity predictions
+   * Shape: [frameCount, 4] (4 speakers max)
+   * Values: 0.0 to 1.0 (probability of speaker activity)
+   */
+  predictions: Float32Array;
+  /**
+   * Number of frames in the output (non-negative integer)
+   */
+  frameCount: number;
+  /**
+   * Number of speakers detected (1-4)
+   */
+  speakerCount: number;
+}
+/**
+ * Streaming preset type
+ *
+ * Has the same members as LatencyPreset but is declared separately.
+ * Sortformer.createStreamingSession maps each name to a numeric preset id
+ * (0-3) for the native session.
+ */
+export type StreamingPreset = 'low' | '2s' | '3s' | '5s';
+/**
+ * Options for creating a streaming session
+ * (passed to Sortformer.createStreamingSession)
+ */
+export interface StreamingSessionOptions {
+  /**
+   * Latency preset
+   * @default '2s'
+   */
+  preset?: StreamingPreset;
+}
+/**
+ * Result from feeding audio to streaming session
+ *
+ * Returned by both StreamingSession.feed and StreamingSession.flush.
+ */
+export interface FeedResult {
+  /**
+   * Per-frame speaker predictions for this chunk
+   * Shape: [frameCount, 4]
+   */
+  predictions: Float32Array;
+  /**
+   * Number of new frames in this result
+   */
+  frameCount: number;
+}