npm - @framers/agentos - Versions diffs - 0.1.107 → 0.1.109 - Mend

@framers/agentos 0.1.107 → 0.1.109

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (62) hide show

package/dist/voice-pipeline/HeuristicEndpointDetector.js CHANGED Viewed

@@ -6,14 +6,30 @@
  * finished speaking. Suitable for low-latency deployments where an LLM-based
  * semantic detector would add unacceptable round-trip overhead.
  *
- * Detection strategy:
- *   1. On `speech_end`, if the accumulated final transcript ends with `.`, `?`, or `!`,
- *      fire `turn_complete` immediately with reason `'punctuation'`.
- *   2. Otherwise, start a silence timer (default 1 500 ms). If speech does not
- *      resume before the timer fires, emit `turn_complete` with reason `'silence_timeout'`.
- *   3. Backchannel phrases (e.g. "uh huh", "yeah") are recognised, suppressed from
- *      accumulation, and re-emitted as `'backchannel_detected'` events so the
- *      pipeline can decide whether to suppress an agent response.
+ * ## Detection strategy
+ *
+ * 1. On `speech_end`, if the accumulated final transcript ends with `.`, `?`,
+ *    or `!`, fire `turn_complete` immediately with reason `'punctuation'`.
+ *    This provides the lowest-latency turn handoff for well-punctuated speech.
+ *
+ * 2. Otherwise, start a silence timer (default 1,500 ms). If speech does not
+ *    resume before the timer fires, emit `turn_complete` with reason
+ *    `'silence_timeout'`. The timeout acts as a safety net for STT providers
+ *    that don't produce terminal punctuation reliably.
+ *
+ * 3. Backchannel phrases (e.g. "uh huh", "yeah") are recognised, suppressed
+ *    from accumulation, and re-emitted as `'backchannel_detected'` events so
+ *    the pipeline can decide whether to suppress an agent response.
+ *
+ * ## Why heuristic over acoustic-only?
+ *
+ * Pure silence timeout adds up to 1.5 s of unnecessary latency on every turn
+ * when the user ends a sentence cleanly. By checking for terminal punctuation,
+ * this detector can fire turn_complete immediately, cutting perceived latency
+ * by more than half for typical conversational speech.
+ *
+ * @see {@link AcousticEndpointDetector} for the purely acoustic alternative.
+ * @see {@link IEndpointDetector} for the interface contract.
  */
 import { EventEmitter } from 'node:events';
 // ---------------------------------------------------------------------------
@@ -21,15 +37,41 @@ import { EventEmitter } from 'node:events';
 // ---------------------------------------------------------------------------
 /**
  * Default silence duration (ms) after speech stops before firing `turn_complete`.
+ *
+ * 1,500 ms was chosen as a balance between:
+ * - Too short (< 800 ms): Fires mid-sentence when the user pauses to think.
+ * - Too long (> 2,000 ms): Adds noticeable latency, making the agent feel slow.
+ *
+ * This value is consistent with research on conversational turn-taking gaps
+ * (Stivers et al., 2009: modal gap ~200 ms, but STT adds 200-500 ms latency,
+ * and users expect the agent to wait slightly longer than a human would).
  */
 const DEFAULT_SILENCE_TIMEOUT_MS = 1500;
 /**
- * Terminal punctuation characters that signal sentence completion.
+ * Regular expression matching terminal punctuation characters that signal
+ * sentence completion. Only tested against the final character of the
+ * accumulated transcript text.
+ *
+ * We deliberately exclude semicolons, colons, and ellipses because they
+ * rarely indicate turn completion in spoken language.
  */
 const TERMINAL_PUNCTUATION = /[.?!]$/;
 /**
  * Normalised backchannel phrases that indicate the listener is acknowledging
  * but not taking a full conversational turn. Compared after `.trim().toLowerCase()`.
+ *
+ * These 13 phrases were selected because:
+ * - They are the most common English-language backchannel markers in the
+ *   Switchboard and Fisher telephone conversation corpora.
+ * - They are short enough that STT providers reliably produce them as
+ *   standalone final transcripts (longer phrases like "I see" risk being
+ *   part of a larger utterance).
+ * - Including multiple spellings of common variants (e.g. "mm hmm", "mmhmm",
+ *   "mm-hmm", "mhm") ensures robust matching across STT providers that
+ *   normalise differently.
+ *
+ * The set is intentionally conservative -- adding phrases like "I see" or
+ * "go on" risks false positives when the user is genuinely taking a turn.
  */
 const BACKCHANNEL_PHRASES = new Set([
     'uh huh',
@@ -53,18 +95,25 @@ const BACKCHANNEL_PHRASES = new Set([
  * Heuristic endpoint detector that uses terminal punctuation and a silence
  * timeout to decide when the user's turn is complete.
  *
- * Emits:
- * - `'turn_complete'` ({@link TurnCompleteEvent}) — user turn has ended.
- * - `'backchannel_detected'` (`{ text: string }`) — a backchannel phrase was
- *   recognised; accumulation is suppressed for this utterance.
+ * ## Events emitted
+ *
+ * | Event                    | Payload                  | Description                        |
+ * |--------------------------|--------------------------|------------------------------------|
+ * | `'turn_complete'`        | {@link TurnCompleteEvent}| User turn has ended.               |
+ * | `'backchannel_detected'` | `{ text: string }`       | Backchannel phrase was recognised.  |
+ *
+ * @see {@link IEndpointDetector} for the interface contract.
+ * @see {@link AcousticEndpointDetector} for the purely acoustic alternative.
  *
  * @example
  * ```typescript
  * const detector = new HeuristicEndpointDetector({ silenceTimeoutMs: 1000 });
  * detector.on('turn_complete', (event) => console.log('Turn done:', event));
+ *
+ * // Simulate a punctuated sentence followed by speech_end
  * detector.pushTranscript({ text: 'Hello there.', isFinal: true, confidence: 0.95, words: [] });
  * detector.pushVadEvent({ type: 'speech_end', timestamp: Date.now(), source: 'vad' });
- * // → 'turn_complete' fires immediately with reason 'punctuation'
+ * // -> 'turn_complete' fires immediately with reason 'punctuation'
  * ```
  */
 export class HeuristicEndpointDetector extends EventEmitter {
@@ -74,80 +123,118 @@ export class HeuristicEndpointDetector extends EventEmitter {
     /**
      * Create a new {@link HeuristicEndpointDetector}.
      *
-     * @param options — Optional configuration overrides.
+     * @param options - Optional configuration overrides.
      */
     constructor(options = {}) {
         super();
         /**
          * Active detection strategy label.
-         * Typed as `'hybrid'` to satisfy {@link IEndpointDetector.mode}; consumers
-         * that need to distinguish heuristic detectors may inspect `instanceof`.
+         * Always `'heuristic'` for this implementation.
+         *
+         * @see {@link IEndpointDetector.mode}
          */
         this.mode = 'heuristic';
-        /** The latest final transcript text accumulated for the current turn. */
+        /**
+         * The latest final transcript text accumulated for the current turn.
+         * Only updated by final (non-interim) transcript events.
+         * Reset to empty string after each `turn_complete` emission.
+         */
         this.accumulatedText = '';
-        /** Whether the VAD currently reports active speech. */
+        /**
+         * Whether the VAD currently reports active speech. Set to `true` on
+         * `speech_start` and `false` on `speech_end`. Used to prevent the
+         * silence timer from starting while the user is still speaking.
+         */
         this.speechActive = false;
-        /** Handle to a pending silence timeout, or `null` if none is running. */
+        /**
+         * Handle to a pending silence timeout, or `null` if none is running.
+         * Cleared when speech resumes or when the detector is reset.
+         */
         this.silenceTimer = null;
-        /** Wall-clock timestamp (ms) when the current turn's speech started. */
+        /**
+         * Wall-clock timestamp (ms) when the current turn's speech started.
+         * Used to compute `durationMs` in the emitted {@link TurnCompleteEvent}.
+         * `null` when no speech has been detected in the current turn.
+         */
         this.turnStartMs = null;
-        /** Confidence of the most recent final transcript. */
+        /**
+         * Confidence of the most recent final transcript. Forwarded into the
+         * emitted {@link TurnCompleteEvent}. Defaults to 1 (perfect confidence)
+         * and is updated with each final transcript event.
+         */
         this.lastConfidence = 1;
         this.silenceTimeoutMs = options.silenceTimeoutMs ?? DEFAULT_SILENCE_TIMEOUT_MS;
     }
     // ---------------------------------------------------------------------------
-    // IEndpointDetector — pushTranscript
+    // IEndpointDetector -- pushTranscript
     // ---------------------------------------------------------------------------
     /**
      * Ingest a transcript event from the upstream STT session.
      *
      * Only final events (`isFinal: true`) affect internal state. Interim results
-     * are silently ignored — they may arrive very frequently and their text is
-     * unstable.
+     * are silently ignored because:
+     * 1. They arrive very frequently (10-50 per second) and would trigger
+     *    excessive punctuation checks.
+     * 2. Their text is unstable -- a word ending with "." may be revised in
+     *    the next interim result, causing false turn-completion signals.
      *
-     * If the final text is a recognised backchannel phrase the detector emits
-     * `'backchannel_detected'` and returns without accumulating the text, so that
-     * a subsequent `speech_end` event does not trigger `turn_complete`.
+     * If the final text is a recognised backchannel phrase, the detector emits
+     * `'backchannel_detected'` and returns WITHOUT accumulating the text. This
+     * prevents a subsequent `speech_end` event from triggering `turn_complete`
+     * for what was merely an acknowledgement, not a real conversational turn.
      *
-     * @param transcript — Transcript event from the STT session.
+     * @param transcript - Transcript event from the STT session.
      */
     pushTranscript(transcript) {
         if (!transcript.isFinal) {
-            // Ignore partial/interim hypotheses — they will be superseded.
+            // Ignore partial/interim hypotheses -- they will be superseded by
+            // a subsequent final result or revised interim.
             return;
         }
         const text = transcript.text;
         const normalised = text.trim().toLowerCase();
-        // Detect backchannel acknowledgements before accumulating.
+        // Check for backchannel phrases BEFORE accumulating. This ensures that
+        // "uh huh" followed by speech_end does NOT produce a turn_complete.
         if (BACKCHANNEL_PHRASES.has(normalised)) {
             this.emit('backchannel_detected', { text });
             return;
         }
         // Accumulate the final transcript and store the confidence score.
+        // We overwrite (not append) because each final event from the STT
+        // provider represents the complete hypothesis for the current utterance.
         this.accumulatedText = text;
         this.lastConfidence = transcript.confidence;
     }
     // ---------------------------------------------------------------------------
-    // IEndpointDetector — pushVadEvent
+    // IEndpointDetector -- pushVadEvent
     // ---------------------------------------------------------------------------
     /**
      * Ingest a VAD (voice activity detection) event.
      *
-     * - `speech_start`: marks the turn as active and cancels any pending silence
-     *   timer (the user resumed speaking before the timeout elapsed).
-     * - `speech_end`: if accumulated text is available, either fires
-     *   `turn_complete` immediately (punctuation) or starts the silence timer.
-     * - `silence`: heartbeat events are ignored; only explicit `speech_end`
-     *   drives the timeout logic.
+     * Event handling by type:
+     *
+     * - **`speech_start`**: Marks the turn as active and cancels any pending
+     *   silence timer (the user resumed speaking before the timeout elapsed).
+     *   This is critical for avoiding false turn-completion when the user
+     *   takes a brief pause mid-sentence.
+     *
+     * - **`speech_end`**: If accumulated text is available, either fires
+     *   `turn_complete` immediately (when text ends with terminal punctuation)
+     *   or starts the silence timer (when no punctuation is detected).
+     *
+     * - **`silence`**: Periodic heartbeat events are ignored. The silence timer
+     *   (started on `speech_end`) already handles delayed turn-completion
+     *   independently of heartbeat cadence.
      *
-     * @param event — VAD transition event.
+     * @param event - VAD transition event.
      */
     pushVadEvent(event) {
         switch (event.type) {
             case 'speech_start': {
                 this.speechActive = true;
+                // Cancel any pending silence timer -- the user is speaking again
                 this._clearSilenceTimer();
+                // Record turn start only once (first speech_start in this turn)
                 if (this.turnStartMs === null) {
                     this.turnStartMs = event.timestamp;
                 }
@@ -156,33 +243,41 @@ export class HeuristicEndpointDetector extends EventEmitter {
             case 'speech_end': {
                 this.speechActive = false;
                 if (!this.accumulatedText) {
-                    // Nothing to flush — no transcript arrived yet.
+                    // No transcript has arrived yet -- nothing to flush.
+                    // This can happen when the VAD detects a very short burst of
+                    // noise that doesn't produce any STT output.
                     break;
                 }
                 if (TERMINAL_PUNCTUATION.test(this.accumulatedText)) {
-                    // Sentence-terminal punctuation → fire immediately.
+                    // Sentence-terminal punctuation detected -> fire immediately.
+                    // This is the fast path that eliminates the 1.5 s silence wait.
                     this._emitTurnComplete('punctuation', event.timestamp);
                 }
                 else {
-                    // No punctuation → wait for silence timeout.
+                    // No terminal punctuation -> start the silence timer.
+                    // If the user doesn't resume speaking within silenceTimeoutMs,
+                    // we'll fire turn_complete with reason 'silence_timeout'.
                     this._startSilenceTimer(event.timestamp);
                 }
                 break;
             }
             case 'silence': {
-                // Periodic heartbeat — no action required; the silence timer already
-                // handles the delayed fire if one is pending.
+                // Periodic heartbeat -- no action required. The silence timer
+                // (if running) handles delayed turn-completion independently.
                 break;
             }
         }
     }
     // ---------------------------------------------------------------------------
-    // IEndpointDetector — reset
+    // IEndpointDetector -- reset
     // ---------------------------------------------------------------------------
     /**
      * Reset all internal state, cancel pending timers, and prepare the detector
-     * for the next user turn. Should be called by the pipeline after each
-     * `turn_complete` event before audio for the next turn begins to arrive.
+     * for the next user turn.
+     *
+     * Invoked once per `turn_complete` event by the detector itself when it
+     * emits, and again by the orchestrator's flush_complete handler, to ensure
+     * clean state before audio for the next turn begins to arrive.
      */
     reset() {
         this._clearSilenceTimer();
@@ -198,11 +293,17 @@ export class HeuristicEndpointDetector extends EventEmitter {
      * Snapshot the currently accumulated transcript into a `turn_complete` event,
      * reset internal state, and then emit it so the detector is ready for the next turn.
      *
-     * @param reason — The semantic reason driving this completion.
-     * @param speechEndTimestamp — Unix epoch ms timestamp of the `speech_end` event,
-     *   used to compute `durationMs`.
+     * The reset happens BEFORE the emit to ensure that any re-entrant listeners
+     * (e.g. a `turn_complete` listener that immediately calls pushVadEvent)
+     * see clean state.
+     *
+     * @param reason - The semantic reason driving this completion.
+     * @param speechEndTimestamp - Unix epoch ms timestamp of the `speech_end` event,
+     *   used to compute `durationMs` as `speechEndTimestamp - turnStartMs`.
      */
     _emitTurnComplete(reason, speechEndTimestamp) {
+        // Compute speech duration. Falls back to 0 if no speech_start was recorded
+        // (defensive: should not happen in normal operation).
         const durationMs = this.turnStartMs !== null ? speechEndTimestamp - this.turnStartMs : 0;
         const event = {
             transcript: this.accumulatedText,
@@ -216,9 +317,14 @@ export class HeuristicEndpointDetector extends EventEmitter {
     }
     /**
      * Start the silence-timeout timer. If the user does not resume speaking
-     * within {@link silenceTimeoutMs} ms the detector fires `turn_complete`.
+     * within {@link silenceTimeoutMs} ms, the detector fires `turn_complete`
+     * with reason `'silence_timeout'`.
+     *
+     * Any previously running silence timer is cleared first to prevent
+     * double-fires from rapid speech_end -> speech_start -> speech_end sequences.
      *
-     * @param speechEndTimestamp — Timestamp passed through to `_emitTurnComplete`.
+     * @param speechEndTimestamp - Timestamp passed through to {@link _emitTurnComplete}
+     *   for duration calculation.
      */
     _startSilenceTimer(speechEndTimestamp) {
         this._clearSilenceTimer();
@@ -229,6 +335,7 @@ export class HeuristicEndpointDetector extends EventEmitter {
     }
     /**
      * Cancel a pending silence timer without any side effects.
+     * Safe to call when no timer is active (no-op).
      */
     _clearSilenceTimer() {
         if (this.silenceTimer !== null) {

package/dist/voice-pipeline/HeuristicEndpointDetector.js.map CHANGED Viewed

	@@ -1 +1 @@
1	- {"version":3,"file":"HeuristicEndpointDetector.js","sourceRoot":"","sources":["../../src/voice-pipeline/HeuristicEndpointDetector.ts"],"names":[],"mappings":"AAAA~~;;;;;;;;;;;;;;;;GAgBG~~;AAEH,OAAO,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAQ3C,8EAA8E;AAC9E,YAAY;AACZ,8EAA8E;AAE9E~~;;GAEG~~;AACH,MAAM,0BAA0B,GAAG,IAAK,CAAC;AAEzC~~;;GAEG~~;AACH,MAAM,oBAAoB,GAAG,QAAQ,CAAC;AAEtC~~;;;GAGG~~;AACH,MAAM,mBAAmB,~~GAAG~~,IAAI,GAAG,CAAC;~~IAClC~~,QAAQ;IACR,MAAM;IACN,MAAM;IACN,IAAI;IACJ,QAAQ;IACR,OAAO;IACP,KAAK;IACL,QAAQ;IACR,OAAO;IACP,MAAM;IACN,KAAK;IACL,KAAK;IACL,QAAQ;CACT,CAAC,CAAC;~~AAkBH~~,8EAA8E;AAC9E,iBAAiB;AACjB,8EAA8E;AAE9E~~;;;;;;;;;;;;;;;;;GAiBG~~;AACH,MAAM,OAAO,yBACX,SAAQ,YAAY;~~IA4BpB~~,8EAA8E;IAC9E,cAAc;IACd,8EAA8E;IAE9E;;;;OAIG;IACH,YAAY,UAA4C,EAAE;QACxD,KAAK,EAAE,CAAC;~~QAnCV;;;;WAIG~~;QACM,SAAI,GAA8B,WAAW,CAAC;QAKvD~~,yEAAyE~~;~~QACjE~~,oBAAe,GAAG,EAAE,CAAC;QAE7B~~,uDAAuD~~;~~QAC/C~~,iBAAY,GAAG,KAAK,CAAC;QAE7B~~,yEAAyE~~;~~QACjE~~,iBAAY,GAAyC,IAAI,CAAC;QAElE~~,wEAAwE~~;~~QAChE~~,gBAAW,GAAkB,IAAI,CAAC;QAE1C~~,sDAAsD~~;~~QAC9C~~,mBAAc,GAAG,CAAC,CAAC;QAazB,IAAI,CAAC,gBAAgB,GAAG,OAAO,CAAC,gBAAgB,IAAI,0BAA0B,CAAC;IACjF,CAAC;IAED,8EAA8E;IAC9E,~~qCAAqC~~;~~IACrC~~,8EAA8E;IAE9E~~;;;;;;;;;;;;OAYG~~;IACH,cAAc,CAAC,UAA2B;QACxC,IAAI,CAAC,UAAU,CAAC,OAAO,EAAE,CAAC;YACxB~~,+DAA+D~~;~~YAC/D~~,OAAO;QACT,CAAC;QAED,MAAM,IAAI,GAAG,UAAU,CAAC,IAAI,CAAC;QAC7B,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;QAE7C,~~2DAA2D~~;~~QAC3D~~,IAAI,mBAAmB,CAAC,GAAG,CAAC,UAAU,CAAC,EAAE,CAAC;YACxC,IAAI,CAAC,IAAI,CAAC,sBAAsB,EAAE,EAAE,IAAI,EAAE,CAAC,CAAC;YAC5C,OAAO;QACT,CAAC;QAED,kEAAkE;QAClE,IAAI,CAAC,eAAe,GAAG,IAAI,CAAC;QAC5B,IAAI,CAAC,cAAc,GAAG,UAAU,CAAC,UAAU,CAAC;IAC9C,CAAC;IAED,8EAA8E;IAC9E,~~mCAAmC~~;~~IACnC~~,8EAA8E;IAE9E~~;;;;;;;;;;;OAWG~~;IACH,YAAY,CAAC,KAAe;QAC1B,QAAQ,KAAK,CAAC,IAAI,EAAE,CAAC;YACnB,KAAK,cAAc,CAAC,CAAC,CAAC;gBACpB,IAAI,CAAC,YAAY,GAAG,IAAI,CAAC;gBACzB,IAAI,CAAC,kBAAkB,EAAE,CAAC;gBAC1B,IAAI,IAAI,CAAC,WAAW,KAAK,IAAI,EAAE,CAAC;oBAC9B,IAAI,CAAC,WAAW,GAAG,KAAK,CAAC,SAAS,CAAC;gBA
CrC,CAAC;gBACD,MAAM;YACR,CAAC;YAED,KAAK,YAAY,CAAC,CAAC,CAAC;gBAClB,IAAI,CAAC,YAAY,GAAG,KAAK,CAAC;gBAE1B,IAAI,CAAC,IAAI,CAAC,eAAe,EAAE,CAAC;oBAC1B,~~gDAAgD~~;~~oBAChD~~,MAAM;gBACR,CAAC;gBAED,IAAI,oBAAoB,CAAC,IAAI,CAAC,IAAI,CAAC,eAAe,CAAC,EAAE,CAAC;oBACpD,~~oDAAoD~~;~~oBACpD~~,IAAI,CAAC,iBAAiB,CAAC,aAAa,EAAE,KAAK,CAAC,SAAS,CAAC,CAAC;gBACzD,CAAC;qBAAM,CAAC;oBACN,~~6CAA6C~~;~~oBAC7C~~,IAAI,CAAC,kBAAkB,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC;gBAC3C,CAAC;gBACD,MAAM;YACR,CAAC;YAED,KAAK,SAAS,CAAC,CAAC,CAAC;gBACf,~~qEAAqE~~;~~gBACrE~~,~~8CAA8C~~;~~gBAC9C~~,MAAM;YACR,CAAC;QACH,CAAC;IACH,CAAC;IAED,8EAA8E;IAC9E,~~4BAA4B~~;~~IAC5B~~,8EAA8E;IAE9E~~;;;;OAIG~~;IACH,KAAK;QACH,IAAI,CAAC,kBAAkB,EAAE,CAAC;QAC1B,IAAI,CAAC,eAAe,GAAG,EAAE,CAAC;QAC1B,IAAI,CAAC,YAAY,GAAG,KAAK,CAAC;QAC1B,IAAI,CAAC,WAAW,GAAG,IAAI,CAAC;QACxB,IAAI,CAAC,cAAc,GAAG,CAAC,CAAC;IAC1B,CAAC;IAED,8EAA8E;IAC9E,kBAAkB;IAClB,8EAA8E;IAE9E~~;;;;;;;OAOG~~;IACK,iBAAiB,CACvB,MAAmC,EACnC,kBAA0B;QAE1B,MAAM,UAAU,GACd,IAAI,CAAC,WAAW,KAAK,IAAI,CAAC,CAAC,CAAC,kBAAkB,GAAG,IAAI,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,CAAC;QAExE,MAAM,KAAK,GAAsB;YAC/B,UAAU,EAAE,IAAI,CAAC,eAAe;YAChC,UAAU,EAAE,IAAI,CAAC,cAAc;YAC/B,UAAU;YACV,MAAM;SACP,CAAC;QAEF,0EAA0E;QAC1E,IAAI,CAAC,KAAK,EAAE,CAAC;QAEb,IAAI,CAAC,IAAI,CAAC,eAAe,EAAE,KAAK,CAAC,CAAC;IACpC,CAAC;IAED~~;;;;;OAKG~~;IACK,kBAAkB,CAAC,kBAA0B;QACnD,IAAI,CAAC,kBAAkB,EAAE,CAAC;QAC1B,IAAI,CAAC,YAAY,GAAG,UAAU,CAAC,GAAG,EAAE;YAClC,IAAI,CAAC,YAAY,GAAG,IAAI,CAAC;YACzB,IAAI,CAAC,iBAAiB,CAAC,iBAAiB,EAAE,kBAAkB,CAAC,CAAC;QAChE,CAAC,EAAE,IAAI,CAAC,gBAAgB,CAAC,CAAC;IAC5B,CAAC;IAED~~;;OAEG~~;IACK,kBAAkB;QACxB,IAAI,IAAI,CAAC,YAAY,KAAK,IAAI,EAAE,CAAC;YAC/B,YAAY,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;YAChC,IAAI,CAAC,YAAY,GAAG,IAAI,CAAC;QAC3B,CAAC;IACH,CAAC;CACF"}
1	+ {"version":3,"file":"HeuristicEndpointDetector.js","sourceRoot":"","sources":["../../src/voice-pipeline/HeuristicEndpointDetector.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAgCG;AAEH,OAAO,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAQ3C,8EAA8E;AAC9E,YAAY;AACZ,8EAA8E;AAE9E;;;;;;;;;;GAUG;AACH,MAAM,0BAA0B,GAAG,IAAK,CAAC;AAEzC;;;;;;;GAOG;AACH,MAAM,oBAAoB,GAAG,QAAQ,CAAC;AAEtC;;;;;;;;;;;;;;;;GAgBG;AACH,MAAM,mBAAmB,GAAwB,IAAI,GAAG,CAAC;IACvD,QAAQ;IACR,MAAM;IACN,MAAM;IACN,IAAI;IACJ,QAAQ;IACR,OAAO;IACP,KAAK;IACL,QAAQ;IACR,OAAO;IACP,MAAM;IACN,KAAK;IACL,KAAK;IACL,QAAQ;CACT,CAAC,CAAC;AAwBH,8EAA8E;AAC9E,iBAAiB;AACjB,8EAA8E;AAE9E;;;;;;;;;;;;;;;;;;;;;;;;GAwBG;AACH,MAAM,OAAO,yBACX,SAAQ,YAAY;IAgDpB,8EAA8E;IAC9E,cAAc;IACd,8EAA8E;IAE9E;;;;OAIG;IACH,YAAY,UAA4C,EAAE;QACxD,KAAK,EAAE,CAAC;QAvDV;;;;;WAKG;QACM,SAAI,GAA8B,WAAW,CAAC;QAKvD;;;;WAIG;QACK,oBAAe,GAAG,EAAE,CAAC;QAE7B;;;;WAIG;QACK,iBAAY,GAAG,KAAK,CAAC;QAE7B;;;WAGG;QACK,iBAAY,GAAyC,IAAI,CAAC;QAElE;;;;WAIG;QACK,gBAAW,GAAkB,IAAI,CAAC;QAE1C;;;;WAIG;QACK,mBAAc,GAAG,CAAC,CAAC;QAazB,IAAI,CAAC,gBAAgB,GAAG,OAAO,CAAC,gBAAgB,IAAI,0BAA0B,CAAC;IACjF,CAAC;IAED,8EAA8E;IAC9E,sCAAsC;IACtC,8EAA8E;IAE9E;;;;;;;;;;;;;;;;OAgBG;IACH,cAAc,CAAC,UAA2B;QACxC,IAAI,CAAC,UAAU,CAAC,OAAO,EAAE,CAAC;YACxB,kEAAkE;YAClE,gDAAgD;YAChD,OAAO;QACT,CAAC;QAED,MAAM,IAAI,GAAG,UAAU,CAAC,IAAI,CAAC;QAC7B,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;QAE7C,uEAAuE;QACvE,oEAAoE;QACpE,IAAI,mBAAmB,CAAC,GAAG,CAAC,UAAU,CAAC,EAAE,CAAC;YACxC,IAAI,CAAC,IAAI,CAAC,sBAAsB,EAAE,EAAE,IAAI,EAAE,CAAC,CAAC;YAC5C,OAAO;QACT,CAAC;QAED,kEAAkE;QAClE,kEAAkE;QAClE,yEAAyE;QACzE,IAAI,CAAC,eAAe,GAAG,IAAI,CAAC;QAC5B,IAAI,CAAC,cAAc,GAAG,UAAU,CAAC,UAAU,CAAC;IAC9C,CAAC;IAED,8EAA8E;IAC9E,oCAAoC;IACpC,8EAA8E;IAE9E;;;;;;;;;;;;;;;;;;;OAmBG;IACH,YAAY,CAAC,KAAe;QAC1B,QAAQ,KAAK,CAAC,IAAI,EAAE,CAAC;YACnB,KAAK,cAAc,CAAC,CAAC,CAAC;gBACpB,IAAI,CAAC,YAAY,GAAG,IAAI,CAAC;gBACzB,iEAAiE;gBACjE,IAAI,CAAC,kBAAkB,EAAE,CAAC;gBAC1B,gEAAgE;gBAChE,IAAI,IAAI,CAAC,WAAW,KAAK,IAAI,EAAE,CAAC;oBAC9B,IAAI,CAAC,WAAW,GAA
G,KAAK,CAAC,SAAS,CAAC;gBACrC,CAAC;gBACD,MAAM;YACR,CAAC;YAED,KAAK,YAAY,CAAC,CAAC,CAAC;gBAClB,IAAI,CAAC,YAAY,GAAG,KAAK,CAAC;gBAE1B,IAAI,CAAC,IAAI,CAAC,eAAe,EAAE,CAAC;oBAC1B,qDAAqD;oBACrD,6DAA6D;oBAC7D,6CAA6C;oBAC7C,MAAM;gBACR,CAAC;gBAED,IAAI,oBAAoB,CAAC,IAAI,CAAC,IAAI,CAAC,eAAe,CAAC,EAAE,CAAC;oBACpD,8DAA8D;oBAC9D,gEAAgE;oBAChE,IAAI,CAAC,iBAAiB,CAAC,aAAa,EAAE,KAAK,CAAC,SAAS,CAAC,CAAC;gBACzD,CAAC;qBAAM,CAAC;oBACN,sDAAsD;oBACtD,+DAA+D;oBAC/D,0DAA0D;oBAC1D,IAAI,CAAC,kBAAkB,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC;gBAC3C,CAAC;gBACD,MAAM;YACR,CAAC;YAED,KAAK,SAAS,CAAC,CAAC,CAAC;gBACf,8DAA8D;gBAC9D,8DAA8D;gBAC9D,MAAM;YACR,CAAC;QACH,CAAC;IACH,CAAC;IAED,8EAA8E;IAC9E,6BAA6B;IAC7B,8EAA8E;IAE9E;;;;;;;OAOG;IACH,KAAK;QACH,IAAI,CAAC,kBAAkB,EAAE,CAAC;QAC1B,IAAI,CAAC,eAAe,GAAG,EAAE,CAAC;QAC1B,IAAI,CAAC,YAAY,GAAG,KAAK,CAAC;QAC1B,IAAI,CAAC,WAAW,GAAG,IAAI,CAAC;QACxB,IAAI,CAAC,cAAc,GAAG,CAAC,CAAC;IAC1B,CAAC;IAED,8EAA8E;IAC9E,kBAAkB;IAClB,8EAA8E;IAE9E;;;;;;;;;;;OAWG;IACK,iBAAiB,CACvB,MAAmC,EACnC,kBAA0B;QAE1B,2EAA2E;QAC3E,sDAAsD;QACtD,MAAM,UAAU,GACd,IAAI,CAAC,WAAW,KAAK,IAAI,CAAC,CAAC,CAAC,kBAAkB,GAAG,IAAI,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,CAAC;QAExE,MAAM,KAAK,GAAsB;YAC/B,UAAU,EAAE,IAAI,CAAC,eAAe;YAChC,UAAU,EAAE,IAAI,CAAC,cAAc;YAC/B,UAAU;YACV,MAAM;SACP,CAAC;QAEF,0EAA0E;QAC1E,IAAI,CAAC,KAAK,EAAE,CAAC;QAEb,IAAI,CAAC,IAAI,CAAC,eAAe,EAAE,KAAK,CAAC,CAAC;IACpC,CAAC;IAED;;;;;;;;;;OAUG;IACK,kBAAkB,CAAC,kBAA0B;QACnD,IAAI,CAAC,kBAAkB,EAAE,CAAC;QAC1B,IAAI,CAAC,YAAY,GAAG,UAAU,CAAC,GAAG,EAAE;YAClC,IAAI,CAAC,YAAY,GAAG,IAAI,CAAC;YACzB,IAAI,CAAC,iBAAiB,CAAC,iBAAiB,EAAE,kBAAkB,CAAC,CAAC;QAChE,CAAC,EAAE,IAAI,CAAC,gBAAgB,CAAC,CAAC;IAC5B,CAAC;IAED;;;OAGG;IACK,kBAAkB;QACxB,IAAI,IAAI,CAAC,YAAY,KAAK,IAAI,EAAE,CAAC;YAC/B,YAAY,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;YAChC,IAAI,CAAC,YAAY,GAAG,IAAI,CAAC;QAC3B,CAAC;IACH,CAAC;CACF"}

package/dist/voice-pipeline/SoftFadeBargeinHandler.d.ts CHANGED Viewed

@@ -1,36 +1,91 @@
 /**
  * @module voice-pipeline/SoftFadeBargeinHandler
  *
- * Implements a three-tier soft-fade barge-in policy.
+ * Implements a three-tier soft-fade barge-in policy that maps detected speech
+ * duration to one of three actions: ignore, pause (with fade-out), or cancel.
  *
- * Very short speech detections (< `ignoreMs`) are dismissed as noise.
- * Medium-length detections trigger a fade-out pause so the user can speak
- * without an abrupt cut. Long detections (>= `cancelMs`) stop playback
- * outright and inject a conversation marker.
+ * ## Three-tier logic
+ *
+ * The handler divides the speech duration axis into three regions:
+ *
+ * ```
+ *   0 ms                ignoreMs              cancelMs
+ *   |-------- ignore --------|-------- pause --------|-------- cancel -------->
+ *          (noise)              (fade-out)              (hard stop)
+ * ```
+ *
+ * | Region                                    | Action   | Rationale                                     |
+ * |-------------------------------------------|----------|-----------------------------------------------|
+ * | `speechDurationMs < ignoreMs`             | `ignore` | Too short to be intentional (noise, breath).  |
+ * | `ignoreMs <= speechDurationMs < cancelMs` | `pause`  | Probably intentional; fade out gracefully.    |
+ * | `speechDurationMs >= cancelMs`            | `cancel` | Definitely intentional; stop immediately.     |
+ *
+ * ## Configurable thresholds
+ *
+ * - **`ignoreMs`** (default 100 ms): The noise floor. Anything shorter than
+ *   this is dismissed. Set lower in quiet environments, higher in noisy ones.
+ *
+ * - **`cancelMs`** (default 2,000 ms): The hard-stop ceiling. By this point,
+ *   the user has clearly been speaking for a while and wants to take over.
+ *   The pipeline should stop TTS immediately rather than fading.
+ *
+ * - **`fadeMs`** (default 200 ms): The duration of the audio fade-out applied
+ *   during a `'pause'` action. Shorter fades (100 ms) feel snappier; longer
+ *   fades (300+ ms) feel smoother but delay the user's ability to be heard.
+ *
+ * ## When to use soft-fade vs hard-cut
+ *
+ * Soft-fade is preferred when:
+ * - The environment is noisy and false barge-in detections are common.
+ * - The conversation is measured/educational and abrupt cuts feel jarring.
+ * - The TTS voice has long trailing prosody that benefits from a fade.
+ *
+ * Use {@link HardCutBargeinHandler} when:
+ * - The conversation is fast-paced (customer support, command interfaces).
+ * - Audio quality is high and false positives are rare.
+ * - Minimal interruption latency is critical.
+ *
+ * @see {@link HardCutBargeinHandler} for the binary hard-cut alternative.
+ * @see {@link IBargeinHandler} for the interface contract.
  */
 import type { BargeinAction, BargeinContext, IBargeinHandler } from './types.js';
 /**
  * Construction options for {@link SoftFadeBargeinHandler}.
+ *
+ * @example
+ * ```typescript
+ * const handler = new SoftFadeBargeinHandler({
+ *   ignoreMs: 80,
+ *   cancelMs: 1500,
+ *   fadeMs: 150,
+ * });
+ * ```
  */
 export interface SoftFadeBargeinHandlerOptions {
     /**
      * Speech duration threshold in milliseconds below which the barge-in is
-     * treated as accidental noise and ignored.
+     * treated as accidental noise and ignored. This is the lower boundary
+     * of the "pause" region.
      *
      * @defaultValue 100
      */
     ignoreMs?: number;
     /**
      * Speech duration threshold in milliseconds at or above which the barge-in
-     * triggers an immediate cancel rather than a fade-out pause. Must be greater
-     * than `ignoreMs` for the fade region to exist.
+     * triggers an immediate cancel rather than a fade-out pause. This is the
+     * upper boundary of the "pause" region. Must be greater than {@link ignoreMs}
+     * for the pause (fade) region to exist.
      *
      * @defaultValue 2000
      */
     cancelMs?: number;
     /**
      * Duration of the TTS fade-out in milliseconds applied when the speech
-     * duration falls in the range `[ignoreMs, cancelMs)`.
+     * duration falls in the "pause" range `[ignoreMs, cancelMs)`.
+     *
+     * The fade-out is applied client-side; the server sends a `{ type: 'pause', fadeMs }`
+     * control message and the client's audio player reduces volume linearly
+     * over this duration.
      *
      * @defaultValue 200
      */
@@ -39,38 +94,40 @@ export interface SoftFadeBargeinHandlerOptions {
 /**
  * Barge-in handler that applies a three-tier soft-fade strategy.
  *
- * The handler maps the confirmed speech duration to one of three actions:
+ * The handler is stateless -- each {@link SoftFadeBargeinHandler.handleBargein}
+ * call is evaluated independently with no memory of previous barge-in events.
  *
- * | Speech duration          | Action                                      |
- * |--------------------------|---------------------------------------------|
- * | `< ignoreMs`             | `ignore` — noise, continue TTS uninterrupted |
- * | `>= ignoreMs < cancelMs` | `pause` with `fadeMs` fade-out               |
- * | `>= cancelMs`            | `cancel` with `'[interrupted]'` marker       |
+ * @see {@link IBargeinHandler} for the interface contract.
+ * @see {@link HardCutBargeinHandler} for the binary hard-cut alternative.
  *
  * @example
- * ```ts
+ * ```typescript
  * const handler = new SoftFadeBargeinHandler({ ignoreMs: 80, cancelMs: 1500, fadeMs: 150 });
- * handler.handleBargein({ speechDurationMs: 500, ... }); // { type: 'pause', fadeMs: 150 }
- * handler.handleBargein({ speechDurationMs: 1600, ... }); // { type: 'cancel', injectMarker: '[interrupted]' }
- * handler.handleBargein({ speechDurationMs: 30, ... });  // { type: 'ignore' }
+ *
+ * handler.handleBargein({ speechDurationMs: 30, ... });   // -> { type: 'ignore' }
+ * handler.handleBargein({ speechDurationMs: 500, ... });  // -> { type: 'pause', fadeMs: 150 }
+ * handler.handleBargein({ speechDurationMs: 1600, ... }); // -> { type: 'cancel', injectMarker: '[interrupted]' }
  * ```
  */
 export declare class SoftFadeBargeinHandler implements IBargeinHandler {
     /**
      * The interruption strategy implemented by this handler.
-     * Always `'soft-fade'`.
+     * Always `'soft-fade'` -- TTS audio is faded out over a configurable window.
      */
     readonly mode: "soft-fade";
     /**
      * Speech duration below which the barge-in is dismissed as noise.
+     * @see {@link SoftFadeBargeinHandlerOptions.ignoreMs}
      */
     private readonly ignoreMs;
     /**
      * Speech duration at or above which the barge-in escalates to a full cancel.
+     * @see {@link SoftFadeBargeinHandlerOptions.cancelMs}
      */
     private readonly cancelMs;
     /**
      * Duration of the TTS audio fade-out applied during a `'pause'` action.
+     * @see {@link SoftFadeBargeinHandlerOptions.fadeMs}
      */
     private readonly fadeMs;
     /**
@@ -84,12 +141,20 @@ export declare class SoftFadeBargeinHandler implements IBargeinHandler {
      * Evaluate the barge-in context and return the pipeline action.
      *
      * Decision tree (evaluated in order):
-     * 1. `speechDurationMs < ignoreMs` → `{ type: 'ignore' }`
-     * 2. `speechDurationMs >= cancelMs` → `{ type: 'cancel', injectMarker: '[interrupted]' }`
-     * 3. Otherwise → `{ type: 'pause', fadeMs }`
+     *
+     * 1. `speechDurationMs < ignoreMs` -> `{ type: 'ignore' }`
+     *    Too short to be intentional. Likely a lip smack, breath, or noise burst.
+     *
+     * 2. `speechDurationMs >= cancelMs` -> `{ type: 'cancel', injectMarker: '[interrupted]' }`
+     *    The user has been speaking long enough that they clearly want to take over.
+     *    Stop TTS immediately and mark the conversation as interrupted.
+     *
+     * 3. Otherwise (`ignoreMs <= speechDurationMs < cancelMs`) -> `{ type: 'pause', fadeMs }`
+     *    Probably intentional but not yet certain. Fade out TTS gracefully so the
+     *    user can be heard. If the speech stops, the pipeline can resume playback.
      *
      * @param context - Snapshot of the barge-in state at the moment of detection.
-     * @returns The pipeline action to execute.
+     * @returns The pipeline action to execute. Always synchronous (no Promise).
      */
     handleBargein(context: BargeinContext): BargeinAction;
 }

package/dist/voice-pipeline/SoftFadeBargeinHandler.d.ts.map CHANGED Viewed

	@@ -1 +1 @@
1	- {"version":3,"file":"SoftFadeBargeinHandler.d.ts","sourceRoot":"","sources":["../../src/voice-pipeline/SoftFadeBargeinHandler.ts"],"names":[],"mappings":"AAAA~~;;;;;;;;;GASG~~;AAEH,OAAO,KAAK,EAAE,aAAa,EAAE,cAAc,EAAE,eAAe,EAAE,MAAM,YAAY,CAAC;~~AAEjF;;GAEG~~;AACH,MAAM,WAAW,6BAA6B;IAC5C~~;;;;;OAKG~~;IACH,QAAQ,CAAC,EAAE,MAAM,CAAC;IAElB~~;;;;;;OAMG~~;IACH,QAAQ,CAAC,EAAE,MAAM,CAAC;IAElB~~;;;;;OAKG~~;IACH,MAAM,CAAC,EAAE,MAAM,CAAC;CACjB;~~AAED;;;;;;;;;;;;;;;;;;GAkBG~~;AACH,qBAAa,sBAAuB,YAAW,eAAe;IAC5D;;;OAGG;IACH,QAAQ,CAAC,IAAI,EAAG,WAAW,CAAU;IAErC~~;;OAEG~~;IACH,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAS;IAElC~~;;OAEG~~;IACH,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAS;IAElC~~;;OAEG~~;IACH,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAS;IAEhC;;;;;OAKG;gBACS,OAAO,GAAE,6BAAkC;IAMvD~~;;;;;;;;;;OAUG~~;IACH,aAAa,CAAC,OAAO,EAAE,cAAc,GAAG,aAAa;~~CAatD~~"}
1	+ {"version":3,"file":"SoftFadeBargeinHandler.d.ts","sourceRoot":"","sources":["../../src/voice-pipeline/SoftFadeBargeinHandler.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAiDG;AAEH,OAAO,KAAK,EAAE,aAAa,EAAE,cAAc,EAAE,eAAe,EAAE,MAAM,YAAY,CAAC;AAMjF;;;;;;;;;;;GAWG;AACH,MAAM,WAAW,6BAA6B;IAC5C;;;;;;OAMG;IACH,QAAQ,CAAC,EAAE,MAAM,CAAC;IAElB;;;;;;;OAOG;IACH,QAAQ,CAAC,EAAE,MAAM,CAAC;IAElB;;;;;;;;;OASG;IACH,MAAM,CAAC,EAAE,MAAM,CAAC;CACjB;AAMD;;;;;;;;;;;;;;;;;GAiBG;AACH,qBAAa,sBAAuB,YAAW,eAAe;IAC5D;;;OAGG;IACH,QAAQ,CAAC,IAAI,EAAG,WAAW,CAAU;IAErC;;;OAGG;IACH,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAS;IAElC;;;OAGG;IACH,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAS;IAElC;;;OAGG;IACH,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAS;IAEhC;;;;;OAKG;gBACS,OAAO,GAAE,6BAAkC;IAMvD;;;;;;;;;;;;;;;;;;OAkBG;IACH,aAAa,CAAC,OAAO,EAAE,cAAc,GAAG,aAAa;CAiBtD"}