@framers/agentos 0.1.108 → 0.1.109

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. package/dist/voice-pipeline/AcousticEndpointDetector.d.ts +95 -20
  2. package/dist/voice-pipeline/AcousticEndpointDetector.d.ts.map +1 -1
  3. package/dist/voice-pipeline/AcousticEndpointDetector.js +110 -24
  4. package/dist/voice-pipeline/AcousticEndpointDetector.js.map +1 -1
  5. package/dist/voice-pipeline/HardCutBargeinHandler.d.ts +66 -15
  6. package/dist/voice-pipeline/HardCutBargeinHandler.d.ts.map +1 -1
  7. package/dist/voice-pipeline/HardCutBargeinHandler.js +65 -13
  8. package/dist/voice-pipeline/HardCutBargeinHandler.js.map +1 -1
  9. package/dist/voice-pipeline/HeuristicEndpointDetector.d.ts +116 -42
  10. package/dist/voice-pipeline/HeuristicEndpointDetector.d.ts.map +1 -1
  11. package/dist/voice-pipeline/HeuristicEndpointDetector.js +159 -52
  12. package/dist/voice-pipeline/HeuristicEndpointDetector.js.map +1 -1
  13. package/dist/voice-pipeline/SoftFadeBargeinHandler.d.ts +89 -24
  14. package/dist/voice-pipeline/SoftFadeBargeinHandler.d.ts.map +1 -1
  15. package/dist/voice-pipeline/SoftFadeBargeinHandler.js +74 -20
  16. package/dist/voice-pipeline/SoftFadeBargeinHandler.js.map +1 -1
  17. package/dist/voice-pipeline/VoiceInterruptError.d.ts +68 -10
  18. package/dist/voice-pipeline/VoiceInterruptError.d.ts.map +1 -1
  19. package/dist/voice-pipeline/VoiceInterruptError.js +53 -6
  20. package/dist/voice-pipeline/VoiceInterruptError.js.map +1 -1
  21. package/dist/voice-pipeline/VoicePipelineOrchestrator.d.ts +190 -39
  22. package/dist/voice-pipeline/VoicePipelineOrchestrator.d.ts.map +1 -1
  23. package/dist/voice-pipeline/VoicePipelineOrchestrator.js +266 -53
  24. package/dist/voice-pipeline/VoicePipelineOrchestrator.js.map +1 -1
  25. package/dist/voice-pipeline/WebSocketStreamTransport.d.ts +135 -43
  26. package/dist/voice-pipeline/WebSocketStreamTransport.d.ts.map +1 -1
  27. package/dist/voice-pipeline/WebSocketStreamTransport.js +109 -47
  28. package/dist/voice-pipeline/WebSocketStreamTransport.js.map +1 -1
  29. package/dist/voice-pipeline/index.d.ts +34 -1
  30. package/dist/voice-pipeline/index.d.ts.map +1 -1
  31. package/dist/voice-pipeline/index.js +41 -1
  32. package/dist/voice-pipeline/index.js.map +1 -1
  33. package/dist/voice-pipeline/types.d.ts +432 -106
  34. package/dist/voice-pipeline/types.d.ts.map +1 -1
  35. package/dist/voice-pipeline/types.js +21 -9
  36. package/dist/voice-pipeline/types.js.map +1 -1
  37. package/package.json +1 -1
@@ -6,27 +6,74 @@
6
6
  * and relies solely on the duration of post-speech silence to decide when the user
7
7
  * has finished speaking.
8
8
  *
9
- * Emits:
10
- * - `'turn_complete'` ({@link TurnCompleteEvent}) — silence exceeded the configured
11
- * `utteranceEndThresholdMs` after the most recent `speech_end` VAD event.
12
- * - `'speech_start'` () re-emitted when a `speech_start` VAD event is received.
9
+ * ## How it works
10
+ *
11
+ * This detector delegates all silence timing to a {@link SilenceDetector} instance
12
+ * (from `core/audio/`). The SilenceDetector maintains an internal timer that
13
+ * starts when `handleSpeechEnd()` is called and fires `'utterance_end_detected'`
14
+ * when silence exceeds the configured `utteranceEndThresholdMs`. A
15
+ * `handleSpeechStart()` call cancels the timer.
16
+ *
17
+ * ## Energy threshold adaptation
18
+ *
19
+ * The SilenceDetector internally uses adaptive energy thresholds from the
20
+ * {@link AdaptiveVAD}. The VAD continuously recalibrates its speech/silence
21
+ * boundary based on ambient noise levels, so the effective silence threshold
22
+ * adapts to the environment (e.g. coffee shop vs quiet room). This detector
23
+ * does not perform its own energy analysis -- it trusts the upstream VAD's
24
+ * speech_start/speech_end decisions.
25
+ *
26
+ * ## When to use
27
+ *
28
+ * Use this detector when:
29
+ * - The STT provider does not produce reliable punctuation.
30
+ * - You want the simplest possible endpoint detection with no linguistic analysis.
31
+ * - Latency tolerance is higher (the full `utteranceEndThresholdMs` is always
32
+ * consumed, unlike the {@link HeuristicEndpointDetector} which can fire
33
+ * immediately on terminal punctuation).
34
+ *
35
+ * @see {@link HeuristicEndpointDetector} for the rule-based alternative with
36
+ * punctuation-triggered fast path.
37
+ * @see {@link IEndpointDetector} for the interface contract.
38
+ * @see {@link SilenceDetector} for the underlying silence timing logic.
39
+ *
40
+ * ## Events emitted
41
+ *
42
+ * | Event | Payload | Description |
43
+ * |-------------------|--------------------------|---------------------------------------------|
44
+ * | `'turn_complete'` | {@link TurnCompleteEvent} | Silence exceeded `utteranceEndThresholdMs`. |
45
+ * | `'speech_start'` | *(none)* | Re-emitted from incoming VAD event. |
13
46
  */
14
47
  import { EventEmitter } from 'node:events';
15
48
  import type { IEndpointDetector, VadEvent, TranscriptEvent } from './types.js';
16
49
  /**
17
50
  * Constructor options for {@link AcousticEndpointDetector}.
51
+ *
52
+ * @example
53
+ * ```typescript
54
+ * const detector = new AcousticEndpointDetector({
55
+ * significantPauseThresholdMs: 1000,
56
+ * utteranceEndThresholdMs: 2000,
57
+ * });
58
+ * ```
18
59
  */
19
60
  export interface AcousticEndpointDetectorConfig {
20
61
  /**
21
62
  * Silence duration after speech (ms) that triggers a "significant pause"
22
63
  * notification on the underlying {@link SilenceDetector}. Does not directly
23
- * cause `turn_complete` to fire, but is forwarded to the SilenceDetector.
64
+ * cause `turn_complete` to fire, but can be used by other pipeline components
65
+ * to show a "thinking" indicator.
24
66
  * @defaultValue 1500
25
67
  */
26
68
  significantPauseThresholdMs?: number;
27
69
  /**
28
70
  * Silence duration after speech (ms) that triggers `turn_complete` with
29
- * `reason: 'silence_timeout'`.
71
+ * `reason: 'silence_timeout'`. This is the primary tuning knob for how
72
+ * long the pipeline waits after the user stops speaking.
73
+ *
74
+ * - Lower values (1000-2000 ms): Faster response, but may fire during natural pauses.
75
+ * - Higher values (3000-5000 ms): More tolerant of pauses, but feels sluggish.
76
+ *
30
77
  * @defaultValue 3000
31
78
  */
32
79
  utteranceEndThresholdMs?: number;
@@ -38,26 +85,45 @@ export interface AcousticEndpointDetectorConfig {
38
85
  * `speech_end` events start the silence clock; `speech_start` events cancel
39
86
  * any pending turn-complete emission. Transcript content is completely ignored.
40
87
  *
88
+ * @see {@link IEndpointDetector} for the interface contract.
89
+ * @see {@link HeuristicEndpointDetector} for the heuristic alternative.
90
+ *
41
91
  * @example
42
- * ```ts
92
+ * ```typescript
43
93
  * const detector = new AcousticEndpointDetector({ utteranceEndThresholdMs: 2000 });
44
- * detector.on('turn_complete', (event) => console.log('Turn done:', event));
45
- * detector.pushVadEvent({ type: 'speech_end', timestamp: Date.now() });
94
+ * detector.on('turn_complete', (event) => {
95
+ * console.log(`Turn done after ${event.durationMs}ms of speech`);
96
+ * });
97
+ * detector.pushVadEvent({ type: 'speech_start', timestamp: Date.now() });
98
+ * detector.pushVadEvent({ type: 'speech_end', timestamp: Date.now() + 500 });
99
+ * // -> After 2000ms of silence, 'turn_complete' fires with reason 'silence_timeout'
46
100
  * ```
47
101
  */
48
102
  export declare class AcousticEndpointDetector extends EventEmitter implements IEndpointDetector {
49
- /** @inheritdoc */
103
+ /**
104
+ * Detection mode identifier. Always `'acoustic'` for this implementation.
105
+ * @see {@link IEndpointDetector.mode}
106
+ */
50
107
  readonly mode: "acoustic";
51
- /** Underlying silence-duration tracker. */
108
+ /**
109
+ * Underlying silence-duration tracker from `core/audio/`.
110
+ * Handles the actual timer management and threshold comparison.
111
+ */
52
112
  private readonly silenceDetector;
53
113
  /**
54
- * Timestamp (ms) when the current speech segment began. Tracked so that
55
- * `durationMs` in the emitted {@link TurnCompleteEvent} can be computed.
114
+ * Timestamp (ms) when the current speech segment began. Used to compute
115
+ * `durationMs` in the emitted {@link TurnCompleteEvent} as:
116
+ * `speechEndTimeMs - speechStartTimeMs`.
117
+ *
118
+ * Reset to `null` on each {@link reset} call.
56
119
  */
57
120
  private speechStartTimeMs;
58
121
  /**
59
122
  * Timestamp (ms) when the most recent `speech_end` VAD event was received.
60
- * Used to calculate `durationMs` for the turn-complete event.
123
+ * Used together with {@link speechStartTimeMs} to calculate `durationMs`
124
+ * for the turn-complete event.
125
+ *
126
+ * Reset to `null` on each {@link reset} call.
61
127
  */
62
128
  private speechEndTimeMs;
63
129
  /**
@@ -69,21 +135,30 @@ export declare class AcousticEndpointDetector extends EventEmitter implements IE
69
135
  /**
70
136
  * Converts a {@link VadEvent} into the SilenceDetector's expected API calls.
71
137
  *
72
- * - `speech_start` resets silence state and re-emits `'speech_start'` on self.
73
- * - `speech_end` starts the silence clock.
74
- * - `silence` treated as ongoing non-speech frames.
138
+ * - **`speech_start`**: Resets silence state (cancels pending timers) and
139
+ * re-emits `'speech_start'` on this detector for pipeline consumption.
140
+ * - **`speech_end`**: Records the timestamp and starts the silence clock.
141
+ * - **`silence`**: Treated as ongoing non-speech frames, advancing the
142
+ * SilenceDetector's internal timer.
75
143
  *
76
- * @param event - Incoming VAD event.
144
+ * @param event - Incoming VAD event from the upstream voice activity detector.
77
145
  */
78
146
  pushVadEvent(event: VadEvent): void;
79
147
  /**
80
- * No-op this detector is purely acoustic and does not use transcript content.
148
+ * No-op -- this detector is purely acoustic and does not use transcript content.
149
+ *
150
+ * The method exists solely to satisfy the {@link IEndpointDetector} interface.
151
+ * Calling it has no effect and does not throw.
81
152
  *
82
153
  * @param _event - Ignored transcript event.
83
154
  */
84
155
  pushTranscript(_event: TranscriptEvent): void;
85
156
  /**
86
- * Resets all internal state and timers. Call at the start of each new turn.
157
+ * Resets all internal state and cancels pending timers.
158
+ *
159
+ * Should be called at the start of each new turn to ensure clean state.
160
+ * This also resets the underlying SilenceDetector, cancelling any pending
161
+ * utterance_end_detected timer.
87
162
  */
88
163
  reset(): void;
89
164
  }
@@ -1 +1 @@
1
- {"version":3,"file":"AcousticEndpointDetector.d.ts","sourceRoot":"","sources":["../../src/voice-pipeline/AcousticEndpointDetector.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;GAYG;AAEH,OAAO,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAE3C,OAAO,KAAK,EACV,iBAAiB,EACjB,QAAQ,EACR,eAAe,EAEhB,MAAM,YAAY,CAAC;AAMpB;;GAEG;AACH,MAAM,WAAW,8BAA8B;IAC7C;;;;;OAKG;IACH,2BAA2B,CAAC,EAAE,MAAM,CAAC;IAErC;;;;OAIG;IACH,uBAAuB,CAAC,EAAE,MAAM,CAAC;CAClC;AAMD;;;;;;;;;;;;;GAaG;AACH,qBAAa,wBAAyB,SAAQ,YAAa,YAAW,iBAAiB;IACrF,kBAAkB;IAClB,SAAgB,IAAI,EAAG,UAAU,CAAU;IAE3C,2CAA2C;IAC3C,OAAO,CAAC,QAAQ,CAAC,eAAe,CAAkB;IAElD;;;OAGG;IACH,OAAO,CAAC,iBAAiB,CAAuB;IAEhD;;;OAGG;IACH,OAAO,CAAC,eAAe,CAAuB;IAI9C;;;;OAIG;gBACS,MAAM,GAAE,8BAAmC;IAgCvD;;;;;;;;OAQG;IACI,YAAY,CAAC,KAAK,EAAE,QAAQ,GAAG,IAAI;IAyB1C;;;;OAIG;IACI,cAAc,CAAC,MAAM,EAAE,eAAe,GAAG,IAAI;IAIpD;;OAEG;IACI,KAAK,IAAI,IAAI;CAKrB"}
1
+ {"version":3,"file":"AcousticEndpointDetector.d.ts","sourceRoot":"","sources":["../../src/voice-pipeline/AcousticEndpointDetector.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA6CG;AAEH,OAAO,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAE3C,OAAO,KAAK,EACV,iBAAiB,EACjB,QAAQ,EACR,eAAe,EAEhB,MAAM,YAAY,CAAC;AAMpB;;;;;;;;;;GAUG;AACH,MAAM,WAAW,8BAA8B;IAC7C;;;;;;OAMG;IACH,2BAA2B,CAAC,EAAE,MAAM,CAAC;IAErC;;;;;;;;;OASG;IACH,uBAAuB,CAAC,EAAE,MAAM,CAAC;CAClC;AAMD;;;;;;;;;;;;;;;;;;;;GAoBG;AACH,qBAAa,wBAAyB,SAAQ,YAAa,YAAW,iBAAiB;IACrF;;;OAGG;IACH,SAAgB,IAAI,EAAG,UAAU,CAAU;IAE3C;;;OAGG;IACH,OAAO,CAAC,QAAQ,CAAC,eAAe,CAAkB;IAElD;;;;;;OAMG;IACH,OAAO,CAAC,iBAAiB,CAAuB;IAEhD;;;;;;OAMG;IACH,OAAO,CAAC,eAAe,CAAuB;IAM9C;;;;OAIG;gBACS,MAAM,GAAE,8BAAmC;IAuCvD;;;;;;;;;;OAUG;IACI,YAAY,CAAC,KAAK,EAAE,QAAQ,GAAG,IAAI;IAwC1C;;;;;;;OAOG;IACI,cAAc,CAAC,MAAM,EAAE,eAAe,GAAG,IAAI;IAUpD;;;;;;OAMG;IACI,KAAK,IAAI,IAAI;CAKrB"}
@@ -6,10 +6,43 @@
6
6
  * and relies solely on the duration of post-speech silence to decide when the user
7
7
  * has finished speaking.
8
8
  *
9
- * Emits:
10
- * - `'turn_complete'` ({@link TurnCompleteEvent}) — silence exceeded the configured
11
- * `utteranceEndThresholdMs` after the most recent `speech_end` VAD event.
12
- * - `'speech_start'` () re-emitted when a `speech_start` VAD event is received.
9
+ * ## How it works
10
+ *
11
+ * This detector delegates all silence timing to a {@link SilenceDetector} instance
12
+ * (from `core/audio/`). The SilenceDetector maintains an internal timer that
13
+ * starts when `handleSpeechEnd()` is called and fires `'utterance_end_detected'`
14
+ * when silence exceeds the configured `utteranceEndThresholdMs`. A
15
+ * `handleSpeechStart()` call cancels the timer.
16
+ *
17
+ * ## Energy threshold adaptation
18
+ *
19
+ * The SilenceDetector internally uses adaptive energy thresholds from the
20
+ * {@link AdaptiveVAD}. The VAD continuously recalibrates its speech/silence
21
+ * boundary based on ambient noise levels, so the effective silence threshold
22
+ * adapts to the environment (e.g. coffee shop vs quiet room). This detector
23
+ * does not perform its own energy analysis -- it trusts the upstream VAD's
24
+ * speech_start/speech_end decisions.
25
+ *
26
+ * ## When to use
27
+ *
28
+ * Use this detector when:
29
+ * - The STT provider does not produce reliable punctuation.
30
+ * - You want the simplest possible endpoint detection with no linguistic analysis.
31
+ * - Latency tolerance is higher (the full `utteranceEndThresholdMs` is always
32
+ * consumed, unlike the {@link HeuristicEndpointDetector} which can fire
33
+ * immediately on terminal punctuation).
34
+ *
35
+ * @see {@link HeuristicEndpointDetector} for the rule-based alternative with
36
+ * punctuation-triggered fast path.
37
+ * @see {@link IEndpointDetector} for the interface contract.
38
+ * @see {@link SilenceDetector} for the underlying silence timing logic.
39
+ *
40
+ * ## Events emitted
41
+ *
42
+ * | Event | Payload | Description |
43
+ * |-------------------|--------------------------|---------------------------------------------|
44
+ * | `'turn_complete'` | {@link TurnCompleteEvent} | Silence exceeded `utteranceEndThresholdMs`. |
45
+ * | `'speech_start'` | *(none)* | Re-emitted from incoming VAD event. |
13
46
  */
14
47
  import { EventEmitter } from 'node:events';
15
48
  import { SilenceDetector } from '../core/audio/SilenceDetector.js';
@@ -23,14 +56,23 @@ import { SilenceDetector } from '../core/audio/SilenceDetector.js';
23
56
  * `speech_end` events start the silence clock; `speech_start` events cancel
24
57
  * any pending turn-complete emission. Transcript content is completely ignored.
25
58
  *
59
+ * @see {@link IEndpointDetector} for the interface contract.
60
+ * @see {@link HeuristicEndpointDetector} for the heuristic alternative.
61
+ *
26
62
  * @example
27
- * ```ts
63
+ * ```typescript
28
64
  * const detector = new AcousticEndpointDetector({ utteranceEndThresholdMs: 2000 });
29
- * detector.on('turn_complete', (event) => console.log('Turn done:', event));
30
- * detector.pushVadEvent({ type: 'speech_end', timestamp: Date.now() });
65
+ * detector.on('turn_complete', (event) => {
66
+ * console.log(`Turn done after ${event.durationMs}ms of speech`);
67
+ * });
68
+ * detector.pushVadEvent({ type: 'speech_start', timestamp: Date.now() });
69
+ * detector.pushVadEvent({ type: 'speech_end', timestamp: Date.now() + 500 });
70
+ * // -> After 2000ms of silence, 'turn_complete' fires with reason 'silence_timeout'
31
71
  * ```
32
72
  */
33
73
  export class AcousticEndpointDetector extends EventEmitter {
74
+ // ---------------------------------------------------------------------------
75
+ // Constructor
34
76
  // ---------------------------------------------------------------------------
35
77
  /**
36
78
  * Creates a new AcousticEndpointDetector.
@@ -39,30 +81,46 @@ export class AcousticEndpointDetector extends EventEmitter {
39
81
  */
40
82
  constructor(config = {}) {
41
83
  super();
42
- /** @inheritdoc */
84
+ /**
85
+ * Detection mode identifier. Always `'acoustic'` for this implementation.
86
+ * @see {@link IEndpointDetector.mode}
87
+ */
43
88
  this.mode = 'acoustic';
44
89
  /**
45
- * Timestamp (ms) when the current speech segment began. Tracked so that
46
- * `durationMs` in the emitted {@link TurnCompleteEvent} can be computed.
90
+ * Timestamp (ms) when the current speech segment began. Used to compute
91
+ * `durationMs` in the emitted {@link TurnCompleteEvent} as:
92
+ * `speechEndTimeMs - speechStartTimeMs`.
93
+ *
94
+ * Reset to `null` on each {@link reset} call.
47
95
  */
48
96
  this.speechStartTimeMs = null;
49
97
  /**
50
98
  * Timestamp (ms) when the most recent `speech_end` VAD event was received.
51
- * Used to calculate `durationMs` for the turn-complete event.
99
+ * Used together with {@link speechStartTimeMs} to calculate `durationMs`
100
+ * for the turn-complete event.
101
+ *
102
+ * Reset to `null` on each {@link reset} call.
52
103
  */
53
104
  this.speechEndTimeMs = null;
105
+ // Build SilenceDetector config from our options with sensible defaults
54
106
  const sdConfig = {
55
107
  significantPauseThresholdMs: config.significantPauseThresholdMs ?? 1500,
56
108
  utteranceEndThresholdMs: config.utteranceEndThresholdMs ?? 3000,
57
109
  };
58
110
  this.silenceDetector = new SilenceDetector(sdConfig);
59
- // When SilenceDetector decides the utterance has ended, fire turn_complete.
111
+ // When SilenceDetector decides the utterance has ended (silence exceeded
112
+ // utteranceEndThresholdMs), translate that into a TurnCompleteEvent.
60
113
  this.silenceDetector.on('utterance_end_detected', (_silenceDurationMs) => {
114
+ // Compute the duration of actual speech (not including silence).
115
+ // Falls back to 0 if timestamps are missing (defensive).
61
116
  const durationMs = this.speechStartTimeMs !== null && this.speechEndTimeMs !== null
62
117
  ? this.speechEndTimeMs - this.speechStartTimeMs
63
118
  : 0;
64
119
  const event = {
65
- transcript: '', // Acoustic mode has no transcript access
120
+ // Acoustic mode has no transcript access -- the orchestrator will
121
+ // use whatever transcript the STT session has accumulated separately.
122
+ transcript: '',
123
+ // Confidence is 0 because we have no STT data to score.
66
124
  confidence: 0,
67
125
  durationMs,
68
126
  reason: 'silence_timeout',
@@ -71,48 +129,76 @@ export class AcousticEndpointDetector extends EventEmitter {
71
129
  });
72
130
  }
73
131
  // ---------------------------------------------------------------------------
74
- // IEndpointDetector
132
+ // IEndpointDetector -- pushVadEvent
75
133
  // ---------------------------------------------------------------------------
76
134
  /**
77
135
  * Converts a {@link VadEvent} into the SilenceDetector's expected API calls.
78
136
  *
79
- * - `speech_start` resets silence state and re-emits `'speech_start'` on self.
80
- * - `speech_end` starts the silence clock.
81
- * - `silence` treated as ongoing non-speech frames.
137
+ * - **`speech_start`**: Resets silence state (cancels pending timers) and
138
+ * re-emits `'speech_start'` on this detector for pipeline consumption.
139
+ * - **`speech_end`**: Records the timestamp and starts the silence clock.
140
+ * - **`silence`**: Treated as ongoing non-speech frames, advancing the
141
+ * SilenceDetector's internal timer.
82
142
  *
83
- * @param event - Incoming VAD event.
143
+ * @param event - Incoming VAD event from the upstream voice activity detector.
84
144
  */
85
145
  pushVadEvent(event) {
86
- // Minimal VADResult stub — SilenceDetector's public methods only use it as
87
- // a pass-through parameter and don't inspect its contents.
146
+ // The SilenceDetector's API requires a VADResult parameter, but it only
147
+ // uses it as a pass-through and doesn't inspect its contents. We pass
148
+ // a minimal stub typed as `never` to satisfy the signature without
149
+ // introducing a dependency on the full VADResult type.
88
150
  const vadResultStub = { timestamp: event.timestamp };
89
151
  switch (event.type) {
90
152
  case 'speech_start':
153
+ // Record when speech began for duration calculation
91
154
  this.speechStartTimeMs = event.timestamp;
155
+ // Clear the previous speech_end since a new speech segment started
92
156
  this.speechEndTimeMs = null;
157
+ // Notify SilenceDetector to cancel any pending silence timer
93
158
  this.silenceDetector.handleSpeechStart(vadResultStub);
159
+ // Re-emit for pipeline consumers (e.g. barge-in detection)
94
160
  this.emit('speech_start');
95
161
  break;
96
162
  case 'speech_end':
163
+ // Record when speech ended for duration calculation
97
164
  this.speechEndTimeMs = event.timestamp;
165
+ // Start the silence clock -- if silence persists beyond
166
+ // utteranceEndThresholdMs, SilenceDetector fires utterance_end_detected.
167
+ // The second argument (0) is the energy level -- not used in our context.
98
168
  this.silenceDetector.handleSpeechEnd(vadResultStub, 0);
99
169
  break;
100
170
  case 'silence':
101
- // Periodic silence heartbeat pass as a non-speech frame.
171
+ // Periodic silence heartbeat -- advance SilenceDetector's internal
172
+ // timer by notifying it of continued non-speech activity.
102
173
  this.silenceDetector.handleNoVoiceActivity(vadResultStub);
103
174
  break;
104
175
  }
105
176
  }
177
+ // ---------------------------------------------------------------------------
178
+ // IEndpointDetector -- pushTranscript
179
+ // ---------------------------------------------------------------------------
106
180
  /**
107
- * No-op this detector is purely acoustic and does not use transcript content.
181
+ * No-op -- this detector is purely acoustic and does not use transcript content.
182
+ *
183
+ * The method exists solely to satisfy the {@link IEndpointDetector} interface.
184
+ * Calling it has no effect and does not throw.
108
185
  *
109
186
  * @param _event - Ignored transcript event.
110
187
  */
111
188
  pushTranscript(_event) {
112
- // Intentional no-op: acoustic mode ignores linguistic content.
189
+ // Intentional no-op: acoustic mode ignores all linguistic content.
190
+ // The HeuristicEndpointDetector should be used if transcript-based
191
+ // endpoint detection is desired.
113
192
  }
193
+ // ---------------------------------------------------------------------------
194
+ // IEndpointDetector -- reset
195
+ // ---------------------------------------------------------------------------
114
196
  /**
115
- * Resets all internal state and timers. Call at the start of each new turn.
197
+ * Resets all internal state and cancels pending timers.
198
+ *
199
+ * Should be called at the start of each new turn to ensure clean state.
200
+ * This also resets the underlying SilenceDetector, cancelling any pending
201
+ * utterance_end_detected timer.
116
202
  */
117
203
  reset() {
118
204
  this.speechStartTimeMs = null;
@@ -1 +1 @@
1
- {"version":3,"file":"AcousticEndpointDetector.js","sourceRoot":"","sources":["../../src/voice-pipeline/AcousticEndpointDetector.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;GAYG;AAEH,OAAO,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAC3C,OAAO,EAAE,eAAe,EAA8B,MAAM,kCAAkC,CAAC;AAgC/F,8EAA8E;AAC9E,iBAAiB;AACjB,8EAA8E;AAE9E;;;;;;;;;;;;;GAaG;AACH,MAAM,OAAO,wBAAyB,SAAQ,YAAY;IAmBxD,8EAA8E;IAE9E;;;;OAIG;IACH,YAAY,SAAyC,EAAE;QACrD,KAAK,EAAE,CAAC;QA1BV,kBAAkB;QACF,SAAI,GAAG,UAAmB,CAAC;QAK3C;;;WAGG;QACK,sBAAiB,GAAkB,IAAI,CAAC;QAEhD;;;WAGG;QACK,oBAAe,GAAkB,IAAI,CAAC;QAY5C,MAAM,QAAQ,GAA0B;YACtC,2BAA2B,EAAE,MAAM,CAAC,2BAA2B,IAAI,IAAI;YACvE,uBAAuB,EAAE,MAAM,CAAC,uBAAuB,IAAI,IAAI;SAChE,CAAC;QAEF,IAAI,CAAC,eAAe,GAAG,IAAI,eAAe,CAAC,QAAQ,CAAC,CAAC;QAErD,4EAA4E;QAC5E,IAAI,CAAC,eAAe,CAAC,EAAE,CAAC,wBAAwB,EAAE,CAAC,kBAA0B,EAAE,EAAE;YAC/E,MAAM,UAAU,GACd,IAAI,CAAC,iBAAiB,KAAK,IAAI,IAAI,IAAI,CAAC,eAAe,KAAK,IAAI;gBAC9D,CAAC,CAAC,IAAI,CAAC,eAAe,GAAG,IAAI,CAAC,iBAAiB;gBAC/C,CAAC,CAAC,CAAC,CAAC;YAER,MAAM,KAAK,GAAsB;gBAC/B,UAAU,EAAE,EAAE,EAAI,yCAAyC;gBAC3D,UAAU,EAAE,CAAC;gBACb,UAAU;gBACV,MAAM,EAAE,iBAAiB;aAC1B,CAAC;YAEF,IAAI,CAAC,IAAI,CAAC,eAAe,EAAE,KAAK,CAAC,CAAC;QACpC,CAAC,CAAC,CAAC;IACL,CAAC;IAED,8EAA8E;IAC9E,oBAAoB;IACpB,8EAA8E;IAE9E;;;;;;;;OAQG;IACI,YAAY,CAAC,KAAe;QACjC,2EAA2E;QAC3E,2DAA2D;QAC3D,MAAM,aAAa,GAAG,EAAE,SAAS,EAAE,KAAK,CAAC,SAAS,EAAW,CAAC;QAE9D,QAAQ,KAAK,CAAC,IAAI,EAAE,CAAC;YACnB,KAAK,cAAc;gBACjB,IAAI,CAAC,iBAAiB,GAAG,KAAK,CAAC,SAAS,CAAC;gBACzC,IAAI,CAAC,eAAe,GAAG,IAAI,CAAC;gBAC5B,IAAI,CAAC,eAAe,CAAC,iBAAiB,CAAC,aAAa,CAAC,CAAC;gBACtD,IAAI,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;gBAC1B,MAAM;YAER,KAAK,YAAY;gBACf,IAAI,CAAC,eAAe,GAAG,KAAK,CAAC,SAAS,CAAC;gBACvC,IAAI,CAAC,eAAe,CAAC,eAAe,CAAC,aAAa,EAAE,CAAC,CAAC,CAAC;gBACvD,MAAM;YAER,KAAK,SAAS;gBACZ,2DAA2D;gBAC3D,IAAI,CAAC,eAAe,CAAC,qBAAqB,CAAC,aAAa,CAAC,CAAC;gBAC1D,MAAM;QACV,CAAC;IACH,CAAC;IAED;;;;OAIG;IACI,cAAc,CAAC,MAAuB;QAC3C,+DAA+D;IACjE,CAAC;IAED;;OAEG;IACI,KAAK;QACV,IAAI,CAAC,iBAAiB,GAAG,IAAI,CAAC;QAC9B,IAAI,CAAC,eAAe,GAAG,IAAI,CAAC;QAC5B,IAAI,CAAC,eAA
e,CAAC,KAAK,EAAE,CAAC;IAC/B,CAAC;CACF"}
1
+ {"version":3,"file":"AcousticEndpointDetector.js","sourceRoot":"","sources":["../../src/voice-pipeline/AcousticEndpointDetector.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA6CG;AAEH,OAAO,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAC3C,OAAO,EAAE,eAAe,EAA8B,MAAM,kCAAkC,CAAC;AA8C/F,8EAA8E;AAC9E,iBAAiB;AACjB,8EAA8E;AAE9E;;;;;;;;;;;;;;;;;;;;GAoBG;AACH,MAAM,OAAO,wBAAyB,SAAQ,YAAY;IA+BxD,8EAA8E;IAC9E,cAAc;IACd,8EAA8E;IAE9E;;;;OAIG;IACH,YAAY,SAAyC,EAAE;QACrD,KAAK,EAAE,CAAC;QAxCV;;;WAGG;QACa,SAAI,GAAG,UAAmB,CAAC;QAQ3C;;;;;;WAMG;QACK,sBAAiB,GAAkB,IAAI,CAAC;QAEhD;;;;;;WAMG;QACK,oBAAe,GAAkB,IAAI,CAAC;QAc5C,uEAAuE;QACvE,MAAM,QAAQ,GAA0B;YACtC,2BAA2B,EAAE,MAAM,CAAC,2BAA2B,IAAI,IAAI;YACvE,uBAAuB,EAAE,MAAM,CAAC,uBAAuB,IAAI,IAAI;SAChE,CAAC;QAEF,IAAI,CAAC,eAAe,GAAG,IAAI,eAAe,CAAC,QAAQ,CAAC,CAAC;QAErD,yEAAyE;QACzE,qEAAqE;QACrE,IAAI,CAAC,eAAe,CAAC,EAAE,CAAC,wBAAwB,EAAE,CAAC,kBAA0B,EAAE,EAAE;YAC/E,iEAAiE;YACjE,yDAAyD;YACzD,MAAM,UAAU,GACd,IAAI,CAAC,iBAAiB,KAAK,IAAI,IAAI,IAAI,CAAC,eAAe,KAAK,IAAI;gBAC9D,CAAC,CAAC,IAAI,CAAC,eAAe,GAAG,IAAI,CAAC,iBAAiB;gBAC/C,CAAC,CAAC,CAAC,CAAC;YAER,MAAM,KAAK,GAAsB;gBAC/B,kEAAkE;gBAClE,sEAAsE;gBACtE,UAAU,EAAE,EAAE;gBACd,wDAAwD;gBACxD,UAAU,EAAE,CAAC;gBACb,UAAU;gBACV,MAAM,EAAE,iBAAiB;aAC1B,CAAC;YAEF,IAAI,CAAC,IAAI,CAAC,eAAe,EAAE,KAAK,CAAC,CAAC;QACpC,CAAC,CAAC,CAAC;IACL,CAAC;IAED,8EAA8E;IAC9E,oCAAoC;IACpC,8EAA8E;IAE9E;;;;;;;;;;OAUG;IACI,YAAY,CAAC,KAAe;QACjC,wEAAwE;QACxE,sEAAsE;QACtE,mEAAmE;QACnE,uDAAuD;QACvD,MAAM,aAAa,GAAG,EAAE,SAAS,EAAE,KAAK,CAAC,SAAS,EAAW,CAAC;QAE9D,QAAQ,KAAK,CAAC,IAAI,EAAE,CAAC;YACnB,KAAK,cAAc;gBACjB,oDAAoD;gBACpD,IAAI,CAAC,iBAAiB,GAAG,KAAK,CAAC,SAAS,CAAC;gBACzC,mEAAmE;gBACnE,IAAI,CAAC,eAAe,GAAG,IAAI,CAAC;gBAC5B,6DAA6D;gBAC7D,IAAI,CAAC,eAAe,CAAC,iBAAiB,CAAC,aAAa,CAAC,CAAC;gBACtD,2DAA2D;gBAC3D,IAAI,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;gBAC1B,MAAM;YAER,KAAK,YAAY;gBACf,oDAAoD;gBACpD,IAAI,CAAC,eAAe,GAAG,KAAK,CAAC,SAAS,CAAC;gBACvC,wDAAwD;gBACxD,yEAAyE;gBACzE,0EAA0E;gBAC1E,IAAI,CAAC,eAAe,CAAC,eAAe,CAAC,aAAa,EAAE,CAAC,CAAC,CAAC;gBACv
D,MAAM;YAER,KAAK,SAAS;gBACZ,mEAAmE;gBACnE,0DAA0D;gBAC1D,IAAI,CAAC,eAAe,CAAC,qBAAqB,CAAC,aAAa,CAAC,CAAC;gBAC1D,MAAM;QACV,CAAC;IACH,CAAC;IAED,8EAA8E;IAC9E,sCAAsC;IACtC,8EAA8E;IAE9E;;;;;;;OAOG;IACI,cAAc,CAAC,MAAuB;QAC3C,mEAAmE;QACnE,mEAAmE;QACnE,iCAAiC;IACnC,CAAC;IAED,8EAA8E;IAC9E,6BAA6B;IAC7B,8EAA8E;IAE9E;;;;;;OAMG;IACI,KAAK;QACV,IAAI,CAAC,iBAAiB,GAAG,IAAI,CAAC;QAC9B,IAAI,CAAC,eAAe,GAAG,IAAI,CAAC;QAC5B,IAAI,CAAC,eAAe,CAAC,KAAK,EAAE,CAAC;IAC/B,CAAC;CACF"}
@@ -2,21 +2,59 @@
2
2
  * @module voice-pipeline/HardCutBargeinHandler
3
3
  *
4
4
  * Implements a hard-cut barge-in policy: when the user speaks over TTS output
5
- * for at least `minSpeechMs` milliseconds, playback is stopped immediately with
6
- * no fade-out. Short detections below the threshold are treated as accidental
7
- * noise and ignored.
5
+ * for at least {@link HardCutBargeinHandlerOptions.minSpeechMs} milliseconds,
6
+ * playback is stopped immediately with no fade-out. Short detections below the
7
+ * threshold are treated as accidental noise and ignored.
8
+ *
9
+ * ## Why 300 ms default threshold?
10
+ *
11
+ * The 300 ms threshold was chosen to filter out common non-speech audio events
12
+ * that trigger false barge-in detections:
13
+ *
14
+ * - **Lip smacks**: Typically 50-150 ms of energy.
15
+ * - **Breaths/sighs**: Typically 100-250 ms of energy.
16
+ * - **Coughs/sneezes**: Short burst 100-200 ms, but may exceed threshold.
17
+ * - **Background noise spikes**: Door closing, keyboard typing -- usually < 200 ms.
18
+ *
19
+ * At 300 ms, a detection almost certainly represents intentional speech rather
20
+ * than ambient noise. Lowering to < 200 ms increases false positives significantly
21
+ * in noisy environments. Raising to > 500 ms adds noticeable delay before the
22
+ * agent acknowledges the interruption.
23
+ *
24
+ * ## When to use hard-cut vs soft-fade
25
+ *
26
+ * Use hard-cut when:
27
+ * - The conversation style is fast-paced (e.g. customer support).
28
+ * - Users expect immediate response to interruption.
29
+ * - Audio quality is high (fewer false positives).
30
+ *
31
+ * Use {@link SoftFadeBargeinHandler} when:
32
+ * - The conversation is more measured (e.g. storytelling, education).
33
+ * - Users may accidentally trigger barge-in (noisy environment).
34
+ * - A smoother audio experience is preferred.
35
+ *
36
+ * @see {@link SoftFadeBargeinHandler} for the three-tier soft-fade alternative.
37
+ * @see {@link IBargeinHandler} for the interface contract.
8
38
  */
9
39
  import type { BargeinAction, BargeinContext, IBargeinHandler } from './types.js';
10
40
  /**
11
41
  * Construction options for {@link HardCutBargeinHandler}.
42
+ *
43
+ * @example
44
+ * ```typescript
45
+ * const handler = new HardCutBargeinHandler({ minSpeechMs: 250 });
46
+ * ```
12
47
  */
13
48
  export interface HardCutBargeinHandlerOptions {
14
49
  /**
15
50
  * Minimum confirmed speech duration (in milliseconds) required before a
16
51
  * barge-in is treated as intentional. Detections shorter than this value are
17
- * returned as `{ type: 'ignore' }` to avoid reacting to background noise.
52
+ * returned as `{ type: 'ignore' }` to avoid reacting to background noise,
53
+ * lip smacks, breaths, or other brief non-speech audio events.
18
54
  *
19
55
  * @defaultValue 300
56
+ *
57
+ * @see Module-level documentation for rationale behind the 300 ms default.
20
58
  */
21
59
  minSpeechMs?: number;
22
60
  }
@@ -24,26 +62,38 @@ export interface HardCutBargeinHandlerOptions {
24
62
  * Barge-in handler that applies a hard-cut strategy.
25
63
  *
26
64
  * When the user speaks over an active TTS stream, this handler immediately
27
- * cancels playback if the detected speech exceeds `minSpeechMs`. Below that
28
- * threshold the interruption is considered noise and playback continues
65
+ * cancels playback if the detected speech lasts at least {@link minSpeechMs} ms. Below
66
+ * that threshold the interruption is considered noise and playback continues
29
67
  * uninterrupted.
30
68
  *
69
+ * The handler is stateless -- each {@link handleBargein} call is evaluated
70
+ * independently with no memory of previous barge-in events.
71
+ *
72
+ * @see {@link IBargeinHandler} for the interface contract.
73
+ * @see {@link SoftFadeBargeinHandler} for the three-tier alternative.
74
+ *
31
75
  * @example
32
- * ```ts
76
+ * ```typescript
33
77
  * const handler = new HardCutBargeinHandler({ minSpeechMs: 250 });
34
- * const action = handler.handleBargein({ speechDurationMs: 400, ... });
35
- * // action.type === 'cancel'
78
+ *
79
+ * // Short noise -> ignored
80
+ * handler.handleBargein({ speechDurationMs: 100, interruptedText: '...', playedDurationMs: 500 });
81
+ * // -> { type: 'ignore' }
82
+ *
83
+ * // Intentional speech -> cancel
84
+ * handler.handleBargein({ speechDurationMs: 400, interruptedText: '...', playedDurationMs: 500 });
85
+ * // -> { type: 'cancel', injectMarker: '[interrupted]' }
36
86
  * ```
37
87
  */
38
88
  export declare class HardCutBargeinHandler implements IBargeinHandler {
39
89
  /**
40
90
  * The interruption strategy implemented by this handler.
41
- * Always `'hard-cut'`.
91
+ * Always `'hard-cut'` -- playback is stopped instantly with no fade.
42
92
  */
43
93
  readonly mode: "hard-cut";
44
94
  /**
45
95
  * Minimum speech duration in milliseconds before the interruption is
46
- * considered intentional.
96
+ * considered intentional. Set once at construction and never changed.
47
97
  */
48
98
  private readonly minSpeechMs;
49
99
  /**
@@ -55,12 +105,13 @@ export declare class HardCutBargeinHandler implements IBargeinHandler {
55
105
  /**
56
106
  * Evaluate the barge-in context and return the action the pipeline should take.
57
107
  *
58
- * - If `context.speechDurationMs >= minSpeechMs`, returns
59
- * `{ type: 'cancel', injectMarker: '[interrupted]' }` to immediately halt TTS.
60
- * - Otherwise returns `{ type: 'ignore' }` to continue playback.
108
+ * Decision logic (binary threshold):
109
+ * - `speechDurationMs >= minSpeechMs` -> Cancel TTS immediately and inject
110
+ * an `'[interrupted]'` marker into the conversation context.
111
+ * - `speechDurationMs < minSpeechMs` -> Ignore the detection as noise.
61
112
  *
62
113
  * @param context - Snapshot of the barge-in state at the moment of detection.
63
- * @returns The pipeline action to execute.
114
+ * @returns The pipeline action to execute. Always synchronous (no Promise).
64
115
  */
65
116
  handleBargein(context: BargeinContext): BargeinAction;
66
117
  }
@@ -1 +1 @@
1
- {"version":3,"file":"HardCutBargeinHandler.d.ts","sourceRoot":"","sources":["../../src/voice-pipeline/HardCutBargeinHandler.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAEH,OAAO,KAAK,EAAE,aAAa,EAAE,cAAc,EAAE,eAAe,EAAE,MAAM,YAAY,CAAC;AAEjF;;GAEG;AACH,MAAM,WAAW,4BAA4B;IAC3C;;;;;;OAMG;IACH,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB;AAED;;;;;;;;;;;;;;GAcG;AACH,qBAAa,qBAAsB,YAAW,eAAe;IAC3D;;;OAGG;IACH,QAAQ,CAAC,IAAI,EAAG,UAAU,CAAU;IAEpC;;;OAGG;IACH,OAAO,CAAC,QAAQ,CAAC,WAAW,CAAS;IAErC;;;;OAIG;gBACS,OAAO,GAAE,4BAAiC;IAItD;;;;;;;;;OASG;IACH,aAAa,CAAC,OAAO,EAAE,cAAc,GAAG,aAAa;CAMtD"}
1
+ {"version":3,"file":"HardCutBargeinHandler.d.ts","sourceRoot":"","sources":["../../src/voice-pipeline/HardCutBargeinHandler.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAqCG;AAEH,OAAO,KAAK,EAAE,aAAa,EAAE,cAAc,EAAE,eAAe,EAAE,MAAM,YAAY,CAAC;AAMjF;;;;;;;GAOG;AACH,MAAM,WAAW,4BAA4B;IAC3C;;;;;;;;;OASG;IACH,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB;AAMD;;;;;;;;;;;;;;;;;;;;;;;;;;GA0BG;AACH,qBAAa,qBAAsB,YAAW,eAAe;IAC3D;;;OAGG;IACH,QAAQ,CAAC,IAAI,EAAG,UAAU,CAAU;IAEpC;;;OAGG;IACH,OAAO,CAAC,QAAQ,CAAC,WAAW,CAAS;IAErC;;;;OAIG;gBACS,OAAO,GAAE,4BAAiC;IAItD;;;;;;;;;;OAUG;IACH,aAAa,CAAC,OAAO,EAAE,cAAc,GAAG,aAAa;CAatD"}