@framers/agentos 0.1.108 → 0.1.110

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. package/dist/orchestration/runtime/GraphRuntime.d.ts.map +1 -1
  2. package/dist/orchestration/runtime/GraphRuntime.js +11 -4
  3. package/dist/orchestration/runtime/GraphRuntime.js.map +1 -1
  4. package/dist/orchestration/runtime/safeExpressionEvaluator.d.ts.map +1 -1
  5. package/dist/orchestration/runtime/safeExpressionEvaluator.js +35 -16
  6. package/dist/orchestration/runtime/safeExpressionEvaluator.js.map +1 -1
  7. package/dist/voice-pipeline/AcousticEndpointDetector.d.ts +95 -20
  8. package/dist/voice-pipeline/AcousticEndpointDetector.d.ts.map +1 -1
  9. package/dist/voice-pipeline/AcousticEndpointDetector.js +110 -24
  10. package/dist/voice-pipeline/AcousticEndpointDetector.js.map +1 -1
  11. package/dist/voice-pipeline/HardCutBargeinHandler.d.ts +66 -15
  12. package/dist/voice-pipeline/HardCutBargeinHandler.d.ts.map +1 -1
  13. package/dist/voice-pipeline/HardCutBargeinHandler.js +65 -13
  14. package/dist/voice-pipeline/HardCutBargeinHandler.js.map +1 -1
  15. package/dist/voice-pipeline/HeuristicEndpointDetector.d.ts +116 -42
  16. package/dist/voice-pipeline/HeuristicEndpointDetector.d.ts.map +1 -1
  17. package/dist/voice-pipeline/HeuristicEndpointDetector.js +159 -52
  18. package/dist/voice-pipeline/HeuristicEndpointDetector.js.map +1 -1
  19. package/dist/voice-pipeline/SoftFadeBargeinHandler.d.ts +89 -24
  20. package/dist/voice-pipeline/SoftFadeBargeinHandler.d.ts.map +1 -1
  21. package/dist/voice-pipeline/SoftFadeBargeinHandler.js +74 -20
  22. package/dist/voice-pipeline/SoftFadeBargeinHandler.js.map +1 -1
  23. package/dist/voice-pipeline/VoiceInterruptError.d.ts +68 -10
  24. package/dist/voice-pipeline/VoiceInterruptError.d.ts.map +1 -1
  25. package/dist/voice-pipeline/VoiceInterruptError.js +53 -6
  26. package/dist/voice-pipeline/VoiceInterruptError.js.map +1 -1
  27. package/dist/voice-pipeline/VoicePipelineOrchestrator.d.ts +190 -39
  28. package/dist/voice-pipeline/VoicePipelineOrchestrator.d.ts.map +1 -1
  29. package/dist/voice-pipeline/VoicePipelineOrchestrator.js +266 -53
  30. package/dist/voice-pipeline/VoicePipelineOrchestrator.js.map +1 -1
  31. package/dist/voice-pipeline/WebSocketStreamTransport.d.ts +135 -43
  32. package/dist/voice-pipeline/WebSocketStreamTransport.d.ts.map +1 -1
  33. package/dist/voice-pipeline/WebSocketStreamTransport.js +109 -47
  34. package/dist/voice-pipeline/WebSocketStreamTransport.js.map +1 -1
  35. package/dist/voice-pipeline/index.d.ts +34 -1
  36. package/dist/voice-pipeline/index.d.ts.map +1 -1
  37. package/dist/voice-pipeline/index.js +41 -1
  38. package/dist/voice-pipeline/index.js.map +1 -1
  39. package/dist/voice-pipeline/types.d.ts +432 -106
  40. package/dist/voice-pipeline/types.d.ts.map +1 -1
  41. package/dist/voice-pipeline/types.js +21 -9
  42. package/dist/voice-pipeline/types.js.map +1 -1
  43. package/package.json +1 -1
@@ -1 +1 @@
1
- {"version":3,"file":"AcousticEndpointDetector.js","sourceRoot":"","sources":["../../src/voice-pipeline/AcousticEndpointDetector.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;GAYG;AAEH,OAAO,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAC3C,OAAO,EAAE,eAAe,EAA8B,MAAM,kCAAkC,CAAC;AAgC/F,8EAA8E;AAC9E,iBAAiB;AACjB,8EAA8E;AAE9E;;;;;;;;;;;;;GAaG;AACH,MAAM,OAAO,wBAAyB,SAAQ,YAAY;IAmBxD,8EAA8E;IAE9E;;;;OAIG;IACH,YAAY,SAAyC,EAAE;QACrD,KAAK,EAAE,CAAC;QA1BV,kBAAkB;QACF,SAAI,GAAG,UAAmB,CAAC;QAK3C;;;WAGG;QACK,sBAAiB,GAAkB,IAAI,CAAC;QAEhD;;;WAGG;QACK,oBAAe,GAAkB,IAAI,CAAC;QAY5C,MAAM,QAAQ,GAA0B;YACtC,2BAA2B,EAAE,MAAM,CAAC,2BAA2B,IAAI,IAAI;YACvE,uBAAuB,EAAE,MAAM,CAAC,uBAAuB,IAAI,IAAI;SAChE,CAAC;QAEF,IAAI,CAAC,eAAe,GAAG,IAAI,eAAe,CAAC,QAAQ,CAAC,CAAC;QAErD,4EAA4E;QAC5E,IAAI,CAAC,eAAe,CAAC,EAAE,CAAC,wBAAwB,EAAE,CAAC,kBAA0B,EAAE,EAAE;YAC/E,MAAM,UAAU,GACd,IAAI,CAAC,iBAAiB,KAAK,IAAI,IAAI,IAAI,CAAC,eAAe,KAAK,IAAI;gBAC9D,CAAC,CAAC,IAAI,CAAC,eAAe,GAAG,IAAI,CAAC,iBAAiB;gBAC/C,CAAC,CAAC,CAAC,CAAC;YAER,MAAM,KAAK,GAAsB;gBAC/B,UAAU,EAAE,EAAE,EAAI,yCAAyC;gBAC3D,UAAU,EAAE,CAAC;gBACb,UAAU;gBACV,MAAM,EAAE,iBAAiB;aAC1B,CAAC;YAEF,IAAI,CAAC,IAAI,CAAC,eAAe,EAAE,KAAK,CAAC,CAAC;QACpC,CAAC,CAAC,CAAC;IACL,CAAC;IAED,8EAA8E;IAC9E,oBAAoB;IACpB,8EAA8E;IAE9E;;;;;;;;OAQG;IACI,YAAY,CAAC,KAAe;QACjC,2EAA2E;QAC3E,2DAA2D;QAC3D,MAAM,aAAa,GAAG,EAAE,SAAS,EAAE,KAAK,CAAC,SAAS,EAAW,CAAC;QAE9D,QAAQ,KAAK,CAAC,IAAI,EAAE,CAAC;YACnB,KAAK,cAAc;gBACjB,IAAI,CAAC,iBAAiB,GAAG,KAAK,CAAC,SAAS,CAAC;gBACzC,IAAI,CAAC,eAAe,GAAG,IAAI,CAAC;gBAC5B,IAAI,CAAC,eAAe,CAAC,iBAAiB,CAAC,aAAa,CAAC,CAAC;gBACtD,IAAI,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;gBAC1B,MAAM;YAER,KAAK,YAAY;gBACf,IAAI,CAAC,eAAe,GAAG,KAAK,CAAC,SAAS,CAAC;gBACvC,IAAI,CAAC,eAAe,CAAC,eAAe,CAAC,aAAa,EAAE,CAAC,CAAC,CAAC;gBACvD,MAAM;YAER,KAAK,SAAS;gBACZ,2DAA2D;gBAC3D,IAAI,CAAC,eAAe,CAAC,qBAAqB,CAAC,aAAa,CAAC,CAAC;gBAC1D,MAAM;QACV,CAAC;IACH,CAAC;IAED;;;;OAIG;IACI,cAAc,CAAC,MAAuB;QAC3C,+DAA+D;IACjE,CAAC;IAED;;OAEG;IACI,KAAK;QACV,IAAI,CAAC,iBAAiB,GAAG,IAAI,CAAC;QAC9B,IAAI,CAAC,eAAe,GAAG,IAAI,CAAC;QAC5B,IAAI,CAAC,eAA
e,CAAC,KAAK,EAAE,CAAC;IAC/B,CAAC;CACF"}
1
+ {"version":3,"file":"AcousticEndpointDetector.js","sourceRoot":"","sources":["../../src/voice-pipeline/AcousticEndpointDetector.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA6CG;AAEH,OAAO,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAC3C,OAAO,EAAE,eAAe,EAA8B,MAAM,kCAAkC,CAAC;AA8C/F,8EAA8E;AAC9E,iBAAiB;AACjB,8EAA8E;AAE9E;;;;;;;;;;;;;;;;;;;;GAoBG;AACH,MAAM,OAAO,wBAAyB,SAAQ,YAAY;IA+BxD,8EAA8E;IAC9E,cAAc;IACd,8EAA8E;IAE9E;;;;OAIG;IACH,YAAY,SAAyC,EAAE;QACrD,KAAK,EAAE,CAAC;QAxCV;;;WAGG;QACa,SAAI,GAAG,UAAmB,CAAC;QAQ3C;;;;;;WAMG;QACK,sBAAiB,GAAkB,IAAI,CAAC;QAEhD;;;;;;WAMG;QACK,oBAAe,GAAkB,IAAI,CAAC;QAc5C,uEAAuE;QACvE,MAAM,QAAQ,GAA0B;YACtC,2BAA2B,EAAE,MAAM,CAAC,2BAA2B,IAAI,IAAI;YACvE,uBAAuB,EAAE,MAAM,CAAC,uBAAuB,IAAI,IAAI;SAChE,CAAC;QAEF,IAAI,CAAC,eAAe,GAAG,IAAI,eAAe,CAAC,QAAQ,CAAC,CAAC;QAErD,yEAAyE;QACzE,qEAAqE;QACrE,IAAI,CAAC,eAAe,CAAC,EAAE,CAAC,wBAAwB,EAAE,CAAC,kBAA0B,EAAE,EAAE;YAC/E,iEAAiE;YACjE,yDAAyD;YACzD,MAAM,UAAU,GACd,IAAI,CAAC,iBAAiB,KAAK,IAAI,IAAI,IAAI,CAAC,eAAe,KAAK,IAAI;gBAC9D,CAAC,CAAC,IAAI,CAAC,eAAe,GAAG,IAAI,CAAC,iBAAiB;gBAC/C,CAAC,CAAC,CAAC,CAAC;YAER,MAAM,KAAK,GAAsB;gBAC/B,kEAAkE;gBAClE,sEAAsE;gBACtE,UAAU,EAAE,EAAE;gBACd,wDAAwD;gBACxD,UAAU,EAAE,CAAC;gBACb,UAAU;gBACV,MAAM,EAAE,iBAAiB;aAC1B,CAAC;YAEF,IAAI,CAAC,IAAI,CAAC,eAAe,EAAE,KAAK,CAAC,CAAC;QACpC,CAAC,CAAC,CAAC;IACL,CAAC;IAED,8EAA8E;IAC9E,oCAAoC;IACpC,8EAA8E;IAE9E;;;;;;;;;;OAUG;IACI,YAAY,CAAC,KAAe;QACjC,wEAAwE;QACxE,sEAAsE;QACtE,mEAAmE;QACnE,uDAAuD;QACvD,MAAM,aAAa,GAAG,EAAE,SAAS,EAAE,KAAK,CAAC,SAAS,EAAW,CAAC;QAE9D,QAAQ,KAAK,CAAC,IAAI,EAAE,CAAC;YACnB,KAAK,cAAc;gBACjB,oDAAoD;gBACpD,IAAI,CAAC,iBAAiB,GAAG,KAAK,CAAC,SAAS,CAAC;gBACzC,mEAAmE;gBACnE,IAAI,CAAC,eAAe,GAAG,IAAI,CAAC;gBAC5B,6DAA6D;gBAC7D,IAAI,CAAC,eAAe,CAAC,iBAAiB,CAAC,aAAa,CAAC,CAAC;gBACtD,2DAA2D;gBAC3D,IAAI,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;gBAC1B,MAAM;YAER,KAAK,YAAY;gBACf,oDAAoD;gBACpD,IAAI,CAAC,eAAe,GAAG,KAAK,CAAC,SAAS,CAAC;gBACvC,wDAAwD;gBACxD,yEAAyE;gBACzE,0EAA0E;gBAC1E,IAAI,CAAC,eAAe,CAAC,eAAe,CAAC,aAAa,EAAE,CAAC,CAAC,CAAC;gBACv
D,MAAM;YAER,KAAK,SAAS;gBACZ,mEAAmE;gBACnE,0DAA0D;gBAC1D,IAAI,CAAC,eAAe,CAAC,qBAAqB,CAAC,aAAa,CAAC,CAAC;gBAC1D,MAAM;QACV,CAAC;IACH,CAAC;IAED,8EAA8E;IAC9E,sCAAsC;IACtC,8EAA8E;IAE9E;;;;;;;OAOG;IACI,cAAc,CAAC,MAAuB;QAC3C,mEAAmE;QACnE,mEAAmE;QACnE,iCAAiC;IACnC,CAAC;IAED,8EAA8E;IAC9E,6BAA6B;IAC7B,8EAA8E;IAE9E;;;;;;OAMG;IACI,KAAK;QACV,IAAI,CAAC,iBAAiB,GAAG,IAAI,CAAC;QAC9B,IAAI,CAAC,eAAe,GAAG,IAAI,CAAC;QAC5B,IAAI,CAAC,eAAe,CAAC,KAAK,EAAE,CAAC;IAC/B,CAAC;CACF"}
@@ -2,21 +2,59 @@
2
2
  * @module voice-pipeline/HardCutBargeinHandler
3
3
  *
4
4
  * Implements a hard-cut barge-in policy: when the user speaks over TTS output
5
- * for at least `minSpeechMs` milliseconds, playback is stopped immediately with
6
- * no fade-out. Short detections below the threshold are treated as accidental
7
- * noise and ignored.
5
+ * for at least {@link HardCutBargeinHandlerOptions.minSpeechMs} milliseconds,
6
+ * playback is stopped immediately with no fade-out. Short detections below the
7
+ * threshold are treated as accidental noise and ignored.
8
+ *
9
+ * ## Why 300 ms default threshold?
10
+ *
11
+ * The 300 ms threshold was chosen to filter out common non-speech audio events
12
+ * that trigger false barge-in detections:
13
+ *
14
+ * - **Lip smacks**: Typically 50-150 ms of energy.
15
+ * - **Breaths/sighs**: Typically 100-250 ms of energy.
16
+ * - **Coughs/sneezes**: Short burst 100-200 ms, but may exceed threshold.
17
+ * - **Background noise spikes**: Door closing, keyboard typing -- usually < 200 ms.
18
+ *
19
+ * At 300 ms, a detection almost certainly represents intentional speech rather
20
+ * than ambient noise. Lowering to < 200 ms increases false positives significantly
21
+ * in noisy environments. Raising to > 500 ms adds noticeable delay before the
22
+ * agent acknowledges the interruption.
23
+ *
24
+ * ## When to use hard-cut vs soft-fade
25
+ *
26
+ * Use hard-cut when:
27
+ * - The conversation style is fast-paced (e.g. customer support).
28
+ * - Users expect immediate response to interruption.
29
+ * - Audio quality is high (fewer false positives).
30
+ *
31
+ * Use {@link SoftFadeBargeinHandler} when:
32
+ * - The conversation is more measured (e.g. storytelling, education).
33
+ * - Users may accidentally trigger barge-in (noisy environment).
34
+ * - A smoother audio experience is preferred.
35
+ *
36
+ * @see {@link SoftFadeBargeinHandler} for the three-tier soft-fade alternative.
37
+ * @see {@link IBargeinHandler} for the interface contract.
8
38
  */
9
39
  import type { BargeinAction, BargeinContext, IBargeinHandler } from './types.js';
10
40
  /**
11
41
  * Construction options for {@link HardCutBargeinHandler}.
42
+ *
43
+ * @example
44
+ * ```typescript
45
+ * const handler = new HardCutBargeinHandler({ minSpeechMs: 250 });
46
+ * ```
12
47
  */
13
48
  export interface HardCutBargeinHandlerOptions {
14
49
  /**
15
50
  * Minimum confirmed speech duration (in milliseconds) required before a
16
51
  * barge-in is treated as intentional. Detections shorter than this value are
17
- * returned as `{ type: 'ignore' }` to avoid reacting to background noise.
52
+ * returned as `{ type: 'ignore' }` to avoid reacting to background noise,
53
+ * lip smacks, breaths, or other brief non-speech audio events.
18
54
  *
19
55
  * @defaultValue 300
56
+ *
57
+ * @see Module-level documentation for rationale behind the 300 ms default.
20
58
  */
21
59
  minSpeechMs?: number;
22
60
  }
@@ -24,26 +62,38 @@ export interface HardCutBargeinHandlerOptions {
24
62
  * Barge-in handler that applies a hard-cut strategy.
25
63
  *
26
64
  * When the user speaks over an active TTS stream, this handler immediately
27
- * cancels playback if the detected speech exceeds `minSpeechMs`. Below that
28
- * threshold the interruption is considered noise and playback continues
65
+ * cancels playback if the detected speech meets or exceeds {@link minSpeechMs}. Below
66
+ * that threshold the interruption is considered noise and playback continues
29
67
  * uninterrupted.
30
68
  *
69
+ * The handler is stateless -- each {@link handleBargein} call is evaluated
70
+ * independently with no memory of previous barge-in events.
71
+ *
72
+ * @see {@link IBargeinHandler} for the interface contract.
73
+ * @see {@link SoftFadeBargeinHandler} for the three-tier alternative.
74
+ *
31
75
  * @example
32
- * ```ts
76
+ * ```typescript
33
77
  * const handler = new HardCutBargeinHandler({ minSpeechMs: 250 });
34
- * const action = handler.handleBargein({ speechDurationMs: 400, ... });
35
- * // action.type === 'cancel'
78
+ *
79
+ * // Short noise -> ignored
80
+ * handler.handleBargein({ speechDurationMs: 100, interruptedText: '...', playedDurationMs: 500 });
81
+ * // -> { type: 'ignore' }
82
+ *
83
+ * // Intentional speech -> cancel
84
+ * handler.handleBargein({ speechDurationMs: 400, interruptedText: '...', playedDurationMs: 500 });
85
+ * // -> { type: 'cancel', injectMarker: '[interrupted]' }
36
86
  * ```
37
87
  */
38
88
  export declare class HardCutBargeinHandler implements IBargeinHandler {
39
89
  /**
40
90
  * The interruption strategy implemented by this handler.
41
- * Always `'hard-cut'`.
91
+ * Always `'hard-cut'` -- playback is stopped instantly with no fade.
42
92
  */
43
93
  readonly mode: "hard-cut";
44
94
  /**
45
95
  * Minimum speech duration in milliseconds before the interruption is
46
- * considered intentional.
96
+ * considered intentional. Set once at construction and never changed.
47
97
  */
48
98
  private readonly minSpeechMs;
49
99
  /**
@@ -55,12 +105,13 @@ export declare class HardCutBargeinHandler implements IBargeinHandler {
55
105
  /**
56
106
  * Evaluate the barge-in context and return the action the pipeline should take.
57
107
  *
58
- * - If `context.speechDurationMs >= minSpeechMs`, returns
59
- * `{ type: 'cancel', injectMarker: '[interrupted]' }` to immediately halt TTS.
60
- * - Otherwise returns `{ type: 'ignore' }` to continue playback.
108
+ * Decision logic (binary threshold):
109
+ * - `speechDurationMs >= minSpeechMs` -> Cancel TTS immediately and inject
110
+ * an `'[interrupted]'` marker into the conversation context.
111
+ * - `speechDurationMs < minSpeechMs` -> Ignore the detection as noise.
61
112
  *
62
113
  * @param context - Snapshot of the barge-in state at the moment of detection.
63
- * @returns The pipeline action to execute.
114
+ * @returns The pipeline action to execute. Always synchronous (no Promise).
64
115
  */
65
116
  handleBargein(context: BargeinContext): BargeinAction;
66
117
  }
@@ -1 +1 @@
1
- {"version":3,"file":"HardCutBargeinHandler.d.ts","sourceRoot":"","sources":["../../src/voice-pipeline/HardCutBargeinHandler.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAEH,OAAO,KAAK,EAAE,aAAa,EAAE,cAAc,EAAE,eAAe,EAAE,MAAM,YAAY,CAAC;AAEjF;;GAEG;AACH,MAAM,WAAW,4BAA4B;IAC3C;;;;;;OAMG;IACH,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB;AAED;;;;;;;;;;;;;;GAcG;AACH,qBAAa,qBAAsB,YAAW,eAAe;IAC3D;;;OAGG;IACH,QAAQ,CAAC,IAAI,EAAG,UAAU,CAAU;IAEpC;;;OAGG;IACH,OAAO,CAAC,QAAQ,CAAC,WAAW,CAAS;IAErC;;;;OAIG;gBACS,OAAO,GAAE,4BAAiC;IAItD;;;;;;;;;OASG;IACH,aAAa,CAAC,OAAO,EAAE,cAAc,GAAG,aAAa;CAMtD"}
1
+ {"version":3,"file":"HardCutBargeinHandler.d.ts","sourceRoot":"","sources":["../../src/voice-pipeline/HardCutBargeinHandler.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAqCG;AAEH,OAAO,KAAK,EAAE,aAAa,EAAE,cAAc,EAAE,eAAe,EAAE,MAAM,YAAY,CAAC;AAMjF;;;;;;;GAOG;AACH,MAAM,WAAW,4BAA4B;IAC3C;;;;;;;;;OASG;IACH,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB;AAMD;;;;;;;;;;;;;;;;;;;;;;;;;;GA0BG;AACH,qBAAa,qBAAsB,YAAW,eAAe;IAC3D;;;OAGG;IACH,QAAQ,CAAC,IAAI,EAAG,UAAU,CAAU;IAEpC;;;OAGG;IACH,OAAO,CAAC,QAAQ,CAAC,WAAW,CAAS;IAErC;;;;OAIG;gBACS,OAAO,GAAE,4BAAiC;IAItD;;;;;;;;;;OAUG;IACH,aAAa,CAAC,OAAO,EAAE,cAAc,GAAG,aAAa;CAatD"}
@@ -2,23 +2,68 @@
2
2
  * @module voice-pipeline/HardCutBargeinHandler
3
3
  *
4
4
  * Implements a hard-cut barge-in policy: when the user speaks over TTS output
5
- * for at least `minSpeechMs` milliseconds, playback is stopped immediately with
6
- * no fade-out. Short detections below the threshold are treated as accidental
7
- * noise and ignored.
5
+ * for at least {@link HardCutBargeinHandlerOptions.minSpeechMs} milliseconds,
6
+ * playback is stopped immediately with no fade-out. Short detections below the
7
+ * threshold are treated as accidental noise and ignored.
8
+ *
9
+ * ## Why 300 ms default threshold?
10
+ *
11
+ * The 300 ms threshold was chosen to filter out common non-speech audio events
12
+ * that trigger false barge-in detections:
13
+ *
14
+ * - **Lip smacks**: Typically 50-150 ms of energy.
15
+ * - **Breaths/sighs**: Typically 100-250 ms of energy.
16
+ * - **Coughs/sneezes**: Short burst 100-200 ms, but may exceed threshold.
17
+ * - **Background noise spikes**: Door closing, keyboard typing -- usually < 200 ms.
18
+ *
19
+ * At 300 ms, a detection almost certainly represents intentional speech rather
20
+ * than ambient noise. Lowering to < 200 ms increases false positives significantly
21
+ * in noisy environments. Raising to > 500 ms adds noticeable delay before the
22
+ * agent acknowledges the interruption.
23
+ *
24
+ * ## When to use hard-cut vs soft-fade
25
+ *
26
+ * Use hard-cut when:
27
+ * - The conversation style is fast-paced (e.g. customer support).
28
+ * - Users expect immediate response to interruption.
29
+ * - Audio quality is high (fewer false positives).
30
+ *
31
+ * Use {@link SoftFadeBargeinHandler} when:
32
+ * - The conversation is more measured (e.g. storytelling, education).
33
+ * - Users may accidentally trigger barge-in (noisy environment).
34
+ * - A smoother audio experience is preferred.
35
+ *
36
+ * @see {@link SoftFadeBargeinHandler} for the three-tier soft-fade alternative.
37
+ * @see {@link IBargeinHandler} for the interface contract.
8
38
  */
39
+ // ---------------------------------------------------------------------------
40
+ // Implementation
41
+ // ---------------------------------------------------------------------------
9
42
  /**
10
43
  * Barge-in handler that applies a hard-cut strategy.
11
44
  *
12
45
  * When the user speaks over an active TTS stream, this handler immediately
13
- * cancels playback if the detected speech exceeds `minSpeechMs`. Below that
14
- * threshold the interruption is considered noise and playback continues
46
+ * cancels playback if the detected speech meets or exceeds {@link minSpeechMs}. Below
47
+ * that threshold the interruption is considered noise and playback continues
15
48
  * uninterrupted.
16
49
  *
50
+ * The handler is stateless -- each {@link handleBargein} call is evaluated
51
+ * independently with no memory of previous barge-in events.
52
+ *
53
+ * @see {@link IBargeinHandler} for the interface contract.
54
+ * @see {@link SoftFadeBargeinHandler} for the three-tier alternative.
55
+ *
17
56
  * @example
18
- * ```ts
57
+ * ```typescript
19
58
  * const handler = new HardCutBargeinHandler({ minSpeechMs: 250 });
20
- * const action = handler.handleBargein({ speechDurationMs: 400, ... });
21
- * // action.type === 'cancel'
59
+ *
60
+ * // Short noise -> ignored
61
+ * handler.handleBargein({ speechDurationMs: 100, interruptedText: '...', playedDurationMs: 500 });
62
+ * // -> { type: 'ignore' }
63
+ *
64
+ * // Intentional speech -> cancel
65
+ * handler.handleBargein({ speechDurationMs: 400, interruptedText: '...', playedDurationMs: 500 });
66
+ * // -> { type: 'cancel', injectMarker: '[interrupted]' }
22
67
  * ```
23
68
  */
24
69
  export class HardCutBargeinHandler {
@@ -30,7 +75,7 @@ export class HardCutBargeinHandler {
30
75
  constructor(options = {}) {
31
76
  /**
32
77
  * The interruption strategy implemented by this handler.
33
- * Always `'hard-cut'`.
78
+ * Always `'hard-cut'` -- playback is stopped instantly with no fade.
34
79
  */
35
80
  this.mode = 'hard-cut';
36
81
  this.minSpeechMs = options.minSpeechMs ?? 300;
@@ -38,17 +83,24 @@ export class HardCutBargeinHandler {
38
83
  /**
39
84
  * Evaluate the barge-in context and return the action the pipeline should take.
40
85
  *
41
- * - If `context.speechDurationMs >= minSpeechMs`, returns
42
- * `{ type: 'cancel', injectMarker: '[interrupted]' }` to immediately halt TTS.
43
- * - Otherwise returns `{ type: 'ignore' }` to continue playback.
86
+ * Decision logic (binary threshold):
87
+ * - `speechDurationMs >= minSpeechMs` -> Cancel TTS immediately and inject
88
+ * an `'[interrupted]'` marker into the conversation context.
89
+ * - `speechDurationMs < minSpeechMs` -> Ignore the detection as noise.
44
90
  *
45
91
  * @param context - Snapshot of the barge-in state at the moment of detection.
46
- * @returns The pipeline action to execute.
92
+ * @returns The pipeline action to execute. Always synchronous (no Promise).
47
93
  */
48
94
  handleBargein(context) {
49
95
  if (context.speechDurationMs >= this.minSpeechMs) {
96
+ // Speech duration meets the threshold -> intentional interruption.
97
+ // The '[interrupted]' marker is injected into the conversation history
98
+ // so the agent knows its previous response was cut short and can avoid
99
+ // repeating the interrupted content.
50
100
  return { type: 'cancel', injectMarker: '[interrupted]' };
51
101
  }
102
+ // Below threshold -> likely noise, lip smack, or breath.
103
+ // Continue TTS playback as if nothing happened.
52
104
  return { type: 'ignore' };
53
105
  }
54
106
  }
@@ -1 +1 @@
1
- {"version":3,"file":"HardCutBargeinHandler.js","sourceRoot":"","sources":["../../src/voice-pipeline/HardCutBargeinHandler.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAkBH;;;;;;;;;;;;;;GAcG;AACH,MAAM,OAAO,qBAAqB;IAahC;;;;OAIG;IACH,YAAY,UAAwC,EAAE;QAjBtD;;;WAGG;QACM,SAAI,GAAG,UAAmB,CAAC;QAclC,IAAI,CAAC,WAAW,GAAG,OAAO,CAAC,WAAW,IAAI,GAAG,CAAC;IAChD,CAAC;IAED;;;;;;;;;OASG;IACH,aAAa,CAAC,OAAuB;QACnC,IAAI,OAAO,CAAC,gBAAgB,IAAI,IAAI,CAAC,WAAW,EAAE,CAAC;YACjD,OAAO,EAAE,IAAI,EAAE,QAAQ,EAAE,YAAY,EAAE,eAAe,EAAE,CAAC;QAC3D,CAAC;QACD,OAAO,EAAE,IAAI,EAAE,QAAQ,EAAE,CAAC;IAC5B,CAAC;CACF"}
1
+ {"version":3,"file":"HardCutBargeinHandler.js","sourceRoot":"","sources":["../../src/voice-pipeline/HardCutBargeinHandler.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAqCG;AA8BH,8EAA8E;AAC9E,iBAAiB;AACjB,8EAA8E;AAE9E;;;;;;;;;;;;;;;;;;;;;;;;;;GA0BG;AACH,MAAM,OAAO,qBAAqB;IAahC;;;;OAIG;IACH,YAAY,UAAwC,EAAE;QAjBtD;;;WAGG;QACM,SAAI,GAAG,UAAmB,CAAC;QAclC,IAAI,CAAC,WAAW,GAAG,OAAO,CAAC,WAAW,IAAI,GAAG,CAAC;IAChD,CAAC;IAED;;;;;;;;;;OAUG;IACH,aAAa,CAAC,OAAuB;QACnC,IAAI,OAAO,CAAC,gBAAgB,IAAI,IAAI,CAAC,WAAW,EAAE,CAAC;YACjD,mEAAmE;YACnE,uEAAuE;YACvE,uEAAuE;YACvE,qCAAqC;YACrC,OAAO,EAAE,IAAI,EAAE,QAAQ,EAAE,YAAY,EAAE,eAAe,EAAE,CAAC;QAC3D,CAAC;QAED,yDAAyD;QACzD,gDAAgD;QAChD,OAAO,EAAE,IAAI,EAAE,QAAQ,EAAE,CAAC;IAC5B,CAAC;CACF"}
@@ -6,24 +6,46 @@
6
6
  * finished speaking. Suitable for low-latency deployments where an LLM-based
7
7
  * semantic detector would add unacceptable round-trip overhead.
8
8
  *
9
- * Detection strategy:
10
- * 1. On `speech_end`, if the accumulated final transcript ends with `.`, `?`, or `!`,
11
- * fire `turn_complete` immediately with reason `'punctuation'`.
12
- * 2. Otherwise, start a silence timer (default 1 500 ms). If speech does not
13
- * resume before the timer fires, emit `turn_complete` with reason `'silence_timeout'`.
14
- * 3. Backchannel phrases (e.g. "uh huh", "yeah") are recognised, suppressed from
15
- * accumulation, and re-emitted as `'backchannel_detected'` events so the
16
- * pipeline can decide whether to suppress an agent response.
9
+ * ## Detection strategy
10
+ *
11
+ * 1. On `speech_end`, if the accumulated final transcript ends with `.`, `?`,
12
+ * or `!`, fire `turn_complete` immediately with reason `'punctuation'`.
13
+ * This provides the lowest-latency turn handoff for well-punctuated speech.
14
+ *
15
+ * 2. Otherwise, start a silence timer (default 1,500 ms). If speech does not
16
+ * resume before the timer fires, emit `turn_complete` with reason
17
+ * `'silence_timeout'`. The timeout acts as a safety net for STT providers
18
+ * that don't produce terminal punctuation reliably.
19
+ *
20
+ * 3. Backchannel phrases (e.g. "uh huh", "yeah") are recognised, suppressed
21
+ * from accumulation, and re-emitted as `'backchannel_detected'` events so
22
+ * the pipeline can decide whether to suppress an agent response.
23
+ *
24
+ * ## Why heuristic over acoustic-only?
25
+ *
26
+ * Pure silence timeout adds up to 1.5 s of unnecessary latency on every turn
27
+ * when the user ends a sentence cleanly. By checking for terminal punctuation,
28
+ * this detector can fire turn_complete immediately, cutting perceived latency
29
+ * by more than half for typical conversational speech.
30
+ *
31
+ * @see {@link AcousticEndpointDetector} for the purely acoustic alternative.
32
+ * @see {@link IEndpointDetector} for the interface contract.
17
33
  */
18
34
  import { EventEmitter } from 'node:events';
19
35
  import type { IEndpointDetector, TranscriptEvent, VadEvent } from './types.js';
20
36
  /**
21
37
  * Constructor options for {@link HeuristicEndpointDetector}.
38
+ *
39
+ * @example
40
+ * ```typescript
41
+ * const detector = new HeuristicEndpointDetector({ silenceTimeoutMs: 1000 });
42
+ * ```
22
43
  */
23
44
  export interface HeuristicEndpointDetectorOptions {
24
45
  /**
25
46
  * How long (ms) to wait after `speech_end` before emitting `turn_complete`
26
- * when no terminal punctuation is detected.
47
+ * when no terminal punctuation is detected. Lower values reduce latency
48
+ * but risk firing mid-sentence during natural pauses.
27
49
  * @defaultValue 1500
28
50
  */
29
51
  silenceTimeoutMs?: number;
@@ -32,96 +54,148 @@ export interface HeuristicEndpointDetectorOptions {
32
54
  * Heuristic endpoint detector that uses terminal punctuation and a silence
33
55
  * timeout to decide when the user's turn is complete.
34
56
  *
35
- * Emits:
36
- * - `'turn_complete'` ({@link TurnCompleteEvent}) — user turn has ended.
37
- * - `'backchannel_detected'` (`{ text: string }`) — a backchannel phrase was
38
- * recognised; accumulation is suppressed for this utterance.
57
+ * ## Events emitted
58
+ *
59
+ * | Event | Payload | Description |
60
+ * |--------------------------|--------------------------|------------------------------------|
61
+ * | `'turn_complete'` | {@link TurnCompleteEvent}| User turn has ended. |
62
+ * | `'backchannel_detected'` | `{ text: string }` | Backchannel phrase was recognised. |
63
+ *
64
+ * @see {@link IEndpointDetector} for the interface contract.
65
+ * @see {@link AcousticEndpointDetector} for the purely acoustic alternative.
39
66
  *
40
67
  * @example
41
68
  * ```typescript
42
69
  * const detector = new HeuristicEndpointDetector({ silenceTimeoutMs: 1000 });
43
70
  * detector.on('turn_complete', (event) => console.log('Turn done:', event));
71
+ *
72
+ * // Simulate a punctuated sentence followed by speech_end
44
73
  * detector.pushTranscript({ text: 'Hello there.', isFinal: true, confidence: 0.95, words: [] });
45
74
  * detector.pushVadEvent({ type: 'speech_end', timestamp: Date.now(), source: 'vad' });
46
- * // 'turn_complete' fires immediately with reason 'punctuation'
75
+ * // -> 'turn_complete' fires immediately with reason 'punctuation'
47
76
  * ```
48
77
  */
49
78
  export declare class HeuristicEndpointDetector extends EventEmitter implements IEndpointDetector {
50
79
  /**
51
80
  * Active detection strategy label.
52
- * Typed as `'hybrid'` to satisfy {@link IEndpointDetector.mode}; consumers
53
- * that need to distinguish heuristic detectors may inspect `instanceof`.
81
+ * Always `'heuristic'` for this implementation.
82
+ *
83
+ * @see {@link IEndpointDetector.mode}
54
84
  */
55
85
  readonly mode: IEndpointDetector['mode'];
56
86
  /** Resolved silence timeout in milliseconds. */
57
87
  private readonly silenceTimeoutMs;
58
- /** The latest final transcript text accumulated for the current turn. */
88
+ /**
89
+ * The latest final transcript text accumulated for the current turn.
90
+ * Only updated by final (non-interim) transcript events.
91
+ * Reset to empty string after each `turn_complete` emission.
92
+ */
59
93
  private accumulatedText;
60
- /** Whether the VAD currently reports active speech. */
94
+ /**
95
+ * Whether the VAD currently reports active speech. Set to `true` on
96
+ * `speech_start` and `false` on `speech_end`. Used to prevent the
97
+ * silence timer from starting while the user is still speaking.
98
+ */
61
99
  private speechActive;
62
- /** Handle to a pending silence timeout, or `null` if none is running. */
100
+ /**
101
+ * Handle to a pending silence timeout, or `null` if none is running.
102
+ * Cleared when speech resumes or when the detector is reset.
103
+ */
63
104
  private silenceTimer;
64
- /** Wall-clock timestamp (ms) when the current turn's speech started. */
105
+ /**
106
+ * Wall-clock timestamp (ms) when the current turn's speech started.
107
+ * Used to compute `durationMs` in the emitted {@link TurnCompleteEvent}.
108
+ * `null` when no speech has been detected in the current turn.
109
+ */
65
110
  private turnStartMs;
66
- /** Confidence of the most recent final transcript. */
111
+ /**
112
+ * Confidence of the most recent final transcript. Forwarded into the
113
+ * emitted {@link TurnCompleteEvent}. Defaults to 1 (perfect confidence)
114
+ * and is updated with each final transcript event.
115
+ */
67
116
  private lastConfidence;
68
117
  /**
69
118
  * Create a new {@link HeuristicEndpointDetector}.
70
119
  *
71
- * @param options Optional configuration overrides.
120
+ * @param options - Optional configuration overrides.
72
121
  */
73
122
  constructor(options?: HeuristicEndpointDetectorOptions);
74
123
  /**
75
124
  * Ingest a transcript event from the upstream STT session.
76
125
  *
77
126
  * Only final events (`isFinal: true`) affect internal state. Interim results
78
- * are silently ignored — they may arrive very frequently and their text is
79
- * unstable.
127
+ * are silently ignored because:
128
+ * 1. They arrive very frequently (10-50 per second) and would trigger
129
+ * excessive punctuation checks.
130
+ * 2. Their text is unstable -- a word ending with "." may be revised in
131
+ * the next interim result, causing false turn-completion signals.
80
132
  *
81
- * If the final text is a recognised backchannel phrase the detector emits
82
- * `'backchannel_detected'` and returns without accumulating the text, so that
83
- * a subsequent `speech_end` event does not trigger `turn_complete`.
133
+ * If the final text is a recognised backchannel phrase, the detector emits
134
+ * `'backchannel_detected'` and returns WITHOUT accumulating the text. This
135
+ * prevents a subsequent `speech_end` event from triggering `turn_complete`
136
+ * for what was merely an acknowledgement, not a real conversational turn.
84
137
  *
85
- * @param transcript Transcript event from the STT session.
138
+ * @param transcript - Transcript event from the STT session.
86
139
  */
87
140
  pushTranscript(transcript: TranscriptEvent): void;
88
141
  /**
89
142
  * Ingest a VAD (voice activity detection) event.
90
143
  *
91
- * - `speech_start`: marks the turn as active and cancels any pending silence
92
- * timer (the user resumed speaking before the timeout elapsed).
93
- * - `speech_end`: if accumulated text is available, either fires
94
- * `turn_complete` immediately (punctuation) or starts the silence timer.
95
- * - `silence`: heartbeat events are ignored; only explicit `speech_end`
96
- * drives the timeout logic.
144
+ * Event handling by type:
145
+ *
146
+ * - **`speech_start`**: Marks the turn as active and cancels any pending
147
+ * silence timer (the user resumed speaking before the timeout elapsed).
148
+ * This is critical for avoiding false turn-completion when the user
149
+ * takes a brief pause mid-sentence.
150
+ *
151
+ * - **`speech_end`**: If accumulated text is available, either fires
152
+ * `turn_complete` immediately (when text ends with terminal punctuation)
153
+ * or starts the silence timer (when no punctuation is detected).
154
+ *
155
+ * - **`silence`**: Periodic heartbeat events are ignored. The silence timer
156
+ * (started on `speech_end`) already handles delayed turn-completion
157
+ * independently of heartbeat cadence.
97
158
  *
98
- * @param event VAD transition event.
159
+ * @param event - VAD transition event.
99
160
  */
100
161
  pushVadEvent(event: VadEvent): void;
101
162
  /**
102
163
  * Reset all internal state, cancel pending timers, and prepare the detector
103
- * for the next user turn. Should be called by the pipeline after each
104
- * `turn_complete` event before audio for the next turn begins to arrive.
164
+ * for the next user turn.
165
+ *
166
+ * Called by the pipeline after each `turn_complete` event (both internally
167
+ * and by the orchestrator's flush_complete handler) to ensure clean state
168
+ * before audio for the next turn begins to arrive.
105
169
  */
106
170
  reset(): void;
107
171
  /**
108
172
  * Emit `turn_complete` with the currently accumulated transcript and then
109
173
  * reset internal state so the detector is ready for the next turn.
110
174
  *
111
- * @param reason The semantic reason driving this completion.
112
- * @param speechEndTimestamp Unix epoch ms timestamp of the `speech_end` event,
113
- * used to compute `durationMs`.
175
+ * The reset happens BEFORE the emit to ensure that any re-entrant listeners
176
+ * (e.g. an endpoint detector handler that immediately calls pushVadEvent)
177
+ * see clean state.
178
+ *
179
+ * @param reason - The semantic reason driving this completion.
180
+ * @param speechEndTimestamp - Unix epoch ms timestamp of the `speech_end` event,
181
+ * used to compute `durationMs` as `speechEndTimestamp - turnStartMs`.
114
182
  */
115
183
  private _emitTurnComplete;
116
184
  /**
117
185
  * Start the silence-timeout timer. If the user does not resume speaking
118
- * within {@link silenceTimeoutMs} ms the detector fires `turn_complete`.
186
+ * within {@link silenceTimeoutMs} ms, the detector fires `turn_complete`
187
+ * with reason `'silence_timeout'`.
188
+ *
189
+ * Any previously running silence timer is cleared first to prevent
190
+ * double-fires from rapid speech_end -> speech_start -> speech_end sequences.
119
191
  *
120
- * @param speechEndTimestamp Timestamp passed through to `_emitTurnComplete`.
192
+ * @param speechEndTimestamp - Timestamp passed through to {@link _emitTurnComplete}
193
+ * for duration calculation.
121
194
  */
122
195
  private _startSilenceTimer;
123
196
  /**
124
197
  * Cancel a pending silence timer without any side effects.
198
+ * Safe to call when no timer is active (no-op).
125
199
  */
126
200
  private _clearSilenceTimer;
127
201
  }
@@ -1 +1 @@
1
- {"version":3,"file":"HeuristicEndpointDetector.d.ts","sourceRoot":"","sources":["../../src/voice-pipeline/HeuristicEndpointDetector.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;GAgBG;AAEH,OAAO,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAC3C,OAAO,KAAK,EACV,iBAAiB,EACjB,eAAe,EACf,QAAQ,EAET,MAAM,YAAY,CAAC;AAwCpB;;GAEG;AACH,MAAM,WAAW,gCAAgC;IAC/C;;;;OAIG;IACH,gBAAgB,CAAC,EAAE,MAAM,CAAC;CAC3B;AAMD;;;;;;;;;;;;;;;;;GAiBG;AACH,qBAAa,yBACX,SAAQ,YACR,YAAW,iBAAiB;IAE5B;;;;OAIG;IACH,QAAQ,CAAC,IAAI,EAAE,iBAAiB,CAAC,MAAM,CAAC,CAAe;IAEvD,gDAAgD;IAChD,OAAO,CAAC,QAAQ,CAAC,gBAAgB,CAAS;IAE1C,yEAAyE;IACzE,OAAO,CAAC,eAAe,CAAM;IAE7B,uDAAuD;IACvD,OAAO,CAAC,YAAY,CAAS;IAE7B,yEAAyE;IACzE,OAAO,CAAC,YAAY,CAA8C;IAElE,wEAAwE;IACxE,OAAO,CAAC,WAAW,CAAuB;IAE1C,sDAAsD;IACtD,OAAO,CAAC,cAAc,CAAK;IAM3B;;;;OAIG;gBACS,OAAO,GAAE,gCAAqC;IAS1D;;;;;;;;;;;;OAYG;IACH,cAAc,CAAC,UAAU,EAAE,eAAe,GAAG,IAAI;IAwBjD;;;;;;;;;;;OAWG;IACH,YAAY,CAAC,KAAK,EAAE,QAAQ,GAAG,IAAI;IAyCnC;;;;OAIG;IACH,KAAK,IAAI,IAAI;IAYb;;;;;;;OAOG;IACH,OAAO,CAAC,iBAAiB;IAoBzB;;;;;OAKG;IACH,OAAO,CAAC,kBAAkB;IAQ1B;;OAEG;IACH,OAAO,CAAC,kBAAkB;CAM3B"}
1
+ {"version":3,"file":"HeuristicEndpointDetector.d.ts","sourceRoot":"","sources":["../../src/voice-pipeline/HeuristicEndpointDetector.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAgCG;AAEH,OAAO,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAC3C,OAAO,KAAK,EACV,iBAAiB,EACjB,eAAe,EACf,QAAQ,EAET,MAAM,YAAY,CAAC;AAkEpB;;;;;;;GAOG;AACH,MAAM,WAAW,gCAAgC;IAC/C;;;;;OAKG;IACH,gBAAgB,CAAC,EAAE,MAAM,CAAC;CAC3B;AAMD;;;;;;;;;;;;;;;;;;;;;;;;GAwBG;AACH,qBAAa,yBACX,SAAQ,YACR,YAAW,iBAAiB;IAE5B;;;;;OAKG;IACH,QAAQ,CAAC,IAAI,EAAE,iBAAiB,CAAC,MAAM,CAAC,CAAe;IAEvD,gDAAgD;IAChD,OAAO,CAAC,QAAQ,CAAC,gBAAgB,CAAS;IAE1C;;;;OAIG;IACH,OAAO,CAAC,eAAe,CAAM;IAE7B;;;;OAIG;IACH,OAAO,CAAC,YAAY,CAAS;IAE7B;;;OAGG;IACH,OAAO,CAAC,YAAY,CAA8C;IAElE;;;;OAIG;IACH,OAAO,CAAC,WAAW,CAAuB;IAE1C;;;;OAIG;IACH,OAAO,CAAC,cAAc,CAAK;IAM3B;;;;OAIG;gBACS,OAAO,GAAE,gCAAqC;IAS1D;;;;;;;;;;;;;;;;OAgBG;IACH,cAAc,CAAC,UAAU,EAAE,eAAe,GAAG,IAAI;IA4BjD;;;;;;;;;;;;;;;;;;;OAmBG;IACH,YAAY,CAAC,KAAK,EAAE,QAAQ,GAAG,IAAI;IAgDnC;;;;;;;OAOG;IACH,KAAK,IAAI,IAAI;IAYb;;;;;;;;;;;OAWG;IACH,OAAO,CAAC,iBAAiB;IAsBzB;;;;;;;;;;OAUG;IACH,OAAO,CAAC,kBAAkB;IAQ1B;;;OAGG;IACH,OAAO,CAAC,kBAAkB;CAM3B"}