@framers/agentos 0.1.75 → 0.1.76
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +139 -34
- package/dist/core/agency/AgentCommunicationBus.d.ts +1 -0
- package/dist/core/agency/AgentCommunicationBus.d.ts.map +1 -1
- package/dist/core/agency/AgentCommunicationBus.js +62 -8
- package/dist/core/agency/AgentCommunicationBus.js.map +1 -1
- package/dist/core/agency/IAgentCommunicationBus.d.ts +1 -1
- package/dist/core/agency/IAgentCommunicationBus.d.ts.map +1 -1
- package/dist/orchestration/runtime/LoopController.d.ts +10 -10
- package/dist/orchestration/runtime/LoopController.d.ts.map +1 -1
- package/dist/orchestration/runtime/LoopController.js +1 -1
- package/dist/orchestration/runtime/LoopController.js.map +1 -1
- package/dist/orchestration/runtime/index.d.ts +1 -1
- package/dist/orchestration/runtime/index.d.ts.map +1 -1
- package/dist/orchestration/runtime/index.js.map +1 -1
- package/dist/speech/FallbackProxy.d.ts +104 -0
- package/dist/speech/FallbackProxy.d.ts.map +1 -0
- package/dist/speech/FallbackProxy.js +151 -0
- package/dist/speech/FallbackProxy.js.map +1 -0
- package/dist/speech/SpeechProviderResolver.d.ts +103 -0
- package/dist/speech/SpeechProviderResolver.d.ts.map +1 -0
- package/dist/speech/SpeechProviderResolver.js +256 -0
- package/dist/speech/SpeechProviderResolver.js.map +1 -0
- package/dist/speech/SpeechRuntime.d.ts +23 -1
- package/dist/speech/SpeechRuntime.d.ts.map +1 -1
- package/dist/speech/SpeechRuntime.js +82 -8
- package/dist/speech/SpeechRuntime.js.map +1 -1
- package/dist/speech/index.d.ts +6 -0
- package/dist/speech/index.d.ts.map +1 -1
- package/dist/speech/index.js +6 -0
- package/dist/speech/index.js.map +1 -1
- package/dist/speech/providerCatalog.d.ts.map +1 -1
- package/dist/speech/providerCatalog.js +15 -1
- package/dist/speech/providerCatalog.js.map +1 -1
- package/dist/speech/providers/AssemblyAISTTProvider.d.ts +49 -0
- package/dist/speech/providers/AssemblyAISTTProvider.d.ts.map +1 -0
- package/dist/speech/providers/AssemblyAISTTProvider.js +151 -0
- package/dist/speech/providers/AssemblyAISTTProvider.js.map +1 -0
- package/dist/speech/providers/AzureSpeechSTTProvider.d.ts +48 -0
- package/dist/speech/providers/AzureSpeechSTTProvider.d.ts.map +1 -0
- package/dist/speech/providers/AzureSpeechSTTProvider.js +90 -0
- package/dist/speech/providers/AzureSpeechSTTProvider.js.map +1 -0
- package/dist/speech/providers/AzureSpeechTTSProvider.d.ts +60 -0
- package/dist/speech/providers/AzureSpeechTTSProvider.d.ts.map +1 -0
- package/dist/speech/providers/AzureSpeechTTSProvider.js +127 -0
- package/dist/speech/providers/AzureSpeechTTSProvider.js.map +1 -0
- package/dist/speech/providers/DeepgramBatchSTTProvider.d.ts +55 -0
- package/dist/speech/providers/DeepgramBatchSTTProvider.d.ts.map +1 -0
- package/dist/speech/providers/DeepgramBatchSTTProvider.js +102 -0
- package/dist/speech/providers/DeepgramBatchSTTProvider.js.map +1 -0
- package/dist/speech/types.d.ts +35 -0
- package/dist/speech/types.d.ts.map +1 -1
- package/dist/voice/CallManager.d.ts +1 -1
- package/dist/voice/CallManager.d.ts.map +1 -1
- package/dist/voice/CallManager.js +9 -0
- package/dist/voice/CallManager.js.map +1 -1
- package/dist/voice/MediaStreamParser.d.ts +83 -0
- package/dist/voice/MediaStreamParser.d.ts.map +1 -0
- package/dist/voice/MediaStreamParser.js +2 -0
- package/dist/voice/MediaStreamParser.js.map +1 -0
- package/dist/voice/TelephonyStreamTransport.d.ts +112 -0
- package/dist/voice/TelephonyStreamTransport.d.ts.map +1 -0
- package/dist/voice/TelephonyStreamTransport.js +208 -0
- package/dist/voice/TelephonyStreamTransport.js.map +1 -0
- package/dist/voice/index.d.ts +10 -0
- package/dist/voice/index.d.ts.map +1 -1
- package/dist/voice/index.js +11 -0
- package/dist/voice/index.js.map +1 -1
- package/dist/voice/parsers/PlivoMediaStreamParser.d.ts +43 -0
- package/dist/voice/parsers/PlivoMediaStreamParser.d.ts.map +1 -0
- package/dist/voice/parsers/PlivoMediaStreamParser.js +92 -0
- package/dist/voice/parsers/PlivoMediaStreamParser.js.map +1 -0
- package/dist/voice/parsers/TelnyxMediaStreamParser.d.ts +51 -0
- package/dist/voice/parsers/TelnyxMediaStreamParser.d.ts.map +1 -0
- package/dist/voice/parsers/TelnyxMediaStreamParser.js +103 -0
- package/dist/voice/parsers/TelnyxMediaStreamParser.js.map +1 -0
- package/dist/voice/parsers/TwilioMediaStreamParser.d.ts +50 -0
- package/dist/voice/parsers/TwilioMediaStreamParser.d.ts.map +1 -0
- package/dist/voice/parsers/TwilioMediaStreamParser.js +144 -0
- package/dist/voice/parsers/TwilioMediaStreamParser.js.map +1 -0
- package/dist/voice/providers/plivo.d.ts +77 -0
- package/dist/voice/providers/plivo.d.ts.map +1 -0
- package/dist/voice/providers/plivo.js +180 -0
- package/dist/voice/providers/plivo.js.map +1 -0
- package/dist/voice/providers/telnyx.d.ts +93 -0
- package/dist/voice/providers/telnyx.d.ts.map +1 -0
- package/dist/voice/providers/telnyx.js +193 -0
- package/dist/voice/providers/telnyx.js.map +1 -0
- package/dist/voice/providers/twilio.d.ts +79 -0
- package/dist/voice/providers/twilio.d.ts.map +1 -0
- package/dist/voice/providers/twilio.js +191 -0
- package/dist/voice/providers/twilio.js.map +1 -0
- package/dist/voice/twiml.d.ts +69 -0
- package/dist/voice/twiml.d.ts.map +1 -0
- package/dist/voice/twiml.js +92 -0
- package/dist/voice/twiml.js.map +1 -0
- package/dist/voice/types.d.ts +9 -1
- package/dist/voice/types.d.ts.map +1 -1
- package/dist/voice-pipeline/AcousticEndpointDetector.d.ts +90 -0
- package/dist/voice-pipeline/AcousticEndpointDetector.d.ts.map +1 -0
- package/dist/voice-pipeline/AcousticEndpointDetector.js +123 -0
- package/dist/voice-pipeline/AcousticEndpointDetector.js.map +1 -0
- package/dist/voice-pipeline/HardCutBargeinHandler.d.ts +67 -0
- package/dist/voice-pipeline/HardCutBargeinHandler.d.ts.map +1 -0
- package/dist/voice-pipeline/HardCutBargeinHandler.js +55 -0
- package/dist/voice-pipeline/HardCutBargeinHandler.js.map +1 -0
- package/dist/voice-pipeline/HeuristicEndpointDetector.d.ts +128 -0
- package/dist/voice-pipeline/HeuristicEndpointDetector.d.ts.map +1 -0
- package/dist/voice-pipeline/HeuristicEndpointDetector.js +240 -0
- package/dist/voice-pipeline/HeuristicEndpointDetector.js.map +1 -0
- package/dist/voice-pipeline/SoftFadeBargeinHandler.d.ts +96 -0
- package/dist/voice-pipeline/SoftFadeBargeinHandler.d.ts.map +1 -0
- package/dist/voice-pipeline/SoftFadeBargeinHandler.js +69 -0
- package/dist/voice-pipeline/SoftFadeBargeinHandler.js.map +1 -0
- package/dist/voice-pipeline/VoicePipelineOrchestrator.d.ts +122 -0
- package/dist/voice-pipeline/VoicePipelineOrchestrator.d.ts.map +1 -0
- package/dist/voice-pipeline/VoicePipelineOrchestrator.js +317 -0
- package/dist/voice-pipeline/VoicePipelineOrchestrator.js.map +1 -0
- package/dist/voice-pipeline/WebSocketStreamTransport.d.ts +148 -0
- package/dist/voice-pipeline/WebSocketStreamTransport.d.ts.map +1 -0
- package/dist/voice-pipeline/WebSocketStreamTransport.js +207 -0
- package/dist/voice-pipeline/WebSocketStreamTransport.js.map +1 -0
- package/dist/voice-pipeline/index.d.ts +13 -0
- package/dist/voice-pipeline/index.d.ts.map +1 -0
- package/dist/voice-pipeline/index.js +13 -0
- package/dist/voice-pipeline/index.js.map +1 -0
- package/dist/voice-pipeline/types.d.ts +905 -0
- package/dist/voice-pipeline/types.d.ts.map +1 -0
- package/dist/voice-pipeline/types.js +23 -0
- package/dist/voice-pipeline/types.js.map +1 -0
- package/package.json +6 -1
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @module voice-pipeline/HardCutBargeinHandler
|
|
3
|
+
*
|
|
4
|
+
* Implements a hard-cut barge-in policy: when the user speaks over TTS output
|
|
5
|
+
* for at least `minSpeechMs` milliseconds, playback is stopped immediately with
|
|
6
|
+
* no fade-out. Short detections below the threshold are treated as accidental
|
|
7
|
+
* noise and ignored.
|
|
8
|
+
*/
|
|
9
|
+
import type { BargeinAction, BargeinContext, IBargeinHandler } from './types.js';
|
|
10
|
+
/**
 * Options accepted by the {@link HardCutBargeinHandler} constructor.
 */
export interface HardCutBargeinHandlerOptions {
    /**
     * Speech-duration threshold, in milliseconds, that separates accidental
     * noise from a deliberate interruption. A detection shorter than this value
     * produces `{ type: 'ignore' }`; a detection that reaches it is acted upon.
     *
     * @defaultValue 300
     */
    minSpeechMs?: number;
}
|
|
23
|
+
/**
 * Barge-in handler that applies a hard-cut strategy.
 *
 * When the user speaks over an active TTS stream, this handler asks the
 * pipeline to cancel playback once the detected speech has lasted at least
 * `minSpeechMs` milliseconds (the comparison is inclusive). Shorter detections
 * are treated as noise and reported as `{ type: 'ignore' }`.
 *
 * @example
 * ```ts
 * const handler = new HardCutBargeinHandler({ minSpeechMs: 250 });
 * // `context` is a BargeinContext whose `speechDurationMs` is 400.
 * const action = handler.handleBargein(context);
 * // action.type === 'cancel'
 * ```
 */
export declare class HardCutBargeinHandler implements IBargeinHandler {
    /**
     * The interruption strategy implemented by this handler.
     * Always `'hard-cut'`.
     */
    readonly mode: "hard-cut";
    /**
     * Minimum speech duration, in milliseconds, at which an interruption is
     * considered intentional. Resolved once in the constructor.
     */
    private readonly minSpeechMs;
    /**
     * Constructs a new {@link HardCutBargeinHandler}.
     *
     * @param options - Optional configuration. When `minSpeechMs` is omitted
     *   (or `null`/`undefined`) the threshold defaults to `300`.
     */
    constructor(options?: HardCutBargeinHandlerOptions);
    /**
     * Evaluate the barge-in context and return the action the pipeline should take.
     *
     * - If `context.speechDurationMs >= minSpeechMs`, returns
     *   `{ type: 'cancel', injectMarker: '[interrupted]' }`.
     * - Otherwise returns `{ type: 'ignore' }`.
     *
     * The handler only reads `context.speechDurationMs` and keeps no state
     * between calls.
     *
     * @param context - Snapshot of the barge-in state at the moment of detection.
     * @returns The pipeline action to execute.
     */
    handleBargein(context: BargeinContext): BargeinAction;
}
|
|
67
|
+
//# sourceMappingURL=HardCutBargeinHandler.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"HardCutBargeinHandler.d.ts","sourceRoot":"","sources":["../../src/voice-pipeline/HardCutBargeinHandler.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAEH,OAAO,KAAK,EAAE,aAAa,EAAE,cAAc,EAAE,eAAe,EAAE,MAAM,YAAY,CAAC;AAEjF;;GAEG;AACH,MAAM,WAAW,4BAA4B;IAC3C;;;;;;OAMG;IACH,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB;AAED;;;;;;;;;;;;;;GAcG;AACH,qBAAa,qBAAsB,YAAW,eAAe;IAC3D;;;OAGG;IACH,QAAQ,CAAC,IAAI,EAAG,UAAU,CAAU;IAEpC;;;OAGG;IACH,OAAO,CAAC,QAAQ,CAAC,WAAW,CAAS;IAErC;;;;OAIG;gBACS,OAAO,GAAE,4BAAiC;IAItD;;;;;;;;;OASG;IACH,aAAa,CAAC,OAAO,EAAE,cAAc,GAAG,aAAa;CAMtD"}
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @module voice-pipeline/HardCutBargeinHandler
|
|
3
|
+
*
|
|
4
|
+
* Implements a hard-cut barge-in policy: when the user speaks over TTS output
|
|
5
|
+
* for at least `minSpeechMs` milliseconds, playback is stopped immediately with
|
|
6
|
+
* no fade-out. Short detections below the threshold are treated as accidental
|
|
7
|
+
* noise and ignored.
|
|
8
|
+
*/
|
|
9
|
+
/**
|
|
10
|
+
* Barge-in handler that applies a hard-cut strategy.
|
|
11
|
+
*
|
|
12
|
+
* When the user speaks over an active TTS stream, this handler immediately
|
|
13
|
+
* cancels playback if the detected speech exceeds `minSpeechMs`. Below that
|
|
14
|
+
* threshold the interruption is considered noise and playback continues
|
|
15
|
+
* uninterrupted.
|
|
16
|
+
*
|
|
17
|
+
* @example
|
|
18
|
+
* ```ts
|
|
19
|
+
* const handler = new HardCutBargeinHandler({ minSpeechMs: 250 });
|
|
20
|
+
* const action = handler.handleBargein({ speechDurationMs: 400, ... });
|
|
21
|
+
* // action.type === 'cancel'
|
|
22
|
+
* ```
|
|
23
|
+
*/
|
|
24
|
+
export class HardCutBargeinHandler {
|
|
25
|
+
/**
|
|
26
|
+
* Constructs a new {@link HardCutBargeinHandler}.
|
|
27
|
+
*
|
|
28
|
+
* @param options - Optional configuration. Defaults to `{ minSpeechMs: 300 }`.
|
|
29
|
+
*/
|
|
30
|
+
constructor(options = {}) {
|
|
31
|
+
/**
|
|
32
|
+
* The interruption strategy implemented by this handler.
|
|
33
|
+
* Always `'hard-cut'`.
|
|
34
|
+
*/
|
|
35
|
+
this.mode = 'hard-cut';
|
|
36
|
+
this.minSpeechMs = options.minSpeechMs ?? 300;
|
|
37
|
+
}
|
|
38
|
+
/**
|
|
39
|
+
* Evaluate the barge-in context and return the action the pipeline should take.
|
|
40
|
+
*
|
|
41
|
+
* - If `context.speechDurationMs >= minSpeechMs`, returns
|
|
42
|
+
* `{ type: 'cancel', injectMarker: '[interrupted]' }` to immediately halt TTS.
|
|
43
|
+
* - Otherwise returns `{ type: 'ignore' }` to continue playback.
|
|
44
|
+
*
|
|
45
|
+
* @param context - Snapshot of the barge-in state at the moment of detection.
|
|
46
|
+
* @returns The pipeline action to execute.
|
|
47
|
+
*/
|
|
48
|
+
handleBargein(context) {
|
|
49
|
+
if (context.speechDurationMs >= this.minSpeechMs) {
|
|
50
|
+
return { type: 'cancel', injectMarker: '[interrupted]' };
|
|
51
|
+
}
|
|
52
|
+
return { type: 'ignore' };
|
|
53
|
+
}
|
|
54
|
+
}
|
|
55
|
+
//# sourceMappingURL=HardCutBargeinHandler.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"HardCutBargeinHandler.js","sourceRoot":"","sources":["../../src/voice-pipeline/HardCutBargeinHandler.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAkBH;;;;;;;;;;;;;;GAcG;AACH,MAAM,OAAO,qBAAqB;IAahC;;;;OAIG;IACH,YAAY,UAAwC,EAAE;QAjBtD;;;WAGG;QACM,SAAI,GAAG,UAAmB,CAAC;QAclC,IAAI,CAAC,WAAW,GAAG,OAAO,CAAC,WAAW,IAAI,GAAG,CAAC;IAChD,CAAC;IAED;;;;;;;;;OASG;IACH,aAAa,CAAC,OAAuB;QACnC,IAAI,OAAO,CAAC,gBAAgB,IAAI,IAAI,CAAC,WAAW,EAAE,CAAC;YACjD,OAAO,EAAE,IAAI,EAAE,QAAQ,EAAE,YAAY,EAAE,eAAe,EAAE,CAAC;QAC3D,CAAC;QACD,OAAO,EAAE,IAAI,EAAE,QAAQ,EAAE,CAAC;IAC5B,CAAC;CACF"}
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @module voice-pipeline/HeuristicEndpointDetector
|
|
3
|
+
*
|
|
4
|
+
* A lightweight, rule-based endpoint detector that combines terminal punctuation
|
|
5
|
+
* analysis with a configurable silence timeout to determine when the user has
|
|
6
|
+
* finished speaking. Suitable for low-latency deployments where an LLM-based
|
|
7
|
+
* semantic detector would add unacceptable round-trip overhead.
|
|
8
|
+
*
|
|
9
|
+
* Detection strategy:
|
|
10
|
+
* 1. On `speech_end`, if the accumulated final transcript ends with `.`, `?`, or `!`,
|
|
11
|
+
* fire `turn_complete` immediately with reason `'punctuation'`.
|
|
12
|
+
* 2. Otherwise, start a silence timer (default 1 500 ms). If speech does not
|
|
13
|
+
* resume before the timer fires, emit `turn_complete` with reason `'silence_timeout'`.
|
|
14
|
+
* 3. Backchannel phrases (e.g. "uh huh", "yeah") are recognised, suppressed from
|
|
15
|
+
* accumulation, and re-emitted as `'backchannel_detected'` events so the
|
|
16
|
+
* pipeline can decide whether to suppress an agent response.
|
|
17
|
+
*/
|
|
18
|
+
import { EventEmitter } from 'node:events';
|
|
19
|
+
import type { IEndpointDetector, TranscriptEvent, VadEvent } from './types.js';
|
|
20
|
+
/**
 * Options accepted by the {@link HeuristicEndpointDetector} constructor.
 */
export interface HeuristicEndpointDetectorOptions {
    /**
     * Delay, in milliseconds, between a `speech_end` event and the resulting
     * `turn_complete` when the stored transcript does not end in terminal
     * punctuation.
     * @defaultValue 1500
     */
    silenceTimeoutMs?: number;
}
|
|
31
|
+
/**
 * Heuristic endpoint detector that uses terminal punctuation and a silence
 * timeout to decide when the user's turn is complete.
 *
 * Emits:
 * - `'turn_complete'` ({@link TurnCompleteEvent}) — user turn has ended.
 * - `'backchannel_detected'` (`{ text: string }`) — a final transcript was
 *   recognised as a backchannel phrase and was not stored as turn text.
 *
 * @example
 * ```typescript
 * const detector = new HeuristicEndpointDetector({ silenceTimeoutMs: 1000 });
 * detector.on('turn_complete', (event) => console.log('Turn done:', event));
 * detector.pushTranscript({ text: 'Hello there.', isFinal: true, confidence: 0.95, words: [] });
 * detector.pushVadEvent({ type: 'speech_end', timestamp: Date.now(), source: 'vad' });
 * // → 'turn_complete' fires immediately with reason 'punctuation'
 * ```
 */
export declare class HeuristicEndpointDetector extends EventEmitter implements IEndpointDetector {
    /**
     * Detection strategy label. Declared with the interface's `mode` type; this
     * implementation assigns the value `'heuristic'` at construction.
     */
    readonly mode: IEndpointDetector['mode'];
    /** Resolved silence timeout in milliseconds. */
    private readonly silenceTimeoutMs;
    /**
     * Text of the most recent non-backchannel final transcript for the current
     * turn. Each new final replaces the previous value; nothing is concatenated.
     */
    private accumulatedText;
    /** `true` between a `speech_start` and the following `speech_end` (or reset). */
    private speechActive;
    /** Handle to a pending silence timeout, or `null` if none is running. */
    private silenceTimer;
    /**
     * `timestamp` of the `speech_start` event that opened the current turn, or
     * `null` when no turn is in progress.
     */
    private turnStartMs;
    /** Confidence of the most recent stored final transcript (reset value: 1). */
    private lastConfidence;
    /**
     * Create a new {@link HeuristicEndpointDetector}.
     *
     * @param options — Optional configuration overrides.
     */
    constructor(options?: HeuristicEndpointDetectorOptions);
    /**
     * Ingest a transcript event from the upstream STT session.
     *
     * Only final events (`isFinal: true`) affect internal state; interim
     * results are ignored.
     *
     * If the final text is a recognised backchannel phrase the detector emits
     * `'backchannel_detected'` and returns without storing the text, so a
     * subsequent `speech_end` has nothing to flush on its account.
     *
     * This method never emits `turn_complete` itself; completion is driven by
     * {@link pushVadEvent}.
     *
     * @param transcript — Transcript event from the STT session.
     */
    pushTranscript(transcript: TranscriptEvent): void;
    /**
     * Ingest a VAD (voice activity detection) event.
     *
     * - `speech_start`: marks speech as active, cancels any pending silence
     *   timer, and records the turn start timestamp if none is set.
     * - `speech_end`: if transcript text is stored, either fires
     *   `turn_complete` immediately (terminal punctuation) or starts the
     *   silence timer; with no stored text it does nothing further.
     * - `silence`: ignored.
     *
     * @param event — VAD transition event.
     */
    pushVadEvent(event: VadEvent): void;
    /**
     * Cancel any pending silence timer and clear the per-turn state (stored
     * text, speech flag, turn start, confidence).
     *
     * The detector calls this itself immediately before emitting
     * `turn_complete`; callers may also invoke it to abandon a turn in progress.
     */
    reset(): void;
    /**
     * Emit `turn_complete` with the currently stored transcript, resetting
     * internal state first so listeners observe a clean detector.
     *
     * @param reason — The semantic reason driving this completion.
     * @param speechEndTimestamp — Timestamp of the `speech_end` event, used
     *   together with the turn start timestamp to compute `durationMs`.
     */
    private _emitTurnComplete;
    /**
     * Start the silence-timeout timer. If it is not cancelled within
     * {@link silenceTimeoutMs} ms the detector fires `turn_complete`.
     *
     * @param speechEndTimestamp — Timestamp passed through to `_emitTurnComplete`.
     */
    private _startSilenceTimer;
    /**
     * Cancel a pending silence timer, if any.
     */
    private _clearSilenceTimer;
}
|
|
128
|
+
//# sourceMappingURL=HeuristicEndpointDetector.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"HeuristicEndpointDetector.d.ts","sourceRoot":"","sources":["../../src/voice-pipeline/HeuristicEndpointDetector.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;GAgBG;AAEH,OAAO,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAC3C,OAAO,KAAK,EACV,iBAAiB,EACjB,eAAe,EACf,QAAQ,EAET,MAAM,YAAY,CAAC;AAwCpB;;GAEG;AACH,MAAM,WAAW,gCAAgC;IAC/C;;;;OAIG;IACH,gBAAgB,CAAC,EAAE,MAAM,CAAC;CAC3B;AAMD;;;;;;;;;;;;;;;;;GAiBG;AACH,qBAAa,yBACX,SAAQ,YACR,YAAW,iBAAiB;IAE5B;;;;OAIG;IACH,QAAQ,CAAC,IAAI,EAAE,iBAAiB,CAAC,MAAM,CAAC,CAAe;IAEvD,gDAAgD;IAChD,OAAO,CAAC,QAAQ,CAAC,gBAAgB,CAAS;IAE1C,yEAAyE;IACzE,OAAO,CAAC,eAAe,CAAM;IAE7B,uDAAuD;IACvD,OAAO,CAAC,YAAY,CAAS;IAE7B,yEAAyE;IACzE,OAAO,CAAC,YAAY,CAA8C;IAElE,wEAAwE;IACxE,OAAO,CAAC,WAAW,CAAuB;IAE1C,sDAAsD;IACtD,OAAO,CAAC,cAAc,CAAK;IAM3B;;;;OAIG;gBACS,OAAO,GAAE,gCAAqC;IAS1D;;;;;;;;;;;;OAYG;IACH,cAAc,CAAC,UAAU,EAAE,eAAe,GAAG,IAAI;IAwBjD;;;;;;;;;;;OAWG;IACH,YAAY,CAAC,KAAK,EAAE,QAAQ,GAAG,IAAI;IAyCnC;;;;OAIG;IACH,KAAK,IAAI,IAAI;IAYb;;;;;;;OAOG;IACH,OAAO,CAAC,iBAAiB;IAoBzB;;;;;OAKG;IACH,OAAO,CAAC,kBAAkB;IAQ1B;;OAEG;IACH,OAAO,CAAC,kBAAkB;CAM3B"}
|
|
@@ -0,0 +1,240 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @module voice-pipeline/HeuristicEndpointDetector
|
|
3
|
+
*
|
|
4
|
+
* A lightweight, rule-based endpoint detector that combines terminal punctuation
|
|
5
|
+
* analysis with a configurable silence timeout to determine when the user has
|
|
6
|
+
* finished speaking. Suitable for low-latency deployments where an LLM-based
|
|
7
|
+
* semantic detector would add unacceptable round-trip overhead.
|
|
8
|
+
*
|
|
9
|
+
* Detection strategy:
|
|
10
|
+
* 1. On `speech_end`, if the accumulated final transcript ends with `.`, `?`, or `!`,
|
|
11
|
+
* fire `turn_complete` immediately with reason `'punctuation'`.
|
|
12
|
+
* 2. Otherwise, start a silence timer (default 1 500 ms). If speech does not
|
|
13
|
+
* resume before the timer fires, emit `turn_complete` with reason `'silence_timeout'`.
|
|
14
|
+
* 3. Backchannel phrases (e.g. "uh huh", "yeah") are recognised, suppressed from
|
|
15
|
+
* accumulation, and re-emitted as `'backchannel_detected'` events so the
|
|
16
|
+
* pipeline can decide whether to suppress an agent response.
|
|
17
|
+
*/
|
|
18
|
+
import { EventEmitter } from 'node:events';
|
|
19
|
+
// ---------------------------------------------------------------------------
|
|
20
|
+
// Constants
|
|
21
|
+
// ---------------------------------------------------------------------------
|
|
22
|
+
/**
|
|
23
|
+
* Default silence duration (ms) after speech stops before firing `turn_complete`.
|
|
24
|
+
*/
|
|
25
|
+
const DEFAULT_SILENCE_TIMEOUT_MS = 1500;
|
|
26
|
+
/**
|
|
27
|
+
* Terminal punctuation characters that signal sentence completion.
|
|
28
|
+
*/
|
|
29
|
+
const TERMINAL_PUNCTUATION = /[.?!]$/;
|
|
30
|
+
/**
|
|
31
|
+
* Normalised backchannel phrases that indicate the listener is acknowledging
|
|
32
|
+
* but not taking a full conversational turn. Compared after `.trim().toLowerCase()`.
|
|
33
|
+
*/
|
|
34
|
+
const BACKCHANNEL_PHRASES = new Set([
|
|
35
|
+
'uh huh',
|
|
36
|
+
'yeah',
|
|
37
|
+
'okay',
|
|
38
|
+
'ok',
|
|
39
|
+
'mm hmm',
|
|
40
|
+
'mmhmm',
|
|
41
|
+
'mhm',
|
|
42
|
+
'mm-hmm',
|
|
43
|
+
'right',
|
|
44
|
+
'sure',
|
|
45
|
+
'yep',
|
|
46
|
+
'yup',
|
|
47
|
+
'gotcha',
|
|
48
|
+
]);
|
|
49
|
+
// ---------------------------------------------------------------------------
|
|
50
|
+
// Implementation
|
|
51
|
+
// ---------------------------------------------------------------------------
|
|
52
|
+
/**
|
|
53
|
+
* Heuristic endpoint detector that uses terminal punctuation and a silence
|
|
54
|
+
* timeout to decide when the user's turn is complete.
|
|
55
|
+
*
|
|
56
|
+
* Emits:
|
|
57
|
+
* - `'turn_complete'` ({@link TurnCompleteEvent}) — user turn has ended.
|
|
58
|
+
* - `'backchannel_detected'` (`{ text: string }`) — a backchannel phrase was
|
|
59
|
+
* recognised; accumulation is suppressed for this utterance.
|
|
60
|
+
*
|
|
61
|
+
* @example
|
|
62
|
+
* ```typescript
|
|
63
|
+
* const detector = new HeuristicEndpointDetector({ silenceTimeoutMs: 1000 });
|
|
64
|
+
* detector.on('turn_complete', (event) => console.log('Turn done:', event));
|
|
65
|
+
* detector.pushTranscript({ text: 'Hello there.', isFinal: true, confidence: 0.95, words: [] });
|
|
66
|
+
* detector.pushVadEvent({ type: 'speech_end', timestamp: Date.now(), source: 'vad' });
|
|
67
|
+
* // → 'turn_complete' fires immediately with reason 'punctuation'
|
|
68
|
+
* ```
|
|
69
|
+
*/
|
|
70
|
+
export class HeuristicEndpointDetector extends EventEmitter {
|
|
71
|
+
// ---------------------------------------------------------------------------
|
|
72
|
+
// Constructor
|
|
73
|
+
// ---------------------------------------------------------------------------
|
|
74
|
+
/**
|
|
75
|
+
* Create a new {@link HeuristicEndpointDetector}.
|
|
76
|
+
*
|
|
77
|
+
* @param options — Optional configuration overrides.
|
|
78
|
+
*/
|
|
79
|
+
constructor(options = {}) {
|
|
80
|
+
super();
|
|
81
|
+
/**
|
|
82
|
+
* Active detection strategy label.
|
|
83
|
+
* Typed as `'hybrid'` to satisfy {@link IEndpointDetector.mode}; consumers
|
|
84
|
+
* that need to distinguish heuristic detectors may inspect `instanceof`.
|
|
85
|
+
*/
|
|
86
|
+
this.mode = 'heuristic';
|
|
87
|
+
/** The latest final transcript text accumulated for the current turn. */
|
|
88
|
+
this.accumulatedText = '';
|
|
89
|
+
/** Whether the VAD currently reports active speech. */
|
|
90
|
+
this.speechActive = false;
|
|
91
|
+
/** Handle to a pending silence timeout, or `null` if none is running. */
|
|
92
|
+
this.silenceTimer = null;
|
|
93
|
+
/** Wall-clock timestamp (ms) when the current turn's speech started. */
|
|
94
|
+
this.turnStartMs = null;
|
|
95
|
+
/** Confidence of the most recent final transcript. */
|
|
96
|
+
this.lastConfidence = 1;
|
|
97
|
+
this.silenceTimeoutMs = options.silenceTimeoutMs ?? DEFAULT_SILENCE_TIMEOUT_MS;
|
|
98
|
+
}
|
|
99
|
+
// ---------------------------------------------------------------------------
|
|
100
|
+
// IEndpointDetector — pushTranscript
|
|
101
|
+
// ---------------------------------------------------------------------------
|
|
102
|
+
/**
|
|
103
|
+
* Ingest a transcript event from the upstream STT session.
|
|
104
|
+
*
|
|
105
|
+
* Only final events (`isFinal: true`) affect internal state. Interim results
|
|
106
|
+
* are silently ignored — they may arrive very frequently and their text is
|
|
107
|
+
* unstable.
|
|
108
|
+
*
|
|
109
|
+
* If the final text is a recognised backchannel phrase the detector emits
|
|
110
|
+
* `'backchannel_detected'` and returns without accumulating the text, so that
|
|
111
|
+
* a subsequent `speech_end` event does not trigger `turn_complete`.
|
|
112
|
+
*
|
|
113
|
+
* @param transcript — Transcript event from the STT session.
|
|
114
|
+
*/
|
|
115
|
+
pushTranscript(transcript) {
|
|
116
|
+
if (!transcript.isFinal) {
|
|
117
|
+
// Ignore partial/interim hypotheses — they will be superseded.
|
|
118
|
+
return;
|
|
119
|
+
}
|
|
120
|
+
const text = transcript.text;
|
|
121
|
+
const normalised = text.trim().toLowerCase();
|
|
122
|
+
// Detect backchannel acknowledgements before accumulating.
|
|
123
|
+
if (BACKCHANNEL_PHRASES.has(normalised)) {
|
|
124
|
+
this.emit('backchannel_detected', { text });
|
|
125
|
+
return;
|
|
126
|
+
}
|
|
127
|
+
// Accumulate the final transcript and store the confidence score.
|
|
128
|
+
this.accumulatedText = text;
|
|
129
|
+
this.lastConfidence = transcript.confidence;
|
|
130
|
+
}
|
|
131
|
+
// ---------------------------------------------------------------------------
|
|
132
|
+
// IEndpointDetector — pushVadEvent
|
|
133
|
+
// ---------------------------------------------------------------------------
|
|
134
|
+
/**
|
|
135
|
+
* Ingest a VAD (voice activity detection) event.
|
|
136
|
+
*
|
|
137
|
+
* - `speech_start`: marks the turn as active and cancels any pending silence
|
|
138
|
+
* timer (the user resumed speaking before the timeout elapsed).
|
|
139
|
+
* - `speech_end`: if accumulated text is available, either fires
|
|
140
|
+
* `turn_complete` immediately (punctuation) or starts the silence timer.
|
|
141
|
+
* - `silence`: heartbeat events are ignored; only explicit `speech_end`
|
|
142
|
+
* drives the timeout logic.
|
|
143
|
+
*
|
|
144
|
+
* @param event — VAD transition event.
|
|
145
|
+
*/
|
|
146
|
+
pushVadEvent(event) {
|
|
147
|
+
switch (event.type) {
|
|
148
|
+
case 'speech_start': {
|
|
149
|
+
this.speechActive = true;
|
|
150
|
+
this._clearSilenceTimer();
|
|
151
|
+
if (this.turnStartMs === null) {
|
|
152
|
+
this.turnStartMs = event.timestamp;
|
|
153
|
+
}
|
|
154
|
+
break;
|
|
155
|
+
}
|
|
156
|
+
case 'speech_end': {
|
|
157
|
+
this.speechActive = false;
|
|
158
|
+
if (!this.accumulatedText) {
|
|
159
|
+
// Nothing to flush — no transcript arrived yet.
|
|
160
|
+
break;
|
|
161
|
+
}
|
|
162
|
+
if (TERMINAL_PUNCTUATION.test(this.accumulatedText)) {
|
|
163
|
+
// Sentence-terminal punctuation → fire immediately.
|
|
164
|
+
this._emitTurnComplete('punctuation', event.timestamp);
|
|
165
|
+
}
|
|
166
|
+
else {
|
|
167
|
+
// No punctuation → wait for silence timeout.
|
|
168
|
+
this._startSilenceTimer(event.timestamp);
|
|
169
|
+
}
|
|
170
|
+
break;
|
|
171
|
+
}
|
|
172
|
+
case 'silence': {
|
|
173
|
+
// Periodic heartbeat — no action required; the silence timer already
|
|
174
|
+
// handles the delayed fire if one is pending.
|
|
175
|
+
break;
|
|
176
|
+
}
|
|
177
|
+
}
|
|
178
|
+
}
|
|
179
|
+
// ---------------------------------------------------------------------------
|
|
180
|
+
// IEndpointDetector — reset
|
|
181
|
+
// ---------------------------------------------------------------------------
|
|
182
|
+
/**
|
|
183
|
+
* Reset all internal state, cancel pending timers, and prepare the detector
|
|
184
|
+
* for the next user turn. Should be called by the pipeline after each
|
|
185
|
+
* `turn_complete` event before audio for the next turn begins to arrive.
|
|
186
|
+
*/
|
|
187
|
+
reset() {
|
|
188
|
+
this._clearSilenceTimer();
|
|
189
|
+
this.accumulatedText = '';
|
|
190
|
+
this.speechActive = false;
|
|
191
|
+
this.turnStartMs = null;
|
|
192
|
+
this.lastConfidence = 1;
|
|
193
|
+
}
|
|
194
|
+
// ---------------------------------------------------------------------------
|
|
195
|
+
// Private helpers
|
|
196
|
+
// ---------------------------------------------------------------------------
|
|
197
|
+
/**
|
|
198
|
+
* Emit `turn_complete` with the currently accumulated transcript and then
|
|
199
|
+
* reset internal state so the detector is ready for the next turn.
|
|
200
|
+
*
|
|
201
|
+
* @param reason — The semantic reason driving this completion.
|
|
202
|
+
* @param speechEndTimestamp — Unix epoch ms timestamp of the `speech_end` event,
|
|
203
|
+
* used to compute `durationMs`.
|
|
204
|
+
*/
|
|
205
|
+
_emitTurnComplete(reason, speechEndTimestamp) {
|
|
206
|
+
const durationMs = this.turnStartMs !== null ? speechEndTimestamp - this.turnStartMs : 0;
|
|
207
|
+
const event = {
|
|
208
|
+
transcript: this.accumulatedText,
|
|
209
|
+
confidence: this.lastConfidence,
|
|
210
|
+
durationMs,
|
|
211
|
+
reason,
|
|
212
|
+
};
|
|
213
|
+
// Reset before emitting so that any re-entrant listeners see clean state.
|
|
214
|
+
this.reset();
|
|
215
|
+
this.emit('turn_complete', event);
|
|
216
|
+
}
|
|
217
|
+
/**
|
|
218
|
+
* Start the silence-timeout timer. If the user does not resume speaking
|
|
219
|
+
* within {@link silenceTimeoutMs} ms the detector fires `turn_complete`.
|
|
220
|
+
*
|
|
221
|
+
* @param speechEndTimestamp — Timestamp passed through to `_emitTurnComplete`.
|
|
222
|
+
*/
|
|
223
|
+
_startSilenceTimer(speechEndTimestamp) {
|
|
224
|
+
this._clearSilenceTimer();
|
|
225
|
+
this.silenceTimer = setTimeout(() => {
|
|
226
|
+
this.silenceTimer = null;
|
|
227
|
+
this._emitTurnComplete('silence_timeout', speechEndTimestamp);
|
|
228
|
+
}, this.silenceTimeoutMs);
|
|
229
|
+
}
|
|
230
|
+
/**
|
|
231
|
+
* Cancel a pending silence timer without any side effects.
|
|
232
|
+
*/
|
|
233
|
+
_clearSilenceTimer() {
|
|
234
|
+
if (this.silenceTimer !== null) {
|
|
235
|
+
clearTimeout(this.silenceTimer);
|
|
236
|
+
this.silenceTimer = null;
|
|
237
|
+
}
|
|
238
|
+
}
|
|
239
|
+
}
|
|
240
|
+
//# sourceMappingURL=HeuristicEndpointDetector.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"HeuristicEndpointDetector.js","sourceRoot":"","sources":["../../src/voice-pipeline/HeuristicEndpointDetector.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;GAgBG;AAEH,OAAO,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAQ3C,8EAA8E;AAC9E,YAAY;AACZ,8EAA8E;AAE9E;;GAEG;AACH,MAAM,0BAA0B,GAAG,IAAK,CAAC;AAEzC;;GAEG;AACH,MAAM,oBAAoB,GAAG,QAAQ,CAAC;AAEtC;;;GAGG;AACH,MAAM,mBAAmB,GAAG,IAAI,GAAG,CAAC;IAClC,QAAQ;IACR,MAAM;IACN,MAAM;IACN,IAAI;IACJ,QAAQ;IACR,OAAO;IACP,KAAK;IACL,QAAQ;IACR,OAAO;IACP,MAAM;IACN,KAAK;IACL,KAAK;IACL,QAAQ;CACT,CAAC,CAAC;AAkBH,8EAA8E;AAC9E,iBAAiB;AACjB,8EAA8E;AAE9E;;;;;;;;;;;;;;;;;GAiBG;AACH,MAAM,OAAO,yBACX,SAAQ,YAAY;IA4BpB,8EAA8E;IAC9E,cAAc;IACd,8EAA8E;IAE9E;;;;OAIG;IACH,YAAY,UAA4C,EAAE;QACxD,KAAK,EAAE,CAAC;QAnCV;;;;WAIG;QACM,SAAI,GAA8B,WAAW,CAAC;QAKvD,yEAAyE;QACjE,oBAAe,GAAG,EAAE,CAAC;QAE7B,uDAAuD;QAC/C,iBAAY,GAAG,KAAK,CAAC;QAE7B,yEAAyE;QACjE,iBAAY,GAAyC,IAAI,CAAC;QAElE,wEAAwE;QAChE,gBAAW,GAAkB,IAAI,CAAC;QAE1C,sDAAsD;QAC9C,mBAAc,GAAG,CAAC,CAAC;QAazB,IAAI,CAAC,gBAAgB,GAAG,OAAO,CAAC,gBAAgB,IAAI,0BAA0B,CAAC;IACjF,CAAC;IAED,8EAA8E;IAC9E,qCAAqC;IACrC,8EAA8E;IAE9E;;;;;;;;;;;;OAYG;IACH,cAAc,CAAC,UAA2B;QACxC,IAAI,CAAC,UAAU,CAAC,OAAO,EAAE,CAAC;YACxB,+DAA+D;YAC/D,OAAO;QACT,CAAC;QAED,MAAM,IAAI,GAAG,UAAU,CAAC,IAAI,CAAC;QAC7B,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;QAE7C,2DAA2D;QAC3D,IAAI,mBAAmB,CAAC,GAAG,CAAC,UAAU,CAAC,EAAE,CAAC;YACxC,IAAI,CAAC,IAAI,CAAC,sBAAsB,EAAE,EAAE,IAAI,EAAE,CAAC,CAAC;YAC5C,OAAO;QACT,CAAC;QAED,kEAAkE;QAClE,IAAI,CAAC,eAAe,GAAG,IAAI,CAAC;QAC5B,IAAI,CAAC,cAAc,GAAG,UAAU,CAAC,UAAU,CAAC;IAC9C,CAAC;IAED,8EAA8E;IAC9E,mCAAmC;IACnC,8EAA8E;IAE9E;;;;;;;;;;;OAWG;IACH,YAAY,CAAC,KAAe;QAC1B,QAAQ,KAAK,CAAC,IAAI,EAAE,CAAC;YACnB,KAAK,cAAc,CAAC,CAAC,CAAC;gBACpB,IAAI,CAAC,YAAY,GAAG,IAAI,CAAC;gBACzB,IAAI,CAAC,kBAAkB,EAAE,CAAC;gBAC1B,IAAI,IAAI,CAAC,WAAW,KAAK,IAAI,EAAE,CAAC;oBAC9B,IAAI,CAAC,WAAW,GAAG,KAAK,CAAC,SAAS,CAAC;gBACrC,CAAC;gBACD,MAAM;YACR,CAAC;YAED,KAAK,YAAY,CAAC,CAAC,CAAC;gBAClB,IAAI,CAAC,YAAY,GAAG,KAAK,CAAC;gBAE1B,IAAI,CAAC,IAAI,CAAC,
eAAe,EAAE,CAAC;oBAC1B,gDAAgD;oBAChD,MAAM;gBACR,CAAC;gBAED,IAAI,oBAAoB,CAAC,IAAI,CAAC,IAAI,CAAC,eAAe,CAAC,EAAE,CAAC;oBACpD,oDAAoD;oBACpD,IAAI,CAAC,iBAAiB,CAAC,aAAa,EAAE,KAAK,CAAC,SAAS,CAAC,CAAC;gBACzD,CAAC;qBAAM,CAAC;oBACN,6CAA6C;oBAC7C,IAAI,CAAC,kBAAkB,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC;gBAC3C,CAAC;gBACD,MAAM;YACR,CAAC;YAED,KAAK,SAAS,CAAC,CAAC,CAAC;gBACf,qEAAqE;gBACrE,8CAA8C;gBAC9C,MAAM;YACR,CAAC;QACH,CAAC;IACH,CAAC;IAED,8EAA8E;IAC9E,4BAA4B;IAC5B,8EAA8E;IAE9E;;;;OAIG;IACH,KAAK;QACH,IAAI,CAAC,kBAAkB,EAAE,CAAC;QAC1B,IAAI,CAAC,eAAe,GAAG,EAAE,CAAC;QAC1B,IAAI,CAAC,YAAY,GAAG,KAAK,CAAC;QAC1B,IAAI,CAAC,WAAW,GAAG,IAAI,CAAC;QACxB,IAAI,CAAC,cAAc,GAAG,CAAC,CAAC;IAC1B,CAAC;IAED,8EAA8E;IAC9E,kBAAkB;IAClB,8EAA8E;IAE9E;;;;;;;OAOG;IACK,iBAAiB,CACvB,MAAmC,EACnC,kBAA0B;QAE1B,MAAM,UAAU,GACd,IAAI,CAAC,WAAW,KAAK,IAAI,CAAC,CAAC,CAAC,kBAAkB,GAAG,IAAI,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,CAAC;QAExE,MAAM,KAAK,GAAsB;YAC/B,UAAU,EAAE,IAAI,CAAC,eAAe;YAChC,UAAU,EAAE,IAAI,CAAC,cAAc;YAC/B,UAAU;YACV,MAAM;SACP,CAAC;QAEF,0EAA0E;QAC1E,IAAI,CAAC,KAAK,EAAE,CAAC;QAEb,IAAI,CAAC,IAAI,CAAC,eAAe,EAAE,KAAK,CAAC,CAAC;IACpC,CAAC;IAED;;;;;OAKG;IACK,kBAAkB,CAAC,kBAA0B;QACnD,IAAI,CAAC,kBAAkB,EAAE,CAAC;QAC1B,IAAI,CAAC,YAAY,GAAG,UAAU,CAAC,GAAG,EAAE;YAClC,IAAI,CAAC,YAAY,GAAG,IAAI,CAAC;YACzB,IAAI,CAAC,iBAAiB,CAAC,iBAAiB,EAAE,kBAAkB,CAAC,CAAC;QAChE,CAAC,EAAE,IAAI,CAAC,gBAAgB,CAAC,CAAC;IAC5B,CAAC;IAED;;OAEG;IACK,kBAAkB;QACxB,IAAI,IAAI,CAAC,YAAY,KAAK,IAAI,EAAE,CAAC;YAC/B,YAAY,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;YAChC,IAAI,CAAC,YAAY,GAAG,IAAI,CAAC;QAC3B,CAAC;IACH,CAAC;CACF"}
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @module voice-pipeline/SoftFadeBargeinHandler
|
|
3
|
+
*
|
|
4
|
+
* Implements a three-tier soft-fade barge-in policy.
|
|
5
|
+
*
|
|
6
|
+
* Very short speech detections (< `ignoreMs`) are dismissed as noise.
|
|
7
|
+
* Medium-length detections trigger a fade-out pause so the user can speak
|
|
8
|
+
* without an abrupt cut. Long detections (>= `cancelMs`) stop playback
|
|
9
|
+
* outright and inject a conversation marker.
|
|
10
|
+
*/
|
|
11
|
+
import type { BargeinAction, BargeinContext, IBargeinHandler } from './types.js';
|
|
12
|
+
/**
 * Options accepted by the {@link SoftFadeBargeinHandler} constructor.
 * Every field is optional; omitted fields fall back to the documented
 * defaults.
 */
export interface SoftFadeBargeinHandlerOptions {
    /**
     * Noise floor, in milliseconds. A barge-in whose speech duration is
     * shorter than this is treated as accidental noise and ignored.
     *
     * @defaultValue 100
     */
    ignoreMs?: number;
    /**
     * Cancel threshold, in milliseconds. A barge-in whose speech duration is
     * at or above this value cancels playback immediately instead of
     * triggering a fade-out pause. It must be greater than `ignoreMs`,
     * otherwise there is no fade region.
     *
     * @defaultValue 2000
     */
    cancelMs?: number;
    /**
     * Length of the TTS fade-out, in milliseconds, used when the speech
     * duration lies in the range `[ignoreMs, cancelMs)`.
     *
     * @defaultValue 200
     */
    fadeMs?: number;
}
|
|
39
|
+
/**
 * Barge-in handler implementing a three-tier soft-fade policy.
 *
 * The confirmed speech duration selects exactly one of three actions:
 *
 * | Speech duration        | Action                                       |
 * |------------------------|----------------------------------------------|
 * | `< ignoreMs`           | `ignore` — noise, continue TTS uninterrupted |
 * | `[ignoreMs, cancelMs)` | `pause` with `fadeMs` fade-out               |
 * | `>= cancelMs`          | `cancel` with `'[interrupted]'` marker       |
 *
 * @example
 * ```ts
 * const handler = new SoftFadeBargeinHandler({ ignoreMs: 80, cancelMs: 1500, fadeMs: 150 });
 * handler.handleBargein({ speechDurationMs: 500, ... });  // { type: 'pause', fadeMs: 150 }
 * handler.handleBargein({ speechDurationMs: 1600, ... }); // { type: 'cancel', injectMarker: '[interrupted]' }
 * handler.handleBargein({ speechDurationMs: 30, ... });   // { type: 'ignore' }
 * ```
 */
export declare class SoftFadeBargeinHandler implements IBargeinHandler {
    /**
     * Identifies the interruption strategy of this handler; the value is
     * always `'soft-fade'`.
     */
    readonly mode: 'soft-fade';
    /**
     * Lower threshold: speech shorter than this is dismissed as noise.
     */
    private readonly ignoreMs;
    /**
     * Upper threshold: speech at or above this duration escalates to a full
     * cancel.
     */
    private readonly cancelMs;
    /**
     * Length of the TTS audio fade-out used for a `'pause'` action.
     */
    private readonly fadeMs;
    /**
     * Create a new {@link SoftFadeBargeinHandler}.
     *
     * @param options - Optional configuration. When omitted, the handler uses
     *   `{ ignoreMs: 100, cancelMs: 2000, fadeMs: 200 }`.
     */
    constructor(options?: SoftFadeBargeinHandlerOptions);
    /**
     * Decide which pipeline action a barge-in should trigger.
     *
     * The checks run in this order:
     * 1. `speechDurationMs < ignoreMs`   → `{ type: 'ignore' }`
     * 2. `speechDurationMs >= cancelMs`  → `{ type: 'cancel', injectMarker: '[interrupted]' }`
     * 3. anything else                   → `{ type: 'pause', fadeMs }`
     *
     * @param context - Snapshot of the barge-in state at the moment of detection.
     * @returns The pipeline action to execute.
     */
    handleBargein(context: BargeinContext): BargeinAction;
}
|
|
96
|
+
//# sourceMappingURL=SoftFadeBargeinHandler.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"SoftFadeBargeinHandler.d.ts","sourceRoot":"","sources":["../../src/voice-pipeline/SoftFadeBargeinHandler.ts"],"names":[],"mappings":"AAAA;;;;;;;;;GASG;AAEH,OAAO,KAAK,EAAE,aAAa,EAAE,cAAc,EAAE,eAAe,EAAE,MAAM,YAAY,CAAC;AAEjF;;GAEG;AACH,MAAM,WAAW,6BAA6B;IAC5C;;;;;OAKG;IACH,QAAQ,CAAC,EAAE,MAAM,CAAC;IAElB;;;;;;OAMG;IACH,QAAQ,CAAC,EAAE,MAAM,CAAC;IAElB;;;;;OAKG;IACH,MAAM,CAAC,EAAE,MAAM,CAAC;CACjB;AAED;;;;;;;;;;;;;;;;;;GAkBG;AACH,qBAAa,sBAAuB,YAAW,eAAe;IAC5D;;;OAGG;IACH,QAAQ,CAAC,IAAI,EAAG,WAAW,CAAU;IAErC;;OAEG;IACH,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAS;IAElC;;OAEG;IACH,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAS;IAElC;;OAEG;IACH,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAS;IAEhC;;;;;OAKG;gBACS,OAAO,GAAE,6BAAkC;IAMvD;;;;;;;;;;OAUG;IACH,aAAa,CAAC,OAAO,EAAE,cAAc,GAAG,aAAa;CAatD"}
|