@framers/agentos 0.1.107 → 0.1.109
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/memory/ingestion/ChunkingEngine.d.ts.map +1 -1
- package/dist/memory/ingestion/ChunkingEngine.js +5 -1
- package/dist/memory/ingestion/ChunkingEngine.js.map +1 -1
- package/dist/memory/ingestion/DocxLoader.d.ts.map +1 -1
- package/dist/memory/ingestion/DocxLoader.js +2 -1
- package/dist/memory/ingestion/DocxLoader.js.map +1 -1
- package/dist/memory/ingestion/FolderScanner.d.ts.map +1 -1
- package/dist/memory/ingestion/FolderScanner.js +6 -3
- package/dist/memory/ingestion/FolderScanner.js.map +1 -1
- package/dist/memory/ingestion/HtmlLoader.d.ts.map +1 -1
- package/dist/memory/ingestion/HtmlLoader.js +2 -1
- package/dist/memory/ingestion/HtmlLoader.js.map +1 -1
- package/dist/memory/ingestion/MarkdownLoader.d.ts.map +1 -1
- package/dist/memory/ingestion/MarkdownLoader.js +2 -1
- package/dist/memory/ingestion/MarkdownLoader.js.map +1 -1
- package/dist/memory/ingestion/PdfLoader.d.ts.map +1 -1
- package/dist/memory/ingestion/PdfLoader.js +2 -1
- package/dist/memory/ingestion/PdfLoader.js.map +1 -1
- package/dist/memory/ingestion/TextLoader.d.ts.map +1 -1
- package/dist/memory/ingestion/TextLoader.js +3 -2
- package/dist/memory/ingestion/TextLoader.js.map +1 -1
- package/dist/memory/ingestion/pathUtils.d.ts +40 -0
- package/dist/memory/ingestion/pathUtils.d.ts.map +1 -0
- package/dist/memory/ingestion/pathUtils.js +62 -0
- package/dist/memory/ingestion/pathUtils.js.map +1 -0
- package/dist/voice-pipeline/AcousticEndpointDetector.d.ts +95 -20
- package/dist/voice-pipeline/AcousticEndpointDetector.d.ts.map +1 -1
- package/dist/voice-pipeline/AcousticEndpointDetector.js +110 -24
- package/dist/voice-pipeline/AcousticEndpointDetector.js.map +1 -1
- package/dist/voice-pipeline/HardCutBargeinHandler.d.ts +66 -15
- package/dist/voice-pipeline/HardCutBargeinHandler.d.ts.map +1 -1
- package/dist/voice-pipeline/HardCutBargeinHandler.js +65 -13
- package/dist/voice-pipeline/HardCutBargeinHandler.js.map +1 -1
- package/dist/voice-pipeline/HeuristicEndpointDetector.d.ts +116 -42
- package/dist/voice-pipeline/HeuristicEndpointDetector.d.ts.map +1 -1
- package/dist/voice-pipeline/HeuristicEndpointDetector.js +159 -52
- package/dist/voice-pipeline/HeuristicEndpointDetector.js.map +1 -1
- package/dist/voice-pipeline/SoftFadeBargeinHandler.d.ts +89 -24
- package/dist/voice-pipeline/SoftFadeBargeinHandler.d.ts.map +1 -1
- package/dist/voice-pipeline/SoftFadeBargeinHandler.js +74 -20
- package/dist/voice-pipeline/SoftFadeBargeinHandler.js.map +1 -1
- package/dist/voice-pipeline/VoiceInterruptError.d.ts +68 -10
- package/dist/voice-pipeline/VoiceInterruptError.d.ts.map +1 -1
- package/dist/voice-pipeline/VoiceInterruptError.js +53 -6
- package/dist/voice-pipeline/VoiceInterruptError.js.map +1 -1
- package/dist/voice-pipeline/VoicePipelineOrchestrator.d.ts +190 -39
- package/dist/voice-pipeline/VoicePipelineOrchestrator.d.ts.map +1 -1
- package/dist/voice-pipeline/VoicePipelineOrchestrator.js +266 -53
- package/dist/voice-pipeline/VoicePipelineOrchestrator.js.map +1 -1
- package/dist/voice-pipeline/WebSocketStreamTransport.d.ts +135 -43
- package/dist/voice-pipeline/WebSocketStreamTransport.d.ts.map +1 -1
- package/dist/voice-pipeline/WebSocketStreamTransport.js +109 -47
- package/dist/voice-pipeline/WebSocketStreamTransport.js.map +1 -1
- package/dist/voice-pipeline/index.d.ts +34 -1
- package/dist/voice-pipeline/index.d.ts.map +1 -1
- package/dist/voice-pipeline/index.js +41 -1
- package/dist/voice-pipeline/index.js.map +1 -1
- package/dist/voice-pipeline/types.d.ts +432 -106
- package/dist/voice-pipeline/types.d.ts.map +1 -1
- package/dist/voice-pipeline/types.js +21 -9
- package/dist/voice-pipeline/types.js.map +1 -1
- package/package.json +1 -1
|
@@ -6,27 +6,74 @@
|
|
|
6
6
|
* and relies solely on the duration of post-speech silence to decide when the user
|
|
7
7
|
* has finished speaking.
|
|
8
8
|
*
|
|
9
|
-
*
|
|
10
|
-
*
|
|
11
|
-
*
|
|
12
|
-
*
|
|
9
|
+
* ## How it works
|
|
10
|
+
*
|
|
11
|
+
* This detector delegates all silence timing to a {@link SilenceDetector} instance
|
|
12
|
+
* (from `core/audio/`). The SilenceDetector maintains an internal timer that
|
|
13
|
+
* starts when `handleSpeechEnd()` is called and fires `'utterance_end_detected'`
|
|
14
|
+
* when silence exceeds the configured `utteranceEndThresholdMs`. A
|
|
15
|
+
* `handleSpeechStart()` call cancels the timer.
|
|
16
|
+
*
|
|
17
|
+
* ## Energy threshold adaptation
|
|
18
|
+
*
|
|
19
|
+
* The SilenceDetector internally uses adaptive energy thresholds from the
|
|
20
|
+
* {@link AdaptiveVAD}. The VAD continuously recalibrates its speech/silence
|
|
21
|
+
* boundary based on ambient noise levels, so the effective silence threshold
|
|
22
|
+
* adapts to the environment (e.g. coffee shop vs quiet room). This detector
|
|
23
|
+
* does not perform its own energy analysis -- it trusts the upstream VAD's
|
|
24
|
+
* speech_start/speech_end decisions.
|
|
25
|
+
*
|
|
26
|
+
* ## When to use
|
|
27
|
+
*
|
|
28
|
+
* Use this detector when:
|
|
29
|
+
* - The STT provider does not produce reliable punctuation.
|
|
30
|
+
* - You want the simplest possible endpoint detection with no linguistic analysis.
|
|
31
|
+
* - Latency tolerance is higher (the full `utteranceEndThresholdMs` is always
|
|
32
|
+
* consumed, unlike the {@link HeuristicEndpointDetector} which can fire
|
|
33
|
+
* immediately on terminal punctuation).
|
|
34
|
+
*
|
|
35
|
+
* @see {@link HeuristicEndpointDetector} for the rule-based alternative with
|
|
36
|
+
* punctuation-triggered fast path.
|
|
37
|
+
* @see {@link IEndpointDetector} for the interface contract.
|
|
38
|
+
* @see {@link SilenceDetector} for the underlying silence timing logic.
|
|
39
|
+
*
|
|
40
|
+
* ## Events emitted
|
|
41
|
+
*
|
|
42
|
+
* | Event | Payload | Description |
|
|
43
|
+
* |-------------------|--------------------------|---------------------------------------------|
|
|
44
|
+
* | `'turn_complete'` | {@link TurnCompleteEvent} | Silence exceeded `utteranceEndThresholdMs`. |
|
|
45
|
+
* | `'speech_start'` | *(none)* | Re-emitted from incoming VAD event. |
|
|
13
46
|
*/
|
|
14
47
|
import { EventEmitter } from 'node:events';
|
|
15
48
|
import type { IEndpointDetector, VadEvent, TranscriptEvent } from './types.js';
|
|
16
49
|
/**
|
|
17
50
|
* Constructor options for {@link AcousticEndpointDetector}.
|
|
51
|
+
*
|
|
52
|
+
* @example
|
|
53
|
+
* ```typescript
|
|
54
|
+
* const detector = new AcousticEndpointDetector({
|
|
55
|
+
* significantPauseThresholdMs: 1000,
|
|
56
|
+
* utteranceEndThresholdMs: 2000,
|
|
57
|
+
* });
|
|
58
|
+
* ```
|
|
18
59
|
*/
|
|
19
60
|
export interface AcousticEndpointDetectorConfig {
|
|
20
61
|
/**
|
|
21
62
|
* Silence duration after speech (ms) that triggers a "significant pause"
|
|
22
63
|
* notification on the underlying {@link SilenceDetector}. Does not directly
|
|
23
|
-
* cause `turn_complete` to fire, but
|
|
64
|
+
* cause `turn_complete` to fire, but can be used by other pipeline components
|
|
65
|
+
* to show a "thinking" indicator.
|
|
24
66
|
* @defaultValue 1500
|
|
25
67
|
*/
|
|
26
68
|
significantPauseThresholdMs?: number;
|
|
27
69
|
/**
|
|
28
70
|
* Silence duration after speech (ms) that triggers `turn_complete` with
|
|
29
|
-
* `reason: 'silence_timeout'`.
|
|
71
|
+
* `reason: 'silence_timeout'`. This is the primary tuning knob for how
|
|
72
|
+
* long the pipeline waits after the user stops speaking.
|
|
73
|
+
*
|
|
74
|
+
* - Lower values (1000-2000 ms): Faster response, but may fire during natural pauses.
|
|
75
|
+
* - Higher values (3000-5000 ms): More tolerant of pauses, but feels sluggish.
|
|
76
|
+
*
|
|
30
77
|
* @defaultValue 3000
|
|
31
78
|
*/
|
|
32
79
|
utteranceEndThresholdMs?: number;
|
|
@@ -38,26 +85,45 @@ export interface AcousticEndpointDetectorConfig {
|
|
|
38
85
|
* `speech_end` events start the silence clock; `speech_start` events cancel
|
|
39
86
|
* any pending turn-complete emission. Transcript content is completely ignored.
|
|
40
87
|
*
|
|
88
|
+
* @see {@link IEndpointDetector} for the interface contract.
|
|
89
|
+
* @see {@link HeuristicEndpointDetector} for the heuristic alternative.
|
|
90
|
+
*
|
|
41
91
|
* @example
|
|
42
|
-
* ```
|
|
92
|
+
* ```typescript
|
|
43
93
|
* const detector = new AcousticEndpointDetector({ utteranceEndThresholdMs: 2000 });
|
|
44
|
-
* detector.on('turn_complete', (event) =>
|
|
45
|
-
*
|
|
94
|
+
* detector.on('turn_complete', (event) => {
|
|
95
|
+
* console.log(`Turn done after ${event.durationMs}ms of speech`);
|
|
96
|
+
* });
|
|
97
|
+
* detector.pushVadEvent({ type: 'speech_start', timestamp: Date.now() });
|
|
98
|
+
* detector.pushVadEvent({ type: 'speech_end', timestamp: Date.now() + 500 });
|
|
99
|
+
* // -> After 2000ms of silence, 'turn_complete' fires with reason 'silence_timeout'
|
|
46
100
|
* ```
|
|
47
101
|
*/
|
|
48
102
|
export declare class AcousticEndpointDetector extends EventEmitter implements IEndpointDetector {
|
|
49
|
-
/**
|
|
103
|
+
/**
|
|
104
|
+
* Detection mode identifier. Always `'acoustic'` for this implementation.
|
|
105
|
+
* @see {@link IEndpointDetector.mode}
|
|
106
|
+
*/
|
|
50
107
|
readonly mode: "acoustic";
|
|
51
|
-
/**
|
|
108
|
+
/**
|
|
109
|
+
* Underlying silence-duration tracker from `core/audio/`.
|
|
110
|
+
* Handles the actual timer management and threshold comparison.
|
|
111
|
+
*/
|
|
52
112
|
private readonly silenceDetector;
|
|
53
113
|
/**
|
|
54
|
-
* Timestamp (ms) when the current speech segment began.
|
|
55
|
-
* `durationMs` in the emitted {@link TurnCompleteEvent}
|
|
114
|
+
* Timestamp (ms) when the current speech segment began. Used to compute
|
|
115
|
+
* `durationMs` in the emitted {@link TurnCompleteEvent} as:
|
|
116
|
+
* `speechEndTimeMs - speechStartTimeMs`.
|
|
117
|
+
*
|
|
118
|
+
* Reset to `null` on each {@link reset} call.
|
|
56
119
|
*/
|
|
57
120
|
private speechStartTimeMs;
|
|
58
121
|
/**
|
|
59
122
|
* Timestamp (ms) when the most recent `speech_end` VAD event was received.
|
|
60
|
-
* Used to calculate `durationMs`
|
|
123
|
+
* Used together with {@link speechStartTimeMs} to calculate `durationMs`
|
|
124
|
+
* for the turn-complete event.
|
|
125
|
+
*
|
|
126
|
+
* Reset to `null` on each {@link reset} call.
|
|
61
127
|
*/
|
|
62
128
|
private speechEndTimeMs;
|
|
63
129
|
/**
|
|
@@ -69,21 +135,30 @@ export declare class AcousticEndpointDetector extends EventEmitter implements IE
|
|
|
69
135
|
/**
|
|
70
136
|
* Converts a {@link VadEvent} into the SilenceDetector's expected API calls.
|
|
71
137
|
*
|
|
72
|
-
* -
|
|
73
|
-
*
|
|
74
|
-
* -
|
|
138
|
+
* - **`speech_start`**: Resets silence state (cancels pending timers) and
|
|
139
|
+
* re-emits `'speech_start'` on this detector for pipeline consumption.
|
|
140
|
+
* - **`speech_end`**: Records the timestamp and starts the silence clock.
|
|
141
|
+
* - **`silence`**: Treated as ongoing non-speech frames, advancing the
|
|
142
|
+
* SilenceDetector's internal timer.
|
|
75
143
|
*
|
|
76
|
-
* @param event - Incoming VAD event.
|
|
144
|
+
* @param event - Incoming VAD event from the upstream voice activity detector.
|
|
77
145
|
*/
|
|
78
146
|
pushVadEvent(event: VadEvent): void;
|
|
79
147
|
/**
|
|
80
|
-
* No-op
|
|
148
|
+
* No-op -- this detector is purely acoustic and does not use transcript content.
|
|
149
|
+
*
|
|
150
|
+
* The method exists solely to satisfy the {@link IEndpointDetector} interface.
|
|
151
|
+
* Calling it has no effect and does not throw.
|
|
81
152
|
*
|
|
82
153
|
* @param _event - Ignored transcript event.
|
|
83
154
|
*/
|
|
84
155
|
pushTranscript(_event: TranscriptEvent): void;
|
|
85
156
|
/**
|
|
86
|
-
* Resets all internal state and
|
|
157
|
+
* Resets all internal state and cancels pending timers.
|
|
158
|
+
*
|
|
159
|
+
* Should be called at the start of each new turn to ensure clean state.
|
|
160
|
+
* This also resets the underlying SilenceDetector, cancelling any pending
|
|
161
|
+
* utterance_end_detected timer.
|
|
87
162
|
*/
|
|
88
163
|
reset(): void;
|
|
89
164
|
}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"AcousticEndpointDetector.d.ts","sourceRoot":"","sources":["../../src/voice-pipeline/AcousticEndpointDetector.ts"],"names":[],"mappings":"AAAA
|
|
1
|
+
{"version":3,"file":"AcousticEndpointDetector.d.ts","sourceRoot":"","sources":["../../src/voice-pipeline/AcousticEndpointDetector.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA6CG;AAEH,OAAO,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAE3C,OAAO,KAAK,EACV,iBAAiB,EACjB,QAAQ,EACR,eAAe,EAEhB,MAAM,YAAY,CAAC;AAMpB;;;;;;;;;;GAUG;AACH,MAAM,WAAW,8BAA8B;IAC7C;;;;;;OAMG;IACH,2BAA2B,CAAC,EAAE,MAAM,CAAC;IAErC;;;;;;;;;OASG;IACH,uBAAuB,CAAC,EAAE,MAAM,CAAC;CAClC;AAMD;;;;;;;;;;;;;;;;;;;;GAoBG;AACH,qBAAa,wBAAyB,SAAQ,YAAa,YAAW,iBAAiB;IACrF;;;OAGG;IACH,SAAgB,IAAI,EAAG,UAAU,CAAU;IAE3C;;;OAGG;IACH,OAAO,CAAC,QAAQ,CAAC,eAAe,CAAkB;IAElD;;;;;;OAMG;IACH,OAAO,CAAC,iBAAiB,CAAuB;IAEhD;;;;;;OAMG;IACH,OAAO,CAAC,eAAe,CAAuB;IAM9C;;;;OAIG;gBACS,MAAM,GAAE,8BAAmC;IAuCvD;;;;;;;;;;OAUG;IACI,YAAY,CAAC,KAAK,EAAE,QAAQ,GAAG,IAAI;IAwC1C;;;;;;;OAOG;IACI,cAAc,CAAC,MAAM,EAAE,eAAe,GAAG,IAAI;IAUpD;;;;;;OAMG;IACI,KAAK,IAAI,IAAI;CAKrB"}
|
|
@@ -6,10 +6,43 @@
|
|
|
6
6
|
* and relies solely on the duration of post-speech silence to decide when the user
|
|
7
7
|
* has finished speaking.
|
|
8
8
|
*
|
|
9
|
-
*
|
|
10
|
-
*
|
|
11
|
-
*
|
|
12
|
-
*
|
|
9
|
+
* ## How it works
|
|
10
|
+
*
|
|
11
|
+
* This detector delegates all silence timing to a {@link SilenceDetector} instance
|
|
12
|
+
* (from `core/audio/`). The SilenceDetector maintains an internal timer that
|
|
13
|
+
* starts when `handleSpeechEnd()` is called and fires `'utterance_end_detected'`
|
|
14
|
+
* when silence exceeds the configured `utteranceEndThresholdMs`. A
|
|
15
|
+
* `handleSpeechStart()` call cancels the timer.
|
|
16
|
+
*
|
|
17
|
+
* ## Energy threshold adaptation
|
|
18
|
+
*
|
|
19
|
+
* The SilenceDetector internally uses adaptive energy thresholds from the
|
|
20
|
+
* {@link AdaptiveVAD}. The VAD continuously recalibrates its speech/silence
|
|
21
|
+
* boundary based on ambient noise levels, so the effective silence threshold
|
|
22
|
+
* adapts to the environment (e.g. coffee shop vs quiet room). This detector
|
|
23
|
+
* does not perform its own energy analysis -- it trusts the upstream VAD's
|
|
24
|
+
* speech_start/speech_end decisions.
|
|
25
|
+
*
|
|
26
|
+
* ## When to use
|
|
27
|
+
*
|
|
28
|
+
* Use this detector when:
|
|
29
|
+
* - The STT provider does not produce reliable punctuation.
|
|
30
|
+
* - You want the simplest possible endpoint detection with no linguistic analysis.
|
|
31
|
+
* - Latency tolerance is higher (the full `utteranceEndThresholdMs` is always
|
|
32
|
+
* consumed, unlike the {@link HeuristicEndpointDetector} which can fire
|
|
33
|
+
* immediately on terminal punctuation).
|
|
34
|
+
*
|
|
35
|
+
* @see {@link HeuristicEndpointDetector} for the rule-based alternative with
|
|
36
|
+
* punctuation-triggered fast path.
|
|
37
|
+
* @see {@link IEndpointDetector} for the interface contract.
|
|
38
|
+
* @see {@link SilenceDetector} for the underlying silence timing logic.
|
|
39
|
+
*
|
|
40
|
+
* ## Events emitted
|
|
41
|
+
*
|
|
42
|
+
* | Event | Payload | Description |
|
|
43
|
+
* |-------------------|--------------------------|---------------------------------------------|
|
|
44
|
+
* | `'turn_complete'` | {@link TurnCompleteEvent} | Silence exceeded `utteranceEndThresholdMs`. |
|
|
45
|
+
* | `'speech_start'` | *(none)* | Re-emitted from incoming VAD event. |
|
|
13
46
|
*/
|
|
14
47
|
import { EventEmitter } from 'node:events';
|
|
15
48
|
import { SilenceDetector } from '../core/audio/SilenceDetector.js';
|
|
@@ -23,14 +56,23 @@ import { SilenceDetector } from '../core/audio/SilenceDetector.js';
|
|
|
23
56
|
* `speech_end` events start the silence clock; `speech_start` events cancel
|
|
24
57
|
* any pending turn-complete emission. Transcript content is completely ignored.
|
|
25
58
|
*
|
|
59
|
+
* @see {@link IEndpointDetector} for the interface contract.
|
|
60
|
+
* @see {@link HeuristicEndpointDetector} for the heuristic alternative.
|
|
61
|
+
*
|
|
26
62
|
* @example
|
|
27
|
-
* ```
|
|
63
|
+
* ```typescript
|
|
28
64
|
* const detector = new AcousticEndpointDetector({ utteranceEndThresholdMs: 2000 });
|
|
29
|
-
* detector.on('turn_complete', (event) =>
|
|
30
|
-
*
|
|
65
|
+
* detector.on('turn_complete', (event) => {
|
|
66
|
+
* console.log(`Turn done after ${event.durationMs}ms of speech`);
|
|
67
|
+
* });
|
|
68
|
+
* detector.pushVadEvent({ type: 'speech_start', timestamp: Date.now() });
|
|
69
|
+
* detector.pushVadEvent({ type: 'speech_end', timestamp: Date.now() + 500 });
|
|
70
|
+
* // -> After 2000ms of silence, 'turn_complete' fires with reason 'silence_timeout'
|
|
31
71
|
* ```
|
|
32
72
|
*/
|
|
33
73
|
export class AcousticEndpointDetector extends EventEmitter {
|
|
74
|
+
// ---------------------------------------------------------------------------
|
|
75
|
+
// Constructor
|
|
34
76
|
// ---------------------------------------------------------------------------
|
|
35
77
|
/**
|
|
36
78
|
* Creates a new AcousticEndpointDetector.
|
|
@@ -39,30 +81,46 @@ export class AcousticEndpointDetector extends EventEmitter {
|
|
|
39
81
|
*/
|
|
40
82
|
constructor(config = {}) {
|
|
41
83
|
super();
|
|
42
|
-
/**
|
|
84
|
+
/**
|
|
85
|
+
* Detection mode identifier. Always `'acoustic'` for this implementation.
|
|
86
|
+
* @see {@link IEndpointDetector.mode}
|
|
87
|
+
*/
|
|
43
88
|
this.mode = 'acoustic';
|
|
44
89
|
/**
|
|
45
|
-
* Timestamp (ms) when the current speech segment began.
|
|
46
|
-
* `durationMs` in the emitted {@link TurnCompleteEvent}
|
|
90
|
+
* Timestamp (ms) when the current speech segment began. Used to compute
|
|
91
|
+
* `durationMs` in the emitted {@link TurnCompleteEvent} as:
|
|
92
|
+
* `speechEndTimeMs - speechStartTimeMs`.
|
|
93
|
+
*
|
|
94
|
+
* Reset to `null` on each {@link reset} call.
|
|
47
95
|
*/
|
|
48
96
|
this.speechStartTimeMs = null;
|
|
49
97
|
/**
|
|
50
98
|
* Timestamp (ms) when the most recent `speech_end` VAD event was received.
|
|
51
|
-
* Used to calculate `durationMs`
|
|
99
|
+
* Used together with {@link speechStartTimeMs} to calculate `durationMs`
|
|
100
|
+
* for the turn-complete event.
|
|
101
|
+
*
|
|
102
|
+
* Reset to `null` on each {@link reset} call.
|
|
52
103
|
*/
|
|
53
104
|
this.speechEndTimeMs = null;
|
|
105
|
+
// Build SilenceDetector config from our options with sensible defaults
|
|
54
106
|
const sdConfig = {
|
|
55
107
|
significantPauseThresholdMs: config.significantPauseThresholdMs ?? 1500,
|
|
56
108
|
utteranceEndThresholdMs: config.utteranceEndThresholdMs ?? 3000,
|
|
57
109
|
};
|
|
58
110
|
this.silenceDetector = new SilenceDetector(sdConfig);
|
|
59
|
-
// When SilenceDetector decides the utterance has ended
|
|
111
|
+
// When SilenceDetector decides the utterance has ended (silence exceeded
|
|
112
|
+
// utteranceEndThresholdMs), translate that into a TurnCompleteEvent.
|
|
60
113
|
this.silenceDetector.on('utterance_end_detected', (_silenceDurationMs) => {
|
|
114
|
+
// Compute the duration of actual speech (not including silence).
|
|
115
|
+
// Falls back to 0 if timestamps are missing (defensive).
|
|
61
116
|
const durationMs = this.speechStartTimeMs !== null && this.speechEndTimeMs !== null
|
|
62
117
|
? this.speechEndTimeMs - this.speechStartTimeMs
|
|
63
118
|
: 0;
|
|
64
119
|
const event = {
|
|
65
|
-
|
|
120
|
+
// Acoustic mode has no transcript access -- the orchestrator will
|
|
121
|
+
// use whatever transcript the STT session has accumulated separately.
|
|
122
|
+
transcript: '',
|
|
123
|
+
// Confidence is 0 because we have no STT data to score.
|
|
66
124
|
confidence: 0,
|
|
67
125
|
durationMs,
|
|
68
126
|
reason: 'silence_timeout',
|
|
@@ -71,48 +129,76 @@ export class AcousticEndpointDetector extends EventEmitter {
|
|
|
71
129
|
});
|
|
72
130
|
}
|
|
73
131
|
// ---------------------------------------------------------------------------
|
|
74
|
-
// IEndpointDetector
|
|
132
|
+
// IEndpointDetector -- pushVadEvent
|
|
75
133
|
// ---------------------------------------------------------------------------
|
|
76
134
|
/**
|
|
77
135
|
* Converts a {@link VadEvent} into the SilenceDetector's expected API calls.
|
|
78
136
|
*
|
|
79
|
-
* -
|
|
80
|
-
*
|
|
81
|
-
* -
|
|
137
|
+
* - **`speech_start`**: Resets silence state (cancels pending timers) and
|
|
138
|
+
* re-emits `'speech_start'` on this detector for pipeline consumption.
|
|
139
|
+
* - **`speech_end`**: Records the timestamp and starts the silence clock.
|
|
140
|
+
* - **`silence`**: Treated as ongoing non-speech frames, advancing the
|
|
141
|
+
* SilenceDetector's internal timer.
|
|
82
142
|
*
|
|
83
|
-
* @param event - Incoming VAD event.
|
|
143
|
+
* @param event - Incoming VAD event from the upstream voice activity detector.
|
|
84
144
|
*/
|
|
85
145
|
pushVadEvent(event) {
|
|
86
|
-
//
|
|
87
|
-
// a pass-through
|
|
146
|
+
// The SilenceDetector's API requires a VADResult parameter, but it only
|
|
147
|
+
// uses it as a pass-through and doesn't inspect its contents. We pass
|
|
148
|
+
// a minimal stub typed as `never` to satisfy the signature without
|
|
149
|
+
// introducing a dependency on the full VADResult type.
|
|
88
150
|
const vadResultStub = { timestamp: event.timestamp };
|
|
89
151
|
switch (event.type) {
|
|
90
152
|
case 'speech_start':
|
|
153
|
+
// Record when speech began for duration calculation
|
|
91
154
|
this.speechStartTimeMs = event.timestamp;
|
|
155
|
+
// Clear the previous speech_end since a new speech segment started
|
|
92
156
|
this.speechEndTimeMs = null;
|
|
157
|
+
// Notify SilenceDetector to cancel any pending silence timer
|
|
93
158
|
this.silenceDetector.handleSpeechStart(vadResultStub);
|
|
159
|
+
// Re-emit for pipeline consumers (e.g. barge-in detection)
|
|
94
160
|
this.emit('speech_start');
|
|
95
161
|
break;
|
|
96
162
|
case 'speech_end':
|
|
163
|
+
// Record when speech ended for duration calculation
|
|
97
164
|
this.speechEndTimeMs = event.timestamp;
|
|
165
|
+
// Start the silence clock -- if silence persists beyond
|
|
166
|
+
// utteranceEndThresholdMs, SilenceDetector fires utterance_end_detected.
|
|
167
|
+
// The second argument (0) is the energy level -- not used in our context.
|
|
98
168
|
this.silenceDetector.handleSpeechEnd(vadResultStub, 0);
|
|
99
169
|
break;
|
|
100
170
|
case 'silence':
|
|
101
|
-
// Periodic silence heartbeat
|
|
171
|
+
// Periodic silence heartbeat -- advance SilenceDetector's internal
|
|
172
|
+
// timer by notifying it of continued non-speech activity.
|
|
102
173
|
this.silenceDetector.handleNoVoiceActivity(vadResultStub);
|
|
103
174
|
break;
|
|
104
175
|
}
|
|
105
176
|
}
|
|
177
|
+
// ---------------------------------------------------------------------------
|
|
178
|
+
// IEndpointDetector -- pushTranscript
|
|
179
|
+
// ---------------------------------------------------------------------------
|
|
106
180
|
/**
|
|
107
|
-
* No-op
|
|
181
|
+
* No-op -- this detector is purely acoustic and does not use transcript content.
|
|
182
|
+
*
|
|
183
|
+
* The method exists solely to satisfy the {@link IEndpointDetector} interface.
|
|
184
|
+
* Calling it has no effect and does not throw.
|
|
108
185
|
*
|
|
109
186
|
* @param _event - Ignored transcript event.
|
|
110
187
|
*/
|
|
111
188
|
pushTranscript(_event) {
|
|
112
|
-
// Intentional no-op: acoustic mode ignores linguistic content.
|
|
189
|
+
// Intentional no-op: acoustic mode ignores all linguistic content.
|
|
190
|
+
// The HeuristicEndpointDetector should be used if transcript-based
|
|
191
|
+
// endpoint detection is desired.
|
|
113
192
|
}
|
|
193
|
+
// ---------------------------------------------------------------------------
|
|
194
|
+
// IEndpointDetector -- reset
|
|
195
|
+
// ---------------------------------------------------------------------------
|
|
114
196
|
/**
|
|
115
|
-
* Resets all internal state and
|
|
197
|
+
* Resets all internal state and cancels pending timers.
|
|
198
|
+
*
|
|
199
|
+
* Should be called at the start of each new turn to ensure clean state.
|
|
200
|
+
* This also resets the underlying SilenceDetector, cancelling any pending
|
|
201
|
+
* utterance_end_detected timer.
|
|
116
202
|
*/
|
|
117
203
|
reset() {
|
|
118
204
|
this.speechStartTimeMs = null;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"AcousticEndpointDetector.js","sourceRoot":"","sources":["../../src/voice-pipeline/AcousticEndpointDetector.ts"],"names":[],"mappings":"AAAA
|
|
1
|
+
{"version":3,"file":"AcousticEndpointDetector.js","sourceRoot":"","sources":["../../src/voice-pipeline/AcousticEndpointDetector.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA6CG;AAEH,OAAO,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAC3C,OAAO,EAAE,eAAe,EAA8B,MAAM,kCAAkC,CAAC;AA8C/F,8EAA8E;AAC9E,iBAAiB;AACjB,8EAA8E;AAE9E;;;;;;;;;;;;;;;;;;;;GAoBG;AACH,MAAM,OAAO,wBAAyB,SAAQ,YAAY;IA+BxD,8EAA8E;IAC9E,cAAc;IACd,8EAA8E;IAE9E;;;;OAIG;IACH,YAAY,SAAyC,EAAE;QACrD,KAAK,EAAE,CAAC;QAxCV;;;WAGG;QACa,SAAI,GAAG,UAAmB,CAAC;QAQ3C;;;;;;WAMG;QACK,sBAAiB,GAAkB,IAAI,CAAC;QAEhD;;;;;;WAMG;QACK,oBAAe,GAAkB,IAAI,CAAC;QAc5C,uEAAuE;QACvE,MAAM,QAAQ,GAA0B;YACtC,2BAA2B,EAAE,MAAM,CAAC,2BAA2B,IAAI,IAAI;YACvE,uBAAuB,EAAE,MAAM,CAAC,uBAAuB,IAAI,IAAI;SAChE,CAAC;QAEF,IAAI,CAAC,eAAe,GAAG,IAAI,eAAe,CAAC,QAAQ,CAAC,CAAC;QAErD,yEAAyE;QACzE,qEAAqE;QACrE,IAAI,CAAC,eAAe,CAAC,EAAE,CAAC,wBAAwB,EAAE,CAAC,kBAA0B,EAAE,EAAE;YAC/E,iEAAiE;YACjE,yDAAyD;YACzD,MAAM,UAAU,GACd,IAAI,CAAC,iBAAiB,KAAK,IAAI,IAAI,IAAI,CAAC,eAAe,KAAK,IAAI;gBAC9D,CAAC,CAAC,IAAI,CAAC,eAAe,GAAG,IAAI,CAAC,iBAAiB;gBAC/C,CAAC,CAAC,CAAC,CAAC;YAER,MAAM,KAAK,GAAsB;gBAC/B,kEAAkE;gBAClE,sEAAsE;gBACtE,UAAU,EAAE,EAAE;gBACd,wDAAwD;gBACxD,UAAU,EAAE,CAAC;gBACb,UAAU;gBACV,MAAM,EAAE,iBAAiB;aAC1B,CAAC;YAEF,IAAI,CAAC,IAAI,CAAC,eAAe,EAAE,KAAK,CAAC,CAAC;QACpC,CAAC,CAAC,CAAC;IACL,CAAC;IAED,8EAA8E;IAC9E,oCAAoC;IACpC,8EAA8E;IAE9E;;;;;;;;;;OAUG;IACI,YAAY,CAAC,KAAe;QACjC,wEAAwE;QACxE,sEAAsE;QACtE,mEAAmE;QACnE,uDAAuD;QACvD,MAAM,aAAa,GAAG,EAAE,SAAS,EAAE,KAAK,CAAC,SAAS,EAAW,CAAC;QAE9D,QAAQ,KAAK,CAAC,IAAI,EAAE,CAAC;YACnB,KAAK,cAAc;gBACjB,oDAAoD;gBACpD,IAAI,CAAC,iBAAiB,GAAG,KAAK,CAAC,SAAS,CAAC;gBACzC,mEAAmE;gBACnE,IAAI,CAAC,eAAe,GAAG,IAAI,CAAC;gBAC5B,6DAA6D;gBAC7D,IAAI,CAAC,eAAe,CAAC,iBAAiB,CAAC,aAAa,CAAC,CAAC;gBACtD,2DAA2D;gBAC3D,IAAI,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;gBAC1B,MAAM;YAER,KAAK,YAAY;gBACf,oDAAoD;gBACpD,IAAI,CAAC,eAAe,GAAG,KAAK,CAAC,SAAS,CAAC;gBACvC,wDAAwD;gBACxD,yEAAyE;gBACzE,0EAA0E;gBAC1E,IAAI,CAAC,eAAe,CAAC,eAAe,CAAC,aAAa,EAAE,CAAC,CAAC,CAAC;gBACvD,
MAAM;YAER,KAAK,SAAS;gBACZ,mEAAmE;gBACnE,0DAA0D;gBAC1D,IAAI,CAAC,eAAe,CAAC,qBAAqB,CAAC,aAAa,CAAC,CAAC;gBAC1D,MAAM;QACV,CAAC;IACH,CAAC;IAED,8EAA8E;IAC9E,sCAAsC;IACtC,8EAA8E;IAE9E;;;;;;;OAOG;IACI,cAAc,CAAC,MAAuB;QAC3C,mEAAmE;QACnE,mEAAmE;QACnE,iCAAiC;IACnC,CAAC;IAED,8EAA8E;IAC9E,6BAA6B;IAC7B,8EAA8E;IAE9E;;;;;;OAMG;IACI,KAAK;QACV,IAAI,CAAC,iBAAiB,GAAG,IAAI,CAAC;QAC9B,IAAI,CAAC,eAAe,GAAG,IAAI,CAAC;QAC5B,IAAI,CAAC,eAAe,CAAC,KAAK,EAAE,CAAC;IAC/B,CAAC;CACF"}
|
|
@@ -2,21 +2,59 @@
|
|
|
2
2
|
* @module voice-pipeline/HardCutBargeinHandler
|
|
3
3
|
*
|
|
4
4
|
* Implements a hard-cut barge-in policy: when the user speaks over TTS output
|
|
5
|
-
* for at least
|
|
6
|
-
* no fade-out. Short detections below the
|
|
7
|
-
* noise and ignored.
|
|
5
|
+
* for at least {@link HardCutBargeinHandlerOptions.minSpeechMs} milliseconds,
|
|
6
|
+
* playback is stopped immediately with no fade-out. Short detections below the
|
|
7
|
+
* threshold are treated as accidental noise and ignored.
|
|
8
|
+
*
|
|
9
|
+
* ## Why 300 ms default threshold?
|
|
10
|
+
*
|
|
11
|
+
* The 300 ms threshold was chosen to filter out common non-speech audio events
|
|
12
|
+
* that trigger false barge-in detections:
|
|
13
|
+
*
|
|
14
|
+
* - **Lip smacks**: Typically 50-150 ms of energy.
|
|
15
|
+
* - **Breaths/sighs**: Typically 100-250 ms of energy.
|
|
16
|
+
* - **Coughs/sneezes**: Typically a short 100-200 ms burst, though longer ones may exceed the threshold.
|
|
17
|
+
* - **Background noise spikes**: Door closing, keyboard typing -- usually < 200 ms.
|
|
18
|
+
*
|
|
19
|
+
* At 300 ms, a detection almost certainly represents intentional speech rather
|
|
20
|
+
* than ambient noise. Lowering to < 200 ms increases false positives significantly
|
|
21
|
+
* in noisy environments. Raising to > 500 ms adds noticeable delay before the
|
|
22
|
+
* agent acknowledges the interruption.
|
|
23
|
+
*
|
|
24
|
+
* ## When to use hard-cut vs soft-fade
|
|
25
|
+
*
|
|
26
|
+
* Use hard-cut when:
|
|
27
|
+
* - The conversation style is fast-paced (e.g. customer support).
|
|
28
|
+
* - Users expect immediate response to interruption.
|
|
29
|
+
* - Audio quality is high (fewer false positives).
|
|
30
|
+
*
|
|
31
|
+
* Use {@link SoftFadeBargeinHandler} when:
|
|
32
|
+
* - The conversation is more measured (e.g. storytelling, education).
|
|
33
|
+
* - Users may accidentally trigger barge-in (noisy environment).
|
|
34
|
+
* - A smoother audio experience is preferred.
|
|
35
|
+
*
|
|
36
|
+
* @see {@link SoftFadeBargeinHandler} for the three-tier soft-fade alternative.
|
|
37
|
+
* @see {@link IBargeinHandler} for the interface contract.
|
|
8
38
|
*/
|
|
9
39
|
import type { BargeinAction, BargeinContext, IBargeinHandler } from './types.js';
|
|
10
40
|
/**
|
|
11
41
|
* Construction options for {@link HardCutBargeinHandler}.
|
|
42
|
+
*
|
|
43
|
+
* @example
|
|
44
|
+
* ```typescript
|
|
45
|
+
* const handler = new HardCutBargeinHandler({ minSpeechMs: 250 });
|
|
46
|
+
* ```
|
|
12
47
|
*/
|
|
13
48
|
export interface HardCutBargeinHandlerOptions {
|
|
14
49
|
/**
|
|
15
50
|
* Minimum confirmed speech duration (in milliseconds) required before a
|
|
16
51
|
* barge-in is treated as intentional. Detections shorter than this value are
|
|
17
|
-
* returned as `{ type: 'ignore' }` to avoid reacting to background noise
|
|
52
|
+
* returned as `{ type: 'ignore' }` to avoid reacting to background noise,
|
|
53
|
+
* lip smacks, breaths, or other brief non-speech audio events.
|
|
18
54
|
*
|
|
19
55
|
* @defaultValue 300
|
|
56
|
+
*
|
|
57
|
+
* @see Module-level documentation for rationale behind the 300 ms default.
|
|
20
58
|
*/
|
|
21
59
|
minSpeechMs?: number;
|
|
22
60
|
}
|
|
@@ -24,26 +62,38 @@ export interface HardCutBargeinHandlerOptions {
|
|
|
24
62
|
* Barge-in handler that applies a hard-cut strategy.
|
|
25
63
|
*
|
|
26
64
|
* When the user speaks over an active TTS stream, this handler immediately
|
|
27
|
-
* cancels playback if the detected speech exceeds
|
|
28
|
-
* threshold the interruption is considered noise and playback continues
|
|
65
|
+
* cancels playback if the detected speech lasts at least {@link minSpeechMs}. Below
|
|
66
|
+
* that threshold the interruption is considered noise and playback continues
|
|
29
67
|
* uninterrupted.
|
|
30
68
|
*
|
|
69
|
+
* The handler is stateless -- each {@link handleBargein} call is evaluated
|
|
70
|
+
* independently with no memory of previous barge-in events.
|
|
71
|
+
*
|
|
72
|
+
* @see {@link IBargeinHandler} for the interface contract.
|
|
73
|
+
* @see {@link SoftFadeBargeinHandler} for the three-tier alternative.
|
|
74
|
+
*
|
|
31
75
|
* @example
|
|
32
|
-
* ```
|
|
76
|
+
* ```typescript
|
|
33
77
|
* const handler = new HardCutBargeinHandler({ minSpeechMs: 250 });
|
|
34
|
-
*
|
|
35
|
-
* //
|
|
78
|
+
*
|
|
79
|
+
* // Short noise -> ignored
|
|
80
|
+
* handler.handleBargein({ speechDurationMs: 100, interruptedText: '...', playedDurationMs: 500 });
|
|
81
|
+
* // -> { type: 'ignore' }
|
|
82
|
+
*
|
|
83
|
+
* // Intentional speech -> cancel
|
|
84
|
+
* handler.handleBargein({ speechDurationMs: 400, interruptedText: '...', playedDurationMs: 500 });
|
|
85
|
+
* // -> { type: 'cancel', injectMarker: '[interrupted]' }
|
|
36
86
|
* ```
|
|
37
87
|
*/
|
|
38
88
|
export declare class HardCutBargeinHandler implements IBargeinHandler {
|
|
39
89
|
/**
|
|
40
90
|
* The interruption strategy implemented by this handler.
|
|
41
|
-
* Always `'hard-cut'
|
|
91
|
+
* Always `'hard-cut'` -- playback is stopped instantly with no fade.
|
|
42
92
|
*/
|
|
43
93
|
readonly mode: "hard-cut";
|
|
44
94
|
/**
|
|
45
95
|
* Minimum speech duration in milliseconds before the interruption is
|
|
46
|
-
* considered intentional.
|
|
96
|
+
* considered intentional. Set once at construction and never changed.
|
|
47
97
|
*/
|
|
48
98
|
private readonly minSpeechMs;
|
|
49
99
|
/**
|
|
@@ -55,12 +105,13 @@ export declare class HardCutBargeinHandler implements IBargeinHandler {
|
|
|
55
105
|
/**
|
|
56
106
|
* Evaluate the barge-in context and return the action the pipeline should take.
|
|
57
107
|
*
|
|
58
|
-
*
|
|
59
|
-
*
|
|
60
|
-
*
|
|
108
|
+
* Decision logic (binary threshold):
|
|
109
|
+
* - `speechDurationMs >= minSpeechMs` -> Cancel TTS immediately and inject
|
|
110
|
+
* an `'[interrupted]'` marker into the conversation context.
|
|
111
|
+
* - `speechDurationMs < minSpeechMs` -> Ignore the detection as noise.
|
|
61
112
|
*
|
|
62
113
|
* @param context - Snapshot of the barge-in state at the moment of detection.
|
|
63
|
-
* @returns The pipeline action to execute.
|
|
114
|
+
* @returns The pipeline action to execute. Always synchronous (no Promise).
|
|
64
115
|
*/
|
|
65
116
|
handleBargein(context: BargeinContext): BargeinAction;
|
|
66
117
|
}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"HardCutBargeinHandler.d.ts","sourceRoot":"","sources":["../../src/voice-pipeline/HardCutBargeinHandler.ts"],"names":[],"mappings":"AAAA
|
|
1
|
+
{"version":3,"file":"HardCutBargeinHandler.d.ts","sourceRoot":"","sources":["../../src/voice-pipeline/HardCutBargeinHandler.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAqCG;AAEH,OAAO,KAAK,EAAE,aAAa,EAAE,cAAc,EAAE,eAAe,EAAE,MAAM,YAAY,CAAC;AAMjF;;;;;;;GAOG;AACH,MAAM,WAAW,4BAA4B;IAC3C;;;;;;;;;OASG;IACH,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB;AAMD;;;;;;;;;;;;;;;;;;;;;;;;;;GA0BG;AACH,qBAAa,qBAAsB,YAAW,eAAe;IAC3D;;;OAGG;IACH,QAAQ,CAAC,IAAI,EAAG,UAAU,CAAU;IAEpC;;;OAGG;IACH,OAAO,CAAC,QAAQ,CAAC,WAAW,CAAS;IAErC;;;;OAIG;gBACS,OAAO,GAAE,4BAAiC;IAItD;;;;;;;;;;OAUG;IACH,aAAa,CAAC,OAAO,EAAE,cAAc,GAAG,aAAa;CAatD"}
|