@framers/agentos 0.1.107 → 0.1.109
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/memory/ingestion/ChunkingEngine.d.ts.map +1 -1
- package/dist/memory/ingestion/ChunkingEngine.js +5 -1
- package/dist/memory/ingestion/ChunkingEngine.js.map +1 -1
- package/dist/memory/ingestion/DocxLoader.d.ts.map +1 -1
- package/dist/memory/ingestion/DocxLoader.js +2 -1
- package/dist/memory/ingestion/DocxLoader.js.map +1 -1
- package/dist/memory/ingestion/FolderScanner.d.ts.map +1 -1
- package/dist/memory/ingestion/FolderScanner.js +6 -3
- package/dist/memory/ingestion/FolderScanner.js.map +1 -1
- package/dist/memory/ingestion/HtmlLoader.d.ts.map +1 -1
- package/dist/memory/ingestion/HtmlLoader.js +2 -1
- package/dist/memory/ingestion/HtmlLoader.js.map +1 -1
- package/dist/memory/ingestion/MarkdownLoader.d.ts.map +1 -1
- package/dist/memory/ingestion/MarkdownLoader.js +2 -1
- package/dist/memory/ingestion/MarkdownLoader.js.map +1 -1
- package/dist/memory/ingestion/PdfLoader.d.ts.map +1 -1
- package/dist/memory/ingestion/PdfLoader.js +2 -1
- package/dist/memory/ingestion/PdfLoader.js.map +1 -1
- package/dist/memory/ingestion/TextLoader.d.ts.map +1 -1
- package/dist/memory/ingestion/TextLoader.js +3 -2
- package/dist/memory/ingestion/TextLoader.js.map +1 -1
- package/dist/memory/ingestion/pathUtils.d.ts +40 -0
- package/dist/memory/ingestion/pathUtils.d.ts.map +1 -0
- package/dist/memory/ingestion/pathUtils.js +62 -0
- package/dist/memory/ingestion/pathUtils.js.map +1 -0
- package/dist/voice-pipeline/AcousticEndpointDetector.d.ts +95 -20
- package/dist/voice-pipeline/AcousticEndpointDetector.d.ts.map +1 -1
- package/dist/voice-pipeline/AcousticEndpointDetector.js +110 -24
- package/dist/voice-pipeline/AcousticEndpointDetector.js.map +1 -1
- package/dist/voice-pipeline/HardCutBargeinHandler.d.ts +66 -15
- package/dist/voice-pipeline/HardCutBargeinHandler.d.ts.map +1 -1
- package/dist/voice-pipeline/HardCutBargeinHandler.js +65 -13
- package/dist/voice-pipeline/HardCutBargeinHandler.js.map +1 -1
- package/dist/voice-pipeline/HeuristicEndpointDetector.d.ts +116 -42
- package/dist/voice-pipeline/HeuristicEndpointDetector.d.ts.map +1 -1
- package/dist/voice-pipeline/HeuristicEndpointDetector.js +159 -52
- package/dist/voice-pipeline/HeuristicEndpointDetector.js.map +1 -1
- package/dist/voice-pipeline/SoftFadeBargeinHandler.d.ts +89 -24
- package/dist/voice-pipeline/SoftFadeBargeinHandler.d.ts.map +1 -1
- package/dist/voice-pipeline/SoftFadeBargeinHandler.js +74 -20
- package/dist/voice-pipeline/SoftFadeBargeinHandler.js.map +1 -1
- package/dist/voice-pipeline/VoiceInterruptError.d.ts +68 -10
- package/dist/voice-pipeline/VoiceInterruptError.d.ts.map +1 -1
- package/dist/voice-pipeline/VoiceInterruptError.js +53 -6
- package/dist/voice-pipeline/VoiceInterruptError.js.map +1 -1
- package/dist/voice-pipeline/VoicePipelineOrchestrator.d.ts +190 -39
- package/dist/voice-pipeline/VoicePipelineOrchestrator.d.ts.map +1 -1
- package/dist/voice-pipeline/VoicePipelineOrchestrator.js +266 -53
- package/dist/voice-pipeline/VoicePipelineOrchestrator.js.map +1 -1
- package/dist/voice-pipeline/WebSocketStreamTransport.d.ts +135 -43
- package/dist/voice-pipeline/WebSocketStreamTransport.d.ts.map +1 -1
- package/dist/voice-pipeline/WebSocketStreamTransport.js +109 -47
- package/dist/voice-pipeline/WebSocketStreamTransport.js.map +1 -1
- package/dist/voice-pipeline/index.d.ts +34 -1
- package/dist/voice-pipeline/index.d.ts.map +1 -1
- package/dist/voice-pipeline/index.js +41 -1
- package/dist/voice-pipeline/index.js.map +1 -1
- package/dist/voice-pipeline/types.d.ts +432 -106
- package/dist/voice-pipeline/types.d.ts.map +1 -1
- package/dist/voice-pipeline/types.js +21 -9
- package/dist/voice-pipeline/types.js.map +1 -1
- package/package.json +1 -1
|
@@ -6,14 +6,30 @@
|
|
|
6
6
|
* finished speaking. Suitable for low-latency deployments where an LLM-based
|
|
7
7
|
* semantic detector would add unacceptable round-trip overhead.
|
|
8
8
|
*
|
|
9
|
-
* Detection strategy
|
|
10
|
-
*
|
|
11
|
-
*
|
|
12
|
-
*
|
|
13
|
-
*
|
|
14
|
-
*
|
|
15
|
-
*
|
|
16
|
-
*
|
|
9
|
+
* ## Detection strategy
|
|
10
|
+
*
|
|
11
|
+
* 1. On `speech_end`, if the accumulated final transcript ends with `.`, `?`,
|
|
12
|
+
* or `!`, fire `turn_complete` immediately with reason `'punctuation'`.
|
|
13
|
+
* This provides the lowest-latency turn handoff for well-punctuated speech.
|
|
14
|
+
*
|
|
15
|
+
* 2. Otherwise, start a silence timer (default 1,500 ms). If speech does not
|
|
16
|
+
* resume before the timer fires, emit `turn_complete` with reason
|
|
17
|
+
* `'silence_timeout'`. The timeout acts as a safety net for STT providers
|
|
18
|
+
* that don't produce terminal punctuation reliably.
|
|
19
|
+
*
|
|
20
|
+
* 3. Backchannel phrases (e.g. "uh huh", "yeah") are recognised, suppressed
|
|
21
|
+
* from accumulation, and re-emitted as `'backchannel_detected'` events so
|
|
22
|
+
* the pipeline can decide whether to suppress an agent response.
|
|
23
|
+
*
|
|
24
|
+
* ## Why heuristic over acoustic-only?
|
|
25
|
+
*
|
|
26
|
+
* Pure silence timeout adds up to 1.5 s of unnecessary latency on every turn
|
|
27
|
+
* when the user ends a sentence cleanly. By checking for terminal punctuation,
|
|
28
|
+
* this detector can fire turn_complete immediately, cutting perceived latency
|
|
29
|
+
* by more than half for typical conversational speech.
|
|
30
|
+
*
|
|
31
|
+
* @see {@link AcousticEndpointDetector} for the purely acoustic alternative.
|
|
32
|
+
* @see {@link IEndpointDetector} for the interface contract.
|
|
17
33
|
*/
|
|
18
34
|
import { EventEmitter } from 'node:events';
|
|
19
35
|
// ---------------------------------------------------------------------------
|
|
@@ -21,15 +37,41 @@ import { EventEmitter } from 'node:events';
|
|
|
21
37
|
// ---------------------------------------------------------------------------
|
|
22
38
|
/**
|
|
23
39
|
* Default silence duration (ms) after speech stops before firing `turn_complete`.
|
|
40
|
+
*
|
|
41
|
+
* 1,500 ms was chosen as a balance between:
|
|
42
|
+
* - Too short (< 800 ms): Fires mid-sentence when the user pauses to think.
|
|
43
|
+
* - Too long (> 2,000 ms): Adds noticeable latency, making the agent feel slow.
|
|
44
|
+
*
|
|
45
|
+
* This value is consistent with research on conversational turn-taking gaps
|
|
46
|
+
* (Stivers et al., 2009: modal gap ~200 ms, but STT adds 200-500 ms latency,
|
|
47
|
+
* and users expect the agent to wait slightly longer than a human would).
|
|
24
48
|
*/
|
|
25
49
|
const DEFAULT_SILENCE_TIMEOUT_MS = 1500;
|
|
26
50
|
/**
|
|
27
|
-
*
|
|
51
|
+
* Regular expression matching terminal punctuation characters that signal
|
|
52
|
+
* sentence completion. Only tested against the final character of the
|
|
53
|
+
* accumulated transcript text.
|
|
54
|
+
*
|
|
55
|
+
* We deliberately exclude semicolons, colons, and ellipses because they
|
|
56
|
+
* rarely indicate turn completion in spoken language.
|
|
28
57
|
*/
|
|
29
58
|
const TERMINAL_PUNCTUATION = /[.?!]$/;
|
|
30
59
|
/**
|
|
31
60
|
* Normalised backchannel phrases that indicate the listener is acknowledging
|
|
32
61
|
* but not taking a full conversational turn. Compared after `.trim().toLowerCase()`.
|
|
62
|
+
*
|
|
63
|
+
* These 13 phrases were selected because:
|
|
64
|
+
* - They are the most common English-language backchannel markers in the
|
|
65
|
+
* Switchboard and Fisher telephone conversation corpora.
|
|
66
|
+
* - They are short enough that STT providers reliably produce them as
|
|
67
|
+
* standalone final transcripts (longer phrases like "I see" risk being
|
|
68
|
+
* part of a larger utterance).
|
|
69
|
+
* - Including both spellings of common variants (e.g. "mm hmm", "mmhmm",
|
|
70
|
+
* "mm-hmm", "mhm") ensures robust matching across STT providers that
|
|
71
|
+
* normalise differently.
|
|
72
|
+
*
|
|
73
|
+
* The set is intentionally conservative -- adding phrases like "I see" or
|
|
74
|
+
* "go on" risks false positives when the user is genuinely taking a turn.
|
|
33
75
|
*/
|
|
34
76
|
const BACKCHANNEL_PHRASES = new Set([
|
|
35
77
|
'uh huh',
|
|
@@ -53,18 +95,25 @@ const BACKCHANNEL_PHRASES = new Set([
|
|
|
53
95
|
* Heuristic endpoint detector that uses terminal punctuation and a silence
|
|
54
96
|
* timeout to decide when the user's turn is complete.
|
|
55
97
|
*
|
|
56
|
-
*
|
|
57
|
-
*
|
|
58
|
-
*
|
|
59
|
-
*
|
|
98
|
+
* ## Events emitted
|
|
99
|
+
*
|
|
100
|
+
* | Event | Payload | Description |
|
|
101
|
+
* |--------------------------|--------------------------|------------------------------------|
|
|
102
|
+
* | `'turn_complete'` | {@link TurnCompleteEvent}| User turn has ended. |
|
|
103
|
+
* | `'backchannel_detected'` | `{ text: string }` | Backchannel phrase was recognised. |
|
|
104
|
+
*
|
|
105
|
+
* @see {@link IEndpointDetector} for the interface contract.
|
|
106
|
+
* @see {@link AcousticEndpointDetector} for the purely acoustic alternative.
|
|
60
107
|
*
|
|
61
108
|
* @example
|
|
62
109
|
* ```typescript
|
|
63
110
|
* const detector = new HeuristicEndpointDetector({ silenceTimeoutMs: 1000 });
|
|
64
111
|
* detector.on('turn_complete', (event) => console.log('Turn done:', event));
|
|
112
|
+
*
|
|
113
|
+
* // Simulate a punctuated sentence followed by speech_end
|
|
65
114
|
* detector.pushTranscript({ text: 'Hello there.', isFinal: true, confidence: 0.95, words: [] });
|
|
66
115
|
* detector.pushVadEvent({ type: 'speech_end', timestamp: Date.now(), source: 'vad' });
|
|
67
|
-
* //
|
|
116
|
+
* // -> 'turn_complete' fires immediately with reason 'punctuation'
|
|
68
117
|
* ```
|
|
69
118
|
*/
|
|
70
119
|
export class HeuristicEndpointDetector extends EventEmitter {
|
|
@@ -74,80 +123,118 @@ export class HeuristicEndpointDetector extends EventEmitter {
|
|
|
74
123
|
/**
|
|
75
124
|
* Create a new {@link HeuristicEndpointDetector}.
|
|
76
125
|
*
|
|
77
|
-
* @param options
|
|
126
|
+
* @param options - Optional configuration overrides.
|
|
78
127
|
*/
|
|
79
128
|
constructor(options = {}) {
|
|
80
129
|
super();
|
|
81
130
|
/**
|
|
82
131
|
* Active detection strategy label.
|
|
83
|
-
*
|
|
84
|
-
*
|
|
132
|
+
* Always `'heuristic'` for this implementation.
|
|
133
|
+
*
|
|
134
|
+
* @see {@link IEndpointDetector.mode}
|
|
85
135
|
*/
|
|
86
136
|
this.mode = 'heuristic';
|
|
87
|
-
/**
|
|
137
|
+
/**
|
|
138
|
+
* The latest final transcript text accumulated for the current turn.
|
|
139
|
+
* Only updated by final (non-interim) transcript events.
|
|
140
|
+
* Reset to empty string after each `turn_complete` emission.
|
|
141
|
+
*/
|
|
88
142
|
this.accumulatedText = '';
|
|
89
|
-
/**
|
|
143
|
+
/**
|
|
144
|
+
* Whether the VAD currently reports active speech. Set to `true` on
|
|
145
|
+
* `speech_start` and `false` on `speech_end`. Used to prevent the
|
|
146
|
+
* silence timer from starting while the user is still speaking.
|
|
147
|
+
*/
|
|
90
148
|
this.speechActive = false;
|
|
91
|
-
/**
|
|
149
|
+
/**
|
|
150
|
+
* Handle to a pending silence timeout, or `null` if none is running.
|
|
151
|
+
* Cleared when speech resumes or when the detector is reset.
|
|
152
|
+
*/
|
|
92
153
|
this.silenceTimer = null;
|
|
93
|
-
/**
|
|
154
|
+
/**
|
|
155
|
+
* Wall-clock timestamp (ms) when the current turn's speech started.
|
|
156
|
+
* Used to compute `durationMs` in the emitted {@link TurnCompleteEvent}.
|
|
157
|
+
* `null` when no speech has been detected in the current turn.
|
|
158
|
+
*/
|
|
94
159
|
this.turnStartMs = null;
|
|
95
|
-
/**
|
|
160
|
+
/**
|
|
161
|
+
* Confidence of the most recent final transcript. Forwarded into the
|
|
162
|
+
* emitted {@link TurnCompleteEvent}. Defaults to 1 (perfect confidence)
|
|
163
|
+
* and is updated with each final transcript event.
|
|
164
|
+
*/
|
|
96
165
|
this.lastConfidence = 1;
|
|
97
166
|
this.silenceTimeoutMs = options.silenceTimeoutMs ?? DEFAULT_SILENCE_TIMEOUT_MS;
|
|
98
167
|
}
|
|
99
168
|
// ---------------------------------------------------------------------------
|
|
100
|
-
// IEndpointDetector
|
|
169
|
+
// IEndpointDetector -- pushTranscript
|
|
101
170
|
// ---------------------------------------------------------------------------
|
|
102
171
|
/**
|
|
103
172
|
* Ingest a transcript event from the upstream STT session.
|
|
104
173
|
*
|
|
105
174
|
* Only final events (`isFinal: true`) affect internal state. Interim results
|
|
106
|
-
* are silently ignored
|
|
107
|
-
*
|
|
175
|
+
* are silently ignored because:
|
|
176
|
+
* 1. They arrive very frequently (10-50 per second) and would trigger
|
|
177
|
+
* excessive punctuation checks.
|
|
178
|
+
* 2. Their text is unstable -- a word ending with "." may be revised in
|
|
179
|
+
* the next interim result, causing false turn-completion signals.
|
|
108
180
|
*
|
|
109
|
-
* If the final text is a recognised backchannel phrase the detector emits
|
|
110
|
-
* `'backchannel_detected'` and returns
|
|
111
|
-
* a subsequent `speech_end` event
|
|
181
|
+
* If the final text is a recognised backchannel phrase, the detector emits
|
|
182
|
+
* `'backchannel_detected'` and returns WITHOUT accumulating the text. This
|
|
183
|
+
* prevents a subsequent `speech_end` event from triggering `turn_complete`
|
|
184
|
+
* for what was merely an acknowledgement, not a real conversational turn.
|
|
112
185
|
*
|
|
113
|
-
* @param transcript
|
|
186
|
+
* @param transcript - Transcript event from the STT session.
|
|
114
187
|
*/
|
|
115
188
|
pushTranscript(transcript) {
|
|
116
189
|
if (!transcript.isFinal) {
|
|
117
|
-
// Ignore partial/interim hypotheses
|
|
190
|
+
// Ignore partial/interim hypotheses -- they will be superseded by
|
|
191
|
+
// a subsequent final result or revised interim.
|
|
118
192
|
return;
|
|
119
193
|
}
|
|
120
194
|
const text = transcript.text;
|
|
121
195
|
const normalised = text.trim().toLowerCase();
|
|
122
|
-
//
|
|
196
|
+
// Check for backchannel phrases BEFORE accumulating. This ensures that
|
|
197
|
+
// "uh huh" followed by speech_end does NOT produce a turn_complete.
|
|
123
198
|
if (BACKCHANNEL_PHRASES.has(normalised)) {
|
|
124
199
|
this.emit('backchannel_detected', { text });
|
|
125
200
|
return;
|
|
126
201
|
}
|
|
127
202
|
// Accumulate the final transcript and store the confidence score.
|
|
203
|
+
// We overwrite (not append) because each final event from the STT
|
|
204
|
+
// provider represents the complete hypothesis for the current utterance.
|
|
128
205
|
this.accumulatedText = text;
|
|
129
206
|
this.lastConfidence = transcript.confidence;
|
|
130
207
|
}
|
|
131
208
|
// ---------------------------------------------------------------------------
|
|
132
|
-
// IEndpointDetector
|
|
209
|
+
// IEndpointDetector -- pushVadEvent
|
|
133
210
|
// ---------------------------------------------------------------------------
|
|
134
211
|
/**
|
|
135
212
|
* Ingest a VAD (voice activity detection) event.
|
|
136
213
|
*
|
|
137
|
-
*
|
|
138
|
-
*
|
|
139
|
-
* -
|
|
140
|
-
*
|
|
141
|
-
*
|
|
142
|
-
*
|
|
214
|
+
* Event handling by type:
|
|
215
|
+
*
|
|
216
|
+
* - **`speech_start`**: Marks the turn as active and cancels any pending
|
|
217
|
+
* silence timer (the user resumed speaking before the timeout elapsed).
|
|
218
|
+
* This is critical for avoiding false turn-completion when the user
|
|
219
|
+
* takes a brief pause mid-sentence.
|
|
220
|
+
*
|
|
221
|
+
* - **`speech_end`**: If accumulated text is available, either fires
|
|
222
|
+
* `turn_complete` immediately (when text ends with terminal punctuation)
|
|
223
|
+
* or starts the silence timer (when no punctuation is detected).
|
|
224
|
+
*
|
|
225
|
+
* - **`silence`**: Periodic heartbeat events are ignored. The silence timer
|
|
226
|
+
* (started on `speech_end`) already handles delayed turn-completion
|
|
227
|
+
* independently of heartbeat cadence.
|
|
143
228
|
*
|
|
144
|
-
* @param event
|
|
229
|
+
* @param event - VAD transition event.
|
|
145
230
|
*/
|
|
146
231
|
pushVadEvent(event) {
|
|
147
232
|
switch (event.type) {
|
|
148
233
|
case 'speech_start': {
|
|
149
234
|
this.speechActive = true;
|
|
235
|
+
// Cancel any pending silence timer -- the user is speaking again
|
|
150
236
|
this._clearSilenceTimer();
|
|
237
|
+
// Record turn start only once (first speech_start in this turn)
|
|
151
238
|
if (this.turnStartMs === null) {
|
|
152
239
|
this.turnStartMs = event.timestamp;
|
|
153
240
|
}
|
|
@@ -156,33 +243,41 @@ export class HeuristicEndpointDetector extends EventEmitter {
|
|
|
156
243
|
case 'speech_end': {
|
|
157
244
|
this.speechActive = false;
|
|
158
245
|
if (!this.accumulatedText) {
|
|
159
|
-
//
|
|
246
|
+
// No transcript has arrived yet -- nothing to flush.
|
|
247
|
+
// This can happen when the VAD detects a very short burst of
|
|
248
|
+
// noise that doesn't produce any STT output.
|
|
160
249
|
break;
|
|
161
250
|
}
|
|
162
251
|
if (TERMINAL_PUNCTUATION.test(this.accumulatedText)) {
|
|
163
|
-
// Sentence-terminal punctuation
|
|
252
|
+
// Sentence-terminal punctuation detected -> fire immediately.
|
|
253
|
+
// This is the fast path that eliminates the 1.5 s silence wait.
|
|
164
254
|
this._emitTurnComplete('punctuation', event.timestamp);
|
|
165
255
|
}
|
|
166
256
|
else {
|
|
167
|
-
// No punctuation
|
|
257
|
+
// No terminal punctuation -> start the silence timer.
|
|
258
|
+
// If the user doesn't resume speaking within silenceTimeoutMs,
|
|
259
|
+
// we'll fire turn_complete with reason 'silence_timeout'.
|
|
168
260
|
this._startSilenceTimer(event.timestamp);
|
|
169
261
|
}
|
|
170
262
|
break;
|
|
171
263
|
}
|
|
172
264
|
case 'silence': {
|
|
173
|
-
// Periodic heartbeat
|
|
174
|
-
// handles
|
|
265
|
+
// Periodic heartbeat -- no action required. The silence timer
|
|
266
|
+
// (if running) handles delayed turn-completion independently.
|
|
175
267
|
break;
|
|
176
268
|
}
|
|
177
269
|
}
|
|
178
270
|
}
|
|
179
271
|
// ---------------------------------------------------------------------------
|
|
180
|
-
// IEndpointDetector
|
|
272
|
+
// IEndpointDetector -- reset
|
|
181
273
|
// ---------------------------------------------------------------------------
|
|
182
274
|
/**
|
|
183
275
|
* Reset all internal state, cancel pending timers, and prepare the detector
|
|
184
|
-
* for the next user turn.
|
|
185
|
-
*
|
|
276
|
+
* for the next user turn.
|
|
277
|
+
*
|
|
278
|
+
* Called by the pipeline after each `turn_complete` event (both internally
|
|
279
|
+
* and by the orchestrator's flush_complete handler) to ensure clean state
|
|
280
|
+
* before audio for the next turn begins to arrive.
|
|
186
281
|
*/
|
|
187
282
|
reset() {
|
|
188
283
|
this._clearSilenceTimer();
|
|
@@ -198,11 +293,17 @@ export class HeuristicEndpointDetector extends EventEmitter {
|
|
|
198
293
|
* Emit `turn_complete` with the currently accumulated transcript and then
|
|
199
294
|
* reset internal state so the detector is ready for the next turn.
|
|
200
295
|
*
|
|
201
|
-
*
|
|
202
|
-
*
|
|
203
|
-
*
|
|
296
|
+
* The reset happens BEFORE the emit to ensure that any re-entrant listeners
|
|
297
|
+
* (e.g. an endpoint detector handler that immediately calls pushVadEvent)
|
|
298
|
+
* see clean state.
|
|
299
|
+
*
|
|
300
|
+
* @param reason - The semantic reason driving this completion.
|
|
301
|
+
* @param speechEndTimestamp - Unix epoch ms timestamp of the `speech_end` event,
|
|
302
|
+
* used to compute `durationMs` as `speechEndTimestamp - turnStartMs`.
|
|
204
303
|
*/
|
|
205
304
|
_emitTurnComplete(reason, speechEndTimestamp) {
|
|
305
|
+
// Compute speech duration. Falls back to 0 if no speech_start was recorded
|
|
306
|
+
// (defensive: should not happen in normal operation).
|
|
206
307
|
const durationMs = this.turnStartMs !== null ? speechEndTimestamp - this.turnStartMs : 0;
|
|
207
308
|
const event = {
|
|
208
309
|
transcript: this.accumulatedText,
|
|
@@ -216,9 +317,14 @@ export class HeuristicEndpointDetector extends EventEmitter {
|
|
|
216
317
|
}
|
|
217
318
|
/**
|
|
218
319
|
* Start the silence-timeout timer. If the user does not resume speaking
|
|
219
|
-
* within {@link silenceTimeoutMs} ms the detector fires `turn_complete
|
|
320
|
+
* within {@link silenceTimeoutMs} ms, the detector fires `turn_complete`
|
|
321
|
+
* with reason `'silence_timeout'`.
|
|
322
|
+
*
|
|
323
|
+
* Any previously running silence timer is cleared first to prevent
|
|
324
|
+
* double-fires from rapid speech_end -> speech_start -> speech_end sequences.
|
|
220
325
|
*
|
|
221
|
-
* @param speechEndTimestamp
|
|
326
|
+
* @param speechEndTimestamp - Timestamp passed through to {@link _emitTurnComplete}
|
|
327
|
+
* for duration calculation.
|
|
222
328
|
*/
|
|
223
329
|
_startSilenceTimer(speechEndTimestamp) {
|
|
224
330
|
this._clearSilenceTimer();
|
|
@@ -229,6 +335,7 @@ export class HeuristicEndpointDetector extends EventEmitter {
|
|
|
229
335
|
}
|
|
230
336
|
/**
|
|
231
337
|
* Cancel a pending silence timer without any side effects.
|
|
338
|
+
* Safe to call when no timer is active (no-op).
|
|
232
339
|
*/
|
|
233
340
|
_clearSilenceTimer() {
|
|
234
341
|
if (this.silenceTimer !== null) {
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"HeuristicEndpointDetector.js","sourceRoot":"","sources":["../../src/voice-pipeline/HeuristicEndpointDetector.ts"],"names":[],"mappings":"AAAA
|
|
1
|
+
{"version":3,"file":"HeuristicEndpointDetector.js","sourceRoot":"","sources":["../../src/voice-pipeline/HeuristicEndpointDetector.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAgCG;AAEH,OAAO,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAQ3C,8EAA8E;AAC9E,YAAY;AACZ,8EAA8E;AAE9E;;;;;;;;;;GAUG;AACH,MAAM,0BAA0B,GAAG,IAAK,CAAC;AAEzC;;;;;;;GAOG;AACH,MAAM,oBAAoB,GAAG,QAAQ,CAAC;AAEtC;;;;;;;;;;;;;;;;GAgBG;AACH,MAAM,mBAAmB,GAAwB,IAAI,GAAG,CAAC;IACvD,QAAQ;IACR,MAAM;IACN,MAAM;IACN,IAAI;IACJ,QAAQ;IACR,OAAO;IACP,KAAK;IACL,QAAQ;IACR,OAAO;IACP,MAAM;IACN,KAAK;IACL,KAAK;IACL,QAAQ;CACT,CAAC,CAAC;AAwBH,8EAA8E;AAC9E,iBAAiB;AACjB,8EAA8E;AAE9E;;;;;;;;;;;;;;;;;;;;;;;;GAwBG;AACH,MAAM,OAAO,yBACX,SAAQ,YAAY;IAgDpB,8EAA8E;IAC9E,cAAc;IACd,8EAA8E;IAE9E;;;;OAIG;IACH,YAAY,UAA4C,EAAE;QACxD,KAAK,EAAE,CAAC;QAvDV;;;;;WAKG;QACM,SAAI,GAA8B,WAAW,CAAC;QAKvD;;;;WAIG;QACK,oBAAe,GAAG,EAAE,CAAC;QAE7B;;;;WAIG;QACK,iBAAY,GAAG,KAAK,CAAC;QAE7B;;;WAGG;QACK,iBAAY,GAAyC,IAAI,CAAC;QAElE;;;;WAIG;QACK,gBAAW,GAAkB,IAAI,CAAC;QAE1C;;;;WAIG;QACK,mBAAc,GAAG,CAAC,CAAC;QAazB,IAAI,CAAC,gBAAgB,GAAG,OAAO,CAAC,gBAAgB,IAAI,0BAA0B,CAAC;IACjF,CAAC;IAED,8EAA8E;IAC9E,sCAAsC;IACtC,8EAA8E;IAE9E;;;;;;;;;;;;;;;;OAgBG;IACH,cAAc,CAAC,UAA2B;QACxC,IAAI,CAAC,UAAU,CAAC,OAAO,EAAE,CAAC;YACxB,kEAAkE;YAClE,gDAAgD;YAChD,OAAO;QACT,CAAC;QAED,MAAM,IAAI,GAAG,UAAU,CAAC,IAAI,CAAC;QAC7B,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;QAE7C,uEAAuE;QACvE,oEAAoE;QACpE,IAAI,mBAAmB,CAAC,GAAG,CAAC,UAAU,CAAC,EAAE,CAAC;YACxC,IAAI,CAAC,IAAI,CAAC,sBAAsB,EAAE,EAAE,IAAI,EAAE,CAAC,CAAC;YAC5C,OAAO;QACT,CAAC;QAED,kEAAkE;QAClE,kEAAkE;QAClE,yEAAyE;QACzE,IAAI,CAAC,eAAe,GAAG,IAAI,CAAC;QAC5B,IAAI,CAAC,cAAc,GAAG,UAAU,CAAC,UAAU,CAAC;IAC9C,CAAC;IAED,8EAA8E;IAC9E,oCAAoC;IACpC,8EAA8E;IAE9E;;;;;;;;;;;;;;;;;;;OAmBG;IACH,YAAY,CAAC,KAAe;QAC1B,QAAQ,KAAK,CAAC,IAAI,EAAE,CAAC;YACnB,KAAK,cAAc,CAAC,CAAC,CAAC;gBACpB,IAAI,CAAC,YAAY,GAAG,IAAI,CAAC;gBACzB,iEAAiE;gBACjE,IAAI,CAAC,kBAAkB,EAAE,CAAC;gBAC1B,gEAAgE;gBAChE,IAAI,IAAI,CAAC,WAAW,KAAK,IAAI,EAAE,CAAC;oBAC9B,IAAI,CAAC,WAAW,GAAG,KAAK,CAAC,SAAS,CAAC;gBACrC,CAAC;gBACD,MAAM;YACR,CAAC;YAED,KAAK,YAAY,CAAC,CAAC,CAAC;gBAClB,IAAI,CAAC,YAAY,GAAG,KAAK,CAAC;gBAE1B,IAAI,CAAC,IAAI,CAAC,eAAe,EAAE,CAAC;oBAC1B,qDAAqD;oBACrD,6DAA6D;oBAC7D,6CAA6C;oBAC7C,MAAM;gBACR,CAAC;gBAED,IAAI,oBAAoB,CAAC,IAAI,CAAC,IAAI,CAAC,eAAe,CAAC,EAAE,CAAC;oBACpD,8DAA8D;oBAC9D,gEAAgE;oBAChE,IAAI,CAAC,iBAAiB,CAAC,aAAa,EAAE,KAAK,CAAC,SAAS,CAAC,CAAC;gBACzD,CAAC;qBAAM,CAAC;oBACN,sDAAsD;oBACtD,+DAA+D;oBAC/D,0DAA0D;oBAC1D,IAAI,CAAC,kBAAkB,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC;gBAC3C,CAAC;gBACD,MAAM;YACR,CAAC;YAED,KAAK,SAAS,CAAC,CAAC,CAAC;gBACf,8DAA8D;gBAC9D,8DAA8D;gBAC9D,MAAM;YACR,CAAC;QACH,CAAC;IACH,CAAC;IAED,8EAA8E;IAC9E,6BAA6B;IAC7B,8EAA8E;IAE9E;;;;;;;OAOG;IACH,KAAK;QACH,IAAI,CAAC,kBAAkB,EAAE,CAAC;QAC1B,IAAI,CAAC,eAAe,GAAG,EAAE,CAAC;QAC1B,IAAI,CAAC,YAAY,GAAG,KAAK,CAAC;QAC1B,IAAI,CAAC,WAAW,GAAG,IAAI,CAAC;QACxB,IAAI,CAAC,cAAc,GAAG,CAAC,CAAC;IAC1B,CAAC;IAED,8EAA8E;IAC9E,kBAAkB;IAClB,8EAA8E;IAE9E;;;;;;;;;;;OAWG;IACK,iBAAiB,CACvB,MAAmC,EACnC,kBAA0B;QAE1B,2EAA2E;QAC3E,sDAAsD;QACtD,MAAM,UAAU,GACd,IAAI,CAAC,WAAW,KAAK,IAAI,CAAC,CAAC,CAAC,kBAAkB,GAAG,IAAI,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,CAAC;QAExE,MAAM,KAAK,GAAsB;YAC/B,UAAU,EAAE,IAAI,CAAC,eAAe;YAChC,UAAU,EAAE,IAAI,CAAC,cAAc;YAC/B,UAAU;YACV,MAAM;SACP,CAAC;QAEF,0EAA0E;QAC1E,IAAI,CAAC,KAAK,EAAE,CAAC;QAEb,IAAI,CAAC,IAAI,CAAC,eAAe,EAAE,KAAK,CAAC,CAAC;IACpC,CAAC;IAED;;;;;;;;;;OAUG;IACK,kBAAkB,CAAC,kBAA0B;QACnD,IAAI,CAAC,kBAAkB,EAAE,CAAC;QAC1B,IAAI,CAAC,YAAY,GAAG,UAAU,CAAC,GAAG,EAAE;YAClC,IAAI,CAAC,YAAY,GAAG,IAAI,CAAC;YACzB,IAAI,CAAC,iBAAiB,CAAC,iBAAiB,EAAE,kBAAkB,CAAC,CAAC;QAChE,CAAC,EAAE,IAAI,CAAC,gBAAgB,CAAC,CAAC;IAC5B,CAAC;IAED;;;OAGG;IACK,kBAAkB;QACxB,IAAI,IAAI,CAAC,YAAY,KAAK,IAAI,EAAE,CAAC;YAC/B,YAAY,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;YAChC,IAAI,CAAC,YAAY,GAAG,IAAI,CAAC;QAC3B,CAAC;IACH,CAAC;CACF"}
|
|
@@ -1,36 +1,91 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* @module voice-pipeline/SoftFadeBargeinHandler
|
|
3
3
|
*
|
|
4
|
-
* Implements a three-tier soft-fade barge-in policy
|
|
4
|
+
* Implements a three-tier soft-fade barge-in policy that maps detected speech
|
|
5
|
+
* duration to one of three actions: ignore, pause (with fade-out), or cancel.
|
|
5
6
|
*
|
|
6
|
-
*
|
|
7
|
-
*
|
|
8
|
-
*
|
|
9
|
-
*
|
|
7
|
+
* ## Three-tier logic
|
|
8
|
+
*
|
|
9
|
+
* The handler divides the speech duration axis into three regions:
|
|
10
|
+
*
|
|
11
|
+
* ```
|
|
12
|
+
* 0 ms ignoreMs cancelMs
|
|
13
|
+
* |-------- ignore --------|-------- pause --------|-------- cancel -------->
|
|
14
|
+
* (noise) (fade-out) (hard stop)
|
|
15
|
+
* ```
|
|
16
|
+
*
|
|
17
|
+
* | Region | Action | Rationale |
|
|
18
|
+
* |---------------------------------|----------|-----------------------------------------------|
|
|
19
|
+
* | `speechDurationMs < ignoreMs` | `ignore` | Too short to be intentional (noise, breath). |
|
|
20
|
+
* | `ignoreMs <= speech < cancelMs` | `pause` | Probably intentional; fade out gracefully. |
|
|
21
|
+
* | `speechDurationMs >= cancelMs` | `cancel` | Definitely intentional; stop immediately. |
|
|
22
|
+
*
|
|
23
|
+
* ## Configurable thresholds
|
|
24
|
+
*
|
|
25
|
+
* - **`ignoreMs`** (default 100 ms): The noise floor. Anything shorter than
|
|
26
|
+
* this is dismissed. Set lower in quiet environments, higher in noisy ones.
|
|
27
|
+
*
|
|
28
|
+
* - **`cancelMs`** (default 2,000 ms): The hard-stop ceiling. By this point,
|
|
29
|
+
* the user has clearly been speaking for a while and wants to take over.
|
|
30
|
+
* The pipeline should stop TTS immediately rather than fading.
|
|
31
|
+
*
|
|
32
|
+
* - **`fadeMs`** (default 200 ms): The duration of the audio fade-out applied
|
|
33
|
+
* during a `'pause'` action. Shorter fades (100 ms) feel snappier; longer
|
|
34
|
+
* fades (300+ ms) feel smoother but delay the user's ability to be heard.
|
|
35
|
+
*
|
|
36
|
+
* ## When to use soft-fade vs hard-cut
|
|
37
|
+
*
|
|
38
|
+
* Soft-fade is preferred when:
|
|
39
|
+
* - The environment is noisy and false barge-in detections are common.
|
|
40
|
+
* - The conversation is measured/educational and abrupt cuts feel jarring.
|
|
41
|
+
* - The TTS voice has long trailing prosody that benefits from a fade.
|
|
42
|
+
*
|
|
43
|
+
* Use {@link HardCutBargeinHandler} when:
|
|
44
|
+
* - The conversation is fast-paced (customer support, command interfaces).
|
|
45
|
+
* - Audio quality is high and false positives are rare.
|
|
46
|
+
* - Minimal interruption latency is critical.
|
|
47
|
+
*
|
|
48
|
+
* @see {@link HardCutBargeinHandler} for the binary hard-cut alternative.
|
|
49
|
+
* @see {@link IBargeinHandler} for the interface contract.
|
|
10
50
|
*/
|
|
11
51
|
import type { BargeinAction, BargeinContext, IBargeinHandler } from './types.js';
|
|
12
52
|
/**
|
|
13
53
|
* Construction options for {@link SoftFadeBargeinHandler}.
|
|
54
|
+
*
|
|
55
|
+
* @example
|
|
56
|
+
* ```typescript
|
|
57
|
+
* const handler = new SoftFadeBargeinHandler({
|
|
58
|
+
* ignoreMs: 80,
|
|
59
|
+
* cancelMs: 1500,
|
|
60
|
+
* fadeMs: 150,
|
|
61
|
+
* });
|
|
62
|
+
* ```
|
|
14
63
|
*/
|
|
15
64
|
export interface SoftFadeBargeinHandlerOptions {
|
|
16
65
|
/**
|
|
17
66
|
* Speech duration threshold in milliseconds below which the barge-in is
|
|
18
|
-
* treated as accidental noise and ignored.
|
|
67
|
+
* treated as accidental noise and ignored. This is the lower boundary
|
|
68
|
+
* of the "pause" region.
|
|
19
69
|
*
|
|
20
70
|
* @defaultValue 100
|
|
21
71
|
*/
|
|
22
72
|
ignoreMs?: number;
|
|
23
73
|
/**
|
|
24
74
|
* Speech duration threshold in milliseconds at or above which the barge-in
|
|
25
|
-
* triggers an immediate cancel rather than a fade-out pause.
|
|
26
|
-
*
|
|
75
|
+
* triggers an immediate cancel rather than a fade-out pause. This is the
|
|
76
|
+
* upper boundary of the "pause" region. Must be greater than {@link ignoreMs}
|
|
77
|
+
* for the pause (fade) region to exist.
|
|
27
78
|
*
|
|
28
79
|
* @defaultValue 2000
|
|
29
80
|
*/
|
|
30
81
|
cancelMs?: number;
|
|
31
82
|
/**
|
|
32
83
|
* Duration of the TTS fade-out in milliseconds applied when the speech
|
|
33
|
-
* duration falls in the range `[ignoreMs, cancelMs)`.
|
|
84
|
+
* duration falls in the "pause" range `[ignoreMs, cancelMs)`.
|
|
85
|
+
*
|
|
86
|
+
* The fade-out is applied client-side; the server sends a `{ type: 'pause', fadeMs }`
|
|
87
|
+
* control message and the client's audio player reduces volume linearly
|
|
88
|
+
* over this duration.
|
|
34
89
|
*
|
|
35
90
|
* @defaultValue 200
|
|
36
91
|
*/
|
|
@@ -39,38 +94,40 @@ export interface SoftFadeBargeinHandlerOptions {
|
|
|
39
94
|
/**
|
|
40
95
|
* Barge-in handler that applies a three-tier soft-fade strategy.
|
|
41
96
|
*
|
|
42
|
-
* The handler
|
|
97
|
+
* The handler is stateless -- each {@link handleBargein} call is evaluated
|
|
98
|
+
* independently with no memory of previous barge-in events.
|
|
43
99
|
*
|
|
44
|
-
*
|
|
45
|
-
*
|
|
46
|
-
* | `< ignoreMs` | `ignore` — noise, continue TTS uninterrupted |
|
|
47
|
-
* | `>= ignoreMs < cancelMs` | `pause` with `fadeMs` fade-out |
|
|
48
|
-
* | `>= cancelMs` | `cancel` with `'[interrupted]'` marker |
|
|
100
|
+
* @see {@link IBargeinHandler} for the interface contract.
|
|
101
|
+
* @see {@link HardCutBargeinHandler} for the binary hard-cut alternative.
|
|
49
102
|
*
|
|
50
103
|
* @example
|
|
51
|
-
* ```
|
|
104
|
+
* ```typescript
|
|
52
105
|
* const handler = new SoftFadeBargeinHandler({ ignoreMs: 80, cancelMs: 1500, fadeMs: 150 });
|
|
53
|
-
*
|
|
54
|
-
* handler.handleBargein({ speechDurationMs:
|
|
55
|
-
* handler.handleBargein({ speechDurationMs:
|
|
106
|
+
*
|
|
107
|
+
* handler.handleBargein({ speechDurationMs: 30, ... }); // -> { type: 'ignore' }
|
|
108
|
+
* handler.handleBargein({ speechDurationMs: 500, ... }); // -> { type: 'pause', fadeMs: 150 }
|
|
109
|
+
* handler.handleBargein({ speechDurationMs: 1600, ... }); // -> { type: 'cancel', injectMarker: '[interrupted]' }
|
|
56
110
|
* ```
|
|
57
111
|
*/
|
|
58
112
|
export declare class SoftFadeBargeinHandler implements IBargeinHandler {
|
|
59
113
|
/**
|
|
60
114
|
* The interruption strategy implemented by this handler.
|
|
61
|
-
* Always `'soft-fade'
|
|
115
|
+
* Always `'soft-fade'` -- TTS audio is faded out over a configurable window.
|
|
62
116
|
*/
|
|
63
117
|
readonly mode: "soft-fade";
|
|
64
118
|
/**
|
|
65
119
|
* Speech duration below which the barge-in is dismissed as noise.
|
|
120
|
+
* @see {@link SoftFadeBargeinHandlerOptions.ignoreMs}
|
|
66
121
|
*/
|
|
67
122
|
private readonly ignoreMs;
|
|
68
123
|
/**
|
|
69
124
|
* Speech duration at or above which the barge-in escalates to a full cancel.
|
|
125
|
+
* @see {@link SoftFadeBargeinHandlerOptions.cancelMs}
|
|
70
126
|
*/
|
|
71
127
|
private readonly cancelMs;
|
|
72
128
|
/**
|
|
73
129
|
* Duration of the TTS audio fade-out applied during a `'pause'` action.
|
|
130
|
+
* @see {@link SoftFadeBargeinHandlerOptions.fadeMs}
|
|
74
131
|
*/
|
|
75
132
|
private readonly fadeMs;
|
|
76
133
|
/**
|
|
@@ -84,12 +141,20 @@ export declare class SoftFadeBargeinHandler implements IBargeinHandler {
|
|
|
84
141
|
* Evaluate the barge-in context and return the pipeline action.
|
|
85
142
|
*
|
|
86
143
|
* Decision tree (evaluated in order):
|
|
87
|
-
*
|
|
88
|
-
*
|
|
89
|
-
*
|
|
144
|
+
*
|
|
145
|
+
* 1. `speechDurationMs < ignoreMs` -> `{ type: 'ignore' }`
|
|
146
|
+
* Too short to be intentional. Likely a lip smack, breath, or noise burst.
|
|
147
|
+
*
|
|
148
|
+
* 2. `speechDurationMs >= cancelMs` -> `{ type: 'cancel', injectMarker: '[interrupted]' }`
|
|
149
|
+
* The user has been speaking long enough that they clearly want to take over.
|
|
150
|
+
* Stop TTS immediately and mark the conversation as interrupted.
|
|
151
|
+
*
|
|
152
|
+
* 3. Otherwise (ignoreMs <= speech < cancelMs) -> `{ type: 'pause', fadeMs }`
|
|
153
|
+
* Probably intentional but not yet certain. Fade out TTS gracefully so the
|
|
154
|
+
* user can be heard. If the speech stops, the pipeline can resume playback.
|
|
90
155
|
*
|
|
91
156
|
* @param context - Snapshot of the barge-in state at the moment of detection.
|
|
92
|
-
* @returns The pipeline action to execute.
|
|
157
|
+
* @returns The pipeline action to execute. Always synchronous (no Promise).
|
|
93
158
|
*/
|
|
94
159
|
handleBargein(context: BargeinContext): BargeinAction;
|
|
95
160
|
}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"SoftFadeBargeinHandler.d.ts","sourceRoot":"","sources":["../../src/voice-pipeline/SoftFadeBargeinHandler.ts"],"names":[],"mappings":"AAAA
|
|
1
|
+
{"version":3,"file":"SoftFadeBargeinHandler.d.ts","sourceRoot":"","sources":["../../src/voice-pipeline/SoftFadeBargeinHandler.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAiDG;AAEH,OAAO,KAAK,EAAE,aAAa,EAAE,cAAc,EAAE,eAAe,EAAE,MAAM,YAAY,CAAC;AAMjF;;;;;;;;;;;GAWG;AACH,MAAM,WAAW,6BAA6B;IAC5C;;;;;;OAMG;IACH,QAAQ,CAAC,EAAE,MAAM,CAAC;IAElB;;;;;;;OAOG;IACH,QAAQ,CAAC,EAAE,MAAM,CAAC;IAElB;;;;;;;;;OASG;IACH,MAAM,CAAC,EAAE,MAAM,CAAC;CACjB;AAMD;;;;;;;;;;;;;;;;;GAiBG;AACH,qBAAa,sBAAuB,YAAW,eAAe;IAC5D;;;OAGG;IACH,QAAQ,CAAC,IAAI,EAAG,WAAW,CAAU;IAErC;;;OAGG;IACH,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAS;IAElC;;;OAGG;IACH,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAS;IAElC;;;OAGG;IACH,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAS;IAEhC;;;;;OAKG;gBACS,OAAO,GAAE,6BAAkC;IAMvD;;;;;;;;;;;;;;;;;;OAkBG;IACH,aAAa,CAAC,OAAO,EAAE,cAAc,GAAG,aAAa;CAiBtD"}
|