osborn 0.9.35 → 0.9.37
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +76 -46
- package/dist/pipeline-direct-llm.d.ts +7 -0
- package/dist/pipeline-direct-llm.js +81 -23
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
// Load environment variables FIRST before any other imports
|
|
2
2
|
import 'dotenv/config';
|
|
3
3
|
import { voice, initializeLogger } from '@livekit/agents';
|
|
4
|
-
import { Room, RoomEvent } from '@livekit/rtc-node';
|
|
4
|
+
import { Room, RoomEvent, RemoteParticipant } from '@livekit/rtc-node';
|
|
5
5
|
import { AccessToken } from 'livekit-server-sdk';
|
|
6
6
|
// Initialize logger before anything else
|
|
7
7
|
initializeLogger({ pretty: true, level: 'info' });
|
|
@@ -1294,9 +1294,34 @@ async function main() {
|
|
|
1294
1294
|
console.warn('⚠️ Failed to read JSONL for interruption context:', err);
|
|
1295
1295
|
}
|
|
1296
1296
|
}
|
|
1297
|
-
// Store — consumed when user's next message arrives via chat()
|
|
1298
|
-
|
|
1299
|
-
|
|
1297
|
+
// Store — consumed when user's next message arrives via chat().
|
|
1298
|
+
// Preserve any already-buffered suppressedText (the user may have started speaking
|
|
1299
|
+
// BEFORE the previous TTS completed, and we may have already suppressed in-flight
|
|
1300
|
+
// tts_say events that arrived during that overlap).
|
|
1301
|
+
const carriedSuppressed = lastInterruption?.suppressedText ?? '';
|
|
1302
|
+
lastInterruption = { spokenText: fullText, recentMessages, suppressedText: carriedSuppressed, timestamp: Date.now() };
|
|
1303
|
+
console.log(`📋 Interruption context stored (text: ${fullText.length} chars, JSONL: ${recentMessages.length} chars, suppressed carried: ${carriedSuppressed.length} chars)`);
|
|
1304
|
+
}
|
|
1305
|
+
/**
|
|
1306
|
+
* Append text the agent tried to say while the user was speaking, but which we
|
|
1307
|
+
* suppressed at the tts_say gate to avoid talking over them. Folded into
|
|
1308
|
+
* lastInterruption so it travels to Claude in the next chat() call.
|
|
1309
|
+
* If no interruption context exists yet (e.g. user just started speaking with no
|
|
1310
|
+
* prior TTS interrupt), creates a fresh entry.
|
|
1311
|
+
*/
|
|
1312
|
+
function appendSuppressedText(text) {
|
|
1313
|
+
const t = text.trim();
|
|
1314
|
+
if (!t)
|
|
1315
|
+
return;
|
|
1316
|
+
if (lastInterruption) {
|
|
1317
|
+
const sep = lastInterruption.suppressedText ? '\n' : '';
|
|
1318
|
+
lastInterruption.suppressedText = lastInterruption.suppressedText + sep + t;
|
|
1319
|
+
lastInterruption.timestamp = Date.now();
|
|
1320
|
+
}
|
|
1321
|
+
else {
|
|
1322
|
+
lastInterruption = { spokenText: '', recentMessages: '', suppressedText: t, timestamp: Date.now() };
|
|
1323
|
+
}
|
|
1324
|
+
console.log(`🤐 Suppressed text buffered (+${t.length} chars, total ${lastInterruption.suppressedText.length}): "${t.substring(0, 80)}${t.length > 80 ? '...' : ''}"`);
|
|
1300
1325
|
}
|
|
1301
1326
|
/**
|
|
1302
1327
|
* Callback for PipelineDirectLLM — returns pending interruption context and clears it.
|
|
@@ -1311,7 +1336,11 @@ async function main() {
|
|
|
1311
1336
|
lastInterruption = null;
|
|
1312
1337
|
return null;
|
|
1313
1338
|
}
|
|
1314
|
-
const ctx = {
|
|
1339
|
+
const ctx = {
|
|
1340
|
+
spokenText: lastInterruption.spokenText,
|
|
1341
|
+
recentMessages: lastInterruption.recentMessages,
|
|
1342
|
+
suppressedText: lastInterruption.suppressedText,
|
|
1343
|
+
};
|
|
1315
1344
|
lastInterruption = null;
|
|
1316
1345
|
return ctx;
|
|
1317
1346
|
}
|
|
@@ -1821,6 +1850,15 @@ async function main() {
|
|
|
1821
1850
|
console.log(`🔇 tts_say fired but text is empty — skipping`);
|
|
1822
1851
|
return;
|
|
1823
1852
|
}
|
|
1853
|
+
// Suppress while the user is mid-utterance. Without this, agent text generated
|
|
1854
|
+
// in parallel by the Claude SDK plays right over the user — same problem as
|
|
1855
|
+
// pre-interrupt overlap, but at the *output* side. The suppressed text gets
|
|
1856
|
+
// folded into lastInterruption so the next chat() to Claude carries it as
|
|
1857
|
+
// "you wrote this but the user did not hear it — re-articulate if relevant."
|
|
1858
|
+
if (userState === 'speaking') {
|
|
1859
|
+
appendSuppressedText(data.text);
|
|
1860
|
+
return;
|
|
1861
|
+
}
|
|
1824
1862
|
const sayId = Date.now(); // simple ID to correlate start/end logs
|
|
1825
1863
|
console.log(`🗣️ [${sayId}] session.say START (${data.text.length} chars): "${data.text}"`);
|
|
1826
1864
|
// Forward spoken text + audio to meeting output page when bot is in a meeting.
|
|
@@ -2495,46 +2533,46 @@ async function main() {
|
|
|
2495
2533
|
// EARLIEST possible "user is speaking" signal in our setup. Driven by LiveKit's
|
|
2496
2534
|
// server-side audio-level VAD on the participant's WebRTC track — fires ~50-100ms
|
|
2497
2535
|
// after mic onset, independent of Deepgram STT or any local VAD (we don't run one).
|
|
2498
|
-
// Same signal the LiveKit room uses to identify active speakers, so it's tuned for
|
|
2499
|
-
// real speech and ignores low-level noise.
|
|
2500
2536
|
//
|
|
2501
|
-
// Flow: user starts talking → ActiveSpeakersChanged includes
|
|
2502
|
-
// if agent is currently speaking → interrupt the SpeechHandle to flush TTS.
|
|
2503
|
-
// The existing handleSpeechDone callback captures the spoken-text
|
|
2504
|
-
//
|
|
2505
|
-
// enrich the user's message with [INTERRUPTED] context — so the
|
|
2506
|
-
// note flow is preserved even though we're
|
|
2537
|
+
// Flow: user starts talking → ActiveSpeakersChanged includes a RemoteParticipant →
|
|
2538
|
+
// if agent is currently speaking → interrupt the SpeechHandle to flush TTS playback.
|
|
2539
|
+
// The existing handleSpeechDone callback captures the spoken-text
|
|
2540
|
+
// + JSONL context into lastInterruption; PipelineDirectLLM consumes it on the next
|
|
2541
|
+
// chat() call to enrich the user's message with [INTERRUPTED] context — so the
|
|
2542
|
+
// post-interrupt note flow is preserved even though we're cutting TTS earlier.
|
|
2543
|
+
//
|
|
2544
|
+
// Filter is `instanceof RemoteParticipant`. The agent IS the LocalParticipant in this
|
|
2545
|
+
// room, and when its TTS plays it appears in the active-speakers list too. An earlier
|
|
2546
|
+
// attempt that compared `s.identity !== room.localParticipant?.identity` failed because
|
|
2547
|
+
// localParticipant.identity could be undefined at event-fire time, letting the agent's
|
|
2548
|
+
// own speech trigger a self-interrupt. The type check does not depend on identity being set.
|
|
2507
2549
|
//
|
|
2508
2550
|
// Realtime mode skipped — the SDK handles interruption internally there, and manual
|
|
2509
2551
|
// interrupt for Gemini realtime crashes its state machine (code 1008, memory v0.4.5).
|
|
2510
2552
|
let lastActiveSpeakerInterruptAt = 0;
|
|
2511
2553
|
room.on(RoomEvent.ActiveSpeakersChanged, (speakers) => {
|
|
2554
|
+
if (!Array.isArray(speakers) || speakers.length === 0)
|
|
2555
|
+
return;
|
|
2556
|
+
const remoteSpeakers = speakers.filter((s) => s instanceof RemoteParticipant);
|
|
2557
|
+
if (remoteSpeakers.length === 0)
|
|
2558
|
+
return;
|
|
2512
2559
|
if (currentVoiceMode === 'realtime')
|
|
2513
2560
|
return;
|
|
2514
2561
|
if (agentState !== 'speaking')
|
|
2515
2562
|
return;
|
|
2516
|
-
const localIdentity = room.localParticipant?.identity;
|
|
2517
|
-
const remoteSpeaking = Array.isArray(speakers) && speakers.some((s) => s && s.identity && s.identity !== localIdentity);
|
|
2518
|
-
if (!remoteSpeaking)
|
|
2519
|
-
return;
|
|
2520
|
-
// Debounce: avoid log spam when audio level oscillates above/below threshold.
|
|
2521
|
-
// interrupt() itself is idempotent on an already-interrupted SpeechHandle, but
|
|
2522
|
-
// we suppress duplicate logs within 1s.
|
|
2523
2563
|
const now = Date.now();
|
|
2524
|
-
|
|
2525
|
-
try {
|
|
2526
|
-
currentSession?.interrupt();
|
|
2527
|
-
}
|
|
2528
|
-
catch { }
|
|
2529
|
-
return;
|
|
2530
|
-
}
|
|
2564
|
+
const debounced = now - lastActiveSpeakerInterruptAt < 1000;
|
|
2531
2565
|
lastActiveSpeakerInterruptAt = now;
|
|
2532
2566
|
try {
|
|
2533
|
-
|
|
2567
|
+
if (!debounced) {
|
|
2568
|
+
const ids = remoteSpeakers.map((s) => s.identity).join(',');
|
|
2569
|
+
console.log(`🎤 ActiveSpeakersChanged: remote speakers [${ids}] + agent speaking → interrupting TTS`);
|
|
2570
|
+
}
|
|
2534
2571
|
currentSession?.interrupt();
|
|
2535
2572
|
}
|
|
2536
2573
|
catch (err) {
|
|
2537
|
-
|
|
2574
|
+
if (!debounced)
|
|
2575
|
+
console.warn('⚠️ active-speaker interrupt failed:', err instanceof Error ? err.message : err);
|
|
2538
2576
|
}
|
|
2539
2577
|
});
|
|
2540
2578
|
room.on(RoomEvent.Disconnected, () => {
|
|
@@ -2830,28 +2868,23 @@ async function main() {
|
|
|
2830
2868
|
}
|
|
2831
2869
|
});
|
|
2832
2870
|
// User state tracking — prevents queue from colliding with server-side VAD.
|
|
2833
|
-
//
|
|
2834
|
-
//
|
|
2835
|
-
//
|
|
2836
|
-
//
|
|
2837
|
-
//
|
|
2838
|
-
//
|
|
2839
|
-
// a local VAD. interrupt() drains the currentSpeech + speech queue, killing TTS
|
|
2840
|
-
// playback in-flight. handleSpeechDone still captures the spoken-text + JSONL
|
|
2841
|
-
// context, consumed by PipelineDirectLLM on the next chat() call.
|
|
2842
|
-
// Realtime mode skipped — the SDK handles interruption internally there, and manual
|
|
2843
|
-
// interrupt for Gemini realtime crashes its state machine (code 1008, memory v0.4.5).
|
|
2871
|
+
// Also a secondary interrupt trigger: when Deepgram STT classifies speech onset,
|
|
2872
|
+
// it propagates here via agent_activity.onStartOfSpeech → _updateUserState('speaking').
|
|
2873
|
+
// Fires later than ActiveSpeakersChanged (Deepgram has ~100-300ms classification
|
|
2874
|
+
// latency vs LiveKit's ~50-100ms audio-level) but acts as a redundant fallback in
|
|
2875
|
+
// case the room-level event drops. interrupt() is idempotent on an already-
|
|
2876
|
+
// interrupted SpeechHandle so calling both paths is harmless.
|
|
2844
2877
|
sess.on('user_state_changed', (ev) => {
|
|
2845
2878
|
const prev = userState;
|
|
2846
2879
|
userState = ev.newState;
|
|
2847
2880
|
console.log(`👤 User state: ${prev} → ${ev.newState} (agent: ${agentState})`);
|
|
2848
2881
|
if (ev.newState === 'speaking' && agentState === 'speaking' && sessionVoiceMode !== 'realtime') {
|
|
2849
2882
|
try {
|
|
2850
|
-
console.log('🎤
|
|
2883
|
+
console.log('🎤 user_state_changed=speaking + agent speaking → interrupting TTS (fallback)');
|
|
2851
2884
|
currentSession?.interrupt();
|
|
2852
2885
|
}
|
|
2853
2886
|
catch (err) {
|
|
2854
|
-
console.warn('⚠️ user-
|
|
2887
|
+
console.warn('⚠️ user-state interrupt failed:', err instanceof Error ? err.message : err);
|
|
2855
2888
|
}
|
|
2856
2889
|
}
|
|
2857
2890
|
// When user stops speaking, retry voice queue — items may be waiting
|
|
@@ -3147,10 +3180,7 @@ async function main() {
|
|
|
3147
3180
|
clearInterval(readyInterval);
|
|
3148
3181
|
console.log('✅ agent_ready retries complete');
|
|
3149
3182
|
}, 20000);
|
|
3150
|
-
// Stop agent_ready retries on user speech
|
|
3151
|
-
// NB: input_speech_started is realtime-only — the SDK never emits it in STT pipeline
|
|
3152
|
-
// mode. The earliest onset signal in pipeline mode is user_state_changed → 'speaking',
|
|
3153
|
-
// wired further down. Don't add interrupt logic here.
|
|
3183
|
+
// Stop agent_ready retries on user speech
|
|
3154
3184
|
session.on('input_speech_started', () => {
|
|
3155
3185
|
readySent = true;
|
|
3156
3186
|
clearInterval(readyInterval);
|
|
@@ -14,6 +14,13 @@ import { EventEmitter } from 'events';
|
|
|
14
14
|
export interface InterruptionContext {
|
|
15
15
|
spokenText: string;
|
|
16
16
|
recentMessages: string;
|
|
17
|
+
/**
|
|
18
|
+
* Text the agent generated while the user was still speaking, which we
|
|
19
|
+
* suppressed at session.say() to avoid talking over the user. The agent
|
|
20
|
+
* receives this so it knows what it tried to say but the user did not hear,
|
|
21
|
+
* and can re-articulate the relevant bits in its next response.
|
|
22
|
+
*/
|
|
23
|
+
suppressedText: string;
|
|
17
24
|
}
|
|
18
25
|
export interface PipelineDirectOptions extends ClaudeLLMOptions {
|
|
19
26
|
onFastBrainResult?: (result: FastBrainPanelResult) => void;
|
|
@@ -83,31 +83,89 @@ export class PipelineDirectLLM extends llm.LLM {
|
|
|
83
83
|
}
|
|
84
84
|
}
|
|
85
85
|
console.log(`📥 [pipeline] chat() call #${callN} (${userText.length} chars): "${userText}"`);
|
|
86
|
-
//
|
|
86
|
+
// Always check the pending playback context — it can carry two independent
|
|
87
|
+
// signals: (a) an actual interruption (spokenText + recentMessages) when the
|
|
88
|
+
// user cut Osborn off mid-TTS, OR (b) suppressed text generated by the SDK
|
|
89
|
+
// while the user was speaking, regardless of whether they were actually
|
|
90
|
+
// interrupting active TTS. We need to forward BOTH cases so the agent knows
|
|
91
|
+
// what it produced that the user didn't hear, and so the buffer is cleared
|
|
92
|
+
// every turn even when there was no interrupt.
|
|
87
93
|
const interruptCtx = this.#opts.getAndConsumeInterruptionContext?.();
|
|
88
94
|
if (interruptCtx && userText.trim()) {
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
95
|
+
const hasInterrupt = !!interruptCtx.spokenText;
|
|
96
|
+
const hasSuppressed = !!interruptCtx.suppressedText;
|
|
97
|
+
const suppressedBlock = hasSuppressed
|
|
98
|
+
? [
|
|
99
|
+
``,
|
|
100
|
+
`Text you generated while the user was speaking — NOT played (we suppressed it so we wouldn't talk over them):`,
|
|
101
|
+
`"${interruptCtx.suppressedText}"`,
|
|
102
|
+
`If any of that is still relevant to the user's current message, re-articulate the key points naturally. If it's no longer relevant, drop it.`,
|
|
103
|
+
].join('\n')
|
|
104
|
+
: '';
|
|
105
|
+
let enrichedMessage;
|
|
106
|
+
if (hasInterrupt) {
|
|
107
|
+
// Actual mid-TTS interruption — keep the full [INTERRUPTED] template
|
|
108
|
+
console.log(`🔇 [pipeline] Enriching: interrupt (spoken=${interruptCtx.spokenText.length} chars, suppressed=${interruptCtx.suppressedText.length} chars)`);
|
|
109
|
+
this.#claudeLLM.interruptQuery().catch(() => { });
|
|
110
|
+
enrichedMessage = [
|
|
111
|
+
`[INTERRUPTED] The user interrupted your response mid-speech.`,
|
|
112
|
+
``,
|
|
113
|
+
`What the user heard before cutoff:`,
|
|
114
|
+
`"${interruptCtx.spokenText}"`,
|
|
115
|
+
``,
|
|
116
|
+
`WHAT THE USER DID NOT HEAR (you wrote this but it was cut off):`,
|
|
117
|
+
`Anything in "Your recent messages" below that appears AFTER the quoted heard text is content the user did not hear. The user has no memory of it.`,
|
|
118
|
+
``,
|
|
119
|
+
`Your recent messages (full untruncated — you wrote these):`,
|
|
120
|
+
interruptCtx.recentMessages || '(no recent messages found)',
|
|
121
|
+
suppressedBlock,
|
|
122
|
+
``,
|
|
123
|
+
`User's message: "${userText}"`,
|
|
124
|
+
``,
|
|
125
|
+
`CONTEXT PRESERVATION (READ THIS):`,
|
|
126
|
+
`The user has NO memory of unheard content. If any of it bears on their current message — answers a question they just asked, sets up a follow-up they're now asking about, or shows a knowledge gap and fills in a detail they're reacting to — you MUST surface it. Briefly is fine if their message is off-topic or explores a direction they haven't asked about yet. But never assume they remember what they never heard.`,
|
|
127
|
+
``,
|
|
128
|
+
`RESPOND with speech first, then act:`,
|
|
129
|
+
`- ALWAYS reply with at least one spoken sentence before doing any tool calls`,
|
|
130
|
+
`- If it's a quick side question, answer it then continue where you left off`,
|
|
131
|
+
`- If they want to change direction, acknowledge and follow their lead`,
|
|
132
|
+
`- Clarify when asked to or the question requires going over what you just said`,
|
|
133
|
+
`- If relevant details were cut off — whether they answer the current question or an earlier one — weave them back in naturally so the user stays in context without having to ask again.`,
|
|
134
|
+
].join('\n');
|
|
135
|
+
}
|
|
136
|
+
else if (hasSuppressed) {
|
|
137
|
+
// No real interrupt — user was speaking while we had text queued. They
|
|
138
|
+
// weren't cutting Osborn off, just talking over a gap. Don't claim an
|
|
139
|
+
// interrupt happened; symmetric structure to [INTERRUPTED] so Claude
|
|
140
|
+
// treats both signals consistently.
|
|
141
|
+
console.log(`🤐 [pipeline] Enriching: suppressed-only (${interruptCtx.suppressedText.length} chars, no interrupt)`);
|
|
142
|
+
enrichedMessage = [
|
|
143
|
+
`[CONTEXT] You generated speech while the user was already talking. None of it played.`,
|
|
144
|
+
``,
|
|
145
|
+
`What the user is saying now:`,
|
|
146
|
+
`"${userText}"`,
|
|
147
|
+
``,
|
|
148
|
+
`Text you produced that the user did NOT hear:`,
|
|
149
|
+
`"${interruptCtx.suppressedText}"`,
|
|
150
|
+
``,
|
|
151
|
+
`CONTEXT PRESERVATION (READ THIS):`,
|
|
152
|
+
`The user has NO memory of the unheard text. If any of it bears on their current message — answers a question they just asked, sets up a follow-up they're now asking about, or shows a knowledge gap and fills in a detail they're reacting to — you MUST surface it. Briefly is fine if their message is off-topic or explores a direction they haven't asked about yet. But never assume they remember what they never heard.`,
|
|
153
|
+
``,
|
|
154
|
+
`RESPOND with speech first, then act:`,
|
|
155
|
+
`- ALWAYS reply with at least one spoken sentence before doing any tool calls`,
|
|
156
|
+
`- Three likely cases — figure out which applies:`,
|
|
157
|
+
` (a) the user didn't realize you were responding → forward the key points of the unheard text`,
|
|
158
|
+
` (b) the user changed direction → drop the unheard text, follow their lead`,
|
|
159
|
+
` (c) the user's message builds on the unheard text → integrate it as if they'd heard it`,
|
|
160
|
+
`- Keep it tight — this is a voice response.`,
|
|
161
|
+
].join('\n');
|
|
162
|
+
}
|
|
163
|
+
else {
|
|
164
|
+
// Context object existed but both fields empty — defensive no-op,
|
|
165
|
+
// shouldn't happen because appendSuppressedText only creates entries
|
|
166
|
+
// when text is non-empty.
|
|
167
|
+
enrichedMessage = userText;
|
|
168
|
+
}
|
|
111
169
|
// Modify the last user message in chatCtx
|
|
112
170
|
for (let i = chatCtx.items.length - 1; i >= 0; i--) {
|
|
113
171
|
const item = chatCtx.items[i];
|