@pimote/pimote 0.2.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +43 -16
- package/client/build/_app/immutable/assets/0.C7loWTOC.css +2 -0
- package/client/build/_app/immutable/assets/2.DwPXxSa-.css +1 -0
- package/client/build/_app/immutable/chunks/-Lc-U-GJ.js +1 -0
- package/client/build/_app/immutable/chunks/{CT6ckxpD.js → CO_BwWGt.js} +1 -1
- package/client/build/_app/immutable/chunks/CklMSqcv.js +1 -0
- package/client/build/_app/immutable/chunks/D1INvMB9.js +1 -0
- package/client/build/_app/immutable/chunks/D1vhgXpq.js +5 -0
- package/client/build/_app/immutable/entry/{app.CNzpBgAg.js → app.B-HFVtpC.js} +2 -2
- package/client/build/_app/immutable/entry/start.DJTQ8-sD.js +1 -0
- package/client/build/_app/immutable/nodes/0.CepAO4xf.js +10 -0
- package/client/build/_app/immutable/nodes/{1.B8zmHMre.js → 1.CmxFYjRm.js} +1 -1
- package/client/build/_app/immutable/nodes/2.DAtqfmki.js +54 -0
- package/client/build/_app/version.json +1 -1
- package/client/build/index.html +7 -7
- package/package.json +7 -3
- package/server/dist/auto-drain-on-abort.js +49 -0
- package/server/dist/config.js +21 -0
- package/server/dist/extension-ui-bridge.js +14 -1
- package/server/dist/index.js +36 -1
- package/server/dist/message-mapper.js +38 -6
- package/server/dist/push-notification.js +11 -0
- package/server/dist/server.js +2 -2
- package/server/dist/session-manager.js +72 -4
- package/server/dist/voice/fsm/actions.js +6 -0
- package/server/dist/voice/fsm/events.js +7 -0
- package/server/dist/voice/fsm/reducer.js +74 -0
- package/server/dist/voice/fsm/reducers/lifecycle.js +158 -0
- package/server/dist/voice/fsm/reducers/streaming.js +220 -0
- package/server/dist/voice/fsm/reducers/walkback.js +73 -0
- package/server/dist/voice/fsm/state.js +21 -0
- package/server/dist/voice/fsm/text-extractor.js +128 -0
- package/server/dist/voice/index.js +336 -0
- package/server/dist/voice/interpreter-prompt.js +115 -0
- package/server/dist/voice/speechmux-client.js +153 -0
- package/server/dist/voice/state-machine.js +14 -0
- package/server/dist/voice/wait-for-idle.js +67 -0
- package/server/dist/voice/walk-back.js +198 -0
- package/server/dist/voice-orchestrator-boot.js +90 -0
- package/server/dist/voice-orchestrator.js +91 -0
- package/server/dist/ws-handler.js +112 -7
- package/shared/dist/index.d.ts +1 -0
- package/shared/dist/index.js +2 -0
- package/shared/dist/protocol.d.ts +614 -0
- package/shared/dist/protocol.js +30 -0
- package/client/build/_app/immutable/assets/0.DBrr7n4n.css +0 -2
- package/client/build/_app/immutable/assets/2.DE6k3bQj.css +0 -1
- package/client/build/_app/immutable/chunks/5vSSf6qG.js +0 -5
- package/client/build/_app/immutable/chunks/DlJOVoUQ.js +0 -1
- package/client/build/_app/immutable/chunks/YxmLwfhj.js +0 -1
- package/client/build/_app/immutable/chunks/yWVx3W2o.js +0 -1
- package/client/build/_app/immutable/entry/start.DYkTAHh1.js +0 -1
- package/client/build/_app/immutable/nodes/0.DNlQhEb_.js +0 -10
- package/client/build/_app/immutable/nodes/2.W9yV4-x2.js +0 -54
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
// Concern A: Lifecycle reducer.
|
|
2
|
+
//
|
|
3
|
+
// Responsible for:
|
|
4
|
+
// - Activate / deactivate of voice mode.
|
|
5
|
+
// - WS connection lifecycle (opened / failed / disconnected).
|
|
6
|
+
// - Buffering of outgoing speak frames during the `activating` window
|
|
7
|
+
// and flushing them on `ws:opened`.
|
|
8
|
+
//
|
|
9
|
+
// Holds NO knowledge of the streaming / walkback machines. Those plug in
|
|
10
|
+
// through the top-level dispatcher.
|
|
11
|
+
import { VOICE_CALL_ENDED_SENTINEL, VOICE_CALL_STARTED_SENTINEL } from '../../state-machine.js';
|
|
12
|
+
/**
 * Pure transition function for the lifecycle slice.
 *
 * Emits the bulk of the side-effect actions: model setup, sentinel user
 * message, WS open/close, deactivate-request.
 *
 * Frame emission policy: whenever the streaming machine produces a
 * `send_frame` action, it goes via the top-level dispatcher into
 * `bufferOrPassFrame()` below — which either forwards it (Active) or
 * appends to `pendingFrames` (Activating). On `ws:opened` we flush in
 * order.
 *
 * @param prev  Current lifecycle sub-state; its `kind` is one of
 *              `dormant`, `activating` or `active`.
 * @param event Incoming event. Handled types: `eb:activate`,
 *              `eb:deactivate`, `ws:opened`, `ws:open_failed`,
 *              `ws:disconnected`. Anything else is a no-op.
 * @param ctx   Read-only context; this function reads
 *              `ctx.interpreterApplied` and
 *              `ctx.config.defaultInterpreterModel.{provider,modelId}`.
 * @returns `{ next, interpreterAppliedNow, actions }` — the next
 *          lifecycle sub-state, whether the interpreter model is being
 *          applied by this very transition, and the actions for the
 *          shell to execute in order.
 */
export function reduceLifecycle(prev, event, ctx) {
    // Fresh "nothing happened" result; built per call so callers never
    // share an `actions` array.
    const unchanged = () => ({ next: prev, interpreterAppliedNow: false, actions: [] });
    switch (event.type) {
        case 'eb:activate': {
            if (prev.kind !== 'dormant') {
                // Re-activation while already active or activating is a no-op
                // — the orchestrator's bind path is supposed to be the single
                // owner. We log loudly in the shell; here we just stay put.
                return unchanged();
            }
            const actions = [];
            if (!ctx.interpreterApplied) {
                const { provider, modelId } = ctx.config.defaultInterpreterModel;
                actions.push({ kind: 'set_interpreter_model', provider, modelId });
            }
            actions.push({ kind: 'send_user_message', text: VOICE_CALL_STARTED_SENTINEL });
            actions.push({ kind: 'open_ws', url: event.msg.speechmuxWsUrl });
            return {
                next: {
                    kind: 'activating',
                    sessionId: event.msg.sessionId,
                    wsUrl: event.msg.speechmuxWsUrl,
                    pendingFrames: [],
                },
                interpreterAppliedNow: !ctx.interpreterApplied,
                actions,
            };
        }
        case 'eb:deactivate': {
            if (prev.kind === 'dormant') {
                return unchanged();
            }
            // Inject an explicit end-of-call sentinel into the conversation so
            // the agent has an in-history signal that voice mode is over —
            // mirrors the `<voice_call_started/>` sentinel on activate. Without
            // it, a session resumed in text mode after a call sees a wall of
            // prior `speak()` calls and tends to mimic one more before the
            // tool-side guard rejects it.
            //
            // Crucially this is a *silent* injection: unlike the start sentinel
            // (which deliberately triggers the greeting turn), the end sentinel
            // must NOT trigger a turn — the call is over, the LLM has nothing
            // to do right now, and a triggered turn would just provoke one more
            // speak() attempt that we then have to reject.
            // Idempotent: close_ws is a no-op if no client is open.
            return {
                next: { kind: 'dormant' },
                interpreterAppliedNow: false,
                actions: [
                    { kind: 'inject_silent_user_message', customType: 'voice_call_ended', text: VOICE_CALL_ENDED_SENTINEL },
                    { kind: 'close_ws' },
                ],
            };
        }
        case 'ws:opened': {
            if (prev.kind === 'dormant') {
                // Stray opened event (e.g. after a deactivate-then-open race):
                // no call is bound, so the connection that just opened has no
                // owner. Ask the shell to close it; close_ws is a no-op if no
                // client is open, so this is safe even when nothing leaked.
                return { next: prev, interpreterAppliedNow: false, actions: [{ kind: 'close_ws' }] };
            }
            if (prev.kind !== 'activating') {
                // Duplicate opened event while already active: the live
                // connection must not be disturbed.
                return unchanged();
            }
            // Flush buffered speak frames in arrival order. Done here, not
            // in the streaming reducer, so frame ordering is preserved across
            // the activating→active boundary.
            const actions = prev.pendingFrames.map((frame) => ({ kind: 'send_frame', frame }));
            return {
                next: { kind: 'active', sessionId: prev.sessionId },
                interpreterAppliedNow: false,
                actions,
            };
        }
        case 'ws:open_failed': {
            if (prev.kind !== 'activating') {
                return unchanged();
            }
            // Drop any buffered frames; the shell will rebuild from scratch
            // on the next activate.
            return {
                next: { kind: 'dormant' },
                interpreterAppliedNow: false,
                actions: [{ kind: 'emit_deactivate_request' }],
            };
        }
        case 'ws:disconnected': {
            if (prev.kind === 'dormant') {
                return unchanged();
            }
            return {
                next: { kind: 'dormant' },
                interpreterAppliedNow: false,
                actions: [{ kind: 'emit_deactivate_request' }],
            };
        }
        default:
            return unchanged();
    }
}
|
|
122
|
+
/**
 * Apply a `send_frame` action against the lifecycle state. Returns
 * either the same action (to be executed by the shell) or a state
 * mutation that buffers the frame for later flush.
 *
 * Splitting this out keeps the streaming reducer agnostic of the
 * lifecycle phase — it always emits `send_frame`; this function decides
 * whether to forward or buffer.
 *
 * @param prev  Current lifecycle sub-state (`dormant` | `activating` | `active`).
 * @param frame The outgoing frame produced by the streaming reducer.
 * @returns `{ next, actions }`. `next` is a new object only when the
 *          frame was buffered; `prev` itself is never mutated.
 */
export function bufferOrPassFrame(prev, frame) {
    switch (prev.kind) {
        case 'activating':
            // Socket not open yet: append to a fresh copy of the buffer.
            return {
                next: { ...prev, pendingFrames: [...prev.pendingFrames, frame] },
                actions: [],
            };
        case 'active':
            return { next: prev, actions: [{ kind: 'send_frame', frame }] };
        case 'dormant':
            // Frame produced while no call is bound — drop. Streaming reducer
            // is supposed to no-op while dormant; if we land here it's a
            // diagnostic the shell will log.
            return { next: prev, actions: [] };
        default:
            // Unrecognised lifecycle kind: drop the frame rather than
            // returning `undefined`, which would crash any caller that
            // destructures the result.
            return { next: prev, actions: [] };
    }
}
|
|
147
|
+
/**
 * Fold a lifecycle reducer result back into the top-level runtime record.
 * Lives next to the reducer so the dispatcher stays mechanical.
 *
 * @param prev Runtime record; copied shallowly, never mutated.
 * @param r    Result carrying `next` and `interpreterAppliedNow`.
 * @returns A copy of `prev` with `lifecycle` replaced by `r.next` and
 *          `interpreterApplied` set once either the record or this
 *          result says so.
 */
export function applyLifecycleResult(prev, r) {
    const interpreterApplied = prev.interpreterApplied || r.interpreterAppliedNow;
    return Object.assign({}, prev, { lifecycle: r.next, interpreterApplied });
}
|
|
@@ -0,0 +1,220 @@
|
|
|
1
|
+
// Concern B: Outbound speak streaming reducer.
|
|
2
|
+
//
|
|
3
|
+
// Translates the SDK's `message_update.assistantMessageEvent.toolcall_*`
|
|
4
|
+
// stream into speechmux WS frames (`token` + `end`) per `speak()` call.
|
|
5
|
+
//
|
|
6
|
+
// **Single emission path.** The reducer is the *only* code that ever
|
|
7
|
+
// produces speak `token` / `end` frames. The SDK's `tool_call` hook
|
|
8
|
+
// (which historically returned the full bulk text) does NOT emit
|
|
9
|
+
// anything; it only returns the tool-result. This eliminates the
|
|
10
|
+
// "double-emit" class of bugs by construction.
|
|
11
|
+
//
|
|
12
|
+
// Per-block FSM:
|
|
13
|
+
// no entry + ToolCallStart → unknown | speak_streaming | not_speak
|
|
14
|
+
// (depending on partial.content[idx].name)
|
|
15
|
+
// unknown + ToolCallDelta → promote (if name now resolved) and
|
|
16
|
+
// replay delta
|
|
17
|
+
// speak_str + ToolCallDelta → extractor.write(delta) → emit any
|
|
18
|
+
// newly-revealed token suffix
|
|
19
|
+
// speak_str + ToolCallEnd → diff against finalText → emit tail + end
|
|
20
|
+
// no entry + ToolCallEnd → emit (token + end) using the
|
|
21
|
+
// authoritative final args (covers
|
|
22
|
+
// providers that don't stream tool args)
|
|
23
|
+
// not_speak | speak_ended → no-op
|
|
24
|
+
//
|
|
25
|
+
// **Reset trigger:** the SDK `message_start` event for `role==='assistant'`
|
|
26
|
+
// clears the entire blocks map. This is the bug fix for the leak that
|
|
27
|
+
// stranded the previous implementation: it watched the wrong event
|
|
28
|
+
// (`assistantMessageEvent.start`, which never fires inside
|
|
29
|
+
// `message_update`).
|
|
30
|
+
//
|
|
31
|
+
// **No closures.** Block fields are fully immutable — every transition
|
|
32
|
+
// produces a fresh block. The `TextExtractor` referenced by a
|
|
33
|
+
// `speak_streaming` block is the one piece of mutable state, and that
|
|
34
|
+
// mutation is encapsulated; the reducer only ever reads it via
|
|
35
|
+
// `extractor.currentText()`. The block reference is preserved across
|
|
36
|
+
// `toolcall_delta` events that don't change the block's `kind`, so the
|
|
37
|
+
// extractor's accumulated text persists correctly.
|
|
38
|
+
import { TextExtractor } from '../text-extractor.js';
|
|
39
|
+
/**
 * Wrap a streaming sub-state in a reducer result that emits nothing:
 * no outgoing frames and no ended speak ids.
 *
 * @param next The streaming sub-state to carry forward.
 * @returns `{ next, frames: [], endedSpeakIds: [] }` with fresh arrays.
 */
const noFrames = (next) => {
    return { next, frames: [], endedSpeakIds: [] };
};
|
|
44
|
+
/**
 * Entry point of the outbound speak streaming reducer.
 *
 * @param prev  Streaming sub-state, `{ blocks: Map }` keyed by content index.
 * @param event Incoming event; `sdk:message_start`, `sdk:toolcall_start`,
 *              `sdk:toolcall_delta` and `sdk:toolcall_end` are handled,
 *              everything else leaves the state untouched.
 * @returns `{ next, frames, endedSpeakIds }`.
 */
export function reduceStreaming(prev, event) {
    const { type } = event;
    if (type === 'sdk:message_start') {
        // Assistant message starts → wipe per-block state. (Filtering on
        // role==='assistant' happens at the dispatcher.)
        return noFrames({ blocks: new Map() });
    }
    if (type === 'sdk:toolcall_start') {
        const initial = blockFromPartial(event.contentIndex, event.partial);
        return noFrames(setBlock(prev, event.contentIndex, initial));
    }
    if (type === 'sdk:toolcall_delta') {
        return reduceDelta(prev, event.contentIndex, event.delta, event.partial);
    }
    if (type === 'sdk:toolcall_end') {
        return reduceEnd(prev, event.contentIndex, event.toolCall);
    }
    return noFrames(prev);
}
|
|
60
|
+
// ---------------------------------------------------------------------------
|
|
61
|
+
// Per-event helpers
|
|
62
|
+
// ---------------------------------------------------------------------------
|
|
63
|
+
/**
 * Handle one `toolcall_delta` for the block at content index `idx`.
 *
 * @param prev    Streaming sub-state (`{ blocks: Map }`).
 * @param idx     Content index of the tool-call block.
 * @param delta   Raw argument-JSON chunk, fed to the block's extractor.
 * @param partial Partial assistant message; used to synthesize the block
 *                when no start event was seen and to resolve the name of
 *                an `unknown` block.
 * @returns `{ next, frames, endedSpeakIds }`; `frames` holds at most one
 *          `token` frame carrying the newly decoded text.
 */
function reduceDelta(prev, idx, delta, partial) {
    // Locate the block, synthesizing it from the partial when the start
    // event was missed, then try to resolve a still-unknown tool name.
    let block = prev.blocks.get(idx) ?? blockFromPartial(idx, partial);
    if (block.kind === 'unknown') {
        block = promoteUnknown(block, idx, partial);
    }
    if (block.kind === 'speak_streaming') {
        // The extractor mutates internally; the reducer only feeds it and
        // reads the decoded text back.
        block.extractor.write(delta);
        const decoded = block.extractor.currentText();
        const alreadySent = block.emittedLength;
        if (decoded.length > alreadySent) {
            // New text was revealed: emit just the unsent suffix and
            // advance the watermark on a fresh block that shares the
            // same extractor.
            const { toolCallId, extractor } = block;
            const advanced = {
                kind: 'speak_streaming',
                toolCallId,
                extractor,
                emittedLength: decoded.length,
            };
            return {
                next: setBlock(prev, idx, advanced),
                frames: [tokenFrame(decoded.slice(alreadySent), toolCallId)],
                endedSpeakIds: [],
            };
        }
    }
    // Either not a speak block or no growth. setBlock keeps the state
    // reference when the block is unchanged and writes it when it was
    // synthesized or promoted above.
    return noFrames(setBlock(prev, idx, block));
}
|
|
96
|
+
/**
 * Handle `toolcall_end` for the block at content index `idx`.
 *
 * @param prev Streaming sub-state (`{ blocks: Map }`).
 * @param idx  Content index of the tool-call block.
 * @param tc   Final tool call; `name`, `id` and `arguments.text` are read.
 * @returns `{ next, frames, endedSpeakIds }`.
 */
function reduceEnd(prev, idx, tc) {
    const finalText = readFinalText(tc);
    const toolName = typeof tc.name === 'string' ? tc.name : null;
    const toolCallId = typeof tc.id === 'string' ? tc.id : null;
    const entry = prev.blocks.get(idx);

    // No prior block (provider skipped both toolcall_start and
    // toolcall_delta) or the name never resolved: the final tool call is
    // the last chance to classify. Nothing was streamed, so a speak call
    // is emitted in one go.
    if (!entry || entry.kind === 'unknown') {
        const isSpeakWithText = toolName === 'speak' && finalText.length > 0;
        if (!isSpeakWithText) {
            return noFrames(setBlock(prev, idx, { kind: 'not_speak' }));
        }
        return {
            next: setBlock(prev, idx, { kind: 'speak_ended', toolCallId }),
            frames: [tokenFrame(finalText, toolCallId), endFrame(toolCallId)],
            endedSpeakIds: toolCallId ? [toolCallId] : [],
        };
    }

    // not_speak / speak_ended — nothing to do.
    if (entry.kind !== 'speak_streaming') {
        return noFrames(prev);
    }

    // speak_streaming → finalize.
    //
    // The extractor is not treated as authoritative at end-of-stream
    // (escapes mid-chunk could have errored, etc.). Instead we diff
    // against the SDK-provided `finalText` and flush whatever is missing.
    const speakId = entry.toolCallId ?? toolCallId;
    const alreadySent = entry.emittedLength;
    const hasTail = finalText.length > alreadySent;
    const spoke = (hasTail ? finalText.length : alreadySent) > 0;
    const frames = [];
    if (hasTail) {
        frames.push(tokenFrame(finalText.slice(alreadySent), speakId));
    }
    if (spoke) {
        // Only close a speak for which at least some text went out.
        frames.push(endFrame(speakId));
    }
    return {
        next: setBlock(prev, idx, { kind: 'speak_ended', toolCallId: speakId }),
        frames,
        endedSpeakIds: speakId !== null && spoke ? [speakId] : [],
    };
}
|
|
149
|
+
// ---------------------------------------------------------------------------
|
|
150
|
+
// Pure helpers
|
|
151
|
+
// ---------------------------------------------------------------------------
|
|
152
|
+
/**
 * Build an outgoing `token` frame.
 *
 * @param text       Text fragment to send.
 * @param toolCallId Id of the originating speak call, or `null` when
 *                   unknown; attached as `speak_id` unless it is `null`.
 */
function tokenFrame(text, toolCallId) {
    const frame = { type: 'token', text };
    if (toolCallId !== null) {
        frame.speak_id = toolCallId;
    }
    return frame;
}
|
|
156
|
+
/**
 * Build an outgoing `end` frame.
 *
 * @param toolCallId Id of the speak call being closed, or `null` when
 *                   unknown; attached as `speak_id` unless it is `null`.
 */
function endFrame(toolCallId) {
    const frame = { type: 'end' };
    if (toolCallId !== null) {
        frame.speak_id = toolCallId;
    }
    return frame;
}
|
|
160
|
+
/**
 * Return a streaming state in which `idx` maps to `block`.
 *
 * @param state Streaming sub-state (`{ blocks: Map }`); never mutated.
 * @param idx   Content index to write.
 * @param block Block to store.
 * @returns `state` itself when the very same block reference is already
 *          stored at `idx` (so no-change delta steps skip the Map copy);
 *          otherwise a new `{ blocks }` holding a copied Map.
 */
function setBlock(state, idx, block) {
    const current = state.blocks.get(idx);
    if (current === block) {
        return state;
    }
    return { blocks: new Map([...state.blocks, [idx, block]]) };
}
|
|
170
|
+
/**
 * Safely read the content block at `idx` from a partial assistant message.
 *
 * @param partial Partial message; may be nullish or lack `content`.
 * @param idx     Index into `partial.content`.
 * @returns The entry when `content` is an array and the entry is a
 *          non-null object; otherwise `undefined`.
 */
function partialBlock(partial, idx) {
    const content = partial?.content;
    const candidate = Array.isArray(content) ? content[idx] : undefined;
    return candidate && typeof candidate === 'object' ? candidate : undefined;
}
|
|
179
|
+
/**
 * Read the final `text` argument from a completed tool call.
 *
 * @param tc Final tool call; `tc.arguments` is read.
 * @returns `tc.arguments.text` when `arguments` is a non-null object and
 *          `text` is a string; otherwise the empty string.
 */
function readFinalText(tc) {
    const args = tc.arguments;
    if (!args || typeof args !== 'object') {
        return '';
    }
    const { text } = args;
    return typeof text === 'string' ? text : '';
}
|
|
188
|
+
/**
 * Build the initial block state from the partial carried on
 * toolcall_start (or the first delta, when start is missing).
 *
 * @param idx     Content index of the tool-call block.
 * @param partial Partial assistant message to look the block up in.
 * @returns `{ kind: 'unknown' }` when no string name is available yet,
 *          a fresh speak_streaming block when the name is `speak`, and
 *          `{ kind: 'not_speak' }` for any other tool.
 */
function blockFromPartial(idx, partial) {
    const source = partialBlock(partial, idx);
    const toolName = source?.name;
    if (typeof toolName !== 'string') {
        return { kind: 'unknown' };
    }
    if (toolName !== 'speak') {
        return { kind: 'not_speak' };
    }
    // The id is only needed for speak blocks; non-string ids become null.
    const toolCallId = typeof source?.id === 'string' ? source.id : null;
    return makeSpeakStreaming(toolCallId);
}
|
|
200
|
+
/**
 * Late name resolution for an `unknown` block.
 *
 * @param block   Current block; returned untouched unless its kind is `unknown`.
 * @param idx     Content index of the tool-call block.
 * @param partial Partial assistant message that may now carry the name.
 * @returns The same `block` when it is not `unknown` or the name is
 *          still not a string; otherwise a fresh speak_streaming block
 *          (name `speak`) or `{ kind: 'not_speak' }`.
 */
function promoteUnknown(block, idx, partial) {
    if (block.kind !== 'unknown') {
        return block;
    }
    const source = partialBlock(partial, idx);
    const toolName = source?.name;
    if (typeof toolName !== 'string') {
        // Still unresolved — keep the reference so the caller's aliasing
        // check in setBlock can skip the copy.
        return block;
    }
    if (toolName !== 'speak') {
        return { kind: 'not_speak' };
    }
    const toolCallId = typeof source?.id === 'string' ? source.id : null;
    return makeSpeakStreaming(toolCallId);
}
|
|
213
|
+
/**
 * Create a fresh `speak_streaming` block.
 *
 * @param toolCallId Id of the speak tool call, or `null` when unknown.
 * @returns A block with a new `TextExtractor` and `emittedLength` of 0,
 *          i.e. nothing decoded and nothing sent yet.
 */
function makeSpeakStreaming(toolCallId) {
    const extractor = new TextExtractor();
    return { kind: 'speak_streaming', toolCallId, extractor, emittedLength: 0 };
}
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
// Concern C: Walkback / context rewrite reducer.
|
|
2
|
+
//
|
|
3
|
+
// When speechmux signals barge-in (abort or rollback), we mark walkback
|
|
4
|
+
// `pending` with the speak's `targetSpeakToolCallId`. The toolCallId
|
|
5
|
+
// comes from the wire frame's `speak_id` (echoed by speechmux from the
|
|
6
|
+
// chunk that was actively playing) when present; otherwise we fall back
|
|
7
|
+
// to the runtime-tracked `lastEmittedSpeakId`.
|
|
8
|
+
//
|
|
9
|
+
// On every `sdk:context` event we run `walkBack(...)`:
|
|
10
|
+
// - idle → just strip trailing aborted-empty assistants.
|
|
11
|
+
// - pending → strip + rewrite the targeted speak block to `heardText`,
|
|
12
|
+
// drop any blocks/messages that came after.
|
|
13
|
+
//
|
|
14
|
+
// The previous design captured the in-flight assistant message snapshot
|
|
15
|
+
// and used string-prefix accumulation across content blocks to identify
|
|
16
|
+
// what was heard. That broke whenever a turn had multiple speak()
|
|
17
|
+
// calls or whenever the snapshot was stale. The id-based design has no
|
|
18
|
+
// such ambiguity.
|
|
19
|
+
import { VOICE_INTERRUPT_CUSTOM_TYPE } from '../../../../../shared/dist/index.js';
|
|
20
|
+
import { walkBack } from '../../walk-back.js';
|
|
21
|
+
/**
 * Resolve which speak() id to walk back to. Prefers what speechmux
 * echoes; falls back to the runtime-tracked latest.
 *
 * @param frameSpeakId       `speak_id` from the incoming wire frame; any
 *                           falsy value (missing, `null`, `''`) counts as
 *                           absent.
 * @param lastEmittedSpeakId Runtime-tracked id of the latest emitted speak,
 *                           or nullish when none was emitted.
 * @returns The chosen id, or `null` when neither is available (the
 *          caller then aborts the agent but skips the rewrite).
 */
function resolveTarget(frameSpeakId, lastEmittedSpeakId) {
    if (frameSpeakId) {
        return frameSpeakId;
    }
    // Normalise `undefined` to `null`: the caller tests `target === null`,
    // so an `undefined` fallback would otherwise be taken for a real
    // target and put walkback into `pending` with no id.
    return lastEmittedSpeakId ?? null;
}
|
|
30
|
+
/**
 * Pure transition function for the walkback slice.
 *
 * @param prev               Walkback sub-state (`idle` or `pending`).
 * @param lastEmittedSpeakId Runtime-tracked latest speak id; used as the
 *                           target when the frame carries no `speak_id`.
 * @param event              Incoming event; `ws:incoming`, `sdk:context`
 *                           and `eb:deactivate` are handled.
 * @returns `{ next, actions }`.
 */
export function reduceWalkback(prev, lastEmittedSpeakId, event) {
    if (event.type === 'ws:incoming') {
        const frame = event.frame;
        if (frame.type === 'user') {
            return { next: prev, actions: [] };
        }
        // NOTE(review): every frame type other than `user` is handled as a
        // barge-in, and anything that is not `rollback` is recorded as
        // `abort` — confirm speechmux sends no other inbound frame types.
        const isRollback = frame.type === 'rollback';
        const heardText = isRollback ? frame.heard_text : '';
        const data = {
            heard_text: heardText,
            kind: isRollback ? 'rollback' : 'abort',
        };
        const target = resolveTarget(frame.speak_id, lastEmittedSpeakId);
        const actions = [
            { kind: 'abort_agent' },
            { kind: 'append_custom_entry', customType: VOICE_INTERRUPT_CUSTOM_TYPE, data },
        ];
        // Without a target there is nothing to rewrite: abort and record
        // the interrupt, and let the next sdk:context only strip
        // aborted-empty assistants.
        const next = target === null
            ? { kind: 'idle' }
            : { kind: 'pending', heardText, targetSpeakToolCallId: target };
        return { next, actions };
    }
    if (event.type === 'sdk:context') {
        // A pending walkback is consumed by exactly one context rewrite.
        let rollback = null;
        if (prev.kind === 'pending') {
            rollback = { heardText: prev.heardText, targetSpeakToolCallId: prev.targetSpeakToolCallId };
        }
        const rewritten = walkBack({ messages: event.messages, rollback });
        return {
            next: { kind: 'idle' },
            actions: [{ kind: 'rewrite_context', messages: rewritten }],
        };
    }
    if (event.type === 'eb:deactivate') {
        return { next: { kind: 'idle' }, actions: [] };
    }
    return { next: prev, actions: [] };
}
|
|
71
|
+
/**
 * Fold a walkback reducer result back into the top-level runtime record.
 *
 * @param prev Runtime record; copied shallowly, never mutated.
 * @param r    Result whose `next` becomes the new `walkback` slice.
 * @returns A copy of `prev` with `walkback` replaced by `r.next`.
 */
export function applyWalkbackResult(prev, r) {
    const merged = Object.assign({}, prev);
    merged.walkback = r.next;
    return merged;
}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
// Voice extension runtime state.
|
|
2
|
+
//
|
|
3
|
+
// Three orthogonal concerns are modelled as parallel sub-machines that
|
|
4
|
+
// share a single top-level state record. Sub-reducers in
|
|
5
|
+
// `reducers/{lifecycle,streaming,walkback}.ts` operate only on their own
|
|
6
|
+
// slice; the top-level dispatcher in `reducer.ts` folds them together.
|
|
7
|
+
//
|
|
8
|
+
// The `TextExtractor` instance held by `speak_streaming` blocks is the one
|
|
9
|
+
// piece of impurity inside this state — necessary because streaming JSON
|
|
10
|
+
// argument parsing can't be replayed lazily. It's owned by the block and
|
|
11
|
+
// disposed when the block transitions to `speak_ended` or the message
|
|
12
|
+
// resets.
|
|
13
|
+
/**
 * Build the initial voice runtime state.
 *
 * @returns A fresh record: lifecycle `dormant`, an empty per-message
 *          block map, walkback `idle`, interpreter not yet applied and
 *          no speak id emitted. Every call allocates new objects, so
 *          callers never share the `blocks` Map.
 */
export function initialState() {
    const lifecycle = { kind: 'dormant' };
    const message = { blocks: new Map() };
    const walkback = { kind: 'idle' };
    return {
        lifecycle,
        message,
        walkback,
        interpreterApplied: false,
        lastEmittedSpeakId: null,
    };
}
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
// Streaming extractor for the `text` value of a `speak({text:"..."})`
|
|
2
|
+
// tool argument JSON.
|
|
3
|
+
//
|
|
4
|
+
// Replaces our previous use of `@streamparser/json` (which is callback-
|
|
5
|
+
// based and thus required closures into reducer state). This extractor
|
|
6
|
+
// is fully synchronous: callers write JSON chunks and read the extracted
|
|
7
|
+
// text via `currentText()`. All mutation is encapsulated inside the
|
|
8
|
+
// extractor object; the FSM treats it as an opaque streaming buffer.
|
|
9
|
+
//
|
|
10
|
+
// **Scope.** This handles only the JSON shape `{"text": "<string>"}` —
|
|
11
|
+
// the exact shape of the `speak` tool's argument schema (single string
|
|
12
|
+
// field). It is *not* a general JSON parser. If we ever add more args
|
|
13
|
+
// to `speak`, we'll need to extend it (or reach for streamparser
|
|
14
|
+
// again). The trade-off is: ~80 lines of focused code vs a 3-letter
|
|
15
|
+
// dependency that introduced a closure-binding bug.
|
|
16
|
+
//
|
|
17
|
+
// **Robustness.** The extractor handles:
|
|
18
|
+
// - leading whitespace before / inside the object
|
|
19
|
+
// - the `text` key appearing first (not nested or preceded by other
|
|
20
|
+
// keys — the schema enforces this)
|
|
21
|
+
// - all JSON string escapes including `\uXXXX`
|
|
22
|
+
// - chunk boundaries falling inside escape sequences (the buffer
|
|
23
|
+
// holds onto unconsumed bytes until the next write provides the
|
|
24
|
+
// rest)
|
|
25
|
+
// It does NOT handle:
|
|
26
|
+
// - object/array values (no need; `text` is a string)
|
|
27
|
+
// - non-`text` keys appearing before `text`
|
|
28
|
+
/** Matches everything up to and including the opening quote of the `text` value. */
const HEAD_PATTERN = /"text"\s*:\s*"/;
/** Exactly four hex digits, as required after `\u`. */
const HEX4_PATTERN = /^[0-9a-fA-F]{4}$/;
/** Single-character JSON escapes and the characters they decode to. */
const SIMPLE_ESCAPES = {
    '"': '"',
    '\\': '\\',
    '/': '/',
    n: '\n',
    r: '\r',
    t: '\t',
    b: '\b',
    f: '\f',
};
/** True for the first UTF-16 code unit of a surrogate pair. */
const isHighSurrogate = (code) => code >= 0xd800 && code <= 0xdbff;
/** True for the second UTF-16 code unit of a surrogate pair. */
const isLowSurrogate = (code) => code >= 0xdc00 && code <= 0xdfff;

/**
 * Incremental extractor for the string value of `text` in a
 * `{"text": "..."}` JSON document that arrives in arbitrary chunks.
 *
 * Phases: `pre_string` (opening quote not seen yet) → `in_string`
 * (decoding) → `closed` (closing quote seen) or `errored` (invalid
 * escape). Once `closed` or `errored`, further writes are ignored.
 */
export class TextExtractor {
    phase = 'pre_string';
    /** Unconsumed input that follows the cursor. */
    buffer = '';
    /** Decoded text accumulated so far. */
    text = '';
    /**
     * Feed another JSON chunk. A no-op once `closed` or `errored`, and
     * for empty or non-string chunks (so a malformed delta cannot throw).
     */
    write(chunk) {
        if (this.phase === 'closed' || this.phase === 'errored') {
            return;
        }
        if (typeof chunk !== 'string' || chunk.length === 0) {
            return;
        }
        this.buffer += chunk;
        this.advance();
    }
    /**
     * The decoded value of `$.text` accumulated so far. Only ever grows.
     * A high surrogate is withheld until the code unit after it can be
     * decoded, so a pair split across chunks is never exposed half-way.
     */
    currentText() {
        return this.text;
    }
    /** Whether the closing `"` has been observed. */
    isClosed() {
        return this.phase === 'closed';
    }
    /** Whether parsing failed (invalid escape sequence). Text decoded
     * before the failure stays available via `currentText()`. */
    isErrored() {
        return this.phase === 'errored';
    }
    // -------------------------------------------------------------------------
    /** Run whichever phase handlers apply to the current buffer. */
    advance() {
        if (this.phase === 'pre_string') {
            this.advancePreString();
        }
        // Not an else: the chunk that opens the string may also carry
        // part of its content.
        if (this.phase === 'in_string') {
            this.advanceInString();
        }
    }
    /** Look for the `"text": "` head; on a match drop it and enter `in_string`. */
    advancePreString() {
        const m = HEAD_PATTERN.exec(this.buffer);
        if (!m) {
            return; // wait for more input
        }
        // Drop everything up to and including the opening quote.
        this.buffer = this.buffer.slice(m.index + m[0].length);
        this.phase = 'in_string';
    }
    /** Decode as much of the buffer as possible, keeping any incomplete tail. */
    advanceInString() {
        let i = 0;
        while (i < this.buffer.length) {
            const c = this.buffer.charCodeAt(i);
            if (c === 0x5c /* \ */) {
                if (i + 1 >= this.buffer.length) {
                    break; // wait for the escape char
                }
                const esc = this.buffer[i + 1];
                if (esc === 'u') {
                    if (i + 6 > this.buffer.length) {
                        break; // wait for the 4 hex digits
                    }
                    const hex = this.buffer.slice(i + 2, i + 6);
                    if (!HEX4_PATTERN.test(hex)) {
                        this.phase = 'errored';
                        return;
                    }
                    const code = parseInt(hex, 16);
                    if (isHighSurrogate(code)) {
                        const consumed = this.decodeHighSurrogate(i, code);
                        if (consumed === 0) {
                            break; // wait until the next unit can be inspected
                        }
                        i += consumed;
                    }
                    else {
                        this.text += String.fromCharCode(code);
                        i += 6;
                    }
                }
                else if (esc in SIMPLE_ESCAPES) {
                    this.text += SIMPLE_ESCAPES[esc];
                    i += 2;
                }
                else {
                    // Invalid escape; bail.
                    this.phase = 'errored';
                    return;
                }
            }
            else if (c === 0x22 /* " */) {
                // End of string. Consume up to and including the closing quote.
                this.phase = 'closed';
                this.buffer = this.buffer.slice(i + 1);
                return;
            }
            else if (isHighSurrogate(c) && i + 1 >= this.buffer.length) {
                // Raw high surrogate at the very end of the buffer: hold it
                // so it is appended together with whatever follows.
                break;
            }
            else {
                this.text += this.buffer[i];
                i += 1;
            }
        }
        // Retain unconsumed tail (a partial escape or held surrogate).
        this.buffer = this.buffer.slice(i);
    }
    /**
     * Decode an escaped high surrogate whose `\uXXXX` starts at `at`.
     *
     * Appends the full pair when a `\uXXXX` low surrogate follows, or the
     * lone high surrogate when what follows is clearly something else.
     *
     * @returns Number of buffer characters consumed (6 or 12), or 0 when
     *          more input is needed to decide.
     */
    decodeHighSurrogate(at, high) {
        const buf = this.buffer;
        const next = at + 6;
        if (next >= buf.length) {
            return 0;
        }
        if (buf[next] === '\\') {
            if (next + 1 >= buf.length) {
                return 0;
            }
            if (buf[next + 1] === 'u') {
                if (next + 6 > buf.length) {
                    return 0;
                }
                const hex = buf.slice(next + 2, next + 6);
                const low = HEX4_PATTERN.test(hex) ? parseInt(hex, 16) : NaN;
                if (isLowSurrogate(low)) {
                    this.text += String.fromCharCode(high, low);
                    return 12;
                }
            }
        }
        // Not followed by a low-surrogate escape: emit the lone unit and
        // let the main loop handle (or reject) whatever comes next.
        this.text += String.fromCharCode(high);
        return 6;
    }
}
|