npm - @pimote/pimote - Versions diffs - 0.2.0 → 0.3.0 - Mend

@pimote/pimote 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (54) hide show

package/README.md +43 -16
package/client/build/_app/immutable/assets/0.C7loWTOC.css +2 -0
package/client/build/_app/immutable/assets/2.D9fiCd8W.css +1 -0
package/client/build/_app/immutable/chunks/BNqgidwO.js +5 -0
package/client/build/_app/immutable/chunks/D26i4pYm.js +1 -0
package/client/build/_app/immutable/chunks/D_Fpgknp.js +1 -0
package/client/build/_app/immutable/chunks/DoVhjU85.js +1 -0
package/client/build/_app/immutable/chunks/DzqbY2XU.js +1 -0
package/client/build/_app/immutable/entry/{app.CNzpBgAg.js → app.DO-zgzyy.js} +2 -2
package/client/build/_app/immutable/entry/start.BZlrOH0-.js +1 -0
package/client/build/_app/immutable/nodes/0.BEh4bPGQ.js +10 -0
package/client/build/_app/immutable/nodes/{1.B8zmHMre.js → 1.B2l9JGRO.js} +1 -1
package/client/build/_app/immutable/nodes/2.ph9M0S1U.js +54 -0
package/client/build/_app/version.json +1 -1
package/client/build/index.html +7 -7
package/package.json +7 -3
package/server/dist/auto-drain-on-abort.js +49 -0
package/server/dist/config.js +21 -0
package/server/dist/extension-ui-bridge.js +14 -1
package/server/dist/index.js +31 -1
package/server/dist/message-mapper.js +38 -6
package/server/dist/server.js +2 -2
package/server/dist/session-manager.js +64 -2
package/server/dist/voice/fsm/actions.js +6 -0
package/server/dist/voice/fsm/events.js +7 -0
package/server/dist/voice/fsm/reducer.js +74 -0
package/server/dist/voice/fsm/reducers/lifecycle.js +146 -0
package/server/dist/voice/fsm/reducers/streaming.js +220 -0
package/server/dist/voice/fsm/reducers/walkback.js +73 -0
package/server/dist/voice/fsm/state.js +21 -0
package/server/dist/voice/fsm/text-extractor.js +128 -0
package/server/dist/voice/index.js +319 -0
package/server/dist/voice/interpreter-prompt.js +115 -0
package/server/dist/voice/speechmux-client.js +153 -0
package/server/dist/voice/state-machine.js +7 -0
package/server/dist/voice/wait-for-idle.js +67 -0
package/server/dist/voice/walk-back.js +198 -0
package/server/dist/voice-orchestrator-boot.js +90 -0
package/server/dist/voice-orchestrator.js +91 -0
package/server/dist/ws-handler.js +108 -5
package/shared/dist/index.d.ts +1 -0
package/shared/dist/index.js +2 -0
package/shared/dist/protocol.d.ts +614 -0
package/shared/dist/protocol.js +30 -0
package/client/build/_app/immutable/assets/0.DBrr7n4n.css +0 -2
package/client/build/_app/immutable/assets/2.DE6k3bQj.css +0 -1
package/client/build/_app/immutable/chunks/5vSSf6qG.js +0 -5
package/client/build/_app/immutable/chunks/CT6ckxpD.js +0 -1
package/client/build/_app/immutable/chunks/DlJOVoUQ.js +0 -1
package/client/build/_app/immutable/chunks/YxmLwfhj.js +0 -1
package/client/build/_app/immutable/chunks/yWVx3W2o.js +0 -1
package/client/build/_app/immutable/entry/start.DYkTAHh1.js +0 -1
package/client/build/_app/immutable/nodes/0.DNlQhEb_.js +0 -10
package/client/build/_app/immutable/nodes/2.W9yV4-x2.js +0 -54

package/server/dist/voice/fsm/reducers/streaming.js ADDED Viewed

@@ -0,0 +1,220 @@
+// Concern B: Outbound speak streaming reducer.
+//
+// Translates the SDK's `message_update.assistantMessageEvent.toolcall_*`
+// stream into speechmux WS frames (`token` + `end`) per `speak()` call.
+//
+// **Single emission path.** The reducer is the *only* code that ever
+// produces speak `token` / `end` frames. The SDK's `tool_call` hook
+// (which historically returned the full bulk text) does NOT emit
+// anything; it only returns the tool-result. This eliminates the
+// "double-emit" class of bugs by construction.
+//
+// Per-block FSM:
+//   no entry  + ToolCallStart  → unknown | speak_streaming | not_speak
+//                                (depending on partial.content[idx].name)
+//   unknown   + ToolCallDelta  → promote (if name now resolved) and
+//                                replay delta
+//   speak_str + ToolCallDelta  → extractor.write(delta) → emit any
+//                                newly-revealed token suffix
+//   speak_str + ToolCallEnd    → diff against finalText → emit tail + end
+//   no entry  + ToolCallEnd    → emit (token + end) using the
+//                                authoritative final args (covers
+//                                providers that don't stream tool args)
+//   not_speak | speak_ended    → no-op
+//
+// **Reset trigger:** the SDK `message_start` event for `role==='assistant'`
+// clears the entire blocks map. This is the bug fix for the leak that
+// stranded the previous implementation: it watched the wrong event
+// (`assistantMessageEvent.start`, which never fires inside
+// `message_update`).
+//
+// **No closures.** Block fields are fully immutable — every transition
+// that changes a block produces a fresh block. The `TextExtractor`
+// referenced by a `speak_streaming` block is the one piece of mutable
+// state, and that mutation is encapsulated; the reducer only ever feeds
+// it via `extractor.write()` and reads it via `extractor.currentText()`.
+// The extractor reference is carried over across `toolcall_delta` events
+// that don't change the block's `kind`, so its accumulated text persists.
+import { TextExtractor } from '../text-extractor.js';
+const noFrames = (next) => ({
+    next,
+    frames: [],
+    endedSpeakIds: [],
+});
+export function reduceStreaming(prev, event) {
+    switch (event.type) {
+        case 'sdk:message_start':
+            // Assistant message starts → wipe per-block state. (Filtering on
+            // role==='assistant' happens at the dispatcher.)
+            return noFrames({ blocks: new Map() });
+        case 'sdk:toolcall_start':
+            return noFrames(setBlock(prev, event.contentIndex, blockFromPartial(event.contentIndex, event.partial)));
+        case 'sdk:toolcall_delta':
+            return reduceDelta(prev, event.contentIndex, event.delta, event.partial);
+        case 'sdk:toolcall_end':
+            return reduceEnd(prev, event.contentIndex, event.toolCall);
+        default:
+            return noFrames(prev);
+    }
+}
+// ---------------------------------------------------------------------------
+// Per-event helpers
+// ---------------------------------------------------------------------------
+function reduceDelta(prev, idx, delta, partial) {
+    // Step 1: locate / synthesize / promote the block.
+    let entry = prev.blocks.get(idx) ?? blockFromPartial(idx, partial);
+    if (entry.kind === 'unknown')
+        entry = promoteUnknown(entry, idx, partial);
+    // Step 2: feed the extractor (only meaningful for speak_streaming).
+    if (entry.kind !== 'speak_streaming') {
+        return noFrames(setBlock(prev, idx, entry));
+    }
+    // Mutating the extractor here is internal to the extractor object;
+    // the reducer treats the extractor reference as opaque.
+    entry.extractor.write(delta);
+    // Step 3: harvest any newly-revealed prefix and emit one fragment.
+    const text = entry.extractor.currentText();
+    if (text.length <= entry.emittedLength) {
+        // No growth — keep the existing block reference (the extractor
+        // identity is preserved). We still must rewrite the map if the
+        // block was synthesized/promoted above; setBlock handles that.
+        return noFrames(setBlock(prev, idx, entry));
+    }
+    const fragment = text.slice(entry.emittedLength);
+    const advanced = {
+        kind: 'speak_streaming',
+        toolCallId: entry.toolCallId,
+        extractor: entry.extractor,
+        emittedLength: text.length,
+    };
+    return {
+        next: setBlock(prev, idx, advanced),
+        frames: [tokenFrame(fragment, entry.toolCallId)],
+        endedSpeakIds: [],
+    };
+}
+function reduceEnd(prev, idx, tc) {
+    const finalText = readFinalText(tc);
+    const toolName = typeof tc.name === 'string' ? tc.name : null;
+    const toolCallId = typeof tc.id === 'string' ? tc.id : null;
+    const entry = prev.blocks.get(idx);
+    // Case 1: no prior block — provider skipped both toolcall_start AND
+    // toolcall_delta. Emit the full text in one go.
+    if (!entry) {
+        if (toolName === 'speak' && finalText.length > 0) {
+            return {
+                next: setBlock(prev, idx, { kind: 'speak_ended', toolCallId }),
+                frames: [tokenFrame(finalText, toolCallId), endFrame(toolCallId)],
+                endedSpeakIds: toolCallId ? [toolCallId] : [],
+            };
+        }
+        return noFrames(setBlock(prev, idx, { kind: 'not_speak' }));
+    }
+    // Case 2: block was unknown — last chance to learn the name.
+    if (entry.kind === 'unknown') {
+        if (toolName === 'speak' && finalText.length > 0) {
+            return {
+                next: setBlock(prev, idx, { kind: 'speak_ended', toolCallId }),
+                frames: [tokenFrame(finalText, toolCallId), endFrame(toolCallId)],
+                endedSpeakIds: toolCallId ? [toolCallId] : [],
+            };
+        }
+        return noFrames(setBlock(prev, idx, { kind: 'not_speak' }));
+    }
+    // Case 3: not_speak / speak_ended — nothing to do.
+    if (entry.kind !== 'speak_streaming')
+        return noFrames(prev);
+    // Case 4: speak_streaming → finalize.
+    //
+    // We don't trust the extractor as authoritative at end-of-stream
+    // (escapes mid-chunk could have errored, etc.). Instead diff against
+    // the SDK-provided `finalText` and flush whatever's missing. This
+    // single fallback covers all parser-failure modes by construction.
+    const resolvedId = entry.toolCallId ?? toolCallId;
+    const frames = [];
+    let emitted = entry.emittedLength;
+    if (finalText.length > emitted) {
+        frames.push(tokenFrame(finalText.slice(emitted), resolvedId));
+        emitted = finalText.length;
+    }
+    if (emitted > 0) {
+        frames.push(endFrame(resolvedId));
+    }
+    return {
+        next: setBlock(prev, idx, { kind: 'speak_ended', toolCallId: resolvedId }),
+        frames,
+        endedSpeakIds: resolvedId !== null && emitted > 0 ? [resolvedId] : [],
+    };
+}
+// ---------------------------------------------------------------------------
+// Pure helpers
+// ---------------------------------------------------------------------------
+/** Construct an outgoing `token` frame, attaching speak_id when known. */
+function tokenFrame(text, toolCallId) {
+    return toolCallId === null ? { type: 'token', text } : { type: 'token', text, speak_id: toolCallId };
+}
+/** Construct an outgoing `end` frame, attaching speak_id when known. */
+function endFrame(toolCallId) {
+    return toolCallId === null ? { type: 'end' } : { type: 'end', speak_id: toolCallId };
+}
+function setBlock(state, idx, block) {
+    // Cheap aliasing check: if the block reference is identical and
+    // already present at this index, skip the Map allocation. Lets
+    // toolcall_delta steps that don't change anything stay zero-alloc.
+    if (state.blocks.get(idx) === block)
+        return state;
+    const blocks = new Map(state.blocks);
+    blocks.set(idx, block);
+    return { blocks };
+}
+function partialBlock(partial, idx) {
+    const c = partial?.content;
+    if (!Array.isArray(c))
+        return undefined;
+    const b = c[idx];
+    if (b && typeof b === 'object')
+        return b;
+    return undefined;
+}
+function readFinalText(tc) {
+    const a = tc.arguments;
+    if (a && typeof a === 'object') {
+        const t = a.text;
+        if (typeof t === 'string')
+            return t;
+    }
+    return '';
+}
+/** Build the initial block state from the partial carried on
+ *  toolcall_start (or the first delta, when start is missing). */
+function blockFromPartial(idx, partial) {
+    const pb = partialBlock(partial, idx);
+    const name = pb?.name;
+    const id = typeof pb?.id === 'string' ? pb.id : null;
+    if (typeof name !== 'string')
+        return { kind: 'unknown' };
+    if (name === 'speak')
+        return makeSpeakStreaming(id);
+    return { kind: 'not_speak' };
+}
+/** Late name resolution for an `unknown` block. */
+function promoteUnknown(block, idx, partial) {
+    if (block.kind !== 'unknown')
+        return block;
+    const pb = partialBlock(partial, idx);
+    const name = pb?.name;
+    const id = typeof pb?.id === 'string' ? pb.id : null;
+    if (typeof name !== 'string')
+        return block;
+    if (name === 'speak')
+        return makeSpeakStreaming(id);
+    return { kind: 'not_speak' };
+}
+function makeSpeakStreaming(toolCallId) {
+    return {
+        kind: 'speak_streaming',
+        toolCallId,
+        extractor: new TextExtractor(),
+        emittedLength: 0,
+    };
+}

package/server/dist/voice/fsm/reducers/walkback.js ADDED Viewed

@@ -0,0 +1,73 @@
+// Concern C: Walkback / context rewrite reducer.
+//
+// When speechmux signals barge-in (abort or rollback), we mark walkback
+// `pending` with the speak's `targetSpeakToolCallId`. The toolCallId
+// comes from the wire frame's `speak_id` (echoed by speechmux from the
+// chunk that was actively playing) when present; otherwise we fall back
+// to the runtime-tracked `lastEmittedSpeakId`.
+//
+// On every `sdk:context` event we run `walkBack(...)`:
+//   - idle → just strip trailing aborted-empty assistants.
+//   - pending → strip + rewrite the targeted speak block to `heardText`,
+//               drop any blocks/messages that came after.
+//
+// The previous design captured the in-flight assistant message snapshot
+// and used string-prefix accumulation across content blocks to identify
+// what was heard. That broke whenever a turn had multiple speak()
+// calls or whenever the snapshot was stale. The id-based design has no
+// such ambiguity.
+import { VOICE_INTERRUPT_CUSTOM_TYPE } from '../../../../../shared/dist/index.js';
+import { walkBack } from '../../walk-back.js';
+/** Resolve which speak() id to walk back to. Prefers what speechmux
+ *  echoes; falls back to runtime-tracked latest. Returns null if neither
+ *  is available (we'll degrade gracefully — abort the agent but skip
+ *  the rewrite). */
+function resolveTarget(frameSpeakId, lastEmittedSpeakId) {
+    if (frameSpeakId)
+        return frameSpeakId;
+    return lastEmittedSpeakId;
+}
+export function reduceWalkback(prev, lastEmittedSpeakId, event) {
+    switch (event.type) {
+        case 'ws:incoming': {
+            const f = event.frame;
+            if (f.type === 'user')
+                return { next: prev, actions: [] };
+            const heardText = f.type === 'rollback' ? f.heard_text : '';
+            const data = {
+                heard_text: heardText,
+                kind: f.type === 'rollback' ? 'rollback' : 'abort',
+            };
+            const target = resolveTarget(f.speak_id, lastEmittedSpeakId);
+            const actions = [{ kind: 'abort_agent' }, { kind: 'append_custom_entry', customType: VOICE_INTERRUPT_CUSTOM_TYPE, data }];
+            if (target === null) {
+                // No target available → can't rewrite. Just abort + record the
+                // interrupt entry; the next sdk:context will only strip
+                // aborted-empty assistants.
+                return { next: { kind: 'idle' }, actions };
+            }
+            return {
+                next: { kind: 'pending', heardText, targetSpeakToolCallId: target },
+                actions,
+            };
+        }
+        case 'sdk:context': {
+            const rollback = prev.kind === 'pending' ? { heardText: prev.heardText, targetSpeakToolCallId: prev.targetSpeakToolCallId } : null;
+            const rewritten = walkBack({
+                messages: event.messages,
+                rollback,
+            });
+            return {
+                next: { kind: 'idle' },
+                actions: [{ kind: 'rewrite_context', messages: rewritten }],
+            };
+        }
+        case 'eb:deactivate':
+            return { next: { kind: 'idle' }, actions: [] };
+        default:
+            return { next: prev, actions: [] };
+    }
+}
+export function applyWalkbackResult(prev, r) {
+    return { ...prev, walkback: r.next };
+}

package/server/dist/voice/fsm/state.js ADDED Viewed

@@ -0,0 +1,21 @@
+// Voice extension runtime state.
+//
+// Three orthogonal concerns are modelled as parallel sub-machines that
+// share a single top-level state record. Sub-reducers in
+// `reducers/{lifecycle,streaming,walkback}.ts` operate only on their own
+// slice; the top-level dispatcher in `reducer.ts` folds them together.
+//
+// The `TextExtractor` instance held by `speak_streaming` blocks is the one
+// piece of impurity inside this state — necessary because streaming JSON
+// argument parsing can't be replayed lazily. It's owned by the block and
+// dropped when the block transitions to `speak_ended` or the message
+// resets.
+export function initialState() {
+    return {
+        lifecycle: { kind: 'dormant' },
+        message: { blocks: new Map() },
+        walkback: { kind: 'idle' },
+        interpreterApplied: false,
+        lastEmittedSpeakId: null,
+    };
+}

package/server/dist/voice/fsm/text-extractor.js ADDED Viewed

@@ -0,0 +1,128 @@
+// Streaming extractor for the `text` value of a `speak({text:"..."})`
+// tool argument JSON.
+//
+// Replaces our previous use of `@streamparser/json` (which is callback-
+// based and thus required closures into reducer state). This extractor
+// is fully synchronous: callers write JSON chunks and read the extracted
+// text via `currentText()`. All mutation is encapsulated inside the
+// extractor object; the FSM treats it as an opaque streaming buffer.
+//
+// **Scope.** This handles only the JSON shape `{"text": "<string>"}` —
+// the exact shape of the `speak` tool's argument schema (single string
+// field). It is *not* a general JSON parser. If we ever add more args
+// to `speak`, we'll need to extend it (or reach for streamparser
+// again). The trade-off is: ~80 lines of focused code vs a 3-letter
+// dependency that introduced a closure-binding bug.
+//
+// **Robustness.** The extractor handles:
+//   - leading whitespace before / inside the object
+//   - the `text` key appearing first (not nested or preceded by other
+//     keys — the schema enforces this)
+//   - all JSON string escapes including `\uXXXX`
+//   - chunk boundaries falling inside escape sequences (the buffer
+//     holds onto unconsumed bytes until the next write provides the
+//     rest)
+// It does NOT handle:
+//   - object/array values (no need; `text` is a string)
+//   - non-`text` keys appearing before `text`
+const HEAD_PATTERN = /"text"\s*:\s*"/;
+const SIMPLE_ESCAPES = {
+    '"': '"',
+    '\\': '\\',
+    '/': '/',
+    n: '\n',
+    r: '\r',
+    t: '\t',
+    b: '\b',
+    f: '\f',
+};
+export class TextExtractor {
+    phase = 'pre_string';
+    /** Unconsumed input bytes that follow the cursor. */
+    buffer = '';
+    /** Decoded text accumulated so far. */
+    text = '';
+    /** Feed another JSON chunk. Idempotent once `closed` or `errored`. */
+    write(chunk) {
+        if (this.phase === 'closed' || this.phase === 'errored')
+            return;
+        if (chunk.length === 0)
+            return;
+        this.buffer += chunk;
+        this.advance();
+    }
+    /** The decoded value of `$.text` accumulated so far. Monotonic until
+     *  `closed`/`errored`. */
+    currentText() {
+        return this.text;
+    }
+    /** Whether the closing `"` has been observed. */
+    isClosed() {
+        return this.phase === 'closed';
+    }
+    /** Whether parsing failed (e.g. malformed escape sequence). The FSM's
+     *  toolcall_end fallback fills any remaining gap from the SDK's
+     *  authoritative final text, so an errored extractor is recoverable
+     *  at end-of-stream. */
+    isErrored() {
+        return this.phase === 'errored';
+    }
+    // -------------------------------------------------------------------------
+    advance() {
+        if (this.phase === 'pre_string')
+            this.advancePreString();
+        if (this.phase === 'in_string')
+            this.advanceInString();
+    }
+    advancePreString() {
+        const m = HEAD_PATTERN.exec(this.buffer);
+        if (!m)
+            return; // wait for more input
+        // Drop everything up to and including the opening quote.
+        this.buffer = this.buffer.slice(m.index + m[0].length);
+        this.phase = 'in_string';
+    }
+    advanceInString() {
+        let i = 0;
+        while (i < this.buffer.length) {
+            const c = this.buffer.charCodeAt(i);
+            if (c === 0x5c /* \ */) {
+                if (i + 1 >= this.buffer.length)
+                    break; // wait for the escape char
+                const esc = this.buffer[i + 1];
+                if (esc === 'u') {
+                    if (i + 6 > this.buffer.length)
+                        break; // wait for the 4 hex digits
+                    const hex = this.buffer.slice(i + 2, i + 6);
+                    if (!/^[0-9a-fA-F]{4}$/.test(hex)) {
+                        this.phase = 'errored';
+                        return;
+                    }
+                    this.text += String.fromCharCode(parseInt(hex, 16));
+                    i += 6;
+                }
+                else if (esc in SIMPLE_ESCAPES) {
+                    this.text += SIMPLE_ESCAPES[esc];
+                    i += 2;
+                }
+                else {
+                    // Invalid escape; bail.
+                    this.phase = 'errored';
+                    return;
+                }
+            }
+            else if (c === 0x22 /* " */) {
+                // End of string. Consume up to and including the closing quote.
+                this.phase = 'closed';
+                this.buffer = this.buffer.slice(i + 1);
+                return;
+            }
+            else {
+                this.text += this.buffer[i];
+                i += 1;
+            }
+        }
+        // Retain unconsumed tail (a partial escape at boundary).
+        this.buffer = this.buffer.slice(i);
+    }
+}