open-agents-ai 0.187.255 → 0.187.257

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/dist/index.js +297 -93
  2. package/package.json +1 -1
package/dist/index.js CHANGED
@@ -327046,32 +327046,52 @@ __export(voicechat_exports, {
  VoiceChatSession: () => VoiceChatSession
  });
  import { EventEmitter as EventEmitter10 } from "node:events";
- var VoiceChatSession;
+ var VAD_SILENCE_MS, MAX_SEGMENT_MS, SUMMARY_INJECTION_INTERVAL, MAX_CONTEXT_TURNS, SYSTEM_PROMPT2, VoiceChatSession;
  var init_voicechat = __esm({
  "packages/cli/src/tui/voicechat.ts"() {
  "use strict";
+ VAD_SILENCE_MS = 1100;
+ MAX_SEGMENT_MS = 6500;
+ SUMMARY_INJECTION_INTERVAL = 4;
+ MAX_CONTEXT_TURNS = 20;
+ SYSTEM_PROMPT2 = `You are a voice assistant having a live spoken conversation. Keep responses extremely brief — 1-2 sentences max. You're speaking aloud, not writing. Be conversational, direct, and helpful. Don't use markdown, bullet points, or formatting — just natural speech. If you don't know something, say so briefly. Do not over-think — respond quickly and concisely.`;
  VoiceChatSession = class extends EventEmitter10 {
  voice;
  listen;
+ backendUrl;
+ model;
+ apiKey;
  runner;
+ // State machine
+ _state = "IDLE";
  active = false;
- silenceTimeout;
+ // Conversation context — own turns, separate from main agent
+ context = [];
+ turnCount = 0;
+ // VAD segment capture
+ captureBuffer = "";
+ captureStartTime = 0;
+ silenceTimer = null;
+ maxSegmentTimer = null;
+ // Abort control for inference
+ abortController = null;
+ // Callbacks
  onStatus;
  onUserSpeech;
  onPartialTranscript;
  onAgentSpeech;
- transcriptBuffer = "";
- silenceTimer = null;
- agentTextBuffer = "";
- speakQueue = [];
- isSpeaking = false;
- lastSpokenText = "";
+ onStateChange;
+ // Bound handlers for cleanup
+ _onTranscript = null;
+ _onError = null;
  constructor(opts) {
  super();
  this.voice = opts.voice;
  this.listen = opts.listen;
- this.runner = opts.runner;
- this.silenceTimeout = opts.silenceTimeout ?? 3;
+ this.backendUrl = opts.backendUrl.replace(/\/+$/, "");
+ this.model = opts.model;
+ this.apiKey = opts.apiKey ?? "";
+ this.runner = opts.runner ?? null;
  this.onStatus = opts.onStatus ?? (() => {
  });
  this.onUserSpeech = opts.onUserSpeech ?? (() => {
@@ -327080,11 +327100,28 @@ var init_voicechat = __esm({
  });
  this.onAgentSpeech = opts.onAgentSpeech ?? (() => {
  });
+ this.onStateChange = opts.onStateChange ?? (() => {
+ });
+ }
+ get state() {
+ return this._state;
  }
  get isActive() {
  return this.active;
  }
- /** Start the voice chat session — begins listening and wires agent responses to TTS */
+ // ---------------------------------------------------------------------------
+ // State transitions
+ // ---------------------------------------------------------------------------
+ setState(next) {
+ if (this._state === next) return;
+ const prev = this._state;
+ this._state = next;
+ this.onStateChange(next);
+ this.emit("stateChange", { from: prev, to: next });
+ }
+ // ---------------------------------------------------------------------------
+ // Start / Stop
+ // ---------------------------------------------------------------------------
  async start() {
  if (this.active) return;
  if (!this.voice.enabled || !this.voice.ready) {
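
The hunk above replaces the old ad-hoc flags with an explicit session state machine. As a reading aid, here is a minimal typed sketch of that transition logic; the state names (IDLE, LISTENING, CAPTURING, TRANSCRIBING, THINKING, SPEAKING) appear in the diff, while the VoiceState union, class shape, and logging callback are illustrative assumptions, not the package's public API.

    // Reading aid, not package code: minimal typed sketch of the transitions.
    type VoiceState = "IDLE" | "LISTENING" | "CAPTURING" | "TRANSCRIBING" | "THINKING" | "SPEAKING";

    class StateMachineSketch {
      private _state: VoiceState = "IDLE";
      constructor(private onStateChange: (s: VoiceState) => void = () => {}) {}
      get state(): VoiceState {
        return this._state;
      }
      // Mirrors setState in the diff: no-op on a duplicate state, notify on change.
      setState(next: VoiceState): void {
        if (this._state === next) return;
        this._state = next;
        this.onStateChange(next);
      }
    }

    const sm = new StateMachineSketch((s) => console.log(`state -> ${s}`));
    sm.setState("LISTENING"); // logs: state -> LISTENING
    sm.setState("LISTENING"); // duplicate transition is ignored
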
@@ -327092,98 +327129,278 @@ var init_voicechat = __esm({
  await this.voice.toggle();
  }
  this.active = true;
- this.onStatus("Voice chat active — speak naturally, agent will respond");
- this.listen.on("transcript", (evt) => {
- const { text, isFinal } = evt;
- if (!text?.trim()) return;
- this.transcriptBuffer = text.trim();
- this.onPartialTranscript(this.transcriptBuffer);
- if (this.silenceTimer) clearTimeout(this.silenceTimer);
- if (isFinal || this.silenceTimeout === 0) {
- this.submitTranscript();
+ this.context = [{ role: "system", content: SYSTEM_PROMPT2 }];
+ this.turnCount = 0;
+ this.onStatus("VoiceChat v2 active — state machine: LISTENING");
+ this._onTranscript = (...args) => {
+ let text;
+ let isFinal;
+ if (typeof args[0] === "object" && args[0] !== null) {
+ const evt = args[0];
+ text = evt.text ?? "";
+ isFinal = evt.isFinal ?? false;
  } else {
- this.silenceTimer = setTimeout(() => {
- this.submitTranscript();
- }, this.silenceTimeout * 1e3);
+ text = String(args[0] ?? "");
+ isFinal = Boolean(args[1]);
  }
- });
- this.runner.onEvent((event) => {
- if (!this.active) return;
- if (event.type === "assistant_text" && event.content) {
- const text = event.content.trim();
- if (!text || text.length < 3) return;
- if (text === this.lastSpokenText) return;
- this.lastSpokenText = text;
- this.onAgentSpeech(text);
- this.queueSpeak(text);
- }
- });
- this.listen.on("error", (err) => {
+ if (!text.trim()) return;
+ this.handleTranscript(text.trim(), isFinal);
+ };
+ this._onError = (err) => {
  const msg = err instanceof Error ? err.message : String(err);
- this.onStatus(`ASR error (voice chat continues without mic): ${msg.slice(0, 80)}`);
- });
+ this.onStatus(`ASR error (voicechat continues without mic): ${msg.slice(0, 80)}`);
+ };
+ this.listen.on("transcript", this._onTranscript);
+ this.listen.on("error", this._onError);
  try {
  await this.listen.start();
- this.onStatus("Mic active — listening...");
+ this.setState("LISTENING");
+ this.onStatus("Mic active — LISTENING for speech...");
  } catch (err) {
- this.onStatus(`Mic failed: ${err instanceof Error ? err.message : String(err)}. Voice chat active without mic — agent responses will still be spoken.`);
+ this.onStatus(
+ `Mic failed: ${err instanceof Error ? err.message : String(err)}. VoiceChat active without mic.`
+ );
+ this.setState("LISTENING");
  }
  }
- /** Stop the voice chat session */
  async stop() {
  if (!this.active) return;
  this.active = false;
+ if (this.abortController) {
+ this.abortController.abort();
+ this.abortController = null;
+ }
  if (this.silenceTimer) {
  clearTimeout(this.silenceTimer);
  this.silenceTimer = null;
  }
- if (this.transcriptBuffer.trim()) {
- this.submitTranscript();
+ if (this.maxSegmentTimer) {
+ clearTimeout(this.maxSegmentTimer);
+ this.maxSegmentTimer = null;
+ }
+ if (this.captureBuffer.trim() && (this._state === "CAPTURING" || this._state === "TRANSCRIBING")) {
+ this.finalizeSegment();
+ }
+ if (this._onTranscript) {
+ this.listen.removeAllListeners("transcript");
+ this._onTranscript = null;
+ }
+ if (this._onError) {
+ this.listen.removeAllListeners("error");
+ this._onError = null;
  }
  try {
  await this.listen.stop();
  } catch {
  }
- this.listen.removeAllListeners("transcript");
- this.speakQueue.length = 0;
- this.onStatus("Voice chat ended");
+ this.setState("IDLE");
+ this.onStatus("VoiceChat ended");
  this.emit("stopped");
  }
- /** Submit the current transcript buffer to the agent */
- submitTranscript() {
- const text = this.transcriptBuffer.trim();
- if (!text) return;
- this.transcriptBuffer = "";
+ // ---------------------------------------------------------------------------
+ // Transcript handling — VAD-style segment capture (Voryn pattern)
+ // ---------------------------------------------------------------------------
+ handleTranscript(text, isFinal) {
+ if (!this.active) return;
+ if (this._state !== "LISTENING" && this._state !== "CAPTURING") {
+ return;
+ }
+ if (this._state === "LISTENING") {
+ this.setState("CAPTURING");
+ this.captureBuffer = "";
+ this.captureStartTime = Date.now();
+ this.maxSegmentTimer = setTimeout(() => {
+ if (this._state === "CAPTURING") {
+ this.finalizeSegment();
+ }
+ }, MAX_SEGMENT_MS);
+ }
+ this.captureBuffer = text;
+ this.onPartialTranscript(text);
+ if (this.silenceTimer) clearTimeout(this.silenceTimer);
+ if (isFinal) {
+ this.finalizeSegment();
+ } else {
+ this.silenceTimer = setTimeout(() => {
+ if (this._state === "CAPTURING") {
+ this.finalizeSegment();
+ }
+ }, VAD_SILENCE_MS);
+ }
+ }
+ // ---------------------------------------------------------------------------
+ // Segment finalization → Transcribing → Thinking → Speaking
+ // ---------------------------------------------------------------------------
+ finalizeSegment() {
+ const text = this.captureBuffer.trim();
  if (this.silenceTimer) {
  clearTimeout(this.silenceTimer);
  this.silenceTimer = null;
  }
+ if (this.maxSegmentTimer) {
+ clearTimeout(this.maxSegmentTimer);
+ this.maxSegmentTimer = null;
+ }
+ this.captureBuffer = "";
+ if (!text) {
+ this.setState("LISTENING");
+ return;
+ }
+ this.setState("TRANSCRIBING");
  this.onUserSpeech(text);
- this.runner.injectUserMessage(
- `[VOICE] The user spoke (live microphone): "${text}"
- This is a live voice conversation running alongside your work. Respond briefly and naturally — your text response will be spoken aloud via TTS. If they ask you to look something up, acknowledge first then research. After responding, continue your current task.`
- );
+ this.context.push({ role: "user", content: text });
+ this.turnCount++;
+ while (this.context.length > MAX_CONTEXT_TURNS + 1) {
+ this.context.splice(1, 1);
+ }
+ this.think();
  }
- /** Queue text for TTS playback — non-blocking, processes sequentially */
- queueSpeak(text) {
- this.speakQueue.push(text);
- if (!this.isSpeaking) {
- this.processQueue();
+ // ---------------------------------------------------------------------------
+ // Direct Ollama inference (not through main agent runner)
+ // ---------------------------------------------------------------------------
+ async think() {
+ if (!this.active) return;
+ this.setState("THINKING");
+ this.onStatus("Thinking...");
+ this.abortController = new AbortController();
+ try {
+ const response = await this.streamOllamaInference(this.abortController.signal);
+ if (!this.active) return;
+ if (response.trim()) {
+ this.context.push({ role: "assistant", content: response.trim() });
+ this.setState("SPEAKING");
+ this.onAgentSpeech(response.trim());
+ this.voice.speak(response.trim());
+ if (this.runner && this.turnCount % SUMMARY_INJECTION_INTERVAL === 0) {
+ this.injectSummary();
+ }
+ const estimatedMs = Math.max(1500, response.length / 5 * (6e4 / 150));
+ await new Promise((r2) => setTimeout(r2, estimatedMs));
+ }
+ } catch (err) {
+ if (!this.active) return;
+ const msg = err instanceof Error ? err.message : String(err);
+ if (!msg.includes("abort")) {
+ this.onStatus(`Inference error: ${msg.slice(0, 100)}`);
+ }
+ } finally {
+ this.abortController = null;
+ }
+ if (this.active) {
+ this.setState("LISTENING");
+ this.onStatus("LISTENING...");
  }
  }
- /** Process the TTS queue — speaks one item at a time */
- async processQueue() {
- if (this.isSpeaking) return;
- this.isSpeaking = true;
- while (this.speakQueue.length > 0 && this.active) {
- const text = this.speakQueue.shift();
- try {
- this.voice.speak(text);
- await new Promise((r2) => setTimeout(r2, 500));
- } catch {
+ /**
+ * Stream inference. Tries native Ollama /api/chat first (supports think:false
+ * for reasoning models), falls back to OpenAI-compat /v1/chat/completions.
+ */
+ async streamOllamaInference(signal) {
+ const baseUrl = this.backendUrl.replace(/\/v1\/?$/, "");
+ const headers = { "Content-Type": "application/json" };
+ if (this.apiKey) headers["Authorization"] = `Bearer ${this.apiKey}`;
+ try {
+ const nativeBody = JSON.stringify({
+ model: this.model,
+ messages: this.context,
+ stream: true,
+ think: false,
+ // Disable reasoning — voice chat needs fast, direct responses
+ options: { temperature: 0.7, num_predict: 256 }
+ });
+ const res2 = await fetch(`${baseUrl}/api/chat`, {
+ method: "POST",
+ headers,
+ body: nativeBody,
+ signal
+ });
+ if (res2.ok) {
+ return await this.parseOllamaNativeStream(res2, signal);
  }
+ } catch (err) {
+ const msg = err instanceof Error ? err.message : "";
+ if (msg.includes("abort")) throw err;
  }
- this.isSpeaking = false;
+ const openaiBody = JSON.stringify({
+ model: this.model,
+ messages: this.context,
+ stream: true,
+ temperature: 0.7,
+ max_tokens: 1024
+ });
+ const endpoint = baseUrl.includes("/v1") ? `${baseUrl}/chat/completions` : `${baseUrl}/v1/chat/completions`;
+ const res = await fetch(endpoint, { method: "POST", headers, body: openaiBody, signal });
+ if (!res.ok) {
+ const errText = await res.text().catch(() => "unknown");
+ throw new Error(`Inference ${res.status}: ${errText.slice(0, 200)}`);
+ }
+ return await this.parseOpenAIStream(res);
+ }
+ /** Parse native Ollama /api/chat streaming response (NDJSON, not SSE) */
+ async parseOllamaNativeStream(res, _signal) {
+ const reader = res.body?.getReader();
+ if (!reader) throw new Error("No response body");
+ const decoder = new TextDecoder();
+ let fullText = "";
+ let buffer2 = "";
+ while (true) {
+ const { done, value: value2 } = await reader.read();
+ if (done) break;
+ buffer2 += decoder.decode(value2, { stream: true });
+ const lines = buffer2.split("\n");
+ buffer2 = lines.pop() ?? "";
+ for (const line of lines) {
+ if (!line.trim()) continue;
+ try {
+ const parsed = JSON.parse(line);
+ const content = parsed.message?.content;
+ if (content) fullText += content;
+ if (parsed.done) return fullText;
+ } catch {
+ }
+ }
+ }
+ return fullText;
+ }
+ /** Parse OpenAI-compat SSE streaming response */
+ async parseOpenAIStream(res) {
+ const reader = res.body?.getReader();
+ if (!reader) throw new Error("No response body");
+ const decoder = new TextDecoder();
+ let fullText = "";
+ let buffer2 = "";
+ while (true) {
+ const { done, value: value2 } = await reader.read();
+ if (done) break;
+ buffer2 += decoder.decode(value2, { stream: true });
+ const lines = buffer2.split("\n");
+ buffer2 = lines.pop() ?? "";
+ for (const line of lines) {
+ const trimmed = line.trim();
+ if (!trimmed || !trimmed.startsWith("data: ")) continue;
+ const data = trimmed.slice(6);
+ if (data === "[DONE]") continue;
+ try {
+ const parsed = JSON.parse(data);
+ const delta = parsed.choices?.[0]?.delta?.content;
+ if (delta) fullText += delta;
+ } catch {
+ }
+ }
+ }
+ return fullText;
+ }
+ // ---------------------------------------------------------------------------
+ // Summary injection to main agent
+ // ---------------------------------------------------------------------------
+ injectSummary() {
+ if (!this.runner) return;
+ const recentTurns = this.context.filter((t2) => t2.role !== "system").slice(-6).map((t2) => `${t2.role === "user" ? "User" : "Assistant"}: ${t2.content}`).join("\n");
+ this.runner.injectUserMessage(
+ `[VOICECHAT SUMMARY] The following is a summary of the recent voice conversation happening in parallel. You don't need to respond to this directly — it's for your awareness. Continue your current task.
+
+ ${recentTurns}`
+ );
  }
  };
  }
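
A note on the hunk above: streamOllamaInference tries Ollama's native /api/chat endpoint first (which accepts think: false to suppress reasoning), then falls back to an OpenAI-compatible /v1/chat/completions endpoint, so the class carries two stream parsers: one for NDJSON (one JSON object per line) and one for SSE ("data: {...}" lines terminated by "data: [DONE]"). The standalone sketch below shows the format difference on single buffered chunks; the sample payloads are fabricated for illustration and follow the documented Ollama and OpenAI streaming shapes, not data captured from this package.

    // Illustrative sample chunks (fabricated), one per stream format.
    const ndjsonChunk = '{"message":{"content":"Hi"},"done":false}\n{"message":{"content":"!"},"done":true}\n';
    const sseChunk = 'data: {"choices":[{"delta":{"content":"Hi"}}]}\n\ndata: [DONE]\n\n';

    // Ollama native: each non-empty line is a complete JSON object.
    function parseNdjson(chunk: string): string {
      let out = "";
      for (const line of chunk.split("\n")) {
        if (!line.trim()) continue;
        const parsed = JSON.parse(line);
        if (parsed.message?.content) out += parsed.message.content;
      }
      return out;
    }

    // OpenAI-compat: only "data: " lines carry payloads; "[DONE]" is a sentinel.
    function parseSse(chunk: string): string {
      let out = "";
      for (const line of chunk.split("\n")) {
        if (!line.startsWith("data: ")) continue;
        const data = line.slice(6);
        if (data === "[DONE]") continue;
        const delta = JSON.parse(data).choices?.[0]?.delta?.content;
        if (delta) out += delta;
      }
      return out;
    }

    console.log(parseNdjson(ndjsonChunk)); // "Hi!"
    console.log(parseSse(sseChunk)); // "Hi"

The parsers in the diff additionally carry a remainder buffer across reads, since a network chunk can end mid-line.
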
@@ -331399,7 +331616,7 @@ Respond concisely and safely. Remember: you are talking to the general public.`;
  getCallUrl() {
  return voiceSession?.tunnelUrl ?? null;
  },
- // --- /voicechat: async voice conversation parallel to agent loop ---
+ // --- /voicechat: Voryn-style state machine voice conversation ---
  async voiceChatStart() {
  if (_voiceChatSession?.isActive) return;
  if (!voiceEngine.enabled || !voiceEngine.ready) {
@@ -331411,36 +331628,20 @@ Respond concisely and safely. Remember: you are talking to the general public.`;
  const { VoiceChatSession: VoiceChatSession2 } = await Promise.resolve().then(() => (init_voicechat(), voicechat_exports));
  const { ListenEngine: ListenEngine2 } = await Promise.resolve().then(() => (init_listen(), listen_exports));
  const listenEng = new ListenEngine2();
- const dynamicRunner = {
+ const summaryRunner = {
  injectUserMessage(content) {
  if (activeTask?.runner) {
  activeTask.runner.injectUserMessage(content);
- } else {
- const match = content.match(/:\s*"([^"]+)"/);
- const rawText = match ? match[1] : content;
- if (rl && rawText.trim()) {
- rl.setLine(rawText.trim());
- rl.emit("line", rawText.trim());
- }
- }
- },
- onEvent(handler) {
- const checkInterval = setInterval(() => {
- if (activeTask?.runner) {
- activeTask.runner.onEvent(handler);
- clearInterval(checkInterval);
- }
- }, 500);
- if (activeTask?.runner) {
- activeTask.runner.onEvent(handler);
- clearInterval(checkInterval);
  }
  }
  };
  _voiceChatSession = new VoiceChatSession2({
  voice: voiceEngine,
  listen: listenEng,
- runner: dynamicRunner,
+ backendUrl: currentConfig.backendUrl,
+ model: currentConfig.model,
+ apiKey: currentConfig.apiKey,
+ runner: summaryRunner,
  onStatus(msg) {
  writeContent(() => renderInfo(`[voicechat] ${msg}`));
  },
@@ -331454,6 +331655,9 @@ Respond concisely and safely. Remember: you are talking to the general public.`;
  },
  onAgentSpeech(text) {
  writeContent(() => renderInfo(`\x1B[38;5;178m[agent]\x1B[0m ${text.slice(0, 120)}`));
+ },
+ onStateChange(state) {
+ writeContent(() => renderInfo(`\x1B[38;5;243m[voicechat] ${state}\x1B[0m`));
  }
  });
  await _voiceChatSession.start();
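
Taken together, the dist/index.js changes implement silence-debounced segment capture: every partial transcript resets an 1100 ms silence timer (VAD_SILENCE_MS), and a 6500 ms hard cap (MAX_SEGMENT_MS) finalizes long utterances regardless. A self-contained sketch of that timer pattern follows; the two constants match the diff, but the SegmentCapture class and its callback are assumptions made for clarity, not code from the package.

    // Illustrative debounce sketch of the VAD segment capture in the diff.
    const VAD_SILENCE_MS = 1100; // finalize after this much silence
    const MAX_SEGMENT_MS = 6500; // hard cap on a single speech segment

    class SegmentCapture {
      private buffer = "";
      private silenceTimer: ReturnType<typeof setTimeout> | null = null;
      private maxTimer: ReturnType<typeof setTimeout> | null = null;

      constructor(private onSegment: (text: string) => void) {}

      // Called for every partial transcript; the latest text replaces the buffer.
      push(text: string): void {
        if (!this.maxTimer) {
          // First partial of a segment starts the hard cap.
          this.maxTimer = setTimeout(() => this.finalize(), MAX_SEGMENT_MS);
        }
        this.buffer = text;
        if (this.silenceTimer) clearTimeout(this.silenceTimer);
        // Silence debounce: each new partial pushes finalization back.
        this.silenceTimer = setTimeout(() => this.finalize(), VAD_SILENCE_MS);
      }

      private finalize(): void {
        if (this.silenceTimer) clearTimeout(this.silenceTimer);
        if (this.maxTimer) clearTimeout(this.maxTimer);
        this.silenceTimer = null;
        this.maxTimer = null;
        const text = this.buffer.trim();
        this.buffer = "";
        if (text) this.onSegment(text);
      }
    }

    const capture = new SegmentCapture((t) => console.log(`segment: ${t}`));
    capture.push("hello");
    capture.push("hello there"); // resets the 1100 ms silence timer
    // After 1100 ms with no further partials, logs: segment: hello there
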
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "open-agents-ai",
- "version": "0.187.255",
+ "version": "0.187.257",
  "description": "AI coding agent powered by open-source models (Ollama/vLLM) — interactive TUI with agentic tool-calling loop",
  "type": "module",
  "main": "./dist/index.js",