open-agents-ai 0.187.255 → 0.187.257

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/dist/index.js +297 -93
  2. package/package.json +1 -1
package/dist/index.js CHANGED
@@ -327046,32 +327046,52 @@ __export(voicechat_exports, {
  VoiceChatSession: () => VoiceChatSession
  });
  import { EventEmitter as EventEmitter10 } from "node:events";
- var VoiceChatSession;
+ var VAD_SILENCE_MS, MAX_SEGMENT_MS, SUMMARY_INJECTION_INTERVAL, MAX_CONTEXT_TURNS, SYSTEM_PROMPT2, VoiceChatSession;
  var init_voicechat = __esm({
  "packages/cli/src/tui/voicechat.ts"() {
  "use strict";
+ VAD_SILENCE_MS = 1100;
+ MAX_SEGMENT_MS = 6500;
+ SUMMARY_INJECTION_INTERVAL = 4;
+ MAX_CONTEXT_TURNS = 20;
+ SYSTEM_PROMPT2 = `You are a voice assistant having a live spoken conversation. Keep responses extremely brief — 1-2 sentences max. You're speaking aloud, not writing. Be conversational, direct, and helpful. Don't use markdown, bullet points, or formatting — just natural speech. If you don't know something, say so briefly. Do not over-think — respond quickly and concisely.`;
  VoiceChatSession = class extends EventEmitter10 {
  voice;
  listen;
+ backendUrl;
+ model;
+ apiKey;
  runner;
+ // State machine
+ _state = "IDLE";
  active = false;
- silenceTimeout;
+ // Conversation context — own turns, separate from main agent
+ context = [];
+ turnCount = 0;
+ // VAD segment capture
+ captureBuffer = "";
+ captureStartTime = 0;
+ silenceTimer = null;
+ maxSegmentTimer = null;
+ // Abort control for inference
+ abortController = null;
+ // Callbacks
  onStatus;
  onUserSpeech;
  onPartialTranscript;
  onAgentSpeech;
- transcriptBuffer = "";
- silenceTimer = null;
- agentTextBuffer = "";
- speakQueue = [];
- isSpeaking = false;
- lastSpokenText = "";
+ onStateChange;
+ // Bound handlers for cleanup
+ _onTranscript = null;
+ _onError = null;
  constructor(opts) {
  super();
  this.voice = opts.voice;
  this.listen = opts.listen;
- this.runner = opts.runner;
- this.silenceTimeout = opts.silenceTimeout ?? 3;
+ this.backendUrl = opts.backendUrl.replace(/\/+$/, "");
+ this.model = opts.model;
+ this.apiKey = opts.apiKey ?? "";
+ this.runner = opts.runner ?? null;
  this.onStatus = opts.onStatus ?? (() => {
  });
  this.onUserSpeech = opts.onUserSpeech ?? (() => {
@@ -327080,11 +327100,28 @@ var init_voicechat = __esm({
  });
  this.onAgentSpeech = opts.onAgentSpeech ?? (() => {
  });
+ this.onStateChange = opts.onStateChange ?? (() => {
+ });
+ }
+ get state() {
+ return this._state;
  }
  get isActive() {
  return this.active;
  }
- /** Start the voice chat session — begins listening and wires agent responses to TTS */
+ // ---------------------------------------------------------------------------
+ // State transitions
+ // ---------------------------------------------------------------------------
+ setState(next) {
+ if (this._state === next) return;
+ const prev = this._state;
+ this._state = next;
+ this.onStateChange(next);
+ this.emit("stateChange", { from: prev, to: next });
+ }
+ // ---------------------------------------------------------------------------
+ // Start / Stop
+ // ---------------------------------------------------------------------------
  async start() {
  if (this.active) return;
  if (!this.voice.enabled || !this.voice.ready) {
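
The hunk above replaces the old ad-hoc flags with an explicit session state machine. As a reading aid, here is a minimal typed sketch of that transition logic; the state names (IDLE, LISTENING, CAPTURING, TRANSCRIBING, THINKING, SPEAKING) appear in the diff, while the VoiceState union, class shape, and logging callback are illustrative assumptions, not the package's public API.

    // Reading aid, not package code: minimal typed sketch of the transitions.
    type VoiceState = "IDLE" | "LISTENING" | "CAPTURING" | "TRANSCRIBING" | "THINKING" | "SPEAKING";

    class StateMachineSketch {
      private _state: VoiceState = "IDLE";
      constructor(private onStateChange: (s: VoiceState) => void = () => {}) {}
      get state(): VoiceState {
        return this._state;
      }
      // Mirrors setState in the diff: no-op on a duplicate state, notify on change.
      setState(next: VoiceState): void {
        if (this._state === next) return;
        this._state = next;
        this.onStateChange(next);
      }
    }

    const sm = new StateMachineSketch((s) => console.log(`state -> ${s}`));
    sm.setState("LISTENING"); // logs: state -> LISTENING
    sm.setState("LISTENING"); // duplicate transition is ignored
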
@@ -327092,98 +327129,278 @@ var init_voicechat = __esm({
  await this.voice.toggle();
  }
  this.active = true;
- this.onStatus("Voice chat active — speak naturally, agent will respond");
- this.listen.on("transcript", (evt) => {
- const { text, isFinal } = evt;
- if (!text?.trim()) return;
- this.transcriptBuffer = text.trim();
- this.onPartialTranscript(this.transcriptBuffer);
- if (this.silenceTimer) clearTimeout(this.silenceTimer);
- if (isFinal || this.silenceTimeout === 0) {
- this.submitTranscript();
+ this.context = [{ role: "system", content: SYSTEM_PROMPT2 }];
+ this.turnCount = 0;
+ this.onStatus("VoiceChat v2 active — state machine: LISTENING");
+ this._onTranscript = (...args) => {
+ let text;
+ let isFinal;
+ if (typeof args[0] === "object" && args[0] !== null) {
+ const evt = args[0];
+ text = evt.text ?? "";
+ isFinal = evt.isFinal ?? false;
  } else {
- this.silenceTimer = setTimeout(() => {
- this.submitTranscript();
- }, this.silenceTimeout * 1e3);
+ text = String(args[0] ?? "");
+ isFinal = Boolean(args[1]);
  }
- });
- this.runner.onEvent((event) => {
- if (!this.active) return;
- if (event.type === "assistant_text" && event.content) {
- const text = event.content.trim();
- if (!text || text.length < 3) return;
- if (text === this.lastSpokenText) return;
- this.lastSpokenText = text;
- this.onAgentSpeech(text);
- this.queueSpeak(text);
- }
- });
- this.listen.on("error", (err) => {
+ if (!text.trim()) return;
+ this.handleTranscript(text.trim(), isFinal);
+ };
+ this._onError = (err) => {
  const msg = err instanceof Error ? err.message : String(err);
- this.onStatus(`ASR error (voice chat continues without mic): ${msg.slice(0, 80)}`);
- });
+ this.onStatus(`ASR error (voicechat continues without mic): ${msg.slice(0, 80)}`);
+ };
+ this.listen.on("transcript", this._onTranscript);
+ this.listen.on("error", this._onError);
  try {
  await this.listen.start();
- this.onStatus("Mic active — listening...");
+ this.setState("LISTENING");
+ this.onStatus("Mic active — LISTENING for speech...");
  } catch (err) {
- this.onStatus(`Mic failed: ${err instanceof Error ? err.message : String(err)}. Voice chat active without mic — agent responses will still be spoken.`);
+ this.onStatus(
+ `Mic failed: ${err instanceof Error ? err.message : String(err)}. VoiceChat active without mic.`
+ );
+ this.setState("LISTENING");
  }
  }
- /** Stop the voice chat session */
  async stop() {
  if (!this.active) return;
  this.active = false;
+ if (this.abortController) {
+ this.abortController.abort();
+ this.abortController = null;
+ }
  if (this.silenceTimer) {
  clearTimeout(this.silenceTimer);
  this.silenceTimer = null;
  }
- if (this.transcriptBuffer.trim()) {
- this.submitTranscript();
+ if (this.maxSegmentTimer) {
+ clearTimeout(this.maxSegmentTimer);
+ this.maxSegmentTimer = null;
+ }
+ if (this.captureBuffer.trim() && (this._state === "CAPTURING" || this._state === "TRANSCRIBING")) {
+ this.finalizeSegment();
+ }
+ if (this._onTranscript) {
+ this.listen.removeAllListeners("transcript");
+ this._onTranscript = null;
+ }
+ if (this._onError) {
+ this.listen.removeAllListeners("error");
+ this._onError = null;
  }
  try {
  await this.listen.stop();
  } catch {
  }
- this.listen.removeAllListeners("transcript");
- this.speakQueue.length = 0;
- this.onStatus("Voice chat ended");
+ this.setState("IDLE");
+ this.onStatus("VoiceChat ended");
  this.emit("stopped");
  }
- /** Submit the current transcript buffer to the agent */
- submitTranscript() {
- const text = this.transcriptBuffer.trim();
- if (!text) return;
- this.transcriptBuffer = "";
+ // ---------------------------------------------------------------------------
+ // Transcript handling — VAD-style segment capture (Voryn pattern)
+ // ---------------------------------------------------------------------------
+ handleTranscript(text, isFinal) {
+ if (!this.active) return;
+ if (this._state !== "LISTENING" && this._state !== "CAPTURING") {
+ return;
+ }
+ if (this._state === "LISTENING") {
+ this.setState("CAPTURING");
+ this.captureBuffer = "";
+ this.captureStartTime = Date.now();
+ this.maxSegmentTimer = setTimeout(() => {
+ if (this._state === "CAPTURING") {
+ this.finalizeSegment();
+ }
+ }, MAX_SEGMENT_MS);
+ }
+ this.captureBuffer = text;
+ this.onPartialTranscript(text);
+ if (this.silenceTimer) clearTimeout(this.silenceTimer);
+ if (isFinal) {
+ this.finalizeSegment();
+ } else {
+ this.silenceTimer = setTimeout(() => {
+ if (this._state === "CAPTURING") {
+ this.finalizeSegment();
+ }
+ }, VAD_SILENCE_MS);
+ }
+ }
+ // ---------------------------------------------------------------------------
+ // Segment finalization → Transcribing → Thinking → Speaking
+ // ---------------------------------------------------------------------------
+ finalizeSegment() {
+ const text = this.captureBuffer.trim();
  if (this.silenceTimer) {
  clearTimeout(this.silenceTimer);
  this.silenceTimer = null;
  }
+ if (this.maxSegmentTimer) {
+ clearTimeout(this.maxSegmentTimer);
+ this.maxSegmentTimer = null;
+ }
+ this.captureBuffer = "";
+ if (!text) {
+ this.setState("LISTENING");
+ return;
+ }
+ this.setState("TRANSCRIBING");
  this.onUserSpeech(text);
- this.runner.injectUserMessage(
- `[VOICE] The user spoke (live microphone): "${text}"
- This is a live voice conversation running alongside your work. Respond briefly and naturally — your text response will be spoken aloud via TTS. If they ask you to look something up, acknowledge first then research. After responding, continue your current task.`
- );
+ this.context.push({ role: "user", content: text });
+ this.turnCount++;
+ while (this.context.length > MAX_CONTEXT_TURNS + 1) {
+ this.context.splice(1, 1);
+ }
+ this.think();
  }
- /** Queue text for TTS playback — non-blocking, processes sequentially */
- queueSpeak(text) {
- this.speakQueue.push(text);
- if (!this.isSpeaking) {
- this.processQueue();
+ // ---------------------------------------------------------------------------
+ // Direct Ollama inference (not through main agent runner)
+ // ---------------------------------------------------------------------------
+ async think() {
+ if (!this.active) return;
+ this.setState("THINKING");
+ this.onStatus("Thinking...");
+ this.abortController = new AbortController();
+ try {
+ const response = await this.streamOllamaInference(this.abortController.signal);
+ if (!this.active) return;
+ if (response.trim()) {
+ this.context.push({ role: "assistant", content: response.trim() });
+ this.setState("SPEAKING");
+ this.onAgentSpeech(response.trim());
+ this.voice.speak(response.trim());
+ if (this.runner && this.turnCount % SUMMARY_INJECTION_INTERVAL === 0) {
+ this.injectSummary();
+ }
+ const estimatedMs = Math.max(1500, response.length / 5 * (6e4 / 150));
+ await new Promise((r2) => setTimeout(r2, estimatedMs));
+ }
+ } catch (err) {
+ if (!this.active) return;
+ const msg = err instanceof Error ? err.message : String(err);
+ if (!msg.includes("abort")) {
+ this.onStatus(`Inference error: ${msg.slice(0, 100)}`);
+ }
+ } finally {
+ this.abortController = null;
+ }
+ if (this.active) {
+ this.setState("LISTENING");
+ this.onStatus("LISTENING...");
  }
  }
- /** Process the TTS queue — speaks one item at a time */
- async processQueue() {
- if (this.isSpeaking) return;
- this.isSpeaking = true;
- while (this.speakQueue.length > 0 && this.active) {
- const text = this.speakQueue.shift();
- try {
- this.voice.speak(text);
- await new Promise((r2) => setTimeout(r2, 500));
- } catch {
+ /**
+ * Stream inference. Tries native Ollama /api/chat first (supports think:false
+ * for reasoning models), falls back to OpenAI-compat /v1/chat/completions.
+ */
+ async streamOllamaInference(signal) {
+ const baseUrl = this.backendUrl.replace(/\/v1\/?$/, "");
+ const headers = { "Content-Type": "application/json" };
+ if (this.apiKey) headers["Authorization"] = `Bearer ${this.apiKey}`;
+ try {
+ const nativeBody = JSON.stringify({
+ model: this.model,
+ messages: this.context,
+ stream: true,
+ think: false,
+ // Disable reasoning — voice chat needs fast, direct responses
+ options: { temperature: 0.7, num_predict: 256 }
+ });
+ const res2 = await fetch(`${baseUrl}/api/chat`, {
+ method: "POST",
+ headers,
+ body: nativeBody,
+ signal
+ });
+ if (res2.ok) {
+ return await this.parseOllamaNativeStream(res2, signal);
  }
+ } catch (err) {
+ const msg = err instanceof Error ? err.message : "";
+ if (msg.includes("abort")) throw err;
  }
- this.isSpeaking = false;
+ const openaiBody = JSON.stringify({
+ model: this.model,
+ messages: this.context,
+ stream: true,
+ temperature: 0.7,
+ max_tokens: 1024
+ });
+ const endpoint = baseUrl.includes("/v1") ? `${baseUrl}/chat/completions` : `${baseUrl}/v1/chat/completions`;
+ const res = await fetch(endpoint, { method: "POST", headers, body: openaiBody, signal });
+ if (!res.ok) {
+ const errText = await res.text().catch(() => "unknown");
+ throw new Error(`Inference ${res.status}: ${errText.slice(0, 200)}`);
+ }
+ return await this.parseOpenAIStream(res);
+ }
+ /** Parse native Ollama /api/chat streaming response (NDJSON, not SSE) */
+ async parseOllamaNativeStream(res, _signal) {
+ const reader = res.body?.getReader();
+ if (!reader) throw new Error("No response body");
+ const decoder = new TextDecoder();
+ let fullText = "";
+ let buffer2 = "";
+ while (true) {
+ const { done, value: value2 } = await reader.read();
+ if (done) break;
+ buffer2 += decoder.decode(value2, { stream: true });
+ const lines = buffer2.split("\n");
+ buffer2 = lines.pop() ?? "";
+ for (const line of lines) {
+ if (!line.trim()) continue;
+ try {
+ const parsed = JSON.parse(line);
+ const content = parsed.message?.content;
+ if (content) fullText += content;
+ if (parsed.done) return fullText;
+ } catch {
+ }
+ }
+ }
+ return fullText;
+ }
+ /** Parse OpenAI-compat SSE streaming response */
+ async parseOpenAIStream(res) {
+ const reader = res.body?.getReader();
+ if (!reader) throw new Error("No response body");
+ const decoder = new TextDecoder();
+ let fullText = "";
+ let buffer2 = "";
+ while (true) {
+ const { done, value: value2 } = await reader.read();
+ if (done) break;
+ buffer2 += decoder.decode(value2, { stream: true });
+ const lines = buffer2.split("\n");
+ buffer2 = lines.pop() ?? "";
+ for (const line of lines) {
+ const trimmed = line.trim();
+ if (!trimmed || !trimmed.startsWith("data: ")) continue;
+ const data = trimmed.slice(6);
+ if (data === "[DONE]") continue;
+ try {
+ const parsed = JSON.parse(data);
+ const delta = parsed.choices?.[0]?.delta?.content;
+ if (delta) fullText += delta;
+ } catch {
+ }
+ }
+ }
+ return fullText;
+ }
+ // ---------------------------------------------------------------------------
+ // Summary injection to main agent
+ // ---------------------------------------------------------------------------
+ injectSummary() {
+ if (!this.runner) return;
+ const recentTurns = this.context.filter((t2) => t2.role !== "system").slice(-6).map((t2) => `${t2.role === "user" ? "User" : "Assistant"}: ${t2.content}`).join("\n");
+ this.runner.injectUserMessage(
+ `[VOICECHAT SUMMARY] The following is a summary of the recent voice conversation happening in parallel. You don't need to respond to this directly — it's for your awareness. Continue your current task.
+
+ ${recentTurns}`
+ );
  }
  };
  }
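
A note on the hunk above: streamOllamaInference tries Ollama's native /api/chat endpoint first (which accepts think: false to suppress reasoning), then falls back to an OpenAI-compatible /v1/chat/completions endpoint, so the class carries two stream parsers: one for NDJSON (one JSON object per line) and one for SSE ("data: {...}" lines terminated by "data: [DONE]"). The standalone sketch below shows the format difference on single buffered chunks; the sample payloads are fabricated for illustration and follow the documented Ollama and OpenAI streaming shapes, not data captured from this package.

    // Illustrative sample chunks (fabricated), one per stream format.
    const ndjsonChunk = '{"message":{"content":"Hi"},"done":false}\n{"message":{"content":"!"},"done":true}\n';
    const sseChunk = 'data: {"choices":[{"delta":{"content":"Hi"}}]}\n\ndata: [DONE]\n\n';

    // Ollama native: each non-empty line is a complete JSON object.
    function parseNdjson(chunk: string): string {
      let out = "";
      for (const line of chunk.split("\n")) {
        if (!line.trim()) continue;
        const parsed = JSON.parse(line);
        if (parsed.message?.content) out += parsed.message.content;
      }
      return out;
    }

    // OpenAI-compat: only "data: " lines carry payloads; "[DONE]" is a sentinel.
    function parseSse(chunk: string): string {
      let out = "";
      for (const line of chunk.split("\n")) {
        if (!line.startsWith("data: ")) continue;
        const data = line.slice(6);
        if (data === "[DONE]") continue;
        const delta = JSON.parse(data).choices?.[0]?.delta?.content;
        if (delta) out += delta;
      }
      return out;
    }

    console.log(parseNdjson(ndjsonChunk)); // "Hi!"
    console.log(parseSse(sseChunk)); // "Hi"

The parsers in the diff additionally carry a remainder buffer across reads, since a network chunk can end mid-line.
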
@@ -331399,7 +331616,7 @@ Respond concisely and safely. Remember: you are talking to the general public.`;
  getCallUrl() {
  return voiceSession?.tunnelUrl ?? null;
  },
- // --- /voicechat: async voice conversation parallel to agent loop ---
+ // --- /voicechat: Voryn-style state machine voice conversation ---
  async voiceChatStart() {
  if (_voiceChatSession?.isActive) return;
  if (!voiceEngine.enabled || !voiceEngine.ready) {
@@ -331411,36 +331628,20 @@ Respond concisely and safely. Remember: you are talking to the general public.`;
  const { VoiceChatSession: VoiceChatSession2 } = await Promise.resolve().then(() => (init_voicechat(), voicechat_exports));
  const { ListenEngine: ListenEngine2 } = await Promise.resolve().then(() => (init_listen(), listen_exports));
  const listenEng = new ListenEngine2();
- const dynamicRunner = {
+ const summaryRunner = {
  injectUserMessage(content) {
  if (activeTask?.runner) {
  activeTask.runner.injectUserMessage(content);
- } else {
- const match = content.match(/:\s*"([^"]+)"/);
- const rawText = match ? match[1] : content;
- if (rl && rawText.trim()) {
- rl.setLine(rawText.trim());
- rl.emit("line", rawText.trim());
- }
- }
- },
- onEvent(handler) {
- const checkInterval = setInterval(() => {
- if (activeTask?.runner) {
- activeTask.runner.onEvent(handler);
- clearInterval(checkInterval);
- }
- }, 500);
- if (activeTask?.runner) {
- activeTask.runner.onEvent(handler);
- clearInterval(checkInterval);
  }
  }
  };
  _voiceChatSession = new VoiceChatSession2({
  voice: voiceEngine,
  listen: listenEng,
- runner: dynamicRunner,
+ backendUrl: currentConfig.backendUrl,
+ model: currentConfig.model,
+ apiKey: currentConfig.apiKey,
+ runner: summaryRunner,
  onStatus(msg) {
  writeContent(() => renderInfo(`[voicechat] ${msg}`));
  },
@@ -331454,6 +331655,9 @@ Respond concisely and safely. Remember: you are talking to the general public.`;
  },
  onAgentSpeech(text) {
  writeContent(() => renderInfo(`\x1B[38;5;178m[agent]\x1B[0m ${text.slice(0, 120)}`));
+ },
+ onStateChange(state) {
+ writeContent(() => renderInfo(`\x1B[38;5;243m[voicechat] ${state}\x1B[0m`));
  }
  });
  await _voiceChatSession.start();
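
Taken together, the dist/index.js changes implement silence-debounced segment capture: every partial transcript resets an 1100 ms silence timer (VAD_SILENCE_MS), and a 6500 ms hard cap (MAX_SEGMENT_MS) finalizes long utterances regardless. A self-contained sketch of that timer pattern follows; the two constants match the diff, but the SegmentCapture class and its callback are assumptions made for clarity, not code from the package.

    // Illustrative debounce sketch of the VAD segment capture in the diff.
    const VAD_SILENCE_MS = 1100; // finalize after this much silence
    const MAX_SEGMENT_MS = 6500; // hard cap on a single speech segment

    class SegmentCapture {
      private buffer = "";
      private silenceTimer: ReturnType<typeof setTimeout> | null = null;
      private maxTimer: ReturnType<typeof setTimeout> | null = null;

      constructor(private onSegment: (text: string) => void) {}

      // Called for every partial transcript; the latest text replaces the buffer.
      push(text: string): void {
        if (!this.maxTimer) {
          // First partial of a segment starts the hard cap.
          this.maxTimer = setTimeout(() => this.finalize(), MAX_SEGMENT_MS);
        }
        this.buffer = text;
        if (this.silenceTimer) clearTimeout(this.silenceTimer);
        // Silence debounce: each new partial pushes finalization back.
        this.silenceTimer = setTimeout(() => this.finalize(), VAD_SILENCE_MS);
      }

      private finalize(): void {
        if (this.silenceTimer) clearTimeout(this.silenceTimer);
        if (this.maxTimer) clearTimeout(this.maxTimer);
        this.silenceTimer = null;
        this.maxTimer = null;
        const text = this.buffer.trim();
        this.buffer = "";
        if (text) this.onSegment(text);
      }
    }

    const capture = new SegmentCapture((t) => console.log(`segment: ${t}`));
    capture.push("hello");
    capture.push("hello there"); // resets the 1100 ms silence timer
    // After 1100 ms with no further partials, logs: segment: hello there
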
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "open-agents-ai",
- "version": "0.187.255",
+ "version": "0.187.257",
  "description": "AI coding agent powered by open-source models (Ollama/vLLM) — interactive TUI with agentic tool-calling loop",
  "type": "module",
  "main": "./dist/index.js",