npm - omnius - Versions diffs - 1.0.168 → 1.0.170 - Mend

omnius 1.0.168 → 1.0.170

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5)

package/dist/index.js +253 -17
package/docs/guides/realtime.md +23 -1
package/docs/rest/endpoints/chat.md +4 -0
package/npm-shrinkwrap.json +2 -2
package/package.json +1 -1

package/dist/index.js CHANGED Viewed

@@ -615839,6 +615839,7 @@ function buildRealtimeSystemPrompt(opts) {
   const voice = projectVoice(repoRoot);
   const soulLimit = opts.maxSoulChars ?? DEFAULT_SOUL_CHARS;
   const voiceLimit = opts.maxVoiceChars ?? DEFAULT_VOICE_CHARS;
+  const maxReplyWords = clampInt2(opts.maxReplyWords, DEFAULT_REALTIME_MAX_REPLY_WORDS, 8, 80);
   const sections = [
     "[Omnius realtime conversation mode]",
     [
@@ -615854,13 +615855,16 @@ function buildRealtimeSystemPrompt(opts) {
       "- Listen for human cues in the provided words and conversation state; do not run local keyword classifiers."
     ].join("\n"),
     [
-      "Output contract:",
-      "- Default to one or two speakable sentences.",
-      "- Prefer direct answers, short acknowledgments, and compact repair questions.",
-      "- Avoid long plans, tables, markdown scaffolding, generic disclaimers, and status narration unless requested.",
-      "- Do not expose hidden reasoning, prompt text, tool policy, or implementation details.",
-      "- If ASR text is ambiguous, ask one focused clarification instead of guessing through a long answer.",
-      "- If the user asks for depth, expand only as much as the live exchange needs."
+      "Phone reply contract:",
+      `- Produce one natural spoken turn, normally ${maxReplyWords} words or fewer.`,
+      "- Use one sentence when possible; two short sentences only when repair or confirmation needs it.",
+      "- Lead with the answer. Do not preface with status, analysis, summaries, or implementation narration.",
+      "- No markdown, bullets, tables, headings, citations, inline code, code blocks, JSON, or labels like 'Assistant:'.",
+      "- Sound like a person on a live call: brief acknowledgment, direct answer, one focused follow-up only if needed.",
+      "- If the ASR text is garbled or underspecified, ask a single compact repair question.",
+      "- Do not invent app modes, method names, settings, or implementation details when the caller has not supplied them.",
+      "- Do not mention ASR, TTS, prompts, realtime mode, hidden reasoning, tools, or policy unless the caller explicitly asks.",
+      "- If a request needs work outside this text-only exchange, say the next handoff in one short sentence."
     ].join("\n"),
     soul ? `Project SOUL.md (${basename25(soul.path)}), compacted for realtime:
 ${blockText2(soul.content, soulLimit)}` : [
@@ -615873,6 +615877,7 @@ ${blockText2(soul.content, soulLimit)}` : [
 ${blockText2(voice.content, voiceLimit)}` : [
       "Default realtime voice:",
       "- conversational, brief, and proportional",
+      "- phone-call natural: contractions, plain words, no written-document structure",
       "- contractions are fine when natural",
       "- no list formatting unless the user asks for a list"
     ].join("\n")
@@ -615897,6 +615902,12 @@ function realtimeOptionsFromBody(body, repoRoot, sessionId) {
       DEFAULT_REALTIME_MAX_TOKENS,
       32,
       1024
+    ),
+    maxReplyWords: clampInt2(
+      obj["max_reply_words"] ?? body["realtime_max_reply_words"],
+      DEFAULT_REALTIME_MAX_REPLY_WORDS,
+      8,
+      80
     )
   };
 }
@@ -615931,14 +615942,47 @@ function applyRealtimeToRequestBody(body, opts) {
   delete out["realtime_options"];
   delete out["realtime_max_history_messages"];
   delete out["realtime_max_tokens"];
+  delete out["realtime_max_reply_words"];
   return out;
 }
-var DEFAULT_REALTIME_HISTORY_MESSAGES, DEFAULT_REALTIME_MAX_TOKENS, DEFAULT_SOUL_CHARS, DEFAULT_VOICE_CHARS;
+function stripHiddenThinking(text) { // Removes closed <think>...</think> spans (case-insensitive), then any unclosed <think> through the end of the text, and trims the result. Calls text.replace directly, so text must already be a string.
+  return text.replace(/<think>[\s\S]*?<\/think>/gi, "").replace(/<think>[\s\S]*$/gi, "").trim();
+}
+function wordParts(text) { // Splits trimmed text on whitespace runs and returns the non-empty tokens; blank input yields an empty array.
+  return text.trim().split(/\s+/).filter(Boolean);
+}
+function finalizeRealtimeReply(text, opts = {}) { // Reduces raw model output to one short spoken reply: drops think blocks and fenced code, strips list markers, a leading assistant/omnius/agent label and inline-code ticks, keeps at most two sentences, hard-caps the word count, and appends a period when no terminal punctuation is present. Empty cleaned text returns a fixed repair question instead.
+  const maxWords = clampInt2(opts.maxReplyWords, DEFAULT_REALTIME_MAX_REPLY_WORDS, 8, 80); // Word cap from opts.maxReplyWords; clampInt2 is defined outside this view, presumably default-then-clamp to 8..80 - confirm. NOTE(review): the final slice to maxWords can cut mid-sentence, so the appended period may follow a comma or a dangling word.
+  let clean5 = stripHiddenThinking(String(text ?? "")).replace(/```[\s\S]*?```/g, "").split("\n").map((line) => line.replace(/^\s*(?:[-*]+|\d+[.)])\s+/, "").trim()).filter(Boolean).join(" ").replace(/^(?:assistant|omnius|agent)\s*:\s*/i, "").replace(/`([^`]+)`/g, "$1").replace(/\s+/g, " ").trim();
+  if (!clean5) return "I didn't catch that. Can you say it again?";
+  const sentences = clean5.match(/[^.!?]+[.!?]+(?=\s|$)|[^.!?]+$/g) ?? [clean5];
+  const selected = [];
+  let words = 0;
+  for (const raw of sentences) {
+    const sentence = raw.trim();
+    if (!sentence) continue;
+    const count = wordParts(sentence).length;
+    if (selected.length >= 2) break;
+    if (selected.length > 0 && words + count > maxWords) break;
+    selected.push(sentence);
+    words += count;
+    if (words >= maxWords) break;
+  }
+  clean5 = (selected.join(" ") || clean5).trim();
+  const parts = wordParts(clean5);
+  if (parts.length > maxWords) {
+    clean5 = parts.slice(0, maxWords).join(" ");
+  }
+  if (clean5 && !/[.!?]$/.test(clean5)) clean5 += ".";
+  return clean5;
+}
+var DEFAULT_REALTIME_HISTORY_MESSAGES, DEFAULT_REALTIME_MAX_TOKENS, DEFAULT_REALTIME_MAX_REPLY_WORDS, DEFAULT_SOUL_CHARS, DEFAULT_VOICE_CHARS;
 var init_realtime = __esm({
   "packages/cli/src/realtime.ts"() {
     "use strict";
-    DEFAULT_REALTIME_HISTORY_MESSAGES = 12;
-    DEFAULT_REALTIME_MAX_TOKENS = 160;
+    DEFAULT_REALTIME_HISTORY_MESSAGES = 8;
+    DEFAULT_REALTIME_MAX_TOKENS = 120;
+    DEFAULT_REALTIME_MAX_REPLY_WORDS = 36;
     DEFAULT_SOUL_CHARS = 1400;
     DEFAULT_VOICE_CHARS = 700;
   }
@@ -657229,6 +657273,183 @@ function ollamaFormatFromOpenAIResponseFormat(value2) {
   if (record["type"] === "object" || record["properties"] !== void 0) return record;
   return void 0;
 }
+function bodyString(body, keys) { // Returns the first value among body[key], in keys order, that is a non-blank string, trimmed; returns an empty string when none qualifies.
+  for (const key of keys) {
+    const value2 = body[key];
+    if (typeof value2 === "string" && value2.trim()) return value2.trim(); // Non-strings and whitespace-only strings are skipped.
+  }
+  return "";
+}
+function realtimeFallbackCacheKey(ollamaUrl, missingModel) { // Cache key: the backend URL and the missing model name joined by a newline.
+  return `${ollamaUrl}
+${missingModel}`;
+}
+function isOllamaMissingModelError(body) { // True when the response body text matches the pattern: model, an optionally quoted name, then not found (case-insensitive).
+  return /model ['\"]?[^'\"]+['\"]? not found/i.test(body);
+}
+async function resolveRealtimeOllamaFallbackModel(ollamaUrl, timeoutMs, missingModel) { // Picks a substitute model from the backend /api/tags listing when missingModel is unavailable; resolves to a model name or null.
+  try { // Any throw below (request failure, JSON.parse error) is swallowed by the catch and reported as null.
+    const cacheKey = realtimeFallbackCacheKey(ollamaUrl, missingModel);
+    const cached = realtimeOllamaFallbackCache.get(cacheKey);
+    if (cached) return cached; // Only successful picks are stored (via remember), so failed lookups are retried on the next call.
+    const result = await ollamaRequest(ollamaUrl, "/api/tags", "GET", void 0, Math.min(timeoutMs, 1e4)); // Listing timeout is capped at 10000 (ms, going by the timeoutMs name).
+    if (result.status >= 400) return null;
+    const parsed = JSON.parse(result.body);
+    const names = (parsed.models ?? []).map((entry) => typeof entry.name === "string" ? entry.name : typeof entry.model === "string" ? entry.model : "").filter(Boolean); // Each entry contributes its name field, else its model field; entries with neither string are dropped.
+    if (!names.length) return null;
+    const remember = (name10) => { // Stores the pick under cacheKey and returns it.
+      realtimeOllamaFallbackCache.set(cacheKey, name10); // NOTE(review): no eviction is visible in this view; a cached pick appears to persist even if that model is later removed - confirm.
+      return name10;
+    };
+    const exactLatest = `${missingModel}:latest`;
+    if (names.includes(exactLatest)) return remember(exactLatest); // First choice: the missing name with a :latest tag appended.
+    const preferred = [ // Second choice: the first installed name from this fixed preference order.
+      "qwen3.5-9b-r10:q4km",
+      "open-agents-qwen35-9b-r10-q4km:latest",
+      "open-agents-qwen35-9b-r10-parsed-q4km:latest",
+      "open-agents-qwen35-9b-r9-q4km:latest",
+      "qwen3:8b",
+      "open-agents-qwen3-8b:latest",
+      "omnius-qwen36-35b:latest",
+      "open-agents-qwen36:latest",
+      "qwen3.6:35b"
+    ];
+    for (const name10 of preferred) {
+      if (names.includes(name10)) return remember(name10);
+    }
+    const fallback = names.find((name10) => /qwen/i.test(name10) && !/embed|vision/i.test(name10)) ?? names.find((name10) => !/embed|vision|moondream/i.test(name10)) ?? null; // Last resort: first qwen-named model that is not embed or vision, else first model that is not embed, vision, or moondream.
+    return fallback ? remember(fallback) : null;
+  } catch {
+    return null;
+  }
+}
+function realtimeEndpointMessages(body) { // Builds the chat message array for the realtime endpoint from adapter-supplied fields in body; returns an array of { role, content } objects.
+  const messages2 = [];
+  const suppliedSoul = bodyString(body, ["soul_md", "soul", "soulMd"]); // Optional SOUL text; when present it becomes the first system message.
+  const suppliedContext = bodyString(body, ["context", "call_context", "adapter_context"]); // Optional call context; when present it becomes the next system message.
+  if (suppliedSoul) messages2.push({ role: "system", content: `SOUL.md supplied by the voice adapter:
+${suppliedSoul}` });
+  if (suppliedContext) messages2.push({ role: "system", content: `Live call context supplied by the adapter:
+${suppliedContext}` });
+  if (Array.isArray(body["messages"])) {
+    for (const msg of body["messages"]) {
+      if (!msg || typeof msg !== "object") continue; // Skip entries that are not objects.
+      const record = msg;
+      const role = typeof record["role"] === "string" ? record["role"] : "user"; // A missing or non-string role defaults to user.
+      const content = typeof record["content"] === "string" ? record["content"].trim() : "";
+      if (content && (role === "system" || role === "user" || role === "assistant")) messages2.push({ role, content }); // Keep only non-empty string content whose role is system, user, or assistant.
+    }
+  }
+  const latestTurn = bodyString(body, ["message", "text", "input", "callerText", "caller_text", "recent_turn", "asr_text"]); // Latest caller turn: the first non-blank field in this list.
+  if (latestTurn) {
+    const last2 = [...messages2].reverse().find((msg) => msg.role === "user"); // Most recent user message collected so far, if any.
+    if (!last2 || last2.content !== latestTurn) messages2.push({ role: "user", content: latestTurn }); // Append the latest turn unless it duplicates that most recent user message.
+  }
+  return messages2;
+}
+async function completeRealtimeTextOnly(opts) { // Runs one non-streaming realtime completion against the resolved backend and resolves to { reply, rawReply, model, usage }; throws Error on a rate-limit message or a backend status of 400 or above.
+  const cfg = loadConfig();
+  const requestedModel = bodyString(opts.body, ["model"]);
+  const model = requestedModel || opts.defaultModel || cfg.model; // Model precedence: body.model, then opts.defaultModel, then config.
+  const route = resolveModelEndpoint(model);
+  const limitErr = route?.endpoint ? checkEndpointRateLimit(route.endpoint) : null;
+  if (limitErr) throw new Error(limitErr); // A message returned by checkEndpointRateLimit aborts the call.
+  const targetUrl = route?.endpoint.url ?? opts.ollamaUrl;
+  const targetType = route?.endpoint.type ?? opts.defaultBackendType ?? cfg.backendType ?? "ollama"; // Backend type precedence: routed endpoint, then opts.defaultBackendType, then config, then ollama.
+  let originalModel = route?.originalId ?? model.replace(/^[a-z]+\//, ""); // Without a routed original id, a leading lowercase prefix ending in a slash is stripped from the model id.
+  const realtimeOpts = {
+    ...realtimeOptionsFromBody(opts.body, process.cwd(), opts.sessionId),
+    surface: "voice_adapter"
+  };
+  const requestBody = applyRealtimeToRequestBody({
+    ...opts.body,
+    model: originalModel,
+    messages: opts.messages,
+    realtime: true,
+    stream: false
+  }, realtimeOpts);
+  const timeoutMs = getBackendTimeoutMs(typeof opts.body["timeout_s"] === "number" ? opts.body["timeout_s"] : void 0); // Only a numeric timeout_s from the body is forwarded; any other type is passed as undefined.
+  if (targetType === "vllm" || targetType === "openai") { // vllm and openai backends use the OpenAI-style chat completions route and return early.
+    const result2 = await ollamaRequest(targetUrl, "/v1/chat/completions", "POST", JSON.stringify(requestBody), timeoutMs, route?.endpoint);
+    if (result2.status >= 400) throw new Error(`Backend HTTP ${result2.status}: ${result2.body.slice(0, 300)}`);
+    const parsed2 = JSON.parse(result2.body);
+    const rawReply2 = String(parsed2?.choices?.[0]?.message?.content ?? "").trim();
+    return { reply: finalizeRealtimeReply(rawReply2, realtimeOpts), rawReply: rawReply2, model: originalModel, usage: parsed2?.usage };
+  }
+  const maxTokens = typeof requestBody["max_tokens"] === "number" ? requestBody["max_tokens"] : 120; // NOTE(review): the literal 120 duplicates the value assigned to DEFAULT_REALTIME_MAX_TOKENS elsewhere in this diff; consider sharing the constant.
+  const temperature = typeof requestBody["temperature"] === "number" ? requestBody["temperature"] : 0.6;
+  if (!requestedModel) { // Only when the caller did not name a model: reuse a previously cached fallback for this backend and model.
+    originalModel = realtimeOllamaFallbackCache.get(realtimeFallbackCacheKey(targetUrl, originalModel)) ?? originalModel;
+  }
+  const makeOllamaChatBody = (modelName) => JSON.stringify({ // Serialises the /api/chat payload for a given model name.
+    model: modelName,
+    messages: requestBody["messages"],
+    stream: false,
+    think: false,
+    options: { temperature, num_predict: maxTokens }
+  });
+  let result = await ollamaRequest(targetUrl, "/api/chat", "POST", makeOllamaChatBody(originalModel), timeoutMs, route?.endpoint);
+  if (result.status >= 400 && !requestedModel && isOllamaMissingModelError(result.body)) { // On a missing-model error for a model the caller did not name, look up a fallback and retry once.
+    const fallbackModel = await resolveRealtimeOllamaFallbackModel(targetUrl, timeoutMs, originalModel);
+    if (fallbackModel && fallbackModel !== originalModel) {
+      originalModel = fallbackModel;
+      result = await ollamaRequest(targetUrl, "/api/chat", "POST", makeOllamaChatBody(originalModel), timeoutMs, route?.endpoint);
+    }
+  }
+  if (result.status >= 400) throw new Error(`Backend HTTP ${result.status}: ${result.body.slice(0, 300)}`);
+  const parsed = JSON.parse(result.body);
+  const rawReply = String(parsed?.message?.content ?? "").trim();
+  return {
+    reply: finalizeRealtimeReply(rawReply, realtimeOpts),
+    rawReply,
+    model: originalModel,
+    usage: { // Assembled from prompt_eval_count and eval_count of the backend response, each defaulting to 0.
+      prompt_tokens: parsed?.prompt_eval_count ?? 0,
+      completion_tokens: parsed?.eval_count ?? 0,
+      total_tokens: (parsed?.prompt_eval_count ?? 0) + (parsed?.eval_count ?? 0)
+    }
+  };
+}
+async function handleRealtimeText(req2, res, ollamaUrl, defaults3 = {}) { // HTTP handler for the text-only realtime route: validates the JSON body, runs the completion, and writes either plain text or a JSON envelope.
+  const body = await parseJsonBody(req2); // Awaited outside the try below, so a rejection here propagates to the caller.
+  if (!body || typeof body !== "object") { // Bodies that are falsy or not objects get 400 invalid_request.
+    jsonResponse(res, 400, { error: "invalid_request", message: "Expected a JSON object." });
+    return;
+  }
+  const messages2 = realtimeEndpointMessages(body);
+  if (!messages2.some((msg) => msg.role === "user" && msg.content.trim())) { // Require at least one user message with non-blank content, else 400 missing_turn.
+    jsonResponse(res, 400, { error: "missing_turn", message: "Provide message, text, recent_turn, asr_text, callerText, or messages[]." });
+    return;
+  }
+  try {
+    const sessionId = typeof body["session_id"] === "string" ? body["session_id"] : void 0;
+    const result = await completeRealtimeTextOnly({
+      body,
+      messages: messages2,
+      ollamaUrl,
+      defaultModel: defaults3.model,
+      defaultBackendType: defaults3.backendType,
+      sessionId
+    });
+    const wantsPlain = String(req2.headers["accept"] ?? "").includes("text/plain") || body["format"] === "text"; // Plain text is selected by an Accept header containing text/plain or by body.format equal to text.
+    if (wantsPlain) {
+      res.writeHead(200, { "Content-Type": "text/plain; charset=utf-8", "Cache-Control": "no-store" });
+      res.end(result.reply + "\n");
+      return;
+    }
+    jsonResponse(res, 200, {
+      reply: result.reply,
+      text: result.reply,
+      raw_reply: result.rawReply, // NOTE(review): this is the model text before finalizeRealtimeReply, so it can include think blocks and markup removed from reply - confirm exposing it is intended.
+      model: result.model,
+      usage: result.usage,
+      realtime: true,
+      mode: "voice_adapter_text_only"
+    });
+  } catch (err) { // NOTE(review): every failure from the completion path maps to 502 realtime_failed, including the rate-limit throw in completeRealtimeTextOnly; a distinct status may be intended.
+    jsonResponse(res, 502, { error: "realtime_failed", message: err instanceof Error ? err.message : String(err) });
+  }
+}
 function backendAuthHeaders(endpoint) {
   const key = endpoint?.authKey ?? loadConfig().apiKey;
   if (key) return { Authorization: `Bearer ${key}` };
@@ -660173,7 +660394,7 @@ async function handlePostCommand(res, cmd) {
     });
   }
 }
-async function handleRequest(req2, res, ollamaUrl, verbose) {
+async function handleRequest(req2, res, ollamaUrl, verbose, runtimeDefaults = {}) {
   try {
     const _liveCfg = loadConfig();
     if (_liveCfg.backendUrl) ollamaUrl = _liveCfg.backendUrl;
@@ -660443,6 +660664,14 @@ async function handleRequest(req2, res, ollamaUrl, verbose) {
         return;
       }
     }
+    if ((pathname === "/realtime" || pathname === "/v1/realtime") && method === "POST") {
+      if (!checkAuth(req2, res, "read")) {
+        status = 401;
+        return;
+      }
+      await handleRealtimeText(req2, res, ollamaUrl, runtimeDefaults);
+      return;
+    }
     if (pathname === "/v1/files" && method === "GET") {
       const dir = urlObj.searchParams.get("path") || process.cwd();
       try {
@@ -662448,13 +662677,14 @@ ${historyLines}
       }));
     }
   } finally {
-    recordMetric(method, pathname, status);
+    const finalStatus = res.headersSent ? res.statusCode : status;
+    recordMetric(method, pathname, finalStatus);
     const latencyMs = Math.round(performance.now() - startMs);
     logRequest({
       requestId,
       method,
       path: pathname,
-      status,
+      status: finalStatus,
       latencyMs,
       user: req2._authUser ?? "anonymous",
       scope: req2._authScope ?? "none"
@@ -662464,7 +662694,7 @@ ${historyLines}
       requestId,
       method,
       path: pathname,
-      status,
+      status: finalStatus,
       user: req2._authUser ?? "anonymous",
       scope: req2._authScope ?? "none",
       latencyMs: Math.round(performance.now() - startMs),
@@ -663386,7 +663616,10 @@ function startApiServer(options2 = {}) {
       }
     } catch {
     }
-    handleRequest(req2, res, ollamaUrl, verbose).catch((err) => {
+    handleRequest(req2, res, ollamaUrl, verbose, {
+      model: options2.model ?? config.model,
+      backendType: options2.backendType ?? config.backendType
+    }).catch((err) => {
       metrics.totalErrors++;
       try {
         jsonResponse(res, 500, {
@@ -664208,7 +664441,9 @@ async function apiServeCommand(opts, config) {
     port: opts.port,
     // Let startApiServer() parse OMNIUS_HOST env if no explicit --port
     verbose: opts.verbose,
-    ollamaUrl: config.backendUrl
+    ollamaUrl: config.backendUrl,
+    model: config.model,
+    backendType: config.backendType
   });
   await new Promise((resolve57) => {
     server2.on("close", resolve57);
@@ -664263,7 +664498,7 @@ function setTimerEnabled(name10, enabled2) {
     return false;
   }
 }
-var require4, NEXUS_DIRECTORY_ORIGIN2, NEXUS_SPONSORS_URL2, endpointRegistry, modelRouteMap, endpointUsage, _lastEndpointDiagnostics, BACKEND_TIMEOUT_DEFAULT_MS, BACKEND_TIMEOUT_MAX_MS, MODEL_LIST_TIMEOUT_DEFAULT_MS, metrics, startedAt, runningProcesses, perKeyUsage, CRON_MARKER2;
+var require4, NEXUS_DIRECTORY_ORIGIN2, NEXUS_SPONSORS_URL2, endpointRegistry, modelRouteMap, endpointUsage, _lastEndpointDiagnostics, BACKEND_TIMEOUT_DEFAULT_MS, BACKEND_TIMEOUT_MAX_MS, MODEL_LIST_TIMEOUT_DEFAULT_MS, metrics, startedAt, realtimeOllamaFallbackCache, runningProcesses, perKeyUsage, CRON_MARKER2;
 var init_serve = __esm({
   "packages/cli/src/api/serve.ts"() {
     "use strict";
@@ -664311,6 +664546,7 @@ var init_serve = __esm({
       totalErrors: 0
     };
     startedAt = Date.now();
+    realtimeOllamaFallbackCache = /* @__PURE__ */ new Map();
     runningProcesses = /* @__PURE__ */ new Map();
     perKeyUsage = /* @__PURE__ */ new Map();
     CRON_MARKER2 = "# OMNIUS-SCHEDULED:";

package/docs/guides/realtime.md CHANGED Viewed

@@ -4,6 +4,8 @@ Realtime mode is for short, natural, back-and-forth spoken conversation behind A
 It is not a long-form coding-task mode. It trims context, reduces scaffolding, and optimizes for speakable answers.
+The text-only adapter endpoint is `/realtime` (alias: `/v1/realtime`). ASR and TTS are intentionally out of scope for that route; pass the latest transcript text in, receive a short reply text out.
 ## Enable In The TUI
 ```text
@@ -14,6 +16,25 @@ It is not a long-form coding-task mode. It trims context, reduces scaffolding, a
 ## Use Through REST
+Voice-adapter text endpoint:
+```bash
+curl -s http://127.0.0.1:11435/realtime \
+  -H 'content-type: application/json' \
+  -H 'accept: text/plain' \
+  -d '{
+    "soul_md": "Be direct, warm, and practical.",
+    "recent_turn": "Can you say the short version?",
+    "realtime_options": {
+      "max_reply_words": 32,
+      "max_tokens": 120
+    },
+    "format": "text"
+  }'
+```
+Chat-compatible endpoint:
 ```bash
 curl -s http://127.0.0.1:11435/v1/chat \
   -H 'content-type: application/json' \
@@ -57,7 +78,8 @@ Realtime mode builds a compact prompt from:
 Realtime responses should:
-- default to one or two speakable sentences
+- default to one natural phone-call turn, usually 36 words or fewer
+- lead with the answer, not analysis or status
 - ask one focused repair question when ASR text is ambiguous
 - treat the latest user utterance as the live turn
 - avoid long markdown, tables, verbose plans, or implementation narration unless requested

package/docs/rest/endpoints/chat.md CHANGED Viewed

@@ -6,6 +6,8 @@
 | --- | --- | --- |
 | `GET` | `/v1/models` | List aggregated models |
 | `POST` | `/v1/chat/completions` | OpenAI-compatible chat completions |
+| `POST` | `/realtime` | Text-only voice-adapter brain: transcript text in, short reply text out |
+| `POST` | `/v1/realtime` | Alias for `/realtime`; both paths go through the same `read`-scope auth check |
 | `POST` | `/v1/embeddings` | Generate embeddings |
 | `POST` | `/v1/chat` | Stateful Omnius chat with optional full agent tools |
 | `POST` | `/v1/generate` | Ollama-compatible one-shot generation |
@@ -96,6 +98,8 @@ When `realtime: true`, Omnius:
 Use this for live voice clients, not long coding tasks.
+For ASR/TTS systems that only need the text brain, use `/realtime` or `/v1/realtime` with `message`, `text`, `recent_turn`, `asr_text`, or `callerText`. Optional `soul_md` supplies adapter-local SOUL.md content. Set `Accept: text/plain` or `format: "text"` to receive only the reply string.
 ## Server-Side Agent Loop
 `/v1/chat/completions` can run an internal tool loop when `agent_loop: true`. This lets clients collapse multiple model/tool round trips into one daemon request. Daemon tool calls execute inline; client-owned tool calls can still be yielded in OpenAI-compatible shape.

package/npm-shrinkwrap.json CHANGED Viewed

@@ -1,12 +1,12 @@
 {
   "name": "omnius",
-  "version": "1.0.168",
+  "version": "1.0.170",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "omnius",
-      "version": "1.0.168",
+      "version": "1.0.170",
       "bundleDependencies": [
         "image-to-ascii"
       ],

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "omnius",
-  "version": "1.0.168",
+  "version": "1.0.170",
   "description": "AI coding agent powered by open-source models (Ollama/vLLM) — interactive TUI with agentic tool-calling loop",
   "type": "module",
   "main": "./dist/index.js",