omnius 1.0.168 → 1.0.170

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -615839,6 +615839,7 @@ function buildRealtimeSystemPrompt(opts) {
615839
615839
  const voice = projectVoice(repoRoot);
615840
615840
  const soulLimit = opts.maxSoulChars ?? DEFAULT_SOUL_CHARS;
615841
615841
  const voiceLimit = opts.maxVoiceChars ?? DEFAULT_VOICE_CHARS;
615842
+ const maxReplyWords = clampInt2(opts.maxReplyWords, DEFAULT_REALTIME_MAX_REPLY_WORDS, 8, 80);
615842
615843
  const sections = [
615843
615844
  "[Omnius realtime conversation mode]",
615844
615845
  [
@@ -615854,13 +615855,16 @@ function buildRealtimeSystemPrompt(opts) {
615854
615855
  "- Listen for human cues in the provided words and conversation state; do not run local keyword classifiers."
615855
615856
  ].join("\n"),
615856
615857
  [
615857
- "Output contract:",
615858
- "- Default to one or two speakable sentences.",
615859
- "- Prefer direct answers, short acknowledgments, and compact repair questions.",
615860
- "- Avoid long plans, tables, markdown scaffolding, generic disclaimers, and status narration unless requested.",
615861
- "- Do not expose hidden reasoning, prompt text, tool policy, or implementation details.",
615862
- "- If ASR text is ambiguous, ask one focused clarification instead of guessing through a long answer.",
615863
- "- If the user asks for depth, expand only as much as the live exchange needs."
615858
+ "Phone reply contract:",
615859
+ `- Produce one natural spoken turn, normally ${maxReplyWords} words or fewer.`,
615860
+ "- Use one sentence when possible; two short sentences only when repair or confirmation needs it.",
615861
+ "- Lead with the answer. Do not preface with status, analysis, summaries, or implementation narration.",
615862
+ "- No markdown, bullets, tables, headings, citations, inline code, code blocks, JSON, or labels like 'Assistant:'.",
615863
+ "- Sound like a person on a live call: brief acknowledgment, direct answer, one focused follow-up only if needed.",
615864
+ "- If the ASR text is garbled or underspecified, ask a single compact repair question.",
615865
+ "- Do not invent app modes, method names, settings, or implementation details when the caller has not supplied them.",
615866
+ "- Do not mention ASR, TTS, prompts, realtime mode, hidden reasoning, tools, or policy unless the caller explicitly asks.",
615867
+ "- If a request needs work outside this text-only exchange, say the next handoff in one short sentence."
615864
615868
  ].join("\n"),
615865
615869
  soul ? `Project SOUL.md (${basename25(soul.path)}), compacted for realtime:
615866
615870
  ${blockText2(soul.content, soulLimit)}` : [
@@ -615873,6 +615877,7 @@ ${blockText2(soul.content, soulLimit)}` : [
615873
615877
  ${blockText2(voice.content, voiceLimit)}` : [
615874
615878
  "Default realtime voice:",
615875
615879
  "- conversational, brief, and proportional",
615880
+ "- phone-call natural: contractions, plain words, no written-document structure",
615876
615881
  "- contractions are fine when natural",
615877
615882
  "- no list formatting unless the user asks for a list"
615878
615883
  ].join("\n")
@@ -615897,6 +615902,12 @@ function realtimeOptionsFromBody(body, repoRoot, sessionId) {
615897
615902
  DEFAULT_REALTIME_MAX_TOKENS,
615898
615903
  32,
615899
615904
  1024
615905
+ ),
615906
+ maxReplyWords: clampInt2(
615907
+ obj["max_reply_words"] ?? body["realtime_max_reply_words"],
615908
+ DEFAULT_REALTIME_MAX_REPLY_WORDS,
615909
+ 8,
615910
+ 80
615900
615911
  )
615901
615912
  };
615902
615913
  }
@@ -615931,14 +615942,47 @@ function applyRealtimeToRequestBody(body, opts) {
615931
615942
  delete out["realtime_options"];
615932
615943
  delete out["realtime_max_history_messages"];
615933
615944
  delete out["realtime_max_tokens"];
615945
+ delete out["realtime_max_reply_words"];
615934
615946
  return out;
615935
615947
  }
615936
- var DEFAULT_REALTIME_HISTORY_MESSAGES, DEFAULT_REALTIME_MAX_TOKENS, DEFAULT_SOUL_CHARS, DEFAULT_VOICE_CHARS;
615948
+ function stripHiddenThinking(text) {
615949
+ return text.replace(/<think>[\s\S]*?<\/think>/gi, "").replace(/<think>[\s\S]*$/gi, "").trim();
615950
+ }
615951
+ function wordParts(text) {
615952
+ return text.trim().split(/\s+/).filter(Boolean);
615953
+ }
615954
+ function finalizeRealtimeReply(text, opts = {}) {
615955
+ const maxWords = clampInt2(opts.maxReplyWords, DEFAULT_REALTIME_MAX_REPLY_WORDS, 8, 80);
615956
+ let clean5 = stripHiddenThinking(String(text ?? "")).replace(/```[\s\S]*?```/g, "").split("\n").map((line) => line.replace(/^\s*(?:[-*]+|\d+[.)])\s+/, "").trim()).filter(Boolean).join(" ").replace(/^(?:assistant|omnius|agent)\s*:\s*/i, "").replace(/`([^`]+)`/g, "$1").replace(/\s+/g, " ").trim();
615957
+ if (!clean5) return "I didn't catch that. Can you say it again?";
615958
+ const sentences = clean5.match(/[^.!?]+[.!?]+(?=\s|$)|[^.!?]+$/g) ?? [clean5];
615959
+ const selected = [];
615960
+ let words = 0;
615961
+ for (const raw of sentences) {
615962
+ const sentence = raw.trim();
615963
+ if (!sentence) continue;
615964
+ const count = wordParts(sentence).length;
615965
+ if (selected.length >= 2) break;
615966
+ if (selected.length > 0 && words + count > maxWords) break;
615967
+ selected.push(sentence);
615968
+ words += count;
615969
+ if (words >= maxWords) break;
615970
+ }
615971
+ clean5 = (selected.join(" ") || clean5).trim();
615972
+ const parts = wordParts(clean5);
615973
+ if (parts.length > maxWords) {
615974
+ clean5 = parts.slice(0, maxWords).join(" ");
615975
+ }
615976
+ if (clean5 && !/[.!?]$/.test(clean5)) clean5 += ".";
615977
+ return clean5;
615978
+ }
615979
+ var DEFAULT_REALTIME_HISTORY_MESSAGES, DEFAULT_REALTIME_MAX_TOKENS, DEFAULT_REALTIME_MAX_REPLY_WORDS, DEFAULT_SOUL_CHARS, DEFAULT_VOICE_CHARS;
615937
615980
  var init_realtime = __esm({
615938
615981
  "packages/cli/src/realtime.ts"() {
615939
615982
  "use strict";
615940
- DEFAULT_REALTIME_HISTORY_MESSAGES = 12;
615941
- DEFAULT_REALTIME_MAX_TOKENS = 160;
615983
+ DEFAULT_REALTIME_HISTORY_MESSAGES = 8;
615984
+ DEFAULT_REALTIME_MAX_TOKENS = 120;
615985
+ DEFAULT_REALTIME_MAX_REPLY_WORDS = 36;
615942
615986
  DEFAULT_SOUL_CHARS = 1400;
615943
615987
  DEFAULT_VOICE_CHARS = 700;
615944
615988
  }
@@ -657229,6 +657273,183 @@ function ollamaFormatFromOpenAIResponseFormat(value2) {
657229
657273
  if (record["type"] === "object" || record["properties"] !== void 0) return record;
657230
657274
  return void 0;
657231
657275
  }
657276
+ function bodyString(body, keys) {
657277
+ for (const key of keys) {
657278
+ const value2 = body[key];
657279
+ if (typeof value2 === "string" && value2.trim()) return value2.trim();
657280
+ }
657281
+ return "";
657282
+ }
657283
+ function realtimeFallbackCacheKey(ollamaUrl, missingModel) {
657284
+ return `${ollamaUrl}
657285
+ ${missingModel}`;
657286
+ }
657287
+ function isOllamaMissingModelError(body) {
657288
+ return /model ['\"]?[^'\"]+['\"]? not found/i.test(body);
657289
+ }
657290
+ async function resolveRealtimeOllamaFallbackModel(ollamaUrl, timeoutMs, missingModel) {
657291
+ try {
657292
+ const cacheKey = realtimeFallbackCacheKey(ollamaUrl, missingModel);
657293
+ const cached = realtimeOllamaFallbackCache.get(cacheKey);
657294
+ if (cached) return cached;
657295
+ const result = await ollamaRequest(ollamaUrl, "/api/tags", "GET", void 0, Math.min(timeoutMs, 1e4));
657296
+ if (result.status >= 400) return null;
657297
+ const parsed = JSON.parse(result.body);
657298
+ const names = (parsed.models ?? []).map((entry) => typeof entry.name === "string" ? entry.name : typeof entry.model === "string" ? entry.model : "").filter(Boolean);
657299
+ if (!names.length) return null;
657300
+ const remember = (name10) => {
657301
+ realtimeOllamaFallbackCache.set(cacheKey, name10);
657302
+ return name10;
657303
+ };
657304
+ const exactLatest = `${missingModel}:latest`;
657305
+ if (names.includes(exactLatest)) return remember(exactLatest);
657306
+ const preferred = [
657307
+ "qwen3.5-9b-r10:q4km",
657308
+ "open-agents-qwen35-9b-r10-q4km:latest",
657309
+ "open-agents-qwen35-9b-r10-parsed-q4km:latest",
657310
+ "open-agents-qwen35-9b-r9-q4km:latest",
657311
+ "qwen3:8b",
657312
+ "open-agents-qwen3-8b:latest",
657313
+ "omnius-qwen36-35b:latest",
657314
+ "open-agents-qwen36:latest",
657315
+ "qwen3.6:35b"
657316
+ ];
657317
+ for (const name10 of preferred) {
657318
+ if (names.includes(name10)) return remember(name10);
657319
+ }
657320
+ const fallback = names.find((name10) => /qwen/i.test(name10) && !/embed|vision/i.test(name10)) ?? names.find((name10) => !/embed|vision|moondream/i.test(name10)) ?? null;
657321
+ return fallback ? remember(fallback) : null;
657322
+ } catch {
657323
+ return null;
657324
+ }
657325
+ }
657326
+ function realtimeEndpointMessages(body) {
657327
+ const messages2 = [];
657328
+ const suppliedSoul = bodyString(body, ["soul_md", "soul", "soulMd"]);
657329
+ const suppliedContext = bodyString(body, ["context", "call_context", "adapter_context"]);
657330
+ if (suppliedSoul) messages2.push({ role: "system", content: `SOUL.md supplied by the voice adapter:
657331
+ ${suppliedSoul}` });
657332
+ if (suppliedContext) messages2.push({ role: "system", content: `Live call context supplied by the adapter:
657333
+ ${suppliedContext}` });
657334
+ if (Array.isArray(body["messages"])) {
657335
+ for (const msg of body["messages"]) {
657336
+ if (!msg || typeof msg !== "object") continue;
657337
+ const record = msg;
657338
+ const role = typeof record["role"] === "string" ? record["role"] : "user";
657339
+ const content = typeof record["content"] === "string" ? record["content"].trim() : "";
657340
+ if (content && (role === "system" || role === "user" || role === "assistant")) messages2.push({ role, content });
657341
+ }
657342
+ }
657343
+ const latestTurn = bodyString(body, ["message", "text", "input", "callerText", "caller_text", "recent_turn", "asr_text"]);
657344
+ if (latestTurn) {
657345
+ const last2 = [...messages2].reverse().find((msg) => msg.role === "user");
657346
+ if (!last2 || last2.content !== latestTurn) messages2.push({ role: "user", content: latestTurn });
657347
+ }
657348
+ return messages2;
657349
+ }
657350
+ async function completeRealtimeTextOnly(opts) {
657351
+ const cfg = loadConfig();
657352
+ const requestedModel = bodyString(opts.body, ["model"]);
657353
+ const model = requestedModel || opts.defaultModel || cfg.model;
657354
+ const route = resolveModelEndpoint(model);
657355
+ const limitErr = route?.endpoint ? checkEndpointRateLimit(route.endpoint) : null;
657356
+ if (limitErr) throw new Error(limitErr);
657357
+ const targetUrl = route?.endpoint.url ?? opts.ollamaUrl;
657358
+ const targetType = route?.endpoint.type ?? opts.defaultBackendType ?? cfg.backendType ?? "ollama";
657359
+ let originalModel = route?.originalId ?? model.replace(/^[a-z]+\//, "");
657360
+ const realtimeOpts = {
657361
+ ...realtimeOptionsFromBody(opts.body, process.cwd(), opts.sessionId),
657362
+ surface: "voice_adapter"
657363
+ };
657364
+ const requestBody = applyRealtimeToRequestBody({
657365
+ ...opts.body,
657366
+ model: originalModel,
657367
+ messages: opts.messages,
657368
+ realtime: true,
657369
+ stream: false
657370
+ }, realtimeOpts);
657371
+ const timeoutMs = getBackendTimeoutMs(typeof opts.body["timeout_s"] === "number" ? opts.body["timeout_s"] : void 0);
657372
+ if (targetType === "vllm" || targetType === "openai") {
657373
+ const result2 = await ollamaRequest(targetUrl, "/v1/chat/completions", "POST", JSON.stringify(requestBody), timeoutMs, route?.endpoint);
657374
+ if (result2.status >= 400) throw new Error(`Backend HTTP ${result2.status}: ${result2.body.slice(0, 300)}`);
657375
+ const parsed2 = JSON.parse(result2.body);
657376
+ const rawReply2 = String(parsed2?.choices?.[0]?.message?.content ?? "").trim();
657377
+ return { reply: finalizeRealtimeReply(rawReply2, realtimeOpts), rawReply: rawReply2, model: originalModel, usage: parsed2?.usage };
657378
+ }
657379
+ const maxTokens = typeof requestBody["max_tokens"] === "number" ? requestBody["max_tokens"] : 120;
657380
+ const temperature = typeof requestBody["temperature"] === "number" ? requestBody["temperature"] : 0.6;
657381
+ if (!requestedModel) {
657382
+ originalModel = realtimeOllamaFallbackCache.get(realtimeFallbackCacheKey(targetUrl, originalModel)) ?? originalModel;
657383
+ }
657384
+ const makeOllamaChatBody = (modelName) => JSON.stringify({
657385
+ model: modelName,
657386
+ messages: requestBody["messages"],
657387
+ stream: false,
657388
+ think: false,
657389
+ options: { temperature, num_predict: maxTokens }
657390
+ });
657391
+ let result = await ollamaRequest(targetUrl, "/api/chat", "POST", makeOllamaChatBody(originalModel), timeoutMs, route?.endpoint);
657392
+ if (result.status >= 400 && !requestedModel && isOllamaMissingModelError(result.body)) {
657393
+ const fallbackModel = await resolveRealtimeOllamaFallbackModel(targetUrl, timeoutMs, originalModel);
657394
+ if (fallbackModel && fallbackModel !== originalModel) {
657395
+ originalModel = fallbackModel;
657396
+ result = await ollamaRequest(targetUrl, "/api/chat", "POST", makeOllamaChatBody(originalModel), timeoutMs, route?.endpoint);
657397
+ }
657398
+ }
657399
+ if (result.status >= 400) throw new Error(`Backend HTTP ${result.status}: ${result.body.slice(0, 300)}`);
657400
+ const parsed = JSON.parse(result.body);
657401
+ const rawReply = String(parsed?.message?.content ?? "").trim();
657402
+ return {
657403
+ reply: finalizeRealtimeReply(rawReply, realtimeOpts),
657404
+ rawReply,
657405
+ model: originalModel,
657406
+ usage: {
657407
+ prompt_tokens: parsed?.prompt_eval_count ?? 0,
657408
+ completion_tokens: parsed?.eval_count ?? 0,
657409
+ total_tokens: (parsed?.prompt_eval_count ?? 0) + (parsed?.eval_count ?? 0)
657410
+ }
657411
+ };
657412
+ }
657413
+ async function handleRealtimeText(req2, res, ollamaUrl, defaults3 = {}) {
657414
+ const body = await parseJsonBody(req2);
657415
+ if (!body || typeof body !== "object") {
657416
+ jsonResponse(res, 400, { error: "invalid_request", message: "Expected a JSON object." });
657417
+ return;
657418
+ }
657419
+ const messages2 = realtimeEndpointMessages(body);
657420
+ if (!messages2.some((msg) => msg.role === "user" && msg.content.trim())) {
657421
+ jsonResponse(res, 400, { error: "missing_turn", message: "Provide message, text, recent_turn, asr_text, callerText, or messages[]." });
657422
+ return;
657423
+ }
657424
+ try {
657425
+ const sessionId = typeof body["session_id"] === "string" ? body["session_id"] : void 0;
657426
+ const result = await completeRealtimeTextOnly({
657427
+ body,
657428
+ messages: messages2,
657429
+ ollamaUrl,
657430
+ defaultModel: defaults3.model,
657431
+ defaultBackendType: defaults3.backendType,
657432
+ sessionId
657433
+ });
657434
+ const wantsPlain = String(req2.headers["accept"] ?? "").includes("text/plain") || body["format"] === "text";
657435
+ if (wantsPlain) {
657436
+ res.writeHead(200, { "Content-Type": "text/plain; charset=utf-8", "Cache-Control": "no-store" });
657437
+ res.end(result.reply + "\n");
657438
+ return;
657439
+ }
657440
+ jsonResponse(res, 200, {
657441
+ reply: result.reply,
657442
+ text: result.reply,
657443
+ raw_reply: result.rawReply,
657444
+ model: result.model,
657445
+ usage: result.usage,
657446
+ realtime: true,
657447
+ mode: "voice_adapter_text_only"
657448
+ });
657449
+ } catch (err) {
657450
+ jsonResponse(res, 502, { error: "realtime_failed", message: err instanceof Error ? err.message : String(err) });
657451
+ }
657452
+ }
657232
657453
  function backendAuthHeaders(endpoint) {
657233
657454
  const key = endpoint?.authKey ?? loadConfig().apiKey;
657234
657455
  if (key) return { Authorization: `Bearer ${key}` };
@@ -660173,7 +660394,7 @@ async function handlePostCommand(res, cmd) {
660173
660394
  });
660174
660395
  }
660175
660396
  }
660176
- async function handleRequest(req2, res, ollamaUrl, verbose) {
660397
+ async function handleRequest(req2, res, ollamaUrl, verbose, runtimeDefaults = {}) {
660177
660398
  try {
660178
660399
  const _liveCfg = loadConfig();
660179
660400
  if (_liveCfg.backendUrl) ollamaUrl = _liveCfg.backendUrl;
@@ -660443,6 +660664,14 @@ async function handleRequest(req2, res, ollamaUrl, verbose) {
660443
660664
  return;
660444
660665
  }
660445
660666
  }
660667
+ if ((pathname === "/realtime" || pathname === "/v1/realtime") && method === "POST") {
660668
+ if (!checkAuth(req2, res, "read")) {
660669
+ status = 401;
660670
+ return;
660671
+ }
660672
+ await handleRealtimeText(req2, res, ollamaUrl, runtimeDefaults);
660673
+ return;
660674
+ }
660446
660675
  if (pathname === "/v1/files" && method === "GET") {
660447
660676
  const dir = urlObj.searchParams.get("path") || process.cwd();
660448
660677
  try {
@@ -662448,13 +662677,14 @@ ${historyLines}
662448
662677
  }));
662449
662678
  }
662450
662679
  } finally {
662451
- recordMetric(method, pathname, status);
662680
+ const finalStatus = res.headersSent ? res.statusCode : status;
662681
+ recordMetric(method, pathname, finalStatus);
662452
662682
  const latencyMs = Math.round(performance.now() - startMs);
662453
662683
  logRequest({
662454
662684
  requestId,
662455
662685
  method,
662456
662686
  path: pathname,
662457
- status,
662687
+ status: finalStatus,
662458
662688
  latencyMs,
662459
662689
  user: req2._authUser ?? "anonymous",
662460
662690
  scope: req2._authScope ?? "none"
@@ -662464,7 +662694,7 @@ ${historyLines}
662464
662694
  requestId,
662465
662695
  method,
662466
662696
  path: pathname,
662467
- status,
662697
+ status: finalStatus,
662468
662698
  user: req2._authUser ?? "anonymous",
662469
662699
  scope: req2._authScope ?? "none",
662470
662700
  latencyMs: Math.round(performance.now() - startMs),
@@ -663386,7 +663616,10 @@ function startApiServer(options2 = {}) {
663386
663616
  }
663387
663617
  } catch {
663388
663618
  }
663389
- handleRequest(req2, res, ollamaUrl, verbose).catch((err) => {
663619
+ handleRequest(req2, res, ollamaUrl, verbose, {
663620
+ model: options2.model ?? config.model,
663621
+ backendType: options2.backendType ?? config.backendType
663622
+ }).catch((err) => {
663390
663623
  metrics.totalErrors++;
663391
663624
  try {
663392
663625
  jsonResponse(res, 500, {
@@ -664208,7 +664441,9 @@ async function apiServeCommand(opts, config) {
664208
664441
  port: opts.port,
664209
664442
  // Let startApiServer() parse OMNIUS_HOST env if no explicit --port
664210
664443
  verbose: opts.verbose,
664211
- ollamaUrl: config.backendUrl
664444
+ ollamaUrl: config.backendUrl,
664445
+ model: config.model,
664446
+ backendType: config.backendType
664212
664447
  });
664213
664448
  await new Promise((resolve57) => {
664214
664449
  server2.on("close", resolve57);
@@ -664263,7 +664498,7 @@ function setTimerEnabled(name10, enabled2) {
664263
664498
  return false;
664264
664499
  }
664265
664500
  }
664266
- var require4, NEXUS_DIRECTORY_ORIGIN2, NEXUS_SPONSORS_URL2, endpointRegistry, modelRouteMap, endpointUsage, _lastEndpointDiagnostics, BACKEND_TIMEOUT_DEFAULT_MS, BACKEND_TIMEOUT_MAX_MS, MODEL_LIST_TIMEOUT_DEFAULT_MS, metrics, startedAt, runningProcesses, perKeyUsage, CRON_MARKER2;
664501
+ var require4, NEXUS_DIRECTORY_ORIGIN2, NEXUS_SPONSORS_URL2, endpointRegistry, modelRouteMap, endpointUsage, _lastEndpointDiagnostics, BACKEND_TIMEOUT_DEFAULT_MS, BACKEND_TIMEOUT_MAX_MS, MODEL_LIST_TIMEOUT_DEFAULT_MS, metrics, startedAt, realtimeOllamaFallbackCache, runningProcesses, perKeyUsage, CRON_MARKER2;
664267
664502
  var init_serve = __esm({
664268
664503
  "packages/cli/src/api/serve.ts"() {
664269
664504
  "use strict";
@@ -664311,6 +664546,7 @@ var init_serve = __esm({
664311
664546
  totalErrors: 0
664312
664547
  };
664313
664548
  startedAt = Date.now();
664549
+ realtimeOllamaFallbackCache = /* @__PURE__ */ new Map();
664314
664550
  runningProcesses = /* @__PURE__ */ new Map();
664315
664551
  perKeyUsage = /* @__PURE__ */ new Map();
664316
664552
  CRON_MARKER2 = "# OMNIUS-SCHEDULED:";
@@ -4,6 +4,8 @@ Realtime mode is for short, natural, back-and-forth spoken conversation behind A
4
4
 
5
5
  It is not a long-form coding-task mode. It trims context, reduces scaffolding, and optimizes for speakable answers.
6
6
 
7
+ The text-only adapter endpoint is `/realtime` (alias: `/v1/realtime`). ASR and TTS are intentionally out of scope for that route; pass the latest transcript text in, receive a short reply text out.
8
+
7
9
  ## Enable In The TUI
8
10
 
9
11
  ```text
@@ -14,6 +16,25 @@ It is not a long-form coding-task mode. It trims context, reduces scaffolding, a
14
16
 
15
17
  ## Use Through REST
16
18
 
19
+ Voice-adapter text endpoint:
20
+
21
+ ```bash
22
+ curl -s http://127.0.0.1:11435/realtime \
23
+ -H 'content-type: application/json' \
24
+ -H 'accept: text/plain' \
25
+ -d '{
26
+ "soul_md": "Be direct, warm, and practical.",
27
+ "recent_turn": "Can you say the short version?",
28
+ "realtime_options": {
29
+ "max_reply_words": 32,
30
+ "max_tokens": 120
31
+ },
32
+ "format": "text"
33
+ }'
34
+ ```
35
+
36
+ Chat-compatible endpoint:
37
+
17
38
  ```bash
18
39
  curl -s http://127.0.0.1:11435/v1/chat \
19
40
  -H 'content-type: application/json' \
@@ -57,7 +78,8 @@ Realtime mode builds a compact prompt from:
57
78
 
58
79
  Realtime responses should:
59
80
 
60
- - default to one or two speakable sentences
81
+ - default to one natural phone-call turn, usually under 36 words
82
+ - lead with the answer, not analysis or status
61
83
  - ask one focused repair question when ASR text is ambiguous
62
84
  - treat the latest user utterance as the live turn
63
85
  - avoid long markdown, tables, verbose plans, or implementation narration unless requested
@@ -6,6 +6,8 @@
6
6
  | --- | --- | --- |
7
7
  | `GET` | `/v1/models` | List aggregated models |
8
8
  | `POST` | `/v1/chat/completions` | OpenAI-compatible chat completions |
9
+ | `POST` | `/realtime` | Text-only voice-adapter brain: transcript text in, short reply text out |
10
+ | `POST` | `/v1/realtime` | Auth-scoped alias for `/realtime` |
9
11
  | `POST` | `/v1/embeddings` | Generate embeddings |
10
12
  | `POST` | `/v1/chat` | Stateful Omnius chat with optional full agent tools |
11
13
  | `POST` | `/v1/generate` | Ollama-compatible one-shot generation |
@@ -96,6 +98,8 @@ When `realtime: true`, Omnius:
96
98
 
97
99
  Use this for live voice clients, not long coding tasks.
98
100
 
101
+ For ASR/TTS systems that only need the text brain, use `/realtime` or `/v1/realtime` with `message`, `text`, `recent_turn`, `asr_text`, or `callerText`. Optional `soul_md` supplies adapter-local SOUL.md content. Set `Accept: text/plain` or `format: "text"` to receive only the reply string.
102
+
99
103
  ## Server-Side Agent Loop
100
104
 
101
105
  `/v1/chat/completions` can run an internal tool loop when `agent_loop: true`. This lets clients collapse multiple model/tool round trips into one daemon request. Daemon tool calls execute inline; client-owned tool calls can still be yielded in OpenAI-compatible shape.
@@ -1,12 +1,12 @@
1
1
  {
2
2
  "name": "omnius",
3
- "version": "1.0.168",
3
+ "version": "1.0.170",
4
4
  "lockfileVersion": 3,
5
5
  "requires": true,
6
6
  "packages": {
7
7
  "": {
8
8
  "name": "omnius",
9
- "version": "1.0.168",
9
+ "version": "1.0.170",
10
10
  "bundleDependencies": [
11
11
  "image-to-ascii"
12
12
  ],
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "omnius",
3
- "version": "1.0.168",
3
+ "version": "1.0.170",
4
4
  "description": "AI coding agent powered by open-source models (Ollama/vLLM) — interactive TUI with agentic tool-calling loop",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",