npm - @agentprojectcontext/apx - Versions diffs - 1.15.6 → 1.17.0 - Mend

@agentprojectcontext/apx 1.15.6 → 1.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (222) hide show

package/package.json +46 -5
package/src/cli/commands/log.js +113 -0
package/src/cli/commands/overlay.js +253 -0
package/src/cli/commands/sys.js +88 -16
package/src/cli/index.js +23 -1
package/src/cli/terminal-chat/renderer.js +71 -56
package/src/cli-ts/commands/agent.ts +173 -0
package/src/cli-ts/commands/chat.ts +119 -0
package/src/cli-ts/commands/daemon.ts +112 -0
package/src/cli-ts/commands/exec.ts +109 -0
package/src/cli-ts/commands/mcp.ts +235 -0
package/src/cli-ts/commands/session.ts +224 -0
package/src/cli-ts/commands/status.ts +61 -0
package/src/cli-ts/http.ts +36 -0
package/src/cli-ts/index.ts +73 -0
package/src/cli-ts/ui.ts +107 -0
package/src/core/logging.js +81 -0
package/src/daemon/api.js +58 -0
package/src/daemon/engines/anthropic.js +60 -1
package/src/daemon/engines/index.js +2 -1
package/src/daemon/engines/ollama.js +70 -3
package/src/daemon/index.js +58 -0
package/src/daemon/overlay-ws.js +40 -0
package/src/daemon/plugins/index.js +2 -1
package/src/daemon/plugins/overlay.js +177 -0
package/src/daemon/plugins/telegram.js +15 -3
package/src/daemon/super-agent-langchain.js +296 -0
package/src/daemon/super-agent.js +115 -19
package/src/daemon/transcription.js +262 -59
package/src/daemon/whisper-server.py +57 -6
package/src/overlay/index.html +44 -0
package/src/overlay/main.js +480 -0
package/src/overlay/package.json +3 -0
package/src/overlay/preload.js +34 -0
package/src/overlay/renderer.js +371 -0
package/src/overlay/style.css +250 -0
package/src/tui/_shims/cli-error.ts +6 -0
package/src/tui/_shims/cli-logo.ts +18 -0
package/src/tui/_shims/cli-ui.ts +1 -0
package/src/tui/_shims/config-console-state.ts +7 -0
package/src/tui/_shims/core-any.ts +30 -0
package/src/tui/_shims/core-binary.ts +13 -0
package/src/tui/_shims/core-flag.ts +3 -0
package/src/tui/_shims/core-log.ts +14 -0
package/src/tui/_shims/lsp-language.ts +1 -0
package/src/tui/_shims/opencode-any.ts +135 -0
package/src/tui/_shims/opencode-sdk-v2.ts +48 -0
package/src/tui/_shims/plugin-tui.ts +13 -0
package/src/tui/_shims/provider-provider.ts +10 -0
package/src/tui/_shims/session-retry.ts +1 -0
package/src/tui/_shims/session-schema.ts +15 -0
package/src/tui/_shims/session-session.ts +3 -0
package/src/tui/_shims/snapshot.ts +4 -0
package/src/tui/_shims/tool-any.ts +18 -0
package/src/tui/_shims/util-error.ts +7 -0
package/src/tui/_shims/util-filesystem.ts +79 -0
package/src/tui/_shims/util-format.ts +7 -0
package/src/tui/_shims/util-iife.ts +3 -0
package/src/tui/_shims/util-locale.ts +10 -0
package/src/tui/_shims/util-process.ts +38 -0
package/src/tui/app.tsx +783 -0
package/src/tui/asset/charge.wav +0 -0
package/src/tui/asset/pulse-a.wav +0 -0
package/src/tui/asset/pulse-b.wav +0 -0
package/src/tui/asset/pulse-c.wav +0 -0
package/src/tui/attach.ts +100 -0
package/src/tui/component/bg-pulse-render.ts +436 -0
package/src/tui/component/bg-pulse.tsx +99 -0
package/src/tui/component/border.tsx +21 -0
package/src/tui/component/dialog-agent.tsx +31 -0
package/src/tui/component/dialog-console-org.tsx +103 -0
package/src/tui/component/dialog-mcp.tsx +85 -0
package/src/tui/component/dialog-model.tsx +175 -0
package/src/tui/component/dialog-provider.tsx +456 -0
package/src/tui/component/dialog-retry-action.tsx +160 -0
package/src/tui/component/dialog-session-delete-failed.tsx +99 -0
package/src/tui/component/dialog-session-list.tsx +323 -0
package/src/tui/component/dialog-session-rename.tsx +31 -0
package/src/tui/component/dialog-skill.tsx +36 -0
package/src/tui/component/dialog-stash.tsx +87 -0
package/src/tui/component/dialog-status.tsx +168 -0
package/src/tui/component/dialog-tag.tsx +44 -0
package/src/tui/component/dialog-theme-list.tsx +50 -0
package/src/tui/component/dialog-variant.tsx +39 -0
package/src/tui/component/dialog-workspace-create.tsx +302 -0
package/src/tui/component/dialog-workspace-file-changes.tsx +138 -0
package/src/tui/component/dialog-workspace-unavailable.tsx +69 -0
package/src/tui/component/error-component.tsx +92 -0
package/src/tui/component/logo.tsx +896 -0
package/src/tui/component/plugin-route-missing.tsx +14 -0
package/src/tui/component/prompt/autocomplete.tsx +869 -0
package/src/tui/component/prompt/cwd.ts +0 -0
package/src/tui/component/prompt/frecency.tsx +90 -0
package/src/tui/component/prompt/history.tsx +108 -0
package/src/tui/component/prompt/index.tsx +1809 -0
package/src/tui/component/prompt/part.ts +16 -0
package/src/tui/component/prompt/stash.tsx +101 -0
package/src/tui/component/prompt/traits.ts +35 -0
package/src/tui/component/spinner.tsx +24 -0
package/src/tui/component/startup-loading.tsx +63 -0
package/src/tui/component/todo-item.tsx +32 -0
package/src/tui/component/use-connected.tsx +9 -0
package/src/tui/component/workspace-label.tsx +19 -0
package/src/tui/config/cwd.ts +5 -0
package/src/tui/config/keybind.ts +432 -0
package/src/tui/config/tui-migrate.ts +154 -0
package/src/tui/config/tui-schema.ts +34 -0
package/src/tui/config/tui.ts +46 -0
package/src/tui/context/aggregate-failures.ts +34 -0
package/src/tui/context/args.tsx +15 -0
package/src/tui/context/command-palette.tsx +163 -0
package/src/tui/context/directory.ts +15 -0
package/src/tui/context/editor-zed.ts +283 -0
package/src/tui/context/editor.ts +468 -0
package/src/tui/context/event-apx.ts +22 -0
package/src/tui/context/event.ts +6 -0
package/src/tui/context/exit.tsx +60 -0
package/src/tui/context/helper.tsx +25 -0
package/src/tui/context/kv.tsx +81 -0
package/src/tui/context/local.tsx +608 -0
package/src/tui/context/path-format.tsx +39 -0
package/src/tui/context/project-apx.tsx +48 -0
package/src/tui/context/project.tsx +7 -0
package/src/tui/context/prompt.tsx +18 -0
package/src/tui/context/route.tsx +52 -0
package/src/tui/context/sdk-apx.tsx +185 -0
package/src/tui/context/sdk.tsx +6 -0
package/src/tui/context/sync-apx.tsx +178 -0
package/src/tui/context/sync-v2.tsx +16 -0
package/src/tui/context/sync.tsx +118 -0
package/src/tui/context/theme/aura.json +69 -0
package/src/tui/context/theme/ayu.json +80 -0
package/src/tui/context/theme/carbonfox.json +248 -0
package/src/tui/context/theme/catppuccin-frappe.json +230 -0
package/src/tui/context/theme/catppuccin-macchiato.json +230 -0
package/src/tui/context/theme/catppuccin.json +112 -0
package/src/tui/context/theme/cobalt2.json +225 -0
package/src/tui/context/theme/cursor.json +249 -0
package/src/tui/context/theme/dracula.json +219 -0
package/src/tui/context/theme/everforest.json +241 -0
package/src/tui/context/theme/flexoki.json +237 -0
package/src/tui/context/theme/github.json +233 -0
package/src/tui/context/theme/gruvbox.json +242 -0
package/src/tui/context/theme/kanagawa.json +77 -0
package/src/tui/context/theme/lucent-orng.json +234 -0
package/src/tui/context/theme/material.json +235 -0
package/src/tui/context/theme/matrix.json +77 -0
package/src/tui/context/theme/mercury.json +252 -0
package/src/tui/context/theme/monokai.json +221 -0
package/src/tui/context/theme/nightowl.json +221 -0
package/src/tui/context/theme/nord.json +223 -0
package/src/tui/context/theme/one-dark.json +84 -0
package/src/tui/context/theme/opencode.json +245 -0
package/src/tui/context/theme/orng.json +249 -0
package/src/tui/context/theme/osaka-jade.json +93 -0
package/src/tui/context/theme/palenight.json +222 -0
package/src/tui/context/theme/rosepine.json +234 -0
package/src/tui/context/theme/solarized.json +223 -0
package/src/tui/context/theme/synthwave84.json +226 -0
package/src/tui/context/theme/tokyonight.json +243 -0
package/src/tui/context/theme/vercel.json +245 -0
package/src/tui/context/theme/vesper.json +218 -0
package/src/tui/context/theme/zenburn.json +223 -0
package/src/tui/context/theme.tsx +1247 -0
package/src/tui/context/tui-config.tsx +9 -0
package/src/tui/event.ts +16 -0
package/src/tui/feature-plugins/home/footer.tsx +94 -0
package/src/tui/feature-plugins/home/tips-view.tsx +166 -0
package/src/tui/feature-plugins/home/tips.tsx +59 -0
package/src/tui/feature-plugins/sidebar/context.tsx +65 -0
package/src/tui/feature-plugins/sidebar/files.tsx +63 -0
package/src/tui/feature-plugins/sidebar/footer.tsx +94 -0
package/src/tui/feature-plugins/sidebar/lsp.tsx +65 -0
package/src/tui/feature-plugins/sidebar/mcp.tsx +97 -0
package/src/tui/feature-plugins/sidebar/todo.tsx +49 -0
package/src/tui/feature-plugins/system/plugins.tsx +269 -0
package/src/tui/feature-plugins/system/session-v2.tsx +1143 -0
package/src/tui/feature-plugins/system/which-key.tsx +608 -0
package/src/tui/keymap.tsx +166 -0
package/src/tui/layer.ts +6 -0
package/src/tui/plugin/api.tsx +381 -0
package/src/tui/plugin/command-shim.ts +109 -0
package/src/tui/plugin/internal.ts +33 -0
package/src/tui/plugin/runtime.ts +1069 -0
package/src/tui/plugin/slots.tsx +60 -0
package/src/tui/routes/home.tsx +96 -0
package/src/tui/routes/session/dialog-fork-from-timeline.tsx +76 -0
package/src/tui/routes/session/dialog-message.tsx +108 -0
package/src/tui/routes/session/dialog-subagent.tsx +26 -0
package/src/tui/routes/session/dialog-timeline.tsx +47 -0
package/src/tui/routes/session/footer.tsx +91 -0
package/src/tui/routes/session/index.tsx +188 -0
package/src/tui/routes/session/permission.tsx +722 -0
package/src/tui/routes/session/question.tsx +490 -0
package/src/tui/routes/session/sidebar.tsx +102 -0
package/src/tui/routes/session/subagent-footer.tsx +133 -0
package/src/tui/run.ts +84 -0
package/src/tui/thread.ts +261 -0
package/src/tui/tsconfig.json +40 -0
package/src/tui/ui/dialog-alert.tsx +66 -0
package/src/tui/ui/dialog-confirm.tsx +108 -0
package/src/tui/ui/dialog-export-options.tsx +217 -0
package/src/tui/ui/dialog-help.tsx +40 -0
package/src/tui/ui/dialog-prompt.tsx +101 -0
package/src/tui/ui/dialog-select.tsx +553 -0
package/src/tui/ui/dialog.tsx +211 -0
package/src/tui/ui/link.tsx +34 -0
package/src/tui/ui/spinner.ts +368 -0
package/src/tui/ui/toast.tsx +111 -0
package/src/tui/util/clipboard.ts +217 -0
package/src/tui/util/editor.ts +37 -0
package/src/tui/util/model.ts +23 -0
package/src/tui/util/provider-origin.ts +7 -0
package/src/tui/util/revert-diff.ts +18 -0
package/src/tui/util/scroll.ts +25 -0
package/src/tui/util/selection.ts +65 -0
package/src/tui/util/signal.ts +41 -0
package/src/tui/util/sound.ts +156 -0
package/src/tui/util/transcript.ts +112 -0
package/src/tui/validate-session.ts +29 -0
package/src/tui/win32.ts +130 -0
package/src/tui/worker.ts +104 -0

package/src/daemon/super-agent.js CHANGED Viewed

@@ -22,7 +22,43 @@ import { readIdentity } from "../core/identity.js";
 const MAX_TOOL_ITERS = 6;
-const DEFAULT_SYSTEM = `You are the **APX dispatcher** — the daemon-level agent that runs above all APC projects.
+// Tools that, when they're the ONLY thing the model called in an iteration,
+// don't count as "real work" — they're acknowledgements (telegram ping back
+// to the user, log lines, etc). When the model emits an iteration that only
+// contains acks, we DON'T let it leave the loop on iter N+1 with empty text:
+// we force another required tool call so the actual task gets executed.
+//
+// This is the fix for the "agent sends 'ya te escucho 🎧' and then stops"
+// bug. Without it, gemma4-class models sometimes consider the ack the
+// complete reply on iter 0 and emit only "ok" on iter 1, breaking out.
+const ACK_ONLY_TOOLS = new Set(["send_telegram"]);
+// Hard cap so the model can't ack-ack-ack forever — tool calls stay forced
+// while the ack-only streak is <= this value; once the streak EXCEEDS it the
+// loop falls back to tool_choice "auto" (the model already had its chance to call a real tool).
+const MAX_CONSECUTIVE_ACKS = 2;
+export const DEFAULT_SYSTEM = `# Identity (override everything else)
+You are **APX** — Manuel's personal assistant running on his Mac.
+You are NOT a code analyzer, NOT a generic chatbot, NOT a tutor.
+You are an **action agent**: you USE TOOLS to do real things on Manuel's system.
+# Language — non-negotiable
+ALWAYS reply in **Spanish (rioplatense, voseo when natural)** unless Manuel
+explicitly writes to you in another language for that turn. The user is an
+Argentinian developer; English replies feel broken to him. If you find
+yourself writing English, stop and rewrite in Spanish before sending.
+This rule beats every other formatting hint below.
+# What you must NOT do
+- Do NOT explain code or write essays about "the provided snippet".
+- Do NOT describe what a tool *would* do — call it and report the result.
+- Do NOT dump the tool catalog at the user.
+- Do NOT respond with disclaimers ("as an AI…", "I'm just an assistant…").
+- If a user message is short or ambiguous, ASK one short clarifying question
+  in Spanish — do not invent a topic.
+# How you operate
+You are the **APX dispatcher** — the daemon-level agent that runs above all APC projects.
 APX is a local daemon + CLI for APC projects. User-level runtime state lives under ~/.apx/:
 - ~/.apx/config.json: daemon config, engines, Telegram, super-agent settings
@@ -51,7 +87,7 @@ HARD RULES (do not deviate):
 3. NEVER answer "specify a project" — instead, just call the tool with no argument and you'll get the full picture.
 4. If a tool result has an error, retry with different arguments before falling back to asking the user.
 5. Respect permission mode. total = execute requested actions without confirmation. automatico = read/list/safe shell actions run directly; destructive, external, runtime, MCP calls, outbound messages, config, and filesystem mutations need explicit user confirmation. permiso = only allowed tools run directly; everything else needs confirmation.
-6. Write in the user's language unless they request another language. The system prompt stays English. Plain text, no markdown formatting for Telegram.
+6. Write in **Spanish** by default (see "Language" section above). Plain text on Telegram — no markdown tables, no code fences unless quoting code. Keep replies under 6 sentences unless the user asks for detail.
 7. Stay brief: under 6 sentences unless asked for detail.
 8. You DO see recent prior turns of this chat as previous messages when applicable. **Use them ONLY to disambiguate references** (e.g. "el primero" → first project mentioned earlier). For ANY factual data — agent details, MCP details, file contents, memory — RE-CALL the tool. Past turns are context, not a cache. Models change, agents change, files change.
 9. /reset or /new from the user means "forget previous turns and answer this one fresh" — if you see those prefixes the operator already cleared the context for you.
@@ -63,7 +99,7 @@ HARD RULES (do not deviate):
 15. NO-PENDING RULE: never say "give me a second", "I will do it", or "I will try later" as a final answer. Either call the tool in this same turn or say what blocks you.
 16. IDENTITY RULE: when the user asks you to change your name, call yourself something, or update your personality/language, call set_identity and persist the change. Then confirm with your new name.
 17. ROUTINES RULE: NEVER create a routine in the default project (id=0). Routines MUST be tied to a specific registered project. Before adding a routine, call list_projects to find the correct project id or name. Then pass --project <id|name> to apx routine add. If no project fits, ask the user which project to use. Creating routines in project 0/default mixes unrelated projects' schedules and corrupts state.
-18. **NO EMPTY RESPONSES**: Never respond with only text when you have tools available and the user is asking you to DO something. Call the tool FIRST, then explain. Never say "I'll do X" without immediately calling the tool. Empty acknowledgments ("ok", "entendido", "dame un minuto", "voy", "checking", "stand by") without a tool call are invalid responses — they will be re-prompted and waste a turn.
+18. **NO BARE ACKS AS FINAL ANSWER**: Empty acknowledgments ("ok", "entendido", "dame un minuto", "voy", "checking") are invalid as a FINAL response when a tool was needed — they will be re-prompted. EXCEPTION: a short contextual ack sent via send_telegram BEFORE another tool call is encouraged on Telegram audio inputs and on tool calls that take more than a few seconds (browser_screenshot, web_search, run_shell, long file edits). The ack must be **contextual and varied** in Spanish — e.g. "Ya te escucho 🎧", "Dame un seg, transcribiendo…", "Buscando eso ahora", "Voy a revisar el repo…", "Un momento, ejecutando…". Never reuse the exact same ack twice in a row. The ack is the FIRST tool call in the turn; the actual work follows immediately in the SAME turn (do not return without doing the work).
 19. **CWD RULE**: When the channel context includes a "CWD: <path>" line, that is the user's current working directory. References to "este directorio", "este proyecto", "esta carpeta", "acá", "aquí", "this directory", "this project", "current dir/folder" all mean that exact CWD path. Use it as the path argument directly — DO NOT ask the user "what's the path?" when CWD is already given. Example: if user says "agregá este proyecto a la lista", call add_project({path: <CWD>}) immediately.
 20. **NO MANUAL SCAFFOLDING**: To register or scaffold a project, ALWAYS use add_project — it auto-creates AGENTS.md and .apc/project.json when missing (one call, atomic). NEVER write AGENTS.md, .apc/project.json, or any APC scaffold file by hand via run_shell / write_file / shell pipes. The schema must come from the official initApf scaffold, not improvised. If add_project errors, report the error to the user — don't try to work around it with shell hacks. Same for any other APC-managed file (.apc/agents/*, .apc/skills/*, etc.) — use the dedicated tool, never raw filesystem writes.
 21. **SKILLS — ON DEMAND**: The "# Available skills" section below lists every skill available to you (slug + description, NO body). When the user asks about specific APX/APC commands, project structure, agent runtimes, or anything where exact syntax or detailed behavior matches a skill description (in ANY language — match semantically, not by keyword), call load_skill({slug}) to fetch the full markdown body. If a CWD is in the contextNote, pass it as project_path so project-scoped skills resolve. If the user explicitly asks "what skills do you have?", you can either read the catalog below directly OR call list_skills to get a fresh enumeration. Do NOT load skills for trivial / unrelated questions — that wastes tokens. Don't guess CLI syntax when a skill can tell you; load it.
@@ -143,8 +179,32 @@ function looksLikeActionRequest(text) {
   return /\b(list|show|find|get|fetch|search|run|execute|create|add|make|start|stop|delete|update|send|check|read|write|look|tell me|dame|mostra|busca|ejecuta|crea|agrega|mandá|revisá|corré|borrá|arrancá)\b/.test(t);
 }
+/**
+ * Build the identity block injected into every super-agent system prompt.
+ * Pure function — exported for unit tests.
+ *
+ * The block always starts with the "# Identity" heading and always ends with
+ * the reply-language instruction; the name / personality / owner lines are
+ * emitted only for fields that are truthy on `identity`.
+ *
+ * @param {object|null} identity  result of readIdentity(), or a plain object for tests
+ * @param {string} userLang       ISO 639-1 code from config.user.language (default "en");
+ *                                the default applies only when the argument is undefined,
+ *                                so null or "" is interpolated into the prompt verbatim
+ * @returns {string}              the collected lines joined with "\n"
+ */
+export function buildIdentityBlock(identity, userLang = "en") {
+  const lines = ["# Identity"];
+  if (identity?.agent_name) lines.push(`Your name is ${identity.agent_name}.`);
+  if (identity?.personality) lines.push(`Your personality: ${identity.personality}.`);
+  if (identity?.owner_name) lines.push(`Your owner is ${identity.owner_name}.`);
+  if (identity?.owner_context) lines.push(`Owner context: ${identity.owner_context}`);
+  lines.push(`Always reply in the language with ISO code "${userLang}" unless the user explicitly switches.`);
+  return lines.join("\n");
+}
export function isSuperAgentEnabled(cfg) {
-  return !!(cfg && cfg.super_agent && cfg.super_agent.enabled && cfg.super_agent.model);
+  // The super-agent is the system's default reply path. It is considered
+  // enabled as soon as a model is configured — the legacy `.enabled` flag is
+  // honoured only when explicitly set to `false`. This prevents the bot
+  // from silently dropping Telegram messages just because someone forgot to
+  // set super_agent.enabled = true.
+  const sa = cfg && cfg.super_agent;
+  if (!sa || !sa.model) return false; // no config, no super_agent section, or no model configured → disabled
+  return sa.enabled !== false; // a missing, null or truthy flag counts as enabled; only a literal false disables
 }
 export async function runSuperAgent({
@@ -158,6 +218,7 @@ export async function runSuperAgent({
   overrideModel = null,
   onEvent = null,
   signal,
+  onToken = null,
 }) {
   if (!isSuperAgentEnabled(globalConfig)) {
     throw new Error("super-agent not enabled (set super_agent.enabled and .model in ~/.apx/config.json)");
@@ -165,6 +226,19 @@ export async function runSuperAgent({
   const sa = globalConfig.super_agent;
   const activeModel = overrideModel || sa.model;
+  // Engine toggle: if config.super_agent.engine === "langchain", delegate to
+  // the LangChain AgentExecutor adapter. Default stays "native" (this loop).
+  // The toggle exists so we can A/B the two paths on the user's actual chat
+  // without committing to a full migration. See super-agent-langchain.js.
+  if (sa.engine === "langchain") {
+    const { runSuperAgentLangChain } = await import("./super-agent-langchain.js");
+    return runSuperAgentLangChain({
+      globalConfig, projects, plugins, registries,
+      prompt, previousMessages, contextNote,
+      onEvent, onToken, signal,
+    });
+  }
   // Tiny project hint — JUST names + ids, no detail. The model is expected to
   // call list_agents / list_mcps / read_agent_memory / etc. for everything
   // else. Keeping this short forces actual tool use instead of letting the
@@ -206,15 +280,7 @@ export async function runSuperAgent({
   // Language comes from config.user.language (ISO 639-1) so it stays in sync with transcription.
   const identity = (() => { try { return readIdentity(); } catch { return null; } })();
   const userLang = globalConfig?.user?.language || "en";
-  const identityBlock = (() => {
-    const lines = ["# Identity"];
-    if (identity?.agent_name) lines.push(`Your name is ${identity.agent_name}.`);
-    if (identity?.personality) lines.push(`Your personality: ${identity.personality}.`);
-    if (identity?.owner_name) lines.push(`Your owner is ${identity.owner_name}.`);
-    if (identity?.owner_context) lines.push(`Owner context: ${identity.owner_context}`);
-    lines.push(`Always reply in the language with ISO code "${userLang}" unless the user explicitly switches.`);
-    return lines.join("\n");
-  })();
+  const identityBlock = buildIdentityBlock(identity, userLang);
   const system = [
     sa.system || DEFAULT_SYSTEM,
@@ -246,14 +312,21 @@ export async function runSuperAgent({
   let totalUsage = { input_tokens: 0, output_tokens: 0 };
   let lastText = "";
   let usePseudoTools = false;
+  // Track how many consecutive iterations contained only ACK_ONLY tools.
+  // While this is > 0 we keep tool_choice="required" so the next iter has
+  // to do real work — otherwise gemma4-class models call send_telegram
+  // for the ack and then break out with empty text on iter N+1.
+  let ackOnlyStreak = 0;
   for (let iter = 0; iter < MAX_TOOL_ITERS; iter++) {
     await emitProgress(onEvent, { type: "model_start", iteration: iter + 1 });
-    // On the first iteration, force a tool call. This prevents the model from
-    // returning a bare acknowledgment ("ok", "dame un segundo") instead of
-    // acting on an action request. On later iterations (after tool results
-    // have been fed back) tool_choice is "auto" so the model can produce its
-    // final text summary.
+    // Force a tool call on iter 0 (no bare "ok dame un segundo" reply), AND
+    // on any iteration that immediately follows an ack-only iter (so the
+    // model can't ack and then stop). After at most MAX_CONSECUTIVE_ACKS
+    // forced rounds we let it fall back to "auto" so the model can finish.
+    const forceTool =
+      iter === 0 ||
+      (ackOnlyStreak > 0 && ackOnlyStreak <= MAX_CONSECUTIVE_ACKS);
     let result;
     try {
       result = await callEngine({
@@ -262,9 +335,12 @@ export async function runSuperAgent({
         messages: conversation,
         config: globalConfig,
         tools: usePseudoTools ? null : TOOL_SCHEMAS,
-        toolChoice: usePseudoTools ? null : (iter === 0 ? "required" : "auto"),
+        toolChoice: usePseudoTools ? null : (forceTool ? "required" : "auto"),
         maxTokens: 1024,
         signal,
+        // Only stream tokens on non-forced iterations — on forced iters the
+        // model MUST emit a tool_call, streaming text would confuse the user.
+        onToken: (!forceTool && onToken) ? onToken : null,
       });
     } catch (e) {
       if (usePseudoTools && /^ollama:/i.test(String(activeModel || "")) && /ollama\s+500/i.test(String(e?.message || "")) && trace.length > 0) {
@@ -284,6 +360,7 @@ export async function runSuperAgent({
         toolChoice: null,
         maxTokens: 1024,
         signal,
+        onToken: (iter > 0 && onToken) ? onToken : null,
       });
     }
     totalUsage.input_tokens += result.usage?.input_tokens || 0;
@@ -378,6 +455,25 @@ export async function runSuperAgent({
         content: JSON.stringify(toolResult),
       });
     }
+    // Did this iteration consist of ONLY ack-style tool calls? If so we'll
+    // keep tool_choice forced on the next iter (see top of loop). A turn
+    // that mixes send_telegram + e.g. browser_screenshot counts as "real
+    // work" and resets the streak.
+    const allAckOnly = toolCalls.every((tc) => {
+      const n = (tc.function?.name) || tc.name;
+      return ACK_ONLY_TOOLS.has(n);
+    });
+    if (allAckOnly) {
+      ackOnlyStreak += 1;
+      await emitProgress(onEvent, {
+        type: "ack_only_iter",
+        iteration: iter + 1,
+        streak: ackOnlyStreak,
+      });
+    } else {
+      ackOnlyStreak = 0;
+    }
   }
   return {

package/src/daemon/transcription.js CHANGED Viewed

@@ -28,8 +28,9 @@
 import fs from "node:fs";
 import path from "node:path";
-import { spawn } from "node:child_process";
+import { spawn, exec } from "node:child_process";
 import { fileURLToPath } from "node:url";
+import { logInfo, logWarn, logError } from "../core/logging.js";
 const __filename = fileURLToPath(import.meta.url);
 const __dirname = path.dirname(__filename);
@@ -43,8 +44,32 @@ const DEFAULT_LOCAL = {
   language: "auto",
   beam_size: 5,
   idle_minutes: 10,
+  // Max time we wait for /transcribe to return. Long audio files (Telegram
+  // voice notes > 10 min) can take several minutes on CPU; the previous
+  // hard-coded 5-minute cap silently truncated them. 20 minutes covers a
+  // ~60-minute voice note on a small int8 model. Override with
+  // transcription.local.timeout_ms in ~/.apx/config.json if needed.
+  timeout_ms: 20 * 60_000,
 };
+// ---------------------------------------------------------------------------
+// Config helpers (pure — exported for tests)
+// ---------------------------------------------------------------------------
+/**
+ * Resolve the effective transcription language.
+ * Priority: explicit local config → config.user.language → "auto" (whisper detects).
+ *
+ * A local language of "auto" or any falsy value (missing, "") counts as "not
+ * set" and defers to userLang. localCfg itself is dereferenced without a
+ * guard, so it must be a non-null object.
+ *
+ * @param {object} localCfg   merged transcription.local config
+ * @param {string} userLang   config.user.language ISO code (e.g. "es"), or ""
+ * @returns {string}          ISO code or "auto"
+ */
+export function resolveTranscriptionLanguage(localCfg, userLang) {
+  if (localCfg.language && localCfg.language !== "auto") return localCfg.language;
+  if (userLang) return userLang;
+  return "auto";
+}
 // ---------------------------------------------------------------------------
 // Config
 // ---------------------------------------------------------------------------
@@ -59,9 +84,7 @@ async function getConfig() {
     // Explicit transcription.local.language always wins; "auto" means fall back to user.language.
     const userLang = cfg.user?.language || "";
     const localBase = { ...DEFAULT_LOCAL, ...(t.local || {}) };
-    if ((!localBase.language || localBase.language === "auto") && userLang) {
-      localBase.language = userLang;
-    }
+    localBase.language = resolveTranscriptionLanguage(localBase, userLang);
     return {
       provider: t.provider || "auto",
       local: localBase,
@@ -98,6 +121,21 @@ async function _isServerHealthy() {
   }
 }
+// Ask the server on WHISPER_PORT for its /health JSON (800 ms timeout) and report the `model` field it returns.
+// Returns that model name, or null on a non-OK status, a missing/falsy model field, or any fetch/JSON error.
+async function _serverModelName() {
+  try {
+    const res = await fetch(`http://127.0.0.1:${WHISPER_PORT}/health`, {
+      signal: AbortSignal.timeout(800),
+    });
+    if (!res.ok) return null;
+    const j = await res.json();
+    return j?.model || null;
+  } catch {
+    return null;
+  }
+}
 async function _waitForServer(maxMs = 15_000) {
   const deadline = Date.now() + maxMs;
   while (Date.now() < deadline) {
@@ -107,18 +145,62 @@ async function _waitForServer(maxMs = 15_000) {
   throw new Error(`whisper-server did not start within ${maxMs}ms`);
 }
+// Find the PID of the process LISTENing on the whisper port (server only,
+// not clients). Filtering by -sTCP:LISTEN is critical — without it, lsof
+// also returns clients with an open connection (including this daemon). Resolves with a pid or null; never rejects.
+async function _findListenerPid() {
+  return new Promise((resolve) => {
+    exec(`lsof -ti tcp:${WHISPER_PORT} -sTCP:LISTEN`, (err, stdout) => {
+      if (err || !stdout) return resolve(null); // lsof errored or printed nothing → report "no listener"
+      const candidates = stdout.trim().split("\n")
+        .map(s => parseInt(s, 10))
+        .filter(n => Number.isFinite(n) && n !== process.pid); // drop unparsable lines and our own pid
+      resolve(candidates[0] || null); // first remaining pid, or null when none is left
+    });
+  });
+}
+async function _killOrphanWhisper() { // Best-effort: free WHISPER_PORT via POST /shutdown, then SIGTERM/SIGKILL on the listener pid.
+  // First try graceful /shutdown on the whisper server.
+  try {
+    await fetch(`http://127.0.0.1:${WHISPER_PORT}/shutdown`, {
+      method: "POST", signal: AbortSignal.timeout(1000),
+    });
+    await _sleep(600); // only reached when the request did not throw; lets the server exit before we look for a listener
+  } catch {} // deliberate: an unreachable or timed-out /shutdown just falls through to the kill path
+  // If still bound, force-kill the LISTENER pid only (never our own pid).
+  const pid = await _findListenerPid();
+  if (pid && pid !== process.pid) {
+    try { process.kill(pid, "SIGTERM"); } catch {} // ignore failures (e.g. the process is already gone)
+    await _sleep(400);
+    try { process.kill(pid, 0); try { process.kill(pid, "SIGKILL"); } catch {} } catch {} // signal 0 only probes: it throws if the pid is gone, so SIGKILL is sent only to a survivor
+    await _sleep(300);
+  }
+}
 async function ensureWhisperServer(opts) {
   const model = opts.model || DEFAULT_LOCAL.model;
   // Already running with the right model — health-check to confirm still alive.
   if (_serverProcess && _serverModel === model) {
     if (await _isServerHealthy()) return;
-    // Process died (idle shutdown). Fall through to restart.
     _serverProcess = null;
     _serverModel = null;
   }
-  // Wrong model: kill old server and start fresh.
+  // Adopt an externally-running whisper-server (e.g. left over from prior daemon).
+  if (!_serverProcess) {
+    const existing = await _serverModelName();
+    if (existing === model) {
+      _serverModel = model;
+      return;
+    }
+    if (existing) {
+      // Wrong model: kick out the orphan so we can start the right one.
+      await _killOrphanWhisper();
+    }
+  }
   if (_serverProcess) {
     try { _serverProcess.kill(); } catch {}
     _serverProcess = null;
@@ -126,6 +208,10 @@ async function ensureWhisperServer(opts) {
     await _sleep(300);
   }
+  await _spawnWhisper(opts, model, /* retried */ false);
+}
+async function _spawnWhisper(opts, model, retried) { // start the server and wait for readiness; `retried` limits the port-conflict self-heal to one retry
   const args = [
     WHISPER_SERVER,
     "--port", String(WHISPER_PORT),
@@ -151,32 +237,44 @@ async function ensureWhisperServer(opts) {
   });
   // Wait for the "ready" line on stdout, then wait for HTTP to respond.
-  await new Promise((resolve, reject) => {
-    const timeout = setTimeout(
-      () => reject(new Error("whisper-server startup timed out (15s)")),
-      15_000
-    );
-    let buf = "";
-    proc.stdout.on("data", (chunk) => {
-      buf += chunk.toString();
-      const nl = buf.indexOf("\n");
-      if (nl === -1) return;
-      const line = buf.slice(0, nl).trim();
-      buf = buf.slice(nl + 1);
-      clearTimeout(timeout);
-      try {
-        const msg = JSON.parse(line);
-        if (msg.status === "error") return reject(new Error(msg.error || "whisper-server error"));
-        resolve(); // "ready"
-      } catch {
-        resolve(); // unexpected line but server is up
-      }
-    });
-    proc.on("exit", (code) => {
-      clearTimeout(timeout);
-      reject(new Error(`whisper-server exited (code ${code}) before becoming ready`));
+  try {
+    await new Promise((resolve, reject) => {
+      const timeout = setTimeout(
+        () => reject(new Error("whisper-server startup timed out (15s)")),
+        15_000
+      );
+      let buf = "";
+      proc.stdout.on("data", (chunk) => {
+        buf += chunk.toString();
+        const nl = buf.indexOf("\n");
+        if (nl === -1) return;
+        const line = buf.slice(0, nl).trim();
+        buf = buf.slice(nl + 1);
+        clearTimeout(timeout); // a complete stdout line has arrived and decides the outcome; stop the 15s timer
+        try {
+          const msg = JSON.parse(line);
+          if (msg.status === "error") return reject(new Error(msg.error || "whisper-server error"));
+          resolve(); // any parsed status other than "error" counts as ready
+        } catch {
+          resolve(); // line was not valid JSON: unexpected, but treat it as the server being up
+        }
+      });
+      proc.on("exit", (code) => {
+        clearTimeout(timeout);
+        reject(new Error(`whisper-server exited (code ${code}) before becoming ready`));
+      });
     });
-  });
+  } catch (e) {
+    // Self-heal: if the error message says the port was already in use, kill the orphan and retry once (the `retried` flag prevents a second retry).
+    const msg = e.message || "";
+    if (!retried && /address already in use|errno 48|eaddrinuse/i.test(msg)) {
+      _serverProcess = null;
+      _serverModel = null;
+      await _killOrphanWhisper();
+      return _spawnWhisper(opts, model, /* retried */ true);
+    }
+    throw e;
+  }
 }
 // ---------------------------------------------------------------------------
@@ -190,30 +288,74 @@ async function transcribeLocal(filePath, opts) {
     ? null
     : (opts.language || null);
-  const res = await fetch(`http://127.0.0.1:${WHISPER_PORT}/transcribe`, {
-    method: "POST",
-    headers: { "content-type": "application/json" },
-    body: JSON.stringify({
-      audio_path: filePath,
-      language,
-      beam_size: opts.beam_size || DEFAULT_LOCAL.beam_size,
-    }),
-    signal: AbortSignal.timeout(5 * 60_000),
-  });
+  const timeoutMs = Number(opts.timeout_ms) > 0
+    ? Number(opts.timeout_ms)
+    : DEFAULT_LOCAL.timeout_ms;
-  const json = await res.json();
-  if (!json.ok) throw new Error(json.error || "transcription failed");
+  const body = JSON.stringify({
+    audio_path: filePath,
+    language,
+    beam_size: opts.beam_size || DEFAULT_LOCAL.beam_size,
+  });
-  return {
-    ok: true,
-    backend: "local",
-    text: json.text || "",
-    language: json.language || null,
-    language_probability: json.language_probability ?? null,
-    duration: json.duration ?? null,
-    model: json.model,
-    compute_type: json.compute_type,
-  };
+  // Long transcriptions on CPU (small int8, 1-minute voice note) can take
+  // 30-45s. Under undici (Node fetch) we occasionally see "fetch failed"
+  // from the inbound Telegram path even though the whisper-server completes
+  // the request successfully — a keep-alive socket gets reset somewhere
+  // between the long whisper-server response and the daemon's other
+  // concurrent traffic. We retry once on a generic "fetch failed" so the
+  // user actually gets a reply.
+  const maxAttempts = 2;
+  let lastErr = null;
+  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
+    const t0 = Date.now();
+    try {
+      logInfo("whisper", `transcribeLocal attempt ${attempt}/${maxAttempts}`, {
+        file: path.basename(filePath),
+        language: language || "auto",
+        timeout_ms: timeoutMs,
+      });
+      const res = await fetch(`http://127.0.0.1:${WHISPER_PORT}/transcribe`, {
+        method: "POST",
+        headers: { "content-type": "application/json", "connection": "close" },
+        body,
+        signal: AbortSignal.timeout(timeoutMs),
+      });
+      const json = await res.json();
+      if (!json.ok) throw new Error(json.error || "transcription failed");
+      logInfo("whisper", `transcribeLocal ok in ${Date.now() - t0}ms`, {
+        chars: (json.text || "").length,
+        language: json.language,
+        duration: json.duration,
+      });
+      return {
+        ok: true,
+        backend: "local",
+        text: json.text || "",
+        language: json.language || null,
+        language_probability: json.language_probability ?? null,
+        duration: json.duration ?? null,
+        model: json.model,
+        compute_type: json.compute_type,
+      };
+    } catch (e) {
+      lastErr = e;
+      const isRetriable =
+        /fetch failed|ECONNRESET|socket hang up|terminated/i.test(e.message || "");
+      const dt = Date.now() - t0;
+      logWarn("whisper", `transcribeLocal attempt ${attempt} failed in ${dt}ms`, {
+        error: e.message,
+        retriable: isRetriable,
+        will_retry: isRetriable && attempt < maxAttempts,
+      });
+      if (!isRetriable || attempt >= maxAttempts) break;
+      // Brief backoff before retry — gives the whisper-server.py thread time
+      // to flush its pending response and release the model lock.
+      await _sleep(500);
+    }
+  }
+  logError("whisper", `transcribeLocal exhausted retries`, { error: lastErr?.message });
+  throw lastErr || new Error("local transcription failed");
 }
 // ---------------------------------------------------------------------------
@@ -280,19 +422,80 @@ export async function transcribe(filePath, overrides = {}) {
     return transcribeOpenAI(filePath, cfg.openaiKey);
   }
   if (provider === "local") {
+    // Explicit local-only: bubble up the real error, do not mention OpenAI.
     return transcribeLocal(filePath, localOpts);
   }
-  // auto: local first, fall back to openai
+  // auto: local first, fall back to openai only if a key is configured
   try {
     return await transcribeLocal(filePath, localOpts);
   } catch (localErr) {
-    if (!cfg.openaiKey) {
-      throw new Error(
-        `local transcription failed and no OpenAI fallback available: ${localErr.message}`
-      );
+    if (cfg.openaiKey) {
+      return transcribeOpenAI(filePath, cfg.openaiKey);
     }
-    return transcribeOpenAI(filePath, cfg.openaiKey);
+    // No OpenAI configured — surface the real local error verbatim.
+    throw new Error(`local transcription failed: ${localErr.message}`);
+  }
+}
+/**
+ * Transcribe raw audio bytes (e.g. from a mic chunk or Telegram voice blob).
+ * Writes the bytes to a uniquely named file in the OS temp directory, runs
+ * transcribe() on that file, and removes it afterwards whether or not the
+ * transcription succeeded.
+ *
+ * @param {Buffer} buf        raw audio data; must be non-empty
+ * @param {string} format     file extension hint: "webm" | "ogg" | "wav" | "mp3" (default "webm").
+ *                            A leading dot is accepted; only ASCII letters and digits are kept,
+ *                            and an empty result falls back to "webm".
+ * @param {object} overrides  same as transcribe() overrides
+ * @returns {Promise<object>} whatever transcribe() resolves to
+ * @throws {Error} when `buf` is empty; errors from the file write or transcribe() propagate
+ */
+export async function transcribeBuffer(buf, format = "webm", overrides = {}) {
+  if (!buf || !buf.length) throw new Error("transcribeBuffer: empty buffer");
+  // The hint becomes part of a file name. Coerce it to a string (callers may
+  // pass null) and drop everything but alphanumerics so a value containing
+  // "/" or ".." can never move the temp file outside the temp directory.
+  const ext = String(format ?? "").replace(/[^A-Za-z0-9]/g, "").slice(0, 16) || "webm";
+  const tmpFile = path.join(
+    (await import("node:os")).default.tmpdir(),
+    `apx-audio-${Date.now()}-${Math.random().toString(36).slice(2)}.${ext}`
+  );
+  try {
+    fs.writeFileSync(tmpFile, buf);
+    return await transcribe(tmpFile, overrides);
+  } finally {
+    // Best-effort cleanup: the file may not exist if the write itself failed.
+    try { fs.unlinkSync(tmpFile); } catch {}
+  }
+}
+// ---------------------------------------------------------------------------
+// Lifecycle (preload on daemon start, shutdown on daemon stop)
+// ---------------------------------------------------------------------------
+/**
+ * Warm up the local whisper server ahead of the first transcription request.
+ * Does nothing when the configured provider is "openai". Any error raised by
+ * the config lookup or the server start is caught and reported through `log`
+ * instead of being rethrown, so calling this repeatedly is harmless.
+ *
+ * @param {(msg: string) => void} log  progress/failure sink (default console.log)
+ */
+export async function preloadWhisperServer(log = console.log) {
+  try {
+    const settings = await getConfig();
+    const usesLocalBackend = settings.provider !== "openai";
+    if (usesLocalBackend) {
+      log(`whisper: preloading model "${settings.local.model}" on port ${WHISPER_PORT}…`);
+      await ensureWhisperServer(settings.local);
+      log(`whisper: ready on port ${WHISPER_PORT} (model: ${_serverModel})`);
+    }
+  } catch (err) {
+    log(`whisper: preload failed — ${err.message} (will retry lazily on first request)`);
+  }
+}
+/**
+ * Stop the whisper server when the daemon shuts down.
+ *
+ * If this process holds a child handle (`_serverProcess`), the child is
+ * killed. Otherwise a best-effort POST /shutdown is sent to WHISPER_PORT so
+ * that a server we adopted rather than spawned is asked to exit as well;
+ * failures of that request (nothing listening, timeout) are ignored.
+ * The cached model name is cleared on both paths so no stale state survives.
+ */
+export async function shutdownWhisperServer() {
+  if (_serverProcess) {
+    try { _serverProcess.kill(); } catch {}
+    _serverProcess = null;
+    _serverModel = null;
+  } else {
+    // Try graceful shutdown of an adopted server
+    try {
+      await fetch(`http://127.0.0.1:${WHISPER_PORT}/shutdown`, {
+        method: "POST", signal: AbortSignal.timeout(500),
+      });
+    } catch {}
+    // Forget the adopted server's model too, mirroring the owned-child branch.
+    _serverModel = null;
   }
 }