npm - @agentprojectcontext/apx - Versions diffs - 1.42.2 → 1.43.0 - Mend

@agentprojectcontext/apx 1.42.2 → 1.43.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29) hide show

package/package.json +1 -1
package/src/core/config/index.js +2 -0
package/src/core/config/redact.js +2 -0
package/src/core/desktop/process.js +126 -0
package/src/core/voice/stt-hardware.js +87 -0
package/src/core/voice/stt-models.js +97 -0
package/src/core/voice/transcription.js +147 -16
package/src/host/daemon/api/desktop.js +54 -8
package/src/host/daemon/api/transcribe.js +40 -1
package/src/host/daemon/whisper-server.js +18 -8
package/src/host/daemon/whisper-server.py +71 -44
package/src/interfaces/cli/commands/desktop.js +13 -68
package/src/interfaces/desktop/main.js +32 -4
package/src/interfaces/desktop/renderer.js +26 -5
package/src/interfaces/web/dist/assets/index-B0nTYflm.js +651 -0
package/src/interfaces/web/dist/assets/index-B0nTYflm.js.map +1 -0
package/src/interfaces/web/dist/assets/index-C22PmKCD.css +1 -0
package/src/interfaces/web/dist/index.html +2 -2
package/src/interfaces/web/src/components/ShortcutInput.tsx +156 -0
package/src/interfaces/web/src/components/voice/VoiceSttCard.tsx +101 -5
package/src/interfaces/web/src/i18n/en.ts +28 -2
package/src/interfaces/web/src/i18n/es.ts +28 -2
package/src/interfaces/web/src/lib/api/desktop.ts +28 -0
package/src/interfaces/web/src/lib/api/voice.ts +26 -2
package/src/interfaces/web/src/screens/modules/DeckScreen.tsx +55 -3
package/src/interfaces/web/src/screens/modules/DesktopScreen.tsx +98 -36
package/src/interfaces/web/dist/assets/index-BReF4_xV.js +0 -646
package/src/interfaces/web/dist/assets/index-BReF4_xV.js.map +0 -1
package/src/interfaces/web/dist/assets/index-wrEbTJbc.css +0 -1

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@agentprojectcontext/apx",
-  "version": "1.42.2",
+  "version": "1.43.0",
   "description": "APX — unified CLI + daemon for the Agent Project Context (APC) standard.",
   "publishConfig": {
     "access": "public"

package/src/core/config/index.js CHANGED Viewed

@@ -190,6 +190,8 @@ const CREDENTIAL_PATHS = [
   ["voice", "tts", "elevenlabs", "api_key"],
   ["voice", "tts", "openai", "api_key"],
   ["voice", "tts", "gemini", "api_key"],
+  ["transcription", "openai", "api_key"],
+  ["transcription", "custom", "api_key"],
   ["memory", "embeddings", "openai", "api_key"],
   ["memory", "embeddings", "gemini", "api_key"],
   ["telegram", "channels"], // entire array — losing it is also a regression

package/src/core/config/redact.js CHANGED Viewed

@@ -18,6 +18,8 @@ export const SECRET_PATHS = [
   "voice.tts.elevenlabs.api_key",
   "voice.tts.openai.api_key",
   "voice.tts.gemini.api_key",
+  "transcription.openai.api_key",
+  "transcription.custom.api_key",
   "memory.embeddings.openai.api_key",
   "memory.embeddings.gemini.api_key",
   // Telegram bot tokens live inside an array — handled separately in redact()

package/src/core/desktop/process.js ADDED Viewed

@@ -0,0 +1,126 @@
+// Desktop (Electron floating window) process control — shared by the CLI
+// (`apx desktop start/stop/restart`) and the daemon's /desktop/{start,stop}
+// HTTP endpoints, so both spawn/kill the window the exact same way.
+//
+// The window is a detached Electron process (it must survive the spawner so a
+// LaunchAgent / a short-lived CLI invocation doesn't take it down). State is
+// tracked via ~/.apx/desktop.pid.
+"use strict";
+import fs from "node:fs";
+import os from "node:os";
+import path from "node:path";
+import { spawn, execFileSync } from "node:child_process";
+import { fileURLToPath } from "node:url";
+const __filename = fileURLToPath(import.meta.url);
+const __dirname = path.dirname(__filename);
+// src/core/desktop/ → repo root is three levels up.
+const ROOT = path.resolve(__dirname, "..", "..", "..");
+export const DESKTOP_MAIN = path.resolve(__dirname, "..", "..", "interfaces", "desktop", "main.js");
+export const DESKTOP_PID = path.join(os.homedir(), ".apx", "desktop.pid");
+const DESKTOP_LOG = path.join(os.homedir(), ".apx", "desktop.log");
+// ── PID file ────────────────────────────────────────────────────────────────
+/**
+ * Read the desktop window's PID from the PID file (DESKTOP_PID).
+ * @returns {number|null} A positive integer PID, or null when the file is
+ *   missing, unreadable, or does not hold a usable PID.
+ */
+export function readPid() {
+  let raw;
+  try { raw = fs.readFileSync(DESKTOP_PID, "utf8"); } catch { return null; }
+  const pid = Number.parseInt(raw.trim(), 10);
+  // Reject NaN, 0 and negatives: on POSIX, kill() with pid <= 0 addresses a
+  // process group rather than one process, which a corrupt file must never do.
+  return Number.isInteger(pid) && pid > 0 ? pid : null;
+}
+/** Persist `pid` (stringified) to the PID file, creating its directory first. */
+export function writePid(pid) {
+  const pidDir = path.dirname(DESKTOP_PID);
+  fs.mkdirSync(pidDir, { recursive: true });
+  const contents = String(pid);
+  fs.writeFileSync(DESKTOP_PID, contents);
+}
+/** Remove the PID file. Best-effort: any unlink failure is ignored. */
+export function clearPid() {
+  try {
+    fs.unlinkSync(DESKTOP_PID);
+  } catch {
+    // Deliberately ignored — there is nothing useful to do if removal fails.
+  }
+}
+/**
+ * Check whether `pid` names a live process that this process may signal.
+ * Uses signal 0, which performs the existence/permission check without
+ * delivering a signal. Any failure of that probe is reported as "not alive".
+ * @param {number|string|null|undefined} pid Candidate process id.
+ * @returns {boolean} true only for a positive integer pid whose probe succeeds.
+ */
+export function pidAlive(pid) {
+  const target = Number(pid);
+  // Only a single, positive process id is meaningful here: on POSIX, pid <= 0
+  // makes kill() address a process group instead of one process.
+  if (!Number.isInteger(target) || target <= 0) return false;
+  try {
+    process.kill(target, 0);
+    return true;
+  } catch {
+    return false;
+  }
+}
+/** True when the PID recorded in the PID file passes the pidAlive() probe. */
+export function isDesktopRunning() {
+  const recordedPid = readPid();
+  return pidAlive(recordedPid);
+}
+// ── Electron resolution ───────────────────────────────────────────────────
+// Smoke-test a candidate by actually executing it with `argv` (callers pass
+// `--version`). File existence alone proves nothing: a pnpm shim can be on
+// disk while the package behind it was never built. Output is discarded and
+// the run is capped at 5 s; any failure means "does not run".
+function electronRuns(cmd, argv) {
+  const options = { stdio: "ignore", timeout: 5000 };
+  try {
+    execFileSync(cmd, argv, options);
+  } catch {
+    return false;
+  }
+  return true;
+}
+// Locate a way to launch Electron and return a descriptor understood by
+// buildElectronSpawn(). Candidates are tried in order and each must pass the
+// electronRuns() smoke test:
+//   1. <ROOT>/node_modules/.bin/electron      → returned as an absolute path
+//   2. <ROOT>/node_modules/electron/cli.js    → returned as a ".js" path (run via node)
+//   3. whatever `which electron` prints       → returned as that path
+// If none works the literal "npx" is returned, so the result is never null.
+export function findElectron() {
+  const versionArgs = ["--version"];
+  const modulesDir = path.join(ROOT, "node_modules");
+  const localBin = path.join(modulesDir, ".bin", "electron");
+  if (fs.existsSync(localBin) && electronRuns(localBin, versionArgs)) {
+    return localBin;
+  }
+  const cliScript = path.join(modulesDir, "electron", "cli.js");
+  if (fs.existsSync(cliScript) && electronRuns(process.execPath, [cliScript, ...versionArgs])) {
+    return cliScript;
+  }
+  let globalBin = "";
+  try {
+    const lookup = execFileSync("which", ["electron"], { stdio: ["ignore", "pipe", "ignore"] });
+    globalBin = lookup.toString().trim();
+  } catch {
+    // `which` missing or no match — fall through to the npx fallback.
+    globalBin = "";
+  }
+  if (globalBin && electronRuns(globalBin, versionArgs)) {
+    return globalBin;
+  }
+  return "npx";
+}
+// Translate a findElectron() descriptor plus the app entry point into the
+// { cmd, argv } pair handed to spawn():
+//   "npx"       → npx -y electron <main> --port <port>
+//   "*.js" path → <node> <descriptor> <main> --port <port>
+//   other path  → <descriptor> <main> --port <port>
+export function buildElectronSpawn(descriptor, mainPath, port) {
+  const appArgs = [mainPath, "--port", port];
+  if (descriptor === "npx") {
+    return { cmd: "npx", argv: ["-y", "electron", ...appArgs] };
+  }
+  const isNodeScript = descriptor.endsWith(".js");
+  return isNodeScript
+    ? { cmd: process.execPath, argv: [descriptor, ...appArgs] }
+    : { cmd: descriptor, argv: appArgs };
+}
+// ── Lifecycle ───────────────────────────────────────────────────────────────
+// Spawn the window detached (survives the spawner). No console output — callers
+// format their own UX. detached:true gives the child its own session so a
+// LaunchAgent / short-lived CLI doesn't drag it down on exit; we unref() after
+// a 1.5s fail-fast window.
+/**
+ * Launch the desktop window unless one is already running.
+ * @param {{port?: string|number}} [options] Daemon port forwarded to the app
+ *   as `--port`; defaults to $APX_PORT or "7430".
+ * @returns {Promise<{ok: true, pid: number, already?: boolean} | {ok: false, error: string}>}
+ *   `already` is set when a live window was found and nothing was spawned.
+ */
+export async function startDesktopDetached({ port = process.env.APX_PORT || "7430" } = {}) {
+  if (isDesktopRunning()) return { ok: true, pid: readPid(), already: true };
+  clearPid();
+  if (!fs.existsSync(DESKTOP_MAIN)) return { ok: false, error: `desktop app not found at ${DESKTOP_MAIN}` };
+  const { cmd, argv } = buildElectronSpawn(findElectron(), DESKTOP_MAIN, String(port));
+  // Child stdout/stderr go to the log file when it can be opened, else nowhere.
+  let logFd;
+  try { logFd = fs.openSync(DESKTOP_LOG, "a"); } catch { logFd = "ignore"; }
+  let child;
+  try {
+    child = spawn(cmd, argv, {
+      detached: true,
+      stdio: ["ignore", logFd, logFd],
+      env: { ...process.env, ELECTRON_ENABLE_LOGGING: "1" },
+    });
+  } catch (e) {
+    return { ok: false, error: e.message };
+  } finally {
+    // Our copy of the descriptor is not needed once spawn() has returned/thrown.
+    if (typeof logFd === "number") { try { fs.closeSync(logFd); } catch {} }
+  }
+  const res = await new Promise((resolve) => {
+    let settled = false;
+    let timer;
+    const settle = (value) => {
+      if (settled) return;
+      settled = true;
+      clearTimeout(timer);
+      resolve(value);
+    };
+    // Spawn failures such as ENOENT are reported asynchronously through the
+    // 'error' event; without a listener that event is thrown as an uncaught
+    // exception and would take the host process (CLI or daemon) down.
+    child.on("error", (err) => settle({ ok: false, error: err.message }));
+    child.on("exit", (code, signal) => settle({ ok: code === 0, code, signal }));
+    // Still running after the fail-fast window: detach from it and report success.
+    timer = setTimeout(() => { child.unref(); settle({ ok: true }); }, 1500);
+  });
+  if (!res.ok) {
+    if (res.error) return { ok: false, error: res.error };
+    const how = res.signal ? `signal ${res.signal}` : `code ${res.code}`;
+    return { ok: false, error: `desktop exited with ${how}` };
+  }
+  if (child.pid) writePid(child.pid);
+  return { ok: true, pid: child.pid };
+}
+// Terminate the running window with SIGTERM and drop the PID file.
+// Returns { ok: true, stopped: false } when no live window was recorded,
+// { ok: true, stopped: true, pid } once the signal was sent, or
+// { ok: false, error } when sending the signal threw.
+export function stopDesktop() {
+  const pid = readPid();
+  if (!pidAlive(pid)) {
+    // Nothing live behind the recorded PID — just tidy up the stale file.
+    clearPid();
+    return { ok: true, stopped: false };
+  }
+  try {
+    process.kill(pid, "SIGTERM");
+  } catch (e) {
+    return { ok: false, error: e.message };
+  }
+  clearPid();
+  return { ok: true, stopped: true, pid };
+}

package/src/core/voice/stt-hardware.js ADDED Viewed

@@ -0,0 +1,87 @@
+// Hardware probe + STT engine recommendation.
+//
+// The transcription backend should adapt to the machine instead of making the
+// user understand CTranslate2 vs MLX vs whisper.cpp:
+//
+//   Apple Silicon (Metal)  → mlx-whisper, large-v3-turbo   (GPU/ANE accelerated)
+//   NVIDIA (CUDA)          → faster-whisper cuda, large-v3  (GPU accelerated)
+//   AMD / Radeon           → faster-whisper cpu (limited)   (no ROCm in CT2)
+//   CPU only               → faster-whisper cpu, small      (safe + light)
+//
+// Detection is dependency-free and best-effort: short-timeout probes of
+// nvidia-smi / rocminfo, plus os.platform()/os.arch(). Anything uncertain
+// degrades to the CPU recommendation.
+import os from "node:os";
+import { spawnSync } from "node:child_process";
+// Run `cmd` synchronously with all stdio discarded and a 1.5 s timeout, and
+// report whether it exited with status 0. Any other outcome counts as false.
+function cmdOk(cmd, args = []) {
+  let result;
+  try {
+    result = spawnSync(cmd, args, { timeout: 1500, stdio: "ignore" });
+  } catch {
+    return false;
+  }
+  return result.status === 0;
+}
+/**
+ * Probe the host and classify its accelerator. Checks run in this order and
+ * the first match wins: Apple Silicon (darwin + arm64) → "metal";
+ * `nvidia-smi -L` exiting 0 → "cuda"; on Linux, `rocminfo` exiting 0 → "rocm";
+ * otherwise "none". `gpuName` is only set on the Apple Silicon path.
+ * @returns {{platform:string, arch:string, appleSilicon:boolean, gpu:"metal"|"cuda"|"rocm"|"none", gpuName?:string}}
+ */
+export function detectHardware() {
+  const platform = os.platform();
+  const arch = os.arch();
+  const base = { platform, arch };
+  const isAppleSilicon = platform === "darwin" && arch === "arm64";
+  if (isAppleSilicon) {
+    return { ...base, appleSilicon: true, gpu: "metal", gpuName: cpuBrand() };
+  }
+  let gpu = "none";
+  if (cmdOk("nvidia-smi", ["-L"])) {
+    gpu = "cuda";
+  } else if (platform === "linux" && cmdOk("rocminfo")) {
+    gpu = "rocm";
+  }
+  return { ...base, appleSilicon: false, gpu };
+}
+// Trimmed model string of the first CPU reported by os.cpus(), or undefined
+// when it is empty, unavailable, or the lookup throws.
+function cpuBrand() {
+  try {
+    const firstCpu = os.cpus()?.[0];
+    const model = (firstCpu?.model || "").trim();
+    return model || undefined;
+  } catch {
+    return undefined;
+  }
+}
+// Map a detectHardware() result to the STT engine + model to use. `backend`
+// names the local engine implementation, `model` is the id in that engine's
+// own format, and `tier` is "gpu" or "cpu". Only `hw.gpu` is consulted; any
+// value other than "metal" / "cuda" / "rocm" gets the plain CPU recommendation.
+export function recommendStt(hw = detectHardware()) {
+  switch (hw.gpu) {
+    case "metal":
+      return {
+        backend: "mlx", device: "metal",
+        model: "mlx-community/whisper-large-v3-turbo",
+        reason: "Apple Silicon: MLX corre en la GPU/Neural Engine (Metal).",
+        tier: "gpu",
+      };
+    case "cuda":
+      return {
+        backend: "faster", device: "cuda", compute_type: "float16",
+        model: "large-v3",
+        reason: "GPU NVIDIA: faster-whisper en CUDA soporta modelos grandes rápido.",
+        tier: "gpu",
+      };
+    case "rocm":
+      // Same engine settings as the CPU fallback, but flagged as `limited`.
+      return {
+        backend: "faster", device: "cpu", compute_type: "int8",
+        model: "small",
+        reason: "Radeon/ROCm no está soportado por CTranslate2 — se usa CPU. (whisper.cpp Vulkan es una mejora futura.)",
+        tier: "cpu",
+        limited: true,
+      };
+    default:
+      return {
+        backend: "faster", device: "cpu", compute_type: "int8",
+        model: "small",
+        reason: "Sin GPU acelerada: faster-whisper en CPU con un modelo liviano.",
+        tier: "cpu",
+      };
+  }
+}

package/src/core/voice/stt-models.js ADDED Viewed

@@ -0,0 +1,97 @@
+// STT model catalog + on-disk status (downloaded? how big?).
+//
+// Both engines pull their weights from the HuggingFace hub cache
+// (~/.cache/huggingface/hub/models--<org>--<name>), just in different formats:
+//   faster-whisper → Systran/faster-whisper-<model>   (CTranslate2)
+//   mlx-whisper    → mlx-community/whisper-<model>     (MLX)
+//
+// We read the cache directory to report presence + real byte size, and carry an
+// approximate download size for models that aren't there yet (Ollama-style).
+import fs from "node:fs";
+import os from "node:os";
+import path from "node:path";
+// Resolve the HuggingFace hub cache directory the way huggingface_hub does:
+//   1. $HF_HUB_CACHE (or the legacy $HUGGINGFACE_HUB_CACHE) is the hub dir itself
+//   2. else $HF_HOME/hub
+//   3. else $XDG_CACHE_HOME/huggingface/hub
+//   4. else ~/.cache/huggingface/hub
+// Honouring the overrides matters: with a relocated cache every model would
+// otherwise be reported as "not downloaded".
+function hubDir() {
+  const env = process.env;
+  const hubCache = env.HF_HUB_CACHE || env.HUGGINGFACE_HUB_CACHE;
+  if (hubCache) return hubCache;
+  const cacheRoot = env.XDG_CACHE_HOME || path.join(os.homedir(), ".cache");
+  const hfHome = env.HF_HOME || path.join(cacheRoot, "huggingface");
+  return path.join(hfHome, "hub");
+}
+/**
+ * Folder name used inside the hub cache for a repo id: every "/" becomes "--"
+ * and the result is prefixed with "models--".
+ * e.g. "Systran/faster-whisper-small" → "models--Systran--faster-whisper-small".
+ */
+function repoCacheName(repoId) {
+  const flattened = repoId.split("/").join("--");
+  return `models--${flattened}`;
+}
+// Total size in bytes of the regular files under `dir`, walking every
+// subdirectory. Symlinks are never followed or counted: HF keeps the real
+// bytes once in blobs/ and links to them from snapshots/, so counting links
+// would double-count. Unreadable directories and files contribute 0.
+function dirSizeBytes(dir) {
+  let bytes = 0;
+  const pending = [dir];
+  for (let i = 0; i < pending.length; i += 1) {
+    const current = pending[i];
+    let children;
+    try {
+      children = fs.readdirSync(current, { withFileTypes: true });
+    } catch {
+      continue;
+    }
+    for (const child of children) {
+      if (child.isSymbolicLink()) continue;
+      const childPath = path.join(current, child.name);
+      if (child.isDirectory()) {
+        pending.push(childPath);
+        continue;
+      }
+      if (!child.isFile()) continue;
+      try {
+        bytes += fs.lstatSync(childPath).size;
+      } catch {
+        // File vanished or is unreadable — leave it out of the total.
+      }
+    }
+  }
+  return bytes;
+}
+/**
+ * Format a byte count with 1024-based units (B, KB, MB, GB — GB is the cap).
+ * Values under 10 in KB/MB/GB keep one decimal; everything else is rounded to
+ * a whole number. Falsy input or anything below 1 renders as "—".
+ */
+export function humanSize(bytes) {
+  if (!bytes || bytes < 1) return "—";
+  const units = ["B", "KB", "MB", "GB"];
+  const lastUnit = units.length - 1;
+  let value = bytes;
+  let unitIndex = 0;
+  for (; value >= 1024 && unitIndex < lastUnit; unitIndex++) {
+    value /= 1024;
+  }
+  const decimals = unitIndex === 0 || value >= 10 ? 0 : 1;
+  return `${value.toFixed(decimals)} ${units[unitIndex]}`;
+}
+// Catalog: per backend, the offered models with their HF repo id and an
+// approximate download size (used only until the model is actually on disk).
+// Keyed by backend name ("faster" | "mlx"); each entry has:
+//   id        — short model id; repoFor() matches a requested model against it
+//   repo      — HuggingFace repo id; modelStatusByRepo() derives the cache dir from it
+//   approx_mb — estimated download size; listSttModels() scales it by 1024 * 1024
+export const STT_MODEL_CATALOG = {
+  // faster-whisper (CTranslate2) weights.
+  faster: [
+    { id: "tiny",           repo: "Systran/faster-whisper-tiny",            approx_mb: 75 },
+    { id: "base",           repo: "Systran/faster-whisper-base",            approx_mb: 145 },
+    { id: "small",          repo: "Systran/faster-whisper-small",           approx_mb: 480 },
+    { id: "medium",         repo: "Systran/faster-whisper-medium",          approx_mb: 1500 },
+    { id: "large-v3",       repo: "Systran/faster-whisper-large-v3",        approx_mb: 3100 },
+    { id: "large-v3-turbo", repo: "mobiuslabsgmbh/faster-whisper-large-v3-turbo", approx_mb: 1600 },
+  ],
+  // mlx-whisper (MLX) weights.
+  mlx: [
+    { id: "small",          repo: "mlx-community/whisper-small-mlx",            approx_mb: 480 },
+    { id: "large-v3",       repo: "mlx-community/whisper-large-v3-mlx",         approx_mb: 3100 },
+    { id: "large-v3-turbo", repo: "mlx-community/whisper-large-v3-turbo",       approx_mb: 1600 },
+  ],
+};
+/**
+ * Report whether `repo` is present in the hub cache and how many bytes it
+ * occupies. A cache folder holding 1,000,000 bytes or less (e.g. only refs or
+ * lock files) is treated as not downloaded, though its size is still reported.
+ * @returns {{downloaded: boolean, size_bytes: number}}
+ */
+export function modelStatusByRepo(repo) {
+  const cacheDir = path.join(hubDir(), repoCacheName(repo));
+  if (fs.existsSync(cacheDir)) {
+    const bytes = dirSizeBytes(cacheDir);
+    return { downloaded: bytes > 1_000_000, size_bytes: bytes };
+  }
+  return { downloaded: false, size_bytes: 0 };
+}
+/**
+ * Catalog entries of `backend` enriched with their on-disk status. `size` is
+ * the measured size when the model is downloaded, otherwise the catalog
+ * estimate prefixed with "~". An unknown backend yields an empty list.
+ */
+export function listSttModels(backend) {
+  const models = [];
+  for (const entry of STT_MODEL_CATALOG[backend] || []) {
+    const { downloaded, size_bytes } = modelStatusByRepo(entry.repo);
+    const size = downloaded
+      ? humanSize(size_bytes)
+      : `~${humanSize(entry.approx_mb * 1024 * 1024)}`;
+    models.push({ id: entry.id, repo: entry.repo, downloaded, size, size_bytes });
+  }
+  return models;
+}
+/**
+ * Look up the HF repo id for a (backend, model-id) pair. When the catalog has
+ * no matching entry, `modelId` is returned unchanged so callers may pass a raw
+ * repo id straight through.
+ */
+export function repoFor(backend, modelId) {
+  const catalog = STT_MODEL_CATALOG[backend] || [];
+  for (const entry of catalog) {
+    if (entry.id === modelId) return entry.repo || modelId;
+  }
+  return modelId;
+}

package/src/core/voice/transcription.js CHANGED Viewed

@@ -23,7 +23,11 @@ import { logInfo, logWarn } from "#core/logging.js";
 export const WHISPER_LOCAL_PORT = 18765;
 export const DEFAULT_LOCAL = {
-  model: "small",
+  // "auto" = adapt to the machine (mlx/Metal on Apple Silicon, faster-whisper
+  // cuda on NVIDIA, else faster-whisper cpu). Override with "faster" | "mlx".
+  backend: "auto",
+  model: "small",          // faster-whisper model id (tiny|base|small|…)
+  mlx_model: "",           // mlx repo (defaults to the hardware recommendation)
   device: "cpu",
   compute_type: "int8",
   language: "auto",
@@ -34,6 +38,28 @@ export const DEFAULT_LOCAL = {
   timeout_ms: 20 * 60_000,
 };
+// OpenAI's official cloud Whisper. `base_url` is overridable so the same
+// client can target any OpenAI-compatible server (see `custom`).
+export const DEFAULT_OPENAI = {
+  base_url: "https://api.openai.com/v1",
+  model: "whisper-1",
+  language: "auto",
+};
+// A user-supplied, OpenAI-compatible STT server reachable over the network:
+// mlx-audio on this Mac's Metal GPU (localhost:8000), a Radeon/NVIDIA box on
+// the LAN, or anyone's remote endpoint. All expose POST /audio/transcriptions,
+// so they share the exact client as `openai` — only base_url/key/model differ.
+export const DEFAULT_CUSTOM = {
+  base_url: "",   // e.g. http://localhost:8000/v1  or  http://192.168.1.50:9000/v1
+  api_key: "",    // optional — most local servers don't require one
+  model: "",      // e.g. mlx-community/whisper-large-v3-turbo  or  Systran/faster-whisper-large-v3
+  language: "auto",
+};
+/** STT engine ids surfaced to the web admin, in display/fallback order. */
+export const STT_ENGINE_IDS = ["local", "openai", "custom"];
 /**
  * Resolve the effective transcription language. Priority:
  *   explicit local config → config.user.language → "auto" (whisper detects).
@@ -44,29 +70,110 @@ export function resolveTranscriptionLanguage(localCfg, userLang) {
   return "auto";
 }
+/**
+ * Settle the local engine's backend, model, device and compute type, mutating
+ * `local` in place (nothing is returned).
+ *   - backend "auto" (or unset) takes the hardware recommendation.
+ *   - "mlx" is kept only when its model is already in the cache; otherwise the
+ *     faster-whisper path is used so a live voice turn never waits on a
+ *     multi-GB download (the model-manager UI owns explicit downloads).
+ *   - every other outcome ends as "faster"; when the recommendation is
+ *     faster-whisper on CUDA, a "cpu"/unset device becomes "cuda" and an "int8"
+ *     compute type becomes the recommended one.
+ * If either helper module fails to load, the CPU faster-whisper defaults apply.
+ */
+async function resolveLocalBackend(local) {
+  const requested = local.backend || "auto";
+  const recommendation = await (async () => {
+    try {
+      const { recommendStt } = await import("#core/voice/stt-hardware.js");
+      return recommendStt();
+    } catch {
+      return { backend: "faster", model: "small", device: "cpu", compute_type: "int8" };
+    }
+  })();
+  const effective = requested === "auto" ? recommendation.backend : requested;
+  if (effective === "mlx") {
+    const mlxModel = local.mlx_model || recommendation.model;
+    const present = await (async () => {
+      try {
+        const { modelStatusByRepo } = await import("#core/voice/stt-models.js");
+        return modelStatusByRepo(mlxModel).downloaded;
+      } catch {
+        return false;
+      }
+    })();
+    if (present) {
+      // whisper-server.js passes `model` as --model.
+      Object.assign(local, { backend: "mlx", model: mlxModel, device: "metal", compute_type: "mlx" });
+      return;
+    }
+    // Model not on disk → fall through to faster-whisper rather than block voice.
+  }
+  const cudaRecommended = recommendation.backend === "faster" && recommendation.device === "cuda";
+  if (cudaRecommended) {
+    const deviceUnpinned = !local.device || local.device === "cpu";
+    if (deviceUnpinned) local.device = "cuda";
+    if (local.compute_type === "int8") local.compute_type = recommendation.compute_type || "float16";
+  }
+  local.backend = "faster";
+}
 export async function getConfig() {
   try {
     const { readConfig } = await import("#core/config/index.js");
     const cfg = readConfig() || {};
     const t = cfg.transcription || {};
-    const openaiKey = cfg.engines?.openai?.api_key || process.env.OPENAI_API_KEY || "";
     const userLang = cfg.user?.language || "";
     const localBase = { ...DEFAULT_LOCAL, ...(t.local || {}) };
     localBase.language = resolveTranscriptionLanguage(localBase, userLang);
+    await resolveLocalBackend(localBase);
+    // OpenAI cloud: key can live in transcription.openai, the shared
+    // engines.openai block, or the env. base_url defaults to the official API.
+    const openai = { ...DEFAULT_OPENAI, ...(t.openai || {}) };
+    openai.api_key = t.openai?.api_key || cfg.engines?.openai?.api_key || process.env.OPENAI_API_KEY || "";
+    openai.language = resolveTranscriptionLanguage(openai, userLang);
+    // Custom OpenAI-compatible server (mlx-audio / Radeon / NVIDIA / remote).
+    const custom = { ...DEFAULT_CUSTOM, ...(t.custom || {}) };
+    custom.language = resolveTranscriptionLanguage(custom, userLang);
     return {
       provider: t.provider || "auto",
       local: localBase,
-      openaiKey,
+      openai,
+      custom,
+      // kept for backward-compat with callers that read `.openaiKey`
+      openaiKey: openai.api_key,
     };
   } catch {
     return {
       provider: "auto",
       local: { ...DEFAULT_LOCAL },
+      openai: { ...DEFAULT_OPENAI, api_key: process.env.OPENAI_API_KEY || "" },
+      custom: { ...DEFAULT_CUSTOM },
       openaiKey: process.env.OPENAI_API_KEY || "",
     };
   }
 }
+/**
+ * Describe the STT engines and their availability for the web admin.
+ *   - local:  always available — it is embedded and needs no credentials.
+ *   - openai: available when an API key is found in transcription.openai,
+ *             engines.openai, or $OPENAI_API_KEY.
+ *   - custom: available when transcription.custom.base_url is non-blank.
+ * @returns {{configured_provider:string, engines:Array<{id,available,configured}>}}
+ */
+export function listSttProviders(rawConfig = {}) {
+  const transcription = rawConfig.transcription || {};
+  const hasOpenAiKey = Boolean(
+    transcription.openai?.api_key
+      || rawConfig.engines?.openai?.api_key
+      || process.env.OPENAI_API_KEY
+      || "",
+  );
+  const hasCustomUrl = Boolean((transcription.custom?.base_url || "").trim());
+  return {
+    configured_provider: transcription.provider || "auto",
+    engines: [
+      { id: "local", available: true, configured: true },
+      { id: "openai", available: hasOpenAiKey, configured: hasOpenAiKey },
+      { id: "custom", available: hasCustomUrl, configured: hasCustomUrl },
+    ],
+  };
+}
 /**
  * Call the local whisper-server.py over HTTP. Does NOT spawn or check the
  * subprocess — that's host/daemon/whisper-server.js's job. If the server is
@@ -138,9 +245,14 @@ export async function transcribeViaLocalServer(filePath, opts) {
   throw lastErr || new Error("transcribeViaLocalServer: unknown failure");
 }
-/** OpenAI Whisper-1 cloud API. Needs an api_key. */
-export async function transcribeOpenAI(filePath, apiKey) {
-  if (!apiKey) throw new Error("openai transcription: no api_key");
+/**
+ * OpenAI-compatible transcription (POST {base_url}/audio/transcriptions with a
+ * multipart `file` + `model`). Works against OpenAI itself and any server that
+ * speaks the same contract: mlx-audio, faster-whisper-server, whisper.cpp
+ * server, etc. `backend` is just the label returned to the caller.
+ */
+export async function transcribeViaOpenAICompatible(filePath, { base_url, api_key, model, language, backend = "openai", timeout_ms = 120_000 } = {}) {
+  const baseUrl = (base_url || DEFAULT_OPENAI.base_url).replace(/\/+$/, "");
   const buf = fs.readFileSync(filePath);
   const ext = path.extname(filePath).slice(1).toLowerCase() || "webm";
   const fileType = ext === "ogg" || ext === "oga" ? "audio/ogg"
@@ -151,28 +263,40 @@ export async function transcribeOpenAI(filePath, apiKey) {
     : "application/octet-stream";
   const form = new FormData();
-  form.append("model", "whisper-1");
+  form.append("model", model || DEFAULT_OPENAI.model);
+  if (language && language !== "auto") form.append("language", language);
   form.append("file", new Blob([buf], { type: fileType }), path.basename(filePath));
-  const res = await fetch("https://api.openai.com/v1/audio/transcriptions", {
+  const t0 = Date.now();
+  const res = await fetch(`${baseUrl}/audio/transcriptions`, {
     method: "POST",
-    headers: { authorization: `Bearer ${apiKey}` },
+    // Auth header only when a key is set — local servers usually need none.
+    headers: api_key ? { authorization: `Bearer ${api_key}` } : {},
     body: form,
-    signal: AbortSignal.timeout(60_000),
+    signal: AbortSignal.timeout(timeout_ms),
   });
   if (!res.ok) {
     const errBody = await res.text().catch(() => "");
-    throw new Error(`openai whisper ${res.status}: ${errBody.slice(0, 240)}`);
+    throw new Error(`${backend} stt ${res.status}: ${errBody.slice(0, 240)}`);
   }
   const json = await res.json();
+  logInfo("whisper", `transcribeViaOpenAICompatible(${backend}) ok in ${Date.now() - t0}ms`, {
+    chars: (json.text || "").length, base_url: baseUrl, model: model || DEFAULT_OPENAI.model,
+  });
   return {
     ok: true,
-    backend: "openai",
+    backend,
     text: json.text || "",
     language: json.language || null,
   };
 }
+/**
+ * Legacy entry point kept for existing callers: transcribe `filePath` with the
+ * DEFAULT_OPENAI settings and the given key, via
+ * transcribeViaOpenAICompatible(). Rejects when `apiKey` is empty.
+ */
+export async function transcribeOpenAI(filePath, apiKey) {
+  if (apiKey) {
+    const options = { ...DEFAULT_OPENAI, api_key: apiKey, backend: "openai" };
+    return transcribeViaOpenAICompatible(filePath, options);
+  }
+  throw new Error("openai transcription: no api_key");
+}
 /**
  * Transcribe a file. Provider chosen by config:
  *   - "openai": cloud only
@@ -188,17 +312,24 @@ export async function transcribe(filePath, overrides = {}) {
   const localOpts = { ...cfg.local, ...overrides };
   if (provider === "openai") {
-    return transcribeOpenAI(filePath, cfg.openaiKey);
+    return transcribeViaOpenAICompatible(filePath, { ...cfg.openai, backend: "openai" });
+  }
+  if (provider === "custom") {
+    if (!cfg.custom.base_url) throw new Error("custom transcription: set transcription.custom.base_url");
+    return transcribeViaOpenAICompatible(filePath, { ...cfg.custom, backend: "custom" });
   }
   if (provider === "local") {
     return transcribeViaLocalServer(filePath, localOpts);
   }
-  // auto: local first, fall back to openai if a key is configured
+  // auto: local first, then a configured remote (custom preferred over openai).
   try {
     return await transcribeViaLocalServer(filePath, localOpts);
   } catch (localErr) {
-    if (cfg.openaiKey) {
-      return transcribeOpenAI(filePath, cfg.openaiKey);
+    if (cfg.custom.base_url) {
+      return transcribeViaOpenAICompatible(filePath, { ...cfg.custom, backend: "custom" });
+    }
+    if (cfg.openai.api_key) {
+      return transcribeViaOpenAICompatible(filePath, { ...cfg.openai, backend: "openai" });
     }
     throw new Error(`local transcription failed: ${localErr.message}`);
   }

package/src/host/daemon/api/desktop.js CHANGED Viewed

@@ -1,6 +1,9 @@
 // Desktop (floating voice window) HTTP surface.
 //
-//   GET  /desktop/status        connected websocket clients count
+//   GET  /desktop/status        running flag (pid) + connected websocket clients
+//   POST /desktop/start         launch the floating window (detached Electron)
+//   POST /desktop/stop          terminate the running window (SIGTERM)
+//   POST /desktop/restart       broadcast a "reload" so live windows re-read config
 //   POST /desktop/message       text (post-STT). Responds 200 immediately;
 //                               the super-agent answer is streamed back over WS
 //                               by the desktop plugin.
@@ -16,18 +19,61 @@ import {
   autostartInstall,
   autostartUninstall,
 } from "#core/desktop/autostart.js";
+import {
+  isDesktopRunning,
+  startDesktopDetached,
+  stopDesktop,
+} from "#core/desktop/process.js";
-export function register(app, { plugins }) {
+export function register(app, { plugins, config }) {
   app.get("/desktop/status", (_req, res) => {
+    // `running` is the live Electron process (pid file) — the source of truth
+    // for the Start/Stop/Restart controls. `connected_clients` is how many of
+    // those windows have an open WS to the daemon (a window can be running but
+    // mid-reconnect), surfaced separately.
+    const running = isDesktopRunning();
     import("../desktop-ws.js")
       .then(({ desktopClients }) => {
-        res.json({
-          ok: true,
-          connected_clients: desktopClients.size,
-          running: desktopClients.size > 0,
-        });
+        res.json({ ok: true, connected_clients: desktopClients.size, running });
+      })
+      .catch(() => res.json({ ok: true, connected_clients: 0, running }));
+  });
+  // POST /desktop/start — launch the floating window (detached Electron). Same
+  // helper the CLI's `apx desktop start` uses. No-op-safe if already running.
+  app.post("/desktop/start", async (_req, res) => {
+    try {
+      const r = await startDesktopDetached({ port: config?.port });
+      if (!r.ok) return res.status(500).json({ ok: false, error: r.error });
+      res.json({ ok: true, pid: r.pid, already: !!r.already });
+    } catch (e) {
+      res.status(500).json({ ok: false, error: e.message });
+    }
+  });
+  // POST /desktop/stop — terminate the running window (SIGTERM). `stopped` is
+  // false when nothing was running.
+  app.post("/desktop/stop", (_req, res) => {
+    const r = stopDesktop();
+    if (!r.ok) return res.status(500).json({ ok: false, error: r.error });
+    res.json({ ok: true, stopped: r.stopped });
+  });
+  // POST /desktop/restart — ask every connected desktop window to reload.
+  // The web admin's "Restart" button hits this after a config change (theme,
+  // position) so the floating window re-reads ~/.apx/config.json and re-applies
+  // it without the user dropping to a terminal. The reload is a soft refresh of
+  // the renderer (main.js repositions + reloads webContents), NOT a process
+  // kill — the Electron app keeps its tray/shortcut. Returns how many windows
+  // were signalled so the UI can tell "reloaded" from "nothing connected".
+  app.post("/desktop/restart", (_req, res) => {
+    import("../desktop-ws.js")
+      .then(({ desktopClients, broadcastDesktop }) => {
+        const reloaded = desktopClients.size;
+        broadcastDesktop({ type: "reload" });
+        res.json({ ok: true, reloaded });
       })
-      .catch(() => res.json({ ok: true, connected_clients: 0, running: false }));
+      .catch((e) => res.status(500).json({ ok: false, error: e.message }));
   });
   app.post("/desktop/message", async (req, res) => {