@agentprojectcontext/apx 1.42.1 → 1.43.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. package/package.json +1 -1
  2. package/src/core/channels/telegram/api.js +62 -0
  3. package/src/core/channels/telegram/ask-callbacks.js +238 -0
  4. package/src/core/config/index.js +2 -0
  5. package/src/core/config/redact.js +2 -0
  6. package/src/core/confirmation/adapters/telegram.js +20 -37
  7. package/src/core/desktop/process.js +126 -0
  8. package/src/core/voice/stt-hardware.js +87 -0
  9. package/src/core/voice/stt-models.js +97 -0
  10. package/src/core/voice/transcription.js +147 -16
  11. package/src/host/daemon/api/desktop.js +54 -8
  12. package/src/host/daemon/api/transcribe.js +40 -1
  13. package/src/host/daemon/plugins/desktop/index.js +6 -1
  14. package/src/host/daemon/plugins/telegram/index.js +61 -351
  15. package/src/host/daemon/whisper-server.js +18 -8
  16. package/src/host/daemon/whisper-server.py +71 -44
  17. package/src/interfaces/cli/commands/desktop.js +13 -68
  18. package/src/interfaces/desktop/main.js +32 -4
  19. package/src/interfaces/desktop/renderer.js +26 -5
  20. package/src/interfaces/web/dist/assets/index-B0nTYflm.js +651 -0
  21. package/src/interfaces/web/dist/assets/index-B0nTYflm.js.map +1 -0
  22. package/src/interfaces/web/dist/assets/index-C22PmKCD.css +1 -0
  23. package/src/interfaces/web/dist/index.html +2 -2
  24. package/src/interfaces/web/package-lock.json +3 -3
  25. package/src/interfaces/web/src/components/ShortcutInput.tsx +156 -0
  26. package/src/interfaces/web/src/components/voice/VoiceSttCard.tsx +101 -5
  27. package/src/interfaces/web/src/i18n/en.ts +28 -2
  28. package/src/interfaces/web/src/i18n/es.ts +28 -2
  29. package/src/interfaces/web/src/lib/api/desktop.ts +28 -0
  30. package/src/interfaces/web/src/lib/api/voice.ts +26 -2
  31. package/src/interfaces/web/src/screens/modules/DeckScreen.tsx +55 -3
  32. package/src/interfaces/web/src/screens/modules/DesktopScreen.tsx +98 -36
  33. package/src/interfaces/web/dist/assets/index-BReF4_xV.js +0 -646
  34. package/src/interfaces/web/dist/assets/index-BReF4_xV.js.map +0 -1
  35. package/src/interfaces/web/dist/assets/index-wrEbTJbc.css +0 -1
@@ -0,0 +1,97 @@
1
+ // STT model catalog + on-disk status (downloaded? how big?).
2
+ //
3
+ // Both engines pull their weights from the HuggingFace hub cache
4
+ // (~/.cache/huggingface/hub/models--<org>--<name>), just in different formats:
5
+ // faster-whisper → Systran/faster-whisper-<model> (CTranslate2)
6
+ // mlx-whisper → mlx-community/whisper-<model> (MLX)
7
+ //
8
+ // We read the cache directory to report presence + real byte size, and carry an
9
+ // approximate download size for models that aren't there yet (Ollama-style).
10
+ import fs from "node:fs";
11
+ import os from "node:os";
12
+ import path from "node:path";
13
+
14
/**
 * Locate the HuggingFace hub cache directory the STT engines download into.
 *
 * Resolution order mirrors huggingface_hub's environment variables:
 *   1. HF_HUB_CACHE                    — explicit hub cache path, used as-is
 *   2. $HF_HOME/hub                    — HF_HOME relocates the whole HF folder
 *   3. $XDG_CACHE_HOME/huggingface/hub — XDG cache root, when set
 *   4. ~/.cache/huggingface/hub        — default
 *
 * @returns {string} Path of the hub cache directory (it may not exist yet).
 */
function hubDir() {
  const { HF_HUB_CACHE, HF_HOME, XDG_CACHE_HOME } = process.env;
  // An explicit hub cache wins over everything else; without this check a
  // relocated cache would make every model look "not downloaded".
  if (HF_HUB_CACHE) return HF_HUB_CACHE;
  const cacheRoot = XDG_CACHE_HOME || path.join(os.homedir(), ".cache");
  const hfHome = HF_HOME || path.join(cacheRoot, "huggingface");
  return path.join(hfHome, "hub");
}
18
+
19
/**
 * Translate a repo id into its hub cache folder name: every "/" becomes "--"
 * and the result is prefixed with "models--".
 * Example: "Systran/faster-whisper-small" → "models--Systran--faster-whisper-small".
 */
function repoCacheName(repoId) {
  const segments = repoId.split("/");
  return `models--${segments.join("--")}`;
}
23
+
24
/**
 * Sum the sizes of the regular files below `dir`, descending into subfolders.
 *
 * Deliberately not counted:
 *  - symlinks: HF stores real bytes once in blobs/ and symlinks them from
 *    snapshots/, so counting links would double-count;
 *  - "*.incomplete" files: huggingface_hub's temporary files for a download
 *    that is still running or was interrupted. Counting them would let a
 *    half-fetched model look fully present to size-based checks.
 *
 * Best effort: directories that cannot be read and files that cannot be
 * stat'ed contribute nothing instead of throwing.
 *
 * @param {string} dir Directory to measure (may not exist).
 * @returns {number} Total size in bytes; 0 for a missing or empty directory.
 */
function dirSizeBytes(dir) {
  let total = 0;
  const pending = [dir];
  while (pending.length > 0) {
    const current = pending.pop();
    let entries;
    try {
      entries = fs.readdirSync(current, { withFileTypes: true });
    } catch {
      continue; // missing or unreadable directory
    }
    for (const entry of entries) {
      if (entry.isSymbolicLink()) continue;
      const fullPath = path.join(current, entry.name);
      if (entry.isDirectory()) {
        pending.push(fullPath);
      } else if (entry.isFile() && !entry.name.endsWith(".incomplete")) {
        try {
          total += fs.lstatSync(fullPath).size;
        } catch {
          // file vanished between readdir and lstat — skip it
        }
      }
    }
  }
  return total;
}
42
+
43
/**
 * Format a byte count as a short human-readable string ("512 B", "1.5 KB",
 * "480 MB", "3.0 GB"). Units are powers of 1024 and stop at GB.
 *
 * The unit is picked from the ROUNDED value, so a count just under a unit
 * boundary is promoted ("1.0 MB") instead of printing "1024 KB". Likewise a
 * value that rounds up to 10 prints as "10", not "10.0".
 *
 * @param {number} bytes Size in bytes.
 * @returns {string} Formatted size, or "—" for missing, non-finite or <1 input.
 */
export function humanSize(bytes) {
  const value = Number(bytes);
  if (!Number.isFinite(value) || value < 1) return "—";
  const units = ["B", "KB", "MB", "GB"];
  let unit = 0;
  let scaled = value;
  while (unit < units.length - 1 && Math.round(scaled) >= 1024) {
    scaled /= 1024;
    unit++;
  }
  // Bytes, and anything that displays as >= 10, are shown without decimals.
  const wholeNumber = unit === 0 || Number(scaled.toFixed(1)) >= 10;
  return `${scaled.toFixed(wholeNumber ? 0 : 1)} ${units[unit]}`;
}
50
+
51
// Model catalog, keyed by backend. Each row is [id, HF repo id, approx_mb];
// the rows are expanded into { id, repo, approx_mb } objects. `approx_mb` is an
// approximate download size, used only until the model is actually on disk.
export const STT_MODEL_CATALOG = Object.fromEntries(
  Object.entries({
    faster: [
      ["tiny", "Systran/faster-whisper-tiny", 75],
      ["base", "Systran/faster-whisper-base", 145],
      ["small", "Systran/faster-whisper-small", 480],
      ["medium", "Systran/faster-whisper-medium", 1500],
      ["large-v3", "Systran/faster-whisper-large-v3", 3100],
      ["large-v3-turbo", "mobiuslabsgmbh/faster-whisper-large-v3-turbo", 1600],
    ],
    mlx: [
      ["small", "mlx-community/whisper-small-mlx", 480],
      ["large-v3", "mlx-community/whisper-large-v3-mlx", 3100],
      ["large-v3-turbo", "mlx-community/whisper-large-v3-turbo", 1600],
    ],
  }).map(([backend, rows]) => [
    backend,
    rows.map(([id, repo, approx_mb]) => ({ id, repo, approx_mb })),
  ]),
);
68
+
69
/**
 * Report whether a repo is present in the HF cache and how many bytes it uses.
 *
 * @param {string} repo HF repo id, e.g. "Systran/faster-whisper-small".
 * @returns {{downloaded: boolean, size_bytes: number}} `downloaded` is true
 *   only when the repo's cache folder holds more than 1,000,000 bytes, so a
 *   bare ref/lock dir with no blobs is not reported as downloaded.
 */
export function modelStatusByRepo(repo) {
  const cacheDir = path.join(hubDir(), repoCacheName(repo));
  if (fs.existsSync(cacheDir)) {
    const sizeBytes = dirSizeBytes(cacheDir);
    return { downloaded: sizeBytes > 1_000_000, size_bytes: sizeBytes };
  }
  return { downloaded: false, size_bytes: 0 };
}
77
+
78
/**
 * List a backend's catalog models together with their download status + sizes.
 *
 * `backend` is matched against the catalog's OWN keys only. It can arrive
 * from a request query string, and an inherited name such as "constructor" or
 * "__proto__" would otherwise resolve to an Object.prototype member and make
 * `.map` throw instead of yielding an empty list.
 *
 * @param {string} backend Catalog key ("faster" | "mlx").
 * @returns {Array<{id: string, repo: string, downloaded: boolean, size: string, size_bytes: number}>}
 *   One row per catalog model; empty for an unknown backend. `size` is the
 *   measured size once downloaded, otherwise the catalog estimate prefixed
 *   with "~".
 */
export function listSttModels(backend) {
  const catalog = Object.hasOwn(STT_MODEL_CATALOG, backend) ? STT_MODEL_CATALOG[backend] : [];
  return catalog.map((m) => {
    const st = modelStatusByRepo(m.repo);
    return {
      id: m.id,
      repo: m.repo,
      downloaded: st.downloaded,
      size: st.downloaded ? humanSize(st.size_bytes) : `~${humanSize(m.approx_mb * 1024 * 1024)}`,
      size_bytes: st.size_bytes,
    };
  });
}
92
+
93
/**
 * Resolve the HF repo id for a (backend, model-id) pair.
 *
 * Only the catalog's own keys are consulted, so a backend named after an
 * inherited Object member ("constructor", "__proto__", …) is treated as
 * unknown instead of throwing on `.find`.
 *
 * @param {string} backend Catalog key ("faster" | "mlx").
 * @param {string} modelId Catalog model id, or a raw repo id.
 * @returns {string} The catalog repo when the pair is known; otherwise
 *   `modelId` unchanged, which allows passing a raw repo id through.
 */
export function repoFor(backend, modelId) {
  const models = Object.hasOwn(STT_MODEL_CATALOG, backend) ? STT_MODEL_CATALOG[backend] : [];
  const entry = models.find((m) => m.id === modelId);
  return entry?.repo || modelId;
}
@@ -23,7 +23,11 @@ import { logInfo, logWarn } from "#core/logging.js";
23
23
  export const WHISPER_LOCAL_PORT = 18765;
24
24
 
25
25
  export const DEFAULT_LOCAL = {
26
- model: "small",
26
+ // "auto" = adapt to the machine (mlx/Metal on Apple Silicon, faster-whisper
27
+ // cuda on NVIDIA, else faster-whisper cpu). Override with "faster" | "mlx".
28
+ backend: "auto",
29
+ model: "small", // faster-whisper model id (tiny|base|small|…)
30
+ mlx_model: "", // mlx repo (defaults to the hardware recommendation)
27
31
  device: "cpu",
28
32
  compute_type: "int8",
29
33
  language: "auto",
@@ -34,6 +38,28 @@ export const DEFAULT_LOCAL = {
34
38
  timeout_ms: 20 * 60_000,
35
39
  };
36
40
 
41
/**
 * Defaults for OpenAI's official cloud Whisper. `base_url` is overridable so
 * the same client can target any OpenAI-compatible server (see `custom`).
 */
export const DEFAULT_OPENAI = {
  base_url: "https://api.openai.com/v1",
  model: "whisper-1",
  language: "auto",
};

/**
 * Defaults for a user-supplied, OpenAI-compatible STT server reachable over
 * the network: mlx-audio on this Mac's Metal GPU (localhost:8000), a
 * Radeon/NVIDIA box on the LAN, or anyone's remote endpoint. All expose
 * POST /audio/transcriptions, so they share the exact client as `openai` —
 * only base_url/key/model differ.
 */
export const DEFAULT_CUSTOM = {
  // e.g. http://localhost:8000/v1 or http://192.168.1.50:9000/v1
  base_url: "",
  // optional — most local servers don't require one
  api_key: "",
  // e.g. mlx-community/whisper-large-v3-turbo or Systran/faster-whisper-large-v3
  model: "",
  language: "auto",
};

/** STT engine ids surfaced to the web admin, in display/fallback order. */
export const STT_ENGINE_IDS = [
  "local",
  "openai",
  "custom",
];
+
37
63
  /**
38
64
  * Resolve the effective transcription language. Priority:
39
65
  * explicit local config → config.user.language → "auto" (whisper detects).
@@ -44,29 +70,110 @@ export function resolveTranscriptionLanguage(localCfg, userLang) {
44
70
  return "auto";
45
71
  }
46
72
 
73
/**
 * Work out which backend + model the local engine should use and write the
 * result onto `local` (the object is mutated; nothing is returned).
 *
 *  - backend "auto" takes whatever recommendStt() proposes; if the hardware
 *    module cannot be loaded or throws, faster-whisper "small" on cpu/int8 is
 *    assumed.
 *  - "mlx" is only kept when its model is reported as downloaded. Otherwise
 *    the faster-whisper path is used, so a live voice turn never stalls on a
 *    multi-GB download — the model-manager UI handles the explicit download.
 *  - On the faster-whisper path, when the recommendation is faster-whisper on
 *    cuda, a missing/"cpu" device becomes "cuda" and an "int8" compute type
 *    becomes the recommended one (default "float16").
 */
async function resolveLocalBackend(local) {
  let rec;
  try {
    const hardware = await import("#core/voice/stt-hardware.js");
    rec = hardware.recommendStt();
  } catch {
    rec = { backend: "faster", model: "small", device: "cpu", compute_type: "int8" };
  }

  const requested = local.backend || "auto";
  const chosen = requested === "auto" ? rec.backend : requested;

  if (chosen === "mlx") {
    const mlxModel = local.mlx_model || rec.model;
    let present = false;
    try {
      const models = await import("#core/voice/stt-models.js");
      present = models.modelStatusByRepo(mlxModel).downloaded;
    } catch {}
    if (present) {
      Object.assign(local, {
        backend: "mlx",
        model: mlxModel, // whisper-server.js passes this as --model
        device: "metal",
        compute_type: "mlx",
      });
      return;
    }
    // model not present → continue on the faster-whisper path below
  }

  const cudaRecommended = rec.backend === "faster" && rec.device === "cuda";
  if (cudaRecommended) {
    const deviceUnset = !local.device || local.device === "cpu";
    if (deviceUnset) local.device = "cuda";
    if (local.compute_type === "int8") {
      local.compute_type = rec.compute_type || "float16";
    }
  }
  local.backend = "faster";
}
117
+
47
118
  export async function getConfig() {
48
119
  try {
49
120
  const { readConfig } = await import("#core/config/index.js");
50
121
  const cfg = readConfig() || {};
51
122
  const t = cfg.transcription || {};
52
- const openaiKey = cfg.engines?.openai?.api_key || process.env.OPENAI_API_KEY || "";
53
123
  const userLang = cfg.user?.language || "";
124
+
54
125
  const localBase = { ...DEFAULT_LOCAL, ...(t.local || {}) };
55
126
  localBase.language = resolveTranscriptionLanguage(localBase, userLang);
127
+ await resolveLocalBackend(localBase);
128
+
129
+ // OpenAI cloud: key can live in transcription.openai, the shared
130
+ // engines.openai block, or the env. base_url defaults to the official API.
131
+ const openai = { ...DEFAULT_OPENAI, ...(t.openai || {}) };
132
+ openai.api_key = t.openai?.api_key || cfg.engines?.openai?.api_key || process.env.OPENAI_API_KEY || "";
133
+ openai.language = resolveTranscriptionLanguage(openai, userLang);
134
+
135
+ // Custom OpenAI-compatible server (mlx-audio / Radeon / NVIDIA / remote).
136
+ const custom = { ...DEFAULT_CUSTOM, ...(t.custom || {}) };
137
+ custom.language = resolveTranscriptionLanguage(custom, userLang);
138
+
56
139
  return {
57
140
  provider: t.provider || "auto",
58
141
  local: localBase,
59
- openaiKey,
142
+ openai,
143
+ custom,
144
+ // kept for backward-compat with callers that read `.openaiKey`
145
+ openaiKey: openai.api_key,
60
146
  };
61
147
  } catch {
62
148
  return {
63
149
  provider: "auto",
64
150
  local: { ...DEFAULT_LOCAL },
151
+ openai: { ...DEFAULT_OPENAI, api_key: process.env.OPENAI_API_KEY || "" },
152
+ custom: { ...DEFAULT_CUSTOM },
65
153
  openaiKey: process.env.OPENAI_API_KEY || "",
66
154
  };
67
155
  }
68
156
  }
69
157
 
158
/**
 * List STT engines + availability for the web admin (mirrors tts listProviders).
 *
 * @param {object|null} [rawConfig] Parsed config. `null`/`undefined` are
 *   treated as an empty config: the `= {}` default alone only covers
 *   `undefined`, and getConfig() guards the same readConfig() result with
 *   `|| {}`, so a null config is handled here too.
 * @returns {{configured_provider: string, engines: Array<{id: string, available: boolean, configured: boolean}>}}
 *   `configured_provider` is `transcription.provider` or "auto". "local" is
 *   always available; "openai" needs an API key (transcription.openai,
 *   engines.openai, or OPENAI_API_KEY); "custom" needs a non-blank
 *   `transcription.custom.base_url`.
 */
export function listSttProviders(rawConfig = {}) {
  const cfg = rawConfig ?? {};
  const t = cfg.transcription || {};
  const provider = t.provider || "auto";
  const openaiKey = t.openai?.api_key || cfg.engines?.openai?.api_key || process.env.OPENAI_API_KEY || "";
  // String() keeps a non-string base_url (e.g. a number from hand-edited JSON)
  // from crashing on .trim().
  const customUrl = String(t.custom?.base_url || "").trim();
  const hasOpenaiKey = Boolean(openaiKey);
  const hasCustomUrl = Boolean(customUrl);
  const engines = [
    // local whisper is embedded (daemon spawns the subprocess on demand) →
    // always usable, no credentials needed.
    { id: "local", available: true, configured: true },
    { id: "openai", available: hasOpenaiKey, configured: hasOpenaiKey },
    { id: "custom", available: hasCustomUrl, configured: hasCustomUrl },
  ];
  return { configured_provider: provider, engines };
}
176
+
70
177
  /**
71
178
  * Call the local whisper-server.py over HTTP. Does NOT spawn or check the
72
179
  * subprocess — that's host/daemon/whisper-server.js's job. If the server is
@@ -138,9 +245,14 @@ export async function transcribeViaLocalServer(filePath, opts) {
138
245
  throw lastErr || new Error("transcribeViaLocalServer: unknown failure");
139
246
  }
140
247
 
141
- /** OpenAI Whisper-1 cloud API. Needs an api_key. */
142
- export async function transcribeOpenAI(filePath, apiKey) {
143
- if (!apiKey) throw new Error("openai transcription: no api_key");
248
+ /**
249
+ * OpenAI-compatible transcription (POST {base_url}/audio/transcriptions with a
250
+ * multipart `file` + `model`). Works against OpenAI itself and any server that
251
+ * speaks the same contract: mlx-audio, faster-whisper-server, whisper.cpp
252
+ * server, etc. `backend` is just the label returned to the caller.
253
+ */
254
+ export async function transcribeViaOpenAICompatible(filePath, { base_url, api_key, model, language, backend = "openai", timeout_ms = 120_000 } = {}) {
255
+ const baseUrl = (base_url || DEFAULT_OPENAI.base_url).replace(/\/+$/, "");
144
256
  const buf = fs.readFileSync(filePath);
145
257
  const ext = path.extname(filePath).slice(1).toLowerCase() || "webm";
146
258
  const fileType = ext === "ogg" || ext === "oga" ? "audio/ogg"
@@ -151,28 +263,40 @@ export async function transcribeOpenAI(filePath, apiKey) {
151
263
  : "application/octet-stream";
152
264
 
153
265
  const form = new FormData();
154
- form.append("model", "whisper-1");
266
+ form.append("model", model || DEFAULT_OPENAI.model);
267
+ if (language && language !== "auto") form.append("language", language);
155
268
  form.append("file", new Blob([buf], { type: fileType }), path.basename(filePath));
156
269
 
157
- const res = await fetch("https://api.openai.com/v1/audio/transcriptions", {
270
+ const t0 = Date.now();
271
+ const res = await fetch(`${baseUrl}/audio/transcriptions`, {
158
272
  method: "POST",
159
- headers: { authorization: `Bearer ${apiKey}` },
273
+ // Auth header only when a key is set — local servers usually need none.
274
+ headers: api_key ? { authorization: `Bearer ${api_key}` } : {},
160
275
  body: form,
161
- signal: AbortSignal.timeout(60_000),
276
+ signal: AbortSignal.timeout(timeout_ms),
162
277
  });
163
278
  if (!res.ok) {
164
279
  const errBody = await res.text().catch(() => "");
165
- throw new Error(`openai whisper ${res.status}: ${errBody.slice(0, 240)}`);
280
+ throw new Error(`${backend} stt ${res.status}: ${errBody.slice(0, 240)}`);
166
281
  }
167
282
  const json = await res.json();
283
+ logInfo("whisper", `transcribeViaOpenAICompatible(${backend}) ok in ${Date.now() - t0}ms`, {
284
+ chars: (json.text || "").length, base_url: baseUrl, model: model || DEFAULT_OPENAI.model,
285
+ });
168
286
  return {
169
287
  ok: true,
170
- backend: "openai",
288
+ backend,
171
289
  text: json.text || "",
172
290
  language: json.language || null,
173
291
  };
174
292
  }
175
293
 
294
/**
 * Back-compat entry point for callers that only hold an OpenAI API key:
 * forwards to transcribeViaOpenAICompatible with the DEFAULT_OPENAI settings
 * and the "openai" backend label. Rejects when no key is supplied.
 */
export async function transcribeOpenAI(filePath, apiKey) {
  if (apiKey) {
    const options = { ...DEFAULT_OPENAI, api_key: apiKey, backend: "openai" };
    return transcribeViaOpenAICompatible(filePath, options);
  }
  throw new Error("openai transcription: no api_key");
}
299
+
176
300
  /**
177
301
  * Transcribe a file. Provider chosen by config:
178
302
  * - "openai": cloud only
@@ -188,17 +312,24 @@ export async function transcribe(filePath, overrides = {}) {
188
312
  const localOpts = { ...cfg.local, ...overrides };
189
313
 
190
314
  if (provider === "openai") {
191
- return transcribeOpenAI(filePath, cfg.openaiKey);
315
+ return transcribeViaOpenAICompatible(filePath, { ...cfg.openai, backend: "openai" });
316
+ }
317
+ if (provider === "custom") {
318
+ if (!cfg.custom.base_url) throw new Error("custom transcription: set transcription.custom.base_url");
319
+ return transcribeViaOpenAICompatible(filePath, { ...cfg.custom, backend: "custom" });
192
320
  }
193
321
  if (provider === "local") {
194
322
  return transcribeViaLocalServer(filePath, localOpts);
195
323
  }
196
- // auto: local first, fall back to openai if a key is configured
324
+ // auto: local first, then a configured remote (custom preferred over openai).
197
325
  try {
198
326
  return await transcribeViaLocalServer(filePath, localOpts);
199
327
  } catch (localErr) {
200
- if (cfg.openaiKey) {
201
- return transcribeOpenAI(filePath, cfg.openaiKey);
328
+ if (cfg.custom.base_url) {
329
+ return transcribeViaOpenAICompatible(filePath, { ...cfg.custom, backend: "custom" });
330
+ }
331
+ if (cfg.openai.api_key) {
332
+ return transcribeViaOpenAICompatible(filePath, { ...cfg.openai, backend: "openai" });
202
333
  }
203
334
  throw new Error(`local transcription failed: ${localErr.message}`);
204
335
  }
@@ -1,6 +1,9 @@
1
1
  // Desktop (floating voice window) HTTP surface.
2
2
  //
3
- // GET /desktop/status connected websocket clients count
3
+ // GET /desktop/status running flag (pid) + connected websocket clients
4
+ // POST /desktop/start launch the floating window (detached Electron)
5
+ // POST /desktop/stop terminate the running window (SIGTERM)
6
+ // POST /desktop/restart broadcast a "reload" so live windows re-read config
4
7
  // POST /desktop/message text (post-STT). Responds 200 immediately;
5
8
  // the super-agent answer is streamed back over WS
6
9
  // by the desktop plugin.
@@ -16,18 +19,61 @@ import {
16
19
  autostartInstall,
17
20
  autostartUninstall,
18
21
  } from "#core/desktop/autostart.js";
22
+ import {
23
+ isDesktopRunning,
24
+ startDesktopDetached,
25
+ stopDesktop,
26
+ } from "#core/desktop/process.js";
19
27
 
20
- export function register(app, { plugins }) {
28
+ export function register(app, { plugins, config }) {
21
29
  app.get("/desktop/status", (_req, res) => {
30
+ // `running` is the live Electron process (pid file) — the source of truth
31
+ // for the Start/Stop/Restart controls. `connected_clients` is how many of
32
+ // those windows have an open WS to the daemon (a window can be running but
33
+ // mid-reconnect), surfaced separately.
34
+ const running = isDesktopRunning();
22
35
  import("../desktop-ws.js")
23
36
  .then(({ desktopClients }) => {
24
- res.json({
25
- ok: true,
26
- connected_clients: desktopClients.size,
27
- running: desktopClients.size > 0,
28
- });
37
+ res.json({ ok: true, connected_clients: desktopClients.size, running });
38
+ })
39
+ .catch(() => res.json({ ok: true, connected_clients: 0, running }));
40
+ });
41
+
42
+ // POST /desktop/start — launch the floating window (detached Electron). Same
43
+ // helper the CLI's `apx desktop start` uses. No-op-safe if already running.
44
+ app.post("/desktop/start", async (_req, res) => {
45
+ try {
46
+ const r = await startDesktopDetached({ port: config?.port });
47
+ if (!r.ok) return res.status(500).json({ ok: false, error: r.error });
48
+ res.json({ ok: true, pid: r.pid, already: !!r.already });
49
+ } catch (e) {
50
+ res.status(500).json({ ok: false, error: e.message });
51
+ }
52
+ });
53
+
54
+ // POST /desktop/stop — terminate the running window (SIGTERM). `stopped` is
55
+ // false when nothing was running.
56
+ app.post("/desktop/stop", (_req, res) => {
57
+ const r = stopDesktop();
58
+ if (!r.ok) return res.status(500).json({ ok: false, error: r.error });
59
+ res.json({ ok: true, stopped: r.stopped });
60
+ });
61
+
62
+ // POST /desktop/restart — ask every connected desktop window to reload.
63
+ // The web admin's "Restart" button hits this after a config change (theme,
64
+ // position) so the floating window re-reads ~/.apx/config.json and re-applies
65
+ // it without the user dropping to a terminal. The reload is a soft refresh of
66
+ // the renderer (main.js repositions + reloads webContents), NOT a process
67
+ // kill — the Electron app keeps its tray/shortcut. Returns how many windows
68
+ // were signalled so the UI can tell "reloaded" from "nothing connected".
69
+ app.post("/desktop/restart", (_req, res) => {
70
+ import("../desktop-ws.js")
71
+ .then(({ desktopClients, broadcastDesktop }) => {
72
+ const reloaded = desktopClients.size;
73
+ broadcastDesktop({ type: "reload" });
74
+ res.json({ ok: true, reloaded });
29
75
  })
30
- .catch(() => res.json({ ok: true, connected_clients: 0, running: false }));
76
+ .catch((e) => res.status(500).json({ ok: false, error: e.message }));
31
77
  });
32
78
 
33
79
  app.post("/desktop/message", async (req, res) => {
@@ -2,10 +2,49 @@
2
2
  // Raw audio bytes in the body. Headers:
3
3
  // X-Audio-Format webm | ogg | wav | mp3 (defaults to webm)
4
4
  // X-Language ISO code or "auto"
5
- // X-Provider auto | local | openai (overrides config)
5
+ // X-Provider auto | local | openai | custom (overrides config)
6
6
  //
7
7
  // Shared by overlay, telegram voice messages, and any external caller.
8
8
  export function register(app) {
9
+ // GET /transcribe/providers — STT engine list + availability for the web
10
+ // admin (mirror of /tts/providers). local = embedded faster-whisper;
11
+ // openai = cloud Whisper; custom = any OpenAI-compatible server (mlx-audio
12
+ // on Metal, a Radeon/NVIDIA box on the LAN, a remote endpoint).
13
+ app.get("/transcribe/providers", async (_req, res) => {
14
+ try {
15
+ const { readConfig } = await import("#core/config/index.js");
16
+ const { listSttProviders } = await import("#core/voice/transcription.js");
17
+ res.json(listSttProviders(readConfig()));
18
+ } catch (e) {
19
+ res.status(500).json({ error: e.message });
20
+ }
21
+ });
22
+
23
+ // GET /transcribe/hardware — detected machine + the recommended local backend
24
+ // (mlx on Apple Silicon, faster-whisper cuda on NVIDIA, else CPU). Drives the
25
+ // "engine adapts itself" UX in the web admin.
26
+ app.get("/transcribe/hardware", async (_req, res) => {
27
+ try {
28
+ const { detectHardware, recommendStt } = await import("#core/voice/stt-hardware.js");
29
+ const hw = detectHardware();
30
+ res.json({ hardware: hw, recommended: recommendStt(hw) });
31
+ } catch (e) {
32
+ res.status(500).json({ error: e.message });
33
+ }
34
+ });
35
+
36
+ // GET /transcribe/models?backend=faster|mlx — model catalog with on-disk
37
+ // status (downloaded? size) for the model-manager UI.
38
+ app.get("/transcribe/models", async (req, res) => {
39
+ try {
40
+ const backend = String(req.query.backend || "faster");
41
+ const { listSttModels } = await import("#core/voice/stt-models.js");
42
+ res.json({ backend, models: listSttModels(backend) });
43
+ } catch (e) {
44
+ res.status(500).json({ error: e.message });
45
+ }
46
+ });
47
+
9
48
  // GET /transcribe/warmup — load the local whisper model (if needed) and reset
10
49
  // its idle watchdog. Callers (e.g. the desktop window) ping this while open so
11
50
  // the first real utterance doesn't pay the cold-load cost.
@@ -140,7 +140,12 @@ async function _handleMessage({ ws, text, previousMessages }, { projects, config
140
140
  channel: CHANNELS.DESKTOP,
141
141
  ...(slashed.handled ? { contextNote: slashed.contextNote } : {}),
142
142
  channelMeta: { voice: true }, // desktop module is voice-first → spoken mode
143
- previousMessages: history.slice(0, -1),
143
+ // WS path: history was just appended with the current user turn (line 87),
144
+ // so drop it. HTTP path: `previousMessages` came in already excluding the
145
+ // current user turn (the renderer slices it off before POSTing), so
146
+ // dropping again would silently strip the last assistant reply — making
147
+ // every turn look like a fresh conversation to the model.
148
+ previousMessages: ws ? history.slice(0, -1) : history,
144
149
  overrideModel: cfg.model || null,
145
150
  signal: controller.signal,
146
151
  onToken: (chunk) => { liveBuf += chunk; },