@agentprojectcontext/apx 1.42.2 → 1.43.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/core/config/index.js +2 -0
- package/src/core/config/redact.js +2 -0
- package/src/core/desktop/process.js +126 -0
- package/src/core/voice/stt-hardware.js +87 -0
- package/src/core/voice/stt-models.js +97 -0
- package/src/core/voice/transcription.js +147 -16
- package/src/host/daemon/api/desktop.js +54 -8
- package/src/host/daemon/api/transcribe.js +40 -1
- package/src/host/daemon/whisper-server.js +18 -8
- package/src/host/daemon/whisper-server.py +71 -44
- package/src/interfaces/cli/commands/desktop.js +13 -68
- package/src/interfaces/desktop/main.js +32 -4
- package/src/interfaces/desktop/renderer.js +26 -5
- package/src/interfaces/web/dist/assets/index-B0nTYflm.js +651 -0
- package/src/interfaces/web/dist/assets/index-B0nTYflm.js.map +1 -0
- package/src/interfaces/web/dist/assets/index-C22PmKCD.css +1 -0
- package/src/interfaces/web/dist/index.html +2 -2
- package/src/interfaces/web/src/components/ShortcutInput.tsx +156 -0
- package/src/interfaces/web/src/components/voice/VoiceSttCard.tsx +101 -5
- package/src/interfaces/web/src/i18n/en.ts +28 -2
- package/src/interfaces/web/src/i18n/es.ts +28 -2
- package/src/interfaces/web/src/lib/api/desktop.ts +28 -0
- package/src/interfaces/web/src/lib/api/voice.ts +26 -2
- package/src/interfaces/web/src/screens/modules/DeckScreen.tsx +55 -3
- package/src/interfaces/web/src/screens/modules/DesktopScreen.tsx +98 -36
- package/src/interfaces/web/dist/assets/index-BReF4_xV.js +0 -646
- package/src/interfaces/web/dist/assets/index-BReF4_xV.js.map +0 -1
- package/src/interfaces/web/dist/assets/index-wrEbTJbc.css +0 -1
package/package.json
CHANGED
package/src/core/config/index.js
CHANGED
|
@@ -190,6 +190,8 @@ const CREDENTIAL_PATHS = [
|
|
|
190
190
|
["voice", "tts", "elevenlabs", "api_key"],
|
|
191
191
|
["voice", "tts", "openai", "api_key"],
|
|
192
192
|
["voice", "tts", "gemini", "api_key"],
|
|
193
|
+
["transcription", "openai", "api_key"],
|
|
194
|
+
["transcription", "custom", "api_key"],
|
|
193
195
|
["memory", "embeddings", "openai", "api_key"],
|
|
194
196
|
["memory", "embeddings", "gemini", "api_key"],
|
|
195
197
|
["telegram", "channels"], // entire array — losing it is also a regression
|
|
@@ -18,6 +18,8 @@ export const SECRET_PATHS = [
|
|
|
18
18
|
"voice.tts.elevenlabs.api_key",
|
|
19
19
|
"voice.tts.openai.api_key",
|
|
20
20
|
"voice.tts.gemini.api_key",
|
|
21
|
+
"transcription.openai.api_key",
|
|
22
|
+
"transcription.custom.api_key",
|
|
21
23
|
"memory.embeddings.openai.api_key",
|
|
22
24
|
"memory.embeddings.gemini.api_key",
|
|
23
25
|
// Telegram bot tokens live inside an array — handled separately in redact()
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
// Desktop (Electron floating window) process control — shared by the CLI
|
|
2
|
+
// (`apx desktop start/stop/restart`) and the daemon's /desktop/{start,stop}
|
|
3
|
+
// HTTP endpoints, so both spawn/kill the window the exact same way.
|
|
4
|
+
//
|
|
5
|
+
// The window is a detached Electron process (it must survive the spawner so a
|
|
6
|
+
// LaunchAgent / a short-lived CLI invocation doesn't take it down). State is
|
|
7
|
+
// tracked via ~/.apx/desktop.pid.
|
|
8
|
+
|
|
9
|
+
"use strict";
|
|
10
|
+
import fs from "node:fs";
|
|
11
|
+
import os from "node:os";
|
|
12
|
+
import path from "node:path";
|
|
13
|
+
import { spawn, execFileSync } from "node:child_process";
|
|
14
|
+
import { fileURLToPath } from "node:url";
|
|
15
|
+
|
|
16
|
+
const __filename = fileURLToPath(import.meta.url);
|
|
17
|
+
const __dirname = path.dirname(__filename);
|
|
18
|
+
|
|
19
|
+
// src/core/desktop/ → repo root is three levels up.
|
|
20
|
+
const ROOT = path.resolve(__dirname, "..", "..", "..");
|
|
21
|
+
export const DESKTOP_MAIN = path.resolve(__dirname, "..", "..", "interfaces", "desktop", "main.js");
|
|
22
|
+
export const DESKTOP_PID = path.join(os.homedir(), ".apx", "desktop.pid");
|
|
23
|
+
const DESKTOP_LOG = path.join(os.homedir(), ".apx", "desktop.log");
|
|
24
|
+
|
|
25
|
+
// ── PID file ────────────────────────────────────────────────────────────────
|
|
26
|
+
/**
 * Read the tracked desktop-window PID from the pid file (DESKTOP_PID).
 *
 * @returns {number|null} A positive integer PID, or null when the file is
 *   missing/unreadable or does not contain a valid PID.
 */
export function readPid() {
  let raw;
  try {
    raw = fs.readFileSync(DESKTOP_PID, "utf8");
  } catch {
    return null; // no pid file (or unreadable) → nothing is tracked
  }
  const pid = Number.parseInt(raw.trim(), 10);
  // An empty/corrupt file parses to NaN, and 0 / negative values would make
  // process.kill() target a process group instead of a single process.
  // Report all of those as "no pid" rather than leaking them to callers.
  return Number.isInteger(pid) && pid > 0 ? pid : null;
}
|
|
29
|
+
/**
 * Persist `pid` to the pid file, creating its parent directory when needed.
 *
 * @param {number} pid - Process id to record (stored as its string form).
 */
export function writePid(pid) {
  const pidDir = path.dirname(DESKTOP_PID);
  fs.mkdirSync(pidDir, { recursive: true });
  fs.writeFileSync(DESKTOP_PID, String(pid));
}
|
|
33
|
+
/** Remove the pid file. Best-effort: any unlink failure is ignored. */
export function clearPid() {
  try {
    fs.unlinkSync(DESKTOP_PID);
  } catch {
    // Deliberately ignored — a missing pid file is the desired end state.
  }
}
|
|
34
|
+
/**
 * Check whether a process with the given PID currently exists.
 *
 * Uses signal 0, which performs the existence/permission check without
 * delivering a signal.
 *
 * @param {number|null|undefined} pid - PID to probe; falsy values are "not alive".
 * @returns {boolean} true when the process exists.
 */
export function pidAlive(pid) {
  if (!pid) return false;
  try {
    process.kill(pid, 0);
    return true;
  } catch (err) {
    // EPERM means the process exists but we lack permission to signal it
    // (e.g. it is owned by another user) — that is still "alive".
    // Anything else (ESRCH, invalid argument) means there is no such process.
    return err?.code === "EPERM";
  }
}
|
|
38
|
+
/**
 * Whether the PID recorded in the pid file belongs to a live process.
 *
 * @returns {boolean}
 */
export function isDesktopRunning() {
  const trackedPid = readPid();
  return pidAlive(trackedPid);
}
|
|
39
|
+
|
|
40
|
+
// ── Electron resolution ───────────────────────────────────────────────────
|
|
41
|
+
// Validate a candidate actually runs (a pnpm shim can exist as a file while its
|
|
42
|
+
// underlying package was never built — `--version` smokes that out).
|
|
43
|
+
/**
 * Smoke-test a candidate command by running it synchronously.
 *
 * @param {string} cmd - Executable to run.
 * @param {string[]} argv - Arguments for the probe (callers pass `--version`).
 * @returns {boolean} true when the command ran and exited successfully within
 *   5 seconds; false on any failure.
 */
function electronRuns(cmd, argv) {
  const probeOptions = { stdio: "ignore", timeout: 5000 };
  try {
    execFileSync(cmd, argv, probeOptions);
  } catch {
    return false;
  }
  return true;
}
|
|
46
|
+
|
|
47
|
+
// Returns a descriptor for buildElectronSpawn(): an absolute electron binary
|
|
48
|
+
// path, electron's cli.js (".js" → run via node), a global bin, or "npx" as a
|
|
49
|
+
// last resort. Never returns null.
|
|
50
|
+
/**
 * Locate a runnable Electron, trying in order:
 *   1. ROOT/node_modules/.bin/electron
 *   2. ROOT/node_modules/electron/cli.js (probed through the current node binary)
 *   3. whatever `which electron` prints
 * Each candidate must pass a `--version` probe. When none does, the literal
 * string "npx" is returned.
 *
 * @returns {string} A path to a binary, a path ending in ".js", or "npx".
 */
export function findElectron() {
  const versionProbe = ["--version"];
  const modulesDir = path.join(ROOT, "node_modules");

  const localBin = path.join(modulesDir, ".bin", "electron");
  if (fs.existsSync(localBin) && electronRuns(localBin, versionProbe)) {
    return localBin;
  }

  const cliScript = path.join(modulesDir, "electron", "cli.js");
  if (fs.existsSync(cliScript) && electronRuns(process.execPath, [cliScript, ...versionProbe])) {
    return cliScript;
  }

  try {
    const whichOutput = execFileSync("which", ["electron"], { stdio: ["ignore", "pipe", "ignore"] });
    const globalBin = whichOutput.toString().trim();
    if (globalBin && electronRuns(globalBin, versionProbe)) {
      return globalBin;
    }
  } catch {
    // `which` failed or found nothing — fall through to npx.
  }

  return "npx";
}
|
|
64
|
+
|
|
65
|
+
// Turn a findElectron() descriptor + the app entry into a { cmd, argv } pair.
|
|
66
|
+
/**
 * Translate a findElectron() descriptor into the command line to spawn.
 *
 * @param {string} descriptor - "npx", a path ending in ".js", or a binary path.
 * @param {string} mainPath - Electron app entry file.
 * @param {string} port - Value passed after `--port`.
 * @returns {{cmd: string, argv: string[]}}
 */
export function buildElectronSpawn(descriptor, mainPath, port) {
  const appArgs = [mainPath, "--port", port];

  // Last-resort descriptor: let npx fetch and run electron.
  if (descriptor === "npx") {
    return { cmd: "npx", argv: ["-y", "electron", ...appArgs] };
  }
  // A .js descriptor is a script, so it is run through the current node binary.
  const isScript = descriptor.endsWith(".js");
  return isScript
    ? { cmd: process.execPath, argv: [descriptor, ...appArgs] }
    : { cmd: descriptor, argv: appArgs };
}
|
|
75
|
+
|
|
76
|
+
// ── Lifecycle ───────────────────────────────────────────────────────────────
|
|
77
|
+
// Spawn the window detached (survives the spawner). No console output — callers
|
|
78
|
+
// format their own UX. Returns { ok, pid, already? } | { ok:false, error }.
|
|
79
|
+
// detached:true gives the child its own session so a LaunchAgent / short-lived
|
|
80
|
+
// CLI doesn't drag it down on exit; we unref() after a 1.5s fail-fast window.
|
|
81
|
+
/**
 * Launch the desktop window as a detached process and record its PID.
 *
 * Produces no console output; callers format their own UX.
 *
 * @param {{port?: string|number}} [options] - `port` is forwarded to the app
 *   as `--port`; defaults to APX_PORT or "7430".
 * @returns {Promise<{ok: true, pid: number|undefined, already?: true} | {ok: false, error: string}>}
 *   `already` is set when a tracked window is still alive and nothing was spawned.
 */
export async function startDesktopDetached({ port = process.env.APX_PORT || "7430" } = {}) {
  if (isDesktopRunning()) return { ok: true, pid: readPid(), already: true };
  clearPid(); // whatever the pid file held is not alive — drop it
  if (!fs.existsSync(DESKTOP_MAIN)) return { ok: false, error: `desktop app not found at ${DESKTOP_MAIN}` };

  const { cmd, argv } = buildElectronSpawn(findElectron(), DESKTOP_MAIN, String(port));

  // Child stdout/stderr are appended to the log file; if it cannot be opened
  // the output is discarded instead of failing the launch.
  let logFd;
  try { logFd = fs.openSync(DESKTOP_LOG, "a"); } catch { logFd = "ignore"; }

  let child;
  try {
    child = spawn(cmd, argv, {
      detached: true,
      stdio: ["ignore", logFd, logFd],
      env: { ...process.env, ELECTRON_ENABLE_LOGGING: "1" },
    });
  } catch (e) {
    return { ok: false, error: e.message };
  } finally {
    // The child holds its own copy of the descriptor; ours is no longer needed.
    if (typeof logFd === "number") { try { fs.closeSync(logFd); } catch {} }
  }

  // Fail-fast window: the launch counts as successful if the child is still
  // running after 1.5s (or exited cleanly before that).
  const res = await new Promise((resolve) => {
    let settled = false;
    let timer;
    const settle = (outcome) => {
      if (settled) return;
      settled = true;
      clearTimeout(timer); // don't keep the event loop alive after an early result
      resolve(outcome);
    };

    // spawn() reports failures such as ENOENT/EACCES through an asynchronous
    // 'error' event rather than by throwing. Without a listener that event is
    // rethrown as an uncaught exception and takes the host process down.
    child.on("error", (err) => settle({ ok: false, error: err.message }));

    child.on("exit", (code, signal) => {
      if (code === 0) return settle({ ok: true });
      const error = code === null && signal
        ? `desktop terminated by signal ${signal}`
        : `desktop exited with code ${code}`;
      settle({ ok: false, error });
    });

    timer = setTimeout(() => {
      // Still running: stop tracking the child so the spawner can exit.
      if (!settled) child.unref();
      settle({ ok: true });
    }, 1500);
  });
  if (!res.ok) return { ok: false, error: res.error };

  if (child.pid) writePid(child.pid);
  return { ok: true, pid: child.pid };
}
|
|
113
|
+
|
|
114
|
+
// Stop the running window (SIGTERM). Returns { ok, stopped, pid? } — stopped is
|
|
115
|
+
// false when nothing was running.
|
|
116
|
+
/**
 * Send SIGTERM to the tracked desktop window and clear the pid file.
 *
 * @returns {{ok: true, stopped: boolean, pid?: number} | {ok: false, error: string}}
 *   `stopped` is false when no live process was tracked; `ok: false` carries
 *   the message of the error thrown while signalling.
 */
export function stopDesktop() {
  const pid = readPid();

  if (pidAlive(pid)) {
    try {
      process.kill(pid, "SIGTERM");
    } catch (e) {
      return { ok: false, error: e.message };
    }
    clearPid();
    return { ok: true, stopped: true, pid };
  }

  // Nothing alive: make sure no stale pid file is left behind.
  clearPid();
  return { ok: true, stopped: false };
}
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
// Hardware probe + STT engine recommendation.
|
|
2
|
+
//
|
|
3
|
+
// The transcription backend should adapt to the machine instead of making the
|
|
4
|
+
// user understand CTranslate2 vs MLX vs whisper.cpp:
|
|
5
|
+
//
|
|
6
|
+
// Apple Silicon (Metal) → mlx-whisper, large-v3-turbo (GPU accelerated via Metal)
|
|
7
|
+
// NVIDIA (CUDA) → faster-whisper cuda, large-v3 (GPU accelerated)
|
|
8
|
+
// AMD / Radeon → faster-whisper cpu (limited) (no ROCm in CT2)
|
|
9
|
+
// CPU only → faster-whisper cpu, small (safe + light)
|
|
10
|
+
//
|
|
11
|
+
// Detection is dependency-free and best-effort: short-timeout probes of
|
|
12
|
+
// nvidia-smi / rocminfo, plus os.platform()/os.arch(). Anything uncertain
|
|
13
|
+
// degrades to the CPU recommendation.
|
|
14
|
+
import os from "node:os";
|
|
15
|
+
import { spawnSync } from "node:child_process";
|
|
16
|
+
|
|
17
|
+
/**
 * Run a command synchronously (1.5s timeout, output discarded) and report
 * whether it exited with status 0.
 *
 * @param {string} cmd - Executable name or path.
 * @param {string[]} [args] - Arguments.
 * @returns {boolean} true only for a zero exit status; any other outcome is false.
 */
function cmdOk(cmd, args = []) {
  let result;
  try {
    result = spawnSync(cmd, args, { timeout: 1500, stdio: "ignore" });
  } catch {
    return false;
  }
  return result.status === 0;
}
|
|
25
|
+
|
|
26
|
+
/**
|
|
27
|
+
* Probe the machine. Returns a stable shape the UI + recommender consume.
|
|
28
|
+
* @returns {{platform:string, arch:string, appleSilicon:boolean, gpu:"metal"|"cuda"|"rocm"|"none", gpuName?:string}}
|
|
29
|
+
*/
|
|
30
|
+
/**
 * Probe the machine for an accelerator.
 *
 * darwin + arm64 is reported as "metal" with the CPU model as `gpuName`.
 * Otherwise a successful `nvidia-smi -L` means "cuda", a successful `rocminfo`
 * on Linux means "rocm", and anything else is "none".
 *
 * @returns {{platform: string, arch: string, appleSilicon: boolean, gpu: "metal"|"cuda"|"rocm"|"none", gpuName?: string}}
 */
export function detectHardware() {
  const platform = os.platform();
  const arch = os.arch();
  const base = { platform, arch };

  if (platform === "darwin" && arch === "arm64") {
    return { ...base, appleSilicon: true, gpu: "metal", gpuName: cpuBrand() };
  }

  let gpu = "none";
  if (cmdOk("nvidia-smi", ["-L"])) {
    gpu = "cuda";
  } else if (platform === "linux" && cmdOk("rocminfo")) {
    gpu = "rocm";
  }
  return { ...base, appleSilicon: false, gpu };
}
|
|
48
|
+
|
|
49
|
+
/**
 * Model string of the first CPU reported by the OS.
 *
 * @returns {string|undefined} Trimmed model name, or undefined when it is
 *   empty, unavailable, or reading it throws.
 */
function cpuBrand() {
  try {
    const firstModel = os.cpus()?.[0]?.model || "";
    const trimmed = firstModel.trim();
    return trimmed || undefined;
  } catch {
    return undefined;
  }
}
|
|
52
|
+
|
|
53
|
+
// Recommended STT backend + model per hardware tier. `backend` maps to a local
|
|
54
|
+
// engine implementation; `model` is the repo id in that engine's format.
|
|
55
|
+
/**
 * Map a hardware probe to the recommended STT backend and model.
 *
 * @param {{gpu: string}} [hw] - Result of detectHardware(); probed when omitted.
 * @returns {{backend: string, device: string, model: string, reason: string, tier: string, compute_type?: string, limited?: boolean}}
 *   "metal" → mlx; "cuda" → faster-whisper on CUDA; "rocm" and everything
 *   else → faster-whisper on CPU (rocm additionally flagged `limited`).
 */
export function recommendStt(hw = detectHardware()) {
  // Shared by both CPU-bound recommendations.
  const cpuEngine = { backend: "faster", device: "cpu", compute_type: "int8", model: "small" };

  switch (hw.gpu) {
    case "metal":
      return {
        backend: "mlx",
        device: "metal",
        model: "mlx-community/whisper-large-v3-turbo",
        reason: "Apple Silicon: MLX corre en la GPU/Neural Engine (Metal).",
        tier: "gpu",
      };
    case "cuda":
      return {
        backend: "faster",
        device: "cuda",
        compute_type: "float16",
        model: "large-v3",
        reason: "GPU NVIDIA: faster-whisper en CUDA soporta modelos grandes rápido.",
        tier: "gpu",
      };
    case "rocm":
      return {
        ...cpuEngine,
        reason: "Radeon/ROCm no está soportado por CTranslate2 — se usa CPU. (whisper.cpp Vulkan es una mejora futura.)",
        tier: "cpu",
        limited: true,
      };
    default:
      return {
        ...cpuEngine,
        reason: "Sin GPU acelerada: faster-whisper en CPU con un modelo liviano.",
        tier: "cpu",
      };
  }
}
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
// STT model catalog + on-disk status (downloaded? how big?).
|
|
2
|
+
//
|
|
3
|
+
// Both engines pull their weights from the HuggingFace hub cache
|
|
4
|
+
// (~/.cache/huggingface/hub/models--<org>--<name>), just in different formats:
|
|
5
|
+
// faster-whisper → Systran/faster-whisper-<model> (CTranslate2)
|
|
6
|
+
// mlx-whisper → mlx-community/whisper-<model> (MLX)
|
|
7
|
+
//
|
|
8
|
+
// We read the cache directory to report presence + real byte size, and carry an
|
|
9
|
+
// approximate download size for models that aren't there yet (Ollama-style).
|
|
10
|
+
import fs from "node:fs";
|
|
11
|
+
import os from "node:os";
|
|
12
|
+
import path from "node:path";
|
|
13
|
+
|
|
14
|
+
/**
 * Resolve the HuggingFace hub cache directory the same way the Python
 * `huggingface_hub` library does, so the status we report matches where the
 * engines actually store their weights:
 *
 *   1. HF_HUB_CACHE (or the legacy HUGGINGFACE_HUB_CACHE) — used verbatim
 *   2. $HF_HOME/hub
 *   3. $XDG_CACHE_HOME/huggingface/hub
 *   4. ~/.cache/huggingface/hub
 *
 * @returns {string} Path of the hub cache directory (not checked for existence).
 */
function hubDir() {
  const explicitCache = process.env.HF_HUB_CACHE || process.env.HUGGINGFACE_HUB_CACHE;
  if (explicitCache) return explicitCache;

  const cacheRoot = process.env.XDG_CACHE_HOME || path.join(os.homedir(), ".cache");
  const hfHome = process.env.HF_HOME || path.join(cacheRoot, "huggingface");
  return path.join(hfHome, "hub");
}
|
|
18
|
+
|
|
19
|
+
/** HF cache folder name for a repo id, e.g. "Systran/faster-whisper-small". */
|
|
20
|
+
/**
 * Build the hub-cache folder name for a repo id: every "/" becomes "--" and
 * the result is prefixed with "models--".
 *
 * @param {string} repoId - e.g. "Systran/faster-whisper-small".
 * @returns {string} e.g. "models--Systran--faster-whisper-small".
 */
function repoCacheName(repoId) {
  const flattened = repoId.split("/").join("--");
  return `models--${flattened}`;
}
|
|
23
|
+
|
|
24
|
+
/**
 * Total size in bytes of the regular files under `dir`, descending into
 * sub-directories. Symbolic links are skipped so a file that is both stored
 * and linked is counted once. Unreadable directories and files contribute 0.
 *
 * @param {string} dir - Directory to measure.
 * @returns {number} Sum of file sizes; 0 when `dir` cannot be listed.
 */
function dirSizeBytes(dir) {
  let entries;
  try {
    entries = fs.readdirSync(dir, { withFileTypes: true });
  } catch {
    return 0;
  }

  return entries.reduce((sum, entry) => {
    if (entry.isSymbolicLink()) return sum;

    const fullPath = path.join(dir, entry.name);
    if (entry.isDirectory()) return sum + dirSizeBytes(fullPath);
    if (!entry.isFile()) return sum;

    try {
      return sum + fs.lstatSync(fullPath).size;
    } catch {
      return sum;
    }
  }, 0);
}
|
|
42
|
+
|
|
43
|
+
/**
 * Format a byte count with 1024-based units (B, KB, MB, GB).
 *
 * Values under 10 in KB and above show one decimal; bytes and values of 10 or
 * more show none. GB is the largest unit.
 *
 * @param {number} bytes - Size in bytes.
 * @returns {string} e.g. "1.5 KB", "480 MB"; "—" for falsy values or values below 1.
 */
export function humanSize(bytes) {
  if (!bytes || bytes < 1) return "—";

  const units = ["B", "KB", "MB", "GB"];
  const lastUnit = units.length - 1;
  let value = bytes;
  let unitIndex = 0;
  for (; value >= 1024 && unitIndex < lastUnit; unitIndex += 1) {
    value /= 1024;
  }

  const decimals = unitIndex === 0 || value >= 10 ? 0 : 1;
  return `${value.toFixed(decimals)} ${units[unitIndex]}`;
}
|
|
50
|
+
|
|
51
|
+
// Catalog: per backend, the offered models with their HF repo id and an
|
|
52
|
+
// approximate download size (used only until the model is actually on disk).
|
|
53
|
+
/**
 * Models offered per backend. Each entry is `{ id, repo, approx_mb }`:
 * the short model id, its HuggingFace repo id, and an approximate download
 * size in MB. Rows below are written as [id, repo, approx_mb].
 */
export const STT_MODEL_CATALOG = {
  faster: [
    ["tiny", "Systran/faster-whisper-tiny", 75],
    ["base", "Systran/faster-whisper-base", 145],
    ["small", "Systran/faster-whisper-small", 480],
    ["medium", "Systran/faster-whisper-medium", 1500],
    ["large-v3", "Systran/faster-whisper-large-v3", 3100],
    ["large-v3-turbo", "mobiuslabsgmbh/faster-whisper-large-v3-turbo", 1600],
  ].map(([id, repo, approx_mb]) => ({ id, repo, approx_mb })),
  mlx: [
    ["small", "mlx-community/whisper-small-mlx", 480],
    ["large-v3", "mlx-community/whisper-large-v3-mlx", 3100],
    ["large-v3-turbo", "mlx-community/whisper-large-v3-turbo", 1600],
  ].map(([id, repo, approx_mb]) => ({ id, repo, approx_mb })),
};
|
|
68
|
+
|
|
69
|
+
/** Status of one repo in the HF cache. */
|
|
70
|
+
/**
 * Report whether a repo's weights are present in the hub cache.
 *
 * @param {string} repo - HuggingFace repo id.
 * @returns {{downloaded: boolean, size_bytes: number}} `downloaded` requires
 *   more than 1,000,000 bytes on disk, so a cache folder holding only small
 *   files does not count as downloaded.
 */
export function modelStatusByRepo(repo) {
  const cachePath = path.join(hubDir(), repoCacheName(repo));

  if (fs.existsSync(cachePath)) {
    const bytesOnDisk = dirSizeBytes(cachePath);
    return { downloaded: bytesOnDisk > 1_000_000, size_bytes: bytesOnDisk };
  }
  return { downloaded: false, size_bytes: 0 };
}
|
|
77
|
+
|
|
78
|
+
/** List a backend's models with download status + sizes. */
|
|
79
|
+
/**
 * List the catalog models of a backend together with their on-disk status.
 *
 * @param {string} backend - Catalog key; an unknown backend yields [].
 * @returns {Array<{id: string, repo: string, downloaded: boolean, size: string, size_bytes: number}>}
 *   `size` is the measured size once downloaded, otherwise the approximate
 *   download size prefixed with "~".
 */
export function listSttModels(backend) {
  const models = [];
  for (const { id, repo, approx_mb } of STT_MODEL_CATALOG[backend] || []) {
    const { downloaded, size_bytes } = modelStatusByRepo(repo);
    const size = downloaded
      ? humanSize(size_bytes)
      : `~${humanSize(approx_mb * 1024 * 1024)}`;
    models.push({ id, repo, downloaded, size, size_bytes });
  }
  return models;
}
|
|
92
|
+
|
|
93
|
+
/** Resolve the HF repo id for a (backend, model-id) pair. */
|
|
94
|
+
/**
 * Look up the HuggingFace repo id for a (backend, model id) pair.
 *
 * @param {string} backend - Catalog key.
 * @param {string} modelId - Short model id, or a raw repo id.
 * @returns {string} The catalog repo, or `modelId` unchanged when there is no
 *   matching entry (so a raw repo id passes through).
 */
export function repoFor(backend, modelId) {
  const models = STT_MODEL_CATALOG[backend] || [];
  for (const candidate of models) {
    if (candidate.id === modelId) {
      return candidate.repo || modelId;
    }
  }
  return modelId;
}
|
|
@@ -23,7 +23,11 @@ import { logInfo, logWarn } from "#core/logging.js";
|
|
|
23
23
|
export const WHISPER_LOCAL_PORT = 18765;
|
|
24
24
|
|
|
25
25
|
export const DEFAULT_LOCAL = {
|
|
26
|
-
|
|
26
|
+
// "auto" = adapt to the machine (mlx/Metal on Apple Silicon, faster-whisper
|
|
27
|
+
// cuda on NVIDIA, else faster-whisper cpu). Override with "faster" | "mlx".
|
|
28
|
+
backend: "auto",
|
|
29
|
+
model: "small", // faster-whisper model id (tiny|base|small|…)
|
|
30
|
+
mlx_model: "", // mlx repo (defaults to the hardware recommendation)
|
|
27
31
|
device: "cpu",
|
|
28
32
|
compute_type: "int8",
|
|
29
33
|
language: "auto",
|
|
@@ -34,6 +38,28 @@ export const DEFAULT_LOCAL = {
|
|
|
34
38
|
timeout_ms: 20 * 60_000,
|
|
35
39
|
};
|
|
36
40
|
|
|
41
|
+
// OpenAI's official cloud Whisper. `base_url` is overridable so the same
|
|
42
|
+
// client can target any OpenAI-compatible server (see `custom`).
|
|
43
|
+
export const DEFAULT_OPENAI = {
|
|
44
|
+
base_url: "https://api.openai.com/v1",
|
|
45
|
+
model: "whisper-1",
|
|
46
|
+
language: "auto",
|
|
47
|
+
};
|
|
48
|
+
|
|
49
|
+
// A user-supplied, OpenAI-compatible STT server reachable over the network:
|
|
50
|
+
// mlx-audio on this Mac's Metal GPU (localhost:8000), a Radeon/NVIDIA box on
|
|
51
|
+
// the LAN, or anyone's remote endpoint. All expose POST /audio/transcriptions,
|
|
52
|
+
// so they share the exact client as `openai` — only base_url/key/model differ.
|
|
53
|
+
export const DEFAULT_CUSTOM = {
|
|
54
|
+
base_url: "", // e.g. http://localhost:8000/v1 or http://192.168.1.50:9000/v1
|
|
55
|
+
api_key: "", // optional — most local servers don't require one
|
|
56
|
+
model: "", // e.g. mlx-community/whisper-large-v3-turbo or Systran/faster-whisper-large-v3
|
|
57
|
+
language: "auto",
|
|
58
|
+
};
|
|
59
|
+
|
|
60
|
+
/** STT engine ids surfaced to the web admin, in display/fallback order. */
|
|
61
|
+
export const STT_ENGINE_IDS = ["local", "openai", "custom"];
|
|
62
|
+
|
|
37
63
|
/**
|
|
38
64
|
* Resolve the effective transcription language. Priority:
|
|
39
65
|
* explicit local config → config.user.language → "auto" (whisper detects).
|
|
@@ -44,29 +70,110 @@ export function resolveTranscriptionLanguage(localCfg, userLang) {
|
|
|
44
70
|
return "auto";
|
|
45
71
|
}
|
|
46
72
|
|
|
73
|
+
/**
|
|
74
|
+
* Resolve the local engine's effective backend + model in place.
|
|
75
|
+
* backend "auto" → mlx (Apple Silicon/Metal), faster-whisper cuda (NVIDIA),
|
|
76
|
+
* else faster-whisper cpu.
|
|
77
|
+
* Safety net: if the chosen mlx model isn't downloaded yet, fall back to
|
|
78
|
+
* faster-whisper so a live voice turn never stalls on a multi-GB download —
|
|
79
|
+
* the model-manager UI handles the explicit download.
|
|
80
|
+
*/
|
|
81
|
+
/**
 * Resolve the local engine's backend/model/device, mutating `local` in place.
 *
 * - backend "auto" takes the hardware recommendation's backend.
 * - "mlx" is only kept when its model is reported as downloaded; then
 *   `model`, `device` ("metal") and `compute_type` ("mlx") are overwritten.
 * - Every other outcome ends with backend "faster". When the recommendation is
 *   faster-whisper on CUDA, a "cpu"/unset device becomes "cuda" and an "int8"
 *   compute type becomes the recommended one.
 *
 * If the hardware module cannot be loaded, a faster-whisper CPU recommendation
 * is assumed; if the model status cannot be read, the model counts as missing.
 *
 * @param {object} local - Local transcription config (mutated).
 * @returns {Promise<void>}
 */
async function resolveLocalBackend(local) {
  const requested = local.backend || "auto";

  let recommendation;
  try {
    const { recommendStt } = await import("#core/voice/stt-hardware.js");
    recommendation = recommendStt();
  } catch {
    recommendation = { backend: "faster", model: "small", device: "cpu", compute_type: "int8" };
  }

  const effective = requested === "auto" ? recommendation.backend : requested;

  if (effective === "mlx") {
    const mlxRepo = local.mlx_model || recommendation.model;
    let present = false;
    try {
      const { modelStatusByRepo } = await import("#core/voice/stt-models.js");
      present = modelStatusByRepo(mlxRepo).downloaded;
    } catch {
      // Status unavailable → treat the model as not downloaded.
    }
    if (present) {
      Object.assign(local, {
        backend: "mlx",
        model: mlxRepo,
        device: "metal",
        compute_type: "mlx",
      });
      return;
    }
    // Model not on disk: fall through to faster-whisper.
  }

  const cudaRecommended = recommendation.backend === "faster" && recommendation.device === "cuda";
  if (cudaRecommended) {
    if (!local.device || local.device === "cpu") {
      local.device = "cuda";
    }
    if (local.compute_type === "int8") {
      local.compute_type = recommendation.compute_type || "float16";
    }
  }
  local.backend = "faster";
}
|
|
117
|
+
|
|
47
118
|
export async function getConfig() {
|
|
48
119
|
try {
|
|
49
120
|
const { readConfig } = await import("#core/config/index.js");
|
|
50
121
|
const cfg = readConfig() || {};
|
|
51
122
|
const t = cfg.transcription || {};
|
|
52
|
-
const openaiKey = cfg.engines?.openai?.api_key || process.env.OPENAI_API_KEY || "";
|
|
53
123
|
const userLang = cfg.user?.language || "";
|
|
124
|
+
|
|
54
125
|
const localBase = { ...DEFAULT_LOCAL, ...(t.local || {}) };
|
|
55
126
|
localBase.language = resolveTranscriptionLanguage(localBase, userLang);
|
|
127
|
+
await resolveLocalBackend(localBase);
|
|
128
|
+
|
|
129
|
+
// OpenAI cloud: key can live in transcription.openai, the shared
|
|
130
|
+
// engines.openai block, or the env. base_url defaults to the official API.
|
|
131
|
+
const openai = { ...DEFAULT_OPENAI, ...(t.openai || {}) };
|
|
132
|
+
openai.api_key = t.openai?.api_key || cfg.engines?.openai?.api_key || process.env.OPENAI_API_KEY || "";
|
|
133
|
+
openai.language = resolveTranscriptionLanguage(openai, userLang);
|
|
134
|
+
|
|
135
|
+
// Custom OpenAI-compatible server (mlx-audio / Radeon / NVIDIA / remote).
|
|
136
|
+
const custom = { ...DEFAULT_CUSTOM, ...(t.custom || {}) };
|
|
137
|
+
custom.language = resolveTranscriptionLanguage(custom, userLang);
|
|
138
|
+
|
|
56
139
|
return {
|
|
57
140
|
provider: t.provider || "auto",
|
|
58
141
|
local: localBase,
|
|
59
|
-
|
|
142
|
+
openai,
|
|
143
|
+
custom,
|
|
144
|
+
// kept for backward-compat with callers that read `.openaiKey`
|
|
145
|
+
openaiKey: openai.api_key,
|
|
60
146
|
};
|
|
61
147
|
} catch {
|
|
62
148
|
return {
|
|
63
149
|
provider: "auto",
|
|
64
150
|
local: { ...DEFAULT_LOCAL },
|
|
151
|
+
openai: { ...DEFAULT_OPENAI, api_key: process.env.OPENAI_API_KEY || "" },
|
|
152
|
+
custom: { ...DEFAULT_CUSTOM },
|
|
65
153
|
openaiKey: process.env.OPENAI_API_KEY || "",
|
|
66
154
|
};
|
|
67
155
|
}
|
|
68
156
|
}
|
|
69
157
|
|
|
158
|
+
/**
|
|
159
|
+
* List STT engines + availability for the web admin (mirrors tts listProviders).
|
|
160
|
+
* @returns {{configured_provider:string, engines:Array<{id,available,configured}>}}
|
|
161
|
+
*/
|
|
162
|
+
/**
 * Describe the STT engines and whether each one is usable.
 *
 * - local: always available and configured.
 * - openai: usable when an API key is found in transcription.openai.api_key,
 *   engines.openai.api_key, or the OPENAI_API_KEY environment variable.
 * - custom: usable when transcription.custom.base_url is non-blank.
 *
 * @param {object} [rawConfig] - Raw config object.
 * @returns {{configured_provider: string, engines: Array<{id: string, available: boolean, configured: boolean}>}}
 *   `configured_provider` is transcription.provider, defaulting to "auto".
 */
export function listSttProviders(rawConfig = {}) {
  const transcription = rawConfig.transcription || {};
  const configuredProvider = transcription.provider || "auto";

  const hasOpenaiKey = Boolean(
    transcription.openai?.api_key
      || rawConfig.engines?.openai?.api_key
      || process.env.OPENAI_API_KEY
      || "",
  );
  const hasCustomUrl = Boolean((transcription.custom?.base_url || "").trim());

  // `available` and `configured` always carry the same value for an engine.
  const describe = (id, usable) => ({ id, available: usable, configured: usable });

  return {
    configured_provider: configuredProvider,
    engines: [
      describe("local", true),
      describe("openai", hasOpenaiKey),
      describe("custom", hasCustomUrl),
    ],
  };
}
|
|
176
|
+
|
|
70
177
|
/**
|
|
71
178
|
* Call the local whisper-server.py over HTTP. Does NOT spawn or check the
|
|
72
179
|
* subprocess — that's host/daemon/whisper-server.js's job. If the server is
|
|
@@ -138,9 +245,14 @@ export async function transcribeViaLocalServer(filePath, opts) {
|
|
|
138
245
|
throw lastErr || new Error("transcribeViaLocalServer: unknown failure");
|
|
139
246
|
}
|
|
140
247
|
|
|
141
|
-
/**
|
|
142
|
-
|
|
143
|
-
|
|
248
|
+
/**
|
|
249
|
+
* OpenAI-compatible transcription (POST {base_url}/audio/transcriptions with a
|
|
250
|
+
* multipart `file` + `model`). Works against OpenAI itself and any server that
|
|
251
|
+
* speaks the same contract: mlx-audio, faster-whisper-server, whisper.cpp
|
|
252
|
+
* server, etc. `backend` is just the label returned to the caller.
|
|
253
|
+
*/
|
|
254
|
+
export async function transcribeViaOpenAICompatible(filePath, { base_url, api_key, model, language, backend = "openai", timeout_ms = 120_000 } = {}) {
|
|
255
|
+
const baseUrl = (base_url || DEFAULT_OPENAI.base_url).replace(/\/+$/, "");
|
|
144
256
|
const buf = fs.readFileSync(filePath);
|
|
145
257
|
const ext = path.extname(filePath).slice(1).toLowerCase() || "webm";
|
|
146
258
|
const fileType = ext === "ogg" || ext === "oga" ? "audio/ogg"
|
|
@@ -151,28 +263,40 @@ export async function transcribeOpenAI(filePath, apiKey) {
|
|
|
151
263
|
: "application/octet-stream";
|
|
152
264
|
|
|
153
265
|
const form = new FormData();
|
|
154
|
-
form.append("model",
|
|
266
|
+
form.append("model", model || DEFAULT_OPENAI.model);
|
|
267
|
+
if (language && language !== "auto") form.append("language", language);
|
|
155
268
|
form.append("file", new Blob([buf], { type: fileType }), path.basename(filePath));
|
|
156
269
|
|
|
157
|
-
const
|
|
270
|
+
const t0 = Date.now();
|
|
271
|
+
const res = await fetch(`${baseUrl}/audio/transcriptions`, {
|
|
158
272
|
method: "POST",
|
|
159
|
-
|
|
273
|
+
// Auth header only when a key is set — local servers usually need none.
|
|
274
|
+
headers: api_key ? { authorization: `Bearer ${api_key}` } : {},
|
|
160
275
|
body: form,
|
|
161
|
-
signal: AbortSignal.timeout(
|
|
276
|
+
signal: AbortSignal.timeout(timeout_ms),
|
|
162
277
|
});
|
|
163
278
|
if (!res.ok) {
|
|
164
279
|
const errBody = await res.text().catch(() => "");
|
|
165
|
-
throw new Error(
|
|
280
|
+
throw new Error(`${backend} stt ${res.status}: ${errBody.slice(0, 240)}`);
|
|
166
281
|
}
|
|
167
282
|
const json = await res.json();
|
|
283
|
+
logInfo("whisper", `transcribeViaOpenAICompatible(${backend}) ok in ${Date.now() - t0}ms`, {
|
|
284
|
+
chars: (json.text || "").length, base_url: baseUrl, model: model || DEFAULT_OPENAI.model,
|
|
285
|
+
});
|
|
168
286
|
return {
|
|
169
287
|
ok: true,
|
|
170
|
-
backend
|
|
288
|
+
backend,
|
|
171
289
|
text: json.text || "",
|
|
172
290
|
language: json.language || null,
|
|
173
291
|
};
|
|
174
292
|
}
|
|
175
293
|
|
|
294
|
+
/** Back-compat shim: OpenAI Whisper-1 cloud API by key. */
|
|
295
|
+
/**
 * Back-compat wrapper: transcribe through the OpenAI-compatible client using
 * the DEFAULT_OPENAI settings and the given key.
 *
 * @param {string} filePath - Audio file to transcribe.
 * @param {string} apiKey - API key; required.
 * @returns {Promise<object>} Result of transcribeViaOpenAICompatible.
 * @throws {Error} "openai transcription: no api_key" when `apiKey` is falsy.
 */
export async function transcribeOpenAI(filePath, apiKey) {
  if (apiKey) {
    const request = { ...DEFAULT_OPENAI, api_key: apiKey, backend: "openai" };
    return transcribeViaOpenAICompatible(filePath, request);
  }
  throw new Error("openai transcription: no api_key");
}
|
|
299
|
+
|
|
176
300
|
/**
|
|
177
301
|
* Transcribe a file. Provider chosen by config:
|
|
178
302
|
* - "openai": cloud only
|
|
@@ -188,17 +312,24 @@ export async function transcribe(filePath, overrides = {}) {
|
|
|
188
312
|
const localOpts = { ...cfg.local, ...overrides };
|
|
189
313
|
|
|
190
314
|
if (provider === "openai") {
|
|
191
|
-
return
|
|
315
|
+
return transcribeViaOpenAICompatible(filePath, { ...cfg.openai, backend: "openai" });
|
|
316
|
+
}
|
|
317
|
+
if (provider === "custom") {
|
|
318
|
+
if (!cfg.custom.base_url) throw new Error("custom transcription: set transcription.custom.base_url");
|
|
319
|
+
return transcribeViaOpenAICompatible(filePath, { ...cfg.custom, backend: "custom" });
|
|
192
320
|
}
|
|
193
321
|
if (provider === "local") {
|
|
194
322
|
return transcribeViaLocalServer(filePath, localOpts);
|
|
195
323
|
}
|
|
196
|
-
// auto: local first,
|
|
324
|
+
// auto: local first, then a configured remote (custom preferred over openai).
|
|
197
325
|
try {
|
|
198
326
|
return await transcribeViaLocalServer(filePath, localOpts);
|
|
199
327
|
} catch (localErr) {
|
|
200
|
-
if (cfg.
|
|
201
|
-
return
|
|
328
|
+
if (cfg.custom.base_url) {
|
|
329
|
+
return transcribeViaOpenAICompatible(filePath, { ...cfg.custom, backend: "custom" });
|
|
330
|
+
}
|
|
331
|
+
if (cfg.openai.api_key) {
|
|
332
|
+
return transcribeViaOpenAICompatible(filePath, { ...cfg.openai, backend: "openai" });
|
|
202
333
|
}
|
|
203
334
|
throw new Error(`local transcription failed: ${localErr.message}`);
|
|
204
335
|
}
|
|
@@ -1,6 +1,9 @@
|
|
|
1
1
|
// Desktop (floating voice window) HTTP surface.
|
|
2
2
|
//
|
|
3
|
-
// GET /desktop/status connected websocket clients
|
|
3
|
+
// GET /desktop/status running flag (pid) + connected websocket clients
|
|
4
|
+
// POST /desktop/start launch the floating window (detached Electron)
|
|
5
|
+
// POST /desktop/stop terminate the running window (SIGTERM)
|
|
6
|
+
// POST /desktop/restart broadcast a "reload" so live windows re-read config
|
|
4
7
|
// POST /desktop/message text (post-STT). Responds 200 immediately;
|
|
5
8
|
// the super-agent answer is streamed back over WS
|
|
6
9
|
// by the desktop plugin.
|
|
@@ -16,18 +19,61 @@ import {
|
|
|
16
19
|
autostartInstall,
|
|
17
20
|
autostartUninstall,
|
|
18
21
|
} from "#core/desktop/autostart.js";
|
|
22
|
+
import {
|
|
23
|
+
isDesktopRunning,
|
|
24
|
+
startDesktopDetached,
|
|
25
|
+
stopDesktop,
|
|
26
|
+
} from "#core/desktop/process.js";
|
|
19
27
|
|
|
20
|
-
export function register(app, { plugins }) {
|
|
28
|
+
export function register(app, { plugins, config }) {
|
|
21
29
|
app.get("/desktop/status", (_req, res) => {
|
|
30
|
+
// `running` is the live Electron process (pid file) — the source of truth
|
|
31
|
+
// for the Start/Stop/Restart controls. `connected_clients` is how many of
|
|
32
|
+
// those windows have an open WS to the daemon (a window can be running but
|
|
33
|
+
// mid-reconnect), surfaced separately.
|
|
34
|
+
const running = isDesktopRunning();
|
|
22
35
|
import("../desktop-ws.js")
|
|
23
36
|
.then(({ desktopClients }) => {
|
|
24
|
-
res.json({
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
37
|
+
res.json({ ok: true, connected_clients: desktopClients.size, running });
|
|
38
|
+
})
|
|
39
|
+
.catch(() => res.json({ ok: true, connected_clients: 0, running }));
|
|
40
|
+
});
|
|
41
|
+
|
|
42
|
+
// POST /desktop/start — launch the floating window (detached Electron). Same
|
|
43
|
+
// helper the CLI's `apx desktop start` uses. No-op-safe if already running.
|
|
44
|
+
app.post("/desktop/start", async (_req, res) => {
|
|
45
|
+
try {
|
|
46
|
+
const r = await startDesktopDetached({ port: config?.port });
|
|
47
|
+
if (!r.ok) return res.status(500).json({ ok: false, error: r.error });
|
|
48
|
+
res.json({ ok: true, pid: r.pid, already: !!r.already });
|
|
49
|
+
} catch (e) {
|
|
50
|
+
res.status(500).json({ ok: false, error: e.message });
|
|
51
|
+
}
|
|
52
|
+
});
|
|
53
|
+
|
|
54
|
+
// POST /desktop/stop — terminate the running window (SIGTERM). `stopped` is
|
|
55
|
+
// false when nothing was running.
|
|
56
|
+
app.post("/desktop/stop", (_req, res) => {
|
|
57
|
+
const r = stopDesktop();
|
|
58
|
+
if (!r.ok) return res.status(500).json({ ok: false, error: r.error });
|
|
59
|
+
res.json({ ok: true, stopped: r.stopped });
|
|
60
|
+
});
|
|
61
|
+
|
|
62
|
+
// POST /desktop/restart — ask every connected desktop window to reload.
|
|
63
|
+
// The web admin's "Restart" button hits this after a config change (theme,
|
|
64
|
+
// position) so the floating window re-reads ~/.apx/config.json and re-applies
|
|
65
|
+
// it without the user dropping to a terminal. The reload is a soft refresh of
|
|
66
|
+
// the renderer (main.js repositions + reloads webContents), NOT a process
|
|
67
|
+
// kill — the Electron app keeps its tray/shortcut. Returns how many windows
|
|
68
|
+
// were signalled so the UI can tell "reloaded" from "nothing connected".
|
|
69
|
+
app.post("/desktop/restart", (_req, res) => {
|
|
70
|
+
import("../desktop-ws.js")
|
|
71
|
+
.then(({ desktopClients, broadcastDesktop }) => {
|
|
72
|
+
const reloaded = desktopClients.size;
|
|
73
|
+
broadcastDesktop({ type: "reload" });
|
|
74
|
+
res.json({ ok: true, reloaded });
|
|
29
75
|
})
|
|
30
|
-
.catch(() => res.json({ ok:
|
|
76
|
+
.catch((e) => res.status(500).json({ ok: false, error: e.message }));
|
|
31
77
|
});
|
|
32
78
|
|
|
33
79
|
app.post("/desktop/message", async (req, res) => {
|