@agentprojectcontext/apx 1.42.1 → 1.43.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/core/channels/telegram/api.js +62 -0
- package/src/core/channels/telegram/ask-callbacks.js +238 -0
- package/src/core/config/index.js +2 -0
- package/src/core/config/redact.js +2 -0
- package/src/core/confirmation/adapters/telegram.js +20 -37
- package/src/core/desktop/process.js +126 -0
- package/src/core/voice/stt-hardware.js +87 -0
- package/src/core/voice/stt-models.js +97 -0
- package/src/core/voice/transcription.js +147 -16
- package/src/host/daemon/api/desktop.js +54 -8
- package/src/host/daemon/api/transcribe.js +40 -1
- package/src/host/daemon/plugins/desktop/index.js +6 -1
- package/src/host/daemon/plugins/telegram/index.js +61 -351
- package/src/host/daemon/whisper-server.js +18 -8
- package/src/host/daemon/whisper-server.py +71 -44
- package/src/interfaces/cli/commands/desktop.js +13 -68
- package/src/interfaces/desktop/main.js +32 -4
- package/src/interfaces/desktop/renderer.js +26 -5
- package/src/interfaces/web/dist/assets/index-B0nTYflm.js +651 -0
- package/src/interfaces/web/dist/assets/index-B0nTYflm.js.map +1 -0
- package/src/interfaces/web/dist/assets/index-C22PmKCD.css +1 -0
- package/src/interfaces/web/dist/index.html +2 -2
- package/src/interfaces/web/package-lock.json +3 -3
- package/src/interfaces/web/src/components/ShortcutInput.tsx +156 -0
- package/src/interfaces/web/src/components/voice/VoiceSttCard.tsx +101 -5
- package/src/interfaces/web/src/i18n/en.ts +28 -2
- package/src/interfaces/web/src/i18n/es.ts +28 -2
- package/src/interfaces/web/src/lib/api/desktop.ts +28 -0
- package/src/interfaces/web/src/lib/api/voice.ts +26 -2
- package/src/interfaces/web/src/screens/modules/DeckScreen.tsx +55 -3
- package/src/interfaces/web/src/screens/modules/DesktopScreen.tsx +98 -36
- package/src/interfaces/web/dist/assets/index-BReF4_xV.js +0 -646
- package/src/interfaces/web/dist/assets/index-BReF4_xV.js.map +0 -1
- package/src/interfaces/web/dist/assets/index-wrEbTJbc.css +0 -1
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
// STT model catalog + on-disk status (downloaded? how big?).
|
|
2
|
+
//
|
|
3
|
+
// Both engines pull their weights from the HuggingFace hub cache
|
|
4
|
+
// (~/.cache/huggingface/hub/models--<org>--<name>), just in different formats:
|
|
5
|
+
// faster-whisper → Systran/faster-whisper-<model> (CTranslate2)
|
|
6
|
+
// mlx-whisper → mlx-community/whisper-<model>[-mlx] (MLX; some repos carry an "-mlx" suffix)
|
|
7
|
+
//
|
|
8
|
+
// We read the cache directory to report presence + real byte size, and carry an
|
|
9
|
+
// approximate download size for models that aren't there yet (Ollama-style).
|
|
10
|
+
import fs from "node:fs";
|
|
11
|
+
import os from "node:os";
|
|
12
|
+
import path from "node:path";
|
|
13
|
+
|
|
14
|
+
/**
 * Locate the HuggingFace hub cache directory the same way huggingface_hub does.
 *
 * Resolution order:
 *   1. HF_HUB_CACHE (or the legacy HUGGINGFACE_HUB_CACHE) — used verbatim.
 *   2. $HF_HOME/hub
 *   3. $XDG_CACHE_HOME/huggingface/hub
 *   4. ~/.cache/huggingface/hub
 *
 * A leading "~" in any of the env values is expanded to the user's home dir.
 *
 * @returns {string} path of the hub cache directory (not guaranteed to exist).
 */
function hubDir() {
  const env = process.env;
  // Env values are often written with "~"; Node does not expand it for us.
  const expandHome = (p) =>
    p === "~" || p.startsWith("~/") || p.startsWith("~\\")
      ? path.join(os.homedir(), p.slice(1))
      : p;

  const explicitHub = env.HF_HUB_CACHE || env.HUGGINGFACE_HUB_CACHE;
  if (explicitHub) return expandHome(explicitHub);

  if (env.HF_HOME) return path.join(expandHome(env.HF_HOME), "hub");

  const cacheRoot = env.XDG_CACHE_HOME
    ? expandHome(env.XDG_CACHE_HOME)
    : path.join(os.homedir(), ".cache");
  return path.join(cacheRoot, "huggingface", "hub");
}
|
|
18
|
+
|
|
19
|
+
/**
 * Folder name the HF hub cache uses for a repo id: every "/" becomes "--" and
 * the result is prefixed with "models--".
 * e.g. "Systran/faster-whisper-small" → "models--Systran--faster-whisper-small".
 */
function repoCacheName(repoId) {
  const flattened = repoId.split("/").join("--");
  return `models--${flattened}`;
}
|
|
23
|
+
|
|
24
|
+
/**
 * Sum the sizes of all regular files under `dir`, walking subdirectories
 * iteratively. Unreadable directories and files that fail to stat are skipped,
 * so a missing `dir` yields 0.
 *
 * @param {string} dir - directory to measure.
 * @returns {number} total size in bytes.
 */
function dirSizeBytes(dir) {
  const pending = [dir];
  let bytes = 0;

  while (pending.length > 0) {
    const current = pending.pop();

    let children;
    try {
      children = fs.readdirSync(current, { withFileTypes: true });
    } catch {
      continue;
    }

    for (const child of children) {
      // HF keeps the real bytes once under blobs/ and links to them from
      // snapshots/; ignoring symlinks avoids counting a file twice.
      if (child.isSymbolicLink()) continue;

      const fullPath = path.join(current, child.name);
      if (child.isDirectory()) {
        pending.push(fullPath);
        continue;
      }
      if (!child.isFile()) continue;

      try {
        bytes += fs.lstatSync(fullPath).size;
      } catch {}
    }
  }

  return bytes;
}
|
|
42
|
+
|
|
43
|
+
/**
 * Format a byte count as a short human-readable string ("480 MB", "1.5 GB").
 *
 * Values below 10 in any unit above bytes keep one decimal; everything else is
 * shown as an integer. A value that would round up to 1024 of a unit rolls
 * over to the next unit ("1.0 MB", not "1024 KB"), and one that rounds up to
 * 10 drops its decimal ("10 KB", not "10.0 KB").
 *
 * @param {number} bytes - size in bytes.
 * @returns {string} formatted size, or "—" for zero, negative, sub-byte,
 *   non-finite or non-number input.
 */
export function humanSize(bytes) {
  if (!Number.isFinite(bytes) || bytes < 1) return "—";

  const units = ["B", "KB", "MB", "GB"];
  const last = units.length - 1;
  let unit = 0;
  let value = bytes;
  while (value >= 1024 && unit < last) {
    value /= 1024;
    unit += 1;
  }

  let decimals = value >= 10 || unit === 0 ? 0 : 1;
  // Check what the rounded number looks like before committing to a format:
  // rounding may push the value over a unit or precision boundary.
  const rounded = Number(value.toFixed(decimals));
  if (rounded >= 1024 && unit < last) {
    value = rounded / 1024;
    unit += 1;
    decimals = 1;
  } else if (decimals === 1 && rounded >= 10) {
    decimals = 0;
  }

  return `${value.toFixed(decimals)} ${units[unit]}`;
}
|
|
50
|
+
|
|
51
|
+
// Model catalog keyed by backend. Each row below is [id, HF repo id, approx
// download size in MB]; rows are expanded into { id, repo, approx_mb } objects.
// The approximate size is only shown until the model is actually on disk.
export const STT_MODEL_CATALOG = {
  faster: [
    ["tiny", "Systran/faster-whisper-tiny", 75],
    ["base", "Systran/faster-whisper-base", 145],
    ["small", "Systran/faster-whisper-small", 480],
    ["medium", "Systran/faster-whisper-medium", 1500],
    ["large-v3", "Systran/faster-whisper-large-v3", 3100],
    ["large-v3-turbo", "mobiuslabsgmbh/faster-whisper-large-v3-turbo", 1600],
  ].map(([id, repo, approx_mb]) => ({ id, repo, approx_mb })),
  mlx: [
    ["small", "mlx-community/whisper-small-mlx", 480],
    ["large-v3", "mlx-community/whisper-large-v3-mlx", 3100],
    ["large-v3-turbo", "mlx-community/whisper-large-v3-turbo", 1600],
  ].map(([id, repo, approx_mb]) => ({ id, repo, approx_mb })),
};
|
|
68
|
+
|
|
69
|
+
/**
 * Report whether a repo is present in the HF hub cache and how much disk it uses.
 *
 * A repo counts as downloaded only when its cache folder holds more than 1 MB
 * of real files AND no blob is still marked "*.incomplete". Without the second
 * check, an interrupted multi-GB download would be reported as ready and a
 * caller that selects a backend from this flag would end up waiting on the
 * rest of the download.
 *
 * @param {string} repo - HF repo id, e.g. "Systran/faster-whisper-small".
 * @returns {{downloaded: boolean, size_bytes: number}} size_bytes is the total
 *   of the files currently on disk (partial blobs included).
 */
export function modelStatusByRepo(repo) {
  const dir = path.join(hubDir(), repoCacheName(repo));
  if (!fs.existsSync(dir)) return { downloaded: false, size_bytes: 0 };

  const size = dirSizeBytes(dir);

  // NOTE(review): relies on huggingface_hub writing in-flight blobs as
  // "blobs/<hash>.incomplete" and renaming them on completion — confirm this
  // holds for the hub versions the whisper server installs.
  let partial = false;
  try {
    partial = fs
      .readdirSync(path.join(dir, "blobs"))
      .some((name) => name.endsWith(".incomplete"));
  } catch {
    // No blobs/ directory: nothing in flight; the size check below decides.
  }

  // A bare ref/lock dir with no blobs is "not really downloaded".
  return { downloaded: !partial && size > 1_000_000, size_bytes: size };
}
|
|
77
|
+
|
|
78
|
+
/**
 * List a backend's catalog models with their download status and sizes.
 *
 * `backend` may come straight from a query string, so it is matched against
 * the catalog's own keys only. Names such as "constructor" or "__proto__"
 * would otherwise resolve to inherited members and make `.map` throw.
 *
 * @param {string} backend - catalog key ("faster" | "mlx").
 * @returns {Array<{id: string, repo: string, downloaded: boolean, size: string, size_bytes: number}>}
 *   one entry per catalog model; empty for an unknown backend. `size` is the
 *   real on-disk size when downloaded, otherwise the catalog estimate
 *   prefixed with "~".
 */
export function listSttModels(backend) {
  const catalog = Object.hasOwn(STT_MODEL_CATALOG, backend)
    ? STT_MODEL_CATALOG[backend]
    : [];
  return catalog.map((m) => {
    const st = modelStatusByRepo(m.repo);
    return {
      id: m.id,
      repo: m.repo,
      downloaded: st.downloaded,
      size: st.downloaded
        ? humanSize(st.size_bytes)
        : `~${humanSize(m.approx_mb * 1024 * 1024)}`,
      size_bytes: st.size_bytes,
    };
  });
}
|
|
92
|
+
|
|
93
|
+
/**
 * Resolve the HF repo id for a (backend, model-id) pair.
 *
 * Only the catalog's own keys are consulted, so a backend named after an
 * inherited Object member ("constructor", "__proto__", …) is treated as
 * unknown instead of throwing.
 *
 * @param {string} backend - catalog key ("faster" | "mlx").
 * @param {string} modelId - catalog model id, or a raw HF repo id.
 * @returns {string} the catalog repo id, or `modelId` unchanged when there is
 *   no matching entry (lets callers pass a raw repo id through).
 */
export function repoFor(backend, modelId) {
  const catalog = Object.hasOwn(STT_MODEL_CATALOG, backend)
    ? STT_MODEL_CATALOG[backend]
    : [];
  const entry = catalog.find((m) => m.id === modelId);
  return entry?.repo || modelId;
}
|
|
@@ -23,7 +23,11 @@ import { logInfo, logWarn } from "#core/logging.js";
|
|
|
23
23
|
export const WHISPER_LOCAL_PORT = 18765;
|
|
24
24
|
|
|
25
25
|
export const DEFAULT_LOCAL = {
|
|
26
|
-
|
|
26
|
+
// "auto" = adapt to the machine (mlx/Metal on Apple Silicon, faster-whisper
|
|
27
|
+
// cuda on NVIDIA, else faster-whisper cpu). Override with "faster" | "mlx".
|
|
28
|
+
backend: "auto",
|
|
29
|
+
model: "small", // faster-whisper model id (tiny|base|small|…)
|
|
30
|
+
mlx_model: "", // mlx repo (defaults to the hardware recommendation)
|
|
27
31
|
device: "cpu",
|
|
28
32
|
compute_type: "int8",
|
|
29
33
|
language: "auto",
|
|
@@ -34,6 +38,28 @@ export const DEFAULT_LOCAL = {
|
|
|
34
38
|
timeout_ms: 20 * 60_000,
|
|
35
39
|
};
|
|
36
40
|
|
|
41
|
+
// OpenAI's official cloud Whisper. `base_url` is overridable so the same
|
|
42
|
+
// client can target any OpenAI-compatible server (see `custom`).
|
|
43
|
+
export const DEFAULT_OPENAI = {
|
|
44
|
+
base_url: "https://api.openai.com/v1",
|
|
45
|
+
model: "whisper-1",
|
|
46
|
+
language: "auto",
|
|
47
|
+
};
|
|
48
|
+
|
|
49
|
+
// A user-supplied, OpenAI-compatible STT server reachable over the network:
|
|
50
|
+
// mlx-audio on this Mac's Metal GPU (localhost:8000), a Radeon/NVIDIA box on
|
|
51
|
+
// the LAN, or anyone's remote endpoint. All expose POST /audio/transcriptions,
|
|
52
|
+
// so they share the exact client as `openai` — only base_url/key/model differ.
|
|
53
|
+
export const DEFAULT_CUSTOM = {
|
|
54
|
+
base_url: "", // e.g. http://localhost:8000/v1 or http://192.168.1.50:9000/v1
|
|
55
|
+
api_key: "", // optional — most local servers don't require one
|
|
56
|
+
model: "", // e.g. mlx-community/whisper-large-v3-turbo or Systran/faster-whisper-large-v3
|
|
57
|
+
language: "auto",
|
|
58
|
+
};
|
|
59
|
+
|
|
60
|
+
/** STT engine ids surfaced to the web admin, in display/fallback order. */
|
|
61
|
+
export const STT_ENGINE_IDS = ["local", "openai", "custom"];
|
|
62
|
+
|
|
37
63
|
/**
|
|
38
64
|
* Resolve the effective transcription language. Priority:
|
|
39
65
|
* explicit local config → config.user.language → "auto" (whisper detects).
|
|
@@ -44,29 +70,110 @@ export function resolveTranscriptionLanguage(localCfg, userLang) {
|
|
|
44
70
|
return "auto";
|
|
45
71
|
}
|
|
46
72
|
|
|
73
|
+
/**
 * Decide which local engine to run and write the result back into `local`
 * (mutates its backend / model / device / compute_type fields).
 *
 *   - backend "auto" takes the backend from the hardware recommendation;
 *     any other value is used as given.
 *   - mlx is selected only when its model is already in the HF cache. Then
 *     `local` becomes { backend: "mlx", model: <mlx repo>, device: "metal",
 *     compute_type: "mlx" }.
 *   - In every other case the backend is "faster". If the recommendation is
 *     faster-whisper on CUDA, a "cpu"/unset device becomes "cuda" and an
 *     "int8" compute type becomes the recommended one (or "float16").
 *
 * The mlx presence check is a safety net: a live voice turn should never wait
 * on a multi-GB download — the model-manager UI handles the explicit download.
 * If the hardware module cannot be loaded, a CPU faster-whisper recommendation
 * is assumed; if the model-status lookup fails, the model counts as missing.
 */
async function resolveLocalBackend(local) {
  const requested = local.backend || "auto";

  let recommended;
  try {
    const { recommendStt } = await import("#core/voice/stt-hardware.js");
    recommended = recommendStt();
  } catch {
    recommended = { backend: "faster", model: "small", device: "cpu", compute_type: "int8" };
  }

  const effective = requested === "auto" ? recommended.backend : requested;

  if (effective === "mlx") {
    const mlxRepo = local.mlx_model || recommended.model;

    let present = false;
    try {
      const { modelStatusByRepo } = await import("#core/voice/stt-models.js");
      present = modelStatusByRepo(mlxRepo).downloaded;
    } catch {}

    if (present) {
      Object.assign(local, {
        backend: "mlx",
        model: mlxRepo, // whisper-server.js passes this as --model
        device: "metal",
        compute_type: "mlx",
      });
      return;
    }
    // Model not on disk → fall through to faster-whisper so voice isn't blocked.
  }

  // faster-whisper path. On an NVIDIA box, prefer CUDA + float16 unless the
  // user pinned something explicit.
  const cudaRecommended = recommended.backend === "faster" && recommended.device === "cuda";
  if (cudaRecommended) {
    const deviceIsDefault = !local.device || local.device === "cpu";
    if (deviceIsDefault) local.device = "cuda";
    if (local.compute_type === "int8") {
      local.compute_type = recommended.compute_type || "float16";
    }
  }
  local.backend = "faster";
}
|
|
117
|
+
|
|
47
118
|
export async function getConfig() {
|
|
48
119
|
try {
|
|
49
120
|
const { readConfig } = await import("#core/config/index.js");
|
|
50
121
|
const cfg = readConfig() || {};
|
|
51
122
|
const t = cfg.transcription || {};
|
|
52
|
-
const openaiKey = cfg.engines?.openai?.api_key || process.env.OPENAI_API_KEY || "";
|
|
53
123
|
const userLang = cfg.user?.language || "";
|
|
124
|
+
|
|
54
125
|
const localBase = { ...DEFAULT_LOCAL, ...(t.local || {}) };
|
|
55
126
|
localBase.language = resolveTranscriptionLanguage(localBase, userLang);
|
|
127
|
+
await resolveLocalBackend(localBase);
|
|
128
|
+
|
|
129
|
+
// OpenAI cloud: key can live in transcription.openai, the shared
|
|
130
|
+
// engines.openai block, or the env. base_url defaults to the official API.
|
|
131
|
+
const openai = { ...DEFAULT_OPENAI, ...(t.openai || {}) };
|
|
132
|
+
openai.api_key = t.openai?.api_key || cfg.engines?.openai?.api_key || process.env.OPENAI_API_KEY || "";
|
|
133
|
+
openai.language = resolveTranscriptionLanguage(openai, userLang);
|
|
134
|
+
|
|
135
|
+
// Custom OpenAI-compatible server (mlx-audio / Radeon / NVIDIA / remote).
|
|
136
|
+
const custom = { ...DEFAULT_CUSTOM, ...(t.custom || {}) };
|
|
137
|
+
custom.language = resolveTranscriptionLanguage(custom, userLang);
|
|
138
|
+
|
|
56
139
|
return {
|
|
57
140
|
provider: t.provider || "auto",
|
|
58
141
|
local: localBase,
|
|
59
|
-
|
|
142
|
+
openai,
|
|
143
|
+
custom,
|
|
144
|
+
// kept for backward-compat with callers that read `.openaiKey`
|
|
145
|
+
openaiKey: openai.api_key,
|
|
60
146
|
};
|
|
61
147
|
} catch {
|
|
62
148
|
return {
|
|
63
149
|
provider: "auto",
|
|
64
150
|
local: { ...DEFAULT_LOCAL },
|
|
151
|
+
openai: { ...DEFAULT_OPENAI, api_key: process.env.OPENAI_API_KEY || "" },
|
|
152
|
+
custom: { ...DEFAULT_CUSTOM },
|
|
65
153
|
openaiKey: process.env.OPENAI_API_KEY || "",
|
|
66
154
|
};
|
|
67
155
|
}
|
|
68
156
|
}
|
|
69
157
|
|
|
158
|
+
/**
 * List STT engines + availability for the web admin (mirrors tts listProviders).
 *
 * Accepts the raw config object as read from disk. `null`/`undefined` is
 * treated as an empty config, so a missing config file reports the defaults
 * instead of throwing.
 *
 * - local:  always available (no credentials needed).
 * - openai: available when an API key is found in transcription.openai,
 *           engines.openai, or the OPENAI_API_KEY env var.
 * - custom: available when transcription.custom.base_url is non-blank.
 *
 * @param {object|null} [rawConfig] - raw config; only `transcription` and
 *   `engines.openai.api_key` are read.
 * @returns {{configured_provider: string, engines: Array<{id: string, available: boolean, configured: boolean}>}}
 */
export function listSttProviders(rawConfig = {}) {
  // The default parameter only covers `undefined`; a null config would
  // otherwise throw on the first property access.
  const cfg = rawConfig ?? {};
  const t = cfg.transcription || {};
  const provider = t.provider || "auto";
  const openaiKey = t.openai?.api_key || cfg.engines?.openai?.api_key || process.env.OPENAI_API_KEY || "";
  // String() guards against a non-string value in a hand-edited config file.
  const customUrl = String(t.custom?.base_url || "").trim();
  const engines = [
    // local whisper is embedded (daemon spawns the subprocess on demand) →
    // always usable, no credentials needed.
    { id: "local", available: true, configured: true },
    { id: "openai", available: Boolean(openaiKey), configured: Boolean(openaiKey) },
    { id: "custom", available: Boolean(customUrl), configured: Boolean(customUrl) },
  ];
  return { configured_provider: provider, engines };
}
|
|
176
|
+
|
|
70
177
|
/**
|
|
71
178
|
* Call the local whisper-server.py over HTTP. Does NOT spawn or check the
|
|
72
179
|
* subprocess — that's host/daemon/whisper-server.js's job. If the server is
|
|
@@ -138,9 +245,14 @@ export async function transcribeViaLocalServer(filePath, opts) {
|
|
|
138
245
|
throw lastErr || new Error("transcribeViaLocalServer: unknown failure");
|
|
139
246
|
}
|
|
140
247
|
|
|
141
|
-
/**
|
|
142
|
-
|
|
143
|
-
|
|
248
|
+
/**
|
|
249
|
+
* OpenAI-compatible transcription (POST {base_url}/audio/transcriptions with a
|
|
250
|
+
* multipart `file` + `model`). Works against OpenAI itself and any server that
|
|
251
|
+
* speaks the same contract: mlx-audio, faster-whisper-server, whisper.cpp
|
|
252
|
+
* server, etc. `backend` is just the label returned to the caller.
|
|
253
|
+
*/
|
|
254
|
+
export async function transcribeViaOpenAICompatible(filePath, { base_url, api_key, model, language, backend = "openai", timeout_ms = 120_000 } = {}) {
|
|
255
|
+
const baseUrl = (base_url || DEFAULT_OPENAI.base_url).replace(/\/+$/, "");
|
|
144
256
|
const buf = fs.readFileSync(filePath);
|
|
145
257
|
const ext = path.extname(filePath).slice(1).toLowerCase() || "webm";
|
|
146
258
|
const fileType = ext === "ogg" || ext === "oga" ? "audio/ogg"
|
|
@@ -151,28 +263,40 @@ export async function transcribeOpenAI(filePath, apiKey) {
|
|
|
151
263
|
: "application/octet-stream";
|
|
152
264
|
|
|
153
265
|
const form = new FormData();
|
|
154
|
-
form.append("model",
|
|
266
|
+
form.append("model", model || DEFAULT_OPENAI.model);
|
|
267
|
+
if (language && language !== "auto") form.append("language", language);
|
|
155
268
|
form.append("file", new Blob([buf], { type: fileType }), path.basename(filePath));
|
|
156
269
|
|
|
157
|
-
const
|
|
270
|
+
const t0 = Date.now();
|
|
271
|
+
const res = await fetch(`${baseUrl}/audio/transcriptions`, {
|
|
158
272
|
method: "POST",
|
|
159
|
-
|
|
273
|
+
// Auth header only when a key is set — local servers usually need none.
|
|
274
|
+
headers: api_key ? { authorization: `Bearer ${api_key}` } : {},
|
|
160
275
|
body: form,
|
|
161
|
-
signal: AbortSignal.timeout(
|
|
276
|
+
signal: AbortSignal.timeout(timeout_ms),
|
|
162
277
|
});
|
|
163
278
|
if (!res.ok) {
|
|
164
279
|
const errBody = await res.text().catch(() => "");
|
|
165
|
-
throw new Error(
|
|
280
|
+
throw new Error(`${backend} stt ${res.status}: ${errBody.slice(0, 240)}`);
|
|
166
281
|
}
|
|
167
282
|
const json = await res.json();
|
|
283
|
+
logInfo("whisper", `transcribeViaOpenAICompatible(${backend}) ok in ${Date.now() - t0}ms`, {
|
|
284
|
+
chars: (json.text || "").length, base_url: baseUrl, model: model || DEFAULT_OPENAI.model,
|
|
285
|
+
});
|
|
168
286
|
return {
|
|
169
287
|
ok: true,
|
|
170
|
-
backend
|
|
288
|
+
backend,
|
|
171
289
|
text: json.text || "",
|
|
172
290
|
language: json.language || null,
|
|
173
291
|
};
|
|
174
292
|
}
|
|
175
293
|
|
|
294
|
+
/**
 * Back-compat shim for callers that only hold an API key: sends the file to
 * the OpenAI-compatible client using the DEFAULT_OPENAI settings and the
 * "openai" backend label.
 *
 * @param {string} filePath - audio file to transcribe.
 * @param {string} apiKey - OpenAI API key; required.
 * @throws {Error} when `apiKey` is empty or missing.
 */
export async function transcribeOpenAI(filePath, apiKey) {
  if (!apiKey) {
    throw new Error("openai transcription: no api_key");
  }
  const options = { ...DEFAULT_OPENAI, api_key: apiKey, backend: "openai" };
  return transcribeViaOpenAICompatible(filePath, options);
}
|
|
299
|
+
|
|
176
300
|
/**
|
|
177
301
|
* Transcribe a file. Provider chosen by config:
|
|
178
302
|
* - "openai": cloud only
|
|
@@ -188,17 +312,24 @@ export async function transcribe(filePath, overrides = {}) {
|
|
|
188
312
|
const localOpts = { ...cfg.local, ...overrides };
|
|
189
313
|
|
|
190
314
|
if (provider === "openai") {
|
|
191
|
-
return
|
|
315
|
+
return transcribeViaOpenAICompatible(filePath, { ...cfg.openai, backend: "openai" });
|
|
316
|
+
}
|
|
317
|
+
if (provider === "custom") {
|
|
318
|
+
if (!cfg.custom.base_url) throw new Error("custom transcription: set transcription.custom.base_url");
|
|
319
|
+
return transcribeViaOpenAICompatible(filePath, { ...cfg.custom, backend: "custom" });
|
|
192
320
|
}
|
|
193
321
|
if (provider === "local") {
|
|
194
322
|
return transcribeViaLocalServer(filePath, localOpts);
|
|
195
323
|
}
|
|
196
|
-
// auto: local first,
|
|
324
|
+
// auto: local first, then a configured remote (custom preferred over openai).
|
|
197
325
|
try {
|
|
198
326
|
return await transcribeViaLocalServer(filePath, localOpts);
|
|
199
327
|
} catch (localErr) {
|
|
200
|
-
if (cfg.
|
|
201
|
-
return
|
|
328
|
+
if (cfg.custom.base_url) {
|
|
329
|
+
return transcribeViaOpenAICompatible(filePath, { ...cfg.custom, backend: "custom" });
|
|
330
|
+
}
|
|
331
|
+
if (cfg.openai.api_key) {
|
|
332
|
+
return transcribeViaOpenAICompatible(filePath, { ...cfg.openai, backend: "openai" });
|
|
202
333
|
}
|
|
203
334
|
throw new Error(`local transcription failed: ${localErr.message}`);
|
|
204
335
|
}
|
|
@@ -1,6 +1,9 @@
|
|
|
1
1
|
// Desktop (floating voice window) HTTP surface.
|
|
2
2
|
//
|
|
3
|
-
// GET /desktop/status connected websocket clients
|
|
3
|
+
// GET /desktop/status running flag (pid) + connected websocket clients
|
|
4
|
+
// POST /desktop/start launch the floating window (detached Electron)
|
|
5
|
+
// POST /desktop/stop terminate the running window (SIGTERM)
|
|
6
|
+
// POST /desktop/restart broadcast a "reload" so live windows re-read config
|
|
4
7
|
// POST /desktop/message text (post-STT). Responds 200 immediately;
|
|
5
8
|
// the super-agent answer is streamed back over WS
|
|
6
9
|
// by the desktop plugin.
|
|
@@ -16,18 +19,61 @@ import {
|
|
|
16
19
|
autostartInstall,
|
|
17
20
|
autostartUninstall,
|
|
18
21
|
} from "#core/desktop/autostart.js";
|
|
22
|
+
import {
|
|
23
|
+
isDesktopRunning,
|
|
24
|
+
startDesktopDetached,
|
|
25
|
+
stopDesktop,
|
|
26
|
+
} from "#core/desktop/process.js";
|
|
19
27
|
|
|
20
|
-
export function register(app, { plugins }) {
|
|
28
|
+
export function register(app, { plugins, config }) {
|
|
21
29
|
app.get("/desktop/status", (_req, res) => {
|
|
30
|
+
// `running` is the live Electron process (pid file) — the source of truth
|
|
31
|
+
// for the Start/Stop/Restart controls. `connected_clients` is how many of
|
|
32
|
+
// those windows have an open WS to the daemon (a window can be running but
|
|
33
|
+
// mid-reconnect), surfaced separately.
|
|
34
|
+
const running = isDesktopRunning();
|
|
22
35
|
import("../desktop-ws.js")
|
|
23
36
|
.then(({ desktopClients }) => {
|
|
24
|
-
res.json({
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
37
|
+
res.json({ ok: true, connected_clients: desktopClients.size, running });
|
|
38
|
+
})
|
|
39
|
+
.catch(() => res.json({ ok: true, connected_clients: 0, running }));
|
|
40
|
+
});
|
|
41
|
+
|
|
42
|
+
// POST /desktop/start — launch the floating window (detached Electron). Same
|
|
43
|
+
// helper the CLI's `apx desktop start` uses. No-op-safe if already running.
|
|
44
|
+
app.post("/desktop/start", async (_req, res) => {
|
|
45
|
+
try {
|
|
46
|
+
const r = await startDesktopDetached({ port: config?.port });
|
|
47
|
+
if (!r.ok) return res.status(500).json({ ok: false, error: r.error });
|
|
48
|
+
res.json({ ok: true, pid: r.pid, already: !!r.already });
|
|
49
|
+
} catch (e) {
|
|
50
|
+
res.status(500).json({ ok: false, error: e.message });
|
|
51
|
+
}
|
|
52
|
+
});
|
|
53
|
+
|
|
54
|
+
// POST /desktop/stop — terminate the running window (SIGTERM). `stopped` is
|
|
55
|
+
// false when nothing was running.
|
|
56
|
+
app.post("/desktop/stop", (_req, res) => {
|
|
57
|
+
const r = stopDesktop();
|
|
58
|
+
if (!r.ok) return res.status(500).json({ ok: false, error: r.error });
|
|
59
|
+
res.json({ ok: true, stopped: r.stopped });
|
|
60
|
+
});
|
|
61
|
+
|
|
62
|
+
// POST /desktop/restart — ask every connected desktop window to reload.
|
|
63
|
+
// The web admin's "Restart" button hits this after a config change (theme,
|
|
64
|
+
// position) so the floating window re-reads ~/.apx/config.json and re-applies
|
|
65
|
+
// it without the user dropping to a terminal. The reload is a soft refresh of
|
|
66
|
+
// the renderer (main.js repositions + reloads webContents), NOT a process
|
|
67
|
+
// kill — the Electron app keeps its tray/shortcut. Returns how many windows
|
|
68
|
+
// were signalled so the UI can tell "reloaded" from "nothing connected".
|
|
69
|
+
app.post("/desktop/restart", (_req, res) => {
|
|
70
|
+
import("../desktop-ws.js")
|
|
71
|
+
.then(({ desktopClients, broadcastDesktop }) => {
|
|
72
|
+
const reloaded = desktopClients.size;
|
|
73
|
+
broadcastDesktop({ type: "reload" });
|
|
74
|
+
res.json({ ok: true, reloaded });
|
|
29
75
|
})
|
|
30
|
-
.catch(() => res.json({ ok:
|
|
76
|
+
.catch((e) => res.status(500).json({ ok: false, error: e.message }));
|
|
31
77
|
});
|
|
32
78
|
|
|
33
79
|
app.post("/desktop/message", async (req, res) => {
|
|
@@ -2,10 +2,49 @@
|
|
|
2
2
|
// Raw audio bytes in the body. Headers:
|
|
3
3
|
// X-Audio-Format webm | ogg | wav | mp3 (defaults to webm)
|
|
4
4
|
// X-Language ISO code or "auto"
|
|
5
|
-
// X-Provider auto | local | openai (overrides config)
|
|
5
|
+
// X-Provider auto | local | openai | custom (overrides config)
|
|
6
6
|
//
|
|
7
7
|
// Shared by overlay, telegram voice messages, and any external caller.
|
|
8
8
|
export function register(app) {
|
|
9
|
+
// GET /transcribe/providers — STT engine list + availability for the web
|
|
10
|
+
// admin (mirror of /tts/providers). local = embedded faster-whisper;
|
|
11
|
+
// openai = cloud Whisper; custom = any OpenAI-compatible server (mlx-audio
|
|
12
|
+
// on Metal, a Radeon/NVIDIA box on the LAN, a remote endpoint).
|
|
13
|
+
app.get("/transcribe/providers", async (_req, res) => {
|
|
14
|
+
try {
|
|
15
|
+
const { readConfig } = await import("#core/config/index.js");
|
|
16
|
+
const { listSttProviders } = await import("#core/voice/transcription.js");
|
|
17
|
+
res.json(listSttProviders(readConfig()));
|
|
18
|
+
} catch (e) {
|
|
19
|
+
res.status(500).json({ error: e.message });
|
|
20
|
+
}
|
|
21
|
+
});
|
|
22
|
+
|
|
23
|
+
// GET /transcribe/hardware — detected machine + the recommended local backend
|
|
24
|
+
// (mlx on Apple Silicon, faster-whisper cuda on NVIDIA, else CPU). Drives the
|
|
25
|
+
// "engine adapts itself" UX in the web admin.
|
|
26
|
+
app.get("/transcribe/hardware", async (_req, res) => {
|
|
27
|
+
try {
|
|
28
|
+
const { detectHardware, recommendStt } = await import("#core/voice/stt-hardware.js");
|
|
29
|
+
const hw = detectHardware();
|
|
30
|
+
res.json({ hardware: hw, recommended: recommendStt(hw) });
|
|
31
|
+
} catch (e) {
|
|
32
|
+
res.status(500).json({ error: e.message });
|
|
33
|
+
}
|
|
34
|
+
});
|
|
35
|
+
|
|
36
|
+
// GET /transcribe/models?backend=faster|mlx — model catalog with on-disk
|
|
37
|
+
// status (downloaded? size) for the model-manager UI.
|
|
38
|
+
app.get("/transcribe/models", async (req, res) => {
|
|
39
|
+
try {
|
|
40
|
+
const backend = String(req.query.backend || "faster");
|
|
41
|
+
const { listSttModels } = await import("#core/voice/stt-models.js");
|
|
42
|
+
res.json({ backend, models: listSttModels(backend) });
|
|
43
|
+
} catch (e) {
|
|
44
|
+
res.status(500).json({ error: e.message });
|
|
45
|
+
}
|
|
46
|
+
});
|
|
47
|
+
|
|
9
48
|
// GET /transcribe/warmup — load the local whisper model (if needed) and reset
|
|
10
49
|
// its idle watchdog. Callers (e.g. the desktop window) ping this while open so
|
|
11
50
|
// the first real utterance doesn't pay the cold-load cost.
|
|
@@ -140,7 +140,12 @@ async function _handleMessage({ ws, text, previousMessages }, { projects, config
|
|
|
140
140
|
channel: CHANNELS.DESKTOP,
|
|
141
141
|
...(slashed.handled ? { contextNote: slashed.contextNote } : {}),
|
|
142
142
|
channelMeta: { voice: true }, // desktop module is voice-first → spoken mode
|
|
143
|
-
|
|
143
|
+
// WS path: history was just appended with the current user turn (line 87),
|
|
144
|
+
// so drop it. HTTP path: `previousMessages` came in already excluding the
|
|
145
|
+
// current user turn (the renderer slices it off before POSTing), so
|
|
146
|
+
// dropping again would silently strip the last assistant reply — making
|
|
147
|
+
// every turn look like a fresh conversation to the model.
|
|
148
|
+
previousMessages: ws ? history.slice(0, -1) : history,
|
|
144
149
|
overrideModel: cfg.model || null,
|
|
145
150
|
signal: controller.signal,
|
|
146
151
|
onToken: (chunk) => { liveBuf += chunk; },
|