@agentprojectcontext/apx 1.48.1 → 1.48.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/core/voice/stt-hardware.js +13 -4
- package/src/host/daemon/api/transcribe.js +5 -1
- package/src/interfaces/web/dist/assets/{index-BDJfFzQk.js → index-Bgu-xy_L.js} +124 -124
- package/src/interfaces/web/dist/assets/{index-BDJfFzQk.js.map → index-Bgu-xy_L.js.map} +1 -1
- package/src/interfaces/web/dist/assets/index-C53eJujd.css +1 -0
- package/src/interfaces/web/dist/index.html +2 -2
- package/src/interfaces/web/src/components/voice/VoiceSttCard.tsx +123 -17
- package/src/interfaces/web/src/i18n/en.ts +7 -0
- package/src/interfaces/web/src/i18n/es.ts +7 -0
- package/src/interfaces/web/src/lib/api/voice.ts +37 -1
- package/src/interfaces/web/dist/assets/index-CilEtMjV.css +0 -1
|
@@ -1,9 +1,36 @@
|
|
|
1
|
+
import { useEffect, useState } from "react";
|
|
1
2
|
import { Field, Input } from "../ui";
|
|
2
3
|
import { UiSelect } from "../UiSelect";
|
|
3
|
-
import { WHISPER_MODELS, type TranscriptionConfig } from "../../lib/api/voice";
|
|
4
|
+
import { Voice, WHISPER_MODELS, type TranscriptionConfig, type SttHardwareResponse, type SttModelEntry } from "../../lib/api/voice";
|
|
4
5
|
import { isSecretMarker, secretSuffix } from "../../lib/secrets";
|
|
5
6
|
import { t } from "../../i18n";
|
|
6
7
|
|
|
8
|
+
// Acceleration badge — each compute backend gets its own colour so the user can
|
|
9
|
+
// tell at a glance what the local engine runs on (Metal on Apple Silicon, CUDA
|
|
10
|
+
// on NVIDIA, Vulkan/ROCm on AMD, plain CPU otherwise).
|
|
11
|
+
const ACCEL: Record<string, { label: string; cls: string }> = {
|
|
12
|
+
metal: { label: "Metal", cls: "text-emerald-400 border-emerald-500/40 bg-emerald-500/10" },
|
|
13
|
+
cuda: { label: "CUDA", cls: "text-lime-400 border-lime-500/40 bg-lime-500/10" },
|
|
14
|
+
rocm: { label: "Vulkan / ROCm", cls: "text-orange-400 border-orange-500/40 bg-orange-500/10" },
|
|
15
|
+
none: { label: "CPU", cls: "text-muted-fg border-border bg-muted" },
|
|
16
|
+
};
|
|
17
|
+
|
|
18
|
+
function AccelBadge({ gpu }: { gpu: string }) {
|
|
19
|
+
const a = ACCEL[gpu] ?? ACCEL.none;
|
|
20
|
+
return (
|
|
21
|
+
<span className={`inline-flex items-center rounded-md border px-1.5 py-0.5 text-[11px] font-medium ${a.cls}`}>
|
|
22
|
+
{a.label}
|
|
23
|
+
</span>
|
|
24
|
+
);
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
// Human label for the recommended backend (engine + where it runs).
|
|
28
|
+
function backendLabel(rec: SttHardwareResponse["recommended"]): string {
|
|
29
|
+
if (rec.backend === "mlx") return "Metal · mlx-whisper";
|
|
30
|
+
if (rec.backend === "faster") return (rec.device === "cuda" ? "CUDA" : "CPU") + " · faster-whisper";
|
|
31
|
+
return rec.backend;
|
|
32
|
+
}
|
|
33
|
+
|
|
7
34
|
// STT (speech-to-text) configuration. Persisted under config.transcription.
|
|
8
35
|
// The actual capture happens in the desktop window / Telegram / CLI; here the
|
|
9
36
|
// owner picks the engine and configures it:
|
|
@@ -36,6 +63,13 @@ const langOptions = () => [
|
|
|
36
63
|
];
|
|
37
64
|
|
|
38
65
|
export function VoiceSttCard({ config, onPatch, busy }: Props) {
|
|
66
|
+
const [hw, setHw] = useState<SttHardwareResponse | null>(null);
|
|
67
|
+
useEffect(() => {
|
|
68
|
+
let alive = true;
|
|
69
|
+
Voice.sttHardware().then((r) => { if (alive) setHw(r); }).catch(() => {});
|
|
70
|
+
return () => { alive = false; };
|
|
71
|
+
}, []);
|
|
72
|
+
|
|
39
73
|
const provider = config.provider || "auto";
|
|
40
74
|
const local = config.local || {};
|
|
41
75
|
const openai = config.openai || {};
|
|
@@ -63,8 +97,63 @@ export function VoiceSttCard({ config, onPatch, busy }: Props) {
|
|
|
63
97
|
? t("voice_ui.api_key_set", { suffix: secretSuffix(marker) ?? "" })
|
|
64
98
|
: t("voice_ui.api_key_label");
|
|
65
99
|
|
|
100
|
+
// ── Local engine: acceleration backend + model (hardware-adaptive) ─────────
|
|
101
|
+
const localBackend = local.backend || "auto";
|
|
102
|
+
const accel = hw?.hardware.gpu || "none";
|
|
103
|
+
// What "auto" actually resolves to on this machine (mlx on Metal, faster else).
|
|
104
|
+
const effectiveBackend = localBackend === "auto" ? (hw?.recommended.backend || "faster") : localBackend;
|
|
105
|
+
const isMlx = effectiveBackend === "mlx";
|
|
106
|
+
// The accel a chosen backend runs on — drives the badge next to the selector.
|
|
107
|
+
const selectedAccel = isMlx ? "metal" : (effectiveBackend === "faster" && accel === "cuda" ? "cuda" : "none");
|
|
108
|
+
|
|
109
|
+
const backendOptions = () => {
|
|
110
|
+
const opts = [{ value: "auto", label: t("voice_ui.stt_backend_auto") }];
|
|
111
|
+
if (accel === "metal") opts.push({ value: "mlx", label: "Metal — mlx-whisper" });
|
|
112
|
+
opts.push({ value: "faster", label: accel === "cuda" ? "CUDA — faster-whisper" : "CPU — faster-whisper" });
|
|
113
|
+
return opts;
|
|
114
|
+
};
|
|
115
|
+
|
|
116
|
+
// Model list for the effective backend, with on-disk status in the label.
|
|
117
|
+
const [models, setModels] = useState<SttModelEntry[]>([]);
|
|
118
|
+
useEffect(() => {
|
|
119
|
+
let alive = true;
|
|
120
|
+
Voice.sttModels(effectiveBackend).then((r) => { if (alive) setModels(r.models); }).catch(() => { if (alive) setModels([]); });
|
|
121
|
+
return () => { alive = false; };
|
|
122
|
+
}, [effectiveBackend]);
|
|
123
|
+
|
|
124
|
+
const fmtModel = (m: SttModelEntry) => `${m.id} · ${m.downloaded ? "✓ " + m.size : m.size}`;
|
|
125
|
+
const modelOptions = () =>
|
|
126
|
+
models.length
|
|
127
|
+
? models.map((m) => ({ value: isMlx ? m.repo : m.id, label: fmtModel(m) }))
|
|
128
|
+
: WHISPER_MODELS.map((m) => ({ value: m, label: m }));
|
|
129
|
+
const modelValue = isMlx ? (local.mlx_model || hw?.recommended.model || "") : model;
|
|
130
|
+
const modelPatchKey = isMlx ? "transcription.local.mlx_model" : "transcription.local.model";
|
|
131
|
+
const selectedModel = models.find((m) => (isMlx ? m.repo : m.id) === modelValue);
|
|
132
|
+
const needsDownload = !!selectedModel && !selectedModel.downloaded;
|
|
133
|
+
|
|
66
134
|
return (
|
|
67
135
|
<div className="space-y-3">
|
|
136
|
+
{hw && (
|
|
137
|
+
<div className="rounded-lg border border-border bg-muted px-3 py-2 text-sm">
|
|
138
|
+
<div className="flex flex-wrap items-center gap-2">
|
|
139
|
+
<span className="text-muted-fg">{t("voice_ui.stt_hw_label")}:</span>
|
|
140
|
+
<AccelBadge gpu={hw.hardware.gpu} />
|
|
141
|
+
<span className="font-medium text-fg">{hw.hardware.gpuName || hw.hardware.platform}</span>
|
|
142
|
+
{hw.hardware.mem_gb ? (
|
|
143
|
+
<span className="text-muted-fg">
|
|
144
|
+
· {hw.hardware.mem_gb} GB{hw.hardware.unified_memory ? " unified" : ""}
|
|
145
|
+
</span>
|
|
146
|
+
) : null}
|
|
147
|
+
</div>
|
|
148
|
+
<div className="mt-1 text-xs text-muted-fg">
|
|
149
|
+
{t("voice_ui.stt_hw_recommended")}:{" "}
|
|
150
|
+
<span className="text-fg">{hw.recommended.model}</span>
|
|
151
|
+
{" "}({backendLabel(hw.recommended)})
|
|
152
|
+
{hw.recommended.limited ? ` — ${t("voice_ui.stt_hw_limited")}` : ""}
|
|
153
|
+
</div>
|
|
154
|
+
</div>
|
|
155
|
+
)}
|
|
156
|
+
|
|
68
157
|
<Field label={t("voice_ui.stt_engine_label")} hint={t("voice_ui.stt_engine_hint")}>
|
|
69
158
|
<UiSelect
|
|
70
159
|
value={provider}
|
|
@@ -76,23 +165,40 @@ export function VoiceSttCard({ config, onPatch, busy }: Props) {
|
|
|
76
165
|
</Field>
|
|
77
166
|
|
|
78
167
|
{showLocal && (
|
|
79
|
-
<div className="
|
|
80
|
-
<Field label={t("voice_ui.
|
|
81
|
-
<
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
onChange={(v) => onPatch({ "transcription.local.language": v })}
|
|
92
|
-
options={langOptions()}
|
|
93
|
-
disabled={busy}
|
|
94
|
-
/>
|
|
168
|
+
<div className="space-y-3">
|
|
169
|
+
<Field label={t("voice_ui.stt_backend_label")} hint={t("voice_ui.stt_backend_hint")}>
|
|
170
|
+
<div className="flex items-center gap-2">
|
|
171
|
+
<UiSelect
|
|
172
|
+
value={localBackend}
|
|
173
|
+
onChange={(v) => onPatch({ "transcription.local.backend": v })}
|
|
174
|
+
options={backendOptions()}
|
|
175
|
+
disabled={busy}
|
|
176
|
+
className="max-w-xs"
|
|
177
|
+
/>
|
|
178
|
+
<AccelBadge gpu={selectedAccel} />
|
|
179
|
+
</div>
|
|
95
180
|
</Field>
|
|
181
|
+
<div className="grid grid-cols-1 gap-3 sm:grid-cols-2">
|
|
182
|
+
<Field
|
|
183
|
+
label={t("voice_ui.stt_model_label")}
|
|
184
|
+
hint={needsDownload ? t("voice_ui.stt_model_needs_download", { size: selectedModel!.size }) : t("voice_ui.stt_model_hint")}
|
|
185
|
+
>
|
|
186
|
+
<UiSelect
|
|
187
|
+
value={modelValue}
|
|
188
|
+
onChange={(v) => onPatch({ [modelPatchKey]: v })}
|
|
189
|
+
options={modelOptions()}
|
|
190
|
+
disabled={busy}
|
|
191
|
+
/>
|
|
192
|
+
</Field>
|
|
193
|
+
<Field label={t("voice_ui.stt_language_label")} hint={t("voice_ui.stt_language_hint")}>
|
|
194
|
+
<UiSelect
|
|
195
|
+
value={language}
|
|
196
|
+
onChange={(v) => onPatch({ "transcription.local.language": v })}
|
|
197
|
+
options={langOptions()}
|
|
198
|
+
disabled={busy}
|
|
199
|
+
/>
|
|
200
|
+
</Field>
|
|
201
|
+
</div>
|
|
96
202
|
</div>
|
|
97
203
|
)}
|
|
98
204
|
|
|
@@ -1171,6 +1171,13 @@ export const en = {
|
|
|
1171
1171
|
stt_custom_model_label: "Model",
|
|
1172
1172
|
stt_custom_model_hint: "e.g. mlx-community/whisper-large-v3-turbo or large-v3.",
|
|
1173
1173
|
stt_custom_key_hint: "Optional — most local servers need no key.",
|
|
1174
|
+
stt_hw_label: "Detected hardware",
|
|
1175
|
+
stt_hw_recommended: "Recommended",
|
|
1176
|
+
stt_hw_limited: "limited GPU acceleration, using CPU",
|
|
1177
|
+
stt_backend_label: "Acceleration / Engine",
|
|
1178
|
+
stt_backend_hint: "Auto adapts to your hardware. Metal runs on the GPU (mlx); CPU uses faster-whisper.",
|
|
1179
|
+
stt_backend_auto: "Automatic (recommended)",
|
|
1180
|
+
stt_model_needs_download: "Not downloaded ({size}). The model must be downloaded to use this engine.", // {size} already carries the "~" prefix for models not yet on disk
|
|
1174
1181
|
lang_auto: "Auto-detect",
|
|
1175
1182
|
lang_es: "Spanish",
|
|
1176
1183
|
lang_en: "English",
|
|
@@ -1169,6 +1169,13 @@ export const es = {
|
|
|
1169
1169
|
stt_custom_model_label: "Modelo",
|
|
1170
1170
|
stt_custom_model_hint: "Ej: mlx-community/whisper-large-v3-turbo o large-v3.",
|
|
1171
1171
|
stt_custom_key_hint: "Opcional — la mayoría de los servers locales no requieren key.",
|
|
1172
|
+
stt_hw_label: "Hardware detectado",
|
|
1173
|
+
stt_hw_recommended: "Recomendado",
|
|
1174
|
+
stt_hw_limited: "aceleración GPU limitada, se usa CPU",
|
|
1175
|
+
stt_backend_label: "Aceleración / Motor",
|
|
1176
|
+
stt_backend_hint: "Auto elige según tu hardware. Metal corre en la GPU (mlx); CPU usa faster-whisper.",
|
|
1177
|
+
stt_backend_auto: "Automático (recomendado)",
|
|
1178
|
+
stt_model_needs_download: "Falta descargar ({size}). Hay que bajar el modelo para usar este motor.", // {size} already carries the "~" prefix for models not yet on disk
|
|
1172
1179
|
lang_auto: "Detección automática",
|
|
1173
1180
|
lang_es: "Español",
|
|
1174
1181
|
lang_en: "Inglés",
|
|
@@ -92,7 +92,9 @@ export interface VoiceTtsConfig {
|
|
|
92
92
|
}
|
|
93
93
|
|
|
94
94
|
export interface TranscriptionLocalConfig {
|
|
95
|
-
|
|
95
|
+
backend?: string; // auto | faster | mlx (auto adapts to the hardware)
|
|
96
|
+
model?: string; // faster-whisper model id (tiny | base | small | …)
|
|
97
|
+
mlx_model?: string; // mlx repo (e.g. mlx-community/whisper-large-v3-turbo)
|
|
96
98
|
device?: string; // cpu | cuda
|
|
97
99
|
compute_type?: string; // int8 | int8_float16 | float16 | float32
|
|
98
100
|
language?: string; // ISO code or "auto"
|
|
@@ -117,6 +119,34 @@ export interface TranscriptionConfig {
|
|
|
117
119
|
custom?: TranscriptionCustomConfig;
|
|
118
120
|
}
|
|
119
121
|
|
|
122
|
+
/** Detected machine + recommended local backend (GET /transcribe/hardware). */
|
|
123
|
+
export interface SttHardware {
|
|
124
|
+
platform: string;
|
|
125
|
+
arch: string;
|
|
126
|
+
appleSilicon: boolean;
|
|
127
|
+
gpu: "metal" | "cuda" | "rocm" | "none";
|
|
128
|
+
gpuName?: string;
|
|
129
|
+
mem_gb?: number;
|
|
130
|
+
unified_memory?: boolean;
|
|
131
|
+
}
|
|
132
|
+
export interface SttHardwareResponse {
|
|
133
|
+
hardware: SttHardware;
|
|
134
|
+
recommended: { backend: string; device?: string; model: string; reason?: string; tier?: string; limited?: boolean };
|
|
135
|
+
}
|
|
136
|
+
|
|
137
|
+
/** One model row from GET /transcribe/models. */
|
|
138
|
+
export interface SttModelEntry {
|
|
139
|
+
id: string;
|
|
140
|
+
repo: string;
|
|
141
|
+
downloaded: boolean;
|
|
142
|
+
size: string; // "1.6 GB" when present, "~1.6 GB" when not yet downloaded
|
|
143
|
+
size_bytes: number;
|
|
144
|
+
}
|
|
145
|
+
export interface SttModelsResponse {
|
|
146
|
+
backend: string;
|
|
147
|
+
models: SttModelEntry[];
|
|
148
|
+
}
|
|
149
|
+
|
|
120
150
|
/** One STT engine entry as reported by GET /transcribe/providers. */
|
|
121
151
|
export interface SttProviderEntry {
|
|
122
152
|
id: string; // "local" | "openai" | "custom"
|
|
@@ -169,6 +199,12 @@ export const Voice = {
|
|
|
169
199
|
/** List TTS engines + availability + the configured default provider. */
|
|
170
200
|
providers: () => http.get<TtsProvidersResponse>("/tts/providers"),
|
|
171
201
|
|
|
202
|
+
/** Detected hardware + the recommended local STT backend (Metal/CUDA/CPU). */
|
|
203
|
+
sttHardware: () => http.get<SttHardwareResponse>("/transcribe/hardware"),
|
|
204
|
+
|
|
205
|
+
/** Model catalog + on-disk status for a local backend ("faster" | "mlx"). */
|
|
206
|
+
sttModels: (backend: string) => http.get<SttModelsResponse>(`/transcribe/models?backend=${encodeURIComponent(backend)}`), // backend is a free-form config string, so escape it before building the query
|
|
207
|
+
|
|
172
208
|
/**
|
|
173
209
|
* Synthesize speech. Returns the audio file path (server-side); the web
|
|
174
210
|
* fetches it via fetchTtsAudioUrl() to play it in the browser. `no_play`
|