@agentprojectcontext/apx 1.14.1 → 1.15.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +10 -2
- package/skills/apc-context/SKILL.md +68 -18
- package/skills/apx/SKILL.md +89 -33
- package/src/cli/commands/daemon.js +39 -7
- package/src/cli/commands/sys.js +249 -21
- package/src/cli/commands/telegram.js +8 -2
- package/src/cli/http.js +24 -7
- package/src/cli/index.js +10 -3
- package/src/cli/postinstall.js +54 -4
- package/src/cli/terminal-chat/renderer.js +60 -3
- package/src/core/logging.js +37 -0
- package/src/core/scaffold.js +70 -56
- package/src/daemon/api.js +29 -2
- package/src/daemon/engines/anthropic.js +2 -1
- package/src/daemon/engines/gemini.js +2 -1
- package/src/daemon/engines/index.js +3 -3
- package/src/daemon/engines/ollama.js +2 -1
- package/src/daemon/engines/openai.js +2 -1
- package/src/daemon/plugins/telegram.js +20 -1
- package/src/daemon/skills-loader.js +31 -66
- package/src/daemon/smoke.js +9 -1
- package/src/daemon/super-agent-tools/index.js +2 -0
- package/src/daemon/super-agent-tools/tools/ask-questions.js +28 -0
- package/src/daemon/super-agent-tools/tools/transcribe-audio.js +2 -2
- package/src/daemon/super-agent.js +97 -9
- package/src/daemon/transcription.js +154 -48
- package/src/daemon/whisper-server.py +202 -0
- package/src/daemon/whisper-transcribe.py +3 -1
- package/src/core/apc-context-skill.md +0 -105
- package/src/core/apx-skill.md +0 -135
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
/**
 * Super-agent tool definition for `ask_questions`.
 *
 * The tool performs no work of its own: the agent calls it to signal that it
 * is waiting for the user to answer specific questions, so a UI can highlight
 * them. The handler only acknowledges the call and reports how many questions
 * were supplied.
 */
const askQuestionsTool = {
  name: "ask_questions",
  schema: {
    function: {
      name: "ask_questions",
      description: "Ask the user one or more specific questions to clarify the task or gather requirements.",
      parameters: {
        type: "object",
        properties: {
          questions: {
            type: "array",
            items: { type: "string" },
            description: "A list of questions for the user."
          }
        },
        required: ["questions"]
      }
    }
  },
  /**
   * Build the tool handler.
   *
   * @returns {(args?: { questions?: string[] | string }) => Promise<{status: string, count: number}>}
   *   Async handler resolving to a status message plus the number of questions.
   */
  makeHandler: () => async ({ questions } = {}) => {
    // Arguments come from model output, so they may be missing or mistyped.
    // Count defensively instead of throwing on `undefined.length`.
    let count = 0;
    if (Array.isArray(questions)) {
      count = questions.length;
    } else if (typeof questions === "string" && questions.trim() !== "") {
      count = 1; // a single question passed as a bare string
    }
    return {
      status: "Questions presented to user. Waiting for input.",
      count
    };
  }
};

export default askQuestionsTool;
|
|
@@ -11,7 +11,7 @@ export default {
|
|
|
11
11
|
function: {
|
|
12
12
|
name: "transcribe_audio",
|
|
13
13
|
description:
|
|
14
|
-
"Transcribe an audio file to text. Default backend is local faster-whisper (model '
|
|
14
|
+
"Transcribe an audio file to text. Default backend is local faster-whisper (model 'small' on CPU with int8 quantization, persistent server to avoid reload overhead), with automatic fallback to OpenAI Whisper API if local fails. Pass file_path for a file on disk, or base64 for raw audio bytes (will be written to a temp file). Override provider/model/language as needed.",
|
|
15
15
|
parameters: {
|
|
16
16
|
type: "object",
|
|
17
17
|
properties: {
|
|
@@ -19,7 +19,7 @@ export default {
|
|
|
19
19
|
base64: { type: "string", description: "alternative to file_path — raw base64 audio bytes (or 'data:audio/...;base64,...' data URI)" },
|
|
20
20
|
format: { type: "string", description: "file extension hint when using base64 (default 'ogg')" },
|
|
21
21
|
provider: { type: "string", description: "override the configured provider: 'auto' | 'local' | 'openai'" },
|
|
22
|
-
model: { type: "string", description: "local model size: tiny | base | small | medium | large | large-v2 | large-v3 (default
|
|
22
|
+
model: { type: "string", description: "local model size: tiny | base | small | medium | large | large-v2 | large-v3 (default small)" },
|
|
23
23
|
language: { type: "string", description: "ISO 639-1 code (e.g. 'es', 'en') or 'auto' for detection" },
|
|
24
24
|
device: { type: "string", description: "local device: cpu | cuda (default cpu)" },
|
|
25
25
|
compute_type: { type: "string", description: "local quantization: int8 | int8_float16 | float16 | float32 (default int8)" },
|
|
@@ -68,6 +68,47 @@ HARD RULES (do not deviate):
|
|
|
68
68
|
21. **SKILLS — ON DEMAND**: The "# Available skills" section below lists every skill available to you (slug + description, NO body). When the user asks about specific APX/APC commands, project structure, agent runtimes, or anything where exact syntax or detailed behavior matches a skill description (in ANY language — match semantically, not by keyword), call load_skill({slug}) to fetch the full markdown body. If a CWD is in the contextNote, pass it as project_path so project-scoped skills resolve. If the user explicitly asks "what skills do you have?", you can either read the catalog below directly OR call list_skills to get a fresh enumeration. Do NOT load skills for trivial / unrelated questions — that wastes tokens. Don't guess CLI syntax when a skill can tell you; load it.
|
|
69
69
|
22. **NEVER PASTE BASE64 OR DATA URIs IN MESSAGE TEXT**: When you need to send an image, audio, or file via Telegram (or any channel), you MUST pass it via the dedicated parameter — NEVER embed it in the text field. Concretely: after browser_screenshot returns its base64 field, call send_telegram({text: "<short caption>", photo_base64: "<that base64>"}). Do NOT write text like 'Aquí está: ' — Telegram (and most chat clients) do NOT render data URIs or markdown images; the user sees thousands of garbage characters. Same for files: use document_path / document_base64 / document_url, NOT the text field. The text field is exclusively for human-readable prose (and becomes the caption when media is attached). If unsure, save the image to /tmp/screenshot-<ts>.png first (browser_screenshot supports save_to_tmp=true and returns a path field) and pass that path to send_telegram via photo_path — never inline the bytes in text.`;
|
|
70
70
|
|
|
71
|
+
/**
 * Reduce a tool schema of the shape
 * `{ function: { name, description, parameters: { required, properties } } }`
 * to a compact catalog entry.
 *
 * Each property keeps only `type` (defaulting to "string"), `enum` and
 * `description`. Missing sections are tolerated and become empty
 * collections.
 *
 * @param {object} [schema] tool schema; may be null/undefined
 * @returns {{name: *, description: *, required: Array, properties: object}}
 */
function compactToolSchema(schema) {
  const fnSpec = schema?.function || {};
  const paramSpec = fnSpec.parameters || {};

  const compactEntries = [];
  for (const [propName, propSpec] of Object.entries(paramSpec.properties || {})) {
    compactEntries.push([
      propName,
      {
        type: propSpec?.type || "string",
        enum: propSpec?.enum,
        description: propSpec?.description,
      },
    ]);
  }

  return {
    name: fnSpec.name,
    description: fnSpec.description,
    required: paramSpec.required || [],
    properties: Object.fromEntries(compactEntries),
  };
}
|
|
91
|
+
|
|
92
|
+
/**
 * Build the system prompt used when the engine rejected native structured
 * tools: the original prompt followed by instructions for calling tools via
 * plain JSON lines and a JSON catalog of the compacted tool schemas.
 *
 * @param {string} system original system prompt
 * @returns {string} sections joined by blank lines
 */
function pseudoToolSystem(system) {
  const sections = [
    system,
    "# Structured tool fallback",
    "The engine rejected native structured tools. You can still call tools by emitting plain JSON.",
    "When you need a tool, respond ONLY with one JSON object per line:",
    "{\"name\":\"tool_name\",\"arguments\":{\"arg\":\"value\"}}",
    "After tool results arrive, continue the task or give the final answer normally.",
    "Available tools:",
  ];
  // The catalog goes last so the instructions read top-down before the data.
  sections.push(JSON.stringify(TOOL_SCHEMAS.map(compactToolSchema)));
  return sections.join("\n\n");
}
|
|
105
|
+
|
|
106
|
+
/**
 * Decide whether a failed engine call should be retried with the plain-JSON
 * ("pseudo") tool protocol.
 *
 * True only when the pseudo protocol is not already active, the model id
 * starts with "ollama:" and the error message mentions "ollama 500"
 * (both matched case-insensitively).
 *
 * @param {*} modelId model identifier (coerced to string)
 * @param {*} error thrown value; only its `message` is inspected
 * @param {boolean} alreadyPseudo whether the pseudo protocol is already in use
 * @returns {boolean}
 */
function shouldRetryWithPseudoTools(modelId, error, alreadyPseudo) {
  if (alreadyPseudo) {
    return false;
  }
  const isOllamaModel = /^ollama:/i.test(String(modelId || ""));
  if (!isOllamaModel) {
    return false;
  }
  const errorText = String(error?.message || "");
  return /ollama\s+500/i.test(errorText);
}
|
|
111
|
+
|
|
71
112
|
function isShortConfirmation(text) {
|
|
72
113
|
return /^(yes|y|si|si dale|dale|ok|okay|confirm|confirmed|go|proceed|do it)\b/i
|
|
73
114
|
.test(String(text || "").trim());
|
|
@@ -115,6 +156,7 @@ export async function runSuperAgent({
|
|
|
115
156
|
previousMessages = [],
|
|
116
157
|
overrideModel = null,
|
|
117
158
|
onEvent = null,
|
|
159
|
+
signal,
|
|
118
160
|
}) {
|
|
119
161
|
if (!isSuperAgentEnabled(globalConfig)) {
|
|
120
162
|
throw new Error("super-agent not enabled (set super_agent.enabled and .model in ~/.apx/config.json)");
|
|
@@ -187,6 +229,7 @@ export async function runSuperAgent({
|
|
|
187
229
|
const trace = [];
|
|
188
230
|
let totalUsage = { input_tokens: 0, output_tokens: 0 };
|
|
189
231
|
let lastText = "";
|
|
232
|
+
let usePseudoTools = false;
|
|
190
233
|
|
|
191
234
|
for (let iter = 0; iter < MAX_TOOL_ITERS; iter++) {
|
|
192
235
|
await emitProgress(onEvent, { type: "model_start", iteration: iter + 1 });
|
|
@@ -195,15 +238,38 @@ export async function runSuperAgent({
|
|
|
195
238
|
// acting on an action request. On later iterations (after tool results
|
|
196
239
|
// have been fed back) tool_choice is "auto" so the model can produce its
|
|
197
240
|
// final text summary.
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
241
|
+
let result;
|
|
242
|
+
try {
|
|
243
|
+
result = await callEngine({
|
|
244
|
+
modelId: activeModel,
|
|
245
|
+
system: usePseudoTools ? pseudoToolSystem(system) : system,
|
|
246
|
+
messages: conversation,
|
|
247
|
+
config: globalConfig,
|
|
248
|
+
tools: usePseudoTools ? null : TOOL_SCHEMAS,
|
|
249
|
+
toolChoice: usePseudoTools ? null : (iter === 0 ? "required" : "auto"),
|
|
250
|
+
maxTokens: 1024,
|
|
251
|
+
signal,
|
|
252
|
+
});
|
|
253
|
+
} catch (e) {
|
|
254
|
+
if (usePseudoTools && /^ollama:/i.test(String(activeModel || "")) && /ollama\s+500/i.test(String(e?.message || "")) && trace.length > 0) {
|
|
255
|
+
await emitProgress(onEvent, { type: "model_retry", reason: "ollama_final_response_500", iteration: iter + 1 });
|
|
256
|
+
lastText = fallbackFinalText(trace, e);
|
|
257
|
+
break;
|
|
258
|
+
}
|
|
259
|
+
if (!shouldRetryWithPseudoTools(activeModel, e, usePseudoTools)) throw e;
|
|
260
|
+
usePseudoTools = true;
|
|
261
|
+
await emitProgress(onEvent, { type: "model_retry", reason: "ollama_structured_tools_500", iteration: iter + 1 });
|
|
262
|
+
result = await callEngine({
|
|
263
|
+
modelId: activeModel,
|
|
264
|
+
system: pseudoToolSystem(system),
|
|
265
|
+
messages: conversation,
|
|
266
|
+
config: globalConfig,
|
|
267
|
+
tools: null,
|
|
268
|
+
toolChoice: null,
|
|
269
|
+
maxTokens: 1024,
|
|
270
|
+
signal,
|
|
271
|
+
});
|
|
272
|
+
}
|
|
207
273
|
totalUsage.input_tokens += result.usage?.input_tokens || 0;
|
|
208
274
|
totalUsage.output_tokens += result.usage?.output_tokens || 0;
|
|
209
275
|
lastText = result.text || "";
|
|
@@ -317,3 +383,25 @@ function summarizeForTrace(r) {
|
|
|
317
383
|
if (s.length <= 400) return r;
|
|
318
384
|
return s.slice(0, 380) + "…(truncated)";
|
|
319
385
|
}
|
|
386
|
+
|
|
387
|
+
/**
 * Compose a plain-text final answer for the case where tools ran but the
 * model failed while writing its closing message.
 *
 * The text contains a fixed notice, the engine error (first 220 characters)
 * and one bullet per trace entry for the last eight entries at most.
 *
 * @param {Array<{tool: *, result: *}>} trace executed tool calls
 * @param {*} error the engine failure
 * @returns {string} newline-joined summary
 */
function fallbackFinalText(trace, error) {
  const errorText = String(error?.message || error).slice(0, 220);
  const header = [
    "Tool execution completed, but the model failed while composing the final answer.",
    `Engine error: ${errorText}`,
    "Trace:",
  ];
  const bullets = trace
    .slice(-8)
    .map((item) => `- ${item.tool}: ${previewTraceResult(item.result)}`);
  return header.concat(bullets).join("\n");
}
|
|
398
|
+
|
|
399
|
+
/**
 * Render a tool result as a short preview of at most 180 characters, plus an
 * "error: " prefix where applicable.
 *
 * Precedence: null/undefined → "ok"; strings are clipped; otherwise the first
 * truthy field among `error`, `path`, `content`, `results` is shown; failing
 * that, the whole value is serialized as JSON.
 *
 * This runs on the error-fallback path, so it must never throw: values that
 * JSON cannot serialize (cyclic structures, BigInt) or for which
 * `JSON.stringify` returns `undefined` (functions, symbols) are handled
 * explicitly.
 *
 * @param {*} result tool result of any type
 * @returns {string} preview text
 */
function previewTraceResult(result) {
  const LIMIT = 180;
  const clip = (value) => String(value).slice(0, LIMIT);
  const toJson = (value) => {
    try {
      const serialized = JSON.stringify(value);
      // JSON.stringify yields undefined for functions/symbols/undefined.
      return typeof serialized === "string" ? serialized : String(value);
    } catch {
      return "[unserializable]";
    }
  };

  if (result === null || result === undefined) return "ok";
  if (typeof result === "string") return result.slice(0, LIMIT);
  if (result.error) return `error: ${clip(result.error)}`;
  if (result.path) return clip(result.path);
  if (result.content) return clip(result.content);
  if (result.results) return clip(toJson(result.results));
  return clip(toJson(result));
}
|
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
// daemon/transcription.js
|
|
2
2
|
// Audio transcription dispatcher. Two backends:
|
|
3
3
|
//
|
|
4
|
-
// - LOCAL (faster-whisper via Python
|
|
5
|
-
//
|
|
6
|
-
//
|
|
7
|
-
// `pip3 install faster-whisper` on the host.
|
|
4
|
+
// - LOCAL (faster-whisper via persistent Python server) — the server loads
|
|
5
|
+
// the model once on first use and keeps it in RAM. It auto-shuts down after
|
|
6
|
+
// idle_minutes (default 10) of inactivity, then restarts lazily on the
|
|
7
|
+
// next request. Requires `pip3 install faster-whisper` on the host.
|
|
8
8
|
//
|
|
9
9
|
// - OPENAI (Whisper-1 cloud API) — needs OPENAI_API_KEY or
|
|
10
10
|
// engines.openai.api_key in config.
|
|
@@ -13,31 +13,36 @@
|
|
|
13
13
|
// "transcription": {
|
|
14
14
|
// "provider": "auto" | "local" | "openai", // default "auto"
|
|
15
15
|
// "local": {
|
|
16
|
-
// "model": "
|
|
17
|
-
// "device": "cpu",
|
|
18
|
-
// "compute_type": "int8",
|
|
19
|
-
// "language": "auto",
|
|
20
|
-
// "beam_size": 5
|
|
16
|
+
// "model": "small", // tiny | base | small | medium | large | large-v2 | large-v3
|
|
17
|
+
// "device": "cpu", // cpu | cuda
|
|
18
|
+
// "compute_type": "int8", // int8 | int8_float16 | float16 | float32
|
|
19
|
+
// "language": "auto", // ISO 639-1 code (e.g. "es") or "auto"
|
|
20
|
+
// "beam_size": 5,
|
|
21
|
+
// "idle_minutes": 10 // auto-shutdown after N minutes idle
|
|
21
22
|
// }
|
|
22
23
|
// }
|
|
23
24
|
//
|
|
24
25
|
// "auto" tries local first; on failure falls back to openai.
|
|
26
|
+
//
|
|
27
|
+
// Spanish tip: set language: "es" for better accuracy with the small model.
|
|
25
28
|
|
|
26
29
|
import fs from "node:fs";
|
|
27
30
|
import path from "node:path";
|
|
28
|
-
import {
|
|
31
|
+
import { spawn } from "node:child_process";
|
|
29
32
|
import { fileURLToPath } from "node:url";
|
|
30
33
|
|
|
31
|
-
const __filename
|
|
32
|
-
const __dirname
|
|
33
|
-
const
|
|
34
|
+
const __filename = fileURLToPath(import.meta.url);
|
|
35
|
+
const __dirname = path.dirname(__filename);
|
|
36
|
+
const WHISPER_SERVER = path.join(__dirname, "whisper-server.py");
|
|
37
|
+
const WHISPER_PORT = 18765;
|
|
34
38
|
|
|
35
39
|
const DEFAULT_LOCAL = {
|
|
36
|
-
model: "
|
|
40
|
+
model: "small",
|
|
37
41
|
device: "cpu",
|
|
38
42
|
compute_type: "int8",
|
|
39
43
|
language: "auto",
|
|
40
44
|
beam_size: 5,
|
|
45
|
+
idle_minutes: 10,
|
|
41
46
|
};
|
|
42
47
|
|
|
43
48
|
// ---------------------------------------------------------------------------
|
|
@@ -65,45 +70,145 @@ async function getConfig() {
|
|
|
65
70
|
}
|
|
66
71
|
|
|
67
72
|
// ---------------------------------------------------------------------------
|
|
68
|
-
//
|
|
73
|
+
// Persistent server management
|
|
69
74
|
// ---------------------------------------------------------------------------
|
|
70
75
|
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
76
|
+
let _serverProcess = null;
|
|
77
|
+
let _serverModel = null; // model the running server was started with
|
|
78
|
+
|
|
79
|
+
/**
 * Promise-based delay.
 *
 * @param {number} ms delay in milliseconds
 * @returns {Promise<void>} resolves once the timer fires
 */
function _sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
|
|
82
|
+
|
|
83
|
+
/**
 * Probe the local whisper server's /health endpoint.
 *
 * @returns {Promise<boolean>} true when the request completes with an OK
 *   status within 800 ms; false on any failure (refused, timeout, non-2xx).
 */
async function _isServerHealthy() {
  const healthUrl = `http://127.0.0.1:${WHISPER_PORT}/health`;
  try {
    const { ok } = await fetch(healthUrl, { signal: AbortSignal.timeout(800) });
    return ok;
  } catch {
    // Any network error or timeout simply means "not healthy".
    return false;
  }
}
|
|
93
|
+
|
|
94
|
+
/**
 * Poll the health endpoint every 250 ms until the server answers.
 *
 * @param {number} [maxMs=15000] total time budget in milliseconds
 * @returns {Promise<void>} resolves as soon as a health probe succeeds
 * @throws {Error} when the budget elapses without a successful probe
 */
async function _waitForServer(maxMs = 15_000) {
  const giveUpAt = Date.now() + maxMs;
  for (;;) {
    if (Date.now() >= giveUpAt) {
      break;
    }
    const healthy = await _isServerHealthy();
    if (healthy) {
      return;
    }
    await _sleep(250);
  }
  throw new Error(`whisper-server did not start within ${maxMs}ms`);
}
|
|
102
|
+
|
|
103
|
+
/**
 * Make sure a whisper-server.py child process is running with the requested
 * model, spawning (or respawning) it when necessary.
 *
 * Behavior:
 *  - If a tracked process exists for the same model and answers /health,
 *    nothing is done.
 *  - If it does not answer, it is assumed to have exited (e.g. idle
 *    shutdown) and the tracking state is cleared.
 *  - If a tracked process exists for a different model, it is killed first.
 *  - A new `python3 whisper-server.py` is spawned and the first line it
 *    prints on stdout is awaited (a JSON status message; `status: "error"`
 *    rejects, anything else counts as ready).
 *
 * Failure handling: a spawn failure (for example `python3` not being
 * installed) is reported through the child's 'error' event. Without a
 * listener Node raises it as an unhandled error, so it is caught here and
 * turned into a rejection. On any startup failure the child is killed and
 * the tracking state is reset so the next call starts from scratch.
 *
 * @param {object} opts local transcription options
 *   (`model`, `device`, `compute_type`, `idle_minutes`)
 * @returns {Promise<void>} resolves once the server reported its first status line
 * @throws {Error} on spawn failure, early exit, reported error, or a 15 s timeout
 */
async function ensureWhisperServer(opts) {
  const model = opts.model || DEFAULT_LOCAL.model;

  // Already running with the right model — health-check to confirm still alive.
  if (_serverProcess && _serverModel === model) {
    if (await _isServerHealthy()) return;
    // Process died (idle shutdown). Fall through to restart.
    _serverProcess = null;
    _serverModel = null;
  }

  // Wrong model: kill old server and start fresh.
  if (_serverProcess) {
    try { _serverProcess.kill(); } catch {}
    _serverProcess = null;
    _serverModel = null;
    await _sleep(300);
  }

  const args = [
    WHISPER_SERVER,
    "--port", String(WHISPER_PORT),
    "--model", model,
    "--device", String(opts.device || DEFAULT_LOCAL.device),
    "--compute-type", String(opts.compute_type || DEFAULT_LOCAL.compute_type),
    "--idle-minutes", String(opts.idle_minutes ?? DEFAULT_LOCAL.idle_minutes),
  ];

  const proc = spawn("python3", args, {
    stdio: ["ignore", "pipe", "inherit"],
    detached: false,
  });

  _serverProcess = proc;
  _serverModel = model;

  // Drop the tracking state, but only if it still refers to this child.
  const forget = () => {
    if (_serverProcess === proc) {
      _serverProcess = null;
      _serverModel = null;
    }
  };
  proc.on("exit", forget);
  // Keeps a late 'error' (after startup) from becoming an unhandled event.
  proc.on("error", forget);

  try {
    // Wait for the first status line on stdout.
    await new Promise((resolve, reject) => {
      let settled = false;
      let timer = null;
      const settle = (err) => {
        if (settled) return;
        settled = true;
        clearTimeout(timer);
        if (err) reject(err);
        else resolve();
      };

      timer = setTimeout(
        () => settle(new Error("whisper-server startup timed out (15s)")),
        15_000
      );

      let pending = "";
      proc.stdout.on("data", (chunk) => {
        // The listener stays attached so the pipe keeps draining; output
        // after the first line is ignored.
        if (settled) return;
        pending += chunk.toString();
        const nl = pending.indexOf("\n");
        if (nl === -1) return;
        const line = pending.slice(0, nl).trim();

        let msg = null;
        try {
          msg = JSON.parse(line);
        } catch {
          // unexpected line but server is up
        }
        if (msg && msg.status === "error") {
          settle(new Error(msg.error || "whisper-server error"));
          return;
        }
        settle(); // "ready"
      });

      proc.on("error", (err) => {
        settle(new Error(`whisper-server failed to start: ${err?.message || err}`));
      });
      proc.on("exit", (code) => {
        settle(new Error(`whisper-server exited (code ${code}) before becoming ready`));
      });
    });
  } catch (err) {
    // Do not leave a half-started child (or stale state) behind.
    try { proc.kill(); } catch {}
    forget();
    throw err;
  }
}
|
|
106
174
|
|
|
175
|
+
// ---------------------------------------------------------------------------
|
|
176
|
+
// Local backend (persistent whisper-server.py via HTTP)
|
|
177
|
+
// ---------------------------------------------------------------------------
|
|
178
|
+
|
|
179
|
+
/**
 * Transcribe an audio file through the local persistent whisper server.
 *
 * Ensures the server is running, then POSTs the file path to /transcribe
 * (5 minute timeout). A language of "auto" is sent as `null`.
 *
 * @param {string} filePath path of the audio file, as seen by the server process
 * @param {object} opts local options (`model`, `language`, `beam_size`, ...)
 * @returns {Promise<object>} `{ ok, backend: "local", text, language,
 *   language_probability, duration, model, compute_type }`
 * @throws {Error} when the server reports `ok: false` or the request fails
 */
async function transcribeLocal(filePath, opts) {
  await ensureWhisperServer(opts);

  const requestedLanguage = opts.language || DEFAULT_LOCAL.language;
  let language = null;
  if (requestedLanguage !== "auto") {
    language = opts.language || null;
  }

  const payload = {
    audio_path: filePath,
    language,
    beam_size: opts.beam_size || DEFAULT_LOCAL.beam_size,
  };

  const response = await fetch(`http://127.0.0.1:${WHISPER_PORT}/transcribe`, {
    method: "POST",
    headers: { "content-type": "application/json" },
    body: JSON.stringify(payload),
    signal: AbortSignal.timeout(5 * 60_000),
  });

  const reply = await response.json();
  if (!reply.ok) {
    throw new Error(reply.error || "transcription failed");
  }

  return {
    ok: true,
    backend: "local",
    text: reply.text || "",
    language: reply.language || null,
    language_probability: reply.language_probability ?? null,
    duration: reply.duration ?? null,
    model: reply.model,
    compute_type: reply.compute_type,
  };
}
|
|
211
|
+
|
|
107
212
|
// ---------------------------------------------------------------------------
|
|
108
213
|
// OpenAI backend (Whisper-1 cloud)
|
|
109
214
|
// ---------------------------------------------------------------------------
|
|
@@ -154,7 +259,7 @@ async function transcribeOpenAI(filePath, apiKey) {
|
|
|
154
259
|
* Returns { ok, backend, text, language?, language_probability?, duration?, model? }.
|
|
155
260
|
*
|
|
156
261
|
* @param {string} filePath absolute path to audio file
|
|
157
|
-
* @param {object} overrides optional: { provider, model, language, ... }
|
|
262
|
+
* @param {object} overrides optional: { provider, model, language, idle_minutes, ... }
|
|
158
263
|
*/
|
|
159
264
|
export async function transcribe(filePath, overrides = {}) {
|
|
160
265
|
if (!filePath || !fs.existsSync(filePath)) {
|
|
@@ -189,5 +294,6 @@ export async function transcribe(filePath, overrides = {}) {
|
|
|
189
294
|
// ---------------------------------------------------------------------------
|
|
190
295
|
|
|
191
296
|
export const TRANSCRIPTION_PATHS = {
|
|
192
|
-
|
|
297
|
+
whisper_server: WHISPER_SERVER,
|
|
298
|
+
port: WHISPER_PORT,
|
|
193
299
|
};
|
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Persistent Whisper transcription server for APX.
|
|
4
|
+
|
|
5
|
+
Loads the model once on the first /transcribe request and keeps it in RAM.
|
|
6
|
+
Auto-shuts down after --idle-minutes of inactivity so it doesn't consume
|
|
7
|
+
memory permanently when not in use.
|
|
8
|
+
|
|
9
|
+
Started automatically by APX daemon via transcription.js. Do not run manually.
|
|
10
|
+
|
|
11
|
+
Endpoints:
|
|
12
|
+
GET /health → { ok, model, loaded }
|
|
13
|
+
POST /transcribe ← { audio_path, language?, beam_size? }
|
|
14
|
+
→ { ok, text, language, language_probability, duration, model, compute_type }
|
|
15
|
+
POST /shutdown → graceful stop
|
|
16
|
+
"""
|
|
17
|
+
import argparse
|
|
18
|
+
import json
|
|
19
|
+
import os
|
|
20
|
+
import sys
|
|
21
|
+
import threading
|
|
22
|
+
import time
|
|
23
|
+
from http.server import BaseHTTPRequestHandler, HTTPServer
|
|
24
|
+
|
|
25
|
+
# ---------------------------------------------------------------------------
|
|
26
|
+
# State
|
|
27
|
+
# ---------------------------------------------------------------------------
|
|
28
|
+
|
|
29
|
+
_model = None
|
|
30
|
+
_model_name = None
|
|
31
|
+
_model_lock = threading.Lock()
|
|
32
|
+
_last_used = time.monotonic()
|
|
33
|
+
_idle_seconds = 10 * 60
|
|
34
|
+
_server_ref = None
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _touch():
    """Mark the server as used right now by refreshing ``_last_used``.

    The value is a ``time.monotonic()`` reading, which the idle watchdog
    compares against the current monotonic time.
    """
    global _last_used
    now = time.monotonic()
    _last_used = now
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _load_model_if_needed(model_name, device, compute_type):
    """Return the cached Whisper model, loading it on first use.

    The cache is keyed by ``model_name`` only: a cached model with the same
    name is returned as-is, whatever ``device`` or ``compute_type`` is passed.

    Args:
        model_name: model identifier passed to ``WhisperModel``.
        device: device string passed to ``WhisperModel``.
        compute_type: quantization/compute type passed to ``WhisperModel``.

    Returns:
        The loaded ``WhisperModel`` instance.

    Raises:
        ImportError: if ``faster_whisper`` is not installed.
    """
    global _model, _model_name
    cached = _model
    if cached is None or _model_name != model_name:
        # Imported lazily so the server can start (and answer /health)
        # without the dependency being importable.
        from faster_whisper import WhisperModel

        cached = WhisperModel(
            model_name,
            device=device,
            compute_type=compute_type,
            cpu_threads=os.cpu_count() or 4,
        )
        _model = cached
        _model_name = model_name
    return cached
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
# ---------------------------------------------------------------------------
|
|
55
|
+
# HTTP handler
|
|
56
|
+
# ---------------------------------------------------------------------------
|
|
57
|
+
|
|
58
|
+
class _Handler(BaseHTTPRequestHandler):
    """HTTP request handler for the persistent Whisper server.

    Routes:
        GET  /health      -> ``{ok, model, loaded}``
        POST /transcribe  -> transcription result or ``{ok: False, error}``
        POST /shutdown    -> ``{ok: True}``, then stops the server

    Every response is a JSON object. Unknown paths get a 404 JSON error.
    """

    # Model configuration shared by all requests; main() overwrites these
    # from the command-line flags before the server starts.
    model_name = "small"
    device = "cpu"
    compute_type = "int8"

    def log_message(self, fmt, *args):
        pass  # suppress access log; APX daemon handles its own logging

    def _send_json(self, code, body):
        """Send ``body`` as a JSON response with HTTP status ``code``."""
        data = json.dumps(body).encode()
        self.send_response(code)
        self.send_header("Content-Type", "application/json")
        self.send_header("Content-Length", str(len(data)))
        self.end_headers()
        self.wfile.write(data)

    def _read_body(self):
        """Return the request body parsed as a JSON object.

        Best-effort by design: a missing, non-numeric or non-positive
        Content-Length, an unreadable or malformed body, and JSON that is not
        an object all yield ``{}``, so callers can always use ``.get``.
        """
        try:
            length = int(self.headers.get("Content-Length", 0))
        except (TypeError, ValueError):
            return {}
        if length <= 0:
            return {}
        try:
            parsed = json.loads(self.rfile.read(length))
        except Exception:
            return {}
        # Valid JSON may still be a list/number/string; only objects are usable.
        return parsed if isinstance(parsed, dict) else {}

    def do_GET(self):
        """Serve GET /health; any other path is a 404."""
        if self.path == "/health":
            _touch()
            self._send_json(200, {
                "ok": True,
                "model": _model_name or _Handler.model_name,
                "loaded": _model is not None,
            })
        else:
            self._send_json(404, {"ok": False, "error": "not found"})

    def do_POST(self):
        """Dispatch POST /transcribe and POST /shutdown; otherwise 404."""
        req = self._read_body()

        if self.path == "/transcribe":
            self._handle_transcribe(req)
        elif self.path == "/shutdown":
            self._send_json(200, {"ok": True})
            if _server_ref:
                # shutdown() blocks until serve_forever() returns, so it must
                # not run on the thread that is serving this request.
                threading.Thread(target=_server_ref.shutdown, daemon=True).start()
        else:
            self._send_json(404, {"ok": False, "error": "not found"})

    def _handle_transcribe(self, req):
        """Validate a /transcribe request, run the model, send the result.

        Args:
            req: parsed request body with ``audio_path`` and optional
                ``language`` / ``beam_size``.
        """
        _touch()
        audio_path = req.get("audio_path", "")
        language = req.get("language") or None  # None → auto-detect

        raw_beam = req.get("beam_size", 5)
        try:
            beam_size = int(5 if raw_beam is None else raw_beam)
        except (TypeError, ValueError):
            beam_size = 0
        if beam_size < 1:
            self._send_json(400, {"ok": False, "error": f"invalid beam_size: {raw_beam!r}"})
            return

        # Require a string: os.path.exists() would treat an int as a file
        # descriptor and could report True for it.
        if not isinstance(audio_path, str) or not audio_path or not os.path.exists(audio_path):
            self._send_json(400, {"ok": False, "error": f"file not found: {audio_path}"})
            return

        try:
            with _model_lock:
                try:
                    m = _load_model_if_needed(_Handler.model_name, _Handler.device, _Handler.compute_type)
                except ImportError:
                    self._send_json(500, {
                        "ok": False,
                        "error": "faster-whisper not installed — run: pip3 install faster-whisper",
                    })
                    return
                except Exception as e:
                    self._send_json(500, {"ok": False, "error": f"model load failed: {e}"})
                    return

                try:
                    segments, info = m.transcribe(audio_path, beam_size=beam_size, language=language)
                    text = " ".join(seg.text.strip() for seg in segments).strip()
                    self._send_json(200, {
                        "ok": True,
                        "text": text,
                        "language": info.language,
                        "language_probability": round(info.language_probability, 4),
                        "duration": round(info.duration, 2),
                        "model": _model_name,
                        "compute_type": _Handler.compute_type,
                    })
                except Exception as e:
                    self._send_json(500, {"ok": False, "error": f"transcription failed: {e}"})
        finally:
            # Count idleness from the end of the work, not its start, so a
            # long model load or transcription is not followed by an
            # immediate idle shutdown.
            _touch()
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
# ---------------------------------------------------------------------------
|
|
145
|
+
# Idle watchdog
|
|
146
|
+
# ---------------------------------------------------------------------------
|
|
147
|
+
|
|
148
|
+
def _watchdog(idle_seconds):
    """Stop the server once it has been idle for longer than ``idle_seconds``.

    Wakes every 30 seconds and compares the monotonic clock against
    ``_last_used``. When the limit is exceeded it logs to stderr, calls
    ``shutdown()`` on the server (if one is registered) and returns.

    Args:
        idle_seconds: allowed idle time in seconds.
    """
    while True:
        time.sleep(30)
        idle = time.monotonic() - _last_used
        if idle <= idle_seconds:
            continue
        print(
            f"[whisper-server] idle {int(idle)}s > {idle_seconds}s — shutting down",
            file=sys.stderr,
            flush=True,
        )
        if _server_ref:
            _server_ref.shutdown()
        return
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
# ---------------------------------------------------------------------------
|
|
164
|
+
# Main
|
|
165
|
+
# ---------------------------------------------------------------------------
|
|
166
|
+
|
|
167
|
+
def main():
    """Parse CLI flags, bind the HTTP server, and serve until shut down.

    Protocol with the parent process (stdout, one JSON line):
      - ``{"status": "error", "error": ...}`` and exit code 1 if the port
        cannot be bound;
      - ``{"status": "ready", "port", "model", "idle_minutes"}`` once the
        socket is bound, just before request handling starts.

    The model is not loaded here; it is loaded by the first /transcribe
    request.
    """
    global _server_ref, _idle_seconds

    parser = argparse.ArgumentParser(description="Persistent APX Whisper server")
    parser.add_argument("--port", type=int, default=18765)
    parser.add_argument("--model", default="small")
    parser.add_argument("--device", default="cpu")
    parser.add_argument("--compute-type", dest="compute_type", default="int8")
    parser.add_argument("--idle-minutes", dest="idle_minutes", type=int, default=10)
    args = parser.parse_args()

    _Handler.model_name = args.model
    _Handler.device = args.device
    _Handler.compute_type = args.compute_type
    _idle_seconds = args.idle_minutes * 60

    try:
        _server_ref = HTTPServer(("127.0.0.1", args.port), _Handler)
    except OSError as e:
        print(json.dumps({"status": "error", "error": str(e)}), flush=True)
        sys.exit(1)

    # Signal readiness to the Node.js parent before serve_forever blocks.
    print(json.dumps({
        "status": "ready",
        "port": args.port,
        "model": args.model,
        "idle_minutes": args.idle_minutes,
    }), flush=True)

    threading.Thread(target=_watchdog, args=(_idle_seconds,), daemon=True).start()
    try:
        _server_ref.serve_forever()
    finally:
        # serve_forever() returns after shutdown(); release the listening
        # socket explicitly rather than relying on process teardown.
        _server_ref.server_close()
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
if __name__ == "__main__":
|
|
202
|
+
main()
|
|
@@ -42,7 +42,9 @@ def main() -> int:
|
|
|
42
42
|
return 1
|
|
43
43
|
|
|
44
44
|
try:
|
|
45
|
-
|
|
45
|
+
import multiprocessing
|
|
46
|
+
threads = os.cpu_count() or 4
|
|
47
|
+
model = WhisperModel(args.model, device=args.device, compute_type=args.compute_type, cpu_threads=threads)
|
|
46
48
|
except Exception as e:
|
|
47
49
|
print(json.dumps({"ok": False, "error": f"failed to load model '{args.model}': {e}"}))
|
|
48
50
|
return 1
|