@agentprojectcontext/apx 1.15.0 → 1.15.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +9 -2
- package/src/cli/commands/daemon.js +39 -7
- package/src/cli/commands/setup.js +31 -16
- package/src/core/config.js +4 -0
- package/src/daemon/super-agent-tools/tools/transcribe-audio.js +2 -2
- package/src/daemon/transcription.js +162 -49
- package/src/daemon/whisper-server.py +202 -0
- package/src/daemon/whisper-transcribe.py +3 -1
package/package.json
CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "@agentprojectcontext/apx",
-  "version": "1.15.0",
+  "version": "1.15.2",
   "description": "APX — unified CLI + daemon for the Agent Project Context (APC) standard.",
   "publishConfig": {
     "access": "public"
@@ -24,10 +24,11 @@
     "start": "node src/daemon/index.js",
     "smoke": "node src/daemon/smoke.js",
     "test": "node --test --test-reporter=spec tests/*.test.js",
-    "upgrade": "
+    "upgrade": "pnpm install && pnpm add -g .",
     "prepack": "node scripts/sync-apc-skill.js",
     "postinstall": "node src/cli/postinstall.js"
   },
+  "packageManager": "pnpm@10.25.0",
   "dependencies": {
     "@modelcontextprotocol/sdk": "^1.29.0",
     "chalk": "^5.6.2",
@@ -55,6 +56,12 @@
     "claude",
     "ai"
   ],
+  "pnpm": {
+    "onlyBuiltDependencies": [
+      "better-sqlite3",
+      "puppeteer"
+    ]
+  },
   "repository": {
     "type": "git",
     "url": "https://github.com/agentprojectcontext/apx.git"
package/src/cli/commands/daemon.js
CHANGED

@@ -124,25 +124,57 @@ export async function cmdDaemonStop(args = {}) {
   }
 }
 
-export function cmdDaemonLogs(args) {
+export async function cmdDaemonLogs(args) {
   const debug = args.flags?.debug;
+  const follow = args.flags?.follow || args.flags?.f;
+
   if (!fs.existsSync(LOG_PATH)) {
     console.log(fmt.gray(` (no log file at ${LOG_PATH})`));
     return;
   }
-
-  const
+
+  const tail = args.flags?.tail ? parseInt(args.flags.tail, 10) : 50;
+  const content = fs.readFileSync(LOG_PATH, "utf8");
+  const lines = content.split("\n");
   const slice = lines.slice(-tail - 1).filter(Boolean);
 
   if (debug) console.log(fmt.gray(` log: ${LOG_PATH} (last ${tail} lines)\n`));
 
-
-  // dim timestamps, highlight ERROR/WARN
+  const printLine = (line) => {
     const colored = line
       .replace(/^(\d{4}-\d\d-\d\dT[\d:.Z]+)/, (m) => fmt.gray(m))
       .replace(/\bERROR\b/g, fmt.red("ERROR"))
-      .replace(/\bWARN\b/g,
-      .replace(/\bINFO\b/g,
+      .replace(/\bWARN\b/g, fmt.yellow("WARN"))
+      .replace(/\bINFO\b/g, fmt.cyan("INFO"));
     console.log(colored);
+  };
+
+  for (const line of slice) {
+    printLine(line);
+  }
+
+  if (follow) {
+    let currentSize = fs.statSync(LOG_PATH).size;
+    fs.watch(LOG_PATH, (event) => {
+      if (event === "change") {
+        const newSize = fs.statSync(LOG_PATH).size;
+        if (newSize > currentSize) {
+          const stream = fs.createReadStream(LOG_PATH, {
+            start: currentSize,
+            end: newSize - 1,
+          });
+          stream.on("data", (chunk) => {
+            const lines = chunk.toString().split("\n").filter(Boolean);
+            for (const l of lines) printLine(l);
+          });
+          currentSize = newSize;
+        } else if (newSize < currentSize) {
+          // File truncated or rotated
+          currentSize = newSize;
+        }
+      }
+    });
+    // Keep process alive
+    return new Promise(() => {});
   }
 }
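
The `--follow` branch above tails the file by byte offset: each fs.watch change event reads only the bytes appended since the last read, so nothing is re-printed, and a shrinking file is treated as a truncation or rotation. A minimal standalone sketch of the same pattern, assuming a hypothetical /tmp/app.log and plain console.log in place of the package's fmt colorizer:

    import fs from "node:fs";

    const LOG = "/tmp/app.log"; // hypothetical path, not from the package
    let offset = fs.statSync(LOG).size; // start at end-of-file, like `tail -f`

    fs.watch(LOG, (event) => {
      if (event !== "change") return;
      const size = fs.statSync(LOG).size;
      if (size < offset) { offset = size; return; } // truncated/rotated: resync
      if (size === offset) return;                  // no new bytes
      fs.createReadStream(LOG, { start: offset, end: size - 1 }).on("data", (chunk) => {
        for (const line of chunk.toString().split("\n").filter(Boolean)) console.log(line);
      });
      offset = size;
    });

One caveat shared with the command above: a read can end mid-line if it lands between two writes, so a tailer that must never split lines would buffer the trailing partial line instead of printing each chunk directly.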
package/src/cli/commands/setup.js
CHANGED

@@ -192,9 +192,20 @@ export async function cmdSetup() {
   // ── Language ────────────────────────────────────────────────────────────────
   console.log();
   console.log(b(" Language:"));
-  console.log(di("
+  console.log(di("   Used for audio transcription, super-agent replies, and Telegram messages."));
+  console.log(di("   Enter a 2-letter ISO 639-1 code. Common codes:"));
+  console.log(di("     es=Spanish   en=English   pt=Portuguese   fr=French   de=German"));
+  console.log(di("     it=Italian   zh=Chinese   ja=Japanese    ko=Korean    ar=Arabic"));
   console.log();
-
+  let language = "";
+  while (!language) {
+    const raw = (await ask("   Language code [en]: ")).trim().toLowerCase() || "en";
+    if (/^[a-z]{2}$/.test(raw)) {
+      language = raw;
+    } else {
+      console.log(`   ${c.yellow}Please enter exactly 2 letters (e.g. es, en, pt).${c.reset}`);
+    }
+  }
 
   // ── Summary ─────────────────────────────────────────────────────────────────
   console.log();
@@ -236,6 +247,8 @@ export async function cmdSetup() {
     cfg.telegram.chat_id = chatId;
   }
 
+  cfg.user = { ...(cfg.user || {}), language };
+
   writeConfig(cfg);
   console.log(`\n ${gr("✓")} Config saved to ${di("~/.apx/config.json")}`);
 
@@ -282,7 +295,7 @@ export async function cmdSetup() {
 async function sendTelegramWakeup({ botToken, chatId, language, model }) {
   const prompt =
     `You are APX, an AI agent assistant that just came online for the first time. ` +
-    `Write a short, enthusiastic wake-up message in ${language}. ` +
+    `Write a short, enthusiastic wake-up message in the language with ISO 639-1 code "${language}". ` +
     `Structure it in exactly 3 short lines: ` +
     `1) An energetic line announcing you are online (use ⚡ emoji). ` +
     `2) Say you don't have a name yet and ask the user what they'd like to call you. ` +
@@ -321,18 +334,20 @@ async function sendTelegramWakeup({ botToken, chatId, language, model }) {
   });
 }
 
-// Minimal fallback messages per
+// Minimal fallback messages per ISO 639-1 code (used only if daemon can't respond)
+const WAKEUP_FALLBACK = {
+  es: "⚡ ¡APX está en línea y listo!\nAún no tengo nombre — ¿cómo te gustaría llamarme?\n¿Y a vos, cómo te llamo?",
+  pt: "⚡ APX está online e pronto!\nAinda não tenho nome — como você gostaria de me chamar?\nE você, como devo te chamar?",
+  fr: "⚡ APX est en ligne et prêt !\nJe n'ai pas encore de nom — comment souhaitez-vous m'appeler ?\nEt vous, comment dois-je vous appeler ?",
+  de: "⚡ APX ist online und bereit!\nIch habe noch keinen Namen — wie möchten Sie mich nennen?\nUnd Sie, wie soll ich Sie nennen?",
+  it: "⚡ APX è online e pronto!\nNon ho ancora un nome — come vorresti chiamarmi?\nE tu, come ti chiamo?",
+  zh: "⚡ APX 已上线,随时待命!\n我还没有名字——你想叫我什么?\n你希望我怎么称呼你?",
+  ja: "⚡ APXがオンラインになりました!\nまだ名前がありません — 何と呼びたいですか?\nあなたのことは何とお呼びすればよいですか?",
+  ko: "⚡ APX가 온라인 상태입니다!\n아직 이름이 없어요 — 어떻게 불러주실 건가요?\n그리고 당신은 어떻게 불러드릴까요?",
+};
 function languageFallback(lang) {
-
-
-
-
-    return "⚡ APX is online and ready to work.\nI do not have a name yet. What would you like to call me?\nAnd what should I call you?";
-  if (/franc|french/i.test(l))
-    return "⚡ APX is online and ready to work.\nI do not have a name yet. What would you like to call me?\nAnd what should I call you?";
-  if (/deutsch|german/i.test(l))
-    return "⚡ APX is online and ready to work.\nI do not have a name yet. What would you like to call me?\nAnd what should I call you?";
-  if (/ital/i.test(l))
-    return "⚡ APX is online and ready to work.\nI do not have a name yet. What would you like to call me?\nAnd what should I call you?";
-  return "⚡ I'm awake and ready to go! APX is online.\nI don't have a name yet — what would you like to call me?\nAnd you, what's your name or what should I call you?";
+  return (
+    WAKEUP_FALLBACK[lang.toLowerCase().slice(0, 2)] ||
+    "⚡ I'm awake and ready to go! APX is online.\nI don't have a name yet — what would you like to call me?\nAnd you, what's your name or what should I call you?"
+  );
 }
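
The rewritten languageFallback replaces the old chain of locale-name regexes with a table lookup keyed on the first two letters of the stored code, so any BCP 47-style value degrades gracefully. Expected behavior, assuming the WAKEUP_FALLBACK table above:

    languageFallback("es");    // Spanish entry from WAKEUP_FALLBACK
    languageFallback("PT-BR"); // lowercased and sliced to "pt": Portuguese entry
    languageFallback("nl");    // no "nl" key: generic English default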
package/src/core/config.js
CHANGED

@@ -32,6 +32,9 @@ const DEFAULT_CONFIG = {
   host: "127.0.0.1",
   log_level: "info",
   projects: [],
+  user: {
+    language: "en", // ISO 639-1 two-letter code; used by transcription, super-agent, and wake-up message
+  },
   telegram: {
     enabled: false,
     bot_token: "",
@@ -87,6 +90,7 @@ export function mergeDefaults(cfg) {
   return {
     ...DEFAULT_CONFIG,
     ...cfg,
+    user: { ...DEFAULT_CONFIG.user, ...(cfg.user || {}) },
     telegram: {
       ...DEFAULT_CONFIG.telegram,
       ...(cfg.telegram || {}),
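
The extra user line in mergeDefaults matters because a top-level spread replaces nested objects wholesale. A standalone sketch of the failure mode it prevents, using only the shapes shown above:

    const DEFAULT_CONFIG = { user: { language: "en" } };
    const cfg = { user: {} }; // e.g. a config written before any language was chosen

    // Naive merge: cfg.user wins as a whole object, dropping the nested default.
    const naive = { ...DEFAULT_CONFIG, ...cfg };
    naive.user.language; // undefined

    // Per-key merge, as the patch does (mirroring the existing telegram handling):
    const merged = { ...DEFAULT_CONFIG, ...cfg, user: { ...DEFAULT_CONFIG.user, ...(cfg.user || {}) } };
    merged.user.language; // "en"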
package/src/daemon/super-agent-tools/tools/transcribe-audio.js
CHANGED

@@ -11,7 +11,7 @@ export default {
   function: {
     name: "transcribe_audio",
     description:
-      "Transcribe an audio file to text. Default backend is local faster-whisper (model '
+      "Transcribe an audio file to text. Default backend is local faster-whisper (model 'small' on CPU with int8 quantization, persistent server to avoid reload overhead), with automatic fallback to OpenAI Whisper API if local fails. Pass file_path for a file on disk, or base64 for raw audio bytes (will be written to a temp file). Override provider/model/language as needed.",
     parameters: {
       type: "object",
       properties: {
@@ -19,7 +19,7 @@ export default {
         base64: { type: "string", description: "alternative to file_path — raw base64 audio bytes (or 'data:audio/...;base64,...' data URI)" },
         format: { type: "string", description: "file extension hint when using base64 (default 'ogg')" },
         provider: { type: "string", description: "override the configured provider: 'auto' | 'local' | 'openai'" },
-        model: { type: "string", description: "local model size: tiny | base | small | medium | large | large-v2 | large-v3 (default
+        model: { type: "string", description: "local model size: tiny | base | small | medium | large | large-v2 | large-v3 (default small)" },
         language: { type: "string", description: "ISO 639-1 code (e.g. 'es', 'en') or 'auto' for detection" },
         device: { type: "string", description: "local device: cpu | cuda (default cpu)" },
         compute_type: { type: "string", description: "local quantization: int8 | int8_float16 | float16 | float32 (default int8)" },
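
For orientation, a call that exercises the base64 path described by this schema might look like the following (hypothetical payload, not taken from the package's tests):

    // Arguments for the transcribe_audio tool. Per the schema, base64 audio is
    // written to a temp file (extension from `format`) before dispatch.
    const args = {
      base64: "data:audio/ogg;base64,T2dnUw…", // truncated data URI
      format: "ogg",
      provider: "auto", // local first, OpenAI Whisper API on failure
      language: "es",   // skip auto-detection
    };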
package/src/daemon/transcription.js
CHANGED

@@ -1,10 +1,10 @@
 // daemon/transcription.js
 // Audio transcription dispatcher. Two backends:
 //
-// - LOCAL (faster-whisper via Python
-//
-//
-//   `pip3 install faster-whisper` on the host.
+// - LOCAL (faster-whisper via persistent Python server) — the server loads
+//   the model once on first use and keeps it in RAM. It auto-shuts down after
+//   idle_minutes (default 10) of inactivity, then restarts lazily on the
+//   next request. Requires `pip3 install faster-whisper` on the host.
 //
 // - OPENAI (Whisper-1 cloud API) — needs OPENAI_API_KEY or
 //   engines.openai.api_key in config.
@@ -13,31 +13,36 @@
 // "transcription": {
 //   "provider": "auto" | "local" | "openai",   // default "auto"
 //   "local": {
-//     "model": "
-//     "device": "cpu",
-//     "compute_type": "int8",
-//     "language": "auto",
-//     "beam_size": 5
+//     "model": "small",        // tiny | base | small | medium | large | large-v2 | large-v3
+//     "device": "cpu",         // cpu | cuda
+//     "compute_type": "int8",  // int8 | int8_float16 | float16 | float32
+//     "language": "auto",      // ISO 639-1 code (e.g. "es") or "auto"
+//     "beam_size": 5,
+//     "idle_minutes": 10       // auto-shutdown after N minutes idle
 //   }
 // }
 //
 // "auto" tries local first; on failure falls back to openai.
+//
+// Spanish tip: set language: "es" for better accuracy with the small model.
 
 import fs from "node:fs";
 import path from "node:path";
-import {
+import { spawn } from "node:child_process";
 import { fileURLToPath } from "node:url";
 
-const __filename
-const __dirname
-const
+const __filename = fileURLToPath(import.meta.url);
+const __dirname = path.dirname(__filename);
+const WHISPER_SERVER = path.join(__dirname, "whisper-server.py");
+const WHISPER_PORT = 18765;
 
 const DEFAULT_LOCAL = {
-  model: "
+  model: "small",
   device: "cpu",
   compute_type: "int8",
   language: "auto",
   beam_size: 5,
+  idle_minutes: 10,
 };
 
 // ---------------------------------------------------------------------------
@@ -50,9 +55,16 @@ async function getConfig() {
     const cfg = readConfig() || {};
     const t = cfg.transcription || {};
     const openaiKey = cfg.engines?.openai?.api_key || process.env.OPENAI_API_KEY || "";
+    // Use user.language as default for transcription language if not explicitly set.
+    // Explicit transcription.local.language always wins; "auto" means fall back to user.language.
+    const userLang = cfg.user?.language || "";
+    const localBase = { ...DEFAULT_LOCAL, ...(t.local || {}) };
+    if ((!localBase.language || localBase.language === "auto") && userLang) {
+      localBase.language = userLang;
+    }
    return {
       provider: t.provider || "auto",
-      local:
+      local: localBase,
       openaiKey,
     };
   } catch {
@@ -65,45 +77,145 @@ async function getConfig() {
 }
 
 // ---------------------------------------------------------------------------
-//
+// Persistent server management
 // ---------------------------------------------------------------------------
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+let _serverProcess = null;
+let _serverModel = null; // model the running server was started with
+
+function _sleep(ms) {
+  return new Promise((r) => setTimeout(r, ms));
+}
+
+async function _isServerHealthy() {
+  try {
+    const res = await fetch(`http://127.0.0.1:${WHISPER_PORT}/health`, {
+      signal: AbortSignal.timeout(800),
+    });
+    return res.ok;
+  } catch {
+    return false;
+  }
+}
+
+async function _waitForServer(maxMs = 15_000) {
+  const deadline = Date.now() + maxMs;
+  while (Date.now() < deadline) {
+    if (await _isServerHealthy()) return;
+    await _sleep(250);
+  }
+  throw new Error(`whisper-server did not start within ${maxMs}ms`);
+}
+
+async function ensureWhisperServer(opts) {
+  const model = opts.model || DEFAULT_LOCAL.model;
+
+  // Already running with the right model — health-check to confirm still alive.
+  if (_serverProcess && _serverModel === model) {
+    if (await _isServerHealthy()) return;
+    // Process died (idle shutdown). Fall through to restart.
+    _serverProcess = null;
+    _serverModel = null;
+  }
+
+  // Wrong model: kill old server and start fresh.
+  if (_serverProcess) {
+    try { _serverProcess.kill(); } catch {}
+    _serverProcess = null;
+    _serverModel = null;
+    await _sleep(300);
+  }
+
+  const args = [
+    WHISPER_SERVER,
+    "--port", String(WHISPER_PORT),
+    "--model", model,
+    "--device", String(opts.device || DEFAULT_LOCAL.device),
+    "--compute-type", String(opts.compute_type || DEFAULT_LOCAL.compute_type),
+    "--idle-minutes", String(opts.idle_minutes ?? DEFAULT_LOCAL.idle_minutes),
+  ];
+
+  const proc = spawn("python3", args, {
+    stdio: ["ignore", "pipe", "inherit"],
+    detached: false,
+  });
+
+  _serverProcess = proc;
+  _serverModel = model;
+
+  proc.on("exit", () => {
+    if (_serverProcess === proc) {
+      _serverProcess = null;
+      _serverModel = null;
+    }
+  });
+
+  // Wait for the "ready" line on stdout, then wait for HTTP to respond.
+  await new Promise((resolve, reject) => {
+    const timeout = setTimeout(
+      () => reject(new Error("whisper-server startup timed out (15s)")),
+      15_000
+    );
+    let buf = "";
+    proc.stdout.on("data", (chunk) => {
+      buf += chunk.toString();
+      const nl = buf.indexOf("\n");
+      if (nl === -1) return;
+      const line = buf.slice(0, nl).trim();
+      buf = buf.slice(nl + 1);
+      clearTimeout(timeout);
+      try {
+        const msg = JSON.parse(line);
+        if (msg.status === "error") return reject(new Error(msg.error || "whisper-server error"));
+        resolve(); // "ready"
+      } catch {
+        resolve(); // unexpected line but server is up
       }
-
-
-
-
-      text: parsed.text || "",
-      language: parsed.language || null,
-      language_probability: parsed.language_probability ?? null,
-      duration: parsed.duration ?? null,
-      model: parsed.model,
-      compute_type: parsed.compute_type,
-    });
+    });
+    proc.on("exit", (code) => {
+      clearTimeout(timeout);
+      reject(new Error(`whisper-server exited (code ${code}) before becoming ready`));
    });
   });
 }
 
+// ---------------------------------------------------------------------------
+// Local backend (persistent whisper-server.py via HTTP)
+// ---------------------------------------------------------------------------
+
+async function transcribeLocal(filePath, opts) {
+  await ensureWhisperServer(opts);
+
+  const language = (opts.language || DEFAULT_LOCAL.language) === "auto"
+    ? null
+    : (opts.language || null);
+
+  const res = await fetch(`http://127.0.0.1:${WHISPER_PORT}/transcribe`, {
+    method: "POST",
+    headers: { "content-type": "application/json" },
+    body: JSON.stringify({
+      audio_path: filePath,
+      language,
+      beam_size: opts.beam_size || DEFAULT_LOCAL.beam_size,
+    }),
+    signal: AbortSignal.timeout(5 * 60_000),
+  });
+
+  const json = await res.json();
+  if (!json.ok) throw new Error(json.error || "transcription failed");
+
+  return {
+    ok: true,
+    backend: "local",
+    text: json.text || "",
+    language: json.language || null,
+    language_probability: json.language_probability ?? null,
+    duration: json.duration ?? null,
+    model: json.model,
+    compute_type: json.compute_type,
+  };
+}
+
 // ---------------------------------------------------------------------------
 // OpenAI backend (Whisper-1 cloud)
 // ---------------------------------------------------------------------------
@@ -154,7 +266,7 @@ async function transcribeOpenAI(filePath, apiKey) {
  * Returns { ok, backend, text, language?, language_probability?, duration?, model? }.
  *
  * @param {string} filePath   absolute path to audio file
- * @param {object} overrides  optional: { provider, model, language, ... }
+ * @param {object} overrides  optional: { provider, model, language, idle_minutes, ... }
  */
 export async function transcribe(filePath, overrides = {}) {
   if (!filePath || !fs.existsSync(filePath)) {
@@ -189,5 +301,6 @@ export async function transcribe(filePath, overrides = {}) {
 // ---------------------------------------------------------------------------
 
 export const TRANSCRIPTION_PATHS = {
-
+  whisper_server: WHISPER_SERVER,
+  port: WHISPER_PORT,
 };
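
End to end, callers go through the exported transcribe entry point and never talk to the Python server directly: the first call pays the model-load cost, and later calls inside the idle window reuse the warm server. A usage sketch with a hypothetical file path:

    import { transcribe } from "./transcription.js";

    const result = await transcribe("/tmp/voice-note.ogg", { language: "es" });
    if (result.ok) {
      console.log(`[${result.backend}] ${result.text}`);
      console.log(`detected ${result.language} (p=${result.language_probability})`);
    }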
package/src/daemon/whisper-server.py
ADDED

@@ -0,0 +1,202 @@
+#!/usr/bin/env python3
+"""
+Persistent Whisper transcription server for APX.
+
+Loads the model once on the first /transcribe request and keeps it in RAM.
+Auto-shuts down after --idle-minutes of inactivity so it doesn't consume
+memory permanently when not in use.
+
+Started automatically by APX daemon via transcription.js. Do not run manually.
+
+Endpoints:
+  GET  /health      → { ok, model, loaded }
+  POST /transcribe  ← { audio_path, language?, beam_size? }
+                    → { ok, text, language, language_probability, duration, model, compute_type }
+  POST /shutdown    → graceful stop
+"""
+import argparse
+import json
+import os
+import sys
+import threading
+import time
+from http.server import BaseHTTPRequestHandler, HTTPServer
+
+# ---------------------------------------------------------------------------
+# State
+# ---------------------------------------------------------------------------
+
+_model = None
+_model_name = None
+_model_lock = threading.Lock()
+_last_used = time.monotonic()
+_idle_seconds = 10 * 60
+_server_ref = None
+
+
+def _touch():
+    global _last_used
+    _last_used = time.monotonic()
+
+
+def _load_model_if_needed(model_name, device, compute_type):
+    global _model, _model_name
+    if _model is not None and _model_name == model_name:
+        return _model
+    from faster_whisper import WhisperModel
+    threads = os.cpu_count() or 4
+    m = WhisperModel(model_name, device=device, compute_type=compute_type, cpu_threads=threads)
+    _model = m
+    _model_name = model_name
+    return m
+
+
+# ---------------------------------------------------------------------------
+# HTTP handler
+# ---------------------------------------------------------------------------
+
+class _Handler(BaseHTTPRequestHandler):
+    model_name = "small"
+    device = "cpu"
+    compute_type = "int8"
+
+    def log_message(self, fmt, *args):
+        pass  # suppress access log; APX daemon handles its own logging
+
+    def _send_json(self, code, body):
+        data = json.dumps(body).encode()
+        self.send_response(code)
+        self.send_header("Content-Type", "application/json")
+        self.send_header("Content-Length", str(len(data)))
+        self.end_headers()
+        self.wfile.write(data)
+
+    def _read_body(self):
+        n = int(self.headers.get("Content-Length", 0))
+        if n <= 0:
+            return {}
+        try:
+            return json.loads(self.rfile.read(n))
+        except Exception:
+            return {}
+
+    def do_GET(self):
+        if self.path == "/health":
+            _touch()
+            self._send_json(200, {
+                "ok": True,
+                "model": _model_name or _Handler.model_name,
+                "loaded": _model is not None,
+            })
+        else:
+            self._send_json(404, {"ok": False, "error": "not found"})
+
+    def do_POST(self):
+        req = self._read_body()
+
+        if self.path == "/transcribe":
+            _touch()
+            audio_path = req.get("audio_path", "")
+            language = req.get("language") or None  # None → auto-detect
+            beam_size = int(req.get("beam_size", 5))
+
+            if not audio_path or not os.path.exists(audio_path):
+                self._send_json(400, {"ok": False, "error": f"file not found: {audio_path}"})
+                return
+
+            with _model_lock:
+                try:
+                    m = _load_model_if_needed(_Handler.model_name, _Handler.device, _Handler.compute_type)
+                except ImportError:
+                    self._send_json(500, {
+                        "ok": False,
+                        "error": "faster-whisper not installed — run: pip3 install faster-whisper",
+                    })
+                    return
+                except Exception as e:
+                    self._send_json(500, {"ok": False, "error": f"model load failed: {e}"})
+                    return
+
+                try:
+                    segments, info = m.transcribe(audio_path, beam_size=beam_size, language=language)
+                    text = " ".join(seg.text.strip() for seg in segments).strip()
+                    self._send_json(200, {
+                        "ok": True,
+                        "text": text,
+                        "language": info.language,
+                        "language_probability": round(info.language_probability, 4),
+                        "duration": round(info.duration, 2),
+                        "model": _model_name,
+                        "compute_type": _Handler.compute_type,
+                    })
+                except Exception as e:
+                    self._send_json(500, {"ok": False, "error": f"transcription failed: {e}"})
+
+        elif self.path == "/shutdown":
+            self._send_json(200, {"ok": True})
+            if _server_ref:
+                threading.Thread(target=_server_ref.shutdown, daemon=True).start()
+
+        else:
+            self._send_json(404, {"ok": False, "error": "not found"})
+
+
+# ---------------------------------------------------------------------------
+# Idle watchdog
+# ---------------------------------------------------------------------------
+
+def _watchdog(idle_seconds):
+    while True:
+        time.sleep(30)
+        idle = time.monotonic() - _last_used
+        if idle > idle_seconds:
+            print(
+                f"[whisper-server] idle {int(idle)}s > {idle_seconds}s — shutting down",
+                file=sys.stderr,
+                flush=True,
+            )
+            if _server_ref:
+                _server_ref.shutdown()
+            return
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+
+def main():
+    global _server_ref, _idle_seconds
+
+    parser = argparse.ArgumentParser(description="Persistent APX Whisper server")
+    parser.add_argument("--port", type=int, default=18765)
+    parser.add_argument("--model", default="small")
+    parser.add_argument("--device", default="cpu")
+    parser.add_argument("--compute-type", dest="compute_type", default="int8")
+    parser.add_argument("--idle-minutes", dest="idle_minutes", type=int, default=10)
+    args = parser.parse_args()
+
+    _Handler.model_name = args.model
+    _Handler.device = args.device
+    _Handler.compute_type = args.compute_type
+    _idle_seconds = args.idle_minutes * 60
+
+    try:
+        _server_ref = HTTPServer(("127.0.0.1", args.port), _Handler)
+    except OSError as e:
+        print(json.dumps({"status": "error", "error": str(e)}), flush=True)
+        sys.exit(1)
+
+    # Signal readiness to the Node.js parent before serve_forever blocks.
+    print(json.dumps({
+        "status": "ready",
+        "port": args.port,
+        "model": args.model,
+        "idle_minutes": args.idle_minutes,
+    }), flush=True)
+
+    threading.Thread(target=_watchdog, args=(_idle_seconds,), daemon=True).start()
+    _server_ref.serve_forever()
+
+
+if __name__ == "__main__":
+    main()
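
While debugging, the server's three endpoints can be probed directly; in normal operation only transcription.js talks to it. A sketch assuming the server is already running on the default port:

    // GET /health: "loaded" flips to true after the first transcription.
    const health = await (await fetch("http://127.0.0.1:18765/health")).json();
    // { ok: true, model: "small", loaded: false }

    // POST /transcribe: same body transcribeLocal() sends; null language = auto-detect.
    const res = await fetch("http://127.0.0.1:18765/transcribe", {
      method: "POST",
      headers: { "content-type": "application/json" },
      body: JSON.stringify({ audio_path: "/tmp/voice-note.ogg", language: null, beam_size: 5 }),
    });
    console.log(await res.json());

    // POST /shutdown: graceful stop (otherwise the idle watchdog exits after --idle-minutes).
    await fetch("http://127.0.0.1:18765/shutdown", { method: "POST" });

Note that do_GET calls _touch() on /health, so a monitoring loop that polls health will keep resetting the idle timer and effectively disable the auto-shutdown.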
package/src/daemon/whisper-transcribe.py
CHANGED

@@ -42,7 +42,9 @@ def main() -> int:
         return 1
 
     try:
-
+        import multiprocessing
+        threads = os.cpu_count() or 4
+        model = WhisperModel(args.model, device=args.device, compute_type=args.compute_type, cpu_threads=threads)
     except Exception as e:
         print(json.dumps({"ok": False, "error": f"failed to load model '{args.model}': {e}"}))
         return 1