@agentprojectcontext/apx 1.13.1 → 1.14.0

package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
    "name": "@agentprojectcontext/apx",
-   "version": "1.13.1",
+   "version": "1.14.0",
    "description": "APX — unified CLI + daemon for the Agent Project Context (APC) standard.",
    "publishConfig": {
      "access": "public"
@@ -36,6 +36,7 @@ import { stripThinking } from "../thinking.js";
  import { getRecentTelegramTurnsFromFs, appendGlobalMessage } from "../../core/messages-store.js";
  import { readAgents } from "../../core/parser.js";
  import { buildAgentSystem } from "../../core/agent-system.js";
+ import { transcribe as transcribeAudioFile } from "../transcription.js";
 
  const API_BASE = "https://api.telegram.org";
  const nowIso = () => new Date().toISOString().replace(/\.\d{3}Z$/, "Z");
@@ -131,47 +132,9 @@ export async function sendAudio(token, chatId, audio, { caption, title, performe
    return json.result;
  }
 
- /**
-  * Transcribe an audio file via OpenAI Whisper.
-  * Reads OPENAI_API_KEY from env or engines.openai.api_key in ~/.apx/config.json.
-  * Returns the transcribed text, or throws if no key / API failure.
-  */
- async function transcribeAudio(filePath) {
-   let apiKey = process.env.OPENAI_API_KEY;
-   if (!apiKey) {
-     try {
-       const { readConfig } = await import("../../core/config.js");
-       apiKey = readConfig()?.engines?.openai?.api_key || "";
-     } catch { /* ignore */ }
-   }
-   if (!apiKey) throw new Error("OPENAI_API_KEY not set (env or engines.openai.api_key)");
-
-   const fileBuf = fs.readFileSync(filePath);
-   const ext = path.extname(filePath).slice(1).toLowerCase() || "ogg";
-   const mimeMap = {
-     oga: "audio/ogg", ogg: "audio/ogg", opus: "audio/ogg",
-     mp3: "audio/mpeg", m4a: "audio/mp4", mp4: "audio/mp4",
-     wav: "audio/wav", webm: "audio/webm",
-   };
-   const mime = mimeMap[ext] || "audio/ogg";
-   const blob = new Blob([fileBuf], { type: mime });
-
-   const form = new FormData();
-   form.append("file", blob, `audio.${ext}`);
-   form.append("model", "whisper-1");
-
-   const res = await fetch("https://api.openai.com/v1/audio/transcriptions", {
-     method: "POST",
-     headers: { Authorization: `Bearer ${apiKey}` },
-     body: form,
-   });
-   if (!res.ok) {
-     const err = await res.text().catch(() => "");
-     throw new Error(`Whisper ${res.status}: ${err.slice(0, 200)}`);
-   }
-   const json = await res.json();
-   return String(json.text || "").trim();
- }
+ // Audio transcription is delegated to the central dispatcher
+ // (../transcription.js) which handles local (faster-whisper via Python) +
+ // OpenAI cloud fallback. See that module for config keys.
 
  /**
   * Download a file from Telegram servers.
@@ -444,6 +407,7 @@ class ChannelPoller {
    let localPath = null;
    let transcript = "";
    let transcribeError = null;
+   let transcribeBackend = null;
    try {
      localPath = await downloadTelegramFile(token, incomingAudio.file_id, mediaDir);
      this.log(`telegram[${this.channel.name}] audio saved: ${localPath}`);
@@ -452,8 +416,10 @@ class ChannelPoller {
    }
    if (localPath) {
      try {
-       transcript = await transcribeAudio(localPath);
-       this.log(`telegram[${this.channel.name}] audio transcribed (${transcript.length} chars)`);
+       const result = await transcribeAudioFile(localPath);
+       transcript = result.text || "";
+       transcribeBackend = result.backend;
+       this.log(`telegram[${this.channel.name}] audio transcribed via ${transcribeBackend} (${transcript.length} chars, lang=${result.language || "?"})`);
      } catch (e) {
        transcribeError = e.message;
        this.log(`telegram[${this.channel.name}] audio transcription failed: ${e.message}`);
@@ -480,6 +446,7 @@ class ChannelPoller {
        file_id: incomingAudio.file_id,
        duration: incomingAudio.duration,
        mime_type: incomingAudio.mime_type,
+       transcription_backend: transcribeBackend,
        transcription_error: transcribeError,
      },
    });
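
For reference, the stored audio message's meta object now records which backend produced the transcript alongside any error. A sketch of the resulting payload (field names are from the hunk above; values are illustrative):

// Illustrative meta payload for a transcribed voice note (values are made up):
meta: {
  file_id: "BAADBQAD...",           // Telegram file_id (hypothetical, truncated)
  duration: 12,                     // seconds, as reported by Telegram
  mime_type: "audio/ogg",
  transcription_backend: "local",   // "local" | "openai", or null if transcription failed
  transcription_error: null,        // error message string on failure, else null
}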
@@ -21,6 +21,7 @@ import setPermissionMode from "./tools/set-permission-mode.js";
  import searchFiles from "./tools/search-files.js";
  import listSkills from "./tools/list-skills.js";
  import loadSkill from "./tools/load-skill.js";
+ import transcribeAudio from "./tools/transcribe-audio.js";
  import { createPermissionGuard } from "./helpers.js";
  import { buildBridgedTools, DEFAULT_CATEGORIES } from "./registry-bridge.js";
 
@@ -48,6 +49,7 @@ const NATIVE_TOOLS = [
    searchFiles,
    listSkills,
    loadSkill,
+   transcribeAudio,
  ];
 
  // Registry-backed bridges. Categories can be overridden per-process via env
@@ -0,0 +1,61 @@
+ import fs from "node:fs";
+ import os from "node:os";
+ import path from "node:path";
+ import crypto from "node:crypto";
+ import { transcribe } from "../../transcription.js";
+
+ export default {
+   name: "transcribe_audio",
+   schema: {
+     type: "function",
+     function: {
+       name: "transcribe_audio",
+       description:
+         "Transcribe an audio file to text. Default backend is local faster-whisper (model 'medium' on CPU with int8 quantization), with automatic fallback to OpenAI Whisper API if local fails. Pass file_path for a file on disk, or base64 for raw audio bytes (will be written to a temp file). Override provider/model/language as needed.",
+       parameters: {
+         type: "object",
+         properties: {
+           file_path: { type: "string", description: "absolute path to audio file (.ogg, .mp3, .m4a, .wav, .webm, .opus)" },
+           base64: { type: "string", description: "alternative to file_path — raw base64 audio bytes (or 'data:audio/...;base64,...' data URI)" },
+           format: { type: "string", description: "file extension hint when using base64 (default 'ogg')" },
+           provider: { type: "string", description: "override the configured provider: 'auto' | 'local' | 'openai'" },
+           model: { type: "string", description: "local model size: tiny | base | small | medium | large | large-v2 | large-v3 (default medium)" },
+           language: { type: "string", description: "ISO 639-1 code (e.g. 'es', 'en') or 'auto' for detection" },
+           device: { type: "string", description: "local device: cpu | cuda (default cpu)" },
+           compute_type: { type: "string", description: "local quantization: int8 | int8_float16 | float16 | float32 (default int8)" },
+         },
+       },
+     },
+   },
+   makeHandler: () => async ({ file_path, base64, format = "ogg", provider, model, language, device, compute_type } = {}) => {
+     if (!file_path && !base64) throw new Error("transcribe_audio: file_path or base64 required");
+
+     let pathToUse = file_path;
+     let cleanupTmp = false;
+
+     if (!pathToUse && base64) {
+       const clean = String(base64).replace(/^data:audio\/[a-z]+;base64,/, "");
+       const buf = Buffer.from(clean, "base64");
+       const tmpDir = path.join(os.tmpdir(), "apx-transcribe");
+       fs.mkdirSync(tmpDir, { recursive: true });
+       const id = crypto.randomBytes(6).toString("hex");
+       pathToUse = path.join(tmpDir, `audio-${id}.${String(format).replace(/^\./, "") || "ogg"}`);
+       fs.writeFileSync(pathToUse, buf);
+       cleanupTmp = true;
+     }
+
+     try {
+       const overrides = {};
+       if (provider) overrides.provider = provider;
+       if (model) overrides.model = model;
+       if (language) overrides.language = language;
+       if (device) overrides.device = device;
+       if (compute_type) overrides.compute_type = compute_type;
+       return await transcribe(pathToUse, overrides);
+     } finally {
+       if (cleanupTmp) {
+         try { fs.unlinkSync(pathToUse); } catch { /* ignore */ }
+       }
+     }
+   },
+ };
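
For context, a minimal sketch of driving the new tool handler directly, outside the agent loop. The import path and audio file below are illustrative, not from the package:

// Hypothetical call site: exercise the tool handler without the agent runtime.
import transcribeAudio from "./tools/transcribe-audio.js"; // path is illustrative

const handler = transcribeAudio.makeHandler();
const result = await handler({ file_path: "/tmp/voice-note.ogg", language: "es" });
console.log(result.backend, result.text); // e.g. "local" plus the transcribed text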
package/daemon/transcription.js ADDED
@@ -0,0 +1,193 @@
+ // daemon/transcription.js
+ // Audio transcription dispatcher. Two backends:
+ //
+ // - LOCAL (faster-whisper via Python subprocess) — ported from Panda's
+ //   transcription_service.py. Same defaults: model "medium", device "cpu",
+ //   compute_type "int8", beam_size 5, auto language detection. Requires
+ //   `pip3 install faster-whisper` on the host.
+ //
+ // - OPENAI (Whisper-1 cloud API) — needs OPENAI_API_KEY or
+ //   engines.openai.api_key in config.
+ //
+ // Provider selection in ~/.apx/config.json:
+ //   "transcription": {
+ //     "provider": "auto" | "local" | "openai",  // default "auto"
+ //     "local": {
+ //       "model": "medium",       // tiny | base | small | medium | large | large-v2 | large-v3
+ //       "device": "cpu",         // cpu | cuda
+ //       "compute_type": "int8",  // int8 | int8_float16 | float16 | float32
+ //       "language": "auto",      // ISO 639-1 code or "auto"
+ //       "beam_size": 5
+ //     }
+ //   }
+ //
+ // "auto" tries local first; on failure falls back to openai.
+
+ import fs from "node:fs";
+ import path from "node:path";
+ import { execFile } from "node:child_process";
+ import { fileURLToPath } from "node:url";
+
+ const __filename = fileURLToPath(import.meta.url);
+ const __dirname = path.dirname(__filename);
+ const PYTHON_HELPER = path.join(__dirname, "whisper-transcribe.py");
+
+ const DEFAULT_LOCAL = {
+   model: "medium",
+   device: "cpu",
+   compute_type: "int8",
+   language: "auto",
+   beam_size: 5,
+ };
+
+ // ---------------------------------------------------------------------------
+ // Config
+ // ---------------------------------------------------------------------------
+
+ async function getConfig() {
+   try {
+     const { readConfig } = await import("../core/config.js");
+     const cfg = readConfig() || {};
+     const t = cfg.transcription || {};
+     const openaiKey = cfg.engines?.openai?.api_key || process.env.OPENAI_API_KEY || "";
+     return {
+       provider: t.provider || "auto",
+       local: { ...DEFAULT_LOCAL, ...(t.local || {}) },
+       openaiKey,
+     };
+   } catch {
+     return {
+       provider: "auto",
+       local: { ...DEFAULT_LOCAL },
+       openaiKey: process.env.OPENAI_API_KEY || "",
+     };
+   }
+ }
+
+ // ---------------------------------------------------------------------------
+ // Local backend (Python + faster-whisper)
+ // ---------------------------------------------------------------------------
+
+ function transcribeLocal(filePath, opts) {
+   return new Promise((resolve, reject) => {
+     const args = [
+       PYTHON_HELPER,
+       filePath,
+       "--model", String(opts.model || DEFAULT_LOCAL.model),
+       "--language", String(opts.language || DEFAULT_LOCAL.language),
+       "--device", String(opts.device || DEFAULT_LOCAL.device),
+       "--compute-type", String(opts.compute_type || DEFAULT_LOCAL.compute_type),
+       "--beam-size", String(opts.beam_size || DEFAULT_LOCAL.beam_size),
+     ];
+     execFile("python3", args, { maxBuffer: 16 * 1024 * 1024, timeout: 5 * 60_000 }, (err, stdout, stderr) => {
+       if (err) {
+         const tail = (stderr || err.message || "").slice(-300);
+         return reject(new Error(`local transcription failed: ${tail}`));
+       }
+       let parsed;
+       try { parsed = JSON.parse(String(stdout).trim().split("\n").pop()); }
+       catch {
+         return reject(new Error(`could not parse helper output: ${stdout.slice(0, 300)}`));
+       }
+       if (!parsed.ok) return reject(new Error(parsed.error || "unknown local transcription error"));
+       resolve({
+         ok: true,
+         backend: "local",
+         text: parsed.text || "",
+         language: parsed.language || null,
+         language_probability: parsed.language_probability ?? null,
+         duration: parsed.duration ?? null,
+         model: parsed.model,
+         compute_type: parsed.compute_type,
+       });
+     });
+   });
+ }
+
+ // ---------------------------------------------------------------------------
+ // OpenAI backend (Whisper-1 cloud)
+ // ---------------------------------------------------------------------------
+
+ async function transcribeOpenAI(filePath, apiKey) {
+   if (!apiKey) throw new Error("OPENAI_API_KEY not set (env or engines.openai.api_key)");
+
+   const fileBuf = fs.readFileSync(filePath);
+   const ext = path.extname(filePath).slice(1).toLowerCase() || "ogg";
+   const mimeMap = {
+     oga: "audio/ogg", ogg: "audio/ogg", opus: "audio/ogg",
+     mp3: "audio/mpeg", m4a: "audio/mp4", mp4: "audio/mp4",
+     wav: "audio/wav", webm: "audio/webm",
+   };
+   const blob = new Blob([fileBuf], { type: mimeMap[ext] || "audio/ogg" });
+
+   const form = new FormData();
+   form.append("file", blob, `audio.${ext}`);
+   form.append("model", "whisper-1");
+
+   const res = await fetch("https://api.openai.com/v1/audio/transcriptions", {
+     method: "POST",
+     headers: { Authorization: `Bearer ${apiKey}` },
+     body: form,
+   });
+   if (!res.ok) {
+     const err = await res.text().catch(() => "");
+     throw new Error(`Whisper API ${res.status}: ${err.slice(0, 200)}`);
+   }
+   const json = await res.json();
+   return {
+     ok: true,
+     backend: "openai",
+     text: String(json.text || "").trim(),
+     language: null,
+     language_probability: null,
+     duration: null,
+     model: "whisper-1",
+   };
+ }
+
+ // ---------------------------------------------------------------------------
+ // Public API
+ // ---------------------------------------------------------------------------
+
+ /**
+  * Transcribe an audio file using the configured backend.
+  * Returns { ok, backend, text, language?, language_probability?, duration?, model? }.
+  *
+  * @param {string} filePath absolute path to audio file
+  * @param {object} overrides optional: { provider, model, language, ... }
+  */
+ export async function transcribe(filePath, overrides = {}) {
+   if (!filePath || !fs.existsSync(filePath)) {
+     throw new Error(`transcribe: file not found: ${filePath}`);
+   }
+   const cfg = await getConfig();
+   const provider = overrides.provider || cfg.provider;
+   const localOpts = { ...cfg.local, ...overrides };
+
+   if (provider === "openai") {
+     return transcribeOpenAI(filePath, cfg.openaiKey);
+   }
+   if (provider === "local") {
+     return transcribeLocal(filePath, localOpts);
+   }
+
+   // auto: local first, fall back to openai
+   try {
+     return await transcribeLocal(filePath, localOpts);
+   } catch (localErr) {
+     if (!cfg.openaiKey) {
+       throw new Error(
+         `local transcription failed and no OpenAI fallback available: ${localErr.message}`
+       );
+     }
+     return transcribeOpenAI(filePath, cfg.openaiKey);
+   }
+ }
+
+ // ---------------------------------------------------------------------------
+ // Diagnostics
+ // ---------------------------------------------------------------------------
+
+ export const TRANSCRIPTION_PATHS = {
+   python_helper: PYTHON_HELPER,
+ };
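
Given the config keys documented in the module's header comment, a ~/.apx/config.json fragment that pins the local backend might look like this (a sketch; the model and language values are illustrative):

{
  "transcription": {
    "provider": "local",
    "local": { "model": "small", "language": "es", "beam_size": 5 }
  }
}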
package/daemon/whisper-transcribe.py ADDED
@@ -0,0 +1,71 @@
+ #!/usr/bin/env python3
+ """
+ Local audio transcription via faster-whisper. Mirrors the implementation in
+ the Panda project (transcription_service.py): same default model "medium",
+ device cpu, compute_type int8, beam_size 5. (Panda keeps a lazy singleton
+ model cache; this helper loads the model once per subprocess invocation.)
+
+ Invoked by the APX daemon (Node) as a subprocess. Args:
+   whisper-transcribe.py <audio_path> [--model medium] [--language auto] [--device cpu] [--compute-type int8] [--beam-size 5]
+
+ Outputs JSON on stdout:
+   { "ok": true, "text": "...", "language": "es", "language_probability": 0.98, "duration": 12.4 }
+   { "ok": false, "error": "..." }
+ """
+ import argparse
+ import json
+ import os
+ import sys
+
+
+ def main() -> int:
+     parser = argparse.ArgumentParser()
+     parser.add_argument("audio_path")
+     parser.add_argument("--model", default="medium")
+     parser.add_argument("--language", default="auto")
+     parser.add_argument("--device", default="cpu")
+     parser.add_argument("--compute-type", dest="compute_type", default="int8")
+     parser.add_argument("--beam-size", dest="beam_size", type=int, default=5)
+     args = parser.parse_args()
+
+     if not os.path.exists(args.audio_path):
+         print(json.dumps({"ok": False, "error": f"file not found: {args.audio_path}"}))
+         return 1
+
+     try:
+         from faster_whisper import WhisperModel
+     except ImportError as e:
+         print(json.dumps({
+             "ok": False,
+             "error": "faster-whisper not installed. Run: pip3 install faster-whisper",
+             "import_error": str(e),
+         }))
+         return 1
+
+     try:
+         model = WhisperModel(args.model, device=args.device, compute_type=args.compute_type)
+     except Exception as e:
+         print(json.dumps({"ok": False, "error": f"failed to load model '{args.model}': {e}"}))
+         return 1
+
+     language = None if args.language == "auto" else args.language
+
+     try:
+         segments, info = model.transcribe(args.audio_path, beam_size=args.beam_size, language=language)
+         text = " ".join(seg.text.strip() for seg in segments).strip()
+         print(json.dumps({
+             "ok": True,
+             "text": text,
+             "language": info.language,
+             "language_probability": round(info.language_probability, 4),
+             "duration": round(info.duration, 2),
+             "model": args.model,
+             "compute_type": args.compute_type,
+         }))
+         return 0
+     except Exception as e:
+         print(json.dumps({"ok": False, "error": f"transcription failed: {e}"}))
+         return 1
+
+
+ if __name__ == "__main__":
+     sys.exit(main())
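
The helper can also be run by hand for debugging. Per its docstring, an invocation and its stdout look roughly like this (the file path and transcript are illustrative):

$ python3 whisper-transcribe.py /tmp/voice-note.ogg --model small
{"ok": true, "text": "hello, how are you?", "language": "en", "language_probability": 0.98, "duration": 3.2, "model": "small", "compute_type": "int8"}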