@agentprojectcontext/apx 1.13.0 → 1.14.0

package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@agentprojectcontext/apx",
-  "version": "1.13.0",
+  "version": "1.14.0",
   "description": "APX — unified CLI + daemon for the Agent Project Context (APC) standard.",
   "publishConfig": {
     "access": "public"
@@ -36,6 +36,7 @@ import { stripThinking } from "../thinking.js";
 import { getRecentTelegramTurnsFromFs, appendGlobalMessage } from "../../core/messages-store.js";
 import { readAgents } from "../../core/parser.js";
 import { buildAgentSystem } from "../../core/agent-system.js";
+import { transcribe as transcribeAudioFile } from "../transcription.js";

 const API_BASE = "https://api.telegram.org";
 const nowIso = () => new Date().toISOString().replace(/\.\d{3}Z$/, "Z");
@@ -131,6 +132,10 @@ export async function sendAudio(token, chatId, audio, { caption, title, performe
   return json.result;
 }

+// Audio transcription is delegated to the central dispatcher
+// (../transcription.js) which handles local (faster-whisper via Python) +
+// OpenAI cloud fallback. See that module for config keys.
+
 /**
  * Download a file from Telegram servers.
  * Returns the local file path where it was saved.
@@ -389,6 +394,69 @@ class ChannelPoller {
       if (!text) return;
     }

+    // ── Incoming voice / audio handling ──────────────────────────────────
+    // Telegram sends `voice` for the press-and-hold mic recording (.oga/opus)
+    // and `audio` for uploaded audio files (mp3/m4a/etc.). Either way we
+    // download, run it through Whisper, prefix the result with `[audio] `
+    // and let the rest of the message flow handle it as plain text.
+    const incomingAudio = msg.voice || msg.audio;
+    if (incomingAudio && incomingAudio.file_id) {
+      const token = resolveBotToken(this.channel);
+      const mediaDir = path.join(APX_HOME, "media");
+      fs.mkdirSync(mediaDir, { recursive: true });
+      let localPath = null;
+      let transcript = "";
+      let transcribeError = null;
+      let transcribeBackend = null;
+      try {
+        localPath = await downloadTelegramFile(token, incomingAudio.file_id, mediaDir);
+        this.log(`telegram[${this.channel.name}] audio saved: ${localPath}`);
+      } catch (e) {
+        this.log(`telegram[${this.channel.name}] audio download failed: ${e.message}`);
+      }
+      if (localPath) {
+        try {
+          const result = await transcribeAudioFile(localPath);
+          transcript = result.text || "";
+          transcribeBackend = result.backend;
+          this.log(`telegram[${this.channel.name}] audio transcribed via ${transcribeBackend} (${transcript.length} chars, lang=${result.language || "?"})`);
+        } catch (e) {
+          transcribeError = e.message;
+          this.log(`telegram[${this.channel.name}] audio transcription failed: ${e.message}`);
+        }
+      }
+      const audioBody = transcript
+        ? `[audio] ${transcript}`
+        : `[audio] (transcription unavailable${transcribeError ? ": " + transcribeError : ""})`;
+
+      appendGlobalMessage({
+        channel: "telegram",
+        direction: "in",
+        type: "audio",
+        actor_id: msg.from?.id ? String(msg.from.id) : author,
+        external_id: String(u.update_id),
+        author,
+        body: audioBody,
+        meta: {
+          chat_id,
+          user_id: msg.from?.id || null,
+          message_id: msg.message_id,
+          tg_channel: this.channel.name,
+          local_path: localPath,
+          file_id: incomingAudio.file_id,
+          duration: incomingAudio.duration,
+          mime_type: incomingAudio.mime_type,
+          transcription_backend: transcribeBackend,
+          transcription_error: transcribeError,
+        },
+      });
+
+      // Inject the transcribed text into `text` so the rest of the agent
+      // pipeline treats it identically to a typed message. If there was a
+      // caption alongside the audio, prepend the audio marker to it.
+      text = text ? `${audioBody}\n${text}` : audioBody;
+    }
+
     // /reset or /new wipes the rolling context for this chat. We just
     // remember a marker timestamp; subsequent inbounds will only consider
     // history newer than this. Implemented by writing a synthetic message
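
Condensed, the voice-handling hunk above boils down to the following flow (a simplified sketch, not the shipped code: error handling, logging, and the appendGlobalMessage bookkeeping are omitted, and the function name handleIncomingAudio is invented here; downloadTelegramFile and transcribeAudioFile are the real names used above):

    // Sketch only: simplified from the +394 hunk above.
    async function handleIncomingAudio(msg, token, mediaDir, text) {
      const incomingAudio = msg.voice || msg.audio;      // voice note vs. uploaded audio file
      if (!incomingAudio?.file_id) return text;          // not an audio update: leave text alone
      const localPath = await downloadTelegramFile(token, incomingAudio.file_id, mediaDir);
      const { text: transcript } = await transcribeAudioFile(localPath);
      const audioBody = `[audio] ${transcript}`;         // marker the agent pipeline sees
      return text ? `${audioBody}\n${text}` : audioBody; // caption (if any) follows the audio
    }
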
@@ -21,6 +21,7 @@ import setPermissionMode from "./tools/set-permission-mode.js";
 import searchFiles from "./tools/search-files.js";
 import listSkills from "./tools/list-skills.js";
 import loadSkill from "./tools/load-skill.js";
+import transcribeAudio from "./tools/transcribe-audio.js";
 import { createPermissionGuard } from "./helpers.js";
 import { buildBridgedTools, DEFAULT_CATEGORIES } from "./registry-bridge.js";

@@ -48,6 +49,7 @@ const NATIVE_TOOLS = [
   searchFiles,
   listSkills,
   loadSkill,
+  transcribeAudio,
 ];

 // Registry-backed bridges. Categories can be overridden per-process via env
@@ -1,30 +1,54 @@
 import { confirmedProperty } from "../helpers.js";

+function decodePhoto({ photo_base64, photo_path, photo_url }) {
+  if (photo_url) return String(photo_url);
+  if (photo_path) return String(photo_path);
+  if (photo_base64) {
+    // Strip "data:image/...;base64," prefix if present
+    const clean = String(photo_base64).replace(/^data:image\/[a-z]+;base64,/, "");
+    return Buffer.from(clean, "base64");
+  }
+  return null;
+}
+
 export default {
   name: "send_telegram",
   schema: {
     type: "function",
     function: {
       name: "send_telegram",
-      description: "Send a Telegram message via the daemon's Telegram plugin.",
+      description:
+        "Send a Telegram message via the daemon's Telegram plugin. Text only by default; pass photo_base64 (from browser_screenshot) / photo_path / photo_url to attach an image — the text becomes the caption. Use this AFTER a browser_screenshot when the user asks for a screenshot or visual reply.",
       parameters: {
         type: "object",
         properties: {
-          channel: { type: "string", description: "telegram channel name; omit for default" },
-          chat_id: { type: "string", description: "destination chat id; omit to use channel default" },
-          text: { type: "string" },
-          confirmed: confirmedProperty("true only after explicit user confirmation for this exact outbound message"),
+          channel: { type: "string", description: "telegram channel name; omit for default" },
+          chat_id: { type: "string", description: "destination chat id; omit to use channel default" },
+          text: { type: "string", description: "message body (becomes the photo caption when a photo_* arg is passed)" },
+          photo_base64: { type: "string", description: "raw base64 PNG/JPG (or 'data:image/...;base64,...' data URI). Pass the `base64` field returned by browser_screenshot here." },
+          photo_path: { type: "string", description: "absolute filesystem path to an image file" },
+          photo_url: { type: "string", description: "public https URL of an image" },
+          confirmed: confirmedProperty("true only after explicit user confirmation for this exact outbound message"),
         },
         required: ["text"],
       },
     },
   },
-  makeHandler: ({ plugins, requirePermission }) => async ({ channel, chat_id, text, confirmed = false }) => {
+  makeHandler: ({ plugins, requirePermission }) => async ({ channel, chat_id, text, photo_base64, photo_path, photo_url, confirmed = false }) => {
     requirePermission("send_telegram", { dangerous: true, confirmed });
     if (!plugins) throw new Error("plugins unavailable");
     const telegram = plugins.get("telegram");
     if (!telegram) throw new Error("telegram plugin not loaded");
+
+    const photo = decodePhoto({ photo_base64, photo_path, photo_url });
+    if (photo) {
+      const result = await telegram.sendPhoto({
+        channel, chat_id, photo, caption: text, author: "apx",
+      });
+      return { ok: true, kind: "photo", message_id: result.message_id };
+    }
+
     const result = await telegram.send({ channel, chat_id, text, author: "apx" });
-    return { ok: true, message_id: result.message_id };
+    return { ok: true, kind: "text", message_id: result.message_id };
   },
 };
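
To illustrate how an agent would use the extended tool (argument values here are invented; the parameter names are exactly those in the schema above), a screenshot reply chains browser_screenshot into send_telegram:

    // Hypothetical tool-call arguments, assuming a prior browser_screenshot
    // call returned { base64: "iVBORw0KGgo..." }:
    const sendTelegramArgs = {
      chat_id: "123456789",                   // invented example id
      text: "Here is the page you asked for", // becomes the photo caption
      photo_base64: "iVBORw0KGgo...",         // the `base64` field from browser_screenshot
      confirmed: true,                        // only after explicit user confirmation
    };
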
@@ -0,0 +1,61 @@
+import fs from "node:fs";
+import os from "node:os";
+import path from "node:path";
+import crypto from "node:crypto";
+import { transcribe } from "../../transcription.js";
+
+export default {
+  name: "transcribe_audio",
+  schema: {
+    type: "function",
+    function: {
+      name: "transcribe_audio",
+      description:
+        "Transcribe an audio file to text. Default backend is local faster-whisper (model 'medium' on CPU with int8 quantization), with automatic fallback to OpenAI Whisper API if local fails. Pass file_path for a file on disk, or base64 for raw audio bytes (will be written to a temp file). Override provider/model/language as needed.",
+      parameters: {
+        type: "object",
+        properties: {
+          file_path: { type: "string", description: "absolute path to audio file (.ogg, .mp3, .m4a, .wav, .webm, .opus)" },
+          base64: { type: "string", description: "alternative to file_path — raw base64 audio bytes (or 'data:audio/...;base64,...' data URI)" },
+          format: { type: "string", description: "file extension hint when using base64 (default 'ogg')" },
+          provider: { type: "string", description: "override the configured provider: 'auto' | 'local' | 'openai'" },
+          model: { type: "string", description: "local model size: tiny | base | small | medium | large | large-v2 | large-v3 (default medium)" },
+          language: { type: "string", description: "ISO 639-1 code (e.g. 'es', 'en') or 'auto' for detection" },
+          device: { type: "string", description: "local device: cpu | cuda (default cpu)" },
+          compute_type: { type: "string", description: "local quantization: int8 | int8_float16 | float16 | float32 (default int8)" },
+        },
+      },
+    },
+  },
+  makeHandler: () => async ({ file_path, base64, format = "ogg", provider, model, language, device, compute_type } = {}) => {
+    if (!file_path && !base64) throw new Error("transcribe_audio: file_path or base64 required");
+
+    let pathToUse = file_path;
+    let cleanupTmp = false;
+
+    if (!pathToUse && base64) {
+      const clean = String(base64).replace(/^data:audio\/[a-z]+;base64,/, "");
+      const buf = Buffer.from(clean, "base64");
+      const tmpDir = path.join(os.tmpdir(), "apx-transcribe");
+      fs.mkdirSync(tmpDir, { recursive: true });
+      const id = crypto.randomBytes(6).toString("hex");
+      pathToUse = path.join(tmpDir, `audio-${id}.${String(format).replace(/^\./, "") || "ogg"}`);
+      fs.writeFileSync(pathToUse, buf);
+      cleanupTmp = true;
+    }
+
+    try {
+      const overrides = {};
+      if (provider) overrides.provider = provider;
+      if (model) overrides.model = model;
+      if (language) overrides.language = language;
+      if (device) overrides.device = device;
+      if (compute_type) overrides.compute_type = compute_type;
+      return await transcribe(pathToUse, overrides);
+    } finally {
+      if (cleanupTmp) {
+        try { fs.unlinkSync(pathToUse); } catch { /* ignore */ }
+      }
+    }
+  },
+};
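
As a usage illustration of the base64 path (all values invented; the parameter names come from the schema above), a caller can transcribe in-memory audio without a pre-existing file. The handler writes the bytes to a temp file under os.tmpdir()/apx-transcribe and unlinks it afterwards:

    // Hypothetical transcribe_audio arguments:
    const transcribeArgs = {
      base64: "data:audio/webm;base64,GkXfo...",  // data-URI prefix is stripped by the handler
      format: "webm",                             // extension hint for the temp file name
      language: "en",                             // skip auto-detection
    };
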
@@ -0,0 +1,193 @@
+// daemon/transcription.js
+// Audio transcription dispatcher. Two backends:
+//
+// - LOCAL (faster-whisper via Python subprocess) — ported from Panda's
+//   transcription_service.py. Same defaults: model "medium", device "cpu",
+//   compute_type "int8", beam_size 5, auto language detection. Requires
+//   `pip3 install faster-whisper` on the host.
+//
+// - OPENAI (Whisper-1 cloud API) — needs OPENAI_API_KEY or
+//   engines.openai.api_key in config.
+//
+// Provider selection in ~/.apx/config.json:
+//   "transcription": {
+//     "provider": "auto" | "local" | "openai",  // default "auto"
+//     "local": {
+//       "model": "medium",       // tiny | base | small | medium | large | large-v2 | large-v3
+//       "device": "cpu",         // cpu | cuda
+//       "compute_type": "int8",  // int8 | int8_float16 | float16 | float32
+//       "language": "auto",      // ISO 639-1 code or "auto"
+//       "beam_size": 5
+//     }
+//   }
+//
+// "auto" tries local first; on failure falls back to openai.
+
+import fs from "node:fs";
+import path from "node:path";
+import { execFile } from "node:child_process";
+import { fileURLToPath } from "node:url";
+
+const __filename = fileURLToPath(import.meta.url);
+const __dirname = path.dirname(__filename);
+const PYTHON_HELPER = path.join(__dirname, "whisper-transcribe.py");
+
+const DEFAULT_LOCAL = {
+  model: "medium",
+  device: "cpu",
+  compute_type: "int8",
+  language: "auto",
+  beam_size: 5,
+};
+
+// ---------------------------------------------------------------------------
+// Config
+// ---------------------------------------------------------------------------
+
+async function getConfig() {
+  try {
+    const { readConfig } = await import("../core/config.js");
+    const cfg = readConfig() || {};
+    const t = cfg.transcription || {};
+    const openaiKey = cfg.engines?.openai?.api_key || process.env.OPENAI_API_KEY || "";
+    return {
+      provider: t.provider || "auto",
+      local: { ...DEFAULT_LOCAL, ...(t.local || {}) },
+      openaiKey,
+    };
+  } catch {
+    return {
+      provider: "auto",
+      local: { ...DEFAULT_LOCAL },
+      openaiKey: process.env.OPENAI_API_KEY || "",
+    };
+  }
+}
+
+// ---------------------------------------------------------------------------
+// Local backend (Python + faster-whisper)
+// ---------------------------------------------------------------------------
+
+function transcribeLocal(filePath, opts) {
+  return new Promise((resolve, reject) => {
+    const args = [
+      PYTHON_HELPER,
+      filePath,
+      "--model", String(opts.model || DEFAULT_LOCAL.model),
+      "--language", String(opts.language || DEFAULT_LOCAL.language),
+      "--device", String(opts.device || DEFAULT_LOCAL.device),
+      "--compute-type", String(opts.compute_type || DEFAULT_LOCAL.compute_type),
+      "--beam-size", String(opts.beam_size || DEFAULT_LOCAL.beam_size),
+    ];
+    execFile("python3", args, { maxBuffer: 16 * 1024 * 1024, timeout: 5 * 60_000 }, (err, stdout, stderr) => {
+      if (err) {
+        const tail = (stderr || err.message || "").slice(-300);
+        return reject(new Error(`local transcription failed: ${tail}`));
+      }
+      let parsed;
+      try { parsed = JSON.parse(String(stdout).trim().split("\n").pop()); }
+      catch (e) {
+        return reject(new Error(`could not parse helper output: ${stdout.slice(0, 300)}`));
+      }
+      if (!parsed.ok) return reject(new Error(parsed.error || "unknown local transcription error"));
+      resolve({
+        ok: true,
+        backend: "local",
+        text: parsed.text || "",
+        language: parsed.language || null,
+        language_probability: parsed.language_probability ?? null,
+        duration: parsed.duration ?? null,
+        model: parsed.model,
+        compute_type: parsed.compute_type,
+      });
+    });
+  });
+}
+
+// ---------------------------------------------------------------------------
+// OpenAI backend (Whisper-1 cloud)
+// ---------------------------------------------------------------------------
+
+async function transcribeOpenAI(filePath, apiKey) {
+  if (!apiKey) throw new Error("OPENAI_API_KEY not set (env or engines.openai.api_key)");
+
+  const fileBuf = fs.readFileSync(filePath);
+  const ext = path.extname(filePath).slice(1).toLowerCase() || "ogg";
+  const mimeMap = {
+    oga: "audio/ogg", ogg: "audio/ogg", opus: "audio/ogg",
+    mp3: "audio/mpeg", m4a: "audio/mp4", mp4: "audio/mp4",
+    wav: "audio/wav", webm: "audio/webm",
+  };
+  const blob = new Blob([fileBuf], { type: mimeMap[ext] || "audio/ogg" });
+
+  const form = new FormData();
+  form.append("file", blob, `audio.${ext}`);
+  form.append("model", "whisper-1");
+
+  const res = await fetch("https://api.openai.com/v1/audio/transcriptions", {
+    method: "POST",
+    headers: { Authorization: `Bearer ${apiKey}` },
+    body: form,
+  });
+  if (!res.ok) {
+    const err = await res.text().catch(() => "");
+    throw new Error(`Whisper API ${res.status}: ${err.slice(0, 200)}`);
+  }
+  const json = await res.json();
+  return {
+    ok: true,
+    backend: "openai",
+    text: String(json.text || "").trim(),
+    language: null,
+    language_probability: null,
+    duration: null,
+    model: "whisper-1",
+  };
+}
+
+// ---------------------------------------------------------------------------
+// Public API
+// ---------------------------------------------------------------------------
+
+/**
+ * Transcribe an audio file using the configured backend.
+ * Returns { ok, backend, text, language?, language_probability?, duration?, model? }.
+ *
+ * @param {string} filePath  absolute path to audio file
+ * @param {object} overrides optional: { provider, model, language, ... }
+ */
+export async function transcribe(filePath, overrides = {}) {
+  if (!filePath || !fs.existsSync(filePath)) {
+    throw new Error(`transcribe: file not found: ${filePath}`);
+  }
+  const cfg = await getConfig();
+  const provider = overrides.provider || cfg.provider;
+  const localOpts = { ...cfg.local, ...overrides };
+
+  if (provider === "openai") {
+    return transcribeOpenAI(filePath, cfg.openaiKey);
+  }
+  if (provider === "local") {
+    return transcribeLocal(filePath, localOpts);
+  }
+
+  // auto: local first, fall back to openai
+  try {
+    return await transcribeLocal(filePath, localOpts);
+  } catch (localErr) {
+    if (!cfg.openaiKey) {
+      throw new Error(
+        `local transcription failed and no OpenAI fallback available: ${localErr.message}`
+      );
+    }
+    return transcribeOpenAI(filePath, cfg.openaiKey);
+  }
+}
+
+// ---------------------------------------------------------------------------
+// Diagnostics
+// ---------------------------------------------------------------------------
+
+export const TRANSCRIPTION_PATHS = {
+  python_helper: PYTHON_HELPER,
+};
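
A minimal caller sketch, assuming the module is imported from a sibling daemon file (the audio path below is invented; the return shape is the one documented in the JSDoc above):

    // Sketch: direct use of the dispatcher from daemon code (Node ESM, top-level await).
    import { transcribe } from "./transcription.js";

    const result = await transcribe("/home/user/voice.oga", { provider: "auto" });
    console.log(result.backend);  // "local" or "openai", whichever succeeded
    console.log(result.text);     // the transcript
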
@@ -0,0 +1,71 @@
+#!/usr/bin/env python3
+"""
+Local audio transcription via faster-whisper. Mirrors the implementation in
+the Panda project (transcription_service.py): same default model "medium",
+device cpu, compute_type int8, beam_size 5. Lazy singleton model cache.
+
+Invoked by APX daemon (Node) as a subprocess. Args:
+  whisper-transcribe.py <audio_path> [--model medium] [--language auto] [--device cpu] [--compute-type int8] [--beam-size 5]
+
+Outputs JSON on stdout:
+  { "ok": true, "text": "...", "language": "es", "language_probability": 0.98, "duration": 12.4 }
+  { "ok": false, "error": "..." }
+"""
+import argparse
+import json
+import os
+import sys
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("audio_path")
+    parser.add_argument("--model", default="medium")
+    parser.add_argument("--language", default="auto")
+    parser.add_argument("--device", default="cpu")
+    parser.add_argument("--compute-type", dest="compute_type", default="int8")
+    parser.add_argument("--beam-size", dest="beam_size", type=int, default=5)
+    args = parser.parse_args()
+
+    if not os.path.exists(args.audio_path):
+        print(json.dumps({"ok": False, "error": f"file not found: {args.audio_path}"}))
+        return 1
+
+    try:
+        from faster_whisper import WhisperModel
+    except ImportError as e:
+        print(json.dumps({
+            "ok": False,
+            "error": "faster-whisper not installed. Run: pip3 install faster-whisper",
+            "import_error": str(e),
+        }))
+        return 1
+
+    try:
+        model = WhisperModel(args.model, device=args.device, compute_type=args.compute_type)
+    except Exception as e:
+        print(json.dumps({"ok": False, "error": f"failed to load model '{args.model}': {e}"}))
+        return 1
+
+    language = None if args.language == "auto" else args.language
+
+    try:
+        segments, info = model.transcribe(args.audio_path, beam_size=args.beam_size, language=language)
+        text = " ".join(seg.text.strip() for seg in segments).strip()
+        print(json.dumps({
+            "ok": True,
+            "text": text,
+            "language": info.language,
+            "language_probability": round(info.language_probability, 4),
+            "duration": round(info.duration, 2),
+            "model": args.model,
+            "compute_type": args.compute_type,
+        }))
+        return 0
+    except Exception as e:
+        print(json.dumps({"ok": False, "error": f"transcription failed: {e}"}))
+        return 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())
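
For debugging, the helper can also be exercised by hand with the argparse flags defined above (the audio path here is invented; expected stdout is the JSON shape from the docstring):

    python3 whisper-transcribe.py /tmp/voice.oga --model small --language es
    # stdout: {"ok": true, "text": "...", "language": "es", "language_probability": 0.98, ...}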