@agentprojectcontext/apx 1.13.0 → 1.13.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@agentprojectcontext/apx",
3
- "version": "1.13.0",
3
+ "version": "1.13.1",
4
4
  "description": "APX — unified CLI + daemon for the Agent Project Context (APC) standard.",
5
5
  "publishConfig": {
6
6
  "access": "public"
@@ -131,6 +131,48 @@ export async function sendAudio(token, chatId, audio, { caption, title, performe
131
131
  return json.result;
132
132
  }
133
133
 
134
/**
 * Transcribe an audio file via OpenAI Whisper.
 *
 * The API key is read from the OPENAI_API_KEY environment variable, falling
 * back to `engines.openai.api_key` of the object returned by `readConfig()`
 * (loaded lazily from ../../core/config.js).
 *
 * @param {string} filePath - Path of the audio file to upload.
 * @returns {Promise<string>} The trimmed transcript ("" when the response
 *   carries no `text` field).
 * @throws {Error} If no API key is available, the file cannot be read, or the
 *   API responds with a non-2xx status.
 */
async function transcribeAudio(filePath) {
  let apiKey = process.env.OPENAI_API_KEY;
  if (!apiKey) {
    try {
      const { readConfig } = await import("../../core/config.js");
      apiKey = readConfig()?.engines?.openai?.api_key || "";
    } catch {
      // Best effort: an unreadable config is reported as "key not set" below.
    }
  }
  if (!apiKey) throw new Error("OPENAI_API_KEY not set (env or engines.openai.api_key)");

  // Extension -> MIME type for the formats we know how to label.
  const mimeMap = {
    oga: "audio/ogg", ogg: "audio/ogg", opus: "audio/ogg",
    mp3: "audio/mpeg", mpeg: "audio/mpeg", mpga: "audio/mpeg",
    m4a: "audio/mp4", mp4: "audio/mp4",
    wav: "audio/wav", webm: "audio/webm", flac: "audio/flac",
  };

  const fileBuf = fs.readFileSync(filePath);
  const rawExt = path.extname(filePath).slice(1).toLowerCase();
  // Use ONE fallback for both the filename and the MIME type so they never
  // disagree (previously an unknown extension was uploaded as e.g.
  // "audio.xyz" labelled audio/ogg). Object.hasOwn keeps inherited keys such
  // as "constructor" from being mistaken for a known extension.
  const ext = Object.hasOwn(mimeMap, rawExt) ? rawExt : "ogg";
  const mime = mimeMap[ext];
  const blob = new Blob([fileBuf], { type: mime });

  const form = new FormData();
  form.append("file", blob, `audio.${ext}`);
  form.append("model", "whisper-1");

  const res = await fetch("https://api.openai.com/v1/audio/transcriptions", {
    method: "POST",
    headers: { Authorization: `Bearer ${apiKey}` },
    body: form,
  });
  if (!res.ok) {
    // Include a short excerpt of the error body; ignore failures reading it.
    const err = await res.text().catch(() => "");
    throw new Error(`Whisper ${res.status}: ${err.slice(0, 200)}`);
  }
  const json = await res.json();
  return String(json.text || "").trim();
}
175
+
134
176
  /**
135
177
  * Download a file from Telegram servers.
136
178
  * Returns the local file path where it was saved.
@@ -389,6 +431,65 @@ class ChannelPoller {
389
431
  if (!text) return;
390
432
  }
391
433
 
434
+ // ── Incoming voice / audio handling ──────────────────────────────────
435
+ // Telegram sends `voice` for the press-and-hold mic recording (.oga/opus)
436
+ // and `audio` for uploaded audio files (mp3/m4a/etc.). Either way we
437
+ // download, run it through Whisper, prefix the result with `[audio] `
438
+ // and let the rest of the message flow handle it as plain text.
439
+ const incomingAudio = msg.voice || msg.audio;
440
+ if (incomingAudio && incomingAudio.file_id) {
441
+ const token = resolveBotToken(this.channel);
442
+ const mediaDir = path.join(APX_HOME, "media");
443
+ fs.mkdirSync(mediaDir, { recursive: true });
444
+ let localPath = null;
445
+ let transcript = "";
446
+ let transcribeError = null;
447
+ try {
448
+ localPath = await downloadTelegramFile(token, incomingAudio.file_id, mediaDir);
449
+ this.log(`telegram[${this.channel.name}] audio saved: ${localPath}`);
450
+ } catch (e) {
451
+ this.log(`telegram[${this.channel.name}] audio download failed: ${e.message}`);
452
+ }
453
+ if (localPath) {
454
+ try {
455
+ transcript = await transcribeAudio(localPath);
456
+ this.log(`telegram[${this.channel.name}] audio transcribed (${transcript.length} chars)`);
457
+ } catch (e) {
458
+ transcribeError = e.message;
459
+ this.log(`telegram[${this.channel.name}] audio transcription failed: ${e.message}`);
460
+ }
461
+ }
462
+ const audioBody = transcript
463
+ ? `[audio] ${transcript}`
464
+ : `[audio] (transcription unavailable${transcribeError ? ": " + transcribeError : ""})`;
465
+
466
+ appendGlobalMessage({
467
+ channel: "telegram",
468
+ direction: "in",
469
+ type: "audio",
470
+ actor_id: msg.from?.id ? String(msg.from.id) : author,
471
+ external_id: String(u.update_id),
472
+ author,
473
+ body: audioBody,
474
+ meta: {
475
+ chat_id,
476
+ user_id: msg.from?.id || null,
477
+ message_id: msg.message_id,
478
+ tg_channel: this.channel.name,
479
+ local_path: localPath,
480
+ file_id: incomingAudio.file_id,
481
+ duration: incomingAudio.duration,
482
+ mime_type: incomingAudio.mime_type,
483
+ transcription_error: transcribeError,
484
+ },
485
+ });
486
+
487
+ // Inject the transcribed text into `text` so the rest of the agent
488
+ // pipeline treats it identically to a typed message. If there was a
489
+ // caption alongside the audio, prepend the audio marker to it.
490
+ text = text ? `${audioBody}\n${text}` : audioBody;
491
+ }
492
+
392
493
  // /reset or /new wipes the rolling context for this chat. We just
393
494
  // remember a marker timestamp; subsequent inbounds will only consider
394
495
  // history newer than this. Implemented by writing a synthetic message
@@ -1,30 +1,54 @@
1
1
  import { confirmedProperty } from "../helpers.js";
2
2
 
3
/**
 * Choose the photo payload from the mutually-alternative photo arguments.
 *
 * Precedence is photo_url, then photo_path, then photo_base64; the first
 * truthy one wins and the others are ignored.
 *
 * @param {object} [args]
 * @param {string} [args.photo_base64] - Raw base64, or a
 *   "data:image/...;base64,..." data URI.
 * @param {string} [args.photo_path] - Returned as a string, unchanged.
 * @param {string} [args.photo_url] - Returned as a string, unchanged.
 * @returns {string|Buffer|null} A string for url/path, a Buffer of decoded
 *   bytes for base64, or null when no photo argument was supplied.
 */
function decodePhoto({ photo_base64, photo_path, photo_url } = {}) {
  if (photo_url) return String(photo_url);
  if (photo_path) return String(photo_path);
  if (photo_base64) {
    // Strip a "data:image/<subtype>[;param...];base64," header if present.
    // The subtype may contain characters beyond a-z (e.g. "svg+xml",
    // "x-icon") and the header is case-insensitive; leaving it in place
    // would feed the header text to the base64 decoder and corrupt the image.
    const clean = String(photo_base64)
      .trim()
      .replace(/^data:image\/[^;,]+(?:;[^;,]+)*;base64,/i, "");
    return Buffer.from(clean, "base64");
  }
  return null;
}
13
+
3
14
  export default {
4
15
  name: "send_telegram",
5
16
  schema: {
6
17
  type: "function",
7
18
  function: {
8
19
  name: "send_telegram",
9
- description: "Send a Telegram message via the daemon's Telegram plugin.",
20
+ description:
21
+ "Send a Telegram message via the daemon's Telegram plugin. Text only by default; pass photo_base64 (from browser_screenshot) / photo_path / photo_url to attach an image — the text becomes the caption. Use this AFTER a browser_screenshot when the user asks for a screenshot or visual reply.",
10
22
  parameters: {
11
23
  type: "object",
12
24
  properties: {
13
- channel: { type: "string", description: "telegram channel name; omit for default" },
14
- chat_id: { type: "string", description: "destination chat id; omit to use channel default" },
15
- text: { type: "string" },
16
- confirmed: confirmedProperty("true only after explicit user confirmation for this exact outbound message"),
25
+ channel: { type: "string", description: "telegram channel name; omit for default" },
26
+ chat_id: { type: "string", description: "destination chat id; omit to use channel default" },
27
+ text: { type: "string", description: "message body (becomes the photo caption when a photo_* arg is passed)" },
28
+ photo_base64: { type: "string", description: "raw base64 PNG/JPG (or 'data:image/...;base64,...' data URI). Pass the `base64` field returned by browser_screenshot here." },
29
+ photo_path: { type: "string", description: "absolute filesystem path to an image file" },
30
+ photo_url: { type: "string", description: "public https URL of an image" },
31
+ confirmed: confirmedProperty("true only after explicit user confirmation for this exact outbound message"),
17
32
  },
18
33
  required: ["text"],
19
34
  },
20
35
  },
21
36
  },
22
- makeHandler: ({ plugins, requirePermission }) => async ({ channel, chat_id, text, confirmed = false }) => {
37
+ makeHandler: ({ plugins, requirePermission }) => async ({ channel, chat_id, text, photo_base64, photo_path, photo_url, confirmed = false }) => {
23
38
  requirePermission("send_telegram", { dangerous: true, confirmed });
24
39
  if (!plugins) throw new Error("plugins unavailable");
25
40
  const telegram = plugins.get("telegram");
26
41
  if (!telegram) throw new Error("telegram plugin not loaded");
42
+
43
+ const photo = decodePhoto({ photo_base64, photo_path, photo_url });
44
+ if (photo) {
45
+ const result = await telegram.sendPhoto({
46
+ channel, chat_id, photo, caption: text, author: "apx",
47
+ });
48
+ return { ok: true, kind: "photo", message_id: result.message_id };
49
+ }
50
+
27
51
  const result = await telegram.send({ channel, chat_id, text, author: "apx" });
28
- return { ok: true, message_id: result.message_id };
52
+ return { ok: true, kind: "text", message_id: result.message_id };
29
53
  },
30
54
  };