npm - @agentprojectcontext/apx - Versions diffs - 1.13.0 → 1.13.1 - Mend

@agentprojectcontext/apx 1.13.0 → 1.13.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3) hide show

package/package.json +1 -1
package/src/daemon/plugins/telegram.js +101 -0
package/src/daemon/super-agent-tools/tools/send-telegram.js +31 -7

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@agentprojectcontext/apx",
-  "version": "1.13.0",
+  "version": "1.13.1",
   "description": "APX — unified CLI + daemon for the Agent Project Context (APC) standard.",
   "publishConfig": {
     "access": "public"

package/src/daemon/plugins/telegram.js CHANGED Viewed

@@ -131,6 +131,48 @@ export async function sendAudio(token, chatId, audio, { caption, title, performe
   return json.result;
 }
+/**
+ * Transcribe an audio file via OpenAI Whisper.
+ * Reads OPENAI_API_KEY from env or engines.openai.api_key in ~/.apx/config.json.
+ *
+ * @param {string} filePath - Local path of the audio file to upload.
+ * @returns {Promise<string>} Trimmed transcript ("" when the response has no `text`).
+ * @throws {Error} If no API key is available, the file cannot be read, or the
+ *   API answers with a non-2xx status.
+ */
+async function transcribeAudio(filePath) {
+  let apiKey = process.env.OPENAI_API_KEY;
+  if (!apiKey) {
+    try {
+      const { readConfig } = await import("../../core/config.js");
+      apiKey = readConfig()?.engines?.openai?.api_key || "";
+    } catch { /* best effort: an unreadable config surfaces below as "not set" */ }
+  }
+  if (!apiKey) throw new Error("OPENAI_API_KEY not set (env or engines.openai.api_key)");
+  // Source extension -> [extension used for the upload name, MIME type].
+  // The transcription endpoint only accepts a fixed list of file types (ogg is
+  // on it, oga/opus are not), so Ogg voice notes are always uploaded as .ogg.
+  const uploadTypes = new Map([
+    ["oga", ["ogg", "audio/ogg"]], ["ogg", ["ogg", "audio/ogg"]], ["opus", ["ogg", "audio/ogg"]],
+    ["mp3", ["mp3", "audio/mpeg"]], ["m4a", ["m4a", "audio/mp4"]], ["mp4", ["mp4", "audio/mp4"]],
+    ["wav", ["wav", "audio/wav"]], ["webm", ["webm", "audio/webm"]],
+  ]);
+  const ext = path.extname(filePath).slice(1).toLowerCase();
+  // Unknown extensions keep their own name with the Ogg MIME default; a file
+  // without any extension is treated as Ogg.
+  const [uploadExt, mime] = uploadTypes.get(ext) ?? [ext || "ogg", "audio/ogg"];
+  // Async read so a large recording does not block the daemon's event loop.
+  const fileBuf = await fs.promises.readFile(filePath);
+  const form = new FormData();
+  form.append("file", new Blob([fileBuf], { type: mime }), `audio.${uploadExt}`);
+  form.append("model", "whisper-1");
+  const res = await fetch("https://api.openai.com/v1/audio/transcriptions", {
+    method: "POST",
+    headers: { Authorization: `Bearer ${apiKey}` },
+    body: form,
+  });
+  if (!res.ok) {
+    // Keep the error short: only the first 200 characters of the response body.
+    const err = await res.text().catch(() => "");
+    throw new Error(`Whisper ${res.status}: ${err.slice(0, 200)}`);
+  }
+  const json = await res.json();
+  return String(json.text || "").trim();
+}
 /**
  * Download a file from Telegram servers.
  * Returns the local file path where it was saved.
@@ -389,6 +431,65 @@ class ChannelPoller {
       if (!text) return;
     }
+    // ── Incoming voice / audio handling ──────────────────────────────────
+    // Telegram sends `voice` for the press-and-hold mic recording (.oga/opus)
+    // and `audio` for uploaded audio files (mp3/m4a/etc.). Either way we
+    // download, run it through Whisper, prefix the result with `[audio] `
+    // and let the rest of the message flow handle it as plain text.
+    const incomingAudio = msg.voice || msg.audio;
+    if (incomingAudio && incomingAudio.file_id) {
+      const token = resolveBotToken(this.channel);
+      const mediaDir = path.join(APX_HOME, "media");
+      fs.mkdirSync(mediaDir, { recursive: true });
+      let localPath = null;
+      let transcript = "";
+      let transcribeError = null;
+      try {
+        localPath = await downloadTelegramFile(token, incomingAudio.file_id, mediaDir);
+        this.log(`telegram[${this.channel.name}] audio saved: ${localPath}`);
+      } catch (e) {
+        this.log(`telegram[${this.channel.name}] audio download failed: ${e.message}`);
+      }
+      if (localPath) {
+        try {
+          transcript = await transcribeAudio(localPath);
+          this.log(`telegram[${this.channel.name}] audio transcribed (${transcript.length} chars)`);
+        } catch (e) {
+          transcribeError = e.message;
+          this.log(`telegram[${this.channel.name}] audio transcription failed: ${e.message}`);
+        }
+      }
+      const audioBody = transcript
+        ? `[audio] ${transcript}`
+        : `[audio] (transcription unavailable${transcribeError ? ": " + transcribeError : ""})`;
+      appendGlobalMessage({
+        channel: "telegram",
+        direction: "in",
+        type: "audio",
+        actor_id: msg.from?.id ? String(msg.from.id) : author,
+        external_id: String(u.update_id),
+        author,
+        body: audioBody,
+        meta: {
+          chat_id,
+          user_id: msg.from?.id || null,
+          message_id: msg.message_id,
+          tg_channel: this.channel.name,
+          local_path: localPath,
+          file_id: incomingAudio.file_id,
+          duration: incomingAudio.duration,
+          mime_type: incomingAudio.mime_type,
+          transcription_error: transcribeError,
+        },
+      });
+      // Inject the transcribed text into `text` so the rest of the agent
+      // pipeline treats it identically to a typed message. If there was a
+      // caption alongside the audio, prepend the audio marker to it.
+      text = text ? `${audioBody}\n${text}` : audioBody;
+    }
     // /reset or /new wipes the rolling context for this chat. We just
     // remember a marker timestamp; subsequent inbounds will only consider
     // history newer than this. Implemented by writing a synthetic message

package/src/daemon/super-agent-tools/tools/send-telegram.js CHANGED Viewed

@@ -1,30 +1,54 @@
 import { confirmedProperty } from "../helpers.js";
+/**
+ * Pick the photo payload from the tool arguments.
+ * Precedence: photo_url, then photo_path (both returned as strings), then
+ * photo_base64 (decoded into a Buffer). Returns null when none is provided.
+ */
+function decodePhoto({ photo_base64, photo_path, photo_url }) {
+  if (photo_url)  return String(photo_url);
+  if (photo_path) return String(photo_path);
+  if (!photo_base64) return null;
+  // Strip an optional "data:image/...;base64," prefix. The match is
+  // case-insensitive and allows subtypes such as "svg+xml" or "x-icon" plus
+  // extra ";key=value" parameters; a prefix left in place would be decoded
+  // as base64 and corrupt the image bytes.
+  const clean = String(photo_base64)
+    .trim()
+    .replace(/^data:image\/[a-z0-9.+-]+(?:;[a-z0-9=._-]+)*;base64,/i, "");
+  return Buffer.from(clean, "base64");
+}
+/**
+ * `send_telegram` tool definition: a `schema` object (type "function") that
+ * describes the accepted arguments, plus a `makeHandler` factory that returns
+ * the async handler performing the send through the "telegram" plugin.
+ */
 export default {
   name: "send_telegram",
   schema: {
     type: "function",
     function: {
       name: "send_telegram",
-      description: "Send a Telegram message via the daemon's Telegram plugin.",
+      description:
+        "Send a Telegram message via the daemon's Telegram plugin. Text only by default; pass photo_base64 (from browser_screenshot) / photo_path / photo_url to attach an image — the text becomes the caption. Use this AFTER a browser_screenshot when the user asks for a screenshot or visual reply.",
       parameters: {
         type: "object",
         properties: {
-          channel: { type: "string", description: "telegram channel name; omit for default" },
-          chat_id: { type: "string", description: "destination chat id; omit to use channel default" },
-          text: { type: "string" },
-          confirmed: confirmedProperty("true only after explicit user confirmation for this exact outbound message"),
+          channel:      { type: "string", description: "telegram channel name; omit for default" },
+          chat_id:      { type: "string", description: "destination chat id; omit to use channel default" },
+          text:         { type: "string", description: "message body (becomes the photo caption when a photo_* arg is passed)" },
+          photo_base64: { type: "string", description: "raw base64 PNG/JPG (or 'data:image/...;base64,...' data URI). Pass the `base64` field returned by browser_screenshot here." },
+          photo_path:   { type: "string", description: "absolute filesystem path to an image file" },
+          photo_url:    { type: "string", description: "public https URL of an image" },
+          confirmed:    confirmedProperty("true only after explicit user confirmation for this exact outbound message"),
         },
         required: ["text"],
       },
     },
   },
+  // Builds the handler. It throws when `plugins` is missing or the "telegram"
+  // plugin is not loaded, and resolves to { ok, kind, message_id } on success.
-  makeHandler: ({ plugins, requirePermission }) => async ({ channel, chat_id, text, confirmed = false }) => {
+  makeHandler: ({ plugins, requirePermission }) => async ({ channel, chat_id, text, photo_base64, photo_path, photo_url, confirmed = false }) => {
+    // The permission check runs before anything else; the call is flagged as
+    // dangerous and receives the caller-supplied `confirmed` value.
     requirePermission("send_telegram", { dangerous: true, confirmed });
     if (!plugins) throw new Error("plugins unavailable");
     const telegram = plugins.get("telegram");
     if (!telegram) throw new Error("telegram plugin not loaded");
+    // Any photo_* argument switches to sendPhoto with `text` as the caption;
+    // without one, the message goes out through the plain text send below.
+    const photo = decodePhoto({ photo_base64, photo_path, photo_url });
+    if (photo) {
+      const result = await telegram.sendPhoto({
+        channel, chat_id, photo, caption: text, author: "apx",
+      });
+      return { ok: true, kind: "photo", message_id: result.message_id };
+    }
     const result = await telegram.send({ channel, chat_id, text, author: "apx" });
-    return { ok: true, message_id: result.message_id };
+    return { ok: true, kind: "text", message_id: result.message_id };
   },
 };