@agentprojectcontext/apx 1.13.0 → 1.14.0

package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@agentprojectcontext/apx",
-  "version": "1.13.0",
+  "version": "1.14.0",
   "description": "APX — unified CLI + daemon for the Agent Project Context (APC) standard.",
   "publishConfig": {
     "access": "public"
@@ -36,6 +36,7 @@ import { stripThinking } from "../thinking.js";
 import { getRecentTelegramTurnsFromFs, appendGlobalMessage } from "../../core/messages-store.js";
 import { readAgents } from "../../core/parser.js";
 import { buildAgentSystem } from "../../core/agent-system.js";
+import { transcribe as transcribeAudioFile } from "../transcription.js";

 const API_BASE = "https://api.telegram.org";
 const nowIso = () => new Date().toISOString().replace(/\.\d{3}Z$/, "Z");
@@ -131,6 +132,10 @@ export async function sendAudio(token, chatId, audio, { caption, title, performe
   return json.result;
 }

+// Audio transcription is delegated to the central dispatcher
+// (../transcription.js) which handles local (faster-whisper via Python) +
+// OpenAI cloud fallback. See that module for config keys.
+
 /**
  * Download a file from Telegram servers.
  * Returns the local file path where it was saved.
@@ -389,6 +394,69 @@ class ChannelPoller {
       if (!text) return;
     }

+    // ── Incoming voice / audio handling ──────────────────────────────────
+    // Telegram sends `voice` for the press-and-hold mic recording (.oga/opus)
+    // and `audio` for uploaded audio files (mp3/m4a/etc.). Either way we
+    // download, run it through Whisper, prefix the result with `[audio] `
+    // and let the rest of the message flow handle it as plain text.
+    const incomingAudio = msg.voice || msg.audio;
+    if (incomingAudio && incomingAudio.file_id) {
+      const token = resolveBotToken(this.channel);
+      const mediaDir = path.join(APX_HOME, "media");
+      fs.mkdirSync(mediaDir, { recursive: true });
+      let localPath = null;
+      let transcript = "";
+      let transcribeError = null;
+      let transcribeBackend = null;
+      try {
+        localPath = await downloadTelegramFile(token, incomingAudio.file_id, mediaDir);
+        this.log(`telegram[${this.channel.name}] audio saved: ${localPath}`);
+      } catch (e) {
+        this.log(`telegram[${this.channel.name}] audio download failed: ${e.message}`);
+      }
+      if (localPath) {
+        try {
+          const result = await transcribeAudioFile(localPath);
+          transcript = result.text || "";
+          transcribeBackend = result.backend;
+          this.log(`telegram[${this.channel.name}] audio transcribed via ${transcribeBackend} (${transcript.length} chars, lang=${result.language || "?"})`);
+        } catch (e) {
+          transcribeError = e.message;
+          this.log(`telegram[${this.channel.name}] audio transcription failed: ${e.message}`);
+        }
+      }
+      const audioBody = transcript
+        ? `[audio] ${transcript}`
+        : `[audio] (transcription unavailable${transcribeError ? ": " + transcribeError : ""})`;
+
+      appendGlobalMessage({
+        channel: "telegram",
+        direction: "in",
+        type: "audio",
+        actor_id: msg.from?.id ? String(msg.from.id) : author,
+        external_id: String(u.update_id),
+        author,
+        body: audioBody,
+        meta: {
+          chat_id,
+          user_id: msg.from?.id || null,
+          message_id: msg.message_id,
+          tg_channel: this.channel.name,
+          local_path: localPath,
+          file_id: incomingAudio.file_id,
+          duration: incomingAudio.duration,
+          mime_type: incomingAudio.mime_type,
+          transcription_backend: transcribeBackend,
+          transcription_error: transcribeError,
+        },
+      });
+
+      // Inject the transcribed text into `text` so the rest of the agent
+      // pipeline treats it identically to a typed message. If there was a
+      // caption alongside the audio, prepend the audio marker to it.
+      text = text ? `${audioBody}\n${text}` : audioBody;
+    }
+
     // /reset or /new wipes the rolling context for this chat. We just
     // remember a marker timestamp; subsequent inbounds will only consider
     // history newer than this. Implemented by writing a synthetic message
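
Condensed, the voice-handling hunk above boils down to the following flow (a simplified sketch, not the shipped code: error handling, logging, and the appendGlobalMessage bookkeeping are omitted, and the function name handleIncomingAudio is invented here; downloadTelegramFile and transcribeAudioFile are the real names used above):

    // Sketch only: simplified from the +394 hunk above.
    async function handleIncomingAudio(msg, token, mediaDir, text) {
      const incomingAudio = msg.voice || msg.audio;      // voice note vs. uploaded audio file
      if (!incomingAudio?.file_id) return text;          // not an audio update: leave text alone
      const localPath = await downloadTelegramFile(token, incomingAudio.file_id, mediaDir);
      const { text: transcript } = await transcribeAudioFile(localPath);
      const audioBody = `[audio] ${transcript}`;         // marker the agent pipeline sees
      return text ? `${audioBody}\n${text}` : audioBody; // caption (if any) follows the audio
    }
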
@@ -21,6 +21,7 @@ import setPermissionMode from "./tools/set-permission-mode.js";
 import searchFiles from "./tools/search-files.js";
 import listSkills from "./tools/list-skills.js";
 import loadSkill from "./tools/load-skill.js";
+import transcribeAudio from "./tools/transcribe-audio.js";
 import { createPermissionGuard } from "./helpers.js";
 import { buildBridgedTools, DEFAULT_CATEGORIES } from "./registry-bridge.js";

@@ -48,6 +49,7 @@ const NATIVE_TOOLS = [
   searchFiles,
   listSkills,
   loadSkill,
+  transcribeAudio,
 ];

 // Registry-backed bridges. Categories can be overridden per-process via env
@@ -1,30 +1,54 @@
 import { confirmedProperty } from "../helpers.js";

+function decodePhoto({ photo_base64, photo_path, photo_url }) {
+  if (photo_url) return String(photo_url);
+  if (photo_path) return String(photo_path);
+  if (photo_base64) {
+    // Strip "data:image/...;base64," prefix if present
+    const clean = String(photo_base64).replace(/^data:image\/[a-z]+;base64,/, "");
+    return Buffer.from(clean, "base64");
+  }
+  return null;
+}
+
 export default {
   name: "send_telegram",
   schema: {
     type: "function",
     function: {
       name: "send_telegram",
-      description: "Send a Telegram message via the daemon's Telegram plugin.",
+      description:
+        "Send a Telegram message via the daemon's Telegram plugin. Text only by default; pass photo_base64 (from browser_screenshot) / photo_path / photo_url to attach an image — the text becomes the caption. Use this AFTER a browser_screenshot when the user asks for a screenshot or visual reply.",
       parameters: {
         type: "object",
         properties: {
-          channel: { type: "string", description: "telegram channel name; omit for default" },
-          chat_id: { type: "string", description: "destination chat id; omit to use channel default" },
-          text: { type: "string" },
-          confirmed: confirmedProperty("true only after explicit user confirmation for this exact outbound message"),
+          channel: { type: "string", description: "telegram channel name; omit for default" },
+          chat_id: { type: "string", description: "destination chat id; omit to use channel default" },
+          text: { type: "string", description: "message body (becomes the photo caption when a photo_* arg is passed)" },
+          photo_base64: { type: "string", description: "raw base64 PNG/JPG (or 'data:image/...;base64,...' data URI). Pass the `base64` field returned by browser_screenshot here." },
+          photo_path: { type: "string", description: "absolute filesystem path to an image file" },
+          photo_url: { type: "string", description: "public https URL of an image" },
+          confirmed: confirmedProperty("true only after explicit user confirmation for this exact outbound message"),
         },
         required: ["text"],
       },
     },
   },
-  makeHandler: ({ plugins, requirePermission }) => async ({ channel, chat_id, text, confirmed = false }) => {
+  makeHandler: ({ plugins, requirePermission }) => async ({ channel, chat_id, text, photo_base64, photo_path, photo_url, confirmed = false }) => {
     requirePermission("send_telegram", { dangerous: true, confirmed });
     if (!plugins) throw new Error("plugins unavailable");
     const telegram = plugins.get("telegram");
     if (!telegram) throw new Error("telegram plugin not loaded");
+
+    const photo = decodePhoto({ photo_base64, photo_path, photo_url });
+    if (photo) {
+      const result = await telegram.sendPhoto({
+        channel, chat_id, photo, caption: text, author: "apx",
+      });
+      return { ok: true, kind: "photo", message_id: result.message_id };
+    }
+
     const result = await telegram.send({ channel, chat_id, text, author: "apx" });
-    return { ok: true, message_id: result.message_id };
+    return { ok: true, kind: "text", message_id: result.message_id };
   },
 };
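
To illustrate how an agent would use the extended tool (argument values here are invented; the parameter names are exactly those in the schema above), a screenshot reply chains browser_screenshot into send_telegram:

    // Hypothetical tool-call arguments, assuming a prior browser_screenshot
    // call returned { base64: "iVBORw0KGgo..." }:
    const sendTelegramArgs = {
      chat_id: "123456789",                   // invented example id
      text: "Here is the page you asked for", // becomes the photo caption
      photo_base64: "iVBORw0KGgo...",         // the `base64` field from browser_screenshot
      confirmed: true,                        // only after explicit user confirmation
    };
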
@@ -0,0 +1,61 @@
+import fs from "node:fs";
+import os from "node:os";
+import path from "node:path";
+import crypto from "node:crypto";
+import { transcribe } from "../../transcription.js";
+
+export default {
+  name: "transcribe_audio",
+  schema: {
+    type: "function",
+    function: {
+      name: "transcribe_audio",
+      description:
+        "Transcribe an audio file to text. Default backend is local faster-whisper (model 'medium' on CPU with int8 quantization), with automatic fallback to OpenAI Whisper API if local fails. Pass file_path for a file on disk, or base64 for raw audio bytes (will be written to a temp file). Override provider/model/language as needed.",
+      parameters: {
+        type: "object",
+        properties: {
+          file_path: { type: "string", description: "absolute path to audio file (.ogg, .mp3, .m4a, .wav, .webm, .opus)" },
+          base64: { type: "string", description: "alternative to file_path — raw base64 audio bytes (or 'data:audio/...;base64,...' data URI)" },
+          format: { type: "string", description: "file extension hint when using base64 (default 'ogg')" },
+          provider: { type: "string", description: "override the configured provider: 'auto' | 'local' | 'openai'" },
+          model: { type: "string", description: "local model size: tiny | base | small | medium | large | large-v2 | large-v3 (default medium)" },
+          language: { type: "string", description: "ISO 639-1 code (e.g. 'es', 'en') or 'auto' for detection" },
+          device: { type: "string", description: "local device: cpu | cuda (default cpu)" },
+          compute_type: { type: "string", description: "local quantization: int8 | int8_float16 | float16 | float32 (default int8)" },
+        },
+      },
+    },
+  },
+  makeHandler: () => async ({ file_path, base64, format = "ogg", provider, model, language, device, compute_type } = {}) => {
+    if (!file_path && !base64) throw new Error("transcribe_audio: file_path or base64 required");
+
+    let pathToUse = file_path;
+    let cleanupTmp = false;
+
+    if (!pathToUse && base64) {
+      const clean = String(base64).replace(/^data:audio\/[a-z]+;base64,/, "");
+      const buf = Buffer.from(clean, "base64");
+      const tmpDir = path.join(os.tmpdir(), "apx-transcribe");
+      fs.mkdirSync(tmpDir, { recursive: true });
+      const id = crypto.randomBytes(6).toString("hex");
+      pathToUse = path.join(tmpDir, `audio-${id}.${String(format).replace(/^\./, "") || "ogg"}`);
+      fs.writeFileSync(pathToUse, buf);
+      cleanupTmp = true;
+    }
+
+    try {
+      const overrides = {};
+      if (provider) overrides.provider = provider;
+      if (model) overrides.model = model;
+      if (language) overrides.language = language;
+      if (device) overrides.device = device;
+      if (compute_type) overrides.compute_type = compute_type;
+      return await transcribe(pathToUse, overrides);
+    } finally {
+      if (cleanupTmp) {
+        try { fs.unlinkSync(pathToUse); } catch { /* ignore */ }
+      }
+    }
+  },
+};
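
As a usage illustration of the base64 path (all values invented; the parameter names come from the schema above), a caller can transcribe in-memory audio without a pre-existing file. The handler writes the bytes to a temp file under os.tmpdir()/apx-transcribe and unlinks it afterwards:

    // Hypothetical transcribe_audio arguments:
    const transcribeArgs = {
      base64: "data:audio/webm;base64,GkXfo...",  // data-URI prefix is stripped by the handler
      format: "webm",                             // extension hint for the temp file name
      language: "en",                             // skip auto-detection
    };
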
@@ -0,0 +1,193 @@
+// daemon/transcription.js
+// Audio transcription dispatcher. Two backends:
+//
+// - LOCAL (faster-whisper via Python subprocess) — ported from Panda's
+//   transcription_service.py. Same defaults: model "medium", device "cpu",
+//   compute_type "int8", beam_size 5, auto language detection. Requires
+//   `pip3 install faster-whisper` on the host.
+//
+// - OPENAI (Whisper-1 cloud API) — needs OPENAI_API_KEY or
+//   engines.openai.api_key in config.
+//
+// Provider selection in ~/.apx/config.json:
+//   "transcription": {
+//     "provider": "auto" | "local" | "openai",  // default "auto"
+//     "local": {
+//       "model": "medium",       // tiny | base | small | medium | large | large-v2 | large-v3
+//       "device": "cpu",         // cpu | cuda
+//       "compute_type": "int8",  // int8 | int8_float16 | float16 | float32
+//       "language": "auto",      // ISO 639-1 code or "auto"
+//       "beam_size": 5
+//     }
+//   }
+//
+// "auto" tries local first; on failure falls back to openai.
+
+import fs from "node:fs";
+import path from "node:path";
+import { execFile } from "node:child_process";
+import { fileURLToPath } from "node:url";
+
+const __filename = fileURLToPath(import.meta.url);
+const __dirname = path.dirname(__filename);
+const PYTHON_HELPER = path.join(__dirname, "whisper-transcribe.py");
+
+const DEFAULT_LOCAL = {
+  model: "medium",
+  device: "cpu",
+  compute_type: "int8",
+  language: "auto",
+  beam_size: 5,
+};
+
+// ---------------------------------------------------------------------------
+// Config
+// ---------------------------------------------------------------------------
+
+async function getConfig() {
+  try {
+    const { readConfig } = await import("../core/config.js");
+    const cfg = readConfig() || {};
+    const t = cfg.transcription || {};
+    const openaiKey = cfg.engines?.openai?.api_key || process.env.OPENAI_API_KEY || "";
+    return {
+      provider: t.provider || "auto",
+      local: { ...DEFAULT_LOCAL, ...(t.local || {}) },
+      openaiKey,
+    };
+  } catch {
+    return {
+      provider: "auto",
+      local: { ...DEFAULT_LOCAL },
+      openaiKey: process.env.OPENAI_API_KEY || "",
+    };
+  }
+}
+
+// ---------------------------------------------------------------------------
+// Local backend (Python + faster-whisper)
+// ---------------------------------------------------------------------------
+
+function transcribeLocal(filePath, opts) {
+  return new Promise((resolve, reject) => {
+    const args = [
+      PYTHON_HELPER,
+      filePath,
+      "--model", String(opts.model || DEFAULT_LOCAL.model),
+      "--language", String(opts.language || DEFAULT_LOCAL.language),
+      "--device", String(opts.device || DEFAULT_LOCAL.device),
+      "--compute-type", String(opts.compute_type || DEFAULT_LOCAL.compute_type),
+      "--beam-size", String(opts.beam_size || DEFAULT_LOCAL.beam_size),
+    ];
+    execFile("python3", args, { maxBuffer: 16 * 1024 * 1024, timeout: 5 * 60_000 }, (err, stdout, stderr) => {
+      if (err) {
+        const tail = (stderr || err.message || "").slice(-300);
+        return reject(new Error(`local transcription failed: ${tail}`));
+      }
+      let parsed;
+      try { parsed = JSON.parse(String(stdout).trim().split("\n").pop()); }
+      catch (e) {
+        return reject(new Error(`could not parse helper output: ${stdout.slice(0, 300)}`));
+      }
+      if (!parsed.ok) return reject(new Error(parsed.error || "unknown local transcription error"));
+      resolve({
+        ok: true,
+        backend: "local",
+        text: parsed.text || "",
+        language: parsed.language || null,
+        language_probability: parsed.language_probability ?? null,
+        duration: parsed.duration ?? null,
+        model: parsed.model,
+        compute_type: parsed.compute_type,
+      });
+    });
+  });
+}
+
+// ---------------------------------------------------------------------------
+// OpenAI backend (Whisper-1 cloud)
+// ---------------------------------------------------------------------------
+
+async function transcribeOpenAI(filePath, apiKey) {
+  if (!apiKey) throw new Error("OPENAI_API_KEY not set (env or engines.openai.api_key)");
+
+  const fileBuf = fs.readFileSync(filePath);
+  const ext = path.extname(filePath).slice(1).toLowerCase() || "ogg";
+  const mimeMap = {
+    oga: "audio/ogg", ogg: "audio/ogg", opus: "audio/ogg",
+    mp3: "audio/mpeg", m4a: "audio/mp4", mp4: "audio/mp4",
+    wav: "audio/wav", webm: "audio/webm",
+  };
+  const blob = new Blob([fileBuf], { type: mimeMap[ext] || "audio/ogg" });
+
+  const form = new FormData();
+  form.append("file", blob, `audio.${ext}`);
+  form.append("model", "whisper-1");
+
+  const res = await fetch("https://api.openai.com/v1/audio/transcriptions", {
+    method: "POST",
+    headers: { Authorization: `Bearer ${apiKey}` },
+    body: form,
+  });
+  if (!res.ok) {
+    const err = await res.text().catch(() => "");
+    throw new Error(`Whisper API ${res.status}: ${err.slice(0, 200)}`);
+  }
+  const json = await res.json();
+  return {
+    ok: true,
+    backend: "openai",
+    text: String(json.text || "").trim(),
+    language: null,
+    language_probability: null,
+    duration: null,
+    model: "whisper-1",
+  };
+}
+
+// ---------------------------------------------------------------------------
+// Public API
+// ---------------------------------------------------------------------------
+
+/**
+ * Transcribe an audio file using the configured backend.
+ * Returns { ok, backend, text, language?, language_probability?, duration?, model? }.
+ *
+ * @param {string} filePath  absolute path to audio file
+ * @param {object} overrides optional: { provider, model, language, ... }
+ */
+export async function transcribe(filePath, overrides = {}) {
+  if (!filePath || !fs.existsSync(filePath)) {
+    throw new Error(`transcribe: file not found: ${filePath}`);
+  }
+  const cfg = await getConfig();
+  const provider = overrides.provider || cfg.provider;
+  const localOpts = { ...cfg.local, ...overrides };
+
+  if (provider === "openai") {
+    return transcribeOpenAI(filePath, cfg.openaiKey);
+  }
+  if (provider === "local") {
+    return transcribeLocal(filePath, localOpts);
+  }
+
+  // auto: local first, fall back to openai
+  try {
+    return await transcribeLocal(filePath, localOpts);
+  } catch (localErr) {
+    if (!cfg.openaiKey) {
+      throw new Error(
+        `local transcription failed and no OpenAI fallback available: ${localErr.message}`
+      );
+    }
+    return transcribeOpenAI(filePath, cfg.openaiKey);
+  }
+}
+
+// ---------------------------------------------------------------------------
+// Diagnostics
+// ---------------------------------------------------------------------------
+
+export const TRANSCRIPTION_PATHS = {
+  python_helper: PYTHON_HELPER,
+};
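
A minimal caller sketch, assuming the module is imported from a sibling daemon file (the audio path below is invented; the return shape is the one documented in the JSDoc above):

    // Sketch: direct use of the dispatcher from daemon code (Node ESM, top-level await).
    import { transcribe } from "./transcription.js";

    const result = await transcribe("/home/user/voice.oga", { provider: "auto" });
    console.log(result.backend);  // "local" or "openai", whichever succeeded
    console.log(result.text);     // the transcript
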
@@ -0,0 +1,71 @@
+#!/usr/bin/env python3
+"""
+Local audio transcription via faster-whisper. Mirrors the implementation in
+the Panda project (transcription_service.py): same default model "medium",
+device cpu, compute_type int8, beam_size 5. Lazy singleton model cache.
+
+Invoked by APX daemon (Node) as a subprocess. Args:
+  whisper-transcribe.py <audio_path> [--model medium] [--language auto] [--device cpu] [--compute-type int8] [--beam-size 5]
+
+Outputs JSON on stdout:
+  { "ok": true, "text": "...", "language": "es", "language_probability": 0.98, "duration": 12.4 }
+  { "ok": false, "error": "..." }
+"""
+import argparse
+import json
+import os
+import sys
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("audio_path")
+    parser.add_argument("--model", default="medium")
+    parser.add_argument("--language", default="auto")
+    parser.add_argument("--device", default="cpu")
+    parser.add_argument("--compute-type", dest="compute_type", default="int8")
+    parser.add_argument("--beam-size", dest="beam_size", type=int, default=5)
+    args = parser.parse_args()
+
+    if not os.path.exists(args.audio_path):
+        print(json.dumps({"ok": False, "error": f"file not found: {args.audio_path}"}))
+        return 1
+
+    try:
+        from faster_whisper import WhisperModel
+    except ImportError as e:
+        print(json.dumps({
+            "ok": False,
+            "error": "faster-whisper not installed. Run: pip3 install faster-whisper",
+            "import_error": str(e),
+        }))
+        return 1
+
+    try:
+        model = WhisperModel(args.model, device=args.device, compute_type=args.compute_type)
+    except Exception as e:
+        print(json.dumps({"ok": False, "error": f"failed to load model '{args.model}': {e}"}))
+        return 1
+
+    language = None if args.language == "auto" else args.language
+
+    try:
+        segments, info = model.transcribe(args.audio_path, beam_size=args.beam_size, language=language)
+        text = " ".join(seg.text.strip() for seg in segments).strip()
+        print(json.dumps({
+            "ok": True,
+            "text": text,
+            "language": info.language,
+            "language_probability": round(info.language_probability, 4),
+            "duration": round(info.duration, 2),
+            "model": args.model,
+            "compute_type": args.compute_type,
+        }))
+        return 0
+    except Exception as e:
+        print(json.dumps({"ok": False, "error": f"transcription failed: {e}"}))
+        return 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())
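
For debugging, the helper can also be exercised by hand with the argparse flags defined above (the audio path here is invented; expected stdout is the JSON shape from the docstring):

    python3 whisper-transcribe.py /tmp/voice.oga --model small --language es
    # stdout: {"ok": true, "text": "...", "language": "es", "language_probability": 0.98, ...}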