@agentprojectcontext/apx 1.13.1 → 1.14.0

package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
    "name": "@agentprojectcontext/apx",
-   "version": "1.13.1",
+   "version": "1.14.0",
    "description": "APX — unified CLI + daemon for the Agent Project Context (APC) standard.",
    "publishConfig": {
      "access": "public"
@@ -36,6 +36,7 @@ import { stripThinking } from "../thinking.js";
  import { getRecentTelegramTurnsFromFs, appendGlobalMessage } from "../../core/messages-store.js";
  import { readAgents } from "../../core/parser.js";
  import { buildAgentSystem } from "../../core/agent-system.js";
+ import { transcribe as transcribeAudioFile } from "../transcription.js";
 
  const API_BASE = "https://api.telegram.org";
  const nowIso = () => new Date().toISOString().replace(/\.\d{3}Z$/, "Z");
@@ -131,47 +132,9 @@ export async function sendAudio(token, chatId, audio, { caption, title, performe
    return json.result;
  }
 
- /**
-  * Transcribe an audio file via OpenAI Whisper.
-  * Reads OPENAI_API_KEY from env or engines.openai.api_key in ~/.apx/config.json.
-  * Returns the transcribed text, or throws if no key / API failure.
-  */
- async function transcribeAudio(filePath) {
-   let apiKey = process.env.OPENAI_API_KEY;
-   if (!apiKey) {
-     try {
-       const { readConfig } = await import("../../core/config.js");
-       apiKey = readConfig()?.engines?.openai?.api_key || "";
-     } catch { /* ignore */ }
-   }
-   if (!apiKey) throw new Error("OPENAI_API_KEY not set (env or engines.openai.api_key)");
-
-   const fileBuf = fs.readFileSync(filePath);
-   const ext = path.extname(filePath).slice(1).toLowerCase() || "ogg";
-   const mimeMap = {
-     oga: "audio/ogg", ogg: "audio/ogg", opus: "audio/ogg",
-     mp3: "audio/mpeg", m4a: "audio/mp4", mp4: "audio/mp4",
-     wav: "audio/wav", webm: "audio/webm",
-   };
-   const mime = mimeMap[ext] || "audio/ogg";
-   const blob = new Blob([fileBuf], { type: mime });
-
-   const form = new FormData();
-   form.append("file", blob, `audio.${ext}`);
-   form.append("model", "whisper-1");
-
-   const res = await fetch("https://api.openai.com/v1/audio/transcriptions", {
-     method: "POST",
-     headers: { Authorization: `Bearer ${apiKey}` },
-     body: form,
-   });
-   if (!res.ok) {
-     const err = await res.text().catch(() => "");
-     throw new Error(`Whisper ${res.status}: ${err.slice(0, 200)}`);
-   }
-   const json = await res.json();
-   return String(json.text || "").trim();
- }
+ // Audio transcription is delegated to the central dispatcher
+ // (../transcription.js) which handles local (faster-whisper via Python) +
+ // OpenAI cloud fallback. See that module for config keys.
 
  /**
   * Download a file from Telegram servers.
@@ -444,6 +407,7 @@ class ChannelPoller {
    let localPath = null;
    let transcript = "";
    let transcribeError = null;
+   let transcribeBackend = null;
    try {
      localPath = await downloadTelegramFile(token, incomingAudio.file_id, mediaDir);
      this.log(`telegram[${this.channel.name}] audio saved: ${localPath}`);
@@ -452,8 +416,10 @@ class ChannelPoller {
    }
    if (localPath) {
      try {
-       transcript = await transcribeAudio(localPath);
-       this.log(`telegram[${this.channel.name}] audio transcribed (${transcript.length} chars)`);
+       const result = await transcribeAudioFile(localPath);
+       transcript = result.text || "";
+       transcribeBackend = result.backend;
+       this.log(`telegram[${this.channel.name}] audio transcribed via ${transcribeBackend} (${transcript.length} chars, lang=${result.language || "?"})`);
      } catch (e) {
        transcribeError = e.message;
        this.log(`telegram[${this.channel.name}] audio transcription failed: ${e.message}`);
@@ -480,6 +446,7 @@ class ChannelPoller {
        file_id: incomingAudio.file_id,
        duration: incomingAudio.duration,
        mime_type: incomingAudio.mime_type,
+       transcription_backend: transcribeBackend,
        transcription_error: transcribeError,
      },
    });
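
For reference, the stored audio message's meta object now records which backend produced the transcript alongside any error. A sketch of the resulting payload (field names are from the hunk above; values are illustrative):

// Illustrative meta payload for a transcribed voice note (values are made up):
meta: {
  file_id: "BAADBQAD...",           // Telegram file_id (hypothetical, truncated)
  duration: 12,                     // seconds, as reported by Telegram
  mime_type: "audio/ogg",
  transcription_backend: "local",   // "local" | "openai", or null if transcription failed
  transcription_error: null,        // error message string on failure, else null
}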
@@ -21,6 +21,7 @@ import setPermissionMode from "./tools/set-permission-mode.js";
  import searchFiles from "./tools/search-files.js";
  import listSkills from "./tools/list-skills.js";
  import loadSkill from "./tools/load-skill.js";
+ import transcribeAudio from "./tools/transcribe-audio.js";
  import { createPermissionGuard } from "./helpers.js";
  import { buildBridgedTools, DEFAULT_CATEGORIES } from "./registry-bridge.js";
 
@@ -48,6 +49,7 @@ const NATIVE_TOOLS = [
    searchFiles,
    listSkills,
    loadSkill,
+   transcribeAudio,
  ];
 
  // Registry-backed bridges. Categories can be overridden per-process via env
@@ -0,0 +1,61 @@
+ import fs from "node:fs";
+ import os from "node:os";
+ import path from "node:path";
+ import crypto from "node:crypto";
+ import { transcribe } from "../../transcription.js";
+
+ export default {
+   name: "transcribe_audio",
+   schema: {
+     type: "function",
+     function: {
+       name: "transcribe_audio",
+       description:
+         "Transcribe an audio file to text. Default backend is local faster-whisper (model 'medium' on CPU with int8 quantization), with automatic fallback to OpenAI Whisper API if local fails. Pass file_path for a file on disk, or base64 for raw audio bytes (will be written to a temp file). Override provider/model/language as needed.",
+       parameters: {
+         type: "object",
+         properties: {
+           file_path: { type: "string", description: "absolute path to audio file (.ogg, .mp3, .m4a, .wav, .webm, .opus)" },
+           base64: { type: "string", description: "alternative to file_path — raw base64 audio bytes (or 'data:audio/...;base64,...' data URI)" },
+           format: { type: "string", description: "file extension hint when using base64 (default 'ogg')" },
+           provider: { type: "string", description: "override the configured provider: 'auto' | 'local' | 'openai'" },
+           model: { type: "string", description: "local model size: tiny | base | small | medium | large | large-v2 | large-v3 (default medium)" },
+           language: { type: "string", description: "ISO 639-1 code (e.g. 'es', 'en') or 'auto' for detection" },
+           device: { type: "string", description: "local device: cpu | cuda (default cpu)" },
+           compute_type: { type: "string", description: "local quantization: int8 | int8_float16 | float16 | float32 (default int8)" },
+         },
+       },
+     },
+   },
+   makeHandler: () => async ({ file_path, base64, format = "ogg", provider, model, language, device, compute_type } = {}) => {
+     if (!file_path && !base64) throw new Error("transcribe_audio: file_path or base64 required");
+
+     let pathToUse = file_path;
+     let cleanupTmp = false;
+
+     if (!pathToUse && base64) {
+       const clean = String(base64).replace(/^data:audio\/[a-z]+;base64,/, "");
+       const buf = Buffer.from(clean, "base64");
+       const tmpDir = path.join(os.tmpdir(), "apx-transcribe");
+       fs.mkdirSync(tmpDir, { recursive: true });
+       const id = crypto.randomBytes(6).toString("hex");
+       pathToUse = path.join(tmpDir, `audio-${id}.${String(format).replace(/^\./, "") || "ogg"}`);
+       fs.writeFileSync(pathToUse, buf);
+       cleanupTmp = true;
+     }
+
+     try {
+       const overrides = {};
+       if (provider) overrides.provider = provider;
+       if (model) overrides.model = model;
+       if (language) overrides.language = language;
+       if (device) overrides.device = device;
+       if (compute_type) overrides.compute_type = compute_type;
+       return await transcribe(pathToUse, overrides);
+     } finally {
+       if (cleanupTmp) {
+         try { fs.unlinkSync(pathToUse); } catch { /* ignore */ }
+       }
+     }
+   },
+ };
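
For context, a minimal sketch of driving the new tool handler directly, outside the agent loop. The import path and audio file below are illustrative, not from the package:

// Hypothetical call site: exercise the tool handler without the agent runtime.
import transcribeAudio from "./tools/transcribe-audio.js"; // path is illustrative

const handler = transcribeAudio.makeHandler();
const result = await handler({ file_path: "/tmp/voice-note.ogg", language: "es" });
console.log(result.backend, result.text); // e.g. "local" plus the transcribed text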
package/daemon/transcription.js ADDED
@@ -0,0 +1,193 @@
+ // daemon/transcription.js
+ // Audio transcription dispatcher. Two backends:
+ //
+ // - LOCAL (faster-whisper via Python subprocess) — ported from Panda's
+ //   transcription_service.py. Same defaults: model "medium", device "cpu",
+ //   compute_type "int8", beam_size 5, auto language detection. Requires
+ //   `pip3 install faster-whisper` on the host.
+ //
+ // - OPENAI (Whisper-1 cloud API) — needs OPENAI_API_KEY or
+ //   engines.openai.api_key in config.
+ //
+ // Provider selection in ~/.apx/config.json:
+ //   "transcription": {
+ //     "provider": "auto" | "local" | "openai",  // default "auto"
+ //     "local": {
+ //       "model": "medium",       // tiny | base | small | medium | large | large-v2 | large-v3
+ //       "device": "cpu",         // cpu | cuda
+ //       "compute_type": "int8",  // int8 | int8_float16 | float16 | float32
+ //       "language": "auto",      // ISO 639-1 code or "auto"
+ //       "beam_size": 5
+ //     }
+ //   }
+ //
+ // "auto" tries local first; on failure falls back to openai.
+
+ import fs from "node:fs";
+ import path from "node:path";
+ import { execFile } from "node:child_process";
+ import { fileURLToPath } from "node:url";
+
+ const __filename = fileURLToPath(import.meta.url);
+ const __dirname = path.dirname(__filename);
+ const PYTHON_HELPER = path.join(__dirname, "whisper-transcribe.py");
+
+ const DEFAULT_LOCAL = {
+   model: "medium",
+   device: "cpu",
+   compute_type: "int8",
+   language: "auto",
+   beam_size: 5,
+ };
+
+ // ---------------------------------------------------------------------------
+ // Config
+ // ---------------------------------------------------------------------------
+
+ async function getConfig() {
+   try {
+     const { readConfig } = await import("../core/config.js");
+     const cfg = readConfig() || {};
+     const t = cfg.transcription || {};
+     const openaiKey = cfg.engines?.openai?.api_key || process.env.OPENAI_API_KEY || "";
+     return {
+       provider: t.provider || "auto",
+       local: { ...DEFAULT_LOCAL, ...(t.local || {}) },
+       openaiKey,
+     };
+   } catch {
+     return {
+       provider: "auto",
+       local: { ...DEFAULT_LOCAL },
+       openaiKey: process.env.OPENAI_API_KEY || "",
+     };
+   }
+ }
+
+ // ---------------------------------------------------------------------------
+ // Local backend (Python + faster-whisper)
+ // ---------------------------------------------------------------------------
+
+ function transcribeLocal(filePath, opts) {
+   return new Promise((resolve, reject) => {
+     const args = [
+       PYTHON_HELPER,
+       filePath,
+       "--model", String(opts.model || DEFAULT_LOCAL.model),
+       "--language", String(opts.language || DEFAULT_LOCAL.language),
+       "--device", String(opts.device || DEFAULT_LOCAL.device),
+       "--compute-type", String(opts.compute_type || DEFAULT_LOCAL.compute_type),
+       "--beam-size", String(opts.beam_size || DEFAULT_LOCAL.beam_size),
+     ];
+     execFile("python3", args, { maxBuffer: 16 * 1024 * 1024, timeout: 5 * 60_000 }, (err, stdout, stderr) => {
+       if (err) {
+         const tail = (stderr || err.message || "").slice(-300);
+         return reject(new Error(`local transcription failed: ${tail}`));
+       }
+       let parsed;
+       try { parsed = JSON.parse(String(stdout).trim().split("\n").pop()); }
+       catch {
+         return reject(new Error(`could not parse helper output: ${stdout.slice(0, 300)}`));
+       }
+       if (!parsed.ok) return reject(new Error(parsed.error || "unknown local transcription error"));
+       resolve({
+         ok: true,
+         backend: "local",
+         text: parsed.text || "",
+         language: parsed.language || null,
+         language_probability: parsed.language_probability ?? null,
+         duration: parsed.duration ?? null,
+         model: parsed.model,
+         compute_type: parsed.compute_type,
+       });
+     });
+   });
+ }
+
+ // ---------------------------------------------------------------------------
+ // OpenAI backend (Whisper-1 cloud)
+ // ---------------------------------------------------------------------------
+
+ async function transcribeOpenAI(filePath, apiKey) {
+   if (!apiKey) throw new Error("OPENAI_API_KEY not set (env or engines.openai.api_key)");
+
+   const fileBuf = fs.readFileSync(filePath);
+   const ext = path.extname(filePath).slice(1).toLowerCase() || "ogg";
+   const mimeMap = {
+     oga: "audio/ogg", ogg: "audio/ogg", opus: "audio/ogg",
+     mp3: "audio/mpeg", m4a: "audio/mp4", mp4: "audio/mp4",
+     wav: "audio/wav", webm: "audio/webm",
+   };
+   const blob = new Blob([fileBuf], { type: mimeMap[ext] || "audio/ogg" });
+
+   const form = new FormData();
+   form.append("file", blob, `audio.${ext}`);
+   form.append("model", "whisper-1");
+
+   const res = await fetch("https://api.openai.com/v1/audio/transcriptions", {
+     method: "POST",
+     headers: { Authorization: `Bearer ${apiKey}` },
+     body: form,
+   });
+   if (!res.ok) {
+     const err = await res.text().catch(() => "");
+     throw new Error(`Whisper API ${res.status}: ${err.slice(0, 200)}`);
+   }
+   const json = await res.json();
+   return {
+     ok: true,
+     backend: "openai",
+     text: String(json.text || "").trim(),
+     language: null,
+     language_probability: null,
+     duration: null,
+     model: "whisper-1",
+   };
+ }
+
+ // ---------------------------------------------------------------------------
+ // Public API
+ // ---------------------------------------------------------------------------
+
+ /**
+  * Transcribe an audio file using the configured backend.
+  * Returns { ok, backend, text, language?, language_probability?, duration?, model? }.
+  *
+  * @param {string} filePath absolute path to audio file
+  * @param {object} overrides optional: { provider, model, language, ... }
+  */
+ export async function transcribe(filePath, overrides = {}) {
+   if (!filePath || !fs.existsSync(filePath)) {
+     throw new Error(`transcribe: file not found: ${filePath}`);
+   }
+   const cfg = await getConfig();
+   const provider = overrides.provider || cfg.provider;
+   const localOpts = { ...cfg.local, ...overrides };
+
+   if (provider === "openai") {
+     return transcribeOpenAI(filePath, cfg.openaiKey);
+   }
+   if (provider === "local") {
+     return transcribeLocal(filePath, localOpts);
+   }
+
+   // auto: local first, fall back to openai
+   try {
+     return await transcribeLocal(filePath, localOpts);
+   } catch (localErr) {
+     if (!cfg.openaiKey) {
+       throw new Error(
+         `local transcription failed and no OpenAI fallback available: ${localErr.message}`
+       );
+     }
+     return transcribeOpenAI(filePath, cfg.openaiKey);
+   }
+ }
+
+ // ---------------------------------------------------------------------------
+ // Diagnostics
+ // ---------------------------------------------------------------------------
+
+ export const TRANSCRIPTION_PATHS = {
+   python_helper: PYTHON_HELPER,
+ };
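
Given the config keys documented in the module's header comment, a ~/.apx/config.json fragment that pins the local backend might look like this (a sketch; the model and language values are illustrative):

{
  "transcription": {
    "provider": "local",
    "local": { "model": "small", "language": "es", "beam_size": 5 }
  }
}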
package/daemon/whisper-transcribe.py ADDED
@@ -0,0 +1,71 @@
+ #!/usr/bin/env python3
+ """
+ Local audio transcription via faster-whisper. Mirrors the implementation in
+ the Panda project (transcription_service.py): same default model "medium",
+ device cpu, compute_type int8, beam_size 5. (Panda keeps a lazy singleton
+ model cache; this helper loads the model once per subprocess invocation.)
+
+ Invoked by the APX daemon (Node) as a subprocess. Args:
+   whisper-transcribe.py <audio_path> [--model medium] [--language auto] [--device cpu] [--compute-type int8] [--beam-size 5]
+
+ Outputs JSON on stdout:
+   { "ok": true, "text": "...", "language": "es", "language_probability": 0.98, "duration": 12.4 }
+   { "ok": false, "error": "..." }
+ """
+ import argparse
+ import json
+ import os
+ import sys
+
+
+ def main() -> int:
+     parser = argparse.ArgumentParser()
+     parser.add_argument("audio_path")
+     parser.add_argument("--model", default="medium")
+     parser.add_argument("--language", default="auto")
+     parser.add_argument("--device", default="cpu")
+     parser.add_argument("--compute-type", dest="compute_type", default="int8")
+     parser.add_argument("--beam-size", dest="beam_size", type=int, default=5)
+     args = parser.parse_args()
+
+     if not os.path.exists(args.audio_path):
+         print(json.dumps({"ok": False, "error": f"file not found: {args.audio_path}"}))
+         return 1
+
+     try:
+         from faster_whisper import WhisperModel
+     except ImportError as e:
+         print(json.dumps({
+             "ok": False,
+             "error": "faster-whisper not installed. Run: pip3 install faster-whisper",
+             "import_error": str(e),
+         }))
+         return 1
+
+     try:
+         model = WhisperModel(args.model, device=args.device, compute_type=args.compute_type)
+     except Exception as e:
+         print(json.dumps({"ok": False, "error": f"failed to load model '{args.model}': {e}"}))
+         return 1
+
+     language = None if args.language == "auto" else args.language
+
+     try:
+         segments, info = model.transcribe(args.audio_path, beam_size=args.beam_size, language=language)
+         text = " ".join(seg.text.strip() for seg in segments).strip()
+         print(json.dumps({
+             "ok": True,
+             "text": text,
+             "language": info.language,
+             "language_probability": round(info.language_probability, 4),
+             "duration": round(info.duration, 2),
+             "model": args.model,
+             "compute_type": args.compute_type,
+         }))
+         return 0
+     except Exception as e:
+         print(json.dumps({"ok": False, "error": f"transcription failed: {e}"}))
+         return 1
+
+
+ if __name__ == "__main__":
+     sys.exit(main())
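
The helper can also be run by hand for debugging. Per its docstring, an invocation and its stdout look roughly like this (the file path and transcript are illustrative):

$ python3 whisper-transcribe.py /tmp/voice-note.ogg --model small
{"ok": true, "text": "hello, how are you?", "language": "en", "language_probability": 0.98, "duration": 3.2, "model": "small", "compute_type": "int8"}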