@agentprojectcontext/apx 1.13.1 → 1.14.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/daemon/plugins/telegram.js +75 -43
- package/src/daemon/super-agent-tools/index.js +2 -0
- package/src/daemon/super-agent-tools/tools/send-telegram.js +85 -15
- package/src/daemon/super-agent-tools/tools/transcribe-audio.js +61 -0
- package/src/daemon/super-agent.js +2 -1
- package/src/daemon/tools/browser.js +19 -1
- package/src/daemon/tools/registry.js +9 -7
- package/src/daemon/transcription.js +193 -0
- package/src/daemon/whisper-transcribe.py +71 -0
package/package.json
CHANGED
|
@@ -36,6 +36,7 @@ import { stripThinking } from "../thinking.js";
|
|
|
36
36
|
import { getRecentTelegramTurnsFromFs, appendGlobalMessage } from "../../core/messages-store.js";
|
|
37
37
|
import { readAgents } from "../../core/parser.js";
|
|
38
38
|
import { buildAgentSystem } from "../../core/agent-system.js";
|
|
39
|
+
import { transcribe as transcribeAudioFile } from "../transcription.js";
|
|
39
40
|
|
|
40
41
|
const API_BASE = "https://api.telegram.org";
|
|
41
42
|
const nowIso = () => new Date().toISOString().replace(/\.\d{3}Z$/, "Z");
|
|
@@ -112,6 +113,40 @@ export async function sendVoice(token, chatId, audio, { caption, duration } = {}
|
|
|
112
113
|
* @param {string} [opts.title]
|
|
113
114
|
* @param {string} [opts.performer]
|
|
114
115
|
*/
|
|
116
|
+
/**
|
|
117
|
+
* Send any file as a Telegram document (PDF, zip, txt, etc).
|
|
118
|
+
* @param {string} token
|
|
119
|
+
* @param {string|number} chatId
|
|
120
|
+
* @param {string|Buffer} document Path or Buffer of document data
|
|
121
|
+
* @param {object} [opts]
|
|
122
|
+
* @param {string} [opts.caption]
|
|
123
|
+
* @param {string} [opts.filename] override filename for Buffer input
|
|
124
|
+
* @param {string} [opts.mime_type]
|
|
125
|
+
*/
|
|
126
|
+
export async function sendDocument(token, chatId, document, { caption, filename, mime_type } = {}) {
|
|
127
|
+
const url = `${API_BASE}/bot${token}/sendDocument`;
|
|
128
|
+
const form = new FormData();
|
|
129
|
+
form.append("chat_id", String(chatId));
|
|
130
|
+
if (caption) form.append("caption", caption);
|
|
131
|
+
|
|
132
|
+
// URL string → let Telegram fetch it
|
|
133
|
+
if (typeof document === "string" && /^https?:\/\//.test(document)) {
|
|
134
|
+
form.append("document", document);
|
|
135
|
+
} else {
|
|
136
|
+
const buf = Buffer.isBuffer(document) ? document : fs.readFileSync(document);
|
|
137
|
+
const name =
|
|
138
|
+
filename ||
|
|
139
|
+
(typeof document === "string" ? path.basename(document) : "document.bin");
|
|
140
|
+
const blob = new Blob([buf], { type: mime_type || "application/octet-stream" });
|
|
141
|
+
form.append("document", blob, name);
|
|
142
|
+
}
|
|
143
|
+
|
|
144
|
+
const res = await fetch(url, { method: "POST", body: form });
|
|
145
|
+
const json = await res.json();
|
|
146
|
+
if (!json.ok) throw new Error(`sendDocument failed: ${json.description || res.status}`);
|
|
147
|
+
return json.result;
|
|
148
|
+
}
|
|
149
|
+
|
|
115
150
|
export async function sendAudio(token, chatId, audio, { caption, title, performer } = {}) {
|
|
116
151
|
const url = `${API_BASE}/bot${token}/sendAudio`;
|
|
117
152
|
const form = new FormData();
|
|
@@ -131,47 +166,9 @@ export async function sendAudio(token, chatId, audio, { caption, title, performe
|
|
|
131
166
|
return json.result;
|
|
132
167
|
}
|
|
133
168
|
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
* Returns the transcribed text, or throws if no key / API failure.
|
|
138
|
-
*/
|
|
139
|
-
async function transcribeAudio(filePath) {
|
|
140
|
-
let apiKey = process.env.OPENAI_API_KEY;
|
|
141
|
-
if (!apiKey) {
|
|
142
|
-
try {
|
|
143
|
-
const { readConfig } = await import("../../core/config.js");
|
|
144
|
-
apiKey = readConfig()?.engines?.openai?.api_key || "";
|
|
145
|
-
} catch { /* ignore */ }
|
|
146
|
-
}
|
|
147
|
-
if (!apiKey) throw new Error("OPENAI_API_KEY not set (env or engines.openai.api_key)");
|
|
148
|
-
|
|
149
|
-
const fileBuf = fs.readFileSync(filePath);
|
|
150
|
-
const ext = path.extname(filePath).slice(1).toLowerCase() || "ogg";
|
|
151
|
-
const mimeMap = {
|
|
152
|
-
oga: "audio/ogg", ogg: "audio/ogg", opus: "audio/ogg",
|
|
153
|
-
mp3: "audio/mpeg", m4a: "audio/mp4", mp4: "audio/mp4",
|
|
154
|
-
wav: "audio/wav", webm: "audio/webm",
|
|
155
|
-
};
|
|
156
|
-
const mime = mimeMap[ext] || "audio/ogg";
|
|
157
|
-
const blob = new Blob([fileBuf], { type: mime });
|
|
158
|
-
|
|
159
|
-
const form = new FormData();
|
|
160
|
-
form.append("file", blob, `audio.${ext}`);
|
|
161
|
-
form.append("model", "whisper-1");
|
|
162
|
-
|
|
163
|
-
const res = await fetch("https://api.openai.com/v1/audio/transcriptions", {
|
|
164
|
-
method: "POST",
|
|
165
|
-
headers: { Authorization: `Bearer ${apiKey}` },
|
|
166
|
-
body: form,
|
|
167
|
-
});
|
|
168
|
-
if (!res.ok) {
|
|
169
|
-
const err = await res.text().catch(() => "");
|
|
170
|
-
throw new Error(`Whisper ${res.status}: ${err.slice(0, 200)}`);
|
|
171
|
-
}
|
|
172
|
-
const json = await res.json();
|
|
173
|
-
return String(json.text || "").trim();
|
|
174
|
-
}
|
|
169
|
+
// Audio transcription is delegated to the central dispatcher
|
|
170
|
+
// (../transcription.js) which handles local (faster-whisper via Python) +
|
|
171
|
+
// OpenAI cloud fallback. See that module for config keys.
|
|
175
172
|
|
|
176
173
|
/**
|
|
177
174
|
* Download a file from Telegram servers.
|
|
@@ -444,6 +441,7 @@ class ChannelPoller {
|
|
|
444
441
|
let localPath = null;
|
|
445
442
|
let transcript = "";
|
|
446
443
|
let transcribeError = null;
|
|
444
|
+
let transcribeBackend = null;
|
|
447
445
|
try {
|
|
448
446
|
localPath = await downloadTelegramFile(token, incomingAudio.file_id, mediaDir);
|
|
449
447
|
this.log(`telegram[${this.channel.name}] audio saved: ${localPath}`);
|
|
@@ -452,8 +450,10 @@ class ChannelPoller {
|
|
|
452
450
|
}
|
|
453
451
|
if (localPath) {
|
|
454
452
|
try {
|
|
455
|
-
|
|
456
|
-
|
|
453
|
+
const result = await transcribeAudioFile(localPath);
|
|
454
|
+
transcript = result.text || "";
|
|
455
|
+
transcribeBackend = result.backend;
|
|
456
|
+
this.log(`telegram[${this.channel.name}] audio transcribed via ${transcribeBackend} (${transcript.length} chars, lang=${result.language || "?"})`);
|
|
457
457
|
} catch (e) {
|
|
458
458
|
transcribeError = e.message;
|
|
459
459
|
this.log(`telegram[${this.channel.name}] audio transcription failed: ${e.message}`);
|
|
@@ -480,6 +480,7 @@ class ChannelPoller {
|
|
|
480
480
|
file_id: incomingAudio.file_id,
|
|
481
481
|
duration: incomingAudio.duration,
|
|
482
482
|
mime_type: incomingAudio.mime_type,
|
|
483
|
+
transcription_backend: transcribeBackend,
|
|
483
484
|
transcription_error: transcribeError,
|
|
484
485
|
},
|
|
485
486
|
});
|
|
@@ -757,6 +758,14 @@ class ChannelPoller {
|
|
|
757
758
|
return sendVoice(token, target, audio, { caption, duration });
|
|
758
759
|
}
|
|
759
760
|
|
|
761
|
+
/** Send a document (PDF, zip, etc) via this channel */
|
|
762
|
+
async _sendDocument({ chat_id, document, caption, filename, mime_type }) {
|
|
763
|
+
const token = resolveBotToken(this.channel);
|
|
764
|
+
if (!token) throw new Error(`channel ${this.channel.name}: no bot_token`);
|
|
765
|
+
const target = chat_id || resolveChatId(this.channel);
|
|
766
|
+
return sendDocument(token, target, document, { caption, filename, mime_type });
|
|
767
|
+
}
|
|
768
|
+
|
|
760
769
|
/** Send an audio file via this channel */
|
|
761
770
|
async _sendAudio({ chat_id, audio, caption, title, performer }) {
|
|
762
771
|
const token = resolveBotToken(this.channel);
|
|
@@ -881,6 +890,29 @@ export default {
|
|
|
881
890
|
return result;
|
|
882
891
|
},
|
|
883
892
|
|
|
893
|
+
/**
|
|
894
|
+
* Send a document (PDF, zip, txt, generated reports, etc).
|
|
895
|
+
* document: local file path, Buffer, or public https URL.
|
|
896
|
+
*/
|
|
897
|
+
async sendDocument({ channel: channelName, chat_id, document, caption, filename, mime_type, author = "apx" }) {
|
|
898
|
+
const p =
|
|
899
|
+
(channelName && pollers.find((pp) => pp.channel.name === channelName)) ||
|
|
900
|
+
pollers.find((pp) => resolveBotToken(pp.channel)) ||
|
|
901
|
+
null;
|
|
902
|
+
if (!p) throw new Error("no telegram channel available");
|
|
903
|
+
const result = await p._sendDocument({ chat_id, document, caption, filename, mime_type });
|
|
904
|
+
appendGlobalMessage({
|
|
905
|
+
channel: "telegram",
|
|
906
|
+
direction: "out",
|
|
907
|
+
type: "document",
|
|
908
|
+
actor_id: author,
|
|
909
|
+
author,
|
|
910
|
+
body: caption || `[document${filename ? " " + filename : ""}]`,
|
|
911
|
+
meta: { chat_id: chat_id || resolveChatId(p.channel), tg_channel: p.channel.name, filename, mime_type },
|
|
912
|
+
});
|
|
913
|
+
return result;
|
|
914
|
+
},
|
|
915
|
+
|
|
884
916
|
/**
|
|
885
917
|
* Send an audio file (MP3/M4A — shown in music player).
|
|
886
918
|
* audio: local file path or Buffer
|
|
@@ -21,6 +21,7 @@ import setPermissionMode from "./tools/set-permission-mode.js";
|
|
|
21
21
|
import searchFiles from "./tools/search-files.js";
|
|
22
22
|
import listSkills from "./tools/list-skills.js";
|
|
23
23
|
import loadSkill from "./tools/load-skill.js";
|
|
24
|
+
import transcribeAudio from "./tools/transcribe-audio.js";
|
|
24
25
|
import { createPermissionGuard } from "./helpers.js";
|
|
25
26
|
import { buildBridgedTools, DEFAULT_CATEGORIES } from "./registry-bridge.js";
|
|
26
27
|
|
|
@@ -48,6 +49,7 @@ const NATIVE_TOOLS = [
|
|
|
48
49
|
searchFiles,
|
|
49
50
|
listSkills,
|
|
50
51
|
loadSkill,
|
|
52
|
+
transcribeAudio,
|
|
51
53
|
];
|
|
52
54
|
|
|
53
55
|
// Registry-backed bridges. Categories can be overridden per-process via env
|
|
@@ -1,12 +1,41 @@
|
|
|
1
1
|
import { confirmedProperty } from "../helpers.js";
|
|
2
2
|
|
|
3
|
+
function decodeBase64(b64) {
|
|
4
|
+
const clean = String(b64).replace(/^data:[a-z/-]+;base64,/, "");
|
|
5
|
+
return Buffer.from(clean, "base64");
|
|
6
|
+
}
|
|
7
|
+
|
|
3
8
|
function decodePhoto({ photo_base64, photo_path, photo_url }) {
|
|
4
|
-
if (photo_url)
|
|
5
|
-
if (photo_path)
|
|
6
|
-
if (photo_base64)
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
9
|
+
if (photo_url) return String(photo_url);
|
|
10
|
+
if (photo_path) return String(photo_path);
|
|
11
|
+
if (photo_base64) return decodeBase64(photo_base64);
|
|
12
|
+
return null;
|
|
13
|
+
}
|
|
14
|
+
|
|
15
|
+
function decodeDocument({ document_base64, document_path, document_url }) {
|
|
16
|
+
if (document_url) return String(document_url);
|
|
17
|
+
if (document_path) return String(document_path);
|
|
18
|
+
if (document_base64) return decodeBase64(document_base64);
|
|
19
|
+
return null;
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
/**
|
|
23
|
+
* Detect the common LLM mistake of embedding raw base64 in the text field
|
|
24
|
+
* (often wrapped in markdown image syntax). Telegram does NOT render those —
|
|
25
|
+
* it just shows the literal characters. Fail fast with a clear hint.
|
|
26
|
+
*/
|
|
27
|
+
function detectBase64InText(text) {
|
|
28
|
+
if (!text || typeof text !== "string") return null;
|
|
29
|
+
if (/!\[[^\]]*\]\(data:image\/[a-z]+;base64,/i.test(text)) {
|
|
30
|
+
return "markdown image with data URI";
|
|
31
|
+
}
|
|
32
|
+
if (/data:image\/[a-z]+;base64,/i.test(text)) {
|
|
33
|
+
return "data URI";
|
|
34
|
+
}
|
|
35
|
+
// Long runs of base64-looking chars (>500 contiguous) — almost certainly a
|
|
36
|
+
// dumped image
|
|
37
|
+
if (/[A-Za-z0-9+/=]{500,}/.test(text)) {
|
|
38
|
+
return "raw base64 blob (>500 chars)";
|
|
10
39
|
}
|
|
11
40
|
return null;
|
|
12
41
|
}
|
|
@@ -18,28 +47,61 @@ export default {
|
|
|
18
47
|
function: {
|
|
19
48
|
name: "send_telegram",
|
|
20
49
|
description:
|
|
21
|
-
"Send a Telegram message via the daemon's Telegram plugin.
|
|
50
|
+
"Send a Telegram message via the daemon's Telegram plugin. STRICT rule: to attach an image use the photo_* params; to attach a file use the document_* params — NEVER paste base64 or a data URI inside `text` (Telegram does not render markdown images / data URIs, the recipient sees the literal base64). After browser_screenshot, pass its `base64` field directly to photo_base64 here (not in text). The text field becomes the caption when media is attached.",
|
|
22
51
|
parameters: {
|
|
23
52
|
type: "object",
|
|
24
53
|
properties: {
|
|
25
|
-
channel:
|
|
26
|
-
chat_id:
|
|
27
|
-
text:
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
54
|
+
channel: { type: "string", description: "telegram channel name; omit for default" },
|
|
55
|
+
chat_id: { type: "string", description: "destination chat id; omit to use channel default" },
|
|
56
|
+
text: {
|
|
57
|
+
type: "string",
|
|
58
|
+
description:
|
|
59
|
+
"Plain-text body (becomes the caption when a photo_* or document_* is attached). MUST NOT contain base64, data URIs, or markdown image syntax like  — use photo_base64 for that.",
|
|
60
|
+
},
|
|
61
|
+
// --- image attachments ---
|
|
62
|
+
photo_base64: {
|
|
63
|
+
type: "string",
|
|
64
|
+
description:
|
|
65
|
+
"raw base64 PNG/JPG (or 'data:image/...;base64,...'). Pass the `base64` field from browser_screenshot directly here.",
|
|
66
|
+
},
|
|
67
|
+
photo_path: { type: "string", description: "absolute filesystem path to an image file" },
|
|
68
|
+
photo_url: { type: "string", description: "public https URL of an image" },
|
|
69
|
+
// --- document attachments (PDF, txt, zip, etc) ---
|
|
70
|
+
document_base64: { type: "string", description: "raw base64 of a file" },
|
|
71
|
+
document_path: { type: "string", description: "absolute filesystem path to any file (PDF, txt, zip, .csv...)" },
|
|
72
|
+
document_url: { type: "string", description: "public https URL of a file" },
|
|
73
|
+
filename: { type: "string", description: "filename to show in Telegram when sending a document (Buffer-style input)" },
|
|
74
|
+
mime_type: { type: "string", description: "optional MIME type for the document" },
|
|
75
|
+
confirmed: confirmedProperty("true only after explicit user confirmation for this exact outbound message"),
|
|
32
76
|
},
|
|
33
77
|
required: ["text"],
|
|
34
78
|
},
|
|
35
79
|
},
|
|
36
80
|
},
|
|
37
|
-
makeHandler: ({ plugins, requirePermission }) => async (
|
|
81
|
+
makeHandler: ({ plugins, requirePermission }) => async (args = {}) => {
|
|
82
|
+
const {
|
|
83
|
+
channel, chat_id, text,
|
|
84
|
+
photo_base64, photo_path, photo_url,
|
|
85
|
+
document_base64, document_path, document_url,
|
|
86
|
+
filename, mime_type,
|
|
87
|
+
confirmed = false,
|
|
88
|
+
} = args;
|
|
89
|
+
|
|
38
90
|
requirePermission("send_telegram", { dangerous: true, confirmed });
|
|
39
91
|
if (!plugins) throw new Error("plugins unavailable");
|
|
40
92
|
const telegram = plugins.get("telegram");
|
|
41
93
|
if (!telegram) throw new Error("telegram plugin not loaded");
|
|
42
94
|
|
|
95
|
+
// Defensive: catch the classic mistake of dumping base64 into text.
|
|
96
|
+
const bad = detectBase64InText(text);
|
|
97
|
+
if (bad) {
|
|
98
|
+
throw new Error(
|
|
99
|
+
`send_telegram: refusing to send — text appears to contain ${bad}. ` +
|
|
100
|
+
`Telegram does not render data URIs or markdown images. ` +
|
|
101
|
+
`Pass the base64 in photo_base64 (NOT text). Set text to a short caption like "Captura de localhost:8801".`
|
|
102
|
+
);
|
|
103
|
+
}
|
|
104
|
+
|
|
43
105
|
const photo = decodePhoto({ photo_base64, photo_path, photo_url });
|
|
44
106
|
if (photo) {
|
|
45
107
|
const result = await telegram.sendPhoto({
|
|
@@ -48,6 +110,14 @@ export default {
|
|
|
48
110
|
return { ok: true, kind: "photo", message_id: result.message_id };
|
|
49
111
|
}
|
|
50
112
|
|
|
113
|
+
const document = decodeDocument({ document_base64, document_path, document_url });
|
|
114
|
+
if (document) {
|
|
115
|
+
const result = await telegram.sendDocument({
|
|
116
|
+
channel, chat_id, document, caption: text, filename, mime_type, author: "apx",
|
|
117
|
+
});
|
|
118
|
+
return { ok: true, kind: "document", message_id: result.message_id, filename };
|
|
119
|
+
}
|
|
120
|
+
|
|
51
121
|
const result = await telegram.send({ channel, chat_id, text, author: "apx" });
|
|
52
122
|
return { ok: true, kind: "text", message_id: result.message_id };
|
|
53
123
|
},
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
import fs from "node:fs";
|
|
2
|
+
import os from "node:os";
|
|
3
|
+
import path from "node:path";
|
|
4
|
+
import crypto from "node:crypto";
|
|
5
|
+
import { transcribe } from "../../transcription.js";
|
|
6
|
+
|
|
7
|
+
export default {
|
|
8
|
+
name: "transcribe_audio",
|
|
9
|
+
schema: {
|
|
10
|
+
type: "function",
|
|
11
|
+
function: {
|
|
12
|
+
name: "transcribe_audio",
|
|
13
|
+
description:
|
|
14
|
+
"Transcribe an audio file to text. Default backend is local faster-whisper (model 'medium' on CPU with int8 quantization), with automatic fallback to OpenAI Whisper API if local fails. Pass file_path for a file on disk, or base64 for raw audio bytes (will be written to a temp file). Override provider/model/language as needed.",
|
|
15
|
+
parameters: {
|
|
16
|
+
type: "object",
|
|
17
|
+
properties: {
|
|
18
|
+
file_path: { type: "string", description: "absolute path to audio file (.ogg, .mp3, .m4a, .wav, .webm, .opus)" },
|
|
19
|
+
base64: { type: "string", description: "alternative to file_path — raw base64 audio bytes (or 'data:audio/...;base64,...' data URI)" },
|
|
20
|
+
format: { type: "string", description: "file extension hint when using base64 (default 'ogg')" },
|
|
21
|
+
provider: { type: "string", description: "override the configured provider: 'auto' | 'local' | 'openai'" },
|
|
22
|
+
model: { type: "string", description: "local model size: tiny | base | small | medium | large | large-v2 | large-v3 (default medium)" },
|
|
23
|
+
language: { type: "string", description: "ISO 639-1 code (e.g. 'es', 'en') or 'auto' for detection" },
|
|
24
|
+
device: { type: "string", description: "local device: cpu | cuda (default cpu)" },
|
|
25
|
+
compute_type: { type: "string", description: "local quantization: int8 | int8_float16 | float16 | float32 (default int8)" },
|
|
26
|
+
},
|
|
27
|
+
},
|
|
28
|
+
},
|
|
29
|
+
},
|
|
30
|
+
makeHandler: () => async ({ file_path, base64, format = "ogg", provider, model, language, device, compute_type } = {}) => {
|
|
31
|
+
if (!file_path && !base64) throw new Error("transcribe_audio: file_path or base64 required");
|
|
32
|
+
|
|
33
|
+
let pathToUse = file_path;
|
|
34
|
+
let cleanupTmp = false;
|
|
35
|
+
|
|
36
|
+
if (!pathToUse && base64) {
|
|
37
|
+
const clean = String(base64).replace(/^data:audio\/[a-z]+;base64,/, "");
|
|
38
|
+
const buf = Buffer.from(clean, "base64");
|
|
39
|
+
const tmpDir = path.join(os.tmpdir(), "apx-transcribe");
|
|
40
|
+
fs.mkdirSync(tmpDir, { recursive: true });
|
|
41
|
+
const id = crypto.randomBytes(6).toString("hex");
|
|
42
|
+
pathToUse = path.join(tmpDir, `audio-${id}.${String(format).replace(/^\./, "") || "ogg"}`);
|
|
43
|
+
fs.writeFileSync(pathToUse, buf);
|
|
44
|
+
cleanupTmp = true;
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
try {
|
|
48
|
+
const overrides = {};
|
|
49
|
+
if (provider) overrides.provider = provider;
|
|
50
|
+
if (model) overrides.model = model;
|
|
51
|
+
if (language) overrides.language = language;
|
|
52
|
+
if (device) overrides.device = device;
|
|
53
|
+
if (compute_type) overrides.compute_type = compute_type;
|
|
54
|
+
return await transcribe(pathToUse, overrides);
|
|
55
|
+
} finally {
|
|
56
|
+
if (cleanupTmp) {
|
|
57
|
+
try { fs.unlinkSync(pathToUse); } catch { /* ignore */ }
|
|
58
|
+
}
|
|
59
|
+
}
|
|
60
|
+
},
|
|
61
|
+
};
|
|
@@ -65,7 +65,8 @@ HARD RULES (do not deviate):
|
|
|
65
65
|
18. **NO EMPTY RESPONSES**: Never respond with only text when you have tools available and the user is asking you to DO something. Call the tool FIRST, then explain. Never say "I'll do X" without immediately calling the tool. Empty acknowledgments ("ok", "entendido", "dame un minuto", "voy", "checking", "stand by") without a tool call are invalid responses — they will be re-prompted and waste a turn.
|
|
66
66
|
19. **CWD RULE**: When the channel context includes a "CWD: <path>" line, that is the user's current working directory. References to "este directorio", "este proyecto", "esta carpeta", "acá", "aquí", "this directory", "this project", "current dir/folder" all mean that exact CWD path. Use it as the path argument directly — DO NOT ask the user "what's the path?" when CWD is already given. Example: if user says "agregá este proyecto a la lista", call add_project({path: <CWD>}) immediately.
|
|
67
67
|
20. **NO MANUAL SCAFFOLDING**: To register or scaffold a project, ALWAYS use add_project — it auto-creates AGENTS.md and .apc/project.json when missing (one call, atomic). NEVER write AGENTS.md, .apc/project.json, or any APC scaffold file by hand via run_shell / write_file / shell pipes. The schema must come from the official initApf scaffold, not improvised. If add_project errors, report the error to the user — don't try to work around it with shell hacks. Same for any other APC-managed file (.apc/agents/*, .apc/skills/*, etc.) — use the dedicated tool, never raw filesystem writes.
|
|
68
|
-
21. **SKILLS — ON DEMAND**: The "# Available skills" section below lists every skill available to you (slug + description, NO body). When the user asks about specific APX/APC commands, project structure, agent runtimes, or anything where exact syntax or detailed behavior matches a skill description (in ANY language — match semantically, not by keyword), call load_skill({slug}) to fetch the full markdown body. If a CWD is in the contextNote, pass it as project_path so project-scoped skills resolve. If the user explicitly asks "what skills do you have?", you can either read the catalog below directly OR call list_skills to get a fresh enumeration. Do NOT load skills for trivial / unrelated questions — that wastes tokens. Don't guess CLI syntax when a skill can tell you; load it
|
|
68
|
+
21. **SKILLS — ON DEMAND**: The "# Available skills" section below lists every skill available to you (slug + description, NO body). When the user asks about specific APX/APC commands, project structure, agent runtimes, or anything where exact syntax or detailed behavior matches a skill description (in ANY language — match semantically, not by keyword), call load_skill({slug}) to fetch the full markdown body. If a CWD is in the contextNote, pass it as project_path so project-scoped skills resolve. If the user explicitly asks "what skills do you have?", you can either read the catalog below directly OR call list_skills to get a fresh enumeration. Do NOT load skills for trivial / unrelated questions — that wastes tokens. Don't guess CLI syntax when a skill can tell you; load it.
|
|
69
|
+
22. **NEVER PASTE BASE64 OR DATA URIs IN MESSAGE TEXT**: When you need to send an image, audio, or file via Telegram (or any channel), you MUST pass it via the dedicated parameter — NEVER embed it in the text field. Concretely: after browser_screenshot returns its base64 field, call send_telegram({text: "<short caption>", photo_base64: "<that base64>"}). Do NOT write text like 'Aquí está: ' — Telegram (and most chat clients) do NOT render data URIs or markdown images; the user sees thousands of garbage characters. Same for files: use document_path / document_base64 / document_url, NOT the text field. The text field is exclusively for human-readable prose (and becomes the caption when media is attached). If unsure, save the image to /tmp/screenshot-<ts>.png first (browser_screenshot supports save_to_tmp=true and returns a path field) and pass that path to send_telegram via photo_path — never inline the bytes in text.`;
|
|
69
70
|
|
|
70
71
|
function isShortConfirmation(text) {
|
|
71
72
|
return /^(yes|y|si|si dale|dale|ok|okay|confirm|confirmed|go|proceed|do it)\b/i
|
|
@@ -197,7 +197,7 @@ export async function browser_navigate({ url, launch_options, allow_dangerous }
|
|
|
197
197
|
};
|
|
198
198
|
}
|
|
199
199
|
|
|
200
|
-
export async function browser_screenshot({ selector, full_page = false, width, height, encoded = false } = {}) {
|
|
200
|
+
export async function browser_screenshot({ selector, full_page = false, width, height, encoded = false, save_path, save_to_tmp = false } = {}) {
|
|
201
201
|
const page = await ensureBrowser();
|
|
202
202
|
if (width || height) {
|
|
203
203
|
await page.setViewport({
|
|
@@ -218,12 +218,30 @@ export async function browser_screenshot({ selector, full_page = false, width, h
|
|
|
218
218
|
throw new Error(`Screenshot too large: ${Math.round(size / 1024)}KB (max ${Math.round(MAX_SCREENSHOT_BYTES / 1024)}KB)`);
|
|
219
219
|
}
|
|
220
220
|
|
|
221
|
+
// Optional disk write so the caller can pass `path` to e.g. send_telegram
|
|
222
|
+
// instead of shuttling base64 around.
|
|
223
|
+
let writtenPath = null;
|
|
224
|
+
if (save_path || save_to_tmp) {
|
|
225
|
+
const fs = await import("node:fs");
|
|
226
|
+
const path = await import("node:path");
|
|
227
|
+
const os = await import("node:os");
|
|
228
|
+
let target = save_path;
|
|
229
|
+
if (!target) {
|
|
230
|
+
const dir = path.join(os.tmpdir(), "apx-screenshots");
|
|
231
|
+
fs.mkdirSync(dir, { recursive: true });
|
|
232
|
+
target = path.join(dir, `screenshot-${Date.now()}.png`);
|
|
233
|
+
}
|
|
234
|
+
fs.writeFileSync(target, Buffer.from(String(buf), "base64"));
|
|
235
|
+
writtenPath = target;
|
|
236
|
+
}
|
|
237
|
+
|
|
221
238
|
return {
|
|
222
239
|
ok: true,
|
|
223
240
|
url: page.url(),
|
|
224
241
|
format: "png",
|
|
225
242
|
bytes: size,
|
|
226
243
|
base64: buf,
|
|
244
|
+
path: writtenPath,
|
|
227
245
|
data_uri: encoded ? `data:image/png;base64,${buf}` : undefined,
|
|
228
246
|
};
|
|
229
247
|
}
|
|
@@ -366,19 +366,21 @@ const TOOL_DEFINITIONS = [
|
|
|
366
366
|
{
|
|
367
367
|
name: "browser_screenshot",
|
|
368
368
|
category: "browser",
|
|
369
|
-
description: "Take a screenshot of the current browser page (or
|
|
369
|
+
description: "Take a screenshot of the current browser page (or an element via selector). Returns { base64, path?, bytes, url }. To send via Telegram, prefer `save_to_tmp: true` and pass the returned `path` to send_telegram({photo_path}); otherwise pass `base64` straight to send_telegram({photo_base64}). NEVER include the base64 in any text field — Telegram does not render it.",
|
|
370
370
|
endpoint: { method: "POST", path: "/tools/browser/screenshot" },
|
|
371
371
|
parameters: {
|
|
372
372
|
type: "object",
|
|
373
373
|
properties: {
|
|
374
|
-
selector:
|
|
375
|
-
full_page:
|
|
376
|
-
width:
|
|
377
|
-
height:
|
|
378
|
-
encoded:
|
|
374
|
+
selector: { type: "string", description: "CSS selector of element to capture. Omit for full viewport/page." },
|
|
375
|
+
full_page: { type: "boolean", default: false },
|
|
376
|
+
width: { type: "number", description: "Viewport width (capped at 1920)." },
|
|
377
|
+
height: { type: "number", description: "Viewport height (capped at 1080)." },
|
|
378
|
+
encoded: { type: "boolean", description: "Also return a data:image/png;base64 URI in response." },
|
|
379
|
+
save_path: { type: "string", description: "Absolute path to write the PNG. Returns it in `path`." },
|
|
380
|
+
save_to_tmp: { type: "boolean", description: "Auto-write to <os.tmpdir>/apx-screenshots/screenshot-<ts>.png. Returns the path." },
|
|
379
381
|
},
|
|
380
382
|
},
|
|
381
|
-
examples: [{}, { selector: "#hero" }],
|
|
383
|
+
examples: [{}, { selector: "#hero" }, { save_to_tmp: true }],
|
|
382
384
|
},
|
|
383
385
|
{
|
|
384
386
|
name: "browser_click",
|
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
// daemon/transcription.js
|
|
2
|
+
// Audio transcription dispatcher. Two backends:
|
|
3
|
+
//
|
|
4
|
+
// - LOCAL (faster-whisper via Python subprocess) — ported from Panda's
|
|
5
|
+
// transcription_service.py. Same defaults: model "medium", device "cpu",
|
|
6
|
+
// compute_type "int8", beam_size 5, auto language detection. Requires
|
|
7
|
+
// `pip3 install faster-whisper` on the host.
|
|
8
|
+
//
|
|
9
|
+
// - OPENAI (Whisper-1 cloud API) — needs OPENAI_API_KEY or
|
|
10
|
+
// engines.openai.api_key in config.
|
|
11
|
+
//
|
|
12
|
+
// Provider selection in ~/.apx/config.json:
|
|
13
|
+
// "transcription": {
|
|
14
|
+
// "provider": "auto" | "local" | "openai", // default "auto"
|
|
15
|
+
// "local": {
|
|
16
|
+
// "model": "medium", // tiny | base | small | medium | large | large-v2 | large-v3
|
|
17
|
+
// "device": "cpu", // cpu | cuda
|
|
18
|
+
// "compute_type": "int8", // int8 | int8_float16 | float16 | float32
|
|
19
|
+
// "language": "auto", // ISO 639-1 code or "auto"
|
|
20
|
+
// "beam_size": 5
|
|
21
|
+
// }
|
|
22
|
+
// }
|
|
23
|
+
//
|
|
24
|
+
// "auto" tries local first; on failure falls back to openai.
|
|
25
|
+
|
|
26
|
+
import fs from "node:fs";
|
|
27
|
+
import path from "node:path";
|
|
28
|
+
import { execFile } from "node:child_process";
|
|
29
|
+
import { fileURLToPath } from "node:url";
|
|
30
|
+
|
|
31
|
+
const __filename = fileURLToPath(import.meta.url);
|
|
32
|
+
const __dirname = path.dirname(__filename);
|
|
33
|
+
const PYTHON_HELPER = path.join(__dirname, "whisper-transcribe.py");
|
|
34
|
+
|
|
35
|
+
const DEFAULT_LOCAL = {
|
|
36
|
+
model: "medium",
|
|
37
|
+
device: "cpu",
|
|
38
|
+
compute_type: "int8",
|
|
39
|
+
language: "auto",
|
|
40
|
+
beam_size: 5,
|
|
41
|
+
};
|
|
42
|
+
|
|
43
|
+
// ---------------------------------------------------------------------------
|
|
44
|
+
// Config
|
|
45
|
+
// ---------------------------------------------------------------------------
|
|
46
|
+
|
|
47
|
+
async function getConfig() {
|
|
48
|
+
try {
|
|
49
|
+
const { readConfig } = await import("../core/config.js");
|
|
50
|
+
const cfg = readConfig() || {};
|
|
51
|
+
const t = cfg.transcription || {};
|
|
52
|
+
const openaiKey = cfg.engines?.openai?.api_key || process.env.OPENAI_API_KEY || "";
|
|
53
|
+
return {
|
|
54
|
+
provider: t.provider || "auto",
|
|
55
|
+
local: { ...DEFAULT_LOCAL, ...(t.local || {}) },
|
|
56
|
+
openaiKey,
|
|
57
|
+
};
|
|
58
|
+
} catch {
|
|
59
|
+
return {
|
|
60
|
+
provider: "auto",
|
|
61
|
+
local: { ...DEFAULT_LOCAL },
|
|
62
|
+
openaiKey: process.env.OPENAI_API_KEY || "",
|
|
63
|
+
};
|
|
64
|
+
}
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
// ---------------------------------------------------------------------------
|
|
68
|
+
// Local backend (Python + faster-whisper)
|
|
69
|
+
// ---------------------------------------------------------------------------
|
|
70
|
+
|
|
71
|
+
/**
 * Run the bundled faster-whisper helper script against an audio file.
 *
 * Spawns `python3 whisper-transcribe.py` and parses the JSON object the
 * helper prints as its last stdout line (earlier lines may be warnings).
 *
 * @param {string} filePath absolute path to the audio file
 * @param {object} opts local backend options: model, language, device,
 *                      compute_type, beam_size (missing keys use DEFAULT_LOCAL)
 * @returns {Promise<object>} { ok, backend: "local", text, language,
 *                             language_probability, duration, model, compute_type }
 */
function transcribeLocal(filePath, opts) {
  // new Promise is the right tool here: adapting execFile's callback API.
  return new Promise((resolve, reject) => {
    const args = [
      PYTHON_HELPER,
      filePath,
      "--model", String(opts.model || DEFAULT_LOCAL.model),
      "--language", String(opts.language || DEFAULT_LOCAL.language),
      "--device", String(opts.device || DEFAULT_LOCAL.device),
      "--compute-type", String(opts.compute_type || DEFAULT_LOCAL.compute_type),
      "--beam-size", String(opts.beam_size || DEFAULT_LOCAL.beam_size),
    ];
    // Whisper on CPU can be slow: generous 5-minute timeout, 16 MB stdout cap.
    execFile("python3", args, { maxBuffer: 16 * 1024 * 1024, timeout: 5 * 60_000 }, (err, stdout, stderr) => {
      if (err) {
        const tail = (stderr || err.message || "").slice(-300);
        return reject(new Error(`local transcription failed: ${tail}`));
      }
      // Parse only the last stdout line; normalize once so both the parse and
      // the error message work from the same string.
      const raw = String(stdout).trim();
      let parsed;
      try {
        parsed = JSON.parse(raw.split("\n").pop());
      } catch {
        return reject(new Error(`could not parse helper output: ${raw.slice(0, 300)}`));
      }
      if (!parsed.ok) return reject(new Error(parsed.error || "unknown local transcription error"));
      resolve({
        ok: true,
        backend: "local",
        text: parsed.text || "",
        language: parsed.language || null,
        language_probability: parsed.language_probability ?? null,
        duration: parsed.duration ?? null,
        model: parsed.model,
        compute_type: parsed.compute_type,
      });
    });
  });
}
|
|
106
|
+
|
|
107
|
+
// ---------------------------------------------------------------------------
|
|
108
|
+
// OpenAI backend (Whisper-1 cloud)
|
|
109
|
+
// ---------------------------------------------------------------------------
|
|
110
|
+
|
|
111
|
+
/**
 * Transcribe an audio file via OpenAI's hosted Whisper-1 endpoint.
 *
 * The cloud API does not report detected language or duration, so those
 * fields come back as null.
 *
 * @param {string} filePath path to the audio file to upload
 * @param {string} apiKey OpenAI API key (required)
 * @returns {Promise<object>} { ok, backend: "openai", text, language: null,
 *                             language_probability: null, duration: null, model }
 * @throws {Error} when no key is provided or the API responds non-2xx
 */
async function transcribeOpenAI(filePath, apiKey) {
  if (!apiKey) throw new Error("OPENAI_API_KEY not set (env or engines.openai.api_key)");

  const MIME_BY_EXT = {
    oga: "audio/ogg", ogg: "audio/ogg", opus: "audio/ogg",
    mp3: "audio/mpeg", m4a: "audio/mp4", mp4: "audio/mp4",
    wav: "audio/wav", webm: "audio/webm",
  };
  const ext = path.extname(filePath).slice(1).toLowerCase() || "ogg";
  const audioBlob = new Blob([fs.readFileSync(filePath)], { type: MIME_BY_EXT[ext] || "audio/ogg" });

  const form = new FormData();
  form.append("file", audioBlob, `audio.${ext}`);
  form.append("model", "whisper-1");

  const res = await fetch("https://api.openai.com/v1/audio/transcriptions", {
    method: "POST",
    headers: { Authorization: `Bearer ${apiKey}` },
    body: form,
  });
  if (!res.ok) {
    const detail = await res.text().catch(() => "");
    throw new Error(`Whisper API ${res.status}: ${detail.slice(0, 200)}`);
  }

  const payload = await res.json();
  return {
    ok: true,
    backend: "openai",
    text: String(payload.text || "").trim(),
    language: null,
    language_probability: null,
    duration: null,
    model: "whisper-1",
  };
}
|
|
147
|
+
|
|
148
|
+
// ---------------------------------------------------------------------------
|
|
149
|
+
// Public API
|
|
150
|
+
// ---------------------------------------------------------------------------
|
|
151
|
+
|
|
152
|
+
/**
 * Transcribe an audio file using the configured backend.
 *
 * Provider resolution: explicit override > config `transcription.provider`
 * > "auto". In auto mode the local faster-whisper backend is tried first,
 * with the OpenAI cloud backend as fallback when a key is available.
 *
 * Returns { ok, backend, text, language?, language_probability?, duration?, model? }.
 *
 * @param {string} filePath absolute path to audio file
 * @param {object} overrides optional: { provider, model, language, device, compute_type, beam_size }
 * @throws {Error} when the file is missing, or every eligible backend fails
 */
export async function transcribe(filePath, overrides = {}) {
  if (!filePath || !fs.existsSync(filePath)) {
    throw new Error(`transcribe: file not found: ${filePath}`);
  }
  const cfg = await getConfig();
  const provider = overrides.provider || cfg.provider;
  const localOpts = { ...cfg.local, ...overrides };

  if (provider === "openai") {
    return transcribeOpenAI(filePath, cfg.openaiKey);
  }
  if (provider === "local") {
    return transcribeLocal(filePath, localOpts);
  }

  // auto: local first, fall back to openai
  try {
    return await transcribeLocal(filePath, localOpts);
  } catch (localErr) {
    if (!cfg.openaiKey) {
      // Keep the local failure attached as `cause` so callers can inspect
      // the original error (stack, message) instead of just its text.
      throw new Error(
        `local transcription failed and no OpenAI fallback available: ${localErr.message}`,
        { cause: localErr }
      );
    }
    return transcribeOpenAI(filePath, cfg.openaiKey);
  }
}
|
|
186
|
+
|
|
187
|
+
// ---------------------------------------------------------------------------
|
|
188
|
+
// Diagnostics
|
|
189
|
+
// ---------------------------------------------------------------------------
|
|
190
|
+
|
|
191
|
+
// Diagnostics export: the resolved location of the bundled Python helper
// script, so external tooling can report/verify where it lives on disk.
export const TRANSCRIPTION_PATHS = {
  python_helper: PYTHON_HELPER,
};
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Local audio transcription via faster-whisper. Mirrors the implementation in
|
|
4
|
+
the Panda project (transcription_service.py): same default model "medium",
|
|
5
|
+
device cpu, compute_type int8, beam_size 5. Lazy singleton model cache.
|
|
6
|
+
|
|
7
|
+
Invoked by APX daemon (Node) as a subprocess. Args:
|
|
8
|
+
whisper-transcribe.py <audio_path> [--model medium] [--language auto] [--device cpu] [--compute-type int8] [--beam-size 5]
|
|
9
|
+
|
|
10
|
+
Outputs JSON on stdout:
|
|
11
|
+
{ "ok": true, "text": "...", "language": "es", "language_probability": 0.98, "duration": 12.4 }
|
|
12
|
+
{ "ok": false, "error": "..." }
|
|
13
|
+
"""
|
|
14
|
+
import argparse
|
|
15
|
+
import json
|
|
16
|
+
import os
|
|
17
|
+
import sys
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def main() -> int:
    """Transcribe one audio file with faster-whisper and print JSON to stdout.

    Returns 0 on success and 1 on any failure. Every outcome — including
    errors — is reported as a single JSON object on stdout so the Node
    caller can always parse the result.
    """
    def fail(message: str) -> int:
        # Errors go to stdout (not stderr) by contract with the Node caller.
        print(json.dumps({"ok": False, "error": message}))
        return 1

    parser = argparse.ArgumentParser()
    parser.add_argument("audio_path")
    parser.add_argument("--model", default="medium")
    parser.add_argument("--language", default="auto")
    parser.add_argument("--device", default="cpu")
    parser.add_argument("--compute-type", dest="compute_type", default="int8")
    parser.add_argument("--beam-size", dest="beam_size", type=int, default=5)
    args = parser.parse_args()

    if not os.path.exists(args.audio_path):
        return fail(f"file not found: {args.audio_path}")

    # Import lazily so a missing dependency yields a parseable JSON error
    # instead of an unhandled traceback.
    try:
        from faster_whisper import WhisperModel
    except ImportError as exc:
        print(json.dumps({
            "ok": False,
            "error": "faster-whisper not installed. Run: pip3 install faster-whisper",
            "import_error": str(exc),
        }))
        return 1

    try:
        model = WhisperModel(args.model, device=args.device, compute_type=args.compute_type)
    except Exception as exc:
        return fail(f"failed to load model '{args.model}': {exc}")

    # "auto" means let faster-whisper detect the language itself.
    chosen_language = None if args.language == "auto" else args.language

    try:
        segments, info = model.transcribe(args.audio_path, beam_size=args.beam_size, language=chosen_language)
        transcript = " ".join(segment.text.strip() for segment in segments).strip()
        print(json.dumps({
            "ok": True,
            "text": transcript,
            "language": info.language,
            "language_probability": round(info.language_probability, 4),
            "duration": round(info.duration, 2),
            "model": args.model,
            "compute_type": args.compute_type,
        }))
        return 0
    except Exception as exc:
        return fail(f"transcription failed: {exc}")
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
# Propagate main()'s status code as the process exit code so the Node
# caller can distinguish success (0) from failure (1).
if __name__ == "__main__":
    sys.exit(main())
|