@rubytech/taskmaster 1.2.0 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agents/auth-profiles/profiles.js +37 -0
- package/dist/agents/auth-profiles.js +1 -1
- package/dist/agents/pi-tools.policy.js +4 -1
- package/dist/agents/sandbox/constants.js +0 -1
- package/dist/agents/system-prompt.js +1 -4
- package/dist/agents/taskmaster-tools.js +14 -6
- package/dist/agents/tool-policy.js +5 -5
- package/dist/agents/tools/apikeys-tool.js +16 -5
- package/dist/agents/tools/contact-create-tool.js +59 -0
- package/dist/agents/tools/contact-delete-tool.js +48 -0
- package/dist/agents/tools/contact-update-tool.js +17 -2
- package/dist/agents/tools/file-delete-tool.js +137 -0
- package/dist/agents/tools/file-list-tool.js +127 -0
- package/dist/auto-reply/reply/commands-tts.js +7 -2
- package/dist/build-info.json +3 -3
- package/dist/cli/provision-seed.js +1 -3
- package/dist/commands/doctor-config-flow.js +13 -0
- package/dist/config/agent-tools-reconcile.js +53 -0
- package/dist/config/defaults.js +10 -1
- package/dist/config/legacy.migrations.part-3.js +26 -0
- package/dist/config/zod-schema.core.js +9 -1
- package/dist/config/zod-schema.js +1 -0
- package/dist/control-ui/assets/index-CPawOl_z.css +1 -0
- package/dist/control-ui/assets/{index-DwMopZij.js → index-DQ1kxYd4.js} +693 -598
- package/dist/control-ui/assets/index-DQ1kxYd4.js.map +1 -0
- package/dist/control-ui/index.html +2 -2
- package/dist/gateway/chat-sanitize.js +16 -2
- package/dist/gateway/config-reload.js +1 -0
- package/dist/gateway/media-http.js +32 -1
- package/dist/gateway/server-methods/apikeys.js +56 -4
- package/dist/gateway/server-methods/tts.js +11 -2
- package/dist/gateway/server.impl.js +15 -0
- package/dist/media-understanding/apply.js +35 -0
- package/dist/media-understanding/providers/deepgram/audio.js +1 -1
- package/dist/media-understanding/providers/google/audio.js +1 -1
- package/dist/media-understanding/providers/google/video.js +1 -1
- package/dist/media-understanding/providers/index.js +2 -0
- package/dist/media-understanding/providers/openai/audio.js +1 -1
- package/dist/media-understanding/providers/sherpa-onnx/index.js +10 -0
- package/dist/media-understanding/runner.js +61 -72
- package/dist/media-understanding/sherpa-onnx-local.js +223 -0
- package/dist/records/records-manager.js +10 -0
- package/dist/tts/tts.js +98 -10
- package/dist/web/auto-reply/monitor/process-message.js +1 -0
- package/dist/web/inbound/monitor.js +9 -1
- package/extensions/googlechat/node_modules/.bin/taskmaster +2 -2
- package/extensions/googlechat/package.json +2 -2
- package/extensions/line/node_modules/.bin/taskmaster +2 -2
- package/extensions/line/package.json +1 -1
- package/extensions/matrix/node_modules/.bin/taskmaster +2 -2
- package/extensions/matrix/package.json +1 -1
- package/extensions/msteams/node_modules/.bin/taskmaster +2 -2
- package/extensions/msteams/package.json +1 -1
- package/extensions/nostr/node_modules/.bin/taskmaster +2 -2
- package/extensions/nostr/package.json +1 -1
- package/extensions/zalo/node_modules/.bin/taskmaster +2 -2
- package/extensions/zalo/package.json +1 -1
- package/extensions/zalouser/node_modules/.bin/taskmaster +2 -2
- package/extensions/zalouser/package.json +1 -1
- package/package.json +3 -2
- package/scripts/postinstall.js +76 -0
- package/skills/business-assistant/references/crm.md +32 -8
- package/taskmaster-docs/USER-GUIDE.md +84 -5
- package/templates/beagle/agents/admin/AGENTS.md +4 -2
- package/templates/taskmaster/agents/admin/AGENTS.md +1 -0
- package/dist/control-ui/assets/index-DvB85yTz.css +0 -1
- package/dist/control-ui/assets/index-DwMopZij.js.map +0 -1
|
@@ -6,8 +6,8 @@
|
|
|
6
6
|
<title>Taskmaster Control</title>
|
|
7
7
|
<meta name="color-scheme" content="dark light" />
|
|
8
8
|
<link rel="icon" type="image/png" href="./favicon.png" />
|
|
9
|
-
<script type="module" crossorigin src="./assets/index-
|
|
10
|
-
<link rel="stylesheet" crossorigin href="./assets/index-
|
|
9
|
+
<script type="module" crossorigin src="./assets/index-DQ1kxYd4.js"></script>
|
|
10
|
+
<link rel="stylesheet" crossorigin href="./assets/index-CPawOl_z.css">
|
|
11
11
|
</head>
|
|
12
12
|
<body>
|
|
13
13
|
<taskmaster-app></taskmaster-app>
|
|
@@ -130,6 +130,7 @@ export function stripEnvelopeFromMessages(messages) {
|
|
|
130
130
|
// 2. Remove base64 image blocks from content
|
|
131
131
|
// 3. Add { type: "image", url: "/api/media?path=..." } blocks
|
|
132
132
|
// ---------------------------------------------------------------------------
|
|
133
|
+
import nodeFs from "node:fs";
|
|
133
134
|
import nodePath from "node:path";
|
|
134
135
|
function isBase64ImageBlock(block) {
|
|
135
136
|
if (!block || typeof block !== "object")
|
|
@@ -236,7 +237,14 @@ function mediaRefToUrl(ref, workspaceRoot) {
|
|
|
236
237
|
// Must stay within workspace (no ../ escapes)
|
|
237
238
|
if (relPath.startsWith("..") || nodePath.isAbsolute(relPath))
|
|
238
239
|
return null;
|
|
239
|
-
|
|
240
|
+
// Append file mtime as cache buster so updated files are never served stale.
|
|
241
|
+
let mtime = "";
|
|
242
|
+
try {
|
|
243
|
+
const stat = nodeFs.statSync(ref.absPath);
|
|
244
|
+
mtime = `&t=${stat.mtimeMs | 0}`;
|
|
245
|
+
}
|
|
246
|
+
catch { /* file may not exist yet */ }
|
|
247
|
+
return `/api/media?path=${encodeURIComponent(relPath)}${mtime}`;
|
|
240
248
|
}
|
|
241
249
|
function stripBase64FromContentBlocks(content) {
|
|
242
250
|
let changed = false;
|
|
@@ -302,11 +310,17 @@ export function sanitizeMediaForChat(messages, workspaceRoot) {
|
|
|
302
310
|
// No workspace context — fall back to plain base64 stripping
|
|
303
311
|
return stripBase64ImagesFromMessages(messages);
|
|
304
312
|
}
|
|
305
|
-
// Track paths
|
|
313
|
+
// Track paths within each assistant turn to prevent duplicate blocks when an
|
|
306
314
|
// assistant message echoes a MEDIA: ref that already appeared in a tool result.
|
|
315
|
+
// Reset at each user message so the same file can be re-shared in later turns.
|
|
307
316
|
const seenPaths = new Set();
|
|
308
317
|
let changed = false;
|
|
309
318
|
const next = messages.map((message) => {
|
|
319
|
+
// Reset dedup at user turn boundaries so files can appear again in later turns.
|
|
320
|
+
const entry = message;
|
|
321
|
+
const role = typeof entry?.role === "string" ? entry.role.toLowerCase() : "";
|
|
322
|
+
if (role === "user")
|
|
323
|
+
seenPaths.clear();
|
|
310
324
|
const result = sanitizeMessageMedia(message, workspaceRoot, seenPaths);
|
|
311
325
|
if (result !== message)
|
|
312
326
|
changed = true;
|
|
@@ -9,6 +9,7 @@ const BASE_RELOAD_RULES = [
|
|
|
9
9
|
{ prefix: "access", kind: "none" },
|
|
10
10
|
{ prefix: "publicChat", kind: "none" },
|
|
11
11
|
{ prefix: "apiKeys", kind: "none" },
|
|
12
|
+
{ prefix: "apiKeysDisabled", kind: "none" },
|
|
12
13
|
{ prefix: "gateway.remote", kind: "none" },
|
|
13
14
|
{ prefix: "gateway.reload", kind: "none" },
|
|
14
15
|
{ prefix: "hooks.gmail", kind: "hot", actions: ["restart-gmail-watcher"] },
|
|
@@ -22,6 +22,16 @@ const ALLOWED_MEDIA_EXTENSIONS = new Set([
|
|
|
22
22
|
".tiff",
|
|
23
23
|
".tif",
|
|
24
24
|
".pdf",
|
|
25
|
+
// Audio
|
|
26
|
+
".mp3",
|
|
27
|
+
".opus",
|
|
28
|
+
".ogg",
|
|
29
|
+
".oga",
|
|
30
|
+
".wav",
|
|
31
|
+
".m4a",
|
|
32
|
+
".webm",
|
|
33
|
+
".aac",
|
|
34
|
+
".flac",
|
|
25
35
|
]);
|
|
26
36
|
function contentType(ext) {
|
|
27
37
|
switch (ext) {
|
|
@@ -45,6 +55,24 @@ function contentType(ext) {
|
|
|
45
55
|
return "image/tiff";
|
|
46
56
|
case ".pdf":
|
|
47
57
|
return "application/pdf";
|
|
58
|
+
// Audio
|
|
59
|
+
case ".mp3":
|
|
60
|
+
return "audio/mpeg";
|
|
61
|
+
case ".opus":
|
|
62
|
+
return "audio/opus";
|
|
63
|
+
case ".ogg":
|
|
64
|
+
case ".oga":
|
|
65
|
+
return "audio/ogg";
|
|
66
|
+
case ".wav":
|
|
67
|
+
return "audio/wav";
|
|
68
|
+
case ".m4a":
|
|
69
|
+
return "audio/mp4";
|
|
70
|
+
case ".webm":
|
|
71
|
+
return "audio/webm";
|
|
72
|
+
case ".aac":
|
|
73
|
+
return "audio/aac";
|
|
74
|
+
case ".flac":
|
|
75
|
+
return "audio/flac";
|
|
48
76
|
default:
|
|
49
77
|
return "application/octet-stream";
|
|
50
78
|
}
|
|
@@ -109,7 +137,10 @@ export function handleMediaRequest(req, res, opts) {
|
|
|
109
137
|
res.statusCode = 200;
|
|
110
138
|
res.setHeader("Content-Type", contentType(ext));
|
|
111
139
|
res.setHeader("Content-Length", stat.size);
|
|
112
|
-
|
|
140
|
+
// Revalidate on every request so updated files (e.g. regenerated invoices)
|
|
141
|
+
// are never served stale. Last-Modified lets the browser use 304 for unchanged files.
|
|
142
|
+
res.setHeader("Cache-Control", "no-cache");
|
|
143
|
+
res.setHeader("Last-Modified", stat.mtime.toUTCString());
|
|
113
144
|
if (req.method === "HEAD") {
|
|
114
145
|
res.end();
|
|
115
146
|
}
|
|
@@ -1,18 +1,24 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* Gateway handlers for centralized API key management.
|
|
3
3
|
*
|
|
4
|
-
* apikeys.list
|
|
5
|
-
* apikeys.set
|
|
6
|
-
* apikeys.remove
|
|
4
|
+
* apikeys.list — returns provider catalog with set/unset/disabled status
|
|
5
|
+
* apikeys.set — stores a key for a provider (config watcher hot-reloads)
|
|
6
|
+
* apikeys.remove — removes a key for a provider (config watcher hot-reloads)
|
|
7
|
+
* apikeys.disable — toggles a key's disabled state (key preserved, not distributed)
|
|
7
8
|
*
|
|
8
9
|
* No explicit restart needed — the config file watcher detects the change,
|
|
9
10
|
* and `applyApiKeys()` runs as a side effect of `readConfigFileSnapshot()`,
|
|
10
11
|
* injecting keys into the auth profile store and tool config paths.
|
|
11
12
|
* The `apiKeys` prefix is registered as "none" in config-reload.ts.
|
|
12
13
|
*/
|
|
14
|
+
import { removeAuthProfile } from "../../agents/auth-profiles.js";
|
|
13
15
|
import { readConfigFileSnapshot, writeConfigFile } from "../../config/config.js";
|
|
14
16
|
import { ErrorCodes, errorShape } from "../protocol/index.js";
|
|
15
17
|
import { formatForLog } from "../ws-log.js";
|
|
18
|
+
/** Providers whose API keys are stored as auth profiles (type: api_key). */
|
|
19
|
+
const AUTH_PROFILE_PROVIDERS = new Set([
|
|
20
|
+
"anthropic", "openai", "google", "replicate", "hume",
|
|
21
|
+
]);
|
|
16
22
|
const PROVIDER_CATALOG = [
|
|
17
23
|
{ id: "anthropic", name: "Anthropic", category: "AI Model", primary: true },
|
|
18
24
|
{ id: "google", name: "Google", category: "Voice & Video", primary: true },
|
|
@@ -29,9 +35,10 @@ export const apikeysHandlers = {
|
|
|
29
35
|
try {
|
|
30
36
|
const snapshot = await readConfigFileSnapshot();
|
|
31
37
|
const storedKeys = snapshot.config.apiKeys ?? {};
|
|
38
|
+
const disabledMap = snapshot.config.apiKeysDisabled ?? {};
|
|
32
39
|
const providers = PROVIDER_CATALOG.map((p) => {
|
|
33
40
|
const raw = storedKeys[p.id]?.trim();
|
|
34
|
-
return { ...p, hasKey: Boolean(raw), key: raw || undefined };
|
|
41
|
+
return { ...p, hasKey: Boolean(raw), disabled: Boolean(disabledMap[p.id]), key: raw || undefined };
|
|
35
42
|
});
|
|
36
43
|
respond(true, { providers });
|
|
37
44
|
}
|
|
@@ -73,11 +80,56 @@ export const apikeysHandlers = {
|
|
|
73
80
|
const existing = { ...config.apiKeys };
|
|
74
81
|
delete existing[provider];
|
|
75
82
|
config.apiKeys = Object.keys(existing).length > 0 ? existing : undefined;
|
|
83
|
+
// Clean up disabled entry when key is removed
|
|
84
|
+
if (config.apiKeysDisabled?.[provider]) {
|
|
85
|
+
const disabled = { ...config.apiKeysDisabled };
|
|
86
|
+
delete disabled[provider];
|
|
87
|
+
config.apiKeysDisabled = Object.keys(disabled).length > 0 ? disabled : undefined;
|
|
88
|
+
}
|
|
76
89
|
await writeConfigFile(config);
|
|
90
|
+
// Remove the auth profile so stale credentials are not picked up by
|
|
91
|
+
// provider resolution (the profile outlives the config key otherwise).
|
|
92
|
+
if (AUTH_PROFILE_PROVIDERS.has(provider)) {
|
|
93
|
+
removeAuthProfile({ profileId: `${provider}:api-key` });
|
|
94
|
+
}
|
|
77
95
|
respond(true, { ok: true, provider });
|
|
78
96
|
}
|
|
79
97
|
catch (err) {
|
|
80
98
|
respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err)));
|
|
81
99
|
}
|
|
82
100
|
},
|
|
101
|
+
"apikeys.disable": async ({ params, respond }) => {
|
|
102
|
+
try {
|
|
103
|
+
const provider = params.provider?.trim();
|
|
104
|
+
const disabled = params.disabled;
|
|
105
|
+
if (!provider || !VALID_PROVIDER_IDS.has(provider)) {
|
|
106
|
+
respond(false, undefined, errorShape(ErrorCodes.INVALID_REQUEST, `Invalid provider: ${provider ?? "(empty)"}`));
|
|
107
|
+
return;
|
|
108
|
+
}
|
|
109
|
+
if (typeof disabled !== "boolean") {
|
|
110
|
+
respond(false, undefined, errorShape(ErrorCodes.INVALID_REQUEST, "disabled must be a boolean"));
|
|
111
|
+
return;
|
|
112
|
+
}
|
|
113
|
+
const snapshot = await readConfigFileSnapshot();
|
|
114
|
+
const config = { ...snapshot.config };
|
|
115
|
+
if (disabled) {
|
|
116
|
+
config.apiKeysDisabled = { ...config.apiKeysDisabled, [provider]: true };
|
|
117
|
+
}
|
|
118
|
+
else {
|
|
119
|
+
const existing = { ...config.apiKeysDisabled };
|
|
120
|
+
delete existing[provider];
|
|
121
|
+
config.apiKeysDisabled = Object.keys(existing).length > 0 ? existing : undefined;
|
|
122
|
+
}
|
|
123
|
+
await writeConfigFile(config);
|
|
124
|
+
// Eagerly remove/restore auth profile so the change takes effect
|
|
125
|
+
// before the config watcher fires.
|
|
126
|
+
if (disabled && AUTH_PROFILE_PROVIDERS.has(provider)) {
|
|
127
|
+
removeAuthProfile({ profileId: `${provider}:api-key` });
|
|
128
|
+
}
|
|
129
|
+
respond(true, { ok: true, provider, disabled });
|
|
130
|
+
}
|
|
131
|
+
catch (err) {
|
|
132
|
+
respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err)));
|
|
133
|
+
}
|
|
134
|
+
},
|
|
83
135
|
};
|
|
@@ -80,8 +80,11 @@ export const ttsHandlers = {
|
|
|
80
80
|
},
|
|
81
81
|
"tts.setProvider": async ({ params, respond }) => {
|
|
82
82
|
const provider = typeof params.provider === "string" ? params.provider.trim() : "";
|
|
83
|
-
if (provider !== "openai" &&
|
|
84
|
-
|
|
83
|
+
if (provider !== "openai" &&
|
|
84
|
+
provider !== "elevenlabs" &&
|
|
85
|
+
provider !== "edge" &&
|
|
86
|
+
provider !== "hume") {
|
|
87
|
+
respond(false, undefined, errorShape(ErrorCodes.INVALID_REQUEST, "Invalid provider. Use openai, elevenlabs, hume, or edge."));
|
|
85
88
|
return;
|
|
86
89
|
}
|
|
87
90
|
try {
|
|
@@ -115,6 +118,12 @@ export const ttsHandlers = {
|
|
|
115
118
|
configured: Boolean(resolveTtsApiKey(config, "elevenlabs")),
|
|
116
119
|
models: ["eleven_multilingual_v2", "eleven_turbo_v2_5", "eleven_monolingual_v1"],
|
|
117
120
|
},
|
|
121
|
+
{
|
|
122
|
+
id: "hume",
|
|
123
|
+
name: "Hume",
|
|
124
|
+
configured: Boolean(resolveTtsApiKey(config, "hume")),
|
|
125
|
+
models: [],
|
|
126
|
+
},
|
|
118
127
|
{
|
|
119
128
|
id: "edge",
|
|
120
129
|
name: "Edge TTS",
|
|
@@ -9,6 +9,7 @@ import { CONFIG_PATH_TASKMASTER, isNixMode, loadConfig, migrateLegacyConfig, rea
|
|
|
9
9
|
import { VERSION } from "../version.js";
|
|
10
10
|
import { isDiagnosticsEnabled } from "../infra/diagnostic-events.js";
|
|
11
11
|
import { logAcceptedEnvOption } from "../infra/env.js";
|
|
12
|
+
import { reconcileAgentContactTools } from "../config/agent-tools-reconcile.js";
|
|
12
13
|
import { applyPluginAutoEnable } from "../config/plugin-auto-enable.js";
|
|
13
14
|
import { clearAgentRunContext, onAgentEvent } from "../infra/agent-events.js";
|
|
14
15
|
import { onHeartbeatEvent } from "../infra/heartbeat-events.js";
|
|
@@ -121,6 +122,20 @@ export async function startGatewayServer(port = 18789, opts = {}) {
|
|
|
121
122
|
log.warn(`gateway: failed to persist plugin auto-enable changes: ${String(err)}`);
|
|
122
123
|
}
|
|
123
124
|
}
|
|
125
|
+
// Reconcile agent tool groups (e.g. individual contact tools → group:contacts).
|
|
126
|
+
const toolReconcile = reconcileAgentContactTools({ config: configSnapshot.config });
|
|
127
|
+
if (toolReconcile.changes.length > 0) {
|
|
128
|
+
try {
|
|
129
|
+
await writeConfigFile(toolReconcile.config);
|
|
130
|
+
configSnapshot = await readConfigFileSnapshot();
|
|
131
|
+
log.info(`gateway: reconciled agent tools:\n${toolReconcile.changes
|
|
132
|
+
.map((entry) => `- ${entry}`)
|
|
133
|
+
.join("\n")}`);
|
|
134
|
+
}
|
|
135
|
+
catch (err) {
|
|
136
|
+
log.warn(`gateway: failed to persist agent tools reconciliation: ${String(err)}`);
|
|
137
|
+
}
|
|
138
|
+
}
|
|
124
139
|
// Stamp config with running version on startup so upgrades keep the stamp current.
|
|
125
140
|
const storedVersion = configSnapshot.config.meta?.lastTouchedVersion;
|
|
126
141
|
if (configSnapshot.exists && storedVersion !== VERSION) {
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import { finalizeInboundContext } from "../auto-reply/reply/inbound-context.js";
|
|
2
|
+
import { logVerbose } from "../globals.js";
|
|
2
3
|
import { extractMediaUserText, formatAudioTranscripts, formatMediaUnderstandingBody, } from "./format.js";
|
|
3
4
|
import { runWithConcurrency } from "./concurrency.js";
|
|
4
5
|
import { resolveConcurrency } from "./resolve.js";
|
|
@@ -42,6 +43,40 @@ export async function applyMediaUnderstanding(params) {
|
|
|
42
43
|
if (decisions.length > 0) {
|
|
43
44
|
ctx.MediaUnderstandingDecisions = [...(ctx.MediaUnderstandingDecisions ?? []), ...decisions];
|
|
44
45
|
}
|
|
46
|
+
// Surface audio failures so the agent can inform the user instead of receiving
|
|
47
|
+
// a bare <media:audio> placeholder with no context about what went wrong.
|
|
48
|
+
const audioDecision = decisions.find((d) => d.capability === "audio");
|
|
49
|
+
const audioTranscribed = outputs.some((o) => o.kind === "audio.transcription");
|
|
50
|
+
const bodyHint = ctx.CommandBody ?? ctx.RawBody ?? ctx.Body ?? "";
|
|
51
|
+
const isAudioPlaceholder = /^<media:audio>/i.test(bodyHint.trim());
|
|
52
|
+
if (isAudioPlaceholder && !audioTranscribed) {
|
|
53
|
+
let reason;
|
|
54
|
+
if (ctx.MediaDownloadFailed) {
|
|
55
|
+
reason = "media download failed — the voice note could not be retrieved from WhatsApp";
|
|
56
|
+
}
|
|
57
|
+
else if (audioDecision?.outcome === "no-attachment") {
|
|
58
|
+
reason = "no audio file available for transcription";
|
|
59
|
+
}
|
|
60
|
+
else if (audioDecision?.outcome === "skipped") {
|
|
61
|
+
// Distinguish between "no providers at all" (empty attempts) and "providers tried but all failed"
|
|
62
|
+
const hasAttempts = audioDecision.attachments?.some((a) => a.attempts.length > 0);
|
|
63
|
+
reason = hasAttempts
|
|
64
|
+
? "all transcription attempts failed"
|
|
65
|
+
: "no transcription provider configured (add an OpenAI, Google, Groq, or Deepgram API key)";
|
|
66
|
+
}
|
|
67
|
+
else if (audioDecision?.outcome === "disabled") {
|
|
68
|
+
reason = "audio transcription is disabled in config";
|
|
69
|
+
}
|
|
70
|
+
else {
|
|
71
|
+
reason = `transcription ${audioDecision?.outcome ?? "unavailable"}`;
|
|
72
|
+
}
|
|
73
|
+
const note = `[Voice note received but could not be transcribed: ${reason}]`;
|
|
74
|
+
logVerbose(`applyMediaUnderstanding: ${note}`);
|
|
75
|
+
ctx.Body = note;
|
|
76
|
+
ctx.CommandBody = note;
|
|
77
|
+
ctx.RawBody = note;
|
|
78
|
+
finalizeInboundContext(ctx, { forceBodyForAgent: true, forceBodyForCommands: true });
|
|
79
|
+
}
|
|
45
80
|
if (outputs.length > 0) {
|
|
46
81
|
ctx.Body = formatMediaUnderstandingBody({ body: ctx.Body, outputs });
|
|
47
82
|
const audioOutputs = outputs.filter((output) => output.kind === "audio.transcription");
|
|
@@ -22,7 +22,7 @@ export async function transcribeDeepgramAudio(params) {
|
|
|
22
22
|
}
|
|
23
23
|
const headers = new Headers(params.headers);
|
|
24
24
|
if (!headers.has("authorization")) {
|
|
25
|
-
headers.set("authorization", `Token ${params.apiKey}`);
|
|
25
|
+
headers.set("authorization", `Token ${params.apiKey ?? ""}`);
|
|
26
26
|
}
|
|
27
27
|
if (!headers.has("content-type")) {
|
|
28
28
|
headers.set("content-type", params.mime ?? "application/octet-stream");
|
|
@@ -23,7 +23,7 @@ export async function transcribeGeminiAudio(params) {
|
|
|
23
23
|
headers.set("content-type", "application/json");
|
|
24
24
|
}
|
|
25
25
|
if (!headers.has("x-goog-api-key")) {
|
|
26
|
-
headers.set("x-goog-api-key", params.apiKey);
|
|
26
|
+
headers.set("x-goog-api-key", params.apiKey ?? "");
|
|
27
27
|
}
|
|
28
28
|
const body = {
|
|
29
29
|
contents: [
|
|
@@ -23,7 +23,7 @@ export async function describeGeminiVideo(params) {
|
|
|
23
23
|
headers.set("content-type", "application/json");
|
|
24
24
|
}
|
|
25
25
|
if (!headers.has("x-goog-api-key")) {
|
|
26
|
-
headers.set("x-goog-api-key", params.apiKey);
|
|
26
|
+
headers.set("x-goog-api-key", params.apiKey ?? "");
|
|
27
27
|
}
|
|
28
28
|
const body = {
|
|
29
29
|
contents: [
|
|
@@ -5,6 +5,7 @@ import { googleProvider } from "./google/index.js";
|
|
|
5
5
|
import { groqProvider } from "./groq/index.js";
|
|
6
6
|
import { minimaxProvider } from "./minimax/index.js";
|
|
7
7
|
import { openaiProvider } from "./openai/index.js";
|
|
8
|
+
import { sherpaOnnxProvider } from "./sherpa-onnx/index.js";
|
|
8
9
|
const PROVIDERS = [
|
|
9
10
|
groqProvider,
|
|
10
11
|
openaiProvider,
|
|
@@ -12,6 +13,7 @@ const PROVIDERS = [
|
|
|
12
13
|
anthropicProvider,
|
|
13
14
|
minimaxProvider,
|
|
14
15
|
deepgramProvider,
|
|
16
|
+
sherpaOnnxProvider,
|
|
15
17
|
];
|
|
16
18
|
export function normalizeMediaProviderId(id) {
|
|
17
19
|
const normalized = normalizeProviderId(id);
|
|
@@ -25,7 +25,7 @@ export async function transcribeOpenAiCompatibleAudio(params) {
|
|
|
25
25
|
form.append("prompt", params.prompt.trim());
|
|
26
26
|
const headers = new Headers(params.headers);
|
|
27
27
|
if (!headers.has("authorization")) {
|
|
28
|
-
headers.set("authorization", `Bearer ${params.apiKey}`);
|
|
28
|
+
headers.set("authorization", `Bearer ${params.apiKey ?? ""}`);
|
|
29
29
|
}
|
|
30
30
|
const res = await fetchWithTimeout(url, {
|
|
31
31
|
method: "POST",
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
import { transcribeLocal, MODEL_LABEL } from "../../sherpa-onnx-local.js";
|
|
2
|
+
export const sherpaOnnxProvider = {
|
|
3
|
+
id: "sherpa-onnx",
|
|
4
|
+
isLocal: true,
|
|
5
|
+
capabilities: ["audio"],
|
|
6
|
+
transcribeAudio: async (req) => {
|
|
7
|
+
const result = await transcribeLocal(req.buffer, req.fileName);
|
|
8
|
+
return { text: result.text, model: result.model ?? MODEL_LABEL };
|
|
9
|
+
},
|
|
10
|
+
};
|
|
@@ -5,7 +5,7 @@ import path from "node:path";
|
|
|
5
5
|
import { findModelInCatalog, loadModelCatalog, modelSupportsVision, } from "../agents/model-catalog.js";
|
|
6
6
|
import { applyTemplate } from "../auto-reply/templating.js";
|
|
7
7
|
import { requireApiKey, resolveApiKeyForProvider } from "../agents/model-auth.js";
|
|
8
|
-
import {
|
|
8
|
+
import { createSubsystemLogger } from "../logging/subsystem.js";
|
|
9
9
|
import { runExec } from "../process/exec.js";
|
|
10
10
|
import { MediaAttachmentCache, normalizeAttachments, selectAttachments } from "./attachments.js";
|
|
11
11
|
import { CLI_OUTPUT_MAX_BUFFER, DEFAULT_AUDIO_MODELS, DEFAULT_TIMEOUT_SECONDS, } from "./defaults.js";
|
|
@@ -23,6 +23,7 @@ const DEFAULT_IMAGE_MODELS = {
|
|
|
23
23
|
google: "gemini-3-flash-preview",
|
|
24
24
|
minimax: "MiniMax-VL-01",
|
|
25
25
|
};
|
|
26
|
+
const log = createSubsystemLogger("gateway/media");
|
|
26
27
|
export function buildProviderRegistry(overrides) {
|
|
27
28
|
return buildMediaUnderstandingRegistry(overrides);
|
|
28
29
|
}
|
|
@@ -33,7 +34,6 @@ export function createMediaAttachmentCache(attachments) {
|
|
|
33
34
|
return new MediaAttachmentCache(attachments);
|
|
34
35
|
}
|
|
35
36
|
const binaryCache = new Map();
|
|
36
|
-
const geminiProbeCache = new Map();
|
|
37
37
|
function expandHomeDir(value) {
|
|
38
38
|
if (!value.startsWith("~"))
|
|
39
39
|
return value;
|
|
@@ -181,26 +181,6 @@ function extractSherpaOnnxText(raw) {
|
|
|
181
181
|
}
|
|
182
182
|
return null;
|
|
183
183
|
}
|
|
184
|
-
async function probeGeminiCli() {
|
|
185
|
-
const cached = geminiProbeCache.get("gemini");
|
|
186
|
-
if (cached)
|
|
187
|
-
return cached;
|
|
188
|
-
const resolved = (async () => {
|
|
189
|
-
if (!(await hasBinary("gemini")))
|
|
190
|
-
return false;
|
|
191
|
-
try {
|
|
192
|
-
const { stdout } = await runExec("gemini", ["--output-format", "json", "ok"], {
|
|
193
|
-
timeoutMs: 8000,
|
|
194
|
-
});
|
|
195
|
-
return Boolean(extractGeminiResponse(stdout) ?? stdout.toLowerCase().includes("ok"));
|
|
196
|
-
}
|
|
197
|
-
catch {
|
|
198
|
-
return false;
|
|
199
|
-
}
|
|
200
|
-
})();
|
|
201
|
-
geminiProbeCache.set("gemini", resolved);
|
|
202
|
-
return resolved;
|
|
203
|
-
}
|
|
204
184
|
async function resolveLocalWhisperCppEntry() {
|
|
205
185
|
if (!(await hasBinary("whisper-cli")))
|
|
206
186
|
return null;
|
|
@@ -234,7 +214,34 @@ async function resolveLocalWhisperEntry() {
|
|
|
234
214
|
],
|
|
235
215
|
};
|
|
236
216
|
}
|
|
237
|
-
|
|
217
|
+
/**
|
|
218
|
+
* Check if sherpa-onnx-node (npm package) is available with model + ffmpeg.
|
|
219
|
+
* Returns a provider entry so the pipeline uses the Node.js API directly
|
|
220
|
+
* (no CLI binary or SHERPA_ONNX_MODEL_DIR env var required).
|
|
221
|
+
*/
|
|
222
|
+
async function resolveSherpaOnnxNodeEntry() {
|
|
223
|
+
try {
|
|
224
|
+
const { isReady } = await import("./sherpa-onnx-local.js");
|
|
225
|
+
if (await isReady()) {
|
|
226
|
+
return { type: "provider", provider: "sherpa-onnx" };
|
|
227
|
+
}
|
|
228
|
+
// Package + ffmpeg available but model not yet downloaded — still viable
|
|
229
|
+
// (the provider will trigger a lazy download on first use)
|
|
230
|
+
const { isAvailable } = await import("./sherpa-onnx-local.js");
|
|
231
|
+
if (await isAvailable()) {
|
|
232
|
+
return { type: "provider", provider: "sherpa-onnx" };
|
|
233
|
+
}
|
|
234
|
+
}
|
|
235
|
+
catch {
|
|
236
|
+
// sherpa-onnx-node not installed — skip
|
|
237
|
+
}
|
|
238
|
+
return null;
|
|
239
|
+
}
|
|
240
|
+
/**
|
|
241
|
+
* Fallback: check for sherpa-onnx-offline CLI binary + SHERPA_ONNX_MODEL_DIR env var.
|
|
242
|
+
* This is the legacy detection path for users who installed the binary manually.
|
|
243
|
+
*/
|
|
244
|
+
async function resolveSherpaOnnxCliEntry() {
|
|
238
245
|
if (!(await hasBinary("sherpa-onnx-offline")))
|
|
239
246
|
return null;
|
|
240
247
|
const modelDir = process.env.SHERPA_ONNX_MODEL_DIR?.trim();
|
|
@@ -265,32 +272,19 @@ async function resolveSherpaOnnxEntry() {
|
|
|
265
272
|
};
|
|
266
273
|
}
|
|
267
274
|
async function resolveLocalAudioEntry() {
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
275
|
+
// Prefer sherpa-onnx-node (npm, no PATH issues, automatic model management)
|
|
276
|
+
const sherpaNode = await resolveSherpaOnnxNodeEntry();
|
|
277
|
+
if (sherpaNode)
|
|
278
|
+
return sherpaNode;
|
|
279
|
+
// Fallback: CLI binary (legacy/manual installs)
|
|
280
|
+
const sherpaCli = await resolveSherpaOnnxCliEntry();
|
|
281
|
+
if (sherpaCli)
|
|
282
|
+
return sherpaCli;
|
|
271
283
|
const whisperCpp = await resolveLocalWhisperCppEntry();
|
|
272
284
|
if (whisperCpp)
|
|
273
285
|
return whisperCpp;
|
|
274
286
|
return await resolveLocalWhisperEntry();
|
|
275
287
|
}
|
|
276
|
-
async function resolveGeminiCliEntry(_capability) {
|
|
277
|
-
if (!(await probeGeminiCli()))
|
|
278
|
-
return null;
|
|
279
|
-
return {
|
|
280
|
-
type: "cli",
|
|
281
|
-
command: "gemini",
|
|
282
|
-
args: [
|
|
283
|
-
"--output-format",
|
|
284
|
-
"json",
|
|
285
|
-
"--allowed-tools",
|
|
286
|
-
"read_many_files",
|
|
287
|
-
"--include-directories",
|
|
288
|
-
"{{MediaDir}}",
|
|
289
|
-
"{{Prompt}}",
|
|
290
|
-
"Use read_many_files to read {{MediaPath}} and respond with only the text output.",
|
|
291
|
-
],
|
|
292
|
-
};
|
|
293
|
-
}
|
|
294
288
|
async function resolveKeyEntry(params) {
|
|
295
289
|
const { cfg, agentDir, providerRegistry, capability } = params;
|
|
296
290
|
const checkProvider = async (providerId, model) => {
|
|
@@ -362,9 +356,6 @@ async function resolveAutoEntries(params) {
|
|
|
362
356
|
if (localAudio)
|
|
363
357
|
return [localAudio];
|
|
364
358
|
}
|
|
365
|
-
const gemini = await resolveGeminiCliEntry(params.capability);
|
|
366
|
-
if (gemini)
|
|
367
|
-
return [gemini];
|
|
368
359
|
const keys = await resolveKeyEntry(params);
|
|
369
360
|
if (keys)
|
|
370
361
|
return [keys];
|
|
@@ -635,14 +626,18 @@ async function runProviderEntry(params) {
|
|
|
635
626
|
maxBytes,
|
|
636
627
|
timeoutMs,
|
|
637
628
|
});
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
629
|
+
// Local providers (e.g. sherpa-onnx) do not require an API key.
|
|
630
|
+
let apiKey;
|
|
631
|
+
if (!provider.isLocal) {
|
|
632
|
+
const auth = await resolveApiKeyForProvider({
|
|
633
|
+
provider: providerId,
|
|
634
|
+
cfg,
|
|
635
|
+
profileId: entry.profile,
|
|
636
|
+
preferredProfile: entry.preferredProfile,
|
|
637
|
+
agentDir: params.agentDir,
|
|
638
|
+
});
|
|
639
|
+
apiKey = requireApiKey(auth, providerId);
|
|
640
|
+
}
|
|
646
641
|
const providerConfig = cfg.models?.providers?.[providerId];
|
|
647
642
|
const baseUrl = entry.baseUrl ?? params.config?.baseUrl ?? providerConfig?.baseUrl;
|
|
648
643
|
const mergedHeaders = {
|
|
@@ -751,9 +746,7 @@ async function runCliEntry(params) {
|
|
|
751
746
|
};
|
|
752
747
|
const argv = [command, ...args].map((part, index) => index === 0 ? part : applyTemplate(part, templCtx));
|
|
753
748
|
try {
|
|
754
|
-
|
|
755
|
-
logVerbose(`Media understanding via CLI: ${argv.join(" ")}`);
|
|
756
|
-
}
|
|
749
|
+
log.debug(`CLI: ${argv.join(" ")}`);
|
|
757
750
|
const { stdout } = await runExec(argv[0], argv.slice(1), {
|
|
758
751
|
timeoutMs,
|
|
759
752
|
maxBuffer: CLI_OUTPUT_MAX_BUFFER,
|
|
@@ -825,9 +818,7 @@ async function runAttachmentEntries(params) {
|
|
|
825
818
|
outcome: "skipped",
|
|
826
819
|
reason: `${err.reason}: ${err.message}`,
|
|
827
820
|
}));
|
|
828
|
-
|
|
829
|
-
logVerbose(`Skipping ${capability} model due to ${err.reason}: ${err.message}`);
|
|
830
|
-
}
|
|
821
|
+
log.debug(`Skipping ${capability} model: ${err.reason}: ${err.message}`);
|
|
831
822
|
continue;
|
|
832
823
|
}
|
|
833
824
|
attempts.push(buildModelDecision({
|
|
@@ -836,9 +827,7 @@ async function runAttachmentEntries(params) {
|
|
|
836
827
|
outcome: "failed",
|
|
837
828
|
reason: String(err),
|
|
838
829
|
}));
|
|
839
|
-
|
|
840
|
-
logVerbose(`${capability} understanding failed: ${String(err)}`);
|
|
841
|
-
}
|
|
830
|
+
log.error(`${capability} failed: ${String(err)}`);
|
|
842
831
|
}
|
|
843
832
|
}
|
|
844
833
|
return { output: null, attempts };
|
|
@@ -866,9 +855,7 @@ export async function runCapability(params) {
|
|
|
866
855
|
}
|
|
867
856
|
const scopeDecision = resolveScopeDecision({ scope: config?.scope, ctx });
|
|
868
857
|
if (scopeDecision === "deny") {
|
|
869
|
-
|
|
870
|
-
logVerbose(`${capability} understanding disabled by scope policy.`);
|
|
871
|
-
}
|
|
858
|
+
log.debug(`${capability} disabled by scope policy`);
|
|
872
859
|
return {
|
|
873
860
|
outputs: [],
|
|
874
861
|
decision: {
|
|
@@ -885,9 +872,7 @@ export async function runCapability(params) {
|
|
|
885
872
|
const catalog = await loadModelCatalog({ config: cfg });
|
|
886
873
|
const entry = findModelInCatalog(catalog, activeProvider, params.activeModel?.model ?? "");
|
|
887
874
|
if (modelSupportsVision(entry)) {
|
|
888
|
-
|
|
889
|
-
logVerbose("Skipping image understanding: primary model supports vision natively");
|
|
890
|
-
}
|
|
875
|
+
log.debug("Skipping image understanding: primary model supports vision natively");
|
|
891
876
|
const model = params.activeModel?.model?.trim();
|
|
892
877
|
const reason = "primary model supports vision natively";
|
|
893
878
|
return {
|
|
@@ -966,8 +951,12 @@ export async function runCapability(params) {
|
|
|
966
951
|
outcome: outputs.length > 0 ? "success" : "skipped",
|
|
967
952
|
attachments: attachmentDecisions,
|
|
968
953
|
};
|
|
969
|
-
|
|
970
|
-
|
|
954
|
+
const summary = formatDecisionSummary(decision);
|
|
955
|
+
if (decision.outcome === "success") {
|
|
956
|
+
log.info(summary);
|
|
957
|
+
}
|
|
958
|
+
else {
|
|
959
|
+
log.debug(summary);
|
|
971
960
|
}
|
|
972
961
|
return {
|
|
973
962
|
outputs,
|