@rubytech/taskmaster 1.2.1 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92) hide show
  1. package/dist/agents/auth-profiles/oauth.js +24 -0
  2. package/dist/agents/auth-profiles/profiles.js +37 -0
  3. package/dist/agents/auth-profiles.js +1 -1
  4. package/dist/agents/pi-tools.policy.js +4 -0
  5. package/dist/agents/taskmaster-tools.js +14 -0
  6. package/dist/agents/tool-policy.js +5 -2
  7. package/dist/agents/tools/apikeys-tool.js +16 -5
  8. package/dist/agents/tools/contact-create-tool.js +59 -0
  9. package/dist/agents/tools/contact-delete-tool.js +48 -0
  10. package/dist/agents/tools/contact-update-tool.js +17 -2
  11. package/dist/agents/tools/file-delete-tool.js +137 -0
  12. package/dist/agents/tools/file-list-tool.js +127 -0
  13. package/dist/agents/tools/message-history-tool.js +2 -3
  14. package/dist/auto-reply/media-note.js +11 -0
  15. package/dist/auto-reply/reply/commands-tts.js +7 -2
  16. package/dist/auto-reply/reply/get-reply.js +4 -0
  17. package/dist/build-info.json +3 -3
  18. package/dist/cli/provision-seed.js +1 -2
  19. package/dist/commands/doctor-config-flow.js +13 -0
  20. package/dist/config/agent-tools-reconcile.js +53 -0
  21. package/dist/config/defaults.js +10 -1
  22. package/dist/config/legacy.migrations.part-3.js +26 -0
  23. package/dist/config/zod-schema.core.js +9 -1
  24. package/dist/config/zod-schema.js +1 -0
  25. package/dist/control-ui/assets/{index-N8du4fwV.js → index-BDETQp97.js} +692 -600
  26. package/dist/control-ui/assets/index-BDETQp97.js.map +1 -0
  27. package/dist/control-ui/assets/index-CPawOl_z.css +1 -0
  28. package/dist/control-ui/index.html +2 -2
  29. package/dist/gateway/chat-sanitize.js +5 -1
  30. package/dist/gateway/config-reload.js +1 -0
  31. package/dist/gateway/media-http.js +28 -0
  32. package/dist/gateway/server/tls.js +2 -2
  33. package/dist/gateway/server-http.js +34 -4
  34. package/dist/gateway/server-methods/apikeys.js +56 -4
  35. package/dist/gateway/server-methods/chat.js +64 -25
  36. package/dist/gateway/server-methods/tts.js +11 -2
  37. package/dist/gateway/server.impl.js +38 -5
  38. package/dist/infra/tls/gateway.js +19 -3
  39. package/dist/media-understanding/apply.js +35 -0
  40. package/dist/media-understanding/providers/deepgram/audio.js +1 -1
  41. package/dist/media-understanding/providers/google/audio.js +1 -1
  42. package/dist/media-understanding/providers/google/video.js +1 -1
  43. package/dist/media-understanding/providers/index.js +2 -0
  44. package/dist/media-understanding/providers/openai/audio.js +1 -1
  45. package/dist/media-understanding/providers/sherpa-onnx/index.js +10 -0
  46. package/dist/media-understanding/runner.js +61 -72
  47. package/dist/media-understanding/sherpa-onnx-local.js +223 -0
  48. package/dist/memory/audit.js +9 -0
  49. package/dist/memory/manager.js +1 -1
  50. package/dist/records/records-manager.js +10 -0
  51. package/dist/tts/tts.js +98 -10
  52. package/dist/web/auto-reply/monitor/process-message.js +45 -17
  53. package/dist/web/inbound/monitor.js +9 -1
  54. package/extensions/diagnostics-otel/node_modules/.bin/acorn +0 -0
  55. package/extensions/googlechat/node_modules/.bin/taskmaster +2 -2
  56. package/extensions/googlechat/package.json +2 -2
  57. package/extensions/line/node_modules/.bin/taskmaster +2 -2
  58. package/extensions/line/package.json +1 -1
  59. package/extensions/matrix/node_modules/.bin/markdown-it +0 -0
  60. package/extensions/matrix/node_modules/.bin/taskmaster +2 -2
  61. package/extensions/matrix/package.json +1 -1
  62. package/extensions/memory-lancedb/node_modules/.bin/arrow2csv +0 -0
  63. package/extensions/memory-lancedb/node_modules/.bin/openai +0 -0
  64. package/extensions/msteams/node_modules/.bin/taskmaster +2 -2
  65. package/extensions/msteams/package.json +1 -1
  66. package/extensions/nostr/node_modules/.bin/taskmaster +2 -2
  67. package/extensions/nostr/node_modules/.bin/tsc +0 -0
  68. package/extensions/nostr/node_modules/.bin/tsserver +0 -0
  69. package/extensions/nostr/package.json +1 -1
  70. package/extensions/zalo/node_modules/.bin/taskmaster +2 -2
  71. package/extensions/zalo/package.json +1 -1
  72. package/extensions/zalouser/node_modules/.bin/taskmaster +2 -2
  73. package/extensions/zalouser/package.json +1 -1
  74. package/package.json +56 -65
  75. package/scripts/install.sh +0 -0
  76. package/scripts/postinstall.js +76 -0
  77. package/skills/business-assistant/references/crm.md +32 -8
  78. package/taskmaster-docs/USER-GUIDE.md +111 -6
  79. package/templates/.DS_Store +0 -0
  80. package/templates/beagle/agents/admin/AGENTS.md +4 -2
  81. package/templates/customer/.DS_Store +0 -0
  82. package/templates/customer/agents/.DS_Store +0 -0
  83. package/templates/maxy/.DS_Store +0 -0
  84. package/templates/maxy/.gitignore +1 -0
  85. package/templates/maxy/agents/.DS_Store +0 -0
  86. package/templates/maxy/agents/admin/.DS_Store +0 -0
  87. package/templates/maxy/memory/.DS_Store +0 -0
  88. package/templates/maxy/skills/.DS_Store +0 -0
  89. package/templates/taskmaster/.gitignore +1 -0
  90. package/templates/taskmaster/agents/admin/AGENTS.md +1 -0
  91. package/dist/control-ui/assets/index-DtQHRIVD.css +0 -1
  92. package/dist/control-ui/assets/index-N8du4fwV.js.map +0 -1
@@ -379,8 +379,12 @@ export const chatHandlers = {
379
379
  }
380
380
  }
381
381
  }
382
- // Save document attachments to workspace uploads dir (persistent, accessible by agent)
382
+ // Save document attachments to workspace uploads dir (persistent, accessible by agent).
383
+ // Audio files are separated so they can be routed through the media understanding
384
+ // pipeline (STT) instead of being treated as generic file attachments.
383
385
  const savedDocPaths = [];
386
+ const savedAudioPaths = [];
387
+ const savedAudioTypes = [];
384
388
  if (documentAttachments.length > 0 && uploadsDir) {
385
389
  for (const doc of documentAttachments) {
386
390
  if (!doc.content || typeof doc.content !== "string")
@@ -389,7 +393,14 @@ export const chatHandlers = {
389
393
  const destPath = path.join(uploadsDir, safeName);
390
394
  try {
391
395
  fs.writeFileSync(destPath, Buffer.from(doc.content, "base64"));
392
- savedDocPaths.push(destPath);
396
+ const mimeBase = doc.mimeType?.split(";")[0]?.trim() ?? "";
397
+ if (mimeBase.startsWith("audio/")) {
398
+ savedAudioPaths.push(destPath);
399
+ savedAudioTypes.push(doc.mimeType ?? "audio/webm");
400
+ }
401
+ else {
402
+ savedDocPaths.push(destPath);
403
+ }
393
404
  }
394
405
  catch (err) {
395
406
  context.logGateway.warn(`chat document save failed: ${String(err)}`);
@@ -460,18 +471,29 @@ export const chatHandlers = {
460
471
  const trimmedMessage = p.message.trim();
461
472
  const injectThinking = Boolean(p.thinking && trimmedMessage && !trimmedMessage.startsWith("/"));
462
473
  const commandBody = injectThinking ? `/think ${p.thinking} ${p.message}` : p.message;
463
- // If documents were saved, prepend file paths to message so the agent knows about them
474
+ // If non-audio documents were saved, prepend file paths to message.
475
+ // Audio files are NOT annotated here — they go through MediaPaths so the
476
+ // media understanding pipeline (STT) handles them, and buildInboundMediaNote
477
+ // generates the proper [media attached: ...] annotation.
464
478
  const docNote = savedDocPaths.length > 0
465
479
  ? savedDocPaths.map((p) => `[file: ${p}]`).join("\n") + "\n\n"
466
480
  : "";
467
- const messageWithDocs = docNote + p.message;
481
+ // Audio-only message (voice note, no text): use placeholder so
482
+ // applyMediaUnderstanding knows to replace with transcript or error.
483
+ const hasAudioMedia = savedAudioPaths.length > 0;
484
+ const effectiveBody = hasAudioMedia && !trimmedMessage ? "<media:audio>" : p.message;
485
+ const messageWithDocs = docNote + effectiveBody;
486
+ const effectiveCommandBody = hasAudioMedia && !trimmedMessage ? "<media:audio>" : commandBody;
487
+ // Merge image and audio paths so the media understanding pipeline sees both.
488
+ const allMediaPaths = [...savedImagePaths, ...savedAudioPaths];
489
+ const allMediaTypes = [...savedImageTypes, ...savedAudioTypes];
468
490
  const clientInfo = client?.connect?.client;
469
491
  const ctx = {
470
492
  Body: messageWithDocs,
471
493
  BodyForAgent: messageWithDocs,
472
- BodyForCommands: docNote + commandBody,
494
+ BodyForCommands: docNote + effectiveCommandBody,
473
495
  RawBody: messageWithDocs,
474
- CommandBody: docNote + commandBody,
496
+ CommandBody: docNote + effectiveCommandBody,
475
497
  SessionKey: p.sessionKey,
476
498
  Provider: INTERNAL_MESSAGE_CHANNEL,
477
499
  Surface: INTERNAL_MESSAGE_CHANNEL,
@@ -485,10 +507,10 @@ export const chatHandlers = {
485
507
  // Image/media paths — same pattern as WhatsApp. buildInboundMediaNote()
486
508
  // will generate [media attached: ...] annotations that the agent runner
487
509
  // detects and loads from disk at inference time.
488
- MediaPaths: savedImagePaths.length > 0 ? savedImagePaths : undefined,
489
- MediaPath: savedImagePaths[0],
490
- MediaTypes: savedImageTypes.length > 0 ? savedImageTypes : undefined,
491
- MediaType: savedImageTypes[0],
510
+ MediaPaths: allMediaPaths.length > 0 ? allMediaPaths : undefined,
511
+ MediaPath: allMediaPaths[0],
512
+ MediaTypes: allMediaTypes.length > 0 ? allMediaTypes : undefined,
513
+ MediaType: allMediaTypes[0],
492
514
  };
493
515
  const agentId = resolveSessionAgentId({
494
516
  sessionKey: p.sessionKey,
@@ -496,16 +518,26 @@ export const chatHandlers = {
496
518
  });
497
519
  // Fire message:inbound hook for conversation archiving.
498
520
  // Include image paths so the archive references the attached media.
521
+ // Audio archive is deferred until after media understanding resolves (see
522
+ // onMediaResolved below) so the transcript is available instead of the
523
+ // raw <media:audio> placeholder.
499
524
  const imageNote = savedImagePaths.length > 0 ? savedImagePaths.map((ip) => `[image: ${ip}]`).join("\n") : "";
500
- const archiveText = [p.message, imageNote].filter(Boolean).join("\n").trim();
501
- void triggerInternalHook(createInternalHookEvent("message", "inbound", p.sessionKey, {
502
- text: archiveText || undefined,
503
- timestamp: now,
504
- chatType: "direct",
505
- agentId,
506
- channel: "webchat",
507
- cfg,
508
- }));
525
+ const fireArchiveHook = (resolvedBody) => {
526
+ const body = resolvedBody ?? p.message;
527
+ const archiveText = [body, imageNote].filter(Boolean).join("\n").trim();
528
+ void triggerInternalHook(createInternalHookEvent("message", "inbound", p.sessionKey, {
529
+ text: archiveText || undefined,
530
+ timestamp: now,
531
+ chatType: "direct",
532
+ agentId,
533
+ channel: "webchat",
534
+ cfg,
535
+ }));
536
+ };
537
+ if (!hasAudioMedia) {
538
+ // No audio — fire immediately (no STT to wait for).
539
+ fireArchiveHook();
540
+ }
509
541
  let prefixContext = {
510
542
  identityName: resolveIdentityName(cfg, agentId),
511
543
  };
@@ -541,7 +573,7 @@ export const chatHandlers = {
541
573
  },
542
574
  });
543
575
  let agentRunStarted = false;
544
- context.logGateway.info(`webchat dispatch: sessionKey=${p.sessionKey} runId=${clientRunId} body=${messageWithDocs.length}ch images=${savedImagePaths.length} docs=${savedDocPaths.length}`);
576
+ context.logGateway.info(`webchat dispatch: sessionKey=${p.sessionKey} runId=${clientRunId} body=${messageWithDocs.length}ch images=${savedImagePaths.length} audio=${savedAudioPaths.length} docs=${savedDocPaths.length}`);
545
577
  void dispatchInboundMessage({
546
578
  ctx,
547
579
  cfg,
@@ -554,11 +586,18 @@ export const chatHandlers = {
554
586
  agentRunStarted = true;
555
587
  context.logGateway.info(`webchat agent run started: sessionKey=${p.sessionKey} runId=${runId}`);
556
588
  },
557
- onModelSelected: (ctx) => {
558
- prefixContext.provider = ctx.provider;
559
- prefixContext.model = extractShortModelName(ctx.model);
560
- prefixContext.modelFull = `${ctx.provider}/${ctx.model}`;
561
- prefixContext.thinkingLevel = ctx.thinkLevel ?? "off";
589
+ onMediaResolved: hasAudioMedia
590
+ ? () => {
591
+ // STT complete — archive the resolved body (transcript) instead
592
+ // of the raw <media:audio> placeholder.
593
+ fireArchiveHook(ctx.Body);
594
+ }
595
+ : undefined,
596
+ onModelSelected: (modelCtx) => {
597
+ prefixContext.provider = modelCtx.provider;
598
+ prefixContext.model = extractShortModelName(modelCtx.model);
599
+ prefixContext.modelFull = `${modelCtx.provider}/${modelCtx.model}`;
600
+ prefixContext.thinkingLevel = modelCtx.thinkLevel ?? "off";
562
601
  },
563
602
  },
564
603
  })
@@ -80,8 +80,11 @@ export const ttsHandlers = {
80
80
  },
81
81
  "tts.setProvider": async ({ params, respond }) => {
82
82
  const provider = typeof params.provider === "string" ? params.provider.trim() : "";
83
- if (provider !== "openai" && provider !== "elevenlabs" && provider !== "edge") {
84
- respond(false, undefined, errorShape(ErrorCodes.INVALID_REQUEST, "Invalid provider. Use openai, elevenlabs, or edge."));
83
+ if (provider !== "openai" &&
84
+ provider !== "elevenlabs" &&
85
+ provider !== "edge" &&
86
+ provider !== "hume") {
87
+ respond(false, undefined, errorShape(ErrorCodes.INVALID_REQUEST, "Invalid provider. Use openai, elevenlabs, hume, or edge."));
85
88
  return;
86
89
  }
87
90
  try {
@@ -115,6 +118,12 @@ export const ttsHandlers = {
115
118
  configured: Boolean(resolveTtsApiKey(config, "elevenlabs")),
116
119
  models: ["eleven_multilingual_v2", "eleven_turbo_v2_5", "eleven_monolingual_v1"],
117
120
  },
121
+ {
122
+ id: "hume",
123
+ name: "Hume",
124
+ configured: Boolean(resolveTtsApiKey(config, "hume")),
125
+ models: [],
126
+ },
118
127
  {
119
128
  id: "edge",
120
129
  name: "Edge TTS",
@@ -9,6 +9,7 @@ import { CONFIG_PATH_TASKMASTER, isNixMode, loadConfig, migrateLegacyConfig, rea
9
9
  import { VERSION } from "../version.js";
10
10
  import { isDiagnosticsEnabled } from "../infra/diagnostic-events.js";
11
11
  import { logAcceptedEnvOption } from "../infra/env.js";
12
+ import { reconcileAgentContactTools } from "../config/agent-tools-reconcile.js";
12
13
  import { applyPluginAutoEnable } from "../config/plugin-auto-enable.js";
13
14
  import { clearAgentRunContext, onAgentEvent } from "../infra/agent-events.js";
14
15
  import { onHeartbeatEvent } from "../infra/heartbeat-events.js";
@@ -53,6 +54,7 @@ import { ensureWatchdogUnitOnStartup, scheduleWatchdogStabilityConfirmation, } f
53
54
  import { startGatewayTailscaleExposure } from "./server-tailscale.js";
54
55
  import { startWifiWatchdog } from "./server-wifi-watchdog.js";
55
56
  import { loadGatewayTlsRuntime } from "./server/tls.js";
57
+ import { isLoopbackHost } from "./net.js";
56
58
  import { createWizardSessionTracker } from "./server-wizard-sessions.js";
57
59
  import { attachGatewayWsHandlers } from "./server-ws-runtime.js";
58
60
  import { isLicenseValid } from "../license/validate.js";
@@ -121,6 +123,20 @@ export async function startGatewayServer(port = 18789, opts = {}) {
121
123
  log.warn(`gateway: failed to persist plugin auto-enable changes: ${String(err)}`);
122
124
  }
123
125
  }
126
+ // Reconcile agent tool groups (e.g. individual contact tools → group:contacts).
127
+ const toolReconcile = reconcileAgentContactTools({ config: configSnapshot.config });
128
+ if (toolReconcile.changes.length > 0) {
129
+ try {
130
+ await writeConfigFile(toolReconcile.config);
131
+ configSnapshot = await readConfigFileSnapshot();
132
+ log.info(`gateway: reconciled agent tools:\n${toolReconcile.changes
133
+ .map((entry) => `- ${entry}`)
134
+ .join("\n")}`);
135
+ }
136
+ catch (err) {
137
+ log.warn(`gateway: failed to persist agent tools reconciliation: ${String(err)}`);
138
+ }
139
+ }
124
140
  // Stamp config with running version on startup so upgrades keep the stamp current.
125
141
  const storedVersion = configSnapshot.config.meta?.lastTouchedVersion;
126
142
  if (configSnapshot.exists && storedVersion !== VERSION) {
@@ -211,10 +227,30 @@ export async function startGatewayServer(port = 18789, opts = {}) {
211
227
  const { wizardSessions, findRunningWizard, purgeWizardSession } = createWizardSessionTracker();
212
228
  const deps = createDefaultDeps();
213
229
  let canvasHostServer = null;
214
- const gatewayTls = await loadGatewayTlsRuntime(cfgAtStart.gateway?.tls, log.child("tls"));
215
- if (cfgAtStart.gateway?.tls?.enabled && !gatewayTls.enabled) {
230
+ // Auto-enable TLS when binding to a non-loopback address (LAN, custom, etc.)
231
+ // so that browser secure-context APIs (getUserMedia, etc.) work over .local.
232
+ // Only auto-enable when the user hasn't explicitly configured tls.enabled.
233
+ const tlsExplicit = cfgAtStart.gateway?.tls?.enabled;
234
+ const tlsAutoEnable = tlsExplicit === undefined && !isLoopbackHost(bindHost);
235
+ const effectiveTlsConfig = tlsAutoEnable
236
+ ? { ...cfgAtStart.gateway?.tls, enabled: true }
237
+ : cfgAtStart.gateway?.tls;
238
+ if (tlsAutoEnable) {
239
+ log.child("tls").info("gateway tls: auto-enabled for non-loopback bind");
240
+ }
241
+ const bonjourHostname = cfgAtStart.discovery?.bonjourHostname || "taskmaster";
242
+ const tlsHostnames = [bonjourHostname];
243
+ const gatewayTls = await loadGatewayTlsRuntime(effectiveTlsConfig, log.child("tls"), tlsHostnames);
244
+ if (tlsExplicit === true && !gatewayTls.enabled) {
245
+ // User explicitly enabled TLS — fail hard if it can't start.
216
246
  throw new Error(gatewayTls.error ?? "gateway tls: failed to enable");
217
247
  }
248
+ if (tlsAutoEnable && !gatewayTls.enabled) {
249
+ // Auto-enabled TLS failed — fall back to HTTP with a warning.
250
+ log
251
+ .child("tls")
252
+ .warn(`gateway tls: auto-enable failed (${gatewayTls.error ?? "unknown"}), continuing with HTTP`);
253
+ }
218
254
  const { canvasHost, httpServer, httpServers, httpBindHosts, wss, clients, broadcast, agentRunSeq, dedupe, chatRunState, chatRunBuffers, chatDeltaSentAt, addChatRun, removeChatRun, chatAbortControllers, } = await createGatewayRuntimeState({
219
255
  cfg: cfgAtStart,
220
256
  bindHost,
@@ -268,9 +304,6 @@ export async function startGatewayServer(port = 18789, opts = {}) {
268
304
  });
269
305
  const { getRuntimeSnapshot, startChannels, startChannel, stopChannel, markChannelLoggedOut } = channelManager;
270
306
  const machineDisplayName = await getMachineDisplayName();
271
- // Default to "taskmaster" hostname for mDNS so taskmaster.local works out of the box.
272
- // Users can override via discovery.bonjourHostname config if needed.
273
- const bonjourHostname = cfgAtStart.discovery?.bonjourHostname || "taskmaster";
274
307
  const discovery = await startGatewayDiscovery({
275
308
  machineDisplayName,
276
309
  port,
@@ -1,6 +1,7 @@
1
1
  import { execFile } from "node:child_process";
2
2
  import { X509Certificate } from "node:crypto";
3
3
  import fs from "node:fs/promises";
4
+ import os from "node:os";
4
5
  import path from "node:path";
5
6
  import { promisify } from "node:util";
6
7
  import { CONFIG_DIR, ensureDir, resolveUserPath, shortenHomeInString } from "../../utils.js";
@@ -15,6 +16,18 @@ async function fileExists(filePath) {
15
16
  return false;
16
17
  }
17
18
  }
19
/**
 * Build the value passed to openssl as `subjectAltName=<value>`.
 *
 * The result always starts with `DNS:localhost` and `IP:127.0.0.1`. Each usable
 * hostname then contributes two entries: its bare form and its `.local` form.
 * Duplicates are dropped and insertion order is preserved.
 *
 * @param {string[]} [hostnames] Names the certificate should cover. When the
 *   list is missing or empty, `os.hostname()` is used instead.
 * @returns {string} Comma-separated SAN entries.
 */
function buildSanString(hostnames) {
    const sans = new Set(["DNS:localhost", "IP:127.0.0.1"]);
    const raw = hostnames?.length ? hostnames : [os.hostname()];
    for (const h of raw) {
        // Ignore non-string entries instead of throwing on `.trim()`.
        if (typeof h !== "string")
            continue;
        // Trim BEFORE stripping the suffix: with surrounding whitespace the
        // `$`-anchored pattern would not match, and " box.local " would end up
        // as "DNS:box.local" plus "DNS:box.local.local".
        const name = h.trim().replace(/\.local$/i, "");
        // The name is spliced into a comma-separated option value, so anything
        // outside the hostname character set (notably "," and ":") could add
        // or corrupt SAN entries. Skip such names; this also covers "".
        if (!/^[A-Za-z0-9_.-]+$/.test(name))
            continue;
        sans.add(`DNS:${name}`);
        sans.add(`DNS:${name}.local`);
    }
    return [...sans].join(",");
}
18
31
  async function generateSelfSignedCert(params) {
19
32
  const certDir = path.dirname(params.certPath);
20
33
  const keyDir = path.dirname(params.keyPath);
@@ -22,6 +35,7 @@ async function generateSelfSignedCert(params) {
22
35
  if (keyDir !== certDir) {
23
36
  await ensureDir(keyDir);
24
37
  }
38
+ const san = buildSanString(params.hostnames);
25
39
  await execFileAsync("openssl", [
26
40
  "req",
27
41
  "-x509",
@@ -37,12 +51,14 @@ async function generateSelfSignedCert(params) {
37
51
  params.certPath,
38
52
  "-subj",
39
53
  "/CN=taskmaster-gateway",
54
+ "-addext",
55
+ `subjectAltName=${san}`,
40
56
  ]);
41
57
  await fs.chmod(params.keyPath, 0o600).catch(() => { });
42
58
  await fs.chmod(params.certPath, 0o600).catch(() => { });
43
- params.log?.info?.(`gateway tls: generated self-signed cert at ${shortenHomeInString(params.certPath)}`);
59
+ params.log?.info?.(`gateway tls: generated self-signed cert at ${shortenHomeInString(params.certPath)} (SAN: ${san})`);
44
60
  }
45
- export async function loadGatewayTlsRuntime(cfg, log) {
61
+ export async function loadGatewayTlsRuntime(cfg, log, hostnames) {
46
62
  if (!cfg || cfg.enabled !== true)
47
63
  return { enabled: false, required: false };
48
64
  const autoGenerate = cfg.autoGenerate !== false;
@@ -54,7 +70,7 @@ export async function loadGatewayTlsRuntime(cfg, log) {
54
70
  const hasKey = await fileExists(keyPath);
55
71
  if (!hasCert && !hasKey && autoGenerate) {
56
72
  try {
57
- await generateSelfSignedCert({ certPath, keyPath, log });
73
+ await generateSelfSignedCert({ certPath, keyPath, hostnames, log });
58
74
  }
59
75
  catch (err) {
60
76
  return {
@@ -1,4 +1,5 @@
1
1
  import { finalizeInboundContext } from "../auto-reply/reply/inbound-context.js";
2
+ import { logVerbose } from "../globals.js";
2
3
  import { extractMediaUserText, formatAudioTranscripts, formatMediaUnderstandingBody, } from "./format.js";
3
4
  import { runWithConcurrency } from "./concurrency.js";
4
5
  import { resolveConcurrency } from "./resolve.js";
@@ -42,6 +43,40 @@ export async function applyMediaUnderstanding(params) {
42
43
  if (decisions.length > 0) {
43
44
  ctx.MediaUnderstandingDecisions = [...(ctx.MediaUnderstandingDecisions ?? []), ...decisions];
44
45
  }
46
+ // Surface audio failures so the agent can inform the user instead of receiving
47
+ // a bare <media:audio> placeholder with no context about what went wrong.
48
+ const audioDecision = decisions.find((d) => d.capability === "audio");
49
+ const audioTranscribed = outputs.some((o) => o.kind === "audio.transcription");
50
+ const bodyHint = ctx.CommandBody ?? ctx.RawBody ?? ctx.Body ?? "";
51
+ const isAudioPlaceholder = /^<media:audio>/i.test(bodyHint.trim());
52
+ if (isAudioPlaceholder && !audioTranscribed) {
53
+ let reason;
54
+ if (ctx.MediaDownloadFailed) {
55
+ reason = "media download failed — the voice note could not be retrieved from WhatsApp";
56
+ }
57
+ else if (audioDecision?.outcome === "no-attachment") {
58
+ reason = "no audio file available for transcription";
59
+ }
60
+ else if (audioDecision?.outcome === "skipped") {
61
+ // Distinguish between "no providers at all" (empty attempts) and "providers tried but all failed"
62
+ const hasAttempts = audioDecision.attachments?.some((a) => a.attempts.length > 0);
63
+ reason = hasAttempts
64
+ ? "all transcription attempts failed"
65
+ : "no transcription provider configured (add an OpenAI, Google, Groq, or Deepgram API key)";
66
+ }
67
+ else if (audioDecision?.outcome === "disabled") {
68
+ reason = "audio transcription is disabled in config";
69
+ }
70
+ else {
71
+ reason = `transcription ${audioDecision?.outcome ?? "unavailable"}`;
72
+ }
73
+ const note = `[Voice note received but could not be transcribed: ${reason}]`;
74
+ logVerbose(`applyMediaUnderstanding: ${note}`);
75
+ ctx.Body = note;
76
+ ctx.CommandBody = note;
77
+ ctx.RawBody = note;
78
+ finalizeInboundContext(ctx, { forceBodyForAgent: true, forceBodyForCommands: true });
79
+ }
45
80
  if (outputs.length > 0) {
46
81
  ctx.Body = formatMediaUnderstandingBody({ body: ctx.Body, outputs });
47
82
  const audioOutputs = outputs.filter((output) => output.kind === "audio.transcription");
@@ -22,7 +22,7 @@ export async function transcribeDeepgramAudio(params) {
22
22
  }
23
23
  const headers = new Headers(params.headers);
24
24
  if (!headers.has("authorization")) {
25
- headers.set("authorization", `Token ${params.apiKey}`);
25
+ headers.set("authorization", `Token ${params.apiKey ?? ""}`);
26
26
  }
27
27
  if (!headers.has("content-type")) {
28
28
  headers.set("content-type", params.mime ?? "application/octet-stream");
@@ -23,7 +23,7 @@ export async function transcribeGeminiAudio(params) {
23
23
  headers.set("content-type", "application/json");
24
24
  }
25
25
  if (!headers.has("x-goog-api-key")) {
26
- headers.set("x-goog-api-key", params.apiKey);
26
+ headers.set("x-goog-api-key", params.apiKey ?? "");
27
27
  }
28
28
  const body = {
29
29
  contents: [
@@ -23,7 +23,7 @@ export async function describeGeminiVideo(params) {
23
23
  headers.set("content-type", "application/json");
24
24
  }
25
25
  if (!headers.has("x-goog-api-key")) {
26
- headers.set("x-goog-api-key", params.apiKey);
26
+ headers.set("x-goog-api-key", params.apiKey ?? "");
27
27
  }
28
28
  const body = {
29
29
  contents: [
@@ -5,6 +5,7 @@ import { googleProvider } from "./google/index.js";
5
5
  import { groqProvider } from "./groq/index.js";
6
6
  import { minimaxProvider } from "./minimax/index.js";
7
7
  import { openaiProvider } from "./openai/index.js";
8
+ import { sherpaOnnxProvider } from "./sherpa-onnx/index.js";
8
9
  const PROVIDERS = [
9
10
  groqProvider,
10
11
  openaiProvider,
@@ -12,6 +13,7 @@ const PROVIDERS = [
12
13
  anthropicProvider,
13
14
  minimaxProvider,
14
15
  deepgramProvider,
16
+ sherpaOnnxProvider,
15
17
  ];
16
18
  export function normalizeMediaProviderId(id) {
17
19
  const normalized = normalizeProviderId(id);
@@ -25,7 +25,7 @@ export async function transcribeOpenAiCompatibleAudio(params) {
25
25
  form.append("prompt", params.prompt.trim());
26
26
  const headers = new Headers(params.headers);
27
27
  if (!headers.has("authorization")) {
28
- headers.set("authorization", `Bearer ${params.apiKey}`);
28
+ headers.set("authorization", `Bearer ${params.apiKey ?? ""}`);
29
29
  }
30
30
  const res = await fetchWithTimeout(url, {
31
31
  method: "POST",
@@ -0,0 +1,10 @@
1
+ import { transcribeLocal, MODEL_LABEL } from "../../sherpa-onnx-local.js";
2
/**
 * Media-understanding provider entry for sherpa-onnx.
 *
 * Declares the `audio` capability and is flagged `isLocal`. Transcription is
 * delegated to `transcribeLocal`, called with the request's buffer and file name.
 */
export const sherpaOnnxProvider = {
    id: "sherpa-onnx",
    isLocal: true,
    capabilities: ["audio"],
    /**
     * Transcribe one audio request.
     *
     * @param req Request object; only `buffer` and `fileName` are read here.
     * @returns `{ text, model }`, where `model` falls back to `MODEL_LABEL`
     *   when the transcription result carries none.
     */
    async transcribeAudio(req) {
        const { text, model } = await transcribeLocal(req.buffer, req.fileName);
        return { text, model: model ?? MODEL_LABEL };
    },
};