vellum 0.2.8 → 0.2.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. package/bun.lock +2 -2
  2. package/package.json +3 -2
  3. package/src/__tests__/config-schema.test.ts +0 -6
  4. package/src/__tests__/forbidden-legacy-symbols.test.ts +69 -0
  5. package/src/__tests__/gateway-only-enforcement.test.ts +91 -11
  6. package/src/__tests__/ingress-url-consistency.test.ts +214 -0
  7. package/src/__tests__/ipc-snapshot.test.ts +17 -16
  8. package/src/__tests__/oauth2-gateway-transport.test.ts +7 -1
  9. package/src/__tests__/public-ingress-urls.test.ts +50 -34
  10. package/src/__tests__/runtime-events-sse-parity.test.ts +343 -0
  11. package/src/__tests__/runtime-events-sse.test.ts +162 -0
  12. package/src/__tests__/twilio-provider.test.ts +1 -1
  13. package/src/__tests__/twilio-routes.test.ts +4 -4
  14. package/src/__tests__/twitter-auth-handler.test.ts +87 -2
  15. package/src/calls/call-domain.ts +8 -6
  16. package/src/calls/twilio-config.ts +2 -3
  17. package/src/config/bundled-skills/tasks/TOOLS.json +25 -0
  18. package/src/config/bundled-skills/tasks/tools/task-queue-run.ts +9 -0
  19. package/src/config/bundled-skills/transcribe/SKILL.md +25 -0
  20. package/src/config/bundled-skills/transcribe/TOOLS.json +32 -0
  21. package/src/config/bundled-skills/transcribe/tools/transcribe-media.ts +370 -0
  22. package/src/config/defaults.ts +1 -2
  23. package/src/config/schema.ts +2 -6
  24. package/src/config/vellum-skills/google-oauth-setup/SKILL.md +5 -4
  25. package/src/config/vellum-skills/slack-oauth-setup/SKILL.md +4 -2
  26. package/src/config/vellum-skills/telegram-setup/SKILL.md +3 -3
  27. package/src/daemon/handlers/config.ts +33 -50
  28. package/src/daemon/handlers/shared.ts +1 -0
  29. package/src/daemon/handlers/subagents.ts +85 -2
  30. package/src/daemon/handlers/twitter-auth.ts +31 -2
  31. package/src/daemon/ipc-contract-inventory.json +4 -4
  32. package/src/daemon/ipc-contract.ts +25 -21
  33. package/src/daemon/lifecycle.ts +9 -4
  34. package/src/daemon/server.ts +7 -0
  35. package/src/daemon/session-tool-setup.ts +1 -1
  36. package/src/inbound/public-ingress-urls.ts +36 -30
  37. package/src/memory/db.ts +132 -5
  38. package/src/memory/llm-usage-store.ts +0 -1
  39. package/src/memory/runs-store.ts +51 -3
  40. package/src/memory/schema.ts +2 -2
  41. package/src/runtime/gateway-client.ts +7 -1
  42. package/src/runtime/http-server.ts +95 -10
  43. package/src/runtime/routes/channel-routes.ts +7 -2
  44. package/src/runtime/routes/events-routes.ts +79 -0
  45. package/src/runtime/routes/run-routes.ts +43 -0
  46. package/src/runtime/run-orchestrator.ts +64 -7
  47. package/src/security/oauth-callback-registry.ts +10 -0
  48. package/src/security/oauth2.ts +41 -7
  49. package/src/subagent/manager.ts +3 -1
  50. package/src/tools/tasks/work-item-run.ts +78 -0
  51. package/src/util/platform.ts +1 -1
  52. package/src/work-items/work-item-runner.ts +171 -0
  53. package/src/__tests__/handlers-twilio-config.test.ts +0 -221
  54. package/src/calls/__tests__/twilio-webhook-urls.test.ts +0 -162
  55. package/src/calls/twilio-webhook-urls.ts +0 -47
@@ -8,10 +8,11 @@ const testDir = mkdtempSync(join(tmpdir(), 'handlers-twitter-auth-test-'));
8
8
 
9
9
  // Track loadRawConfig / saveRawConfig calls
10
10
  let rawConfigStore: Record<string, unknown> = {};
11
+ let mockIngressPublicBaseUrl: string | undefined = 'https://test.example.com';
11
12
 
12
13
  mock.module('../config/loader.js', () => ({
13
14
  getConfig: () => ({}),
14
- loadConfig: () => ({}),
15
+ loadConfig: () => ({ ingress: { publicBaseUrl: mockIngressPublicBaseUrl } }),
15
16
  loadRawConfig: () => ({ ...rawConfigStore }),
16
17
  saveRawConfig: (cfg: Record<string, unknown>) => {
17
18
  rawConfigStore = { ...cfg };
@@ -20,6 +21,19 @@ mock.module('../config/loader.js', () => ({
20
21
  invalidateConfigCache: () => {},
21
22
  }));
22
23
 
24
+ mock.module('../inbound/public-ingress-urls.js', () => ({
25
+ getPublicBaseUrl: (config: { ingress?: { publicBaseUrl?: string } }) => {
26
+ const url = config?.ingress?.publicBaseUrl;
27
+ if (url) return url;
28
+ throw new Error('No public base URL configured.');
29
+ },
30
+ getOAuthCallbackUrl: (config: { ingress?: { publicBaseUrl?: string } }) => {
31
+ const url = config?.ingress?.publicBaseUrl;
32
+ if (!url) throw new Error('No public base URL configured.');
33
+ return `${url}/webhooks/oauth/callback`;
34
+ },
35
+ }));
36
+
23
37
  mock.module('../util/platform.js', () => ({
24
38
  getRootDir: () => testDir,
25
39
  getDataDir: () => testDir,
@@ -77,9 +91,15 @@ mock.module('../security/secure-keys.js', () => ({
77
91
  // Mock OAuth2 flow
78
92
  let oauthFlowResult: unknown = null;
79
93
  let oauthFlowError: Error | null = null;
94
+ let lastOAuthFlowOptions: Record<string, unknown> | undefined;
80
95
 
81
96
  mock.module('../security/oauth2.js', () => ({
82
- startOAuth2Flow: async (_config: unknown, callbacks: { openUrl: (url: string) => void }) => {
97
+ startOAuth2Flow: async (
98
+ _config: unknown,
99
+ callbacks: { openUrl: (url: string) => void },
100
+ options?: Record<string, unknown>,
101
+ ) => {
102
+ lastOAuthFlowOptions = options;
83
103
  // Trigger the openUrl callback so tests can verify the open_url message is sent
84
104
  callbacks.openUrl('https://twitter.com/i/oauth2/authorize?test=1');
85
105
  if (oauthFlowError) throw oauthFlowError;
@@ -163,6 +183,8 @@ describe('Twitter auth handler', () => {
163
183
  oauthFlowResult = null;
164
184
  oauthFlowError = null;
165
185
  lastUpsertPolicy = undefined;
186
+ lastOAuthFlowOptions = undefined;
187
+ mockIngressPublicBaseUrl = 'https://test.example.com';
166
188
  // Mock fetch for Twitter API
167
189
  globalThis.fetch = (async (_url: string | URL | Request) => {
168
190
  return mockFetchResponse;
@@ -267,6 +289,69 @@ describe('Twitter auth handler', () => {
267
289
  expect(meta!.accountInfo).toBe('@testuser');
268
290
  });
269
291
 
292
+ test('passes callbackTransport: gateway to startOAuth2Flow', async () => {
293
+ rawConfigStore = { twitterIntegrationMode: 'local_byo' };
294
+ secureKeyStore['credential:integration:twitter:oauth_client_id'] = 'test-client-id';
295
+
296
+ oauthFlowResult = {
297
+ tokens: {
298
+ accessToken: 'mock-access-token',
299
+ refreshToken: 'mock-refresh-token',
300
+ expiresIn: 7200,
301
+ scope: 'tweet.read users.read offline.access',
302
+ tokenType: 'bearer',
303
+ },
304
+ grantedScopes: ['tweet.read', 'users.read', 'offline.access'],
305
+ rawTokenResponse: {},
306
+ };
307
+
308
+ const msg: TwitterAuthStartRequest = { type: 'twitter_auth_start' };
309
+ const { ctx, sent } = createTestContext();
310
+ await handleMessage(msg, {} as net.Socket, ctx);
311
+
312
+ await new Promise((r) => setTimeout(r, 50));
313
+
314
+ // Verify startOAuth2Flow was called with gateway transport
315
+ expect(lastOAuthFlowOptions).toBeDefined();
316
+ expect(lastOAuthFlowOptions!.callbackTransport).toBe('gateway');
317
+ });
318
+
319
+ test('fails fast with actionable error when no ingress URL is configured', async () => {
320
+ rawConfigStore = { twitterIntegrationMode: 'local_byo' };
321
+ secureKeyStore['credential:integration:twitter:oauth_client_id'] = 'test-client-id';
322
+ mockIngressPublicBaseUrl = undefined;
323
+
324
+ oauthFlowResult = {
325
+ tokens: { accessToken: 'should-not-reach', refreshToken: undefined },
326
+ grantedScopes: [],
327
+ rawTokenResponse: {},
328
+ };
329
+
330
+ const msg: TwitterAuthStartRequest = { type: 'twitter_auth_start' };
331
+ const { ctx, sent } = createTestContext();
332
+ await handleMessage(msg, {} as net.Socket, ctx);
333
+
334
+ await new Promise((r) => setTimeout(r, 50));
335
+
336
+ // Should NOT have sent open_url — the flow should fail before reaching OAuth
337
+ const openUrlMsg = sent.find((m) => m.type === 'open_url');
338
+ expect(openUrlMsg).toBeUndefined();
339
+
340
+ const result = sent.find((m) => m.type === 'twitter_auth_result') as {
341
+ type: string;
342
+ success: boolean;
343
+ error?: string;
344
+ };
345
+ expect(result).toBeDefined();
346
+ expect(result.success).toBe(false);
347
+ expect(result.error).toContain('ingress.publicBaseUrl');
348
+ expect(result.error).toContain('INGRESS_PUBLIC_BASE_URL');
349
+ expect(result.error).toContain('/webhooks/oauth/callback');
350
+
351
+ // startOAuth2Flow should not have been called
352
+ expect(lastOAuthFlowOptions).toBeUndefined();
353
+ });
354
+
270
355
  describe('auth hardening', () => {
271
356
  test('OAuth cancel path returns sanitized failure', async () => {
272
357
  rawConfigStore = { twitterIntegrationMode: 'local_byo' };
@@ -20,7 +20,8 @@ import { getCallOrchestrator, unregisterCallOrchestrator } from './call-state.js
20
20
  import { activeRelayConnections } from './relay-server.js';
21
21
  import { TwilioConversationRelayProvider } from './twilio-provider.js';
22
22
  import { getTwilioConfig } from './twilio-config.js';
23
- import { buildTwilioVoiceWebhookUrl, buildTwilioStatusCallbackUrl } from './twilio-webhook-urls.js';
23
+ import { getTwilioVoiceWebhookUrl, getTwilioStatusCallbackUrl } from '../inbound/public-ingress-urls.js';
24
+ import { loadConfig } from '../config/loader.js';
24
25
  import type { CallSession } from './types.js';
25
26
 
26
27
  const log = getLogger('call-domain');
@@ -89,13 +90,14 @@ export async function startCall(input: StartCallInput): Promise<StartCallResult
89
90
  let sessionId: string | null = null;
90
91
 
91
92
  try {
92
- const config = getTwilioConfig();
93
+ const twilioConfig = getTwilioConfig();
94
+ const ingressConfig = loadConfig();
93
95
  const provider = new TwilioConversationRelayProvider();
94
96
 
95
97
  const session = createCallSession({
96
98
  conversationId,
97
99
  provider: 'twilio',
98
- fromNumber: config.phoneNumber,
100
+ fromNumber: twilioConfig.phoneNumber,
99
101
  toNumber: phoneNumber,
100
102
  task: callContext ? `${task}\n\nContext: ${callContext}` : task,
101
103
  });
@@ -104,10 +106,10 @@ export async function startCall(input: StartCallInput): Promise<StartCallResult
104
106
  log.info({ callSessionId: session.id, to: phoneNumber, task }, 'Initiating outbound call');
105
107
 
106
108
  const { callSid } = await provider.initiateCall({
107
- from: config.phoneNumber,
109
+ from: twilioConfig.phoneNumber,
108
110
  to: phoneNumber,
109
- webhookUrl: buildTwilioVoiceWebhookUrl(config.webhookBaseUrl, session.id),
110
- statusCallbackUrl: buildTwilioStatusCallbackUrl(config.webhookBaseUrl),
111
+ webhookUrl: getTwilioVoiceWebhookUrl(ingressConfig, session.id),
112
+ statusCallbackUrl: getTwilioStatusCallbackUrl(ingressConfig),
111
113
  });
112
114
 
113
115
  updateCallSession(session.id, { providerCallSid: callSid });
@@ -1,8 +1,7 @@
1
1
  import { getSecureKey } from '../security/secure-keys.js';
2
2
  import { getLogger } from '../util/logger.js';
3
3
  import { loadConfig } from '../config/loader.js';
4
- import { getWebhookBaseUrl } from './twilio-webhook-urls.js';
5
- import { getTwilioRelayUrl } from '../inbound/public-ingress-urls.js';
4
+ import { getPublicBaseUrl, getTwilioRelayUrl } from '../inbound/public-ingress-urls.js';
6
5
 
7
6
  const log = getLogger('twilio-config');
8
7
 
@@ -19,7 +18,7 @@ export function getTwilioConfig(): TwilioConfig {
19
18
  const authToken = getSecureKey('credential:twilio:auth_token');
20
19
  const phoneNumber = process.env.TWILIO_PHONE_NUMBER || getSecureKey('credential:twilio:phone_number') || '';
21
20
  const config = loadConfig();
22
- const webhookBaseUrl = getWebhookBaseUrl(config);
21
+ const webhookBaseUrl = getPublicBaseUrl(config);
23
22
 
24
23
  // In gateway_only mode, ignore TWILIO_WSS_BASE_URL and always use the
25
24
  // centralized relay URL derived from the public ingress base URL.
@@ -251,6 +251,31 @@
251
251
  },
252
252
  "executor": "tools/task-list-remove.ts",
253
253
  "execution_target": "host"
254
+ },
255
+ {
256
+ "name": "task_queue_run",
257
+ "description": "Run a task from the Task Queue in the background. Use this when the user says \"run this task\", \"execute this task\", \"start this task\", or wants to kick off a queued work item. The task runs asynchronously — the user can continue chatting while it executes. Required tool permissions are auto-approved since the user is explicitly requesting execution.",
258
+ "category": "tasks",
259
+ "risk": "medium",
260
+ "input_schema": {
261
+ "type": "object",
262
+ "properties": {
263
+ "work_item_id": {
264
+ "type": "string",
265
+ "description": "Direct work item ID (most precise selector)"
266
+ },
267
+ "task_name": {
268
+ "type": "string",
269
+ "description": "Task name/title to search for (case-insensitive substring match)"
270
+ },
271
+ "title": {
272
+ "type": "string",
273
+ "description": "Work item title to search for (case-insensitive substring match)"
274
+ }
275
+ }
276
+ },
277
+ "executor": "tools/task-queue-run.ts",
278
+ "execution_target": "host"
254
279
  }
255
280
  ]
256
281
  }
@@ -0,0 +1,9 @@
1
+ import type { ToolContext, ToolExecutionResult } from '../../../../tools/types.js';
2
+ import { executeTaskQueueRun } from '../../../../tools/tasks/work-item-run.js';
3
+
4
/**
 * Skill entry point for the `task_queue_run` tool.
 *
 * This is a thin adapter: the tool input and execution context are handed
 * unchanged to `executeTaskQueueRun`, and its result is returned as-is.
 *
 * @param input - Raw tool input as received from the caller.
 * @param context - Execution context supplied by the tool runtime.
 * @returns Whatever `executeTaskQueueRun` resolves to.
 */
export async function run(
  input: Record<string, unknown>,
  context: ToolContext,
): Promise<ToolExecutionResult> {
  const outcome = await executeTaskQueueRun(input, context);
  return outcome;
}
@@ -0,0 +1,25 @@
1
+ ---
2
+ name: "Transcribe"
3
+ description: "Transcribe audio and video files using Whisper (cloud API or local)"
4
+ metadata: {"vellum": {"emoji": "🎙️"}}
5
+ ---
6
+
7
+ Transcribe audio and video files using OpenAI's Whisper model — either via the cloud API or locally via whisper.cpp.
8
+
9
+ ## Choosing a Mode
10
+
11
+ Before transcribing, **ask the user which mode they prefer** if they haven't specified:
12
+
13
+ 1. **`api`** — Uses the OpenAI Whisper API. Fast, accurate, no setup needed. Requires an OpenAI API key (check if one is already configured). Audio is sent to OpenAI's servers. Costs ~$0.006/min.
14
+ 2. **`local`** — Uses whisper.cpp installed via Homebrew. Free, private, runs entirely on-device. Requires a one-time `brew install whisper-cpp`. Slightly slower but no data leaves the machine.
15
+
16
+ If the user says "cloud", "API", or "online" → use `api`.
17
+ If the user says "local", "offline", "private", or "on-device" → use `local`.
18
+
19
+ ## Usage Notes
20
+
21
+ - The tool accepts either a `file_path` (absolute path to a local file) or an `attachment_id` (for uploaded attachments). Prefer `file_path` when the user references a file on disk.
22
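+ - Supported formats: for `file_path`, common video (mp4, mov, avi, mkv, webm, m4v, mpeg, mpg) and audio (mp3, wav, m4a, aac, ogg, flac, aiff, wma) files, recognized by extension; for `attachment_id`, any attachment whose MIME type is `video/*` or `audio/*`.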
23
+ - For video files, audio is automatically extracted via ffmpeg before transcription.
24
+ - The API mode has a 25MB per-request limit — large files are automatically split into chunks.
25
+ - Local mode requires whisper.cpp (`brew install whisper-cpp`). The model is downloaded automatically on first use.
@@ -0,0 +1,32 @@
1
+ {
2
+ "version": 1,
3
+ "tools": [
4
+ {
5
+ "name": "transcribe_media",
6
+ "description": "Transcribe an audio or video file using Whisper. Provide either a file_path to a local file or an attachment_id for an uploaded attachment. Set mode to 'api' (OpenAI cloud) or 'local' (whisper.cpp on-device). Ask the user which mode they prefer before calling.",
7
+ "category": "transcribe",
8
+ "risk": "low",
9
+ "input_schema": {
10
+ "type": "object",
11
+ "properties": {
12
+ "file_path": {
13
+ "type": "string",
14
+ "description": "Absolute path to a local audio or video file to transcribe"
15
+ },
16
+ "attachment_id": {
17
+ "type": "string",
18
+ "description": "The ID of an attached audio or video file to transcribe"
19
+ },
20
+ "mode": {
21
+ "type": "string",
22
+ "enum": ["api", "local"],
23
+ "description": "Transcription backend: 'api' for OpenAI Whisper API (cloud), 'local' for whisper.cpp (on-device)"
24
+ }
25
+ },
26
+ "required": ["mode"]
27
+ },
28
+ "executor": "tools/transcribe-media.ts",
29
+ "execution_target": "host"
30
+ }
31
+ ]
32
+ }
@@ -0,0 +1,370 @@
1
+ import { tmpdir } from 'node:os';
2
+ import { join, extname } from 'node:path';
3
+ import { writeFile, unlink, access, readFile, mkdir, readdir } from 'node:fs/promises';
4
+ import { randomUUID } from 'node:crypto';
5
+ import type { ToolContext, ToolExecutionResult } from '../../../../tools/types.js';
6
+ import { getAttachmentsByIds } from '../../../../memory/attachments-store.js';
7
+ import { getConfig } from '../../../../config/loader.js';
8
+
9
/** Lower-case file extensions (with leading dot) accepted as video input. */
const VIDEO_EXTENSIONS = new Set<string>([
  '.mp4',
  '.mov',
  '.avi',
  '.mkv',
  '.webm',
  '.m4v',
  '.mpeg',
  '.mpg',
]);

/** Lower-case file extensions (with leading dot) accepted as audio input. */
const AUDIO_EXTENSIONS = new Set<string>([
  '.mp3',
  '.wav',
  '.m4a',
  '.aac',
  '.ogg',
  '.flac',
  '.aiff',
  '.wma',
]);

/** Upper bound for a single ffmpeg invocation: 2 minutes, in milliseconds. */
const FFMPEG_TIMEOUT_MS = 2 * 60 * 1000;

/** Largest file sent to the OpenAI Whisper API in one request: 25 MiB, in bytes. */
const WHISPER_API_MAX_BYTES = 25 * 1024 * 1024;

/**
 * Length of each chunk when an oversized file is split for the API: 10 minutes,
 * in seconds. Chunks are written as 16 kHz mono 16-bit WAV, which keeps a
 * 10-minute chunk below the 25 MiB request limit.
 */
const API_CHUNK_DURATION_SECS = 10 * 60;

/** Upper bound for a single Whisper API request: 5 minutes, in milliseconds. */
const API_REQUEST_TIMEOUT_MS = 5 * 60 * 1000;

/** Upper bound for a single whisper.cpp transcription run: 10 minutes, in milliseconds. */
const LOCAL_CHUNK_TIMEOUT_MS = 10 * 60 * 1000;
26
+
27
+ // ---------------------------------------------------------------------------
28
+ // Helpers
29
+ // ---------------------------------------------------------------------------
30
+
31
/**
 * Run a command, capture its output, and enforce a wall-clock timeout.
 *
 * stdout and stderr are drained while the process is running rather than after
 * it exits, so a child that produces a lot of output cannot stall on a full
 * pipe. If reading either stream fails, the returned promise rejects instead of
 * staying pending.
 *
 * @param cmd - Executable followed by its arguments.
 * @param timeoutMs - Milliseconds to wait before killing the process.
 * @returns The exit code plus the complete stdout and stderr text.
 * @throws Error when the timeout elapses (the process is killed first), or when
 *   spawning or reading the output streams fails.
 */
function spawnWithTimeout(
  cmd: string[],
  timeoutMs: number,
): Promise<{ exitCode: number; stdout: string; stderr: string }> {
  return new Promise((resolve, reject) => {
    const proc = Bun.spawn(cmd, { stdout: 'pipe', stderr: 'pipe' });

    const timer = setTimeout(() => {
      proc.kill();
      reject(new Error(`Process timed out after ${timeoutMs}ms: ${cmd[0]}`));
    }, timeoutMs);

    // Start consuming both pipes immediately, in parallel with waiting for exit.
    Promise.all([
      proc.exited,
      new Response(proc.stdout).text(),
      new Response(proc.stderr).text(),
    ])
      .then(([exitCode, stdout, stderr]) => {
        // If the timeout already rejected, this resolve is a no-op.
        resolve({ exitCode, stdout, stderr });
      })
      .catch(reject)
      .finally(() => clearTimeout(timer));
  });
}
49
+
50
/**
 * Ask ffprobe for the duration of a media file.
 *
 * @param audioPath - Path of the file to probe.
 * @returns Duration in seconds, or 0 when ffprobe exits non-zero or prints
 *   something that does not parse as a number.
 */
async function getAudioDuration(audioPath: string): Promise<number> {
  const probeCommand = [
    'ffprobe',
    '-v', 'error',
    '-show_entries', 'format=duration',
    '-of', 'csv=p=0',
    audioPath,
  ];
  const { exitCode, stdout } = await spawnWithTimeout(probeCommand, 10_000);
  if (exitCode !== 0) {
    return 0;
  }

  const seconds = parseFloat(stdout.trim());
  // NaN (unparseable output) is falsy and collapses to 0.
  return seconds || 0;
}
60
+
61
/**
 * Cut an audio file into fixed-length 16 kHz mono 16-bit WAV segments using
 * ffmpeg's segment muxer.
 *
 * @param audioPath - File to split.
 * @param chunkDir - Existing directory that receives `chunk-NNN.wav` files.
 * @param chunkDurationSecs - Target length of each segment, in seconds.
 * @returns Paths of the produced chunk files, sorted by file name.
 * @throws Error when ffmpeg exits with a non-zero status.
 */
async function splitAudio(
  audioPath: string,
  chunkDir: string,
  chunkDurationSecs: number,
): Promise<string[]> {
  const segmentCommand = [
    'ffmpeg', '-y',
    '-i', audioPath,
    '-f', 'segment',
    '-segment_time', String(chunkDurationSecs),
    '-acodec', 'pcm_s16le',
    '-ar', '16000',
    '-ac', '1',
    join(chunkDir, 'chunk-%03d.wav'),
  ];

  const { exitCode, stderr } = await spawnWithTimeout(segmentCommand, FFMPEG_TIMEOUT_MS);
  if (exitCode !== 0) {
    throw new Error(`Failed to split audio: ${stderr.slice(0, 300)}`);
  }

  const chunkPaths: string[] = [];
  const entries = await readdir(chunkDir);
  for (const entry of entries.sort()) {
    if (entry.startsWith('chunk-') && entry.endsWith('.wav')) {
      chunkPaths.push(join(chunkDir, entry));
    }
  }
  return chunkPaths;
}
86
+
87
+ // ---------------------------------------------------------------------------
88
+ // Source resolution
89
+ // ---------------------------------------------------------------------------
90
+
91
/**
 * Determine which media file the tool call refers to.
 *
 * `file_path` takes precedence over `attachment_id`. A file path must exist and
 * carry one of the known video/audio extensions. An attachment must have a
 * `video/*` or `audio/*` MIME type; its base64 payload is written to a fresh
 * temporary file.
 *
 * @param input - Raw tool input; reads `file_path` and `attachment_id`.
 * @returns Either the resolved source (`tempFile` is the path the caller must
 *   delete, or null when nothing was created) or an error result describing
 *   why the input was rejected.
 */
async function resolveSource(
  input: Record<string, unknown>,
): Promise<{ inputPath: string; isVideo: boolean; tempFile: string | null } | ToolExecutionResult> {
  const fail = (content: string): ToolExecutionResult => ({ content, isError: true });
  const unsupported = (kind: string): ToolExecutionResult =>
    fail(`Unsupported file type: ${kind}. Only video and audio files can be transcribed.`);

  const filePath = input.file_path as string | undefined;
  const attachmentId = input.attachment_id as string | undefined;

  if (filePath) {
    const exists = await access(filePath).then(() => true, () => false);
    if (!exists) {
      return fail(`File not found: ${filePath}`);
    }

    const ext = extname(filePath).toLowerCase();
    if (VIDEO_EXTENSIONS.has(ext)) {
      return { inputPath: filePath, isVideo: true, tempFile: null };
    }
    if (AUDIO_EXTENSIONS.has(ext)) {
      return { inputPath: filePath, isVideo: false, tempFile: null };
    }
    return unsupported(ext);
  }

  if (!attachmentId) {
    return fail('Provide either file_path or attachment_id.');
  }

  const matches = getAttachmentsByIds([attachmentId]);
  if (matches.length === 0) {
    return fail(`Attachment not found: ${attachmentId}`);
  }

  const { mimeType, dataBase64 } = matches[0];
  const isVideo = mimeType.startsWith('video/');
  if (!isVideo && !mimeType.startsWith('audio/')) {
    return unsupported(mimeType);
  }

  // The extension is chosen from the MIME family only; ffmpeg is given this path as input.
  const suffix = isVideo ? '.mp4' : '.m4a';
  const tempPath = join(tmpdir(), `vellum-transcribe-in-${randomUUID()}${suffix}`);
  await writeFile(tempPath, Buffer.from(dataBase64, 'base64'));
  return { inputPath: tempPath, isVideo, tempFile: tempPath };
}
128
+
129
/**
 * Convert the source to a 16 kHz mono 16-bit WAV file for consistent processing.
 *
 * The output is written to a uniquely named file in the OS temp directory. If
 * the conversion fails or times out, any partially written output is removed
 * before the error is rethrown, because the caller only learns the path (and
 * can only clean it up) on success.
 *
 * @param inputPath - Media file to convert.
 * @param isVideo - When true, the video stream is dropped (`-vn`).
 * @returns Path of the generated WAV file; the caller is responsible for deleting it.
 * @throws Error when ffmpeg exits non-zero, or whatever `spawnWithTimeout` rejects with.
 */
async function toWav(inputPath: string, isVideo: boolean): Promise<string> {
  const wavPath = join(tmpdir(), `vellum-transcribe-${randomUUID()}.wav`);
  const args = [
    'ffmpeg', '-y', '-i', inputPath,
    ...(isVideo ? ['-vn'] : []),
    '-acodec', 'pcm_s16le', '-ar', '16000', '-ac', '1', wavPath,
  ];

  try {
    const result = await spawnWithTimeout(args, FFMPEG_TIMEOUT_MS);
    if (result.exitCode !== 0) {
      throw new Error(`ffmpeg failed: ${result.stderr.slice(0, 500)}`);
    }
    return wavPath;
  } catch (err) {
    // Best-effort cleanup: the file may not exist if ffmpeg never started writing.
    await unlink(wavPath).catch(() => {});
    throw err;
  }
}
141
+
142
+ // ---------------------------------------------------------------------------
143
+ // API mode — OpenAI Whisper API
144
+ // ---------------------------------------------------------------------------
145
+
146
/**
 * Transcribe a WAV file with the OpenAI Whisper API.
 *
 * Files no larger than `WHISPER_API_MAX_BYTES` are sent in a single request.
 * Larger files are split into `API_CHUNK_DURATION_SECS`-long chunks inside a
 * temporary directory, transcribed one after another, and the non-empty chunk
 * transcripts are joined with single spaces. The chunk directory is removed
 * afterwards regardless of outcome.
 *
 * The duration is probed only on the chunking path and only to decorate the
 * progress message, so a failing ffprobe never aborts the transcription.
 *
 * @param audioPath - WAV file to transcribe.
 * @param apiKey - OpenAI API key used as the bearer token.
 * @param context - Tool context; `onOutput` receives progress text and `signal`
 *   is checked before each chunk.
 * @returns The transcript text (possibly empty).
 * @throws Error('Cancelled') when the signal is aborted between chunks; errors
 *   from splitting or from the API requests propagate unchanged.
 */
async function transcribeViaApi(
  audioPath: string,
  apiKey: string,
  context: ToolContext,
): Promise<string> {
  // If small enough, send directly without spawning ffprobe at all.
  if (Bun.file(audioPath).size <= WHISPER_API_MAX_BYTES) {
    return whisperApiRequest(audioPath, apiKey);
  }

  // Split into chunks for large files
  const chunkDir = join(tmpdir(), `vellum-transcribe-api-chunks-${randomUUID()}`);
  await mkdir(chunkDir, { recursive: true });

  try {
    // Informational only: fall back to an unlabelled message if probing fails.
    const duration = await getAudioDuration(audioPath).catch(() => 0);
    const durationLabel = duration > 0 ? ` (${Math.round(duration / 60)}min)` : '';
    context.onOutput?.(`Large file${durationLabel} — splitting into chunks...\n`);

    const chunks = await splitAudio(audioPath, chunkDir, API_CHUNK_DURATION_SECS);
    const parts: string[] = [];

    for (let i = 0; i < chunks.length; i++) {
      if (context.signal?.aborted) throw new Error('Cancelled');
      context.onOutput?.(`  Transcribing chunk ${i + 1}/${chunks.length}...\n`);
      const text = await whisperApiRequest(chunks[i], apiKey);
      if (text) parts.push(text);
    }

    return parts.join(' ');
  } finally {
    const { rm } = await import('node:fs/promises');
    await rm(chunkDir, { recursive: true, force: true }).catch(() => {});
  }
}
181
+
182
/**
 * Upload one WAV file to the OpenAI transcription endpoint using the
 * `whisper-1` model.
 *
 * @param audioPath - WAV file to upload; it is read fully into memory.
 * @param apiKey - OpenAI API key sent as a bearer token.
 * @returns The trimmed `text` field of the response, or '' when it is absent.
 * @throws Error when the response status is not OK (the message carries the
 *   status and up to 300 characters of the body), or when the request exceeds
 *   `API_REQUEST_TIMEOUT_MS`.
 */
async function whisperApiRequest(audioPath: string, apiKey: string): Promise<string> {
  const wavBytes = await readFile(audioPath);

  const form = new FormData();
  form.append('file', new Blob([wavBytes], { type: 'audio/wav' }), 'audio.wav');
  form.append('model', 'whisper-1');

  const response = await fetch('https://api.openai.com/v1/audio/transcriptions', {
    method: 'POST',
    headers: { 'Authorization': `Bearer ${apiKey}` },
    body: form,
    signal: AbortSignal.timeout(API_REQUEST_TIMEOUT_MS),
  });

  if (response.ok) {
    const payload = (await response.json()) as { text?: string };
    return payload.text?.trim() ?? '';
  }

  // Reading the error body is best-effort; an unreadable body becomes ''.
  const errorBody = await response.text().catch(() => '');
  throw new Error(`Whisper API error (${response.status}): ${errorBody.slice(0, 300)}`);
}
203
+
204
+ // ---------------------------------------------------------------------------
205
+ // Local mode — whisper.cpp
206
+ // ---------------------------------------------------------------------------
207
+
208
/**
 * Transcribe a WAV file on-device with the `whisper-cpp` command-line tool.
 *
 * Audio whose probed duration is between 0 (exclusive) and 30 minutes
 * (inclusive) is transcribed in one run. Anything else, including audio whose
 * duration could not be determined, is split into 10-minute chunks so progress
 * can be reported per chunk; non-empty chunk transcripts are joined with spaces.
 *
 * @param audioPath - WAV file to transcribe.
 * @param context - Tool context; `onOutput` receives progress text and `signal`
 *   is checked before each chunk.
 * @returns The transcript text (possibly empty).
 * @throws Error when `whisper-cpp` is not on the PATH, Error('Cancelled') when
 *   the signal is aborted between chunks; helper errors propagate unchanged.
 */
async function transcribeViaLocal(
  audioPath: string,
  context: ToolContext,
): Promise<string> {
  // Bail out early with install instructions when the binary is missing.
  const { exitCode: whichExit } = await spawnWithTimeout(['which', 'whisper-cpp'], 5_000);
  if (whichExit !== 0) {
    throw new Error(
      'whisper-cpp is not installed. Install it with: brew install whisper-cpp'
    );
  }

  // Locate the base model, downloading it when it is not present yet.
  const modelPath = await resolveWhisperModel(context);

  const seconds = await getAudioDuration(audioPath);
  const minutes = Math.round(seconds / 60);
  const needsChunking = !(seconds > 0 && seconds <= 1800);

  if (!needsChunking) {
    context.onOutput?.(`Transcribing ${minutes}min of audio locally...\n`);
    return await whisperCppRun(audioPath, modelPath);
  }

  const chunkDir = join(tmpdir(), `vellum-transcribe-local-chunks-${randomUUID()}`);
  await mkdir(chunkDir, { recursive: true });

  try {
    context.onOutput?.(`Large file (${minutes}min) — splitting into chunks...\n`);
    const chunkPaths = await splitAudio(audioPath, chunkDir, 600);
    const transcripts: string[] = [];

    for (const [index, chunkPath] of chunkPaths.entries()) {
      if (context.signal?.aborted) {
        throw new Error('Cancelled');
      }
      context.onOutput?.(`  Transcribing chunk ${index + 1}/${chunkPaths.length}...\n`);
      const chunkText = await whisperCppRun(chunkPath, modelPath);
      if (chunkText) {
        transcripts.push(chunkText);
      }
    }

    return transcripts.join(' ');
  } finally {
    // Always remove the chunk directory; cleanup failures are ignored.
    const { rm } = await import('node:fs/promises');
    await rm(chunkDir, { recursive: true, force: true }).catch(() => {});
  }
}
253
+
254
/**
 * Locate a whisper.cpp base model on disk, downloading `ggml-base.en.bin` into
 * `~/.vellum/models` when none of the known locations has one.
 *
 * The download is written to a uniquely named temporary file next to the final
 * path and renamed into place only after the write succeeds. A write that is
 * interrupted therefore never leaves a truncated file at a path that later
 * calls would accept as an installed model.
 *
 * @param context - Tool context; `onOutput` receives download progress text.
 * @returns Path of an existing (or freshly downloaded) model file.
 * @throws Error when the download responds with a non-OK status; filesystem
 *   errors while writing or renaming propagate after the temp file is removed.
 */
async function resolveWhisperModel(context: ToolContext): Promise<string> {
  // Check common locations for the base model
  const homeDir = process.env.HOME ?? '/tmp';
  const candidates = [
    join(homeDir, '.vellum', 'models', 'ggml-base.en.bin'),
    join(homeDir, '.vellum', 'models', 'ggml-base.bin'),
    '/usr/local/share/whisper-cpp/models/ggml-base.en.bin',
    '/opt/homebrew/share/whisper-cpp/models/ggml-base.en.bin',
  ];

  for (const p of candidates) {
    try { await access(p); return p; } catch { /* next */ }
  }

  // Download the base.en model (~140MB)
  const modelDir = join(homeDir, '.vellum', 'models');
  await mkdir(modelDir, { recursive: true });
  const modelPath = join(modelDir, 'ggml-base.en.bin');
  const modelUrl = 'https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin';

  context.onOutput?.('Downloading Whisper base.en model (~140MB)...\n');

  const response = await fetch(modelUrl);
  if (!response.ok) {
    throw new Error(`Failed to download model: ${response.status}`);
  }

  const data = Buffer.from(await response.arrayBuffer());

  // Stage in the same directory so the rename stays on one filesystem.
  const partialPath = `${modelPath}.${randomUUID()}.partial`;
  const { rename, rm } = await import('node:fs/promises');
  try {
    await writeFile(partialPath, data);
    await rename(partialPath, modelPath);
  } catch (err) {
    await rm(partialPath, { force: true }).catch(() => {});
    throw err;
  }
  context.onOutput?.('Model downloaded.\n');

  return modelPath;
}
287
+
288
/**
 * Run `whisper-cpp` on one audio file and return its transcript as a single line.
 *
 * @param audioPath - Audio file passed via `-f`.
 * @param modelPath - Model file passed via `-m`.
 * @returns The non-blank stdout lines, trimmed and joined with single spaces.
 * @throws Error when the process exits non-zero; the message carries up to 300
 *   characters of stderr.
 */
async function whisperCppRun(audioPath: string, modelPath: string): Promise<string> {
  const command = [
    'whisper-cpp',
    '-m', modelPath,
    '-f', audioPath,
    '--no-timestamps',
  ];
  const { exitCode, stdout, stderr } = await spawnWithTimeout(command, LOCAL_CHUNK_TIMEOUT_MS);

  if (exitCode !== 0) {
    throw new Error(`whisper-cpp failed: ${stderr.slice(0, 300)}`);
  }

  // The transcript is taken from stdout only; stderr is used solely for the
  // failure message above. Blank lines are dropped and whitespace collapsed.
  const spoken: string[] = [];
  for (const rawLine of stdout.split('\n')) {
    const line = rawLine.trim();
    if (line.length > 0) {
      spoken.push(line);
    }
  }
  return spoken.join(' ');
}
309
+
310
+ // ---------------------------------------------------------------------------
311
+ // Main entry point
312
+ // ---------------------------------------------------------------------------
313
+
314
/**
 * Entry point for the `transcribe_media` tool.
 *
 * Validates `mode`, resolves the OpenAI key once when `mode` is 'api', resolves
 * the source file, converts it to WAV, and transcribes it with the selected
 * backend. Temporary files created along the way are deleted in all cases.
 *
 * @param input - Tool input; reads `mode`, plus `file_path` / `attachment_id`
 *   through `resolveSource`.
 * @param context - Tool context forwarded to the transcription backends.
 * @returns The transcript, a "no speech" notice, or an error result. Failures
 *   during conversion or transcription are reported as error results rather
 *   than thrown.
 */
export async function run(
  input: Record<string, unknown>,
  context: ToolContext,
): Promise<ToolExecutionResult> {
  const mode = input.mode;
  if (mode !== 'api' && mode !== 'local') {
    return {
      content: "Please specify mode: 'api' (OpenAI cloud) or 'local' (whisper.cpp on-device). Ask the user which they prefer.",
      isError: true,
    };
  }

  // Read the key once, up front, so the later call needs no second config
  // lookup and no non-null assertion.
  let apiKey: string | undefined;
  if (mode === 'api') {
    apiKey = getConfig().apiKeys.openai;
    if (!apiKey) {
      return {
        content: 'No OpenAI API key configured. Set your OpenAI API key to use cloud transcription, or use mode "local" for on-device transcription with whisper.cpp.',
        isError: true,
      };
    }
  }

  const source = await resolveSource(input);
  if ('isError' in source) return source;

  const { inputPath, isVideo, tempFile } = source;
  let wavPath: string | null = null;

  try {
    // Convert to WAV
    wavPath = await toWav(inputPath, isVideo);

    // apiKey is only ever set (and non-empty) in 'api' mode.
    const text = apiKey
      ? await transcribeViaApi(wavPath, apiKey, context)
      : await transcribeViaLocal(wavPath, context);

    if (!text.trim()) {
      return { content: 'No speech detected in the audio.', isError: false };
    }

    return { content: text, isError: false };
  } catch (err) {
    // Anything can be thrown; only Error instances have a reliable message.
    const reason = err instanceof Error ? err.message : String(err);
    return {
      content: `Transcription failed: ${reason}`,
      isError: true,
    };
  } finally {
    if (tempFile) { try { await unlink(tempFile); } catch { /* ignore */ } }
    if (wavPath) { try { await unlink(wavPath); } catch { /* ignore */ } }
  }
}
@@ -217,7 +217,6 @@ export const DEFAULT_CONFIG: AssistantConfig = {
217
217
  calls: {
218
218
  enabled: true,
219
219
  provider: 'twilio' as const,
220
- webhookBaseUrl: '',
221
220
  maxDurationSeconds: 3600,
222
221
  userConsultTimeoutSeconds: 120,
223
222
  disclosure: {
@@ -230,6 +229,6 @@ export const DEFAULT_CONFIG: AssistantConfig = {
230
229
  },
231
230
  ingress: {
232
231
  publicBaseUrl: '',
233
- mode: 'compat' as const,
232
+ mode: 'gateway_only' as const,
234
233
  },
235
234
  };