vellum 0.2.8 → 0.2.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bun.lock +2 -2
- package/package.json +3 -2
- package/src/__tests__/config-schema.test.ts +0 -6
- package/src/__tests__/forbidden-legacy-symbols.test.ts +69 -0
- package/src/__tests__/gateway-only-enforcement.test.ts +91 -11
- package/src/__tests__/ingress-url-consistency.test.ts +214 -0
- package/src/__tests__/ipc-snapshot.test.ts +17 -16
- package/src/__tests__/oauth2-gateway-transport.test.ts +7 -1
- package/src/__tests__/public-ingress-urls.test.ts +50 -34
- package/src/__tests__/runtime-events-sse-parity.test.ts +343 -0
- package/src/__tests__/runtime-events-sse.test.ts +162 -0
- package/src/__tests__/twilio-provider.test.ts +1 -1
- package/src/__tests__/twilio-routes.test.ts +4 -4
- package/src/__tests__/twitter-auth-handler.test.ts +87 -2
- package/src/calls/call-domain.ts +8 -6
- package/src/calls/twilio-config.ts +2 -3
- package/src/config/bundled-skills/tasks/TOOLS.json +25 -0
- package/src/config/bundled-skills/tasks/tools/task-queue-run.ts +9 -0
- package/src/config/bundled-skills/transcribe/SKILL.md +25 -0
- package/src/config/bundled-skills/transcribe/TOOLS.json +32 -0
- package/src/config/bundled-skills/transcribe/tools/transcribe-media.ts +370 -0
- package/src/config/defaults.ts +1 -2
- package/src/config/schema.ts +2 -6
- package/src/config/vellum-skills/google-oauth-setup/SKILL.md +5 -4
- package/src/config/vellum-skills/slack-oauth-setup/SKILL.md +4 -2
- package/src/config/vellum-skills/telegram-setup/SKILL.md +3 -3
- package/src/daemon/handlers/config.ts +33 -50
- package/src/daemon/handlers/shared.ts +1 -0
- package/src/daemon/handlers/subagents.ts +85 -2
- package/src/daemon/handlers/twitter-auth.ts +31 -2
- package/src/daemon/ipc-contract-inventory.json +4 -4
- package/src/daemon/ipc-contract.ts +25 -21
- package/src/daemon/lifecycle.ts +9 -4
- package/src/daemon/server.ts +7 -0
- package/src/daemon/session-tool-setup.ts +1 -1
- package/src/inbound/public-ingress-urls.ts +36 -30
- package/src/memory/db.ts +132 -5
- package/src/memory/llm-usage-store.ts +0 -1
- package/src/memory/runs-store.ts +51 -3
- package/src/memory/schema.ts +2 -2
- package/src/runtime/gateway-client.ts +7 -1
- package/src/runtime/http-server.ts +95 -10
- package/src/runtime/routes/channel-routes.ts +7 -2
- package/src/runtime/routes/events-routes.ts +79 -0
- package/src/runtime/routes/run-routes.ts +43 -0
- package/src/runtime/run-orchestrator.ts +64 -7
- package/src/security/oauth-callback-registry.ts +10 -0
- package/src/security/oauth2.ts +41 -7
- package/src/subagent/manager.ts +3 -1
- package/src/tools/tasks/work-item-run.ts +78 -0
- package/src/util/platform.ts +1 -1
- package/src/work-items/work-item-runner.ts +171 -0
- package/src/__tests__/handlers-twilio-config.test.ts +0 -221
- package/src/calls/__tests__/twilio-webhook-urls.test.ts +0 -162
- package/src/calls/twilio-webhook-urls.ts +0 -47
|
@@ -8,10 +8,11 @@ const testDir = mkdtempSync(join(tmpdir(), 'handlers-twitter-auth-test-'));
|
|
|
8
8
|
|
|
9
9
|
// Track loadRawConfig / saveRawConfig calls
|
|
10
10
|
let rawConfigStore: Record<string, unknown> = {};
|
|
11
|
+
let mockIngressPublicBaseUrl: string | undefined = 'https://test.example.com';
|
|
11
12
|
|
|
12
13
|
mock.module('../config/loader.js', () => ({
|
|
13
14
|
getConfig: () => ({}),
|
|
14
|
-
loadConfig: () => ({}),
|
|
15
|
+
loadConfig: () => ({ ingress: { publicBaseUrl: mockIngressPublicBaseUrl } }),
|
|
15
16
|
loadRawConfig: () => ({ ...rawConfigStore }),
|
|
16
17
|
saveRawConfig: (cfg: Record<string, unknown>) => {
|
|
17
18
|
rawConfigStore = { ...cfg };
|
|
@@ -20,6 +21,19 @@ mock.module('../config/loader.js', () => ({
|
|
|
20
21
|
invalidateConfigCache: () => {},
|
|
21
22
|
}));
|
|
22
23
|
|
|
24
|
+
// Stand-in for the public ingress URL helpers: both exports read
// `ingress.publicBaseUrl` from the supplied config and throw when it is unset.
mock.module('../inbound/public-ingress-urls.js', () => {
  type IngressConfig = { ingress?: { publicBaseUrl?: string } };

  /** Return the configured base URL or throw the same error for either helper. */
  const requireBaseUrl = (config: IngressConfig): string => {
    const baseUrl = config?.ingress?.publicBaseUrl;
    if (!baseUrl) {
      throw new Error('No public base URL configured.');
    }
    return baseUrl;
  };

  return {
    getPublicBaseUrl: (config: IngressConfig) => requireBaseUrl(config),
    getOAuthCallbackUrl: (config: IngressConfig) =>
      `${requireBaseUrl(config)}/webhooks/oauth/callback`,
  };
});
|
|
36
|
+
|
|
23
37
|
mock.module('../util/platform.js', () => ({
|
|
24
38
|
getRootDir: () => testDir,
|
|
25
39
|
getDataDir: () => testDir,
|
|
@@ -77,9 +91,15 @@ mock.module('../security/secure-keys.js', () => ({
|
|
|
77
91
|
// Mock OAuth2 flow
|
|
78
92
|
let oauthFlowResult: unknown = null;
|
|
79
93
|
let oauthFlowError: Error | null = null;
|
|
94
|
+
let lastOAuthFlowOptions: Record<string, unknown> | undefined;
|
|
80
95
|
|
|
81
96
|
mock.module('../security/oauth2.js', () => ({
|
|
82
|
-
startOAuth2Flow: async (
|
|
97
|
+
startOAuth2Flow: async (
|
|
98
|
+
_config: unknown,
|
|
99
|
+
callbacks: { openUrl: (url: string) => void },
|
|
100
|
+
options?: Record<string, unknown>,
|
|
101
|
+
) => {
|
|
102
|
+
lastOAuthFlowOptions = options;
|
|
83
103
|
// Trigger the openUrl callback so tests can verify the open_url message is sent
|
|
84
104
|
callbacks.openUrl('https://twitter.com/i/oauth2/authorize?test=1');
|
|
85
105
|
if (oauthFlowError) throw oauthFlowError;
|
|
@@ -163,6 +183,8 @@ describe('Twitter auth handler', () => {
|
|
|
163
183
|
oauthFlowResult = null;
|
|
164
184
|
oauthFlowError = null;
|
|
165
185
|
lastUpsertPolicy = undefined;
|
|
186
|
+
lastOAuthFlowOptions = undefined;
|
|
187
|
+
mockIngressPublicBaseUrl = 'https://test.example.com';
|
|
166
188
|
// Mock fetch for Twitter API
|
|
167
189
|
globalThis.fetch = (async (_url: string | URL | Request) => {
|
|
168
190
|
return mockFetchResponse;
|
|
@@ -267,6 +289,69 @@ describe('Twitter auth handler', () => {
|
|
|
267
289
|
expect(meta!.accountInfo).toBe('@testuser');
|
|
268
290
|
});
|
|
269
291
|
|
|
292
|
+
test('passes callbackTransport: gateway to startOAuth2Flow', async () => {
  // Arrange: local BYO mode, a stored client id, and a successful OAuth result.
  secureKeyStore['credential:integration:twitter:oauth_client_id'] = 'test-client-id';
  rawConfigStore = { twitterIntegrationMode: 'local_byo' };
  oauthFlowResult = {
    tokens: {
      accessToken: 'mock-access-token',
      refreshToken: 'mock-refresh-token',
      expiresIn: 7200,
      scope: 'tweet.read users.read offline.access',
      tokenType: 'bearer',
    },
    grantedScopes: ['tweet.read', 'users.read', 'offline.access'],
    rawTokenResponse: {},
  };

  // Act: dispatch the auth-start request, then wait briefly before asserting.
  const { ctx } = createTestContext();
  const request: TwitterAuthStartRequest = { type: 'twitter_auth_start' };
  await handleMessage(request, {} as net.Socket, ctx);
  await new Promise((resolve) => setTimeout(resolve, 50));

  // Assert: the mocked startOAuth2Flow recorded options requesting the gateway transport.
  expect(lastOAuthFlowOptions).toBeDefined();
  expect(lastOAuthFlowOptions!.callbackTransport).toBe('gateway');
});
|
|
318
|
+
|
|
319
|
+
test('fails fast with actionable error when no ingress URL is configured', async () => {
  // Arrange: credentials are present but the mocked config has no public base URL.
  secureKeyStore['credential:integration:twitter:oauth_client_id'] = 'test-client-id';
  rawConfigStore = { twitterIntegrationMode: 'local_byo' };
  mockIngressPublicBaseUrl = undefined;

  // This result must never be consumed; the handler should bail out before OAuth starts.
  oauthFlowResult = {
    tokens: { accessToken: 'should-not-reach', refreshToken: undefined },
    grantedScopes: [],
    rawTokenResponse: {},
  };

  // Act
  const { ctx, sent } = createTestContext();
  const request: TwitterAuthStartRequest = { type: 'twitter_auth_start' };
  await handleMessage(request, {} as net.Socket, ctx);
  await new Promise((resolve) => setTimeout(resolve, 50));

  // Assert: no browser URL was emitted.
  expect(sent.find((m) => m.type === 'open_url')).toBeUndefined();

  // Assert: a failure result names the config key, the env var, and the callback path.
  type AuthResult = { type: string; success: boolean; error?: string };
  const outcome = sent.find((m) => m.type === 'twitter_auth_result') as AuthResult;
  expect(outcome).toBeDefined();
  expect(outcome.success).toBe(false);
  for (const fragment of ['ingress.publicBaseUrl', 'INGRESS_PUBLIC_BASE_URL', '/webhooks/oauth/callback']) {
    expect(outcome.error).toContain(fragment);
  }

  // Assert: the mocked startOAuth2Flow was never invoked.
  expect(lastOAuthFlowOptions).toBeUndefined();
});
|
|
354
|
+
|
|
270
355
|
describe('auth hardening', () => {
|
|
271
356
|
test('OAuth cancel path returns sanitized failure', async () => {
|
|
272
357
|
rawConfigStore = { twitterIntegrationMode: 'local_byo' };
|
package/src/calls/call-domain.ts
CHANGED
|
@@ -20,7 +20,8 @@ import { getCallOrchestrator, unregisterCallOrchestrator } from './call-state.js
|
|
|
20
20
|
import { activeRelayConnections } from './relay-server.js';
|
|
21
21
|
import { TwilioConversationRelayProvider } from './twilio-provider.js';
|
|
22
22
|
import { getTwilioConfig } from './twilio-config.js';
|
|
23
|
-
import {
|
|
23
|
+
import { getTwilioVoiceWebhookUrl, getTwilioStatusCallbackUrl } from '../inbound/public-ingress-urls.js';
|
|
24
|
+
import { loadConfig } from '../config/loader.js';
|
|
24
25
|
import type { CallSession } from './types.js';
|
|
25
26
|
|
|
26
27
|
const log = getLogger('call-domain');
|
|
@@ -89,13 +90,14 @@ export async function startCall(input: StartCallInput): Promise<StartCallResult
|
|
|
89
90
|
let sessionId: string | null = null;
|
|
90
91
|
|
|
91
92
|
try {
|
|
92
|
-
const
|
|
93
|
+
const twilioConfig = getTwilioConfig();
|
|
94
|
+
const ingressConfig = loadConfig();
|
|
93
95
|
const provider = new TwilioConversationRelayProvider();
|
|
94
96
|
|
|
95
97
|
const session = createCallSession({
|
|
96
98
|
conversationId,
|
|
97
99
|
provider: 'twilio',
|
|
98
|
-
fromNumber:
|
|
100
|
+
fromNumber: twilioConfig.phoneNumber,
|
|
99
101
|
toNumber: phoneNumber,
|
|
100
102
|
task: callContext ? `${task}\n\nContext: ${callContext}` : task,
|
|
101
103
|
});
|
|
@@ -104,10 +106,10 @@ export async function startCall(input: StartCallInput): Promise<StartCallResult
|
|
|
104
106
|
log.info({ callSessionId: session.id, to: phoneNumber, task }, 'Initiating outbound call');
|
|
105
107
|
|
|
106
108
|
const { callSid } = await provider.initiateCall({
|
|
107
|
-
from:
|
|
109
|
+
from: twilioConfig.phoneNumber,
|
|
108
110
|
to: phoneNumber,
|
|
109
|
-
webhookUrl:
|
|
110
|
-
statusCallbackUrl:
|
|
111
|
+
webhookUrl: getTwilioVoiceWebhookUrl(ingressConfig, session.id),
|
|
112
|
+
statusCallbackUrl: getTwilioStatusCallbackUrl(ingressConfig),
|
|
111
113
|
});
|
|
112
114
|
|
|
113
115
|
updateCallSession(session.id, { providerCallSid: callSid });
|
|
@@ -1,8 +1,7 @@
|
|
|
1
1
|
import { getSecureKey } from '../security/secure-keys.js';
|
|
2
2
|
import { getLogger } from '../util/logger.js';
|
|
3
3
|
import { loadConfig } from '../config/loader.js';
|
|
4
|
-
import {
|
|
5
|
-
import { getTwilioRelayUrl } from '../inbound/public-ingress-urls.js';
|
|
4
|
+
import { getPublicBaseUrl, getTwilioRelayUrl } from '../inbound/public-ingress-urls.js';
|
|
6
5
|
|
|
7
6
|
const log = getLogger('twilio-config');
|
|
8
7
|
|
|
@@ -19,7 +18,7 @@ export function getTwilioConfig(): TwilioConfig {
|
|
|
19
18
|
const authToken = getSecureKey('credential:twilio:auth_token');
|
|
20
19
|
const phoneNumber = process.env.TWILIO_PHONE_NUMBER || getSecureKey('credential:twilio:phone_number') || '';
|
|
21
20
|
const config = loadConfig();
|
|
22
|
-
const webhookBaseUrl =
|
|
21
|
+
const webhookBaseUrl = getPublicBaseUrl(config);
|
|
23
22
|
|
|
24
23
|
// In gateway_only mode, ignore TWILIO_WSS_BASE_URL and always use the
|
|
25
24
|
// centralized relay URL derived from the public ingress base URL.
|
|
@@ -251,6 +251,31 @@
|
|
|
251
251
|
},
|
|
252
252
|
"executor": "tools/task-list-remove.ts",
|
|
253
253
|
"execution_target": "host"
|
|
254
|
+
},
|
|
255
|
+
{
|
|
256
|
+
"name": "task_queue_run",
|
|
257
|
+
"description": "Run a task from the Task Queue in the background. Use this when the user says \"run this task\", \"execute this task\", \"start this task\", or wants to kick off a queued work item. The task runs asynchronously — the user can continue chatting while it executes. Required tool permissions are auto-approved since the user is explicitly requesting execution.",
|
|
258
|
+
"category": "tasks",
|
|
259
|
+
"risk": "medium",
|
|
260
|
+
"input_schema": {
|
|
261
|
+
"type": "object",
|
|
262
|
+
"properties": {
|
|
263
|
+
"work_item_id": {
|
|
264
|
+
"type": "string",
|
|
265
|
+
"description": "Direct work item ID (most precise selector)"
|
|
266
|
+
},
|
|
267
|
+
"task_name": {
|
|
268
|
+
"type": "string",
|
|
269
|
+
"description": "Task name/title to search for (case-insensitive substring match)"
|
|
270
|
+
},
|
|
271
|
+
"title": {
|
|
272
|
+
"type": "string",
|
|
273
|
+
"description": "Work item title to search for (case-insensitive substring match)"
|
|
274
|
+
}
|
|
275
|
+
}
|
|
276
|
+
},
|
|
277
|
+
"executor": "tools/task-queue-run.ts",
|
|
278
|
+
"execution_target": "host"
|
|
254
279
|
}
|
|
255
280
|
]
|
|
256
281
|
}
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
import type { ToolContext, ToolExecutionResult } from '../../../../tools/types.js';
|
|
2
|
+
import { executeTaskQueueRun } from '../../../../tools/tasks/work-item-run.js';
|
|
3
|
+
|
|
4
|
+
/**
 * Bundled-skill entry point for the `task_queue_run` tool.
 *
 * Forwards the raw tool input and execution context to `executeTaskQueueRun`
 * and returns its result unchanged.
 */
export async function run(
  input: Record<string, unknown>,
  context: ToolContext,
): Promise<ToolExecutionResult> {
  const outcome = await executeTaskQueueRun(input, context);
  return outcome;
}
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: "Transcribe"
|
|
3
|
+
description: "Transcribe audio and video files using Whisper (cloud API or local)"
|
|
4
|
+
metadata: {"vellum": {"emoji": "🎙️"}}
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
Transcribe audio and video files using OpenAI's Whisper model — either via the cloud API or locally via whisper.cpp.
|
|
8
|
+
|
|
9
|
+
## Choosing a Mode
|
|
10
|
+
|
|
11
|
+
Before transcribing, **ask the user which mode they prefer** if they haven't specified:
|
|
12
|
+
|
|
13
|
+
1. **`api`** — Uses the OpenAI Whisper API. Fast, accurate, no setup needed. Requires an OpenAI API key (check if one is already configured). Audio is sent to OpenAI's servers. Costs ~$0.006/min.
|
|
14
|
+
2. **`local`** — Uses whisper.cpp installed via Homebrew. Free, private, runs entirely on-device. Requires a one-time `brew install whisper-cpp`. Slightly slower but no data leaves the machine.
|
|
15
|
+
|
|
16
|
+
If the user says "cloud", "API", or "online" → use `api`.
|
|
17
|
+
If the user says "local", "offline", "private", or "on-device" → use `local`.
|
|
18
|
+
|
|
19
|
+
## Usage Notes
|
|
20
|
+
|
|
21
|
+
- The tool accepts either a `file_path` (absolute path to a local file) or an `attachment_id` (for uploaded attachments). Prefer `file_path` when the user references a file on disk.
|
|
22
|
+
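- Supported formats for `file_path`: video (mp4, mov, avi, mkv, webm, m4v, mpeg, mpg) and audio (mp3, wav, m4a, aac, ogg, flac, aiff, wma) files, matched by file extension. Attachments are accepted when their MIME type is `video/*` or `audio/*`.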
|
|
23
|
+
- For video files, audio is automatically extracted via ffmpeg before transcription.
|
|
24
|
+
- The API mode has a 25MB per-request limit — large files are automatically split into chunks.
|
|
25
|
+
- Local mode requires whisper.cpp (`brew install whisper-cpp`). The model is downloaded automatically on first use.
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
{
|
|
2
|
+
"version": 1,
|
|
3
|
+
"tools": [
|
|
4
|
+
{
|
|
5
|
+
"name": "transcribe_media",
|
|
6
|
+
"description": "Transcribe an audio or video file using Whisper. Provide either a file_path to a local file or an attachment_id for an uploaded attachment. Set mode to 'api' (OpenAI cloud) or 'local' (whisper.cpp on-device). Ask the user which mode they prefer before calling.",
|
|
7
|
+
"category": "transcribe",
|
|
8
|
+
"risk": "low",
|
|
9
|
+
"input_schema": {
|
|
10
|
+
"type": "object",
|
|
11
|
+
"properties": {
|
|
12
|
+
"file_path": {
|
|
13
|
+
"type": "string",
|
|
14
|
+
"description": "Absolute path to a local audio or video file to transcribe"
|
|
15
|
+
},
|
|
16
|
+
"attachment_id": {
|
|
17
|
+
"type": "string",
|
|
18
|
+
"description": "The ID of an attached audio or video file to transcribe"
|
|
19
|
+
},
|
|
20
|
+
"mode": {
|
|
21
|
+
"type": "string",
|
|
22
|
+
"enum": ["api", "local"],
|
|
23
|
+
"description": "Transcription backend: 'api' for OpenAI Whisper API (cloud), 'local' for whisper.cpp (on-device)"
|
|
24
|
+
}
|
|
25
|
+
},
|
|
26
|
+
"required": ["mode"]
|
|
27
|
+
},
|
|
28
|
+
"executor": "tools/transcribe-media.ts",
|
|
29
|
+
"execution_target": "host"
|
|
30
|
+
}
|
|
31
|
+
]
|
|
32
|
+
}
|
|
@@ -0,0 +1,370 @@
|
|
|
1
|
+
import { tmpdir } from 'node:os';
|
|
2
|
+
import { join, extname } from 'node:path';
|
|
3
|
+
import { writeFile, unlink, access, readFile, mkdir, readdir } from 'node:fs/promises';
|
|
4
|
+
import { randomUUID } from 'node:crypto';
|
|
5
|
+
import type { ToolContext, ToolExecutionResult } from '../../../../tools/types.js';
|
|
6
|
+
import { getAttachmentsByIds } from '../../../../memory/attachments-store.js';
|
|
7
|
+
import { getConfig } from '../../../../config/loader.js';
|
|
8
|
+
|
|
9
|
+
const VIDEO_EXTENSIONS = new Set(['.mp4', '.mov', '.avi', '.mkv', '.webm', '.m4v', '.mpeg', '.mpg']);
|
|
10
|
+
const AUDIO_EXTENSIONS = new Set(['.mp3', '.wav', '.m4a', '.aac', '.ogg', '.flac', '.aiff', '.wma']);
|
|
11
|
+
|
|
12
|
+
/** Timeout for ffmpeg operations. */
|
|
13
|
+
const FFMPEG_TIMEOUT_MS = 120_000;
|
|
14
|
+
|
|
15
|
+
/** Max file size for a single OpenAI Whisper API request (25MB). */
|
|
16
|
+
const WHISPER_API_MAX_BYTES = 25 * 1024 * 1024;
|
|
17
|
+
|
|
18
|
+
/** Duration per chunk when splitting for the API (10 minutes — about 19MB as 16kHz mono 16-bit WAV, under the 25MB limit). */
|
|
19
|
+
const API_CHUNK_DURATION_SECS = 600;
|
|
20
|
+
|
|
21
|
+
/** Timeout for a single Whisper API request. */
|
|
22
|
+
const API_REQUEST_TIMEOUT_MS = 300_000;
|
|
23
|
+
|
|
24
|
+
/** Timeout for a single whisper.cpp chunk transcription. */
|
|
25
|
+
const LOCAL_CHUNK_TIMEOUT_MS = 600_000;
|
|
26
|
+
|
|
27
|
+
// ---------------------------------------------------------------------------
|
|
28
|
+
// Helpers
|
|
29
|
+
// ---------------------------------------------------------------------------
|
|
30
|
+
|
|
31
|
+
/**
 * Run a command, capture its output, and enforce a wall-clock timeout.
 *
 * stdout and stderr are drained while the process runs rather than after it
 * exits, so a child that writes a lot of output is not left waiting on an
 * unread pipe. The result is raced against the timeout, so the returned
 * promise always settles, including when reading the output streams fails.
 *
 * @param cmd - Executable followed by its arguments.
 * @param timeoutMs - Milliseconds to wait before killing the process.
 * @returns The exit code plus the full stdout and stderr text.
 * @throws Error if the process cannot be spawned, if reading its output fails,
 *   or if it does not finish within `timeoutMs` (the process is killed first).
 */
async function spawnWithTimeout(
  cmd: string[],
  timeoutMs: number,
): Promise<{ exitCode: number; stdout: string; stderr: string }> {
  const proc = Bun.spawn(cmd, { stdout: 'pipe', stderr: 'pipe' });

  let timer: ReturnType<typeof setTimeout> | undefined;
  const timedOut = new Promise<never>((_, reject) => {
    timer = setTimeout(() => {
      proc.kill();
      reject(new Error(`Process timed out after ${timeoutMs}ms: ${cmd[0]}`));
    }, timeoutMs);
  });

  // Collect exit status and both streams concurrently.
  const completed = Promise.all([
    proc.exited,
    new Response(proc.stdout).text(),
    new Response(proc.stderr).text(),
  ]).then(([exitCode, stdout, stderr]) => ({ exitCode, stdout, stderr }));

  try {
    return await Promise.race([completed, timedOut]);
  } finally {
    // Always release the timer so it cannot fire (or keep the loop alive) later.
    clearTimeout(timer);
  }
}
|
|
49
|
+
|
|
50
|
+
/**
 * Ask ffprobe for the container duration of a media file.
 *
 * @param audioPath - Path of the file to probe.
 * @returns Duration in seconds, or 0 when ffprobe exits non-zero or its
 *   output cannot be parsed as a number.
 */
async function getAudioDuration(audioPath: string): Promise<number> {
  const probeArgs = [
    'ffprobe', '-v', 'error',
    '-show_entries', 'format=duration',
    '-of', 'csv=p=0',
    audioPath,
  ];
  const { exitCode, stdout } = await spawnWithTimeout(probeArgs, 10_000);
  if (exitCode !== 0) {
    return 0;
  }

  const seconds = parseFloat(stdout.trim());
  // NaN (unparseable output) collapses to 0, same as a failed probe.
  return seconds ? seconds : 0;
}
|
|
60
|
+
|
|
61
|
+
/**
 * Split an audio file into fixed-length 16kHz mono 16-bit PCM WAV segments
 * using ffmpeg's segment muxer.
 *
 * @param audioPath - Source audio file.
 * @param chunkDir - Existing directory that receives `chunk-NNN.wav` files.
 * @param chunkDurationSecs - Target length of each segment in seconds.
 * @returns Absolute chunk paths in name order.
 * @throws Error when ffmpeg exits with a non-zero status.
 */
async function splitAudio(
  audioPath: string,
  chunkDir: string,
  chunkDurationSecs: number,
): Promise<string[]> {
  const segmentArgs = [
    'ffmpeg', '-y',
    '-i', audioPath,
    '-f', 'segment',
    '-segment_time', String(chunkDurationSecs),
    '-acodec', 'pcm_s16le',
    '-ar', '16000',
    '-ac', '1',
    join(chunkDir, 'chunk-%03d.wav'),
  ];

  const { exitCode, stderr } = await spawnWithTimeout(segmentArgs, FFMPEG_TIMEOUT_MS);
  if (exitCode !== 0) {
    throw new Error(`Failed to split audio: ${stderr.slice(0, 300)}`);
  }

  // Zero-padded names mean a plain lexicographic sort yields playback order.
  const chunkPaths: string[] = [];
  const entries = (await readdir(chunkDir)).sort();
  for (const entry of entries) {
    if (entry.startsWith('chunk-') && entry.endsWith('.wav')) {
      chunkPaths.push(join(chunkDir, entry));
    }
  }
  return chunkPaths;
}
|
|
86
|
+
|
|
87
|
+
// ---------------------------------------------------------------------------
|
|
88
|
+
// Source resolution
|
|
89
|
+
// ---------------------------------------------------------------------------
|
|
90
|
+
|
|
91
|
+
/**
 * Turn the tool input into a concrete media file on disk.
 *
 * `file_path` takes precedence over `attachment_id`. A path is checked for
 * existence and for a known video/audio extension. An attachment is checked
 * by MIME type and written to a temp file, which the caller must delete.
 *
 * @param input - Raw tool input; reads `file_path` and `attachment_id`.
 * @returns The resolved source (`tempFile` is non-null only for attachments),
 *   or an error `ToolExecutionResult` describing why nothing could be resolved.
 */
async function resolveSource(
  input: Record<string, unknown>,
): Promise<{ inputPath: string; isVideo: boolean; tempFile: string | null } | ToolExecutionResult> {
  const filePath = input.file_path as string | undefined;
  const attachmentId = input.attachment_id as string | undefined;

  // --- Local file on disk -------------------------------------------------
  if (filePath) {
    try {
      await access(filePath);
    } catch {
      return { content: `File not found: ${filePath}`, isError: true };
    }

    const ext = extname(filePath).toLowerCase();
    const isVideo = VIDEO_EXTENSIONS.has(ext);
    if (!isVideo && !AUDIO_EXTENSIONS.has(ext)) {
      return { content: `Unsupported file type: ${ext}. Only video and audio files can be transcribed.`, isError: true };
    }
    return { inputPath: filePath, isVideo, tempFile: null };
  }

  // --- Uploaded attachment ------------------------------------------------
  if (attachmentId) {
    const attachments = getAttachmentsByIds([attachmentId]);
    if (!attachments.length) {
      return { content: `Attachment not found: ${attachmentId}`, isError: true };
    }

    const attachment = attachments[0];
    const mime = attachment.mimeType;
    const isVideoAttachment = mime.startsWith('video/');
    if (!isVideoAttachment && !mime.startsWith('audio/')) {
      return { content: `Unsupported file type: ${mime}. Only video and audio files can be transcribed.`, isError: true };
    }

    // The extension is only a placeholder chosen from the MIME family.
    const ext = isVideoAttachment ? '.mp4' : '.m4a';
    const tempPath = join(tmpdir(), `vellum-transcribe-in-${randomUUID()}${ext}`);
    await writeFile(tempPath, Buffer.from(attachment.dataBase64, 'base64'));
    return { inputPath: tempPath, isVideo: isVideoAttachment, tempFile: tempPath };
  }

  return { content: 'Provide either file_path or attachment_id.', isError: true };
}
|
|
128
|
+
|
|
129
|
+
/**
 * Convert source to 16kHz mono 16-bit PCM WAV for consistent processing.
 *
 * The output goes to a uniquely named file in the OS temp directory. On
 * success the caller owns that file and must delete it. On failure (ffmpeg
 * error, timeout, spawn error) any partial output is removed here, because
 * the caller never receives the path.
 *
 * @param inputPath - Audio or video file to convert.
 * @param isVideo - When true, `-vn` is passed so the video stream is dropped.
 * @returns Path of the generated WAV file.
 * @throws Error when ffmpeg cannot be run, times out, or exits non-zero.
 */
async function toWav(inputPath: string, isVideo: boolean): Promise<string> {
  const wavPath = join(tmpdir(), `vellum-transcribe-${randomUUID()}.wav`);
  const args = ['ffmpeg', '-y', '-i', inputPath];
  if (isVideo) args.push('-vn');
  args.push('-acodec', 'pcm_s16le', '-ar', '16000', '-ac', '1', wavPath);

  try {
    const result = await spawnWithTimeout(args, FFMPEG_TIMEOUT_MS);
    if (result.exitCode !== 0) {
      throw new Error(`ffmpeg failed: ${result.stderr.slice(0, 500)}`);
    }
    return wavPath;
  } catch (err) {
    // Best-effort cleanup of a partially written file; the original error wins.
    await unlink(wavPath).catch(() => {});
    throw err;
  }
}
|
|
141
|
+
|
|
142
|
+
// ---------------------------------------------------------------------------
|
|
143
|
+
// API mode — OpenAI Whisper API
|
|
144
|
+
// ---------------------------------------------------------------------------
|
|
145
|
+
|
|
146
|
+
/**
 * Transcribe a WAV file with the OpenAI Whisper API.
 *
 * Files at or below the per-request size limit are sent in one request.
 * Larger files are split into fixed-length chunks that are transcribed
 * sequentially and joined with spaces.
 *
 * @param audioPath - WAV file to transcribe.
 * @param apiKey - OpenAI API key sent as a bearer token.
 * @param context - Tool context; `onOutput` receives progress text and
 *   `signal` is checked between chunks for cancellation.
 * @returns The transcript text (may be empty).
 * @throws Error on API failure, split failure, or cancellation ('Cancelled').
 */
async function transcribeViaApi(
  audioPath: string,
  apiKey: string,
  context: ToolContext,
): Promise<string> {
  // Small enough for one request. Duration is not needed on this path, so
  // ffprobe is not run (and cannot fail the request).
  if (Bun.file(audioPath).size <= WHISPER_API_MAX_BYTES) {
    return whisperApiRequest(audioPath, apiKey);
  }

  // Duration is only used for the progress message below.
  const duration = await getAudioDuration(audioPath);

  // Split into chunks for large files
  const chunkDir = join(tmpdir(), `vellum-transcribe-api-chunks-${randomUUID()}`);
  await mkdir(chunkDir, { recursive: true });

  try {
    context.onOutput?.(`Large file (${Math.round(duration / 60)}min) — splitting into chunks...\n`);
    const chunks = await splitAudio(audioPath, chunkDir, API_CHUNK_DURATION_SECS);
    const parts: string[] = [];

    for (let i = 0; i < chunks.length; i++) {
      if (context.signal?.aborted) throw new Error('Cancelled');
      context.onOutput?.(` Transcribing chunk ${i + 1}/${chunks.length}...\n`);
      const text = await whisperApiRequest(chunks[i], apiKey);
      if (text) parts.push(text);
    }

    return parts.join(' ');
  } finally {
    // Remove the chunk directory whether transcription succeeded or not.
    const { rm } = await import('node:fs/promises');
    await rm(chunkDir, { recursive: true, force: true }).catch(() => {});
  }
}
|
|
181
|
+
|
|
182
|
+
/**
 * Upload one WAV file to OpenAI's transcription endpoint using `whisper-1`.
 *
 * @param audioPath - WAV file to upload; it is read fully into memory.
 * @param apiKey - OpenAI API key sent as a bearer token.
 * @returns The trimmed `text` field of the response, or '' when it is absent.
 * @throws Error carrying the HTTP status and up to 300 characters of the
 *   response body when the API responds with a non-2xx status.
 */
async function whisperApiRequest(audioPath: string, apiKey: string): Promise<string> {
  const wavBytes = await readFile(audioPath);
  const upload = new Blob([wavBytes], { type: 'audio/wav' });

  const form = new FormData();
  form.append('file', upload, 'audio.wav');
  form.append('model', 'whisper-1');

  const res = await fetch('https://api.openai.com/v1/audio/transcriptions', {
    method: 'POST',
    headers: { 'Authorization': `Bearer ${apiKey}` },
    body: form,
    signal: AbortSignal.timeout(API_REQUEST_TIMEOUT_MS),
  });

  if (res.ok) {
    const payload = await res.json() as { text?: string };
    return payload.text?.trim() ?? '';
  }

  // The body is only for diagnostics; tolerate it being unreadable.
  const detail = await res.text().catch(() => '');
  throw new Error(`Whisper API error (${res.status}): ${detail.slice(0, 300)}`);
}
|
|
203
|
+
|
|
204
|
+
// ---------------------------------------------------------------------------
|
|
205
|
+
// Local mode — whisper.cpp
|
|
206
|
+
// ---------------------------------------------------------------------------
|
|
207
|
+
|
|
208
|
+
/**
 * Transcribe a WAV file on-device with the `whisper-cpp` executable.
 *
 * Audio with a known duration of at most 30 minutes is transcribed in one
 * pass. Anything longer, or audio whose duration probe returned 0, is split
 * into 10-minute chunks that are transcribed sequentially so progress can be
 * reported.
 *
 * @param audioPath - WAV file to transcribe.
 * @param context - Tool context; `onOutput` receives progress text and
 *   `signal` is checked between chunks for cancellation.
 * @returns The transcript text (may be empty).
 * @throws Error when `whisper-cpp` is not on PATH, when the model cannot be
 *   resolved, when transcription fails, or on cancellation ('Cancelled').
 */
async function transcribeViaLocal(
  audioPath: string,
  context: ToolContext,
): Promise<string> {
  // Bail out early with install instructions if the binary is not on PATH.
  const { exitCode: whichExit } = await spawnWithTimeout(['which', 'whisper-cpp'], 5_000);
  if (whichExit !== 0) {
    throw new Error(
      'whisper-cpp is not installed. Install it with: brew install whisper-cpp'
    );
  }

  // Locate the base model, downloading it on first use.
  const modelPath = await resolveWhisperModel(context);
  const duration = await getAudioDuration(audioPath);

  // A duration of 0 means the probe failed, so it does not qualify for the
  // single-pass path and falls through to chunking.
  const fitsSinglePass = duration > 0 && duration <= 1800;
  if (fitsSinglePass) {
    context.onOutput?.(`Transcribing ${Math.round(duration / 60)}min of audio locally...\n`);
    return await whisperCppRun(audioPath, modelPath);
  }

  // Long (or unmeasured) audio: work through 10-minute chunks.
  const chunkDir = join(tmpdir(), `vellum-transcribe-local-chunks-${randomUUID()}`);
  await mkdir(chunkDir, { recursive: true });

  try {
    context.onOutput?.(`Large file (${Math.round(duration / 60)}min) — splitting into chunks...\n`);
    const chunks = await splitAudio(audioPath, chunkDir, 600);
    const transcripts: string[] = [];

    for (const [index, chunkPath] of chunks.entries()) {
      if (context.signal?.aborted) {
        throw new Error('Cancelled');
      }
      context.onOutput?.(` Transcribing chunk ${index + 1}/${chunks.length}...\n`);
      const chunkText = await whisperCppRun(chunkPath, modelPath);
      if (chunkText) {
        transcripts.push(chunkText);
      }
    }

    return transcripts.join(' ');
  } finally {
    // Remove the chunk directory whether transcription succeeded or not.
    const { rm } = await import('node:fs/promises');
    await rm(chunkDir, { recursive: true, force: true }).catch(() => {});
  }
}
|
|
253
|
+
|
|
254
|
+
/**
 * Locate a whisper.cpp base model, downloading `ggml-base.en.bin` if none of
 * the known locations has one.
 *
 * The download is written to a temporary sibling file and renamed into place
 * only after the write completes. An interrupted or failed download therefore
 * never leaves a truncated file at the final path, where the `access()` probe
 * on a later run would accept it as a valid model.
 *
 * @param context - Tool context; `onOutput` receives download progress text.
 * @returns Path to a model file.
 * @throws Error when the download responds with a non-2xx status or an empty
 *   body, or when the model cannot be written.
 */
async function resolveWhisperModel(context: ToolContext): Promise<string> {
  // Check common locations for the base model
  const homeDir = process.env.HOME ?? '/tmp';
  const candidates = [
    join(homeDir, '.vellum', 'models', 'ggml-base.en.bin'),
    join(homeDir, '.vellum', 'models', 'ggml-base.bin'),
    '/usr/local/share/whisper-cpp/models/ggml-base.en.bin',
    '/opt/homebrew/share/whisper-cpp/models/ggml-base.en.bin',
  ];

  for (const p of candidates) {
    try { await access(p); return p; } catch { /* next */ }
  }

  // Download the base.en model (~140MB)
  const modelDir = join(homeDir, '.vellum', 'models');
  await mkdir(modelDir, { recursive: true });
  const modelPath = join(modelDir, 'ggml-base.en.bin');
  const modelUrl = 'https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin';

  context.onOutput?.('Downloading Whisper base.en model (~140MB)...\n');

  const response = await fetch(modelUrl);
  if (!response.ok) {
    throw new Error(`Failed to download model: ${response.status}`);
  }

  const data = Buffer.from(await response.arrayBuffer());
  if (data.length === 0) {
    throw new Error('Failed to download model: empty response body');
  }

  // Write to a unique temp name in the same directory, then rename over the
  // final path. The final path only ever holds a completely written file.
  const { rename, rm } = await import('node:fs/promises');
  const partialPath = `${modelPath}.${randomUUID()}.part`;
  try {
    await writeFile(partialPath, data);
    await rename(partialPath, modelPath);
  } catch (err) {
    await rm(partialPath, { force: true }).catch(() => {});
    throw err;
  }
  context.onOutput?.('Model downloaded.\n');

  return modelPath;
}
|
|
287
|
+
|
|
288
|
+
/**
 * Run `whisper-cpp` on one audio file and return the transcript as a single
 * line of text.
 *
 * @param audioPath - WAV file to transcribe.
 * @param modelPath - Path to the ggml model file passed via `-m`.
 * @returns stdout with blank lines dropped and the remaining lines trimmed
 *   and joined by single spaces.
 * @throws Error with up to 300 characters of stderr when the process exits
 *   non-zero.
 */
async function whisperCppRun(audioPath: string, modelPath: string): Promise<string> {
  const { exitCode, stdout, stderr } = await spawnWithTimeout(
    ['whisper-cpp', '-m', modelPath, '-f', audioPath, '--no-timestamps'],
    LOCAL_CHUNK_TIMEOUT_MS,
  );

  if (exitCode !== 0) {
    throw new Error(`whisper-cpp failed: ${stderr.slice(0, 300)}`);
  }

  // Only stdout is treated as transcript text; stderr is used solely for the
  // failure message above. Collapse the output into one space-separated line.
  const pieces: string[] = [];
  for (const rawLine of stdout.split('\n')) {
    const line = rawLine.trim();
    if (line.length > 0) {
      pieces.push(line);
    }
  }
  return pieces.join(' ').trim();
}
|
|
309
|
+
|
|
310
|
+
// ---------------------------------------------------------------------------
|
|
311
|
+
// Main entry point
|
|
312
|
+
// ---------------------------------------------------------------------------
|
|
313
|
+
|
|
314
|
+
/**
 * Entry point for the `transcribe_media` tool.
 *
 * Validates `mode`, resolves the media source (`file_path` or
 * `attachment_id`), converts it to WAV, and transcribes it through either the
 * OpenAI Whisper API or local whisper.cpp. Temp files created along the way
 * are removed before returning.
 *
 * @param input - Raw tool input; reads `mode`, `file_path`, `attachment_id`.
 * @param context - Tool context used for progress output and cancellation.
 * @returns The transcript, a "no speech" notice, or an error result. Failures
 *   are reported via `isError: true` rather than thrown.
 */
export async function run(
  input: Record<string, unknown>,
  context: ToolContext,
): Promise<ToolExecutionResult> {
  const mode = input.mode;
  if (mode !== 'api' && mode !== 'local') {
    return {
      content: "Please specify mode: 'api' (OpenAI cloud) or 'local' (whisper.cpp on-device). Ask the user which they prefer.",
      isError: true,
    };
  }

  // Read the API key once, up front, so the value that is validated is the
  // same value that is used later (no second config read, no `!` assertion).
  const apiKey = mode === 'api' ? getConfig().apiKeys.openai : undefined;
  if (mode === 'api' && !apiKey) {
    return {
      content: 'No OpenAI API key configured. Set your OpenAI API key to use cloud transcription, or use mode "local" for on-device transcription with whisper.cpp.',
      isError: true,
    };
  }

  const source = await resolveSource(input);
  if ('isError' in source) return source;

  const { inputPath, isVideo, tempFile } = source;
  let wavPath: string | null = null;

  try {
    // Convert to WAV
    wavPath = await toWav(inputPath, isVideo);

    // `apiKey` is truthy exactly when mode is 'api' (guarded above).
    const text = apiKey
      ? await transcribeViaApi(wavPath, apiKey, context)
      : await transcribeViaLocal(wavPath, context);

    if (!text.trim()) {
      return { content: 'No speech detected in the audio.', isError: false };
    }

    return { content: text, isError: false };
  } catch (err) {
    // Non-Error throwables have no `.message`; stringify them instead.
    const reason = err instanceof Error ? err.message : String(err);
    return {
      content: `Transcription failed: ${reason}`,
      isError: true,
    };
  } finally {
    if (tempFile) { try { await unlink(tempFile); } catch { /* ignore */ } }
    if (wavPath) { try { await unlink(wavPath); } catch { /* ignore */ } }
  }
}
|
package/src/config/defaults.ts
CHANGED
|
@@ -217,7 +217,6 @@ export const DEFAULT_CONFIG: AssistantConfig = {
|
|
|
217
217
|
calls: {
|
|
218
218
|
enabled: true,
|
|
219
219
|
provider: 'twilio' as const,
|
|
220
|
-
webhookBaseUrl: '',
|
|
221
220
|
maxDurationSeconds: 3600,
|
|
222
221
|
userConsultTimeoutSeconds: 120,
|
|
223
222
|
disclosure: {
|
|
@@ -230,6 +229,6 @@ export const DEFAULT_CONFIG: AssistantConfig = {
|
|
|
230
229
|
},
|
|
231
230
|
ingress: {
|
|
232
231
|
publicBaseUrl: '',
|
|
233
|
-
mode: '
|
|
232
|
+
mode: 'gateway_only' as const,
|
|
234
233
|
},
|
|
235
234
|
};
|