@vellumai/assistant 0.3.13 → 0.3.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. package/ARCHITECTURE.md +17 -3
  2. package/README.md +2 -0
  3. package/docs/architecture/scheduling.md +81 -0
  4. package/package.json +1 -1
  5. package/src/__tests__/__snapshots__/ipc-snapshot.test.ts.snap +22 -0
  6. package/src/__tests__/channel-policy.test.ts +19 -0
  7. package/src/__tests__/guardian-control-plane-policy.test.ts +584 -0
  8. package/src/__tests__/intent-routing.test.ts +22 -0
  9. package/src/__tests__/ipc-snapshot.test.ts +10 -0
  10. package/src/__tests__/notification-routing-intent.test.ts +186 -0
  11. package/src/__tests__/recording-handler.test.ts +191 -31
  12. package/src/__tests__/recording-intent-fallback.test.ts +181 -0
  13. package/src/__tests__/recording-intent-handler.test.ts +593 -73
  14. package/src/__tests__/recording-intent.test.ts +739 -343
  15. package/src/__tests__/recording-state-machine.test.ts +1109 -0
  16. package/src/__tests__/reminder-store.test.ts +20 -18
  17. package/src/__tests__/reminder.test.ts +2 -1
  18. package/src/channels/config.ts +1 -1
  19. package/src/config/bundled-skills/phone-calls/SKILL.md +1 -11
  20. package/src/config/bundled-skills/screen-recording/SKILL.md +91 -12
  21. package/src/config/system-prompt.ts +5 -0
  22. package/src/config/vellum-skills/guardian-verify-setup/SKILL.md +1 -0
  23. package/src/daemon/handlers/misc.ts +258 -102
  24. package/src/daemon/handlers/recording.ts +417 -5
  25. package/src/daemon/handlers/sessions.ts +136 -62
  26. package/src/daemon/ipc-contract/computer-use.ts +23 -3
  27. package/src/daemon/ipc-contract/messages.ts +3 -1
  28. package/src/daemon/ipc-contract/shared.ts +6 -0
  29. package/src/daemon/ipc-contract-inventory.json +2 -0
  30. package/src/daemon/lifecycle.ts +2 -0
  31. package/src/daemon/recording-executor.ts +180 -0
  32. package/src/daemon/recording-intent-fallback.ts +132 -0
  33. package/src/daemon/recording-intent.ts +306 -15
  34. package/src/daemon/session-tool-setup.ts +4 -0
  35. package/src/notifications/README.md +69 -1
  36. package/src/notifications/adapters/sms.ts +80 -0
  37. package/src/notifications/broadcaster.ts +1 -0
  38. package/src/notifications/copy-composer.ts +3 -3
  39. package/src/notifications/decision-engine.ts +70 -1
  40. package/src/notifications/decisions-store.ts +24 -0
  41. package/src/notifications/destination-resolver.ts +2 -1
  42. package/src/notifications/emit-signal.ts +35 -3
  43. package/src/notifications/signal.ts +6 -0
  44. package/src/notifications/types.ts +3 -0
  45. package/src/schedule/scheduler.ts +15 -3
  46. package/src/tools/executor.ts +29 -0
  47. package/src/tools/guardian-control-plane-policy.ts +141 -0
  48. package/src/tools/types.ts +2 -0
@@ -35,10 +35,12 @@ import type {
35
35
  UserMessage,
36
36
  } from '../ipc-protocol.js';
37
37
  import { normalizeThreadType } from '../ipc-protocol.js';
38
- import { classifyRecordingIntent, detectRecordingIntent, detectStopRecordingIntent, hasSubstantiveContent, isInterrogative, stripRecordingIntent, stripStopRecordingIntent } from '../recording-intent.js';
38
+ import { executeRecordingIntent } from '../recording-executor.js';
39
+ import { classifyRecordingIntentFallback, containsRecordingKeywords } from '../recording-intent-fallback.js';
40
+ import { resolveRecordingIntent } from '../recording-intent.js';
39
41
  import { buildSessionErrorMessage,classifySessionError } from '../session-error.js';
40
42
  import { generateVideoThumbnail } from '../video-thumbnail.js';
41
- import { handleRecordingStart, handleRecordingStop } from './recording.js';
43
+ import { handleRecordingPause, handleRecordingRestart, handleRecordingResume, handleRecordingStart, handleRecordingStop } from './recording.js';
42
44
  import {
43
45
  defineHandlers,
44
46
  type HandlerContext,
@@ -175,7 +177,7 @@ export async function handleUserMessage(
175
177
  };
176
178
 
177
179
  const config = getConfig();
178
- const messageText = msg.content ?? '';
180
+ let messageText = msg.content ?? '';
179
181
 
180
182
  // Block inbound messages that contain secrets and redirect to secure prompt
181
183
  if (!msg.bypassSecretCheck) {
@@ -227,85 +229,157 @@ export async function handleUserMessage(
227
229
  }
228
230
  }
229
231
 
232
+ // ── Structured command intent (bypasses text parsing) ──────────────────
233
+ if (config.daemon.standaloneRecording && msg.commandIntent?.domain === 'screen_recording') {
234
+ const action = msg.commandIntent.action;
235
+ rlog.info({ action, source: 'commandIntent' }, 'Recording command intent received in user_message');
236
+ if (action === 'start') {
237
+ const recordingId = handleRecordingStart(msg.sessionId, { promptForSource: true }, socket, ctx);
238
+ ctx.send(socket, {
239
+ type: 'assistant_text_delta',
240
+ text: recordingId ? 'Starting screen recording.' : 'A recording is already active.',
241
+ sessionId: msg.sessionId,
242
+ });
243
+ ctx.send(socket, { type: 'message_complete', sessionId: msg.sessionId });
244
+ return;
245
+ } else if (action === 'stop') {
246
+ const stopped = handleRecordingStop(msg.sessionId, ctx) !== undefined;
247
+ ctx.send(socket, {
248
+ type: 'assistant_text_delta',
249
+ text: stopped ? 'Stopping the recording.' : 'No active recording to stop.',
250
+ sessionId: msg.sessionId,
251
+ });
252
+ ctx.send(socket, { type: 'message_complete', sessionId: msg.sessionId });
253
+ return;
254
+ } else if (action === 'restart') {
255
+ const restartResult = handleRecordingRestart(msg.sessionId, socket, ctx);
256
+ ctx.send(socket, {
257
+ type: 'assistant_text_delta',
258
+ text: restartResult.responseText,
259
+ sessionId: msg.sessionId,
260
+ });
261
+ ctx.send(socket, { type: 'message_complete', sessionId: msg.sessionId });
262
+ return;
263
+ } else if (action === 'pause') {
264
+ const paused = handleRecordingPause(msg.sessionId, ctx) !== undefined;
265
+ ctx.send(socket, {
266
+ type: 'assistant_text_delta',
267
+ text: paused ? 'Pausing the recording.' : 'No active recording to pause.',
268
+ sessionId: msg.sessionId,
269
+ });
270
+ ctx.send(socket, { type: 'message_complete', sessionId: msg.sessionId });
271
+ return;
272
+ } else if (action === 'resume') {
273
+ const resumed = handleRecordingResume(msg.sessionId, ctx) !== undefined;
274
+ ctx.send(socket, {
275
+ type: 'assistant_text_delta',
276
+ text: resumed ? 'Resuming the recording.' : 'No active recording to resume.',
277
+ sessionId: msg.sessionId,
278
+ });
279
+ ctx.send(socket, { type: 'message_complete', sessionId: msg.sessionId });
280
+ return;
281
+ } else {
282
+ // Unrecognized action — fall through to normal text handling
283
+ rlog.warn({ action, source: 'commandIntent' }, 'Unrecognized screen_recording action, falling through to text handling');
284
+ }
285
+ }
286
+
230
287
  // ── Standalone recording intent interception ──────────────────────────
231
288
  if (config.daemon.standaloneRecording && messageText) {
232
289
  const name = getAssistantName();
233
290
  const dynamicNames = [name].filter(Boolean) as string[];
234
- const intentClass = classifyRecordingIntent(messageText, dynamicNames);
291
+ const intentResult = resolveRecordingIntent(messageText, dynamicNames);
292
+
293
+ if (intentResult.kind === 'start_only' || intentResult.kind === 'stop_only' ||
294
+ intentResult.kind === 'start_and_stop_only' ||
295
+ intentResult.kind === 'restart_only' || intentResult.kind === 'pause_only' ||
296
+ intentResult.kind === 'resume_only') {
297
+ const execResult = executeRecordingIntent(intentResult, {
298
+ conversationId: msg.sessionId,
299
+ socket,
300
+ ctx,
301
+ });
235
302
 
236
- switch (intentClass) {
237
- case 'stop_only': {
238
- const stopped = handleRecordingStop(msg.sessionId, ctx) !== undefined;
239
- rlog.info('Recording stop intent intercepted in user_message');
303
+ if (execResult.handled) {
304
+ rlog.info({ kind: intentResult.kind }, 'Recording intent intercepted in user_message');
240
305
  ctx.send(socket, {
241
306
  type: 'assistant_text_delta',
242
- text: stopped ? 'Stopping the recording.' : 'No active recording to stop.',
307
+ text: execResult.responseText!,
243
308
  sessionId: msg.sessionId,
244
309
  });
245
310
  ctx.send(socket, { type: 'message_complete', sessionId: msg.sessionId });
246
311
  return;
247
312
  }
248
- case 'start_only': {
249
- const recordingId = handleRecordingStart(msg.sessionId, { promptForSource: true }, socket, ctx);
250
- rlog.info('Recording-only intent intercepted in user_message');
251
-
252
- if (recordingId) {
253
- ctx.send(socket, { type: 'assistant_text_delta', text: 'Starting screen recording.', sessionId: msg.sessionId });
254
- } else {
255
- ctx.send(socket, { type: 'assistant_text_delta', text: 'A recording is already active.', sessionId: msg.sessionId });
256
- }
257
- ctx.send(socket, { type: 'message_complete', sessionId: msg.sessionId });
258
- return;
313
+ }
314
+
315
+ if (intentResult.kind === 'start_with_remainder' || intentResult.kind === 'stop_with_remainder' ||
316
+ intentResult.kind === 'start_and_stop_with_remainder' || intentResult.kind === 'restart_with_remainder') {
317
+ const execResult = executeRecordingIntent(intentResult, {
318
+ conversationId: msg.sessionId,
319
+ socket,
320
+ ctx,
321
+ });
322
+
323
+ // Continue with stripped text for downstream processing
324
+ msg.content = execResult.remainderText ?? messageText;
325
+ messageText = msg.content;
326
+
327
+ // Execute the recording side effects that executeRecordingIntent deferred
328
+ if (intentResult.kind === 'stop_with_remainder') {
329
+ handleRecordingStop(msg.sessionId, ctx);
330
+ }
331
+ if (intentResult.kind === 'start_with_remainder') {
332
+ handleRecordingStart(msg.sessionId, { promptForSource: true }, socket, ctx);
259
333
  }
260
- case 'mixed': {
261
- // Skip recording side effects for questions about recording
262
- // (e.g., "how do I stop recording?") — let the model answer instead.
263
- if (isInterrogative(messageText, dynamicNames)) {
264
- rlog.info('Mixed recording intent is interrogative skipping side effects');
265
- break;
334
+ // start_and_stop_with_remainder / restart_with_remainder — route through
335
+ // handleRecordingRestart which properly cleans up maps between stop and start.
336
+ if (intentResult.kind === 'restart_with_remainder' || intentResult.kind === 'start_and_stop_with_remainder') {
337
+ const restartResult = handleRecordingRestart(msg.sessionId, socket, ctx);
338
+ // Only fall back to plain start for start_and_stop_with_remainder.
339
+ // restart_with_remainder should NOT silently start a new recording when idle.
340
+ if (!restartResult.initiated && restartResult.reason === 'no_active_recording'
341
+ && intentResult.kind === 'start_and_stop_with_remainder') {
342
+ handleRecordingStart(msg.sessionId, { promptForSource: true }, socket, ctx);
266
343
  }
344
+ }
267
345
 
268
- // Mixed = recording intent embedded in broader text.
269
- // Handle the recording action, then check if remaining text is substantive.
270
- const hasStart = detectRecordingIntent(messageText);
271
- const hasStop = detectStopRecordingIntent(messageText);
346
+ rlog.info({ remaining: msg.content, kind: intentResult.kind }, 'Recording intent with remainder continuing with remaining text');
347
+ }
272
348
 
273
- if (hasStop) {
274
- handleRecordingStop(msg.sessionId, ctx);
275
- rlog.info('Mixed intent stopping recording');
276
- }
277
- const startResult = hasStart ? handleRecordingStart(msg.sessionId, { promptForSource: true }, socket, ctx) : null;
278
- if (hasStart) {
279
- rlog.info({ started: !!startResult }, 'Mixed intent starting recording');
280
- }
349
+ // 'none' — deterministic resolver found nothing; try LLM fallback
350
+ // if the text contains recording-related keywords.
351
+ if (intentResult.kind === 'none' && containsRecordingKeywords(messageText)) {
352
+ const fallback = await classifyRecordingIntentFallback(messageText);
353
+ rlog.info({ fallbackAction: fallback.action, fallbackConfidence: fallback.confidence }, 'Recording intent LLM fallback result');
354
+
355
+ if (fallback.action !== 'none' && fallback.confidence === 'high') {
356
+ const kindMap: Record<string, import('../recording-intent.js').RecordingIntentResult> = {
357
+ start: { kind: 'start_only' },
358
+ stop: { kind: 'stop_only' },
359
+ restart: { kind: 'restart_only' },
360
+ pause: { kind: 'pause_only' },
361
+ resume: { kind: 'resume_only' },
362
+ };
363
+ const mapped = kindMap[fallback.action];
364
+ if (mapped) {
365
+ const execResult = executeRecordingIntent(mapped, {
366
+ conversationId: msg.sessionId,
367
+ socket,
368
+ ctx,
369
+ });
281
370
 
282
- // Strip recording clauses from the message
283
- let remaining = messageText;
284
- if (hasStart) remaining = stripRecordingIntent(remaining);
285
- if (hasStop) remaining = stripStopRecordingIntent(remaining);
286
-
287
- // If nothing substantive remains (just fillers, names, punctuation), complete now
288
- if (!hasSubstantiveContent(remaining, dynamicNames)) {
289
- let text: string;
290
- if (hasStart && startResult) {
291
- text = hasStop ? 'Stopping current recording and starting a new one.' : 'Starting screen recording.';
292
- } else if (hasStart) {
293
- text = 'A recording is already active.';
294
- } else {
295
- text = 'Stopping the recording.';
371
+ if (execResult.handled) {
372
+ rlog.info({ kind: mapped.kind, source: 'llm_fallback' }, 'Recording intent intercepted via LLM fallback');
373
+ ctx.send(socket, {
374
+ type: 'assistant_text_delta',
375
+ text: execResult.responseText!,
376
+ sessionId: msg.sessionId,
377
+ });
378
+ ctx.send(socket, { type: 'message_complete', sessionId: msg.sessionId });
379
+ return;
296
380
  }
297
- ctx.send(socket, { type: 'assistant_text_delta', text, sessionId: msg.sessionId });
298
- ctx.send(socket, { type: 'message_complete', sessionId: msg.sessionId });
299
- return;
300
381
  }
301
-
302
- // Continue with stripped text for downstream processing
303
- msg.content = remaining;
304
- rlog.info({ remaining }, 'Mixed recording intent — recording handled, continuing with remaining text');
305
- break;
306
382
  }
307
- case 'none':
308
- break;
309
383
  }
310
384
  }
311
385
 
@@ -1,6 +1,6 @@
1
1
  // Computer use, task routing, ride shotgun, and watch observation types.
2
2
 
3
- import type { IpcBlobRef,UserMessageAttachment } from './shared.js';
3
+ import type { CommandIntent, IpcBlobRef,UserMessageAttachment } from './shared.js';
4
4
 
5
5
  // === Client → Server ===
6
6
 
@@ -53,6 +53,8 @@ export interface TaskSubmit {
53
53
  screenHeight: number;
54
54
  attachments?: UserMessageAttachment[];
55
55
  source?: 'voice' | 'text';
56
+ /** Structured command intent — bypasses text parsing when present. */
57
+ commandIntent?: CommandIntent;
56
58
  }
57
59
 
58
60
  export interface RideShotgunStart {
@@ -100,11 +102,13 @@ export interface RecordingOptions {
100
102
  export interface RecordingStatus {
101
103
  type: 'recording_status';
102
104
  sessionId: string; // matches recordingId from RecordingStart
103
- status: 'started' | 'stopped' | 'failed';
105
+ status: 'started' | 'stopped' | 'failed' | 'restart_cancelled' | 'paused' | 'resumed';
104
106
  filePath?: string; // on stop
105
107
  durationMs?: number; // on stop
106
108
  error?: string; // on failure
107
109
  attachToConversationId?: string;
110
+ /** Operation token for restart race hardening — matches the token from RecordingStart. */
111
+ operationToken?: string;
108
112
  }
109
113
 
110
114
  // === Server → Client ===
@@ -115,6 +119,8 @@ export interface RecordingStart {
115
119
  recordingId: string; // daemon-assigned UUID
116
120
  attachToConversationId?: string;
117
121
  options?: RecordingOptions;
122
+ /** Operation token for restart race hardening — stale completions with mismatched tokens are rejected. */
123
+ operationToken?: string;
118
124
  }
119
125
 
120
126
  /** Server → Client: stop a recording. */
@@ -123,6 +129,18 @@ export interface RecordingStop {
123
129
  recordingId: string; // matches RecordingStart.recordingId
124
130
  }
125
131
 
132
+ /** Server → Client: pause the active recording. */
133
+ export interface RecordingPause {
134
+ type: 'recording_pause';
135
+ recordingId: string;
136
+ }
137
+
138
+ /** Server → Client: resume a paused recording. */
139
+ export interface RecordingResume {
140
+ type: 'recording_resume';
141
+ recordingId: string;
142
+ }
143
+
126
144
  export interface CuAction {
127
145
  type: 'cu_action';
128
146
  sessionId: string;
@@ -211,4 +229,6 @@ export type _ComputerUseServerMessages =
211
229
  | WatchStarted
212
230
  | WatchCompleteRequest
213
231
  | RecordingStart
214
- | RecordingStop;
232
+ | RecordingStop
233
+ | RecordingPause
234
+ | RecordingResume;
@@ -1,7 +1,7 @@
1
1
  // User/assistant messages, tool results, confirmations, secrets, errors, and generation lifecycle.
2
2
 
3
3
  import type { ChannelId, InterfaceId } from '../../channels/types.js';
4
- import type { UserMessageAttachment } from './shared.js';
4
+ import type { CommandIntent, UserMessageAttachment } from './shared.js';
5
5
 
6
6
  // === Client → Server ===
7
7
 
@@ -19,6 +19,8 @@ export interface UserMessage {
19
19
  channel?: ChannelId;
20
20
  /** Originating interface identifier (e.g. 'macos'). */
21
21
  interface: InterfaceId;
22
+ /** Structured command intent — bypasses text parsing when present. */
23
+ commandIntent?: CommandIntent;
22
24
  }
23
25
 
24
26
  export interface ConfirmationResponse {
@@ -29,6 +29,12 @@ export interface DictationContext {
29
29
  cursorInTextField: boolean;
30
30
  }
31
31
 
32
+ /** Structured command intent — bypasses text parsing when present. */
33
+ export interface CommandIntent {
34
+ domain: 'screen_recording';
35
+ action: 'start' | 'stop' | 'restart' | 'pause' | 'resume';
36
+ }
37
+
32
38
  export interface UserMessageAttachment {
33
39
  id?: string;
34
40
  filename: string;
@@ -273,6 +273,8 @@
273
273
  "platform_config_response",
274
274
  "pong",
275
275
  "publish_page_response",
276
+ "recording_pause",
277
+ "recording_resume",
276
278
  "recording_start",
277
279
  "recording_stop",
278
280
  "reminders_list_response",
@@ -185,6 +185,8 @@ export async function runDaemon(): Promise<void> {
185
185
  label: reminder.label,
186
186
  message: reminder.message,
187
187
  },
188
+ routingIntent: reminder.routingIntent,
189
+ routingHints: reminder.routingHints,
188
190
  dedupeKey: `reminder:${reminder.id}`,
189
191
  });
190
192
  },
@@ -0,0 +1,180 @@
1
+ // Unified recording intent executor.
2
+ // Bridges the gap between recording-intent.ts (classification) and
3
+ // handlers/recording.ts (side effects), so both sessions.ts and misc.ts
4
+ // can share the same execution logic without duplicating switch/case blocks.
5
+
6
+ import type * as net from 'node:net';
7
+
8
+ import type { HandlerContext } from './handlers/shared.js';
9
+ import {
10
+ handleRecordingPause,
11
+ handleRecordingRestart,
12
+ handleRecordingResume,
13
+ handleRecordingStart,
14
+ handleRecordingStop,
15
+ isRecordingIdle,
16
+ } from './handlers/recording.js';
17
+ import type { RecordingIntentResult } from './recording-intent.js';
18
+
19
/**
 * Per-message inputs the executor forwards to the recording handlers.
 */
export interface RecordingExecutionContext {
  /** Conversation identifier passed as the first argument of every recording handler call. */
  conversationId: string;
  /** Client socket; handed to the start and restart handlers. */
  socket: net.Socket;
  /** Handler context forwarded unchanged to every recording handler. */
  ctx: HandlerContext;
}
24
+
25
/**
 * Outcome reported by `executeRecordingIntent`.
 *
 * The `*_only` kinds are executed immediately and come back with
 * `handled: true`. The `*_with_remainder` kinds are NOT executed by the
 * executor: it only returns the stripped text plus one `pending*` flag, and
 * the caller is responsible for triggering the side effect.
 */
export interface RecordingExecutionOutput {
  /**
   * True when the executor ran the recording action itself (every `*_only`
   * kind); the caller should send `responseText` and finish the turn.
   * False for `none` and for every `*_with_remainder` kind.
   */
  handled: boolean;
  /** User-facing sentence describing the outcome; populated on the handled branches. */
  responseText?: string;
  /** For `*_with_remainder` kinds: the message text left after the recording clause was stripped. */
  remainderText?: string;
  /**
   * Set for `start_with_remainder`, and for `start_and_stop_with_remainder`
   * when no recording is active: the caller still has to start a recording.
   */
  pendingStart?: boolean;
  /** Set for `stop_with_remainder`: the caller still has to stop the recording. */
  pendingStop?: boolean;
  /**
   * Set for `restart_with_remainder`, and for `start_and_stop_with_remainder`
   * when a recording is active: the caller still has to restart.
   */
  pendingRestart?: boolean;
  /**
   * Whether a new recording was actually initiated (true) or rejected (false).
   * Only set for `start_only` and `start_and_stop_only`.
   */
  recordingStarted?: boolean;
}
41
+
42
/**
 * Carry out (or defer) the recording action implied by a classified intent.
 *
 * - `none`: nothing happens; `handled` is false.
 * - `*_only` kinds: the matching recording handler is invoked right away and
 *   the result carries `handled: true` plus the sentence to show the user.
 * - `*_with_remainder` kinds: no handler is invoked here. The result carries
 *   the stripped remainder text and a `pending*` flag so the caller can run
 *   the side effect and keep processing the remaining text.
 *
 * @param result - Classified intent to execute.
 * @param context - Conversation id, socket and handler context to forward.
 * @returns What was done, or what is still pending for the caller.
 */
export function executeRecordingIntent(
  result: RecordingIntentResult,
  context: RecordingExecutionContext,
): RecordingExecutionOutput {
  // Start a new recording and describe whether the handler accepted it.
  const beginRecording = (): RecordingExecutionOutput => {
    const newRecordingId = handleRecordingStart(
      context.conversationId,
      { promptForSource: true },
      context.socket,
      context.ctx,
    );
    if (newRecordingId) {
      return { handled: true, recordingStarted: true, responseText: 'Starting screen recording.' };
    }
    return { handled: true, recordingStarted: false, responseText: 'A recording is already active.' };
  };

  // Build the reply for handlers whose only outcome is "did it / nothing to do".
  const acknowledge = (
    succeeded: boolean,
    successText: string,
    idleText: string,
  ): RecordingExecutionOutput => ({
    handled: true,
    responseText: succeeded ? successText : idleText,
  });

  const restartRecording = () =>
    handleRecordingRestart(context.conversationId, context.socket, context.ctx);

  switch (result.kind) {
    case 'none':
      return { handled: false };

    case 'start_only':
      return beginRecording();

    case 'stop_only':
      return acknowledge(
        handleRecordingStop(context.conversationId, context.ctx) !== undefined,
        'Stopping the recording.',
        'No active recording to stop.',
      );

    case 'start_with_remainder':
      return { handled: false, remainderText: result.remainder, pendingStart: true };

    case 'stop_with_remainder':
      return { handled: false, remainderText: result.remainder, pendingStop: true };

    case 'start_and_stop_only': {
      // Go through the restart handler so the stop/start pair is sequenced by
      // it, rather than tripping the "already active" guard of a plain start.
      const outcome = restartRecording();

      // Nothing was recording: the "stop" half is a no-op, so just start.
      // Any other refusal (e.g. a restart already in progress) must not
      // trigger a second recording.
      if (!outcome.initiated && outcome.reason === 'no_active_recording') {
        return beginRecording();
      }

      let responseText = outcome.responseText;
      if (outcome.initiated) {
        responseText = 'Stopping current recording and starting a new one.';
      }
      return { handled: true, recordingStarted: outcome.initiated, responseText };
    }

    case 'start_and_stop_with_remainder': {
      // With nothing recording the stop half is a no-op, so ask the caller
      // for a plain start; otherwise ask for a restart.
      if (isRecordingIdle()) {
        return { handled: false, remainderText: result.remainder, pendingStart: true };
      }
      return { handled: false, remainderText: result.remainder, pendingRestart: true };
    }

    case 'restart_only': {
      const outcome = restartRecording();
      return { handled: true, responseText: outcome.responseText };
    }

    case 'restart_with_remainder':
      return { handled: false, remainderText: result.remainder, pendingRestart: true };

    case 'pause_only':
      return acknowledge(
        handleRecordingPause(context.conversationId, context.ctx) !== undefined,
        'Pausing the recording.',
        'No active recording to pause.',
      );

    case 'resume_only':
      return acknowledge(
        handleRecordingResume(context.conversationId, context.ctx) !== undefined,
        'Resuming the recording.',
        'No active recording to resume.',
      );
  }
}
@@ -0,0 +1,132 @@
1
+ // LLM-based fallback classifier for recording intent detection.
2
+ // Fires only when the deterministic resolver returns `none` but the text
3
+ // contains recording-related keywords that suggest an intent the regex missed.
4
+ // Safety: returns `{ action: 'none', confidence: 'low' }` on any failure —
5
+ // never triggers a recording action on error.
6
+
7
+ import { createTimeout, extractText, getConfiguredProvider, userMessage } from '../providers/provider-send-message.js';
8
+ import { getLogger } from '../util/logger.js';
9
+
10
+ const log = getLogger('recording-intent-fallback');
11
+
12
/** Millisecond budget handed to createTimeout for the classification request. */
const FALLBACK_TIMEOUT_MS = 5000;

/** Action labels the classifier is allowed to return; `none` means "take no action". */
const FALLBACK_ACTION_LABELS = ['start', 'stop', 'restart', 'pause', 'resume', 'none'] as const;

/** Confidence labels the classifier is allowed to return. */
const FALLBACK_CONFIDENCE_LABELS = ['high', 'medium', 'low'] as const;

export type RecordingFallbackAction = (typeof FALLBACK_ACTION_LABELS)[number];

/** Validated result of the fallback classification. */
export interface RecordingFallbackResult {
  action: RecordingFallbackAction;
  confidence: (typeof FALLBACK_CONFIDENCE_LABELS)[number];
}

/** Returned whenever classification cannot be trusted: take no action, low confidence. */
const SAFE_DEFAULT: RecordingFallbackResult = { action: 'none', confidence: 'low' };

/** Keywords that gate whether we spend an LLM call on fallback classification. */
const RECORDING_KEYWORDS = [
  'record', 'recording',
  'screen capture', 'screencast',
  'capture screen', 'capture my screen',
  'screen rec',
];

/** System prompt for the classifier; sentences are joined with single spaces. */
const SYSTEM_PROMPT = [
  'You are classifying user messages for a screen recording assistant.',
  'Determine if the user wants to: start a recording, stop a recording, restart a recording,',
  'pause a recording, resume a recording, or none of these.',
  'Only classify as an action if the user is giving an imperative command.',
  'Questions about recording (e.g., "how do I record?", "what does recording do?") should be classified as "none".',
  'Respond with a JSON object: {"action": "start|stop|restart|pause|resume|none", "confidence": "high|medium|low"}',
].join(' ');

/** Lookup sets used to validate the fields of the model's reply. */
const VALID_ACTIONS = new Set<RecordingFallbackAction>(FALLBACK_ACTION_LABELS);
const VALID_CONFIDENCES = new Set<string>(FALLBACK_CONFIDENCE_LABELS);
44
+
45
/**
 * Cheap pre-filter for the LLM fallback.
 *
 * @param text - User message to inspect.
 * @returns True when the lower-cased text contains at least one entry of
 *   RECORDING_KEYWORDS as a substring, i.e. when it is worth spending an LLM
 *   call on classification.
 */
export function containsRecordingKeywords(text: string): boolean {
  const haystack = text.toLowerCase();
  for (const keyword of RECORDING_KEYWORDS) {
    if (haystack.includes(keyword)) {
      return true;
    }
  }
  return false;
}
53
+
54
/**
 * Ask the configured LLM provider whether `text` is a recording command that
 * the deterministic resolver missed.
 *
 * The request sends the text as a single user message with SYSTEM_PROMPT, no
 * tools, a latency-optimized model intent, a 64-token cap and the abort
 * signal obtained from createTimeout(FALLBACK_TIMEOUT_MS). The reply text is
 * validated by parseClassificationResponse.
 *
 * @param text - Raw user message to classify.
 * @returns The validated classification. SAFE_DEFAULT
 *   (`{ action: 'none', confidence: 'low' }`) is returned instead when no
 *   provider is configured, when the request throws (the error is logged at
 *   warn level), or when the reply fails validation, so a failure never
 *   yields an actionable result.
 */
export async function classifyRecordingIntentFallback(
  text: string,
): Promise<RecordingFallbackResult> {
  const provider = getConfiguredProvider();
  if (!provider) {
    log.debug('No configured provider available for fallback classification');
    return SAFE_DEFAULT;
  }

  try {
    const { signal, cleanup } = createTimeout(FALLBACK_TIMEOUT_MS);
    try {
      const response = await provider.sendMessage(
        [userMessage(text)],
        [], // no tools
        SYSTEM_PROMPT,
        {
          config: {
            modelIntent: 'latency-optimized',
            max_tokens: 64,
          },
          signal,
        },
      );

      return parseClassificationResponse(extractText(response));
    } finally {
      // Single release point for the timeout: `finally` runs on success,
      // on provider failure and on abort, so no earlier call is needed.
      cleanup();
    }
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    log.warn({ err: message }, 'LLM fallback classification failed');
    return SAFE_DEFAULT;
  }
}
98
+
99
/**
 * Turn the model's raw reply into a validated RecordingFallbackResult.
 *
 * @param raw - Text extracted from the provider response.
 * @returns The parsed `{ action, confidence }` pair when both fields hold
 *   allowed labels; SAFE_DEFAULT when no JSON object is found, when parsing
 *   throws, or when either field is missing or not an allowed label.
 */
function parseClassificationResponse(raw: string): RecordingFallbackResult {
  try {
    // The model may wrap the JSON in prose, so take the first `{ ... }` span
    // that has no `}` inside it (the expected object is flat).
    const braceSpan = /\{[^}]*\}/.exec(raw);
    if (braceSpan === null) {
      log.debug({ raw }, 'No JSON object found in LLM fallback response');
      return SAFE_DEFAULT;
    }

    const { action: rawAction, confidence } = JSON.parse(braceSpan[0]) as {
      action?: string;
      confidence?: string;
    };
    const action = rawAction as RecordingFallbackAction | undefined;

    // Validate the action first, then the confidence; each failure is logged
    // with the offending value.
    if (!(action && VALID_ACTIONS.has(action))) {
      log.debug({ raw, action }, 'Invalid action in LLM fallback response');
      return SAFE_DEFAULT;
    }

    if (!(confidence && VALID_CONFIDENCES.has(confidence))) {
      log.debug({ raw, confidence }, 'Invalid confidence in LLM fallback response');
      return SAFE_DEFAULT;
    }

    return { action, confidence: confidence as RecordingFallbackResult['confidence'] };
  } catch (err) {
    log.debug({ err, raw }, 'Failed to parse LLM fallback response as JSON');
    return SAFE_DEFAULT;
  }
}