@vellumai/assistant 0.4.20 → 0.4.22

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,13 +1,38 @@
- import * as net from 'node:net';
-
- import { createTimeout, extractToolUse, getConfiguredProvider, userMessage } from '../../providers/provider-send-message.js';
- import { resolveProfile } from '../dictation-profile-store.js';
- import { applyDictionary, expandSnippets } from '../dictation-text-processing.js';
- import type { DictationRequest } from '../ipc-protocol.js';
- import { defineHandlers, type HandlerContext, log } from './shared.js';
+ import * as net from "node:net";
+
+ import {
+ createTimeout,
+ extractToolUse,
+ getConfiguredProvider,
+ userMessage,
+ } from "../../providers/provider-send-message.js";
+ import {
+ type ProfileResolution,
+ resolveProfile,
+ } from "../dictation-profile-store.js";
+ import {
+ applyDictionary,
+ expandSnippets,
+ } from "../dictation-text-processing.js";
+ import type { DictationRequest } from "../ipc-protocol.js";
+ import { defineHandlers, type HandlerContext, log } from "./shared.js";

  // Action verbs for fast heuristic fallback (used when LLM classifier is unavailable)
- const ACTION_VERBS = ['slack', 'email', 'send', 'create', 'open', 'search', 'find', 'message', 'text', 'schedule', 'remind', 'launch', 'navigate'];
+ const ACTION_VERBS = [
+ "slack",
+ "email",
+ "send",
+ "create",
+ "open",
+ "search",
+ "find",
+ "message",
+ "text",
+ "schedule",
+ "remind",
+ "launch",
+ "navigate",
+ ];

  const DICTATION_CLASSIFICATION_TIMEOUT_MS = 5000;

@@ -15,9 +40,9 @@ const MAX_WINDOW_TITLE_LENGTH = 100;

  /** Sanitize window title to mitigate prompt injection from attacker-controlled titles (e.g. browser tabs, Slack conversations). */
  function sanitizeWindowTitle(title: string | undefined): string {
- if (!title) return '';
+ if (!title) return "";
  return title
- .replace(/[<>]/g, '') // strip angle brackets to prevent tag injection
+ .replace(/[<>]/g, "") // strip angle brackets to prevent tag injection
  .slice(0, MAX_WINDOW_TITLE_LENGTH);
  }

@@ -25,233 +50,170 @@ function sanitizeWindowTitle(title: string | undefined): string {
  function buildAppMetadataBlock(msg: DictationRequest): string {
  const windowTitle = sanitizeWindowTitle(msg.context.windowTitle);
  return [
- '<app_metadata>',
+ "<app_metadata>",
  `App: ${msg.context.appName} (${msg.context.bundleIdentifier})`,
  `Window: ${windowTitle}`,
- '</app_metadata>',
- ].join('\n');
+ "</app_metadata>",
+ ].join("\n");
  }

- type DictationMode = 'dictation' | 'command' | 'action';
+ type DictationMode = "dictation" | "command" | "action";

  /** Fast heuristic fallback — used when LLM classifier is unavailable or fails. */
- export function detectDictationModeHeuristic(msg: DictationRequest): DictationMode {
+ export function detectDictationModeHeuristic(
+ msg: DictationRequest,
+ ): DictationMode {
  // Command mode: selected text present — treat transcription as a transformation instruction
  if (msg.context.selectedText && msg.context.selectedText.trim().length > 0) {
- return 'command';
+ return "command";
  }

  // Action mode: transcription starts with an action verb
- const firstWord = msg.transcription.trim().split(/\s+/)[0]?.toLowerCase() ?? '';
+ const firstWord =
+ msg.transcription.trim().split(/\s+/)[0]?.toLowerCase() ?? "";
  if (ACTION_VERBS.includes(firstWord)) {
- return 'action';
+ return "action";
  }

  // Dictation mode: cursor is in a text field with no selection — clean up for typing
  if (msg.context.cursorInTextField) {
- return 'dictation';
- }
-
- return 'dictation';
- }
-
- /** Classify dictation mode using Haiku, falling back to heuristic. */
- export async function detectDictationMode(msg: DictationRequest): Promise<DictationMode> {
- // Command mode is deterministic — no need for LLM
- if (msg.context.selectedText && msg.context.selectedText.trim().length > 0) {
- return 'command';
+ return "dictation";
  }

- const provider = getConfiguredProvider();
- if (!provider) {
- log.warn('No provider for dictation classification, using heuristic');
- return detectDictationModeHeuristic(msg);
- }
-
- try {
- const { signal, cleanup } = createTimeout(DICTATION_CLASSIFICATION_TIMEOUT_MS);
- try {
- const contextInfo = [
- `App: ${msg.context.appName} (${msg.context.bundleIdentifier})`,
- msg.context.windowTitle ? `Window: ${msg.context.windowTitle}` : '',
- `Cursor in text field: ${msg.context.cursorInTextField ? 'yes' : 'no'}`,
- ].filter(Boolean).join('\n');
-
- const response = await provider.sendMessage(
- [userMessage(`Transcription: "${msg.transcription}"\n\nContext:\n${contextInfo}`)],
- [{
- name: 'classify_dictation',
- description: 'Classify whether voice input is dictation or an action command',
- input_schema: {
- type: 'object' as const,
- properties: {
- mode: {
- type: 'string',
- enum: ['dictation', 'action'],
- description: 'dictation = user wants text inserted/cleaned up for typing. action = user wants the assistant to perform a task (send a message, open an app, search, navigate, control something).',
- },
- reasoning: {
- type: 'string',
- description: 'Brief reasoning for the classification',
- },
- },
- required: ['mode', 'reasoning'],
- },
- }],
- [
- 'You classify voice transcriptions as either "dictation" (text to insert) or "action" (task for an assistant to execute).',
- '',
- 'DICTATION examples: "Hey how are you doing", "I think we should move forward with the proposal", "Dear team comma please review the attached document"',
- 'ACTION examples: "Message Aaron on Slack saying hey what\'s up", "Send an email to the team about the meeting", "Open Spotify and play my playlist", "Search for flights to Denver", "Create a new document in Google Docs"',
- '',
- 'Key signals for ACTION: the user is addressing an assistant and asking it to DO something (send, message, open, search, create, schedule, etc.)',
- 'Key signals for DICTATION: the user is composing text content that should be typed out as-is',
- '',
- 'Context is provided — if the cursor is in a text field, lean toward dictation unless the intent to command is clear.',
- ].join('\n'),
- {
- config: {
- modelIntent: 'latency-optimized',
- max_tokens: 128,
- tool_choice: { type: 'tool' as const, name: 'classify_dictation' },
- },
- signal,
- },
- );
- cleanup();
-
- const toolBlock = extractToolUse(response);
- if (toolBlock) {
- const input = toolBlock.input as { mode?: string; reasoning?: string };
- const mode = input.mode === 'action' ? 'action' : 'dictation';
- log.info({ mode, reasoning: input.reasoning }, 'LLM dictation classification');
- return mode;
- }
-
- log.warn('No tool_use block in dictation classification, using heuristic');
- return detectDictationModeHeuristic(msg);
- } finally {
- cleanup();
- }
- } catch (err) {
- const message = err instanceof Error ? err.message : String(err);
- log.warn({ err: message }, 'LLM dictation classification failed, using heuristic');
- return detectDictationModeHeuristic(msg);
- }
+ return "dictation";
  }

- function buildDictationPrompt(msg: DictationRequest, stylePrompt?: string): string {
+ /** Build a combined system prompt that classifies AND cleans dictation in a single LLM call. */
+ function buildCombinedDictationPrompt(
+ msg: DictationRequest,
+ stylePrompt?: string,
+ ): string {
  const sections = [
- 'You are a dictation assistant. Clean up the following speech transcription for direct insertion into a text field.',
- '',
- '## Rules',
- '- Fix grammar, punctuation, and capitalization',
- '- Remove filler words (um, uh, like, you know)',
+ "You are a voice input assistant. You will receive a speech transcription and must:",
+ '1. Classify it as "dictation" (text to insert) or "action" (task for an assistant to execute)',
+ "2. If dictation, clean up the text. If action, return the raw transcription.",
+ "",
+ "## Classification",
+ 'DICTATION examples: "Hey how are you doing", "I think we should move forward with the proposal", "Dear team comma please review the attached document"',
+ 'ACTION examples: "Message Aaron on Slack saying hey what\'s up", "Send an email to the team about the meeting", "Open Spotify and play my playlist", "Search for flights to Denver", "Create a new document in Google Docs"',
+ "",
+ "Key signals for ACTION: the user is addressing an assistant and asking it to DO something (send, message, open, search, create, schedule, etc.)",
+ "Key signals for DICTATION: the user is composing text content that should be typed out as-is",
+ `Cursor in text field: ${msg.context.cursorInTextField ? "yes" : "no"} — if yes, lean toward dictation unless the intent to command is clear.`,
+ "",
+ "## Cleanup Rules (for dictation mode only)",
+ "- Fix grammar, punctuation, and capitalization",
+ "- Remove filler words (um, uh, like, you know)",
  '- Rewrite vague or hedging language ("so yeah probably", "I guess maybe") into clear, confident statements',
  "- Maintain the speaker's intent and meaning",
- '- Do NOT add explanations or commentary',
- '- Return ONLY the cleaned text, nothing else',
  ];

  if (stylePrompt) {
  sections.push(
- '',
- '## User Style (HIGHEST PRIORITY)',
- 'The user has configured these style preferences. They OVERRIDE the default tone adaptation below.',
- 'Follow these instructions precisely — they reflect the user\'s personal writing voice and preferences.',
- '',
+ "",
+ "## User Style (HIGHEST PRIORITY)",
+ "The user has configured these style preferences. They OVERRIDE the default tone adaptation below.",
+ "Follow these instructions precisely — they reflect the user's personal writing voice and preferences.",
+ "",
  stylePrompt,
  );
  }

- sections.push(
- '',
- '## Tone Adaptation',
- );
+ sections.push("", "## Tone Adaptation");

  if (stylePrompt) {
- sections.push('Use these as fallback guidance only when the User Style above does not cover a specific aspect:');
+ sections.push(
+ "Use these as fallback guidance only when the User Style above does not cover a specific aspect:",
+ );
  } else {
- sections.push('Adapt your output tone based on the active application:');
+ sections.push("Adapt your output tone based on the active application:");
  }

  sections.push(
- '- Email apps (Gmail, Mail): Professional but warm. Use proper greetings and sign-offs if appropriate.',
- '- Slack: Casual and conversational. Match typical chat style.',
- '- Code editors (VS Code, Xcode): Technical and concise. Code comments style.',
- '- Terminal: Command-like, terse.',
- '- Messages/iMessage: Very casual, texting style. Short sentences.',
- '- Notes/Docs: Neutral, clear writing.',
- '- Default: Match the user\'s natural voice.',
- '',
- '## Context Clues',
- '- Window title may contain recipient name (Slack DMs, email compose)',
- '- If you can identify a recipient, adapt formality to the apparent relationship',
- '- Maintain the user\'s natural voice — don\'t over-formalize casual speech',
- '- The user\'s writing patterns and preferences may be available from memory context — follow those when present',
- '',
+ "- Email apps (Gmail, Mail): Professional but warm. Use proper greetings and sign-offs if appropriate.",
+ "- Slack: Casual and conversational. Match typical chat style.",
+ "- Code editors (VS Code, Xcode): Technical and concise. Code comments style.",
+ "- Terminal: Command-like, terse.",
+ "- Messages/iMessage: Very casual, texting style. Short sentences.",
+ "- Notes/Docs: Neutral, clear writing.",
+ "- Default: Match the user's natural voice.",
+ "",
+ "## Context Clues",
+ "- Window title may contain recipient name (Slack DMs, email compose)",
+ "- If you can identify a recipient, adapt formality to the apparent relationship",
+ "- Maintain the user's natural voice — don't over-formalize casual speech",
+ "- The user's writing patterns and preferences may be available from memory context — follow those when present",
+ "",
  buildAppMetadataBlock(msg),
  );

- return sections.join('\n');
+ return sections.join("\n");
  }

- function buildCommandPrompt(msg: DictationRequest, stylePrompt?: string): string {
+ function buildCommandPrompt(
+ msg: DictationRequest,
+ stylePrompt?: string,
+ ): string {
  const sections = [
- 'You are a text transformation assistant. The user has selected text and given a voice command to transform it.',
- '',
- '## Rules',
- '- Apply the instruction to the selected text',
- '- Return ONLY the transformed text, nothing else',
- '- Do NOT add explanations or commentary',
+ "You are a text transformation assistant. The user has selected text and given a voice command to transform it.",
+ "",
+ "## Rules",
+ "- Apply the instruction to the selected text",
+ "- Return ONLY the transformed text, nothing else",
+ "- Do NOT add explanations or commentary",
  ];

  if (stylePrompt) {
  sections.push(
- '',
- '## User Style (HIGHEST PRIORITY)',
- 'The user has configured these style preferences. They OVERRIDE the default tone adaptation below.',
- 'Follow these instructions precisely — they reflect the user\'s personal writing voice and preferences.',
- '',
+ "",
+ "## User Style (HIGHEST PRIORITY)",
+ "The user has configured these style preferences. They OVERRIDE the default tone adaptation below.",
+ "Follow these instructions precisely — they reflect the user's personal writing voice and preferences.",
+ "",
  stylePrompt,
  );
  }

- sections.push(
- '',
- '## Tone Adaptation',
- );
+ sections.push("", "## Tone Adaptation");

  if (stylePrompt) {
- sections.push('Use these as fallback guidance only when the User Style above does not cover a specific aspect:');
+ sections.push(
+ "Use these as fallback guidance only when the User Style above does not cover a specific aspect:",
+ );
  } else {
- sections.push('Match the tone to the active application context:');
+ sections.push("Match the tone to the active application context:");
  }

  sections.push(
- '- Email apps (Gmail, Mail): Professional but warm.',
- '- Slack: Casual and conversational.',
- '- Code editors (VS Code, Xcode): Technical and concise.',
- '- Terminal: Command-like, terse.',
- '- Messages/iMessage: Very casual, texting style.',
- '- Notes/Docs: Neutral, clear writing.',
- '- Default: Match the user\'s natural voice.',
- '',
- '## Context Clues',
- '- Window title may contain recipient name (Slack DMs, email compose)',
- '- If you can identify a recipient, adapt formality to the apparent relationship',
- '- Maintain the user\'s natural voice — don\'t over-formalize casual speech',
- '- The user\'s writing patterns and preferences may be available from memory context — follow those when present',
- '',
+ "- Email apps (Gmail, Mail): Professional but warm.",
+ "- Slack: Casual and conversational.",
+ "- Code editors (VS Code, Xcode): Technical and concise.",
+ "- Terminal: Command-like, terse.",
+ "- Messages/iMessage: Very casual, texting style.",
+ "- Notes/Docs: Neutral, clear writing.",
+ "- Default: Match the user's natural voice.",
+ "",
+ "## Context Clues",
+ "- Window title may contain recipient name (Slack DMs, email compose)",
+ "- If you can identify a recipient, adapt formality to the apparent relationship",
+ "- Maintain the user's natural voice — don't over-formalize casual speech",
+ "- The user's writing patterns and preferences may be available from memory context — follow those when present",
+ "",
  buildAppMetadataBlock(msg),
- '',
- 'Selected text:',
- msg.context.selectedText ?? '',
- '',
+ "",
+ "Selected text:",
+ msg.context.selectedText ?? "",
+ "",
  `Instruction: ${msg.transcription}`,
  );

- return sections.join('\n');
+ return sections.join("\n");
+ }
+
+ /** Compute dynamic max_tokens based on input length to avoid waste and truncation. */
+ function computeMaxTokens(inputLength: number): number {
+ const estimatedInputTokens = Math.ceil(inputLength / 3);
+ return Math.max(256, estimatedInputTokens + 128);
  }

  export async function handleDictationRequest(
@@ -259,8 +221,10 @@ export async function handleDictationRequest(
  socket: net.Socket,
  ctx: HandlerContext,
  ): Promise<void> {
- const mode = await detectDictationMode(msg);
- log.info({ mode, transcriptionLength: msg.transcription.length }, 'Dictation request received');
+ log.info(
+ { transcriptionLength: msg.transcription.length },
+ "Dictation request received",
+ );

  // Resolve profile for all modes (metadata is included in response)
  const resolution = resolveProfile(
@@ -269,70 +233,253 @@ export async function handleDictationRequest(
  msg.profileId,
  );
  const { profile, source: profileSource } = resolution;
- log.info({ profileId: profile.id, profileSource }, 'Resolved dictation profile');
+ log.info(
+ { profileId: profile.id, profileSource },
+ "Resolved dictation profile",
+ );

  const profileMeta = {
  resolvedProfileId: profile.id,
  profileSource,
  };

- // Action mode: return immediately — the client will route to a full agent session
- if (mode === 'action') {
+ const stylePrompt = profile.stylePrompt || undefined;
+
+ // Command mode: selected text present — deterministic, no classification needed
+ if (msg.context.selectedText && msg.context.selectedText.trim().length > 0) {
+ log.info({ mode: "command" }, "Command mode (selected text present)");
+ await handleCommandMode(
+ msg,
+ socket,
+ ctx,
+ profile,
+ profileMeta,
+ stylePrompt,
+ );
+ return;
+ }
+
+ // Non-command: single LLM call that classifies AND cleans in one shot
+ const transcription = expandSnippets(msg.transcription, profile.snippets);
+
+ try {
+ const provider = getConfiguredProvider();
+ if (!provider) {
+ log.warn(
+ "Dictation: no provider available, using heuristic + raw transcription",
+ );
+ const mode = detectDictationModeHeuristic(msg);
+ const normalizedText = applyDictionary(transcription, profile.dictionary);
+ if (mode === "action") {
+ ctx.send(socket, {
+ type: "dictation_response",
+ text: msg.transcription,
+ mode: "action",
+ actionPlan: `User wants to: ${msg.transcription}`,
+ ...profileMeta,
+ });
+ } else {
+ ctx.send(socket, {
+ type: "dictation_response",
+ text: normalizedText,
+ mode,
+ ...profileMeta,
+ });
+ }
+ return;
+ }
+
+ const systemPrompt = buildCombinedDictationPrompt(msg, stylePrompt);
+ const maxTokens = computeMaxTokens(transcription.length);
+ const { signal, cleanup } = createTimeout(
+ DICTATION_CLASSIFICATION_TIMEOUT_MS,
+ );
+
+ try {
+ const response = await provider.sendMessage(
+ [userMessage(`Transcription: "${transcription}"`)],
+ [
+ {
+ name: "process_dictation",
+ description: "Classify the voice input and return cleaned text",
+ input_schema: {
+ type: "object" as const,
+ properties: {
+ mode: {
+ type: "string",
+ enum: ["dictation", "action"],
+ description:
+ "dictation = user wants text inserted/cleaned up for typing. action = user wants the assistant to perform a task (send a message, open an app, search, navigate, control something).",
+ },
+ text: {
+ type: "string",
+ description:
+ "If dictation: the cleaned/formatted text ready for insertion. If action: the raw transcription unchanged.",
+ },
+ reasoning: {
+ type: "string",
+ description: "Brief reasoning for the classification",
+ },
+ },
+ required: ["mode", "text", "reasoning"],
+ },
+ },
+ ],
+ systemPrompt,
+ {
+ config: {
+ modelIntent: "latency-optimized",
+ max_tokens: maxTokens,
+ tool_choice: { type: "tool" as const, name: "process_dictation" },
+ },
+ signal,
+ },
+ );
+ cleanup();
+
+ const toolBlock = extractToolUse(response);
+ if (toolBlock) {
+ const input = toolBlock.input as {
+ mode?: string;
+ text?: string;
+ reasoning?: string;
+ };
+ const mode: DictationMode =
+ input.mode === "action" ? "action" : "dictation";
+ log.info(
+ { mode, reasoning: input.reasoning },
+ "LLM dictation classify+clean",
+ );
+
+ if (mode === "action") {
+ ctx.send(socket, {
+ type: "dictation_response",
+ text: msg.transcription,
+ mode: "action",
+ actionPlan: `User wants to: ${msg.transcription}`,
+ ...profileMeta,
+ });
+ } else {
+ const cleanedText = input.text?.trim() || transcription;
+ const normalizedText = applyDictionary(
+ cleanedText,
+ profile.dictionary,
+ );
+ ctx.send(socket, {
+ type: "dictation_response",
+ text: normalizedText,
+ mode: "dictation",
+ ...profileMeta,
+ });
+ }
+ return;
+ }
+
+ // No tool_use block — fall through to heuristic
+ log.warn("No tool_use block in combined dictation call, using heuristic");
+ } finally {
+ cleanup();
+ }
+ } catch (err) {
+ const message = err instanceof Error ? err.message : String(err);
+ log.warn(
+ { err: message },
+ "Combined dictation LLM call failed, using heuristic",
+ );
+ }
+
+ // Heuristic fallback
+ const fallbackMode = detectDictationModeHeuristic(msg);
+ log.info({ mode: fallbackMode }, "Using heuristic fallback");
+ if (fallbackMode === "action") {
  ctx.send(socket, {
- type: 'dictation_response',
+ type: "dictation_response",
  text: msg.transcription,
- mode: 'action',
+ mode: "action",
  actionPlan: `User wants to: ${msg.transcription}`,
  ...profileMeta,
  });
- return;
+ } else {
+ const normalizedText = applyDictionary(transcription, profile.dictionary);
+ ctx.send(socket, {
+ type: "dictation_response",
+ text: normalizedText,
+ mode: fallbackMode,
+ ...profileMeta,
+ });
  }
+ }

- // Pre-LLM snippet expansion (dictation mode only)
- const transcription = mode === 'dictation'
- ? expandSnippets(msg.transcription, profile.snippets)
- : msg.transcription;
-
- // Dictation / command mode: make a single-turn LLM call for text cleanup or transformation
- const stylePrompt = profile.stylePrompt || undefined;
- const systemPrompt = mode === 'dictation'
- ? buildDictationPrompt(msg, stylePrompt)
- : buildCommandPrompt(msg, stylePrompt);
-
- const userText = transcription;
+ /** Handle command mode (selected text) — separate code path, latency-optimized. */
+ async function handleCommandMode(
+ msg: DictationRequest,
+ socket: net.Socket,
+ ctx: HandlerContext,
+ profile: ReturnType<typeof resolveProfile>["profile"],
+ profileMeta: {
+ resolvedProfileId: string;
+ profileSource: ProfileResolution["source"];
+ },
+ stylePrompt: string | undefined,
+ ): Promise<void> {
+ const systemPrompt = buildCommandPrompt(msg, stylePrompt);
+ const inputLength =
+ (msg.context.selectedText ?? "").length + msg.transcription.length;
+ const maxTokens = Math.max(1024, computeMaxTokens(inputLength));

  try {
  const provider = getConfiguredProvider();
  if (!provider) {
- log.warn('Dictation: no provider available, returning raw transcription');
- const fallbackText = mode === 'command' ? (msg.context.selectedText ?? transcription) : transcription;
- const normalizedText = applyDictionary(fallbackText, profile.dictionary);
- ctx.send(socket, { type: 'dictation_response', text: normalizedText, mode, ...profileMeta });
+ log.warn("Command mode: no provider available, returning selected text");
+ const normalizedText = applyDictionary(
+ msg.context.selectedText ?? msg.transcription,
+ profile.dictionary,
+ );
+ ctx.send(socket, {
+ type: "dictation_response",
+ text: normalizedText,
+ mode: "command",
+ ...profileMeta,
+ });
  return;
  }

  const response = await provider.sendMessage(
- [{ role: 'user', content: [{ type: 'text', text: userText }] }],
+ [userMessage(msg.transcription)],
  [], // no tools
  systemPrompt,
- { config: { max_tokens: 1024 } },
+ { config: { modelIntent: "latency-optimized", max_tokens: maxTokens } },
  );

- const textBlock = response.content.find((b) => b.type === 'text');
- const inlineFallback = mode === 'command' ? (msg.context.selectedText ?? transcription) : transcription;
- const cleanedText = textBlock && 'text' in textBlock ? textBlock.text.trim() : inlineFallback;
-
- // Post-LLM dictionary normalization
+ const textBlock = response.content.find((b) => b.type === "text");
+ const cleanedText =
+ textBlock && "text" in textBlock
+ ? textBlock.text.trim()
+ : (msg.context.selectedText ?? msg.transcription);
  const normalizedText = applyDictionary(cleanedText, profile.dictionary);
-
- ctx.send(socket, { type: 'dictation_response', text: normalizedText, mode, ...profileMeta });
+ ctx.send(socket, {
+ type: "dictation_response",
+ text: normalizedText,
+ mode: "command",
+ ...profileMeta,
+ });
  } catch (err) {
  const message = err instanceof Error ? err.message : String(err);
- log.error({ err }, 'Dictation LLM call failed, returning raw transcription');
- const fallbackText = mode === 'command' ? (msg.context.selectedText ?? transcription) : transcription;
- const normalizedText = applyDictionary(fallbackText, profile.dictionary);
- ctx.send(socket, { type: 'dictation_response', text: normalizedText, mode, ...profileMeta });
- ctx.send(socket, { type: 'error', message: `Dictation cleanup failed: ${message}` });
+ log.error({ err }, "Command mode LLM call failed, returning selected text");
+ const normalizedText = applyDictionary(
+ msg.context.selectedText ?? msg.transcription,
+ profile.dictionary,
+ );
+ ctx.send(socket, {
+ type: "dictation_response",
+ text: normalizedText,
+ mode: "command",
+ ...profileMeta,
+ });
+ ctx.send(socket, {
+ type: "error",
+ message: `Dictation cleanup failed: ${message}`,
+ });
  }
  }
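
For reference, the new computeMaxTokens helper added in 0.4.22 sizes the response budget from the input length. A minimal sketch of its behavior, assuming only what is visible in the diff above (the sample inputs are illustrative):

function computeMaxTokens(inputLength: number): number {
  // Rough estimate of ~3 characters per token, as in the diff above.
  const estimatedInputTokens = Math.ceil(inputLength / 3);
  // Floor of 256 tokens, plus 128 tokens of headroom over the estimated input size.
  return Math.max(256, estimatedInputTokens + 128);
}

computeMaxTokens(60);  // short utterance: 20 + 128 = 148, floored to 256
computeMaxTokens(900); // long dictation: 300 + 128 = 428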