@vellumai/assistant 0.4.20 → 0.4.22
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/config/system-prompt.ts +1 -0
- package/src/config/templates/BOOTSTRAP.md +21 -31
- package/src/config/templates/SOUL.md +19 -9
- package/src/daemon/computer-use-session.ts +5 -3
- package/src/daemon/handlers/config-voice.ts +155 -33
- package/src/daemon/handlers/dictation.ts +361 -214
- package/src/daemon/session-runtime-assembly.ts +477 -247
- package/src/daemon/session-surfaces.ts +5 -3
|
@@ -1,13 +1,38 @@
|
|
|
1
|
-
import * as net from
|
|
2
|
-
|
|
3
|
-
import {
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
1
|
+
import * as net from "node:net";
|
|
2
|
+
|
|
3
|
+
import {
|
|
4
|
+
createTimeout,
|
|
5
|
+
extractToolUse,
|
|
6
|
+
getConfiguredProvider,
|
|
7
|
+
userMessage,
|
|
8
|
+
} from "../../providers/provider-send-message.js";
|
|
9
|
+
import {
|
|
10
|
+
type ProfileResolution,
|
|
11
|
+
resolveProfile,
|
|
12
|
+
} from "../dictation-profile-store.js";
|
|
13
|
+
import {
|
|
14
|
+
applyDictionary,
|
|
15
|
+
expandSnippets,
|
|
16
|
+
} from "../dictation-text-processing.js";
|
|
17
|
+
import type { DictationRequest } from "../ipc-protocol.js";
|
|
18
|
+
import { defineHandlers, type HandlerContext, log } from "./shared.js";
|
|
8
19
|
|
|
9
20
|
// Action verbs for fast heuristic fallback (used when LLM classifier is unavailable)
|
|
10
|
-
const ACTION_VERBS = [
|
|
21
|
+
const ACTION_VERBS = [
|
|
22
|
+
"slack",
|
|
23
|
+
"email",
|
|
24
|
+
"send",
|
|
25
|
+
"create",
|
|
26
|
+
"open",
|
|
27
|
+
"search",
|
|
28
|
+
"find",
|
|
29
|
+
"message",
|
|
30
|
+
"text",
|
|
31
|
+
"schedule",
|
|
32
|
+
"remind",
|
|
33
|
+
"launch",
|
|
34
|
+
"navigate",
|
|
35
|
+
];
|
|
11
36
|
|
|
12
37
|
const DICTATION_CLASSIFICATION_TIMEOUT_MS = 5000;
|
|
13
38
|
|
|
@@ -15,9 +40,9 @@ const MAX_WINDOW_TITLE_LENGTH = 100;
|
|
|
15
40
|
|
|
16
41
|
/** Sanitize window title to mitigate prompt injection from attacker-controlled titles (e.g. browser tabs, Slack conversations). */
|
|
17
42
|
function sanitizeWindowTitle(title: string | undefined): string {
|
|
18
|
-
if (!title) return
|
|
43
|
+
if (!title) return "";
|
|
19
44
|
return title
|
|
20
|
-
.replace(/[<>]/g,
|
|
45
|
+
.replace(/[<>]/g, "") // strip angle brackets to prevent tag injection
|
|
21
46
|
.slice(0, MAX_WINDOW_TITLE_LENGTH);
|
|
22
47
|
}
|
|
23
48
|
|
|
@@ -25,233 +50,170 @@ function sanitizeWindowTitle(title: string | undefined): string {
|
|
|
25
50
|
function buildAppMetadataBlock(msg: DictationRequest): string {
|
|
26
51
|
const windowTitle = sanitizeWindowTitle(msg.context.windowTitle);
|
|
27
52
|
return [
|
|
28
|
-
|
|
53
|
+
"<app_metadata>",
|
|
29
54
|
`App: ${msg.context.appName} (${msg.context.bundleIdentifier})`,
|
|
30
55
|
`Window: ${windowTitle}`,
|
|
31
|
-
|
|
32
|
-
].join(
|
|
56
|
+
"</app_metadata>",
|
|
57
|
+
].join("\n");
|
|
33
58
|
}
|
|
34
59
|
|
|
35
|
-
type DictationMode =
|
|
60
|
+
type DictationMode = "dictation" | "command" | "action";
|
|
36
61
|
|
|
37
62
|
/** Fast heuristic fallback — used when LLM classifier is unavailable or fails. */
|
|
38
|
-
export function detectDictationModeHeuristic(
|
|
63
|
+
export function detectDictationModeHeuristic(
|
|
64
|
+
msg: DictationRequest,
|
|
65
|
+
): DictationMode {
|
|
39
66
|
// Command mode: selected text present — treat transcription as a transformation instruction
|
|
40
67
|
if (msg.context.selectedText && msg.context.selectedText.trim().length > 0) {
|
|
41
|
-
return
|
|
68
|
+
return "command";
|
|
42
69
|
}
|
|
43
70
|
|
|
44
71
|
// Action mode: transcription starts with an action verb
|
|
45
|
-
const firstWord =
|
|
72
|
+
const firstWord =
|
|
73
|
+
msg.transcription.trim().split(/\s+/)[0]?.toLowerCase() ?? "";
|
|
46
74
|
if (ACTION_VERBS.includes(firstWord)) {
|
|
47
|
-
return
|
|
75
|
+
return "action";
|
|
48
76
|
}
|
|
49
77
|
|
|
50
78
|
// Dictation mode: cursor is in a text field with no selection — clean up for typing
|
|
51
79
|
if (msg.context.cursorInTextField) {
|
|
52
|
-
return
|
|
53
|
-
}
|
|
54
|
-
|
|
55
|
-
return 'dictation';
|
|
56
|
-
}
|
|
57
|
-
|
|
58
|
-
/** Classify dictation mode using Haiku, falling back to heuristic. */
|
|
59
|
-
export async function detectDictationMode(msg: DictationRequest): Promise<DictationMode> {
|
|
60
|
-
// Command mode is deterministic — no need for LLM
|
|
61
|
-
if (msg.context.selectedText && msg.context.selectedText.trim().length > 0) {
|
|
62
|
-
return 'command';
|
|
80
|
+
return "dictation";
|
|
63
81
|
}
|
|
64
82
|
|
|
65
|
-
|
|
66
|
-
if (!provider) {
|
|
67
|
-
log.warn('No provider for dictation classification, using heuristic');
|
|
68
|
-
return detectDictationModeHeuristic(msg);
|
|
69
|
-
}
|
|
70
|
-
|
|
71
|
-
try {
|
|
72
|
-
const { signal, cleanup } = createTimeout(DICTATION_CLASSIFICATION_TIMEOUT_MS);
|
|
73
|
-
try {
|
|
74
|
-
const contextInfo = [
|
|
75
|
-
`App: ${msg.context.appName} (${msg.context.bundleIdentifier})`,
|
|
76
|
-
msg.context.windowTitle ? `Window: ${msg.context.windowTitle}` : '',
|
|
77
|
-
`Cursor in text field: ${msg.context.cursorInTextField ? 'yes' : 'no'}`,
|
|
78
|
-
].filter(Boolean).join('\n');
|
|
79
|
-
|
|
80
|
-
const response = await provider.sendMessage(
|
|
81
|
-
[userMessage(`Transcription: "${msg.transcription}"\n\nContext:\n${contextInfo}`)],
|
|
82
|
-
[{
|
|
83
|
-
name: 'classify_dictation',
|
|
84
|
-
description: 'Classify whether voice input is dictation or an action command',
|
|
85
|
-
input_schema: {
|
|
86
|
-
type: 'object' as const,
|
|
87
|
-
properties: {
|
|
88
|
-
mode: {
|
|
89
|
-
type: 'string',
|
|
90
|
-
enum: ['dictation', 'action'],
|
|
91
|
-
description: 'dictation = user wants text inserted/cleaned up for typing. action = user wants the assistant to perform a task (send a message, open an app, search, navigate, control something).',
|
|
92
|
-
},
|
|
93
|
-
reasoning: {
|
|
94
|
-
type: 'string',
|
|
95
|
-
description: 'Brief reasoning for the classification',
|
|
96
|
-
},
|
|
97
|
-
},
|
|
98
|
-
required: ['mode', 'reasoning'],
|
|
99
|
-
},
|
|
100
|
-
}],
|
|
101
|
-
[
|
|
102
|
-
'You classify voice transcriptions as either "dictation" (text to insert) or "action" (task for an assistant to execute).',
|
|
103
|
-
'',
|
|
104
|
-
'DICTATION examples: "Hey how are you doing", "I think we should move forward with the proposal", "Dear team comma please review the attached document"',
|
|
105
|
-
'ACTION examples: "Message Aaron on Slack saying hey what\'s up", "Send an email to the team about the meeting", "Open Spotify and play my playlist", "Search for flights to Denver", "Create a new document in Google Docs"',
|
|
106
|
-
'',
|
|
107
|
-
'Key signals for ACTION: the user is addressing an assistant and asking it to DO something (send, message, open, search, create, schedule, etc.)',
|
|
108
|
-
'Key signals for DICTATION: the user is composing text content that should be typed out as-is',
|
|
109
|
-
'',
|
|
110
|
-
'Context is provided — if the cursor is in a text field, lean toward dictation unless the intent to command is clear.',
|
|
111
|
-
].join('\n'),
|
|
112
|
-
{
|
|
113
|
-
config: {
|
|
114
|
-
modelIntent: 'latency-optimized',
|
|
115
|
-
max_tokens: 128,
|
|
116
|
-
tool_choice: { type: 'tool' as const, name: 'classify_dictation' },
|
|
117
|
-
},
|
|
118
|
-
signal,
|
|
119
|
-
},
|
|
120
|
-
);
|
|
121
|
-
cleanup();
|
|
122
|
-
|
|
123
|
-
const toolBlock = extractToolUse(response);
|
|
124
|
-
if (toolBlock) {
|
|
125
|
-
const input = toolBlock.input as { mode?: string; reasoning?: string };
|
|
126
|
-
const mode = input.mode === 'action' ? 'action' : 'dictation';
|
|
127
|
-
log.info({ mode, reasoning: input.reasoning }, 'LLM dictation classification');
|
|
128
|
-
return mode;
|
|
129
|
-
}
|
|
130
|
-
|
|
131
|
-
log.warn('No tool_use block in dictation classification, using heuristic');
|
|
132
|
-
return detectDictationModeHeuristic(msg);
|
|
133
|
-
} finally {
|
|
134
|
-
cleanup();
|
|
135
|
-
}
|
|
136
|
-
} catch (err) {
|
|
137
|
-
const message = err instanceof Error ? err.message : String(err);
|
|
138
|
-
log.warn({ err: message }, 'LLM dictation classification failed, using heuristic');
|
|
139
|
-
return detectDictationModeHeuristic(msg);
|
|
140
|
-
}
|
|
83
|
+
return "dictation";
|
|
141
84
|
}
|
|
142
85
|
|
|
143
|
-
|
|
86
|
+
/** Build a combined system prompt that classifies AND cleans dictation in a single LLM call. */
|
|
87
|
+
function buildCombinedDictationPrompt(
|
|
88
|
+
msg: DictationRequest,
|
|
89
|
+
stylePrompt?: string,
|
|
90
|
+
): string {
|
|
144
91
|
const sections = [
|
|
145
|
-
|
|
146
|
-
'',
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
92
|
+
"You are a voice input assistant. You will receive a speech transcription and must:",
|
|
93
|
+
'1. Classify it as "dictation" (text to insert) or "action" (task for an assistant to execute)',
|
|
94
|
+
"2. If dictation, clean up the text. If action, return the raw transcription.",
|
|
95
|
+
"",
|
|
96
|
+
"## Classification",
|
|
97
|
+
'DICTATION examples: "Hey how are you doing", "I think we should move forward with the proposal", "Dear team comma please review the attached document"',
|
|
98
|
+
'ACTION examples: "Message Aaron on Slack saying hey what\'s up", "Send an email to the team about the meeting", "Open Spotify and play my playlist", "Search for flights to Denver", "Create a new document in Google Docs"',
|
|
99
|
+
"",
|
|
100
|
+
"Key signals for ACTION: the user is addressing an assistant and asking it to DO something (send, message, open, search, create, schedule, etc.)",
|
|
101
|
+
"Key signals for DICTATION: the user is composing text content that should be typed out as-is",
|
|
102
|
+
`Cursor in text field: ${msg.context.cursorInTextField ? "yes" : "no"} — if yes, lean toward dictation unless the intent to command is clear.`,
|
|
103
|
+
"",
|
|
104
|
+
"## Cleanup Rules (for dictation mode only)",
|
|
105
|
+
"- Fix grammar, punctuation, and capitalization",
|
|
106
|
+
"- Remove filler words (um, uh, like, you know)",
|
|
150
107
|
'- Rewrite vague or hedging language ("so yeah probably", "I guess maybe") into clear, confident statements',
|
|
151
108
|
"- Maintain the speaker's intent and meaning",
|
|
152
|
-
'- Do NOT add explanations or commentary',
|
|
153
|
-
'- Return ONLY the cleaned text, nothing else',
|
|
154
109
|
];
|
|
155
110
|
|
|
156
111
|
if (stylePrompt) {
|
|
157
112
|
sections.push(
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
113
|
+
"",
|
|
114
|
+
"## User Style (HIGHEST PRIORITY)",
|
|
115
|
+
"The user has configured these style preferences. They OVERRIDE the default tone adaptation below.",
|
|
116
|
+
"Follow these instructions precisely — they reflect the user's personal writing voice and preferences.",
|
|
117
|
+
"",
|
|
163
118
|
stylePrompt,
|
|
164
119
|
);
|
|
165
120
|
}
|
|
166
121
|
|
|
167
|
-
sections.push(
|
|
168
|
-
'',
|
|
169
|
-
'## Tone Adaptation',
|
|
170
|
-
);
|
|
122
|
+
sections.push("", "## Tone Adaptation");
|
|
171
123
|
|
|
172
124
|
if (stylePrompt) {
|
|
173
|
-
sections.push(
|
|
125
|
+
sections.push(
|
|
126
|
+
"Use these as fallback guidance only when the User Style above does not cover a specific aspect:",
|
|
127
|
+
);
|
|
174
128
|
} else {
|
|
175
|
-
sections.push(
|
|
129
|
+
sections.push("Adapt your output tone based on the active application:");
|
|
176
130
|
}
|
|
177
131
|
|
|
178
132
|
sections.push(
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
133
|
+
"- Email apps (Gmail, Mail): Professional but warm. Use proper greetings and sign-offs if appropriate.",
|
|
134
|
+
"- Slack: Casual and conversational. Match typical chat style.",
|
|
135
|
+
"- Code editors (VS Code, Xcode): Technical and concise. Code comments style.",
|
|
136
|
+
"- Terminal: Command-like, terse.",
|
|
137
|
+
"- Messages/iMessage: Very casual, texting style. Short sentences.",
|
|
138
|
+
"- Notes/Docs: Neutral, clear writing.",
|
|
139
|
+
"- Default: Match the user's natural voice.",
|
|
140
|
+
"",
|
|
141
|
+
"## Context Clues",
|
|
142
|
+
"- Window title may contain recipient name (Slack DMs, email compose)",
|
|
143
|
+
"- If you can identify a recipient, adapt formality to the apparent relationship",
|
|
144
|
+
"- Maintain the user's natural voice — don't over-formalize casual speech",
|
|
145
|
+
"- The user's writing patterns and preferences may be available from memory context — follow those when present",
|
|
146
|
+
"",
|
|
193
147
|
buildAppMetadataBlock(msg),
|
|
194
148
|
);
|
|
195
149
|
|
|
196
|
-
return sections.join(
|
|
150
|
+
return sections.join("\n");
|
|
197
151
|
}
|
|
198
152
|
|
|
199
|
-
function buildCommandPrompt(
|
|
153
|
+
function buildCommandPrompt(
|
|
154
|
+
msg: DictationRequest,
|
|
155
|
+
stylePrompt?: string,
|
|
156
|
+
): string {
|
|
200
157
|
const sections = [
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
158
|
+
"You are a text transformation assistant. The user has selected text and given a voice command to transform it.",
|
|
159
|
+
"",
|
|
160
|
+
"## Rules",
|
|
161
|
+
"- Apply the instruction to the selected text",
|
|
162
|
+
"- Return ONLY the transformed text, nothing else",
|
|
163
|
+
"- Do NOT add explanations or commentary",
|
|
207
164
|
];
|
|
208
165
|
|
|
209
166
|
if (stylePrompt) {
|
|
210
167
|
sections.push(
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
168
|
+
"",
|
|
169
|
+
"## User Style (HIGHEST PRIORITY)",
|
|
170
|
+
"The user has configured these style preferences. They OVERRIDE the default tone adaptation below.",
|
|
171
|
+
"Follow these instructions precisely — they reflect the user's personal writing voice and preferences.",
|
|
172
|
+
"",
|
|
216
173
|
stylePrompt,
|
|
217
174
|
);
|
|
218
175
|
}
|
|
219
176
|
|
|
220
|
-
sections.push(
|
|
221
|
-
'',
|
|
222
|
-
'## Tone Adaptation',
|
|
223
|
-
);
|
|
177
|
+
sections.push("", "## Tone Adaptation");
|
|
224
178
|
|
|
225
179
|
if (stylePrompt) {
|
|
226
|
-
sections.push(
|
|
180
|
+
sections.push(
|
|
181
|
+
"Use these as fallback guidance only when the User Style above does not cover a specific aspect:",
|
|
182
|
+
);
|
|
227
183
|
} else {
|
|
228
|
-
sections.push(
|
|
184
|
+
sections.push("Match the tone to the active application context:");
|
|
229
185
|
}
|
|
230
186
|
|
|
231
187
|
sections.push(
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
188
|
+
"- Email apps (Gmail, Mail): Professional but warm.",
|
|
189
|
+
"- Slack: Casual and conversational.",
|
|
190
|
+
"- Code editors (VS Code, Xcode): Technical and concise.",
|
|
191
|
+
"- Terminal: Command-like, terse.",
|
|
192
|
+
"- Messages/iMessage: Very casual, texting style.",
|
|
193
|
+
"- Notes/Docs: Neutral, clear writing.",
|
|
194
|
+
"- Default: Match the user's natural voice.",
|
|
195
|
+
"",
|
|
196
|
+
"## Context Clues",
|
|
197
|
+
"- Window title may contain recipient name (Slack DMs, email compose)",
|
|
198
|
+
"- If you can identify a recipient, adapt formality to the apparent relationship",
|
|
199
|
+
"- Maintain the user's natural voice — don't over-formalize casual speech",
|
|
200
|
+
"- The user's writing patterns and preferences may be available from memory context — follow those when present",
|
|
201
|
+
"",
|
|
246
202
|
buildAppMetadataBlock(msg),
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
msg.context.selectedText ??
|
|
250
|
-
|
|
203
|
+
"",
|
|
204
|
+
"Selected text:",
|
|
205
|
+
msg.context.selectedText ?? "",
|
|
206
|
+
"",
|
|
251
207
|
`Instruction: ${msg.transcription}`,
|
|
252
208
|
);
|
|
253
209
|
|
|
254
|
-
return sections.join(
|
|
210
|
+
return sections.join("\n");
|
|
211
|
+
}
|
|
212
|
+
|
|
213
|
+
/** Compute dynamic max_tokens based on input length to avoid waste and truncation. */
|
|
214
|
+
function computeMaxTokens(inputLength: number): number {
|
|
215
|
+
const estimatedInputTokens = Math.ceil(inputLength / 3);
|
|
216
|
+
return Math.max(256, estimatedInputTokens + 128);
|
|
255
217
|
}
|
|
256
218
|
|
|
257
219
|
export async function handleDictationRequest(
|
|
@@ -259,8 +221,10 @@ export async function handleDictationRequest(
|
|
|
259
221
|
socket: net.Socket,
|
|
260
222
|
ctx: HandlerContext,
|
|
261
223
|
): Promise<void> {
|
|
262
|
-
|
|
263
|
-
|
|
224
|
+
log.info(
|
|
225
|
+
{ transcriptionLength: msg.transcription.length },
|
|
226
|
+
"Dictation request received",
|
|
227
|
+
);
|
|
264
228
|
|
|
265
229
|
// Resolve profile for all modes (metadata is included in response)
|
|
266
230
|
const resolution = resolveProfile(
|
|
@@ -269,70 +233,253 @@ export async function handleDictationRequest(
|
|
|
269
233
|
msg.profileId,
|
|
270
234
|
);
|
|
271
235
|
const { profile, source: profileSource } = resolution;
|
|
272
|
-
log.info(
|
|
236
|
+
log.info(
|
|
237
|
+
{ profileId: profile.id, profileSource },
|
|
238
|
+
"Resolved dictation profile",
|
|
239
|
+
);
|
|
273
240
|
|
|
274
241
|
const profileMeta = {
|
|
275
242
|
resolvedProfileId: profile.id,
|
|
276
243
|
profileSource,
|
|
277
244
|
};
|
|
278
245
|
|
|
279
|
-
|
|
280
|
-
|
|
246
|
+
const stylePrompt = profile.stylePrompt || undefined;
|
|
247
|
+
|
|
248
|
+
// Command mode: selected text present — deterministic, no classification needed
|
|
249
|
+
if (msg.context.selectedText && msg.context.selectedText.trim().length > 0) {
|
|
250
|
+
log.info({ mode: "command" }, "Command mode (selected text present)");
|
|
251
|
+
await handleCommandMode(
|
|
252
|
+
msg,
|
|
253
|
+
socket,
|
|
254
|
+
ctx,
|
|
255
|
+
profile,
|
|
256
|
+
profileMeta,
|
|
257
|
+
stylePrompt,
|
|
258
|
+
);
|
|
259
|
+
return;
|
|
260
|
+
}
|
|
261
|
+
|
|
262
|
+
// Non-command: single LLM call that classifies AND cleans in one shot
|
|
263
|
+
const transcription = expandSnippets(msg.transcription, profile.snippets);
|
|
264
|
+
|
|
265
|
+
try {
|
|
266
|
+
const provider = getConfiguredProvider();
|
|
267
|
+
if (!provider) {
|
|
268
|
+
log.warn(
|
|
269
|
+
"Dictation: no provider available, using heuristic + raw transcription",
|
|
270
|
+
);
|
|
271
|
+
const mode = detectDictationModeHeuristic(msg);
|
|
272
|
+
const normalizedText = applyDictionary(transcription, profile.dictionary);
|
|
273
|
+
if (mode === "action") {
|
|
274
|
+
ctx.send(socket, {
|
|
275
|
+
type: "dictation_response",
|
|
276
|
+
text: msg.transcription,
|
|
277
|
+
mode: "action",
|
|
278
|
+
actionPlan: `User wants to: ${msg.transcription}`,
|
|
279
|
+
...profileMeta,
|
|
280
|
+
});
|
|
281
|
+
} else {
|
|
282
|
+
ctx.send(socket, {
|
|
283
|
+
type: "dictation_response",
|
|
284
|
+
text: normalizedText,
|
|
285
|
+
mode,
|
|
286
|
+
...profileMeta,
|
|
287
|
+
});
|
|
288
|
+
}
|
|
289
|
+
return;
|
|
290
|
+
}
|
|
291
|
+
|
|
292
|
+
const systemPrompt = buildCombinedDictationPrompt(msg, stylePrompt);
|
|
293
|
+
const maxTokens = computeMaxTokens(transcription.length);
|
|
294
|
+
const { signal, cleanup } = createTimeout(
|
|
295
|
+
DICTATION_CLASSIFICATION_TIMEOUT_MS,
|
|
296
|
+
);
|
|
297
|
+
|
|
298
|
+
try {
|
|
299
|
+
const response = await provider.sendMessage(
|
|
300
|
+
[userMessage(`Transcription: "${transcription}"`)],
|
|
301
|
+
[
|
|
302
|
+
{
|
|
303
|
+
name: "process_dictation",
|
|
304
|
+
description: "Classify the voice input and return cleaned text",
|
|
305
|
+
input_schema: {
|
|
306
|
+
type: "object" as const,
|
|
307
|
+
properties: {
|
|
308
|
+
mode: {
|
|
309
|
+
type: "string",
|
|
310
|
+
enum: ["dictation", "action"],
|
|
311
|
+
description:
|
|
312
|
+
"dictation = user wants text inserted/cleaned up for typing. action = user wants the assistant to perform a task (send a message, open an app, search, navigate, control something).",
|
|
313
|
+
},
|
|
314
|
+
text: {
|
|
315
|
+
type: "string",
|
|
316
|
+
description:
|
|
317
|
+
"If dictation: the cleaned/formatted text ready for insertion. If action: the raw transcription unchanged.",
|
|
318
|
+
},
|
|
319
|
+
reasoning: {
|
|
320
|
+
type: "string",
|
|
321
|
+
description: "Brief reasoning for the classification",
|
|
322
|
+
},
|
|
323
|
+
},
|
|
324
|
+
required: ["mode", "text", "reasoning"],
|
|
325
|
+
},
|
|
326
|
+
},
|
|
327
|
+
],
|
|
328
|
+
systemPrompt,
|
|
329
|
+
{
|
|
330
|
+
config: {
|
|
331
|
+
modelIntent: "latency-optimized",
|
|
332
|
+
max_tokens: maxTokens,
|
|
333
|
+
tool_choice: { type: "tool" as const, name: "process_dictation" },
|
|
334
|
+
},
|
|
335
|
+
signal,
|
|
336
|
+
},
|
|
337
|
+
);
|
|
338
|
+
cleanup();
|
|
339
|
+
|
|
340
|
+
const toolBlock = extractToolUse(response);
|
|
341
|
+
if (toolBlock) {
|
|
342
|
+
const input = toolBlock.input as {
|
|
343
|
+
mode?: string;
|
|
344
|
+
text?: string;
|
|
345
|
+
reasoning?: string;
|
|
346
|
+
};
|
|
347
|
+
const mode: DictationMode =
|
|
348
|
+
input.mode === "action" ? "action" : "dictation";
|
|
349
|
+
log.info(
|
|
350
|
+
{ mode, reasoning: input.reasoning },
|
|
351
|
+
"LLM dictation classify+clean",
|
|
352
|
+
);
|
|
353
|
+
|
|
354
|
+
if (mode === "action") {
|
|
355
|
+
ctx.send(socket, {
|
|
356
|
+
type: "dictation_response",
|
|
357
|
+
text: msg.transcription,
|
|
358
|
+
mode: "action",
|
|
359
|
+
actionPlan: `User wants to: ${msg.transcription}`,
|
|
360
|
+
...profileMeta,
|
|
361
|
+
});
|
|
362
|
+
} else {
|
|
363
|
+
const cleanedText = input.text?.trim() || transcription;
|
|
364
|
+
const normalizedText = applyDictionary(
|
|
365
|
+
cleanedText,
|
|
366
|
+
profile.dictionary,
|
|
367
|
+
);
|
|
368
|
+
ctx.send(socket, {
|
|
369
|
+
type: "dictation_response",
|
|
370
|
+
text: normalizedText,
|
|
371
|
+
mode: "dictation",
|
|
372
|
+
...profileMeta,
|
|
373
|
+
});
|
|
374
|
+
}
|
|
375
|
+
return;
|
|
376
|
+
}
|
|
377
|
+
|
|
378
|
+
// No tool_use block — fall through to heuristic
|
|
379
|
+
log.warn("No tool_use block in combined dictation call, using heuristic");
|
|
380
|
+
} finally {
|
|
381
|
+
cleanup();
|
|
382
|
+
}
|
|
383
|
+
} catch (err) {
|
|
384
|
+
const message = err instanceof Error ? err.message : String(err);
|
|
385
|
+
log.warn(
|
|
386
|
+
{ err: message },
|
|
387
|
+
"Combined dictation LLM call failed, using heuristic",
|
|
388
|
+
);
|
|
389
|
+
}
|
|
390
|
+
|
|
391
|
+
// Heuristic fallback
|
|
392
|
+
const fallbackMode = detectDictationModeHeuristic(msg);
|
|
393
|
+
log.info({ mode: fallbackMode }, "Using heuristic fallback");
|
|
394
|
+
if (fallbackMode === "action") {
|
|
281
395
|
ctx.send(socket, {
|
|
282
|
-
type:
|
|
396
|
+
type: "dictation_response",
|
|
283
397
|
text: msg.transcription,
|
|
284
|
-
mode:
|
|
398
|
+
mode: "action",
|
|
285
399
|
actionPlan: `User wants to: ${msg.transcription}`,
|
|
286
400
|
...profileMeta,
|
|
287
401
|
});
|
|
288
|
-
|
|
402
|
+
} else {
|
|
403
|
+
const normalizedText = applyDictionary(transcription, profile.dictionary);
|
|
404
|
+
ctx.send(socket, {
|
|
405
|
+
type: "dictation_response",
|
|
406
|
+
text: normalizedText,
|
|
407
|
+
mode: fallbackMode,
|
|
408
|
+
...profileMeta,
|
|
409
|
+
});
|
|
289
410
|
}
|
|
411
|
+
}
|
|
290
412
|
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
413
|
+
/** Handle command mode (selected text) — separate code path, latency-optimized. */
|
|
414
|
+
async function handleCommandMode(
|
|
415
|
+
msg: DictationRequest,
|
|
416
|
+
socket: net.Socket,
|
|
417
|
+
ctx: HandlerContext,
|
|
418
|
+
profile: ReturnType<typeof resolveProfile>["profile"],
|
|
419
|
+
profileMeta: {
|
|
420
|
+
resolvedProfileId: string;
|
|
421
|
+
profileSource: ProfileResolution["source"];
|
|
422
|
+
},
|
|
423
|
+
stylePrompt: string | undefined,
|
|
424
|
+
): Promise<void> {
|
|
425
|
+
const systemPrompt = buildCommandPrompt(msg, stylePrompt);
|
|
426
|
+
const inputLength =
|
|
427
|
+
(msg.context.selectedText ?? "").length + msg.transcription.length;
|
|
428
|
+
const maxTokens = Math.max(1024, computeMaxTokens(inputLength));
|
|
303
429
|
|
|
304
430
|
try {
|
|
305
431
|
const provider = getConfiguredProvider();
|
|
306
432
|
if (!provider) {
|
|
307
|
-
log.warn(
|
|
308
|
-
const
|
|
309
|
-
|
|
310
|
-
|
|
433
|
+
log.warn("Command mode: no provider available, returning selected text");
|
|
434
|
+
const normalizedText = applyDictionary(
|
|
435
|
+
msg.context.selectedText ?? msg.transcription,
|
|
436
|
+
profile.dictionary,
|
|
437
|
+
);
|
|
438
|
+
ctx.send(socket, {
|
|
439
|
+
type: "dictation_response",
|
|
440
|
+
text: normalizedText,
|
|
441
|
+
mode: "command",
|
|
442
|
+
...profileMeta,
|
|
443
|
+
});
|
|
311
444
|
return;
|
|
312
445
|
}
|
|
313
446
|
|
|
314
447
|
const response = await provider.sendMessage(
|
|
315
|
-
[
|
|
448
|
+
[userMessage(msg.transcription)],
|
|
316
449
|
[], // no tools
|
|
317
450
|
systemPrompt,
|
|
318
|
-
{ config: { max_tokens:
|
|
451
|
+
{ config: { modelIntent: "latency-optimized", max_tokens: maxTokens } },
|
|
319
452
|
);
|
|
320
453
|
|
|
321
|
-
const textBlock = response.content.find((b) => b.type ===
|
|
322
|
-
const
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
454
|
+
const textBlock = response.content.find((b) => b.type === "text");
|
|
455
|
+
const cleanedText =
|
|
456
|
+
textBlock && "text" in textBlock
|
|
457
|
+
? textBlock.text.trim()
|
|
458
|
+
: (msg.context.selectedText ?? msg.transcription);
|
|
326
459
|
const normalizedText = applyDictionary(cleanedText, profile.dictionary);
|
|
327
|
-
|
|
328
|
-
|
|
460
|
+
ctx.send(socket, {
|
|
461
|
+
type: "dictation_response",
|
|
462
|
+
text: normalizedText,
|
|
463
|
+
mode: "command",
|
|
464
|
+
...profileMeta,
|
|
465
|
+
});
|
|
329
466
|
} catch (err) {
|
|
330
467
|
const message = err instanceof Error ? err.message : String(err);
|
|
331
|
-
log.error({ err },
|
|
332
|
-
const
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
468
|
+
log.error({ err }, "Command mode LLM call failed, returning selected text");
|
|
469
|
+
const normalizedText = applyDictionary(
|
|
470
|
+
msg.context.selectedText ?? msg.transcription,
|
|
471
|
+
profile.dictionary,
|
|
472
|
+
);
|
|
473
|
+
ctx.send(socket, {
|
|
474
|
+
type: "dictation_response",
|
|
475
|
+
text: normalizedText,
|
|
476
|
+
mode: "command",
|
|
477
|
+
...profileMeta,
|
|
478
|
+
});
|
|
479
|
+
ctx.send(socket, {
|
|
480
|
+
type: "error",
|
|
481
|
+
message: `Dictation cleanup failed: ${message}`,
|
|
482
|
+
});
|
|
336
483
|
}
|
|
337
484
|
}
|
|
338
485
|
|