@agent-native/core 0.7.20 → 0.7.22

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (155) hide show
  1. package/dist/agent/engine/ai-sdk-engine.d.ts.map +1 -1
  2. package/dist/agent/engine/ai-sdk-engine.js +43 -1
  3. package/dist/agent/engine/ai-sdk-engine.js.map +1 -1
  4. package/dist/agent/engine/anthropic-engine.d.ts.map +1 -1
  5. package/dist/agent/engine/anthropic-engine.js +8 -0
  6. package/dist/agent/engine/anthropic-engine.js.map +1 -1
  7. package/dist/agent/engine/builder-engine.d.ts +1 -1
  8. package/dist/agent/engine/builder-engine.d.ts.map +1 -1
  9. package/dist/agent/engine/builder-engine.js +9 -4
  10. package/dist/agent/engine/builder-engine.js.map +1 -1
  11. package/dist/agent/engine/translate-ai-sdk.d.ts.map +1 -1
  12. package/dist/agent/engine/translate-ai-sdk.js +31 -1
  13. package/dist/agent/engine/translate-ai-sdk.js.map +1 -1
  14. package/dist/agent/engine/translate-anthropic.d.ts.map +1 -1
  15. package/dist/agent/engine/translate-anthropic.js +16 -0
  16. package/dist/agent/engine/translate-anthropic.js.map +1 -1
  17. package/dist/agent/engine/types.d.ts +16 -1
  18. package/dist/agent/engine/types.d.ts.map +1 -1
  19. package/dist/agent/engine/types.js.map +1 -1
  20. package/dist/agent/production-agent.d.ts +4 -0
  21. package/dist/agent/production-agent.d.ts.map +1 -1
  22. package/dist/agent/production-agent.js +96 -4
  23. package/dist/agent/production-agent.js.map +1 -1
  24. package/dist/agent/types.d.ts +3 -0
  25. package/dist/agent/types.d.ts.map +1 -1
  26. package/dist/agent/types.js.map +1 -1
  27. package/dist/client/AgentPanel.d.ts.map +1 -1
  28. package/dist/client/AgentPanel.js +5 -5
  29. package/dist/client/AgentPanel.js.map +1 -1
  30. package/dist/client/AssistantChat.d.ts +5 -0
  31. package/dist/client/AssistantChat.d.ts.map +1 -1
  32. package/dist/client/AssistantChat.js +54 -2
  33. package/dist/client/AssistantChat.js.map +1 -1
  34. package/dist/client/MultiTabAssistantChat.d.ts.map +1 -1
  35. package/dist/client/MultiTabAssistantChat.js +33 -2
  36. package/dist/client/MultiTabAssistantChat.js.map +1 -1
  37. package/dist/client/NewWorkspaceAppFlow.d.ts.map +1 -1
  38. package/dist/client/NewWorkspaceAppFlow.js +15 -8
  39. package/dist/client/NewWorkspaceAppFlow.js.map +1 -1
  40. package/dist/client/agent-chat-adapter.d.ts +4 -0
  41. package/dist/client/agent-chat-adapter.d.ts.map +1 -1
  42. package/dist/client/agent-chat-adapter.js +5 -1
  43. package/dist/client/agent-chat-adapter.js.map +1 -1
  44. package/dist/client/composer/TiptapComposer.d.ts +6 -1
  45. package/dist/client/composer/TiptapComposer.d.ts.map +1 -1
  46. package/dist/client/composer/TiptapComposer.js +25 -17
  47. package/dist/client/composer/TiptapComposer.js.map +1 -1
  48. package/dist/client/composer/useVoiceDictation.d.ts +6 -5
  49. package/dist/client/composer/useVoiceDictation.d.ts.map +1 -1
  50. package/dist/client/composer/useVoiceDictation.js +54 -21
  51. package/dist/client/composer/useVoiceDictation.js.map +1 -1
  52. package/dist/client/notifications/NotificationsBell.d.ts.map +1 -1
  53. package/dist/client/notifications/NotificationsBell.js +28 -1
  54. package/dist/client/notifications/NotificationsBell.js.map +1 -1
  55. package/dist/client/org/OrgSwitcher.d.ts +3 -1
  56. package/dist/client/org/OrgSwitcher.d.ts.map +1 -1
  57. package/dist/client/org/OrgSwitcher.js +12 -7
  58. package/dist/client/org/OrgSwitcher.js.map +1 -1
  59. package/dist/client/settings/AutomationsSection.d.ts.map +1 -1
  60. package/dist/client/settings/AutomationsSection.js +2 -2
  61. package/dist/client/settings/AutomationsSection.js.map +1 -1
  62. package/dist/client/settings/VoiceTranscriptionSection.d.ts.map +1 -1
  63. package/dist/client/settings/VoiceTranscriptionSection.js +46 -15
  64. package/dist/client/settings/VoiceTranscriptionSection.js.map +1 -1
  65. package/dist/client/tools/ToolViewer.d.ts.map +1 -1
  66. package/dist/client/tools/ToolViewer.js +2 -2
  67. package/dist/client/tools/ToolViewer.js.map +1 -1
  68. package/dist/client/tools/ToolsListPage.d.ts.map +1 -1
  69. package/dist/client/tools/ToolsListPage.js +4 -4
  70. package/dist/client/tools/ToolsListPage.js.map +1 -1
  71. package/dist/client/tools/ToolsSidebarSection.d.ts.map +1 -1
  72. package/dist/client/tools/ToolsSidebarSection.js +2 -2
  73. package/dist/client/tools/ToolsSidebarSection.js.map +1 -1
  74. package/dist/client/transcription/use-live-transcription.d.ts +1 -0
  75. package/dist/client/transcription/use-live-transcription.d.ts.map +1 -1
  76. package/dist/client/transcription/use-live-transcription.js +41 -0
  77. package/dist/client/transcription/use-live-transcription.js.map +1 -1
  78. package/dist/integrations/adapters/email.js +81 -5
  79. package/dist/integrations/adapters/email.js.map +1 -1
  80. package/dist/integrations/plugin.d.ts.map +1 -1
  81. package/dist/integrations/plugin.js +2 -1
  82. package/dist/integrations/plugin.js.map +1 -1
  83. package/dist/integrations/types.d.ts +2 -0
  84. package/dist/integrations/types.d.ts.map +1 -1
  85. package/dist/integrations/types.js.map +1 -1
  86. package/dist/integrations/webhook-handler.js +12 -2
  87. package/dist/integrations/webhook-handler.js.map +1 -1
  88. package/dist/oauth-tokens/store.d.ts.map +1 -1
  89. package/dist/oauth-tokens/store.js +34 -16
  90. package/dist/oauth-tokens/store.js.map +1 -1
  91. package/dist/scripts/db/exec.d.ts.map +1 -1
  92. package/dist/scripts/db/exec.js +32 -23
  93. package/dist/scripts/db/exec.js.map +1 -1
  94. package/dist/scripts/db/patch.d.ts.map +1 -1
  95. package/dist/scripts/db/patch.js +48 -35
  96. package/dist/scripts/db/patch.js.map +1 -1
  97. package/dist/scripts/db/query.d.ts.map +1 -1
  98. package/dist/scripts/db/query.js +22 -13
  99. package/dist/scripts/db/query.js.map +1 -1
  100. package/dist/scripts/db/safety.d.ts +2 -0
  101. package/dist/scripts/db/safety.d.ts.map +1 -0
  102. package/dist/scripts/db/safety.js +67 -0
  103. package/dist/scripts/db/safety.js.map +1 -0
  104. package/dist/scripts/db/scoping.js +4 -4
  105. package/dist/scripts/db/scoping.js.map +1 -1
  106. package/dist/server/email-template.d.ts +5 -0
  107. package/dist/server/email-template.d.ts.map +1 -1
  108. package/dist/server/email-template.js +7 -4
  109. package/dist/server/email-template.js.map +1 -1
  110. package/dist/server/google-auth-plugin.d.ts.map +1 -1
  111. package/dist/server/google-auth-plugin.js +1 -8
  112. package/dist/server/google-auth-plugin.js.map +1 -1
  113. package/dist/server/index.d.ts +3 -2
  114. package/dist/server/index.d.ts.map +1 -1
  115. package/dist/server/index.js +3 -2
  116. package/dist/server/index.js.map +1 -1
  117. package/dist/server/onboarding-html.d.ts.map +1 -1
  118. package/dist/server/onboarding-html.js +3 -10
  119. package/dist/server/onboarding-html.js.map +1 -1
  120. package/dist/server/ssr-handler.d.ts.map +1 -1
  121. package/dist/server/ssr-handler.js +7 -2
  122. package/dist/server/ssr-handler.js.map +1 -1
  123. package/dist/server/transcribe-voice.d.ts +9 -9
  124. package/dist/server/transcribe-voice.d.ts.map +1 -1
  125. package/dist/server/transcribe-voice.js +405 -51
  126. package/dist/server/transcribe-voice.js.map +1 -1
  127. package/dist/server/voice-providers-status.d.ts.map +1 -1
  128. package/dist/server/voice-providers-status.js +13 -1
  129. package/dist/server/voice-providers-status.js.map +1 -1
  130. package/dist/settings/store.d.ts.map +1 -1
  131. package/dist/settings/store.js +14 -6
  132. package/dist/settings/store.js.map +1 -1
  133. package/dist/shared/reasoning-effort.d.ts +8 -0
  134. package/dist/shared/reasoning-effort.d.ts.map +1 -0
  135. package/dist/shared/reasoning-effort.js +94 -0
  136. package/dist/shared/reasoning-effort.js.map +1 -0
  137. package/dist/templates/default/public/favicon.svg +1 -13
  138. package/dist/templates/default/public/icon-180.svg +1 -13
  139. package/dist/templates/default/public/icon-192.svg +1 -13
  140. package/dist/templates/default/public/icon-512.svg +1 -13
  141. package/dist/templates/workspace-root/scripts/workspace-dev.ts +5 -38
  142. package/dist/transcription/builder-transcription.d.ts +2 -0
  143. package/dist/transcription/builder-transcription.d.ts.map +1 -1
  144. package/dist/transcription/builder-transcription.js +4 -0
  145. package/dist/transcription/builder-transcription.js.map +1 -1
  146. package/dist/vite/client.d.ts.map +1 -1
  147. package/dist/vite/client.js +1 -5
  148. package/dist/vite/client.js.map +1 -1
  149. package/docs/content/voice-input.md +14 -13
  150. package/package.json +1 -1
  151. package/src/templates/default/public/favicon.svg +1 -13
  152. package/src/templates/default/public/icon-180.svg +1 -13
  153. package/src/templates/default/public/icon-192.svg +1 -13
  154. package/src/templates/default/public/icon-512.svg +1 -13
  155. package/src/templates/workspace-root/scripts/workspace-dev.ts +5 -38
@@ -2,15 +2,17 @@
2
2
  * POST /_agent-native/transcribe-voice
3
3
  *
4
4
  * Receives an audio blob from the agent sidebar composer and forwards it to
5
- * OpenAI Whisper. Returns `{ text }` on success, `{ error }` on failure.
5
+ * the configured transcription provider. Returns `{ text }` on success,
6
+ * `{ error }` on failure.
6
7
  *
7
- * Key resolution order (mirrors `templates/clips/actions/request-transcript.ts`):
8
+ * Key resolution order for BYOK providers:
8
9
  * 1. User-scoped encrypted secret (`readAppSecret` — set via the sidebar
9
10
  * settings UI).
10
- * 2. `resolveCredential("OPENAI_API_KEY")` — env var + SQL settings store.
11
+ * 2. `resolveCredential("<PROVIDER>_API_KEY")` — env var + SQL settings
12
+ * store.
11
13
  *
12
- * If no key is configured, returns 400 with an error the composer UI can
13
- * surface (the client falls back to the browser Web Speech API).
14
+ * If no server provider is configured, returns 400 with an error the
15
+ * composer UI can surface (the client falls back to Web Speech when possible).
14
16
  *
15
17
  * This is a framework route rather than a `defineAction` because multipart
16
18
  * audio bodies aren't a clean fit for the action contract (actions are
@@ -23,18 +25,29 @@ import { getSession, DEV_MODE_USER_EMAIL } from "./auth.js";
23
25
  import { appStateGet } from "../application-state/store.js";
24
26
  import { resolveHasBuilderPrivateKey } from "./credential-provider.js";
25
27
  import { transcribeWithBuilder } from "../transcription/builder-transcription.js";
28
+ import { runWithRequestContext } from "./request-context.js";
29
+ import { getOrgContext } from "../org/context.js";
30
+ import { createBuilderEngine } from "../agent/engine/builder-engine.js";
26
31
  const WHISPER_URL = "https://api.openai.com/v1/audio/transcriptions";
27
32
  const GROQ_URL = "https://api.groq.com/openai/v1/audio/transcriptions";
33
+ const GROQ_CHAT_URL = "https://api.groq.com/openai/v1/chat/completions";
28
34
  const GROQ_MODEL = "whisper-large-v3-turbo";
35
+ const GROQ_CLEANUP_MODEL = "llama-3.3-70b-versatile";
29
36
  const OPENAI_MODEL = "whisper-1";
37
+ const OPENAI_CHAT_URL = "https://api.openai.com/v1/chat/completions";
38
+ const OPENAI_CLEANUP_MODEL = "gpt-5.4-mini";
30
39
  const MAX_AUDIO_BYTES = 25 * 1024 * 1024; // Whisper hard limit.
31
- // Gemini Flash Lite — fastest path when GEMINI_API_KEY is configured.
40
+ const MAX_TRANSCRIPT_CHARS = 40_000;
41
+ // Public Builder transcription model id. The Builder gateway maps this to
42
+ // Gemini 3.1 Flash-Lite.
43
+ const BUILDER_GEMINI_TRANSCRIPTION_MODEL = "gemini-3-1-flash-lite";
44
+ // Gemini Flash Lite BYOK path when GEMINI_API_KEY is configured.
32
45
  // Gemini accepts inline audio; we just give it the bytes and a "transcribe
33
46
  // this" prompt and it replies with text. 2.5x faster TTFT than 2.5 Flash
34
47
  // per Google's release notes, and noticeably snappier than the Whisper
35
48
  // round-trip even on a fast connection.
36
- // gemini-2.0-flash-lite is the stable GA Flash Lite as of April 2026.
37
- // (gemini-3.1-flash-lite-preview was never a real model ID — Gemini is on 2.x naming.)
49
+ // Keep the direct Google AI path on a stable public model id; Builder's
50
+ // managed provider above handles the newer Gemini 3.1 Flash-Lite preview.
38
51
  const GEMINI_MODEL = "gemini-2.0-flash-lite";
39
52
  const GEMINI_URL = `https://generativelanguage.googleapis.com/v1beta/models/${GEMINI_MODEL}:generateContent`;
40
53
  /**
@@ -96,11 +109,15 @@ export function createTranscribeVoiceHandler() {
96
109
  }
97
110
  const parts = await readMultipartFormData(event).catch(() => null);
98
111
  const audio = parts?.find((p) => p.name === "audio");
99
- if (!audio?.data?.length) {
112
+ const textPart = parts?.find((p) => p.name === "text");
113
+ const transcriptText = textPart?.data
114
+ ? sanitizeTranscriptText(textPart.data.toString("utf8"))
115
+ : undefined;
116
+ if (!audio?.data?.length && !transcriptText) {
100
117
  setResponseStatus(event, 400);
101
- return { error: "Missing audio payload" };
118
+ return { error: "Missing audio or transcript payload" };
102
119
  }
103
- if (audio.data.length > MAX_AUDIO_BYTES) {
120
+ if (audio?.data?.length && audio.data.length > MAX_AUDIO_BYTES) {
104
121
  setResponseStatus(event, 413);
105
122
  return { error: "Audio too large (max 25 MB)" };
106
123
  }
@@ -108,6 +125,10 @@ export function createTranscribeVoiceHandler() {
108
125
  const language = languagePart?.data
109
126
  ? languagePart.data.toString("utf8").trim().slice(0, 8)
110
127
  : undefined;
128
+ const instructionsPart = parts?.find((p) => p.name === "instructions");
129
+ const instructions = instructionsPart?.data
130
+ ? sanitizeInstructions(instructionsPart.data.toString("utf8"))
131
+ : undefined;
111
132
  // Resolve provider preference. Per-request "provider" form field takes
112
133
  // precedence (the desktop client sends it on every dictation press),
113
134
  // falling back to the user's stored `voice-transcription-prefs` app
@@ -118,6 +139,18 @@ export function createTranscribeVoiceHandler() {
118
139
  setResponseStatus(event, 401);
119
140
  return { error: "Authentication required" };
120
141
  }
142
+ const orgCtx = session?.email
143
+ ? await getOrgContext(event).catch(() => null)
144
+ : null;
145
+ const requestContext = {
146
+ userEmail: session?.email,
147
+ orgId: orgCtx?.orgId ?? undefined,
148
+ };
149
+ const withRequestContext = async (fn) => requestContext.userEmail
150
+ ? runWithRequestContext(requestContext, fn)
151
+ : fn();
152
+ const hasBuilderPrivateKey = async () => withRequestContext(() => resolveHasBuilderPrivateKey());
153
+ const transcribeWithBuilderForRequest = (opts) => withRequestContext(() => transcribeWithBuilder(opts));
121
154
  const sessionId = session?.email === DEV_MODE_USER_EMAIL
122
155
  ? "local"
123
156
  : (session?.email ?? "local");
@@ -137,6 +170,7 @@ export function createTranscribeVoiceHandler() {
137
170
  if (v === "auto" ||
138
171
  v === "browser" ||
139
172
  v === "builder" ||
173
+ v === "builder-gemini" ||
140
174
  v === "gemini" ||
141
175
  v === "openai" ||
142
176
  v === "groq") {
@@ -165,9 +199,6 @@ export function createTranscribeVoiceHandler() {
165
199
  error: 'Voice provider is set to "browser" (Web Speech API only). Change the preference in Settings → Voice Transcription to use a server-side provider.',
166
200
  };
167
201
  }
168
- const mime = audio.type || "audio/webm";
169
- const audioBytes = new Uint8Array(audio.data.buffer, audio.data.byteOffset, audio.data.byteLength);
170
- let builderError = null;
171
202
  // Per-user-or-fallback API key resolution. Hoisted up so the Gemini
172
203
  // path below can use it without duplicating logic.
173
204
  async function resolveApiKey(key) {
@@ -181,6 +212,24 @@ export function createTranscribeVoiceHandler() {
181
212
  }).catch(() => null);
182
213
  return (userSecret?.value || (await resolveCredential(key, ctx)) || undefined);
183
214
  }
215
+ if (transcriptText) {
216
+ return await cleanupTranscriptText({
217
+ event,
218
+ text: transcriptText,
219
+ instructions,
220
+ providerPref,
221
+ hasBuilderPrivateKey,
222
+ withRequestContext,
223
+ resolveApiKey,
224
+ });
225
+ }
226
+ if (!audio?.data?.length) {
227
+ setResponseStatus(event, 400);
228
+ return { error: "Missing audio payload" };
229
+ }
230
+ const mime = audio.type || "audio/webm";
231
+ const audioBytes = new Uint8Array(audio.data.buffer, audio.data.byteOffset, audio.data.byteLength);
232
+ let builderError = null;
184
233
  // ── Strict per-provider preferences ─────────────────────────────────
185
234
  // When the user explicitly picks a single provider (gemini / builder /
186
235
  // groq), we only try that provider and surface its error rather than
@@ -201,6 +250,7 @@ export function createTranscribeVoiceHandler() {
201
250
  mimeType: mime,
202
251
  apiKey: geminiKey,
203
252
  language: language || undefined,
253
+ instructions,
204
254
  });
205
255
  const trimmed = text.trim();
206
256
  if (!trimmed) {
@@ -216,18 +266,25 @@ export function createTranscribeVoiceHandler() {
216
266
  };
217
267
  }
218
268
  }
219
- if (providerPref === "builder") {
220
- if (!(await resolveHasBuilderPrivateKey())) {
269
+ if (providerPref === "builder" || providerPref === "builder-gemini") {
270
+ const label = providerPref === "builder-gemini"
271
+ ? "Builder Gemini Flash-Lite"
272
+ : "Builder";
273
+ if (!(await hasBuilderPrivateKey())) {
221
274
  setResponseStatus(event, 400);
222
275
  return {
223
- error: "Builder is selected but is not connected. Connect Builder.io in Settings, or change the provider preference.",
276
+ error: `${label} is selected but Builder.io is not connected. Connect Builder.io in Settings, or change the provider preference.`,
224
277
  };
225
278
  }
226
279
  try {
227
- const result = await transcribeWithBuilder({
280
+ const result = await transcribeWithBuilderForRequest({
228
281
  audioBytes,
229
282
  mimeType: mime,
283
+ model: providerPref === "builder-gemini"
284
+ ? BUILDER_GEMINI_TRANSCRIPTION_MODEL
285
+ : undefined,
230
286
  language: language || undefined,
287
+ instructions,
231
288
  });
232
289
  return { text: (result.text ?? "").trim() };
233
290
  }
@@ -238,7 +295,7 @@ export function createTranscribeVoiceHandler() {
238
295
  return { error: message };
239
296
  }
240
297
  setResponseStatus(event, 502);
241
- return { error: `Builder transcription failed: ${message}` };
298
+ return { error: `${label} transcription failed: ${message}` };
242
299
  }
243
300
  }
244
301
  if (providerPref === "groq") {
@@ -260,13 +317,38 @@ export function createTranscribeVoiceHandler() {
260
317
  audioBytes,
261
318
  mime,
262
319
  language,
320
+ instructions,
263
321
  });
264
322
  }
265
323
  // ── Auto / undefined / openai fallback chain ────────────────────────
266
- // ── Gemini Flash Lite path (fastest) ────────────────────────────────
267
- // First-priority when a Gemini key is configured. The provider-pref
268
- // "openai" still forces Whisper; otherwise we try Gemini before
269
- // Builder / Groq / OpenAI Whisper because it's reliably faster.
324
+ // ── Builder Gemini Flash-Lite path ─────────────────────────────────
325
+ // First-priority in auto mode when Builder is connected. This lets users
326
+ // try Gemini 3.1 Flash-Lite without bringing their own Google key.
327
+ if (providerPref !== "openai" && (await hasBuilderPrivateKey())) {
328
+ try {
329
+ const result = await transcribeWithBuilderForRequest({
330
+ audioBytes,
331
+ mimeType: mime,
332
+ model: BUILDER_GEMINI_TRANSCRIPTION_MODEL,
333
+ language: language || undefined,
334
+ instructions,
335
+ });
336
+ return { text: (result.text ?? "").trim() };
337
+ }
338
+ catch (err) {
339
+ const message = err?.message ?? String(err);
340
+ // Surface 402 (credits exhausted) as a 402 so the client can show
341
+ // a specific upgrade prompt.
342
+ if (message.includes("credits exhausted")) {
343
+ setResponseStatus(event, 402);
344
+ return { error: message };
345
+ }
346
+ builderError = message;
347
+ }
348
+ }
349
+ // ── Gemini Flash Lite BYOK path ────────────────────────────────────
350
+ // If Builder is unavailable, try a user-provided Gemini key before
351
+ // Whisper-compatible providers.
270
352
  if (providerPref !== "openai") {
271
353
  const geminiKey = await resolveApiKey("GEMINI_API_KEY");
272
354
  if (geminiKey) {
@@ -276,6 +358,7 @@ export function createTranscribeVoiceHandler() {
276
358
  mimeType: mime,
277
359
  apiKey: geminiKey,
278
360
  language: language || undefined,
361
+ instructions,
279
362
  });
280
363
  const trimmed = text.trim();
281
364
  if (trimmed) {
@@ -289,27 +372,6 @@ export function createTranscribeVoiceHandler() {
289
372
  }
290
373
  }
291
374
  }
292
- // ── Builder proxy path ──────────────────────────────────────────────
293
- if (providerPref !== "openai" && (await resolveHasBuilderPrivateKey())) {
294
- try {
295
- const result = await transcribeWithBuilder({
296
- audioBytes,
297
- mimeType: mime,
298
- language: language || undefined,
299
- });
300
- return { text: (result.text ?? "").trim() };
301
- }
302
- catch (err) {
303
- const message = err?.message ?? String(err);
304
- // Surface 402 (credits exhausted) as a 402 so the client can show
305
- // a specific upgrade prompt.
306
- if (message.includes("credits exhausted")) {
307
- setResponseStatus(event, 402);
308
- return { error: message };
309
- }
310
- builderError = message;
311
- }
312
- }
313
375
  // If Builder is unavailable, fall through to BYOK providers rather than
314
376
  // hard-failing. This mirrors Clips' batch transcription path.
315
377
  // ── Groq / OpenAI Whisper-compatible path ──────────────────────────
@@ -341,8 +403,8 @@ export function createTranscribeVoiceHandler() {
341
403
  setResponseStatus(event, builderError ? 502 : 400);
342
404
  return {
343
405
  error: builderError
344
- ? `Builder transcription failed: ${builderError}. Add GROQ_API_KEY or OPENAI_API_KEY in Settings → API Keys to enable a fallback provider.`
345
- : "No voice transcription provider configured. Connect Builder.io or add GROQ_API_KEY / OPENAI_API_KEY in Settings → API Keys.",
406
+ ? `Builder transcription failed: ${builderError}. Add GEMINI_API_KEY, GROQ_API_KEY, or OPENAI_API_KEY in Settings → API Keys to enable a fallback provider.`
407
+ : "No voice transcription provider configured. Connect Builder.io or add GEMINI_API_KEY / GROQ_API_KEY / OPENAI_API_KEY in Settings → API Keys.",
346
408
  };
347
409
  }
348
410
  return await callWhisperCompat({
@@ -351,6 +413,7 @@ export function createTranscribeVoiceHandler() {
351
413
  audioBytes,
352
414
  mime,
353
415
  language,
416
+ instructions,
354
417
  });
355
418
  });
356
419
  }
@@ -361,7 +424,7 @@ export function createTranscribeVoiceHandler() {
361
424
  * strict-Groq preference path and the auto fallback chain share one
362
425
  * implementation.
363
426
  */
364
- async function callWhisperCompat({ event, provider, audioBytes, mime, language, }) {
427
+ async function callWhisperCompat({ event, provider, audioBytes, mime, language, instructions, }) {
365
428
  const ext = pickExtension(mime);
366
429
  const filename = `composer-voice.${ext}`;
367
430
  const form = new FormData();
@@ -370,6 +433,8 @@ async function callWhisperCompat({ event, provider, audioBytes, mime, language,
370
433
  form.append("response_format", "json");
371
434
  if (language)
372
435
  form.append("language", language);
436
+ if (instructions)
437
+ form.append("prompt", instructions);
373
438
  const controller = new AbortController();
374
439
  const timeout = setTimeout(() => controller.abort(), 45_000);
375
440
  try {
@@ -403,6 +468,252 @@ async function callWhisperCompat({ event, provider, audioBytes, mime, language,
403
468
  clearTimeout(timeout);
404
469
  }
405
470
  }
471
+ async function cleanupTranscriptText({ event, text, instructions, providerPref, hasBuilderPrivateKey, withRequestContext, resolveApiKey, }) {
472
+ const original = text.trim();
473
+ if (!original)
474
+ return { text: "" };
475
+ if (providerPref === "browser") {
476
+ return { text: original };
477
+ }
478
+ if (providerPref === "builder" || providerPref === "builder-gemini") {
479
+ if (!(await hasBuilderPrivateKey())) {
480
+ setResponseStatus(event, 400);
481
+ return {
482
+ error: "Builder.io cleanup is selected but Builder.io is not connected. Connect Builder.io in Settings, or change the provider preference.",
483
+ };
484
+ }
485
+ try {
486
+ const cleaned = await withRequestContext(() => cleanupWithBuilder({ text: original, instructions }));
487
+ return { text: cleaned || original };
488
+ }
489
+ catch (err) {
490
+ setResponseStatus(event, 502);
491
+ return {
492
+ error: `Builder.io cleanup failed: ${err?.message ?? String(err)}`,
493
+ };
494
+ }
495
+ }
496
+ if (providerPref === "gemini") {
497
+ const geminiKey = await resolveApiKey("GEMINI_API_KEY");
498
+ if (!geminiKey) {
499
+ setResponseStatus(event, 400);
500
+ return {
501
+ error: "Gemini cleanup is selected but GEMINI_API_KEY is not configured.",
502
+ };
503
+ }
504
+ try {
505
+ const cleaned = await cleanupWithGemini({
506
+ text: original,
507
+ apiKey: geminiKey,
508
+ instructions,
509
+ });
510
+ return { text: cleaned || original };
511
+ }
512
+ catch (err) {
513
+ setResponseStatus(event, 502);
514
+ return {
515
+ error: `Gemini cleanup failed: ${err?.message ?? String(err)}`,
516
+ };
517
+ }
518
+ }
519
+ if (providerPref === "openai" || providerPref === "groq") {
520
+ const keyName = providerPref === "openai" ? "OPENAI_API_KEY" : "GROQ_API_KEY";
521
+ const apiKey = await resolveApiKey(keyName);
522
+ if (!apiKey) {
523
+ setResponseStatus(event, 400);
524
+ return {
525
+ error: `${providerPref} cleanup is selected but ${keyName} is not configured.`,
526
+ };
527
+ }
528
+ try {
529
+ const cleaned = await cleanupWithChatProvider({
530
+ provider: providerPref,
531
+ text: original,
532
+ apiKey,
533
+ instructions,
534
+ });
535
+ return { text: cleaned || original };
536
+ }
537
+ catch (err) {
538
+ setResponseStatus(event, 502);
539
+ return {
540
+ error: `${providerPref} cleanup failed: ${err?.message ?? String(err)}`,
541
+ };
542
+ }
543
+ }
544
+ if (await hasBuilderPrivateKey()) {
545
+ try {
546
+ const cleaned = await withRequestContext(() => cleanupWithBuilder({ text: original, instructions }));
547
+ if (cleaned)
548
+ return { text: cleaned };
549
+ }
550
+ catch {
551
+ // Fall through to BYOK providers, then raw text.
552
+ }
553
+ }
554
+ const geminiKey = await resolveApiKey("GEMINI_API_KEY");
555
+ if (geminiKey) {
556
+ try {
557
+ const cleaned = await cleanupWithGemini({
558
+ text: original,
559
+ apiKey: geminiKey,
560
+ instructions,
561
+ });
562
+ if (cleaned)
563
+ return { text: cleaned };
564
+ }
565
+ catch {
566
+ // Fall through.
567
+ }
568
+ }
569
+ const groqKey = await resolveApiKey("GROQ_API_KEY");
570
+ if (groqKey) {
571
+ try {
572
+ const cleaned = await cleanupWithChatProvider({
573
+ provider: "groq",
574
+ text: original,
575
+ apiKey: groqKey,
576
+ instructions,
577
+ });
578
+ if (cleaned)
579
+ return { text: cleaned };
580
+ }
581
+ catch {
582
+ // Fall through.
583
+ }
584
+ }
585
+ const openaiKey = await resolveApiKey("OPENAI_API_KEY");
586
+ if (openaiKey) {
587
+ try {
588
+ const cleaned = await cleanupWithChatProvider({
589
+ provider: "openai",
590
+ text: original,
591
+ apiKey: openaiKey,
592
+ instructions,
593
+ });
594
+ if (cleaned)
595
+ return { text: cleaned };
596
+ }
597
+ catch {
598
+ // Fall through.
599
+ }
600
+ }
601
+ return { text: original };
602
+ }
603
+ async function cleanupWithBuilder({ text, instructions, }) {
604
+ const engine = createBuilderEngine();
605
+ const controller = new AbortController();
606
+ const timeout = setTimeout(() => controller.abort(), 8_000);
607
+ let streamedText = "";
608
+ let finalText = "";
609
+ let terminalError;
610
+ try {
611
+ for await (const event of engine.stream({
612
+ model: BUILDER_GEMINI_TRANSCRIPTION_MODEL,
613
+ systemPrompt: buildCleanupSystemPrompt(instructions),
614
+ messages: [
615
+ {
616
+ role: "user",
617
+ content: [{ type: "text", text: buildCleanupUserPrompt(text) }],
618
+ },
619
+ ],
620
+ tools: [],
621
+ abortSignal: controller.signal,
622
+ maxOutputTokens: Math.min(4096, Math.max(512, text.length * 2)),
623
+ temperature: 0,
624
+ })) {
625
+ if (event.type === "text-delta")
626
+ streamedText += event.text;
627
+ if (event.type === "assistant-content") {
628
+ finalText = event.parts
629
+ .filter((part) => part.type === "text")
630
+ .map((part) => part.text)
631
+ .join("")
632
+ .trim();
633
+ }
634
+ if (event.type === "stop" && event.reason === "error") {
635
+ terminalError = event.error ?? "Builder gateway returned an error";
636
+ }
637
+ }
638
+ }
639
+ finally {
640
+ clearTimeout(timeout);
641
+ }
642
+ if (terminalError)
643
+ throw new Error(terminalError);
644
+ return stripTranscriptEnvelope(finalText || streamedText);
645
+ }
646
+ async function cleanupWithGemini({ text, apiKey, instructions, }) {
647
+ const controller = new AbortController();
648
+ const timeout = setTimeout(() => controller.abort(), 8_000);
649
+ try {
650
+ const res = await fetch(GEMINI_URL, {
651
+ method: "POST",
652
+ headers: {
653
+ "Content-Type": "application/json",
654
+ "x-goog-api-key": apiKey,
655
+ },
656
+ body: JSON.stringify({
657
+ contents: [
658
+ {
659
+ parts: [
660
+ { text: buildCleanupSystemPrompt(instructions) },
661
+ { text: buildCleanupUserPrompt(text) },
662
+ ],
663
+ },
664
+ ],
665
+ generationConfig: { temperature: 0 },
666
+ }),
667
+ signal: controller.signal,
668
+ });
669
+ if (!res.ok) {
670
+ const body = await res.text().catch(() => "");
671
+ throw new Error(`Gemini ${res.status}: ${body.slice(0, 300)}`);
672
+ }
673
+ const data = (await res.json());
674
+ const cleaned = data.candidates?.[0]?.content?.parts
675
+ ?.map((p) => p.text ?? "")
676
+ .join("")
677
+ .trim();
678
+ return stripTranscriptEnvelope(cleaned ?? "");
679
+ }
680
+ finally {
681
+ clearTimeout(timeout);
682
+ }
683
+ }
684
+ async function cleanupWithChatProvider({ provider, text, apiKey, instructions, }) {
685
+ const controller = new AbortController();
686
+ const timeout = setTimeout(() => controller.abort(), 8_000);
687
+ const endpoint = provider === "openai" ? OPENAI_CHAT_URL : GROQ_CHAT_URL;
688
+ const model = provider === "openai" ? OPENAI_CLEANUP_MODEL : GROQ_CLEANUP_MODEL;
689
+ try {
690
+ const res = await fetch(endpoint, {
691
+ method: "POST",
692
+ headers: {
693
+ Authorization: `Bearer ${apiKey}`,
694
+ "Content-Type": "application/json",
695
+ },
696
+ body: JSON.stringify({
697
+ model,
698
+ messages: [
699
+ { role: "system", content: buildCleanupSystemPrompt(instructions) },
700
+ { role: "user", content: buildCleanupUserPrompt(text) },
701
+ ],
702
+ temperature: 0,
703
+ }),
704
+ signal: controller.signal,
705
+ });
706
+ if (!res.ok) {
707
+ const body = await res.text().catch(() => "");
708
+ throw new Error(`${provider} ${res.status}: ${body.slice(0, 300)}`);
709
+ }
710
+ const data = (await res.json());
711
+ return stripTranscriptEnvelope(data.choices?.[0]?.message?.content?.trim() ?? "");
712
+ }
713
+ finally {
714
+ clearTimeout(timeout);
715
+ }
716
+ }
406
717
  function pickExtension(mime) {
407
718
  const lower = mime.toLowerCase();
408
719
  if (lower.includes("mp4") || lower.includes("m4a"))
@@ -415,6 +726,51 @@ function pickExtension(mime) {
415
726
  return "wav";
416
727
  return "webm";
417
728
  }
729
+ function sanitizeInstructions(value) {
730
+ const trimmed = value.replace(/\0/g, "").trim();
731
+ if (!trimmed)
732
+ return undefined;
733
+ return trimmed.slice(0, 3000);
734
+ }
735
+ function sanitizeTranscriptText(value) {
736
+ const trimmed = value.replace(/\0/g, "").trim();
737
+ if (!trimmed)
738
+ return undefined;
739
+ return trimmed.slice(0, MAX_TRANSCRIPT_CHARS);
740
+ }
741
+ function buildCleanupSystemPrompt(instructions) {
742
+ const custom = instructions
743
+ ? `\n\nUser's custom cleanup instructions:\n${instructions}`
744
+ : "";
745
+ return `You clean up live speech-recognition transcripts before paste.
746
+
747
+ Rules:
748
+ - Preserve the speaker's meaning and voice.
749
+ - Fix obvious recognition mistakes, punctuation, capitalization, spacing, and casing.
750
+ - Remove false starts and filler only when they are clearly not intentional.
751
+ - Do not add facts, explanations, headings, bullets, quotes, or markdown.
752
+ - Output only the cleaned transcript text.${custom}`;
753
+ }
754
+ function buildCleanupUserPrompt(text) {
755
+ return `Clean up this transcript and return only the final text:\n\n<transcript>\n${text}\n</transcript>`;
756
+ }
757
+ function stripTranscriptEnvelope(value) {
758
+ return value
759
+ .trim()
760
+ .replace(/^```(?:text)?\s*/i, "")
761
+ .replace(/\s*```$/i, "")
762
+ .replace(/^["“](.*)["”]$/s, "$1")
763
+ .trim();
764
+ }
765
+ function buildGeminiTranscriptionPrompt({ language, instructions, }) {
766
+ const base = language
767
+ ? `Transcribe the speech in this audio (language: ${language}).`
768
+ : "Transcribe the speech in this audio.";
769
+ const custom = instructions
770
+ ? `\n\nAdditional user instructions for transcription cleanup:\n${instructions}\n\nApply these only to formatting, casing, punctuation, vocabulary, and cleanup. Do not add content that is not present in the audio.`
771
+ : "";
772
+ return `${base} Output only the transcript text — no preamble, no quotes, no formatting.${custom}`;
773
+ }
418
774
  /**
419
775
  * Transcribe audio via Gemini Flash Lite.
420
776
  *
@@ -426,11 +782,9 @@ function pickExtension(mime) {
426
782
  * FLAC — webm/opus is not officially supported but in practice it
427
783
  * accepts webm too. If Gemini rejects it the caller falls through.
428
784
  */
429
- async function transcribeWithGemini({ audioBytes, mimeType, apiKey, language, }) {
785
+ async function transcribeWithGemini({ audioBytes, mimeType, apiKey, language, instructions, }) {
430
786
  const base64 = uint8ArrayToBase64(audioBytes);
431
- const prompt = language
432
- ? `Transcribe the speech in this audio (language: ${language}). Output only the transcript text — no preamble, no quotes, no formatting.`
433
- : "Transcribe the speech in this audio. Output only the transcript text — no preamble, no quotes, no formatting.";
787
+ const prompt = buildGeminiTranscriptionPrompt({ language, instructions });
434
788
  const controller = new AbortController();
435
789
  const timeout = setTimeout(() => controller.abort(), 30_000);
436
790
  try {