@agent-native/core 0.7.21 → 0.7.22
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agent/engine/ai-sdk-engine.d.ts.map +1 -1
- package/dist/agent/engine/ai-sdk-engine.js +43 -1
- package/dist/agent/engine/ai-sdk-engine.js.map +1 -1
- package/dist/agent/engine/anthropic-engine.d.ts.map +1 -1
- package/dist/agent/engine/anthropic-engine.js +8 -0
- package/dist/agent/engine/anthropic-engine.js.map +1 -1
- package/dist/agent/engine/builder-engine.d.ts +1 -1
- package/dist/agent/engine/builder-engine.d.ts.map +1 -1
- package/dist/agent/engine/builder-engine.js +9 -4
- package/dist/agent/engine/builder-engine.js.map +1 -1
- package/dist/agent/engine/translate-ai-sdk.d.ts.map +1 -1
- package/dist/agent/engine/translate-ai-sdk.js +31 -1
- package/dist/agent/engine/translate-ai-sdk.js.map +1 -1
- package/dist/agent/engine/translate-anthropic.d.ts.map +1 -1
- package/dist/agent/engine/translate-anthropic.js +16 -0
- package/dist/agent/engine/translate-anthropic.js.map +1 -1
- package/dist/agent/engine/types.d.ts +16 -1
- package/dist/agent/engine/types.d.ts.map +1 -1
- package/dist/agent/engine/types.js.map +1 -1
- package/dist/agent/production-agent.d.ts +4 -0
- package/dist/agent/production-agent.d.ts.map +1 -1
- package/dist/agent/production-agent.js +96 -4
- package/dist/agent/production-agent.js.map +1 -1
- package/dist/agent/types.d.ts +3 -0
- package/dist/agent/types.d.ts.map +1 -1
- package/dist/agent/types.js.map +1 -1
- package/dist/client/AgentPanel.d.ts.map +1 -1
- package/dist/client/AgentPanel.js +5 -5
- package/dist/client/AgentPanel.js.map +1 -1
- package/dist/client/AssistantChat.d.ts +5 -0
- package/dist/client/AssistantChat.d.ts.map +1 -1
- package/dist/client/AssistantChat.js +54 -2
- package/dist/client/AssistantChat.js.map +1 -1
- package/dist/client/MultiTabAssistantChat.d.ts.map +1 -1
- package/dist/client/MultiTabAssistantChat.js +33 -2
- package/dist/client/MultiTabAssistantChat.js.map +1 -1
- package/dist/client/NewWorkspaceAppFlow.d.ts.map +1 -1
- package/dist/client/NewWorkspaceAppFlow.js +10 -1
- package/dist/client/NewWorkspaceAppFlow.js.map +1 -1
- package/dist/client/agent-chat-adapter.d.ts +4 -0
- package/dist/client/agent-chat-adapter.d.ts.map +1 -1
- package/dist/client/agent-chat-adapter.js +5 -1
- package/dist/client/agent-chat-adapter.js.map +1 -1
- package/dist/client/composer/TiptapComposer.d.ts +6 -1
- package/dist/client/composer/TiptapComposer.d.ts.map +1 -1
- package/dist/client/composer/TiptapComposer.js +25 -17
- package/dist/client/composer/TiptapComposer.js.map +1 -1
- package/dist/client/composer/useVoiceDictation.d.ts +6 -5
- package/dist/client/composer/useVoiceDictation.d.ts.map +1 -1
- package/dist/client/composer/useVoiceDictation.js +54 -21
- package/dist/client/composer/useVoiceDictation.js.map +1 -1
- package/dist/client/org/OrgSwitcher.d.ts +3 -1
- package/dist/client/org/OrgSwitcher.d.ts.map +1 -1
- package/dist/client/org/OrgSwitcher.js +12 -7
- package/dist/client/org/OrgSwitcher.js.map +1 -1
- package/dist/client/settings/AutomationsSection.d.ts.map +1 -1
- package/dist/client/settings/AutomationsSection.js +2 -2
- package/dist/client/settings/AutomationsSection.js.map +1 -1
- package/dist/client/settings/VoiceTranscriptionSection.d.ts.map +1 -1
- package/dist/client/settings/VoiceTranscriptionSection.js +46 -15
- package/dist/client/settings/VoiceTranscriptionSection.js.map +1 -1
- package/dist/client/tools/ToolViewer.d.ts.map +1 -1
- package/dist/client/tools/ToolViewer.js +2 -2
- package/dist/client/tools/ToolViewer.js.map +1 -1
- package/dist/client/tools/ToolsListPage.d.ts.map +1 -1
- package/dist/client/tools/ToolsListPage.js +4 -4
- package/dist/client/tools/ToolsListPage.js.map +1 -1
- package/dist/client/tools/ToolsSidebarSection.d.ts.map +1 -1
- package/dist/client/tools/ToolsSidebarSection.js +2 -2
- package/dist/client/tools/ToolsSidebarSection.js.map +1 -1
- package/dist/client/transcription/use-live-transcription.d.ts +1 -0
- package/dist/client/transcription/use-live-transcription.d.ts.map +1 -1
- package/dist/client/transcription/use-live-transcription.js +41 -0
- package/dist/client/transcription/use-live-transcription.js.map +1 -1
- package/dist/integrations/adapters/email.js +81 -5
- package/dist/integrations/adapters/email.js.map +1 -1
- package/dist/integrations/plugin.d.ts.map +1 -1
- package/dist/integrations/plugin.js +2 -1
- package/dist/integrations/plugin.js.map +1 -1
- package/dist/integrations/types.d.ts +2 -0
- package/dist/integrations/types.d.ts.map +1 -1
- package/dist/integrations/types.js.map +1 -1
- package/dist/integrations/webhook-handler.js +12 -2
- package/dist/integrations/webhook-handler.js.map +1 -1
- package/dist/oauth-tokens/store.d.ts.map +1 -1
- package/dist/oauth-tokens/store.js +34 -16
- package/dist/oauth-tokens/store.js.map +1 -1
- package/dist/scripts/db/exec.d.ts.map +1 -1
- package/dist/scripts/db/exec.js +32 -23
- package/dist/scripts/db/exec.js.map +1 -1
- package/dist/scripts/db/patch.d.ts.map +1 -1
- package/dist/scripts/db/patch.js +48 -35
- package/dist/scripts/db/patch.js.map +1 -1
- package/dist/scripts/db/query.d.ts.map +1 -1
- package/dist/scripts/db/query.js +22 -13
- package/dist/scripts/db/query.js.map +1 -1
- package/dist/scripts/db/safety.d.ts +2 -0
- package/dist/scripts/db/safety.d.ts.map +1 -0
- package/dist/scripts/db/safety.js +67 -0
- package/dist/scripts/db/safety.js.map +1 -0
- package/dist/scripts/db/scoping.js +4 -4
- package/dist/scripts/db/scoping.js.map +1 -1
- package/dist/server/email-template.d.ts +5 -0
- package/dist/server/email-template.d.ts.map +1 -1
- package/dist/server/email-template.js +7 -4
- package/dist/server/email-template.js.map +1 -1
- package/dist/server/google-auth-plugin.d.ts.map +1 -1
- package/dist/server/google-auth-plugin.js +1 -8
- package/dist/server/google-auth-plugin.js.map +1 -1
- package/dist/server/index.d.ts +3 -2
- package/dist/server/index.d.ts.map +1 -1
- package/dist/server/index.js +3 -2
- package/dist/server/index.js.map +1 -1
- package/dist/server/onboarding-html.d.ts.map +1 -1
- package/dist/server/onboarding-html.js +3 -10
- package/dist/server/onboarding-html.js.map +1 -1
- package/dist/server/transcribe-voice.d.ts +9 -9
- package/dist/server/transcribe-voice.d.ts.map +1 -1
- package/dist/server/transcribe-voice.js +405 -51
- package/dist/server/transcribe-voice.js.map +1 -1
- package/dist/server/voice-providers-status.d.ts.map +1 -1
- package/dist/server/voice-providers-status.js +13 -1
- package/dist/server/voice-providers-status.js.map +1 -1
- package/dist/settings/store.d.ts.map +1 -1
- package/dist/settings/store.js +14 -6
- package/dist/settings/store.js.map +1 -1
- package/dist/shared/reasoning-effort.d.ts +8 -0
- package/dist/shared/reasoning-effort.d.ts.map +1 -0
- package/dist/shared/reasoning-effort.js +94 -0
- package/dist/shared/reasoning-effort.js.map +1 -0
- package/dist/templates/default/public/favicon.svg +1 -13
- package/dist/templates/default/public/icon-180.svg +1 -13
- package/dist/templates/default/public/icon-192.svg +1 -13
- package/dist/templates/default/public/icon-512.svg +1 -13
- package/dist/templates/workspace-root/scripts/workspace-dev.ts +2 -0
- package/dist/transcription/builder-transcription.d.ts +2 -0
- package/dist/transcription/builder-transcription.d.ts.map +1 -1
- package/dist/transcription/builder-transcription.js +4 -0
- package/dist/transcription/builder-transcription.js.map +1 -1
- package/docs/content/voice-input.md +14 -13
- package/package.json +1 -1
- package/src/templates/default/public/favicon.svg +1 -13
- package/src/templates/default/public/icon-180.svg +1 -13
- package/src/templates/default/public/icon-192.svg +1 -13
- package/src/templates/default/public/icon-512.svg +1 -13
- package/src/templates/workspace-root/scripts/workspace-dev.ts +2 -0
|
@@ -2,15 +2,17 @@
|
|
|
2
2
|
* POST /_agent-native/transcribe-voice
|
|
3
3
|
*
|
|
4
4
|
* Receives an audio blob from the agent sidebar composer and forwards it to
|
|
5
|
-
*
|
|
5
|
+
* the configured transcription provider. Returns `{ text }` on success,
|
|
6
|
+
* `{ error }` on failure.
|
|
6
7
|
*
|
|
7
|
-
* Key resolution order
|
|
8
|
+
* Key resolution order for BYOK providers:
|
|
8
9
|
* 1. User-scoped encrypted secret (`readAppSecret` — set via the sidebar
|
|
9
10
|
* settings UI).
|
|
10
|
-
* 2. `resolveCredential("
|
|
11
|
+
* 2. `resolveCredential("<PROVIDER>_API_KEY")` — env var + SQL settings
|
|
12
|
+
* store.
|
|
11
13
|
*
|
|
12
|
-
* If no
|
|
13
|
-
* surface (the client falls back to
|
|
14
|
+
* If no server provider is configured, returns 400 with an error the
|
|
15
|
+
* composer UI can surface (the client falls back to Web Speech when possible).
|
|
14
16
|
*
|
|
15
17
|
* This is a framework route rather than a `defineAction` because multipart
|
|
16
18
|
* audio bodies aren't a clean fit for the action contract (actions are
|
|
@@ -23,18 +25,29 @@ import { getSession, DEV_MODE_USER_EMAIL } from "./auth.js";
|
|
|
23
25
|
import { appStateGet } from "../application-state/store.js";
|
|
24
26
|
import { resolveHasBuilderPrivateKey } from "./credential-provider.js";
|
|
25
27
|
import { transcribeWithBuilder } from "../transcription/builder-transcription.js";
|
|
28
|
+
import { runWithRequestContext } from "./request-context.js";
|
|
29
|
+
import { getOrgContext } from "../org/context.js";
|
|
30
|
+
import { createBuilderEngine } from "../agent/engine/builder-engine.js";
|
|
26
31
|
const WHISPER_URL = "https://api.openai.com/v1/audio/transcriptions";
|
|
27
32
|
const GROQ_URL = "https://api.groq.com/openai/v1/audio/transcriptions";
|
|
33
|
+
const GROQ_CHAT_URL = "https://api.groq.com/openai/v1/chat/completions";
|
|
28
34
|
const GROQ_MODEL = "whisper-large-v3-turbo";
|
|
35
|
+
const GROQ_CLEANUP_MODEL = "llama-3.3-70b-versatile";
|
|
29
36
|
const OPENAI_MODEL = "whisper-1";
|
|
37
|
+
const OPENAI_CHAT_URL = "https://api.openai.com/v1/chat/completions";
|
|
38
|
+
const OPENAI_CLEANUP_MODEL = "gpt-5.4-mini";
|
|
30
39
|
const MAX_AUDIO_BYTES = 25 * 1024 * 1024; // Whisper hard limit.
|
|
31
|
-
|
|
40
|
+
const MAX_TRANSCRIPT_CHARS = 40_000;
|
|
41
|
+
// Public Builder transcription model id. The Builder gateway maps this to
|
|
42
|
+
// Gemini 3.1 Flash-Lite.
|
|
43
|
+
const BUILDER_GEMINI_TRANSCRIPTION_MODEL = "gemini-3-1-flash-lite";
|
|
44
|
+
// Gemini Flash Lite BYOK path when GEMINI_API_KEY is configured.
|
|
32
45
|
// Gemini accepts inline audio; we just give it the bytes and a "transcribe
|
|
33
46
|
// this" prompt and it replies with text. 2.5x faster TTFT than 2.5 Flash
|
|
34
47
|
// per Google's release notes, and noticeably snappier than the Whisper
|
|
35
48
|
// round-trip even on a fast connection.
|
|
36
|
-
//
|
|
37
|
-
//
|
|
49
|
+
// Keep the direct Google AI path on a stable public model id; Builder's
|
|
50
|
+
// managed provider above handles the newer Gemini 3.1 Flash-Lite preview.
|
|
38
51
|
const GEMINI_MODEL = "gemini-2.0-flash-lite";
|
|
39
52
|
const GEMINI_URL = `https://generativelanguage.googleapis.com/v1beta/models/${GEMINI_MODEL}:generateContent`;
|
|
40
53
|
/**
|
|
@@ -96,11 +109,15 @@ export function createTranscribeVoiceHandler() {
|
|
|
96
109
|
}
|
|
97
110
|
const parts = await readMultipartFormData(event).catch(() => null);
|
|
98
111
|
const audio = parts?.find((p) => p.name === "audio");
|
|
99
|
-
|
|
112
|
+
const textPart = parts?.find((p) => p.name === "text");
|
|
113
|
+
const transcriptText = textPart?.data
|
|
114
|
+
? sanitizeTranscriptText(textPart.data.toString("utf8"))
|
|
115
|
+
: undefined;
|
|
116
|
+
if (!audio?.data?.length && !transcriptText) {
|
|
100
117
|
setResponseStatus(event, 400);
|
|
101
|
-
return { error: "Missing audio payload" };
|
|
118
|
+
return { error: "Missing audio or transcript payload" };
|
|
102
119
|
}
|
|
103
|
-
if (audio.data.length > MAX_AUDIO_BYTES) {
|
|
120
|
+
if (audio?.data?.length && audio.data.length > MAX_AUDIO_BYTES) {
|
|
104
121
|
setResponseStatus(event, 413);
|
|
105
122
|
return { error: "Audio too large (max 25 MB)" };
|
|
106
123
|
}
|
|
@@ -108,6 +125,10 @@ export function createTranscribeVoiceHandler() {
|
|
|
108
125
|
const language = languagePart?.data
|
|
109
126
|
? languagePart.data.toString("utf8").trim().slice(0, 8)
|
|
110
127
|
: undefined;
|
|
128
|
+
const instructionsPart = parts?.find((p) => p.name === "instructions");
|
|
129
|
+
const instructions = instructionsPart?.data
|
|
130
|
+
? sanitizeInstructions(instructionsPart.data.toString("utf8"))
|
|
131
|
+
: undefined;
|
|
111
132
|
// Resolve provider preference. Per-request "provider" form field takes
|
|
112
133
|
// precedence (the desktop client sends it on every dictation press),
|
|
113
134
|
// falling back to the user's stored `voice-transcription-prefs` app
|
|
@@ -118,6 +139,18 @@ export function createTranscribeVoiceHandler() {
|
|
|
118
139
|
setResponseStatus(event, 401);
|
|
119
140
|
return { error: "Authentication required" };
|
|
120
141
|
}
|
|
142
|
+
const orgCtx = session?.email
|
|
143
|
+
? await getOrgContext(event).catch(() => null)
|
|
144
|
+
: null;
|
|
145
|
+
const requestContext = {
|
|
146
|
+
userEmail: session?.email,
|
|
147
|
+
orgId: orgCtx?.orgId ?? undefined,
|
|
148
|
+
};
|
|
149
|
+
const withRequestContext = async (fn) => requestContext.userEmail
|
|
150
|
+
? runWithRequestContext(requestContext, fn)
|
|
151
|
+
: fn();
|
|
152
|
+
const hasBuilderPrivateKey = async () => withRequestContext(() => resolveHasBuilderPrivateKey());
|
|
153
|
+
const transcribeWithBuilderForRequest = (opts) => withRequestContext(() => transcribeWithBuilder(opts));
|
|
121
154
|
const sessionId = session?.email === DEV_MODE_USER_EMAIL
|
|
122
155
|
? "local"
|
|
123
156
|
: (session?.email ?? "local");
|
|
@@ -137,6 +170,7 @@ export function createTranscribeVoiceHandler() {
|
|
|
137
170
|
if (v === "auto" ||
|
|
138
171
|
v === "browser" ||
|
|
139
172
|
v === "builder" ||
|
|
173
|
+
v === "builder-gemini" ||
|
|
140
174
|
v === "gemini" ||
|
|
141
175
|
v === "openai" ||
|
|
142
176
|
v === "groq") {
|
|
@@ -165,9 +199,6 @@ export function createTranscribeVoiceHandler() {
|
|
|
165
199
|
error: 'Voice provider is set to "browser" (Web Speech API only). Change the preference in Settings → Voice Transcription to use a server-side provider.',
|
|
166
200
|
};
|
|
167
201
|
}
|
|
168
|
-
const mime = audio.type || "audio/webm";
|
|
169
|
-
const audioBytes = new Uint8Array(audio.data.buffer, audio.data.byteOffset, audio.data.byteLength);
|
|
170
|
-
let builderError = null;
|
|
171
202
|
// Per-user-or-fallback API key resolution. Hoisted up so the Gemini
|
|
172
203
|
// path below can use it without duplicating logic.
|
|
173
204
|
async function resolveApiKey(key) {
|
|
@@ -181,6 +212,24 @@ export function createTranscribeVoiceHandler() {
|
|
|
181
212
|
}).catch(() => null);
|
|
182
213
|
return (userSecret?.value || (await resolveCredential(key, ctx)) || undefined);
|
|
183
214
|
}
|
|
215
|
+
if (transcriptText) {
|
|
216
|
+
return await cleanupTranscriptText({
|
|
217
|
+
event,
|
|
218
|
+
text: transcriptText,
|
|
219
|
+
instructions,
|
|
220
|
+
providerPref,
|
|
221
|
+
hasBuilderPrivateKey,
|
|
222
|
+
withRequestContext,
|
|
223
|
+
resolveApiKey,
|
|
224
|
+
});
|
|
225
|
+
}
|
|
226
|
+
if (!audio?.data?.length) {
|
|
227
|
+
setResponseStatus(event, 400);
|
|
228
|
+
return { error: "Missing audio payload" };
|
|
229
|
+
}
|
|
230
|
+
const mime = audio.type || "audio/webm";
|
|
231
|
+
const audioBytes = new Uint8Array(audio.data.buffer, audio.data.byteOffset, audio.data.byteLength);
|
|
232
|
+
let builderError = null;
|
|
184
233
|
// ── Strict per-provider preferences ─────────────────────────────────
|
|
185
234
|
// When the user explicitly picks a single provider (gemini / builder /
|
|
186
235
|
// groq), we only try that provider and surface its error rather than
|
|
@@ -201,6 +250,7 @@ export function createTranscribeVoiceHandler() {
|
|
|
201
250
|
mimeType: mime,
|
|
202
251
|
apiKey: geminiKey,
|
|
203
252
|
language: language || undefined,
|
|
253
|
+
instructions,
|
|
204
254
|
});
|
|
205
255
|
const trimmed = text.trim();
|
|
206
256
|
if (!trimmed) {
|
|
@@ -216,18 +266,25 @@ export function createTranscribeVoiceHandler() {
|
|
|
216
266
|
};
|
|
217
267
|
}
|
|
218
268
|
}
|
|
219
|
-
if (providerPref === "builder") {
|
|
220
|
-
|
|
269
|
+
if (providerPref === "builder" || providerPref === "builder-gemini") {
|
|
270
|
+
const label = providerPref === "builder-gemini"
|
|
271
|
+
? "Builder Gemini Flash-Lite"
|
|
272
|
+
: "Builder";
|
|
273
|
+
if (!(await hasBuilderPrivateKey())) {
|
|
221
274
|
setResponseStatus(event, 400);
|
|
222
275
|
return {
|
|
223
|
-
error:
|
|
276
|
+
error: `${label} is selected but Builder.io is not connected. Connect Builder.io in Settings, or change the provider preference.`,
|
|
224
277
|
};
|
|
225
278
|
}
|
|
226
279
|
try {
|
|
227
|
-
const result = await
|
|
280
|
+
const result = await transcribeWithBuilderForRequest({
|
|
228
281
|
audioBytes,
|
|
229
282
|
mimeType: mime,
|
|
283
|
+
model: providerPref === "builder-gemini"
|
|
284
|
+
? BUILDER_GEMINI_TRANSCRIPTION_MODEL
|
|
285
|
+
: undefined,
|
|
230
286
|
language: language || undefined,
|
|
287
|
+
instructions,
|
|
231
288
|
});
|
|
232
289
|
return { text: (result.text ?? "").trim() };
|
|
233
290
|
}
|
|
@@ -238,7 +295,7 @@ export function createTranscribeVoiceHandler() {
|
|
|
238
295
|
return { error: message };
|
|
239
296
|
}
|
|
240
297
|
setResponseStatus(event, 502);
|
|
241
|
-
return { error:
|
|
298
|
+
return { error: `${label} transcription failed: ${message}` };
|
|
242
299
|
}
|
|
243
300
|
}
|
|
244
301
|
if (providerPref === "groq") {
|
|
@@ -260,13 +317,38 @@ export function createTranscribeVoiceHandler() {
|
|
|
260
317
|
audioBytes,
|
|
261
318
|
mime,
|
|
262
319
|
language,
|
|
320
|
+
instructions,
|
|
263
321
|
});
|
|
264
322
|
}
|
|
265
323
|
// ── Auto / undefined / openai fallback chain ────────────────────────
|
|
266
|
-
// ── Gemini Flash
|
|
267
|
-
// First-priority
|
|
268
|
-
//
|
|
269
|
-
|
|
324
|
+
// ── Builder Gemini Flash-Lite path ─────────────────────────────────
|
|
325
|
+
// First-priority in auto mode when Builder is connected. This lets users
|
|
326
|
+
// try Gemini 3.1 Flash-Lite without bringing their own Google key.
|
|
327
|
+
if (providerPref !== "openai" && (await hasBuilderPrivateKey())) {
|
|
328
|
+
try {
|
|
329
|
+
const result = await transcribeWithBuilderForRequest({
|
|
330
|
+
audioBytes,
|
|
331
|
+
mimeType: mime,
|
|
332
|
+
model: BUILDER_GEMINI_TRANSCRIPTION_MODEL,
|
|
333
|
+
language: language || undefined,
|
|
334
|
+
instructions,
|
|
335
|
+
});
|
|
336
|
+
return { text: (result.text ?? "").trim() };
|
|
337
|
+
}
|
|
338
|
+
catch (err) {
|
|
339
|
+
const message = err?.message ?? String(err);
|
|
340
|
+
// Surface 402 (credits exhausted) as a 402 so the client can show
|
|
341
|
+
// a specific upgrade prompt.
|
|
342
|
+
if (message.includes("credits exhausted")) {
|
|
343
|
+
setResponseStatus(event, 402);
|
|
344
|
+
return { error: message };
|
|
345
|
+
}
|
|
346
|
+
builderError = message;
|
|
347
|
+
}
|
|
348
|
+
}
|
|
349
|
+
// ── Gemini Flash Lite BYOK path ────────────────────────────────────
|
|
350
|
+
// If Builder is unavailable, try a user-provided Gemini key before
|
|
351
|
+
// Whisper-compatible providers.
|
|
270
352
|
if (providerPref !== "openai") {
|
|
271
353
|
const geminiKey = await resolveApiKey("GEMINI_API_KEY");
|
|
272
354
|
if (geminiKey) {
|
|
@@ -276,6 +358,7 @@ export function createTranscribeVoiceHandler() {
|
|
|
276
358
|
mimeType: mime,
|
|
277
359
|
apiKey: geminiKey,
|
|
278
360
|
language: language || undefined,
|
|
361
|
+
instructions,
|
|
279
362
|
});
|
|
280
363
|
const trimmed = text.trim();
|
|
281
364
|
if (trimmed) {
|
|
@@ -289,27 +372,6 @@ export function createTranscribeVoiceHandler() {
|
|
|
289
372
|
}
|
|
290
373
|
}
|
|
291
374
|
}
|
|
292
|
-
// ── Builder proxy path ──────────────────────────────────────────────
|
|
293
|
-
if (providerPref !== "openai" && (await resolveHasBuilderPrivateKey())) {
|
|
294
|
-
try {
|
|
295
|
-
const result = await transcribeWithBuilder({
|
|
296
|
-
audioBytes,
|
|
297
|
-
mimeType: mime,
|
|
298
|
-
language: language || undefined,
|
|
299
|
-
});
|
|
300
|
-
return { text: (result.text ?? "").trim() };
|
|
301
|
-
}
|
|
302
|
-
catch (err) {
|
|
303
|
-
const message = err?.message ?? String(err);
|
|
304
|
-
// Surface 402 (credits exhausted) as a 402 so the client can show
|
|
305
|
-
// a specific upgrade prompt.
|
|
306
|
-
if (message.includes("credits exhausted")) {
|
|
307
|
-
setResponseStatus(event, 402);
|
|
308
|
-
return { error: message };
|
|
309
|
-
}
|
|
310
|
-
builderError = message;
|
|
311
|
-
}
|
|
312
|
-
}
|
|
313
375
|
// If Builder is unavailable, fall through to BYOK providers rather than
|
|
314
376
|
// hard-failing. This mirrors Clips' batch transcription path.
|
|
315
377
|
// ── Groq / OpenAI Whisper-compatible path ──────────────────────────
|
|
@@ -341,8 +403,8 @@ export function createTranscribeVoiceHandler() {
|
|
|
341
403
|
setResponseStatus(event, builderError ? 502 : 400);
|
|
342
404
|
return {
|
|
343
405
|
error: builderError
|
|
344
|
-
? `Builder transcription failed: ${builderError}. Add GROQ_API_KEY or OPENAI_API_KEY in Settings → API Keys to enable a fallback provider.`
|
|
345
|
-
: "No voice transcription provider configured. Connect Builder.io or add GROQ_API_KEY / OPENAI_API_KEY in Settings → API Keys.",
|
|
406
|
+
? `Builder transcription failed: ${builderError}. Add GEMINI_API_KEY, GROQ_API_KEY, or OPENAI_API_KEY in Settings → API Keys to enable a fallback provider.`
|
|
407
|
+
: "No voice transcription provider configured. Connect Builder.io or add GEMINI_API_KEY / GROQ_API_KEY / OPENAI_API_KEY in Settings → API Keys.",
|
|
346
408
|
};
|
|
347
409
|
}
|
|
348
410
|
return await callWhisperCompat({
|
|
@@ -351,6 +413,7 @@ export function createTranscribeVoiceHandler() {
|
|
|
351
413
|
audioBytes,
|
|
352
414
|
mime,
|
|
353
415
|
language,
|
|
416
|
+
instructions,
|
|
354
417
|
});
|
|
355
418
|
});
|
|
356
419
|
}
|
|
@@ -361,7 +424,7 @@ export function createTranscribeVoiceHandler() {
|
|
|
361
424
|
* strict-Groq preference path and the auto fallback chain share one
|
|
362
425
|
* implementation.
|
|
363
426
|
*/
|
|
364
|
-
async function callWhisperCompat({ event, provider, audioBytes, mime, language, }) {
|
|
427
|
+
async function callWhisperCompat({ event, provider, audioBytes, mime, language, instructions, }) {
|
|
365
428
|
const ext = pickExtension(mime);
|
|
366
429
|
const filename = `composer-voice.${ext}`;
|
|
367
430
|
const form = new FormData();
|
|
@@ -370,6 +433,8 @@ async function callWhisperCompat({ event, provider, audioBytes, mime, language,
|
|
|
370
433
|
form.append("response_format", "json");
|
|
371
434
|
if (language)
|
|
372
435
|
form.append("language", language);
|
|
436
|
+
if (instructions)
|
|
437
|
+
form.append("prompt", instructions);
|
|
373
438
|
const controller = new AbortController();
|
|
374
439
|
const timeout = setTimeout(() => controller.abort(), 45_000);
|
|
375
440
|
try {
|
|
@@ -403,6 +468,252 @@ async function callWhisperCompat({ event, provider, audioBytes, mime, language,
|
|
|
403
468
|
clearTimeout(timeout);
|
|
404
469
|
}
|
|
405
470
|
}
|
|
471
|
+
/**
 * Runs the single cleanup provider the user explicitly selected and maps the
 * outcome to the route's response shape.
 *
 * @param {object} options
 * @param {object} options.event - Request event passed to `setResponseStatus`.
 * @param {string} options.original - Trimmed transcript returned when the provider yields nothing.
 * @param {*} options.credential - Truthy when the provider is usable (API key or "connected" flag).
 * @param {string} options.missingMessage - Error text returned with a 400 when `credential` is falsy.
 * @param {string} options.failurePrefix - Provider label used in the 502 error text.
 * @param {(credential: *) => Promise<string>} options.run - Performs the cleanup call.
 * @returns {Promise<{ text: string } | { error: string }>}
 */
async function runSelectedTranscriptCleanup({ event, original, credential, missingMessage, failurePrefix, run, }) {
    if (!credential) {
        setResponseStatus(event, 400);
        return { error: missingMessage };
    }
    try {
        const cleaned = await run(credential);
        // An empty cleanup result is not an error: keep the user's words.
        return { text: cleaned || original };
    }
    catch (err) {
        setResponseStatus(event, 502);
        return {
            error: `${failurePrefix} cleanup failed: ${err?.message ?? String(err)}`,
        };
    }
}
/**
 * Cleans up an already-transcribed text (e.g. from live speech recognition)
 * with the provider selected by `providerPref`.
 *
 * - Blank input returns `{ text: "" }`; the "browser" preference returns the
 *   trimmed text untouched.
 * - An explicit provider ("builder", "builder-gemini", "gemini", "openai",
 *   "groq") is tried alone. A missing credential yields a 400 and a provider
 *   failure yields a 502, both as `{ error }`.
 * - Any other preference walks the fallback chain Builder → Gemini → Groq →
 *   OpenAI. Failures there are logged and skipped, and the trimmed original
 *   text is returned if no provider produces output.
 *
 * @param {object} options
 * @param {object} options.event - Request event used to set the response status.
 * @param {string} options.text - Raw transcript text.
 * @param {string} [options.instructions] - Optional custom cleanup instructions.
 * @param {string} [options.providerPref] - Provider preference.
 * @param {() => Promise<boolean>} options.hasBuilderPrivateKey - Reports whether Builder is connected.
 * @param {(fn: () => Promise<string>) => Promise<string>} options.withRequestContext - Wraps Builder calls.
 * @param {(key: string) => Promise<string | undefined>} options.resolveApiKey - Resolves a BYOK API key by name.
 * @returns {Promise<{ text: string } | { error: string }>}
 */
async function cleanupTranscriptText({ event, text, instructions, providerPref, hasBuilderPrivateKey, withRequestContext, resolveApiKey, }) {
    const original = text.trim();
    if (!original)
        return { text: "" };
    if (providerPref === "browser") {
        return { text: original };
    }
    const viaBuilder = () => withRequestContext(() => cleanupWithBuilder({ text: original, instructions }));
    const viaGemini = (apiKey) => cleanupWithGemini({ text: original, apiKey, instructions });
    const viaChat = (provider) => (apiKey) => cleanupWithChatProvider({ provider, text: original, apiKey, instructions });
    // ── Strict per-provider preferences ─────────────────────────────────
    if (providerPref === "builder" || providerPref === "builder-gemini") {
        return runSelectedTranscriptCleanup({
            event,
            original,
            credential: await hasBuilderPrivateKey(),
            missingMessage: "Builder.io cleanup is selected but Builder.io is not connected. Connect Builder.io in Settings, or change the provider preference.",
            failurePrefix: "Builder.io",
            run: viaBuilder,
        });
    }
    if (providerPref === "gemini") {
        return runSelectedTranscriptCleanup({
            event,
            original,
            credential: await resolveApiKey("GEMINI_API_KEY"),
            missingMessage: "Gemini cleanup is selected but GEMINI_API_KEY is not configured.",
            failurePrefix: "Gemini",
            run: viaGemini,
        });
    }
    if (providerPref === "openai" || providerPref === "groq") {
        const keyName = providerPref === "openai" ? "OPENAI_API_KEY" : "GROQ_API_KEY";
        return runSelectedTranscriptCleanup({
            event,
            original,
            credential: await resolveApiKey(keyName),
            missingMessage: `${providerPref} cleanup is selected but ${keyName} is not configured.`,
            failurePrefix: providerPref,
            run: viaChat(providerPref),
        });
    }
    // ── Auto fallback chain ─────────────────────────────────────────────
    // Credentials are resolved lazily, one provider at a time, so a later
    // key is only looked up when every earlier provider was unavailable,
    // failed, or returned nothing.
    const fallbackChain = [
        { name: "Builder.io", resolve: () => hasBuilderPrivateKey(), run: viaBuilder },
        { name: "Gemini", resolve: () => resolveApiKey("GEMINI_API_KEY"), run: viaGemini },
        { name: "groq", resolve: () => resolveApiKey("GROQ_API_KEY"), run: viaChat("groq") },
        { name: "openai", resolve: () => resolveApiKey("OPENAI_API_KEY"), run: viaChat("openai") },
    ];
    for (const { name, resolve, run } of fallbackChain) {
        const credential = await resolve();
        if (!credential)
            continue;
        try {
            const cleaned = await run(credential);
            if (cleaned)
                return { text: cleaned };
        }
        catch (err) {
            // Best-effort: cleanup must never block the paste, so fall through
            // to the next provider — but leave a trace so a broken provider
            // is diagnosable.
            console.warn(`[transcribe-voice] ${name} transcript cleanup failed; trying next provider: ${err?.message ?? String(err)}`);
        }
    }
    return { text: original };
}
|
|
603
|
+
/**
 * Cleans up a transcript by streaming one completion from the Builder engine.
 *
 * The request is aborted after 8 seconds. Text is taken from the
 * "assistant-content" event when one arrives, otherwise from the concatenated
 * "text-delta" events, and is then passed through `stripTranscriptEnvelope`.
 *
 * @param {object} options
 * @param {string} options.text - Transcript to clean up.
 * @param {string} [options.instructions] - Optional custom cleanup instructions.
 * @returns {Promise<string>} The result of `stripTranscriptEnvelope` for the collected text.
 * @throws {Error} When the stream reports a "stop" event whose reason is "error".
 */
async function cleanupWithBuilder({ text, instructions, }) {
    const engine = createBuilderEngine();
    const abort = new AbortController();
    const deadline = setTimeout(() => abort.abort(), 8_000);
    let deltas = "";
    let assembled = "";
    let failure;
    try {
        // Output budget scales with input length, clamped to [512, 4096].
        const outputBudget = Math.min(4096, Math.max(512, text.length * 2));
        const request = {
            model: BUILDER_GEMINI_TRANSCRIPTION_MODEL,
            systemPrompt: buildCleanupSystemPrompt(instructions),
            messages: [
                {
                    role: "user",
                    content: [{ type: "text", text: buildCleanupUserPrompt(text) }],
                },
            ],
            tools: [],
            abortSignal: abort.signal,
            maxOutputTokens: outputBudget,
            temperature: 0,
        };
        for await (const chunk of engine.stream(request)) {
            switch (chunk.type) {
                case "text-delta":
                    deltas += chunk.text;
                    break;
                case "assistant-content": {
                    const textParts = chunk.parts.filter((part) => part.type === "text");
                    assembled = textParts
                        .map((part) => part.text)
                        .join("")
                        .trim();
                    break;
                }
                case "stop":
                    if (chunk.reason === "error") {
                        failure = chunk.error ?? "Builder gateway returned an error";
                    }
                    break;
                default:
                    break;
            }
        }
    }
    finally {
        clearTimeout(deadline);
    }
    if (failure) {
        throw new Error(failure);
    }
    // Prefer the final assembled message; fall back to the streamed deltas.
    return stripTranscriptEnvelope(assembled || deltas);
}
|
|
646
|
+
/**
 * Cleans up a transcript with a direct Gemini `generateContent` call
 * (BYOK path).
 *
 * The request is aborted after 8 seconds. Returns an empty string when the
 * model produced no usable text, or when the completion was cut off by the
 * output-token limit. A truncated completion would otherwise replace the
 * user's full transcript with a shortened one; an empty result lets callers
 * fall back to the original text.
 *
 * @param {object} options
 * @param {string} options.text - Transcript to clean up.
 * @param {string} options.apiKey - Gemini API key, sent as `x-goog-api-key`.
 * @param {string} [options.instructions] - Optional custom cleanup instructions.
 * @returns {Promise<string>} The result of `stripTranscriptEnvelope` for the model text, or "".
 * @throws {Error} When the HTTP response is not OK (message carries status and
 *   up to 300 characters of the body), or when the request fails or is aborted.
 */
async function cleanupWithGemini({ text, apiKey, instructions, }) {
    const controller = new AbortController();
    const timeout = setTimeout(() => controller.abort(), 8_000);
    try {
        const res = await fetch(GEMINI_URL, {
            method: "POST",
            headers: {
                "Content-Type": "application/json",
                "x-goog-api-key": apiKey,
            },
            body: JSON.stringify({
                contents: [
                    {
                        parts: [
                            { text: buildCleanupSystemPrompt(instructions) },
                            { text: buildCleanupUserPrompt(text) },
                        ],
                    },
                ],
                generationConfig: { temperature: 0 },
            }),
            signal: controller.signal,
        });
        if (!res.ok) {
            const body = await res.text().catch(() => "");
            throw new Error(`Gemini ${res.status}: ${body.slice(0, 300)}`);
        }
        const data = (await res.json());
        const candidate = data?.candidates?.[0];
        // A completion that hit the token limit is missing its tail; never
        // hand a truncated transcript back as the "cleaned" version.
        if (candidate?.finishReason === "MAX_TOKENS")
            return "";
        const cleaned = candidate?.content?.parts
            ?.map((p) => p.text ?? "")
            .join("")
            .trim();
        return stripTranscriptEnvelope(cleaned ?? "");
    }
    finally {
        clearTimeout(timeout);
    }
}
|
|
684
|
+
/**
 * Cleans up a transcript with an OpenAI-compatible chat-completions endpoint.
 *
 * `provider === "openai"` selects the OpenAI URL and cleanup model; any other
 * value selects the Groq ones. The request is aborted after 8 seconds.
 * Returns an empty string when the model produced no content, or when the
 * completion stopped at the token limit (`finish_reason: "length"`). A
 * truncated completion would otherwise replace the user's full transcript
 * with a shortened one; an empty result lets callers fall back to the
 * original text.
 *
 * @param {object} options
 * @param {string} options.provider - "openai" or "groq".
 * @param {string} options.text - Transcript to clean up.
 * @param {string} options.apiKey - Bearer token for the selected provider.
 * @param {string} [options.instructions] - Optional custom cleanup instructions.
 * @returns {Promise<string>} The result of `stripTranscriptEnvelope` for the model text, or "".
 * @throws {Error} When the HTTP response is not OK (message carries provider,
 *   status and up to 300 characters of the body), or when the request fails
 *   or is aborted.
 */
async function cleanupWithChatProvider({ provider, text, apiKey, instructions, }) {
    const controller = new AbortController();
    const timeout = setTimeout(() => controller.abort(), 8_000);
    const endpoint = provider === "openai" ? OPENAI_CHAT_URL : GROQ_CHAT_URL;
    const model = provider === "openai" ? OPENAI_CLEANUP_MODEL : GROQ_CLEANUP_MODEL;
    try {
        const res = await fetch(endpoint, {
            method: "POST",
            headers: {
                Authorization: `Bearer ${apiKey}`,
                "Content-Type": "application/json",
            },
            body: JSON.stringify({
                model,
                messages: [
                    { role: "system", content: buildCleanupSystemPrompt(instructions) },
                    { role: "user", content: buildCleanupUserPrompt(text) },
                ],
                temperature: 0,
            }),
            signal: controller.signal,
        });
        if (!res.ok) {
            const body = await res.text().catch(() => "");
            throw new Error(`${provider} ${res.status}: ${body.slice(0, 300)}`);
        }
        const data = (await res.json());
        const choice = data?.choices?.[0];
        // A completion that hit the token limit is missing its tail; never
        // hand a truncated transcript back as the "cleaned" version.
        if (choice?.finish_reason === "length")
            return "";
        return stripTranscriptEnvelope(choice?.message?.content?.trim() ?? "");
    }
    finally {
        clearTimeout(timeout);
    }
}
|
|
406
717
|
function pickExtension(mime) {
|
|
407
718
|
const lower = mime.toLowerCase();
|
|
408
719
|
if (lower.includes("mp4") || lower.includes("m4a"))
|
|
@@ -415,6 +726,51 @@ function pickExtension(mime) {
|
|
|
415
726
|
return "wav";
|
|
416
727
|
return "webm";
|
|
417
728
|
}
|
|
729
|
+
/**
 * Normalize user-supplied cleanup instructions before they are embedded in a prompt.
 *
 * NUL characters are removed, surrounding whitespace is trimmed, and the result
 * is capped at 3000 UTF-16 code units. When the cap would cut an astral
 * character (e.g. an emoji) in half, the dangling high surrogate is dropped so
 * the returned string never ends in a lone surrogate created by truncation.
 *
 * @param {unknown} value - Raw instructions; non-string input is treated as absent.
 * @returns {string | undefined} The cleaned instructions, or `undefined` when
 *   nothing usable remains.
 */
function sanitizeInstructions(value) {
    if (typeof value !== "string")
        return undefined;
    const maxChars = 3000;
    const trimmed = value.replace(/\0/g, "").trim();
    if (!trimmed)
        return undefined;
    if (trimmed.length <= maxChars)
        return trimmed;
    const clipped = trimmed.slice(0, maxChars);
    // A high surrogate (0xD800-0xDBFF) at the cut point means the cap split a
    // surrogate pair; drop it rather than emit malformed text.
    const lastUnit = clipped.charCodeAt(clipped.length - 1);
    const splitPair = lastUnit >= 0xd800 && lastUnit <= 0xdbff;
    return splitPair ? clipped.slice(0, -1) : clipped;
}
|
|
735
|
+
/**
 * Normalize raw transcript text before it is sent for cleanup.
 *
 * NUL characters are removed, surrounding whitespace is trimmed, and the result
 * is capped at `MAX_TRANSCRIPT_CHARS` UTF-16 code units (a constant defined
 * outside this block). When the cap would cut an astral character in half, the
 * dangling high surrogate is dropped so truncation never produces a lone surrogate.
 *
 * @param {unknown} value - Raw transcript; non-string input is treated as absent.
 * @returns {string | undefined} The cleaned transcript, or `undefined` when
 *   nothing usable remains.
 */
function sanitizeTranscriptText(value) {
    if (typeof value !== "string")
        return undefined;
    const trimmed = value.replace(/\0/g, "").trim();
    if (!trimmed)
        return undefined;
    if (trimmed.length <= MAX_TRANSCRIPT_CHARS)
        return trimmed;
    const clipped = trimmed.slice(0, MAX_TRANSCRIPT_CHARS);
    // A high surrogate (0xD800-0xDBFF) at the cut point means the cap split a
    // surrogate pair; drop it rather than emit malformed text.
    const lastUnit = clipped.charCodeAt(clipped.length - 1);
    const splitPair = lastUnit >= 0xd800 && lastUnit <= 0xdbff;
    return splitPair ? clipped.slice(0, -1) : clipped;
}
|
|
741
|
+
/**
 * Build the system prompt used for transcript cleanup.
 *
 * @param {string} [instructions] - Optional custom cleanup instructions; when
 *   truthy they are appended after the fixed rule list under their own heading.
 * @returns {string} The complete system prompt.
 */
function buildCleanupSystemPrompt(instructions) {
    const fixedLines = [
        "You clean up live speech-recognition transcripts before paste.",
        "",
        "Rules:",
        "- Preserve the speaker's meaning and voice.",
        "- Fix obvious recognition mistakes, punctuation, capitalization, spacing, and casing.",
        "- Remove false starts and filler only when they are clearly not intentional.",
        "- Do not add facts, explanations, headings, bullets, quotes, or markdown.",
        "- Output only the cleaned transcript text.",
    ];
    let prompt = fixedLines.join("\n");
    if (instructions) {
        prompt += `\n\nUser's custom cleanup instructions:\n${instructions}`;
    }
    return prompt;
}
|
|
754
|
+
/**
 * Build the user message for transcript cleanup.
 *
 * The transcript is placed between `<transcript>` and `</transcript>` lines so
 * the text to clean is clearly delimited from the request itself.
 *
 * @param {string} text - Transcript text to clean up.
 * @returns {string} The user prompt containing the wrapped transcript.
 */
function buildCleanupUserPrompt(text) {
    const request = "Clean up this transcript and return only the final text:";
    const envelope = `<transcript>\n${text}\n</transcript>`;
    return `${request}\n\n${envelope}`;
}
|
|
757
|
+
/**
 * Remove wrapper noise a model may put around a cleaned transcript.
 *
 * Applied in order, after trimming:
 *  1. a leading Markdown code fence (bare or tagged `text`) and a trailing fence;
 *  2. an echoed `<transcript>…</transcript>` wrapper around the whole value;
 *  3. one pair of quotes wrapping the whole value — straight (`"…"`) or curly (`“…”`).
 *
 * Quotes are only stripped when they form a single wrapping pair, i.e. the
 * interior contains no further quote of the same family. Text that merely
 * starts and ends with quoted speech (`"Hi," she said, "bye."`) is left intact.
 *
 * @param {string} value - Raw model output.
 * @returns {string} The unwrapped, trimmed transcript text.
 */
function stripTranscriptEnvelope(value) {
    return value
        .trim()
        .replace(/^```(?:text)?\s*/i, "")
        .replace(/\s*```$/i, "")
        // The model sometimes echoes the delimiter tags used in the user prompt.
        .replace(/^<transcript>\s*([\s\S]*?)\s*<\/transcript>$/i, "$1")
        // Single alternation so at most one layer of quotes is removed.
        .replace(/^(?:"([^"]*)"|“([^“”]*)”)$/, (_match, straight, curly) => straight ?? curly)
        .trim();
}
|
|
765
|
+
/**
 * Build the text prompt that accompanies audio sent to Gemini for transcription.
 *
 * @param {object} options
 * @param {string} [options.language] - When truthy, named in the request as a
 *   language hint.
 * @param {string} [options.instructions] - When truthy, appended as additional
 *   cleanup instructions, followed by a sentence limiting them to formatting
 *   and cleanup.
 * @returns {string} The complete transcription prompt.
 */
function buildGeminiTranscriptionPrompt({ language, instructions, }) {
    let prompt = "Transcribe the speech in this audio";
    if (language) {
        prompt += ` (language: ${language})`;
    }
    prompt += ". Output only the transcript text — no preamble, no quotes, no formatting.";
    if (instructions) {
        prompt += `\n\nAdditional user instructions for transcription cleanup:\n${instructions}\n\nApply these only to formatting, casing, punctuation, vocabulary, and cleanup. Do not add content that is not present in the audio.`;
    }
    return prompt;
}
|
|
418
774
|
/**
|
|
419
775
|
* Transcribe audio via Gemini Flash Lite.
|
|
420
776
|
*
|
|
@@ -426,11 +782,9 @@ function pickExtension(mime) {
|
|
|
426
782
|
* FLAC — webm/opus is not officially supported but in practice it
|
|
427
783
|
* accepts webm too. If Gemini rejects it the caller falls through.
|
|
428
784
|
*/
|
|
429
|
-
async function transcribeWithGemini({ audioBytes, mimeType, apiKey, language, }) {
|
|
785
|
+
async function transcribeWithGemini({ audioBytes, mimeType, apiKey, language, instructions, }) {
|
|
430
786
|
const base64 = uint8ArrayToBase64(audioBytes);
|
|
431
|
-
const prompt = language
|
|
432
|
-
? `Transcribe the speech in this audio (language: ${language}). Output only the transcript text — no preamble, no quotes, no formatting.`
|
|
433
|
-
: "Transcribe the speech in this audio. Output only the transcript text — no preamble, no quotes, no formatting.";
|
|
787
|
+
const prompt = buildGeminiTranscriptionPrompt({ language, instructions });
|
|
434
788
|
const controller = new AbortController();
|
|
435
789
|
const timeout = setTimeout(() => controller.abort(), 30_000);
|
|
436
790
|
try {
|