cursor-buddy 0.0.0-beta.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +422 -0
- package/dist/client-Ba6rv-du.d.mts +460 -0
- package/dist/client-Ba6rv-du.d.mts.map +1 -0
- package/dist/client-D-LeEdoH.mjs +2254 -0
- package/dist/client-D-LeEdoH.mjs.map +1 -0
- package/dist/index.d.mts +3 -0
- package/dist/index.mjs +3 -0
- package/dist/point-tool-DtHgq6gQ.mjs +54 -0
- package/dist/point-tool-DtHgq6gQ.mjs.map +1 -0
- package/dist/point-tool-kIviMn1q.d.mts +46 -0
- package/dist/point-tool-kIviMn1q.d.mts.map +1 -0
- package/dist/react/index.d.mts +142 -0
- package/dist/react/index.d.mts.map +1 -0
- package/dist/react/index.mjs +574 -0
- package/dist/react/index.mjs.map +1 -0
- package/dist/server/adapters/next.d.mts +22 -0
- package/dist/server/adapters/next.d.mts.map +1 -0
- package/dist/server/adapters/next.mjs +24 -0
- package/dist/server/adapters/next.mjs.map +1 -0
- package/dist/server/index.d.mts +31 -0
- package/dist/server/index.d.mts.map +1 -0
- package/dist/server/index.mjs +278 -0
- package/dist/server/index.mjs.map +1 -0
- package/dist/types-COQKMo5C.d.mts +44 -0
- package/dist/types-COQKMo5C.d.mts.map +1 -0
- package/package.json +108 -0
|
@@ -0,0 +1,2254 @@
|
|
|
1
|
+
import { atom } from "nanostores";
|
|
2
|
+
import html2canvas from "html2canvas-pro";
|
|
3
|
+
//#region src/core/atoms.ts
/**
 * Nanostores atoms for reactive values that don't need state machine semantics.
 * These update frequently (e.g., 60fps audio levels) and are framework-agnostic.
 */
const $audioLevel = atom(0);
const $cursorPosition = atom({ x: 0, y: 0 });
const $buddyPosition = atom({ x: 0, y: 0 });
const $buddyRotation = atom(0);
const $buddyScale = atom(1);
const $pointingTarget = atom(null);
const $isEnabled = atom(true);
// NOTE(review): orphaned atom — its binding appears to have been removed by
// the bundler's tree-shaking; kept so module side effects stay identical.
atom(false);
const $conversationHistory = atom([]);
//#endregion
|
|
24
|
+
//#region src/core/utils/error.ts
/**
 * Normalize an unknown thrown value into an Error instance.
 *
 * @param error - Whatever was caught; may be any value.
 * @param fallbackMessage - Message used when the value carries no usable text.
 * @returns The value itself when it is already an Error, a new Error wrapping
 *   a non-empty string, or a new Error with the fallback message otherwise.
 */
function toError(error, fallbackMessage = "Unknown error") {
	if (error instanceof Error) {
		return error;
	}
	const isNonEmptyString = typeof error === "string" && error !== "";
	return isNonEmptyString ? new Error(error) : new Error(fallbackMessage);
}
//#endregion
|
|
34
|
+
//#region src/core/services/audio-playback.ts
/**
 * Framework-agnostic service for audio playback with abort support.
 */
var AudioPlaybackService = class {
	// Currently playing HTMLAudioElement, or null when idle.
	audio = null;
	// Object URL backing `audio`; revoked in cleanup() so blobs are not leaked.
	currentUrl = null;
	// Settler of the in-flight play() promise; lets stop() resolve it early.
	settlePlayback = null;
	// Unhooks the abort listener registered for the current play() call.
	removeAbortListener = null;
	/**
	 * Play audio from a blob. Stops any currently playing audio first.
	 * @param blob - Audio blob to play
	 * @param signal - Optional AbortSignal to cancel playback
	 * @returns Promise that resolves when playback completes
	 */
	async play(blob, signal) {
		// Settle and tear down any previous playback before starting a new one.
		this.stop();
		// Pre-aborted signal: resolve immediately without allocating a URL.
		if (signal?.aborted) return;
		const url = URL.createObjectURL(blob);
		this.currentUrl = url;
		this.audio = new Audio(url);
		return new Promise((resolve, reject) => {
			// Defensive: audio could have been nulled by a re-entrant stop().
			if (!this.audio) {
				this.cleanup();
				resolve();
				return;
			}
			let settled = false;
			const audio = this.audio;
			// Single settle path: runs at most once, detaches listeners,
			// clears instance state, revokes the URL, then resolves/rejects.
			const settle = (outcome, error) => {
				if (settled) return;
				settled = true;
				// Only clear the service-level settler if it is still ours.
				if (this.settlePlayback === settle) this.settlePlayback = null;
				this.removeAbortListener?.();
				this.removeAbortListener = null;
				// Only detach handlers if a newer play() has not replaced `audio`.
				if (this.audio === audio) {
					this.audio.onended = null;
					this.audio.onerror = null;
					this.audio = null;
				}
				this.cleanup();
				if (outcome === "resolve") {
					resolve();
					return;
				}
				reject(error ?? /* @__PURE__ */ new Error("Audio playback failed"));
			};
			this.settlePlayback = settle;
			// Abort is treated as a normal completion, not an error.
			const abortHandler = () => {
				audio.pause();
				settle("resolve");
			};
			if (signal) {
				signal.addEventListener("abort", abortHandler, { once: true });
				this.removeAbortListener = () => {
					signal.removeEventListener("abort", abortHandler);
				};
			}
			this.audio.onended = () => {
				settle("resolve");
			};
			this.audio.onerror = () => {
				settle("reject", /* @__PURE__ */ new Error("Audio playback failed"));
			};
			// play() may reject (e.g. autoplay policy); surface it via settle.
			this.audio.play().catch((err) => {
				settle("reject", toError(err, "Audio playback failed"));
			});
		});
	}
	/**
	 * Stop any currently playing audio.
	 */
	stop() {
		if (this.audio) this.audio.pause();
		// If a play() promise is pending, its settle() does the full teardown.
		if (this.settlePlayback) {
			const settlePlayback = this.settlePlayback;
			this.settlePlayback = null;
			settlePlayback("resolve");
			return;
		}
		// No pending promise: tear down listeners and state directly.
		this.removeAbortListener?.();
		this.removeAbortListener = null;
		if (this.audio) {
			this.audio.onended = null;
			this.audio.onerror = null;
			this.audio = null;
		}
		this.cleanup();
	}
	// Revoke the object URL created for the current playback, if any.
	cleanup() {
		if (this.currentUrl) {
			URL.revokeObjectURL(this.currentUrl);
			this.currentUrl = null;
		}
	}
};
//#endregion
|
|
131
|
+
//#region src/core/utils/web-speech.ts
/**
 * Collapse every whitespace run to a single space and trim both ends, so UI
 * state and speech synthesis stay stable across browser event quirks.
 */
function normalizeSpeechText(text) {
	const collapsed = text.replace(/\s+/g, " ");
	return collapsed.trim();
}
|
|
139
|
+
/**
|
|
140
|
+
* Resolve the best browser locale to use for Web Speech APIs.
|
|
141
|
+
*
|
|
142
|
+
* We prefer the document language when the host app declares one, then fall
|
|
143
|
+
* back to the browser locale, and finally to English as a stable default.
|
|
144
|
+
*/
|
|
145
|
+
function resolveBrowserLanguage() {
|
|
146
|
+
if (typeof document !== "undefined") {
|
|
147
|
+
const documentLanguage = document.documentElement.lang.trim();
|
|
148
|
+
if (documentLanguage) return documentLanguage;
|
|
149
|
+
}
|
|
150
|
+
if (typeof navigator !== "undefined" && navigator.language) return navigator.language;
|
|
151
|
+
return "en-US";
|
|
152
|
+
}
|
|
153
|
+
//#endregion
|
|
154
|
+
//#region src/core/services/browser-speech.ts
/** Return the global speechSynthesis object, or undefined when this runtime lacks it. */
function getSpeechSynthesis() {
	if (typeof globalThis.speechSynthesis === "undefined") return void 0;
	return globalThis.speechSynthesis;
}
|
|
158
|
+
/** Return the global SpeechSynthesisUtterance constructor, or undefined when absent. */
function getSpeechSynthesisUtterance() {
	if (typeof globalThis.SpeechSynthesisUtterance === "undefined") return void 0;
	return globalThis.SpeechSynthesisUtterance;
}
|
|
161
|
+
/**
 * Build an Error describing a speech synthesis failure, including the event's
 * error code when one is present.
 */
function toSpeechError(event) {
	const code = event?.error;
	if (code) return new Error(`Browser speech failed: ${code}`);
	return new Error("Browser speech failed");
}
|
|
165
|
+
/**
 * Browser-backed speech synthesis using the Web Speech API.
 */
var BrowserSpeechService = class {
	// Unhooks the abort listener registered for the in-flight speak() call.
	removeAbortListener = null;
	// Settler of the in-flight speak() promise; lets stop() resolve it early.
	settleSpeech = null;
	// Utterance currently owned by this service, or null when idle.
	utterance = null;
	/**
	 * Report whether this runtime exposes the browser Web Speech synthesis APIs.
	 */
	isAvailable() {
		return Boolean(getSpeechSynthesis() && getSpeechSynthesisUtterance());
	}
	/**
	 * Speak a single text segment in the browser.
	 *
	 * Each queue item owns its own utterance. We only stop an existing utterance
	 * when this service still has one in flight, so streamed playback does not
	 * spam global `speechSynthesis.cancel()` between already-completed segments.
	 */
	async speak(text, signal) {
		const speechSynthesis = getSpeechSynthesis();
		const SpeechSynthesisUtteranceCtor = getSpeechSynthesisUtterance();
		if (!speechSynthesis || !SpeechSynthesisUtteranceCtor) throw new Error("Browser speech is not supported");
		// Only cancel when we actually have an utterance/promise in flight.
		if (this.hasActiveSpeech()) this.stop();
		const normalizedText = normalizeSpeechText(text);
		// Nothing to say, or caller already aborted: resolve immediately.
		if (!normalizedText || signal?.aborted) return;
		const utterance = new SpeechSynthesisUtteranceCtor(normalizedText);
		utterance.lang = resolveBrowserLanguage();
		this.utterance = utterance;
		return new Promise((resolve, reject) => {
			let settled = false;
			// Single settle path: runs at most once, detaches the abort
			// listener, clears the utterance, then resolves/rejects.
			const settle = (outcome, error) => {
				if (settled) return;
				settled = true;
				// Only clear the service-level settler if it is still ours.
				if (this.settleSpeech === settle) this.settleSpeech = null;
				this.removeAbortListener?.();
				this.removeAbortListener = null;
				this.clearUtterance(utterance);
				if (outcome === "resolve") {
					resolve();
					return;
				}
				reject(error ?? /* @__PURE__ */ new Error("Browser speech failed"));
			};
			this.settleSpeech = settle;
			// Abort is treated as a normal completion, not an error.
			const abortHandler = () => {
				try {
					speechSynthesis.cancel();
				} catch {}
				settle("resolve");
			};
			if (signal) {
				signal.addEventListener("abort", abortHandler, { once: true });
				this.removeAbortListener = () => {
					signal.removeEventListener("abort", abortHandler);
				};
			}
			utterance.onend = () => {
				settle("resolve");
			};
			utterance.onerror = (event) => {
				// Cancelling an utterance fires onerror in some browsers;
				// when the caller aborted, report success rather than failure.
				if (signal?.aborted) {
					settle("resolve");
					return;
				}
				settle("reject", toSpeechError(event));
			};
			try {
				speechSynthesis.speak(utterance);
			} catch (error) {
				settle("reject", toError(error, "Browser speech failed to start"));
			}
		});
	}
	/**
	 * Stop the current utterance owned by this service, if one is active.
	 *
	 * We intentionally do nothing when the service is idle so we do not cancel
	 * unrelated speech synthesis work that host apps may be doing elsewhere.
	 */
	stop() {
		if (!this.hasActiveSpeech()) return;
		const speechSynthesis = getSpeechSynthesis();
		if (speechSynthesis) try {
			speechSynthesis.cancel();
		} catch {}
		// If a speak() promise is pending, its settle() does the full teardown.
		if (this.settleSpeech) {
			const settleSpeech = this.settleSpeech;
			this.settleSpeech = null;
			settleSpeech("resolve");
			return;
		}
		// No pending promise: tear down listener and utterance directly.
		this.removeAbortListener?.();
		this.removeAbortListener = null;
		this.clearUtterance(this.utterance);
	}
	// True while this service owns an utterance or an unsettled speak() promise.
	hasActiveSpeech() {
		return Boolean(this.utterance || this.settleSpeech);
	}
	// Detach handlers from an utterance and drop it if it is the current one.
	clearUtterance(utterance) {
		if (!utterance) return;
		utterance.onend = null;
		utterance.onerror = null;
		if (this.utterance === utterance) this.utterance = null;
	}
};
//#endregion
|
|
273
|
+
//#region src/core/services/live-transcription.ts
/** Look up the SpeechRecognition constructor, falling back to the WebKit-prefixed name. */
function getSpeechRecognitionConstructor() {
	const scope = globalThis;
	return scope.SpeechRecognition ?? scope.webkitSpeechRecognition;
}
|
|
278
|
+
/**
 * Build an Error for a recognition failure, preferring the event's own
 * message, then its error code, then a generic fallback.
 */
function toRecognitionError(event) {
	const code = event?.error;
	let message = event?.message;
	if (!message) {
		message = code ? `Browser transcription failed: ${code}` : "Browser transcription failed";
	}
	return new Error(message);
}
|
|
283
|
+
/**
 * Split recognition results into a final transcript (isFinal segments only)
 * and a live transcript (final plus interim), both whitespace-normalized.
 */
function buildTranscripts(results) {
	let finalText = "";
	let interimText = "";
	for (let i = 0; i < results.length; i += 1) {
		const segment = results[i]?.[0]?.transcript ?? "";
		if (!segment) continue;
		if (results[i].isFinal) {
			finalText += `${segment} `;
		} else {
			interimText += `${segment} `;
		}
	}
	const finalTranscript = normalizeSpeechText(finalText);
	const liveParts = [finalTranscript, normalizeSpeechText(interimText)].filter(Boolean);
	return {
		finalTranscript,
		liveTranscript: normalizeSpeechText(liveParts.join(" "))
	};
}
|
|
299
|
+
/**
 * Browser-backed live transcription using the Web Speech API.
 */
var LiveTranscriptionService = class {
	// Accumulated final (isFinal) transcript for the current session.
	finalTranscript = "";
	// Set once the recognition session has actually fired onstart.
	hasStarted = false;
	// Set once onend has fired for the current recognition instance.
	hasEnded = false;
	// Most recent recognition error, surfaced by stop()/start().
	lastError = null;
	// Subscriber for live (interim) transcript updates.
	partialCallback = null;
	// Active SpeechRecognition instance, or null when idle.
	recognition = null;
	// Settlers for the promise returned by start().
	startReject = null;
	startResolve = null;
	// Settlers for the promise created inside stop().
	stopReject = null;
	stopResolve = null;
	// True when this runtime exposes a (possibly prefixed) SpeechRecognition.
	isAvailable() {
		return Boolean(getSpeechRecognitionConstructor());
	}
	/**
	 * Register a callback for the latest browser transcript while the user is
	 * still speaking.
	 */
	onPartial(callback) {
		this.partialCallback = callback;
	}
	/**
	 * Start a new Web Speech recognition session.
	 */
	async start() {
		const SpeechRecognitionCtor = getSpeechRecognitionConstructor();
		if (!SpeechRecognitionCtor) throw new Error("Browser transcription is not supported");
		// Abort any previous session and reset state before starting fresh.
		this.dispose();
		const recognition = new SpeechRecognitionCtor();
		this.recognition = recognition;
		recognition.continuous = true;
		recognition.interimResults = true;
		recognition.maxAlternatives = 1;
		recognition.lang = resolveBrowserLanguage();
		recognition.onstart = () => {
			this.hasStarted = true;
			this.startResolve?.();
			this.startResolve = null;
			this.startReject = null;
		};
		recognition.onresult = (event) => {
			const transcripts = buildTranscripts(event.results);
			this.finalTranscript = transcripts.finalTranscript;
			this.partialCallback?.(transcripts.liveTranscript);
		};
		recognition.onerror = (event) => {
			this.lastError = toRecognitionError(event);
			// Errors before onstart fail the start() promise immediately;
			// later errors are reported when the session ends/stops.
			if (!this.hasStarted) {
				this.startReject?.(this.lastError);
				this.startResolve = null;
				this.startReject = null;
			}
		};
		recognition.onend = () => {
			this.hasEnded = true;
			// Ending without ever starting means start() must fail.
			if (!this.hasStarted) {
				const error = this.lastError ?? /* @__PURE__ */ new Error("Browser transcription ended before it could start");
				this.startReject?.(error);
				this.startResolve = null;
				this.startReject = null;
			}
			// If stop() is waiting, settle it with the error or final text.
			if (this.stopResolve || this.stopReject) {
				if (this.lastError) this.stopReject?.(this.lastError);
				else this.stopResolve?.(normalizeSpeechText(this.finalTranscript));
				this.stopResolve = null;
				this.stopReject = null;
			}
		};
		// Settled by onstart (success) or onerror/onend (failure).
		const started = new Promise((resolve, reject) => {
			this.startResolve = resolve;
			this.startReject = reject;
		});
		try {
			recognition.start();
		} catch (error) {
			this.clearRecognition();
			throw toError(error, "Browser transcription failed to start");
		}
		try {
			await started;
		} catch (error) {
			this.clearRecognition();
			throw toError(error, "Browser transcription failed to start");
		}
	}
	/**
	 * Stop the current recognition session and resolve with the final transcript.
	 */
	async stop() {
		// No active session: report the stored error or the accumulated text.
		if (!this.recognition) {
			if (this.lastError) throw this.lastError;
			return normalizeSpeechText(this.finalTranscript);
		}
		// Session already ended on its own: flush state synchronously.
		if (this.hasEnded) {
			const transcript = normalizeSpeechText(this.finalTranscript);
			const error = this.lastError;
			this.clearRecognition();
			if (error) throw error;
			return transcript;
		}
		const recognition = this.recognition;
		// Wait for onend to deliver the final transcript (or an error).
		return normalizeSpeechText(await new Promise((resolve, reject) => {
			this.stopResolve = resolve;
			this.stopReject = reject;
			try {
				recognition.stop();
			} catch (error) {
				reject(toError(error, "Browser transcription failed to stop"));
			}
		}).finally(() => {
			this.clearRecognition();
		}));
	}
	/**
	 * Abort the current recognition session and reset the service for reuse.
	 */
	dispose() {
		if (this.recognition) try {
			this.recognition.abort();
		} catch {}
		// Settle any pending start()/stop() promises so callers are not left hanging.
		this.startReject?.(/* @__PURE__ */ new Error("Browser transcription aborted"));
		this.stopResolve?.(normalizeSpeechText(this.finalTranscript));
		this.startResolve = null;
		this.startReject = null;
		this.stopResolve = null;
		this.stopReject = null;
		this.clearRecognition();
		this.resetSessionState();
	}
	// Detach all handlers from the recognition instance and drop it.
	clearRecognition() {
		if (!this.recognition) return;
		this.recognition.onstart = null;
		this.recognition.onresult = null;
		this.recognition.onerror = null;
		this.recognition.onend = null;
		this.recognition = null;
	}
	// Reset per-session flags/transcript and notify the partial subscriber.
	resetSessionState() {
		this.finalTranscript = "";
		this.hasStarted = false;
		this.hasEnded = false;
		this.lastError = null;
		this.partialCallback?.("");
	}
};
//#endregion
|
|
448
|
+
//#region src/core/bezier.ts
/**
 * Bezier flight animation for cursor pointing.
 */
/**
 * Evaluate the quadratic bezier curve B(t) = (1-t)²P₀ + 2(1-t)t·P₁ + t²P₂.
 *
 * @param p0 - Start point
 * @param p1 - Control point
 * @param p2 - End point
 * @param t - Progress in [0, 1]
 * @returns Interpolated point {x, y}
 */
function quadraticBezier(p0, p1, p2, t) {
	const u = 1 - t;
	const w0 = u * u;
	const w1 = 2 * u * t;
	const w2 = t * t;
	return {
		x: w0 * p0.x + w1 * p1.x + w2 * p2.x,
		y: w0 * p0.y + w1 * p1.y + w2 * p2.y
	};
}
|
|
462
|
+
/**
 * Derivative of the quadratic bezier: B'(t) = 2(1-t)(P₁-P₀) + 2t(P₂-P₁).
 * Used to orient the cursor along its flight path.
 */
function bezierTangent(p0, p1, p2, t) {
	const firstLeg = { x: p1.x - p0.x, y: p1.y - p0.y };
	const secondLeg = { x: p2.x - p1.x, y: p2.y - p1.y };
	const u = 1 - t;
	return {
		x: 2 * u * firstLeg.x + 2 * t * secondLeg.x,
		y: 2 * u * firstLeg.y + 2 * t * secondLeg.y
	};
}
|
|
472
|
+
/**
 * Cubic ease-in-out: accelerates through the first half and decelerates
 * through the second, mapping [0, 1] onto [0, 1].
 */
function easeInOutCubic(t) {
	if (t < 0.5) {
		return 4 * t * t * t;
	}
	return 1 - (-2 * t + 2) ** 3 / 2;
}
|
|
478
|
+
/**
 * Animate cursor along a parabolic bezier arc from start to end.
 * Used when the AI points at a UI element.
 *
 * The control point sits above the midpoint by 20% of the travel distance
 * (giving the flight its arc); rotation tracks the curve tangent and scale
 * pulses up to 1.3x at mid-flight.
 *
 * @param from - Starting position
 * @param to - Target position
 * @param durationMs - Flight duration in milliseconds
 * @param callbacks - Frame and completion callbacks
 * @returns Cancel function to stop the animation
 */
function animateBezierFlight(from, to, durationMs, callbacks) {
	const startTime = performance.now();
	const travelDistance = Math.hypot(to.x - from.x, to.y - from.y);
	const apex = {
		x: (from.x + to.x) / 2,
		y: Math.min(from.y, to.y) - travelDistance * 0.2
	};
	let frameId;
	const step = (now) => {
		const linearProgress = Math.min((now - startTime) / durationMs, 1);
		const eased = easeInOutCubic(linearProgress);
		const position = quadraticBezier(from, apex, to, eased);
		const tangent = bezierTangent(from, apex, to, eased);
		const rotation = Math.atan2(tangent.y, tangent.x);
		const scale = 1 + Math.sin(linearProgress * Math.PI) * 0.3;
		callbacks.onFrame(position, rotation, scale);
		if (linearProgress < 1) {
			frameId = requestAnimationFrame(step);
		} else {
			callbacks.onComplete();
		}
	};
	frameId = requestAnimationFrame(step);
	return () => cancelAnimationFrame(frameId);
}
//#endregion
|
|
512
|
+
//#region src/core/services/pointer-controller.ts
// How long (ms) the cursor stays anchored to a target before auto-releasing.
const POINTING_LOCK_TIMEOUT_MS = 1e4;
/**
 * Controller for cursor pointing behavior.
 * Manages the pointer state machine (follow -> flying -> anchored -> follow)
 * and cursor animation.
 */
var PointerController = class {
	// Current state: "follow" | "flying" | "anchored".
	mode = "follow";
	// Cancels the in-flight bezier animation, when one is running.
	cancelAnimation = null;
	// Timer that auto-releases an anchored pointer after the lock timeout.
	releaseTimeout = null;
	// Subscribers notified on every state transition.
	listeners = /* @__PURE__ */ new Set();
	/**
	 * Animate cursor to point at a target.
	 */
	pointAt(target) {
		// Reset any previous flight/anchor before starting a new one.
		this.release();
		this.mode = "flying";
		$pointingTarget.set(target);
		const startPos = $buddyPosition.get();
		const endPos = {
			x: target.x,
			y: target.y
		};
		this.cancelAnimation = animateBezierFlight(startPos, endPos, 800, {
			// Each animation frame drives the buddy's transform atoms.
			onFrame: (position, rotation, scale) => {
				$buddyPosition.set(position);
				$buddyRotation.set(rotation);
				$buddyScale.set(scale);
			},
			onComplete: () => {
				this.cancelAnimation = null;
				this.mode = "anchored";
				// Snap exactly onto the target with a neutral transform.
				$buddyPosition.set(endPos);
				$buddyRotation.set(0);
				$buddyScale.set(1);
				this.scheduleRelease();
				this.notify();
			}
		});
		this.notify();
	}
	/**
	 * Release the cursor from pointing mode back to follow mode.
	 */
	release() {
		if (this.cancelAnimation) {
			this.cancelAnimation();
			this.cancelAnimation = null;
		}
		if (this.releaseTimeout) {
			clearTimeout(this.releaseTimeout);
			this.releaseTimeout = null;
		}
		this.mode = "follow";
		$pointingTarget.set(null);
		// Jump back onto the real cursor with a neutral transform.
		$buddyPosition.set($cursorPosition.get());
		$buddyRotation.set(0);
		$buddyScale.set(1);
		this.notify();
	}
	/**
	 * Check if cursor is currently pointing (flying or anchored).
	 */
	isPointing() {
		return this.mode !== "follow";
	}
	/**
	 * Get current pointer mode.
	 */
	getMode() {
		return this.mode;
	}
	/**
	 * Subscribe to pointer state changes.
	 * @returns Unsubscribe function.
	 */
	subscribe(listener) {
		this.listeners.add(listener);
		return () => this.listeners.delete(listener);
	}
	/**
	 * Update buddy position to follow cursor when in follow mode.
	 * Call this on cursor position changes.
	 */
	updateFollowPosition() {
		if (this.mode === "follow") {
			$buddyPosition.set($cursorPosition.get());
			$buddyRotation.set(0);
			$buddyScale.set(1);
		}
	}
	// Arm the auto-release timer for an anchored pointer.
	scheduleRelease() {
		this.releaseTimeout = setTimeout(() => {
			this.releaseTimeout = null;
			this.release();
		}, POINTING_LOCK_TIMEOUT_MS);
	}
	// Invoke every subscriber after a state change.
	notify() {
		this.listeners.forEach((listener) => listener());
	}
};
//#endregion
|
|
614
|
+
//#region src/core/utils/annotations.ts
// Default marker drawing style; individual fields can be overridden via the
// `style` argument of drawAnnotations().
const DEFAULT_STYLE = {
	borderColor: "rgba(255, 0, 0, 0.8)",
	labelBackground: "rgba(255, 0, 0, 0.9)",
	labelColor: "#ffffff",
	borderWidth: 2,
	fontSize: 15,
	labelPadding: 4
};
|
|
623
|
+
/**
 * Draw annotation markers onto a canvas.
 * Modifies the canvas in place: each marker gets a border rectangle plus a
 * small numbered label at its top-left corner (the label drops inside the
 * rect when there is no room above it).
 *
 * @param ctx Canvas 2D context to draw on
 * @param markers Marker map from element discovery
 * @param style Optional style overrides
 */
function drawAnnotations(ctx, markers, style = {}) {
	const resolved = {
		...DEFAULT_STYLE,
		...style
	};
	ctx.save();
	for (const { rect, id } of markers.values()) {
		// Border around the element.
		ctx.strokeStyle = resolved.borderColor;
		ctx.lineWidth = resolved.borderWidth;
		ctx.strokeRect(rect.left, rect.top, rect.width, rect.height);
		// Numbered label pill.
		const label = String(id);
		ctx.font = `bold ${resolved.fontSize}px monospace`;
		const textWidth = ctx.measureText(label).width;
		const labelWidth = textWidth + resolved.labelPadding * 2;
		const labelHeight = resolved.fontSize + resolved.labelPadding;
		const labelX = rect.left - resolved.borderWidth;
		const fitsAbove = rect.top >= labelHeight + 4;
		const labelY = fitsAbove ? rect.top - labelHeight : rect.top + 2;
		ctx.fillStyle = resolved.labelBackground;
		ctx.beginPath();
		ctx.roundRect(labelX, labelY, labelWidth, labelHeight, 2);
		ctx.fill();
		ctx.fillStyle = resolved.labelColor;
		ctx.textBaseline = "top";
		ctx.fillText(label, labelX + resolved.labelPadding, labelY + resolved.labelPadding / 2);
	}
	ctx.restore();
}
|
|
660
|
+
/**
 * Create an annotated copy of a canvas without modifying the original.
 *
 * @param sourceCanvas Original screenshot canvas
 * @param markers Marker map from element discovery
 * @returns New canvas with annotations drawn
 * @throws When a 2D context cannot be acquired for the copy
 */
function createAnnotatedCanvas(sourceCanvas, markers) {
	const copy = document.createElement("canvas");
	copy.width = sourceCanvas.width;
	copy.height = sourceCanvas.height;
	const context = copy.getContext("2d");
	if (!context) throw new Error("Failed to get canvas 2D context");
	context.drawImage(sourceCanvas, 0, 0);
	drawAnnotations(context, markers);
	return copy;
}
|
|
678
|
+
/**
 * Generate marker context string for the AI prompt: a header line followed by
 * one indented "id: description" line per marker.
 *
 * @param markers Marker map from element discovery
 * @returns Formatted string listing markers
 */
function generateMarkerContext(markers) {
	if (markers.size === 0) return "No interactive elements detected.";
	const entries = [...markers.values()].map((marker) => ` ${marker.id}: ${marker.description}`);
	return ["Interactive elements (use marker number to point):", ...entries].join("\n");
}
//#endregion
|
|
692
|
+
//#region src/core/utils/elements.ts
/**
 * Element discovery for annotated screenshots.
 * Finds visible interactive elements and assigns marker IDs.
 */
/** Max characters for element descriptions passed to the model. */
const MAX_DESCRIPTION_LENGTH = 50;
/** Pixels tolerance for grouping elements into the same visual row. */
const ROW_TOLERANCE_PX = 20;
/**
 * Interactive element selectors - elements users would want to click/interact with.
 * Mirrors accessibility roles from agent-browser but using CSS selectors.
 */
const INTERACTIVE_SELECTORS = [
	"button",
	"[role=\"button\"]",
	"input[type=\"button\"]",
	"input[type=\"submit\"]",
	"input[type=\"reset\"]",
	"a[href]",
	"[role=\"link\"]",
	"input:not([type=\"hidden\"])",
	"textarea",
	"select",
	"[role=\"textbox\"]",
	"[role=\"searchbox\"]",
	"[role=\"combobox\"]",
	"[role=\"listbox\"]",
	"[role=\"slider\"]",
	"[role=\"spinbutton\"]",
	"[role=\"checkbox\"]",
	"[role=\"radio\"]",
	"[role=\"switch\"]",
	"[role=\"menuitem\"]",
	"[role=\"menuitemcheckbox\"]",
	"[role=\"menuitemradio\"]",
	"[role=\"option\"]",
	"[role=\"tab\"]",
	"[role=\"treeitem\"]",
	"video",
	"audio",
	// Escape hatch: host apps can opt arbitrary elements in with this attribute.
	"[data-cursor-buddy-interactive]"
];
|
|
735
|
+
/**
 * Check if an element is visible in the viewport: non-empty rect, at least
 * partially inside the viewport, and not hidden via CSS (display, visibility,
 * or zero opacity).
 */
function isElementVisible(element, rect = element.getBoundingClientRect()) {
	const hasArea = rect.width > 0 && rect.height > 0;
	if (!hasArea) return false;
	const outsideViewport = rect.bottom < 0 || rect.top > window.innerHeight || rect.right < 0 || rect.left > window.innerWidth;
	if (outsideViewport) return false;
	const style = window.getComputedStyle(element);
	if (style.display === "none" || style.visibility === "hidden") return false;
	return Number.parseFloat(style.opacity) !== 0;
}
|
|
746
|
+
/** Clamp an element description to MAX_DESCRIPTION_LENGTH characters. */
function truncateDescription(value) {
	return value.substring(0, MAX_DESCRIPTION_LENGTH);
}
|
|
749
|
+
/**
 * Produce a short human-readable description of an element for the model:
 * aria-label first, then visible text / placeholder / alt by tag, then the
 * role attribute, then the bare tag name.
 */
function describeElement(element) {
	const tag = element.tagName.toLowerCase();
	const ariaLabel = element.getAttribute("aria-label");
	if (ariaLabel) return truncateDescription(ariaLabel);
	switch (tag) {
		case "button":
		case "a": {
			const text = element.textContent?.trim();
			if (text) return truncateDescription(text);
			break;
		}
		case "input":
		case "textarea": {
			const placeholder = element.getAttribute("placeholder");
			if (placeholder) return truncateDescription(placeholder);
			return `${element.getAttribute("type") || "text"} input`;
		}
		case "img": {
			const alt = element.getAttribute("alt");
			if (alt) return truncateDescription(alt);
			return "image";
		}
		default:
			break;
	}
	const role = element.getAttribute("role");
	return role ? role : tag;
}
|
|
774
|
+
/**
 * Gather every interactive element currently visible in the viewport,
 * ordered top-to-bottom by row (with ROW_TOLERANCE_PX tolerance) and then
 * left-to-right within a row.
 */
function collectVisibleInteractiveElements() {
  const matches = document.querySelectorAll(INTERACTIVE_SELECTORS.join(","));
  const visible = [];
  for (const element of matches) {
    const rect = element.getBoundingClientRect();
    if (isElementVisible(element, rect)) visible.push({ element, rect });
  }
  const rowOf = (rect) => Math.floor(rect.top / ROW_TOLERANCE_PX);
  visible.sort(
    (a, b) => rowOf(a.rect) - rowOf(b.rect) || a.rect.left - b.rect.left
  );
  return visible;
}
|
|
793
|
+
/**
 * Build a map of numbered markers for the visible interactive elements.
 * IDs are sequential, starting from 1, in the collection's sorted order.
 */
function createMarkerMap() {
  const map = new Map();
  let nextId = 1;
  for (const { element, rect } of collectVisibleInteractiveElements()) {
    map.set(nextId, {
      id: nextId,
      element,
      rect,
      description: describeElement(element)
    });
    nextId += 1;
  }
  return map;
}
|
|
811
|
+
/**
 * Compute an element's center point in viewport coordinates, rounded to the
 * nearest integer pixel.
 */
function getElementCenter(element) {
  const { left, top, width, height } = element.getBoundingClientRect();
  return {
    x: Math.round(left + width / 2),
    y: Math.round(top + height / 2)
  };
}
|
|
821
|
+
/**
 * Translate a marker ID into current viewport coordinates.
 * Yields null when the marker is unknown, its element has been detached from
 * the document, or the element is no longer visible.
 */
function resolveMarkerToCoordinates(markerMap, markerId) {
  const marker = markerMap.get(markerId);
  if (marker == null) return null;
  const stillUsable =
    document.contains(marker.element) && isElementVisible(marker.element);
  return stillUsable ? getElementCenter(marker.element) : null;
}
|
|
832
|
+
//#endregion
//#region src/core/utils/screenshot.ts
/** How long (ms) to wait for cloned-document stylesheets before giving up. */
const CLONE_RESOURCE_TIMEOUT_MS = 3000;
/** Maximum width for compressed screenshots (maintains aspect ratio) */
const MAX_SCREENSHOT_WIDTH = 1280;
/** JPEG quality for compressed screenshots (0-1) */
const JPEG_QUALITY = 0.8;
|
|
839
|
+
/**
 * Downscale a canvas to at most `maxWidth` pixels wide and re-encode as JPEG.
 * Keeps the aspect ratio. If no scaling is needed, or a 2D context cannot be
 * obtained, the source canvas is encoded as-is.
 *
 * @param sourceCanvas - The source canvas to compress
 * @param maxWidth - Maximum width for the compressed image (default: MAX_SCREENSHOT_WIDTH)
 * @param quality - JPEG quality 0-1 (default: JPEG_QUALITY)
 * @returns Compression result with `imageData` (data URL), `width`, `height`
 */
function compressImage(sourceCanvas, maxWidth = MAX_SCREENSHOT_WIDTH, quality = JPEG_QUALITY) {
  // Shared fallback: encode the source canvas at its native size.
  const encodeAsIs = () => ({
    imageData: sourceCanvas.toDataURL("image/jpeg", quality),
    width: sourceCanvas.width,
    height: sourceCanvas.height
  });
  if (sourceCanvas.width <= maxWidth) return encodeAsIs();
  const targetWidth = Math.round(maxWidth);
  const targetHeight = Math.round(sourceCanvas.height * (maxWidth / sourceCanvas.width));
  const scaled = document.createElement("canvas");
  scaled.width = targetWidth;
  scaled.height = targetHeight;
  const ctx = scaled.getContext("2d");
  if (!ctx) return encodeAsIs();
  ctx.imageSmoothingEnabled = true;
  ctx.imageSmoothingQuality = "high";
  ctx.drawImage(sourceCanvas, 0, 0, targetWidth, targetHeight);
  return {
    imageData: scaled.toDataURL("image/jpeg", quality),
    width: targetWidth,
    height: targetHeight
  };
}
|
|
877
|
+
/** Snapshot the current viewport dimensions for a capture pass. */
function getCaptureMetrics() {
  const { innerWidth, innerHeight } = window;
  return {
    viewportWidth: innerWidth,
    viewportHeight: innerHeight
  };
}
|
|
883
|
+
/**
 * Resolve after two animation frames in the document's window, i.e. once the
 * browser has had a chance to paint. Resolves immediately when the document
 * has no window or no requestAnimationFrame.
 */
function waitForNextPaint(doc) {
  const view = doc.defaultView;
  if (!view?.requestAnimationFrame) return Promise.resolve();
  return new Promise((resolve) => {
    view.requestAnimationFrame(() => view.requestAnimationFrame(() => resolve()));
  });
}
|
|
892
|
+
/**
 * Check whether a stylesheet <link> has finished loading. Reading cssRules on
 * a cross-origin sheet throws a SecurityError, but that still means the sheet
 * itself has loaded, so it counts as ready.
 */
function isStylesheetReady(link) {
  const { sheet } = link;
  if (!sheet) return false;
  try {
    void sheet.cssRules;
    return true;
  } catch (error) {
    return error instanceof DOMException && error.name === "SecurityError";
  }
}
|
|
902
|
+
/**
 * Resolve once a stylesheet <link> is usable (loaded, errored, or blocked by
 * CORS), or after CLONE_RESOURCE_TIMEOUT_MS as a safety net so one stuck
 * stylesheet can never hang the capture.
 */
function waitForStylesheetLink(link) {
  if (isStylesheetReady(link)) return Promise.resolve();
  return new Promise((resolve) => {
    let settled = false;
    let timeoutId = 0;
    const finish = () => {
      if (settled) return;
      settled = true;
      window.clearTimeout(timeoutId);
      link.removeEventListener("load", handleReady);
      // Bug fix: the "error" listener registered below is `finish`, not
      // `handleReady`; removal must use the same function reference or the
      // listener is never actually removed here.
      link.removeEventListener("error", finish);
      resolve();
    };
    const handleReady = () => {
      if (isStylesheetReady(link)) {
        finish();
        return;
      }
      // The sheet may only become readable on the next frame; re-check then.
      window.requestAnimationFrame(() => {
        if (isStylesheetReady(link)) finish();
      });
    };
    timeoutId = window.setTimeout(finish, CLONE_RESOURCE_TIMEOUT_MS);
    link.addEventListener("load", handleReady, { once: true });
    link.addEventListener("error", finish, { once: true });
    // Handle the race where the sheet became ready between the first check
    // and listener registration.
    handleReady();
  });
}
|
|
930
|
+
/**
 * Wait until the cloned document used by html2canvas has settled: every
 * stylesheet link resolved, fonts loaded, and one paint flushed.
 */
async function waitForClonedDocumentStyles(doc) {
  const links = doc.querySelectorAll("link[rel=\"stylesheet\"][href]");
  await Promise.all([...links].map((link) => waitForStylesheetLink(link)));
  if (doc.fonts?.ready) await doc.fonts.ready;
  await waitForNextPaint(doc);
}
|
|
936
|
+
/**
 * Assemble the html2canvas option object for a viewport-sized capture that
 * accounts for the current scroll offset and waits for cloned-document styles.
 */
function getHtml2CanvasOptions(captureMetrics) {
  const { viewportWidth, viewportHeight } = captureMetrics;
  const { scrollX, scrollY } = window;
  return {
    scale: 1,
    useCORS: true,
    logging: false,
    width: viewportWidth,
    height: viewportHeight,
    windowWidth: viewportWidth,
    windowHeight: viewportHeight,
    x: scrollX,
    y: scrollY,
    scrollX,
    scrollY,
    onclone: async (doc) => {
      await waitForClonedDocumentStyles(doc);
    }
  };
}
|
|
954
|
+
/**
 * Build a plain placeholder canvas for when screenshot capture fails:
 * a light gray viewport-sized rectangle with a centered error message.
 */
function createFallbackCanvas() {
  const canvas = document.createElement("canvas");
  canvas.width = window.innerWidth;
  canvas.height = window.innerHeight;
  const ctx = canvas.getContext("2d");
  if (!ctx) return canvas;
  ctx.fillStyle = "#f0f0f0";
  ctx.fillRect(0, 0, canvas.width, canvas.height);
  ctx.fillStyle = "#666";
  ctx.font = "16px sans-serif";
  ctx.textAlign = "center";
  ctx.fillText("Screenshot unavailable", canvas.width / 2, canvas.height / 2);
  return canvas;
}
|
|
973
|
+
/**
 * Capture the current viewport as a compressed screenshot.
 * Renders the DOM with html2canvas, then downscales to JPEG; if rendering
 * fails a placeholder canvas is used, and if compression fails the raw
 * canvas is returned as PNG.
 */
async function captureViewport() {
  const captureMetrics = getCaptureMetrics();
  let canvas;
  try {
    canvas = await html2canvas(document.body, getHtml2CanvasOptions(captureMetrics));
  } catch {
    canvas = createFallbackCanvas();
  }
  let compressed;
  try {
    compressed = compressImage(canvas);
  } catch {
    compressed = {
      imageData: canvas.toDataURL("image/png"),
      width: canvas.width,
      height: canvas.height
    };
  }
  const { imageData, width, height } = compressed;
  return {
    imageData,
    width,
    height,
    viewportWidth: captureMetrics.viewportWidth,
    viewportHeight: captureMetrics.viewportHeight
  };
}
|
|
1004
|
+
/**
 * Capture the viewport with numbered marker overlays on interactive elements.
 * Returns the annotated image together with the marker map (for resolving
 * IDs later) and a textual marker context.
 */
async function captureAnnotatedViewport() {
  const captureMetrics = getCaptureMetrics();
  const markerMap = createMarkerMap();
  let sourceCanvas;
  try {
    sourceCanvas = await html2canvas(document.body, getHtml2CanvasOptions(captureMetrics));
  } catch {
    sourceCanvas = createFallbackCanvas();
  }
  // Only draw the overlay when there is at least one marker to show.
  const hasMarkers = markerMap.size > 0;
  const canvas = hasMarkers ? createAnnotatedCanvas(sourceCanvas, markerMap) : sourceCanvas;
  const markerContext = generateMarkerContext(markerMap);
  let compressed;
  try {
    compressed = compressImage(canvas);
  } catch {
    compressed = {
      imageData: canvas.toDataURL("image/png"),
      width: canvas.width,
      height: canvas.height
    };
  }
  return {
    imageData: compressed.imageData,
    width: compressed.width,
    height: compressed.height,
    viewportWidth: captureMetrics.viewportWidth,
    viewportHeight: captureMetrics.viewportHeight,
    markerMap,
    markerContext
  };
}
|
|
1040
|
+
//#endregion
//#region src/core/services/screen-capture.ts
/**
 * Framework-agnostic service for capturing viewport screenshots.
 */
var ScreenCaptureService = class {
  /**
   * Take a plain screenshot of the current viewport.
   * @returns Screenshot result with image data and dimensions
   */
  async capture() {
    return await captureViewport();
  }
  /**
   * Take a screenshot with numbered marker overlays on interactive elements.
   * @returns Annotated screenshot result with marker map
   */
  async captureAnnotated() {
    return await captureAnnotatedViewport();
  }
};
|
|
1062
|
+
//#endregion
//#region src/core/services/tts-playback-queue.ts
/**
 * Queues sentence-level speech preparation immediately while keeping playback
 * strictly ordered.
 *
 * Preparation is allowed to run ahead of playback so server synthesis can
 * overlap with the currently playing segment, but the returned playback tasks
 * still execute one-by-one in enqueue order.
 */
var TTSPlaybackQueue = class {
  // First failure encountered; once set, further enqueues are rejected.
  error = null;
  // Whether onPlaybackStart has already fired for this queue instance.
  hasStartedPlayback = false;
  // Optional callback invoked once with the first recorded error.
  onError;
  // Optional callback invoked once, right before the first segment plays.
  onPlaybackStart;
  // Promise chain that serializes playback; each enqueue appends to it.
  playbackChain = Promise.resolve();
  // Callback that starts synthesis for a segment and resolves to a play() thunk.
  prepare;
  // Optional AbortSignal; when aborted, queued segments are skipped.
  signal;
  constructor(options) {
    this.onError = options.onError;
    this.onPlaybackStart = options.onPlaybackStart;
    this.prepare = options.prepare;
    this.signal = options.signal;
  }
  /**
   * Queue a speakable text segment.
   * Preparation starts immediately; actual playback waits for all earlier
   * segments to finish.
   */
  enqueue(text) {
    const normalizedText = text.trim();
    // Drop empty segments, and stop accepting work after a failure or abort.
    if (!normalizedText || this.error || this.signal?.aborted) return;
    // Start synthesis right away so it overlaps with current playback.
    const preparedPlaybackTask = this.prepare(normalizedText, this.signal);
    // Attach a rejection handler immediately so a preparation failure does
    // not surface as an unhandled rejection before the chain reaches it.
    preparedPlaybackTask.catch((error) => {
      this.fail(toError(error));
    });
    this.playbackChain = this.playbackChain.then(async () => {
      if (this.signal?.aborted) return;
      const play = await preparedPlaybackTask;
      // Re-check: an abort may have happened while awaiting preparation.
      if (this.signal?.aborted) return;
      if (!this.hasStartedPlayback) {
        this.hasStartedPlayback = true;
        this.onPlaybackStart?.();
      }
      await play();
    }).catch((error) => {
      this.fail(toError(error));
    });
  }
  /**
   * Wait until every queued segment has either played or the queue failed.
   * @throws the first recorded error, if any
   */
  async waitForCompletion() {
    await this.playbackChain;
    if (this.error) throw this.error;
  }
  // Record the first error and notify the listener exactly once.
  fail(error) {
    if (this.error) return;
    this.error = error;
    this.onError?.(error);
  }
};
|
|
1122
|
+
//#endregion
//#region src/core/utils/audio.ts
/**
 * Audio conversion utilities for voice capture.
 * Converts Float32 audio data to WAV format for server transcription.
 */
/**
 * Concatenate multiple Float32Array chunks into one contiguous array.
 */
function mergeAudioChunks(chunks) {
  let totalLength = 0;
  for (const chunk of chunks) totalLength += chunk.length;
  const merged = new Float32Array(totalLength);
  let writeIndex = 0;
  for (const chunk of chunks) {
    merged.set(chunk, writeIndex);
    writeIndex += chunk.length;
  }
  return merged;
}
|
|
1141
|
+
/**
 * Write Float32 samples into a DataView as little-endian 16-bit PCM.
 * Samples are clamped to [-1, 1] before scaling.
 */
function floatTo16BitPCM(output, offset, input) {
  let writeOffset = offset;
  for (const value of input) {
    const sample = Math.max(-1, Math.min(1, value));
    // Asymmetric scaling keeps -1 at -32768 and +1 at +32767.
    const scaled = sample < 0 ? sample * 32768 : sample * 32767;
    output.setInt16(writeOffset, scaled, true);
    writeOffset += 2;
  }
}
|
|
1150
|
+
/**
 * Write a string into a DataView one byte per character at the given offset.
 */
function writeString(view, offset, string) {
  for (let index = 0; index < string.length; index += 1) {
    view.setUint8(offset + index, string.charCodeAt(index));
  }
}
|
|
1156
|
+
/**
 * Encode mono Float32 audio samples as a 16-bit PCM WAV file blob.
 *
 * Writes the standard 44-byte RIFF/WAVE header followed by the PCM payload.
 * @param samples - Mono Float32 samples, expected in [-1, 1]
 * @param sampleRate - Sample rate in Hz written into the header
 * @returns Blob with MIME type "audio/wav"
 */
function encodeWAV(samples, sampleRate) {
  const numChannels = 1;
  const bitsPerSample = 16;
  const bytesPerSample = bitsPerSample / 8;
  const blockAlign = numChannels * bytesPerSample;
  const dataLength = samples.length * bytesPerSample;
  // 44 bytes = fixed RIFF + fmt + data header size for PCM WAV.
  const buffer = new ArrayBuffer(44 + dataLength);
  const view = new DataView(buffer);
  writeString(view, 0, "RIFF");
  view.setUint32(4, 36 + dataLength, true); // RIFF chunk size = file size - 8
  writeString(view, 8, "WAVE");
  writeString(view, 12, "fmt ");
  view.setUint32(16, 16, true); // fmt chunk size (16 for PCM)
  view.setUint16(20, 1, true); // audio format 1 = uncompressed PCM
  view.setUint16(22, numChannels, true);
  view.setUint32(24, sampleRate, true);
  view.setUint32(28, sampleRate * blockAlign, true); // byte rate
  view.setUint16(32, blockAlign, true);
  view.setUint16(34, bitsPerSample, true);
  writeString(view, 36, "data");
  view.setUint32(40, dataLength, true);
  floatTo16BitPCM(view, 44, samples);
  return new Blob([buffer], { type: "audio/wav" });
}
|
|
1183
|
+
//#endregion
//#region src/core/utils/audio-worklet.ts
/**
 * AudioWorklet processor code for voice capture.
 * Inlined as a blob URL to avoid separate file serving requirements.
 *
 * The processor buffers input into fixed-size Float32 chunks posted as
 * "audio" messages, aggregates RMS/peak into periodic "level" messages, and
 * answers a "flush" request with "flush-complete" after emitting any
 * partially-filled buffer.
 */
const workletCode = `
class AudioCaptureProcessor extends AudioWorkletProcessor {
  constructor() {
    super()
    this.isRecording = true
    this.audioChunkSize = 2048
    this.audioBuffer = new Float32Array(this.audioChunkSize)
    this.audioBufferIndex = 0
    this.levelFramesPerUpdate = 4
    this.levelFrameCount = 0
    this.levelRmsSum = 0
    this.levelPeak = 0

    this.port.onmessage = (event) => {
      if (event.data?.type === "flush") {
        this.flushAudio()
        this.flushLevel()
        this.port.postMessage({ type: "flush-complete" })
      }
    }
  }

  flushAudio() {
    if (this.audioBufferIndex === 0) return

    const chunk = this.audioBuffer.slice(0, this.audioBufferIndex)
    this.port.postMessage({
      type: "audio",
      data: chunk
    })
    this.audioBufferIndex = 0
  }

  flushLevel() {
    if (this.levelFrameCount === 0) return

    this.port.postMessage({
      type: "level",
      rms: this.levelRmsSum / this.levelFrameCount,
      peak: this.levelPeak
    })

    this.levelFrameCount = 0
    this.levelRmsSum = 0
    this.levelPeak = 0
  }

  process(inputs) {
    if (!this.isRecording) return false

    const input = inputs[0]
    if (input && input.length > 0) {
      const channelData = input[0]
      let sum = 0
      let peak = 0
      for (let i = 0; i < channelData.length; i++) {
        const sample = channelData[i]
        sum += sample * sample
        const absolute = Math.abs(sample)
        if (absolute > peak) peak = absolute
      }

      this.levelRmsSum += Math.sqrt(sum / channelData.length)
      this.levelPeak = Math.max(this.levelPeak, peak)
      this.levelFrameCount += 1

      if (this.levelFrameCount >= this.levelFramesPerUpdate) {
        this.flushLevel()
      }

      let readOffset = 0
      while (readOffset < channelData.length) {
        const remaining = this.audioBuffer.length - this.audioBufferIndex
        const copyLength = Math.min(remaining, channelData.length - readOffset)

        this.audioBuffer.set(
          channelData.subarray(readOffset, readOffset + copyLength),
          this.audioBufferIndex
        )

        this.audioBufferIndex += copyLength
        readOffset += copyLength

        if (this.audioBufferIndex >= this.audioBuffer.length) {
          this.flushAudio()
        }
      }
    }

    return true
  }
}

registerProcessor("audio-capture-processor", AudioCaptureProcessor)
`;
|
|
1284
|
+
// Lazily-created object URL for the worklet module, shared across calls.
let cachedBlobURL = null;
/**
 * Return a blob URL serving the audio worklet processor source.
 * The URL is created once and reused on subsequent calls.
 */
function createWorkletBlobURL() {
  if (cachedBlobURL) return cachedBlobURL;
  const blob = new Blob([workletCode], { type: "application/javascript" });
  cachedBlobURL = URL.createObjectURL(blob);
  return cachedBlobURL;
}
|
|
1296
|
+
//#endregion
//#region src/core/services/voice-capture.ts
/** Target microphone capture sample rate in Hz. */
const SAMPLE_RATE = 16000;
/** RMS readings below this are subtracted out as the noise floor. */
const AUDIO_LEVEL_NOISE_GATE = 0.0005;
/** Gain applied before logarithmic compression of the level. */
const AUDIO_LEVEL_INPUT_GAIN = 600;
/** Smoothing factor when the level is rising (fast attack). */
const AUDIO_LEVEL_ATTACK = 0.7;
/** Smoothing factor when the level is falling (slow release). */
const AUDIO_LEVEL_RELEASE = 0.25;
|
|
1303
|
+
/** Restrict `value` to the inclusive range [min, max]. */
function clamp$1(value, min, max) {
  if (value < min) return min;
  if (value > max) return max;
  return value;
}
|
|
1306
|
+
/**
 * Map a raw RMS reading to a 0-1 level: gate out the noise floor, then
 * apply logarithmic compression so quiet input remains visible.
 */
function normalizeAudioLevel(rms) {
  const gatedRms = Math.max(0, rms - AUDIO_LEVEL_NOISE_GATE);
  const compressed =
    Math.log1p(gatedRms * AUDIO_LEVEL_INPUT_GAIN) / Math.log1p(AUDIO_LEVEL_INPUT_GAIN);
  return clamp$1(compressed, 0, 1);
}
|
|
1310
|
+
/**
 * Exponentially smooth the displayed level toward `target`, rising quickly
 * (attack) and falling slowly (release).
 */
function smoothAudioLevel(current, target) {
  const factor = target > current ? AUDIO_LEVEL_ATTACK : AUDIO_LEVEL_RELEASE;
  return current + (target - current) * factor;
}
|
|
1314
|
+
/**
 * Framework-agnostic service for voice capture using AudioWorkletNode.
 *
 * Records mono microphone audio via an inline AudioWorklet, accumulates
 * Float32 chunks for WAV encoding, and reports smoothed 0-1 levels for
 * waveform visualization.
 */
var VoiceCaptureService = class {
  // Web Audio graph pieces; all null while not recording.
  audioContext = null;
  workletNode = null;
  sourceNode = null;
  // Zero-gain node connecting the worklet to the destination so the graph
  // keeps processing without making the microphone audible.
  silentGainNode = null;
  stream = null;
  // Captured Float32Array chunks from the worklet, in arrival order.
  chunks = [];
  // Subscriber for 0-1 level updates; preserved across turns (see dispose).
  levelCallback = null;
  // Smoothed level last reported to the callback.
  visualLevel = 0;
  // Resolver for an in-flight worklet flush; null when no flush is pending.
  flushResolve = null;
  /**
   * Register a callback to receive audio level updates (0-1).
   * Called at ~60fps during recording for waveform visualization.
   */
  onLevel(callback) {
    this.levelCallback = callback;
  }
  /**
   * Start recording audio from the microphone.
   * Builds the graph: mic source -> worklet -> silent gain -> destination.
   * @throws Error if microphone access is denied
   */
  async start() {
    this.chunks = [];
    this.visualLevel = 0;
    const stream = await navigator.mediaDevices.getUserMedia({ audio: {
      sampleRate: SAMPLE_RATE,
      channelCount: 1,
      echoCancellation: true,
      noiseSuppression: true
    } });
    this.stream = stream;
    const audioContext = new AudioContext({ sampleRate: SAMPLE_RATE });
    this.audioContext = audioContext;
    // Contexts can start suspended (autoplay policy); resume before wiring.
    await audioContext.resume();
    const workletURL = createWorkletBlobURL();
    await audioContext.audioWorklet.addModule(workletURL);
    const source = audioContext.createMediaStreamSource(stream);
    this.sourceNode = source;
    const workletNode = new AudioWorkletNode(audioContext, "audio-capture-processor");
    this.workletNode = workletNode;
    const silentGainNode = audioContext.createGain();
    silentGainNode.gain.value = 0;
    this.silentGainNode = silentGainNode;
    workletNode.port.onmessage = (event) => {
      const { type, data, rms, peak } = event.data;
      if (type === "audio") this.chunks.push(data);
      else if (type === "level" && this.levelCallback) {
        // Blend RMS with a scaled peak so short transients still register.
        const targetLevel = normalizeAudioLevel(Math.max(rms ?? 0, (peak ?? 0) * .6));
        this.visualLevel = smoothAudioLevel(this.visualLevel, targetLevel);
        this.levelCallback(this.visualLevel);
      } else if (type === "flush-complete") {
        this.flushResolve?.();
        this.flushResolve = null;
      }
    };
    source.connect(workletNode);
    workletNode.connect(silentGainNode);
    silentGainNode.connect(audioContext.destination);
  }
  /**
   * Stop recording and return the captured audio as a WAV blob.
   * Flushes any buffered worklet audio first, then tears the graph down
   * (tracks, nodes, context), resets the reported level, and encodes.
   */
  async stop() {
    await this.flushPendingAudio();
    if (this.stream) {
      this.stream.getTracks().forEach((track) => track.stop());
      this.stream = null;
    }
    if (this.sourceNode) {
      this.sourceNode.disconnect();
      this.sourceNode = null;
    }
    if (this.workletNode) {
      this.workletNode.disconnect();
      this.workletNode = null;
    }
    if (this.silentGainNode) {
      this.silentGainNode.disconnect();
      this.silentGainNode = null;
    }
    if (this.audioContext) {
      await this.audioContext.close();
      this.audioContext = null;
    }
    this.visualLevel = 0;
    this.levelCallback?.(0);
    const wavBlob = encodeWAV(mergeAudioChunks(this.chunks), SAMPLE_RATE);
    this.chunks = [];
    return wavBlob;
  }
  /**
   * Clean up all resources.
   *
   * The level callback is intentionally preserved so the same service instance
   * can be reused across multiple push-to-talk turns without re-registering
   * the waveform subscription from the client.
   */
  dispose() {
    if (this.stream) {
      this.stream.getTracks().forEach((track) => track.stop());
      this.stream = null;
    }
    if (this.sourceNode) {
      this.sourceNode.disconnect();
      this.sourceNode = null;
    }
    if (this.workletNode) {
      this.workletNode.disconnect();
      this.workletNode = null;
    }
    if (this.silentGainNode) {
      this.silentGainNode.disconnect();
      this.silentGainNode = null;
    }
    if (this.audioContext) {
      // NOTE(review): close() returns a promise that is not awaited here —
      // dispose() appears intentionally synchronous; confirm callers do not
      // depend on the context being fully closed on return.
      this.audioContext.close();
      this.audioContext = null;
    }
    this.chunks = [];
    this.visualLevel = 0;
    this.levelCallback?.(0);
    this.flushResolve = null;
  }
  // Ask the worklet to emit any partially-filled buffer, waiting at most
  // 50ms for its "flush-complete" acknowledgement before proceeding.
  async flushPendingAudio() {
    if (!this.workletNode) return;
    await new Promise((resolve) => {
      const timeoutId = setTimeout(() => {
        this.flushResolve = null;
        resolve();
      }, 50);
      this.flushResolve = () => {
        clearTimeout(timeoutId);
        resolve();
      };
      this.workletNode?.port.postMessage({ type: "flush" });
    });
  }
};
|
|
1455
|
+
//#endregion
//#region src/core/state-machine.ts
/**
 * State transition table for the voice interaction flow.
 * Maps current state + event type to next state.
 * Events missing from a state's row are ignored (no transition).
 */
const transitions = {
  // Waiting for the user to press the push-to-talk hotkey.
  idle: { HOTKEY_PRESSED: "listening" },
  // Recording; releasing the hotkey hands the audio off for processing.
  listening: {
    HOTKEY_RELEASED: "processing",
    ERROR: "idle"
  },
  // Awaiting a response; a new hotkey press interrupts back to listening.
  processing: {
    RESPONSE_STARTED: "responding",
    TTS_COMPLETE: "idle",
    HOTKEY_PRESSED: "listening",
    ERROR: "idle"
  },
  // Speaking the response; interruptible by a new hotkey press.
  responding: {
    TTS_COMPLETE: "idle",
    HOTKEY_PRESSED: "listening",
    ERROR: "idle"
  }
};
|
|
1479
|
+
/**
 * Create a simple typed state machine for the voice interaction flow.
 *
 * States: idle -> listening -> processing -> responding -> idle
 *
 * Supports interruption: pressing hotkey during processing or responding
 * immediately transitions back to listening.
 */
function createStateMachine(initial = "idle") {
  let state = initial;
  const listeners = new Set();
  const notify = () => {
    for (const listener of listeners) listener();
  };
  const transition = (event) => {
    const nextState = transitions[state][event.type];
    if (!nextState) return false;
    state = nextState;
    notify();
    return true;
  };
  return {
    getState: () => state,
    transition,
    subscribe: (listener) => {
      listeners.add(listener);
      return () => listeners.delete(listener);
    },
    reset: () => {
      state = "idle";
      notify();
    }
  };
}
|
|
1512
|
+
//#endregion
//#region src/core/utils/ui-stream-parser.ts
/**
 * Parse a single line from the UI message stream.
 * The stream format is SSE with "data: " prefix followed by JSON.
 * Returns a normalized chunk, or null for blank lines, "[DONE]" sentinels,
 * and unparseable payloads.
 */
function parseUIStreamLine(line) {
  const trimmed = line.trim();
  if (!trimmed) return null;
  const jsonStr = trimmed.startsWith("data: ") ? trimmed.slice(6) : trimmed;
  if (jsonStr === "[DONE]") return null;
  try {
    const chunk = JSON.parse(jsonStr);
    switch (chunk.type) {
      case "text-delta":
        return { type: "text-delta", delta: chunk.delta ?? "" };
      case "tool-input-available":
        return {
          type: "tool-input-available",
          toolName: chunk.toolName ?? "",
          input: chunk.input
        };
      case "finish":
        return { type: "finish" };
      case "error":
        return { type: "error", errorText: chunk.errorText ?? "Unknown error" };
      default:
        return { type: "unknown" };
    }
  } catch {
    return null;
  }
}
|
|
1547
|
+
/**
 * Check whether a parsed stream chunk is a "point" tool call whose input
 * object carries the required `type` and `label` fields.
 */
function isPointToolCall(chunk) {
  if (chunk.type !== "tool-input-available" || chunk.toolName !== "point") return false;
  const input = chunk.input;
  if (input == null || typeof input !== "object") return false;
  return "type" in input && "label" in input;
}
|
|
1553
|
+
//#endregion
//#region src/core/utils/response-processor.ts
/** Abbreviations whose trailing period does not end a sentence. */
const COMMON_ABBREVIATIONS = [
  "mr.",
  "mrs.",
  "ms.",
  "dr.",
  "prof.",
  "sr.",
  "jr.",
  "e.g.",
  "i.e."
];
/** Punctuation that may trail a sentence terminator (quotes, brackets). */
const CLOSING_PUNCTUATION = new Set([
  "\"",
  "'",
  "”",
  "’",
  ")",
  "]",
  "}"
]);
// NOTE(review): not referenced in this chunk; presumably gates "short"
// segment handling further down the file — confirm before removing.
const SHORT_SEGMENT_THRESHOLD = 24;
|
|
1576
|
+
/**
 * Heuristically decide whether the character at `index` ends a sentence.
 * Terminal punctuation and newlines always qualify; a period is rejected when
 * it sits inside a number (e.g. "3.14") or ends a known abbreviation.
 */
function isLikelySentenceBoundary(text, index) {
	const boundaryChar = text[index];
	switch (boundaryChar) {
		case "!":
		case "?":
		case "…":
		case "\n":
			return true;
		case ".":
			break;
		default:
			return false;
	}
	// Decimal point: digits on both sides means this is not a boundary.
	const before = text[index - 1] ?? "";
	const after = text[index + 1] ?? "";
	if (/\d/.test(before) && /\d/.test(after)) return false;
	// Abbreviation check: compare the tail (up to 10 chars + the period)
	// against the known abbreviation list, case-insensitively.
	const tail = text.slice(Math.max(0, index - 10), index + 1).toLowerCase();
	for (const abbreviation of COMMON_ABBREVIATIONS) {
		if (tail.endsWith(abbreviation)) return false;
	}
	return true;
}
|
|
1587
|
+
/**
 * Scan `text` from `start` for the end of the first completed sentence.
 * Returns the index just past the boundary (including any closing punctuation
 * and trailing whitespace), or null when no boundary has arrived yet.
 */
function findBoundaryEnd(text, start) {
	// Advance past any run of whitespace beginning at `from`.
	const consumeWhitespace = (from) => {
		let position = from;
		while (position < text.length && /\s/.test(text[position] ?? "")) position++;
		return position;
	};
	for (let cursor = start; cursor < text.length; cursor++) {
		// A newline is always a hard boundary.
		if (text[cursor] === "\n") return consumeWhitespace(cursor + 1);
		if (!isLikelySentenceBoundary(text, cursor)) continue;
		// Absorb closing quotes/brackets that belong to this sentence.
		let segmentEnd = cursor + 1;
		while (segmentEnd < text.length && CLOSING_PUNCTUATION.has(text[segmentEnd] ?? "")) segmentEnd++;
		if (segmentEnd < text.length) {
			// Require the next character to look like a sentence start
			// (whitespace, capital, or digit); otherwise keep scanning.
			const following = text[segmentEnd] ?? "";
			const looksLikeSentenceStart = /\s/.test(following) || /[A-Z0-9]/.test(following);
			if (!looksLikeSentenceStart) continue;
		}
		return consumeWhitespace(segmentEnd);
	}
	return null;
}
|
|
1606
|
+
/**
 * Split off every completed sentence at the front of `text`.
 * Returns the trimmed segments plus how many characters were consumed, so the
 * caller can keep the unfinished remainder for the next streaming chunk.
 */
function extractCompletedSegments(text) {
	const segments = [];
	let consumedLength = 0;
	for (;;) {
		if (consumedLength >= text.length) break;
		const boundaryEnd = findBoundaryEnd(text, consumedLength);
		// No further boundary yet — the rest is an incomplete sentence.
		if (boundaryEnd === null) break;
		const segment = text.slice(consumedLength, boundaryEnd).trim();
		if (segment.length > 0) segments.push(segment);
		consumedLength = boundaryEnd;
	}
	return {
		consumedLength,
		segments
	};
}
|
|
1621
|
+
/**
 * Processes a streaming AI SDK UI message stream response.
 * Extracts text for display/TTS and captures point tool calls.
 */
var ProgressiveResponseProcessor = class {
	// How many characters of rawText have already been emitted as speech segments.
	consumedTextLength = 0;
	// A segment shorter than SHORT_SEGMENT_THRESHOLD, held back to be merged
	// with the next segment (or flushed in finish()).
	pendingShortSegment = "";
	// Full accumulated response text (also what the UI displays).
	rawText = "";
	// Partial stream line carried over until the next newline arrives.
	buffer = "";
	// First point tool call seen in the stream, if any; later ones are ignored.
	pointToolCall = null;
	/**
	 * Push raw stream data and extract text chunks and tool calls.
	 * The UI message stream format is newline-delimited JSON.
	 */
	push(chunk) {
		this.buffer += chunk;
		const lines = this.buffer.split("\n");
		// The last element may be an incomplete line — keep it for next push.
		this.buffer = lines.pop() ?? "";
		const newTextParts = [];
		for (const line of lines) {
			const parsed = parseUIStreamLine(line);
			if (!parsed) continue;
			if (parsed.type === "text-delta") newTextParts.push(parsed.delta);
			else if (isPointToolCall(parsed)) {
				// Only the first point tool call of the turn is kept.
				if (!this.pointToolCall) this.pointToolCall = parsed.input;
			}
		}
		if (newTextParts.length > 0) this.rawText += newTextParts.join("");
		// Extract completed sentences only from the not-yet-consumed tail.
		const { consumedLength, segments } = extractCompletedSegments(this.rawText.slice(this.consumedTextLength));
		this.consumedTextLength += consumedLength;
		return {
			visibleText: this.rawText,
			speechSegments: this.coalesceSegments(segments),
			pointToolCall: this.pointToolCall
		};
	}
	/**
	 * Finalize processing and return any remaining text/tool call.
	 */
	finish() {
		// Flush any trailing partial line that never got a newline.
		if (this.buffer) {
			const parsed = parseUIStreamLine(this.buffer);
			if (parsed?.type === "text-delta") this.rawText += parsed.delta;
			else if (parsed && isPointToolCall(parsed) && !this.pointToolCall) this.pointToolCall = parsed.input;
			this.buffer = "";
		}
		const trailingText = this.rawText.slice(this.consumedTextLength).trim();
		// Merge any held-back short segment with the trailing remainder so
		// nothing is dropped from speech output.
		const finalSegmentParts = [this.pendingShortSegment, trailingText].filter(Boolean);
		this.pendingShortSegment = "";
		return {
			finalResponseText: this.rawText.trim(),
			speechSegments: finalSegmentParts.length ? [finalSegmentParts.join(" ").trim()] : [],
			pointToolCall: this.pointToolCall
		};
	}
	// Merge segments shorter than SHORT_SEGMENT_THRESHOLD into the following
	// segment, buffering across push() calls via pendingShortSegment.
	coalesceSegments(segments) {
		const speechSegments = [];
		for (const segment of segments) {
			const normalizedSegment = segment.trim();
			if (!normalizedSegment) continue;
			const candidate = this.pendingShortSegment ? `${this.pendingShortSegment} ${normalizedSegment}` : normalizedSegment;
			if (candidate.length < SHORT_SEGMENT_THRESHOLD) {
				// Still too short to speak naturally — hold for the next one.
				this.pendingShortSegment = candidate;
				continue;
			}
			this.pendingShortSegment = "";
			speechSegments.push(candidate);
		}
		return speechSegments;
	}
};
|
|
1692
|
+
//#endregion
|
|
1693
|
+
//#region src/core/client.ts
|
|
1694
|
+
/**
 * Clamp `value` into the inclusive range [min, max].
 */
function clamp(value, min, max) {
	const atLeastMin = Math.max(value, min);
	return Math.min(atLeastMin, max);
}
|
|
1697
|
+
/**
 * Extract a human-readable error message from a failed fetch Response.
 *
 * Preference order: the `error` field of a JSON body, then the raw body text,
 * then `fallbackMessage`. The body is read exactly once as text and JSON is
 * parsed from that string — the previous implementation called
 * `response.json()` and then `response.text()`, but a Response body can only
 * be consumed once, so when the JSON lacked an `error` field the `text()`
 * call threw on the already-used body and the usable message was lost.
 *
 * @param {Response} response - The failed HTTP response.
 * @param {string} fallbackMessage - Message to use when nothing usable is found.
 * @returns {Promise<string>} The best available error message.
 */
async function readErrorMessage(response, fallbackMessage) {
	try {
		const text = await response.text();
		if ((response.headers.get("Content-Type") ?? "").includes("application/json")) {
			try {
				const body = JSON.parse(text);
				if (body?.error) return body.error;
			} catch {
				// Malformed JSON despite the header — fall through to raw text.
			}
		}
		if (text) return text;
	} catch {
		// Body unreadable (network failure, already consumed) — use fallback.
	}
	return fallbackMessage;
}
|
|
1708
|
+
/**
 * Map coordinate-based pointing from screenshot space to viewport space.
 * Coordinates are scaled by the viewport/capture ratio, rounded, and clamped
 * to the viewport bounds; degenerate capture sizes pass coordinates through.
 */
function mapCoordinatesToViewport(x, y, screenshot) {
	const { width, height, viewportWidth, viewportHeight } = screenshot;
	// Guard against a zero-sized capture — scaling would divide by zero.
	if (width <= 0 || height <= 0) return { x, y };
	const horizontalScale = viewportWidth / width;
	const verticalScale = viewportHeight / height;
	const maxX = Math.max(viewportWidth - 1, 0);
	const maxY = Math.max(viewportHeight - 1, 0);
	return {
		x: Math.min(Math.max(Math.round(x * horizontalScale), 0), maxX),
		y: Math.min(Math.max(Math.round(y * verticalScale), 0), maxY)
	};
}
|
|
1723
|
+
/**
 * Framework-agnostic client for cursor buddy voice interactions.
 *
 * Manages the complete voice interaction flow:
 * idle -> listening -> processing -> responding -> idle
 *
 * Supports interruption: pressing hotkey during any state aborts
 * in-flight work and immediately transitions to listening.
 */
var CursorBuddyClient = class {
	// Base URL for the server endpoints (/transcribe, /chat, /tts).
	endpoint;
	// Client options: callbacks plus transcription/speech mode configuration.
	options;
	// Injected services (each defaultable via the `services` constructor arg).
	voiceCapture;
	audioPlayback;
	browserSpeech;
	liveTranscription;
	screenCapture;
	pointerController;
	stateMachine;
	// Partial transcript streamed from browser speech recognition.
	liveTranscript = "";
	// Final transcript of the current/last turn.
	transcript = "";
	// Assistant response text, updated progressively while streaming.
	response = "";
	error = null;
	// Controls cancellation of the current turn's in-flight work.
	abortController = null;
	// Guards against committing the same turn to history twice.
	historyCommittedForTurn = false;
	// "browser" | "server" | null — cached speech provider for this turn.
	speechProviderForTurn = null;
	// Screenshot capture started eagerly when listening begins.
	screenshotPromise = null;
	// Referentially-stable snapshot for useSyncExternalStore.
	cachedSnapshot;
	listeners = /* @__PURE__ */ new Set();
	constructor(endpoint, options = {}, services = {}) {
		this.endpoint = endpoint;
		this.options = options;
		this.voiceCapture = services.voiceCapture ?? new VoiceCaptureService();
		this.audioPlayback = services.audioPlayback ?? new AudioPlaybackService();
		this.browserSpeech = services.browserSpeech ?? new BrowserSpeechService();
		this.liveTranscription = services.liveTranscription ?? new LiveTranscriptionService();
		this.screenCapture = services.screenCapture ?? new ScreenCaptureService();
		this.pointerController = services.pointerController ?? new PointerController();
		this.stateMachine = createStateMachine();
		this.cachedSnapshot = this.buildSnapshot();
		// Mirror mic levels into the reactive atom for waveform UIs.
		this.voiceCapture.onLevel((level) => $audioLevel.set(level));
		this.liveTranscription.onPartial((text) => {
			if (this.liveTranscript === text) return;
			this.liveTranscript = text;
			this.notify();
		});
		this.stateMachine.subscribe(() => {
			this.options.onStateChange?.(this.stateMachine.getState());
			this.notify();
		});
		this.pointerController.subscribe(() => this.notify());
	}
	/**
	 * Start listening for voice input.
	 * Aborts any in-flight work from previous session.
	 */
	startListening() {
		this.abort();
		this.liveTranscript = "";
		this.transcript = "";
		this.response = "";
		this.error = null;
		this.historyCommittedForTurn = false;
		this.speechProviderForTurn = null;
		this.pointerController.release();
		this.stateMachine.transition({ type: "HOTKEY_PRESSED" });
		this.notify();
		this.abortController = new AbortController();
		const signal = this.abortController.signal;
		// Kick off the screenshot immediately so it overlaps with listening.
		this.screenshotPromise = this.screenCapture.captureAnnotated();
		this.beginListeningSession(signal).catch((error) => {
			if (signal.aborted) return;
			this.voiceCapture.dispose();
			this.liveTranscription.dispose();
			this.handleError(toError(error, "Failed to start listening"));
		});
	}
	/**
	 * Stop listening and process the voice input.
	 */
	async stopListening() {
		if (this.stateMachine.getState() !== "listening") return;
		this.stateMachine.transition({ type: "HOTKEY_RELEASED" });
		const signal = this.abortController?.signal;
		// First failure reported during the turn (e.g. from TTS playback);
		// recorded once, aborts remaining work, surfaced in the catch below.
		let turnFailure = null;
		const failTurn = (error) => {
			if (turnFailure || signal?.aborted) return;
			turnFailure = error;
			this.audioPlayback.stop();
			this.browserSpeech.stop();
			this.abortController?.abort();
		};
		try {
			const [audioBlob, browserTranscript] = await Promise.all([this.voiceCapture.stop(), this.stopLiveTranscription()]);
			let screenshot;
			try {
				if (!this.screenshotPromise) throw new Error("Screenshot was not started");
				screenshot = await this.screenshotPromise;
			} catch (screenshotError) {
				const errorMessage = screenshotError instanceof Error ? `Failed to capture screenshot: ${screenshotError.message}` : "Failed to capture screenshot";
				throw new Error(errorMessage);
			}
			if (turnFailure) throw turnFailure;
			if (signal?.aborted) return;
			const transcript = await this.resolveTranscript(browserTranscript, audioBlob, signal);
			if (turnFailure) throw turnFailure;
			if (signal?.aborted) return;
			this.liveTranscript = "";
			this.transcript = transcript;
			this.options.onTranscript?.(transcript);
			this.notify();
			this.prepareSpeechMode();
			const { cleanResponse, pointToolCall, playbackQueue } = await this.chatAndSpeak(transcript, screenshot, signal, {
				onFailure: failTurn,
				onPlaybackStart: () => {
					this.stateMachine.transition({ type: "RESPONSE_STARTED" });
				}
			});
			if (turnFailure) throw turnFailure;
			if (signal?.aborted) return;
			this.options.onResponse?.(cleanResponse);
			// Resolve the point tool call into viewport coordinates: marker
			// ids are looked up in the screenshot's marker map; raw x/y are
			// scaled from screenshot space to viewport space.
			let pointTarget = null;
			if (pointToolCall) if (pointToolCall.type === "marker") {
				const coords = resolveMarkerToCoordinates(screenshot.markerMap, pointToolCall.markerId);
				if (coords) pointTarget = {
					...coords,
					label: pointToolCall.label
				};
			} else pointTarget = {
				...mapCoordinatesToViewport(pointToolCall.x, pointToolCall.y, screenshot),
				label: pointToolCall.label
			};
			if (pointTarget) {
				this.options.onPoint?.(pointTarget);
				this.pointerController.pointAt(pointTarget);
			}
			await playbackQueue.waitForCompletion();
			if (turnFailure) throw turnFailure;
			if (signal?.aborted) return;
			// Commit the completed turn to conversation history.
			const newHistory = [
				...$conversationHistory.get(),
				{
					role: "user",
					content: transcript
				},
				{
					role: "assistant",
					content: cleanResponse
				}
			];
			$conversationHistory.set(newHistory);
			this.historyCommittedForTurn = true;
			this.stateMachine.transition({ type: "TTS_COMPLETE" });
		} catch (err) {
			// A recorded turn failure takes precedence over the thrown error.
			if (turnFailure) {
				this.handleError(turnFailure);
				return;
			}
			if (signal?.aborted) return;
			this.handleError(toError(err));
		}
	}
	/**
	 * Enable or disable the buddy.
	 */
	setEnabled(enabled) {
		$isEnabled.set(enabled);
		this.notify();
	}
	/**
	 * Manually point at coordinates.
	 */
	pointAt(x, y, label) {
		this.pointerController.pointAt({
			x,
			y,
			label
		});
	}
	/**
	 * Dismiss the current pointing target.
	 */
	dismissPointing() {
		this.pointerController.release();
	}
	/**
	 * Reset to idle state and stop any in-progress work.
	 */
	reset() {
		this.abort();
		this.liveTranscript = "";
		this.transcript = "";
		this.response = "";
		this.error = null;
		this.historyCommittedForTurn = false;
		this.pointerController.release();
		this.stateMachine.reset();
		this.notify();
	}
	/**
	 * Update buddy position to follow cursor.
	 * Call this on cursor position changes.
	 */
	updateCursorPosition() {
		this.pointerController.updateFollowPosition();
	}
	/**
	 * Subscribe to state changes.
	 */
	subscribe(listener) {
		this.listeners.add(listener);
		return () => this.listeners.delete(listener);
	}
	/**
	 * Get current state snapshot for React's useSyncExternalStore.
	 * Returns a cached object to ensure referential stability.
	 */
	getSnapshot() {
		return this.cachedSnapshot;
	}
	/**
	 * Build a new snapshot object.
	 */
	buildSnapshot() {
		return {
			state: this.stateMachine.getState(),
			liveTranscript: this.liveTranscript,
			transcript: this.transcript,
			response: this.response,
			error: this.error,
			isPointing: this.pointerController.isPointing(),
			isEnabled: $isEnabled.get()
		};
	}
	// Cancel all in-flight work for the current turn, tearing down capture,
	// transcription, and both playback paths. Commits a partial turn first so
	// an interrupted exchange is not lost from history.
	abort() {
		this.commitPartialHistory();
		this.abortController?.abort();
		this.abortController = null;
		this.screenshotPromise = null;
		this.voiceCapture.dispose();
		this.liveTranscription.dispose();
		this.audioPlayback.stop();
		this.browserSpeech.stop();
		this.speechProviderForTurn = null;
		$audioLevel.set(0);
	}
	/**
	 * Commit partial turn to history when interrupted.
	 * Only commits if we have both transcript and response,
	 * and haven't already committed for this turn.
	 */
	commitPartialHistory() {
		if (this.historyCommittedForTurn) return;
		if (!this.transcript || !this.response) return;
		const newHistory = [
			...$conversationHistory.get(),
			{
				role: "user",
				content: this.transcript
			},
			{
				role: "assistant",
				content: this.response
			}
		];
		$conversationHistory.set(newHistory);
		this.historyCommittedForTurn = true;
	}
	// Upload recorded audio to the server transcription endpoint and return
	// the transcribed text.
	async transcribe(blob, signal) {
		const formData = new FormData();
		formData.append("audio", blob, "recording.wav");
		const response = await fetch(`${this.endpoint}/transcribe`, {
			method: "POST",
			body: formData,
			signal
		});
		if (!response.ok) throw new Error(await readErrorMessage(response, "Transcription failed"));
		const { text } = await response.json();
		return text;
	}
	/**
	 * Stream the chat response, keep the visible text updated, and feed complete
	 * speech segments into the TTS queue as soon as they are ready.
	 */
	async chatAndSpeak(transcript, screenshot, signal, options) {
		const history = $conversationHistory.get();
		const response = await fetch(`${this.endpoint}/chat`, {
			method: "POST",
			headers: { "Content-Type": "application/json" },
			body: JSON.stringify({
				screenshot: screenshot.imageData,
				capture: {
					width: screenshot.width,
					height: screenshot.height
				},
				transcript,
				history,
				markerContext: screenshot.markerContext
			}),
			signal
		});
		if (!response.ok) throw new Error("Chat request failed");
		const reader = response.body?.getReader();
		if (!reader) throw new Error("No response body");
		const decoder = new TextDecoder();
		const responseProcessor = new ProgressiveResponseProcessor();
		const playbackQueue = new TTSPlaybackQueue({
			onError: options.onFailure,
			onPlaybackStart: options.onPlaybackStart,
			prepare: (text, currentSignal) => this.prepareSpeechSegment(text, currentSignal),
			signal
		});
		const shouldStreamSpeech = this.isSpeechStreamingEnabled();
		while (true) {
			const { done, value } = await reader.read();
			if (done) break;
			// stream:true keeps multi-byte sequences intact across chunks.
			const chunk = decoder.decode(value, { stream: true });
			const { speechSegments, visibleText } = responseProcessor.push(chunk);
			if (shouldStreamSpeech) for (const speechSegment of speechSegments) playbackQueue.enqueue(speechSegment);
			this.updateResponse(visibleText);
		}
		// Flush any bytes still buffered inside the decoder.
		const trailingChunk = decoder.decode();
		if (trailingChunk) {
			const { speechSegments, visibleText } = responseProcessor.push(trailingChunk);
			if (shouldStreamSpeech) for (const speechSegment of speechSegments) playbackQueue.enqueue(speechSegment);
			this.updateResponse(visibleText);
		}
		const finalizedResponse = responseProcessor.finish();
		// Non-streaming mode speaks the whole response as one segment.
		if (shouldStreamSpeech) for (const speechSegment of finalizedResponse.speechSegments) playbackQueue.enqueue(speechSegment);
		else playbackQueue.enqueue(finalizedResponse.finalResponseText);
		this.updateResponse(finalizedResponse.finalResponseText);
		return {
			cleanResponse: finalizedResponse.finalResponseText,
			pointToolCall: finalizedResponse.pointToolCall,
			playbackQueue
		};
	}
	/**
	 * Request server-side TTS audio for one text segment.
	 */
	async synthesizeSpeech(text, signal) {
		const response = await fetch(`${this.endpoint}/tts`, {
			method: "POST",
			headers: { "Content-Type": "application/json" },
			body: JSON.stringify({ text }),
			signal
		});
		if (!response.ok) throw new Error(await readErrorMessage(response, "TTS request failed"));
		return response.blob();
	}
	/**
	 * Resolve the initial speech provider for this turn.
	 *
	 * Decision tree:
	 * 1. In `server` mode, always synthesize on the server.
	 * 2. In `browser` mode, require browser speech support up front.
	 * 3. In `auto` mode, prefer browser speech when available and keep that
	 *    choice cached so later segments stay on the same provider unless a
	 *    browser failure forces a one-way fallback to the server.
	 */
	prepareSpeechMode() {
		const speechMode = this.getSpeechMode();
		if (speechMode === "browser" && !this.browserSpeech.isAvailable()) throw new Error("Browser speech is not supported");
		if (speechMode === "server") {
			this.speechProviderForTurn = "server";
			return;
		}
		if (speechMode === "browser") {
			this.speechProviderForTurn = "browser";
			return;
		}
		this.speechProviderForTurn = this.browserSpeech.isAvailable() ? "browser" : "server";
	}
	/**
	 * Prepare a playback task for one text segment.
	 *
	 * The queue calls this eagerly so server synthesis can overlap with the
	 * currently playing segment, but the returned task is still executed in the
	 * original enqueue order.
	 */
	async prepareSpeechSegment(text, signal) {
		switch (this.getSpeechMode()) {
			case "server": return this.prepareServerSpeechTask(text, signal);
			case "browser": return this.prepareBrowserSpeechTask(text, signal);
			default: return this.prepareAutoSpeechTask(text, signal);
		}
	}
	/**
	 * Synthesize server audio immediately and return a playback task that reuses
	 * the prepared blob later.
	 */
	async prepareServerSpeechTask(text, signal) {
		const blob = await this.synthesizeSpeech(text, signal);
		return () => this.audioPlayback.play(blob, signal);
	}
	/**
	 * Return a browser playback task for one text segment.
	 */
	async prepareBrowserSpeechTask(text, signal) {
		return () => this.browserSpeech.speak(text, signal);
	}
	/**
	 * Prepare a playback task for `auto` mode.
	 *
	 * We prefer the browser for low latency, but if browser speech fails for any
	 * segment we permanently switch the remainder of the turn to server TTS so
	 * later segments do not keep retrying the failing browser path.
	 */
	async prepareAutoSpeechTask(text, signal) {
		if (this.getAutoSpeechProvider() === "server") return this.prepareServerSpeechTask(text, signal);
		return async () => {
			// Re-check at play time: an earlier segment may have already
			// demoted the turn to server TTS.
			if (this.getAutoSpeechProvider() === "server") {
				await (await this.prepareServerSpeechTask(text, signal))();
				return;
			}
			try {
				await this.browserSpeech.speak(text, signal);
			} catch (error) {
				if (signal?.aborted) return;
				// One-way fallback: stay on server TTS for the rest of the turn.
				this.speechProviderForTurn = "server";
				await (await this.prepareServerSpeechTask(text, signal))();
			}
		};
	}
	/**
	 * Read the current provider choice for `auto` mode, lazily defaulting to the
	 * browser when supported and the server otherwise.
	 */
	getAutoSpeechProvider() {
		if (this.speechProviderForTurn) return this.speechProviderForTurn;
		this.speechProviderForTurn = this.browserSpeech.isAvailable() ? "browser" : "server";
		return this.speechProviderForTurn;
	}
	// Record the error, move the state machine to its error state, and inform
	// both the onError callback and subscribers.
	handleError(err) {
		this.liveTranscript = "";
		this.error = err;
		this.stateMachine.transition({
			type: "ERROR",
			error: err
		});
		this.options.onError?.(err);
		this.notify();
	}
	/**
	 * Resolve the effective transcription mode for the current client.
	 */
	getTranscriptionMode() {
		return this.options.transcription?.mode ?? "auto";
	}
	/**
	 * Resolve the effective speech mode for the current client.
	 */
	getSpeechMode() {
		return this.options.speech?.mode ?? "server";
	}
	/**
	 * Decide whether speech should start before the full chat response is ready.
	 */
	isSpeechStreamingEnabled() {
		return this.options.speech?.allowStreaming ?? false;
	}
	/**
	 * Decide whether this turn should attempt browser speech recognition.
	 */
	shouldAttemptBrowserTranscription() {
		return this.getTranscriptionMode() !== "server";
	}
	/**
	 * Decide whether browser speech recognition is mandatory for this turn.
	 */
	isBrowserTranscriptionRequired() {
		return this.getTranscriptionMode() === "browser";
	}
	/**
	 * Start the recorder and browser speech recognition together.
	 *
	 * The recorder always runs so we keep waveform updates and preserve a raw
	 * audio backup for server fallback in `auto` mode.
	 */
	async beginListeningSession(signal) {
		const shouldAttemptBrowser = this.shouldAttemptBrowserTranscription();
		const isBrowserTranscriptionAvailable = shouldAttemptBrowser && this.liveTranscription.isAvailable();
		if (shouldAttemptBrowser && !isBrowserTranscriptionAvailable) {
			// In `auto` mode a missing browser recognizer is tolerated; in
			// `browser` mode it is fatal.
			if (this.isBrowserTranscriptionRequired()) throw new Error("Browser transcription is not supported");
		}
		const [voiceCaptureResult, browserTranscriptionResult] = await Promise.allSettled([this.voiceCapture.start(), isBrowserTranscriptionAvailable ? this.liveTranscription.start() : Promise.resolve(void 0)]);
		if (signal.aborted) return;
		if (voiceCaptureResult.status === "rejected") throw toError(voiceCaptureResult.reason, "Failed to start microphone");
		if (browserTranscriptionResult.status === "rejected" && this.isBrowserTranscriptionRequired()) throw toError(browserTranscriptionResult.reason, "Browser transcription failed to start");
		if (browserTranscriptionResult.status === "rejected") this.liveTranscription.dispose();
	}
	/**
	 * Stop browser speech recognition and return the best final transcript it
	 * produced for this turn.
	 */
	async stopLiveTranscription() {
		if (!this.shouldAttemptBrowserTranscription() || !this.liveTranscription.isAvailable()) return "";
		try {
			return await this.liveTranscription.stop();
		} catch (error) {
			// Failure is only fatal when browser transcription is mandatory;
			// otherwise the server fallback handles the audio.
			if (this.isBrowserTranscriptionRequired()) throw toError(error, "Browser transcription failed");
			return "";
		}
	}
	/**
	 * Choose the transcript that should drive the turn.
	 *
	 * Decision tree:
	 * 1. Use the browser transcript when it is available.
	 * 2. In browser-only mode, fail if the browser produced nothing usable.
	 * 3. In auto/server modes, fall back to the recorded audio upload.
	 */
	async resolveTranscript(browserTranscript, audioBlob, signal) {
		const normalizedBrowserTranscript = browserTranscript.trim();
		if (normalizedBrowserTranscript) return normalizedBrowserTranscript;
		if (this.getTranscriptionMode() === "browser") throw new Error("Browser transcription did not produce a final transcript");
		return this.transcribe(audioBlob, signal);
	}
	// Update the visible response text, skipping no-op notifications.
	updateResponse(text) {
		if (this.response === text) return;
		this.response = text;
		this.notify();
	}
	// Rebuild the cached snapshot and fan out to all subscribers.
	notify() {
		this.cachedSnapshot = this.buildSnapshot();
		this.listeners.forEach((listener) => listener());
	}
};
|
|
2251
|
+
//#endregion
|
|
2252
|
+
export { $buddyScale as a, $buddyRotation as i, $audioLevel as n, $cursorPosition as o, $buddyPosition as r, $pointingTarget as s, CursorBuddyClient as t };
|
|
2253
|
+
|
|
2254
|
+
//# sourceMappingURL=client-D-LeEdoH.mjs.map
|