cursor-buddy 0.0.3 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -30,6 +30,31 @@ const $conversationHistory = atom([]);
30
30
  * - Coordinate-based: [POINT:640,360:label] - 4 parts, raw pixel coordinates
31
31
  */
32
32
  const POINTING_TAG_REGEX = /\[POINT:(\d+)(?:,(\d+))?:([^\]]+)\]\s*$/;
33
/**
 * Every leading slice of the `[POINT:` opener, from "[" up to "[POINT:".
 * A streamed response whose tail equals one of these may be the beginning of
 * a pointing tag that has not fully arrived yet.
 */
const PARTIAL_POINTING_PREFIXES = new Set(
  Array.from("[POINT:", (_char, index) => "[POINT:".slice(0, index + 1))
);
42
/**
 * Remove a complete POINT tag from the end of a response.
 *
 * @param response Raw assistant text.
 * @param trimResult When truthy, the remaining text is trimmed on both ends.
 * @returns The text without the trailing tag.
 */
function stripTrailingPointingTag(response, trimResult) {
  const withoutTag = response.replace(POINTING_TAG_REGEX, "");
  if (trimResult) {
    return withoutTag.trim();
  }
  return withoutTag;
}
46
/**
 * Locate an unfinished POINT tag at the end of a streaming response.
 *
 * Only the last "[" is considered. If the text from that bracket onward
 * already contains "]", it is not treated as a partial tag.
 *
 * @param response Response text received so far.
 * @returns Index where the partial tag starts (moved back over any whitespace
 *   that precedes an opened `[POINT:` tag), or -1 when there is none.
 */
function getPartialPointingTagStart(response) {
  const bracketIndex = response.lastIndexOf("[");
  if (bracketIndex === -1) return -1;

  const tail = response.slice(bracketIndex).trimEnd();
  if (tail.includes("]")) return -1;

  if (!tail.startsWith("[POINT:")) {
    // The tail may still be an incomplete opener such as "[PO".
    return PARTIAL_POINTING_PREFIXES.has(tail) ? bracketIndex : -1;
  }

  // The opener is complete: also swallow the whitespace in front of it.
  let start = bracketIndex;
  while (start > 0 && /\s/.test(response[start - 1] ?? "")) {
    start -= 1;
  }
  return start;
}
33
58
  /**
34
59
  * Parse pointing tag into structured result.
35
60
  * Returns null if no valid POINT tag is found at the end.
@@ -56,7 +81,28 @@ function parsePointingTagRaw(response) {
56
81
  * Remove POINT tag from response text for display/TTS.
57
82
  */
58
83
  function stripPointingTag(response) {
59
- return response.replace(POINTING_TAG_REGEX, "").trim();
84
+ return stripTrailingPointingTag(response, true);
85
+ }
86
/**
 * Hide trailing POINT syntax, whether complete or still arriving, from a
 * streaming response so the displayed text and the TTS input do not flicker
 * while a tag is delivered across several chunks.
 *
 * @param response Response text received so far.
 * @returns The text with trailing tag syntax and trailing whitespace removed.
 */
function stripTrailingPointingSyntax(response) {
  const text = stripTrailingPointingTag(response, false);
  const cutoff = getPartialPointingTagStart(text);
  const visible = cutoff === -1 ? text : text.slice(0, cutoff);
  return visible.trimEnd();
}
97
+ //#endregion
98
+ //#region src/core/utils/error.ts
99
/**
 * Normalize unknown thrown values into Error instances.
 *
 * - Error instances are returned unchanged.
 * - Non-empty strings become the error message.
 * - Error-like objects (anything exposing a non-empty string `message`, such
 *   as errors created in another realm or plain `{ message }` rejections)
 *   keep their message instead of being replaced by the fallback.
 * - Everything else produces an Error carrying `fallbackMessage`.
 *
 * Whenever a new Error is created for a non-string value, the original value
 * is attached as `cause` so it is not lost for debugging.
 *
 * @param error Value that was thrown or used as a rejection reason.
 * @param fallbackMessage Message used when `error` carries no usable text.
 * @returns An Error instance.
 */
function toError(error, fallbackMessage = "Unknown error") {
  if (error instanceof Error) return error;
  if (typeof error === "string" && error) return new Error(error);

  const message = typeof error?.message === "string" ? error.message : "";
  if (message) return new Error(message, { cause: error });

  // Nothing worth preserving for null, undefined, or the empty string.
  const hasCause = error !== null && error !== undefined && error !== "";
  return hasCause ? new Error(fallbackMessage, { cause: error }) : new Error(fallbackMessage);
}
61
107
  //#endregion
62
108
  //#region src/core/services/audio-playback.ts
@@ -124,7 +170,7 @@ var AudioPlaybackService = class {
124
170
  settle("reject", /* @__PURE__ */ new Error("Audio playback failed"));
125
171
  };
126
172
  this.audio.play().catch((err) => {
127
- settle("reject", err instanceof Error ? err : new Error(String(err)));
173
+ settle("reject", toError(err, "Audio playback failed"));
128
174
  });
129
175
  });
130
176
  }
@@ -156,6 +202,323 @@ var AudioPlaybackService = class {
156
202
  }
157
203
  };
158
204
  //#endregion
205
+ //#region src/core/utils/web-speech.ts
206
/**
 * Collapse every run of whitespace in browser speech input or transcript
 * output to a single space and drop leading/trailing whitespace, so UI state
 * and speech synthesis see one stable form regardless of browser quirks.
 *
 * @param text Text to normalize.
 * @returns The single-spaced, trimmed text.
 */
function normalizeSpeechText(text) {
  const words = text.split(/\s+/).filter((word) => word.length > 0);
  return words.join(" ");
}
213
/**
 * Resolve the best browser locale to use for Web Speech APIs.
 *
 * Preference order: the document language when the host app declares one,
 * then the browser locale, and finally "en-US" as a stable default.
 *
 * @returns A non-empty language tag.
 */
function resolveBrowserLanguage() {
  if (typeof document !== "undefined") {
    // Guard both hops: `documentElement` can be null, and `lang` is not
    // guaranteed to be a string on every kind of root element. Either case
    // used to throw instead of falling through to the next source.
    const lang = document.documentElement?.lang;
    const documentLanguage = typeof lang === "string" ? lang.trim() : "";
    if (documentLanguage) return documentLanguage;
  }
  if (typeof navigator !== "undefined" && navigator.language) return navigator.language;
  return "en-US";
}
227
+ //#endregion
228
+ //#region src/core/services/browser-speech.ts
229
/**
 * Look up the global `speechSynthesis` object.
 *
 * @returns The global value, or undefined when the runtime does not define it.
 */
function getSpeechSynthesis() {
  if (typeof globalThis.speechSynthesis === "undefined") {
    return undefined;
  }
  return globalThis.speechSynthesis;
}
232
/**
 * Look up the global `SpeechSynthesisUtterance` constructor.
 *
 * @returns The global value, or undefined when the runtime does not define it.
 */
function getSpeechSynthesisUtterance() {
  if (typeof globalThis.SpeechSynthesisUtterance === "undefined") {
    return undefined;
  }
  return globalThis.SpeechSynthesisUtterance;
}
235
/**
 * Build an Error from a speech synthesis error event.
 *
 * @param event Error event; its `error` code, when present, is appended to the message.
 * @returns A new Error describing the failure.
 */
function toSpeechError(event) {
  const code = event?.error;
  if (code) {
    return new Error(`Browser speech failed: ${code}`);
  }
  return new Error("Browser speech failed");
}
239
/**
 * Browser-backed speech synthesis using the Web Speech API.
 *
 * The service tracks at most one utterance at a time. `speak()` resolves when
 * the utterance ends, is stopped, or is aborted through the signal, and
 * rejects only on a synthesis error or when starting the utterance throws.
 */
var BrowserSpeechService = class {
  // Detaches the abort listener registered for the in-flight utterance, if any.
  removeAbortListener = null;
  // Settle function of the in-flight `speak()` promise; lets `stop()` resolve it.
  settleSpeech = null;
  // Utterance currently owned by this service, or null when idle.
  utterance = null;
  /**
   * Report whether this runtime exposes the browser Web Speech synthesis APIs.
   */
  isAvailable() {
    return Boolean(getSpeechSynthesis() && getSpeechSynthesisUtterance());
  }
  /**
   * Speak a single text segment in the browser.
   *
   * Each queue item owns its own utterance. We only stop an existing utterance
   * when this service still has one in flight, so streamed playback does not
   * spam global `speechSynthesis.cancel()` between already-completed segments.
   *
   * @param text Text to speak; it is normalized first and nothing is spoken
   *   when the normalized text is empty.
   * @param signal Optional abort signal. Aborting cancels synthesis and
   *   resolves (does not reject) the returned promise.
   * @returns Promise that settles when the utterance finishes or fails.
   * @throws Error when the Web Speech synthesis APIs are unavailable.
   */
  async speak(text, signal) {
    const speechSynthesis = getSpeechSynthesis();
    const SpeechSynthesisUtteranceCtor = getSpeechSynthesisUtterance();
    if (!speechSynthesis || !SpeechSynthesisUtteranceCtor) throw new Error("Browser speech is not supported");
    if (this.hasActiveSpeech()) this.stop();
    const normalizedText = normalizeSpeechText(text);
    if (!normalizedText || signal?.aborted) return;
    const utterance = new SpeechSynthesisUtteranceCtor(normalizedText);
    utterance.lang = resolveBrowserLanguage();
    this.utterance = utterance;
    return new Promise((resolve, reject) => {
      let settled = false;
      // Single exit point: settles the promise once and releases everything
      // this utterance registered (abort listener, event handlers, ownership).
      const settle = (outcome, error) => {
        if (settled) return;
        settled = true;
        if (this.settleSpeech === settle) this.settleSpeech = null;
        this.removeAbortListener?.();
        this.removeAbortListener = null;
        this.clearUtterance(utterance);
        if (outcome === "resolve") {
          resolve();
          return;
        }
        reject(error ?? /* @__PURE__ */ new Error("Browser speech failed"));
      };
      this.settleSpeech = settle;
      const abortHandler = () => {
        try {
          speechSynthesis.cancel();
        } catch {}
        // An abort is treated as a normal completion, not a failure.
        settle("resolve");
      };
      if (signal) {
        signal.addEventListener("abort", abortHandler, { once: true });
        this.removeAbortListener = () => {
          signal.removeEventListener("abort", abortHandler);
        };
      }
      utterance.onend = () => {
        settle("resolve");
      };
      utterance.onerror = (event) => {
        // Errors reported after an abort are not surfaced as failures.
        if (signal?.aborted) {
          settle("resolve");
          return;
        }
        settle("reject", toSpeechError(event));
      };
      try {
        speechSynthesis.speak(utterance);
      } catch (error) {
        settle("reject", toError(error, "Browser speech failed to start"));
      }
    });
  }
  /**
   * Stop the current utterance owned by this service, if one is active.
   *
   * We intentionally do nothing when the service is idle so we do not cancel
   * unrelated speech synthesis work that host apps may be doing elsewhere.
   */
  stop() {
    if (!this.hasActiveSpeech()) return;
    const speechSynthesis = getSpeechSynthesis();
    if (speechSynthesis) try {
      speechSynthesis.cancel();
    } catch {}
    if (this.settleSpeech) {
      // Resolving through `settle` also performs the listener/utterance cleanup.
      const settleSpeech = this.settleSpeech;
      this.settleSpeech = null;
      settleSpeech("resolve");
      return;
    }
    // No pending promise to settle: clean up the tracked state directly.
    this.removeAbortListener?.();
    this.removeAbortListener = null;
    this.clearUtterance(this.utterance);
  }
  /**
   * Whether an utterance or a pending `speak()` promise is still tracked.
   */
  hasActiveSpeech() {
    return Boolean(this.utterance || this.settleSpeech);
  }
  /**
   * Detach the event handlers from `utterance` and release ownership of it
   * when it is the one currently tracked by this service.
   */
  clearUtterance(utterance) {
    if (!utterance) return;
    utterance.onend = null;
    utterance.onerror = null;
    if (this.utterance === utterance) this.utterance = null;
  }
};
346
+ //#endregion
347
+ //#region src/core/services/live-transcription.ts
348
/**
 * Find the Web Speech recognition constructor, preferring the unprefixed
 * global and falling back to the `webkit`-prefixed one.
 *
 * @returns The constructor, or undefined when neither global is defined.
 */
function getSpeechRecognitionConstructor() {
  const standard = globalThis.SpeechRecognition;
  if (standard !== null && standard !== undefined) {
    return standard;
  }
  return globalThis.webkitSpeechRecognition;
}
352
/**
 * Build an Error from a speech recognition error event.
 *
 * The event's own `message` wins when it is non-empty; otherwise the message
 * is derived from the `error` code, or is a generic failure text.
 *
 * @param event Recognition error event (may be missing).
 * @returns A new Error describing the failure.
 */
function toRecognitionError(event) {
  const code = event?.error;
  let message = event?.message;
  if (!message) {
    message = code ? `Browser transcription failed: ${code}` : "Browser transcription failed";
  }
  return new Error(message);
}
357
/**
 * Combine recognition results into a final and a live transcript.
 *
 * @param results Array-like list of results; the first alternative of each
 *   entry supplies the text and `isFinal` decides which bucket it goes into.
 * @returns `finalTranscript` (final results only) and `liveTranscript`
 *   (final results followed by interim results), both whitespace-normalized.
 */
function buildTranscripts(results) {
  const finalParts = [];
  const interimParts = [];
  for (let i = 0; i < results.length; i += 1) {
    const result = results[i];
    const text = result?.[0]?.transcript ?? "";
    if (!text) continue;
    const bucket = result.isFinal ? finalParts : interimParts;
    bucket.push(text);
  }
  const finalTranscript = normalizeSpeechText(finalParts.join(" "));
  const interimTranscript = normalizeSpeechText(interimParts.join(" "));
  const liveParts = [finalTranscript, interimTranscript].filter(Boolean);
  return {
    finalTranscript,
    liveTranscript: normalizeSpeechText(liveParts.join(" "))
  };
}
373
/**
 * Browser-backed live transcription using the Web Speech API.
 *
 * One recognition session is tracked at a time. `start()` resolves once the
 * browser reports the session has started, `stop()` resolves with the final
 * transcript once the session has ended, and `dispose()` aborts the session
 * and resets the service so it can be reused.
 */
var LiveTranscriptionService = class {
  // Concatenated text of all results the browser has marked final.
  finalTranscript = "";
  // Set by the recognition `onstart` / `onend` handlers respectively.
  hasStarted = false;
  hasEnded = false;
  // Most recent recognition error for the current session, if any.
  lastError = null;
  // Listener for live transcript updates (see `onPartial`).
  partialCallback = null;
  // Active recognition object, or null when no session is tracked.
  recognition = null;
  // Settle callbacks of the pending `start()` / `stop()` promises.
  startReject = null;
  startResolve = null;
  stopReject = null;
  stopResolve = null;
  /**
   * Report whether this runtime exposes a speech recognition constructor.
   */
  isAvailable() {
    return Boolean(getSpeechRecognitionConstructor());
  }
  /**
   * Register a callback for the latest browser transcript while the user is
   * still speaking. It is also called with "" whenever the session state is
   * reset.
   */
  onPartial(callback) {
    this.partialCallback = callback;
  }
  /**
   * Start a new Web Speech recognition session.
   *
   * Any previous session is disposed first.
   *
   * @throws Error when recognition is unsupported, when `recognition.start()`
   *   throws, or when the session errors/ends before it reports a start.
   */
  async start() {
    const SpeechRecognitionCtor = getSpeechRecognitionConstructor();
    if (!SpeechRecognitionCtor) throw new Error("Browser transcription is not supported");
    this.dispose();
    const recognition = new SpeechRecognitionCtor();
    this.recognition = recognition;
    recognition.continuous = true;
    recognition.interimResults = true;
    recognition.maxAlternatives = 1;
    recognition.lang = resolveBrowserLanguage();
    recognition.onstart = () => {
      this.hasStarted = true;
      this.startResolve?.();
      this.startResolve = null;
      this.startReject = null;
    };
    recognition.onresult = (event) => {
      const transcripts = buildTranscripts(event.results);
      this.finalTranscript = transcripts.finalTranscript;
      this.partialCallback?.(transcripts.liveTranscript);
    };
    recognition.onerror = (event) => {
      this.lastError = toRecognitionError(event);
      // Before the session has started, an error fails the pending start().
      if (!this.hasStarted) {
        this.startReject?.(this.lastError);
        this.startResolve = null;
        this.startReject = null;
      }
    };
    recognition.onend = () => {
      this.hasEnded = true;
      if (!this.hasStarted) {
        const error = this.lastError ?? /* @__PURE__ */ new Error("Browser transcription ended before it could start");
        this.startReject?.(error);
        this.startResolve = null;
        this.startReject = null;
      }
      // Settle a pending stop(): reject with the recorded error, otherwise
      // resolve with the final transcript.
      if (this.stopResolve || this.stopReject) {
        if (this.lastError) this.stopReject?.(this.lastError);
        else this.stopResolve?.(normalizeSpeechText(this.finalTranscript));
        this.stopResolve = null;
        this.stopReject = null;
      }
    };
    const started = new Promise((resolve, reject) => {
      this.startResolve = resolve;
      this.startReject = reject;
    });
    try {
      recognition.start();
    } catch (error) {
      // `started` is never awaited on this path. Drop its settle callbacks so
      // a later dispose() cannot reject it and raise an unhandled rejection.
      this.startResolve = null;
      this.startReject = null;
      this.clearRecognition();
      throw toError(error, "Browser transcription failed to start");
    }
    try {
      await started;
    } catch (error) {
      this.clearRecognition();
      throw toError(error, "Browser transcription failed to start");
    }
  }
  /**
   * Stop the current recognition session and resolve with the final transcript.
   *
   * @returns The whitespace-normalized final transcript.
   * @throws The recorded recognition error, or an Error when
   *   `recognition.stop()` itself throws.
   */
  async stop() {
    if (!this.recognition) {
      if (this.lastError) throw this.lastError;
      return normalizeSpeechText(this.finalTranscript);
    }
    // The session already ended on its own: report its outcome immediately.
    if (this.hasEnded) {
      const transcript = normalizeSpeechText(this.finalTranscript);
      const error = this.lastError;
      this.clearRecognition();
      if (error) throw error;
      return transcript;
    }
    const recognition = this.recognition;
    return normalizeSpeechText(await new Promise((resolve, reject) => {
      this.stopResolve = resolve;
      this.stopReject = reject;
      try {
        recognition.stop();
      } catch (error) {
        // The promise is settled right here; do not leave stale callbacks.
        this.stopResolve = null;
        this.stopReject = null;
        reject(toError(error, "Browser transcription failed to stop"));
      }
    }).finally(() => {
      this.clearRecognition();
    }));
  }
  /**
   * Abort the current recognition session and reset the service for reuse.
   *
   * A pending start() is rejected and a pending stop() is resolved with the
   * transcript collected so far.
   */
  dispose() {
    if (this.recognition) try {
      this.recognition.abort();
    } catch {}
    this.startReject?.(/* @__PURE__ */ new Error("Browser transcription aborted"));
    this.stopResolve?.(normalizeSpeechText(this.finalTranscript));
    this.startResolve = null;
    this.startReject = null;
    this.stopResolve = null;
    this.stopReject = null;
    this.clearRecognition();
    this.resetSessionState();
  }
  /**
   * Detach all handlers from the tracked recognition object and forget it.
   */
  clearRecognition() {
    if (!this.recognition) return;
    this.recognition.onstart = null;
    this.recognition.onresult = null;
    this.recognition.onerror = null;
    this.recognition.onend = null;
    this.recognition = null;
  }
  /**
   * Reset per-session state and notify the partial listener with "".
   */
  resetSessionState() {
    this.finalTranscript = "";
    this.hasStarted = false;
    this.hasEnded = false;
    this.lastError = null;
    this.partialCallback?.("");
  }
};
521
+ //#endregion
159
522
  //#region src/core/bezier.ts
160
523
  /**
161
524
  * Bezier flight animation for cursor pointing.
@@ -322,6 +685,84 @@ var PointerController = class {
322
685
  }
323
686
  };
324
687
  //#endregion
688
+ //#region src/core/utils/annotations.ts
689
/**
 * Default look of annotation markers; each key can be overridden per call
 * through the `style` argument of `drawAnnotations`.
 */
const DEFAULT_STYLE = {
  borderColor: "rgba(255, 0, 0, 0.8)",
  labelBackground: "rgba(255, 0, 0, 0.9)",
  labelColor: "#ffffff",
  borderWidth: 2,
  fontSize: 11,
  labelPadding: 4
};
/**
 * Draw annotation markers onto a canvas.
 * Modifies the canvas in place.
 *
 * For every marker this strokes its rectangle and draws a filled label with
 * the marker id. The label sits above the rectangle, or just inside its top
 * edge when there is not enough room above.
 *
 * @param ctx Canvas 2D context to draw on
 * @param markers Marker map from element discovery
 * @param style Optional style overrides
 */
function drawAnnotations(ctx, markers, style = {}) {
  const s = {
    ...DEFAULT_STYLE,
    ...style
  };
  // `roundRect` is missing from older canvas implementations; fall back to a
  // square-cornered label there instead of throwing mid-draw.
  const supportsRoundRect = typeof ctx.roundRect === "function";
  ctx.save();
  for (const marker of markers.values()) {
    const { rect, id } = marker;
    ctx.strokeStyle = s.borderColor;
    ctx.lineWidth = s.borderWidth;
    ctx.strokeRect(rect.left, rect.top, rect.width, rect.height);
    const label = String(id);
    ctx.font = `bold ${s.fontSize}px monospace`;
    const textWidth = ctx.measureText(label).width;
    const textHeight = s.fontSize;
    const labelWidth = textWidth + s.labelPadding * 2;
    const labelHeight = textHeight + s.labelPadding;
    const labelX = rect.left - s.borderWidth;
    // Keep the label on the canvas when the marker touches the top edge.
    const labelY = rect.top < labelHeight + 4 ? rect.top + 2 : rect.top - labelHeight;
    ctx.fillStyle = s.labelBackground;
    ctx.beginPath();
    if (supportsRoundRect) ctx.roundRect(labelX, labelY, labelWidth, labelHeight, 2);
    else ctx.rect(labelX, labelY, labelWidth, labelHeight);
    ctx.fill();
    ctx.fillStyle = s.labelColor;
    ctx.textBaseline = "top";
    ctx.fillText(label, labelX + s.labelPadding, labelY + s.labelPadding / 2);
  }
  ctx.restore();
}
734
/**
 * Produce a new canvas holding the source image with markers drawn on top.
 * The source canvas is left untouched.
 *
 * @param sourceCanvas Original screenshot canvas
 * @param markers Marker map from element discovery
 * @returns New canvas with annotations drawn
 * @throws Error when a 2D context cannot be obtained for the new canvas
 */
function createAnnotatedCanvas(sourceCanvas, markers) {
  const copy = document.createElement("canvas");
  copy.width = sourceCanvas.width;
  copy.height = sourceCanvas.height;

  const context = copy.getContext("2d");
  if (context === null || context === undefined) {
    throw new Error("Failed to get canvas 2D context");
  }

  context.drawImage(sourceCanvas, 0, 0);
  drawAnnotations(context, markers);
  return copy;
}
752
/**
 * Describe the available markers as text for the AI prompt: a header line
 * followed by one `id: description` line per marker.
 *
 * @param markers Marker map from element discovery
 * @returns Formatted string listing markers, or a fixed notice when the map is empty
 */
function generateMarkerContext(markers) {
  if (markers.size === 0) {
    return "No interactive elements detected.";
  }
  const entries = Array.from(markers.values(), (marker) => ` ${marker.id}: ${marker.description}`);
  return ["Interactive elements (use marker number to point):", ...entries].join("\n");
}
765
+ //#endregion
325
766
  //#region src/core/utils/elements.ts
326
767
  /**
327
768
  * Element discovery for annotated screenshots.
@@ -463,84 +904,6 @@ function resolveMarkerToCoordinates(markerMap, markerId) {
463
904
  return getElementCenter(marker.element);
464
905
  }
465
906
  //#endregion
466
- //#region src/core/utils/annotations.ts
467
- const DEFAULT_STYLE = {
468
- borderColor: "rgba(255, 0, 0, 0.8)",
469
- labelBackground: "rgba(255, 0, 0, 0.9)",
470
- labelColor: "#ffffff",
471
- borderWidth: 2,
472
- fontSize: 11,
473
- labelPadding: 4
474
- };
475
- /**
476
- * Draw annotation markers onto a canvas.
477
- * Modifies the canvas in place.
478
- *
479
- * @param ctx Canvas 2D context to draw on
480
- * @param markers Marker map from element discovery
481
- * @param style Optional style overrides
482
- */
483
- function drawAnnotations(ctx, markers, style = {}) {
484
- const s = {
485
- ...DEFAULT_STYLE,
486
- ...style
487
- };
488
- ctx.save();
489
- for (const marker of markers.values()) {
490
- const { rect, id } = marker;
491
- ctx.strokeStyle = s.borderColor;
492
- ctx.lineWidth = s.borderWidth;
493
- ctx.strokeRect(rect.left, rect.top, rect.width, rect.height);
494
- const label = String(id);
495
- ctx.font = `bold ${s.fontSize}px monospace`;
496
- const textWidth = ctx.measureText(label).width;
497
- const textHeight = s.fontSize;
498
- const labelWidth = textWidth + s.labelPadding * 2;
499
- const labelHeight = textHeight + s.labelPadding;
500
- const labelX = rect.left - s.borderWidth;
501
- const labelY = rect.top < labelHeight + 4 ? rect.top + 2 : rect.top - labelHeight;
502
- ctx.fillStyle = s.labelBackground;
503
- ctx.beginPath();
504
- ctx.roundRect(labelX, labelY, labelWidth, labelHeight, 2);
505
- ctx.fill();
506
- ctx.fillStyle = s.labelColor;
507
- ctx.textBaseline = "top";
508
- ctx.fillText(label, labelX + s.labelPadding, labelY + s.labelPadding / 2);
509
- }
510
- ctx.restore();
511
- }
512
- /**
513
- * Create an annotated copy of a canvas.
514
- * Does not modify the original canvas.
515
- *
516
- * @param sourceCanvas Original screenshot canvas
517
- * @param markers Marker map from element discovery
518
- * @returns New canvas with annotations drawn
519
- */
520
- function createAnnotatedCanvas(sourceCanvas, markers) {
521
- const canvas = document.createElement("canvas");
522
- canvas.width = sourceCanvas.width;
523
- canvas.height = sourceCanvas.height;
524
- const ctx = canvas.getContext("2d");
525
- if (!ctx) throw new Error("Failed to get canvas 2D context");
526
- ctx.drawImage(sourceCanvas, 0, 0);
527
- drawAnnotations(ctx, markers);
528
- return canvas;
529
- }
530
- /**
531
- * Generate marker context string for AI prompt.
532
- * Lists available markers with their descriptions.
533
- *
534
- * @param markers Marker map from element discovery
535
- * @returns Formatted string listing markers
536
- */
537
- function generateMarkerContext(markers) {
538
- if (markers.size === 0) return "No interactive elements detected.";
539
- const lines = ["Interactive elements (use marker number to point):"];
540
- for (const marker of markers.values()) lines.push(` ${marker.id}: ${marker.description}`);
541
- return lines.join("\n");
542
- }
543
- //#endregion
544
907
  //#region src/core/utils/screenshot.ts
545
908
  const CLONE_RESOURCE_TIMEOUT_MS = 3e3;
546
909
  function getCaptureMetrics() {
@@ -709,6 +1072,66 @@ var ScreenCaptureService = class {
709
1072
  }
710
1073
  };
711
1074
  //#endregion
1075
+ //#region src/core/services/tts-playback-queue.ts
1076
/**
 * Queues sentence-level speech preparation immediately while keeping playback
 * strictly ordered.
 *
 * Preparation is allowed to run ahead of playback so server synthesis can
 * overlap with the currently playing segment, but the returned playback tasks
 * still execute one-by-one in enqueue order.
 *
 * The first failure (in preparation or playback) is recorded, reported once
 * through `onError`, and stops the queue: segments that were already queued
 * are skipped and new segments are ignored.
 */
var TTSPlaybackQueue = class {
  // First failure seen by the queue; null while healthy.
  error = null;
  // Ensures `onPlaybackStart` fires only for the first played segment.
  hasStartedPlayback = false;
  onError;
  onPlaybackStart;
  // Tail of the sequential playback chain; never rejects (failures are captured).
  playbackChain = Promise.resolve();
  prepare;
  signal;
  /**
   * @param options `prepare(text, signal)` must return a promise for a
   *   function that plays the segment; `onError`, `onPlaybackStart` and
   *   `signal` are optional.
   */
  constructor(options) {
    this.onError = options.onError;
    this.onPlaybackStart = options.onPlaybackStart;
    this.prepare = options.prepare;
    this.signal = options.signal;
  }
  /**
   * Queue a speakable text segment.
   *
   * Blank text is ignored, as is any segment enqueued after the queue failed
   * or the signal was aborted.
   */
  enqueue(text) {
    const normalizedText = text.trim();
    if (!normalizedText || this.error || this.signal?.aborted) return;
    const preparedPlaybackTask = this.prepare(normalizedText, this.signal);
    // Observe preparation failures right away, even while earlier segments
    // are still playing, so the rejection is never left unhandled.
    preparedPlaybackTask.catch((error) => {
      this.fail(toError(error));
    });
    // Once the queue has failed or was aborted, remaining segments must stay silent.
    const shouldSkip = () => Boolean(this.error || this.signal?.aborted);
    this.playbackChain = this.playbackChain.then(async () => {
      if (shouldSkip()) return;
      const play = await preparedPlaybackTask;
      if (shouldSkip()) return;
      if (!this.hasStartedPlayback) {
        this.hasStartedPlayback = true;
        this.onPlaybackStart?.();
      }
      await play();
    }).catch((error) => {
      this.fail(toError(error));
    });
  }
  /**
   * Wait until every queued segment has either played or the queue failed.
   *
   * @throws The first recorded failure, if any.
   */
  async waitForCompletion() {
    await this.playbackChain;
    if (this.error) throw this.error;
  }
  /**
   * Record the first failure and notify `onError` once; later failures are ignored.
   */
  fail(error) {
    if (this.error) return;
    this.error = error;
    this.onError?.(error);
  }
};
1134
+ //#endregion
712
1135
  //#region src/core/utils/audio.ts
713
1136
  /**
714
1137
  * Audio conversion utilities for voice capture.
@@ -995,6 +1418,10 @@ var VoiceCaptureService = class {
995
1418
  }
996
1419
  /**
997
1420
  * Clean up all resources.
1421
+ *
1422
+ * The level callback is intentionally preserved so the same service instance
1423
+ * can be reused across multiple push-to-talk turns without re-registering
1424
+ * the waveform subscription from the client.
998
1425
  */
999
1426
  dispose() {
1000
1427
  if (this.stream) {
@@ -1019,8 +1446,8 @@ var VoiceCaptureService = class {
1019
1446
  }
1020
1447
  this.chunks = [];
1021
1448
  this.visualLevel = 0;
1449
+ this.levelCallback?.(0);
1022
1450
  this.flushResolve = null;
1023
- this.levelCallback = null;
1024
1451
  }
1025
1452
  async flushPendingAudio() {
1026
1453
  if (!this.workletNode) return;
@@ -1050,7 +1477,8 @@ const transitions = {
1050
1477
  ERROR: "idle"
1051
1478
  },
1052
1479
  processing: {
1053
- AI_RESPONSE_COMPLETE: "responding",
1480
+ RESPONSE_STARTED: "responding",
1481
+ TTS_COMPLETE: "idle",
1054
1482
  HOTKEY_PRESSED: "listening",
1055
1483
  ERROR: "idle"
1056
1484
  },
@@ -1094,10 +1522,134 @@ function createStateMachine(initial = "idle") {
1094
1522
  };
1095
1523
  }
1096
1524
  //#endregion
1525
+ //#region src/core/utils/response-processor.ts
1526
// Lower-case abbreviations whose trailing period does not end a sentence.
const COMMON_ABBREVIATIONS = "mr. mrs. ms. dr. prof. sr. jr. e.g. i.e.".split(" ");
// Characters that may directly follow sentence-ending punctuation and still
// belong to the sentence (closing quotes and brackets).
const CLOSING_PUNCTUATION = new Set([..."\"'”’)]}"]);
// Segments shorter than this many characters are held back and merged with
// the following segment before being emitted for speech.
const SHORT_SEGMENT_THRESHOLD = 24;
1547
/**
 * Decide whether the character at `index` plausibly ends a sentence.
 *
 * "!", "?", "…" and a newline always count. A period counts unless it sits
 * between two digits (a decimal number) or terminates one of the
 * COMMON_ABBREVIATIONS. An abbreviation is only recognized as a whole word:
 * "Ms." is one, but the "ms." at the end of "forms." or "items." is not, so
 * ordinary words that happen to end in those letters still end their sentence.
 *
 * @param text Text being scanned.
 * @param index Position of the candidate character.
 * @returns true when the character is a likely sentence boundary.
 */
function isLikelySentenceBoundary(text, index) {
  const char = text[index];
  if (char === "!" || char === "?" || char === "…" || char === "\n") return true;
  if (char !== ".") return false;
  const previousChar = text[index - 1] ?? "";
  const nextChar = text[index + 1] ?? "";
  if (/\d/.test(previousChar) && /\d/.test(nextChar)) return false;
  // The 11-character window covers the longest abbreviation plus the one
  // character in front of it that is needed for the whole-word check.
  const lookback = text.slice(Math.max(0, index - 10), index + 1).toLowerCase();
  const endsWithAbbreviation = COMMON_ABBREVIATIONS.some((abbreviation) => {
    if (!lookback.endsWith(abbreviation)) return false;
    const precedingChar = lookback[lookback.length - abbreviation.length - 1] ?? "";
    return !/\p{L}/u.test(precedingChar);
  });
  return !endsWithAbbreviation;
}
1558
/**
 * Find where the first completed sentence at or after `start` ends.
 *
 * The returned index lies past the boundary character, any closing
 * punctuation attached to it, and the whitespace that follows. A candidate
 * boundary is rejected when the character after it is neither whitespace nor
 * an upper-case ASCII letter or digit.
 *
 * @param text Text to scan.
 * @param start Index to begin scanning from.
 * @returns Index just past the sentence, or null when no boundary is found.
 */
function findBoundaryEnd(text, start) {
  const skipWhitespace = (position) => {
    let cursor = position;
    while (cursor < text.length && /\s/.test(text[cursor] ?? "")) cursor += 1;
    return cursor;
  };

  for (let index = start; index < text.length; index += 1) {
    // A line break always closes the current segment.
    if (text[index] === "\n") return skipWhitespace(index + 1);
    if (!isLikelySentenceBoundary(text, index)) continue;

    let end = index + 1;
    while (end < text.length && CLOSING_PUNCTUATION.has(text[end] ?? "")) end += 1;

    const hasFollowingChar = end < text.length;
    if (hasFollowingChar && !/[\sA-Z0-9]/.test(text[end] ?? "")) continue;

    return skipWhitespace(end);
  }
  return null;
}
1577
/**
 * Split off every completed sentence from the start of `text`.
 *
 * @param text Text that has not been consumed yet.
 * @returns `segments` (the trimmed, non-empty sentences in order) and
 *   `consumedLength` (how many characters of `text` they cover, including
 *   the whitespace that follows each boundary).
 */
function extractCompletedSegments(text) {
  const segments = [];
  let cursor = 0;
  let boundary = null;
  while (cursor < text.length && (boundary = findBoundaryEnd(text, cursor)) !== null) {
    const piece = text.slice(cursor, boundary).trim();
    if (piece) segments.push(piece);
    cursor = boundary;
  }
  return { consumedLength: cursor, segments };
}
1592
/**
 * Tracks a streaming assistant response, exposes a tag-free visible version for
 * the UI, and emits speakable segments as sentence boundaries become stable.
 *
 * Call `push()` for every streamed chunk and `finish()` once at the end to
 * flush whatever text has not been emitted as a speech segment yet.
 */
var ProgressiveResponseProcessor = class {
  // Number of characters of the visible text already turned into segments.
  // Measured on text that still contains any leading whitespace.
  consumedVisibleTextLength = 0;
  // Short segment held back until more text arrives (see `coalesceSegments`).
  pendingShortSegment = "";
  // Everything received so far, including any POINT tag syntax.
  rawResponse = "";
  /**
   * Append a streamed chunk.
   *
   * @param chunk Newly received text.
   * @returns `visibleText` (the response so far without trailing POINT
   *   syntax) and `speechSegments` (segments that became complete with this chunk).
   */
  push(chunk) {
    this.rawResponse += chunk;
    const visibleText = stripTrailingPointingSyntax(this.rawResponse);
    const { consumedLength, segments } = extractCompletedSegments(visibleText.slice(this.consumedVisibleTextLength));
    this.consumedVisibleTextLength += consumedLength;
    return {
      visibleText,
      speechSegments: this.coalesceSegments(segments)
    };
  }
  /**
   * Flush the response once streaming has ended.
   *
   * @returns `fullResponse` (raw text including any tag), `finalResponseText`
   *   (tag removed, trimmed) and `speechSegments` (at most one segment made of
   *   the held-back short segment plus the text that was never emitted).
   */
  finish() {
    const finalResponseText = stripPointingTag(this.rawResponse);
    // `finalResponseText` has its leading whitespace trimmed, while the
    // consumed length was counted on text that still had it. Shift the offset
    // by that amount so the first characters of the remainder are not cut off.
    const leadingWhitespaceLength = this.rawResponse.length - this.rawResponse.trimStart().length;
    const trailingStart = Math.max(0, this.consumedVisibleTextLength - leadingWhitespaceLength);
    const trailingText = finalResponseText.slice(trailingStart).trim();
    const finalSegmentParts = [this.pendingShortSegment, trailingText].filter(Boolean);
    this.pendingShortSegment = "";
    return {
      fullResponse: this.rawResponse,
      finalResponseText,
      speechSegments: finalSegmentParts.length ? [finalSegmentParts.join(" ").trim()] : []
    };
  }
  /**
   * Merge segments shorter than SHORT_SEGMENT_THRESHOLD into the segment that
   * follows them. A short segment with nothing after it stays pending until
   * the next `push()` or `finish()`.
   *
   * @param segments Newly completed segments, in order.
   * @returns Segments that are long enough to be spoken now.
   */
  coalesceSegments(segments) {
    const speechSegments = [];
    for (const segment of segments) {
      const normalizedSegment = segment.trim();
      if (!normalizedSegment) continue;
      const candidate = this.pendingShortSegment ? `${this.pendingShortSegment} ${normalizedSegment}` : normalizedSegment;
      if (candidate.length < SHORT_SEGMENT_THRESHOLD) {
        this.pendingShortSegment = candidate;
        continue;
      }
      this.pendingShortSegment = "";
      speechSegments.push(candidate);
    }
    return speechSegments;
  }
};
1637
+ //#endregion
1097
1638
  //#region src/core/client.ts
1098
1639
  function clamp(value, min, max) {
1099
1640
  return Math.min(Math.max(value, min), max);
1100
1641
  }
1642
/**
 * Extract a human-readable error message from a failed HTTP response.
 *
 * The body is read exactly once as text, because a response body can only be
 * consumed a single time. For JSON responses the string `error` field (or
 * `error.message`) is preferred; otherwise, including when the JSON is
 * malformed or has no such field, the raw body text is returned.
 *
 * @param response Response whose body describes the failure.
 * @param fallbackMessage Message used when the body is empty or unreadable.
 * @returns The best available error message.
 */
async function readErrorMessage(response, fallbackMessage) {
  let isJson = false;
  let text = "";
  try {
    isJson = (response.headers.get("Content-Type") ?? "").includes("application/json");
    text = await response.text();
  } catch {
    // Unreadable response: there is nothing better than the fallback.
    return fallbackMessage;
  }
  if (!text) return fallbackMessage;
  if (isJson) {
    try {
      const error = JSON.parse(text)?.error;
      if (typeof error === "string" && error) return error;
      if (typeof error?.message === "string" && error.message) return error.message;
    } catch {
      // Malformed JSON despite the header: surface the raw text below.
    }
  }
  return text;
}
1101
1653
  /**
1102
1654
  * Map coordinate-based pointing from screenshot space to viewport space.
1103
1655
  */
@@ -1127,14 +1679,18 @@ var CursorBuddyClient = class {
1127
1679
  options;
1128
1680
  voiceCapture;
1129
1681
  audioPlayback;
1682
+ browserSpeech;
1683
+ liveTranscription;
1130
1684
  screenCapture;
1131
1685
  pointerController;
1132
1686
  stateMachine;
1687
+ liveTranscript = "";
1133
1688
  transcript = "";
1134
1689
  response = "";
1135
1690
  error = null;
1136
1691
  abortController = null;
1137
1692
  historyCommittedForTurn = false;
1693
+ speechProviderForTurn = null;
1138
1694
  cachedSnapshot;
1139
1695
  listeners = /* @__PURE__ */ new Set();
1140
1696
  constructor(endpoint, options = {}, services = {}) {
@@ -1142,11 +1698,18 @@ var CursorBuddyClient = class {
1142
1698
  this.options = options;
1143
1699
  this.voiceCapture = services.voiceCapture ?? new VoiceCaptureService();
1144
1700
  this.audioPlayback = services.audioPlayback ?? new AudioPlaybackService();
1701
+ this.browserSpeech = services.browserSpeech ?? new BrowserSpeechService();
1702
+ this.liveTranscription = services.liveTranscription ?? new LiveTranscriptionService();
1145
1703
  this.screenCapture = services.screenCapture ?? new ScreenCaptureService();
1146
1704
  this.pointerController = services.pointerController ?? new PointerController();
1147
1705
  this.stateMachine = createStateMachine();
1148
1706
  this.cachedSnapshot = this.buildSnapshot();
1149
1707
  this.voiceCapture.onLevel((level) => $audioLevel.set(level));
1708
+ this.liveTranscription.onPartial((text) => {
1709
+ if (this.liveTranscript === text) return;
1710
+ this.liveTranscript = text;
1711
+ this.notify();
1712
+ });
1150
1713
  this.stateMachine.subscribe(() => {
1151
1714
  this.options.onStateChange?.(this.stateMachine.getState());
1152
1715
  this.notify();
@@ -1159,15 +1722,23 @@ var CursorBuddyClient = class {
1159
1722
  */
1160
1723
  startListening() {
1161
1724
  this.abort();
1725
+ this.liveTranscript = "";
1162
1726
  this.transcript = "";
1163
1727
  this.response = "";
1164
1728
  this.error = null;
1165
1729
  this.historyCommittedForTurn = false;
1730
+ this.speechProviderForTurn = null;
1166
1731
  this.pointerController.release();
1167
1732
  this.stateMachine.transition({ type: "HOTKEY_PRESSED" });
1168
1733
  this.notify();
1169
1734
  this.abortController = new AbortController();
1170
- this.voiceCapture.start().catch((err) => this.handleError(err));
1735
+ const signal = this.abortController.signal;
1736
+ this.beginListeningSession(signal).catch((error) => {
1737
+ if (signal.aborted) return;
1738
+ this.voiceCapture.dispose();
1739
+ this.liveTranscription.dispose();
1740
+ this.handleError(toError(error, "Failed to start listening"));
1741
+ });
1171
1742
  }
1172
1743
  /**
1173
1744
  * Stop listening and process the voice input.
@@ -1176,37 +1747,40 @@ var CursorBuddyClient = class {
1176
1747
  if (this.stateMachine.getState() !== "listening") return;
1177
1748
  this.stateMachine.transition({ type: "HOTKEY_RELEASED" });
1178
1749
  const signal = this.abortController?.signal;
1750
+ let turnFailure = null;
1751
+ const failTurn = (error) => {
1752
+ if (turnFailure || signal?.aborted) return;
1753
+ turnFailure = error;
1754
+ this.audioPlayback.stop();
1755
+ this.browserSpeech.stop();
1756
+ this.abortController?.abort();
1757
+ };
1179
1758
  try {
1180
- const [audioBlob, screenshot] = await Promise.all([this.voiceCapture.stop(), this.screenCapture.captureAnnotated()]);
1759
+ const [audioBlob, screenshot, browserTranscript] = await Promise.all([
1760
+ this.voiceCapture.stop(),
1761
+ this.screenCapture.captureAnnotated(),
1762
+ this.stopLiveTranscription()
1763
+ ]);
1764
+ if (turnFailure) throw turnFailure;
1181
1765
  if (signal?.aborted) return;
1182
- const transcript = await this.transcribe(audioBlob, signal);
1766
+ const transcript = await this.resolveTranscript(browserTranscript, audioBlob, signal);
1767
+ if (turnFailure) throw turnFailure;
1183
1768
  if (signal?.aborted) return;
1769
+ this.liveTranscript = "";
1184
1770
  this.transcript = transcript;
1185
1771
  this.options.onTranscript?.(transcript);
1186
1772
  this.notify();
1187
- const response = await this.chat(transcript, screenshot, signal);
1188
- if (signal?.aborted) return;
1189
- const parsed = parsePointingTagRaw(response);
1190
- const cleanResponse = stripPointingTag(response);
1191
- this.response = cleanResponse;
1192
- this.stateMachine.transition({
1193
- type: "AI_RESPONSE_COMPLETE",
1194
- response: cleanResponse
1773
+ this.prepareSpeechMode();
1774
+ const { cleanResponse, fullResponse, playbackQueue } = await this.chatAndSpeak(transcript, screenshot, signal, {
1775
+ onFailure: failTurn,
1776
+ onPlaybackStart: () => {
1777
+ this.stateMachine.transition({ type: "RESPONSE_STARTED" });
1778
+ }
1195
1779
  });
1780
+ if (turnFailure) throw turnFailure;
1781
+ if (signal?.aborted) return;
1782
+ const parsed = parsePointingTagRaw(fullResponse);
1196
1783
  this.options.onResponse?.(cleanResponse);
1197
- const newHistory = [
1198
- ...$conversationHistory.get(),
1199
- {
1200
- role: "user",
1201
- content: transcript
1202
- },
1203
- {
1204
- role: "assistant",
1205
- content: cleanResponse
1206
- }
1207
- ];
1208
- $conversationHistory.set(newHistory);
1209
- this.historyCommittedForTurn = true;
1210
1784
  let pointTarget = null;
1211
1785
  if (parsed) if (parsed.type === "marker") {
1212
1786
  const coords = resolveMarkerToCoordinates(screenshot.markerMap, parsed.markerId);
@@ -1222,12 +1796,30 @@ var CursorBuddyClient = class {
1222
1796
  this.options.onPoint?.(pointTarget);
1223
1797
  this.pointerController.pointAt(pointTarget);
1224
1798
  }
1225
- if (cleanResponse) await this.speak(cleanResponse, signal);
1799
+ await playbackQueue.waitForCompletion();
1800
+ if (turnFailure) throw turnFailure;
1226
1801
  if (signal?.aborted) return;
1802
+ const newHistory = [
1803
+ ...$conversationHistory.get(),
1804
+ {
1805
+ role: "user",
1806
+ content: transcript
1807
+ },
1808
+ {
1809
+ role: "assistant",
1810
+ content: cleanResponse
1811
+ }
1812
+ ];
1813
+ $conversationHistory.set(newHistory);
1814
+ this.historyCommittedForTurn = true;
1227
1815
  this.stateMachine.transition({ type: "TTS_COMPLETE" });
1228
1816
  } catch (err) {
1817
+ if (turnFailure) {
1818
+ this.handleError(turnFailure);
1819
+ return;
1820
+ }
1229
1821
  if (signal?.aborted) return;
1230
- this.handleError(err instanceof Error ? err : /* @__PURE__ */ new Error("Unknown error"));
1822
+ this.handleError(toError(err));
1231
1823
  }
1232
1824
  }
1233
1825
  /**
@@ -1258,6 +1850,7 @@ var CursorBuddyClient = class {
1258
1850
  */
1259
1851
  reset() {
1260
1852
  this.abort();
1853
+ this.liveTranscript = "";
1261
1854
  this.transcript = "";
1262
1855
  this.response = "";
1263
1856
  this.error = null;
@@ -1293,6 +1886,7 @@ var CursorBuddyClient = class {
1293
1886
  buildSnapshot() {
1294
1887
  return {
1295
1888
  state: this.stateMachine.getState(),
1889
+ liveTranscript: this.liveTranscript,
1296
1890
  transcript: this.transcript,
1297
1891
  response: this.response,
1298
1892
  error: this.error,
@@ -1304,7 +1898,11 @@ var CursorBuddyClient = class {
1304
1898
  this.commitPartialHistory();
1305
1899
  this.abortController?.abort();
1306
1900
  this.abortController = null;
1901
+ this.voiceCapture.dispose();
1902
+ this.liveTranscription.dispose();
1307
1903
  this.audioPlayback.stop();
1904
+ this.browserSpeech.stop();
1905
+ this.speechProviderForTurn = null;
1308
1906
  $audioLevel.set(0);
1309
1907
  }
1310
1908
  /**
@@ -1337,11 +1935,15 @@ var CursorBuddyClient = class {
1337
1935
  body: formData,
1338
1936
  signal
1339
1937
  });
1340
- if (!response.ok) throw new Error("Transcription failed");
1938
+ if (!response.ok) throw new Error(await readErrorMessage(response, "Transcription failed"));
1341
1939
  const { text } = await response.json();
1342
1940
  return text;
1343
1941
  }
1344
- async chat(transcript, screenshot, signal) {
1942
+ /**
1943
+ * Stream the chat response, keep the visible text updated, and feed complete
1944
+ * speech segments into the TTS queue as soon as they are ready.
1945
+ */
1946
+ async chatAndSpeak(transcript, screenshot, signal, options) {
1345
1947
  const history = $conversationHistory.get();
1346
1948
  const response = await fetch(`${this.endpoint}/chat`, {
1347
1949
  method: "POST",
@@ -1362,29 +1964,136 @@ var CursorBuddyClient = class {
1362
1964
  const reader = response.body?.getReader();
1363
1965
  if (!reader) throw new Error("No response body");
1364
1966
  const decoder = new TextDecoder();
1365
- let fullResponse = "";
1967
+ const responseProcessor = new ProgressiveResponseProcessor();
1968
+ const playbackQueue = new TTSPlaybackQueue({
1969
+ onError: options.onFailure,
1970
+ onPlaybackStart: options.onPlaybackStart,
1971
+ prepare: (text, currentSignal) => this.prepareSpeechSegment(text, currentSignal),
1972
+ signal
1973
+ });
1974
+ const shouldStreamSpeech = this.isSpeechStreamingEnabled();
1366
1975
  while (true) {
1367
1976
  const { done, value } = await reader.read();
1368
1977
  if (done) break;
1369
1978
  const chunk = decoder.decode(value, { stream: true });
1370
- fullResponse += chunk;
1371
- this.response = stripPointingTag(fullResponse);
1372
- this.notify();
1979
+ const { speechSegments, visibleText } = responseProcessor.push(chunk);
1980
+ if (shouldStreamSpeech) for (const speechSegment of speechSegments) playbackQueue.enqueue(speechSegment);
1981
+ this.updateResponse(visibleText);
1982
+ }
1983
+ const trailingChunk = decoder.decode();
1984
+ if (trailingChunk) {
1985
+ const { speechSegments, visibleText } = responseProcessor.push(trailingChunk);
1986
+ if (shouldStreamSpeech) for (const speechSegment of speechSegments) playbackQueue.enqueue(speechSegment);
1987
+ this.updateResponse(visibleText);
1373
1988
  }
1374
- return fullResponse;
1989
+ const finalizedResponse = responseProcessor.finish();
1990
+ if (shouldStreamSpeech) for (const speechSegment of finalizedResponse.speechSegments) playbackQueue.enqueue(speechSegment);
1991
+ else playbackQueue.enqueue(finalizedResponse.finalResponseText);
1992
+ this.updateResponse(finalizedResponse.finalResponseText);
1993
+ return {
1994
+ cleanResponse: finalizedResponse.finalResponseText,
1995
+ fullResponse: finalizedResponse.fullResponse,
1996
+ playbackQueue
1997
+ };
1375
1998
  }
1376
- async speak(text, signal) {
1999
+ /**
2000
+ * Request server-side TTS audio for one text segment.
2001
+ */
2002
+ async synthesizeSpeech(text, signal) {
1377
2003
  const response = await fetch(`${this.endpoint}/tts`, {
1378
2004
  method: "POST",
1379
2005
  headers: { "Content-Type": "application/json" },
1380
2006
  body: JSON.stringify({ text }),
1381
2007
  signal
1382
2008
  });
1383
- if (!response.ok) throw new Error("TTS request failed");
1384
- const audioBlob = await response.blob();
1385
- await this.audioPlayback.play(audioBlob, signal);
2009
+ if (!response.ok) throw new Error(await readErrorMessage(response, "TTS request failed"));
2010
+ return response.blob();
2011
+ }
2012
+ /**
2013
+ * Resolve the initial speech provider for this turn.
2014
+ *
2015
+ * Decision tree:
2016
+ * 1. In `server` mode, always synthesize on the server.
2017
+ * 2. In `browser` mode, require browser speech support up front.
2018
+ * 3. In `auto` mode, prefer browser speech when available and keep that
2019
+ * choice cached so later segments stay on the same provider unless a
2020
+ * browser failure forces a one-way fallback to the server.
2021
+ */
2022
+ prepareSpeechMode() {
2023
+ const speechMode = this.getSpeechMode();
2024
+ if (speechMode === "browser" && !this.browserSpeech.isAvailable()) throw new Error("Browser speech is not supported");
2025
+ if (speechMode === "server") {
2026
+ this.speechProviderForTurn = "server";
2027
+ return;
2028
+ }
2029
+ if (speechMode === "browser") {
2030
+ this.speechProviderForTurn = "browser";
2031
+ return;
2032
+ }
2033
+ this.speechProviderForTurn = this.browserSpeech.isAvailable() ? "browser" : "server";
2034
+ }
2035
+ /**
2036
+ * Prepare a playback task for one text segment.
2037
+ *
2038
+ * The queue calls this eagerly so server synthesis can overlap with the
2039
+ * currently playing segment, but the returned task is still executed in the
2040
+ * original enqueue order.
2041
+ */
2042
+ async prepareSpeechSegment(text, signal) {
2043
+ switch (this.getSpeechMode()) {
2044
+ case "server": return this.prepareServerSpeechTask(text, signal);
2045
+ case "browser": return this.prepareBrowserSpeechTask(text, signal);
2046
+ default: return this.prepareAutoSpeechTask(text, signal);
2047
+ }
2048
+ }
2049
+ /**
2050
+ * Synthesize server audio immediately and return a playback task that reuses
2051
+ * the prepared blob later.
2052
+ */
2053
+ async prepareServerSpeechTask(text, signal) {
2054
+ const blob = await this.synthesizeSpeech(text, signal);
2055
+ return () => this.audioPlayback.play(blob, signal);
2056
+ }
2057
+ /**
2058
+ * Return a browser playback task for one text segment.
2059
+ */
2060
+ async prepareBrowserSpeechTask(text, signal) {
2061
+ return () => this.browserSpeech.speak(text, signal);
2062
+ }
2063
+ /**
2064
+ * Prepare a playback task for `auto` mode.
2065
+ *
2066
+ * We prefer the browser for low latency, but if browser speech fails for any
2067
+ * segment we permanently switch the remainder of the turn to server TTS so
2068
+ * later segments do not keep retrying the failing browser path.
2069
+ */
2070
+ async prepareAutoSpeechTask(text, signal) {
2071
+ if (this.getAutoSpeechProvider() === "server") return this.prepareServerSpeechTask(text, signal);
2072
+ return async () => {
2073
+ if (this.getAutoSpeechProvider() === "server") {
2074
+ await (await this.prepareServerSpeechTask(text, signal))();
2075
+ return;
2076
+ }
2077
+ try {
2078
+ await this.browserSpeech.speak(text, signal);
2079
+ } catch (error) {
2080
+ if (signal?.aborted) return;
2081
+ this.speechProviderForTurn = "server";
2082
+ await (await this.prepareServerSpeechTask(text, signal))();
2083
+ }
2084
+ };
2085
+ }
2086
+ /**
2087
+ * Read the current provider choice for `auto` mode, lazily defaulting to the
2088
+ * browser when supported and the server otherwise.
2089
+ */
2090
+ getAutoSpeechProvider() {
2091
+ if (this.speechProviderForTurn) return this.speechProviderForTurn;
2092
+ this.speechProviderForTurn = this.browserSpeech.isAvailable() ? "browser" : "server";
2093
+ return this.speechProviderForTurn;
1386
2094
  }
1387
2095
  handleError(err) {
2096
+ this.liveTranscript = "";
1388
2097
  this.error = err;
1389
2098
  this.stateMachine.transition({
1390
2099
  type: "ERROR",
@@ -1393,6 +2102,86 @@ var CursorBuddyClient = class {
1393
2102
  this.options.onError?.(err);
1394
2103
  this.notify();
1395
2104
  }
2105
+ /**
2106
+ * Resolve the effective transcription mode for the current client.
2107
+ */
2108
+ getTranscriptionMode() {
2109
+ return this.options.transcription?.mode ?? "auto";
2110
+ }
2111
+ /**
2112
+ * Resolve the effective speech mode for the current client.
2113
+ */
2114
+ getSpeechMode() {
2115
+ return this.options.speech?.mode ?? "server";
2116
+ }
2117
+ /**
2118
+ * Decide whether speech should start before the full chat response is ready.
2119
+ */
2120
+ isSpeechStreamingEnabled() {
2121
+ return this.options.speech?.allowStreaming ?? false;
2122
+ }
2123
+ /**
2124
+ * Decide whether this turn should attempt browser speech recognition.
2125
+ */
2126
+ shouldAttemptBrowserTranscription() {
2127
+ return this.getTranscriptionMode() !== "server";
2128
+ }
2129
+ /**
2130
+ * Decide whether browser speech recognition is mandatory for this turn.
2131
+ */
2132
+ isBrowserTranscriptionRequired() {
2133
+ return this.getTranscriptionMode() === "browser";
2134
+ }
2135
+ /**
2136
+ * Start the recorder and browser speech recognition together.
2137
+ *
2138
+ * The recorder always runs so we keep waveform updates and preserve a raw
2139
+ * audio backup for server fallback in `auto` mode.
2140
+ */
2141
+ async beginListeningSession(signal) {
2142
+ const shouldAttemptBrowser = this.shouldAttemptBrowserTranscription();
2143
+ const isBrowserTranscriptionAvailable = shouldAttemptBrowser && this.liveTranscription.isAvailable();
2144
+ if (shouldAttemptBrowser && !isBrowserTranscriptionAvailable) {
2145
+ if (this.isBrowserTranscriptionRequired()) throw new Error("Browser transcription is not supported");
2146
+ }
2147
+ const [voiceCaptureResult, browserTranscriptionResult] = await Promise.allSettled([this.voiceCapture.start(), isBrowserTranscriptionAvailable ? this.liveTranscription.start() : Promise.resolve(void 0)]);
2148
+ if (signal.aborted) return;
2149
+ if (voiceCaptureResult.status === "rejected") throw toError(voiceCaptureResult.reason, "Failed to start microphone");
2150
+ if (browserTranscriptionResult.status === "rejected" && this.isBrowserTranscriptionRequired()) throw toError(browserTranscriptionResult.reason, "Browser transcription failed to start");
2151
+ if (browserTranscriptionResult.status === "rejected") this.liveTranscription.dispose();
2152
+ }
2153
+ /**
2154
+ * Stop browser speech recognition and return the best final transcript it
2155
+ * produced for this turn.
2156
+ */
2157
+ async stopLiveTranscription() {
2158
+ if (!this.shouldAttemptBrowserTranscription() || !this.liveTranscription.isAvailable()) return "";
2159
+ try {
2160
+ return await this.liveTranscription.stop();
2161
+ } catch (error) {
2162
+ if (this.isBrowserTranscriptionRequired()) throw toError(error, "Browser transcription failed");
2163
+ return "";
2164
+ }
2165
+ }
2166
+ /**
2167
+ * Choose the transcript that should drive the turn.
2168
+ *
2169
+ * Decision tree:
2170
+ * 1. Use the browser transcript when it is available.
2171
+ * 2. In browser-only mode, fail if the browser produced nothing usable.
2172
+ * 3. In auto/server modes, fall back to the recorded audio upload.
2173
+ */
2174
+ async resolveTranscript(browserTranscript, audioBlob, signal) {
2175
+ const normalizedBrowserTranscript = browserTranscript.trim();
2176
+ if (normalizedBrowserTranscript) return normalizedBrowserTranscript;
2177
+ if (this.getTranscriptionMode() === "browser") throw new Error("Browser transcription did not produce a final transcript");
2178
+ return this.transcribe(audioBlob, signal);
2179
+ }
2180
+ updateResponse(text) {
2181
+ if (this.response === text) return;
2182
+ this.response = text;
2183
+ this.notify();
2184
+ }
1396
2185
  notify() {
1397
2186
  this.cachedSnapshot = this.buildSnapshot();
1398
2187
  this.listeners.forEach((listener) => listener());
@@ -1401,4 +2190,4 @@ var CursorBuddyClient = class {
1401
2190
  //#endregion
1402
2191
  export { $buddyScale as a, $buddyRotation as i, $audioLevel as n, $cursorPosition as o, $buddyPosition as r, $pointingTarget as s, CursorBuddyClient as t };
1403
2192
 
1404
- //# sourceMappingURL=client-DAa4L2fE.mjs.map
2193
+ //# sourceMappingURL=client-UXGQt-7f.mjs.map