cursor-buddy 0.0.8 → 0.0.9-beta.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -21,80 +21,6 @@ const $isEnabled = atom(true);
21
21
  atom(false);
22
22
  const $conversationHistory = atom([]);
23
23
  //#endregion
24
- //#region src/core/pointing.ts
25
- /**
26
- * Parses POINT tags from AI responses.
27
- *
28
- * Supports two formats:
29
- * - Marker-based: [POINT:5:label] - 3 parts, references a numbered marker
30
- * - Coordinate-based: [POINT:640,360:label] - 4 parts, raw pixel coordinates
31
- */
32
- const POINTING_TAG_REGEX = /\[POINT:(\d+)(?:,(\d+))?:([^\]]+)\]\s*$/;
33
- const PARTIAL_POINTING_PREFIXES = new Set([
34
- "[",
35
- "[P",
36
- "[PO",
37
- "[POI",
38
- "[POIN",
39
- "[POINT",
40
- "[POINT:"
41
- ]);
42
- function stripTrailingPointingTag(response, trimResult) {
43
- const stripped = response.replace(POINTING_TAG_REGEX, "");
44
- return trimResult ? stripped.trim() : stripped;
45
- }
46
- function getPartialPointingTagStart(response) {
47
- const lastOpenBracket = response.lastIndexOf("[");
48
- if (lastOpenBracket === -1) return -1;
49
- const suffix = response.slice(lastOpenBracket).trimEnd();
50
- if (suffix.includes("]")) return -1;
51
- if (suffix.startsWith("[POINT:")) {
52
- let start = lastOpenBracket;
53
- while (start > 0 && /\s/.test(response[start - 1] ?? "")) start--;
54
- return start;
55
- }
56
- return PARTIAL_POINTING_PREFIXES.has(suffix) ? lastOpenBracket : -1;
57
- }
58
- /**
59
- * Parse pointing tag into structured result.
60
- * Returns null if no valid POINT tag is found at the end.
61
- */
62
- function parsePointingTagRaw(response) {
63
- const match = response.match(POINTING_TAG_REGEX);
64
- if (!match) return null;
65
- const first = Number.parseInt(match[1], 10);
66
- const second = match[2] ? Number.parseInt(match[2], 10) : null;
67
- const label = match[3].trim();
68
- if (second !== null) return {
69
- type: "coordinates",
70
- x: first,
71
- y: second,
72
- label
73
- };
74
- return {
75
- type: "marker",
76
- markerId: first,
77
- label
78
- };
79
- }
80
- /**
81
- * Remove POINT tag from response text for display/TTS.
82
- */
83
- function stripPointingTag(response) {
84
- return stripTrailingPointingTag(response, true);
85
- }
86
- /**
87
- * Strip complete or partial trailing POINT syntax while the response streams.
88
- * This keeps the visible text and TTS input stable even if the tag arrives
89
- * incrementally over multiple chunks.
90
- */
91
- function stripTrailingPointingSyntax(response) {
92
- const withoutCompleteTag = stripTrailingPointingTag(response, false);
93
- const partialTagStart = getPartialPointingTagStart(withoutCompleteTag);
94
- if (partialTagStart === -1) return withoutCompleteTag.trimEnd();
95
- return withoutCompleteTag.slice(0, partialTagStart).trimEnd();
96
- }
97
- //#endregion
98
24
  //#region src/core/utils/error.ts
99
25
  /**
100
26
  * Normalize unknown thrown values into Error instances.
@@ -685,231 +611,199 @@ var PointerController = class {
685
611
  }
686
612
  };
687
613
  //#endregion
688
- //#region src/core/utils/annotations.ts
689
- const DEFAULT_STYLE = {
690
- borderColor: "rgba(255, 0, 0, 0.8)",
691
- labelBackground: "rgba(255, 0, 0, 0.9)",
692
- labelColor: "#ffffff",
693
- borderWidth: 2,
694
- fontSize: 11,
695
- labelPadding: 4
696
- };
697
- /**
698
- * Draw annotation markers onto a canvas.
699
- * Modifies the canvas in place.
700
- *
701
- * @param ctx Canvas 2D context to draw on
702
- * @param markers Marker map from element discovery
703
- * @param style Optional style overrides
704
- */
705
- function drawAnnotations(ctx, markers, style = {}) {
706
- const s = {
707
- ...DEFAULT_STYLE,
708
- ...style
709
- };
710
- ctx.save();
711
- for (const marker of markers.values()) {
712
- const { rect, id } = marker;
713
- ctx.strokeStyle = s.borderColor;
714
- ctx.lineWidth = s.borderWidth;
715
- ctx.strokeRect(rect.left, rect.top, rect.width, rect.height);
716
- const label = String(id);
717
- ctx.font = `bold ${s.fontSize}px monospace`;
718
- const textWidth = ctx.measureText(label).width;
719
- const textHeight = s.fontSize;
720
- const labelWidth = textWidth + s.labelPadding * 2;
721
- const labelHeight = textHeight + s.labelPadding;
722
- const labelX = rect.left - s.borderWidth;
723
- const labelY = rect.top < labelHeight + 4 ? rect.top + 2 : rect.top - labelHeight;
724
- ctx.fillStyle = s.labelBackground;
725
- ctx.beginPath();
726
- ctx.roundRect(labelX, labelY, labelWidth, labelHeight, 2);
727
- ctx.fill();
728
- ctx.fillStyle = s.labelColor;
729
- ctx.textBaseline = "top";
730
- ctx.fillText(label, labelX + s.labelPadding, labelY + s.labelPadding / 2);
731
- }
732
- ctx.restore();
733
- }
734
- /**
735
- * Create an annotated copy of a canvas.
736
- * Does not modify the original canvas.
737
- *
738
- * @param sourceCanvas Original screenshot canvas
739
- * @param markers Marker map from element discovery
740
- * @returns New canvas with annotations drawn
741
- */
742
- function createAnnotatedCanvas(sourceCanvas, markers) {
743
- const canvas = document.createElement("canvas");
744
- canvas.width = sourceCanvas.width;
745
- canvas.height = sourceCanvas.height;
746
- const ctx = canvas.getContext("2d");
747
- if (!ctx) throw new Error("Failed to get canvas 2D context");
748
- ctx.drawImage(sourceCanvas, 0, 0);
749
- drawAnnotations(ctx, markers);
750
- return canvas;
751
- }
752
- /**
753
- * Generate marker context string for AI prompt.
754
- * Lists available markers with their descriptions.
755
- *
756
- * @param markers Marker map from element discovery
757
- * @returns Formatted string listing markers
758
- */
759
- function generateMarkerContext(markers) {
760
- if (markers.size === 0) return "No interactive elements detected.";
761
- const lines = ["Interactive elements (use marker number to point):"];
762
- for (const marker of markers.values()) lines.push(` ${marker.id}: ${marker.description}`);
763
- return lines.join("\n");
764
- }
765
- //#endregion
766
- //#region src/core/utils/elements.ts
767
- /**
768
- * Element discovery for annotated screenshots.
769
- * Finds visible interactive elements and assigns marker IDs.
770
- */
771
- /** Max characters for element descriptions passed to the model. */
772
- const MAX_DESCRIPTION_LENGTH = 50;
773
- /** Pixels tolerance for grouping elements into the same visual row. */
774
- const ROW_TOLERANCE_PX = 20;
775
- /**
776
- * Interactive element selectors - elements users would want to click/interact with.
777
- * Mirrors accessibility roles from agent-browser but using CSS selectors.
778
- */
779
- const INTERACTIVE_SELECTORS = [
780
- "button",
781
- "[role=\"button\"]",
782
- "input[type=\"button\"]",
783
- "input[type=\"submit\"]",
784
- "input[type=\"reset\"]",
785
- "a[href]",
786
- "[role=\"link\"]",
787
- "input:not([type=\"hidden\"])",
788
- "textarea",
789
- "select",
790
- "[role=\"textbox\"]",
791
- "[role=\"searchbox\"]",
792
- "[role=\"combobox\"]",
793
- "[role=\"listbox\"]",
794
- "[role=\"slider\"]",
795
- "[role=\"spinbutton\"]",
796
- "[role=\"checkbox\"]",
797
- "[role=\"radio\"]",
798
- "[role=\"switch\"]",
799
- "[role=\"menuitem\"]",
800
- "[role=\"menuitemcheckbox\"]",
801
- "[role=\"menuitemradio\"]",
802
- "[role=\"option\"]",
803
- "[role=\"tab\"]",
804
- "[role=\"treeitem\"]",
805
- "video",
806
- "audio",
807
- "[data-cursor-buddy-interactive]"
614
+ //#region src/core/utils/dom-snapshot.ts
615
+ const EXCLUDED_TAGS = new Set([
616
+ "script",
617
+ "link",
618
+ "style",
619
+ "noscript",
620
+ "head"
621
+ ]);
622
+ const DEFAULT_INCLUDED_ATTRIBUTES = [
623
+ "id",
624
+ "name",
625
+ "type",
626
+ "placeholder",
627
+ "href",
628
+ "title",
629
+ "value",
630
+ "role"
808
631
  ];
809
- /**
810
- * Check if an element is visible in the viewport.
811
- */
812
- function isElementVisible(element, rect = element.getBoundingClientRect()) {
813
- if (rect.width <= 0 || rect.height <= 0) return false;
814
- if (rect.bottom < 0 || rect.top > window.innerHeight || rect.right < 0 || rect.left > window.innerWidth) return false;
815
- const style = window.getComputedStyle(element);
816
- if (style.visibility === "hidden" || style.display === "none") return false;
817
- if (Number.parseFloat(style.opacity) === 0) return false;
818
- return true;
819
- }
820
- function truncateDescription(value) {
821
- return value.slice(0, MAX_DESCRIPTION_LENGTH);
822
- }
823
- /**
824
- * Generate a brief description for an element.
825
- */
826
- function describeElement(element) {
827
- const tag = element.tagName.toLowerCase();
828
- const ariaLabel = element.getAttribute("aria-label");
829
- if (ariaLabel) return truncateDescription(ariaLabel);
830
- if (tag === "button" || tag === "a") {
831
- const text = element.textContent?.trim();
832
- if (text) return truncateDescription(text);
833
- }
834
- if (tag === "input" || tag === "textarea") {
835
- const placeholder = element.getAttribute("placeholder");
836
- if (placeholder) return truncateDescription(placeholder);
837
- return `${element.getAttribute("type") || "text"} input`;
838
- }
839
- if (tag === "img") {
840
- const alt = element.getAttribute("alt");
841
- if (alt) return truncateDescription(alt);
842
- return "image";
843
- }
844
- const role = element.getAttribute("role");
845
- if (role) return role;
846
- return tag;
847
- }
848
- function collectVisibleInteractiveElements() {
849
- const selector = INTERACTIVE_SELECTORS.join(",");
850
- const allElements = document.querySelectorAll(selector);
851
- const visible = [];
852
- for (const element of allElements) {
853
- const rect = element.getBoundingClientRect();
854
- if (!isElementVisible(element, rect)) continue;
855
- visible.push({
856
- element,
857
- rect
858
- });
632
+ function buildVisibleDomSnapshot(root, options = {}) {
633
+ const { maxTextLength = 80, maxNodes = 1500, includeRects = true, rootLabel = "viewport", includedAttributes = DEFAULT_INCLUDED_ATTRIBUTES } = options;
634
+ const doc = root instanceof Document ? root : root.ownerDocument || document;
635
+ const startRoot = root instanceof Document ? root.documentElement : root;
636
+ const win = doc.defaultView || window;
637
+ const viewportW = win.innerWidth || 0;
638
+ const viewportH = win.innerHeight || 0;
639
+ let nextId = 1;
640
+ let nodeCount = 0;
641
+ const idToElement = /* @__PURE__ */ new Map();
642
+ const lines = [`# ${rootLabel} ${viewportW}x${viewportH}`];
643
+ /**
644
+ * Returns true when the element is worth considering for the snapshot.
645
+ *
646
+ * This is intentionally simple:
647
+ * - skip excluded tags
648
+ * - skip hidden/display:none/visibility:hidden/etc
649
+ * - skip zero-size elements
650
+ * - skip elements fully outside the viewport
651
+ */
652
+ function isElementVisible(el) {
653
+ const tag = el.tagName.toLowerCase();
654
+ if (EXCLUDED_TAGS.has(tag)) return false;
655
+ if (!(el instanceof HTMLElement)) return false;
656
+ if (el.hidden) return false;
657
+ if (el.closest("head")) return false;
658
+ if (typeof el.checkVisibility === "function") try {
659
+ if (!el.checkVisibility({
660
+ opacityProperty: true,
661
+ visibilityProperty: true,
662
+ contentVisibilityAuto: true
663
+ })) return false;
664
+ } catch {}
665
+ const style = win.getComputedStyle(el);
666
+ if (style.display === "none") return false;
667
+ if (style.visibility === "hidden" || style.visibility === "collapse") return false;
668
+ if (style.opacity === "0") return false;
669
+ if (style.contentVisibility === "hidden") return false;
670
+ const rect = el.getBoundingClientRect();
671
+ if (rect.width <= 0 || rect.height <= 0) return false;
672
+ if (rect.bottom <= 0 || rect.right <= 0) return false;
673
+ if (rect.top >= viewportH || rect.left >= viewportW) return false;
674
+ return true;
859
675
  }
860
- visible.sort((a, b) => {
861
- const rowDiff = Math.floor(a.rect.top / ROW_TOLERANCE_PX) - Math.floor(b.rect.top / ROW_TOLERANCE_PX);
862
- if (rowDiff !== 0) return rowDiff;
863
- return a.rect.left - b.rect.left;
864
- });
865
- return visible;
866
- }
867
- /**
868
- * Create marker map from visible interactive elements.
869
- * Assigns sequential IDs starting from 1.
870
- */
871
- function createMarkerMap() {
872
- const elements = collectVisibleInteractiveElements();
873
- const map = /* @__PURE__ */ new Map();
874
- elements.forEach(({ element, rect }, index) => {
875
- const id = index + 1;
876
- map.set(id, {
676
+ /**
677
+ * Extracts a compact text representation from the element itself.
678
+ *
679
+ * No semantic guessing:
680
+ * - prefer innerText when available
681
+ * - otherwise fall back to textContent
682
+ * - normalize whitespace
683
+ * - truncate aggressively
684
+ */
685
+ function getElementText(el) {
686
+ const text = normalizeWhitespace(el.innerText || el.textContent || "");
687
+ if (!text) return "";
688
+ return truncate(text, maxTextLength);
689
+ }
690
+ /**
691
+ * Keeps only a small allowlist of raw DOM attributes.
692
+ *
693
+ * This avoids dumping the full attribute bag, which is usually noisy
694
+ * and expensive in tokens.
695
+ */
696
+ function getIncludedAttributes(el) {
697
+ const attrs = {};
698
+ for (const name of includedAttributes) {
699
+ const value = el.getAttribute(name);
700
+ if (value == null) continue;
701
+ const clean = truncate(normalizeWhitespace(value), maxTextLength);
702
+ if (!clean) continue;
703
+ attrs[name] = clean;
704
+ }
705
+ return attrs;
706
+ }
707
+ /**
708
+ * Rounds the client rect so the output is smaller and more stable.
709
+ */
710
+ function quantizeRect(el) {
711
+ const r = el.getBoundingClientRect();
712
+ return {
713
+ x: Math.max(0, Math.round(r.left)),
714
+ y: Math.max(0, Math.round(r.top)),
715
+ w: Math.round(r.width),
716
+ h: Math.round(r.height)
717
+ };
718
+ }
719
+ /**
720
+ * Decides whether this node should be emitted.
721
+ *
722
+ * Simple rule:
723
+ * - keep it if it has visible kept children
724
+ * - or keep it if it has some text
725
+ * - or keep it if it has at least one included attribute
726
+ *
727
+ * This allows non-semantic div-heavy UIs to survive without trying
728
+ * to guess intent.
729
+ */
730
+ function shouldKeepNode(text, attrs, children) {
731
+ if (children.length > 0) return true;
732
+ if (text.length > 0) return true;
733
+ if (Object.keys(attrs).length > 0) return true;
734
+ return false;
735
+ }
736
+ /**
737
+ * Single DFS traversal over the DOM.
738
+ *
739
+ * Complexity target:
740
+ * - O(N) DOM walk
741
+ * - O(1) work per element, aside from browser layout/style calls
742
+ */
743
+ function walk(el) {
744
+ if (nodeCount >= maxNodes) return null;
745
+ if (!(el instanceof HTMLElement)) return null;
746
+ if (!isElementVisible(el)) return null;
747
+ const children = [];
748
+ for (const child of Array.from(el.children)) {
749
+ const childNode = walk(child);
750
+ if (childNode) children.push(childNode);
751
+ if (nodeCount >= maxNodes) break;
752
+ }
753
+ const text = getElementText(el);
754
+ const attrs = getIncludedAttributes(el);
755
+ if (!shouldKeepNode(text, attrs, children)) return null;
756
+ const id = nextId++;
757
+ nodeCount++;
758
+ idToElement.set(id, el);
759
+ return {
877
760
  id,
878
- element,
879
- rect,
880
- description: describeElement(element)
881
- });
882
- });
883
- return map;
884
- }
885
- /**
886
- * Get the center point of an element in viewport coordinates.
887
- */
888
- function getElementCenter(element) {
889
- const rect = element.getBoundingClientRect();
761
+ tag: el.tagName.toLowerCase(),
762
+ text,
763
+ attrs,
764
+ rect: includeRects ? quantizeRect(el) : void 0,
765
+ children
766
+ };
767
+ }
768
+ /**
769
+ * Emits the final compact line-based format.
770
+ *
771
+ * Example:
772
+ * @12 div "Settings" [id="settings"] [x=10 y=20 w=200 h=40]
773
+ */
774
+ function emit(node, depth) {
775
+ const parts = [`${" ".repeat(depth)}@${node.id} ${node.tag}`];
776
+ if (node.text) parts.push(`"${escapeQuotes(node.text)}"`);
777
+ for (const [key, value] of Object.entries(node.attrs)) parts.push(`[${key}="${escapeQuotes(value)}"]`);
778
+ if (node.rect) parts.push(`[x=${node.rect.x} y=${node.rect.y} w=${node.rect.w} h=${node.rect.h}]`);
779
+ lines.push(parts.join(" "));
780
+ for (const child of node.children) emit(child, depth + 1);
781
+ }
782
+ const tree = walk(startRoot);
783
+ if (tree) emit(tree, 0);
890
784
  return {
891
- x: Math.round(rect.left + rect.width / 2),
892
- y: Math.round(rect.top + rect.height / 2)
785
+ text: lines.join("\n"),
786
+ idToElement,
787
+ nodeCount
893
788
  };
894
789
  }
895
- /**
896
- * Resolve a marker ID to viewport coordinates.
897
- * Returns null if marker not found or element no longer visible.
898
- */
899
- function resolveMarkerToCoordinates(markerMap, markerId) {
900
- const marker = markerMap.get(markerId);
901
- if (!marker) return null;
902
- if (!document.contains(marker.element)) return null;
903
- if (!isElementVisible(marker.element)) return null;
904
- return getElementCenter(marker.element);
790
/**
 * Collapse every run of whitespace in `text` to a single space and drop
 * leading/trailing whitespace.
 *
 * @param {string} text Raw text (e.g. innerText or an attribute value).
 * @returns {string} The text with words separated by exactly one space.
 */
function normalizeWhitespace(text) {
	// Splitting on whitespace runs yields empty strings at the edges when the
	// input starts/ends with whitespace; dropping them is equivalent to trim().
	const words = text.split(/\s+/).filter((word) => word.length > 0);
	return words.join(" ");
}
793
/**
 * Shorten `text` so it is at most `maxLength` UTF-16 code units long,
 * replacing the cut-off tail with a single "…" character.
 *
 * @param {string} text Text to shorten.
 * @param {number} maxLength Maximum length of the result, ellipsis included.
 * @returns {string} `text` unchanged when it already fits; otherwise the
 *   trimmed head followed by "…". A non-positive `maxLength` yields "".
 */
function truncate(text, maxLength) {
	if (text.length <= maxLength) return text;
	// A non-positive budget would turn `maxLength - 1` into a negative slice
	// index (counted from the end), returning almost the whole string.
	if (maxLength <= 0) return "";
	let head = text.slice(0, maxLength - 1);
	// Do not leave half of a surrogate pair dangling before the ellipsis.
	const lastUnit = head.charCodeAt(head.length - 1);
	if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) head = head.slice(0, -1);
	return `${head.trimEnd()}…`;
}
797
/**
 * Escape `text` for embedding inside a double-quoted token of the snapshot
 * line format.
 *
 * Backslashes are escaped together with double quotes; otherwise a value
 * ending in `\` (or containing `\"`) would be indistinguishable from an
 * escaped quote once wrapped in `"…"`.
 *
 * @param {string} text Raw text or attribute value.
 * @returns {string} Text with every `\` and `"` prefixed by a backslash.
 */
function escapeQuotes(text) {
	// "$&" re-inserts the matched character after the added backslash.
	return text.replace(/[\\"]/g, "\\$&");
}
906
800
  //#endregion
907
801
  //#region src/core/utils/screenshot.ts
908
802
  const CLONE_RESOURCE_TIMEOUT_MS = 3e3;
909
803
  /** Maximum width for compressed screenshots (maintains aspect ratio) */
910
- const MAX_SCREENSHOT_WIDTH = 1280;
911
- /** JPEG quality for compressed screenshots (0-1) */
912
- const JPEG_QUALITY = .8;
804
+ const MAX_SCREENSHOT_WIDTH = 1920;
805
+ /** JPEG quality for compressed screenshots (0-1) - higher quality for clearer details */
806
+ const JPEG_QUALITY = .95;
913
807
  /**
914
808
  * Compress a canvas image by downscaling and converting to JPEG.
915
809
  * Maintains aspect ratio and falls back to original if compression fails.
@@ -1009,7 +903,7 @@ async function waitForClonedDocumentStyles(doc) {
1009
903
  }
1010
904
  function getHtml2CanvasOptions(captureMetrics) {
1011
905
  return {
1012
- scale: 1,
906
+ scale: window.devicePixelRatio,
1013
907
  useCORS: true,
1014
908
  logging: false,
1015
909
  width: captureMetrics.viewportWidth,
@@ -1045,9 +939,10 @@ function createFallbackCanvas() {
1045
939
  return canvas;
1046
940
  }
1047
941
  /**
1048
- * Capture a screenshot of the current viewport.
1049
- * Uses html2canvas to render the DOM to a canvas, then compresses to JPEG.
1050
- * Falls back to a placeholder if capture fails (e.g., due to unsupported CSS).
942
+ * Capture a screenshot and DOM snapshot of the current viewport.
943
+ * Uses html2canvas to render the DOM to a canvas, compresses to high-quality JPEG,
944
+ * and builds a token-efficient DOM snapshot for AI context.
945
+ * Falls back to a placeholder if capture fails.
1051
946
  */
1052
947
  async function captureViewport() {
1053
948
  const captureMetrics = getCaptureMetrics();
@@ -1067,48 +962,19 @@ async function captureViewport() {
1067
962
  height: canvas.height
1068
963
  };
1069
964
  }
1070
- return {
1071
- imageData: compressed.imageData,
1072
- width: compressed.width,
1073
- height: compressed.height,
1074
- viewportWidth: captureMetrics.viewportWidth,
1075
- viewportHeight: captureMetrics.viewportHeight
1076
- };
1077
- }
1078
- /**
1079
- * Capture an annotated screenshot of the current viewport.
1080
- * Interactive elements are marked with numbered labels.
1081
- * Returns both the annotated image and a marker map for resolving IDs.
1082
- */
1083
- async function captureAnnotatedViewport() {
1084
- const captureMetrics = getCaptureMetrics();
1085
- const markerMap = createMarkerMap();
1086
- let sourceCanvas;
1087
- try {
1088
- sourceCanvas = await html2canvas(document.body, getHtml2CanvasOptions(captureMetrics));
1089
- } catch {
1090
- sourceCanvas = createFallbackCanvas();
1091
- }
1092
- const canvas = markerMap.size > 0 ? createAnnotatedCanvas(sourceCanvas, markerMap) : sourceCanvas;
1093
- const markerContext = generateMarkerContext(markerMap);
1094
- let compressed;
1095
- try {
1096
- compressed = compressImage(canvas);
1097
- } catch {
1098
- compressed = {
1099
- imageData: canvas.toDataURL("image/png"),
1100
- width: canvas.width,
1101
- height: canvas.height
1102
- };
1103
- }
965
+ const snapshot = buildVisibleDomSnapshot(document.body, {
966
+ maxNodes: 1500,
967
+ maxTextLength: 80,
968
+ includeRects: true
969
+ });
1104
970
  return {
1105
971
  imageData: compressed.imageData,
1106
972
  width: compressed.width,
1107
973
  height: compressed.height,
1108
974
  viewportWidth: captureMetrics.viewportWidth,
1109
975
  viewportHeight: captureMetrics.viewportHeight,
1110
- markerMap,
1111
- markerContext
976
+ domSnapshot: snapshot.text,
977
+ elementRegistry: snapshot.idToElement
1112
978
  };
1113
979
  }
1114
980
  //#endregion
@@ -1118,20 +984,12 @@ async function captureAnnotatedViewport() {
1118
984
  */
1119
985
  var ScreenCaptureService = class {
1120
986
  /**
1121
- * Capture a screenshot of the current viewport.
1122
- * @returns Screenshot result with image data and dimensions
987
+ * Capture a screenshot and DOM snapshot of the current viewport.
988
+ * @returns Screenshot result with image data, dimensions, and DOM snapshot
1123
989
  */
1124
990
  async capture() {
1125
991
  return captureViewport();
1126
992
  }
1127
- /**
1128
- * Capture an annotated screenshot with marker overlays.
1129
- * Interactive elements are marked with numbered labels.
1130
- * @returns Annotated screenshot result with marker map
1131
- */
1132
- async captureAnnotated() {
1133
- return captureAnnotatedViewport();
1134
- }
1135
993
  };
1136
994
  //#endregion
1137
995
  //#region src/core/services/tts-playback-queue.ts
@@ -1374,12 +1232,12 @@ const AUDIO_LEVEL_NOISE_GATE = 5e-4;
1374
1232
  const AUDIO_LEVEL_INPUT_GAIN = 600;
1375
1233
  const AUDIO_LEVEL_ATTACK = .7;
1376
1234
  const AUDIO_LEVEL_RELEASE = .25;
1377
- function clamp$1(value, min, max) {
1235
+ function clamp(value, min, max) {
1378
1236
  return Math.min(Math.max(value, min), max);
1379
1237
  }
1380
1238
  function normalizeAudioLevel(rms) {
1381
1239
  const gatedRms = Math.max(0, rms - AUDIO_LEVEL_NOISE_GATE);
1382
- return clamp$1(Math.log1p(gatedRms * AUDIO_LEVEL_INPUT_GAIN) / Math.log1p(AUDIO_LEVEL_INPUT_GAIN), 0, 1);
1240
+ return clamp(Math.log1p(gatedRms * AUDIO_LEVEL_INPUT_GAIN) / Math.log1p(AUDIO_LEVEL_INPUT_GAIN), 0, 1);
1383
1241
  }
1384
1242
  function smoothAudioLevel(current, target) {
1385
1243
  const smoothing = target > current ? AUDIO_LEVEL_ATTACK : AUDIO_LEVEL_RELEASE;
@@ -1584,6 +1442,47 @@ function createStateMachine(initial = "idle") {
1584
1442
  };
1585
1443
  }
1586
1444
  //#endregion
1445
+ //#region src/core/utils/ui-stream-parser.ts
1446
/**
 * Parse a single line from the UI message stream.
 *
 * A line is either bare JSON or an SSE data field (`data:` followed by an
 * optional space and JSON). The space after the colon is optional in the SSE
 * format, so both `data: {...}` and `data:{...}` are accepted.
 *
 * @param {string} line One line of the stream, without its newline.
 * @returns {null | {type: "text-delta", delta: string}
 *   | {type: "tool-input-available", toolName: string, input: unknown}
 *   | {type: "finish"} | {type: "error", errorText: string}
 *   | {type: "unknown"}} The normalized chunk, or null for blank lines, the
 *   `[DONE]` sentinel, and anything that is not valid JSON.
 */
function parseUIStreamLine(line) {
	const trimmed = line.trim();
	if (!trimmed) return null;
	const payload = trimmed.startsWith("data:") ? trimmed.slice(5).trimStart() : trimmed;
	if (!payload || payload === "[DONE]") return null;
	let chunk;
	try {
		chunk = JSON.parse(payload);
	} catch {
		// Non-JSON lines (SSE comments, "event:" fields, partial data) are ignored.
		return null;
	}
	// JSON `null` carries no chunk; report it like any other unusable line.
	if (chunk == null) return null;
	switch (chunk.type) {
		case "text-delta": return {
			type: "text-delta",
			delta: chunk.delta ?? ""
		};
		case "tool-input-available": return {
			type: "tool-input-available",
			toolName: chunk.toolName ?? "",
			input: chunk.input
		};
		case "finish": return { type: "finish" };
		case "error": return {
			type: "error",
			errorText: chunk.errorText ?? "Unknown error"
		};
		default: return { type: "unknown" };
	}
}
1479
/**
 * Check whether a parsed stream chunk is a `point` tool call with usable input.
 *
 * Only the presence of the `elementId` and `label` keys is checked; their
 * value types are not validated here.
 *
 * @param {{type?: string, toolName?: string, input?: unknown} | null | undefined} chunk
 *   A chunk as returned by the stream-line parser (null/undefined is tolerated).
 * @returns {boolean} True when the chunk is a `tool-input-available` event for
 *   the `point` tool whose input is an object carrying both keys.
 */
function isPointToolCall(chunk) {
	// Tolerate a missing chunk so callers need not pre-filter parser misses.
	if (chunk == null) return false;
	if (chunk.type !== "tool-input-available" || chunk.toolName !== "point") return false;
	const { input } = chunk;
	// `typeof null` is "object", hence the explicit null check before `in`.
	if (input == null || typeof input !== "object") return false;
	return "elementId" in input && "label" in input;
}
1485
+ //#endregion
1587
1486
  //#region src/core/utils/response-processor.ts
1588
1487
  const COMMON_ABBREVIATIONS = [
1589
1488
  "mr.",
@@ -1652,32 +1551,58 @@ function extractCompletedSegments(text) {
1652
1551
  };
1653
1552
  }
1654
1553
  /**
1655
- * Tracks a streaming assistant response, exposes a tag-free visible version for
1656
- * the UI, and emits speakable segments as sentence boundaries become stable.
1554
+ * Processes a streaming AI SDK UI message stream response.
1555
+ * Extracts text for display/TTS and captures point tool calls.
1657
1556
  */
1658
1557
  var ProgressiveResponseProcessor = class {
1659
- consumedVisibleTextLength = 0;
1558
+ consumedTextLength = 0;
1660
1559
  pendingShortSegment = "";
1661
- rawResponse = "";
1560
+ rawText = "";
1561
+ buffer = "";
1562
+ pointToolCall = null;
1563
+ /**
1564
+ * Push raw stream data and extract text chunks and tool calls.
1565
+ * The UI message stream format is newline-delimited JSON.
1566
+ */
1662
1567
  push(chunk) {
1663
- this.rawResponse += chunk;
1664
- const visibleText = stripTrailingPointingSyntax(this.rawResponse);
1665
- const { consumedLength, segments } = extractCompletedSegments(visibleText.slice(this.consumedVisibleTextLength));
1666
- this.consumedVisibleTextLength += consumedLength;
1568
+ this.buffer += chunk;
1569
+ const lines = this.buffer.split("\n");
1570
+ this.buffer = lines.pop() ?? "";
1571
+ const newTextParts = [];
1572
+ for (const line of lines) {
1573
+ const parsed = parseUIStreamLine(line);
1574
+ if (!parsed) continue;
1575
+ if (parsed.type === "text-delta") newTextParts.push(parsed.delta);
1576
+ else if (isPointToolCall(parsed)) {
1577
+ if (!this.pointToolCall) this.pointToolCall = parsed.input;
1578
+ }
1579
+ }
1580
+ if (newTextParts.length > 0) this.rawText += newTextParts.join("");
1581
+ const { consumedLength, segments } = extractCompletedSegments(this.rawText.slice(this.consumedTextLength));
1582
+ this.consumedTextLength += consumedLength;
1667
1583
  return {
1668
- visibleText,
1669
- speechSegments: this.coalesceSegments(segments)
1584
+ visibleText: this.rawText,
1585
+ speechSegments: this.coalesceSegments(segments),
1586
+ pointToolCall: this.pointToolCall
1670
1587
  };
1671
1588
  }
1589
+ /**
1590
+ * Finalize processing and return any remaining text/tool call.
1591
+ */
1672
1592
  finish() {
1673
- const finalResponseText = stripPointingTag(this.rawResponse);
1674
- const trailingText = finalResponseText.slice(this.consumedVisibleTextLength).trim();
1593
+ if (this.buffer) {
1594
+ const parsed = parseUIStreamLine(this.buffer);
1595
+ if (parsed?.type === "text-delta") this.rawText += parsed.delta;
1596
+ else if (parsed && isPointToolCall(parsed) && !this.pointToolCall) this.pointToolCall = parsed.input;
1597
+ this.buffer = "";
1598
+ }
1599
+ const trailingText = this.rawText.slice(this.consumedTextLength).trim();
1675
1600
  const finalSegmentParts = [this.pendingShortSegment, trailingText].filter(Boolean);
1676
1601
  this.pendingShortSegment = "";
1677
1602
  return {
1678
- fullResponse: this.rawResponse,
1679
- finalResponseText,
1680
- speechSegments: finalSegmentParts.length ? [finalSegmentParts.join(" ").trim()] : []
1603
+ finalResponseText: this.rawText.trim(),
1604
+ speechSegments: finalSegmentParts.length ? [finalSegmentParts.join(" ").trim()] : [],
1605
+ pointToolCall: this.pointToolCall
1681
1606
  };
1682
1607
  }
1683
1608
  coalesceSegments(segments) {
@@ -1698,9 +1623,6 @@ var ProgressiveResponseProcessor = class {
1698
1623
  };
1699
1624
  //#endregion
1700
1625
  //#region src/core/client.ts
1701
- function clamp(value, min, max) {
1702
- return Math.min(Math.max(value, min), max);
1703
- }
1704
1626
  async function readErrorMessage(response, fallbackMessage) {
1705
1627
  try {
1706
1628
  if ((response.headers.get("Content-Type") ?? "").includes("application/json")) {
@@ -1713,21 +1635,6 @@ async function readErrorMessage(response, fallbackMessage) {
1713
1635
  return fallbackMessage;
1714
1636
  }
1715
1637
  /**
1716
- * Map coordinate-based pointing from screenshot space to viewport space.
1717
- */
1718
- function mapCoordinatesToViewport(x, y, screenshot) {
1719
- if (screenshot.width <= 0 || screenshot.height <= 0) return {
1720
- x,
1721
- y
1722
- };
1723
- const scaleX = screenshot.viewportWidth / screenshot.width;
1724
- const scaleY = screenshot.viewportHeight / screenshot.height;
1725
- return {
1726
- x: clamp(Math.round(x * scaleX), 0, Math.max(screenshot.viewportWidth - 1, 0)),
1727
- y: clamp(Math.round(y * scaleY), 0, Math.max(screenshot.viewportHeight - 1, 0))
1728
- };
1729
- }
1730
- /**
1731
1638
  * Framework-agnostic client for cursor buddy voice interactions.
1732
1639
  *
1733
1640
  * Manages the complete voice interaction flow:
@@ -1796,7 +1703,7 @@ var CursorBuddyClient = class {
1796
1703
  this.notify();
1797
1704
  this.abortController = new AbortController();
1798
1705
  const signal = this.abortController.signal;
1799
- this.screenshotPromise = this.screenCapture.captureAnnotated();
1706
+ this.screenshotPromise = this.screenCapture.capture();
1800
1707
  this.beginListeningSession(signal).catch((error) => {
1801
1708
  if (signal.aborted) return;
1802
1709
  this.voiceCapture.dispose();
@@ -1839,7 +1746,7 @@ var CursorBuddyClient = class {
1839
1746
  this.options.onTranscript?.(transcript);
1840
1747
  this.notify();
1841
1748
  this.prepareSpeechMode();
1842
- const { cleanResponse, fullResponse, playbackQueue } = await this.chatAndSpeak(transcript, screenshot, signal, {
1749
+ const { cleanResponse, pointToolCall, playbackQueue } = await this.chatAndSpeak(transcript, screenshot, signal, {
1843
1750
  onFailure: failTurn,
1844
1751
  onPlaybackStart: () => {
1845
1752
  this.stateMachine.transition({ type: "RESPONSE_STARTED" });
@@ -1847,19 +1754,19 @@ var CursorBuddyClient = class {
1847
1754
  });
1848
1755
  if (turnFailure) throw turnFailure;
1849
1756
  if (signal?.aborted) return;
1850
- const parsed = parsePointingTagRaw(fullResponse);
1851
1757
  this.options.onResponse?.(cleanResponse);
1852
1758
  let pointTarget = null;
1853
- if (parsed) if (parsed.type === "marker") {
1854
- const coords = resolveMarkerToCoordinates(screenshot.markerMap, parsed.markerId);
1855
- if (coords) pointTarget = {
1856
- ...coords,
1857
- label: parsed.label
1858
- };
1859
- } else pointTarget = {
1860
- ...mapCoordinatesToViewport(parsed.x, parsed.y, screenshot),
1861
- label: parsed.label
1862
- };
1759
+ if (pointToolCall) {
1760
+ const element = screenshot.elementRegistry.get(pointToolCall.elementId);
1761
+ if (element) {
1762
+ const rect = element.getBoundingClientRect();
1763
+ pointTarget = {
1764
+ x: Math.round(rect.left + rect.width / 2),
1765
+ y: Math.round(rect.top + rect.height / 2),
1766
+ label: pointToolCall.label
1767
+ };
1768
+ }
1769
+ }
1863
1770
  if (pointTarget) {
1864
1771
  this.options.onPoint?.(pointTarget);
1865
1772
  this.pointerController.pointAt(pointTarget);
@@ -2025,7 +1932,7 @@ var CursorBuddyClient = class {
2025
1932
  },
2026
1933
  transcript,
2027
1934
  history,
2028
- markerContext: screenshot.markerContext
1935
+ domSnapshot: screenshot.domSnapshot
2029
1936
  }),
2030
1937
  signal
2031
1938
  });
@@ -2061,7 +1968,7 @@ var CursorBuddyClient = class {
2061
1968
  this.updateResponse(finalizedResponse.finalResponseText);
2062
1969
  return {
2063
1970
  cleanResponse: finalizedResponse.finalResponseText,
2064
- fullResponse: finalizedResponse.fullResponse,
1971
+ pointToolCall: finalizedResponse.pointToolCall,
2065
1972
  playbackQueue
2066
1973
  };
2067
1974
  }
@@ -2259,4 +2166,4 @@ var CursorBuddyClient = class {
2259
2166
  //#endregion
2260
2167
  export { $buddyScale as a, $buddyRotation as i, $audioLevel as n, $cursorPosition as o, $buddyPosition as r, $pointingTarget as s, CursorBuddyClient as t };
2261
2168
 
2262
- //# sourceMappingURL=client-D73KQZf8.mjs.map
2169
+ //# sourceMappingURL=client-CliXcNch.mjs.map