cursor-buddy 0.0.9 → 0.0.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -12,7 +12,7 @@ Customize its prompt, pass custom tools, choose between browser or server-side s
12
12
 
13
13
  - **Push-to-talk voice input** — Hold a hotkey to speak, release to send
14
14
  - **Browser-first live transcription** — Realtime transcript while speaking, with server fallback
15
- - **Annotated screenshot context** — AI sees your current viewport with numbered interactive elements
15
+ - **DOM snapshot context** — AI sees a token-efficient representation of your visible page structure
16
16
  - **Voice responses** — Browser or server TTS, with optional streaming playback
17
17
  - **Cursor pointing** — AI can point at UI elements it references
18
18
  - **Voice interruption** — Start talking again to cut off current response
@@ -367,17 +367,15 @@ client.stopListening()
367
367
 
368
368
  1. User holds the hotkey
369
369
  2. Microphone captures audio, waveform shows audio level, and browser speech recognition starts when available
370
- 3. User releases hotkey
371
- 4. An annotated screenshot of the viewport is captured, with numbered markers on visible interactive elements, based on [agent-browser](https://github.com/vercel-labs/agent-browser) implementation.
370
+ 3. At the same time, a screenshot and token-efficient DOM snapshot of the viewport are captured in the background. This runs in parallel with speech capture to minimize latency.
371
+ 4. User releases hotkey
372
372
  5. The client prefers the browser transcript; if it is unavailable or empty in `auto` mode, the recorded audio is transcribed on the server
373
- 6. Screenshot + marker context are sent to the AI model
374
- 7. AI responds with text and can optionally call the `point` tool to indicate a location on screen:
375
- - `type: "marker"` with `markerId` for numbered interactive elements (most accurate)
376
- - `type: "coordinates"` with `x, y` pixel coordinates for anything without a marker
373
+ 6. The already-captured screenshot + DOM snapshot are sent to the AI model. Each element has an `@ID` (e.g., `@12`) that the AI can reference.
374
+ 7. AI responds with text and can optionally call the `point` tool to indicate an element on screen by its `@ID` from the DOM snapshot
377
375
  8. Response is spoken in the browser or on the server based on `speech.mode`,
378
- and can either wait for the full response or stream sentence-by-sentence
379
- based on `speech.allowStreaming`
380
- 9. If the AI calls the point tool, the cursor animates to the target location: markers resolve to live DOM elements, and coordinates map to viewport positions
376
+ and can either wait for the full response or stream sentence-by-sentence
377
+ based on `speech.allowStreaming`
378
+ 9. If the AI calls the point tool, the cursor animates to the target element's current position (it resolves the element from the snapshot registry and computes its center point)
381
379
  10. **If user presses hotkey again at any point, current response is interrupted**
382
380
 
383
381
  ## Security Best Practices
@@ -611,231 +611,199 @@ var PointerController = class {
611
611
  }
612
612
  };
613
613
  //#endregion
614
- //#region src/core/utils/annotations.ts
615
- const DEFAULT_STYLE = {
616
- borderColor: "rgba(255, 0, 0, 0.8)",
617
- labelBackground: "rgba(255, 0, 0, 0.9)",
618
- labelColor: "#ffffff",
619
- borderWidth: 2,
620
- fontSize: 15,
621
- labelPadding: 4
622
- };
623
- /**
624
- * Draw annotation markers onto a canvas.
625
- * Modifies the canvas in place.
626
- *
627
- * @param ctx Canvas 2D context to draw on
628
- * @param markers Marker map from element discovery
629
- * @param style Optional style overrides
630
- */
631
- function drawAnnotations(ctx, markers, style = {}) {
632
- const s = {
633
- ...DEFAULT_STYLE,
634
- ...style
635
- };
636
- ctx.save();
637
- for (const marker of markers.values()) {
638
- const { rect, id } = marker;
639
- ctx.strokeStyle = s.borderColor;
640
- ctx.lineWidth = s.borderWidth;
641
- ctx.strokeRect(rect.left, rect.top, rect.width, rect.height);
642
- const label = String(id);
643
- ctx.font = `bold ${s.fontSize}px monospace`;
644
- const textWidth = ctx.measureText(label).width;
645
- const textHeight = s.fontSize;
646
- const labelWidth = textWidth + s.labelPadding * 2;
647
- const labelHeight = textHeight + s.labelPadding;
648
- const labelX = rect.left - s.borderWidth;
649
- const labelY = rect.top < labelHeight + 4 ? rect.top + 2 : rect.top - labelHeight;
650
- ctx.fillStyle = s.labelBackground;
651
- ctx.beginPath();
652
- ctx.roundRect(labelX, labelY, labelWidth, labelHeight, 2);
653
- ctx.fill();
654
- ctx.fillStyle = s.labelColor;
655
- ctx.textBaseline = "top";
656
- ctx.fillText(label, labelX + s.labelPadding, labelY + s.labelPadding / 2);
657
- }
658
- ctx.restore();
659
- }
660
- /**
661
- * Create an annotated copy of a canvas.
662
- * Does not modify the original canvas.
663
- *
664
- * @param sourceCanvas Original screenshot canvas
665
- * @param markers Marker map from element discovery
666
- * @returns New canvas with annotations drawn
667
- */
668
- function createAnnotatedCanvas(sourceCanvas, markers) {
669
- const canvas = document.createElement("canvas");
670
- canvas.width = sourceCanvas.width;
671
- canvas.height = sourceCanvas.height;
672
- const ctx = canvas.getContext("2d");
673
- if (!ctx) throw new Error("Failed to get canvas 2D context");
674
- ctx.drawImage(sourceCanvas, 0, 0);
675
- drawAnnotations(ctx, markers);
676
- return canvas;
677
- }
678
- /**
679
- * Generate marker context string for AI prompt.
680
- * Lists available markers with their descriptions.
681
- *
682
- * @param markers Marker map from element discovery
683
- * @returns Formatted string listing markers
684
- */
685
- function generateMarkerContext(markers) {
686
- if (markers.size === 0) return "No interactive elements detected.";
687
- const lines = ["Interactive elements (use marker number to point):"];
688
- for (const marker of markers.values()) lines.push(` ${marker.id}: ${marker.description}`);
689
- return lines.join("\n");
690
- }
691
- //#endregion
692
- //#region src/core/utils/elements.ts
693
- /**
694
- * Element discovery for annotated screenshots.
695
- * Finds visible interactive elements and assigns marker IDs.
696
- */
697
- /** Max characters for element descriptions passed to the model. */
698
- const MAX_DESCRIPTION_LENGTH = 50;
699
- /** Pixels tolerance for grouping elements into the same visual row. */
700
- const ROW_TOLERANCE_PX = 20;
701
- /**
702
- * Interactive element selectors - elements users would want to click/interact with.
703
- * Mirrors accessibility roles from agent-browser but using CSS selectors.
704
- */
705
- const INTERACTIVE_SELECTORS = [
706
- "button",
707
- "[role=\"button\"]",
708
- "input[type=\"button\"]",
709
- "input[type=\"submit\"]",
710
- "input[type=\"reset\"]",
711
- "a[href]",
712
- "[role=\"link\"]",
713
- "input:not([type=\"hidden\"])",
714
- "textarea",
715
- "select",
716
- "[role=\"textbox\"]",
717
- "[role=\"searchbox\"]",
718
- "[role=\"combobox\"]",
719
- "[role=\"listbox\"]",
720
- "[role=\"slider\"]",
721
- "[role=\"spinbutton\"]",
722
- "[role=\"checkbox\"]",
723
- "[role=\"radio\"]",
724
- "[role=\"switch\"]",
725
- "[role=\"menuitem\"]",
726
- "[role=\"menuitemcheckbox\"]",
727
- "[role=\"menuitemradio\"]",
728
- "[role=\"option\"]",
729
- "[role=\"tab\"]",
730
- "[role=\"treeitem\"]",
731
- "video",
732
- "audio",
733
- "[data-cursor-buddy-interactive]"
614
+ //#region src/core/utils/dom-snapshot.ts
615
+ const EXCLUDED_TAGS = new Set([
616
+ "script",
617
+ "link",
618
+ "style",
619
+ "noscript",
620
+ "head"
621
+ ]);
622
+ const DEFAULT_INCLUDED_ATTRIBUTES = [
623
+ "id",
624
+ "name",
625
+ "type",
626
+ "placeholder",
627
+ "href",
628
+ "title",
629
+ "value",
630
+ "role"
734
631
  ];
735
- /**
736
- * Check if an element is visible in the viewport.
737
- */
738
- function isElementVisible(element, rect = element.getBoundingClientRect()) {
739
- if (rect.width <= 0 || rect.height <= 0) return false;
740
- if (rect.bottom < 0 || rect.top > window.innerHeight || rect.right < 0 || rect.left > window.innerWidth) return false;
741
- const style = window.getComputedStyle(element);
742
- if (style.visibility === "hidden" || style.display === "none") return false;
743
- if (Number.parseFloat(style.opacity) === 0) return false;
744
- return true;
745
- }
746
- function truncateDescription(value) {
747
- return value.slice(0, MAX_DESCRIPTION_LENGTH);
748
- }
749
- /**
750
- * Generate a brief description for an element.
751
- */
752
- function describeElement(element) {
753
- const tag = element.tagName.toLowerCase();
754
- const ariaLabel = element.getAttribute("aria-label");
755
- if (ariaLabel) return truncateDescription(ariaLabel);
756
- if (tag === "button" || tag === "a") {
757
- const text = element.textContent?.trim();
758
- if (text) return truncateDescription(text);
759
- }
760
- if (tag === "input" || tag === "textarea") {
761
- const placeholder = element.getAttribute("placeholder");
762
- if (placeholder) return truncateDescription(placeholder);
763
- return `${element.getAttribute("type") || "text"} input`;
764
- }
765
- if (tag === "img") {
766
- const alt = element.getAttribute("alt");
767
- if (alt) return truncateDescription(alt);
768
- return "image";
769
- }
770
- const role = element.getAttribute("role");
771
- if (role) return role;
772
- return tag;
773
- }
774
- function collectVisibleInteractiveElements() {
775
- const selector = INTERACTIVE_SELECTORS.join(",");
776
- const allElements = document.querySelectorAll(selector);
777
- const visible = [];
778
- for (const element of allElements) {
779
- const rect = element.getBoundingClientRect();
780
- if (!isElementVisible(element, rect)) continue;
781
- visible.push({
782
- element,
783
- rect
784
- });
632
+ function buildVisibleDomSnapshot(root, options = {}) {
633
+ const { maxTextLength = 80, maxNodes = 1500, includeRects = true, rootLabel = "viewport", includedAttributes = DEFAULT_INCLUDED_ATTRIBUTES } = options;
634
+ const doc = root instanceof Document ? root : root.ownerDocument || document;
635
+ const startRoot = root instanceof Document ? root.documentElement : root;
636
+ const win = doc.defaultView || window;
637
+ const viewportW = win.innerWidth || 0;
638
+ const viewportH = win.innerHeight || 0;
639
+ let nextId = 1;
640
+ let nodeCount = 0;
641
+ const idToElement = /* @__PURE__ */ new Map();
642
+ const lines = [`# ${rootLabel} ${viewportW}x${viewportH}`];
643
+ /**
644
+ * Returns true when the element is worth considering for the snapshot.
645
+ *
646
+ * This is intentionally simple:
647
+ * - skip excluded tags
648
+ * - skip hidden/display:none/visibility:hidden/etc
649
+ * - skip zero-size elements
650
+ * - skip elements fully outside the viewport
651
+ */
652
+ function isElementVisible(el) {
653
+ const tag = el.tagName.toLowerCase();
654
+ if (EXCLUDED_TAGS.has(tag)) return false;
655
+ if (!(el instanceof HTMLElement)) return false;
656
+ if (el.hidden) return false;
657
+ if (el.closest("head")) return false;
658
+ if (typeof el.checkVisibility === "function") try {
659
+ if (!el.checkVisibility({
660
+ opacityProperty: true,
661
+ visibilityProperty: true,
662
+ contentVisibilityAuto: true
663
+ })) return false;
664
+ } catch {}
665
+ const style = win.getComputedStyle(el);
666
+ if (style.display === "none") return false;
667
+ if (style.visibility === "hidden" || style.visibility === "collapse") return false;
668
+ if (style.opacity === "0") return false;
669
+ if (style.contentVisibility === "hidden") return false;
670
+ const rect = el.getBoundingClientRect();
671
+ if (rect.width <= 0 || rect.height <= 0) return false;
672
+ if (rect.bottom <= 0 || rect.right <= 0) return false;
673
+ if (rect.top >= viewportH || rect.left >= viewportW) return false;
674
+ return true;
785
675
  }
786
- visible.sort((a, b) => {
787
- const rowDiff = Math.floor(a.rect.top / ROW_TOLERANCE_PX) - Math.floor(b.rect.top / ROW_TOLERANCE_PX);
788
- if (rowDiff !== 0) return rowDiff;
789
- return a.rect.left - b.rect.left;
790
- });
791
- return visible;
792
- }
793
- /**
794
- * Create marker map from visible interactive elements.
795
- * Assigns sequential IDs starting from 1.
796
- */
797
- function createMarkerMap() {
798
- const elements = collectVisibleInteractiveElements();
799
- const map = /* @__PURE__ */ new Map();
800
- elements.forEach(({ element, rect }, index) => {
801
- const id = index + 1;
802
- map.set(id, {
676
+ /**
677
+ * Extracts a compact text representation from the element itself.
678
+ *
679
+ * No semantic guessing:
680
+ * - prefer innerText when available
681
+ * - otherwise fall back to textContent
682
+ * - normalize whitespace
683
+ * - truncate aggressively
684
+ */
685
+ function getElementText(el) {
686
+ const text = normalizeWhitespace(el.innerText || el.textContent || "");
687
+ if (!text) return "";
688
+ return truncate(text, maxTextLength);
689
+ }
690
+ /**
691
+ * Keeps only a small allowlist of raw DOM attributes.
692
+ *
693
+ * This avoids dumping the full attribute bag, which is usually noisy
694
+ * and expensive in tokens.
695
+ */
696
+ function getIncludedAttributes(el) {
697
+ const attrs = {};
698
+ for (const name of includedAttributes) {
699
+ const value = el.getAttribute(name);
700
+ if (value == null) continue;
701
+ const clean = truncate(normalizeWhitespace(value), maxTextLength);
702
+ if (!clean) continue;
703
+ attrs[name] = clean;
704
+ }
705
+ return attrs;
706
+ }
707
+ /**
708
+ * Rounds the client rect so the output is smaller and more stable.
709
+ */
710
+ function quantizeRect(el) {
711
+ const r = el.getBoundingClientRect();
712
+ return {
713
+ x: Math.max(0, Math.round(r.left)),
714
+ y: Math.max(0, Math.round(r.top)),
715
+ w: Math.round(r.width),
716
+ h: Math.round(r.height)
717
+ };
718
+ }
719
+ /**
720
+ * Decides whether this node should be emitted.
721
+ *
722
+ * Simple rule:
723
+ * - keep it if it has visible kept children
724
+ * - or keep it if it has some text
725
+ * - or keep it if it has at least one included attribute
726
+ *
727
+ * This allows non-semantic div-heavy UIs to survive without trying
728
+ * to guess intent.
729
+ */
730
+ function shouldKeepNode(text, attrs, children) {
731
+ if (children.length > 0) return true;
732
+ if (text.length > 0) return true;
733
+ if (Object.keys(attrs).length > 0) return true;
734
+ return false;
735
+ }
736
+ /**
737
+ * Single DFS traversal over the DOM.
738
+ *
739
+ * Complexity target:
740
+ * - O(N) DOM walk
741
+ * - O(1) bookkeeping per element, aside from browser layout/style calls and the innerText/textContent read, which scales with the size of the element's subtree
742
+ */
743
+ function walk(el) {
744
+ if (nodeCount >= maxNodes) return null;
745
+ if (!(el instanceof HTMLElement)) return null;
746
+ if (!isElementVisible(el)) return null;
747
+ const children = [];
748
+ for (const child of Array.from(el.children)) {
749
+ const childNode = walk(child);
750
+ if (childNode) children.push(childNode);
751
+ if (nodeCount >= maxNodes) break;
752
+ }
753
+ const text = getElementText(el);
754
+ const attrs = getIncludedAttributes(el);
755
+ if (!shouldKeepNode(text, attrs, children)) return null;
756
+ const id = nextId++;
757
+ nodeCount++;
758
+ idToElement.set(id, el);
759
+ return {
803
760
  id,
804
- element,
805
- rect,
806
- description: describeElement(element)
807
- });
808
- });
809
- return map;
810
- }
811
- /**
812
- * Get the center point of an element in viewport coordinates.
813
- */
814
- function getElementCenter(element) {
815
- const rect = element.getBoundingClientRect();
761
+ tag: el.tagName.toLowerCase(),
762
+ text,
763
+ attrs,
764
+ rect: includeRects ? quantizeRect(el) : void 0,
765
+ children
766
+ };
767
+ }
768
+ /**
769
+ * Emits the final compact line-based format.
770
+ *
771
+ * Example:
772
+ * @12 div "Settings" [id="settings"] [x=10 y=20 w=200 h=40]
773
+ */
774
+ function emit(node, depth) {
775
+ const parts = [`${" ".repeat(depth)}@${node.id} ${node.tag}`];
776
+ if (node.text) parts.push(`"${escapeQuotes(node.text)}"`);
777
+ for (const [key, value] of Object.entries(node.attrs)) parts.push(`[${key}="${escapeQuotes(value)}"]`);
778
+ if (node.rect) parts.push(`[x=${node.rect.x} y=${node.rect.y} w=${node.rect.w} h=${node.rect.h}]`);
779
+ lines.push(parts.join(" "));
780
+ for (const child of node.children) emit(child, depth + 1);
781
+ }
782
+ const tree = walk(startRoot);
783
+ if (tree) emit(tree, 0);
816
784
  return {
817
- x: Math.round(rect.left + rect.width / 2),
818
- y: Math.round(rect.top + rect.height / 2)
785
+ text: lines.join("\n"),
786
+ idToElement,
787
+ nodeCount
819
788
  };
820
789
  }
821
- /**
822
- * Resolve a marker ID to viewport coordinates.
823
- * Returns null if marker not found or element no longer visible.
824
- */
825
- function resolveMarkerToCoordinates(markerMap, markerId) {
826
- const marker = markerMap.get(markerId);
827
- if (!marker) return null;
828
- if (!document.contains(marker.element)) return null;
829
- if (!isElementVisible(marker.element)) return null;
830
- return getElementCenter(marker.element);
790
+ function normalizeWhitespace(text) {
791
+ return text.replace(/\s+/g, " ").trim();
792
+ }
793
+ function truncate(text, maxLength) {
794
+ if (text.length <= maxLength) return text;
795
+ return text.slice(0, maxLength - 1).trimEnd() + "…";
796
+ }
797
+ function escapeQuotes(text) {
798
+ return text.replace(/"/g, "\\\"");
831
799
  }
832
800
  //#endregion
833
801
  //#region src/core/utils/screenshot.ts
834
802
  const CLONE_RESOURCE_TIMEOUT_MS = 3e3;
835
803
  /** Maximum width for compressed screenshots (maintains aspect ratio) */
836
- const MAX_SCREENSHOT_WIDTH = 1280;
837
- /** JPEG quality for compressed screenshots (0-1) */
838
- const JPEG_QUALITY = .8;
804
+ const MAX_SCREENSHOT_WIDTH = 1920;
805
+ /** JPEG quality for compressed screenshots (0-1) - higher quality for clearer details */
806
+ const JPEG_QUALITY = .95;
839
807
  /**
840
808
  * Compress a canvas image by downscaling and converting to JPEG.
841
809
  * Maintains aspect ratio and falls back to original if compression fails.
@@ -971,9 +939,10 @@ function createFallbackCanvas() {
971
939
  return canvas;
972
940
  }
973
941
  /**
974
- * Capture a screenshot of the current viewport.
975
- * Uses html2canvas to render the DOM to a canvas, then compresses to JPEG.
976
- * Falls back to a placeholder if capture fails (e.g., due to unsupported CSS).
942
+ * Capture a screenshot and DOM snapshot of the current viewport.
943
+ * Uses html2canvas to render the DOM to a canvas, compresses to high-quality JPEG,
944
+ * and builds a token-efficient DOM snapshot for AI context.
945
+ * Falls back to a placeholder if capture fails.
977
946
  */
978
947
  async function captureViewport() {
979
948
  const captureMetrics = getCaptureMetrics();
@@ -993,48 +962,19 @@ async function captureViewport() {
993
962
  height: canvas.height
994
963
  };
995
964
  }
996
- return {
997
- imageData: compressed.imageData,
998
- width: compressed.width,
999
- height: compressed.height,
1000
- viewportWidth: captureMetrics.viewportWidth,
1001
- viewportHeight: captureMetrics.viewportHeight
1002
- };
1003
- }
1004
- /**
1005
- * Capture an annotated screenshot of the current viewport.
1006
- * Interactive elements are marked with numbered labels.
1007
- * Returns both the annotated image and a marker map for resolving IDs.
1008
- */
1009
- async function captureAnnotatedViewport() {
1010
- const captureMetrics = getCaptureMetrics();
1011
- const markerMap = createMarkerMap();
1012
- let sourceCanvas;
1013
- try {
1014
- sourceCanvas = await html2canvas(document.body, getHtml2CanvasOptions(captureMetrics));
1015
- } catch {
1016
- sourceCanvas = createFallbackCanvas();
1017
- }
1018
- const canvas = markerMap.size > 0 ? createAnnotatedCanvas(sourceCanvas, markerMap) : sourceCanvas;
1019
- const markerContext = generateMarkerContext(markerMap);
1020
- let compressed;
1021
- try {
1022
- compressed = compressImage(canvas);
1023
- } catch {
1024
- compressed = {
1025
- imageData: canvas.toDataURL("image/png"),
1026
- width: canvas.width,
1027
- height: canvas.height
1028
- };
1029
- }
965
+ const snapshot = buildVisibleDomSnapshot(document.body, {
966
+ maxNodes: 1500,
967
+ maxTextLength: 80,
968
+ includeRects: true
969
+ });
1030
970
  return {
1031
971
  imageData: compressed.imageData,
1032
972
  width: compressed.width,
1033
973
  height: compressed.height,
1034
974
  viewportWidth: captureMetrics.viewportWidth,
1035
975
  viewportHeight: captureMetrics.viewportHeight,
1036
- markerMap,
1037
- markerContext
976
+ domSnapshot: snapshot.text,
977
+ elementRegistry: snapshot.idToElement
1038
978
  };
1039
979
  }
1040
980
  //#endregion
@@ -1044,20 +984,12 @@ async function captureAnnotatedViewport() {
1044
984
  */
1045
985
  var ScreenCaptureService = class {
1046
986
  /**
1047
- * Capture a screenshot of the current viewport.
1048
- * @returns Screenshot result with image data and dimensions
987
+ * Capture a screenshot and DOM snapshot of the current viewport.
988
+ * @returns Screenshot result with image data, dimensions, and DOM snapshot
1049
989
  */
1050
990
  async capture() {
1051
991
  return captureViewport();
1052
992
  }
1053
- /**
1054
- * Capture an annotated screenshot with marker overlays.
1055
- * Interactive elements are marked with numbered labels.
1056
- * @returns Annotated screenshot result with marker map
1057
- */
1058
- async captureAnnotated() {
1059
- return captureAnnotatedViewport();
1060
- }
1061
993
  };
1062
994
  //#endregion
1063
995
  //#region src/core/services/tts-playback-queue.ts
@@ -1300,12 +1232,12 @@ const AUDIO_LEVEL_NOISE_GATE = 5e-4;
1300
1232
  const AUDIO_LEVEL_INPUT_GAIN = 600;
1301
1233
  const AUDIO_LEVEL_ATTACK = .7;
1302
1234
  const AUDIO_LEVEL_RELEASE = .25;
1303
- function clamp$1(value, min, max) {
1235
+ function clamp(value, min, max) {
1304
1236
  return Math.min(Math.max(value, min), max);
1305
1237
  }
1306
1238
  function normalizeAudioLevel(rms) {
1307
1239
  const gatedRms = Math.max(0, rms - AUDIO_LEVEL_NOISE_GATE);
1308
- return clamp$1(Math.log1p(gatedRms * AUDIO_LEVEL_INPUT_GAIN) / Math.log1p(AUDIO_LEVEL_INPUT_GAIN), 0, 1);
1240
+ return clamp(Math.log1p(gatedRms * AUDIO_LEVEL_INPUT_GAIN) / Math.log1p(AUDIO_LEVEL_INPUT_GAIN), 0, 1);
1309
1241
  }
1310
1242
  function smoothAudioLevel(current, target) {
1311
1243
  const smoothing = target > current ? AUDIO_LEVEL_ATTACK : AUDIO_LEVEL_RELEASE;
@@ -1548,7 +1480,7 @@ function parseUIStreamLine(line) {
1548
1480
  * Check if a tool call is a point tool call with valid input.
1549
1481
  */
1550
1482
  function isPointToolCall(chunk) {
1551
- return chunk.type === "tool-input-available" && chunk.toolName === "point" && chunk.input != null && typeof chunk.input === "object" && "type" in chunk.input && "label" in chunk.input;
1483
+ return chunk.type === "tool-input-available" && chunk.toolName === "point" && chunk.input != null && typeof chunk.input === "object" && "elementId" in chunk.input && "label" in chunk.input;
1552
1484
  }
1553
1485
  //#endregion
1554
1486
  //#region src/core/utils/response-processor.ts
@@ -1691,9 +1623,6 @@ var ProgressiveResponseProcessor = class {
1691
1623
  };
1692
1624
  //#endregion
1693
1625
  //#region src/core/client.ts
1694
- function clamp(value, min, max) {
1695
- return Math.min(Math.max(value, min), max);
1696
- }
1697
1626
  async function readErrorMessage(response, fallbackMessage) {
1698
1627
  try {
1699
1628
  if ((response.headers.get("Content-Type") ?? "").includes("application/json")) {
@@ -1706,21 +1635,6 @@ async function readErrorMessage(response, fallbackMessage) {
1706
1635
  return fallbackMessage;
1707
1636
  }
1708
1637
  /**
1709
- * Map coordinate-based pointing from screenshot space to viewport space.
1710
- */
1711
- function mapCoordinatesToViewport(x, y, screenshot) {
1712
- if (screenshot.width <= 0 || screenshot.height <= 0) return {
1713
- x,
1714
- y
1715
- };
1716
- const scaleX = screenshot.viewportWidth / screenshot.width;
1717
- const scaleY = screenshot.viewportHeight / screenshot.height;
1718
- return {
1719
- x: clamp(Math.round(x * scaleX), 0, Math.max(screenshot.viewportWidth - 1, 0)),
1720
- y: clamp(Math.round(y * scaleY), 0, Math.max(screenshot.viewportHeight - 1, 0))
1721
- };
1722
- }
1723
- /**
1724
1638
  * Framework-agnostic client for cursor buddy voice interactions.
1725
1639
  *
1726
1640
  * Manages the complete voice interaction flow:
@@ -1789,7 +1703,7 @@ var CursorBuddyClient = class {
1789
1703
  this.notify();
1790
1704
  this.abortController = new AbortController();
1791
1705
  const signal = this.abortController.signal;
1792
- this.screenshotPromise = this.screenCapture.captureAnnotated();
1706
+ this.screenshotPromise = this.screenCapture.capture();
1793
1707
  this.beginListeningSession(signal).catch((error) => {
1794
1708
  if (signal.aborted) return;
1795
1709
  this.voiceCapture.dispose();
@@ -1842,16 +1756,17 @@ var CursorBuddyClient = class {
1842
1756
  if (signal?.aborted) return;
1843
1757
  this.options.onResponse?.(cleanResponse);
1844
1758
  let pointTarget = null;
1845
- if (pointToolCall) if (pointToolCall.type === "marker") {
1846
- const coords = resolveMarkerToCoordinates(screenshot.markerMap, pointToolCall.markerId);
1847
- if (coords) pointTarget = {
1848
- ...coords,
1849
- label: pointToolCall.label
1850
- };
1851
- } else pointTarget = {
1852
- ...mapCoordinatesToViewport(pointToolCall.x, pointToolCall.y, screenshot),
1853
- label: pointToolCall.label
1854
- };
1759
+ if (pointToolCall) {
1760
+ const element = screenshot.elementRegistry.get(pointToolCall.elementId);
1761
+ if (element) {
1762
+ const rect = element.getBoundingClientRect();
1763
+ pointTarget = {
1764
+ x: Math.round(rect.left + rect.width / 2),
1765
+ y: Math.round(rect.top + rect.height / 2),
1766
+ label: pointToolCall.label
1767
+ };
1768
+ }
1769
+ }
1855
1770
  if (pointTarget) {
1856
1771
  this.options.onPoint?.(pointTarget);
1857
1772
  this.pointerController.pointAt(pointTarget);
@@ -2017,7 +1932,7 @@ var CursorBuddyClient = class {
2017
1932
  },
2018
1933
  transcript,
2019
1934
  history,
2020
- markerContext: screenshot.markerContext
1935
+ domSnapshot: screenshot.domSnapshot
2021
1936
  }),
2022
1937
  signal
2023
1938
  });
@@ -2251,4 +2166,4 @@ var CursorBuddyClient = class {
2251
2166
  //#endregion
2252
2167
  export { $buddyScale as a, $buddyRotation as i, $audioLevel as n, $cursorPosition as o, $buddyPosition as r, $pointingTarget as s, CursorBuddyClient as t };
2253
2168
 
2254
- //# sourceMappingURL=client-CSVSY-KV.mjs.map
2169
+ //# sourceMappingURL=client-CliXcNch.mjs.map