cursor-buddy 0.0.9 → 0.0.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -10
- package/dist/{client-CSVSY-KV.mjs → client-CliXcNch.mjs} +212 -297
- package/dist/client-CliXcNch.mjs.map +1 -0
- package/dist/{client-Ba6rv-du.d.mts → client-sjVVGYPU.d.mts} +6 -35
- package/dist/client-sjVVGYPU.d.mts.map +1 -0
- package/dist/index.d.mts +2 -2
- package/dist/index.mjs +2 -2
- package/dist/point-tool-DZJmhD8e.mjs +16 -0
- package/dist/point-tool-DZJmhD8e.mjs.map +1 -0
- package/dist/point-tool-l3FewgM9.d.mts +22 -0
- package/dist/point-tool-l3FewgM9.d.mts.map +1 -0
- package/dist/react/index.d.mts +1 -1
- package/dist/react/index.mjs +1 -1
- package/dist/server/index.d.mts +2 -2
- package/dist/server/index.mjs +70 -71
- package/dist/server/index.mjs.map +1 -1
- package/package.json +1 -1
- package/dist/client-Ba6rv-du.d.mts.map +0 -1
- package/dist/client-CSVSY-KV.mjs.map +0 -1
- package/dist/point-tool-Cv39qylv.mjs +0 -54
- package/dist/point-tool-Cv39qylv.mjs.map +0 -1
- package/dist/point-tool-kIviMn1q.d.mts +0 -46
- package/dist/point-tool-kIviMn1q.d.mts.map +0 -1
package/README.md
CHANGED
|
@@ -12,7 +12,7 @@ Customize its prompt, pass custom tools, choose between browser or server-side s
|
|
|
12
12
|
|
|
13
13
|
- **Push-to-talk voice input** — Hold a hotkey to speak, release to send
|
|
14
14
|
- **Browser-first live transcription** — Realtime transcript while speaking, with server fallback
|
|
15
|
-
- **
|
|
15
|
+
- **DOM snapshot context** — AI sees a token-efficient representation of your visible page structure
|
|
16
16
|
- **Voice responses** — Browser or server TTS, with optional streaming playback
|
|
17
17
|
- **Cursor pointing** — AI can point at UI elements it references
|
|
18
18
|
- **Voice interruption** — Start talking again to cut off current response
|
|
@@ -367,17 +367,15 @@ client.stopListening()
|
|
|
367
367
|
|
|
368
368
|
1. User holds the hotkey
|
|
369
369
|
2. Microphone captures audio, waveform shows audio level, and browser speech recognition starts when available
|
|
370
|
-
3.
|
|
371
|
-
4.
|
|
370
|
+
3. At the same time, a screenshot and a token-efficient DOM snapshot of the viewport are captured in the background. This runs in parallel with speech capture to minimize latency.
|
|
371
|
+
4. User releases hotkey
|
|
372
372
|
5. The client prefers the browser transcript; if it is unavailable or empty in `auto` mode, the recorded audio is transcribed on the server
|
|
373
|
-
6.
|
|
374
|
-
7. AI responds with text and can optionally call the `point` tool to indicate
|
|
375
|
-
- `type: "marker"` with `markerId` for numbered interactive elements (most accurate)
|
|
376
|
-
- `type: "coordinates"` with `x, y` pixel coordinates for anything without a marker
|
|
373
|
+
6. The already-captured screenshot and DOM snapshot are sent to the AI model. Each element in the snapshot has an `@ID` (e.g., `@12`) that the AI can reference.
|
|
374
|
+
7. AI responds with text and can optionally call the `point` tool to indicate an element on screen by its `@ID` from the DOM snapshot
|
|
377
375
|
8. Response is spoken in the browser or on the server based on `speech.mode`,
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
9. If the AI calls the point tool, the cursor animates to the target
|
|
376
|
+
and can either wait for the full response or stream sentence-by-sentence
|
|
377
|
+
based on `speech.allowStreaming`
|
|
378
|
+
9. If the AI calls the point tool, the cursor animates to the target element's current position (it resolves the element from the snapshot registry and computes its center point)
|
|
381
379
|
10. **If user presses hotkey again at any point, current response is interrupted**
|
|
382
380
|
|
|
383
381
|
## Security Best Practices
|
|
@@ -611,231 +611,199 @@ var PointerController = class {
|
|
|
611
611
|
}
|
|
612
612
|
};
|
|
613
613
|
//#endregion
|
|
614
|
-
//#region src/core/utils/
|
|
615
|
-
const
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
function drawAnnotations(ctx, markers, style = {}) {
|
|
632
|
-
const s = {
|
|
633
|
-
...DEFAULT_STYLE,
|
|
634
|
-
...style
|
|
635
|
-
};
|
|
636
|
-
ctx.save();
|
|
637
|
-
for (const marker of markers.values()) {
|
|
638
|
-
const { rect, id } = marker;
|
|
639
|
-
ctx.strokeStyle = s.borderColor;
|
|
640
|
-
ctx.lineWidth = s.borderWidth;
|
|
641
|
-
ctx.strokeRect(rect.left, rect.top, rect.width, rect.height);
|
|
642
|
-
const label = String(id);
|
|
643
|
-
ctx.font = `bold ${s.fontSize}px monospace`;
|
|
644
|
-
const textWidth = ctx.measureText(label).width;
|
|
645
|
-
const textHeight = s.fontSize;
|
|
646
|
-
const labelWidth = textWidth + s.labelPadding * 2;
|
|
647
|
-
const labelHeight = textHeight + s.labelPadding;
|
|
648
|
-
const labelX = rect.left - s.borderWidth;
|
|
649
|
-
const labelY = rect.top < labelHeight + 4 ? rect.top + 2 : rect.top - labelHeight;
|
|
650
|
-
ctx.fillStyle = s.labelBackground;
|
|
651
|
-
ctx.beginPath();
|
|
652
|
-
ctx.roundRect(labelX, labelY, labelWidth, labelHeight, 2);
|
|
653
|
-
ctx.fill();
|
|
654
|
-
ctx.fillStyle = s.labelColor;
|
|
655
|
-
ctx.textBaseline = "top";
|
|
656
|
-
ctx.fillText(label, labelX + s.labelPadding, labelY + s.labelPadding / 2);
|
|
657
|
-
}
|
|
658
|
-
ctx.restore();
|
|
659
|
-
}
|
|
660
|
-
/**
|
|
661
|
-
* Create an annotated copy of a canvas.
|
|
662
|
-
* Does not modify the original canvas.
|
|
663
|
-
*
|
|
664
|
-
* @param sourceCanvas Original screenshot canvas
|
|
665
|
-
* @param markers Marker map from element discovery
|
|
666
|
-
* @returns New canvas with annotations drawn
|
|
667
|
-
*/
|
|
668
|
-
function createAnnotatedCanvas(sourceCanvas, markers) {
|
|
669
|
-
const canvas = document.createElement("canvas");
|
|
670
|
-
canvas.width = sourceCanvas.width;
|
|
671
|
-
canvas.height = sourceCanvas.height;
|
|
672
|
-
const ctx = canvas.getContext("2d");
|
|
673
|
-
if (!ctx) throw new Error("Failed to get canvas 2D context");
|
|
674
|
-
ctx.drawImage(sourceCanvas, 0, 0);
|
|
675
|
-
drawAnnotations(ctx, markers);
|
|
676
|
-
return canvas;
|
|
677
|
-
}
|
|
678
|
-
/**
|
|
679
|
-
* Generate marker context string for AI prompt.
|
|
680
|
-
* Lists available markers with their descriptions.
|
|
681
|
-
*
|
|
682
|
-
* @param markers Marker map from element discovery
|
|
683
|
-
* @returns Formatted string listing markers
|
|
684
|
-
*/
|
|
685
|
-
function generateMarkerContext(markers) {
|
|
686
|
-
if (markers.size === 0) return "No interactive elements detected.";
|
|
687
|
-
const lines = ["Interactive elements (use marker number to point):"];
|
|
688
|
-
for (const marker of markers.values()) lines.push(` ${marker.id}: ${marker.description}`);
|
|
689
|
-
return lines.join("\n");
|
|
690
|
-
}
|
|
691
|
-
//#endregion
|
|
692
|
-
//#region src/core/utils/elements.ts
|
|
693
|
-
/**
|
|
694
|
-
* Element discovery for annotated screenshots.
|
|
695
|
-
* Finds visible interactive elements and assigns marker IDs.
|
|
696
|
-
*/
|
|
697
|
-
/** Max characters for element descriptions passed to the model. */
|
|
698
|
-
const MAX_DESCRIPTION_LENGTH = 50;
|
|
699
|
-
/** Pixels tolerance for grouping elements into the same visual row. */
|
|
700
|
-
const ROW_TOLERANCE_PX = 20;
|
|
701
|
-
/**
|
|
702
|
-
* Interactive element selectors - elements users would want to click/interact with.
|
|
703
|
-
* Mirrors accessibility roles from agent-browser but using CSS selectors.
|
|
704
|
-
*/
|
|
705
|
-
const INTERACTIVE_SELECTORS = [
|
|
706
|
-
"button",
|
|
707
|
-
"[role=\"button\"]",
|
|
708
|
-
"input[type=\"button\"]",
|
|
709
|
-
"input[type=\"submit\"]",
|
|
710
|
-
"input[type=\"reset\"]",
|
|
711
|
-
"a[href]",
|
|
712
|
-
"[role=\"link\"]",
|
|
713
|
-
"input:not([type=\"hidden\"])",
|
|
714
|
-
"textarea",
|
|
715
|
-
"select",
|
|
716
|
-
"[role=\"textbox\"]",
|
|
717
|
-
"[role=\"searchbox\"]",
|
|
718
|
-
"[role=\"combobox\"]",
|
|
719
|
-
"[role=\"listbox\"]",
|
|
720
|
-
"[role=\"slider\"]",
|
|
721
|
-
"[role=\"spinbutton\"]",
|
|
722
|
-
"[role=\"checkbox\"]",
|
|
723
|
-
"[role=\"radio\"]",
|
|
724
|
-
"[role=\"switch\"]",
|
|
725
|
-
"[role=\"menuitem\"]",
|
|
726
|
-
"[role=\"menuitemcheckbox\"]",
|
|
727
|
-
"[role=\"menuitemradio\"]",
|
|
728
|
-
"[role=\"option\"]",
|
|
729
|
-
"[role=\"tab\"]",
|
|
730
|
-
"[role=\"treeitem\"]",
|
|
731
|
-
"video",
|
|
732
|
-
"audio",
|
|
733
|
-
"[data-cursor-buddy-interactive]"
|
|
614
|
+
//#region src/core/utils/dom-snapshot.ts
|
|
615
|
+
/** Tags that never contribute visible content and are skipped outright. */
const EXCLUDED_TAGS = new Set([
  "script",
  "link",
  "style",
  "noscript",
  "head"
]);
/** Attribute allowlist used when the caller does not pass `includedAttributes`. */
const DEFAULT_INCLUDED_ATTRIBUTES = [
  "id",
  "name",
  "type",
  "placeholder",
  "href",
  "title",
  "value",
  "role"
];
/**
 * Build a compact, line-based text snapshot of the visible DOM under `root`.
 *
 * Every emitted element gets a numeric id (rendered as `@<id>`) that is also
 * recorded in the returned `idToElement` map, so a caller can later resolve an
 * id back to the live element.
 *
 * @param root Document or element to start from. It may belong to a different
 *   document/window than the running script (e.g. an iframe); the owning
 *   window is used for viewport size, computed styles and element checks.
 * @param options Optional settings:
 *   - `maxTextLength` (default 80): max characters kept for text and attribute values
 *   - `maxNodes` (default 1500): soft cap on emitted nodes (see `walk`)
 *   - `includeRects` (default true): append `[x= y= w= h=]` to each line
 *   - `rootLabel` (default "viewport"): label used in the header line
 *   - `includedAttributes`: attribute allowlist (default `DEFAULT_INCLUDED_ATTRIBUTES`)
 * @returns `{ text, idToElement, nodeCount }` where `text` is the header line
 *   followed by one line per emitted element.
 */
function buildVisibleDomSnapshot(root, options = {}) {
  const { maxTextLength = 80, maxNodes = 1500, includeRects = true, rootLabel = "viewport", includedAttributes = DEFAULT_INCLUDED_ATTRIBUTES } = options;
  // Detect documents by nodeType instead of `instanceof Document`: the latter
  // compares against this realm's constructor and is false for a document
  // that lives in another window.
  const DOCUMENT_NODE = 9;
  const rootIsDocument = root.nodeType === DOCUMENT_NODE;
  const doc = rootIsDocument ? root : root.ownerDocument || document;
  const startRoot = rootIsDocument ? root.documentElement : root;
  const win = doc.defaultView || window;
  // Same reasoning for elements: check against the owning window's HTMLElement.
  const HTMLElementCtor = win.HTMLElement || HTMLElement;
  const viewportW = win.innerWidth || 0;
  const viewportH = win.innerHeight || 0;
  let nextId = 1;
  let nodeCount = 0;
  const idToElement = /* @__PURE__ */ new Map();
  const lines = [`# ${rootLabel} ${viewportW}x${viewportH}`];
  /**
   * Returns true when the element is worth considering for the snapshot.
   *
   * Skips, in order: excluded tags, non-HTML elements, `hidden` elements,
   * anything inside `<head>`, elements the browser reports as not visible,
   * elements hidden through computed style, zero-size elements and elements
   * fully outside the viewport.
   */
  function isElementVisible(el) {
    const tag = el.tagName.toLowerCase();
    if (EXCLUDED_TAGS.has(tag)) return false;
    if (!(el instanceof HTMLElementCtor)) return false;
    if (el.hidden) return false;
    if (el.closest("head")) return false;
    if (typeof el.checkVisibility === "function") {
      try {
        if (!el.checkVisibility({
          opacityProperty: true,
          visibilityProperty: true,
          contentVisibilityAuto: true
        })) return false;
      } catch {
        // Best effort only: if the call throws, rely on the manual
        // computed-style checks below.
      }
    }
    const style = win.getComputedStyle(el);
    if (style.display === "none") return false;
    if (style.visibility === "hidden" || style.visibility === "collapse") return false;
    if (style.opacity === "0") return false;
    if (style.contentVisibility === "hidden") return false;
    const rect = el.getBoundingClientRect();
    if (rect.width <= 0 || rect.height <= 0) return false;
    if (rect.bottom <= 0 || rect.right <= 0) return false;
    if (rect.top >= viewportH || rect.left >= viewportW) return false;
    return true;
  }
  /**
   * Extracts a compact text representation of the element: `innerText` when
   * non-empty, otherwise `textContent`, whitespace-normalized and truncated
   * to `maxTextLength`.
   */
  function getElementText(el) {
    const text = normalizeWhitespace(el.innerText || el.textContent || "");
    if (!text) return "";
    return truncate(text, maxTextLength);
  }
  /**
   * Collects the allowlisted attributes that are present and non-empty after
   * whitespace normalization; values are truncated to `maxTextLength`.
   */
  function getIncludedAttributes(el) {
    const attrs = {};
    for (const name of includedAttributes) {
      const value = el.getAttribute(name);
      if (value == null) continue;
      const clean = truncate(normalizeWhitespace(value), maxTextLength);
      if (!clean) continue;
      attrs[name] = clean;
    }
    return attrs;
  }
  /**
   * Rounds the client rect so the output is smaller and more stable.
   * `x`/`y` are clamped to be non-negative; `w`/`h` are the rounded full size.
   */
  function quantizeRect(el) {
    const r = el.getBoundingClientRect();
    return {
      x: Math.max(0, Math.round(r.left)),
      y: Math.max(0, Math.round(r.top)),
      w: Math.round(r.width),
      h: Math.round(r.height)
    };
  }
  /**
   * A node is emitted when it has kept children, some text, or at least one
   * included attribute.
   */
  function shouldKeepNode(text, attrs, children) {
    if (children.length > 0) return true;
    if (text.length > 0) return true;
    if (Object.keys(attrs).length > 0) return true;
    return false;
  }
  /**
   * Depth-first traversal that visits each element at most once.
   *
   * Ids are assigned after an element's children have been walked, so a
   * child always has a smaller id than its parent. Because the counter is
   * also bumped after the children, ancestors of already-kept nodes are
   * still emitted once `maxNodes` is reached; `nodeCount` can therefore end
   * up slightly above `maxNodes`.
   *
   * Per-element cost is dominated by browser calls (`getComputedStyle`,
   * `getBoundingClientRect`, `innerText`), which are not constant-time.
   */
  function walk(el) {
    if (nodeCount >= maxNodes) return null;
    if (!(el instanceof HTMLElementCtor)) return null;
    if (!isElementVisible(el)) return null;
    const children = [];
    for (const child of Array.from(el.children)) {
      const childNode = walk(child);
      if (childNode) children.push(childNode);
      if (nodeCount >= maxNodes) break;
    }
    const text = getElementText(el);
    const attrs = getIncludedAttributes(el);
    if (!shouldKeepNode(text, attrs, children)) return null;
    const id = nextId++;
    nodeCount++;
    idToElement.set(id, el);
    return {
      id,
      tag: el.tagName.toLowerCase(),
      text,
      attrs,
      rect: includeRects ? quantizeRect(el) : void 0,
      children
    };
  }
  /**
   * Emits one line per node, indented by one space per depth level.
   *
   * Example:
   * @12 div "Settings" [id="settings"] [x=10 y=20 w=200 h=40]
   */
  function emit(node, depth) {
    const parts = [`${" ".repeat(depth)}@${node.id} ${node.tag}`];
    if (node.text) parts.push(`"${escapeQuotes(node.text)}"`);
    for (const [key, value] of Object.entries(node.attrs)) parts.push(`[${key}="${escapeQuotes(value)}"]`);
    if (node.rect) parts.push(`[x=${node.rect.x} y=${node.rect.y} w=${node.rect.w} h=${node.rect.h}]`);
    lines.push(parts.join(" "));
    for (const child of node.children) emit(child, depth + 1);
  }
  const tree = walk(startRoot);
  if (tree) emit(tree, 0);
  return {
    text: lines.join("\n"),
    idToElement,
    nodeCount
  };
}
|
|
821
|
-
|
|
822
|
-
|
|
823
|
-
|
|
824
|
-
|
|
825
|
-
|
|
826
|
-
|
|
827
|
-
|
|
828
|
-
|
|
829
|
-
|
|
830
|
-
return getElementCenter(marker.element);
|
|
790
|
+
/**
 * Collapse every run of whitespace into a single space and drop any
 * leading/trailing whitespace.
 *
 * @param text String to normalize
 * @returns The normalized string ("" when `text` is empty or all whitespace)
 */
function normalizeWhitespace(text) {
  const words = text.split(/\s+/).filter((word) => word.length > 0);
  return words.join(" ");
}
|
|
793
|
+
/**
 * Shorten `text` to at most `maxLength` UTF-16 code units, ending a shortened
 * result with "…".
 *
 * @param text String to shorten
 * @param maxLength Maximum length of the result, ellipsis included
 * @returns `text` unchanged when it already fits; "" when nothing can fit
 *   (`maxLength <= 0`); otherwise the trimmed prefix followed by "…"
 */
function truncate(text, maxLength) {
  if (text.length <= maxLength) return text;
  // A non-positive limit would turn into a negative slice index, which counts
  // from the end of the string and returns almost all of it.
  if (maxLength <= 0) return "";
  let end = maxLength - 1;
  // Do not cut between the two halves of a surrogate pair; that would leave a
  // lone high surrogate in the output.
  if (end > 0) {
    const lastUnit = text.charCodeAt(end - 1);
    if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) end -= 1;
  }
  return text.slice(0, end).trimEnd() + "…";
}
|
|
797
|
+
/**
 * Escape `text` so it can be placed between double quotes.
 *
 * Backslashes are escaped together with double quotes; otherwise a literal
 * backslash in front of a quote (or at the very end of the text) would make
 * the quoted token ambiguous to whoever reads it.
 *
 * @param text Raw text
 * @returns Text with every `\` and `"` prefixed by a backslash
 */
function escapeQuotes(text) {
  return text.replace(/[\\"]/g, "\\$&");
}
|
|
832
800
|
//#endregion
|
|
833
801
|
//#region src/core/utils/screenshot.ts
|
|
834
802
|
const CLONE_RESOURCE_TIMEOUT_MS = 3e3;
|
|
835
803
|
/** Maximum width for compressed screenshots (maintains aspect ratio) */
|
|
836
|
-
const MAX_SCREENSHOT_WIDTH =
|
|
837
|
-
/** JPEG quality for compressed screenshots (0-1) */
|
|
838
|
-
const JPEG_QUALITY = .
|
|
804
|
+
const MAX_SCREENSHOT_WIDTH = 1920;
|
|
805
|
+
/** JPEG quality for compressed screenshots (0-1) - higher quality for clearer details */
|
|
806
|
+
const JPEG_QUALITY = .95;
|
|
839
807
|
/**
|
|
840
808
|
* Compress a canvas image by downscaling and converting to JPEG.
|
|
841
809
|
* Maintains aspect ratio and falls back to original if compression fails.
|
|
@@ -971,9 +939,10 @@ function createFallbackCanvas() {
|
|
|
971
939
|
return canvas;
|
|
972
940
|
}
|
|
973
941
|
/**
|
|
974
|
-
* Capture a screenshot of the current viewport.
|
|
975
|
-
* Uses html2canvas to render the DOM to a canvas,
|
|
976
|
-
*
|
|
942
|
+
* Capture a screenshot and DOM snapshot of the current viewport.
|
|
943
|
+
* Uses html2canvas to render the DOM to a canvas, compresses to high-quality JPEG,
|
|
944
|
+
* and builds a token-efficient DOM snapshot for AI context.
|
|
945
|
+
* Falls back to a placeholder if capture fails.
|
|
977
946
|
*/
|
|
978
947
|
async function captureViewport() {
|
|
979
948
|
const captureMetrics = getCaptureMetrics();
|
|
@@ -993,48 +962,19 @@ async function captureViewport() {
|
|
|
993
962
|
height: canvas.height
|
|
994
963
|
};
|
|
995
964
|
}
|
|
996
|
-
|
|
997
|
-
|
|
998
|
-
|
|
999
|
-
|
|
1000
|
-
|
|
1001
|
-
viewportHeight: captureMetrics.viewportHeight
|
|
1002
|
-
};
|
|
1003
|
-
}
|
|
1004
|
-
/**
|
|
1005
|
-
* Capture an annotated screenshot of the current viewport.
|
|
1006
|
-
* Interactive elements are marked with numbered labels.
|
|
1007
|
-
* Returns both the annotated image and a marker map for resolving IDs.
|
|
1008
|
-
*/
|
|
1009
|
-
async function captureAnnotatedViewport() {
|
|
1010
|
-
const captureMetrics = getCaptureMetrics();
|
|
1011
|
-
const markerMap = createMarkerMap();
|
|
1012
|
-
let sourceCanvas;
|
|
1013
|
-
try {
|
|
1014
|
-
sourceCanvas = await html2canvas(document.body, getHtml2CanvasOptions(captureMetrics));
|
|
1015
|
-
} catch {
|
|
1016
|
-
sourceCanvas = createFallbackCanvas();
|
|
1017
|
-
}
|
|
1018
|
-
const canvas = markerMap.size > 0 ? createAnnotatedCanvas(sourceCanvas, markerMap) : sourceCanvas;
|
|
1019
|
-
const markerContext = generateMarkerContext(markerMap);
|
|
1020
|
-
let compressed;
|
|
1021
|
-
try {
|
|
1022
|
-
compressed = compressImage(canvas);
|
|
1023
|
-
} catch {
|
|
1024
|
-
compressed = {
|
|
1025
|
-
imageData: canvas.toDataURL("image/png"),
|
|
1026
|
-
width: canvas.width,
|
|
1027
|
-
height: canvas.height
|
|
1028
|
-
};
|
|
1029
|
-
}
|
|
965
|
+
const snapshot = buildVisibleDomSnapshot(document.body, {
|
|
966
|
+
maxNodes: 1500,
|
|
967
|
+
maxTextLength: 80,
|
|
968
|
+
includeRects: true
|
|
969
|
+
});
|
|
1030
970
|
return {
|
|
1031
971
|
imageData: compressed.imageData,
|
|
1032
972
|
width: compressed.width,
|
|
1033
973
|
height: compressed.height,
|
|
1034
974
|
viewportWidth: captureMetrics.viewportWidth,
|
|
1035
975
|
viewportHeight: captureMetrics.viewportHeight,
|
|
1036
|
-
|
|
1037
|
-
|
|
976
|
+
domSnapshot: snapshot.text,
|
|
977
|
+
elementRegistry: snapshot.idToElement
|
|
1038
978
|
};
|
|
1039
979
|
}
|
|
1040
980
|
//#endregion
|
|
@@ -1044,20 +984,12 @@ async function captureAnnotatedViewport() {
|
|
|
1044
984
|
*/
|
|
1045
985
|
var ScreenCaptureService = class {
  /**
   * Capture the current viewport by delegating to `captureViewport()`.
   * @returns Promise resolving to the capture result (image data, dimensions
   *   and DOM snapshot)
   */
  async capture() {
    const captured = await captureViewport();
    return captured;
  }
};
|
|
1062
994
|
//#endregion
|
|
1063
995
|
//#region src/core/services/tts-playback-queue.ts
|
|
@@ -1300,12 +1232,12 @@ const AUDIO_LEVEL_NOISE_GATE = 5e-4;
|
|
|
1300
1232
|
const AUDIO_LEVEL_INPUT_GAIN = 600;
|
|
1301
1233
|
const AUDIO_LEVEL_ATTACK = .7;
|
|
1302
1234
|
const AUDIO_LEVEL_RELEASE = .25;
|
|
1303
|
-
function clamp
|
|
1235
|
+
/**
 * Restrict `value` to the inclusive range [`min`, `max`].
 *
 * @param value Number to restrict
 * @param min Lower bound
 * @param max Upper bound
 * @returns `min` when below the range, `max` when above it, otherwise `value`
 */
function clamp(value, min, max) {
  const raisedToMin = Math.max(value, min);
  return Math.min(raisedToMin, max);
}
|
|
1306
1238
|
/**
 * Map a raw RMS reading to a display level between 0 and 1.
 *
 * The noise gate is subtracted first (never going below zero), the remainder
 * is scaled by the input gain and passed through `log1p`, and the result is
 * divided by `log1p` of the gain before being clamped to [0, 1].
 *
 * @param rms Raw RMS value of the audio signal
 * @returns Normalized level in [0, 1]
 */
function normalizeAudioLevel(rms) {
  const aboveGate = Math.max(0, rms - AUDIO_LEVEL_NOISE_GATE);
  const compressed = Math.log1p(aboveGate * AUDIO_LEVEL_INPUT_GAIN);
  const fullScale = Math.log1p(AUDIO_LEVEL_INPUT_GAIN);
  return clamp(compressed / fullScale, 0, 1);
}
|
|
1310
1242
|
function smoothAudioLevel(current, target) {
|
|
1311
1243
|
const smoothing = target > current ? AUDIO_LEVEL_ATTACK : AUDIO_LEVEL_RELEASE;
|
|
@@ -1548,7 +1480,7 @@ function parseUIStreamLine(line) {
|
|
|
1548
1480
|
* Check if a tool call is a point tool call with valid input.
|
|
1549
1481
|
*/
|
|
1550
1482
|
/**
 * Tell whether a stream chunk is a `point` tool call that carries an input
 * object with both an `elementId` and a `label` key.
 *
 * Only the presence of the keys is checked, not the type of their values.
 *
 * @param chunk Parsed stream chunk
 * @returns true when the chunk is a usable point tool call
 */
function isPointToolCall(chunk) {
  if (chunk.type !== "tool-input-available") return false;
  if (chunk.toolName !== "point") return false;
  const { input } = chunk;
  if (input == null || typeof input !== "object") return false;
  return "elementId" in input && "label" in input;
}
|
|
1553
1485
|
//#endregion
|
|
1554
1486
|
//#region src/core/utils/response-processor.ts
|
|
@@ -1691,9 +1623,6 @@ var ProgressiveResponseProcessor = class {
|
|
|
1691
1623
|
};
|
|
1692
1624
|
//#endregion
|
|
1693
1625
|
//#region src/core/client.ts
|
|
1694
|
-
function clamp(value, min, max) {
|
|
1695
|
-
return Math.min(Math.max(value, min), max);
|
|
1696
|
-
}
|
|
1697
1626
|
async function readErrorMessage(response, fallbackMessage) {
|
|
1698
1627
|
try {
|
|
1699
1628
|
if ((response.headers.get("Content-Type") ?? "").includes("application/json")) {
|
|
@@ -1706,21 +1635,6 @@ async function readErrorMessage(response, fallbackMessage) {
|
|
|
1706
1635
|
return fallbackMessage;
|
|
1707
1636
|
}
|
|
1708
1637
|
/**
|
|
1709
|
-
* Map coordinate-based pointing from screenshot space to viewport space.
|
|
1710
|
-
*/
|
|
1711
|
-
function mapCoordinatesToViewport(x, y, screenshot) {
|
|
1712
|
-
if (screenshot.width <= 0 || screenshot.height <= 0) return {
|
|
1713
|
-
x,
|
|
1714
|
-
y
|
|
1715
|
-
};
|
|
1716
|
-
const scaleX = screenshot.viewportWidth / screenshot.width;
|
|
1717
|
-
const scaleY = screenshot.viewportHeight / screenshot.height;
|
|
1718
|
-
return {
|
|
1719
|
-
x: clamp(Math.round(x * scaleX), 0, Math.max(screenshot.viewportWidth - 1, 0)),
|
|
1720
|
-
y: clamp(Math.round(y * scaleY), 0, Math.max(screenshot.viewportHeight - 1, 0))
|
|
1721
|
-
};
|
|
1722
|
-
}
|
|
1723
|
-
/**
|
|
1724
1638
|
* Framework-agnostic client for cursor buddy voice interactions.
|
|
1725
1639
|
*
|
|
1726
1640
|
* Manages the complete voice interaction flow:
|
|
@@ -1789,7 +1703,7 @@ var CursorBuddyClient = class {
|
|
|
1789
1703
|
this.notify();
|
|
1790
1704
|
this.abortController = new AbortController();
|
|
1791
1705
|
const signal = this.abortController.signal;
|
|
1792
|
-
this.screenshotPromise = this.screenCapture.
|
|
1706
|
+
this.screenshotPromise = this.screenCapture.capture();
|
|
1793
1707
|
this.beginListeningSession(signal).catch((error) => {
|
|
1794
1708
|
if (signal.aborted) return;
|
|
1795
1709
|
this.voiceCapture.dispose();
|
|
@@ -1842,16 +1756,17 @@ var CursorBuddyClient = class {
|
|
|
1842
1756
|
if (signal?.aborted) return;
|
|
1843
1757
|
this.options.onResponse?.(cleanResponse);
|
|
1844
1758
|
let pointTarget = null;
|
|
1845
|
-
if (pointToolCall)
|
|
1846
|
-
const
|
|
1847
|
-
if (
|
|
1848
|
-
|
|
1849
|
-
|
|
1850
|
-
|
|
1851
|
-
|
|
1852
|
-
|
|
1853
|
-
|
|
1854
|
-
|
|
1759
|
+
if (pointToolCall) {
|
|
1760
|
+
const element = screenshot.elementRegistry.get(pointToolCall.elementId);
|
|
1761
|
+
if (element) {
|
|
1762
|
+
const rect = element.getBoundingClientRect();
|
|
1763
|
+
pointTarget = {
|
|
1764
|
+
x: Math.round(rect.left + rect.width / 2),
|
|
1765
|
+
y: Math.round(rect.top + rect.height / 2),
|
|
1766
|
+
label: pointToolCall.label
|
|
1767
|
+
};
|
|
1768
|
+
}
|
|
1769
|
+
}
|
|
1855
1770
|
if (pointTarget) {
|
|
1856
1771
|
this.options.onPoint?.(pointTarget);
|
|
1857
1772
|
this.pointerController.pointAt(pointTarget);
|
|
@@ -2017,7 +1932,7 @@ var CursorBuddyClient = class {
|
|
|
2017
1932
|
},
|
|
2018
1933
|
transcript,
|
|
2019
1934
|
history,
|
|
2020
|
-
|
|
1935
|
+
domSnapshot: screenshot.domSnapshot
|
|
2021
1936
|
}),
|
|
2022
1937
|
signal
|
|
2023
1938
|
});
|
|
@@ -2251,4 +2166,4 @@ var CursorBuddyClient = class {
|
|
|
2251
2166
|
//#endregion
|
|
2252
2167
|
export { $buddyScale as a, $buddyRotation as i, $audioLevel as n, $cursorPosition as o, $buddyPosition as r, $pointingTarget as s, CursorBuddyClient as t };
|
|
2253
2168
|
|
|
2254
|
-
//# sourceMappingURL=client-
|
|
2169
|
+
//# sourceMappingURL=client-CliXcNch.mjs.map
|