cursor-buddy 0.0.8 → 0.0.9-beta.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -12
- package/dist/{client-D73KQZf8.mjs → client-CliXcNch.mjs} +296 -389
- package/dist/client-CliXcNch.mjs.map +1 -0
- package/dist/{client-Crn8tW7w.d.mts → client-sjVVGYPU.d.mts} +7 -39
- package/dist/client-sjVVGYPU.d.mts.map +1 -0
- package/dist/index.d.mts +3 -2
- package/dist/index.mjs +3 -2
- package/dist/point-tool-DZJmhD8e.mjs +16 -0
- package/dist/point-tool-DZJmhD8e.mjs.map +1 -0
- package/dist/point-tool-l3FewgM9.d.mts +22 -0
- package/dist/point-tool-l3FewgM9.d.mts.map +1 -0
- package/dist/react/index.d.mts +1 -1
- package/dist/react/index.mjs +1 -1
- package/dist/server/adapters/next.d.mts +2 -3
- package/dist/server/adapters/next.d.mts.map +1 -1
- package/dist/server/adapters/next.mjs +2 -5
- package/dist/server/adapters/next.mjs.map +1 -1
- package/dist/server/index.d.mts +4 -7
- package/dist/server/index.d.mts.map +1 -1
- package/dist/server/index.mjs +127 -39
- package/dist/server/index.mjs.map +1 -1
- package/dist/{types-BxBhjZju.d.mts → types-BJfkApb_.d.mts} +2 -1
- package/dist/types-BJfkApb_.d.mts.map +1 -0
- package/package.json +3 -2
- package/dist/client-Crn8tW7w.d.mts.map +0 -1
- package/dist/client-D73KQZf8.mjs.map +0 -1
- package/dist/types-BxBhjZju.d.mts.map +0 -1
|
@@ -21,80 +21,6 @@ const $isEnabled = atom(true);
|
|
|
21
21
|
atom(false);
|
|
22
22
|
const $conversationHistory = atom([]);
|
|
23
23
|
//#endregion
|
|
24
|
-
//#region src/core/pointing.ts
|
|
25
|
-
/**
|
|
26
|
-
* Parses POINT tags from AI responses.
|
|
27
|
-
*
|
|
28
|
-
* Supports two formats:
|
|
29
|
-
* - Marker-based: [POINT:5:label] - 3 parts, references a numbered marker
|
|
30
|
-
* - Coordinate-based: [POINT:640,360:label] - 4 parts, raw pixel coordinates
|
|
31
|
-
*/
|
|
32
|
-
const POINTING_TAG_REGEX = /\[POINT:(\d+)(?:,(\d+))?:([^\]]+)\]\s*$/;
|
|
33
|
-
const PARTIAL_POINTING_PREFIXES = new Set([
|
|
34
|
-
"[",
|
|
35
|
-
"[P",
|
|
36
|
-
"[PO",
|
|
37
|
-
"[POI",
|
|
38
|
-
"[POIN",
|
|
39
|
-
"[POINT",
|
|
40
|
-
"[POINT:"
|
|
41
|
-
]);
|
|
42
|
-
function stripTrailingPointingTag(response, trimResult) {
|
|
43
|
-
const stripped = response.replace(POINTING_TAG_REGEX, "");
|
|
44
|
-
return trimResult ? stripped.trim() : stripped;
|
|
45
|
-
}
|
|
46
|
-
function getPartialPointingTagStart(response) {
|
|
47
|
-
const lastOpenBracket = response.lastIndexOf("[");
|
|
48
|
-
if (lastOpenBracket === -1) return -1;
|
|
49
|
-
const suffix = response.slice(lastOpenBracket).trimEnd();
|
|
50
|
-
if (suffix.includes("]")) return -1;
|
|
51
|
-
if (suffix.startsWith("[POINT:")) {
|
|
52
|
-
let start = lastOpenBracket;
|
|
53
|
-
while (start > 0 && /\s/.test(response[start - 1] ?? "")) start--;
|
|
54
|
-
return start;
|
|
55
|
-
}
|
|
56
|
-
return PARTIAL_POINTING_PREFIXES.has(suffix) ? lastOpenBracket : -1;
|
|
57
|
-
}
|
|
58
|
-
/**
|
|
59
|
-
* Parse pointing tag into structured result.
|
|
60
|
-
* Returns null if no valid POINT tag is found at the end.
|
|
61
|
-
*/
|
|
62
|
-
function parsePointingTagRaw(response) {
|
|
63
|
-
const match = response.match(POINTING_TAG_REGEX);
|
|
64
|
-
if (!match) return null;
|
|
65
|
-
const first = Number.parseInt(match[1], 10);
|
|
66
|
-
const second = match[2] ? Number.parseInt(match[2], 10) : null;
|
|
67
|
-
const label = match[3].trim();
|
|
68
|
-
if (second !== null) return {
|
|
69
|
-
type: "coordinates",
|
|
70
|
-
x: first,
|
|
71
|
-
y: second,
|
|
72
|
-
label
|
|
73
|
-
};
|
|
74
|
-
return {
|
|
75
|
-
type: "marker",
|
|
76
|
-
markerId: first,
|
|
77
|
-
label
|
|
78
|
-
};
|
|
79
|
-
}
|
|
80
|
-
/**
|
|
81
|
-
* Remove POINT tag from response text for display/TTS.
|
|
82
|
-
*/
|
|
83
|
-
function stripPointingTag(response) {
|
|
84
|
-
return stripTrailingPointingTag(response, true);
|
|
85
|
-
}
|
|
86
|
-
/**
|
|
87
|
-
* Strip complete or partial trailing POINT syntax while the response streams.
|
|
88
|
-
* This keeps the visible text and TTS input stable even if the tag arrives
|
|
89
|
-
* incrementally over multiple chunks.
|
|
90
|
-
*/
|
|
91
|
-
function stripTrailingPointingSyntax(response) {
|
|
92
|
-
const withoutCompleteTag = stripTrailingPointingTag(response, false);
|
|
93
|
-
const partialTagStart = getPartialPointingTagStart(withoutCompleteTag);
|
|
94
|
-
if (partialTagStart === -1) return withoutCompleteTag.trimEnd();
|
|
95
|
-
return withoutCompleteTag.slice(0, partialTagStart).trimEnd();
|
|
96
|
-
}
|
|
97
|
-
//#endregion
|
|
98
24
|
//#region src/core/utils/error.ts
|
|
99
25
|
/**
|
|
100
26
|
* Normalize unknown thrown values into Error instances.
|
|
@@ -685,231 +611,199 @@ var PointerController = class {
|
|
|
685
611
|
}
|
|
686
612
|
};
|
|
687
613
|
//#endregion
|
|
688
|
-
//#region src/core/utils/
|
|
689
|
-
const
|
|
690
|
-
|
|
691
|
-
|
|
692
|
-
|
|
693
|
-
|
|
694
|
-
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
|
|
699
|
-
|
|
700
|
-
|
|
701
|
-
|
|
702
|
-
|
|
703
|
-
|
|
704
|
-
|
|
705
|
-
function drawAnnotations(ctx, markers, style = {}) {
|
|
706
|
-
const s = {
|
|
707
|
-
...DEFAULT_STYLE,
|
|
708
|
-
...style
|
|
709
|
-
};
|
|
710
|
-
ctx.save();
|
|
711
|
-
for (const marker of markers.values()) {
|
|
712
|
-
const { rect, id } = marker;
|
|
713
|
-
ctx.strokeStyle = s.borderColor;
|
|
714
|
-
ctx.lineWidth = s.borderWidth;
|
|
715
|
-
ctx.strokeRect(rect.left, rect.top, rect.width, rect.height);
|
|
716
|
-
const label = String(id);
|
|
717
|
-
ctx.font = `bold ${s.fontSize}px monospace`;
|
|
718
|
-
const textWidth = ctx.measureText(label).width;
|
|
719
|
-
const textHeight = s.fontSize;
|
|
720
|
-
const labelWidth = textWidth + s.labelPadding * 2;
|
|
721
|
-
const labelHeight = textHeight + s.labelPadding;
|
|
722
|
-
const labelX = rect.left - s.borderWidth;
|
|
723
|
-
const labelY = rect.top < labelHeight + 4 ? rect.top + 2 : rect.top - labelHeight;
|
|
724
|
-
ctx.fillStyle = s.labelBackground;
|
|
725
|
-
ctx.beginPath();
|
|
726
|
-
ctx.roundRect(labelX, labelY, labelWidth, labelHeight, 2);
|
|
727
|
-
ctx.fill();
|
|
728
|
-
ctx.fillStyle = s.labelColor;
|
|
729
|
-
ctx.textBaseline = "top";
|
|
730
|
-
ctx.fillText(label, labelX + s.labelPadding, labelY + s.labelPadding / 2);
|
|
731
|
-
}
|
|
732
|
-
ctx.restore();
|
|
733
|
-
}
|
|
734
|
-
/**
|
|
735
|
-
* Create an annotated copy of a canvas.
|
|
736
|
-
* Does not modify the original canvas.
|
|
737
|
-
*
|
|
738
|
-
* @param sourceCanvas Original screenshot canvas
|
|
739
|
-
* @param markers Marker map from element discovery
|
|
740
|
-
* @returns New canvas with annotations drawn
|
|
741
|
-
*/
|
|
742
|
-
function createAnnotatedCanvas(sourceCanvas, markers) {
|
|
743
|
-
const canvas = document.createElement("canvas");
|
|
744
|
-
canvas.width = sourceCanvas.width;
|
|
745
|
-
canvas.height = sourceCanvas.height;
|
|
746
|
-
const ctx = canvas.getContext("2d");
|
|
747
|
-
if (!ctx) throw new Error("Failed to get canvas 2D context");
|
|
748
|
-
ctx.drawImage(sourceCanvas, 0, 0);
|
|
749
|
-
drawAnnotations(ctx, markers);
|
|
750
|
-
return canvas;
|
|
751
|
-
}
|
|
752
|
-
/**
|
|
753
|
-
* Generate marker context string for AI prompt.
|
|
754
|
-
* Lists available markers with their descriptions.
|
|
755
|
-
*
|
|
756
|
-
* @param markers Marker map from element discovery
|
|
757
|
-
* @returns Formatted string listing markers
|
|
758
|
-
*/
|
|
759
|
-
function generateMarkerContext(markers) {
|
|
760
|
-
if (markers.size === 0) return "No interactive elements detected.";
|
|
761
|
-
const lines = ["Interactive elements (use marker number to point):"];
|
|
762
|
-
for (const marker of markers.values()) lines.push(` ${marker.id}: ${marker.description}`);
|
|
763
|
-
return lines.join("\n");
|
|
764
|
-
}
|
|
765
|
-
//#endregion
|
|
766
|
-
//#region src/core/utils/elements.ts
|
|
767
|
-
/**
|
|
768
|
-
* Element discovery for annotated screenshots.
|
|
769
|
-
* Finds visible interactive elements and assigns marker IDs.
|
|
770
|
-
*/
|
|
771
|
-
/** Max characters for element descriptions passed to the model. */
|
|
772
|
-
const MAX_DESCRIPTION_LENGTH = 50;
|
|
773
|
-
/** Pixels tolerance for grouping elements into the same visual row. */
|
|
774
|
-
const ROW_TOLERANCE_PX = 20;
|
|
775
|
-
/**
|
|
776
|
-
* Interactive element selectors - elements users would want to click/interact with.
|
|
777
|
-
* Mirrors accessibility roles from agent-browser but using CSS selectors.
|
|
778
|
-
*/
|
|
779
|
-
const INTERACTIVE_SELECTORS = [
|
|
780
|
-
"button",
|
|
781
|
-
"[role=\"button\"]",
|
|
782
|
-
"input[type=\"button\"]",
|
|
783
|
-
"input[type=\"submit\"]",
|
|
784
|
-
"input[type=\"reset\"]",
|
|
785
|
-
"a[href]",
|
|
786
|
-
"[role=\"link\"]",
|
|
787
|
-
"input:not([type=\"hidden\"])",
|
|
788
|
-
"textarea",
|
|
789
|
-
"select",
|
|
790
|
-
"[role=\"textbox\"]",
|
|
791
|
-
"[role=\"searchbox\"]",
|
|
792
|
-
"[role=\"combobox\"]",
|
|
793
|
-
"[role=\"listbox\"]",
|
|
794
|
-
"[role=\"slider\"]",
|
|
795
|
-
"[role=\"spinbutton\"]",
|
|
796
|
-
"[role=\"checkbox\"]",
|
|
797
|
-
"[role=\"radio\"]",
|
|
798
|
-
"[role=\"switch\"]",
|
|
799
|
-
"[role=\"menuitem\"]",
|
|
800
|
-
"[role=\"menuitemcheckbox\"]",
|
|
801
|
-
"[role=\"menuitemradio\"]",
|
|
802
|
-
"[role=\"option\"]",
|
|
803
|
-
"[role=\"tab\"]",
|
|
804
|
-
"[role=\"treeitem\"]",
|
|
805
|
-
"video",
|
|
806
|
-
"audio",
|
|
807
|
-
"[data-cursor-buddy-interactive]"
|
|
614
|
+
//#region src/core/utils/dom-snapshot.ts
|
|
615
|
+
const EXCLUDED_TAGS = new Set([
|
|
616
|
+
"script",
|
|
617
|
+
"link",
|
|
618
|
+
"style",
|
|
619
|
+
"noscript",
|
|
620
|
+
"head"
|
|
621
|
+
]);
|
|
622
|
+
const DEFAULT_INCLUDED_ATTRIBUTES = [
|
|
623
|
+
"id",
|
|
624
|
+
"name",
|
|
625
|
+
"type",
|
|
626
|
+
"placeholder",
|
|
627
|
+
"href",
|
|
628
|
+
"title",
|
|
629
|
+
"value",
|
|
630
|
+
"role"
|
|
808
631
|
];
|
|
809
|
-
|
|
810
|
-
|
|
811
|
-
|
|
812
|
-
|
|
813
|
-
|
|
814
|
-
|
|
815
|
-
const
|
|
816
|
-
|
|
817
|
-
|
|
818
|
-
|
|
819
|
-
}
|
|
820
|
-
|
|
821
|
-
|
|
822
|
-
|
|
823
|
-
|
|
824
|
-
*
|
|
825
|
-
|
|
826
|
-
|
|
827
|
-
|
|
828
|
-
|
|
829
|
-
|
|
830
|
-
|
|
831
|
-
|
|
832
|
-
if (
|
|
833
|
-
|
|
834
|
-
|
|
835
|
-
|
|
836
|
-
|
|
837
|
-
|
|
838
|
-
|
|
839
|
-
|
|
840
|
-
|
|
841
|
-
|
|
842
|
-
|
|
843
|
-
|
|
844
|
-
|
|
845
|
-
|
|
846
|
-
|
|
847
|
-
|
|
848
|
-
|
|
849
|
-
|
|
850
|
-
|
|
851
|
-
|
|
852
|
-
for (const element of allElements) {
|
|
853
|
-
const rect = element.getBoundingClientRect();
|
|
854
|
-
if (!isElementVisible(element, rect)) continue;
|
|
855
|
-
visible.push({
|
|
856
|
-
element,
|
|
857
|
-
rect
|
|
858
|
-
});
|
|
632
|
+
function buildVisibleDomSnapshot(root, options = {}) {
|
|
633
|
+
const { maxTextLength = 80, maxNodes = 1500, includeRects = true, rootLabel = "viewport", includedAttributes = DEFAULT_INCLUDED_ATTRIBUTES } = options;
|
|
634
|
+
const doc = root instanceof Document ? root : root.ownerDocument || document;
|
|
635
|
+
const startRoot = root instanceof Document ? root.documentElement : root;
|
|
636
|
+
const win = doc.defaultView || window;
|
|
637
|
+
const viewportW = win.innerWidth || 0;
|
|
638
|
+
const viewportH = win.innerHeight || 0;
|
|
639
|
+
let nextId = 1;
|
|
640
|
+
let nodeCount = 0;
|
|
641
|
+
const idToElement = /* @__PURE__ */ new Map();
|
|
642
|
+
const lines = [`# ${rootLabel} ${viewportW}x${viewportH}`];
|
|
643
|
+
/**
|
|
644
|
+
* Returns true when the element is worth considering for the snapshot.
|
|
645
|
+
*
|
|
646
|
+
* This is intentionally simple:
|
|
647
|
+
* - skip excluded tags
|
|
648
|
+
* - skip hidden/display:none/visibility:hidden/etc
|
|
649
|
+
* - skip zero-size elements
|
|
650
|
+
* - skip elements fully outside the viewport
|
|
651
|
+
*/
|
|
652
|
+
function isElementVisible(el) {
|
|
653
|
+
const tag = el.tagName.toLowerCase();
|
|
654
|
+
if (EXCLUDED_TAGS.has(tag)) return false;
|
|
655
|
+
if (!(el instanceof HTMLElement)) return false;
|
|
656
|
+
if (el.hidden) return false;
|
|
657
|
+
if (el.closest("head")) return false;
|
|
658
|
+
if (typeof el.checkVisibility === "function") try {
|
|
659
|
+
if (!el.checkVisibility({
|
|
660
|
+
opacityProperty: true,
|
|
661
|
+
visibilityProperty: true,
|
|
662
|
+
contentVisibilityAuto: true
|
|
663
|
+
})) return false;
|
|
664
|
+
} catch {}
|
|
665
|
+
const style = win.getComputedStyle(el);
|
|
666
|
+
if (style.display === "none") return false;
|
|
667
|
+
if (style.visibility === "hidden" || style.visibility === "collapse") return false;
|
|
668
|
+
if (style.opacity === "0") return false;
|
|
669
|
+
if (style.contentVisibility === "hidden") return false;
|
|
670
|
+
const rect = el.getBoundingClientRect();
|
|
671
|
+
if (rect.width <= 0 || rect.height <= 0) return false;
|
|
672
|
+
if (rect.bottom <= 0 || rect.right <= 0) return false;
|
|
673
|
+
if (rect.top >= viewportH || rect.left >= viewportW) return false;
|
|
674
|
+
return true;
|
|
859
675
|
}
|
|
860
|
-
|
|
861
|
-
|
|
862
|
-
|
|
863
|
-
|
|
864
|
-
|
|
865
|
-
|
|
866
|
-
|
|
867
|
-
|
|
868
|
-
|
|
869
|
-
|
|
870
|
-
|
|
871
|
-
|
|
872
|
-
|
|
873
|
-
|
|
874
|
-
|
|
875
|
-
|
|
876
|
-
|
|
676
|
+
/**
|
|
677
|
+
* Extracts a compact text representation from the element itself.
|
|
678
|
+
*
|
|
679
|
+
* No semantic guessing:
|
|
680
|
+
* - prefer innerText when available
|
|
681
|
+
* - otherwise fall back to textContent
|
|
682
|
+
* - normalize whitespace
|
|
683
|
+
* - truncate aggressively
|
|
684
|
+
*/
|
|
685
|
+
function getElementText(el) {
|
|
686
|
+
const text = normalizeWhitespace(el.innerText || el.textContent || "");
|
|
687
|
+
if (!text) return "";
|
|
688
|
+
return truncate(text, maxTextLength);
|
|
689
|
+
}
|
|
690
|
+
/**
|
|
691
|
+
* Keeps only a small allowlist of raw DOM attributes.
|
|
692
|
+
*
|
|
693
|
+
* This avoids dumping the full attribute bag, which is usually noisy
|
|
694
|
+
* and expensive in tokens.
|
|
695
|
+
*/
|
|
696
|
+
function getIncludedAttributes(el) {
|
|
697
|
+
const attrs = {};
|
|
698
|
+
for (const name of includedAttributes) {
|
|
699
|
+
const value = el.getAttribute(name);
|
|
700
|
+
if (value == null) continue;
|
|
701
|
+
const clean = truncate(normalizeWhitespace(value), maxTextLength);
|
|
702
|
+
if (!clean) continue;
|
|
703
|
+
attrs[name] = clean;
|
|
704
|
+
}
|
|
705
|
+
return attrs;
|
|
706
|
+
}
|
|
707
|
+
/**
|
|
708
|
+
* Rounds the client rect so the output is smaller and more stable.
|
|
709
|
+
*/
|
|
710
|
+
function quantizeRect(el) {
|
|
711
|
+
const r = el.getBoundingClientRect();
|
|
712
|
+
return {
|
|
713
|
+
x: Math.max(0, Math.round(r.left)),
|
|
714
|
+
y: Math.max(0, Math.round(r.top)),
|
|
715
|
+
w: Math.round(r.width),
|
|
716
|
+
h: Math.round(r.height)
|
|
717
|
+
};
|
|
718
|
+
}
|
|
719
|
+
/**
|
|
720
|
+
* Decides whether this node should be emitted.
|
|
721
|
+
*
|
|
722
|
+
* Simple rule:
|
|
723
|
+
* - keep it if it has visible kept children
|
|
724
|
+
* - or keep it if it has some text
|
|
725
|
+
* - or keep it if it has at least one included attribute
|
|
726
|
+
*
|
|
727
|
+
* This allows non-semantic div-heavy UIs to survive without trying
|
|
728
|
+
* to guess intent.
|
|
729
|
+
*/
|
|
730
|
+
function shouldKeepNode(text, attrs, children) {
|
|
731
|
+
if (children.length > 0) return true;
|
|
732
|
+
if (text.length > 0) return true;
|
|
733
|
+
if (Object.keys(attrs).length > 0) return true;
|
|
734
|
+
return false;
|
|
735
|
+
}
|
|
736
|
+
/**
|
|
737
|
+
* Single DFS traversal over the DOM.
|
|
738
|
+
*
|
|
739
|
+
* Complexity target:
|
|
740
|
+
* - O(N) DOM walk
|
|
741
|
+
* - O(1) work per element, aside from browser layout/style calls
|
|
742
|
+
*/
|
|
743
|
+
function walk(el) {
|
|
744
|
+
if (nodeCount >= maxNodes) return null;
|
|
745
|
+
if (!(el instanceof HTMLElement)) return null;
|
|
746
|
+
if (!isElementVisible(el)) return null;
|
|
747
|
+
const children = [];
|
|
748
|
+
for (const child of Array.from(el.children)) {
|
|
749
|
+
const childNode = walk(child);
|
|
750
|
+
if (childNode) children.push(childNode);
|
|
751
|
+
if (nodeCount >= maxNodes) break;
|
|
752
|
+
}
|
|
753
|
+
const text = getElementText(el);
|
|
754
|
+
const attrs = getIncludedAttributes(el);
|
|
755
|
+
if (!shouldKeepNode(text, attrs, children)) return null;
|
|
756
|
+
const id = nextId++;
|
|
757
|
+
nodeCount++;
|
|
758
|
+
idToElement.set(id, el);
|
|
759
|
+
return {
|
|
877
760
|
id,
|
|
878
|
-
|
|
879
|
-
|
|
880
|
-
|
|
881
|
-
|
|
882
|
-
|
|
883
|
-
|
|
884
|
-
}
|
|
885
|
-
/**
|
|
886
|
-
*
|
|
887
|
-
|
|
888
|
-
|
|
889
|
-
|
|
761
|
+
tag: el.tagName.toLowerCase(),
|
|
762
|
+
text,
|
|
763
|
+
attrs,
|
|
764
|
+
rect: includeRects ? quantizeRect(el) : void 0,
|
|
765
|
+
children
|
|
766
|
+
};
|
|
767
|
+
}
|
|
768
|
+
/**
|
|
769
|
+
* Emits the final compact line-based format.
|
|
770
|
+
*
|
|
771
|
+
* Example:
|
|
772
|
+
* @12 div "Settings" [id="settings"] [x=10 y=20 w=200 h=40]
|
|
773
|
+
*/
|
|
774
|
+
function emit(node, depth) {
|
|
775
|
+
const parts = [`${" ".repeat(depth)}@${node.id} ${node.tag}`];
|
|
776
|
+
if (node.text) parts.push(`"${escapeQuotes(node.text)}"`);
|
|
777
|
+
for (const [key, value] of Object.entries(node.attrs)) parts.push(`[${key}="${escapeQuotes(value)}"]`);
|
|
778
|
+
if (node.rect) parts.push(`[x=${node.rect.x} y=${node.rect.y} w=${node.rect.w} h=${node.rect.h}]`);
|
|
779
|
+
lines.push(parts.join(" "));
|
|
780
|
+
for (const child of node.children) emit(child, depth + 1);
|
|
781
|
+
}
|
|
782
|
+
const tree = walk(startRoot);
|
|
783
|
+
if (tree) emit(tree, 0);
|
|
890
784
|
return {
|
|
891
|
-
|
|
892
|
-
|
|
785
|
+
text: lines.join("\n"),
|
|
786
|
+
idToElement,
|
|
787
|
+
nodeCount
|
|
893
788
|
};
|
|
894
789
|
}
|
|
895
|
-
|
|
896
|
-
|
|
897
|
-
|
|
898
|
-
|
|
899
|
-
|
|
900
|
-
|
|
901
|
-
|
|
902
|
-
|
|
903
|
-
|
|
904
|
-
return getElementCenter(marker.element);
|
|
790
|
+
function normalizeWhitespace(text) {
|
|
791
|
+
return text.replace(/\s+/g, " ").trim();
|
|
792
|
+
}
|
|
793
|
+
/**
 * Shorten `text` so the result is at most `maxLength` UTF-16 code units,
 * ending with "…" whenever something was cut off.
 *
 * @param {string} text - Text to shorten.
 * @param {number} maxLength - Maximum length of the result, ellipsis included.
 * @returns {string} `text` unchanged when it already fits; otherwise the
 *   leading part of `text` (trailing whitespace removed) followed by "…".
 *   An empty string when the limit leaves no room even for the ellipsis.
 */
function truncate(text, maxLength) {
  if (text.length <= maxLength) return text;
  // With a limit below 1, slice(0, maxLength - 1) would receive a negative end
  // index, which counts from the END of the string and returns almost the
  // whole text - the opposite of truncating.
  if (maxLength < 1) return "";
  let head = text.slice(0, maxLength - 1);
  // Never leave half of a surrogate pair (e.g. a split emoji) at the cut.
  const lastUnit = head.charCodeAt(head.length - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) head = head.slice(0, -1);
  return `${head.trimEnd()}…`;
}
|
|
797
|
+
/**
 * Escape `text` so it can be embedded between double quotes in a snapshot line.
 *
 * Backslashes are escaped together with double quotes. Escaping only the
 * quotes would make a value that ends in a backslash swallow the closing
 * quote, and would make `\"` in the input indistinguishable from an escaped
 * quote.
 *
 * @param {string} text - Raw text or attribute value.
 * @returns {string} `text` with every `\` and `"` prefixed by a backslash.
 */
function escapeQuotes(text) {
  // "$&" re-inserts the matched character after the added backslash.
  return text.replace(/[\\"]/g, "\\$&");
}
|
|
906
800
|
//#endregion
|
|
907
801
|
//#region src/core/utils/screenshot.ts
|
|
908
802
|
const CLONE_RESOURCE_TIMEOUT_MS = 3e3;
|
|
909
803
|
/** Maximum width for compressed screenshots (maintains aspect ratio) */
|
|
910
|
-
const MAX_SCREENSHOT_WIDTH =
|
|
911
|
-
/** JPEG quality for compressed screenshots (0-1) */
|
|
912
|
-
const JPEG_QUALITY = .
|
|
804
|
+
const MAX_SCREENSHOT_WIDTH = 1920;
|
|
805
|
+
/** JPEG quality for compressed screenshots (0-1) - higher quality for clearer details */
|
|
806
|
+
const JPEG_QUALITY = .95;
|
|
913
807
|
/**
|
|
914
808
|
* Compress a canvas image by downscaling and converting to JPEG.
|
|
915
809
|
* Maintains aspect ratio and falls back to original if compression fails.
|
|
@@ -1009,7 +903,7 @@ async function waitForClonedDocumentStyles(doc) {
|
|
|
1009
903
|
}
|
|
1010
904
|
function getHtml2CanvasOptions(captureMetrics) {
|
|
1011
905
|
return {
|
|
1012
|
-
scale:
|
|
906
|
+
scale: window.devicePixelRatio,
|
|
1013
907
|
useCORS: true,
|
|
1014
908
|
logging: false,
|
|
1015
909
|
width: captureMetrics.viewportWidth,
|
|
@@ -1045,9 +939,10 @@ function createFallbackCanvas() {
|
|
|
1045
939
|
return canvas;
|
|
1046
940
|
}
|
|
1047
941
|
/**
|
|
1048
|
-
* Capture a screenshot of the current viewport.
|
|
1049
|
-
* Uses html2canvas to render the DOM to a canvas,
|
|
1050
|
-
*
|
|
942
|
+
* Capture a screenshot and DOM snapshot of the current viewport.
|
|
943
|
+
* Uses html2canvas to render the DOM to a canvas, compresses to high-quality JPEG,
|
|
944
|
+
* and builds a token-efficient DOM snapshot for AI context.
|
|
945
|
+
* Falls back to a placeholder if capture fails.
|
|
1051
946
|
*/
|
|
1052
947
|
async function captureViewport() {
|
|
1053
948
|
const captureMetrics = getCaptureMetrics();
|
|
@@ -1067,48 +962,19 @@ async function captureViewport() {
|
|
|
1067
962
|
height: canvas.height
|
|
1068
963
|
};
|
|
1069
964
|
}
|
|
1070
|
-
|
|
1071
|
-
|
|
1072
|
-
|
|
1073
|
-
|
|
1074
|
-
|
|
1075
|
-
viewportHeight: captureMetrics.viewportHeight
|
|
1076
|
-
};
|
|
1077
|
-
}
|
|
1078
|
-
/**
|
|
1079
|
-
* Capture an annotated screenshot of the current viewport.
|
|
1080
|
-
* Interactive elements are marked with numbered labels.
|
|
1081
|
-
* Returns both the annotated image and a marker map for resolving IDs.
|
|
1082
|
-
*/
|
|
1083
|
-
async function captureAnnotatedViewport() {
|
|
1084
|
-
const captureMetrics = getCaptureMetrics();
|
|
1085
|
-
const markerMap = createMarkerMap();
|
|
1086
|
-
let sourceCanvas;
|
|
1087
|
-
try {
|
|
1088
|
-
sourceCanvas = await html2canvas(document.body, getHtml2CanvasOptions(captureMetrics));
|
|
1089
|
-
} catch {
|
|
1090
|
-
sourceCanvas = createFallbackCanvas();
|
|
1091
|
-
}
|
|
1092
|
-
const canvas = markerMap.size > 0 ? createAnnotatedCanvas(sourceCanvas, markerMap) : sourceCanvas;
|
|
1093
|
-
const markerContext = generateMarkerContext(markerMap);
|
|
1094
|
-
let compressed;
|
|
1095
|
-
try {
|
|
1096
|
-
compressed = compressImage(canvas);
|
|
1097
|
-
} catch {
|
|
1098
|
-
compressed = {
|
|
1099
|
-
imageData: canvas.toDataURL("image/png"),
|
|
1100
|
-
width: canvas.width,
|
|
1101
|
-
height: canvas.height
|
|
1102
|
-
};
|
|
1103
|
-
}
|
|
965
|
+
const snapshot = buildVisibleDomSnapshot(document.body, {
|
|
966
|
+
maxNodes: 1500,
|
|
967
|
+
maxTextLength: 80,
|
|
968
|
+
includeRects: true
|
|
969
|
+
});
|
|
1104
970
|
return {
|
|
1105
971
|
imageData: compressed.imageData,
|
|
1106
972
|
width: compressed.width,
|
|
1107
973
|
height: compressed.height,
|
|
1108
974
|
viewportWidth: captureMetrics.viewportWidth,
|
|
1109
975
|
viewportHeight: captureMetrics.viewportHeight,
|
|
1110
|
-
|
|
1111
|
-
|
|
976
|
+
domSnapshot: snapshot.text,
|
|
977
|
+
elementRegistry: snapshot.idToElement
|
|
1112
978
|
};
|
|
1113
979
|
}
|
|
1114
980
|
//#endregion
|
|
@@ -1118,20 +984,12 @@ async function captureAnnotatedViewport() {
|
|
|
1118
984
|
*/
|
|
1119
985
|
var ScreenCaptureService = class {
|
|
1120
986
|
/**
|
|
1121
|
-
* Capture a screenshot of the current viewport.
|
|
1122
|
-
* @returns Screenshot result with image data and
|
|
987
|
+
* Capture a screenshot and DOM snapshot of the current viewport.
|
|
988
|
+
* @returns Screenshot result with image data, dimensions, and DOM snapshot
|
|
1123
989
|
*/
|
|
1124
990
|
async capture() {
|
|
1125
991
|
return captureViewport();
|
|
1126
992
|
}
|
|
1127
|
-
/**
|
|
1128
|
-
* Capture an annotated screenshot with marker overlays.
|
|
1129
|
-
* Interactive elements are marked with numbered labels.
|
|
1130
|
-
* @returns Annotated screenshot result with marker map
|
|
1131
|
-
*/
|
|
1132
|
-
async captureAnnotated() {
|
|
1133
|
-
return captureAnnotatedViewport();
|
|
1134
|
-
}
|
|
1135
993
|
};
|
|
1136
994
|
//#endregion
|
|
1137
995
|
//#region src/core/services/tts-playback-queue.ts
|
|
@@ -1374,12 +1232,12 @@ const AUDIO_LEVEL_NOISE_GATE = 5e-4;
|
|
|
1374
1232
|
const AUDIO_LEVEL_INPUT_GAIN = 600;
|
|
1375
1233
|
const AUDIO_LEVEL_ATTACK = .7;
|
|
1376
1234
|
const AUDIO_LEVEL_RELEASE = .25;
|
|
1377
|
-
function clamp
|
|
1235
|
+
function clamp(value, min, max) {
|
|
1378
1236
|
return Math.min(Math.max(value, min), max);
|
|
1379
1237
|
}
|
|
1380
1238
|
function normalizeAudioLevel(rms) {
|
|
1381
1239
|
const gatedRms = Math.max(0, rms - AUDIO_LEVEL_NOISE_GATE);
|
|
1382
|
-
return clamp
|
|
1240
|
+
return clamp(Math.log1p(gatedRms * AUDIO_LEVEL_INPUT_GAIN) / Math.log1p(AUDIO_LEVEL_INPUT_GAIN), 0, 1);
|
|
1383
1241
|
}
|
|
1384
1242
|
function smoothAudioLevel(current, target) {
|
|
1385
1243
|
const smoothing = target > current ? AUDIO_LEVEL_ATTACK : AUDIO_LEVEL_RELEASE;
|
|
@@ -1584,6 +1442,47 @@ function createStateMachine(initial = "idle") {
|
|
|
1584
1442
|
};
|
|
1585
1443
|
}
|
|
1586
1444
|
//#endregion
|
|
1445
|
+
//#region src/core/utils/ui-stream-parser.ts
|
|
1446
|
+
/**
 * Parse a single line from the UI message stream.
 *
 * A line is a JSON chunk, optionally carried in a Server-Sent-Events `data:`
 * field. The space after the colon is optional in SSE, so both `data: {...}`
 * and `data:{...}` are accepted.
 *
 * @param {string} line - One line of the stream, without its line terminator.
 * @returns {object | null} A normalized chunk, one of
 *   `{ type: "text-delta", delta }`,
 *   `{ type: "tool-input-available", toolName, input }`,
 *   `{ type: "finish" }`, `{ type: "error", errorText }` or
 *   `{ type: "unknown" }`; `null` for blank lines, the `[DONE]` sentinel, and
 *   payloads that are not a JSON object.
 */
function parseUIStreamLine(line) {
  const trimmed = line.trim();
  if (!trimmed) return null;
  const payload = trimmed.startsWith("data:") ? trimmed.slice(5).trimStart() : trimmed;
  if (payload === "[DONE]") return null;
  let chunk;
  try {
    chunk = JSON.parse(payload);
  } catch {
    // Best effort by design: anything that is not JSON (SSE comments, other
    // SSE fields, a line split mid-transfer) is skipped, not fatal.
    return null;
  }
  // Valid JSON that is not an object (null, numbers, strings) carries no chunk.
  if (chunk === null || typeof chunk !== "object") return null;
  switch (chunk.type) {
    case "text-delta":
      return {
        type: "text-delta",
        delta: chunk.delta ?? ""
      };
    case "tool-input-available":
      return {
        type: "tool-input-available",
        toolName: chunk.toolName ?? "",
        input: chunk.input
      };
    case "finish":
      return { type: "finish" };
    case "error":
      return {
        type: "error",
        errorText: chunk.errorText ?? "Unknown error"
      };
    default:
      return { type: "unknown" };
  }
}
|
|
1479
|
+
/**
 * Check whether a parsed stream chunk is a `point` tool call whose input can
 * actually be resolved.
 *
 * Snapshot element ids are integers, and the element registry is a Map keyed
 * by those integers. An `elementId` of any other type (for example the string
 * "12") could never match a registry entry, so it is rejected here instead of
 * failing silently at lookup time.
 *
 * @param {object} chunk - A chunk as returned by the UI stream line parser.
 * @returns {boolean} True when `chunk` is a `tool-input-available` chunk for
 *   the `point` tool whose input has an integer `elementId` and a `label` key.
 */
function isPointToolCall(chunk) {
  if (chunk.type !== "tool-input-available" || chunk.toolName !== "point") return false;
  const { input } = chunk;
  if (input == null || typeof input !== "object") return false;
  return Number.isInteger(input.elementId) && "label" in input;
}
|
|
1485
|
+
//#endregion
|
|
1587
1486
|
//#region src/core/utils/response-processor.ts
|
|
1588
1487
|
const COMMON_ABBREVIATIONS = [
|
|
1589
1488
|
"mr.",
|
|
@@ -1652,32 +1551,58 @@ function extractCompletedSegments(text) {
|
|
|
1652
1551
|
};
|
|
1653
1552
|
}
|
|
1654
1553
|
/**
|
|
1655
|
-
*
|
|
1656
|
-
*
|
|
1554
|
+
* Processes a streaming AI SDK UI message stream response.
|
|
1555
|
+
* Extracts text for display/TTS and captures point tool calls.
|
|
1657
1556
|
*/
|
|
1658
1557
|
var ProgressiveResponseProcessor = class {
|
|
1659
|
-
|
|
1558
|
+
consumedTextLength = 0;
|
|
1660
1559
|
pendingShortSegment = "";
|
|
1661
|
-
|
|
1560
|
+
rawText = "";
|
|
1561
|
+
buffer = "";
|
|
1562
|
+
pointToolCall = null;
|
|
1563
|
+
/**
|
|
1564
|
+
* Push raw stream data and extract text chunks and tool calls.
|
|
1565
|
+
* The UI message stream format is newline-delimited JSON.
|
|
1566
|
+
*/
|
|
1662
1567
|
push(chunk) {
|
|
1663
|
-
this.
|
|
1664
|
-
const
|
|
1665
|
-
|
|
1666
|
-
|
|
1568
|
+
this.buffer += chunk;
|
|
1569
|
+
const lines = this.buffer.split("\n");
|
|
1570
|
+
this.buffer = lines.pop() ?? "";
|
|
1571
|
+
const newTextParts = [];
|
|
1572
|
+
for (const line of lines) {
|
|
1573
|
+
const parsed = parseUIStreamLine(line);
|
|
1574
|
+
if (!parsed) continue;
|
|
1575
|
+
if (parsed.type === "text-delta") newTextParts.push(parsed.delta);
|
|
1576
|
+
else if (isPointToolCall(parsed)) {
|
|
1577
|
+
if (!this.pointToolCall) this.pointToolCall = parsed.input;
|
|
1578
|
+
}
|
|
1579
|
+
}
|
|
1580
|
+
if (newTextParts.length > 0) this.rawText += newTextParts.join("");
|
|
1581
|
+
const { consumedLength, segments } = extractCompletedSegments(this.rawText.slice(this.consumedTextLength));
|
|
1582
|
+
this.consumedTextLength += consumedLength;
|
|
1667
1583
|
return {
|
|
1668
|
-
visibleText,
|
|
1669
|
-
speechSegments: this.coalesceSegments(segments)
|
|
1584
|
+
visibleText: this.rawText,
|
|
1585
|
+
speechSegments: this.coalesceSegments(segments),
|
|
1586
|
+
pointToolCall: this.pointToolCall
|
|
1670
1587
|
};
|
|
1671
1588
|
}
|
|
1589
|
+
/**
|
|
1590
|
+
* Finalize processing and return any remaining text/tool call.
|
|
1591
|
+
*/
|
|
1672
1592
|
finish() {
|
|
1673
|
-
|
|
1674
|
-
|
|
1593
|
+
if (this.buffer) {
|
|
1594
|
+
const parsed = parseUIStreamLine(this.buffer);
|
|
1595
|
+
if (parsed?.type === "text-delta") this.rawText += parsed.delta;
|
|
1596
|
+
else if (parsed && isPointToolCall(parsed) && !this.pointToolCall) this.pointToolCall = parsed.input;
|
|
1597
|
+
this.buffer = "";
|
|
1598
|
+
}
|
|
1599
|
+
const trailingText = this.rawText.slice(this.consumedTextLength).trim();
|
|
1675
1600
|
const finalSegmentParts = [this.pendingShortSegment, trailingText].filter(Boolean);
|
|
1676
1601
|
this.pendingShortSegment = "";
|
|
1677
1602
|
return {
|
|
1678
|
-
|
|
1679
|
-
|
|
1680
|
-
|
|
1603
|
+
finalResponseText: this.rawText.trim(),
|
|
1604
|
+
speechSegments: finalSegmentParts.length ? [finalSegmentParts.join(" ").trim()] : [],
|
|
1605
|
+
pointToolCall: this.pointToolCall
|
|
1681
1606
|
};
|
|
1682
1607
|
}
|
|
1683
1608
|
coalesceSegments(segments) {
|
|
@@ -1698,9 +1623,6 @@ var ProgressiveResponseProcessor = class {
|
|
|
1698
1623
|
};
|
|
1699
1624
|
//#endregion
|
|
1700
1625
|
//#region src/core/client.ts
|
|
1701
|
-
function clamp(value, min, max) {
|
|
1702
|
-
return Math.min(Math.max(value, min), max);
|
|
1703
|
-
}
|
|
1704
1626
|
async function readErrorMessage(response, fallbackMessage) {
|
|
1705
1627
|
try {
|
|
1706
1628
|
if ((response.headers.get("Content-Type") ?? "").includes("application/json")) {
|
|
@@ -1713,21 +1635,6 @@ async function readErrorMessage(response, fallbackMessage) {
|
|
|
1713
1635
|
return fallbackMessage;
|
|
1714
1636
|
}
|
|
1715
1637
|
/**
|
|
1716
|
-
* Map coordinate-based pointing from screenshot space to viewport space.
|
|
1717
|
-
*/
|
|
1718
|
-
function mapCoordinatesToViewport(x, y, screenshot) {
|
|
1719
|
-
if (screenshot.width <= 0 || screenshot.height <= 0) return {
|
|
1720
|
-
x,
|
|
1721
|
-
y
|
|
1722
|
-
};
|
|
1723
|
-
const scaleX = screenshot.viewportWidth / screenshot.width;
|
|
1724
|
-
const scaleY = screenshot.viewportHeight / screenshot.height;
|
|
1725
|
-
return {
|
|
1726
|
-
x: clamp(Math.round(x * scaleX), 0, Math.max(screenshot.viewportWidth - 1, 0)),
|
|
1727
|
-
y: clamp(Math.round(y * scaleY), 0, Math.max(screenshot.viewportHeight - 1, 0))
|
|
1728
|
-
};
|
|
1729
|
-
}
|
|
1730
|
-
/**
|
|
1731
1638
|
* Framework-agnostic client for cursor buddy voice interactions.
|
|
1732
1639
|
*
|
|
1733
1640
|
* Manages the complete voice interaction flow:
|
|
@@ -1796,7 +1703,7 @@ var CursorBuddyClient = class {
|
|
|
1796
1703
|
this.notify();
|
|
1797
1704
|
this.abortController = new AbortController();
|
|
1798
1705
|
const signal = this.abortController.signal;
|
|
1799
|
-
this.screenshotPromise = this.screenCapture.
|
|
1706
|
+
this.screenshotPromise = this.screenCapture.capture();
|
|
1800
1707
|
this.beginListeningSession(signal).catch((error) => {
|
|
1801
1708
|
if (signal.aborted) return;
|
|
1802
1709
|
this.voiceCapture.dispose();
|
|
@@ -1839,7 +1746,7 @@ var CursorBuddyClient = class {
|
|
|
1839
1746
|
this.options.onTranscript?.(transcript);
|
|
1840
1747
|
this.notify();
|
|
1841
1748
|
this.prepareSpeechMode();
|
|
1842
|
-
const { cleanResponse,
|
|
1749
|
+
const { cleanResponse, pointToolCall, playbackQueue } = await this.chatAndSpeak(transcript, screenshot, signal, {
|
|
1843
1750
|
onFailure: failTurn,
|
|
1844
1751
|
onPlaybackStart: () => {
|
|
1845
1752
|
this.stateMachine.transition({ type: "RESPONSE_STARTED" });
|
|
@@ -1847,19 +1754,19 @@ var CursorBuddyClient = class {
|
|
|
1847
1754
|
});
|
|
1848
1755
|
if (turnFailure) throw turnFailure;
|
|
1849
1756
|
if (signal?.aborted) return;
|
|
1850
|
-
const parsed = parsePointingTagRaw(fullResponse);
|
|
1851
1757
|
this.options.onResponse?.(cleanResponse);
|
|
1852
1758
|
let pointTarget = null;
|
|
1853
|
-
if (
|
|
1854
|
-
const
|
|
1855
|
-
if (
|
|
1856
|
-
|
|
1857
|
-
|
|
1858
|
-
|
|
1859
|
-
|
|
1860
|
-
|
|
1861
|
-
|
|
1862
|
-
|
|
1759
|
+
if (pointToolCall) {
|
|
1760
|
+
const element = screenshot.elementRegistry.get(pointToolCall.elementId);
|
|
1761
|
+
if (element) {
|
|
1762
|
+
const rect = element.getBoundingClientRect();
|
|
1763
|
+
pointTarget = {
|
|
1764
|
+
x: Math.round(rect.left + rect.width / 2),
|
|
1765
|
+
y: Math.round(rect.top + rect.height / 2),
|
|
1766
|
+
label: pointToolCall.label
|
|
1767
|
+
};
|
|
1768
|
+
}
|
|
1769
|
+
}
|
|
1863
1770
|
if (pointTarget) {
|
|
1864
1771
|
this.options.onPoint?.(pointTarget);
|
|
1865
1772
|
this.pointerController.pointAt(pointTarget);
|
|
@@ -2025,7 +1932,7 @@ var CursorBuddyClient = class {
|
|
|
2025
1932
|
},
|
|
2026
1933
|
transcript,
|
|
2027
1934
|
history,
|
|
2028
|
-
|
|
1935
|
+
domSnapshot: screenshot.domSnapshot
|
|
2029
1936
|
}),
|
|
2030
1937
|
signal
|
|
2031
1938
|
});
|
|
@@ -2061,7 +1968,7 @@ var CursorBuddyClient = class {
|
|
|
2061
1968
|
this.updateResponse(finalizedResponse.finalResponseText);
|
|
2062
1969
|
return {
|
|
2063
1970
|
cleanResponse: finalizedResponse.finalResponseText,
|
|
2064
|
-
|
|
1971
|
+
pointToolCall: finalizedResponse.pointToolCall,
|
|
2065
1972
|
playbackQueue
|
|
2066
1973
|
};
|
|
2067
1974
|
}
|
|
@@ -2259,4 +2166,4 @@ var CursorBuddyClient = class {
|
|
|
2259
2166
|
//#endregion
|
|
2260
2167
|
export { $buddyScale as a, $buddyRotation as i, $audioLevel as n, $cursorPosition as o, $buddyPosition as r, $pointingTarget as s, CursorBuddyClient as t };
|
|
2261
2168
|
|
|
2262
|
-
//# sourceMappingURL=client-
|
|
2169
|
+
//# sourceMappingURL=client-CliXcNch.mjs.map
|