loom-code 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- loom_code/__init__.py +22 -0
- loom_code/_post_commit.py +119 -0
- loom_code/agent.py +544 -0
- loom_code/approval.py +616 -0
- loom_code/browse/__init__.py +291 -0
- loom_code/browse/act.py +467 -0
- loom_code/browse/observe.py +249 -0
- loom_code/browse/session.py +96 -0
- loom_code/browse/verify.py +194 -0
- loom_code/checkpoint.py +283 -0
- loom_code/cli.py +495 -0
- loom_code/code_index.py +703 -0
- loom_code/compact.py +143 -0
- loom_code/consent.py +47 -0
- loom_code/credentials.py +527 -0
- loom_code/edit_tool.py +635 -0
- loom_code/extensions.py +522 -0
- loom_code/file_history.py +322 -0
- loom_code/file_tools.py +93 -0
- loom_code/git_hook.py +200 -0
- loom_code/grep_tool.py +430 -0
- loom_code/hooks.py +297 -0
- loom_code/loominit/__init__.py +23 -0
- loom_code/loominit/_ast_walk.py +429 -0
- loom_code/loominit/_files.py +284 -0
- loom_code/loominit/_graph.py +141 -0
- loom_code/loominit/_resolve.py +392 -0
- loom_code/loominit/_tests_map.py +108 -0
- loom_code/loominit/extractor.py +332 -0
- loom_code/loominit/repomap.py +225 -0
- loom_code/loominit/schema.py +242 -0
- loom_code/lsp_tools.py +396 -0
- loom_code/mcp_host.py +79 -0
- loom_code/operator.py +449 -0
- loom_code/paste.py +97 -0
- loom_code/paths.py +52 -0
- loom_code/permissions.py +177 -0
- loom_code/project.py +104 -0
- loom_code/prompts.py +451 -0
- loom_code/render.py +783 -0
- loom_code/repl.py +4080 -0
- loom_code/rules.py +267 -0
- loom_code/sandboxed_bash.py +176 -0
- loom_code/scribe.py +88 -0
- loom_code/skills/__init__.py +16 -0
- loom_code/skills/graphify/SKILL.md +97 -0
- loom_code/skills/graphify/tools.py +570 -0
- loom_code/trust.py +216 -0
- loom_code/turn.py +169 -0
- loom_code/web_fetch.py +370 -0
- loom_code/workers.py +758 -0
- loom_code/worktree.py +134 -0
- loom_code-0.1.1.dist-info/METADATA +224 -0
- loom_code-0.1.1.dist-info/RECORD +58 -0
- loom_code-0.1.1.dist-info/WHEEL +5 -0
- loom_code-0.1.1.dist-info/entry_points.txt +2 -0
- loom_code-0.1.1.dist-info/licenses/LICENSE +21 -0
- loom_code-0.1.1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,249 @@
|
|
|
1
|
+
"""Observe a page: find interactive elements, tag them with stable ids,
|
|
2
|
+
serialize a compact view for the LLM.
|
|
3
|
+
|
|
4
|
+
The JS below runs in the page and:
|
|
5
|
+
1. Finds interactive elements — native controls (a/button/input/
|
|
6
|
+
select/textarea/summary), ARIA roles (button/link/checkbox/combobox/
|
|
7
|
+
menuitem/tab/option/switch/radio/textbox/searchbox/slider/spinbutton), contenteditable, and pointer-cursor elements with a click handler. Plus a
|
|
8
|
+
visibility + size filter so we don't list hidden/zero-area noise.
|
|
9
|
+
2. Tags each with ``data-loom-id="N"`` (stable handle for acting).
|
|
10
|
+
3. Returns ``[{id, tag, role, label, value, kind}]``.
|
|
11
|
+
|
|
12
|
+
The Python side turns that into lines the model reads:
|
|
13
|
+
[12] input "Where from?" (value="Kathmandu")
|
|
14
|
+
[13] button "Search"
|
|
15
|
+
``kind`` ("input"|"button"|"link"|"select") helps the model pick the
|
|
16
|
+
right action.
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
from typing import Any
|
|
22
|
+
|
|
23
|
+
# JS: tag + collect interactive elements. Returns a JSON-able list.
|
|
24
|
+
# Kept as one expression-returning function so page.evaluate gets the
|
|
25
|
+
# array back directly.
|
|
26
|
+
_COLLECT_JS = r"""
() => {
  const INTERACTIVE_ROLES = new Set([
    "button","link","checkbox","combobox","menuitem","tab","option",
    "switch","radio","textbox","searchbox","slider","spinbutton",
  ]);
  const NATIVE = new Set(["A","BUTTON","INPUT","SELECT","TEXTAREA","SUMMARY"]);
  const out = [];
  let id = 0;

  // Ids are renumbered from 0 on every pass. Strip the tags left by any
  // earlier pass first: an element that was tagged before but is skipped
  // now (hidden, no longer interactive) would otherwise keep an old id
  // that collides with a freshly assigned one, so [data-loom-id="N"]
  // would match two elements.
  for (const stale of document.querySelectorAll("[data-loom-id]")) {
    stale.removeAttribute("data-loom-id");
  }

  const isVisible = (el) => {
    const r = el.getBoundingClientRect();
    if (r.width < 4 || r.height < 4) return false;
    const s = window.getComputedStyle(el);
    if (s.display === "none" || s.visibility === "hidden" || s.opacity === "0")
      return false;
    return true;
  };

  // Resolve an element's human label as robustly as possible — this is
  // what lets the model understand "which field is which". Order matters:
  // explicit accessible name first, then associated <label>, then nearby
  // text, then attributes.
  const clean = (s) => (s || "").replace(/\s+/g, " ").trim().slice(0, 90);
  const labelFor = (el) => {
    // 1. Explicit accessible name.
    const aria = el.getAttribute("aria-label");
    if (aria) return clean(aria);
    // 2. aria-labelledby → referenced element(s) text.
    const lb = el.getAttribute("aria-labelledby");
    if (lb) {
      const txt = lb.split(/\s+/)
        .map((refId) => document.getElementById(refId))
        .filter(Boolean)
        .map((n) => n.innerText || n.textContent || "")
        .join(" ");
      if (clean(txt)) return clean(txt);
    }
    // 3. <label for=id> or wrapping <label>.
    if (el.id) {
      const lab = document.querySelector(`label[for="${CSS.escape(el.id)}"]`);
      if (lab && clean(lab.innerText)) return clean(lab.innerText);
    }
    const wrapLab = el.closest("label");
    if (wrapLab && clean(wrapLab.innerText)) return clean(wrapLab.innerText);
    // 4. placeholder / title.
    const ph = el.getAttribute("placeholder");
    if (ph) return clean(ph);
    const title = el.getAttribute("title");
    if (title) return clean(title);
    // 5. The element's own visible text (good for buttons/links).
    const own = clean(el.innerText || el.textContent || "");
    if (own) return own;
    // 6. For inputs with no label yet: the nearest preceding text node /
    //    sibling label-ish element (handles "Where from?" rendered as a
    //    separate span above the input).
    let p = el.previousElementSibling;
    for (let i = 0; i < 3 && p; i++) {
      const t = clean(p.innerText || p.textContent || "");
      if (t && t.length < 40) return t;
      p = p.previousElementSibling;
    }
    // 7. Last resort: name / id attribute.
    const name = el.getAttribute("name") || el.getAttribute("id") || "";
    return clean(name);
  };

  const kindFor = (el, role) => {
    const t = el.tagName;
    if (t === "INPUT" || t === "TEXTAREA" || role === "textbox" ||
        role === "searchbox" || role === "combobox" ||
        el.isContentEditable) return "input";
    if (t === "SELECT") return "select";
    if (t === "A" || role === "link") return "link";
    return "button";
  };

  const isInteractive = (el) => {
    if (NATIVE.has(el.tagName)) return true;
    const role = el.getAttribute("role");
    if (role && INTERACTIVE_ROLES.has(role)) return true;
    if (el.isContentEditable) return true;
    // Cursor:pointer is a strong custom-button signal.
    if (window.getComputedStyle(el).cursor === "pointer" &&
        (el.onclick || el.getAttribute("jsaction"))) return true;
    return false;
  };

  const all = document.querySelectorAll("*");
  for (const el of all) {
    try {
      if (!isInteractive(el)) continue;
      if (!isVisible(el)) continue;
      const role = el.getAttribute("role") || "";
      el.setAttribute("data-loom-id", String(id));
      out.push({
        id,
        tag: el.tagName.toLowerCase(),
        role,
        kind: kindFor(el, role),
        label: labelFor(el),
        value: (el.value != null ? String(el.value) : "").slice(0, 60),
      });
      id++;
    } catch (e) { /* skip problematic node */ }
  }
  return out;
}
"""
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
# JS: extract the page's VISIBLE TEXT content — what a human reads
|
|
138
|
+
# (prices, listings, results, headings). Unlike observe (which lists
|
|
139
|
+
# CLICKABLE elements), this is for READING outcomes. Walks visible text
|
|
140
|
+
# nodes, collapses whitespace, dedupes, and keeps meaningful lines.
|
|
141
|
+
_READ_TEXT_JS = r"""
() => {
  const isVisible = (el) => {
    if (!el) return false;
    const s = window.getComputedStyle(el);
    if (s.display === "none" || s.visibility === "hidden" ||
        s.opacity === "0") return false;
    const r = el.getBoundingClientRect();
    return r.width > 1 && r.height > 1;
  };
  // Skip text that lives inside non-content containers. closest() is used
  // instead of a tagName set because SVG elements keep a lowercase tagName
  // ("svg", "text", "title") inside HTML documents, and their text sits
  // under nested children rather than directly under <svg>.
  const SKIP_SELECTOR = "script,style,noscript,svg";
  const MAX_LINES = 400;  // safety cap
  const out = [];
  const seen = new Set();
  const walker = document.createTreeWalker(
    document.body, NodeFilter.SHOW_TEXT, null);
  let n;
  while ((n = walker.nextNode())) {
    const t = (n.nodeValue || "").replace(/\s+/g, " ").trim();
    if (!t || t.length < 2) continue;
    const p = n.parentElement;
    if (!p || p.closest(SKIP_SELECTOR)) continue;
    if (!isVisible(p)) continue;
    if (seen.has(t)) continue;
    seen.add(t);
    out.push(t);
    if (out.length >= MAX_LINES) break;
  }
  return out;
}
"""
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
async def read_text(page: Any) -> str:
    """Read the page's visible text and return it under a PAGE/URL header.

    Runs ``_READ_TEXT_JS`` in the page and joins the returned lines with
    newlines. The body is cut at 8000 characters (with a trailing
    truncation notice) so a very large page stays affordable. If the
    in-page script fails, a parenthesised error string is returned
    instead of raising.
    """
    try:
        chunks: list[str] = await page.evaluate(_READ_TEXT_JS)
    except Exception as exc:  # noqa: BLE001
        return f"(could not read page text: {exc})"

    # Title/URL are decoration only; keep whatever was fetched before a
    # failure and fall back to empty strings for the rest.
    page_title, page_url = "", ""
    try:
        page_title = await page.title()
        page_url = page.url
    except Exception:  # noqa: BLE001
        pass

    char_limit = 8000
    text = "\n".join(chunks)
    if len(text) > char_limit:
        # Hard character cap so a huge results page can't blow the budget.
        text = text[:char_limit] + "\n… (truncated — ask to read more / scroll)"
    return "\n".join((f"PAGE: {page_title}", f"URL: {page_url}", "", text))
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
async def observe(page: Any) -> tuple[list[dict[str, Any]], str]:
    """Tag and collect interactive elements; return ``(elements, rendered)``.

    ``elements`` is the full structured list produced by ``_COLLECT_JS``
    (every element, with its id). ``rendered`` is the compact text view
    for the LLM: a PAGE/URL header followed by at most 60 prioritised
    element lines. If the in-page script fails, an empty list and a
    parenthesised error string are returned instead of raising.
    """
    try:
        elements: list[dict[str, Any]] = await page.evaluate(_COLLECT_JS)
    except Exception as exc:  # noqa: BLE001 — never break the run
        return [], f"(could not read the page: {exc})"

    page_title, page_url = "", ""
    try:
        page_title = await page.title()
        page_url = page.url
    except Exception:  # noqa: BLE001
        pass

    rendered: list[str] = [f"PAGE: {page_title}", f"URL: {page_url}", ""]
    if not elements:
        rendered.append(
            "(no interactive elements found — the page may still "
            "be loading; try page_observe again, or page_check to "
            "see it visually.)"
        )
        return elements, "\n".join(rendered)

    # COST CONTROL: a page can carry hundreds of interactive elements and
    # re-sending all of them every turn is the token blowup. Rank by kind
    # (inputs first, links last), labelled before unlabelled, and cap what
    # the LLM reads. The full ``elements`` list is still returned.
    kind_rank = {"input": 0, "select": 1, "button": 2, "link": 3}
    max_rendered = 60

    def _rank(item: dict[str, Any]) -> tuple[int, int]:
        unlabelled = 0 if (item.get("label") or "").strip() else 1
        return (kind_rank.get(item.get("kind", "link"), 9), unlabelled)

    def _row(item: dict[str, Any]) -> str:
        label = item.get("label") or "(no label)"
        value = item.get("value") or ""
        suffix = f' (value="{value}")' if value else ""
        return f'[{item["id"]}] {item["kind"]} "{label}"{suffix}'

    # Pick the top-ranked elements, then restore id order so the list
    # reads naturally.
    visible = sorted(
        sorted(elements, key=_rank)[:max_rendered],
        key=lambda item: item["id"],
    )
    rendered.extend(_row(item) for item in visible)

    omitted = len(elements) - len(visible)
    if omitted > 0:
        rendered.append(
            f"… (+{omitted} more elements not shown — "
            "mostly nav/footer. If you need one that isn't listed, say which "
            "and I'll surface it.)"
        )
    return elements, "\n".join(rendered)
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
"""A single headed browser session for loom-code's ``/computer`` mode.
|
|
2
|
+
|
|
3
|
+
One ``BrowserSession`` owns one visible Chromium window for the life of a
|
|
4
|
+
REPL session. The agent's page_* tools all act through it. It is created
|
|
5
|
+
lazily on the first navigation and torn down on mode-off / exit.
|
|
6
|
+
|
|
7
|
+
Reliability design (learned from our Google-Flights failures + how
|
|
8
|
+
browser-use stays robust):
|
|
9
|
+
|
|
10
|
+
* **Stable element handles.** Instead of ephemeral snapshot refs (which
|
|
11
|
+
die the instant the DOM changes — the ``Ref e145 not found`` error),
|
|
12
|
+
every interactive element is tagged in the live DOM with a
|
|
13
|
+
``data-loom-id="N"`` attribute. Acting later re-selects by that
|
|
14
|
+
attribute (``[data-loom-id="N"]``), so Playwright re-resolves the
|
|
15
|
+
element FRESH on every action — no stale handles. The id rides on the
|
|
16
|
+
element through re-renders unless the element itself is recreated.
|
|
17
|
+
|
|
18
|
+
* **One persistent page.** The session keeps the active page so observe →
|
|
19
|
+
act → observe operates on the same evolving page, like a human.
|
|
20
|
+
|
|
21
|
+
This module is intentionally dependency-light: Playwright only (already
|
|
22
|
+
installed for the project). No browser-use, no extra packages.
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
from __future__ import annotations
|
|
26
|
+
|
|
27
|
+
from typing import Any
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class BrowserSession:
    """Lazily-launched Chromium browser plus the single active page.

    The browser is headed by default so the user can watch it work.
    Single-page for now (the active tab); multi-tab can come later.
    """

    def __init__(self, *, headless: bool = False) -> None:
        """Record the launch mode; nothing is started until ``start()``."""
        self._headless = headless
        self._pw: Any = None
        self._browser: Any = None
        self._context: Any = None
        self._page: Any = None
        # Bookkeeping slot for the highest element id assigned in the
        # latest observe pass. The live truth is the ``data-loom-id``
        # attributes in the DOM; this class only ever resets it to 0.
        self._max_id: int = 0

    async def start(self) -> None:
        """Launch the browser if not already running. Idempotent.

        If any launch step fails, everything started so far is torn down
        before the error propagates, so a later ``start()`` retries from a
        clean slate instead of leaking a half-started driver/browser.
        """
        if self._page is not None:
            return
        from playwright.async_api import async_playwright

        try:
            self._pw = await async_playwright().start()
            # Headed = the user watches it work (the whole point of
            # /computer).
            self._browser = await self._pw.chromium.launch(
                headless=self._headless,
                args=["--start-maximized"],
            )
            # A real-ish context: viewport=None lets the window size drive
            # it (with --start-maximized), and a normal UA reduces trivial
            # bot-walls.
            self._context = await self._browser.new_context(
                viewport=None,
                user_agent=(
                    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                    "AppleWebKit/537.36 (KHTML, like Gecko) "
                    "Chrome/124.0.0.0 Safari/537.36"
                ),
            )
            self._page = await self._context.new_page()
        except BaseException:
            # ``_page`` is still None here, so without this cleanup the
            # next start() would launch a second driver + browser and
            # orphan the first.
            await self.close()
            raise

    @property
    def page(self) -> Any:
        """The active page.

        Raises:
            RuntimeError: if ``start()`` has not completed yet.
        """
        if self._page is None:
            raise RuntimeError("browser not started — call start() first")
        return self._page

    async def goto(self, url: str) -> None:
        """Navigate the active page to ``url``, starting the browser if needed.

        Surrounding whitespace is stripped, and ``https://`` is prepended
        unless the URL already carries an http(s) scheme (matched
        case-insensitively). Waits for ``domcontentloaded`` with a 45 s
        timeout; navigation errors propagate to the caller.
        """
        await self.start()
        url = url.strip()
        if not url.lower().startswith(("http://", "https://")):
            url = "https://" + url
        await self._page.goto(
            url, wait_until="domcontentloaded", timeout=45000
        )

    async def close(self) -> None:
        """Tear everything down. Safe to call multiple times."""
        # Innermost first: context, then browser, then the driver itself.
        # getattr on a None slot yields None, so unstarted parts are skipped.
        for closer in (
            getattr(self._context, "close", None),
            getattr(self._browser, "close", None),
            getattr(self._pw, "stop", None),
        ):
            if closer is not None:
                try:
                    await closer()
                except Exception:  # noqa: BLE001 — best-effort teardown
                    pass
        self._pw = self._browser = self._context = self._page = None
        self._max_id = 0
|
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
"""Verify page state by reading the live DOM.
|
|
2
|
+
|
|
3
|
+
``verify`` does not use vision input (``look`` does), so instead of a
|
|
4
|
+
screenshot+vision check we read the ACTUAL values the page holds — the
|
|
5
|
+
text content of the inputs/fields visible on screen — and return them so
|
|
6
|
+
the agent can confirm "did Delhi land in the origin field?". This catches
|
|
7
|
+
the wrong-field / didn't-stick failures without needing vision: the
|
|
8
|
+
field's real ``value`` is ground truth.
|
|
9
|
+
|
|
10
|
+
It also saves a screenshot to disk so the user (and a future vision
|
|
11
|
+
upgrade) can inspect what the page looked like.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import base64
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
from typing import Any
|
|
19
|
+
|
|
20
|
+
# Set-of-Marks: draw a numbered box on each interactive element (using
# the data-loom-id tags observe() already placed) so a screenshot the
# model SEES is annotated with the SAME [ids] it acts on. The overlay is
# a single fixed-position, click-through layer with id ``__loom_marks__``
# appended to <body>; any previous layer is replaced first. Elements
# smaller than 4px or vertically outside the viewport are not marked.
# The function returns nothing — _REMOVE_MARKS_JS deletes the layer by
# its id after the shot.
_DRAW_MARKS_JS = r"""
() => {
  const old = document.getElementById('__loom_marks__');
  if (old) old.remove();
  const layer = document.createElement('div');
  layer.id = '__loom_marks__';
  layer.style.cssText =
    'position:fixed;inset:0;z-index:2147483647;pointer-events:none;';
  const COLORS = ['#e6194B','#3cb44b','#4363d8','#f58231','#911eb4',
                  '#469990','#9A6324','#800000','#808000','#000075'];
  let n = 0;
  for (const el of document.querySelectorAll('[data-loom-id]')) {
    const r = el.getBoundingClientRect();
    if (r.width < 4 || r.height < 4) continue;
    if (r.bottom < 0 || r.top > window.innerHeight) continue; // off-screen
    const id = el.getAttribute('data-loom-id');
    const c = COLORS[n % COLORS.length]; n++;
    const box = document.createElement('div');
    box.style.cssText =
      `position:fixed;left:${r.left}px;top:${r.top}px;width:${r.width}px;`+
      `height:${r.height}px;border:2px solid ${c};box-sizing:border-box;`;
    const tag = document.createElement('div');
    tag.textContent = id;
    tag.style.cssText =
      `position:fixed;left:${r.left}px;top:${Math.max(0,r.top-14)}px;`+
      `background:${c};color:#fff;font:bold 11px monospace;padding:0 3px;`+
      `line-height:14px;`;
    layer.appendChild(box); layer.appendChild(tag);
  }
  document.body.appendChild(layer);
}
"""
|
|
56
|
+
|
|
57
|
+
# JS: remove the Set-of-Marks overlay layer (the element with id
# ``__loom_marks__``) if it is present; does nothing otherwise.
_REMOVE_MARKS_JS = (
    "() => { const m = document.getElementById('__loom_marks__'); "
    "if (m) m.remove(); }"
)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
async def screenshot_b64(page: Any, marks: bool = True) -> str | None:
    """Capture the current viewport as a base64-encoded PNG string.

    When ``marks`` is true (the default) the numbered Set-of-Marks
    overlay is drawn before the capture and removed afterwards, so the
    image shows the same [ids] the model acts on. Any failure while
    drawing or capturing yields ``None`` rather than an exception.
    """
    encoded: str | None = None
    overlay_drawn = False
    try:
        if marks:
            await page.evaluate(_DRAW_MARKS_JS)
            overlay_drawn = True
        raw_png = await page.screenshot(type="png")
        encoded = base64.b64encode(raw_png).decode("ascii")
    except Exception:  # noqa: BLE001
        encoded = None
    finally:
        # Only clean up an overlay that was actually drawn; removal is
        # best-effort and must not mask the capture result.
        if overlay_drawn:
            try:
                await page.evaluate(_REMOVE_MARKS_JS)
            except Exception:  # noqa: BLE001
                pass
    return encoded
|
|
82
|
+
|
|
83
|
+
# Snapshot the current value of every field on the page that is at least
# 4x4 px (inputs, textareas, selects, comboboxes, contenteditables) plus
# the page title and URL, so a "what's currently entered" question is
# answerable from the DOM. Returns ``{title, url, fields: [{label, value}]}``.
_STATE_JS = r"""
() => {
  const clean = (s) => (s || "").replace(/\s+/g, " ").trim();
  const liveValue = (el) => {
    if (typeof el.value === "string") {
      // Form control: the ``value`` property is the live state. Never
      // fall back to the ``value`` attribute here — that is the page's
      // initial default and would report a cleared field as still filled.
      if (el.type === "checkbox" || el.type === "radio") {
        // ``value`` is the same whether or not the box is ticked, so the
        // checked flag is what actually answers "did it stick?".
        return `${el.value} (${el.checked ? "checked" : "unchecked"})`;
      }
      return el.value;
    }
    // Non-control widgets (role=combobox divs, contenteditable).
    return el.getAttribute("value") || el.innerText || "";
  };
  const fields = [];
  for (const el of document.querySelectorAll(
      "input, textarea, select, [role=combobox], [contenteditable]")) {
    const r = el.getBoundingClientRect();
    if (r.width < 4 || r.height < 4) continue;
    const label = clean(
      el.getAttribute("aria-label") ||
      el.getAttribute("placeholder") ||
      el.getAttribute("name") || "");
    const val = clean(liveValue(el));
    if (label || val) fields.push({ label, value: val.slice(0, 60) });
  }
  return { title: document.title, url: location.href, fields };
}
"""
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
async def verify(page: Any, question: str, model: str | None = None) -> str:
    """Report the page's title, URL and current field values as text.

    The values come from running ``_STATE_JS`` in the live page, so the
    agent can answer ``question`` from real DOM state. A screenshot is
    also written, best-effort, to ``~/.loom-code/last_page.png``.

    ``model`` is accepted so the signature matches ``look`` but is not
    used here. If the in-page script fails, a short error string is
    returned instead of raising.
    """
    try:
        snapshot = await page.evaluate(_STATE_JS)
    except Exception as exc:  # noqa: BLE001
        return f"could not read page state ({exc}); try page_observe."

    # Keep a copy of what the page looked like on disk; failure to save
    # it must never affect the textual report.
    try:
        target = Path.home().joinpath(".loom-code", "last_page.png")
        target.parent.mkdir(parents=True, exist_ok=True)
        await page.screenshot(path=str(target), type="png")
    except Exception:  # noqa: BLE001
        pass

    report = [
        f"Checking: {question}",
        f"PAGE: {snapshot.get('title', '')}",
        f"URL: {snapshot.get('url', '')}",
        "Current field values:",
    ]
    rows = snapshot.get("fields") or []
    if not rows:
        report.append(" (no field values found)")
    report.extend(
        ' {}: "{}"'.format(
            row.get("label") or "(unlabelled)", row.get("value") or ""
        )
        for row in rows
    )
    report.append(
        "\n→ Compare the values above to what you intended. If a field has "
        "the WRONG value (e.g. origin shows your location, not what you "
        "typed), fix it: re-observe, clear that field, type again, and for "
        "autocompletes CLICK the matching suggestion."
    )
    return "\n".join(report)
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
async def look(page: Any, question: str, model: str | None = None) -> str:
    """VISION: ask the multimodal ``model`` a question about a screenshot.

    Captures the page with the Set-of-Marks [id] overlay and sends it,
    with ``question``, to a ``loomflow`` ``Agent`` via
    ``metadata['_loom_images']``. Falls back to the DOM-based ``verify``
    report when the screenshot fails, no ``model`` is given, ``loomflow``
    cannot be imported, or the model call raises.
    """

    async def _dom_values() -> str:
        # Shared fallback: the textual field-value report.
        return await verify(page, question, model)

    shot = await screenshot_b64(page, marks=True)
    if shot is None:
        return await _dom_values()
    if not model:
        return (
            "no model available for page_look — using DOM values:\n\n"
            + await _dom_values()
        )
    try:
        from loomflow import Agent
    except Exception:  # noqa: BLE001
        return await _dom_values()

    instructions = (
        "You are looking at a screenshot of a web page. "
        "Numbered colored boxes mark interactive elements "
        "(the number is the element id you can act on). "
        "Answer concisely and concretely. "
        "If asked for prices/results, read the actual values you see "
        "and list them.\n\n"
        f"Question: {question}"
    )
    attachment = {"data": shot, "media_type": "image/png"}
    try:
        reply = await Agent(instructions, model=model).run(
            "(see attached screenshot)",
            metadata={"_loom_images": [attachment]},
        )
        answer = getattr(reply, "output", None) or str(reply)
        return str(answer).strip() or "(model returned nothing)"
    except Exception as exc:  # noqa: BLE001
        # Vision failed (model not multimodal / API issue) → DOM fallback.
        return (
            f"page_look vision unavailable ({type(exc).__name__}); "
            "using DOM values instead:\n\n"
            + await _dom_values()
        )
)
|