loom-code 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. loom_code/__init__.py +22 -0
  2. loom_code/_post_commit.py +119 -0
  3. loom_code/agent.py +544 -0
  4. loom_code/approval.py +616 -0
  5. loom_code/browse/__init__.py +291 -0
  6. loom_code/browse/act.py +467 -0
  7. loom_code/browse/observe.py +249 -0
  8. loom_code/browse/session.py +96 -0
  9. loom_code/browse/verify.py +194 -0
  10. loom_code/checkpoint.py +283 -0
  11. loom_code/cli.py +495 -0
  12. loom_code/code_index.py +703 -0
  13. loom_code/compact.py +143 -0
  14. loom_code/consent.py +47 -0
  15. loom_code/credentials.py +527 -0
  16. loom_code/edit_tool.py +635 -0
  17. loom_code/extensions.py +522 -0
  18. loom_code/file_history.py +322 -0
  19. loom_code/file_tools.py +93 -0
  20. loom_code/git_hook.py +200 -0
  21. loom_code/grep_tool.py +430 -0
  22. loom_code/hooks.py +297 -0
  23. loom_code/loominit/__init__.py +23 -0
  24. loom_code/loominit/_ast_walk.py +429 -0
  25. loom_code/loominit/_files.py +284 -0
  26. loom_code/loominit/_graph.py +141 -0
  27. loom_code/loominit/_resolve.py +392 -0
  28. loom_code/loominit/_tests_map.py +108 -0
  29. loom_code/loominit/extractor.py +332 -0
  30. loom_code/loominit/repomap.py +225 -0
  31. loom_code/loominit/schema.py +242 -0
  32. loom_code/lsp_tools.py +396 -0
  33. loom_code/mcp_host.py +79 -0
  34. loom_code/operator.py +449 -0
  35. loom_code/paste.py +97 -0
  36. loom_code/paths.py +52 -0
  37. loom_code/permissions.py +177 -0
  38. loom_code/project.py +104 -0
  39. loom_code/prompts.py +451 -0
  40. loom_code/render.py +783 -0
  41. loom_code/repl.py +4080 -0
  42. loom_code/rules.py +267 -0
  43. loom_code/sandboxed_bash.py +176 -0
  44. loom_code/scribe.py +88 -0
  45. loom_code/skills/__init__.py +16 -0
  46. loom_code/skills/graphify/SKILL.md +97 -0
  47. loom_code/skills/graphify/tools.py +570 -0
  48. loom_code/trust.py +216 -0
  49. loom_code/turn.py +169 -0
  50. loom_code/web_fetch.py +370 -0
  51. loom_code/workers.py +758 -0
  52. loom_code/worktree.py +134 -0
  53. loom_code-0.1.1.dist-info/METADATA +224 -0
  54. loom_code-0.1.1.dist-info/RECORD +58 -0
  55. loom_code-0.1.1.dist-info/WHEEL +5 -0
  56. loom_code-0.1.1.dist-info/entry_points.txt +2 -0
  57. loom_code-0.1.1.dist-info/licenses/LICENSE +21 -0
  58. loom_code-0.1.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,249 @@
1
+ """Observe a page: find interactive elements, tag them with stable ids,
2
+ serialize a compact view for the LLM.
3
+
4
+ The JS below runs in the page and:
5
+ 1. Finds interactive elements — native controls (a/button/input/
6
+ select/textarea/summary), ARIA roles (button/link/checkbox/combobox/
7
+ menuitem/tab/option/switch/radio), and contenteditable. Plus a
8
+ visibility + size filter so we don't list hidden/zero-area noise.
9
+ 2. Tags each with ``data-loom-id="N"`` (stable handle for acting).
10
+ 3. Returns ``[{id, tag, role, label, value, kind}]``.
11
+
12
+ The Python side turns that into lines the model reads:
13
+ [12] input "Where from?" (value="Kathmandu")
14
+ [13] button "Search"
15
+ ``kind`` ("input"|"button"|"link"|"select") helps the model pick the
16
+ right action.
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ from typing import Any
22
+
23
# JS: tag + collect interactive elements. Returns a JSON-able list.
# Kept as one expression-returning function so page.evaluate gets the
# array back directly.
#
# Ids are renumbered from 0 on every pass, so the script first strips any
# ``data-loom-id`` attribute left behind by an earlier pass. Without that,
# an element that has since become hidden or non-interactive would keep an
# old id that is now also assigned to a different element, making the
# ``[data-loom-id="N"]`` selector ambiguous.
_COLLECT_JS = r"""
() => {
  const INTERACTIVE_ROLES = new Set([
    "button","link","checkbox","combobox","menuitem","tab","option",
    "switch","radio","textbox","searchbox","slider","spinbutton",
  ]);
  const NATIVE = new Set(["A","BUTTON","INPUT","SELECT","TEXTAREA","SUMMARY"]);
  const out = [];
  let id = 0;

  // Clear tags from any previous pass so every id maps to exactly one node.
  for (const stale of document.querySelectorAll("[data-loom-id]")) {
    stale.removeAttribute("data-loom-id");
  }

  const isVisible = (el) => {
    const r = el.getBoundingClientRect();
    if (r.width < 4 || r.height < 4) return false;
    const s = window.getComputedStyle(el);
    if (s.display === "none" || s.visibility === "hidden" || s.opacity === "0")
      return false;
    return true;
  };

  // Resolve an element's human label as robustly as possible — this is
  // what lets the model understand "which field is which". Order matters:
  // explicit accessible name first, then associated <label>, then
  // attributes, then the element's own text, then nearby text.
  const clean = (s) => (s || "").replace(/\s+/g, " ").trim().slice(0, 90);
  const labelFor = (el) => {
    // 1. Explicit accessible name.
    const aria = el.getAttribute("aria-label");
    if (aria) return clean(aria);
    // 2. aria-labelledby → referenced element(s) text.
    const lb = el.getAttribute("aria-labelledby");
    if (lb) {
      const txt = lb.split(/\s+/)
        .map((refId) => document.getElementById(refId))
        .filter(Boolean)
        .map((n) => n.innerText || n.textContent || "")
        .join(" ");
      if (clean(txt)) return clean(txt);
    }
    // 3. <label for=id> or wrapping <label>.
    if (el.id) {
      const lab = document.querySelector(`label[for="${CSS.escape(el.id)}"]`);
      if (lab && clean(lab.innerText)) return clean(lab.innerText);
    }
    const wrapLab = el.closest("label");
    if (wrapLab && clean(wrapLab.innerText)) return clean(wrapLab.innerText);
    // 4. placeholder / title.
    const ph = el.getAttribute("placeholder");
    if (ph) return clean(ph);
    const title = el.getAttribute("title");
    if (title) return clean(title);
    // 5. The element's own visible text (good for buttons/links).
    const own = clean(el.innerText || el.textContent || "");
    if (own) return own;
    // 6. For inputs with no label yet: the nearest preceding text node /
    //    sibling label-ish element (handles "Where from?" rendered as a
    //    separate span above the input).
    let p = el.previousElementSibling;
    for (let i = 0; i < 3 && p; i++) {
      const t = clean(p.innerText || p.textContent || "");
      if (t && t.length < 40) return t;
      p = p.previousElementSibling;
    }
    // 7. Last resort: name / id attribute.
    const name = el.getAttribute("name") || el.getAttribute("id") || "";
    return clean(name);
  };

  const kindFor = (el, role) => {
    const t = el.tagName;
    if (t === "INPUT" || t === "TEXTAREA" || role === "textbox" ||
        role === "searchbox" || role === "combobox" ||
        el.isContentEditable) return "input";
    if (t === "SELECT") return "select";
    if (t === "A" || role === "link") return "link";
    return "button";
  };

  const isInteractive = (el) => {
    if (NATIVE.has(el.tagName)) return true;
    const role = el.getAttribute("role");
    if (role && INTERACTIVE_ROLES.has(role)) return true;
    if (el.isContentEditable) return true;
    // Cursor:pointer is a strong custom-button signal.
    if (window.getComputedStyle(el).cursor === "pointer" &&
        (el.onclick || el.getAttribute("jsaction"))) return true;
    return false;
  };

  const all = document.querySelectorAll("*");
  for (const el of all) {
    try {
      if (!isInteractive(el)) continue;
      if (!isVisible(el)) continue;
      const role = el.getAttribute("role") || "";
      el.setAttribute("data-loom-id", String(id));
      out.push({
        id,
        tag: el.tagName.toLowerCase(),
        role,
        kind: kindFor(el, role),
        label: labelFor(el),
        value: (el.value != null ? String(el.value) : "").slice(0, 60),
      });
      id++;
    } catch (e) { /* skip problematic node */ }
  }
  return out;
}
"""
135
+
136
+
137
# JS: extract the page's VISIBLE TEXT content — what a human reads
# (prices, listings, results, headings). Unlike observe (which lists
# CLICKABLE elements), this is for READING outcomes. Walks text nodes,
# collapses whitespace, drops text whose parent is hidden or is one of the
# SKIP tags, dedupes identical strings, and stops shortly after 400 lines.
#
# The SKIP test upper-cases the parent's tagName: elements in the SVG
# namespace report a lower-case tagName ("svg", "path") even inside an HTML
# document, so a case-sensitive comparison would never match them.
_READ_TEXT_JS = r"""
() => {
  const isVisible = (el) => {
    if (!el) return false;
    const s = window.getComputedStyle(el);
    if (s.display === "none" || s.visibility === "hidden" ||
        s.opacity === "0") return false;
    const r = el.getBoundingClientRect();
    return r.width > 1 && r.height > 1;
  };
  // Parents whose direct text is never human-readable page content.
  const SKIP = new Set(["SCRIPT","STYLE","NOSCRIPT","SVG","PATH"]);
  const out = [];
  const seen = new Set();
  const walker = document.createTreeWalker(
    document.body, NodeFilter.SHOW_TEXT, null);
  let n;
  while ((n = walker.nextNode())) {
    const t = (n.nodeValue || "").replace(/\s+/g, " ").trim();
    if (!t || t.length < 2) continue;
    const p = n.parentElement;
    if (!p || SKIP.has(p.tagName.toUpperCase())) continue;
    if (!isVisible(p)) continue;
    if (seen.has(t)) continue;
    seen.add(t);
    out.push(t);
    if (out.length > 400) break;  // safety cap
  }
  return out;
}
"""
172
+
173
+
174
async def read_text(page: Any) -> str:
    """Render the page's visible text as one capped string.

    Runs ``_READ_TEXT_JS`` in ``page`` and joins the returned lines under a
    ``PAGE:`` / ``URL:`` header. If the script cannot be evaluated, a short
    parenthesised error message is returned instead of raising. The title
    and URL are best-effort and stay empty when they cannot be read. The
    body is cut at 8000 characters, with a truncation notice appended.
    """
    try:
        fragments: list[str] = await page.evaluate(_READ_TEXT_JS)
    except Exception as exc:  # noqa: BLE001
        return f"(could not read page text: {exc})"

    page_title, page_url = "", ""
    try:
        page_title = await page.title()
        page_url = page.url
    except Exception:  # noqa: BLE001
        pass

    # Hard character cap so a huge results page cannot blow the budget.
    limit = 8000
    text = "\n".join(fragments)
    if len(text) > limit:
        text = text[:limit] + "\n… (truncated — ask to read more / scroll)"

    return f"PAGE: {page_title}\nURL: {page_url}\n\n{text}"
192
+
193
+
194
async def observe(page: Any) -> tuple[list[dict[str, Any]], str]:
    """Tag + collect interactive elements; return ``(elements, rendered)``.

    ``elements`` is the full structured list produced by ``_COLLECT_JS``
    (one dict per element, carrying its ``id``). ``rendered`` is the compact
    text the LLM reads: a ``PAGE:`` / ``URL:`` header followed by at most 60
    ``[id] kind "label"`` lines.

    Never raises for page-side failures: if the script cannot be evaluated
    the result is ``([], "(could not read the page: …)")``.
    """
    try:
        elements: list[dict[str, Any]] = await page.evaluate(_COLLECT_JS)
    except Exception as exc:  # noqa: BLE001 — never break the run
        return [], f"(could not read the page: {exc})"
    # Normalise a null script result so the declared return type always holds.
    elements = elements or []

    title = ""
    url = ""
    try:
        title = await page.title()
        url = page.url
    except Exception:  # noqa: BLE001
        pass

    lines: list[str] = [f"PAGE: {title}", f"URL: {url}", ""]
    if not elements:
        lines.append("(no interactive elements found — the page may still "
                     "be loading; try page_observe again, or page_check to "
                     "see it visually.)")
        return elements, "\n".join(lines)

    # COST CONTROL: a page can have 800+ interactive elements (nav, footer
    # links, ad chrome) — re-sending all of them every turn is the token
    # blowup. Prioritise the elements that matter (inputs/comboboxes +
    # buttons with real labels) and cap the rendered list. The full
    # ``elements`` list (with ids) is still returned for acting; we only
    # trim what the LLM READS.
    PRIORITY = {"input": 0, "select": 1, "button": 2, "link": 3}

    def _kind(e: dict[str, Any]) -> str:
        # Single source of truth for the "missing kind" default, shared by
        # ranking and rendering so an element lacking it cannot crash either.
        return e.get("kind", "link")

    def _score(e: dict[str, Any]) -> tuple[int, int]:
        has_label = 0 if (e.get("label") or "").strip() else 1
        return (PRIORITY.get(_kind(e), 9), has_label)

    ranked = sorted(elements, key=_score)
    CAP = 60
    shown = ranked[:CAP]
    # Keep them in id order within the shown set so the list reads
    # naturally.
    shown.sort(key=lambda e: e["id"])
    for e in shown:
        label = e.get("label") or "(no label)"
        val = e.get("value") or ""
        val_part = f' (value="{val}")' if val else ""
        lines.append(f'[{e["id"]}] {_kind(e)} "{label}"{val_part}')
    if len(elements) > len(shown):
        lines.append(
            f"… (+{len(elements) - len(shown)} more elements not shown — "
            "mostly nav/footer. If you need one that isn't listed, say which "
            "and I'll surface it.)"
        )
    return elements, "\n".join(lines)
@@ -0,0 +1,96 @@
1
+ """A single headed browser session for loom-code's ``/computer`` mode.
2
+
3
+ One ``BrowserSession`` owns one visible Chromium window for the life of a
4
+ REPL session. The agent's page_* tools all act through it. It is created
5
+ lazily on the first navigation and torn down on mode-off / exit.
6
+
7
+ Reliability design (learned from our Google-Flights failures + how
8
+ browser-use stays robust):
9
+
10
+ * **Stable element handles.** Instead of ephemeral snapshot refs (which
11
+ die the instant the DOM changes — the ``Ref e145 not found`` error),
12
+ every interactive element is tagged in the live DOM with a
13
+ ``data-loom-id="N"`` attribute. Acting later re-selects by that
14
+ attribute (``[data-loom-id="N"]``), so Playwright re-resolves the
15
+ element FRESH on every action — no stale handles. The id rides on the
16
+ element through re-renders unless the element itself is recreated.
17
+
18
+ * **One persistent page.** The session keeps the active page so observe →
19
+ act → observe operates on the same evolving page, like a human.
20
+
21
+ This module is intentionally dependency-light: Playwright only (already
22
+ installed for the project). No browser-use, no extra packages.
23
+ """
24
+
25
+ from __future__ import annotations
26
+
27
+ from typing import Any
28
+
29
+
30
class BrowserSession:
    """Lazily-launched headed Chromium + the active page. Single-page for
    now (the active tab); multi-tab can come later.

    Lifecycle: ``start()`` (or the first ``goto()``) launches Playwright, a
    Chromium browser, one context and one page; ``close()`` tears all of it
    down and returns the object to its initial, restartable state.
    """

    def __init__(self, *, headless: bool = False) -> None:
        """Record the launch mode; nothing is started until ``start()``.

        Args:
            headless: Passed through to ``chromium.launch``. Defaults to
                ``False`` so the window is visible.
        """
        self._headless = headless
        self._pw: Any = None
        self._browser: Any = None
        self._context: Any = None
        self._page: Any = None
        # The live truth lives in the DOM (data-loom-id); this is meant to
        # track the highest id assigned in an observe pass. Within this
        # class it is only ever reset to 0.
        self._max_id: int = 0

    async def start(self) -> None:
        """Launch the browser if not already running. Idempotent.

        If any launch step fails, whatever was already started is torn down
        before the exception is re-raised, so a later retry does not leak
        the earlier driver/browser process.
        """
        if self._page is not None:
            return
        from playwright.async_api import async_playwright

        try:
            self._pw = await async_playwright().start()
            # Headed = the user watches it work (the whole point of
            # /computer).
            self._browser = await self._pw.chromium.launch(
                headless=self._headless,
                args=["--start-maximized"],
            )
            # A real-ish context: viewport=None lets the window size drive
            # it (with --start-maximized), and a normal UA reduces trivial
            # bot-walls.
            self._context = await self._browser.new_context(
                viewport=None,
                user_agent=(
                    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                    "AppleWebKit/537.36 (KHTML, like Gecko) "
                    "Chrome/124.0.0.0 Safari/537.36"
                ),
            )
            self._page = await self._context.new_page()
        except Exception:
            # Partial launch: _page is still None, so without cleanup the
            # next start() would overwrite (and orphan) these handles.
            await self.close()
            raise

    @property
    def page(self) -> Any:
        """The active page.

        Raises:
            RuntimeError: If the browser has not been started.
        """
        if self._page is None:
            raise RuntimeError("browser not started — call start() first")
        return self._page

    async def goto(self, url: str) -> None:
        """Navigate the active page to ``url``, starting the browser if needed.

        Surrounding whitespace is stripped. A URL that does not already
        begin with ``http://`` or ``https://`` (matched case-insensitively)
        gets ``https://`` prepended. Waits for ``domcontentloaded`` with a
        45 s timeout.
        """
        await self.start()
        url = url.strip()
        # Scheme names are case-insensitive; "HTTPS://x" must not turn into
        # "https://HTTPS://x".
        if not url.lower().startswith(("http://", "https://")):
            url = "https://" + url
        await self._page.goto(
            url, wait_until="domcontentloaded", timeout=45000
        )

    async def close(self) -> None:
        """Tear everything down. Safe to call multiple times."""
        # Context, then browser, then the Playwright driver. Each handle may
        # be None (never started / already closed), in which case getattr
        # yields None and the step is skipped.
        for closer in (
            getattr(self._context, "close", None),
            getattr(self._browser, "close", None),
            getattr(self._pw, "stop", None),
        ):
            if closer is not None:
                try:
                    await closer()
                except Exception:  # noqa: BLE001 — best-effort teardown
                    pass
        self._pw = self._browser = self._context = self._page = None
        self._max_id = 0
@@ -0,0 +1,194 @@
1
+ """Verify page state by reading the live DOM.
2
+
3
+ loomflow has no image-to-model (vision) input yet, so instead of a
4
+ screenshot+vision check we read the ACTUAL values the page holds — the
5
+ text content of the inputs/fields visible on screen — and return them so
6
+ the agent can confirm "did Delhi land in the origin field?". This catches
7
+ the wrong-field / didn't-stick failures without needing vision: the
8
+ field's real ``value`` is ground truth.
9
+
10
+ It also saves a screenshot to disk so the user (and a future vision
11
+ upgrade) can inspect what the page looked like.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import base64
17
+ from pathlib import Path
18
+ from typing import Any
19
+
20
+ # Set-of-Marks: draw a numbered box on each interactive element (using
21
+ # the data-loom-id tags observe() already placed) so a screenshot the
22
+ # model SEES is annotated with the SAME [ids] it acts on. The script
23
+ # returns nothing; _REMOVE_MARKS_JS below removes the overlay after the shot.
24
+ _DRAW_MARKS_JS = r"""
25
+ () => {
26
+ const old = document.getElementById('__loom_marks__');
27
+ if (old) old.remove();
28
+ const layer = document.createElement('div');
29
+ layer.id = '__loom_marks__';
30
+ layer.style.cssText =
31
+ 'position:fixed;inset:0;z-index:2147483647;pointer-events:none;';
32
+ const COLORS = ['#e6194B','#3cb44b','#4363d8','#f58231','#911eb4',
33
+ '#469990','#9A6324','#800000','#808000','#000075'];
34
+ let n = 0;
35
+ for (const el of document.querySelectorAll('[data-loom-id]')) {
36
+ const r = el.getBoundingClientRect();
37
+ if (r.width < 4 || r.height < 4) continue;
38
+ if (r.bottom < 0 || r.top > window.innerHeight) continue; // off-screen
39
+ const id = el.getAttribute('data-loom-id');
40
+ const c = COLORS[n % COLORS.length]; n++;
41
+ const box = document.createElement('div');
42
+ box.style.cssText =
43
+ `position:fixed;left:${r.left}px;top:${r.top}px;width:${r.width}px;`+
44
+ `height:${r.height}px;border:2px solid ${c};box-sizing:border-box;`;
45
+ const tag = document.createElement('div');
46
+ tag.textContent = id;
47
+ tag.style.cssText =
48
+ `position:fixed;left:${r.left}px;top:${Math.max(0,r.top-14)}px;`+
49
+ `background:${c};color:#fff;font:bold 11px monospace;padding:0 3px;`+
50
+ `line-height:14px;`;
51
+ layer.appendChild(box); layer.appendChild(tag);
52
+ }
53
+ document.body.appendChild(layer);
54
+ }
55
+ """
56
+
57
+ _REMOVE_MARKS_JS = (
58
+ "() => { const m = document.getElementById('__loom_marks__'); "
59
+ "if (m) m.remove(); }"
60
+ )
61
+
62
+
63
async def screenshot_b64(page: Any, marks: bool = True) -> str | None:
    """Capture the current viewport as a base64-encoded PNG.

    When ``marks`` is true (the default), ``_DRAW_MARKS_JS`` is evaluated
    first so the image carries the numbered overlay. Once the overlay has
    been drawn, ``_REMOVE_MARKS_JS`` is always evaluated afterwards, whether
    or not the screenshot succeeded.

    Returns:
        The ASCII base64 string, or ``None`` if drawing the overlay or
        taking the screenshot raised.
    """
    overlay_active = False
    encoded: str | None = None
    try:
        if marks:
            await page.evaluate(_DRAW_MARKS_JS)
            overlay_active = True
        raw_png = await page.screenshot(type="png")
        encoded = base64.b64encode(raw_png).decode("ascii")
    except Exception:  # noqa: BLE001
        encoded = None
    finally:
        # Best-effort cleanup: never leave the overlay on the user's page.
        if overlay_active:
            try:
                await page.evaluate(_REMOVE_MARKS_JS)
            except Exception:  # noqa: BLE001
                pass
    return encoded
82
+
83
# Snapshot the page title, URL and the current value of every form-like
# element (input / textarea / select / role=combobox / contenteditable) at
# least 4x4 px in size, so a "what's currently entered" question is
# answerable from the DOM.
#
# For real form controls the live ``.value`` property is authoritative, even
# when it is the empty string. The ``value`` *attribute* only holds the
# initial default, so falling back to it would report stale text for a field
# the user has cleared. The attribute / innerText fallback is therefore
# limited to elements that have no string ``.value`` property.
_STATE_JS = r"""
() => {
  const clean = (s) => (s || "").replace(/\s+/g, " ").trim();
  const fields = [];
  for (const el of document.querySelectorAll(
      "input, textarea, select, [role=combobox], [contenteditable]")) {
    const r = el.getBoundingClientRect();
    if (r.width < 4 || r.height < 4) continue;
    const label = clean(
      el.getAttribute("aria-label") ||
      el.getAttribute("placeholder") ||
      el.getAttribute("name") || "");
    const raw = typeof el.value === "string"
      ? el.value
      : (el.getAttribute("value") || el.innerText || "");
    const val = clean(raw);
    if (label || val) fields.push({ label, value: val.slice(0, 60) });
  }
  return { title: document.title, url: location.href, fields };
}
"""
104
+
105
+
106
async def verify(page: Any, question: str, model: str | None = None) -> str:
    """Report the page's title, URL and current field values as text.

    Evaluates ``_STATE_JS`` in ``page`` and formats the result so the caller
    can answer ``question`` from real DOM state. As a side effect, a
    best-effort screenshot is written to ``~/.loom-code/last_page.png``;
    failures there are ignored.

    Args:
        page: Object exposing async ``evaluate`` and ``screenshot``.
        question: Echoed on the first line of the report.
        model: Accepted for API stability; not used by this function.

    Returns:
        The multi-line report, or a one-line error message if the page
        state could not be read.
    """
    try:
        snapshot = await page.evaluate(_STATE_JS)
    except Exception as exc:  # noqa: BLE001
        return f"could not read page state ({exc}); try page_observe."

    # Keep a copy of what the page looked like for the user to inspect.
    try:
        target = Path.home() / ".loom-code" / "last_page.png"
        target.parent.mkdir(parents=True, exist_ok=True)
        await page.screenshot(path=str(target), type="png")
    except Exception:  # noqa: BLE001
        pass

    def _row(item: dict[str, Any]) -> str:
        name = item.get("label") or "(unlabelled)"
        current = item.get("value") or ""
        return f' {name}: "{current}"'

    report = [
        f"Checking: {question}",
        f"PAGE: {snapshot.get('title', '')}",
        f"URL: {snapshot.get('url', '')}",
        "Current field values:",
    ]
    entries = snapshot.get("fields") or []
    if entries:
        report.extend(_row(item) for item in entries)
    else:
        report.append(" (no field values found)")
    report.append(
        "\n→ Compare the values above to what you intended. If a field has "
        "the WRONG value (e.g. origin shows your location, not what you "
        "typed), fix it: re-observe, clear that field, type again, and for "
        "autocompletes CLICK the matching suggestion."
    )
    return "\n".join(report)
145
+
146
+
147
async def look(page: Any, question: str, model: str | None = None) -> str:
    """VISION: answer ``question`` from an annotated screenshot of the page.

    Takes a screenshot with the Set-of-Marks [id] overlay and hands it to a
    ``loomflow.Agent`` built for ``model``, passing the image through
    ``metadata['_loom_images']``. Falls back to the DOM report from
    ``verify`` when the screenshot fails, no ``model`` is given,
    ``loomflow`` cannot be imported, or the model call raises.

    Returns:
        The model's stripped answer (or ``"(model returned nothing)"``),
        or the ``verify`` report, prefixed with an explanation when the
        fallback was caused by a missing model or a failed model call.
    """
    image = await screenshot_b64(page, marks=True)
    if image is None:
        return await verify(page, question, model)

    if not model:
        notice = "no model available for page_look — using DOM values:\n\n"
        return notice + await verify(page, question, model)

    try:
        from loomflow import Agent
    except Exception:  # noqa: BLE001
        return await verify(page, question, model)

    instructions = (
        "You are looking at a screenshot of a web page. Numbered colored "
        "boxes mark interactive elements (the number is the element id you "
        "can act on). Answer concisely and concretely. If asked for prices/"
        "results, read the actual values you see and list them.\n\n"
        f"Question: {question}"
    )
    attachment = {"data": image, "media_type": "image/png"}
    try:
        vision_agent = Agent(instructions, model=model)
        reply = await vision_agent.run(
            "(see attached screenshot)",
            metadata={"_loom_images": [attachment]},
        )
        answer = getattr(reply, "output", None) or str(reply)
        return str(answer).strip() or "(model returned nothing)"
    except Exception as exc:  # noqa: BLE001
        # Vision failed (model not multimodal / API issue) → DOM fallback.
        notice = (
            f"page_look vision unavailable ({type(exc).__name__}); "
            "using DOM values instead:\n\n"
        )
        return notice + await verify(page, question, model)
+ )