zu-tools 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- zu_tools/__init__.py +5 -0
- zu_tools/action_surface.py +429 -0
- zu_tools/browser.py +172 -0
- zu_tools/fetch.py +132 -0
- zu_tools/net.py +242 -0
- zu_tools/parse.py +47 -0
- zu_tools/pointer.py +317 -0
- zu_tools/recall.py +82 -0
- zu_tools/render.py +205 -0
- zu_tools/search.py +156 -0
- zu_tools/simulate.py +67 -0
- zu_tools-0.2.0.dist-info/METADATA +54 -0
- zu_tools-0.2.0.dist-info/RECORD +15 -0
- zu_tools-0.2.0.dist-info/WHEEL +4 -0
- zu_tools-0.2.0.dist-info/entry_points.txt +10 -0
zu_tools/action_surface.py
ADDED
|
@@ -0,0 +1,429 @@
|
|
|
1
|
+
"""action_surface — the perception-reduction tool (tier 3, Engineering Design §11).
|
|
2
|
+
|
|
3
|
+
A rendered web page is a DOM of 100k–1M+ tokens; the decision the agent needs
|
|
4
|
+
from it — "click Place order" — is a handful. Pushing the whole blob through the
|
|
5
|
+
model is slow, expensive, and *worse for accuracy* (the signal drowns in
|
|
6
|
+
markup). The way out is a reframe: the agent almost never needs the page — it
|
|
7
|
+
needs the **set of things it can do** on the page. That set is a few dozen
|
|
8
|
+
affordances, a few hundred tokens.
|
|
9
|
+
|
|
10
|
+
This tool produces that set, **deterministically**. The decision rule (§4.5)
|
|
11
|
+
settles why it is a tool and not a model job: a script may *enumerate what is
|
|
12
|
+
possible* (every actionable element), but it must not *decide what is reasonable*
|
|
13
|
+
(which one to pick) — that is the policy's judgment. So the reducer surfaces the
|
|
14
|
+
possible and never ranks or prunes by guessed task-relevance.
|
|
15
|
+
|
|
16
|
+
The pipeline (§11.2), run over an accessibility tree rather than the raw DOM:
|
|
17
|
+
|
|
18
|
+
1. Walk the accessibility tree — roles, names, states — an order of magnitude
|
|
19
|
+
smaller than the DOM, built to answer "what can a user do here".
|
|
20
|
+
2. Filter to interactive + meaningful (actions, plus the headings/labels/errors
|
|
21
|
+
an action needs); drop the rest.
|
|
22
|
+
3. Prune the invisible — ignored, off-screen, zero-area, hidden.
|
|
23
|
+
4. Resolve a stable, human-meaningful label per element.
|
|
24
|
+
5. Assign a stable, opaque handle (a1, a2 …) that maps back, harness-side, to a
|
|
25
|
+
role+name locator. The model emits the handle, never a selector (§11.3).
|
|
26
|
+
6. Emit a compact, typed representation.
|
|
27
|
+
|
|
28
|
+
And the competence boundary (§11.4): the honest risk is a false negative —
|
|
29
|
+
pruning the one element the task needed (a canvas button, an unlabeled icon). So
|
|
30
|
+
the reducer must know when it is **blind** and *signal* escalation to tier-4
|
|
31
|
+
vision rather than silently return an incomplete surface. ``blind`` on the
|
|
32
|
+
result is that signal; the ``action-surface-blind`` detector turns it into an
|
|
33
|
+
ESCALATE. Graceful degradation, never silent incompleteness.
|
|
34
|
+
|
|
35
|
+
The deterministic reducer (:func:`reduce_surface`) is the whole value and is
|
|
36
|
+
pure — it runs on an accessibility-tree snapshot with no browser, which is how a
|
|
37
|
+
coding harness drives it offline and how it is tested at $0. The live arm asks a
|
|
38
|
+
browser session for the tree (:meth:`ActionSurface.__call__` with ``op=open``)
|
|
39
|
+
and runs the same reducer over it.
|
|
40
|
+
"""
|
|
41
|
+
|
|
42
|
+
from __future__ import annotations
|
|
43
|
+
|
|
44
|
+
from typing import Any
|
|
45
|
+
from urllib.parse import urlsplit
|
|
46
|
+
|
|
47
|
+
from pydantic import BaseModel, Field
|
|
48
|
+
|
|
49
|
+
from zu_core.ports import CAP_NET, CAP_SANDBOX, EGRESS_OPEN, BrowserSessionHandle, SessionBackend
|
|
50
|
+
|
|
51
|
+
from .net import validate_and_pin
|
|
52
|
+
|
|
53
|
+
# Default container image name handed to the session backend for the live arm.
_DEFAULT_IMAGE = "ghcr.io/k3-mt/zu-render-chromium:latest"

# Roles of elements the agent can *act on*. The set is intentionally broad:
# this module enumerates what is possible and leaves the choice to the policy,
# so every actionable role a real accessibility tree exposes is listed.
INTERACTIVE_ROLES: frozenset[str] = frozenset(
    (
        # Buttons and links.
        "button",
        "link",
        "menubutton",
        "togglebutton",
        # Text entry.
        "textbox",
        "searchbox",
        "textarea",
        # Choice controls.
        "combobox",
        "listbox",
        "option",
        "checkbox",
        "radio",
        "switch",
        # Range / numeric controls.
        "slider",
        "spinbutton",
        # Menus and tabs.
        "menuitem",
        "menuitemcheckbox",
        "menuitemradio",
        "tab",
        # Pickers.
        "datepicker",
        "colorwell",
    )
)

# Roles that are not actionable but whose *text* helps in choosing an action:
# headings orient the reader, alerts/status nodes carry error and validation
# messages. Their names are kept as context lines and never become affordances.
CONTEXT_ROLES: frozenset[str] = frozenset(
    (
        "heading",
        "alert",
        "alertdialog",
        "status",
        "log",
        "marquee",
    )
)
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
class AxNode(BaseModel):
    """One normalised accessibility-tree node — the reducer's input currency.

    A small, serialisable shape so the reducer is pure and a harness can feed it
    a captured tree directly. :func:`normalize_axtree` produces these from the
    raw CDP ``Accessibility.getFullAXTree`` format.
    """

    # Accessibility role string; the reducer matches it against
    # INTERACTIVE_ROLES and CONTEXT_ROLES.
    role: str
    # Accessible name; the first candidate tried when resolving a label.
    name: str = ""
    # Current value of the element, copied onto the emitted affordance.
    value: str | None = None
    # State flags (e.g. "disabled", "checked:mixed") copied onto the affordance.
    states: list[str] = Field(default_factory=list)
    # Label fallbacks, tried in this order after ``name``.
    placeholder: str | None = None
    description: str | None = None
    # Pruning inputs. ``visible`` folds in aria-hidden/display:none/off-screen;
    # ``ignored`` is the tree's own "not exposed" flag; ``bounds`` is [x,y,w,h].
    # A node with 4-element bounds whose width or height is <= 0 is pruned.
    visible: bool = True
    ignored: bool = False
    bounds: list[float] | None = None
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
class Affordance(BaseModel):
    """One thing the policy can do, addressed by an opaque handle."""

    # Opaque identifier of the form ``a1``, ``a2`` …, assigned by the reducer.
    handle: str
    # Accessibility role of the underlying node (one of INTERACTIVE_ROLES).
    role: str
    # Resolved human-readable label; the reducer never emits an empty one.
    label: str
    # Current value of the element, if the node carried one.
    value: str | None = None
    # State flags copied from the source node.
    states: list[str] = Field(default_factory=list)
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
class Surface(BaseModel):
    """The compact, typed reduction of a page — a few hundred tokens.

    ``handle_map`` is the harness-side indirection (§11.3): handle → role+name
    locator. The model only ever sees and emits handles; the durable locator
    stays here and is re-resolved at action time.
    """

    # Page title and URL, passed through unchanged by the reducer.
    title: str = ""
    url: str = ""
    # Emitted affordances, in the order their nodes appeared in the input.
    affordances: list[Affordance] = Field(default_factory=list)
    # Labels of non-actionable context nodes (headings, alerts, status …).
    context: list[str] = Field(default_factory=list)
    # handle → {"role": ..., "name": ...} locator for each emitted affordance.
    handle_map: dict[str, dict] = Field(default_factory=dict)
    # True when the surface cannot be trusted to be complete; ``blind_reason``
    # then carries a human-readable explanation, otherwise it stays None.
    blind: bool = False
    blind_reason: str | None = None
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def _label_of(node: AxNode) -> str:
    """Resolve the human-meaningful label for *node* (§11.2 step 4).

    Candidates are tried in priority order — accessible name (which already
    folds in aria-label and an associated <label>), then placeholder, then
    description — and the first one that is non-blank after stripping wins.
    Returns ``""`` when none qualifies; the caller treats such an element as
    unlabeled, which counts toward blindness.
    """
    sources = (node.name, node.placeholder, node.description)
    trimmed = (text.strip() for text in sources if text)
    return next((text for text in trimmed if text), "")
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def _is_pruned(node: AxNode) -> bool:
    """Decide whether *node* is invisible and must be dropped (step 3).

    A node is pruned when the tree marks it ignored, when it is not visible, or
    when it carries a 4-element ``bounds`` box whose width or height is not
    positive. Missing or malformed bounds never prune on their own.
    """
    hidden = node.ignored or not node.visible
    if hidden:
        return True

    box = node.bounds
    if box is None or len(box) != 4:
        return False

    _, _, width, height = box
    return width <= 0 or height <= 0
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def reduce_surface(
    nodes: list[AxNode],
    *,
    title: str = "",
    url: str = "",
    unlabeled_ratio: float = 0.5,
) -> Surface:
    """Turn an accessibility tree into its action surface — pure, deterministic.

    Nodes are visited in input (document) order. Pruned nodes are skipped;
    context-role nodes contribute their label to ``context``; interactive-role
    nodes with a label become affordances with handles ``a1, a2 …`` numbered
    over the emitted affordances, so an identical tree always produces
    identical handles. Interactive nodes without a label are counted but not
    emitted.

    The blind signal (§11.4) is raised when the result cannot be trusted to be
    complete: either some node survived pruning yet no affordance was emitted,
    or the unlabeled share of interactive nodes exceeds ``unlabeled_ratio``.
    """
    actions: list[Affordance] = []
    locators: dict[str, dict] = {}
    context_lines: list[str] = []
    interactive_total = 0
    missing_label = 0
    saw_content = False

    for ax in nodes:
        if _is_pruned(ax):
            continue
        saw_content = True

        kind = ax.role
        is_context = kind in CONTEXT_ROLES
        if not is_context and kind not in INTERACTIVE_ROLES:
            # Neither orienting text nor something to act on — dropped.
            continue

        text = _label_of(ax)

        if is_context:
            if text:
                context_lines.append(text)
            continue

        interactive_total += 1
        if not text:
            # Possible but unaddressable: count it toward blindness rather
            # than hand the model a handle with no meaning.
            missing_label += 1
            continue

        key = f"a{len(actions) + 1}"
        actions.append(
            Affordance(handle=key, role=kind, label=text, value=ax.value, states=list(ax.states))
        )
        # Harness-side locator the model never sees (role + resolved label).
        # NOTE(review): the label may come from placeholder/description rather
        # than the accessible name — confirm the harness locator tolerates that.
        locators[key] = {"role": kind, "name": text}

    reason: str | None = None
    if saw_content and not actions:
        reason = "page had content but the accessibility tree yielded no addressable actions"
    elif interactive_total and (missing_label / interactive_total) > unlabeled_ratio:
        reason = (
            f"{missing_label}/{interactive_total} interactive elements are unlabeled "
            "in the accessibility tree — too thin to trust"
        )

    return Surface(
        title=title,
        url=url,
        affordances=actions,
        context=context_lines,
        handle_map=locators,
        blind=reason is not None,
        blind_reason=reason,
    )
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
def _ax_string(field: Any) -> str:
    """Return the ``value`` of a CDP AX value object as a string.

    *field* is expected to look like ``{"type": ..., "value": ...}``. Anything
    that is not a dict, or whose ``value`` is missing or ``None``, yields ``""``.
    """
    if not isinstance(field, dict):
        return ""
    raw = field.get("value")
    return "" if raw is None else str(raw)
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
def normalize_axtree(cdp_nodes: list[dict]) -> list[AxNode]:
    """Normalise the raw CDP ``Accessibility.getFullAXTree`` node list into
    :class:`AxNode` records, preserving the order of the input list.

    CDP shape per node: ``role``/``name`` are ``{type,value}`` objects;
    ``properties`` is a list of ``{name, value:{value}}``; ``ignored`` is a bool.
    States we surface: disabled, checked, expanded, required, focused, selected,
    invalid. A state whose value is ``True`` or the string ``"true"`` is emitted
    as the bare state name; any other non-empty, non-``"false"`` string is
    emitted as ``"<state>:<value>"`` (e.g. ``checked:mixed``). Placeholder is
    read from the node's properties; description and value from the node itself.

    Malformed input is tolerated rather than raised on: entries that are not
    dicts, nodes without a role, a null/missing ``properties`` list, and
    property entries without a string name are skipped.

    Args:
        cdp_nodes: Raw node dicts as returned by the browser session.

    Returns:
        One :class:`AxNode` per input node that has a non-empty role.
    """
    # Fixed, alphabetical order so the emitted ``states`` list is deterministic;
    # built once instead of re-sorting a set for every node.
    state_names = ("checked", "disabled", "expanded", "focused", "invalid", "required", "selected")

    out: list[AxNode] = []
    for n in cdp_nodes:
        if not isinstance(n, dict):
            continue
        role = _ax_string(n.get("role"))
        if not role:
            continue

        # ``properties`` may be absent or null; only well-formed entries with a
        # string name are indexed.
        props = {
            p["name"]: p.get("value", {})
            for p in (n.get("properties") or [])
            if isinstance(p, dict) and isinstance(p.get("name"), str)
        }

        states: list[str] = []
        for sp in state_names:
            val = props.get(sp, {})
            v = val.get("value") if isinstance(val, dict) else None
            if v is True or (isinstance(v, str) and v not in ("false", "")):
                states.append(sp if not isinstance(v, str) or v == "true" else f"{sp}:{v}")

        # CDP marks unexposed nodes via ``ignored``; hidden-ness comes from the
        # ``hidden`` property. A string "false"/"" is treated as not hidden,
        # consistent with how state values are read above.
        hidden = props.get("hidden")
        hidden_value = hidden.get("value", False) if isinstance(hidden, dict) else False
        is_hidden = hidden_value not in ("false", "") and bool(hidden_value)

        out.append(
            AxNode(
                role=role,
                name=_ax_string(n.get("name")),
                value=_ax_string(n.get("value")) or None,
                states=states,
                placeholder=_ax_string(props.get("placeholder")) or None,
                description=_ax_string(n.get("description")) or None,
                ignored=bool(n.get("ignored", False)),
                visible=not is_hidden,
            )
        )
    return out
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
class ActionSurface:
    """Tier-3 tool: reduce a page to its action surface (and keep the handle map).

    Two ways in, one reducer:

    * ``op=reduce`` (default) — reduce a tree the caller already has. Pass
      ``nodes`` (AxNode dicts) or raw ``axtree`` (CDP nodes), plus ``title`` /
      ``url``. No browser, fully offline — the harness-driven and tested path.
    * ``op=open`` — open ``url`` in a headless browser session, ask it for the
      accessibility tree, and reduce that. The live arm.

    After a reduction the handle→locator map is held on the instance for the run;
    ``op=resolve`` returns the durable locator for a handle (a stale handle is an
    escalation, not a crash — the caller re-resolves at action time, §11.3).

    Every op reports failures it detects as an ``{"error": ...}`` dict.
    """

    name = "action_surface"
    tier = 3  # the accessibility-tree tier; unlocked by a detector ESCALATE
    schema = {
        "name": "action_surface",
        "description": (
            "Reduce a web page to the compact SET OF THINGS YOU CAN DO on it — a "
            "flat list of affordances (button/link/textbox/…) each with an opaque "
            "handle (a1, a2 …) and a human label. You choose a handle and act on "
            "it; you never see or emit a CSS selector. op=open a url to capture and "
            "reduce its accessibility tree; op=resolve a handle to its locator. If "
            "'blind' is true the tree is too thin to trust — escalate to vision."
        ),
        "parameters": {
            "type": "object",
            "properties": {
                "op": {"type": "string", "enum": ["reduce", "open", "resolve"]},
                "url": {"type": "string", "description": "for op=open: the page to reduce"},
                "handle": {"type": "string", "description": "for op=resolve: the handle to resolve"},
                "axtree": {"type": "array", "items": {"type": "object"},
                           "description": "for op=reduce: raw CDP getFullAXTree nodes"},
                "nodes": {"type": "array", "items": {"type": "object"},
                          "description": "for op=reduce: pre-normalised AxNode dicts"},
                "title": {"type": "string"},
            },
            "required": ["op"],
        },
    }
    prompt_fragment = (
        "action_surface(op=open, url): reduce a page to a short list of affordances "
        "(handles a1,a2,… with labels) instead of reading the whole DOM. Pick a handle "
        "to act on; resolve(handle) gives its locator. 'blind' means escalate to vision."
    )
    capabilities = frozenset({CAP_NET, CAP_SANDBOX})
    egress = frozenset({EGRESS_OPEN})

    def __init__(
        self,
        backend: SessionBackend | None = None,
        image: str = _DEFAULT_IMAGE,
        *,
        allow_private: bool | None = None,
        unlabeled_ratio: float = 0.5,
    ) -> None:
        """Configure the tool; no session is opened until ``op=open``.

        Args:
            backend: Session backend for the live arm; when ``None`` a
                ``LocalDockerBackend`` is created lazily on first ``op=open``.
            image: Image name placed in the session spec.
            allow_private: Forwarded to ``validate_and_pin`` for the URL check.
            unlabeled_ratio: Forwarded to :func:`reduce_surface` as the
                blindness threshold.
        """
        self._backend = backend
        self.image = image
        self.allow_private = allow_private
        self.unlabeled_ratio = unlabeled_ratio
        self._handle_map: dict[str, dict] = {}
        self._session: BrowserSessionHandle | None = None

    def _resolve_backend(self) -> SessionBackend:
        """Return the configured backend, creating the default one on first use."""
        if self._backend is None:
            # Imported lazily so the offline reduce/resolve paths never need it.
            from zu_backends.local_docker import LocalDockerBackend

            self._backend = LocalDockerBackend()
        return self._backend

    async def __call__(
        self,
        ctx: Any,
        op: str = "reduce",
        url: str | None = None,
        handle: str | None = None,
        axtree: list | None = None,
        nodes: list | None = None,
        title: str | None = None,
    ) -> dict:
        """Dispatch one tool call.

        Args:
            ctx: Caller-supplied context; not used by this method.
            op: ``"reduce"``, ``"resolve"`` or ``"open"``.
            url: Page to open (``op=open``) or URL to record (``op=reduce``).
            handle: Handle to look up (``op=resolve``).
            axtree: Raw CDP nodes (``op=reduce``).
            nodes: Pre-normalised nodes (``op=reduce``); wins over ``axtree``.
            title: Optional page title override.

        Returns:
            The emitted surface, a resolve result, or an ``{"error": ...}`` dict.
        """
        if op == "reduce":
            return self._reduce_op(nodes=nodes, axtree=axtree, title=title or "", url=url or "")

        if op == "resolve":
            if not handle:
                return {"error": "op=resolve requires a handle"}
            locator = self._handle_map.get(handle)
            if locator is None:
                # Stale/unknown handle: signal a re-resolve, never a crash (§11.3).
                return {"stale_handle": handle,
                        "error": f"handle {handle!r} is not on the current surface; re-capture"}
            return {"handle": handle, "locator": locator}

        if op == "open":
            if not url:
                return {"error": "op=open requires a url"}
            return await self._open_op(url, title or "")

        return {"error": f"unknown op {op!r}; use reduce/open/resolve"}

    def _reduce_op(self, *, nodes: list | None, axtree: list | None, title: str, url: str) -> dict:
        """Reduce a caller-supplied tree; ``nodes`` takes precedence over ``axtree``."""
        if nodes is not None:
            try:
                ax = [n if isinstance(n, AxNode) else AxNode.model_validate(n) for n in nodes]
            except ValueError as exc:
                # pydantic's ValidationError is a ValueError; report it like every
                # other bad-input case instead of letting it escape the tool.
                return {"error": f"op=reduce got an invalid node: {exc}"}
        elif axtree is not None:
            ax = normalize_axtree([n for n in axtree if isinstance(n, dict)])
        else:
            return {"error": "op=reduce requires 'nodes' or 'axtree'"}
        surface = reduce_surface(ax, title=title, url=url, unlabeled_ratio=self.unlabeled_ratio)
        return self._emit(surface)

    async def _open_op(self, url: str, title: str) -> dict:
        """Open ``url`` in a fresh session, capture its tree and reduce it.

        Any previous session is closed first, and the previous handle map is
        dropped, so a failed capture cannot leave handles from the old page
        resolvable.
        """
        await self._close_session()
        self._handle_map = {}
        pinned_ip = validate_and_pin(url, allow_private=self.allow_private)
        spec: dict[str, Any] = {"image": self.image, "tier": self.tier, "network": True}
        host = urlsplit(url).hostname
        if pinned_ip is not None and host:
            spec["extra_hosts"] = {host: pinned_ip}
        self._session = await self._resolve_backend().open_session(spec)
        # Ask the session for the accessibility tree. The browser server returns
        # ``{axtree: [...CDP nodes...], title, url}``; an older server that lacks
        # the op returns an error, which we surface (not a crash).
        resp = await self._session.send({"op": "axtree", "url": url})
        if not isinstance(resp, dict) or resp.get("axtree") is None:
            err = resp.get("error") if isinstance(resp, dict) else "bad session response"
            return {"error": f"could not capture accessibility tree: {err}"}
        ax = normalize_axtree([n for n in resp["axtree"] if isinstance(n, dict)])
        surface = reduce_surface(
            ax,
            # ``or`` fallbacks keep a null title/url in the response from being
            # stringified to the literal "None".
            title=title or str(resp.get("title") or ""),
            url=str(resp.get("url") or url),
            unlabeled_ratio=self.unlabeled_ratio,
        )
        return self._emit(surface)

    def _emit(self, surface: Surface) -> dict:
        """The surface as a loop-friendly observation. The handle map is held on
        the instance (harness-side) and echoed for the harness; ``surface_blind``
        is the top-level flag the blind detector reads."""
        self._handle_map = dict(surface.handle_map)
        return {
            "action_surface": surface.model_dump(exclude={"handle_map"}),
            "handle_map": surface.handle_map,
            "surface_blind": surface.blind,
        }

    async def _close_session(self) -> None:
        """Close the held session, if any; errors during close are swallowed."""
        if self._session is not None:
            # Detach first so the instance never keeps a half-closed session.
            session, self._session = self._session, None
            try:
                await session.close()
            except Exception:  # noqa: BLE001 — teardown must not raise over a result
                pass

    async def aclose(self) -> None:
        """Close any lingering session — for run teardown so a container never leaks."""
        await self._close_session()
|
zu_tools/browser.py
ADDED
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
"""browser — a PERSISTENT, event-driven headless-browser session (tier 2).
|
|
2
|
+
|
|
3
|
+
Where ``render_dom`` is one-shot (a fresh browser per call), ``browser`` keeps ONE
|
|
4
|
+
headless browser ALIVE across calls so a model can drive a reactive, multi-step
|
|
5
|
+
widget the way a person does: ``open`` a url, then ``act`` / ``read`` repeatedly —
|
|
6
|
+
observing the real state (and the network responses it triggered) after each step,
|
|
7
|
+
reacting to what actually happened — then ``close``. That removes the
|
|
8
|
+
timing-fragility of replaying a fixed action sequence into a fresh browser, which a
|
|
9
|
+
reactive SPA defeats (a selection must register before the next step).
|
|
10
|
+
|
|
11
|
+
It surfaces content (rendered text, captured XHR/JSON, optional html); it does not
|
|
12
|
+
provide a transaction-submitting primitive. The session lives in the same hardened,
|
|
13
|
+
headless container as ``render_dom`` (caps dropped, DNS-pinned, --no-sandbox); the
|
|
14
|
+
state is held by the long-lived ``zu-browser`` server inside it.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
from typing import Any
|
|
20
|
+
from urllib.parse import urlsplit
|
|
21
|
+
|
|
22
|
+
from zu_core.ports import CAP_NET, CAP_SANDBOX, EGRESS_OPEN, BrowserSessionHandle, SessionBackend
|
|
23
|
+
|
|
24
|
+
from .net import validate_and_pin
|
|
25
|
+
|
|
26
|
+
# Default container image name handed to the session backend.
_DEFAULT_IMAGE = "ghcr.io/k3-mt/zu-render-chromium:latest"

# Keys copied, in this order, from a session response into the observation
# returned to the loop; keys whose value is None are left out.
_OBS_KEYS = (
    "status",
    "url",
    "text",
    "html",
    "content",
    "network",
    "action_error",
    "action_error_kind",
    "consent_dismissed",
)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class Browser:
    """Tier-2 tool that keeps one headless-browser session alive across calls.

    ``op=open`` leases a session and navigates; ``op=act`` / ``op=read`` talk to
    the held session; ``op=close`` releases it. Only one session is held at a
    time — opening again replaces the previous one.
    """

    name = "browser"
    tier = 2  # like render_dom — unlocked only after a detector escalates
    schema = {
        "name": "browser",
        "description": (
            "Drive a PERSISTENT headless browser across calls to work through a "
            "reactive, multi-step JS widget. op=open a url, then op=act / op=read "
            "repeatedly (the page state is held between calls), then op=close. "
            "Read the returned text after each step and decide the next action — "
            "if action_error comes back, the selector missed; try another."
        ),
        "parameters": {
            "type": "object",
            "properties": {
                "op": {"type": "string", "enum": ["open", "act", "read", "close"]},
                "url": {"type": "string", "description": "for op=open: the page to open"},
                "actions": {
                    "type": "array",
                    "description": "for op=act: actions run in order on the HELD page — "
                    "{click|fill|select|wait_for: <selector>, value?} | {wait_ms:<n>}. "
                    "A selector is CSS or a text= selector; target what you SEE. "
                    "For an AMBIGUOUS option (e.g. a '1'/'2'/'3' button that appears "
                    "many times), add \"near\": \"<label text>\" to a click — it picks "
                    "the matching control closest to that label, e.g. "
                    "{\"click\": \"1\", \"near\": \"Number of pets\"}.",
                    "items": {"type": "object"},
                },
                "wait_until": {
                    "type": "string",
                    "enum": ["load", "domcontentloaded", "networkidle", "commit"],
                    "description": "for op=open: when navigation is done (optional)",
                },
                "capture_network": {
                    "type": "boolean",
                    "description": "for op=open: capture XHR/JSON responses (the widget's data) "
                    "for the whole session (optional)",
                },
                "width": {"type": "integer"},
                "height": {"type": "integer"},
                "html": {"type": "boolean", "description": "also return raw html (optional)"},
            },
            "required": ["op"],
        },
    }
    prompt_fragment = (
        "browser(op=open|act|read|close, url?, actions?, capture_network?): a PERSISTENT "
        "headless browser. Open a url, then act/read step by step (state is kept) to drive "
        "a multi-step widget to the data you need; capture_network grabs the JSON it fetches."
    )
    capabilities = frozenset({CAP_NET, CAP_SANDBOX})
    egress = frozenset({EGRESS_OPEN})

    def __init__(
        self,
        backend: SessionBackend | None = None,
        image: str = _DEFAULT_IMAGE,
        *,
        allow_private: bool | None = None,
    ) -> None:
        """Store configuration; no session exists until ``op=open`` is called."""
        self._backend = backend
        self.image = image
        self.allow_private = allow_private
        # The live session, kept between calls for the duration of a run.
        self._session: BrowserSessionHandle | None = None

    def _resolve_backend(self) -> SessionBackend:
        """Return the backend, building the default Docker one on first need."""
        backend = self._backend
        if backend is None:
            from zu_backends.local_docker import LocalDockerBackend

            backend = self._backend = LocalDockerBackend()
        return backend

    async def __call__(
        self, ctx: Any, op: str, url: str | None = None, actions: list | None = None,
        wait_until: str | None = None, capture_network: bool = False,
        width: int | None = None, height: int | None = None, html: bool = False,
    ) -> dict:
        """Run one browser op and return the resulting observation or error dict."""
        if op == "close":
            await self._close_session()
            return {"closed": True}

        if op == "open":
            if not url:
                return {"error": "op=open requires a url"}
            return await self._do_open(
                url,
                wait_until=wait_until,
                capture_network=capture_network,
                width=width,
                height=height,
                html=html,
            )

        if op in ("act", "read"):
            return await self._do_step(op, actions=actions, html=html)

        return {"error": f"unknown op {op!r}; use open/act/read/close"}

    async def _do_open(
        self,
        url: str,
        *,
        wait_until: str | None,
        capture_network: bool,
        width: int | None,
        height: int | None,
        html: bool,
    ) -> dict:
        """Replace any held session with a fresh one and navigate it to ``url``."""
        # One session at a time: drop whatever was open before.
        await self._close_session()
        # Same SSRF backstop + DNS pin as render_dom, before leasing a browser.
        pinned_ip = validate_and_pin(url, allow_private=self.allow_private)
        spec: dict[str, Any] = {"image": self.image, "tier": self.tier, "network": True}
        host = urlsplit(url).hostname
        if host and pinned_ip is not None:
            spec["extra_hosts"] = {host: pinned_ip}
        self._session = await self._resolve_backend().open_session(spec)

        # Optional settings are forwarded only when truthy.
        command: dict[str, Any] = {"op": "open", "url": url}
        if wait_until:
            command["wait_until"] = wait_until
        if capture_network:
            command["capture_network"] = True
        for dimension, pixels in (("width", width), ("height", height)):
            if pixels:
                command[dimension] = int(pixels)
        if html:
            command["html"] = True
        return self._normalise(await self._session.send(command))

    async def _do_step(self, op: str, *, actions: list | None, html: bool) -> dict:
        """Send an ``act`` or ``read`` command to the held session."""
        if self._session is None:
            return {"error": "no open session; call browser(op=open, url=...) first"}
        command: dict[str, Any] = {"op": op}
        if actions and op == "act":
            command["actions"] = actions
        if html:
            command["html"] = True
        return self._normalise(await self._session.send(command))

    @staticmethod
    def _normalise(obs: Any) -> dict:
        """Shape a session response into the observation the loop stores.

        A non-dict response, or one carrying ``error`` without ``text``, becomes
        an error dict; otherwise the known content keys with a non-None value
        are copied next to ``rendered: True``.
        """
        if not isinstance(obs, dict):
            return {"error": "bad session response"}
        if "error" in obs and "text" not in obs:
            return {"error": obs["error"]}
        observation: dict[str, Any] = {"rendered": True}
        observation.update((key, obs[key]) for key in _OBS_KEYS if obs.get(key) is not None)
        return observation

    async def _close_session(self) -> None:
        """Release the held session, if any, ignoring errors raised by close."""
        session = self._session
        if session is None:
            return
        self._session = None
        try:
            await session.close()
        except Exception:  # noqa: BLE001 - teardown must not raise over a result
            pass

    async def aclose(self) -> None:
        """Close a lingering session — for run teardown so a container never leaks."""
        await self._close_session()
|