zu-tools 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- zu_tools-0.2.0/.gitignore +66 -0
- zu_tools-0.2.0/PKG-INFO +54 -0
- zu_tools-0.2.0/README.md +32 -0
- zu_tools-0.2.0/pyproject.toml +43 -0
- zu_tools-0.2.0/src/zu_tools/__init__.py +5 -0
- zu_tools-0.2.0/src/zu_tools/action_surface.py +429 -0
- zu_tools-0.2.0/src/zu_tools/browser.py +172 -0
- zu_tools-0.2.0/src/zu_tools/fetch.py +132 -0
- zu_tools-0.2.0/src/zu_tools/net.py +242 -0
- zu_tools-0.2.0/src/zu_tools/parse.py +47 -0
- zu_tools-0.2.0/src/zu_tools/pointer.py +317 -0
- zu_tools-0.2.0/src/zu_tools/recall.py +82 -0
- zu_tools-0.2.0/src/zu_tools/render.py +205 -0
- zu_tools-0.2.0/src/zu_tools/search.py +156 -0
- zu_tools-0.2.0/src/zu_tools/simulate.py +67 -0
- zu_tools-0.2.0/tests/test_action_surface.py +194 -0
- zu_tools-0.2.0/tests/test_browser.py +111 -0
- zu_tools-0.2.0/tests/test_fetch.py +92 -0
- zu_tools-0.2.0/tests/test_net.py +153 -0
- zu_tools-0.2.0/tests/test_parse.py +30 -0
- zu_tools-0.2.0/tests/test_pointer.py +142 -0
- zu_tools-0.2.0/tests/test_recall.py +59 -0
- zu_tools-0.2.0/tests/test_render.py +133 -0
- zu_tools-0.2.0/tests/test_search.py +82 -0
- zu_tools-0.2.0/tests/test_simulate.py +39 -0
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*.egg-info/
|
|
5
|
+
.eggs/
|
|
6
|
+
build/
|
|
7
|
+
dist/
|
|
8
|
+
|
|
9
|
+
# uv / venv
|
|
10
|
+
.venv/
|
|
11
|
+
uv.lock.bak
|
|
12
|
+
|
|
13
|
+
# Test / type caches
|
|
14
|
+
.pytest_cache/
|
|
15
|
+
.mypy_cache/
|
|
16
|
+
.ruff_cache/
|
|
17
|
+
.coverage
|
|
18
|
+
htmlcov/
|
|
19
|
+
|
|
20
|
+
# Zu runtime artifacts
|
|
21
|
+
*.db
|
|
22
|
+
zu.db
|
|
23
|
+
zu.yaml.local
|
|
24
|
+
zu_review.jsonl
|
|
25
|
+
*.review.jsonl
|
|
26
|
+
# Per-agent cost telemetry ledger — machine-local run history, not source.
|
|
27
|
+
cost.jsonl
|
|
28
|
+
# A recorded replay path is learned per-run and machine-local — regenerated on
|
|
29
|
+
# every successful run, not source. The agent ships; its track does not.
|
|
30
|
+
track.json
|
|
31
|
+
# …except the flagship example ships its track on purpose, as a demo of the
|
|
32
|
+
# record/replay convergence (committed; re-runs show as ordinary modifications).
|
|
33
|
+
!examples/agents/vet-appointment/track.json
|
|
34
|
+
|
|
35
|
+
# Editor / OS
|
|
36
|
+
.idea/
|
|
37
|
+
.vscode/
|
|
38
|
+
.DS_Store
|
|
39
|
+
|
|
40
|
+
# Claude Code local session state
|
|
41
|
+
.claude/
|
|
42
|
+
|
|
43
|
+
# Secrets
|
|
44
|
+
.env
|
|
45
|
+
.env.*
|
|
46
|
+
!.env.example
|
|
47
|
+
|
|
48
|
+
# Microsoft Office temp/lock files
|
|
49
|
+
~$*
|
|
50
|
+
|
|
51
|
+
# Internal design / strategy docs — kept local, never in the public repo
|
|
52
|
+
*.docx
|
|
53
|
+
*.pdf
|
|
54
|
+
# BUILD.md is the internal build-sequence / deferred-gaps ledger — kept local.
|
|
55
|
+
# (ARCHITECTURE.md is public: an onboarding agent needs the structural map.)
|
|
56
|
+
docs/BUILD.md
|
|
57
|
+
|
|
58
|
+
# Local secret — API key for live validation, never commit
|
|
59
|
+
zu_demo_key.md
|
|
60
|
+
*_key.md
|
|
61
|
+
|
|
62
|
+
# Local PyPI publish token — never commit
|
|
63
|
+
/pypi
|
|
64
|
+
|
|
65
|
+
# Local Discord credentials (bot token / app secrets) — never commit
|
|
66
|
+
/discord
|
zu_tools-0.2.0/PKG-INFO
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: zu-tools
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Zu built-in tools: web_search, http_fetch, html_parse, render_dom, browser
|
|
5
|
+
Project-URL: Homepage, https://github.com/k3-mt/zu
|
|
6
|
+
Project-URL: Repository, https://github.com/k3-mt/zu
|
|
7
|
+
License-Expression: Apache-2.0
|
|
8
|
+
Classifier: Development Status :: 4 - Beta
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
+
Classifier: Topic :: Software Development :: Libraries :: Application Frameworks
|
|
15
|
+
Classifier: Typing :: Typed
|
|
16
|
+
Requires-Python: >=3.11
|
|
17
|
+
Requires-Dist: httpx
|
|
18
|
+
Requires-Dist: selectolax
|
|
19
|
+
Requires-Dist: zu-backends==0.2.0
|
|
20
|
+
Requires-Dist: zu-core==0.2.0
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
|
|
23
|
+
# zu-tools
|
|
24
|
+
|
|
25
|
+
Tools — the **`Tool`** port: actions the model may take. A tool declares its
|
|
26
|
+
tier (the escalation ladder), its JSON `schema`, a `prompt_fragment`, and its
|
|
27
|
+
**capability envelope** (`capabilities` + `egress`) so its blast radius is
|
|
28
|
+
visible in its own code and the gate can bound it.
|
|
29
|
+
|
|
30
|
+
## Registered plugins (`zu.tools`)
|
|
31
|
+
|
|
32
|
+
| Name | Class | Tier | Envelope |
|
|
33
|
+
|------|-------|------|----------|
|
|
34
|
+
| `http_fetch` | `HttpFetch` | 1 | `CAP_NET`, open egress — a general web fetcher with a host-level SSRF guard (`net.check_url`). |
|
|
35
|
+
| `html_parse` | `HtmlParse` | 1 | none — pure CPU on HTML it is handed (least privilege). |
|
|
36
|
+
| `render_dom` | `RenderDom` | 2 | `CAP_NET` + `CAP_SANDBOX`, open egress — renders a URL in a headless browser inside a `SandboxBackend` (unlocked only after a detector escalates off tier 1). |
|
|
37
|
+
|
|
38
|
+
## The tier ladder
|
|
39
|
+
|
|
40
|
+
`http_fetch` and `html_parse` are tier 1 (cheap, offered from the start).
|
|
41
|
+
`render_dom` is tier 2 — the escalation target when a JavaScript page defeats
|
|
42
|
+
tier 1. The loop only offers tools at or below the current tier; a detector
|
|
43
|
+
`ESCALATE` climbs the ladder. The browser runs in a sandbox behind a seam tests
|
|
44
|
+
can freeze (a saved rendered page), so the escalation arc is proven offline.
|
|
45
|
+
|
|
46
|
+
## Extend
|
|
47
|
+
|
|
48
|
+
Implement the `Tool` shape (see [`AGENTS.md`](../../AGENTS.md) → *Recipe: add a
|
|
49
|
+
tool*), declare a minimal `capabilities`/`egress`, register under `zu.tools`, and
|
|
50
|
+
add a deterministic test (use an `httpx.MockTransport` to fixture the network).
|
|
51
|
+
|
|
52
|
+
## Tests
|
|
53
|
+
|
|
54
|
+
`uv run pytest packages/zu-tools` — offline; the network is fixtured.
|
zu_tools-0.2.0/README.md
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
# zu-tools
|
|
2
|
+
|
|
3
|
+
Tools — the **`Tool`** port: actions the model may take. A tool declares its
|
|
4
|
+
tier (the escalation ladder), its JSON `schema`, a `prompt_fragment`, and its
|
|
5
|
+
**capability envelope** (`capabilities` + `egress`) so its blast radius is
|
|
6
|
+
visible in its own code and the gate can bound it.
|
|
7
|
+
|
|
8
|
+
## Registered plugins (`zu.tools`)
|
|
9
|
+
|
|
10
|
+
| Name | Class | Tier | Envelope |
|
|
11
|
+
|------|-------|------|----------|
|
|
12
|
+
| `http_fetch` | `HttpFetch` | 1 | `CAP_NET`, open egress — a general web fetcher with a host-level SSRF guard (`net.check_url`). |
|
|
13
|
+
| `html_parse` | `HtmlParse` | 1 | none — pure CPU on HTML it is handed (least privilege). |
|
|
14
|
+
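| `render_dom` | `RenderDom` | 2 | `CAP_NET` + `CAP_SANDBOX`, open egress — renders a URL in a headless browser inside a `SandboxBackend` (unlocked only after a detector escalates off tier 1). |

The table above covers the tier-1/tier-2 fetch ladder only. `pyproject.toml` also registers `web_search`, `browser`, `recall`, `action_surface`, `pointer`, and `simulate` under `zu.tools`.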
|
|
15
|
+
|
|
16
|
+
## The tier ladder
|
|
17
|
+
|
|
18
|
+
`http_fetch` and `html_parse` are tier 1 (cheap, offered from the start).
|
|
19
|
+
`render_dom` is tier 2 — the escalation target when a JavaScript page defeats
|
|
20
|
+
tier 1. The loop only offers tools at or below the current tier; a detector
|
|
21
|
+
`ESCALATE` climbs the ladder. The browser runs in a sandbox behind a seam tests
|
|
22
|
+
can freeze (a saved rendered page), so the escalation arc is proven offline.
|
|
23
|
+
|
|
24
|
+
## Extend
|
|
25
|
+
|
|
26
|
+
Implement the `Tool` shape (see [`AGENTS.md`](../../AGENTS.md) → *Recipe: add a
|
|
27
|
+
tool*), declare a minimal `capabilities`/`egress`, register under `zu.tools`, and
|
|
28
|
+
add a deterministic test (use an `httpx.MockTransport` to fixture the network).
|
|
29
|
+
|
|
30
|
+
## Tests
|
|
31
|
+
|
|
32
|
+
`uv run pytest packages/zu-tools` — offline; the network is fixtured.
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "zu-tools"
|
|
3
|
+
version = "0.2.0"
|
|
4
|
+
description = "Zu built-in tools: web_search, http_fetch, html_parse, render_dom, browser"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.11"
|
|
7
|
+
license = "Apache-2.0"
|
|
8
|
+
classifiers = [
|
|
9
|
+
"Development Status :: 4 - Beta",
|
|
10
|
+
"Intended Audience :: Developers",
|
|
11
|
+
"License :: OSI Approved :: Apache Software License",
|
|
12
|
+
"Programming Language :: Python :: 3",
|
|
13
|
+
"Programming Language :: Python :: 3.11",
|
|
14
|
+
"Programming Language :: Python :: 3.12",
|
|
15
|
+
"Topic :: Software Development :: Libraries :: Application Frameworks",
|
|
16
|
+
"Typing :: Typed",
|
|
17
|
+
]
|
|
18
|
+
# zu-backends supplies render_dom's default sandbox (local-docker); it is
|
|
19
|
+
# imported lazily, so a tier-1-only run never touches it, but the default
|
|
20
|
+
# tier-2 render works out of the box once installed.
|
|
21
|
+
dependencies = ["zu-core==0.2.0", "httpx", "selectolax", "zu-backends==0.2.0"]
|
|
22
|
+
|
|
23
|
+
[project.entry-points."zu.tools"] # <- how a tool is registered
|
|
24
|
+
web_search = "zu_tools.search:WebSearch"
|
|
25
|
+
http_fetch = "zu_tools.fetch:HttpFetch"
|
|
26
|
+
browser = "zu_tools.browser:Browser"
|
|
27
|
+
recall = "zu_tools.recall:Recall"
|
|
28
|
+
html_parse = "zu_tools.parse:HtmlParse"
|
|
29
|
+
render_dom = "zu_tools.render:RenderDom"
|
|
30
|
+
action_surface = "zu_tools.action_surface:ActionSurface"
|
|
31
|
+
pointer = "zu_tools.pointer:PointerControl"
|
|
32
|
+
simulate = "zu_tools.simulate:Simulate"
|
|
33
|
+
|
|
34
|
+
[project.urls]
|
|
35
|
+
Homepage = "https://github.com/k3-mt/zu"
|
|
36
|
+
Repository = "https://github.com/k3-mt/zu"
|
|
37
|
+
|
|
38
|
+
[build-system]
|
|
39
|
+
requires = ["hatchling"]
|
|
40
|
+
build-backend = "hatchling.build"
|
|
41
|
+
|
|
42
|
+
[tool.hatch.build.targets.wheel]
|
|
43
|
+
packages = ["src/zu_tools"]
|
|
@@ -0,0 +1,429 @@
|
|
|
1
|
+
"""action_surface — the perception-reduction tool (tier 3, Engineering Design §11).
|
|
2
|
+
|
|
3
|
+
A rendered web page is a DOM of 100k–1M+ tokens; the decision the agent needs
|
|
4
|
+
from it — "click Place order" — is a handful. Pushing the whole blob through the
|
|
5
|
+
model is slow, expensive, and *worse for accuracy* (the signal drowns in
|
|
6
|
+
markup). The way out is a reframe: the agent almost never needs the page — it
|
|
7
|
+
needs the **set of things it can do** on the page. That set is a few dozen
|
|
8
|
+
affordances, a few hundred tokens.
|
|
9
|
+
|
|
10
|
+
This tool produces that set, **deterministically**. The decision rule (§4.5)
|
|
11
|
+
settles why it is a tool and not a model job: a script may *enumerate what is
|
|
12
|
+
possible* (every actionable element), but it must not *decide what is reasonable*
|
|
13
|
+
(which one to pick) — that is the policy's judgment. So the reducer surfaces the
|
|
14
|
+
possible and never ranks or prunes by guessed task-relevance.
|
|
15
|
+
|
|
16
|
+
The pipeline (§11.2), run over an accessibility tree rather than the raw DOM:
|
|
17
|
+
|
|
18
|
+
1. Walk the accessibility tree — roles, names, states — an order of magnitude
|
|
19
|
+
smaller than the DOM, built to answer "what can a user do here".
|
|
20
|
+
2. Filter to interactive + meaningful (actions, plus the headings/labels/errors
|
|
21
|
+
an action needs); drop the rest.
|
|
22
|
+
3. Prune the invisible — ignored, off-screen, zero-area, hidden.
|
|
23
|
+
4. Resolve a stable, human-meaningful label per element.
|
|
24
|
+
5. Assign a stable, opaque handle (a1, a2 …) that maps back, harness-side, to a
|
|
25
|
+
role+name locator. The model emits the handle, never a selector (§11.3).
|
|
26
|
+
6. Emit a compact, typed representation.
|
|
27
|
+
|
|
28
|
+
And the competence boundary (§11.4): the honest risk is a false negative —
|
|
29
|
+
pruning the one element the task needed (a canvas button, an unlabeled icon). So
|
|
30
|
+
the reducer must know when it is **blind** and *signal* escalation to tier-4
|
|
31
|
+
vision rather than silently return an incomplete surface. ``blind`` on the
|
|
32
|
+
result is that signal; the ``action-surface-blind`` detector turns it into an
|
|
33
|
+
ESCALATE. Graceful degradation, never silent incompleteness.
|
|
34
|
+
|
|
35
|
+
The deterministic reducer (:func:`reduce_surface`) is the whole value and is
|
|
36
|
+
pure — it runs on an accessibility-tree snapshot with no browser, which is how a
|
|
37
|
+
coding harness drives it offline and how it is tested at $0. The live arm asks a
|
|
38
|
+
browser session for the tree (:meth:`ActionSurface.__call__` with ``op=open``)
|
|
39
|
+
and runs the same reducer over it.
|
|
40
|
+
"""
|
|
41
|
+
|
|
42
|
+
from __future__ import annotations
|
|
43
|
+
|
|
44
|
+
from typing import Any
|
|
45
|
+
from urllib.parse import urlsplit
|
|
46
|
+
|
|
47
|
+
from pydantic import BaseModel, Field
|
|
48
|
+
|
|
49
|
+
from zu_core.ports import CAP_NET, CAP_SANDBOX, EGRESS_OPEN, BrowserSessionHandle, SessionBackend
|
|
50
|
+
|
|
51
|
+
from .net import validate_and_pin
|
|
52
|
+
|
|
53
|
+
_DEFAULT_IMAGE = "ghcr.io/k3-mt/zu-render-chromium:latest"
|
|
54
|
+
|
|
55
|
+
# Roles that represent something the agent can *do*. The list is generous on
|
|
56
|
+
# purpose — enumerating the possible is the job; choosing among it is the
|
|
57
|
+
# policy's. Anything actionable a real accessibility tree exposes belongs here.
|
|
58
|
+
INTERACTIVE_ROLES: frozenset[str] = frozenset({
|
|
59
|
+
"button", "link", "textbox", "searchbox", "combobox", "checkbox", "radio",
|
|
60
|
+
"switch", "slider", "spinbutton", "menuitem", "menuitemcheckbox",
|
|
61
|
+
"menuitemradio", "tab", "option", "textarea", "listbox", "menubutton",
|
|
62
|
+
"togglebutton", "datepicker", "colorwell",
|
|
63
|
+
})
|
|
64
|
+
|
|
65
|
+
# Roles whose *text* is meaningful context for choosing an action — headings
|
|
66
|
+
# orient, alerts/status carry the error and validation text an action needs —
|
|
67
|
+
# but which are not themselves actionable. We keep their names as context, never
|
|
68
|
+
# as affordances.
|
|
69
|
+
CONTEXT_ROLES: frozenset[str] = frozenset({
|
|
70
|
+
"heading", "alert", "status", "alertdialog", "log", "marquee",
|
|
71
|
+
})
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
class AxNode(BaseModel):
    """One normalised accessibility-tree node — the reducer's input currency.

    A small, serialisable shape so the reducer is pure and a harness can feed it
    a captured tree directly. :func:`normalize_axtree` produces these from the
    raw CDP ``Accessibility.getFullAXTree`` format.
    """

    # Identity and labelling inputs. The label is resolved from ``name``, then
    # ``placeholder``, then ``description`` (first non-blank wins).
    role: str
    name: str = ""
    value: str | None = None
    states: list[str] = Field(default_factory=list)
    placeholder: str | None = None
    description: str | None = None
    # Pruning inputs. ``visible`` folds in aria-hidden/display:none/off-screen;
    # ``ignored`` is the tree's own "not exposed" flag; ``bounds`` is [x,y,w,h].
    # A four-element ``bounds`` with non-positive width or height prunes the node.
    visible: bool = True
    ignored: bool = False
    bounds: list[float] | None = None
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
class Affordance(BaseModel):
    """One thing the policy can do, addressed by an opaque handle.

    :func:`reduce_surface` emits one of these for every kept node whose role is
    in ``INTERACTIVE_ROLES`` and that has a resolvable label.
    """

    handle: str  # opaque id assigned in input order: "a1", "a2", …
    role: str  # the node's accessibility role, copied unchanged
    label: str  # resolved label: name, else placeholder, else description
    value: str | None = None  # the node's current value, if it has one
    states: list[str] = Field(default_factory=list)  # copy of the node's state list
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
class Surface(BaseModel):
    """The compact, typed reduction of a page — a few hundred tokens.

    ``handle_map`` is the harness-side indirection (§11.3): handle → role+name
    locator. The model only ever sees and emits handles; the durable locator
    stays here and is re-resolved at action time.

    ``blind`` / ``blind_reason`` are set by :func:`reduce_surface` when the
    surface cannot be trusted to be complete.
    """

    title: str = ""
    url: str = ""
    affordances: list[Affordance] = Field(default_factory=list)  # actionable elements, input order
    context: list[str] = Field(default_factory=list)  # labels of CONTEXT_ROLES nodes, input order
    handle_map: dict[str, dict] = Field(default_factory=dict)  # handle -> {"role": ..., "name": ...}
    blind: bool = False
    blind_reason: str | None = None  # human-readable cause; None when not blind
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def _label_of(node: AxNode) -> str:
|
|
123
|
+
"""The stable, human-meaningful label (§11.2 step 4): accessible name first
|
|
124
|
+
(which already folds in aria-label and an associated <label>), then
|
|
125
|
+
placeholder, then description. Class soup never reaches here — if none of
|
|
126
|
+
these is set, the element is unlabeled and counts toward blindness."""
|
|
127
|
+
for candidate in (node.name, node.placeholder, node.description):
|
|
128
|
+
if candidate and candidate.strip():
|
|
129
|
+
return candidate.strip()
|
|
130
|
+
return ""
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def _is_pruned(node: AxNode) -> bool:
|
|
134
|
+
"""Step 3 — prune the invisible. ignored / not-visible / zero-area go."""
|
|
135
|
+
if node.ignored or not node.visible:
|
|
136
|
+
return True
|
|
137
|
+
if node.bounds is not None and len(node.bounds) == 4:
|
|
138
|
+
w, h = node.bounds[2], node.bounds[3]
|
|
139
|
+
if w <= 0 or h <= 0:
|
|
140
|
+
return True
|
|
141
|
+
return False
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def reduce_surface(
    nodes: list[AxNode],
    *,
    title: str = "",
    url: str = "",
    unlabeled_ratio: float = 0.5,
) -> Surface:
    """Reduce an accessibility tree to the action surface — pure, deterministic.

    Nodes are processed in input (document) order. Pruned nodes are dropped
    first; of the rest, context-role nodes contribute their label to
    ``context`` and interactive-role nodes with a label become affordances with
    handles ``a1, a2 …``, so the same tree always yields the same handles.
    Nodes of any other role are kept out of the output but still count as page
    content.

    The blind signal (§11.4) is raised when the surface cannot be trusted to be
    complete: some node survived pruning yet no affordance was produced, or the
    share of interactive nodes with no resolvable label is strictly greater
    than ``unlabeled_ratio``.
    """
    surviving = [candidate for candidate in nodes if not _is_pruned(candidate)]

    context_labels: list[str] = []
    actions: list[Affordance] = []
    locators: dict[str, dict] = {}
    interactive_total = 0
    unlabeled_total = 0

    for node in surviving:
        role = node.role
        is_context = role in CONTEXT_ROLES
        if not is_context and role not in INTERACTIVE_ROLES:
            # Neither context nor actionable: nothing to emit for this node.
            continue

        label = _label_of(node)
        if is_context:
            if label:
                context_labels.append(label)
            continue

        interactive_total += 1
        if not label:
            # Possible but unaddressable: counted as a blindness signal rather
            # than handed to the model as a meaningless handle.
            unlabeled_total += 1
            continue

        handle = f"a{len(actions) + 1}"
        actions.append(
            Affordance(
                handle=handle,
                role=role,
                label=label,
                value=node.value,
                states=list(node.states),
            )
        )
        # The durable locator the model never sees (role + accessible name).
        locators[handle] = {"role": role, "name": label}

    reason: str | None
    if surviving and not actions:
        reason = "page had content but the accessibility tree yielded no addressable actions"
    elif interactive_total and (unlabeled_total / interactive_total) > unlabeled_ratio:
        reason = (
            f"{unlabeled_total}/{interactive_total} interactive elements are unlabeled "
            "in the accessibility tree — too thin to trust"
        )
    else:
        reason = None

    return Surface(
        title=title,
        url=url,
        affordances=actions,
        context=context_labels,
        handle_map=locators,
        blind=reason is not None,
        blind_reason=reason,
    )
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
def _ax_string(field: Any) -> str:
|
|
224
|
+
"""Read a CDP AX value object ``{"type":...,"value":...}`` as a string."""
|
|
225
|
+
if isinstance(field, dict):
|
|
226
|
+
v = field.get("value")
|
|
227
|
+
return str(v) if v is not None else ""
|
|
228
|
+
return ""
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
def normalize_axtree(cdp_nodes: list[dict]) -> list[AxNode]:
    """Normalise the raw CDP ``Accessibility.getFullAXTree`` node list into
    :class:`AxNode` records, preserving the order of *cdp_nodes*.

    CDP shape per node: ``role``/``name``/``value``/``description`` are
    ``{type,value}`` objects on the node itself; ``properties`` is an optional
    list of ``{name, value:{value}}``; ``ignored`` is a bool.

    * Nodes that are not dicts, or that have no role, are skipped.
    * States surfaced (always in alphabetical order): checked, disabled,
      expanded, focused, invalid, required, selected. A boolean ``True`` or the
      string ``"true"`` yields the bare state name; any other non-empty string
      except ``"false"`` yields ``"<state>:<value>"`` (e.g. ``checked:mixed``).
    * ``placeholder`` and ``hidden`` are read from ``properties``; ``hidden``
      follows the same true/false reading as the states, so the string
      ``"false"`` does not hide a node.
    * ``bounds`` is not populated here.
    """
    # Sorted once, outside the loop, so every node reports states in one order.
    state_names = (
        "checked", "disabled", "expanded", "focused", "invalid", "required", "selected",
    )
    out: list[AxNode] = []
    for n in cdp_nodes:
        if not isinstance(n, dict):
            continue
        role = _ax_string(n.get("role"))
        if not role:
            continue

        # ``properties`` is optional and may arrive as an explicit null; treat
        # anything that is not a list/tuple as "no properties" instead of crashing.
        raw_props = n.get("properties")
        if not isinstance(raw_props, (list, tuple)):
            raw_props = ()
        props = {
            p["name"]: p.get("value", {})
            for p in raw_props
            if isinstance(p, dict) and isinstance(p.get("name"), str)
        }

        states: list[str] = []
        for sp in state_names:
            val = props.get(sp)
            v = val.get("value") if isinstance(val, dict) else None
            if v is True:
                states.append(sp)
            elif isinstance(v, str) and v not in ("false", ""):
                states.append(sp if v == "true" else f"{sp}:{v}")

        # ``hidden`` may be a real boolean or a "true"/"false" string; bool("false")
        # is True, so strings are compared explicitly rather than coerced.
        hidden_val = props.get("hidden")
        hidden_raw = hidden_val.get("value", False) if isinstance(hidden_val, dict) else False
        if isinstance(hidden_raw, str):
            hidden = hidden_raw not in ("false", "")
        else:
            hidden = bool(hidden_raw)

        out.append(
            AxNode(
                role=role,
                name=_ax_string(n.get("name")),
                value=_ax_string(n.get("value")) or None,
                states=states,
                placeholder=_ax_string(props.get("placeholder")) or None,
                description=_ax_string(n.get("description")) or None,
                # CDP marks unexposed nodes via ``ignored``; other invisibility
                # reaches the reducer only through the ``hidden`` property.
                ignored=bool(n.get("ignored", False)),
                visible=not hidden,
            )
        )
    return out
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
class ActionSurface:
    """Tier-3 tool: reduce a page to its action surface (and keep the handle map).

    Two ways in, one reducer:

    * ``op=reduce`` (default) — reduce a tree the caller already has. Pass
      ``nodes`` (AxNode dicts) or raw ``axtree`` (CDP nodes), plus ``title`` /
      ``url``. No browser, fully offline — the harness-driven and tested path.
    * ``op=open`` — open ``url`` in a headless browser session, ask it for the
      accessibility tree, and reduce that. The live arm.

    After a reduction the handle→locator map is held on the instance for the run;
    ``op=resolve`` returns the durable locator for a handle (a stale handle is an
    escalation, not a crash — the caller re-resolves at action time, §11.3).
    Starting an ``op=open`` discards the previous map together with the previous
    session, so handles from a page that is no longer open report as stale.
    """

    name = "action_surface"
    tier = 3  # the accessibility-tree tier; unlocked by a detector ESCALATE
    schema = {
        "name": "action_surface",
        "description": (
            "Reduce a web page to the compact SET OF THINGS YOU CAN DO on it — a "
            "flat list of affordances (button/link/textbox/…) each with an opaque "
            "handle (a1, a2 …) and a human label. You choose a handle and act on "
            "it; you never see or emit a CSS selector. op=open a url to capture and "
            "reduce its accessibility tree; op=resolve a handle to its locator. If "
            "'blind' is true the tree is too thin to trust — escalate to vision."
        ),
        "parameters": {
            "type": "object",
            "properties": {
                "op": {"type": "string", "enum": ["reduce", "open", "resolve"]},
                "url": {"type": "string", "description": "for op=open: the page to reduce"},
                "handle": {"type": "string", "description": "for op=resolve: the handle to resolve"},
                "axtree": {"type": "array", "items": {"type": "object"},
                           "description": "for op=reduce: raw CDP getFullAXTree nodes"},
                "nodes": {"type": "array", "items": {"type": "object"},
                          "description": "for op=reduce: pre-normalised AxNode dicts"},
                "title": {"type": "string"},
            },
            "required": ["op"],
        },
    }
    prompt_fragment = (
        "action_surface(op=open, url): reduce a page to a short list of affordances "
        "(handles a1,a2,… with labels) instead of reading the whole DOM. Pick a handle "
        "to act on; resolve(handle) gives its locator. 'blind' means escalate to vision."
    )
    capabilities = frozenset({CAP_NET, CAP_SANDBOX})
    egress = frozenset({EGRESS_OPEN})

    def __init__(
        self,
        backend: SessionBackend | None = None,
        image: str = _DEFAULT_IMAGE,
        *,
        allow_private: bool | None = None,
        unlabeled_ratio: float = 0.5,
    ) -> None:
        """Configure the tool; no session is opened until ``op=open``.

        Args:
            backend: session backend used by ``op=open``. When ``None`` a
                ``LocalDockerBackend`` is created lazily on first use.
            image: container image passed to the backend in the session spec.
            allow_private: forwarded unchanged to ``validate_and_pin``.
            unlabeled_ratio: blindness threshold forwarded to
                :func:`reduce_surface`.
        """
        self._backend = backend
        self.image = image
        self.allow_private = allow_private
        self.unlabeled_ratio = unlabeled_ratio
        self._handle_map: dict[str, dict] = {}
        self._session: BrowserSessionHandle | None = None

    def _resolve_backend(self) -> SessionBackend:
        """Return the configured backend, creating the default one on demand."""
        if self._backend is None:
            # Imported lazily so the offline reduce/resolve paths never need it.
            from zu_backends.local_docker import LocalDockerBackend

            self._backend = LocalDockerBackend()
        return self._backend

    async def __call__(
        self,
        ctx: Any,
        op: str = "reduce",
        url: str | None = None,
        handle: str | None = None,
        axtree: list | None = None,
        nodes: list | None = None,
        title: str | None = None,
    ) -> dict:
        """Dispatch on ``op`` and return an observation dict.

        Bad input (missing argument, unknown op, unknown handle) is reported as
        an ``{"error": ...}`` dict rather than raised. ``ctx`` is accepted but
        not used by this tool.
        """
        if op == "reduce":
            return self._reduce_op(nodes=nodes, axtree=axtree, title=title or "", url=url or "")

        if op == "resolve":
            if not handle:
                return {"error": "op=resolve requires a handle"}
            locator = self._handle_map.get(handle)
            if locator is None:
                # Stale/unknown handle: signal a re-resolve, never a crash (§11.3).
                return {"stale_handle": handle,
                        "error": f"handle {handle!r} is not on the current surface; re-capture"}
            return {"handle": handle, "locator": locator}

        if op == "open":
            if not url:
                return {"error": "op=open requires a url"}
            return await self._open_op(url, title or "")

        return {"error": f"unknown op {op!r}; use reduce/open/resolve"}

    def _reduce_op(self, *, nodes: list | None, axtree: list | None, title: str, url: str) -> dict:
        """Offline arm: reduce a caller-supplied tree.

        ``nodes`` (AxNode instances or dicts validated into them) takes
        precedence over ``axtree`` (raw CDP nodes; non-dict entries are dropped).
        With neither, an error dict is returned.
        """
        if nodes is not None:
            ax = [n if isinstance(n, AxNode) else AxNode.model_validate(n) for n in nodes]
        elif axtree is not None:
            ax = normalize_axtree([n for n in axtree if isinstance(n, dict)])
        else:
            return {"error": "op=reduce requires 'nodes' or 'axtree'"}
        surface = reduce_surface(ax, title=title, url=url, unlabeled_ratio=self.unlabeled_ratio)
        return self._emit(surface)

    async def _open_op(self, url: str, title: str) -> dict:
        """Live arm: open ``url`` in a fresh session, capture its tree, reduce it.

        Any previous session is closed first. The new session is kept on the
        instance after the call and released by the next ``op=open`` or by
        :meth:`aclose`.
        """
        await self._close_session()
        # The previous page went away with its session, so its handles must stop
        # resolving now. Otherwise a failed open below would leave op=resolve
        # returning locators for a surface that no longer exists.
        self._handle_map = {}
        pinned_ip = validate_and_pin(url, allow_private=self.allow_private)
        spec: dict[str, Any] = {"image": self.image, "tier": self.tier, "network": True}
        host = urlsplit(url).hostname
        if pinned_ip is not None and host:
            # Hand the validated address to the session so the host maps to it.
            spec["extra_hosts"] = {host: pinned_ip}
        self._session = await self._resolve_backend().open_session(spec)
        # Ask the session for the accessibility tree. The browser server returns
        # ``{axtree: [...CDP nodes...], title, url}``; an older server that lacks
        # the op returns an error, which we surface (not a crash).
        resp = await self._session.send({"op": "axtree", "url": url})
        if not isinstance(resp, dict) or resp.get("axtree") is None:
            err = resp.get("error") if isinstance(resp, dict) else "bad session response"
            return {"error": f"could not capture accessibility tree: {err}"}
        ax = normalize_axtree([n for n in resp["axtree"] if isinstance(n, dict)])
        surface = reduce_surface(
            ax,
            # A caller-supplied title wins; otherwise use what the session reported.
            title=title or str(resp.get("title", "")),
            url=str(resp.get("url", url)),
            unlabeled_ratio=self.unlabeled_ratio,
        )
        return self._emit(surface)

    def _emit(self, surface: Surface) -> dict:
        """The surface as a loop-friendly observation. The handle map is held on
        the instance (harness-side) and echoed for the harness; ``surface_blind``
        is the top-level flag the blind detector reads."""
        # Store a copy so op=resolve works from the instance's own dict.
        self._handle_map = dict(surface.handle_map)
        return {
            "action_surface": surface.model_dump(exclude={"handle_map"}),
            "handle_map": surface.handle_map,
            "surface_blind": surface.blind,
        }

    async def _close_session(self) -> None:
        """Close the held session, if any; a failing close is ignored."""
        if self._session is not None:
            # Detach before closing so a failing close cannot leave a dead
            # session referenced on the instance.
            session, self._session = self._session, None
            try:
                await session.close()
            except Exception:  # noqa: BLE001 — teardown must not raise over a result
                pass

    async def aclose(self) -> None:
        """Close any lingering session — for run teardown so a container never leaks."""
        await self._close_session()
|