aliax 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aliax-1.0.0/CHANGELOG.md +37 -0
- aliax-1.0.0/LICENSE +21 -0
- aliax-1.0.0/MANIFEST.in +5 -0
- aliax-1.0.0/PKG-INFO +265 -0
- aliax-1.0.0/README.md +207 -0
- aliax-1.0.0/aliax/__init__.py +22 -0
- aliax-1.0.0/aliax/_version.py +13 -0
- aliax-1.0.0/aliax/client.py +3581 -0
- aliax-1.0.0/aliax/dom-mapper.min.js +2 -0
- aliax-1.0.0/aliax/prompts.py +352 -0
- aliax-1.0.0/aliax/py.typed +0 -0
- aliax-1.0.0/aliax.egg-info/PKG-INFO +265 -0
- aliax-1.0.0/aliax.egg-info/SOURCES.txt +18 -0
- aliax-1.0.0/aliax.egg-info/dependency_links.txt +1 -0
- aliax-1.0.0/aliax.egg-info/requires.txt +7 -0
- aliax-1.0.0/aliax.egg-info/top_level.txt +1 -0
- aliax-1.0.0/pyproject.toml +67 -0
- aliax-1.0.0/setup.cfg +4 -0
- aliax-1.0.0/tests/test_async_context.py +24 -0
- aliax-1.0.0/tests/test_client.py +49 -0
aliax-1.0.0/CHANGELOG.md
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to the Aliax Python SDK are documented here. This
|
|
4
|
+
project follows [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
5
|
+
|
|
6
|
+
## [1.0.0] — 2026-06-20
|
|
7
|
+
|
|
8
|
+
First official PyPI release. The SDK has been hardened for enterprise
|
|
9
|
+
deployment (zipped wheels, serverless layers, long-running agents).
|
|
10
|
+
|
|
11
|
+
### Added
|
|
12
|
+
- **`async with Aliax()` support.** `__aenter__` / `__aexit__` close the
|
|
13
|
+
`httpx` connection pool deterministically — no more socket leaks when an
|
|
14
|
+
agent loop crashes without calling `aclose()`.
|
|
15
|
+
- **`ALIAX_API_KEY` environment variable fallback.** Constructor accepts
|
|
16
|
+
`api_key=None` and reads from the env, so 12-factor deployments don't
|
|
17
|
+
need to thread the key through code.
|
|
18
|
+
- **`py.typed` marker.** PEP 561 type-checker support — `mypy`, Pyright,
|
|
19
|
+
VS Code and PyCharm now consume the inline type hints.
|
|
20
|
+
- **`aliax.__version__` exported** and included in `__all__`.
|
|
21
|
+
- Structured `tests/` directory with `pytest` and a CI matrix
|
|
22
|
+
(Python 3.8 → 3.12) plus an sdist/wheel build check via `twine check`.
|
|
23
|
+
|
|
24
|
+
### Changed
|
|
25
|
+
- **PEP 621 packaging.** Migrated from `setup.py` to `pyproject.toml`
|
|
26
|
+
with `setuptools.build_meta`. Version is now read dynamically from
|
|
27
|
+
`aliax/_version.py` via `tool.setuptools.dynamic`.
|
|
28
|
+
- **Distribution renamed** from `aliax-sdk` to `aliax` so the install
|
|
29
|
+
name matches the import name (`pip install aliax` → `import aliax`).
|
|
30
|
+
- **Zip-safe bundled-asset loading.** `dom-mapper.min.js` is now read
|
|
31
|
+
via `importlib.resources` instead of `os.path.join(__file__, …)`, so
|
|
32
|
+
the SDK works inside zipped wheels, AWS Lambda layers, and Google
|
|
33
|
+
Cloud Run container images.
|
|
34
|
+
|
|
35
|
+
### Added (legal)
|
|
36
|
+
- MIT `LICENSE` file in the package root and an MIT Trove classifier on the
|
|
37
|
+
distribution metadata.
|
aliax-1.0.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Aliax
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
|
13
|
+
all copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
21
|
+
THE SOFTWARE.
|
aliax-1.0.0/MANIFEST.in
ADDED
aliax-1.0.0/PKG-INFO
ADDED
|
@@ -0,0 +1,265 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: aliax
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Aliax SDK — real-time interceptor middleware for visual web agents (Set-of-Mark + deterministic execution).
|
|
5
|
+
Author-email: Aliax <hello@aliax.xyz>
|
|
6
|
+
License: MIT License
|
|
7
|
+
|
|
8
|
+
Copyright (c) 2026 Aliax
|
|
9
|
+
|
|
10
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
11
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
12
|
+
in the Software without restriction, including without limitation the rights
|
|
13
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
14
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
15
|
+
furnished to do so, subject to the following conditions:
|
|
16
|
+
|
|
17
|
+
The above copyright notice and this permission notice shall be included in
|
|
18
|
+
all copies or substantial portions of the Software.
|
|
19
|
+
|
|
20
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
21
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
22
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
23
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
24
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
25
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
26
|
+
THE SOFTWARE.
|
|
27
|
+
|
|
28
|
+
Project-URL: Homepage, https://aliax.xyz
|
|
29
|
+
Project-URL: Documentation, https://aliax.xyz/docs
|
|
30
|
+
Project-URL: Source, https://github.com/aliax/aliax-sdk
|
|
31
|
+
Project-URL: Tracker, https://github.com/aliax/aliax-sdk/issues
|
|
32
|
+
Project-URL: Changelog, https://github.com/aliax/aliax-sdk/blob/main/CHANGELOG.md
|
|
33
|
+
Keywords: ai,agents,web-agents,playwright,vlm,set-of-mark,browser-automation,llm-tools
|
|
34
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
35
|
+
Classifier: Intended Audience :: Developers
|
|
36
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
37
|
+
Classifier: Operating System :: OS Independent
|
|
38
|
+
Classifier: Programming Language :: Python
|
|
39
|
+
Classifier: Programming Language :: Python :: 3
|
|
40
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
41
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
42
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
43
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
44
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
45
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
46
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
47
|
+
Classifier: Typing :: Typed
|
|
48
|
+
Requires-Python: >=3.8
|
|
49
|
+
Description-Content-Type: text/markdown
|
|
50
|
+
License-File: LICENSE
|
|
51
|
+
Requires-Dist: httpx<2,>=0.24
|
|
52
|
+
Requires-Dist: playwright<2,>=1.30
|
|
53
|
+
Provides-Extra: test
|
|
54
|
+
Requires-Dist: pytest>=7; extra == "test"
|
|
55
|
+
Requires-Dist: pytest-asyncio>=0.21; extra == "test"
|
|
56
|
+
Requires-Dist: respx>=0.20; extra == "test"
|
|
57
|
+
Dynamic: license-file
|
|
58
|
+
|
|
59
|
+
# Aliax SDK
|
|
60
|
+
|
|
61
|
+
[PyPI version](https://pypi.org/project/aliax/)
|
|
62
|
+
[Supported Python versions](https://pypi.org/project/aliax/)
|
|
63
|
+
[Tests](https://github.com/aliax/aliax-sdk/actions/workflows/test.yml)
|
|
64
|
+
[License: MIT](LICENSE)
|
|
65
|
+
|
|
66
|
+
**Aliax** is the real-time interceptor middleware for visual web agents.
|
|
67
|
+
Translate the live page into a numbered Set-of-Mark image, hand it to
|
|
68
|
+
your VLM, and let Aliax execute the action it picks — clicks, types,
|
|
69
|
+
scrolls, the works — with deterministic accuracy.
|
|
70
|
+
|
|
71
|
+
When the agent still gets stuck, ship the failure to the offline
|
|
72
|
+
annotation queue with `capture_failure()`.
|
|
73
|
+
|
|
74
|
+
**Current version:** `1.0.0` · **Python:** 3.8 – 3.12 · [Changelog](CHANGELOG.md) · [Contributing](CONTRIBUTING.md)
|
|
75
|
+
|
|
76
|
+
## Installation
|
|
77
|
+
|
|
78
|
+
```bash
|
|
79
|
+
pip install aliax
|
|
80
|
+
playwright install chromium
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
## The 3-line agent loop
|
|
84
|
+
|
|
85
|
+
```python
|
|
86
|
+
from aliax import Aliax
|
|
87
|
+
from playwright.async_api import async_playwright
|
|
88
|
+
|
|
89
|
+
aliax = Aliax(api_key="sk_live_...") # mint one at /api-keys
|
|
90
|
+
|
|
91
|
+
async with async_playwright() as p:
|
|
92
|
+
page = await (await p.chromium.launch()).new_page()
|
|
93
|
+
await page.goto("https://shop.example.com/cart")
|
|
94
|
+
|
|
95
|
+
# 1. Translate the live DOM into a VLM-friendly Set-of-Mark.
|
|
96
|
+
ctx = await aliax.parse_ui(page)
|
|
97
|
+
|
|
98
|
+
# 2. Ask your VLM. It sees a numbered screenshot + a tiny JSON map.
|
|
99
|
+
decision = await ask_llm(image=ctx.image_bytes, map=ctx.elements)
|
|
100
|
+
# e.g. {"action": "CLICK", "element_id": "el_41"}
|
|
101
|
+
|
|
102
|
+
# 3. Aliax executes it natively — scroll-into-view, React onChange,
|
|
103
|
+
# iframe coords, retina DPR, all handled.
|
|
104
|
+
await aliax.execute(page, decision)
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
That's the entire happy path. No XPath wrangling, no Playwright
|
|
108
|
+
boilerplate, no pixel guessing.
|
|
109
|
+
|
|
110
|
+
## The safety net — `capture_failure()`
|
|
111
|
+
|
|
112
|
+
When the agent loops on a popup or hallucinates past recovery, snapshot
|
|
113
|
+
the full state and ship it to the Aliax annotation queue for human
|
|
114
|
+
correction:
|
|
115
|
+
|
|
116
|
+
```python
|
|
117
|
+
await aliax.capture_failure(
|
|
118
|
+
page,
|
|
119
|
+
goal="Close newsletter popup",
|
|
120
|
+
thoughts=agent.current_reasoning,
|
|
121
|
+
last_attempted_action={"action": "CLICK", "element_id": "el_41"},
|
|
122
|
+
failure_reason="modal_blocked",
|
|
123
|
+
step=agent.loop_step,
|
|
124
|
+
)
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
Each failure becomes a (negative, positive) DPO pair — the agent's
|
|
128
|
+
"stupid move" + the annotator's corrected tap — ready for fine-tuning.
|
|
129
|
+
|
|
130
|
+
## `parse_ui()` — what you get back
|
|
131
|
+
|
|
132
|
+
```python
|
|
133
|
+
ctx = await aliax.parse_ui(page)
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
| Attribute | Type | Description |
|
|
137
|
+
|------------------|-----------------|-------------|
|
|
138
|
+
| `image_bytes` | `bytes` | Native browser screenshot (JPEG by default — see `render_config`) with numbered Set-of-Mark boxes painted by the bundled JS overlay. Zero Python image processing. |
|
|
139
|
+
| `image_mime` | `str` | `"image/jpeg"` by default, `"image/png"` if you opt in. |
|
|
140
|
+
| `image_size` | `(int, int)` | `(width, height)` in physical pixels, parsed from the image header. |
|
|
141
|
+
| `elements` | `list[dict]` | Each element: `element_id`, `tag`, `role`, `text`, `bounds`, `editable`, `is_canvas`, `attrs`. |
|
|
142
|
+
| `viewport` | `dict` | `{width, height, dpr, scroll_x, scroll_y}` in CSS px. |
|
|
143
|
+
| `url` | `str` | Page URL at capture time. |
|
|
144
|
+
|
|
145
|
+
### Power-user: shrink your VLM bill with `render_config`
|
|
146
|
+
|
|
147
|
+
VLM providers bill image tokens by file weight + dimensions. JPEG at
|
|
148
|
+
`quality=40` costs roughly **10× fewer tokens** than lossless PNG
|
|
149
|
+
output. The Set-of-Mark IDs stay readable because they're rendered as
|
|
150
|
+
crisp DOM text by Chromium *before* the JPEG encoder runs.
|
|
151
|
+
|
|
152
|
+
```python
|
|
153
|
+
# Default — pristine JPEG at q=80.
|
|
154
|
+
ctx = await aliax.parse_ui(page)
|
|
155
|
+
|
|
156
|
+
# Token-optimised — same readable IDs, ~10× cheaper per VLM call.
|
|
157
|
+
ctx = await aliax.parse_ui(
|
|
158
|
+
page,
|
|
159
|
+
render_config={"format": "jpeg", "quality": 40},
|
|
160
|
+
)
|
|
161
|
+
|
|
162
|
+
# Lossless when you need it (research / fine-tuning datasets).
|
|
163
|
+
ctx = await aliax.parse_ui(
|
|
164
|
+
page,
|
|
165
|
+
render_config={"format": "png"},
|
|
166
|
+
)
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
Quality is clamped to `[30, 100]` so a typo like `quality=5` can't turn
|
|
170
|
+
the numbered boxes into illegible mush and crash your agent loop.
|
|
171
|
+
|
|
172
|
+
Convenience helper:
|
|
173
|
+
|
|
174
|
+
```python
|
|
175
|
+
prompt = f"""
|
|
176
|
+
{ctx.llm_text_block()}
|
|
177
|
+
Pick the element_id to click. Reply JSON {{action, element_id}}.
|
|
178
|
+
"""
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
## `execute()` — the action verbs
|
|
182
|
+
|
|
183
|
+
```python
|
|
184
|
+
await aliax.execute(page, {"action": "CLICK", "element_id": "el_41"})
|
|
185
|
+
await aliax.execute(page, {"action": "TYPE", "element_id": "el_7", "value": "Nike Shoes"})
|
|
186
|
+
await aliax.execute(page, {"action": "HOVER", "element_id": "el_14"})
|
|
187
|
+
await aliax.execute(page, {"action": "PRESS", "element_id": "el_7", "key": "Enter"})
|
|
188
|
+
await aliax.execute(page, {"action": "SCROLL", "dy": 600})
|
|
189
|
+
await aliax.execute(page, {"action": "NAVIGATE", "url": "https://..."})
|
|
190
|
+
await aliax.execute(page, {"action": "WAIT", "ms": 1500})
|
|
191
|
+
await aliax.execute(page, {"action": "DONE"})
|
|
192
|
+
```
|
|
193
|
+
|
|
194
|
+
Raw coordinates are also accepted as an escape hatch:
|
|
195
|
+
```python
|
|
196
|
+
await aliax.execute(page, {"action": "CLICK", "x": 905, "y": 150})
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
Under the hood, `execute()` scrolls the target into view, fires
|
|
200
|
+
React-friendly events (focus → keystrokes with a 50ms cadence), and
|
|
201
|
+
returns a small dict like `{"ok": True, "coords": [905, 150]}`. It
|
|
202
|
+
never raises — your agent loop stays alive.
|
|
203
|
+
|
|
204
|
+
## How it works under the hood
|
|
205
|
+
|
|
206
|
+
1. The bundled DOM mapper (`dom-mapper.min.js`, shipped inside the
|
|
207
|
+
wheel) traverses the live page including shadow DOMs and same-origin
|
|
208
|
+
iframes. It returns only **truly interactable** elements with a
|
|
209
|
+
minimum size of 12 CSS pixels — no clutter, no hallucinated boxes.
|
|
210
|
+
2. The same JS module paints a `position:fixed; pointer-events:none;
|
|
211
|
+
contain:strict` overlay of numbered colored boxes at the top of the
|
|
212
|
+
z-stack. Chromium renders the boxes in microseconds and the host
|
|
213
|
+
page's layout / hover / IntersectionObserver state is untouched.
|
|
214
|
+
3. Playwright snaps **one** native screenshot via its C++ CDP path at
|
|
215
|
+
the format / quality you asked for in `render_config`. No Python
|
|
216
|
+
image processing, no Pillow dependency.
|
|
217
|
+
4. The overlay layer is torn down in a `try/finally` so a crash mid-
|
|
218
|
+
capture still leaves the live page exactly as we found it.
|
|
219
|
+
5. Coordinates are cached so `execute(page, {"element_id": "el_41"})`
|
|
220
|
+
resolves instantly against the most recent `parse_ui()`.
|
|
221
|
+
6. Every call fires a non-blocking telemetry ping for usage tracking.
|
|
222
|
+
|
|
223
|
+
## What's new in v1.0.0 — first PyPI release
|
|
224
|
+
|
|
225
|
+
The 1.0.0 cut is the first official PyPI release. The SDK has been
|
|
226
|
+
hardened for enterprise deployment — zipped wheels, serverless layers,
|
|
227
|
+
long-running agents — and the install name now matches the import
|
|
228
|
+
(`pip install aliax` → `import aliax`).
|
|
229
|
+
|
|
230
|
+
- **`async with Aliax() as aliax:`** — full async context manager support
|
|
231
|
+
so the `httpx` connection pool is torn down deterministically even if
|
|
232
|
+
the agent loop crashes. No more socket leaks in 24/7 bots.
|
|
233
|
+
- **`ALIAX_API_KEY` env-var fallback** — `Aliax()` with no args reads
|
|
234
|
+
the key from the environment; 12-factor Docker / Lambda / GitHub
|
|
235
|
+
Actions deployments don't have to thread it through code.
|
|
236
|
+
- **Zip-safe asset loading** — `dom-mapper.min.js` is read via
|
|
237
|
+
`importlib.resources`, so the SDK runs inside AWS Lambda layers and
|
|
238
|
+
Google Cloud Run container images where `__file__` is a virtual path.
|
|
239
|
+
- **PEP 561 typed** — ships `py.typed` so `mypy`, Pyright, VS Code, and
|
|
240
|
+
PyCharm consume the inline type hints out of the box.
|
|
241
|
+
- **PEP 621 packaging** — pure `pyproject.toml`, dynamic version pulled
|
|
242
|
+
from `aliax/_version.py`, MIT-licensed, classified as
|
|
243
|
+
`Development Status :: 5 - Production/Stable`.
|
|
244
|
+
- **`render_config={"format", "quality"}`** — the only knob AI engineers
|
|
245
|
+
get for VLM token economics. Defaults are pristine JPEG at
|
|
246
|
+
`quality=80`; drop to `quality=40` for roughly 10× cheaper VLM calls
|
|
247
|
+
without sacrificing Set-of-Mark ID legibility. Quality is clamped to
|
|
248
|
+
`[30, 100]` so a typo can't crash the agent.
|
|
249
|
+
- **Browser-native overlay** — Set-of-Mark boxes are painted by the
|
|
250
|
+
bundled `dom-mapper.min.js` as a `position:fixed; pointer-events:none;
|
|
251
|
+
contain:strict` DOM layer. Chromium renders them in microseconds;
|
|
252
|
+
Playwright snaps the screenshot via its native C++ CDP path. The SDK
|
|
253
|
+
never touches the pixel buffer — no Pillow, no OpenCV, no Cairo.
|
|
254
|
+
- **`parse_ui(page)` / `execute(page, decision)` / `capture_failure(...)`**
|
|
255
|
+
— the canonical 3-call agent loop, with a typed `ParseContext`,
|
|
256
|
+
`AttemptedAction` dataclass, and the full action catalog
|
|
257
|
+
(`CLICK`, `TYPE`, `TYPE_AND_ENTER`, `HOVER`, `SCROLL_{DOWN,UP,LEFT,RIGHT}`).
|
|
258
|
+
- **Hardened transport** — bearer-only auth, idempotency keys on
|
|
259
|
+
`capture_failure`, race-safe HTTP client init, non-blocking usage
|
|
260
|
+
telemetry, automatic `*.workers.dev` fallback if the apex DNS / TLS
|
|
261
|
+
flakes, opt-out via `fallback_endpoint=""`.
|
|
262
|
+
|
|
263
|
+
See [CHANGELOG.md](CHANGELOG.md) for the full changelog.
|
|
264
|
+
|
|
265
|
+
|
aliax-1.0.0/README.md
ADDED
|
@@ -0,0 +1,207 @@
|
|
|
1
|
+
# Aliax SDK
|
|
2
|
+
|
|
3
|
+
[PyPI version](https://pypi.org/project/aliax/)
|
|
4
|
+
[Supported Python versions](https://pypi.org/project/aliax/)
|
|
5
|
+
[Tests](https://github.com/aliax/aliax-sdk/actions/workflows/test.yml)
|
|
6
|
+
[License: MIT](LICENSE)
|
|
7
|
+
|
|
8
|
+
**Aliax** is the real-time interceptor middleware for visual web agents.
|
|
9
|
+
Translate the live page into a numbered Set-of-Mark image, hand it to
|
|
10
|
+
your VLM, and let Aliax execute the action it picks — clicks, types,
|
|
11
|
+
scrolls, the works — with deterministic accuracy.
|
|
12
|
+
|
|
13
|
+
When the agent still gets stuck, ship the failure to the offline
|
|
14
|
+
annotation queue with `capture_failure()`.
|
|
15
|
+
|
|
16
|
+
**Current version:** `1.0.0` · **Python:** 3.8 – 3.12 · [Changelog](CHANGELOG.md) · [Contributing](CONTRIBUTING.md)
|
|
17
|
+
|
|
18
|
+
## Installation
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
pip install aliax
|
|
22
|
+
playwright install chromium
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
## The 3-line agent loop
|
|
26
|
+
|
|
27
|
+
```python
|
|
28
|
+
from aliax import Aliax
|
|
29
|
+
from playwright.async_api import async_playwright
|
|
30
|
+
|
|
31
|
+
aliax = Aliax(api_key="sk_live_...") # mint one at /api-keys
|
|
32
|
+
|
|
33
|
+
async with async_playwright() as p:
|
|
34
|
+
page = await (await p.chromium.launch()).new_page()
|
|
35
|
+
await page.goto("https://shop.example.com/cart")
|
|
36
|
+
|
|
37
|
+
# 1. Translate the live DOM into a VLM-friendly Set-of-Mark.
|
|
38
|
+
ctx = await aliax.parse_ui(page)
|
|
39
|
+
|
|
40
|
+
# 2. Ask your VLM. It sees a numbered screenshot + a tiny JSON map.
|
|
41
|
+
decision = await ask_llm(image=ctx.image_bytes, map=ctx.elements)
|
|
42
|
+
# e.g. {"action": "CLICK", "element_id": "el_41"}
|
|
43
|
+
|
|
44
|
+
# 3. Aliax executes it natively — scroll-into-view, React onChange,
|
|
45
|
+
# iframe coords, retina DPR, all handled.
|
|
46
|
+
await aliax.execute(page, decision)
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
That's the entire happy path. No XPath wrangling, no Playwright
|
|
50
|
+
boilerplate, no pixel guessing.
|
|
51
|
+
|
|
52
|
+
## The safety net — `capture_failure()`
|
|
53
|
+
|
|
54
|
+
When the agent loops on a popup or hallucinates past recovery, snapshot
|
|
55
|
+
the full state and ship it to the Aliax annotation queue for human
|
|
56
|
+
correction:
|
|
57
|
+
|
|
58
|
+
```python
|
|
59
|
+
await aliax.capture_failure(
|
|
60
|
+
page,
|
|
61
|
+
goal="Close newsletter popup",
|
|
62
|
+
thoughts=agent.current_reasoning,
|
|
63
|
+
last_attempted_action={"action": "CLICK", "element_id": "el_41"},
|
|
64
|
+
failure_reason="modal_blocked",
|
|
65
|
+
step=agent.loop_step,
|
|
66
|
+
)
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
Each failure becomes a (negative, positive) DPO pair — the agent's
|
|
70
|
+
"stupid move" + the annotator's corrected tap — ready for fine-tuning.
|
|
71
|
+
|
|
72
|
+
## `parse_ui()` — what you get back
|
|
73
|
+
|
|
74
|
+
```python
|
|
75
|
+
ctx = await aliax.parse_ui(page)
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
| Attribute | Type | Description |
|
|
79
|
+
|------------------|-----------------|-------------|
|
|
80
|
+
| `image_bytes` | `bytes` | Native browser screenshot (JPEG by default — see `render_config`) with numbered Set-of-Mark boxes painted by the bundled JS overlay. Zero Python image processing. |
|
|
81
|
+
| `image_mime` | `str` | `"image/jpeg"` by default, `"image/png"` if you opt in. |
|
|
82
|
+
| `image_size` | `(int, int)` | `(width, height)` in physical pixels, parsed from the image header. |
|
|
83
|
+
| `elements` | `list[dict]` | Each element: `element_id`, `tag`, `role`, `text`, `bounds`, `editable`, `is_canvas`, `attrs`. |
|
|
84
|
+
| `viewport` | `dict` | `{width, height, dpr, scroll_x, scroll_y}` in CSS px. |
|
|
85
|
+
| `url` | `str` | Page URL at capture time. |
|
|
86
|
+
|
|
87
|
+
### Power-user: shrink your VLM bill with `render_config`
|
|
88
|
+
|
|
89
|
+
VLM providers bill image tokens by file weight + dimensions. JPEG at
|
|
90
|
+
`quality=40` costs roughly **10× fewer tokens** than lossless PNG
|
|
91
|
+
output. The Set-of-Mark IDs stay readable because they're rendered as
|
|
92
|
+
crisp DOM text by Chromium *before* the JPEG encoder runs.
|
|
93
|
+
|
|
94
|
+
```python
|
|
95
|
+
# Default — pristine JPEG at q=80.
|
|
96
|
+
ctx = await aliax.parse_ui(page)
|
|
97
|
+
|
|
98
|
+
# Token-optimised — same readable IDs, ~10× cheaper per VLM call.
|
|
99
|
+
ctx = await aliax.parse_ui(
|
|
100
|
+
page,
|
|
101
|
+
render_config={"format": "jpeg", "quality": 40},
|
|
102
|
+
)
|
|
103
|
+
|
|
104
|
+
# Lossless when you need it (research / fine-tuning datasets).
|
|
105
|
+
ctx = await aliax.parse_ui(
|
|
106
|
+
page,
|
|
107
|
+
render_config={"format": "png"},
|
|
108
|
+
)
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
Quality is clamped to `[30, 100]` so a typo like `quality=5` can't turn
|
|
112
|
+
the numbered boxes into illegible mush and crash your agent loop.
|
|
113
|
+
|
|
114
|
+
Convenience helper:
|
|
115
|
+
|
|
116
|
+
```python
|
|
117
|
+
prompt = f"""
|
|
118
|
+
{ctx.llm_text_block()}
|
|
119
|
+
Pick the element_id to click. Reply JSON {{action, element_id}}.
|
|
120
|
+
"""
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
## `execute()` — the action verbs
|
|
124
|
+
|
|
125
|
+
```python
|
|
126
|
+
await aliax.execute(page, {"action": "CLICK", "element_id": "el_41"})
|
|
127
|
+
await aliax.execute(page, {"action": "TYPE", "element_id": "el_7", "value": "Nike Shoes"})
|
|
128
|
+
await aliax.execute(page, {"action": "HOVER", "element_id": "el_14"})
|
|
129
|
+
await aliax.execute(page, {"action": "PRESS", "element_id": "el_7", "key": "Enter"})
|
|
130
|
+
await aliax.execute(page, {"action": "SCROLL", "dy": 600})
|
|
131
|
+
await aliax.execute(page, {"action": "NAVIGATE", "url": "https://..."})
|
|
132
|
+
await aliax.execute(page, {"action": "WAIT", "ms": 1500})
|
|
133
|
+
await aliax.execute(page, {"action": "DONE"})
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
Raw coordinates are also accepted as an escape hatch:
|
|
137
|
+
```python
|
|
138
|
+
await aliax.execute(page, {"action": "CLICK", "x": 905, "y": 150})
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
Under the hood, `execute()` scrolls the target into view, fires
|
|
142
|
+
React-friendly events (focus → keystrokes with a 50ms cadence), and
|
|
143
|
+
returns a small dict like `{"ok": True, "coords": [905, 150]}`. It
|
|
144
|
+
never raises — your agent loop stays alive.
|
|
145
|
+
|
|
146
|
+
## How it works under the hood
|
|
147
|
+
|
|
148
|
+
1. The bundled DOM mapper (`dom-mapper.min.js`, shipped inside the
|
|
149
|
+
wheel) traverses the live page including shadow DOMs and same-origin
|
|
150
|
+
iframes. It returns only **truly interactable** elements with a
|
|
151
|
+
minimum size of 12 CSS pixels — no clutter, no hallucinated boxes.
|
|
152
|
+
2. The same JS module paints a `position:fixed; pointer-events:none;
|
|
153
|
+
contain:strict` overlay of numbered colored boxes at the top of the
|
|
154
|
+
z-stack. Chromium renders the boxes in microseconds and the host
|
|
155
|
+
page's layout / hover / IntersectionObserver state is untouched.
|
|
156
|
+
3. Playwright snaps **one** native screenshot via its C++ CDP path at
|
|
157
|
+
the format / quality you asked for in `render_config`. No Python
|
|
158
|
+
image processing, no Pillow dependency.
|
|
159
|
+
4. The overlay layer is torn down in a `try/finally` so a crash mid-
|
|
160
|
+
capture still leaves the live page exactly as we found it.
|
|
161
|
+
5. Coordinates are cached so `execute(page, {"element_id": "el_41"})`
|
|
162
|
+
resolves instantly against the most recent `parse_ui()`.
|
|
163
|
+
6. Every call fires a non-blocking telemetry ping for usage tracking.
|
|
164
|
+
|
|
165
|
+
## What's new in v1.0.0 — first PyPI release
|
|
166
|
+
|
|
167
|
+
The 1.0.0 cut is the first official PyPI release. The SDK has been
|
|
168
|
+
hardened for enterprise deployment — zipped wheels, serverless layers,
|
|
169
|
+
long-running agents — and the install name now matches the import
|
|
170
|
+
(`pip install aliax` → `import aliax`).
|
|
171
|
+
|
|
172
|
+
- **`async with Aliax() as aliax:`** — full async context manager support
|
|
173
|
+
so the `httpx` connection pool is torn down deterministically even if
|
|
174
|
+
the agent loop crashes. No more socket leaks in 24/7 bots.
|
|
175
|
+
- **`ALIAX_API_KEY` env-var fallback** — `Aliax()` with no args reads
|
|
176
|
+
the key from the environment; 12-factor Docker / Lambda / GitHub
|
|
177
|
+
Actions deployments don't have to thread it through code.
|
|
178
|
+
- **Zip-safe asset loading** — `dom-mapper.min.js` is read via
|
|
179
|
+
`importlib.resources`, so the SDK runs inside AWS Lambda layers and
|
|
180
|
+
Google Cloud Run container images where `__file__` is a virtual path.
|
|
181
|
+
- **PEP 561 typed** — ships `py.typed` so `mypy`, Pyright, VS Code, and
|
|
182
|
+
PyCharm consume the inline type hints out of the box.
|
|
183
|
+
- **PEP 621 packaging** — pure `pyproject.toml`, dynamic version pulled
|
|
184
|
+
from `aliax/_version.py`, MIT-licensed, classified as
|
|
185
|
+
`Development Status :: 5 - Production/Stable`.
|
|
186
|
+
- **`render_config={"format", "quality"}`** — the only knob AI engineers
|
|
187
|
+
get for VLM token economics. Defaults are pristine JPEG at
|
|
188
|
+
`quality=80`; drop to `quality=40` for roughly 10× cheaper VLM calls
|
|
189
|
+
without sacrificing Set-of-Mark ID legibility. Quality is clamped to
|
|
190
|
+
`[30, 100]` so a typo can't crash the agent.
|
|
191
|
+
- **Browser-native overlay** — Set-of-Mark boxes are painted by the
|
|
192
|
+
bundled `dom-mapper.min.js` as a `position:fixed; pointer-events:none;
|
|
193
|
+
contain:strict` DOM layer. Chromium renders them in microseconds;
|
|
194
|
+
Playwright snaps the screenshot via its native C++ CDP path. The SDK
|
|
195
|
+
never touches the pixel buffer — no Pillow, no OpenCV, no Cairo.
|
|
196
|
+
- **`parse_ui(page)` / `execute(page, decision)` / `capture_failure(...)`**
|
|
197
|
+
— the canonical 3-call agent loop, with a typed `ParseContext`,
|
|
198
|
+
`AttemptedAction` dataclass, and the full action catalog
|
|
199
|
+
(`CLICK`, `TYPE`, `TYPE_AND_ENTER`, `HOVER`, `SCROLL_{DOWN,UP,LEFT,RIGHT}`).
|
|
200
|
+
- **Hardened transport** — bearer-only auth, idempotency keys on
|
|
201
|
+
`capture_failure`, race-safe HTTP client init, non-blocking usage
|
|
202
|
+
telemetry, automatic `*.workers.dev` fallback if the apex DNS / TLS
|
|
203
|
+
flakes, opt-out via `fallback_endpoint=""`.
|
|
204
|
+
|
|
205
|
+
See [CHANGELOG.md](CHANGELOG.md) for the full changelog.
|
|
206
|
+
|
|
207
|
+
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
from ._version import __version__
|
|
2
|
+
from .client import (
|
|
3
|
+
Aliax,
|
|
4
|
+
AliaxError,
|
|
5
|
+
AliaxInvalidKeyError,
|
|
6
|
+
AliaxOutOfCreditsError,
|
|
7
|
+
AttemptedAction,
|
|
8
|
+
ParseContext,
|
|
9
|
+
)
|
|
10
|
+
from .prompts import ALIAX_SYSTEM_INSTRUCTIONS, SYSTEM_INSTRUCTIONS
|
|
11
|
+
|
|
12
|
+
__all__ = [
|
|
13
|
+
"__version__",
|
|
14
|
+
"Aliax",
|
|
15
|
+
"AliaxError",
|
|
16
|
+
"AliaxInvalidKeyError",
|
|
17
|
+
"AliaxOutOfCreditsError",
|
|
18
|
+
"AttemptedAction",
|
|
19
|
+
"ParseContext",
|
|
20
|
+
"SYSTEM_INSTRUCTIONS",
|
|
21
|
+
"ALIAX_SYSTEM_INSTRUCTIONS",
|
|
22
|
+
]
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""Single source of truth for the SDK version.
|
|
2
|
+
|
|
3
|
+
Bumped manually on each PyPI release. Read by:
|
|
4
|
+
- aliax.client.Aliax (sent as `sdk_version` form field on every capture
|
|
5
|
+
and in the /v1/telemetry pings emitted from parse_ui / execute)
|
|
6
|
+
- pyproject.toml (via [tool.setuptools.dynamic] — so pip metadata matches)
|
|
7
|
+
- the background version-ping that warns users on stale installs
|
|
8
|
+
|
|
9
|
+
Bump the Cloudflare Worker's LATEST_SDK_VERSION wrangler var in the same
|
|
10
|
+
release so existing installs see the upgrade nag.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
__version__ = "1.0.0"
|