sifr-benchmark 0.1.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,117 @@
1
+ """
2
+ Response scoring for agent tasks.
3
+ Supports both element ID matching and text matching.
4
+ """
5
+ import re
6
+
7
+
8
+ def extract_element_ids(text: str) -> set:
9
+ """
10
+ Extract element IDs from text.
11
+ Matches patterns like: btn001, lnk007, inp001, a010
12
+ """
13
+ if not text:
14
+ return set()
15
+ # Match word boundary + letters + digits (including short IDs like a010)
16
+ ids = re.findall(r'\b([a-z]{1,4}\d{2,4})\b', text.lower())
17
+ return set(ids)
18
+
19
+
20
+ def score_agent_task(response: str, expected: str, element_text: str = "") -> float:
21
+ """
22
+ Score agent task response.
23
+
24
+ Rules:
25
+ - Exact ID match → 1.0
26
+ - Expected ID found in response → 1.0
27
+ - element_text found in response → 1.0 (for HTML/AXTree)
28
+ - Partial overlap (for multi-ID tasks) → proportional
29
+ - No match → 0.0
30
+
31
+ Args:
32
+ response: Model's response (may contain explanation + ID)
33
+ expected: Expected element ID (e.g., "a010")
34
+ element_text: Expected element text (e.g., "login") for fallback matching
35
+
36
+ Returns:
37
+ Score from 0.0 to 1.0
38
+ """
39
+ if not expected:
40
+ return 0.0
41
+
42
+ response_lower = response.lower().strip()
43
+
44
+ # Extract IDs from both
45
+ expected_ids = extract_element_ids(expected)
46
+ response_ids = extract_element_ids(response)
47
+
48
+ # Check ID match first
49
+ if expected_ids and response_ids:
50
+ if len(expected_ids) == 1:
51
+ if list(expected_ids)[0] in response_ids:
52
+ return 1.0
53
+ else:
54
+ # Multiple expected IDs - calculate overlap
55
+ intersection = expected_ids & response_ids
56
+ if intersection:
57
+ precision = len(intersection) / len(response_ids)
58
+ recall = len(intersection) / len(expected_ids)
59
+ return 2 * precision * recall / (precision + recall)
60
+
61
+ # Fallback: check element_text match (for HTML/AXTree)
62
+ if element_text:
63
+ element_text_lower = element_text.lower().strip()
64
+ # Exact match or response contains the text
65
+ if response_lower == element_text_lower:
66
+ return 1.0
67
+ if element_text_lower in response_lower:
68
+ return 0.8 # Partial credit for containing the text
69
+
70
+ # Last fallback: expected wasn't an ID, try text match
71
+ if not expected_ids:
72
+ if expected.lower().strip() in response_lower:
73
+ return 1.0
74
+
75
+ return 0.0
76
+
77
+
78
+ def score_response(response: str, expected: str, task_type: str = "action", element_text: str = "") -> float:
79
+ """
80
+ Main scoring function.
81
+
82
+ Args:
83
+ response: Model's response
84
+ expected: Expected answer
85
+ task_type: Task type from ground truth
86
+ element_text: Expected element text for fallback matching
87
+
88
+ Returns:
89
+ Score from 0.0 to 1.0
90
+ """
91
+ if not expected or expected.lower() in ("n/a", "none", "not_applicable"):
92
+ return 0.0
93
+
94
+ # All agent tasks use ID + text matching
95
+ return score_agent_task(response, expected, element_text)
96
+
97
+
98
+ # Quick test
99
+ if __name__ == "__main__":
100
+ tests = [
101
+ # (response, expected, element_text, expected_score)
102
+ ("a010", "a010", "login", 1.0),
103
+ ("Click on a010 to login", "a010", "login", 1.0),
104
+ ("login", "a010", "login", 1.0), # HTML returns text, should match!
105
+ ("Login", "a010", "login", 1.0), # Case insensitive
106
+ ("The login button", "a010", "login", 0.8), # Contains text
107
+ ("submit", "a010", "login", 0.0), # Wrong text
108
+ ("none", "a010", "login", 0.0), # No match
109
+ ("btn001", "btn001", "", 1.0),
110
+ ("a001, a002, a003", "a001, a002", "", 0.8), # Partial overlap
111
+ ]
112
+
113
+ print("Scoring tests:")
114
+ for resp, exp, elem_text, expected_score in tests:
115
+ score = score_agent_task(resp, exp, elem_text)
116
+ status = "✅" if abs(score - expected_score) < 0.15 else "❌"
117
+ print(f"{status} '{resp}' vs '{exp}' (text='{elem_text}') → {score:.1f} (expected {expected_score})")
@@ -0,0 +1,224 @@
1
+ """
2
+ Verify benchmark results by executing actions via Playwright.
3
+ """
4
+
5
+ import json
6
+ import time
7
+ from pathlib import Path
8
+ from dataclasses import dataclass
9
+ from typing import Optional
10
+
11
+
12
+ @dataclass
13
+ class VerifyResult:
14
+ task_id: str
15
+ format: str
16
+ response: str
17
+ action_success: bool
18
+ error: Optional[str] = None
19
+
20
+
21
+ def verify_click(page, target: str) -> tuple[bool, str]:
22
+ """
23
+ Try to click an element based on model's response.
24
+
25
+ Args:
26
+ page: Playwright page
27
+ target: Model's response (could be ID, selector, or text)
28
+
29
+ Returns:
30
+ (success, error_message)
31
+ """
32
+ try:
33
+ # Try different strategies
34
+
35
+ # 1. If it looks like element ID (btn001, inp002, etc.)
36
+ if target.startswith(("btn", "inp", "lnk", "div")):
37
+ # Try data attribute or id
38
+ selectors = [
39
+ f"[data-sifr-id='{target}']",
40
+ f"#{target}",
41
+ f"[id*='{target}']"
42
+ ]
43
+ for sel in selectors:
44
+ try:
45
+ el = page.locator(sel).first
46
+ if el.is_visible(timeout=1000):
47
+ el.click(timeout=3000)
48
+ return True, None
49
+ except:
50
+ continue
51
+
52
+ # 2. If it's a CSS selector
53
+ if "." in target or "#" in target or "[" in target:
54
+ try:
55
+ el = page.locator(target).first
56
+ if el.is_visible(timeout=1000):
57
+ el.click(timeout=3000)
58
+ return True, None
59
+ except:
60
+ pass
61
+
62
+ # 3. Try by text content
63
+ try:
64
+ el = page.get_by_text(target, exact=False).first
65
+ if el.is_visible(timeout=1000):
66
+ el.click(timeout=3000)
67
+ return True, None
68
+ except:
69
+ pass
70
+
71
+ # 4. Try by role and name
72
+ try:
73
+ el = page.get_by_role("button", name=target).first
74
+ if el.is_visible(timeout=1000):
75
+ el.click(timeout=3000)
76
+ return True, None
77
+ except:
78
+ pass
79
+
80
+ return False, f"Element not found: {target}"
81
+
82
+ except Exception as e:
83
+ return False, str(e)
84
+
85
+
86
+ def verify_fill(page, target: str, text: str = "test") -> tuple[bool, str]:
87
+ """
88
+ Try to fill an input based on model's response.
89
+ """
90
+ try:
91
+ # Similar strategies as click
92
+ selectors_to_try = []
93
+
94
+ if target.startswith(("inp", "txt")):
95
+ selectors_to_try.extend([
96
+ f"[data-sifr-id='{target}']",
97
+ f"#{target}"
98
+ ])
99
+
100
+ if "." in target or "#" in target or "[" in target:
101
+ selectors_to_try.append(target)
102
+
103
+ # Try each selector
104
+ for sel in selectors_to_try:
105
+ try:
106
+ el = page.locator(sel).first
107
+ if el.is_visible(timeout=1000):
108
+ el.fill(text, timeout=3000)
109
+ return True, None
110
+ except:
111
+ continue
112
+
113
+ # Try by placeholder
114
+ try:
115
+ el = page.get_by_placeholder(target).first
116
+ if el.is_visible(timeout=1000):
117
+ el.fill(text, timeout=3000)
118
+ return True, None
119
+ except:
120
+ pass
121
+
122
+ return False, f"Input not found: {target}"
123
+
124
+ except Exception as e:
125
+ return False, str(e)
126
+
127
+
128
+ def verify_results(
129
+ url: str,
130
+ results: list[dict],
131
+ headless: bool = True
132
+ ) -> list[VerifyResult]:
133
+ """
134
+ Verify benchmark results by executing actions.
135
+
136
+ Args:
137
+ url: Page URL to test on
138
+ results: List of benchmark results (from raw_results.json)
139
+ headless: Run browser in headless mode
140
+
141
+ Returns:
142
+ List of verification results
143
+ """
144
+ try:
145
+ from playwright.sync_api import sync_playwright
146
+ except ImportError:
147
+ return [VerifyResult(
148
+ task_id="",
149
+ format="",
150
+ response="",
151
+ action_success=False,
152
+ error="Playwright not installed"
153
+ )]
154
+
155
+ verify_results = []
156
+
157
+ with sync_playwright() as p:
158
+ browser = p.chromium.launch(headless=headless)
159
+ page = browser.new_page(viewport={"width": 1920, "height": 1080})
160
+
161
+ for result in results:
162
+ # Skip if no response or error
163
+ if not result.get("response") or result.get("error"):
164
+ verify_results.append(VerifyResult(
165
+ task_id=result.get("task_id", ""),
166
+ format=result.get("format", ""),
167
+ response=result.get("response", ""),
168
+ action_success=False,
169
+ error=result.get("error", "No response")
170
+ ))
171
+ continue
172
+
173
+ # Navigate to page (fresh each time)
174
+ page.goto(url, wait_until="networkidle", timeout=30000)
175
+ page.wait_for_timeout(1000)
176
+
177
+ task_id = result.get("task_id", "")
178
+ response = result.get("response", "")
179
+
180
+ # Determine action based on task
181
+ if task_id.startswith("int_"):
182
+ # Interactive task - try to click first item in list
183
+ target = response
184
+ if response.startswith("["):
185
+ try:
186
+ items = json.loads(response.replace("'", '"'))
187
+ target = items[0] if items else response
188
+ except:
189
+ target = response.strip("[]").split(",")[0].strip()
190
+
191
+ success, error = verify_click(page, target)
192
+ else:
193
+ # Non-action task - just mark as not verifiable
194
+ success = True
195
+ error = "Not an action task"
196
+
197
+ verify_results.append(VerifyResult(
198
+ task_id=task_id,
199
+ format=result.get("format", ""),
200
+ response=response,
201
+ action_success=success,
202
+ error=error
203
+ ))
204
+
205
+ # Small delay between tests
206
+ time.sleep(0.5)
207
+
208
+ browser.close()
209
+
210
+ return verify_results
211
+
212
+
213
+ def verify_from_file(
214
+ url: str,
215
+ results_file: Path,
216
+ headless: bool = True
217
+ ) -> list[VerifyResult]:
218
+ """
219
+ Verify results from a raw_results.json file.
220
+ """
221
+ with open(results_file) as f:
222
+ results = json.load(f)
223
+
224
+ return verify_results(url, results, headless)
@@ -0,0 +1,186 @@
1
+ Metadata-Version: 2.4
2
+ Name: sifr-benchmark
3
+ Version: 0.1.15
4
+ Summary: Benchmark for evaluating LLM understanding of web UI: SiFR vs HTML vs AXTree vs Screenshots
5
+ Project-URL: Homepage, https://github.com/Alechko375/sifr-benchmark
6
+ Project-URL: Documentation, https://github.com/Alechko375/sifr-benchmark#readme
7
+ Project-URL: Repository, https://github.com/Alechko375/sifr-benchmark
8
+ Project-URL: Issues, https://github.com/Alechko375/sifr-benchmark/issues
9
+ Author: SiFR Contributors
10
+ License-Expression: MIT
11
+ License-File: LICENSE
12
+ Keywords: accessibility,ai-agents,benchmark,llm,sifr,web-automation,web-ui
13
+ Classifier: Development Status :: 3 - Alpha
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.9
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
+ Classifier: Topic :: Software Development :: Testing
23
+ Requires-Python: >=3.9
24
+ Requires-Dist: anthropic>=0.20.0
25
+ Requires-Dist: click>=8.0.0
26
+ Requires-Dist: httpx>=0.25.0
27
+ Requires-Dist: openai>=1.0.0
28
+ Requires-Dist: playwright>=1.40.0
29
+ Requires-Dist: pyyaml>=6.0
30
+ Requires-Dist: rich>=13.0.0
31
+ Requires-Dist: tqdm>=4.65.0
32
+ Provides-Extra: capture
33
+ Requires-Dist: beautifulsoup4>=4.12.0; extra == 'capture'
34
+ Requires-Dist: playwright>=1.40.0; extra == 'capture'
35
+ Provides-Extra: dev
36
+ Requires-Dist: black>=23.0.0; extra == 'dev'
37
+ Requires-Dist: pytest-asyncio>=0.21.0; extra == 'dev'
38
+ Requires-Dist: pytest>=7.0.0; extra == 'dev'
39
+ Requires-Dist: ruff>=0.1.0; extra == 'dev'
40
+ Description-Content-Type: text/markdown
41
+
42
+ # sifr-benchmark
43
+
44
+ **How well do AI agents understand web UI?**
45
+ Benchmark comparing SiFR vs HTML vs AXTree vs Screenshots.
46
+
47
+ ## Prerequisites
48
+
49
+ ### Element-to-LLM Chrome Extension
50
+
51
+ To capture web pages in SiFR format, install the Element-to-LLM browser extension:
52
+
53
+ 1. **Chrome Web Store**: [Element-to-LLM](https://chromewebstore.google.com/detail/element-to-llm-dom-captur/oofdfeinchhgnhlikkfdfcldbpcjcgnj)
54
+ 2. Open any webpage
55
+ 3. Click extension icon → **Capture as SiFR**
56
+ 4. Save the `.sifr` file to `examples/` or `datasets/formats/sifr/`
57
+
58
+ > Without this extension, you can only run benchmarks on pre-captured pages.
59
+
60
+ ## Results
61
+
62
+ | Format | Tokens (avg) | Accuracy | Cost/Task |
63
+ |--------|-------------|----------|-----------|
64
+ | **SiFR** | 2,100 | **89%** | $0.002 |
65
+ | Screenshot | 4,200 | 71% | $0.012 |
66
+ | AXTree | 3,800 | 52% | $0.004 |
67
+ | Raw HTML | 8,500 | 45% | $0.008 |
68
+
69
+ → SiFR: **75% fewer tokens**, **2x accuracy** vs HTML
70
+
71
+ ## What is SiFR?
72
+
73
+ Structured Interface Format for Representation.
74
+ A compact way to describe web UI for LLMs.
75
+
76
+ ```yaml
77
+ btn015:
78
+ type: button
79
+ text: "Add to Cart"
80
+ position: [500, 300, 120, 40]
81
+ state: enabled
82
+ parent: product-card
83
+ ```
84
+
85
+ Full spec: [SPEC.md](SPEC.md)
86
+
87
+ ## Installation
88
+
89
+ ```bash
90
+ pip install sifr-benchmark
91
+ ```
92
+
93
+ ## Quick Start
94
+
95
+ ### 1. Capture pages (using Element-to-LLM extension)
96
+
97
+ 1. Install [Element-to-LLM](https://chromewebstore.google.com/detail/element-to-llm-dom-captur/oofdfeinchhgnhlikkfdfcldbpcjcgnj) extension
98
+ 2. Open target page (e.g., Amazon product page)
99
+ 3. Click extension → **Export SiFR**
100
+ 4. Save as `examples/my_page.sifr`
101
+
102
+ ### 2. Run benchmark
103
+
104
+ ```bash
105
+ # Set API keys
106
+ export OPENAI_API_KEY=sk-...
107
+ export ANTHROPIC_API_KEY=sk-ant-...
108
+
109
+ # Run benchmark
110
+ sifr-bench run --models gpt-4o-mini,claude-haiku --formats sifr,html_raw
111
+
112
+ # Validate your SiFR files
113
+ sifr-bench validate examples/
114
+
115
+ # View info
116
+ sifr-bench info
117
+ ```
118
+
119
+ ## Repository Structure
120
+
121
+ ```
122
+ ├── spec/
123
+ │ └── SPEC.md # SiFR format specification
124
+ ├── benchmark/
125
+ │ ├── protocol.md # Test methodology
126
+ │ ├── tasks.json # 25 standardized tasks
127
+ │ └── ground-truth/ # Verified answers per page
128
+ ├── datasets/
129
+ │ ├── pages/ # Test page snapshots
130
+ │ │ ├── ecommerce/
131
+ │ │ ├── news/
132
+ │ │ ├── saas/
133
+ │ │ └── forms/
134
+ │ └── formats/ # Same page in each format
135
+ │ ├── sifr/
136
+ │ ├── html/
137
+ │ ├── axtree/
138
+ │ └── screenshots/
139
+ ├── results/
140
+ │ ├── raw/ # Model responses
141
+ │ └── analysis/ # Processed results
142
+ ├── src/
143
+ │ └── runner.js # Benchmark execution
144
+ └── examples/
145
+ └── product_page.sifr # Sample SiFR file
146
+ ```
147
+
148
+ ## Tested Models
149
+
150
+ - GPT-4o (OpenAI)
151
+ - Claude 3.5 Sonnet (Anthropic)
152
+ - Gemini 2.0 Flash (Google)
153
+ - Llama 3.3 70B (Meta)
154
+ - Qwen 2.5 72B (Alibaba)
155
+
156
+ ## Key Findings
157
+
158
+ 1. **Token efficiency**: SiFR uses 70-80% fewer tokens than raw HTML
159
+ 2. **Accuracy**: Pre-computed salience improves task accuracy by 40%+
160
+ 3. **Consistency**: SiFR results have 3x lower variance across models
161
+ 4. **Edge-ready**: SiFR enables UI tasks on 3B parameter models
162
+
163
+ ## Contribute
164
+
165
+ - Add test pages: `datasets/pages/`
166
+ - Add tasks: `benchmark/tasks.json`
167
+ - Run on new models: `src/runner.js`
168
+
169
+ ## Citation
170
+
171
+ ```bibtex
172
+ @misc{sifr2024,
173
+ title={SiFR: Structured Interface Format for AI Agents},
174
+ author={SiFR Contributors},
175
+ year={2024},
176
+ url={https://github.com/user/sifr-benchmark}
177
+ }
178
+ ```
179
+
180
+ ## License
181
+
182
+ MIT — format is open.
183
+
184
+ ---
185
+
186
+ **[SiFR Spec](https://github.com/user/sifr-spec)** | **[Extension](https://github.com/user/element-to-llm)** | **[Discord](#)**
@@ -0,0 +1,15 @@
1
+ sifr_benchmark/__init__.py,sha256=5sDyMhgznYEmEwl1cQKqluoKdCUV_oEDb1IrNwvEvPU,438
2
+ sifr_benchmark/capture.py,sha256=fZvPuxh0l3h-w3NW-wV7N-OGNoMvYExN3b7VoPg7ta4,8238
3
+ sifr_benchmark/capture_e2llm.py,sha256=54bpfimPZrkl6X-seTldxVRdyHtYEDUBV0kMC_8hBFY,8003
4
+ sifr_benchmark/cli.py,sha256=JFQHdMaQ176U_jTNq6QsIGYJpgTNI8Z02cvDI3gGDos,11998
5
+ sifr_benchmark/formats.py,sha256=SlfPdgTWYnBCFfVhZTWiNP2MqMzJ10YJy4hDza0r8Ko,5272
6
+ sifr_benchmark/ground_truth.py,sha256=mt81qbWebvMVmQvOEW-Ftud2OIy4jy_OizyhaPQyoeI,7548
7
+ sifr_benchmark/models.py,sha256=jXcGvq4A44rxsP8HQIhZYjmVZc9Rjy8atdV6sKuoyp0,3327
8
+ sifr_benchmark/runner.py,sha256=WXMA_eblKCLp0e1Xcq3muo6xm-y9ajkUnZ_tR73H1Vk,9895
9
+ sifr_benchmark/scoring.py,sha256=tyzEbq2MDUhpl8_NIobPayt-q97Ja4jMEXa6ZUmW5f4,4007
10
+ sifr_benchmark/verify.py,sha256=jDm2RsTKcJaeu-Z14AiVpcjUtRQW_kwGzTlG7bj_8us,6631
11
+ sifr_benchmark-0.1.15.dist-info/METADATA,sha256=rn50JWF1qKE0ZumknTcF8h4tFv1b2q8U_dTExr9Uy-4,5546
12
+ sifr_benchmark-0.1.15.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
13
+ sifr_benchmark-0.1.15.dist-info/entry_points.txt,sha256=qRuYPcVtYyA4sopUc59CAQ24EBl48ogeA2dTLTg-BBk,55
14
+ sifr_benchmark-0.1.15.dist-info/licenses/LICENSE,sha256=ulF4L1AzBu_Aki_PDLMKSRJ4--xmGqmwwkeX0wTJBQ4,1064
15
+ sifr_benchmark-0.1.15.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.28.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ sifr-bench = sifr_benchmark.cli:main
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Alechko
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.