PyPI - sifr-benchmark - Versions diffs - 0.1.15__py3-none-any.whl - Mend

sifr-benchmark 0.1.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

sifr_benchmark/__init__.py +22 -0
sifr_benchmark/capture.py +242 -0
sifr_benchmark/capture_e2llm.py +230 -0
sifr_benchmark/cli.py +358 -0
sifr_benchmark/formats.py +162 -0
sifr_benchmark/ground_truth.py +250 -0
sifr_benchmark/models.py +110 -0
sifr_benchmark/runner.py +315 -0
sifr_benchmark/scoring.py +117 -0
sifr_benchmark/verify.py +224 -0
sifr_benchmark-0.1.15.dist-info/METADATA +186 -0
sifr_benchmark-0.1.15.dist-info/RECORD +15 -0
sifr_benchmark-0.1.15.dist-info/WHEEL +4 -0
sifr_benchmark-0.1.15.dist-info/entry_points.txt +2 -0
sifr_benchmark-0.1.15.dist-info/licenses/LICENSE +21 -0

sifr_benchmark/scoring.py ADDED Viewed

@@ -0,0 +1,117 @@
+"""
+Response scoring for agent tasks.
+Supports both element ID matching and text matching.
+"""
+import re
+def extract_element_ids(text: str) -> set:
+    """
+    Extract element IDs from text.
+    Matches patterns like: btn001, lnk007, inp001, a010
+    """
+    if not text:
+        return set()
+    # Match word boundary + letters + digits (including short IDs like a010)
+    ids = re.findall(r'\b([a-z]{1,4}\d{2,4})\b', text.lower())
+    return set(ids)
+def score_agent_task(response: str, expected: str, element_text: str = "") -> float:
+    """
+    Score agent task response.
+    Rules:
+    - Exact ID match → 1.0
+    - Expected ID found in response → 1.0
+    - element_text found in response → 1.0 (for HTML/AXTree)
+    - Partial overlap (for multi-ID tasks) → proportional
+    - No match → 0.0
+    Args:
+        response: Model's response (may contain explanation + ID)
+        expected: Expected element ID (e.g., "a010")
+        element_text: Expected element text (e.g., "login") for fallback matching
+    Returns:
+        Score from 0.0 to 1.0
+    """
+    if not expected:
+        return 0.0
+    response_lower = response.lower().strip()
+    # Extract IDs from both
+    expected_ids = extract_element_ids(expected)
+    response_ids = extract_element_ids(response)
+    # Check ID match first
+    if expected_ids and response_ids:
+        if len(expected_ids) == 1:
+            if list(expected_ids)[0] in response_ids:
+                return 1.0
+        else:
+            # Multiple expected IDs - calculate overlap
+            intersection = expected_ids & response_ids
+            if intersection:
+                precision = len(intersection) / len(response_ids)
+                recall = len(intersection) / len(expected_ids)
+                return 2 * precision * recall / (precision + recall)
+    # Fallback: check element_text match (for HTML/AXTree)
+    if element_text:
+        element_text_lower = element_text.lower().strip()
+        # Exact match or response contains the text
+        if response_lower == element_text_lower:
+            return 1.0
+        if element_text_lower in response_lower:
+            return 0.8  # Partial credit for containing the text
+    # Last fallback: expected wasn't an ID, try text match
+    if not expected_ids:
+        if expected.lower().strip() in response_lower:
+            return 1.0
+    return 0.0
+def score_response(response: str, expected: str, task_type: str = "action", element_text: str = "") -> float:
+    """
+    Main scoring function.
+    Args:
+        response: Model's response
+        expected: Expected answer
+        task_type: Task type from ground truth
+        element_text: Expected element text for fallback matching
+    Returns:
+        Score from 0.0 to 1.0
+    """
+    if not expected or expected.lower() in ("n/a", "none", "not_applicable"):
+        return 0.0
+    # All agent tasks use ID + text matching
+    return score_agent_task(response, expected, element_text)
+# Quick test
+if __name__ == "__main__":
+    tests = [
+        # (response, expected, element_text, expected_score)
+        ("a010", "a010", "login", 1.0),
+        ("Click on a010 to login", "a010", "login", 1.0),
+        ("login", "a010", "login", 1.0),  # HTML returns text, should match!
+        ("Login", "a010", "login", 1.0),  # Case insensitive
+        ("The login button", "a010", "login", 0.8),  # Contains text
+        ("submit", "a010", "login", 0.0),  # Wrong text
+        ("none", "a010", "login", 0.0),  # No match
+        ("btn001", "btn001", "", 1.0),
+        ("a001, a002, a003", "a001, a002", "", 0.8),  # Partial overlap
+    ]
+    print("Scoring tests:")
+    for resp, exp, elem_text, expected_score in tests:
+        score = score_agent_task(resp, exp, elem_text)
+        status = "✅" if abs(score - expected_score) < 0.15 else "❌"
+        print(f"{status} '{resp}' vs '{exp}' (text='{elem_text}') → {score:.1f} (expected {expected_score})")

sifr_benchmark/verify.py ADDED Viewed

@@ -0,0 +1,224 @@
+"""
+Verify benchmark results by executing actions via Playwright.
+"""
+import json
+import time
+from pathlib import Path
+from dataclasses import dataclass
+from typing import Optional
+@dataclass
+class VerifyResult:
+    task_id: str
+    format: str
+    response: str
+    action_success: bool
+    error: Optional[str] = None
+def verify_click(page, target: str) -> tuple[bool, str]:
+    """
+    Try to click an element based on model's response.
+    Args:
+        page: Playwright page
+        target: Model's response (could be ID, selector, or text)
+    Returns:
+        (success, error_message)
+    """
+    try:
+        # Try different strategies
+        # 1. If it looks like element ID (btn001, inp002, etc.)
+        if target.startswith(("btn", "inp", "lnk", "div")):
+            # Try data attribute or id
+            selectors = [
+                f"[data-sifr-id='{target}']",
+                f"#{target}",
+                f"[id*='{target}']"
+            ]
+            for sel in selectors:
+                try:
+                    el = page.locator(sel).first
+                    if el.is_visible(timeout=1000):
+                        el.click(timeout=3000)
+                        return True, None
+                except:
+                    continue
+        # 2. If it's a CSS selector
+        if "." in target or "#" in target or "[" in target:
+            try:
+                el = page.locator(target).first
+                if el.is_visible(timeout=1000):
+                    el.click(timeout=3000)
+                    return True, None
+            except:
+                pass
+        # 3. Try by text content
+        try:
+            el = page.get_by_text(target, exact=False).first
+            if el.is_visible(timeout=1000):
+                el.click(timeout=3000)
+                return True, None
+        except:
+            pass
+        # 4. Try by role and name
+        try:
+            el = page.get_by_role("button", name=target).first
+            if el.is_visible(timeout=1000):
+                el.click(timeout=3000)
+                return True, None
+        except:
+            pass
+        return False, f"Element not found: {target}"
+    except Exception as e:
+        return False, str(e)
+def verify_fill(page, target: str, text: str = "test") -> tuple[bool, str]:
+    """
+    Try to fill an input based on model's response.
+    """
+    try:
+        # Similar strategies as click
+        selectors_to_try = []
+        if target.startswith(("inp", "txt")):
+            selectors_to_try.extend([
+                f"[data-sifr-id='{target}']",
+                f"#{target}"
+            ])
+        if "." in target or "#" in target or "[" in target:
+            selectors_to_try.append(target)
+        # Try each selector
+        for sel in selectors_to_try:
+            try:
+                el = page.locator(sel).first
+                if el.is_visible(timeout=1000):
+                    el.fill(text, timeout=3000)
+                    return True, None
+            except:
+                continue
+        # Try by placeholder
+        try:
+            el = page.get_by_placeholder(target).first
+            if el.is_visible(timeout=1000):
+                el.fill(text, timeout=3000)
+                return True, None
+        except:
+            pass
+        return False, f"Input not found: {target}"
+    except Exception as e:
+        return False, str(e)
+def verify_results(
+    url: str,
+    results: list[dict],
+    headless: bool = True
+) -> list[VerifyResult]:
+    """
+    Verify benchmark results by executing actions.
+    Args:
+        url: Page URL to test on
+        results: List of benchmark results (from raw_results.json)
+        headless: Run browser in headless mode
+    Returns:
+        List of verification results
+    """
+    try:
+        from playwright.sync_api import sync_playwright
+    except ImportError:
+        return [VerifyResult(
+            task_id="",
+            format="",
+            response="",
+            action_success=False,
+            error="Playwright not installed"
+        )]
+    verify_results = []
+    with sync_playwright() as p:
+        browser = p.chromium.launch(headless=headless)
+        page = browser.new_page(viewport={"width": 1920, "height": 1080})
+        for result in results:
+            # Skip if no response or error
+            if not result.get("response") or result.get("error"):
+                verify_results.append(VerifyResult(
+                    task_id=result.get("task_id", ""),
+                    format=result.get("format", ""),
+                    response=result.get("response", ""),
+                    action_success=False,
+                    error=result.get("error", "No response")
+                ))
+                continue
+            # Navigate to page (fresh each time)
+            page.goto(url, wait_until="networkidle", timeout=30000)
+            page.wait_for_timeout(1000)
+            task_id = result.get("task_id", "")
+            response = result.get("response", "")
+            # Determine action based on task
+            if task_id.startswith("int_"):
+                # Interactive task - try to click first item in list
+                target = response
+                if response.startswith("["):
+                    try:
+                        items = json.loads(response.replace("'", '"'))
+                        target = items[0] if items else response
+                    except:
+                        target = response.strip("[]").split(",")[0].strip()
+                success, error = verify_click(page, target)
+            else:
+                # Non-action task - just mark as not verifiable
+                success = True
+                error = "Not an action task"
+            verify_results.append(VerifyResult(
+                task_id=task_id,
+                format=result.get("format", ""),
+                response=response,
+                action_success=success,
+                error=error
+            ))
+            # Small delay between tests
+            time.sleep(0.5)
+        browser.close()
+    return verify_results
+def verify_from_file(
+    url: str,
+    results_file: Path,
+    headless: bool = True
+) -> list[VerifyResult]:
+    """
+    Verify results from a raw_results.json file.
+    """
+    with open(results_file) as f:
+        results = json.load(f)
+    return verify_results(url, results, headless)

sifr_benchmark-0.1.15.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,186 @@
+Metadata-Version: 2.4
+Name: sifr-benchmark
+Version: 0.1.15
+Summary: Benchmark for evaluating LLM understanding of web UI: SiFR vs HTML vs AXTree vs Screenshots
+Project-URL: Homepage, https://github.com/Alechko375/sifr-benchmark
+Project-URL: Documentation, https://github.com/Alechko375/sifr-benchmark#readme
+Project-URL: Repository, https://github.com/Alechko375/sifr-benchmark
+Project-URL: Issues, https://github.com/Alechko375/sifr-benchmark/issues
+Author: SiFR Contributors
+License-Expression: MIT
+License-File: LICENSE
+Keywords: accessibility,ai-agents,benchmark,llm,sifr,web-automation,web-ui
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Classifier: Topic :: Software Development :: Testing
+Requires-Python: >=3.9
+Requires-Dist: anthropic>=0.20.0
+Requires-Dist: click>=8.0.0
+Requires-Dist: httpx>=0.25.0
+Requires-Dist: openai>=1.0.0
+Requires-Dist: playwright>=1.40.0
+Requires-Dist: pyyaml>=6.0
+Requires-Dist: rich>=13.0.0
+Requires-Dist: tqdm>=4.65.0
+Provides-Extra: capture
+Requires-Dist: beautifulsoup4>=4.12.0; extra == 'capture'
+Requires-Dist: playwright>=1.40.0; extra == 'capture'
+Provides-Extra: dev
+Requires-Dist: black>=23.0.0; extra == 'dev'
+Requires-Dist: pytest-asyncio>=0.21.0; extra == 'dev'
+Requires-Dist: pytest>=7.0.0; extra == 'dev'
+Requires-Dist: ruff>=0.1.0; extra == 'dev'
+Description-Content-Type: text/markdown
+# sifr-benchmark
+**How well do AI agents understand web UI?**
+Benchmark comparing SiFR vs HTML vs AXTree vs Screenshots.
+## Prerequisites
+### Element-to-LLM Chrome Extension
+To capture web pages in SiFR format, install the Element-to-LLM browser extension:
+1. **Chrome Web Store**: [Element-to-LLM](https://chromewebstore.google.com/detail/element-to-llm-dom-captur/oofdfeinchhgnhlikkfdfcldbpcjcgnj)
+2. Open any webpage
+3. Click extension icon → **Capture as SiFR**
+4. Save the `.sifr` file to `examples/` or `datasets/formats/sifr/`
+> Without this extension, you can only run benchmarks on pre-captured pages.
+## Results
+| Format | Tokens (avg) | Accuracy | Cost/Task |
+|--------|-------------|----------|-----------|
+| **SiFR** | 2,100 | **89%** | $0.002 |
+| Screenshot | 4,200 | 71% | $0.012 |
+| AXTree | 3,800 | 52% | $0.004 |
+| Raw HTML | 8,500 | 45% | $0.008 |
+→ SiFR: **75% fewer tokens**, **2x accuracy** vs HTML
+## What is SiFR?
+Structured Interface Format for Representation.
+A compact way to describe web UI for LLMs.
+```yaml
+btn015:
+  type: button
+  text: "Add to Cart"
+  position: [500, 300, 120, 40]
+  state: enabled
+  parent: product-card
+```
+Full spec: [SPEC.md](SPEC.md)
+## Installation
+```bash
+pip install sifr-benchmark
+```
+## Quick Start
+### 1. Capture pages (using Element-to-LLM extension)
+1. Install [Element-to-LLM](https://chromewebstore.google.com/detail/element-to-llm-dom-captur/oofdfeinchhgnhlikkfdfcldbpcjcgnj) extension
+2. Open target page (e.g., Amazon product page)
+3. Click extension → **Export SiFR**
+4. Save as `examples/my_page.sifr`
+### 2. Run benchmark
+```bash
+# Set API keys
+export OPENAI_API_KEY=sk-...
+export ANTHROPIC_API_KEY=sk-ant-...
+# Run benchmark
+sifr-bench run --models gpt-4o-mini,claude-haiku --formats sifr,html_raw
+# Validate your SiFR files
+sifr-bench validate examples/
+# View info
+sifr-bench info
+```
+## Repository Structure
+```
+├── spec/
+│   └── SPEC.md              # SiFR format specification
+├── benchmark/
+│   ├── protocol.md          # Test methodology
+│   ├── tasks.json           # 25 standardized tasks
+│   └── ground-truth/        # Verified answers per page
+├── datasets/
+│   ├── pages/               # Test page snapshots
+│   │   ├── ecommerce/
+│   │   ├── news/
+│   │   ├── saas/
+│   │   └── forms/
+│   └── formats/             # Same page in each format
+│       ├── sifr/
+│       ├── html/
+│       ├── axtree/
+│       └── screenshots/
+├── results/
+│   ├── raw/                 # Model responses
+│   └── analysis/            # Processed results
+├── src/
+│   └── runner.js            # Benchmark execution
+└── examples/
+    └── product_page.sifr    # Sample SiFR file
+```
+## Tested Models
+- GPT-4o (OpenAI)
+- Claude 3.5 Sonnet (Anthropic)
+- Gemini 2.0 Flash (Google)
+- Llama 3.3 70B (Meta)
+- Qwen 2.5 72B (Alibaba)
+## Key Findings
+1. **Token efficiency**: SiFR uses 70-80% fewer tokens than raw HTML
+2. **Accuracy**: Pre-computed salience improves task accuracy by 40%+
+3. **Consistency**: SiFR results have 3x lower variance across models
+4. **Edge-ready**: SiFR enables UI tasks on 3B parameter models
+## Contribute
+- Add test pages: `datasets/pages/`
+- Add tasks: `benchmark/tasks.json`
+- Run on new models: `src/runner.js`
+## Citation
+```bibtex
+@misc{sifr2024,
+  title={SiFR: Structured Interface Format for AI Agents},
+  author={SiFR Contributors},
+  year={2024},
+  url={https://github.com/user/sifr-benchmark}
+}
+```
+## License
+MIT — format is open.
+---
+**[SiFR Spec](https://github.com/user/sifr-spec)** | **[Extension](https://github.com/user/element-to-llm)** | **[Discord](#)**

sifr_benchmark-0.1.15.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,15 @@
+sifr_benchmark/__init__.py,sha256=5sDyMhgznYEmEwl1cQKqluoKdCUV_oEDb1IrNwvEvPU,438
+sifr_benchmark/capture.py,sha256=fZvPuxh0l3h-w3NW-wV7N-OGNoMvYExN3b7VoPg7ta4,8238
+sifr_benchmark/capture_e2llm.py,sha256=54bpfimPZrkl6X-seTldxVRdyHtYEDUBV0kMC_8hBFY,8003
+sifr_benchmark/cli.py,sha256=JFQHdMaQ176U_jTNq6QsIGYJpgTNI8Z02cvDI3gGDos,11998
+sifr_benchmark/formats.py,sha256=SlfPdgTWYnBCFfVhZTWiNP2MqMzJ10YJy4hDza0r8Ko,5272
+sifr_benchmark/ground_truth.py,sha256=mt81qbWebvMVmQvOEW-Ftud2OIy4jy_OizyhaPQyoeI,7548
+sifr_benchmark/models.py,sha256=jXcGvq4A44rxsP8HQIhZYjmVZc9Rjy8atdV6sKuoyp0,3327
+sifr_benchmark/runner.py,sha256=WXMA_eblKCLp0e1Xcq3muo6xm-y9ajkUnZ_tR73H1Vk,9895
+sifr_benchmark/scoring.py,sha256=tyzEbq2MDUhpl8_NIobPayt-q97Ja4jMEXa6ZUmW5f4,4007
+sifr_benchmark/verify.py,sha256=jDm2RsTKcJaeu-Z14AiVpcjUtRQW_kwGzTlG7bj_8us,6631
+sifr_benchmark-0.1.15.dist-info/METADATA,sha256=rn50JWF1qKE0ZumknTcF8h4tFv1b2q8U_dTExr9Uy-4,5546
+sifr_benchmark-0.1.15.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+sifr_benchmark-0.1.15.dist-info/entry_points.txt,sha256=qRuYPcVtYyA4sopUc59CAQ24EBl48ogeA2dTLTg-BBk,55
+sifr_benchmark-0.1.15.dist-info/licenses/LICENSE,sha256=ulF4L1AzBu_Aki_PDLMKSRJ4--xmGqmwwkeX0wTJBQ4,1064
+sifr_benchmark-0.1.15.dist-info/RECORD,,

sifr_benchmark-0.1.15.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,4 @@
+Wheel-Version: 1.0
+Generator: hatchling 1.28.0
+Root-Is-Purelib: true
+Tag: py3-none-any

sifr_benchmark-0.1.15.dist-info/entry_points.txt ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ [console_scripts]
2	+ sifr-bench = sifr_benchmark.cli:main

sifr_benchmark-0.1.15.dist-info/licenses/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2025 Alechko
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.