sifr-benchmark 0.1.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,22 @@
1
+ """
2
+ SiFR Benchmark - Evaluate LLM understanding of web UI across formats.
3
+
4
+ Usage:
5
+ pip install sifr-benchmark
6
+ sifr-bench --help
7
+ """
8
+
9
+ __version__ = "0.1.15"
10
+ __author__ = "SiFR Contributors"
11
+
12
+ from .runner import BenchmarkRunner
13
+ from .scoring import score_response
14
+ from .formats import load_sifr, load_html, load_axtree
15
+
16
+ __all__ = [
17
+ "BenchmarkRunner",
18
+ "score_response",
19
+ "load_sifr",
20
+ "load_html",
21
+ "load_axtree",
22
+ ]
@@ -0,0 +1,242 @@
1
+ """
2
+ Page capture module - captures pages in all formats.
3
+ """
4
+
5
+ import json
6
+ import asyncio
7
+ from pathlib import Path
8
+ from typing import Optional
9
+ from dataclasses import dataclass
10
+
11
+
12
@dataclass
class CaptureResult:
    """Outcome of capturing one page: paths to the saved artifacts.

    Every path is None if its format was not captured (or capture
    failed); ``error`` carries the failure message, if any.
    """

    url: str  # URL that was captured
    sifr_path: Optional[Path] = None  # SiFR JSON file
    html_path: Optional[Path] = None  # raw HTML dump
    screenshot_path: Optional[Path] = None  # PNG screenshot
    axtree_path: Optional[Path] = None  # accessibility-tree JSON
    error: Optional[str] = None  # set when the capture failed
20
+
21
+
22
def check_playwright():
    """Return True when Playwright's sync API is importable, else False."""
    try:
        from playwright.sync_api import sync_playwright  # noqa: F401
    except ImportError:
        return False
    return True
29
+
30
+
31
def install_playwright_browsers():
    """Download Playwright's Chromium bundle (blocking subprocess call).

    Raises CalledProcessError if the installer exits non-zero.
    """
    import subprocess

    cmd = ["playwright", "install", "chromium"]
    subprocess.run(cmd, check=True)
35
+
36
+
37
def generate_sifr_from_page(page) -> dict:
    """Generate SiFR v2.0 format from a Playwright page.

    Runs in-page JavaScript to extract up to 10 buttons and 5 inputs
    (high salience), up to 20 links (medium) and all h1-h3 headings
    (low), then wraps them in the SiFR envelope.

    Args:
        page: A loaded Playwright ``Page`` (sync API).

    Returns:
        dict with "====METADATA====", "====NODES====" and
        "====SUMMARY====" sections.
    """
    # Basic page info for the METADATA section.
    url = page.url
    title = page.title()
    viewport = page.viewport_size

    # Extract elements using JavaScript executed in the page context.
    # BUGFIX: the old script seeded `results.med.link = {}` — a key that
    # was never written to, but inflated the "med" count in stats by one.
    elements = page.evaluate("""() => {
        const results = { high: {}, med: {}, low: {} };
        const buttons = document.querySelectorAll('button, [role="button"], input[type="submit"]');
        const links = document.querySelectorAll('a[href]');
        const inputs = document.querySelectorAll('input, textarea, select');
        const headings = document.querySelectorAll('h1, h2, h3');

        let btnCount = 1, linkCount = 1, inputCount = 1, textCount = 1;

        // High salience: buttons
        buttons.forEach((el, i) => {
            if (i < 10) {
                const rect = el.getBoundingClientRect();
                results.high['btn' + String(btnCount++).padStart(3, '0')] = {
                    type: 'button',
                    text: el.textContent?.trim().slice(0, 50) || el.value || '',
                    position: [Math.round(rect.x), Math.round(rect.y), Math.round(rect.width), Math.round(rect.height)],
                    state: el.disabled ? 'disabled' : 'enabled'
                };
            }
        });

        // High salience: main inputs
        inputs.forEach((el, i) => {
            if (i < 5) {
                const rect = el.getBoundingClientRect();
                results.high['inp' + String(inputCount++).padStart(3, '0')] = {
                    type: 'input',
                    placeholder: el.placeholder || '',
                    input_type: el.type || 'text',
                    position: [Math.round(rect.x), Math.round(rect.y), Math.round(rect.width), Math.round(rect.height)],
                    state: el.disabled ? 'disabled' : 'enabled'
                };
            }
        });

        // Med salience: links
        links.forEach((el, i) => {
            if (i < 20) {
                const rect = el.getBoundingClientRect();
                results.med['lnk' + String(linkCount++).padStart(3, '0')] = {
                    type: 'link',
                    text: el.textContent?.trim().slice(0, 50) || '',
                    href: el.href,
                    position: [Math.round(rect.x), Math.round(rect.y), Math.round(rect.width), Math.round(rect.height)]
                };
            }
        });

        // Low salience: headings as text
        headings.forEach((el, i) => {
            const rect = el.getBoundingClientRect();
            results.low['txt' + String(textCount++).padStart(3, '0')] = {
                type: 'text',
                content: el.textContent?.trim().slice(0, 100) || '',
                tag: el.tagName.toLowerCase(),
                position: [Math.round(rect.x), Math.round(rect.y), Math.round(rect.width), Math.round(rect.height)]
            };
        });

        return results;
    }""")

    # Build the SiFR envelope around the extracted node buckets.
    sifr = {
        "====METADATA====": {
            "format": "sifr-v2.0",
            "url": url,
            "title": title,
            "viewport": viewport,
            "stats": {
                "high": len(elements.get("high", {})),
                "med": len(elements.get("med", {})),
                "low": len(elements.get("low", {}))
            }
        },
        "====NODES====": elements,
        "====SUMMARY====": {
            "page": {
                "purpose": f"Page at {url}",
                "title": title
            }
        }
    }

    return sifr
133
+
134
+
135
def get_accessibility_tree(page) -> dict:
    """Return the page's accessibility snapshot, or {} when unavailable."""
    tree = page.accessibility.snapshot()
    if not tree:
        return {}
    return tree
139
+
140
+
141
def capture_page(
    url: str,
    output_dir: Path,
    name: str,
    formats: Optional[list[str]] = None,
    headless: bool = True
) -> CaptureResult:
    """
    Capture a page in multiple formats.

    Args:
        url: URL to capture
        output_dir: Output directory (sifr/, html/, screenshots/ and
            axtree/ subdirectories are created under it)
        name: Base name for files
        formats: List of formats to capture (sifr, html, screenshot,
            axtree); defaults to all four
        headless: Run browser in headless mode

    Returns:
        CaptureResult with paths to captured files; on failure the paths
        stay None and ``error`` holds the message.
    """
    if not check_playwright():
        return CaptureResult(url=url, error="Playwright not installed. Run: pip install playwright && playwright install chromium")

    # BUGFIX: annotation was `formats: list[str] = None`, which is an
    # invalid type for a None default — now Optional[list[str]].
    if formats is None:
        formats = ["sifr", "html", "screenshot", "axtree"]

    from playwright.sync_api import sync_playwright

    result = CaptureResult(url=url)

    # Create all output directories up front so writes cannot fail on a
    # missing parent.
    output_dir = Path(output_dir)
    for sub in ("sifr", "html", "screenshots", "axtree"):
        (output_dir / sub).mkdir(parents=True, exist_ok=True)

    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=headless)
            # BUGFIX: close the browser even when a capture step raises,
            # instead of leaking it until sync_playwright() unwinds.
            try:
                page = browser.new_page(viewport={"width": 1920, "height": 1080})

                # Navigate and allow dynamic content to settle.
                page.goto(url, wait_until="networkidle", timeout=30000)
                page.wait_for_timeout(2000)  # Extra wait for dynamic content

                # Capture SiFR
                if "sifr" in formats:
                    sifr_data = generate_sifr_from_page(page)
                    sifr_path = output_dir / "sifr" / f"{name}.sifr"
                    with open(sifr_path, "w", encoding="utf-8") as f:
                        json.dump(sifr_data, f, indent=2)
                    result.sifr_path = sifr_path

                # Capture HTML
                if "html" in formats:
                    html_content = page.content()
                    html_path = output_dir / "html" / f"{name}.html"
                    with open(html_path, "w", encoding="utf-8") as f:
                        f.write(html_content)
                    result.html_path = html_path

                # Capture Screenshot (viewport only, not full page)
                if "screenshot" in formats:
                    screenshot_path = output_dir / "screenshots" / f"{name}.png"
                    page.screenshot(path=str(screenshot_path), full_page=False)
                    result.screenshot_path = screenshot_path

                # Capture Accessibility Tree
                if "axtree" in formats:
                    axtree = get_accessibility_tree(page)
                    axtree_path = output_dir / "axtree" / f"{name}.json"
                    with open(axtree_path, "w", encoding="utf-8") as f:
                        json.dump(axtree, f, indent=2)
                    result.axtree_path = axtree_path
            finally:
                browser.close()

    except Exception as e:
        # Report failures on the result instead of raising, so batch
        # callers (capture_multiple) can keep going.
        result.error = str(e)

    return result
223
+
224
+
225
def capture_multiple(
    urls: list[str],
    output_dir: Path,
    formats: Optional[list[str]] = None,
    headless: bool = True
) -> list[CaptureResult]:
    """Capture multiple pages sequentially.

    Args:
        urls: URLs to capture.
        output_dir: Directory passed through to capture_page.
        formats: Formats to capture; None means all.
        headless: Run the browser headless.

    Returns:
        One CaptureResult per URL, in input order.
    """
    # BUGFIX: urlparse was re-imported on every loop iteration; hoisted.
    from urllib.parse import urlparse

    results = []
    for url in urls:
        # Derive a file-safe base name from the URL's host.
        # NOTE(review): two URLs on the same host produce the same name
        # and overwrite each other's output files — confirm intended.
        parsed = urlparse(url)
        name = parsed.netloc.replace(".", "_").replace("www_", "")

        result = capture_page(url, output_dir, name, formats, headless)
        results.append(result)

    return results
@@ -0,0 +1,230 @@
1
+ """
2
+ Capture pages using E2LLM extension API.
3
+ Requires: pip install playwright
4
+ First run: playwright install chromium
5
+ """
6
+
7
+ import asyncio
8
+ import json
9
+ from pathlib import Path
10
+ from typing import Optional
11
+ from dataclasses import dataclass
12
+
13
+
14
@dataclass
class CaptureResult:
    """Everything collected for one page via the E2LLM extension."""

    url: str  # source URL
    sifr: str  # SiFR payload as a JSON string ("" when capture failed)
    html: str  # full page HTML
    axtree: dict  # accessibility-tree snapshot
    screenshot: Optional[bytes] = None  # PNG bytes, when taken
21
+
22
+
23
async def capture_with_e2llm(
    page,
    selector: str = "body",
    timeout: int = 30000
) -> dict:
    """Ask the E2LLM browser extension to capture the current page.

    Dispatches an ``e2llm-capture-request`` CustomEvent in the page and
    waits up to ``timeout`` ms for the matching
    ``e2llm-capture-response``.

    Returns:
        dict with sifr (stringified), html, meta — plus an ``error`` key
        when the extension reported a failure.
    """
    # The whole handshake runs inside the page; the promise rejects on
    # timeout, which surfaces as an exception from evaluate().
    return await page.evaluate("""
    ([selector, timeout]) => {
        return new Promise((resolve, reject) => {
            const id = Date.now().toString();

            const timer = setTimeout(() => {
                reject(new Error('E2LLM capture timeout - is extension installed?'));
            }, timeout);

            document.addEventListener('e2llm-capture-response', (e) => {
                if (e.detail && e.detail.requestId === id) {
                    clearTimeout(timer);

                    // E2LLM v2.6.x returns: {requestId, success, data, meta}
                    // data contains the SiFR structure directly
                    const response = e.detail;

                    if (response.success && response.data) {
                        resolve({
                            sifr: JSON.stringify(response.data, null, 2),
                            meta: response.meta || {},
                            html: document.documentElement.outerHTML
                        });
                    } else {
                        resolve({
                            sifr: '',
                            meta: {},
                            html: document.documentElement.outerHTML,
                            error: response.error || 'Unknown error'
                        });
                    }
                }
            }, { once: true });

            document.dispatchEvent(new CustomEvent('e2llm-capture-request', {
                detail: {
                    requestId: id,
                    selector: selector
                }
            }));
        });
    }
    """, [selector, timeout])
80
+
81
+
82
async def capture_page(
    url: str,
    extension_path: str,
    user_data_dir: str = "./e2llm-chrome-profile",
    headless: bool = False,
    selector: str = "body"
) -> CaptureResult:
    """Capture one page with Playwright plus the E2LLM extension.

    Launches a persistent Chromium context with the extension loaded,
    navigates to ``url`` and collects SiFR, HTML, screenshot and
    accessibility tree into a CaptureResult.
    """
    from playwright.async_api import async_playwright

    launch_args = [
        f"--disable-extensions-except={extension_path}",
        f"--load-extension={extension_path}",
    ]

    async with async_playwright() as pw:
        context = await pw.chromium.launch_persistent_context(
            user_data_dir=user_data_dir,
            headless=headless,
            args=launch_args,
        )

        page = await context.new_page()

        try:
            await page.goto(url, wait_until="networkidle", timeout=30000)
            # Give the extension a moment to initialize on the page.
            await page.wait_for_timeout(2000)

            captured = await capture_with_e2llm(page, selector)
            shot = await page.screenshot(full_page=True)
            tree = await page.accessibility.snapshot()

            return CaptureResult(
                url=url,
                sifr=captured.get("sifr", ""),
                html=captured.get("html", ""),
                axtree=tree or {},
                screenshot=shot,
            )

        finally:
            await context.close()
124
+
125
+
126
async def capture_multiple(
    urls: list[str],
    extension_path: str,
    output_dir: str = "./datasets/formats",
    user_data_dir: str = "./e2llm-chrome-profile"
) -> list[CaptureResult]:
    """
    Capture multiple pages with the E2LLM extension, saving each format
    (sifr/html/axtree/screenshot) under ``output_dir``.

    Per-URL failures are logged and produce empty placeholder files so a
    downstream pipeline keyed on filenames keeps working.
    """
    from playwright.async_api import async_playwright

    output = Path(output_dir)
    for sub in ("sifr", "html", "axtree", "screenshots"):
        (output / sub).mkdir(parents=True, exist_ok=True)

    results: list[CaptureResult] = []

    async with async_playwright() as p:
        context = await p.chromium.launch_persistent_context(
            user_data_dir=user_data_dir,
            headless=False,
            args=[
                f"--disable-extensions-except={extension_path}",
                f"--load-extension={extension_path}",
            ]
        )

        # BUGFIX: close the context even when setup/navigation raises
        # outside the per-URL handler, so the browser doesn't leak.
        try:
            page = await context.new_page()

            for url in urls:
                # BUGFIX: page_id is now computed *before* the try block.
                # It used to be assigned after page.goto(), so a navigation
                # failure made the except handler crash with NameError on
                # the first URL — or silently overwrite the *previous*
                # URL's files with empty content on later ones.
                page_id = url.replace("https://", "").replace("http://", "")
                page_id = page_id.replace("/", "_").replace(".", "_").rstrip("_")

                try:
                    print(f"Capturing: {url}")

                    await page.goto(url, wait_until="networkidle", timeout=30000)
                    await page.wait_for_timeout(2000)  # Wait for extension

                    result = await capture_with_e2llm(page)
                    screenshot = await page.screenshot(full_page=True)

                    # Get real accessibility tree via Playwright
                    axtree = await page.accessibility.snapshot()

                    sifr_content = result.get("sifr", "")
                    html_content = result.get("html", "")

                    # Save files
                    (output / "sifr" / f"{page_id}.sifr").write_text(
                        sifr_content, encoding="utf-8"
                    )
                    (output / "html" / f"{page_id}.html").write_text(
                        html_content, encoding="utf-8"
                    )
                    (output / "axtree" / f"{page_id}.json").write_text(
                        json.dumps(axtree, indent=2, ensure_ascii=False),
                        encoding="utf-8"
                    )
                    (output / "screenshots" / f"{page_id}.png").write_bytes(screenshot)

                    results.append(CaptureResult(
                        url=url,
                        sifr=sifr_content,
                        html=html_content,
                        axtree=axtree or {},
                        screenshot=screenshot
                    ))

                    sifr_size = len(sifr_content)
                    print(f" ✅ Saved: {page_id} (SiFR: {sifr_size} bytes)")

                    await page.wait_for_timeout(500)

                except Exception as e:
                    print(f" ❌ Error: {e}")
                    # Save empty files to avoid breaking pipeline
                    (output / "sifr" / f"{page_id}.sifr").write_text("", encoding="utf-8")
                    (output / "html" / f"{page_id}.html").write_text("", encoding="utf-8")
        finally:
            await context.close()

    return results
212
+
213
+
214
+ if __name__ == "__main__":
215
+ import argparse
216
+
217
+ parser = argparse.ArgumentParser(description="Capture pages using E2LLM extension")
218
+ parser.add_argument("urls", nargs="+", help="URLs to capture")
219
+ parser.add_argument("--extension", "-e", required=True, help="Path to E2LLM extension")
220
+ parser.add_argument("--output", "-o", default="./datasets/formats", help="Output directory")
221
+ parser.add_argument("--profile", default="./e2llm-chrome-profile", help="Chrome profile dir")
222
+
223
+ args = parser.parse_args()
224
+
225
+ asyncio.run(capture_multiple(
226
+ urls=args.urls,
227
+ extension_path=args.extension,
228
+ output_dir=args.output,
229
+ user_data_dir=args.profile
230
+ ))