sifr_benchmark-0.1.15-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sifr_benchmark/cli.py ADDED
@@ -0,0 +1,358 @@
1
+ """
2
+ CLI interface for SiFR Benchmark.
3
+ Each benchmark run creates an isolated directory with all data.
4
+ """
5
+
6
+ import click
7
+ from pathlib import Path
8
+ from rich.console import Console
9
+ from rich.table import Table
10
+ from rich.progress import Progress, SpinnerColumn, TextColumn
11
+ import json
12
+ import os
13
+ from datetime import datetime
14
+
15
+ from . import __version__
16
+ from .runner import BenchmarkRunner
17
+ from .formats import validate_sifr_file
18
+
19
+ console = Console()
20
+
21
+
22
+ def create_run_dir(base_path: str = "./benchmark_runs") -> Path:
23
+ """Create isolated run directory with timestamp."""
24
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
25
+ run_dir = Path(base_path) / f"run_{timestamp}"
26
+
27
+ # Create subdirectories
28
+ (run_dir / "captures" / "sifr").mkdir(parents=True, exist_ok=True)
29
+ (run_dir / "captures" / "html").mkdir(parents=True, exist_ok=True)
30
+ (run_dir / "captures" / "axtree").mkdir(parents=True, exist_ok=True)
31
+ (run_dir / "captures" / "screenshots").mkdir(parents=True, exist_ok=True)
32
+ (run_dir / "ground-truth").mkdir(parents=True, exist_ok=True)
33
+ (run_dir / "results").mkdir(parents=True, exist_ok=True)
34
+
35
+ return run_dir
36
+
37
+
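As a point of reference, here is a minimal sketch of exercising create_run_dir on its own; the ./tmp_runs base path and the printed timestamp are illustrative only:

    from pathlib import Path
    from sifr_benchmark.cli import create_run_dir

    # Create an isolated run directory under a throwaway base path (illustrative).
    run_dir = create_run_dir(base_path="./tmp_runs")

    # The helper pre-creates every capture, ground-truth and results subdirectory.
    for sub in ["captures/sifr", "captures/html", "captures/axtree",
                "captures/screenshots", "ground-truth", "results"]:
        assert (run_dir / sub).is_dir()

    print(run_dir)  # e.g. tmp_runs/run_20250101_120000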
38
+ @click.group()
39
+ @click.version_option(version=__version__)
40
+ def main():
41
+ """SiFR Benchmark - Evaluate LLM understanding of web UI."""
42
+ pass
43
+
44
+
45
+ @main.command()
46
+ @click.option("--models", "-m", default="gpt-4o-mini", help="Models to test")
47
+ @click.option("--formats", "-f", default="sifr,html_raw,axtree", help="Formats to test")
48
+ @click.option("--run-dir", "-d", required=True, type=click.Path(exists=True), help="Run directory with captures")
49
+ @click.option("--runs", "-r", default=1, type=int, help="Runs per test")
50
+ def run(models, formats, run_dir, runs):
51
+ """Run benchmark on existing captures."""
52
+
53
+ console.print(f"\n[bold blue]🚀 SiFR Benchmark v{__version__}[/bold blue]\n")
54
+
55
+ run_path = Path(run_dir)
56
+ model_list = [m.strip() for m in models.split(",")]
57
+ format_list = [f.strip() for f in formats.split(",")]
58
+
59
+ # Check API keys
60
+ if any("gpt" in m for m in model_list) and not os.getenv("OPENAI_API_KEY"):
61
+ console.print("[red]❌ OPENAI_API_KEY not set[/red]")
62
+ return
63
+
64
+ if any("claude" in m for m in model_list) and not os.getenv("ANTHROPIC_API_KEY"):
65
+ console.print("[red]❌ ANTHROPIC_API_KEY not set[/red]")
66
+ return
67
+
68
+ runner = BenchmarkRunner(
69
+ models=model_list,
70
+ formats=format_list,
71
+ runs=runs,
72
+ base_dir=run_path
73
+ )
74
+
75
+ with Progress(SpinnerColumn(), TextColumn("[progress.description]{task.description}"), console=console) as progress:
76
+ task = progress.add_task("Running benchmark...", total=None)
77
+ results = runner.run()
78
+ progress.update(task, completed=True)
79
+
80
+ summary = runner.aggregate(results)
81
+
82
+ # Display results
83
+ table = Table(title="Benchmark Results")
84
+ table.add_column("Format", style="cyan")
85
+ table.add_column("Accuracy", style="green")
86
+ table.add_column("Avg Tokens", style="yellow")
87
+ table.add_column("Avg Latency", style="blue")
88
+
89
+ for row in summary:
90
+ table.add_row(row["format"], row["accuracy"], str(row["avg_tokens"]), row["avg_latency"])
91
+
92
+ console.print(table)
93
+
94
+ # Save results
95
+ with open(run_path / "results" / "raw_results.json", "w") as f:
96
+ json.dump(results, f, indent=2, default=str)
97
+
98
+ with open(run_path / "results" / "summary.json", "w") as f:
99
+ json.dump(summary, f, indent=2)
100
+
101
+ console.print(f"\n[green]✅ Results saved to {run_path}/results/[/green]")
102
+
103
+
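For context, the results table above assumes runner.aggregate() yields one mapping per format with at least the keys consumed by add_row; an illustrative row (all values invented) would look like:

    # Invented example of one aggregated summary row; accuracy and latency arrive pre-formatted.
    example_row = {
        "format": "sifr",
        "accuracy": "87.5%",
        "avg_tokens": 4210,   # passed through str() before display
        "avg_latency": "1.2s",
    }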
104
+ @main.command()
105
+ @click.argument("urls", nargs=-1, required=True)
106
+ @click.option("--extension", "-e", required=True, help="Path to E2LLM extension")
107
+ @click.option("--models", "-m", default="gpt-4o-mini", help="Models to test")
108
+ @click.option("--runs", "-r", default=1, type=int, help="Runs per test")
109
+ @click.option("--base-dir", "-b", default="./benchmark_runs", help="Base directory for runs")
110
+ def full_benchmark_e2llm(urls, extension, models, runs, base_dir):
111
+ """Full benchmark: capture → ground truth → test (isolated run)."""
112
+ import asyncio
113
+
114
+ try:
115
+ from .capture_e2llm import capture_multiple
116
+ except ImportError:
117
+ console.print("[red]Error: playwright not installed[/red]")
118
+ return
119
+
120
+ # Create isolated run directory
121
+ run_dir = create_run_dir(base_dir)
122
+
123
+ console.print(f"[bold blue]🚀 Full Benchmark with E2LLM[/bold blue]")
124
+ console.print(f"Run directory: [cyan]{run_dir}[/cyan]")
125
+ console.print(f"URLs: {len(urls)}")
126
+
127
+ # Step 1: Capture
128
+ console.print("\n[bold]Step 1/3: Capturing with E2LLM...[/bold]")
129
+
130
+ captures_dir = run_dir / "captures"
131
+ results = asyncio.run(capture_multiple(
132
+ urls=list(urls),
133
+ extension_path=extension,
134
+ output_dir=str(captures_dir)
135
+ ))
136
+
137
+ captured_pages = []
138
+ for url in urls:
139
+ page_id = url.replace("https://", "").replace("http://", "")
140
+ page_id = page_id.replace("/", "_").replace(".", "_").rstrip("_")
141
+ captured_pages.append(page_id)
142
+ console.print(f" ✅ Saved: {page_id}")
143
+
144
+ console.print(f"[green]✅ Captured {len(results)} pages[/green]")
145
+
146
+ # Step 2: Ground truth
147
+ console.print("\n[bold]Step 2/3: Generating ground truth...[/bold]")
148
+ from .ground_truth import generate_ground_truth
149
+
150
+ for page_id in captured_pages:
151
+ screenshot_path = captures_dir / "screenshots" / f"{page_id}.png"
152
+ sifr_path = captures_dir / "sifr" / f"{page_id}.sifr"
153
+ gt_output = run_dir / "ground-truth" / f"{page_id}.json"
154
+
155
+ if not screenshot_path.exists():
156
+ console.print(f" ⚠️ {page_id}: screenshot not found")
157
+ continue
158
+
159
+ if not sifr_path.exists():
160
+ console.print(f" ⚠️ {page_id}: sifr not found")
161
+ continue
162
+
163
+ try:
164
+ result = generate_ground_truth(screenshot_path, sifr_path, gt_output)
165
+ if "error" in result:
166
+ console.print(f" ⚠️ {page_id}: {result['error']}")
167
+ else:
168
+ console.print(f" ✅ {page_id}")
169
+ except Exception as e:
170
+ console.print(f" ⚠️ {page_id}: {e}")
171
+
172
+ # Step 3: Benchmark
173
+ console.print("\n[bold]Step 3/3: Running benchmark...[/bold]")
174
+ model_list = [m.strip() for m in models.split(",")]
175
+
176
+ runner = BenchmarkRunner(
177
+ models=model_list,
178
+ formats=["sifr", "html_raw", "axtree"],
179
+ runs=runs,
180
+ base_dir=run_dir
181
+ )
182
+
183
+ bench_results = runner.run()
184
+ summary = runner.aggregate(bench_results)
185
+
186
+ # Display results
187
+ table = Table(title="Benchmark Results")
188
+ table.add_column("Format", style="cyan")
189
+ table.add_column("Accuracy", style="green")
190
+ table.add_column("Avg Tokens", style="yellow")
191
+ table.add_column("Avg Latency", style="blue")
192
+
193
+ for row in summary:
194
+ table.add_row(row["format"], row["accuracy"], str(row["avg_tokens"]), row["avg_latency"])
195
+
196
+ console.print(table)
197
+
198
+ # Save results
199
+ with open(run_dir / "results" / "raw_results.json", "w") as f:
200
+ json.dump(bench_results, f, indent=2, default=str)
201
+
202
+ with open(run_dir / "results" / "summary.json", "w") as f:
203
+ json.dump(summary, f, indent=2)
204
+
205
+ # Save run metadata
206
+ metadata = {
207
+ "timestamp": datetime.now().isoformat(),
208
+ "urls": list(urls),
209
+ "models": model_list,
210
+ "formats": ["sifr", "html_raw", "axtree"],
211
+ "pages": captured_pages
212
+ }
213
+ with open(run_dir / "run_meta.json", "w") as f:
214
+ json.dump(metadata, f, indent=2)
215
+
216
+ console.print(f"\n[green]✅ Benchmark complete![/green]")
217
+ console.print(f"[cyan]Results: {run_dir}[/cyan]")
218
+
219
+
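As a small aside, the page_id derivation in the capture loop above maps each URL to a filesystem-safe stem; for example:

    # Mirrors the page_id derivation used in the capture loop above.
    url = "https://news.ycombinator.com/"
    page_id = url.replace("https://", "").replace("http://", "")
    page_id = page_id.replace("/", "_").replace(".", "_").rstrip("_")
    print(page_id)  # news_ycombinator_com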
220
+ @main.command()
221
+ @click.argument("path", type=click.Path(exists=True))
222
+ def validate(path):
223
+ """Validate SiFR files."""
224
+ path = Path(path)
225
+ files = list(path.glob("**/*.sifr")) if path.is_dir() else [path]
226
+
227
+ valid = invalid = 0
228
+ for f in files:
229
+ try:
230
+ errors = validate_sifr_file(f)
231
+ if errors:
232
+ console.print(f"[red]❌ {f.name}[/red]")
233
+ for err in errors:
234
+ console.print(f" {err}")
235
+ invalid += 1
236
+ else:
237
+ console.print(f"[green]✅ {f.name}[/green]")
238
+ valid += 1
239
+ except Exception as e:
240
+ console.print(f"[red]❌ {f.name}: {e}[/red]")
241
+ invalid += 1
242
+
243
+ console.print(f"\n[bold]Summary: {valid} valid, {invalid} invalid[/bold]")
244
+
245
+
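A hedged sketch of exercising the validate command in-process with Click's test runner; the capture path is hypothetical and must exist for the exists=True check to pass:

    from click.testing import CliRunner
    from sifr_benchmark.cli import main

    # Invoke `validate` without spawning a subprocess; prints the per-file results and summary.
    runner = CliRunner()
    result = runner.invoke(main, ["validate", "benchmark_runs/run_20250101_120000/captures/sifr"])
    print(result.output)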
246
+ @main.command()
247
+ @click.argument("run_dirs", nargs=-1, type=click.Path(exists=True))
248
+ def compare(run_dirs):
249
+ """Compare multiple benchmark runs."""
250
+ if len(run_dirs) < 2:
251
+ console.print("[red]Need at least 2 run directories[/red]")
252
+ return
253
+
254
+ table = Table(title="Run Comparison")
255
+ table.add_column("Run", style="cyan")
256
+ table.add_column("Date", style="dim")
257
+ table.add_column("Pages", style="yellow")
258
+ table.add_column("Best Format", style="green")
259
+ table.add_column("Accuracy", style="magenta")
260
+ table.add_column("Tokens", style="blue")
261
+
262
+ for d in run_dirs:
263
+ run_path = Path(d)
264
+ summary_path = run_path / "results" / "summary.json"
265
+ meta_path = run_path / "run_meta.json"
266
+
267
+ if not summary_path.exists():
268
+ continue
269
+
270
+ with open(summary_path) as f:
271
+ summary = json.load(f)
272
+
273
+ meta = {}
274
+ if meta_path.exists():
275
+ with open(meta_path) as f:
276
+ meta = json.load(f)
277
+
278
+ if summary:
279
+ best = max(summary, key=lambda x: float(x["accuracy"].rstrip("%") or 0))
280
+ table.add_row(
281
+ run_path.name,
282
+ meta.get("timestamp", "")[:10],
283
+ str(len(meta.get("pages", []))),
284
+ best["format"],
285
+ best["accuracy"],
286
+ str(best["avg_tokens"])
287
+ )
288
+
289
+ console.print(table)
290
+
291
+
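The best-format selection above parses the pre-formatted accuracy string; a tiny illustration with invented rows:

    # Invented summary rows to illustrate the max() selection in compare above.
    summary = [
        {"format": "sifr", "accuracy": "82%", "avg_tokens": 3900},
        {"format": "html_raw", "accuracy": "74%", "avg_tokens": 15200},
    ]
    best = max(summary, key=lambda x: float(x["accuracy"].rstrip("%") or 0))
    print(best["format"])  # sifr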
292
+ @main.command()
293
+ def list_runs():
294
+ """List all benchmark runs."""
295
+ runs_dir = Path("./benchmark_runs")
296
+ if not runs_dir.exists():
297
+ console.print("[yellow]No runs found[/yellow]")
298
+ return
299
+
300
+ table = Table(title="Benchmark Runs")
301
+ table.add_column("Run", style="cyan")
302
+ table.add_column("Date", style="dim")
303
+ table.add_column("URLs", style="yellow")
304
+ table.add_column("Status", style="green")
305
+
306
+ for run_dir in sorted(runs_dir.iterdir(), reverse=True):
307
+ if not run_dir.is_dir():
308
+ continue
309
+
310
+ meta_path = run_dir / "run_meta.json"
311
+ results_path = run_dir / "results" / "summary.json"
312
+
313
+ meta = {}
314
+ if meta_path.exists():
315
+ with open(meta_path) as f:
316
+ meta = json.load(f)
317
+
318
+ status = "✅ Complete" if results_path.exists() else "⏳ Partial"
319
+
320
+ table.add_row(
321
+ run_dir.name,
322
+ meta.get("timestamp", "")[:16].replace("T", " "),
323
+ str(len(meta.get("urls", []))),
324
+ status
325
+ )
326
+
327
+ console.print(table)
328
+
329
+
330
+ @main.command()
331
+ def info():
332
+ """Show benchmark information."""
333
+ console.print(f"\n[bold blue]SiFR Benchmark v{__version__}[/bold blue]\n")
334
+ console.print("""
335
+ [bold]Quick Start:[/bold]
336
+ sifr-bench full-benchmark-e2llm https://news.ycombinator.com -e /path/to/extension
337
+
338
+ [bold]Commands:[/bold]
339
+ full-benchmark-e2llm Full pipeline (capture → ground truth → test)
340
+ run Run benchmark on existing captures
341
+ list-runs Show all benchmark runs
342
+ compare Compare multiple runs
343
+
344
+ [bold]Run Structure:[/bold]
345
+ benchmark_runs/run_YYYYMMDD_HHMMSS/
346
+ ├── captures/
347
+ │ ├── sifr/
348
+ │ ├── html/
349
+ │ ├── axtree/
350
+ │ └── screenshots/
351
+ ├── ground-truth/
352
+ ├── results/
353
+ └── run_meta.json
354
+ """)
355
+
356
+
357
+ if __name__ == "__main__":
358
+ main()
sifr_benchmark/formats.py ADDED
@@ -0,0 +1,162 @@
1
+ """
2
+ Format loading and validation utilities.
3
+ Supports isolated run directory structure.
4
+ """
5
+
6
+ import json
7
+ from pathlib import Path
8
+ from typing import Optional
9
+
10
+
11
+ def load_sifr(page_id: str, base_dir: Optional[Path] = None) -> str:
12
+ """Load a SiFR file, truncating if too large."""
13
+ paths_to_try = []
14
+
15
+ # New structure: base_dir/captures/sifr/
16
+ if base_dir:
17
+ paths_to_try.append(base_dir / "captures" / "sifr" / f"{page_id}.sifr")
18
+
19
+ # Legacy paths
20
+ paths_to_try.extend([
21
+ Path(f"datasets/formats/sifr/{page_id}.sifr"),
22
+ Path(f"examples/{page_id}.sifr"),
23
+ ])
24
+
25
+ for path in paths_to_try:
26
+ if path.exists():
27
+ content = path.read_text(encoding="utf-8")
28
+
29
+ # Truncate if too large (gpt-4o-mini context limit is ~128K tokens ≈ 500KB of text)
30
+ MAX_CHARS = 100000 # ~25K tokens, safe limit
31
+ if len(content) > MAX_CHARS:
32
+ content = content[:MAX_CHARS]
33
+ last_newline = content.rfind('\n')
34
+ if last_newline > MAX_CHARS * 0.8:
35
+ content = content[:last_newline]
36
+ content += "\n... [truncated]"
37
+
38
+ return content
39
+
40
+ raise FileNotFoundError(f"SiFR file not found for: {page_id}. Tried: {[str(p) for p in paths_to_try]}")
41
+
42
+
43
+ def load_html(page_id: str, base_dir: Optional[Path] = None, clean: bool = False) -> str:
44
+ """Load an HTML file."""
45
+ suffix = "_clean" if clean else ""
46
+ paths_to_try = []
47
+
48
+ # New structure: base_dir/captures/html/
49
+ if base_dir:
50
+ paths_to_try.append(base_dir / "captures" / "html" / f"{page_id}{suffix}.html")
51
+
52
+ # Legacy paths
53
+ paths_to_try.extend([
54
+ Path(f"datasets/formats/html/{page_id}{suffix}.html"),
55
+ ])
56
+
57
+ for path in paths_to_try:
58
+ if path.exists():
59
+ return path.read_text(encoding="utf-8")
60
+
61
+ raise FileNotFoundError(f"HTML file not found for: {page_id}. Tried: {[str(p) for p in paths_to_try]}")
62
+
63
+
64
+ def load_axtree(page_id: str, base_dir: Optional[Path] = None) -> str:
65
+ """Load an accessibility tree file."""
66
+ paths_to_try = []
67
+
68
+ # New structure: base_dir/captures/axtree/
69
+ if base_dir:
70
+ paths_to_try.append(base_dir / "captures" / "axtree" / f"{page_id}.json")
71
+
72
+ # Legacy paths
73
+ paths_to_try.extend([
74
+ Path(f"datasets/formats/axtree/{page_id}.json"),
75
+ Path(f"datasets/formats/axtree/{page_id}.txt"),
76
+ ])
77
+
78
+ for path in paths_to_try:
79
+ if path.exists():
80
+ content = path.read_text(encoding="utf-8")
81
+ if path.suffix == ".json":
82
+ try:
83
+ data = json.loads(content)
84
+ return json.dumps(data, indent=2)
85
+ except json.JSONDecodeError:
86
+ pass
87
+ return content
88
+
89
+ raise FileNotFoundError(f"AXTree file not found for: {page_id}. Tried: {[str(p) for p in paths_to_try]}")
90
+
91
+
92
+ def load_format(page_id: str, format_name: str, base_dir: Optional[Path] = None) -> str:
93
+ """
94
+ Load a page in specified format.
95
+
96
+ Args:
97
+ page_id: Page identifier
98
+ format_name: One of: sifr, html_raw, html_clean, axtree
99
+ base_dir: Run directory (new structure) or None for legacy
100
+
101
+ Returns:
102
+ File content as string
103
+ """
104
+ if format_name == "sifr":
105
+ return load_sifr(page_id, base_dir)
106
+ elif format_name == "html_raw":
107
+ return load_html(page_id, base_dir, clean=False)
108
+ elif format_name == "html_clean":
109
+ return load_html(page_id, base_dir, clean=True)
110
+ elif format_name == "axtree":
111
+ return load_axtree(page_id, base_dir)
112
+ else:
113
+ raise ValueError(f"Unknown format: {format_name}")
114
+
115
+
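A short usage sketch for load_format under the new run-directory layout; the run path and page id are hypothetical:

    from pathlib import Path
    from sifr_benchmark.formats import load_format

    run_dir = Path("benchmark_runs/run_20250101_120000")  # hypothetical run directory

    # Load the same page in two formats; raises FileNotFoundError if a capture is missing.
    sifr_text = load_format("news_ycombinator_com", "sifr", base_dir=run_dir)
    html_text = load_format("news_ycombinator_com", "html_raw", base_dir=run_dir)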
116
+ def discover_pages(base_dir: Path) -> list[str]:
117
+ """Discover available pages in a run directory."""
118
+ # Look for ground truth files
119
+ gt_dir = base_dir / "ground-truth"
120
+ if gt_dir.exists():
121
+ return [f.stem for f in gt_dir.glob("*.json")]
122
+
123
+ # Fallback: look for SiFR files
124
+ sifr_dir = base_dir / "captures" / "sifr"
125
+ if sifr_dir.exists():
126
+ return [f.stem for f in sifr_dir.glob("*.sifr")]
127
+
128
+ return []
129
+
130
+
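A corresponding sketch of page discovery against a run directory (path hypothetical):

    from pathlib import Path
    from sifr_benchmark.formats import discover_pages

    # Prefers ground-truth JSON stems; falls back to captured .sifr stems.
    pages = discover_pages(Path("benchmark_runs/run_20250101_120000"))
    print(pages)  # e.g. ['news_ycombinator_com']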
131
+ def validate_sifr_file(path: Path) -> list[str]:
132
+ """
133
+ Validate a SiFR file.
134
+ Returns list of error messages (empty if valid).
135
+ """
136
+ errors = []
137
+
138
+ try:
139
+ content = path.read_text(encoding="utf-8")
140
+ except Exception as e:
141
+ return [f"Cannot read file: {e}"]
142
+
143
+ if content.strip().startswith("{"):
144
+ try:
145
+ data = json.loads(content)
146
+ metadata = data.get("====METADATA====", {})
147
+ if not metadata.get("format"):
148
+ errors.append("Missing metadata field: format")
149
+ if not metadata.get("url"):
150
+ errors.append("Missing metadata field: url")
151
+ if "====NODES====" not in data:
152
+ errors.append("Missing NODES section")
153
+ return errors
154
+ except json.JSONDecodeError as e:
155
+ return [f"Invalid JSON: {e}"]
156
+
157
+ required_sections = ["====METADATA====", "====NODES===="]
158
+ for section in required_sections:
159
+ if section not in content:
160
+ errors.append(f"Missing required section: {section}")
161
+
162
+ return errors
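Judged only by the checks above, a text-form capture passes validation as long as both section markers are present; a hedged sketch with an invented stub (it satisfies these structural checks, not the full SiFR schema):

    from pathlib import Path
    from sifr_benchmark.formats import validate_sifr_file

    # Invented text-form stub containing both required section markers.
    stub = "====METADATA====\nformat: sifr\nurl: https://example.com\n====NODES====\n"
    path = Path("stub.sifr")
    path.write_text(stub, encoding="utf-8")
    print(validate_sifr_file(path))  # [] — no missing-section errors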