sifr-benchmark 0.1.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,250 @@
1
+ """
2
+ Ground truth generation for agent tasks using GPT-4o Vision.
3
+ Generates tasks with element IDs as answers.
4
+ """
5
+
6
+ import base64
7
+ import json
8
+ from pathlib import Path
+ from typing import Optional
9
+
10
+ AGENT_GROUND_TRUTH_PROMPT = """You are analyzing a webpage screenshot alongside its SiFR representation.
11
+
12
+ SiFR is a compact format describing UI elements. Each element has an ID like btn001, lnk003, inp001.
13
+
14
+ Your task: Generate agent tasks where the answer is an element ID from the SiFR.
15
+
16
+ Look at the screenshot to understand WHAT each element does.
17
+ Look at the SiFR to find the correct element ID.
18
+
19
+ Generate these task types:
20
+
21
+ 1. ACTION_CLICK (3-5 tasks): "What element ID should I click to [action]?"
22
+ - Login/signup buttons
23
+ - Navigation links
24
+ - Submit buttons
25
+ - Menu items
26
+
27
+ 2. ACTION_INPUT (1-2 tasks): "What element ID should I use to [input action]?"
28
+ - Search fields
29
+ - Text inputs
30
+
31
+ 3. ACTION_LOCATE (2-3 tasks): "What element ID contains [content]?"
32
+ - Main heading
33
+ - Specific text or logo
34
+
35
+ Rules:
36
+ - ONLY use element IDs that exist in the SiFR below
37
+ - Each answer must be a single element ID (e.g., "btn001", "lnk007", "inp001")
38
+ - Tasks should be clear and unambiguous
39
+ - Focus on common agent actions: login, search, navigate, submit
40
+
41
+ Respond ONLY in this JSON format:
42
+ {
43
+ "page_title": "detected page title",
44
+ "tasks": [
45
+ {
46
+ "id": "act_01",
47
+ "type": "action_click",
48
+ "question": "What element ID should I click to login?",
49
+ "answer": "lnk007",
50
+ "element_text": "login"
51
+ },
52
+ {
53
+ "id": "act_02",
54
+ "type": "action_input",
55
+ "question": "What element ID should I use to enter a search query?",
56
+ "answer": "inp001",
57
+ "element_text": "Search"
58
+ }
59
+ ]
60
+ }
61
+
62
+ SiFR content:
63
+ ```
64
+ {sifr_content}
65
+ ```
66
+ """
67
+
68
+
69
+ def encode_image(image_path: Path) -> str:
70
+ """Encode image to base64."""
71
+ with open(image_path, "rb") as f:
72
+ return base64.b64encode(f.read()).decode("utf-8")
73
+
74
+
75
+ def load_sifr(sifr_path: Path) -> str:
76
+ """Load SiFR file content."""
77
+ with open(sifr_path, "r", encoding="utf-8") as f:
78
+ return f.read()
79
+
80
+
81
+ def extract_json(text: str) -> Optional[str]:
82
+ """
83
+ Extract a JSON object from text - handles common GPT response formats. Returns None if no JSON object is found.
84
+ """
85
+ # Try markdown code blocks first
86
+ if "```json" in text:
87
+ try:
88
+ return text.split("```json")[1].split("```")[0].strip()
89
+ except IndexError:
90
+ pass
91
+
92
+ if "```" in text:
93
+ try:
94
+ return text.split("```")[1].split("```")[0].strip()
95
+ except IndexError:
96
+ pass
97
+
98
+ # Find JSON object by matching braces
99
+ start = text.find("{")
100
+ if start == -1:
101
+ return None
102
+
103
+ # Find matching closing brace
104
+ depth = 0
105
+ for i, char in enumerate(text[start:], start):
106
+ if char == "{":
107
+ depth += 1
108
+ elif char == "}":
109
+ depth -= 1
110
+ if depth == 0:
111
+ return text[start:i+1]
112
+
113
+ return None
114
+
115
+
116
+ def generate_ground_truth(
117
+ screenshot_path: Path,
118
+ sifr_path: Path,
119
+ output_path: Optional[Path] = None
120
+ ) -> dict:
121
+ """
122
+ Generate agent ground truth from screenshot + SiFR.
123
+
124
+ Args:
125
+ screenshot_path: Path to screenshot PNG
126
+ sifr_path: Path to SiFR file
127
+ output_path: Optional path to save ground truth JSON
128
+
129
+ Returns:
130
+ Ground truth dict with agent tasks
131
+ """
132
+ import os
133
+ from openai import OpenAI
134
+
135
+ api_key = os.getenv("OPENAI_API_KEY")
136
+ if not api_key:
137
+ return {"error": "OPENAI_API_KEY not set"}
138
+
139
+ client = OpenAI(api_key=api_key)
140
+
141
+ # Load inputs
142
+ base64_image = encode_image(screenshot_path)
143
+ sifr_content = load_sifr(sifr_path)
144
+
145
+ # Check if SiFR is empty
146
+ if not sifr_content or len(sifr_content.strip()) < 10:
147
+ return {"error": f"SiFR file is empty or too small: {sifr_path}"}
148
+
149
+ # Truncate SiFR if too large (GPT-4o has ~128K token limit)
150
+ MAX_SIFR_CHARS = 100000 # ~25K tokens at ~4 chars/token, well under the context limit
151
+ if len(sifr_content) > MAX_SIFR_CHARS:
152
+ # Try to truncate at a sensible point
153
+ sifr_content = sifr_content[:MAX_SIFR_CHARS]
154
+ # Find last complete line
155
+ last_newline = sifr_content.rfind('\n')
156
+ if last_newline > MAX_SIFR_CHARS * 0.8:
157
+ sifr_content = sifr_content[:last_newline]
158
+ sifr_content += "\n... [truncated - showing first 100KB]"
159
+
160
+ # Build prompt with SiFR (use str.replace, not .format() - the prompt's JSON example and the SiFR content contain literal braces)
161
+ prompt = AGENT_GROUND_TRUTH_PROMPT.replace("{sifr_content}", sifr_content)
162
+
163
+ try:
164
+ response = client.chat.completions.create(
165
+ model="gpt-4o",
166
+ messages=[
167
+ {
168
+ "role": "user",
169
+ "content": [
170
+ {"type": "text", "text": prompt},
171
+ {
172
+ "type": "image_url",
173
+ "image_url": {
174
+ "url": f"data:image/png;base64,{base64_image}",
175
+ "detail": "high"
176
+ }
177
+ }
178
+ ]
179
+ }
180
+ ],
181
+ max_tokens=2000,
182
+ temperature=0
183
+ )
184
+
185
+ # Parse response
186
+ content = response.choices[0].message.content
187
+
188
+ # Extract JSON from response - robust parsing
189
+ json_str = extract_json(content)
190
+ if not json_str:
191
+ return {"error": f"Could not extract JSON from response: {content[:100]}..."}
192
+
193
+ ground_truth = json.loads(json_str)
194
+ ground_truth["_meta"] = {
195
+ "screenshot": str(screenshot_path),
196
+ "sifr": str(sifr_path),
197
+ "model": "gpt-4o",
198
+ "tokens": response.usage.total_tokens,
199
+ "mode": "agent"
200
+ }
201
+
202
+ # Save if output path provided
203
+ if output_path:
204
+ output_path.parent.mkdir(parents=True, exist_ok=True)
205
+ with open(output_path, "w", encoding="utf-8") as f:
206
+ json.dump(ground_truth, f, indent=2, ensure_ascii=False)
207
+
208
+ return ground_truth
209
+
210
+ except Exception as e:
211
+ return {"error": str(e)}
212
+
213
+
214
+ def generate_ground_truth_for_page(page_name: str, base_dir: Path = None) -> dict:
215
+ """
216
+ Generate agent ground truth for a captured page.
217
+
218
+ Args:
219
+ page_name: Name of the page (e.g., "news_ycombinator_com")
220
+ base_dir: Base directory with datasets/formats structure
221
+
222
+ Returns:
223
+ Ground truth dict with agent tasks
224
+ """
225
+ if base_dir is None:
226
+ base_dir = Path(".")
227
+
228
+ screenshot_path = base_dir / "datasets" / "formats" / "screenshots" / f"{page_name}.png"
229
+ sifr_path = base_dir / "datasets" / "formats" / "sifr" / f"{page_name}.sifr"
230
+ output_path = base_dir / "benchmark" / "ground-truth" / f"{page_name}.json"
231
+
232
+ if not screenshot_path.exists():
233
+ return {"error": f"Screenshot not found: {screenshot_path}"}
234
+
235
+ if not sifr_path.exists():
236
+ return {"error": f"SiFR not found: {sifr_path}"}
237
+
238
+ return generate_ground_truth(screenshot_path, sifr_path, output_path)
239
+
240
+
241
+ # CLI support
242
+ if __name__ == "__main__":
243
+ import sys
244
+ if len(sys.argv) > 1:
245
+ page_name = sys.argv[1]
246
+ base_dir = Path(sys.argv[2]) if len(sys.argv) > 2 else Path(".")
247
+ result = generate_ground_truth_for_page(page_name, base_dir)
248
+ print(json.dumps(result, indent=2, ensure_ascii=False))
249
+ else:
250
+ print("Usage: python ground_truth.py <page_name> [base_dir]")
@@ -0,0 +1,110 @@
1
+ """
2
+ Model API integrations.
3
+ """
4
+
5
+ import os
6
+ from typing import Optional
7
+
8
+ SUPPORTED_MODELS = {
9
+ # OpenAI
10
+ "gpt-4o": {"provider": "openai", "model": "gpt-4o"},
11
+ "gpt-4o-mini": {"provider": "openai", "model": "gpt-4o-mini"},
12
+ "gpt-4-turbo": {"provider": "openai", "model": "gpt-4-turbo"},
13
+ # Anthropic
14
+ "claude-sonnet": {"provider": "anthropic", "model": "claude-sonnet-4-20250514"},
15
+ "claude-haiku": {"provider": "anthropic", "model": "claude-haiku-4-5-20251001"},
16
+ "claude-opus": {"provider": "anthropic", "model": "claude-opus-4-20250514"},
17
+ }
18
+
19
+
20
+ def query_openai(model_id: str, prompt: str) -> dict:
21
+ """Query OpenAI API."""
22
+ try:
23
+ from openai import OpenAI
24
+ except ImportError:
25
+ return {"error": "openai package not installed", "response": None, "tokens": 0}
26
+
27
+ api_key = os.getenv("OPENAI_API_KEY")
28
+ if not api_key:
29
+ return {"error": "OPENAI_API_KEY not set", "response": None, "tokens": 0}
30
+
31
+ try:
32
+ client = OpenAI(api_key=api_key)
33
+ response = client.chat.completions.create(
34
+ model=model_id,
35
+ messages=[{"role": "user", "content": prompt}],
36
+ temperature=0,
37
+ max_tokens=500,
38
+ )
39
+
40
+ return {
41
+ "response": response.choices[0].message.content,
42
+ "tokens": response.usage.total_tokens,
43
+ "error": None,
44
+ }
45
+ except Exception as e:
46
+ return {"error": str(e), "response": None, "tokens": 0}
47
+
48
+
49
+ def query_anthropic(model_id: str, prompt: str) -> dict:
50
+ """Query Anthropic API."""
51
+ try:
52
+ from anthropic import Anthropic
53
+ except ImportError:
54
+ return {"error": "anthropic package not installed", "response": None, "tokens": 0}
55
+
56
+ api_key = os.getenv("ANTHROPIC_API_KEY")
57
+ if not api_key:
58
+ return {"error": "ANTHROPIC_API_KEY not set", "response": None, "tokens": 0}
59
+
60
+ try:
61
+ client = Anthropic(api_key=api_key)
62
+ response = client.messages.create(
63
+ model=model_id,
64
+ max_tokens=500,
65
+ messages=[{"role": "user", "content": prompt}],
66
+ )
67
+
68
+ total_tokens = response.usage.input_tokens + response.usage.output_tokens
69
+
70
+ return {
71
+ "response": response.content[0].text,
72
+ "tokens": total_tokens,
73
+ "error": None,
74
+ }
75
+ except Exception as e:
76
+ return {"error": str(e), "response": None, "tokens": 0}
77
+
78
+
79
+ def query_model(model_key: str, prompt: str) -> dict:
80
+ """
81
+ Query a model by key.
82
+
83
+ Args:
84
+ model_key: Key from SUPPORTED_MODELS (e.g., "gpt-4o-mini")
85
+ prompt: The prompt to send
86
+
87
+ Returns:
88
+ dict with keys: response, tokens, error
89
+ """
90
+ if model_key not in SUPPORTED_MODELS:
91
+ return {
92
+ "error": f"Unknown model: {model_key}",
93
+ "response": None,
94
+ "tokens": 0,
95
+ }
96
+
97
+ config = SUPPORTED_MODELS[model_key]
98
+ provider = config["provider"]
99
+ model_id = config["model"]
100
+
101
+ if provider == "openai":
102
+ return query_openai(model_id, prompt)
103
+ elif provider == "anthropic":
104
+ return query_anthropic(model_id, prompt)
105
+ else:
106
+ return {
107
+ "error": f"Unknown provider: {provider}",
108
+ "response": None,
109
+ "tokens": 0,
110
+ }
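A short usage sketch for the provider dispatcher above (assuming the module is importable as sifr_benchmark.models and the matching API key is set in the environment):

# Sketch only - the import path is an assumption.
from sifr_benchmark.models import SUPPORTED_MODELS, query_model

print("available models:", list(SUPPORTED_MODELS))
result = query_model("gpt-4o-mini", "Reply with the single word: ok")
if result["error"]:
    print("query failed:", result["error"])
else:
    print(result["response"], f"({result['tokens']} tokens)")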
@@ -0,0 +1,315 @@
1
+ """
2
+ Benchmark runner - executes agent tasks across models and formats.
3
+ """
4
+
5
+ import json
6
+ import time
7
+ from pathlib import Path
8
+ from typing import Optional
9
+ from dataclasses import dataclass
10
+
11
+ from .models import query_model, SUPPORTED_MODELS
12
+ from .scoring import score_response
13
+ from .formats import load_format
14
+
15
+
16
+ @dataclass
17
+ class TestResult:
18
+ model: str
19
+ format: str
20
+ page_id: str
21
+ task_id: str
22
+ run: int
23
+ response: str
24
+ expected: str
25
+ score: float
26
+ confidence: int
27
+ tokens: int
28
+ latency_ms: int
29
+ error: Optional[str] = None
30
+
31
+
32
+ # Agent-focused prompt template
33
+ AGENT_PROMPT = """You are a web automation agent. You need to identify which UI element to interact with.
34
+
35
+ The webpage is described below in {format_name} format:
36
+
37
+ {context}
38
+
39
+ TASK: {question}
40
+
41
+ Rules:
42
+ - Return ONLY the element ID (like btn001, lnk007, inp001)
43
+ - If multiple elements match, return the most relevant one
44
+ - If no element matches, respond with "none"
45
+
46
+ Respond in this exact format:
47
+ ANSWER: [element ID]
48
+ CONFIDENCE: [0-100]"""
49
+
50
+
51
+ class BenchmarkRunner:
52
+ """Main benchmark runner for agent tasks."""
53
+
54
+ def __init__(
55
+ self,
56
+ models: list[str],
57
+ formats: list[str],
58
+ pages: Optional[list[str]] = None,
59
+ runs: int = 1,
60
+ base_dir: Optional[Path] = None,
61
+ ):
62
+ self.models = models
63
+ self.formats = formats
64
+ self.pages = pages
65
+ self.runs = runs
66
+ self.base_dir = Path(base_dir) if base_dir else Path(".")
67
+
68
+ self._validate_config()
69
+
70
+ def _validate_config(self):
71
+ """Validate configuration."""
72
+ for model in self.models:
73
+ if model not in SUPPORTED_MODELS:
74
+ raise ValueError(f"Unknown model: {model}. Supported: {list(SUPPORTED_MODELS.keys())}")
75
+
76
+ def _load_ground_truth(self, page_id: str) -> dict:
77
+ """Load ground truth for a page."""
78
+ patterns = [
79
+ self.base_dir / "ground-truth" / f"{page_id}.json", # New structure
80
+ self.base_dir / "benchmark" / "ground-truth" / f"{page_id}.json", # Legacy
81
+ ]
82
+
83
+ for path in patterns:
84
+ if path.exists():
85
+ with open(path, encoding="utf-8") as f:
86
+ return json.load(f)
87
+
88
+ return {}
89
+
90
+ def _get_tasks_from_ground_truth(self, ground_truth: dict) -> list[dict]:
91
+ """Extract tasks from ground truth."""
92
+ # New agent format: tasks array
93
+ if "tasks" in ground_truth and isinstance(ground_truth["tasks"], list):
94
+ return ground_truth["tasks"]
95
+
96
+ # Legacy format: convert to tasks
97
+ legacy_tasks = []
98
+
99
+ if ground_truth.get("title"):
100
+ legacy_tasks.append({
101
+ "id": "ext_01",
102
+ "type": "action_locate",
103
+ "question": "What element ID contains the page title?",
104
+ "answer": ground_truth.get("title", "")
105
+ })
106
+
107
+ if ground_truth.get("primary_button", {}).get("text"):
108
+ legacy_tasks.append({
109
+ "id": "act_01",
110
+ "type": "action_click",
111
+ "question": f"What element ID should I click to {ground_truth['primary_button']['text']}?",
112
+ "answer": "" # Unknown in legacy format
113
+ })
114
+
115
+ return legacy_tasks
116
+
117
+ def _discover_pages(self) -> list[str]:
118
+ """Discover available test pages."""
119
+ if self.pages:
120
+ return self.pages
121
+
122
+ # New structure: base_dir/ground-truth/
123
+ gt_path = self.base_dir / "ground-truth"
124
+ if gt_path.exists():
125
+ pages = [f.stem for f in gt_path.glob("*.json")]
126
+ if pages:
127
+ return pages
128
+
129
+ # Fallback: look for SiFR files
130
+ sifr_path = self.base_dir / "captures" / "sifr"
131
+ if sifr_path.exists():
132
+ pages = [f.stem for f in sifr_path.glob("*.sifr")]
133
+ if pages:
134
+ return pages
135
+
136
+ return []
137
+
138
+ def _build_prompt(self, task: dict, context: str, format_name: str) -> str:
139
+ """Build prompt for agent task."""
140
+ return AGENT_PROMPT.format(
141
+ format_name=format_name,
142
+ context=context,
143
+ question=task["question"]
144
+ )
145
+
146
+ def _parse_response(self, raw: str) -> dict:
147
+ """Parse model response."""
148
+ result = {"answer": "", "confidence": 50}
149
+
150
+ for line in raw.split("\n"):
151
+ line = line.strip()
152
+ if line.upper().startswith("ANSWER:"):
153
+ result["answer"] = line.split(":", 1)[1].strip()
154
+ elif line.upper().startswith("CONFIDENCE:"):
155
+ try:
156
+ result["confidence"] = int(line.split(":", 1)[1].strip())
157
+ except ValueError:
158
+ pass
159
+
160
+ # Fallback: try to extract element ID from raw response
161
+ if not result["answer"]:
162
+ import re
163
+ ids = re.findall(r'\b([a-z]{2,4}\d{2,4})\b', raw.lower())
164
+ if ids:
165
+ result["answer"] = ids[0]
166
+ else:
167
+ result["answer"] = raw.strip()[:100]
168
+
169
+ return result
170
+
171
+ def run_single(
172
+ self,
173
+ model: str,
174
+ format_name: str,
175
+ page_id: str,
176
+ task: dict,
177
+ run_num: int,
178
+ ) -> TestResult:
179
+ """Run a single test."""
180
+
181
+ try:
182
+ context = load_format(page_id, format_name, self.base_dir)
183
+ except FileNotFoundError as e:
184
+ return TestResult(
185
+ model=model,
186
+ format=format_name,
187
+ page_id=page_id,
188
+ task_id=task["id"],
189
+ run=run_num,
190
+ response="",
191
+ expected=task.get("answer", ""),
192
+ score=0.0,
193
+ confidence=0,
194
+ tokens=0,
195
+ latency_ms=0,
196
+ error=f"Format file not found: {format_name}/{page_id}",
197
+ )
198
+
199
+ prompt = self._build_prompt(task, context, format_name)
200
+
201
+ start = time.time()
202
+ response_data = query_model(model, prompt)
203
+ latency = int((time.time() - start) * 1000)
204
+
205
+ if response_data.get("error"):
206
+ return TestResult(
207
+ model=model,
208
+ format=format_name,
209
+ page_id=page_id,
210
+ task_id=task["id"],
211
+ run=run_num,
212
+ response="",
213
+ expected=task.get("answer", ""),
214
+ score=0.0,
215
+ confidence=0,
216
+ tokens=response_data.get("tokens", 0),
217
+ latency_ms=latency,
218
+ error=response_data["error"],
219
+ )
220
+
221
+ parsed = self._parse_response(response_data["response"])
222
+ expected = task.get("answer", "")
223
+ element_text = task.get("element_text", "") # For HTML/AXTree fallback
224
+
225
+ score = score_response(
226
+ parsed["answer"],
227
+ expected,
228
+ task.get("type", "action"),
229
+ element_text
230
+ )
231
+
232
+ return TestResult(
233
+ model=model,
234
+ format=format_name,
235
+ page_id=page_id,
236
+ task_id=task["id"],
237
+ run=run_num,
238
+ response=parsed["answer"],
239
+ expected=expected,
240
+ score=score,
241
+ confidence=parsed["confidence"],
242
+ tokens=response_data.get("tokens", 0),
243
+ latency_ms=latency,
244
+ )
245
+
246
+ def run(self) -> list[dict]:
247
+ """Run full benchmark."""
248
+ pages = self._discover_pages()
249
+ results = []
250
+
251
+ if not pages:
252
+ print("No pages found for benchmark")
253
+ return results
254
+
255
+ for page_id in pages:
256
+ ground_truth = self._load_ground_truth(page_id)
257
+ tasks = self._get_tasks_from_ground_truth(ground_truth)
258
+
259
+ if not tasks:
260
+ print(f" ⚠️ No tasks for {page_id}")
261
+ continue
262
+
263
+ for model in self.models:
264
+ for format_name in self.formats:
265
+ for task in tasks:
266
+ for run_num in range(1, self.runs + 1):
267
+ result = self.run_single(
268
+ model=model,
269
+ format_name=format_name,
270
+ page_id=page_id,
271
+ task=task,
272
+ run_num=run_num,
273
+ )
274
+ results.append(result.__dict__)
275
+
276
+ # Rate limiting
277
+ time.sleep(0.3)
278
+
279
+ return results
280
+
281
+ def aggregate(self, results: list[dict]) -> list[dict]:
282
+ """Aggregate results by format (across models)."""
283
+ agg = {}
284
+
285
+ for r in results:
286
+ if r.get("error"):
287
+ continue
288
+
289
+ key = r["format"]
290
+ if key not in agg:
291
+ agg[key] = {
292
+ "format": r["format"],
293
+ "scores": [],
294
+ "tokens": [],
295
+ "latencies": [],
296
+ }
297
+
298
+ agg[key]["scores"].append(r["score"])
299
+ agg[key]["tokens"].append(r["tokens"])
300
+ agg[key]["latencies"].append(r["latency_ms"])
301
+
302
+ summary = []
303
+ for data in agg.values():
304
+ scores = data["scores"]
305
+ tokens = data["tokens"]
306
+ latencies = data["latencies"]
307
+
308
+ summary.append({
309
+ "format": data["format"],
310
+ "accuracy": f"{(sum(scores) / len(scores) * 100):.1f}%" if scores else "N/A",
311
+ "avg_tokens": int(sum(tokens) / len(tokens)) if tokens else 0,
312
+ "avg_latency": f"{int(sum(latencies) / len(latencies))}ms" if latencies else "N/A",
313
+ })
314
+
315
+ return sorted(summary, key=lambda x: float(x["accuracy"].rstrip("%")) if x["accuracy"].endswith("%") else 0.0, reverse=True)
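Tying the three modules together, a hedged end-to-end sketch. The package name, directory layout, and format names are assumptions based on the code above; load_format and score_response come from sibling modules not included in this diff.

# Sketch only - format names and directories are illustrative.
from pathlib import Path
from sifr_benchmark.runner import BenchmarkRunner

runner = BenchmarkRunner(
    models=["gpt-4o-mini", "claude-haiku"],  # keys from SUPPORTED_MODELS
    formats=["sifr", "html"],                # whatever load_format() understands
    pages=None,                              # None -> auto-discover from ground-truth/*.json
    runs=2,
    base_dir=Path("benchmark"),
)
raw_results = runner.run()                   # one dict per (model, format, page, task, run)
for row in runner.aggregate(raw_results):    # per-format accuracy, avg tokens, avg latency
    print(row)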