sifr-benchmark 0.1.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sifr_benchmark/__init__.py +22 -0
- sifr_benchmark/capture.py +242 -0
- sifr_benchmark/capture_e2llm.py +230 -0
- sifr_benchmark/cli.py +358 -0
- sifr_benchmark/formats.py +162 -0
- sifr_benchmark/ground_truth.py +250 -0
- sifr_benchmark/models.py +110 -0
- sifr_benchmark/runner.py +315 -0
- sifr_benchmark/scoring.py +117 -0
- sifr_benchmark/verify.py +224 -0
- sifr_benchmark-0.1.15.dist-info/METADATA +186 -0
- sifr_benchmark-0.1.15.dist-info/RECORD +15 -0
- sifr_benchmark-0.1.15.dist-info/WHEEL +4 -0
- sifr_benchmark-0.1.15.dist-info/entry_points.txt +2 -0
- sifr_benchmark-0.1.15.dist-info/licenses/LICENSE +21 -0
sifr_benchmark/ground_truth.py
ADDED

@@ -0,0 +1,250 @@
"""
Ground truth generation for agent tasks using GPT-4o Vision.
Generates tasks with element IDs as answers.
"""

import base64
import json
from pathlib import Path

AGENT_GROUND_TRUTH_PROMPT = """You are analyzing a webpage screenshot alongside its SiFR representation.

SiFR is a compact format describing UI elements. Each element has an ID like btn001, lnk003, inp001.

Your task: Generate agent tasks where the answer is an element ID from the SiFR.

Look at the screenshot to understand WHAT each element does.
Look at the SiFR to find the correct element ID.

Generate these task types:

1. ACTION_CLICK (3-5 tasks): "What element ID should I click to [action]?"
   - Login/signup buttons
   - Navigation links
   - Submit buttons
   - Menu items

2. ACTION_INPUT (1-2 tasks): "What element ID should I use to [input action]?"
   - Search fields
   - Text inputs

3. ACTION_LOCATE (2-3 tasks): "What element ID contains [content]?"
   - Main heading
   - Specific text or logo

Rules:
- ONLY use element IDs that exist in the SiFR below
- Each answer must be a single element ID (e.g., "btn001", "lnk007", "inp001")
- Tasks should be clear and unambiguous
- Focus on common agent actions: login, search, navigate, submit

Respond ONLY in this JSON format:
{
  "page_title": "detected page title",
  "tasks": [
    {
      "id": "act_01",
      "type": "action_click",
      "question": "What element ID should I click to login?",
      "answer": "lnk007",
      "element_text": "login"
    },
    {
      "id": "act_02",
      "type": "action_input",
      "question": "What element ID should I use to enter a search query?",
      "answer": "inp001",
      "element_text": "Search"
    }
  ]
}

SiFR content:
```
{sifr_content}
```
"""


def encode_image(image_path: Path) -> str:
    """Encode image to base64."""
    with open(image_path, "rb") as f:
        return base64.b64encode(f.read()).decode("utf-8")


def load_sifr(sifr_path: Path) -> str:
    """Load SiFR file content."""
    with open(sifr_path, "r", encoding="utf-8") as f:
        return f.read()


def extract_json(text: str) -> str:
    """
    Extract JSON object from text - handles various GPT response formats.
    """
    # Try markdown code blocks first
    if "```json" in text:
        try:
            return text.split("```json")[1].split("```")[0].strip()
        except IndexError:
            pass

    if "```" in text:
        try:
            return text.split("```")[1].split("```")[0].strip()
        except IndexError:
            pass

    # Find JSON object by matching braces
    start = text.find("{")
    if start == -1:
        return None

    # Find matching closing brace
    depth = 0
    for i, char in enumerate(text[start:], start):
        if char == "{":
            depth += 1
        elif char == "}":
            depth -= 1
            if depth == 0:
                return text[start:i+1]

    return None


def generate_ground_truth(
    screenshot_path: Path,
    sifr_path: Path,
    output_path: Path = None
) -> dict:
    """
    Generate agent ground truth from screenshot + SiFR.

    Args:
        screenshot_path: Path to screenshot PNG
        sifr_path: Path to SiFR file
        output_path: Optional path to save ground truth JSON

    Returns:
        Ground truth dict with agent tasks
    """
    import os
    from openai import OpenAI

    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        return {"error": "OPENAI_API_KEY not set"}

    client = OpenAI(api_key=api_key)

    # Load inputs
    base64_image = encode_image(screenshot_path)
    sifr_content = load_sifr(sifr_path)

    # Check if SiFR is empty
    if not sifr_content or len(sifr_content.strip()) < 10:
        return {"error": f"SiFR file is empty or too small: {sifr_path}"}

    # Truncate SiFR if too large (GPT-4o has a ~128K token context limit)
    MAX_SIFR_CHARS = 100000  # ~25K tokens, safe limit
    if len(sifr_content) > MAX_SIFR_CHARS:
        # Try to truncate at a sensible point
        sifr_content = sifr_content[:MAX_SIFR_CHARS]
        # Find last complete line
        last_newline = sifr_content.rfind('\n')
        if last_newline > MAX_SIFR_CHARS * 0.8:
            sifr_content = sifr_content[:last_newline]
        sifr_content += "\n... [truncated - showing first 100KB]"

    # Build prompt with .replace(), not .format() - the template contains literal {}
    prompt = AGENT_GROUND_TRUTH_PROMPT.replace("{sifr_content}", sifr_content)

    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/png;base64,{base64_image}",
                                "detail": "high"
                            }
                        }
                    ]
                }
            ],
            max_tokens=2000,
            temperature=0
        )

        # Parse response
        content = response.choices[0].message.content

        # Extract JSON from response - robust parsing
        json_str = extract_json(content)
        if not json_str:
            return {"error": f"Could not extract JSON from response: {content[:100]}..."}

        ground_truth = json.loads(json_str)
        ground_truth["_meta"] = {
            "screenshot": str(screenshot_path),
            "sifr": str(sifr_path),
            "model": "gpt-4o",
            "tokens": response.usage.total_tokens,
            "mode": "agent"
        }

        # Save if output path provided
        if output_path:
            output_path.parent.mkdir(parents=True, exist_ok=True)
            with open(output_path, "w", encoding="utf-8") as f:
                json.dump(ground_truth, f, indent=2, ensure_ascii=False)

        return ground_truth

    except Exception as e:
        return {"error": str(e)}


def generate_ground_truth_for_page(page_name: str, base_dir: Path = None) -> dict:
    """
    Generate agent ground truth for a captured page.

    Args:
        page_name: Name of the page (e.g., "news_ycombinator_com")
        base_dir: Base directory with datasets/formats structure

    Returns:
        Ground truth dict with agent tasks
    """
    if base_dir is None:
        base_dir = Path(".")

    screenshot_path = base_dir / "datasets" / "formats" / "screenshots" / f"{page_name}.png"
    sifr_path = base_dir / "datasets" / "formats" / "sifr" / f"{page_name}.sifr"
    output_path = base_dir / "benchmark" / "ground-truth" / f"{page_name}.json"

    if not screenshot_path.exists():
        return {"error": f"Screenshot not found: {screenshot_path}"}

    if not sifr_path.exists():
        return {"error": f"SiFR not found: {sifr_path}"}

    return generate_ground_truth(screenshot_path, sifr_path, output_path)


# CLI support
if __name__ == "__main__":
    import sys
    if len(sys.argv) > 1:
        page_name = sys.argv[1]
        base_dir = Path(sys.argv[2]) if len(sys.argv) > 2 else Path(".")
        result = generate_ground_truth_for_page(page_name, base_dir)
        print(json.dumps(result, indent=2, ensure_ascii=False))
    else:
        print("Usage: python ground_truth.py <page_name> [base_dir]")
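
For reference, a minimal usage sketch (not part of the package) showing how this module might be driven directly; it assumes OPENAI_API_KEY is exported and that a capture for the page already exists under ./datasets/formats/:

from pathlib import Path
from sifr_benchmark.ground_truth import generate_ground_truth_for_page

# "news_ycombinator_com" is the example page name from the module's docstring.
result = generate_ground_truth_for_page("news_ycombinator_com", Path("."))
if "error" in result:
    print("generation failed:", result["error"])
else:
    # On success the dict follows the prompt's JSON schema: page_title + tasks.
    print(f"{len(result['tasks'])} tasks generated for {result['page_title']}")
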
sifr_benchmark/models.py
ADDED

@@ -0,0 +1,110 @@
"""
Model API integrations.
"""

import os
from typing import Optional

SUPPORTED_MODELS = {
    # OpenAI
    "gpt-4o": {"provider": "openai", "model": "gpt-4o"},
    "gpt-4o-mini": {"provider": "openai", "model": "gpt-4o-mini"},
    "gpt-4-turbo": {"provider": "openai", "model": "gpt-4-turbo"},
    # Anthropic
    "claude-sonnet": {"provider": "anthropic", "model": "claude-sonnet-4-20250514"},
    "claude-haiku": {"provider": "anthropic", "model": "claude-haiku-4-5-20251001"},
    "claude-opus": {"provider": "anthropic", "model": "claude-opus-4-20250514"},
}


def query_openai(model_id: str, prompt: str) -> dict:
    """Query OpenAI API."""
    try:
        from openai import OpenAI
    except ImportError:
        return {"error": "openai package not installed", "response": None, "tokens": 0}

    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        return {"error": "OPENAI_API_KEY not set", "response": None, "tokens": 0}

    try:
        client = OpenAI(api_key=api_key)
        response = client.chat.completions.create(
            model=model_id,
            messages=[{"role": "user", "content": prompt}],
            temperature=0,
            max_tokens=500,
        )

        return {
            "response": response.choices[0].message.content,
            "tokens": response.usage.total_tokens,
            "error": None,
        }
    except Exception as e:
        return {"error": str(e), "response": None, "tokens": 0}


def query_anthropic(model_id: str, prompt: str) -> dict:
    """Query Anthropic API."""
    try:
        from anthropic import Anthropic
    except ImportError:
        return {"error": "anthropic package not installed", "response": None, "tokens": 0}

    api_key = os.getenv("ANTHROPIC_API_KEY")
    if not api_key:
        return {"error": "ANTHROPIC_API_KEY not set", "response": None, "tokens": 0}

    try:
        client = Anthropic(api_key=api_key)
        response = client.messages.create(
            model=model_id,
            max_tokens=500,
            messages=[{"role": "user", "content": prompt}],
        )

        total_tokens = response.usage.input_tokens + response.usage.output_tokens

        return {
            "response": response.content[0].text,
            "tokens": total_tokens,
            "error": None,
        }
    except Exception as e:
        return {"error": str(e), "response": None, "tokens": 0}


def query_model(model_key: str, prompt: str) -> dict:
    """
    Query a model by key.

    Args:
        model_key: Key from SUPPORTED_MODELS (e.g., "gpt-4o-mini")
        prompt: The prompt to send

    Returns:
        dict with keys: response, tokens, error
    """
    if model_key not in SUPPORTED_MODELS:
        return {
            "error": f"Unknown model: {model_key}",
            "response": None,
            "tokens": 0,
        }

    config = SUPPORTED_MODELS[model_key]
    provider = config["provider"]
    model_id = config["model"]

    if provider == "openai":
        return query_openai(model_id, prompt)
    elif provider == "anthropic":
        return query_anthropic(model_id, prompt)
    else:
        return {
            "error": f"Unknown provider: {provider}",
            "response": None,
            "tokens": 0,
        }
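
A minimal sketch (not part of the package) of calling the dispatcher directly; it assumes the matching API key is set in the environment:

from sifr_benchmark.models import query_model

# query_model returns {"response", "tokens", "error"} for every outcome,
# so callers can branch on "error" instead of using try/except.
out = query_model("gpt-4o-mini", "Reply with the single word: ok")
if out["error"]:
    print("query failed:", out["error"])
else:
    print(out["response"], f"({out['tokens']} tokens)")
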
sifr_benchmark/runner.py
ADDED

@@ -0,0 +1,315 @@
"""
Benchmark runner - executes agent tasks across models and formats.
"""

import json
import time
from pathlib import Path
from typing import Optional
from dataclasses import dataclass

from .models import query_model, SUPPORTED_MODELS
from .scoring import score_response
from .formats import load_format


@dataclass
class TestResult:
    model: str
    format: str
    page_id: str
    task_id: str
    run: int
    response: str
    expected: str
    score: float
    confidence: int
    tokens: int
    latency_ms: int
    error: Optional[str] = None


# Agent-focused prompt template
AGENT_PROMPT = """You are a web automation agent. You need to identify which UI element to interact with.

The webpage is described below in {format_name} format:

{context}

TASK: {question}

Rules:
- Return ONLY the element ID (like btn001, lnk007, inp001)
- If multiple elements match, return the most relevant one
- If no element matches, respond with "none"

Respond in this exact format:
ANSWER: [element ID]
CONFIDENCE: [0-100]"""


class BenchmarkRunner:
    """Main benchmark runner for agent tasks."""

    def __init__(
        self,
        models: list[str],
        formats: list[str],
        pages: Optional[list[str]] = None,
        runs: int = 1,
        base_dir: Optional[Path] = None,
    ):
        self.models = models
        self.formats = formats
        self.pages = pages
        self.runs = runs
        self.base_dir = Path(base_dir) if base_dir else Path(".")

        self._validate_config()

    def _validate_config(self):
        """Validate configuration."""
        for model in self.models:
            if model not in SUPPORTED_MODELS:
                raise ValueError(f"Unknown model: {model}. Supported: {list(SUPPORTED_MODELS.keys())}")

    def _load_ground_truth(self, page_id: str) -> dict:
        """Load ground truth for a page."""
        patterns = [
            self.base_dir / "ground-truth" / f"{page_id}.json",  # New structure
            self.base_dir / "benchmark" / "ground-truth" / f"{page_id}.json",  # Legacy
        ]

        for path in patterns:
            if path.exists():
                with open(path, encoding="utf-8") as f:
                    return json.load(f)

        return {}

    def _get_tasks_from_ground_truth(self, ground_truth: dict) -> list[dict]:
        """Extract tasks from ground truth."""
        # New agent format: tasks array
        if "tasks" in ground_truth and isinstance(ground_truth["tasks"], list):
            return ground_truth["tasks"]

        # Legacy format: convert to tasks
        legacy_tasks = []

        if ground_truth.get("title"):
            legacy_tasks.append({
                "id": "ext_01",
                "type": "action_locate",
                "question": "What element ID contains the page title?",
                "answer": ground_truth.get("title", "")
            })

        if ground_truth.get("primary_button", {}).get("text"):
            legacy_tasks.append({
                "id": "act_01",
                "type": "action_click",
                "question": f"What element ID should I click to {ground_truth['primary_button']['text']}?",
                "answer": ""  # Unknown in legacy format
            })

        return legacy_tasks

    def _discover_pages(self) -> list[str]:
        """Discover available test pages."""
        if self.pages:
            return self.pages

        # New structure: base_dir/ground-truth/
        gt_path = self.base_dir / "ground-truth"
        if gt_path.exists():
            pages = [f.stem for f in gt_path.glob("*.json")]
            if pages:
                return pages

        # Fallback: look for SiFR files
        sifr_path = self.base_dir / "captures" / "sifr"
        if sifr_path.exists():
            pages = [f.stem for f in sifr_path.glob("*.sifr")]
            if pages:
                return pages

        return []

    def _build_prompt(self, task: dict, context: str, format_name: str) -> str:
        """Build prompt for agent task."""
        return AGENT_PROMPT.format(
            format_name=format_name,
            context=context,
            question=task["question"]
        )

    def _parse_response(self, raw: str) -> dict:
        """Parse model response."""
        result = {"answer": "", "confidence": 50}

        for line in raw.split("\n"):
            line = line.strip()
            if line.upper().startswith("ANSWER:"):
                result["answer"] = line.split(":", 1)[1].strip()
            elif line.upper().startswith("CONFIDENCE:"):
                try:
                    result["confidence"] = int(line.split(":", 1)[1].strip())
                except ValueError:
                    pass

        # Fallback: try to extract element ID from raw response
        if not result["answer"]:
            import re
            ids = re.findall(r'\b([a-z]{2,4}\d{2,4})\b', raw.lower())
            if ids:
                result["answer"] = ids[0]
            else:
                result["answer"] = raw.strip()[:100]

        return result

    def run_single(
        self,
        model: str,
        format_name: str,
        page_id: str,
        task: dict,
        run_num: int,
    ) -> TestResult:
        """Run a single test."""

        try:
            context = load_format(page_id, format_name, self.base_dir)
        except FileNotFoundError:
            return TestResult(
                model=model,
                format=format_name,
                page_id=page_id,
                task_id=task["id"],
                run=run_num,
                response="",
                expected=task.get("answer", ""),
                score=0.0,
                confidence=0,
                tokens=0,
                latency_ms=0,
                error=f"Format file not found: {format_name}/{page_id}",
            )

        prompt = self._build_prompt(task, context, format_name)

        start = time.time()
        response_data = query_model(model, prompt)
        latency = int((time.time() - start) * 1000)

        if response_data.get("error"):
            return TestResult(
                model=model,
                format=format_name,
                page_id=page_id,
                task_id=task["id"],
                run=run_num,
                response="",
                expected=task.get("answer", ""),
                score=0.0,
                confidence=0,
                tokens=response_data.get("tokens", 0),
                latency_ms=latency,
                error=response_data["error"],
            )

        parsed = self._parse_response(response_data["response"])
        expected = task.get("answer", "")
        element_text = task.get("element_text", "")  # For HTML/AXTree fallback

        score = score_response(
            parsed["answer"],
            expected,
            task.get("type", "action"),
            element_text
        )

        return TestResult(
            model=model,
            format=format_name,
            page_id=page_id,
            task_id=task["id"],
            run=run_num,
            response=parsed["answer"],
            expected=expected,
            score=score,
            confidence=parsed["confidence"],
            tokens=response_data.get("tokens", 0),
            latency_ms=latency,
        )

    def run(self) -> list[dict]:
        """Run full benchmark."""
        pages = self._discover_pages()
        results = []

        if not pages:
            print("No pages found for benchmark")
            return results

        for page_id in pages:
            ground_truth = self._load_ground_truth(page_id)
            tasks = self._get_tasks_from_ground_truth(ground_truth)

            if not tasks:
                print(f"  ⚠️ No tasks for {page_id}")
                continue

            for model in self.models:
                for format_name in self.formats:
                    for task in tasks:
                        for run_num in range(1, self.runs + 1):
                            result = self.run_single(
                                model=model,
                                format_name=format_name,
                                page_id=page_id,
                                task=task,
                                run_num=run_num,
                            )
                            results.append(result.__dict__)

                            # Rate limiting
                            time.sleep(0.3)

        return results

    def aggregate(self, results: list[dict]) -> list[dict]:
        """Aggregate results by format (across models)."""
        agg = {}

        for r in results:
            if r.get("error"):
                continue

            key = r["format"]
            if key not in agg:
                agg[key] = {
                    "format": r["format"],
                    "scores": [],
                    "tokens": [],
                    "latencies": [],
                }

            agg[key]["scores"].append(r["score"])
            agg[key]["tokens"].append(r["tokens"])
            agg[key]["latencies"].append(r["latency_ms"])

        summary = []
        for data in agg.values():
            scores = data["scores"]
            tokens = data["tokens"]
            latencies = data["latencies"]

            summary.append({
                "format": data["format"],
                "accuracy": f"{(sum(scores) / len(scores) * 100):.1f}%" if scores else "N/A",
                "avg_tokens": int(sum(tokens) / len(tokens)) if tokens else 0,
                "avg_latency": f"{int(sum(latencies) / len(latencies))}ms" if latencies else "N/A",
            })

        return sorted(summary, key=lambda x: float(x["accuracy"].rstrip("%")) if x["accuracy"] != "N/A" else 0.0, reverse=True)
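
A minimal end-to-end sketch (not part of the package) of running the benchmark; the format names passed here are assumptions - "sifr" matches the capture layout above, and any other name must correspond to a file that formats.load_format can resolve:

from sifr_benchmark.runner import BenchmarkRunner

runner = BenchmarkRunner(
    models=["gpt-4o-mini"],   # must be a key of SUPPORTED_MODELS
    formats=["sifr"],         # assumed format name; see formats.py
    runs=1,
    base_dir="benchmark",     # directory containing ground-truth/
)
results = runner.run()
for row in runner.aggregate(results):
    print(row["format"], row["accuracy"], row["avg_tokens"], row["avg_latency"])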