connectonion 0.6.1__py3-none-any.whl → 0.6.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- connectonion/__init__.py +1 -1
- connectonion/cli/browser_agent/browser.py +82 -139
- connectonion/cli/browser_agent/element_finder.py +139 -0
- connectonion/cli/browser_agent/highlight_screenshot.py +174 -0
- connectonion/cli/browser_agent/prompt.md +188 -105
- connectonion/cli/browser_agent/prompts/element_matcher.md +59 -0
- connectonion/cli/browser_agent/prompts/form_filler.md +19 -0
- connectonion/cli/browser_agent/prompts/scroll_strategy.md +36 -0
- connectonion/cli/browser_agent/scripts/extract_elements.js +126 -0
- connectonion/cli/browser_agent/scroll.py +137 -0
- {connectonion-0.6.1.dist-info → connectonion-0.6.2.dist-info}/METADATA +1 -1
- {connectonion-0.6.1.dist-info → connectonion-0.6.2.dist-info}/RECORD +14 -8
- connectonion/cli/browser_agent/scroll_strategies.py +0 -276
- {connectonion-0.6.1.dist-info → connectonion-0.6.2.dist-info}/WHEEL +0 -0
- {connectonion-0.6.1.dist-info → connectonion-0.6.2.dist-info}/entry_points.txt +0 -0
connectonion/__init__.py
CHANGED
|
@@ -20,6 +20,7 @@ from typing import Optional, List, Dict, Any
|
|
|
20
20
|
from connectonion import Agent, llm_do
|
|
21
21
|
from dotenv import load_dotenv
|
|
22
22
|
from pydantic import BaseModel, Field
|
|
23
|
+
from . import element_finder
|
|
23
24
|
|
|
24
25
|
# Default screenshots directory
|
|
25
26
|
SCREENSHOTS_DIR = Path.cwd() / ".tmp"
|
|
@@ -53,7 +54,7 @@ class BrowserAutomation:
|
|
|
53
54
|
Supports Chrome profile for persistent sessions.
|
|
54
55
|
"""
|
|
55
56
|
|
|
56
|
-
def __init__(self, use_chrome_profile: bool =
|
|
57
|
+
def __init__(self, use_chrome_profile: bool = True, headless: bool = True):
|
|
57
58
|
"""Initialize browser automation.
|
|
58
59
|
|
|
59
60
|
Args:
|
|
@@ -109,17 +110,28 @@ class BrowserAutomation:
|
|
|
109
110
|
source_profile = home / ".config/google-chrome"
|
|
110
111
|
|
|
111
112
|
if source_profile.exists():
|
|
113
|
+
def safe_copy(src, dst):
|
|
114
|
+
try:
|
|
115
|
+
shutil.copy2(src, dst)
|
|
116
|
+
except:
|
|
117
|
+
pass # Skip any file that can't be copied
|
|
118
|
+
|
|
112
119
|
shutil.copytree(
|
|
113
120
|
source_profile,
|
|
114
121
|
chromium_profile,
|
|
115
|
-
ignore=shutil.ignore_patterns(
|
|
122
|
+
ignore=shutil.ignore_patterns(
|
|
123
|
+
'*Cache*', '*cache*', 'Service Worker', 'ShaderCache',
|
|
124
|
+
'Singleton*', '*lock*', '*Lock*', '*.tmp', 'GPUCache',
|
|
125
|
+
'Code Cache', 'DawnCache', 'GrShaderCache', 'blob_storage'
|
|
126
|
+
),
|
|
127
|
+
copy_function=safe_copy,
|
|
116
128
|
dirs_exist_ok=True
|
|
117
129
|
)
|
|
118
130
|
|
|
119
131
|
self.browser = self.playwright.chromium.launch_persistent_context(
|
|
120
132
|
str(chromium_profile),
|
|
121
133
|
headless=headless,
|
|
122
|
-
args=['--disable-blink-features=AutomationControlled'],
|
|
134
|
+
args=['--no-sandbox', '--disable-setuid-sandbox', '--disable-blink-features=AutomationControlled'],
|
|
123
135
|
ignore_default_args=['--enable-automation'],
|
|
124
136
|
timeout=120000,
|
|
125
137
|
)
|
|
@@ -127,10 +139,12 @@ class BrowserAutomation:
|
|
|
127
139
|
self.page.add_init_script("""
|
|
128
140
|
Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
|
|
129
141
|
""")
|
|
142
|
+
self.page.set_viewport_size({"width": 1920, "height": 1080})
|
|
130
143
|
return f"Browser opened with Chrome profile: {chromium_profile}"
|
|
131
144
|
else:
|
|
132
145
|
self.browser = self.playwright.chromium.launch(headless=headless)
|
|
133
146
|
self.page = self.browser.new_page()
|
|
147
|
+
self.page.set_viewport_size({"width": 1920, "height": 1080})
|
|
134
148
|
return "Browser opened successfully"
|
|
135
149
|
|
|
136
150
|
def go_to(self, url: str) -> str:
|
|
@@ -149,88 +163,94 @@ class BrowserAutomation:
|
|
|
149
163
|
def find_element_by_description(self, description: str) -> str:
    """Resolve a natural-language description to a locator string.

    The lookup is delegated to element_finder.find_element, which has the
    LLM pick from an indexed element list instead of writing CSS itself.

    Args:
        description: e.g., "the submit button", "email input field"

    Returns:
        The matched element's pre-built locator string, or a
        human-readable error message when the browser is closed or
        nothing matched.
    """
    if self.page:
        match = element_finder.find_element(self.page, description)
        if not match:
            return f"Could not find element matching: {description}"
        return match.locator

    return "Browser not open"
|
|
186
181
|
|
|
187
182
|
def click(self, description: str) -> str:
    """Click the element that best matches a natural-language description.

    Resolution order:
      1. element_finder picks an element; its locator is re-queried so the
         click uses a freshly measured bounding box.
      2. If the locator matches but has no bounding box, a forced click
         is issued on the first match.
      3. If the locator no longer matches anything, the coordinates
         recorded at extraction time are clicked instead.
      4. If element_finder found nothing, plain text matching is tried.

    Args:
        description: What to click, in natural language.

    Returns:
        A status message describing what was clicked, or why nothing was.
    """
    page = self.page
    if not page:
        return "Browser not open"

    found = element_finder.find_element(page, description)

    if found:
        target = page.locator(found.locator)

        if target.count() > 0:
            rect = target.first.bounding_box()
            if not rect:
                # No box to aim at: let Playwright force the click.
                target.first.click(force=True)
                return f"Clicked [{found.index}] {found.tag} '{found.text}' (force)"

            # Aim at the centre of the freshly measured box.
            centre_x = rect['x'] + rect['width'] / 2
            centre_y = rect['y'] + rect['height'] / 2
            page.mouse.click(centre_x, centre_y)
            return f"Clicked [{found.index}] {found.tag} '{found.text}'"

        # Locator went stale: fall back to the coordinates captured earlier.
        x = found.x + found.width // 2
        y = found.y + found.height // 2
        page.mouse.click(x, y)
        return f"Clicked [{found.index}] '{found.text}' at ({x}, {y})"

    # element_finder had no answer: try simple text matching.
    by_text = page.get_by_text(description)
    if by_text.count() <= 0:
        return f"Could not find element matching: {description}"
    by_text.first.click()
    return f"Clicked on '{description}' (by text fallback)"
|
|
206
219
|
|
|
207
220
|
def type_text(self, field_description: str, text: str) -> str:
    """Type text into the form field matching a natural-language description.

    element_finder selects the field. The pre-built locator is filled when
    it still matches; otherwise the recorded coordinates are clicked and
    the text is typed with the keyboard. When element_finder finds
    nothing, placeholder matching is tried. Every successful path records
    the value in self.form_data under the field description.

    Args:
        field_description: e.g., "email field", "password input"
        text: The text to type

    Returns:
        A status message describing where the text went, or why it could
        not be typed.
    """
    page = self.page
    if not page:
        return "Browser not open"

    target = element_finder.find_element(page, field_description)

    if target:
        field = page.locator(target.locator)

        if field.count() > 0:
            field.first.fill(text)
            self.form_data[field_description] = text
            return f"Typed into [{target.index}] {target.tag}"

        # Locator matched nothing: click the recorded centre, then type.
        x = target.x + target.width // 2
        y = target.y + target.height // 2
        page.mouse.click(x, y)
        page.keyboard.type(text)
        self.form_data[field_description] = text
        return f"Typed into [{target.index}] at ({x}, {y})"

    # element_finder had no answer: try matching on placeholder text.
    by_placeholder = page.get_by_placeholder(field_description)
    if by_placeholder.count() <= 0:
        return f"Could not find field: {field_description}"
    by_placeholder.first.fill(text)
    self.form_data[field_description] = text
    return f"Typed into '{field_description}'"
|
|
234
254
|
|
|
235
255
|
def get_text(self) -> str:
|
|
236
256
|
"""Get all visible text from the page."""
|
|
@@ -306,27 +326,6 @@ class BrowserAutomation:
|
|
|
306
326
|
self.page.set_viewport_size({"width": width, "height": height})
|
|
307
327
|
return f"Viewport set to {width}x{height}"
|
|
308
328
|
|
|
309
|
-
def screenshot_mobile(self, url: str = None) -> str:
|
|
310
|
-
"""Take screenshot with iPhone viewport (390x844)."""
|
|
311
|
-
if url:
|
|
312
|
-
self.go_to(url)
|
|
313
|
-
self.set_viewport(390, 844)
|
|
314
|
-
return self.take_screenshot()
|
|
315
|
-
|
|
316
|
-
def screenshot_tablet(self, url: str = None) -> str:
|
|
317
|
-
"""Take screenshot with iPad viewport (768x1024)."""
|
|
318
|
-
if url:
|
|
319
|
-
self.go_to(url)
|
|
320
|
-
self.set_viewport(768, 1024)
|
|
321
|
-
return self.take_screenshot()
|
|
322
|
-
|
|
323
|
-
def screenshot_desktop(self, url: str = None) -> str:
|
|
324
|
-
"""Take screenshot with desktop viewport (1920x1080)."""
|
|
325
|
-
if url:
|
|
326
|
-
self.go_to(url)
|
|
327
|
-
self.set_viewport(1920, 1080)
|
|
328
|
-
return self.take_screenshot()
|
|
329
|
-
|
|
330
329
|
def find_forms(self) -> List[FormField]:
|
|
331
330
|
"""Find all form fields on the current page."""
|
|
332
331
|
if not self.page:
|
|
@@ -440,69 +439,13 @@ class BrowserAutomation:
|
|
|
440
439
|
return f"Waited for {seconds} seconds"
|
|
441
440
|
|
|
442
441
|
def scroll(self, times: int = 5, description: str = "the main content area") -> str:
    """Universal scroll with AI strategy and fallback.

    Tries: AI-generated → Element scroll → Page scroll
    Verifies success with screenshot comparison.
    (Both behaviours live in the sibling ``scroll`` module; this method
    only forwards to it.)

    Args:
        times: Forwarded unchanged to scroll.scroll.
        description: What to scroll (e.g., "the email list"); forwarded
            unchanged to scroll.scroll.

    Returns:
        "Browser not open" when no page is available, otherwise whatever
        scroll.scroll returns.
    """
    # Same guard as the other page-driven methods, so a closed browser
    # yields a message instead of passing None into the scroll module.
    if not self.page:
        return "Browser not open"

    # Imported lazily; the local name shadows this method only inside the call.
    from . import scroll
    return scroll.scroll(self.page, self.take_screenshot, times, description)
|
|
506
449
|
|
|
507
450
|
def wait_for_manual_login(self, site_name: str = "the website") -> str:
|
|
508
451
|
"""Pause automation for user to login manually.
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Element Finder - Find interactive elements by natural language description.
|
|
3
|
+
|
|
4
|
+
Inspired by browser-use (https://github.com/browser-use/browser-use).
|
|
5
|
+
|
|
6
|
+
Architecture:
|
|
7
|
+
1. JavaScript injects `data-browser-agent-id` into each interactive element
|
|
8
|
+
2. LLM SELECTS from indexed element list, never GENERATES CSS selectors
|
|
9
|
+
3. Pre-built locators are guaranteed to work
|
|
10
|
+
|
|
11
|
+
Usage:
|
|
12
|
+
elements = extract_elements(page)
|
|
13
|
+
element = find_element(page, "the login button", elements)
|
|
14
|
+
page.locator(element.locator).click()
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from typing import List, Optional
|
|
18
|
+
from pathlib import Path
|
|
19
|
+
from pydantic import BaseModel, Field
|
|
20
|
+
from connectonion import llm_do
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
# Load JavaScript and prompt from files
|
|
24
|
+
_BASE_DIR = Path(__file__).parent
|
|
25
|
+
_EXTRACT_JS = (_BASE_DIR / "scripts" / "extract_elements.js").read_text()
|
|
26
|
+
_ELEMENT_MATCHER_PROMPT = (_BASE_DIR / "prompts" / "element_matcher.md").read_text()
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class InteractiveElement(BaseModel):
    """An interactive element on the page with pre-built locator.

    extract_elements() builds one instance per record returned by the
    page-side extraction script.
    """
    # Label rendered as "[index]" when the element list is formatted for the LLM.
    index: int
    # Tag name; the formatter compares it against 'input'.
    tag: str
    # Visible text; preferred over placeholder and aria_label in the formatted line.
    text: str = ""
    role: Optional[str] = None
    aria_label: Optional[str] = None
    placeholder: Optional[str] = None
    # Only included in the formatted line when tag == 'input'.
    input_type: Optional[str] = None
    # Shortened by the formatter (query string dropped, last 30 characters kept).
    href: Optional[str] = None
    # Bounding box: top-left corner (x, y) plus size.
    # NOTE(review): presumably pixels relative to the viewport — confirm
    # against scripts/extract_elements.js.
    x: int = 0
    y: int = 0
    width: int = 0
    height: int = 0
    # Selector string that callers pass to page.locator().
    locator: str = ""
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class ElementMatch(BaseModel):
    """LLM's element selection result.

    Used as the structured output schema for the llm_do call in
    find_element(); only ``index`` is read back there.
    """
    # Index the LLM picked from the formatted element list.
    index: int = Field(..., description="Index of the matching element")
    # Self-reported confidence; not consulted by find_element().
    confidence: float = Field(..., description="Confidence 0-1")
    # Free-text justification; not consulted by find_element().
    reasoning: str = Field(..., description="Why this element matches")
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def extract_elements(page) -> List[InteractiveElement]:
    """Collect the page's interactive elements as InteractiveElement models.

    Runs the bundled extraction script in the page and validates every
    record it returns. Each resulting element carries:
    - a bounding box (for position matching with a screenshot)
    - a pre-built Playwright locator
    - text/aria/placeholder details for LLM matching
    """
    parsed = []
    for record in page.evaluate(_EXTRACT_JS):
        parsed.append(InteractiveElement(**record))
    return parsed
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def format_elements_for_llm(elements: List[InteractiveElement], max_count: int = 150) -> str:
    """Render elements as a compact, one-line-per-element list for the LLM.

    Line format: [index] tag "text" pos=(x,y) {extra info}

    Only the first ``max_count`` elements are rendered. The label is the
    element text, falling back to its placeholder, then its aria label.
    """

    def describe(el) -> str:
        tokens = [f"[{el.index}]", el.tag]

        # Exactly one label is shown, in order of preference.
        if el.text:
            tokens.append(f'"{el.text}"')
        elif el.placeholder:
            tokens.append(f'placeholder="{el.placeholder}"')
        elif el.aria_label:
            tokens.append(f'aria="{el.aria_label}"')

        tokens.append(f"pos=({el.x},{el.y})")

        if el.tag == 'input' and el.input_type:
            tokens.append(f"type={el.input_type}")

        if el.role:
            tokens.append(f"role={el.role}")

        if el.href:
            # Drop the query string and keep only the tail of the URL.
            without_query = el.href.split('?')[0]
            href_short = without_query[-30:]
            tokens.append(f"href=...{href_short}")

        return ' '.join(tokens)

    return '\n'.join(describe(el) for el in elements[:max_count])
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def find_element(
    page,
    description: str,
    elements: Optional[List[InteractiveElement]] = None
) -> Optional[InteractiveElement]:
    """Find an interactive element by natural language description.

    This is the core function. LLM SELECTS from pre-built options.

    Args:
        page: Playwright page
        description: Natural language like "the login button" or "email field"
        elements: Pre-extracted elements (will extract if not provided)

    Returns:
        Matching InteractiveElement with pre-built locator, or None when
        there are no elements or the LLM's answer names no known element.
    """
    if elements is None:
        elements = extract_elements(page)

    if not elements:
        return None

    element_list = format_elements_for_llm(elements)

    # Build prompt from template
    prompt = _ELEMENT_MATCHER_PROMPT.format(
        description=description,
        element_list=element_list
    )

    result = llm_do(
        prompt,
        output=ElementMatch,
        model="co/gemini-2.5-flash",
        temperature=0.1
    )

    # The LLM was shown each element's ``index`` attribute as its label, so
    # the answer is resolved against that label rather than list position.
    # The two coincide for a freshly extracted list but can differ when a
    # caller passes a filtered or reordered ``elements`` list.
    for candidate in elements:
        if candidate.index == result.index:
            return candidate

    return None
|
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Screenshot highlighting - draw bounding boxes and indices on screenshots.
|
|
3
|
+
Inspired by browser-use's python_highlights.py approach.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from PIL import Image, ImageDraw, ImageFont
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import List
|
|
9
|
+
import element_finder
|
|
10
|
+
|
|
11
|
+
# Color scheme for different element types
|
|
12
|
+
ELEMENT_COLORS = {
|
|
13
|
+
'button': '#FF6B6B', # Red
|
|
14
|
+
'input': '#4ECDC4', # Teal
|
|
15
|
+
'select': '#45B7D1', # Blue
|
|
16
|
+
'a': '#96CEB4', # Green
|
|
17
|
+
'textarea': '#FF8C42', # Orange
|
|
18
|
+
'div': '#DDA0DD', # Light purple
|
|
19
|
+
'span': '#FFD93D', # Yellow
|
|
20
|
+
'default': '#9B59B6', # Purple
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def get_font(size: int = 14):
    """Return a TrueType font at ``size``, trying known per-OS locations.

    Candidate paths are tried in order. If none can be loaded, Pillow's
    built-in default font is returned instead.
    """
    candidates = (
        '/System/Library/Fonts/Arial.ttf',  # macOS
        '/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf',  # Linux
        'C:\\Windows\\Fonts\\arial.ttf',  # Windows
    )
    for candidate in candidates:
        try:
            loaded = ImageFont.truetype(candidate, size)
        except OSError:
            # Not present on this platform; move on to the next candidate.
            continue
        else:
            return loaded
    return ImageFont.load_default()
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def draw_dashed_rect(draw: ImageDraw.Draw, bbox: tuple, color: str, dash: int = 4, gap: int = 4):
    """Outline ``bbox`` on ``draw`` with a dashed, 2-pixel-wide rectangle.

    Args:
        draw: Drawing context; only its ``line`` method is used.
        bbox: (x1, y1, x2, y2) corners of the rectangle.
        color: Line colour passed through as ``fill``.
        dash: Length of each drawn segment.
        gap: Length of the space between segments.
    """
    left, top, right, bottom = bbox
    stride = dash + gap

    def dashes(lo, hi):
        # Yield (start, stop) for every dash between lo and hi; the last
        # dash is clipped so it never runs past hi.
        cursor = lo
        while cursor < hi:
            yield cursor, min(cursor + dash, hi)
            cursor += stride

    def horizontal_edge(y):
        for start, stop in dashes(left, right):
            draw.line([(start, y), (stop, y)], fill=color, width=2)

    def vertical_edge(x):
        for start, stop in dashes(top, bottom):
            draw.line([(x, start), (x, stop)], fill=color, width=2)

    # Same edge order as before: top, right, bottom, left.
    horizontal_edge(top)
    vertical_edge(right)
    horizontal_edge(bottom)
    vertical_edge(left)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def highlight_screenshot(
    screenshot_path: str,
    elements: List[element_finder.InteractiveElement],
    output_path: str = None
) -> str:
    """Draw bounding boxes and indices on a screenshot.

    Args:
        screenshot_path: Path to the screenshot image
        elements: List of InteractiveElement objects with bounding boxes
        output_path: Optional output path (defaults to {original}_highlighted.png,
            keeping the original file's suffix)

    Returns:
        Path to the highlighted screenshot
    """
    # Load the pixels and release the file handle right away.
    with Image.open(screenshot_path) as source:
        image = source.convert('RGBA')
    draw = ImageDraw.Draw(image)
    font = get_font(14)
    padding = 3

    for el in elements:
        # Skip elements too small to outline legibly.
        if el.width < 5 or el.height < 5:
            continue

        # Colour is chosen by tag, with a shared fallback.
        color = ELEMENT_COLORS.get(el.tag, ELEMENT_COLORS['default'])

        # Bounding box corners
        x1, y1 = el.x, el.y
        x2, y2 = el.x + el.width, el.y + el.height

        draw_dashed_rect(draw, (x1, y1, x2, y2), color)

        # Measure the index label so its background can be sized to fit.
        label = str(el.index)
        bbox = draw.textbbox((0, 0), label, font=font)
        label_w = bbox[2] - bbox[0]
        label_h = bbox[3] - bbox[1]

        # Position: horizontally centred; above short elements so the label
        # does not cover them, otherwise just inside the top edge.
        label_x = x1 + (el.width - label_w) // 2 - padding
        if el.height < 40:
            label_y = max(0, y1 - label_h - padding * 2 - 2)
        else:
            label_y = y1 + 2

        # Label background
        draw.rectangle(
            [label_x, label_y,
             label_x + label_w + padding * 2,
             label_y + label_h + padding * 2],
            fill=color,
            outline='white',
            width=1
        )

        # Label text
        draw.text(
            (label_x + padding, label_y + padding),
            label,
            fill='white',
            font=font
        )

    # Save output
    if not output_path:
        p = Path(screenshot_path)
        output_path = str(p.parent / f"{p.stem}_highlighted{p.suffix}")

    # JPEG has no alpha channel and Pillow refuses to write RGBA as JPEG,
    # so drop alpha before saving to a .jpg/.jpeg target.
    if Path(output_path).suffix.lower() in ('.jpg', '.jpeg'):
        image = image.convert('RGB')

    image.save(output_path)
    return output_path
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def highlight_current_page(page, output_path: str = "screenshots/highlighted.png") -> str:
    """Take a screenshot and highlight all interactive elements.

    Args:
        page: Playwright page object
        output_path: Path to save the highlighted screenshot. When left at
            its default (or empty), a timestamped file
            ``screenshots/highlighted_<timestamp>.png`` is written, as
            before; an explicit path is now honoured instead of being
            silently replaced.

    Returns:
        Path to the highlighted screenshot
    """
    import os
    from datetime import datetime

    default_output = "screenshots/highlighted.png"

    # The raw capture always goes to ./screenshots.
    os.makedirs("screenshots", exist_ok=True)

    # Take screenshot
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    screenshot_path = f"screenshots/raw_{timestamp}.png"
    page.screenshot(path=screenshot_path)

    try:
        # Extract elements
        elements = element_finder.extract_elements(page)

        if not output_path or output_path == default_output:
            # Default call: keep the timestamped name used previously.
            output_path = f"screenshots/highlighted_{timestamp}.png"
        else:
            # Caller chose a destination; make sure its directory exists.
            parent = os.path.dirname(output_path)
            if parent:
                os.makedirs(parent, exist_ok=True)

        # Create highlighted version
        return highlight_screenshot(screenshot_path, elements, output_path)
    finally:
        # Remove the raw capture even if extraction or drawing failed.
        if os.path.exists(screenshot_path):
            os.remove(screenshot_path)
|