connectonion 0.6.1__py3-none-any.whl → 0.6.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
connectonion/__init__.py CHANGED
@@ -1,6 +1,6 @@
1
1
  """ConnectOnion - A simple agent framework with behavior tracking."""
2
2
 
3
- __version__ = "0.6.1"
3
+ __version__ = "0.6.2"
4
4
 
5
5
  # Auto-load .env files for the entire framework
6
6
  from dotenv import load_dotenv
@@ -20,6 +20,7 @@ from typing import Optional, List, Dict, Any
20
20
  from connectonion import Agent, llm_do
21
21
  from dotenv import load_dotenv
22
22
  from pydantic import BaseModel, Field
23
+ from . import element_finder
23
24
 
24
25
  # Default screenshots directory
25
26
  SCREENSHOTS_DIR = Path.cwd() / ".tmp"
@@ -53,7 +54,7 @@ class BrowserAutomation:
53
54
  Supports Chrome profile for persistent sessions.
54
55
  """
55
56
 
56
- def __init__(self, use_chrome_profile: bool = False, headless: bool = True):
57
+ def __init__(self, use_chrome_profile: bool = True, headless: bool = True):
57
58
  """Initialize browser automation.
58
59
 
59
60
  Args:
@@ -109,17 +110,28 @@ class BrowserAutomation:
109
110
  source_profile = home / ".config/google-chrome"
110
111
 
111
112
  if source_profile.exists():
113
+ def safe_copy(src, dst):
114
+ try:
115
+ shutil.copy2(src, dst)
116
+ except:
117
+ pass # Skip any file that can't be copied
118
+
112
119
  shutil.copytree(
113
120
  source_profile,
114
121
  chromium_profile,
115
- ignore=shutil.ignore_patterns('*Cache*', '*cache*', 'Service Worker', 'ShaderCache'),
122
+ ignore=shutil.ignore_patterns(
123
+ '*Cache*', '*cache*', 'Service Worker', 'ShaderCache',
124
+ 'Singleton*', '*lock*', '*Lock*', '*.tmp', 'GPUCache',
125
+ 'Code Cache', 'DawnCache', 'GrShaderCache', 'blob_storage'
126
+ ),
127
+ copy_function=safe_copy,
116
128
  dirs_exist_ok=True
117
129
  )
118
130
 
119
131
  self.browser = self.playwright.chromium.launch_persistent_context(
120
132
  str(chromium_profile),
121
133
  headless=headless,
122
- args=['--disable-blink-features=AutomationControlled'],
134
+ args=['--no-sandbox', '--disable-setuid-sandbox', '--disable-blink-features=AutomationControlled'],
123
135
  ignore_default_args=['--enable-automation'],
124
136
  timeout=120000,
125
137
  )
@@ -127,10 +139,12 @@ class BrowserAutomation:
127
139
  self.page.add_init_script("""
128
140
  Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
129
141
  """)
142
+ self.page.set_viewport_size({"width": 1920, "height": 1080})
130
143
  return f"Browser opened with Chrome profile: {chromium_profile}"
131
144
  else:
132
145
  self.browser = self.playwright.chromium.launch(headless=headless)
133
146
  self.page = self.browser.new_page()
147
+ self.page.set_viewport_size({"width": 1920, "height": 1080})
134
148
  return "Browser opened successfully"
135
149
 
136
150
  def go_to(self, url: str) -> str:
@@ -149,88 +163,94 @@ class BrowserAutomation:
149
163
  def find_element_by_description(self, description: str) -> str:
150
164
  """Find element using natural language description.
151
165
 
152
- Uses AI to analyze HTML and find the best matching element.
166
+ Uses element_finder: LLM selects from indexed list, never generates CSS.
153
167
 
154
168
  Args:
155
169
  description: e.g., "the submit button", "email input field"
156
170
 
157
171
  Returns:
158
- CSS selector for the element, or error message
172
+ Pre-built locator string, or error message
159
173
  """
160
174
  if not self.page:
161
175
  return "Browser not open"
162
176
 
163
- html = self.page.content()
164
-
165
- class ElementSelector(BaseModel):
166
- selector: str = Field(..., description="CSS selector for the element")
167
- confidence: float = Field(..., description="Confidence score 0-1")
168
- explanation: str = Field(..., description="Why this element matches")
169
-
170
- result = llm_do(
171
- f"""Analyze this HTML and find the CSS selector for: "{description}"
172
-
173
- HTML (first 15000 chars): {html[:15000]}
174
-
175
- Return the most specific CSS selector that uniquely identifies this element.
176
- """,
177
- output=ElementSelector,
178
- model="gpt-4o",
179
- temperature=0.1
180
- )
181
-
182
- if self.page.locator(result.selector).count() > 0:
183
- return result.selector
184
- else:
185
- return f"Found selector {result.selector} but element not on page"
177
+ element = element_finder.find_element(self.page, description)
178
+ if element:
179
+ return element.locator
180
+ return f"Could not find element matching: {description}"
186
181
 
187
182
  def click(self, description: str) -> str:
188
183
  """Click on an element using natural language description.
189
184
 
190
- Args:
191
- description: e.g., "the blue submit button", "link to contact page"
185
+ Uses element_finder: LLM selects from pre-built locators, never generates CSS.
192
186
  """
193
187
  if not self.page:
194
188
  return "Browser not open"
195
189
 
196
- selector = self.find_element_by_description(description)
190
+ element = element_finder.find_element(self.page, description)
197
191
 
198
- if selector.startswith("Could not") or selector.startswith("Found selector"):
199
- if self.page.locator(f"text='{description}'").count() > 0:
200
- self.page.click(f"text='{description}'")
201
- return f"Clicked on '{description}' (by text)"
202
- return selector
192
+ if not element:
193
+ # Fallback to simple text matching
194
+ text_locator = self.page.get_by_text(description)
195
+ if text_locator.count() > 0:
196
+ text_locator.first.click()
197
+ return f"Clicked on '{description}' (by text fallback)"
198
+ return f"Could not find element matching: {description}"
199
+
200
+ # Try the locator with fresh bounding box
201
+ locator = self.page.locator(element.locator)
203
202
 
204
- self.page.click(selector)
205
- return f"Clicked on '{description}'"
203
+ if locator.count() > 0:
204
+ box = locator.first.bounding_box()
205
+ if box:
206
+ x = box['x'] + box['width'] / 2
207
+ y = box['y'] + box['height'] / 2
208
+ self.page.mouse.click(x, y)
209
+ return f"Clicked [{element.index}] {element.tag} '{element.text}'"
210
+
211
+ locator.first.click(force=True)
212
+ return f"Clicked [{element.index}] {element.tag} '{element.text}' (force)"
213
+
214
+ # Fallback: use original coordinates
215
+ x = element.x + element.width // 2
216
+ y = element.y + element.height // 2
217
+ self.page.mouse.click(x, y)
218
+ return f"Clicked [{element.index}] '{element.text}' at ({x}, {y})"
206
219
 
207
220
  def type_text(self, field_description: str, text: str) -> str:
208
221
  """Type text into a form field.
209
222
 
210
- Args:
211
- field_description: e.g., "email field", "password input"
212
- text: The text to type
223
+ Uses element_finder: LLM selects from pre-built locators, never generates CSS.
213
224
  """
214
225
  if not self.page:
215
226
  return "Browser not open"
216
227
 
217
- selector = self.find_element_by_description(field_description)
218
-
219
- if selector.startswith("Could not") or selector.startswith("Found selector"):
220
- for fallback in [
221
- f"input[placeholder*='{field_description}' i]",
222
- f"[aria-label*='{field_description}' i]",
223
- f"input[name*='{field_description}' i]"
224
- ]:
225
- if self.page.locator(fallback).count() > 0:
226
- self.page.fill(fallback, text)
227
- self.form_data[field_description] = text
228
- return f"Typed into {field_description}"
229
- return f"Could not find field '{field_description}'"
230
-
231
- self.page.fill(selector, text)
228
+ element = element_finder.find_element(self.page, field_description)
229
+
230
+ if not element:
231
+ # Fallback to placeholder matching
232
+ placeholder_locator = self.page.get_by_placeholder(field_description)
233
+ if placeholder_locator.count() > 0:
234
+ placeholder_locator.first.fill(text)
235
+ self.form_data[field_description] = text
236
+ return f"Typed into '{field_description}'"
237
+ return f"Could not find field: {field_description}"
238
+
239
+ # Try the pre-built locator
240
+ locator = self.page.locator(element.locator)
241
+
242
+ if locator.count() > 0:
243
+ locator.first.fill(text)
244
+ self.form_data[field_description] = text
245
+ return f"Typed into [{element.index}] {element.tag}"
246
+
247
+ # Fallback: click then type
248
+ x = element.x + element.width // 2
249
+ y = element.y + element.height // 2
250
+ self.page.mouse.click(x, y)
251
+ self.page.keyboard.type(text)
232
252
  self.form_data[field_description] = text
233
- return f"Typed into {field_description}"
253
+ return f"Typed into [{element.index}] at ({x}, {y})"
234
254
 
235
255
  def get_text(self) -> str:
236
256
  """Get all visible text from the page."""
@@ -306,27 +326,6 @@ class BrowserAutomation:
306
326
  self.page.set_viewport_size({"width": width, "height": height})
307
327
  return f"Viewport set to {width}x{height}"
308
328
 
309
- def screenshot_mobile(self, url: str = None) -> str:
310
- """Take screenshot with iPhone viewport (390x844)."""
311
- if url:
312
- self.go_to(url)
313
- self.set_viewport(390, 844)
314
- return self.take_screenshot()
315
-
316
- def screenshot_tablet(self, url: str = None) -> str:
317
- """Take screenshot with iPad viewport (768x1024)."""
318
- if url:
319
- self.go_to(url)
320
- self.set_viewport(768, 1024)
321
- return self.take_screenshot()
322
-
323
- def screenshot_desktop(self, url: str = None) -> str:
324
- """Take screenshot with desktop viewport (1920x1080)."""
325
- if url:
326
- self.go_to(url)
327
- self.set_viewport(1920, 1080)
328
- return self.take_screenshot()
329
-
330
329
  def find_forms(self) -> List[FormField]:
331
330
  """Find all form fields on the current page."""
332
331
  if not self.page:
@@ -440,69 +439,13 @@ class BrowserAutomation:
440
439
  return f"Waited for {seconds} seconds"
441
440
 
442
441
  def scroll(self, times: int = 5, description: str = "the main content area") -> str:
443
- """Universal scroll with automatic strategy selection.
444
-
445
- Tries multiple strategies until one works:
446
- 1. AI-generated strategy (analyzes page structure)
447
- 2. Element scrolling
448
- 3. Page scrolling
442
+ """Universal scroll with AI strategy and fallback.
449
443
 
450
- Args:
451
- times: Number of scroll iterations
452
- description: What to scroll (e.g., "the email list")
453
-
454
- Returns:
455
- Status message with successful strategy
444
+ Tries: AI-generated → Element scroll → Page scroll
445
+ Verifies success with screenshot comparison.
456
446
  """
457
- from . import scroll_strategies
458
- return scroll_strategies.scroll_with_verification(
459
- page=self.page,
460
- take_screenshot=self.take_screenshot,
461
- times=times,
462
- description=description
463
- )
464
-
465
- def scroll_page(self, direction: str = "down", amount: int = 1000) -> str:
466
- """Scroll the page in a direction.
467
-
468
- Args:
469
- direction: "down", "up", "top", or "bottom"
470
- amount: Pixels to scroll (ignored for "bottom"/"top")
471
- """
472
- if not self.page:
473
- return "Browser not open"
474
-
475
- if direction == "bottom":
476
- self.page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
477
- return "Scrolled to bottom of page"
478
- elif direction == "top":
479
- self.page.evaluate("window.scrollTo(0, 0)")
480
- return "Scrolled to top of page"
481
- elif direction == "down":
482
- self.page.evaluate(f"window.scrollBy(0, {amount})")
483
- return f"Scrolled down {amount} pixels"
484
- elif direction == "up":
485
- self.page.evaluate(f"window.scrollBy(0, -{amount})")
486
- return f"Scrolled up {amount} pixels"
487
- else:
488
- return f"Unknown direction: {direction}"
489
-
490
- def scroll_element(self, selector: str, amount: int = 1000) -> str:
491
- """Scroll a specific element by CSS selector."""
492
- if not self.page:
493
- return "Browser not open"
494
-
495
- result = self.page.evaluate(f"""
496
- (() => {{
497
- const element = document.querySelector('{selector}');
498
- if (!element) return 'Element not found: {selector}';
499
- const beforeScroll = element.scrollTop;
500
- element.scrollTop += {amount};
501
- const afterScroll = element.scrollTop;
502
- return `Scrolled from ${{beforeScroll}}px to ${{afterScroll}}px`;
503
- }})()
504
- """)
505
- return result
447
+ from . import scroll
448
+ return scroll.scroll(self.page, self.take_screenshot, times, description)
506
449
 
507
450
  def wait_for_manual_login(self, site_name: str = "the website") -> str:
508
451
  """Pause automation for user to login manually.
@@ -0,0 +1,139 @@
1
+ """
2
+ Element Finder - Find interactive elements by natural language description.
3
+
4
+ Inspired by browser-use (https://github.com/browser-use/browser-use).
5
+
6
+ Architecture:
7
+ 1. JavaScript injects `data-browser-agent-id` into each interactive element
8
+ 2. LLM SELECTS from indexed element list, never GENERATES CSS selectors
9
+ 3. Pre-built locators are guaranteed to work
10
+
11
+ Usage:
12
+ elements = extract_elements(page)
13
+ element = find_element(page, "the login button", elements)
14
+ page.locator(element.locator).click()
15
+ """
16
+
17
+ from typing import List, Optional
18
+ from pathlib import Path
19
+ from pydantic import BaseModel, Field
20
+ from connectonion import llm_do
21
+
22
+
# Load the element-extraction JavaScript and the LLM prompt template once, at
# import time. Both paths are resolved relative to this module so the lookup
# does not depend on the current working directory.
_BASE_DIR = Path(__file__).parent
# Decode explicitly as UTF-8: without an encoding argument read_text() uses the
# locale's preferred encoding, which can mis-decode or fail on non-ASCII content.
_EXTRACT_JS = (_BASE_DIR / "scripts" / "extract_elements.js").read_text(encoding="utf-8")
_ELEMENT_MATCHER_PROMPT = (_BASE_DIR / "prompts" / "element_matcher.md").read_text(encoding="utf-8")
27
+
28
+
class InteractiveElement(BaseModel):
    """An interactive element on the page with pre-built locator.

    Instances are built from the dictionaries returned by the extraction
    script, one per element.
    """

    # Label under which the element is listed to the LLM ("[index]").
    index: int
    # Tag name; compared against 'input' when formatting and used to pick a
    # highlight colour.
    tag: str
    # Visible text, preferred over placeholder / aria label when listing.
    text: str = ""
    role: Optional[str] = None
    aria_label: Optional[str] = None
    placeholder: Optional[str] = None
    # Only reported to the LLM for <input> elements.
    input_type: Optional[str] = None
    href: Optional[str] = None
    # Bounding box: top-left corner (x, y) plus size. Used for coordinate
    # clicks and for drawing highlight boxes.
    # NOTE(review): presumably viewport pixels - confirm against the script.
    x: int = 0
    y: int = 0
    width: int = 0
    height: int = 0
    # Locator string handed to page.locator().
    locator: str = ""
44
+
45
+
class ElementMatch(BaseModel):
    """LLM's element selection result.

    Used as the structured-output schema for the element-matching LLM call;
    the Field descriptions are part of that schema.
    """

    index: int = Field(..., description="Index of the matching element")
    confidence: float = Field(..., description="Confidence 0-1")
    reasoning: str = Field(..., description="Why this element matches")
51
+
52
+
def extract_elements(page) -> List[InteractiveElement]:
    """Extract all interactive elements from the page.

    Runs the bundled extraction script in the page and wraps each returned
    dictionary in an InteractiveElement.

    Args:
        page: Playwright page to inspect.

    Returns:
        Elements with:
        - Bounding boxes (for position matching with screenshot)
        - Pre-built Playwright locators
        - Text/aria/placeholder for LLM matching
        An empty list when the script yields nothing.
    """
    raw = page.evaluate(_EXTRACT_JS)
    # evaluate() gives None when the script produces no value; treat that the
    # same as "no elements" instead of failing on iteration.
    return [InteractiveElement(**el) for el in raw or []]
63
+
64
+
def format_elements_for_llm(elements: "List[InteractiveElement]", max_count: int = 150) -> str:
    """Format elements as compact list for LLM context.

    Format, one element per line:
        [index] tag "text" pos=(x,y) {extra info}

    The label is the element's text, falling back to its placeholder and then
    its aria label. Extra info is the input type (inputs only), the role, and
    the tail of the href with any query string removed.

    Args:
        elements: Elements to list.
        max_count: Maximum number of elements to include; values below zero
            are treated as zero.

    Returns:
        Newline-joined listing, or an empty string when nothing is listed.
    """
    href_tail = 30  # number of trailing href characters kept

    lines = []
    # Clamp so a negative max_count cannot turn into "all but the last N".
    for el in elements[:max(max_count, 0)]:
        parts = [f"[{el.index}]", el.tag]

        # Collapse whitespace so multi-line text cannot break the
        # one-element-per-line layout.
        text = " ".join(el.text.split()) if el.text else ""
        if text:
            parts.append(f'"{text}"')
        elif el.placeholder:
            parts.append(f'placeholder="{el.placeholder}"')
        elif el.aria_label:
            parts.append(f'aria="{el.aria_label}"')

        parts.append(f"pos=({el.x},{el.y})")

        if el.input_type and el.tag == 'input':
            parts.append(f"type={el.input_type}")

        if el.role:
            parts.append(f"role={el.role}")

        if el.href:
            path = el.href.split('?')[0]
            # Only mark the href as shortened when something was cut off.
            if len(path) > href_tail:
                parts.append(f"href=...{path[-href_tail:]}")
            else:
                parts.append(f"href={path}")

        lines.append(' '.join(parts))

    return '\n'.join(lines)
96
+
97
+
def find_element(
    page,
    description: str,
    elements: List[InteractiveElement] = None
) -> Optional[InteractiveElement]:
    """Find an interactive element by natural language description.

    This is the core function. The LLM SELECTS from the listed elements; it
    never writes a selector itself.

    Args:
        page: Playwright page
        description: Natural language like "the login button" or "email field"
        elements: Pre-extracted elements (will extract if not provided)

    Returns:
        Matching InteractiveElement with pre-built locator, or None when there
        are no elements or the LLM's answer matches none of them.
    """
    if elements is None:
        elements = extract_elements(page)

    if not elements:
        return None

    element_list = format_elements_for_llm(elements)

    # Build prompt from template
    prompt = _ELEMENT_MATCHER_PROMPT.format(
        description=description,
        element_list=element_list
    )

    result = llm_do(
        prompt,
        output=ElementMatch,
        model="co/gemini-2.5-flash",
        temperature=0.1
    )

    # The LLM was shown each element as "[el.index]", so its answer is that
    # label, not a position in `elements`. Resolve it by label so a
    # caller-supplied or filtered list still maps to the right element.
    for el in elements:
        if el.index == result.index:
            return el

    return None
@@ -0,0 +1,174 @@
1
+ """
2
+ Screenshot highlighting - draw bounding boxes and indices on screenshots.
3
+ Inspired by browser-use's python_highlights.py approach.
4
+ """
5
+
6
+ from PIL import Image, ImageDraw, ImageFont
7
+ from pathlib import Path
8
+ from typing import List
9
+ import element_finder
10
+
# Color scheme for different element types, keyed by tag name. The 'default'
# entry is used for any tag without its own colour.
ELEMENT_COLORS = {
    'button': '#FF6B6B',    # Red
    'input': '#4ECDC4',     # Teal
    'select': '#45B7D1',    # Blue
    'a': '#96CEB4',         # Green
    'textarea': '#FF8C42',  # Orange
    'div': '#DDA0DD',       # Light purple
    'span': '#FFD93D',      # Yellow
    'default': '#9B59B6',   # Purple
}
22
+
23
+
def get_font(size: int = 14):
    """Get a cross-platform font.

    Tries a short list of well-known TrueType font locations and returns the
    first one that loads.

    Args:
        size: Requested font size, passed to ImageFont.truetype.

    Returns:
        The loaded TrueType font, or Pillow's built-in default font when none
        of the candidates can be opened (the default is requested without a
        size).
    """
    font_paths = [
        '/System/Library/Fonts/Arial.ttf',  # macOS
        '/System/Library/Fonts/Supplemental/Arial.ttf',  # macOS (newer layout)
        '/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf',  # Linux
        'C:\\Windows\\Fonts\\arial.ttf',  # Windows
    ]
    for path in font_paths:
        try:
            return ImageFont.truetype(path, size)
        except OSError:
            # Font file missing or unreadable on this platform; try the next.
            continue
    return ImageFont.load_default()
37
+
38
+
def draw_dashed_rect(draw: "ImageDraw.ImageDraw", bbox: tuple, color: str, dash: int = 4, gap: int = 4):
    """Draw a dashed rectangle.

    Each side is drawn as 2-pixel-wide segments of length `dash` separated by
    `gap`, starting from the top / left end; the last segment of a side is
    clipped at the far corner.

    Args:
        draw: Drawing object exposing line(points, fill=..., width=...).
        bbox: (x1, y1, x2, y2) corners of the rectangle.
        color: Line colour.
        dash: Length of each drawn segment; must be positive.
        gap: Space between segments; must not be negative.

    Raises:
        ValueError: If dash is not positive or gap is negative. A
            non-positive step would otherwise never reach the far edge.
    """
    if dash <= 0 or gap < 0:
        raise ValueError(f"dash must be > 0 and gap >= 0 (got dash={dash}, gap={gap})")

    x1, y1, x2, y2 = bbox
    step = dash + gap

    def segments(start, stop):
        # Yield (segment_start, segment_end) pairs along one axis.
        pos = start
        while pos < stop:
            yield pos, min(pos + dash, stop)
            pos += step

    # Horizontal sides: top, then bottom.
    for y in (y1, y2):
        for left, right in segments(x1, x2):
            draw.line([(left, y), (right, y)], fill=color, width=2)

    # Vertical sides: right, then left.
    for x in (x2, x1):
        for top, bottom in segments(y1, y2):
            draw.line([(x, top), (x, bottom)], fill=color, width=2)
62
+
63
+
def highlight_screenshot(
    screenshot_path: str,
    elements: List[element_finder.InteractiveElement],
    output_path: str = None
) -> str:
    """Draw bounding boxes and indices on a screenshot.

    Every element at least 5 pixels wide and tall gets a dashed box coloured
    by tag, plus a filled label showing its index. The label sits inside the
    top of the box, or above the box when the element is under 40 pixels tall.

    Args:
        screenshot_path: Path to the screenshot image
        elements: List of InteractiveElement objects with bounding boxes
        output_path: Optional output path (defaults to
            {original}_highlighted{original suffix} next to the screenshot)

    Returns:
        Path to the highlighted screenshot
    """
    # Open inside a context manager so the source file handle is released;
    # convert() returns an independent in-memory copy.
    with Image.open(screenshot_path) as source:
        image = source.convert('RGBA')
    draw = ImageDraw.Draw(image)
    font = get_font(14)
    padding = 3

    for el in elements:
        # Skip elements with no meaningful size
        if el.width < 5 or el.height < 5:
            continue

        # Get color based on tag
        color = ELEMENT_COLORS.get(el.tag, ELEMENT_COLORS['default'])

        # Calculate bounding box
        x1, y1 = el.x, el.y
        x2, y2 = el.x + el.width, el.y + el.height

        # Draw dashed bounding box
        draw_dashed_rect(draw, (x1, y1, x2, y2), color)

        # Measure the index label
        label = str(el.index)
        bbox = draw.textbbox((0, 0), label, font=font)
        label_w = bbox[2] - bbox[0]
        label_h = bbox[3] - bbox[1]

        # Position: top-center of element, or above if small. Clamp x so a
        # label wider than a left-edge element is not pushed off the image.
        label_x = max(0, x1 + (el.width - label_w) // 2 - padding)
        if el.height < 40:
            label_y = max(0, y1 - label_h - padding * 2 - 2)
        else:
            label_y = y1 + 2

        # Draw label background
        draw.rectangle(
            [label_x, label_y,
             label_x + label_w + padding * 2,
             label_y + label_h + padding * 2],
            fill=color,
            outline='white',
            width=1
        )

        # Draw label text
        draw.text(
            (label_x + padding, label_y + padding),
            label,
            fill='white',
            font=font
        )

    # Save output
    if not output_path:
        p = Path(screenshot_path)
        output_path = str(p.parent / f"{p.stem}_highlighted{p.suffix}")

    # JPEG cannot store an alpha channel; drop it before saving to such a path.
    if Path(output_path).suffix.lower() in ('.jpg', '.jpeg'):
        image = image.convert('RGB')

    image.save(output_path)
    return output_path
139
+
140
+
def highlight_current_page(page, output_path: str = "screenshots/highlighted.png") -> str:
    """Take a screenshot and highlight all interactive elements.

    A temporary raw screenshot is written to the "screenshots" directory,
    annotated, and then deleted.

    Args:
        page: Playwright page object
        output_path: Path to save the highlighted screenshot. When left at its
            default, a timestamped file
            "screenshots/highlighted_<YYYYmmdd_HHMMSS>.png" is written instead,
            so repeated calls do not overwrite each other.

    Returns:
        Path to the highlighted screenshot
    """
    import os
    from datetime import datetime

    default_output = "screenshots/highlighted.png"

    # Ensure the working directory for the raw screenshot exists
    os.makedirs("screenshots", exist_ok=True)

    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

    if output_path == default_output:
        # Keep the timestamped naming for callers that did not choose a path.
        output_path = f"screenshots/highlighted_{timestamp}.png"
    else:
        # Honour an explicitly requested destination (previously ignored).
        target_dir = os.path.dirname(output_path)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)

    # Take screenshot
    screenshot_path = f"screenshots/raw_{timestamp}.png"
    page.screenshot(path=screenshot_path)

    try:
        # Extract elements and create the highlighted version
        elements = element_finder.extract_elements(page)
        return highlight_screenshot(screenshot_path, elements, output_path)
    finally:
        # Clean up the raw screenshot even if extraction or drawing failed
        if os.path.exists(screenshot_path):
            os.remove(screenshot_path)