semantio 0.0.6__py3-none-any.whl → 0.0.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,24 +1,11 @@
1
1
  # web_browser.py
2
2
  from typing import Dict, Any, List, Optional, Callable
3
3
  from pydantic import Field, BaseModel
4
- from selenium import webdriver
5
- from selenium.webdriver.common.by import By
6
- from selenium.webdriver.common.action_chains import ActionChains
7
- from selenium.webdriver.remote.webelement import WebElement
8
- from selenium.webdriver.support.ui import WebDriverWait
9
- from selenium.webdriver.support import expected_conditions as EC
10
- from selenium.webdriver.chrome.options import Options
11
- from selenium.webdriver.chrome.service import Service
12
- from webdriver_manager.chrome import ChromeDriverManager
13
- from bs4 import BeautifulSoup
14
- import json
15
- import time
16
- import re
17
- import logging
18
- import os
19
- import difflib
4
+ from playwright.sync_api import sync_playwright, Page, TimeoutError as PlaywrightTimeoutError
5
+ import json, time, re, logging, os, difflib
20
6
  from .base_tool import BaseTool
21
7
 
8
+ # Global logger
22
9
  logger = logging.getLogger(__name__)
23
10
 
24
11
  class BrowserPlan(BaseModel):
@@ -30,50 +17,63 @@ class BrowserPlan(BaseModel):
30
17
  class WebBrowserTool(BaseTool):
31
18
  name: str = Field("WebBrowser", description="Name of the tool")
32
19
  description: str = Field(
33
- "Highly advanced universal web automation tool with advanced element identification, AJAX waiting, modal dismissal, multi-tab support, and custom JS injection.",
20
+ "Universal web automation tool with advanced element identification (DOM and image fallback), modal analysis, AJAX waiting, multi-tab support, and custom JS injection.",
34
21
  description="Tool description"
35
22
  )
36
-
37
- default_timeout: int = 15 # Default wait timeout in seconds
38
- max_retries: int = 3 # Increased maximum retries for any task
23
+ default_timeout: int = 15000 # 15 seconds in milliseconds
24
+ max_retries: int = 3
25
+
26
+ def __init__(self, *args, **kwargs):
27
+ super().__init__(*args, **kwargs)
28
+ # Bypass Pydantic's restrictions for extra attributes.
29
+ object.__setattr__(self, "logger", logging.getLogger(__name__))
39
30
 
40
31
  def execute(self, input: Dict[str, Any]) -> Dict[str, Any]:
41
- """Execute an advanced dynamic web automation workflow."""
42
- driver = None
32
+ """
33
+ Execute the browser automation workflow.
34
+ Maintains a context string of executed tasks and passes it to fallback routines.
35
+ DOES NOT close the browser after successful execution.
36
+ """
43
37
  overall_start = time.time()
38
+ results = [] # to hold summaries of executed tasks (for context)
39
+ current_url = ""
44
40
  try:
45
41
  headless = input.get("headless", False)
46
- self.default_timeout = int(input.get("timeout", self.default_timeout))
42
+ self.default_timeout = int(input.get("timeout", 15)) * 1000
47
43
  self.max_retries = int(input.get("max_retries", self.max_retries))
48
- driver = self._init_browser(headless)
49
- results = []
50
- current_url = ""
51
-
52
- plan = self._generate_plan(input.get('query', ''), current_url)
44
+ plan = self._generate_plan(input.get("query", ""), current_url)
53
45
  if not plan.tasks:
54
46
  raise ValueError("No valid tasks in the generated plan.")
55
47
 
56
- # Dynamic mapping: action name to handler function.
57
- action_map: Dict[str, Callable[[webdriver.Chrome, Dict[str, Any]], Dict[str, Any]]] = {
58
- "navigate": lambda d, task: self._handle_navigation(d, task.get("value", "")),
59
- "click": lambda d, task: self._handle_click(d, task.get("selector", "")),
60
- "type": lambda d, task: self._handle_typing(d, task.get("selector", ""), task.get("value", ""), task),
61
- "wait": lambda d, task: self._handle_wait(task.get("value", "")),
62
- "wait_for_ajax": lambda d, task: self._handle_wait_for_ajax(d, task.get("value", "30")),
63
- "scroll": lambda d, task: self._handle_scroll(d, task.get("selector", "")),
64
- "hover": lambda d, task: self._handle_hover(d, task.get("selector", "")),
65
- "screenshot": lambda d, task: self._handle_screenshot(d, task.get("value", "screenshot.png")),
66
- "switch_tab": lambda d, task: self._handle_switch_tab(d, task.get("value", "0")),
67
- "execute_script": lambda d, task: self._handle_execute_script(d, task.get("value", "")),
68
- "drag_and_drop": lambda d, task: self._handle_drag_and_drop(d, task.get("selector", ""), task.get("value", "")),
48
+ # Start Playwright without a "with" block so we can leave the browser open.
49
+ p = sync_playwright().start()
50
+ browser = p.chromium.launch(headless=headless)
51
+ context = browser.new_context()
52
+ page = context.new_page()
53
+
54
+ # Map actions to handlers.
55
+ action_map: Dict[str, Callable[[Page, Dict[str, Any]], Dict[str, Any]]] = {
56
+ "navigate": lambda p, task: self._handle_navigation(p, task.get("value", "")),
57
+ "click": lambda p, task: self._handle_click(p, task.get("selector", "")),
58
+ "type": lambda p, task: self._handle_typing(p, task.get("selector", ""), task.get("value", ""), task),
59
+ "wait": lambda p, task: self._handle_wait(task.get("value", "")),
60
+ "wait_for_ajax": lambda p, task: self._handle_wait_for_ajax(p, task.get("value", "")),
61
+ "scroll": lambda p, task: self._handle_scroll(p, task.get("selector", "")),
62
+ "hover": lambda p, task: self._handle_hover(p, task.get("selector", "")),
63
+ "screenshot": lambda p, task: self._handle_screenshot(p, task.get("value", "screenshot.png")),
64
+ "switch_tab": lambda p, task: self._handle_switch_tab(context, task.get("value", "0")),
65
+ "execute_script": lambda p, task: self._handle_execute_script(p, task.get("value", "")),
66
+ "drag_and_drop": lambda p, task: self._handle_drag_and_drop(p, task.get("selector", ""), task.get("value", "")),
69
67
  }
70
68
 
71
69
  for task in plan.tasks:
72
- # Before each action, dismiss modals/overlays.
73
- self._dismiss_unwanted_modals(driver)
70
+ self._dismiss_unwanted_modals(page, task_context=task.get("description", ""))
74
71
  action = task.get("action", "").lower()
75
- logger.info(f"Executing task: {task.get('description', action)}")
72
+ self.logger.info(f"Executing task: {task.get('description', action)}")
76
73
  start_time = time.time()
74
+
75
+ # Build a context string from previously executed tasks.
76
+ executed_context = "\n".join([f"{r['action']}: {r['message']}" for r in results])
77
77
  handler = action_map.get(action)
78
78
  if not handler:
79
79
  results.append({
@@ -83,45 +83,28 @@ class WebBrowserTool(BaseTool):
83
83
  })
84
84
  continue
85
85
 
86
- result = self._execute_with_retries(driver, task, handler)
86
+ result = self._execute_with_retries(page, task, handler, executed_context)
87
87
  elapsed = time.time() - start_time
88
88
  result["elapsed"] = elapsed
89
- logger.info(f"Action '{action}' completed in {elapsed:.2f} seconds.")
89
+ self.logger.info(f"Action '{action}' completed in {elapsed:.2f} seconds.")
90
90
  results.append(result)
91
91
 
92
- if not result.get('success', False):
93
- logger.error(f"Task failed: {result.get('message')}")
94
- self._capture_failure_screenshot(driver, action)
92
+ if not result.get("success", False):
93
+ self.logger.error(f"Task failed: {result.get('message')}")
94
+ self._capture_failure_screenshot(page, action)
95
95
  break
96
96
 
97
- current_url = driver.current_url
97
+ current_url = page.url
98
98
 
99
99
  overall_elapsed = time.time() - overall_start
100
- logger.info(f"Total execution time: {overall_elapsed:.2f} seconds.")
100
+ self.logger.info(f"Total execution time: {overall_elapsed:.2f} seconds.")
101
+ # Do not close the browser.
101
102
  return {"status": "success", "results": results, "total_time": overall_elapsed}
102
-
103
103
  except Exception as e:
104
- logger.exception("Execution error:")
104
+ self.logger.exception("Execution error:")
105
105
  return {"status": "error", "message": str(e)}
106
- finally:
107
- if driver:
108
- driver.quit()
109
-
110
- def _init_browser(self, headless: bool) -> webdriver.Chrome:
111
- """Initialize browser with advanced options."""
112
- options = Options()
113
- options.add_argument("--start-maximized")
114
- options.add_argument("--disable-blink-features=AutomationControlled")
115
- options.add_experimental_option("excludeSwitches", ["enable-automation"])
116
- if headless:
117
- options.add_argument("--headless=new")
118
- return webdriver.Chrome(
119
- service=Service(ChromeDriverManager().install()),
120
- options=options
121
- )
122
106
 
123
107
  def _generate_plan(self, query: str, current_url: str) -> BrowserPlan:
124
- """Generate an adaptive execution plan using an LLM or other dynamic planner."""
125
108
  prompt = f"""Generate browser automation plan for: {query}
126
109
 
127
110
  Current URL: {current_url or 'No page loaded yet'}
@@ -150,7 +133,6 @@ Guidelines:
150
133
  return self._parse_plan(response)
151
134
 
152
135
  def _parse_plan(self, response: str) -> BrowserPlan:
153
- """Robust JSON parsing with multiple fallback strategies."""
154
136
  try:
155
137
  json_match = re.search(r'```json\n?(.+?)\n?```', response, re.DOTALL)
156
138
  if json_match:
@@ -163,7 +145,7 @@ Guidelines:
163
145
  validated_tasks = []
164
146
  for task in plan_data.get("tasks", []):
165
147
  if not all(key in task for key in ["action", "description"]):
166
- logger.warning(f"Skipping task due to missing keys: {task}")
148
+ self.logger.warning(f"Skipping task due to missing keys: {task}")
167
149
  continue
168
150
  validated_tasks.append({
169
151
  "action": task["action"],
@@ -173,267 +155,391 @@ Guidelines:
173
155
  })
174
156
  return BrowserPlan(tasks=validated_tasks)
175
157
  except (json.JSONDecodeError, AttributeError, ValueError) as e:
176
- logger.error(f"Plan parsing failed: {e}")
158
+ self.logger.error(f"Plan parsing failed: {e}")
177
159
  return BrowserPlan(tasks=[])
178
160
 
179
- def _execute_with_retries(self, driver: webdriver.Chrome, task: Dict[str, Any],
180
- handler: Callable[[webdriver.Chrome, Dict[str, Any]], Dict[str, Any]]) -> Dict[str, Any]:
181
- """Execute a task with retry logic and exponential backoff."""
161
+ def _execute_with_retries(self, page: Page, task: Dict[str, Any],
162
+ handler: Callable[[Page, Dict[str, Any]], Dict[str, Any]],
163
+ executed_context: str = "") -> Dict[str, Any]:
164
+ """Execute a task with retry logic. If it fails, pass the executed_context to the fallback prompt.
165
+ The fallback now returns a JSON array of tasks, which are executed sequentially."""
182
166
  attempts = 0
183
167
  result = {}
184
168
  while attempts < self.max_retries:
185
- result = self._execute_safe_task(driver, task, handler)
169
+ result = self._execute_safe_task(page, task, handler)
186
170
  if result.get("success", False):
187
171
  return result
188
172
  attempts += 1
189
- logger.info(f"Retrying task '{task.get('action')}' (attempt {attempts + 1}/{self.max_retries})")
173
+ self.logger.info(f"Retrying task '{task.get('action')}' (attempt {attempts + 1}/{self.max_retries})")
190
174
  time.sleep(1 * attempts)
175
+ if task.get("action") in ["click", "type"]:
176
+ self.logger.info("HTML-based automation failed. Using fallback with image-based LLM.")
177
+ result = self._fallback_with_image_llm(page, task, executed_context)
191
178
  return result
192
179
 
193
- def _execute_safe_task(self, driver: webdriver.Chrome, task: Dict[str, Any],
194
- handler: Callable[[webdriver.Chrome, Dict[str, Any]], Dict[str, Any]]) -> Dict[str, Any]:
195
- """Execute a task with comprehensive error handling."""
180
+ def _execute_safe_task(self, page: Page, task: Dict[str, Any],
181
+ handler: Callable[[Page, Dict[str, Any]], Dict[str, Any]]) -> Dict[str, Any]:
196
182
  try:
197
- return handler(driver, task)
183
+ return handler(page, task)
198
184
  except Exception as e:
199
185
  action = task.get("action", "unknown")
200
- logger.exception(f"Error executing task '{action}':")
186
+ self.logger.exception(f"Error executing task '{action}':")
201
187
  return {"action": action, "success": False, "message": f"Critical error: {str(e)}"}
202
188
 
203
- def _dismiss_unwanted_modals(self, driver: webdriver.Chrome):
204
- """
205
- Dismiss or remove unwanted modals, overlays, or pop-ups.
206
- First attempts to click a close button; if not available, removes the element via JS.
207
- """
189
+ def _dismiss_unwanted_modals(self, page: Page, task_context: str = ""):
190
+ modal_selectors = [".modal", ".popup", '[role="dialog"]', ".overlay", ".lightbox"]
191
+ for selector in modal_selectors:
192
+ elements = page.query_selector_all(selector)
193
+ for modal in elements:
194
+ if modal.is_visible():
195
+ self._handle_modal(page, modal, task_context)
196
+
197
+ def _handle_modal(self, page: Page, modal_element, task_context: str):
208
198
  try:
209
- modal_selectors = [".modal", ".popup", '[role="dialog"]', ".overlay", ".lightbox"]
210
- for selector in modal_selectors:
211
- elements = driver.find_elements(By.CSS_SELECTOR, selector)
212
- for modal in elements:
213
- if modal.is_displayed():
214
- close_selectors = [".close", ".btn-close", "[aria-label='Close']", "[data-dismiss='modal']"]
215
- dismissed = False
216
- for close_sel in close_selectors:
217
- try:
218
- close_button = modal.find_element(By.CSS_SELECTOR, close_sel)
219
- if close_button.is_displayed():
220
- close_button.click()
221
- dismissed = True
222
- logger.info(f"Dismissed modal using selector {close_sel}")
223
- time.sleep(1)
224
- break
225
- except Exception:
226
- continue
227
- if not dismissed:
228
- # Remove overlay by setting display to none
229
- driver.execute_script("arguments[0].remove();", modal)
230
- logger.info(f"Removed overlay/modal with selector {selector}")
199
+ modal_screenshot = modal_element.screenshot()
200
+ prompt = (
201
+ f"A modal is displayed on the page. The content is visible in the attached image. "
202
+ f"The current task context is: \"{task_context}\". "
203
+ "Based on the content of the modal and the task context, decide whether to dismiss the modal. "
204
+ "Return a JSON response in the format: { \"action\": \"dismiss\" } to dismiss or { \"action\": \"ignore\" } to leave it. "
205
+ "Return only the JSON."
206
+ )
207
+ response_text = self.llm.generate_from_image(prompt, image_bytes=modal_screenshot)
208
+ self.logger.info(f"LLM response for modal analysis: {response_text}")
209
+ json_match = re.search(r'```json\n?(.+?)\n?```', response_text, re.DOTALL)
210
+ json_text = json_match.group(1).strip() if json_match else response_text.strip()
211
+ decision = json.loads(json_text)
212
+ if decision.get("action") == "dismiss":
213
+ close_buttons = modal_element.query_selector_all(".close, .btn-close, [aria-label='Close'], [data-dismiss='modal']")
214
+ for btn in close_buttons:
215
+ if btn.is_visible():
216
+ btn.click()
217
+ self.logger.info("Modal dismissed using a close button.")
218
+ return
219
+ page.evaluate("(modal) => modal.remove()", modal_element)
220
+ self.logger.info("Modal dismissed by removal.")
221
+ else:
222
+ self.logger.info("Modal left intact according to LLM analysis.")
231
223
  except Exception as e:
232
- logger.debug(f"Modal dismissal error: {e}")
224
+ self.logger.error(f"Modal handling error: {e}")
233
225
 
234
- def _advanced_find_element(self, driver: webdriver.Chrome, keyword: str) -> Optional[WebElement]:
226
+ def _advanced_find_element(self, page: Page, keyword: str):
227
+ try:
228
+ candidates = page.query_selector_all("input, textarea, button, a, div")
229
+ best_match = None
230
+ best_ratio = 0.0
231
+ for candidate in candidates:
232
+ attrs = page.evaluate(
233
+ """(el) => {
234
+ return {
235
+ id: el.id,
236
+ name: el.getAttribute('name'),
237
+ placeholder: el.getAttribute('placeholder'),
238
+ aria: el.getAttribute('aria-label'),
239
+ text: el.innerText
240
+ };
241
+ }""",
242
+ candidate,
243
+ )
244
+ combined_text = " ".join(
245
+ filter(None, [
246
+ attrs.get("id"),
247
+ attrs.get("name"),
248
+ attrs.get("placeholder"),
249
+ attrs.get("aria"),
250
+ attrs.get("text"),
251
+ ])
252
+ )
253
+ ratio = difflib.SequenceMatcher(None, combined_text.lower(), keyword.lower()).ratio()
254
+ if ratio > best_ratio:
255
+ best_ratio = ratio
256
+ best_match = candidate
257
+ if best_ratio > 0.5:
258
+ self.logger.info(f"Advanced fallback detected element with similarity {best_ratio:.2f} for keyword '{keyword}'")
259
+ return best_match
260
+ return None
261
+ except Exception as e:
262
+ self.logger.error(f"Advanced find element error: {e}")
263
+ return None
264
+
265
+ def _annotate_page_with_numbers(self, page: Page, query: str = "button, a, input, [onclick]"):
266
+ script = f"""
267
+ (() => {{
268
+ document.querySelectorAll('.automation-annotation-overlay').forEach(el => el.remove());
269
+ const elements = document.querySelectorAll('{query}');
270
+ let counter = 1;
271
+ elements.forEach(el => {{
272
+ const rect = el.getBoundingClientRect();
273
+ if (rect.width === 0 || rect.height === 0) return;
274
+ const overlay = document.createElement('div');
275
+ overlay.classList.add('automation-annotation-overlay');
276
+ overlay.style.position = 'absolute';
277
+ overlay.style.left = (rect.left + window.scrollX) + 'px';
278
+ overlay.style.top = (rect.top + window.scrollY) + 'px';
279
+ overlay.style.width = rect.width + 'px';
280
+ overlay.style.height = rect.height + 'px';
281
+ overlay.style.border = '2px solid red';
282
+ overlay.style.zIndex = 9999;
283
+ overlay.style.pointerEvents = 'none';
284
+ overlay.textContent = counter;
285
+ overlay.style.fontSize = '16px';
286
+ overlay.style.fontWeight = 'bold';
287
+ overlay.style.color = 'red';
288
+ overlay.style.backgroundColor = 'rgba(255, 255, 255, 0.7)';
289
+ document.body.appendChild(overlay);
290
+ counter += 1;
291
+ }});
292
+ }})();
235
293
  """
236
- Advanced fallback for finding an element.
237
- Searches across multiple attributes and inner text using fuzzy matching.
294
+ page.evaluate(script)
295
+
296
+ def _click_element_by_number(self, page: Page, number: int) -> Dict[str, Any]:
297
+ candidates = [el for el in page.query_selector_all("button, a, input, [onclick]") if el.is_visible()]
298
+ index = number - 1
299
+ if index < len(candidates):
300
+ candidate = candidates[index]
301
+ candidate.scroll_into_view_if_needed()
302
+ try:
303
+ candidate.click()
304
+ return {"action": "click", "success": True, "message": f"Clicked element number {number}"}
305
+ except Exception as e:
306
+ return {"action": "click", "success": False, "message": f"Click failed: {str(e)}"}
307
+ else:
308
+ return {"action": "click", "success": False, "message": f"Element number {number} not found."}
309
+
310
+ def _fallback_with_image_llm(self, page: Page, task: Dict[str, Any], executed_context: str = "") -> Dict[str, Any]:
311
+ """
312
+ Fallback method: Annotate the page, capture a screenshot, and ask the LLM (via image analysis)
313
+ to generate a JSON array of tasks for the next steps.
314
+ Each fallback task is an object:
315
+ {
316
+ "action": "click" or "type",
317
+ "element_number": <number>,
318
+ "text": <if action is 'type', the text to type; otherwise an empty string>
319
+ }
320
+ The prompt includes the executed_context.
238
321
  """
239
- candidates = driver.find_elements(By.CSS_SELECTOR, "input, textarea, button, a, div")
240
- best_match = None
241
- best_ratio = 0.0
242
- for candidate in candidates:
243
- combined_text = " ".join([
244
- candidate.get_attribute("id") or "",
245
- candidate.get_attribute("name") or "",
246
- candidate.get_attribute("placeholder") or "",
247
- candidate.get_attribute("aria-label") or "",
248
- candidate.text or "",
249
- ])
250
- ratio = difflib.SequenceMatcher(None, combined_text.lower(), keyword.lower()).ratio()
251
- if ratio > best_ratio:
252
- best_ratio = ratio
253
- best_match = candidate
254
- if best_ratio > 0.5:
255
- logger.info(f"Advanced fallback detected element with similarity {best_ratio:.2f} for keyword '{keyword}'")
256
- return best_match
257
- return None
258
-
259
- def _handle_navigation(self, driver: webdriver.Chrome, url: str) -> Dict[str, Any]:
260
- """Handle navigation with URL correction."""
322
+ query = "input, textarea" if task.get("action") == "type" else "button, a, input, [onclick]"
323
+ self._annotate_page_with_numbers(page, query=query)
324
+ time.sleep(1)
325
+ screenshot_bytes = page.screenshot(type="png")
326
+ extra = ""
327
+ if task.get("action") == "type":
328
+ extra = f"\nThe exact text to be entered is: \"{task.get('value', '').strip()}\"."
329
+ prompt = (
330
+ f"Tasks executed so far:\n{executed_context}\n\n"
331
+ f"The following task remains: {task.get('description', '')}.{extra}\n"
332
+ "I have annotated the page with numbered overlays using the appropriate query. "
333
+ "Based on the attached screenshot, generate a JSON array of tasks that need to be performed next. "
334
+ "Each task should be a JSON object with the format:\n"
335
+ "[\n"
336
+ " {\n"
337
+ " \"action\": \"click\" or \"type\",\n"
338
+ " \"element_number\": <number>,\n"
339
+ " \"text\": <if action is 'type', the text to type; otherwise an empty string>\n"
340
+ " },\n"
341
+ " ...\n"
342
+ "]\n"
343
+ "Return only the JSON array."
344
+ )
345
+ response_text = self.llm.generate_from_image(prompt, image_bytes=screenshot_bytes)
346
+ self.logger.info(f"LLM response for fallback: {response_text}")
347
+ try:
348
+ fallback_tasks = json.loads(response_text.strip())
349
+ if not isinstance(fallback_tasks, list):
350
+ fallback_tasks = [fallback_tasks]
351
+ except Exception as e:
352
+ json_match = re.search(r'```json\n?(.+?)\n?```', response_text, re.DOTALL)
353
+ if json_match:
354
+ json_text = json_match.group(1).strip()
355
+ fallback_tasks = json.loads(json_text)
356
+ if not isinstance(fallback_tasks, list):
357
+ fallback_tasks = [fallback_tasks]
358
+ else:
359
+ return {"action": task.get("action"), "success": False, "message": f"Fallback failed to parse JSON: {str(e)}"}
360
+
361
+ fallback_results = []
362
+ for fb_task in fallback_tasks:
363
+ action = fb_task.get("action")
364
+ element_number = fb_task.get("element_number")
365
+ if action == "type":
366
+ returned_text = fb_task.get("text", "").strip()
367
+ original_text = task.get("value", "").strip()
368
+ if returned_text.lower() != original_text.lower():
369
+ self.logger.info("Overriding LLM-provided text with original input text.")
370
+ text = original_text
371
+ else:
372
+ text = returned_text
373
+ else:
374
+ text = fb_task.get("text", "")
375
+ if action == "click":
376
+ self.logger.info(f"LLM indicated fallback click on element number {element_number}.")
377
+ res = self._click_element_by_number(page, element_number)
378
+ elif action == "type":
379
+ candidates = [el for el in page.query_selector_all("input, textarea") if el.is_visible()]
380
+ if element_number - 1 < len(candidates):
381
+ candidate = candidates[element_number - 1]
382
+ candidate.scroll_into_view_if_needed()
383
+ try:
384
+ candidate.fill(text, timeout=self.default_timeout)
385
+ res = {"action": "type", "success": True, "message": f"Typed '{text}' into element number {element_number}"}
386
+ except Exception as ex:
387
+ res = {"action": "type", "success": False, "message": f"Typing failed on fallback element: {str(ex)}"}
388
+ else:
389
+ res = {"action": "type", "success": False, "message": f"Element number {element_number} not found."}
390
+ else:
391
+ res = {"action": task.get("action"), "success": False, "message": "Invalid fallback action."}
392
+ fallback_results.append(res)
393
+ overall_success = any(r.get("success", False) for r in fallback_results)
394
+ overall_message = "; ".join([r.get("message", "") for r in fallback_results])
395
+ return {"action": task.get("action"), "success": overall_success, "message": overall_message}
396
+
397
+ def _handle_navigation(self, page: Page, url: str) -> Dict[str, Any]:
261
398
  if not url.startswith(("http://", "https://")):
262
399
  url = f"https://{url}"
263
400
  try:
264
- driver.get(url)
265
- WebDriverWait(driver, self.default_timeout).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
401
+ page.goto(url, timeout=self.default_timeout)
402
+ page.wait_for_selector("body", timeout=self.default_timeout)
266
403
  return {"action": "navigate", "success": True, "message": f"Navigated to {url}"}
404
+ except PlaywrightTimeoutError as e:
405
+ self.logger.error(f"Navigation to {url} timed out: {e}")
406
+ return {"action": "navigate", "success": False, "message": f"Navigation timed out: {str(e)}"}
267
407
  except Exception as e:
268
- logger.error(f"Navigation to {url} failed: {e}")
408
+ self.logger.error(f"Navigation to {url} failed: {e}")
269
409
  return {"action": "navigate", "success": False, "message": f"Navigation failed: {str(e)}"}
270
410
 
271
- def _handle_click(self, driver: webdriver.Chrome, selector: str) -> Dict[str, Any]:
272
- """Handle click actions with fallback using JS if needed."""
411
+ def _handle_click(self, page: Page, selector: str) -> Dict[str, Any]:
273
412
  try:
274
- element = WebDriverWait(driver, self.default_timeout).until(
275
- EC.element_to_be_clickable((By.CSS_SELECTOR, selector))
276
- )
277
- driver.execute_script("arguments[0].scrollIntoView({behavior: 'smooth', block: 'center'});", element)
278
- try:
279
- element.click()
280
- except Exception:
281
- driver.execute_script("arguments[0].click();", element)
413
+ page.wait_for_selector(selector, state="visible", timeout=self.default_timeout)
414
+ page.click(selector, timeout=self.default_timeout)
282
415
  return {"action": "click", "success": True, "message": f"Clicked element: {selector}"}
416
+ except PlaywrightTimeoutError as e:
417
+ self.logger.error(f"Click action timed out on selector {selector}: {e}")
418
+ return {"action": "click", "success": False, "message": f"Click timed out: {str(e)}"}
283
419
  except Exception as e:
284
- logger.error(f"Click action failed on selector {selector}: {e}")
420
+ self.logger.error(f"Click action failed on selector {selector}: {e}")
285
421
  return {"action": "click", "success": False, "message": f"Click failed: {str(e)}"}
286
422
 
287
- def _handle_typing(self, driver: webdriver.Chrome, selector: str, text: str, task: Dict[str, Any]) -> Dict[str, Any]:
288
- """
289
- Handle typing into an element.
290
- If the primary selector fails, attempt advanced fallback detection.
291
- """
292
- try:
293
- element = WebDriverWait(driver, self.default_timeout).until(
294
- EC.presence_of_element_located((By.CSS_SELECTOR, selector))
295
- )
296
- except Exception as e:
297
- # If the task seems to involve search or similar text, use advanced fallback.
298
- if "search" in task.get("description", "").lower() or "search" in selector.lower():
299
- logger.info("Primary selector failed; using advanced fallback for element detection.")
300
- element = self._advanced_find_element(driver, "search")
301
- if not element:
302
- return {"action": "type", "success": False, "message": f"Typing failed: No search-like element found; error: {str(e)}"}
303
- else:
304
- return {"action": "type", "success": False, "message": f"Typing failed: {str(e)}"}
423
+ def _handle_typing(self, page: Page, selector: str, text: str, task: Dict[str, Any]) -> Dict[str, Any]:
305
424
  try:
306
- element.clear()
307
- element.send_keys(text)
425
+ page.wait_for_selector(selector, state="attached", timeout=self.default_timeout)
426
+ page.fill(selector, text, timeout=self.default_timeout)
308
427
  return {"action": "type", "success": True, "message": f"Typed '{text}' into element."}
428
+ except PlaywrightTimeoutError as e:
429
+ self.logger.info("Primary selector failed; using advanced fallback for element detection.")
430
+ element = self._advanced_find_element(page, "search")
431
+ if not element:
432
+ return {"action": "type", "success": False, "message": f"Typing failed: No search-like element found; error: {str(e)}"}
433
+ try:
434
+ element.fill(text, timeout=self.default_timeout)
435
+ return {"action": "type", "success": True, "message": f"Typed '{text}' into fallback element."}
436
+ except Exception as ex:
437
+ return {"action": "type", "success": False, "message": f"Typing failed on fallback element: {str(ex)}"}
309
438
  except Exception as e:
310
- logger.error(f"Typing action failed: {e}")
439
+ self.logger.error(f"Typing action failed: {e}")
311
440
  return {"action": "type", "success": False, "message": f"Typing failed: {str(e)}"}
312
441
 
313
442
  def _handle_wait(self, seconds: str) -> Dict[str, Any]:
314
- """Handle a simple wait."""
315
443
  try:
316
444
  wait_time = float(seconds)
317
- logger.info(f"Waiting for {wait_time} seconds")
445
+ self.logger.info(f"Waiting for {wait_time} seconds")
318
446
  time.sleep(wait_time)
319
447
  return {"action": "wait", "success": True, "message": f"Waited {wait_time} seconds"}
320
448
  except ValueError as e:
321
- logger.error(f"Invalid wait time provided: {seconds}")
449
+ self.logger.error(f"Invalid wait time provided: {seconds}")
322
450
  return {"action": "wait", "success": False, "message": "Invalid wait time"}
323
451
 
324
- def _handle_wait_for_ajax(self, driver: webdriver.Chrome, seconds: str) -> Dict[str, Any]:
325
- """
326
- Wait until AJAX/network activity has subsided.
327
- This implementation first checks for jQuery, then falls back to a generic check.
328
- """
452
+ def _handle_wait_for_ajax(self, page: Page, seconds: str) -> Dict[str, Any]:
329
453
  try:
330
- timeout = int(seconds)
331
- logger.info(f"Waiting for AJAX/network activity for up to {timeout} seconds.")
332
- end_time = time.time() + timeout
454
+ timeout_seconds = int(seconds) if seconds.strip() != "" else 30
455
+ self.logger.info(f"Waiting for AJAX/network activity for up to {timeout_seconds} seconds.")
456
+ end_time = time.time() + timeout_seconds
333
457
  while time.time() < end_time:
334
- ajax_complete = driver.execute_script("""
335
- return (window.jQuery ? jQuery.active === 0 : true) &&
336
- (typeof window.fetch === 'function' ? true : true);
458
+ ajax_complete = page.evaluate("""
459
+ () => {
460
+ return (window.jQuery ? jQuery.active === 0 : true) &&
461
+ (typeof window.fetch === 'function' ? true : true);
462
+ }
337
463
  """)
338
464
  if ajax_complete:
339
465
  break
340
466
  time.sleep(0.5)
341
467
  return {"action": "wait_for_ajax", "success": True, "message": "AJAX/network activity subsided."}
342
468
  except Exception as e:
343
- logger.error(f"Wait for AJAX failed: {e}")
469
+ self.logger.error(f"Wait for AJAX failed: {e}")
344
470
  return {"action": "wait_for_ajax", "success": False, "message": f"Wait for AJAX failed: {str(e)}"}
345
471
 
346
- def _handle_scroll(self, driver: webdriver.Chrome, selector: str) -> Dict[str, Any]:
347
- """Handle scrolling to a specific element or page bottom."""
472
+ def _handle_scroll(self, page: Page, selector: str) -> Dict[str, Any]:
348
473
  try:
349
474
  if selector:
350
- element = WebDriverWait(driver, self.default_timeout).until(
351
- EC.presence_of_element_located((By.CSS_SELECTOR, selector))
352
- )
353
- driver.execute_script("arguments[0].scrollIntoView({behavior: 'smooth', block: 'center'});", element)
475
+ page.wait_for_selector(selector, timeout=self.default_timeout)
476
+ page.eval_on_selector(selector, "el => el.scrollIntoView({behavior: 'smooth', block: 'center'})")
354
477
  scroll_target = selector
355
478
  else:
356
- driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
479
+ page.evaluate("window.scrollTo(0, document.body.scrollHeight);")
357
480
  scroll_target = "page bottom"
358
481
  return {"action": "scroll", "success": True, "message": f"Scrolled to {scroll_target}"}
359
482
  except Exception as e:
360
- logger.error(f"Scroll action failed on selector {selector}: {e}")
483
+ self.logger.error(f"Scroll action failed on selector {selector}: {e}")
361
484
  return {"action": "scroll", "success": False, "message": f"Scroll failed: {str(e)}"}
362
485
 
363
- def _handle_hover(self, driver: webdriver.Chrome, selector: str) -> Dict[str, Any]:
364
- """Handle mouse hover action."""
486
+ def _handle_hover(self, page: Page, selector: str) -> Dict[str, Any]:
365
487
  try:
366
- element = WebDriverWait(driver, self.default_timeout).until(
367
- EC.visibility_of_element_located((By.CSS_SELECTOR, selector))
368
- )
369
- ActionChains(driver).move_to_element(element).perform()
488
+ page.wait_for_selector(selector, state="visible", timeout=self.default_timeout)
489
+ page.hover(selector, timeout=self.default_timeout)
370
490
  return {"action": "hover", "success": True, "message": f"Hovered over {selector}"}
371
491
  except Exception as e:
372
- logger.error(f"Hover action failed on selector {selector}: {e}")
492
+ self.logger.error(f"Hover action failed on selector {selector}: {e}")
373
493
  return {"action": "hover", "success": False, "message": f"Hover failed: {str(e)}"}
374
494
 
375
- def _handle_screenshot(self, driver: webdriver.Chrome, filename: str) -> Dict[str, Any]:
376
- """Capture a screenshot of the current browser state."""
495
+ def _handle_screenshot(self, page: Page, filename: str) -> Dict[str, Any]:
377
496
  try:
378
- driver.save_screenshot(filename)
497
+ page.screenshot(path=filename)
379
498
  return {"action": "screenshot", "success": True, "message": f"Screenshot saved as {filename}"}
380
499
  except Exception as e:
381
- logger.error(f"Screenshot capture failed: {e}")
500
+ self.logger.error(f"Screenshot capture failed: {e}")
382
501
  return {"action": "screenshot", "success": False, "message": f"Screenshot failed: {str(e)}"}
383
502
 
384
- def _handle_switch_tab(self, driver: webdriver.Chrome, value: str) -> Dict[str, Any]:
385
- """
386
- Switch between tabs. 'value' can be an index or the keyword 'new'.
387
- """
503
+ def _handle_switch_tab(self, context, value: str) -> Dict[str, Any]:
388
504
  try:
389
- handles = driver.window_handles
505
+ pages = context.pages
390
506
  if value.lower() == "new":
391
- target_handle = handles[-1]
507
+ target_page = pages[-1]
392
508
  else:
393
509
  idx = int(value)
394
- if idx < len(handles):
395
- target_handle = handles[idx]
510
+ if idx < len(pages):
511
+ target_page = pages[idx]
396
512
  else:
397
513
  return {"action": "switch_tab", "success": False, "message": f"Tab index {value} out of range"}
398
- driver.switch_to.window(target_handle)
399
514
  return {"action": "switch_tab", "success": True, "message": f"Switched to tab {value}"}
400
515
  except Exception as e:
401
- logger.error(f"Switch tab failed: {e}")
516
+ self.logger.error(f"Switch tab failed: {e}")
402
517
  return {"action": "switch_tab", "success": False, "message": f"Switch tab failed: {str(e)}"}
403
518
 
404
- def _handle_execute_script(self, driver: webdriver.Chrome, script: str) -> Dict[str, Any]:
405
- """
406
- Execute arbitrary JavaScript code.
407
- """
519
+ def _handle_execute_script(self, page: Page, script: str) -> Dict[str, Any]:
408
520
  try:
409
- result = driver.execute_script(script)
521
+ result = page.evaluate(script)
410
522
  return {"action": "execute_script", "success": True, "message": "Script executed successfully", "result": result}
411
523
  except Exception as e:
412
- logger.error(f"Execute script failed: {e}")
524
+ self.logger.error(f"Execute script failed: {e}")
413
525
  return {"action": "execute_script", "success": False, "message": f"Script execution failed: {str(e)}"}
414
526
 
415
- def _handle_drag_and_drop(self, driver: webdriver.Chrome, source_selector: str, target_selector: str) -> Dict[str, Any]:
416
- """
417
- Simulate a drag-and-drop operation.
418
- """
527
+ def _handle_drag_and_drop(self, page: Page, source_selector: str, target_selector: str) -> Dict[str, Any]:
419
528
  try:
420
- source = WebDriverWait(driver, self.default_timeout).until(
421
- EC.presence_of_element_located((By.CSS_SELECTOR, source_selector))
422
- )
423
- target = WebDriverWait(driver, self.default_timeout).until(
424
- EC.presence_of_element_located((By.CSS_SELECTOR, target_selector))
425
- )
426
- ActionChains(driver).drag_and_drop(source, target).perform()
529
+ page.wait_for_selector(source_selector, timeout=self.default_timeout)
530
+ page.wait_for_selector(target_selector, timeout=self.default_timeout)
531
+ source = page.locator(source_selector)
532
+ target = page.locator(target_selector)
533
+ source.drag_to(target, timeout=self.default_timeout)
427
534
  return {"action": "drag_and_drop", "success": True, "message": f"Dragged element from {source_selector} to {target_selector}"}
428
535
  except Exception as e:
429
- logger.error(f"Drag and drop failed from {source_selector} to {target_selector}: {e}")
536
+ self.logger.error(f"Drag and drop failed from {source_selector} to {target_selector}: {e}")
430
537
  return {"action": "drag_and_drop", "success": False, "message": f"Drag and drop failed: {str(e)}"}
431
538
 
432
- def _capture_failure_screenshot(self, driver: webdriver.Chrome, action: str):
433
- """Capture a screenshot for debugging when an error occurs."""
539
+ def _capture_failure_screenshot(self, page: Page, action: str):
434
540
  filename = f"failure_{action}_{int(time.time())}.png"
435
541
  try:
436
- driver.save_screenshot(filename)
437
- logger.info(f"Failure screenshot captured: {filename}")
542
+ page.screenshot(path=filename)
543
+ self.logger.info(f"Failure screenshot captured: {filename}")
438
544
  except Exception as e:
439
- logger.error(f"Failed to capture screenshot: {e}")
545
+ self.logger.error(f"Failed to capture screenshot: {e}")