lumivor 0.1.7__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,557 @@
1
+ import asyncio
2
+ import logging
3
+
4
+ from main_content_extractor import MainContentExtractor
5
+ from playwright.async_api import Page
6
+
7
+ from lumivor.agent.views import ActionModel, ActionResult
8
+ from lumivor.browser.context import BrowserContext
9
+ from lumivor.controller.registry.service import Registry
10
+ from lumivor.controller.views import (
11
+ ClickElementAction,
12
+ DoneAction,
13
+ ExtractPageContentAction,
14
+ GoToUrlAction,
15
+ InputTextAction,
16
+ OpenTabAction,
17
+ ScrollAction,
18
+ SearchGoogleAction,
19
+ SendKeysAction,
20
+ SwitchTabAction,
21
+ )
22
+ from lumivor.utils import time_execution_async, time_execution_sync
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
+
27
+ class Controller:
28
def __init__(self):
    """Create the action registry and fill it with the built-in browser actions."""
    # The registry has to exist before the defaults are registered, because
    # registration goes through ``self.registry.action``.
    self.registry = Registry()
    self._register_default_actions()
33
+
34
+ def _register_default_actions(self):
35
+ """Register all default browser actions"""
36
+
37
+ # Basic Navigation Actions
38
@self.registry.action(
    'Search Google in the current tab',
    param_model=SearchGoogleAction,
    requires_browser=True,
)
async def search_google(params: SearchGoogleAction, browser: BrowserContext):
    """Run a Google search for ``params.query`` in the current tab.

    The query is percent-encoded before it is placed in the URL so that
    characters such as ``&``, ``#``, ``+`` or spaces are sent as search terms
    instead of being interpreted as URL syntax.

    Returns:
        ActionResult whose ``extracted_content`` describes the search, with
        ``include_in_memory=True``.
    """
    # Local import: keeps this action self-contained without touching the
    # module-level import block.
    from urllib.parse import quote_plus

    page = await browser.get_current_page()
    await page.goto(f'https://www.google.com/search?q={quote_plus(params.query)}')
    await page.wait_for_load_state()
    # The log/result message shows the query as the caller wrote it.
    msg = f'🔍 Searched for "{params.query}" in Google'
    logger.info(msg)
    return ActionResult(extracted_content=msg, include_in_memory=True)
50
+
51
@self.registry.action(
    'Navigate to URL in the current tab',
    param_model=GoToUrlAction,
    requires_browser=True,
)
async def go_to_url(params: GoToUrlAction, browser: BrowserContext):
    """Load ``params.url`` in the current tab and wait for its load state."""
    current_page = await browser.get_current_page()
    target = params.url
    await current_page.goto(target)
    await current_page.wait_for_load_state()
    summary = f'🔗 Navigated to {target}'
    logger.info(summary)
    return ActionResult(extracted_content=summary, include_in_memory=True)
61
+
62
@self.registry.action('Go back', requires_browser=True)
async def go_back(browser: BrowserContext):
    """Go one step back in the current tab's history and wait for its load state."""
    current_page = await browser.get_current_page()
    await current_page.go_back()
    await current_page.wait_for_load_state()
    summary = '🔙 Navigated back'
    logger.info(summary)
    return ActionResult(extracted_content=summary, include_in_memory=True)
70
+
71
+ # Element Interaction Actions
72
@self.registry.action(
    'Click element', param_model=ClickElementAction, requires_browser=True
)
async def click_element(params: ClickElementAction, browser: BrowserContext):
    """Click the element stored under ``params.index`` in the cached selector map.

    Elements that open a file-upload dialog are not clicked; a message saying
    so is returned instead. If the number of pages in the browser context
    grows because of the click, the action switches to the last tab.

    Raises:
        Exception: if ``params.index`` is not present in the cached selector map.

    Returns:
        ActionResult describing the click (``include_in_memory=True``), or an
        ActionResult with ``error`` set when clicking raised.
    """
    session = await browser.get_session()
    state = session.cached_state

    if params.index not in state.selector_map:
        # f-string fields are kept on one physical line: splitting a field
        # across lines is only valid syntax from Python 3.12 on.
        raise Exception(
            f'Element with index {params.index} does not exist - retry or use alternative actions'
        )

    element_node = state.selector_map[params.index]
    # Remember the page count so a tab opened by the click can be detected.
    initial_pages = len(session.context.pages)

    # if element has file uploader then dont click
    if await browser.is_file_uploader(element_node):
        msg = f'Index {params.index} - has an element which opens file upload dialog. To upload files please use a specific function to upload files '
        logger.info(msg)
        return ActionResult(extracted_content=msg, include_in_memory=True)

    try:
        await browser._click_element_node(element_node)
        msg = f'🖱️ Clicked index {params.index}'
        logger.info(msg)
        logger.debug(f'Element xpath: {element_node.xpath}')
        if len(session.context.pages) > initial_pages:
            new_tab_msg = 'New tab opened - switching to it'
            msg += f' - {new_tab_msg}'
            logger.info(new_tab_msg)
            await browser.switch_to_tab(-1)
        return ActionResult(extracted_content=msg, include_in_memory=True)
    except Exception as e:
        # Best effort: the failure is reported through the result instead of
        # being raised, so the caller can decide how to proceed.
        logger.warning(
            f'Element no longer available with index {params.index} - most likely the page changed'
        )
        return ActionResult(error=str(e))
114
+
115
@self.registry.action(
    'Input text into a input interactive element',
    param_model=InputTextAction,
    requires_browser=True,
)
async def input_text(params: InputTextAction, browser: BrowserContext):
    """Type ``params.text`` into the element stored under ``params.index``.

    Raises:
        Exception: if ``params.index`` is not present in the cached selector map.

    Returns:
        ActionResult describing the input, with ``include_in_memory=True``.
    """
    session = await browser.get_session()
    state = session.cached_state

    if params.index not in state.selector_map:
        # Single-line f-string field: multi-line fields require Python 3.12+.
        raise Exception(
            f'Element index {params.index} does not exist - retry or use alternative actions'
        )

    element_node = state.selector_map[params.index]
    await browser._input_text_element_node(element_node, params.text)
    msg = f'⌨️ Input "{params.text}" into index {params.index}'
    logger.info(msg)
    logger.debug(f'Element xpath: {element_node.xpath}')
    return ActionResult(extracted_content=msg, include_in_memory=True)
136
+
137
+ # Tab Management Actions
138
@self.registry.action('Switch tab', param_model=SwitchTabAction, requires_browser=True)
async def switch_tab(params: SwitchTabAction, browser: BrowserContext):
    """Switch to the tab identified by ``params.page_id`` and wait for its load state."""
    tab_id = params.page_id
    await browser.switch_to_tab(tab_id)
    # Fetch the page only after switching, so the wait applies to the new tab.
    active_page = await browser.get_current_page()
    await active_page.wait_for_load_state()
    summary = f'🔄 Switched to tab {tab_id}'
    logger.info(summary)
    return ActionResult(extracted_content=summary, include_in_memory=True)
147
+
148
@self.registry.action(
    'Open url in new tab',
    param_model=OpenTabAction,
    requires_browser=True,
)
async def open_tab(params: OpenTabAction, browser: BrowserContext):
    """Create a new tab for ``params.url`` through the browser context."""
    target = params.url
    await browser.create_new_tab(target)
    summary = f'🔗 Opened new tab with {target}'
    logger.info(summary)
    return ActionResult(extracted_content=summary, include_in_memory=True)
156
+
157
+ # Content Actions
158
@self.registry.action(
    'Extract page content to get the text or markdown ',
    param_model=ExtractPageContentAction,
    requires_browser=True,
)
async def extract_content(params: ExtractPageContentAction, browser: BrowserContext):
    """Extract the current page's main content in the format named by ``params.value``.

    The page HTML is passed to ``MainContentExtractor.extract``; the result is
    logged and returned as the ``extracted_content`` of an ActionResult.
    """
    current_page = await browser.get_current_page()
    html = await current_page.content()

    content = MainContentExtractor.extract(  # type: ignore
        html=html,
        output_format=params.value,
    )
    summary = f'📄 Extracted page content\n: {content}\n'
    logger.info(summary)
    return ActionResult(extracted_content=summary)
173
+
174
@self.registry.action('Complete task', param_model=DoneAction)
async def done(params: DoneAction):
    """Return an ActionResult with ``is_done=True`` carrying ``params.text``."""
    final_text = params.text
    return ActionResult(is_done=True, extracted_content=final_text)
177
+
178
@self.registry.action(
    'Scroll down the page by pixel amount - if no amount is specified, scroll down one page',
    param_model=ScrollAction,
    requires_browser=True,
)
async def scroll_down(params: ScrollAction, browser: BrowserContext):
    """Scroll the current page down.

    With ``params.amount`` set, the page is scrolled by that many pixels via
    ``window.scrollBy``; otherwise a ``PageDown`` key press is sent.

    Returns:
        ActionResult describing the scroll, with ``include_in_memory=True``.
    """
    page = await browser.get_current_page()
    if params.amount is not None:
        await page.evaluate(f'window.scrollBy(0, {params.amount});')
    else:
        await page.keyboard.press('PageDown')

    # Single-line f-string field: multi-line fields require Python 3.12+.
    amount = f'{params.amount} pixels' if params.amount is not None else 'one page'
    msg = f'🔍 Scrolled down the page by {amount}'
    logger.info(msg)
    return ActionResult(
        extracted_content=msg,
        include_in_memory=True,
    )
198
+
199
+ # scroll up
200
@self.registry.action(
    'Scroll up the page by pixel amount - if no amount is specified, scroll up one page',
    param_model=ScrollAction,
    requires_browser=True,
)
async def scroll_up(params: ScrollAction, browser: BrowserContext):
    """Scroll the current page up.

    With ``params.amount`` set, the page is scrolled by the negated amount via
    ``window.scrollBy``; otherwise a ``PageUp`` key press is sent.

    Returns:
        ActionResult describing the scroll, with ``include_in_memory=True``.
    """
    page = await browser.get_current_page()
    if params.amount is not None:
        # Negate in Python rather than prefixing '-' in the script text: a
        # negative amount would otherwise produce `--N`, which is not valid
        # JavaScript.
        await page.evaluate(f'window.scrollBy(0, {-params.amount});')
    else:
        await page.keyboard.press('PageUp')

    # Single-line f-string field: multi-line fields require Python 3.12+.
    amount = f'{params.amount} pixels' if params.amount is not None else 'one page'
    msg = f'🔍 Scrolled up the page by {amount}'
    logger.info(msg)
    return ActionResult(
        extracted_content=msg,
        include_in_memory=True,
    )
220
+
221
+ # send keys
222
@self.registry.action(
    'Send strings of special keys like Backspace, Insert, PageDown, Delete, Enter, Shortcuts such as `Control+o`, `Control+Shift+T` are supported as well. This gets used in keyboard.press. Be aware of different operating systems and their shortcuts',
    param_model=SendKeysAction,
    requires_browser=True,
)
async def send_keys(params: SendKeysAction, browser: BrowserContext):
    """Pass ``params.keys`` to ``page.keyboard.press`` on the current page."""
    key_combo = params.keys
    current_page = await browser.get_current_page()

    await current_page.keyboard.press(key_combo)
    summary = f'⌨️ Sent keys: {key_combo}'
    logger.info(summary)
    return ActionResult(extracted_content=summary, include_in_memory=True)
234
+
235
@self.registry.action(
    description='If you dont find something which you want to interact with, scroll to it',
    requires_browser=True,
)
async def scroll_to_text(text: str, browser: BrowserContext):  # type: ignore
    """Scroll the first visible element matching ``text`` into view.

    Three locator strategies are tried in order; the first one that has a
    match whose first element is visible wins. A locator that raises is
    logged at debug level and skipped.

    Returns:
        ActionResult describing success or that the text was not found; an
        ActionResult with ``error`` set if building the locators raised.
    """
    current_page = await browser.get_current_page()
    try:
        # Candidate locators, from the most to the least specific API.
        candidates = (
            current_page.get_by_text(text, exact=False),
            current_page.locator(f'text={text}'),
            current_page.locator(f"//*[contains(text(), '{text}')]"),
        )

        for candidate in candidates:
            try:
                # Skip strategies with no match or a hidden first match.
                if not (await candidate.count() > 0):
                    continue
                if not await candidate.first.is_visible():
                    continue

                await candidate.first.scroll_into_view_if_needed()
                # Give the scroll a moment to finish.
                await asyncio.sleep(0.5)
                found_msg = f'🔍 Scrolled to text: {text}'
                logger.info(found_msg)
                return ActionResult(extracted_content=found_msg, include_in_memory=True)
            except Exception as locator_error:
                logger.debug(f'Locator attempt failed: {locator_error!s}')

        missing_msg = f"Text '{text}' not found or not visible on page"
        logger.info(missing_msg)
        return ActionResult(extracted_content=missing_msg, include_in_memory=True)

    except Exception as error:
        failure_msg = f"Failed to scroll to text '{text}': {error!s}"
        logger.error(failure_msg)
        return ActionResult(error=failure_msg, include_in_memory=True)
271
+
272
@self.registry.action(
    description='Get all options from a native dropdown',
    requires_browser=True,
)
async def get_dropdown_options(index: int, browser: BrowserContext) -> ActionResult:
    """Get all options from a native dropdown.

    The element stored under ``index`` in the selector map is looked up by its
    xpath in every frame of the current page. Options found in any frame are
    formatted as ``"<index>: <text> (value=<value>)"`` and joined with newlines.

    Returns:
        ActionResult whose ``extracted_content`` is the option list, a
        "no options" message, or an error description; always with
        ``include_in_memory=True``.
    """
    page = await browser.get_current_page()
    selector_map = await browser.get_selector_map()
    dom_element = selector_map[index]

    # Loop-invariant script: resolves the xpath inside a frame and returns
    # the select's options, or null when the xpath matches nothing there.
    list_options_js = """
        (xpath) => {
            const select = document.evaluate(xpath, document, null,
                XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
            if (!select) return null;

            return {
                options: Array.from(select.options).map(opt => ({
                    text: opt.text.trim(),
                    value: opt.value,
                    index: opt.index
                })),
                id: select.id,
                name: select.name
            };
        }
    """

    try:
        # Frame-aware approach since we know it works
        all_options = []

        for frame_index, frame in enumerate(page.frames):
            try:
                options = await frame.evaluate(list_options_js, dom_element.xpath)

                if options:
                    logger.debug(f'Found dropdown in frame {frame_index}')
                    logger.debug(f"Dropdown ID: {options['id']}, Name: {options['name']}")

                    all_options.extend(
                        f"{opt['index']}: {opt['text']} (value={opt['value']})"
                        for opt in options['options']
                    )

            except Exception as frame_e:
                # A frame that cannot be evaluated is skipped; other frames
                # may still contain the dropdown.
                logger.debug(f'Frame {frame_index} evaluation failed: {str(frame_e)}')

        if all_options:
            msg = '\n'.join(all_options)
        else:
            msg = 'No options found in any frame for dropdown'
        logger.info(msg)
        return ActionResult(extracted_content=msg, include_in_memory=True)

    except Exception as e:
        logger.error(f'Failed to get dropdown options: {str(e)}')
        msg = f'Error getting options: {str(e)}'
        logger.info(msg)
        return ActionResult(extracted_content=msg, include_in_memory=True)
345
+
346
@self.registry.action(
    description='Select dropdown option for interactive element index by the text of the option you want to select',
    requires_browser=True,
)
async def select_dropdown_option(
    index: int,
    text: str,
    browser: BrowserContext,
) -> ActionResult:
    """Select dropdown option by the text of the option you want to select.

    The element stored under ``index`` must have tag name ``select``. Each
    frame of the current page is probed for the element's xpath; in the first
    frame where a ``<select>`` is found and has an option whose trimmed text
    equals ``text``, that option is selected and a ``change`` event is fired.

    Returns:
        ActionResult describing the selection, the reason it could not be
        made, or (with ``error`` set) an unexpected failure.
    """
    page = await browser.get_current_page()
    selector_map = await browser.get_selector_map()
    dom_element = selector_map[index]

    # Validate that we're working with a select element
    if dom_element.tag_name != 'select':
        logger.error(
            f'Element is not a select! Tag: {dom_element.tag_name}, Attributes: {dom_element.attributes}'
        )
        msg = f'Cannot select option: Element with index {index} is a {dom_element.tag_name}, not a select'
        return ActionResult(extracted_content=msg, include_in_memory=True)

    logger.debug(f"Attempting to select '{text}' using xpath: {dom_element.xpath}")
    logger.debug(f'Element attributes: {dom_element.attributes}')
    logger.debug(f'Element tag: {dom_element.tag_name}')

    # Both scripts are loop-invariant, so they are defined once up front.
    # Probe: does this frame contain the <select> at the given xpath?
    find_dropdown_js = """
        (xpath) => {
            try {
                const select = document.evaluate(xpath, document, null,
                    XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
                if (!select) return null;
                if (select.tagName.toLowerCase() !== 'select') {
                    return {
                        error: `Found element but it's a ${select.tagName}, not a SELECT`,
                        found: false
                    };
                }
                return {
                    id: select.id,
                    name: select.name,
                    found: true,
                    tagName: select.tagName,
                    optionCount: select.options.length,
                    currentValue: select.value,
                    availableOptions: Array.from(select.options).map(o => o.text.trim())
                };
            } catch (e) {
                return {error: e.toString(), found: false};
            }
        }
    """

    # Select: choose the option by trimmed text and notify listeners. The
    # event is created with bubbles: true, like a native change event, so
    # listeners attached to ancestor elements also receive it.
    select_option_js = """
        (params) => {
            try {
                const select = document.evaluate(params.xpath, document, null,
                    XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
                if (!select || select.tagName.toLowerCase() !== 'select') {
                    return {success: false, error: 'Select not found or invalid element type'};
                }

                const option = Array.from(select.options)
                    .find(opt => opt.text.trim() === params.text);

                if (!option) {
                    return {
                        success: false,
                        error: 'Option not found',
                        availableOptions: Array.from(select.options).map(o => o.text.trim())
                    };
                }

                select.value = option.value;
                select.dispatchEvent(new Event('change', {bubbles: true}));
                return {
                    success: true,
                    selectedValue: option.value,
                    selectedText: option.text.trim()
                };
            } catch (e) {
                return {success: false, error: e.toString()};
            }
        }
    """

    try:
        # enumerate keeps the frame number correct even when an iteration
        # ends early with `continue`.
        for frame_index, frame in enumerate(page.frames):
            try:
                logger.debug(f'Trying frame {frame_index} URL: {frame.url}')

                # First verify we can find the dropdown in this frame
                dropdown_info = await frame.evaluate(find_dropdown_js, dom_element.xpath)
                if not dropdown_info:
                    continue

                if not dropdown_info.get('found'):
                    logger.error(f"Frame {frame_index} error: {dropdown_info.get('error')}")
                    continue

                logger.debug(f'Found dropdown in frame {frame_index}: {dropdown_info}')

                params = {'xpath': dom_element.xpath, 'text': text}
                result = await frame.evaluate(select_option_js, params)
                logger.debug(f'Selection result: {result}')

                if result.get('success'):
                    msg = f"Selected option '{text}' (value={result.get('selectedValue')})"
                    logger.info(msg + f' in frame {frame_index}')
                    return ActionResult(extracted_content=msg, include_in_memory=True)

                # Not selected in this frame: log why and try the next one.
                logger.error(f"Selection failed: {result.get('error')}")
                if 'availableOptions' in result:
                    logger.error(f"Available options: {result['availableOptions']}")

            except Exception as frame_e:
                logger.error(f'Frame {frame_index} attempt failed: {str(frame_e)}')
                logger.error(f'Frame type: {type(frame)}')
                logger.error(f'Frame URL: {frame.url}')

        msg = f"Could not select option '{text}' in any frame"
        logger.info(msg)
        return ActionResult(extracted_content=msg, include_in_memory=True)

    except Exception as e:
        msg = f'Selection failed: {str(e)}'
        logger.error(msg)
        return ActionResult(error=msg, include_in_memory=True)
492
+
493
def action(self, description: str, **kwargs):
    """Decorator for registering custom actions.

    Thin pass-through to ``self.registry.action``; every keyword argument is
    forwarded unchanged.

    @param description: Describe the LLM what the function does (better description == better function calling)
    """
    register = self.registry.action
    return register(description, **kwargs)
499
+
500
@time_execution_async('--multi-act')
async def multi_act(
    self, actions: list[ActionModel], browser_context: BrowserContext
) -> list[ActionResult]:
    """Execute multiple actions in order and collect their results.

    Execution stops early when:
      * an action other than the first targets an element index and the
        freshly fetched state contains branch-path hashes that were not in
        the cached state (something new appeared on the page);
      * an action reports ``is_done`` or an ``error``;
      * the last action has run.

    Between actions the method sleeps for
    ``browser_context.config.wait_between_actions`` seconds.

    Returns:
        The ActionResult of every action that was actually executed.
    """
    results: list[ActionResult] = []

    session = await browser_context.get_session()
    cached_selector_map = session.cached_state.selector_map
    cached_path_hashes = {e.hash.branch_path_hash for e in cached_selector_map.values()}
    await browser_context.remove_highlights()

    for i, action in enumerate(actions):
        if action.get_index() is not None and i != 0:
            # hash all elements. if it is a subset of cached_state its fine -
            # else break (new elements on page)
            new_state = await browser_context.get_state()
            new_path_hashes = {
                e.hash.branch_path_hash for e in new_state.selector_map.values()
            }
            if not new_path_hashes.issubset(cached_path_hashes):
                # next action requires index but there are new elements on the page
                logger.info(f'Something new appeared after action {i} / {len(actions)}')
                break

        results.append(await self.act(action, browser_context))

        logger.debug(f'Executed action {i + 1} / {len(actions)}')
        if results[-1].is_done or results[-1].error or i == len(actions) - 1:
            break

        await asyncio.sleep(browser_context.config.wait_between_actions)

    return results
535
+
536
@time_execution_async('--act')
async def act(self, action: ActionModel, browser_context: BrowserContext) -> ActionResult:
    """Execute an action.

    The first entry of ``action.model_dump(exclude_unset=True)`` whose
    parameters are not ``None`` is dispatched through the registry, and its
    result is normalised to an ActionResult.

    Raises:
        ValueError: if the registry returns something other than ``str``,
            ``ActionResult`` or ``None``.

    Returns:
        The action's ActionResult; a string result is wrapped as
        ``extracted_content``; ``None`` (or no set action) yields an empty
        ActionResult.
    """
    # Timed with the async variant (as multi_act is) because this method is a
    # coroutine function.
    for action_name, params in action.model_dump(exclude_unset=True).items():
        if params is None:
            continue

        result = await self.registry.execute_action(
            action_name, params, browser=browser_context
        )
        if isinstance(result, str):
            return ActionResult(extracted_content=result)
        if isinstance(result, ActionResult):
            return result
        if result is None:
            return ActionResult()
        raise ValueError(f'Invalid action result type: {type(result)} of {result}')

    return ActionResult()
@@ -0,0 +1,47 @@
1
+ from typing import Literal, Optional
2
+
3
+ from pydantic import BaseModel
4
+
5
+
6
+ # Action Input Models
7
# Parameters for a Google search: the search terms.
# (Documented with comments on purpose: a class docstring would become part
# of the pydantic schema.)
class SearchGoogleAction(BaseModel):
    query: str  # text to search for
9
+
10
+
11
# Parameters for navigating to a URL.
class GoToUrlAction(BaseModel):
    url: str  # address to open
13
+
14
+
15
# Parameters for clicking an element.
class ClickElementAction(BaseModel):
    index: int  # element index to click
    xpath: Optional[str] = None  # optional xpath; defaults to None
18
+
19
+
20
# Parameters for typing text into an element.
class InputTextAction(BaseModel):
    index: int  # element index to type into
    text: str  # text to enter
    xpath: Optional[str] = None  # optional xpath; defaults to None
24
+
25
+
26
# Parameters for the "done" action.
class DoneAction(BaseModel):
    text: str  # text carried by the action
28
+
29
+
30
# Parameters for switching tabs.
class SwitchTabAction(BaseModel):
    page_id: int  # identifier of the tab to switch to
32
+
33
+
34
# Parameters for opening a URL in a new tab.
class OpenTabAction(BaseModel):
    url: str  # address to open
36
+
37
+
38
# Parameters for page-content extraction.
class ExtractPageContentAction(BaseModel):
    # Output format; one of 'text', 'markdown' or 'html', defaulting to 'text'.
    value: Literal['text', 'markdown', 'html'] = 'text'
40
+
41
+
42
# Parameters shared by the scroll actions.
class ScrollAction(BaseModel):
    # The number of pixels to scroll. If None, scroll down/up one page
    amount: Optional[int] = None
44
+
45
+
46
# Parameters for sending keyboard input.
class SendKeysAction(BaseModel):
    keys: str  # key or key combination to send
File without changes