lumivor 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,44 @@
1
+ import time
2
+
3
+ from tokencost import count_string_tokens
4
+
5
+ from lumivor.browser.browser import Browser, BrowserConfig
6
+
7
+ # from lumivor.browser.service import Browser
8
+ from lumivor.dom.service import DomService
9
+ from lumivor.utils import time_execution_sync
10
+
11
+
12
+ # @pytest.mark.skip("slow af")
13
async def test_process_html_file():
    """Manually inspect the clickable elements extracted from a live page.

    Opens a headed browser (``headless=False``), loads a real site, runs
    ``DomService.get_clickable_elements`` and prints the rendered element
    list, its ``gpt-4o`` token count and the size of the selector map.
    The run then waits on ``input()`` so the page can be inspected by hand,
    which makes this unsuitable for unattended runs.
    """
    # Local imports keep this block independent of the module import list.
    import asyncio

    from lumivor.utils import time_execution_async

    browser = Browser(config=BrowserConfig(headless=False))

    async with await browser.new_context() as context:
        page = await context.get_current_page()

        dom_service = DomService(page)

        # await page.goto('https://kayak.com/flights')
        # browser.go_to_url('https://google.com/flights')
        await page.goto('https://immobilienscout24.de')

        # Let the page settle without blocking the event loop
        # (time.sleep would stall every other task on the loop).
        await asyncio.sleep(3)
        # browser._click_element_by_xpath(
        #     '/html/body/div[5]/div/div[2]/div/div/div[3]/div/div[1]/button[1]'
        # )
        # browser._click_element_by_xpath("//button[div/div[text()='Alle akzeptieren']]")

        # The call is awaited, so it must be timed with the async decorator;
        # the sync one would only time the creation of the coroutine object.
        dom_state = await time_execution_async('get_clickable_elements')(
            dom_service.get_clickable_elements
        )()
        elements = dom_state.element_tree
        selector_map = dom_state.selector_map

        # Render once and reuse the string for printing and token counting.
        rendered = elements.clickable_elements_to_string()
        print(rendered)
        print('Tokens:', count_string_tokens(rendered, model='gpt-4o'))
        print(len(selector_map), 'elements highlighted')

        input('Press Enter to continue...')
@@ -0,0 +1,40 @@
1
+ import json
2
+ import os
3
+ import time
4
+
5
+ from lumivor.browser.browser import Browser, BrowserConfig
6
+
7
+
8
async def test_process_dom():
    """Manually time the DOM-processing script against a live page.

    Opens a headed browser, loads a real site, evaluates
    ``lumivor/dom/process_dom.js`` in the page, prints how long the
    evaluation took and dumps the returned tree to ``./tmp/dom.json``.
    The run then waits on ``input()``, so it is meant for interactive use.
    """
    # Local import keeps this block independent of the module import list.
    import asyncio

    browser = Browser(config=BrowserConfig(headless=False))

    async with await browser.new_context() as context:
        page = await context.get_current_page()
        await page.goto('https://kayak.com/flights')
        # await page.goto('https://google.com/flights')
        # await page.goto('https://immobilienscout24.de')
        # await page.goto('https://seleniumbase.io/w3schools/iframes')

        # Let the page settle without blocking the event loop.
        await asyncio.sleep(3)

        with open('lumivor/dom/process_dom.js', 'r', encoding='utf-8') as f:
            js_code = f.read()

        # perf_counter is monotonic, so the measured interval cannot be
        # distorted by system clock adjustments.
        start = time.perf_counter()
        dom_tree = await page.evaluate(js_code)
        elapsed = time.perf_counter() - start

        # print(dom_tree)
        print(f'Time: {elapsed:.2f}s')

        os.makedirs('./tmp', exist_ok=True)
        with open('./tmp/dom.json', 'w', encoding='utf-8') as f:
            json.dump(dom_tree, f, indent=1)

        # both of these work for immobilienscout24.de
        # await page.click('.sc-dcJsrY.ezjNCe')
        # await page.click(
        #     'div > div:nth-of-type(2) > div > div:nth-of-type(2) > div > div:nth-of-type(2) > div > div > div > button:nth-of-type(2)'
        # )

        input('Press Enter to continue...')
lumivor/dom/views.py ADDED
@@ -0,0 +1,187 @@
1
+ from dataclasses import dataclass
2
+ from functools import cached_property
3
+ from typing import TYPE_CHECKING, Dict, List, Optional
4
+
5
+ from lumivor.dom.history_tree_processor.view import HashedDomElement
6
+
7
+ # Avoid circular import issues
8
+ if TYPE_CHECKING:
9
+ from .views import DOMElementNode
10
+
11
+
12
@dataclass
class DOMBaseNode:
    """Fields shared by every node of the DOM tree (text and element nodes)."""

    # Visibility flag supplied by whoever builds the tree.
    is_visible: bool
    # Enclosing element, or None. The field has no default: builders pass
    # None explicitly and may attach the real parent afterwards, which
    # sidesteps the chicken-and-egg problem of parent/child references.
    parent: Optional['DOMElementNode']
17
+
18
+
19
@dataclass
class DOMTextNode(DOMBaseNode):
    """A text node of the DOM tree."""

    text: str
    type: str = 'TEXT_NODE'

    def has_parent_with_highlight_index(self) -> bool:
        """Return True if any ancestor element has a non-None ``highlight_index``."""
        ancestor = self.parent
        # Climb until we run out of ancestors or meet a highlighted one.
        while ancestor is not None and ancestor.highlight_index is None:
            ancestor = ancestor.parent
        return ancestor is not None
31
+
32
+
33
@dataclass(frozen=False)
class DOMElementNode(DOMBaseNode):
    """
    An element node of the DOM tree.

    xpath: the xpath of the element from the last root node (shadow root or iframe OR document if no shadow root or iframe).
    To properly reference the element we need to recursively switch the root node until we find the element (work your way up the tree with `.parent`)
    """

    tag_name: str
    xpath: str
    attributes: Dict[str, str]
    children: List[DOMBaseNode]
    is_interactive: bool = False
    is_top_element: bool = False
    shadow_root: bool = False
    # Index under which the element is listed by clickable_elements_to_string;
    # None means the element is not listed.
    highlight_index: Optional[int] = None

    def __repr__(self) -> str:
        """Return an opening-tag style summary, e.g. ``<a href="/x"> [interactive, highlight:3]``."""
        attrs = ''.join(f' {key}="{value}"' for key, value in self.attributes.items())
        tag_str = f'<{self.tag_name}{attrs}>'

        # Flags are appended in a fixed order after the tag.
        extras = []
        if self.is_interactive:
            extras.append('interactive')
        if self.is_top_element:
            extras.append('top')
        if self.shadow_root:
            extras.append('shadow-root')
        if self.highlight_index is not None:
            extras.append(f'highlight:{self.highlight_index}')

        if extras:
            tag_str += f' [{", ".join(extras)}]'

        return tag_str

    @cached_property
    def hash(self) -> HashedDomElement:
        """Hash of this element as computed by ``HistoryTreeProcessor._hash_dom_element`` (cached on first access)."""
        # Imported lazily; presumably to avoid an import cycle with the
        # history tree processor, which itself works on these node types.
        from lumivor.dom.history_tree_processor.service import (
            HistoryTreeProcessor,
        )

        return HistoryTreeProcessor._hash_dom_element(self)

    def get_all_text_till_next_clickable_element(self) -> str:
        """Return the text below this element, stopping at highlighted descendants.

        Text nodes are collected depth-first and joined with newlines; any
        descendant element that has its own ``highlight_index`` is skipped
        together with its subtree. The result is stripped of surrounding
        whitespace.
        """
        text_parts: List[str] = []

        def collect_text(node: DOMBaseNode) -> None:
            # Skip this branch if we hit a highlighted element (except for the
            # current node). Identity is the right test here: the
            # dataclass-generated == compares field values, not node identity.
            if (
                isinstance(node, DOMElementNode)
                and node is not self
                and node.highlight_index is not None
            ):
                return

            if isinstance(node, DOMTextNode):
                text_parts.append(node.text)
            elif isinstance(node, DOMElementNode):
                for child in node.children:
                    collect_text(child)

        collect_text(self)
        return '\n'.join(text_parts).strip()

    def clickable_elements_to_string(self, include_attributes: Optional[List[str]] = None) -> str:
        """Render the tree as one line per highlighted element or loose text node.

        Highlighted elements become ``<index>[:]<tag attrs>text</tag>``; text
        nodes with no highlighted ancestor become ``_[:]text``.

        Args:
            include_attributes: Attribute names to include in the rendered
                tags. ``None`` or an empty list renders tags without
                attributes.

        Returns:
            The rendered lines joined with newlines.
        """
        # None replaces the former mutable ``[]`` default; both mean "no attributes".
        wanted = include_attributes or []
        formatted_text: List[str] = []

        def process_node(node: DOMBaseNode) -> None:
            if isinstance(node, DOMElementNode):
                # Add element with highlight_index
                if node.highlight_index is not None:
                    attributes_str = ''
                    if wanted:
                        attributes_str = ' ' + ' '.join(
                            f'{key}="{value}"'
                            for key, value in node.attributes.items()
                            if key in wanted
                        )
                    # Computed outside the f-string: a replacement field that
                    # spans lines in a single-quoted f-string only parses on
                    # Python 3.12+.
                    text = node.get_all_text_till_next_clickable_element()
                    formatted_text.append(
                        f'{node.highlight_index}[:]<{node.tag_name}{attributes_str}>{text}</{node.tag_name}>'
                    )

                # Process children regardless
                for child in node.children:
                    process_node(child)

            elif isinstance(node, DOMTextNode):
                # Add text only if it doesn't have a highlighted parent
                if not node.has_parent_with_highlight_index():
                    formatted_text.append(f'_[:]{node.text}')

        process_node(self)
        return '\n'.join(formatted_text)

    def get_file_upload_element(self, check_siblings: bool = True) -> Optional['DOMElementNode']:
        """Find an ``<input type="file">`` at, below, or next to this element.

        Args:
            check_siblings: When True (the initial call), the subtrees of this
                element's siblings are searched as well. Recursive calls pass
                False so the search never climbs further up the tree.

        Returns:
            The first matching element, or None if there is none.
        """
        # Check if current element is a file input
        if self.tag_name == 'input' and self.attributes.get('type') == 'file':
            return self

        # Check children
        for child in self.children:
            if isinstance(child, DOMElementNode):
                result = child.get_file_upload_element(check_siblings=False)
                if result is not None:
                    return result

        # Check siblings only for the initial call
        if check_siblings and self.parent:
            for sibling in self.parent.children:
                if sibling is not self and isinstance(sibling, DOMElementNode):
                    result = sibling.get_file_upload_element(check_siblings=False)
                    if result is not None:
                        return result

        return None
156
+
157
+
158
class ElementTreeSerializer:
    """Stateless helpers that turn a DOM element tree into text or plain dicts."""

    @staticmethod
    def serialize_clickable_elements(element_tree: DOMElementNode) -> str:
        """Return ``element_tree.clickable_elements_to_string()`` with its default arguments."""
        return element_tree.clickable_elements_to_string()

    @staticmethod
    def dom_element_node_to_json(element_tree: DOMElementNode) -> dict:
        """Convert the tree rooted at ``element_tree`` into nested plain dicts.

        Text nodes map to ``{'type': 'text', 'text': ...}``; element nodes map
        to a dict with ``type``, ``tag_name``, ``attributes``,
        ``highlight_index`` and recursively converted ``children``. Any other
        node kind maps to an empty dict.
        """

        def node_to_dict(node: DOMBaseNode) -> dict:
            if isinstance(node, DOMTextNode):
                return {'type': 'text', 'text': node.text}
            if not isinstance(node, DOMElementNode):
                return {}

            converted_children = []
            for child in node.children:
                converted_children.append(node_to_dict(child))

            return {
                'type': 'element',
                'tag_name': node.tag_name,
                'attributes': node.attributes,
                'highlight_index': node.highlight_index,
                'children': converted_children,
            }

        return node_to_dict(element_tree)
179
+
180
+
181
# Maps an integer index to a DOM element. Presumably the key is the element's
# highlight_index -- confirm against the code that builds the map.
SelectorMap = dict[int, DOMElementNode]


@dataclass
class DOMState:
    """Result bundle pairing a DOM element tree with its selector map."""

    # Root element of the tree.
    element_tree: DOMElementNode
    # Index -> element lookup (see SelectorMap).
    selector_map: SelectorMap
@@ -0,0 +1,128 @@
1
+ import logging
2
+ import os
3
+ import sys
4
+
5
+
6
def addLoggingLevel(levelName, levelNum, methodName=None):
    """
    Register a new logging level on the `logging` module and on the
    currently configured logger class.

    After the call, `levelName` is an attribute of `logging` whose value is
    `levelNum`, and `methodName` is available as a convenience method both on
    `logging` itself and on the class returned by `logging.getLoggerClass()`
    (usually `logging.Logger`). When `methodName` is omitted,
    `levelName.lower()` is used.

    To avoid clobbering existing attributes, an `AttributeError` is raised if
    the level name is already an attribute of the `logging` module, or if the
    method name already exists on the module or on the logger class.

    Example
    -------
    >>> addLoggingLevel('TRACE', logging.DEBUG - 5)
    >>> logging.getLogger(__name__).setLevel('TRACE')
    >>> logging.getLogger(__name__).trace('that worked')
    >>> logging.trace('so did this')
    >>> logging.TRACE
    5

    """
    if not methodName:
        methodName = levelName.lower()

    logger_class = logging.getLoggerClass()

    # (owner, attribute, description) triples, checked in this order.
    collision_checks = (
        (logging, levelName, 'logging module'),
        (logging, methodName, 'logging module'),
        (logger_class, methodName, 'logger class'),
    )
    for owner, attribute, where in collision_checks:
        if hasattr(owner, attribute):
            raise AttributeError(f'{attribute} already defined in {where}')

    # This method was inspired by the answers to Stack Overflow post
    # http://stackoverflow.com/q/2183233/2988730, especially
    # http://stackoverflow.com/a/13638084/2988730
    def logForLevel(self, message, *args, **kwargs):
        # Mirrors Logger.info & co.: only emit when the level is enabled.
        if not self.isEnabledFor(levelNum):
            return
        self._log(levelNum, message, args, **kwargs)

    def logToRoot(message, *args, **kwargs):
        logging.log(levelNum, message, *args, **kwargs)

    logging.addLevelName(levelNum, levelName)
    setattr(logging, levelName, levelNum)
    setattr(logger_class, methodName, logForLevel)
    setattr(logging, methodName, logToRoot)
58
+
59
+
60
def setup_logging():
    """Configure console logging for lumivor and quiet noisy third-party loggers.

    The verbosity comes from the ``LUMIVOR_LOGGING_LEVEL`` environment variable
    (``result``, ``debug``, anything else means info). If the root logger
    already has handlers the function returns without changing anything, so
    calling it repeatedly is harmless.
    """
    # Register the custom RESULT level; a second registration raises
    # AttributeError, which simply means it is already there.
    try:
        addLoggingLevel('RESULT', 35)  # This allows ERROR, FATAL and CRITICAL
    except AttributeError:
        pass

    log_type = os.getenv('LUMIVOR_LOGGING_LEVEL', 'info').lower()

    root = logging.getLogger()
    # Someone already configured logging: leave it alone.
    if root.hasHandlers():
        return

    # Start from a clean handler list.
    root.handlers = []

    class BrowserUseFormatter(logging.Formatter):
        def format(self, record):
            # Shorten 'lumivor.<...>' logger names to their second-to-last
            # dotted component before formatting.
            if record.name.startswith('lumivor.'):
                record.name = record.name.split('.')[-2]
            return super().format(record)

    result_only = log_type == 'result'

    # One stdout handler shared by the root and the lumivor logger.
    console = logging.StreamHandler(sys.stdout)
    if result_only:
        # Additional handler-level filter so only RESULT and above get through.
        console.setLevel('RESULT')
        line_format = '%(message)s'
    else:
        line_format = '%(levelname)-8s [%(name)s] %(message)s'
    console.setFormatter(BrowserUseFormatter(line_format))

    root.addHandler(console)

    # 'RESULT' is passed by name because the attribute only exists at runtime.
    level_by_type = {'result': 'RESULT', 'debug': logging.DEBUG}
    root.setLevel(level_by_type.get(log_type, logging.INFO))

    # The lumivor logger gets the same handler and level but does not
    # propagate, so its records are not emitted twice via the root logger.
    lumivor_logger = logging.getLogger('lumivor')
    lumivor_logger.propagate = False
    lumivor_logger.addHandler(console)
    lumivor_logger.setLevel(root.level)

    lumivor_logger.info('BrowserUse logging setup complete with level %s', log_type)

    # Silence third-party loggers
    noisy_loggers = (
        'WDM',
        'httpx',
        'selenium',
        'playwright',
        'urllib3',
        'asyncio',
        'langchain',
        'openai',
        'httpcore',
        'charset_normalizer',
    )
    for logger_name in noisy_loggers:
        third_party = logging.getLogger(logger_name)
        third_party.setLevel(logging.ERROR)
        third_party.propagate = False
@@ -0,0 +1,114 @@
1
+ import logging
2
+ import os
3
+ import uuid
4
+ from pathlib import Path
5
+
6
+ from dotenv import load_dotenv
7
+ from posthog import Posthog
8
+
9
+ from lumivor.telemetry.views import BaseTelemetryEvent
10
+ from lumivor.utils import singleton
11
+
12
+ load_dotenv()
13
+
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
# Extra properties merged into every captured event. ProductTelemetry spreads
# this dict after the event's own properties, so these keys win on collision.
# NOTE(review): the geoip_* entries are presumably meant to blank out
# location data on the PostHog side -- confirm the exact property names
# against the PostHog documentation.
POSTHOG_EVENT_SETTINGS = {
    'process_person_profile': True,
    'geoip_city_name': None,
    'geoip_country_name': None,
    'geoip_country_code': None,
    'geoip_continent_name': None,
    'geoip_continent_code': None,
    'geoip_time_zone': None,
}
27
+
28
+
29
@singleton
class ProductTelemetry:
    """
    Service for capturing anonymized telemetry data.

    If the environment variable `ANONYMIZED_TELEMETRY=False`, anonymized telemetry will be disabled.
    """

    # File that persists the random per-user identifier between runs.
    USER_ID_PATH = str(Path.home() / '.cache' / 'lumivor' / 'telemetry_user_id')
    PROJECT_API_KEY = 'phc_F8JMNjW1i2KbGUTaW1unnDdLSPCoyc52SGRU0JecaUh'
    HOST = 'https://eu.i.posthog.com'
    # Identifier used when the ID file cannot be read or written.
    UNKNOWN_USER_ID = 'UNKNOWN'

    _curr_user_id = None

    def __init__(self) -> None:
        """Create the PostHog client unless telemetry is disabled via the environment."""
        telemetry_disabled = os.getenv('ANONYMIZED_TELEMETRY', 'true').lower() == 'false'
        self.debug_logging = os.getenv('LUMIVOR_LOGGING_LEVEL', 'info').lower() == 'debug'

        if telemetry_disabled:
            self._posthog_client = None
        else:
            # Use the module logger: the module-level logging.info() helper
            # implicitly calls basicConfig() when the root logger has no
            # handlers, which would pre-empt the package's own logging setup.
            logger.info(
                'Anonymized telemetry enabled. See https://github.com/Lumivor-Labs/lumivor for more information.'
            )
            self._posthog_client = Posthog(
                project_api_key=self.PROJECT_API_KEY,
                host=self.HOST,
                disable_geoip=False,
            )

            # Silence posthog's logging
            if not self.debug_logging:
                posthog_logger = logging.getLogger('posthog')
                posthog_logger.disabled = True

        if self._posthog_client is None:
            logger.debug('Telemetry disabled')

    def capture(self, event: BaseTelemetryEvent) -> None:
        """Send ``event`` to PostHog; a no-op when telemetry is disabled."""
        if self._posthog_client is None:
            return

        if self.debug_logging:
            logger.debug(f'Telemetry event: {event.name} {event.properties}')
        self._direct_capture(event)

    def _direct_capture(self, event: BaseTelemetryEvent) -> None:
        """
        Forward ``event`` to the PostHog client, logging (not raising) on failure.

        Expected not to block the calling thread; that relies on the PostHog
        client's own delivery handling -- confirm against the PostHog docs.
        """
        if self._posthog_client is None:
            return

        try:
            self._posthog_client.capture(
                self.user_id,
                event.name,
                # Settings are spread last so they override same-named event properties.
                {**event.properties, **POSTHOG_EVENT_SETTINGS},
            )
        except Exception as e:
            logger.error(f'Failed to send telemetry event {event.name}: {e}')

    @property
    def user_id(self) -> str:
        """Return the persistent anonymous user id, creating it on first use.

        The id is read from ``USER_ID_PATH``; if the file is missing or empty a
        new UUID4 is generated and written there. Any failure while reading or
        writing yields ``UNKNOWN_USER_ID`` instead of raising.
        """
        if self._curr_user_id:
            return self._curr_user_id

        # File access may fail due to permissions or other reasons. We don't want to
        # crash so we catch all exceptions.
        try:
            stored_id = ''
            if os.path.exists(self.USER_ID_PATH):
                with open(self.USER_ID_PATH, 'r') as f:
                    # Strip stray whitespace/newlines so the id stays stable.
                    stored_id = f.read().strip()

            if not stored_id:
                # Missing or empty file: mint a fresh id and persist it.
                os.makedirs(os.path.dirname(self.USER_ID_PATH), exist_ok=True)
                stored_id = str(uuid.uuid4())
                with open(self.USER_ID_PATH, 'w') as f:
                    f.write(stored_id)

            self._curr_user_id = stored_id
        except Exception:
            # Use the class constant; the former literal 'UNKNOWN_USER_ID' was
            # the constant's name rather than its value.
            self._curr_user_id = self.UNKNOWN_USER_ID
        return self._curr_user_id
@@ -0,0 +1,51 @@
1
+ from abc import ABC, abstractmethod
2
+ from dataclasses import asdict, dataclass
3
+ from typing import Any, Dict, Optional
4
+
5
+
6
@dataclass
class BaseTelemetryEvent(ABC):
    """Abstract base for telemetry events; concrete events supply ``name``."""

    @property
    @abstractmethod
    def name(self) -> str:
        """Event name; subclasses must provide it."""
        pass

    @property
    def properties(self) -> Dict[str, Any]:
        """Return every dataclass field of the event except ``name``, as a dict."""
        payload = asdict(self)
        # The name travels separately from the property payload.
        payload.pop("name", None)
        return payload
16
+
17
+
18
@dataclass
class RegisteredFunction:
    """Name and parameter mapping describing one registered function."""

    name: str
    # Keys are strings; the value shape is not constrained here.
    params: dict[str, Any]
22
+
23
+
24
@dataclass
class ControllerRegisteredFunctionsTelemetryEvent(BaseTelemetryEvent):
    """Telemetry event named ``controller_registered_functions`` carrying a list of registered functions."""

    registered_functions: list[RegisteredFunction]
    # Concrete value for the abstract ``name`` declared on the base class.
    name: str = "controller_registered_functions"
28
+
29
+
30
@dataclass
class AgentRunTelemetryEvent(BaseTelemetryEvent):
    """Telemetry event named ``agent_run`` carrying the agent id and its task."""

    agent_id: str
    task: str
    # Concrete value for the abstract ``name`` declared on the base class.
    name: str = "agent_run"
35
+
36
+
37
@dataclass
class AgentStepErrorTelemetryEvent(BaseTelemetryEvent):
    """Telemetry event named ``agent_step_error`` carrying the agent id and an error string."""

    agent_id: str
    error: str
    # Concrete value for the abstract ``name`` declared on the base class.
    name: str = "agent_step_error"
42
+
43
+
44
@dataclass
class AgentEndTelemetryEvent(BaseTelemetryEvent):
    """Telemetry event named ``agent_end`` summarising a finished agent run."""

    agent_id: str
    task: str
    steps: int
    success: bool
    # Optional error text; defaults to None.
    error: Optional[str] = None
    # Concrete value for the abstract ``name`` declared on the base class.
    name: str = "agent_end"
lumivor/utils.py ADDED
@@ -0,0 +1,54 @@
1
+ import logging
2
+ import time
3
+ from functools import wraps
4
+ from typing import Any, Callable, Coroutine, ParamSpec, TypeVar
5
+
6
+ logger = logging.getLogger(__name__)
7
+
8
+
9
+ # Define generic type variables for return type and parameters
10
+ R = TypeVar('R')
11
+ P = ParamSpec('P')
12
+
13
+
14
def time_execution_sync(additional_text: str = '') -> Callable[[Callable[P, R]], Callable[P, R]]:
    """Build a decorator that logs how long a synchronous call takes.

    Args:
        additional_text: Prefix placed in front of the timing message.

    Returns:
        A decorator. The decorated function returns whatever the original
        returns and emits ``'<additional_text> Execution time: <s> seconds'``
        at DEBUG level after each successful call. Exceptions propagate
        unchanged and are not timed.

    Note:
        Only the synchronous call itself is measured. Wrapping a coroutine
        function times the creation of the coroutine object, not its
        execution.
    """

    def decorator(func: Callable[P, R]) -> Callable[P, R]:
        @wraps(func)
        def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
            # perf_counter is monotonic, unlike time.time(), so the interval
            # is immune to system clock adjustments.
            start_time = time.perf_counter()
            result = func(*args, **kwargs)
            execution_time = time.perf_counter() - start_time
            # Lazy %-formatting: the message is only built if DEBUG is enabled.
            logger.debug('%s Execution time: %.2f seconds', additional_text, execution_time)
            return result

        return wrapper

    return decorator
27
+
28
+
29
def time_execution_async(
    additional_text: str = '',
) -> Callable[[Callable[P, Coroutine[Any, Any, R]]], Callable[P, Coroutine[Any, Any, R]]]:
    """Build a decorator that logs how long an awaited coroutine call takes.

    Args:
        additional_text: Prefix placed in front of the timing message.

    Returns:
        A decorator for coroutine functions. The decorated coroutine returns
        whatever the original returns and emits
        ``'<additional_text> Execution time: <s> seconds'`` at DEBUG level
        after each successful call. Exceptions propagate unchanged and are
        not timed.
    """

    def decorator(func: Callable[P, Coroutine[Any, Any, R]]) -> Callable[P, Coroutine[Any, Any, R]]:
        @wraps(func)
        async def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
            # perf_counter is monotonic, unlike time.time(), so the interval
            # is immune to system clock adjustments.
            start_time = time.perf_counter()
            result = await func(*args, **kwargs)
            execution_time = time.perf_counter() - start_time
            # Lazy %-formatting: the message is only built if DEBUG is enabled.
            logger.debug('%s Execution time: %.2f seconds', additional_text, execution_time)
            return result

        return wrapper

    return decorator
44
+
45
+
46
def singleton(cls):
    """Class decorator that makes every call return one shared instance.

    The first call constructs ``cls`` with the given arguments; later calls
    return that same object and ignore their arguments. The decorated name
    becomes a factory function rather than the class itself, so the original
    class is exposed as ``__wrapped__`` and its ``__name__``, ``__qualname__``,
    ``__module__`` and ``__doc__`` are carried over.

    Not synchronised: concurrent first calls from several threads may
    construct more than one instance.
    """
    instance = None

    # updated=() copies only the metadata attributes; the class __dict__ is
    # deliberately not merged into the function's attributes.
    @wraps(cls, updated=())
    def wrapper(*args, **kwargs):
        nonlocal instance
        if instance is None:
            instance = cls(*args, **kwargs)
        return instance

    return wrapper