pydoll-python 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,154 @@
1
+ import os
2
+ import shutil
3
+ import subprocess
4
+ from contextlib import suppress
5
+ from tempfile import TemporaryDirectory
6
+
7
+ from pydoll.browser.options import Options
8
+
9
+
10
+ class ProxyManager:
11
+ def __init__(self, options):
12
+ self.options = options
13
+
14
+ def get_proxy_credentials(self) -> tuple[bool, tuple[str, str]]:
15
+ """
16
+ Configura as configurações de proxy e extrai credenciais se presentes.
17
+
18
+ Returns:
19
+ tuple[bool, tuple[str, str]]: (private_proxy, (username, password))
20
+ """
21
+ private_proxy = False
22
+ credentials = (None, None)
23
+
24
+ proxy_arg = self._find_proxy_argument()
25
+
26
+ if proxy_arg is not None:
27
+ index, proxy_value = proxy_arg
28
+ has_credentials, username, password, clean_proxy = (
29
+ self._parse_proxy(proxy_value)
30
+ )
31
+
32
+ if has_credentials:
33
+ self._update_proxy_argument(index, clean_proxy)
34
+ private_proxy = True
35
+ credentials = (username, password)
36
+
37
+ return private_proxy, credentials
38
+
39
+ def _find_proxy_argument(self) -> tuple[int, str] | None:
40
+ """Encontra o primeiro argumento --proxy-server válido"""
41
+ for index, arg in enumerate(self.options.arguments):
42
+ if arg.startswith('--proxy-server='):
43
+ return index, arg.split('=', 1)[1]
44
+ return None
45
+
46
+ @staticmethod
47
+ def _parse_proxy(proxy_value: str) -> tuple[bool, str, str, str]:
48
+ """Extrai credenciais e limpa o valor do proxy"""
49
+ if '@' not in proxy_value:
50
+ return False, None, None, proxy_value
51
+
52
+ try:
53
+ creds_part, server_part = proxy_value.split('@', 1)
54
+ username, password = creds_part.split(':', 1)
55
+ return True, username, password, server_part
56
+ except ValueError:
57
+ return False, None, None, proxy_value
58
+
59
+ def _update_proxy_argument(self, index: int, clean_proxy: str) -> None:
60
+ """Atualiza a lista de argumentos com proxy limpo"""
61
+ self.options.arguments[index] = f'--proxy-server={clean_proxy}'
62
+
63
+
64
class BrowserProcessManager:
    """
    Starts and stops the browser process.

    The callable used to spawn the process can be injected; by default a
    ``subprocess.Popen`` is created.
    """

    # Seconds to wait for the process to exit after terminate() before
    # it is killed.
    _TERMINATE_TIMEOUT = 5

    def __init__(self, process_creator=None):
        """
        Args:
            process_creator: Optional callable that receives the command
                list and returns a process object. Defaults to
                ``_default_process_creator``.
        """
        self._process_creator = (
            process_creator or self._default_process_creator
        )
        self._process = None

    def start_browser_process(
        self, binary_location: str, port: int, arguments: list
    ):
        """
        Starts the browser process.

        Args:
            binary_location (str): Path to the browser executable.
            port (int): Value used for ``--remote-debugging-port``.
            arguments (list): Extra command-line arguments, appended after
                the debugging port.

        Returns:
            The process object returned by the process creator.
        """
        self._process = self._process_creator([
            binary_location,
            f'--remote-debugging-port={port}',
            *arguments,
        ])
        return self._process

    @staticmethod
    def _default_process_creator(command: list[str]):
        """Spawns ``command`` with stdout and stderr captured in pipes."""
        # NOTE(review): nothing in this class reads these pipes; a child
        # that writes a lot could block once a pipe buffer fills. Confirm
        # whether callers consume them before switching to DEVNULL.
        return subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )

    def stop_process(self):
        """
        Stops the browser process if one was started.

        Sends terminate, then waits up to ``_TERMINATE_TIMEOUT`` seconds so
        the process is reaped instead of lingering as a zombie. If it has
        not exited by then it is killed.
        """
        if not self._process:
            return

        self._process.terminate()
        try:
            self._process.wait(timeout=self._TERMINATE_TIMEOUT)
        except subprocess.TimeoutExpired:
            self._process.kill()
            self._process.wait()
92
+
93
+
94
class TempDirectoryManager:
    """Creates temporary directories and removes them on request."""

    def __init__(self, temp_dir_factory=TemporaryDirectory):
        """
        Args:
            temp_dir_factory: Zero-argument callable returning an object
                with a ``name`` attribute holding the directory path.
        """
        self._temp_dirs = []
        self._temp_dir_factory = temp_dir_factory

    def create_temp_dir(self):
        """
        Creates a temporary directory for a browser instance and tracks it.

        Returns:
            The object produced by the factory (a ``TemporaryDirectory``
            by default).
        """
        created = self._temp_dir_factory()
        self._temp_dirs.append(created)
        return created

    def cleanup(self):
        """
        Removes every tracked temporary directory.

        Removal is best-effort: an ``OSError`` for one directory is ignored
        and the remaining directories are still processed.
        """
        for tracked in self._temp_dirs:
            try:
                shutil.rmtree(tracked.name)
            except OSError:
                pass
115
+
116
+
117
class BrowserOptionsManager:
    """Static helpers for preparing and validating browser options."""

    @staticmethod
    def initialize_options(options: Options | None) -> Options:
        """
        Returns a usable Options instance.

        Args:
            options (Options | None): An Options instance, or None to get
                a freshly created one.

        Returns:
            Options: The given instance, or a new one when None was given.

        Raises:
            ValueError: If ``options`` is neither None nor an Options
                instance.
        """
        if options is None:
            options = Options()
        elif not isinstance(options, Options):
            raise ValueError('Invalid options')
        return options

    @staticmethod
    def add_default_arguments(options: Options):
        """
        Appends the default command-line arguments to ``options``.

        Args:
            options (Options): The options whose argument list is extended.
        """
        for default_flag in ('--no-first-run', '--no-default-browser-check'):
            options.arguments.append(default_flag)

    @staticmethod
    def validate_browser_path(path: str) -> str:
        """
        Checks that the browser path exists.

        Args:
            path (str): The path to the browser executable.

        Returns:
            str: The same path, unchanged.

        Raises:
            ValueError: If nothing exists at ``path``.
        """
        if os.path.exists(path):
            return path
        raise ValueError(f'Browser not found: {path}')
@@ -0,0 +1,62 @@
1
class Options:
    """
    Launch configuration for a browser instance.

    Holds the command-line arguments to pass to the browser and the path
    of the browser executable.
    """

    def __init__(self):
        """
        Creates an Options instance with no arguments and an empty
        binary location.
        """
        self._binary_location = ''
        self._arguments = []

    @property
    def arguments(self) -> list:
        """
        The command-line arguments collected so far.

        Returns:
            list: The internal argument list itself (not a copy).
        """
        return self._arguments

    @property
    def binary_location(self) -> str:
        """
        The path of the browser executable.

        Returns:
            str: The configured path, or an empty string if none was set.
        """
        return self._binary_location

    @binary_location.setter
    def binary_location(self, location: str):
        """
        Stores the path of the browser executable.

        Args:
            location (str): The file path to the browser executable.
        """
        self._binary_location = location

    def add_argument(self, argument: str):
        """
        Appends a command-line argument.

        Args:
            argument (str): The command-line argument to add.

        Raises:
            ValueError: If the argument is already in the list.
        """
        if argument in self.arguments:
            raise ValueError(f'Argument already exists: {argument}')
        self.arguments.append(argument)
pydoll/browser/page.py ADDED
@@ -0,0 +1,433 @@
1
+ import asyncio
2
+ import json
3
+
4
+ import aiofiles
5
+
6
+ from pydoll.commands.dom import DomCommands
7
+ from pydoll.commands.fetch import FetchCommands
8
+ from pydoll.commands.network import NetworkCommands
9
+ from pydoll.commands.page import PageCommands
10
+ from pydoll.commands.runtime import RuntimeCommands
11
+ from pydoll.commands.storage import StorageCommands
12
+ from pydoll.connection.connection import ConnectionHandler
13
+ from pydoll.element import WebElement
14
+ from pydoll.mixins.find_elements import FindElementsMixin
15
+ from pydoll.utils import decode_image_to_bytes
16
+
17
+
18
class Page(FindElementsMixin):  # noqa: PLR0904
    """
    A browser page controlled through its own ConnectionHandler.

    Exposes navigation, cookie, dialog, screenshot/PDF, network-log and
    event-subscription helpers built on the command classes imported by
    this module.
    """

    def __init__(self, connection_port: int, page_id: str):
        """
        Initializes the Page instance.

        Args:
            connection_port (int): Port handed to the ConnectionHandler
                created for this page.
            page_id (str): The ID of the page, obtained via the DevTools
                Protocol.
        """
        self._connection_handler = ConnectionHandler(connection_port, page_id)
        self._page_events_enabled = False
        self._network_events_enabled = False
        self._fetch_events_enabled = False
        self._dom_events_enabled = False

    @property
    def page_events_enabled(self) -> bool:
        """
        Returns whether page events are enabled or not.

        Returns:
            bool: True if page events are enabled, False otherwise.
        """
        return self._page_events_enabled

    @property
    def network_events_enabled(self) -> bool:
        """
        Returns whether network events are enabled or not.

        Returns:
            bool: True if network events are enabled, False otherwise.
        """
        return self._network_events_enabled

    @property
    def fetch_events_enabled(self) -> bool:
        """
        Returns whether fetch events are enabled or not.

        Returns:
            bool: True if fetch events are enabled, False otherwise.
        """
        return self._fetch_events_enabled

    @property
    def dom_events_enabled(self) -> bool:
        """
        Returns whether DOM events are enabled or not.

        Returns:
            bool: True if DOM events are enabled, False otherwise.
        """
        return self._dom_events_enabled

    @property
    async def current_url(self) -> str:
        """
        Retrieves the current URL of the page.

        This is an awaitable property: use ``await page.current_url``.

        Returns:
            str: The current URL of the page.
        """
        response = await self._execute_command(DomCommands.get_current_url())
        return response['result']['result']['value']

    @property
    async def page_source(self) -> str:
        """
        Retrieves the source code of the page.

        This is an awaitable property: use ``await page.page_source``.

        Returns:
            str: The value of ``document.documentElement.outerHTML``.
        """
        response = await self._execute_command(
            RuntimeCommands.evaluate_script(
                'document.documentElement.outerHTML'
            )
        )
        return response['result']['result']['value']

    async def close(self):
        """
        Closes the page.
        """
        await self._execute_command(PageCommands.close())

    async def get_cookies(self) -> list[dict]:
        """
        Retrieves the cookies of the page.

        Returns:
            list[dict]: A list of cookies.
        """
        response = await self._execute_command(
            NetworkCommands.get_all_cookies()
        )
        return response['result']['cookies']

    async def set_cookies(self, cookies: list[dict]):
        """
        Sets cookies for the page.

        Args:
            cookies (list[dict]): A list of cookies to set.
        """
        await self._execute_command(NetworkCommands.set_cookies(cookies))

    async def delete_all_cookies(self):
        """
        Deletes all cookies from the browser.

        Issues both the storage-level and the network-level clear commands.
        """
        await self._execute_command(StorageCommands.clear_cookies())
        await self._execute_command(NetworkCommands.clear_browser_cookies())

    async def has_dialog(self) -> bool:
        """
        Checks if a dialog is present on the page.

        Returns:
            bool: True if the connection handler currently holds a dialog,
                False otherwise.
        """
        return bool(self._connection_handler.dialog)

    async def get_dialog_message(self) -> str:
        """
        Retrieves the message of the dialog on the page.

        Returns:
            str: The message of the dialog.

        Raises:
            LookupError: If no dialog is present on the page.
        """
        if not await self.has_dialog():
            raise LookupError('No dialog present on the page')
        return self._connection_handler.dialog['params']['message']

    async def accept_dialog(self):
        """
        Accepts the dialog on the page.

        Raises:
            LookupError: If no dialog is present on the page.
        """
        if not await self.has_dialog():
            raise LookupError('No dialog present on the page')
        await self._execute_command(PageCommands.handle_dialog(True))

    async def go_to(self, url: str, timeout=300):
        """
        Navigates to a URL in the page.

        If the page is already at ``url`` it is refreshed instead. In both
        cases the call waits until the document has finished loading.

        Args:
            url (str): The URL to navigate to.
            timeout (int): Maximum number of seconds to wait for the page
                to load.

        Raises:
            TimeoutError: If the page does not finish loading in time.
        """
        # The same timeout applies to the refresh path, which previously
        # always fell back to the default.
        if await self._refresh_if_url_not_changed(url, timeout):
            return

        await self._execute_command(PageCommands.go_to(url))

        try:
            await self._wait_page_load(timeout=timeout)
        except asyncio.TimeoutError as err:
            raise TimeoutError('Page load timed out') from err

    async def refresh(self, timeout: int = 300):
        """
        Refreshes the page and waits for it to finish loading.

        Args:
            timeout (int): Maximum number of seconds to wait for the page
                to load.

        Raises:
            TimeoutError: If the page does not finish loading in time.
        """
        await self._execute_command(PageCommands.refresh())
        try:
            await self._wait_page_load(timeout=timeout)
        except asyncio.TimeoutError as err:
            raise TimeoutError('Page load timed out') from err

    async def get_screenshot(self, path: str):
        """
        Captures a screenshot of the page and writes it to a file.

        Args:
            path (str): The file path to save the screenshot to.
        """
        response = await self._execute_command(PageCommands.screenshot())
        screenshot_b64 = response['result']['data'].encode('utf-8')
        screenshot_bytes = decode_image_to_bytes(screenshot_b64)
        async with aiofiles.open(path, 'wb') as file:
            await file.write(screenshot_bytes)

    async def set_download_path(self, path: str):
        """
        Sets the download path for the page.

        Args:
            path (str): The path where the downloaded files should be saved.
        """
        await self._execute_command(PageCommands.set_download_path(path))

    async def get_pdf_base64(self):
        """
        Retrieves the PDF data of the page.

        Returns:
            str: The ``data`` field of the print-to-PDF response.
        """
        response = await self._execute_command(PageCommands.print_to_pdf())
        return response['result']['data']

    async def print_to_pdf(self, path: str):
        """
        Prints the page to a PDF file.

        Args:
            path (str): The file path to save the PDF file to.
        """
        # NOTE(review): `path` is also forwarded to the command builder,
        # while get_pdf_base64 calls it with no arguments -- confirm that
        # PageCommands.print_to_pdf expects a path as its first argument.
        response = await self._execute_command(PageCommands.print_to_pdf(path))
        pdf_b64 = response['result']['data'].encode('utf-8')
        pdf_bytes = decode_image_to_bytes(pdf_b64)
        async with aiofiles.open(path, 'wb') as file:
            await file.write(pdf_bytes)

    async def get_network_logs(self, matches: list[str] = None):
        """
        Retrieves network logs from the page.

        A log is kept when its request URL contains at least one of the
        given substrings. Logs without a request URL are skipped.

        Args:
            matches (list[str] | None): Substrings to look for in the
                request URL. None behaves like an empty list.

        Returns:
            list: The network logs that match.

        Raises:
            LookupError: If no log matches (always the case when
                ``matches`` is empty or None).
        """
        patterns = matches or ()
        logs_matched = []
        for log in self._connection_handler.network_logs:
            url = log.get('params', {}).get('request', {}).get('url')
            if url and any(pattern in url for pattern in patterns):
                logs_matched.append(log)

        if not logs_matched:
            raise LookupError('No network logs matched the specified pattern')

        return logs_matched

    async def get_network_response_bodies(self, matches: list[str] = None):
        """
        Retrieves the response bodies of network requests that match the
        specified patterns.

        Bodies that are not base64-encoded are parsed with ``json.loads``;
        base64-encoded bodies are returned as received. Logs for which a
        ``KeyError`` occurs while fetching the body are skipped.

        Args:
            matches (list[str] | None): Substrings to look for in the
                request URL. None behaves like an empty list.

        Returns:
            list: The response bodies of the matching requests.

        Raises:
            LookupError: If no network log matches.
            json.JSONDecodeError: If a non-base64 body is not valid JSON.
        """
        logs_matched = await self.get_network_logs(matches)
        responses = []
        for log in logs_matched:
            try:
                body, base64encoded = await self.get_network_response_body(
                    log['params']['requestId']
                )
            except KeyError:
                continue
            response = json.loads(body) if not base64encoded else body
            responses.append(response)
        return responses

    async def get_network_response_body(self, request_id: str):
        """
        Retrieves the response body of a network request.

        Args:
            request_id (str): The ID of the network request.

        Returns:
            tuple: ``(body, base64Encoded)`` taken from the response.
        """
        response = await self._execute_command(
            NetworkCommands.get_response_body(request_id)
        )
        return (
            response['result']['body'],
            response['result']['base64Encoded'],
        )

    async def enable_page_events(self):
        """
        Enables page events for the page.
        """
        await self._execute_command(PageCommands.enable_page())
        self._page_events_enabled = True

    async def enable_network_events(self):
        """
        Enables network events for the page.
        """
        await self._execute_command(NetworkCommands.enable_network_events())
        self._network_events_enabled = True

    async def enable_fetch_events(
        self, handle_auth: bool = False, resource_type: str = 'Document'
    ):
        """
        Enables fetch events for the page.

        Args:
            handle_auth (bool): Forwarded to
                ``FetchCommands.enable_fetch_events``.
            resource_type (str): Forwarded to
                ``FetchCommands.enable_fetch_events``.
        """
        await self._execute_command(
            FetchCommands.enable_fetch_events(handle_auth, resource_type)
        )
        self._fetch_events_enabled = True

    async def enable_dom_events(self):
        """
        Enables DOM events for the page.
        """
        await self._execute_command(DomCommands.enable_dom_events())
        self._dom_events_enabled = True

    async def disable_fetch_events(self):
        """
        Disables fetch events for the page.
        """
        await self._execute_command(FetchCommands.disable_fetch_events())
        self._fetch_events_enabled = False

    async def disable_page_events(self):
        """
        Disables page events for the page.
        """
        await self._execute_command(PageCommands.disable_page())
        self._page_events_enabled = False

    async def on(
        self, event_name: str, callback: callable, temporary: bool = False
    ):
        """
        Registers an event listener for the page.

        Coroutine-function callbacks are scheduled as tasks so the event
        dispatch does not wait for them; other callables are registered
        as they are.

        Args:
            event_name (str): The event to listen for.
            callback (callable): The callback function to execute when the
                event is triggered.
            temporary (bool): Whether the event listener is temporary or not.

        Returns:
            Whatever the connection handler's ``register_callback`` returns.
        """
        # The event loop only keeps weak references to tasks, so hold a
        # strong reference until each task finishes; otherwise a running
        # callback could be garbage-collected mid-execution.
        pending_tasks = set()

        async def callback_wrapper(event):
            task = asyncio.create_task(callback(event))
            pending_tasks.add(task)
            task.add_done_callback(pending_tasks.discard)

        if asyncio.iscoroutinefunction(callback):
            function_to_register = callback_wrapper
        else:
            function_to_register = callback

        return await self._connection_handler.register_callback(
            event_name, function_to_register, temporary
        )

    async def execute_script(self, script: str, element: WebElement = None):
        """
        Executes a JavaScript script in the page.
        If an element is provided, the script will be executed in the context
        of that element. To provide the element context, use the 'argument'
        keyword in the script.

        Examples:
        ```python
        await page.execute_script('argument.click()', element)
        await page.execute_script('argument.value = "Hello, World!"', element)
        ```

        Args:
            script (str): The JavaScript script to execute.
            element (WebElement): Optional element the script runs against.

        Returns:
            The response of the executed command.
        """
        if element:
            # NOTE(review): this is a plain substring replacement, so text
            # such as `arguments` or string literals containing "argument"
            # is rewritten too.
            script = script.replace('argument', 'this')
            script = f'function(){{ {script} }}'
            object_id = element._object_id
            command = RuntimeCommands.call_function_on(
                object_id, script, return_by_value=True
            )
        else:
            command = RuntimeCommands.evaluate_script(script)
        return await self._execute_command(command)

    async def _refresh_if_url_not_changed(self, url: str, timeout: int = 300):
        """
        Refreshes the page if it is already at ``url``.

        Args:
            url (str): The URL to compare against.
            timeout (int): Maximum number of seconds to wait for the
                refresh to finish loading.

        Returns:
            bool: True if the page was refreshed, False otherwise.
        """
        current_url = await self.current_url
        if current_url == url:
            await self.refresh(timeout=timeout)
            return True
        return False

    async def _wait_page_load(self, timeout: int = 300):
        """
        Waits for the page to finish loading.

        Polls ``document.readyState`` every 0.5 seconds until it equals
        ``'complete'``.

        Args:
            timeout (int): Maximum number of seconds to keep polling.

        Raises:
            asyncio.TimeoutError: If the page is not loaded in time.
        """
        loop = asyncio.get_running_loop()
        start_time = loop.time()
        while True:
            response = await self._execute_command(
                RuntimeCommands.evaluate_script('document.readyState')
            )
            if response['result']['result']['value'] == 'complete':
                break
            if loop.time() - start_time > timeout:
                raise asyncio.TimeoutError('Page load timed out')
            await asyncio.sleep(0.5)
+ await asyncio.sleep(0.5)
@@ -0,0 +1,18 @@
1
+ # global imports
2
+ from pydoll.commands.dom import DomCommands
3
+ from pydoll.commands.fetch import FetchCommands
4
+ from pydoll.commands.input import InputCommands
5
+ from pydoll.commands.network import NetworkCommands
6
+ from pydoll.commands.page import PageCommands
7
+ from pydoll.commands.runtime import RuntimeCommands
8
+ from pydoll.commands.storage import StorageCommands
9
+
10
# Command classes re-exported as this package's public names.
__all__ = [
    'DomCommands', 'FetchCommands', 'InputCommands', 'NetworkCommands',
    'PageCommands', 'RuntimeCommands', 'StorageCommands',
]