seleniumbase 4.32.1__py3-none-any.whl → 4.32.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1319 @@
1
+ from __future__ import annotations
2
+ import asyncio
3
+ import logging
4
+ import pathlib
5
+ import warnings
6
+ from typing import Dict, List, Union, Optional, Tuple
7
+ from . import browser as cdp_browser
8
+ from . import element
9
+ from . import cdp_util as util
10
+ from .config import PathLike
11
+ from .connection import Connection, ProtocolException
12
+ import mycdp as cdp
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
+ class Tab(Connection):
18
+ """
19
+ :ref:`tab` is the controlling mechanism/connection to a 'target',
20
+ for most of us 'target' can be read as 'tab'. However it could also
21
+ be an iframe, serviceworker or background script for example,
22
+ although there isn't much to control for those.
23
+ If you open a new window by using
24
+ :py:meth:`browser.get(..., new_window=True)`
25
+ Your url will open a new window. This window is a 'tab'.
26
+ When you browse to another page, the tab will be the same (browser view).
27
+ It's important to keep some reference to tab objects, in case you're
28
+ done interacting with elements and want to operate on the page level again.
29
+
30
+ Custom CDP commands
31
+ ---------------------------
32
+ Tab objects provide many useful and often-used methods. It is also possible
33
+ to utilize the included cdp classes to do something totally custom.
34
+
35
+ The cdp package is a set of so-called "domains" with each having methods,
36
+ events and types.
37
+ To send a cdp method, for example :py:obj:`cdp.page.navigate`,
38
+ you'll have to check whether the method accepts any parameters
39
+ and whether they are required or not.
40
+
41
+ You can use:
42
+
43
+ ```python
44
+ await tab.send(cdp.page.navigate(url='https://Your-URL-Here'))
45
+ ```
46
+
47
+ So tab.send() accepts a generator object,
48
+ which is created by calling a cdp method.
49
+ This way you can build very detailed and customized commands.
50
+ (Note: Finding correct command combos can be a time-consuming task.
51
+ A whole bunch of useful methods have been added,
52
+ preferably having the same apis or lookalikes, as in selenium.)
53
+
54
+ Some useful, often needed and simply required methods
55
+ ===================================================================
56
+
57
+ :py:meth:`~find` | find(text)
58
+ ----------------------------------------
59
+ Finds and returns a single element by text match.
60
+ By default, returns the first element found.
61
+ Much more powerful is the best_match flag,
62
+ although also much more expensive.
63
+ When no match is found, it will retry for <timeout> seconds (default: 10),
64
+ so this is also suitable to use as wait condition.
65
+
66
+ :py:meth:`~find` | find(text, best_match=True) or find(text, True)
67
+ -----------------------------------------------------------------------
68
+ Much more powerful (and expensive) than the above is
69
+ the use of the `find(text, best_match=True)` flag.
70
+ It will still return 1 element, but when multiple matches are found,
71
+ it picks the one having the most similar text length.
72
+ How would that help?
73
+ For example, you search for "login",
74
+ you'd probably want the "login" button element,
75
+ and not thousands of scripts/meta/headings,
76
+ which happens to contain a string of "login".
77
+
78
+ When no match is found, it will retry for <timeout> seconds (default: 10),
79
+ so this is also suitable to use as wait condition.
80
+
81
+ :py:meth:`~select` | select(selector)
82
+ ----------------------------------------
83
+ Finds and returns a single element by css selector match.
84
+ When no match is found, it will retry for <timeout> seconds (default: 10),
85
+ so this is also suitable to use as wait condition.
86
+
87
+ :py:meth:`~select_all` | select_all(selector)
88
+ ------------------------------------------------
89
+ Finds and returns all elements by css selector match.
90
+ When no match is found, it will retry for <timeout> seconds (default: 10),
91
+ so this is also suitable to use as wait condition.
92
+
93
+ await :py:obj:`Tab`
94
+ ---------------------------
95
+ Calling `await tab` will do a lot of stuff under the hood,
96
+ and ensures all references are up to date.
97
+ Also it allows for the script to "breathe",
98
+ as it is oftentimes faster than your browser or webpage.
99
+ So whenever you get stuck and things crash or an element cannot be found,
100
+ you should probably let it "breathe" by calling `await page`
101
+ and/or `await page.sleep()`.
102
+
103
+ It ensures :py:obj:`~url` will be updated to the most recent one,
104
+ which is quite important in some other methods.
105
+
106
+ Using other and custom CDP commands
107
+ ======================================================
108
+ Using the included cdp module, you can easily craft commands,
109
+ which will always return a generator object.
110
+ This generator object can be easily sent to the :py:meth:`~send` method.
111
+
112
+ :py:meth:`~send`
113
+ ---------------------------
114
+ This is probably the most important method,
115
+ although you won't ever call it, unless you want to go really custom.
116
+ The send method accepts a :py:obj:`cdp` command.
117
+ Each of which can be found in the cdp section.
118
+
119
+ When you import * from this package, cdp will be in your namespace,
120
+ and contains all domains/actions/events you can act upon.
121
+ """
122
+ browser: cdp_browser.Browser
123
+ _download_behavior: List[str] = None
124
+
125
def __init__(
    self,
    websocket_url: str,
    target: cdp.target.TargetInfo,
    browser: Optional["cdp_browser.Browser"] = None,
    **kwargs,
):
    """
    Initialise the tab on top of the Connection base class.

    :param websocket_url: Websocket url, passed on to the base class.
    :param target: Target info of this tab, passed on to the base class.
    :param browser: Browser object this tab belongs to (may be None).
    :param kwargs: Passed on to the base class unchanged.
    """
    super().__init__(websocket_url, target, browser, **kwargs)
    # Both start out unset; nothing in the constructor fills them in.
    self._window_id = self._dom = None
    self.browser = browser
136
+
137
@property
def inspector_url(self):
    """
    Url of the devtools inspector page for this tab.

    Open it in another browser to get the devtools interface
    for the current tab, which helps when debugging or when
    running in headless mode.
    """
    config = self.browser.config
    # The first five characters of the websocket url are dropped.
    ws_part = self.websocket_url[5:]
    return (
        f"http://{config.host}:{config.port}"
        f"/devtools/inspector.html?ws={ws_part}"
    )
146
+
147
def inspector_open(self):
    """
    Open this tab's devtools inspector url with the ``webbrowser``
    module, asking for a new browser tab (``new=2``).
    """
    from webbrowser import open as open_in_system_browser

    open_in_system_browser(self.inspector_url, new=2)
151
+
152
async def open_external_inspector(self):
    """
    Show the devtools inspector page of this tab in the system's browser.
    Handy for debugging, in particular when running headless.
    """
    from webbrowser import open as open_in_system_browser

    open_in_system_browser(self.inspector_url)
160
+
161
async def find(
    self,
    text: str,
    best_match: bool = False,
    return_enclosing_element: bool = True,
    timeout: Union[int, float] = 10,
):
    """
    Find single element by text.
    Can also be used to wait for such element to appear.
    :param text:
    Text to search for. Note: Script contents are also considered text.
    :type text: str
    :param best_match:
    When True, it will return the element which has the most
    comparable string length. This could help a lot. Eg:
    If you search for "login", you probably want the login button element,
    and not thousands of tags/scripts containing a "login" string.
    When False (default), it returns just the first match
    (but is way faster).
    :type best_match: bool
    :param return_enclosing_element:
    Since we deal with nodes instead of elements,
    the find function most often returns so called text nodes,
    which is actually a element of plain text,
    which is the somehow imaginary "child" of a "span", "p", "script"
    or any other elements which have text between their opening
    and closing tags.
    Most often when we search by text, we actually aim for the
    element containing the text instead of a lousy plain text node,
    so by default the containing element is returned.
    There are exceptions. Eg:
    Elements that use the "placeholder=" property.
    :type return_enclosing_element: bool
    :param timeout:
    Raise timeout exception when after this many seconds nothing is found.
    :type timeout: float,int
    :raises asyncio.TimeoutError:
    When nothing was found and more than ``timeout`` seconds have passed.
    """
    loop = asyncio.get_running_loop()
    start_time = loop.time()
    text = text.strip()
    item = None
    try:
        item = await self.find_element_by_text(
            text, best_match, return_enclosing_element
        )
    except Exception:
        # The first lookup is best-effort: a failure here just means
        # we fall through to the polling loop below.
        pass
    while not item:
        await self
        item = await self.find_element_by_text(
            text, best_match, return_enclosing_element
        )
        if item:
            # A match found on the last poll must be returned, even if
            # the deadline passed while the lookup was running.
            break
        if loop.time() - start_time > timeout:
            raise asyncio.TimeoutError(
                "Time ran out while waiting for: {%s}" % text
            )
        await self.sleep(0.5)
    return item
219
+
220
async def select(
    self,
    selector: str,
    timeout: Union[int, float] = 10,
) -> element.Element:
    """
    Find a single element by css selector.
    Can also be used to wait for such an element to appear.
    :param selector: css selector,
    eg a[href], button[class*=close], a > img[src]
    :type selector: str
    :param timeout:
    Raise timeout exception when after this many seconds nothing is found.
    :type timeout: float,int
    :raises asyncio.TimeoutError:
    When nothing was found and more than ``timeout`` seconds have passed.
    """
    loop = asyncio.get_running_loop()
    start_time = loop.time()
    selector = selector.strip()
    item = None
    try:
        item = await self.query_selector(selector)
    except Exception:
        # The first lookup is best-effort: a failure here just means
        # we fall through to the polling loop below.
        pass
    while not item:
        await self
        item = await self.query_selector(selector)
        if item:
            # A match found on the last poll must be returned, even if
            # the deadline passed while the lookup was running.
            break
        if loop.time() - start_time > timeout:
            raise asyncio.TimeoutError(
                "Time ran out while waiting for: {%s}" % selector
            )
        await self.sleep(0.5)
    return item
252
+
253
async def find_all(
    self,
    text: str,
    timeout: Union[int, float] = 10,
) -> List[element.Element]:
    """
    Find multiple elements by text.
    Can also be used to wait for such elements to appear.
    :param text: Text to search for.
    Note: Script contents are also considered text.
    :type text: str
    :param timeout:
    Raise timeout exception when after this many seconds nothing is found.
    :type timeout: float,int
    :raises asyncio.TimeoutError:
    When nothing was found and more than ``timeout`` seconds have passed.
    """
    loop = asyncio.get_running_loop()
    now = loop.time()
    text = text.strip()
    items = []
    try:
        items = await self.find_elements_by_text(text)
    except Exception:
        # The first lookup is best-effort: a failure here just means
        # we fall through to the polling loop below.
        pass
    while not items:
        await self
        items = await self.find_elements_by_text(text)
        if items:
            # Matches found on the last poll must be returned, even if
            # the deadline passed while the lookup was running.
            break
        if loop.time() - now > timeout:
            raise asyncio.TimeoutError(
                "Time ran out while waiting for: {%s}" % text
            )
        await self.sleep(0.5)
    return items
285
+
286
async def select_all(
    self,
    selector: str,
    timeout: Union[int, float] = 10,
    include_frames=False,
) -> List[element.Element]:
    """
    Find multiple elements by CSS Selector.
    Can also be used to wait for such elements to appear.
    :param selector: css selector,
    eg a[href], button[class*=close], a > img[src]
    :type selector: str
    :param timeout:
    Raise timeout exception when after this many seconds nothing is found.
    :type timeout: float,int
    :param include_frames: Whether to include results in iframes.
    The iframes are searched on every attempt, not only the first one.
    :type include_frames: bool
    :raises asyncio.TimeoutError:
    When nothing was found and more than ``timeout`` seconds have passed.
    """
    loop = asyncio.get_running_loop()
    now = loop.time()
    selector = selector.strip()

    async def _collect():
        # One search pass: iframe matches first (when requested),
        # followed by the matches of the main document.
        found = []
        if include_frames:
            frames = await self.query_selector_all("iframe")
            # Unfortunately, asyncio.gather is not an option here
            for fr in frames:
                found.extend(await fr.query_selector_all(selector))
        found.extend(await self.query_selector_all(selector))
        return found

    items = await _collect()
    while not items:
        await self
        items = await _collect()
        if items:
            # Matches found on the last poll must be returned, even if
            # the deadline passed while the lookup was running.
            break
        if loop.time() - now > timeout:
            raise asyncio.TimeoutError(
                "Time ran out while waiting for: {%s}" % selector
            )
        await self.sleep(0.5)
    return items
323
+
324
async def get(
    self,
    url="chrome://welcome",
    new_tab: bool = False,
    new_window: bool = False,
):
    """
    Navigate to a url, selenium style.
    Without new_tab/new_window, this tab itself navigates and is returned
    after it has been awaited once. Otherwise the request is handed to
    the browser object and its result is returned.
    :param url: the url to navigate to
    :param new_tab: open new tab
    :param new_window: open new window (implies new_tab)
    :return: Page
    :raises AttributeError: when this tab has no browser object.
    """
    if not self.browser:
        raise AttributeError(
            "This page/tab has no browser attribute, so you can't use get()"
        )
    # Asking for a new window always means asking for a new tab too.
    if not new_tab and new_window:
        new_tab = True
    if not new_tab:
        _frame, _loader, *_unused = await self.send(cdp.page.navigate(url))
        await self
        return self
    return await self.browser.get(url, new_tab, new_window)
353
+
354
async def query_selector_all(
    self,
    selector: str,
    _node: Optional[Union[cdp.dom.Node, "element.Element"]] = None,
):
    """
    Equivalent of JavaScript "document.querySelectorAll".
    This is considered one of the main methods to use in this package.
    It returns all matching :py:obj:`element.Element` objects.

    When a node is supplied and the browser reports that it could not
    find that node, the node is updated and the query is tried once more.
    A second failure of that kind gives an empty list.

    :param selector: css selector.
    (first time? => https://www.w3schools.com/cssref/css_selectors.php )
    :type selector: str
    :param _node: internal use
    :return: list of matching elements (possibly empty)
    :raises ProtocolException:
    When the query fails and no node was supplied.
    """

    async def _search_root():
        # Root to query: the whole document when no node is supplied,
        # an iframe's content document, or the node itself.
        if not _node:
            return await self.send(cdp.dom.get_document(-1, True))
        if _node.node_name == "IFRAME":
            return _node.content_document
        return _node

    node_ids = []
    doc = None
    # Two attempts at most. The previous guard stored a flag with
    # `_node.__last = True` (name-mangled inside the class) but read it
    # back with getattr(_node, "__last"), so the guard never triggered
    # and the retry could recurse without bound.
    for attempt in range(2):
        doc = await _search_root()
        try:
            node_ids = await self.send(
                cdp.dom.query_selector_all(doc.node_id, selector)
            )
            break
        except ProtocolException as e:
            if _node is None:
                await self.send(cdp.dom.disable())
                raise
            if "could not find node" not in e.message.lower():
                # Other protocol errors for a supplied node are treated
                # as "no results", as before.
                break
            if attempt:
                return []
            # If the supplied node is not found,
            # then the DOM has changed since acquiring the element.
            # Therefore, we need to update our node, and try again.
            await _node.update()
    if not node_ids:
        return []
    items = []
    for nid in node_ids:
        node = util.filter_recurse(doc, lambda n: n.node_id == nid)
        # Pass along the retrieved document tree to improve performance.
        if not node:
            continue
        elem = element.create(node, self, doc)
        items.append(elem)
    return items
407
+
408
async def query_selector(
    self,
    selector: str,
    _node: Optional[Union[cdp.dom.Node, element.Element]] = None,
):
    """
    Find a single element based on a CSS Selector string.

    When a node is supplied and the browser reports that it could not
    find that node, the node is updated and the query is tried once more.
    A second failure of that kind gives None.

    :param selector: CSS Selector(s)
    :type selector: str
    :param _node: internal use
    :return: the matching element, or None when there is no match
    :raises ProtocolException:
    When the query fails and no node was supplied.
    """
    selector = selector.strip()

    async def _search_root():
        # Root to query: the whole document when no node is supplied,
        # an iframe's content document, or the node itself.
        if not _node:
            return await self.send(cdp.dom.get_document(-1, True))
        if _node.node_name == "IFRAME":
            return _node.content_document
        return _node

    node_id = None
    doc = None
    # Two attempts at most. The previous guard stored a flag with
    # `_node.__last = True` (name-mangled inside the class) but read it
    # back with getattr(_node, "__last"), so the guard never triggered
    # and the retry could recurse without bound.
    for attempt in range(2):
        doc = await _search_root()
        try:
            node_id = await self.send(
                cdp.dom.query_selector(doc.node_id, selector)
            )
            break
        except ProtocolException as e:
            if _node is None:
                await self.send(cdp.dom.disable())
                raise
            if "could not find node" not in e.message.lower():
                # Other protocol errors for a supplied node are treated
                # as "no match", as before.
                break
            if attempt:
                # Single-element lookup: "nothing" is None, not a list.
                return None
            # If supplied node is not found,
            # the dom has changed since acquiring the element,
            # therefore, update our passed node and try again.
            await _node.update()
    if not node_id:
        return None
    node = util.filter_recurse(doc, lambda n: n.node_id == node_id)
    if not node:
        return None
    return element.create(node, self, doc)
453
+
454
async def find_elements_by_text(
    self,
    text: str,
) -> List[element.Element]:
    """
    Returns elements which match the given text.
    Note: This may (or will) also return any other element
    (like inline scripts), which happen to contain that text.
    For text nodes, the parent element is returned when there is one.
    Text nodes inside iframes are matched case-insensitively.
    :param text: text to search for (surrounding whitespace is stripped)
    :return: list of matching elements (possibly empty)
    """
    text = text.strip()
    doc = await self.send(cdp.dom.get_document(-1, True))
    search_id, nresult = await self.send(
        cdp.dom.perform_search(text, True)
    )
    if nresult:
        node_ids = await self.send(
            cdp.dom.get_search_results(search_id, 0, nresult)
        )
    else:
        node_ids = []
    await self.send(cdp.dom.discard_search_results(search_id))
    items = []
    for nid in node_ids:
        node = util.filter_recurse(doc, lambda n: n.node_id == nid)
        if not node:
            node = await self.send(cdp.dom.resolve_node(node_id=nid))
            if not node:
                continue
        try:
            elem = element.create(node, self, doc)
        except Exception:
            # Skip nodes that cannot be wrapped. Only Exception is
            # caught, so task cancellation and KeyboardInterrupt
            # still propagate.
            continue
        if elem.node_type == 3:
            # If found element is a text node (which is plain text,
            # and useless for our purpose), we return the parent element
            # of the node (which is often a tag which can have text
            # between their opening and closing tags (that is most tags,
            # except for example "img" and "video", "br").
            if not elem.parent:
                # Check if parent actually has a parent
                # and update it to be absolutely sure.
                await elem.update()
            items.append(
                elem.parent or elem
            )  # When there's no parent, use the text node itself.
            continue
        else:
            # Add the element itself.
            items.append(elem)
    # Since we already fetched the entire doc, including shadow and frames,
    # let's also search through the iframes.
    iframes = util.filter_recurse_all(
        doc, lambda node: node.node_name == "IFRAME"
    )
    if iframes:
        needle = text.lower()  # Loop-invariant, so computed once.
        iframes_elems = [
            element.create(iframe, self, iframe.content_document)
            for iframe in iframes
        ]
        for iframe_elem in iframes_elems:
            if iframe_elem.content_document:
                iframe_text_nodes = util.filter_recurse_all(
                    iframe_elem,
                    lambda node: node.node_type == 3  # noqa
                    and needle in node.node_value.lower(),
                )
                if iframe_text_nodes:
                    iframe_text_elems = [
                        element.create(text_node, self, iframe_elem.tree)
                        for text_node in iframe_text_nodes
                    ]
                    # Leave out missing parents, so that callers
                    # never get None entries in the result.
                    items.extend(
                        text_elem.parent
                        for text_elem in iframe_text_elems
                        if text_elem.parent
                    )
    await self.send(cdp.dom.disable())
    return items
537
+
538
async def find_element_by_text(
    self,
    text: str,
    best_match: Optional[bool] = False,
    return_enclosing_element: Optional[bool] = True,
) -> Union[element.Element, None]:
    """
    Finds and returns the first element containing <text>, or best match.
    :param text: text to search for (surrounding whitespace is stripped)
    :param best_match:
    When True, which is MUCH more expensive (thus much slower),
    will find the closest match based on length.
    When searching for "login", you probably want the button element,
    and not thousands of tags/scripts containing the "login" string.
    :type best_match: bool
    :param return_enclosing_element:
    Accepted for interface compatibility; this method does not read it.
    :return: the matching element, or None when nothing matched
    """
    doc = await self.send(cdp.dom.get_document(-1, True))
    text = text.strip()
    search_id, nresult = await self.send(
        cdp.dom.perform_search(text, True)
    )
    # Only ask for results when there are any,
    # the same way find_elements_by_text does it.
    if nresult:
        node_ids = await self.send(
            cdp.dom.get_search_results(search_id, 0, nresult)
        )
    else:
        node_ids = []
    await self.send(cdp.dom.discard_search_results(search_id))
    if not node_ids:
        node_ids = []
    items = []
    for nid in node_ids:
        node = util.filter_recurse(doc, lambda n: n.node_id == nid)
        try:
            elem = element.create(node, self, doc)
        except Exception:
            # Skip nodes that cannot be wrapped. Only Exception is
            # caught, so task cancellation and KeyboardInterrupt
            # still propagate.
            continue
        if elem.node_type == 3:
            # If found element is a text node
            # (which is plain text, and useless for our purpose),
            # then return the parent element of the node
            # (which is often a tag which can have text between their
            # opening and closing tags (that is most tags,
            # except for example "img" and "video", "br").
            if not elem.parent:
                # Check if parent has a parent, and update it to be sure.
                await elem.update()
            items.append(
                elem.parent or elem
            )  # When it really has no parent, use the text node itself
            continue
        else:
            # Add the element itself
            items.append(elem)
    # Since the entire doc is already fetched, including shadow and frames,
    # also search through the iframes.
    iframes = util.filter_recurse_all(
        doc, lambda node: node.node_name == "IFRAME"
    )
    if iframes:
        needle = text.lower()  # Loop-invariant, so computed once.
        iframes_elems = [
            element.create(iframe, self, iframe.content_document)
            for iframe in iframes
        ]
        for iframe_elem in iframes_elems:
            iframe_text_nodes = util.filter_recurse_all(
                iframe_elem,
                lambda node: node.node_type == 3  # noqa
                and needle in node.node_value.lower(),
            )
            if iframe_text_nodes:
                iframe_text_elems = [
                    element.create(text_node, self, iframe_elem.tree)
                    for text_node in iframe_text_nodes
                ]
                items.extend(
                    text_elem.parent for text_elem in iframe_text_elems
                )
    try:
        # Iframe text nodes without a parent leave None entries behind.
        # Drop them, otherwise the best_match comparison would fail on
        # `None.text_all`.
        candidates = [el for el in items if el]
        if not candidates:
            return None
        if best_match:
            return min(
                candidates,
                key=lambda el: abs(len(text) - len(el.text_all)),
            )
        # Return the first result
        return candidates[0]
    finally:
        await self.send(cdp.dom.disable())
630
+
631
async def back(self):
    """Go one step back in history (runs window.history.back())."""
    command = cdp.runtime.evaluate("window.history.back()")
    await self.send(command)
634
+
635
async def forward(self):
    """Go one step forward in history (runs window.history.forward())."""
    command = cdp.runtime.evaluate("window.history.forward()")
    await self.send(command)
638
+
639
async def reload(
    self,
    ignore_cache: Optional[bool] = True,
    script_to_evaluate_on_load: Optional[str] = None,
):
    """
    Reload the current page.
    :param ignore_cache: When True (default), the cache is ignored
    and the items are downloaded again.
    :param script_to_evaluate_on_load: Script to run on load.
    """
    reload_command = cdp.page.reload(
        ignore_cache=ignore_cache,
        script_to_evaluate_on_load=script_to_evaluate_on_load,
    )
    await self.send(reload_command)
656
+
657
async def evaluate(
    self, expression: str, await_promise=False, return_by_value=True
):
    """
    Evaluate a JavaScript expression in this tab.

    :param expression: the JavaScript code to evaluate
    :type expression: str
    :param await_promise: passed on to cdp.runtime.evaluate
    :param return_by_value:
    When True (default), the plain value of the result is returned.
    Falsy values such as 0, False and "" are returned as they are;
    None is returned only when the result carries no value.
    When False, a tuple of (remote_object, errors) is returned.
    :raises ProtocolException: when the evaluation reports errors.
    """
    remote_object, errors = await self.send(
        cdp.runtime.evaluate(
            expression=expression,
            user_gesture=True,
            await_promise=await_promise,
            return_by_value=return_by_value,
            allow_unsafe_eval_blocked_by_csp=True,
        )
    )
    if errors:
        raise ProtocolException(errors)
    if not remote_object:
        return None
    if not return_by_value:
        return remote_object, errors
    # Compare against None instead of testing truthiness, so that
    # legitimate falsy results (0, False, "") are not lost.
    if remote_object.value is not None:
        return remote_object.value
    return None
677
+
678
async def js_dumps(
    self, obj_name: str, return_by_value: Optional[bool] = True
) -> Union[
    Dict,
    Tuple[cdp.runtime.RemoteObject, cdp.runtime.ExceptionDetails],
]:
    """
    Dump Given js object with its properties and values as a dict.
    Note: Complex objects might not be serializable,
    therefore this method is not a "source of truth"

    Two JavaScript snippets are prepared. The first one is evaluated,
    and only when it reports exception details is the second one tried.
    If the last evaluation still reports exception details,
    a ProtocolException is raised.
    `obj_name` is inserted into the JavaScript source as-is.

    :param obj_name: the js object to dump
    :type obj_name: str
    :param return_by_value: If you want an tuple of cdp objects
    (returnvalue, errors), then set this to False.
    When True, the value is returned only if it is truthy;
    otherwise the method returns None.
    :type return_by_value: bool
    :raises ProtocolException:
    When both snippets end with exception details.

    Example
    -------

    x = await self.js_dumps('window')
    print(x)
    '...{
    'pageYOffset': 0,
    'visualViewport': {},
    'screenX': 10,
    'screenY': 10,
    'outerWidth': 1050,
    'outerHeight': 832,
    'devicePixelRatio': 1,
    'screenLeft': 10,
    'screenTop': 10,
    'styleMedia': {},
    'onsearch': None,
    'isSecureContext': True,
    'trustedTypes': {},
    'performance': {'timeOrigin': 1707823094767.9,
    'timing': {'connectStart': 0,
    'navigationStart': 1707823094768,
    ]...
    """
    # First variant: ___dump copies properties, descending into nested
    # objects until depth 2; ___dumpY applies it to every own property
    # name found along the prototype chain of the object.
    # NOTE(review): `obj[k] === NaN` is always false in JavaScript,
    # so that part of the condition looks like it has no effect - confirm.
    js_code_a = (
        """
function ___dump(obj, _d = 0) {
let _typesA = ['object', 'function'];
let _typesB = ['number', 'string', 'boolean'];
if (_d == 2) {
console.log('maxdepth reached for ', obj);
return
}
let tmp = {}
for (let k in obj) {
if (obj[k] == window) continue;
let v;
try {
if (obj[k] === null
|| obj[k] === undefined
|| obj[k] === NaN) {
console.log('obj[k] is null or undefined or Nan',
k, '=>', obj[k])
tmp[k] = obj[k];
continue
}
} catch (e) {
tmp[k] = null;
continue
}
if (_typesB.includes(typeof obj[k])) {
tmp[k] = obj[k]
continue
}
try {
if (typeof obj[k] === 'function') {
tmp[k] = obj[k].toString()
continue
}
if (typeof obj[k] === 'object') {
tmp[k] = ___dump(obj[k], _d + 1);
continue
}
} catch (e) {}
try {
tmp[k] = JSON.stringify(obj[k])
continue
} catch (e) {
}
try {
tmp[k] = obj[k].toString();
continue
} catch (e) {}
}
return tmp
}
function ___dumpY(obj) {
var objKeys = (obj) => {
var [target, result] = [obj, []];
while (target !== null) {
result = result.concat(
Object.getOwnPropertyNames(target)
);
target = Object.getPrototypeOf(target);
}
return result;
}
return Object.fromEntries(
objKeys(obj).map(_ => [_, ___dump(obj[_])]))
}
___dumpY( %s )
"""
        % obj_name
    )
    # Second variant: skips functions and the 'enabledPlugin' key, and
    # uses a WeakSet to avoid visiting the same object twice.
    # NOTE(review): `recurse` is not defined inside this snippet, so the
    # nested-object branch presumably throws and is swallowed by the
    # surrounding try/catch - confirm whether that is intended.
    js_code_b = (
        """
((obj, visited = new WeakSet()) => {
if (visited.has(obj)) {
return {}
}
visited.add(obj)
var result = {}, _tmp;
for (var i in obj) {
try {
if (i === 'enabledPlugin'
|| typeof obj[i] === 'function') {
continue;
} else if (typeof obj[i] === 'object') {
_tmp = recurse(obj[i], visited);
if (Object.keys(_tmp).length) {
result[i] = _tmp;
}
} else {
result[i] = obj[i];
}
} catch (error) {
// console.error('Error:', error);
}
}
return result;
})(%s)
"""
        % obj_name
    )
    # No self.evaluate here to prevent infinite loop on certain expressions
    remote_object, exception_details = await self.send(
        cdp.runtime.evaluate(
            js_code_a,
            await_promise=True,
            return_by_value=return_by_value,
            allow_unsafe_eval_blocked_by_csp=True,
        )
    )
    if exception_details:
        # Try second variant
        remote_object, exception_details = await self.send(
            cdp.runtime.evaluate(
                js_code_b,
                await_promise=True,
                return_by_value=return_by_value,
                allow_unsafe_eval_blocked_by_csp=True,
            )
        )
    if exception_details:
        raise ProtocolException(exception_details)
    if return_by_value:
        # A falsy value falls through here, and the method returns None.
        if remote_object.value:
            return remote_object.value
    else:
        return remote_object, exception_details
844
+
845
async def close(self):
    """
    Close the current target (ie: tab,window,page).
    Does nothing when there is no target or the target has no id.
    """
    if not self.target or not self.target.target_id:
        return
    close_command = cdp.target.close_target(
        target_id=self.target.target_id
    )
    await self.send(close_command)
851
+
852
async def get_window(self) -> Tuple[
    cdp.browser.WindowID, cdp.browser.Bounds
]:
    """
    Ask the browser for the window that holds this target.
    :return: tuple of (window id, window bounds)
    """
    lookup = cdp.browser.get_window_for_target(self.target_id)
    wid, window_bounds = await self.send(lookup)
    return wid, window_bounds
860
+
861
async def get_content(self):
    """
    Get the current page source (html): the outer html
    of the document node, as reported by the browser.
    """
    document: cdp.dom.Node = await self.send(
        cdp.dom.get_document(-1, True)
    )
    outer_html_command = cdp.dom.get_outer_html(
        backend_node_id=document.backend_node_id
    )
    return await self.send(outer_html_command)
867
+
868
async def maximize(self):
    """Maximize the window (set_window_state with state "maximize")."""
    outcome = await self.set_window_state(state="maximize")
    return outcome
871
+
872
async def minimize(self):
    """Minimize the window (set_window_state with state "minimize")."""
    outcome = await self.set_window_state(state="minimize")
    return outcome
875
+
876
async def fullscreen(self):
    """Make the page/tab/window fullscreen"""
    return await self.set_window_state(state="fullscreen")
879
+
880
async def medimize(self):
    """
    Put the window into the "normal" state,
    using the default position and size of set_window_state.
    """
    outcome = await self.set_window_state(state="normal")
    return outcome
882
+
883
async def set_window_size(self, left=0, top=0, width=1280, height=1024):
    """
    Position and resize the window.
    All values are handed to set_window_state,
    which is left at its default state.
    :param left:
    Pixels from the left of the screen to the window top-left corner.
    :param top:
    Pixels from the top of the screen to the window top-left corner.
    :param width: width of the window in pixels
    :param height: height of the window in pixels
    """
    geometry = (left, top, width, height)
    return await self.set_window_state(*geometry)
894
+
895
async def activate(self):
    """Activate this target (Eg: tab, window, page)"""
    activate_command = cdp.target.activate_target(self.target.target_id)
    await self.send(activate_command)
898
+
899
async def bring_to_front(self):
    """Same as self.activate: awaits it, and returns nothing."""
    pending_activation = self.activate()
    await pending_activation
902
+
903
async def set_window_state(
    self, left=0, top=0, width=1280, height=720, state="normal"
):
    """
    Change the size/position of the window, or its state.

    The state may be given in full (minimized, maximized, normal,
    fullscreen) or as something that leads to one of those,
    like min, mini, mi, max, ma, maxi, full, fu, no, nor.
    The first known state which contains every character
    of the input (lowercased) is the one that gets picked.
    For any state other than "normal", the window is switched to
    "normal" first, and left, top, width, and height are ignored.

    :param left:
    desired offset from left, in pixels
    :type left: int
    :param top:
    desired offset from the top, in pixels
    :type top: int
    :param width:
    desired width in pixels
    :type width: int
    :param height:
    desired height in pixels
    :type height: int
    :param state:
    can be one of the following strings:
    - normal
    - fullscreen
    - maximized
    - minimized
    :type state: str
    :raises NameError: when no known state matches the input.
    """
    available_states = ["minimized", "maximized", "fullscreen", "normal"]
    window_id: cdp.browser.WindowID
    bounds: cdp.browser.Bounds
    window_id, bounds = await self.get_window()
    # The current window is looked up before the input is validated.
    wanted = state.lower()
    state_name = next(
        (
            candidate
            for candidate in available_states
            if all(char in candidate for char in wanted)
        ),
        None,
    )
    if state_name is None:
        raise NameError(
            "could not determine any of %s from input '%s'"
            % (",".join(available_states), state)
        )
    normal = cdp.browser.WindowState.NORMAL
    window_state = getattr(
        cdp.browser.WindowState, state_name.upper(), normal
    )
    if window_state != normal:
        # min, max, full can only be used when current state == NORMAL,
        # therefore, first switch to NORMAL
        await self.set_window_state(state="normal")
        bounds = cdp.browser.Bounds(window_state=window_state)
    else:
        bounds = cdp.browser.Bounds(
            left, top, width, height, window_state
        )

    resize_command = cdp.browser.set_window_bounds(
        window_id, bounds=bounds
    )
    await self.send(resize_command)
963
+
964
async def scroll_down(self, amount=25):
    """
    Scrolls the page down.

    The distance is ``amount`` percent of the window height reported
    by :py:meth:`get_window`.

    :param amount: Number in percentage.
        25 is a quarter of page, 50 half, and 1000 is 10x the page.
    :type amount: int
    """
    _, window_bounds = await self.get_window()
    distance = window_bounds.height * (amount / 100)
    gesture = cdp.input_.synthesize_scroll_gesture(
        x=0,
        y=0,
        # A negative y distance is what this method uses to go down.
        y_distance=-distance,
        y_overscroll=0,
        x_overscroll=0,
        prevent_fling=True,
        repeat_delay_ms=0,
        speed=7777,
    )
    await self.send(gesture)
986
+
987
async def scroll_up(self, amount=25):
    """
    Scrolls the page up.

    The distance is ``amount`` percent of the window height reported
    by :py:meth:`get_window`.

    :param amount: Number in percentage.
        25 is a quarter of page, 50 half, and 1000 is 10x the page.
    :type amount: int
    """
    _, bounds = await self.get_window()
    await self.send(
        cdp.input_.synthesize_scroll_gesture(
            x=0,
            y=0,
            # A positive y distance is what this method uses to go up.
            y_distance=(bounds.height * (amount / 100)),
            # Passed explicitly so both scroll directions send the same
            # set of gesture parameters.
            y_overscroll=0,
            x_overscroll=0,
            prevent_fling=True,
            repeat_delay_ms=0,
            speed=7777,
        )
    )
1008
+
1009
async def wait_for(
    self,
    selector: Optional[str] = "",
    text: Optional[str] = "",
    timeout: Optional[Union[int, float]] = 10,
) -> element.Element:
    """
    Block until an element matching ``selector`` or ``text`` shows up.

    ``selector`` is looked up with query_selector and ``text`` with
    find_element_by_text. When both are given, only ``selector`` is
    used. When neither is given, nothing is looked up and None is
    returned.
    The lookup is repeated, with a 0.5 second pause between attempts,
    until it yields a result or ``timeout`` seconds have elapsed.

    :param selector: css selector
    :param text: text
    :param timeout: maximum number of seconds to wait
    :return: Element
    :raises: asyncio.TimeoutError
    """
    loop = asyncio.get_running_loop()
    started = loop.time()
    if selector:
        lookup, wanted = self.query_selector, selector
    elif text:
        lookup, wanted = self.find_element_by_text, text
    else:
        return None

    found = await lookup(wanted)
    while not found:
        # Retry first, then check the deadline, then pause.
        found = await lookup(wanted)
        if loop.time() - started > timeout:
            raise asyncio.TimeoutError(
                "Time ran out while waiting for: {%s}" % wanted
            )
        await self.sleep(0.5)
    return found
1049
+
1050
async def download_file(
    self, url: str, filename: Optional[PathLike] = None
):
    """
    Downloads the file by the given url.

    The download is triggered from inside the page: a script fetches
    the url, wraps the response in a blob and clicks a temporary
    ``<a download>`` element.
    When no download behavior has been configured on this tab yet,
    a ``downloads`` folder is created in the current working directory,
    set as download path, and a warning is emitted.

    :param url: The URL of the file.
    :param filename: The name for the file.
        If not specified, the name is composed from the url file name
    """
    import json

    if not self._download_behavior:
        directory_path = pathlib.Path.cwd() / "downloads"
        directory_path.mkdir(exist_ok=True)
        await self.set_download_path(directory_path)

        warnings.warn(
            f"No download path set, so creating and using a default of "
            f"{directory_path}"
        )
    if not filename:
        # Last path segment of the url, without the query string.
        filename = url.rsplit("/", 1)[-1]
        filename = filename.split("?")[0]
    # json.dumps yields correctly quoted and escaped JavaScript string
    # literals, so quotes or backslashes in the url or file name can
    # neither break the script nor inject code into it.
    code = """
    (elem) => {
        async function _downloadFile(
            imageSrc,
            nameOfDownload,
        ) {
            const response = await fetch(imageSrc);
            const blobImage = await response.blob();
            const href = URL.createObjectURL(blobImage);
            const anchorElement = document.createElement('a');
            anchorElement.href = href;
            anchorElement.download = nameOfDownload;
            document.body.appendChild(anchorElement);
            anchorElement.click();
            setTimeout(() => {
                document.body.removeChild(anchorElement);
                window.URL.revokeObjectURL(href);
            }, 500);
        }
        _downloadFile(%s, %s)
    }
    """ % (
        json.dumps(url),
        json.dumps(str(filename)),
    )
    body = (await self.query_selector_all("body"))[0]
    await body.update()
    await self.send(
        cdp.runtime.call_function_on(
            code,
            object_id=body.object_id,
            arguments=[cdp.runtime.CallArgument(object_id=body.object_id)],
        )
    )
1105
+
1106
async def save_screenshot(
    self,
    filename: Optional[PathLike] = "auto",
    format: Optional[str] = "png",
    full_page: Optional[bool] = False,
) -> str:
    """
    Saves a screenshot of the page.
    This is not the same as :py:obj:`Element.save_screenshot`,
    which saves a screenshot of a single element only.

    :param filename: uses this as the save path.
        When empty or "auto", the name is built from the host name and
        last path segment of the target url plus a timestamp.
    :type filename: PathLike
    :param format: jpeg (or jpg), png or webp, case-insensitive
        (defaults to png)
    :type format: str
    :param full_page:
        When False (default), it captures the current viewport.
        When True, it captures the entire page.
    :type full_page: bool
    :return: The path/filename of the saved screenshot.
    :rtype: str
    :raises ValueError: when ``format`` is not a supported image format
    :raises ProtocolException: when the browser returned no image data
    """
    import base64
    import datetime
    import urllib.parse

    # Every accepted spelling -> (format name sent over CDP, extension).
    known_formats = {
        "jpg": ("jpeg", ".jpg"),
        "jpeg": ("jpeg", ".jpg"),
        "png": ("png", ".png"),
        "webp": ("webp", ".webp"),
    }
    requested = (format or "png").lower()
    # Fail before any browser interaction, with a clear message.
    if requested not in known_formats:
        raise ValueError(
            "Unsupported screenshot format: '%s' (expected one of: %s)"
            % (format, ", ".join(known_formats))
        )
    cdp_format, ext = known_formats[requested]

    await self.sleep()  # Update the target's URL
    if not filename or filename == "auto":
        parsed = urllib.parse.urlparse(self.target.url)
        # parsed.path never includes the query string.
        last_part = parsed.path.split("/")[-1]
        dt_str = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        candidate = f"{parsed.hostname}__{last_part}_{dt_str}"
        path = pathlib.Path(candidate + ext)
    else:
        path = pathlib.Path(filename)
    path.parent.mkdir(parents=True, exist_ok=True)
    data = await self.send(
        cdp.page.capture_screenshot(
            format_=cdp_format, capture_beyond_viewport=full_page
        )
    )
    if not data:
        raise ProtocolException(
            "Could not take screenshot. "
            "Most possible cause is the page "
            "has not finished loading yet."
        )
    # The screenshot data arrives base64-encoded.
    path.write_bytes(base64.b64decode(data))
    return str(path)
1167
+
1168
async def set_download_path(self, path: PathLike):
    """
    Sets the download path.

    Downloads are allowed and stored in the given folder. The resolved
    (absolute) path is sent to the browser and remembered on this tab
    as ``_download_behavior``.

    :param path: target folder, as a string or a pathlib.Path
    :type path: PathLike
    """
    # Wrapping in pathlib.Path makes plain string paths work as well;
    # calling .resolve() directly only works for Path objects.
    resolved = str(pathlib.Path(path).resolve())
    await self.send(
        cdp.browser.set_download_behavior(
            behavior="allow", download_path=resolved
        )
    )
    self._download_behavior = ["allow", resolved]
1180
+
1181
async def get_all_linked_sources(self) -> List["element.Element"]:
    """
    Get all elements of tag: a, link, img, script, meta.

    Every match of the selector is passed through ``element.create``
    together with this tab.

    :return: list of the created elements
    """
    matches = await self.query_selector_all(
        selector="a,link,img,script,meta"
    )
    linked = []
    for match in matches:
        linked.append(element.create(match, self))
    return linked
1187
+
1188
async def get_all_urls(self, absolute=True) -> List[str]:
    """
    Convenience function, which returns all links (a,link,img,script,meta).

    With ``absolute=False`` one entry per element is returned: its
    ``src``, or else its ``href``, exactly as found.
    With ``absolute=True`` every ``src`` and ``href`` attribute is joined
    onto the first three "/"-separated parts of the current url
    (scheme and host). Values containing "#", and values containing
    none of "http", "//" or "/", are skipped, as are results that do
    not start with "http", "//" or "ws".

    :param absolute:
        Try to build all the links in absolute form
        instead of "as is", often relative.
    :return: List of URLs.
    """
    import urllib.parse

    assets = await self.query_selector_all(
        selector="a,link,img,script,meta"
    )
    urls = []
    for asset in assets:
        if not absolute:
            urls.append(asset.src or asset.href)
            continue
        for name, value in asset.attrs.items():
            if name not in ("src", "href"):
                continue
            if "#" in value:
                continue
            looks_like_link = (
                "http" in value or "//" in value or "/" in value
            )
            if not looks_like_link:
                continue
            base = "/".join(self.url.rsplit("/")[:3])
            joined = urllib.parse.urljoin(base, value)
            if joined.startswith(("http", "//", "ws")):
                urls.append(joined)
    return urls
1219
+
1220
async def verify_cf(self):
    """
    (An attempt) Click the checkbox next to "verify you are human".

    Waits for an element containing that text, then walks up its
    ancestors until one of them contains an ``input[type=checkbox]``.
    When a checkbox is found, the mouse is moved to it and it is
    clicked. When none is found, nothing is clicked.
    Errors raised by :py:meth:`wait_for` are not handled here.

    :return: True when a checkbox was found and clicked, else False.
    :rtype: bool
    """
    checkbox = None
    checkbox_sibling = await self.wait_for(text="verify you are human")
    ancestor = checkbox_sibling.parent if checkbox_sibling else None
    while ancestor:
        checkbox = await ancestor.query_selector("input[type=checkbox]")
        if checkbox:
            break
        ancestor = ancestor.parent
    # The search can run out of ancestors without finding a checkbox;
    # in that case there is nothing to interact with.
    if not checkbox:
        return False
    await checkbox.mouse_move()
    await checkbox.mouse_click()
    return True
1233
+
1234
async def get_document(self):
    """Send the ``cdp.dom.get_document`` command and return its result."""
    command = cdp.dom.get_document()
    return await self.send(command)
1236
+
1237
async def get_flattened_document(self):
    """
    Send the ``cdp.dom.get_flattened_document`` command
    and return its result.
    """
    command = cdp.dom.get_flattened_document()
    return await self.send(command)
1239
+
1240
async def get_local_storage(self):
    """
    Get local storage items as dict of strings.
    Proper deserialization may need to be done.

    The storage is addressed by the origin (scheme and host) of the
    current url.

    :return: mapping built from the first two fields of each item
    :rtype: dict
    """
    if not self.target.url:
        # NOTE(review): awaiting the tab presumably refreshes the target
        # info so that the url becomes available - confirm.
        await self
    # Keep "scheme:", "" and "host". Slicing with [:3] rather than [:-1]
    # also works for urls without a path part, such as
    # "https://example.com", where [:-1] would drop the host.
    origin = "/".join(self.url.split("/", 3)[:3])
    items = await self.send(
        cdp.dom_storage.get_dom_storage_items(
            cdp.dom_storage.StorageId(
                is_local_storage=True, security_origin=origin
            )
        )
    )
    return {item[0]: item[1] for item in items}
1259
+
1260
async def set_local_storage(self, items: dict):
    """
    Set local storage.
    Dict items must be strings.
    Simple types will be converted to strings automatically.

    One command per item is sent, all of them concurrently. The
    storage is addressed by the origin (scheme and host) of the
    current url.

    :param items: dict containing {key:str, value:str}
    :type items: dict[str,str]
    """
    if not self.target.url:
        # NOTE(review): awaiting the tab presumably refreshes the target
        # info so that the url becomes available - confirm.
        await self
    # Keep "scheme:", "" and "host". Slicing with [:3] rather than [:-1]
    # also works for urls without a path part, such as
    # "https://example.com", where [:-1] would drop the host.
    origin = "/".join(self.url.split("/", 3)[:3])
    await asyncio.gather(
        *[
            self.send(
                cdp.dom_storage.set_dom_storage_item(
                    storage_id=cdp.dom_storage.StorageId(
                        is_local_storage=True, security_origin=origin
                    ),
                    key=str(key),
                    value=str(val),
                )
            )
            for key, val in items.items()
        ]
    )
1285
+
1286
def __call__(
    self,
    text: Optional[str] = "",
    selector: Optional[str] = "",
    timeout: Optional[Union[int, float]] = 10,
):
    """
    Shortcut for :py:meth:`wait_for`, so that ``await tab("some text")``
    waits for an element containing that text.

    Note that the parameter order differs from wait_for: here ``text``
    comes first. The arguments are therefore forwarded by keyword;
    forwarding them positionally would search the text as a css
    selector and the selector as text.

    :param text: text to look for
    :type text: str
    :param selector: css selector string
    :type selector: str
    :param timeout: maximum number of seconds to wait
    :type timeout: int or float
    :return: the awaitable returned by wait_for
    """
    return self.wait_for(selector=selector, text=text, timeout=timeout)
1299
+
1300
def __eq__(self, other: Tab):
    """
    Compare by target: equal when ``other.target == self.target``.

    Objects without a usable ``target`` compare as not equal.
    """
    try:
        same_target = other.target == self.target
    except (AttributeError, TypeError):
        return False
    return same_target
1305
+
1306
+ def __getattr__(self, item):
1307
+ try:
1308
+ return getattr(self._target, item)
1309
+ except AttributeError:
1310
+ raise AttributeError(
1311
+ f'"{self.__class__.__name__}" has no attribute "%s"' % item
1312
+ )
1313
+
1314
def __repr__(self):
    """
    Short description: class name, target id, target type and,
    when the target has one, its url.
    """
    url = self.target.url
    extra = f"[url: {url}]" if url else ""
    return f"<{type(self).__name__} [{self.target_id}] [{self.type_}] {extra}>"  # noqa