scrapling 0.2.8__py3-none-any.whl → 0.2.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +4 -4
- scrapling/core/custom_types.py +88 -6
- scrapling/core/storage_adaptors.py +5 -6
- scrapling/core/translator.py +2 -2
- scrapling/core/utils.py +29 -27
- scrapling/defaults.py +2 -1
- scrapling/engines/camo.py +89 -15
- scrapling/engines/constants.py +4 -4
- scrapling/engines/pw.py +158 -83
- scrapling/engines/static.py +91 -48
- scrapling/engines/toolbelt/__init__.py +3 -3
- scrapling/engines/toolbelt/custom.py +20 -22
- scrapling/engines/toolbelt/fingerprints.py +3 -3
- scrapling/engines/toolbelt/navigation.py +21 -8
- scrapling/fetchers.py +229 -14
- scrapling/parser.py +49 -21
- {scrapling-0.2.8.dist-info → scrapling-0.2.9.dist-info}/METADATA +32 -16
- scrapling-0.2.9.dist-info/RECORD +47 -0
- tests/fetchers/async/__init__.py +0 -0
- tests/fetchers/async/test_camoufox.py +95 -0
- tests/fetchers/async/test_httpx.py +83 -0
- tests/fetchers/async/test_playwright.py +99 -0
- tests/fetchers/sync/__init__.py +0 -0
- tests/fetchers/sync/test_camoufox.py +68 -0
- tests/fetchers/sync/test_httpx.py +82 -0
- tests/fetchers/sync/test_playwright.py +87 -0
- tests/fetchers/test_utils.py +90 -122
- tests/parser/test_automatch.py +64 -9
- tests/parser/test_general.py +260 -218
- scrapling-0.2.8.dist-info/RECORD +0 -42
- tests/fetchers/test_camoufox.py +0 -65
- tests/fetchers/test_httpx.py +0 -68
- tests/fetchers/test_playwright.py +0 -77
- {scrapling-0.2.8.dist-info → scrapling-0.2.9.dist-info}/LICENSE +0 -0
- {scrapling-0.2.8.dist-info → scrapling-0.2.9.dist-info}/WHEEL +0 -0
- {scrapling-0.2.8.dist-info → scrapling-0.2.9.dist-info}/top_level.txt +0 -0
scrapling/__init__.py
CHANGED
```diff
@@ -1,12 +1,12 @@
 # Declare top-level shortcuts
 from scrapling.core.custom_types import AttributesHandler, TextHandler
-from scrapling.fetchers import (CustomFetcher, Fetcher,
-                                StealthyFetcher)
+from scrapling.fetchers import (AsyncFetcher, CustomFetcher, Fetcher,
+                                PlayWrightFetcher, StealthyFetcher)
 from scrapling.parser import Adaptor, Adaptors
 
 __author__ = "Karim Shoair (karim.shoair@pm.me)"
-__version__ = "0.2.8"
+__version__ = "0.2.9"
 __copyright__ = "Copyright (c) 2024 Karim Shoair"
 
 
-__all__ = ['Adaptor', 'Fetcher', 'StealthyFetcher', 'PlayWrightFetcher']
+__all__ = ['Adaptor', 'Fetcher', 'AsyncFetcher', 'StealthyFetcher', 'PlayWrightFetcher']
```
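In practice this hunk means two things: `AsyncFetcher` is now exported from the package root, and `PlayWrightFetcher` is actually imported, so `__all__` finally matches what the package exposes. A minimal sketch of the 0.2.9 import surface, assuming the async methods mirror the sync ones as the new async test files suggest (the URL is a placeholder):

```python
import asyncio

from scrapling import AsyncFetcher, Fetcher

# The sync shortcut works as it did in 0.2.8
page = Fetcher().get('https://example.com')


# New in 0.2.9: an async counterpart returning the same adaptor-style response
async def main():
    return await AsyncFetcher().get('https://example.com')


print(page.status, asyncio.run(main()).status)
```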
scrapling/core/custom_types.py
CHANGED
```diff
@@ -14,11 +14,70 @@ class TextHandler(str):
     __slots__ = ()
 
     def __new__(cls, string):
-
-        if type(string) is str:
+        if isinstance(string, str):
             return super().__new__(cls, string)
-
-
+        return super().__new__(cls, '')
+
+    # Make methods from original `str` class return `TextHandler` instead of returning `str` again
+    # Of course, this stupid workaround is only so we can keep the auto-completion working without issues in your IDE
+    # and I made sonnet write it for me :)
+    def strip(self, chars=None):
+        return TextHandler(super().strip(chars))
+
+    def lstrip(self, chars=None):
+        return TextHandler(super().lstrip(chars))
+
+    def rstrip(self, chars=None):
+        return TextHandler(super().rstrip(chars))
+
+    def capitalize(self):
+        return TextHandler(super().capitalize())
+
+    def casefold(self):
+        return TextHandler(super().casefold())
+
+    def center(self, width, fillchar=' '):
+        return TextHandler(super().center(width, fillchar))
+
+    def expandtabs(self, tabsize=8):
+        return TextHandler(super().expandtabs(tabsize))
+
+    def format(self, *args, **kwargs):
+        return TextHandler(super().format(*args, **kwargs))
+
+    def format_map(self, mapping):
+        return TextHandler(super().format_map(mapping))
+
+    def join(self, iterable):
+        return TextHandler(super().join(iterable))
+
+    def ljust(self, width, fillchar=' '):
+        return TextHandler(super().ljust(width, fillchar))
+
+    def rjust(self, width, fillchar=' '):
+        return TextHandler(super().rjust(width, fillchar))
+
+    def swapcase(self):
+        return TextHandler(super().swapcase())
+
+    def title(self):
+        return TextHandler(super().title())
+
+    def translate(self, table):
+        return TextHandler(super().translate(table))
+
+    def zfill(self, width):
+        return TextHandler(super().zfill(width))
+
+    def replace(self, old, new, count=-1):
+        return TextHandler(super().replace(old, new, count))
+
+    def upper(self):
+        return TextHandler(super().upper())
+
+    def lower(self):
+        return TextHandler(super().lower())
+    ##############
 
     def sort(self, reverse: bool = False) -> str:
         """Return a sorted version of the string"""
```
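The point of wrapping every inherited `str` method: in 0.2.8, any ordinary string operation on a `TextHandler` returned a plain `str`, silently dropping Scrapling's extra helpers mid-chain. A small sketch of what the overrides buy, assuming `re_first` keeps its existing behavior:

```python
from scrapling.core.custom_types import TextHandler

text = TextHandler('  Price: $1,299.00  ')
cleaned = text.strip().lower()  # returned a plain str in 0.2.8, a TextHandler now

print(type(cleaned).__name__)            # TextHandler — IDE auto-completion survives
print(cleaned.re_first(r'\$[\d,.]+\d'))  # custom helpers still work mid-chain: $1,299.00
```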
```diff
@@ -30,11 +89,21 @@ class TextHandler(str):
         data = re.sub(' +', ' ', data)
         return self.__class__(data.strip())
 
+    # For easy copy-paste from Scrapy/parsel code when needed :)
+    def get(self, default=None):
+        return self
+
+    def get_all(self):
+        return self
+
+    extract = get_all
+    extract_first = get
+
     def json(self) -> Dict:
         """Return json response if the response is jsonable otherwise throw error"""
-        # Using
+        # Using str function as a workaround for orjson issue with subclasses of str
         # Check this out: https://github.com/ijl/orjson/issues/445
-        return loads(self)
+        return loads(str(self))
 
     def re(
             self, regex: Union[str, Pattern[str]], replace_entities: bool = True, clean_match: bool = False,
@@ -127,6 +196,19 @@ class TextHandlers(List[TextHandler]):
             return result
         return default
 
+    # For easy copy-paste from Scrapy/parsel code when needed :)
+    def get(self, default=None):
+        """Returns the first item of the current list
+        :param default: the default value to return if the current list is empty
+        """
+        return self[0] if len(self) > 0 else default
+
+    def extract(self):
+        return self
+
+    extract_first = get
+    get_all = extract
+
 
 class AttributesHandler(Mapping):
     """A read-only mapping to use instead of the standard dictionary for the speed boost but at the same time I use it to add more functionalities.
```
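The `get`/`get_all`/`extract`/`extract_first` aliases exist so parsel/Scrapy selector code can be pasted onto Scrapling types unchanged, and the `loads(str(self))` change makes `.json()` usable on a `str` subclass at all. A quick sketch of both:

```python
from scrapling.core.custom_types import TextHandler, TextHandlers

results = TextHandlers([TextHandler('{"id": 1}'), TextHandler('{"id": 2}')])

print(results.get())                     # '{"id": 1}' — parsel-style first item
print(results.extract_first())           # same item, Scrapy-style spelling
print(TextHandlers([]).get('fallback'))  # 'fallback' when the list is empty
print(results.get().json())              # {'id': 1} — str(self) sidesteps orjson issue #445
```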
scrapling/core/storage_adaptors.py

CHANGED

```diff
@@ -1,4 +1,3 @@
-import logging
 import sqlite3
 import threading
 from abc import ABC, abstractmethod
@@ -9,7 +8,7 @@ from lxml import html
 from tldextract import extract as tld
 
 from scrapling.core._types import Dict, Optional, Union
-from scrapling.core.utils import _StorageTools,
+from scrapling.core.utils import _StorageTools, log, lru_cache
 
 
 class StorageSystemMixin(ABC):
@@ -20,7 +19,7 @@ class StorageSystemMixin(ABC):
         """
         self.url = url
 
-    @
+    @lru_cache(None, typed=True)
     def _get_base_url(self, default_value: str = 'default') -> str:
         if not self.url or type(self.url) is not str:
             return default_value
@@ -52,7 +51,7 @@ class StorageSystemMixin(ABC):
         raise NotImplementedError('Storage system must implement `save` method')
 
     @staticmethod
-    @
+    @lru_cache(None, typed=True)
     def _get_hash(identifier: str) -> str:
         """If you want to hash identifier in your storage system, use this safer"""
         identifier = identifier.lower().strip()
@@ -64,7 +63,7 @@ class StorageSystemMixin(ABC):
         return f"{hash_value}_{len(identifier)}"  # Length to reduce collision chance
 
 
-@
+@lru_cache(None, typed=True)
 class SQLiteStorageSystem(StorageSystemMixin):
     """The recommended system to use, it's race condition safe and thread safe.
     Mainly built so the library can run in threaded frameworks like scrapy or threaded tools
@@ -86,7 +85,7 @@ class SQLiteStorageSystem(StorageSystemMixin):
         self.connection.execute("PRAGMA journal_mode=WAL")
         self.cursor = self.connection.cursor()
         self._setup_database()
-
+        log.debug(
             f'Storage system loaded with arguments (storage_file="{storage_file}", url="{url}")'
         )
 
```
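The new `@lru_cache(None, typed=True)` on the `SQLiteStorageSystem` class is what turns it into a de-facto singleton: caching the class call returns the same instance for the same constructor arguments. A standalone sketch of that pattern, using a hypothetical `Connection` class:

```python
from functools import lru_cache


@lru_cache(None, typed=True)  # the same decorator the diff applies to SQLiteStorageSystem
class Connection:
    def __init__(self, path: str):
        print(f'opening {path}')  # runs once per unique argument set
        self.path = path


a = Connection('storage.db')
b = Connection('storage.db')  # answered from the cache; __init__ does not run again
print(a is b)                 # True
```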
scrapling/core/translator.py
CHANGED
```diff
@@ -17,7 +17,7 @@ from cssselect.xpath import XPathExpr as OriginalXPathExpr
 from w3lib.html import HTML5_WHITESPACE
 
 from scrapling.core._types import Any, Optional, Protocol, Self
-from scrapling.core.utils import
+from scrapling.core.utils import lru_cache
 
 regex = f"[{HTML5_WHITESPACE}]+"
 replace_html5_whitespaces = re.compile(regex).sub
@@ -139,6 +139,6 @@ class TranslatorMixin:
 
 
 class HTMLTranslator(TranslatorMixin, OriginalHTMLTranslator):
-    @
+    @lru_cache(maxsize=256)
     def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str:
         return super().css_to_xpath(css, prefix)
```
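Caching `css_to_xpath` pays off because the same few CSS selectors get recompiled to XPath over and over while parsing a page. A rough sketch of the effect; note that `self` is part of the cache key, so the hit only happens when the same translator instance is reused:

```python
from scrapling.core.translator import HTMLTranslator

translator = HTMLTranslator()

xpath = translator.css_to_xpath('div.product > a.title')  # computed once
print(xpath)                                              # descendant-or-self::div[...]/a[...]

translator.css_to_xpath('div.product > a.title')          # now served from the LRU cache
```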
scrapling/core/utils.py
CHANGED
```diff
@@ -9,17 +9,36 @@ from scrapling.core._types import Any, Dict, Iterable, Union
 
 # Using cache on top of a class is brilliant way to achieve Singleton design pattern without much code
 # functools.cache is available on Python 3.9+ only so let's keep lru_cache
-from functools import lru_cache
-
+from functools import lru_cache  # isort:skip
 
 html_forbidden = {html.HtmlComment, }
-
-
-
-
-
-
-
+
+
+@lru_cache(1, typed=True)
+def setup_logger():
+    """Create and configure a logger with a standard format.
+
+    :returns: logging.Logger: Configured logger instance
+    """
+    logger = logging.getLogger('scrapling')
+    logger.setLevel(logging.INFO)
+
+    formatter = logging.Formatter(
+        fmt="[%(asctime)s] %(levelname)s: %(message)s",
+        datefmt="%Y-%m-%d %H:%M:%S"
+    )
+
+    console_handler = logging.StreamHandler()
+    console_handler.setFormatter(formatter)
+
+    # Add handler to logger (if not already added)
+    if not logger.handlers:
+        logger.addHandler(console_handler)
+
+    return logger
+
+
+log = setup_logger()
 
 
 def is_jsonable(content: Union[bytes, str]) -> bool:
@@ -33,23 +52,6 @@ def is_jsonable(content: Union[bytes, str]) -> bool:
     return False
 
 
-@cache(None, typed=True)
-def setup_basic_logging(level: str = 'debug'):
-    levels = {
-        'debug': logging.DEBUG,
-        'info': logging.INFO,
-        'warning': logging.WARNING,
-        'error': logging.ERROR,
-        'critical': logging.CRITICAL
-    }
-    formatter = logging.Formatter("[%(asctime)s] %(levelname)s: %(message)s", "%Y-%m-%d %H:%M:%S")
-    lvl = levels[level.lower()]
-    handler = logging.StreamHandler()
-    handler.setFormatter(formatter)
-    # Configure the root logger
-    logging.basicConfig(level=lvl, handlers=[handler])
-
-
 def flatten(lst: Iterable):
     return list(chain.from_iterable(lst))
 
@@ -113,7 +115,7 @@ class _StorageTools:
     #     return _impl
 
 
-@
+@lru_cache(None, typed=True)
 def clean_spaces(string):
     string = string.replace('\t', ' ')
     string = re.sub('[\n|\r]', '', string)
```
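Because `setup_logger()` is wrapped in `@lru_cache(1, typed=True)`, every `from scrapling.core.utils import log` shares one pre-configured `scrapling` logger, instead of reconfiguring the root logger the way the removed `setup_basic_logging` did. A small sketch of working with it; the DEBUG toggle is plain stdlib `logging`, not a Scrapling API:

```python
import logging

from scrapling.core.utils import log

log.info('printed to stderr as: [YYYY-mm-dd HH:MM:SS] INFO: ...')

# The shared logger defaults to INFO; opt in to the library's debug output like this:
logging.getLogger('scrapling').setLevel(logging.DEBUG)
log.debug('now visible, e.g. the storage-system load message shown above')
```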
scrapling/defaults.py
CHANGED
```diff
@@ -1,6 +1,7 @@
-from .fetchers import Fetcher, PlayWrightFetcher, StealthyFetcher
+from .fetchers import AsyncFetcher, Fetcher, PlayWrightFetcher, StealthyFetcher
 
 # If you are going to use Fetchers with the default settings, import them from this file instead for a cleaner looking code
 Fetcher = Fetcher()
+AsyncFetcher = AsyncFetcher()
 StealthyFetcher = StealthyFetcher()
 PlayWrightFetcher = PlayWrightFetcher()
```
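As with the sync shortcuts, `scrapling.defaults` now ships a pre-instantiated `AsyncFetcher` for code that is happy with the default adaptor settings. A sketch (placeholder URL):

```python
import asyncio

from scrapling.defaults import AsyncFetcher  # already an instance — no () needed


async def main():
    page = await AsyncFetcher.get('https://example.com')
    print(page.status, page.css_first('title::text'))


asyncio.run(main())
```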
scrapling/engines/camo.py
CHANGED
```diff
@@ -1,13 +1,14 @@
-import logging
-
 from camoufox import DefaultAddons
+from camoufox.async_api import AsyncCamoufox
 from camoufox.sync_api import Camoufox
 
 from scrapling.core._types import (Callable, Dict, List, Literal, Optional,
                                    Union)
+from scrapling.core.utils import log
 from scrapling.engines.toolbelt import (Response, StatusText,
+                                        async_intercept_route,
                                         check_type_validity,
-                                        construct_proxy_dict,
+                                        construct_proxy_dict,
                                         generate_convincing_referer,
                                         get_os_name, intercept_route)
 
@@ -15,10 +16,11 @@ from scrapling.engines.toolbelt import (Response, StatusText,
 class CamoufoxEngine:
     def __init__(
             self, headless: Optional[Union[bool, Literal['virtual']]] = True, block_images: Optional[bool] = False, disable_resources: Optional[bool] = False,
-            block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] =
-            timeout: Optional[float] = 30000, page_action: Callable =
+            block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = True, network_idle: Optional[bool] = False, humanize: Optional[Union[bool, float]] = True,
+            timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, addons: Optional[List[str]] = None,
             wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
             proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True,
+            geoip: Optional[bool] = False,
             adaptor_arguments: Dict = None,
     ):
         """An engine that utilizes Camoufox library, check the `StealthyFetcher` class for more documentation.
@@ -32,13 +34,15 @@ class CamoufoxEngine:
         :param block_webrtc: Blocks WebRTC entirely.
         :param addons: List of Firefox addons to use. Must be paths to extracted addons.
         :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
-        :param allow_webgl:
+        :param allow_webgl: Enabled by default. Disabling it WebGL not recommended as many WAFs now checks if WebGL is enabled.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
         :param disable_ads: Enabled by default, this installs `uBlock Origin` addon on the browser if enabled.
         :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
         :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
         :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
         :param wait_selector: Wait for a specific css selector to be in a specific state.
+        :param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, & spoof the WebRTC IP address.
+            It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region.
         :param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`.
         :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
         :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
@@ -54,16 +58,20 @@ class CamoufoxEngine:
         self.google_search = bool(google_search)
         self.os_randomize = bool(os_randomize)
         self.disable_ads = bool(disable_ads)
+        self.geoip = bool(geoip)
         self.extra_headers = extra_headers or {}
         self.proxy = construct_proxy_dict(proxy)
         self.addons = addons or []
         self.humanize = humanize
         self.timeout = check_type_validity(timeout, [int, float], 30000)
-        if
-
+        if page_action is not None:
+            if callable(page_action):
+                self.page_action = page_action
+            else:
+                self.page_action = None
+                log.error('[Ignored] Argument "page_action" must be callable')
         else:
-            self.page_action =
-            logging.error('[Ignored] Argument "page_action" must be callable')
+            self.page_action = None
 
         self.wait_selector = wait_selector
         self.wait_selector_state = wait_selector_state
@@ -77,6 +85,7 @@ class CamoufoxEngine:
         """
         addons = [] if self.disable_ads else [DefaultAddons.UBO]
         with Camoufox(
+                geoip=self.geoip,
                 proxy=self.proxy,
                 addons=self.addons,
                 exclude_addons=addons,
@@ -102,7 +111,8 @@ class CamoufoxEngine:
             if self.network_idle:
                 page.wait_for_load_state('networkidle')
 
-
+            if self.page_action is not None:
+                page = self.page_action(page)
 
             if self.wait_selector and type(self.wait_selector) is str:
                 waiter = page.locator(self.wait_selector)
@@ -115,11 +125,8 @@ class CamoufoxEngine:
 
             # This will be parsed inside `Response`
             encoding = res.headers.get('content-type', '') or 'utf-8'  # default encoding
-
-            status_text = res.status_text
             # PlayWright API sometimes give empty status text for some reason!
-
-            status_text = StatusText.get(res.status)
+            status_text = res.status_text or StatusText.get(res.status)
 
             response = Response(
                 url=res.url,
@@ -136,3 +143,70 @@ class CamoufoxEngine:
             page.close()
 
         return response
+
+    async def async_fetch(self, url: str) -> Response:
+        """Opens up the browser and do your request based on your chosen options.
+
+        :param url: Target url.
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
+        """
+        addons = [] if self.disable_ads else [DefaultAddons.UBO]
+        async with AsyncCamoufox(
+                geoip=self.geoip,
+                proxy=self.proxy,
+                addons=self.addons,
+                exclude_addons=addons,
+                headless=self.headless,
+                humanize=self.humanize,
+                i_know_what_im_doing=True,  # To turn warnings off with the user configurations
+                allow_webgl=self.allow_webgl,
+                block_webrtc=self.block_webrtc,
+                block_images=self.block_images,  # Careful! it makes some websites doesn't finish loading at all like stackoverflow even in headful
+                os=None if self.os_randomize else get_os_name(),
+        ) as browser:
+            page = await browser.new_page()
+            page.set_default_navigation_timeout(self.timeout)
+            page.set_default_timeout(self.timeout)
+            if self.disable_resources:
+                await page.route("**/*", async_intercept_route)
+
+            if self.extra_headers:
+                await page.set_extra_http_headers(self.extra_headers)
+
+            res = await page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
+            await page.wait_for_load_state(state="domcontentloaded")
+            if self.network_idle:
+                await page.wait_for_load_state('networkidle')
+
+            if self.page_action is not None:
+                page = await self.page_action(page)
+
+            if self.wait_selector and type(self.wait_selector) is str:
+                waiter = page.locator(self.wait_selector)
+                await waiter.first.wait_for(state=self.wait_selector_state)
+                # Wait again after waiting for the selector, helpful with protections like Cloudflare
+                await page.wait_for_load_state(state="load")
+                await page.wait_for_load_state(state="domcontentloaded")
+                if self.network_idle:
+                    await page.wait_for_load_state('networkidle')
+
+            # This will be parsed inside `Response`
+            encoding = res.headers.get('content-type', '') or 'utf-8'  # default encoding
+            # PlayWright API sometimes give empty status text for some reason!
+            status_text = res.status_text or StatusText.get(res.status)
+
+            response = Response(
+                url=res.url,
+                text=await page.content(),
+                body=(await page.content()).encode('utf-8'),
+                status=res.status,
+                reason=status_text,
+                encoding=encoding,
+                cookies={cookie['name']: cookie['value'] for cookie in await page.context.cookies()},
+                headers=await res.all_headers(),
+                request_headers=await res.request.all_headers(),
+                **self.adaptor_arguments
+            )
+            await page.close()
+
+        return response
```
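`async_fetch` mirrors the sync path step for step (route interception, referer spoofing, selector waits), with one behavioral difference worth noting: `page_action` is awaited, so callbacks passed through the async path should be coroutines. A sketch of driving it through `StealthyFetcher`, assuming its `async_fetch` forwards these arguments to the engine as the new async tests indicate (URL and option values are placeholders):

```python
import asyncio

from scrapling import StealthyFetcher


async def main():
    page = await StealthyFetcher().async_fetch(
        'https://example.com',
        headless=True,
        geoip=True,        # new in 0.2.9: derive locale/timezone and WebRTC IP from the exit IP
        network_idle=True,
    )
    print(page.status, page.css_first('title::text'))


asyncio.run(main())
```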
scrapling/engines/constants.py
CHANGED
```diff
@@ -1,5 +1,5 @@
 # Disable loading these resources for speed
-DEFAULT_DISABLED_RESOURCES = [
+DEFAULT_DISABLED_RESOURCES = {
     'font',
     'image',
     'media',
@@ -10,9 +10,9 @@ DEFAULT_DISABLED_RESOURCES = [
     'websocket',
     'csp_report',
     'stylesheet',
-]
+}
 
-DEFAULT_STEALTH_FLAGS = [
+DEFAULT_STEALTH_FLAGS = (
     # Explanation: https://peter.sh/experiments/chromium-command-line-switches/
     # Generally this will make the browser faster and less detectable
     '--no-pings',
@@ -87,7 +87,7 @@ DEFAULT_STEALTH_FLAGS = [
     '--enable-features=NetworkService,NetworkServiceInProcess,TrustTokens,TrustTokensAlwaysAllowIssuance',
     '--blink-settings=primaryHoverType=2,availableHoverTypes=2,primaryPointerType=4,availablePointerTypes=4',
     '--disable-features=AudioServiceOutOfProcess,IsolateOrigins,site-per-process,TranslateUI,BlinkGenPropertyTrees',
-]
+)
 
 # Defaulting to the docker mode, token doesn't matter in it as it's passed for the container
 NSTBROWSER_DEFAULT_QUERY = {
```
|