scrapling 0.2.8__py3-none-any.whl → 0.2.9__py3-none-any.whl
- scrapling/__init__.py +4 -4
- scrapling/core/custom_types.py +88 -6
- scrapling/core/storage_adaptors.py +5 -6
- scrapling/core/translator.py +2 -2
- scrapling/core/utils.py +29 -27
- scrapling/defaults.py +2 -1
- scrapling/engines/camo.py +89 -15
- scrapling/engines/constants.py +4 -4
- scrapling/engines/pw.py +158 -83
- scrapling/engines/static.py +91 -48
- scrapling/engines/toolbelt/__init__.py +3 -3
- scrapling/engines/toolbelt/custom.py +20 -22
- scrapling/engines/toolbelt/fingerprints.py +3 -3
- scrapling/engines/toolbelt/navigation.py +21 -8
- scrapling/fetchers.py +229 -14
- scrapling/parser.py +49 -21
- {scrapling-0.2.8.dist-info → scrapling-0.2.9.dist-info}/METADATA +32 -16
- scrapling-0.2.9.dist-info/RECORD +47 -0
- tests/fetchers/async/__init__.py +0 -0
- tests/fetchers/async/test_camoufox.py +95 -0
- tests/fetchers/async/test_httpx.py +83 -0
- tests/fetchers/async/test_playwright.py +99 -0
- tests/fetchers/sync/__init__.py +0 -0
- tests/fetchers/sync/test_camoufox.py +68 -0
- tests/fetchers/sync/test_httpx.py +82 -0
- tests/fetchers/sync/test_playwright.py +87 -0
- tests/fetchers/test_utils.py +90 -122
- tests/parser/test_automatch.py +64 -9
- tests/parser/test_general.py +260 -218
- scrapling-0.2.8.dist-info/RECORD +0 -42
- tests/fetchers/test_camoufox.py +0 -65
- tests/fetchers/test_httpx.py +0 -68
- tests/fetchers/test_playwright.py +0 -77
- {scrapling-0.2.8.dist-info → scrapling-0.2.9.dist-info}/LICENSE +0 -0
- {scrapling-0.2.8.dist-info → scrapling-0.2.9.dist-info}/WHEEL +0 -0
- {scrapling-0.2.8.dist-info → scrapling-0.2.9.dist-info}/top_level.txt +0 -0
scrapling/__init__.py
CHANGED
@@ -1,12 +1,12 @@
 # Declare top-level shortcuts
 from scrapling.core.custom_types import AttributesHandler, TextHandler
-from scrapling.fetchers import (CustomFetcher, Fetcher,
-                                StealthyFetcher)
+from scrapling.fetchers import (AsyncFetcher, CustomFetcher, Fetcher,
+                                PlayWrightFetcher, StealthyFetcher)
 from scrapling.parser import Adaptor, Adaptors

 __author__ = "Karim Shoair (karim.shoair@pm.me)"
-__version__ = "0.2.8"
+__version__ = "0.2.9"
 __copyright__ = "Copyright (c) 2024 Karim Shoair"


-__all__ = ['Adaptor', 'Fetcher', 'StealthyFetcher', 'PlayWrightFetcher']
+__all__ = ['Adaptor', 'Fetcher', 'AsyncFetcher', 'StealthyFetcher', 'PlayWrightFetcher']
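The top-level exports now include `AsyncFetcher` alongside the existing fetchers. A minimal usage sketch; the URL is illustrative, and the assumption that `AsyncFetcher.get` mirrors the synchronous signature is based on the new async test modules listed above:

    import asyncio

    from scrapling import AsyncFetcher, Fetcher

    page = Fetcher().get('https://example.com')  # synchronous, as before

    async def main():
        # AsyncFetcher is the awaitable counterpart added in this release
        return await AsyncFetcher().get('https://example.com')

    page = asyncio.run(main())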
scrapling/core/custom_types.py
CHANGED
@@ -14,11 +14,70 @@ class TextHandler(str):
     __slots__ = ()

     def __new__(cls, string):
-
-        if type(string) is str:
+        if isinstance(string, str):
             return super().__new__(cls, string)
-
-
+        return super().__new__(cls, '')
+
+    # Make methods from original `str` class return `TextHandler` instead of returning `str` again
+    # Of course, this stupid workaround is only so we can keep the auto-completion working without issues in your IDE
+    # and I made sonnet write it for me :)
+    def strip(self, chars=None):
+        return TextHandler(super().strip(chars))
+
+    def lstrip(self, chars=None):
+        return TextHandler(super().lstrip(chars))
+
+    def rstrip(self, chars=None):
+        return TextHandler(super().rstrip(chars))
+
+    def capitalize(self):
+        return TextHandler(super().capitalize())
+
+    def casefold(self):
+        return TextHandler(super().casefold())
+
+    def center(self, width, fillchar=' '):
+        return TextHandler(super().center(width, fillchar))
+
+    def expandtabs(self, tabsize=8):
+        return TextHandler(super().expandtabs(tabsize))
+
+    def format(self, *args, **kwargs):
+        return TextHandler(super().format(*args, **kwargs))
+
+    def format_map(self, mapping):
+        return TextHandler(super().format_map(mapping))
+
+    def join(self, iterable):
+        return TextHandler(super().join(iterable))
+
+    def ljust(self, width, fillchar=' '):
+        return TextHandler(super().ljust(width, fillchar))
+
+    def rjust(self, width, fillchar=' '):
+        return TextHandler(super().rjust(width, fillchar))
+
+    def swapcase(self):
+        return TextHandler(super().swapcase())
+
+    def title(self):
+        return TextHandler(super().title())
+
+    def translate(self, table):
+        return TextHandler(super().translate(table))
+
+    def zfill(self, width):
+        return TextHandler(super().zfill(width))
+
+    def replace(self, old, new, count=-1):
+        return TextHandler(super().replace(old, new, count))
+
+    def upper(self):
+        return TextHandler(super().upper())
+
+    def lower(self):
+        return TextHandler(super().lower())
+    ##############

     def sort(self, reverse: bool = False) -> str:
         """Return a sorted version of the string"""
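The wrapping above is needed because `str` methods always return plain `str`, even when called on a subclass, which would silently drop `TextHandler`'s extra methods (`.clean()`, `.re()`, `.json()`, ...) after the first chained call. A generic sketch of the problem and the fix:

    class Wrapped(str):
        __slots__ = ()

        def upper(self):
            # Re-wrap the result so chained calls keep the subclass type
            return Wrapped(super().upper())

    s = Wrapped('scrapling')
    print(type(s.upper()).__name__)  # Wrapped - the wrapped method preserves the type
    print(type(s.lower()).__name__)  # str - any method not wrapped falls back to plain str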
@@ -30,11 +89,21 @@ class TextHandler(str):
         data = re.sub(' +', ' ', data)
         return self.__class__(data.strip())

+    # For easy copy-paste from Scrapy/parsel code when needed :)
+    def get(self, default=None):
+        return self
+
+    def get_all(self):
+        return self
+
+    extract = get_all
+    extract_first = get
+
     def json(self) -> Dict:
         """Return json response if the response is jsonable otherwise throw error"""
-        # Using
+        # Using str function as a workaround for orjson issue with subclasses of str
         # Check this out: https://github.com/ijl/orjson/issues/445
-        return loads(self)
+        return loads(str(self))

     def re(
             self, regex: Union[str, Pattern[str]], replace_entities: bool = True, clean_match: bool = False,
@@ -127,6 +196,19 @@ class TextHandlers(List[TextHandler]):
             return result
         return default

+    # For easy copy-paste from Scrapy/parsel code when needed :)
+    def get(self, default=None):
+        """Returns the first item of the current list
+        :param default: the default value to return if the current list is empty
+        """
+        return self[0] if len(self) > 0 else default
+
+    def extract(self):
+        return self
+
+    extract_first = get
+    get_all = extract
+

 class AttributesHandler(Mapping):
     """A read-only mapping to use instead of the standard dictionary for the speed boost but at the same time I use it to add more functionalities.
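These aliases mirror parsel's Selector API, so extraction snippets written for Scrapy can be pasted in with little or no change. A small sketch of the same shim on a bare list subclass:

    class TextHandlers(list):
        # First item or a default, matching the diff above
        def get(self, default=None):
            return self[0] if len(self) > 0 else default

        def extract(self):
            return self

        extract_first = get
        get_all = extract

    print(TextHandlers(['a', 'b']).extract_first())  # 'a'
    print(TextHandlers([]).get('fallback'))          # 'fallback'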
scrapling/core/storage_adaptors.py
CHANGED
@@ -1,4 +1,3 @@
-import logging
 import sqlite3
 import threading
 from abc import ABC, abstractmethod
@@ -9,7 +8,7 @@ from lxml import html
 from tldextract import extract as tld

 from scrapling.core._types import Dict, Optional, Union
-from scrapling.core.utils import _StorageTools, cache
+from scrapling.core.utils import _StorageTools, log, lru_cache
@@ -20,7 +19,7 @@ class StorageSystemMixin(ABC):
         """
         self.url = url

-    @cache(None, typed=True)
+    @lru_cache(None, typed=True)
     def _get_base_url(self, default_value: str = 'default') -> str:
         if not self.url or type(self.url) is not str:
             return default_value
@@ -52,7 +51,7 @@ class StorageSystemMixin(ABC):
         raise NotImplementedError('Storage system must implement `save` method')

     @staticmethod
-    @cache(None, typed=True)
+    @lru_cache(None, typed=True)
     def _get_hash(identifier: str) -> str:
         """If you want to hash identifier in your storage system, use this safer"""
         identifier = identifier.lower().strip()
@@ -64,7 +63,7 @@ class StorageSystemMixin(ABC):
         return f"{hash_value}_{len(identifier)}"  # Length to reduce collision chance


-@cache(None, typed=True)
+@lru_cache(None, typed=True)
 class SQLiteStorageSystem(StorageSystemMixin):
     """The recommended system to use, it's race condition safe and thread safe.
     Mainly built so the library can run in threaded frameworks like scrapy or threaded tools
@@ -86,7 +85,7 @@ class SQLiteStorageSystem(StorageSystemMixin):
         self.connection.execute("PRAGMA journal_mode=WAL")
         self.cursor = self.connection.cursor()
         self._setup_database()
-        logging.debug(
+        log.debug(
             f'Storage system loaded with arguments (storage_file="{storage_file}", url="{url}")'
         )
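Decorating a class with `lru_cache` (as done to `SQLiteStorageSystem` above, and as the comment in `scrapling/core/utils.py` notes) caches the class call itself, so constructing it twice with equal arguments returns the same instance. A minimal sketch of the pattern:

    from functools import lru_cache

    @lru_cache(None, typed=True)
    class Connection:
        def __init__(self, url):
            self.url = url

    a = Connection('sqlite:///storage.db')
    b = Connection('sqlite:///storage.db')
    print(a is b)  # True - one instance per unique argument set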
scrapling/core/translator.py
CHANGED
@@ -17,7 +17,7 @@ from cssselect.xpath import XPathExpr as OriginalXPathExpr
 from w3lib.html import HTML5_WHITESPACE

 from scrapling.core._types import Any, Optional, Protocol, Self
-from scrapling.core.utils import cache
+from scrapling.core.utils import lru_cache

 regex = f"[{HTML5_WHITESPACE}]+"
 replace_html5_whitespaces = re.compile(regex).sub
@@ -139,6 +139,6 @@ class TranslatorMixin:


 class HTMLTranslator(TranslatorMixin, OriginalHTMLTranslator):
-    @cache(maxsize=256)
+    @lru_cache(maxsize=256)
     def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str:
         return super().css_to_xpath(css, prefix)
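Caching `css_to_xpath` pays off because CSS-to-XPath translation is pure string work and scrapers tend to reuse a small set of selectors. A standalone sketch with cssselect, which the module above already builds on:

    from functools import lru_cache

    from cssselect import HTMLTranslator

    translator = HTMLTranslator()

    @lru_cache(maxsize=256)
    def css_to_xpath(css):
        # Repeated selectors are answered from the cache without re-parsing
        return translator.css_to_xpath(css)

    print(css_to_xpath('div.product > a'))
    print(css_to_xpath('div.product > a'))  # second call is a cache hit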
scrapling/core/utils.py
CHANGED
@@ -9,17 +9,36 @@ from scrapling.core._types import Any, Dict, Iterable, Union

 # Using cache on top of a class is brilliant way to achieve Singleton design pattern without much code
 # functools.cache is available on Python 3.9+ only so let's keep lru_cache
-from functools import lru_cache
-
+from functools import lru_cache  # isort:skip

 html_forbidden = {html.HtmlComment, }
-
-
-
-
-
-
-
+
+
+@lru_cache(1, typed=True)
+def setup_logger():
+    """Create and configure a logger with a standard format.
+
+    :returns: logging.Logger: Configured logger instance
+    """
+    logger = logging.getLogger('scrapling')
+    logger.setLevel(logging.INFO)
+
+    formatter = logging.Formatter(
+        fmt="[%(asctime)s] %(levelname)s: %(message)s",
+        datefmt="%Y-%m-%d %H:%M:%S"
+    )
+
+    console_handler = logging.StreamHandler()
+    console_handler.setFormatter(formatter)
+
+    # Add handler to logger (if not already added)
+    if not logger.handlers:
+        logger.addHandler(console_handler)
+
+    return logger
+
+
+log = setup_logger()


 def is_jsonable(content: Union[bytes, str]) -> bool:
@@ -33,23 +52,6 @@ def is_jsonable(content: Union[bytes, str]) -> bool:
     return False


-@cache(None, typed=True)
-def setup_basic_logging(level: str = 'debug'):
-    levels = {
-        'debug': logging.DEBUG,
-        'info': logging.INFO,
-        'warning': logging.WARNING,
-        'error': logging.ERROR,
-        'critical': logging.CRITICAL
-    }
-    formatter = logging.Formatter("[%(asctime)s] %(levelname)s: %(message)s", "%Y-%m-%d %H:%M:%S")
-    lvl = levels[level.lower()]
-    handler = logging.StreamHandler()
-    handler.setFormatter(formatter)
-    # Configure the root logger
-    logging.basicConfig(level=lvl, handlers=[handler])
-
-
 def flatten(lst: Iterable):
     return list(chain.from_iterable(lst))

@@ -113,7 +115,7 @@ class _StorageTools:
     # return _impl


-@cache(None, typed=True)
+@lru_cache(None, typed=True)
 def clean_spaces(string):
     string = string.replace('\t', ' ')
     string = re.sub('[\n|\r]', '', string)
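Because `setup_logger` is wrapped in `lru_cache(1, typed=True)`, repeated calls return the same configured logger object, which is how the module-level `log` stays a singleton. Since it is the standard `'scrapling'` logger (the name appears in the diff above), callers can still tune verbosity from outside the library:

    import logging

    # Raise or lower scrapling's verbosity without touching library code
    logging.getLogger('scrapling').setLevel(logging.DEBUG)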
scrapling/defaults.py
CHANGED
@@ -1,6 +1,7 @@
-from .fetchers import Fetcher, PlayWrightFetcher, StealthyFetcher
+from .fetchers import AsyncFetcher, Fetcher, PlayWrightFetcher, StealthyFetcher

 # If you are going to use Fetchers with the default settings, import them from this file instead for a cleaner looking code
 Fetcher = Fetcher()
+AsyncFetcher = AsyncFetcher()
 StealthyFetcher = StealthyFetcher()
 PlayWrightFetcher = PlayWrightFetcher()
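The names in `scrapling.defaults` are pre-built instances rather than classes, so no construction is needed at the call site. A sketch, again assuming `AsyncFetcher.get` mirrors the synchronous signature:

    import asyncio

    from scrapling.defaults import AsyncFetcher, Fetcher

    page = Fetcher.get('https://example.com')  # already an instance

    async def main():
        return await AsyncFetcher.get('https://example.com')

    page = asyncio.run(main())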
scrapling/engines/camo.py
CHANGED
@@ -1,13 +1,14 @@
-import logging
-
 from camoufox import DefaultAddons
+from camoufox.async_api import AsyncCamoufox
 from camoufox.sync_api import Camoufox

 from scrapling.core._types import (Callable, Dict, List, Literal, Optional,
                                    Union)
+from scrapling.core.utils import log
 from scrapling.engines.toolbelt import (Response, StatusText,
+                                        async_intercept_route,
                                         check_type_validity,
-                                        construct_proxy_dict,
+                                        construct_proxy_dict,
                                         generate_convincing_referer,
                                         get_os_name, intercept_route)

@@ -15,10 +16,11 @@ from scrapling.engines.toolbelt import (Response, StatusText,
 class CamoufoxEngine:
     def __init__(
             self, headless: Optional[Union[bool, Literal['virtual']]] = True, block_images: Optional[bool] = False, disable_resources: Optional[bool] = False,
-            block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] =
-            timeout: Optional[float] = 30000, page_action: Callable =
+            block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = True, network_idle: Optional[bool] = False, humanize: Optional[Union[bool, float]] = True,
+            timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, addons: Optional[List[str]] = None,
             wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
             proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True,
+            geoip: Optional[bool] = False,
             adaptor_arguments: Dict = None,
     ):
         """An engine that utilizes Camoufox library, check the `StealthyFetcher` class for more documentation.
@@ -32,13 +34,15 @@ class CamoufoxEngine:
         :param block_webrtc: Blocks WebRTC entirely.
         :param addons: List of Firefox addons to use. Must be paths to extracted addons.
         :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
-        :param allow_webgl:
+        :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
         :param disable_ads: Enabled by default, this installs `uBlock Origin` addon on the browser if enabled.
         :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
         :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
         :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
         :param wait_selector: Wait for a specific css selector to be in a specific state.
+        :param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, & spoof the WebRTC IP address.
+            It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region.
         :param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`.
         :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
         :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
@@ -54,16 +58,20 @@ class CamoufoxEngine:
         self.google_search = bool(google_search)
         self.os_randomize = bool(os_randomize)
         self.disable_ads = bool(disable_ads)
+        self.geoip = bool(geoip)
         self.extra_headers = extra_headers or {}
         self.proxy = construct_proxy_dict(proxy)
         self.addons = addons or []
         self.humanize = humanize
         self.timeout = check_type_validity(timeout, [int, float], 30000)
-        if callable(page_action):
-            self.page_action = page_action
+        if page_action is not None:
+            if callable(page_action):
+                self.page_action = page_action
+            else:
+                self.page_action = None
+                log.error('[Ignored] Argument "page_action" must be callable')
         else:
-            self.page_action =
-            logging.error('[Ignored] Argument "page_action" must be callable')
+            self.page_action = None

         self.wait_selector = wait_selector
         self.wait_selector_state = wait_selector_state
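`page_action` now defaults to `None`, and non-callable values are ignored with a logged error. The callback contract is unchanged: take the Playwright `page` object, do the automation, and return the page. A hypothetical example (the selector and the wiring comment are illustrative):

    def accept_cookies(page):
        # Runs after the page loads; must hand the page back to the engine
        page.click('#cookie-accept')
        return page

    # e.g. CamoufoxEngine(page_action=accept_cookies, ...)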
@@ -77,6 +85,7 @@ class CamoufoxEngine:
         """
         addons = [] if self.disable_ads else [DefaultAddons.UBO]
         with Camoufox(
+                geoip=self.geoip,
                 proxy=self.proxy,
                 addons=self.addons,
                 exclude_addons=addons,
@@ -102,7 +111,8 @@ class CamoufoxEngine:
             if self.network_idle:
                 page.wait_for_load_state('networkidle')

-            page = self.page_action(page)
+            if self.page_action is not None:
+                page = self.page_action(page)

             if self.wait_selector and type(self.wait_selector) is str:
                 waiter = page.locator(self.wait_selector)
@@ -115,11 +125,8 @@ class CamoufoxEngine:

             # This will be parsed inside `Response`
             encoding = res.headers.get('content-type', '') or 'utf-8'  # default encoding
-
-            status_text = res.status_text
             # PlayWright API sometimes give empty status text for some reason!
-            if not status_text:
-                status_text = StatusText.get(res.status)
+            status_text = res.status_text or StatusText.get(res.status)

             response = Response(
                 url=res.url,
@@ -136,3 +143,70 @@
             page.close()

         return response
+
+    async def async_fetch(self, url: str) -> Response:
+        """Opens up the browser and do your request based on your chosen options.
+
+        :param url: Target url.
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
+        """
+        addons = [] if self.disable_ads else [DefaultAddons.UBO]
+        async with AsyncCamoufox(
+                geoip=self.geoip,
+                proxy=self.proxy,
+                addons=self.addons,
+                exclude_addons=addons,
+                headless=self.headless,
+                humanize=self.humanize,
+                i_know_what_im_doing=True,  # To turn warnings off with the user configurations
+                allow_webgl=self.allow_webgl,
+                block_webrtc=self.block_webrtc,
+                block_images=self.block_images,  # Careful! it makes some websites doesn't finish loading at all like stackoverflow even in headful
+                os=None if self.os_randomize else get_os_name(),
+        ) as browser:
+            page = await browser.new_page()
+            page.set_default_navigation_timeout(self.timeout)
+            page.set_default_timeout(self.timeout)
+            if self.disable_resources:
+                await page.route("**/*", async_intercept_route)
+
+            if self.extra_headers:
+                await page.set_extra_http_headers(self.extra_headers)
+
+            res = await page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
+            await page.wait_for_load_state(state="domcontentloaded")
+            if self.network_idle:
+                await page.wait_for_load_state('networkidle')
+
+            if self.page_action is not None:
+                page = await self.page_action(page)
+
+            if self.wait_selector and type(self.wait_selector) is str:
+                waiter = page.locator(self.wait_selector)
+                await waiter.first.wait_for(state=self.wait_selector_state)
+                # Wait again after waiting for the selector, helpful with protections like Cloudflare
+                await page.wait_for_load_state(state="load")
+                await page.wait_for_load_state(state="domcontentloaded")
+                if self.network_idle:
+                    await page.wait_for_load_state('networkidle')
+
+            # This will be parsed inside `Response`
+            encoding = res.headers.get('content-type', '') or 'utf-8'  # default encoding
+            # PlayWright API sometimes give empty status text for some reason!
+            status_text = res.status_text or StatusText.get(res.status)
+
+            response = Response(
+                url=res.url,
+                text=await page.content(),
+                body=(await page.content()).encode('utf-8'),
+                status=res.status,
+                reason=status_text,
+                encoding=encoding,
+                cookies={cookie['name']: cookie['value'] for cookie in await page.context.cookies()},
+                headers=await res.all_headers(),
+                request_headers=await res.request.all_headers(),
+                **self.adaptor_arguments
+            )
+            await page.close()
+
+        return response
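Note that `async_fetch` awaits the callback (`page = await self.page_action(page)`), so in the async path `page_action` must be a coroutine function. A hedged sketch; the public entry point in the comment is an assumption based on the new async test modules listed above:

    import asyncio

    async def accept_cookies(page):
        # Awaited by the async engine, so this must be a coroutine
        await page.click('#cookie-accept')
        return page

    # Hypothetical usage through the public fetcher:
    # from scrapling import StealthyFetcher
    # page = asyncio.run(StealthyFetcher().async_fetch('https://example.com'))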
scrapling/engines/constants.py
CHANGED
@@ -1,5 +1,5 @@
 # Disable loading these resources for speed
-DEFAULT_DISABLED_RESOURCES = [
+DEFAULT_DISABLED_RESOURCES = {
     'font',
     'image',
     'media',
@@ -10,9 +10,9 @@ DEFAULT_DISABLED_RESOURCES = [
     'websocket',
     'csp_report',
     'stylesheet',
-]
+}

-DEFAULT_STEALTH_FLAGS = [
+DEFAULT_STEALTH_FLAGS = (
     # Explanation: https://peter.sh/experiments/chromium-command-line-switches/
     # Generally this will make the browser faster and less detectable
     '--no-pings',
@@ -87,7 +87,7 @@ DEFAULT_STEALTH_FLAGS = [
     '--enable-features=NetworkService,NetworkServiceInProcess,TrustTokens,TrustTokensAlwaysAllowIssuance',
     '--blink-settings=primaryHoverType=2,availableHoverTypes=2,primaryPointerType=4,availablePointerTypes=4',
     '--disable-features=AudioServiceOutOfProcess,IsolateOrigins,site-per-process,TranslateUI,BlinkGenPropertyTrees',
-]
+)

 # Defaulting to the docker mode, token doesn't matter in it as it's passed for the container
 NSTBROWSER_DEFAULT_QUERY = {
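Switching the disabled-resources container from a list to a set makes the per-request membership check a hash lookup instead of a linear scan, and the flags tuple signals immutability. A sketch of how such a set is typically consumed in a route handler:

    DEFAULT_DISABLED_RESOURCES = {'font', 'image', 'media', 'stylesheet'}

    def should_abort(resource_type):
        # Set membership is an O(1) hash lookup instead of an O(n) list scan
        return resource_type in DEFAULT_DISABLED_RESOURCES

    print(should_abort('image'))     # True
    print(should_abort('document'))  # False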
|