scrapling 0.2.4__py3-none-any.whl → 0.2.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +1 -1
- scrapling/core/custom_types.py +2 -3
- scrapling/core/translator.py +6 -4
- scrapling/engines/camo.py +3 -6
- scrapling/engines/pw.py +18 -15
- scrapling/engines/static.py +5 -1
- scrapling/engines/toolbelt/custom.py +69 -1
- scrapling/engines/toolbelt/fingerprints.py +1 -1
- scrapling/fetchers.py +22 -12
- {scrapling-0.2.4.dist-info → scrapling-0.2.6.dist-info}/METADATA +11 -8
- {scrapling-0.2.4.dist-info → scrapling-0.2.6.dist-info}/RECORD +18 -17
- {scrapling-0.2.4.dist-info → scrapling-0.2.6.dist-info}/WHEEL +1 -1
- tests/fetchers/test_camoufox.py +2 -0
- tests/fetchers/test_playwright.py +2 -0
- tests/fetchers/test_utils.py +129 -0
- tests/parser/test_general.py +1 -1
- {scrapling-0.2.4.dist-info → scrapling-0.2.6.dist-info}/LICENSE +0 -0
- {scrapling-0.2.4.dist-info → scrapling-0.2.6.dist-info}/top_level.txt +0 -0
scrapling/__init__.py
CHANGED
@@ -4,7 +4,7 @@ from scrapling.parser import Adaptor, Adaptors
 from scrapling.core.custom_types import TextHandler, AttributesHandler
 
 __author__ = "Karim Shoair (karim.shoair@pm.me)"
-__version__ = "0.2.4"
+__version__ = "0.2.6"
 __copyright__ = "Copyright (c) 2024 Karim Shoair"
 
 
scrapling/core/custom_types.py
CHANGED
@@ -129,9 +129,8 @@ class TextHandlers(List[TextHandler]):
 
 
 class AttributesHandler(Mapping):
-    """A read-only mapping to use instead of the standard dictionary for the speed boost but
-
-    If standard dictionary is needed, just convert this class to dictionary with `dict` function
+    """A read-only mapping to use instead of the standard dictionary for the speed boost but at the same time I use it to add more functionalities.
+    If standard dictionary is needed, just convert this class to dictionary with `dict` function
     """
     __slots__ = ('_data',)
 
scrapling/core/translator.py
CHANGED
@@ -1,9 +1,11 @@
 """
 Most of this file is adapted version of the translator of parsel library with some modifications simply for 1 important reason...
-
-which will be important in future releases but most importantly...
-
-
+
+To add pseudo-elements ``::text`` and ``::attr(ATTR_NAME)`` so we match Parsel/Scrapy selectors format which will be important in future releases but most importantly...
+
+So you don't have to learn a new selectors/api method like what bs4 done with soupsieve :)
+
+if you want to learn about this, head to https://cssselect.readthedocs.io/en/latest/#cssselect.FunctionalPseudoElement
 """
 
 import re
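For context, a hedged sketch of what those pseudo-elements buy you, assuming the 0.2.x `Adaptor` API described in this package's README; the HTML snippet and the `auto_match=False` argument are illustrative assumptions:

```python
from scrapling import Adaptor

# Tiny made-up page; auto_match is disabled so no storage is touched
page = Adaptor('<html><body><a href="/about">About us</a></body></html>', auto_match=False)

print(page.css('a::text'))        # text content of the <a> node, Parsel-style
print(page.css('a::attr(href)'))  # the `href` attribute value, e.g. '/about'
```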
scrapling/engines/camo.py
CHANGED
@@ -104,13 +104,10 @@ class CamoufoxEngine:
 
             if self.wait_selector and type(self.wait_selector) is str:
                 waiter = page.locator(self.wait_selector)
-                waiter.wait_for(state=self.wait_selector_state)
+                waiter.first.wait_for(state=self.wait_selector_state)
 
-
-
-            encoding = 'utf-8'  # default encoding
-            if 'charset=' in content_type.lower():
-                encoding = content_type.lower().split('charset=')[-1].split(';')[0].strip()
+            # This will be parsed inside `Response`
+            encoding = res.headers.get('content-type', '') or 'utf-8'  # default encoding
 
             status_text = res.status_text
             # PlayWright API sometimes give empty status text for some reason!
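The switch to `waiter.first` matters because Playwright locators are strict: waiting on a selector that resolves to several elements can raise an error, while `.first` narrows the wait to the first match. A minimal sketch, assuming Playwright's sync API and a placeholder URL:

```python
from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    page = browser.new_page()
    page.goto('https://example.com')  # placeholder target

    waiter = page.locator('p')               # may resolve to many <p> nodes
    waiter.first.wait_for(state='attached')  # wait only on the first match
    browser.close()
```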
scrapling/engines/pw.py
CHANGED
@@ -27,11 +27,12 @@ class PlaywrightEngine:
             page_action: Callable = do_nothing,
             wait_selector: Optional[str] = None,
             wait_selector_state: Optional[str] = 'attached',
-            stealth: bool = False,
-
-
+            stealth: Optional[bool] = False,
+            real_chrome: Optional[bool] = False,
+            hide_canvas: Optional[bool] = False,
+            disable_webgl: Optional[bool] = False,
             cdp_url: Optional[str] = None,
-            nstbrowser_mode: bool = False,
+            nstbrowser_mode: Optional[bool] = False,
             nstbrowser_config: Optional[Dict] = None,
             google_search: Optional[bool] = True,
             extra_headers: Optional[Dict[str, str]] = None,
@@ -51,6 +52,7 @@ class PlaywrightEngine:
         :param wait_selector: Wait for a specific css selector to be in a specific state.
         :param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`.
         :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
+        :param real_chrome: If you have chrome browser installed on your device, enable this and the Fetcher will launch an instance of your browser and use it.
         :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
         :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
         :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
@@ -67,6 +69,7 @@ class PlaywrightEngine:
         self.stealth = bool(stealth)
         self.hide_canvas = bool(hide_canvas)
         self.disable_webgl = bool(disable_webgl)
+        self.real_chrome = bool(real_chrome)
         self.google_search = bool(google_search)
         self.extra_headers = extra_headers or {}
         self.proxy = construct_proxy_dict(proxy)
@@ -119,7 +122,8 @@ class PlaywrightEngine:
         :param url: Target url.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-        if not self.stealth:
+        if not self.stealth or self.real_chrome:
+            # Because rebrowser_playwright doesn't play well with real browsers
             from playwright.sync_api import sync_playwright
         else:
             from rebrowser_playwright.sync_api import sync_playwright
@@ -130,8 +134,8 @@ class PlaywrightEngine:
             extra_headers = {}
             useragent = self.useragent
         else:
-            extra_headers =
-            useragent =
+            extra_headers = {}
+            useragent = generate_headers(browser_mode=True).get('User-Agent')
 
         # Prepare the flags before diving
         flags = DEFAULT_STEALTH_FLAGS
@@ -146,9 +150,11 @@ class PlaywrightEngine:
                 browser = p.chromium.connect_over_cdp(endpoint_url=cdp_url)
             else:
                 if self.stealth:
-                    browser = p.chromium.launch(
+                    browser = p.chromium.launch(
+                        headless=self.headless, args=flags, ignore_default_args=['--enable-automation'], chromium_sandbox=True, channel='chrome' if self.real_chrome else 'chromium'
+                    )
                 else:
-                    browser = p.chromium.launch(headless=self.headless, ignore_default_args=['--enable-automation'])
+                    browser = p.chromium.launch(headless=self.headless, ignore_default_args=['--enable-automation'], channel='chrome' if self.real_chrome else 'chromium')
 
         # Creating the context
         if self.stealth:
@@ -214,13 +220,10 @@ class PlaywrightEngine:
 
             if self.wait_selector and type(self.wait_selector) is str:
                 waiter = page.locator(self.wait_selector)
-                waiter.wait_for(state=self.wait_selector_state)
+                waiter.first.wait_for(state=self.wait_selector_state)
 
-
-
-            encoding = 'utf-8'  # default encoding
-            if 'charset=' in content_type.lower():
-                encoding = content_type.lower().split('charset=')[-1].split(';')[0].strip()
+            # This will be parsed inside `Response`
+            encoding = res.headers.get('content-type', '') or 'utf-8'  # default encoding
 
             status_text = res.status_text
             # PlayWright API sometimes give empty status text for some reason!
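The `channel` argument used above is stock Playwright. A rough sketch of the new launch path, assuming a locally installed Chrome; the `real_chrome` variable here is a stand-in for the engine attribute:

```python
from playwright.sync_api import sync_playwright

real_chrome = True  # stand-in for `self.real_chrome`

with sync_playwright() as p:
    # `channel='chrome'` drives the system Chrome install instead of the
    # Chromium build bundled with Playwright; 'chromium' keeps the old behavior
    browser = p.chromium.launch(
        headless=True,
        ignore_default_args=['--enable-automation'],
        channel='chrome' if real_chrome else 'chromium',
    )
    page = browser.new_page()
    page.goto('https://example.com')  # placeholder target
    browser.close()
```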
scrapling/engines/static.py
CHANGED
@@ -23,7 +23,7 @@ class StaticEngine:
     @staticmethod
     def _headers_job(headers: Optional[Dict], url: str, stealth: bool) -> Dict:
         """Adds useragent to headers if it doesn't exist, generates real headers and append it to current headers, and
-
+        finally generates a referer header that looks like if this request came from Google's search of the current URL's domain.
 
         :param headers: Current headers in the request if the user passed any
         :param url: The Target URL.
@@ -65,6 +65,7 @@ class StaticEngine:
 
     def get(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
         """Make basic HTTP GET request for you but with some added flavors.
+
         :param url: Target url.
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
             create a referer header as if this request had came from Google's search of this URL's domain.
@@ -77,6 +78,7 @@ class StaticEngine:
 
     def post(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
         """Make basic HTTP POST request for you but with some added flavors.
+
         :param url: Target url.
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
             create a referer header as if this request had came from Google's search of this URL's domain.
@@ -89,6 +91,7 @@ class StaticEngine:
 
     def delete(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
         """Make basic HTTP DELETE request for you but with some added flavors.
+
         :param url: Target url.
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
             create a referer header as if this request had came from Google's search of this URL's domain.
@@ -101,6 +104,7 @@ class StaticEngine:
 
     def put(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
         """Make basic HTTP PUT request for you but with some added flavors.
+
         :param url: Target url.
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
             create a referer header as if this request had came from Google's search of this URL's domain.
scrapling/engines/toolbelt/custom.py
CHANGED
@@ -3,11 +3,78 @@ Functions related to custom types or type checking
 """
 import inspect
 import logging
+from email.message import Message
 
 from scrapling.core.custom_types import MappingProxyType
 from scrapling.parser import Adaptor, SQLiteStorageSystem
 from scrapling.core.utils import setup_basic_logging, cache
-from scrapling.core._types import Any, List, Type, Union, Optional, Dict, Callable
+from scrapling.core._types import Any, List, Type, Union, Optional, Dict, Callable, Tuple
+
+
+class ResponseEncoding:
+    __DEFAULT_ENCODING = "utf-8"
+    __ISO_8859_1_CONTENT_TYPES = {"text/plain", "text/html", "text/css", "text/javascript"}
+
+    @classmethod
+    @cache(maxsize=None)
+    def __parse_content_type(cls, header_value: str) -> Tuple[str, Dict[str, str]]:
+        """Parse content type and parameters from a content-type header value.
+
+        Uses `email.message.Message` for robust header parsing according to RFC 2045.
+
+        :param header_value: Raw content-type header string
+        :return: Tuple of (content_type, parameters_dict)
+        """
+        # Create a Message object and set the Content-Type header then get the content type and parameters
+        msg = Message()
+        msg['content-type'] = header_value
+
+        content_type = msg.get_content_type()
+        params = dict(msg.get_params(failobj=[]))
+
+        # Remove the content-type from params if present somehow
+        params.pop('content-type', None)
+
+        return content_type, params
+
+    @classmethod
+    @cache(maxsize=None)
+    def get_value(cls, content_type: Optional[str]) -> str:
+        """Determine the appropriate character encoding from a content-type header.
+
+        The encoding is determined by these rules in order:
+        1. If no content-type is provided, use UTF-8
+        2. If charset parameter is present, use that encoding
+        3. If content-type is `text/*`, use ISO-8859-1 per HTTP/1.1 spec
+        4. If content-type is application/json, use UTF-8 per RFC 4627
+        5. Default to UTF-8 if nothing else matches
+
+        :param content_type: Content-Type header value or None
+        :return: String naming the character encoding
+        """
+        if not content_type:
+            return cls.__DEFAULT_ENCODING
+
+        try:
+            content_type, params = cls.__parse_content_type(content_type)
+
+            # First check for explicit charset parameter
+            if "charset" in params:
+                encoding = params["charset"].strip("'\"")
+                "test".encode(encoding)  # Validate encoding
+                return encoding
+
+            # Apply content-type specific rules
+            if content_type in cls.__ISO_8859_1_CONTENT_TYPES:
+                return "ISO-8859-1"
+
+            if content_type == "application/json":
+                return cls.__DEFAULT_ENCODING
+
+            return cls.__DEFAULT_ENCODING
+
+        except (ValueError, LookupError, UnicodeEncodeError):
+            return cls.__DEFAULT_ENCODING
 
 
 class Response(Adaptor):
@@ -20,6 +87,7 @@ class Response(Adaptor):
         self.cookies = cookies
         self.headers = headers
         self.request_headers = request_headers
+        encoding = ResponseEncoding.get_value(encoding)
         super().__init__(text=text, body=body, url=automatch_domain or url, encoding=encoding, **adaptor_arguments)
         # For back-ward compatibility
         self.adaptor = self
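As a quick illustration of the rules above (the expected values mirror the new tests/fetchers/test_utils.py later in this diff):

```python
from scrapling.engines.toolbelt.custom import ResponseEncoding

assert ResponseEncoding.get_value('text/html; charset="UTF-8"') == 'UTF-8'
assert ResponseEncoding.get_value('text/html') == 'ISO-8859-1'    # text/* rule per HTTP/1.1
assert ResponseEncoding.get_value('application/json') == 'utf-8'  # RFC 4627 rule
assert ResponseEncoding.get_value(None) == 'utf-8'                # no header at all
```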
scrapling/engines/toolbelt/fingerprints.py
CHANGED
@@ -67,7 +67,7 @@ def generate_headers(browser_mode: bool = False) -> Dict:
     # So we don't raise any inconsistency red flags while websites fingerprinting us
     os_name = get_os_name()
     return HeaderGenerator(
-        browser=[Browser(name='chrome', min_version=
+        browser=[Browser(name='chrome', min_version=130)],
         os=os_name,  # None is ignored
         device='desktop'
     ).generate()
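For reference, a hedged sketch of what the pinned generator produces, using browserforge's public API as shown in the hunk above; the fixed 'windows' value is an assumption standing in for the engine's `get_os_name()` call:

```python
from browserforge.headers import Browser, HeaderGenerator

headers = HeaderGenerator(
    browser=[Browser(name='chrome', min_version=130)],  # matches the new pin above
    os='windows',      # stand-in for get_os_name()
    device='desktop',
).generate()

print(headers['User-Agent'])  # a realistic Chrome >=130 desktop UA string
```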
scrapling/fetchers.py
CHANGED
@@ -11,6 +11,7 @@ class Fetcher(BaseFetcher):
     """
     def get(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
         """Make basic HTTP GET request for you but with some added flavors.
+
         :param url: Target url.
         :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
         :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
@@ -24,6 +25,7 @@ class Fetcher(BaseFetcher):
 
     def post(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
         """Make basic HTTP POST request for you but with some added flavors.
+
         :param url: Target url.
         :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
         :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
@@ -37,12 +39,14 @@ class Fetcher(BaseFetcher):
 
     def put(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
         """Make basic HTTP PUT request for you but with some added flavors.
+
         :param url: Target url
         :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
         :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
-
+            create a referer header as if this request came from Google's search of this URL's domain.
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
+
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
         response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).put(url, stealthy_headers, **kwargs)
@@ -50,6 +54,7 @@ class Fetcher(BaseFetcher):
 
     def delete(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
         """Make basic HTTP DELETE request for you but with some added flavors.
+
         :param url: Target url
         :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
         :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
@@ -77,6 +82,7 @@ class StealthyFetcher(BaseFetcher):
     ) -> Response:
         """
         Opens up a browser and do your request based on your chosen options below.
+
         :param url: Target url.
         :param headless: Run the browser in headless/hidden (default), 'virtual' screen mode, or headful/visible mode.
         :param block_images: Prevent the loading of images through Firefox preferences.
@@ -127,26 +133,28 @@ class PlayWrightFetcher(BaseFetcher):
     Using this Fetcher class, you can do requests with:
         - Vanilla Playwright without any modifications other than the ones you chose.
         - Stealthy Playwright with the stealth mode I wrote for it. It's still a work in progress but it bypasses many online tests like bot.sannysoft.com
-
-
-
-
-
-        - Real browsers by passing the CDP URL of your browser to be controlled by the Fetcher and most of the options can be enabled on it.
+            Some of the things stealth mode does include:
+                1) Patches the CDP runtime fingerprint.
+                2) Mimics some of the real browsers' properties by injecting several JS files and using custom options.
+                3) Using custom flags on launch to hide Playwright even more and make it faster.
+                4) Generates real browser's headers of the same type and same user OS then append it to the request.
+        - Real browsers by passing the `real_chrome` argument or the CDP URL of your browser to be controlled by the Fetcher and most of the options can be enabled on it.
         - NSTBrowser's docker browserless option by passing the CDP URL and enabling `nstbrowser_mode` option.
-
+
+    > Note that these are the main options with PlayWright but it can be mixed together.
     """
     def fetch(
             self, url: str, headless: Union[bool, str] = True, disable_resources: bool = None,
             useragent: Optional[str] = None, network_idle: Optional[bool] = False, timeout: Optional[float] = 30000,
-            page_action: Callable = do_nothing, wait_selector: Optional[str] = None, wait_selector_state: Optional[str] = 'attached',
-            hide_canvas: bool =
+            page_action: Optional[Callable] = do_nothing, wait_selector: Optional[str] = None, wait_selector_state: Optional[str] = 'attached',
+            hide_canvas: Optional[bool] = False, disable_webgl: Optional[bool] = False, extra_headers: Optional[Dict[str, str]] = None, google_search: Optional[bool] = True,
             proxy: Optional[Union[str, Dict[str, str]]] = None,
-            stealth: bool = False,
+            stealth: Optional[bool] = False, real_chrome: Optional[bool] = False,
             cdp_url: Optional[str] = None,
-            nstbrowser_mode: bool = False, nstbrowser_config: Optional[Dict] = None,
+            nstbrowser_mode: Optional[bool] = False, nstbrowser_config: Optional[Dict] = None,
     ) -> Response:
         """Opens up a browser and do your request based on your chosen options below.
+
         :param url: Target url.
         :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
         :param disable_resources: Drop requests of unnecessary resources for speed boost. It depends but it made requests ~25% faster in my tests for some websites.
@@ -159,6 +167,7 @@ class PlayWrightFetcher(BaseFetcher):
         :param wait_selector: Wait for a specific css selector to be in a specific state.
         :param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`.
         :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
+        :param real_chrome: If you have chrome browser installed on your device, enable this and the Fetcher will launch an instance of your browser and use it.
         :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
         :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
         :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
@@ -176,6 +185,7 @@ class PlayWrightFetcher(BaseFetcher):
             cdp_url=cdp_url,
             headless=headless,
             useragent=useragent,
+            real_chrome=real_chrome,
             page_action=page_action,
             hide_canvas=hide_canvas,
             network_idle=network_idle,
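Taken together, a hedged usage sketch of the new option as exposed by the public fetcher, assuming the 0.2.6 API shown in this diff (placeholder URL; requires a local Chrome install):

```python
from scrapling import PlayWrightFetcher

fetcher = PlayWrightFetcher(auto_match=False)
# Launches your installed Chrome instead of Playwright's bundled Chromium
page = fetcher.fetch('https://example.com', real_chrome=True)
print(page.status, page.reason)
```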
{scrapling-0.2.4.dist-info → scrapling-0.2.6.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: scrapling
-Version: 0.2.4
+Version: 0.2.6
 Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
 Home-page: https://github.com/D4Vinci/Scrapling
 Author: Karim Shoair
@@ -32,16 +32,16 @@ Classifier: Typing :: Typed
 Requires-Python: >=3.8
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: requests
-Requires-Dist: lxml
-Requires-Dist: cssselect
+Requires-Dist: requests>=2.3
+Requires-Dist: lxml>=4.5
+Requires-Dist: cssselect>=1.2
 Requires-Dist: w3lib
-Requires-Dist: orjson
+Requires-Dist: orjson>=3
 Requires-Dist: tldextract
 Requires-Dist: httpx[brotli,zstd]
-Requires-Dist: playwright
+Requires-Dist: playwright==1.48
 Requires-Dist: rebrowser-playwright
-Requires-Dist: camoufox
+Requires-Dist: camoufox>=0.3.10
 Requires-Dist: browserforge
 
 # 🕷️ Scrapling: Undetectable, Lightning-Fast, and Adaptive Web Scraping for Python
@@ -336,9 +336,11 @@ Using this Fetcher class, you can make requests with:
   * Mimics some of the real browsers' properties by injecting several JS files and using custom options.
   * Using custom flags on launch to hide Playwright even more and make it faster.
   * Generates real browser's headers of the same type and same user OS then append it to the request's headers.
-3) Real browsers by passing the CDP URL of your browser to be controlled by the Fetcher and most of the options can be enabled on it.
+3) Real browsers by passing the `real_chrome` argument or the CDP URL of your browser to be controlled by the Fetcher and most of the options can be enabled on it.
 4) [NSTBrowser](https://app.nstbrowser.io/r/1vO5e5)'s [docker browserless](https://hub.docker.com/r/nstbrowser/browserless) option by passing the CDP URL and enabling `nstbrowser_mode` option.
 
+> Hence using the `real_chrome` argument requires that you have chrome browser installed on your device
+
 Add that to a lot of controlling/hiding options as you will see in the arguments list below.
 
 <details><summary><strong>Expand this for the complete list of arguments</strong></summary>
@@ -360,6 +362,7 @@ Add that to a lot of controlling/hiding options as you will see in the arguments
 | hide_canvas | Add random noise to canvas operations to prevent fingerprinting. | ✔️ |
 | disable_webgl | Disables WebGL and WebGL 2.0 support entirely. | ✔️ |
 | stealth | Enables stealth mode, always check the documentation to see what stealth mode does currently. | ✔️ |
+| real_chrome | If you have chrome browser installed on your device, enable this and the Fetcher will launch an instance of your browser and use it. | ✔️ |
 | cdp_url | Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP. | ✔️ |
 | nstbrowser_mode | Enables NSTBrowser mode, **it have to be used with `cdp_url` argument or it will get completely ignored.** | ✔️ |
 | nstbrowser_config | The config you want to send with requests to the NSTBrowser. _If left empty, Scrapling defaults to an optimized NSTBrowser's docker browserless config._ | ✔️ |
{scrapling-0.2.4.dist-info → scrapling-0.2.6.dist-info}/RECORD
CHANGED
@@ -1,23 +1,23 @@
-scrapling/__init__.py,sha256=
+scrapling/__init__.py,sha256=NnIpEZcBGs5Pu2TjqPCacC7N6LN37SbnniBU1AhgdXs,435
 scrapling/defaults.py,sha256=blYDLiuI5DgDSLRWnUgpp21WtFOsv1BsCRCmPeg8Xc4,287
-scrapling/fetchers.py,sha256
+scrapling/fetchers.py,sha256=-gc-Yo1MjF_4cdJ-5rxZqNC0owxFXTFoEBj08BFEYPs,16361
 scrapling/parser.py,sha256=d2n00uF5i7W5lf0afLNRdk17ZFcNyiF9EzXLRQGA0NM,54111
 scrapling/py.typed,sha256=frcCV1k9oG9oKj3dpUqdJg1PxRT2RSN_XKdLCPjaYaY,2
 scrapling/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 scrapling/core/_types.py,sha256=nD2ZY_fitLohx3MfDmqoKJ9ZShrnRhQ8-d1SU1zEGAY,552
-scrapling/core/custom_types.py,sha256
+scrapling/core/custom_types.py,sha256=ztE_tshJ8i5uKqqSbsN5S6MoIUSfX6SexlhRjAnkclk,8402
 scrapling/core/mixins.py,sha256=sozbpaGL1_O_x3U-ABM5aYWpnxpCLfdbcA9SG3P7weY,3532
 scrapling/core/storage_adaptors.py,sha256=Kbak0BOJX5e9I1PbUS_4sUJi2Wxw8Bv5XsaLHAu1l2Q,6218
-scrapling/core/translator.py,sha256=
+scrapling/core/translator.py,sha256=R97lKGq1SDbx8S8Hg_w_5d4ePgukTHj_hRIKFzWiRuc,5229
 scrapling/core/utils.py,sha256=fXdANUgRBbVbOerJ94fRY9vi7n5zsbm8t3G4qQ-F3ak,3792
 scrapling/engines/__init__.py,sha256=zwMqcSdNGh-IX0d4zXazrgAeHrkqIN_v5Ia7RU1g8W0,267
-scrapling/engines/camo.py,sha256=
+scrapling/engines/camo.py,sha256=dXkdfFmf3M09RXAvaZ8CE5khsblC3Wd7_6jWfu8XO6I,7618
 scrapling/engines/constants.py,sha256=jSDA6lgbvEIB8z2m2SFzCKkvFEZnp28Mondy2__FpkM,3721
-scrapling/engines/pw.py,sha256=
-scrapling/engines/static.py,sha256=
+scrapling/engines/pw.py,sha256=gMWJAZYpJbFK-GiyRrpVrMjyMqSSetE6hf8kmf0zR2o,12729
+scrapling/engines/static.py,sha256=wzBsoOHPpN5JV1izQSSSarPBNWB-wo0BDWNFuin6ID8,7109
 scrapling/engines/toolbelt/__init__.py,sha256=BbxfC0depVOV3i3BnBnyfjHtLcZrDbhz6c5rTRczZUc,383
-scrapling/engines/toolbelt/custom.py,sha256=
-scrapling/engines/toolbelt/fingerprints.py,sha256=
+scrapling/engines/toolbelt/custom.py,sha256=ELr3_FwUqNI27E98kz-50OA5a6hQQtoIYrZoLKsvUpM,12551
+scrapling/engines/toolbelt/fingerprints.py,sha256=T9HQejHzAnHsD5EIXvrYVC5siiG5q2gOOXVIIANmzMc,2917
 scrapling/engines/toolbelt/navigation.py,sha256=Tde5_6Wv7lOeWXMzs9D6TRaxAbJ3b-zIX6-4HggZbCQ,4017
 scrapling/engines/toolbelt/bypasses/navigator_plugins.js,sha256=tbnnk3nCXB6QEQnOhDlu3n-s7lnUTAkrUsjP6FDQIQg,2104
 scrapling/engines/toolbelt/bypasses/notification_permission.js,sha256=poPM3o5WYgEX-EdiUfDCllpWfc3Umvw4jr2u6O6elus,237
@@ -28,14 +28,15 @@ scrapling/engines/toolbelt/bypasses/webdriver_fully.js,sha256=hdJw4clRAJQqIdq5gI
 scrapling/engines/toolbelt/bypasses/window_chrome.js,sha256=D7hqzNGGDorh8JVlvm2YIv7Bk2CoVkG55MDIdyqhT1w,6808
 tests/__init__.py,sha256=YHFB5ftzgLQVh6gbPfbYcY4yOS9DOBp5dBa6I-qtm8U,32
 tests/fetchers/__init__.py,sha256=6H4NgARhyTcGGd3dNCKQJ8kUFdrAEMSScQL7Ga_vU3c,43
-tests/fetchers/test_camoufox.py,sha256=
+tests/fetchers/test_camoufox.py,sha256=53piGA5uuPvOx5BeUEA0bbizYihwHGxehnj5uqCr6Q0,3115
 tests/fetchers/test_httpx.py,sha256=UivOItR3-l-bXp9E6TP5Tvn2OrCdgiVkWsti-f9xdpU,3507
-tests/fetchers/test_playwright.py,sha256=
+tests/fetchers/test_playwright.py,sha256=7qwbIU2SwjiQEbaGPA_MBo6kAXM4IBmfvy5kUvKT11M,3701
+tests/fetchers/test_utils.py,sha256=FPPJkBrqgYxdGeWwapH8Vj8zyfYVLiTE1qSLu8eBWik,5728
 tests/parser/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/parser/test_automatch.py,sha256=BeeYJi3cYCghbiZmi57z4bqcGPaoUA8GAm7MALBBkkk,2486
-tests/parser/test_general.py,sha256=
-scrapling-0.2.
-scrapling-0.2.
-scrapling-0.2.
-scrapling-0.2.
-scrapling-0.2.
+tests/parser/test_general.py,sha256=qaiVzpvqESfdXYFat6QrpnMkevPYgCzIcTZK5FwdC0s,11783
+scrapling-0.2.6.dist-info/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
+scrapling-0.2.6.dist-info/METADATA,sha256=cFOu2nlkXDsjyjkIt9kDu1nKKvS14xYH2LT4_VNH5j0,65362
+scrapling-0.2.6.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+scrapling-0.2.6.dist-info/top_level.txt,sha256=ub7FkOEXeYmmYTUxd4pCrwXfBfAMIpZ1sCGmXCc14tI,16
+scrapling-0.2.6.dist-info/RECORD,,
tests/fetchers/test_camoufox.py
CHANGED
@@ -36,6 +36,7 @@ class TestStealthyFetcher(unittest.TestCase):
     def test_waiting_selector(self):
         """Test if waiting for a selector make page does not finish loading or not"""
         self.assertEqual(self.fetcher.fetch(self.html_url, wait_selector='h1').status, 200)
+        self.assertEqual(self.fetcher.fetch(self.html_url, wait_selector='h1', wait_selector_state='visible').status, 200)
 
     def test_cookies_loading(self):
         """Test if cookies are set after the request"""
@@ -56,6 +57,7 @@ class TestStealthyFetcher(unittest.TestCase):
         self.assertEqual(self.fetcher.fetch(self.html_url, block_webrtc=True, allow_webgl=True).status, 200)
         self.assertEqual(self.fetcher.fetch(self.html_url, block_webrtc=False, allow_webgl=True).status, 200)
         self.assertEqual(self.fetcher.fetch(self.html_url, block_webrtc=True, allow_webgl=False).status, 200)
+        self.assertEqual(self.fetcher.fetch(self.html_url, extra_headers={'ayo': ''}, os_randomize=True).status, 200)
 
     def test_infinite_timeout(self):
         """Test if infinite timeout breaks the code or not"""
tests/fetchers/test_playwright.py
CHANGED
@@ -35,6 +35,7 @@ class TestPlayWrightFetcher(unittest.TestCase):
     def test_waiting_selector(self):
         """Test if waiting for a selector make page does not finish loading or not"""
         self.assertEqual(self.fetcher.fetch(self.html_url, wait_selector='h1').status, 200)
+        self.assertEqual(self.fetcher.fetch(self.html_url, wait_selector='h1', wait_selector_state='visible').status, 200)
 
     def test_cookies_loading(self):
         """Test if cookies are set after the request"""
@@ -56,6 +57,7 @@ class TestPlayWrightFetcher(unittest.TestCase):
         self.assertEqual(self.fetcher.fetch(self.html_url, disable_webgl=False, hide_canvas=True).status, 200)
         self.assertEqual(self.fetcher.fetch(self.html_url, stealth=True).status, 200)
         self.assertEqual(self.fetcher.fetch(self.html_url, useragent='Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:131.0) Gecko/20100101 Firefox/131.0').status, 200)
+        self.assertEqual(self.fetcher.fetch(self.html_url, extra_headers={'ayo': ''}).status, 200)
 
     def test_cdp_url(self):
         """Test if it's going to try to connect to cdp url or not"""
tests/fetchers/test_utils.py
ADDED
@@ -0,0 +1,129 @@
+import unittest
+
+from scrapling.engines.toolbelt.custom import ResponseEncoding, StatusText
+
+
+class TestPlayWrightFetcher(unittest.TestCase):
+    def setUp(self):
+        self.content_type_map = {
+            # A map generated by ChatGPT for most possible `content_type` values and the expected outcome
+            'text/html; charset=UTF-8': 'UTF-8',
+            'text/html; charset=ISO-8859-1': 'ISO-8859-1',
+            'text/html': 'ISO-8859-1',
+            'application/json; charset=UTF-8': 'UTF-8',
+            'application/json': 'utf-8',
+            'text/json': 'utf-8',
+            'application/javascript; charset=UTF-8': 'UTF-8',
+            'application/javascript': 'utf-8',
+            'text/plain; charset=UTF-8': 'UTF-8',
+            'text/plain; charset=ISO-8859-1': 'ISO-8859-1',
+            'text/plain': 'ISO-8859-1',
+            'application/xhtml+xml; charset=UTF-8': 'UTF-8',
+            'application/xhtml+xml': 'utf-8',
+            'text/html; charset=windows-1252': 'windows-1252',
+            'application/json; charset=windows-1252': 'windows-1252',
+            'text/plain; charset=windows-1252': 'windows-1252',
+            'text/html; charset="UTF-8"': 'UTF-8',
+            'text/html; charset="ISO-8859-1"': 'ISO-8859-1',
+            'text/html; charset="windows-1252"': 'windows-1252',
+            'application/json; charset="UTF-8"': 'UTF-8',
+            'application/json; charset="ISO-8859-1"': 'ISO-8859-1',
+            'application/json; charset="windows-1252"': 'windows-1252',
+            'text/json; charset="UTF-8"': 'UTF-8',
+            'application/javascript; charset="UTF-8"': 'UTF-8',
+            'application/javascript; charset="ISO-8859-1"': 'ISO-8859-1',
+            'text/plain; charset="UTF-8"': 'UTF-8',
+            'text/plain; charset="ISO-8859-1"': 'ISO-8859-1',
+            'text/plain; charset="windows-1252"': 'windows-1252',
+            'application/xhtml+xml; charset="UTF-8"': 'UTF-8',
+            'application/xhtml+xml; charset="ISO-8859-1"': 'ISO-8859-1',
+            'application/xhtml+xml; charset="windows-1252"': 'windows-1252',
+            'text/html; charset="US-ASCII"': 'US-ASCII',
+            'application/json; charset="US-ASCII"': 'US-ASCII',
+            'text/plain; charset="US-ASCII"': 'US-ASCII',
+            'text/html; charset="Shift_JIS"': 'Shift_JIS',
+            'application/json; charset="Shift_JIS"': 'Shift_JIS',
+            'text/plain; charset="Shift_JIS"': 'Shift_JIS',
+            'application/xml; charset="UTF-8"': 'UTF-8',
+            'application/xml; charset="ISO-8859-1"': 'ISO-8859-1',
+            'application/xml': 'utf-8',
+            'text/xml; charset="UTF-8"': 'UTF-8',
+            'text/xml; charset="ISO-8859-1"': 'ISO-8859-1',
+            'text/xml': 'utf-8'
+        }
+        self.status_map = {
+            100: "Continue",
+            101: "Switching Protocols",
+            102: "Processing",
+            103: "Early Hints",
+            200: "OK",
+            201: "Created",
+            202: "Accepted",
+            203: "Non-Authoritative Information",
+            204: "No Content",
+            205: "Reset Content",
+            206: "Partial Content",
+            207: "Multi-Status",
+            208: "Already Reported",
+            226: "IM Used",
+            300: "Multiple Choices",
+            301: "Moved Permanently",
+            302: "Found",
+            303: "See Other",
+            304: "Not Modified",
+            305: "Use Proxy",
+            307: "Temporary Redirect",
+            308: "Permanent Redirect",
+            400: "Bad Request",
+            401: "Unauthorized",
+            402: "Payment Required",
+            403: "Forbidden",
+            404: "Not Found",
+            405: "Method Not Allowed",
+            406: "Not Acceptable",
+            407: "Proxy Authentication Required",
+            408: "Request Timeout",
+            409: "Conflict",
+            410: "Gone",
+            411: "Length Required",
+            412: "Precondition Failed",
+            413: "Payload Too Large",
+            414: "URI Too Long",
+            415: "Unsupported Media Type",
+            416: "Range Not Satisfiable",
+            417: "Expectation Failed",
+            418: "I'm a teapot",
+            421: "Misdirected Request",
+            422: "Unprocessable Entity",
+            423: "Locked",
+            424: "Failed Dependency",
+            425: "Too Early",
+            426: "Upgrade Required",
+            428: "Precondition Required",
+            429: "Too Many Requests",
+            431: "Request Header Fields Too Large",
+            451: "Unavailable For Legal Reasons",
+            500: "Internal Server Error",
+            501: "Not Implemented",
+            502: "Bad Gateway",
+            503: "Service Unavailable",
+            504: "Gateway Timeout",
+            505: "HTTP Version Not Supported",
+            506: "Variant Also Negotiates",
+            507: "Insufficient Storage",
+            508: "Loop Detected",
+            510: "Not Extended",
+            511: "Network Authentication Required"
+        }
+
+    def test_parsing_content_type(self):
+        """Test if parsing different types of content-type returns the expected result"""
+        for header_value, expected_encoding in self.content_type_map.items():
+            self.assertEqual(ResponseEncoding.get_value(header_value), expected_encoding)
+
+    def test_parsing_response_status(self):
+        """Test if using different http responses' status codes returns the expected result"""
+        for status_code, expected_status_text in self.status_map.items():
+            self.assertEqual(StatusText.get(status_code), expected_status_text)
+
+        self.assertEqual(StatusText.get(1000), "Unknown Status Code")
tests/parser/test_general.py
CHANGED
@@ -278,7 +278,7 @@ class TestParser(unittest.TestCase):
         self.assertEqual(len(elements), 5000)
         # Converting 5000 elements to a class and doing operations on them will take time
         # Based on my tests with 100 runs, 1 loop each Scrapling (given the extra work/features) takes 10.4ms on average
-        self.assertLess(end_time - start_time, 0.1)
+        self.assertLess(end_time - start_time, 0.5)  # Locally I test on 0.1 but on GitHub actions with browsers and threading sometimes closing adds fractions of seconds
 
 
 # Use `coverage run -m unittest --verbose tests/test_parser_functions.py` instead for the coverage report
{scrapling-0.2.4.dist-info → scrapling-0.2.6.dist-info}/LICENSE
File without changes
{scrapling-0.2.4.dist-info → scrapling-0.2.6.dist-info}/top_level.txt
File without changes