scrapling 0.2.4__py3-none-any.whl → 0.2.6__py3-none-any.whl
- scrapling/__init__.py +1 -1
- scrapling/core/custom_types.py +2 -3
- scrapling/core/translator.py +6 -4
- scrapling/engines/camo.py +3 -6
- scrapling/engines/pw.py +18 -15
- scrapling/engines/static.py +5 -1
- scrapling/engines/toolbelt/custom.py +69 -1
- scrapling/engines/toolbelt/fingerprints.py +1 -1
- scrapling/fetchers.py +22 -12
- {scrapling-0.2.4.dist-info → scrapling-0.2.6.dist-info}/METADATA +11 -8
- {scrapling-0.2.4.dist-info → scrapling-0.2.6.dist-info}/RECORD +18 -17
- {scrapling-0.2.4.dist-info → scrapling-0.2.6.dist-info}/WHEEL +1 -1
- tests/fetchers/test_camoufox.py +2 -0
- tests/fetchers/test_playwright.py +2 -0
- tests/fetchers/test_utils.py +129 -0
- tests/parser/test_general.py +1 -1
- {scrapling-0.2.4.dist-info → scrapling-0.2.6.dist-info}/LICENSE +0 -0
- {scrapling-0.2.4.dist-info → scrapling-0.2.6.dist-info}/top_level.txt +0 -0
scrapling/__init__.py
CHANGED
@@ -4,7 +4,7 @@ from scrapling.parser import Adaptor, Adaptors
 from scrapling.core.custom_types import TextHandler, AttributesHandler
 
 __author__ = "Karim Shoair (karim.shoair@pm.me)"
-__version__ = "0.2.4"
+__version__ = "0.2.6"
 __copyright__ = "Copyright (c) 2024 Karim Shoair"
 
 
scrapling/core/custom_types.py
CHANGED
@@ -129,9 +129,8 @@ class TextHandlers(List[TextHandler]):
 
 
 class AttributesHandler(Mapping):
-    """A read-only mapping to use instead of the standard dictionary for the speed boost but
-    at the same time I use it to add more functionalities.
-    If standard dictionary is needed, just convert this class to dictionary with `dict` function
+    """A read-only mapping to use instead of the standard dictionary for the speed boost but at the same time I use it to add more functionalities.
+    If standard dictionary is needed, just convert this class to dictionary with `dict` function
     """
     __slots__ = ('_data',)
 
scrapling/core/translator.py
CHANGED
@@ -1,9 +1,11 @@
 """
 Most of this file is adapted version of the translator of parsel library with some modifications simply for 1 important reason...
-
-which will be important in future releases but most importantly...
-
-
+
+To add pseudo-elements ``::text`` and ``::attr(ATTR_NAME)`` so we match Parsel/Scrapy selectors format which will be important in future releases but most importantly...
+
+So you don't have to learn a new selectors/api method like what bs4 done with soupsieve :)
+
+if you want to learn about this, head to https://cssselect.readthedocs.io/en/latest/#cssselect.FunctionalPseudoElement
 """
 
 import re
scrapling/engines/camo.py
CHANGED
@@ -104,13 +104,10 @@ class CamoufoxEngine:
             if self.wait_selector and type(self.wait_selector) is str:
                 waiter = page.locator(self.wait_selector)
-                waiter.wait_for(state=self.wait_selector_state)
+                waiter.first.wait_for(state=self.wait_selector_state)
 
-
-
-            encoding = 'utf-8'  # default encoding
-            if 'charset=' in content_type.lower():
-                encoding = content_type.lower().split('charset=')[-1].split(';')[0].strip()
+            # This will be parsed inside `Response`
+            encoding = res.headers.get('content-type', '') or 'utf-8'  # default encoding
 
             status_text = res.status_text
             # PlayWright API sometimes give empty status text for some reason!
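Context for the `waiter.first` change (the same fix appears in pw.py below): Playwright locators are strict, so calling `wait_for()` on a locator that matches several elements raises an error instead of waiting, and `.first` narrows the locator to one match. An illustrative standalone sketch, not part of the diff (the URL and selector are placeholders):

    from playwright.sync_api import sync_playwright

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        page = browser.new_page()
        page.goto('https://example.com')  # placeholder URL
        # 'p' may match many nodes; `.first` keeps the strict locator
        # to a single element so wait_for() cannot hit a multi-match error.
        page.locator('p').first.wait_for(state='attached')
        browser.close()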
scrapling/engines/pw.py
CHANGED
@@ -27,11 +27,12 @@ class PlaywrightEngine:
             page_action: Callable = do_nothing,
             wait_selector: Optional[str] = None,
             wait_selector_state: Optional[str] = 'attached',
-            stealth: bool = False,
-            hide_canvas: bool = False,
-            disable_webgl: bool = False,
+            stealth: Optional[bool] = False,
+            real_chrome: Optional[bool] = False,
+            hide_canvas: Optional[bool] = False,
+            disable_webgl: Optional[bool] = False,
             cdp_url: Optional[str] = None,
-            nstbrowser_mode: bool = False,
+            nstbrowser_mode: Optional[bool] = False,
             nstbrowser_config: Optional[Dict] = None,
             google_search: Optional[bool] = True,
             extra_headers: Optional[Dict[str, str]] = None,
@@ -51,6 +52,7 @@ class PlaywrightEngine:
         :param wait_selector: Wait for a specific css selector to be in a specific state.
         :param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`.
         :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
+        :param real_chrome: If you have chrome browser installed on your device, enable this and the Fetcher will launch an instance of your browser and use it.
         :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
         :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
         :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
@@ -67,6 +69,7 @@ class PlaywrightEngine:
         self.stealth = bool(stealth)
         self.hide_canvas = bool(hide_canvas)
         self.disable_webgl = bool(disable_webgl)
+        self.real_chrome = bool(real_chrome)
         self.google_search = bool(google_search)
         self.extra_headers = extra_headers or {}
         self.proxy = construct_proxy_dict(proxy)
@@ -119,7 +122,8 @@ class PlaywrightEngine:
         :param url: Target url.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-        if not self.stealth:
+        if not self.stealth or self.real_chrome:
+            # Because rebrowser_playwright doesn't play well with real browsers
             from playwright.sync_api import sync_playwright
         else:
             from rebrowser_playwright.sync_api import sync_playwright
@@ -130,8 +134,8 @@ class PlaywrightEngine:
             extra_headers = {}
             useragent = self.useragent
         else:
-            extra_headers =
-            useragent =
+            extra_headers = {}
+            useragent = generate_headers(browser_mode=True).get('User-Agent')
 
         # Prepare the flags before diving
         flags = DEFAULT_STEALTH_FLAGS
@@ -146,9 +150,11 @@ class PlaywrightEngine:
                 browser = p.chromium.connect_over_cdp(endpoint_url=cdp_url)
             else:
                 if self.stealth:
-                    browser = p.chromium.launch(
+                    browser = p.chromium.launch(
+                        headless=self.headless, args=flags, ignore_default_args=['--enable-automation'], chromium_sandbox=True, channel='chrome' if self.real_chrome else 'chromium'
+                    )
                 else:
-                    browser = p.chromium.launch(headless=self.headless, ignore_default_args=['--enable-automation'])
+                    browser = p.chromium.launch(headless=self.headless, ignore_default_args=['--enable-automation'], channel='chrome' if self.real_chrome else 'chromium')
 
             # Creating the context
             if self.stealth:
@@ -214,13 +220,10 @@ class PlaywrightEngine:
             if self.wait_selector and type(self.wait_selector) is str:
                 waiter = page.locator(self.wait_selector)
-                waiter.wait_for(state=self.wait_selector_state)
+                waiter.first.wait_for(state=self.wait_selector_state)
 
-
-
-            encoding = 'utf-8'  # default encoding
-            if 'charset=' in content_type.lower():
-                encoding = content_type.lower().split('charset=')[-1].split(';')[0].strip()
+            # This will be parsed inside `Response`
+            encoding = res.headers.get('content-type', '') or 'utf-8'  # default encoding
 
             status_text = res.status_text
             # PlayWright API sometimes give empty status text for some reason!
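The `channel` switch used above is stock Playwright behavior: `channel='chrome'` launches the locally installed Google Chrome while `'chromium'` uses the bundled build. A minimal standalone sketch of the same decision, not part of the diff (assumes Chrome is installed):

    from playwright.sync_api import sync_playwright

    real_chrome = True  # mirrors the new `real_chrome` flag

    with sync_playwright() as p:
        browser = p.chromium.launch(
            headless=True,
            # 'chrome' targets the locally installed Google Chrome build;
            # 'chromium' falls back to Playwright's bundled browser.
            channel='chrome' if real_chrome else 'chromium',
        )
        page = browser.new_page()
        page.goto('https://example.com')  # placeholder URL
        browser.close()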
scrapling/engines/static.py
CHANGED
@@ -23,7 +23,7 @@ class StaticEngine:
     @staticmethod
     def _headers_job(headers: Optional[Dict], url: str, stealth: bool) -> Dict:
         """Adds useragent to headers if it doesn't exist, generates real headers and append it to current headers, and
-
+        finally generates a referer header that looks like if this request came from Google's search of the current URL's domain.
 
         :param headers: Current headers in the request if the user passed any
         :param url: The Target URL.
@@ -65,6 +65,7 @@ class StaticEngine:
 
     def get(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
         """Make basic HTTP GET request for you but with some added flavors.
+
         :param url: Target url.
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
             create a referer header as if this request had came from Google's search of this URL's domain.
@@ -77,6 +78,7 @@ class StaticEngine:
 
     def post(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
         """Make basic HTTP POST request for you but with some added flavors.
+
         :param url: Target url.
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
             create a referer header as if this request had came from Google's search of this URL's domain.
@@ -89,6 +91,7 @@ class StaticEngine:
 
     def delete(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
         """Make basic HTTP DELETE request for you but with some added flavors.
+
         :param url: Target url.
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
             create a referer header as if this request had came from Google's search of this URL's domain.
@@ -101,6 +104,7 @@ class StaticEngine:
 
     def put(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
         """Make basic HTTP PUT request for you but with some added flavors.
+
         :param url: Target url.
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
             create a referer header as if this request had came from Google's search of this URL's domain.
scrapling/engines/toolbelt/custom.py
CHANGED
@@ -3,11 +3,78 @@ Functions related to custom types or type checking
 """
 import inspect
 import logging
+from email.message import Message
 
 from scrapling.core.custom_types import MappingProxyType
 from scrapling.parser import Adaptor, SQLiteStorageSystem
 from scrapling.core.utils import setup_basic_logging, cache
-from scrapling.core._types import Any, List, Type, Union, Optional, Dict, Callable
+from scrapling.core._types import Any, List, Type, Union, Optional, Dict, Callable, Tuple
+
+
+class ResponseEncoding:
+    __DEFAULT_ENCODING = "utf-8"
+    __ISO_8859_1_CONTENT_TYPES = {"text/plain", "text/html", "text/css", "text/javascript"}
+
+    @classmethod
+    @cache(maxsize=None)
+    def __parse_content_type(cls, header_value: str) -> Tuple[str, Dict[str, str]]:
+        """Parse content type and parameters from a content-type header value.
+
+        Uses `email.message.Message` for robust header parsing according to RFC 2045.
+
+        :param header_value: Raw content-type header string
+        :return: Tuple of (content_type, parameters_dict)
+        """
+        # Create a Message object and set the Content-Type header then get the content type and parameters
+        msg = Message()
+        msg['content-type'] = header_value
+
+        content_type = msg.get_content_type()
+        params = dict(msg.get_params(failobj=[]))
+
+        # Remove the content-type from params if present somehow
+        params.pop('content-type', None)
+
+        return content_type, params
+
+    @classmethod
+    @cache(maxsize=None)
+    def get_value(cls, content_type: Optional[str]) -> str:
+        """Determine the appropriate character encoding from a content-type header.
+
+        The encoding is determined by these rules in order:
+        1. If no content-type is provided, use UTF-8
+        2. If charset parameter is present, use that encoding
+        3. If content-type is `text/*`, use ISO-8859-1 per HTTP/1.1 spec
+        4. If content-type is application/json, use UTF-8 per RFC 4627
+        5. Default to UTF-8 if nothing else matches
+
+        :param content_type: Content-Type header value or None
+        :return: String naming the character encoding
+        """
+        if not content_type:
+            return cls.__DEFAULT_ENCODING
+
+        try:
+            content_type, params = cls.__parse_content_type(content_type)
+
+            # First check for explicit charset parameter
+            if "charset" in params:
+                encoding = params["charset"].strip("'\"")
+                "test".encode(encoding)  # Validate encoding
+                return encoding
+
+            # Apply content-type specific rules
+            if content_type in cls.__ISO_8859_1_CONTENT_TYPES:
+                return "ISO-8859-1"
+
+            if content_type == "application/json":
+                return cls.__DEFAULT_ENCODING
+
+            return cls.__DEFAULT_ENCODING
+
+        except (ValueError, LookupError, UnicodeEncodeError):
+            return cls.__DEFAULT_ENCODING
 
 
 class Response(Adaptor):
@@ -20,6 +87,7 @@ class Response(Adaptor):
         self.cookies = cookies
         self.headers = headers
         self.request_headers = request_headers
+        encoding = ResponseEncoding.get_value(encoding)
         super().__init__(text=text, body=body, url=automatch_domain or url, encoding=encoding, **adaptor_arguments)
         # For back-ward compatibility
         self.adaptor = self
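For orientation, the new `ResponseEncoding.get_value` resolves encodings as follows; the expected values below come straight from the new tests/fetchers/test_utils.py further down (illustrative snippet, not part of the diff):

    from scrapling.engines.toolbelt.custom import ResponseEncoding

    print(ResponseEncoding.get_value('text/html; charset=UTF-8'))  # UTF-8 (explicit charset wins)
    print(ResponseEncoding.get_value('text/html'))                 # ISO-8859-1 (HTTP/1.1 default for text/*)
    print(ResponseEncoding.get_value('application/json'))          # utf-8 (RFC 4627)
    print(ResponseEncoding.get_value(None))                        # utf-8 (fallback)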
scrapling/engines/toolbelt/fingerprints.py
CHANGED
@@ -67,7 +67,7 @@ def generate_headers(browser_mode: bool = False) -> Dict:
     # So we don't raise any inconsistency red flags while websites fingerprinting us
     os_name = get_os_name()
     return HeaderGenerator(
-        browser=[Browser(name='chrome', min_version=
+        browser=[Browser(name='chrome', min_version=130)],
         os=os_name,  # None is ignored
         device='desktop'
     ).generate()
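A quick way to sanity-check the bumped Chrome version floor, using the browserforge dependency the code above already relies on (illustrative, not part of the diff; generated headers vary per run):

    from browserforge.headers import Browser, HeaderGenerator

    headers = HeaderGenerator(
        browser=[Browser(name='chrome', min_version=130)],  # the new floor from this diff
        device='desktop',
    ).generate()
    print(headers.get('User-Agent'))  # should advertise Chrome >= 130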
scrapling/fetchers.py
CHANGED
@@ -11,6 +11,7 @@ class Fetcher(BaseFetcher):
     """
     def get(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
         """Make basic HTTP GET request for you but with some added flavors.
+
         :param url: Target url.
         :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
         :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
@@ -24,6 +25,7 @@ class Fetcher(BaseFetcher):
 
     def post(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
         """Make basic HTTP POST request for you but with some added flavors.
+
         :param url: Target url.
         :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
         :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
@@ -37,12 +39,14 @@ class Fetcher(BaseFetcher):
 
     def put(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
         """Make basic HTTP PUT request for you but with some added flavors.
+
         :param url: Target url
         :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
         :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
-            create a referer header as if this request had came from Google's search of this URL's domain.
+            create a referer header as if this request came from Google's search of this URL's domain.
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
+
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
         response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).put(url, stealthy_headers, **kwargs)
@@ -50,6 +54,7 @@ class Fetcher(BaseFetcher):
 
     def delete(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
         """Make basic HTTP DELETE request for you but with some added flavors.
+
         :param url: Target url
         :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
         :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
@@ -77,6 +82,7 @@ class StealthyFetcher(BaseFetcher):
     ) -> Response:
         """
         Opens up a browser and do your request based on your chosen options below.
+
         :param url: Target url.
         :param headless: Run the browser in headless/hidden (default), 'virtual' screen mode, or headful/visible mode.
         :param block_images: Prevent the loading of images through Firefox preferences.
@@ -127,26 +133,28 @@ class PlayWrightFetcher(BaseFetcher):
     Using this Fetcher class, you can do requests with:
     - Vanilla Playwright without any modifications other than the ones you chose.
     - Stealthy Playwright with the stealth mode I wrote for it. It's still a work in progress but it bypasses many online tests like bot.sannysoft.com
-
-
-
-
-
-    - Real browsers by passing the CDP URL of your browser to be controlled by the Fetcher and most of the options can be enabled on it.
+        Some of the things stealth mode does include:
+        1) Patches the CDP runtime fingerprint.
+        2) Mimics some of the real browsers' properties by injecting several JS files and using custom options.
+        3) Using custom flags on launch to hide Playwright even more and make it faster.
+        4) Generates real browser's headers of the same type and same user OS then append it to the request.
+    - Real browsers by passing the `real_chrome` argument or the CDP URL of your browser to be controlled by the Fetcher and most of the options can be enabled on it.
     - NSTBrowser's docker browserless option by passing the CDP URL and enabling `nstbrowser_mode` option.
-
+
+    > Note that these are the main options with PlayWright but it can be mixed together.
     """
     def fetch(
             self, url: str, headless: Union[bool, str] = True, disable_resources: bool = None,
             useragent: Optional[str] = None, network_idle: Optional[bool] = False, timeout: Optional[float] = 30000,
-            page_action: Callable = do_nothing, wait_selector: Optional[str] = None, wait_selector_state: Optional[str] = 'attached',
-            hide_canvas: bool =
+            page_action: Optional[Callable] = do_nothing, wait_selector: Optional[str] = None, wait_selector_state: Optional[str] = 'attached',
+            hide_canvas: Optional[bool] = False, disable_webgl: Optional[bool] = False, extra_headers: Optional[Dict[str, str]] = None, google_search: Optional[bool] = True,
             proxy: Optional[Union[str, Dict[str, str]]] = None,
-            stealth: bool = False,
+            stealth: Optional[bool] = False, real_chrome: Optional[bool] = False,
             cdp_url: Optional[str] = None,
-            nstbrowser_mode: bool = False, nstbrowser_config: Optional[Dict] = None,
+            nstbrowser_mode: Optional[bool] = False, nstbrowser_config: Optional[Dict] = None,
     ) -> Response:
         """Opens up a browser and do your request based on your chosen options below.
+
         :param url: Target url.
         :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
         :param disable_resources: Drop requests of unnecessary resources for speed boost. It depends but it made requests ~25% faster in my tests for some websites.
@@ -159,6 +167,7 @@ class PlayWrightFetcher(BaseFetcher):
         :param wait_selector: Wait for a specific css selector to be in a specific state.
         :param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`.
         :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
+        :param real_chrome: If you have chrome browser installed on your device, enable this and the Fetcher will launch an instance of your browser and use it.
         :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
         :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
         :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
@@ -176,6 +185,7 @@ class PlayWrightFetcher(BaseFetcher):
             cdp_url=cdp_url,
             headless=headless,
             useragent=useragent,
+            real_chrome=real_chrome,
             page_action=page_action,
             hide_canvas=hide_canvas,
             network_idle=network_idle,
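Putting the fetchers.py changes together, a hedged usage sketch of the new flag (illustrative, not part of the diff; assumes Google Chrome is installed locally and that a default-constructed fetcher is acceptable for your setup):

    from scrapling.fetchers import PlayWrightFetcher

    fetcher = PlayWrightFetcher()
    # real_chrome=True makes the engine launch your installed Chrome
    # (channel='chrome') instead of Playwright's bundled Chromium.
    page = fetcher.fetch('https://example.com', real_chrome=True)
    print(page.status, page.reason)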
{scrapling-0.2.4.dist-info → scrapling-0.2.6.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: scrapling
-Version: 0.2.4
+Version: 0.2.6
 Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
 Home-page: https://github.com/D4Vinci/Scrapling
 Author: Karim Shoair
@@ -32,16 +32,16 @@ Classifier: Typing :: Typed
 Requires-Python: >=3.8
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: requests
-Requires-Dist: lxml
-Requires-Dist: cssselect
+Requires-Dist: requests>=2.3
+Requires-Dist: lxml>=4.5
+Requires-Dist: cssselect>=1.2
 Requires-Dist: w3lib
-Requires-Dist: orjson
+Requires-Dist: orjson>=3
 Requires-Dist: tldextract
 Requires-Dist: httpx[brotli,zstd]
-Requires-Dist: playwright
+Requires-Dist: playwright==1.48
 Requires-Dist: rebrowser-playwright
-Requires-Dist: camoufox
+Requires-Dist: camoufox>=0.3.10
 Requires-Dist: browserforge
 
 # 🕷️ Scrapling: Undetectable, Lightning-Fast, and Adaptive Web Scraping for Python
@@ -336,9 +336,11 @@ Using this Fetcher class, you can make requests with:
 * Mimics some of the real browsers' properties by injecting several JS files and using custom options.
 * Using custom flags on launch to hide Playwright even more and make it faster.
 * Generates real browser's headers of the same type and same user OS then append it to the request's headers.
-3) Real browsers by passing the CDP URL of your browser to be controlled by the Fetcher and most of the options can be enabled on it.
+3) Real browsers by passing the `real_chrome` argument or the CDP URL of your browser to be controlled by the Fetcher and most of the options can be enabled on it.
 4) [NSTBrowser](https://app.nstbrowser.io/r/1vO5e5)'s [docker browserless](https://hub.docker.com/r/nstbrowser/browserless) option by passing the CDP URL and enabling `nstbrowser_mode` option.
 
+> Hence using the `real_chrome` argument requires that you have chrome browser installed on your device
+
 Add that to a lot of controlling/hiding options as you will see in the arguments list below.
 
 <details><summary><strong>Expand this for the complete list of arguments</strong></summary>
@@ -360,6 +362,7 @@ Add that to a lot of controlling/hiding options as you will see in the arguments
 | hide_canvas | Add random noise to canvas operations to prevent fingerprinting. | ✔️ |
 | disable_webgl | Disables WebGL and WebGL 2.0 support entirely. | ✔️ |
 | stealth | Enables stealth mode, always check the documentation to see what stealth mode does currently. | ✔️ |
+| real_chrome | If you have chrome browser installed on your device, enable this and the Fetcher will launch an instance of your browser and use it. | ✔️ |
 | cdp_url | Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP. | ✔️ |
 | nstbrowser_mode | Enables NSTBrowser mode, **it have to be used with `cdp_url` argument or it will get completely ignored.** | ✔️ |
 | nstbrowser_config | The config you want to send with requests to the NSTBrowser. _If left empty, Scrapling defaults to an optimized NSTBrowser's docker browserless config._ | ✔️ |
{scrapling-0.2.4.dist-info → scrapling-0.2.6.dist-info}/RECORD
CHANGED
@@ -1,23 +1,23 @@
-scrapling/__init__.py,sha256=
+scrapling/__init__.py,sha256=NnIpEZcBGs5Pu2TjqPCacC7N6LN37SbnniBU1AhgdXs,435
 scrapling/defaults.py,sha256=blYDLiuI5DgDSLRWnUgpp21WtFOsv1BsCRCmPeg8Xc4,287
-scrapling/fetchers.py,sha256
+scrapling/fetchers.py,sha256=-gc-Yo1MjF_4cdJ-5rxZqNC0owxFXTFoEBj08BFEYPs,16361
 scrapling/parser.py,sha256=d2n00uF5i7W5lf0afLNRdk17ZFcNyiF9EzXLRQGA0NM,54111
 scrapling/py.typed,sha256=frcCV1k9oG9oKj3dpUqdJg1PxRT2RSN_XKdLCPjaYaY,2
 scrapling/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 scrapling/core/_types.py,sha256=nD2ZY_fitLohx3MfDmqoKJ9ZShrnRhQ8-d1SU1zEGAY,552
-scrapling/core/custom_types.py,sha256
+scrapling/core/custom_types.py,sha256=ztE_tshJ8i5uKqqSbsN5S6MoIUSfX6SexlhRjAnkclk,8402
 scrapling/core/mixins.py,sha256=sozbpaGL1_O_x3U-ABM5aYWpnxpCLfdbcA9SG3P7weY,3532
 scrapling/core/storage_adaptors.py,sha256=Kbak0BOJX5e9I1PbUS_4sUJi2Wxw8Bv5XsaLHAu1l2Q,6218
-scrapling/core/translator.py,sha256=
+scrapling/core/translator.py,sha256=R97lKGq1SDbx8S8Hg_w_5d4ePgukTHj_hRIKFzWiRuc,5229
 scrapling/core/utils.py,sha256=fXdANUgRBbVbOerJ94fRY9vi7n5zsbm8t3G4qQ-F3ak,3792
 scrapling/engines/__init__.py,sha256=zwMqcSdNGh-IX0d4zXazrgAeHrkqIN_v5Ia7RU1g8W0,267
-scrapling/engines/camo.py,sha256=
+scrapling/engines/camo.py,sha256=dXkdfFmf3M09RXAvaZ8CE5khsblC3Wd7_6jWfu8XO6I,7618
 scrapling/engines/constants.py,sha256=jSDA6lgbvEIB8z2m2SFzCKkvFEZnp28Mondy2__FpkM,3721
-scrapling/engines/pw.py,sha256=
-scrapling/engines/static.py,sha256=
+scrapling/engines/pw.py,sha256=gMWJAZYpJbFK-GiyRrpVrMjyMqSSetE6hf8kmf0zR2o,12729
+scrapling/engines/static.py,sha256=wzBsoOHPpN5JV1izQSSSarPBNWB-wo0BDWNFuin6ID8,7109
 scrapling/engines/toolbelt/__init__.py,sha256=BbxfC0depVOV3i3BnBnyfjHtLcZrDbhz6c5rTRczZUc,383
-scrapling/engines/toolbelt/custom.py,sha256=
-scrapling/engines/toolbelt/fingerprints.py,sha256=
+scrapling/engines/toolbelt/custom.py,sha256=ELr3_FwUqNI27E98kz-50OA5a6hQQtoIYrZoLKsvUpM,12551
+scrapling/engines/toolbelt/fingerprints.py,sha256=T9HQejHzAnHsD5EIXvrYVC5siiG5q2gOOXVIIANmzMc,2917
 scrapling/engines/toolbelt/navigation.py,sha256=Tde5_6Wv7lOeWXMzs9D6TRaxAbJ3b-zIX6-4HggZbCQ,4017
 scrapling/engines/toolbelt/bypasses/navigator_plugins.js,sha256=tbnnk3nCXB6QEQnOhDlu3n-s7lnUTAkrUsjP6FDQIQg,2104
 scrapling/engines/toolbelt/bypasses/notification_permission.js,sha256=poPM3o5WYgEX-EdiUfDCllpWfc3Umvw4jr2u6O6elus,237
@@ -28,14 +28,15 @@ scrapling/engines/toolbelt/bypasses/webdriver_fully.js,sha256=hdJw4clRAJQqIdq5gI
 scrapling/engines/toolbelt/bypasses/window_chrome.js,sha256=D7hqzNGGDorh8JVlvm2YIv7Bk2CoVkG55MDIdyqhT1w,6808
 tests/__init__.py,sha256=YHFB5ftzgLQVh6gbPfbYcY4yOS9DOBp5dBa6I-qtm8U,32
 tests/fetchers/__init__.py,sha256=6H4NgARhyTcGGd3dNCKQJ8kUFdrAEMSScQL7Ga_vU3c,43
-tests/fetchers/test_camoufox.py,sha256=
+tests/fetchers/test_camoufox.py,sha256=53piGA5uuPvOx5BeUEA0bbizYihwHGxehnj5uqCr6Q0,3115
 tests/fetchers/test_httpx.py,sha256=UivOItR3-l-bXp9E6TP5Tvn2OrCdgiVkWsti-f9xdpU,3507
-tests/fetchers/test_playwright.py,sha256=
+tests/fetchers/test_playwright.py,sha256=7qwbIU2SwjiQEbaGPA_MBo6kAXM4IBmfvy5kUvKT11M,3701
+tests/fetchers/test_utils.py,sha256=FPPJkBrqgYxdGeWwapH8Vj8zyfYVLiTE1qSLu8eBWik,5728
 tests/parser/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/parser/test_automatch.py,sha256=BeeYJi3cYCghbiZmi57z4bqcGPaoUA8GAm7MALBBkkk,2486
-tests/parser/test_general.py,sha256=
-scrapling-0.2.4.dist-info/LICENSE,sha256=
-scrapling-0.2.4.dist-info/METADATA,sha256=
-scrapling-0.2.4.dist-info/WHEEL,sha256=
-scrapling-0.2.4.dist-info/top_level.txt,sha256=
-scrapling-0.2.4.dist-info/RECORD,,
+tests/parser/test_general.py,sha256=qaiVzpvqESfdXYFat6QrpnMkevPYgCzIcTZK5FwdC0s,11783
+scrapling-0.2.6.dist-info/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
+scrapling-0.2.6.dist-info/METADATA,sha256=cFOu2nlkXDsjyjkIt9kDu1nKKvS14xYH2LT4_VNH5j0,65362
+scrapling-0.2.6.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+scrapling-0.2.6.dist-info/top_level.txt,sha256=ub7FkOEXeYmmYTUxd4pCrwXfBfAMIpZ1sCGmXCc14tI,16
+scrapling-0.2.6.dist-info/RECORD,,
tests/fetchers/test_camoufox.py
CHANGED
@@ -36,6 +36,7 @@ class TestStealthyFetcher(unittest.TestCase):
     def test_waiting_selector(self):
         """Test if waiting for a selector make page does not finish loading or not"""
         self.assertEqual(self.fetcher.fetch(self.html_url, wait_selector='h1').status, 200)
+        self.assertEqual(self.fetcher.fetch(self.html_url, wait_selector='h1', wait_selector_state='visible').status, 200)
 
     def test_cookies_loading(self):
         """Test if cookies are set after the request"""
@@ -56,6 +57,7 @@ class TestStealthyFetcher(unittest.TestCase):
         self.assertEqual(self.fetcher.fetch(self.html_url, block_webrtc=True, allow_webgl=True).status, 200)
         self.assertEqual(self.fetcher.fetch(self.html_url, block_webrtc=False, allow_webgl=True).status, 200)
         self.assertEqual(self.fetcher.fetch(self.html_url, block_webrtc=True, allow_webgl=False).status, 200)
+        self.assertEqual(self.fetcher.fetch(self.html_url, extra_headers={'ayo': ''}, os_randomize=True).status, 200)
 
     def test_infinite_timeout(self):
         """Test if infinite timeout breaks the code or not"""
tests/fetchers/test_playwright.py
CHANGED
@@ -35,6 +35,7 @@ class TestPlayWrightFetcher(unittest.TestCase):
     def test_waiting_selector(self):
         """Test if waiting for a selector make page does not finish loading or not"""
         self.assertEqual(self.fetcher.fetch(self.html_url, wait_selector='h1').status, 200)
+        self.assertEqual(self.fetcher.fetch(self.html_url, wait_selector='h1', wait_selector_state='visible').status, 200)
 
     def test_cookies_loading(self):
         """Test if cookies are set after the request"""
@@ -56,6 +57,7 @@ class TestPlayWrightFetcher(unittest.TestCase):
         self.assertEqual(self.fetcher.fetch(self.html_url, disable_webgl=False, hide_canvas=True).status, 200)
         self.assertEqual(self.fetcher.fetch(self.html_url, stealth=True).status, 200)
         self.assertEqual(self.fetcher.fetch(self.html_url, useragent='Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:131.0) Gecko/20100101 Firefox/131.0').status, 200)
+        self.assertEqual(self.fetcher.fetch(self.html_url, extra_headers={'ayo': ''}).status, 200)
 
     def test_cdp_url(self):
         """Test if it's going to try to connect to cdp url or not"""
tests/fetchers/test_utils.py
ADDED
@@ -0,0 +1,129 @@
+import unittest
+
+from scrapling.engines.toolbelt.custom import ResponseEncoding, StatusText
+
+
+class TestPlayWrightFetcher(unittest.TestCase):
+    def setUp(self):
+        self.content_type_map = {
+            # A map generated by ChatGPT for most possible `content_type` values and the expected outcome
+            'text/html; charset=UTF-8': 'UTF-8',
+            'text/html; charset=ISO-8859-1': 'ISO-8859-1',
+            'text/html': 'ISO-8859-1',
+            'application/json; charset=UTF-8': 'UTF-8',
+            'application/json': 'utf-8',
+            'text/json': 'utf-8',
+            'application/javascript; charset=UTF-8': 'UTF-8',
+            'application/javascript': 'utf-8',
+            'text/plain; charset=UTF-8': 'UTF-8',
+            'text/plain; charset=ISO-8859-1': 'ISO-8859-1',
+            'text/plain': 'ISO-8859-1',
+            'application/xhtml+xml; charset=UTF-8': 'UTF-8',
+            'application/xhtml+xml': 'utf-8',
+            'text/html; charset=windows-1252': 'windows-1252',
+            'application/json; charset=windows-1252': 'windows-1252',
+            'text/plain; charset=windows-1252': 'windows-1252',
+            'text/html; charset="UTF-8"': 'UTF-8',
+            'text/html; charset="ISO-8859-1"': 'ISO-8859-1',
+            'text/html; charset="windows-1252"': 'windows-1252',
+            'application/json; charset="UTF-8"': 'UTF-8',
+            'application/json; charset="ISO-8859-1"': 'ISO-8859-1',
+            'application/json; charset="windows-1252"': 'windows-1252',
+            'text/json; charset="UTF-8"': 'UTF-8',
+            'application/javascript; charset="UTF-8"': 'UTF-8',
+            'application/javascript; charset="ISO-8859-1"': 'ISO-8859-1',
+            'text/plain; charset="UTF-8"': 'UTF-8',
+            'text/plain; charset="ISO-8859-1"': 'ISO-8859-1',
+            'text/plain; charset="windows-1252"': 'windows-1252',
+            'application/xhtml+xml; charset="UTF-8"': 'UTF-8',
+            'application/xhtml+xml; charset="ISO-8859-1"': 'ISO-8859-1',
+            'application/xhtml+xml; charset="windows-1252"': 'windows-1252',
+            'text/html; charset="US-ASCII"': 'US-ASCII',
+            'application/json; charset="US-ASCII"': 'US-ASCII',
+            'text/plain; charset="US-ASCII"': 'US-ASCII',
+            'text/html; charset="Shift_JIS"': 'Shift_JIS',
+            'application/json; charset="Shift_JIS"': 'Shift_JIS',
+            'text/plain; charset="Shift_JIS"': 'Shift_JIS',
+            'application/xml; charset="UTF-8"': 'UTF-8',
+            'application/xml; charset="ISO-8859-1"': 'ISO-8859-1',
+            'application/xml': 'utf-8',
+            'text/xml; charset="UTF-8"': 'UTF-8',
+            'text/xml; charset="ISO-8859-1"': 'ISO-8859-1',
+            'text/xml': 'utf-8'
+        }
+        self.status_map = {
+            100: "Continue",
+            101: "Switching Protocols",
+            102: "Processing",
+            103: "Early Hints",
+            200: "OK",
+            201: "Created",
+            202: "Accepted",
+            203: "Non-Authoritative Information",
+            204: "No Content",
+            205: "Reset Content",
+            206: "Partial Content",
+            207: "Multi-Status",
+            208: "Already Reported",
+            226: "IM Used",
+            300: "Multiple Choices",
+            301: "Moved Permanently",
+            302: "Found",
+            303: "See Other",
+            304: "Not Modified",
+            305: "Use Proxy",
+            307: "Temporary Redirect",
+            308: "Permanent Redirect",
+            400: "Bad Request",
+            401: "Unauthorized",
+            402: "Payment Required",
+            403: "Forbidden",
+            404: "Not Found",
+            405: "Method Not Allowed",
+            406: "Not Acceptable",
+            407: "Proxy Authentication Required",
+            408: "Request Timeout",
+            409: "Conflict",
+            410: "Gone",
+            411: "Length Required",
+            412: "Precondition Failed",
+            413: "Payload Too Large",
+            414: "URI Too Long",
+            415: "Unsupported Media Type",
+            416: "Range Not Satisfiable",
+            417: "Expectation Failed",
+            418: "I'm a teapot",
+            421: "Misdirected Request",
+            422: "Unprocessable Entity",
+            423: "Locked",
+            424: "Failed Dependency",
+            425: "Too Early",
+            426: "Upgrade Required",
+            428: "Precondition Required",
+            429: "Too Many Requests",
+            431: "Request Header Fields Too Large",
+            451: "Unavailable For Legal Reasons",
+            500: "Internal Server Error",
+            501: "Not Implemented",
+            502: "Bad Gateway",
+            503: "Service Unavailable",
+            504: "Gateway Timeout",
+            505: "HTTP Version Not Supported",
+            506: "Variant Also Negotiates",
+            507: "Insufficient Storage",
+            508: "Loop Detected",
+            510: "Not Extended",
+            511: "Network Authentication Required"
+        }
+
+    def test_parsing_content_type(self):
+        """Test if parsing different types of content-type returns the expected result"""
+        for header_value, expected_encoding in self.content_type_map.items():
+            self.assertEqual(ResponseEncoding.get_value(header_value), expected_encoding)
+
+    def test_parsing_response_status(self):
+        """Test if using different http responses' status codes returns the expected result"""
+        for status_code, expected_status_text in self.status_map.items():
+            self.assertEqual(StatusText.get(status_code), expected_status_text)
+
+        self.assertEqual(StatusText.get(1000), "Unknown Status Code")
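The new test module doubles as documentation for both helpers. A small driver that spot-checks a few of the expectations above and then runs the suite (illustrative, not part of the diff; assumes the repository root is on sys.path):

    import unittest

    from scrapling.engines.toolbelt.custom import ResponseEncoding, StatusText

    # Spot-check a few expectations encoded in the tests above
    assert ResponseEncoding.get_value('text/html; charset="Shift_JIS"') == 'Shift_JIS'
    assert StatusText.get(418) == "I'm a teapot"
    assert StatusText.get(1000) == "Unknown Status Code"  # unknown codes fall back

    suite = unittest.defaultTestLoader.loadTestsFromName('tests.fetchers.test_utils')
    unittest.TextTestRunner(verbosity=2).run(suite)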
tests/parser/test_general.py
CHANGED
@@ -278,7 +278,7 @@ class TestParser(unittest.TestCase):
         self.assertEqual(len(elements), 5000)
         # Converting 5000 elements to a class and doing operations on them will take time
         # Based on my tests with 100 runs, 1 loop each Scrapling (given the extra work/features) takes 10.4ms on average
-        self.assertLess(end_time - start_time, 0.1)
+        self.assertLess(end_time - start_time, 0.5)  # Locally I test on 0.1 but on GitHub actions with browsers and threading sometimes closing adds fractions of seconds
 
 
 # Use `coverage run -m unittest --verbose tests/test_parser_functions.py` instead for the coverage report

{scrapling-0.2.4.dist-info → scrapling-0.2.6.dist-info}/LICENSE
File without changes

{scrapling-0.2.4.dist-info → scrapling-0.2.6.dist-info}/top_level.txt
File without changes