scrapling 0.2.6__py3-none-any.whl → 0.2.7__py3-none-any.whl
- scrapling/__init__.py +1 -1
- scrapling/engines/camo.py +12 -1
- scrapling/engines/constants.py +1 -1
- scrapling/engines/pw.py +21 -3
- scrapling/engines/static.py +20 -8
- scrapling/engines/toolbelt/custom.py +12 -8
- scrapling/fetchers.py +18 -10
- {scrapling-0.2.6.dist-info → scrapling-0.2.7.dist-info}/METADATA +16 -9
- {scrapling-0.2.6.dist-info → scrapling-0.2.7.dist-info}/RECORD +12 -12
- {scrapling-0.2.6.dist-info → scrapling-0.2.7.dist-info}/LICENSE +0 -0
- {scrapling-0.2.6.dist-info → scrapling-0.2.7.dist-info}/WHEEL +0 -0
- {scrapling-0.2.6.dist-info → scrapling-0.2.7.dist-info}/top_level.txt +0 -0
scrapling/__init__.py
CHANGED
@@ -4,7 +4,7 @@ from scrapling.parser import Adaptor, Adaptors
 from scrapling.core.custom_types import TextHandler, AttributesHandler
 
 __author__ = "Karim Shoair (karim.shoair@pm.me)"
-__version__ = "0.2.6"
+__version__ = "0.2.7"
 __copyright__ = "Copyright (c) 2024 Karim Shoair"
 
 
scrapling/engines/camo.py
CHANGED
@@ -12,6 +12,7 @@ from scrapling.engines.toolbelt import (
     generate_convincing_referer,
 )
 
+from camoufox import DefaultAddons
 from camoufox.sync_api import Camoufox
 
 
@@ -21,7 +22,8 @@ class CamoufoxEngine:
             block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = False, network_idle: Optional[bool] = False, humanize: Optional[Union[bool, float]] = True,
             timeout: Optional[float] = 30000, page_action: Callable = do_nothing, wait_selector: Optional[str] = None, addons: Optional[List[str]] = None,
             wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
-            proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None,
+            proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True,
+            adaptor_arguments: Dict = None,
     ):
         """An engine that utilizes Camoufox library, check the `StealthyFetcher` class for more documentation.
 
@@ -36,6 +38,7 @@ class CamoufoxEngine:
         :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
         :param allow_webgl: Whether to allow WebGL. To prevent leaks, only use this for special cases.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+        :param disable_ads: Enabled by default, this installs the `uBlock Origin` addon on the browser.
         :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
         :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
         :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
@@ -54,6 +57,7 @@ class CamoufoxEngine:
         self.network_idle = bool(network_idle)
         self.google_search = bool(google_search)
         self.os_randomize = bool(os_randomize)
+        self.disable_ads = bool(disable_ads)
         self.extra_headers = extra_headers or {}
         self.proxy = construct_proxy_dict(proxy)
         self.addons = addons or []
@@ -75,9 +79,11 @@ class CamoufoxEngine:
         :param url: Target url.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
+        addons = [] if self.disable_ads else [DefaultAddons.UBO]
        with Camoufox(
            proxy=self.proxy,
            addons=self.addons,
+           exclude_addons=addons,
            headless=self.headless,
            humanize=self.humanize,
            i_know_what_im_doing=True,  # To turn warnings off with the user configurations
@@ -105,6 +111,11 @@ class CamoufoxEngine:
             if self.wait_selector and type(self.wait_selector) is str:
                 waiter = page.locator(self.wait_selector)
                 waiter.first.wait_for(state=self.wait_selector_state)
+                # Wait again after waiting for the selector, helpful with protections like Cloudflare
+                page.wait_for_load_state(state="load")
+                page.wait_for_load_state(state="domcontentloaded")
+                if self.network_idle:
+                    page.wait_for_load_state('networkidle')
 
             # This will be parsed inside `Response`
             encoding = res.headers.get('content-type', '') or 'utf-8'  # default encoding
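Net effect of these hunks: `disable_ads=True` (the new default) leaves Camoufox's bundled uBlock Origin addon installed, while `disable_ads=False` adds it to `exclude_addons` so it never loads. A minimal sketch of the same switch outside the engine, assuming `camoufox` is installed and using `example.com` as a stand-in target:

```python
from camoufox import DefaultAddons
from camoufox.sync_api import Camoufox

disable_ads = True  # the new 0.2.7 default

# Excluding nothing keeps the bundled uBlock Origin addon installed;
# excluding DefaultAddons.UBO launches the browser without it.
exclude = [] if disable_ads else [DefaultAddons.UBO]

with Camoufox(headless=True, exclude_addons=exclude) as browser:
    page = browser.new_page()
    page.goto('https://example.com')
    print(page.title())
```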
scrapling/engines/constants.py
CHANGED
@@ -44,7 +44,7 @@ DEFAULT_STEALTH_FLAGS = [
     '--disable-default-apps',
     '--disable-print-preview',
     '--disable-dev-shm-usage',
-    '--disable-popup-blocking',
+    # '--disable-popup-blocking',
     '--metrics-recording-only',
     '--disable-crash-reporter',
     '--disable-partial-raster',
scrapling/engines/pw.py
CHANGED
@@ -26,6 +26,7 @@ class PlaywrightEngine:
             timeout: Optional[float] = 30000,
             page_action: Callable = do_nothing,
             wait_selector: Optional[str] = None,
+            locale: Optional[str] = 'en-US',
             wait_selector_state: Optional[str] = 'attached',
             stealth: Optional[bool] = False,
             real_chrome: Optional[bool] = False,
@@ -50,6 +51,7 @@ class PlaywrightEngine:
         :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
         :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
         :param wait_selector: Wait for a specific css selector to be in a specific state.
+        :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
         :param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`.
         :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
         :param real_chrome: If you have chrome browser installed on your device, enable this and the Fetcher will launch an instance of your browser and use it.
@@ -64,6 +66,7 @@ class PlaywrightEngine:
         :param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
         """
         self.headless = headless
+        self.locale = check_type_validity(locale, [str], 'en-US', param_name='locale')
         self.disable_resources = disable_resources
         self.network_idle = bool(network_idle)
         self.stealth = bool(stealth)
@@ -87,6 +90,14 @@ class PlaywrightEngine:
         self.nstbrowser_mode = bool(nstbrowser_mode)
         self.nstbrowser_config = nstbrowser_config
         self.adaptor_arguments = adaptor_arguments if adaptor_arguments else {}
+        self.harmful_default_args = [
+            # This will be ignored to avoid detection more and possibly avoid the popup crashing bug abuse: https://issues.chromium.org/issues/340836884
+            '--enable-automation',
+            '--disable-popup-blocking',
+            # '--disable-component-update',
+            # '--disable-default-apps',
+            # '--disable-extensions',
+        ]
 
     def _cdp_url_logic(self, flags: Optional[List] = None) -> str:
         """Constructs new CDP URL if NSTBrowser is enabled otherwise return CDP URL as it is
@@ -151,15 +162,15 @@ class PlaywrightEngine:
         else:
             if self.stealth:
                 browser = p.chromium.launch(
-                    headless=self.headless, args=flags, ignore_default_args=
+                    headless=self.headless, args=flags, ignore_default_args=self.harmful_default_args, chromium_sandbox=True, channel='chrome' if self.real_chrome else 'chromium'
                 )
             else:
-                browser = p.chromium.launch(headless=self.headless, ignore_default_args=
+                browser = p.chromium.launch(headless=self.headless, ignore_default_args=self.harmful_default_args, channel='chrome' if self.real_chrome else 'chromium')
 
         # Creating the context
         if self.stealth:
             context = browser.new_context(
-                locale=
+                locale=self.locale,
                 is_mobile=False,
                 has_touch=False,
                 proxy=self.proxy,
@@ -176,6 +187,8 @@ class PlaywrightEngine:
             )
         else:
             context = browser.new_context(
+                locale=self.locale,
+                proxy=self.proxy,
                 color_scheme='dark',
                 user_agent=useragent,
                 device_scale_factor=2,
@@ -221,6 +234,11 @@ class PlaywrightEngine:
             if self.wait_selector and type(self.wait_selector) is str:
                 waiter = page.locator(self.wait_selector)
                 waiter.first.wait_for(state=self.wait_selector_state)
+                # Wait again after waiting for the selector, helpful with protections like Cloudflare
+                page.wait_for_load_state(state="load")
+                page.wait_for_load_state(state="domcontentloaded")
+                if self.network_idle:
+                    page.wait_for_load_state('networkidle')
 
             # This will be parsed inside `Response`
             encoding = res.headers.get('content-type', '') or 'utf-8'  # default encoding
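These pw.py changes amount to three knobs on a plain Playwright session: detectable default switches stripped via `ignore_default_args`, a context `locale`, and an extra settle after the selector wait. A rough sketch with vanilla Playwright (not the engine itself), assuming chromium is installed and `example.com`/`h1` are placeholders:

```python
from playwright.sync_api import sync_playwright

# Mirrors `harmful_default_args`: dropping these default switches makes the
# browser less detectable and sidesteps the popup crash bug linked above.
harmful_default_args = ['--enable-automation', '--disable-popup-blocking']

with sync_playwright() as p:
    browser = p.chromium.launch(headless=True, ignore_default_args=harmful_default_args)
    context = browser.new_context(locale='en-US')  # the new `locale` argument
    page = context.new_page()
    page.goto('https://example.com')
    page.wait_for_selector('h1', state='attached')
    # The new post-selector settle, useful when a challenge page swaps in content:
    page.wait_for_load_state(state='load')
    page.wait_for_load_state(state='domcontentloaded')
    browser.close()
```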
scrapling/engines/static.py
CHANGED
@@ -63,54 +63,66 @@ class StaticEngine:
             **self.adaptor_arguments
         )
 
-    def get(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+    def get(self, url: str, proxy: Optional[str] = None, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
         """Make basic HTTP GET request for you but with some added flavors.
 
         :param url: Target url.
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
             create a referer header as if this request came from Google's search of this URL's domain.
+        :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
         headers = self._headers_job(kwargs.pop('headers', {}), url, stealthy_headers)
-
+        with httpx.Client(proxy=proxy) as client:
+            request = client.get(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+
         return self._prepare_response(request)
 
-    def post(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+    def post(self, url: str, proxy: Optional[str] = None, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
         """Make basic HTTP POST request for you but with some added flavors.
 
         :param url: Target url.
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
             create a referer header as if this request came from Google's search of this URL's domain.
+        :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
         headers = self._headers_job(kwargs.pop('headers', {}), url, stealthy_headers)
-
+        with httpx.Client(proxy=proxy) as client:
+            request = client.post(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+
         return self._prepare_response(request)
 
-    def delete(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+    def delete(self, url: str, proxy: Optional[str] = None, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
         """Make basic HTTP DELETE request for you but with some added flavors.
 
         :param url: Target url.
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
             create a referer header as if this request came from Google's search of this URL's domain.
+        :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
         headers = self._headers_job(kwargs.pop('headers', {}), url, stealthy_headers)
-
+        with httpx.Client(proxy=proxy) as client:
+            request = client.delete(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+
        return self._prepare_response(request)
 
-    def put(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+    def put(self, url: str, proxy: Optional[str] = None, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
         """Make basic HTTP PUT request for you but with some added flavors.
 
         :param url: Target url.
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
             create a referer header as if this request came from Google's search of this URL's domain.
+        :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
         headers = self._headers_job(kwargs.pop('headers', {}), url, stealthy_headers)
-
+        with httpx.Client(proxy=proxy) as client:
+            request = client.put(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+
         return self._prepare_response(request)
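All four verbs now open a short-lived `httpx.Client` so the proxy applies uniformly; passing `proxy=None` simply means a direct connection, so one code path serves both cases. A minimal standalone equivalent, assuming an httpx version that accepts the singular `proxy` keyword (0.26+) and placeholder credentials:

```python
import httpx

proxy = 'http://username:password@localhost:8030'  # placeholder proxy string

# The client tunnels both HTTP and HTTPS traffic through the given proxy.
with httpx.Client(proxy=proxy) as client:
    response = client.get('https://httpbin.org/get', follow_redirects=True, timeout=10)
    print(response.status_code)
```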
scrapling/engines/toolbelt/custom.py
CHANGED
@@ -39,7 +39,7 @@ class ResponseEncoding:
 
     @classmethod
     @cache(maxsize=None)
-    def get_value(cls, content_type: Optional[str]) -> str:
+    def get_value(cls, content_type: Optional[str], text: Optional[str] = 'test') -> str:
         """Determine the appropriate character encoding from a content-type header.
 
         The encoding is determined by these rules in order:
@@ -50,26 +50,30 @@ class ResponseEncoding:
         5. Default to UTF-8 if nothing else matches
 
         :param content_type: Content-Type header value or None
+        :param text: A text to test the encoding against
         :return: String naming the character encoding
         """
         if not content_type:
             return cls.__DEFAULT_ENCODING
 
         try:
+            encoding = None
             content_type, params = cls.__parse_content_type(content_type)
 
             # First check for explicit charset parameter
             if "charset" in params:
                 encoding = params["charset"].strip("'\"")
-                "test".encode(encoding)  # Validate encoding
-                return encoding
 
             # Apply content-type specific rules
-
-
+            elif content_type in cls.__ISO_8859_1_CONTENT_TYPES:
+                encoding = "ISO-8859-1"
+
+            elif content_type == "application/json":
+                encoding = cls.__DEFAULT_ENCODING
 
-            if
-
+            if encoding:
+                _ = text.encode(encoding)  # Validate the encoding and that it can encode the given text
+                return encoding
 
         return cls.__DEFAULT_ENCODING
 
@@ -87,7 +91,7 @@ class Response(Adaptor):
         self.cookies = cookies
         self.headers = headers
         self.request_headers = request_headers
-        encoding = ResponseEncoding.get_value(encoding)
+        encoding = ResponseEncoding.get_value(encoding, text)
         super().__init__(text=text, body=body, url=automatch_domain or url, encoding=encoding, **adaptor_arguments)
         # For backward compatibility
         self.adaptor = self
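The reworked `get_value` defers validation until a candidate encoding is chosen, and now validates against the actual page text instead of the literal `"test"`, so a declared charset that cannot represent the body falls back to UTF-8. A standalone sketch of that resolution order (the content-type set and header parsing here are illustrative, not scrapling's private attributes):

```python
ISO_8859_1_CONTENT_TYPES = {'text/plain', 'text/html'}  # illustrative subset
DEFAULT_ENCODING = 'utf-8'

def resolve_encoding(content_type: str, text: str = 'test') -> str:
    mime, _, charset = [p.strip() for p in content_type.partition('charset=')]
    mime = mime.rstrip(';').strip()
    encoding = None
    if charset:                                # 1. explicit charset parameter
        encoding = charset.strip('\'"')
    elif mime in ISO_8859_1_CONTENT_TYPES:     # 2. content-type specific rule
        encoding = 'ISO-8859-1'
    elif mime == 'application/json':           # 3. JSON is always UTF-8
        encoding = DEFAULT_ENCODING
    try:
        if encoding:
            text.encode(encoding)  # reject codecs that can't encode this page's text
            return encoding
    except (UnicodeEncodeError, LookupError):
        pass
    return DEFAULT_ENCODING                    # 4. fallback

print(resolve_encoding('text/html; charset=ascii', 'café'))  # -> utf-8 (é isn't ASCII-encodable)
print(resolve_encoding('application/json'))                  # -> utf-8
```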
scrapling/fetchers.py
CHANGED
@@ -9,7 +9,7 @@ class Fetcher(BaseFetcher):
 
     Any additional keyword arguments passed to the methods below are passed to the respective httpx's method directly.
     """
-    def get(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+    def get(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, proxy: Optional[str] = None, **kwargs: Dict) -> Response:
         """Make basic HTTP GET request for you but with some added flavors.
 
         :param url: Target url.
@@ -17,13 +17,14 @@ class Fetcher(BaseFetcher):
         :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
             create a referer header as if this request came from Google's search of this URL's domain.
+        :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-        response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).get(url, stealthy_headers, **kwargs)
+        response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).get(url, proxy, stealthy_headers, **kwargs)
         return response_object
 
-    def post(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+    def post(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, proxy: Optional[str] = None, **kwargs: Dict) -> Response:
         """Make basic HTTP POST request for you but with some added flavors.
 
         :param url: Target url.
@@ -31,13 +32,14 @@ class Fetcher(BaseFetcher):
         :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
             create a referer header as if this request came from Google's search of this URL's domain.
+        :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-        response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).post(url, stealthy_headers, **kwargs)
+        response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).post(url, proxy, stealthy_headers, **kwargs)
         return response_object
 
-    def put(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+    def put(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, proxy: Optional[str] = None, **kwargs: Dict) -> Response:
         """Make basic HTTP PUT request for you but with some added flavors.
 
         :param url: Target url
@@ -45,14 +47,15 @@ class Fetcher(BaseFetcher):
         :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
             create a referer header as if this request came from Google's search of this URL's domain.
+        :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
 
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-        response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).put(url, stealthy_headers, **kwargs)
+        response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).put(url, proxy, stealthy_headers, **kwargs)
         return response_object
 
-    def delete(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+    def delete(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, proxy: Optional[str] = None, **kwargs: Dict) -> Response:
         """Make basic HTTP DELETE request for you but with some added flavors.
 
         :param url: Target url
@@ -60,10 +63,11 @@ class Fetcher(BaseFetcher):
         :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
             create a referer header as if this request came from Google's search of this URL's domain.
+        :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-        response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).delete(url, stealthy_headers, **kwargs)
+        response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).delete(url, proxy, stealthy_headers, **kwargs)
         return response_object
 
 
@@ -78,7 +82,7 @@ class StealthyFetcher(BaseFetcher):
             block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = False, network_idle: Optional[bool] = False, addons: Optional[List[str]] = None,
             timeout: Optional[float] = 30000, page_action: Callable = do_nothing, wait_selector: Optional[str] = None, humanize: Optional[Union[bool, float]] = True,
             wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None, proxy: Optional[Union[str, Dict[str, str]]] = None,
-            os_randomize: Optional[bool] = None
+            os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True,
     ) -> Response:
         """
        Opens up a browser and does your request based on your chosen options below.
@@ -92,6 +96,7 @@ class StealthyFetcher(BaseFetcher):
             This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
         :param block_webrtc: Blocks WebRTC entirely.
         :param addons: List of Firefox addons to use. Must be paths to extracted addons.
+        :param disable_ads: Enabled by default, this installs the `uBlock Origin` addon on the browser.
         :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
         :param allow_webgl: Whether to allow WebGL. To prevent leaks, only use this for special cases.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
@@ -111,6 +116,7 @@ class StealthyFetcher(BaseFetcher):
             timeout=timeout,
             headless=headless,
             humanize=humanize,
+            disable_ads=disable_ads,
             allow_webgl=allow_webgl,
             page_action=page_action,
             network_idle=network_idle,
@@ -148,7 +154,7 @@ class PlayWrightFetcher(BaseFetcher):
             useragent: Optional[str] = None, network_idle: Optional[bool] = False, timeout: Optional[float] = 30000,
             page_action: Optional[Callable] = do_nothing, wait_selector: Optional[str] = None, wait_selector_state: Optional[str] = 'attached',
             hide_canvas: Optional[bool] = False, disable_webgl: Optional[bool] = False, extra_headers: Optional[Dict[str, str]] = None, google_search: Optional[bool] = True,
-            proxy: Optional[Union[str, Dict[str, str]]] = None,
+            proxy: Optional[Union[str, Dict[str, str]]] = None, locale: Optional[str] = 'en-US',
             stealth: Optional[bool] = False, real_chrome: Optional[bool] = False,
             cdp_url: Optional[str] = None,
             nstbrowser_mode: Optional[bool] = False, nstbrowser_config: Optional[Dict] = None,
@@ -163,6 +169,7 @@ class PlayWrightFetcher(BaseFetcher):
         :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
         :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
+        :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
         :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
         :param wait_selector: Wait for a specific css selector to be in a specific state.
         :param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`.
@@ -180,6 +187,7 @@ class PlayWrightFetcher(BaseFetcher):
         """
         engine = PlaywrightEngine(
             proxy=proxy,
+            locale=locale,
             timeout=timeout,
             stealth=stealth,
             cdp_url=cdp_url,
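Taken together, the public surface gains three knobs in this release. A hedged usage sketch (URLs and the proxy string are placeholders):

```python
from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher

# Route a plain HTTP request through a proxy (new `proxy` parameter):
page = Fetcher().get('https://httpbin.org/get', proxy='http://username:password@localhost:8030')

# Keep uBlock Origin installed in the stealth browser (new `disable_ads`, on by default):
page = StealthyFetcher().fetch('https://example.com', disable_ads=True)

# Pin the browser context's locale (new `locale` parameter):
page = PlayWrightFetcher().fetch('https://example.com', locale='en-US')
```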
{scrapling-0.2.6.dist-info → scrapling-0.2.7.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: scrapling
-Version: 0.2.6
+Version: 0.2.7
 Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
 Home-page: https://github.com/D4Vinci/Scrapling
 Author: Karim Shoair
@@ -90,10 +90,11 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha
 * [Text Extraction Speed Test (5000 nested elements).](#text-extraction-speed-test-5000-nested-elements)
 * [Extraction By Text Speed Test](#extraction-by-text-speed-test)
 * [Installation](#installation)
-* [Fetching Websites
-* [
-* [
-* [
+* [Fetching Websites](#fetching-websites)
+* [Features](#features)
+* [Fetcher class](#fetcher)
+* [StealthyFetcher class](#stealthyfetcher)
+* [PlayWrightFetcher class](#playwrightfetcher)
 * [Advanced Parsing Features](#advanced-parsing-features)
 * [Smart Navigation](#smart-navigation)
 * [Content-based Selection & Finding Similar Elements](#content-based-selection--finding-similar-elements)
@@ -256,7 +257,10 @@ playwright install chromium
 python -m browserforge update
 ```
 
-## Fetching Websites
+## Fetching Websites
+Fetchers are basically interfaces that do requests or fetch pages for you in a single-request fashion and then return an `Adaptor` object. This feature was introduced because previously the only option was to fetch the page however you like, then pass it manually to the `Adaptor` class to create an `Adaptor` instance and start playing around with the page.
+
+### Features
 You might be a little bit confused by now so let me clear things up. All fetcher-type classes are imported in the same way
 ```python
 from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
@@ -279,9 +283,11 @@ Also, the `Response` object returned from all fetchers is the same as `Adaptor`
 This class is built on top of [httpx](https://www.python-httpx.org/) with additional configuration options, here you can do `GET`, `POST`, `PUT`, and `DELETE` requests.
 
 For all methods, you have `stealthy_headers` which makes `Fetcher` create and use real browser's headers, then create a referer header as if this request came from Google's search of this URL's domain. It's enabled by default.
+
+You can route all traffic (HTTP and HTTPS) to a proxy for any of these methods in this format: `http://username:password@localhost:8030`
 ```python
 >> page = Fetcher().get('https://httpbin.org/get', stealthy_headers=True, follow_redirects=True)
->> page = Fetcher().post('https://httpbin.org/post', data={'key': 'value'})
+>> page = Fetcher().post('https://httpbin.org/post', data={'key': 'value'}, proxy='http://username:password@localhost:8030')
 >> page = Fetcher().put('https://httpbin.org/put', data={'key': 'value'})
 >> page = Fetcher().delete('https://httpbin.org/delete')
 ```
@@ -309,6 +315,7 @@ True
 | addons | List of Firefox addons to use. **Must be paths to extracted addons.** | ✔️ |
 | humanize | Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window. | ✔️ |
 | allow_webgl | Whether to allow WebGL. To prevent leaks, only use this for special cases. | ✔️ |
+| disable_ads | Enabled by default, this installs the `uBlock Origin` addon on the browser. | ✔️ |
 | network_idle | Wait for the page until there are no network connections for at least 500 ms. | ✔️ |
 | timeout | The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000. | ✔️ |
 | wait_selector | Wait for a specific css selector to be in a specific state. | ✔️ |
@@ -363,6 +370,7 @@ Add that to a lot of controlling/hiding options as you will see in the arguments
 | disable_webgl | Disables WebGL and WebGL 2.0 support entirely. | ✔️ |
 | stealth | Enables stealth mode, always check the documentation to see what stealth mode does currently. | ✔️ |
 | real_chrome | If you have chrome browser installed on your device, enable this and the Fetcher will launch an instance of your browser and use it. | ✔️ |
+| locale | Set the locale for the browser if wanted. The default value is `en-US`. | ✔️ |
 | cdp_url | Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP. | ✔️ |
 | nstbrowser_mode | Enables NSTBrowser mode, **it has to be used with the `cdp_url` argument or it will get completely ignored.** | ✔️ |
 | nstbrowser_config | The config you want to send with requests to the NSTBrowser. _If left empty, Scrapling defaults to an optimized NSTBrowser's docker browserless config._ | ✔️ |
@@ -814,8 +822,7 @@ Of course, you can find elements by text/regex, find similar elements in a more
 Yes, Scrapling instances are thread-safe. Each Adaptor instance maintains its state.
 
 ## More Sponsors!
-
-<a href="https://serpapi.com/?utm_source=scrapling"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png" height="500" width="500" alt="SerpApi Banner" ></a>
+<a href="https://serpapi.com/?utm_source=scrapling"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png" height="500" alt="SerpApi Banner" ></a>
 
 
 ## Contributing
{scrapling-0.2.6.dist-info → scrapling-0.2.7.dist-info}/RECORD
CHANGED
@@ -1,6 +1,6 @@
-scrapling/__init__.py,sha256=
+scrapling/__init__.py,sha256=WjvhJ6xkiSHp7St2YJYYJIsiKL8WDYuAQ_qIsg03v-0,435
 scrapling/defaults.py,sha256=blYDLiuI5DgDSLRWnUgpp21WtFOsv1BsCRCmPeg8Xc4,287
-scrapling/fetchers.py,sha256
+scrapling/fetchers.py,sha256=vjAsa-oleb7FfYsxqmEUVZGNxdo7LMVuiLuyjIGySQE,17417
 scrapling/parser.py,sha256=d2n00uF5i7W5lf0afLNRdk17ZFcNyiF9EzXLRQGA0NM,54111
 scrapling/py.typed,sha256=frcCV1k9oG9oKj3dpUqdJg1PxRT2RSN_XKdLCPjaYaY,2
 scrapling/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -11,12 +11,12 @@ scrapling/core/storage_adaptors.py,sha256=Kbak0BOJX5e9I1PbUS_4sUJi2Wxw8Bv5XsaLHA
 scrapling/core/translator.py,sha256=R97lKGq1SDbx8S8Hg_w_5d4ePgukTHj_hRIKFzWiRuc,5229
 scrapling/core/utils.py,sha256=fXdANUgRBbVbOerJ94fRY9vi7n5zsbm8t3G4qQ-F3ak,3792
 scrapling/engines/__init__.py,sha256=zwMqcSdNGh-IX0d4zXazrgAeHrkqIN_v5Ia7RU1g8W0,267
-scrapling/engines/camo.py,sha256=
-scrapling/engines/constants.py,sha256=
-scrapling/engines/pw.py,sha256=
-scrapling/engines/static.py,sha256=
+scrapling/engines/camo.py,sha256=Lw_uZ5SMBy3T6MkCNOMPk1i51Lnpfd0M7HyAUJAzKIg,8284
+scrapling/engines/constants.py,sha256=WTn-X4kFIDWjXTiqOT0tm4XT5pijcdohFyZ0Af2C5Xc,3723
+scrapling/engines/pw.py,sha256=ZRmbFNQWzvxUHVrIUcKefyg6fDpBrN6erdatDpcLBaw,13762
+scrapling/engines/static.py,sha256=ryVCIjTpVLNlCxSf_NYwDSdsoDbafnsGpkCoCROPhlI,8021
 scrapling/engines/toolbelt/__init__.py,sha256=BbxfC0depVOV3i3BnBnyfjHtLcZrDbhz6c5rTRczZUc,383
-scrapling/engines/toolbelt/custom.py,sha256=
+scrapling/engines/toolbelt/custom.py,sha256=KopO0SVWzFoNB8LbFDQhtErm8KCid6nkQcGqRaItC6U,12752
 scrapling/engines/toolbelt/fingerprints.py,sha256=T9HQejHzAnHsD5EIXvrYVC5siiG5q2gOOXVIIANmzMc,2917
 scrapling/engines/toolbelt/navigation.py,sha256=Tde5_6Wv7lOeWXMzs9D6TRaxAbJ3b-zIX6-4HggZbCQ,4017
 scrapling/engines/toolbelt/bypasses/navigator_plugins.js,sha256=tbnnk3nCXB6QEQnOhDlu3n-s7lnUTAkrUsjP6FDQIQg,2104
@@ -35,8 +35,8 @@ tests/fetchers/test_utils.py,sha256=FPPJkBrqgYxdGeWwapH8Vj8zyfYVLiTE1qSLu8eBWik,
 tests/parser/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/parser/test_automatch.py,sha256=BeeYJi3cYCghbiZmi57z4bqcGPaoUA8GAm7MALBBkkk,2486
 tests/parser/test_general.py,sha256=qaiVzpvqESfdXYFat6QrpnMkevPYgCzIcTZK5FwdC0s,11783
-scrapling-0.2.
-scrapling-0.2.
-scrapling-0.2.
-scrapling-0.2.
-scrapling-0.2.
+scrapling-0.2.7.dist-info/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
+scrapling-0.2.7.dist-info/METADATA,sha256=kYARTFqiiLsL_cvnU03pf2I1E5N_NmJk25gbeLzSR4M,66607
+scrapling-0.2.7.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+scrapling-0.2.7.dist-info/top_level.txt,sha256=ub7FkOEXeYmmYTUxd4pCrwXfBfAMIpZ1sCGmXCc14tI,16
+scrapling-0.2.7.dist-info/RECORD,,
{scrapling-0.2.6.dist-info → scrapling-0.2.7.dist-info}/LICENSE
File without changes
{scrapling-0.2.6.dist-info → scrapling-0.2.7.dist-info}/WHEEL
File without changes
{scrapling-0.2.6.dist-info → scrapling-0.2.7.dist-info}/top_level.txt
File without changes
|