scrapling 0.2.98__py3-none-any.whl → 0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +18 -31
- scrapling/cli.py +818 -20
- scrapling/core/_html_utils.py +348 -0
- scrapling/core/_types.py +34 -17
- scrapling/core/ai.py +611 -0
- scrapling/core/custom_types.py +183 -100
- scrapling/core/mixins.py +27 -19
- scrapling/core/shell.py +647 -0
- scrapling/core/{storage_adaptors.py → storage.py} +41 -33
- scrapling/core/translator.py +20 -26
- scrapling/core/utils.py +49 -54
- scrapling/engines/__init__.py +15 -6
- scrapling/engines/_browsers/__init__.py +2 -0
- scrapling/engines/_browsers/_camoufox.py +745 -0
- scrapling/engines/_browsers/_config_tools.py +130 -0
- scrapling/engines/_browsers/_controllers.py +630 -0
- scrapling/engines/_browsers/_page.py +93 -0
- scrapling/engines/_browsers/_validators.py +150 -0
- scrapling/engines/constants.py +101 -88
- scrapling/engines/static.py +667 -110
- scrapling/engines/toolbelt/__init__.py +20 -6
- scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +2 -1
- scrapling/engines/toolbelt/convertor.py +254 -0
- scrapling/engines/toolbelt/custom.py +205 -186
- scrapling/engines/toolbelt/fingerprints.py +32 -46
- scrapling/engines/toolbelt/navigation.py +68 -39
- scrapling/fetchers.py +255 -260
- scrapling/parser.py +781 -449
- scrapling-0.3.dist-info/METADATA +409 -0
- scrapling-0.3.dist-info/RECORD +41 -0
- {scrapling-0.2.98.dist-info → scrapling-0.3.dist-info}/WHEEL +1 -1
- {scrapling-0.2.98.dist-info → scrapling-0.3.dist-info}/top_level.txt +0 -1
- scrapling/defaults.py +0 -19
- scrapling/engines/camo.py +0 -299
- scrapling/engines/pw.py +0 -428
- scrapling/engines/toolbelt/bypasses/pdf_viewer.js +0 -5
- scrapling-0.2.98.dist-info/METADATA +0 -867
- scrapling-0.2.98.dist-info/RECORD +0 -49
- tests/__init__.py +0 -1
- tests/fetchers/__init__.py +0 -1
- tests/fetchers/async/__init__.py +0 -0
- tests/fetchers/async/test_camoufox.py +0 -95
- tests/fetchers/async/test_httpx.py +0 -83
- tests/fetchers/async/test_playwright.py +0 -99
- tests/fetchers/sync/__init__.py +0 -0
- tests/fetchers/sync/test_camoufox.py +0 -68
- tests/fetchers/sync/test_httpx.py +0 -82
- tests/fetchers/sync/test_playwright.py +0 -87
- tests/fetchers/test_utils.py +0 -97
- tests/parser/__init__.py +0 -0
- tests/parser/test_automatch.py +0 -111
- tests/parser/test_general.py +0 -330
- {scrapling-0.2.98.dist-info → scrapling-0.3.dist-info}/entry_points.txt +0 -0
- {scrapling-0.2.98.dist-info → scrapling-0.3.dist-info/licenses}/LICENSE +0 -0
scrapling/fetchers.py
CHANGED
@@ -1,217 +1,127 @@
|
|
1
|
-
from scrapling.core._types import (
|
2
|
-
|
3
|
-
|
4
|
-
|
1
|
+
from scrapling.core._types import (
|
2
|
+
Callable,
|
3
|
+
Dict,
|
4
|
+
List,
|
5
|
+
Optional,
|
6
|
+
SelectorWaitStates,
|
7
|
+
Iterable,
|
8
|
+
)
|
9
|
+
from scrapling.engines import (
|
10
|
+
FetcherSession,
|
11
|
+
StealthySession,
|
12
|
+
AsyncStealthySession,
|
13
|
+
DynamicSession,
|
14
|
+
AsyncDynamicSession,
|
15
|
+
FetcherClient as _FetcherClient,
|
16
|
+
AsyncFetcherClient as _AsyncFetcherClient,
|
17
|
+
)
|
5
18
|
from scrapling.engines.toolbelt import BaseFetcher, Response
|
6
19
|
|
20
|
+
__FetcherClientInstance__ = _FetcherClient()
|
21
|
+
__AsyncFetcherClientInstance__ = _AsyncFetcherClient()
|
7
22
|
|
8
|
-
class Fetcher(BaseFetcher):
|
9
|
-
"""A basic `Fetcher` class type that can only do basic GET, POST, PUT, and DELETE HTTP requests based on httpx.
|
10
|
-
|
11
|
-
Any additional keyword arguments passed to the methods below are passed to the respective httpx's method directly.
|
12
|
-
"""
|
13
|
-
def get(
|
14
|
-
self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
|
15
|
-
proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
|
16
|
-
"""Make basic HTTP GET request for you but with some added flavors.
|
17
|
-
|
18
|
-
:param url: Target url.
|
19
|
-
:param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
|
20
|
-
:param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
|
21
|
-
:param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
|
22
|
-
create a referer header as if this request had came from Google's search of this URL's domain.
|
23
|
-
:param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
|
24
|
-
:param retries: The number of retries to do through httpx if the request failed for any reason. The default is 3 retries.
|
25
|
-
:param kwargs: Any additional keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
|
26
|
-
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
27
|
-
"""
|
28
|
-
adaptor_arguments = tuple(self.adaptor_arguments.items())
|
29
|
-
response_object = StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries, adaptor_arguments=adaptor_arguments).get(**kwargs)
|
30
|
-
return response_object
|
31
|
-
|
32
|
-
def post(
|
33
|
-
self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
|
34
|
-
proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
|
35
|
-
"""Make basic HTTP POST request for you but with some added flavors.
|
36
|
-
|
37
|
-
:param url: Target url.
|
38
|
-
:param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
|
39
|
-
:param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
|
40
|
-
:param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
|
41
|
-
create a referer header as if this request came from Google's search of this URL's domain.
|
42
|
-
:param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
|
43
|
-
:param retries: The number of retries to do through httpx if the request failed for any reason. The default is 3 retries.
|
44
|
-
:param kwargs: Any additional keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
|
45
|
-
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
46
|
-
"""
|
47
|
-
adaptor_arguments = tuple(self.adaptor_arguments.items())
|
48
|
-
response_object = StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries, adaptor_arguments=adaptor_arguments).post(**kwargs)
|
49
|
-
return response_object
|
50
|
-
|
51
|
-
def put(
|
52
|
-
self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
|
53
|
-
proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
|
54
|
-
"""Make basic HTTP PUT request for you but with some added flavors.
|
55
|
-
|
56
|
-
:param url: Target url
|
57
|
-
:param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
|
58
|
-
:param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
|
59
|
-
:param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
|
60
|
-
create a referer header as if this request came from Google's search of this URL's domain.
|
61
|
-
:param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
|
62
|
-
:param retries: The number of retries to do through httpx if the request failed for any reason. The default is 3 retries.
|
63
|
-
:param kwargs: Any additional keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
|
64
|
-
|
65
|
-
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
66
|
-
"""
|
67
|
-
adaptor_arguments = tuple(self.adaptor_arguments.items())
|
68
|
-
response_object = StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries, adaptor_arguments=adaptor_arguments).put(**kwargs)
|
69
|
-
return response_object
|
70
|
-
|
71
|
-
def delete(
|
72
|
-
self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
|
73
|
-
proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
|
74
|
-
"""Make basic HTTP DELETE request for you but with some added flavors.
|
75
|
-
|
76
|
-
:param url: Target url
|
77
|
-
:param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
|
78
|
-
:param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
|
79
|
-
:param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
|
80
|
-
create a referer header as if this request came from Google's search of this URL's domain.
|
81
|
-
:param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
|
82
|
-
:param retries: The number of retries to do through httpx if the request failed for any reason. The default is 3 retries.
|
83
|
-
:param kwargs: Any additional keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
|
84
|
-
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
85
|
-
"""
|
86
|
-
adaptor_arguments = tuple(self.adaptor_arguments.items())
|
87
|
-
response_object = StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries, adaptor_arguments=adaptor_arguments).delete(**kwargs)
|
88
|
-
return response_object
|
89
|
-
|
90
|
-
|
91
|
-
class AsyncFetcher(Fetcher):
|
92
|
-
async def get(
|
93
|
-
self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
|
94
|
-
proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
|
95
|
-
"""Make basic HTTP GET request for you but with some added flavors.
|
96
|
-
|
97
|
-
:param url: Target url.
|
98
|
-
:param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
|
99
|
-
:param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
|
100
|
-
:param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
|
101
|
-
create a referer header as if this request had came from Google's search of this URL's domain.
|
102
|
-
:param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
|
103
|
-
:param retries: The number of retries to do through httpx if the request failed for any reason. The default is 3 retries.
|
104
|
-
:param kwargs: Any additional keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
|
105
|
-
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
106
|
-
"""
|
107
|
-
adaptor_arguments = tuple(self.adaptor_arguments.items())
|
108
|
-
response_object = await StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries=retries, adaptor_arguments=adaptor_arguments).async_get(**kwargs)
|
109
|
-
return response_object
|
110
23
|
|
111
|
-
|
112
|
-
|
113
|
-
proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
|
114
|
-
"""Make basic HTTP POST request for you but with some added flavors.
|
115
|
-
|
116
|
-
:param url: Target url.
|
117
|
-
:param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
|
118
|
-
:param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
|
119
|
-
:param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
|
120
|
-
create a referer header as if this request came from Google's search of this URL's domain.
|
121
|
-
:param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
|
122
|
-
:param retries: The number of retries to do through httpx if the request failed for any reason. The default is 3 retries.
|
123
|
-
:param kwargs: Any additional keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
|
124
|
-
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
125
|
-
"""
|
126
|
-
adaptor_arguments = tuple(self.adaptor_arguments.items())
|
127
|
-
response_object = await StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries=retries, adaptor_arguments=adaptor_arguments).async_post(**kwargs)
|
128
|
-
return response_object
|
24
|
+
class Fetcher(BaseFetcher):
|
25
|
+
"""A basic `Fetcher` class type that can only do basic GET, POST, PUT, and DELETE HTTP requests based on `curl_cffi`."""
|
129
26
|
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
27
|
+
get = __FetcherClientInstance__.get
|
28
|
+
post = __FetcherClientInstance__.post
|
29
|
+
put = __FetcherClientInstance__.put
|
30
|
+
delete = __FetcherClientInstance__.delete
|
134
31
|
|
135
|
-
:param url: Target url
|
136
|
-
:param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
|
137
|
-
:param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
|
138
|
-
:param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
|
139
|
-
create a referer header as if this request came from Google's search of this URL's domain.
|
140
|
-
:param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
|
141
|
-
:param retries: The number of retries to do through httpx if the request failed for any reason. The default is 3 retries.
|
142
|
-
:param kwargs: Any additional keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
|
143
|
-
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
144
|
-
"""
|
145
|
-
adaptor_arguments = tuple(self.adaptor_arguments.items())
|
146
|
-
response_object = await StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries=retries, adaptor_arguments=adaptor_arguments).async_put(**kwargs)
|
147
|
-
return response_object
|
148
32
|
|
149
|
-
|
150
|
-
|
151
|
-
proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
|
152
|
-
"""Make basic HTTP DELETE request for you but with some added flavors.
|
33
|
+
class AsyncFetcher(BaseFetcher):
|
34
|
+
"""A basic `Fetcher` class type that can only do basic GET, POST, PUT, and DELETE HTTP requests based on `curl_cffi`."""
|
153
35
|
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
create a referer header as if this request came from Google's search of this URL's domain.
|
159
|
-
:param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
|
160
|
-
:param retries: The number of retries to do through httpx if the request failed for any reason. The default is 3 retries.
|
161
|
-
:param kwargs: Any additional keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
|
162
|
-
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
163
|
-
"""
|
164
|
-
adaptor_arguments = tuple(self.adaptor_arguments.items())
|
165
|
-
response_object = await StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries=retries, adaptor_arguments=adaptor_arguments).async_delete(**kwargs)
|
166
|
-
return response_object
|
36
|
+
get = __AsyncFetcherClientInstance__.get
|
37
|
+
post = __AsyncFetcherClientInstance__.post
|
38
|
+
put = __AsyncFetcherClientInstance__.put
|
39
|
+
delete = __AsyncFetcherClientInstance__.delete
|
167
40
|
|
168
41
|
|
169
42
|
class StealthyFetcher(BaseFetcher):
|
170
|
-
"""A `Fetcher` class type that is completely stealthy fetcher that uses a modified version of Firefox.
|
43
|
+
"""A `Fetcher` class type that is a completely stealthy fetcher that uses a modified version of Firefox.
|
171
44
|
|
172
|
-
|
173
|
-
|
45
|
+
It works as real browsers passing almost all online tests/protections based on Camoufox.
|
46
|
+
Other added flavors include setting the faked OS fingerprints to match the user's OS, and the referer of every request is set as if this request came from Google's search of this URL's domain.
|
174
47
|
"""
|
48
|
+
|
49
|
+
@classmethod
|
175
50
|
def fetch(
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
51
|
+
cls,
|
52
|
+
url: str,
|
53
|
+
headless: bool = True, # noqa: F821
|
54
|
+
block_images: bool = False,
|
55
|
+
disable_resources: bool = False,
|
56
|
+
block_webrtc: bool = False,
|
57
|
+
allow_webgl: bool = True,
|
58
|
+
network_idle: bool = False,
|
59
|
+
humanize: bool | float = True,
|
60
|
+
solve_cloudflare: bool = False,
|
61
|
+
wait: int | float = 0,
|
62
|
+
timeout: int | float = 30000,
|
63
|
+
page_action: Optional[Callable] = None,
|
64
|
+
wait_selector: Optional[str] = None,
|
65
|
+
addons: Optional[List[str]] = None,
|
66
|
+
wait_selector_state: SelectorWaitStates = "attached",
|
67
|
+
cookies: Optional[List[Dict]] = None,
|
68
|
+
google_search: bool = True,
|
69
|
+
extra_headers: Optional[Dict[str, str]] = None,
|
70
|
+
proxy: Optional[str | Dict[str, str]] = None,
|
71
|
+
os_randomize: bool = False,
|
72
|
+
disable_ads: bool = False,
|
73
|
+
geoip: bool = False,
|
74
|
+
custom_config: Optional[Dict] = None,
|
75
|
+
additional_args: Optional[Dict] = None,
|
181
76
|
) -> Response:
|
182
77
|
"""
|
183
78
|
Opens up a browser and does your request based on the options you choose below.
|
184
79
|
|
185
80
|
:param url: Target url.
|
186
|
-
:param headless: Run the browser in headless/hidden (default),
|
81
|
+
:param headless: Run the browser in headless/hidden (default), or headful/visible mode.
|
187
82
|
:param block_images: Prevent the loading of images through Firefox preferences.
|
188
83
|
This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
|
189
|
-
:param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends but it made requests ~25% faster in my tests for some websites.
|
84
|
+
:param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
|
190
85
|
Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
|
191
86
|
This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
|
192
87
|
:param block_webrtc: Blocks WebRTC entirely.
|
88
|
+
:param cookies: Set cookies for the next request.
|
193
89
|
:param addons: List of Firefox addons to use. Must be paths to extracted addons.
|
194
|
-
:param disable_ads: Disabled by default, this installs `uBlock Origin` addon on the browser if enabled.
|
195
90
|
:param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
|
196
|
-
:param
|
197
|
-
:param
|
198
|
-
It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region.
|
91
|
+
:param solve_cloudflare: Solves all 3 types of Cloudflare's Turnstile wait page before returning the response to you.
|
92
|
+
:param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
|
199
93
|
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
94
|
+
:param disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
|
200
95
|
:param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
|
201
|
-
:param
|
96
|
+
:param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
|
97
|
+
:param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
|
202
98
|
:param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
|
203
|
-
:param wait_selector: Wait for a specific
|
204
|
-
:param
|
205
|
-
|
99
|
+
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
|
100
|
+
:param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, and spoof the WebRTC IP address.
|
101
|
+
It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region.
|
102
|
+
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
103
|
+
:param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
|
206
104
|
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
207
105
|
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
208
|
-
:
|
106
|
+
:param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
|
107
|
+
:param additional_args: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
|
108
|
+
:return: A `Response` object.
|
209
109
|
"""
|
210
|
-
|
110
|
+
if not custom_config:
|
111
|
+
custom_config = {}
|
112
|
+
elif not isinstance(custom_config, dict):
|
113
|
+
ValueError(
|
114
|
+
f"The custom parser config must be of type dictionary, got {cls.__class__}"
|
115
|
+
)
|
116
|
+
|
117
|
+
with StealthySession(
|
118
|
+
wait=wait,
|
119
|
+
max_pages=1,
|
211
120
|
proxy=proxy,
|
212
121
|
geoip=geoip,
|
213
122
|
addons=addons,
|
214
123
|
timeout=timeout,
|
124
|
+
cookies=cookies,
|
215
125
|
headless=headless,
|
216
126
|
humanize=humanize,
|
217
127
|
disable_ads=disable_ads,
|
@@ -224,52 +134,90 @@ class StealthyFetcher(BaseFetcher):
|
|
224
134
|
wait_selector=wait_selector,
|
225
135
|
google_search=google_search,
|
226
136
|
extra_headers=extra_headers,
|
137
|
+
solve_cloudflare=solve_cloudflare,
|
227
138
|
disable_resources=disable_resources,
|
228
139
|
wait_selector_state=wait_selector_state,
|
229
|
-
|
230
|
-
|
231
|
-
|
140
|
+
selector_config={**cls._generate_parser_arguments(), **custom_config},
|
141
|
+
additional_args=additional_args or {},
|
142
|
+
) as engine:
|
143
|
+
return engine.fetch(url)
|
232
144
|
|
145
|
+
@classmethod
|
233
146
|
async def async_fetch(
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
147
|
+
cls,
|
148
|
+
url: str,
|
149
|
+
headless: bool = True, # noqa: F821
|
150
|
+
block_images: bool = False,
|
151
|
+
disable_resources: bool = False,
|
152
|
+
block_webrtc: bool = False,
|
153
|
+
allow_webgl: bool = True,
|
154
|
+
network_idle: bool = False,
|
155
|
+
humanize: bool | float = True,
|
156
|
+
solve_cloudflare: bool = False,
|
157
|
+
wait: int | float = 0,
|
158
|
+
timeout: int | float = 30000,
|
159
|
+
page_action: Optional[Callable] = None,
|
160
|
+
wait_selector: Optional[str] = None,
|
161
|
+
addons: Optional[List[str]] = None,
|
162
|
+
wait_selector_state: SelectorWaitStates = "attached",
|
163
|
+
cookies: Optional[List[Dict]] = None,
|
164
|
+
google_search: bool = True,
|
165
|
+
extra_headers: Optional[Dict[str, str]] = None,
|
166
|
+
proxy: Optional[str | Dict[str, str]] = None,
|
167
|
+
os_randomize: bool = False,
|
168
|
+
disable_ads: bool = False,
|
169
|
+
geoip: bool = False,
|
170
|
+
custom_config: Optional[Dict] = None,
|
171
|
+
additional_args: Optional[Dict] = None,
|
239
172
|
) -> Response:
|
240
173
|
"""
|
241
174
|
Opens up a browser and does your request based on the options you choose below.
|
242
175
|
|
243
176
|
:param url: Target url.
|
244
|
-
:param headless: Run the browser in headless/hidden (default),
|
177
|
+
:param headless: Run the browser in headless/hidden (default), or headful/visible mode.
|
245
178
|
:param block_images: Prevent the loading of images through Firefox preferences.
|
246
179
|
This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
|
247
|
-
:param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends but it made requests ~25% faster in my tests for some websites.
|
180
|
+
:param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
|
248
181
|
Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
|
249
182
|
This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
|
250
183
|
:param block_webrtc: Blocks WebRTC entirely.
|
184
|
+
:param cookies: Set cookies for the next request.
|
251
185
|
:param addons: List of Firefox addons to use. Must be paths to extracted addons.
|
252
|
-
:param disable_ads: Disabled by default, this installs `uBlock Origin` addon on the browser if enabled.
|
253
186
|
:param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
|
254
|
-
:param
|
255
|
-
:param
|
256
|
-
It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region.
|
187
|
+
:param solve_cloudflare: Solves all 3 types of Cloudflare's Turnstile wait page before returning the response to you.
|
188
|
+
:param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
|
257
189
|
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
190
|
+
:param disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
|
258
191
|
:param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
|
259
|
-
:param
|
192
|
+
:param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
|
193
|
+
:param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
|
260
194
|
:param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
|
261
|
-
:param wait_selector: Wait for a specific
|
262
|
-
:param
|
263
|
-
|
195
|
+
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
|
196
|
+
:param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, and spoof the WebRTC IP address.
|
197
|
+
It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region.
|
198
|
+
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
199
|
+
:param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
|
264
200
|
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
265
201
|
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
266
|
-
:
|
202
|
+
:param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
|
203
|
+
:param additional_args: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
|
204
|
+
:return: A `Response` object.
|
267
205
|
"""
|
268
|
-
|
206
|
+
if not custom_config:
|
207
|
+
custom_config = {}
|
208
|
+
elif not isinstance(custom_config, dict):
|
209
|
+
ValueError(
|
210
|
+
f"The custom parser config must be of type dictionary, got {cls.__class__}"
|
211
|
+
)
|
212
|
+
|
213
|
+
async with AsyncStealthySession(
|
214
|
+
wait=wait,
|
215
|
+
max_pages=1,
|
269
216
|
proxy=proxy,
|
270
217
|
geoip=geoip,
|
271
218
|
addons=addons,
|
272
219
|
timeout=timeout,
|
220
|
+
cookies=cookies,
|
273
221
|
headless=headless,
|
274
222
|
humanize=humanize,
|
275
223
|
disable_ads=disable_ads,
|
@@ -282,71 +230,99 @@ class StealthyFetcher(BaseFetcher):
|
|
282
230
|
wait_selector=wait_selector,
|
283
231
|
google_search=google_search,
|
284
232
|
extra_headers=extra_headers,
|
233
|
+
solve_cloudflare=solve_cloudflare,
|
285
234
|
disable_resources=disable_resources,
|
286
235
|
wait_selector_state=wait_selector_state,
|
287
|
-
|
288
|
-
|
289
|
-
|
236
|
+
selector_config={**cls._generate_parser_arguments(), **custom_config},
|
237
|
+
additional_args=additional_args or {},
|
238
|
+
) as engine:
|
239
|
+
return await engine.fetch(url)
|
290
240
|
|
291
241
|
|
292
|
-
class
|
242
|
+
class DynamicFetcher(BaseFetcher):
|
293
243
|
"""A `Fetcher` class type that provide many options, all of them are based on PlayWright.
|
294
244
|
|
295
245
|
Using this Fetcher class, you can do requests with:
|
296
246
|
- Vanilla Playwright without any modifications other than the ones you chose.
|
297
|
-
- Stealthy Playwright with the stealth mode I wrote for it. It's still a work in progress but it bypasses many online tests like bot.sannysoft.com
|
247
|
+
- Stealthy Playwright with the stealth mode I wrote for it. It's still a work in progress, but it bypasses many online tests like bot.sannysoft.com
|
298
248
|
Some of the things stealth mode does include:
|
299
249
|
1) Patches the CDP runtime fingerprint.
|
300
250
|
2) Mimics some of the real browsers' properties by injecting several JS files and using custom options.
|
301
251
|
3) Using custom flags on launch to hide Playwright even more and make it faster.
|
302
|
-
4) Generates real browser's headers of the same type and same user OS then append it to the request.
|
303
|
-
- Real browsers by passing the `real_chrome` argument or the CDP URL of your browser to be controlled by the Fetcher and most of the options can be enabled on it.
|
252
|
+
4) Generates real browser headers of the same type and same user OS, then appends them to the request.
|
253
|
+
- Real browsers by passing the `real_chrome` argument or the CDP URL of your browser to be controlled by the Fetcher, and most of the options can be enabled on it.
|
304
254
|
- NSTBrowser's docker browserless option by passing the CDP URL and enabling `nstbrowser_mode` option.
|
305
255
|
|
306
|
-
> Note that these are the main options with PlayWright but it can be mixed
|
256
|
+
> Note that these are the main options with PlayWright, but it can be mixed.
|
307
257
|
"""
|
258
|
+
|
259
|
+
@classmethod
|
308
260
|
def fetch(
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
261
|
+
cls,
|
262
|
+
url: str,
|
263
|
+
headless: bool = True,
|
264
|
+
google_search: bool = True,
|
265
|
+
hide_canvas: bool = False,
|
266
|
+
disable_webgl: bool = False,
|
267
|
+
real_chrome: bool = False,
|
268
|
+
stealth: bool = False,
|
269
|
+
wait: int | float = 0,
|
270
|
+
page_action: Optional[Callable] = None,
|
271
|
+
proxy: Optional[str | Dict[str, str]] = None,
|
272
|
+
locale: str = "en-US",
|
273
|
+
extra_headers: Optional[Dict[str, str]] = None,
|
274
|
+
useragent: Optional[str] = None,
|
275
|
+
cdp_url: Optional[str] = None,
|
276
|
+
timeout: int | float = 30000,
|
277
|
+
disable_resources: bool = False,
|
278
|
+
wait_selector: Optional[str] = None,
|
279
|
+
cookies: Optional[Iterable[Dict]] = None,
|
280
|
+
network_idle: bool = False,
|
281
|
+
wait_selector_state: SelectorWaitStates = "attached",
|
282
|
+
custom_config: Optional[Dict] = None,
|
317
283
|
) -> Response:
|
318
284
|
"""Opens up a browser and do your request based on your chosen options below.
|
319
285
|
|
320
286
|
:param url: Target url.
|
321
287
|
:param headless: Run the browser in headless/hidden (default), or headful/visible mode.
|
322
|
-
:param disable_resources: Drop requests of unnecessary resources for speed boost. It depends but it made requests ~25% faster in my tests for some websites.
|
288
|
+
:param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
|
323
289
|
Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
|
324
290
|
This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
|
325
291
|
:param useragent: Pass a useragent string to be used. Otherwise, the fetcher will generate a real useragent of the same browser and use it.
|
292
|
+
:param cookies: Set cookies for the next request.
|
326
293
|
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
327
|
-
:param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is
|
328
|
-
:param
|
294
|
+
:param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000.
|
295
|
+
:param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
|
329
296
|
:param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
|
330
|
-
:param wait_selector: Wait for a specific
|
331
|
-
:param
|
297
|
+
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
|
298
|
+
:param locale: Set the locale for the browser if wanted. The default value is `en-US`.
|
299
|
+
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
332
300
|
:param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
|
333
|
-
:param real_chrome: If you have
|
301
|
+
:param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
|
334
302
|
:param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
|
335
303
|
:param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
|
336
|
-
:param
|
304
|
+
:param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
|
305
|
+
:param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
|
337
306
|
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
338
307
|
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
339
|
-
:param
|
340
|
-
:
|
341
|
-
:param nstbrowser_config: The config you want to send with requests to the NSTBrowser. If left empty, Scrapling defaults to an optimized NSTBrowser's docker browserless config.
|
342
|
-
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
308
|
+
:param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameter values.
|
309
|
+
:return: A `Response` object.
|
343
310
|
"""
|
344
|
-
|
311
|
+
if not custom_config:
|
312
|
+
custom_config = {}
|
313
|
+
elif not isinstance(custom_config, dict):
|
314
|
+
raise ValueError(
|
315
|
+
f"The custom parser config must be of type dictionary, got {cls.__class__}"
|
316
|
+
)
|
317
|
+
|
318
|
+
with DynamicSession(
|
319
|
+
wait=wait,
|
345
320
|
proxy=proxy,
|
346
321
|
locale=locale,
|
347
322
|
timeout=timeout,
|
348
323
|
stealth=stealth,
|
349
324
|
cdp_url=cdp_url,
|
325
|
+
cookies=cookies,
|
350
326
|
headless=headless,
|
351
327
|
useragent=useragent,
|
352
328
|
real_chrome=real_chrome,
|
@@ -357,58 +333,82 @@ class PlayWrightFetcher(BaseFetcher):
|
|
357
333
|
extra_headers=extra_headers,
|
358
334
|
wait_selector=wait_selector,
|
359
335
|
disable_webgl=disable_webgl,
|
360
|
-
nstbrowser_mode=nstbrowser_mode,
|
361
|
-
nstbrowser_config=nstbrowser_config,
|
362
336
|
disable_resources=disable_resources,
|
363
337
|
wait_selector_state=wait_selector_state,
|
364
|
-
|
365
|
-
)
|
366
|
-
|
338
|
+
selector_config={**cls._generate_parser_arguments(), **custom_config},
|
339
|
+
) as session:
|
340
|
+
return session.fetch(url)
|
367
341
|
|
342
|
+
@classmethod
|
368
343
|
async def async_fetch(
|
369
|
-
|
370
|
-
|
371
|
-
|
372
|
-
|
373
|
-
|
374
|
-
|
375
|
-
|
376
|
-
|
344
|
+
cls,
|
345
|
+
url: str,
|
346
|
+
headless: bool = True,
|
347
|
+
google_search: bool = True,
|
348
|
+
hide_canvas: bool = False,
|
349
|
+
disable_webgl: bool = False,
|
350
|
+
real_chrome: bool = False,
|
351
|
+
stealth: bool = False,
|
352
|
+
wait: int | float = 0,
|
353
|
+
page_action: Optional[Callable] = None,
|
354
|
+
proxy: Optional[str | Dict[str, str]] = None,
|
355
|
+
locale: str = "en-US",
|
356
|
+
extra_headers: Optional[Dict[str, str]] = None,
|
357
|
+
useragent: Optional[str] = None,
|
358
|
+
cdp_url: Optional[str] = None,
|
359
|
+
timeout: int | float = 30000,
|
360
|
+
disable_resources: bool = False,
|
361
|
+
wait_selector: Optional[str] = None,
|
362
|
+
cookies: Optional[Iterable[Dict]] = None,
|
363
|
+
network_idle: bool = False,
|
364
|
+
wait_selector_state: SelectorWaitStates = "attached",
|
365
|
+
custom_config: Optional[Dict] = None,
|
377
366
|
) -> Response:
|
378
367
|
"""Opens up a browser and do your request based on your chosen options below.
|
379
368
|
|
380
369
|
:param url: Target url.
|
381
370
|
:param headless: Run the browser in headless/hidden (default), or headful/visible mode.
|
382
|
-
:param disable_resources: Drop requests of unnecessary resources for speed boost. It depends but it made requests ~25% faster in my tests for some websites.
|
371
|
+
:param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
|
383
372
|
Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
|
384
373
|
This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
|
385
374
|
:param useragent: Pass a useragent string to be used. Otherwise, the fetcher will generate a real useragent of the same browser and use it.
|
375
|
+
:param cookies: Set cookies for the next request.
|
386
376
|
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
387
|
-
:param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is
|
388
|
-
:param
|
377
|
+
:param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000.
|
378
|
+
:param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
|
389
379
|
:param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
|
390
|
-
:param wait_selector: Wait for a specific
|
391
|
-
:param
|
380
|
+
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
|
381
|
+
:param locale: Set the locale for the browser if wanted. The default value is `en-US`.
|
382
|
+
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
392
383
|
:param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
|
393
|
-
:param real_chrome: If you have
|
384
|
+
:param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
|
394
385
|
:param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
|
395
386
|
:param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
|
396
|
-
:param
|
387
|
+
:param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
|
388
|
+
:param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
|
397
389
|
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
398
390
|
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
399
|
-
:param
|
400
|
-
:
|
401
|
-
:param nstbrowser_config: The config you want to send with requests to the NSTBrowser. If left empty, Scrapling defaults to an optimized NSTBrowser's docker browserless config.
|
402
|
-
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
391
|
+
:param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameter values.
|
392
|
+
:return: A `Response` object.
|
403
393
|
"""
|
404
|
-
|
394
|
+
if not custom_config:
|
395
|
+
custom_config = {}
|
396
|
+
elif not isinstance(custom_config, dict):
|
397
|
+
raise ValueError(
|
398
|
+
f"The custom parser config must be of type dictionary, got {cls.__class__}"
|
399
|
+
)
|
400
|
+
|
401
|
+
async with AsyncDynamicSession(
|
402
|
+
wait=wait,
|
405
403
|
proxy=proxy,
|
406
404
|
locale=locale,
|
407
405
|
timeout=timeout,
|
408
406
|
stealth=stealth,
|
409
407
|
cdp_url=cdp_url,
|
408
|
+
cookies=cookies,
|
410
409
|
headless=headless,
|
411
410
|
useragent=useragent,
|
411
|
+
max_pages=1,
|
412
412
|
real_chrome=real_chrome,
|
413
413
|
page_action=page_action,
|
414
414
|
hide_canvas=hide_canvas,
|
@@ -417,16 +417,11 @@ class PlayWrightFetcher(BaseFetcher):
|
|
417
417
|
extra_headers=extra_headers,
|
418
418
|
wait_selector=wait_selector,
|
419
419
|
disable_webgl=disable_webgl,
|
420
|
-
nstbrowser_mode=nstbrowser_mode,
|
421
|
-
nstbrowser_config=nstbrowser_config,
|
422
420
|
disable_resources=disable_resources,
|
423
421
|
wait_selector_state=wait_selector_state,
|
424
|
-
|
425
|
-
)
|
426
|
-
|
422
|
+
selector_config={**cls._generate_parser_arguments(), **custom_config},
|
423
|
+
) as session:
|
424
|
+
return await session.fetch(url)
|
427
425
|
|
428
426
|
|
429
|
-
|
430
|
-
def fetch(self, url: str, browser_engine, **kwargs) -> Response:
|
431
|
-
engine = check_if_engine_usable(browser_engine)(adaptor_arguments=self.adaptor_arguments, **kwargs)
|
432
|
-
return engine.fetch(url)
|
427
|
+
PlayWrightFetcher = DynamicFetcher # For backward-compatibility
|