scrapling 0.2.99__py3-none-any.whl → 0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +18 -31
- scrapling/cli.py +818 -20
- scrapling/core/_html_utils.py +348 -0
- scrapling/core/_types.py +34 -17
- scrapling/core/ai.py +611 -0
- scrapling/core/custom_types.py +183 -100
- scrapling/core/mixins.py +27 -19
- scrapling/core/shell.py +647 -0
- scrapling/core/{storage_adaptors.py → storage.py} +41 -33
- scrapling/core/translator.py +20 -26
- scrapling/core/utils.py +49 -54
- scrapling/engines/__init__.py +15 -6
- scrapling/engines/_browsers/__init__.py +2 -0
- scrapling/engines/_browsers/_camoufox.py +745 -0
- scrapling/engines/_browsers/_config_tools.py +130 -0
- scrapling/engines/_browsers/_controllers.py +630 -0
- scrapling/engines/_browsers/_page.py +93 -0
- scrapling/engines/_browsers/_validators.py +150 -0
- scrapling/engines/constants.py +101 -88
- scrapling/engines/static.py +667 -110
- scrapling/engines/toolbelt/__init__.py +20 -6
- scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +2 -1
- scrapling/engines/toolbelt/convertor.py +254 -0
- scrapling/engines/toolbelt/custom.py +158 -175
- scrapling/engines/toolbelt/fingerprints.py +32 -46
- scrapling/engines/toolbelt/navigation.py +68 -39
- scrapling/fetchers.py +227 -333
- scrapling/parser.py +781 -449
- scrapling-0.3.dist-info/METADATA +409 -0
- scrapling-0.3.dist-info/RECORD +41 -0
- {scrapling-0.2.99.dist-info → scrapling-0.3.dist-info}/WHEEL +1 -1
- {scrapling-0.2.99.dist-info → scrapling-0.3.dist-info}/top_level.txt +0 -1
- scrapling/defaults.py +0 -25
- scrapling/engines/camo.py +0 -339
- scrapling/engines/pw.py +0 -465
- scrapling/engines/toolbelt/bypasses/pdf_viewer.js +0 -5
- scrapling-0.2.99.dist-info/METADATA +0 -290
- scrapling-0.2.99.dist-info/RECORD +0 -49
- tests/__init__.py +0 -1
- tests/fetchers/__init__.py +0 -1
- tests/fetchers/async/__init__.py +0 -0
- tests/fetchers/async/test_camoufox.py +0 -97
- tests/fetchers/async/test_httpx.py +0 -85
- tests/fetchers/async/test_playwright.py +0 -101
- tests/fetchers/sync/__init__.py +0 -0
- tests/fetchers/sync/test_camoufox.py +0 -70
- tests/fetchers/sync/test_httpx.py +0 -84
- tests/fetchers/sync/test_playwright.py +0 -89
- tests/fetchers/test_utils.py +0 -97
- tests/parser/__init__.py +0 -0
- tests/parser/test_automatch.py +0 -111
- tests/parser/test_general.py +0 -330
- {scrapling-0.2.99.dist-info → scrapling-0.3.dist-info}/entry_points.txt +0 -0
- {scrapling-0.2.99.dist-info → scrapling-0.3.dist-info}/licenses/LICENSE +0 -0
scrapling/engines/camo.py
DELETED
@@ -1,339 +0,0 @@
|
|
1
|
-
from camoufox import DefaultAddons
|
2
|
-
from camoufox.async_api import AsyncCamoufox
|
3
|
-
from camoufox.sync_api import Camoufox
|
4
|
-
|
5
|
-
from scrapling.core._types import (Callable, Dict, List, Literal, Optional,
|
6
|
-
SelectorWaitStates, Union)
|
7
|
-
from scrapling.core.utils import log
|
8
|
-
from scrapling.engines.toolbelt import (Response, StatusText,
|
9
|
-
async_intercept_route,
|
10
|
-
check_type_validity,
|
11
|
-
construct_proxy_dict,
|
12
|
-
generate_convincing_referer,
|
13
|
-
get_os_name, intercept_route)
|
14
|
-
|
15
|
-
|
16
|
-
class CamoufoxEngine:
    """Browser engine that fetches pages through the Camoufox library.

    The engine stores the fetch options once in `__init__` and then exposes a blocking `fetch` and
    a coroutine `async_fetch`; both open a fresh Camoufox browser for every call and return a `Response`.
    Check the `StealthyFetcher` class for more documentation.
    """

    def __init__(
            self, headless: Union[bool, Literal['virtual']] = True, block_images: bool = False, disable_resources: bool = False,
            block_webrtc: bool = False, allow_webgl: bool = True, network_idle: bool = False, humanize: Union[bool, float] = True, wait: Optional[int] = 0,
            timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, addons: Optional[List[str]] = None,
            wait_selector_state: SelectorWaitStates = 'attached', google_search: bool = True, extra_headers: Optional[Dict[str, str]] = None,
            proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: bool = False, disable_ads: bool = False,
            geoip: bool = False,
            adaptor_arguments: Dict = None,
            additional_arguments: Dict = None
    ):
        """An engine that utilizes Camoufox library, check the `StealthyFetcher` class for more documentation.

        :param headless: Run the browser in headless/hidden (default), virtual screen mode, or headful/visible mode.
        :param block_images: Prevent the loading of images through Firefox preferences.
            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
        :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends but it made requests ~25% faster in my tests for some websites.
            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
        :param block_webrtc: Blocks WebRTC entirely.
        :param addons: List of Firefox addons to use. Must be paths to extracted addons.
        :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
        :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
        :param disable_ads: Disabled by default, this installs `uBlock Origin` addon on the browser if enabled.
        :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning `Response` object.
        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
        :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
            A non-callable value is ignored (an error is logged).
        :param wait_selector: Wait for a specific css selector to be in a specific state.
        :param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, & spoof the WebRTC IP address.
            It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region.
        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`.
        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
        :param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
        :param additional_arguments: Additional arguments to be passed to Camoufox as additional settings and it takes higher priority than Scrapling's settings.
        """
        self.headless = headless
        self.block_images = bool(block_images)
        self.disable_resources = bool(disable_resources)
        self.block_webrtc = bool(block_webrtc)
        self.allow_webgl = bool(allow_webgl)
        self.network_idle = bool(network_idle)
        self.google_search = bool(google_search)
        self.os_randomize = bool(os_randomize)
        self.disable_ads = bool(disable_ads)
        self.geoip = bool(geoip)
        self.extra_headers = extra_headers or {}
        self.additional_arguments = additional_arguments or {}
        self.proxy = construct_proxy_dict(proxy)
        self.addons = addons or []
        self.humanize = humanize
        self.timeout = check_type_validity(timeout, [int, float], 30000)
        self.wait = check_type_validity(wait, [int, float], 0)

        # Page action callable validation
        self.page_action = None
        if page_action is not None:
            if callable(page_action):
                self.page_action = page_action
            else:
                log.error('[Ignored] Argument "page_action" must be callable')

        self.wait_selector = wait_selector
        self.wait_selector_state = wait_selector_state
        self.adaptor_arguments = adaptor_arguments if adaptor_arguments else {}

    def _get_camoufox_options(self):
        """Return consistent browser options dictionary for both sync and async methods"""
        return {
            "geoip": self.geoip,
            "proxy": self.proxy,
            "enable_cache": True,
            "addons": self.addons,
            # uBlock Origin is excluded unless the user asked for ad blocking
            "exclude_addons": [] if self.disable_ads else [DefaultAddons.UBO],
            "headless": self.headless,
            "humanize": self.humanize,
            "i_know_what_im_doing": True,  # To turn warnings off with the user configurations
            "allow_webgl": self.allow_webgl,
            "block_webrtc": self.block_webrtc,
            "block_images": self.block_images,  # Careful! it makes some websites doesn't finish loading at all like stackoverflow even in headful
            "os": None if self.os_randomize else get_os_name(),
            # Spread last so user-supplied settings override everything above
            **self.additional_arguments
        }

    def _build_redirect_response(self, request, response, headers, request_headers):
        """Build the `Response` entry describing one hop of a redirect chain.

        :param request: The redirected request object of this hop.
        :param response: The response received for `request`, or a falsy value when none is available.
        :param headers: Already-resolved response headers for this hop.
        :param request_headers: Already-resolved request headers for this hop.
        :return: A `Response` with an empty body (redirect bodies are unavailable).
        """
        if response:
            status = response.status
            reason = response.status_text or StatusText.get(response.status)
            encoding = response.headers.get('content-type', '') or 'utf-8'
        else:
            # No response object for this hop: fall back to a generic redirect instead of failing
            status = 301
            reason = StatusText.get(301)
            encoding = 'utf-8'

        return Response(
            url=request.url,
            # using response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
            text='',
            body=b'',
            status=status,
            reason=reason,
            encoding=encoding,
            cookies={},
            headers=headers,
            request_headers=request_headers,
            **self.adaptor_arguments
        )

    def _build_page_response(self, page_url, page_content, final_response, cookies, headers, request_headers, history):
        """Build the final `Response` returned by `fetch`/`async_fetch` from already-resolved values.

        :param page_url: The URL the page ended up on.
        :param page_content: The page HTML as text (may be an empty string).
        :param final_response: The response whose status, status text and content type are reported.
        :param cookies: Mapping of cookie name to cookie value.
        :param headers: Response headers to report.
        :param request_headers: Request headers to report.
        :param history: List of `Response` objects for the redirect chain, oldest first.
        :return: The assembled `Response` object.
        """
        # This will be parsed inside `Response`
        encoding = final_response.headers.get('content-type', '') or 'utf-8'  # default encoding
        # PlayWright API sometimes give empty status text for some reason!
        status_text = final_response.status_text or StatusText.get(final_response.status)

        return Response(
            url=page_url,
            text=page_content,
            body=page_content.encode('utf-8'),
            status=final_response.status,
            reason=status_text,
            encoding=encoding,
            cookies=cookies,
            headers=headers,
            request_headers=request_headers,
            history=history,
            **self.adaptor_arguments
        )

    def _process_response_history(self, first_response):
        """Process response history to build a list of Response objects

        :param first_response: The response returned by the initial navigation; may be falsy.
        :return: List of `Response` objects for the redirect chain, oldest first. Errors are logged and
            end the walk early, so the list may be partial.
        """
        history = []
        if not first_response:
            return history

        current_request = first_response.request.redirected_from

        try:
            while current_request:
                try:
                    current_response = current_request.response()
                    # Newest hop is visited first, so insert at the front to keep chronological order
                    history.insert(0, self._build_redirect_response(
                        current_request,
                        current_response,
                        headers=current_response.all_headers() if current_response else {},
                        request_headers=current_request.all_headers(),
                    ))
                except Exception as e:
                    log.error(f"Error processing redirect: {e}")
                    break

                current_request = current_request.redirected_from
        except Exception as e:
            log.error(f"Error processing response history: {e}")

        return history

    async def _async_process_response_history(self, first_response):
        """Process response history to build a list of Response objects

        Async counterpart of `_process_response_history`.

        :param first_response: The response returned by the initial navigation; may be falsy.
        :return: List of `Response` objects for the redirect chain, oldest first. Errors are logged and
            end the walk early, so the list may be partial.
        """
        history = []
        if not first_response:
            return history

        current_request = first_response.request.redirected_from

        try:
            while current_request:
                try:
                    current_response = await current_request.response()
                    # Newest hop is visited first, so insert at the front to keep chronological order
                    history.insert(0, self._build_redirect_response(
                        current_request,
                        current_response,
                        headers=await current_response.all_headers() if current_response else {},
                        request_headers=await current_request.all_headers(),
                    ))
                except Exception as e:
                    log.error(f"Error processing redirect: {e}")
                    break

                current_request = current_request.redirected_from
        except Exception as e:
            log.error(f"Error processing response history: {e}")

        return history

    def fetch(self, url: str) -> Response:
        """Opens up the browser and do your request based on your chosen options.

        :param url: Target url.
        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
        :raises ValueError: If no response could be captured from the page.
        """
        final_response = None
        referer = generate_convincing_referer(url) if self.google_search else None

        def handle_response(finished_response):
            # Remember the latest main-document navigation response seen on the page
            nonlocal final_response
            if finished_response.request.resource_type == "document" and finished_response.request.is_navigation_request():
                final_response = finished_response

        with Camoufox(**self._get_camoufox_options()) as browser:
            context = browser.new_context()
            page = context.new_page()
            page.set_default_navigation_timeout(self.timeout)
            page.set_default_timeout(self.timeout)
            page.on("response", handle_response)

            if self.disable_resources:
                page.route("**/*", intercept_route)

            if self.extra_headers:
                page.set_extra_http_headers(self.extra_headers)

            first_response = page.goto(url, referer=referer)
            page.wait_for_load_state(state="domcontentloaded")

            if self.network_idle:
                page.wait_for_load_state('networkidle')

            if self.page_action is not None:
                try:
                    action_result = self.page_action(page)
                    # Keep the current page if the callback forgot to return it
                    if action_result is not None:
                        page = action_result
                except Exception as e:
                    log.error(f"Error executing page_action: {e}")

            if self.wait_selector and isinstance(self.wait_selector, str):
                try:
                    waiter = page.locator(self.wait_selector)
                    waiter.first.wait_for(state=self.wait_selector_state)
                    # Wait again after waiting for the selector, helpful with protections like Cloudflare
                    page.wait_for_load_state(state="load")
                    page.wait_for_load_state(state="domcontentloaded")
                    if self.network_idle:
                        page.wait_for_load_state('networkidle')
                except Exception as e:
                    log.error(f"Error waiting for selector {self.wait_selector}: {e}")

            page.wait_for_timeout(self.wait)
            # In case we didn't catch a document type somehow
            final_response = final_response if final_response else first_response
            if not final_response:
                raise ValueError("Failed to get a response from the page")

            # Headers come from the initial navigation; fall back when it produced no response
            header_source = first_response if first_response else final_response

            history = self._process_response_history(first_response)
            try:
                page_content = page.content()
            except Exception as e:
                log.error(f"Error getting page content: {e}")
                page_content = ""

            response = self._build_page_response(
                page_url=page.url,
                page_content=page_content,
                final_response=final_response,
                cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
                headers=header_source.all_headers(),
                request_headers=header_source.request.all_headers(),
                history=history,
            )
            page.close()
            context.close()

        return response

    async def async_fetch(self, url: str) -> Response:
        """Opens up the browser and do your request based on your chosen options.

        Async counterpart of `fetch`; `page_action`, when given, is awaited.

        :param url: Target url.
        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
        :raises ValueError: If no response could be captured from the page.
        """
        final_response = None
        referer = generate_convincing_referer(url) if self.google_search else None

        async def handle_response(finished_response):
            # Remember the latest main-document navigation response seen on the page
            nonlocal final_response
            if finished_response.request.resource_type == "document" and finished_response.request.is_navigation_request():
                final_response = finished_response

        async with AsyncCamoufox(**self._get_camoufox_options()) as browser:
            context = await browser.new_context()
            page = await context.new_page()
            page.set_default_navigation_timeout(self.timeout)
            page.set_default_timeout(self.timeout)
            page.on("response", handle_response)

            if self.disable_resources:
                await page.route("**/*", async_intercept_route)

            if self.extra_headers:
                await page.set_extra_http_headers(self.extra_headers)

            first_response = await page.goto(url, referer=referer)
            await page.wait_for_load_state(state="domcontentloaded")

            if self.network_idle:
                await page.wait_for_load_state('networkidle')

            if self.page_action is not None:
                try:
                    action_result = await self.page_action(page)
                    # Keep the current page if the callback forgot to return it
                    if action_result is not None:
                        page = action_result
                except Exception as e:
                    log.error(f"Error executing async page_action: {e}")

            if self.wait_selector and isinstance(self.wait_selector, str):
                try:
                    waiter = page.locator(self.wait_selector)
                    await waiter.first.wait_for(state=self.wait_selector_state)
                    # Wait again after waiting for the selector, helpful with protections like Cloudflare
                    await page.wait_for_load_state(state="load")
                    await page.wait_for_load_state(state="domcontentloaded")
                    if self.network_idle:
                        await page.wait_for_load_state('networkidle')
                except Exception as e:
                    log.error(f"Error waiting for selector {self.wait_selector}: {e}")

            await page.wait_for_timeout(self.wait)
            # In case we didn't catch a document type somehow
            final_response = final_response if final_response else first_response
            if not final_response:
                raise ValueError("Failed to get a response from the page")

            # Headers come from the initial navigation; fall back when it produced no response
            header_source = first_response if first_response else final_response

            history = await self._async_process_response_history(first_response)
            try:
                page_content = await page.content()
            except Exception as e:
                log.error(f"Error getting page content in async: {e}")
                page_content = ""

            response = self._build_page_response(
                page_url=page.url,
                page_content=page_content,
                final_response=final_response,
                cookies={cookie['name']: cookie['value'] for cookie in await page.context.cookies()},
                headers=await header_source.all_headers(),
                request_headers=await header_source.request.all_headers(),
                history=history,
            )
            await page.close()
            await context.close()

        return response
|