scrapling 0.2.99__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54):
  1. scrapling/__init__.py +18 -31
  2. scrapling/cli.py +818 -20
  3. scrapling/core/_html_utils.py +348 -0
  4. scrapling/core/_types.py +34 -17
  5. scrapling/core/ai.py +611 -0
  6. scrapling/core/custom_types.py +183 -100
  7. scrapling/core/mixins.py +27 -19
  8. scrapling/core/shell.py +647 -0
  9. scrapling/core/{storage_adaptors.py → storage.py} +41 -33
  10. scrapling/core/translator.py +20 -26
  11. scrapling/core/utils.py +49 -54
  12. scrapling/engines/__init__.py +15 -6
  13. scrapling/engines/_browsers/__init__.py +2 -0
  14. scrapling/engines/_browsers/_camoufox.py +759 -0
  15. scrapling/engines/_browsers/_config_tools.py +130 -0
  16. scrapling/engines/_browsers/_controllers.py +644 -0
  17. scrapling/engines/_browsers/_page.py +93 -0
  18. scrapling/engines/_browsers/_validators.py +170 -0
  19. scrapling/engines/constants.py +101 -88
  20. scrapling/engines/static.py +667 -110
  21. scrapling/engines/toolbelt/__init__.py +20 -6
  22. scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +2 -1
  23. scrapling/engines/toolbelt/convertor.py +254 -0
  24. scrapling/engines/toolbelt/custom.py +158 -175
  25. scrapling/engines/toolbelt/fingerprints.py +32 -46
  26. scrapling/engines/toolbelt/navigation.py +68 -39
  27. scrapling/fetchers.py +239 -333
  28. scrapling/parser.py +781 -449
  29. scrapling-0.3.1.dist-info/METADATA +411 -0
  30. scrapling-0.3.1.dist-info/RECORD +41 -0
  31. {scrapling-0.2.99.dist-info → scrapling-0.3.1.dist-info}/WHEEL +1 -1
  32. {scrapling-0.2.99.dist-info → scrapling-0.3.1.dist-info}/top_level.txt +0 -1
  33. scrapling/defaults.py +0 -25
  34. scrapling/engines/camo.py +0 -339
  35. scrapling/engines/pw.py +0 -465
  36. scrapling/engines/toolbelt/bypasses/pdf_viewer.js +0 -5
  37. scrapling-0.2.99.dist-info/METADATA +0 -290
  38. scrapling-0.2.99.dist-info/RECORD +0 -49
  39. tests/__init__.py +0 -1
  40. tests/fetchers/__init__.py +0 -1
  41. tests/fetchers/async/__init__.py +0 -0
  42. tests/fetchers/async/test_camoufox.py +0 -97
  43. tests/fetchers/async/test_httpx.py +0 -85
  44. tests/fetchers/async/test_playwright.py +0 -101
  45. tests/fetchers/sync/__init__.py +0 -0
  46. tests/fetchers/sync/test_camoufox.py +0 -70
  47. tests/fetchers/sync/test_httpx.py +0 -84
  48. tests/fetchers/sync/test_playwright.py +0 -89
  49. tests/fetchers/test_utils.py +0 -97
  50. tests/parser/__init__.py +0 -0
  51. tests/parser/test_automatch.py +0 -111
  52. tests/parser/test_general.py +0 -330
  53. {scrapling-0.2.99.dist-info → scrapling-0.3.1.dist-info}/entry_points.txt +0 -0
  54. {scrapling-0.2.99.dist-info → scrapling-0.3.1.dist-info}/licenses/LICENSE +0 -0
scrapling/engines/pw.py DELETED
@@ -1,465 +0,0 @@
1
- import json
2
-
3
- from scrapling.core._types import (Callable, Dict, Optional,
4
- SelectorWaitStates, Union)
5
- from scrapling.core.utils import log, lru_cache
6
- from scrapling.engines.constants import (DEFAULT_STEALTH_FLAGS,
7
- NSTBROWSER_DEFAULT_QUERY)
8
- from scrapling.engines.toolbelt import (Response, StatusText,
9
- async_intercept_route,
10
- check_type_validity, construct_cdp_url,
11
- construct_proxy_dict,
12
- generate_convincing_referer,
13
- generate_headers, intercept_route,
14
- js_bypass_path)
15
-
16
-
class PlaywrightEngine:
    def __init__(
            self, headless: Union[bool, str] = True,
            disable_resources: bool = False,
            useragent: Optional[str] = None,
            network_idle: bool = False,
            timeout: Optional[float] = 30000,
            wait: Optional[int] = 0,
            page_action: Optional[Callable] = None,  # was `Callable = None`: None default needs Optional
            wait_selector: Optional[str] = None,
            locale: Optional[str] = 'en-US',
            wait_selector_state: SelectorWaitStates = 'attached',
            stealth: bool = False,
            real_chrome: bool = False,
            hide_canvas: bool = False,
            disable_webgl: bool = False,
            cdp_url: Optional[str] = None,
            nstbrowser_mode: bool = False,
            nstbrowser_config: Optional[Dict] = None,
            google_search: bool = True,
            extra_headers: Optional[Dict[str, str]] = None,
            proxy: Optional[Union[str, Dict[str, str]]] = None,
            adaptor_arguments: Optional[Dict] = None  # was `Dict = None`: None default needs Optional
    ):
        """An engine that utilizes PlayWright library, check the `PlayWrightFetcher` class for more documentation.

        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
        :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000.
        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
        :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
        :param wait_selector: Wait for a specific css selector to be in a specific state.
        :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`.
        :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
        :param real_chrome: If you have chrome browser installed on your device, enable this and the Fetcher will launch an instance of your browser and use it.
        :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
        :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
        :param nstbrowser_mode: Enables NSTBrowser mode, it has to be used with the `cdp_url` argument or it will get completely ignored.
        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
        :param nstbrowser_config: The config you want to send with requests to the NSTBrowser. If left empty, Scrapling defaults to an optimized NSTBrowser's docker browserless config.
        :param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
        """
        self.headless = headless
        self.locale = check_type_validity(locale, [str], 'en-US', param_name='locale')
        self.disable_resources = disable_resources
        self.network_idle = bool(network_idle)
        self.stealth = bool(stealth)
        self.hide_canvas = bool(hide_canvas)
        self.disable_webgl = bool(disable_webgl)
        self.real_chrome = bool(real_chrome)
        self.google_search = bool(google_search)
        self.extra_headers = extra_headers or {}
        self.proxy = construct_proxy_dict(proxy)
        self.cdp_url = cdp_url
        self.useragent = useragent
        self.timeout = check_type_validity(timeout, [int, float], 30000)
        self.wait = check_type_validity(wait, [int, float], 0)
        # A non-None, non-callable `page_action` is ignored (with an error log) rather than raising.
        if callable(page_action):
            self.page_action = page_action
        else:
            self.page_action = None
            if page_action is not None:
                log.error('[Ignored] Argument "page_action" must be callable')

        self.wait_selector = wait_selector
        self.wait_selector_state = wait_selector_state
        self.nstbrowser_mode = bool(nstbrowser_mode)
        self.nstbrowser_config = nstbrowser_config
        self.adaptor_arguments = adaptor_arguments if adaptor_arguments else {}
        self.harmful_default_args = [
            # These flags are ignored to avoid detection, and to avoid the popup crashing bug abuse:
            # https://issues.chromium.org/issues/340836884
            '--enable-automation',
            '--disable-popup-blocking',
            # '--disable-component-update',
            # '--disable-default-apps',
            # '--disable-extensions',
        ]
105
- def _cdp_url_logic(self) -> str:
106
- """Constructs new CDP URL if NSTBrowser is enabled otherwise return CDP URL as it is
107
- :return: CDP URL
108
- """
109
- cdp_url = self.cdp_url
110
- if self.nstbrowser_mode:
111
- if self.nstbrowser_config and isinstance(self.nstbrowser_config, dict):
112
- config = self.nstbrowser_config
113
- else:
114
- query = NSTBROWSER_DEFAULT_QUERY.copy()
115
- if self.stealth:
116
- flags = self.__set_flags()
117
- query.update({
118
- "args": dict(zip(flags, [''] * len(flags))), # browser args should be a dictionary
119
- })
120
-
121
- config = {
122
- 'config': json.dumps(query),
123
- # 'token': ''
124
- }
125
- cdp_url = construct_cdp_url(cdp_url, config)
126
- else:
127
- # To validate it
128
- cdp_url = construct_cdp_url(cdp_url)
129
-
130
- return cdp_url
131
-
132
- @lru_cache(32, typed=True)
133
- def __set_flags(self):
134
- """Returns the flags that will be used while launching the browser if stealth mode is enabled"""
135
- flags = DEFAULT_STEALTH_FLAGS
136
- if self.hide_canvas:
137
- flags += ('--fingerprinting-canvas-image-data-noise',)
138
- if self.disable_webgl:
139
- flags += ('--disable-webgl', '--disable-webgl-image-chromium', '--disable-webgl2',)
140
-
141
- return flags
142
-
143
- def __launch_kwargs(self):
144
- """Creates the arguments we will use while launching playwright's browser"""
145
- launch_kwargs = {'headless': self.headless, 'ignore_default_args': self.harmful_default_args, 'channel': 'chrome' if self.real_chrome else 'chromium'}
146
- if self.stealth:
147
- launch_kwargs.update({'args': self.__set_flags(), 'chromium_sandbox': True})
148
-
149
- return launch_kwargs
150
-
151
- def __context_kwargs(self):
152
- """Creates the arguments for the browser context"""
153
- context_kwargs = {
154
- "proxy": self.proxy,
155
- "locale": self.locale,
156
- "color_scheme": 'dark', # Bypasses the 'prefersLightColor' check in creepjs
157
- "device_scale_factor": 2,
158
- "extra_http_headers": self.extra_headers if self.extra_headers else {},
159
- "user_agent": self.useragent if self.useragent else generate_headers(browser_mode=True).get('User-Agent'),
160
- }
161
- if self.stealth:
162
- context_kwargs.update({
163
- 'is_mobile': False,
164
- 'has_touch': False,
165
- # I'm thinking about disabling it to rest from all Service Workers headache but let's keep it as it is for now
166
- 'service_workers': 'allow',
167
- 'ignore_https_errors': True,
168
- 'screen': {'width': 1920, 'height': 1080},
169
- 'viewport': {'width': 1920, 'height': 1080},
170
- 'permissions': ['geolocation', 'notifications']
171
- })
172
-
173
- return context_kwargs
174
-
175
- @lru_cache(1)
176
- def __stealth_scripts(self):
177
- # Basic bypasses nothing fancy as I'm still working on it
178
- # But with adding these bypasses to the above config, it bypasses many online tests like
179
- # https://bot.sannysoft.com/
180
- # https://kaliiiiiiiiii.github.io/brotector/
181
- # https://pixelscan.net/
182
- # https://iphey.com/
183
- # https://www.browserscan.net/bot-detection <== this one also checks for the CDP runtime fingerprint
184
- # https://arh.antoinevastel.com/bots/areyouheadless/
185
- # https://prescience-data.github.io/execution-monitor.html
186
- return tuple(
187
- js_bypass_path(script) for script in (
188
- # Order is important
189
- 'webdriver_fully.js', 'window_chrome.js', 'navigator_plugins.js', 'pdf_viewer.js',
190
- 'notification_permission.js', 'screen_props.js', 'playwright_fingerprint.js'
191
- )
192
- )
193
-
194
- def _process_response_history(self, first_response):
195
- """Process response history to build a list of Response objects"""
196
- history = []
197
- current_request = first_response.request.redirected_from
198
-
199
- try:
200
- while current_request:
201
- try:
202
- current_response = current_request.response()
203
- history.insert(0, Response(
204
- url=current_request.url,
205
- # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
206
- text='',
207
- body=b'',
208
- status=current_response.status if current_response else 301,
209
- reason=(current_response.status_text or StatusText.get(current_response.status)) if current_response else StatusText.get(301),
210
- encoding=current_response.headers.get('content-type', '') or 'utf-8',
211
- cookies={},
212
- headers=current_response.all_headers() if current_response else {},
213
- request_headers=current_request.all_headers(),
214
- **self.adaptor_arguments
215
- ))
216
- except Exception as e:
217
- log.error(f"Error processing redirect: {e}")
218
- break
219
-
220
- current_request = current_request.redirected_from
221
- except Exception as e:
222
- log.error(f"Error processing response history: {e}")
223
-
224
- return history
225
-
226
- async def _async_process_response_history(self, first_response):
227
- """Process response history to build a list of Response objects"""
228
- history = []
229
- current_request = first_response.request.redirected_from
230
-
231
- try:
232
- while current_request:
233
- try:
234
- current_response = await current_request.response()
235
- history.insert(0, Response(
236
- url=current_request.url,
237
- # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
238
- text='',
239
- body=b'',
240
- status=current_response.status if current_response else 301,
241
- reason=(current_response.status_text or StatusText.get(current_response.status)) if current_response else StatusText.get(301),
242
- encoding=current_response.headers.get('content-type', '') or 'utf-8',
243
- cookies={},
244
- headers=await current_response.all_headers() if current_response else {},
245
- request_headers=await current_request.all_headers(),
246
- **self.adaptor_arguments
247
- ))
248
- except Exception as e:
249
- log.error(f"Error processing redirect: {e}")
250
- break
251
-
252
- current_request = current_request.redirected_from
253
- except Exception as e:
254
- log.error(f"Error processing response history: {e}")
255
-
256
- return history
257
-
258
- def fetch(self, url: str) -> Response:
259
- """Opens up the browser and do your request based on your chosen options.
260
-
261
- :param url: Target url.
262
- :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
263
- """
264
- from playwright.sync_api import Response as PlaywrightResponse
265
- if not self.stealth or self.real_chrome:
266
- # Because rebrowser_playwright doesn't play well with real browsers
267
- from playwright.sync_api import sync_playwright
268
- else:
269
- from rebrowser_playwright.sync_api import sync_playwright
270
-
271
- final_response = None
272
- referer = generate_convincing_referer(url) if self.google_search else None
273
-
274
- def handle_response(finished_response: PlaywrightResponse):
275
- nonlocal final_response
276
- if finished_response.request.resource_type == "document" and finished_response.request.is_navigation_request():
277
- final_response = finished_response
278
-
279
- with sync_playwright() as p:
280
- # Creating the browser
281
- if self.cdp_url:
282
- cdp_url = self._cdp_url_logic()
283
- browser = p.chromium.connect_over_cdp(endpoint_url=cdp_url)
284
- else:
285
- browser = p.chromium.launch(**self.__launch_kwargs())
286
-
287
- context = browser.new_context(**self.__context_kwargs())
288
- page = context.new_page()
289
- page.set_default_navigation_timeout(self.timeout)
290
- page.set_default_timeout(self.timeout)
291
- page.on("response", handle_response)
292
-
293
- if self.extra_headers:
294
- page.set_extra_http_headers(self.extra_headers)
295
-
296
- if self.disable_resources:
297
- page.route("**/*", intercept_route)
298
-
299
- if self.stealth:
300
- for script in self.__stealth_scripts():
301
- page.add_init_script(path=script)
302
-
303
- first_response = page.goto(url, referer=referer)
304
- page.wait_for_load_state(state="domcontentloaded")
305
-
306
- if self.network_idle:
307
- page.wait_for_load_state('networkidle')
308
-
309
- if self.page_action is not None:
310
- try:
311
- page = self.page_action(page)
312
- except Exception as e:
313
- log.error(f"Error executing page_action: {e}")
314
-
315
- if self.wait_selector and type(self.wait_selector) is str:
316
- try:
317
- waiter = page.locator(self.wait_selector)
318
- waiter.first.wait_for(state=self.wait_selector_state)
319
- # Wait again after waiting for the selector, helpful with protections like Cloudflare
320
- page.wait_for_load_state(state="load")
321
- page.wait_for_load_state(state="domcontentloaded")
322
- if self.network_idle:
323
- page.wait_for_load_state('networkidle')
324
- except Exception as e:
325
- log.error(f"Error waiting for selector {self.wait_selector}: {e}")
326
-
327
- page.wait_for_timeout(self.wait)
328
- # In case we didn't catch a document type somehow
329
- final_response = final_response if final_response else first_response
330
- if not final_response:
331
- raise ValueError("Failed to get a response from the page")
332
-
333
- # This will be parsed inside `Response`
334
- encoding = final_response.headers.get('content-type', '') or 'utf-8' # default encoding
335
- # PlayWright API sometimes give empty status text for some reason!
336
- status_text = final_response.status_text or StatusText.get(final_response.status)
337
-
338
- history = self._process_response_history(first_response)
339
- try:
340
- page_content = page.content()
341
- except Exception as e:
342
- log.error(f"Error getting page content: {e}")
343
- page_content = ""
344
-
345
- response = Response(
346
- url=page.url,
347
- text=page_content,
348
- body=page_content.encode('utf-8'),
349
- status=final_response.status,
350
- reason=status_text,
351
- encoding=encoding,
352
- cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
353
- headers=first_response.all_headers(),
354
- request_headers=first_response.request.all_headers(),
355
- history=history,
356
- **self.adaptor_arguments
357
- )
358
- page.close()
359
- context.close()
360
- return response
361
-
362
- async def async_fetch(self, url: str) -> Response:
363
- """Async version of `fetch`
364
-
365
- :param url: Target url.
366
- :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
367
- """
368
- from playwright.async_api import Response as PlaywrightResponse
369
- if not self.stealth or self.real_chrome:
370
- # Because rebrowser_playwright doesn't play well with real browsers
371
- from playwright.async_api import async_playwright
372
- else:
373
- from rebrowser_playwright.async_api import async_playwright
374
-
375
- final_response = None
376
- referer = generate_convincing_referer(url) if self.google_search else None
377
-
378
- async def handle_response(finished_response: PlaywrightResponse):
379
- nonlocal final_response
380
- if finished_response.request.resource_type == "document" and finished_response.request.is_navigation_request():
381
- final_response = finished_response
382
-
383
- async with async_playwright() as p:
384
- # Creating the browser
385
- if self.cdp_url:
386
- cdp_url = self._cdp_url_logic()
387
- browser = await p.chromium.connect_over_cdp(endpoint_url=cdp_url)
388
- else:
389
- browser = await p.chromium.launch(**self.__launch_kwargs())
390
-
391
- context = await browser.new_context(**self.__context_kwargs())
392
- page = await context.new_page()
393
- page.set_default_navigation_timeout(self.timeout)
394
- page.set_default_timeout(self.timeout)
395
- page.on("response", handle_response)
396
-
397
- if self.extra_headers:
398
- await page.set_extra_http_headers(self.extra_headers)
399
-
400
- if self.disable_resources:
401
- await page.route("**/*", async_intercept_route)
402
-
403
- if self.stealth:
404
- for script in self.__stealth_scripts():
405
- await page.add_init_script(path=script)
406
-
407
- first_response = await page.goto(url, referer=referer)
408
- await page.wait_for_load_state(state="domcontentloaded")
409
-
410
- if self.network_idle:
411
- await page.wait_for_load_state('networkidle')
412
-
413
- if self.page_action is not None:
414
- try:
415
- page = await self.page_action(page)
416
- except Exception as e:
417
- log.error(f"Error executing async page_action: {e}")
418
-
419
- if self.wait_selector and type(self.wait_selector) is str:
420
- try:
421
- waiter = page.locator(self.wait_selector)
422
- await waiter.first.wait_for(state=self.wait_selector_state)
423
- # Wait again after waiting for the selector, helpful with protections like Cloudflare
424
- await page.wait_for_load_state(state="load")
425
- await page.wait_for_load_state(state="domcontentloaded")
426
- if self.network_idle:
427
- await page.wait_for_load_state('networkidle')
428
- except Exception as e:
429
- log.error(f"Error waiting for selector {self.wait_selector}: {e}")
430
-
431
- await page.wait_for_timeout(self.wait)
432
- # In case we didn't catch a document type somehow
433
- final_response = final_response if final_response else first_response
434
- if not final_response:
435
- raise ValueError("Failed to get a response from the page")
436
-
437
- # This will be parsed inside `Response`
438
- encoding = final_response.headers.get('content-type', '') or 'utf-8' # default encoding
439
- # PlayWright API sometimes give empty status text for some reason!
440
- status_text = final_response.status_text or StatusText.get(final_response.status)
441
-
442
- history = await self._async_process_response_history(first_response)
443
- try:
444
- page_content = await page.content()
445
- except Exception as e:
446
- log.error(f"Error getting page content in async: {e}")
447
- page_content = ""
448
-
449
- response = Response(
450
- url=page.url,
451
- text=page_content,
452
- body=page_content.encode('utf-8'),
453
- status=final_response.status,
454
- reason=status_text,
455
- encoding=encoding,
456
- cookies={cookie['name']: cookie['value'] for cookie in await page.context.cookies()},
457
- headers=await first_response.all_headers(),
458
- request_headers=await first_response.request.all_headers(),
459
- history=history,
460
- **self.adaptor_arguments
461
- )
462
- await page.close()
463
- await context.close()
464
-
465
- return response
@@ -1,5 +0,0 @@
1
- // PDF viewer enabled
2
- // Bypasses `pdfIsDisabled` test in creepsjs's 'Like Headless' sections
3
- Object.defineProperty(navigator, 'pdfViewerEnabled', {
4
- get: () => true,
5
- });