scrapling 0.2.99__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. scrapling/__init__.py +18 -31
  2. scrapling/cli.py +818 -20
  3. scrapling/core/_html_utils.py +348 -0
  4. scrapling/core/_types.py +34 -17
  5. scrapling/core/ai.py +611 -0
  6. scrapling/core/custom_types.py +183 -100
  7. scrapling/core/mixins.py +27 -19
  8. scrapling/core/shell.py +647 -0
  9. scrapling/core/{storage_adaptors.py → storage.py} +41 -33
  10. scrapling/core/translator.py +20 -26
  11. scrapling/core/utils.py +49 -54
  12. scrapling/engines/__init__.py +15 -6
  13. scrapling/engines/_browsers/__init__.py +2 -0
  14. scrapling/engines/_browsers/_camoufox.py +759 -0
  15. scrapling/engines/_browsers/_config_tools.py +130 -0
  16. scrapling/engines/_browsers/_controllers.py +644 -0
  17. scrapling/engines/_browsers/_page.py +93 -0
  18. scrapling/engines/_browsers/_validators.py +170 -0
  19. scrapling/engines/constants.py +101 -88
  20. scrapling/engines/static.py +667 -110
  21. scrapling/engines/toolbelt/__init__.py +20 -6
  22. scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +2 -1
  23. scrapling/engines/toolbelt/convertor.py +254 -0
  24. scrapling/engines/toolbelt/custom.py +158 -175
  25. scrapling/engines/toolbelt/fingerprints.py +32 -46
  26. scrapling/engines/toolbelt/navigation.py +68 -39
  27. scrapling/fetchers.py +239 -333
  28. scrapling/parser.py +781 -449
  29. scrapling-0.3.1.dist-info/METADATA +411 -0
  30. scrapling-0.3.1.dist-info/RECORD +41 -0
  31. {scrapling-0.2.99.dist-info → scrapling-0.3.1.dist-info}/WHEEL +1 -1
  32. {scrapling-0.2.99.dist-info → scrapling-0.3.1.dist-info}/top_level.txt +0 -1
  33. scrapling/defaults.py +0 -25
  34. scrapling/engines/camo.py +0 -339
  35. scrapling/engines/pw.py +0 -465
  36. scrapling/engines/toolbelt/bypasses/pdf_viewer.js +0 -5
  37. scrapling-0.2.99.dist-info/METADATA +0 -290
  38. scrapling-0.2.99.dist-info/RECORD +0 -49
  39. tests/__init__.py +0 -1
  40. tests/fetchers/__init__.py +0 -1
  41. tests/fetchers/async/__init__.py +0 -0
  42. tests/fetchers/async/test_camoufox.py +0 -97
  43. tests/fetchers/async/test_httpx.py +0 -85
  44. tests/fetchers/async/test_playwright.py +0 -101
  45. tests/fetchers/sync/__init__.py +0 -0
  46. tests/fetchers/sync/test_camoufox.py +0 -70
  47. tests/fetchers/sync/test_httpx.py +0 -84
  48. tests/fetchers/sync/test_playwright.py +0 -89
  49. tests/fetchers/test_utils.py +0 -97
  50. tests/parser/__init__.py +0 -0
  51. tests/parser/test_automatch.py +0 -111
  52. tests/parser/test_general.py +0 -330
  53. {scrapling-0.2.99.dist-info → scrapling-0.3.1.dist-info}/entry_points.txt +0 -0
  54. {scrapling-0.2.99.dist-info → scrapling-0.3.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,130 @@
1
+ from functools import lru_cache
2
+
3
+ from scrapling.core._types import Tuple
4
+ from scrapling.engines.constants import (
5
+ DEFAULT_STEALTH_FLAGS,
6
+ HARMFUL_DEFAULT_ARGS,
7
+ DEFAULT_FLAGS,
8
+ )
9
+ from scrapling.engines.toolbelt import js_bypass_path, generate_headers
10
+
11
+ __default_useragent__ = generate_headers(browser_mode=True).get("User-Agent")  # Computed once at import time; used below as the fallback whenever the caller passes a falsy `useragent`
12
+
13
+
14
@lru_cache(1)
def _compiled_stealth_scripts():
    """Read the bundled stealth JavaScript files and return their source code.

    The result is memoized, so the files are read from disk only on the first call.

    :return: A tuple with the text of every stealth script, in the order they are listed below.
    :raises OSError: If one of the script files cannot be opened.
    """
    # Basic bypasses, nothing fancy (still a work in progress).
    # Combined with the launch configuration, they pass many online tests like
    # https://bot.sannysoft.com/
    # https://kaliiiiiiiiii.github.io/brotector/
    # https://pixelscan.net/
    # https://iphey.com/
    # https://www.browserscan.net/bot-detection <== this one also checks for the CDP runtime fingerprint
    # https://arh.antoinevastel.com/bots/areyouheadless/
    # https://prescience-data.github.io/execution-monitor.html
    script_names = (
        # Order is important
        "webdriver_fully.js",
        "window_chrome.js",
        "navigator_plugins.js",
        "notification_permission.js",
        "screen_props.js",
        "playwright_fingerprint.js",
    )
    sources = []
    for name in script_names:
        # Explicit encoding: without it `open` falls back to the platform locale,
        # which is not guaranteed to be UTF-8 (e.g. cp1252 on Windows).
        with open(js_bypass_path(name), "r", encoding="utf-8") as handle:
            sources.append(handle.read())
    return tuple(sources)
43
+
44
+
45
@lru_cache(2, typed=True)
def _set_flags(hide_canvas, disable_webgl):  # pragma: no cover
    """Build the browser flags used when launching in stealth mode.

    Starts from `DEFAULT_STEALTH_FLAGS` (assumed to be a tuple -- TODO confirm in constants)
    and appends the optional canvas/WebGL flags.

    :param hide_canvas: When truthy, append the canvas image-data noise flag.
    :param disable_webgl: When truthy, append the three WebGL-disabling flags.
    :return: The combined flags.
    """
    canvas_flags = ("--fingerprinting-canvas-image-data-noise",) if hide_canvas else ()
    webgl_flags = (
        (
            "--disable-webgl",
            "--disable-webgl-image-chromium",
            "--disable-webgl2",
        )
        if disable_webgl
        else ()
    )
    return DEFAULT_STEALTH_FLAGS + canvas_flags + webgl_flags
59
+
60
+
61
@lru_cache(2, typed=True)
def _launch_kwargs(
    headless,
    proxy,
    locale,
    extra_headers,
    useragent,
    real_chrome,
    stealth,
    hide_canvas,
    disable_webgl,
) -> Tuple:
    """Assemble the keyword arguments used to launch playwright's browser.

    All arguments must be hashable because the result is memoized with `lru_cache`.

    :param headless: Value forwarded as the "headless" option.
    :param proxy: Proxy settings; a falsy value is replaced with an empty tuple.
    :param locale: Value forwarded as the "locale" option.
    :param extra_headers: Extra HTTP headers; a falsy value is replaced with an empty tuple.
    :param useragent: User-Agent string; a falsy value falls back to `__default_useragent__`.
    :param real_chrome: Selects the "chrome" channel when truthy, otherwise "chromium".
    :param stealth: When truthy, the stealth flags and extra stealth options are added.
    :param hide_canvas: Forwarded to `_set_flags` (only consulted in stealth mode).
    :param disable_webgl: Forwarded to `_set_flags` (only consulted in stealth mode).
    :return: The options as a tuple of `(key, value)` pairs.
    """
    # The stealth flags are only computed when stealth mode is requested
    browser_args = DEFAULT_FLAGS + _set_flags(hide_canvas, disable_webgl) if stealth else DEFAULT_FLAGS
    channel = "chromium"
    if real_chrome:
        channel = "chrome"

    options = dict(
        locale=locale,
        headless=headless,
        args=browser_args,
        color_scheme="dark",  # Bypasses the 'prefersLightColor' check in creepjs
        proxy=proxy or tuple(),
        device_scale_factor=2,
        ignore_default_args=HARMFUL_DEFAULT_ARGS,
        channel=channel,
        extra_http_headers=extra_headers or tuple(),
        user_agent=useragent or __default_useragent__,
    )
    if not stealth:
        return tuple(options.items())

    options["chromium_sandbox"] = True
    options["is_mobile"] = False
    options["has_touch"] = False
    # Disabling this would avoid the Service Workers headaches, but it is kept enabled for now
    options["service_workers"] = "allow"
    options["ignore_https_errors"] = True
    options["screen"] = {"width": 1920, "height": 1080}
    options["viewport"] = {"width": 1920, "height": 1080}
    options["permissions"] = ["geolocation", "notifications"]
    return tuple(options.items())
103
+
104
+
105
+ @lru_cache(2, typed=True)
106
+ def _context_kwargs(proxy, locale, extra_headers, useragent, stealth) -> Tuple:
107
+ """Creates the arguments for the browser context"""
108
+ context_kwargs = {
109
+ "proxy": proxy or tuple(),
110
+ "locale": locale,
111
+ "color_scheme": "dark", # Bypasses the 'prefersLightColor' check in creepjs
112
+ "device_scale_factor": 2,
113
+ "extra_http_headers": extra_headers or tuple(),
114
+ "user_agent": useragent or __default_useragent__,
115
+ }
116
+ if stealth:
117
+ context_kwargs.update(
118
+ {
119
+ "is_mobile": False,
120
+ "has_touch": False,
121
+ # I'm thinking about disabling it to rest from all Service Workers' headache, but let's keep it as it is for now
122
+ "service_workers": "allow",
123
+ "ignore_https_errors": True,
124
+ "screen": {"width": 1920, "height": 1080},
125
+ "viewport": {"width": 1920, "height": 1080},
126
+ "permissions": ["geolocation", "notifications"],
127
+ }
128
+ )
129
+
130
+ return tuple(context_kwargs.items())