scrapling 0.2.99__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +18 -31
- scrapling/cli.py +818 -20
- scrapling/core/_html_utils.py +348 -0
- scrapling/core/_types.py +34 -17
- scrapling/core/ai.py +611 -0
- scrapling/core/custom_types.py +183 -100
- scrapling/core/mixins.py +27 -19
- scrapling/core/shell.py +647 -0
- scrapling/core/{storage_adaptors.py → storage.py} +41 -33
- scrapling/core/translator.py +20 -26
- scrapling/core/utils.py +49 -54
- scrapling/engines/__init__.py +15 -6
- scrapling/engines/_browsers/__init__.py +2 -0
- scrapling/engines/_browsers/_camoufox.py +759 -0
- scrapling/engines/_browsers/_config_tools.py +130 -0
- scrapling/engines/_browsers/_controllers.py +644 -0
- scrapling/engines/_browsers/_page.py +93 -0
- scrapling/engines/_browsers/_validators.py +170 -0
- scrapling/engines/constants.py +101 -88
- scrapling/engines/static.py +667 -110
- scrapling/engines/toolbelt/__init__.py +20 -6
- scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +2 -1
- scrapling/engines/toolbelt/convertor.py +254 -0
- scrapling/engines/toolbelt/custom.py +158 -175
- scrapling/engines/toolbelt/fingerprints.py +32 -46
- scrapling/engines/toolbelt/navigation.py +68 -39
- scrapling/fetchers.py +239 -333
- scrapling/parser.py +781 -449
- scrapling-0.3.1.dist-info/METADATA +411 -0
- scrapling-0.3.1.dist-info/RECORD +41 -0
- {scrapling-0.2.99.dist-info → scrapling-0.3.1.dist-info}/WHEEL +1 -1
- {scrapling-0.2.99.dist-info → scrapling-0.3.1.dist-info}/top_level.txt +0 -1
- scrapling/defaults.py +0 -25
- scrapling/engines/camo.py +0 -339
- scrapling/engines/pw.py +0 -465
- scrapling/engines/toolbelt/bypasses/pdf_viewer.js +0 -5
- scrapling-0.2.99.dist-info/METADATA +0 -290
- scrapling-0.2.99.dist-info/RECORD +0 -49
- tests/__init__.py +0 -1
- tests/fetchers/__init__.py +0 -1
- tests/fetchers/async/__init__.py +0 -0
- tests/fetchers/async/test_camoufox.py +0 -97
- tests/fetchers/async/test_httpx.py +0 -85
- tests/fetchers/async/test_playwright.py +0 -101
- tests/fetchers/sync/__init__.py +0 -0
- tests/fetchers/sync/test_camoufox.py +0 -70
- tests/fetchers/sync/test_httpx.py +0 -84
- tests/fetchers/sync/test_playwright.py +0 -89
- tests/fetchers/test_utils.py +0 -97
- tests/parser/__init__.py +0 -0
- tests/parser/test_automatch.py +0 -111
- tests/parser/test_general.py +0 -330
- {scrapling-0.2.99.dist-info → scrapling-0.3.1.dist-info}/entry_points.txt +0 -0
- {scrapling-0.2.99.dist-info → scrapling-0.3.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,130 @@
|
|
1
|
+
from functools import lru_cache
|
2
|
+
|
3
|
+
from scrapling.core._types import Tuple
|
4
|
+
from scrapling.engines.constants import (
|
5
|
+
DEFAULT_STEALTH_FLAGS,
|
6
|
+
HARMFUL_DEFAULT_ARGS,
|
7
|
+
DEFAULT_FLAGS,
|
8
|
+
)
|
9
|
+
from scrapling.engines.toolbelt import js_bypass_path, generate_headers
|
10
|
+
|
11
|
+
# Fallback User-Agent string, computed once when this module is imported.
# It is used by the kwargs builders below whenever the caller passes a falsy `useragent`.
__default_useragent__ = generate_headers(browser_mode=True).get("User-Agent")
|
12
|
+
|
13
|
+
|
14
|
+
@lru_cache(1)
def _compiled_stealth_scripts():
    """Read the stealth bypass scripts from disk and return their contents.

    The function takes no arguments and is wrapped in ``lru_cache``, so the files
    are only read on the first call; later calls return the cached tuple.

    :return: A tuple holding the text of every script, in the order listed below.
    """
    # Basic bypasses, nothing fancy as this is still being worked on.
    # But with these bypasses added to the stealth config, it bypasses many online tests like
    # https://bot.sannysoft.com/
    # https://kaliiiiiiiiii.github.io/brotector/
    # https://pixelscan.net/
    # https://iphey.com/
    # https://www.browserscan.net/bot-detection <== this one also checks for the CDP runtime fingerprint
    # https://arh.antoinevastel.com/bots/areyouheadless/
    # https://prescience-data.github.io/execution-monitor.html
    stealth_scripts_paths = tuple(
        js_bypass_path(script)
        for script in (
            # Order is important
            "webdriver_fully.js",
            "window_chrome.js",
            "navigator_plugins.js",
            "notification_permission.js",
            "screen_props.js",
            "playwright_fingerprint.js",
        )
    )
    scripts = []
    for script_path in stealth_scripts_paths:
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's locale encoding (e.g. cp1252 on some Windows setups).
        with open(script_path, "r", encoding="utf-8") as f:
            scripts.append(f.read())
    return tuple(scripts)
|
43
|
+
|
44
|
+
|
45
|
+
@lru_cache(2, typed=True)
def _set_flags(hide_canvas, disable_webgl):  # pragma: no cover
    """Build the browser launch flags used when stealth mode is enabled.

    :param hide_canvas: When truthy, the canvas image-data noise flag is appended.
    :param disable_webgl: When truthy, the three WebGL-disabling flags are appended.
    :return: ``DEFAULT_STEALTH_FLAGS`` followed by whichever optional flags were requested.
    """
    # NOTE(review): assumes DEFAULT_STEALTH_FLAGS is a tuple (it is concatenated with tuples here)
    canvas_noise = ("--fingerprinting-canvas-image-data-noise",) if hide_canvas else ()
    webgl_off = (
        (
            "--disable-webgl",
            "--disable-webgl-image-chromium",
            "--disable-webgl2",
        )
        if disable_webgl
        else ()
    )
    return DEFAULT_STEALTH_FLAGS + canvas_noise + webgl_off
|
59
|
+
|
60
|
+
|
61
|
+
@lru_cache(2, typed=True)
def _launch_kwargs(
    headless,
    proxy,
    locale,
    extra_headers,
    useragent,
    real_chrome,
    stealth,
    hide_canvas,
    disable_webgl,
) -> Tuple:
    """Assemble the keyword arguments used to launch playwright's browser.

    The call is memoized with ``lru_cache``, so every argument has to be hashable.

    :param headless: Stored as-is under ``headless``.
    :param proxy: Proxy settings; an empty tuple is used when falsy.
    :param locale: Stored as-is under ``locale``.
    :param extra_headers: Extra HTTP headers; an empty tuple is used when falsy.
    :param useragent: User-Agent string; ``__default_useragent__`` is used when falsy.
    :param real_chrome: Selects the ``"chrome"`` channel when truthy, ``"chromium"`` otherwise.
    :param stealth: When truthy, the stealth flags and the stealth-only options are added.
    :param hide_canvas: Forwarded to ``_set_flags`` (only consulted in stealth mode).
    :param disable_webgl: Forwarded to ``_set_flags`` (only consulted in stealth mode).
    :return: The options as a tuple of ``(name, value)`` pairs.
    """
    # Stealth mode extends the default command-line flags instead of replacing them
    browser_args = DEFAULT_FLAGS
    if stealth:
        browser_args = DEFAULT_FLAGS + _set_flags(hide_canvas, disable_webgl)

    options = dict(
        locale=locale,
        headless=headless,
        args=browser_args,
        color_scheme="dark",  # Bypasses the 'prefersLightColor' check in creepjs
        proxy=proxy or tuple(),
        device_scale_factor=2,
        ignore_default_args=HARMFUL_DEFAULT_ARGS,
        channel="chrome" if real_chrome else "chromium",
        extra_http_headers=extra_headers or tuple(),
        user_agent=useragent or __default_useragent__,
    )

    if stealth:
        options["chromium_sandbox"] = True
        options["is_mobile"] = False
        options["has_touch"] = False
        # Disabling this was considered to avoid Service Workers' headaches, but it is kept enabled for now
        options["service_workers"] = "allow"
        options["ignore_https_errors"] = True
        options["screen"] = {"width": 1920, "height": 1080}
        options["viewport"] = {"width": 1920, "height": 1080}
        options["permissions"] = ["geolocation", "notifications"]

    return tuple(options.items())
|
103
|
+
|
104
|
+
|
105
|
+
@lru_cache(2, typed=True)
def _context_kwargs(proxy, locale, extra_headers, useragent, stealth) -> Tuple:
    """Assemble the keyword arguments for the browser context.

    The call is memoized with ``lru_cache``, so every argument has to be hashable.

    :param proxy: Proxy settings; an empty tuple is used when falsy.
    :param locale: Stored as-is under ``locale``.
    :param extra_headers: Extra HTTP headers; an empty tuple is used when falsy.
    :param useragent: User-Agent string; ``__default_useragent__`` is used when falsy.
    :param stealth: When truthy, the stealth-only options are appended.
    :return: The options as a tuple of ``(name, value)`` pairs.
    """
    stealth_options = (
        {
            "is_mobile": False,
            "has_touch": False,
            # Disabling this was considered to avoid Service Workers' headaches, but it is kept enabled for now
            "service_workers": "allow",
            "ignore_https_errors": True,
            "screen": {"width": 1920, "height": 1080},
            "viewport": {"width": 1920, "height": 1080},
            "permissions": ["geolocation", "notifications"],
        }
        if stealth
        else {}
    )
    merged = {
        "proxy": proxy or tuple(),
        "locale": locale,
        "color_scheme": "dark",  # Bypasses the 'prefersLightColor' check in creepjs
        "device_scale_factor": 2,
        "extra_http_headers": extra_headers or tuple(),
        "user_agent": useragent or __default_useragent__,
        **stealth_options,
    }
    return tuple(merged.items())
|