scrapling 0.3.4__py3-none-any.whl → 0.3.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +29 -19
- scrapling/cli.py +25 -8
- scrapling/core/_types.py +0 -2
- scrapling/core/ai.py +22 -14
- scrapling/core/custom_types.py +2 -2
- scrapling/core/shell.py +6 -5
- scrapling/core/storage.py +2 -1
- scrapling/core/utils/__init__.py +0 -1
- scrapling/engines/_browsers/__init__.py +0 -2
- scrapling/engines/_browsers/_base.py +11 -36
- scrapling/engines/_browsers/_camoufox.py +75 -60
- scrapling/engines/_browsers/_controllers.py +43 -52
- scrapling/engines/_browsers/_page.py +1 -42
- scrapling/engines/_browsers/_validators.py +130 -65
- scrapling/engines/constants.py +0 -15
- scrapling/engines/static.py +417 -16
- scrapling/engines/toolbelt/navigation.py +1 -1
- scrapling/fetchers/__init__.py +36 -0
- scrapling/fetchers/chrome.py +205 -0
- scrapling/fetchers/firefox.py +216 -0
- scrapling/fetchers/requests.py +28 -0
- scrapling/parser.py +7 -7
- {scrapling-0.3.4.dist-info → scrapling-0.3.6.dist-info}/METADATA +25 -23
- scrapling-0.3.6.dist-info/RECORD +47 -0
- scrapling/fetchers.py +0 -444
- scrapling-0.3.4.dist-info/RECORD +0 -44
- {scrapling-0.3.4.dist-info → scrapling-0.3.6.dist-info}/WHEEL +0 -0
- {scrapling-0.3.4.dist-info → scrapling-0.3.6.dist-info}/entry_points.txt +0 -0
- {scrapling-0.3.4.dist-info → scrapling-0.3.6.dist-info}/licenses/LICENSE +0 -0
- {scrapling-0.3.4.dist-info → scrapling-0.3.6.dist-info}/top_level.txt +0 -0
scrapling/fetchers/chrome.py
ADDED
@@ -0,0 +1,205 @@
+from scrapling.core._types import (
+    Callable,
+    Dict,
+    List,
+    Optional,
+    SelectorWaitStates,
+    Iterable,
+)
+from scrapling.engines.toolbelt.custom import BaseFetcher, Response
+from scrapling.engines._browsers._controllers import DynamicSession, AsyncDynamicSession
+
+
+class DynamicFetcher(BaseFetcher):
+    """A `Fetcher` class type that provide many options, all of them are based on PlayWright.
+
+    Using this Fetcher class, you can do requests with:
+        - Vanilla Playwright without any modifications other than the ones you chose.
+        - Stealthy Playwright with the stealth mode I wrote for it. It's still a work in progress, but it bypasses many online tests like bot.sannysoft.com
+            Some of the things stealth mode does include:
+                1) Patches the CDP runtime fingerprint.
+                2) Mimics some of the real browsers' properties by injecting several JS files and using custom options.
+                3) Using custom flags on launch to hide Playwright even more and make it faster.
+                4) Generates real browser's headers of the same type and same user OS, then append it to the request.
+        - Real browsers by passing the `real_chrome` argument or the CDP URL of your browser to be controlled by the Fetcher, and most of the options can be enabled on it.
+
+    > Note that these are the main options with PlayWright, but it can be mixed.
+    """
+
+    @classmethod
+    def fetch(
+        cls,
+        url: str,
+        headless: bool = True,
+        google_search: bool = True,
+        hide_canvas: bool = False,
+        disable_webgl: bool = False,
+        real_chrome: bool = False,
+        stealth: bool = False,
+        wait: int | float = 0,
+        page_action: Optional[Callable] = None,
+        proxy: Optional[str | Dict[str, str]] = None,
+        locale: str = "en-US",
+        extra_headers: Optional[Dict[str, str]] = None,
+        useragent: Optional[str] = None,
+        cdp_url: Optional[str] = None,
+        timeout: int | float = 30000,
+        disable_resources: bool = False,
+        wait_selector: Optional[str] = None,
+        init_script: Optional[str] = None,
+        cookies: Optional[Iterable[Dict]] = None,
+        network_idle: bool = False,
+        load_dom: bool = True,
+        wait_selector_state: SelectorWaitStates = "attached",
+        custom_config: Optional[Dict] = None,
+    ) -> Response:
+        """Opens up a browser and do your request based on your chosen options below.
+
+        :param url: Target url.
+        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
+        :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
+            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
+            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
+        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
+        :param cookies: Set cookies for the next request.
+        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
+        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
+        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
+        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
+        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
+        :param init_script: An absolute path to a JavaScript file to be executed on page creation with this request.
+        :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
+        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
+        :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
+        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
+        :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
+        :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
+        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
+        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
+        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
+        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
+        :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
+        :return: A `Response` object.
+        """
+        if not custom_config:
+            custom_config = {}
+        elif not isinstance(custom_config, dict):
+            raise ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
+
+        with DynamicSession(
+            wait=wait,
+            proxy=proxy,
+            locale=locale,
+            timeout=timeout,
+            stealth=stealth,
+            cdp_url=cdp_url,
+            cookies=cookies,
+            headless=headless,
+            load_dom=load_dom,
+            useragent=useragent,
+            real_chrome=real_chrome,
+            page_action=page_action,
+            hide_canvas=hide_canvas,
+            init_script=init_script,
+            network_idle=network_idle,
+            google_search=google_search,
+            extra_headers=extra_headers,
+            wait_selector=wait_selector,
+            disable_webgl=disable_webgl,
+            disable_resources=disable_resources,
+            wait_selector_state=wait_selector_state,
+            selector_config={**cls._generate_parser_arguments(), **custom_config},
+        ) as session:
+            return session.fetch(url)
+
+    @classmethod
+    async def async_fetch(
+        cls,
+        url: str,
+        headless: bool = True,
+        google_search: bool = True,
+        hide_canvas: bool = False,
+        disable_webgl: bool = False,
+        real_chrome: bool = False,
+        stealth: bool = False,
+        wait: int | float = 0,
+        page_action: Optional[Callable] = None,
+        proxy: Optional[str | Dict[str, str]] = None,
+        locale: str = "en-US",
+        extra_headers: Optional[Dict[str, str]] = None,
+        useragent: Optional[str] = None,
+        cdp_url: Optional[str] = None,
+        timeout: int | float = 30000,
+        disable_resources: bool = False,
+        wait_selector: Optional[str] = None,
+        init_script: Optional[str] = None,
+        cookies: Optional[Iterable[Dict]] = None,
+        network_idle: bool = False,
+        load_dom: bool = True,
+        wait_selector_state: SelectorWaitStates = "attached",
+        custom_config: Optional[Dict] = None,
+    ) -> Response:
+        """Opens up a browser and do your request based on your chosen options below.
+
+        :param url: Target url.
+        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
+        :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
+            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
+            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
+        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
+        :param cookies: Set cookies for the next request.
+        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
+        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
+        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
+        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
+        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
+        :param init_script: An absolute path to a JavaScript file to be executed on page creation with this request.
+        :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
+        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
+        :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
+        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
+        :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
+        :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
+        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
+        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
+        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
+        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
+        :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
+        :return: A `Response` object.
+        """
+        if not custom_config:
+            custom_config = {}
+        elif not isinstance(custom_config, dict):
+            raise ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
+
+        async with AsyncDynamicSession(
+            wait=wait,
+            max_pages=1,
+            proxy=proxy,
+            locale=locale,
+            timeout=timeout,
+            stealth=stealth,
+            cdp_url=cdp_url,
+            cookies=cookies,
+            headless=headless,
+            load_dom=load_dom,
+            useragent=useragent,
+            real_chrome=real_chrome,
+            page_action=page_action,
+            hide_canvas=hide_canvas,
+            init_script=init_script,
+            network_idle=network_idle,
+            google_search=google_search,
+            extra_headers=extra_headers,
+            wait_selector=wait_selector,
+            disable_webgl=disable_webgl,
+            disable_resources=disable_resources,
+            wait_selector_state=wait_selector_state,
+            selector_config={**cls._generate_parser_arguments(), **custom_config},
+        ) as session:
+            return await session.fetch(url)
+
+
+PlayWrightFetcher = DynamicFetcher  # For backward-compatibility
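For context, here is a minimal usage sketch of the new `DynamicFetcher` (not part of the diff): the URL is a placeholder, every keyword maps to a `fetch()` parameter defined above, and reading `page.status` assumes the returned `Response` exposes the HTTP status code.

```python
from scrapling.fetchers import DynamicFetcher

# One-off request through a Playwright-driven browser (sketch; URL is a placeholder)
page = DynamicFetcher.fetch(
    "https://example.com",
    headless=True,
    stealth=True,       # custom stealth mode described in the class docstring
    network_idle=True,  # wait until the network goes idle before returning
)
print(page.status)
```

`PlayWrightFetcher` is kept as an alias of `DynamicFetcher` (last line of the file), so existing imports keep working.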
scrapling/fetchers/firefox.py
ADDED
@@ -0,0 +1,216 @@
+from scrapling.core._types import (
+    Callable,
+    Dict,
+    List,
+    Optional,
+    SelectorWaitStates,
+)
+from scrapling.engines.toolbelt.custom import BaseFetcher, Response
+from scrapling.engines._browsers._camoufox import StealthySession, AsyncStealthySession
+
+
+class StealthyFetcher(BaseFetcher):
+    """A `Fetcher` class type that is a completely stealthy fetcher that uses a modified version of Firefox.
+
+    It works as real browsers passing almost all online tests/protections based on Camoufox.
+    Other added flavors include setting the faked OS fingerprints to match the user's OS, and the referer of every request is set as if this request came from Google's search of this URL's domain.
+    """
+
+    @classmethod
+    def fetch(
+        cls,
+        url: str,
+        headless: bool = True,  # noqa: F821
+        block_images: bool = False,
+        disable_resources: bool = False,
+        block_webrtc: bool = False,
+        allow_webgl: bool = True,
+        network_idle: bool = False,
+        load_dom: bool = True,
+        humanize: bool | float = True,
+        solve_cloudflare: bool = False,
+        wait: int | float = 0,
+        timeout: int | float = 30000,
+        page_action: Optional[Callable] = None,
+        wait_selector: Optional[str] = None,
+        init_script: Optional[str] = None,
+        addons: Optional[List[str]] = None,
+        wait_selector_state: SelectorWaitStates = "attached",
+        cookies: Optional[List[Dict]] = None,
+        google_search: bool = True,
+        extra_headers: Optional[Dict[str, str]] = None,
+        proxy: Optional[str | Dict[str, str]] = None,
+        os_randomize: bool = False,
+        disable_ads: bool = False,
+        geoip: bool = False,
+        custom_config: Optional[Dict] = None,
+        additional_args: Optional[Dict] = None,
+    ) -> Response:
+        """
+        Opens up a browser and do your request based on your chosen options below.
+
+        :param url: Target url.
+        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
+        :param block_images: Prevent the loading of images through Firefox preferences.
+            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
+        :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
+            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
+            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
+        :param block_webrtc: Blocks WebRTC entirely.
+        :param cookies: Set cookies for the next request.
+        :param addons: List of Firefox addons to use. Must be paths to extracted addons.
+        :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
+        :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
+        :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
+        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
+        :param disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
+        :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
+        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
+        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
+        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
+        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
+        :param init_script: An absolute path to a JavaScript file to be executed on page creation with this request.
+        :param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, and spoof the WebRTC IP address.
+            It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region.
+        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
+        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
+        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
+        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
+        :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
+        :param additional_args: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
+        :return: A `Response` object.
+        """
+        if not custom_config:
+            custom_config = {}
+        elif not isinstance(custom_config, dict):
+            ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
+
+        with StealthySession(
+            wait=wait,
+            proxy=proxy,
+            geoip=geoip,
+            addons=addons,
+            timeout=timeout,
+            cookies=cookies,
+            headless=headless,
+            humanize=humanize,
+            load_dom=load_dom,
+            disable_ads=disable_ads,
+            allow_webgl=allow_webgl,
+            page_action=page_action,
+            init_script=init_script,
+            network_idle=network_idle,
+            block_images=block_images,
+            block_webrtc=block_webrtc,
+            os_randomize=os_randomize,
+            wait_selector=wait_selector,
+            google_search=google_search,
+            extra_headers=extra_headers,
+            solve_cloudflare=solve_cloudflare,
+            disable_resources=disable_resources,
+            wait_selector_state=wait_selector_state,
+            selector_config={**cls._generate_parser_arguments(), **custom_config},
+            additional_args=additional_args or {},
+        ) as engine:
+            return engine.fetch(url)
+
+    @classmethod
+    async def async_fetch(
+        cls,
+        url: str,
+        headless: bool = True,  # noqa: F821
+        block_images: bool = False,
+        disable_resources: bool = False,
+        block_webrtc: bool = False,
+        allow_webgl: bool = True,
+        network_idle: bool = False,
+        load_dom: bool = True,
+        humanize: bool | float = True,
+        solve_cloudflare: bool = False,
+        wait: int | float = 0,
+        timeout: int | float = 30000,
+        page_action: Optional[Callable] = None,
+        wait_selector: Optional[str] = None,
+        init_script: Optional[str] = None,
+        addons: Optional[List[str]] = None,
+        wait_selector_state: SelectorWaitStates = "attached",
+        cookies: Optional[List[Dict]] = None,
+        google_search: bool = True,
+        extra_headers: Optional[Dict[str, str]] = None,
+        proxy: Optional[str | Dict[str, str]] = None,
+        os_randomize: bool = False,
+        disable_ads: bool = False,
+        geoip: bool = False,
+        custom_config: Optional[Dict] = None,
+        additional_args: Optional[Dict] = None,
+    ) -> Response:
+        """
+        Opens up a browser and do your request based on your chosen options below.
+
+        :param url: Target url.
+        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
+        :param block_images: Prevent the loading of images through Firefox preferences.
+            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
+        :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
+            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
+            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
+        :param block_webrtc: Blocks WebRTC entirely.
+        :param cookies: Set cookies for the next request.
+        :param addons: List of Firefox addons to use. Must be paths to extracted addons.
+        :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
+        :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
+        :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
+        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
+        :param disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
+        :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
+        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
+        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
+        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
+        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
+        :param init_script: An absolute path to a JavaScript file to be executed on page creation with this request.
+        :param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, and spoof the WebRTC IP address.
+            It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region.
+        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
+        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
+        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
+        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
+        :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
+        :param additional_args: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
+        :return: A `Response` object.
+        """
+        if not custom_config:
+            custom_config = {}
+        elif not isinstance(custom_config, dict):
+            ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
+
+        async with AsyncStealthySession(
+            wait=wait,
+            max_pages=1,
+            proxy=proxy,
+            geoip=geoip,
+            addons=addons,
+            timeout=timeout,
+            cookies=cookies,
+            headless=headless,
+            humanize=humanize,
+            load_dom=load_dom,
+            disable_ads=disable_ads,
+            allow_webgl=allow_webgl,
+            page_action=page_action,
+            init_script=init_script,
+            network_idle=network_idle,
+            block_images=block_images,
+            block_webrtc=block_webrtc,
+            os_randomize=os_randomize,
+            wait_selector=wait_selector,
+            google_search=google_search,
+            extra_headers=extra_headers,
+            solve_cloudflare=solve_cloudflare,
+            disable_resources=disable_resources,
+            wait_selector_state=wait_selector_state,
+            selector_config={**cls._generate_parser_arguments(), **custom_config},
+            additional_args=additional_args or {},
+        ) as engine:
+            return await engine.fetch(url)
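Likewise, a hedged usage sketch for `StealthyFetcher` (not from the package itself); the URL is a placeholder and every keyword maps to a parameter documented above.

```python
import asyncio

from scrapling.fetchers import StealthyFetcher

# Synchronous request through the modified Firefox (Camoufox) build
page = StealthyFetcher.fetch(
    "https://example.com",   # placeholder URL
    headless=True,
    solve_cloudflare=True,   # try to clear Turnstile/Interstitial challenges first
    geoip=True,              # match fingerprint locale/timezone to the exit IP
)

# Async counterpart, mirroring async_fetch() above
async def main():
    return await StealthyFetcher.async_fetch("https://example.com", humanize=0.5)

asyncio.run(main())
```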
scrapling/fetchers/requests.py
ADDED
@@ -0,0 +1,28 @@
+from scrapling.engines.static import (
+    FetcherSession,
+    FetcherClient as _FetcherClient,
+    AsyncFetcherClient as _AsyncFetcherClient,
+)
+from scrapling.engines.toolbelt.custom import BaseFetcher
+
+
+__FetcherClientInstance__ = _FetcherClient()
+__AsyncFetcherClientInstance__ = _AsyncFetcherClient()
+
+
+class Fetcher(BaseFetcher):
+    """A basic `Fetcher` class type that can only do basic GET, POST, PUT, and DELETE HTTP requests based on `curl_cffi`."""
+
+    get = __FetcherClientInstance__.get
+    post = __FetcherClientInstance__.post
+    put = __FetcherClientInstance__.put
+    delete = __FetcherClientInstance__.delete
+
+
+class AsyncFetcher(BaseFetcher):
+    """A basic `Fetcher` class type that can only do basic GET, POST, PUT, and DELETE HTTP requests based on `curl_cffi`."""
+
+    get = __AsyncFetcherClientInstance__.get
+    post = __AsyncFetcherClientInstance__.post
+    put = __AsyncFetcherClientInstance__.put
+    delete = __AsyncFetcherClientInstance__.delete
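A short sketch of how the re-exported `Fetcher`/`AsyncFetcher` classes are used (the method names come from the class bodies above; the URL is a placeholder and `page.status` is assumed to exist on `Response`):

```python
import asyncio

from scrapling.fetchers import Fetcher, AsyncFetcher

# curl_cffi-backed HTTP requests without launching a browser
page = Fetcher.get("https://example.com")   # post/put/delete work the same way
print(page.status)

async def main():
    return await AsyncFetcher.get("https://example.com")

asyncio.run(main())
```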
scrapling/parser.py
CHANGED
@@ -1,8 +1,8 @@
-import re
 from pathlib import Path
 from inspect import signature
 from urllib.parse import urljoin
 from difflib import SequenceMatcher
+from re import Pattern as re_Pattern
 
 from lxml.html import HtmlElement, HtmlMixin, HTMLParser
 from cssselect import SelectorError, SelectorSyntaxError, parse as split_selectors
@@ -239,7 +239,7 @@ class Selector(SelectorsGeneration):
         )
 
     def __handle_element(
-        self, element: HtmlElement | _ElementUnicodeResult
+        self, element: Optional[HtmlElement | _ElementUnicodeResult]
     ) -> Optional[Union[TextHandler, "Selector"]]:
         """Used internally in all functions to convert a single element to type (Selector|TextHandler) when possible"""
         if element is None:
@@ -341,11 +341,11 @@ class Selector(SelectorsGeneration):
         """Return the inner HTML code of the element"""
         content = tostring(self._root, encoding=self.encoding, method="html", with_tail=False)
         if isinstance(content, bytes):
-            content = content.decode(
+            content = content.strip().decode(self.encoding)
         return TextHandler(content)
 
     @property
-    def body(self):
+    def body(self) -> str | bytes:
         """Return the raw body of the current `Selector` without any processing. Useful for binary and non-HTML requests."""
         return self._raw_body
 
@@ -359,7 +359,7 @@ class Selector(SelectorsGeneration):
             with_tail=False,
         )
         if isinstance(content, bytes):
-            content = content.decode(
+            content = content.strip().decode(self.encoding)
         return TextHandler(content)
 
     def has_class(self, class_name: str) -> bool:
@@ -751,7 +751,7 @@ class Selector(SelectorsGeneration):
             )
             attributes.update(arg)
 
-        elif isinstance(arg,
+        elif isinstance(arg, re_Pattern):
            patterns.add(arg)
 
        elif callable(arg):
@@ -1259,7 +1259,7 @@ class Selectors(List[Selector]):
        :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
        :param case_sensitive: if disabled, the function will set the regex to ignore the letters case while compiling it
        """
-        results = [n.
+        results = [n.re(regex, replace_entities, clean_match, case_sensitive) for n in self]
        return TextHandlers(flatten(results))
 
    def re_first(
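To illustrate the `Selectors.re()` line reconstructed in the last hunk above: it maps `.re()` over every matched element and flattens the matches into `TextHandlers`. A sketch (the URL, the `.css()` call on the response, and the selector are assumptions, not taken from the diff):

```python
from scrapling.fetchers import Fetcher

page = Fetcher.get("https://example.com")   # placeholder URL
# clean_match / case_sensitive are the keyword names documented in the hunk above
prices = page.css(".price").re(r"\d+\.\d{2}", clean_match=True, case_sensitive=False)
```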
{scrapling-0.3.4.dist-info → scrapling-0.3.6.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: scrapling
-Version: 0.3.4
+Version: 0.3.6
 Summary: Scrapling is an undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy and effortless as it should be!
 Home-page: https://github.com/D4Vinci/Scrapling
 Author: Karim Shoair
@@ -64,20 +64,20 @@ Classifier: Typing :: Typed
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: lxml>=6.0.
+Requires-Dist: lxml>=6.0.2
 Requires-Dist: cssselect>=1.3.0
 Requires-Dist: orjson>=3.11.3
 Requires-Dist: tldextract>=5.3.0
 Provides-Extra: fetchers
-Requires-Dist: click>=8.
+Requires-Dist: click>=8.3.0; extra == "fetchers"
 Requires-Dist: curl_cffi>=0.13.0; extra == "fetchers"
-Requires-Dist: playwright>=1.
-Requires-Dist:
+Requires-Dist: playwright>=1.55.0; extra == "fetchers"
+Requires-Dist: patchright>=1.55.2; extra == "fetchers"
 Requires-Dist: camoufox>=0.4.11; extra == "fetchers"
 Requires-Dist: geoip2>=5.1.0; extra == "fetchers"
 Requires-Dist: msgspec>=0.19.0; extra == "fetchers"
 Provides-Extra: ai
-Requires-Dist: mcp>=1.
+Requires-Dist: mcp>=1.15.0; extra == "ai"
 Requires-Dist: markdownify>=1.2.0; extra == "ai"
 Requires-Dist: scrapling[fetchers]; extra == "ai"
 Provides-Extra: shell
@@ -139,7 +139,7 @@ Dynamic: license-file
 
 Scrapling isn't just another Web Scraping library. It's the first **adaptive** scraping library that learns from website changes and evolves with them. While other libraries break when websites update their structure, Scrapling automatically relocates your elements and keeps your scrapers running.
 
-Built for the modern Web, Scrapling
+Built for the modern Web, Scrapling features its own rapid parsing engine and fetchers to handle all Web Scraping challenges you face or will face. Built by Web Scrapers for Web Scrapers and regular users, there's something for everyone.
 
 ```python
 >> from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher
@@ -157,12 +157,14 @@ Built for the modern Web, Scrapling has its own rapid parsing engine and its fet
 
 <!-- sponsors -->
 
+<a href="https://www.thordata.com/?ls=github&lk=D4Vinci" target="_blank" title="A global network of over 60M+ residential proxies with 99.7% availability, ensuring stable and reliable web data scraping to support AI, BI, and workflows."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/thordata.jpg"></a>
 <a href="https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling" target="_blank" title="Evomi is your Swiss Quality Proxy Provider, starting at $0.49/GB"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/evomi.png"></a>
-<a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
 <a href="https://visit.decodo.com/Dy6W0b" target="_blank" title="Try the Most Efficient Residential Proxies for Free"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png"></a>
+<a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
 <a href="https://www.swiftproxy.net/" target="_blank" title="Unlock Reliable Proxy Services with Swiftproxy!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png"></a>
-<a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
 <a href="https://www.nstproxy.com/?type=flow&utm_source=scrapling" target="_blank" title="One Proxy Service, Infinite Solutions at Unbeatable Prices!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/NSTproxy.png"></a>
+<a href="https://www.rapidproxy.io/?ref=d4v" target="_blank" title="Affordable Access to the Proxy World – bypass CAPTCHAs blocks, and avoid additional costs."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/rapidproxy.jpg"></a>
+<a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
 
 <!-- /sponsors -->
 
@@ -175,7 +177,7 @@ Built for the modern Web, Scrapling has its own rapid parsing engine and its fet
 ### Advanced Websites Fetching with Session Support
 - **HTTP Requests**: Fast and stealthy HTTP requests with the `Fetcher` class. Can impersonate browsers' TLS fingerprint, headers, and use HTTP3.
 - **Dynamic Loading**: Fetch dynamic websites with full browser automation through the `DynamicFetcher` class supporting Playwright's Chromium, real Chrome, and custom stealth mode.
-- **Anti-bot Bypass**: Advanced stealth capabilities with `StealthyFetcher` using a modified version of Firefox and fingerprint spoofing. Can bypass all
+- **Anti-bot Bypass**: Advanced stealth capabilities with `StealthyFetcher` using a modified version of Firefox and fingerprint spoofing. Can bypass all types of Cloudflare's Turnstile and Interstitial with automation easily.
 - **Session Management**: Persistent session support with `FetcherSession`, `StealthySession`, and `DynamicSession` classes for cookie and state management across requests.
 - **Async Support**: Complete async support across all fetchers and dedicated async session classes.
 
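The session classes named in the `Session Management` bullet above are the ones the new fetcher modules wrap. A sketch of direct session use, modeled on how `DynamicFetcher.fetch()` opens `DynamicSession` in `scrapling/fetchers/chrome.py` (the `scrapling.fetchers` import path and the assumption that all constructor arguments are optional are not confirmed by the diff):

```python
from scrapling.fetchers import DynamicSession, AsyncDynamicSession  # import path assumed

# Keep one browser (cookies, state) alive across several requests
with DynamicSession(headless=True, stealth=True) as session:
    first = session.fetch("https://example.com")          # placeholder URLs
    second = session.fetch("https://example.com/page2")

# Async counterpart; async_fetch() above passes max_pages=1 the same way
async def crawl():
    async with AsyncDynamicSession(max_pages=2) as session:
        return await session.fetch("https://example.com")
```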
@@ -199,13 +201,7 @@ Built for the modern Web, Scrapling has its own rapid parsing engine and its fet
 - 📝 **Auto Selector Generation**: Generate robust CSS/XPath selectors for any element.
 - 🔌 **Familiar API**: Similar to Scrapy/BeautifulSoup with the same pseudo-elements used in Scrapy/Parsel.
 - 📘 **Complete Type Coverage**: Full type hints for excellent IDE support and code completion.
-
-### New Session Architecture
-Scrapling 0.3 introduces a completely revamped session system:
-- **Persistent Sessions**: Maintain cookies, headers, and authentication across multiple requests
-- **Automatic Session Management**: Smart session lifecycle handling with proper cleanup
-- **Session Inheritance**: All fetchers support both one-off requests and persistent session usage
-- **Concurrent Session Support**: Run multiple isolated sessions simultaneously
+- 🔋 **Ready Docker image**: With each release, a Docker image containing all browsers is automatically built and pushed.
 
 ## Getting Started
 
@@ -323,11 +319,11 @@ scrapling extract stealthy-fetch 'https://nopecha.com/demo/cloudflare' captchas.
 ```
 
 > [!NOTE]
-> There are many additional features, but we want to keep this page
+> There are many additional features, but we want to keep this page concise, such as the MCP server and the interactive Web Scraping Shell. Check out the full documentation [here](https://scrapling.readthedocs.io/en/latest/)
 
 ## Performance Benchmarks
 
-Scrapling isn't just powerful—it's also blazing fast, and the updates since version 0.3
+Scrapling isn't just powerful—it's also blazing fast, and the updates since version 0.3 have delivered exceptional performance improvements across all operations.
 
 ### Text Extraction Speed Test (5000 nested elements)
 
@@ -390,6 +386,13 @@ Starting with v0.3.2, this installation only includes the parser engine and its
 ```
 Don't forget that you need to install the browser dependencies with `scrapling install` after any of these extras (if you didn't already)
 
+### Docker
+You can also install a Docker image with all extras and browsers with the following command:
+```bash
+docker pull scrapling
+```
+This image is automatically built and pushed to Docker Hub through GitHub actions right here.
+
 ## Contributing
 
 We welcome contributions! Please read our [contributing guidelines](https://github.com/D4Vinci/Scrapling/blob/main/CONTRIBUTING.md) before getting started.
@@ -397,7 +400,7 @@ We welcome contributions! Please read our [contributing guidelines](https://gith
 ## Disclaimer
 
 > [!CAUTION]
-> This library is provided for educational and research purposes only. By using this library, you agree to comply with local and international data scraping and privacy laws. The authors and contributors are not responsible for any misuse of this software. Always respect
+> This library is provided for educational and research purposes only. By using this library, you agree to comply with local and international data scraping and privacy laws. The authors and contributors are not responsible for any misuse of this software. Always respect the terms of service of websites and robots.txt files.
 
 ## License
 
@@ -411,10 +414,9 @@ This project includes code adapted from:
 ## Thanks and References
 
 - [Daijro](https://github.com/daijro)'s brilliant work on [BrowserForge](https://github.com/daijro/browserforge) and [Camoufox](https://github.com/daijro/camoufox)
-- [Vinyzu](https://github.com/Vinyzu)'s work on [Botright](https://github.com/Vinyzu/Botright)
+- [Vinyzu](https://github.com/Vinyzu)'s brilliant work on [Botright](https://github.com/Vinyzu/Botright) and [PatchRight](https://github.com/Kaliiiiiiiiii-Vinyzu/patchright)
 - [brotector](https://github.com/kaliiiiiiiiii/brotector) for browser detection bypass techniques
-- [fakebrowser](https://github.com/kkoooqq/fakebrowser) for fingerprinting research
-- [rebrowser-patches](https://github.com/rebrowser/rebrowser-patches) for stealth improvements
+- [fakebrowser](https://github.com/kkoooqq/fakebrowser) and [BotBrowser](https://github.com/botswin/BotBrowser) for fingerprinting research
 
 ---
 <div align="center"><small>Designed & crafted with ❤️ by Karim Shoair.</small></div><br>