scrapling 0.1.2__py3-none-any.whl → 0.2.1__py3-none-any.whl
- scrapling/__init__.py +4 -3
- scrapling/core/__init__.py +0 -0
- scrapling/core/_types.py +25 -0
- scrapling/{custom_types.py → core/custom_types.py} +48 -3
- scrapling/{mixins.py → core/mixins.py} +22 -7
- scrapling/{storage_adaptors.py → core/storage_adaptors.py} +2 -2
- scrapling/{translator.py → core/translator.py} +2 -12
- scrapling/{utils.py → core/utils.py} +14 -61
- scrapling/engines/__init__.py +7 -0
- scrapling/engines/camo.py +128 -0
- scrapling/engines/constants.py +108 -0
- scrapling/engines/pw.py +237 -0
- scrapling/engines/static.py +112 -0
- scrapling/engines/toolbelt/__init__.py +19 -0
- scrapling/engines/toolbelt/custom.py +154 -0
- scrapling/engines/toolbelt/fingerprints.py +81 -0
- scrapling/engines/toolbelt/navigation.py +108 -0
- scrapling/fetchers.py +198 -0
- scrapling/parser.py +223 -70
- scrapling/py.typed +1 -0
- scrapling-0.2.1.dist-info/METADATA +835 -0
- scrapling-0.2.1.dist-info/RECORD +33 -0
- {scrapling-0.1.2.dist-info → scrapling-0.2.1.dist-info}/WHEEL +1 -1
- {scrapling-0.1.2.dist-info → scrapling-0.2.1.dist-info}/top_level.txt +1 -0
- tests/__init__.py +1 -0
- tests/fetchers/__init__.py +1 -0
- tests/fetchers/test_camoufox.py +62 -0
- tests/fetchers/test_httpx.py +67 -0
- tests/fetchers/test_playwright.py +74 -0
- tests/parser/__init__.py +0 -0
- tests/parser/test_automatch.py +56 -0
- tests/parser/test_general.py +286 -0
- scrapling-0.1.2.dist-info/METADATA +0 -477
- scrapling-0.1.2.dist-info/RECORD +0 -12
- {scrapling-0.1.2.dist-info → scrapling-0.2.1.dist-info}/LICENSE +0 -0
scrapling/engines/toolbelt/navigation.py
ADDED
@@ -0,0 +1,108 @@

```python
"""
Functions related to files and URLs
"""

import os
import logging
from urllib.parse import urlparse, urlencode

from scrapling.core.utils import cache
from scrapling.core._types import Union, Dict, Optional
from scrapling.engines.constants import DEFAULT_DISABLED_RESOURCES

from playwright.sync_api import Route


def intercept_route(route: Route) -> Union[Route, None]:
    """A route handler that drops any request whose resource type falls in `DEFAULT_DISABLED_RESOURCES`.

    :param route: Playwright `Route` object of the current page
    :return: Playwright `Route` object
    """
    if route.request.resource_type in DEFAULT_DISABLED_RESOURCES:
        logging.debug(f'Blocking background resource "{route.request.url}" of type "{route.request.resource_type}"')
        return route.abort()
    return route.continue_()
```
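For context, a handler like this gets registered on a Playwright page so every request passes through it before being fetched. A minimal usage sketch (the target URL is illustrative):

```python
from playwright.sync_api import sync_playwright
from scrapling.engines.toolbelt.navigation import intercept_route

with sync_playwright() as p:
    browser = p.chromium.launch()
    page = browser.new_page()
    page.route('**/*', intercept_route)  # route every request through the handler
    page.goto('https://example.com')     # disabled resource types are now dropped
    browser.close()
```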
```python
def construct_proxy_dict(proxy_string: Union[str, Dict[str, str]]) -> Union[Dict, None]:
    """Validate a proxy and return it in the format Playwright accepts.
    Reference: https://playwright.dev/python/docs/network#http-proxy

    :param proxy_string: A string or a dictionary representation of the proxy.
    :return: A dictionary in Playwright's proxy format, or None if no proxy was given.
    """
    if proxy_string:
        if isinstance(proxy_string, str):
            proxy = urlparse(proxy_string)
            try:
                return {
                    'server': f'{proxy.scheme}://{proxy.hostname}:{proxy.port}',
                    'username': proxy.username or '',
                    'password': proxy.password or '',
                }
            except ValueError:
                # urllib raises ValueError when one of the parts above can't be cast
                # to the correct type, e.g. `int` for the port.
                raise TypeError("The proxy argument's string is in an invalid format!")

        elif isinstance(proxy_string, dict):
            valid_keys = ('server', 'username', 'password')
            if all(key in valid_keys for key in proxy_string):
                return proxy_string
            else:
                raise TypeError(f'A proxy dictionary must have only these keys: {valid_keys}')

        else:
            raise TypeError(f'Invalid type of proxy ({type(proxy_string)}), the proxy argument must be a string or a dictionary!')

    # The default value for proxy in Playwright's source is `None`
    return None
```
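For illustration, the expected behavior for a few inputs (the addresses and credentials are made up):

```python
construct_proxy_dict('http://user:pass@127.0.0.1:8080')
# -> {'server': 'http://127.0.0.1:8080', 'username': 'user', 'password': 'pass'}

construct_proxy_dict({'server': 'http://127.0.0.1:8080'})
# -> returned as-is, since every key is one of ('server', 'username', 'password')

construct_proxy_dict({'host': '127.0.0.1'})
# -> raises TypeError: a proxy dictionary may only use the keys above
```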
```python
def construct_cdp_url(cdp_url: str, query_params: Optional[Dict] = None) -> str:
    """Takes a CDP URL, reconstructs it to validate it, then appends the encoded query parameters if any were given.

    :param cdp_url: The target URL.
    :param query_params: A dictionary of the parameters to add.
    :return: The new CDP URL.
    """
    try:
        # Validate the base URL structure
        parsed = urlparse(cdp_url)

        # Check scheme
        if parsed.scheme not in ('ws', 'wss'):
            raise ValueError("CDP URL must use 'ws://' or 'wss://' scheme")

        # Validate hostname and port
        if not parsed.netloc:
            raise ValueError("Invalid hostname for the CDP URL")

        # Ensure path starts with /
        path = parsed.path
        if not path.startswith('/'):
            path = '/' + path

        # Reconstruct the base URL with validated parts
        validated_base = f"{parsed.scheme}://{parsed.netloc}{path}"

        # Add query parameters
        if query_params:
            query_string = urlencode(query_params)
            return f"{validated_base}?{query_string}"

        return validated_base

    except Exception as e:
        raise ValueError(f"Invalid CDP URL: {str(e)}")
```
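For example (host, port, and token values are made up):

```python
construct_cdp_url('ws://localhost:9222/devtools/browser/abc123')
# -> 'ws://localhost:9222/devtools/browser/abc123'

construct_cdp_url('ws://localhost:9222', {'token': 'xyz'})
# -> 'ws://localhost:9222/?token=xyz'  (an empty path is normalized to '/')

construct_cdp_url('http://localhost:9222')
# -> raises ValueError: CDP URL must use 'ws://' or 'wss://' scheme
```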
```python
@cache(None, typed=True)
def js_bypass_path(filename: str) -> str:
    """Takes the base filename of a JS file inside the `bypasses` folder and returns its full path.

    :param filename: The base filename of the JS file.
    :return: The full path of the JS file.
    """
    current_directory = os.path.dirname(__file__)
    return os.path.join(current_directory, 'bypasses', filename)
```
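Usage is a one-liner; the filename below is hypothetical, since this diff doesn't list the contents of the `bypasses` folder:

```python
script_path = js_bypass_path('some_bypass.js')
# -> '<site-packages>/scrapling/engines/toolbelt/bypasses/some_bypass.js'
```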
scrapling/fetchers.py
ADDED
@@ -0,0 +1,198 @@

```python
from scrapling.core._types import Dict, Optional, Union, Callable, List, Literal

from scrapling.engines.toolbelt import Response, BaseFetcher, do_nothing
from scrapling.engines import CamoufoxEngine, PlaywrightEngine, StaticEngine, check_if_engine_usable


class Fetcher(BaseFetcher):
    """A basic `Fetcher` class type that can only do basic GET, POST, PUT, and DELETE HTTP requests based on httpx.

    Any additional keyword arguments passed to the methods below are passed to the respective httpx method directly.
    """
    def get(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
        """Make a basic HTTP GET request with some added flavors.

        :param url: Target url.
        :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
        :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
        :param stealthy_headers: If enabled (default), Fetcher will create and add real browser headers and
            create a referer header as if this request came from Google's search of this URL's domain.
        :param kwargs: Any additional keyword arguments are passed directly to the `httpx.get()` function, so check the httpx documentation for details.
        :return: A `Response` object that is the same as the `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
        """
        response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).get(url, stealthy_headers, **kwargs)
        return response_object

    def post(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
        """Make a basic HTTP POST request with some added flavors.

        :param url: Target url.
        :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
        :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
        :param stealthy_headers: If enabled (default), Fetcher will create and add real browser headers and
            create a referer header as if this request came from Google's search of this URL's domain.
        :param kwargs: Any additional keyword arguments are passed directly to the `httpx.post()` function, so check the httpx documentation for details.
        :return: A `Response` object that is the same as the `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
        """
        response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).post(url, stealthy_headers, **kwargs)
        return response_object

    def put(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
        """Make a basic HTTP PUT request with some added flavors.

        :param url: Target url.
        :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
        :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
        :param stealthy_headers: If enabled (default), Fetcher will create and add real browser headers and
            create a referer header as if this request came from Google's search of this URL's domain.
        :param kwargs: Any additional keyword arguments are passed directly to the `httpx.put()` function, so check the httpx documentation for details.
        :return: A `Response` object that is the same as the `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
        """
        response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).put(url, stealthy_headers, **kwargs)
        return response_object

    def delete(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
        """Make a basic HTTP DELETE request with some added flavors.

        :param url: Target url.
        :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
        :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
        :param stealthy_headers: If enabled (default), Fetcher will create and add real browser headers and
            create a referer header as if this request came from Google's search of this URL's domain.
        :param kwargs: Any additional keyword arguments are passed directly to the `httpx.delete()` function, so check the httpx documentation for details.
        :return: A `Response` object that is the same as the `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
        """
        response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).delete(url, stealthy_headers, **kwargs)
        return response_object
```
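A quick usage sketch; it assumes `BaseFetcher.__init__` accepts the `Adaptor` arguments (e.g. `auto_match`), which is not shown in this diff:

```python
from scrapling.fetchers import Fetcher

fetcher = Fetcher(auto_match=False)        # constructor args feed self.adaptor_arguments
page = fetcher.get('https://quotes.toscrape.com/', timeout=15)
print(page.status)                         # e.g. 200
quotes = page.css('.quote .text::text')    # parse with the usual Adaptor API
```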
```python
class StealthyFetcher(BaseFetcher):
    """A completely stealthy `Fetcher` class type that uses a modified version of Firefox.

    Based on Camoufox, it works like a real browser, passing almost all online bot tests/protections.
    Other added flavors include setting the faked OS fingerprints to match the user's OS, and setting the referer of every request as if it came from Google's search of this URL's domain.
    """
    def fetch(
            self, url: str, headless: Optional[Union[bool, Literal['virtual']]] = True, block_images: Optional[bool] = False, disable_resources: Optional[bool] = False,
            block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = False, network_idle: Optional[bool] = False, addons: Optional[List[str]] = None,
            timeout: Optional[float] = 30000, page_action: Callable = do_nothing, wait_selector: Optional[str] = None, humanize: Optional[Union[bool, float]] = True,
            wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None, proxy: Optional[Union[str, Dict[str, str]]] = None,
            os_randomize: Optional[bool] = None
    ) -> Response:
        """Opens up a browser and does your request based on your chosen options below.

        :param url: Target url.
        :param headless: Run the browser in headless/hidden (default), 'virtual' screen mode, or headful/visible mode.
        :param block_images: Prevent the loading of images through Firefox preferences.
            This can help save your proxy usage, but be careful with this option as it makes some websites never finish loading.
        :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests on some websites.
            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
            This can help save your proxy usage, but be careful with this option as it makes some websites never finish loading.
        :param block_webrtc: Blocks WebRTC entirely.
        :param addons: List of Firefox addons to use. Must be paths to extracted addons.
        :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
        :param allow_webgl: Whether to allow WebGL. To prevent leaks, only use this for special cases.
        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
        :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000.
        :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
        :param proxy: The proxy to be used with requests. It can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
        :return: A `Response` object that is the same as the `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
        """
        engine = CamoufoxEngine(
            proxy=proxy,
            addons=addons,
            timeout=timeout,
            headless=headless,
            humanize=humanize,
            allow_webgl=allow_webgl,
            page_action=page_action,
            network_idle=network_idle,
            block_images=block_images,
            block_webrtc=block_webrtc,
            os_randomize=os_randomize,
            wait_selector=wait_selector,
            google_search=google_search,
            extra_headers=extra_headers,
            disable_resources=disable_resources,
            wait_selector_state=wait_selector_state,
            adaptor_arguments=self.adaptor_arguments,
        )
        return engine.fetch(url)
```
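A usage sketch showing the `page_action` hook; the scroll action and URL are illustrative, and `page` is the Playwright page object the docstring describes:

```python
from scrapling.fetchers import StealthyFetcher

def scroll_down(page):
    page.mouse.wheel(0, 1000)   # scroll to trigger lazy-loaded content
    return page                 # page_action must return the page again

page = StealthyFetcher().fetch(
    'https://example.com',
    headless=True,
    network_idle=True,
    page_action=scroll_down,
    wait_selector='article',
)
```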
```python
class PlayWrightFetcher(BaseFetcher):
    """A `Fetcher` class type that provides many options, all based on Playwright.

    Using this Fetcher class, you can make requests with:
    - Vanilla Playwright without any modifications other than the ones you chose.
    - Stealthy Playwright with the stealth mode I wrote for it. It's still a work in progress, but it bypasses many online tests like bot.sannysoft.com.
      Some of the things stealth mode does include:
        1) Patches the CDP runtime fingerprint.
        2) Mimics some real browser properties by injecting several JS files and using custom options.
        3) Uses custom flags on launch to hide Playwright even more and make it faster.
        4) Generates real browser headers of the same browser type and the user's OS, then appends them to the request.
    - Real browsers, by passing the CDP URL of your browser to be controlled by the Fetcher; most of the options can be enabled on it.
    - NSTBrowser's docker-browserless option, by passing the CDP URL and enabling the `nstbrowser_mode` option.
    > Note that these are the main options with Playwright, but they can be mixed together.
    """
    def fetch(
            self, url: str, headless: Union[bool, str] = True, disable_resources: Optional[bool] = None,
            useragent: Optional[str] = None, network_idle: Optional[bool] = False, timeout: Optional[float] = 30000,
            page_action: Callable = do_nothing, wait_selector: Optional[str] = None, wait_selector_state: Optional[str] = 'attached',
            hide_canvas: bool = True, disable_webgl: bool = False, extra_headers: Optional[Dict[str, str]] = None, google_search: Optional[bool] = True,
            proxy: Optional[Union[str, Dict[str, str]]] = None,
            stealth: bool = False,
            cdp_url: Optional[str] = None,
            nstbrowser_mode: bool = False, nstbrowser_config: Optional[Dict] = None,
    ) -> Response:
        """Opens up a browser and does your request based on your chosen options below.

        :param url: Target url.
        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
        :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests on some websites.
            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
            This can help save your proxy usage, but be careful with this option as it makes some websites never finish loading.
        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate and use a real useragent of the same browser.
        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000.
        :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
        :param stealth: Enables stealth mode; check the documentation to see what stealth mode currently does.
        :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
        :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
        :param proxy: The proxy to be used with requests. It can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
        :param nstbrowser_mode: Enables NSTBrowser mode. It has to be used with the `cdp_url` argument, or it will get completely ignored.
        :param nstbrowser_config: The config you want to send with requests to NSTBrowser. If left empty, Scrapling defaults to an optimized config for NSTBrowser's docker-browserless setup.
        :return: A `Response` object that is the same as the `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
        """
        engine = PlaywrightEngine(
            proxy=proxy,
            timeout=timeout,
            stealth=stealth,
            cdp_url=cdp_url,
            headless=headless,
            useragent=useragent,
            page_action=page_action,
            hide_canvas=hide_canvas,
            network_idle=network_idle,
            google_search=google_search,
            extra_headers=extra_headers,
            wait_selector=wait_selector,
            disable_webgl=disable_webgl,
            nstbrowser_mode=nstbrowser_mode,
            nstbrowser_config=nstbrowser_config,
            disable_resources=disable_resources,
            wait_selector_state=wait_selector_state,
            adaptor_arguments=self.adaptor_arguments,
        )
        return engine.fetch(url)
```
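Two illustrative calls, one launching a local browser with the stealth patches and one attaching to an already-running browser over CDP (the CDP endpoint is made up):

```python
from scrapling.fetchers import PlayWrightFetcher

# Launch a browser locally with stealth mode enabled
page = PlayWrightFetcher().fetch('https://example.com', stealth=True, network_idle=True)

# Or drive an existing browser through its CDP endpoint instead of launching one
page = PlayWrightFetcher().fetch('https://example.com', cdp_url='ws://localhost:9222/devtools/browser/abc123')
```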
```python
class CustomFetcher(BaseFetcher):
    def fetch(self, url: str, browser_engine, **kwargs) -> Response:
        """Fetch the URL with a custom engine class, after validating it with `check_if_engine_usable`."""
        engine = check_if_engine_usable(browser_engine)(adaptor_arguments=self.adaptor_arguments, **kwargs)
        return engine.fetch(url)
```
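For instance, reusing one of the shipped engines through this generic interface (the keyword arguments are whatever the chosen engine accepts):

```python
from scrapling.engines import CamoufoxEngine
from scrapling.fetchers import CustomFetcher

page = CustomFetcher().fetch('https://example.com', browser_engine=CamoufoxEngine, block_images=True)
```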