scrapling 0.2.6__py3-none-any.whl → 0.2.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +1 -1
- scrapling/engines/camo.py +12 -1
- scrapling/engines/constants.py +1 -1
- scrapling/engines/pw.py +21 -3
- scrapling/engines/static.py +20 -8
- scrapling/engines/toolbelt/custom.py +12 -8
- scrapling/fetchers.py +18 -10
- {scrapling-0.2.6.dist-info → scrapling-0.2.7.dist-info}/METADATA +16 -9
- {scrapling-0.2.6.dist-info → scrapling-0.2.7.dist-info}/RECORD +12 -12
- {scrapling-0.2.6.dist-info → scrapling-0.2.7.dist-info}/LICENSE +0 -0
- {scrapling-0.2.6.dist-info → scrapling-0.2.7.dist-info}/WHEEL +0 -0
- {scrapling-0.2.6.dist-info → scrapling-0.2.7.dist-info}/top_level.txt +0 -0
scrapling/__init__.py
CHANGED
@@ -4,7 +4,7 @@ from scrapling.parser import Adaptor, Adaptors
 from scrapling.core.custom_types import TextHandler, AttributesHandler

 __author__ = "Karim Shoair (karim.shoair@pm.me)"
-__version__ = "0.2.6"
+__version__ = "0.2.7"
 __copyright__ = "Copyright (c) 2024 Karim Shoair"


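Only the version string changes here; after upgrading, the bump is visible at runtime:

```python
import scrapling

print(scrapling.__version__)  # -> '0.2.7'
```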
scrapling/engines/camo.py
CHANGED
@@ -12,6 +12,7 @@ from scrapling.engines.toolbelt import (
     generate_convincing_referer,
 )

+from camoufox import DefaultAddons
 from camoufox.sync_api import Camoufox


@@ -21,7 +22,8 @@ class CamoufoxEngine:
                  block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = False, network_idle: Optional[bool] = False, humanize: Optional[Union[bool, float]] = True,
                  timeout: Optional[float] = 30000, page_action: Callable = do_nothing, wait_selector: Optional[str] = None, addons: Optional[List[str]] = None,
                  wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
-                 proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None,
+                 proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True,
+                 adaptor_arguments: Dict = None,
                  ):
         """An engine that utilizes Camoufox library, check the `StealthyFetcher` class for more documentation.

@@ -36,6 +38,7 @@ class CamoufoxEngine:
         :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
         :param allow_webgl: Whether to allow WebGL. To prevent leaks, only use this for special cases.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+        :param disable_ads: Enabled by default, this installs `uBlock Origin` addon on the browser if enabled.
         :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
         :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
         :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
@@ -54,6 +57,7 @@ class CamoufoxEngine:
         self.network_idle = bool(network_idle)
         self.google_search = bool(google_search)
         self.os_randomize = bool(os_randomize)
+        self.disable_ads = bool(disable_ads)
         self.extra_headers = extra_headers or {}
         self.proxy = construct_proxy_dict(proxy)
         self.addons = addons or []
@@ -75,9 +79,11 @@ class CamoufoxEngine:
         :param url: Target url.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
+        addons = [] if self.disable_ads else [DefaultAddons.UBO]
         with Camoufox(
             proxy=self.proxy,
             addons=self.addons,
+            exclude_addons=addons,
             headless=self.headless,
             humanize=self.humanize,
             i_know_what_im_doing=True,  # To turn warnings off with the user configurations
@@ -105,6 +111,11 @@ class CamoufoxEngine:
             if self.wait_selector and type(self.wait_selector) is str:
                 waiter = page.locator(self.wait_selector)
                 waiter.first.wait_for(state=self.wait_selector_state)
+                # Wait again after waiting for the selector, helpful with protections like Cloudflare
+                page.wait_for_load_state(state="load")
+                page.wait_for_load_state(state="domcontentloaded")
+                if self.network_idle:
+                    page.wait_for_load_state('networkidle')

             # This will be parsed inside `Response`
             encoding = res.headers.get('content-type', '') or 'utf-8'  # default encoding
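For orientation, a minimal sketch (not Scrapling's own code) of what the new `disable_ads` flag amounts to with Camoufox's `exclude_addons` option, using the same names the diff above uses; the target URL is only an illustration:

```python
from camoufox import DefaultAddons
from camoufox.sync_api import Camoufox

disable_ads = True  # the new default in 0.2.7

# Camoufox ships uBlock Origin as a default addon, so blocking ads means
# excluding nothing, while allowing ads means excluding the uBO addon:
exclude = [] if disable_ads else [DefaultAddons.UBO]

with Camoufox(exclude_addons=exclude, headless=True, i_know_what_im_doing=True) as browser:
    page = browser.new_page()
    page.goto('https://example.com')
```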
scrapling/engines/constants.py
CHANGED
@@ -44,7 +44,7 @@ DEFAULT_STEALTH_FLAGS = [
    '--disable-default-apps',
    '--disable-print-preview',
    '--disable-dev-shm-usage',
-    '--disable-popup-blocking',
+    # '--disable-popup-blocking',
    '--metrics-recording-only',
    '--disable-crash-reporter',
    '--disable-partial-raster',
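Commenting this flag out keeps Chromium's default popup blocking enabled, presumably to close off the popup-crash bug referenced in the pw.py change below. A hedged sketch of how such a flag list is consumed (the subset of `DEFAULT_STEALTH_FLAGS` shown is illustrative):

```python
from playwright.sync_api import sync_playwright

# A hypothetical subset of DEFAULT_STEALTH_FLAGS after the change —
# note '--disable-popup-blocking' is no longer passed:
flags = ['--disable-dev-shm-usage', '--metrics-recording-only', '--disable-crash-reporter']

with sync_playwright() as p:
    browser = p.chromium.launch(headless=True, args=flags)
    browser.close()
```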
scrapling/engines/pw.py
CHANGED
@@ -26,6 +26,7 @@ class PlaywrightEngine:
                  timeout: Optional[float] = 30000,
                  page_action: Callable = do_nothing,
                  wait_selector: Optional[str] = None,
+                 locale: Optional[str] = 'en-US',
                  wait_selector_state: Optional[str] = 'attached',
                  stealth: Optional[bool] = False,
                  real_chrome: Optional[bool] = False,
@@ -50,6 +51,7 @@ class PlaywrightEngine:
         :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
         :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
         :param wait_selector: Wait for a specific css selector to be in a specific state.
+        :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
         :param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`.
         :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
         :param real_chrome: If you have chrome browser installed on your device, enable this and the Fetcher will launch an instance of your browser and use it.
@@ -64,6 +66,7 @@ class PlaywrightEngine:
         :param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
         """
         self.headless = headless
+        self.locale = check_type_validity(locale, [str], 'en-US', param_name='locale')
         self.disable_resources = disable_resources
         self.network_idle = bool(network_idle)
         self.stealth = bool(stealth)
@@ -87,6 +90,14 @@ class PlaywrightEngine:
         self.nstbrowser_mode = bool(nstbrowser_mode)
         self.nstbrowser_config = nstbrowser_config
         self.adaptor_arguments = adaptor_arguments if adaptor_arguments else {}
+        self.harmful_default_args = [
+            # This will be ignored to avoid detection more and possibly avoid the popup crashing bug abuse: https://issues.chromium.org/issues/340836884
+            '--enable-automation',
+            '--disable-popup-blocking',
+            # '--disable-component-update',
+            # '--disable-default-apps',
+            # '--disable-extensions',
+        ]

     def _cdp_url_logic(self, flags: Optional[List] = None) -> str:
         """Constructs new CDP URL if NSTBrowser is enabled otherwise return CDP URL as it is
@@ -151,15 +162,15 @@ class PlaywrightEngine:
             else:
                 if self.stealth:
                     browser = p.chromium.launch(
-                        headless=self.headless, args=flags, ignore_default_args=
+                        headless=self.headless, args=flags, ignore_default_args=self.harmful_default_args, chromium_sandbox=True, channel='chrome' if self.real_chrome else 'chromium'
                     )
                 else:
-                    browser = p.chromium.launch(headless=self.headless, ignore_default_args=
+                    browser = p.chromium.launch(headless=self.headless, ignore_default_args=self.harmful_default_args, channel='chrome' if self.real_chrome else 'chromium')

             # Creating the context
             if self.stealth:
                 context = browser.new_context(
-                    locale='en-US',
+                    locale=self.locale,
                     is_mobile=False,
                     has_touch=False,
                     proxy=self.proxy,
@@ -176,6 +187,8 @@ class PlaywrightEngine:
                 )
             else:
                 context = browser.new_context(
+                    locale=self.locale,
+                    proxy=self.proxy,
                     color_scheme='dark',
                     user_agent=useragent,
                     device_scale_factor=2,
@@ -221,6 +234,11 @@ class PlaywrightEngine:
             if self.wait_selector and type(self.wait_selector) is str:
                 waiter = page.locator(self.wait_selector)
                 waiter.first.wait_for(state=self.wait_selector_state)
+                # Wait again after waiting for the selector, helpful with protections like Cloudflare
+                page.wait_for_load_state(state="load")
+                page.wait_for_load_state(state="domcontentloaded")
+                if self.network_idle:
+                    page.wait_for_load_state('networkidle')

             # This will be parsed inside `Response`
             encoding = res.headers.get('content-type', '') or 'utf-8'  # default encoding
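A hedged sketch of what the new launch logic boils down to in plain Playwright, with the same flag list and `locale` default as the diff (the target URL is illustrative):

```python
from playwright.sync_api import sync_playwright

# Default Chromium switches that 0.2.7 now asks Playwright to drop, per the
# comment in the diff (automation detection and the popup-crash bug abuse):
harmful_default_args = ['--enable-automation', '--disable-popup-blocking']

with sync_playwright() as p:
    browser = p.chromium.launch(headless=True, ignore_default_args=harmful_default_args)
    context = browser.new_context(locale='en-US')  # the new `locale` option, default 'en-US'
    page = context.new_page()
    page.goto('https://example.com')
    # The new post-selector settling: wait for the load states again, since
    # protections like Cloudflare can swap the page after a selector appears
    page.wait_for_load_state('load')
    page.wait_for_load_state('domcontentloaded')
    browser.close()
```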
scrapling/engines/static.py
CHANGED
@@ -63,54 +63,66 @@ class StaticEngine:
             **self.adaptor_arguments
         )

-    def get(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+    def get(self, url: str, proxy: Optional[str] = None, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
         """Make basic HTTP GET request for you but with some added flavors.

         :param url: Target url.
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
            create a referer header as if this request had came from Google's search of this URL's domain.
+        :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
         headers = self._headers_job(kwargs.pop('headers', {}), url, stealthy_headers)
-        request = httpx.get(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+        with httpx.Client(proxy=proxy) as client:
+            request = client.get(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+
         return self._prepare_response(request)

-    def post(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+    def post(self, url: str, proxy: Optional[str] = None, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
         """Make basic HTTP POST request for you but with some added flavors.

         :param url: Target url.
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
            create a referer header as if this request had came from Google's search of this URL's domain.
+        :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
         headers = self._headers_job(kwargs.pop('headers', {}), url, stealthy_headers)
-        request = httpx.post(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+        with httpx.Client(proxy=proxy) as client:
+            request = client.post(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+
         return self._prepare_response(request)

-    def delete(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+    def delete(self, url: str, proxy: Optional[str] = None, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
         """Make basic HTTP DELETE request for you but with some added flavors.

         :param url: Target url.
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
            create a referer header as if this request had came from Google's search of this URL's domain.
+        :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
         headers = self._headers_job(kwargs.pop('headers', {}), url, stealthy_headers)
-        request = httpx.delete(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+        with httpx.Client(proxy=proxy) as client:
+            request = client.delete(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+
         return self._prepare_response(request)

-    def put(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+    def put(self, url: str, proxy: Optional[str] = None, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
         """Make basic HTTP PUT request for you but with some added flavors.

         :param url: Target url.
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
            create a referer header as if this request had came from Google's search of this URL's domain.
+        :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
         headers = self._headers_job(kwargs.pop('headers', {}), url, stealthy_headers)
-        request = httpx.put(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+        with httpx.Client(proxy=proxy) as client:
+            request = client.put(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+
         return self._prepare_response(request)
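The proxy plumbing here is plain httpx, as the diff itself shows; a standalone equivalent of the new request body, with placeholder proxy credentials:

```python
import httpx

# One proxy string covers both HTTP and HTTPS traffic, in the documented format:
proxy = 'http://username:password@localhost:8030'

with httpx.Client(proxy=proxy) as client:
    response = client.get('https://httpbin.org/get', follow_redirects=True, timeout=10)
print(response.status_code)
```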
scrapling/engines/toolbelt/custom.py
CHANGED
@@ -39,7 +39,7 @@ class ResponseEncoding:

     @classmethod
     @cache(maxsize=None)
-    def get_value(cls, content_type: Optional[str]) -> str:
+    def get_value(cls, content_type: Optional[str], text: Optional[str] = 'test') -> str:
         """Determine the appropriate character encoding from a content-type header.

         The encoding is determined by these rules in order:
@@ -50,26 +50,30 @@ class ResponseEncoding:
         5. Default to UTF-8 if nothing else matches

         :param content_type: Content-Type header value or None
+        :param text: A text to test the encoding on it
         :return: String naming the character encoding
         """
         if not content_type:
             return cls.__DEFAULT_ENCODING

         try:
+            encoding = None
             content_type, params = cls.__parse_content_type(content_type)

             # First check for explicit charset parameter
             if "charset" in params:
                 encoding = params["charset"].strip("'\"")
-                "test".encode(encoding)  # Validate encoding
-                return encoding

             # Apply content-type specific rules
-            if content_type in cls.__ISO_8859_1_CONTENT_TYPES:
-                return "ISO-8859-1"
+            elif content_type in cls.__ISO_8859_1_CONTENT_TYPES:
+                encoding = "ISO-8859-1"
+
+            elif content_type == "application/json":
+                encoding = cls.__DEFAULT_ENCODING

-            if content_type == "application/json":
-                return cls.__DEFAULT_ENCODING
+            if encoding:
+                _ = text.encode(encoding)  # Validate encoding and validate it can encode the given text
+                return encoding

             return cls.__DEFAULT_ENCODING

@@ -87,7 +91,7 @@ class Response(Adaptor):
         self.cookies = cookies
         self.headers = headers
         self.request_headers = request_headers
-        encoding = ResponseEncoding.get_value(encoding)
+        encoding = ResponseEncoding.get_value(encoding, text)
         super().__init__(text=text, body=body, url=automatch_domain or url, encoding=encoding, **adaptor_arguments)
         # For back-ward compatibility
         self.adaptor = self
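The point of threading `text` into `get_value()` is that a declared charset is now validated against the actual page text rather than the literal string `"test"`; a small illustration of the difference:

```python
# The old check only proved the codec exists — "test" is pure ASCII:
"test".encode('ascii')  # always succeeds, even for pages the codec can't represent

# The new check encodes the real text, so a mis-declared charset is caught:
try:
    "café".encode('ascii')  # raises for non-ASCII page content
except UnicodeEncodeError:
    encoding = 'utf-8'      # letting the caller fall back to the UTF-8 default
```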
scrapling/fetchers.py
CHANGED
@@ -9,7 +9,7 @@ class Fetcher(BaseFetcher):

     Any additional keyword arguments passed to the methods below are passed to the respective httpx's method directly.
     """
-    def get(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+    def get(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, proxy: Optional[str] = None, **kwargs: Dict) -> Response:
         """Make basic HTTP GET request for you but with some added flavors.

         :param url: Target url.
@@ -17,13 +17,14 @@ class Fetcher(BaseFetcher):
         :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
            create a referer header as if this request had came from Google's search of this URL's domain.
+        :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-        response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).get(url, stealthy_headers, **kwargs)
+        response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).get(url, proxy, stealthy_headers, **kwargs)
         return response_object

-    def post(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+    def post(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, proxy: Optional[str] = None, **kwargs: Dict) -> Response:
         """Make basic HTTP POST request for you but with some added flavors.

         :param url: Target url.
@@ -31,13 +32,14 @@ class Fetcher(BaseFetcher):
         :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
            create a referer header as if this request came from Google's search of this URL's domain.
+        :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-        response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).post(url, stealthy_headers, **kwargs)
+        response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).post(url, proxy, stealthy_headers, **kwargs)
         return response_object

-    def put(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+    def put(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, proxy: Optional[str] = None, **kwargs: Dict) -> Response:
         """Make basic HTTP PUT request for you but with some added flavors.

         :param url: Target url
@@ -45,14 +47,15 @@ class Fetcher(BaseFetcher):
         :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
            create a referer header as if this request came from Google's search of this URL's domain.
+        :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.

         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-        response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).put(url, stealthy_headers, **kwargs)
+        response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).put(url, proxy, stealthy_headers, **kwargs)
         return response_object

-    def delete(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+    def delete(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, proxy: Optional[str] = None, **kwargs: Dict) -> Response:
         """Make basic HTTP DELETE request for you but with some added flavors.

         :param url: Target url
@@ -60,10 +63,11 @@ class Fetcher(BaseFetcher):
         :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
            create a referer header as if this request came from Google's search of this URL's domain.
+        :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-        response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).delete(url, stealthy_headers, **kwargs)
+        response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).delete(url, proxy, stealthy_headers, **kwargs)
         return response_object


@@ -78,7 +82,7 @@ class StealthyFetcher(BaseFetcher):
               block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = False, network_idle: Optional[bool] = False, addons: Optional[List[str]] = None,
               timeout: Optional[float] = 30000, page_action: Callable = do_nothing, wait_selector: Optional[str] = None, humanize: Optional[Union[bool, float]] = True,
               wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None, proxy: Optional[Union[str, Dict[str, str]]] = None,
-              os_randomize: Optional[bool] = None
+              os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True,
     ) -> Response:
         """
         Opens up a browser and do your request based on your chosen options below.
@@ -92,6 +96,7 @@ class StealthyFetcher(BaseFetcher):
            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
         :param block_webrtc: Blocks WebRTC entirely.
         :param addons: List of Firefox addons to use. Must be paths to extracted addons.
+        :param disable_ads: Enabled by default, this installs `uBlock Origin` addon on the browser if enabled.
         :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
         :param allow_webgl: Whether to allow WebGL. To prevent leaks, only use this for special cases.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
@@ -111,6 +116,7 @@ class StealthyFetcher(BaseFetcher):
             timeout=timeout,
             headless=headless,
             humanize=humanize,
+            disable_ads=disable_ads,
             allow_webgl=allow_webgl,
             page_action=page_action,
             network_idle=network_idle,
@@ -148,7 +154,7 @@ class PlayWrightFetcher(BaseFetcher):
               useragent: Optional[str] = None, network_idle: Optional[bool] = False, timeout: Optional[float] = 30000,
               page_action: Optional[Callable] = do_nothing, wait_selector: Optional[str] = None, wait_selector_state: Optional[str] = 'attached',
               hide_canvas: Optional[bool] = False, disable_webgl: Optional[bool] = False, extra_headers: Optional[Dict[str, str]] = None, google_search: Optional[bool] = True,
-              proxy: Optional[Union[str, Dict[str, str]]] = None,
+              proxy: Optional[Union[str, Dict[str, str]]] = None, locale: Optional[str] = 'en-US',
               stealth: Optional[bool] = False, real_chrome: Optional[bool] = False,
               cdp_url: Optional[str] = None,
               nstbrowser_mode: Optional[bool] = False, nstbrowser_config: Optional[Dict] = None,
@@ -163,6 +169,7 @@ class PlayWrightFetcher(BaseFetcher):
         :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
         :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
+        :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
         :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
         :param wait_selector: Wait for a specific css selector to be in a specific state.
         :param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`.
@@ -180,6 +187,7 @@ class PlayWrightFetcher(BaseFetcher):
         """
         engine = PlaywrightEngine(
             proxy=proxy,
+            locale=locale,
             timeout=timeout,
             stealth=stealth,
             cdp_url=cdp_url,
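Taken together, these are the user-facing additions in 0.2.7; a usage sketch against the new keyword arguments, with placeholder URLs and proxy credentials:

```python
from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher

# New `proxy` keyword on all four Fetcher methods (get/post/put/delete):
page = Fetcher().get('https://httpbin.org/get', proxy='http://username:password@localhost:8030')

# New `disable_ads` flag (uBlock Origin handling) on StealthyFetcher:
page = StealthyFetcher().fetch('https://example.com', disable_ads=True)

# New `locale` option on PlayWrightFetcher:
page = PlayWrightFetcher().fetch('https://example.com', locale='en-US')
```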
{scrapling-0.2.6.dist-info → scrapling-0.2.7.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: scrapling
-Version: 0.2.6
+Version: 0.2.7
 Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
 Home-page: https://github.com/D4Vinci/Scrapling
 Author: Karim Shoair
@@ -90,10 +90,11 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha
 * [Text Extraction Speed Test (5000 nested elements).](#text-extraction-speed-test-5000-nested-elements)
 * [Extraction By Text Speed Test](#extraction-by-text-speed-test)
 * [Installation](#installation)
-* [Fetching Websites
-* [
-* [
-* [
+* [Fetching Websites](#fetching-websites)
+* [Features](#features)
+* [Fetcher class](#fetcher)
+* [StealthyFetcher class](#stealthyfetcher)
+* [PlayWrightFetcher class](#playwrightfetcher)
 * [Advanced Parsing Features](#advanced-parsing-features)
   * [Smart Navigation](#smart-navigation)
   * [Content-based Selection & Finding Similar Elements](#content-based-selection--finding-similar-elements)
@@ -256,7 +257,10 @@ playwright install chromium
 python -m browserforge update
 ```

-## Fetching Websites
+## Fetching Websites
+Fetchers are basically interfaces that do requests or fetch pages for you in a single request fashion then return an `Adaptor` object for you. This feature was introduced because the only option we had before was to fetch the page as you want then pass it manually to the `Adaptor` class to create an `Adaptor` instance and start playing around with the page.
+
+### Features
 You might be a little bit confused by now so let me clear things up. All fetcher-type classes are imported in the same way
 ```python
 from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
@@ -279,9 +283,11 @@ Also, the `Response` object returned from all fetchers is the same as `Adaptor`
 This class is built on top of [httpx](https://www.python-httpx.org/) with additional configuration options, here you can do `GET`, `POST`, `PUT`, and `DELETE` requests.

 For all methods, you have `stealth_headers` which makes `Fetcher` create and use real browser's headers then create a referer header as if this request came from Google's search of this URL's domain. It's enabled by default.
+
+You can route all traffic (HTTP and HTTPS) to a proxy for any of these methods in this format `http://username:password@localhost:8030`
 ```python
 >> page = Fetcher().get('https://httpbin.org/get', stealth_headers=True, follow_redirects=True)
->> page = Fetcher().post('https://httpbin.org/post', data={'key': 'value'})
+>> page = Fetcher().post('https://httpbin.org/post', data={'key': 'value'}, proxy='http://username:password@localhost:8030')
 >> page = Fetcher().put('https://httpbin.org/put', data={'key': 'value'})
 >> page = Fetcher().delete('https://httpbin.org/delete')
 ```
@@ -309,6 +315,7 @@ True
 | addons | List of Firefox addons to use. **Must be paths to extracted addons.** | ✔️ |
 | humanize | Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window. | ✔️ |
 | allow_webgl | Whether to allow WebGL. To prevent leaks, only use this for special cases. | ✔️ |
+| disable_ads | Enabled by default, this installs `uBlock Origin` addon on the browser if enabled. | ✔️ |
 | network_idle | Wait for the page until there are no network connections for at least 500 ms. | ✔️ |
 | timeout | The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000. | ✔️ |
 | wait_selector | Wait for a specific css selector to be in a specific state. | ✔️ |
@@ -363,6 +370,7 @@ Add that to a lot of controlling/hiding options as you will see in the arguments
 | disable_webgl | Disables WebGL and WebGL 2.0 support entirely. | ✔️ |
 | stealth | Enables stealth mode, always check the documentation to see what stealth mode does currently. | ✔️ |
 | real_chrome | If you have chrome browser installed on your device, enable this and the Fetcher will launch an instance of your browser and use it. | ✔️ |
+| locale | Set the locale for the browser if wanted. The default value is `en-US`. | ✔️ |
 | cdp_url | Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP. | ✔️ |
 | nstbrowser_mode | Enables NSTBrowser mode, **it have to be used with `cdp_url` argument or it will get completely ignored.** | ✔️ |
 | nstbrowser_config | The config you want to send with requests to the NSTBrowser. _If left empty, Scrapling defaults to an optimized NSTBrowser's docker browserless config._ | ✔️ |
@@ -814,8 +822,7 @@ Of course, you can find elements by text/regex, find similar elements in a more
 Yes, Scrapling instances are thread-safe. Each Adaptor instance maintains its state.

 ## More Sponsors!
-
-<a href="https://serpapi.com/?utm_source=scrapling"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png" height="500" width="500" alt="SerpApi Banner" ></a>
+<a href="https://serpapi.com/?utm_source=scrapling"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png" height="500" alt="SerpApi Banner" ></a>


 ## Contributing
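The RECORD diff below lists each file as `path,sha256=digest,size`, where the digest is an urlsafe-base64, unpadded SHA-256 per the wheel RECORD format. A sketch for recomputing an entry, e.g. to check `scrapling/__init__.py` against its listed `WjvhJ6xk…,435`:

```python
import base64
import hashlib

def record_entry(path: str) -> str:
    data = open(path, 'rb').read()
    # Wheel RECORD hashes are urlsafe-base64 SHA-256 digests with '=' padding stripped
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b'=').decode()
    return f"{path},sha256={digest},{len(data)}"

print(record_entry('scrapling/__init__.py'))
```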
{scrapling-0.2.6.dist-info → scrapling-0.2.7.dist-info}/RECORD
CHANGED
@@ -1,6 +1,6 @@
-scrapling/__init__.py,sha256=
+scrapling/__init__.py,sha256=WjvhJ6xkiSHp7St2YJYYJIsiKL8WDYuAQ_qIsg03v-0,435
 scrapling/defaults.py,sha256=blYDLiuI5DgDSLRWnUgpp21WtFOsv1BsCRCmPeg8Xc4,287
-scrapling/fetchers.py,sha256=
+scrapling/fetchers.py,sha256=vjAsa-oleb7FfYsxqmEUVZGNxdo7LMVuiLuyjIGySQE,17417
 scrapling/parser.py,sha256=d2n00uF5i7W5lf0afLNRdk17ZFcNyiF9EzXLRQGA0NM,54111
 scrapling/py.typed,sha256=frcCV1k9oG9oKj3dpUqdJg1PxRT2RSN_XKdLCPjaYaY,2
 scrapling/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -11,12 +11,12 @@ scrapling/core/storage_adaptors.py,sha256=Kbak0BOJX5e9I1PbUS_4sUJi2Wxw8Bv5XsaLHA
 scrapling/core/translator.py,sha256=R97lKGq1SDbx8S8Hg_w_5d4ePgukTHj_hRIKFzWiRuc,5229
 scrapling/core/utils.py,sha256=fXdANUgRBbVbOerJ94fRY9vi7n5zsbm8t3G4qQ-F3ak,3792
 scrapling/engines/__init__.py,sha256=zwMqcSdNGh-IX0d4zXazrgAeHrkqIN_v5Ia7RU1g8W0,267
-scrapling/engines/camo.py,sha256=
-scrapling/engines/constants.py,sha256=
-scrapling/engines/pw.py,sha256=
-scrapling/engines/static.py,sha256=
+scrapling/engines/camo.py,sha256=Lw_uZ5SMBy3T6MkCNOMPk1i51Lnpfd0M7HyAUJAzKIg,8284
+scrapling/engines/constants.py,sha256=WTn-X4kFIDWjXTiqOT0tm4XT5pijcdohFyZ0Af2C5Xc,3723
+scrapling/engines/pw.py,sha256=ZRmbFNQWzvxUHVrIUcKefyg6fDpBrN6erdatDpcLBaw,13762
+scrapling/engines/static.py,sha256=ryVCIjTpVLNlCxSf_NYwDSdsoDbafnsGpkCoCROPhlI,8021
 scrapling/engines/toolbelt/__init__.py,sha256=BbxfC0depVOV3i3BnBnyfjHtLcZrDbhz6c5rTRczZUc,383
-scrapling/engines/toolbelt/custom.py,sha256=
+scrapling/engines/toolbelt/custom.py,sha256=KopO0SVWzFoNB8LbFDQhtErm8KCid6nkQcGqRaItC6U,12752
 scrapling/engines/toolbelt/fingerprints.py,sha256=T9HQejHzAnHsD5EIXvrYVC5siiG5q2gOOXVIIANmzMc,2917
 scrapling/engines/toolbelt/navigation.py,sha256=Tde5_6Wv7lOeWXMzs9D6TRaxAbJ3b-zIX6-4HggZbCQ,4017
 scrapling/engines/toolbelt/bypasses/navigator_plugins.js,sha256=tbnnk3nCXB6QEQnOhDlu3n-s7lnUTAkrUsjP6FDQIQg,2104
@@ -35,8 +35,8 @@ tests/fetchers/test_utils.py,sha256=FPPJkBrqgYxdGeWwapH8Vj8zyfYVLiTE1qSLu8eBWik,
 tests/parser/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/parser/test_automatch.py,sha256=BeeYJi3cYCghbiZmi57z4bqcGPaoUA8GAm7MALBBkkk,2486
 tests/parser/test_general.py,sha256=qaiVzpvqESfdXYFat6QrpnMkevPYgCzIcTZK5FwdC0s,11783
-scrapling-0.2.6.dist-info/LICENSE,sha256=
-scrapling-0.2.6.dist-info/METADATA,sha256=
-scrapling-0.2.6.dist-info/WHEEL,sha256=
-scrapling-0.2.6.dist-info/top_level.txt,sha256=
-scrapling-0.2.6.dist-info/RECORD,,
+scrapling-0.2.7.dist-info/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
+scrapling-0.2.7.dist-info/METADATA,sha256=kYARTFqiiLsL_cvnU03pf2I1E5N_NmJk25gbeLzSR4M,66607
+scrapling-0.2.7.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+scrapling-0.2.7.dist-info/top_level.txt,sha256=ub7FkOEXeYmmYTUxd4pCrwXfBfAMIpZ1sCGmXCc14tI,16
+scrapling-0.2.7.dist-info/RECORD,,
{scrapling-0.2.6.dist-info → scrapling-0.2.7.dist-info}/LICENSE
File without changes
{scrapling-0.2.6.dist-info → scrapling-0.2.7.dist-info}/WHEEL
File without changes
{scrapling-0.2.6.dist-info → scrapling-0.2.7.dist-info}/top_level.txt
File without changes