scrapling 0.2.96__py3-none-any.whl → 0.2.97__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +1 -1
- scrapling/core/custom_types.py +1 -3
- scrapling/core/storage_adaptors.py +3 -3
- scrapling/core/translator.py +1 -1
- scrapling/core/utils.py +1 -1
- scrapling/engines/camo.py +123 -104
- scrapling/engines/pw.py +100 -75
- scrapling/engines/static.py +22 -42
- scrapling/engines/toolbelt/custom.py +2 -2
- scrapling/engines/toolbelt/fingerprints.py +2 -2
- scrapling/engines/toolbelt/navigation.py +1 -1
- scrapling/fetchers.py +24 -24
- scrapling/parser.py +1 -1
- {scrapling-0.2.96.dist-info → scrapling-0.2.97.dist-info}/METADATA +17 -16
- {scrapling-0.2.96.dist-info → scrapling-0.2.97.dist-info}/RECORD +19 -19
- {scrapling-0.2.96.dist-info → scrapling-0.2.97.dist-info}/WHEEL +1 -1
- {scrapling-0.2.96.dist-info → scrapling-0.2.97.dist-info}/LICENSE +0 -0
- {scrapling-0.2.96.dist-info → scrapling-0.2.97.dist-info}/entry_points.txt +0 -0
- {scrapling-0.2.96.dist-info → scrapling-0.2.97.dist-info}/top_level.txt +0 -0
scrapling/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from scrapling.fetchers import (AsyncFetcher, CustomFetcher, Fetcher,
from scrapling.parser import Adaptor, Adaptors

__author__ = "Karim Shoair (karim.shoair@pm.me)"
-__version__ = "0.2.96"
+__version__ = "0.2.97"
__copyright__ = "Copyright (c) 2024 Karim Shoair"

scrapling/core/custom_types.py
CHANGED
@@ -19,9 +19,7 @@ class TextHandler(str):
    __slots__ = ()

    def __new__(cls, string):
-
-        return super().__new__(cls, string)
-        return super().__new__(cls, '')
+        return super().__new__(cls, str(string))

    def __getitem__(self, key: Union[SupportsIndex, slice]) -> "TextHandler":
        lst = super().__getitem__(key)
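Note: the `__new__` change above makes `TextHandler` coerce whatever it is given through `str()` instead of taking the removed fallback path that returned an empty string. A minimal, self-contained sketch of the new behavior (an illustrative stand-in class, not the library's full `TextHandler`):

```python
class CoercingStr(str):
    """Illustrative stand-in: mirrors the 0.2.97 TextHandler.__new__ coercion."""
    __slots__ = ()

    def __new__(cls, string):
        # Any input is funneled through str(), so non-string values no longer
        # need a separate empty-string fallback branch.
        return super().__new__(cls, str(string))


print(CoercingStr("hello"))  # hello
print(CoercingStr(123))      # 123  (coerced instead of falling back to '')
```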
scrapling/core/storage_adaptors.py
CHANGED
@@ -19,7 +19,7 @@ class StorageSystemMixin(ABC):
        """
        self.url = url

-    @lru_cache(
+    @lru_cache(126, typed=True)
    def _get_base_url(self, default_value: str = 'default') -> str:
        if not self.url or type(self.url) is not str:
            return default_value
@@ -51,7 +51,7 @@ class StorageSystemMixin(ABC):
        raise NotImplementedError('Storage system must implement `save` method')

    @staticmethod
-    @lru_cache(
+    @lru_cache(256, typed=True)
    def _get_hash(identifier: str) -> str:
        """If you want to hash identifier in your storage system, use this safer"""
        identifier = identifier.lower().strip()
@@ -63,7 +63,7 @@ class StorageSystemMixin(ABC):
        return f"{hash_value}_{len(identifier)}"  # Length to reduce collision chance


-@lru_cache(
+@lru_cache(10, typed=True)
class SQLiteStorageSystem(StorageSystemMixin):
    """The recommended system to use, it's race condition safe and thread safe.
    Mainly built so the library can run in threaded frameworks like scrapy or threaded tools
scrapling/core/translator.py
CHANGED
@@ -139,6 +139,6 @@ class TranslatorMixin:


class HTMLTranslator(TranslatorMixin, OriginalHTMLTranslator):
-    @lru_cache(maxsize=
+    @lru_cache(maxsize=256)
    def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str:
        return super().css_to_xpath(css, prefix)
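Note: a change repeated throughout this release is giving each `lru_cache` decorator an explicit size (10, 126, 128, or 256, sometimes with `typed=True`) instead of the previous call form, which bounds how many cached entries can accumulate. A short sketch of what a bounded, typed cache does, using a hypothetical function rather than scrapling's own:

```python
from functools import lru_cache


@lru_cache(maxsize=256, typed=True)
def translate(selector: str) -> str:
    # Stand-in for an expensive, repeatable computation such as CSS-to-XPath translation
    return f"descendant-or-self::{selector}"


translate("div.article")
translate("div.article")       # second call is served from the cache
print(translate.cache_info())  # e.g. CacheInfo(hits=1, misses=1, maxsize=256, currsize=1)
```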
scrapling/core/utils.py
CHANGED
scrapling/engines/camo.py
CHANGED
@@ -15,12 +15,12 @@ from scrapling.engines.toolbelt import (Response, StatusText,

class CamoufoxEngine:
    def __init__(
-            self, headless:
-            block_webrtc:
+            self, headless: Union[bool, Literal['virtual']] = True, block_images: bool = False, disable_resources: bool = False,
+            block_webrtc: bool = False, allow_webgl: bool = True, network_idle: bool = False, humanize: Union[bool, float] = True,
            timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, addons: Optional[List[str]] = None,
-            wait_selector_state:
-            proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize:
-            geoip:
+            wait_selector_state: SelectorWaitStates = 'attached', google_search: bool = True, extra_headers: Optional[Dict[str, str]] = None,
+            proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: bool = False, disable_ads: bool = False,
+            geoip: bool = False,
            adaptor_arguments: Dict = None,
    ):
        """An engine that utilizes Camoufox library, check the `StealthyFetcher` class for more documentation.
@@ -64,107 +64,140 @@ class CamoufoxEngine:
        self.addons = addons or []
        self.humanize = humanize
        self.timeout = check_type_validity(timeout, [int, float], 30000)
+
+        # Page action callable validation
+        self.page_action = None
        if page_action is not None:
            if callable(page_action):
                self.page_action = page_action
            else:
-                self.page_action = None
                log.error('[Ignored] Argument "page_action" must be callable')
-        else:
-            self.page_action = None

        self.wait_selector = wait_selector
        self.wait_selector_state = wait_selector_state
        self.adaptor_arguments = adaptor_arguments if adaptor_arguments else {}

+    def _get_camoufox_options(self):
+        """Return consistent browser options dictionary for both sync and async methods"""
+        return {
+            "geoip": self.geoip,
+            "proxy": self.proxy,
+            "enable_cache": True,
+            "addons": self.addons,
+            "exclude_addons": [] if self.disable_ads else [DefaultAddons.UBO],
+            "headless": self.headless,
+            "humanize": self.humanize,
+            "i_know_what_im_doing": True,  # To turn warnings off with the user configurations
+            "allow_webgl": self.allow_webgl,
+            "block_webrtc": self.block_webrtc,
+            "block_images": self.block_images,  # Careful! it makes some websites doesn't finish loading at all like stackoverflow even in headful
+            "os": None if self.os_randomize else get_os_name(),
+        }
+
+    def _process_response_history(self, first_response):
+        """Process response history to build a list of Response objects"""
+        history = []
+        current_request = first_response.request.redirected_from
+
+        try:
+            while current_request:
+                try:
+                    current_response = current_request.response()
+                    history.insert(0, Response(
+                        url=current_request.url,
+                        # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
+                        text='',
+                        body=b'',
+                        status=current_response.status if current_response else 301,
+                        reason=(current_response.status_text or StatusText.get(current_response.status)) if current_response else StatusText.get(301),
+                        encoding=current_response.headers.get('content-type', '') or 'utf-8',
+                        cookies={},
+                        headers=current_response.all_headers() if current_response else {},
+                        request_headers=current_request.all_headers(),
+                        **self.adaptor_arguments
+                    ))
+                except Exception as e:
+                    log.error(f"Error processing redirect: {e}")
+                    break
+
+                current_request = current_request.redirected_from
+        except Exception as e:
+            log.error(f"Error processing response history: {e}")
+
+        return history
+
    def fetch(self, url: str) -> Response:
        """Opens up the browser and do your request based on your chosen options.

        :param url: Target url.
        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
        """
-        addons = [] if self.disable_ads else [DefaultAddons.UBO]
-        # Store the final response
        final_response = None
+        referer = generate_convincing_referer(url) if self.google_search else None

        def handle_response(finished_response):
            nonlocal final_response
            if finished_response.request.resource_type == "document" and finished_response.request.is_navigation_request():
                final_response = finished_response

-        with Camoufox(
-
-
-            enable_cache=True,
-            addons=self.addons,
-            exclude_addons=addons,
-            headless=self.headless,
-            humanize=self.humanize,
-            i_know_what_im_doing=True,  # To turn warnings off with the user configurations
-            allow_webgl=self.allow_webgl,
-            block_webrtc=self.block_webrtc,
-            block_images=self.block_images,  # Careful! it makes some websites doesn't finish loading at all like stackoverflow even in headful
-            os=None if self.os_randomize else get_os_name(),
-        ) as browser:
-            page = browser.new_page()
+        with Camoufox(**self._get_camoufox_options()) as browser:
+            context = browser.new_context()
+            page = context.new_page()
            page.set_default_navigation_timeout(self.timeout)
            page.set_default_timeout(self.timeout)
-            # Listen for all responses
            page.on("response", handle_response)
+
            if self.disable_resources:
                page.route("**/*", intercept_route)

            if self.extra_headers:
                page.set_extra_http_headers(self.extra_headers)

-            first_response = page.goto(url, referer=
+            first_response = page.goto(url, referer=referer)
            page.wait_for_load_state(state="domcontentloaded")
+
            if self.network_idle:
                page.wait_for_load_state('networkidle')

            if self.page_action is not None:
-
+                try:
+                    page = self.page_action(page)
+                except Exception as e:
+                    log.error(f"Error executing page_action: {e}")

            if self.wait_selector and type(self.wait_selector) is str:
-
-
-
-
-
-
-
+                try:
+                    waiter = page.locator(self.wait_selector)
+                    waiter.first.wait_for(state=self.wait_selector_state)
+                    # Wait again after waiting for the selector, helpful with protections like Cloudflare
+                    page.wait_for_load_state(state="load")
+                    page.wait_for_load_state(state="domcontentloaded")
+                    if self.network_idle:
+                        page.wait_for_load_state('networkidle')
+                except Exception as e:
+                    log.error(f"Error waiting for selector {self.wait_selector}: {e}")

            # In case we didn't catch a document type somehow
            final_response = final_response if final_response else first_response
+            if not final_response:
+                raise ValueError("Failed to get a response from the page")
+
            # This will be parsed inside `Response`
            encoding = final_response.headers.get('content-type', '') or 'utf-8'  # default encoding
            # PlayWright API sometimes give empty status text for some reason!
            status_text = final_response.status_text or StatusText.get(final_response.status)

-            history =
-
-
-
-
-
-                        url=current_request.url,
-                        # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
-                        text='',
-                        body=b'',
-                        status=current_response.status if current_response else 301,
-                        reason=(current_response.status_text or StatusText.get(current_response.status)) if current_response else StatusText.get(301),
-                        encoding=current_response.headers.get('content-type', '') or 'utf-8',
-                        cookies={},
-                        headers=current_response.all_headers() if current_response else {},
-                        request_headers=current_request.all_headers(),
-                        **self.adaptor_arguments
-                    ))
-                current_request = current_request.redirected_from
+            history = self._process_response_history(first_response)
+            try:
+                page_content = page.content()
+            except Exception as e:
+                log.error(f"Error getting page content: {e}")
+                page_content = ""

            response = Response(
                url=page.url,
-                text=
-                body=
+                text=page_content,
+                body=page_content.encode('utf-8'),
                status=final_response.status,
                reason=status_text,
                encoding=encoding,
@@ -175,6 +208,7 @@ class CamoufoxEngine:
                **self.adaptor_arguments
            )
            page.close()
+            context.close()

            return response

@@ -184,88 +218,72 @@ class CamoufoxEngine:
        :param url: Target url.
        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
        """
-        addons = [] if self.disable_ads else [DefaultAddons.UBO]
-        # Store the final response
        final_response = None
+        referer = generate_convincing_referer(url) if self.google_search else None

        async def handle_response(finished_response):
            nonlocal final_response
            if finished_response.request.resource_type == "document" and finished_response.request.is_navigation_request():
                final_response = finished_response

-        async with AsyncCamoufox(
-
-
-            enable_cache=True,
-            addons=self.addons,
-            exclude_addons=addons,
-            headless=self.headless,
-            humanize=self.humanize,
-            i_know_what_im_doing=True,  # To turn warnings off with the user configurations
-            allow_webgl=self.allow_webgl,
-            block_webrtc=self.block_webrtc,
-            block_images=self.block_images,  # Careful! it makes some websites doesn't finish loading at all like stackoverflow even in headful
-            os=None if self.os_randomize else get_os_name(),
-        ) as browser:
-            page = await browser.new_page()
+        async with AsyncCamoufox(**self._get_camoufox_options()) as browser:
+            context = await browser.new_context()
+            page = await context.new_page()
            page.set_default_navigation_timeout(self.timeout)
            page.set_default_timeout(self.timeout)
-            # Listen for all responses
            page.on("response", handle_response)
+
            if self.disable_resources:
                await page.route("**/*", async_intercept_route)

            if self.extra_headers:
                await page.set_extra_http_headers(self.extra_headers)

-            first_response = await page.goto(url, referer=
+            first_response = await page.goto(url, referer=referer)
            await page.wait_for_load_state(state="domcontentloaded")
+
            if self.network_idle:
                await page.wait_for_load_state('networkidle')

            if self.page_action is not None:
-
+                try:
+                    page = await self.page_action(page)
+                except Exception as e:
+                    log.error(f"Error executing async page_action: {e}")

            if self.wait_selector and type(self.wait_selector) is str:
-
-
-
-
-
-
-
+                try:
+                    waiter = page.locator(self.wait_selector)
+                    await waiter.first.wait_for(state=self.wait_selector_state)
+                    # Wait again after waiting for the selector, helpful with protections like Cloudflare
+                    await page.wait_for_load_state(state="load")
+                    await page.wait_for_load_state(state="domcontentloaded")
+                    if self.network_idle:
+                        await page.wait_for_load_state('networkidle')
+                except Exception as e:
+                    log.error(f"Error waiting for selector {self.wait_selector}: {e}")

            # In case we didn't catch a document type somehow
            final_response = final_response if final_response else first_response
+            if not final_response:
+                raise ValueError("Failed to get a response from the page")
+
            # This will be parsed inside `Response`
            encoding = final_response.headers.get('content-type', '') or 'utf-8'  # default encoding
            # PlayWright API sometimes give empty status text for some reason!
            status_text = final_response.status_text or StatusText.get(final_response.status)

-            history =
-
-
-
-
-
-                        url=current_request.url,
-                        # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
-                        text='',
-                        body=b'',
-                        status=current_response.status if current_response else 301,
-                        reason=(current_response.status_text or StatusText.get(current_response.status)) if current_response else StatusText.get(301),
-                        encoding=current_response.headers.get('content-type', '') or 'utf-8',
-                        cookies={},
-                        headers=await current_response.all_headers() if current_response else {},
-                        request_headers=await current_request.all_headers(),
-                        **self.adaptor_arguments
-                    ))
-                current_request = current_request.redirected_from
+            history = self._process_response_history(first_response)
+            try:
+                page_content = await page.content()
+            except Exception as e:
+                log.error(f"Error getting page content in async: {e}")
+                page_content = ""

            response = Response(
                url=page.url,
-                text=
-                body=
+                text=page_content,
+                body=page_content.encode('utf-8'),
                status=final_response.status,
                reason=status_text,
                encoding=encoding,
@@ -276,5 +294,6 @@ class CamoufoxEngine:
                **self.adaptor_arguments
            )
            await page.close()
+            await context.close()

            return response
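Note: the new `_process_response_history` helper above (mirrored in `pw.py` below) walks Playwright's `request.redirected_from` chain backwards and builds the redirect history oldest-first. A self-contained sketch of that traversal using stand-in objects instead of the real Playwright request type:

```python
from dataclasses import dataclass
from typing import List, Optional


@dataclass
class FakeRequest:
    """Hypothetical stand-in for a Playwright request that remembers its redirect source."""
    url: str
    redirected_from: Optional["FakeRequest"] = None


def redirect_history(final_request: FakeRequest) -> List[str]:
    history: List[str] = []
    current = final_request.redirected_from
    while current:
        history.insert(0, current.url)  # prepend so the oldest hop ends up first
        current = current.redirected_from
    return history


final = FakeRequest("https://example.com/c",
                    FakeRequest("https://example.com/b",
                                FakeRequest("https://example.com/a")))
print(redirect_history(final))  # ['https://example.com/a', 'https://example.com/b']
```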
scrapling/engines/pw.py
CHANGED
@@ -19,20 +19,20 @@ class PlaywrightEngine:
            self, headless: Union[bool, str] = True,
            disable_resources: bool = False,
            useragent: Optional[str] = None,
-            network_idle:
+            network_idle: bool = False,
            timeout: Optional[float] = 30000,
            page_action: Callable = None,
            wait_selector: Optional[str] = None,
            locale: Optional[str] = 'en-US',
            wait_selector_state: SelectorWaitStates = 'attached',
-            stealth:
-            real_chrome:
-            hide_canvas:
-            disable_webgl:
+            stealth: bool = False,
+            real_chrome: bool = False,
+            hide_canvas: bool = False,
+            disable_webgl: bool = False,
            cdp_url: Optional[str] = None,
-            nstbrowser_mode:
+            nstbrowser_mode: bool = False,
            nstbrowser_config: Optional[Dict] = None,
-            google_search:
+            google_search: bool = True,
            extra_headers: Optional[Dict[str, str]] = None,
            proxy: Optional[Union[str, Dict[str, str]]] = None,
            adaptor_arguments: Dict = None
@@ -126,7 +126,7 @@ class PlaywrightEngine:

        return cdp_url

-    @lru_cache(typed=True)
+    @lru_cache(126, typed=True)
    def __set_flags(self):
        """Returns the flags that will be used while launching the browser if stealth mode is enabled"""
        flags = DEFAULT_STEALTH_FLAGS
@@ -169,7 +169,7 @@ class PlaywrightEngine:

        return context_kwargs

-    @lru_cache()
+    @lru_cache(10)
    def __stealth_scripts(self):
        # Basic bypasses nothing fancy as I'm still working on it
        # But with adding these bypasses to the above config, it bypasses many online tests like
@@ -188,6 +188,38 @@ class PlaywrightEngine:
            )
        )

+    def _process_response_history(self, first_response):
+        """Process response history to build a list of Response objects"""
+        history = []
+        current_request = first_response.request.redirected_from
+
+        try:
+            while current_request:
+                try:
+                    current_response = current_request.response()
+                    history.insert(0, Response(
+                        url=current_request.url,
+                        # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
+                        text='',
+                        body=b'',
+                        status=current_response.status if current_response else 301,
+                        reason=(current_response.status_text or StatusText.get(current_response.status)) if current_response else StatusText.get(301),
+                        encoding=current_response.headers.get('content-type', '') or 'utf-8',
+                        cookies={},
+                        headers=current_response.all_headers() if current_response else {},
+                        request_headers=current_request.all_headers(),
+                        **self.adaptor_arguments
+                    ))
+                except Exception as e:
+                    log.error(f"Error processing redirect: {e}")
+                    break
+
+                current_request = current_request.redirected_from
+        except Exception as e:
+            log.error(f"Error processing response history: {e}")
+
+        return history
+
    def fetch(self, url: str) -> Response:
        """Opens up the browser and do your request based on your chosen options.

@@ -201,8 +233,8 @@ class PlaywrightEngine:
        else:
            from rebrowser_playwright.sync_api import sync_playwright

-        # Store the final response
        final_response = None
+        referer = generate_convincing_referer(url) if self.google_search else None

        def handle_response(finished_response: PlaywrightResponse):
            nonlocal final_response
@@ -218,11 +250,9 @@ class PlaywrightEngine:
            browser = p.chromium.launch(**self.__launch_kwargs())

            context = browser.new_context(**self.__context_kwargs())
-            # Finally we are in business
            page = context.new_page()
            page.set_default_navigation_timeout(self.timeout)
            page.set_default_timeout(self.timeout)
-            # Listen for all responses
            page.on("response", handle_response)

            if self.extra_headers:
@@ -235,54 +265,51 @@ class PlaywrightEngine:
                for script in self.__stealth_scripts():
                    page.add_init_script(path=script)

-            first_response = page.goto(url, referer=
+            first_response = page.goto(url, referer=referer)
            page.wait_for_load_state(state="domcontentloaded")
+
            if self.network_idle:
                page.wait_for_load_state('networkidle')

            if self.page_action is not None:
-
+                try:
+                    page = self.page_action(page)
+                except Exception as e:
+                    log.error(f"Error executing page_action: {e}")

            if self.wait_selector and type(self.wait_selector) is str:
-
-
-
-
-
-
-
+                try:
+                    waiter = page.locator(self.wait_selector)
+                    waiter.first.wait_for(state=self.wait_selector_state)
+                    # Wait again after waiting for the selector, helpful with protections like Cloudflare
+                    page.wait_for_load_state(state="load")
+                    page.wait_for_load_state(state="domcontentloaded")
+                    if self.network_idle:
+                        page.wait_for_load_state('networkidle')
+                except Exception as e:
+                    log.error(f"Error waiting for selector {self.wait_selector}: {e}")

            # In case we didn't catch a document type somehow
            final_response = final_response if final_response else first_response
+            if not final_response:
+                raise ValueError("Failed to get a response from the page")
+
            # This will be parsed inside `Response`
            encoding = final_response.headers.get('content-type', '') or 'utf-8'  # default encoding
            # PlayWright API sometimes give empty status text for some reason!
            status_text = final_response.status_text or StatusText.get(final_response.status)

-            history =
-
-
-
-
-
-                        url=current_request.url,
-                        # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
-                        text='',
-                        body=b'',
-                        status=current_response.status if current_response else 301,
-                        reason=(current_response.status_text or StatusText.get(current_response.status)) if current_response else StatusText.get(301),
-                        encoding=current_response.headers.get('content-type', '') or 'utf-8',
-                        cookies={},
-                        headers=current_response.all_headers() if current_response else {},
-                        request_headers=current_request.all_headers(),
-                        **self.adaptor_arguments
-                    ))
-                current_request = current_request.redirected_from
+            history = self._process_response_history(first_response)
+            try:
+                page_content = page.content()
+            except Exception as e:
+                log.error(f"Error getting page content: {e}")
+                page_content = ""

            response = Response(
                url=page.url,
-                text=
-                body=
+                text=page_content,
+                body=page_content.encode('utf-8'),
                status=final_response.status,
                reason=status_text,
                encoding=encoding,
@@ -293,6 +320,7 @@ class PlaywrightEngine:
                **self.adaptor_arguments
            )
            page.close()
+            context.close()
            return response

    async def async_fetch(self, url: str) -> Response:
@@ -308,8 +336,8 @@ class PlaywrightEngine:
        else:
            from rebrowser_playwright.async_api import async_playwright

-        # Store the final response
        final_response = None
+        referer = generate_convincing_referer(url) if self.google_search else None

        async def handle_response(finished_response: PlaywrightResponse):
            nonlocal final_response
@@ -325,11 +353,9 @@ class PlaywrightEngine:
            browser = await p.chromium.launch(**self.__launch_kwargs())

            context = await browser.new_context(**self.__context_kwargs())
-            # Finally we are in business
            page = await context.new_page()
            page.set_default_navigation_timeout(self.timeout)
            page.set_default_timeout(self.timeout)
-            # Listen for all responses
            page.on("response", handle_response)

            if self.extra_headers:
@@ -342,54 +368,51 @@ class PlaywrightEngine:
                for script in self.__stealth_scripts():
                    await page.add_init_script(path=script)

-            first_response = await page.goto(url, referer=
+            first_response = await page.goto(url, referer=referer)
            await page.wait_for_load_state(state="domcontentloaded")
+
            if self.network_idle:
                await page.wait_for_load_state('networkidle')

            if self.page_action is not None:
-
+                try:
+                    page = await self.page_action(page)
+                except Exception as e:
+                    log.error(f"Error executing async page_action: {e}")

            if self.wait_selector and type(self.wait_selector) is str:
-
-
-
-
-
-
-
+                try:
+                    waiter = page.locator(self.wait_selector)
+                    await waiter.first.wait_for(state=self.wait_selector_state)
+                    # Wait again after waiting for the selector, helpful with protections like Cloudflare
+                    await page.wait_for_load_state(state="load")
+                    await page.wait_for_load_state(state="domcontentloaded")
+                    if self.network_idle:
+                        await page.wait_for_load_state('networkidle')
+                except Exception as e:
+                    log.error(f"Error waiting for selector {self.wait_selector}: {e}")

            # In case we didn't catch a document type somehow
            final_response = final_response if final_response else first_response
+            if not final_response:
+                raise ValueError("Failed to get a response from the page")
+
            # This will be parsed inside `Response`
            encoding = final_response.headers.get('content-type', '') or 'utf-8'  # default encoding
            # PlayWright API sometimes give empty status text for some reason!
            status_text = final_response.status_text or StatusText.get(final_response.status)

-            history =
-
-
-
-
-
-                        url=current_request.url,
-                        # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
-                        text='',
-                        body=b'',
-                        status=current_response.status if current_response else 301,
-                        reason=(current_response.status_text or StatusText.get(current_response.status)) if current_response else StatusText.get(301),
-                        encoding=current_response.headers.get('content-type', '') or 'utf-8',
-                        cookies={},
-                        headers=await current_response.all_headers() if current_response else {},
-                        request_headers=await current_request.all_headers(),
-                        **self.adaptor_arguments
-                    ))
-                current_request = current_request.redirected_from
+            history = self._process_response_history(first_response)
+            try:
+                page_content = await page.content()
+            except Exception as e:
+                log.error(f"Error getting page content in async: {e}")
+                page_content = ""

            response = Response(
                url=page.url,
-                text=
-                body=
+                text=page_content,
+                body=page_content.encode('utf-8'),
                status=final_response.status,
                reason=status_text,
                encoding=encoding,
@@ -400,4 +423,6 @@ class PlaywrightEngine:
                **self.adaptor_arguments
            )
            await page.close()
+            await context.close()
+
            return response
scrapling/engines/static.py
CHANGED
@@ -7,10 +7,10 @@ from scrapling.core.utils import log, lru_cache
from .toolbelt import Response, generate_convincing_referer, generate_headers


-@lru_cache(typed=True)
+@lru_cache(5, typed=True)  # Singleton easily
class StaticEngine:
    def __init__(
-            self, url: str, proxy: Optional[str] = None, stealthy_headers:
+            self, url: str, proxy: Optional[str] = None, stealthy_headers: bool = True, follow_redirects: bool = True,
            timeout: Optional[Union[int, float]] = None, retries: Optional[int] = 3, adaptor_arguments: Tuple = None
    ):
        """An engine that utilizes httpx library, check the `Fetcher` class for more documentation.
@@ -79,17 +79,25 @@ class StaticEngine:
            **self.adaptor_arguments
        )

+    def _make_request(self, method: str, **kwargs) -> Response:
+        headers = self._headers_job(kwargs.pop('headers', {}))
+        with httpx.Client(proxy=self.proxy, transport=httpx.HTTPTransport(retries=self.retries)) as client:
+            request = getattr(client, method)(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+        return self._prepare_response(request)
+
+    async def _async_make_request(self, method: str, **kwargs) -> Response:
+        headers = self._headers_job(kwargs.pop('headers', {}))
+        async with httpx.AsyncClient(proxy=self.proxy, transport=httpx.AsyncHTTPTransport(retries=self.retries)) as client:
+            request = await getattr(client, method)(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+        return self._prepare_response(request)
+
    def get(self, **kwargs: Dict) -> Response:
        """Make basic HTTP GET request for you but with some added flavors.

        :param kwargs: Any keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
        """
-
-        with httpx.Client(proxy=self.proxy, transport=httpx.HTTPTransport(retries=self.retries)) as client:
-            request = client.get(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
-
-        return self._prepare_response(request)
+        return self._make_request('get', **kwargs)

    async def async_get(self, **kwargs: Dict) -> Response:
        """Make basic async HTTP GET request for you but with some added flavors.
@@ -97,11 +105,7 @@ class StaticEngine:
        :param kwargs: Any keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
        """
-
-        async with httpx.AsyncClient(proxy=self.proxy) as client:
-            request = await client.get(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
-
-        return self._prepare_response(request)
+        return await self._async_make_request('get', **kwargs)

    def post(self, **kwargs: Dict) -> Response:
        """Make basic HTTP POST request for you but with some added flavors.
@@ -109,11 +113,7 @@ class StaticEngine:
        :param kwargs: Any keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
        """
-
-        with httpx.Client(proxy=self.proxy, transport=httpx.HTTPTransport(retries=self.retries)) as client:
-            request = client.post(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
-
-        return self._prepare_response(request)
+        return self._make_request('post', **kwargs)

    async def async_post(self, **kwargs: Dict) -> Response:
        """Make basic async HTTP POST request for you but with some added flavors.
@@ -121,11 +121,7 @@ class StaticEngine:
        :param kwargs: Any keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
        """
-
-        async with httpx.AsyncClient(proxy=self.proxy) as client:
-            request = await client.post(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
-
-        return self._prepare_response(request)
+        return await self._async_make_request('post', **kwargs)

    def delete(self, **kwargs: Dict) -> Response:
        """Make basic HTTP DELETE request for you but with some added flavors.
@@ -133,11 +129,7 @@ class StaticEngine:
        :param kwargs: Any keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
        """
-
-        with httpx.Client(proxy=self.proxy, transport=httpx.HTTPTransport(retries=self.retries)) as client:
-            request = client.delete(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
-
-        return self._prepare_response(request)
+        return self._make_request('delete', **kwargs)

    async def async_delete(self, **kwargs: Dict) -> Response:
        """Make basic async HTTP DELETE request for you but with some added flavors.
@@ -145,11 +137,7 @@ class StaticEngine:
        :param kwargs: Any keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
        """
-
-        async with httpx.AsyncClient(proxy=self.proxy) as client:
-            request = await client.delete(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
-
-        return self._prepare_response(request)
+        return await self._async_make_request('delete', **kwargs)

    def put(self, **kwargs: Dict) -> Response:
        """Make basic HTTP PUT request for you but with some added flavors.
@@ -157,11 +145,7 @@ class StaticEngine:
        :param kwargs: Any keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
        """
-
-        with httpx.Client(proxy=self.proxy, transport=httpx.HTTPTransport(retries=self.retries)) as client:
-            request = client.put(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
-
-        return self._prepare_response(request)
+        return self._make_request('put', **kwargs)

    async def async_put(self, **kwargs: Dict) -> Response:
        """Make basic async HTTP PUT request for you but with some added flavors.
@@ -169,8 +153,4 @@ class StaticEngine:
        :param kwargs: Any keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
        """
-
-        async with httpx.AsyncClient(proxy=self.proxy) as client:
-            request = await client.put(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
-
-        return self._prepare_response(request)
+        return await self._async_make_request('put', **kwargs)
scrapling/engines/toolbelt/custom.py
CHANGED
@@ -16,7 +16,7 @@ class ResponseEncoding:
    __ISO_8859_1_CONTENT_TYPES = {"text/plain", "text/html", "text/css", "text/javascript"}

    @classmethod
-    @lru_cache(maxsize=
+    @lru_cache(maxsize=256)
    def __parse_content_type(cls, header_value: str) -> Tuple[str, Dict[str, str]]:
        """Parse content type and parameters from a content-type header value.

@@ -38,7 +38,7 @@ class ResponseEncoding:
        return content_type, params

    @classmethod
-    @lru_cache(maxsize=
+    @lru_cache(maxsize=256)
    def get_value(cls, content_type: Optional[str], text: Optional[str] = 'test') -> str:
        """Determine the appropriate character encoding from a content-type header.

scrapling/engines/toolbelt/fingerprints.py
CHANGED
@@ -12,7 +12,7 @@ from scrapling.core._types import Dict, Union
from scrapling.core.utils import lru_cache


-@lru_cache(
+@lru_cache(128, typed=True)
def generate_convincing_referer(url: str) -> str:
    """Takes the domain from the URL without the subdomain/suffix and make it look like you were searching google for this website

@@ -26,7 +26,7 @@ def generate_convincing_referer(url: str) -> str:
    return f'https://www.google.com/search?q={website_name}'


-@lru_cache(
+@lru_cache(128, typed=True)
def get_os_name() -> Union[str, None]:
    """Get the current OS name in the same format needed for browserforge

scrapling/engines/toolbelt/navigation.py
CHANGED
@@ -110,7 +110,7 @@ def construct_cdp_url(cdp_url: str, query_params: Optional[Dict] = None) -> str:
        raise ValueError(f"Invalid CDP URL: {str(e)}")


-@lru_cache(
+@lru_cache(126, typed=True)
def js_bypass_path(filename: str) -> str:
    """Takes the base filename of JS file inside the `bypasses` folder then return the full path of it

scrapling/fetchers.py
CHANGED
@@ -11,7 +11,7 @@ class Fetcher(BaseFetcher):
    Any additional keyword arguments passed to the methods below are passed to the respective httpx's method directly.
    """
    def get(
-            self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers:
+            self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
            proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
        """Make basic HTTP GET request for you but with some added flavors.

@@ -30,7 +30,7 @@ class Fetcher(BaseFetcher):
        return response_object

    def post(
-            self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers:
+            self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
            proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
        """Make basic HTTP POST request for you but with some added flavors.

@@ -49,7 +49,7 @@ class Fetcher(BaseFetcher):
        return response_object

    def put(
-            self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers:
+            self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
            proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
        """Make basic HTTP PUT request for you but with some added flavors.

@@ -69,7 +69,7 @@ class Fetcher(BaseFetcher):
        return response_object

    def delete(
-            self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers:
+            self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
            proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
        """Make basic HTTP DELETE request for you but with some added flavors.

@@ -90,7 +90,7 @@ class Fetcher(BaseFetcher):

class AsyncFetcher(Fetcher):
    async def get(
-            self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers:
+            self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
            proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
        """Make basic HTTP GET request for you but with some added flavors.

@@ -109,7 +109,7 @@ class AsyncFetcher(Fetcher):
        return response_object

    async def post(
-            self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers:
+            self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
            proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
        """Make basic HTTP POST request for you but with some added flavors.

@@ -128,7 +128,7 @@ class AsyncFetcher(Fetcher):
        return response_object

    async def put(
-            self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers:
+            self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
            proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
        """Make basic HTTP PUT request for you but with some added flavors.

@@ -147,7 +147,7 @@ class AsyncFetcher(Fetcher):
        return response_object

    async def delete(
-            self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers:
+            self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
            proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
        """Make basic HTTP DELETE request for you but with some added flavors.

@@ -173,11 +173,11 @@ class StealthyFetcher(BaseFetcher):
    Other added flavors include setting the faked OS fingerprints to match the user's OS and the referer of every request is set as if this request came from Google's search of this URL's domain.
    """
    def fetch(
-            self, url: str, headless:
-            block_webrtc:
+            self, url: str, headless: Union[bool, Literal['virtual']] = True, block_images: bool = False, disable_resources: bool = False,
+            block_webrtc: bool = False, allow_webgl: bool = True, network_idle: bool = False, addons: Optional[List[str]] = None,
            timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, humanize: Optional[Union[bool, float]] = True,
-            wait_selector_state: SelectorWaitStates = 'attached', google_search:
-            proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize:
+            wait_selector_state: SelectorWaitStates = 'attached', google_search: bool = True, extra_headers: Optional[Dict[str, str]] = None,
+            proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: bool = False, disable_ads: bool = False, geoip: bool = False,
    ) -> Response:
        """
        Opens up a browser and do your request based on your chosen options below.
@@ -231,11 +231,11 @@ class StealthyFetcher(BaseFetcher):
        return engine.fetch(url)

    async def async_fetch(
-            self, url: str, headless:
-            block_webrtc:
+            self, url: str, headless: Union[bool, Literal['virtual']] = True, block_images: bool = False, disable_resources: bool = False,
+            block_webrtc: bool = False, allow_webgl: bool = True, network_idle: bool = False, addons: Optional[List[str]] = None,
            timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, humanize: Optional[Union[bool, float]] = True,
-            wait_selector_state: SelectorWaitStates = 'attached', google_search:
-            proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize:
+            wait_selector_state: SelectorWaitStates = 'attached', google_search: bool = True, extra_headers: Optional[Dict[str, str]] = None,
+            proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: bool = False, disable_ads: bool = False, geoip: bool = False,
    ) -> Response:
        """
        Opens up a browser and do your request based on your chosen options below.
@@ -307,13 +307,13 @@ class PlayWrightFetcher(BaseFetcher):
    """
    def fetch(
            self, url: str, headless: Union[bool, str] = True, disable_resources: bool = None,
-            useragent: Optional[str] = None, network_idle:
+            useragent: Optional[str] = None, network_idle: bool = False, timeout: Optional[float] = 30000,
            page_action: Optional[Callable] = None, wait_selector: Optional[str] = None, wait_selector_state: SelectorWaitStates = 'attached',
-            hide_canvas:
+            hide_canvas: bool = False, disable_webgl: bool = False, extra_headers: Optional[Dict[str, str]] = None, google_search: bool = True,
            proxy: Optional[Union[str, Dict[str, str]]] = None, locale: Optional[str] = 'en-US',
-            stealth:
+            stealth: bool = False, real_chrome: bool = False,
            cdp_url: Optional[str] = None,
-            nstbrowser_mode:
+            nstbrowser_mode: bool = False, nstbrowser_config: Optional[Dict] = None,
    ) -> Response:
        """Opens up a browser and do your request based on your chosen options below.

@@ -367,13 +367,13 @@ class PlayWrightFetcher(BaseFetcher):

    async def async_fetch(
            self, url: str, headless: Union[bool, str] = True, disable_resources: bool = None,
-            useragent: Optional[str] = None, network_idle:
+            useragent: Optional[str] = None, network_idle: bool = False, timeout: Optional[float] = 30000,
            page_action: Optional[Callable] = None, wait_selector: Optional[str] = None, wait_selector_state: SelectorWaitStates = 'attached',
-            hide_canvas:
+            hide_canvas: bool = False, disable_webgl: bool = False, extra_headers: Optional[Dict[str, str]] = None, google_search: bool = True,
            proxy: Optional[Union[str, Dict[str, str]]] = None, locale: Optional[str] = 'en-US',
-            stealth:
+            stealth: bool = False, real_chrome: bool = False,
            cdp_url: Optional[str] = None,
-            nstbrowser_mode:
+            nstbrowser_mode: bool = False, nstbrowser_config: Optional[Dict] = None,
    ) -> Response:
        """Opens up a browser and do your request based on your chosen options below.

scrapling/parser.py
CHANGED
@@ -71,7 +71,7 @@ class Adaptor(SelectorsGeneration):
        if root is None and not body and text is None:
            raise ValueError("Adaptor class needs text, body, or root arguments to work")

-        self.__text =
+        self.__text = ''
        self.__raw_body = ''
        if root is None:
            if text is None:
{scrapling-0.2.96.dist-info → scrapling-0.2.97.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
Metadata-Version: 2.2
Name: scrapling
-Version: 0.2.96
+Version: 0.2.97
Summary: Scrapling is an undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy again! In an internet filled with complications,
Home-page: https://github.com/D4Vinci/Scrapling
Author: Karim Shoair
@@ -73,6 +73,22 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha

# Sponsors

+[Scrapeless Deep SerpApi](https://www.scrapeless.com/en/product/deep-serp-api?utm_source=website&utm_medium=ads&utm_campaign=scraping&utm_term=d4vinci) From $0.10 per 1,000 queries with a 1-2 second response time!
+
+Deep SerpApi is a dedicated search engine designed for large language models (LLMs) and AI agents, aiming to provide real-time, accurate and unbiased information to help AI applications retrieve and process data efficiently.
+- covering 20+ Google SERP scenarios and mainstream search engines.
+- support real-time data updates to ensure real-time and accurate information.
+- It can integrate information from all available online channels and search engines.
+- Deep SerpApi will simplify the process of integrating dynamic web information into AI solutions, and ultimately achieve an ALL-in-One API for one-click search and extraction of web data.
+- **Developer Support Program**: Integrate Scrapeless Deep SerpApi into your AI tools, applications or projects. [We already support Dify, and will soon support frameworks such as Langchain, Langflow, FlowiseAI]. Then share your results on GitHub or social media, and you will get a 1-12 month free developer support opportunity, up to 500 free usage per month.
+- 🚀 **Scraping API**: Effortless and highly customizable data extraction with a single API call, providing structured data from any website.
+- ⚡ **Scraping Browser**: AI-powered and LLM-driven, it simulates human-like behavior with genuine fingerprints and headless browser support, ensuring seamless, block-free scraping.
+- 🌐 **Proxies**: Use high-quality, rotating proxies to scrape top platforms like Amazon, Shopee, and more, with global coverage in 195+ countries.
+
+
+[](https://www.scrapeless.com/en/product/deep-serp-api?utm_source=website&utm_medium=ads&utm_campaign=scraping&utm_term=d4vinci)
+---
+
[Evomi](https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling) is your Swiss Quality Proxy Provider, starting at **$0.49/GB**

- 👩‍💻 **$0.49 per GB Residential Proxies**: Our price is unbeatable
@@ -88,21 +104,6 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha
[](https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling)
---

-[Scrapeless](https://www.scrapeless.com/?utm_source=github&utm_medium=ads&utm_campaign=scraping&utm_term=D4Vinci) is your all-in-one web scraping toolkit, starting at just $0.60 per 1k URLs!
-
-- 🚀 Scraping API: Effortless and highly customizable data extraction with a single API call, providing structured data from any website.
-- ⚡ Scraping Browser: AI-powered and LLM-driven, it simulates human-like behavior with genuine fingerprints and headless browser support, ensuring seamless, block-free scraping.
-- 🔒 Web Unlocker: Bypass CAPTCHAs, IP blocks, and dynamic content in real time, ensuring uninterrupted access.
-- 🌐 Proxies: Use high-quality, rotating proxies to scrape top platforms like Amazon, Shopee, and more, with global coverage in 195+ countries.
-- 💼 Enterprise-Grade: Custom solutions for large-scale and complex data needs.
-- 🎁 Free Trial: Try before you buy—experience our service firsthand.
-- 💬 Pay-Per-Use: Flexible, cost-effective pricing with no long-term commitments.
-- 🔧 Easy Integration: Seamlessly integrate with your existing tools and workflows for hassle-free automation.
-
-
-[](https://www.scrapeless.com/?utm_source=github&utm_medium=ads&utm_campaign=scraping&utm_term=D4Vinci)
----
-
## Table of content
* [Key Features](#key-features)
* [Fetch websites as you prefer](#fetch-websites-as-you-prefer-with-async-support)
{scrapling-0.2.96.dist-info → scrapling-0.2.97.dist-info}/RECORD
CHANGED
@@ -1,25 +1,25 @@
-scrapling/__init__.py,sha256=
+scrapling/__init__.py,sha256=5yeUml2K0xHe2NAALM2x2hGSl_ORcEttIZL17b1cWtg,500
scrapling/cli.py,sha256=7yTsMhVAqqS8Z27T5dFKrR9_X8vuFjBlwYgAF22W7T8,1292
scrapling/defaults.py,sha256=sdXeZjXEX7PmCtaa0weK0nRrAUzqZukNNqipZ_sltYE,469
-scrapling/fetchers.py,sha256=
-scrapling/parser.py,sha256=
+scrapling/fetchers.py,sha256=xwVCjAg0VCXwhB2igSLQvb0D0bOPGfg5WNtxgE7m-W0,34987
+scrapling/parser.py,sha256=U6qFV23qeeX1pYl6mw0TZEL4FlaQw6puaoDTldUpi-M,54328
scrapling/py.typed,sha256=frcCV1k9oG9oKj3dpUqdJg1PxRT2RSN_XKdLCPjaYaY,2
scrapling/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
scrapling/core/_types.py,sha256=dKVi_dUxdxNtTr7sj7ySkHXDfrsmjFTfpCQeO5tGuBY,670
-scrapling/core/custom_types.py,sha256=
+scrapling/core/custom_types.py,sha256=EWGx5t5scHEB1SMsitzc8duskq-5f-Qaj40IWkNTRzM,12947
scrapling/core/mixins.py,sha256=sozbpaGL1_O_x3U-ABM5aYWpnxpCLfdbcA9SG3P7weY,3532
-scrapling/core/storage_adaptors.py,sha256=
-scrapling/core/translator.py,sha256=
-scrapling/core/utils.py,sha256=
+scrapling/core/storage_adaptors.py,sha256=EkSE8LlOS9SggFblBNzgyEp0fLxl8dqYU3-MAuXUitY,6216
+scrapling/core/translator.py,sha256=ojDmNi5pFZE6Ke-AiSsTilXiPRdR8yhX3o-uVGMkap8,5236
+scrapling/core/utils.py,sha256=0e3jD029CXj3gfA_MIKcBC0Mai9fXW2scIuoKtHy1e8,3704
scrapling/engines/__init__.py,sha256=zA7tzqcDXP0hllwmjVewNHWipIA4JSU9mRG4J-cud0c,267
-scrapling/engines/camo.py,sha256=
+scrapling/engines/camo.py,sha256=oYKA0l3EpOcQW2APRj5FEmslqtp9A8i_ZljqlKvIDeI,16129
scrapling/engines/constants.py,sha256=Gb_nXFoBB4ujJkd05SKkenMe1UDiRYQA3dkmA3DunLg,3723
-scrapling/engines/pw.py,sha256=
-scrapling/engines/static.py,sha256=
+scrapling/engines/pw.py,sha256=_fy8mhkVrOnb_Qho8zKCjFyd1Y_kr2mkdo0PHrBks4M,21371
+scrapling/engines/static.py,sha256=okrEIFfYaxqVuIXPanxQDxQpN8i88AgWODo7Dnex2EI,9306
scrapling/engines/toolbelt/__init__.py,sha256=VQDdYm1zY9Apno6d8UrULk29vUjllZrQqD8mXL1E2Fc,402
-scrapling/engines/toolbelt/custom.py,sha256=
-scrapling/engines/toolbelt/fingerprints.py,sha256=
-scrapling/engines/toolbelt/navigation.py,sha256=
+scrapling/engines/toolbelt/custom.py,sha256=dwpuEHNOd9uJbMf7sx8sXsYZhozSXStrwqfpooce1Wk,12811
+scrapling/engines/toolbelt/fingerprints.py,sha256=spJMij0qBGvbSlVjv9xJWCF8KFDf6xnNz5fWtXWhrzY,2927
+scrapling/engines/toolbelt/navigation.py,sha256=KyFQ4vHS4jR7z378VRGtUeXQHWr5NMy5nNp2-c_Evk8,4566
scrapling/engines/toolbelt/bypasses/navigator_plugins.js,sha256=tbnnk3nCXB6QEQnOhDlu3n-s7lnUTAkrUsjP6FDQIQg,2104
scrapling/engines/toolbelt/bypasses/notification_permission.js,sha256=poPM3o5WYgEX-EdiUfDCllpWfc3Umvw4jr2u6O6elus,237
scrapling/engines/toolbelt/bypasses/pdf_viewer.js,sha256=mKjjSuP1-BOGC_2WhRYHJo_LP7lTBi2KXmP_zsHO_tI,173
@@ -41,9 +41,9 @@ tests/fetchers/sync/test_playwright.py,sha256=MEyDRaMyxDIWupG7f_xz0f0jd9Cpbd5rXC
tests/parser/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
tests/parser/test_automatch.py,sha256=SxsNdExE8zz8AcPRQFBUjZ3Q_1-tPOd9dzVvMSZpOYQ,4908
tests/parser/test_general.py,sha256=dyfOsc8lleoY4AxcfDUBUaD1i95xecfYuTUhKBsYjwo,12100
-scrapling-0.2.
-scrapling-0.2.
-scrapling-0.2.
-scrapling-0.2.
-scrapling-0.2.
-scrapling-0.2.
+scrapling-0.2.97.dist-info/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
+scrapling-0.2.97.dist-info/METADATA,sha256=VnP3UEy6RcQytld-8ZYSF0Cpdd4fb-tKoX01jajFneo,69666
+scrapling-0.2.97.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
+scrapling-0.2.97.dist-info/entry_points.txt,sha256=DHyt2Blxy0P5OE2HRcP95Wz9_xo2ERCDcNqrJjYS3o8,49
+scrapling-0.2.97.dist-info/top_level.txt,sha256=ub7FkOEXeYmmYTUxd4pCrwXfBfAMIpZ1sCGmXCc14tI,16
+scrapling-0.2.97.dist-info/RECORD,,
{scrapling-0.2.96.dist-info → scrapling-0.2.97.dist-info}/LICENSE: File without changes
{scrapling-0.2.96.dist-info → scrapling-0.2.97.dist-info}/entry_points.txt: File without changes
{scrapling-0.2.96.dist-info → scrapling-0.2.97.dist-info}/top_level.txt: File without changes