scrapling 0.2.96__py3-none-any.whl → 0.2.97__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
scrapling/__init__.py CHANGED
@@ -5,7 +5,7 @@ from scrapling.fetchers import (AsyncFetcher, CustomFetcher, Fetcher,
  from scrapling.parser import Adaptor, Adaptors

  __author__ = "Karim Shoair (karim.shoair@pm.me)"
- __version__ = "0.2.96"
+ __version__ = "0.2.97"
  __copyright__ = "Copyright (c) 2024 Karim Shoair"


scrapling/core/custom_types.py CHANGED
@@ -19,9 +19,7 @@ class TextHandler(str):
  __slots__ = ()

  def __new__(cls, string):
- if isinstance(string, str):
- return super().__new__(cls, string)
- return super().__new__(cls, '')
+ return super().__new__(cls, str(string))

  def __getitem__(self, key: Union[SupportsIndex, slice]) -> "TextHandler":
  lst = super().__getitem__(key)
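Note: the `TextHandler.__new__` change above is behavioral, not just stylistic. Previously any non-string input was silently replaced with an empty string; now it is coerced with `str()`. A minimal standalone sketch of the difference (re-implemented here for illustration, not the library's full class):

```python
# Before: non-str values were discarded; after: they are coerced via str().
class OldTextHandler(str):
    def __new__(cls, string):
        if isinstance(string, str):
            return super().__new__(cls, string)
        return super().__new__(cls, '')


class NewTextHandler(str):
    def __new__(cls, string):
        return super().__new__(cls, str(string))


print(repr(OldTextHandler(42)))  # '' - the value was lost
print(repr(NewTextHandler(42)))  # '42' - the value is preserved
```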
scrapling/core/storage_adaptors.py CHANGED
@@ -19,7 +19,7 @@ class StorageSystemMixin(ABC):
  """
  self.url = url

- @lru_cache(None, typed=True)
+ @lru_cache(126, typed=True)
  def _get_base_url(self, default_value: str = 'default') -> str:
  if not self.url or type(self.url) is not str:
  return default_value
@@ -51,7 +51,7 @@ class StorageSystemMixin(ABC):
  raise NotImplementedError('Storage system must implement `save` method')

  @staticmethod
- @lru_cache(None, typed=True)
+ @lru_cache(256, typed=True)
  def _get_hash(identifier: str) -> str:
  """If you want to hash identifier in your storage system, use this safer"""
  identifier = identifier.lower().strip()
@@ -63,7 +63,7 @@ class StorageSystemMixin(ABC):
  return f"{hash_value}_{len(identifier)}" # Length to reduce collision chance


- @lru_cache(None, typed=True)
+ @lru_cache(10, typed=True)
  class SQLiteStorageSystem(StorageSystemMixin):
  """The recommended system to use, it's race condition safe and thread safe.
  Mainly built so the library can run in threaded frameworks like scrapy or threaded tools
scrapling/core/translator.py CHANGED
@@ -139,6 +139,6 @@ class TranslatorMixin:


  class HTMLTranslator(TranslatorMixin, OriginalHTMLTranslator):
- @lru_cache(maxsize=2048)
+ @lru_cache(maxsize=256)
  def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str:
  return super().css_to_xpath(css, prefix)
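The pattern across the hunks above (and several more below) is replacing unbounded `@lru_cache(None, ...)` / `@lru_cache(maxsize=None)` decorators with bounded caches (10, 126, 128, 256, ...), which caps memory growth in long-running scrapers at the cost of occasional recomputation. A small standard-library sketch of what the bound means (the `square` helper is hypothetical, chosen only to show eviction):

```python
from functools import lru_cache

@lru_cache(maxsize=2, typed=True)  # typed=True caches square(1) and square(1.0) separately
def square(x):
    print(f"computing {x}")
    return x * x

square(1)   # miss: computed and cached
square(2)   # miss: computed and cached
square(1)   # hit: served from cache
square(3)   # miss: evicts the least recently used entry (2)
square(2)   # miss again - it was evicted by the maxsize=2 bound
print(square.cache_info())  # CacheInfo(hits=1, misses=4, maxsize=2, currsize=2)
```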
scrapling/core/utils.py CHANGED
@@ -115,7 +115,7 @@ class _StorageTools:
  # return _impl


- @lru_cache(None, typed=True)
+ @lru_cache(256, typed=True)
  def clean_spaces(string):
  string = string.replace('\t', ' ')
  string = re.sub('[\n|\r]', '', string)
scrapling/engines/camo.py CHANGED
@@ -15,12 +15,12 @@ from scrapling.engines.toolbelt import (Response, StatusText,

  class CamoufoxEngine:
  def __init__(
- self, headless: Optional[Union[bool, Literal['virtual']]] = True, block_images: Optional[bool] = False, disable_resources: Optional[bool] = False,
- block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = True, network_idle: Optional[bool] = False, humanize: Optional[Union[bool, float]] = True,
+ self, headless: Union[bool, Literal['virtual']] = True, block_images: bool = False, disable_resources: bool = False,
+ block_webrtc: bool = False, allow_webgl: bool = True, network_idle: bool = False, humanize: Union[bool, float] = True,
  timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, addons: Optional[List[str]] = None,
- wait_selector_state: Optional[SelectorWaitStates] = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
- proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = False,
- geoip: Optional[bool] = False,
+ wait_selector_state: SelectorWaitStates = 'attached', google_search: bool = True, extra_headers: Optional[Dict[str, str]] = None,
+ proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: bool = False, disable_ads: bool = False,
+ geoip: bool = False,
  adaptor_arguments: Dict = None,
  ):
  """An engine that utilizes Camoufox library, check the `StealthyFetcher` class for more documentation.
@@ -64,107 +64,140 @@ class CamoufoxEngine:
  self.addons = addons or []
  self.humanize = humanize
  self.timeout = check_type_validity(timeout, [int, float], 30000)
+
+ # Page action callable validation
+ self.page_action = None
  if page_action is not None:
  if callable(page_action):
  self.page_action = page_action
  else:
- self.page_action = None
  log.error('[Ignored] Argument "page_action" must be callable')
- else:
- self.page_action = None

  self.wait_selector = wait_selector
  self.wait_selector_state = wait_selector_state
  self.adaptor_arguments = adaptor_arguments if adaptor_arguments else {}

+ def _get_camoufox_options(self):
+ """Return consistent browser options dictionary for both sync and async methods"""
+ return {
+ "geoip": self.geoip,
+ "proxy": self.proxy,
+ "enable_cache": True,
+ "addons": self.addons,
+ "exclude_addons": [] if self.disable_ads else [DefaultAddons.UBO],
+ "headless": self.headless,
+ "humanize": self.humanize,
+ "i_know_what_im_doing": True, # To turn warnings off with the user configurations
+ "allow_webgl": self.allow_webgl,
+ "block_webrtc": self.block_webrtc,
+ "block_images": self.block_images, # Careful! it makes some websites doesn't finish loading at all like stackoverflow even in headful
+ "os": None if self.os_randomize else get_os_name(),
+ }
+
+ def _process_response_history(self, first_response):
+ """Process response history to build a list of Response objects"""
+ history = []
+ current_request = first_response.request.redirected_from
+
+ try:
+ while current_request:
+ try:
+ current_response = current_request.response()
+ history.insert(0, Response(
+ url=current_request.url,
+ # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
+ text='',
+ body=b'',
+ status=current_response.status if current_response else 301,
+ reason=(current_response.status_text or StatusText.get(current_response.status)) if current_response else StatusText.get(301),
+ encoding=current_response.headers.get('content-type', '') or 'utf-8',
+ cookies={},
+ headers=current_response.all_headers() if current_response else {},
+ request_headers=current_request.all_headers(),
+ **self.adaptor_arguments
+ ))
+ except Exception as e:
+ log.error(f"Error processing redirect: {e}")
+ break
+
+ current_request = current_request.redirected_from
+ except Exception as e:
+ log.error(f"Error processing response history: {e}")
+
+ return history
+
  def fetch(self, url: str) -> Response:
  """Opens up the browser and do your request based on your chosen options.

  :param url: Target url.
  :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
  """
- addons = [] if self.disable_ads else [DefaultAddons.UBO]
- # Store the final response
  final_response = None
+ referer = generate_convincing_referer(url) if self.google_search else None

  def handle_response(finished_response):
  nonlocal final_response
  if finished_response.request.resource_type == "document" and finished_response.request.is_navigation_request():
  final_response = finished_response

- with Camoufox(
- geoip=self.geoip,
- proxy=self.proxy,
- enable_cache=True,
- addons=self.addons,
- exclude_addons=addons,
- headless=self.headless,
- humanize=self.humanize,
- i_know_what_im_doing=True, # To turn warnings off with the user configurations
- allow_webgl=self.allow_webgl,
- block_webrtc=self.block_webrtc,
- block_images=self.block_images, # Careful! it makes some websites doesn't finish loading at all like stackoverflow even in headful
- os=None if self.os_randomize else get_os_name(),
- ) as browser:
- page = browser.new_page()
+ with Camoufox(**self._get_camoufox_options()) as browser:
+ context = browser.new_context()
+ page = context.new_page()
  page.set_default_navigation_timeout(self.timeout)
  page.set_default_timeout(self.timeout)
- # Listen for all responses
  page.on("response", handle_response)
+
  if self.disable_resources:
  page.route("**/*", intercept_route)

  if self.extra_headers:
  page.set_extra_http_headers(self.extra_headers)

- first_response = page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
+ first_response = page.goto(url, referer=referer)
  page.wait_for_load_state(state="domcontentloaded")
+
  if self.network_idle:
  page.wait_for_load_state('networkidle')

  if self.page_action is not None:
- page = self.page_action(page)
+ try:
+ page = self.page_action(page)
+ except Exception as e:
+ log.error(f"Error executing page_action: {e}")

  if self.wait_selector and type(self.wait_selector) is str:
- waiter = page.locator(self.wait_selector)
- waiter.first.wait_for(state=self.wait_selector_state)
- # Wait again after waiting for the selector, helpful with protections like Cloudflare
- page.wait_for_load_state(state="load")
- page.wait_for_load_state(state="domcontentloaded")
- if self.network_idle:
- page.wait_for_load_state('networkidle')
+ try:
+ waiter = page.locator(self.wait_selector)
+ waiter.first.wait_for(state=self.wait_selector_state)
+ # Wait again after waiting for the selector, helpful with protections like Cloudflare
+ page.wait_for_load_state(state="load")
+ page.wait_for_load_state(state="domcontentloaded")
+ if self.network_idle:
+ page.wait_for_load_state('networkidle')
+ except Exception as e:
+ log.error(f"Error waiting for selector {self.wait_selector}: {e}")

  # In case we didn't catch a document type somehow
  final_response = final_response if final_response else first_response
+ if not final_response:
+ raise ValueError("Failed to get a response from the page")
+
  # This will be parsed inside `Response`
  encoding = final_response.headers.get('content-type', '') or 'utf-8' # default encoding
  # PlayWright API sometimes give empty status text for some reason!
  status_text = final_response.status_text or StatusText.get(final_response.status)

- history = []
- current_request = first_response.request.redirected_from
- while current_request:
- current_response = current_request.response()
-
- history.insert(0, Response(
- url=current_request.url,
- # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
- text='',
- body=b'',
- status=current_response.status if current_response else 301,
- reason=(current_response.status_text or StatusText.get(current_response.status)) if current_response else StatusText.get(301),
- encoding=current_response.headers.get('content-type', '') or 'utf-8',
- cookies={},
- headers=current_response.all_headers() if current_response else {},
- request_headers=current_request.all_headers(),
- **self.adaptor_arguments
- ))
- current_request = current_request.redirected_from
+ history = self._process_response_history(first_response)
+ try:
+ page_content = page.content()
+ except Exception as e:
+ log.error(f"Error getting page content: {e}")
+ page_content = ""

  response = Response(
  url=page.url,
- text=page.content(),
- body=page.content().encode('utf-8'),
+ text=page_content,
+ body=page_content.encode('utf-8'),
  status=final_response.status,
  reason=status_text,
  encoding=encoding,
@@ -175,6 +208,7 @@ class CamoufoxEngine:
  **self.adaptor_arguments
  )
  page.close()
+ context.close()

  return response

@@ -184,88 +218,72 @@ class CamoufoxEngine:
  :param url: Target url.
  :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
  """
- addons = [] if self.disable_ads else [DefaultAddons.UBO]
- # Store the final response
  final_response = None
+ referer = generate_convincing_referer(url) if self.google_search else None

  async def handle_response(finished_response):
  nonlocal final_response
  if finished_response.request.resource_type == "document" and finished_response.request.is_navigation_request():
  final_response = finished_response

- async with AsyncCamoufox(
- geoip=self.geoip,
- proxy=self.proxy,
- enable_cache=True,
- addons=self.addons,
- exclude_addons=addons,
- headless=self.headless,
- humanize=self.humanize,
- i_know_what_im_doing=True, # To turn warnings off with the user configurations
- allow_webgl=self.allow_webgl,
- block_webrtc=self.block_webrtc,
- block_images=self.block_images, # Careful! it makes some websites doesn't finish loading at all like stackoverflow even in headful
- os=None if self.os_randomize else get_os_name(),
- ) as browser:
- page = await browser.new_page()
+ async with AsyncCamoufox(**self._get_camoufox_options()) as browser:
+ context = await browser.new_context()
+ page = await context.new_page()
  page.set_default_navigation_timeout(self.timeout)
  page.set_default_timeout(self.timeout)
- # Listen for all responses
  page.on("response", handle_response)
+
  if self.disable_resources:
  await page.route("**/*", async_intercept_route)

  if self.extra_headers:
  await page.set_extra_http_headers(self.extra_headers)

- first_response = await page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
+ first_response = await page.goto(url, referer=referer)
  await page.wait_for_load_state(state="domcontentloaded")
+
  if self.network_idle:
  await page.wait_for_load_state('networkidle')

  if self.page_action is not None:
- page = await self.page_action(page)
+ try:
+ page = await self.page_action(page)
+ except Exception as e:
+ log.error(f"Error executing async page_action: {e}")

  if self.wait_selector and type(self.wait_selector) is str:
- waiter = page.locator(self.wait_selector)
- await waiter.first.wait_for(state=self.wait_selector_state)
- # Wait again after waiting for the selector, helpful with protections like Cloudflare
- await page.wait_for_load_state(state="load")
- await page.wait_for_load_state(state="domcontentloaded")
- if self.network_idle:
- await page.wait_for_load_state('networkidle')
+ try:
+ waiter = page.locator(self.wait_selector)
+ await waiter.first.wait_for(state=self.wait_selector_state)
+ # Wait again after waiting for the selector, helpful with protections like Cloudflare
+ await page.wait_for_load_state(state="load")
+ await page.wait_for_load_state(state="domcontentloaded")
+ if self.network_idle:
+ await page.wait_for_load_state('networkidle')
+ except Exception as e:
+ log.error(f"Error waiting for selector {self.wait_selector}: {e}")

  # In case we didn't catch a document type somehow
  final_response = final_response if final_response else first_response
+ if not final_response:
+ raise ValueError("Failed to get a response from the page")
+
  # This will be parsed inside `Response`
  encoding = final_response.headers.get('content-type', '') or 'utf-8' # default encoding
  # PlayWright API sometimes give empty status text for some reason!
  status_text = final_response.status_text or StatusText.get(final_response.status)

- history = []
- current_request = first_response.request.redirected_from
- while current_request:
- current_response = await current_request.response()
-
- history.insert(0, Response(
- url=current_request.url,
- # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
- text='',
- body=b'',
- status=current_response.status if current_response else 301,
- reason=(current_response.status_text or StatusText.get(current_response.status)) if current_response else StatusText.get(301),
- encoding=current_response.headers.get('content-type', '') or 'utf-8',
- cookies={},
- headers=await current_response.all_headers() if current_response else {},
- request_headers=await current_request.all_headers(),
- **self.adaptor_arguments
- ))
- current_request = current_request.redirected_from
+ history = self._process_response_history(first_response)
+ try:
+ page_content = await page.content()
+ except Exception as e:
+ log.error(f"Error getting page content in async: {e}")
+ page_content = ""

  response = Response(
  url=page.url,
- text=await page.content(),
- body=(await page.content()).encode('utf-8'),
+ text=page_content,
+ body=page_content.encode('utf-8'),
  status=final_response.status,
  reason=status_text,
  encoding=encoding,
@@ -276,5 +294,6 @@ class CamoufoxEngine:
  **self.adaptor_arguments
  )
  await page.close()
+ await context.close()

  return response
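Both the sync and async paths above now delegate redirect bookkeeping to the shared `_process_response_history`, which walks Playwright's redirect chain backwards through `request.redirected_from` and wraps each step defensively. A standalone sketch of the underlying traversal, using plain Playwright (the URL is an arbitrary example):

```python
# Walking a redirect chain with Playwright's sync API.
# Assumes `pip install playwright` and `playwright install chromium`.
from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch()
    page = browser.new_page()
    final = page.goto("https://httpbin.org/redirect/2")  # example URL with 2 redirects

    chain = []
    request = final.request.redirected_from  # None if there was no redirect
    while request:
        response = request.response()  # may be None (e.g. an aborted request)
        chain.insert(0, (request.url, response.status if response else None))
        request = request.redirected_from

    for url, status in chain:
        print(status, url)  # oldest request first
    browser.close()
```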
scrapling/engines/pw.py CHANGED
@@ -19,20 +19,20 @@ class PlaywrightEngine:
  self, headless: Union[bool, str] = True,
  disable_resources: bool = False,
  useragent: Optional[str] = None,
- network_idle: Optional[bool] = False,
+ network_idle: bool = False,
  timeout: Optional[float] = 30000,
  page_action: Callable = None,
  wait_selector: Optional[str] = None,
  locale: Optional[str] = 'en-US',
  wait_selector_state: SelectorWaitStates = 'attached',
- stealth: Optional[bool] = False,
- real_chrome: Optional[bool] = False,
- hide_canvas: Optional[bool] = False,
- disable_webgl: Optional[bool] = False,
+ stealth: bool = False,
+ real_chrome: bool = False,
+ hide_canvas: bool = False,
+ disable_webgl: bool = False,
  cdp_url: Optional[str] = None,
- nstbrowser_mode: Optional[bool] = False,
+ nstbrowser_mode: bool = False,
  nstbrowser_config: Optional[Dict] = None,
- google_search: Optional[bool] = True,
+ google_search: bool = True,
  extra_headers: Optional[Dict[str, str]] = None,
  proxy: Optional[Union[str, Dict[str, str]]] = None,
  adaptor_arguments: Dict = None
@@ -126,7 +126,7 @@ class PlaywrightEngine:

  return cdp_url

- @lru_cache(typed=True)
+ @lru_cache(126, typed=True)
  def __set_flags(self):
  """Returns the flags that will be used while launching the browser if stealth mode is enabled"""
  flags = DEFAULT_STEALTH_FLAGS
@@ -169,7 +169,7 @@ class PlaywrightEngine:

  return context_kwargs

- @lru_cache()
+ @lru_cache(10)
  def __stealth_scripts(self):
  # Basic bypasses nothing fancy as I'm still working on it
  # But with adding these bypasses to the above config, it bypasses many online tests like
@@ -188,6 +188,38 @@ class PlaywrightEngine:
  )
  )

+ def _process_response_history(self, first_response):
+ """Process response history to build a list of Response objects"""
+ history = []
+ current_request = first_response.request.redirected_from
+
+ try:
+ while current_request:
+ try:
+ current_response = current_request.response()
+ history.insert(0, Response(
+ url=current_request.url,
+ # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
+ text='',
+ body=b'',
+ status=current_response.status if current_response else 301,
+ reason=(current_response.status_text or StatusText.get(current_response.status)) if current_response else StatusText.get(301),
+ encoding=current_response.headers.get('content-type', '') or 'utf-8',
+ cookies={},
+ headers=current_response.all_headers() if current_response else {},
+ request_headers=current_request.all_headers(),
+ **self.adaptor_arguments
+ ))
+ except Exception as e:
+ log.error(f"Error processing redirect: {e}")
+ break
+
+ current_request = current_request.redirected_from
+ except Exception as e:
+ log.error(f"Error processing response history: {e}")
+
+ return history
+
  def fetch(self, url: str) -> Response:
  """Opens up the browser and do your request based on your chosen options.

@@ -201,8 +233,8 @@ class PlaywrightEngine:
  else:
  from rebrowser_playwright.sync_api import sync_playwright

- # Store the final response
  final_response = None
+ referer = generate_convincing_referer(url) if self.google_search else None

  def handle_response(finished_response: PlaywrightResponse):
  nonlocal final_response
@@ -218,11 +250,9 @@ class PlaywrightEngine:
  browser = p.chromium.launch(**self.__launch_kwargs())

  context = browser.new_context(**self.__context_kwargs())
- # Finally we are in business
  page = context.new_page()
  page.set_default_navigation_timeout(self.timeout)
  page.set_default_timeout(self.timeout)
- # Listen for all responses
  page.on("response", handle_response)

  if self.extra_headers:
@@ -235,54 +265,51 @@ class PlaywrightEngine:
  for script in self.__stealth_scripts():
  page.add_init_script(path=script)

- first_response = page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
+ first_response = page.goto(url, referer=referer)
  page.wait_for_load_state(state="domcontentloaded")
+
  if self.network_idle:
  page.wait_for_load_state('networkidle')

  if self.page_action is not None:
- page = self.page_action(page)
+ try:
+ page = self.page_action(page)
+ except Exception as e:
+ log.error(f"Error executing page_action: {e}")

  if self.wait_selector and type(self.wait_selector) is str:
- waiter = page.locator(self.wait_selector)
- waiter.first.wait_for(state=self.wait_selector_state)
- # Wait again after waiting for the selector, helpful with protections like Cloudflare
- page.wait_for_load_state(state="load")
- page.wait_for_load_state(state="domcontentloaded")
- if self.network_idle:
- page.wait_for_load_state('networkidle')
+ try:
+ waiter = page.locator(self.wait_selector)
+ waiter.first.wait_for(state=self.wait_selector_state)
+ # Wait again after waiting for the selector, helpful with protections like Cloudflare
+ page.wait_for_load_state(state="load")
+ page.wait_for_load_state(state="domcontentloaded")
+ if self.network_idle:
+ page.wait_for_load_state('networkidle')
+ except Exception as e:
+ log.error(f"Error waiting for selector {self.wait_selector}: {e}")

  # In case we didn't catch a document type somehow
  final_response = final_response if final_response else first_response
+ if not final_response:
+ raise ValueError("Failed to get a response from the page")
+
  # This will be parsed inside `Response`
  encoding = final_response.headers.get('content-type', '') or 'utf-8' # default encoding
  # PlayWright API sometimes give empty status text for some reason!
  status_text = final_response.status_text or StatusText.get(final_response.status)

- history = []
- current_request = first_response.request.redirected_from
- while current_request:
- current_response = current_request.response()
-
- history.insert(0, Response(
- url=current_request.url,
- # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
- text='',
- body=b'',
- status=current_response.status if current_response else 301,
- reason=(current_response.status_text or StatusText.get(current_response.status)) if current_response else StatusText.get(301),
- encoding=current_response.headers.get('content-type', '') or 'utf-8',
- cookies={},
- headers=current_response.all_headers() if current_response else {},
- request_headers=current_request.all_headers(),
- **self.adaptor_arguments
- ))
- current_request = current_request.redirected_from
+ history = self._process_response_history(first_response)
+ try:
+ page_content = page.content()
+ except Exception as e:
+ log.error(f"Error getting page content: {e}")
+ page_content = ""

  response = Response(
  url=page.url,
- text=page.content(),
- body=page.content().encode('utf-8'),
+ text=page_content,
+ body=page_content.encode('utf-8'),
  status=final_response.status,
  reason=status_text,
  encoding=encoding,
@@ -293,6 +320,7 @@ class PlaywrightEngine:
  **self.adaptor_arguments
  )
  page.close()
+ context.close()
  return response

  async def async_fetch(self, url: str) -> Response:
@@ -308,8 +336,8 @@ class PlaywrightEngine:
  else:
  from rebrowser_playwright.async_api import async_playwright

- # Store the final response
  final_response = None
+ referer = generate_convincing_referer(url) if self.google_search else None

  async def handle_response(finished_response: PlaywrightResponse):
  nonlocal final_response
@@ -325,11 +353,9 @@ class PlaywrightEngine:
  browser = await p.chromium.launch(**self.__launch_kwargs())

  context = await browser.new_context(**self.__context_kwargs())
- # Finally we are in business
  page = await context.new_page()
  page.set_default_navigation_timeout(self.timeout)
  page.set_default_timeout(self.timeout)
- # Listen for all responses
  page.on("response", handle_response)

  if self.extra_headers:
@@ -342,54 +368,51 @@ class PlaywrightEngine:
  for script in self.__stealth_scripts():
  await page.add_init_script(path=script)

- first_response = await page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
+ first_response = await page.goto(url, referer=referer)
  await page.wait_for_load_state(state="domcontentloaded")
+
  if self.network_idle:
  await page.wait_for_load_state('networkidle')

  if self.page_action is not None:
- page = await self.page_action(page)
+ try:
+ page = await self.page_action(page)
+ except Exception as e:
+ log.error(f"Error executing async page_action: {e}")

  if self.wait_selector and type(self.wait_selector) is str:
- waiter = page.locator(self.wait_selector)
- await waiter.first.wait_for(state=self.wait_selector_state)
- # Wait again after waiting for the selector, helpful with protections like Cloudflare
- await page.wait_for_load_state(state="load")
- await page.wait_for_load_state(state="domcontentloaded")
- if self.network_idle:
- await page.wait_for_load_state('networkidle')
+ try:
+ waiter = page.locator(self.wait_selector)
+ await waiter.first.wait_for(state=self.wait_selector_state)
+ # Wait again after waiting for the selector, helpful with protections like Cloudflare
+ await page.wait_for_load_state(state="load")
+ await page.wait_for_load_state(state="domcontentloaded")
+ if self.network_idle:
+ await page.wait_for_load_state('networkidle')
+ except Exception as e:
+ log.error(f"Error waiting for selector {self.wait_selector}: {e}")

  # In case we didn't catch a document type somehow
  final_response = final_response if final_response else first_response
+ if not final_response:
+ raise ValueError("Failed to get a response from the page")
+
  # This will be parsed inside `Response`
  encoding = final_response.headers.get('content-type', '') or 'utf-8' # default encoding
  # PlayWright API sometimes give empty status text for some reason!
  status_text = final_response.status_text or StatusText.get(final_response.status)

- history = []
- current_request = first_response.request.redirected_from
- while current_request:
- current_response = await current_request.response()
-
- history.insert(0, Response(
- url=current_request.url,
- # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
- text='',
- body=b'',
- status=current_response.status if current_response else 301,
- reason=(current_response.status_text or StatusText.get(current_response.status)) if current_response else StatusText.get(301),
- encoding=current_response.headers.get('content-type', '') or 'utf-8',
- cookies={},
- headers=await current_response.all_headers() if current_response else {},
- request_headers=await current_request.all_headers(),
- **self.adaptor_arguments
- ))
- current_request = current_request.redirected_from
+ history = self._process_response_history(first_response)
+ try:
+ page_content = await page.content()
+ except Exception as e:
+ log.error(f"Error getting page content in async: {e}")
+ page_content = ""

  response = Response(
  url=page.url,
- text=await page.content(),
- body=(await page.content()).encode('utf-8'),
+ text=page_content,
+ body=page_content.encode('utf-8'),
  status=final_response.status,
  reason=status_text,
  encoding=encoding,
@@ -400,4 +423,6 @@ class PlaywrightEngine:
  **self.adaptor_arguments
  )
  await page.close()
+ await context.close()
+
  return response
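As in `camo.py`, the user-supplied `page_action` callback is now wrapped in try/except, so a failing callback is logged instead of aborting the whole fetch and `page` keeps its previous value. A minimal sketch of that defensive-callback pattern (names are illustrative, not scrapling's API):

```python
import logging

log = logging.getLogger("example")

def run_page_action(page, page_action):
    """Invoke a user callback; on failure, log and keep the original page."""
    try:
        return page_action(page)
    except Exception as e:
        log.error(f"Error executing page_action: {e}")
        return page

page = object()  # stand-in for a Playwright Page
result = run_page_action(page, lambda p: 1 / 0)  # callback raises ZeroDivisionError
assert result is page  # the fetch continues with the original page object
```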
scrapling/engines/static.py CHANGED
@@ -7,10 +7,10 @@ from scrapling.core.utils import log, lru_cache
  from .toolbelt import Response, generate_convincing_referer, generate_headers


- @lru_cache(typed=True)
+ @lru_cache(5, typed=True) # Singleton easily
  class StaticEngine:
  def __init__(
- self, url: str, proxy: Optional[str] = None, stealthy_headers: Optional[bool] = True, follow_redirects: bool = True,
+ self, url: str, proxy: Optional[str] = None, stealthy_headers: bool = True, follow_redirects: bool = True,
  timeout: Optional[Union[int, float]] = None, retries: Optional[int] = 3, adaptor_arguments: Tuple = None
  ):
  """An engine that utilizes httpx library, check the `Fetcher` class for more documentation.
@@ -79,17 +79,25 @@ class StaticEngine:
  **self.adaptor_arguments
  )

+ def _make_request(self, method: str, **kwargs) -> Response:
+ headers = self._headers_job(kwargs.pop('headers', {}))
+ with httpx.Client(proxy=self.proxy, transport=httpx.HTTPTransport(retries=self.retries)) as client:
+ request = getattr(client, method)(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+ return self._prepare_response(request)
+
+ async def _async_make_request(self, method: str, **kwargs) -> Response:
+ headers = self._headers_job(kwargs.pop('headers', {}))
+ async with httpx.AsyncClient(proxy=self.proxy, transport=httpx.AsyncHTTPTransport(retries=self.retries)) as client:
+ request = await getattr(client, method)(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+ return self._prepare_response(request)
+
  def get(self, **kwargs: Dict) -> Response:
  """Make basic HTTP GET request for you but with some added flavors.

  :param kwargs: Any keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
  :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
  """
- headers = self._headers_job(kwargs.pop('headers', {}))
- with httpx.Client(proxy=self.proxy, transport=httpx.HTTPTransport(retries=self.retries)) as client:
- request = client.get(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
-
- return self._prepare_response(request)
+ return self._make_request('get', **kwargs)

  async def async_get(self, **kwargs: Dict) -> Response:
  """Make basic async HTTP GET request for you but with some added flavors.
@@ -97,11 +105,7 @@ class StaticEngine:
  :param kwargs: Any keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
  :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
  """
- headers = self._headers_job(kwargs.pop('headers', {}))
- async with httpx.AsyncClient(proxy=self.proxy) as client:
- request = await client.get(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
-
- return self._prepare_response(request)
+ return await self._async_make_request('get', **kwargs)

  def post(self, **kwargs: Dict) -> Response:
  """Make basic HTTP POST request for you but with some added flavors.
@@ -109,11 +113,7 @@ class StaticEngine:
  :param kwargs: Any keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
  :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
  """
- headers = self._headers_job(kwargs.pop('headers', {}))
- with httpx.Client(proxy=self.proxy, transport=httpx.HTTPTransport(retries=self.retries)) as client:
- request = client.post(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
-
- return self._prepare_response(request)
+ return self._make_request('post', **kwargs)

  async def async_post(self, **kwargs: Dict) -> Response:
  """Make basic async HTTP POST request for you but with some added flavors.
@@ -121,11 +121,7 @@ class StaticEngine:
  :param kwargs: Any keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
  :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
  """
- headers = self._headers_job(kwargs.pop('headers', {}))
- async with httpx.AsyncClient(proxy=self.proxy) as client:
- request = await client.post(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
-
- return self._prepare_response(request)
+ return await self._async_make_request('post', **kwargs)

  def delete(self, **kwargs: Dict) -> Response:
  """Make basic HTTP DELETE request for you but with some added flavors.
@@ -133,11 +129,7 @@ class StaticEngine:
  :param kwargs: Any keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
  :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
  """
- headers = self._headers_job(kwargs.pop('headers', {}))
- with httpx.Client(proxy=self.proxy, transport=httpx.HTTPTransport(retries=self.retries)) as client:
- request = client.delete(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
-
- return self._prepare_response(request)
+ return self._make_request('delete', **kwargs)

  async def async_delete(self, **kwargs: Dict) -> Response:
  """Make basic async HTTP DELETE request for you but with some added flavors.
@@ -145,11 +137,7 @@ class StaticEngine:
  :param kwargs: Any keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
  :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
  """
- headers = self._headers_job(kwargs.pop('headers', {}))
- async with httpx.AsyncClient(proxy=self.proxy) as client:
- request = await client.delete(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
-
- return self._prepare_response(request)
+ return await self._async_make_request('delete', **kwargs)

  def put(self, **kwargs: Dict) -> Response:
  """Make basic HTTP PUT request for you but with some added flavors.
@@ -157,11 +145,7 @@ class StaticEngine:
  :param kwargs: Any keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
  :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
  """
- headers = self._headers_job(kwargs.pop('headers', {}))
- with httpx.Client(proxy=self.proxy, transport=httpx.HTTPTransport(retries=self.retries)) as client:
- request = client.put(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
-
- return self._prepare_response(request)
+ return self._make_request('put', **kwargs)

  async def async_put(self, **kwargs: Dict) -> Response:
  """Make basic async HTTP PUT request for you but with some added flavors.
@@ -169,8 +153,4 @@ class StaticEngine:
  :param kwargs: Any keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
  :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
  """
- headers = self._headers_job(kwargs.pop('headers', {}))
- async with httpx.AsyncClient(proxy=self.proxy) as client:
- request = await client.put(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
-
- return self._prepare_response(request)
+ return await self._async_make_request('put', **kwargs)
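The eight near-identical request methods above now funnel into `_make_request`/`_async_make_request`, which pick the httpx client method by name with `getattr`. Besides removing duplication, this also makes the async paths honor `retries`: the old `httpx.AsyncClient(proxy=self.proxy)` calls never passed a transport, so `retries` was silently ignored there. A self-contained sketch of the dispatch pattern (example URL; requires `httpx`):

```python
import httpx

def make_request(method: str, url: str, retries: int = 3, **kwargs) -> httpx.Response:
    """Dispatch to httpx.Client.get/post/put/delete chosen by name."""
    transport = httpx.HTTPTransport(retries=retries)  # retries apply to connection failures
    with httpx.Client(transport=transport) as client:
        return getattr(client, method)(url, **kwargs)  # e.g. resolves to client.get(url)

response = make_request("get", "https://httpbin.org/get")  # example URL
print(response.status_code)
```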
scrapling/engines/toolbelt/custom.py CHANGED
@@ -16,7 +16,7 @@ class ResponseEncoding:
  __ISO_8859_1_CONTENT_TYPES = {"text/plain", "text/html", "text/css", "text/javascript"}

  @classmethod
- @lru_cache(maxsize=None)
+ @lru_cache(maxsize=256)
  def __parse_content_type(cls, header_value: str) -> Tuple[str, Dict[str, str]]:
  """Parse content type and parameters from a content-type header value.

@@ -38,7 +38,7 @@ class ResponseEncoding:
  return content_type, params

  @classmethod
- @lru_cache(maxsize=None)
+ @lru_cache(maxsize=256)
  def get_value(cls, content_type: Optional[str], text: Optional[str] = 'test') -> str:
  """Determine the appropriate character encoding from a content-type header.

scrapling/engines/toolbelt/fingerprints.py CHANGED
@@ -12,7 +12,7 @@ from scrapling.core._types import Dict, Union
  from scrapling.core.utils import lru_cache


- @lru_cache(None, typed=True)
+ @lru_cache(128, typed=True)
  def generate_convincing_referer(url: str) -> str:
  """Takes the domain from the URL without the subdomain/suffix and make it look like you were searching google for this website

@@ -26,7 +26,7 @@ def generate_convincing_referer(url: str) -> str:
  return f'https://www.google.com/search?q={website_name}'


- @lru_cache(None, typed=True)
+ @lru_cache(128, typed=True)
  def get_os_name() -> Union[str, None]:
  """Get the current OS name in the same format needed for browserforge

scrapling/engines/toolbelt/navigation.py CHANGED
@@ -110,7 +110,7 @@ def construct_cdp_url(cdp_url: str, query_params: Optional[Dict] = None) -> str:
  raise ValueError(f"Invalid CDP URL: {str(e)}")


- @lru_cache(None, typed=True)
+ @lru_cache(126, typed=True)
  def js_bypass_path(filename: str) -> str:
  """Takes the base filename of JS file inside the `bypasses` folder then return the full path of it

scrapling/fetchers.py CHANGED
@@ -11,7 +11,7 @@ class Fetcher(BaseFetcher):
  Any additional keyword arguments passed to the methods below are passed to the respective httpx's method directly.
  """
  def get(
- self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True,
+ self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
  proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
  """Make basic HTTP GET request for you but with some added flavors.

@@ -30,7 +30,7 @@ class Fetcher(BaseFetcher):
  return response_object

  def post(
- self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True,
+ self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
  proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
  """Make basic HTTP POST request for you but with some added flavors.

@@ -49,7 +49,7 @@ class Fetcher(BaseFetcher):
  return response_object

  def put(
- self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True,
+ self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
  proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
  """Make basic HTTP PUT request for you but with some added flavors.

@@ -69,7 +69,7 @@ class Fetcher(BaseFetcher):
  return response_object

  def delete(
- self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True,
+ self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
  proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
  """Make basic HTTP DELETE request for you but with some added flavors.

@@ -90,7 +90,7 @@ class Fetcher(BaseFetcher):

  class AsyncFetcher(Fetcher):
  async def get(
- self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True,
+ self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
  proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
  """Make basic HTTP GET request for you but with some added flavors.

@@ -109,7 +109,7 @@ class AsyncFetcher(Fetcher):
  return response_object

  async def post(
- self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True,
+ self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
  proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
  """Make basic HTTP POST request for you but with some added flavors.

@@ -128,7 +128,7 @@ class AsyncFetcher(Fetcher):
  return response_object

  async def put(
- self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True,
+ self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
  proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
  """Make basic HTTP PUT request for you but with some added flavors.

@@ -147,7 +147,7 @@ class AsyncFetcher(Fetcher):
  return response_object

  async def delete(
- self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True,
+ self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
  proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
  """Make basic HTTP DELETE request for you but with some added flavors.

@@ -173,11 +173,11 @@ class StealthyFetcher(BaseFetcher):
  Other added flavors include setting the faked OS fingerprints to match the user's OS and the referer of every request is set as if this request came from Google's search of this URL's domain.
  """
  def fetch(
- self, url: str, headless: Optional[Union[bool, Literal['virtual']]] = True, block_images: Optional[bool] = False, disable_resources: Optional[bool] = False,
- block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = True, network_idle: Optional[bool] = False, addons: Optional[List[str]] = None,
+ self, url: str, headless: Union[bool, Literal['virtual']] = True, block_images: bool = False, disable_resources: bool = False,
+ block_webrtc: bool = False, allow_webgl: bool = True, network_idle: bool = False, addons: Optional[List[str]] = None,
  timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, humanize: Optional[Union[bool, float]] = True,
- wait_selector_state: SelectorWaitStates = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
- proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = False, geoip: Optional[bool] = False,
+ wait_selector_state: SelectorWaitStates = 'attached', google_search: bool = True, extra_headers: Optional[Dict[str, str]] = None,
+ proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: bool = False, disable_ads: bool = False, geoip: bool = False,
  ) -> Response:
  """
  Opens up a browser and do your request based on your chosen options below.
@@ -231,11 +231,11 @@ class StealthyFetcher(BaseFetcher):
  return engine.fetch(url)

  async def async_fetch(
- self, url: str, headless: Optional[Union[bool, Literal['virtual']]] = True, block_images: Optional[bool] = False, disable_resources: Optional[bool] = False,
- block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = True, network_idle: Optional[bool] = False, addons: Optional[List[str]] = None,
+ self, url: str, headless: Union[bool, Literal['virtual']] = True, block_images: bool = False, disable_resources: bool = False,
+ block_webrtc: bool = False, allow_webgl: bool = True, network_idle: bool = False, addons: Optional[List[str]] = None,
  timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, humanize: Optional[Union[bool, float]] = True,
- wait_selector_state: SelectorWaitStates = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
- proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = False, geoip: Optional[bool] = False,
+ wait_selector_state: SelectorWaitStates = 'attached', google_search: bool = True, extra_headers: Optional[Dict[str, str]] = None,
+ proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: bool = False, disable_ads: bool = False, geoip: bool = False,
  ) -> Response:
  """
  Opens up a browser and do your request based on your chosen options below.
@@ -307,13 +307,13 @@ class PlayWrightFetcher(BaseFetcher):
  """
  def fetch(
  self, url: str, headless: Union[bool, str] = True, disable_resources: bool = None,
- useragent: Optional[str] = None, network_idle: Optional[bool] = False, timeout: Optional[float] = 30000,
+ useragent: Optional[str] = None, network_idle: bool = False, timeout: Optional[float] = 30000,
  page_action: Optional[Callable] = None, wait_selector: Optional[str] = None, wait_selector_state: SelectorWaitStates = 'attached',
- hide_canvas: Optional[bool] = False, disable_webgl: Optional[bool] = False, extra_headers: Optional[Dict[str, str]] = None, google_search: Optional[bool] = True,
+ hide_canvas: bool = False, disable_webgl: bool = False, extra_headers: Optional[Dict[str, str]] = None, google_search: bool = True,
  proxy: Optional[Union[str, Dict[str, str]]] = None, locale: Optional[str] = 'en-US',
- stealth: Optional[bool] = False, real_chrome: Optional[bool] = False,
+ stealth: bool = False, real_chrome: bool = False,
  cdp_url: Optional[str] = None,
- nstbrowser_mode: Optional[bool] = False, nstbrowser_config: Optional[Dict] = None,
+ nstbrowser_mode: bool = False, nstbrowser_config: Optional[Dict] = None,
  ) -> Response:
  """Opens up a browser and do your request based on your chosen options below.

@@ -367,13 +367,13 @@ class PlayWrightFetcher(BaseFetcher):

  async def async_fetch(
  self, url: str, headless: Union[bool, str] = True, disable_resources: bool = None,
- useragent: Optional[str] = None, network_idle: Optional[bool] = False, timeout: Optional[float] = 30000,
+ useragent: Optional[str] = None, network_idle: bool = False, timeout: Optional[float] = 30000,
  page_action: Optional[Callable] = None, wait_selector: Optional[str] = None, wait_selector_state: SelectorWaitStates = 'attached',
- hide_canvas: Optional[bool] = False, disable_webgl: Optional[bool] = False, extra_headers: Optional[Dict[str, str]] = None, google_search: Optional[bool] = True,
+ hide_canvas: bool = False, disable_webgl: bool = False, extra_headers: Optional[Dict[str, str]] = None, google_search: bool = True,
  proxy: Optional[Union[str, Dict[str, str]]] = None, locale: Optional[str] = 'en-US',
- stealth: Optional[bool] = False, real_chrome: Optional[bool] = False,
+ stealth: bool = False, real_chrome: bool = False,
  cdp_url: Optional[str] = None,
- nstbrowser_mode: Optional[bool] = False, nstbrowser_config: Optional[Dict] = None,
+ nstbrowser_mode: bool = False, nstbrowser_config: Optional[Dict] = None,
  ) -> Response:
  """Opens up a browser and do your request based on your chosen options below.

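Most of the fetcher-level changes above tighten annotations from `Optional[bool] = False` (or `= None`) to plain `bool` defaults: these flags never meaningfully accept `None`, and the looser annotation let type-checkers pass it through. A tiny illustration with hypothetical function names:

```python
from typing import Optional

def old_fetch(network_idle: Optional[bool] = False) -> None: ...
def new_fetch(network_idle: bool = False) -> None: ...

old_fetch(network_idle=None)  # accepted by mypy/pyright, ambiguous at runtime
new_fetch(network_idle=None)  # now flagged by type-checkers: None is not a bool
```

The one default that actually moves is `os_randomize`, from `Optional[bool] = None` to `bool = False`; both are falsy, so the default behavior is unchanged.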
scrapling/parser.py CHANGED
@@ -71,7 +71,7 @@ class Adaptor(SelectorsGeneration):
  if root is None and not body and text is None:
  raise ValueError("Adaptor class needs text, body, or root arguments to work")

- self.__text = None
+ self.__text = ''
  self.__raw_body = ''
  if root is None:
  if text is None:
scrapling-0.2.96.dist-info/METADATA → scrapling-0.2.97.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.2
  Name: scrapling
- Version: 0.2.96
+ Version: 0.2.97
  Summary: Scrapling is an undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy again! In an internet filled with complications,
  Home-page: https://github.com/D4Vinci/Scrapling
  Author: Karim Shoair
@@ -73,6 +73,22 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha

  # Sponsors

+ [Scrapeless Deep SerpApi](https://www.scrapeless.com/en/product/deep-serp-api?utm_source=website&utm_medium=ads&utm_campaign=scraping&utm_term=d4vinci) From $0.10 per 1,000 queries with a 1-2 second response time!
+
+ Deep SerpApi is a dedicated search engine designed for large language models (LLMs) and AI agents, aiming to provide real-time, accurate and unbiased information to help AI applications retrieve and process data efficiently.
+ - covering 20+ Google SERP scenarios and mainstream search engines.
+ - support real-time data updates to ensure real-time and accurate information.
+ - It can integrate information from all available online channels and search engines.
+ - Deep SerpApi will simplify the process of integrating dynamic web information into AI solutions, and ultimately achieve an ALL-in-One API for one-click search and extraction of web data.
+ - **Developer Support Program**: Integrate Scrapeless Deep SerpApi into your AI tools, applications or projects. [We already support Dify, and will soon support frameworks such as Langchain, Langflow, FlowiseAI]. Then share your results on GitHub or social media, and you will get a 1-12 month free developer support opportunity, up to 500 free usage per month.
+ - 🚀 **Scraping API**: Effortless and highly customizable data extraction with a single API call, providing structured data from any website.
+ - ⚡ **Scraping Browser**: AI-powered and LLM-driven, it simulates human-like behavior with genuine fingerprints and headless browser support, ensuring seamless, block-free scraping.
+ - 🌐 **Proxies**: Use high-quality, rotating proxies to scrape top platforms like Amazon, Shopee, and more, with global coverage in 195+ countries.
+
+
+ [![Scrapeless Banner](https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/scrapeless.jpg)](https://www.scrapeless.com/en/product/deep-serp-api?utm_source=website&utm_medium=ads&utm_campaign=scraping&utm_term=d4vinci)
+ ---
+
  [Evomi](https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling) is your Swiss Quality Proxy Provider, starting at **$0.49/GB**

  - 👩‍💻 **$0.49 per GB Residential Proxies**: Our price is unbeatable
@@ -88,21 +104,6 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha
  [![Evomi Banner](https://my.evomi.com/images/brand/cta.png)](https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling)
  ---

- [Scrapeless](https://www.scrapeless.com/?utm_source=github&utm_medium=ads&utm_campaign=scraping&utm_term=D4Vinci) is your all-in-one web scraping toolkit, starting at just $0.60 per 1k URLs!
-
- - 🚀 Scraping API: Effortless and highly customizable data extraction with a single API call, providing structured data from any website.
- - ⚡ Scraping Browser: AI-powered and LLM-driven, it simulates human-like behavior with genuine fingerprints and headless browser support, ensuring seamless, block-free scraping.
- - 🔒 Web Unlocker: Bypass CAPTCHAs, IP blocks, and dynamic content in real time, ensuring uninterrupted access.
- - 🌐 Proxies: Use high-quality, rotating proxies to scrape top platforms like Amazon, Shopee, and more, with global coverage in 195+ countries.
- - 💼 Enterprise-Grade: Custom solutions for large-scale and complex data needs.
- - 🎁 Free Trial: Try before you buy—experience our service firsthand.
- - 💬 Pay-Per-Use: Flexible, cost-effective pricing with no long-term commitments.
- - 🔧 Easy Integration: Seamlessly integrate with your existing tools and workflows for hassle-free automation.
-
-
- [![Scrapeless Banner](https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/scrapeless.jpg)](https://www.scrapeless.com/?utm_source=github&utm_medium=ads&utm_campaign=scraping&utm_term=D4Vinci)
- ---
-
  ## Table of content
  * [Key Features](#key-features)
  * [Fetch websites as you prefer](#fetch-websites-as-you-prefer-with-async-support)
scrapling-0.2.96.dist-info/RECORD → scrapling-0.2.97.dist-info/RECORD CHANGED
@@ -1,25 +1,25 @@
- scrapling/__init__.py,sha256=5r6_yxrfXbeoh8UqUaCdmmbWH9TQxBivP9cLWUXPI5g,500
+ scrapling/__init__.py,sha256=5yeUml2K0xHe2NAALM2x2hGSl_ORcEttIZL17b1cWtg,500
  scrapling/cli.py,sha256=7yTsMhVAqqS8Z27T5dFKrR9_X8vuFjBlwYgAF22W7T8,1292
  scrapling/defaults.py,sha256=sdXeZjXEX7PmCtaa0weK0nRrAUzqZukNNqipZ_sltYE,469
- scrapling/fetchers.py,sha256=qmiJ6S-bnPWvP48Z6rKxBnSuR-tdwHlJwlIsYxGxFM0,35405
- scrapling/parser.py,sha256=b_1eHxRwHRCidyvm3F6ST6qIYvVEVU6GhTTCI1LblVk,54330
+ scrapling/fetchers.py,sha256=xwVCjAg0VCXwhB2igSLQvb0D0bOPGfg5WNtxgE7m-W0,34987
+ scrapling/parser.py,sha256=U6qFV23qeeX1pYl6mw0TZEL4FlaQw6puaoDTldUpi-M,54328
  scrapling/py.typed,sha256=frcCV1k9oG9oKj3dpUqdJg1PxRT2RSN_XKdLCPjaYaY,2
  scrapling/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  scrapling/core/_types.py,sha256=dKVi_dUxdxNtTr7sj7ySkHXDfrsmjFTfpCQeO5tGuBY,670
- scrapling/core/custom_types.py,sha256=tejeLYmWa_aLaLtMSymG4z7h6rxO-9EvmiRWEWcW54s,13022
+ scrapling/core/custom_types.py,sha256=EWGx5t5scHEB1SMsitzc8duskq-5f-Qaj40IWkNTRzM,12947
  scrapling/core/mixins.py,sha256=sozbpaGL1_O_x3U-ABM5aYWpnxpCLfdbcA9SG3P7weY,3532
- scrapling/core/storage_adaptors.py,sha256=l_ZYcdn1y69AcoPuRrPoaxqKysN62pMExrwJWYdu5MA,6220
- scrapling/core/translator.py,sha256=hFSc3mxG5pYhbwRgingeFbD_E73U799vCsvVv0uFEXw,5237
- scrapling/core/utils.py,sha256=03LzCDzmeK1TXPjIKVzHSUgSfhpe36XE8AwxlgxzJoU,3705
+ scrapling/core/storage_adaptors.py,sha256=EkSE8LlOS9SggFblBNzgyEp0fLxl8dqYU3-MAuXUitY,6216
+ scrapling/core/translator.py,sha256=ojDmNi5pFZE6Ke-AiSsTilXiPRdR8yhX3o-uVGMkap8,5236
+ scrapling/core/utils.py,sha256=0e3jD029CXj3gfA_MIKcBC0Mai9fXW2scIuoKtHy1e8,3704
  scrapling/engines/__init__.py,sha256=zA7tzqcDXP0hllwmjVewNHWipIA4JSU9mRG4J-cud0c,267
- scrapling/engines/camo.py,sha256=SHMRnIrN6599upo5-G3fZQ10455xyB-bB_EsLMjBStA,16072
+ scrapling/engines/camo.py,sha256=oYKA0l3EpOcQW2APRj5FEmslqtp9A8i_ZljqlKvIDeI,16129
  scrapling/engines/constants.py,sha256=Gb_nXFoBB4ujJkd05SKkenMe1UDiRYQA3dkmA3DunLg,3723
- scrapling/engines/pw.py,sha256=LvS1jvTf3s7mfdeQo7_OyQ5zpiOzvBu5g88hOLlQBCQ,20856
- scrapling/engines/static.py,sha256=8v6RmdsSP6fAtWNXaJG24evHPsZ2oDiBl7yfkLrdARU,10635
+ scrapling/engines/pw.py,sha256=_fy8mhkVrOnb_Qho8zKCjFyd1Y_kr2mkdo0PHrBks4M,21371
+ scrapling/engines/static.py,sha256=okrEIFfYaxqVuIXPanxQDxQpN8i88AgWODo7Dnex2EI,9306
  scrapling/engines/toolbelt/__init__.py,sha256=VQDdYm1zY9Apno6d8UrULk29vUjllZrQqD8mXL1E2Fc,402
- scrapling/engines/toolbelt/custom.py,sha256=qgONLwpxUoEIAIQBF1RcakYu8cqAAmX8qdyaol5hfjA,12813
- scrapling/engines/toolbelt/fingerprints.py,sha256=ajEHdXHr7W4hw9KcNS7XlyxNBZu37p1bRj18TiICLzU,2929
- scrapling/engines/toolbelt/navigation.py,sha256=xEfZRJefuxOCGxQOSI2llS0du0Y2XmoIPdVGUSHOd7k,4567
+ scrapling/engines/toolbelt/custom.py,sha256=dwpuEHNOd9uJbMf7sx8sXsYZhozSXStrwqfpooce1Wk,12811
+ scrapling/engines/toolbelt/fingerprints.py,sha256=spJMij0qBGvbSlVjv9xJWCF8KFDf6xnNz5fWtXWhrzY,2927
+ scrapling/engines/toolbelt/navigation.py,sha256=KyFQ4vHS4jR7z378VRGtUeXQHWr5NMy5nNp2-c_Evk8,4566
  scrapling/engines/toolbelt/bypasses/navigator_plugins.js,sha256=tbnnk3nCXB6QEQnOhDlu3n-s7lnUTAkrUsjP6FDQIQg,2104
  scrapling/engines/toolbelt/bypasses/notification_permission.js,sha256=poPM3o5WYgEX-EdiUfDCllpWfc3Umvw4jr2u6O6elus,237
  scrapling/engines/toolbelt/bypasses/pdf_viewer.js,sha256=mKjjSuP1-BOGC_2WhRYHJo_LP7lTBi2KXmP_zsHO_tI,173
@@ -41,9 +41,9 @@ tests/fetchers/sync/test_playwright.py,sha256=MEyDRaMyxDIWupG7f_xz0f0jd9Cpbd5rXC
  tests/parser/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  tests/parser/test_automatch.py,sha256=SxsNdExE8zz8AcPRQFBUjZ3Q_1-tPOd9dzVvMSZpOYQ,4908
  tests/parser/test_general.py,sha256=dyfOsc8lleoY4AxcfDUBUaD1i95xecfYuTUhKBsYjwo,12100
- scrapling-0.2.96.dist-info/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
- scrapling-0.2.96.dist-info/METADATA,sha256=yNRmjMR5qmJyH_6ob-6nwLuqD6iXIegMI-d-xQ95ZpA,69063
- scrapling-0.2.96.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
- scrapling-0.2.96.dist-info/entry_points.txt,sha256=DHyt2Blxy0P5OE2HRcP95Wz9_xo2ERCDcNqrJjYS3o8,49
- scrapling-0.2.96.dist-info/top_level.txt,sha256=ub7FkOEXeYmmYTUxd4pCrwXfBfAMIpZ1sCGmXCc14tI,16
- scrapling-0.2.96.dist-info/RECORD,,
+ scrapling-0.2.97.dist-info/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
+ scrapling-0.2.97.dist-info/METADATA,sha256=VnP3UEy6RcQytld-8ZYSF0Cpdd4fb-tKoX01jajFneo,69666
+ scrapling-0.2.97.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
+ scrapling-0.2.97.dist-info/entry_points.txt,sha256=DHyt2Blxy0P5OE2HRcP95Wz9_xo2ERCDcNqrJjYS3o8,49
+ scrapling-0.2.97.dist-info/top_level.txt,sha256=ub7FkOEXeYmmYTUxd4pCrwXfBfAMIpZ1sCGmXCc14tI,16
+ scrapling-0.2.97.dist-info/RECORD,,
scrapling-0.2.96.dist-info/WHEEL → scrapling-0.2.97.dist-info/WHEEL CHANGED
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (75.8.2)
+ Generator: setuptools (76.0.0)
  Root-Is-Purelib: true
  Tag: py3-none-any
