scrapling 0.2.96__py3-none-any.whl → 0.2.98__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
scrapling/__init__.py CHANGED
@@ -1,12 +1,41 @@
1
- # Declare top-level shortcuts
2
- from scrapling.core.custom_types import AttributesHandler, TextHandler
3
- from scrapling.fetchers import (AsyncFetcher, CustomFetcher, Fetcher,
4
- PlayWrightFetcher, StealthyFetcher)
5
- from scrapling.parser import Adaptor, Adaptors
6
1
 
7
2
  __author__ = "Karim Shoair (karim.shoair@pm.me)"
8
- __version__ = "0.2.96"
3
+ __version__ = "0.2.98"
9
4
  __copyright__ = "Copyright (c) 2024 Karim Shoair"
10
5
 
11
6
 
7
+ # A lightweight approach to create a lazy loader for each import, for backward compatibility
8
+ # This will reduce the initial memory footprint significantly (only loads what's used)
9
+ def __getattr__(name):
10
+ if name == 'Fetcher':
11
+ from scrapling.fetchers import Fetcher as cls
12
+ return cls
13
+ elif name == 'Adaptor':
14
+ from scrapling.parser import Adaptor as cls
15
+ return cls
16
+ elif name == 'Adaptors':
17
+ from scrapling.parser import Adaptors as cls
18
+ return cls
19
+ elif name == 'AttributesHandler':
20
+ from scrapling.core.custom_types import AttributesHandler as cls
21
+ return cls
22
+ elif name == 'TextHandler':
23
+ from scrapling.core.custom_types import TextHandler as cls
24
+ return cls
25
+ elif name == 'AsyncFetcher':
26
+ from scrapling.fetchers import AsyncFetcher as cls
27
+ return cls
28
+ elif name == 'StealthyFetcher':
29
+ from scrapling.fetchers import StealthyFetcher as cls
30
+ return cls
31
+ elif name == 'PlayWrightFetcher':
32
+ from scrapling.fetchers import PlayWrightFetcher as cls
33
+ return cls
34
+ elif name == 'CustomFetcher':
35
+ from scrapling.fetchers import CustomFetcher as cls
36
+ return cls
37
+ else:
38
+ raise AttributeError(f"module 'scrapling' has no attribute '{name}'")
39
+
40
+
12
41
  __all__ = ['Adaptor', 'Fetcher', 'AsyncFetcher', 'StealthyFetcher', 'PlayWrightFetcher']
@@ -19,9 +19,7 @@ class TextHandler(str):
19
19
  __slots__ = ()
20
20
 
21
21
  def __new__(cls, string):
22
- if isinstance(string, str):
23
- return super().__new__(cls, string)
24
- return super().__new__(cls, '')
22
+ return super().__new__(cls, str(string))
25
23
 
26
24
  def __getitem__(self, key: Union[SupportsIndex, slice]) -> "TextHandler":
27
25
  lst = super().__getitem__(key)
@@ -19,7 +19,7 @@ class StorageSystemMixin(ABC):
19
19
  """
20
20
  self.url = url
21
21
 
22
- @lru_cache(None, typed=True)
22
+ @lru_cache(64, typed=True)
23
23
  def _get_base_url(self, default_value: str = 'default') -> str:
24
24
  if not self.url or type(self.url) is not str:
25
25
  return default_value
@@ -51,7 +51,7 @@ class StorageSystemMixin(ABC):
51
51
  raise NotImplementedError('Storage system must implement `save` method')
52
52
 
53
53
  @staticmethod
54
- @lru_cache(None, typed=True)
54
+ @lru_cache(128, typed=True)
55
55
  def _get_hash(identifier: str) -> str:
56
56
  """If you want to hash identifier in your storage system, use this safer"""
57
57
  identifier = identifier.lower().strip()
@@ -63,7 +63,7 @@ class StorageSystemMixin(ABC):
63
63
  return f"{hash_value}_{len(identifier)}" # Length to reduce collision chance
64
64
 
65
65
 
66
- @lru_cache(None, typed=True)
66
+ @lru_cache(1, typed=True)
67
67
  class SQLiteStorageSystem(StorageSystemMixin):
68
68
  """The recommended system to use, it's race condition safe and thread safe.
69
69
  Mainly built so the library can run in threaded frameworks like scrapy or threaded tools
@@ -139,6 +139,9 @@ class TranslatorMixin:
139
139
 
140
140
 
141
141
  class HTMLTranslator(TranslatorMixin, OriginalHTMLTranslator):
142
- @lru_cache(maxsize=2048)
142
+ @lru_cache(maxsize=256)
143
143
  def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str:
144
144
  return super().css_to_xpath(css, prefix)
145
+
146
+
147
+ translator_instance = HTMLTranslator()
scrapling/core/utils.py CHANGED
@@ -115,7 +115,7 @@ class _StorageTools:
115
115
  # return _impl
116
116
 
117
117
 
118
- @lru_cache(None, typed=True)
118
+ @lru_cache(128, typed=True)
119
119
  def clean_spaces(string):
120
120
  string = string.replace('\t', ' ')
121
121
  string = re.sub('[\n|\r]', '', string)
scrapling/defaults.py CHANGED
@@ -1,10 +1,19 @@
1
- from .fetchers import AsyncFetcher as _AsyncFetcher
2
- from .fetchers import Fetcher as _Fetcher
3
- from .fetchers import PlayWrightFetcher as _PlayWrightFetcher
4
- from .fetchers import StealthyFetcher as _StealthyFetcher
5
-
6
1
  # If you are going to use Fetchers with the default settings, import them from this file instead for a cleaner looking code
7
- Fetcher = _Fetcher()
8
- AsyncFetcher = _AsyncFetcher()
9
- StealthyFetcher = _StealthyFetcher()
10
- PlayWrightFetcher = _PlayWrightFetcher()
2
+
3
+ # A lightweight approach to create a lazy loader for each import, for backward compatibility
4
+ # This will reduce the initial memory footprint significantly (only loads what's used)
5
+ def __getattr__(name):
6
+ if name == 'Fetcher':
7
+ from scrapling.fetchers import Fetcher as cls
8
+ return cls()
9
+ elif name == 'AsyncFetcher':
10
+ from scrapling.fetchers import AsyncFetcher as cls
11
+ return cls()
12
+ elif name == 'StealthyFetcher':
13
+ from scrapling.fetchers import StealthyFetcher as cls
14
+ return cls()
15
+ elif name == 'PlayWrightFetcher':
16
+ from scrapling.fetchers import PlayWrightFetcher as cls
17
+ return cls()
18
+ else:
19
+ raise AttributeError(f"module 'scrapling' has no attribute '{name}'")
scrapling/engines/camo.py CHANGED
@@ -15,12 +15,12 @@ from scrapling.engines.toolbelt import (Response, StatusText,
15
15
 
16
16
  class CamoufoxEngine:
17
17
  def __init__(
18
- self, headless: Optional[Union[bool, Literal['virtual']]] = True, block_images: Optional[bool] = False, disable_resources: Optional[bool] = False,
19
- block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = True, network_idle: Optional[bool] = False, humanize: Optional[Union[bool, float]] = True,
18
+ self, headless: Union[bool, Literal['virtual']] = True, block_images: bool = False, disable_resources: bool = False,
19
+ block_webrtc: bool = False, allow_webgl: bool = True, network_idle: bool = False, humanize: Union[bool, float] = True,
20
20
  timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, addons: Optional[List[str]] = None,
21
- wait_selector_state: Optional[SelectorWaitStates] = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
22
- proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = False,
23
- geoip: Optional[bool] = False,
21
+ wait_selector_state: SelectorWaitStates = 'attached', google_search: bool = True, extra_headers: Optional[Dict[str, str]] = None,
22
+ proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: bool = False, disable_ads: bool = False,
23
+ geoip: bool = False,
24
24
  adaptor_arguments: Dict = None,
25
25
  ):
26
26
  """An engine that utilizes Camoufox library, check the `StealthyFetcher` class for more documentation.
@@ -64,107 +64,140 @@ class CamoufoxEngine:
64
64
  self.addons = addons or []
65
65
  self.humanize = humanize
66
66
  self.timeout = check_type_validity(timeout, [int, float], 30000)
67
+
68
+ # Page action callable validation
69
+ self.page_action = None
67
70
  if page_action is not None:
68
71
  if callable(page_action):
69
72
  self.page_action = page_action
70
73
  else:
71
- self.page_action = None
72
74
  log.error('[Ignored] Argument "page_action" must be callable')
73
- else:
74
- self.page_action = None
75
75
 
76
76
  self.wait_selector = wait_selector
77
77
  self.wait_selector_state = wait_selector_state
78
78
  self.adaptor_arguments = adaptor_arguments if adaptor_arguments else {}
79
79
 
80
+ def _get_camoufox_options(self):
81
+ """Return consistent browser options dictionary for both sync and async methods"""
82
+ return {
83
+ "geoip": self.geoip,
84
+ "proxy": self.proxy,
85
+ "enable_cache": True,
86
+ "addons": self.addons,
87
+ "exclude_addons": [] if self.disable_ads else [DefaultAddons.UBO],
88
+ "headless": self.headless,
89
+ "humanize": self.humanize,
90
+ "i_know_what_im_doing": True, # To turn warnings off with the user configurations
91
+ "allow_webgl": self.allow_webgl,
92
+ "block_webrtc": self.block_webrtc,
93
+ "block_images": self.block_images,  # Careful! It can make some websites (e.g. StackOverflow) never finish loading, even in headful mode
94
+ "os": None if self.os_randomize else get_os_name(),
95
+ }
96
+
97
+ def _process_response_history(self, first_response):
98
+ """Process response history to build a list of Response objects"""
99
+ history = []
100
+ current_request = first_response.request.redirected_from
101
+
102
+ try:
103
+ while current_request:
104
+ try:
105
+ current_response = current_request.response()
106
+ history.insert(0, Response(
107
+ url=current_request.url,
108
+ # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
109
+ text='',
110
+ body=b'',
111
+ status=current_response.status if current_response else 301,
112
+ reason=(current_response.status_text or StatusText.get(current_response.status)) if current_response else StatusText.get(301),
113
+ encoding=current_response.headers.get('content-type', '') or 'utf-8',
114
+ cookies={},
115
+ headers=current_response.all_headers() if current_response else {},
116
+ request_headers=current_request.all_headers(),
117
+ **self.adaptor_arguments
118
+ ))
119
+ except Exception as e:
120
+ log.error(f"Error processing redirect: {e}")
121
+ break
122
+
123
+ current_request = current_request.redirected_from
124
+ except Exception as e:
125
+ log.error(f"Error processing response history: {e}")
126
+
127
+ return history
128
+
80
129
  def fetch(self, url: str) -> Response:
81
130
  """Opens up the browser and do your request based on your chosen options.
82
131
 
83
132
  :param url: Target url.
84
133
  :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
85
134
  """
86
- addons = [] if self.disable_ads else [DefaultAddons.UBO]
87
- # Store the final response
88
135
  final_response = None
136
+ referer = generate_convincing_referer(url) if self.google_search else None
89
137
 
90
138
  def handle_response(finished_response):
91
139
  nonlocal final_response
92
140
  if finished_response.request.resource_type == "document" and finished_response.request.is_navigation_request():
93
141
  final_response = finished_response
94
142
 
95
- with Camoufox(
96
- geoip=self.geoip,
97
- proxy=self.proxy,
98
- enable_cache=True,
99
- addons=self.addons,
100
- exclude_addons=addons,
101
- headless=self.headless,
102
- humanize=self.humanize,
103
- i_know_what_im_doing=True, # To turn warnings off with the user configurations
104
- allow_webgl=self.allow_webgl,
105
- block_webrtc=self.block_webrtc,
106
- block_images=self.block_images, # Careful! it makes some websites doesn't finish loading at all like stackoverflow even in headful
107
- os=None if self.os_randomize else get_os_name(),
108
- ) as browser:
109
- page = browser.new_page()
143
+ with Camoufox(**self._get_camoufox_options()) as browser:
144
+ context = browser.new_context()
145
+ page = context.new_page()
110
146
  page.set_default_navigation_timeout(self.timeout)
111
147
  page.set_default_timeout(self.timeout)
112
- # Listen for all responses
113
148
  page.on("response", handle_response)
149
+
114
150
  if self.disable_resources:
115
151
  page.route("**/*", intercept_route)
116
152
 
117
153
  if self.extra_headers:
118
154
  page.set_extra_http_headers(self.extra_headers)
119
155
 
120
- first_response = page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
156
+ first_response = page.goto(url, referer=referer)
121
157
  page.wait_for_load_state(state="domcontentloaded")
158
+
122
159
  if self.network_idle:
123
160
  page.wait_for_load_state('networkidle')
124
161
 
125
162
  if self.page_action is not None:
126
- page = self.page_action(page)
163
+ try:
164
+ page = self.page_action(page)
165
+ except Exception as e:
166
+ log.error(f"Error executing page_action: {e}")
127
167
 
128
168
  if self.wait_selector and type(self.wait_selector) is str:
129
- waiter = page.locator(self.wait_selector)
130
- waiter.first.wait_for(state=self.wait_selector_state)
131
- # Wait again after waiting for the selector, helpful with protections like Cloudflare
132
- page.wait_for_load_state(state="load")
133
- page.wait_for_load_state(state="domcontentloaded")
134
- if self.network_idle:
135
- page.wait_for_load_state('networkidle')
169
+ try:
170
+ waiter = page.locator(self.wait_selector)
171
+ waiter.first.wait_for(state=self.wait_selector_state)
172
+ # Wait again after waiting for the selector, helpful with protections like Cloudflare
173
+ page.wait_for_load_state(state="load")
174
+ page.wait_for_load_state(state="domcontentloaded")
175
+ if self.network_idle:
176
+ page.wait_for_load_state('networkidle')
177
+ except Exception as e:
178
+ log.error(f"Error waiting for selector {self.wait_selector}: {e}")
136
179
 
137
180
  # In case we didn't catch a document type somehow
138
181
  final_response = final_response if final_response else first_response
182
+ if not final_response:
183
+ raise ValueError("Failed to get a response from the page")
184
+
139
185
  # This will be parsed inside `Response`
140
186
  encoding = final_response.headers.get('content-type', '') or 'utf-8' # default encoding
141
187
  # PlayWright API sometimes give empty status text for some reason!
142
188
  status_text = final_response.status_text or StatusText.get(final_response.status)
143
189
 
144
- history = []
145
- current_request = first_response.request.redirected_from
146
- while current_request:
147
- current_response = current_request.response()
148
-
149
- history.insert(0, Response(
150
- url=current_request.url,
151
- # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
152
- text='',
153
- body=b'',
154
- status=current_response.status if current_response else 301,
155
- reason=(current_response.status_text or StatusText.get(current_response.status)) if current_response else StatusText.get(301),
156
- encoding=current_response.headers.get('content-type', '') or 'utf-8',
157
- cookies={},
158
- headers=current_response.all_headers() if current_response else {},
159
- request_headers=current_request.all_headers(),
160
- **self.adaptor_arguments
161
- ))
162
- current_request = current_request.redirected_from
190
+ history = self._process_response_history(first_response)
191
+ try:
192
+ page_content = page.content()
193
+ except Exception as e:
194
+ log.error(f"Error getting page content: {e}")
195
+ page_content = ""
163
196
 
164
197
  response = Response(
165
198
  url=page.url,
166
- text=page.content(),
167
- body=page.content().encode('utf-8'),
199
+ text=page_content,
200
+ body=page_content.encode('utf-8'),
168
201
  status=final_response.status,
169
202
  reason=status_text,
170
203
  encoding=encoding,
@@ -175,6 +208,7 @@ class CamoufoxEngine:
175
208
  **self.adaptor_arguments
176
209
  )
177
210
  page.close()
211
+ context.close()
178
212
 
179
213
  return response
180
214
 
@@ -184,88 +218,72 @@ class CamoufoxEngine:
184
218
  :param url: Target url.
185
219
  :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
186
220
  """
187
- addons = [] if self.disable_ads else [DefaultAddons.UBO]
188
- # Store the final response
189
221
  final_response = None
222
+ referer = generate_convincing_referer(url) if self.google_search else None
190
223
 
191
224
  async def handle_response(finished_response):
192
225
  nonlocal final_response
193
226
  if finished_response.request.resource_type == "document" and finished_response.request.is_navigation_request():
194
227
  final_response = finished_response
195
228
 
196
- async with AsyncCamoufox(
197
- geoip=self.geoip,
198
- proxy=self.proxy,
199
- enable_cache=True,
200
- addons=self.addons,
201
- exclude_addons=addons,
202
- headless=self.headless,
203
- humanize=self.humanize,
204
- i_know_what_im_doing=True, # To turn warnings off with the user configurations
205
- allow_webgl=self.allow_webgl,
206
- block_webrtc=self.block_webrtc,
207
- block_images=self.block_images, # Careful! it makes some websites doesn't finish loading at all like stackoverflow even in headful
208
- os=None if self.os_randomize else get_os_name(),
209
- ) as browser:
210
- page = await browser.new_page()
229
+ async with AsyncCamoufox(**self._get_camoufox_options()) as browser:
230
+ context = await browser.new_context()
231
+ page = await context.new_page()
211
232
  page.set_default_navigation_timeout(self.timeout)
212
233
  page.set_default_timeout(self.timeout)
213
- # Listen for all responses
214
234
  page.on("response", handle_response)
235
+
215
236
  if self.disable_resources:
216
237
  await page.route("**/*", async_intercept_route)
217
238
 
218
239
  if self.extra_headers:
219
240
  await page.set_extra_http_headers(self.extra_headers)
220
241
 
221
- first_response = await page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
242
+ first_response = await page.goto(url, referer=referer)
222
243
  await page.wait_for_load_state(state="domcontentloaded")
244
+
223
245
  if self.network_idle:
224
246
  await page.wait_for_load_state('networkidle')
225
247
 
226
248
  if self.page_action is not None:
227
- page = await self.page_action(page)
249
+ try:
250
+ page = await self.page_action(page)
251
+ except Exception as e:
252
+ log.error(f"Error executing async page_action: {e}")
228
253
 
229
254
  if self.wait_selector and type(self.wait_selector) is str:
230
- waiter = page.locator(self.wait_selector)
231
- await waiter.first.wait_for(state=self.wait_selector_state)
232
- # Wait again after waiting for the selector, helpful with protections like Cloudflare
233
- await page.wait_for_load_state(state="load")
234
- await page.wait_for_load_state(state="domcontentloaded")
235
- if self.network_idle:
236
- await page.wait_for_load_state('networkidle')
255
+ try:
256
+ waiter = page.locator(self.wait_selector)
257
+ await waiter.first.wait_for(state=self.wait_selector_state)
258
+ # Wait again after waiting for the selector, helpful with protections like Cloudflare
259
+ await page.wait_for_load_state(state="load")
260
+ await page.wait_for_load_state(state="domcontentloaded")
261
+ if self.network_idle:
262
+ await page.wait_for_load_state('networkidle')
263
+ except Exception as e:
264
+ log.error(f"Error waiting for selector {self.wait_selector}: {e}")
237
265
 
238
266
  # In case we didn't catch a document type somehow
239
267
  final_response = final_response if final_response else first_response
268
+ if not final_response:
269
+ raise ValueError("Failed to get a response from the page")
270
+
240
271
  # This will be parsed inside `Response`
241
272
  encoding = final_response.headers.get('content-type', '') or 'utf-8' # default encoding
242
273
  # PlayWright API sometimes give empty status text for some reason!
243
274
  status_text = final_response.status_text or StatusText.get(final_response.status)
244
275
 
245
- history = []
246
- current_request = first_response.request.redirected_from
247
- while current_request:
248
- current_response = await current_request.response()
249
-
250
- history.insert(0, Response(
251
- url=current_request.url,
252
- # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
253
- text='',
254
- body=b'',
255
- status=current_response.status if current_response else 301,
256
- reason=(current_response.status_text or StatusText.get(current_response.status)) if current_response else StatusText.get(301),
257
- encoding=current_response.headers.get('content-type', '') or 'utf-8',
258
- cookies={},
259
- headers=await current_response.all_headers() if current_response else {},
260
- request_headers=await current_request.all_headers(),
261
- **self.adaptor_arguments
262
- ))
263
- current_request = current_request.redirected_from
276
+ history = self._process_response_history(first_response)
277
+ try:
278
+ page_content = await page.content()
279
+ except Exception as e:
280
+ log.error(f"Error getting page content in async: {e}")
281
+ page_content = ""
264
282
 
265
283
  response = Response(
266
284
  url=page.url,
267
- text=await page.content(),
268
- body=(await page.content()).encode('utf-8'),
285
+ text=page_content,
286
+ body=page_content.encode('utf-8'),
269
287
  status=final_response.status,
270
288
  reason=status_text,
271
289
  encoding=encoding,
@@ -276,5 +294,6 @@ class CamoufoxEngine:
276
294
  **self.adaptor_arguments
277
295
  )
278
296
  await page.close()
297
+ await context.close()
279
298
 
280
299
  return response