scrapling 0.2.97__py3-none-any.whl → 0.2.99__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as published to their public registry. It is provided for informational purposes only.
scrapling/__init__.py CHANGED
@@ -1,12 +1,41 @@
-# Declare top-level shortcuts
-from scrapling.core.custom_types import AttributesHandler, TextHandler
-from scrapling.fetchers import (AsyncFetcher, CustomFetcher, Fetcher,
-                                PlayWrightFetcher, StealthyFetcher)
-from scrapling.parser import Adaptor, Adaptors
 
 __author__ = "Karim Shoair (karim.shoair@pm.me)"
-__version__ = "0.2.97"
+__version__ = "0.2.99"
 __copyright__ = "Copyright (c) 2024 Karim Shoair"
 
 
+# A lightweight approach to create lazy loader for each import for backward compatibility
+# This will reduces initial memory footprint significantly (only loads what's used)
+def __getattr__(name):
+    if name == 'Fetcher':
+        from scrapling.fetchers import Fetcher as cls
+        return cls
+    elif name == 'Adaptor':
+        from scrapling.parser import Adaptor as cls
+        return cls
+    elif name == 'Adaptors':
+        from scrapling.parser import Adaptors as cls
+        return cls
+    elif name == 'AttributesHandler':
+        from scrapling.core.custom_types import AttributesHandler as cls
+        return cls
+    elif name == 'TextHandler':
+        from scrapling.core.custom_types import TextHandler as cls
+        return cls
+    elif name == 'AsyncFetcher':
+        from scrapling.fetchers import AsyncFetcher as cls
+        return cls
+    elif name == 'StealthyFetcher':
+        from scrapling.fetchers import StealthyFetcher as cls
+        return cls
+    elif name == 'PlayWrightFetcher':
+        from scrapling.fetchers import PlayWrightFetcher as cls
+        return cls
+    elif name == 'CustomFetcher':
+        from scrapling.fetchers import CustomFetcher as cls
+        return cls
+    else:
+        raise AttributeError(f"module 'scrapling' has no attribute '{name}'")
+
+
 __all__ = ['Adaptor', 'Fetcher', 'AsyncFetcher', 'StealthyFetcher', 'PlayWrightFetcher']
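
The eager top-level imports are replaced by a module-level `__getattr__` (PEP 562), so statements like `from scrapling import Fetcher` keep working while the heavier submodules are imported only on first access. A minimal sketch of how this behaves in practice (the prints are illustrative, not output from the package):

    import sys
    import scrapling

    # Importing the package no longer pulls in the fetcher/parser modules eagerly
    print('scrapling.fetchers' in sys.modules)   # expected: False

    # First attribute access goes through __getattr__ and imports on demand
    Fetcher = scrapling.Fetcher
    print('scrapling.fetchers' in sys.modules)   # expected: True

    # Unknown names still fail loudly via the final else branch
    try:
        scrapling.DoesNotExist
    except AttributeError as exc:
        print(exc)
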
@@ -19,7 +19,7 @@ class StorageSystemMixin(ABC):
         """
         self.url = url
 
-    @lru_cache(126, typed=True)
+    @lru_cache(64, typed=True)
     def _get_base_url(self, default_value: str = 'default') -> str:
         if not self.url or type(self.url) is not str:
             return default_value
@@ -51,7 +51,7 @@ class StorageSystemMixin(ABC):
         raise NotImplementedError('Storage system must implement `save` method')
 
     @staticmethod
-    @lru_cache(256, typed=True)
+    @lru_cache(128, typed=True)
     def _get_hash(identifier: str) -> str:
         """If you want to hash identifier in your storage system, use this safer"""
         identifier = identifier.lower().strip()
@@ -63,7 +63,7 @@ class StorageSystemMixin(ABC):
         return f"{hash_value}_{len(identifier)}"  # Length to reduce collision chance
 
 
-@lru_cache(10, typed=True)
+@lru_cache(1, typed=True)
 class SQLiteStorageSystem(StorageSystemMixin):
     """The recommended system to use, it's race condition safe and thread safe.
     Mainly built so the library can run in threaded frameworks like scrapy or threaded tools
@@ -142,3 +142,6 @@ class HTMLTranslator(TranslatorMixin, OriginalHTMLTranslator):
     @lru_cache(maxsize=256)
     def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str:
         return super().css_to_xpath(css, prefix)
+
+
+translator_instance = HTMLTranslator()
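
A module-level `translator_instance` is added so one translator can be shared instead of being constructed by callers, while `css_to_xpath` itself stays wrapped in `@lru_cache(maxsize=256)`. A hedged usage sketch, assuming this hunk belongs to `scrapling/core/translator.py` (no file header is shown for it):

    # Shared translator instance; repeated translations of the same selector
    # hit the lru_cache on css_to_xpath.
    from scrapling.core.translator import translator_instance

    xpath = translator_instance.css_to_xpath('div.product > a')
    print(xpath)
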
scrapling/core/utils.py CHANGED
@@ -115,7 +115,7 @@ class _StorageTools:
 #     return _impl
 
 
-@lru_cache(256, typed=True)
+@lru_cache(128, typed=True)
 def clean_spaces(string):
     string = string.replace('\t', ' ')
     string = re.sub('[\n|\r]', '', string)
scrapling/defaults.py CHANGED
@@ -1,10 +1,25 @@
-from .fetchers import AsyncFetcher as _AsyncFetcher
-from .fetchers import Fetcher as _Fetcher
-from .fetchers import PlayWrightFetcher as _PlayWrightFetcher
-from .fetchers import StealthyFetcher as _StealthyFetcher
+# Left this file for backward-compatibility before 0.2.99
+from scrapling.core.utils import log
 
-# If you are going to use Fetchers with the default settings, import them from this file instead for a cleaner looking code
-Fetcher = _Fetcher()
-AsyncFetcher = _AsyncFetcher()
-StealthyFetcher = _StealthyFetcher()
-PlayWrightFetcher = _PlayWrightFetcher()
+
+# A lightweight approach to create lazy loader for each import for backward compatibility
+# This will reduces initial memory footprint significantly (only loads what's used)
+def __getattr__(name):
+    if name == 'Fetcher':
+        from scrapling.fetchers import Fetcher as cls
+        log.warning('This import is deprecated now and it will be removed with v0.3. Use `from scrapling.fetchers import Fetcher` instead')
+        return cls
+    elif name == 'AsyncFetcher':
+        from scrapling.fetchers import AsyncFetcher as cls
+        log.warning('This import is deprecated now and it will be removed with v0.3. Use `from scrapling.fetchers import AsyncFetcher` instead')
+        return cls
+    elif name == 'StealthyFetcher':
+        from scrapling.fetchers import StealthyFetcher as cls
+        log.warning('This import is deprecated now and it will be removed with v0.3. Use `from scrapling.fetchers import StealthyFetcher` instead')
+        return cls
+    elif name == 'PlayWrightFetcher':
+        from scrapling.fetchers import PlayWrightFetcher as cls
+        log.warning('This import is deprecated now and it will be removed with v0.3. Use `from scrapling.fetchers import PlayWrightFetcher` instead')
+        return cls
+    else:
+        raise AttributeError(f"module 'scrapling' has no attribute '{name}'")
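
`scrapling/defaults.py` no longer builds fetcher instances at import time; the same lazy `__getattr__` pattern returns the classes and logs a deprecation warning pointing to the new import path. A short sketch of what that means for existing code:

    # Deprecated path: still resolves through __getattr__ above, but logs a
    # warning asking you to switch before v0.3.
    from scrapling.defaults import Fetcher as DeprecatedFetcher

    # The import path the warning recommends instead:
    from scrapling.fetchers import Fetcher

    # Both names now refer to the same class object; defaults.py no longer
    # exposes the pre-built instances it had in 0.2.97.
    assert DeprecatedFetcher is Fetcher
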
scrapling/engines/camo.py CHANGED
@@ -16,12 +16,13 @@ from scrapling.engines.toolbelt import (Response, StatusText,
 class CamoufoxEngine:
     def __init__(
             self, headless: Union[bool, Literal['virtual']] = True, block_images: bool = False, disable_resources: bool = False,
-            block_webrtc: bool = False, allow_webgl: bool = True, network_idle: bool = False, humanize: Union[bool, float] = True,
+            block_webrtc: bool = False, allow_webgl: bool = True, network_idle: bool = False, humanize: Union[bool, float] = True, wait: Optional[int] = 0,
             timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, addons: Optional[List[str]] = None,
             wait_selector_state: SelectorWaitStates = 'attached', google_search: bool = True, extra_headers: Optional[Dict[str, str]] = None,
             proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: bool = False, disable_ads: bool = False,
             geoip: bool = False,
             adaptor_arguments: Dict = None,
+            additional_arguments: Dict = None
     ):
         """An engine that utilizes Camoufox library, check the `StealthyFetcher` class for more documentation.
 
@@ -38,6 +39,7 @@ class CamoufoxEngine:
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
         :param disable_ads: Disabled by default, this installs `uBlock Origin` addon on the browser if enabled.
         :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
+        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning `Response` object.
         :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
         :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
         :param wait_selector: Wait for a specific css selector to be in a specific state.
@@ -48,6 +50,7 @@ class CamoufoxEngine:
         :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
         :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
         :param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
+        :param additional_arguments: Additional arguments to be passed to Camoufox as additional settings and it takes higher priority than Scrapling's settings.
         """
         self.headless = headless
         self.block_images = bool(block_images)
@@ -60,10 +63,12 @@ class CamoufoxEngine:
         self.disable_ads = bool(disable_ads)
         self.geoip = bool(geoip)
         self.extra_headers = extra_headers or {}
+        self.additional_arguments = additional_arguments or {}
         self.proxy = construct_proxy_dict(proxy)
         self.addons = addons or []
         self.humanize = humanize
         self.timeout = check_type_validity(timeout, [int, float], 30000)
+        self.wait = check_type_validity(wait, [int, float], 0)
 
         # Page action callable validation
         self.page_action = None
@@ -92,6 +97,7 @@ class CamoufoxEngine:
             "block_webrtc": self.block_webrtc,
             "block_images": self.block_images,  # Careful! it makes some websites doesn't finish loading at all like stackoverflow even in headful
             "os": None if self.os_randomize else get_os_name(),
+            **self.additional_arguments
         }
 
     def _process_response_history(self, first_response):
@@ -126,6 +132,38 @@ class CamoufoxEngine:
 
         return history
 
+    async def _async_process_response_history(self, first_response):
+        """Process response history to build a list of Response objects"""
+        history = []
+        current_request = first_response.request.redirected_from
+
+        try:
+            while current_request:
+                try:
+                    current_response = await current_request.response()
+                    history.insert(0, Response(
+                        url=current_request.url,
+                        # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
+                        text='',
+                        body=b'',
+                        status=current_response.status if current_response else 301,
+                        reason=(current_response.status_text or StatusText.get(current_response.status)) if current_response else StatusText.get(301),
+                        encoding=current_response.headers.get('content-type', '') or 'utf-8',
+                        cookies={},
+                        headers=await current_response.all_headers() if current_response else {},
+                        request_headers=await current_request.all_headers(),
+                        **self.adaptor_arguments
+                    ))
+                except Exception as e:
+                    log.error(f"Error processing redirect: {e}")
+                    break
+
+                current_request = current_request.redirected_from
+        except Exception as e:
+            log.error(f"Error processing response history: {e}")
+
+        return history
+
     def fetch(self, url: str) -> Response:
         """Opens up the browser and do your request based on your chosen options.
 
@@ -177,6 +215,7 @@ class CamoufoxEngine:
                 except Exception as e:
                     log.error(f"Error waiting for selector {self.wait_selector}: {e}")
 
+            page.wait_for_timeout(self.wait)
             # In case we didn't catch a document type somehow
             final_response = final_response if final_response else first_response
             if not final_response:
@@ -263,6 +302,7 @@ class CamoufoxEngine:
                 except Exception as e:
                     log.error(f"Error waiting for selector {self.wait_selector}: {e}")
 
+            await page.wait_for_timeout(self.wait)
             # In case we didn't catch a document type somehow
             final_response = final_response if final_response else first_response
             if not final_response:
@@ -273,7 +313,7 @@ class CamoufoxEngine:
             # PlayWright API sometimes give empty status text for some reason!
             status_text = final_response.status_text or StatusText.get(final_response.status)
 
-            history = self._process_response_history(first_response)
+            history = await self._async_process_response_history(first_response)
             try:
                 page_content = await page.content()
             except Exception as e:
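
Two new options flow through `CamoufoxEngine`: `wait` (applied with `page.wait_for_timeout(self.wait)` after the selector wait in both the sync and async paths) and `additional_arguments` (merged last into the Camoufox launch options, so it overrides Scrapling's own settings). A hedged usage sketch, assuming `StealthyFetcher.fetch()` forwards these keyword arguments to the engine unchanged:

    from scrapling.fetchers import StealthyFetcher

    page = StealthyFetcher.fetch(
        'https://example.com',
        headless=True,
        # New: pause an extra 2000 ms after everything finishes, before the
        # page is closed and the Response object is built.
        wait=2000,
        # New: passed straight through to Camoufox and applied last, so it wins
        # over Scrapling's defaults ('block_webgl' is an assumed Camoufox option
        # used only for illustration).
        additional_arguments={'block_webgl': True},
    )
    print(page.status)
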
scrapling/engines/pw.py CHANGED
@@ -21,6 +21,7 @@ class PlaywrightEngine:
             useragent: Optional[str] = None,
             network_idle: bool = False,
             timeout: Optional[float] = 30000,
+            wait: Optional[int] = 0,
             page_action: Callable = None,
             wait_selector: Optional[str] = None,
             locale: Optional[str] = 'en-US',
@@ -46,6 +47,7 @@ class PlaywrightEngine:
         :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
         :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
+        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning `Response` object.
         :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
         :param wait_selector: Wait for a specific css selector to be in a specific state.
         :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
@@ -76,6 +78,7 @@ class PlaywrightEngine:
         self.cdp_url = cdp_url
         self.useragent = useragent
         self.timeout = check_type_validity(timeout, [int, float], 30000)
+        self.wait = check_type_validity(wait, [int, float], 0)
         if page_action is not None:
             if callable(page_action):
                 self.page_action = page_action
@@ -126,7 +129,7 @@ class PlaywrightEngine:
 
         return cdp_url
 
-    @lru_cache(126, typed=True)
+    @lru_cache(32, typed=True)
     def __set_flags(self):
         """Returns the flags that will be used while launching the browser if stealth mode is enabled"""
         flags = DEFAULT_STEALTH_FLAGS
@@ -169,7 +172,7 @@ class PlaywrightEngine:
 
         return context_kwargs
 
-    @lru_cache(10)
+    @lru_cache(1)
     def __stealth_scripts(self):
         # Basic bypasses nothing fancy as I'm still working on it
         # But with adding these bypasses to the above config, it bypasses many online tests like
@@ -220,6 +223,38 @@ class PlaywrightEngine:
 
         return history
 
+    async def _async_process_response_history(self, first_response):
+        """Process response history to build a list of Response objects"""
+        history = []
+        current_request = first_response.request.redirected_from
+
+        try:
+            while current_request:
+                try:
+                    current_response = await current_request.response()
+                    history.insert(0, Response(
+                        url=current_request.url,
+                        # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
+                        text='',
+                        body=b'',
+                        status=current_response.status if current_response else 301,
+                        reason=(current_response.status_text or StatusText.get(current_response.status)) if current_response else StatusText.get(301),
+                        encoding=current_response.headers.get('content-type', '') or 'utf-8',
+                        cookies={},
+                        headers=await current_response.all_headers() if current_response else {},
+                        request_headers=await current_request.all_headers(),
+                        **self.adaptor_arguments
+                    ))
+                except Exception as e:
+                    log.error(f"Error processing redirect: {e}")
+                    break
+
+                current_request = current_request.redirected_from
+        except Exception as e:
+            log.error(f"Error processing response history: {e}")
+
+        return history
+
     def fetch(self, url: str) -> Response:
         """Opens up the browser and do your request based on your chosen options.
 
@@ -289,6 +324,7 @@ class PlaywrightEngine:
                 except Exception as e:
                     log.error(f"Error waiting for selector {self.wait_selector}: {e}")
 
+            page.wait_for_timeout(self.wait)
             # In case we didn't catch a document type somehow
             final_response = final_response if final_response else first_response
             if not final_response:
@@ -392,6 +428,7 @@ class PlaywrightEngine:
                 except Exception as e:
                     log.error(f"Error waiting for selector {self.wait_selector}: {e}")
 
+            await page.wait_for_timeout(self.wait)
             # In case we didn't catch a document type somehow
             final_response = final_response if final_response else first_response
             if not final_response:
@@ -402,7 +439,7 @@ class PlaywrightEngine:
             # PlayWright API sometimes give empty status text for some reason!
             status_text = final_response.status_text or StatusText.get(final_response.status)
 
-            history = self._process_response_history(first_response)
+            history = await self._async_process_response_history(first_response)
             try:
                 page_content = await page.content()
             except Exception as e:
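
`PlaywrightEngine` gains the same `wait` option, validated through `check_type_validity` and applied with `page.wait_for_timeout(self.wait)` right before the final `Response` is assembled, in both the sync and async paths. A hedged sketch, assuming `PlayWrightFetcher.fetch()` passes its keyword arguments through to the engine:

    from scrapling.fetchers import PlayWrightFetcher

    page = PlayWrightFetcher.fetch(
        'https://example.com',
        network_idle=True,   # existing option: wait for ~500 ms of network silence
        wait=1000,           # new option: extra 1000 ms pause before returning the Response
    )
    print(page.status, len(page.body))
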
@@ -7,7 +7,7 @@ from scrapling.core.utils import log, lru_cache
 from .toolbelt import Response, generate_convincing_referer, generate_headers
 
 
-@lru_cache(5, typed=True)  # Singleton easily
+@lru_cache(2, typed=True)  # Singleton easily
 class StaticEngine:
     def __init__(
             self, url: str, proxy: Optional[str] = None, stealthy_headers: bool = True, follow_redirects: bool = True,
@@ -16,7 +16,7 @@ class ResponseEncoding:
     __ISO_8859_1_CONTENT_TYPES = {"text/plain", "text/html", "text/css", "text/javascript"}
 
     @classmethod
-    @lru_cache(maxsize=256)
+    @lru_cache(maxsize=128)
     def __parse_content_type(cls, header_value: str) -> Tuple[str, Dict[str, str]]:
         """Parse content type and parameters from a content-type header value.
 
@@ -38,7 +38,7 @@ class ResponseEncoding:
         return content_type, params
 
     @classmethod
-    @lru_cache(maxsize=256)
+    @lru_cache(maxsize=128)
     def get_value(cls, content_type: Optional[str], text: Optional[str] = 'test') -> str:
         """Determine the appropriate character encoding from a content-type header.
 
@@ -105,41 +105,77 @@ class Response(Adaptor):
 
 
 class BaseFetcher:
-    def __init__(
-            self, huge_tree: bool = True, keep_comments: Optional[bool] = False, auto_match: Optional[bool] = True,
-            storage: Any = SQLiteStorageSystem, storage_args: Optional[Dict] = None,
-            automatch_domain: Optional[str] = None, keep_cdata: Optional[bool] = False,
-    ):
-        """Arguments below are the same from the Adaptor class so you can pass them directly, the rest of Adaptor's arguments
-        are detected and passed automatically from the Fetcher based on the response for accessibility.
-
-        :param huge_tree: Enabled by default, should always be enabled when parsing large HTML documents. This controls
-            libxml2 feature that forbids parsing certain large documents to protect from possible memory exhaustion.
-        :param keep_comments: While parsing the HTML body, drop comments or not. Disabled by default for obvious reasons
-        :param keep_cdata: While parsing the HTML body, drop cdata or not. Disabled by default for cleaner HTML.
-        :param auto_match: Globally turn-off the auto-match feature in all functions, this argument takes higher
-            priority over all auto-match related arguments/functions in the class.
-        :param storage: The storage class to be passed for auto-matching functionalities, see ``Docs`` for more info.
-        :param storage_args: A dictionary of ``argument->value`` pairs to be passed for the storage class.
-            If empty, default values will be used.
-        :param automatch_domain: For cases where you want to automatch selectors across different websites as if they were on the same website, use this argument to unify them.
-            Otherwise, the domain of the request is used by default.
+    __slots__ = ()
+    huge_tree: bool = True
+    auto_match: Optional[bool] = False
+    storage: Any = SQLiteStorageSystem
+    keep_cdata: Optional[bool] = False
+    storage_args: Optional[Dict] = None
+    keep_comments: Optional[bool] = False
+    automatch_domain: Optional[str] = None
+    parser_keywords: Tuple = ('huge_tree', 'auto_match', 'storage', 'keep_cdata', 'storage_args', 'keep_comments', 'automatch_domain',)  # Left open for the user
+
+    def __init__(self, *args, **kwargs):
+        # For backward-compatibility before 0.2.99
+        args_str = ", ".join(args) or ''
+        kwargs_str = ", ".join(f'{k}={v}' for k, v in kwargs.items()) or ''
+        if args_str:
+            args_str += ', '
+
+        log.warning(f'This logic is deprecated now, and have no effect; It will be removed with v0.3. Use `{self.__class__.__name__}.configure({args_str}{kwargs_str})` instead before fetching')
+        pass
+
+    @classmethod
+    def display_config(cls):
+        return dict(
+            huge_tree=cls.huge_tree,
+            keep_comments=cls.keep_comments,
+            keep_cdata=cls.keep_cdata,
+            auto_match=cls.auto_match,
+            storage=cls.storage,
+            storage_args=cls.storage_args,
+            automatch_domain=cls.automatch_domain,
+        )
+
+    @classmethod
+    def configure(cls, **kwargs):
+        """Set multiple arguments for the parser at once globally
+
+        :param kwargs: The keywords can be any arguments of the following: huge_tree, keep_comments, keep_cdata, auto_match, storage, storage_args, automatch_domain
         """
+        for key, value in kwargs.items():
+            key = key.strip().lower()
+            if hasattr(cls, key):
+                if key in cls.parser_keywords:
+                    setattr(cls, key, value)
+                else:
+                    # Yup, no fun allowed LOL
+                    raise AttributeError(f'Unknown parser argument: "{key}"; maybe you meant {cls.parser_keywords}?')
+            else:
+                raise ValueError(f'Unknown parser argument: "{key}"; maybe you meant {cls.parser_keywords}?')
+
+        if not kwargs:
+            raise AttributeError(f'You must pass a keyword to configure, current keywords: {cls.parser_keywords}?')
+
+    @classmethod
+    def _generate_parser_arguments(cls) -> Dict:
         # Adaptor class parameters
         # I won't validate Adaptor's class parameters here again, I will leave it to be validated later
-        self.adaptor_arguments = dict(
-            huge_tree=huge_tree,
-            keep_comments=keep_comments,
-            keep_cdata=keep_cdata,
-            auto_match=auto_match,
-            storage=storage,
-            storage_args=storage_args
+        parser_arguments = dict(
+            huge_tree=cls.huge_tree,
+            keep_comments=cls.keep_comments,
+            keep_cdata=cls.keep_cdata,
+            auto_match=cls.auto_match,
+            storage=cls.storage,
+            storage_args=cls.storage_args
         )
-        if automatch_domain:
-            if type(automatch_domain) is not str:
+        if cls.automatch_domain:
+            if type(cls.automatch_domain) is not str:
                 log.warning('[Ignored] The argument "automatch_domain" must be of string type')
             else:
-                self.adaptor_arguments.update({'automatch_domain': automatch_domain})
+                parser_arguments.update({'automatch_domain': cls.automatch_domain})
+
+        return parser_arguments
 
 
 class StatusText:
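
`BaseFetcher` drops its per-instance constructor arguments in favour of class-level attributes plus a `configure()` classmethod; the old constructor call is kept only to log a deprecation warning, and `_generate_parser_arguments()` now builds the Adaptor arguments from the class attributes. A hedged usage sketch, assuming the concrete fetchers inherit this interface from `BaseFetcher`:

    from scrapling.fetchers import Fetcher

    # 0.2.97 style: passing parser arguments to the constructor.
    # In 0.2.99 this has no effect beyond a deprecation warning.
    Fetcher(keep_comments=True)

    # 0.2.99 style: set parser arguments once, globally, on the class.
    Fetcher.configure(keep_comments=True, automatch_domain='example.com')

    # Inspect the current global parser configuration.
    print(Fetcher.display_config())

    # Anything outside parser_keywords is rejected with a hint.
    try:
        Fetcher.configure(keep_everything=True)
    except (AttributeError, ValueError) as exc:
        print(exc)
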
@@ -12,7 +12,7 @@ from scrapling.core._types import Dict, Union
 from scrapling.core.utils import lru_cache
 
 
-@lru_cache(128, typed=True)
+@lru_cache(10, typed=True)
 def generate_convincing_referer(url: str) -> str:
     """Takes the domain from the URL without the subdomain/suffix and make it look like you were searching google for this website
 
@@ -26,7 +26,7 @@ def generate_convincing_referer(url: str) -> str:
     return f'https://www.google.com/search?q={website_name}'
 
 
-@lru_cache(128, typed=True)
+@lru_cache(1, typed=True)
 def get_os_name() -> Union[str, None]:
     """Get the current OS name in the same format needed for browserforge
 
@@ -110,7 +110,7 @@ def construct_cdp_url(cdp_url: str, query_params: Optional[Dict] = None) -> str:
         raise ValueError(f"Invalid CDP URL: {str(e)}")
 
 
-@lru_cache(126, typed=True)
+@lru_cache(10, typed=True)
 def js_bypass_path(filename: str) -> str:
     """Takes the base filename of JS file inside the `bypasses` folder then return the full path of it
 