scrapling 0.2.97__py3-none-any.whl → 0.2.99__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +35 -6
- scrapling/core/storage_adaptors.py +3 -3
- scrapling/core/translator.py +3 -0
- scrapling/core/utils.py +1 -1
- scrapling/defaults.py +24 -9
- scrapling/engines/camo.py +42 -2
- scrapling/engines/pw.py +40 -3
- scrapling/engines/static.py +1 -1
- scrapling/engines/toolbelt/custom.py +67 -31
- scrapling/engines/toolbelt/fingerprints.py +2 -2
- scrapling/engines/toolbelt/navigation.py +1 -1
- scrapling/fetchers.py +142 -41
- scrapling/parser.py +6 -12
- scrapling-0.2.99.dist-info/METADATA +290 -0
- {scrapling-0.2.97.dist-info → scrapling-0.2.99.dist-info}/RECORD +25 -25
- {scrapling-0.2.97.dist-info → scrapling-0.2.99.dist-info}/WHEEL +1 -1
- tests/fetchers/async/test_camoufox.py +3 -1
- tests/fetchers/async/test_httpx.py +3 -1
- tests/fetchers/async/test_playwright.py +3 -1
- tests/fetchers/sync/test_camoufox.py +3 -1
- tests/fetchers/sync/test_httpx.py +3 -1
- tests/fetchers/sync/test_playwright.py +3 -1
- scrapling-0.2.97.dist-info/METADATA +0 -867
- {scrapling-0.2.97.dist-info → scrapling-0.2.99.dist-info}/entry_points.txt +0 -0
- {scrapling-0.2.97.dist-info → scrapling-0.2.99.dist-info/licenses}/LICENSE +0 -0
- {scrapling-0.2.97.dist-info → scrapling-0.2.99.dist-info}/top_level.txt +0 -0
scrapling/__init__.py
CHANGED
@@ -1,12 +1,41 @@
-# Declare top-level shortcuts
-from scrapling.core.custom_types import AttributesHandler, TextHandler
-from scrapling.fetchers import (AsyncFetcher, CustomFetcher, Fetcher,
-                                PlayWrightFetcher, StealthyFetcher)
-from scrapling.parser import Adaptor, Adaptors
 
 __author__ = "Karim Shoair (karim.shoair@pm.me)"
-__version__ = "0.2.97"
+__version__ = "0.2.99"
 __copyright__ = "Copyright (c) 2024 Karim Shoair"
 
 
+# A lightweight approach to create lazy loader for each import for backward compatibility
+# This will reduces initial memory footprint significantly (only loads what's used)
+def __getattr__(name):
+    if name == 'Fetcher':
+        from scrapling.fetchers import Fetcher as cls
+        return cls
+    elif name == 'Adaptor':
+        from scrapling.parser import Adaptor as cls
+        return cls
+    elif name == 'Adaptors':
+        from scrapling.parser import Adaptors as cls
+        return cls
+    elif name == 'AttributesHandler':
+        from scrapling.core.custom_types import AttributesHandler as cls
+        return cls
+    elif name == 'TextHandler':
+        from scrapling.core.custom_types import TextHandler as cls
+        return cls
+    elif name == 'AsyncFetcher':
+        from scrapling.fetchers import AsyncFetcher as cls
+        return cls
+    elif name == 'StealthyFetcher':
+        from scrapling.fetchers import StealthyFetcher as cls
+        return cls
+    elif name == 'PlayWrightFetcher':
+        from scrapling.fetchers import PlayWrightFetcher as cls
+        return cls
+    elif name == 'CustomFetcher':
+        from scrapling.fetchers import CustomFetcher as cls
+        return cls
+    else:
+        raise AttributeError(f"module 'scrapling' has no attribute '{name}'")
+
+
 __all__ = ['Adaptor', 'Fetcher', 'AsyncFetcher', 'StealthyFetcher', 'PlayWrightFetcher']
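The lazy loader above follows PEP 562 (module-level `__getattr__`): `import scrapling` no longer pulls in the parser and fetcher modules eagerly; only the first attribute access does. A quick way to see the effect, assuming the 0.2.99 wheel is installed (this snippet is illustrative, not part of the package):

    import sys
    import scrapling

    print('scrapling.fetchers' in sys.modules)   # False: nothing heavy imported yet
    Fetcher = scrapling.Fetcher                  # module __getattr__ runs the deferred import
    print('scrapling.fetchers' in sys.modules)   # True: loaded only because it was accessed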
scrapling/core/storage_adaptors.py
CHANGED
@@ -19,7 +19,7 @@ class StorageSystemMixin(ABC):
         """
         self.url = url
 
-    @lru_cache(
+    @lru_cache(64, typed=True)
     def _get_base_url(self, default_value: str = 'default') -> str:
         if not self.url or type(self.url) is not str:
             return default_value
@@ -51,7 +51,7 @@ class StorageSystemMixin(ABC):
         raise NotImplementedError('Storage system must implement `save` method')
 
     @staticmethod
-    @lru_cache(
+    @lru_cache(128, typed=True)
     def _get_hash(identifier: str) -> str:
         """If you want to hash identifier in your storage system, use this safer"""
         identifier = identifier.lower().strip()
@@ -63,7 +63,7 @@ class StorageSystemMixin(ABC):
         return f"{hash_value}_{len(identifier)}"  # Length to reduce collision chance
 
 
-@lru_cache(
+@lru_cache(1, typed=True)
 class SQLiteStorageSystem(StorageSystemMixin):
     """The recommended system to use, it's race condition safe and thread safe.
     Mainly built so the library can run in threaded frameworks like scrapy or threaded tools
scrapling/core/translator.py
CHANGED
@@ -142,3 +142,6 @@ class HTMLTranslator(TranslatorMixin, OriginalHTMLTranslator):
     @lru_cache(maxsize=256)
     def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str:
         return super().css_to_xpath(css, prefix)
+
+
+translator_instance = HTMLTranslator()
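The new module-level `translator_instance` gives callers a shared, pre-built translator; combined with the cached `css_to_xpath`, repeated CSS-to-XPath conversions of the same selector are computed once. A small illustrative use (the selector string is arbitrary):

    from scrapling.core.translator import translator_instance

    # Second call with the same selector is served from the lru_cache on css_to_xpath
    xpath = translator_instance.css_to_xpath('div.product > a')
    print(xpath)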
scrapling/core/utils.py
CHANGED
scrapling/defaults.py
CHANGED
@@ -1,10 +1,25 @@
-
-from .
-from .fetchers import PlayWrightFetcher as _PlayWrightFetcher
-from .fetchers import StealthyFetcher as _StealthyFetcher
+# Left this file for backward-compatibility before 0.2.99
+from scrapling.core.utils import log
 
-
-
-
-
-
+
+# A lightweight approach to create lazy loader for each import for backward compatibility
+# This will reduces initial memory footprint significantly (only loads what's used)
+def __getattr__(name):
+    if name == 'Fetcher':
+        from scrapling.fetchers import Fetcher as cls
+        log.warning('This import is deprecated now and it will be removed with v0.3. Use `from scrapling.fetchers import Fetcher` instead')
+        return cls
+    elif name == 'AsyncFetcher':
+        from scrapling.fetchers import AsyncFetcher as cls
+        log.warning('This import is deprecated now and it will be removed with v0.3. Use `from scrapling.fetchers import AsyncFetcher` instead')
+        return cls
+    elif name == 'StealthyFetcher':
+        from scrapling.fetchers import StealthyFetcher as cls
+        log.warning('This import is deprecated now and it will be removed with v0.3. Use `from scrapling.fetchers import StealthyFetcher` instead')
+        return cls
+    elif name == 'PlayWrightFetcher':
+        from scrapling.fetchers import PlayWrightFetcher as cls
+        log.warning('This import is deprecated now and it will be removed with v0.3. Use `from scrapling.fetchers import PlayWrightFetcher` instead')
+        return cls
+    else:
+        raise AttributeError(f"module 'scrapling' has no attribute '{name}'")
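In practice the old convenience import keeps working through the same lazy-loader trick, but it now logs a deprecation warning pointing at the new location; for example:

    from scrapling.defaults import Fetcher    # still works, logs the v0.3 deprecation warning
    from scrapling.fetchers import Fetcher    # the import path that will remain after v0.3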
scrapling/engines/camo.py
CHANGED
@@ -16,12 +16,13 @@ from scrapling.engines.toolbelt import (Response, StatusText,
 class CamoufoxEngine:
     def __init__(
             self, headless: Union[bool, Literal['virtual']] = True, block_images: bool = False, disable_resources: bool = False,
-            block_webrtc: bool = False, allow_webgl: bool = True, network_idle: bool = False, humanize: Union[bool, float] = True,
+            block_webrtc: bool = False, allow_webgl: bool = True, network_idle: bool = False, humanize: Union[bool, float] = True, wait: Optional[int] = 0,
             timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, addons: Optional[List[str]] = None,
             wait_selector_state: SelectorWaitStates = 'attached', google_search: bool = True, extra_headers: Optional[Dict[str, str]] = None,
             proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: bool = False, disable_ads: bool = False,
             geoip: bool = False,
             adaptor_arguments: Dict = None,
+            additional_arguments: Dict = None
     ):
         """An engine that utilizes Camoufox library, check the `StealthyFetcher` class for more documentation.
 
@@ -38,6 +39,7 @@ class CamoufoxEngine:
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
         :param disable_ads: Disabled by default, this installs `uBlock Origin` addon on the browser if enabled.
         :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
+        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning `Response` object.
         :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
         :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
         :param wait_selector: Wait for a specific css selector to be in a specific state.
@@ -48,6 +50,7 @@ class CamoufoxEngine:
         :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
         :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
         :param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
+        :param additional_arguments: Additional arguments to be passed to Camoufox as additional settings and it takes higher priority than Scrapling's settings.
         """
         self.headless = headless
         self.block_images = bool(block_images)
@@ -60,10 +63,12 @@ class CamoufoxEngine:
         self.disable_ads = bool(disable_ads)
         self.geoip = bool(geoip)
         self.extra_headers = extra_headers or {}
+        self.additional_arguments = additional_arguments or {}
         self.proxy = construct_proxy_dict(proxy)
         self.addons = addons or []
         self.humanize = humanize
         self.timeout = check_type_validity(timeout, [int, float], 30000)
+        self.wait = check_type_validity(wait, [int, float], 0)
 
         # Page action callable validation
         self.page_action = None
@@ -92,6 +97,7 @@ class CamoufoxEngine:
             "block_webrtc": self.block_webrtc,
             "block_images": self.block_images,  # Careful! it makes some websites doesn't finish loading at all like stackoverflow even in headful
             "os": None if self.os_randomize else get_os_name(),
+            **self.additional_arguments
         }
 
     def _process_response_history(self, first_response):
@@ -126,6 +132,38 @@ class CamoufoxEngine:
 
         return history
 
+    async def _async_process_response_history(self, first_response):
+        """Process response history to build a list of Response objects"""
+        history = []
+        current_request = first_response.request.redirected_from
+
+        try:
+            while current_request:
+                try:
+                    current_response = await current_request.response()
+                    history.insert(0, Response(
+                        url=current_request.url,
+                        # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
+                        text='',
+                        body=b'',
+                        status=current_response.status if current_response else 301,
+                        reason=(current_response.status_text or StatusText.get(current_response.status)) if current_response else StatusText.get(301),
+                        encoding=current_response.headers.get('content-type', '') or 'utf-8',
+                        cookies={},
+                        headers=await current_response.all_headers() if current_response else {},
+                        request_headers=await current_request.all_headers(),
+                        **self.adaptor_arguments
+                    ))
+                except Exception as e:
+                    log.error(f"Error processing redirect: {e}")
+                    break
+
+                current_request = current_request.redirected_from
+        except Exception as e:
+            log.error(f"Error processing response history: {e}")
+
+        return history
+
     def fetch(self, url: str) -> Response:
         """Opens up the browser and do your request based on your chosen options.
 
@@ -177,6 +215,7 @@ class CamoufoxEngine:
             except Exception as e:
                 log.error(f"Error waiting for selector {self.wait_selector}: {e}")
 
+            page.wait_for_timeout(self.wait)
            # In case we didn't catch a document type somehow
            final_response = final_response if final_response else first_response
            if not final_response:
@@ -263,6 +302,7 @@ class CamoufoxEngine:
             except Exception as e:
                 log.error(f"Error waiting for selector {self.wait_selector}: {e}")
 
+            await page.wait_for_timeout(self.wait)
            # In case we didn't catch a document type somehow
            final_response = final_response if final_response else first_response
            if not final_response:
@@ -273,7 +313,7 @@ class CamoufoxEngine:
             # PlayWright API sometimes give empty status text for some reason!
             status_text = final_response.status_text or StatusText.get(final_response.status)
 
-            history = self.
+            history = await self._async_process_response_history(first_response)
             try:
                 page_content = await page.content()
             except Exception as e:
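Both new engine options are visible in the hunks above: `wait` adds a fixed `page.wait_for_timeout(self.wait)` pause after page load, `page_action`, and `wait_selector` handling finish, and `additional_arguments` is merged last into the Camoufox launch config so it overrides Scrapling's own settings. A hedged sketch of using them directly on the engine (the `window` key is only an illustrative Camoufox option, not taken from this diff):

    from scrapling.engines.camo import CamoufoxEngine

    engine = CamoufoxEngine(
        wait=2000,                                      # hold the page 2 s before building the Response
        additional_arguments={'window': (1280, 720)},   # forwarded to Camoufox, wins over Scrapling's settings
        adaptor_arguments={},
    )
    page = engine.fetch('https://example.com')
    print(page.status)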
scrapling/engines/pw.py
CHANGED
@@ -21,6 +21,7 @@ class PlaywrightEngine:
             useragent: Optional[str] = None,
             network_idle: bool = False,
             timeout: Optional[float] = 30000,
+            wait: Optional[int] = 0,
             page_action: Callable = None,
             wait_selector: Optional[str] = None,
             locale: Optional[str] = 'en-US',
@@ -46,6 +47,7 @@ class PlaywrightEngine:
         :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
         :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
+        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning `Response` object.
         :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
         :param wait_selector: Wait for a specific css selector to be in a specific state.
         :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
@@ -76,6 +78,7 @@ class PlaywrightEngine:
         self.cdp_url = cdp_url
         self.useragent = useragent
         self.timeout = check_type_validity(timeout, [int, float], 30000)
+        self.wait = check_type_validity(wait, [int, float], 0)
         if page_action is not None:
             if callable(page_action):
                 self.page_action = page_action
@@ -126,7 +129,7 @@ class PlaywrightEngine:
 
         return cdp_url
 
-    @lru_cache(
+    @lru_cache(32, typed=True)
     def __set_flags(self):
         """Returns the flags that will be used while launching the browser if stealth mode is enabled"""
         flags = DEFAULT_STEALTH_FLAGS
@@ -169,7 +172,7 @@ class PlaywrightEngine:
 
         return context_kwargs
 
-    @lru_cache(
+    @lru_cache(1)
     def __stealth_scripts(self):
         # Basic bypasses nothing fancy as I'm still working on it
         # But with adding these bypasses to the above config, it bypasses many online tests like
@@ -220,6 +223,38 @@ class PlaywrightEngine:
 
         return history
 
+    async def _async_process_response_history(self, first_response):
+        """Process response history to build a list of Response objects"""
+        history = []
+        current_request = first_response.request.redirected_from
+
+        try:
+            while current_request:
+                try:
+                    current_response = await current_request.response()
+                    history.insert(0, Response(
+                        url=current_request.url,
+                        # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
+                        text='',
+                        body=b'',
+                        status=current_response.status if current_response else 301,
+                        reason=(current_response.status_text or StatusText.get(current_response.status)) if current_response else StatusText.get(301),
+                        encoding=current_response.headers.get('content-type', '') or 'utf-8',
+                        cookies={},
+                        headers=await current_response.all_headers() if current_response else {},
+                        request_headers=await current_request.all_headers(),
+                        **self.adaptor_arguments
+                    ))
+                except Exception as e:
+                    log.error(f"Error processing redirect: {e}")
+                    break
+
+                current_request = current_request.redirected_from
+        except Exception as e:
+            log.error(f"Error processing response history: {e}")
+
+        return history
+
     def fetch(self, url: str) -> Response:
         """Opens up the browser and do your request based on your chosen options.
 
@@ -289,6 +324,7 @@ class PlaywrightEngine:
             except Exception as e:
                 log.error(f"Error waiting for selector {self.wait_selector}: {e}")
 
+            page.wait_for_timeout(self.wait)
            # In case we didn't catch a document type somehow
            final_response = final_response if final_response else first_response
            if not final_response:
@@ -392,6 +428,7 @@ class PlaywrightEngine:
             except Exception as e:
                 log.error(f"Error waiting for selector {self.wait_selector}: {e}")
 
+            await page.wait_for_timeout(self.wait)
            # In case we didn't catch a document type somehow
            final_response = final_response if final_response else first_response
            if not final_response:
@@ -402,7 +439,7 @@ class PlaywrightEngine:
             # PlayWright API sometimes give empty status text for some reason!
             status_text = final_response.status_text or StatusText.get(final_response.status)
 
-            history = self.
+            history = await self._async_process_response_history(first_response)
             try:
                 page_content = await page.content()
             except Exception as e:
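The same `wait` knob lands in the Playwright engine; it fires after the optional `page_action` callback and the `wait_selector` handling, mirroring the `page.wait_for_timeout(self.wait)` lines above. A minimal sketch under those assumptions (the scrolling callback is just an example of a `page_action`):

    from scrapling.engines.pw import PlaywrightEngine

    def scroll_down(page):
        # existing page_action hook: receives the Playwright page and must return it
        page.mouse.wheel(0, 1000)
        return page

    # 1.5 s pause after load and page_action, right before the Response is assembled
    engine = PlaywrightEngine(page_action=scroll_down, wait=1500)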
scrapling/engines/static.py
CHANGED
@@ -7,7 +7,7 @@ from scrapling.core.utils import log, lru_cache
 from .toolbelt import Response, generate_convincing_referer, generate_headers
 
 
-@lru_cache(
+@lru_cache(2, typed=True)  # Singleton easily
 class StaticEngine:
     def __init__(
             self, url: str, proxy: Optional[str] = None, stealthy_headers: bool = True, follow_redirects: bool = True,
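The `# Singleton easily` comment refers to a side effect of decorating a class with `lru_cache`: the class becomes a cached callable, so constructing it twice with the same arguments returns the same instance. A standalone sketch of the pattern with plain `functools` (Scrapling imports its `lru_cache` from `scrapling.core.utils`):

    from functools import lru_cache

    @lru_cache(2, typed=True)
    class Config:
        def __init__(self, name: str):
            self.name = name

    print(Config('a') is Config('a'))   # True: the second call is served from the cache
    print(Config('a') is Config('b'))   # False: different arguments, different cached instance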
scrapling/engines/toolbelt/custom.py
CHANGED
@@ -16,7 +16,7 @@ class ResponseEncoding:
     __ISO_8859_1_CONTENT_TYPES = {"text/plain", "text/html", "text/css", "text/javascript"}
 
     @classmethod
-    @lru_cache(maxsize=
+    @lru_cache(maxsize=128)
     def __parse_content_type(cls, header_value: str) -> Tuple[str, Dict[str, str]]:
         """Parse content type and parameters from a content-type header value.
 
@@ -38,7 +38,7 @@ class ResponseEncoding:
         return content_type, params
 
     @classmethod
-    @lru_cache(maxsize=
+    @lru_cache(maxsize=128)
     def get_value(cls, content_type: Optional[str], text: Optional[str] = 'test') -> str:
         """Determine the appropriate character encoding from a content-type header.
 
@@ -105,41 +105,77 @@ class Response(Adaptor):
 
 
 class BaseFetcher:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    __slots__ = ()
+    huge_tree: bool = True
+    auto_match: Optional[bool] = False
+    storage: Any = SQLiteStorageSystem
+    keep_cdata: Optional[bool] = False
+    storage_args: Optional[Dict] = None
+    keep_comments: Optional[bool] = False
+    automatch_domain: Optional[str] = None
+    parser_keywords: Tuple = ('huge_tree', 'auto_match', 'storage', 'keep_cdata', 'storage_args', 'keep_comments', 'automatch_domain',)  # Left open for the user
+
+    def __init__(self, *args, **kwargs):
+        # For backward-compatibility before 0.2.99
+        args_str = ", ".join(args) or ''
+        kwargs_str = ", ".join(f'{k}={v}' for k, v in kwargs.items()) or ''
+        if args_str:
+            args_str += ', '
+
+        log.warning(f'This logic is deprecated now, and have no effect; It will be removed with v0.3. Use `{self.__class__.__name__}.configure({args_str}{kwargs_str})` instead before fetching')
+        pass
+
+    @classmethod
+    def display_config(cls):
+        return dict(
+            huge_tree=cls.huge_tree,
+            keep_comments=cls.keep_comments,
+            keep_cdata=cls.keep_cdata,
+            auto_match=cls.auto_match,
+            storage=cls.storage,
+            storage_args=cls.storage_args,
+            automatch_domain=cls.automatch_domain,
+        )
+
+    @classmethod
+    def configure(cls, **kwargs):
+        """Set multiple arguments for the parser at once globally
+
+        :param kwargs: The keywords can be any arguments of the following: huge_tree, keep_comments, keep_cdata, auto_match, storage, storage_args, automatch_domain
         """
+        for key, value in kwargs.items():
+            key = key.strip().lower()
+            if hasattr(cls, key):
+                if key in cls.parser_keywords:
+                    setattr(cls, key, value)
+                else:
+                    # Yup, no fun allowed LOL
+                    raise AttributeError(f'Unknown parser argument: "{key}"; maybe you meant {cls.parser_keywords}?')
+            else:
+                raise ValueError(f'Unknown parser argument: "{key}"; maybe you meant {cls.parser_keywords}?')
+
+        if not kwargs:
+            raise AttributeError(f'You must pass a keyword to configure, current keywords: {cls.parser_keywords}?')
+
+    @classmethod
+    def _generate_parser_arguments(cls) -> Dict:
         # Adaptor class parameters
         # I won't validate Adaptor's class parameters here again, I will leave it to be validated later
-
-            huge_tree=huge_tree,
-            keep_comments=keep_comments,
-            keep_cdata=keep_cdata,
-            auto_match=auto_match,
-            storage=storage,
-            storage_args=storage_args
+        parser_arguments = dict(
+            huge_tree=cls.huge_tree,
+            keep_comments=cls.keep_comments,
+            keep_cdata=cls.keep_cdata,
+            auto_match=cls.auto_match,
+            storage=cls.storage,
+            storage_args=cls.storage_args
         )
-        if automatch_domain:
-            if type(automatch_domain) is not str:
+        if cls.automatch_domain:
+            if type(cls.automatch_domain) is not str:
                 log.warning('[Ignored] The argument "automatch_domain" must be of string type')
             else:
-
+                parser_arguments.update({'automatch_domain': cls.automatch_domain})
+
+        return parser_arguments
 
 
 class StatusText:
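This is the replacement for passing parser options to the fetcher constructors: the settings now live as class attributes and are changed globally through `configure()`, with `display_config()` to inspect them and `parser_keywords` guarding against typos. Expected usage, per the deprecation message in `__init__` (the domain value is just a placeholder):

    from scrapling.fetchers import Fetcher

    Fetcher.configure(keep_comments=True, automatch_domain='example.com')
    print(Fetcher.display_config())   # class-level parser settings now applied to every fetch
    # Fetcher.configure(foo=1) would raise, since 'foo' is not one of parser_keywords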
scrapling/engines/toolbelt/fingerprints.py
CHANGED
@@ -12,7 +12,7 @@ from scrapling.core._types import Dict, Union
 from scrapling.core.utils import lru_cache
 
 
-@lru_cache(
+@lru_cache(10, typed=True)
 def generate_convincing_referer(url: str) -> str:
     """Takes the domain from the URL without the subdomain/suffix and make it look like you were searching google for this website
 
@@ -26,7 +26,7 @@ def generate_convincing_referer(url: str) -> str:
     return f'https://www.google.com/search?q={website_name}'
 
 
-@lru_cache(
+@lru_cache(1, typed=True)
 def get_os_name() -> Union[str, None]:
     """Get the current OS name in the same format needed for browserforge
 
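For reference, the cached helper still builds a Google-search referer from the bare domain name; an illustrative call follows (the exact domain extraction depends on the helper's internals, so the expected output is an assumption):

    from scrapling.engines.toolbelt import generate_convincing_referer

    print(generate_convincing_referer('https://books.toscrape.com/catalogue/'))
    # e.g. 'https://www.google.com/search?q=toscrape'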
scrapling/engines/toolbelt/navigation.py
CHANGED
@@ -110,7 +110,7 @@ def construct_cdp_url(cdp_url: str, query_params: Optional[Dict] = None) -> str:
         raise ValueError(f"Invalid CDP URL: {str(e)}")
 
 
-@lru_cache(
+@lru_cache(10, typed=True)
 def js_bypass_path(filename: str) -> str:
     """Takes the base filename of JS file inside the `bypasses` folder then return the full path of it
 