scrapling 0.2.96__tar.gz → 0.2.98__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. {scrapling-0.2.96/scrapling.egg-info → scrapling-0.2.98}/PKG-INFO +23 -22
  2. {scrapling-0.2.96 → scrapling-0.2.98}/README.md +22 -21
  3. scrapling-0.2.98/scrapling/__init__.py +41 -0
  4. {scrapling-0.2.96 → scrapling-0.2.98}/scrapling/core/custom_types.py +1 -3
  5. {scrapling-0.2.96 → scrapling-0.2.98}/scrapling/core/storage_adaptors.py +3 -3
  6. {scrapling-0.2.96 → scrapling-0.2.98}/scrapling/core/translator.py +4 -1
  7. {scrapling-0.2.96 → scrapling-0.2.98}/scrapling/core/utils.py +1 -1
  8. scrapling-0.2.98/scrapling/defaults.py +19 -0
  9. {scrapling-0.2.96 → scrapling-0.2.98}/scrapling/engines/camo.py +123 -104
  10. {scrapling-0.2.96 → scrapling-0.2.98}/scrapling/engines/pw.py +100 -75
  11. {scrapling-0.2.96 → scrapling-0.2.98}/scrapling/engines/static.py +22 -42
  12. {scrapling-0.2.96 → scrapling-0.2.98}/scrapling/engines/toolbelt/custom.py +2 -2
  13. {scrapling-0.2.96 → scrapling-0.2.98}/scrapling/engines/toolbelt/fingerprints.py +2 -2
  14. {scrapling-0.2.96 → scrapling-0.2.98}/scrapling/engines/toolbelt/navigation.py +1 -1
  15. {scrapling-0.2.96 → scrapling-0.2.98}/scrapling/fetchers.py +24 -24
  16. {scrapling-0.2.96 → scrapling-0.2.98}/scrapling/parser.py +6 -12
  17. {scrapling-0.2.96 → scrapling-0.2.98/scrapling.egg-info}/PKG-INFO +23 -22
  18. {scrapling-0.2.96 → scrapling-0.2.98}/setup.cfg +1 -1
  19. {scrapling-0.2.96 → scrapling-0.2.98}/setup.py +1 -1
  20. scrapling-0.2.96/scrapling/__init__.py +0 -12
  21. scrapling-0.2.96/scrapling/defaults.py +0 -10
  22. {scrapling-0.2.96 → scrapling-0.2.98}/LICENSE +0 -0
  23. {scrapling-0.2.96 → scrapling-0.2.98}/MANIFEST.in +0 -0
  24. {scrapling-0.2.96 → scrapling-0.2.98}/scrapling/cli.py +0 -0
  25. {scrapling-0.2.96 → scrapling-0.2.98}/scrapling/core/__init__.py +0 -0
  26. {scrapling-0.2.96 → scrapling-0.2.98}/scrapling/core/_types.py +0 -0
  27. {scrapling-0.2.96 → scrapling-0.2.98}/scrapling/core/mixins.py +0 -0
  28. {scrapling-0.2.96 → scrapling-0.2.98}/scrapling/engines/__init__.py +0 -0
  29. {scrapling-0.2.96 → scrapling-0.2.98}/scrapling/engines/constants.py +0 -0
  30. {scrapling-0.2.96 → scrapling-0.2.98}/scrapling/engines/toolbelt/__init__.py +0 -0
  31. {scrapling-0.2.96 → scrapling-0.2.98}/scrapling/engines/toolbelt/bypasses/navigator_plugins.js +0 -0
  32. {scrapling-0.2.96 → scrapling-0.2.98}/scrapling/engines/toolbelt/bypasses/notification_permission.js +0 -0
  33. {scrapling-0.2.96 → scrapling-0.2.98}/scrapling/engines/toolbelt/bypasses/pdf_viewer.js +0 -0
  34. {scrapling-0.2.96 → scrapling-0.2.98}/scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +0 -0
  35. {scrapling-0.2.96 → scrapling-0.2.98}/scrapling/engines/toolbelt/bypasses/screen_props.js +0 -0
  36. {scrapling-0.2.96 → scrapling-0.2.98}/scrapling/engines/toolbelt/bypasses/webdriver_fully.js +0 -0
  37. {scrapling-0.2.96 → scrapling-0.2.98}/scrapling/engines/toolbelt/bypasses/window_chrome.js +0 -0
  38. {scrapling-0.2.96 → scrapling-0.2.98}/scrapling/py.typed +0 -0
  39. {scrapling-0.2.96 → scrapling-0.2.98}/scrapling.egg-info/SOURCES.txt +0 -0
  40. {scrapling-0.2.96 → scrapling-0.2.98}/scrapling.egg-info/dependency_links.txt +0 -0
  41. {scrapling-0.2.96 → scrapling-0.2.98}/scrapling.egg-info/entry_points.txt +0 -0
  42. {scrapling-0.2.96 → scrapling-0.2.98}/scrapling.egg-info/not-zip-safe +0 -0
  43. {scrapling-0.2.96 → scrapling-0.2.98}/scrapling.egg-info/requires.txt +0 -0
  44. {scrapling-0.2.96 → scrapling-0.2.98}/scrapling.egg-info/top_level.txt +0 -0
  45. {scrapling-0.2.96 → scrapling-0.2.98}/tests/__init__.py +0 -0
  46. {scrapling-0.2.96 → scrapling-0.2.98}/tests/fetchers/__init__.py +0 -0
  47. {scrapling-0.2.96 → scrapling-0.2.98}/tests/fetchers/async/__init__.py +0 -0
  48. {scrapling-0.2.96 → scrapling-0.2.98}/tests/fetchers/async/test_camoufox.py +0 -0
  49. {scrapling-0.2.96 → scrapling-0.2.98}/tests/fetchers/async/test_httpx.py +0 -0
  50. {scrapling-0.2.96 → scrapling-0.2.98}/tests/fetchers/async/test_playwright.py +0 -0
  51. {scrapling-0.2.96 → scrapling-0.2.98}/tests/fetchers/sync/__init__.py +0 -0
  52. {scrapling-0.2.96 → scrapling-0.2.98}/tests/fetchers/sync/test_camoufox.py +0 -0
  53. {scrapling-0.2.96 → scrapling-0.2.98}/tests/fetchers/sync/test_httpx.py +0 -0
  54. {scrapling-0.2.96 → scrapling-0.2.98}/tests/fetchers/sync/test_playwright.py +0 -0
  55. {scrapling-0.2.96 → scrapling-0.2.98}/tests/fetchers/test_utils.py +0 -0
  56. {scrapling-0.2.96 → scrapling-0.2.98}/tests/parser/__init__.py +0 -0
  57. {scrapling-0.2.96 → scrapling-0.2.98}/tests/parser/test_automatch.py +0 -0
  58. {scrapling-0.2.96 → scrapling-0.2.98}/tests/parser/test_general.py +0 -0
PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.2
  Name: scrapling
- Version: 0.2.96
+ Version: 0.2.98
  Summary: Scrapling is an undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy again! In an internet filled with complications,
  Home-page: https://github.com/D4Vinci/Scrapling
  Author: Karim Shoair
@@ -73,6 +73,22 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha
 
  # Sponsors
 
+ [Scrapeless Deep SerpApi](https://www.scrapeless.com/en/product/deep-serp-api?utm_source=website&utm_medium=ads&utm_campaign=scraping&utm_term=d4vinci) From $0.10 per 1,000 queries with a 1-2 second response time!
+
+ Deep SerpApi is a dedicated search engine designed for large language models (LLMs) and AI agents, aiming to provide real-time, accurate and unbiased information to help AI applications retrieve and process data efficiently.
+ - covering 20+ Google SERP scenarios and mainstream search engines.
+ - support real-time data updates to ensure real-time and accurate information.
+ - It can integrate information from all available online channels and search engines.
+ - Deep SerpApi will simplify the process of integrating dynamic web information into AI solutions, and ultimately achieve an ALL-in-One API for one-click search and extraction of web data.
+ - **Developer Support Program**: Integrate Scrapeless Deep SerpApi into your AI tools, applications or projects. [We already support Dify, and will soon support frameworks such as Langchain, Langflow, FlowiseAI]. Then share your results on GitHub or social media, and you will get a 1-12 month free developer support opportunity, up to 500 free usage per month.
+ - 🚀 **Scraping API**: Effortless and highly customizable data extraction with a single API call, providing structured data from any website.
+ - ⚡ **Scraping Browser**: AI-powered and LLM-driven, it simulates human-like behavior with genuine fingerprints and headless browser support, ensuring seamless, block-free scraping.
+ - 🌐 **Proxies**: Use high-quality, rotating proxies to scrape top platforms like Amazon, Shopee, and more, with global coverage in 195+ countries.
+
+
+ [![Scrapeless Banner](https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/scrapeless.jpg)](https://www.scrapeless.com/en/product/deep-serp-api?utm_source=website&utm_medium=ads&utm_campaign=scraping&utm_term=d4vinci)
+ ---
+
  [Evomi](https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling) is your Swiss Quality Proxy Provider, starting at **$0.49/GB**
 
  - 👩‍💻 **$0.49 per GB Residential Proxies**: Our price is unbeatable
@@ -88,21 +104,6 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha
  [![Evomi Banner](https://my.evomi.com/images/brand/cta.png)](https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling)
  ---
 
- [Scrapeless](https://www.scrapeless.com/?utm_source=github&utm_medium=ads&utm_campaign=scraping&utm_term=D4Vinci) is your all-in-one web scraping toolkit, starting at just $0.60 per 1k URLs!
-
- - 🚀 Scraping API: Effortless and highly customizable data extraction with a single API call, providing structured data from any website.
- - ⚡ Scraping Browser: AI-powered and LLM-driven, it simulates human-like behavior with genuine fingerprints and headless browser support, ensuring seamless, block-free scraping.
- - 🔒 Web Unlocker: Bypass CAPTCHAs, IP blocks, and dynamic content in real time, ensuring uninterrupted access.
- - 🌐 Proxies: Use high-quality, rotating proxies to scrape top platforms like Amazon, Shopee, and more, with global coverage in 195+ countries.
- - 💼 Enterprise-Grade: Custom solutions for large-scale and complex data needs.
- - 🎁 Free Trial: Try before you buy—experience our service firsthand.
- - 💬 Pay-Per-Use: Flexible, cost-effective pricing with no long-term commitments.
- - 🔧 Easy Integration: Seamlessly integrate with your existing tools and workflows for hassle-free automation.
-
-
- [![Scrapeless Banner](https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/scrapeless.jpg)](https://www.scrapeless.com/?utm_source=github&utm_medium=ads&utm_campaign=scraping&utm_term=D4Vinci)
- ---
-
  ## Table of content
  * [Key Features](#key-features)
  * [Fetch websites as you prefer](#fetch-websites-as-you-prefer-with-async-support)
@@ -172,7 +173,7 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha
  ## Getting Started
 
  ```python
- from scrapling import Fetcher
+ from scrapling.fetchers import Fetcher
 
  fetcher = Fetcher(auto_match=False)
 
@@ -254,7 +255,7 @@ Fetchers are interfaces built on top of other libraries with added features that
  ### Features
  You might be slightly confused by now so let me clear things up. All fetcher-type classes are imported in the same way
  ```python
- from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
+ from scrapling.fetchers import Fetcher, StealthyFetcher, PlayWrightFetcher
  ```
  All of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `keep_cdata`, `storage`, and `storage_args`, which are the same ones you give to the `Adaptor` class.
 
@@ -286,7 +287,7 @@ You can route all traffic (HTTP and HTTPS) to a proxy for any of these methods i
  ```
  For Async requests, you will just replace the import like below:
  ```python
- >> from scrapling import AsyncFetcher
+ >> from scrapling.fetchers import AsyncFetcher
  >> page = await AsyncFetcher().get('https://httpbin.org/get', stealthy_headers=True, follow_redirects=True)
  >> page = await AsyncFetcher().post('https://httpbin.org/post', data={'key': 'value'}, proxy='http://username:password@localhost:8030')
  >> page = await AsyncFetcher().put('https://httpbin.org/put', data={'key': 'value'})
@@ -540,7 +541,7 @@ When website owners implement structural changes like
  The selector will no longer function and your code needs maintenance. That's where Scrapling's auto-matching feature comes into play.
 
  ```python
- from scrapling import Adaptor
+ from scrapling.parser import Adaptor
  # Before the change
  page = Adaptor(page_source, url='example.com')
  element = page.css('#p1' auto_save=True)
@@ -558,7 +559,7 @@ To solve this issue, I will use [The Web Archive](https://archive.org/)'s [Wayba
  If I want to extract the Questions button from the old design I can use a selector like this `#hmenus > div:nth-child(1) > ul > li:nth-child(1) > a` This selector is too specific because it was generated by Google Chrome.
  Now let's test the same selector in both versions
  ```python
- >> from scrapling import Fetcher
+ >> from scrapling.fetchers import Fetcher
  >> selector = '#hmenus > div:nth-child(1) > ul > li:nth-child(1) > a'
  >> old_url = "https://web.archive.org/web/20100102003420/http://stackoverflow.com/"
  >> new_url = "https://stackoverflow.com/"
@@ -619,7 +620,7 @@ Note: The filtering process always starts from the first filter it finds in the
  Examples to clear any confusion :)
 
  ```python
- >> from scrapling import Fetcher
+ >> from scrapling.fetchers import Fetcher
  >> page = Fetcher().get('https://quotes.toscrape.com/')
  # Find all elements with tag name `div`.
  >> page.find_all('div')
scrapling-0.2.98/README.md

@@ -18,6 +18,22 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha
 
  # Sponsors
 
+ [Scrapeless Deep SerpApi](https://www.scrapeless.com/en/product/deep-serp-api?utm_source=website&utm_medium=ads&utm_campaign=scraping&utm_term=d4vinci) From $0.10 per 1,000 queries with a 1-2 second response time!
+
+ Deep SerpApi is a dedicated search engine designed for large language models (LLMs) and AI agents, aiming to provide real-time, accurate and unbiased information to help AI applications retrieve and process data efficiently.
+ - covering 20+ Google SERP scenarios and mainstream search engines.
+ - support real-time data updates to ensure real-time and accurate information.
+ - It can integrate information from all available online channels and search engines.
+ - Deep SerpApi will simplify the process of integrating dynamic web information into AI solutions, and ultimately achieve an ALL-in-One API for one-click search and extraction of web data.
+ - **Developer Support Program**: Integrate Scrapeless Deep SerpApi into your AI tools, applications or projects. [We already support Dify, and will soon support frameworks such as Langchain, Langflow, FlowiseAI]. Then share your results on GitHub or social media, and you will get a 1-12 month free developer support opportunity, up to 500 free usage per month.
+ - 🚀 **Scraping API**: Effortless and highly customizable data extraction with a single API call, providing structured data from any website.
+ - ⚡ **Scraping Browser**: AI-powered and LLM-driven, it simulates human-like behavior with genuine fingerprints and headless browser support, ensuring seamless, block-free scraping.
+ - 🌐 **Proxies**: Use high-quality, rotating proxies to scrape top platforms like Amazon, Shopee, and more, with global coverage in 195+ countries.
+
+
+ [![Scrapeless Banner](https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/scrapeless.jpg)](https://www.scrapeless.com/en/product/deep-serp-api?utm_source=website&utm_medium=ads&utm_campaign=scraping&utm_term=d4vinci)
+ ---
+
  [Evomi](https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling) is your Swiss Quality Proxy Provider, starting at **$0.49/GB**
 
  - 👩‍💻 **$0.49 per GB Residential Proxies**: Our price is unbeatable
@@ -33,21 +49,6 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha
  [![Evomi Banner](https://my.evomi.com/images/brand/cta.png)](https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling)
  ---
 
- [Scrapeless](https://www.scrapeless.com/?utm_source=github&utm_medium=ads&utm_campaign=scraping&utm_term=D4Vinci) is your all-in-one web scraping toolkit, starting at just $0.60 per 1k URLs!
-
- - 🚀 Scraping API: Effortless and highly customizable data extraction with a single API call, providing structured data from any website.
- - ⚡ Scraping Browser: AI-powered and LLM-driven, it simulates human-like behavior with genuine fingerprints and headless browser support, ensuring seamless, block-free scraping.
- - 🔒 Web Unlocker: Bypass CAPTCHAs, IP blocks, and dynamic content in real time, ensuring uninterrupted access.
- - 🌐 Proxies: Use high-quality, rotating proxies to scrape top platforms like Amazon, Shopee, and more, with global coverage in 195+ countries.
- - 💼 Enterprise-Grade: Custom solutions for large-scale and complex data needs.
- - 🎁 Free Trial: Try before you buy—experience our service firsthand.
- - 💬 Pay-Per-Use: Flexible, cost-effective pricing with no long-term commitments.
- - 🔧 Easy Integration: Seamlessly integrate with your existing tools and workflows for hassle-free automation.
-
-
- [![Scrapeless Banner](https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/scrapeless.jpg)](https://www.scrapeless.com/?utm_source=github&utm_medium=ads&utm_campaign=scraping&utm_term=D4Vinci)
- ---
-
  ## Table of content
  * [Key Features](#key-features)
  * [Fetch websites as you prefer](#fetch-websites-as-you-prefer-with-async-support)
@@ -117,7 +118,7 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha
  ## Getting Started
 
  ```python
- from scrapling import Fetcher
+ from scrapling.fetchers import Fetcher
 
  fetcher = Fetcher(auto_match=False)
 
@@ -199,7 +200,7 @@ Fetchers are interfaces built on top of other libraries with added features that
  ### Features
  You might be slightly confused by now so let me clear things up. All fetcher-type classes are imported in the same way
  ```python
- from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
+ from scrapling.fetchers import Fetcher, StealthyFetcher, PlayWrightFetcher
  ```
  All of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `keep_cdata`, `storage`, and `storage_args`, which are the same ones you give to the `Adaptor` class.
 
@@ -231,7 +232,7 @@ You can route all traffic (HTTP and HTTPS) to a proxy for any of these methods i
  ```
  For Async requests, you will just replace the import like below:
  ```python
- >> from scrapling import AsyncFetcher
+ >> from scrapling.fetchers import AsyncFetcher
  >> page = await AsyncFetcher().get('https://httpbin.org/get', stealthy_headers=True, follow_redirects=True)
  >> page = await AsyncFetcher().post('https://httpbin.org/post', data={'key': 'value'}, proxy='http://username:password@localhost:8030')
  >> page = await AsyncFetcher().put('https://httpbin.org/put', data={'key': 'value'})
@@ -485,7 +486,7 @@ When website owners implement structural changes like
  The selector will no longer function and your code needs maintenance. That's where Scrapling's auto-matching feature comes into play.
 
  ```python
- from scrapling import Adaptor
+ from scrapling.parser import Adaptor
  # Before the change
  page = Adaptor(page_source, url='example.com')
  element = page.css('#p1' auto_save=True)
@@ -503,7 +504,7 @@ To solve this issue, I will use [The Web Archive](https://archive.org/)'s [Wayba
  If I want to extract the Questions button from the old design I can use a selector like this `#hmenus > div:nth-child(1) > ul > li:nth-child(1) > a` This selector is too specific because it was generated by Google Chrome.
  Now let's test the same selector in both versions
  ```python
- >> from scrapling import Fetcher
+ >> from scrapling.fetchers import Fetcher
  >> selector = '#hmenus > div:nth-child(1) > ul > li:nth-child(1) > a'
  >> old_url = "https://web.archive.org/web/20100102003420/http://stackoverflow.com/"
  >> new_url = "https://stackoverflow.com/"
@@ -564,7 +565,7 @@ Note: The filtering process always starts from the first filter it finds in the
  Examples to clear any confusion :)
 
  ```python
- >> from scrapling import Fetcher
+ >> from scrapling.fetchers import Fetcher
  >> page = Fetcher().get('https://quotes.toscrape.com/')
  # Find all elements with tag name `div`.
  >> page.find_all('div')
scrapling-0.2.98/scrapling/__init__.py

@@ -0,0 +1,41 @@
+
+ __author__ = "Karim Shoair (karim.shoair@pm.me)"
+ __version__ = "0.2.98"
+ __copyright__ = "Copyright (c) 2024 Karim Shoair"
+
+
+ # A lightweight approach to create lazy loader for each import for backward compatibility
+ # This will reduces initial memory footprint significantly (only loads what's used)
+ def __getattr__(name):
+     if name == 'Fetcher':
+         from scrapling.fetchers import Fetcher as cls
+         return cls
+     elif name == 'Adaptor':
+         from scrapling.parser import Adaptor as cls
+         return cls
+     elif name == 'Adaptors':
+         from scrapling.parser import Adaptors as cls
+         return cls
+     elif name == 'AttributesHandler':
+         from scrapling.core.custom_types import AttributesHandler as cls
+         return cls
+     elif name == 'TextHandler':
+         from scrapling.core.custom_types import TextHandler as cls
+         return cls
+     elif name == 'AsyncFetcher':
+         from scrapling.fetchers import AsyncFetcher as cls
+         return cls
+     elif name == 'StealthyFetcher':
+         from scrapling.fetchers import StealthyFetcher as cls
+         return cls
+     elif name == 'PlayWrightFetcher':
+         from scrapling.fetchers import PlayWrightFetcher as cls
+         return cls
+     elif name == 'CustomFetcher':
+         from scrapling.fetchers import CustomFetcher as cls
+         return cls
+     else:
+         raise AttributeError(f"module 'scrapling' has no attribute '{name}'")
+
+
+ __all__ = ['Adaptor', 'Fetcher', 'AsyncFetcher', 'StealthyFetcher', 'PlayWrightFetcher']
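The new top-level `__init__.py` keeps the old `from scrapling import ...` spelling working through a module-level `__getattr__` (PEP 562), importing each submodule only when its name is first accessed. A minimal behavioral sketch, assuming scrapling 0.2.98 is installed and nothing else has imported the submodules yet:

```python
# Minimal sketch, assuming scrapling 0.2.98 is installed in a fresh interpreter.
import sys

import scrapling

# Importing the package alone does not pull in the heavier submodules.
assert 'scrapling.fetchers' not in sys.modules

# Attribute access triggers scrapling.__getattr__('Fetcher'), which imports
# scrapling.fetchers on demand and returns the class.
LazyFetcher = scrapling.Fetcher
assert 'scrapling.fetchers' in sys.modules

# The new direct import path resolves to the very same class object.
from scrapling.fetchers import Fetcher
assert LazyFetcher is Fetcher
```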
scrapling-0.2.98/scrapling/core/custom_types.py

@@ -19,9 +19,7 @@ class TextHandler(str):
      __slots__ = ()
 
      def __new__(cls, string):
-         if isinstance(string, str):
-             return super().__new__(cls, string)
-         return super().__new__(cls, '')
+         return super().__new__(cls, str(string))
 
      def __getitem__(self, key: Union[SupportsIndex, slice]) -> "TextHandler":
          lst = super().__getitem__(key)
scrapling-0.2.98/scrapling/core/storage_adaptors.py

@@ -19,7 +19,7 @@ class StorageSystemMixin(ABC):
          """
          self.url = url
 
-     @lru_cache(None, typed=True)
+     @lru_cache(64, typed=True)
      def _get_base_url(self, default_value: str = 'default') -> str:
          if not self.url or type(self.url) is not str:
              return default_value
@@ -51,7 +51,7 @@ class StorageSystemMixin(ABC):
          raise NotImplementedError('Storage system must implement `save` method')
 
      @staticmethod
-     @lru_cache(None, typed=True)
+     @lru_cache(128, typed=True)
      def _get_hash(identifier: str) -> str:
          """If you want to hash identifier in your storage system, use this safer"""
          identifier = identifier.lower().strip()
@@ -63,7 +63,7 @@ class StorageSystemMixin(ABC):
          return f"{hash_value}_{len(identifier)}" # Length to reduce collision chance
 
 
- @lru_cache(None, typed=True)
+ @lru_cache(1, typed=True)
  class SQLiteStorageSystem(StorageSystemMixin):
      """The recommended system to use, it's race condition safe and thread safe.
      Mainly built so the library can run in threaded frameworks like scrapy or threaded tools
scrapling-0.2.98/scrapling/core/translator.py

@@ -139,6 +139,9 @@ class TranslatorMixin:
 
 
  class HTMLTranslator(TranslatorMixin, OriginalHTMLTranslator):
-     @lru_cache(maxsize=2048)
+     @lru_cache(maxsize=256)
      def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str:
          return super().css_to_xpath(css, prefix)
+
+
+ translator_instance = HTMLTranslator()
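The new module-level `translator_instance` exposes a shared `HTMLTranslator`, so repeated conversions of the same selector are served from the (now 256-entry) `lru_cache` instead of being recomputed. An illustrative sketch, assuming scrapling 0.2.98:

```python
# Illustrative sketch, assuming scrapling 0.2.98.
from scrapling.core.translator import translator_instance

# The first call computes the XPath; later calls with the same selector hit the cache.
xpath = translator_instance.css_to_xpath('div.quote > span.text')
print(xpath)  # the equivalent XPath, rooted at 'descendant-or-self::'
```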
scrapling-0.2.98/scrapling/core/utils.py

@@ -115,7 +115,7 @@ class _StorageTools:
  # return _impl
 
 
- @lru_cache(None, typed=True)
+ @lru_cache(128, typed=True)
  def clean_spaces(string):
      string = string.replace('\t', ' ')
      string = re.sub('[\n|\r]', '', string)
scrapling-0.2.98/scrapling/defaults.py

@@ -0,0 +1,19 @@
+ # If you are going to use Fetchers with the default settings, import them from this file instead for a cleaner looking code
+
+ # A lightweight approach to create lazy loader for each import for backward compatibility
+ # This will reduces initial memory footprint significantly (only loads what's used)
+ def __getattr__(name):
+     if name == 'Fetcher':
+         from scrapling.fetchers import Fetcher as cls
+         return cls()
+     elif name == 'AsyncFetcher':
+         from scrapling.fetchers import AsyncFetcher as cls
+         return cls()
+     elif name == 'StealthyFetcher':
+         from scrapling.fetchers import StealthyFetcher as cls
+         return cls()
+     elif name == 'PlayWrightFetcher':
+         from scrapling.fetchers import PlayWrightFetcher as cls
+         return cls()
+     else:
+         raise AttributeError(f"module 'scrapling' has no attribute '{name}'")
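Unlike the package `__init__.py`, the lazy loader in `defaults.py` returns ready-made instances (`cls()`) rather than classes, preserving the old convenience imports. A hedged usage sketch, assuming scrapling 0.2.98:

```python
# Hedged sketch, assuming scrapling 0.2.98.
from scrapling.defaults import Fetcher                   # a pre-built Fetcher instance
from scrapling.fetchers import Fetcher as FetcherClass   # the class itself

assert isinstance(Fetcher, FetcherClass)

# The instance is usable right away, e.g.:
# page = Fetcher.get('https://quotes.toscrape.com/')
```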