scrapling-0.2.96.tar.gz → scrapling-0.2.98.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {scrapling-0.2.96/scrapling.egg-info → scrapling-0.2.98}/PKG-INFO +23 -22
- {scrapling-0.2.96 → scrapling-0.2.98}/README.md +22 -21
- scrapling-0.2.98/scrapling/__init__.py +41 -0
- {scrapling-0.2.96 → scrapling-0.2.98}/scrapling/core/custom_types.py +1 -3
- {scrapling-0.2.96 → scrapling-0.2.98}/scrapling/core/storage_adaptors.py +3 -3
- {scrapling-0.2.96 → scrapling-0.2.98}/scrapling/core/translator.py +4 -1
- {scrapling-0.2.96 → scrapling-0.2.98}/scrapling/core/utils.py +1 -1
- scrapling-0.2.98/scrapling/defaults.py +19 -0
- {scrapling-0.2.96 → scrapling-0.2.98}/scrapling/engines/camo.py +123 -104
- {scrapling-0.2.96 → scrapling-0.2.98}/scrapling/engines/pw.py +100 -75
- {scrapling-0.2.96 → scrapling-0.2.98}/scrapling/engines/static.py +22 -42
- {scrapling-0.2.96 → scrapling-0.2.98}/scrapling/engines/toolbelt/custom.py +2 -2
- {scrapling-0.2.96 → scrapling-0.2.98}/scrapling/engines/toolbelt/fingerprints.py +2 -2
- {scrapling-0.2.96 → scrapling-0.2.98}/scrapling/engines/toolbelt/navigation.py +1 -1
- {scrapling-0.2.96 → scrapling-0.2.98}/scrapling/fetchers.py +24 -24
- {scrapling-0.2.96 → scrapling-0.2.98}/scrapling/parser.py +6 -12
- {scrapling-0.2.96 → scrapling-0.2.98/scrapling.egg-info}/PKG-INFO +23 -22
- {scrapling-0.2.96 → scrapling-0.2.98}/setup.cfg +1 -1
- {scrapling-0.2.96 → scrapling-0.2.98}/setup.py +1 -1
- scrapling-0.2.96/scrapling/__init__.py +0 -12
- scrapling-0.2.96/scrapling/defaults.py +0 -10
- {scrapling-0.2.96 → scrapling-0.2.98}/LICENSE +0 -0
- {scrapling-0.2.96 → scrapling-0.2.98}/MANIFEST.in +0 -0
- {scrapling-0.2.96 → scrapling-0.2.98}/scrapling/cli.py +0 -0
- {scrapling-0.2.96 → scrapling-0.2.98}/scrapling/core/__init__.py +0 -0
- {scrapling-0.2.96 → scrapling-0.2.98}/scrapling/core/_types.py +0 -0
- {scrapling-0.2.96 → scrapling-0.2.98}/scrapling/core/mixins.py +0 -0
- {scrapling-0.2.96 → scrapling-0.2.98}/scrapling/engines/__init__.py +0 -0
- {scrapling-0.2.96 → scrapling-0.2.98}/scrapling/engines/constants.py +0 -0
- {scrapling-0.2.96 → scrapling-0.2.98}/scrapling/engines/toolbelt/__init__.py +0 -0
- {scrapling-0.2.96 → scrapling-0.2.98}/scrapling/engines/toolbelt/bypasses/navigator_plugins.js +0 -0
- {scrapling-0.2.96 → scrapling-0.2.98}/scrapling/engines/toolbelt/bypasses/notification_permission.js +0 -0
- {scrapling-0.2.96 → scrapling-0.2.98}/scrapling/engines/toolbelt/bypasses/pdf_viewer.js +0 -0
- {scrapling-0.2.96 → scrapling-0.2.98}/scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +0 -0
- {scrapling-0.2.96 → scrapling-0.2.98}/scrapling/engines/toolbelt/bypasses/screen_props.js +0 -0
- {scrapling-0.2.96 → scrapling-0.2.98}/scrapling/engines/toolbelt/bypasses/webdriver_fully.js +0 -0
- {scrapling-0.2.96 → scrapling-0.2.98}/scrapling/engines/toolbelt/bypasses/window_chrome.js +0 -0
- {scrapling-0.2.96 → scrapling-0.2.98}/scrapling/py.typed +0 -0
- {scrapling-0.2.96 → scrapling-0.2.98}/scrapling.egg-info/SOURCES.txt +0 -0
- {scrapling-0.2.96 → scrapling-0.2.98}/scrapling.egg-info/dependency_links.txt +0 -0
- {scrapling-0.2.96 → scrapling-0.2.98}/scrapling.egg-info/entry_points.txt +0 -0
- {scrapling-0.2.96 → scrapling-0.2.98}/scrapling.egg-info/not-zip-safe +0 -0
- {scrapling-0.2.96 → scrapling-0.2.98}/scrapling.egg-info/requires.txt +0 -0
- {scrapling-0.2.96 → scrapling-0.2.98}/scrapling.egg-info/top_level.txt +0 -0
- {scrapling-0.2.96 → scrapling-0.2.98}/tests/__init__.py +0 -0
- {scrapling-0.2.96 → scrapling-0.2.98}/tests/fetchers/__init__.py +0 -0
- {scrapling-0.2.96 → scrapling-0.2.98}/tests/fetchers/async/__init__.py +0 -0
- {scrapling-0.2.96 → scrapling-0.2.98}/tests/fetchers/async/test_camoufox.py +0 -0
- {scrapling-0.2.96 → scrapling-0.2.98}/tests/fetchers/async/test_httpx.py +0 -0
- {scrapling-0.2.96 → scrapling-0.2.98}/tests/fetchers/async/test_playwright.py +0 -0
- {scrapling-0.2.96 → scrapling-0.2.98}/tests/fetchers/sync/__init__.py +0 -0
- {scrapling-0.2.96 → scrapling-0.2.98}/tests/fetchers/sync/test_camoufox.py +0 -0
- {scrapling-0.2.96 → scrapling-0.2.98}/tests/fetchers/sync/test_httpx.py +0 -0
- {scrapling-0.2.96 → scrapling-0.2.98}/tests/fetchers/sync/test_playwright.py +0 -0
- {scrapling-0.2.96 → scrapling-0.2.98}/tests/fetchers/test_utils.py +0 -0
- {scrapling-0.2.96 → scrapling-0.2.98}/tests/parser/__init__.py +0 -0
- {scrapling-0.2.96 → scrapling-0.2.98}/tests/parser/test_automatch.py +0 -0
- {scrapling-0.2.96 → scrapling-0.2.98}/tests/parser/test_general.py +0 -0
````diff
--- scrapling-0.2.96/scrapling.egg-info/PKG-INFO
+++ scrapling-0.2.98/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: scrapling
-Version: 0.2.96
+Version: 0.2.98
 Summary: Scrapling is an undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy again! In an internet filled with complications,
 Home-page: https://github.com/D4Vinci/Scrapling
 Author: Karim Shoair
@@ -73,6 +73,22 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha
 
 # Sponsors
 
+[Scrapeless Deep SerpApi](https://www.scrapeless.com/en/product/deep-serp-api?utm_source=website&utm_medium=ads&utm_campaign=scraping&utm_term=d4vinci) From $0.10 per 1,000 queries with a 1-2 second response time!
+
+Deep SerpApi is a dedicated search engine designed for large language models (LLMs) and AI agents, aiming to provide real-time, accurate and unbiased information to help AI applications retrieve and process data efficiently.
+- covering 20+ Google SERP scenarios and mainstream search engines.
+- support real-time data updates to ensure real-time and accurate information.
+- It can integrate information from all available online channels and search engines.
+- Deep SerpApi will simplify the process of integrating dynamic web information into AI solutions, and ultimately achieve an ALL-in-One API for one-click search and extraction of web data.
+- **Developer Support Program**: Integrate Scrapeless Deep SerpApi into your AI tools, applications or projects. [We already support Dify, and will soon support frameworks such as Langchain, Langflow, FlowiseAI]. Then share your results on GitHub or social media, and you will get a 1-12 month free developer support opportunity, up to 500 free usage per month.
+- 🚀 **Scraping API**: Effortless and highly customizable data extraction with a single API call, providing structured data from any website.
+- ⚡ **Scraping Browser**: AI-powered and LLM-driven, it simulates human-like behavior with genuine fingerprints and headless browser support, ensuring seamless, block-free scraping.
+- 🌐 **Proxies**: Use high-quality, rotating proxies to scrape top platforms like Amazon, Shopee, and more, with global coverage in 195+ countries.
+
+
+[](https://www.scrapeless.com/en/product/deep-serp-api?utm_source=website&utm_medium=ads&utm_campaign=scraping&utm_term=d4vinci)
+---
+
 [Evomi](https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling) is your Swiss Quality Proxy Provider, starting at **$0.49/GB**
 
 - 👩‍💻 **$0.49 per GB Residential Proxies**: Our price is unbeatable
@@ -88,21 +104,6 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha
 [](https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling)
 ---
 
-[Scrapeless](https://www.scrapeless.com/?utm_source=github&utm_medium=ads&utm_campaign=scraping&utm_term=D4Vinci) is your all-in-one web scraping toolkit, starting at just $0.60 per 1k URLs!
-
-- 🚀 Scraping API: Effortless and highly customizable data extraction with a single API call, providing structured data from any website.
-- ⚡ Scraping Browser: AI-powered and LLM-driven, it simulates human-like behavior with genuine fingerprints and headless browser support, ensuring seamless, block-free scraping.
-- 🔒 Web Unlocker: Bypass CAPTCHAs, IP blocks, and dynamic content in real time, ensuring uninterrupted access.
-- 🌐 Proxies: Use high-quality, rotating proxies to scrape top platforms like Amazon, Shopee, and more, with global coverage in 195+ countries.
-- 💼 Enterprise-Grade: Custom solutions for large-scale and complex data needs.
-- 🎁 Free Trial: Try before you buy—experience our service firsthand.
-- 💬 Pay-Per-Use: Flexible, cost-effective pricing with no long-term commitments.
-- 🔧 Easy Integration: Seamlessly integrate with your existing tools and workflows for hassle-free automation.
-
-
-[](https://www.scrapeless.com/?utm_source=github&utm_medium=ads&utm_campaign=scraping&utm_term=D4Vinci)
----
-
 ## Table of content
 * [Key Features](#key-features)
 * [Fetch websites as you prefer](#fetch-websites-as-you-prefer-with-async-support)
@@ -172,7 +173,7 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha
 ## Getting Started
 
 ```python
-from scrapling import Fetcher
+from scrapling.fetchers import Fetcher
 
 fetcher = Fetcher(auto_match=False)
 
@@ -254,7 +255,7 @@ Fetchers are interfaces built on top of other libraries with added features that
 ### Features
 You might be slightly confused by now so let me clear things up. All fetcher-type classes are imported in the same way
 ```python
-from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
+from scrapling.fetchers import Fetcher, StealthyFetcher, PlayWrightFetcher
 ```
 All of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `keep_cdata`, `storage`, and `storage_args`, which are the same ones you give to the `Adaptor` class.
 
@@ -286,7 +287,7 @@ You can route all traffic (HTTP and HTTPS) to a proxy for any of these methods i
 ```
 For Async requests, you will just replace the import like below:
 ```python
->> from scrapling import AsyncFetcher
+>> from scrapling.fetchers import AsyncFetcher
 >> page = await AsyncFetcher().get('https://httpbin.org/get', stealthy_headers=True, follow_redirects=True)
 >> page = await AsyncFetcher().post('https://httpbin.org/post', data={'key': 'value'}, proxy='http://username:password@localhost:8030')
 >> page = await AsyncFetcher().put('https://httpbin.org/put', data={'key': 'value'})
@@ -540,7 +541,7 @@ When website owners implement structural changes like
 The selector will no longer function and your code needs maintenance. That's where Scrapling's auto-matching feature comes into play.
 
 ```python
-from scrapling import Adaptor
+from scrapling.parser import Adaptor
 # Before the change
 page = Adaptor(page_source, url='example.com')
 element = page.css('#p1' auto_save=True)
@@ -558,7 +559,7 @@ To solve this issue, I will use [The Web Archive](https://archive.org/)'s [Wayba
 If I want to extract the Questions button from the old design I can use a selector like this `#hmenus > div:nth-child(1) > ul > li:nth-child(1) > a` This selector is too specific because it was generated by Google Chrome.
 Now let's test the same selector in both versions
 ```python
->> from scrapling import Fetcher
+>> from scrapling.fetchers import Fetcher
 >> selector = '#hmenus > div:nth-child(1) > ul > li:nth-child(1) > a'
 >> old_url = "https://web.archive.org/web/20100102003420/http://stackoverflow.com/"
 >> new_url = "https://stackoverflow.com/"
@@ -619,7 +620,7 @@ Note: The filtering process always starts from the first filter it finds in the
 Examples to clear any confusion :)
 
 ```python
->> from scrapling import Fetcher
+>> page = Fetcher().get('https://quotes.toscrape.com/')
 # Find all elements with tag name `div`.
 >> page.find_all('div')
````
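Every documentation hunk above (and mirrored in README.md below) migrates example imports from the package root to the concrete submodules `scrapling.fetchers` and `scrapling.parser`. A minimal sketch of the two import styles side by side, assuming scrapling 0.2.98 is installed (the CSS selector is illustrative, not from the diff):

```python
# New, explicit import path used throughout the 0.2.98 docs
from scrapling.fetchers import Fetcher

# The old top-level import keeps working because the new scrapling/__init__.py
# (shown further below) resolves these names lazily on first access
from scrapling import Fetcher as LegacyFetcher

page = Fetcher(auto_match=False).get('https://quotes.toscrape.com/')
print(page.css('.quote .text::text'))  # illustrative selector
```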
````diff
--- scrapling-0.2.96/README.md
+++ scrapling-0.2.98/README.md
@@ -18,6 +18,22 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha
 
 # Sponsors
 
+[Scrapeless Deep SerpApi](https://www.scrapeless.com/en/product/deep-serp-api?utm_source=website&utm_medium=ads&utm_campaign=scraping&utm_term=d4vinci) From $0.10 per 1,000 queries with a 1-2 second response time!
+
+Deep SerpApi is a dedicated search engine designed for large language models (LLMs) and AI agents, aiming to provide real-time, accurate and unbiased information to help AI applications retrieve and process data efficiently.
+- covering 20+ Google SERP scenarios and mainstream search engines.
+- support real-time data updates to ensure real-time and accurate information.
+- It can integrate information from all available online channels and search engines.
+- Deep SerpApi will simplify the process of integrating dynamic web information into AI solutions, and ultimately achieve an ALL-in-One API for one-click search and extraction of web data.
+- **Developer Support Program**: Integrate Scrapeless Deep SerpApi into your AI tools, applications or projects. [We already support Dify, and will soon support frameworks such as Langchain, Langflow, FlowiseAI]. Then share your results on GitHub or social media, and you will get a 1-12 month free developer support opportunity, up to 500 free usage per month.
+- 🚀 **Scraping API**: Effortless and highly customizable data extraction with a single API call, providing structured data from any website.
+- ⚡ **Scraping Browser**: AI-powered and LLM-driven, it simulates human-like behavior with genuine fingerprints and headless browser support, ensuring seamless, block-free scraping.
+- 🌐 **Proxies**: Use high-quality, rotating proxies to scrape top platforms like Amazon, Shopee, and more, with global coverage in 195+ countries.
+
+
+[](https://www.scrapeless.com/en/product/deep-serp-api?utm_source=website&utm_medium=ads&utm_campaign=scraping&utm_term=d4vinci)
+---
+
 [Evomi](https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling) is your Swiss Quality Proxy Provider, starting at **$0.49/GB**
 
 - 👩‍💻 **$0.49 per GB Residential Proxies**: Our price is unbeatable
@@ -33,21 +49,6 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha
 [](https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling)
 ---
 
-[Scrapeless](https://www.scrapeless.com/?utm_source=github&utm_medium=ads&utm_campaign=scraping&utm_term=D4Vinci) is your all-in-one web scraping toolkit, starting at just $0.60 per 1k URLs!
-
-- 🚀 Scraping API: Effortless and highly customizable data extraction with a single API call, providing structured data from any website.
-- ⚡ Scraping Browser: AI-powered and LLM-driven, it simulates human-like behavior with genuine fingerprints and headless browser support, ensuring seamless, block-free scraping.
-- 🔒 Web Unlocker: Bypass CAPTCHAs, IP blocks, and dynamic content in real time, ensuring uninterrupted access.
-- 🌐 Proxies: Use high-quality, rotating proxies to scrape top platforms like Amazon, Shopee, and more, with global coverage in 195+ countries.
-- 💼 Enterprise-Grade: Custom solutions for large-scale and complex data needs.
-- 🎁 Free Trial: Try before you buy—experience our service firsthand.
-- 💬 Pay-Per-Use: Flexible, cost-effective pricing with no long-term commitments.
-- 🔧 Easy Integration: Seamlessly integrate with your existing tools and workflows for hassle-free automation.
-
-
-[](https://www.scrapeless.com/?utm_source=github&utm_medium=ads&utm_campaign=scraping&utm_term=D4Vinci)
----
-
 ## Table of content
 * [Key Features](#key-features)
 * [Fetch websites as you prefer](#fetch-websites-as-you-prefer-with-async-support)
@@ -117,7 +118,7 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha
 ## Getting Started
 
 ```python
-from scrapling import Fetcher
+from scrapling.fetchers import Fetcher
 
 fetcher = Fetcher(auto_match=False)
 
@@ -199,7 +200,7 @@ Fetchers are interfaces built on top of other libraries with added features that
 ### Features
 You might be slightly confused by now so let me clear things up. All fetcher-type classes are imported in the same way
 ```python
-from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
+from scrapling.fetchers import Fetcher, StealthyFetcher, PlayWrightFetcher
 ```
 All of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `keep_cdata`, `storage`, and `storage_args`, which are the same ones you give to the `Adaptor` class.
 
@@ -231,7 +232,7 @@ You can route all traffic (HTTP and HTTPS) to a proxy for any of these methods i
 ```
 For Async requests, you will just replace the import like below:
 ```python
->> from scrapling import AsyncFetcher
+>> from scrapling.fetchers import AsyncFetcher
 >> page = await AsyncFetcher().get('https://httpbin.org/get', stealthy_headers=True, follow_redirects=True)
 >> page = await AsyncFetcher().post('https://httpbin.org/post', data={'key': 'value'}, proxy='http://username:password@localhost:8030')
 >> page = await AsyncFetcher().put('https://httpbin.org/put', data={'key': 'value'})
@@ -485,7 +486,7 @@ When website owners implement structural changes like
 The selector will no longer function and your code needs maintenance. That's where Scrapling's auto-matching feature comes into play.
 
 ```python
-from scrapling import Adaptor
+from scrapling.parser import Adaptor
 # Before the change
 page = Adaptor(page_source, url='example.com')
 element = page.css('#p1' auto_save=True)
@@ -503,7 +504,7 @@ To solve this issue, I will use [The Web Archive](https://archive.org/)'s [Wayba
 If I want to extract the Questions button from the old design I can use a selector like this `#hmenus > div:nth-child(1) > ul > li:nth-child(1) > a` This selector is too specific because it was generated by Google Chrome.
 Now let's test the same selector in both versions
 ```python
->> from scrapling import Fetcher
+>> from scrapling.fetchers import Fetcher
 >> selector = '#hmenus > div:nth-child(1) > ul > li:nth-child(1) > a'
 >> old_url = "https://web.archive.org/web/20100102003420/http://stackoverflow.com/"
 >> new_url = "https://stackoverflow.com/"
@@ -564,7 +565,7 @@ Note: The filtering process always starts from the first filter it finds in the
 Examples to clear any confusion :)
 
 ```python
->> from scrapling import Fetcher
+>> from scrapling.fetchers import Fetcher
 >> page = Fetcher().get('https://quotes.toscrape.com/')
 # Find all elements with tag name `div`.
 >> page.find_all('div')
````
````diff
--- /dev/null
+++ scrapling-0.2.98/scrapling/__init__.py
@@ -0,0 +1,41 @@
+
+__author__ = "Karim Shoair (karim.shoair@pm.me)"
+__version__ = "0.2.98"
+__copyright__ = "Copyright (c) 2024 Karim Shoair"
+
+
+# A lightweight approach to create lazy loader for each import for backward compatibility
+# This will reduces initial memory footprint significantly (only loads what's used)
+def __getattr__(name):
+    if name == 'Fetcher':
+        from scrapling.fetchers import Fetcher as cls
+        return cls
+    elif name == 'Adaptor':
+        from scrapling.parser import Adaptor as cls
+        return cls
+    elif name == 'Adaptors':
+        from scrapling.parser import Adaptors as cls
+        return cls
+    elif name == 'AttributesHandler':
+        from scrapling.core.custom_types import AttributesHandler as cls
+        return cls
+    elif name == 'TextHandler':
+        from scrapling.core.custom_types import TextHandler as cls
+        return cls
+    elif name == 'AsyncFetcher':
+        from scrapling.fetchers import AsyncFetcher as cls
+        return cls
+    elif name == 'StealthyFetcher':
+        from scrapling.fetchers import StealthyFetcher as cls
+        return cls
+    elif name == 'PlayWrightFetcher':
+        from scrapling.fetchers import PlayWrightFetcher as cls
+        return cls
+    elif name == 'CustomFetcher':
+        from scrapling.fetchers import CustomFetcher as cls
+        return cls
+    else:
+        raise AttributeError(f"module 'scrapling' has no attribute '{name}'")
+
+
+__all__ = ['Adaptor', 'Fetcher', 'AsyncFetcher', 'StealthyFetcher', 'PlayWrightFetcher']
````
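The new package root relies on module-level `__getattr__` (PEP 562, Python 3.7+): a name is imported only the first time someone asks for it. Below is a standalone sketch of the same pattern; the module and attribute names (`heavy_module`, `HeavyThing`) are hypothetical:

```python
# lazy_pkg.py — a minimal PEP 562 lazy loader, same shape as scrapling/__init__.py
import importlib

_LAZY_NAMES = {
    # attribute name -> module that actually defines it (hypothetical names)
    'HeavyThing': 'heavy_module',
}


def __getattr__(name):
    # Called only when `name` is not found normally, so the heavy import is
    # deferred until first access instead of running at package import time.
    if name in _LAZY_NAMES:
        module = importlib.import_module(_LAZY_NAMES[name])
        return getattr(module, name)
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
```

With this in place, `import lazy_pkg` stays cheap; the cost of importing `heavy_module` is paid only on the first `lazy_pkg.HeavyThing` lookup.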
````diff
--- scrapling-0.2.96/scrapling/core/custom_types.py
+++ scrapling-0.2.98/scrapling/core/custom_types.py
@@ -19,9 +19,7 @@ class TextHandler(str):
     __slots__ = ()
 
     def __new__(cls, string):
-        if type(string) is str:
-            return super().__new__(cls, string)
-        return super().__new__(cls, '')
+        return super().__new__(cls, str(string))
 
     def __getitem__(self, key: Union[SupportsIndex, slice]) -> "TextHandler":
         lst = super().__getitem__(key)
````
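The `TextHandler.__new__` change replaces a strict type check with `str()` coercion. A small sketch of the behavioral difference; the pre-change branch is reconstructed from the diff context above, so treat the `Old` variant as an approximation:

```python
class OldTextHandler(str):
    def __new__(cls, string):
        # Pre-0.2.98 behavior (approximate): non-str input collapsed to ''
        if type(string) is str:
            return super().__new__(cls, string)
        return super().__new__(cls, '')


class NewTextHandler(str):
    def __new__(cls, string):
        # 0.2.98 behavior: any input is coerced with str()
        return super().__new__(cls, str(string))


print(repr(OldTextHandler(123)))  # ''
print(repr(NewTextHandler(123)))  # '123'
```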
````diff
--- scrapling-0.2.96/scrapling/core/storage_adaptors.py
+++ scrapling-0.2.98/scrapling/core/storage_adaptors.py
@@ -19,7 +19,7 @@ class StorageSystemMixin(ABC):
         """
         self.url = url
 
-    @lru_cache(
+    @lru_cache(64, typed=True)
     def _get_base_url(self, default_value: str = 'default') -> str:
         if not self.url or type(self.url) is not str:
             return default_value
@@ -51,7 +51,7 @@ class StorageSystemMixin(ABC):
         raise NotImplementedError('Storage system must implement `save` method')
 
     @staticmethod
-    @lru_cache(
+    @lru_cache(128, typed=True)
     def _get_hash(identifier: str) -> str:
         """If you want to hash identifier in your storage system, use this safer"""
         identifier = identifier.lower().strip()
@@ -63,7 +63,7 @@ class StorageSystemMixin(ABC):
         return f"{hash_value}_{len(identifier)}"  # Length to reduce collision chance
 
 
-@lru_cache(
+@lru_cache(1, typed=True)
 class SQLiteStorageSystem(StorageSystemMixin):
     """The recommended system to use, it's race condition safe and thread safe.
     Mainly built so the library can run in threaded frameworks like scrapy or threaded tools
````
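All three `storage_adaptors.py` hunks pin an explicit `maxsize` and `typed=True` on `functools.lru_cache` (the pre-change argument lists are truncated in this diff view). A quick sketch of what those two arguments do:

```python
from functools import lru_cache


@lru_cache(maxsize=2, typed=True)
def square(x):
    print(f"computing {x!r}")
    return x * x


square(2)    # computed
square(2)    # cache hit
square(2.0)  # computed again: typed=True keys int 2 and float 2.0 separately
square(3)
square(4)    # maxsize=2, so the least recently used entry gets evicted
print(square.cache_info())
```

Note that the last hunk decorates the `SQLiteStorageSystem` class itself, which memoizes construction: with `maxsize=1, typed=True`, repeated instantiation with the same arguments returns the one cached instance.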
````diff
--- scrapling-0.2.96/scrapling/core/translator.py
+++ scrapling-0.2.98/scrapling/core/translator.py
@@ -139,6 +139,9 @@ class TranslatorMixin:
 
 
 class HTMLTranslator(TranslatorMixin, OriginalHTMLTranslator):
-    @lru_cache(maxsize=
+    @lru_cache(maxsize=256)
     def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str:
         return super().css_to_xpath(css, prefix)
+
+
+translator_instance = HTMLTranslator()
````
````diff
--- /dev/null
+++ scrapling-0.2.98/scrapling/defaults.py
@@ -0,0 +1,19 @@
+# If you are going to use Fetchers with the default settings, import them from this file instead for a cleaner looking code
+
+# A lightweight approach to create lazy loader for each import for backward compatibility
+# This will reduces initial memory footprint significantly (only loads what's used)
+def __getattr__(name):
+    if name == 'Fetcher':
+        from scrapling.fetchers import Fetcher as cls
+        return cls()
+    elif name == 'AsyncFetcher':
+        from scrapling.fetchers import AsyncFetcher as cls
+        return cls()
+    elif name == 'StealthyFetcher':
+        from scrapling.fetchers import StealthyFetcher as cls
+        return cls()
+    elif name == 'PlayWrightFetcher':
+        from scrapling.fetchers import PlayWrightFetcher as cls
+        return cls()
+    else:
+        raise AttributeError(f"module 'scrapling' has no attribute '{name}'")
````