scrapling 0.2.92__tar.gz → 0.2.93__tar.gz
- {scrapling-0.2.92/scrapling.egg-info → scrapling-0.2.93}/PKG-INFO +58 -32
- {scrapling-0.2.92 → scrapling-0.2.93}/README.md +42 -26
- {scrapling-0.2.92 → scrapling-0.2.93}/scrapling/__init__.py +1 -1
- {scrapling-0.2.92 → scrapling-0.2.93}/scrapling/core/_types.py +2 -1
- {scrapling-0.2.92 → scrapling-0.2.93}/scrapling/core/custom_types.py +91 -39
- {scrapling-0.2.92 → scrapling-0.2.93}/scrapling/core/translator.py +1 -1
- scrapling-0.2.93/scrapling/defaults.py +10 -0
- {scrapling-0.2.92 → scrapling-0.2.93}/scrapling/engines/camo.py +6 -2
- {scrapling-0.2.92 → scrapling-0.2.93}/scrapling/engines/pw.py +1 -1
- {scrapling-0.2.92 → scrapling-0.2.93}/scrapling/fetchers.py +5 -5
- {scrapling-0.2.92 → scrapling-0.2.93}/scrapling/parser.py +153 -189
- {scrapling-0.2.92 → scrapling-0.2.93/scrapling.egg-info}/PKG-INFO +58 -32
- {scrapling-0.2.92 → scrapling-0.2.93}/scrapling.egg-info/requires.txt +2 -3
- {scrapling-0.2.92 → scrapling-0.2.93}/setup.cfg +1 -1
- {scrapling-0.2.92 → scrapling-0.2.93}/setup.py +4 -5
- {scrapling-0.2.92 → scrapling-0.2.93}/tests/fetchers/async/test_playwright.py +1 -1
- {scrapling-0.2.92 → scrapling-0.2.93}/tests/fetchers/sync/test_playwright.py +1 -1
- scrapling-0.2.92/scrapling/defaults.py +0 -7
- {scrapling-0.2.92 → scrapling-0.2.93}/LICENSE +0 -0
- {scrapling-0.2.92 → scrapling-0.2.93}/MANIFEST.in +0 -0
- {scrapling-0.2.92 → scrapling-0.2.93}/scrapling/cli.py +0 -0
- {scrapling-0.2.92 → scrapling-0.2.93}/scrapling/core/__init__.py +0 -0
- {scrapling-0.2.92 → scrapling-0.2.93}/scrapling/core/mixins.py +0 -0
- {scrapling-0.2.92 → scrapling-0.2.93}/scrapling/core/storage_adaptors.py +0 -0
- {scrapling-0.2.92 → scrapling-0.2.93}/scrapling/core/utils.py +0 -0
- {scrapling-0.2.92 → scrapling-0.2.93}/scrapling/engines/__init__.py +0 -0
- {scrapling-0.2.92 → scrapling-0.2.93}/scrapling/engines/constants.py +0 -0
- {scrapling-0.2.92 → scrapling-0.2.93}/scrapling/engines/static.py +0 -0
- {scrapling-0.2.92 → scrapling-0.2.93}/scrapling/engines/toolbelt/__init__.py +0 -0
- {scrapling-0.2.92 → scrapling-0.2.93}/scrapling/engines/toolbelt/bypasses/navigator_plugins.js +0 -0
- {scrapling-0.2.92 → scrapling-0.2.93}/scrapling/engines/toolbelt/bypasses/notification_permission.js +0 -0
- {scrapling-0.2.92 → scrapling-0.2.93}/scrapling/engines/toolbelt/bypasses/pdf_viewer.js +0 -0
- {scrapling-0.2.92 → scrapling-0.2.93}/scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +0 -0
- {scrapling-0.2.92 → scrapling-0.2.93}/scrapling/engines/toolbelt/bypasses/screen_props.js +0 -0
- {scrapling-0.2.92 → scrapling-0.2.93}/scrapling/engines/toolbelt/bypasses/webdriver_fully.js +0 -0
- {scrapling-0.2.92 → scrapling-0.2.93}/scrapling/engines/toolbelt/bypasses/window_chrome.js +0 -0
- {scrapling-0.2.92 → scrapling-0.2.93}/scrapling/engines/toolbelt/custom.py +0 -0
- {scrapling-0.2.92 → scrapling-0.2.93}/scrapling/engines/toolbelt/fingerprints.py +0 -0
- {scrapling-0.2.92 → scrapling-0.2.93}/scrapling/engines/toolbelt/navigation.py +0 -0
- {scrapling-0.2.92 → scrapling-0.2.93}/scrapling/py.typed +0 -0
- {scrapling-0.2.92 → scrapling-0.2.93}/scrapling.egg-info/SOURCES.txt +0 -0
- {scrapling-0.2.92 → scrapling-0.2.93}/scrapling.egg-info/dependency_links.txt +0 -0
- {scrapling-0.2.92 → scrapling-0.2.93}/scrapling.egg-info/entry_points.txt +0 -0
- {scrapling-0.2.92 → scrapling-0.2.93}/scrapling.egg-info/not-zip-safe +0 -0
- {scrapling-0.2.92 → scrapling-0.2.93}/scrapling.egg-info/top_level.txt +0 -0
- {scrapling-0.2.92 → scrapling-0.2.93}/tests/__init__.py +0 -0
- {scrapling-0.2.92 → scrapling-0.2.93}/tests/fetchers/__init__.py +0 -0
- {scrapling-0.2.92 → scrapling-0.2.93}/tests/fetchers/async/__init__.py +0 -0
- {scrapling-0.2.92 → scrapling-0.2.93}/tests/fetchers/async/test_camoufox.py +0 -0
- {scrapling-0.2.92 → scrapling-0.2.93}/tests/fetchers/async/test_httpx.py +0 -0
- {scrapling-0.2.92 → scrapling-0.2.93}/tests/fetchers/sync/__init__.py +0 -0
- {scrapling-0.2.92 → scrapling-0.2.93}/tests/fetchers/sync/test_camoufox.py +0 -0
- {scrapling-0.2.92 → scrapling-0.2.93}/tests/fetchers/sync/test_httpx.py +0 -0
- {scrapling-0.2.92 → scrapling-0.2.93}/tests/fetchers/test_utils.py +0 -0
- {scrapling-0.2.92 → scrapling-0.2.93}/tests/parser/__init__.py +0 -0
- {scrapling-0.2.92 → scrapling-0.2.93}/tests/parser/test_automatch.py +0 -0
- {scrapling-0.2.92 → scrapling-0.2.93}/tests/parser/test_general.py +0 -0
{scrapling-0.2.92/scrapling.egg-info → scrapling-0.2.93}/PKG-INFO

````diff
@@ -1,6 +1,6 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.2
 Name: scrapling
-Version: 0.2.92
+Version: 0.2.93
 Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
 Home-page: https://github.com/D4Vinci/Scrapling
 Author: Karim Shoair
@@ -10,7 +10,7 @@ Project-URL: Documentation, https://github.com/D4Vinci/Scrapling/tree/main/docs
 Project-URL: Source, https://github.com/D4Vinci/Scrapling
 Project-URL: Tracker, https://github.com/D4Vinci/Scrapling/issues
 Classifier: Operating System :: OS Independent
-Classifier: Development Status :: 4 - Beta
+Classifier: Development Status :: 4 - Beta
 Classifier: Intended Audience :: Developers
 Classifier: License :: OSI Approved :: BSD License
 Classifier: Natural Language :: English
@@ -31,8 +31,7 @@ Classifier: Typing :: Typed
 Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: 
-Requires-Dist: lxml>=4.5
+Requires-Dist: lxml>=5.0
 Requires-Dist: cssselect>=1.2
 Requires-Dist: click
 Requires-Dist: w3lib
@@ -41,7 +40,18 @@ Requires-Dist: tldextract
 Requires-Dist: httpx[brotli,socks,zstd]
 Requires-Dist: playwright>=1.49.1
 Requires-Dist: rebrowser-playwright>=1.49.1
-Requires-Dist: camoufox[geoip]>=0.4.
+Requires-Dist: camoufox[geoip]>=0.4.10
+Dynamic: author
+Dynamic: author-email
+Dynamic: classifier
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: home-page
+Dynamic: license
+Dynamic: project-url
+Dynamic: requires-dist
+Dynamic: requires-python
+Dynamic: summary
 
 # 🕷️ Scrapling: Undetectable, Lightning-Fast, and Adaptive Web Scraping for Python
 [](https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml) [](https://badge.fury.io/py/Scrapling) [](https://pypi.org/project/scrapling/) [](https://pepy.tech/project/scrapling)
@@ -78,6 +88,21 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha
 [](https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling)
 ---
 
+[Scrapeless](https://www.scrapeless.com/?utm_source=github&utm_medium=ads&utm_campaign=scraping&utm_term=D4Vinci) is your all-in-one web scraping toolkit, starting at just $0.60 per 1k URLs!
+
+- 🚀 Scraping API: Effortless and highly customizable data extraction with a single API call, providing structured data from any website.
+- ⚡ Scraping Browser: AI-powered and LLM-driven, it simulates human-like behavior with genuine fingerprints and headless browser support, ensuring seamless, block-free scraping.
+- 🔒 Web Unlocker: Bypass CAPTCHAs, IP blocks, and dynamic content in real time, ensuring uninterrupted access.
+- 🌐 Proxies: Use high-quality, rotating proxies to scrape top platforms like Amazon, Shopee, and more, with global coverage in 195+ countries.
+- 💼 Enterprise-Grade: Custom solutions for large-scale and complex data needs.
+- 🎁 Free Trial: Try before you buy—experience our service firsthand.
+- 💬 Pay-Per-Use: Flexible, cost-effective pricing with no long-term commitments.
+- 🔧 Easy Integration: Seamlessly integrate with your existing tools and workflows for hassle-free automation.
+
+
+[](https://www.scrapeless.com/?utm_source=github&utm_medium=ads&utm_campaign=scraping&utm_term=D4Vinci)
+---
+
 ## Table of content
 * [Key Features](#key-features)
 * [Fetch websites as you prefer](#fetch-websites-as-you-prefer-with-async-support)
@@ -122,27 +147,27 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha
 ## Key Features
 
 ### Fetch websites as you prefer with async support
-- **HTTP
-- **
-- **
+- **HTTP Requests**: Fast and stealthy HTTP requests with the `Fetcher` class.
+- **Dynamic Loading & Automation**: Fetch dynamic websites with the `PlayWrightFetcher` class through your real browser, Scrapling's stealth mode, Playwright's Chrome browser, or [NSTbrowser](https://app.nstbrowser.io/r/1vO5e5)'s browserless!
+- **Anti-bot Protections Bypass**: Easily bypass protections with `StealthyFetcher` and `PlayWrightFetcher` classes.
 
 ### Adaptive Scraping
-- 🔄 **Smart Element Tracking**:
-- 🎯 **Flexible
-- 🔍 **Find Similar Elements**: Automatically locate elements similar to the element you
+- 🔄 **Smart Element Tracking**: Relocate elements after website changes, using an intelligent similarity system and integrated storage.
+- 🎯 **Flexible Selection**: CSS selectors, XPath selectors, filters-based search, text search, regex search and more.
+- 🔍 **Find Similar Elements**: Automatically locate elements similar to the element you found!
 - 🧠 **Smart Content Scraping**: Extract data from multiple websites without specific selectors using Scrapling powerful features.
 
-### Performance
-- 🚀 **Lightning Fast**: Built from the ground up with performance in mind, outperforming most popular Python scraping libraries
+### High Performance
+- 🚀 **Lightning Fast**: Built from the ground up with performance in mind, outperforming most popular Python scraping libraries.
 - 🔋 **Memory Efficient**: Optimized data structures for minimal memory footprint.
-- ⚡ **Fast JSON serialization**: 10x faster
+- ⚡ **Fast JSON serialization**: 10x faster than standard library.
 
-### 
-- 🛠️ **Powerful Navigation API**:
-- 🧬 **Rich Text Processing**: All strings have built-in
-- 📝 **
-- 🔌 **API Similar to Scrapy/BeautifulSoup
-- 📘 **Type hints
+### Developer Friendly
+- 🛠️ **Powerful Navigation API**: Easy DOM traversal in all directions.
+- 🧬 **Rich Text Processing**: All strings have built-in regex, cleaning methods, and more. All elements' attributes are optimized dictionaries that takes less memory than standard dictionaries with added methods.
+- 📝 **Auto Selectors Generation**: Generate robust short and full CSS/XPath selectors for any element.
+- 🔌 **Familiar API**: Similar to Scrapy/BeautifulSoup and the same pseudo-elements used in Scrapy.
+- 📘 **Type hints**: Complete type/doc-strings coverage for future-proofing and best autocompletion support.
 
 ## Getting Started
 
@@ -151,21 +176,22 @@ from scrapling import Fetcher
 
 fetcher = Fetcher(auto_match=False)
 
-# 
+# Do http GET request to a web page and create an Adaptor instance
 page = fetcher.get('https://quotes.toscrape.com/', stealthy_headers=True)
-# Get all
+# Get all text content from all HTML tags in the page except `script` and `style` tags
 page.get_all_text(ignore_tags=('script', 'style'))
 
-# Get all quotes, any of these methods will return a list of strings (TextHandlers)
+# Get all quotes elements, any of these methods will return a list of strings directly (TextHandlers)
 quotes = page.css('.quote .text::text') # CSS selector
 quotes = page.xpath('//span[@class="text"]/text()') # XPath
 quotes = page.css('.quote').css('.text::text') # Chained selectors
 quotes = [element.text for element in page.css('.quote .text')] # Slower than bulk query above
 
 # Get the first quote element
-quote = page.css_first('.quote') # 
+quote = page.css_first('.quote') # same as page.css('.quote').first or page.css('.quote')[0]
 
 # Tired of selectors? Use find_all/find
+# Get all 'div' HTML tags that one of its 'class' values is 'quote'
 quotes = page.find_all('div', {'class': 'quote'})
 # Same as
 quotes = page.find_all('div', class_='quote')
@@ -173,10 +199,10 @@ quotes = page.find_all(['div'], class_='quote')
 quotes = page.find_all(class_='quote') # and so on...
 
 # Working with elements
-quote.html_content # Inner HTML
-quote.prettify() # Prettified version of Inner HTML
-quote.attrib # 
-quote.path # DOM path to element (List)
+quote.html_content # Get Inner HTML of this element
+quote.prettify() # Prettified version of Inner HTML above
+quote.attrib # Get that element's attributes
+quote.path # DOM path to element (List of all ancestors from <html> tag till the element itself)
 ```
 To keep it simple, all methods can be chained on top of each other!
 
@@ -292,7 +318,7 @@ True
 | humanize | Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window. | ✔️ |
 | allow_webgl | Enabled by default. Disabling it WebGL not recommended as many WAFs now checks if WebGL is enabled. | ✔️ |
 | geoip | Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, & spoof the WebRTC IP address. It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region. | ✔️ |
-| disable_ads | 
+| disable_ads | Disabled by default, this installs `uBlock Origin` addon on the browser if enabled. | ✔️ |
 | network_idle | Wait for the page until there are no network connections for at least 500 ms. | ✔️ |
 | timeout | The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000. | ✔️ |
 | wait_selector | Wait for a specific css selector to be in a specific state. | ✔️ |
@@ -574,7 +600,7 @@ Inspired by BeautifulSoup's `find_all` function you can find elements by using `
 * Any string passed is considered a tag name
 * Any iterable passed like List/Tuple/Set is considered an iterable of tag names.
 * Any dictionary is considered a mapping of HTML element(s) attribute names and attribute values.
-* Any regex patterns passed are used as filters
+* Any regex patterns passed are used as filters to elements by their text content
 * Any functions passed are used as filters
 * Any keyword argument passed is considered as an HTML element attribute with its value.
 
@@ -583,7 +609,7 @@ So the way it works is after collecting all passed arguments and keywords, each
 
 1. All elements with the passed tag name(s).
 2. All elements that match all passed attribute(s).
-3. All elements that match all passed regex patterns.
+3. All elements that its text content match all passed regex patterns.
 4. All elements that fulfill all passed function(s).
 
 Note: The filtering process always starts from the first filter it finds in the filtering order above so if no tag name(s) are passed but attributes are passed, the process starts from that layer and so on. **But the order in which you pass the arguments doesn't matter.**
````
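To make the `find_all` filtering order above concrete, here is a small usage sketch based only on the rules just described. The URL and patterns are illustrative, and the lambda assumes filter functions receive the candidate element, per the function-filter rule:

```python
import re

from scrapling import Fetcher

fetcher = Fetcher(auto_match=False)
page = fetcher.get('https://quotes.toscrape.com/', stealthy_headers=True)

# Layer 3 of the filtering order: a compiled regex filters elements
# by their text content
love_quotes = page.find_all('span', re.compile(r'\blove\b'))

# Layers combine, and the order you pass arguments in doesn't matter:
# tag name (layer 1) + attribute (layer 2) + function filter (layer 4)
short_quotes = page.find_all('span', lambda element: len(element.text) < 100, class_='text')
```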
{scrapling-0.2.92 → scrapling-0.2.93}/README.md

````diff
@@ -33,6 +33,21 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha
 [](https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling)
 ---
 
+[Scrapeless](https://www.scrapeless.com/?utm_source=github&utm_medium=ads&utm_campaign=scraping&utm_term=D4Vinci) is your all-in-one web scraping toolkit, starting at just $0.60 per 1k URLs!
+
+- 🚀 Scraping API: Effortless and highly customizable data extraction with a single API call, providing structured data from any website.
+- ⚡ Scraping Browser: AI-powered and LLM-driven, it simulates human-like behavior with genuine fingerprints and headless browser support, ensuring seamless, block-free scraping.
+- 🔒 Web Unlocker: Bypass CAPTCHAs, IP blocks, and dynamic content in real time, ensuring uninterrupted access.
+- 🌐 Proxies: Use high-quality, rotating proxies to scrape top platforms like Amazon, Shopee, and more, with global coverage in 195+ countries.
+- 💼 Enterprise-Grade: Custom solutions for large-scale and complex data needs.
+- 🎁 Free Trial: Try before you buy—experience our service firsthand.
+- 💬 Pay-Per-Use: Flexible, cost-effective pricing with no long-term commitments.
+- 🔧 Easy Integration: Seamlessly integrate with your existing tools and workflows for hassle-free automation.
+
+
+[](https://www.scrapeless.com/?utm_source=github&utm_medium=ads&utm_campaign=scraping&utm_term=D4Vinci)
+---
+
 ## Table of content
 * [Key Features](#key-features)
 * [Fetch websites as you prefer](#fetch-websites-as-you-prefer-with-async-support)
@@ -77,27 +92,27 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha
 ## Key Features
 
 ### Fetch websites as you prefer with async support
-- **HTTP
-- **
-- **
+- **HTTP Requests**: Fast and stealthy HTTP requests with the `Fetcher` class.
+- **Dynamic Loading & Automation**: Fetch dynamic websites with the `PlayWrightFetcher` class through your real browser, Scrapling's stealth mode, Playwright's Chrome browser, or [NSTbrowser](https://app.nstbrowser.io/r/1vO5e5)'s browserless!
+- **Anti-bot Protections Bypass**: Easily bypass protections with `StealthyFetcher` and `PlayWrightFetcher` classes.
 
 ### Adaptive Scraping
-- 🔄 **Smart Element Tracking**:
-- 🎯 **Flexible
-- 🔍 **Find Similar Elements**: Automatically locate elements similar to the element you
+- 🔄 **Smart Element Tracking**: Relocate elements after website changes, using an intelligent similarity system and integrated storage.
+- 🎯 **Flexible Selection**: CSS selectors, XPath selectors, filters-based search, text search, regex search and more.
+- 🔍 **Find Similar Elements**: Automatically locate elements similar to the element you found!
 - 🧠 **Smart Content Scraping**: Extract data from multiple websites without specific selectors using Scrapling powerful features.
 
-### Performance
-- 🚀 **Lightning Fast**: Built from the ground up with performance in mind, outperforming most popular Python scraping libraries
+### High Performance
+- 🚀 **Lightning Fast**: Built from the ground up with performance in mind, outperforming most popular Python scraping libraries.
 - 🔋 **Memory Efficient**: Optimized data structures for minimal memory footprint.
-- ⚡ **Fast JSON serialization**: 10x faster
+- ⚡ **Fast JSON serialization**: 10x faster than standard library.
 
-### 
-- 🛠️ **Powerful Navigation API**:
-- 🧬 **Rich Text Processing**: All strings have built-in
-- 📝 **
-- 🔌 **API Similar to Scrapy/BeautifulSoup
-- 📘 **Type hints
+### Developer Friendly
+- 🛠️ **Powerful Navigation API**: Easy DOM traversal in all directions.
+- 🧬 **Rich Text Processing**: All strings have built-in regex, cleaning methods, and more. All elements' attributes are optimized dictionaries that takes less memory than standard dictionaries with added methods.
+- 📝 **Auto Selectors Generation**: Generate robust short and full CSS/XPath selectors for any element.
+- 🔌 **Familiar API**: Similar to Scrapy/BeautifulSoup and the same pseudo-elements used in Scrapy.
+- 📘 **Type hints**: Complete type/doc-strings coverage for future-proofing and best autocompletion support.
 
 ## Getting Started
 
@@ -106,21 +121,22 @@ from scrapling import Fetcher
 
 fetcher = Fetcher(auto_match=False)
 
-# 
+# Do http GET request to a web page and create an Adaptor instance
 page = fetcher.get('https://quotes.toscrape.com/', stealthy_headers=True)
-# Get all
+# Get all text content from all HTML tags in the page except `script` and `style` tags
 page.get_all_text(ignore_tags=('script', 'style'))
 
-# Get all quotes, any of these methods will return a list of strings (TextHandlers)
+# Get all quotes elements, any of these methods will return a list of strings directly (TextHandlers)
 quotes = page.css('.quote .text::text') # CSS selector
 quotes = page.xpath('//span[@class="text"]/text()') # XPath
 quotes = page.css('.quote').css('.text::text') # Chained selectors
 quotes = [element.text for element in page.css('.quote .text')] # Slower than bulk query above
 
 # Get the first quote element
-quote = page.css_first('.quote') # 
+quote = page.css_first('.quote') # same as page.css('.quote').first or page.css('.quote')[0]
 
 # Tired of selectors? Use find_all/find
+# Get all 'div' HTML tags that one of its 'class' values is 'quote'
 quotes = page.find_all('div', {'class': 'quote'})
 # Same as
 quotes = page.find_all('div', class_='quote')
@@ -128,10 +144,10 @@ quotes = page.find_all(['div'], class_='quote')
 quotes = page.find_all(class_='quote') # and so on...
 
 # Working with elements
-quote.html_content # Inner HTML
-quote.prettify() # Prettified version of Inner HTML
-quote.attrib # 
-quote.path # DOM path to element (List)
+quote.html_content # Get Inner HTML of this element
+quote.prettify() # Prettified version of Inner HTML above
+quote.attrib # Get that element's attributes
+quote.path # DOM path to element (List of all ancestors from <html> tag till the element itself)
 ```
 To keep it simple, all methods can be chained on top of each other!
 
@@ -247,7 +263,7 @@ True
 | humanize | Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window. | ✔️ |
 | allow_webgl | Enabled by default. Disabling it WebGL not recommended as many WAFs now checks if WebGL is enabled. | ✔️ |
 | geoip | Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, & spoof the WebRTC IP address. It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region. | ✔️ |
-| disable_ads | 
+| disable_ads | Disabled by default, this installs `uBlock Origin` addon on the browser if enabled. | ✔️ |
 | network_idle | Wait for the page until there are no network connections for at least 500 ms. | ✔️ |
 | timeout | The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000. | ✔️ |
 | wait_selector | Wait for a specific css selector to be in a specific state. | ✔️ |
@@ -529,7 +545,7 @@ Inspired by BeautifulSoup's `find_all` function you can find elements by using `
 * Any string passed is considered a tag name
 * Any iterable passed like List/Tuple/Set is considered an iterable of tag names.
 * Any dictionary is considered a mapping of HTML element(s) attribute names and attribute values.
-* Any regex patterns passed are used as filters
+* Any regex patterns passed are used as filters to elements by their text content
 * Any functions passed are used as filters
 * Any keyword argument passed is considered as an HTML element attribute with its value.
 
@@ -538,7 +554,7 @@ So the way it works is after collecting all passed arguments and keywords, each
 
 1. All elements with the passed tag name(s).
 2. All elements that match all passed attribute(s).
-3. All elements that match all passed regex patterns.
+3. All elements that its text content match all passed regex patterns.
 4. All elements that fulfill all passed function(s).
 
 Note: The filtering process always starts from the first filter it finds in the filtering order above so if no tag name(s) are passed but attributes are passed, the process starts from that layer and so on. **But the order in which you pass the arguments doesn't matter.**
````
{scrapling-0.2.92 → scrapling-0.2.93}/scrapling/__init__.py

```diff
@@ -5,7 +5,7 @@ from scrapling.fetchers import (AsyncFetcher, CustomFetcher, Fetcher,
 from scrapling.parser import Adaptor, Adaptors
 
 __author__ = "Karim Shoair (karim.shoair@pm.me)"
-__version__ = "0.2.92"
+__version__ = "0.2.93"
 __copyright__ = "Copyright (c) 2024 Karim Shoair"
 
 
```
{scrapling-0.2.92 → scrapling-0.2.93}/scrapling/core/_types.py

```diff
@@ -3,7 +3,8 @@ Type definitions for type checking purposes.
 """
 
 from typing import (TYPE_CHECKING, Any, Callable, Dict, Generator, Iterable,
-                    List, Literal, Optional, Pattern, Tuple, Type,
+                    List, Literal, Optional, Pattern, Tuple, Type, TypeVar,
+                    Union)
 
 SelectorWaitStates = Literal["attached", "detached", "hidden", "visible"]
 
```
{scrapling-0.2.92 → scrapling-0.2.93}/scrapling/core/custom_types.py

```diff
@@ -1,13 +1,18 @@
 import re
+import typing
 from collections.abc import Mapping
 from types import MappingProxyType
 
 from orjson import dumps, loads
 from w3lib.html import replace_entities as _replace_entities
 
-from scrapling.core._types import Dict, List, 
+from scrapling.core._types import (Dict, Iterable, List, Literal, Optional,
+                                   Pattern, SupportsIndex, TypeVar, Union)
 from scrapling.core.utils import _is_iterable, flatten
 
+# Define type variable for AttributeHandler value type
+_TextHandlerType = TypeVar('_TextHandlerType', bound='TextHandler')
+
 
 class TextHandler(str):
     """Extends standard Python string by adding more functionality"""
@@ -18,72 +23,89 @@ class TextHandler(str):
             return super().__new__(cls, string)
         return super().__new__(cls, '')
 
-
-
-
-    def strip(self, chars=None):
+    @typing.overload
+    def __getitem__(self, key: SupportsIndex) -> 'TextHandler':
+        pass
+
+    @typing.overload
+    def __getitem__(self, key: slice) -> "TextHandlers":
+        pass
+
+    def __getitem__(self, key: Union[SupportsIndex, slice]) -> Union["TextHandler", "TextHandlers"]:
+        lst = super().__getitem__(key)
+        if isinstance(key, slice):
+            lst = [TextHandler(s) for s in lst]
+            return TextHandlers(typing.cast(List[_TextHandlerType], lst))
+        return typing.cast(_TextHandlerType, TextHandler(lst))
+
+    def split(self, sep: str = None, maxsplit: SupportsIndex = -1) -> 'TextHandlers':
+        return TextHandlers(
+            typing.cast(List[_TextHandlerType], [TextHandler(s) for s in super().split(sep, maxsplit)])
+        )
+
+    def strip(self, chars: str = None) -> Union[str, 'TextHandler']:
         return TextHandler(super().strip(chars))
 
-    def lstrip(self, chars=None):
+    def lstrip(self, chars: str = None) -> Union[str, 'TextHandler']:
         return TextHandler(super().lstrip(chars))
 
-    def rstrip(self, chars=None):
+    def rstrip(self, chars: str = None) -> Union[str, 'TextHandler']:
         return TextHandler(super().rstrip(chars))
 
-    def capitalize(self):
+    def capitalize(self) -> Union[str, 'TextHandler']:
         return TextHandler(super().capitalize())
 
-    def casefold(self):
+    def casefold(self) -> Union[str, 'TextHandler']:
         return TextHandler(super().casefold())
 
-    def center(self, width, fillchar=' '):
+    def center(self, width: SupportsIndex, fillchar: str = ' ') -> Union[str, 'TextHandler']:
         return TextHandler(super().center(width, fillchar))
 
-    def expandtabs(self, tabsize=8):
+    def expandtabs(self, tabsize: SupportsIndex = 8) -> Union[str, 'TextHandler']:
         return TextHandler(super().expandtabs(tabsize))
 
-    def format(self, *args, **kwargs):
+    def format(self, *args: str, **kwargs: str) -> Union[str, 'TextHandler']:
         return TextHandler(super().format(*args, **kwargs))
 
-    def format_map(self, mapping):
+    def format_map(self, mapping) -> Union[str, 'TextHandler']:
         return TextHandler(super().format_map(mapping))
 
-    def join(self, iterable):
+    def join(self, iterable: Iterable[str]) -> Union[str, 'TextHandler']:
         return TextHandler(super().join(iterable))
 
-    def ljust(self, width, fillchar=' '):
+    def ljust(self, width: SupportsIndex, fillchar: str = ' ') -> Union[str, 'TextHandler']:
         return TextHandler(super().ljust(width, fillchar))
 
-    def rjust(self, width, fillchar=' '):
+    def rjust(self, width: SupportsIndex, fillchar: str = ' ') -> Union[str, 'TextHandler']:
         return TextHandler(super().rjust(width, fillchar))
 
-    def swapcase(self):
+    def swapcase(self) -> Union[str, 'TextHandler']:
         return TextHandler(super().swapcase())
 
-    def title(self):
+    def title(self) -> Union[str, 'TextHandler']:
         return TextHandler(super().title())
 
-    def translate(self, table):
+    def translate(self, table) -> Union[str, 'TextHandler']:
         return TextHandler(super().translate(table))
 
-    def zfill(self, width):
+    def zfill(self, width: SupportsIndex) -> Union[str, 'TextHandler']:
         return TextHandler(super().zfill(width))
 
-    def replace(self, old, new, count
+    def replace(self, old: str, new: str, count: SupportsIndex = -1) -> Union[str, 'TextHandler']:
         return TextHandler(super().replace(old, new, count))
 
-    def upper(self):
+    def upper(self) -> Union[str, 'TextHandler']:
         return TextHandler(super().upper())
 
-    def lower(self):
+    def lower(self) -> Union[str, 'TextHandler']:
         return TextHandler(super().lower())
     ##############
 
-    def sort(self, reverse: bool = False) -> str:
+    def sort(self, reverse: bool = False) -> Union[str, 'TextHandler']:
         """Return a sorted version of the string"""
         return self.__class__("".join(sorted(self, reverse=reverse)))
 
-    def clean(self) -> str:
+    def clean(self) -> Union[str, 'TextHandler']:
         """Return a new version of the string after removing all white spaces and consecutive spaces"""
         data = re.sub(r'[\t|\r|\n]', '', self)
         data = re.sub(' +', ' ', data)
@@ -105,10 +127,32 @@ class TextHandler(str):
         # Check this out: https://github.com/ijl/orjson/issues/445
         return loads(str(self))
 
+    @typing.overload
+    def re(
+        self,
+        regex: Union[str, Pattern[str]],
+        check_match: Literal[True],
+        replace_entities: bool = True,
+        clean_match: bool = False,
+        case_sensitive: bool = False,
+    ) -> bool:
+        ...
+
+    @typing.overload
+    def re(
+        self,
+        regex: Union[str, Pattern[str]],
+        replace_entities: bool = True,
+        clean_match: bool = False,
+        case_sensitive: bool = False,
+        check_match: Literal[False] = False,
+    ) -> "TextHandlers[TextHandler]":
+        ...
+
     def re(
         self, regex: Union[str, Pattern[str]], replace_entities: bool = True, clean_match: bool = False,
         case_sensitive: bool = False, check_match: bool = False
-    ) -> Union[
+    ) -> Union["TextHandlers[TextHandler]", bool]:
         """Apply the given regex to the current text and return a list of strings with the matches.
 
         :param regex: Can be either a compiled regular expression or a string.
@@ -133,12 +177,12 @@ class TextHandler(str):
         results = flatten(results)
 
         if not replace_entities:
-            return [TextHandler(string) for string in results]
+            return TextHandlers(typing.cast(List[_TextHandlerType], [TextHandler(string) for string in results]))
 
-        return [TextHandler(_replace_entities(s)) for s in results]
+        return TextHandlers(typing.cast(List[_TextHandlerType], [TextHandler(_replace_entities(s)) for s in results]))
 
     def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True,
-                 clean_match: bool = False, case_sensitive: bool = False) -> 
+                 clean_match: bool = False, case_sensitive: bool = False) -> "TextHandler":
         """Apply the given regex to text and return the first match if found, otherwise return the default value.
 
         :param regex: Can be either a compiled regular expression or a string.
@@ -158,15 +202,23 @@ class TextHandlers(List[TextHandler]):
     """
     __slots__ = ()
 
-    def __getitem__(self, pos):
+    @typing.overload
+    def __getitem__(self, pos: SupportsIndex) -> TextHandler:
+        pass
+
+    @typing.overload
+    def __getitem__(self, pos: slice) -> "TextHandlers":
+        pass
+
+    def __getitem__(self, pos: Union[SupportsIndex, slice]) -> Union[TextHandler, "TextHandlers"]:
         lst = super().__getitem__(pos)
         if isinstance(pos, slice):
-
-
-
+            lst = [TextHandler(s) for s in lst]
+            return TextHandlers(typing.cast(List[_TextHandlerType], lst))
+        return typing.cast(_TextHandlerType, TextHandler(lst))
 
     def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True, clean_match: bool = False,
-           case_sensitive: bool = False) -> '
+           case_sensitive: bool = False) -> 'TextHandlers[TextHandler]':
         """Call the ``.re()`` method for each element in this list and return
         their results flattened as TextHandlers.
 
@@ -178,10 +230,10 @@ class TextHandlers(List[TextHandler]):
         results = [
             n.re(regex, replace_entities, clean_match, case_sensitive) for n in self
         ]
-        return flatten(results)
+        return TextHandlers(flatten(results))
 
     def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True,
-                 clean_match: bool = False, case_sensitive: bool = False) -> 
+                 clean_match: bool = False, case_sensitive: bool = False) -> TextHandler:
         """Call the ``.re_first()`` method for each element in this list and return
         the first result or the default value otherwise.
 
@@ -210,7 +262,7 @@ class TextHandlers(List[TextHandler]):
     get_all = extract
 
 
-class AttributesHandler(Mapping):
+class AttributesHandler(Mapping[str, _TextHandlerType]):
     """A read-only mapping to use instead of the standard dictionary for the speed boost but at the same time I use it to add more functionalities.
     If standard dictionary is needed, just convert this class to dictionary with `dict` function
     """
@@ -231,7 +283,7 @@ class AttributesHandler(Mapping):
         # Fastest read-only mapping type
         self._data = MappingProxyType(mapping)
 
-    def get(self, key, default=None):
+    def get(self, key: str, default: Optional[str] = None) -> Union[_TextHandlerType, None]:
         """Acts like standard dictionary `.get()` method"""
         return self._data.get(key, default)
 
@@ -253,7 +305,7 @@ class AttributesHandler(Mapping):
         """Convert current attributes to JSON string if the attributes are JSON serializable otherwise throws error"""
         return dumps(dict(self._data))
 
-    def __getitem__(self, key):
+    def __getitem__(self, key: str) -> _TextHandlerType:
         return self._data[key]
 
     def __iter__(self):
```
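The net effect of these `custom_types.py` changes is that the wrapper types now survive chaining: `.re()`, `.split()`, and indexing hand back `TextHandler`/`TextHandlers` instead of degrading to plain `str`/`list`. A minimal sketch of the behavior the new overloads describe (results in comments are illustrative):

```python
from scrapling.core.custom_types import TextHandler

text = TextHandler('Prices: $15.99 and $20.00')

# .re() now returns TextHandlers rather than a plain list,
# so the result can be chained with .re_first(), .re(), etc.
prices = text.re(r'\$\d+\.\d+')    # e.g. TextHandlers of '$15.99', '$20.00'
cents = prices.re_first(r'\.\d+')  # a TextHandler, e.g. '.99'

# With check_match=True the first overload applies and a bool comes back
has_price = text.re(r'\$\d+', check_match=True)  # True

# Wrapped str methods keep the type too
words = text.split()  # TextHandlers
loud = text.upper()   # TextHandler
```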
{scrapling-0.2.92 → scrapling-0.2.93}/scrapling/core/translator.py

```diff
@@ -139,6 +139,6 @@ class TranslatorMixin:
 
 
 class HTMLTranslator(TranslatorMixin, OriginalHTMLTranslator):
-    @lru_cache(maxsize=
+    @lru_cache(maxsize=2048)
     def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str:
         return super().css_to_xpath(css, prefix)
```
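The only change here is a larger cache for the CSS-to-XPath translation. Translating a selector is a pure function of the selector string, which is what makes memoization safe; a standalone sketch of the same idea using `cssselect` directly:

```python
from functools import lru_cache

from cssselect import HTMLTranslator

_translator = HTMLTranslator()


@lru_cache(maxsize=2048)
def css_to_xpath(css: str) -> str:
    # Deterministic translation: repeated selectors cost one cache lookup
    return _translator.css_to_xpath(css)


print(css_to_xpath('.quote .text'))  # the XPath 1.0 equivalent of the selector
```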
scrapling-0.2.93/scrapling/defaults.py (new file)

```diff
@@ -0,0 +1,10 @@
+from .fetchers import AsyncFetcher as _AsyncFetcher
+from .fetchers import Fetcher as _Fetcher
+from .fetchers import PlayWrightFetcher as _PlayWrightFetcher
+from .fetchers import StealthyFetcher as _StealthyFetcher
+
+# If you are going to use Fetchers with the default settings, import them from this file instead for a cleaner looking code
+Fetcher = _Fetcher()
+AsyncFetcher = _AsyncFetcher()
+StealthyFetcher = _StealthyFetcher()
+PlayWrightFetcher = _PlayWrightFetcher()
```