scrapling 0.2.92__tar.gz → 0.2.94__tar.gz
- {scrapling-0.2.92/scrapling.egg-info → scrapling-0.2.94}/PKG-INFO +59 -33
- {scrapling-0.2.92 → scrapling-0.2.94}/README.md +43 -27
- {scrapling-0.2.92 → scrapling-0.2.94}/scrapling/__init__.py +1 -1
- {scrapling-0.2.92 → scrapling-0.2.94}/scrapling/core/_types.py +2 -1
- {scrapling-0.2.92 → scrapling-0.2.94}/scrapling/core/custom_types.py +97 -45
- {scrapling-0.2.92 → scrapling-0.2.94}/scrapling/core/translator.py +1 -1
- scrapling-0.2.94/scrapling/defaults.py +10 -0
- {scrapling-0.2.92 → scrapling-0.2.94}/scrapling/engines/camo.py +46 -2
- {scrapling-0.2.92 → scrapling-0.2.94}/scrapling/engines/pw.py +43 -1
- {scrapling-0.2.92 → scrapling-0.2.94}/scrapling/engines/static.py +1 -0
- {scrapling-0.2.92 → scrapling-0.2.94}/scrapling/engines/toolbelt/custom.py +2 -1
- {scrapling-0.2.92 → scrapling-0.2.94}/scrapling/fetchers.py +5 -5
- {scrapling-0.2.92 → scrapling-0.2.94}/scrapling/parser.py +158 -194
- {scrapling-0.2.92 → scrapling-0.2.94/scrapling.egg-info}/PKG-INFO +59 -33
- {scrapling-0.2.92 → scrapling-0.2.94}/scrapling.egg-info/requires.txt +2 -3
- {scrapling-0.2.92 → scrapling-0.2.94}/setup.cfg +1 -1
- {scrapling-0.2.92 → scrapling-0.2.94}/setup.py +4 -5
- {scrapling-0.2.92 → scrapling-0.2.94}/tests/fetchers/async/test_playwright.py +1 -1
- {scrapling-0.2.92 → scrapling-0.2.94}/tests/fetchers/sync/test_playwright.py +1 -1
- scrapling-0.2.92/scrapling/defaults.py +0 -7
- {scrapling-0.2.92 → scrapling-0.2.94}/LICENSE +0 -0
- {scrapling-0.2.92 → scrapling-0.2.94}/MANIFEST.in +0 -0
- {scrapling-0.2.92 → scrapling-0.2.94}/scrapling/cli.py +0 -0
- {scrapling-0.2.92 → scrapling-0.2.94}/scrapling/core/__init__.py +0 -0
- {scrapling-0.2.92 → scrapling-0.2.94}/scrapling/core/mixins.py +0 -0
- {scrapling-0.2.92 → scrapling-0.2.94}/scrapling/core/storage_adaptors.py +0 -0
- {scrapling-0.2.92 → scrapling-0.2.94}/scrapling/core/utils.py +0 -0
- {scrapling-0.2.92 → scrapling-0.2.94}/scrapling/engines/__init__.py +0 -0
- {scrapling-0.2.92 → scrapling-0.2.94}/scrapling/engines/constants.py +0 -0
- {scrapling-0.2.92 → scrapling-0.2.94}/scrapling/engines/toolbelt/__init__.py +0 -0
- {scrapling-0.2.92 → scrapling-0.2.94}/scrapling/engines/toolbelt/bypasses/navigator_plugins.js +0 -0
- {scrapling-0.2.92 → scrapling-0.2.94}/scrapling/engines/toolbelt/bypasses/notification_permission.js +0 -0
- {scrapling-0.2.92 → scrapling-0.2.94}/scrapling/engines/toolbelt/bypasses/pdf_viewer.js +0 -0
- {scrapling-0.2.92 → scrapling-0.2.94}/scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +0 -0
- {scrapling-0.2.92 → scrapling-0.2.94}/scrapling/engines/toolbelt/bypasses/screen_props.js +0 -0
- {scrapling-0.2.92 → scrapling-0.2.94}/scrapling/engines/toolbelt/bypasses/webdriver_fully.js +0 -0
- {scrapling-0.2.92 → scrapling-0.2.94}/scrapling/engines/toolbelt/bypasses/window_chrome.js +0 -0
- {scrapling-0.2.92 → scrapling-0.2.94}/scrapling/engines/toolbelt/fingerprints.py +0 -0
- {scrapling-0.2.92 → scrapling-0.2.94}/scrapling/engines/toolbelt/navigation.py +0 -0
- {scrapling-0.2.92 → scrapling-0.2.94}/scrapling/py.typed +0 -0
- {scrapling-0.2.92 → scrapling-0.2.94}/scrapling.egg-info/SOURCES.txt +0 -0
- {scrapling-0.2.92 → scrapling-0.2.94}/scrapling.egg-info/dependency_links.txt +0 -0
- {scrapling-0.2.92 → scrapling-0.2.94}/scrapling.egg-info/entry_points.txt +0 -0
- {scrapling-0.2.92 → scrapling-0.2.94}/scrapling.egg-info/not-zip-safe +0 -0
- {scrapling-0.2.92 → scrapling-0.2.94}/scrapling.egg-info/top_level.txt +0 -0
- {scrapling-0.2.92 → scrapling-0.2.94}/tests/__init__.py +0 -0
- {scrapling-0.2.92 → scrapling-0.2.94}/tests/fetchers/__init__.py +0 -0
- {scrapling-0.2.92 → scrapling-0.2.94}/tests/fetchers/async/__init__.py +0 -0
- {scrapling-0.2.92 → scrapling-0.2.94}/tests/fetchers/async/test_camoufox.py +0 -0
- {scrapling-0.2.92 → scrapling-0.2.94}/tests/fetchers/async/test_httpx.py +0 -0
- {scrapling-0.2.92 → scrapling-0.2.94}/tests/fetchers/sync/__init__.py +0 -0
- {scrapling-0.2.92 → scrapling-0.2.94}/tests/fetchers/sync/test_camoufox.py +0 -0
- {scrapling-0.2.92 → scrapling-0.2.94}/tests/fetchers/sync/test_httpx.py +0 -0
- {scrapling-0.2.92 → scrapling-0.2.94}/tests/fetchers/test_utils.py +0 -0
- {scrapling-0.2.92 → scrapling-0.2.94}/tests/parser/__init__.py +0 -0
- {scrapling-0.2.92 → scrapling-0.2.94}/tests/parser/test_automatch.py +0 -0
- {scrapling-0.2.92 → scrapling-0.2.94}/tests/parser/test_general.py +0 -0
{scrapling-0.2.92/scrapling.egg-info → scrapling-0.2.94}/PKG-INFO

@@ -1,6 +1,6 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.2
 Name: scrapling
-Version: 0.2.92
+Version: 0.2.94
 Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
 Home-page: https://github.com/D4Vinci/Scrapling
 Author: Karim Shoair
@@ -10,7 +10,7 @@ Project-URL: Documentation, https://github.com/D4Vinci/Scrapling/tree/main/docs
 Project-URL: Source, https://github.com/D4Vinci/Scrapling
 Project-URL: Tracker, https://github.com/D4Vinci/Scrapling/issues
 Classifier: Operating System :: OS Independent
-Classifier: Development Status :: 4 - Beta
+Classifier: Development Status :: 4 - Beta
 Classifier: Intended Audience :: Developers
 Classifier: License :: OSI Approved :: BSD License
 Classifier: Natural Language :: English
@@ -31,8 +31,7 @@ Classifier: Typing :: Typed
 Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist:
-Requires-Dist: lxml>=4.5
+Requires-Dist: lxml>=5.0
 Requires-Dist: cssselect>=1.2
 Requires-Dist: click
 Requires-Dist: w3lib
@@ -41,7 +40,18 @@ Requires-Dist: tldextract
 Requires-Dist: httpx[brotli,socks,zstd]
 Requires-Dist: playwright>=1.49.1
 Requires-Dist: rebrowser-playwright>=1.49.1
-Requires-Dist: camoufox[geoip]>=0.4.
+Requires-Dist: camoufox[geoip]>=0.4.11
+Dynamic: author
+Dynamic: author-email
+Dynamic: classifier
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: home-page
+Dynamic: license
+Dynamic: project-url
+Dynamic: requires-dist
+Dynamic: requires-python
+Dynamic: summary
 
 # 🕷️ Scrapling: Undetectable, Lightning-Fast, and Adaptive Web Scraping for Python
 [](https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml) [](https://badge.fury.io/py/Scrapling) [](https://pypi.org/project/scrapling/) [](https://pepy.tech/project/scrapling)
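The new `Dynamic:` lines go hand in hand with the `Metadata-Version` bump: under Metadata 2.2 (PEP 643), a build backend marks fields it computed at build time, typically values passed to `setup()` in `setup.py`, as dynamic. Below is a hypothetical, heavily abbreviated `setup.py` illustrating how such fields arise; it is not scrapling's actual file, and the values are copied from the metadata above only for illustration:

```python
# Hypothetical, abbreviated setup.py. Because these fields are supplied
# programmatically here rather than in static config, newer setuptools
# emits "Dynamic: ..." markers for them when it writes an sdist's
# PKG-INFO with Metadata-Version 2.2.
from setuptools import setup

setup(
    name="scrapling",
    version="0.2.94",
    author="Karim Shoair",
    install_requires=["lxml>=5.0", "cssselect>=1.2", "click", "w3lib"],
)
```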
@@ -78,6 +88,21 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha
 [](https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling)
 ---
 
+[Scrapeless](https://www.scrapeless.com/?utm_source=github&utm_medium=ads&utm_campaign=scraping&utm_term=D4Vinci) is your all-in-one web scraping toolkit, starting at just $0.60 per 1k URLs!
+
+- 🚀 Scraping API: Effortless and highly customizable data extraction with a single API call, providing structured data from any website.
+- ⚡ Scraping Browser: AI-powered and LLM-driven, it simulates human-like behavior with genuine fingerprints and headless browser support, ensuring seamless, block-free scraping.
+- 🔒 Web Unlocker: Bypass CAPTCHAs, IP blocks, and dynamic content in real time, ensuring uninterrupted access.
+- 🌐 Proxies: Use high-quality, rotating proxies to scrape top platforms like Amazon, Shopee, and more, with global coverage in 195+ countries.
+- 💼 Enterprise-Grade: Custom solutions for large-scale and complex data needs.
+- 🎁 Free Trial: Try before you buy—experience our service firsthand.
+- 💬 Pay-Per-Use: Flexible, cost-effective pricing with no long-term commitments.
+- 🔧 Easy Integration: Seamlessly integrate with your existing tools and workflows for hassle-free automation.
+
+
+[](https://www.scrapeless.com/?utm_source=github&utm_medium=ads&utm_campaign=scraping&utm_term=D4Vinci)
+---
+
 ## Table of content
 * [Key Features](#key-features)
 * [Fetch websites as you prefer](#fetch-websites-as-you-prefer-with-async-support)
@@ -122,27 +147,27 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha
 ## Key Features
 
 ### Fetch websites as you prefer with async support
-- **HTTP
-- **
-- **
+- **HTTP Requests**: Fast and stealthy HTTP requests with the `Fetcher` class.
+- **Dynamic Loading & Automation**: Fetch dynamic websites with the `PlayWrightFetcher` class through your real browser, Scrapling's stealth mode, Playwright's Chrome browser, or [NSTbrowser](https://app.nstbrowser.io/r/1vO5e5)'s browserless!
+- **Anti-bot Protections Bypass**: Easily bypass protections with `StealthyFetcher` and `PlayWrightFetcher` classes.
 
 ### Adaptive Scraping
-- 🔄 **Smart Element Tracking**:
-- 🎯 **Flexible
-- 🔍 **Find Similar Elements**: Automatically locate elements similar to the element you
+- 🔄 **Smart Element Tracking**: Relocate elements after website changes, using an intelligent similarity system and integrated storage.
+- 🎯 **Flexible Selection**: CSS selectors, XPath selectors, filters-based search, text search, regex search and more.
+- 🔍 **Find Similar Elements**: Automatically locate elements similar to the element you found!
 - 🧠 **Smart Content Scraping**: Extract data from multiple websites without specific selectors using Scrapling powerful features.
 
-### Performance
-- 🚀 **Lightning Fast**: Built from the ground up with performance in mind, outperforming most popular Python scraping libraries
+### High Performance
+- 🚀 **Lightning Fast**: Built from the ground up with performance in mind, outperforming most popular Python scraping libraries.
 - 🔋 **Memory Efficient**: Optimized data structures for minimal memory footprint.
-- ⚡ **Fast JSON serialization**: 10x faster
+- ⚡ **Fast JSON serialization**: 10x faster than standard library.
 
-###
-- 🛠️ **Powerful Navigation API**:
-- 🧬 **Rich Text Processing**: All strings have built-in
-- 📝 **
-- 🔌 **API Similar to Scrapy/BeautifulSoup
-- 📘 **Type hints
+### Developer Friendly
+- 🛠️ **Powerful Navigation API**: Easy DOM traversal in all directions.
+- 🧬 **Rich Text Processing**: All strings have built-in regex, cleaning methods, and more. All elements' attributes are optimized dictionaries that takes less memory than standard dictionaries with added methods.
+- 📝 **Auto Selectors Generation**: Generate robust short and full CSS/XPath selectors for any element.
+- 🔌 **Familiar API**: Similar to Scrapy/BeautifulSoup and the same pseudo-elements used in Scrapy.
+- 📘 **Type hints**: Complete type/doc-strings coverage for future-proofing and best autocompletion support.
 
 ## Getting Started
@@ -151,21 +176,22 @@ from scrapling import Fetcher
 
 fetcher = Fetcher(auto_match=False)
 
-#
+# Do http GET request to a web page and create an Adaptor instance
 page = fetcher.get('https://quotes.toscrape.com/', stealthy_headers=True)
-# Get all
+# Get all text content from all HTML tags in the page except `script` and `style` tags
 page.get_all_text(ignore_tags=('script', 'style'))
 
-# Get all quotes, any of these methods will return a list of strings (TextHandlers)
+# Get all quotes elements, any of these methods will return a list of strings directly (TextHandlers)
 quotes = page.css('.quote .text::text') # CSS selector
 quotes = page.xpath('//span[@class="text"]/text()') # XPath
 quotes = page.css('.quote').css('.text::text') # Chained selectors
 quotes = [element.text for element in page.css('.quote .text')] # Slower than bulk query above
 
 # Get the first quote element
-quote = page.css_first('.quote') #
+quote = page.css_first('.quote') # same as page.css('.quote').first or page.css('.quote')[0]
 
 # Tired of selectors? Use find_all/find
+# Get all 'div' HTML tags that one of its 'class' values is 'quote'
 quotes = page.find_all('div', {'class': 'quote'})
 # Same as
 quotes = page.find_all('div', class_='quote')
@@ -173,10 +199,10 @@ quotes = page.find_all(['div'], class_='quote')
 quotes = page.find_all(class_='quote') # and so on...
 
 # Working with elements
-quote.html_content # Inner HTML
-quote.prettify() # Prettified version of Inner HTML
-quote.attrib #
-quote.path # DOM path to element (List)
+quote.html_content # Get Inner HTML of this element
+quote.prettify() # Prettified version of Inner HTML above
+quote.attrib # Get that element's attributes
+quote.path # DOM path to element (List of all ancestors from <html> tag till the element itself)
 ```
 To keep it simple, all methods can be chained on top of each other!
 
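The hunk above leans on every selected string being a `TextHandler` rather than a plain `str`. A small usage sketch follows; `clean()` and `re_first()` are assumed helper methods on scrapling's string type (its `custom_types` module is reworked in this very release, +97 -45, so verify the names against 0.2.94):

```python
from scrapling import Fetcher

page = Fetcher(auto_match=False).get('https://quotes.toscrape.com/')

first_quote = page.css_first('.quote .text::text')  # a TextHandler (str subclass)
print(first_quote.clean())           # assumed helper: whitespace-normalized text
print(first_quote.re_first(r'\w+'))  # assumed helper: first regex match as a string
```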
@@ -241,7 +267,7 @@ then use it right away without initializing like:
 page = StealthyFetcher.fetch('https://example.com')
 ```
 
-Also, the `Response` object returned from all fetchers is the same as the `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`. All `cookies`, `headers`, and `request_headers` are always of type `dictionary`.
+Also, the `Response` object returned from all fetchers is the same as the `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, `history`, and `request_headers`. All `cookies`, `headers`, and `request_headers` are always of type `dictionary`.
 > [!NOTE]
 > The `auto_match` argument is enabled by default which is the one you should care about the most as you will see later.
 ### Fetcher
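`history` is the attribute this release adds to that list. A quick sketch of reading the `Response` extras after a fetch; the attribute names come from the paragraph above, while the semantics of `history` (the chain of prior redirect responses) is an assumption based on the convention in HTTP clients:

```python
from scrapling import Fetcher

page = Fetcher().get('https://quotes.toscrape.com/')

print(page.status, page.reason)    # e.g. 200 OK
print(page.headers, page.cookies)  # plain dictionaries, per the docs above
print(page.request_headers)        # the headers that were actually sent
print(page.history)                # new in this release; assumed: redirect chain
```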
@@ -292,7 +318,7 @@ True
 | humanize | Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window. | ✔️ |
 | allow_webgl | Enabled by default. Disabling it WebGL not recommended as many WAFs now checks if WebGL is enabled. | ✔️ |
 | geoip | Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, & spoof the WebRTC IP address. It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region. | ✔️ |
-| disable_ads |
+| disable_ads | Disabled by default, this installs `uBlock Origin` addon on the browser if enabled. | ✔️ |
 | network_idle | Wait for the page until there are no network connections for at least 500 ms. | ✔️ |
 | timeout | The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000. | ✔️ |
 | wait_selector | Wait for a specific css selector to be in a specific state. | ✔️ |
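Pulling a few rows of that table together, here is a hedged sketch of a `StealthyFetcher.fetch` call. Every keyword below is documented in the table (including the newly documented `disable_ads`), but this particular combination is illustrative rather than a recommended recipe:

```python
from scrapling import StealthyFetcher

page = StealthyFetcher.fetch(
    'https://example.com',
    humanize=1.5,       # cap cursor-movement duration at 1.5 seconds
    geoip=True,         # derive timezone/locale/WebRTC from the exit IP
    disable_ads=True,   # opt in to the uBlock Origin addon (off by default)
    network_idle=True,  # wait for 500 ms with no network connections
    timeout=30000,      # milliseconds; 30000 is the documented default
)
print(page.status)
```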
@@ -574,7 +600,7 @@ Inspired by BeautifulSoup's `find_all` function you can find elements by using `
 * Any string passed is considered a tag name
 * Any iterable passed like List/Tuple/Set is considered an iterable of tag names.
 * Any dictionary is considered a mapping of HTML element(s) attribute names and attribute values.
-* Any regex patterns passed are used as filters
+* Any regex patterns passed are used as filters to elements by their text content
 * Any functions passed are used as filters
 * Any keyword argument passed is considered as an HTML element attribute with its value.
@@ -583,7 +609,7 @@ So the way it works is after collecting all passed arguments and keywords, each
 
 1. All elements with the passed tag name(s).
 2. All elements that match all passed attribute(s).
-3. All elements that match all passed regex patterns.
+3. All elements that its text content match all passed regex patterns.
 4. All elements that fulfill all passed function(s).
 
 Note: The filtering process always starts from the first filter it finds in the filtering order above so if no tag name(s) are passed but attributes are passed, the process starts from that layer and so on. **But the order in which you pass the arguments doesn't matter.**
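Mapping those filter layers onto a single call, a sketch that combines a tag name (string), an attribute mapping (dict), a regex (matched against text content, per the clarified wording above), and a function filter; the lambda predicate is hypothetical:

```python
import re
from scrapling import Fetcher

page = Fetcher(auto_match=False).get('https://quotes.toscrape.com/')

quotes = page.find_all(
    'div',                   # tag-name filter (layer 1)
    {'class': 'quote'},      # attribute filter (layer 2)
    re.compile('Einstein'),  # regex filter on text content (layer 3)
    lambda el: 'quote' in el.attrib.get('class', ''),  # hypothetical function filter (layer 4)
)
```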
{scrapling-0.2.92 → scrapling-0.2.94}/README.md

@@ -33,6 +33,21 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha
 [](https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling)
 ---
 
+[Scrapeless](https://www.scrapeless.com/?utm_source=github&utm_medium=ads&utm_campaign=scraping&utm_term=D4Vinci) is your all-in-one web scraping toolkit, starting at just $0.60 per 1k URLs!
+
+- 🚀 Scraping API: Effortless and highly customizable data extraction with a single API call, providing structured data from any website.
+- ⚡ Scraping Browser: AI-powered and LLM-driven, it simulates human-like behavior with genuine fingerprints and headless browser support, ensuring seamless, block-free scraping.
+- 🔒 Web Unlocker: Bypass CAPTCHAs, IP blocks, and dynamic content in real time, ensuring uninterrupted access.
+- 🌐 Proxies: Use high-quality, rotating proxies to scrape top platforms like Amazon, Shopee, and more, with global coverage in 195+ countries.
+- 💼 Enterprise-Grade: Custom solutions for large-scale and complex data needs.
+- 🎁 Free Trial: Try before you buy—experience our service firsthand.
+- 💬 Pay-Per-Use: Flexible, cost-effective pricing with no long-term commitments.
+- 🔧 Easy Integration: Seamlessly integrate with your existing tools and workflows for hassle-free automation.
+
+
+[](https://www.scrapeless.com/?utm_source=github&utm_medium=ads&utm_campaign=scraping&utm_term=D4Vinci)
+---
+
 ## Table of content
 * [Key Features](#key-features)
 * [Fetch websites as you prefer](#fetch-websites-as-you-prefer-with-async-support)
@@ -77,27 +92,27 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha
 ## Key Features
 
 ### Fetch websites as you prefer with async support
-- **HTTP
-- **
-- **
+- **HTTP Requests**: Fast and stealthy HTTP requests with the `Fetcher` class.
+- **Dynamic Loading & Automation**: Fetch dynamic websites with the `PlayWrightFetcher` class through your real browser, Scrapling's stealth mode, Playwright's Chrome browser, or [NSTbrowser](https://app.nstbrowser.io/r/1vO5e5)'s browserless!
+- **Anti-bot Protections Bypass**: Easily bypass protections with `StealthyFetcher` and `PlayWrightFetcher` classes.
 
 ### Adaptive Scraping
-- 🔄 **Smart Element Tracking**:
-- 🎯 **Flexible
-- 🔍 **Find Similar Elements**: Automatically locate elements similar to the element you
+- 🔄 **Smart Element Tracking**: Relocate elements after website changes, using an intelligent similarity system and integrated storage.
+- 🎯 **Flexible Selection**: CSS selectors, XPath selectors, filters-based search, text search, regex search and more.
+- 🔍 **Find Similar Elements**: Automatically locate elements similar to the element you found!
 - 🧠 **Smart Content Scraping**: Extract data from multiple websites without specific selectors using Scrapling powerful features.
 
-### Performance
-- 🚀 **Lightning Fast**: Built from the ground up with performance in mind, outperforming most popular Python scraping libraries
+### High Performance
+- 🚀 **Lightning Fast**: Built from the ground up with performance in mind, outperforming most popular Python scraping libraries.
 - 🔋 **Memory Efficient**: Optimized data structures for minimal memory footprint.
-- ⚡ **Fast JSON serialization**: 10x faster
+- ⚡ **Fast JSON serialization**: 10x faster than standard library.
 
-###
-- 🛠️ **Powerful Navigation API**:
-- 🧬 **Rich Text Processing**: All strings have built-in
-- 📝 **
-- 🔌 **API Similar to Scrapy/BeautifulSoup
-- 📘 **Type hints
+### Developer Friendly
+- 🛠️ **Powerful Navigation API**: Easy DOM traversal in all directions.
+- 🧬 **Rich Text Processing**: All strings have built-in regex, cleaning methods, and more. All elements' attributes are optimized dictionaries that takes less memory than standard dictionaries with added methods.
+- 📝 **Auto Selectors Generation**: Generate robust short and full CSS/XPath selectors for any element.
+- 🔌 **Familiar API**: Similar to Scrapy/BeautifulSoup and the same pseudo-elements used in Scrapy.
+- 📘 **Type hints**: Complete type/doc-strings coverage for future-proofing and best autocompletion support.
 
 ## Getting Started
@@ -106,21 +121,22 @@ from scrapling import Fetcher
 
 fetcher = Fetcher(auto_match=False)
 
-#
+# Do http GET request to a web page and create an Adaptor instance
 page = fetcher.get('https://quotes.toscrape.com/', stealthy_headers=True)
-# Get all
+# Get all text content from all HTML tags in the page except `script` and `style` tags
 page.get_all_text(ignore_tags=('script', 'style'))
 
-# Get all quotes, any of these methods will return a list of strings (TextHandlers)
+# Get all quotes elements, any of these methods will return a list of strings directly (TextHandlers)
 quotes = page.css('.quote .text::text') # CSS selector
 quotes = page.xpath('//span[@class="text"]/text()') # XPath
 quotes = page.css('.quote').css('.text::text') # Chained selectors
 quotes = [element.text for element in page.css('.quote .text')] # Slower than bulk query above
 
 # Get the first quote element
-quote = page.css_first('.quote') #
+quote = page.css_first('.quote') # same as page.css('.quote').first or page.css('.quote')[0]
 
 # Tired of selectors? Use find_all/find
+# Get all 'div' HTML tags that one of its 'class' values is 'quote'
 quotes = page.find_all('div', {'class': 'quote'})
 # Same as
 quotes = page.find_all('div', class_='quote')
@@ -128,10 +144,10 @@ quotes = page.find_all(['div'], class_='quote')
 quotes = page.find_all(class_='quote') # and so on...
 
 # Working with elements
-quote.html_content # Inner HTML
-quote.prettify() # Prettified version of Inner HTML
-quote.attrib #
-quote.path # DOM path to element (List)
+quote.html_content # Get Inner HTML of this element
+quote.prettify() # Prettified version of Inner HTML above
+quote.attrib # Get that element's attributes
+quote.path # DOM path to element (List of all ancestors from <html> tag till the element itself)
 ```
 To keep it simple, all methods can be chained on top of each other!
 
@@ -196,7 +212,7 @@ then use it right away without initializing like:
 page = StealthyFetcher.fetch('https://example.com')
 ```
 
-Also, the `Response` object returned from all fetchers is the same as the `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`. All `cookies`, `headers`, and `request_headers` are always of type `dictionary`.
+Also, the `Response` object returned from all fetchers is the same as the `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, `history`, and `request_headers`. All `cookies`, `headers`, and `request_headers` are always of type `dictionary`.
 > [!NOTE]
 > The `auto_match` argument is enabled by default which is the one you should care about the most as you will see later.
 ### Fetcher
@@ -247,7 +263,7 @@ True
 | humanize | Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window. | ✔️ |
 | allow_webgl | Enabled by default. Disabling it WebGL not recommended as many WAFs now checks if WebGL is enabled. | ✔️ |
 | geoip | Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, & spoof the WebRTC IP address. It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region. | ✔️ |
-| disable_ads |
+| disable_ads | Disabled by default, this installs `uBlock Origin` addon on the browser if enabled. | ✔️ |
 | network_idle | Wait for the page until there are no network connections for at least 500 ms. | ✔️ |
 | timeout | The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000. | ✔️ |
 | wait_selector | Wait for a specific css selector to be in a specific state. | ✔️ |
@@ -529,7 +545,7 @@ Inspired by BeautifulSoup's `find_all` function you can find elements by using `
 * Any string passed is considered a tag name
 * Any iterable passed like List/Tuple/Set is considered an iterable of tag names.
 * Any dictionary is considered a mapping of HTML element(s) attribute names and attribute values.
-* Any regex patterns passed are used as filters
+* Any regex patterns passed are used as filters to elements by their text content
 * Any functions passed are used as filters
 * Any keyword argument passed is considered as an HTML element attribute with its value.
@@ -538,7 +554,7 @@ So the way it works is after collecting all passed arguments and keywords, each
 
 1. All elements with the passed tag name(s).
 2. All elements that match all passed attribute(s).
-3. All elements that match all passed regex patterns.
+3. All elements that its text content match all passed regex patterns.
 4. All elements that fulfill all passed function(s).
 
 Note: The filtering process always starts from the first filter it finds in the filtering order above so if no tag name(s) are passed but attributes are passed, the process starts from that layer and so on. **But the order in which you pass the arguments doesn't matter.**
{scrapling-0.2.92 → scrapling-0.2.94}/scrapling/__init__.py

@@ -5,7 +5,7 @@ from scrapling.fetchers import (AsyncFetcher, CustomFetcher, Fetcher,
 from scrapling.parser import Adaptor, Adaptors
 
 __author__ = "Karim Shoair (karim.shoair@pm.me)"
-__version__ = "0.2.92"
+__version__ = "0.2.94"
 __copyright__ = "Copyright (c) 2024 Karim Shoair"
 
 
{scrapling-0.2.92 → scrapling-0.2.94}/scrapling/core/_types.py

@@ -3,7 +3,8 @@ Type definitions for type checking purposes.
 """
 
 from typing import (TYPE_CHECKING, Any, Callable, Dict, Generator, Iterable,
-                    List, Literal, Optional, Pattern, Tuple, Type, Union)
+                    List, Literal, Optional, Pattern, Tuple, Type, TypeVar,
+                    Union)
 
 SelectorWaitStates = Literal["attached", "detached", "hidden", "visible"]
 
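The widened import suggests generic typing landing somewhere in the reworked parser layer. As a generic-Python illustration of what `TypeVar` buys over a plain `Union` (this is not scrapling's actual code):

```python
from typing import List, TypeVar

T = TypeVar("T")

def first(items: List[T]) -> T:
    # A TypeVar ties the return type to the element type, so callers of
    # first([1, 2, 3]) get `int` back instead of a loose Union or Any.
    return items[0]
```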