scrapling 0.2.91__tar.gz → 0.2.93__tar.gz
- {scrapling-0.2.91 → scrapling-0.2.93}/MANIFEST.in +3 -0
- {scrapling-0.2.91/scrapling.egg-info → scrapling-0.2.93}/PKG-INFO +64 -71
- {scrapling-0.2.91 → scrapling-0.2.93}/README.md +47 -65
- {scrapling-0.2.91 → scrapling-0.2.93}/scrapling/__init__.py +1 -1
- scrapling-0.2.93/scrapling/cli.py +37 -0
- {scrapling-0.2.91 → scrapling-0.2.93}/scrapling/core/_types.py +2 -1
- {scrapling-0.2.91 → scrapling-0.2.93}/scrapling/core/custom_types.py +91 -39
- {scrapling-0.2.91 → scrapling-0.2.93}/scrapling/core/translator.py +1 -1
- scrapling-0.2.93/scrapling/defaults.py +10 -0
- {scrapling-0.2.91 → scrapling-0.2.93}/scrapling/engines/camo.py +16 -14
- {scrapling-0.2.91 → scrapling-0.2.93}/scrapling/engines/pw.py +11 -13
- {scrapling-0.2.91 → scrapling-0.2.93}/scrapling/fetchers.py +5 -5
- {scrapling-0.2.91 → scrapling-0.2.93}/scrapling/parser.py +155 -191
- {scrapling-0.2.91 → scrapling-0.2.93/scrapling.egg-info}/PKG-INFO +64 -71
- {scrapling-0.2.91 → scrapling-0.2.93}/scrapling.egg-info/SOURCES.txt +2 -0
- scrapling-0.2.93/scrapling.egg-info/entry_points.txt +2 -0
- {scrapling-0.2.91 → scrapling-0.2.93}/scrapling.egg-info/requires.txt +3 -3
- {scrapling-0.2.91 → scrapling-0.2.93}/setup.cfg +1 -1
- {scrapling-0.2.91 → scrapling-0.2.93}/setup.py +10 -5
- {scrapling-0.2.91 → scrapling-0.2.93}/tests/fetchers/async/test_playwright.py +1 -1
- {scrapling-0.2.91 → scrapling-0.2.93}/tests/fetchers/sync/test_playwright.py +1 -1
- scrapling-0.2.91/scrapling/defaults.py +0 -7
- {scrapling-0.2.91 → scrapling-0.2.93}/LICENSE +0 -0
- {scrapling-0.2.91 → scrapling-0.2.93}/scrapling/core/__init__.py +0 -0
- {scrapling-0.2.91 → scrapling-0.2.93}/scrapling/core/mixins.py +0 -0
- {scrapling-0.2.91 → scrapling-0.2.93}/scrapling/core/storage_adaptors.py +0 -0
- {scrapling-0.2.91 → scrapling-0.2.93}/scrapling/core/utils.py +0 -0
- {scrapling-0.2.91 → scrapling-0.2.93}/scrapling/engines/__init__.py +0 -0
- {scrapling-0.2.91 → scrapling-0.2.93}/scrapling/engines/constants.py +0 -0
- {scrapling-0.2.91 → scrapling-0.2.93}/scrapling/engines/static.py +0 -0
- {scrapling-0.2.91 → scrapling-0.2.93}/scrapling/engines/toolbelt/__init__.py +0 -0
- {scrapling-0.2.91 → scrapling-0.2.93}/scrapling/engines/toolbelt/bypasses/navigator_plugins.js +0 -0
- {scrapling-0.2.91 → scrapling-0.2.93}/scrapling/engines/toolbelt/bypasses/notification_permission.js +0 -0
- {scrapling-0.2.91 → scrapling-0.2.93}/scrapling/engines/toolbelt/bypasses/pdf_viewer.js +0 -0
- {scrapling-0.2.91 → scrapling-0.2.93}/scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +0 -0
- {scrapling-0.2.91 → scrapling-0.2.93}/scrapling/engines/toolbelt/bypasses/screen_props.js +0 -0
- {scrapling-0.2.91 → scrapling-0.2.93}/scrapling/engines/toolbelt/bypasses/webdriver_fully.js +0 -0
- {scrapling-0.2.91 → scrapling-0.2.93}/scrapling/engines/toolbelt/bypasses/window_chrome.js +0 -0
- {scrapling-0.2.91 → scrapling-0.2.93}/scrapling/engines/toolbelt/custom.py +0 -0
- {scrapling-0.2.91 → scrapling-0.2.93}/scrapling/engines/toolbelt/fingerprints.py +0 -0
- {scrapling-0.2.91 → scrapling-0.2.93}/scrapling/engines/toolbelt/navigation.py +0 -0
- {scrapling-0.2.91 → scrapling-0.2.93}/scrapling/py.typed +0 -0
- {scrapling-0.2.91 → scrapling-0.2.93}/scrapling.egg-info/dependency_links.txt +0 -0
- {scrapling-0.2.91 → scrapling-0.2.93}/scrapling.egg-info/not-zip-safe +0 -0
- {scrapling-0.2.91 → scrapling-0.2.93}/scrapling.egg-info/top_level.txt +0 -0
- {scrapling-0.2.91 → scrapling-0.2.93}/tests/__init__.py +0 -0
- {scrapling-0.2.91 → scrapling-0.2.93}/tests/fetchers/__init__.py +0 -0
- {scrapling-0.2.91 → scrapling-0.2.93}/tests/fetchers/async/__init__.py +0 -0
- {scrapling-0.2.91 → scrapling-0.2.93}/tests/fetchers/async/test_camoufox.py +0 -0
- {scrapling-0.2.91 → scrapling-0.2.93}/tests/fetchers/async/test_httpx.py +0 -0
- {scrapling-0.2.91 → scrapling-0.2.93}/tests/fetchers/sync/__init__.py +0 -0
- {scrapling-0.2.91 → scrapling-0.2.93}/tests/fetchers/sync/test_camoufox.py +0 -0
- {scrapling-0.2.91 → scrapling-0.2.93}/tests/fetchers/sync/test_httpx.py +0 -0
- {scrapling-0.2.91 → scrapling-0.2.93}/tests/fetchers/test_utils.py +0 -0
- {scrapling-0.2.91 → scrapling-0.2.93}/tests/parser/__init__.py +0 -0
- {scrapling-0.2.91 → scrapling-0.2.93}/tests/parser/test_automatch.py +0 -0
- {scrapling-0.2.91 → scrapling-0.2.93}/tests/parser/test_general.py +0 -0
{scrapling-0.2.91 → scrapling-0.2.93}/MANIFEST.in
@@ -4,7 +4,10 @@ include *.js
 include scrapling/engines/toolbelt/bypasses/*.js
 include scrapling/*.db
 include scrapling/*.db*
+include scrapling/*.db-*
 include scrapling/py.typed
+include scrapling/.scrapling_dependencies_installed
+include .scrapling_dependencies_installed
 
 recursive-exclude * __pycache__
 recursive-exclude * *.py[co]
{scrapling-0.2.91/scrapling.egg-info → scrapling-0.2.93}/PKG-INFO
@@ -1,6 +1,6 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.2
 Name: scrapling
-Version: 0.2.91
+Version: 0.2.93
 Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
 Home-page: https://github.com/D4Vinci/Scrapling
 Author: Karim Shoair
@@ -10,7 +10,7 @@ Project-URL: Documentation, https://github.com/D4Vinci/Scrapling/tree/main/docs
 Project-URL: Source, https://github.com/D4Vinci/Scrapling
 Project-URL: Tracker, https://github.com/D4Vinci/Scrapling/issues
 Classifier: Operating System :: OS Independent
-Classifier: Development Status :: 4 - Beta
+Classifier: Development Status :: 4 - Beta
 Classifier: Intended Audience :: Developers
 Classifier: License :: OSI Approved :: BSD License
 Classifier: Natural Language :: English
@@ -31,16 +31,27 @@ Classifier: Typing :: Typed
 Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist:
-Requires-Dist: lxml>=4.5
+Requires-Dist: lxml>=5.0
 Requires-Dist: cssselect>=1.2
+Requires-Dist: click
 Requires-Dist: w3lib
 Requires-Dist: orjson>=3
 Requires-Dist: tldextract
 Requires-Dist: httpx[brotli,socks,zstd]
 Requires-Dist: playwright>=1.49.1
 Requires-Dist: rebrowser-playwright>=1.49.1
-Requires-Dist: camoufox[geoip]>=0.4.
+Requires-Dist: camoufox[geoip]>=0.4.10
+Dynamic: author
+Dynamic: author-email
+Dynamic: classifier
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: home-page
+Dynamic: license
+Dynamic: project-url
+Dynamic: requires-dist
+Dynamic: requires-python
+Dynamic: summary
 
 # 🕷️ Scrapling: Undetectable, Lightning-Fast, and Adaptive Web Scraping for Python
 [](https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml) [](https://badge.fury.io/py/Scrapling) [](https://pypi.org/project/scrapling/) [](https://pepy.tech/project/scrapling)
@@ -77,6 +88,21 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha
 [](https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling)
 ---
 
+[Scrapeless](https://www.scrapeless.com/?utm_source=github&utm_medium=ads&utm_campaign=scraping&utm_term=D4Vinci) is your all-in-one web scraping toolkit, starting at just $0.60 per 1k URLs!
+
+- 🚀 Scraping API: Effortless and highly customizable data extraction with a single API call, providing structured data from any website.
+- ⚡ Scraping Browser: AI-powered and LLM-driven, it simulates human-like behavior with genuine fingerprints and headless browser support, ensuring seamless, block-free scraping.
+- 🔒 Web Unlocker: Bypass CAPTCHAs, IP blocks, and dynamic content in real time, ensuring uninterrupted access.
+- 🌐 Proxies: Use high-quality, rotating proxies to scrape top platforms like Amazon, Shopee, and more, with global coverage in 195+ countries.
+- 💼 Enterprise-Grade: Custom solutions for large-scale and complex data needs.
+- 🎁 Free Trial: Try before you buy—experience our service firsthand.
+- 💬 Pay-Per-Use: Flexible, cost-effective pricing with no long-term commitments.
+- 🔧 Easy Integration: Seamlessly integrate with your existing tools and workflows for hassle-free automation.
+
+
+[](https://www.scrapeless.com/?utm_source=github&utm_medium=ads&utm_campaign=scraping&utm_term=D4Vinci)
+---
+
 ## Table of content
 * [Key Features](#key-features)
 * [Fetch websites as you prefer](#fetch-websites-as-you-prefer-with-async-support)
@@ -121,27 +147,27 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha
 ## Key Features
 
 ### Fetch websites as you prefer with async support
-- **HTTP
-- **
-- **
+- **HTTP Requests**: Fast and stealthy HTTP requests with the `Fetcher` class.
+- **Dynamic Loading & Automation**: Fetch dynamic websites with the `PlayWrightFetcher` class through your real browser, Scrapling's stealth mode, Playwright's Chrome browser, or [NSTbrowser](https://app.nstbrowser.io/r/1vO5e5)'s browserless!
+- **Anti-bot Protections Bypass**: Easily bypass protections with `StealthyFetcher` and `PlayWrightFetcher` classes.
 
 ### Adaptive Scraping
-- 🔄 **Smart Element Tracking**:
-- 🎯 **Flexible
-- 🔍 **Find Similar Elements**: Automatically locate elements similar to the element you
+- 🔄 **Smart Element Tracking**: Relocate elements after website changes, using an intelligent similarity system and integrated storage.
+- 🎯 **Flexible Selection**: CSS selectors, XPath selectors, filters-based search, text search, regex search and more.
+- 🔍 **Find Similar Elements**: Automatically locate elements similar to the element you found!
 - 🧠 **Smart Content Scraping**: Extract data from multiple websites without specific selectors using Scrapling powerful features.
 
-### Performance
-- 🚀 **Lightning Fast**: Built from the ground up with performance in mind, outperforming most popular Python scraping libraries
+### High Performance
+- 🚀 **Lightning Fast**: Built from the ground up with performance in mind, outperforming most popular Python scraping libraries.
 - 🔋 **Memory Efficient**: Optimized data structures for minimal memory footprint.
-- ⚡ **Fast JSON serialization**: 10x faster
+- ⚡ **Fast JSON serialization**: 10x faster than standard library.
 
-### 
-- 🛠️ **Powerful Navigation API**:
-- 🧬 **Rich Text Processing**: All strings have built-in
-- 📝 **
-- 🔌 **API Similar to Scrapy/BeautifulSoup
-- 📘 **Type hints
+### Developer Friendly
+- 🛠️ **Powerful Navigation API**: Easy DOM traversal in all directions.
+- 🧬 **Rich Text Processing**: All strings have built-in regex, cleaning methods, and more. All elements' attributes are optimized dictionaries that takes less memory than standard dictionaries with added methods.
+- 📝 **Auto Selectors Generation**: Generate robust short and full CSS/XPath selectors for any element.
+- 🔌 **Familiar API**: Similar to Scrapy/BeautifulSoup and the same pseudo-elements used in Scrapy.
+- 📘 **Type hints**: Complete type/doc-strings coverage for future-proofing and best autocompletion support.
 
 ## Getting Started
 
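Editor's note: the "Find Similar Elements" bullet above refers to Scrapling's similarity API. A rough sketch of how it is typically used follows; the `find_similar()` method name comes from the project README rather than this diff, so treat the exact signature as an assumption.

```python
# Rough sketch of the "Find Similar Elements" feature named above.
# Assumes the `find_similar()` Adaptor method documented in the project README
# and network access to quotes.toscrape.com.
from scrapling import Fetcher

page = Fetcher(auto_match=False).get('https://quotes.toscrape.com/')
first_quote = page.css_first('.quote')
# Locate the other elements on the page that structurally resemble this one
similar = first_quote.find_similar()
print(len(similar))
```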
@@ -150,21 +176,22 @@ from scrapling import Fetcher
 
 fetcher = Fetcher(auto_match=False)
 
-#
+# Do http GET request to a web page and create an Adaptor instance
 page = fetcher.get('https://quotes.toscrape.com/', stealthy_headers=True)
-# Get all
+# Get all text content from all HTML tags in the page except `script` and `style` tags
 page.get_all_text(ignore_tags=('script', 'style'))
 
-# Get all quotes, any of these methods will return a list of strings (TextHandlers)
+# Get all quotes elements, any of these methods will return a list of strings directly (TextHandlers)
 quotes = page.css('.quote .text::text') # CSS selector
 quotes = page.xpath('//span[@class="text"]/text()') # XPath
 quotes = page.css('.quote').css('.text::text') # Chained selectors
 quotes = [element.text for element in page.css('.quote .text')] # Slower than bulk query above
 
 # Get the first quote element
-quote = page.css_first('.quote') #
+quote = page.css_first('.quote') # same as page.css('.quote').first or page.css('.quote')[0]
 
 # Tired of selectors? Use find_all/find
+# Get all 'div' HTML tags that one of its 'class' values is 'quote'
 quotes = page.find_all('div', {'class': 'quote'})
 # Same as
 quotes = page.find_all('div', class_='quote')
@@ -172,10 +199,10 @@ quotes = page.find_all(['div'], class_='quote')
 quotes = page.find_all(class_='quote') # and so on...
 
 # Working with elements
-quote.html_content # Inner HTML
-quote.prettify() # Prettified version of Inner HTML
-quote.attrib #
-quote.path # DOM path to element (List)
+quote.html_content # Get Inner HTML of this element
+quote.prettify() # Prettified version of Inner HTML above
+quote.attrib # Get that element's attributes
+quote.path # DOM path to element (List of all ancestors from <html> tag till the element itself)
 ```
 To keep it simple, all methods can be chained on top of each other!
 
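For orientation, a minimal runnable sketch of the element API annotated in the hunk above; it assumes network access to quotes.toscrape.com and scrapling 0.2.93 installed.

```python
# Minimal sketch exercising the attributes documented in the diff above:
# html_content, prettify(), attrib, and path are all shown in the + lines.
from scrapling import Fetcher

page = Fetcher(auto_match=False).get('https://quotes.toscrape.com/')
quote = page.css_first('.quote')

print(quote.attrib)        # the element's attributes as an optimized mapping
print(quote.html_content)  # inner HTML of the element
print(quote.path)          # ancestors from <html> down to the element itself
```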
@@ -211,52 +238,18 @@ Scrapling can find elements with more methods and it returns full element `Adapt
 > All benchmarks' results are an average of 100 runs. See our [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) for methodology and to run your comparisons.
 
 ## Installation
-Scrapling is a breeze to get started with
+Scrapling is a breeze to get started with; Starting from version 0.2.9, we require at least Python 3.9 to work.
 ```bash
 pip3 install scrapling
 ```
-
-<details><summary>Windows OS</summary>
-
+Then run this command to install browsers' dependencies needed to use Fetcher classes
 ```bash
-
-```
-</details>
-<details><summary>MacOS</summary>
-
-```bash
-python3 -m camoufox fetch --browserforge
-```
-</details>
-<details><summary>Linux</summary>
-
-```bash
-python -m camoufox fetch --browserforge
-```
-On a fresh installation of Linux, you may also need the following Firefox dependencies:
-- Debian-based distros
-```bash
-sudo apt install -y libgtk-3-0 libx11-xcb1 libasound2
-```
-- Arch-based distros
-```bash
-sudo pacman -S gtk3 libx11 libxcb cairo libasound alsa-lib
-```
-</details>
-
-<small> See the official <a href="https://camoufox.com/python/installation/#download-the-browser">Camoufox documentation</a> for more info on installation</small>
-
-- If you are going to use the `PlayWrightFetcher` options, then install Playwright's Chromium browser with:
-```commandline
-playwright install chromium
-```
-- If you are going to use normal requests only with the `Fetcher` class then update the fingerprints files with:
-```commandline
-python -m browserforge update
+scrapling install
 ```
+If you have any installation issues, please open an issue.
 
 ## Fetching Websites
-Fetchers are
+Fetchers are interfaces built on top of other libraries with added features that do requests or fetch pages for you in a single request fashion and then return an `Adaptor` object. This feature was introduced because the only option we had before was to fetch the page as you wanted it, then pass it manually to the `Adaptor` class to create an `Adaptor` instance and start playing around with the page.
 
 ### Features
 You might be slightly confused by now so let me clear things up. All fetcher-type classes are imported in the same way
@@ -325,7 +318,7 @@ True
 | humanize | Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window. | ✔️ |
 | allow_webgl | Enabled by default. Disabling it WebGL not recommended as many WAFs now checks if WebGL is enabled. | ✔️ |
 | geoip | Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, & spoof the WebRTC IP address. It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region. | ✔️ |
-| disable_ads |
+| disable_ads | Disabled by default, this installs `uBlock Origin` addon on the browser if enabled. | ✔️ |
 | network_idle | Wait for the page until there are no network connections for at least 500 ms. | ✔️ |
 | timeout | The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000. | ✔️ |
 | wait_selector | Wait for a specific css selector to be in a specific state. | ✔️ |
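The parameters in the table above belong to `StealthyFetcher.fetch`. A hedged sketch follows: the call shape is taken from the project README, and the argument values simply mirror the table's descriptions, so treat them as illustrative rather than authoritative.

```python
# Hedged sketch of StealthyFetcher.fetch using parameters from the table above.
from scrapling import StealthyFetcher

page = StealthyFetcher().fetch(
    'https://quotes.toscrape.com/',
    humanize=True,       # human-like cursor movement (up to ~1.5 s across the window)
    geoip=True,          # derive timezone/locale/WebRTC spoofing from the proxy IP
    disable_ads=True,    # install the uBlock Origin addon in the browser
    network_idle=True,   # wait for 500 ms with no network connections
    timeout=30000,       # milliseconds, the documented default
)
print(page.status)
```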
@@ -607,7 +600,7 @@ Inspired by BeautifulSoup's `find_all` function you can find elements by using `
 * Any string passed is considered a tag name
 * Any iterable passed like List/Tuple/Set is considered an iterable of tag names.
 * Any dictionary is considered a mapping of HTML element(s) attribute names and attribute values.
-* Any regex patterns passed are used as filters
+* Any regex patterns passed are used as filters to elements by their text content
 * Any functions passed are used as filters
 * Any keyword argument passed is considered as an HTML element attribute with its value.
 
@@ -616,7 +609,7 @@ So the way it works is after collecting all passed arguments and keywords, each
 
 1. All elements with the passed tag name(s).
 2. All elements that match all passed attribute(s).
-3. All elements that match all passed regex patterns.
+3. All elements that its text content match all passed regex patterns.
 4. All elements that fulfill all passed function(s).
 
 Note: The filtering process always starts from the first filter it finds in the filtering order above so if no tag name(s) are passed but attributes are passed, the process starts from that layer and so on. **But the order in which you pass the arguments doesn't matter.**
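A hedged sketch of the `find_all` filter semantics clarified above, combining all four filter kinds in one call; it assumes the quotes.toscrape.com markup used elsewhere in this README (quote divs carry an `itemtype` attribute there).

```python
# Sketch of find_all filters: tag name, attribute dict, regex on text content,
# and a predicate function. Per the note above, argument order does not change
# the filtering order, which always runs tag -> attributes -> regex -> function.
import re
from scrapling import Fetcher

page = Fetcher(auto_match=False).get('https://quotes.toscrape.com/')

quotes = page.find_all(
    'div',
    {'class': 'quote'},
    re.compile(r'life'),                   # keep elements whose text content matches
    lambda el: el.attrib.get('itemtype'),  # keep elements passing the predicate
)
print(len(quotes))
```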
{scrapling-0.2.91 → scrapling-0.2.93}/README.md
@@ -33,6 +33,21 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha
 [](https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling)
 ---
 
+[Scrapeless](https://www.scrapeless.com/?utm_source=github&utm_medium=ads&utm_campaign=scraping&utm_term=D4Vinci) is your all-in-one web scraping toolkit, starting at just $0.60 per 1k URLs!
+
+- 🚀 Scraping API: Effortless and highly customizable data extraction with a single API call, providing structured data from any website.
+- ⚡ Scraping Browser: AI-powered and LLM-driven, it simulates human-like behavior with genuine fingerprints and headless browser support, ensuring seamless, block-free scraping.
+- 🔒 Web Unlocker: Bypass CAPTCHAs, IP blocks, and dynamic content in real time, ensuring uninterrupted access.
+- 🌐 Proxies: Use high-quality, rotating proxies to scrape top platforms like Amazon, Shopee, and more, with global coverage in 195+ countries.
+- 💼 Enterprise-Grade: Custom solutions for large-scale and complex data needs.
+- 🎁 Free Trial: Try before you buy—experience our service firsthand.
+- 💬 Pay-Per-Use: Flexible, cost-effective pricing with no long-term commitments.
+- 🔧 Easy Integration: Seamlessly integrate with your existing tools and workflows for hassle-free automation.
+
+
+[](https://www.scrapeless.com/?utm_source=github&utm_medium=ads&utm_campaign=scraping&utm_term=D4Vinci)
+---
+
 ## Table of content
 * [Key Features](#key-features)
 * [Fetch websites as you prefer](#fetch-websites-as-you-prefer-with-async-support)
@@ -77,27 +92,27 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha
 ## Key Features
 
 ### Fetch websites as you prefer with async support
-- **HTTP
-- **
-- **
+- **HTTP Requests**: Fast and stealthy HTTP requests with the `Fetcher` class.
+- **Dynamic Loading & Automation**: Fetch dynamic websites with the `PlayWrightFetcher` class through your real browser, Scrapling's stealth mode, Playwright's Chrome browser, or [NSTbrowser](https://app.nstbrowser.io/r/1vO5e5)'s browserless!
+- **Anti-bot Protections Bypass**: Easily bypass protections with `StealthyFetcher` and `PlayWrightFetcher` classes.
 
 ### Adaptive Scraping
-- 🔄 **Smart Element Tracking**:
-- 🎯 **Flexible
-- 🔍 **Find Similar Elements**: Automatically locate elements similar to the element you
+- 🔄 **Smart Element Tracking**: Relocate elements after website changes, using an intelligent similarity system and integrated storage.
+- 🎯 **Flexible Selection**: CSS selectors, XPath selectors, filters-based search, text search, regex search and more.
+- 🔍 **Find Similar Elements**: Automatically locate elements similar to the element you found!
 - 🧠 **Smart Content Scraping**: Extract data from multiple websites without specific selectors using Scrapling powerful features.
 
-### Performance
-- 🚀 **Lightning Fast**: Built from the ground up with performance in mind, outperforming most popular Python scraping libraries
+### High Performance
+- 🚀 **Lightning Fast**: Built from the ground up with performance in mind, outperforming most popular Python scraping libraries.
 - 🔋 **Memory Efficient**: Optimized data structures for minimal memory footprint.
-- ⚡ **Fast JSON serialization**: 10x faster
+- ⚡ **Fast JSON serialization**: 10x faster than standard library.
 
-### 
-- 🛠️ **Powerful Navigation API**:
-- 🧬 **Rich Text Processing**: All strings have built-in
-- 📝 **
-- 🔌 **API Similar to Scrapy/BeautifulSoup
-- 📘 **Type hints
+### Developer Friendly
+- 🛠️ **Powerful Navigation API**: Easy DOM traversal in all directions.
+- 🧬 **Rich Text Processing**: All strings have built-in regex, cleaning methods, and more. All elements' attributes are optimized dictionaries that takes less memory than standard dictionaries with added methods.
+- 📝 **Auto Selectors Generation**: Generate robust short and full CSS/XPath selectors for any element.
+- 🔌 **Familiar API**: Similar to Scrapy/BeautifulSoup and the same pseudo-elements used in Scrapy.
+- 📘 **Type hints**: Complete type/doc-strings coverage for future-proofing and best autocompletion support.
 
 ## Getting Started
 
@@ -106,21 +121,22 @@ from scrapling import Fetcher
 
 fetcher = Fetcher(auto_match=False)
 
-#
+# Do http GET request to a web page and create an Adaptor instance
 page = fetcher.get('https://quotes.toscrape.com/', stealthy_headers=True)
-# Get all
+# Get all text content from all HTML tags in the page except `script` and `style` tags
 page.get_all_text(ignore_tags=('script', 'style'))
 
-# Get all quotes, any of these methods will return a list of strings (TextHandlers)
+# Get all quotes elements, any of these methods will return a list of strings directly (TextHandlers)
 quotes = page.css('.quote .text::text') # CSS selector
 quotes = page.xpath('//span[@class="text"]/text()') # XPath
 quotes = page.css('.quote').css('.text::text') # Chained selectors
 quotes = [element.text for element in page.css('.quote .text')] # Slower than bulk query above
 
 # Get the first quote element
-quote = page.css_first('.quote') #
+quote = page.css_first('.quote') # same as page.css('.quote').first or page.css('.quote')[0]
 
 # Tired of selectors? Use find_all/find
+# Get all 'div' HTML tags that one of its 'class' values is 'quote'
 quotes = page.find_all('div', {'class': 'quote'})
 # Same as
 quotes = page.find_all('div', class_='quote')
@@ -128,10 +144,10 @@ quotes = page.find_all(['div'], class_='quote')
 quotes = page.find_all(class_='quote') # and so on...
 
 # Working with elements
-quote.html_content # Inner HTML
-quote.prettify() # Prettified version of Inner HTML
-quote.attrib #
-quote.path # DOM path to element (List)
+quote.html_content # Get Inner HTML of this element
+quote.prettify() # Prettified version of Inner HTML above
+quote.attrib # Get that element's attributes
+quote.path # DOM path to element (List of all ancestors from <html> tag till the element itself)
 ```
 To keep it simple, all methods can be chained on top of each other!
 
@@ -167,52 +183,18 @@ Scrapling can find elements with more methods and it returns full element `Adapt
 > All benchmarks' results are an average of 100 runs. See our [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) for methodology and to run your comparisons.
 
 ## Installation
-Scrapling is a breeze to get started with
+Scrapling is a breeze to get started with; Starting from version 0.2.9, we require at least Python 3.9 to work.
 ```bash
 pip3 install scrapling
 ```
-
-<details><summary>Windows OS</summary>
-
+Then run this command to install browsers' dependencies needed to use Fetcher classes
 ```bash
-
-```
-</details>
-<details><summary>MacOS</summary>
-
-```bash
-python3 -m camoufox fetch --browserforge
-```
-</details>
-<details><summary>Linux</summary>
-
-```bash
-python -m camoufox fetch --browserforge
-```
-On a fresh installation of Linux, you may also need the following Firefox dependencies:
-- Debian-based distros
-```bash
-sudo apt install -y libgtk-3-0 libx11-xcb1 libasound2
-```
-- Arch-based distros
-```bash
-sudo pacman -S gtk3 libx11 libxcb cairo libasound alsa-lib
-```
-</details>
-
-<small> See the official <a href="https://camoufox.com/python/installation/#download-the-browser">Camoufox documentation</a> for more info on installation</small>
-
-- If you are going to use the `PlayWrightFetcher` options, then install Playwright's Chromium browser with:
-```commandline
-playwright install chromium
-```
-- If you are going to use normal requests only with the `Fetcher` class then update the fingerprints files with:
-```commandline
-python -m browserforge update
+scrapling install
 ```
+If you have any installation issues, please open an issue.
 
 ## Fetching Websites
-Fetchers are
+Fetchers are interfaces built on top of other libraries with added features that do requests or fetch pages for you in a single request fashion and then return an `Adaptor` object. This feature was introduced because the only option we had before was to fetch the page as you wanted it, then pass it manually to the `Adaptor` class to create an `Adaptor` instance and start playing around with the page.
 
 ### Features
 You might be slightly confused by now so let me clear things up. All fetcher-type classes are imported in the same way
@@ -281,7 +263,7 @@ True
 | humanize | Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window. | ✔️ |
 | allow_webgl | Enabled by default. Disabling it WebGL not recommended as many WAFs now checks if WebGL is enabled. | ✔️ |
 | geoip | Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, & spoof the WebRTC IP address. It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region. | ✔️ |
-| disable_ads |
+| disable_ads | Disabled by default, this installs `uBlock Origin` addon on the browser if enabled. | ✔️ |
 | network_idle | Wait for the page until there are no network connections for at least 500 ms. | ✔️ |
 | timeout | The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000. | ✔️ |
 | wait_selector | Wait for a specific css selector to be in a specific state. | ✔️ |
@@ -563,7 +545,7 @@ Inspired by BeautifulSoup's `find_all` function you can find elements by using `
 * Any string passed is considered a tag name
 * Any iterable passed like List/Tuple/Set is considered an iterable of tag names.
 * Any dictionary is considered a mapping of HTML element(s) attribute names and attribute values.
-* Any regex patterns passed are used as filters
+* Any regex patterns passed are used as filters to elements by their text content
 * Any functions passed are used as filters
 * Any keyword argument passed is considered as an HTML element attribute with its value.
 
@@ -572,7 +554,7 @@ So the way it works is after collecting all passed arguments and keywords, each
 
 1. All elements with the passed tag name(s).
 2. All elements that match all passed attribute(s).
-3. All elements that match all passed regex patterns.
+3. All elements that its text content match all passed regex patterns.
 4. All elements that fulfill all passed function(s).
 
 Note: The filtering process always starts from the first filter it finds in the filtering order above so if no tag name(s) are passed but attributes are passed, the process starts from that layer and so on. **But the order in which you pass the arguments doesn't matter.**
{scrapling-0.2.91 → scrapling-0.2.93}/scrapling/__init__.py
@@ -5,7 +5,7 @@ from scrapling.fetchers import (AsyncFetcher, CustomFetcher, Fetcher,
 from scrapling.parser import Adaptor, Adaptors
 
 __author__ = "Karim Shoair (karim.shoair@pm.me)"
-__version__ = "0.2.91"
+__version__ = "0.2.93"
 __copyright__ = "Copyright (c) 2024 Karim Shoair"
 
 
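A quick way to confirm which side of this diff is installed locally:

```python
# Prints "0.2.93" once this release is installed; "0.2.91" on the old side.
import scrapling
print(scrapling.__version__)
```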
scrapling-0.2.93/scrapling/cli.py (new file)
@@ -0,0 +1,37 @@
+import os
+import subprocess
+import sys
+from pathlib import Path
+
+import click
+
+
+def get_package_dir():
+    return Path(os.path.dirname(__file__))
+
+
+def run_command(command, line):
+    print(f"Installing {line}...")
+    _ = subprocess.check_call(command, shell=True)
+    # I meant to not use try except here
+
+
+@click.command(help="Install all Scrapling's Fetchers dependencies")
+def install():
+    if not get_package_dir().joinpath(".scrapling_dependencies_installed").exists():
+        run_command([sys.executable, "-m", "playwright", "install", 'chromium'], 'Playwright browsers')
+        run_command([sys.executable, "-m", "playwright", "install-deps", 'chromium', 'firefox'], 'Playwright dependencies')
+        run_command([sys.executable, "-m", "camoufox", "fetch", '--browserforge'], 'Camoufox browser and databases')
+        # if no errors raised by above commands, then we add below file
+        get_package_dir().joinpath(".scrapling_dependencies_installed").touch()
+    else:
+        print('The dependencies are already installed')
+
+
+@click.group()
+def main():
+    pass
+
+
+# Adding commands
+main.add_command(install)
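The new `cli.py` is wired up as a console script (note the added `scrapling.egg-info/entry_points.txt` in the file list), so `scrapling install` works from a shell. A sketch of invoking the same command in-process, using click's standard test runner; this assumes nothing beyond click itself, which is now a declared dependency per the PKG-INFO hunk.

```python
# In-process invocation of the new `scrapling install` command via click's
# CliRunner; equivalent to running `scrapling install` from a shell.
from click.testing import CliRunner
from scrapling.cli import main

result = CliRunner().invoke(main, ["install"])
print(result.exit_code, result.output)
```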
{scrapling-0.2.91 → scrapling-0.2.93}/scrapling/core/_types.py
@@ -3,7 +3,8 @@ Type definitions for type checking purposes.
 """
 
 from typing import (TYPE_CHECKING, Any, Callable, Dict, Generator, Iterable,
-                    List, Literal, Optional, Pattern, Tuple, Type, Union)
+                    List, Literal, Optional, Pattern, Tuple, Type, TypeVar,
+                    Union)
 
 SelectorWaitStates = Literal["attached", "detached", "hidden", "visible"]
 