scrapling 0.2.98__tar.gz → 0.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling-0.3/PKG-INFO +409 -0
- scrapling-0.3/README.md +322 -0
- scrapling-0.3/pyproject.toml +103 -0
- scrapling-0.3/scrapling/__init__.py +28 -0
- scrapling-0.3/scrapling/cli.py +836 -0
- scrapling-0.3/scrapling/core/_html_utils.py +348 -0
- scrapling-0.3/scrapling/core/_types.py +44 -0
- scrapling-0.3/scrapling/core/ai.py +611 -0
- scrapling-0.3/scrapling/core/custom_types.py +394 -0
- {scrapling-0.2.98 → scrapling-0.3}/scrapling/core/mixins.py +27 -19
- scrapling-0.3/scrapling/core/shell.py +647 -0
- scrapling-0.2.98/scrapling/core/storage_adaptors.py → scrapling-0.3/scrapling/core/storage.py +41 -33
- {scrapling-0.2.98 → scrapling-0.3}/scrapling/core/translator.py +20 -26
- scrapling-0.3/scrapling/core/utils.py +117 -0
- scrapling-0.3/scrapling/engines/__init__.py +16 -0
- scrapling-0.3/scrapling/engines/_browsers/__init__.py +2 -0
- scrapling-0.3/scrapling/engines/_browsers/_camoufox.py +745 -0
- scrapling-0.3/scrapling/engines/_browsers/_config_tools.py +130 -0
- scrapling-0.3/scrapling/engines/_browsers/_controllers.py +630 -0
- scrapling-0.3/scrapling/engines/_browsers/_page.py +93 -0
- scrapling-0.3/scrapling/engines/_browsers/_validators.py +150 -0
- scrapling-0.3/scrapling/engines/constants.py +121 -0
- scrapling-0.3/scrapling/engines/static.py +713 -0
- scrapling-0.3/scrapling/engines/toolbelt/__init__.py +20 -0
- {scrapling-0.2.98 → scrapling-0.3}/scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +2 -1
- scrapling-0.3/scrapling/engines/toolbelt/convertor.py +254 -0
- scrapling-0.3/scrapling/engines/toolbelt/custom.py +318 -0
- scrapling-0.3/scrapling/engines/toolbelt/fingerprints.py +67 -0
- scrapling-0.3/scrapling/engines/toolbelt/navigation.py +150 -0
- scrapling-0.3/scrapling/fetchers.py +427 -0
- scrapling-0.3/scrapling/parser.py +1412 -0
- scrapling-0.3/scrapling.egg-info/PKG-INFO +409 -0
- {scrapling-0.2.98 → scrapling-0.3}/scrapling.egg-info/SOURCES.txt +13 -21
- scrapling-0.3/scrapling.egg-info/requires.txt +22 -0
- {scrapling-0.2.98 → scrapling-0.3}/scrapling.egg-info/top_level.txt +0 -1
- {scrapling-0.2.98 → scrapling-0.3}/setup.cfg +2 -2
- scrapling-0.2.98/PKG-INFO +0 -867
- scrapling-0.2.98/README.md +0 -812
- scrapling-0.2.98/scrapling/__init__.py +0 -41
- scrapling-0.2.98/scrapling/cli.py +0 -38
- scrapling-0.2.98/scrapling/core/_types.py +0 -27
- scrapling-0.2.98/scrapling/core/custom_types.py +0 -311
- scrapling-0.2.98/scrapling/core/utils.py +0 -122
- scrapling-0.2.98/scrapling/defaults.py +0 -19
- scrapling-0.2.98/scrapling/engines/__init__.py +0 -7
- scrapling-0.2.98/scrapling/engines/camo.py +0 -299
- scrapling-0.2.98/scrapling/engines/constants.py +0 -108
- scrapling-0.2.98/scrapling/engines/pw.py +0 -428
- scrapling-0.2.98/scrapling/engines/static.py +0 -156
- scrapling-0.2.98/scrapling/engines/toolbelt/__init__.py +0 -6
- scrapling-0.2.98/scrapling/engines/toolbelt/bypasses/pdf_viewer.js +0 -5
- scrapling-0.2.98/scrapling/engines/toolbelt/custom.py +0 -299
- scrapling-0.2.98/scrapling/engines/toolbelt/fingerprints.py +0 -81
- scrapling-0.2.98/scrapling/engines/toolbelt/navigation.py +0 -121
- scrapling-0.2.98/scrapling/fetchers.py +0 -432
- scrapling-0.2.98/scrapling/parser.py +0 -1080
- scrapling-0.2.98/scrapling.egg-info/PKG-INFO +0 -867
- scrapling-0.2.98/scrapling.egg-info/requires.txt +0 -10
- scrapling-0.2.98/setup.py +0 -72
- scrapling-0.2.98/tests/__init__.py +0 -1
- scrapling-0.2.98/tests/fetchers/__init__.py +0 -1
- scrapling-0.2.98/tests/fetchers/async/__init__.py +0 -0
- scrapling-0.2.98/tests/fetchers/async/test_camoufox.py +0 -95
- scrapling-0.2.98/tests/fetchers/async/test_httpx.py +0 -83
- scrapling-0.2.98/tests/fetchers/async/test_playwright.py +0 -99
- scrapling-0.2.98/tests/fetchers/sync/__init__.py +0 -0
- scrapling-0.2.98/tests/fetchers/sync/test_camoufox.py +0 -68
- scrapling-0.2.98/tests/fetchers/sync/test_httpx.py +0 -82
- scrapling-0.2.98/tests/fetchers/sync/test_playwright.py +0 -87
- scrapling-0.2.98/tests/fetchers/test_utils.py +0 -97
- scrapling-0.2.98/tests/parser/__init__.py +0 -0
- scrapling-0.2.98/tests/parser/test_automatch.py +0 -111
- scrapling-0.2.98/tests/parser/test_general.py +0 -330
- {scrapling-0.2.98 → scrapling-0.3}/LICENSE +0 -0
- {scrapling-0.2.98 → scrapling-0.3}/MANIFEST.in +0 -0
- {scrapling-0.2.98 → scrapling-0.3}/scrapling/core/__init__.py +0 -0
- {scrapling-0.2.98 → scrapling-0.3}/scrapling/engines/toolbelt/bypasses/navigator_plugins.js +0 -0
- {scrapling-0.2.98 → scrapling-0.3}/scrapling/engines/toolbelt/bypasses/notification_permission.js +0 -0
- {scrapling-0.2.98 → scrapling-0.3}/scrapling/engines/toolbelt/bypasses/screen_props.js +0 -0
- {scrapling-0.2.98 → scrapling-0.3}/scrapling/engines/toolbelt/bypasses/webdriver_fully.js +0 -0
- {scrapling-0.2.98 → scrapling-0.3}/scrapling/engines/toolbelt/bypasses/window_chrome.js +0 -0
- {scrapling-0.2.98 → scrapling-0.3}/scrapling/py.typed +0 -0
- {scrapling-0.2.98 → scrapling-0.3}/scrapling.egg-info/dependency_links.txt +0 -0
- {scrapling-0.2.98 → scrapling-0.3}/scrapling.egg-info/entry_points.txt +0 -0
- {scrapling-0.2.98 → scrapling-0.3}/scrapling.egg-info/not-zip-safe +0 -0
scrapling-0.3/PKG-INFO
ADDED
@@ -0,0 +1,409 @@
Metadata-Version: 2.4
Name: scrapling
Version: 0.3
Summary: Scrapling is an undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy and effortless as it should be!
Home-page: https://github.com/D4Vinci/Scrapling
Author: Karim Shoair
Author-email: Karim Shoair <karim.shoair@pm.me>
Maintainer-email: Karim Shoair <karim.shoair@pm.me>
License: BSD 3-Clause License

Copyright (c) 2024, Karim shoair

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this
   list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its
   contributors may be used to endorse or promote products derived from
   this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

Project-URL: Homepage, https://github.com/D4Vinci/Scrapling
Project-URL: Documentation, https://scrapling.readthedocs.io/en/latest/
Project-URL: Repository, https://github.com/D4Vinci/Scrapling
Project-URL: Bug Tracker, https://github.com/D4Vinci/Scrapling/issues
Keywords: web-scraping,scraping,automation,browser-automation,data-extraction,html-parsing,undetectable,playwright,selenium-alternative,web-crawler,browser,crawling
Classifier: Operating System :: OS Independent
Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Developers
Classifier: Intended Audience :: Information Technology
Classifier: License :: OSI Approved :: BSD License
Classifier: Natural Language :: English
Classifier: Topic :: Internet :: WWW/HTTP
Classifier: Topic :: Internet :: WWW/HTTP :: Browsers
Classifier: Topic :: Text Processing :: Markup
Classifier: Topic :: Text Processing :: Markup :: HTML
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Classifier: Topic :: Software Development :: Libraries
Classifier: Topic :: Software Development :: Libraries :: Python Modules
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3 :: Only
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Programming Language :: Python :: 3.13
Classifier: Programming Language :: Python :: Implementation :: CPython
Classifier: Typing :: Typed
Requires-Python: >=3.10
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: lxml>=6.0.0
Requires-Dist: cssselect>=1.3.0
Requires-Dist: click>=8.2.1
Requires-Dist: orjson>=3.11.2
Requires-Dist: tldextract>=5.3.0
Requires-Dist: curl_cffi>=0.13.0
Requires-Dist: playwright>=1.52.0
Requires-Dist: rebrowser-playwright>=1.52.0
Requires-Dist: camoufox>=0.4.11
Requires-Dist: geoip2>=5.1.0
Requires-Dist: msgspec>=0.19.0
Provides-Extra: ai
Requires-Dist: mcp>=1.13.0; extra == "ai"
Requires-Dist: markdownify>=1.2.0; extra == "ai"
Provides-Extra: shell
Requires-Dist: IPython>=8.37; extra == "shell"
Requires-Dist: markdownify>=1.2.0; extra == "shell"
Provides-Extra: all
Requires-Dist: scrapling[ai,shell]; extra == "all"
Dynamic: license-file

<p align=center>
<br>
<a href="https://scrapling.readthedocs.io/en/latest/" target="_blank"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/poster.png" style="width: 50%; height: 100%;"/></a>
<br>
<i><code>Easy, effortless Web Scraping as it should be!</code></i>
</p>
<p align="center">
<a href="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml" alt="Tests">
<img alt="Tests" src="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg"></a>
<a href="https://badge.fury.io/py/Scrapling" alt="PyPI version">
<img alt="PyPI version" src="https://badge.fury.io/py/Scrapling.svg"></a>
<a href="https://pepy.tech/project/scrapling" alt="PyPI Downloads">
<img alt="PyPI Downloads" src="https://static.pepy.tech/personalized-badge/scrapling?period=total&units=INTERNATIONAL_SYSTEM&left_color=GRAY&right_color=GREEN&left_text=Downloads"></a>
<br/>
<a href="https://discord.gg/EMgGbDceNQ" alt="Discord" target="_blank">
<img alt="Discord" src="https://img.shields.io/discord/1360786381042880532?style=social&logo=discord&link=https%3A%2F%2Fdiscord.gg%2FEMgGbDceNQ">
</a>
<a href="https://x.com/Scrapling_dev" alt="X (formerly Twitter)">
<img alt="X (formerly Twitter) Follow" src="https://img.shields.io/twitter/follow/Scrapling_dev?style=social&logo=x&link=https%3A%2F%2Fx.com%2FScrapling_dev">
</a>
<br/>
<a href="https://pypi.org/project/scrapling/" alt="Supported Python versions">
<img alt="Supported Python versions" src="https://img.shields.io/pypi/pyversions/scrapling.svg"></a>
</p>

<p align="center">
<a href="https://scrapling.readthedocs.io/en/latest/#installation">
Installation
</a>
·
<a href="https://scrapling.readthedocs.io/en/latest/overview/">
Overview
</a>
·
<a href="https://scrapling.readthedocs.io/en/latest/parsing/selection/">
Selection methods
</a>
·
<a href="https://scrapling.readthedocs.io/en/latest/fetching/choosing/">
Choosing a fetcher
</a>
·
<a href="https://scrapling.readthedocs.io/en/latest/tutorials/migrating_from_beautifulsoup/">
Migrating from Beautifulsoup
</a>
</p>

**Stop fighting anti-bot systems. Stop rewriting selectors after every website update.**

Scrapling isn't just another Web Scraping library. It's the first **adaptive** scraping library that learns from website changes and evolves with them. While other libraries break when websites update their structure, Scrapling automatically relocates your elements and keeps your scrapers running.

Built for the modern Web, Scrapling has its own rapid parsing engine and its own fetchers to handle every Web Scraping challenge you face or will face. Built by Web Scrapers for Web Scrapers and regular users alike, there's something for everyone.

```python
>> from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher
>> StealthyFetcher.adaptive = True
# Fetch websites' source under the radar!
>> page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True)
>> print(page.status)
200
>> products = page.css('.product', auto_save=True)  # Scrape data that survives website design changes!
>> # Later, if the website structure changes, pass `adaptive=True`
>> products = page.css('.product', adaptive=True)  # and Scrapling still finds them!
```

# Sponsors

<!-- sponsors -->

<a href="https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling" target="_blank" title="Evomi is your Swiss Quality Proxy Provider, starting at $0.49/GB"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/evomi.png"></a>
<a href="https://www.swiftproxy.net/" target="_blank" title="Unlock Reliable Proxy Services with Swiftproxy!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png"></a>
<a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
<a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>

<!-- /sponsors -->

<i><sub>Do you want to show your ad here? Click [here](https://github.com/sponsors/D4Vinci) and choose the tier that suits you!</sub></i>

---

## Key Features

### Advanced Websites Fetching with Session Support
- **HTTP Requests**: Fast and stealthy HTTP requests with the `Fetcher` class. It can impersonate browsers' TLS fingerprints and headers, and it can use HTTP3.
- **Dynamic Loading**: Fetch dynamic websites with full browser automation through the `DynamicFetcher` class, supporting Playwright's Chromium, real Chrome, and a custom stealth mode.
- **Anti-bot Bypass**: Advanced stealth capabilities with `StealthyFetcher`, using a modified version of Firefox and fingerprint spoofing. It can easily bypass all levels of Cloudflare's Turnstile with automation.
- **Session Management**: Persistent session support with the `FetcherSession`, `StealthySession`, and `DynamicSession` classes for cookie and state management across requests.
- **Async Support**: Complete async support across all fetchers, plus dedicated async session classes (see the sketch after this list).

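For example, here is a minimal sketch of a one-off async request with `AsyncFetcher`, which the first snippet on this page imports but never demonstrates; treating `AsyncFetcher.get` as an awaitable mirror of `Fetcher.get` is an assumption based on the async support described above:

```python
import asyncio

from scrapling.fetchers import AsyncFetcher


async def main():
    # Assumption: `AsyncFetcher.get` mirrors `Fetcher.get` but is awaitable
    page = await AsyncFetcher.get('https://quotes.toscrape.com/')
    print(page.css('.quote .text::text'))


asyncio.run(main())
```
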
### Adaptive Scraping & AI Integration
- 🔄 **Smart Element Tracking**: Relocate elements after website changes using intelligent similarity algorithms (see the sketch after this list).
- 🎯 **Smart Flexible Selection**: CSS selectors, XPath selectors, filter-based search, text search, regex search, and more.
- 🔍 **Find Similar Elements**: Automatically locate elements similar to ones you have already found.
- 🤖 **MCP Server to be used with AI**: Built-in MCP server for AI-assisted Web Scraping and data extraction. The MCP server features custom, powerful capabilities that utilize Scrapling to extract targeted content before passing it to the AI (Claude/Cursor/etc.), thereby speeding up operations and reducing costs by minimizing token usage.

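Here is a minimal sketch of that adaptive workflow, built only from calls shown elsewhere on this page (`auto_save`, `adaptive`, and `find_similar`); enabling `adaptive` on `Fetcher` the same way the first snippet enables it on `StealthyFetcher` is an assumption:

```python
from scrapling.fetchers import Fetcher

Fetcher.adaptive = True  # Assumption: mirrors `StealthyFetcher.adaptive = True` above

page = Fetcher.get('https://example.com')
# First run: save a fingerprint of the matched elements
products = page.css('.product', auto_save=True)
# After the site's structure changes, relocate them by similarity
products = page.css('.product', adaptive=True)
# Optionally collect the elements that resemble the first match
if products:
    similar = products[0].find_similar()
```
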
### High-Performance & Battle-Tested Architecture
- 🚀 **Lightning Fast**: Optimized performance outperforming most Python scraping libraries.
- 🔋 **Memory Efficient**: Optimized data structures and lazy loading for a minimal memory footprint.
- ⚡ **Fast JSON Serialization**: 10x faster than the standard library.
- 🏗️ **Battle-Tested**: Not only does Scrapling have 92% test coverage and full type-hint coverage, but it has also been used daily by hundreds of Web Scrapers over the past year.

### Developer/Web Scraper Friendly Experience
- 🎯 **Interactive Web Scraping Shell**: Optional built-in IPython shell with Scrapling integration, shortcuts, and new tools that speed up the development of Web Scraping scripts, like converting curl requests to Scrapling requests and viewing request results in your browser.
- 🚀 **Use it directly from the Terminal**: Optionally, you can use Scrapling to scrape a URL without writing a single line of code!
- 🛠️ **Rich Navigation API**: Advanced DOM traversal with parent, sibling, and child navigation methods.
- 🧬 **Enhanced Text Processing**: Built-in regex, cleaning methods, and optimized string operations (see the sketch after this list).
- 📝 **Auto Selector Generation**: Generate robust CSS/XPath selectors for any element.
- 🔌 **Familiar API**: Similar to Scrapy/BeautifulSoup, with the same pseudo-elements used in Scrapy/Parsel.
- 📘 **Complete Type Coverage**: Full type hints for excellent IDE support and code completion.

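A small sketch of the text-processing helpers described above, using the standalone parser shown later on this page; the exact helper names (`clean`, `re_first`) are assumptions based on Scrapling's parsel-style API:

```python
from scrapling.parser import Selector

page = Selector('<div><p class="price">  $ 19.99  </p></div>')
text = page.css_first('.price::text')

# Assumed helpers on the returned text object, per the
# "Enhanced Text Processing" bullet above:
print(text.clean())              # whitespace-normalized text
print(text.re_first(r'[\d.]+'))  # first regex match, e.g., '19.99'
```
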
### New Session Architecture
Scrapling 0.3 introduces a completely revamped session system:
- **Persistent Sessions**: Maintain cookies, headers, and authentication across multiple requests
- **Automatic Session Management**: Smart session lifecycle handling with proper cleanup
- **Session Inheritance**: All fetchers support both one-off requests and persistent session usage
- **Concurrent Session Support**: Run multiple isolated sessions simultaneously

## Getting Started

### Basic Usage
```python
from scrapling.fetchers import Fetcher, StealthyFetcher, DynamicFetcher
from scrapling.fetchers import FetcherSession, StealthySession, DynamicSession

# HTTP requests with session support
with FetcherSession(impersonate='chrome') as session:  # Use the latest version of Chrome's TLS fingerprint
    page = session.get('https://quotes.toscrape.com/', stealthy_headers=True)
    quotes = page.css('.quote .text::text')

# Or use one-off requests
page = Fetcher.get('https://quotes.toscrape.com/')
quotes = page.css('.quote .text::text')

# Advanced stealth mode (keeps the browser open until you finish)
with StealthySession(headless=True, solve_cloudflare=True) as session:
    page = session.fetch('https://nopecha.com/demo/cloudflare')
    data = page.css('#padded_content a')

# Or use the one-off request style; it opens the browser for this request, then closes it after finishing
page = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare')
data = page.css('#padded_content a')

# Full browser automation (keeps the browser open until you finish)
with DynamicSession(headless=True, disable_resources=False, network_idle=True) as session:
    page = session.fetch('https://quotes.toscrape.com/')
    data = page.xpath('//span[@class="text"]/text()')  # An XPath selector, if you prefer it

# Or use the one-off request style; it opens the browser for this request, then closes it after finishing
page = DynamicFetcher.fetch('https://quotes.toscrape.com/')
data = page.css('.quote .text::text')
```

### Advanced Parsing & Navigation
```python
from scrapling.fetchers import Fetcher

# Rich element selection and navigation
page = Fetcher.get('https://quotes.toscrape.com/')

# Get quotes with multiple selection methods
quotes = page.css('.quote')  # CSS selector
quotes = page.xpath('//div[@class="quote"]')  # XPath
quotes = page.find_all('div', {'class': 'quote'})  # BeautifulSoup-style
# Same as
quotes = page.find_all('div', class_='quote')
quotes = page.find_all(['div'], class_='quote')
quotes = page.find_all(class_='quote')  # and so on...
# Find elements by text content
quotes = page.find_by_text('quote', tag='div')

# Advanced navigation
first_quote = page.css_first('.quote')
quote_text = first_quote.css('.text::text')
quote_text = page.css('.quote').css_first('.text::text')  # Chained selectors
quote_text = page.css_first('.quote .text').text  # `css_first` is faster than `css` when you only want the first element
author = first_quote.next_sibling.css('.author::text')
parent_container = first_quote.parent

# Element relationships and similarity
similar_elements = first_quote.find_similar()
below_elements = first_quote.below_elements()
```
If you don't want to fetch websites, you can use the parser directly, like below:
```python
from scrapling.parser import Selector

page = Selector("<html>...</html>")
```
And it works exactly the same!

### Async Session Management Examples
```python
import asyncio

from scrapling.fetchers import FetcherSession, AsyncStealthySession, AsyncDynamicSession

async with FetcherSession(http3=True) as session:  # `FetcherSession` is context-aware and can work in both sync/async patterns
    page1 = session.get('https://quotes.toscrape.com/')
    page2 = session.get('https://quotes.toscrape.com/', impersonate='firefox135')

# Async session usage
async with AsyncStealthySession(max_pages=2) as session:
    tasks = []
    urls = ['https://example.com/page1', 'https://example.com/page2']

    for url in urls:
        task = session.fetch(url)
        tasks.append(task)

    print(session.get_pool_stats())  # Optional - the status of the browser tabs pool (busy/free/error)
    results = await asyncio.gather(*tasks)
    print(session.get_pool_stats())
```

## CLI & Interactive Shell

Scrapling v0.3 includes a powerful command-line interface:

```bash
# Launch the interactive Web Scraping shell
scrapling shell

# Extract pages to a file directly, without any programming (extracts the content inside the `body` tag by default)
# If the output file ends with `.txt`, the text content of the target is extracted.
# If it ends with `.md`, you get a Markdown representation of the HTML content; `.html` saves the HTML content as-is.
scrapling extract get 'https://example.com' content.md
scrapling extract get 'https://example.com' content.txt --css-selector '#fromSkipToProducts' --impersonate 'chrome'  # All elements matching the CSS selector '#fromSkipToProducts'
scrapling extract fetch 'https://example.com' content.md --css-selector '#fromSkipToProducts' --no-headless
scrapling extract stealthy-fetch 'https://nopecha.com/demo/cloudflare' captchas.html --css-selector '#padded_content a' --solve-cloudflare
```

> [!NOTE]
> There are many additional features, like the MCP server and the interactive Web Scraping shell, but we want to keep this page short. Check out the full documentation [here](https://scrapling.readthedocs.io/en/latest/).

## Performance Benchmarks

Scrapling isn't just powerful; it's also blazing fast, and version 0.3 delivers exceptional performance improvements across all operations!

### Text Extraction Speed Test (5000 nested elements)

| # | Library | Time (ms) | vs Scrapling |
|---|:-----------------:|:---------:|:------------:|
| 1 | Scrapling | 1.88 | 1.0x |
| 2 | Parsel/Scrapy | 1.96 | 1.043x |
| 3 | Raw Lxml | 2.32 | 1.234x |
| 4 | PyQuery | 20.2 | ~11x |
| 5 | Selectolax | 85.2 | ~45x |
| 6 | MechanicalSoup | 1305.84 | ~695x |
| 7 | BS4 with Lxml | 1307.92 | ~696x |
| 8 | BS4 with html5lib | 3336.28 | ~1775x |

### Element Similarity & Text Search Performance

Scrapling's adaptive element-finding capabilities significantly outperform alternatives:

| Library | Time (ms) | vs Scrapling |
|-------------|:---------:|:------------:|
| Scrapling | 2.02 | 1.0x |
| AutoScraper | 10.26 | 5.08x |

> All benchmarks represent averages of 100+ runs. See [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) for methodology.

## Installation

Scrapling requires Python 3.10 or higher:

```bash
pip install scrapling
```

#### Fetchers Setup

If you are going to use any of the fetchers or their classes, install the browser dependencies with
```bash
scrapling install
```

This downloads all browsers along with their system dependencies and fingerprint-manipulation dependencies.

### Optional Dependencies

- Install the MCP server feature:
```bash
pip install "scrapling[ai]"
```
- Install the shell features (the Web Scraping shell and the `extract` command):
```bash
pip install "scrapling[shell]"
```
- Install everything:
```bash
pip install "scrapling[all]"
```

## Contributing

We welcome contributions! Please read our [contributing guidelines](https://github.com/D4Vinci/Scrapling/blob/main/CONTRIBUTING.md) before getting started.

## Disclaimer

> [!CAUTION]
> This library is provided for educational and research purposes only. By using this library, you agree to comply with local and international data scraping and privacy laws. The authors and contributors are not responsible for any misuse of this software. Always respect website terms of service and robots.txt files.

## License

This work is licensed under the BSD-3-Clause License.

## Acknowledgments

This project includes code adapted from:
- Parsel (BSD License): used for the [translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/core/translator.py) submodule

## Thanks and References

- [Daijro](https://github.com/daijro)'s brilliant work on [BrowserForge](https://github.com/daijro/browserforge) and [Camoufox](https://github.com/daijro/camoufox)
- [Vinyzu](https://github.com/Vinyzu)'s work on [Botright](https://github.com/Vinyzu/Botright)
- [brotector](https://github.com/kaliiiiiiiiii/brotector) for browser detection bypass techniques
- [fakebrowser](https://github.com/kkoooqq/fakebrowser) for fingerprinting research
- [rebrowser-patches](https://github.com/rebrowser/rebrowser-patches) for stealth improvements

---
<div align="center"><small>Designed & crafted with ❤️ by Karim Shoair.</small></div><br>