scrapling 0.2.98__tar.gz → 0.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. scrapling-0.3/PKG-INFO +409 -0
  2. scrapling-0.3/README.md +322 -0
  3. scrapling-0.3/pyproject.toml +103 -0
  4. scrapling-0.3/scrapling/__init__.py +28 -0
  5. scrapling-0.3/scrapling/cli.py +836 -0
  6. scrapling-0.3/scrapling/core/_html_utils.py +348 -0
  7. scrapling-0.3/scrapling/core/_types.py +44 -0
  8. scrapling-0.3/scrapling/core/ai.py +611 -0
  9. scrapling-0.3/scrapling/core/custom_types.py +394 -0
  10. {scrapling-0.2.98 → scrapling-0.3}/scrapling/core/mixins.py +27 -19
  11. scrapling-0.3/scrapling/core/shell.py +647 -0
  12. scrapling-0.2.98/scrapling/core/storage_adaptors.py → scrapling-0.3/scrapling/core/storage.py +41 -33
  13. {scrapling-0.2.98 → scrapling-0.3}/scrapling/core/translator.py +20 -26
  14. scrapling-0.3/scrapling/core/utils.py +117 -0
  15. scrapling-0.3/scrapling/engines/__init__.py +16 -0
  16. scrapling-0.3/scrapling/engines/_browsers/__init__.py +2 -0
  17. scrapling-0.3/scrapling/engines/_browsers/_camoufox.py +745 -0
  18. scrapling-0.3/scrapling/engines/_browsers/_config_tools.py +130 -0
  19. scrapling-0.3/scrapling/engines/_browsers/_controllers.py +630 -0
  20. scrapling-0.3/scrapling/engines/_browsers/_page.py +93 -0
  21. scrapling-0.3/scrapling/engines/_browsers/_validators.py +150 -0
  22. scrapling-0.3/scrapling/engines/constants.py +121 -0
  23. scrapling-0.3/scrapling/engines/static.py +713 -0
  24. scrapling-0.3/scrapling/engines/toolbelt/__init__.py +20 -0
  25. {scrapling-0.2.98 → scrapling-0.3}/scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +2 -1
  26. scrapling-0.3/scrapling/engines/toolbelt/convertor.py +254 -0
  27. scrapling-0.3/scrapling/engines/toolbelt/custom.py +318 -0
  28. scrapling-0.3/scrapling/engines/toolbelt/fingerprints.py +67 -0
  29. scrapling-0.3/scrapling/engines/toolbelt/navigation.py +150 -0
  30. scrapling-0.3/scrapling/fetchers.py +427 -0
  31. scrapling-0.3/scrapling/parser.py +1412 -0
  32. scrapling-0.3/scrapling.egg-info/PKG-INFO +409 -0
  33. {scrapling-0.2.98 → scrapling-0.3}/scrapling.egg-info/SOURCES.txt +13 -21
  34. scrapling-0.3/scrapling.egg-info/requires.txt +22 -0
  35. {scrapling-0.2.98 → scrapling-0.3}/scrapling.egg-info/top_level.txt +0 -1
  36. {scrapling-0.2.98 → scrapling-0.3}/setup.cfg +2 -2
  37. scrapling-0.2.98/PKG-INFO +0 -867
  38. scrapling-0.2.98/README.md +0 -812
  39. scrapling-0.2.98/scrapling/__init__.py +0 -41
  40. scrapling-0.2.98/scrapling/cli.py +0 -38
  41. scrapling-0.2.98/scrapling/core/_types.py +0 -27
  42. scrapling-0.2.98/scrapling/core/custom_types.py +0 -311
  43. scrapling-0.2.98/scrapling/core/utils.py +0 -122
  44. scrapling-0.2.98/scrapling/defaults.py +0 -19
  45. scrapling-0.2.98/scrapling/engines/__init__.py +0 -7
  46. scrapling-0.2.98/scrapling/engines/camo.py +0 -299
  47. scrapling-0.2.98/scrapling/engines/constants.py +0 -108
  48. scrapling-0.2.98/scrapling/engines/pw.py +0 -428
  49. scrapling-0.2.98/scrapling/engines/static.py +0 -156
  50. scrapling-0.2.98/scrapling/engines/toolbelt/__init__.py +0 -6
  51. scrapling-0.2.98/scrapling/engines/toolbelt/bypasses/pdf_viewer.js +0 -5
  52. scrapling-0.2.98/scrapling/engines/toolbelt/custom.py +0 -299
  53. scrapling-0.2.98/scrapling/engines/toolbelt/fingerprints.py +0 -81
  54. scrapling-0.2.98/scrapling/engines/toolbelt/navigation.py +0 -121
  55. scrapling-0.2.98/scrapling/fetchers.py +0 -432
  56. scrapling-0.2.98/scrapling/parser.py +0 -1080
  57. scrapling-0.2.98/scrapling.egg-info/PKG-INFO +0 -867
  58. scrapling-0.2.98/scrapling.egg-info/requires.txt +0 -10
  59. scrapling-0.2.98/setup.py +0 -72
  60. scrapling-0.2.98/tests/__init__.py +0 -1
  61. scrapling-0.2.98/tests/fetchers/__init__.py +0 -1
  62. scrapling-0.2.98/tests/fetchers/async/__init__.py +0 -0
  63. scrapling-0.2.98/tests/fetchers/async/test_camoufox.py +0 -95
  64. scrapling-0.2.98/tests/fetchers/async/test_httpx.py +0 -83
  65. scrapling-0.2.98/tests/fetchers/async/test_playwright.py +0 -99
  66. scrapling-0.2.98/tests/fetchers/sync/__init__.py +0 -0
  67. scrapling-0.2.98/tests/fetchers/sync/test_camoufox.py +0 -68
  68. scrapling-0.2.98/tests/fetchers/sync/test_httpx.py +0 -82
  69. scrapling-0.2.98/tests/fetchers/sync/test_playwright.py +0 -87
  70. scrapling-0.2.98/tests/fetchers/test_utils.py +0 -97
  71. scrapling-0.2.98/tests/parser/__init__.py +0 -0
  72. scrapling-0.2.98/tests/parser/test_automatch.py +0 -111
  73. scrapling-0.2.98/tests/parser/test_general.py +0 -330
  74. {scrapling-0.2.98 → scrapling-0.3}/LICENSE +0 -0
  75. {scrapling-0.2.98 → scrapling-0.3}/MANIFEST.in +0 -0
  76. {scrapling-0.2.98 → scrapling-0.3}/scrapling/core/__init__.py +0 -0
  77. {scrapling-0.2.98 → scrapling-0.3}/scrapling/engines/toolbelt/bypasses/navigator_plugins.js +0 -0
  78. {scrapling-0.2.98 → scrapling-0.3}/scrapling/engines/toolbelt/bypasses/notification_permission.js +0 -0
  79. {scrapling-0.2.98 → scrapling-0.3}/scrapling/engines/toolbelt/bypasses/screen_props.js +0 -0
  80. {scrapling-0.2.98 → scrapling-0.3}/scrapling/engines/toolbelt/bypasses/webdriver_fully.js +0 -0
  81. {scrapling-0.2.98 → scrapling-0.3}/scrapling/engines/toolbelt/bypasses/window_chrome.js +0 -0
  82. {scrapling-0.2.98 → scrapling-0.3}/scrapling/py.typed +0 -0
  83. {scrapling-0.2.98 → scrapling-0.3}/scrapling.egg-info/dependency_links.txt +0 -0
  84. {scrapling-0.2.98 → scrapling-0.3}/scrapling.egg-info/entry_points.txt +0 -0
  85. {scrapling-0.2.98 → scrapling-0.3}/scrapling.egg-info/not-zip-safe +0 -0
scrapling-0.3/PKG-INFO ADDED
@@ -0,0 +1,409 @@
+ Metadata-Version: 2.4
+ Name: scrapling
+ Version: 0.3
+ Summary: Scrapling is an undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy and effortless as it should be!
+ Home-page: https://github.com/D4Vinci/Scrapling
+ Author: Karim Shoair
+ Author-email: Karim Shoair <karim.shoair@pm.me>
+ Maintainer-email: Karim Shoair <karim.shoair@pm.me>
+ License: BSD 3-Clause License
+
+ Copyright (c) 2024, Karim shoair
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+ 3. Neither the name of the copyright holder nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ Project-URL: Homepage, https://github.com/D4Vinci/Scrapling
+ Project-URL: Documentation, https://scrapling.readthedocs.io/en/latest/
+ Project-URL: Repository, https://github.com/D4Vinci/Scrapling
+ Project-URL: Bug Tracker, https://github.com/D4Vinci/Scrapling/issues
+ Keywords: web-scraping,scraping,automation,browser-automation,data-extraction,html-parsing,undetectable,playwright,selenium-alternative,web-crawler,browser,crawling
+ Classifier: Operating System :: OS Independent
+ Classifier: Development Status :: 4 - Beta
+ Classifier: Intended Audience :: Developers
+ Classifier: Intended Audience :: Information Technology
+ Classifier: License :: OSI Approved :: BSD License
+ Classifier: Natural Language :: English
+ Classifier: Topic :: Internet :: WWW/HTTP
+ Classifier: Topic :: Internet :: WWW/HTTP :: Browsers
+ Classifier: Topic :: Text Processing :: Markup
+ Classifier: Topic :: Text Processing :: Markup :: HTML
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+ Classifier: Topic :: Software Development :: Libraries
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3 :: Only
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
+ Classifier: Programming Language :: Python :: Implementation :: CPython
+ Classifier: Typing :: Typed
+ Requires-Python: >=3.10
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: lxml>=6.0.0
+ Requires-Dist: cssselect>=1.3.0
+ Requires-Dist: click>=8.2.1
+ Requires-Dist: orjson>=3.11.2
+ Requires-Dist: tldextract>=5.3.0
+ Requires-Dist: curl_cffi>=0.13.0
+ Requires-Dist: playwright>=1.52.0
+ Requires-Dist: rebrowser-playwright>=1.52.0
+ Requires-Dist: camoufox>=0.4.11
+ Requires-Dist: geoip2>=5.1.0
+ Requires-Dist: msgspec>=0.19.0
+ Provides-Extra: ai
+ Requires-Dist: mcp>=1.13.0; extra == "ai"
+ Requires-Dist: markdownify>=1.2.0; extra == "ai"
+ Provides-Extra: shell
+ Requires-Dist: IPython>=8.37; extra == "shell"
+ Requires-Dist: markdownify>=1.2.0; extra == "shell"
+ Provides-Extra: all
+ Requires-Dist: scrapling[ai,shell]; extra == "all"
+ Dynamic: license-file
+
+ <p align=center>
+ <br>
+ <a href="https://scrapling.readthedocs.io/en/latest/" target="_blank"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/poster.png" style="width: 50%; height: 100%;"/></a>
+ <br>
+ <i><code>Easy, effortless Web Scraping as it should be!</code></i>
+ </p>
+ <p align="center">
+ <a href="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml" alt="Tests">
+ <img alt="Tests" src="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg"></a>
+ <a href="https://badge.fury.io/py/Scrapling" alt="PyPI version">
+ <img alt="PyPI version" src="https://badge.fury.io/py/Scrapling.svg"></a>
+ <a href="https://pepy.tech/project/scrapling" alt="PyPI Downloads">
+ <img alt="PyPI Downloads" src="https://static.pepy.tech/personalized-badge/scrapling?period=total&units=INTERNATIONAL_SYSTEM&left_color=GRAY&right_color=GREEN&left_text=Downloads"></a>
+ <br/>
+ <a href="https://discord.gg/EMgGbDceNQ" alt="Discord" target="_blank">
+ <img alt="Discord" src="https://img.shields.io/discord/1360786381042880532?style=social&logo=discord&link=https%3A%2F%2Fdiscord.gg%2FEMgGbDceNQ">
+ </a>
+ <a href="https://x.com/Scrapling_dev" alt="X (formerly Twitter)">
+ <img alt="X (formerly Twitter) Follow" src="https://img.shields.io/twitter/follow/Scrapling_dev?style=social&logo=x&link=https%3A%2F%2Fx.com%2FScrapling_dev">
+ </a>
+ <br/>
+ <a href="https://pypi.org/project/scrapling/" alt="Supported Python versions">
+ <img alt="Supported Python versions" src="https://img.shields.io/pypi/pyversions/scrapling.svg"></a>
+ </p>
+
+ <p align="center">
+ <a href="https://scrapling.readthedocs.io/en/latest/#installation">
+ Installation
+ </a>
+ ·
+ <a href="https://scrapling.readthedocs.io/en/latest/overview/">
+ Overview
+ </a>
+ ·
+ <a href="https://scrapling.readthedocs.io/en/latest/parsing/selection/">
+ Selection methods
+ </a>
+ ·
+ <a href="https://scrapling.readthedocs.io/en/latest/fetching/choosing/">
+ Choosing a fetcher
+ </a>
+ ·
+ <a href="https://scrapling.readthedocs.io/en/latest/tutorials/migrating_from_beautifulsoup/">
+ Migrating from Beautifulsoup
+ </a>
+ </p>
+
+ **Stop fighting anti-bot systems. Stop rewriting selectors after every website update.**
+
+ Scrapling isn't just another Web Scraping library. It's the first **adaptive** scraping library that learns from website changes and evolves with them. While other libraries break when websites update their structure, Scrapling automatically relocates your elements and keeps your scrapers running.
+
+ Built for the modern Web, Scrapling has its own rapid parsing engine and its own fetchers to handle all the Web Scraping challenges you are facing or will face. Built by Web Scrapers for Web Scrapers and regular users alike, there's something for everyone.
+
+ ```python
+ >> from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher
+ >> StealthyFetcher.adaptive = True
+ # Fetch websites' source under the radar!
+ >> page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True)
+ >> print(page.status)
+ 200
+ >> products = page.css('.product', auto_save=True) # Scrape data that survives website design changes!
+ >> # Later, if the website structure changes, pass `adaptive=True`
+ >> products = page.css('.product', adaptive=True) # and Scrapling still finds them!
+ ```
+
+ # Sponsors
+
+ <!-- sponsors -->
+
+ <a href="https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling" target="_blank" title="Evomi is your Swiss Quality Proxy Provider, starting at $0.49/GB"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/evomi.png"></a>
+ <a href="https://www.swiftproxy.net/" target="_blank" title="Unlock Reliable Proxy Services with Swiftproxy!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png"></a>
+ <a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
+ <a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
+
+ <!-- /sponsors -->
+
+ <i><sub>Do you want to show your ad here? Click [here](https://github.com/sponsors/D4Vinci) and choose the tier that suits you!</sub></i>
+
+ ---
+
+ ## Key Features
+
+ ### Advanced Websites Fetching with Session Support
+ - **HTTP Requests**: Fast and stealthy HTTP requests with the `Fetcher` class. Can impersonate browsers' TLS fingerprint, headers, and use HTTP3.
+ - **Dynamic Loading**: Fetch dynamic websites with full browser automation through the `DynamicFetcher` class supporting Playwright's Chromium, real Chrome, and custom stealth mode.
+ - **Anti-bot Bypass**: Advanced stealth capabilities with `StealthyFetcher` using a modified version of Firefox and fingerprint spoofing. Can easily bypass all levels of Cloudflare's Turnstile with automation.
+ - **Session Management**: Persistent session support with `FetcherSession`, `StealthySession`, and `DynamicSession` classes for cookie and state management across requests.
+ - **Async Support**: Complete async support across all fetchers and dedicated async session classes.
+
+ ### Adaptive Scraping & AI Integration
+ - 🔄 **Smart Element Tracking**: Relocate elements after website changes using intelligent similarity algorithms.
+ - 🎯 **Smart Flexible Selection**: CSS selectors, XPath selectors, filter-based search, text search, regex search, and more.
+ - 🔍 **Find Similar Elements**: Automatically locate elements similar to found elements.
+ - 🤖 **MCP Server to be used with AI**: Built-in MCP server for AI-assisted Web Scraping and data extraction. The MCP server features custom, powerful capabilities that utilize Scrapling to extract targeted content before passing it to the AI (Claude/Cursor/etc), thereby speeding up operations and reducing costs by minimizing token usage.
+
+ ### High-Performance & Battle-Tested Architecture
+ - 🚀 **Lightning Fast**: Optimized performance outperforming most Python scraping libraries.
+ - 🔋 **Memory Efficient**: Optimized data structures and lazy loading for a minimal memory footprint.
+ - ⚡ **Fast JSON Serialization**: 10x faster than the standard library.
+ - 🏗️ **Battle tested**: Not only does Scrapling have 92% test coverage and full type hints coverage, but it has been used daily by hundreds of Web Scrapers over the past year.
+
+ ### Developer/Web Scraper Friendly Experience
+ - 🎯 **Interactive Web Scraping Shell**: Optional built-in IPython shell with Scrapling integration, shortcuts, and new tools to speed up Web Scraping script development, like converting curl requests to Scrapling requests and viewing request results in your browser.
+ - 🚀 **Use it directly from the Terminal**: Optionally, you can use Scrapling to scrape a URL without writing a single line of code!
+ - 🛠️ **Rich Navigation API**: Advanced DOM traversal with parent, sibling, and child navigation methods.
+ - 🧬 **Enhanced Text Processing**: Built-in regex, cleaning methods, and optimized string operations.
+ - 📝 **Auto Selector Generation**: Generate robust CSS/XPath selectors for any element (see the sketch after this list).
+ - 🔌 **Familiar API**: Similar to Scrapy/BeautifulSoup with the same pseudo-elements used in Scrapy/Parsel.
+ - 📘 **Complete Type Coverage**: Full type hints for excellent IDE support and code completion.
+
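+ A minimal sketch of selector generation (the `generate_css_selector` and `generate_xpath_selector` element properties below are taken from Scrapling's parser documentation and are an assumption here, not something this diff confirms for 0.3):
+ ```python
+ from scrapling.fetchers import Fetcher
+
+ page = Fetcher.get('https://quotes.toscrape.com/')
+ first_quote = page.css_first('.quote')
+ # Store these and reuse them in later runs, or in other tools
+ print(first_quote.generate_css_selector)    # assumed property: a robust CSS path to this element
+ print(first_quote.generate_xpath_selector)  # assumed property: the XPath equivalent
+ ```
+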
+ ### New Session Architecture
+ Scrapling 0.3 introduces a completely revamped session system:
+ - **Persistent Sessions**: Maintain cookies, headers, and authentication across multiple requests
+ - **Automatic Session Management**: Smart session lifecycle handling with proper cleanup
+ - **Session Inheritance**: All fetchers support both one-off requests and persistent session usage
+ - **Concurrent Session Support**: Run multiple isolated sessions simultaneously
+
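+ For instance, session isolation can be sketched like this (a minimal sketch built only from the `FetcherSession` API shown in Getting Started below; running the two named sessions side by side is the illustrated assumption):
+ ```python
+ from scrapling.fetchers import FetcherSession
+
+ # Two independent sessions: each keeps its own cookies, headers, and TLS fingerprint
+ with FetcherSession(impersonate='chrome') as crawler, FetcherSession(impersonate='firefox135') as probe:
+     page_a = crawler.get('https://quotes.toscrape.com/')  # state is stored in `crawler` only
+     page_b = probe.get('https://quotes.toscrape.com/')    # fully isolated from `crawler`
+ ```
+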
+ ## Getting Started
+
+ ### Basic Usage
+ ```python
+ from scrapling.fetchers import Fetcher, StealthyFetcher, DynamicFetcher
+ from scrapling.fetchers import FetcherSession, StealthySession, DynamicSession
+
+ # HTTP requests with session support
+ with FetcherSession(impersonate='chrome') as session: # Use latest version of Chrome's TLS fingerprint
+     page = session.get('https://quotes.toscrape.com/', stealthy_headers=True)
+     quotes = page.css('.quote .text::text')
+
+ # Or use one-off requests
+ page = Fetcher.get('https://quotes.toscrape.com/')
+ quotes = page.css('.quote .text::text')
+
+ # Advanced stealth mode (Keep the browser open until you finish)
+ with StealthySession(headless=True, solve_cloudflare=True) as session:
+     page = session.fetch('https://nopecha.com/demo/cloudflare')
+     data = page.css('#padded_content a')
+
+ # Or use the one-off request style: it opens the browser for this request, then closes it when finished
+ page = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare')
+ data = page.css('#padded_content a')
+
+ # Full browser automation (Keep the browser open until you finish)
+ with DynamicSession(headless=True, disable_resources=False, network_idle=True) as session:
+     page = session.fetch('https://quotes.toscrape.com/')
+     data = page.xpath('//span[@class="text"]/text()') # XPath selector if you prefer it
+
+ # Or use the one-off request style: it opens the browser for this request, then closes it when finished
+ page = DynamicFetcher.fetch('https://quotes.toscrape.com/')
+ data = page.css('.quote .text::text')
+ ```
+
+ ### Advanced Parsing & Navigation
+ ```python
+ from scrapling.fetchers import Fetcher
+
+ # Rich element selection and navigation
+ page = Fetcher.get('https://quotes.toscrape.com/')
+
+ # Get quotes with multiple selection methods
+ quotes = page.css('.quote') # CSS selector
+ quotes = page.xpath('//div[@class="quote"]') # XPath
+ quotes = page.find_all('div', {'class': 'quote'}) # BeautifulSoup-style
+ # Same as
+ quotes = page.find_all('div', class_='quote')
+ quotes = page.find_all(['div'], class_='quote')
+ quotes = page.find_all(class_='quote') # and so on...
+ # Find element by text content
+ quotes = page.find_by_text('quote', tag='div')
+
+ # Advanced navigation
+ first_quote = page.css_first('.quote')
+ quote_text = first_quote.css('.text::text')
+ quote_text = page.css('.quote').css_first('.text::text') # Chained selectors
+ quote_text = page.css_first('.quote .text').text # Using `css_first` is faster than `css` if you want the first element
+ author = first_quote.next_sibling.css('.author::text')
+ parent_container = first_quote.parent
+
+ # Element relationships and similarity
+ similar_elements = first_quote.find_similar()
+ below_elements = first_quote.below_elements()
+ ```
+ If you don't want to fetch websites, you can use the parser directly, as shown below:
+ ```python
+ from scrapling.parser import Selector
+
+ page = Selector("<html>...</html>")
+ ```
+ And it works exactly the same!
+
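+ For example, a minimal sketch (the HTML snippet is made up for illustration):
+ ```python
+ from scrapling.parser import Selector
+
+ html = '<html><body><div class="quote"><span class="text">Hello</span></div></body></html>'
+ page = Selector(html)
+ print(page.css('.quote .text::text'))  # the same selection API as a fetched page
+ ```
+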
+ ### Async Session Management Examples
+ ```python
+ import asyncio
+ from scrapling.fetchers import FetcherSession, AsyncStealthySession, AsyncDynamicSession
+
+ async with FetcherSession(http3=True) as session: # `FetcherSession` is context-aware and can work in both sync/async patterns
+     page1 = session.get('https://quotes.toscrape.com/')
+     page2 = session.get('https://quotes.toscrape.com/', impersonate='firefox135')
+
+ # Async session usage
+ async with AsyncStealthySession(max_pages=2) as session:
+     tasks = []
+     urls = ['https://example.com/page1', 'https://example.com/page2']
+
+     for url in urls:
+         task = session.fetch(url)
+         tasks.append(task)
+
+     print(session.get_pool_stats()) # Optional - The status of the browser tabs pool (busy/free/error)
+     results = await asyncio.gather(*tasks)
+     print(session.get_pool_stats())
+ ```
+
+ ## CLI & Interactive Shell
+
+ Scrapling v0.3 includes a powerful command-line interface:
+
+ ```bash
+ # Launch interactive Web Scraping shell
+ scrapling shell
+
+ # Extract pages to a file directly without programming (extracts the content inside the `body` tag by default)
+ # If the output file ends with `.txt`, the text content of the target will be extracted.
+ # If it ends with `.md`, the output will be a Markdown representation of the HTML content; `.html` saves the HTML content as-is.
+ scrapling extract get 'https://example.com' content.md
+ scrapling extract get 'https://example.com' content.txt --css-selector '#fromSkipToProducts' --impersonate 'chrome' # All elements matching the CSS selector '#fromSkipToProducts'
+ scrapling extract fetch 'https://example.com' content.md --css-selector '#fromSkipToProducts' --no-headless
+ scrapling extract stealthy-fetch 'https://nopecha.com/demo/cloudflare' captchas.html --css-selector '#padded_content a' --solve-cloudflare
+ ```
+
+ > [!NOTE]
+ > There are many additional features, such as the MCP server and the interactive Web Scraping shell, but we want to keep this page short. Check out the full documentation [here](https://scrapling.readthedocs.io/en/latest/)
+
+ ## Performance Benchmarks
+
+ Scrapling isn't just powerful—it's also blazing fast, and version 0.3 delivers exceptional performance improvements across all operations!
+
+ ### Text Extraction Speed Test (5000 nested elements)
+
+ | # | Library | Time (ms) | vs Scrapling |
+ |---|:-----------------:|:---------:|:------------:|
+ | 1 | Scrapling | 1.88 | 1.0x |
+ | 2 | Parsel/Scrapy | 1.96 | 1.043x |
+ | 3 | Raw Lxml | 2.32 | 1.234x |
+ | 4 | PyQuery | 20.2 | ~11x |
+ | 5 | Selectolax | 85.2 | ~45x |
+ | 6 | MechanicalSoup | 1305.84 | ~695x |
+ | 7 | BS4 with Lxml | 1307.92 | ~696x |
+ | 8 | BS4 with html5lib | 3336.28 | ~1775x |
+
+ ### Element Similarity & Text Search Performance
+
+ Scrapling's adaptive element finding capabilities significantly outperform alternatives:
+
+ | Library | Time (ms) | vs Scrapling |
+ |-------------|:---------:|:------------:|
+ | Scrapling | 2.02 | 1.0x |
+ | AutoScraper | 10.26 | 5.08x |
+
+ > All benchmarks represent averages of 100+ runs. See [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) for methodology.
+
+ ## Installation
+
+ Scrapling requires Python 3.10 or higher:
+
+ ```bash
+ pip install scrapling
+ ```
+
+ #### Fetchers Setup
+
+ If you are going to use any of the fetchers or their session classes, install the browser dependencies with
+ ```bash
+ scrapling install
+ ```
+
+ This downloads all browsers with their system dependencies and fingerprint manipulation dependencies.
+
+ ### Optional Dependencies
+
+ - Install the MCP server feature:
+ ```bash
+ pip install "scrapling[ai]"
+ ```
+ - Install shell features (Web Scraping shell and the `extract` command):
+ ```bash
+ pip install "scrapling[shell]"
+ ```
+ - Install everything:
+ ```bash
+ pip install "scrapling[all]"
+ ```
+
+ ## Contributing
+
+ We welcome contributions! Please read our [contributing guidelines](https://github.com/D4Vinci/Scrapling/blob/main/CONTRIBUTING.md) before getting started.
+
+ ## Disclaimer
+
+ > [!CAUTION]
+ > This library is provided for educational and research purposes only. By using this library, you agree to comply with local and international data scraping and privacy laws. The authors and contributors are not responsible for any misuse of this software. Always respect website terms of service and robots.txt files.
+
+ ## License
+
+ This work is licensed under the BSD-3-Clause License.
+
+ ## Acknowledgments
+
+ This project includes code adapted from:
398
+ - Parsel (BSD License)—Used for [translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/core/translator.py) submodule
399
+
400
+ ## Thanks and References
401
+
402
+ - [Daijro](https://github.com/daijro)'s brilliant work on [BrowserForge](https://github.com/daijro/browserforge) and [Camoufox](https://github.com/daijro/camoufox)
403
+ - [Vinyzu](https://github.com/Vinyzu)'s work on [Botright](https://github.com/Vinyzu/Botright)
404
+ - [brotector](https://github.com/kaliiiiiiiiii/brotector) for browser detection bypass techniques
405
+ - [fakebrowser](https://github.com/kkoooqq/fakebrowser) for fingerprinting research
406
+ - [rebrowser-patches](https://github.com/rebrowser/rebrowser-patches) for stealth improvements
407
+
408
+ ---
409
+ <div align="center"><small>Designed & crafted with ❤️ by Karim Shoair.</small></div><br>