scrapling 0.2.91.tar.gz → 0.2.93.tar.gz

Files changed (57)
  1. {scrapling-0.2.91 → scrapling-0.2.93}/MANIFEST.in +3 -0
  2. {scrapling-0.2.91/scrapling.egg-info → scrapling-0.2.93}/PKG-INFO +64 -71
  3. {scrapling-0.2.91 → scrapling-0.2.93}/README.md +47 -65
  4. {scrapling-0.2.91 → scrapling-0.2.93}/scrapling/__init__.py +1 -1
  5. scrapling-0.2.93/scrapling/cli.py +37 -0
  6. {scrapling-0.2.91 → scrapling-0.2.93}/scrapling/core/_types.py +2 -1
  7. {scrapling-0.2.91 → scrapling-0.2.93}/scrapling/core/custom_types.py +91 -39
  8. {scrapling-0.2.91 → scrapling-0.2.93}/scrapling/core/translator.py +1 -1
  9. scrapling-0.2.93/scrapling/defaults.py +10 -0
  10. {scrapling-0.2.91 → scrapling-0.2.93}/scrapling/engines/camo.py +16 -14
  11. {scrapling-0.2.91 → scrapling-0.2.93}/scrapling/engines/pw.py +11 -13
  12. {scrapling-0.2.91 → scrapling-0.2.93}/scrapling/fetchers.py +5 -5
  13. {scrapling-0.2.91 → scrapling-0.2.93}/scrapling/parser.py +155 -191
  14. {scrapling-0.2.91 → scrapling-0.2.93/scrapling.egg-info}/PKG-INFO +64 -71
  15. {scrapling-0.2.91 → scrapling-0.2.93}/scrapling.egg-info/SOURCES.txt +2 -0
  16. scrapling-0.2.93/scrapling.egg-info/entry_points.txt +2 -0
  17. {scrapling-0.2.91 → scrapling-0.2.93}/scrapling.egg-info/requires.txt +3 -3
  18. {scrapling-0.2.91 → scrapling-0.2.93}/setup.cfg +1 -1
  19. {scrapling-0.2.91 → scrapling-0.2.93}/setup.py +10 -5
  20. {scrapling-0.2.91 → scrapling-0.2.93}/tests/fetchers/async/test_playwright.py +1 -1
  21. {scrapling-0.2.91 → scrapling-0.2.93}/tests/fetchers/sync/test_playwright.py +1 -1
  22. scrapling-0.2.91/scrapling/defaults.py +0 -7
  23. {scrapling-0.2.91 → scrapling-0.2.93}/LICENSE +0 -0
  24. {scrapling-0.2.91 → scrapling-0.2.93}/scrapling/core/__init__.py +0 -0
  25. {scrapling-0.2.91 → scrapling-0.2.93}/scrapling/core/mixins.py +0 -0
  26. {scrapling-0.2.91 → scrapling-0.2.93}/scrapling/core/storage_adaptors.py +0 -0
  27. {scrapling-0.2.91 → scrapling-0.2.93}/scrapling/core/utils.py +0 -0
  28. {scrapling-0.2.91 → scrapling-0.2.93}/scrapling/engines/__init__.py +0 -0
  29. {scrapling-0.2.91 → scrapling-0.2.93}/scrapling/engines/constants.py +0 -0
  30. {scrapling-0.2.91 → scrapling-0.2.93}/scrapling/engines/static.py +0 -0
  31. {scrapling-0.2.91 → scrapling-0.2.93}/scrapling/engines/toolbelt/__init__.py +0 -0
  32. {scrapling-0.2.91 → scrapling-0.2.93}/scrapling/engines/toolbelt/bypasses/navigator_plugins.js +0 -0
  33. {scrapling-0.2.91 → scrapling-0.2.93}/scrapling/engines/toolbelt/bypasses/notification_permission.js +0 -0
  34. {scrapling-0.2.91 → scrapling-0.2.93}/scrapling/engines/toolbelt/bypasses/pdf_viewer.js +0 -0
  35. {scrapling-0.2.91 → scrapling-0.2.93}/scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +0 -0
  36. {scrapling-0.2.91 → scrapling-0.2.93}/scrapling/engines/toolbelt/bypasses/screen_props.js +0 -0
  37. {scrapling-0.2.91 → scrapling-0.2.93}/scrapling/engines/toolbelt/bypasses/webdriver_fully.js +0 -0
  38. {scrapling-0.2.91 → scrapling-0.2.93}/scrapling/engines/toolbelt/bypasses/window_chrome.js +0 -0
  39. {scrapling-0.2.91 → scrapling-0.2.93}/scrapling/engines/toolbelt/custom.py +0 -0
  40. {scrapling-0.2.91 → scrapling-0.2.93}/scrapling/engines/toolbelt/fingerprints.py +0 -0
  41. {scrapling-0.2.91 → scrapling-0.2.93}/scrapling/engines/toolbelt/navigation.py +0 -0
  42. {scrapling-0.2.91 → scrapling-0.2.93}/scrapling/py.typed +0 -0
  43. {scrapling-0.2.91 → scrapling-0.2.93}/scrapling.egg-info/dependency_links.txt +0 -0
  44. {scrapling-0.2.91 → scrapling-0.2.93}/scrapling.egg-info/not-zip-safe +0 -0
  45. {scrapling-0.2.91 → scrapling-0.2.93}/scrapling.egg-info/top_level.txt +0 -0
  46. {scrapling-0.2.91 → scrapling-0.2.93}/tests/__init__.py +0 -0
  47. {scrapling-0.2.91 → scrapling-0.2.93}/tests/fetchers/__init__.py +0 -0
  48. {scrapling-0.2.91 → scrapling-0.2.93}/tests/fetchers/async/__init__.py +0 -0
  49. {scrapling-0.2.91 → scrapling-0.2.93}/tests/fetchers/async/test_camoufox.py +0 -0
  50. {scrapling-0.2.91 → scrapling-0.2.93}/tests/fetchers/async/test_httpx.py +0 -0
  51. {scrapling-0.2.91 → scrapling-0.2.93}/tests/fetchers/sync/__init__.py +0 -0
  52. {scrapling-0.2.91 → scrapling-0.2.93}/tests/fetchers/sync/test_camoufox.py +0 -0
  53. {scrapling-0.2.91 → scrapling-0.2.93}/tests/fetchers/sync/test_httpx.py +0 -0
  54. {scrapling-0.2.91 → scrapling-0.2.93}/tests/fetchers/test_utils.py +0 -0
  55. {scrapling-0.2.91 → scrapling-0.2.93}/tests/parser/__init__.py +0 -0
  56. {scrapling-0.2.91 → scrapling-0.2.93}/tests/parser/test_automatch.py +0 -0
  57. {scrapling-0.2.91 → scrapling-0.2.93}/tests/parser/test_general.py +0 -0
{scrapling-0.2.91 → scrapling-0.2.93}/MANIFEST.in
@@ -4,7 +4,10 @@ include *.js
  include scrapling/engines/toolbelt/bypasses/*.js
  include scrapling/*.db
  include scrapling/*.db*
+ include scrapling/*.db-*
  include scrapling/py.typed
+ include scrapling/.scrapling_dependencies_installed
+ include .scrapling_dependencies_installed

  recursive-exclude * __pycache__
  recursive-exclude * *.py[co]
{scrapling-0.2.91/scrapling.egg-info → scrapling-0.2.93}/PKG-INFO
@@ -1,6 +1,6 @@
- Metadata-Version: 2.1
+ Metadata-Version: 2.2
  Name: scrapling
- Version: 0.2.91
+ Version: 0.2.93
  Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
  Home-page: https://github.com/D4Vinci/Scrapling
  Author: Karim Shoair
@@ -10,7 +10,7 @@ Project-URL: Documentation, https://github.com/D4Vinci/Scrapling/tree/main/docs
  Project-URL: Source, https://github.com/D4Vinci/Scrapling
  Project-URL: Tracker, https://github.com/D4Vinci/Scrapling/issues
  Classifier: Operating System :: OS Independent
- Classifier: Development Status :: 4 - Beta
+ Classifier: Development Status :: 4 - Beta
  Classifier: Intended Audience :: Developers
  Classifier: License :: OSI Approved :: BSD License
  Classifier: Natural Language :: English
@@ -31,16 +31,27 @@ Classifier: Typing :: Typed
  Requires-Python: >=3.9
  Description-Content-Type: text/markdown
  License-File: LICENSE
- Requires-Dist: requests>=2.3
- Requires-Dist: lxml>=4.5
+ Requires-Dist: lxml>=5.0
  Requires-Dist: cssselect>=1.2
+ Requires-Dist: click
  Requires-Dist: w3lib
  Requires-Dist: orjson>=3
  Requires-Dist: tldextract
  Requires-Dist: httpx[brotli,socks,zstd]
  Requires-Dist: playwright>=1.49.1
  Requires-Dist: rebrowser-playwright>=1.49.1
- Requires-Dist: camoufox[geoip]>=0.4.9
+ Requires-Dist: camoufox[geoip]>=0.4.10
+ Dynamic: author
+ Dynamic: author-email
+ Dynamic: classifier
+ Dynamic: description
+ Dynamic: description-content-type
+ Dynamic: home-page
+ Dynamic: license
+ Dynamic: project-url
+ Dynamic: requires-dist
+ Dynamic: requires-python
+ Dynamic: summary

  # 🕷️ Scrapling: Undetectable, Lightning-Fast, and Adaptive Web Scraping for Python
  [![Tests](https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg)](https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml) [![PyPI version](https://badge.fury.io/py/Scrapling.svg)](https://badge.fury.io/py/Scrapling) [![Supported Python versions](https://img.shields.io/pypi/pyversions/scrapling.svg)](https://pypi.org/project/scrapling/) [![PyPI Downloads](https://static.pepy.tech/badge/scrapling)](https://pepy.tech/project/scrapling)
@@ -77,6 +88,21 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha
  [![Evomi Banner](https://my.evomi.com/images/brand/cta.png)](https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling)
  ---

+ [Scrapeless](https://www.scrapeless.com/?utm_source=github&utm_medium=ads&utm_campaign=scraping&utm_term=D4Vinci) is your all-in-one web scraping toolkit, starting at just $0.60 per 1k URLs!
+
+ - 🚀 Scraping API: Effortless and highly customizable data extraction with a single API call, providing structured data from any website.
+ - ⚡ Scraping Browser: AI-powered and LLM-driven, it simulates human-like behavior with genuine fingerprints and headless browser support, ensuring seamless, block-free scraping.
+ - 🔒 Web Unlocker: Bypass CAPTCHAs, IP blocks, and dynamic content in real time, ensuring uninterrupted access.
+ - 🌐 Proxies: Use high-quality, rotating proxies to scrape top platforms like Amazon, Shopee, and more, with global coverage in 195+ countries.
+ - 💼 Enterprise-Grade: Custom solutions for large-scale and complex data needs.
+ - 🎁 Free Trial: Try before you buy—experience our service firsthand.
+ - 💬 Pay-Per-Use: Flexible, cost-effective pricing with no long-term commitments.
+ - 🔧 Easy Integration: Seamlessly integrate with your existing tools and workflows for hassle-free automation.
+
+
+ [![Scrapeless Banner](https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/scrapeless.jpg)](https://www.scrapeless.com/?utm_source=github&utm_medium=ads&utm_campaign=scraping&utm_term=D4Vinci)
+ ---
+
  ## Table of content
  * [Key Features](#key-features)
  * [Fetch websites as you prefer](#fetch-websites-as-you-prefer-with-async-support)
@@ -121,27 +147,27 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha
  ## Key Features

  ### Fetch websites as you prefer with async support
- - **HTTP requests**: Stealthy and fast HTTP requests with `Fetcher`
- - **Stealthy fetcher**: Annoying anti-bot protection? No problem! Scrapling can bypass almost all of them with `StealthyFetcher` with default configuration!
- - **Your preferred browser**: Use your real browser with CDP, [NSTbrowser](https://app.nstbrowser.io/r/1vO5e5)'s browserless, PlayWright with stealth mode, or even vanilla PlayWright - All is possible with `PlayWrightFetcher`!
+ - **HTTP Requests**: Fast and stealthy HTTP requests with the `Fetcher` class.
+ - **Dynamic Loading & Automation**: Fetch dynamic websites with the `PlayWrightFetcher` class through your real browser, Scrapling's stealth mode, Playwright's Chrome browser, or [NSTbrowser](https://app.nstbrowser.io/r/1vO5e5)'s browserless!
+ - **Anti-bot Protections Bypass**: Easily bypass protections with `StealthyFetcher` and `PlayWrightFetcher` classes.

  ### Adaptive Scraping
- - 🔄 **Smart Element Tracking**: Locate previously identified elements after website structure changes, using an intelligent similarity system and integrated storage.
- - 🎯 **Flexible Querying**: Use CSS selectors, XPath, Elements filters, text search, or regex - chain them however you want!
- - 🔍 **Find Similar Elements**: Automatically locate elements similar to the element you want on the page (Ex: other products like the product you found on the page).
+ - 🔄 **Smart Element Tracking**: Relocate elements after website changes, using an intelligent similarity system and integrated storage.
+ - 🎯 **Flexible Selection**: CSS selectors, XPath selectors, filters-based search, text search, regex search and more.
+ - 🔍 **Find Similar Elements**: Automatically locate elements similar to the element you found!
  - 🧠 **Smart Content Scraping**: Extract data from multiple websites without specific selectors using Scrapling powerful features.

- ### Performance
- - 🚀 **Lightning Fast**: Built from the ground up with performance in mind, outperforming most popular Python scraping libraries (outperforming BeautifulSoup in parsing by up to 620x in our tests).
+ ### High Performance
+ - 🚀 **Lightning Fast**: Built from the ground up with performance in mind, outperforming most popular Python scraping libraries.
  - 🔋 **Memory Efficient**: Optimized data structures for minimal memory footprint.
- - ⚡ **Fast JSON serialization**: 10x faster JSON serialization than the standard json library with more options.
+ - ⚡ **Fast JSON serialization**: 10x faster than standard library.

- ### Developing Experience
- - 🛠️ **Powerful Navigation API**: Traverse the DOM tree easily in all directions and get the info you want (parent, ancestors, sibling, children, next/previous element, and more).
- - 🧬 **Rich Text Processing**: All strings have built-in methods for regex matching, cleaning, and more. All elements' attributes are read-only dictionaries that are faster than standard dictionaries with added methods.
- - 📝 **Automatic Selector Generation**: Create robust CSS/XPath selectors for any element.
- - 🔌 **API Similar to Scrapy/BeautifulSoup**: Familiar methods and similar pseudo-elements for Scrapy and BeautifulSoup users.
- - 📘 **Type hints and test coverage**: Complete type coverage and almost full test coverage for better IDE support and fewer bugs, respectively.
+ ### Developer Friendly
+ - 🛠️ **Powerful Navigation API**: Easy DOM traversal in all directions.
+ - 🧬 **Rich Text Processing**: All strings have built-in regex, cleaning methods, and more. All elements' attributes are optimized dictionaries that takes less memory than standard dictionaries with added methods.
+ - 📝 **Auto Selectors Generation**: Generate robust short and full CSS/XPath selectors for any element.
+ - 🔌 **Familiar API**: Similar to Scrapy/BeautifulSoup and the same pseudo-elements used in Scrapy.
+ - 📘 **Type hints**: Complete type/doc-strings coverage for future-proofing and best autocompletion support.

  ## Getting Started

@@ -150,21 +176,22 @@ from scrapling import Fetcher

  fetcher = Fetcher(auto_match=False)

- # Fetch a web page and create an Adaptor instance
+ # Do http GET request to a web page and create an Adaptor instance
  page = fetcher.get('https://quotes.toscrape.com/', stealthy_headers=True)
- # Get all strings in the full page
+ # Get all text content from all HTML tags in the page except `script` and `style` tags
  page.get_all_text(ignore_tags=('script', 'style'))

- # Get all quotes, any of these methods will return a list of strings (TextHandlers)
+ # Get all quotes elements, any of these methods will return a list of strings directly (TextHandlers)
  quotes = page.css('.quote .text::text') # CSS selector
  quotes = page.xpath('//span[@class="text"]/text()') # XPath
  quotes = page.css('.quote').css('.text::text') # Chained selectors
  quotes = [element.text for element in page.css('.quote .text')] # Slower than bulk query above

  # Get the first quote element
- quote = page.css_first('.quote') # / page.css('.quote').first / page.css('.quote')[0]
+ quote = page.css_first('.quote') # same as page.css('.quote').first or page.css('.quote')[0]

  # Tired of selectors? Use find_all/find
+ # Get all 'div' HTML tags that one of its 'class' values is 'quote'
  quotes = page.find_all('div', {'class': 'quote'})
  # Same as
  quotes = page.find_all('div', class_='quote')
@@ -172,10 +199,10 @@ quotes = page.find_all(['div'], class_='quote')
  quotes = page.find_all(class_='quote') # and so on...

  # Working with elements
- quote.html_content # Inner HTML
- quote.prettify() # Prettified version of Inner HTML
- quote.attrib # Element attributes
- quote.path # DOM path to element (List)
+ quote.html_content # Get Inner HTML of this element
+ quote.prettify() # Prettified version of Inner HTML above
+ quote.attrib # Get that element's attributes
+ quote.path # DOM path to element (List of all ancestors from <html> tag till the element itself)
  ```
  To keep it simple, all methods can be chained on top of each other!

@@ -211,52 +238,18 @@ Scrapling can find elements with more methods and it returns full element `Adapt
  > All benchmarks' results are an average of 100 runs. See our [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) for methodology and to run your comparisons.

  ## Installation
- Scrapling is a breeze to get started with - Starting from version 0.2.9, we require at least Python 3.9 to work.
+ Scrapling is a breeze to get started with; Starting from version 0.2.9, we require at least Python 3.9 to work.
  ```bash
  pip3 install scrapling
  ```
- - For using the `StealthyFetcher`, go to the command line and download the browser with
- <details><summary>Windows OS</summary>
-
+ Then run this command to install browsers' dependencies needed to use Fetcher classes
  ```bash
- camoufox fetch --browserforge
- ```
- </details>
- <details><summary>MacOS</summary>
-
- ```bash
- python3 -m camoufox fetch --browserforge
- ```
- </details>
- <details><summary>Linux</summary>
-
- ```bash
- python -m camoufox fetch --browserforge
- ```
- On a fresh installation of Linux, you may also need the following Firefox dependencies:
- - Debian-based distros
- ```bash
- sudo apt install -y libgtk-3-0 libx11-xcb1 libasound2
- ```
- - Arch-based distros
- ```bash
- sudo pacman -S gtk3 libx11 libxcb cairo libasound alsa-lib
- ```
- </details>
-
- <small> See the official <a href="https://camoufox.com/python/installation/#download-the-browser">Camoufox documentation</a> for more info on installation</small>
-
- - If you are going to use the `PlayWrightFetcher` options, then install Playwright's Chromium browser with:
- ```commandline
- playwright install chromium
- ```
- - If you are going to use normal requests only with the `Fetcher` class then update the fingerprints files with:
- ```commandline
- python -m browserforge update
+ scrapling install
  ```
+ If you have any installation issues, please open an issue.

  ## Fetching Websites
- Fetchers are basically interfaces that do requests or fetch pages for you in a single request fashion and then return an `Adaptor` object for you. This feature was introduced because the only option we had before was to fetch the page as you wanted it, then pass it manually to the `Adaptor` class to create an `Adaptor` instance and start playing around with the page.
+ Fetchers are interfaces built on top of other libraries with added features that do requests or fetch pages for you in a single request fashion and then return an `Adaptor` object. This feature was introduced because the only option we had before was to fetch the page as you wanted it, then pass it manually to the `Adaptor` class to create an `Adaptor` instance and start playing around with the page.

  ### Features
  You might be slightly confused by now so let me clear things up. All fetcher-type classes are imported in the same way
@@ -325,7 +318,7 @@ True
  | humanize | Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window. | ✔️ |
  | allow_webgl | Enabled by default. Disabling it WebGL not recommended as many WAFs now checks if WebGL is enabled. | ✔️ |
  | geoip | Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, & spoof the WebRTC IP address. It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region. | ✔️ |
- | disable_ads | Enabled by default, this installs `uBlock Origin` addon on the browser if enabled. | ✔️ |
+ | disable_ads | Disabled by default, this installs `uBlock Origin` addon on the browser if enabled. | ✔️ |
  | network_idle | Wait for the page until there are no network connections for at least 500 ms. | ✔️ |
  | timeout | The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000. | ✔️ |
  | wait_selector | Wait for a specific css selector to be in a specific state. | ✔️ |
@@ -607,7 +600,7 @@ Inspired by BeautifulSoup's `find_all` function you can find elements by using `
  * Any string passed is considered a tag name
  * Any iterable passed like List/Tuple/Set is considered an iterable of tag names.
  * Any dictionary is considered a mapping of HTML element(s) attribute names and attribute values.
- * Any regex patterns passed are used as filters
+ * Any regex patterns passed are used as filters to elements by their text content
  * Any functions passed are used as filters
  * Any keyword argument passed is considered as an HTML element attribute with its value.

@@ -616,7 +609,7 @@ So the way it works is after collecting all passed arguments and keywords, each

  1. All elements with the passed tag name(s).
  2. All elements that match all passed attribute(s).
- 3. All elements that match all passed regex patterns.
+ 3. All elements that its text content match all passed regex patterns.
  4. All elements that fulfill all passed function(s).

  Note: The filtering process always starts from the first filter it finds in the filtering order above so if no tag name(s) are passed but attributes are passed, the process starts from that layer and so on. **But the order in which you pass the arguments doesn't matter.**
{scrapling-0.2.91 → scrapling-0.2.93}/README.md
@@ -33,6 +33,21 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha
  [![Evomi Banner](https://my.evomi.com/images/brand/cta.png)](https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling)
  ---

+ [Scrapeless](https://www.scrapeless.com/?utm_source=github&utm_medium=ads&utm_campaign=scraping&utm_term=D4Vinci) is your all-in-one web scraping toolkit, starting at just $0.60 per 1k URLs!
+
+ - 🚀 Scraping API: Effortless and highly customizable data extraction with a single API call, providing structured data from any website.
+ - ⚡ Scraping Browser: AI-powered and LLM-driven, it simulates human-like behavior with genuine fingerprints and headless browser support, ensuring seamless, block-free scraping.
+ - 🔒 Web Unlocker: Bypass CAPTCHAs, IP blocks, and dynamic content in real time, ensuring uninterrupted access.
+ - 🌐 Proxies: Use high-quality, rotating proxies to scrape top platforms like Amazon, Shopee, and more, with global coverage in 195+ countries.
+ - 💼 Enterprise-Grade: Custom solutions for large-scale and complex data needs.
+ - 🎁 Free Trial: Try before you buy—experience our service firsthand.
+ - 💬 Pay-Per-Use: Flexible, cost-effective pricing with no long-term commitments.
+ - 🔧 Easy Integration: Seamlessly integrate with your existing tools and workflows for hassle-free automation.
+
+
+ [![Scrapeless Banner](https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/scrapeless.jpg)](https://www.scrapeless.com/?utm_source=github&utm_medium=ads&utm_campaign=scraping&utm_term=D4Vinci)
+ ---
+
  ## Table of content
  * [Key Features](#key-features)
  * [Fetch websites as you prefer](#fetch-websites-as-you-prefer-with-async-support)
@@ -77,27 +92,27 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha
  ## Key Features

  ### Fetch websites as you prefer with async support
- - **HTTP requests**: Stealthy and fast HTTP requests with `Fetcher`
- - **Stealthy fetcher**: Annoying anti-bot protection? No problem! Scrapling can bypass almost all of them with `StealthyFetcher` with default configuration!
- - **Your preferred browser**: Use your real browser with CDP, [NSTbrowser](https://app.nstbrowser.io/r/1vO5e5)'s browserless, PlayWright with stealth mode, or even vanilla PlayWright - All is possible with `PlayWrightFetcher`!
+ - **HTTP Requests**: Fast and stealthy HTTP requests with the `Fetcher` class.
+ - **Dynamic Loading & Automation**: Fetch dynamic websites with the `PlayWrightFetcher` class through your real browser, Scrapling's stealth mode, Playwright's Chrome browser, or [NSTbrowser](https://app.nstbrowser.io/r/1vO5e5)'s browserless!
+ - **Anti-bot Protections Bypass**: Easily bypass protections with `StealthyFetcher` and `PlayWrightFetcher` classes.

  ### Adaptive Scraping
- - 🔄 **Smart Element Tracking**: Locate previously identified elements after website structure changes, using an intelligent similarity system and integrated storage.
- - 🎯 **Flexible Querying**: Use CSS selectors, XPath, Elements filters, text search, or regex - chain them however you want!
- - 🔍 **Find Similar Elements**: Automatically locate elements similar to the element you want on the page (Ex: other products like the product you found on the page).
+ - 🔄 **Smart Element Tracking**: Relocate elements after website changes, using an intelligent similarity system and integrated storage.
+ - 🎯 **Flexible Selection**: CSS selectors, XPath selectors, filters-based search, text search, regex search and more.
+ - 🔍 **Find Similar Elements**: Automatically locate elements similar to the element you found!
  - 🧠 **Smart Content Scraping**: Extract data from multiple websites without specific selectors using Scrapling powerful features.

- ### Performance
- - 🚀 **Lightning Fast**: Built from the ground up with performance in mind, outperforming most popular Python scraping libraries (outperforming BeautifulSoup in parsing by up to 620x in our tests).
+ ### High Performance
+ - 🚀 **Lightning Fast**: Built from the ground up with performance in mind, outperforming most popular Python scraping libraries.
  - 🔋 **Memory Efficient**: Optimized data structures for minimal memory footprint.
- - ⚡ **Fast JSON serialization**: 10x faster JSON serialization than the standard json library with more options.
+ - ⚡ **Fast JSON serialization**: 10x faster than standard library.

- ### Developing Experience
- - 🛠️ **Powerful Navigation API**: Traverse the DOM tree easily in all directions and get the info you want (parent, ancestors, sibling, children, next/previous element, and more).
- - 🧬 **Rich Text Processing**: All strings have built-in methods for regex matching, cleaning, and more. All elements' attributes are read-only dictionaries that are faster than standard dictionaries with added methods.
- - 📝 **Automatic Selector Generation**: Create robust CSS/XPath selectors for any element.
- - 🔌 **API Similar to Scrapy/BeautifulSoup**: Familiar methods and similar pseudo-elements for Scrapy and BeautifulSoup users.
- - 📘 **Type hints and test coverage**: Complete type coverage and almost full test coverage for better IDE support and fewer bugs, respectively.
+ ### Developer Friendly
+ - 🛠️ **Powerful Navigation API**: Easy DOM traversal in all directions.
+ - 🧬 **Rich Text Processing**: All strings have built-in regex, cleaning methods, and more. All elements' attributes are optimized dictionaries that takes less memory than standard dictionaries with added methods.
+ - 📝 **Auto Selectors Generation**: Generate robust short and full CSS/XPath selectors for any element.
+ - 🔌 **Familiar API**: Similar to Scrapy/BeautifulSoup and the same pseudo-elements used in Scrapy.
+ - 📘 **Type hints**: Complete type/doc-strings coverage for future-proofing and best autocompletion support.

  ## Getting Started

@@ -106,21 +121,22 @@ from scrapling import Fetcher

  fetcher = Fetcher(auto_match=False)

- # Fetch a web page and create an Adaptor instance
+ # Do http GET request to a web page and create an Adaptor instance
  page = fetcher.get('https://quotes.toscrape.com/', stealthy_headers=True)
- # Get all strings in the full page
+ # Get all text content from all HTML tags in the page except `script` and `style` tags
  page.get_all_text(ignore_tags=('script', 'style'))

- # Get all quotes, any of these methods will return a list of strings (TextHandlers)
+ # Get all quotes elements, any of these methods will return a list of strings directly (TextHandlers)
  quotes = page.css('.quote .text::text') # CSS selector
  quotes = page.xpath('//span[@class="text"]/text()') # XPath
  quotes = page.css('.quote').css('.text::text') # Chained selectors
  quotes = [element.text for element in page.css('.quote .text')] # Slower than bulk query above

  # Get the first quote element
- quote = page.css_first('.quote') # / page.css('.quote').first / page.css('.quote')[0]
+ quote = page.css_first('.quote') # same as page.css('.quote').first or page.css('.quote')[0]

  # Tired of selectors? Use find_all/find
+ # Get all 'div' HTML tags that one of its 'class' values is 'quote'
  quotes = page.find_all('div', {'class': 'quote'})
  # Same as
  quotes = page.find_all('div', class_='quote')
@@ -128,10 +144,10 @@ quotes = page.find_all(['div'], class_='quote')
  quotes = page.find_all(class_='quote') # and so on...

  # Working with elements
- quote.html_content # Inner HTML
- quote.prettify() # Prettified version of Inner HTML
- quote.attrib # Element attributes
- quote.path # DOM path to element (List)
+ quote.html_content # Get Inner HTML of this element
+ quote.prettify() # Prettified version of Inner HTML above
+ quote.attrib # Get that element's attributes
+ quote.path # DOM path to element (List of all ancestors from <html> tag till the element itself)
  ```
  To keep it simple, all methods can be chained on top of each other!

@@ -167,52 +183,18 @@ Scrapling can find elements with more methods and it returns full element `Adapt
  > All benchmarks' results are an average of 100 runs. See our [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) for methodology and to run your comparisons.

  ## Installation
- Scrapling is a breeze to get started with - Starting from version 0.2.9, we require at least Python 3.9 to work.
+ Scrapling is a breeze to get started with; Starting from version 0.2.9, we require at least Python 3.9 to work.
  ```bash
  pip3 install scrapling
  ```
- - For using the `StealthyFetcher`, go to the command line and download the browser with
- <details><summary>Windows OS</summary>
-
+ Then run this command to install browsers' dependencies needed to use Fetcher classes
  ```bash
- camoufox fetch --browserforge
- ```
- </details>
- <details><summary>MacOS</summary>
-
- ```bash
- python3 -m camoufox fetch --browserforge
- ```
- </details>
- <details><summary>Linux</summary>
-
- ```bash
- python -m camoufox fetch --browserforge
- ```
- On a fresh installation of Linux, you may also need the following Firefox dependencies:
- - Debian-based distros
- ```bash
- sudo apt install -y libgtk-3-0 libx11-xcb1 libasound2
- ```
- - Arch-based distros
- ```bash
- sudo pacman -S gtk3 libx11 libxcb cairo libasound alsa-lib
- ```
- </details>
-
- <small> See the official <a href="https://camoufox.com/python/installation/#download-the-browser">Camoufox documentation</a> for more info on installation</small>
-
- - If you are going to use the `PlayWrightFetcher` options, then install Playwright's Chromium browser with:
- ```commandline
- playwright install chromium
- ```
- - If you are going to use normal requests only with the `Fetcher` class then update the fingerprints files with:
- ```commandline
- python -m browserforge update
+ scrapling install
  ```
+ If you have any installation issues, please open an issue.

  ## Fetching Websites
- Fetchers are basically interfaces that do requests or fetch pages for you in a single request fashion and then return an `Adaptor` object for you. This feature was introduced because the only option we had before was to fetch the page as you wanted it, then pass it manually to the `Adaptor` class to create an `Adaptor` instance and start playing around with the page.
+ Fetchers are interfaces built on top of other libraries with added features that do requests or fetch pages for you in a single request fashion and then return an `Adaptor` object. This feature was introduced because the only option we had before was to fetch the page as you wanted it, then pass it manually to the `Adaptor` class to create an `Adaptor` instance and start playing around with the page.

  ### Features
  You might be slightly confused by now so let me clear things up. All fetcher-type classes are imported in the same way
@@ -281,7 +263,7 @@ True
  | humanize | Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window. | ✔️ |
  | allow_webgl | Enabled by default. Disabling it WebGL not recommended as many WAFs now checks if WebGL is enabled. | ✔️ |
  | geoip | Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, & spoof the WebRTC IP address. It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region. | ✔️ |
- | disable_ads | Enabled by default, this installs `uBlock Origin` addon on the browser if enabled. | ✔️ |
+ | disable_ads | Disabled by default, this installs `uBlock Origin` addon on the browser if enabled. | ✔️ |
  | network_idle | Wait for the page until there are no network connections for at least 500 ms. | ✔️ |
  | timeout | The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000. | ✔️ |
  | wait_selector | Wait for a specific css selector to be in a specific state. | ✔️ |
@@ -563,7 +545,7 @@ Inspired by BeautifulSoup's `find_all` function you can find elements by using `
  * Any string passed is considered a tag name
  * Any iterable passed like List/Tuple/Set is considered an iterable of tag names.
  * Any dictionary is considered a mapping of HTML element(s) attribute names and attribute values.
- * Any regex patterns passed are used as filters
+ * Any regex patterns passed are used as filters to elements by their text content
  * Any functions passed are used as filters
  * Any keyword argument passed is considered as an HTML element attribute with its value.

@@ -572,7 +554,7 @@ So the way it works is after collecting all passed arguments and keywords, each

  1. All elements with the passed tag name(s).
  2. All elements that match all passed attribute(s).
- 3. All elements that match all passed regex patterns.
+ 3. All elements that its text content match all passed regex patterns.
  4. All elements that fulfill all passed function(s).

  Note: The filtering process always starts from the first filter it finds in the filtering order above so if no tag name(s) are passed but attributes are passed, the process starts from that layer and so on. **But the order in which you pass the arguments doesn't matter.**
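
The `find_all` clarification in the two hunks above (regex patterns filter elements by their text content, callables act as element filters) is easier to see in a concrete call. Below is a minimal sketch that follows the README's own examples; the `re.compile('Einstein')` pattern and the callable predicate are illustrative assumptions, not part of this diff, and the callable is assumed to receive each candidate element:

```python
import re

from scrapling import Fetcher

page = Fetcher(auto_match=False).get('https://quotes.toscrape.com/')

# Tag name + attribute dict + regex + callable, following the filtering order
# described above: tag names, then attributes, then regex patterns (matched
# against element text), then callables. The order you pass them in doesn't matter.
einstein_quotes = page.find_all(
    'div',
    {'class': 'quote'},
    re.compile('Einstein'),  # keep only elements whose text matches the pattern
    lambda el: el.css_first('.author') is not None,  # hypothetical extra element filter
)
```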
{scrapling-0.2.91 → scrapling-0.2.93}/scrapling/__init__.py
@@ -5,7 +5,7 @@ from scrapling.fetchers import (AsyncFetcher, CustomFetcher, Fetcher,
  from scrapling.parser import Adaptor, Adaptors

  __author__ = "Karim Shoair (karim.shoair@pm.me)"
- __version__ = "0.2.91"
+ __version__ = "0.2.93"
  __copyright__ = "Copyright (c) 2024 Karim Shoair"

scrapling-0.2.93/scrapling/cli.py
@@ -0,0 +1,37 @@
+ import os
+ import subprocess
+ import sys
+ from pathlib import Path
+
+ import click
+
+
+ def get_package_dir():
+     return Path(os.path.dirname(__file__))
+
+
+ def run_command(command, line):
+     print(f"Installing {line}...")
+     _ = subprocess.check_call(command, shell=True)
+     # I meant to not use try except here
+
+
+ @click.command(help="Install all Scrapling's Fetchers dependencies")
+ def install():
+     if not get_package_dir().joinpath(".scrapling_dependencies_installed").exists():
+         run_command([sys.executable, "-m", "playwright", "install", 'chromium'], 'Playwright browsers')
+         run_command([sys.executable, "-m", "playwright", "install-deps", 'chromium', 'firefox'], 'Playwright dependencies')
+         run_command([sys.executable, "-m", "camoufox", "fetch", '--browserforge'], 'Camoufox browser and databases')
+         # if no errors raised by above commands, then we add below file
+         get_package_dir().joinpath(".scrapling_dependencies_installed").touch()
+     else:
+         print('The dependencies are already installed')
+
+
+ @click.group()
+ def main():
+     pass
+
+
+ # Adding commands
+ main.add_command(install)
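
The new `cli.py` above is what backs the `scrapling install` step referenced in the README changes. As a quick sanity check it can also be driven through click's test runner instead of the console script; the assumption here is that the `scrapling` entry point added in `entry_points.txt` (not shown in this diff) maps to `scrapling.cli:main`:

```python
from click.testing import CliRunner

from scrapling.cli import main

# Invoke the `install` subcommand in-process; equivalent to running
# `scrapling install` from a shell once the console script is on PATH.
# Note this really does download the Playwright/Camoufox browsers.
runner = CliRunner()
result = runner.invoke(main, ["install"])
print(result.exit_code)
print(result.output)
```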
{scrapling-0.2.91 → scrapling-0.2.93}/scrapling/core/_types.py
@@ -3,7 +3,8 @@ Type definitions for type checking purposes.
  """

  from typing import (TYPE_CHECKING, Any, Callable, Dict, Generator, Iterable,
-                     List, Literal, Optional, Pattern, Tuple, Type, Union)
+                     List, Literal, Optional, Pattern, Tuple, Type, TypeVar,
+                     Union)

  SelectorWaitStates = Literal["attached", "detached", "hidden", "visible"]