scrapling-0.2.96-py3-none-any.whl → scrapling-0.2.98-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +35 -6
- scrapling/core/custom_types.py +1 -3
- scrapling/core/storage_adaptors.py +3 -3
- scrapling/core/translator.py +4 -1
- scrapling/core/utils.py +1 -1
- scrapling/defaults.py +18 -9
- scrapling/engines/camo.py +123 -104
- scrapling/engines/pw.py +100 -75
- scrapling/engines/static.py +22 -42
- scrapling/engines/toolbelt/custom.py +2 -2
- scrapling/engines/toolbelt/fingerprints.py +2 -2
- scrapling/engines/toolbelt/navigation.py +1 -1
- scrapling/fetchers.py +24 -24
- scrapling/parser.py +6 -12
- {scrapling-0.2.96.dist-info → scrapling-0.2.98.dist-info}/METADATA +23 -22
- {scrapling-0.2.96.dist-info → scrapling-0.2.98.dist-info}/RECORD +20 -20
- {scrapling-0.2.96.dist-info → scrapling-0.2.98.dist-info}/WHEEL +1 -1
- {scrapling-0.2.96.dist-info → scrapling-0.2.98.dist-info}/LICENSE +0 -0
- {scrapling-0.2.96.dist-info → scrapling-0.2.98.dist-info}/entry_points.txt +0 -0
- {scrapling-0.2.96.dist-info → scrapling-0.2.98.dist-info}/top_level.txt +0 -0
scrapling/parser.py
CHANGED
@@ -17,7 +17,7 @@ from scrapling.core.custom_types import (AttributesHandler, TextHandler,
 from scrapling.core.mixins import SelectorsGeneration
 from scrapling.core.storage_adaptors import (SQLiteStorageSystem,
                                              StorageSystemMixin, _StorageTools)
-from scrapling.core.translator import
+from scrapling.core.translator import translator_instance
 from scrapling.core.utils import (clean_spaces, flatten, html_forbidden,
                                   is_jsonable, log)
 
@@ -26,7 +26,7 @@ class Adaptor(SelectorsGeneration):
     __slots__ = (
         'url', 'encoding', '__auto_match_enabled', '_root', '_storage',
         '__keep_comments', '__huge_tree_enabled', '__attributes', '__text', '__tag',
-        '__keep_cdata'
+        '__keep_cdata'
     )
 
     def __init__(
@@ -71,21 +71,18 @@ class Adaptor(SelectorsGeneration):
         if root is None and not body and text is None:
             raise ValueError("Adaptor class needs text, body, or root arguments to work")
 
-        self.__text =
-        self.__raw_body = ''
+        self.__text = ''
         if root is None:
             if text is None:
                 if not body or not isinstance(body, bytes):
                     raise TypeError(f"body argument must be valid and of type bytes, got {body.__class__}")
 
                 body = body.replace(b"\x00", b"").strip()
-                self.__raw_body = body.replace(b"\x00", b"").strip().decode()
             else:
                 if not isinstance(text, str):
                     raise TypeError(f"text argument must be of type str, got {text.__class__}")
 
                 body = text.strip().replace("\x00", "").encode(encoding) or b"<html/>"
-                self.__raw_body = text.strip()
 
         # https://lxml.de/api/lxml.etree.HTMLParser-class.html
         parser = html.HTMLParser(
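Note on the hunk above: only the cached `__raw_body` copy is removed; the input normalization itself is unchanged. A minimal, self-contained sketch of the normalization the context lines keep (the sample text is illustrative):

```python
encoding = "utf-8"
text = "  <html>\x00<body>hello</body></html>  "

# Mirrors Adaptor.__init__ above: strip whitespace, drop NUL characters,
# encode to bytes, and fall back to a minimal document when empty.
body = text.strip().replace("\x00", "").encode(encoding) or b"<html/>"
print(body)  # b'<html><body>hello</body></html>'
```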
@@ -250,10 +247,7 @@ class Adaptor(SelectorsGeneration):
         """Return the inner html code of the element"""
         return TextHandler(etree.tostring(self._root, encoding='unicode', method='html', with_tail=False))
 
-    @property
-    def body(self) -> TextHandler:
-        """Return raw HTML code of the element/page without any processing when possible or return `Adaptor.html_content`"""
-        return TextHandler(self.__raw_body) or self.html_content
+    body = html_content
 
     def prettify(self) -> TextHandler:
         """Return a prettified version of the element's inner html-code"""
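With `__raw_body` gone, `body` becomes a plain alias for the `html_content` property, so both names now return lxml's re-serialized HTML rather than a cached copy of the raw input. A minimal sketch of the new behavior (the input string is illustrative):

```python
from scrapling.parser import Adaptor

page = Adaptor(text="<html><body><p>hello</p></body></html>", url="https://example.com")

# In 0.2.98 both names resolve to the same serialized-HTML property.
assert page.body == page.html_content
```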
@@ -476,7 +470,7 @@ class Adaptor(SelectorsGeneration):
         try:
             if not self.__auto_match_enabled or ',' not in selector:
                 # No need to split selectors in this case, let's save some CPU cycles :)
-                xpath_selector =
+                xpath_selector = translator_instance.css_to_xpath(selector)
                 return self.xpath(xpath_selector, identifier or selector, auto_match, auto_save, percentage)
 
             results = []
@@ -484,7 +478,7 @@ class Adaptor(SelectorsGeneration):
             for single_selector in split_selectors(selector):
                 # I'm doing this only so the `save` function save data correctly for combined selectors
                 # Like using the ',' to combine two different selectors that point to different elements.
-                xpath_selector =
+                xpath_selector = translator_instance.css_to_xpath(single_selector.canonical())
                 results += self.xpath(
                     xpath_selector, identifier or single_selector.canonical(), auto_match, auto_save, percentage
                 )
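Both `css` call sites above now route through a single module-level `translator_instance` instead of building a translator per query. A minimal sketch of the shared-instance usage (the selector string is illustrative):

```python
from scrapling.core.translator import translator_instance

# One shared translator converts CSS selectors to XPath for every query.
xpath_selector = translator_instance.css_to_xpath("div.quote > span.text")
print(xpath_selector)  # the equivalent XPath expression
```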
{scrapling-0.2.96.dist-info → scrapling-0.2.98.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: scrapling
-Version: 0.2.96
+Version: 0.2.98
 Summary: Scrapling is an undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy again! In an internet filled with complications,
 Home-page: https://github.com/D4Vinci/Scrapling
 Author: Karim Shoair
@@ -73,6 +73,22 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha
 
 # Sponsors
 
+[Scrapeless Deep SerpApi](https://www.scrapeless.com/en/product/deep-serp-api?utm_source=website&utm_medium=ads&utm_campaign=scraping&utm_term=d4vinci) From $0.10 per 1,000 queries with a 1-2 second response time!
+
+Deep SerpApi is a dedicated search engine designed for large language models (LLMs) and AI agents, aiming to provide real-time, accurate and unbiased information to help AI applications retrieve and process data efficiently.
+- covering 20+ Google SERP scenarios and mainstream search engines.
+- support real-time data updates to ensure real-time and accurate information.
+- It can integrate information from all available online channels and search engines.
+- Deep SerpApi will simplify the process of integrating dynamic web information into AI solutions, and ultimately achieve an ALL-in-One API for one-click search and extraction of web data.
+- **Developer Support Program**: Integrate Scrapeless Deep SerpApi into your AI tools, applications or projects. [We already support Dify, and will soon support frameworks such as Langchain, Langflow, FlowiseAI]. Then share your results on GitHub or social media, and you will get a 1-12 month free developer support opportunity, up to 500 free usage per month.
+- 🚀 **Scraping API**: Effortless and highly customizable data extraction with a single API call, providing structured data from any website.
+- ⚡ **Scraping Browser**: AI-powered and LLM-driven, it simulates human-like behavior with genuine fingerprints and headless browser support, ensuring seamless, block-free scraping.
+- 🌐 **Proxies**: Use high-quality, rotating proxies to scrape top platforms like Amazon, Shopee, and more, with global coverage in 195+ countries.
+
+
+[](https://www.scrapeless.com/en/product/deep-serp-api?utm_source=website&utm_medium=ads&utm_campaign=scraping&utm_term=d4vinci)
+---
+
 [Evomi](https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling) is your Swiss Quality Proxy Provider, starting at **$0.49/GB**
 
 - 👩💻 **$0.49 per GB Residential Proxies**: Our price is unbeatable
@@ -88,21 +104,6 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha
 [](https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling)
 ---
 
-[Scrapeless](https://www.scrapeless.com/?utm_source=github&utm_medium=ads&utm_campaign=scraping&utm_term=D4Vinci) is your all-in-one web scraping toolkit, starting at just $0.60 per 1k URLs!
-
-- 🚀 Scraping API: Effortless and highly customizable data extraction with a single API call, providing structured data from any website.
-- ⚡ Scraping Browser: AI-powered and LLM-driven, it simulates human-like behavior with genuine fingerprints and headless browser support, ensuring seamless, block-free scraping.
-- 🔒 Web Unlocker: Bypass CAPTCHAs, IP blocks, and dynamic content in real time, ensuring uninterrupted access.
-- 🌐 Proxies: Use high-quality, rotating proxies to scrape top platforms like Amazon, Shopee, and more, with global coverage in 195+ countries.
-- 💼 Enterprise-Grade: Custom solutions for large-scale and complex data needs.
-- 🎁 Free Trial: Try before you buy—experience our service firsthand.
-- 💬 Pay-Per-Use: Flexible, cost-effective pricing with no long-term commitments.
-- 🔧 Easy Integration: Seamlessly integrate with your existing tools and workflows for hassle-free automation.
-
-
-[](https://www.scrapeless.com/?utm_source=github&utm_medium=ads&utm_campaign=scraping&utm_term=D4Vinci)
----
-
 ## Table of content
 * [Key Features](#key-features)
 * [Fetch websites as you prefer](#fetch-websites-as-you-prefer-with-async-support)
@@ -172,7 +173,7 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha
 ## Getting Started
 
 ```python
-from scrapling import Fetcher
+from scrapling.fetchers import Fetcher
 
 fetcher = Fetcher(auto_match=False)
 
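This is the first of several identical README updates in this diff: examples now import from explicit submodules instead of the package root. Collected in one place, the new-style imports used throughout the updated METADATA are:

```python
from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, PlayWrightFetcher
from scrapling.parser import Adaptor
```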
@@ -254,7 +255,7 @@ Fetchers are interfaces built on top of other libraries with added features that
 ### Features
 You might be slightly confused by now so let me clear things up. All fetcher-type classes are imported in the same way
 ```python
-from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
+from scrapling.fetchers import Fetcher, StealthyFetcher, PlayWrightFetcher
 ```
 All of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `keep_cdata`, `storage`, and `storage_args`, which are the same ones you give to the `Adaptor` class.
 
@@ -286,7 +287,7 @@ You can route all traffic (HTTP and HTTPS) to a proxy for any of these methods i
 ```
 For Async requests, you will just replace the import like below:
 ```python
->> from scrapling import AsyncFetcher
+>> from scrapling.fetchers import AsyncFetcher
 >> page = await AsyncFetcher().get('https://httpbin.org/get', stealthy_headers=True, follow_redirects=True)
 >> page = await AsyncFetcher().post('https://httpbin.org/post', data={'key': 'value'}, proxy='http://username:password@localhost:8030')
 >> page = await AsyncFetcher().put('https://httpbin.org/put', data={'key': 'value'})
@@ -540,7 +541,7 @@ When website owners implement structural changes like
 The selector will no longer function and your code needs maintenance. That's where Scrapling's auto-matching feature comes into play.
 
 ```python
-from scrapling import Adaptor
+from scrapling.parser import Adaptor
 # Before the change
 page = Adaptor(page_source, url='example.com')
 element = page.css('#p1', auto_save=True)
@@ -558,7 +559,7 @@ To solve this issue, I will use [The Web Archive](https://archive.org/)'s [Wayba
 If I want to extract the Questions button from the old design I can use a selector like this `#hmenus > div:nth-child(1) > ul > li:nth-child(1) > a` This selector is too specific because it was generated by Google Chrome.
 Now let's test the same selector in both versions
 ```python
->> from scrapling import Fetcher
+>> from scrapling.fetchers import Fetcher
 >> selector = '#hmenus > div:nth-child(1) > ul > li:nth-child(1) > a'
 >> old_url = "https://web.archive.org/web/20100102003420/http://stackoverflow.com/"
 >> new_url = "https://stackoverflow.com/"
@@ -619,7 +620,7 @@ Note: The filtering process always starts from the first filter it finds in the
 Examples to clear any confusion :)
 
 ```python
->> from scrapling import Fetcher
+>> from scrapling.fetchers import Fetcher
 >> page = Fetcher().get('https://quotes.toscrape.com/')
 # Find all elements with tag name `div`.
 >> page.find_all('div')
{scrapling-0.2.96.dist-info → scrapling-0.2.98.dist-info}/RECORD
CHANGED
@@ -1,25 +1,25 @@
-scrapling/__init__.py,sha256=
+scrapling/__init__.py,sha256=S-SWj9O2r0Tu8Z-mPxDJ-z3h5k-bBfhFOETaCY4A9dc,1510
 scrapling/cli.py,sha256=7yTsMhVAqqS8Z27T5dFKrR9_X8vuFjBlwYgAF22W7T8,1292
-scrapling/defaults.py,sha256=
-scrapling/fetchers.py,sha256=
-scrapling/parser.py,sha256=
+scrapling/defaults.py,sha256=MAn2MMLBFvoe4i3u_qlp6YEvGUiCjNPPDux1cFCdpsU,866
+scrapling/fetchers.py,sha256=xwVCjAg0VCXwhB2igSLQvb0D0bOPGfg5WNtxgE7m-W0,34987
+scrapling/parser.py,sha256=1xS1UjCm1GVnKcVAtup9rSE1xuYPxXOgJe-8LJE5gUk,53956
 scrapling/py.typed,sha256=frcCV1k9oG9oKj3dpUqdJg1PxRT2RSN_XKdLCPjaYaY,2
 scrapling/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 scrapling/core/_types.py,sha256=dKVi_dUxdxNtTr7sj7ySkHXDfrsmjFTfpCQeO5tGuBY,670
-scrapling/core/custom_types.py,sha256=
+scrapling/core/custom_types.py,sha256=EWGx5t5scHEB1SMsitzc8duskq-5f-Qaj40IWkNTRzM,12947
 scrapling/core/mixins.py,sha256=sozbpaGL1_O_x3U-ABM5aYWpnxpCLfdbcA9SG3P7weY,3532
-scrapling/core/storage_adaptors.py,sha256=
-scrapling/core/translator.py,sha256=
-scrapling/core/utils.py,sha256=
+scrapling/core/storage_adaptors.py,sha256=gZbUpHtLOL7o_oZbES_o40r39zShxTeTM8YK6dXA5Zo,6214
+scrapling/core/translator.py,sha256=3a2VX9KR-q-GzwT1OgGDv1UlzIkvBggkQXUdiMyL-4c,5277
+scrapling/core/utils.py,sha256=KX88B3tV1-SgCAr69TUN3LfmsTDcLnEhYJiPuWd31yA,3704
 scrapling/engines/__init__.py,sha256=zA7tzqcDXP0hllwmjVewNHWipIA4JSU9mRG4J-cud0c,267
-scrapling/engines/camo.py,sha256=
+scrapling/engines/camo.py,sha256=oYKA0l3EpOcQW2APRj5FEmslqtp9A8i_ZljqlKvIDeI,16129
 scrapling/engines/constants.py,sha256=Gb_nXFoBB4ujJkd05SKkenMe1UDiRYQA3dkmA3DunLg,3723
-scrapling/engines/pw.py,sha256=
-scrapling/engines/static.py,sha256=
+scrapling/engines/pw.py,sha256=cZraIBWd9ulEGEdhETIGmpevi62CN9JGcUU1OIDdxkA,21369
+scrapling/engines/static.py,sha256=EjdaR0beqWfEKKavT7vlBnozoayQaVpqeVtaOuzd384,9306
 scrapling/engines/toolbelt/__init__.py,sha256=VQDdYm1zY9Apno6d8UrULk29vUjllZrQqD8mXL1E2Fc,402
-scrapling/engines/toolbelt/custom.py,sha256=
-scrapling/engines/toolbelt/fingerprints.py,sha256=
-scrapling/engines/toolbelt/navigation.py,sha256=
+scrapling/engines/toolbelt/custom.py,sha256=_-baGB8oOOHogbaddtGsq_K_01ccOjOkGA6tOKk28hM,12811
+scrapling/engines/toolbelt/fingerprints.py,sha256=Zzoqq3p6X_8D7eTxACz3z96cBZWWK61iKOGo2sZUtlg,2924
+scrapling/engines/toolbelt/navigation.py,sha256=fMjDgicqy2MoZZll2h5EvrrxkL6yNrC09v8isTpwAt0,4565
 scrapling/engines/toolbelt/bypasses/navigator_plugins.js,sha256=tbnnk3nCXB6QEQnOhDlu3n-s7lnUTAkrUsjP6FDQIQg,2104
 scrapling/engines/toolbelt/bypasses/notification_permission.js,sha256=poPM3o5WYgEX-EdiUfDCllpWfc3Umvw4jr2u6O6elus,237
 scrapling/engines/toolbelt/bypasses/pdf_viewer.js,sha256=mKjjSuP1-BOGC_2WhRYHJo_LP7lTBi2KXmP_zsHO_tI,173
@@ -41,9 +41,9 @@ tests/fetchers/sync/test_playwright.py,sha256=MEyDRaMyxDIWupG7f_xz0f0jd9Cpbd5rXC
 tests/parser/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/parser/test_automatch.py,sha256=SxsNdExE8zz8AcPRQFBUjZ3Q_1-tPOd9dzVvMSZpOYQ,4908
 tests/parser/test_general.py,sha256=dyfOsc8lleoY4AxcfDUBUaD1i95xecfYuTUhKBsYjwo,12100
-scrapling-0.2.
-scrapling-0.2.
-scrapling-0.2.
-scrapling-0.2.
-scrapling-0.2.
-scrapling-0.2.
+scrapling-0.2.98.dist-info/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
+scrapling-0.2.98.dist-info/METADATA,sha256=Un_ROxrGIvk_8w-ECQbwKAcJzYyx3MTWS1DHt9FRqdI,69718
+scrapling-0.2.98.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
+scrapling-0.2.98.dist-info/entry_points.txt,sha256=DHyt2Blxy0P5OE2HRcP95Wz9_xo2ERCDcNqrJjYS3o8,49
+scrapling-0.2.98.dist-info/top_level.txt,sha256=ub7FkOEXeYmmYTUxd4pCrwXfBfAMIpZ1sCGmXCc14tI,16
+scrapling-0.2.98.dist-info/RECORD,,
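For reference, the `sha256=` values in RECORD are urlsafe-base64 digests with the trailing `=` padding stripped, per the wheel spec. A minimal sketch of recomputing such an entry (the file path is hypothetical and must exist locally):

```python
import base64
import hashlib
from pathlib import Path

def record_digest(path: str) -> str:
    """Return a RECORD-style digest: urlsafe base64 of the SHA-256, unpadded."""
    digest = hashlib.sha256(Path(path).read_bytes()).digest()
    return base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")

# Hypothetical check against the unchanged entry above:
# scrapling/cli.py,sha256=7yTsMhVAqqS8Z27T5dFKrR9_X8vuFjBlwYgAF22W7T8,1292
print(record_digest("scrapling/cli.py"))
```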
{scrapling-0.2.96.dist-info → scrapling-0.2.98.dist-info}/LICENSE
File without changes
{scrapling-0.2.96.dist-info → scrapling-0.2.98.dist-info}/entry_points.txt
File without changes
{scrapling-0.2.96.dist-info → scrapling-0.2.98.dist-info}/top_level.txt
File without changes