scrapling-0.2.96-py3-none-any.whl → scrapling-0.2.98-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
scrapling/parser.py CHANGED
@@ -17,7 +17,7 @@ from scrapling.core.custom_types import (AttributesHandler, TextHandler,
 from scrapling.core.mixins import SelectorsGeneration
 from scrapling.core.storage_adaptors import (SQLiteStorageSystem,
                                              StorageSystemMixin, _StorageTools)
-from scrapling.core.translator import HTMLTranslator
+from scrapling.core.translator import translator_instance
 from scrapling.core.utils import (clean_spaces, flatten, html_forbidden,
                                   is_jsonable, log)
 
@@ -26,7 +26,7 @@ class Adaptor(SelectorsGeneration):
     __slots__ = (
         'url', 'encoding', '__auto_match_enabled', '_root', '_storage',
         '__keep_comments', '__huge_tree_enabled', '__attributes', '__text', '__tag',
-        '__keep_cdata', '__raw_body'
+        '__keep_cdata'
     )
 
     def __init__(
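
A side note on the hunk above: 0.2.98 drops the `__raw_body` attribute entirely (see the `__init__` and `body` hunks below), so its `__slots__` entry is removed along with it; `__slots__` enumerates exactly the instance attributes a class allocates, and assigning an undeclared name raises `AttributeError`. A minimal sketch with a hypothetical class:

```python
# Minimal sketch (hypothetical class, not scrapling's code): a slotted
# class only allocates storage for the names listed in __slots__, so a
# dropped attribute's entry is removed to keep the declaration in sync,
# and assigning an undeclared name raises AttributeError.
class Slotted:
    __slots__ = ('kept',)


obj = Slotted()
obj.kept = 1         # declared slot: fine
try:
    obj.removed = 2  # no slot declared for this name
except AttributeError:
    print("no slot for 'removed'")
```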
@@ -71,21 +71,18 @@ class Adaptor(SelectorsGeneration):
         if root is None and not body and text is None:
             raise ValueError("Adaptor class needs text, body, or root arguments to work")
 
-        self.__text = None
-        self.__raw_body = ''
+        self.__text = ''
         if root is None:
             if text is None:
                 if not body or not isinstance(body, bytes):
                     raise TypeError(f"body argument must be valid and of type bytes, got {body.__class__}")
 
                 body = body.replace(b"\x00", b"").strip()
-                self.__raw_body = body.replace(b"\x00", b"").strip().decode()
             else:
                 if not isinstance(text, str):
                     raise TypeError(f"text argument must be of type str, got {text.__class__}")
 
                 body = text.strip().replace("\x00", "").encode(encoding) or b"<html/>"
-                self.__raw_body = text.strip()
 
         # https://lxml.de/api/lxml.etree.HTMLParser-class.html
         parser = html.HTMLParser(
@@ -250,10 +247,7 @@ class Adaptor(SelectorsGeneration):
         """Return the inner html code of the element"""
         return TextHandler(etree.tostring(self._root, encoding='unicode', method='html', with_tail=False))
 
-    @property
-    def body(self) -> TextHandler:
-        """Return raw HTML code of the element/page without any processing when possible or return `Adaptor.html_content`"""
-        return TextHandler(self.__raw_body) or self.html_content
+    body = html_content
 
     def prettify(self) -> TextHandler:
         """Return a prettified version of the element's inner html-code"""
@@ -476,7 +470,7 @@ class Adaptor(SelectorsGeneration):
         try:
             if not self.__auto_match_enabled or ',' not in selector:
                 # No need to split selectors in this case, let's save some CPU cycles :)
-                xpath_selector = HTMLTranslator().css_to_xpath(selector)
+                xpath_selector = translator_instance.css_to_xpath(selector)
                 return self.xpath(xpath_selector, identifier or selector, auto_match, auto_save, percentage)
 
             results = []
@@ -484,7 +478,7 @@ class Adaptor(SelectorsGeneration):
             for single_selector in split_selectors(selector):
                 # I'm doing this only so the `save` function saves data correctly for combined selectors
                 # Like using the ',' to combine two different selectors that point to different elements.
-                xpath_selector = HTMLTranslator().css_to_xpath(single_selector.canonical())
+                xpath_selector = translator_instance.css_to_xpath(single_selector.canonical())
                 results += self.xpath(
                     xpath_selector, identifier or single_selector.canonical(), auto_match, auto_save, percentage
                 )
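
Taken together, the translator hunks swap a per-call `HTMLTranslator()` construction for a shared module-level `translator_instance`, so every `.css()` call reuses one translator. A hedged sketch of what `scrapling/core/translator.py` plausibly does under this change; only the two imported names come from the diff, and the caching detail is an assumption, not a verbatim copy:

```python
# Build the stateless CSS-to-XPath translator once at module import
# instead of on every `.css()` call. `lru_cache` is an assumed extra:
# repeated selectors then hit the cache instead of being re-parsed.
from functools import lru_cache

from cssselect import HTMLTranslator as CSSTranslator


class HTMLTranslator(CSSTranslator):
    @lru_cache(maxsize=256)
    def css_to_xpath(self, css: str, prefix: str = 'descendant-or-self::') -> str:
        return super().css_to_xpath(css, prefix)


translator_instance = HTMLTranslator()  # shared by scrapling/parser.py

print(translator_instance.css_to_xpath('#p1'))  # descendant-or-self::*[@id = 'p1']
```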
scrapling-0.2.96.dist-info/METADATA → scrapling-0.2.98.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: scrapling
-Version: 0.2.96
+Version: 0.2.98
 Summary: Scrapling is an undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy again! In an internet filled with complications,
 Home-page: https://github.com/D4Vinci/Scrapling
 Author: Karim Shoair
@@ -73,6 +73,22 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha
 
 # Sponsors
 
+[Scrapeless Deep SerpApi](https://www.scrapeless.com/en/product/deep-serp-api?utm_source=website&utm_medium=ads&utm_campaign=scraping&utm_term=d4vinci): from $0.10 per 1,000 queries, with a 1-2 second response time!
+
+Deep SerpApi is a dedicated search engine designed for large language models (LLMs) and AI agents, aiming to provide real-time, accurate, and unbiased information so AI applications can retrieve and process data efficiently.
+- Covers 20+ Google SERP scenarios and mainstream search engines.
+- Supports real-time data updates, ensuring current and accurate information.
+- Integrates information from all available online channels and search engines.
+- Simplifies the integration of dynamic web information into AI solutions, working toward an all-in-one API for one-click search and extraction of web data.
+- **Developer Support Program**: Integrate Scrapeless Deep SerpApi into your AI tools, applications, or projects. [We already support Dify, and will soon support frameworks such as Langchain, Langflow, FlowiseAI]. Then share your results on GitHub or social media, and you will get 1-12 months of free developer support, with up to 500 free uses per month.
+- 🚀 **Scraping API**: Effortless and highly customizable data extraction with a single API call, providing structured data from any website.
+- ⚡ **Scraping Browser**: AI-powered and LLM-driven, it simulates human-like behavior with genuine fingerprints and headless browser support, ensuring seamless, block-free scraping.
+- 🌐 **Proxies**: Use high-quality, rotating proxies to scrape top platforms like Amazon, Shopee, and more, with global coverage in 195+ countries.
+
+
+[![Scrapeless Banner](https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/scrapeless.jpg)](https://www.scrapeless.com/en/product/deep-serp-api?utm_source=website&utm_medium=ads&utm_campaign=scraping&utm_term=d4vinci)
+---
+
 [Evomi](https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling) is your Swiss Quality Proxy Provider, starting at **$0.49/GB**
 
 - 👩‍💻 **$0.49 per GB Residential Proxies**: Our price is unbeatable
@@ -88,21 +104,6 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha
 [![Evomi Banner](https://my.evomi.com/images/brand/cta.png)](https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling)
 ---
 
-[Scrapeless](https://www.scrapeless.com/?utm_source=github&utm_medium=ads&utm_campaign=scraping&utm_term=D4Vinci) is your all-in-one web scraping toolkit, starting at just $0.60 per 1k URLs!
-
-- 🚀 Scraping API: Effortless and highly customizable data extraction with a single API call, providing structured data from any website.
-- ⚡ Scraping Browser: AI-powered and LLM-driven, it simulates human-like behavior with genuine fingerprints and headless browser support, ensuring seamless, block-free scraping.
-- 🔒 Web Unlocker: Bypass CAPTCHAs, IP blocks, and dynamic content in real time, ensuring uninterrupted access.
-- 🌐 Proxies: Use high-quality, rotating proxies to scrape top platforms like Amazon, Shopee, and more, with global coverage in 195+ countries.
-- 💼 Enterprise-Grade: Custom solutions for large-scale and complex data needs.
-- 🎁 Free Trial: Try before you buy—experience our service firsthand.
-- 💬 Pay-Per-Use: Flexible, cost-effective pricing with no long-term commitments.
-- 🔧 Easy Integration: Seamlessly integrate with your existing tools and workflows for hassle-free automation.
-
-
-[![Scrapeless Banner](https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/scrapeless.jpg)](https://www.scrapeless.com/?utm_source=github&utm_medium=ads&utm_campaign=scraping&utm_term=D4Vinci)
----
-
 ## Table of content
 * [Key Features](#key-features)
 * [Fetch websites as you prefer](#fetch-websites-as-you-prefer-with-async-support)
@@ -172,7 +173,7 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha
 ## Getting Started
 
 ```python
-from scrapling import Fetcher
+from scrapling.fetchers import Fetcher
 
 fetcher = Fetcher(auto_match=False)
 
@@ -254,7 +255,7 @@ Fetchers are interfaces built on top of other libraries with added features that
 ### Features
 You might be slightly confused by now, so let me clear things up. All fetcher-type classes are imported in the same way:
 ```python
-from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
+from scrapling.fetchers import Fetcher, StealthyFetcher, PlayWrightFetcher
 ```
 All of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `keep_cdata`, `storage`, and `storage_args`, which are the same ones you give to the `Adaptor` class.
 
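
As a hedged illustration of the hunk above, here is how those initialization arguments might be passed; the argument names come from the README text itself, while the chosen values and the `page.status` attribute are illustrative assumptions:

```python
# Passing the fetcher initialization arguments the README lists;
# values here are arbitrary examples, not recommended defaults.
from scrapling.fetchers import Fetcher

fetcher = Fetcher(auto_match=False, huge_tree=True, keep_comments=False, keep_cdata=False)
page = fetcher.get('https://quotes.toscrape.com/', stealthy_headers=True)
print(page.status)  # assumption: the returned page exposes the HTTP status
```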
@@ -286,7 +287,7 @@ You can route all traffic (HTTP and HTTPS) to a proxy for any of these methods i
 ```
 For async requests, you just replace the import as shown below:
 ```python
->> from scrapling import AsyncFetcher
+>> from scrapling.fetchers import AsyncFetcher
 >> page = await AsyncFetcher().get('https://httpbin.org/get', stealthy_headers=True, follow_redirects=True)
 >> page = await AsyncFetcher().post('https://httpbin.org/post', data={'key': 'value'}, proxy='http://username:password@localhost:8030')
 >> page = await AsyncFetcher().put('https://httpbin.org/put', data={'key': 'value'})
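
For completeness, a self-contained version of the async snippet: the `AsyncFetcher` calls are exactly the ones shown in the hunk above, while the `asyncio` wrapper is standard-library boilerplate, not scrapling API:

```python
# Running the README's AsyncFetcher example inside an event loop.
import asyncio

from scrapling.fetchers import AsyncFetcher


async def main() -> None:
    fetcher = AsyncFetcher()
    page = await fetcher.get('https://httpbin.org/get', stealthy_headers=True, follow_redirects=True)
    print(page.status)  # assumption: the returned page exposes the HTTP status


asyncio.run(main())
```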
@@ -540,7 +541,7 @@ When website owners implement structural changes like
 The selector will no longer function and your code needs maintenance. That's where Scrapling's auto-matching feature comes into play.
 
 ```python
-from scrapling import Adaptor
+from scrapling.parser import Adaptor
 # Before the change
 page = Adaptor(page_source, url='example.com')
 element = page.css('#p1', auto_save=True)
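
In the full README this example continues with the matching "after the change" call; a hedged, self-contained paraphrase (the placeholder HTML strings and the `auto_match` constructor flag are illustrative, not quoted from the diff):

```python
# Sketch of the auto-matching round trip: save the element's identity
# with `auto_save=True`, then re-locate it with `auto_match=True` after
# the site changes and '#p1' no longer matches.
from scrapling.parser import Adaptor

old_source = '<html><body><p id="p1">target</p></body></html>'  # placeholder HTML
new_source = '<html><body><p id="p2">target</p></body></html>'  # placeholder HTML

# Before the change: select and save the element's identity.
page = Adaptor(old_source, url='example.com', auto_match=True)
element = page.css('#p1', auto_save=True)

# After the change: the selector is stale, but auto-matching can still find it.
page = Adaptor(new_source, url='example.com', auto_match=True)
element = page.css('#p1', auto_match=True)
```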
@@ -558,7 +559,7 @@ To solve this issue, I will use [The Web Archive](https://archive.org/)'s [Wayba
 If I want to extract the Questions button from the old design, I can use a selector like this: `#hmenus > div:nth-child(1) > ul > li:nth-child(1) > a`. This selector is too specific because it was generated by Google Chrome.
 Now let's test the same selector in both versions:
 ```python
->> from scrapling import Fetcher
+>> from scrapling.fetchers import Fetcher
 >> selector = '#hmenus > div:nth-child(1) > ul > li:nth-child(1) > a'
 >> old_url = "https://web.archive.org/web/20100102003420/http://stackoverflow.com/"
 >> new_url = "https://stackoverflow.com/"
@@ -619,7 +620,7 @@ Note: The filtering process always starts from the first filter it finds in the
 Examples to clear any confusion :)
 
 ```python
->> from scrapling import Fetcher
+>> from scrapling.fetchers import Fetcher
 >> page = Fetcher().get('https://quotes.toscrape.com/')
 # Find all elements with tag name `div`.
 >> page.find_all('div')
scrapling-0.2.96.dist-info/RECORD → scrapling-0.2.98.dist-info/RECORD RENAMED
@@ -1,25 +1,25 @@
-scrapling/__init__.py,sha256=5r6_yxrfXbeoh8UqUaCdmmbWH9TQxBivP9cLWUXPI5g,500
+scrapling/__init__.py,sha256=S-SWj9O2r0Tu8Z-mPxDJ-z3h5k-bBfhFOETaCY4A9dc,1510
 scrapling/cli.py,sha256=7yTsMhVAqqS8Z27T5dFKrR9_X8vuFjBlwYgAF22W7T8,1292
-scrapling/defaults.py,sha256=sdXeZjXEX7PmCtaa0weK0nRrAUzqZukNNqipZ_sltYE,469
-scrapling/fetchers.py,sha256=qmiJ6S-bnPWvP48Z6rKxBnSuR-tdwHlJwlIsYxGxFM0,35405
-scrapling/parser.py,sha256=b_1eHxRwHRCidyvm3F6ST6qIYvVEVU6GhTTCI1LblVk,54330
+scrapling/defaults.py,sha256=MAn2MMLBFvoe4i3u_qlp6YEvGUiCjNPPDux1cFCdpsU,866
+scrapling/fetchers.py,sha256=xwVCjAg0VCXwhB2igSLQvb0D0bOPGfg5WNtxgE7m-W0,34987
+scrapling/parser.py,sha256=1xS1UjCm1GVnKcVAtup9rSE1xuYPxXOgJe-8LJE5gUk,53956
 scrapling/py.typed,sha256=frcCV1k9oG9oKj3dpUqdJg1PxRT2RSN_XKdLCPjaYaY,2
 scrapling/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 scrapling/core/_types.py,sha256=dKVi_dUxdxNtTr7sj7ySkHXDfrsmjFTfpCQeO5tGuBY,670
-scrapling/core/custom_types.py,sha256=tejeLYmWa_aLaLtMSymG4z7h6rxO-9EvmiRWEWcW54s,13022
+scrapling/core/custom_types.py,sha256=EWGx5t5scHEB1SMsitzc8duskq-5f-Qaj40IWkNTRzM,12947
 scrapling/core/mixins.py,sha256=sozbpaGL1_O_x3U-ABM5aYWpnxpCLfdbcA9SG3P7weY,3532
-scrapling/core/storage_adaptors.py,sha256=l_ZYcdn1y69AcoPuRrPoaxqKysN62pMExrwJWYdu5MA,6220
-scrapling/core/translator.py,sha256=hFSc3mxG5pYhbwRgingeFbD_E73U799vCsvVv0uFEXw,5237
-scrapling/core/utils.py,sha256=03LzCDzmeK1TXPjIKVzHSUgSfhpe36XE8AwxlgxzJoU,3705
+scrapling/core/storage_adaptors.py,sha256=gZbUpHtLOL7o_oZbES_o40r39zShxTeTM8YK6dXA5Zo,6214
+scrapling/core/translator.py,sha256=3a2VX9KR-q-GzwT1OgGDv1UlzIkvBggkQXUdiMyL-4c,5277
+scrapling/core/utils.py,sha256=KX88B3tV1-SgCAr69TUN3LfmsTDcLnEhYJiPuWd31yA,3704
 scrapling/engines/__init__.py,sha256=zA7tzqcDXP0hllwmjVewNHWipIA4JSU9mRG4J-cud0c,267
-scrapling/engines/camo.py,sha256=SHMRnIrN6599upo5-G3fZQ10455xyB-bB_EsLMjBStA,16072
+scrapling/engines/camo.py,sha256=oYKA0l3EpOcQW2APRj5FEmslqtp9A8i_ZljqlKvIDeI,16129
 scrapling/engines/constants.py,sha256=Gb_nXFoBB4ujJkd05SKkenMe1UDiRYQA3dkmA3DunLg,3723
-scrapling/engines/pw.py,sha256=LvS1jvTf3s7mfdeQo7_OyQ5zpiOzvBu5g88hOLlQBCQ,20856
-scrapling/engines/static.py,sha256=8v6RmdsSP6fAtWNXaJG24evHPsZ2oDiBl7yfkLrdARU,10635
+scrapling/engines/pw.py,sha256=cZraIBWd9ulEGEdhETIGmpevi62CN9JGcUU1OIDdxkA,21369
+scrapling/engines/static.py,sha256=EjdaR0beqWfEKKavT7vlBnozoayQaVpqeVtaOuzd384,9306
 scrapling/engines/toolbelt/__init__.py,sha256=VQDdYm1zY9Apno6d8UrULk29vUjllZrQqD8mXL1E2Fc,402
-scrapling/engines/toolbelt/custom.py,sha256=qgONLwpxUoEIAIQBF1RcakYu8cqAAmX8qdyaol5hfjA,12813
-scrapling/engines/toolbelt/fingerprints.py,sha256=ajEHdXHr7W4hw9KcNS7XlyxNBZu37p1bRj18TiICLzU,2929
-scrapling/engines/toolbelt/navigation.py,sha256=xEfZRJefuxOCGxQOSI2llS0du0Y2XmoIPdVGUSHOd7k,4567
+scrapling/engines/toolbelt/custom.py,sha256=_-baGB8oOOHogbaddtGsq_K_01ccOjOkGA6tOKk28hM,12811
+scrapling/engines/toolbelt/fingerprints.py,sha256=Zzoqq3p6X_8D7eTxACz3z96cBZWWK61iKOGo2sZUtlg,2924
+scrapling/engines/toolbelt/navigation.py,sha256=fMjDgicqy2MoZZll2h5EvrrxkL6yNrC09v8isTpwAt0,4565
 scrapling/engines/toolbelt/bypasses/navigator_plugins.js,sha256=tbnnk3nCXB6QEQnOhDlu3n-s7lnUTAkrUsjP6FDQIQg,2104
 scrapling/engines/toolbelt/bypasses/notification_permission.js,sha256=poPM3o5WYgEX-EdiUfDCllpWfc3Umvw4jr2u6O6elus,237
 scrapling/engines/toolbelt/bypasses/pdf_viewer.js,sha256=mKjjSuP1-BOGC_2WhRYHJo_LP7lTBi2KXmP_zsHO_tI,173
@@ -41,9 +41,9 @@ tests/fetchers/sync/test_playwright.py,sha256=MEyDRaMyxDIWupG7f_xz0f0jd9Cpbd5rXC
 tests/parser/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/parser/test_automatch.py,sha256=SxsNdExE8zz8AcPRQFBUjZ3Q_1-tPOd9dzVvMSZpOYQ,4908
 tests/parser/test_general.py,sha256=dyfOsc8lleoY4AxcfDUBUaD1i95xecfYuTUhKBsYjwo,12100
-scrapling-0.2.96.dist-info/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
-scrapling-0.2.96.dist-info/METADATA,sha256=yNRmjMR5qmJyH_6ob-6nwLuqD6iXIegMI-d-xQ95ZpA,69063
-scrapling-0.2.96.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
-scrapling-0.2.96.dist-info/entry_points.txt,sha256=DHyt2Blxy0P5OE2HRcP95Wz9_xo2ERCDcNqrJjYS3o8,49
-scrapling-0.2.96.dist-info/top_level.txt,sha256=ub7FkOEXeYmmYTUxd4pCrwXfBfAMIpZ1sCGmXCc14tI,16
-scrapling-0.2.96.dist-info/RECORD,,
+scrapling-0.2.98.dist-info/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
+scrapling-0.2.98.dist-info/METADATA,sha256=Un_ROxrGIvk_8w-ECQbwKAcJzYyx3MTWS1DHt9FRqdI,69718
+scrapling-0.2.98.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
+scrapling-0.2.98.dist-info/entry_points.txt,sha256=DHyt2Blxy0P5OE2HRcP95Wz9_xo2ERCDcNqrJjYS3o8,49
+scrapling-0.2.98.dist-info/top_level.txt,sha256=ub7FkOEXeYmmYTUxd4pCrwXfBfAMIpZ1sCGmXCc14tI,16
+scrapling-0.2.98.dist-info/RECORD,,
scrapling-0.2.96.dist-info/WHEEL → scrapling-0.2.98.dist-info/WHEEL RENAMED
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (75.8.2)
+Generator: setuptools (76.0.0)
 Root-Is-Purelib: true
 Tag: py3-none-any
 