scrapling-0.2.96-py3-none-any.whl → scrapling-0.2.98-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +35 -6
- scrapling/core/custom_types.py +1 -3
- scrapling/core/storage_adaptors.py +3 -3
- scrapling/core/translator.py +4 -1
- scrapling/core/utils.py +1 -1
- scrapling/defaults.py +18 -9
- scrapling/engines/camo.py +123 -104
- scrapling/engines/pw.py +100 -75
- scrapling/engines/static.py +22 -42
- scrapling/engines/toolbelt/custom.py +2 -2
- scrapling/engines/toolbelt/fingerprints.py +2 -2
- scrapling/engines/toolbelt/navigation.py +1 -1
- scrapling/fetchers.py +24 -24
- scrapling/parser.py +6 -12
- {scrapling-0.2.96.dist-info → scrapling-0.2.98.dist-info}/METADATA +23 -22
- {scrapling-0.2.96.dist-info → scrapling-0.2.98.dist-info}/RECORD +20 -20
- {scrapling-0.2.96.dist-info → scrapling-0.2.98.dist-info}/WHEEL +1 -1
- {scrapling-0.2.96.dist-info → scrapling-0.2.98.dist-info}/LICENSE +0 -0
- {scrapling-0.2.96.dist-info → scrapling-0.2.98.dist-info}/entry_points.txt +0 -0
- {scrapling-0.2.96.dist-info → scrapling-0.2.98.dist-info}/top_level.txt +0 -0
scrapling/parser.py
CHANGED
@@ -17,7 +17,7 @@ from scrapling.core.custom_types import (AttributesHandler, TextHandler,
 from scrapling.core.mixins import SelectorsGeneration
 from scrapling.core.storage_adaptors import (SQLiteStorageSystem,
                                              StorageSystemMixin, _StorageTools)
-from scrapling.core.translator import
+from scrapling.core.translator import translator_instance
 from scrapling.core.utils import (clean_spaces, flatten, html_forbidden,
                                   is_jsonable, log)
 
@@ -26,7 +26,7 @@ class Adaptor(SelectorsGeneration):
     __slots__ = (
         'url', 'encoding', '__auto_match_enabled', '_root', '_storage',
         '__keep_comments', '__huge_tree_enabled', '__attributes', '__text', '__tag',
-        '__keep_cdata'
+        '__keep_cdata'
     )
 
     def __init__(
@@ -71,21 +71,18 @@ class Adaptor(SelectorsGeneration):
         if root is None and not body and text is None:
             raise ValueError("Adaptor class needs text, body, or root arguments to work")
 
-        self.__text =
-        self.__raw_body = ''
+        self.__text = ''
         if root is None:
             if text is None:
                 if not body or not isinstance(body, bytes):
                     raise TypeError(f"body argument must be valid and of type bytes, got {body.__class__}")
 
                 body = body.replace(b"\x00", b"").strip()
-                self.__raw_body = body.replace(b"\x00", b"").strip().decode()
             else:
                 if not isinstance(text, str):
                     raise TypeError(f"text argument must be of type str, got {text.__class__}")
 
                 body = text.strip().replace("\x00", "").encode(encoding) or b"<html/>"
-                self.__raw_body = text.strip()
 
         # https://lxml.de/api/lxml.etree.HTMLParser-class.html
         parser = html.HTMLParser(
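Note on the hunk above: only the cached `__raw_body` copy is removed; the input normalization itself is unchanged. A minimal, self-contained sketch of the normalization the context lines keep (the sample text is illustrative):

```python
encoding = "utf-8"
text = "  <html>\x00<body>hello</body></html>  "

# Mirrors Adaptor.__init__ above: strip whitespace, drop NUL characters,
# encode to bytes, and fall back to a minimal document when empty.
body = text.strip().replace("\x00", "").encode(encoding) or b"<html/>"
print(body)  # b'<html><body>hello</body></html>'
```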
@@ -250,10 +247,7 @@ class Adaptor(SelectorsGeneration):
         """Return the inner html code of the element"""
         return TextHandler(etree.tostring(self._root, encoding='unicode', method='html', with_tail=False))
 
-    @property
-    def body(self) -> TextHandler:
-        """Return raw HTML code of the element/page without any processing when possible or return `Adaptor.html_content`"""
-        return TextHandler(self.__raw_body) or self.html_content
+    body = html_content
 
     def prettify(self) -> TextHandler:
         """Return a prettified version of the element's inner html-code"""
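With `__raw_body` gone, `body` becomes a plain alias for the `html_content` property, so both names now return lxml's re-serialized HTML rather than a cached copy of the raw input. A minimal sketch of the new behavior (the input string is illustrative):

```python
from scrapling.parser import Adaptor

page = Adaptor(text="<html><body><p>hello</p></body></html>", url="https://example.com")

# In 0.2.98 both names resolve to the same serialized-HTML property.
assert page.body == page.html_content
```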
@@ -476,7 +470,7 @@ class Adaptor(SelectorsGeneration):
         try:
             if not self.__auto_match_enabled or ',' not in selector:
                 # No need to split selectors in this case, let's save some CPU cycles :)
-                xpath_selector =
+                xpath_selector = translator_instance.css_to_xpath(selector)
                 return self.xpath(xpath_selector, identifier or selector, auto_match, auto_save, percentage)
 
             results = []
@@ -484,7 +478,7 @@ class Adaptor(SelectorsGeneration):
             for single_selector in split_selectors(selector):
                 # I'm doing this only so the `save` function save data correctly for combined selectors
                 # Like using the ',' to combine two different selectors that point to different elements.
-                xpath_selector =
+                xpath_selector = translator_instance.css_to_xpath(single_selector.canonical())
                 results += self.xpath(
                     xpath_selector, identifier or single_selector.canonical(), auto_match, auto_save, percentage
                 )
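Both `css` call sites above now route through a single module-level `translator_instance` instead of building a translator per query. A minimal sketch of the shared-instance usage (the selector string is illustrative):

```python
from scrapling.core.translator import translator_instance

# One shared translator converts CSS selectors to XPath for every query.
xpath_selector = translator_instance.css_to_xpath("div.quote > span.text")
print(xpath_selector)  # the equivalent XPath expression
```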
{scrapling-0.2.96.dist-info → scrapling-0.2.98.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: scrapling
-Version: 0.2.96
+Version: 0.2.98
 Summary: Scrapling is an undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy again! In an internet filled with complications,
 Home-page: https://github.com/D4Vinci/Scrapling
 Author: Karim Shoair
@@ -73,6 +73,22 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha
 
 # Sponsors
 
+[Scrapeless Deep SerpApi](https://www.scrapeless.com/en/product/deep-serp-api?utm_source=website&utm_medium=ads&utm_campaign=scraping&utm_term=d4vinci) From $0.10 per 1,000 queries with a 1-2 second response time!
+
+Deep SerpApi is a dedicated search engine designed for large language models (LLMs) and AI agents, aiming to provide real-time, accurate and unbiased information to help AI applications retrieve and process data efficiently.
+- covering 20+ Google SERP scenarios and mainstream search engines.
+- support real-time data updates to ensure real-time and accurate information.
+- It can integrate information from all available online channels and search engines.
+- Deep SerpApi will simplify the process of integrating dynamic web information into AI solutions, and ultimately achieve an ALL-in-One API for one-click search and extraction of web data.
+- **Developer Support Program**: Integrate Scrapeless Deep SerpApi into your AI tools, applications or projects. [We already support Dify, and will soon support frameworks such as Langchain, Langflow, FlowiseAI]. Then share your results on GitHub or social media, and you will get a 1-12 month free developer support opportunity, up to 500 free usage per month.
+- 🚀 **Scraping API**: Effortless and highly customizable data extraction with a single API call, providing structured data from any website.
+- ⚡ **Scraping Browser**: AI-powered and LLM-driven, it simulates human-like behavior with genuine fingerprints and headless browser support, ensuring seamless, block-free scraping.
+- 🌐 **Proxies**: Use high-quality, rotating proxies to scrape top platforms like Amazon, Shopee, and more, with global coverage in 195+ countries.
+
+
+[](https://www.scrapeless.com/en/product/deep-serp-api?utm_source=website&utm_medium=ads&utm_campaign=scraping&utm_term=d4vinci)
+---
+
 [Evomi](https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling) is your Swiss Quality Proxy Provider, starting at **$0.49/GB**
 
 - 👩💻 **$0.49 per GB Residential Proxies**: Our price is unbeatable
@@ -88,21 +104,6 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha
 [](https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling)
 ---
 
-[Scrapeless](https://www.scrapeless.com/?utm_source=github&utm_medium=ads&utm_campaign=scraping&utm_term=D4Vinci) is your all-in-one web scraping toolkit, starting at just $0.60 per 1k URLs!
-
-- 🚀 Scraping API: Effortless and highly customizable data extraction with a single API call, providing structured data from any website.
-- ⚡ Scraping Browser: AI-powered and LLM-driven, it simulates human-like behavior with genuine fingerprints and headless browser support, ensuring seamless, block-free scraping.
-- 🔒 Web Unlocker: Bypass CAPTCHAs, IP blocks, and dynamic content in real time, ensuring uninterrupted access.
-- 🌐 Proxies: Use high-quality, rotating proxies to scrape top platforms like Amazon, Shopee, and more, with global coverage in 195+ countries.
-- 💼 Enterprise-Grade: Custom solutions for large-scale and complex data needs.
-- 🎁 Free Trial: Try before you buy—experience our service firsthand.
-- 💬 Pay-Per-Use: Flexible, cost-effective pricing with no long-term commitments.
-- 🔧 Easy Integration: Seamlessly integrate with your existing tools and workflows for hassle-free automation.
-
-
-[](https://www.scrapeless.com/?utm_source=github&utm_medium=ads&utm_campaign=scraping&utm_term=D4Vinci)
----
-
 ## Table of content
 * [Key Features](#key-features)
 * [Fetch websites as you prefer](#fetch-websites-as-you-prefer-with-async-support)
@@ -172,7 +173,7 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha
 ## Getting Started
 
 ```python
-from scrapling import Fetcher
+from scrapling.fetchers import Fetcher
 
 fetcher = Fetcher(auto_match=False)
 
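This is the first of several identical README updates in this diff: examples now import from explicit submodules instead of the package root. Collected in one place, the new-style imports used throughout the updated METADATA are:

```python
from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, PlayWrightFetcher
from scrapling.parser import Adaptor
```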
@@ -254,7 +255,7 @@ Fetchers are interfaces built on top of other libraries with added features that
 ### Features
 You might be slightly confused by now so let me clear things up. All fetcher-type classes are imported in the same way
 ```python
-from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
+from scrapling.fetchers import Fetcher, StealthyFetcher, PlayWrightFetcher
 ```
 All of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `keep_cdata`, `storage`, and `storage_args`, which are the same ones you give to the `Adaptor` class.
 
@@ -286,7 +287,7 @@ You can route all traffic (HTTP and HTTPS) to a proxy for any of these methods i
 ```
 For Async requests, you will just replace the import like below:
 ```python
->> from scrapling import AsyncFetcher
+>> from scrapling.fetchers import AsyncFetcher
 >> page = await AsyncFetcher().get('https://httpbin.org/get', stealthy_headers=True, follow_redirects=True)
 >> page = await AsyncFetcher().post('https://httpbin.org/post', data={'key': 'value'}, proxy='http://username:password@localhost:8030')
 >> page = await AsyncFetcher().put('https://httpbin.org/put', data={'key': 'value'})
@@ -540,7 +541,7 @@ When website owners implement structural changes like
 The selector will no longer function and your code needs maintenance. That's where Scrapling's auto-matching feature comes into play.
 
 ```python
-from scrapling import Adaptor
+from scrapling.parser import Adaptor
 # Before the change
 page = Adaptor(page_source, url='example.com')
 element = page.css('#p1', auto_save=True)
@@ -558,7 +559,7 @@ To solve this issue, I will use [The Web Archive](https://archive.org/)'s [Wayba
 If I want to extract the Questions button from the old design I can use a selector like this `#hmenus > div:nth-child(1) > ul > li:nth-child(1) > a` This selector is too specific because it was generated by Google Chrome.
 Now let's test the same selector in both versions
 ```python
->> from scrapling import Fetcher
+>> from scrapling.fetchers import Fetcher
 >> selector = '#hmenus > div:nth-child(1) > ul > li:nth-child(1) > a'
 >> old_url = "https://web.archive.org/web/20100102003420/http://stackoverflow.com/"
 >> new_url = "https://stackoverflow.com/"
@@ -619,7 +620,7 @@ Note: The filtering process always starts from the first filter it finds in the
 Examples to clear any confusion :)
 
 ```python
->> from scrapling import Fetcher
+>> from scrapling.fetchers import Fetcher
 >> page = Fetcher().get('https://quotes.toscrape.com/')
 # Find all elements with tag name `div`.
 >> page.find_all('div')
{scrapling-0.2.96.dist-info → scrapling-0.2.98.dist-info}/RECORD
CHANGED
@@ -1,25 +1,25 @@
-scrapling/__init__.py,sha256=
+scrapling/__init__.py,sha256=S-SWj9O2r0Tu8Z-mPxDJ-z3h5k-bBfhFOETaCY4A9dc,1510
 scrapling/cli.py,sha256=7yTsMhVAqqS8Z27T5dFKrR9_X8vuFjBlwYgAF22W7T8,1292
-scrapling/defaults.py,sha256=
-scrapling/fetchers.py,sha256=
-scrapling/parser.py,sha256=
+scrapling/defaults.py,sha256=MAn2MMLBFvoe4i3u_qlp6YEvGUiCjNPPDux1cFCdpsU,866
+scrapling/fetchers.py,sha256=xwVCjAg0VCXwhB2igSLQvb0D0bOPGfg5WNtxgE7m-W0,34987
+scrapling/parser.py,sha256=1xS1UjCm1GVnKcVAtup9rSE1xuYPxXOgJe-8LJE5gUk,53956
 scrapling/py.typed,sha256=frcCV1k9oG9oKj3dpUqdJg1PxRT2RSN_XKdLCPjaYaY,2
 scrapling/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 scrapling/core/_types.py,sha256=dKVi_dUxdxNtTr7sj7ySkHXDfrsmjFTfpCQeO5tGuBY,670
-scrapling/core/custom_types.py,sha256=
+scrapling/core/custom_types.py,sha256=EWGx5t5scHEB1SMsitzc8duskq-5f-Qaj40IWkNTRzM,12947
 scrapling/core/mixins.py,sha256=sozbpaGL1_O_x3U-ABM5aYWpnxpCLfdbcA9SG3P7weY,3532
-scrapling/core/storage_adaptors.py,sha256=
-scrapling/core/translator.py,sha256=
-scrapling/core/utils.py,sha256=
+scrapling/core/storage_adaptors.py,sha256=gZbUpHtLOL7o_oZbES_o40r39zShxTeTM8YK6dXA5Zo,6214
+scrapling/core/translator.py,sha256=3a2VX9KR-q-GzwT1OgGDv1UlzIkvBggkQXUdiMyL-4c,5277
+scrapling/core/utils.py,sha256=KX88B3tV1-SgCAr69TUN3LfmsTDcLnEhYJiPuWd31yA,3704
 scrapling/engines/__init__.py,sha256=zA7tzqcDXP0hllwmjVewNHWipIA4JSU9mRG4J-cud0c,267
-scrapling/engines/camo.py,sha256=
+scrapling/engines/camo.py,sha256=oYKA0l3EpOcQW2APRj5FEmslqtp9A8i_ZljqlKvIDeI,16129
 scrapling/engines/constants.py,sha256=Gb_nXFoBB4ujJkd05SKkenMe1UDiRYQA3dkmA3DunLg,3723
-scrapling/engines/pw.py,sha256=
-scrapling/engines/static.py,sha256=
+scrapling/engines/pw.py,sha256=cZraIBWd9ulEGEdhETIGmpevi62CN9JGcUU1OIDdxkA,21369
+scrapling/engines/static.py,sha256=EjdaR0beqWfEKKavT7vlBnozoayQaVpqeVtaOuzd384,9306
 scrapling/engines/toolbelt/__init__.py,sha256=VQDdYm1zY9Apno6d8UrULk29vUjllZrQqD8mXL1E2Fc,402
-scrapling/engines/toolbelt/custom.py,sha256=
-scrapling/engines/toolbelt/fingerprints.py,sha256=
-scrapling/engines/toolbelt/navigation.py,sha256=
+scrapling/engines/toolbelt/custom.py,sha256=_-baGB8oOOHogbaddtGsq_K_01ccOjOkGA6tOKk28hM,12811
+scrapling/engines/toolbelt/fingerprints.py,sha256=Zzoqq3p6X_8D7eTxACz3z96cBZWWK61iKOGo2sZUtlg,2924
+scrapling/engines/toolbelt/navigation.py,sha256=fMjDgicqy2MoZZll2h5EvrrxkL6yNrC09v8isTpwAt0,4565
 scrapling/engines/toolbelt/bypasses/navigator_plugins.js,sha256=tbnnk3nCXB6QEQnOhDlu3n-s7lnUTAkrUsjP6FDQIQg,2104
 scrapling/engines/toolbelt/bypasses/notification_permission.js,sha256=poPM3o5WYgEX-EdiUfDCllpWfc3Umvw4jr2u6O6elus,237
 scrapling/engines/toolbelt/bypasses/pdf_viewer.js,sha256=mKjjSuP1-BOGC_2WhRYHJo_LP7lTBi2KXmP_zsHO_tI,173
@@ -41,9 +41,9 @@ tests/fetchers/sync/test_playwright.py,sha256=MEyDRaMyxDIWupG7f_xz0f0jd9Cpbd5rXC
 tests/parser/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/parser/test_automatch.py,sha256=SxsNdExE8zz8AcPRQFBUjZ3Q_1-tPOd9dzVvMSZpOYQ,4908
 tests/parser/test_general.py,sha256=dyfOsc8lleoY4AxcfDUBUaD1i95xecfYuTUhKBsYjwo,12100
-scrapling-0.2.
-scrapling-0.2.
-scrapling-0.2.
-scrapling-0.2.
-scrapling-0.2.
-scrapling-0.2.
+scrapling-0.2.98.dist-info/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
+scrapling-0.2.98.dist-info/METADATA,sha256=Un_ROxrGIvk_8w-ECQbwKAcJzYyx3MTWS1DHt9FRqdI,69718
+scrapling-0.2.98.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
+scrapling-0.2.98.dist-info/entry_points.txt,sha256=DHyt2Blxy0P5OE2HRcP95Wz9_xo2ERCDcNqrJjYS3o8,49
+scrapling-0.2.98.dist-info/top_level.txt,sha256=ub7FkOEXeYmmYTUxd4pCrwXfBfAMIpZ1sCGmXCc14tI,16
+scrapling-0.2.98.dist-info/RECORD,,
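For reference, the `sha256=` values in RECORD are urlsafe-base64 digests with the trailing `=` padding stripped, per the wheel spec. A minimal sketch of recomputing such an entry (the file path is hypothetical and must exist locally):

```python
import base64
import hashlib
from pathlib import Path

def record_digest(path: str) -> str:
    """Return a RECORD-style digest: urlsafe base64 of the SHA-256, unpadded."""
    digest = hashlib.sha256(Path(path).read_bytes()).digest()
    return base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")

# Hypothetical check against the unchanged entry above:
# scrapling/cli.py,sha256=7yTsMhVAqqS8Z27T5dFKrR9_X8vuFjBlwYgAF22W7T8,1292
print(record_digest("scrapling/cli.py"))
```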
{scrapling-0.2.96.dist-info → scrapling-0.2.98.dist-info}/LICENSE
File without changes
{scrapling-0.2.96.dist-info → scrapling-0.2.98.dist-info}/entry_points.txt
File without changes
{scrapling-0.2.96.dist-info → scrapling-0.2.98.dist-info}/top_level.txt
File without changes