scrapling 0.1__py3-none-any.whl → 0.1.2__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- scrapling/__init__.py +1 -1
- scrapling/parser.py +20 -3
- {scrapling-0.1.dist-info → scrapling-0.1.2.dist-info}/METADATA +7 -5
- scrapling-0.1.2.dist-info/RECORD +12 -0
- {scrapling-0.1.dist-info → scrapling-0.1.2.dist-info}/WHEEL +1 -1
- scrapling-0.1.dist-info/RECORD +0 -12
- {scrapling-0.1.dist-info → scrapling-0.1.2.dist-info}/LICENSE +0 -0
- {scrapling-0.1.dist-info → scrapling-0.1.2.dist-info}/top_level.txt +0 -0
scrapling/__init__.py
CHANGED
@@ -3,7 +3,7 @@ from scrapling.parser import Adaptor, Adaptors
|
|
3
3
|
from scrapling.custom_types import TextHandler, AttributesHandler
|
4
4
|
|
5
5
|
__author__ = "Karim Shoair (karim.shoair@pm.me)"
|
6
|
-
__version__ = "0.1"
|
6
|
+
__version__ = "0.1.2"
|
7
7
|
__copyright__ = "Copyright (c) 2024 Karim Shoair"
|
8
8
|
|
9
9
|
|
scrapling/parser.py
CHANGED
@@ -78,7 +78,7 @@ class Adaptor(SelectorsGeneration):
|
|
78
78
|
|
79
79
|
parser = html.HTMLParser(
|
80
80
|
# https://lxml.de/api/lxml.etree.HTMLParser-class.html
|
81
|
-
recover=True, remove_blank_text=True, remove_comments=(keep_comments is
|
81
|
+
recover=True, remove_blank_text=True, remove_comments=(keep_comments is False), encoding=encoding,
|
82
82
|
compact=True, huge_tree=huge_tree, default_doctype=True
|
83
83
|
)
|
84
84
|
self._root = etree.fromstring(body, parser=parser, base_url=url)
|
@@ -142,7 +142,8 @@ class Adaptor(SelectorsGeneration):
|
|
142
142
|
if issubclass(type(element), html.HtmlMixin):
|
143
143
|
return self.__class__(
|
144
144
|
root=element, url=self.url, encoding=self.encoding, auto_match=self.__auto_match_enabled,
|
145
|
-
keep_comments=
|
145
|
+
keep_comments=True, # if the comments are already removed in initialization, no need to try to delete them in sub-elements
|
146
|
+
huge_tree=self.__huge_tree_enabled, debug=self.__debug
|
146
147
|
)
|
147
148
|
return element
|
148
149
|
|
@@ -186,7 +187,23 @@ class Adaptor(SelectorsGeneration):
|
|
186
187
|
def text(self) -> TextHandler:
|
187
188
|
"""Get text content of the element"""
|
188
189
|
if not self.__text:
|
189
|
-
|
190
|
+
if self.__keep_comments:
|
191
|
+
if not self.children:
|
192
|
+
# If the user chose to keep comments, remove comments from text
|
193
|
+
# Escape lxml default behaviour and remove comments like this `<span>CONDITION: <!-- -->Excellent</span>`
|
194
|
+
# This issue is present in parsel/scrapy as well; avoid repeating it here so the user can run regex on the full text.
|
195
|
+
code = self.html_content
|
196
|
+
parser = html.HTMLParser(
|
197
|
+
recover=True, remove_blank_text=True, remove_comments=True, encoding=self.encoding,
|
198
|
+
compact=True, huge_tree=self.__huge_tree_enabled, default_doctype=True
|
199
|
+
)
|
200
|
+
fragment_root = html.fragment_fromstring(code, parser=parser)
|
201
|
+
self.__text = TextHandler(fragment_root.text)
|
202
|
+
else:
|
203
|
+
self.__text = TextHandler(self._root.text)
|
204
|
+
else:
|
205
|
+
# If user already chose to not keep comments then all is good
|
206
|
+
self.__text = TextHandler(self._root.text)
|
190
207
|
return self.__text
|
191
208
|
|
192
209
|
def get_all_text(self, separator: str = "\n", strip: bool = False, ignore_tags: Tuple = ('script', 'style',), valid_values: bool = True) -> TextHandler:
|
@@ -1,12 +1,12 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: scrapling
|
3
|
-
Version: 0.1
|
3
|
+
Version: 0.1.2
|
4
4
|
Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
|
5
5
|
Home-page: https://github.com/D4Vinci/Scrapling
|
6
6
|
Author: Karim Shoair
|
7
7
|
Author-email: karim.shoair@pm.me
|
8
8
|
License: BSD
|
9
|
-
Project-URL: Documentation, https://github.com/D4Vinci/Scrapling/
|
9
|
+
Project-URL: Documentation, https://github.com/D4Vinci/Scrapling/tree/main/docs
|
10
10
|
Project-URL: Source, https://github.com/D4Vinci/Scrapling
|
11
11
|
Project-URL: Tracker, https://github.com/D4Vinci/Scrapling/issues
|
12
12
|
Classifier: Operating System :: OS Independent
|
@@ -20,7 +20,6 @@ Classifier: Topic :: Text Processing :: Markup :: HTML
|
|
20
20
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
21
21
|
Classifier: Programming Language :: Python :: 3
|
22
22
|
Classifier: Programming Language :: Python :: 3 :: Only
|
23
|
-
Classifier: Programming Language :: Python :: 3.6
|
24
23
|
Classifier: Programming Language :: Python :: 3.7
|
25
24
|
Classifier: Programming Language :: Python :: 3.8
|
26
25
|
Classifier: Programming Language :: Python :: 3.9
|
@@ -40,7 +39,7 @@ Requires-Dist: orjson >=3
|
|
40
39
|
Requires-Dist: tldextract
|
41
40
|
|
42
41
|
# 🕷️ Scrapling: Lightning-Fast, Adaptive Web Scraping for Python
|
43
|
-
[](https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml) [](https://badge.fury.io/py/Scrapling) [](https://pypi.org/project/scrapling/) [](https://pepy.tech/project/scrapling)
|
44
43
|
|
45
44
|
Dealing with failing web scrapers due to website changes? Meet Scrapling.
|
46
45
|
|
@@ -78,7 +77,7 @@ products = page.css('.product', auto_match=True) # Still finds them!
|
|
78
77
|
|
79
78
|
## Getting Started
|
80
79
|
|
81
|
-
Let's walk through a basic example that demonstrates small group of Scrapling's core features:
|
80
|
+
Let's walk through a basic example that demonstrates a small group of Scrapling's core features:
|
82
81
|
|
83
82
|
```python
|
84
83
|
import requests
|
@@ -456,6 +455,9 @@ Of course, you can find elements by text/regex, find similar elements in a more
|
|
456
455
|
### Is Scrapling thread-safe?
|
457
456
|
Yes, Scrapling instances are thread-safe. Each Adaptor instance maintains its own state.
|
458
457
|
|
458
|
+
## Sponsors
|
459
|
+
[CapSolver](https://www.capsolver.com/?utm_source=github&utm_medium=repo&utm_campaign=scraping&utm_term=Scrapling)
|
460
|
+
|
459
461
|
## Contributing
|
460
462
|
Everybody is invited and welcome to contribute to Scrapling. There is a lot to do!
|
461
463
|
|
@@ -0,0 +1,12 @@
|
|
1
|
+
scrapling/__init__.py,sha256=bxgmUv7rTGX8os8Spzxg4lDsNvVv1BWrHXQVDJu86r4,337
|
2
|
+
scrapling/custom_types.py,sha256=D4bAwpm3JIjfw4I2FS0odcq371OQAAUTkBN4ajPWX-4,6001
|
3
|
+
scrapling/mixins.py,sha256=m3ZvTyjt9JNtfZ4NYuaQsfi84UbFkatN_NEiLVZM7TY,2924
|
4
|
+
scrapling/parser.py,sha256=HNDq9OQzU3zpn31AfmreVlZv9kHielb8XoElSwLzK34,44650
|
5
|
+
scrapling/storage_adaptors.py,sha256=FEh0E8iss2ZDXAL1IDGxQVpEcdbLOupfmUwcsp_z-QY,6198
|
6
|
+
scrapling/translator.py,sha256=E0oueabsELxTGSv91q6AQgQnsN4-oQ76mq1u0jR2Ofo,5410
|
7
|
+
scrapling/utils.py,sha256=ApmNjCxxy-N_gAYRnzutLvPBvY_s9FTYp8UhdyeZXSc,5960
|
8
|
+
scrapling-0.1.2.dist-info/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
|
9
|
+
scrapling-0.1.2.dist-info/METADATA,sha256=-EErnYT0EABbPVgXR-eln3-XS-8haAlHDikNs3pZAKU,27357
|
10
|
+
scrapling-0.1.2.dist-info/WHEEL,sha256=OVMc5UfuAQiSplgO0_WdW7vXVGAt9Hdd6qtN4HotdyA,91
|
11
|
+
scrapling-0.1.2.dist-info/top_level.txt,sha256=Ud-yF-PC2U5HQ3nc5QwT7HSPdIpF1RuwQ_mYgBzHHIM,10
|
12
|
+
scrapling-0.1.2.dist-info/RECORD,,
|
scrapling-0.1.dist-info/RECORD
DELETED
@@ -1,12 +0,0 @@
|
|
1
|
-
scrapling/__init__.py,sha256=WXWIoBQvms7OwK3KzfASCOCdZibvesbadaDXjjvDHVA,335
|
2
|
-
scrapling/custom_types.py,sha256=D4bAwpm3JIjfw4I2FS0odcq371OQAAUTkBN4ajPWX-4,6001
|
3
|
-
scrapling/mixins.py,sha256=m3ZvTyjt9JNtfZ4NYuaQsfi84UbFkatN_NEiLVZM7TY,2924
|
4
|
-
scrapling/parser.py,sha256=FGFa9ePo5vRvVDmKcj5qvXyppAy-LAz1AZbz-QKn1fs,43470
|
5
|
-
scrapling/storage_adaptors.py,sha256=FEh0E8iss2ZDXAL1IDGxQVpEcdbLOupfmUwcsp_z-QY,6198
|
6
|
-
scrapling/translator.py,sha256=E0oueabsELxTGSv91q6AQgQnsN4-oQ76mq1u0jR2Ofo,5410
|
7
|
-
scrapling/utils.py,sha256=ApmNjCxxy-N_gAYRnzutLvPBvY_s9FTYp8UhdyeZXSc,5960
|
8
|
-
scrapling-0.1.dist-info/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
|
9
|
-
scrapling-0.1.dist-info/METADATA,sha256=8fSe6Vsr8E-uVgnv1ZAs9nJHW6zTNF6Gnny-pq-1YfA,27037
|
10
|
-
scrapling-0.1.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
|
11
|
-
scrapling-0.1.dist-info/top_level.txt,sha256=Ud-yF-PC2U5HQ3nc5QwT7HSPdIpF1RuwQ_mYgBzHHIM,10
|
12
|
-
scrapling-0.1.dist-info/RECORD,,
|
File without changes
|
File without changes
|