scrapling 0.1.1__py3-none-any.whl → 0.1.2__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- scrapling/__init__.py +1 -1
- scrapling/parser.py +20 -3
- {scrapling-0.1.1.dist-info → scrapling-0.1.2.dist-info}/METADATA +5 -2
- scrapling-0.1.2.dist-info/RECORD +12 -0
- {scrapling-0.1.1.dist-info → scrapling-0.1.2.dist-info}/WHEEL +1 -1
- scrapling-0.1.1.dist-info/RECORD +0 -12
- {scrapling-0.1.1.dist-info → scrapling-0.1.2.dist-info}/LICENSE +0 -0
- {scrapling-0.1.1.dist-info → scrapling-0.1.2.dist-info}/top_level.txt +0 -0
scrapling/__init__.py
CHANGED
@@ -3,7 +3,7 @@ from scrapling.parser import Adaptor, Adaptors
|
|
3
3
|
from scrapling.custom_types import TextHandler, AttributesHandler
|
4
4
|
|
5
5
|
__author__ = "Karim Shoair (karim.shoair@pm.me)"
|
6
|
-
__version__ = "0.1.1"
|
6
|
+
__version__ = "0.1.2"
|
7
7
|
__copyright__ = "Copyright (c) 2024 Karim Shoair"
|
8
8
|
|
9
9
|
|
scrapling/parser.py
CHANGED
@@ -78,7 +78,7 @@ class Adaptor(SelectorsGeneration):
|
|
78
78
|
|
79
79
|
parser = html.HTMLParser(
|
80
80
|
# https://lxml.de/api/lxml.etree.HTMLParser-class.html
|
81
|
-
recover=True, remove_blank_text=True, remove_comments=(keep_comments is
|
81
|
+
recover=True, remove_blank_text=True, remove_comments=(keep_comments is False), encoding=encoding,
|
82
82
|
compact=True, huge_tree=huge_tree, default_doctype=True
|
83
83
|
)
|
84
84
|
self._root = etree.fromstring(body, parser=parser, base_url=url)
|
@@ -142,7 +142,8 @@ class Adaptor(SelectorsGeneration):
|
|
142
142
|
if issubclass(type(element), html.HtmlMixin):
|
143
143
|
return self.__class__(
|
144
144
|
root=element, url=self.url, encoding=self.encoding, auto_match=self.__auto_match_enabled,
|
145
|
-
keep_comments=
|
145
|
+
keep_comments=True, # if the comments are already removed in initialization, no need to try to delete them in sub-elements
|
146
|
+
huge_tree=self.__huge_tree_enabled, debug=self.__debug
|
146
147
|
)
|
147
148
|
return element
|
148
149
|
|
@@ -186,7 +187,23 @@ class Adaptor(SelectorsGeneration):
|
|
186
187
|
def text(self) -> TextHandler:
|
187
188
|
"""Get text content of the element"""
|
188
189
|
if not self.__text:
|
189
|
-
|
190
|
+
if self.__keep_comments:
|
191
|
+
if not self.children:
|
192
|
+
# If the user chose to keep comments, remove comments from the text
|
193
|
+
# Escape lxml default behaviour and remove comments like this `<span>CONDITION: <!-- -->Excellent</span>`
|
194
|
+
# This issue is present in parsel/scrapy as well; there is no need to repeat it here, so that the user can run regex on the full text.
|
195
|
+
code = self.html_content
|
196
|
+
parser = html.HTMLParser(
|
197
|
+
recover=True, remove_blank_text=True, remove_comments=True, encoding=self.encoding,
|
198
|
+
compact=True, huge_tree=self.__huge_tree_enabled, default_doctype=True
|
199
|
+
)
|
200
|
+
fragment_root = html.fragment_fromstring(code, parser=parser)
|
201
|
+
self.__text = TextHandler(fragment_root.text)
|
202
|
+
else:
|
203
|
+
self.__text = TextHandler(self._root.text)
|
204
|
+
else:
|
205
|
+
# If user already chose to not keep comments then all is good
|
206
|
+
self.__text = TextHandler(self._root.text)
|
190
207
|
return self.__text
|
191
208
|
|
192
209
|
def get_all_text(self, separator: str = "\n", strip: bool = False, ignore_tags: Tuple = ('script', 'style',), valid_values: bool = True) -> TextHandler:
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: scrapling
|
3
|
-
Version: 0.1.1
|
3
|
+
Version: 0.1.2
|
4
4
|
Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
|
5
5
|
Home-page: https://github.com/D4Vinci/Scrapling
|
6
6
|
Author: Karim Shoair
|
@@ -39,7 +39,7 @@ Requires-Dist: orjson >=3
|
|
39
39
|
Requires-Dist: tldextract
|
40
40
|
|
41
41
|
# 🕷️ Scrapling: Lightning-Fast, Adaptive Web Scraping for Python
|
42
|
-
[](https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml) [](https://badge.fury.io/py/Scrapling) [](https://pypi.org/project/scrapling/) [](https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml) [](https://badge.fury.io/py/Scrapling) [](https://pypi.org/project/scrapling/) [](https://pepy.tech/project/scrapling)
|
43
43
|
|
44
44
|
Dealing with failing web scrapers due to website changes? Meet Scrapling.
|
45
45
|
|
@@ -455,6 +455,9 @@ Of course, you can find elements by text/regex, find similar elements in a more
|
|
455
455
|
### Is Scrapling thread-safe?
|
456
456
|
Yes, Scrapling instances are thread-safe. Each Adaptor instance maintains its own state.
|
457
457
|
|
458
|
+
## Sponsors
|
459
|
+
[](https://www.capsolver.com/?utm_source=github&utm_medium=repo&utm_campaign=scraping&utm_term=Scrapling)
|
460
|
+
|
458
461
|
## Contributing
|
459
462
|
Everybody is invited and welcome to contribute to Scrapling. There is a lot to do!
|
460
463
|
|
@@ -0,0 +1,12 @@
|
|
1
|
+
scrapling/__init__.py,sha256=bxgmUv7rTGX8os8Spzxg4lDsNvVv1BWrHXQVDJu86r4,337
|
2
|
+
scrapling/custom_types.py,sha256=D4bAwpm3JIjfw4I2FS0odcq371OQAAUTkBN4ajPWX-4,6001
|
3
|
+
scrapling/mixins.py,sha256=m3ZvTyjt9JNtfZ4NYuaQsfi84UbFkatN_NEiLVZM7TY,2924
|
4
|
+
scrapling/parser.py,sha256=HNDq9OQzU3zpn31AfmreVlZv9kHielb8XoElSwLzK34,44650
|
5
|
+
scrapling/storage_adaptors.py,sha256=FEh0E8iss2ZDXAL1IDGxQVpEcdbLOupfmUwcsp_z-QY,6198
|
6
|
+
scrapling/translator.py,sha256=E0oueabsELxTGSv91q6AQgQnsN4-oQ76mq1u0jR2Ofo,5410
|
7
|
+
scrapling/utils.py,sha256=ApmNjCxxy-N_gAYRnzutLvPBvY_s9FTYp8UhdyeZXSc,5960
|
8
|
+
scrapling-0.1.2.dist-info/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
|
9
|
+
scrapling-0.1.2.dist-info/METADATA,sha256=-EErnYT0EABbPVgXR-eln3-XS-8haAlHDikNs3pZAKU,27357
|
10
|
+
scrapling-0.1.2.dist-info/WHEEL,sha256=OVMc5UfuAQiSplgO0_WdW7vXVGAt9Hdd6qtN4HotdyA,91
|
11
|
+
scrapling-0.1.2.dist-info/top_level.txt,sha256=Ud-yF-PC2U5HQ3nc5QwT7HSPdIpF1RuwQ_mYgBzHHIM,10
|
12
|
+
scrapling-0.1.2.dist-info/RECORD,,
|
scrapling-0.1.1.dist-info/RECORD
DELETED
@@ -1,12 +0,0 @@
|
|
1
|
-
scrapling/__init__.py,sha256=EgHQIn26VZ00qaWR6ANRmiGAQ1VQfwaVrWUssfyrD0Y,337
|
2
|
-
scrapling/custom_types.py,sha256=D4bAwpm3JIjfw4I2FS0odcq371OQAAUTkBN4ajPWX-4,6001
|
3
|
-
scrapling/mixins.py,sha256=m3ZvTyjt9JNtfZ4NYuaQsfi84UbFkatN_NEiLVZM7TY,2924
|
4
|
-
scrapling/parser.py,sha256=FGFa9ePo5vRvVDmKcj5qvXyppAy-LAz1AZbz-QKn1fs,43470
|
5
|
-
scrapling/storage_adaptors.py,sha256=FEh0E8iss2ZDXAL1IDGxQVpEcdbLOupfmUwcsp_z-QY,6198
|
6
|
-
scrapling/translator.py,sha256=E0oueabsELxTGSv91q6AQgQnsN4-oQ76mq1u0jR2Ofo,5410
|
7
|
-
scrapling/utils.py,sha256=ApmNjCxxy-N_gAYRnzutLvPBvY_s9FTYp8UhdyeZXSc,5960
|
8
|
-
scrapling-0.1.1.dist-info/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
|
9
|
-
scrapling-0.1.1.dist-info/METADATA,sha256=sffCnzPbmLaqzIL3C3ehbntKeFOevNYuQGiN9UnLMGo,27154
|
10
|
-
scrapling-0.1.1.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
|
11
|
-
scrapling-0.1.1.dist-info/top_level.txt,sha256=Ud-yF-PC2U5HQ3nc5QwT7HSPdIpF1RuwQ_mYgBzHHIM,10
|
12
|
-
scrapling-0.1.1.dist-info/RECORD,,
|
File without changes
|
File without changes
|