scrapling 0.1.1__tar.gz → 0.1.2__tar.gz
Sign up to get free protection for your applications and to get access to all the features.
- {scrapling-0.1.1 → scrapling-0.1.2}/MANIFEST.in +2 -0
- {scrapling-0.1.1/scrapling.egg-info → scrapling-0.1.2}/PKG-INFO +5 -2
- {scrapling-0.1.1 → scrapling-0.1.2}/README.md +4 -1
- {scrapling-0.1.1 → scrapling-0.1.2}/scrapling/__init__.py +1 -1
- {scrapling-0.1.1 → scrapling-0.1.2}/scrapling/parser.py +20 -3
- {scrapling-0.1.1 → scrapling-0.1.2/scrapling.egg-info}/PKG-INFO +5 -2
- {scrapling-0.1.1 → scrapling-0.1.2}/setup.cfg +1 -1
- {scrapling-0.1.1 → scrapling-0.1.2}/setup.py +1 -1
- {scrapling-0.1.1 → scrapling-0.1.2}/LICENSE +0 -0
- {scrapling-0.1.1 → scrapling-0.1.2}/scrapling/custom_types.py +0 -0
- {scrapling-0.1.1 → scrapling-0.1.2}/scrapling/mixins.py +0 -0
- {scrapling-0.1.1 → scrapling-0.1.2}/scrapling/storage_adaptors.py +0 -0
- {scrapling-0.1.1 → scrapling-0.1.2}/scrapling/translator.py +0 -0
- {scrapling-0.1.1 → scrapling-0.1.2}/scrapling/utils.py +0 -0
- {scrapling-0.1.1 → scrapling-0.1.2}/scrapling.egg-info/SOURCES.txt +0 -0
- {scrapling-0.1.1 → scrapling-0.1.2}/scrapling.egg-info/dependency_links.txt +0 -0
- {scrapling-0.1.1 → scrapling-0.1.2}/scrapling.egg-info/not-zip-safe +0 -0
- {scrapling-0.1.1 → scrapling-0.1.2}/scrapling.egg-info/requires.txt +0 -0
- {scrapling-0.1.1 → scrapling-0.1.2}/scrapling.egg-info/top_level.txt +0 -0
- {scrapling-0.1.1 → scrapling-0.1.2}/tests/test_all_functions.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: scrapling
|
3
|
-
Version: 0.1.1
|
3
|
+
Version: 0.1.2
|
4
4
|
Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
|
5
5
|
Home-page: https://github.com/D4Vinci/Scrapling
|
6
6
|
Author: Karim Shoair
|
@@ -39,7 +39,7 @@ Requires-Dist: orjson>=3
|
|
39
39
|
Requires-Dist: tldextract
|
40
40
|
|
41
41
|
# 🕷️ Scrapling: Lightning-Fast, Adaptive Web Scraping for Python
|
42
|
-
[](https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml) [](https://badge.fury.io/py/Scrapling) [](https://pypi.org/project/scrapling/) [](https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml) [](https://badge.fury.io/py/Scrapling) [](https://pypi.org/project/scrapling/) [](https://pepy.tech/project/scrapling)
|
43
43
|
|
44
44
|
Dealing with failing web scrapers due to website changes? Meet Scrapling.
|
45
45
|
|
@@ -455,6 +455,9 @@ Of course, you can find elements by text/regex, find similar elements in a more
|
|
455
455
|
### Is Scrapling thread-safe?
|
456
456
|
Yes, Scrapling instances are thread-safe. Each Adaptor instance maintains its own state.
|
457
457
|
|
458
|
+
## Sponsors
|
459
|
+
[](https://www.capsolver.com/?utm_source=github&utm_medium=repo&utm_campaign=scraping&utm_term=Scrapling)
|
460
|
+
|
458
461
|
## Contributing
|
459
462
|
Everybody is invited and welcome to contribute to Scrapling. There is a lot to do!
|
460
463
|
|
@@ -1,5 +1,5 @@
|
|
1
1
|
# 🕷️ Scrapling: Lightning-Fast, Adaptive Web Scraping for Python
|
2
|
-
[](https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml) [](https://badge.fury.io/py/Scrapling) [](https://pypi.org/project/scrapling/) [](https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml) [](https://badge.fury.io/py/Scrapling) [](https://pypi.org/project/scrapling/) [](https://pepy.tech/project/scrapling)
|
3
3
|
|
4
4
|
Dealing with failing web scrapers due to website changes? Meet Scrapling.
|
5
5
|
|
@@ -415,6 +415,9 @@ Of course, you can find elements by text/regex, find similar elements in a more
|
|
415
415
|
### Is Scrapling thread-safe?
|
416
416
|
Yes, Scrapling instances are thread-safe. Each Adaptor instance maintains its own state.
|
417
417
|
|
418
|
+
## Sponsors
|
419
|
+
[](https://www.capsolver.com/?utm_source=github&utm_medium=repo&utm_campaign=scraping&utm_term=Scrapling)
|
420
|
+
|
418
421
|
## Contributing
|
419
422
|
Everybody is invited and welcome to contribute to Scrapling. There is a lot to do!
|
420
423
|
|
@@ -3,7 +3,7 @@ from scrapling.parser import Adaptor, Adaptors
|
|
3
3
|
from scrapling.custom_types import TextHandler, AttributesHandler
|
4
4
|
|
5
5
|
__author__ = "Karim Shoair (karim.shoair@pm.me)"
|
6
|
-
__version__ = "0.1.1"
|
6
|
+
__version__ = "0.1.2"
|
7
7
|
__copyright__ = "Copyright (c) 2024 Karim Shoair"
|
8
8
|
|
9
9
|
|
@@ -78,7 +78,7 @@ class Adaptor(SelectorsGeneration):
|
|
78
78
|
|
79
79
|
parser = html.HTMLParser(
|
80
80
|
# https://lxml.de/api/lxml.etree.HTMLParser-class.html
|
81
|
-
recover=True, remove_blank_text=True, remove_comments=(keep_comments is
|
81
|
+
recover=True, remove_blank_text=True, remove_comments=(keep_comments is False), encoding=encoding,
|
82
82
|
compact=True, huge_tree=huge_tree, default_doctype=True
|
83
83
|
)
|
84
84
|
self._root = etree.fromstring(body, parser=parser, base_url=url)
|
@@ -142,7 +142,8 @@ class Adaptor(SelectorsGeneration):
|
|
142
142
|
if issubclass(type(element), html.HtmlMixin):
|
143
143
|
return self.__class__(
|
144
144
|
root=element, url=self.url, encoding=self.encoding, auto_match=self.__auto_match_enabled,
|
145
|
-
keep_comments=
|
145
|
+
keep_comments=True, # if the comments are already removed in initialization, no need to try to delete them in sub-elements
|
146
|
+
huge_tree=self.__huge_tree_enabled, debug=self.__debug
|
146
147
|
)
|
147
148
|
return element
|
148
149
|
|
@@ -186,7 +187,23 @@ class Adaptor(SelectorsGeneration):
|
|
186
187
|
def text(self) -> TextHandler:
|
187
188
|
"""Get text content of the element"""
|
188
189
|
if not self.__text:
|
189
|
-
|
190
|
+
if self.__keep_comments:
|
191
|
+
if not self.children:
|
192
|
+
# If use chose to keep comments, remove comments from text
|
193
|
+
# Escape lxml default behaviour and remove comments like this `<span>CONDITION: <!-- -->Excellent</span>`
|
194
|
+
# This issue is present in parsel/scrapy as well so no need to repeat it here so the user can run regex on the full text.
|
195
|
+
code = self.html_content
|
196
|
+
parser = html.HTMLParser(
|
197
|
+
recover=True, remove_blank_text=True, remove_comments=True, encoding=self.encoding,
|
198
|
+
compact=True, huge_tree=self.__huge_tree_enabled, default_doctype=True
|
199
|
+
)
|
200
|
+
fragment_root = html.fragment_fromstring(code, parser=parser)
|
201
|
+
self.__text = TextHandler(fragment_root.text)
|
202
|
+
else:
|
203
|
+
self.__text = TextHandler(self._root.text)
|
204
|
+
else:
|
205
|
+
# If user already chose to not keep comments then all is good
|
206
|
+
self.__text = TextHandler(self._root.text)
|
190
207
|
return self.__text
|
191
208
|
|
192
209
|
def get_all_text(self, separator: str = "\n", strip: bool = False, ignore_tags: Tuple = ('script', 'style',), valid_values: bool = True) -> TextHandler:
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: scrapling
|
3
|
-
Version: 0.1.1
|
3
|
+
Version: 0.1.2
|
4
4
|
Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
|
5
5
|
Home-page: https://github.com/D4Vinci/Scrapling
|
6
6
|
Author: Karim Shoair
|
@@ -39,7 +39,7 @@ Requires-Dist: orjson>=3
|
|
39
39
|
Requires-Dist: tldextract
|
40
40
|
|
41
41
|
# 🕷️ Scrapling: Lightning-Fast, Adaptive Web Scraping for Python
|
42
|
-
[](https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml) [](https://badge.fury.io/py/Scrapling) [](https://pypi.org/project/scrapling/) [](https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml) [](https://badge.fury.io/py/Scrapling) [](https://pypi.org/project/scrapling/) [](https://pepy.tech/project/scrapling)
|
43
43
|
|
44
44
|
Dealing with failing web scrapers due to website changes? Meet Scrapling.
|
45
45
|
|
@@ -455,6 +455,9 @@ Of course, you can find elements by text/regex, find similar elements in a more
|
|
455
455
|
### Is Scrapling thread-safe?
|
456
456
|
Yes, Scrapling instances are thread-safe. Each Adaptor instance maintains its own state.
|
457
457
|
|
458
|
+
## Sponsors
|
459
|
+
[](https://www.capsolver.com/?utm_source=github&utm_medium=repo&utm_campaign=scraping&utm_term=Scrapling)
|
460
|
+
|
458
461
|
## Contributing
|
459
462
|
Everybody is invited and welcome to contribute to Scrapling. There is a lot to do!
|
460
463
|
|
@@ -6,7 +6,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
|
|
6
6
|
|
7
7
|
setup(
|
8
8
|
name="scrapling",
|
9
|
-
version="0.1.1",
|
9
|
+
version="0.1.2",
|
10
10
|
description="""Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
|
11
11
|
simplifies the process of extracting data from websites, even when they undergo structural changes, and offers
|
12
12
|
impressive speed improvements over many popular scraping tools.""",
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|