scrapling 0.1__py3-none-any.whl → 0.1.2__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
scrapling/__init__.py CHANGED
@@ -3,7 +3,7 @@ from scrapling.parser import Adaptor, Adaptors
3
3
  from scrapling.custom_types import TextHandler, AttributesHandler
4
4
 
5
5
  __author__ = "Karim Shoair (karim.shoair@pm.me)"
6
- __version__ = "0.1"
6
+ __version__ = "0.1.2"
7
7
  __copyright__ = "Copyright (c) 2024 Karim Shoair"
8
8
 
9
9
 
scrapling/parser.py CHANGED
@@ -78,7 +78,7 @@ class Adaptor(SelectorsGeneration):
78
78
 
79
79
  parser = html.HTMLParser(
80
80
  # https://lxml.de/api/lxml.etree.HTMLParser-class.html
81
- recover=True, remove_blank_text=True, remove_comments=(keep_comments is True), encoding=encoding,
81
+ recover=True, remove_blank_text=True, remove_comments=(keep_comments is False), encoding=encoding,
82
82
  compact=True, huge_tree=huge_tree, default_doctype=True
83
83
  )
84
84
  self._root = etree.fromstring(body, parser=parser, base_url=url)
@@ -142,7 +142,8 @@ class Adaptor(SelectorsGeneration):
142
142
  if issubclass(type(element), html.HtmlMixin):
143
143
  return self.__class__(
144
144
  root=element, url=self.url, encoding=self.encoding, auto_match=self.__auto_match_enabled,
145
- keep_comments=self.__keep_comments, huge_tree=self.__huge_tree_enabled, debug=self.__debug
145
+ keep_comments=True, # if the comments are already removed in initialization, no need to try to delete them in sub-elements
146
+ huge_tree=self.__huge_tree_enabled, debug=self.__debug
146
147
  )
147
148
  return element
148
149
 
@@ -186,7 +187,23 @@ class Adaptor(SelectorsGeneration):
186
187
  def text(self) -> TextHandler:
187
188
  """Get text content of the element"""
188
189
  if not self.__text:
189
- self.__text = TextHandler(self._root.text)
190
+ if self.__keep_comments:
191
+ if not self.children:
192
+ # If the user chose to keep comments, remove comments from text
193
+ # Escape lxml default behaviour and remove comments like this `<span>CONDITION: <!-- -->Excellent</span>`
194
+ # This issue is present in parsel/scrapy as well; we fix it here instead of repeating it, so the user can run regex on the full text.
195
+ code = self.html_content
196
+ parser = html.HTMLParser(
197
+ recover=True, remove_blank_text=True, remove_comments=True, encoding=self.encoding,
198
+ compact=True, huge_tree=self.__huge_tree_enabled, default_doctype=True
199
+ )
200
+ fragment_root = html.fragment_fromstring(code, parser=parser)
201
+ self.__text = TextHandler(fragment_root.text)
202
+ else:
203
+ self.__text = TextHandler(self._root.text)
204
+ else:
205
+ # If user already chose to not keep comments then all is good
206
+ self.__text = TextHandler(self._root.text)
190
207
  return self.__text
191
208
 
192
209
  def get_all_text(self, separator: str = "\n", strip: bool = False, ignore_tags: Tuple = ('script', 'style',), valid_values: bool = True) -> TextHandler:
@@ -1,12 +1,12 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: scrapling
3
- Version: 0.1
3
+ Version: 0.1.2
4
4
  Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
5
5
  Home-page: https://github.com/D4Vinci/Scrapling
6
6
  Author: Karim Shoair
7
7
  Author-email: karim.shoair@pm.me
8
8
  License: BSD
9
- Project-URL: Documentation, https://github.com/D4Vinci/Scrapling/Docs
9
+ Project-URL: Documentation, https://github.com/D4Vinci/Scrapling/tree/main/docs
10
10
  Project-URL: Source, https://github.com/D4Vinci/Scrapling
11
11
  Project-URL: Tracker, https://github.com/D4Vinci/Scrapling/issues
12
12
  Classifier: Operating System :: OS Independent
@@ -20,7 +20,6 @@ Classifier: Topic :: Text Processing :: Markup :: HTML
20
20
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
21
21
  Classifier: Programming Language :: Python :: 3
22
22
  Classifier: Programming Language :: Python :: 3 :: Only
23
- Classifier: Programming Language :: Python :: 3.6
24
23
  Classifier: Programming Language :: Python :: 3.7
25
24
  Classifier: Programming Language :: Python :: 3.8
26
25
  Classifier: Programming Language :: Python :: 3.9
@@ -40,7 +39,7 @@ Requires-Dist: orjson >=3
40
39
  Requires-Dist: tldextract
41
40
 
42
41
  # 🕷️ Scrapling: Lightning-Fast, Adaptive Web Scraping for Python
43
- [![PyPI version](https://badge.fury.io/py/scrapling.svg)](https://badge.fury.io/py/scrapling) [![Supported Python versions](https://img.shields.io/pypi/pyversions/scrapling.svg)](https://pypi.org/project/scrapling/) [![License](https://img.shields.io/badge/License-BSD--3-blue.svg)](https://opensource.org/licenses/BSD-3-Clause)
42
+ [![Tests](https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg)](https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml) [![PyPI version](https://badge.fury.io/py/Scrapling.svg)](https://badge.fury.io/py/Scrapling) [![Supported Python versions](https://img.shields.io/pypi/pyversions/scrapling.svg)](https://pypi.org/project/scrapling/) [![PyPI Downloads](https://static.pepy.tech/badge/scrapling)](https://pepy.tech/project/scrapling)
44
43
 
45
44
  Dealing with failing web scrapers due to website changes? Meet Scrapling.
46
45
 
@@ -78,7 +77,7 @@ products = page.css('.product', auto_match=True) # Still finds them!
78
77
 
79
78
  ## Getting Started
80
79
 
81
- Let's walk through a basic example that demonstrates small group of Scrapling's core features:
80
+ Let's walk through a basic example that demonstrates a small group of Scrapling's core features:
82
81
 
83
82
  ```python
84
83
  import requests
@@ -456,6 +455,9 @@ Of course, you can find elements by text/regex, find similar elements in a more
456
455
  ### Is Scrapling thread-safe?
457
456
  Yes, Scrapling instances are thread-safe. Each Adaptor instance maintains its own state.
458
457
 
458
+ ## Sponsors
459
+ [![Capsolver Banner](https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/CapSolver.png)](https://www.capsolver.com/?utm_source=github&utm_medium=repo&utm_campaign=scraping&utm_term=Scrapling)
460
+
459
461
  ## Contributing
460
462
  Everybody is invited and welcome to contribute to Scrapling. There is a lot to do!
461
463
 
@@ -0,0 +1,12 @@
1
+ scrapling/__init__.py,sha256=bxgmUv7rTGX8os8Spzxg4lDsNvVv1BWrHXQVDJu86r4,337
2
+ scrapling/custom_types.py,sha256=D4bAwpm3JIjfw4I2FS0odcq371OQAAUTkBN4ajPWX-4,6001
3
+ scrapling/mixins.py,sha256=m3ZvTyjt9JNtfZ4NYuaQsfi84UbFkatN_NEiLVZM7TY,2924
4
+ scrapling/parser.py,sha256=HNDq9OQzU3zpn31AfmreVlZv9kHielb8XoElSwLzK34,44650
5
+ scrapling/storage_adaptors.py,sha256=FEh0E8iss2ZDXAL1IDGxQVpEcdbLOupfmUwcsp_z-QY,6198
6
+ scrapling/translator.py,sha256=E0oueabsELxTGSv91q6AQgQnsN4-oQ76mq1u0jR2Ofo,5410
7
+ scrapling/utils.py,sha256=ApmNjCxxy-N_gAYRnzutLvPBvY_s9FTYp8UhdyeZXSc,5960
8
+ scrapling-0.1.2.dist-info/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
9
+ scrapling-0.1.2.dist-info/METADATA,sha256=-EErnYT0EABbPVgXR-eln3-XS-8haAlHDikNs3pZAKU,27357
10
+ scrapling-0.1.2.dist-info/WHEEL,sha256=OVMc5UfuAQiSplgO0_WdW7vXVGAt9Hdd6qtN4HotdyA,91
11
+ scrapling-0.1.2.dist-info/top_level.txt,sha256=Ud-yF-PC2U5HQ3nc5QwT7HSPdIpF1RuwQ_mYgBzHHIM,10
12
+ scrapling-0.1.2.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (75.1.0)
2
+ Generator: setuptools (75.2.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,12 +0,0 @@
1
- scrapling/__init__.py,sha256=WXWIoBQvms7OwK3KzfASCOCdZibvesbadaDXjjvDHVA,335
2
- scrapling/custom_types.py,sha256=D4bAwpm3JIjfw4I2FS0odcq371OQAAUTkBN4ajPWX-4,6001
3
- scrapling/mixins.py,sha256=m3ZvTyjt9JNtfZ4NYuaQsfi84UbFkatN_NEiLVZM7TY,2924
4
- scrapling/parser.py,sha256=FGFa9ePo5vRvVDmKcj5qvXyppAy-LAz1AZbz-QKn1fs,43470
5
- scrapling/storage_adaptors.py,sha256=FEh0E8iss2ZDXAL1IDGxQVpEcdbLOupfmUwcsp_z-QY,6198
6
- scrapling/translator.py,sha256=E0oueabsELxTGSv91q6AQgQnsN4-oQ76mq1u0jR2Ofo,5410
7
- scrapling/utils.py,sha256=ApmNjCxxy-N_gAYRnzutLvPBvY_s9FTYp8UhdyeZXSc,5960
8
- scrapling-0.1.dist-info/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
9
- scrapling-0.1.dist-info/METADATA,sha256=8fSe6Vsr8E-uVgnv1ZAs9nJHW6zTNF6Gnny-pq-1YfA,27037
10
- scrapling-0.1.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
11
- scrapling-0.1.dist-info/top_level.txt,sha256=Ud-yF-PC2U5HQ3nc5QwT7HSPdIpF1RuwQ_mYgBzHHIM,10
12
- scrapling-0.1.dist-info/RECORD,,