scrapling 0.1-py3-none-any.whl → 0.1.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
scrapling/__init__.py CHANGED
@@ -3,7 +3,7 @@ from scrapling.parser import Adaptor, Adaptors
  from scrapling.custom_types import TextHandler, AttributesHandler
 
  __author__ = "Karim Shoair (karim.shoair@pm.me)"
- __version__ = "0.1"
+ __version__ = "0.1.2"
  __copyright__ = "Copyright (c) 2024 Karim Shoair"
 
 
scrapling/parser.py CHANGED
@@ -78,7 +78,7 @@ class Adaptor(SelectorsGeneration):
 
          parser = html.HTMLParser(
              # https://lxml.de/api/lxml.etree.HTMLParser-class.html
-             recover=True, remove_blank_text=True, remove_comments=(keep_comments is True), encoding=encoding,
+             recover=True, remove_blank_text=True, remove_comments=(keep_comments is False), encoding=encoding,
              compact=True, huge_tree=huge_tree, default_doctype=True
          )
          self._root = etree.fromstring(body, parser=parser, base_url=url)
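
The one-character change above fixes an inverted flag: in 0.1, `remove_comments=(keep_comments is True)` asked lxml to strip comments exactly when the caller wanted to keep them. A minimal sketch of the corrected mapping, using plain lxml rather than Scrapling's API and a made-up sample fragment:

```python
from lxml import html

keep_comments = True  # what a caller would pass when building the parser

parser = html.HTMLParser(
    recover=True,
    # 0.1 passed `keep_comments is True` here, so asking to keep comments removed them;
    # 0.1.2 inverts the expression so the flag means what it says.
    remove_comments=(keep_comments is False),
)
root = html.fromstring("<span>CONDITION: <!-- -->Excellent</span>", parser=parser)
print(html.tostring(root))  # the <!-- --> comment survives when keep_comments=True
```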
@@ -142,7 +142,8 @@ class Adaptor(SelectorsGeneration):
          if issubclass(type(element), html.HtmlMixin):
              return self.__class__(
                  root=element, url=self.url, encoding=self.encoding, auto_match=self.__auto_match_enabled,
-                 keep_comments=self.__keep_comments, huge_tree=self.__huge_tree_enabled, debug=self.__debug
+                 keep_comments=True, # if the comments are already removed in initialization, no need to try to delete them in sub-elements
+                 huge_tree=self.__huge_tree_enabled, debug=self.__debug
              )
          return element
 
@@ -186,7 +187,23 @@ class Adaptor(SelectorsGeneration):
      def text(self) -> TextHandler:
          """Get text content of the element"""
          if not self.__text:
-             self.__text = TextHandler(self._root.text)
+             if self.__keep_comments:
+                 if not self.children:
+                     # If use chose to keep comments, remove comments from text
+                     # Escape lxml default behaviour and remove comments like this `<span>CONDITION: <!-- -->Excellent</span>`
+                     # This issue is present in parsel/scrapy as well so no need to repeat it here so the user can run regex on the full text.
+                     code = self.html_content
+                     parser = html.HTMLParser(
+                         recover=True, remove_blank_text=True, remove_comments=True, encoding=self.encoding,
+                         compact=True, huge_tree=self.__huge_tree_enabled, default_doctype=True
+                     )
+                     fragment_root = html.fragment_fromstring(code, parser=parser)
+                     self.__text = TextHandler(fragment_root.text)
+                 else:
+                     self.__text = TextHandler(self._root.text)
+             else:
+                 # If user already chose to not keep comments then all is good
+                 self.__text = TextHandler(self._root.text)
          return self.__text
 
      def get_all_text(self, separator: str = "\n", strip: bool = False, ignore_tags: Tuple = ('script', 'style',), valid_values: bool = True) -> TextHandler:
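
The larger hunk above changes how `text()` behaves when comments were kept in the tree: lxml splits an element's text around a comment node, so for markup like `<span>CONDITION: <!-- -->Excellent</span>` the span's `.text` stops at the comment, and 0.1.2 therefore re-parses a leaf element's HTML with `remove_comments=True` before reading its text. A rough illustration of that underlying lxml behaviour (the sample markup is hypothetical, and this is not Scrapling's public API):

```python
from lxml import html

fragment = "<span>CONDITION: <!-- -->Excellent</span>"

# Default parse: the comment node splits the text, so .text ends at the comment
# and the remaining "Excellent" hangs off the comment's .tail.
kept = html.fragment_fromstring(fragment)
print(kept.text)   # 'CONDITION: '

# Re-parsing with remove_comments=True, as the new text() branch does for leaf
# elements, drops the comment so the surrounding text reads as one piece.
parser = html.HTMLParser(recover=True, remove_comments=True)
clean = html.fragment_fromstring(fragment, parser=parser)
print(clean.text)  # 'CONDITION: Excellent'
```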
scrapling-0.1.dist-info/METADATA → scrapling-0.1.2.dist-info/METADATA CHANGED
@@ -1,12 +1,12 @@
  Metadata-Version: 2.1
  Name: scrapling
- Version: 0.1
+ Version: 0.1.2
  Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
  Home-page: https://github.com/D4Vinci/Scrapling
  Author: Karim Shoair
  Author-email: karim.shoair@pm.me
  License: BSD
- Project-URL: Documentation, https://github.com/D4Vinci/Scrapling/Docs
+ Project-URL: Documentation, https://github.com/D4Vinci/Scrapling/tree/main/docs
  Project-URL: Source, https://github.com/D4Vinci/Scrapling
  Project-URL: Tracker, https://github.com/D4Vinci/Scrapling/issues
  Classifier: Operating System :: OS Independent
@@ -20,7 +20,6 @@ Classifier: Topic :: Text Processing :: Markup :: HTML
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
  Classifier: Programming Language :: Python :: 3
  Classifier: Programming Language :: Python :: 3 :: Only
- Classifier: Programming Language :: Python :: 3.6
  Classifier: Programming Language :: Python :: 3.7
  Classifier: Programming Language :: Python :: 3.8
  Classifier: Programming Language :: Python :: 3.9
@@ -40,7 +39,7 @@ Requires-Dist: orjson >=3
  Requires-Dist: tldextract
 
  # 🕷️ Scrapling: Lightning-Fast, Adaptive Web Scraping for Python
- [![PyPI version](https://badge.fury.io/py/scrapling.svg)](https://badge.fury.io/py/scrapling) [![Supported Python versions](https://img.shields.io/pypi/pyversions/scrapling.svg)](https://pypi.org/project/scrapling/) [![License](https://img.shields.io/badge/License-BSD--3-blue.svg)](https://opensource.org/licenses/BSD-3-Clause)
+ [![Tests](https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg)](https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml) [![PyPI version](https://badge.fury.io/py/Scrapling.svg)](https://badge.fury.io/py/Scrapling) [![Supported Python versions](https://img.shields.io/pypi/pyversions/scrapling.svg)](https://pypi.org/project/scrapling/) [![PyPI Downloads](https://static.pepy.tech/badge/scrapling)](https://pepy.tech/project/scrapling)
 
  Dealing with failing web scrapers due to website changes? Meet Scrapling.
 
@@ -78,7 +77,7 @@ products = page.css('.product', auto_match=True) # Still finds them!
 
  ## Getting Started
 
- Let's walk through a basic example that demonstrates small group of Scrapling's core features:
+ Let's walk through a basic example that demonstrates a small group of Scrapling's core features:
 
  ```python
  import requests
@@ -456,6 +455,9 @@ Of course, you can find elements by text/regex, find similar elements in a more
  ### Is Scrapling thread-safe?
  Yes, Scrapling instances are thread-safe. Each Adaptor instance maintains its own state.
 
+ ## Sponsors
+ [![Capsolver Banner](https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/CapSolver.png)](https://www.capsolver.com/?utm_source=github&utm_medium=repo&utm_campaign=scraping&utm_term=Scrapling)
+
  ## Contributing
  Everybody is invited and welcome to contribute to Scrapling. There is a lot to do!
 
scrapling-0.1.2.dist-info/RECORD ADDED
@@ -0,0 +1,12 @@
+ scrapling/__init__.py,sha256=bxgmUv7rTGX8os8Spzxg4lDsNvVv1BWrHXQVDJu86r4,337
+ scrapling/custom_types.py,sha256=D4bAwpm3JIjfw4I2FS0odcq371OQAAUTkBN4ajPWX-4,6001
+ scrapling/mixins.py,sha256=m3ZvTyjt9JNtfZ4NYuaQsfi84UbFkatN_NEiLVZM7TY,2924
+ scrapling/parser.py,sha256=HNDq9OQzU3zpn31AfmreVlZv9kHielb8XoElSwLzK34,44650
+ scrapling/storage_adaptors.py,sha256=FEh0E8iss2ZDXAL1IDGxQVpEcdbLOupfmUwcsp_z-QY,6198
+ scrapling/translator.py,sha256=E0oueabsELxTGSv91q6AQgQnsN4-oQ76mq1u0jR2Ofo,5410
+ scrapling/utils.py,sha256=ApmNjCxxy-N_gAYRnzutLvPBvY_s9FTYp8UhdyeZXSc,5960
+ scrapling-0.1.2.dist-info/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
+ scrapling-0.1.2.dist-info/METADATA,sha256=-EErnYT0EABbPVgXR-eln3-XS-8haAlHDikNs3pZAKU,27357
+ scrapling-0.1.2.dist-info/WHEEL,sha256=OVMc5UfuAQiSplgO0_WdW7vXVGAt9Hdd6qtN4HotdyA,91
+ scrapling-0.1.2.dist-info/top_level.txt,sha256=Ud-yF-PC2U5HQ3nc5QwT7HSPdIpF1RuwQ_mYgBzHHIM,10
+ scrapling-0.1.2.dist-info/RECORD,,
scrapling-0.1.dist-info/WHEEL → scrapling-0.1.2.dist-info/WHEEL CHANGED
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (75.1.0)
+ Generator: setuptools (75.2.0)
  Root-Is-Purelib: true
  Tag: py3-none-any
 
scrapling-0.1.dist-info/RECORD DELETED
@@ -1,12 +0,0 @@
- scrapling/__init__.py,sha256=WXWIoBQvms7OwK3KzfASCOCdZibvesbadaDXjjvDHVA,335
- scrapling/custom_types.py,sha256=D4bAwpm3JIjfw4I2FS0odcq371OQAAUTkBN4ajPWX-4,6001
- scrapling/mixins.py,sha256=m3ZvTyjt9JNtfZ4NYuaQsfi84UbFkatN_NEiLVZM7TY,2924
- scrapling/parser.py,sha256=FGFa9ePo5vRvVDmKcj5qvXyppAy-LAz1AZbz-QKn1fs,43470
- scrapling/storage_adaptors.py,sha256=FEh0E8iss2ZDXAL1IDGxQVpEcdbLOupfmUwcsp_z-QY,6198
- scrapling/translator.py,sha256=E0oueabsELxTGSv91q6AQgQnsN4-oQ76mq1u0jR2Ofo,5410
- scrapling/utils.py,sha256=ApmNjCxxy-N_gAYRnzutLvPBvY_s9FTYp8UhdyeZXSc,5960
- scrapling-0.1.dist-info/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
- scrapling-0.1.dist-info/METADATA,sha256=8fSe6Vsr8E-uVgnv1ZAs9nJHW6zTNF6Gnny-pq-1YfA,27037
- scrapling-0.1.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
- scrapling-0.1.dist-info/top_level.txt,sha256=Ud-yF-PC2U5HQ3nc5QwT7HSPdIpF1RuwQ_mYgBzHHIM,10
- scrapling-0.1.dist-info/RECORD,,