scrapling 0.1.1__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +1 -1
- scrapling/parser.py +20 -3
- {scrapling-0.1.1.dist-info → scrapling-0.1.2.dist-info}/METADATA +5 -2
- scrapling-0.1.2.dist-info/RECORD +12 -0
- {scrapling-0.1.1.dist-info → scrapling-0.1.2.dist-info}/WHEEL +1 -1
- scrapling-0.1.1.dist-info/RECORD +0 -12
- {scrapling-0.1.1.dist-info → scrapling-0.1.2.dist-info}/LICENSE +0 -0
- {scrapling-0.1.1.dist-info → scrapling-0.1.2.dist-info}/top_level.txt +0 -0
    
        scrapling/__init__.py
    CHANGED
    
    | @@ -3,7 +3,7 @@ from scrapling.parser import Adaptor, Adaptors | |
| 3 3 | 
             
            from scrapling.custom_types import TextHandler, AttributesHandler
         | 
| 4 4 |  | 
| 5 5 | 
             
            __author__ = "Karim Shoair (karim.shoair@pm.me)"
         | 
| 6 | 
            -
            __version__ = "0.1.1" | 
| 6 | 
            +
            __version__ = "0.1.2"
         | 
| 7 7 | 
             
            __copyright__ = "Copyright (c) 2024 Karim Shoair"
         | 
| 8 8 |  | 
| 9 9 |  | 
    
        scrapling/parser.py
    CHANGED
    
    | @@ -78,7 +78,7 @@ class Adaptor(SelectorsGeneration): | |
| 78 78 |  | 
| 79 79 | 
             
                        parser = html.HTMLParser(
         | 
| 80 80 | 
             
                            # https://lxml.de/api/lxml.etree.HTMLParser-class.html
         | 
| 81 | 
            -
                            recover=True, remove_blank_text=True, remove_comments=(keep_comments is  | 
| 81 | 
            +
                            recover=True, remove_blank_text=True, remove_comments=(keep_comments is False), encoding=encoding,
         | 
| 82 82 | 
             
                            compact=True, huge_tree=huge_tree, default_doctype=True
         | 
| 83 83 | 
             
                        )
         | 
| 84 84 | 
             
                        self._root = etree.fromstring(body, parser=parser, base_url=url)
         | 
| @@ -142,7 +142,8 @@ class Adaptor(SelectorsGeneration): | |
| 142 142 | 
             
                        if issubclass(type(element), html.HtmlMixin):
         | 
| 143 143 | 
             
                            return self.__class__(
         | 
| 144 144 | 
             
                                root=element, url=self.url, encoding=self.encoding, auto_match=self.__auto_match_enabled,
         | 
| 145 | 
            -
                                keep_comments= | 
| 145 | 
            +
                                keep_comments=True,  # if the comments are already removed in initialization, no need to try to delete them in sub-elements
         | 
| 146 | 
            +
                                huge_tree=self.__huge_tree_enabled, debug=self.__debug
         | 
| 146 147 | 
             
                            )
         | 
| 147 148 | 
             
                        return element
         | 
| 148 149 |  | 
| @@ -186,7 +187,23 @@ class Adaptor(SelectorsGeneration): | |
| 186 187 | 
             
                def text(self) -> TextHandler:
         | 
| 187 188 | 
             
                    """Get text content of the element"""
         | 
| 188 189 | 
             
                    if not self.__text:
         | 
| 189 | 
            -
                         | 
| 190 | 
            +
                        if self.__keep_comments:
         | 
| 191 | 
            +
                            if not self.children:
         | 
| 192 | 
            +
                                # If user chose to keep comments, remove comments from text
         | 
| 193 | 
            +
                                # Escape lxml default behaviour and remove comments like this `<span>CONDITION: <!-- -->Excellent</span>`
         | 
| 194 | 
            +
                                # This issue is present in parsel/scrapy as well so no need to repeat it here so the user can run regex on the full text.
         | 
| 195 | 
            +
                                code = self.html_content
         | 
| 196 | 
            +
                                parser = html.HTMLParser(
         | 
| 197 | 
            +
                                    recover=True, remove_blank_text=True, remove_comments=True, encoding=self.encoding,
         | 
| 198 | 
            +
                                    compact=True, huge_tree=self.__huge_tree_enabled, default_doctype=True
         | 
| 199 | 
            +
                                )
         | 
| 200 | 
            +
                                fragment_root = html.fragment_fromstring(code, parser=parser)
         | 
| 201 | 
            +
                                self.__text = TextHandler(fragment_root.text)
         | 
| 202 | 
            +
                            else:
         | 
| 203 | 
            +
                                self.__text = TextHandler(self._root.text)
         | 
| 204 | 
            +
                        else:
         | 
| 205 | 
            +
                            # If user already chose to not keep comments then all is good
         | 
| 206 | 
            +
                            self.__text = TextHandler(self._root.text)
         | 
| 190 207 | 
             
                    return self.__text
         | 
| 191 208 |  | 
| 192 209 | 
             
                def get_all_text(self, separator: str = "\n", strip: bool = False, ignore_tags: Tuple = ('script', 'style',), valid_values: bool = True) -> TextHandler:
         | 
| @@ -1,6 +1,6 @@ | |
| 1 1 | 
             
            Metadata-Version: 2.1
         | 
| 2 2 | 
             
            Name: scrapling
         | 
| 3 | 
            -
            Version: 0.1.1 | 
| 3 | 
            +
            Version: 0.1.2
         | 
| 4 4 | 
             
            Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It 
         | 
| 5 5 | 
             
            Home-page: https://github.com/D4Vinci/Scrapling
         | 
| 6 6 | 
             
            Author: Karim Shoair
         | 
| @@ -39,7 +39,7 @@ Requires-Dist: orjson >=3 | |
| 39 39 | 
             
            Requires-Dist: tldextract
         | 
| 40 40 |  | 
| 41 41 | 
             
            # 🕷️ Scrapling: Lightning-Fast, Adaptive Web Scraping for Python
         | 
| 42 | 
            -
            [](https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml) [](https://badge.fury.io/py/Scrapling) [](https://pypi.org/project/scrapling/) [](https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml) [](https://badge.fury.io/py/Scrapling) [](https://pypi.org/project/scrapling/) [](https://pepy.tech/project/scrapling)
         | 
| 43 43 |  | 
| 44 44 | 
             
            Dealing with failing web scrapers due to website changes? Meet Scrapling.
         | 
| 45 45 |  | 
| @@ -455,6 +455,9 @@ Of course, you can find elements by text/regex, find similar elements in a more | |
| 455 455 | 
             
            ### Is Scrapling thread-safe?
         | 
| 456 456 | 
             
            Yes, Scrapling instances are thread-safe. Each Adaptor instance maintains its own state.
         | 
| 457 457 |  | 
| 458 | 
            +
            ## Sponsors
         | 
| 459 | 
            +
            [](https://www.capsolver.com/?utm_source=github&utm_medium=repo&utm_campaign=scraping&utm_term=Scrapling)
         | 
| 460 | 
            +
             | 
| 458 461 | 
             
            ## Contributing
         | 
| 459 462 | 
             
            Everybody is invited and welcome to contribute to Scrapling. There is a lot to do!
         | 
| 460 463 |  | 
| @@ -0,0 +1,12 @@ | |
| 1 | 
            +
            scrapling/__init__.py,sha256=bxgmUv7rTGX8os8Spzxg4lDsNvVv1BWrHXQVDJu86r4,337
         | 
| 2 | 
            +
            scrapling/custom_types.py,sha256=D4bAwpm3JIjfw4I2FS0odcq371OQAAUTkBN4ajPWX-4,6001
         | 
| 3 | 
            +
            scrapling/mixins.py,sha256=m3ZvTyjt9JNtfZ4NYuaQsfi84UbFkatN_NEiLVZM7TY,2924
         | 
| 4 | 
            +
            scrapling/parser.py,sha256=HNDq9OQzU3zpn31AfmreVlZv9kHielb8XoElSwLzK34,44650
         | 
| 5 | 
            +
            scrapling/storage_adaptors.py,sha256=FEh0E8iss2ZDXAL1IDGxQVpEcdbLOupfmUwcsp_z-QY,6198
         | 
| 6 | 
            +
            scrapling/translator.py,sha256=E0oueabsELxTGSv91q6AQgQnsN4-oQ76mq1u0jR2Ofo,5410
         | 
| 7 | 
            +
            scrapling/utils.py,sha256=ApmNjCxxy-N_gAYRnzutLvPBvY_s9FTYp8UhdyeZXSc,5960
         | 
| 8 | 
            +
            scrapling-0.1.2.dist-info/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
         | 
| 9 | 
            +
            scrapling-0.1.2.dist-info/METADATA,sha256=-EErnYT0EABbPVgXR-eln3-XS-8haAlHDikNs3pZAKU,27357
         | 
| 10 | 
            +
            scrapling-0.1.2.dist-info/WHEEL,sha256=OVMc5UfuAQiSplgO0_WdW7vXVGAt9Hdd6qtN4HotdyA,91
         | 
| 11 | 
            +
            scrapling-0.1.2.dist-info/top_level.txt,sha256=Ud-yF-PC2U5HQ3nc5QwT7HSPdIpF1RuwQ_mYgBzHHIM,10
         | 
| 12 | 
            +
            scrapling-0.1.2.dist-info/RECORD,,
         | 
    
        scrapling-0.1.1.dist-info/RECORD
    DELETED
    
    | @@ -1,12 +0,0 @@ | |
| 1 | 
            -
            scrapling/__init__.py,sha256=EgHQIn26VZ00qaWR6ANRmiGAQ1VQfwaVrWUssfyrD0Y,337
         | 
| 2 | 
            -
            scrapling/custom_types.py,sha256=D4bAwpm3JIjfw4I2FS0odcq371OQAAUTkBN4ajPWX-4,6001
         | 
| 3 | 
            -
            scrapling/mixins.py,sha256=m3ZvTyjt9JNtfZ4NYuaQsfi84UbFkatN_NEiLVZM7TY,2924
         | 
| 4 | 
            -
            scrapling/parser.py,sha256=FGFa9ePo5vRvVDmKcj5qvXyppAy-LAz1AZbz-QKn1fs,43470
         | 
| 5 | 
            -
            scrapling/storage_adaptors.py,sha256=FEh0E8iss2ZDXAL1IDGxQVpEcdbLOupfmUwcsp_z-QY,6198
         | 
| 6 | 
            -
            scrapling/translator.py,sha256=E0oueabsELxTGSv91q6AQgQnsN4-oQ76mq1u0jR2Ofo,5410
         | 
| 7 | 
            -
            scrapling/utils.py,sha256=ApmNjCxxy-N_gAYRnzutLvPBvY_s9FTYp8UhdyeZXSc,5960
         | 
| 8 | 
            -
            scrapling-0.1.1.dist-info/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
         | 
| 9 | 
            -
            scrapling-0.1.1.dist-info/METADATA,sha256=sffCnzPbmLaqzIL3C3ehbntKeFOevNYuQGiN9UnLMGo,27154
         | 
| 10 | 
            -
            scrapling-0.1.1.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
         | 
| 11 | 
            -
            scrapling-0.1.1.dist-info/top_level.txt,sha256=Ud-yF-PC2U5HQ3nc5QwT7HSPdIpF1RuwQ_mYgBzHHIM,10
         | 
| 12 | 
            -
            scrapling-0.1.1.dist-info/RECORD,,
         | 
| 
            File without changes
         | 
| 
            File without changes
         |