scrapling 0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,149 @@
1
+ import orjson
2
+ import sqlite3
3
+ import logging
4
+ import threading
5
+ from hashlib import sha256
6
+ from abc import ABC, abstractmethod
7
+ from typing import Dict, Optional, Union
8
+
9
+ from scrapling.utils import _StorageTools, cache
10
+
11
+ from lxml import html
12
+ from tldextract import extract as tld
13
+
14
+
15
class StorageSystemMixin(ABC):
    """Base class for element-storage backends.

    If you want to make your own storage system, you have to inherit from this
    class and implement `save` and `retrieve`.
    """

    def __init__(self, url: Union[str, None] = None):
        """
        :param url: URL of the website we are working on to separate it from other websites data
        """
        self.url = url

    # NOTE(review): lru_cache on an instance method keeps the instance alive for
    # the cache's lifetime (ruff B019); acceptable here if instances are long-lived.
    @cache(None, typed=True)
    def _get_base_url(self, default_value: str = 'default') -> str:
        """Return the registered domain of `self.url`, or `default_value` when it cannot be determined."""
        if not self.url or not isinstance(self.url, str):
            return default_value

        try:
            extracted = tld(self.url)
            return extracted.registered_domain or extracted.domain or default_value
        except AttributeError:
            return default_value

    @abstractmethod
    def save(self, element: html.HtmlElement, identifier: str) -> None:
        """Saves the element's unique properties to the storage for retrieval and relocation later

        :param element: The element itself that we want to save to storage.
        :param identifier: This is the identifier that will be used to retrieve the element later from the storage. See
            the docs for more info.
        """
        raise NotImplementedError('Storage system must implement `save` method')

    @abstractmethod
    def retrieve(self, identifier: str) -> Optional[Dict]:
        """Using the identifier, we search the storage and return the unique properties of the element

        :param identifier: This is the identifier that will be used to retrieve the element from the storage. See
            the docs for more info.
        :return: A dictionary of the unique properties
        """
        # Fixed: this message previously said `save` instead of `retrieve`.
        raise NotImplementedError('Storage system must implement `retrieve` method')

    @staticmethod
    @cache(None, typed=True)
    def _get_hash(identifier: str) -> str:
        """If you want to hash the identifier in your storage system, use this helper."""
        identifier = identifier.lower().strip()
        if isinstance(identifier, str):
            # Hash functions have to take bytes
            identifier = identifier.encode('utf-8')

        hash_value = sha256(identifier).hexdigest()
        return f"{hash_value}_{len(identifier)}"  # Length to reduce collision chance
65
+
66
+
67
@cache(None, typed=True)
class SQLiteStorageSystem(StorageSystemMixin):
    """The recommended system to use, it's race condition safe and thread safe.
    Mainly built so the library can run in threaded frameworks like scrapy or threaded tools
    > It's optimized for threaded applications but running it without threads shouldn't make it slow.

    NOTE: the class itself is wrapped in `lru_cache`, so constructing it again with
    the same arguments returns the same cached instance (lightweight singleton).
    """
    def __init__(self, storage_file: str, url: Union[str, None] = None):
        """
        :param storage_file: File to be used to store elements
        :param url: URL of the website we are working on to separate it from other websites data
        """
        super().__init__(url)
        self.storage_file = storage_file
        # We use a threading.Lock to ensure thread-safety instead of relying on thread-local storage.
        self.lock = threading.Lock()
        # `check_same_thread=False` allows the connection to be used across different
        # threads; our own lock serializes all access to it.
        self.connection = sqlite3.connect(self.storage_file, check_same_thread=False)
        # WAL (Write-Ahead Logging) allows for better concurrency.
        self.connection.execute("PRAGMA journal_mode=WAL")
        self.cursor = self.connection.cursor()
        self._setup_database()
        logging.debug(
            f'Storage system loaded with arguments (storage_file="{storage_file}", url="{url}")'
        )

    def _setup_database(self) -> None:
        """Create the storage table if needed; (url, identifier) pairs are unique."""
        self.cursor.execute("""
            CREATE TABLE IF NOT EXISTS storage (
                id INTEGER PRIMARY KEY,
                url TEXT,
                identifier TEXT,
                element_data TEXT,
                UNIQUE (url, identifier)
            )
        """)
        self.connection.commit()

    def save(self, element: html.HtmlElement, identifier: str):
        """Saves the elements unique properties to the storage for retrieval and relocation later

        :param element: The element itself that we want to save to storage.
        :param identifier: This is the identifier that will be used to retrieve the element later from the storage. See
            the docs for more info.
        """
        url = self._get_base_url()
        element_data = _StorageTools.element_to_dict(element)
        with self.lock:
            # `INSERT OR REPLACE` so saving the same (url, identifier) twice updates the row.
            self.cursor.execute("""
                INSERT OR REPLACE INTO storage (url, identifier, element_data)
                VALUES (?, ?, ?)
            """, (url, identifier, orjson.dumps(element_data)))
            # Fixed: removed a pointless `cursor.fetchall()` — INSERT returns no rows.
            self.connection.commit()

    def retrieve(self, identifier: str) -> Optional[Dict]:
        """Using the identifier, we search the storage and return the unique properties of the element

        :param identifier: This is the identifier that will be used to retrieve the element from the storage. See
            the docs for more info.
        :return: A dictionary of the unique properties, or None when nothing was stored under this identifier.
        """
        url = self._get_base_url()
        with self.lock:
            self.cursor.execute(
                "SELECT element_data FROM storage WHERE url = ? AND identifier = ?",
                (url, identifier)
            )
            result = self.cursor.fetchone()
        if result:
            return orjson.loads(result[0])
        return None

    def close(self):
        """Close the cursor and connection; useful with things like scrapy's Spider.closed() function/signal."""
        with self.lock:
            # Idempotent: a second call (e.g. explicit close() followed by __del__)
            # is a no-op instead of raising sqlite3.ProgrammingError.
            if self.connection is not None:
                self.connection.commit()
                self.cursor.close()
                self.connection.close()
                self.connection = None

    def __del__(self):
        """To ensure all connections are closed when the object is destroyed."""
        try:
            self.close()
        except Exception:
            # `__init__` may have failed before `lock`/`connection` existed, or the
            # interpreter may be shutting down — never raise from a destructor.
            pass
@@ -0,0 +1,153 @@
1
+ """
2
+ Most of this file is adapted version of the translator of parsel library with some modifications simply for 1 important reason...
3
+ To add pseudo-elements ``::text`` and ``::attr(ATTR_NAME)`` so we match Parsel/Scrapy selectors format
4
+ which will be important in future releases but most importantly...
5
+ so you don't have to learn a new selectors/API method like what bs4 did with soupsieve :)
6
+ > if you want to learn about this, head to https://cssselect.readthedocs.io/en/latest/#cssselect.FunctionalPseudoElement
7
+ """
8
+
9
+ import re
10
+
11
+ from w3lib.html import HTML5_WHITESPACE
12
+ from typing import TYPE_CHECKING, Any, Optional
13
+ try:
14
+ from typing import Protocol
15
+ except ImportError:
16
+ # Added in Python 3.8
17
+ Protocol = object
18
+
19
+ from scrapling.utils import cache
20
+
21
+ from cssselect.xpath import ExpressionError
22
+ from cssselect.xpath import XPathExpr as OriginalXPathExpr
23
+ from cssselect import HTMLTranslator as OriginalHTMLTranslator
24
+ from cssselect.parser import Element, FunctionalPseudoElement, PseudoElement
25
+
26
+ if TYPE_CHECKING:
27
+ # typing.Self requires Python 3.11
28
+ from typing_extensions import Self
29
+
30
+
31
+ regex = f"[{HTML5_WHITESPACE}]+"
32
+ replace_html5_whitespaces = re.compile(regex).sub
33
+
34
+
35
class XPathExpr(OriginalXPathExpr):
    """XPath expression that carries pseudo-element state.

    `textnode` marks a trailing ``::text`` selector and `attribute` holds the name
    given to ``::attr(...)``; `__str__` appends the matching XPath steps.
    """

    textnode: bool = False
    attribute: Optional[str] = None

    @classmethod
    def from_xpath(
        cls,
        xpath: OriginalXPathExpr,
        textnode: bool = False,
        attribute: Optional[str] = None,
    ) -> "Self":
        """Copy a plain cssselect expression and attach the pseudo-element flags."""
        x = cls(path=xpath.path, element=xpath.element, condition=xpath.condition)
        x.textnode = textnode
        x.attribute = attribute
        return x

    def __str__(self) -> str:
        path = super().__str__()
        if self.textnode:
            if path == "*":
                path = "text()"
            elif path.endswith("::*/*"):
                path = path[:-3] + "text()"
            else:
                path += "/text()"

        if self.attribute is not None:
            if path.endswith("::*/*"):
                path = path[:-2]
            path += f"/@{self.attribute}"

        return path

    def join(
        self: "Self",
        combiner: str,
        other: OriginalXPathExpr,
        *args: Any,
        **kwargs: Any,
    ) -> "Self":
        if not isinstance(other, XPathExpr):
            # Fixed typo in the error message: "ony" -> "only".
            raise ValueError(
                f"Expressions of type {__name__}.XPathExpr can only join expressions"
                f" of the same type (or its descendants), got {type(other)}"
            )
        super().join(combiner, other, *args, **kwargs)
        # Propagate the right-hand side's pseudo-element state onto the joined expression.
        self.textnode = other.textnode
        self.attribute = other.attribute
        return self
85
+
86
+
87
# e.g. cssselect.GenericTranslator, cssselect.HTMLTranslator
class TranslatorProtocol(Protocol):
    """Structural type describing the cssselect translators this mixin is combined with."""

    def xpath_element(self, selector: Element) -> OriginalXPathExpr:
        ...

    def css_to_xpath(self, css: str, prefix: str = ...) -> str:
        ...
96
class TranslatorMixin:
    """This mixin adds support to CSS pseudo elements via dynamic dispatch.

    Currently supported pseudo-elements are ``::text`` and ``::attr(ATTR_NAME)``.
    """

    def xpath_element(self: TranslatorProtocol, selector: Element) -> XPathExpr:
        # Wrap the stock cssselect result in our pseudo-element-aware subclass.
        # https://github.com/python/mypy/issues/12344
        return XPathExpr.from_xpath(super().xpath_element(selector))  # type: ignore[safe-super]

    def xpath_pseudo_element(
        self, xpath: OriginalXPathExpr, pseudo_element: PseudoElement
    ) -> OriginalXPathExpr:
        """
        Dispatch method that transforms XPath to support pseudo-elements.
        """
        if isinstance(pseudo_element, FunctionalPseudoElement):
            # Functional form, e.g. ::attr(href) -> xpath_attr_functional_pseudo_element
            handler_name = f"xpath_{pseudo_element.name.replace('-', '_')}_functional_pseudo_element"
            handler = getattr(self, handler_name, None)
            if handler is None:
                raise ExpressionError(
                    f"The functional pseudo-element ::{pseudo_element.name}() is unknown"
                )
            return handler(xpath, pseudo_element)

        # Simple form, e.g. ::text -> xpath_text_simple_pseudo_element
        handler_name = f"xpath_{pseudo_element.replace('-', '_')}_simple_pseudo_element"
        handler = getattr(self, handler_name, None)
        if handler is None:
            raise ExpressionError(
                f"The pseudo-element ::{pseudo_element} is unknown"
            )
        return handler(xpath)

    @staticmethod
    def xpath_attr_functional_pseudo_element(
        xpath: OriginalXPathExpr, function: FunctionalPseudoElement
    ) -> XPathExpr:
        """Support selecting attribute values using ::attr() pseudo-element"""
        if function.argument_types() not in (["STRING"], ["IDENT"]):
            raise ExpressionError(
                f"Expected a single string or ident for ::attr(), got {function.arguments!r}"
            )
        return XPathExpr.from_xpath(xpath, attribute=function.arguments[0].value)

    @staticmethod
    def xpath_text_simple_pseudo_element(xpath: OriginalXPathExpr) -> XPathExpr:
        """Support selecting text nodes using ::text pseudo-element"""
        return XPathExpr.from_xpath(xpath, textnode=True)
148
+
149
+
150
class HTMLTranslator(TranslatorMixin, OriginalHTMLTranslator):
    # Cache translated selectors (bounded at 256 entries) so repeated CSS queries
    # skip re-translation. NOTE(review): lru_cache on an instance method keys on
    # `self` and keeps the translator alive for the cache's lifetime — fine if a
    # single shared translator instance is used; confirm against callers.
    @cache(maxsize=256)
    def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str:
        """Translate a CSS selector to its XPath equivalent (cached)."""
        return super().css_to_xpath(css, prefix)
scrapling/utils.py ADDED
@@ -0,0 +1,164 @@
1
+ import re
2
+ import os
3
+ import logging
4
+ from itertools import chain
5
+ from logging import handlers
6
+ # Using cache on top of a class is brilliant way to achieve Singleton design pattern without much code
7
+ from functools import lru_cache as cache # functools.cache is available on Python 3.9+ only so let's keep lru_cache
8
+
9
+ from typing import Dict, Iterable, Any
10
+
11
+ from lxml import html
12
# lxml node types to skip when walking an element's children (HTML comments are
# iterated by lxml like elements but are not real elements).
html_forbidden = {html.HtmlComment, }
# Default root-logger configuration at import time: errors-only to stderr.
# Call `setup_basic_logging` to switch to a different level/format.
logging.basicConfig(
    level=logging.ERROR,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler()
    ]
)
20
+
21
+
22
@cache(None, typed=True)  # Cached so repeated calls with the same level reconfigure only once
def setup_basic_logging(level: str = 'debug'):
    """Configure the root logger with a console handler at the given level.

    :param level: One of 'debug', 'info', 'warning', 'error' or 'critical'
        (case-insensitive).
    :raise ValueError: If `level` is not a known logging level name.
    """
    levels = {
        'debug': logging.DEBUG,
        'info': logging.INFO,
        'warning': logging.WARNING,
        'error': logging.ERROR,
        'critical': logging.CRITICAL
    }
    normalized = level.lower()
    if normalized not in levels:
        # Fixed: previously an unknown level surfaced as a bare KeyError.
        raise ValueError(f"Unknown logging level {level!r}, expected one of {sorted(levels)}")

    formatter = logging.Formatter("[%(asctime)s] %(levelname)s: %(message)s", "%Y-%m-%d %H:%M:%S")
    handler = logging.StreamHandler()
    handler.setFormatter(formatter)
    # Fixed: `logging.basicConfig` is a no-op once the root logger already has
    # handlers (this module installs one at import time), so the old call did
    # nothing. Configure the root logger directly instead.
    root = logging.getLogger()
    root.setLevel(levels[normalized])
    root.handlers = [handler]
37
+
38
+
39
def flatten(lst: Iterable):
    """Collapse one level of nesting: a list of iterables becomes one flat list."""
    return [item for sub in lst for item in sub]
41
+
42
+
43
def _is_iterable(s: Any):
    # Regex helpers use this to accept sequences of patterns while rejecting
    # str/bytes, which are technically iterable too.
    return isinstance(s, list) or isinstance(s, tuple)
46
+
47
+
48
@cache(None, typed=True)
class _Logger(object):
    # I will leave this class here for now in case I decide I want to come back to use it :)
    # NOTE: wrapping the class in lru_cache means calls with the same arguments
    # return the same cached instance — a lightweight singleton pattern.
    __slots__ = ('console_logger', 'logger_file_path',)
    # Accepted level names mapped to the stdlib logging level constants.
    levels = {
        'debug': logging.DEBUG,
        'info': logging.INFO,
        'warning': logging.WARNING,
        'error': logging.ERROR,
        'critical': logging.CRITICAL
    }

    def __init__(self, filename: str = 'debug.log', level: str = 'debug', when: str = 'midnight', backcount: int = 1):
        """Create a console logger and, when level is debug, a timed rotating file logger.

        :param filename: Log file name, placed under a `logs` directory next to this module.
        :param level: Level name; must be a key of `levels` (a bad name raises KeyError).
        :param when: Rotation interval for the file handler (see TimedRotatingFileHandler).
        :param backcount: `backupCount` for the file handler — rotated files kept before deletion.
        """
        os.makedirs(os.path.join(os.path.dirname(__file__), 'logs'), exist_ok=True)
        format_str = logging.Formatter("[%(asctime)s] %(levelname)s: %(message)s", "%Y-%m-%d %H:%M:%S")

        # on-screen output
        lvl = self.levels[level.lower()]
        self.console_logger = logging.getLogger('Scrapling')
        self.console_logger.setLevel(lvl)
        console_handler = logging.StreamHandler()
        console_handler.setLevel(lvl)
        console_handler.setFormatter(format_str)
        self.console_logger.addHandler(console_handler)

        # File output is only enabled in debug mode.
        if lvl == logging.DEBUG:
            filename = os.path.join(os.path.dirname(__file__), 'logs', filename)
            self.logger_file_path = filename
            # Automatically generates the logging file at specified intervals
            file_handler = handlers.TimedRotatingFileHandler(
                # If more than (backcount+1) existed, oldest logs will be deleted
                filename=filename, when=when, backupCount=backcount, encoding='utf-8'
            )
            file_handler.setLevel(lvl)
            file_handler.setFormatter(format_str)
            # This for the logger when it appends the date to the new log
            file_handler.namer = lambda name: name.replace(".log", "") + ".log"
            self.console_logger.addHandler(file_handler)
            self.debug(f'Debug log path: {self.logger_file_path}')
        else:
            self.logger_file_path = None

    # Thin level-specific wrappers around the underlying console logger.
    def debug(self, message: str) -> None:
        self.console_logger.debug(message)

    def info(self, message: str) -> None:
        self.console_logger.info(message)

    def warning(self, message: str) -> None:
        self.console_logger.warning(message)

    def error(self, message: str) -> None:
        self.console_logger.error(message)

    def critical(self, message: str) -> None:
        self.console_logger.critical(message)
104
+
105
+
106
class _StorageTools:
    """Helpers that serialize an lxml element's identifying properties into a plain dict."""

    @staticmethod
    def __clean_attributes(element: html.HtmlElement, forbidden: tuple = ()) -> Dict:
        # Keep only attributes with a non-empty stripped value whose name is not
        # listed in `forbidden`.
        if not element.attrib:
            return {}
        return {k: v.strip() for k, v in element.attrib.items() if v and v.strip() and k not in forbidden}

    @classmethod
    def element_to_dict(cls, element: html.HtmlElement) -> Dict:
        """Collect the element's unique properties (tag, attributes, text, path,
        and — when available — parent/sibling/children tags) into a dictionary."""
        parent = element.getparent()
        result = {
            'tag': str(element.tag),
            'attributes': cls.__clean_attributes(element),
            'text': element.text.strip() if element.text else None,
            'path': cls._get_element_path(element)
        }
        # Root elements have no parent, so parent/sibling data is optional.
        if parent is not None:
            result.update({
                'parent_name': parent.tag,
                'parent_attribs': dict(parent.attrib),
                'parent_text': parent.text.strip() if parent.text else None
            })

            siblings = [child.tag for child in parent.iterchildren() if child != element]
            if siblings:
                result.update({'siblings': tuple(siblings)})

        # Skip comment nodes etc. (see `html_forbidden`) when listing children.
        children = [child.tag for child in element.iterchildren() if type(child) not in html_forbidden]
        if children:
            result.update({'children': tuple(children)})

        return result

    @classmethod
    def _get_element_path(cls, element: html.HtmlElement):
        """Return the tuple of tag names from the document root down to `element` (recursive)."""
        parent = element.getparent()
        return tuple(
            (element.tag,) if parent is None else (
                cls._get_element_path(parent) + (element.tag,)
            )
        )
147
+
148
+
149
+ # def _root_type_verifier(method):
150
+ # # Just to make sure we are safe
151
+ # @wraps(method)
152
+ # def _impl(self, *args, **kw):
153
+ # # All html types inherits from HtmlMixin so this to check for all at once
154
+ # if not issubclass(type(self._root), html.HtmlMixin):
155
+ # raise ValueError(f"Cannot use function on a Node of type {type(self._root)!r}")
156
+ # return method(self, *args, **kw)
157
+ # return _impl
158
+
159
+
160
# NOTE(review): unbounded cache on arbitrary strings grows forever — consider a bounded maxsize.
@cache(None, typed=True)
def clean_spaces(string):
    """Normalize whitespace: tabs become spaces, newlines/carriage returns are
    removed, and runs of spaces collapse to a single space."""
    string = string.replace('\t', ' ')
    # Fixed: the old pattern '[\n|\r]' was a character class containing a literal
    # '|', so it also stripped pipe characters from the text.
    string = re.sub(r'[\n\r]', '', string)
    return re.sub(r' +', ' ', string)
@@ -0,0 +1,28 @@
1
+ BSD 3-Clause License
2
+
3
+ Copyright (c) 2024, Karim shoair
4
+
5
+ Redistribution and use in source and binary forms, with or without
6
+ modification, are permitted provided that the following conditions are met:
7
+
8
+ 1. Redistributions of source code must retain the above copyright notice, this
9
+ list of conditions and the following disclaimer.
10
+
11
+ 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ this list of conditions and the following disclaimer in the documentation
13
+ and/or other materials provided with the distribution.
14
+
15
+ 3. Neither the name of the copyright holder nor the names of its
16
+ contributors may be used to endorse or promote products derived from
17
+ this software without specific prior written permission.
18
+
19
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.