scrapling 0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
scrapling/__init__.py ADDED
@@ -0,0 +1,10 @@
1
+ # Declare top-level shortcuts
2
+ from scrapling.parser import Adaptor, Adaptors
3
+ from scrapling.custom_types import TextHandler, AttributesHandler
4
+
5
+ __author__ = "Karim Shoair (karim.shoair@pm.me)"  # package author and contact e-mail
6
+ __version__ = "0.1"  # version string of this release
7
+ __copyright__ = "Copyright (c) 2024 Karim Shoair"  # copyright notice
8
+
9
+
10
+ __all__ = ['Adaptor', 'Adaptors', 'TextHandler', 'AttributesHandler']  # names exported by `from scrapling import *`
scrapling/custom_types.py ADDED
@@ -0,0 +1,146 @@
1
+ import re
2
+ from types import MappingProxyType
3
+ from collections.abc import Mapping
4
+ from typing import Dict, List, Union, Pattern
5
+
6
+ from scrapling.utils import _is_iterable, flatten
7
+
8
+ from orjson import loads, dumps
9
+ from w3lib.html import replace_entities as _replace_entities
10
+
11
+
12
class TextHandler(str):
    """Extend the built-in ``str`` with helpers that are handy while scraping.

    Instances behave exactly like normal strings; the extra methods return new
    ``TextHandler`` objects (or lists of them) so calls can be chained.
    """
    __slots__ = ()

    def __new__(cls, string):
        """Build the handler from ``string``; any non-string input becomes ``''``.

        :param string: The text to wrap. ``str`` subclasses (including other
            ``TextHandler`` instances) keep their content.
        """
        # ``str`` is immutable, so the value must be fixed here instead of in ``__init__``.
        # ``isinstance`` is used on purpose: an exact ``type(...) is str`` check would
        # silently turn every ``str`` subclass into an empty string.
        if isinstance(string, str):
            return super().__new__(cls, string)
        return super().__new__(cls, '')

    def sort(self, reverse: bool = False) -> 'TextHandler':
        """Return a new string made of the characters of this one in sorted order.

        :param reverse: If enabled, the characters are sorted in descending order.
        """
        return self.__class__("".join(sorted(self, reverse=reverse)))

    def clean(self) -> 'TextHandler':
        """Return a copy without tabs, carriage returns and newlines, with every run of
        spaces collapsed into a single space and the surrounding whitespace stripped.
        """
        # Inside a character class ``|`` is a literal pipe, not an alternation, so it
        # must not be listed here; otherwise pipes would be removed from the text too.
        data = re.sub(r'[\t\r\n]', '', self)
        data = re.sub(' +', ' ', data)
        return self.__class__(data.strip())

    def json(self) -> Dict:
        """Parse the text as JSON and return the decoded object; raises if the text is not valid JSON."""
        # Using __str__ function as a workaround for orjson issue with subclasses of str
        # Check this out: https://github.com/ijl/orjson/issues/445
        return loads(self.__str__())

    def re(
            self, regex: Union[str, Pattern[str]], replace_entities: bool = True, clean_match: bool = False,
            case_sensitive: bool = False, check_match: bool = False
    ) -> Union[List[str], bool]:
        """Apply the given regex to the current text and return a list of strings with the matches.

        :param regex: Can be either a compiled regular expression or a string.
        :param replace_entities: if enabled character entity references are replaced by their corresponding character
        :param clean_match: if enabled, the regex is applied to the result of ``clean()`` instead of the raw text
        :param case_sensitive: if enabled, a string ``regex`` is compiled with ``re.IGNORECASE``. NOTE: despite
            its name, enabling this flag makes the match case-INsensitive; the behaviour is kept as is for
            backward compatibility. It has no effect on already compiled patterns.
        :param check_match: used to quickly check if this regex matches or not without any operations on the results
        :return: ``True``/``False`` when ``check_match`` is enabled, otherwise a list of ``TextHandler`` matches
        """
        if isinstance(regex, str):
            flags = re.UNICODE | re.IGNORECASE if case_sensitive else re.UNICODE
            regex = re.compile(regex, flags)

        input_text = self.clean() if clean_match else self
        if check_match:
            # A single ``search`` answers "is there any match?" without building the full list
            return regex.search(input_text) is not None

        results = regex.findall(input_text)
        # Patterns with several groups produce tuples; merge them into one flat list
        if all(_is_iterable(res) for res in results):
            results = flatten(results)

        if not replace_entities:
            return [TextHandler(string) for string in results]

        return [TextHandler(_replace_entities(s)) for s in results]

    def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True,
                 clean_match: bool = False, case_sensitive: bool = False,):
        """Apply the given regex to text and return the first match if found, otherwise return the default value.

        :param regex: Can be either a compiled regular expression or a string.
        :param default: The default value to be returned if there is no match
        :param replace_entities: if enabled character entity references are replaced by their corresponding character
        :param clean_match: if enabled, the regex is applied to the result of ``clean()`` instead of the raw text
        :param case_sensitive: forwarded to ``re()``; if enabled, a string ``regex`` is compiled with
            ``re.IGNORECASE`` (i.e. the match ignores letter case)
        :return: The first match as ``TextHandler`` or ``default`` when nothing matched
        """
        result = self.re(regex, replace_entities, clean_match=clean_match, case_sensitive=case_sensitive)
        return result[0] if result else default
84
+
85
+
86
class AttributesHandler(Mapping):
    """A read-only mapping to use instead of the standard dictionary, with a few extra helpers.

    Values whose type is exactly ``str`` are wrapped in ``TextHandler``; every other value
    is stored unchanged. If a standard dictionary is needed, just convert an instance with
    the ``dict`` function.
    """
    __slots__ = ('_data',)

    def __init__(self, mapping=None, **kwargs):
        """Build the mapping from ``mapping`` and/or keyword arguments.

        :param mapping: Optional object exposing ``.items()`` with the initial attributes
        :param kwargs: Extra attributes; they override keys already present in ``mapping``
        """
        wrapped = {}
        if mapping is not None:
            wrapped.update((key, self._wrap(value)) for key, value in mapping.items())
        if kwargs:
            wrapped.update((key, self._wrap(value)) for key, value in kwargs.items())

        # Fastest read-only mapping type
        self._data = MappingProxyType(wrapped)

    @staticmethod
    def _wrap(value):
        """Wrap plain strings in ``TextHandler`` and return any other value untouched."""
        return TextHandler(value) if type(value) is str else value

    def get(self, key, default=None):
        """Acts like standard dictionary `.get()` method"""
        return self._data.get(key, default)

    def search_values(self, keyword, partial=False):
        """Search current attributes by values and yield an ``AttributesHandler`` for each matching item

        :param keyword: The keyword to search for in the attributes values
        :param partial: If True, the function will search if keyword in each value instead of perfect match
        """
        for key, value in self._data.items():
            if partial:
                try:
                    matched = keyword in value
                except TypeError:
                    # The value does not support containment with this keyword (e.g. an
                    # integer value): it simply cannot match, so keep scanning the rest
                    # instead of aborting the whole search.
                    matched = False
            else:
                matched = keyword == value

            if matched:
                yield AttributesHandler({key: value})

    @property
    def json_string(self):
        """Serialize current attributes to JSON; throws an error if they are not JSON serializable.

        Note: the serialization is done by ``orjson.dumps``, which returns ``bytes``
        (UTF-8 encoded JSON) rather than ``str``.
        """
        return dumps(dict(self._data))

    def __getitem__(self, key):
        return self._data[key]

    def __iter__(self):
        return iter(self._data)

    def __len__(self):
        return len(self._data)

    def __repr__(self):
        return f"{self.__class__.__name__}({self._data})"

    def __str__(self):
        return str(self._data)

    def __contains__(self, key):
        return key in self._data
scrapling/mixins.py ADDED
@@ -0,0 +1,74 @@
1
+
2
class SelectorsGeneration:
    """Selectors generation functions
    Trying to generate selectors like Firefox or maybe cleaner ones!? Ehm
    Inspiration: https://searchfox.org/mozilla-central/source/devtools/shared/inspector/css-logic.js#591

    The host class is expected to expose ``tag``, ``attrib``, ``parent``, ``children``
    and ``_root``, since these are the attributes read while walking up the tree.
    """

    def __general_selection(self, selection: str = 'css') -> str:
        """Generate a selector for the current element.

        The tree is walked from the element towards the root. The walk stops at the first
        ancestor (or the element itself) carrying an ``id``, or right below the ``html`` element.

        :param selection: ``'css'`` (case-insensitive) for a CSS selector; any other value
            generates an XPath selector.
        :return: A string of the generated selector.
        """
        css = selection.lower() == 'css'

        def render(parts, anchored_on_id=False):
            # ``parts`` is collected from the element upwards, so reverse before joining
            ordered = reversed(parts)
            if css:
                return " > ".join(ordered)
            return ('//*' if anchored_on_id else '//') + "/".join(ordered)

        path = []
        node = self
        while node is not None:
            parent = node.parent
            if not parent:
                break

            node_id = node.attrib.get('id')
            if node_id:
                # id is enough
                path.append(f'#{node_id}' if css else f"[@id='{node_id}']")
                return render(path, anchored_on_id=True)

            part = f'{node.tag}'
            # Classes are deliberately not used because some websites share the exact
            # same classes between elements; the position among same-tag siblings is
            # used instead.
            position = 0
            for sibling in parent.children:
                if sibling.tag == node.tag:
                    position += 1
                if sibling._root == node._root:
                    break

            if position > 1:
                part += f":nth-of-type({position})" if css else f"[{position}]"

            path.append(part)
            node = parent
            if node is None or node.tag == 'html':
                return render(path)

        if not path:
            # The element itself has no parent (document root or detached element).
            # Fall back to its own tag instead of returning an empty, invalid selector.
            path.append(f'{self.tag}')

        return render(path)

    @property
    def css_selector(self) -> str:
        """Generate a CSS selector for the current element
        :return: A string of the generated selector.
        """
        return self.__general_selection()

    @property
    def xpath_selector(self) -> str:
        """Generate a XPath selector for the current element
        :return: A string of the generated selector.
        """
        return self.__general_selection('xpath')