scrapling 0.1__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
scrapling/__init__.py ADDED
@@ -0,0 +1,10 @@
1
+ # Declare top-level shortcuts
2
+ from scrapling.parser import Adaptor, Adaptors
3
+ from scrapling.custom_types import TextHandler, AttributesHandler
4
+
5
+ __author__ = "Karim Shoair (karim.shoair@pm.me)"
6
+ __version__ = "0.1"
7
+ __copyright__ = "Copyright (c) 2024 Karim Shoair"
8
+
9
+
10
+ __all__ = ['Adaptor', 'Adaptors', 'TextHandler', 'AttributesHandler']
scrapling/custom_types.py ADDED
@@ -0,0 +1,146 @@
1
+ import re
2
+ from types import MappingProxyType
3
+ from collections.abc import Mapping
4
+ from typing import Dict, List, Union, Pattern
5
+
6
+ from scrapling.utils import _is_iterable, flatten
7
+
8
+ from orjson import loads, dumps
9
+ from w3lib.html import replace_entities as _replace_entities
10
+
11
+
12
class TextHandler(str):
    """Extends standard Python string by adding more functionality"""
    __slots__ = ()

    def __new__(cls, string):
        """Create the handler from `string`.

        :param string: The text to wrap. Any `str` instance (including `str` subclasses such as
            `TextHandler` itself) keeps its content; every other type results in an empty string.
        """
        # str is immutable, so the value must be set here rather than in __init__.
        # isinstance (instead of an exact type check) keeps the content of str subclasses,
        # which an exact `type(...) is str` comparison would silently turn into ''.
        if isinstance(string, str):
            return super().__new__(cls, string)
        return super().__new__(cls, '')

    def sort(self, reverse: bool = False) -> str:
        """Return a new handler holding the characters of this string in sorted order

        :param reverse: if enabled, the characters are sorted in descending order
        """
        return self.__class__("".join(sorted(self, reverse=reverse)))

    def clean(self) -> str:
        """Return a new version of the string after removing tabs/line breaks, collapsing
        consecutive spaces into one and stripping leading/trailing white spaces"""
        # The character class must not contain '|': inside [...] it is a literal pipe,
        # not an alternation, and would be deleted from the text as well.
        data = re.sub(r'[\t\r\n]', '', self)
        data = re.sub(' +', ' ', data)
        return self.__class__(data.strip())

    def json(self) -> Dict:
        """Return json response if the response is jsonable otherwise throw error"""
        # Using __str__ function as a workaround for orjson issue with subclasses of str
        # Check this out: https://github.com/ijl/orjson/issues/445
        return loads(self.__str__())

    def re(
            self, regex: Union[str, Pattern[str]], replace_entities: bool = True, clean_match: bool = False,
            case_sensitive: bool = False, check_match: bool = False
    ) -> Union[List[str], bool]:
        """Apply the given regex to the current text and return a list of strings with the matches.

        :param regex: Can be either a compiled regular expression or a string.
        :param replace_entities: if enabled character entity references are replaced by their corresponding character
        :param clean_match: if enabled, the regex is applied to the output of `clean()` instead of the raw text
        :param case_sensitive: if enabled, a string regex is compiled with `re.IGNORECASE`.
            NOTE: despite the name, enabling this flag makes matching ignore letter case; the default
            (False) compiles the pattern without `re.IGNORECASE`. Already-compiled patterns are used as-is.
        :param check_match: used to quickly check if this regex matches or not without any operations on the results

        :return: A list of `TextHandler` matches, or a boolean when `check_match` is enabled.
        """
        if isinstance(regex, str):
            flags = re.UNICODE | re.IGNORECASE if case_sensitive else re.UNICODE
            regex = re.compile(regex, flags)

        input_text = self.clean() if clean_match else self
        results = regex.findall(input_text)
        if check_match:
            return bool(results)

        # presumably merges the per-match group tuples `findall` returns for multi-group
        # patterns into one flat list -- verify against scrapling.utils.flatten
        if all(_is_iterable(res) for res in results):
            results = flatten(results)

        if not replace_entities:
            return [TextHandler(string) for string in results]

        return [TextHandler(_replace_entities(s)) for s in results]

    def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True,
                 clean_match: bool = False, case_sensitive: bool = False,):
        """Apply the given regex to text and return the first match if found, otherwise return the default value.

        :param regex: Can be either a compiled regular expression or a string.
        :param default: The default value to be returned if there is no match
        :param replace_entities: if enabled character entity references are replaced by their corresponding character
        :param clean_match: if enabled, the regex is applied to the output of `clean()` instead of the raw text
        :param case_sensitive: forwarded to `re()`; if enabled, a string regex is compiled with `re.IGNORECASE`

        """
        result = self.re(regex, replace_entities, clean_match=clean_match, case_sensitive=case_sensitive)
        return result[0] if result else default
84
+
85
+
86
class AttributesHandler(Mapping):
    """A read-only mapping used instead of the standard dictionary.

    String values are wrapped in `TextHandler` on construction; values of any other type are stored
    unchanged. The data is kept behind a `MappingProxyType`, so it can't be modified through this class.
    If standard dictionary is needed, just convert this class to dictionary with `dict` function
    """
    __slots__ = ('_data',)

    def __init__(self, mapping=None, **kwargs):
        """Build the mapping from an optional source mapping plus keyword arguments.

        :param mapping: Any object exposing `.items()`; `None` means "start empty".
        :param kwargs: Extra items; they override items of `mapping` that share the same key.
        """
        merged = {} if mapping is None else dict(mapping.items())
        merged.update(kwargs)

        # Only exact `str` values are wrapped; everything else is stored as given
        wrapped = {
            key: TextHandler(value) if type(value) is str else value
            for key, value in merged.items()
        }

        # Fastest read-only mapping type
        self._data = MappingProxyType(wrapped)

    def get(self, key, default=None):
        """Acts like standard dictionary `.get()` method"""
        return self._data.get(key, default)

    def search_values(self, keyword, partial=False):
        """Search current attributes by values and yield an `AttributesHandler` for each matching item

        :param keyword: The keyword to search for in the attributes values
        :param partial: If True, the function will search if keyword in each value instead of perfect match
        """
        for key, value in self._data.items():
            if partial:
                try:
                    matched = keyword in value
                except TypeError:
                    # The value doesn't support a containment test with this keyword
                    # (e.g. an int value), so it simply can't match.
                    matched = False
            else:
                matched = keyword == value

            if matched:
                yield AttributesHandler({key: value})

    @property
    def json_string(self):
        """Serialize the current attributes with `dumps` and return its result; throws an error if
        the attributes are not JSON serializable.

        NOTE: `dumps` here is `orjson.dumps`, which is documented to return `bytes` rather than `str`.
        """
        return dumps(dict(self._data))

    def __getitem__(self, key):
        return self._data[key]

    def __iter__(self):
        return iter(self._data)

    def __len__(self):
        return len(self._data)

    def __repr__(self):
        return f"{self.__class__.__name__}({self._data})"

    def __str__(self):
        return str(self._data)

    def __contains__(self, key):
        return key in self._data
scrapling/mixins.py ADDED
@@ -0,0 +1,74 @@
1
+
2
class SelectorsGeneration:
    """Selectors generation functions
    Trying to generate selectors like Firefox or maybe cleaner ones!? Ehm
    Inspiration: https://searchfox.org/mozilla-central/source/devtools/shared/inspector/css-logic.js#591"""

    def __general_selection(self, selection: str = 'css') -> str:
        """Generate a selector for the current element by walking up through its parents.

        :param selection: 'css' (case-insensitive) for a CSS selector; any other value generates XPath.
        :return: A string of the generated selector.
        """
        as_css = selection.lower() == 'css'
        # Collected from the current element upwards, so it is reversed when assembled
        segments = []

        def assemble(xpath_prefix: str = '//') -> str:
            ordered = reversed(segments)
            if as_css:
                return " > ".join(ordered)
            return xpath_prefix + "/".join(ordered)

        node = self
        while node is not None:
            if not node.parent:
                break

            if node.attrib.get('id'):
                # id is enough, no need to climb any further
                if as_css:
                    segments.append(f'#{node.attrib["id"]}')
                else:
                    segments.append(f"[@id='{node.attrib['id']}']")
                return assemble('//*')

            # Classes are not used because some websites share the exact same classes between elements
            segment = f'{node.tag}'

            # Count siblings per tag up to (and including) the current element
            seen = {}
            for sibling in node.parent.children:
                seen[sibling.tag] = seen.get(sibling.tag, 0) + 1
                if sibling._root == node._root:
                    break

            position = seen[node.tag]
            if position > 1:
                if as_css:
                    segment += f":nth-of-type({position})"
                else:
                    segment += f"[{position}]"

            segments.append(segment)
            node = node.parent
            if node is None or node.tag == 'html':
                return assemble()

        return assemble()

    @property
    def css_selector(self) -> str:
        """Generate a CSS selector for the current element
        :return: A string of the generated selector.
        """
        return self.__general_selection()

    @property
    def xpath_selector(self) -> str:
        """Generate a XPath selector for the current element
        :return: A string of the generated selector.
        """
        return self.__general_selection('xpath')