scrapling 0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +10 -0
- scrapling/custom_types.py +146 -0
- scrapling/mixins.py +74 -0
- scrapling/parser.py +908 -0
- scrapling/storage_adaptors.py +149 -0
- scrapling/translator.py +153 -0
- scrapling/utils.py +164 -0
- scrapling-0.1.dist-info/LICENSE +28 -0
- scrapling-0.1.dist-info/METADATA +475 -0
- scrapling-0.1.dist-info/RECORD +12 -0
- scrapling-0.1.dist-info/WHEEL +5 -0
- scrapling-0.1.dist-info/top_level.txt +1 -0
scrapling/__init__.py
ADDED
@@ -0,0 +1,10 @@
|
|
1
|
+
# Declare top-level shortcuts
|
2
|
+
from scrapling.parser import Adaptor, Adaptors
|
3
|
+
from scrapling.custom_types import TextHandler, AttributesHandler
|
4
|
+
|
5
|
+
__author__ = "Karim Shoair (karim.shoair@pm.me)"
|
6
|
+
__version__ = "0.1"
|
7
|
+
__copyright__ = "Copyright (c) 2024 Karim Shoair"
|
8
|
+
|
9
|
+
|
10
|
+
__all__ = ['Adaptor', 'Adaptors', 'TextHandler', 'AttributesHandler']
|
@@ -0,0 +1,146 @@
|
|
1
|
+
import re
|
2
|
+
from types import MappingProxyType
|
3
|
+
from collections.abc import Mapping
|
4
|
+
from typing import Dict, List, Union, Pattern
|
5
|
+
|
6
|
+
from scrapling.utils import _is_iterable, flatten
|
7
|
+
|
8
|
+
from orjson import loads, dumps
|
9
|
+
from w3lib.html import replace_entities as _replace_entities
|
10
|
+
|
11
|
+
|
12
|
+
class TextHandler(str):
    """Extends standard Python string by adding more functionality"""
    __slots__ = ()

    def __new__(cls, string):
        """Create the handler from `string`.

        :param string: The text to wrap. Any `str` (or `str` subclass, such as another
            `TextHandler`) keeps its content; every other type yields an empty handler.
        """
        # str is immutable, so the value must be set here instead of in __init__.
        # isinstance (rather than an exact type check) keeps the text of str subclasses
        # instead of silently collapsing them to ''.
        if isinstance(string, str):
            return super().__new__(cls, string)
        return super().__new__(cls, '')

    def sort(self, reverse: bool = False) -> str:
        """Return a new instance of this class with the characters sorted.

        :param reverse: If True, sort the characters in descending order.
        """
        return self.__class__("".join(sorted(self, reverse=reverse)))

    def clean(self) -> str:
        """Return a new instance with tabs, carriage returns and newlines removed, runs of
        spaces collapsed into a single space, and leading/trailing whitespace stripped.
        """
        # Only the three control characters belong in the class; a '|' inside [...] is a
        # literal pipe and would be deleted from the text as well.
        data = re.sub(r'[\t\r\n]', '', self)
        data = re.sub(' +', ' ', data)
        return self.__class__(data.strip())

    def json(self) -> Dict:
        """Parse the text as JSON and return the result; parsing errors raised by the JSON
        loader are propagated to the caller.
        """
        # Using __str__ function as a workaround for orjson issue with subclasses of str
        # Check this out: https://github.com/ijl/orjson/issues/445
        return loads(self.__str__())

    def re(
            self, regex: Union[str, Pattern[str]], replace_entities: bool = True, clean_match: bool = False,
            case_sensitive: bool = False, check_match: bool = False
    ) -> Union[List[str], bool]:
        """Apply the given regex to the current text and return a list of strings with the matches.

        :param regex: Can be either a compiled regular expression or a string.
        :param replace_entities: if enabled character entity references are replaced by their corresponding character
        :param clean_match: if enabled, the regex is applied to the result of `clean()` instead of the raw text
        :param case_sensitive: if enabled, the regex is compiled with `re.IGNORECASE`. NOTE: despite the
            parameter's name, enabling it turns case-insensitive matching ON; this is kept for backward
            compatibility. It only has an effect when `regex` is a string.
        :param check_match: if enabled, only return a boolean telling whether the regex matches at all
        :return: A list of `TextHandler` matches, or a boolean when `check_match` is enabled.
        """
        if isinstance(regex, str):
            flags = re.UNICODE | re.IGNORECASE if case_sensitive else re.UNICODE
            regex = re.compile(regex, flags)

        input_text = self.clean() if clean_match else self
        if check_match:
            # A single search answers "is there any match?" without building the full list
            return regex.search(input_text) is not None

        results = regex.findall(input_text)
        # When every match is reported as iterable, the results are passed through `flatten`
        # (presumably the tuples `findall` yields for multi-group patterns -- confirm in scrapling.utils)
        if all(_is_iterable(res) for res in results):
            results = flatten(results)

        if not replace_entities:
            return [TextHandler(string) for string in results]

        return [TextHandler(_replace_entities(s)) for s in results]

    def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True,
                 clean_match: bool = False, case_sensitive: bool = False,):
        """Apply the given regex to text and return the first match if found, otherwise return the default value.

        :param regex: Can be either a compiled regular expression or a string.
        :param default: The default value to be returned if there is no match
        :param replace_entities: if enabled character entity references are replaced by their corresponding character
        :param clean_match: if enabled, the regex is applied to the result of `clean()` instead of the raw text
        :param case_sensitive: forwarded to `re()`; if enabled, the regex is compiled with `re.IGNORECASE`
        :return: The first match as returned by `re()`, or `default` when there is none.
        """
        result = self.re(regex, replace_entities, clean_match=clean_match, case_sensitive=case_sensitive)
        return result[0] if result else default
|
84
|
+
|
85
|
+
|
86
|
+
class AttributesHandler(Mapping):
    """Read-only mapping used in place of a plain dictionary for element attributes.

    String values are wrapped in `TextHandler` on construction and the result is stored
    behind a `MappingProxyType`. Call `dict()` on an instance to get a standard dictionary.
    """
    __slots__ = ('_data',)

    def __init__(self, mapping=None, **kwargs):
        """Build the handler from an optional mapping plus keyword items.

        :param mapping: Optional object exposing `.items()`; its pairs are copied first.
        :param kwargs: Extra items; they override keys already taken from `mapping`.
        """
        collected = {}
        if mapping is not None:
            for name, raw in mapping.items():
                # Only exact `str` values are wrapped; everything else is stored untouched
                collected[name] = TextHandler(raw) if type(raw) is str else raw

        for name, raw in kwargs.items():
            collected[name] = TextHandler(raw) if type(raw) is str else raw

        # Read-only view over the freshly built dictionary
        self._data = MappingProxyType(collected)

    def get(self, key, default=None):
        """Return the value stored under `key`, or `default` when the key is absent."""
        return self._data.get(key, default)

    def search_values(self, keyword, partial=False):
        """Lazily yield a single-item `AttributesHandler` for every attribute whose value matches.

        :param keyword: The keyword to look for in the attribute values
        :param partial: If True, match when `keyword in value`; otherwise require `keyword == value`
        """
        for name, content in self._data.items():
            matched = (keyword in content) if partial else (keyword == content)
            if matched:
                yield AttributesHandler({name: content})

    @property
    def json_string(self):
        """Serialize the attributes with `dumps`; serialization errors are propagated.

        NOTE(review): `dumps` is imported from orjson, which returns `bytes` rather than `str`.
        """
        plain = dict(self._data)
        return dumps(plain)

    def __getitem__(self, key):
        return self._data[key]

    def __iter__(self):
        return iter(self._data)

    def __len__(self):
        return len(self._data)

    def __repr__(self):
        return f"{self.__class__.__name__}({self._data})"

    def __str__(self):
        return str(self._data)

    def __contains__(self, key):
        return key in self._data
|
scrapling/mixins.py
ADDED
@@ -0,0 +1,74 @@
|
|
1
|
+
|
2
|
+
class SelectorsGeneration:
    """Mixin that builds CSS/XPath selectors for the current element.

    The goal is selectors similar to (or cleaner than) the ones Firefox generates.
    Inspiration: https://searchfox.org/mozilla-central/source/devtools/shared/inspector/css-logic.js#591
    """

    def __general_selection(self, selection: str = 'css') -> str:
        """Walk up from the current element and build a selector for it.

        The walk stops at the first ancestor (or the element itself) that has an `id`,
        when the `html` element is reached, or when an element has no parent.

        :param selection: 'css' (case-insensitive) for a CSS selector; any other value gives XPath.
        :return: A string of the generated selector.
        """
        use_css = selection.lower() == 'css'
        steps = []  # selector parts, collected from the element upwards

        def assemble(xpath_prefix: str) -> str:
            # Parts were gathered leaf-first, so they are emitted in reverse
            ordered = reversed(steps)
            if use_css:
                return " > ".join(ordered)
            return xpath_prefix + "/".join(ordered)

        node = self
        while node is not None and node.parent:
            if node.attrib.get('id'):
                # An id is enough to anchor the selector, so stop climbing here
                if use_css:
                    steps.append(f'#{node.attrib["id"]}')
                else:
                    steps.append(f"[@id='{node.attrib['id']}']")
                return assemble('//*')

            step = f'{node.tag}'
            # Classes are deliberately not used: some websites share the exact
            # same classes between different elements.
            seen = {}
            for sibling in node.parent.children:
                seen[sibling.tag] = seen.get(sibling.tag, 0) + 1
                if sibling._root == node._root:
                    break

            # Position among same-tag siblings, only needed when it is not the first one
            if seen[node.tag] > 1:
                if use_css:
                    step += f":nth-of-type({seen[node.tag]})"
                else:
                    step += f"[{seen[node.tag]}]"

            steps.append(step)
            node = node.parent
            if node is None or node.tag == 'html':
                return assemble('//')

        return assemble('//')

    @property
    def css_selector(self) -> str:
        """Generate a CSS selector for the current element
        :return: A string of the generated selector.
        """
        return self.__general_selection()

    @property
    def xpath_selector(self) -> str:
        """Generate a XPath selector for the current element
        :return: A string of the generated selector.
        """
        return self.__general_selection('xpath')
|