scrapling 0.1__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- scrapling/__init__.py +10 -0
- scrapling/custom_types.py +146 -0
- scrapling/mixins.py +74 -0
- scrapling/parser.py +908 -0
- scrapling/storage_adaptors.py +149 -0
- scrapling/translator.py +153 -0
- scrapling/utils.py +164 -0
- scrapling-0.1.dist-info/LICENSE +28 -0
- scrapling-0.1.dist-info/METADATA +475 -0
- scrapling-0.1.dist-info/RECORD +12 -0
- scrapling-0.1.dist-info/WHEEL +5 -0
- scrapling-0.1.dist-info/top_level.txt +1 -0
scrapling/__init__.py
ADDED
@@ -0,0 +1,10 @@
|
|
1
|
+
# Package entry point: re-export the main public classes so that users can
# write `from scrapling import Adaptor` instead of importing the submodules.
from scrapling.parser import Adaptor, Adaptors
from scrapling.custom_types import TextHandler, AttributesHandler

# Package metadata
__author__ = "Karim Shoair (karim.shoair@pm.me)"
__version__ = "0.1"
__copyright__ = "Copyright (c) 2024 Karim Shoair"

# Names exported by `from scrapling import *`
__all__ = [
    'Adaptor',
    'Adaptors',
    'TextHandler',
    'AttributesHandler',
]
|
@@ -0,0 +1,146 @@
|
|
1
|
+
import re
|
2
|
+
from types import MappingProxyType
|
3
|
+
from collections.abc import Mapping
|
4
|
+
from typing import Dict, List, Union, Pattern
|
5
|
+
|
6
|
+
from scrapling.utils import _is_iterable, flatten
|
7
|
+
|
8
|
+
from orjson import loads, dumps
|
9
|
+
from w3lib.html import replace_entities as _replace_entities
|
10
|
+
|
11
|
+
|
12
|
+
class TextHandler(str):
    """Extends standard Python string by adding more functionality"""
    __slots__ = ()

    def __new__(cls, string):
        """Create the handler from `string`; any non-string input results in an empty handler.

        :param string: The text to wrap. Instances of `str` subclasses (including `TextHandler`
            itself) keep their content.
        """
        # str is immutable, so the value has to be supplied here instead of `__init__`.
        # `isinstance` is used rather than an exact type check so that wrapping a str subclass
        # does not silently throw the text away.
        if isinstance(string, str):
            return super().__new__(cls, string)
        return super().__new__(cls, '')

    def sort(self, reverse: bool = False) -> str:
        """Return a new handler whose characters are sorted.

        :param reverse: If True, sort the characters in descending order.
        """
        return self.__class__("".join(sorted(self, reverse=reverse)))

    def clean(self) -> str:
        """Return a new handler with tabs, carriage returns and newlines removed, runs of spaces
        collapsed into a single space, and leading/trailing whitespace stripped.
        """
        # A plain character class: inside `[...]` a `|` is a literal pipe, not an alternation,
        # so it must not be listed or pipes would be deleted from the text.
        data = re.sub(r'[\t\r\n]', '', self)
        data = re.sub(' +', ' ', data)
        return self.__class__(data.strip())

    def json(self) -> Dict:
        """Return json response if the response is jsonable otherwise throw error"""
        # Using __str__ function as a workaround for orjson issue with subclasses of str
        # Check this out: https://github.com/ijl/orjson/issues/445
        return loads(self.__str__())

    def re(
            self, regex: Union[str, Pattern[str]], replace_entities: bool = True, clean_match: bool = False,
            case_sensitive: bool = False, check_match: bool = False
    ) -> Union[List[str], bool]:
        """Apply the given regex to the current text and return a list of strings with the matches.

        :param regex: Can be either a compiled regular expression or a string.
        :param replace_entities: if enabled character entity references are replaced by their corresponding character
        :param clean_match: if enabled, the regex is applied to the result of `clean()` instead of the raw text
        :param case_sensitive: NOTE: despite its name, when enabled a string `regex` is compiled with
            `re.IGNORECASE`; when disabled (the default) matching is case-sensitive. This is kept as-is for
            backward compatibility. It has no effect when `regex` is already a compiled pattern.
        :param check_match: used to quickly check if this regex matches or not without any operations on the results

        :return: A list of `TextHandler` matches, or a boolean if `check_match` is enabled.
        """
        if isinstance(regex, str):
            flags = re.UNICODE
            if case_sensitive:
                # See the note in the docstring: this flag historically turns ON case-insensitive matching
                flags |= re.IGNORECASE
            regex = re.compile(regex, flags)

        input_text = self.clean() if clean_match else self
        results = regex.findall(input_text)
        if check_match:
            return bool(results)

        # Patterns with several groups make `findall` return sequences, flatten those into one list
        if all(_is_iterable(res) for res in results):
            results = flatten(results)

        if not replace_entities:
            return [TextHandler(string) for string in results]

        return [TextHandler(_replace_entities(s)) for s in results]

    def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True,
                 clean_match: bool = False, case_sensitive: bool = False,):
        """Apply the given regex to text and return the first match if found, otherwise return the default value.

        :param regex: Can be either a compiled regular expression or a string.
        :param default: The default value to be returned if there is no match
        :param replace_entities: if enabled character entity references are replaced by their corresponding character
        :param clean_match: if enabled, the regex is applied to the result of `clean()` instead of the raw text
        :param case_sensitive: forwarded to `re()` unchanged; see the note there about its inverted meaning

        :return: The first match as a `TextHandler`, or `default` when nothing matched.
        """
        result = self.re(regex, replace_entities, clean_match=clean_match, case_sensitive=case_sensitive)
        return result[0] if result else default
|
84
|
+
|
85
|
+
|
86
|
+
class AttributesHandler(Mapping):
    """Read-only mapping used in place of a plain dictionary to hold attributes, with a few
    extra helpers on top of the standard `Mapping` interface.
    Call `dict()` on an instance when a regular, mutable dictionary is needed.
    """
    __slots__ = ('_data',)

    def __init__(self, mapping=None, **kwargs):
        """Build the handler from an optional mapping plus keyword arguments.

        Values that are exactly of type `str` are wrapped in `TextHandler`; everything else is
        stored untouched. Keyword arguments override keys coming from `mapping`.
        """
        collected = {}
        if mapping is not None:
            for name, item in mapping.items():
                collected[name] = TextHandler(item) if type(item) is str else item

        for name, item in kwargs.items():
            collected[name] = TextHandler(item) if type(item) is str else item

        # Expose the collected data through a read-only proxy
        self._data = MappingProxyType(collected)

    def get(self, key, default=None):
        """Return the value stored under `key`, or `default` when the key is missing"""
        return self._data.get(key, default)

    def search_values(self, keyword, partial=False):
        """Yield a single-item `AttributesHandler` for every attribute whose value matches `keyword`

        :param keyword: The keyword to look for in the attributes values
        :param partial: If True, a value matches when `keyword in value`; otherwise it has to be equal to `keyword`
        """
        for name, item in self._data.items():
            matched = (keyword in item) if partial else (keyword == item)
            if matched:
                yield AttributesHandler({name: item})

    @property
    def json_string(self):
        """Serialize the attributes with `dumps`; raises if they are not JSON serializable.

        NOTE(review): `dumps` comes from orjson, which is documented to return `bytes` rather than `str` - confirm.
        """
        plain = dict(self._data)
        return dumps(plain)

    def __getitem__(self, key):
        return self._data[key]

    def __iter__(self):
        return iter(self._data)

    def __len__(self):
        return len(self._data)

    def __contains__(self, key):
        return key in self._data

    def __str__(self):
        return str(self._data)

    def __repr__(self):
        return f"{self.__class__.__name__}({self._data})"
|
scrapling/mixins.py
ADDED
@@ -0,0 +1,74 @@
|
|
1
|
+
|
2
|
+
class SelectorsGeneration:
    """Mixin that generates CSS/XPath selectors for the current element.

    The goal is to produce selectors similar to (or cleaner than) the ones Firefox generates.
    Inspiration: https://searchfox.org/mozilla-central/source/devtools/shared/inspector/css-logic.js#591
    """

    def __general_selection(self, selection: str = 'css') -> str:
        """Generate a selector for the current element by walking up through its parents.

        :param selection: 'css' (case-insensitive) for a CSS selector, anything else for an XPath one.
        :return: A string of the generated selector.
        """
        use_css = selection.lower() == 'css'
        separator = " > " if use_css else "/"
        segments = []  # Collected from the element upwards, reversed when rendering

        def render(xpath_prefix: str) -> str:
            joined = separator.join(reversed(segments))
            return joined if use_css else xpath_prefix + joined

        node = self
        while node is not None:
            if not node.parent:
                break

            element_id = node.attrib.get('id')
            if element_id:
                # An id is enough to anchor the selector, no need to climb any further
                segments.append(f'#{element_id}' if use_css else f"[@id='{element_id}']")
                return render('//*')

            segment = f'{node.tag}'
            # Classes are intentionally not used because some websites share the exact same
            # classes between elements; the position among same-tag siblings is used instead.
            tally = {}
            for sibling in node.parent.children:
                tally[sibling.tag] = tally.get(sibling.tag, 0) + 1
                if sibling._root == node._root:
                    # Reached the current element, its position among its tag is now known
                    break

            position = tally[node.tag]
            if position > 1:
                segment += f":nth-of-type({position})" if use_css else f"[{position}]"

            segments.append(segment)
            node = node.parent
            if node is None or node.tag == 'html':
                # Stop below the document root; the `html` element itself is never included
                break

        return render('//')

    @property
    def css_selector(self) -> str:
        """CSS selector generated for the current element

        :return: A string of the generated selector.
        """
        return self.__general_selection()

    @property
    def xpath_selector(self) -> str:
        """XPath selector generated for the current element

        :return: A string of the generated selector.
        """
        return self.__general_selection('xpath')
|