scrapling 0.2.98__py3-none-any.whl → 0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +18 -31
- scrapling/cli.py +818 -20
- scrapling/core/_html_utils.py +348 -0
- scrapling/core/_types.py +34 -17
- scrapling/core/ai.py +611 -0
- scrapling/core/custom_types.py +183 -100
- scrapling/core/mixins.py +27 -19
- scrapling/core/shell.py +647 -0
- scrapling/core/{storage_adaptors.py → storage.py} +41 -33
- scrapling/core/translator.py +20 -26
- scrapling/core/utils.py +49 -54
- scrapling/engines/__init__.py +15 -6
- scrapling/engines/_browsers/__init__.py +2 -0
- scrapling/engines/_browsers/_camoufox.py +745 -0
- scrapling/engines/_browsers/_config_tools.py +130 -0
- scrapling/engines/_browsers/_controllers.py +630 -0
- scrapling/engines/_browsers/_page.py +93 -0
- scrapling/engines/_browsers/_validators.py +150 -0
- scrapling/engines/constants.py +101 -88
- scrapling/engines/static.py +667 -110
- scrapling/engines/toolbelt/__init__.py +20 -6
- scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +2 -1
- scrapling/engines/toolbelt/convertor.py +254 -0
- scrapling/engines/toolbelt/custom.py +205 -186
- scrapling/engines/toolbelt/fingerprints.py +32 -46
- scrapling/engines/toolbelt/navigation.py +68 -39
- scrapling/fetchers.py +255 -260
- scrapling/parser.py +781 -449
- scrapling-0.3.dist-info/METADATA +409 -0
- scrapling-0.3.dist-info/RECORD +41 -0
- {scrapling-0.2.98.dist-info → scrapling-0.3.dist-info}/WHEEL +1 -1
- {scrapling-0.2.98.dist-info → scrapling-0.3.dist-info}/top_level.txt +0 -1
- scrapling/defaults.py +0 -19
- scrapling/engines/camo.py +0 -299
- scrapling/engines/pw.py +0 -428
- scrapling/engines/toolbelt/bypasses/pdf_viewer.js +0 -5
- scrapling-0.2.98.dist-info/METADATA +0 -867
- scrapling-0.2.98.dist-info/RECORD +0 -49
- tests/__init__.py +0 -1
- tests/fetchers/__init__.py +0 -1
- tests/fetchers/async/__init__.py +0 -0
- tests/fetchers/async/test_camoufox.py +0 -95
- tests/fetchers/async/test_httpx.py +0 -83
- tests/fetchers/async/test_playwright.py +0 -99
- tests/fetchers/sync/__init__.py +0 -0
- tests/fetchers/sync/test_camoufox.py +0 -68
- tests/fetchers/sync/test_httpx.py +0 -82
- tests/fetchers/sync/test_playwright.py +0 -87
- tests/fetchers/test_utils.py +0 -97
- tests/parser/__init__.py +0 -0
- tests/parser/test_automatch.py +0 -111
- tests/parser/test_general.py +0 -330
- {scrapling-0.2.98.dist-info → scrapling-0.3.dist-info}/entry_points.txt +0 -0
- {scrapling-0.2.98.dist-info → scrapling-0.3.dist-info/licenses}/LICENSE +0 -0
scrapling/core/{storage_adaptors.py → storage.py}
RENAMED
@@ -1,44 +1,49 @@
-import sqlite3
-import threading
-from abc import ABC, abstractmethod
 from hashlib import sha256
+from threading import RLock
+from functools import lru_cache
+from abc import ABC, abstractmethod
+from sqlite3 import connect as db_connect

-import orjson
-from lxml import html
+from orjson import dumps, loads
+from lxml.html import HtmlElement
 from tldextract import extract as tld

-from scrapling.core.
-from scrapling.core.
+from scrapling.core.utils import _StorageTools, log
+from scrapling.core._types import Dict, Optional, Any


-class StorageSystemMixin(ABC):
+class StorageSystemMixin(ABC):  # pragma: no cover
     # If you want to make your own storage system, you have to inherit from this
-    def __init__(self, url:
+    def __init__(self, url: Optional[str] = None):
         """
         :param url: URL of the website we are working on to separate it from other websites data
         """
         self.url = url

     @lru_cache(64, typed=True)
-    def _get_base_url(self, default_value: str =
-        if not self.url or
+    def _get_base_url(self, default_value: str = "default") -> str:
+        if not self.url or not isinstance(self.url, str):
             return default_value

         try:
             extracted = tld(self.url)
-            return
+            return (
+                extracted.top_domain_under_public_suffix
+                or extracted.domain
+                or default_value
+            )
         except AttributeError:
             return default_value

     @abstractmethod
-    def save(self, element:
+    def save(self, element: HtmlElement, identifier: str) -> None:
         """Saves the element's unique properties to the storage for retrieval and relocation later

-        :param element: The element itself
+        :param element: The element itself which we want to save to storage.
         :param identifier: This is the identifier that will be used to retrieve the element later from the storage. See
             the docs for more info.
         """
-        raise NotImplementedError(
+        raise NotImplementedError("Storage system must implement `save` method")

     @abstractmethod
     def retrieve(self, identifier: str) -> Optional[Dict]:
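The `_get_base_url` rewrite spells out a fallback chain over tldextract's parse result. A quick illustration of what those attributes hold (example URL invented; `top_domain_under_public_suffix` is the registrable domain in recent tldextract releases):

    from tldextract import extract as tld

    extracted = tld("https://blog.example.co.uk/some/page")
    # extracted.domain                          -> "example"
    # extracted.top_domain_under_public_suffix  -> "example.co.uk"
    key = extracted.top_domain_under_public_suffix or extracted.domain or "default"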
@@ -48,7 +53,7 @@ class StorageSystemMixin(ABC):
             the docs for more info.
         :return: A dictionary of the unique properties
         """
-        raise NotImplementedError(
+        raise NotImplementedError("Storage system must implement `save` method")

     @staticmethod
     @lru_cache(128, typed=True)
@@ -57,7 +62,7 @@ class StorageSystemMixin(ABC):
         identifier = identifier.lower().strip()
         if isinstance(identifier, str):
             # Hash functions have to take bytes
-            identifier = identifier.encode(
+            identifier = identifier.encode("utf-8")

         hash_value = sha256(identifier).hexdigest()
         return f"{hash_value}_{len(identifier)}"  # Length to reduce collision chance
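The hashing helper above is self-contained enough to demo directly; a minimal sketch of the key it builds (identifier invented):

    from hashlib import sha256

    identifier = "div.product > a".lower().strip()
    payload = identifier.encode("utf-8")  # hash functions take bytes, not str
    key = f"{sha256(payload).hexdigest()}_{len(payload)}"  # length suffix reduces collision chance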
@@ -66,21 +71,21 @@ class StorageSystemMixin(ABC):
 @lru_cache(1, typed=True)
 class SQLiteStorageSystem(StorageSystemMixin):
     """The recommended system to use, it's race condition safe and thread safe.
-    Mainly built so the library can run in threaded frameworks like scrapy or threaded tools
-    > It's optimized for threaded applications but running it without threads shouldn't make it slow."""
-
+    Mainly built, so the library can run in threaded frameworks like scrapy or threaded tools
+    > It's optimized for threaded applications, but running it without threads shouldn't make it slow."""
+
+    def __init__(self, storage_file: str, url: Optional[str] = None):
         """
-        :param storage_file: File to be used to store elements
+        :param storage_file: File to be used to store elements' data.
         :param url: URL of the website we are working on to separate it from other websites data

         """
         super().__init__(url)
         self.storage_file = storage_file
-
-
-        # >SQLite default mode in earlier version is 1 not 2 (1=thread-safe 2=serialized)
+        self.lock = RLock()  # Better than Lock for reentrancy
+        # >SQLite default mode in the earlier version is 1 not 2 (1=thread-safe 2=serialized)
         # `check_same_thread=False` to allow it to be used across different threads.
-        self.connection =
+        self.connection = db_connect(self.storage_file, check_same_thread=False)
         # WAL (Write-Ahead Logging) allows for better concurrency.
         self.connection.execute("PRAGMA journal_mode=WAL")
         self.cursor = self.connection.cursor()
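The constructor combines three concurrency tools: one connection shared across threads (`check_same_thread=False`), WAL journaling, and a reentrant lock around all cursor work. A standalone sketch of the same pattern; the table schema here is an assumption inferred from the `INSERT OR REPLACE` statement later in the diff:

    from sqlite3 import connect as db_connect
    from threading import RLock

    connection = db_connect("storage.db", check_same_thread=False)  # one connection, many threads
    connection.execute("PRAGMA journal_mode=WAL")  # readers no longer block the writer
    connection.execute(
        "CREATE TABLE IF NOT EXISTS storage "
        "(url TEXT, identifier TEXT, element_data BLOB, PRIMARY KEY (url, identifier))"  # assumed schema
    )
    lock = RLock()  # reentrant: a thread already holding it can re-enter locked methods

    def save(url: str, identifier: str, blob: bytes) -> None:
        with lock:  # serialize writes; SQLite allows only one writer at a time
            connection.execute(
                "INSERT OR REPLACE INTO storage (url, identifier, element_data) VALUES (?, ?, ?)",
                (url, identifier, blob),
            )
            connection.commit()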
@@ -101,24 +106,27 @@ class SQLiteStorageSystem(StorageSystemMixin):
         """)
         self.connection.commit()

-    def save(self, element:
+    def save(self, element: HtmlElement, identifier: str) -> None:
         """Saves the elements unique properties to the storage for retrieval and relocation later

-        :param element: The element itself
+        :param element: The element itself which we want to save to storage.
         :param identifier: This is the identifier that will be used to retrieve the element later from the storage. See
             the docs for more info.
         """
         url = self._get_base_url()
         element_data = _StorageTools.element_to_dict(element)
         with self.lock:
-            self.cursor.execute(
+            self.cursor.execute(
+                """
                 INSERT OR REPLACE INTO storage (url, identifier, element_data)
                 VALUES (?, ?, ?)
-            """,
+                """,
+                (url, identifier, dumps(element_data)),
+            )
             self.cursor.fetchall()
             self.connection.commit()

-    def retrieve(self, identifier: str) -> Optional[Dict]:
+    def retrieve(self, identifier: str) -> Optional[Dict[str, Any]]:
         """Using the identifier, we search the storage and return the unique properties of the element

         :param identifier: This is the identifier that will be used to retrieve the element from the storage. See
@@ -129,15 +137,15 @@ class SQLiteStorageSystem(StorageSystemMixin):
         with self.lock:
             self.cursor.execute(
                 "SELECT element_data FROM storage WHERE url = ? AND identifier = ?",
-                (url, identifier)
+                (url, identifier),
             )
             result = self.cursor.fetchone()
             if result:
-                return
+                return loads(result[0])
             return None

     def close(self):
-        """Close all connections
+        """Close all connections. It will be useful when with some things like scrapy Spider.closed() function/signal"""
         with self.lock:
             self.connection.commit()
             self.cursor.close()
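`save` and `retrieve` now import `dumps`/`loads` from orjson directly; `dumps` returns bytes, which is what lands in the `element_data` column and what `loads(result[0])` reverses. A tiny roundtrip:

    from orjson import dumps, loads

    element_data = {"tag": "a", "attributes": {"href": "/home"}, "text": "Home"}
    blob = dumps(element_data)          # bytes, ready for the BLOB column
    assert loads(blob) == element_data  # what retrieve() gives back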
scrapling/core/translator.py
CHANGED
@@ -1,30 +1,24 @@
 """
-Most of this file is adapted version of the
+Most of this file is an adapted version of the parsel library's translator with some modifications simply for 1 important reason...

-To add pseudo-elements ``::text`` and ``::attr(ATTR_NAME)`` so we match Parsel/Scrapy selectors format which will be important in future releases but most importantly...
+To add pseudo-elements ``::text`` and ``::attr(ATTR_NAME)`` so we match the Parsel/Scrapy selectors format which will be important in future releases but most importantly...

 So you don't have to learn a new selectors/api method like what bs4 done with soupsieve :)

-
+If you want to learn about this, head to https://cssselect.readthedocs.io/en/latest/#cssselect.FunctionalPseudoElement
 """

-import re
+from functools import lru_cache

 from cssselect import HTMLTranslator as OriginalHTMLTranslator
 from cssselect.parser import Element, FunctionalPseudoElement, PseudoElement
 from cssselect.xpath import ExpressionError
 from cssselect.xpath import XPathExpr as OriginalXPathExpr
-from w3lib.html import HTML5_WHITESPACE

 from scrapling.core._types import Any, Optional, Protocol, Self
-from scrapling.core.utils import lru_cache
-
-regex = f"[{HTML5_WHITESPACE}]+"
-replace_html5_whitespaces = re.compile(regex).sub


 class XPathExpr(OriginalXPathExpr):
-
     textnode: bool = False
     attribute: Optional[str] = None

@@ -34,7 +28,7 @@ class XPathExpr(OriginalXPathExpr):
         xpath: OriginalXPathExpr,
         textnode: bool = False,
         attribute: Optional[str] = None,
-    ) ->
+    ) -> Self:
         x = cls(path=xpath.path, element=xpath.element, condition=xpath.condition)
         x.textnode = textnode
         x.attribute = attribute

@@ -43,29 +37,29 @@ class XPathExpr(OriginalXPathExpr):
     def __str__(self) -> str:
         path = super().__str__()
         if self.textnode:
-            if path == "*":
+            if path == "*":  # pragma: no cover
                 path = "text()"
-            elif path.endswith("::*/*"):
+            elif path.endswith("::*/*"):  # pragma: no cover
                 path = path[:-3] + "text()"
             else:
                 path += "/text()"

         if self.attribute is not None:
-            if path.endswith("::*/*"):
+            if path.endswith("::*/*"):  # pragma: no cover
                 path = path[:-2]
             path += f"/@{self.attribute}"

         return path

     def join(
-        self:
+        self: Self,
         combiner: str,
         other: OriginalXPathExpr,
         *args: Any,
         **kwargs: Any,
-    ) ->
+    ) -> Self:
         if not isinstance(other, XPathExpr):
-            raise ValueError(
+            raise ValueError(  # pragma: no cover
                 f"Expressions of type {__name__}.XPathExpr can ony join expressions"
                 f" of the same type (or its descendants), got {type(other)}"
             )

@@ -77,10 +71,10 @@ class XPathExpr(OriginalXPathExpr):

 # e.g. cssselect.GenericTranslator, cssselect.HTMLTranslator
 class TranslatorProtocol(Protocol):
-    def xpath_element(self, selector: Element) -> OriginalXPathExpr:
+    def xpath_element(self, selector: Element) -> OriginalXPathExpr:  # pragma: no cover
         pass

-    def css_to_xpath(self, css: str, prefix: str = ...) -> str:
+    def css_to_xpath(self, css: str, prefix: str = ...) -> str:  # pragma: no cover
         pass


@@ -91,7 +85,7 @@ class TranslatorMixin:
     """

     def xpath_element(self: TranslatorProtocol, selector: Element) -> XPathExpr:
-        # https://github.com/python/mypy/issues/
+        # https://github.com/python/mypy/issues/14757
         xpath = super().xpath_element(selector)  # type: ignore[safe-super]
         return XPathExpr.from_xpath(xpath)

@@ -99,12 +93,12 @@ class TranslatorMixin:
         self, xpath: OriginalXPathExpr, pseudo_element: PseudoElement
     ) -> OriginalXPathExpr:
         """
-        Dispatch method that transforms XPath to support pseudo-
+        Dispatch method that transforms XPath to support the pseudo-element.
         """
         if isinstance(pseudo_element, FunctionalPseudoElement):
             method_name = f"xpath_{pseudo_element.name.replace('-', '_')}_functional_pseudo_element"
             method = getattr(self, method_name, None)
-            if not method:
+            if not method:  # pragma: no cover
                 raise ExpressionError(
                     f"The functional pseudo-element ::{pseudo_element.name}() is unknown"
                 )

@@ -114,7 +108,7 @@ class TranslatorMixin:
                 f"xpath_{pseudo_element.replace('-', '_')}_simple_pseudo_element"
             )
             method = getattr(self, method_name, None)
-            if not method:
+            if not method:  # pragma: no cover
                 raise ExpressionError(
                     f"The pseudo-element ::{pseudo_element} is unknown"
                 )

@@ -123,10 +117,10 @@ class TranslatorMixin:

     @staticmethod
     def xpath_attr_functional_pseudo_element(
-
+        xpath: OriginalXPathExpr, function: FunctionalPseudoElement
     ) -> XPathExpr:
         """Support selecting attribute values using ::attr() pseudo-element"""
-        if function.argument_types() not in (["STRING"], ["IDENT"]):
+        if function.argument_types() not in (["STRING"], ["IDENT"]):  # pragma: no cover
             raise ExpressionError(
                 f"Expected a single string or ident for ::attr(), got {function.arguments!r}"
             )

@@ -144,4 +138,4 @@ class HTMLTranslator(TranslatorMixin, OriginalHTMLTranslator):
         return super().css_to_xpath(css, prefix)


-
+translator = HTMLTranslator()
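Net effect of the translator changes: the public behavior stays the same (CSS with parsel-style pseudo-elements compiled to XPath), and 0.3 adds a ready-made module-level `translator` instance. A rough usage sketch; the exact XPath strings come from cssselect and are abbreviated in the comments:

    from scrapling.core.translator import translator

    translator.css_to_xpath("a.link::text")        # descendant-or-self::a[...]/text()
    translator.css_to_xpath("a.link::attr(href)")  # descendant-or-self::a[...]/@href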
scrapling/core/utils.py
CHANGED
@@ -1,17 +1,18 @@
 import logging
-import re
 from itertools import chain
+from re import compile as re_compile

-import orjson
 from lxml import html

-from scrapling.core._types import Any, Dict, Iterable,
+from scrapling.core._types import Any, Dict, Iterable, List

-# Using cache on top of a class is brilliant way to achieve Singleton design pattern without much code
-# functools.cache is available on Python 3.9+ only so let's keep lru_cache
+# Using cache on top of a class is a brilliant way to achieve a Singleton design pattern without much code
 from functools import lru_cache  # isort:skip

-html_forbidden =
+html_forbidden = (html.HtmlComment,)
+
+__CLEANING_TABLE__ = str.maketrans({"\t": " ", "\n": None, "\r": None})
+__CONSECUTIVE_SPACES_REGEX__ = re_compile(r" +")


 @lru_cache(1, typed=True)
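The surviving comment is worth unpacking: `lru_cache` memoizes any callable, and a class is callable, so decorating a class with `@lru_cache(1, typed=True)` makes every zero-argument instantiation return the same cached object. A minimal illustration with a hypothetical class:

    from functools import lru_cache

    @lru_cache(1, typed=True)
    class Config:
        def __init__(self):
            self.loaded = True

    assert Config() is Config()  # one cached instance: a Singleton without boilerplate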
@@ -20,12 +21,11 @@ def setup_logger():

     :returns: logging.Logger: Configured logger instance
     """
-    logger = logging.getLogger(
+    logger = logging.getLogger("scrapling")
     logger.setLevel(logging.INFO)

     formatter = logging.Formatter(
-        fmt="[%(asctime)s] %(levelname)s: %(message)s",
-        datefmt="%Y-%m-%d %H:%M:%S"
+        fmt="[%(asctime)s] %(levelname)s: %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
     )

     console_handler = logging.StreamHandler()

@@ -41,24 +41,19 @@ def setup_logger():
 log = setup_logger()


-def
-    if type(content) is bytes:
-        content = content.decode()
-
-    try:
-        _ = orjson.loads(content)
-        return True
-    except orjson.JSONDecodeError:
-        return False
-
-
-def flatten(lst: Iterable):
+def flatten(lst: Iterable[Any]) -> List[Any]:
     return list(chain.from_iterable(lst))


-def _is_iterable(
+def _is_iterable(obj: Any) -> bool:
     # This will be used only in regex functions to make sure it's iterable but not string/bytes
-    return isinstance(
+    return isinstance(
+        obj,
+        (
+            list,
+            tuple,
+        ),
+    )


 class _StorageTools:
@@ -66,31 +61,43 @@ class _StorageTools:
     def __clean_attributes(element: html.HtmlElement, forbidden: tuple = ()) -> Dict:
         if not element.attrib:
             return {}
-        return {
+        return {
+            k: v.strip()
+            for k, v in element.attrib.items()
+            if v and v.strip() and k not in forbidden
+        }

     @classmethod
     def element_to_dict(cls, element: html.HtmlElement) -> Dict:
         parent = element.getparent()
         result = {
-
-
-
-
+            "tag": str(element.tag),
+            "attributes": cls.__clean_attributes(element),
+            "text": element.text.strip() if element.text else None,
+            "path": cls._get_element_path(element),
         }
         if parent is not None:
-            result.update(
-
-
-
-
+            result.update(
+                {
+                    "parent_name": parent.tag,
+                    "parent_attribs": dict(parent.attrib),
+                    "parent_text": parent.text.strip() if parent.text else None,
+                }
+            )

-            siblings = [
+            siblings = [
+                child.tag for child in parent.iterchildren() if child != element
+            ]
             if siblings:
-                result.update({
+                result.update({"siblings": tuple(siblings)})

-        children = [
+        children = [
+            child.tag
+            for child in element.iterchildren()
+            if not isinstance(child, html_forbidden)
+        ]
         if children:
-            result.update({
+            result.update({"children": tuple(children)})

         return result

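For a concrete sense of the structure `element_to_dict` builds, here is approximately what it would return for a small tree; the output comments are inferred from the code above, not captured from a run:

    from lxml import html

    root = html.fromstring("<div id='box'><p class='msg'>Hi</p><span>x</span></div>")
    p = root.find("p")
    # _StorageTools.element_to_dict(p) would yield approximately:
    # {
    #     "tag": "p",
    #     "attributes": {"class": "msg"},
    #     "text": "Hi",
    #     "path": ("div", "p"),
    #     "parent_name": "div",
    #     "parent_attribs": {"id": "box"},
    #     "parent_text": None,
    #     "siblings": ("span",),
    # }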
@@ -98,25 +105,13 @@ class _StorageTools:
     def _get_element_path(cls, element: html.HtmlElement):
         parent = element.getparent()
         return tuple(
-            (element.tag,)
-
-            )
+            (element.tag,)
+            if parent is None
+            else (cls._get_element_path(parent) + (element.tag,))
         )


-# def _root_type_verifier(method):
-#     # Just to make sure we are safe
-#     @wraps(method)
-#     def _impl(self, *args, **kw):
-#         # All html types inherits from HtmlMixin so this to check for all at once
-#         if not issubclass(type(self._root), html.HtmlMixin):
-#             raise ValueError(f"Cannot use function on a Node of type {type(self._root)!r}")
-#         return method(self, *args, **kw)
-#     return _impl
-
-
 @lru_cache(128, typed=True)
 def clean_spaces(string):
-    string = string.
-
-    return re.sub(' +', ' ', string)
+    string = string.translate(__CLEANING_TABLE__)
+    return __CONSECUTIVE_SPACES_REGEX__.sub(" ", string)
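The rewritten `clean_spaces` leans on a translation table built once at import time plus a precompiled pattern, instead of doing that work on every call. A quick behavioral check using the same table and regex as above:

    from re import compile as re_compile

    table = str.maketrans({"\t": " ", "\n": None, "\r": None})
    s = "hello\tworld\r\n  again".translate(table)  # tabs -> spaces, CR/LF dropped -> "hello world  again"
    re_compile(r" +").sub(" ", s)                   # collapses space runs -> "hello world again"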
scrapling/engines/__init__.py
CHANGED
@@ -1,7 +1,16 @@
-from .
-from .
-from .
-
-
+from .constants import DEFAULT_DISABLED_RESOURCES, DEFAULT_STEALTH_FLAGS, DEFAULT_FLAGS
+from .static import FetcherSession, FetcherClient, AsyncFetcherClient
+from ._browsers import (
+    DynamicSession,
+    AsyncDynamicSession,
+    StealthySession,
+    AsyncStealthySession,
+)

-__all__ = [
+__all__ = [
+    "FetcherSession",
+    "DynamicSession",
+    "AsyncDynamicSession",
+    "StealthySession",
+    "AsyncStealthySession",
+]