scrapling 0.3.1__py3-none-any.whl → 0.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +1 -1
- scrapling/cli.py +38 -51
- scrapling/core/_html_utils.py +3 -9
- scrapling/core/ai.py +5 -13
- scrapling/core/custom_types.py +19 -61
- scrapling/core/mixins.py +6 -28
- scrapling/core/shell.py +49 -127
- scrapling/core/storage.py +2 -8
- scrapling/core/translator.py +8 -20
- scrapling/core/utils/__init__.py +10 -0
- scrapling/core/utils/_shell.py +48 -0
- scrapling/core/{utils.py → utils/_utils.py} +5 -21
- scrapling/engines/__init__.py +0 -16
- scrapling/engines/_browsers/_base.py +297 -0
- scrapling/engines/_browsers/_camoufox.py +227 -296
- scrapling/engines/_browsers/_config_tools.py +2 -1
- scrapling/engines/_browsers/_controllers.py +209 -281
- scrapling/engines/_browsers/_page.py +37 -15
- scrapling/engines/_browsers/_validators.py +9 -15
- scrapling/engines/constants.py +3 -6
- scrapling/engines/static.py +25 -75
- scrapling/engines/toolbelt/__init__.py +1 -20
- scrapling/engines/toolbelt/convertor.py +95 -86
- scrapling/engines/toolbelt/custom.py +7 -99
- scrapling/engines/toolbelt/fingerprints.py +1 -3
- scrapling/engines/toolbelt/navigation.py +4 -58
- scrapling/fetchers.py +29 -24
- scrapling/parser.py +45 -122
- {scrapling-0.3.1.dist-info → scrapling-0.3.2.dist-info}/METADATA +54 -46
- scrapling-0.3.2.dist-info/RECORD +44 -0
- scrapling-0.3.1.dist-info/RECORD +0 -41
- {scrapling-0.3.1.dist-info → scrapling-0.3.2.dist-info}/WHEEL +0 -0
- {scrapling-0.3.1.dist-info → scrapling-0.3.2.dist-info}/entry_points.txt +0 -0
- {scrapling-0.3.1.dist-info → scrapling-0.3.2.dist-info}/licenses/LICENSE +0 -0
- {scrapling-0.3.1.dist-info → scrapling-0.3.2.dist-info}/top_level.txt +0 -0
scrapling/fetchers.py
CHANGED
@@ -6,16 +6,18 @@ from scrapling.core._types import (
|
|
6
6
|
SelectorWaitStates,
|
7
7
|
Iterable,
|
8
8
|
)
|
9
|
-
from scrapling.engines import (
|
9
|
+
from scrapling.engines.static import (
|
10
10
|
FetcherSession,
|
11
|
-
StealthySession,
|
12
|
-
AsyncStealthySession,
|
13
|
-
DynamicSession,
|
14
|
-
AsyncDynamicSession,
|
15
11
|
FetcherClient as _FetcherClient,
|
16
12
|
AsyncFetcherClient as _AsyncFetcherClient,
|
17
13
|
)
|
18
|
-
from scrapling.engines.
|
14
|
+
from scrapling.engines._browsers import (
|
15
|
+
DynamicSession,
|
16
|
+
StealthySession,
|
17
|
+
AsyncDynamicSession,
|
18
|
+
AsyncStealthySession,
|
19
|
+
)
|
20
|
+
from scrapling.engines.toolbelt.custom import BaseFetcher, Response
|
19
21
|
|
20
22
|
__FetcherClientInstance__ = _FetcherClient()
|
21
23
|
__AsyncFetcherClientInstance__ = _AsyncFetcherClient()
|
@@ -56,6 +58,7 @@ class StealthyFetcher(BaseFetcher):
|
|
56
58
|
block_webrtc: bool = False,
|
57
59
|
allow_webgl: bool = True,
|
58
60
|
network_idle: bool = False,
|
61
|
+
load_dom: bool = True,
|
59
62
|
humanize: bool | float = True,
|
60
63
|
solve_cloudflare: bool = False,
|
61
64
|
wait: int | float = 0,
|
@@ -92,11 +95,12 @@ class StealthyFetcher(BaseFetcher):
|
|
92
95
|
:param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
|
93
96
|
:param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
|
94
97
|
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
98
|
+
:param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
|
95
99
|
:param disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
|
96
100
|
:param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
|
97
101
|
:param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
|
98
102
|
:param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
|
99
|
-
:param page_action: Added for automation. A function that takes the `page` object
|
103
|
+
:param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
|
100
104
|
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
|
101
105
|
:param init_script: An absolute path to a JavaScript file to be executed on page creation with this request.
|
102
106
|
:param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, and spoof the WebRTC IP address.
|
@@ -112,13 +116,10 @@ class StealthyFetcher(BaseFetcher):
|
|
112
116
|
if not custom_config:
|
113
117
|
custom_config = {}
|
114
118
|
elif not isinstance(custom_config, dict):
|
115
|
-
ValueError(
|
116
|
-
f"The custom parser config must be of type dictionary, got {cls.__class__}"
|
117
|
-
)
|
119
|
+
ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
|
118
120
|
|
119
121
|
with StealthySession(
|
120
122
|
wait=wait,
|
121
|
-
max_pages=1,
|
122
123
|
proxy=proxy,
|
123
124
|
geoip=geoip,
|
124
125
|
addons=addons,
|
@@ -126,6 +127,7 @@ class StealthyFetcher(BaseFetcher):
|
|
126
127
|
cookies=cookies,
|
127
128
|
headless=headless,
|
128
129
|
humanize=humanize,
|
130
|
+
load_dom=load_dom,
|
129
131
|
disable_ads=disable_ads,
|
130
132
|
allow_webgl=allow_webgl,
|
131
133
|
page_action=page_action,
|
@@ -155,6 +157,7 @@ class StealthyFetcher(BaseFetcher):
|
|
155
157
|
block_webrtc: bool = False,
|
156
158
|
allow_webgl: bool = True,
|
157
159
|
network_idle: bool = False,
|
160
|
+
load_dom: bool = True,
|
158
161
|
humanize: bool | float = True,
|
159
162
|
solve_cloudflare: bool = False,
|
160
163
|
wait: int | float = 0,
|
@@ -191,11 +194,12 @@ class StealthyFetcher(BaseFetcher):
|
|
191
194
|
:param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
|
192
195
|
:param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
|
193
196
|
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
197
|
+
:param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
|
194
198
|
:param disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
|
195
199
|
:param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
|
196
200
|
:param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
|
197
201
|
:param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
|
198
|
-
:param page_action: Added for automation. A function that takes the `page` object
|
202
|
+
:param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
|
199
203
|
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
|
200
204
|
:param init_script: An absolute path to a JavaScript file to be executed on page creation with this request.
|
201
205
|
:param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, and spoof the WebRTC IP address.
|
@@ -211,9 +215,7 @@ class StealthyFetcher(BaseFetcher):
|
|
211
215
|
if not custom_config:
|
212
216
|
custom_config = {}
|
213
217
|
elif not isinstance(custom_config, dict):
|
214
|
-
ValueError(
|
215
|
-
f"The custom parser config must be of type dictionary, got {cls.__class__}"
|
216
|
-
)
|
218
|
+
ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
|
217
219
|
|
218
220
|
async with AsyncStealthySession(
|
219
221
|
wait=wait,
|
@@ -225,6 +227,7 @@ class StealthyFetcher(BaseFetcher):
|
|
225
227
|
cookies=cookies,
|
226
228
|
headless=headless,
|
227
229
|
humanize=humanize,
|
230
|
+
load_dom=load_dom,
|
228
231
|
disable_ads=disable_ads,
|
229
232
|
allow_webgl=allow_webgl,
|
230
233
|
page_action=page_action,
|
@@ -285,6 +288,7 @@ class DynamicFetcher(BaseFetcher):
|
|
285
288
|
init_script: Optional[str] = None,
|
286
289
|
cookies: Optional[Iterable[Dict]] = None,
|
287
290
|
network_idle: bool = False,
|
291
|
+
load_dom: bool = True,
|
288
292
|
wait_selector_state: SelectorWaitStates = "attached",
|
289
293
|
custom_config: Optional[Dict] = None,
|
290
294
|
) -> Response:
|
@@ -298,9 +302,10 @@ class DynamicFetcher(BaseFetcher):
|
|
298
302
|
:param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
|
299
303
|
:param cookies: Set cookies for the next request.
|
300
304
|
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
305
|
+
:param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
|
301
306
|
:param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
|
302
307
|
:param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
|
303
|
-
:param page_action: Added for automation. A function that takes the `page` object
|
308
|
+
:param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
|
304
309
|
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
|
305
310
|
:param init_script: An absolute path to a JavaScript file to be executed on page creation with this request.
|
306
311
|
:param locale: Set the locale for the browser if wanted. The default value is `en-US`.
|
@@ -319,9 +324,7 @@ class DynamicFetcher(BaseFetcher):
|
|
319
324
|
if not custom_config:
|
320
325
|
custom_config = {}
|
321
326
|
elif not isinstance(custom_config, dict):
|
322
|
-
raise ValueError(
|
323
|
-
f"The custom parser config must be of type dictionary, got {cls.__class__}"
|
324
|
-
)
|
327
|
+
raise ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
|
325
328
|
|
326
329
|
with DynamicSession(
|
327
330
|
wait=wait,
|
@@ -332,6 +335,7 @@ class DynamicFetcher(BaseFetcher):
|
|
332
335
|
cdp_url=cdp_url,
|
333
336
|
cookies=cookies,
|
334
337
|
headless=headless,
|
338
|
+
load_dom=load_dom,
|
335
339
|
useragent=useragent,
|
336
340
|
real_chrome=real_chrome,
|
337
341
|
page_action=page_action,
|
@@ -371,6 +375,7 @@ class DynamicFetcher(BaseFetcher):
|
|
371
375
|
init_script: Optional[str] = None,
|
372
376
|
cookies: Optional[Iterable[Dict]] = None,
|
373
377
|
network_idle: bool = False,
|
378
|
+
load_dom: bool = True,
|
374
379
|
wait_selector_state: SelectorWaitStates = "attached",
|
375
380
|
custom_config: Optional[Dict] = None,
|
376
381
|
) -> Response:
|
@@ -384,9 +389,10 @@ class DynamicFetcher(BaseFetcher):
|
|
384
389
|
:param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
|
385
390
|
:param cookies: Set cookies for the next request.
|
386
391
|
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
392
|
+
:param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
|
387
393
|
:param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
|
388
394
|
:param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
|
389
|
-
:param page_action: Added for automation. A function that takes the `page` object
|
395
|
+
:param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
|
390
396
|
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
|
391
397
|
:param init_script: An absolute path to a JavaScript file to be executed on page creation with this request.
|
392
398
|
:param locale: Set the locale for the browser if wanted. The default value is `en-US`.
|
@@ -405,12 +411,11 @@ class DynamicFetcher(BaseFetcher):
|
|
405
411
|
if not custom_config:
|
406
412
|
custom_config = {}
|
407
413
|
elif not isinstance(custom_config, dict):
|
408
|
-
raise ValueError(
|
409
|
-
f"The custom parser config must be of type dictionary, got {cls.__class__}"
|
410
|
-
)
|
414
|
+
raise ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
|
411
415
|
|
412
416
|
async with AsyncDynamicSession(
|
413
417
|
wait=wait,
|
418
|
+
max_pages=1,
|
414
419
|
proxy=proxy,
|
415
420
|
locale=locale,
|
416
421
|
timeout=timeout,
|
@@ -418,8 +423,8 @@ class DynamicFetcher(BaseFetcher):
|
|
418
423
|
cdp_url=cdp_url,
|
419
424
|
cookies=cookies,
|
420
425
|
headless=headless,
|
426
|
+
load_dom=load_dom,
|
421
427
|
useragent=useragent,
|
422
|
-
max_pages=1,
|
423
428
|
real_chrome=real_chrome,
|
424
429
|
page_action=page_action,
|
425
430
|
hide_canvas=hide_canvas,
|
scrapling/parser.py
CHANGED
@@ -1,12 +1,11 @@
|
|
1
|
-
from pathlib import Path
|
2
1
|
import re
|
2
|
+
from pathlib import Path
|
3
3
|
from inspect import signature
|
4
|
-
from difflib import SequenceMatcher
|
5
4
|
from urllib.parse import urljoin
|
5
|
+
from difflib import SequenceMatcher
|
6
6
|
|
7
|
-
from cssselect import SelectorError, SelectorSyntaxError
|
8
|
-
from cssselect import parse as split_selectors
|
9
7
|
from lxml.html import HtmlElement, HtmlMixin, HTMLParser
|
8
|
+
from cssselect import SelectorError, SelectorSyntaxError, parse as split_selectors
|
10
9
|
from lxml.etree import (
|
11
10
|
XPath,
|
12
11
|
tostring,
|
@@ -75,7 +74,7 @@ class Selector(SelectorsGeneration):
|
|
75
74
|
self,
|
76
75
|
content: Optional[str | bytes] = None,
|
77
76
|
url: Optional[str] = None,
|
78
|
-
encoding: str = "
|
77
|
+
encoding: str = "utf-8",
|
79
78
|
huge_tree: bool = True,
|
80
79
|
root: Optional[HtmlElement] = None,
|
81
80
|
keep_comments: Optional[bool] = False,
|
@@ -110,22 +109,16 @@ class Selector(SelectorsGeneration):
|
|
110
109
|
If empty, default values will be used.
|
111
110
|
"""
|
112
111
|
if root is None and content is None:
|
113
|
-
raise ValueError(
|
114
|
-
"Selector class needs HTML content, or root arguments to work"
|
115
|
-
)
|
112
|
+
raise ValueError("Selector class needs HTML content, or root arguments to work")
|
116
113
|
|
117
114
|
self.__text = None
|
118
115
|
if root is None:
|
119
116
|
if isinstance(content, str):
|
120
|
-
body = (
|
121
|
-
content.strip().replace("\x00", "").encode(encoding) or b"<html/>"
|
122
|
-
)
|
117
|
+
body = content.strip().replace("\x00", "").encode(encoding) or b"<html/>"
|
123
118
|
elif isinstance(content, bytes):
|
124
|
-
body = content.replace(b"\x00", b"")
|
119
|
+
body = content.replace(b"\x00", b"")
|
125
120
|
else:
|
126
|
-
raise TypeError(
|
127
|
-
f"content argument must be str or bytes, got {type(content)}"
|
128
|
-
)
|
121
|
+
raise TypeError(f"content argument must be str or bytes, got {type(content)}")
|
129
122
|
|
130
123
|
# https://lxml.de/api/lxml.etree.HTMLParser-class.html
|
131
124
|
parser = HTMLParser(
|
@@ -139,8 +132,7 @@ class Selector(SelectorsGeneration):
|
|
139
132
|
strip_cdata=(not keep_cdata),
|
140
133
|
)
|
141
134
|
self._root = fromstring(body, parser=parser, base_url=url)
|
142
|
-
|
143
|
-
self._raw_body = body.decode()
|
135
|
+
self._raw_body = content
|
144
136
|
|
145
137
|
else:
|
146
138
|
# All HTML types inherit from HtmlMixin so this to check for all at once
|
@@ -165,16 +157,10 @@ class Selector(SelectorsGeneration):
|
|
165
157
|
}
|
166
158
|
|
167
159
|
if not hasattr(storage, "__wrapped__"):
|
168
|
-
raise ValueError(
|
169
|
-
"Storage class must be wrapped with lru_cache decorator, see docs for info"
|
170
|
-
)
|
160
|
+
raise ValueError("Storage class must be wrapped with lru_cache decorator, see docs for info")
|
171
161
|
|
172
|
-
if not issubclass(
|
173
|
-
storage.__wrapped__, StorageSystemMixin
174
|
-
): # pragma: no cover
|
175
|
-
raise ValueError(
|
176
|
-
"Storage system must be inherited from class `StorageSystemMixin`"
|
177
|
-
)
|
162
|
+
if not issubclass(storage.__wrapped__, StorageSystemMixin): # pragma: no cover
|
163
|
+
raise ValueError("Storage system must be inherited from class `StorageSystemMixin`")
|
178
164
|
|
179
165
|
self._storage = storage(**storage_args)
|
180
166
|
|
@@ -239,9 +225,7 @@ class Selector(SelectorsGeneration):
|
|
239
225
|
|
240
226
|
def __element_convertor(self, element: HtmlElement) -> "Selector":
|
241
227
|
"""Used internally to convert a single HtmlElement to Selector directly without checks"""
|
242
|
-
db_instance = (
|
243
|
-
self._storage if (hasattr(self, "_storage") and self._storage) else None
|
244
|
-
)
|
228
|
+
db_instance = self._storage if (hasattr(self, "_storage") and self._storage) else None
|
245
229
|
return Selector(
|
246
230
|
root=element,
|
247
231
|
url=self.url,
|
@@ -355,18 +339,19 @@ class Selector(SelectorsGeneration):
|
|
355
339
|
@property
|
356
340
|
def html_content(self) -> TextHandler:
|
357
341
|
"""Return the inner HTML code of the element"""
|
358
|
-
return TextHandler(
|
359
|
-
tostring(self._root, encoding="unicode", method="html", with_tail=False)
|
360
|
-
)
|
342
|
+
return TextHandler(tostring(self._root, encoding=self.encoding, method="html", with_tail=False))
|
361
343
|
|
362
|
-
|
344
|
+
@property
|
345
|
+
def body(self):
|
346
|
+
"""Return the raw body of the current `Selector` without any processing. Useful for binary and non-HTML requests."""
|
347
|
+
return self._raw_body
|
363
348
|
|
364
349
|
def prettify(self) -> TextHandler:
|
365
350
|
"""Return a prettified version of the element's inner html-code"""
|
366
351
|
return TextHandler(
|
367
352
|
tostring(
|
368
353
|
self._root,
|
369
|
-
encoding="unicode",
|
354
|
+
encoding=self.encoding,
|
370
355
|
pretty_print=True,
|
371
356
|
method="html",
|
372
357
|
with_tail=False,
|
@@ -404,9 +389,7 @@ class Selector(SelectorsGeneration):
|
|
404
389
|
def siblings(self) -> "Selectors":
|
405
390
|
"""Return other children of the current element's parent or empty list otherwise"""
|
406
391
|
if self.parent:
|
407
|
-
return Selectors(
|
408
|
-
child for child in self.parent.children if child._root != self._root
|
409
|
-
)
|
392
|
+
return Selectors(child for child in self.parent.children if child._root != self._root)
|
410
393
|
return Selectors()
|
411
394
|
|
412
395
|
def iterancestors(self) -> Generator["Selector", None, None]:
|
@@ -519,9 +502,7 @@ class Selector(SelectorsGeneration):
|
|
519
502
|
log.debug(f"Highest probability was {highest_probability}%")
|
520
503
|
log.debug("Top 5 best matching elements are: ")
|
521
504
|
for percent in tuple(sorted(score_table.keys(), reverse=True))[:5]:
|
522
|
-
log.debug(
|
523
|
-
f"{percent} -> {self.__handle_elements(score_table[percent])}"
|
524
|
-
)
|
505
|
+
log.debug(f"{percent} -> {self.__handle_elements(score_table[percent])}")
|
525
506
|
|
526
507
|
if not selector_type:
|
527
508
|
return score_table[highest_probability]
|
@@ -658,9 +639,7 @@ class Selector(SelectorsGeneration):
|
|
658
639
|
SelectorError,
|
659
640
|
SelectorSyntaxError,
|
660
641
|
) as e:
|
661
|
-
raise SelectorSyntaxError(
|
662
|
-
f"Invalid CSS selector '{selector}': {str(e)}"
|
663
|
-
) from e
|
642
|
+
raise SelectorSyntaxError(f"Invalid CSS selector '{selector}': {str(e)}") from e
|
664
643
|
|
665
644
|
def xpath(
|
666
645
|
self,
|
@@ -702,9 +681,7 @@ class Selector(SelectorsGeneration):
|
|
702
681
|
elif self.__adaptive_enabled and auto_save:
|
703
682
|
self.save(elements[0], identifier or selector)
|
704
683
|
|
705
|
-
return self.__handle_elements(
|
706
|
-
elements[0:1] if (_first_match and elements) else elements
|
707
|
-
)
|
684
|
+
return self.__handle_elements(elements[0:1] if (_first_match and elements) else elements)
|
708
685
|
elif self.__adaptive_enabled:
|
709
686
|
if adaptive:
|
710
687
|
element_data = self.retrieve(identifier or selector)
|
@@ -713,9 +690,7 @@ class Selector(SelectorsGeneration):
|
|
713
690
|
if elements is not None and auto_save:
|
714
691
|
self.save(elements[0], identifier or selector)
|
715
692
|
|
716
|
-
return self.__handle_elements(
|
717
|
-
elements[0:1] if (_first_match and elements) else elements
|
718
|
-
)
|
693
|
+
return self.__handle_elements(elements[0:1] if (_first_match and elements) else elements)
|
719
694
|
else:
|
720
695
|
if adaptive:
|
721
696
|
log.warning(
|
@@ -726,9 +701,7 @@ class Selector(SelectorsGeneration):
|
|
726
701
|
"Argument `auto_save` will be ignored because `adaptive` wasn't enabled on initialization. Check docs for more info."
|
727
702
|
)
|
728
703
|
|
729
|
-
return self.__handle_elements(
|
730
|
-
elements[0:1] if (_first_match and elements) else elements
|
731
|
-
)
|
704
|
+
return self.__handle_elements(elements[0:1] if (_first_match and elements) else elements)
|
732
705
|
|
733
706
|
except (
|
734
707
|
SelectorError,
|
@@ -751,9 +724,7 @@ class Selector(SelectorsGeneration):
|
|
751
724
|
"""
|
752
725
|
|
753
726
|
if not args and not kwargs:
|
754
|
-
raise TypeError(
|
755
|
-
"You have to pass something to search with, like tag name(s), tag attributes, or both."
|
756
|
-
)
|
727
|
+
raise TypeError("You have to pass something to search with, like tag name(s), tag attributes, or both.")
|
757
728
|
|
758
729
|
attributes = dict()
|
759
730
|
tags, patterns = set(), set()
|
@@ -766,18 +737,11 @@ class Selector(SelectorsGeneration):
|
|
766
737
|
|
767
738
|
elif type(arg) in (list, tuple, set):
|
768
739
|
if not all(map(lambda x: isinstance(x, str), arg)):
|
769
|
-
raise TypeError(
|
770
|
-
"Nested Iterables are not accepted, only iterables of tag names are accepted"
|
771
|
-
)
|
740
|
+
raise TypeError("Nested Iterables are not accepted, only iterables of tag names are accepted")
|
772
741
|
tags.update(set(arg))
|
773
742
|
|
774
743
|
elif isinstance(arg, dict):
|
775
|
-
if not all(
|
776
|
-
[
|
777
|
-
(isinstance(k, str) and isinstance(v, str))
|
778
|
-
for k, v in arg.items()
|
779
|
-
]
|
780
|
-
):
|
744
|
+
if not all([(isinstance(k, str) and isinstance(v, str)) for k, v in arg.items()]):
|
781
745
|
raise TypeError(
|
782
746
|
"Nested dictionaries are not accepted, only string keys and string values are accepted"
|
783
747
|
)
|
@@ -795,13 +759,9 @@ class Selector(SelectorsGeneration):
|
|
795
759
|
)
|
796
760
|
|
797
761
|
else:
|
798
|
-
raise TypeError(
|
799
|
-
f'Argument with type "{type(arg)}" is not accepted, please read the docs.'
|
800
|
-
)
|
762
|
+
raise TypeError(f'Argument with type "{type(arg)}" is not accepted, please read the docs.')
|
801
763
|
|
802
|
-
if not all(
|
803
|
-
[(isinstance(k, str) and isinstance(v, str)) for k, v in kwargs.items()]
|
804
|
-
):
|
764
|
+
if not all([(isinstance(k, str) and isinstance(v, str)) for k, v in kwargs.items()]):
|
805
765
|
raise TypeError("Only string values are accepted for arguments")
|
806
766
|
|
807
767
|
for attribute_name, value in kwargs.items():
|
@@ -825,9 +785,7 @@ class Selector(SelectorsGeneration):
|
|
825
785
|
if results:
|
826
786
|
# From the results, get the ones that fulfill passed regex patterns
|
827
787
|
for pattern in patterns:
|
828
|
-
results = results.filter(
|
829
|
-
lambda e: e.text.re(pattern, check_match=True)
|
830
|
-
)
|
788
|
+
results = results.filter(lambda e: e.text.re(pattern, check_match=True))
|
831
789
|
|
832
790
|
# From the results, get the ones that fulfill passed functions
|
833
791
|
for function in functions:
|
@@ -858,9 +816,7 @@ class Selector(SelectorsGeneration):
|
|
858
816
|
return element
|
859
817
|
return None
|
860
818
|
|
861
|
-
def __calculate_similarity_score(
|
862
|
-
self, original: Dict, candidate: HtmlElement
|
863
|
-
) -> float:
|
819
|
+
def __calculate_similarity_score(self, original: Dict, candidate: HtmlElement) -> float:
|
864
820
|
"""Used internally to calculate a score that shows how a candidate element similar to the original one
|
865
821
|
|
866
822
|
:param original: The original element in the form of the dictionary generated from `element_to_dict` function
|
@@ -877,15 +833,11 @@ class Selector(SelectorsGeneration):
|
|
877
833
|
checks += 1
|
878
834
|
|
879
835
|
if original["text"]:
|
880
|
-
score += SequenceMatcher(
|
881
|
-
None, original["text"], candidate.get("text") or ""
|
882
|
-
).ratio() # * 0.3 # 30%
|
836
|
+
score += SequenceMatcher(None, original["text"], candidate.get("text") or "").ratio() # * 0.3 # 30%
|
883
837
|
checks += 1
|
884
838
|
|
885
839
|
# if both don't have attributes, it still counts for something!
|
886
|
-
score += self.__calculate_dict_diff(
|
887
|
-
original["attributes"], candidate["attributes"]
|
888
|
-
) # * 0.3 # 30%
|
840
|
+
score += self.__calculate_dict_diff(original["attributes"], candidate["attributes"]) # * 0.3 # 30%
|
889
841
|
checks += 1
|
890
842
|
|
891
843
|
# Separate similarity test for class, id, href,... this will help in full structural changes
|
@@ -903,9 +855,7 @@ class Selector(SelectorsGeneration):
|
|
903
855
|
).ratio() # * 0.3 # 30%
|
904
856
|
checks += 1
|
905
857
|
|
906
|
-
score += SequenceMatcher(
|
907
|
-
None, original["path"], candidate["path"]
|
908
|
-
).ratio() # * 0.1 # 10%
|
858
|
+
score += SequenceMatcher(None, original["path"], candidate["path"]).ratio() # * 0.1 # 10%
|
909
859
|
checks += 1
|
910
860
|
|
911
861
|
if original.get("parent_name"):
|
@@ -944,14 +894,8 @@ class Selector(SelectorsGeneration):
|
|
944
894
|
@staticmethod
|
945
895
|
def __calculate_dict_diff(dict1: Dict, dict2: Dict) -> float:
|
946
896
|
"""Used internally to calculate similarity between two dictionaries as SequenceMatcher doesn't accept dictionaries"""
|
947
|
-
score = (
|
948
|
-
SequenceMatcher(None, tuple(dict1.keys()), tuple(dict2.keys())).ratio()
949
|
-
* 0.5
|
950
|
-
)
|
951
|
-
score += (
|
952
|
-
SequenceMatcher(None, tuple(dict1.values()), tuple(dict2.values())).ratio()
|
953
|
-
* 0.5
|
954
|
-
)
|
897
|
+
score = SequenceMatcher(None, tuple(dict1.keys()), tuple(dict2.keys())).ratio() * 0.5
|
898
|
+
score += SequenceMatcher(None, tuple(dict1.values()), tuple(dict2.values())).ratio() * 0.5
|
955
899
|
return score
|
956
900
|
|
957
901
|
def save(self, element: Union["Selector", HtmlElement], identifier: str) -> None:
|
@@ -992,7 +936,7 @@ class Selector(SelectorsGeneration):
|
|
992
936
|
# Operations on text functions
|
993
937
|
def json(self) -> Dict:
|
994
938
|
"""Return JSON response if the response is jsonable otherwise throws error"""
|
995
|
-
if self._raw_body:
|
939
|
+
if self._raw_body and isinstance(self._raw_body, str):
|
996
940
|
return TextHandler(self._raw_body).json()
|
997
941
|
elif self.text:
|
998
942
|
return self.text.json()
|
@@ -1031,9 +975,7 @@ class Selector(SelectorsGeneration):
|
|
1031
975
|
:param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
|
1032
976
|
:param case_sensitive: if disabled, the function will set the regex to ignore the letters case while compiling it
|
1033
977
|
"""
|
1034
|
-
return self.text.re_first(
|
1035
|
-
regex, default, replace_entities, clean_match, case_sensitive
|
1036
|
-
)
|
978
|
+
return self.text.re_first(regex, default, replace_entities, clean_match, case_sensitive)
|
1037
979
|
|
1038
980
|
@staticmethod
|
1039
981
|
def __get_attributes(element: HtmlElement, ignore_attributes: List | Tuple) -> Dict:
|
@@ -1052,9 +994,7 @@ class Selector(SelectorsGeneration):
|
|
1052
994
|
"""Calculate a score of how much these elements are alike and return True
|
1053
995
|
if the score is higher or equals the threshold"""
|
1054
996
|
candidate_attributes = (
|
1055
|
-
self.__get_attributes(candidate, ignore_attributes)
|
1056
|
-
if ignore_attributes
|
1057
|
-
else candidate.attrib
|
997
|
+
self.__get_attributes(candidate, ignore_attributes) if ignore_attributes else candidate.attrib
|
1058
998
|
)
|
1059
999
|
score, checks = 0, 0
|
1060
1000
|
|
@@ -1116,11 +1056,7 @@ class Selector(SelectorsGeneration):
|
|
1116
1056
|
similar_elements = list()
|
1117
1057
|
|
1118
1058
|
current_depth = len(list(root.iterancestors()))
|
1119
|
-
target_attrs = (
|
1120
|
-
self.__get_attributes(root, ignore_attributes)
|
1121
|
-
if ignore_attributes
|
1122
|
-
else root.attrib
|
1123
|
-
)
|
1059
|
+
target_attrs = self.__get_attributes(root, ignore_attributes) if ignore_attributes else root.attrib
|
1124
1060
|
|
1125
1061
|
path_parts = [self.tag]
|
1126
1062
|
if (parent := root.getparent()) is not None:
|
@@ -1129,9 +1065,7 @@ class Selector(SelectorsGeneration):
|
|
1129
1065
|
path_parts.insert(0, grandparent.tag)
|
1130
1066
|
|
1131
1067
|
xpath_path = "//{}".format("/".join(path_parts))
|
1132
|
-
potential_matches = root.xpath(
|
1133
|
-
f"{xpath_path}[count(ancestor::*) = {current_depth}]"
|
1134
|
-
)
|
1068
|
+
potential_matches = root.xpath(f"{xpath_path}[count(ancestor::*) = {current_depth}]")
|
1135
1069
|
|
1136
1070
|
for potential_match in potential_matches:
|
1137
1071
|
if potential_match != root and self.__are_alike(
|
@@ -1275,12 +1209,7 @@ class Selectors(List[Selector]):
|
|
1275
1209
|
|
1276
1210
|
:return: `Selectors` class.
|
1277
1211
|
"""
|
1278
|
-
results = [
|
1279
|
-
n.xpath(
|
1280
|
-
selector, identifier or selector, False, auto_save, percentage, **kwargs
|
1281
|
-
)
|
1282
|
-
for n in self
|
1283
|
-
]
|
1212
|
+
results = [n.xpath(selector, identifier or selector, False, auto_save, percentage, **kwargs) for n in self]
|
1284
1213
|
return self.__class__(flatten(results))
|
1285
1214
|
|
1286
1215
|
def css(
|
@@ -1308,10 +1237,7 @@ class Selectors(List[Selector]):
|
|
1308
1237
|
|
1309
1238
|
:return: `Selectors` class.
|
1310
1239
|
"""
|
1311
|
-
results = [
|
1312
|
-
n.css(selector, identifier or selector, False, auto_save, percentage)
|
1313
|
-
for n in self
|
1314
|
-
]
|
1240
|
+
results = [n.css(selector, identifier or selector, False, auto_save, percentage) for n in self]
|
1315
1241
|
return self.__class__(flatten(results))
|
1316
1242
|
|
1317
1243
|
def re(
|
@@ -1329,10 +1255,7 @@ class Selectors(List[Selector]):
|
|
1329
1255
|
:param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
|
1330
1256
|
:param case_sensitive: if disabled, the function will set the regex to ignore the letters case while compiling it
|
1331
1257
|
"""
|
1332
|
-
results = [
|
1333
|
-
n.text.re(regex, replace_entities, clean_match, case_sensitive)
|
1334
|
-
for n in self
|
1335
|
-
]
|
1258
|
+
results = [n.text.re(regex, replace_entities, clean_match, case_sensitive) for n in self]
|
1336
1259
|
return TextHandlers(flatten(results))
|
1337
1260
|
|
1338
1261
|
def re_first(
|