scrapling 0.1.2__py3-none-any.whl → 0.2__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- scrapling/__init__.py +4 -3
- scrapling/core/__init__.py +0 -0
- scrapling/core/_types.py +25 -0
- scrapling/{custom_types.py → core/custom_types.py} +48 -3
- scrapling/{mixins.py → core/mixins.py} +22 -7
- scrapling/{storage_adaptors.py → core/storage_adaptors.py} +2 -2
- scrapling/{translator.py → core/translator.py} +2 -12
- scrapling/{utils.py → core/utils.py} +2 -61
- scrapling/engines/__init__.py +7 -0
- scrapling/engines/camo.py +121 -0
- scrapling/engines/constants.py +108 -0
- scrapling/engines/pw.py +232 -0
- scrapling/engines/static.py +112 -0
- scrapling/engines/toolbelt/__init__.py +18 -0
- scrapling/engines/toolbelt/custom.py +168 -0
- scrapling/engines/toolbelt/fingerprints.py +81 -0
- scrapling/engines/toolbelt/navigation.py +74 -0
- scrapling/fetchers.py +190 -0
- scrapling/parser.py +216 -51
- scrapling-0.2.dist-info/METADATA +807 -0
- scrapling-0.2.dist-info/RECORD +32 -0
- {scrapling-0.1.2.dist-info → scrapling-0.2.dist-info}/WHEEL +1 -1
- {scrapling-0.1.2.dist-info → scrapling-0.2.dist-info}/top_level.txt +1 -0
- tests/__init__.py +1 -0
- tests/fetchers/__init__.py +1 -0
- tests/fetchers/test_camoufox.py +62 -0
- tests/fetchers/test_httpx.py +67 -0
- tests/fetchers/test_playwright.py +74 -0
- tests/parser/__init__.py +0 -0
- tests/parser/test_automatch.py +56 -0
- tests/parser/test_general.py +286 -0
- scrapling-0.1.2.dist-info/METADATA +0 -477
- scrapling-0.1.2.dist-info/RECORD +0 -12
- {scrapling-0.1.2.dist-info → scrapling-0.2.dist-info}/LICENSE +0 -0
@@ -0,0 +1,74 @@
|
|
1
|
+
"""
|
2
|
+
Functions related to files and URLs
|
3
|
+
"""
|
4
|
+
|
5
|
+
import os
|
6
|
+
import logging
|
7
|
+
from urllib.parse import urlparse, urlencode
|
8
|
+
|
9
|
+
from scrapling.core.utils import cache
|
10
|
+
from scrapling.core._types import Union, Dict, Optional
|
11
|
+
from scrapling.engines.constants import DEFAULT_DISABLED_RESOURCES
|
12
|
+
|
13
|
+
from playwright.sync_api import Route
|
14
|
+
|
15
|
+
|
16
|
+
def intercept_route(route: Route) -> Union[Route, None]:
    """Playwright route handler that aborts requests for unneeded resource types.

    Any request whose resource type appears in `DEFAULT_DISABLED_RESOURCES` is
    dropped; everything else continues normally.

    :param route: PlayWright `Route` object of the current page
    :return: PlayWright `Route` object
    """
    request = route.request
    if request.resource_type not in DEFAULT_DISABLED_RESOURCES:
        return route.continue_()

    logging.debug(f'Blocking background resource "{request.url}" of type "{request.resource_type}"')
    return route.abort()
|
26
|
+
|
27
|
+
|
28
|
+
def construct_cdp_url(cdp_url: str, query_params: Optional[Dict] = None) -> str:
    """Takes a CDP URL, reconstruct it to check it's valid, then adds encoded parameters if exists

    :param cdp_url: The target URL.
    :param query_params: A dictionary of the parameters to add.
    :return: The new CDP URL.
    :raises ValueError: If the scheme isn't ws/wss, the hostname is missing, or the URL can't be parsed.
    """
    try:
        # Validate the base URL structure
        parsed = urlparse(cdp_url)

        # CDP endpoints are WebSocket endpoints, so only ws/wss make sense here
        if parsed.scheme not in ('ws', 'wss'):
            raise ValueError("CDP URL must use 'ws://' or 'wss://' scheme")

        # Validate hostname and port
        if not parsed.netloc:
            raise ValueError("Invalid hostname for the CDP URL")

        # Ensure path starts with / so the reconstructed URL stays well-formed
        path = parsed.path
        if not path.startswith('/'):
            path = '/' + path

        # Reconstruct the base URL with validated parts
        # NOTE: any query string/fragment already present on the input URL is
        # intentionally dropped here; only `query_params` are appended below.
        validated_base = f"{parsed.scheme}://{parsed.netloc}{path}"

        # Add query parameters
        if query_params:
            query_string = urlencode(query_params)
            return f"{validated_base}?{query_string}"

        return validated_base

    except Exception as e:
        # Fix: chain the original exception (`from e`) so the root cause and
        # full traceback aren't lost when the error is re-wrapped.
        raise ValueError(f"Invalid CDP URL: {str(e)}") from e
|
64
|
+
|
65
|
+
|
66
|
+
@cache(None, typed=True)
def js_bypass_path(filename: str) -> str:
    """Resolve the absolute path of a JS file that lives in the `bypasses` folder next to this module.

    :param filename: The base filename of the JS file.
    :return: The full path of the JS file.
    """
    bypasses_folder = os.path.join(os.path.dirname(__file__), 'bypasses')
    return os.path.join(bypasses_folder, filename)
|
scrapling/fetchers.py
ADDED
@@ -0,0 +1,190 @@
|
|
1
|
+
from scrapling.core._types import Dict, Optional, Union, Callable, List, Literal
|
2
|
+
|
3
|
+
from scrapling.engines.toolbelt import Response, BaseFetcher, do_nothing
|
4
|
+
from scrapling.engines import CamoufoxEngine, PlaywrightEngine, StaticEngine, check_if_engine_usable
|
5
|
+
|
6
|
+
|
7
|
+
class Fetcher(BaseFetcher):
    """A basic `Fetcher` class type that can only do basic GET, POST, PUT, and DELETE HTTP requests based on httpx.

    Any additional keyword arguments passed to the methods below are passed to the respective httpx's method directly.
    """
    def _static_engine(self, follow_redirects: bool, timeout: Optional[Union[int, float]]) -> StaticEngine:
        """Build a `StaticEngine` wired up with this fetcher's adaptor arguments."""
        return StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments)

    def get(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
        """Make basic HTTP GET request for you but with some added flavors.
        :param url: Target url.
        :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
        :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
        :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
            create a referer header as if this request had came from Google's search of this URL's domain.
        :param kwargs: Any additional keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
        :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
        """
        return self._static_engine(follow_redirects, timeout).get(url, stealthy_headers, **kwargs)

    def post(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
        """Make basic HTTP POST request for you but with some added flavors.
        :param url: Target url.
        :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
        :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
        :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
            create a referer header as if this request came from Google's search of this URL's domain.
        :param kwargs: Any additional keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
        :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
        """
        return self._static_engine(follow_redirects, timeout).post(url, stealthy_headers, **kwargs)

    def put(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
        """Make basic HTTP PUT request for you but with some added flavors.
        :param url: Target url
        :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
        :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
        :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
            create a referer header as if this request came from Google's search of this URL's domain.
        :param kwargs: Any additional keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
        :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
        """
        return self._static_engine(follow_redirects, timeout).put(url, stealthy_headers, **kwargs)

    def delete(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
        """Make basic HTTP DELETE request for you but with some added flavors.
        :param url: Target url
        :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
        :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
        :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
            create a referer header as if this request came from Google's search of this URL's domain.
        :param kwargs: Any additional keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
        :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
        """
        return self._static_engine(follow_redirects, timeout).delete(url, stealthy_headers, **kwargs)
|
63
|
+
|
64
|
+
|
65
|
+
class StealthyFetcher(BaseFetcher):
    """A `Fetcher` class type that is completely stealthy fetcher that uses a modified version of Firefox.

    It works as real browsers passing almost all online tests/protections based on Camoufox.
    Other added flavors include setting the faked OS fingerprints to match the user's OS and the referer of every request is set as if this request came from Google's search of this URL's domain.
    """
    def fetch(
            self, url: str, headless: Optional[Union[bool, Literal['virtual']]] = True, block_images: Optional[bool] = False, disable_resources: Optional[bool] = False,
            block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = False, network_idle: Optional[bool] = False, addons: Optional[List[str]] = None,
            timeout: Optional[float] = 30000, page_action: Callable = do_nothing, wait_selector: Optional[str] = None, humanize: Optional[Union[bool, float]] = True,
            wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None
    ) -> Response:
        """Opens up a browser and do your request based on your chosen options below.

        :param url: Target url.
        :param headless: Run the browser in headless/hidden (default), 'virtual' screen mode, or headful/visible mode.
        :param block_images: Prevent the loading of images through Firefox preferences.
            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
        :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends but it made requests ~25% faster in my tests for some websites.
            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
        :param block_webrtc: Blocks WebRTC entirely.
        :param addons: List of Firefox addons to use. Must be paths to extracted addons.
        :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
        :param allow_webgl: Whether to allow WebGL. To prevent leaks, only use this for special cases.
        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
        :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
        :param wait_selector: Wait for a specific css selector to be in a specific state.
        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`.
        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
        :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
        """
        # Collect the engine configuration once, then hand everything to Camoufox.
        engine_settings = {
            'timeout': timeout,
            'headless': headless,
            'page_action': page_action,
            'block_images': block_images,
            'block_webrtc': block_webrtc,
            'addons': addons,
            'humanize': humanize,
            'allow_webgl': allow_webgl,
            'disable_resources': disable_resources,
            'network_idle': network_idle,
            'wait_selector': wait_selector,
            'wait_selector_state': wait_selector_state,
            'google_search': google_search,
            'extra_headers': extra_headers,
            'adaptor_arguments': self.adaptor_arguments,
        }
        return CamoufoxEngine(**engine_settings).fetch(url)
|
117
|
+
|
118
|
+
|
119
|
+
class PlayWrightFetcher(BaseFetcher):
    """A `Fetcher` class type that provide many options, all of them are based on PlayWright.

    Using this Fetcher class, you can do requests with:
        - Vanilla Playwright without any modifications other than the ones you chose.
        - Stealthy Playwright with the stealth mode I wrote for it. It's still a work in progress but it bypasses many online tests like bot.sannysoft.com
            Some of the things stealth mode does include:
                1) Patches the CDP runtime fingerprint.
                2) Mimics some of the real browsers' properties by injecting several JS files and using custom options.
                3) Using custom flags on launch to hide Playwright even more and make it faster.
                4) Generates real browser's headers of the same type and same user OS then append it to the request.
        - Real browsers by passing the CDP URL of your browser to be controlled by the Fetcher and most of the options can be enabled on it.
        - NSTBrowser's docker browserless option by passing the CDP URL and enabling `nstbrowser_mode` option.
    > Note that these are the main options with PlayWright but it can be mixed together.
    """
    def fetch(
            self, url: str, headless: Union[bool, str] = True, disable_resources: bool = None,
            useragent: Optional[str] = None, network_idle: Optional[bool] = False, timeout: Optional[float] = 30000,
            page_action: Callable = do_nothing, wait_selector: Optional[str] = None, wait_selector_state: Optional[str] = 'attached',
            hide_canvas: bool = True, disable_webgl: bool = False, extra_headers: Optional[Dict[str, str]] = None, google_search: Optional[bool] = True,
            stealth: bool = False,
            cdp_url: Optional[str] = None,
            nstbrowser_mode: bool = False, nstbrowser_config: Optional[Dict] = None,
    ) -> Response:
        """Opens up a browser and do your request based on your chosen options below.
        :param url: Target url.
        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
        :param disable_resources: Drop requests of unnecessary resources for speed boost. It depends but it made requests ~25% faster in my tests for some websites.
            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
        :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
        :param wait_selector: Wait for a specific css selector to be in a specific state.
        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`.
        :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
        :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
        :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
        :param nstbrowser_mode: Enables NSTBrowser mode, it have to be used with `cdp_url` argument or it will get completely ignored.
        :param nstbrowser_config: The config you want to send with requests to the NSTBrowser. If left empty, Scrapling defaults to an optimized NSTBrowser's docker browserless config.
        :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
        """
        # Gather every engine knob in one mapping, then unpack it into the engine.
        engine_settings = {
            'timeout': timeout,
            'stealth': stealth,
            'cdp_url': cdp_url,
            'headless': headless,
            'useragent': useragent,
            'page_action': page_action,
            'hide_canvas': hide_canvas,
            'network_idle': network_idle,
            'google_search': google_search,
            'extra_headers': extra_headers,
            'wait_selector': wait_selector,
            'disable_webgl': disable_webgl,
            'nstbrowser_mode': nstbrowser_mode,
            'nstbrowser_config': nstbrowser_config,
            'disable_resources': disable_resources,
            'wait_selector_state': wait_selector_state,
            'adaptor_arguments': self.adaptor_arguments,
        }
        return PlaywrightEngine(**engine_settings).fetch(url)
|
185
|
+
|
186
|
+
|
187
|
+
class CustomFetcher(BaseFetcher):
    # Escape hatch fetcher: lets the caller supply their own engine, which is
    # validated by `check_if_engine_usable` before being used.
    def fetch(self, url: str, browser_engine, **kwargs) -> Response:
        """Validate the given engine, instantiate it with this fetcher's adaptor arguments, then fetch `url` with it.

        :param url: Target url.
        :param browser_engine: The engine to use for fetching; presumably an engine class (the validated result
            is called with constructor keyword arguments) — confirm against `check_if_engine_usable`.
        :param kwargs: Any additional keyword arguments are forwarded to the engine being constructed.
        :return: A `Response` object returned by the engine's `fetch` method.
        """
        engine = check_if_engine_usable(browser_engine)(adaptor_arguments=self.adaptor_arguments, **kwargs)
        return engine.fetch(url)
|
scrapling/parser.py
CHANGED
@@ -1,18 +1,14 @@
|
|
1
1
|
import os
|
2
|
+
import re
|
3
|
+
import inspect
|
2
4
|
from difflib import SequenceMatcher
|
3
|
-
from typing import Any, Dict, List, Tuple, Optional, Pattern, Union, Callable, Generator
|
4
|
-
try:
|
5
|
-
from typing import SupportsIndex
|
6
|
-
except ImportError:
|
7
|
-
# 'SupportsIndex' got added in Python 3.8
|
8
|
-
SupportsIndex = None
|
9
|
-
|
10
|
-
from scrapling.translator import HTMLTranslator
|
11
|
-
from scrapling.mixins import SelectorsGeneration
|
12
|
-
from scrapling.custom_types import TextHandler, AttributesHandler
|
13
|
-
from scrapling.storage_adaptors import SQLiteStorageSystem, StorageSystemMixin, _StorageTools
|
14
|
-
from scrapling.utils import setup_basic_logging, logging, clean_spaces, flatten, html_forbidden
|
15
5
|
|
6
|
+
from scrapling.core.translator import HTMLTranslator
|
7
|
+
from scrapling.core.mixins import SelectorsGeneration
|
8
|
+
from scrapling.core.custom_types import TextHandler, TextHandlers, AttributesHandler
|
9
|
+
from scrapling.core.storage_adaptors import SQLiteStorageSystem, StorageSystemMixin, _StorageTools
|
10
|
+
from scrapling.core.utils import setup_basic_logging, logging, clean_spaces, flatten, html_forbidden
|
11
|
+
from scrapling.core._types import Any, Dict, List, Tuple, Optional, Pattern, Union, Callable, Generator, SupportsIndex, Iterable
|
16
12
|
from lxml import etree, html
|
17
13
|
from cssselect import SelectorError, SelectorSyntaxError, parse as split_selectors
|
18
14
|
|
@@ -32,7 +28,7 @@ class Adaptor(SelectorsGeneration):
|
|
32
28
|
huge_tree: bool = True,
|
33
29
|
root: Optional[html.HtmlElement] = None,
|
34
30
|
keep_comments: Optional[bool] = False,
|
35
|
-
auto_match: Optional[bool] =
|
31
|
+
auto_match: Optional[bool] = True,
|
36
32
|
storage: Any = SQLiteStorageSystem,
|
37
33
|
storage_args: Optional[Dict] = None,
|
38
34
|
debug: Optional[bool] = True,
|
@@ -125,7 +121,7 @@ class Adaptor(SelectorsGeneration):
|
|
125
121
|
def _is_text_node(element: Union[html.HtmlElement, etree._ElementUnicodeResult]) -> bool:
|
126
122
|
"""Return True if given element is a result of a string expression
|
127
123
|
Examples:
|
128
|
-
|
124
|
+
XPath -> '/text()', '/@attribute' etc...
|
129
125
|
CSS3 -> '::text', '::attr(attrib)'...
|
130
126
|
"""
|
131
127
|
# Faster than checking `element.is_attribute or element.is_text or element.is_tail`
|
@@ -163,6 +159,8 @@ class Adaptor(SelectorsGeneration):
|
|
163
159
|
results = [self.__get_correct_result(n) for n in result]
|
164
160
|
if all(isinstance(res, self.__class__) for res in results):
|
165
161
|
return Adaptors(results)
|
162
|
+
elif all(isinstance(res, TextHandler) for res in results):
|
163
|
+
return TextHandlers(results)
|
166
164
|
return results
|
167
165
|
|
168
166
|
return self.__get_correct_result(result)
|
@@ -399,6 +397,56 @@ class Adaptor(SelectorsGeneration):
|
|
399
397
|
return self.__convert_results(score_table[highest_probability])
|
400
398
|
return []
|
401
399
|
|
400
|
+
def css_first(self, selector: str, identifier: str = '',
              auto_match: bool = False, auto_save: bool = False, percentage: int = 0
              ) -> Union['Adaptor', 'TextHandler', None]:
    """Search current tree with CSS3 selectors and return the first result if possible, otherwise return `None`

    **Important:
    It's recommended to use the identifier argument if you plan to use different selector later
    and want to relocate the same element(s)**

    :param selector: The CSS3 selector to be used.
    :param auto_match: Enabled will make function try to relocate the element if it was 'saved' before
    :param identifier: A string that will be used to save/retrieve element's data in auto-matching
        otherwise the selector will be used.
    :param auto_save: Automatically save new elements for `auto_match` later
    :param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
        Be aware that the percentage calculation depends solely on the page structure so don't play with this
        number unless you must know what you are doing!

    :return: The first match as `Adaptor`/`TextHandler`, or `None` when nothing matched.
    """
    # Delegate to `css` and take the first element of whatever it yields, if any.
    matches = self.css(selector, identifier, auto_match, auto_save, percentage)
    return next(iter(matches), None)
|
423
|
+
|
424
|
+
def xpath_first(self, selector: str, identifier: str = '',
                auto_match: bool = False, auto_save: bool = False, percentage: int = 0, **kwargs: Any
                ) -> Union['Adaptor', 'TextHandler', None]:
    """Search current tree with XPath selectors and return the first result if possible, otherwise return `None`

    **Important:
    It's recommended to use the identifier argument if you plan to use different selector later
    and want to relocate the same element(s)**

    Note: **Additional keyword arguments will be passed as XPath variables in the XPath expression!**

    :param selector: The XPath selector to be used.
    :param auto_match: Enabled will make function try to relocate the element if it was 'saved' before
    :param identifier: A string that will be used to save/retrieve element's data in auto-matching
        otherwise the selector will be used.
    :param auto_save: Automatically save new elements for `auto_match` later
    :param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
        Be aware that the percentage calculation depends solely on the page structure so don't play with this
        number unless you must know what you are doing!

    :return: The first match as `Adaptor`/`TextHandler`, or `None` when nothing matched.
    """
    # Delegate to `xpath` and take the first element of whatever it yields, if any.
    matches = self.xpath(selector, identifier, auto_match, auto_save, percentage, **kwargs)
    return next(iter(matches), None)
|
449
|
+
|
402
450
|
def css(self, selector: str, identifier: str = '',
|
403
451
|
auto_match: bool = False, auto_save: bool = False, percentage: int = 0
|
404
452
|
) -> Union['Adaptors[Adaptor]', List]:
|
@@ -495,6 +543,113 @@ class Adaptor(SelectorsGeneration):
|
|
495
543
|
except (SelectorError, SelectorSyntaxError, etree.XPathError, etree.XPathEvalError):
|
496
544
|
raise SelectorSyntaxError(f"Invalid XPath selector: {selector}")
|
497
545
|
|
546
|
+
def find_all(self, *args: Union[str, Iterable[str], Pattern, Callable, Dict[str, str]], **kwargs: str) -> Union['Adaptors[Adaptor]', List]:
    """Find elements by filters of your creations for ease..

    :param args: Tag name(s), an iterable of tag names, regex patterns, function, or a dictionary of elements' attributes. Leave empty for selecting all.
    :param kwargs: The attributes you want to filter elements based on it.
    :return: The `Adaptors` object of the elements or empty list
    """
    # Attributes that are Python reserved words and can't be used directly
    # Ex: find_all('a', class="blah") -> find_all('a', class_="blah")
    # https://www.w3schools.com/python/python_ref_keywords.asp
    whitelisted = {
        'class_': 'class',
        'for_': 'for',
    }

    if not args and not kwargs:
        raise TypeError('You have to pass something to search with, like tag name(s), tag attributes, or both.')

    # Accumulators for the different filter kinds parsed out of *args/**kwargs.
    attributes = dict()
    tags, patterns = set(), set()
    results, functions, selectors = [], [], []

    def _search_tree(element: Adaptor, filter_function: Callable) -> None:
        """Collect element if it fulfills passed function otherwise, traverse the children tree and iterate"""
        # NOTE(review): this closure appends into the enclosing `results` binding —
        # matches found anywhere in the subtree are collected, depth-first.
        if filter_function(element):
            results.append(element)

        for branch in element.children:
            _search_tree(branch, filter_function)

    # Brace yourself for a wonderful journey!
    # Dispatch each positional argument by its exact type:
    # str -> tag name, iterable -> tag names, dict -> attribute filters,
    # compiled regex -> text pattern, callable -> custom element predicate.
    for arg in args:
        if type(arg) is str:
            tags.add(arg)

        elif type(arg) in [list, tuple, set]:
            if not all(map(lambda x: type(x) is str, arg)):
                raise TypeError('Nested Iterables are not accepted, only iterables of tag names are accepted')
            tags.update(set(arg))

        elif type(arg) is dict:
            if not all([(type(k) is str and type(v) is str) for k, v in arg.items()]):
                raise TypeError('Nested dictionaries are not accepted, only string keys and string values are accepted')
            attributes.update(arg)

        elif type(arg) is re.Pattern:
            patterns.add(arg)

        elif callable(arg):
            # A predicate must accept the element it is tested against.
            if len(inspect.signature(arg).parameters) > 0:
                functions.append(arg)
            else:
                raise TypeError("Callable filter function must have at least one argument to take `Adaptor` objects.")

        else:
            raise TypeError(f'Argument with type "{type(arg)}" is not accepted, please read the docs.')

    if not all([(type(k) is str and type(v) is str) for k, v in kwargs.items()]):
        raise TypeError('Only string values are accepted for arguments')

    for attribute_name, value in kwargs.items():
        # Only replace names for kwargs, replacing them in dictionaries doesn't make sense
        attribute_name = whitelisted.get(attribute_name, attribute_name)
        attributes[attribute_name] = value

    # It's easier and faster to build a selector than traversing the tree
    # Build one CSS selector per tag (or a single tagless selector) carrying
    # all attribute filters, e.g. 'a[class="x"]'.
    tags = tags or ['']
    for tag in tags:
        selector = tag
        for key, value in attributes.items():
            value = value.replace('"', r'\"')  # Escape double quotes in user input
            # Not escaping anything with the key so the user can pass patterns like {'href*': '/p/'} or get errors :)
            selector += '[{}="{}"]'.format(key, value)
        if selector:
            selectors.append(selector)

    if selectors:
        # Selector path: let the CSS engine do the heavy lifting, then narrow
        # the matches down with regex patterns and predicate functions.
        results = self.css(', '.join(selectors))
        if results:
            # From the results, get the ones that fulfill passed regex patterns
            for pattern in patterns:
                results = results.filter(lambda e: e.text.re(pattern, check_match=True))

            # From the results, get the ones that fulfill passed functions
            for function in functions:
                results = results.filter(function)
    else:
        # No tag/attribute filters were given: fall back to regex text search
        # and/or a manual tree traversal driven by the predicate functions.
        for pattern in patterns:
            results.extend(self.find_by_regex(pattern, first_match=False))

        # NOTE(review): indentation of this loop is reconstructed from a diff view —
        # it is taken to belong to the no-selector branch (predicates in the
        # selector branch are applied via `filter` above); confirm against upstream.
        for result in (results or [self]):
            for function in functions:
                _search_tree(result, function)

    return self.__convert_results(results)
|
641
|
+
|
642
|
+
def find(self, *args: Union[str, Iterable[str], Pattern, Callable, Dict[str, str]], **kwargs: str) -> Union['Adaptor', None]:
    """Find elements by filters of your creations for ease then return the first result. Otherwise return `None`.

    :param args: Tag name(s), an iterable of tag names, regex patterns, function, or a dictionary of elements' attributes. Leave empty for selecting all.
    :param kwargs: The attributes you want to filter elements based on it.
    :return: The `Adaptor` object of the element or `None` if the result didn't match
    """
    # Reuse `find_all` and short-circuit on its first hit.
    matches = self.find_all(*args, **kwargs)
    return next(iter(matches), None)
|
652
|
+
|
498
653
|
def __calculate_similarity_score(self, original: Dict, candidate: html.HtmlElement) -> float:
|
499
654
|
"""Used internally to calculate a score that shows how candidate element similar to the original one
|
500
655
|
|
@@ -606,25 +761,33 @@ class Adaptor(SelectorsGeneration):
|
|
606
761
|
# Operations on text functions
|
607
762
|
def json(self) -> Dict:
|
608
763
|
"""Return json response if the response is jsonable otherwise throws error"""
|
609
|
-
|
764
|
+
if self.text:
|
765
|
+
return self.text.json()
|
766
|
+
else:
|
767
|
+
return self.get_all_text(strip=True).json()
|
610
768
|
|
611
|
-
def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True
|
769
|
+
def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True,
|
770
|
+
clean_match: bool = False, case_sensitive: bool = False) -> 'List[str]':
|
612
771
|
"""Apply the given regex to the current text and return a list of strings with the matches.
|
613
772
|
|
614
773
|
:param regex: Can be either a compiled regular expression or a string.
|
615
774
|
:param replace_entities: if enabled character entity references are replaced by their corresponding character
|
775
|
+
:param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
|
776
|
+
:param case_sensitive: if enabled, function will set the regex to ignore letters case while compiling it
|
616
777
|
"""
|
617
|
-
return self.text.re(regex, replace_entities)
|
778
|
+
return self.text.re(regex, replace_entities, clean_match, case_sensitive)
|
618
779
|
|
619
|
-
def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True
|
780
|
+
def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True,
|
781
|
+
clean_match: bool = False, case_sensitive: bool = False) -> Union[str, None]:
|
620
782
|
"""Apply the given regex to text and return the first match if found, otherwise return the default value.
|
621
783
|
|
622
784
|
:param regex: Can be either a compiled regular expression or a string.
|
623
785
|
:param default: The default value to be returned if there is no match
|
624
786
|
:param replace_entities: if enabled character entity references are replaced by their corresponding character
|
625
|
-
|
787
|
+
:param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
|
788
|
+
:param case_sensitive: if enabled, function will set the regex to ignore letters case while compiling it
|
626
789
|
"""
|
627
|
-
return self.text.re_first(regex, default, replace_entities)
|
790
|
+
return self.text.re_first(regex, default, replace_entities, clean_match, case_sensitive)
|
628
791
|
|
629
792
|
def find_similar(
|
630
793
|
self,
|
@@ -757,10 +920,10 @@ class Adaptor(SelectorsGeneration):
|
|
757
920
|
return self.__convert_results(results)
|
758
921
|
|
759
922
|
def find_by_regex(
|
760
|
-
self, query: str, first_match: bool = True, case_sensitive: bool = False, clean_match: bool = True
|
923
|
+
self, query: Union[str, Pattern[str]], first_match: bool = True, case_sensitive: bool = False, clean_match: bool = True
|
761
924
|
) -> Union['Adaptors[Adaptor]', 'Adaptor', List]:
|
762
925
|
"""Find elements that its text content matches the input regex pattern.
|
763
|
-
:param query: Regex query to match
|
926
|
+
:param query: Regex query/pattern to match
|
764
927
|
:param first_match: Return first element that matches conditions, enabled by default
|
765
928
|
:param case_sensitive: if enabled, letters case will be taken into consideration in the regex
|
766
929
|
:param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
|
@@ -855,54 +1018,56 @@ class Adaptors(List[Adaptor]):
|
|
855
1018
|
]
|
856
1019
|
return self.__class__(flatten(results))
|
857
1020
|
|
858
|
-
def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True
|
1021
|
+
def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True,
|
1022
|
+
clean_match: bool = False, case_sensitive: bool = False) -> 'List[str]':
|
859
1023
|
"""Call the ``.re()`` method for each element in this list and return
|
860
1024
|
their results flattened as List of TextHandler.
|
861
1025
|
|
862
1026
|
:param regex: Can be either a compiled regular expression or a string.
|
863
1027
|
:param replace_entities: if enabled character entity references are replaced by their corresponding character
|
1028
|
+
:param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
|
1029
|
+
:param case_sensitive: if enabled, function will set the regex to ignore letters case while compiling it
|
864
1030
|
"""
|
865
1031
|
results = [
|
866
|
-
n.text.re(regex, replace_entities) for n in self
|
1032
|
+
n.text.re(regex, replace_entities, clean_match, case_sensitive) for n in self
|
867
1033
|
]
|
868
1034
|
return flatten(results)
|
869
1035
|
|
870
|
-
def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True
|
1036
|
+
def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True,
|
1037
|
+
clean_match: bool = False, case_sensitive: bool = False) -> Union[str, None]:
|
871
1038
|
"""Call the ``.re_first()`` method for each element in this list and return
|
872
|
-
|
1039
|
+
the first result or the default value otherwise.
|
873
1040
|
|
874
1041
|
:param regex: Can be either a compiled regular expression or a string.
|
875
1042
|
:param default: The default value to be returned if there is no match
|
876
1043
|
:param replace_entities: if enabled character entity references are replaced by their corresponding character
|
1044
|
+
:param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
|
1045
|
+
:param case_sensitive: if enabled, function will set the regex to ignore letters case while compiling it
|
1046
|
+
"""
|
1047
|
+
for n in self:
|
1048
|
+
for result in n.re(regex, replace_entities, clean_match, case_sensitive):
|
1049
|
+
return result
|
1050
|
+
return default
|
1051
|
+
|
1052
|
+
def search(self, func: Callable[['Adaptor'], bool]) -> Union['Adaptor', None]:
|
1053
|
+
"""Loop over all current elements and return the first element that matches the passed function
|
1054
|
+
:param func: A function that takes each element as an argument and returns True/False
|
1055
|
+
:return: The first element that match the function or ``None`` otherwise.
|
1056
|
+
"""
|
1057
|
+
for element in self:
|
1058
|
+
if func(element):
|
1059
|
+
return element
|
1060
|
+
return None
|
877
1061
|
|
1062
|
+
def filter(self, func: Callable[['Adaptor'], bool]) -> Union['Adaptors', List]:
|
1063
|
+
"""Filter current elements based on the passed function
|
1064
|
+
:param func: A function that takes each element as an argument and returns True/False
|
1065
|
+
:return: The new `Adaptors` object or empty list otherwise.
|
878
1066
|
"""
|
879
1067
|
results = [
|
880
|
-
|
1068
|
+
element for element in self if func(element)
|
881
1069
|
]
|
882
|
-
return
|
883
|
-
|
884
|
-
# def __getattr__(self, name):
|
885
|
-
# if name in dir(self.__class__):
|
886
|
-
# return super().__getattribute__(name)
|
887
|
-
#
|
888
|
-
# # Execute the method itself on each Adaptor
|
889
|
-
# results = []
|
890
|
-
# for item in self:
|
891
|
-
# results.append(getattr(item, name))
|
892
|
-
#
|
893
|
-
# if all(callable(r) for r in results):
|
894
|
-
# def call_all(*args, **kwargs):
|
895
|
-
# final_results = [r(*args, **kwargs) for r in results]
|
896
|
-
# if all([isinstance(r, (Adaptor, Adaptors,)) for r in results]):
|
897
|
-
# return self.__class__(final_results)
|
898
|
-
# return final_results
|
899
|
-
#
|
900
|
-
# return call_all
|
901
|
-
# else:
|
902
|
-
# # Flatten the result if it's a single-item list containing a list
|
903
|
-
# if len(self) == 1 and isinstance(results[0], list):
|
904
|
-
# return self.__class__(results[0])
|
905
|
-
# return self.__class__(results)
|
1070
|
+
return self.__class__(results) if results else results
|
906
1071
|
|
907
1072
|
def get(self, default=None):
|
908
1073
|
"""Returns the first item of the current list
|