scrapling 0.1.2__py3-none-any.whl → 0.2__py3-none-any.whl
- scrapling/__init__.py +4 -3
- scrapling/core/__init__.py +0 -0
- scrapling/core/_types.py +25 -0
- scrapling/{custom_types.py → core/custom_types.py} +48 -3
- scrapling/{mixins.py → core/mixins.py} +22 -7
- scrapling/{storage_adaptors.py → core/storage_adaptors.py} +2 -2
- scrapling/{translator.py → core/translator.py} +2 -12
- scrapling/{utils.py → core/utils.py} +2 -61
- scrapling/engines/__init__.py +7 -0
- scrapling/engines/camo.py +121 -0
- scrapling/engines/constants.py +108 -0
- scrapling/engines/pw.py +232 -0
- scrapling/engines/static.py +112 -0
- scrapling/engines/toolbelt/__init__.py +18 -0
- scrapling/engines/toolbelt/custom.py +168 -0
- scrapling/engines/toolbelt/fingerprints.py +81 -0
- scrapling/engines/toolbelt/navigation.py +74 -0
- scrapling/fetchers.py +190 -0
- scrapling/parser.py +216 -51
- scrapling-0.2.dist-info/METADATA +807 -0
- scrapling-0.2.dist-info/RECORD +32 -0
- {scrapling-0.1.2.dist-info → scrapling-0.2.dist-info}/WHEEL +1 -1
- {scrapling-0.1.2.dist-info → scrapling-0.2.dist-info}/top_level.txt +1 -0
- tests/__init__.py +1 -0
- tests/fetchers/__init__.py +1 -0
- tests/fetchers/test_camoufox.py +62 -0
- tests/fetchers/test_httpx.py +67 -0
- tests/fetchers/test_playwright.py +74 -0
- tests/parser/__init__.py +0 -0
- tests/parser/test_automatch.py +56 -0
- tests/parser/test_general.py +286 -0
- scrapling-0.1.2.dist-info/METADATA +0 -477
- scrapling-0.1.2.dist-info/RECORD +0 -12
- {scrapling-0.1.2.dist-info → scrapling-0.2.dist-info}/LICENSE +0 -0
scrapling/engines/pw.py
ADDED
@@ -0,0 +1,232 @@
+import json
+import logging
+from scrapling.core._types import Union, Callable, Optional, List, Dict
+
+from scrapling.engines.constants import DEFAULT_STEALTH_FLAGS, NSTBROWSER_DEFAULT_QUERY
+from scrapling.engines.toolbelt import (
+    Response,
+    do_nothing,
+    js_bypass_path,
+    intercept_route,
+    generate_headers,
+    check_type_validity,
+    construct_cdp_url,
+    generate_convincing_referer,
+)
+
+
+class PlaywrightEngine:
+    def __init__(
+            self, headless: Union[bool, str] = True,
+            disable_resources: bool = False,
+            useragent: Optional[str] = None,
+            network_idle: Optional[bool] = False,
+            timeout: Optional[float] = 30000,
+            page_action: Callable = do_nothing,
+            wait_selector: Optional[str] = None,
+            wait_selector_state: Optional[str] = 'attached',
+            stealth: bool = False,
+            hide_canvas: bool = True,
+            disable_webgl: bool = False,
+            cdp_url: Optional[str] = None,
+            nstbrowser_mode: bool = False,
+            nstbrowser_config: Optional[Dict] = None,
+            google_search: Optional[bool] = True,
+            extra_headers: Optional[Dict[str, str]] = None,
+            adaptor_arguments: Dict = None
+    ):
+        """An engine that utilizes PlayWright library, check the `PlayWrightFetcher` class for more documentation.
+
+        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
+        :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends but it made requests ~25% faster in my tests for some websites.
+            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
+            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
+        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
+        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
+        :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
+        :param wait_selector: Wait for a specific css selector to be in a specific state.
+        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`.
+        :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
+        :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
+        :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
+        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
+        :param nstbrowser_mode: Enables NSTBrowser mode, it have to be used with `cdp_url` argument or it will get completely ignored.
+        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
+        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
+        :param nstbrowser_config: The config you want to send with requests to the NSTBrowser. If left empty, Scrapling defaults to an optimized NSTBrowser's docker browserless config.
+        :param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
+        """
+        self.headless = headless
+        self.disable_resources = disable_resources
+        self.network_idle = bool(network_idle)
+        self.stealth = bool(stealth)
+        self.hide_canvas = bool(hide_canvas)
+        self.disable_webgl = bool(disable_webgl)
+        self.google_search = bool(google_search)
+        self.extra_headers = extra_headers or {}
+        self.cdp_url = cdp_url
+        self.useragent = useragent
+        self.timeout = check_type_validity(timeout, [int, float], 30000)
+        if callable(page_action):
+            self.page_action = page_action
+        else:
+            self.page_action = do_nothing
+            logging.error('[Ignored] Argument "page_action" must be callable')
+
+        self.wait_selector = wait_selector
+        self.wait_selector_state = wait_selector_state
+        self.nstbrowser_mode = bool(nstbrowser_mode)
+        self.nstbrowser_config = nstbrowser_config
+        self.adaptor_arguments = adaptor_arguments if adaptor_arguments else {}
+
+    def _cdp_url_logic(self, flags: Optional[List] = None) -> str:
+        """Constructs new CDP URL if NSTBrowser is enabled otherwise return CDP URL as it is
+
+        :param flags: Chrome flags to be added to NSTBrowser query
+        :return: CDP URL
+        """
+        cdp_url = self.cdp_url
+        if self.nstbrowser_mode:
+            if self.nstbrowser_config and type(self.nstbrowser_config) is Dict:
+                config = self.nstbrowser_config
+            else:
+                query = NSTBROWSER_DEFAULT_QUERY.copy()
+                if flags:
+                    query.update({
+                        "args": dict(zip(flags, [''] * len(flags))),  # browser args should be a dictionary
+                    })
+
+                config = {
+                    'config': json.dumps(query),
+                    # 'token': ''
+                }
+            cdp_url = construct_cdp_url(cdp_url, config)
+        else:
+            # To validate it
+            cdp_url = construct_cdp_url(cdp_url)
+
+        return cdp_url
+
+    def fetch(self, url: str) -> Response:
+        """Opens up the browser and do your request based on your chosen options.
+
+        :param url: Target url.
+        :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
+        """
+        if not self.stealth:
+            from playwright.sync_api import sync_playwright
+        else:
+            from rebrowser_playwright.sync_api import sync_playwright
+
+        with sync_playwright() as p:
+            # Handle the UserAgent early
+            if self.useragent:
+                extra_headers = {}
+                useragent = self.useragent
+            else:
+                extra_headers = generate_headers(browser_mode=True)
+                useragent = extra_headers.get('User-Agent')
+
+            # Prepare the flags before diving
+            flags = DEFAULT_STEALTH_FLAGS
+            if self.hide_canvas:
+                flags += ['--fingerprinting-canvas-image-data-noise']
+            if self.disable_webgl:
+                flags += ['--disable-webgl', '--disable-webgl-image-chromium', '--disable-webgl2']
+
+            # Creating the browser
+            if self.cdp_url:
+                cdp_url = self._cdp_url_logic(flags if self.stealth else None)
+                browser = p.chromium.connect_over_cdp(endpoint_url=cdp_url)
+            else:
+                if self.stealth:
+                    browser = p.chromium.launch(headless=self.headless, args=flags, ignore_default_args=['--enable-automation'], chromium_sandbox=True)
+                else:
+                    browser = p.chromium.launch(headless=self.headless, ignore_default_args=['--enable-automation'])
+
+            # Creating the context
+            if self.stealth:
+                context = browser.new_context(
+                    locale='en-US',
+                    is_mobile=False,
+                    has_touch=False,
+                    color_scheme='dark',  # Bypasses the 'prefersLightColor' check in creepjs
+                    user_agent=useragent,
+                    device_scale_factor=2,
+                    # I'm thinking about disabling it to rest from all Service Workers headache but let's keep it as it is for now
+                    service_workers="allow",
+                    ignore_https_errors=True,
+                    extra_http_headers=extra_headers,
+                    screen={"width": 1920, "height": 1080},
+                    viewport={"width": 1920, "height": 1080},
+                    permissions=["geolocation", 'notifications'],
+                )
+            else:
+                context = browser.new_context(
+                    color_scheme='dark',
+                    user_agent=useragent,
+                    device_scale_factor=2,
+                    extra_http_headers=extra_headers
+                )
+
+            # Finally we are in business
+            page = context.new_page()
+            page.set_default_navigation_timeout(self.timeout)
+            page.set_default_timeout(self.timeout)
+
+            if self.extra_headers:
+                page.set_extra_http_headers(self.extra_headers)
+
+            if self.disable_resources:
+                page.route("**/*", intercept_route)
+
+            if self.stealth:
+                # Basic bypasses nothing fancy as I'm still working on it
+                # But with adding these bypasses to the above config, it bypasses many online tests like
+                # https://bot.sannysoft.com/
+                # https://kaliiiiiiiiii.github.io/brotector/
+                # https://pixelscan.net/
+                # https://iphey.com/
+                # https://www.browserscan.net/bot-detection <== this one also checks for the CDP runtime fingerprint
+                # https://arh.antoinevastel.com/bots/areyouheadless/
+                # https://prescience-data.github.io/execution-monitor.html
+                page.add_init_script(path=js_bypass_path('webdriver_fully.js'))
+                page.add_init_script(path=js_bypass_path('window_chrome.js'))
+                page.add_init_script(path=js_bypass_path('navigator_plugins.js'))
+                page.add_init_script(path=js_bypass_path('pdf_viewer.js'))
+                page.add_init_script(path=js_bypass_path('notification_permission.js'))
+                page.add_init_script(path=js_bypass_path('screen_props.js'))
+                page.add_init_script(path=js_bypass_path('playwright_fingerprint.js'))
+
+            res = page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
+            page.wait_for_load_state(state="domcontentloaded")
+            if self.network_idle:
+                page.wait_for_load_state('networkidle')
+
+            page = self.page_action(page)
+
+            if self.wait_selector and type(self.wait_selector) is str:
+                waiter = page.locator(self.wait_selector)
+                waiter.wait_for(state=self.wait_selector_state)
+
+            content_type = res.headers.get('content-type', '')
+            # Parse charset from content-type
+            encoding = 'utf-8'  # default encoding
+            if 'charset=' in content_type.lower():
+                encoding = content_type.lower().split('charset=')[-1].split(';')[0].strip()
+
+            response = Response(
+                url=res.url,
+                text=page.content(),
+                content=res.body(),
+                status=res.status,
+                reason=res.status_text,
+                encoding=encoding,
+                cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
+                headers=res.all_headers(),
+                request_headers=res.request.all_headers(),
+                adaptor_arguments=self.adaptor_arguments
+            )
+            page.close()
+            return response
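
Note: the engine above is normally driven through the `PlayWrightFetcher` class added in `scrapling/fetchers.py`, but it can also be exercised directly. A minimal sketch, assuming Playwright (plus rebrowser-playwright for stealth mode) and a Chromium build are installed; the URL is a placeholder, not taken from this diff:

    from scrapling.engines.pw import PlaywrightEngine

    engine = PlaywrightEngine(
        headless=True,      # hidden browser window (default)
        stealth=True,       # uses rebrowser_playwright and injects the bypass scripts listed above
        network_idle=True,  # also wait for the 'networkidle' load state
        timeout=30000,      # milliseconds, per the docstring
    )
    response = engine.fetch('https://example.com')  # placeholder URL
    print(response.status, response.reason)
    page = response.adaptor  # Adaptor built from the rendered page HTML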
scrapling/engines/static.py
ADDED
@@ -0,0 +1,112 @@
+import logging
+
+from scrapling.core._types import Union, Optional, Dict
+from .toolbelt import Response, generate_convincing_referer, generate_headers
+
+import httpx
+from httpx._models import Response as httpxResponse
+
+
+class StaticEngine:
+    def __init__(self, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = None, adaptor_arguments: Dict = None):
+        """An engine that utilizes httpx library, check the `Fetcher` class for more documentation.
+
+        :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
+        :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
+        :param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
+        """
+        self.timeout = timeout
+        self.follow_redirects = bool(follow_redirects)
+        self._extra_headers = generate_headers(browser_mode=False)
+        self.adaptor_arguments = adaptor_arguments if adaptor_arguments else {}
+
+    @staticmethod
+    def _headers_job(headers: Optional[Dict], url: str, stealth: bool) -> Dict:
+        """Adds useragent to headers if it doesn't exist, generates real headers and append it to current headers, and
+        finally generates a referer header that looks like if this request came from Google's search of the current URL's domain.
+
+        :param headers: Current headers in the request if the user passed any
+        :param url: The Target URL.
+        :param stealth: Whether stealth mode is enabled or not.
+        :return: A dictionary of the new headers.
+        """
+        headers = headers or {}
+
+        # Validate headers
+        if not headers.get('user-agent') and not headers.get('User-Agent'):
+            headers['User-Agent'] = generate_headers(browser_mode=False).get('User-Agent')
+            logging.info(f"Can't find useragent in headers so '{headers['User-Agent']}' was used.")
+
+        if stealth:
+            extra_headers = generate_headers(browser_mode=False)
+            headers.update(extra_headers)
+            headers.update({'referer': generate_convincing_referer(url)})
+
+        return headers
+
+    def _prepare_response(self, response: httpxResponse) -> Response:
+        """Takes httpx response and generates `Response` object from it.
+
+        :param response: httpx response object
+        :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
+        """
+        return Response(
+            url=str(response.url),
+            text=response.text,
+            content=response.content,
+            status=response.status_code,
+            reason=response.reason_phrase,
+            encoding=response.encoding or 'utf-8',
+            cookies=dict(response.cookies),
+            headers=dict(response.headers),
+            request_headers=dict(response.request.headers),
+            adaptor_arguments=self.adaptor_arguments
+        )
+
+    def get(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+        """Make basic HTTP GET request for you but with some added flavors.
+        :param url: Target url.
+        :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
+            create a referer header as if this request had came from Google's search of this URL's domain.
+        :param kwargs: Any additional keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
+        :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
+        """
+        headers = self._headers_job(kwargs.get('headers'), url, stealthy_headers)
+        request = httpx.get(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+        return self._prepare_response(request)
+
+    def post(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+        """Make basic HTTP POST request for you but with some added flavors.
+        :param url: Target url.
+        :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
+            create a referer header as if this request had came from Google's search of this URL's domain.
+        :param kwargs: Any additional keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
+        :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
+        """
+        headers = self._headers_job(kwargs.get('headers'), url, stealthy_headers)
+        request = httpx.post(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+        return self._prepare_response(request)
+
+    def delete(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+        """Make basic HTTP DELETE request for you but with some added flavors.
+        :param url: Target url.
+        :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
+            create a referer header as if this request had came from Google's search of this URL's domain.
+        :param kwargs: Any additional keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
+        :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
+        """
+        headers = self._headers_job(kwargs.get('headers'), url, stealthy_headers)
+        request = httpx.delete(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+        return self._prepare_response(request)
+
+    def put(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+        """Make basic HTTP PUT request for you but with some added flavors.
+        :param url: Target url.
+        :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
+            create a referer header as if this request had came from Google's search of this URL's domain.
+        :param kwargs: Any additional keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
+        :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
+        """
+        headers = self._headers_job(kwargs.get('headers'), url, stealthy_headers)
+        request = httpx.put(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+        return self._prepare_response(request)
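
Note: `StaticEngine` wraps plain httpx calls in the same unified `Response` container, so all four verbs behave alike. A minimal usage sketch, assuming httpx is installed; the URL is a placeholder:

    from scrapling.engines.static import StaticEngine

    engine = StaticEngine(follow_redirects=True, timeout=10)
    response = engine.get('https://example.com', stealthy_headers=True)  # adds browser-like headers and a Google referer
    print(response.status, response.reason, response.encoding)
    page = response.adaptor  # same Response-to-Adaptor path as the browser engines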
scrapling/engines/toolbelt/__init__.py
ADDED
@@ -0,0 +1,18 @@
+from .fingerprints import (
+    get_os_name,
+    generate_headers,
+    generate_convincing_referer,
+)
+from .custom import (
+    Response,
+    do_nothing,
+    BaseFetcher,
+    get_variable_name,
+    check_type_validity,
+    check_if_engine_usable,
+)
+from .navigation import (
+    js_bypass_path,
+    intercept_route,
+    construct_cdp_url,
+)
scrapling/engines/toolbelt/custom.py
ADDED
@@ -0,0 +1,168 @@
+"""
+Functions related to custom types or type checking
+"""
+import inspect
+import logging
+from dataclasses import dataclass, field
+
+from scrapling.core.utils import setup_basic_logging
+from scrapling.parser import Adaptor, SQLiteStorageSystem
+from scrapling.core._types import Any, List, Type, Union, Optional, Dict, Callable
+
+
+@dataclass(frozen=True)
+class Response:
+    """This class is returned by all engines as a way to unify response type between different libraries."""
+    url: str
+    text: str
+    content: bytes
+    status: int
+    reason: str
+    encoding: str = 'utf-8'  # default encoding
+    cookies: Dict = field(default_factory=dict)
+    headers: Dict = field(default_factory=dict)
+    request_headers: Dict = field(default_factory=dict)
+    adaptor_arguments: Dict = field(default_factory=dict)
+
+    @property
+    def adaptor(self) -> Union[Adaptor, None]:
+        """Generate Adaptor instance from this response if possible, otherwise return None"""
+        automatch_domain = self.adaptor_arguments.pop('automatch_domain', None)
+        if self.text:
+            # For playwright that will be the response after all JS executed
+            return Adaptor(text=self.text, url=automatch_domain or self.url, encoding=self.encoding, **self.adaptor_arguments)
+        elif self.content:
+            # For playwright, that's after all JS is loaded but not all of them executed, because playwright doesn't offer something like page.content()
+            # To get response Bytes after the load states
+            # Reference: https://playwright.dev/python/docs/api/class-page
+            return Adaptor(body=self.content, url=automatch_domain or self.url, encoding=self.encoding, **self.adaptor_arguments)
+        return None
+
+    def __repr__(self):
+        return f'<{self.__class__.__name__} [{self.status} {self.reason}]>'
+
+
+class BaseFetcher:
+    def __init__(
+            self, huge_tree: bool = True, keep_comments: Optional[bool] = False, auto_match: Optional[bool] = True,
+            storage: Any = SQLiteStorageSystem, storage_args: Optional[Dict] = None, debug: Optional[bool] = True,
+            automatch_domain: Optional[str] = None,
+    ):
+        """Arguments below are the same from the Adaptor class so you can pass them directly, the rest of Adaptor's arguments
+        are detected and passed automatically from the Fetcher based on the response for accessibility.
+
+        :param huge_tree: Enabled by default, should always be enabled when parsing large HTML documents. This controls
+            libxml2 feature that forbids parsing certain large documents to protect from possible memory exhaustion.
+        :param keep_comments: While parsing the HTML body, drop comments or not. Disabled by default for obvious reasons
+        :param auto_match: Globally turn-off the auto-match feature in all functions, this argument takes higher
+            priority over all auto-match related arguments/functions in the class.
+        :param storage: The storage class to be passed for auto-matching functionalities, see ``Docs`` for more info.
+        :param storage_args: A dictionary of ``argument->value`` pairs to be passed for the storage class.
+            If empty, default values will be used.
+        :param automatch_domain: For cases where you want to automatch selectors across different websites as if they were on the same website, use this argument to unify them.
+            Otherwise, the domain of the request is used by default.
+        :param debug: Enable debug mode
+        """
+        # Adaptor class parameters
+        # I won't validate Adaptor's class parameters here again, I will leave it to be validated later
+        self.adaptor_arguments = dict(
+            huge_tree=huge_tree,
+            keep_comments=keep_comments,
+            auto_match=auto_match,
+            storage=storage,
+            storage_args=storage_args,
+            debug=debug,
+        )
+        # If the user used fetchers first, then configure the logger from here instead of the `Adaptor` class
+        setup_basic_logging(level='debug' if debug else 'info')
+        if automatch_domain:
+            if type(automatch_domain) is not str:
+                logging.warning('[Ignored] The argument "automatch_domain" must be of string type')
+            else:
+                self.adaptor_arguments.update({'automatch_domain': automatch_domain})
+
+
+def check_if_engine_usable(engine: Callable) -> Union[Callable, None]:
+    """This function check if the passed engine can be used by a Fetcher-type class or not.
+
+    :param engine: The engine class itself
+    :return: The engine class again if all checks out, otherwise raises error
+    :raise TypeError: If engine class don't have fetch method, If engine class have fetch attribute not method, or If engine class have fetch function but it doesn't take arguments
+    """
+    # if isinstance(engine, type):
+    #     raise TypeError("Expected an engine instance, not a class definition of the engine")
+
+    if hasattr(engine, 'fetch'):
+        fetch_function = getattr(engine, "fetch")
+        if callable(fetch_function):
+            if len(inspect.signature(fetch_function).parameters) > 0:
+                return engine
+            else:
+                # raise TypeError("Engine class instance must have a callable method 'fetch' with the first argument used for the url.")
+                raise TypeError("Engine class must have a callable method 'fetch' with the first argument used for the url.")
+        else:
+            # raise TypeError("Invalid engine instance! Engine class must have a callable method 'fetch'")
+            raise TypeError("Invalid engine class! Engine class must have a callable method 'fetch'")
+    else:
+        # raise TypeError("Invalid engine instance! Engine class must have the method 'fetch'")
+        raise TypeError("Invalid engine class! Engine class must have the method 'fetch'")
+
+
+def get_variable_name(var: Any) -> Optional[str]:
+    """Get the name of a variable using global and local scopes.
+    :param var: The variable to find the name for
+    :return: The name of the variable if found, None otherwise
+    """
+    for scope in [globals(), locals()]:
+        for name, value in scope.items():
+            if value is var:
+                return name
+    return None
+
+
+def check_type_validity(variable: Any, valid_types: Union[List[Type], None], default_value: Any = None, critical: bool = False, param_name: Optional[str] = None) -> Any:
+    """Check if a variable matches the specified type constraints.
+    :param variable: The variable to check
+    :param valid_types: List of valid types for the variable
+    :param default_value: Value to return if type check fails
+    :param critical: If True, raises TypeError instead of logging error
+    :param param_name: Optional parameter name for error messages
+    :return: The original variable if valid, default_value if invalid
+    :raise TypeError: If critical=True and type check fails
+    """
+    # Use provided param_name or try to get it automatically
+    var_name = param_name or get_variable_name(variable) or "Unknown"
+
+    # Convert valid_types to a list if None
+    valid_types = valid_types or []
+
+    # Handle None value
+    if variable is None:
+        if type(None) in valid_types:
+            return variable
+        error_msg = f'Argument "{var_name}" cannot be None'
+        if critical:
+            raise TypeError(error_msg)
+        logging.error(f'[Ignored] {error_msg}')
+        return default_value
+
+    # If no valid_types specified and variable has a value, return it
+    if not valid_types:
+        return variable
+
+    # Check if variable type matches any of the valid types
+    if not any(isinstance(variable, t) for t in valid_types):
+        type_names = [t.__name__ for t in valid_types]
+        error_msg = f'Argument "{var_name}" must be of type {" or ".join(type_names)}'
+        if critical:
+            raise TypeError(error_msg)
+        logging.error(f'[Ignored] {error_msg}')
+        return default_value
+
+    return variable
+
+
+# Pew Pew
+def do_nothing(page):
+    # Just works as a filler for `page_action` argument in browser engines
+    return page
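
Note: the `Response` dataclass and `check_type_validity` helper above are what the engines lean on. A small sketch with hypothetical values, only to show the unified response object and the fallback behaviour of the type check:

    from scrapling.engines.toolbelt.custom import Response, check_type_validity

    html = '<html><body><h1>Hi</h1></body></html>'
    res = Response(url='https://example.com', text=html, content=html.encode(), status=200, reason='OK')
    print(res)          # <Response [200 OK]>
    page = res.adaptor  # builds an Adaptor from `text` since it is non-empty

    timeout = check_type_validity('thirty', [int, float], default_value=30000, param_name='timeout')
    # logs '[Ignored] Argument "timeout" must be of type int or float' and returns 30000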
scrapling/engines/toolbelt/fingerprints.py
ADDED
@@ -0,0 +1,81 @@
+"""
+Functions related to generating headers and fingerprints generally
+"""
+
+import platform
+
+from scrapling.core.utils import cache
+from scrapling.core._types import Union, Dict
+
+from tldextract import extract
+from browserforge.headers import HeaderGenerator, Browser
+from browserforge.fingerprints import FingerprintGenerator, Fingerprint
+
+
+@cache(None, typed=True)
+def generate_convincing_referer(url: str) -> str:
+    """Takes the domain from the URL without the subdomain/suffix and make it look like you were searching google for this website
+
+    >>> generate_convincing_referer('https://www.somewebsite.com/blah')
+    'https://www.google.com/search?q=somewebsite'
+
+    :param url: The URL you are about to fetch.
+    :return: Google's search URL of the domain name
+    """
+    website_name = extract(url).domain
+    return f'https://www.google.com/search?q={website_name}'
+
+
+@cache(None, typed=True)
+def get_os_name() -> Union[str, None]:
+    """Get the current OS name in the same format needed for browserforge
+
+    :return: Current OS name or `None` otherwise
+    """
+    #
+    os_name = platform.system()
+    return {
+        'Linux': 'linux',
+        'Darwin': 'macos',
+        'Windows': 'windows',
+        # For the future? because why not
+        'iOS': 'ios',
+    }.get(os_name)
+
+
+def generate_suitable_fingerprint() -> Fingerprint:
+    """Generates a browserforge's fingerprint that matches current OS, desktop device, and Chrome with version 128 at least.
+
+    This function was originally created to test Browserforge's injector.
+    :return: `Fingerprint` object
+    """
+    return FingerprintGenerator(
+        browser=[Browser(name='chrome', min_version=128)],
+        os=get_os_name(),  # None is ignored
+        device='desktop'
+    ).generate()
+
+
+def generate_headers(browser_mode: bool = False) -> Dict:
+    """Generate real browser-like headers using browserforge's generator
+
+    :param browser_mode: If enabled, the headers created are used for playwright so it have to match everything
+    :return: A dictionary of the generated headers
+    """
+    if browser_mode:
+        # In this mode we don't care about anything other than matching the OS and the browser type with the browser we are using
+        # So we don't raise any inconsistency red flags while websites fingerprinting us
+        os_name = get_os_name()
+        return HeaderGenerator(
+            browser=[Browser(name='chrome', min_version=128)],
+            os=os_name,  # None is ignored
+            device='desktop'
+        ).generate()
+    else:
+        # Here it's used for normal requests that aren't done through browsers so we can take it lightly
+        browsers = [
+            Browser(name='chrome', min_version=120),
+            Browser(name='firefox', min_version=120),
+            Browser(name='edge', min_version=120),
+        ]
+        return HeaderGenerator(browser=browsers, device='desktop').generate()
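
Note: the referer helper is deterministic and cached, so the doctest in the hunk can be reproduced directly, while `generate_headers` draws from browserforge's header model. A minimal sketch, assuming tldextract and browserforge are installed:

    from scrapling.engines.toolbelt.fingerprints import generate_convincing_referer, generate_headers

    print(generate_convincing_referer('https://www.somewebsite.com/blah'))
    # 'https://www.google.com/search?q=somewebsite'

    headers = generate_headers(browser_mode=False)  # non-browser mode: Chrome/Firefox/Edge >= 120
    print(headers.get('User-Agent'))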