scrapling 0.1.2__py3-none-any.whl → 0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +4 -3
- scrapling/core/__init__.py +0 -0
- scrapling/core/_types.py +25 -0
- scrapling/{custom_types.py → core/custom_types.py} +48 -3
- scrapling/{mixins.py → core/mixins.py} +22 -7
- scrapling/{storage_adaptors.py → core/storage_adaptors.py} +2 -2
- scrapling/{translator.py → core/translator.py} +2 -12
- scrapling/{utils.py → core/utils.py} +2 -61
- scrapling/engines/__init__.py +7 -0
- scrapling/engines/camo.py +121 -0
- scrapling/engines/constants.py +108 -0
- scrapling/engines/pw.py +232 -0
- scrapling/engines/static.py +112 -0
- scrapling/engines/toolbelt/__init__.py +18 -0
- scrapling/engines/toolbelt/custom.py +168 -0
- scrapling/engines/toolbelt/fingerprints.py +81 -0
- scrapling/engines/toolbelt/navigation.py +74 -0
- scrapling/fetchers.py +190 -0
- scrapling/parser.py +216 -51
- scrapling-0.2.dist-info/METADATA +807 -0
- scrapling-0.2.dist-info/RECORD +32 -0
- {scrapling-0.1.2.dist-info → scrapling-0.2.dist-info}/WHEEL +1 -1
- {scrapling-0.1.2.dist-info → scrapling-0.2.dist-info}/top_level.txt +1 -0
- tests/__init__.py +1 -0
- tests/fetchers/__init__.py +1 -0
- tests/fetchers/test_camoufox.py +62 -0
- tests/fetchers/test_httpx.py +67 -0
- tests/fetchers/test_playwright.py +74 -0
- tests/parser/__init__.py +0 -0
- tests/parser/test_automatch.py +56 -0
- tests/parser/test_general.py +286 -0
- scrapling-0.1.2.dist-info/METADATA +0 -477
- scrapling-0.1.2.dist-info/RECORD +0 -12
- {scrapling-0.1.2.dist-info → scrapling-0.2.dist-info}/LICENSE +0 -0
scrapling/engines/pw.py
ADDED
@@ -0,0 +1,232 @@
```python
import json
import logging
from scrapling.core._types import Union, Callable, Optional, List, Dict

from scrapling.engines.constants import DEFAULT_STEALTH_FLAGS, NSTBROWSER_DEFAULT_QUERY
from scrapling.engines.toolbelt import (
    Response,
    do_nothing,
    js_bypass_path,
    intercept_route,
    generate_headers,
    check_type_validity,
    construct_cdp_url,
    generate_convincing_referer,
)


class PlaywrightEngine:
    def __init__(
            self, headless: Union[bool, str] = True,
            disable_resources: bool = False,
            useragent: Optional[str] = None,
            network_idle: Optional[bool] = False,
            timeout: Optional[float] = 30000,
            page_action: Callable = do_nothing,
            wait_selector: Optional[str] = None,
            wait_selector_state: Optional[str] = 'attached',
            stealth: bool = False,
            hide_canvas: bool = True,
            disable_webgl: bool = False,
            cdp_url: Optional[str] = None,
            nstbrowser_mode: bool = False,
            nstbrowser_config: Optional[Dict] = None,
            google_search: Optional[bool] = True,
            extra_headers: Optional[Dict[str, str]] = None,
            adaptor_arguments: Dict = None
    ):
        """An engine that utilizes the Playwright library; check the `PlayWrightFetcher` class for more documentation.

        :param headless: Run the browser in headless/hidden (default) or headful/visible mode.
        :param disable_resources: Drop requests for unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
            This can help save your proxy usage, but be careful with this option as it makes some websites never finish loading.
        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate and use a real useragent of the same browser.
        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000.
        :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
        :param stealth: Enables stealth mode; check the documentation to see what stealth mode currently does.
        :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
        :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
        :param nstbrowser_mode: Enables NSTBrowser mode; it has to be used with the `cdp_url` argument or it will be completely ignored.
        :param google_search: Enabled by default, Scrapling will set the referer header to look as if this request came from a Google search for this website's domain name.
        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
        :param nstbrowser_config: The config you want to send with requests to the NSTBrowser. If left empty, Scrapling defaults to an optimized NSTBrowser docker browserless config.
        :param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor class.
        """
        self.headless = headless
        self.disable_resources = disable_resources
        self.network_idle = bool(network_idle)
        self.stealth = bool(stealth)
        self.hide_canvas = bool(hide_canvas)
        self.disable_webgl = bool(disable_webgl)
        self.google_search = bool(google_search)
        self.extra_headers = extra_headers or {}
        self.cdp_url = cdp_url
        self.useragent = useragent
        self.timeout = check_type_validity(timeout, [int, float], 30000)
        if callable(page_action):
            self.page_action = page_action
        else:
            self.page_action = do_nothing
            logging.error('[Ignored] Argument "page_action" must be callable')

        self.wait_selector = wait_selector
        self.wait_selector_state = wait_selector_state
        self.nstbrowser_mode = bool(nstbrowser_mode)
        self.nstbrowser_config = nstbrowser_config
        self.adaptor_arguments = adaptor_arguments if adaptor_arguments else {}

    def _cdp_url_logic(self, flags: Optional[List] = None) -> str:
        """Constructs a new CDP URL if NSTBrowser is enabled, otherwise returns the CDP URL as it is.

        :param flags: Chrome flags to be added to the NSTBrowser query
        :return: CDP URL
        """
        cdp_url = self.cdp_url
        if self.nstbrowser_mode:
            if self.nstbrowser_config and type(self.nstbrowser_config) is Dict:
                config = self.nstbrowser_config
            else:
                query = NSTBROWSER_DEFAULT_QUERY.copy()
                if flags:
                    query.update({
                        "args": dict(zip(flags, [''] * len(flags))),  # browser args should be a dictionary
                    })

                config = {
                    'config': json.dumps(query),
                    # 'token': ''
                }
            cdp_url = construct_cdp_url(cdp_url, config)
        else:
            # To validate it
            cdp_url = construct_cdp_url(cdp_url)

        return cdp_url

    def fetch(self, url: str) -> Response:
        """Opens up the browser and does your request based on your chosen options.

        :param url: Target url.
        :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
        """
        if not self.stealth:
            from playwright.sync_api import sync_playwright
        else:
            from rebrowser_playwright.sync_api import sync_playwright

        with sync_playwright() as p:
            # Handle the UserAgent early
            if self.useragent:
                extra_headers = {}
                useragent = self.useragent
            else:
                extra_headers = generate_headers(browser_mode=True)
                useragent = extra_headers.get('User-Agent')

            # Prepare the flags before diving
            flags = DEFAULT_STEALTH_FLAGS
            if self.hide_canvas:
                flags += ['--fingerprinting-canvas-image-data-noise']
            if self.disable_webgl:
                flags += ['--disable-webgl', '--disable-webgl-image-chromium', '--disable-webgl2']

            # Creating the browser
            if self.cdp_url:
                cdp_url = self._cdp_url_logic(flags if self.stealth else None)
                browser = p.chromium.connect_over_cdp(endpoint_url=cdp_url)
            else:
                if self.stealth:
                    browser = p.chromium.launch(headless=self.headless, args=flags, ignore_default_args=['--enable-automation'], chromium_sandbox=True)
                else:
                    browser = p.chromium.launch(headless=self.headless, ignore_default_args=['--enable-automation'])

            # Creating the context
            if self.stealth:
                context = browser.new_context(
                    locale='en-US',
                    is_mobile=False,
                    has_touch=False,
                    color_scheme='dark',  # Bypasses the 'prefersLightColor' check in creepjs
                    user_agent=useragent,
                    device_scale_factor=2,
                    # I'm thinking about disabling it to rest from all Service Workers headache but let's keep it as it is for now
                    service_workers="allow",
                    ignore_https_errors=True,
                    extra_http_headers=extra_headers,
                    screen={"width": 1920, "height": 1080},
                    viewport={"width": 1920, "height": 1080},
                    permissions=["geolocation", 'notifications'],
                )
            else:
                context = browser.new_context(
                    color_scheme='dark',
                    user_agent=useragent,
                    device_scale_factor=2,
                    extra_http_headers=extra_headers
                )

            # Finally we are in business
            page = context.new_page()
            page.set_default_navigation_timeout(self.timeout)
            page.set_default_timeout(self.timeout)

            if self.extra_headers:
                page.set_extra_http_headers(self.extra_headers)

            if self.disable_resources:
                page.route("**/*", intercept_route)

            if self.stealth:
                # Basic bypasses, nothing fancy as I'm still working on it
                # But with adding these bypasses to the above config, it bypasses many online tests like
                # https://bot.sannysoft.com/
                # https://kaliiiiiiiiii.github.io/brotector/
                # https://pixelscan.net/
                # https://iphey.com/
                # https://www.browserscan.net/bot-detection <== this one also checks for the CDP runtime fingerprint
                # https://arh.antoinevastel.com/bots/areyouheadless/
                # https://prescience-data.github.io/execution-monitor.html
                page.add_init_script(path=js_bypass_path('webdriver_fully.js'))
                page.add_init_script(path=js_bypass_path('window_chrome.js'))
                page.add_init_script(path=js_bypass_path('navigator_plugins.js'))
                page.add_init_script(path=js_bypass_path('pdf_viewer.js'))
                page.add_init_script(path=js_bypass_path('notification_permission.js'))
                page.add_init_script(path=js_bypass_path('screen_props.js'))
                page.add_init_script(path=js_bypass_path('playwright_fingerprint.js'))

            res = page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
            page.wait_for_load_state(state="domcontentloaded")
            if self.network_idle:
                page.wait_for_load_state('networkidle')

            page = self.page_action(page)

            if self.wait_selector and type(self.wait_selector) is str:
                waiter = page.locator(self.wait_selector)
                waiter.wait_for(state=self.wait_selector_state)

            content_type = res.headers.get('content-type', '')
            # Parse charset from content-type
            encoding = 'utf-8'  # default encoding
            if 'charset=' in content_type.lower():
                encoding = content_type.lower().split('charset=')[-1].split(';')[0].strip()

            response = Response(
                url=res.url,
                text=page.content(),
                content=res.body(),
                status=res.status,
                reason=res.status_text,
                encoding=encoding,
                cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
                headers=res.all_headers(),
                request_headers=res.request.all_headers(),
                adaptor_arguments=self.adaptor_arguments
            )
            page.close()
            return response
```
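To make the new engine concrete, here is a minimal usage sketch (not part of the diff): it assumes scrapling 0.2 plus Playwright with a Chromium build installed, and it drives `PlaywrightEngine` directly even though the package normally exposes it through `PlayWrightFetcher` in `scrapling/fetchers.py`. The `css_first`/`::text` calls assume the `Adaptor` API from `scrapling/parser.py`.

```python
# Hypothetical usage sketch, not part of the released package.
from scrapling.engines.pw import PlaywrightEngine

engine = PlaywrightEngine(
    headless=True,
    stealth=True,        # launches with the stealth flags and init scripts listed above
    network_idle=True,   # also waits for ~500 ms of network silence after DOMContentLoaded
    timeout=30000,       # milliseconds; validated through check_type_validity
)
response = engine.fetch('https://example.com')
print(response.status, response.reason)            # e.g. 200 OK
title = response.adaptor.css_first('title::text')  # parse the unified Response via the Adaptor bridge
print(title)
```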
scrapling/engines/static.py
ADDED
@@ -0,0 +1,112 @@
```python
import logging

from scrapling.core._types import Union, Optional, Dict
from .toolbelt import Response, generate_convincing_referer, generate_headers

import httpx
from httpx._models import Response as httpxResponse


class StaticEngine:
    def __init__(self, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = None, adaptor_arguments: Dict = None):
        """An engine that utilizes the httpx library; check the `Fetcher` class for more documentation.

        :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
        :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
        :param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor class.
        """
        self.timeout = timeout
        self.follow_redirects = bool(follow_redirects)
        self._extra_headers = generate_headers(browser_mode=False)
        self.adaptor_arguments = adaptor_arguments if adaptor_arguments else {}

    @staticmethod
    def _headers_job(headers: Optional[Dict], url: str, stealth: bool) -> Dict:
        """Adds a useragent to the headers if one doesn't exist, generates real headers and appends them to the current headers, and
        finally generates a referer header that looks as if this request came from a Google search of the current URL's domain.

        :param headers: Current headers in the request if the user passed any
        :param url: The Target URL.
        :param stealth: Whether stealth mode is enabled or not.
        :return: A dictionary of the new headers.
        """
        headers = headers or {}

        # Validate headers
        if not headers.get('user-agent') and not headers.get('User-Agent'):
            headers['User-Agent'] = generate_headers(browser_mode=False).get('User-Agent')
            logging.info(f"Can't find useragent in headers so '{headers['User-Agent']}' was used.")

        if stealth:
            extra_headers = generate_headers(browser_mode=False)
            headers.update(extra_headers)
            headers.update({'referer': generate_convincing_referer(url)})

        return headers

    def _prepare_response(self, response: httpxResponse) -> Response:
        """Takes an httpx response and generates a `Response` object from it.

        :param response: httpx response object
        :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
        """
        return Response(
            url=str(response.url),
            text=response.text,
            content=response.content,
            status=response.status_code,
            reason=response.reason_phrase,
            encoding=response.encoding or 'utf-8',
            cookies=dict(response.cookies),
            headers=dict(response.headers),
            request_headers=dict(response.request.headers),
            adaptor_arguments=self.adaptor_arguments
        )

    def get(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
        """Make a basic HTTP GET request for you but with some added flavors.
        :param url: Target url.
        :param stealthy_headers: If enabled (default), Fetcher will create and add real browser headers and
            create a referer header as if this request had come from a Google search of this URL's domain.
        :param kwargs: Any additional keyword arguments are passed directly to the `httpx.get()` function, so check the httpx documentation for details.
        :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
        """
        headers = self._headers_job(kwargs.get('headers'), url, stealthy_headers)
        request = httpx.get(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
        return self._prepare_response(request)

    def post(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
        """Make a basic HTTP POST request for you but with some added flavors.
        :param url: Target url.
        :param stealthy_headers: If enabled (default), Fetcher will create and add real browser headers and
            create a referer header as if this request had come from a Google search of this URL's domain.
        :param kwargs: Any additional keyword arguments are passed directly to the `httpx.post()` function, so check the httpx documentation for details.
        :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
        """
        headers = self._headers_job(kwargs.get('headers'), url, stealthy_headers)
        request = httpx.post(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
        return self._prepare_response(request)

    def delete(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
        """Make a basic HTTP DELETE request for you but with some added flavors.
        :param url: Target url.
        :param stealthy_headers: If enabled (default), Fetcher will create and add real browser headers and
            create a referer header as if this request had come from a Google search of this URL's domain.
        :param kwargs: Any additional keyword arguments are passed directly to the `httpx.delete()` function, so check the httpx documentation for details.
        :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
        """
        headers = self._headers_job(kwargs.get('headers'), url, stealthy_headers)
        request = httpx.delete(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
        return self._prepare_response(request)

    def put(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
        """Make a basic HTTP PUT request for you but with some added flavors.
        :param url: Target url.
        :param stealthy_headers: If enabled (default), Fetcher will create and add real browser headers and
            create a referer header as if this request had come from a Google search of this URL's domain.
        :param kwargs: Any additional keyword arguments are passed directly to the `httpx.put()` function, so check the httpx documentation for details.
        :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
        """
        headers = self._headers_job(kwargs.get('headers'), url, stealthy_headers)
        request = httpx.put(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
        return self._prepare_response(request)
```
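And a comparable sketch for the lightweight engine (again not part of the diff; the URL is a placeholder). In normal use the `Fetcher` class in `scrapling/fetchers.py` is the intended public wrapper around it.

```python
# Hypothetical usage sketch, not part of the released package.
from scrapling.engines.static import StaticEngine

engine = StaticEngine(follow_redirects=True, timeout=10)
response = engine.get('https://example.com', stealthy_headers=True)

print(response.status, response.encoding)
# The generated User-Agent and Google-style referer are recorded in request_headers:
print(sorted(response.request_headers.keys()))
```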
scrapling/engines/toolbelt/__init__.py
ADDED
@@ -0,0 +1,18 @@
```python
from .fingerprints import (
    get_os_name,
    generate_headers,
    generate_convincing_referer,
)
from .custom import (
    Response,
    do_nothing,
    BaseFetcher,
    get_variable_name,
    check_type_validity,
    check_if_engine_usable,
)
from .navigation import (
    js_bypass_path,
    intercept_route,
    construct_cdp_url,
)
```
scrapling/engines/toolbelt/custom.py
ADDED
@@ -0,0 +1,168 @@
```python
"""
Functions related to custom types or type checking
"""
import inspect
import logging
from dataclasses import dataclass, field

from scrapling.core.utils import setup_basic_logging
from scrapling.parser import Adaptor, SQLiteStorageSystem
from scrapling.core._types import Any, List, Type, Union, Optional, Dict, Callable


@dataclass(frozen=True)
class Response:
    """This class is returned by all engines as a way to unify the response type between different libraries."""
    url: str
    text: str
    content: bytes
    status: int
    reason: str
    encoding: str = 'utf-8'  # default encoding
    cookies: Dict = field(default_factory=dict)
    headers: Dict = field(default_factory=dict)
    request_headers: Dict = field(default_factory=dict)
    adaptor_arguments: Dict = field(default_factory=dict)

    @property
    def adaptor(self) -> Union[Adaptor, None]:
        """Generate an Adaptor instance from this response if possible, otherwise return None"""
        automatch_domain = self.adaptor_arguments.pop('automatch_domain', None)
        if self.text:
            # For playwright this will be the response after all JS is executed
            return Adaptor(text=self.text, url=automatch_domain or self.url, encoding=self.encoding, **self.adaptor_arguments)
        elif self.content:
            # For playwright, that's after all JS is loaded but not all of it executed, because playwright doesn't offer something like page.content()
            # to get the response bytes after the load states
            # Reference: https://playwright.dev/python/docs/api/class-page
            return Adaptor(body=self.content, url=automatch_domain or self.url, encoding=self.encoding, **self.adaptor_arguments)
        return None

    def __repr__(self):
        return f'<{self.__class__.__name__} [{self.status} {self.reason}]>'


class BaseFetcher:
    def __init__(
            self, huge_tree: bool = True, keep_comments: Optional[bool] = False, auto_match: Optional[bool] = True,
            storage: Any = SQLiteStorageSystem, storage_args: Optional[Dict] = None, debug: Optional[bool] = True,
            automatch_domain: Optional[str] = None,
    ):
        """The arguments below are the same as in the Adaptor class, so you can pass them directly; the rest of Adaptor's arguments
        are detected and passed automatically from the Fetcher based on the response for accessibility.

        :param huge_tree: Enabled by default, should always be enabled when parsing large HTML documents. This controls the
            libxml2 feature that forbids parsing certain large documents to protect against possible memory exhaustion.
        :param keep_comments: While parsing the HTML body, drop comments or not. Disabled by default for obvious reasons.
        :param auto_match: Globally turn off the auto-match feature in all functions; this argument takes higher
            priority over all auto-match related arguments/functions in the class.
        :param storage: The storage class to be passed for auto-matching functionalities, see ``Docs`` for more info.
        :param storage_args: A dictionary of ``argument->value`` pairs to be passed for the storage class.
            If empty, default values will be used.
        :param automatch_domain: For cases where you want to automatch selectors across different websites as if they were on the same website, use this argument to unify them.
            Otherwise, the domain of the request is used by default.
        :param debug: Enable debug mode
        """
        # Adaptor class parameters
        # I won't validate Adaptor's class parameters here again, I will leave it to be validated later
        self.adaptor_arguments = dict(
            huge_tree=huge_tree,
            keep_comments=keep_comments,
            auto_match=auto_match,
            storage=storage,
            storage_args=storage_args,
            debug=debug,
        )
        # If the user used fetchers first, then configure the logger from here instead of the `Adaptor` class
        setup_basic_logging(level='debug' if debug else 'info')
        if automatch_domain:
            if type(automatch_domain) is not str:
                logging.warning('[Ignored] The argument "automatch_domain" must be of string type')
            else:
                self.adaptor_arguments.update({'automatch_domain': automatch_domain})


def check_if_engine_usable(engine: Callable) -> Union[Callable, None]:
    """This function checks whether the passed engine can be used by a Fetcher-type class or not.

    :param engine: The engine class itself
    :return: The engine class again if everything checks out, otherwise raises an error
    :raise TypeError: If the engine class doesn't have a fetch method, if it has a fetch attribute that is not a method, or if it has a fetch function that doesn't take arguments
    """
    # if isinstance(engine, type):
    #     raise TypeError("Expected an engine instance, not a class definition of the engine")

    if hasattr(engine, 'fetch'):
        fetch_function = getattr(engine, "fetch")
        if callable(fetch_function):
            if len(inspect.signature(fetch_function).parameters) > 0:
                return engine
            else:
                # raise TypeError("Engine class instance must have a callable method 'fetch' with the first argument used for the url.")
                raise TypeError("Engine class must have a callable method 'fetch' with the first argument used for the url.")
        else:
            # raise TypeError("Invalid engine instance! Engine class must have a callable method 'fetch'")
            raise TypeError("Invalid engine class! Engine class must have a callable method 'fetch'")
    else:
        # raise TypeError("Invalid engine instance! Engine class must have the method 'fetch'")
        raise TypeError("Invalid engine class! Engine class must have the method 'fetch'")


def get_variable_name(var: Any) -> Optional[str]:
    """Get the name of a variable using global and local scopes.
    :param var: The variable to find the name for
    :return: The name of the variable if found, None otherwise
    """
    for scope in [globals(), locals()]:
        for name, value in scope.items():
            if value is var:
                return name
    return None


def check_type_validity(variable: Any, valid_types: Union[List[Type], None], default_value: Any = None, critical: bool = False, param_name: Optional[str] = None) -> Any:
    """Check if a variable matches the specified type constraints.
    :param variable: The variable to check
    :param valid_types: List of valid types for the variable
    :param default_value: Value to return if the type check fails
    :param critical: If True, raises TypeError instead of logging an error
    :param param_name: Optional parameter name for error messages
    :return: The original variable if valid, default_value if invalid
    :raise TypeError: If critical=True and the type check fails
    """
    # Use the provided param_name or try to get it automatically
    var_name = param_name or get_variable_name(variable) or "Unknown"

    # Convert valid_types to a list if None
    valid_types = valid_types or []

    # Handle None value
    if variable is None:
        if type(None) in valid_types:
            return variable
        error_msg = f'Argument "{var_name}" cannot be None'
        if critical:
            raise TypeError(error_msg)
        logging.error(f'[Ignored] {error_msg}')
        return default_value

    # If no valid_types are specified and the variable has a value, return it
    if not valid_types:
        return variable

    # Check if the variable type matches any of the valid types
    if not any(isinstance(variable, t) for t in valid_types):
        type_names = [t.__name__ for t in valid_types]
        error_msg = f'Argument "{var_name}" must be of type {" or ".join(type_names)}'
        if critical:
            raise TypeError(error_msg)
        logging.error(f'[Ignored] {error_msg}')
        return default_value

    return variable


# Pew Pew
def do_nothing(page):
    # Just works as a filler for the `page_action` argument in browser engines
    return page
```
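Since `check_type_validity` is the argument guard the engines lean on (for example, `PlaywrightEngine` uses it for `timeout`), here is a small sketch of its two failure modes (not part of the diff):

```python
# Hypothetical usage sketch, not part of the released package.
import logging
logging.basicConfig(level=logging.INFO)

from scrapling.engines.toolbelt import check_type_validity

# A valid value passes through unchanged
assert check_type_validity(15000, [int, float], default_value=30000, param_name='timeout') == 15000

# A wrong type is logged as "[Ignored] ..." and replaced by the default
assert check_type_validity('fast', [int, float], default_value=30000, param_name='timeout') == 30000

# With critical=True the same failure raises instead of logging
try:
    check_type_validity(None, [int, float], critical=True, param_name='timeout')
except TypeError as error:
    print(error)  # Argument "timeout" cannot be None
```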
scrapling/engines/toolbelt/fingerprints.py
ADDED
@@ -0,0 +1,81 @@
```python
"""
Functions related to generating headers and fingerprints generally
"""

import platform

from scrapling.core.utils import cache
from scrapling.core._types import Union, Dict

from tldextract import extract
from browserforge.headers import HeaderGenerator, Browser
from browserforge.fingerprints import FingerprintGenerator, Fingerprint


@cache(None, typed=True)
def generate_convincing_referer(url: str) -> str:
    """Takes the domain from the URL without the subdomain/suffix and makes it look like you were searching Google for this website

    >>> generate_convincing_referer('https://www.somewebsite.com/blah')
    'https://www.google.com/search?q=somewebsite'

    :param url: The URL you are about to fetch.
    :return: Google's search URL of the domain name
    """
    website_name = extract(url).domain
    return f'https://www.google.com/search?q={website_name}'


@cache(None, typed=True)
def get_os_name() -> Union[str, None]:
    """Get the current OS name in the same format needed for browserforge

    :return: Current OS name or `None` otherwise
    """
    os_name = platform.system()
    return {
        'Linux': 'linux',
        'Darwin': 'macos',
        'Windows': 'windows',
        # For the future? because why not
        'iOS': 'ios',
    }.get(os_name)


def generate_suitable_fingerprint() -> Fingerprint:
    """Generates a browserforge fingerprint that matches the current OS, a desktop device, and Chrome version 128 at least.

    This function was originally created to test Browserforge's injector.
    :return: `Fingerprint` object
    """
    return FingerprintGenerator(
        browser=[Browser(name='chrome', min_version=128)],
        os=get_os_name(),  # None is ignored
        device='desktop'
    ).generate()


def generate_headers(browser_mode: bool = False) -> Dict:
    """Generate real browser-like headers using browserforge's generator

    :param browser_mode: If enabled, the headers created are used for playwright, so they have to match everything
    :return: A dictionary of the generated headers
    """
    if browser_mode:
        # In this mode we don't care about anything other than matching the OS and the browser type with the browser we are using
        # so we don't raise any inconsistency red flags while websites fingerprint us
        os_name = get_os_name()
        return HeaderGenerator(
            browser=[Browser(name='chrome', min_version=128)],
            os=os_name,  # None is ignored
            device='desktop'
        ).generate()
    else:
        # Here it's used for normal requests that aren't done through browsers so we can take it lightly
        browsers = [
            Browser(name='chrome', min_version=120),
            Browser(name='firefox', min_version=120),
            Browser(name='edge', min_version=120),
        ]
        return HeaderGenerator(browser=browsers, device='desktop').generate()
```