scrapling-0.2.8-py3-none-any.whl → scrapling-0.2.91-py3-none-any.whl
- scrapling/__init__.py +4 -4
- scrapling/core/_types.py +2 -0
- scrapling/core/custom_types.py +88 -6
- scrapling/core/storage_adaptors.py +5 -6
- scrapling/core/translator.py +2 -2
- scrapling/core/utils.py +29 -27
- scrapling/defaults.py +2 -1
- scrapling/engines/camo.py +124 -24
- scrapling/engines/constants.py +4 -4
- scrapling/engines/pw.py +195 -91
- scrapling/engines/static.py +91 -48
- scrapling/engines/toolbelt/__init__.py +3 -3
- scrapling/engines/toolbelt/custom.py +16 -22
- scrapling/engines/toolbelt/fingerprints.py +3 -3
- scrapling/engines/toolbelt/navigation.py +21 -8
- scrapling/fetchers.py +231 -16
- scrapling/parser.py +50 -22
- {scrapling-0.2.8.dist-info → scrapling-0.2.91.dist-info}/METADATA +33 -18
- scrapling-0.2.91.dist-info/RECORD +47 -0
- tests/fetchers/async/__init__.py +0 -0
- tests/fetchers/async/test_camoufox.py +95 -0
- tests/fetchers/async/test_httpx.py +83 -0
- tests/fetchers/async/test_playwright.py +99 -0
- tests/fetchers/sync/__init__.py +0 -0
- tests/fetchers/sync/test_camoufox.py +68 -0
- tests/fetchers/sync/test_httpx.py +82 -0
- tests/fetchers/sync/test_playwright.py +87 -0
- tests/fetchers/test_utils.py +90 -122
- tests/parser/test_automatch.py +64 -9
- tests/parser/test_general.py +260 -218
- scrapling-0.2.8.dist-info/RECORD +0 -42
- tests/fetchers/test_camoufox.py +0 -65
- tests/fetchers/test_httpx.py +0 -68
- tests/fetchers/test_playwright.py +0 -77
- {scrapling-0.2.8.dist-info → scrapling-0.2.91.dist-info}/LICENSE +0 -0
- {scrapling-0.2.8.dist-info → scrapling-0.2.91.dist-info}/WHEEL +0 -0
- {scrapling-0.2.8.dist-info → scrapling-0.2.91.dist-info}/top_level.txt +0 -0
scrapling/engines/pw.py
CHANGED
@@ -1,12 +1,14 @@
 import json
-import logging
 
-from scrapling.core._types import Callable, Dict,
+from scrapling.core._types import (Callable, Dict, Optional,
+                                   SelectorWaitStates, Union)
+from scrapling.core.utils import log, lru_cache
 from scrapling.engines.constants import (DEFAULT_STEALTH_FLAGS,
                                          NSTBROWSER_DEFAULT_QUERY)
 from scrapling.engines.toolbelt import (Response, StatusText,
+                                        async_intercept_route,
                                         check_type_validity, construct_cdp_url,
-                                        construct_proxy_dict,
+                                        construct_proxy_dict,
                                         generate_convincing_referer,
                                         generate_headers, intercept_route,
                                         js_bypass_path)
@@ -19,10 +21,10 @@ class PlaywrightEngine:
             useragent: Optional[str] = None,
             network_idle: Optional[bool] = False,
             timeout: Optional[float] = 30000,
-            page_action: Callable =
+            page_action: Callable = None,
             wait_selector: Optional[str] = None,
             locale: Optional[str] = 'en-US',
-            wait_selector_state:
+            wait_selector_state: SelectorWaitStates = 'attached',
             stealth: Optional[bool] = False,
             real_chrome: Optional[bool] = False,
             hide_canvas: Optional[bool] = False,
@@ -74,11 +76,14 @@ class PlaywrightEngine:
         self.cdp_url = cdp_url
         self.useragent = useragent
         self.timeout = check_type_validity(timeout, [int, float], 30000)
-        if
-
+        if page_action is not None:
+            if callable(page_action):
+                self.page_action = page_action
+            else:
+                self.page_action = None
+                log.error('[Ignored] Argument "page_action" must be callable')
         else:
-            self.page_action =
-            logging.error('[Ignored] Argument "page_action" must be callable')
+            self.page_action = None
 
         self.wait_selector = wait_selector
         self.wait_selector_state = wait_selector_state
@@ -94,10 +99,8 @@ class PlaywrightEngine:
             # '--disable-extensions',
         ]
 
-    def _cdp_url_logic(self
+    def _cdp_url_logic(self) -> str:
         """Constructs new CDP URL if NSTBrowser is enabled otherwise return CDP URL as it is
-
-        :param flags: Chrome flags to be added to NSTBrowser query
         :return: CDP URL
         """
         cdp_url = self.cdp_url
@@ -106,7 +109,8 @@ class PlaywrightEngine:
             config = self.nstbrowser_config
         else:
             query = NSTBROWSER_DEFAULT_QUERY.copy()
-            if
+            if self.stealth:
+                flags = self.__set_flags()
             query.update({
                 "args": dict(zip(flags, [''] * len(flags))),  # browser args should be a dictionary
             })
@@ -122,78 +126,104 @@ class PlaywrightEngine:
 
         return cdp_url
 
+    @lru_cache(typed=True)
+    def __set_flags(self):
+        """Returns the flags that will be used while launching the browser if stealth mode is enabled"""
+        flags = DEFAULT_STEALTH_FLAGS
+        if self.hide_canvas:
+            flags += ('--fingerprinting-canvas-image-data-noise',)
+        if self.disable_webgl:
+            flags += ('--disable-webgl', '--disable-webgl-image-chromium', '--disable-webgl2',)
+
+        return flags
+
+    def __launch_kwargs(self):
+        """Creates the arguments we will use while launching playwright's browser"""
+        launch_kwargs = {'headless': self.headless, 'ignore_default_args': self.harmful_default_args, 'channel': 'chrome' if self.real_chrome else 'chromium'}
+        if self.stealth:
+            launch_kwargs.update({'args': self.__set_flags(), 'chromium_sandbox': True})
+
+        return launch_kwargs
+
+    def __context_kwargs(self):
+        """Creates the arguments for the browser context"""
+        context_kwargs = {
+            "proxy": self.proxy,
+            "locale": self.locale,
+            "color_scheme": 'dark',  # Bypasses the 'prefersLightColor' check in creepjs
+            "device_scale_factor": 2,
+            "extra_http_headers": self.extra_headers if self.extra_headers else {},
+            "user_agent": self.useragent if self.useragent else generate_headers(browser_mode=True).get('User-Agent'),
+        }
+        if self.stealth:
+            context_kwargs.update({
+                'is_mobile': False,
+                'has_touch': False,
+                # I'm thinking about disabling it to rest from all Service Workers headache but let's keep it as it is for now
+                'service_workers': 'allow',
+                'ignore_https_errors': True,
+                'screen': {'width': 1920, 'height': 1080},
+                'viewport': {'width': 1920, 'height': 1080},
+                'permissions': ['geolocation', 'notifications']
+            })
+
+        return context_kwargs
+
+    @lru_cache()
+    def __stealth_scripts(self):
+        # Basic bypasses nothing fancy as I'm still working on it
+        # But with adding these bypasses to the above config, it bypasses many online tests like
+        # https://bot.sannysoft.com/
+        # https://kaliiiiiiiiii.github.io/brotector/
+        # https://pixelscan.net/
+        # https://iphey.com/
+        # https://www.browserscan.net/bot-detection <== this one also checks for the CDP runtime fingerprint
+        # https://arh.antoinevastel.com/bots/areyouheadless/
+        # https://prescience-data.github.io/execution-monitor.html
+        return tuple(
+            js_bypass_path(script) for script in (
+                # Order is important
+                'webdriver_fully.js', 'window_chrome.js', 'navigator_plugins.js', 'pdf_viewer.js',
+                'notification_permission.js', 'screen_props.js', 'playwright_fingerprint.js'
+            )
+        )
+
     def fetch(self, url: str) -> Response:
         """Opens up the browser and do your request based on your chosen options.
 
         :param url: Target url.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
+        from playwright.sync_api import Response as PlaywrightResponse
         if not self.stealth or self.real_chrome:
            # Because rebrowser_playwright doesn't play well with real browsers
            from playwright.sync_api import sync_playwright
         else:
            from rebrowser_playwright.sync_api import sync_playwright
 
-
-
-        if self.useragent:
-            extra_headers = {}
-            useragent = self.useragent
-        else:
-            extra_headers = {}
-            useragent = generate_headers(browser_mode=True).get('User-Agent')
+        # Store the final response
+        final_response = None
 
-
-
-        if
-
-        if self.disable_webgl:
-            flags += ['--disable-webgl', '--disable-webgl-image-chromium', '--disable-webgl2']
+        def handle_response(finished_response: PlaywrightResponse):
+            nonlocal final_response
+            if finished_response.request.resource_type == "document":
+                final_response = finished_response
 
+        with sync_playwright() as p:
             # Creating the browser
             if self.cdp_url:
-                cdp_url = self._cdp_url_logic(
+                cdp_url = self._cdp_url_logic()
                 browser = p.chromium.connect_over_cdp(endpoint_url=cdp_url)
             else:
-
-                browser = p.chromium.launch(
-                    headless=self.headless, args=flags, ignore_default_args=self.harmful_default_args, chromium_sandbox=True, channel='chrome' if self.real_chrome else 'chromium'
-                )
-            else:
-                browser = p.chromium.launch(headless=self.headless, ignore_default_args=self.harmful_default_args, channel='chrome' if self.real_chrome else 'chromium')
-
-            # Creating the context
-            if self.stealth:
-                context = browser.new_context(
-                    locale=self.locale,
-                    is_mobile=False,
-                    has_touch=False,
-                    proxy=self.proxy,
-                    color_scheme='dark',  # Bypasses the 'prefersLightColor' check in creepjs
-                    user_agent=useragent,
-                    device_scale_factor=2,
-                    # I'm thinking about disabling it to rest from all Service Workers headache but let's keep it as it is for now
-                    service_workers="allow",
-                    ignore_https_errors=True,
-                    extra_http_headers=extra_headers,
-                    screen={"width": 1920, "height": 1080},
-                    viewport={"width": 1920, "height": 1080},
-                    permissions=["geolocation", 'notifications'],
-                )
-            else:
-                context = browser.new_context(
-                    locale=self.locale,
-                    proxy=self.proxy,
-                    color_scheme='dark',
-                    user_agent=useragent,
-                    device_scale_factor=2,
-                    extra_http_headers=extra_headers
-                )
+                browser = p.chromium.launch(**self.__launch_kwargs())
 
+            context = browser.new_context(**self.__context_kwargs())
             # Finally we are in business
             page = context.new_page()
             page.set_default_navigation_timeout(self.timeout)
             page.set_default_timeout(self.timeout)
+            # Listen for all responses
+            page.on("response", handle_response)
 
             if self.extra_headers:
                 page.set_extra_http_headers(self.extra_headers)
@@ -202,29 +232,16 @@ class PlaywrightEngine:
             page.route("**/*", intercept_route)
 
             if self.stealth:
-
-
-
-
-                # https://pixelscan.net/
-                # https://iphey.com/
-                # https://www.browserscan.net/bot-detection <== this one also checks for the CDP runtime fingerprint
-                # https://arh.antoinevastel.com/bots/areyouheadless/
-                # https://prescience-data.github.io/execution-monitor.html
-                page.add_init_script(path=js_bypass_path('webdriver_fully.js'))
-                page.add_init_script(path=js_bypass_path('window_chrome.js'))
-                page.add_init_script(path=js_bypass_path('navigator_plugins.js'))
-                page.add_init_script(path=js_bypass_path('pdf_viewer.js'))
-                page.add_init_script(path=js_bypass_path('notification_permission.js'))
-                page.add_init_script(path=js_bypass_path('screen_props.js'))
-                page.add_init_script(path=js_bypass_path('playwright_fingerprint.js'))
-
-            res = page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
+                for script in self.__stealth_scripts():
+                    page.add_init_script(path=script)
+
+            first_response = page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
             page.wait_for_load_state(state="domcontentloaded")
             if self.network_idle:
                 page.wait_for_load_state('networkidle')
 
-
+            if self.page_action is not None:
+                page = self.page_action(page)
 
             if self.wait_selector and type(self.wait_selector) is str:
                 waiter = page.locator(self.wait_selector)
@@ -235,25 +252,112 @@ class PlaywrightEngine:
             if self.network_idle:
                 page.wait_for_load_state('networkidle')
 
+            response_bytes = final_response.body() if final_response else page.content().encode('utf-8')
+            # In case we didn't catch a document type somehow
+            final_response = final_response if final_response else first_response
             # This will be parsed inside `Response`
-            encoding =
-
-            status_text = res.status_text
+            encoding = final_response.headers.get('content-type', '') or 'utf-8'  # default encoding
             # PlayWright API sometimes give empty status text for some reason!
-
-            status_text = StatusText.get(res.status)
+            status_text = final_response.status_text or StatusText.get(final_response.status)
 
             response = Response(
-                url=
+                url=final_response.url,
                 text=page.content(),
-                body=
-                status=
+                body=response_bytes,
+                status=final_response.status,
                 reason=status_text,
                 encoding=encoding,
                 cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
-                headers=
-                request_headers=
+                headers=final_response.all_headers(),
+                request_headers=final_response.request.all_headers(),
                 **self.adaptor_arguments
             )
             page.close()
             return response
+
+    async def async_fetch(self, url: str) -> Response:
+        """Async version of `fetch`
+
+        :param url: Target url.
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
+        """
+        from playwright.async_api import Response as PlaywrightResponse
+        if not self.stealth or self.real_chrome:
+            # Because rebrowser_playwright doesn't play well with real browsers
+            from playwright.async_api import async_playwright
+        else:
+            from rebrowser_playwright.async_api import async_playwright
+
+        # Store the final response
+        final_response = None
+
+        async def handle_response(finished_response: PlaywrightResponse):
+            nonlocal final_response
+            if finished_response.request.resource_type == "document":
+                final_response = finished_response
+
+        async with async_playwright() as p:
+            # Creating the browser
+            if self.cdp_url:
+                cdp_url = self._cdp_url_logic()
+                browser = await p.chromium.connect_over_cdp(endpoint_url=cdp_url)
+            else:
+                browser = await p.chromium.launch(**self.__launch_kwargs())
+
+            context = await browser.new_context(**self.__context_kwargs())
+            # Finally we are in business
+            page = await context.new_page()
+            page.set_default_navigation_timeout(self.timeout)
+            page.set_default_timeout(self.timeout)
+            # Listen for all responses
+            page.on("response", handle_response)
+
+            if self.extra_headers:
+                await page.set_extra_http_headers(self.extra_headers)
+
+            if self.disable_resources:
+                await page.route("**/*", async_intercept_route)
+
+            if self.stealth:
+                for script in self.__stealth_scripts():
+                    await page.add_init_script(path=script)
+
+            first_response = await page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
+            await page.wait_for_load_state(state="domcontentloaded")
+            if self.network_idle:
+                await page.wait_for_load_state('networkidle')
+
+            if self.page_action is not None:
+                page = await self.page_action(page)
+
+            if self.wait_selector and type(self.wait_selector) is str:
+                waiter = page.locator(self.wait_selector)
+                await waiter.first.wait_for(state=self.wait_selector_state)
+                # Wait again after waiting for the selector, helpful with protections like Cloudflare
+                await page.wait_for_load_state(state="load")
+                await page.wait_for_load_state(state="domcontentloaded")
+                if self.network_idle:
+                    await page.wait_for_load_state('networkidle')
+
+            response_bytes = await final_response.body() if final_response else (await page.content()).encode('utf-8')
+            # In case we didn't catch a document type somehow
+            final_response = final_response if final_response else first_response
+            # This will be parsed inside `Response`
+            encoding = final_response.headers.get('content-type', '') or 'utf-8'  # default encoding
+            # PlayWright API sometimes give empty status text for some reason!
+            status_text = final_response.status_text or StatusText.get(final_response.status)
+
+            response = Response(
+                url=final_response.url,
+                text=await page.content(),
+                body=response_bytes,
+                status=final_response.status,
+                reason=status_text,
+                encoding=encoding,
+                cookies={cookie['name']: cookie['value'] for cookie in await page.context.cookies()},
+                headers=await final_response.all_headers(),
+                request_headers=await final_response.request.all_headers(),
+                **self.adaptor_arguments
+            )
+            await page.close()
            return response
scrapling/engines/static.py
CHANGED
@@ -1,34 +1,44 @@
-import logging
-
 import httpx
 from httpx._models import Response as httpxResponse
 
-from scrapling.core._types import Dict, Optional, Union
+from scrapling.core._types import Dict, Optional, Tuple, Union
+from scrapling.core.utils import log, lru_cache
 
 from .toolbelt import Response, generate_convincing_referer, generate_headers
 
 
+@lru_cache(typed=True)
 class StaticEngine:
-    def __init__(
+    def __init__(
+            self, url: str, proxy: Optional[str] = None, stealthy_headers: Optional[bool] = True, follow_redirects: bool = True,
+            timeout: Optional[Union[int, float]] = None, retries: Optional[int] = 3, adaptor_arguments: Tuple = None
+    ):
         """An engine that utilizes httpx library, check the `Fetcher` class for more documentation.
 
+        :param url: Target url.
+        :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
+            create a referer header as if this request had came from Google's search of this URL's domain.
+        :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
         :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
         :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
         :param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
         """
+        self.url = url
+        self.proxy = proxy
+        self.stealth = stealthy_headers
         self.timeout = timeout
         self.follow_redirects = bool(follow_redirects)
+        self.retries = retries
         self._extra_headers = generate_headers(browser_mode=False)
-
+        # Because we are using `lru_cache` for a slight optimization but both dict/dict_items are not hashable so they can't be cached
+        # So my solution here was to convert it to tuple then convert it back to dictionary again here as tuples are hashable, ofc `tuple().__hash__()`
+        self.adaptor_arguments = dict(adaptor_arguments) if adaptor_arguments else {}
 
-
-    def _headers_job(headers: Optional[Dict], url: str, stealth: bool) -> Dict:
+    def _headers_job(self, headers: Optional[Dict]) -> Dict:
         """Adds useragent to headers if it doesn't exist, generates real headers and append it to current headers, and
         finally generates a referer header that looks like if this request came from Google's search of the current URL's domain.
 
         :param headers: Current headers in the request if the user passed any
-        :param url: The Target URL.
-        :param stealth: Whether stealth mode is enabled or not.
         :return: A dictionary of the new headers.
         """
         headers = headers or {}
@@ -36,12 +46,12 @@ class StaticEngine:
         # Validate headers
         if not headers.get('user-agent') and not headers.get('User-Agent'):
             headers['User-Agent'] = generate_headers(browser_mode=False).get('User-Agent')
-
+            log.debug(f"Can't find useragent in headers so '{headers['User-Agent']}' was used.")
 
-        if stealth:
+        if self.stealth:
             extra_headers = generate_headers(browser_mode=False)
             headers.update(extra_headers)
-            headers.update({'referer': generate_convincing_referer(url)})
+            headers.update({'referer': generate_convincing_referer(self.url)})
 
         return headers
 
@@ -61,69 +71,102 @@ class StaticEngine:
             cookies=dict(response.cookies),
             headers=dict(response.headers),
             request_headers=dict(response.request.headers),
+            method=response.request.method,
             **self.adaptor_arguments
         )
 
-    def get(self,
+    def get(self, **kwargs: Dict) -> Response:
         """Make basic HTTP GET request for you but with some added flavors.
 
-        :param
-        :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
-            create a referer header as if this request had came from Google's search of this URL's domain.
-        :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
-        :param kwargs: Any additional keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
+        :param kwargs: Any keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-        headers = self._headers_job(kwargs.pop('headers', {})
-        with httpx.Client(proxy=proxy) as client:
-            request = client.get(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+        headers = self._headers_job(kwargs.pop('headers', {}))
+        with httpx.Client(proxy=self.proxy, transport=httpx.HTTPTransport(retries=self.retries)) as client:
+            request = client.get(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
 
         return self._prepare_response(request)
 
-    def
+    async def async_get(self, **kwargs: Dict) -> Response:
+        """Make basic async HTTP GET request for you but with some added flavors.
+
+        :param kwargs: Any keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
+        """
+        headers = self._headers_job(kwargs.pop('headers', {}))
+        async with httpx.AsyncClient(proxy=self.proxy) as client:
+            request = await client.get(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+
+        return self._prepare_response(request)
+
+    def post(self, **kwargs: Dict) -> Response:
         """Make basic HTTP POST request for you but with some added flavors.
 
-        :param
-        :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
-            create a referer header as if this request had came from Google's search of this URL's domain.
-        :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
-        :param kwargs: Any additional keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
+        :param kwargs: Any keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-        headers = self._headers_job(kwargs.pop('headers', {})
-        with httpx.Client(proxy=proxy) as client:
-            request = client.post(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+        headers = self._headers_job(kwargs.pop('headers', {}))
+        with httpx.Client(proxy=self.proxy, transport=httpx.HTTPTransport(retries=self.retries)) as client:
+            request = client.post(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
 
         return self._prepare_response(request)
 
-    def
+    async def async_post(self, **kwargs: Dict) -> Response:
+        """Make basic async HTTP POST request for you but with some added flavors.
+
+        :param kwargs: Any keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
+        """
+        headers = self._headers_job(kwargs.pop('headers', {}))
+        async with httpx.AsyncClient(proxy=self.proxy) as client:
+            request = await client.post(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+
+        return self._prepare_response(request)
+
+    def delete(self, **kwargs: Dict) -> Response:
         """Make basic HTTP DELETE request for you but with some added flavors.
 
-        :param
-        :
-
-
-
+        :param kwargs: Any keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
+        """
+        headers = self._headers_job(kwargs.pop('headers', {}))
+        with httpx.Client(proxy=self.proxy, transport=httpx.HTTPTransport(retries=self.retries)) as client:
+            request = client.delete(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+
+        return self._prepare_response(request)
+
+    async def async_delete(self, **kwargs: Dict) -> Response:
+        """Make basic async HTTP DELETE request for you but with some added flavors.
+
+        :param kwargs: Any keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-        headers = self._headers_job(kwargs.pop('headers', {})
-        with httpx.
-            request = client.delete(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+        headers = self._headers_job(kwargs.pop('headers', {}))
+        async with httpx.AsyncClient(proxy=self.proxy) as client:
+            request = await client.delete(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
 
         return self._prepare_response(request)
 
-    def put(self,
+    def put(self, **kwargs: Dict) -> Response:
         """Make basic HTTP PUT request for you but with some added flavors.
 
-        :param
-        :
-
-
-
+        :param kwargs: Any keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
+        """
+        headers = self._headers_job(kwargs.pop('headers', {}))
+        with httpx.Client(proxy=self.proxy, transport=httpx.HTTPTransport(retries=self.retries)) as client:
+            request = client.put(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+
+        return self._prepare_response(request)
+
+    async def async_put(self, **kwargs: Dict) -> Response:
+        """Make basic async HTTP PUT request for you but with some added flavors.
+
+        :param kwargs: Any keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-        headers = self._headers_job(kwargs.pop('headers', {})
-        with httpx.
-            request = client.put(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+        headers = self._headers_job(kwargs.pop('headers', {}))
+        async with httpx.AsyncClient(proxy=self.proxy) as client:
+            request = await client.put(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
 
         return self._prepare_response(request)
scrapling/engines/toolbelt/__init__.py
CHANGED
@@ -1,6 +1,6 @@
 from .custom import (BaseFetcher, Response, StatusText, check_if_engine_usable,
-                     check_type_validity,
+                     check_type_validity, get_variable_name)
 from .fingerprints import (generate_convincing_referer, generate_headers,
                            get_os_name)
-from .navigation import (
-                         intercept_route, js_bypass_path)
+from .navigation import (async_intercept_route, construct_cdp_url,
+                         construct_proxy_dict, intercept_route, js_bypass_path)