scrapling 0.2.8__py3-none-any.whl → 0.2.91__py3-none-any.whl
- scrapling/__init__.py +4 -4
- scrapling/core/_types.py +2 -0
- scrapling/core/custom_types.py +88 -6
- scrapling/core/storage_adaptors.py +5 -6
- scrapling/core/translator.py +2 -2
- scrapling/core/utils.py +29 -27
- scrapling/defaults.py +2 -1
- scrapling/engines/camo.py +124 -24
- scrapling/engines/constants.py +4 -4
- scrapling/engines/pw.py +195 -91
- scrapling/engines/static.py +91 -48
- scrapling/engines/toolbelt/__init__.py +3 -3
- scrapling/engines/toolbelt/custom.py +16 -22
- scrapling/engines/toolbelt/fingerprints.py +3 -3
- scrapling/engines/toolbelt/navigation.py +21 -8
- scrapling/fetchers.py +231 -16
- scrapling/parser.py +50 -22
- {scrapling-0.2.8.dist-info → scrapling-0.2.91.dist-info}/METADATA +33 -18
- scrapling-0.2.91.dist-info/RECORD +47 -0
- tests/fetchers/async/__init__.py +0 -0
- tests/fetchers/async/test_camoufox.py +95 -0
- tests/fetchers/async/test_httpx.py +83 -0
- tests/fetchers/async/test_playwright.py +99 -0
- tests/fetchers/sync/__init__.py +0 -0
- tests/fetchers/sync/test_camoufox.py +68 -0
- tests/fetchers/sync/test_httpx.py +82 -0
- tests/fetchers/sync/test_playwright.py +87 -0
- tests/fetchers/test_utils.py +90 -122
- tests/parser/test_automatch.py +64 -9
- tests/parser/test_general.py +260 -218
- scrapling-0.2.8.dist-info/RECORD +0 -42
- tests/fetchers/test_camoufox.py +0 -65
- tests/fetchers/test_httpx.py +0 -68
- tests/fetchers/test_playwright.py +0 -77
- {scrapling-0.2.8.dist-info → scrapling-0.2.91.dist-info}/LICENSE +0 -0
- {scrapling-0.2.8.dist-info → scrapling-0.2.91.dist-info}/WHEEL +0 -0
- {scrapling-0.2.8.dist-info → scrapling-0.2.91.dist-info}/top_level.txt +0 -0
scrapling/engines/pw.py
CHANGED
@@ -1,12 +1,14 @@
 import json
-import logging
 
-from scrapling.core._types import Callable, Dict,
+from scrapling.core._types import (Callable, Dict, Optional,
+                                   SelectorWaitStates, Union)
+from scrapling.core.utils import log, lru_cache
 from scrapling.engines.constants import (DEFAULT_STEALTH_FLAGS,
                                          NSTBROWSER_DEFAULT_QUERY)
 from scrapling.engines.toolbelt import (Response, StatusText,
+                                        async_intercept_route,
                                         check_type_validity, construct_cdp_url,
-                                        construct_proxy_dict,
+                                        construct_proxy_dict,
                                         generate_convincing_referer,
                                         generate_headers, intercept_route,
                                         js_bypass_path)
@@ -19,10 +21,10 @@ class PlaywrightEngine:
                  useragent: Optional[str] = None,
                  network_idle: Optional[bool] = False,
                  timeout: Optional[float] = 30000,
-                 page_action: Callable =
+                 page_action: Callable = None,
                  wait_selector: Optional[str] = None,
                  locale: Optional[str] = 'en-US',
-                 wait_selector_state:
+                 wait_selector_state: SelectorWaitStates = 'attached',
                  stealth: Optional[bool] = False,
                  real_chrome: Optional[bool] = False,
                  hide_canvas: Optional[bool] = False,
@@ -74,11 +76,14 @@ class PlaywrightEngine:
         self.cdp_url = cdp_url
         self.useragent = useragent
         self.timeout = check_type_validity(timeout, [int, float], 30000)
-        if
-
+        if page_action is not None:
+            if callable(page_action):
+                self.page_action = page_action
+            else:
+                self.page_action = None
+                log.error('[Ignored] Argument "page_action" must be callable')
         else:
-            self.page_action =
-            logging.error('[Ignored] Argument "page_action" must be callable')
+            self.page_action = None
 
         self.wait_selector = wait_selector
         self.wait_selector_state = wait_selector_state
@@ -94,10 +99,8 @@ class PlaywrightEngine:
             # '--disable-extensions',
         ]
 
-    def _cdp_url_logic(self
+    def _cdp_url_logic(self) -> str:
         """Constructs new CDP URL if NSTBrowser is enabled otherwise return CDP URL as it is
-
-        :param flags: Chrome flags to be added to NSTBrowser query
         :return: CDP URL
         """
         cdp_url = self.cdp_url
@@ -106,7 +109,8 @@ class PlaywrightEngine:
             config = self.nstbrowser_config
         else:
             query = NSTBROWSER_DEFAULT_QUERY.copy()
-            if
+            if self.stealth:
+                flags = self.__set_flags()
                 query.update({
                     "args": dict(zip(flags, [''] * len(flags))),  # browser args should be a dictionary
                 })
@@ -122,78 +126,104 @@ class PlaywrightEngine:
 
         return cdp_url
 
+    @lru_cache(typed=True)
+    def __set_flags(self):
+        """Returns the flags that will be used while launching the browser if stealth mode is enabled"""
+        flags = DEFAULT_STEALTH_FLAGS
+        if self.hide_canvas:
+            flags += ('--fingerprinting-canvas-image-data-noise',)
+        if self.disable_webgl:
+            flags += ('--disable-webgl', '--disable-webgl-image-chromium', '--disable-webgl2',)
+
+        return flags
+
+    def __launch_kwargs(self):
+        """Creates the arguments we will use while launching playwright's browser"""
+        launch_kwargs = {'headless': self.headless, 'ignore_default_args': self.harmful_default_args, 'channel': 'chrome' if self.real_chrome else 'chromium'}
+        if self.stealth:
+            launch_kwargs.update({'args': self.__set_flags(), 'chromium_sandbox': True})
+
+        return launch_kwargs
+
+    def __context_kwargs(self):
+        """Creates the arguments for the browser context"""
+        context_kwargs = {
+            "proxy": self.proxy,
+            "locale": self.locale,
+            "color_scheme": 'dark',  # Bypasses the 'prefersLightColor' check in creepjs
+            "device_scale_factor": 2,
+            "extra_http_headers": self.extra_headers if self.extra_headers else {},
+            "user_agent": self.useragent if self.useragent else generate_headers(browser_mode=True).get('User-Agent'),
+        }
+        if self.stealth:
+            context_kwargs.update({
+                'is_mobile': False,
+                'has_touch': False,
+                # I'm thinking about disabling it to rest from all Service Workers headache but let's keep it as it is for now
+                'service_workers': 'allow',
+                'ignore_https_errors': True,
+                'screen': {'width': 1920, 'height': 1080},
+                'viewport': {'width': 1920, 'height': 1080},
+                'permissions': ['geolocation', 'notifications']
+            })
+
+        return context_kwargs
+
+    @lru_cache()
+    def __stealth_scripts(self):
+        # Basic bypasses nothing fancy as I'm still working on it
+        # But with adding these bypasses to the above config, it bypasses many online tests like
+        # https://bot.sannysoft.com/
+        # https://kaliiiiiiiiii.github.io/brotector/
+        # https://pixelscan.net/
+        # https://iphey.com/
+        # https://www.browserscan.net/bot-detection <== this one also checks for the CDP runtime fingerprint
+        # https://arh.antoinevastel.com/bots/areyouheadless/
+        # https://prescience-data.github.io/execution-monitor.html
+        return tuple(
+            js_bypass_path(script) for script in (
+                # Order is important
+                'webdriver_fully.js', 'window_chrome.js', 'navigator_plugins.js', 'pdf_viewer.js',
+                'notification_permission.js', 'screen_props.js', 'playwright_fingerprint.js'
+            )
+        )
+
     def fetch(self, url: str) -> Response:
         """Opens up the browser and do your request based on your chosen options.
 
         :param url: Target url.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
+        from playwright.sync_api import Response as PlaywrightResponse
         if not self.stealth or self.real_chrome:
             # Because rebrowser_playwright doesn't play well with real browsers
             from playwright.sync_api import sync_playwright
         else:
             from rebrowser_playwright.sync_api import sync_playwright
 
-
-
-        if self.useragent:
-            extra_headers = {}
-            useragent = self.useragent
-        else:
-            extra_headers = {}
-            useragent = generate_headers(browser_mode=True).get('User-Agent')
+        # Store the final response
+        final_response = None
 
-
-
-        if
-
-        if self.disable_webgl:
-            flags += ['--disable-webgl', '--disable-webgl-image-chromium', '--disable-webgl2']
+        def handle_response(finished_response: PlaywrightResponse):
+            nonlocal final_response
+            if finished_response.request.resource_type == "document":
+                final_response = finished_response
 
+        with sync_playwright() as p:
             # Creating the browser
             if self.cdp_url:
-                cdp_url = self._cdp_url_logic(
+                cdp_url = self._cdp_url_logic()
                 browser = p.chromium.connect_over_cdp(endpoint_url=cdp_url)
             else:
-
-                browser = p.chromium.launch(
-                    headless=self.headless, args=flags, ignore_default_args=self.harmful_default_args, chromium_sandbox=True, channel='chrome' if self.real_chrome else 'chromium'
-                )
-            else:
-                browser = p.chromium.launch(headless=self.headless, ignore_default_args=self.harmful_default_args, channel='chrome' if self.real_chrome else 'chromium')
-
-            # Creating the context
-            if self.stealth:
-                context = browser.new_context(
-                    locale=self.locale,
-                    is_mobile=False,
-                    has_touch=False,
-                    proxy=self.proxy,
-                    color_scheme='dark',  # Bypasses the 'prefersLightColor' check in creepjs
-                    user_agent=useragent,
-                    device_scale_factor=2,
-                    # I'm thinking about disabling it to rest from all Service Workers headache but let's keep it as it is for now
-                    service_workers="allow",
-                    ignore_https_errors=True,
-                    extra_http_headers=extra_headers,
-                    screen={"width": 1920, "height": 1080},
-                    viewport={"width": 1920, "height": 1080},
-                    permissions=["geolocation", 'notifications'],
-                )
-            else:
-                context = browser.new_context(
-                    locale=self.locale,
-                    proxy=self.proxy,
-                    color_scheme='dark',
-                    user_agent=useragent,
-                    device_scale_factor=2,
-                    extra_http_headers=extra_headers
-                )
+                browser = p.chromium.launch(**self.__launch_kwargs())
 
+            context = browser.new_context(**self.__context_kwargs())
             # Finally we are in business
             page = context.new_page()
             page.set_default_navigation_timeout(self.timeout)
             page.set_default_timeout(self.timeout)
+            # Listen for all responses
+            page.on("response", handle_response)
 
             if self.extra_headers:
                 page.set_extra_http_headers(self.extra_headers)
@@ -202,29 +232,16 @@ class PlaywrightEngine:
                 page.route("**/*", intercept_route)
 
             if self.stealth:
-
-
-
-
-                # https://pixelscan.net/
-                # https://iphey.com/
-                # https://www.browserscan.net/bot-detection <== this one also checks for the CDP runtime fingerprint
-                # https://arh.antoinevastel.com/bots/areyouheadless/
-                # https://prescience-data.github.io/execution-monitor.html
-                page.add_init_script(path=js_bypass_path('webdriver_fully.js'))
-                page.add_init_script(path=js_bypass_path('window_chrome.js'))
-                page.add_init_script(path=js_bypass_path('navigator_plugins.js'))
-                page.add_init_script(path=js_bypass_path('pdf_viewer.js'))
-                page.add_init_script(path=js_bypass_path('notification_permission.js'))
-                page.add_init_script(path=js_bypass_path('screen_props.js'))
-                page.add_init_script(path=js_bypass_path('playwright_fingerprint.js'))
-
-        res = page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
+                for script in self.__stealth_scripts():
+                    page.add_init_script(path=script)
+
+            first_response = page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
             page.wait_for_load_state(state="domcontentloaded")
             if self.network_idle:
                 page.wait_for_load_state('networkidle')
 
-
+            if self.page_action is not None:
+                page = self.page_action(page)
 
             if self.wait_selector and type(self.wait_selector) is str:
                 waiter = page.locator(self.wait_selector)
@@ -235,25 +252,112 @@ class PlaywrightEngine:
             if self.network_idle:
                 page.wait_for_load_state('networkidle')
 
+            response_bytes = final_response.body() if final_response else page.content().encode('utf-8')
+            # In case we didn't catch a document type somehow
+            final_response = final_response if final_response else first_response
             # This will be parsed inside `Response`
-            encoding =
-
-            status_text = res.status_text
+            encoding = final_response.headers.get('content-type', '') or 'utf-8'  # default encoding
             # PlayWright API sometimes give empty status text for some reason!
-
-            status_text = StatusText.get(res.status)
+            status_text = final_response.status_text or StatusText.get(final_response.status)
 
             response = Response(
-                url=
+                url=final_response.url,
                 text=page.content(),
-                body=
-                status=
+                body=response_bytes,
+                status=final_response.status,
                 reason=status_text,
                 encoding=encoding,
                 cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
-                headers=
-                request_headers=
+                headers=final_response.all_headers(),
+                request_headers=final_response.request.all_headers(),
                 **self.adaptor_arguments
             )
             page.close()
             return response
+
+    async def async_fetch(self, url: str) -> Response:
+        """Async version of `fetch`
+
+        :param url: Target url.
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
+        """
+        from playwright.async_api import Response as PlaywrightResponse
+        if not self.stealth or self.real_chrome:
+            # Because rebrowser_playwright doesn't play well with real browsers
+            from playwright.async_api import async_playwright
+        else:
+            from rebrowser_playwright.async_api import async_playwright
+
+        # Store the final response
+        final_response = None
+
+        async def handle_response(finished_response: PlaywrightResponse):
+            nonlocal final_response
+            if finished_response.request.resource_type == "document":
+                final_response = finished_response
+
+        async with async_playwright() as p:
+            # Creating the browser
+            if self.cdp_url:
+                cdp_url = self._cdp_url_logic()
+                browser = await p.chromium.connect_over_cdp(endpoint_url=cdp_url)
+            else:
+                browser = await p.chromium.launch(**self.__launch_kwargs())
+
+            context = await browser.new_context(**self.__context_kwargs())
+            # Finally we are in business
+            page = await context.new_page()
+            page.set_default_navigation_timeout(self.timeout)
+            page.set_default_timeout(self.timeout)
+            # Listen for all responses
+            page.on("response", handle_response)
+
+            if self.extra_headers:
+                await page.set_extra_http_headers(self.extra_headers)
+
+            if self.disable_resources:
+                await page.route("**/*", async_intercept_route)
+
+            if self.stealth:
+                for script in self.__stealth_scripts():
+                    await page.add_init_script(path=script)
+
+            first_response = await page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
+            await page.wait_for_load_state(state="domcontentloaded")
+            if self.network_idle:
+                await page.wait_for_load_state('networkidle')
+
+            if self.page_action is not None:
+                page = await self.page_action(page)
+
+            if self.wait_selector and type(self.wait_selector) is str:
+                waiter = page.locator(self.wait_selector)
+                await waiter.first.wait_for(state=self.wait_selector_state)
+                # Wait again after waiting for the selector, helpful with protections like Cloudflare
+                await page.wait_for_load_state(state="load")
+                await page.wait_for_load_state(state="domcontentloaded")
+                if self.network_idle:
+                    await page.wait_for_load_state('networkidle')
+
+            response_bytes = await final_response.body() if final_response else (await page.content()).encode('utf-8')
+            # In case we didn't catch a document type somehow
+            final_response = final_response if final_response else first_response
+            # This will be parsed inside `Response`
+            encoding = final_response.headers.get('content-type', '') or 'utf-8'  # default encoding
+            # PlayWright API sometimes give empty status text for some reason!
+            status_text = final_response.status_text or StatusText.get(final_response.status)
+
+            response = Response(
+                url=final_response.url,
+                text=await page.content(),
+                body=response_bytes,
+                status=final_response.status,
+                reason=status_text,
+                encoding=encoding,
+                cookies={cookie['name']: cookie['value'] for cookie in await page.context.cookies()},
+                headers=await final_response.all_headers(),
+                request_headers=await final_response.request.all_headers(),
+                **self.adaptor_arguments
+            )
+            await page.close()
+            return response
scrapling/engines/static.py
CHANGED
@@ -1,34 +1,44 @@
-import logging
-
 import httpx
 from httpx._models import Response as httpxResponse
 
-from scrapling.core._types import Dict, Optional, Union
+from scrapling.core._types import Dict, Optional, Tuple, Union
+from scrapling.core.utils import log, lru_cache
 
 from .toolbelt import Response, generate_convincing_referer, generate_headers
 
 
+@lru_cache(typed=True)
 class StaticEngine:
-    def __init__(
+    def __init__(
+            self, url: str, proxy: Optional[str] = None, stealthy_headers: Optional[bool] = True, follow_redirects: bool = True,
+            timeout: Optional[Union[int, float]] = None, retries: Optional[int] = 3, adaptor_arguments: Tuple = None
+    ):
         """An engine that utilizes httpx library, check the `Fetcher` class for more documentation.
 
+        :param url: Target url.
+        :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
+            create a referer header as if this request had came from Google's search of this URL's domain.
+        :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
         :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
         :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
         :param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
         """
+        self.url = url
+        self.proxy = proxy
+        self.stealth = stealthy_headers
         self.timeout = timeout
         self.follow_redirects = bool(follow_redirects)
+        self.retries = retries
         self._extra_headers = generate_headers(browser_mode=False)
-
+        # Because we are using `lru_cache` for a slight optimization but both dict/dict_items are not hashable so they can't be cached
+        # So my solution here was to convert it to tuple then convert it back to dictionary again here as tuples are hashable, ofc `tuple().__hash__()`
+        self.adaptor_arguments = dict(adaptor_arguments) if adaptor_arguments else {}
 
-
-    def _headers_job(headers: Optional[Dict], url: str, stealth: bool) -> Dict:
+    def _headers_job(self, headers: Optional[Dict]) -> Dict:
         """Adds useragent to headers if it doesn't exist, generates real headers and append it to current headers, and
         finally generates a referer header that looks like if this request came from Google's search of the current URL's domain.
 
         :param headers: Current headers in the request if the user passed any
-        :param url: The Target URL.
-        :param stealth: Whether stealth mode is enabled or not.
         :return: A dictionary of the new headers.
         """
         headers = headers or {}
@@ -36,12 +46,12 @@ class StaticEngine:
         # Validate headers
         if not headers.get('user-agent') and not headers.get('User-Agent'):
             headers['User-Agent'] = generate_headers(browser_mode=False).get('User-Agent')
-
+            log.debug(f"Can't find useragent in headers so '{headers['User-Agent']}' was used.")
 
-        if stealth:
+        if self.stealth:
             extra_headers = generate_headers(browser_mode=False)
             headers.update(extra_headers)
-            headers.update({'referer': generate_convincing_referer(url)})
+            headers.update({'referer': generate_convincing_referer(self.url)})
 
         return headers
 
@@ -61,69 +71,102 @@ class StaticEngine:
             cookies=dict(response.cookies),
             headers=dict(response.headers),
             request_headers=dict(response.request.headers),
+            method=response.request.method,
             **self.adaptor_arguments
         )
 
-    def get(self,
+    def get(self, **kwargs: Dict) -> Response:
         """Make basic HTTP GET request for you but with some added flavors.
 
-        :param
-        :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
-            create a referer header as if this request had came from Google's search of this URL's domain.
-        :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
-        :param kwargs: Any additional keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
+        :param kwargs: Any keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-        headers = self._headers_job(kwargs.pop('headers', {})
-        with httpx.Client(proxy=proxy) as client:
-            request = client.get(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+        headers = self._headers_job(kwargs.pop('headers', {}))
+        with httpx.Client(proxy=self.proxy, transport=httpx.HTTPTransport(retries=self.retries)) as client:
+            request = client.get(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
 
         return self._prepare_response(request)
 
-    def
+    async def async_get(self, **kwargs: Dict) -> Response:
+        """Make basic async HTTP GET request for you but with some added flavors.
+
+        :param kwargs: Any keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
+        """
+        headers = self._headers_job(kwargs.pop('headers', {}))
+        async with httpx.AsyncClient(proxy=self.proxy) as client:
+            request = await client.get(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+
+        return self._prepare_response(request)
+
+    def post(self, **kwargs: Dict) -> Response:
         """Make basic HTTP POST request for you but with some added flavors.
 
-        :param
-        :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
-            create a referer header as if this request had came from Google's search of this URL's domain.
-        :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
-        :param kwargs: Any additional keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
+        :param kwargs: Any keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-        headers = self._headers_job(kwargs.pop('headers', {})
-        with httpx.Client(proxy=proxy) as client:
-            request = client.post(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+        headers = self._headers_job(kwargs.pop('headers', {}))
+        with httpx.Client(proxy=self.proxy, transport=httpx.HTTPTransport(retries=self.retries)) as client:
+            request = client.post(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
 
         return self._prepare_response(request)
 
-    def
+    async def async_post(self, **kwargs: Dict) -> Response:
+        """Make basic async HTTP POST request for you but with some added flavors.
+
+        :param kwargs: Any keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
+        """
+        headers = self._headers_job(kwargs.pop('headers', {}))
+        async with httpx.AsyncClient(proxy=self.proxy) as client:
+            request = await client.post(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+
+        return self._prepare_response(request)
+
+    def delete(self, **kwargs: Dict) -> Response:
         """Make basic HTTP DELETE request for you but with some added flavors.
 
-        :param
-        :
-
-
-
+        :param kwargs: Any keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
+        """
+        headers = self._headers_job(kwargs.pop('headers', {}))
+        with httpx.Client(proxy=self.proxy, transport=httpx.HTTPTransport(retries=self.retries)) as client:
+            request = client.delete(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+
+        return self._prepare_response(request)
+
+    async def async_delete(self, **kwargs: Dict) -> Response:
+        """Make basic async HTTP DELETE request for you but with some added flavors.
+
+        :param kwargs: Any keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-        headers = self._headers_job(kwargs.pop('headers', {})
-        with httpx.
-            request = client.delete(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+        headers = self._headers_job(kwargs.pop('headers', {}))
+        async with httpx.AsyncClient(proxy=self.proxy) as client:
+            request = await client.delete(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
 
         return self._prepare_response(request)
 
-    def put(self,
+    def put(self, **kwargs: Dict) -> Response:
         """Make basic HTTP PUT request for you but with some added flavors.
 
-        :param
-        :
-
-
-
+        :param kwargs: Any keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
+        """
+        headers = self._headers_job(kwargs.pop('headers', {}))
+        with httpx.Client(proxy=self.proxy, transport=httpx.HTTPTransport(retries=self.retries)) as client:
+            request = client.put(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+
+        return self._prepare_response(request)
+
+    async def async_put(self, **kwargs: Dict) -> Response:
+        """Make basic async HTTP PUT request for you but with some added flavors.
+
+        :param kwargs: Any keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-        headers = self._headers_job(kwargs.pop('headers', {})
-        with httpx.
-            request = client.put(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+        headers = self._headers_job(kwargs.pop('headers', {}))
+        async with httpx.AsyncClient(proxy=self.proxy) as client:
+            request = await client.put(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
 
         return self._prepare_response(request)
scrapling/engines/toolbelt/__init__.py
CHANGED
@@ -1,6 +1,6 @@
 from .custom import (BaseFetcher, Response, StatusText, check_if_engine_usable,
-                     check_type_validity,
+                     check_type_validity, get_variable_name)
 from .fingerprints import (generate_convincing_referer, generate_headers,
                            get_os_name)
-from .navigation import (
-    intercept_route, js_bypass_path)
+from .navigation import (async_intercept_route, construct_cdp_url,
+                         construct_proxy_dict, intercept_route, js_bypass_path)