scrapling-0.2.7-py3-none-any.whl → scrapling-0.2.9-py3-none-any.whl
- scrapling/__init__.py +5 -4
- scrapling/core/_types.py +2 -3
- scrapling/core/custom_types.py +93 -11
- scrapling/core/storage_adaptors.py +9 -10
- scrapling/core/translator.py +6 -7
- scrapling/core/utils.py +35 -30
- scrapling/defaults.py +2 -1
- scrapling/engines/__init__.py +2 -2
- scrapling/engines/camo.py +96 -26
- scrapling/engines/constants.py +4 -4
- scrapling/engines/pw.py +166 -96
- scrapling/engines/static.py +94 -50
- scrapling/engines/toolbelt/__init__.py +6 -20
- scrapling/engines/toolbelt/custom.py +22 -23
- scrapling/engines/toolbelt/fingerprints.py +7 -7
- scrapling/engines/toolbelt/navigation.py +25 -12
- scrapling/fetchers.py +233 -17
- scrapling/parser.py +63 -28
- {scrapling-0.2.7.dist-info → scrapling-0.2.9.dist-info}/METADATA +41 -25
- scrapling-0.2.9.dist-info/RECORD +47 -0
- tests/fetchers/async/__init__.py +0 -0
- tests/fetchers/async/test_camoufox.py +95 -0
- tests/fetchers/async/test_httpx.py +83 -0
- tests/fetchers/async/test_playwright.py +99 -0
- tests/fetchers/sync/__init__.py +0 -0
- tests/fetchers/sync/test_camoufox.py +68 -0
- tests/fetchers/sync/test_httpx.py +82 -0
- tests/fetchers/sync/test_playwright.py +87 -0
- tests/fetchers/test_utils.py +90 -122
- tests/parser/test_automatch.py +64 -9
- tests/parser/test_general.py +263 -219
- scrapling-0.2.7.dist-info/RECORD +0 -42
- tests/fetchers/test_camoufox.py +0 -64
- tests/fetchers/test_httpx.py +0 -67
- tests/fetchers/test_playwright.py +0 -76
- {scrapling-0.2.7.dist-info → scrapling-0.2.9.dist-info}/LICENSE +0 -0
- {scrapling-0.2.7.dist-info → scrapling-0.2.9.dist-info}/WHEEL +0 -0
- {scrapling-0.2.7.dist-info → scrapling-0.2.9.dist-info}/top_level.txt +0 -0
scrapling/engines/pw.py
CHANGED
@@ -1,20 +1,16 @@
 import json
-
-from scrapling.core._types import
-
-from scrapling.engines.constants import DEFAULT_STEALTH_FLAGS,
-
-
-
-
-
-
-
-
-    check_type_validity,
-    construct_proxy_dict,
-    generate_convincing_referer,
-)
+
+from scrapling.core._types import Callable, Dict, Optional, Union
+from scrapling.core.utils import log, lru_cache
+from scrapling.engines.constants import (DEFAULT_STEALTH_FLAGS,
+                                         NSTBROWSER_DEFAULT_QUERY)
+from scrapling.engines.toolbelt import (Response, StatusText,
+                                        async_intercept_route,
+                                        check_type_validity, construct_cdp_url,
+                                        construct_proxy_dict,
+                                        generate_convincing_referer,
+                                        generate_headers, intercept_route,
+                                        js_bypass_path)
 
 
 class PlaywrightEngine:
@@ -24,7 +20,7 @@ class PlaywrightEngine:
             useragent: Optional[str] = None,
             network_idle: Optional[bool] = False,
             timeout: Optional[float] = 30000,
-            page_action: Callable =
+            page_action: Callable = None,
             wait_selector: Optional[str] = None,
             locale: Optional[str] = 'en-US',
             wait_selector_state: Optional[str] = 'attached',
@@ -79,11 +75,14 @@ class PlaywrightEngine:
         self.cdp_url = cdp_url
         self.useragent = useragent
         self.timeout = check_type_validity(timeout, [int, float], 30000)
-        if
-
+        if page_action is not None:
+            if callable(page_action):
+                self.page_action = page_action
+            else:
+                self.page_action = None
+                log.error('[Ignored] Argument "page_action" must be callable')
         else:
-            self.page_action =
-            logging.error('[Ignored] Argument "page_action" must be callable')
+            self.page_action = None
 
         self.wait_selector = wait_selector
         self.wait_selector_state = wait_selector_state
@@ -99,10 +98,8 @@ class PlaywrightEngine:
             # '--disable-extensions',
         ]
 
-    def _cdp_url_logic(self
+    def _cdp_url_logic(self) -> str:
         """Constructs new CDP URL if NSTBrowser is enabled otherwise return CDP URL as it is
-
-        :param flags: Chrome flags to be added to NSTBrowser query
         :return: CDP URL
         """
         cdp_url = self.cdp_url
@@ -111,7 +108,8 @@ class PlaywrightEngine:
             config = self.nstbrowser_config
         else:
             query = NSTBROWSER_DEFAULT_QUERY.copy()
-            if
+            if self.stealth:
+                flags = self.__set_flags()
             query.update({
                 "args": dict(zip(flags, [''] * len(flags))),  # browser args should be a dictionary
             })
@@ -127,6 +125,68 @@ class PlaywrightEngine:
 
         return cdp_url
 
+    @lru_cache(typed=True)
+    def __set_flags(self):
+        """Returns the flags that will be used while launching the browser if stealth mode is enabled"""
+        flags = DEFAULT_STEALTH_FLAGS
+        if self.hide_canvas:
+            flags += ('--fingerprinting-canvas-image-data-noise',)
+        if self.disable_webgl:
+            flags += ('--disable-webgl', '--disable-webgl-image-chromium', '--disable-webgl2',)
+
+        return flags
+
+    def __launch_kwargs(self):
+        """Creates the arguments we will use while launching playwright's browser"""
+        launch_kwargs = {'headless': self.headless, 'ignore_default_args': self.harmful_default_args, 'channel': 'chrome' if self.real_chrome else 'chromium'}
+        if self.stealth:
+            launch_kwargs.update({'args': self.__set_flags(), 'chromium_sandbox': True})
+
+        return launch_kwargs
+
+    def __context_kwargs(self):
+        """Creates the arguments for the browser context"""
+        context_kwargs = {
+            "proxy": self.proxy,
+            "locale": self.locale,
+            "color_scheme": 'dark',  # Bypasses the 'prefersLightColor' check in creepjs
+            "device_scale_factor": 2,
+            "extra_http_headers": self.extra_headers if self.extra_headers else {},
+            "user_agent": self.useragent if self.useragent else generate_headers(browser_mode=True).get('User-Agent'),
+        }
+        if self.stealth:
+            context_kwargs.update({
+                'is_mobile': False,
+                'has_touch': False,
+                # I'm thinking about disabling it to rest from all Service Workers headache but let's keep it as it is for now
+                'service_workers': 'allow',
+                'ignore_https_errors': True,
+                'screen': {'width': 1920, 'height': 1080},
+                'viewport': {'width': 1920, 'height': 1080},
+                'permissions': ['geolocation', 'notifications']
+            })
+
+        return context_kwargs
+
+    @lru_cache()
+    def __stealth_scripts(self):
+        # Basic bypasses nothing fancy as I'm still working on it
+        # But with adding these bypasses to the above config, it bypasses many online tests like
+        # https://bot.sannysoft.com/
+        # https://kaliiiiiiiiii.github.io/brotector/
+        # https://pixelscan.net/
+        # https://iphey.com/
+        # https://www.browserscan.net/bot-detection <== this one also checks for the CDP runtime fingerprint
+        # https://arh.antoinevastel.com/bots/areyouheadless/
+        # https://prescience-data.github.io/execution-monitor.html
+        return tuple(
+            js_bypass_path(script) for script in (
+                # Order is important
+                'webdriver_fully.js', 'window_chrome.js', 'navigator_plugins.js', 'pdf_viewer.js',
+                'notification_permission.js', 'screen_props.js', 'playwright_fingerprint.js'
+            )
+        )
+
     def fetch(self, url: str) -> Response:
         """Opens up the browser and do your request based on your chosen options.
 
@@ -140,61 +200,14 @@ class PlaywrightEngine:
             from rebrowser_playwright.sync_api import sync_playwright
 
         with sync_playwright() as p:
-            # Handle the UserAgent early
-            if self.useragent:
-                extra_headers = {}
-                useragent = self.useragent
-            else:
-                extra_headers = {}
-                useragent = generate_headers(browser_mode=True).get('User-Agent')
-
-            # Prepare the flags before diving
-            flags = DEFAULT_STEALTH_FLAGS
-            if self.hide_canvas:
-                flags += ['--fingerprinting-canvas-image-data-noise']
-            if self.disable_webgl:
-                flags += ['--disable-webgl', '--disable-webgl-image-chromium', '--disable-webgl2']
-
             # Creating the browser
             if self.cdp_url:
-                cdp_url = self._cdp_url_logic(
+                cdp_url = self._cdp_url_logic()
                 browser = p.chromium.connect_over_cdp(endpoint_url=cdp_url)
             else:
-
-                    browser = p.chromium.launch(
-                        headless=self.headless, args=flags, ignore_default_args=self.harmful_default_args, chromium_sandbox=True, channel='chrome' if self.real_chrome else 'chromium'
-                    )
-                else:
-                    browser = p.chromium.launch(headless=self.headless, ignore_default_args=self.harmful_default_args, channel='chrome' if self.real_chrome else 'chromium')
-
-            # Creating the context
-            if self.stealth:
-                context = browser.new_context(
-                    locale=self.locale,
-                    is_mobile=False,
-                    has_touch=False,
-                    proxy=self.proxy,
-                    color_scheme='dark',  # Bypasses the 'prefersLightColor' check in creepjs
-                    user_agent=useragent,
-                    device_scale_factor=2,
-                    # I'm thinking about disabling it to rest from all Service Workers headache but let's keep it as it is for now
-                    service_workers="allow",
-                    ignore_https_errors=True,
-                    extra_http_headers=extra_headers,
-                    screen={"width": 1920, "height": 1080},
-                    viewport={"width": 1920, "height": 1080},
-                    permissions=["geolocation", 'notifications'],
-                )
-            else:
-                context = browser.new_context(
-                    locale=self.locale,
-                    proxy=self.proxy,
-                    color_scheme='dark',
-                    user_agent=useragent,
-                    device_scale_factor=2,
-                    extra_http_headers=extra_headers
-                )
+                browser = p.chromium.launch(**self.__launch_kwargs())
 
+            context = browser.new_context(**self.__context_kwargs())
             # Finally we are in business
             page = context.new_page()
             page.set_default_navigation_timeout(self.timeout)
@@ -207,29 +220,16 @@ class PlaywrightEngine:
                 page.route("**/*", intercept_route)
 
             if self.stealth:
-
-
-                # https://bot.sannysoft.com/
-                # https://kaliiiiiiiiii.github.io/brotector/
-                # https://pixelscan.net/
-                # https://iphey.com/
-                # https://www.browserscan.net/bot-detection <== this one also checks for the CDP runtime fingerprint
-                # https://arh.antoinevastel.com/bots/areyouheadless/
-                # https://prescience-data.github.io/execution-monitor.html
-                page.add_init_script(path=js_bypass_path('webdriver_fully.js'))
-                page.add_init_script(path=js_bypass_path('window_chrome.js'))
-                page.add_init_script(path=js_bypass_path('navigator_plugins.js'))
-                page.add_init_script(path=js_bypass_path('pdf_viewer.js'))
-                page.add_init_script(path=js_bypass_path('notification_permission.js'))
-                page.add_init_script(path=js_bypass_path('screen_props.js'))
-                page.add_init_script(path=js_bypass_path('playwright_fingerprint.js'))
+                for script in self.__stealth_scripts():
+                    page.add_init_script(path=script)
 
             res = page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
             page.wait_for_load_state(state="domcontentloaded")
             if self.network_idle:
                 page.wait_for_load_state('networkidle')
 
-
+            if self.page_action is not None:
+                page = self.page_action(page)
 
             if self.wait_selector and type(self.wait_selector) is str:
                 waiter = page.locator(self.wait_selector)
@@ -242,11 +242,8 @@ class PlaywrightEngine:
 
             # This will be parsed inside `Response`
             encoding = res.headers.get('content-type', '') or 'utf-8'  # default encoding
-
-            status_text = res.status_text
             # PlayWright API sometimes give empty status text for some reason!
-
-            status_text = StatusText.get(res.status)
+            status_text = res.status_text or StatusText.get(res.status)
 
             response = Response(
                 url=res.url,
@@ -262,3 +259,76 @@ class PlaywrightEngine:
             )
             page.close()
             return response
+
+    async def async_fetch(self, url: str) -> Response:
+        """Async version of `fetch`
+
+        :param url: Target url.
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
+        """
+        if not self.stealth or self.real_chrome:
+            # Because rebrowser_playwright doesn't play well with real browsers
+            from playwright.async_api import async_playwright
+        else:
+            from rebrowser_playwright.async_api import async_playwright
+
+        async with async_playwright() as p:
+            # Creating the browser
+            if self.cdp_url:
+                cdp_url = self._cdp_url_logic()
+                browser = await p.chromium.connect_over_cdp(endpoint_url=cdp_url)
+            else:
+                browser = await p.chromium.launch(**self.__launch_kwargs())
+
+            context = await browser.new_context(**self.__context_kwargs())
+            # Finally we are in business
+            page = await context.new_page()
+            page.set_default_navigation_timeout(self.timeout)
+            page.set_default_timeout(self.timeout)
+
+            if self.extra_headers:
+                await page.set_extra_http_headers(self.extra_headers)
+
+            if self.disable_resources:
+                await page.route("**/*", async_intercept_route)
+
+            if self.stealth:
+                for script in self.__stealth_scripts():
+                    await page.add_init_script(path=script)
+
+            res = await page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
+            await page.wait_for_load_state(state="domcontentloaded")
+            if self.network_idle:
+                await page.wait_for_load_state('networkidle')
+
+            if self.page_action is not None:
+                page = await self.page_action(page)
+
+            if self.wait_selector and type(self.wait_selector) is str:
+                waiter = page.locator(self.wait_selector)
+                await waiter.first.wait_for(state=self.wait_selector_state)
+                # Wait again after waiting for the selector, helpful with protections like Cloudflare
+                await page.wait_for_load_state(state="load")
+                await page.wait_for_load_state(state="domcontentloaded")
+                if self.network_idle:
+                    await page.wait_for_load_state('networkidle')
+
+            # This will be parsed inside `Response`
+            encoding = res.headers.get('content-type', '') or 'utf-8'  # default encoding
+            # PlayWright API sometimes give empty status text for some reason!
+            status_text = res.status_text or StatusText.get(res.status)
+
+            response = Response(
+                url=res.url,
+                text=await page.content(),
+                body=(await page.content()).encode('utf-8'),
+                status=res.status,
+                reason=status_text,
+                encoding=encoding,
+                cookies={cookie['name']: cookie['value'] for cookie in await page.context.cookies()},
+                headers=await res.all_headers(),
+                request_headers=await res.request.all_headers(),
+                **self.adaptor_arguments
+            )
+            await page.close()
+            return response
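The practical upshot of the pw.py changes for callers: `page_action` now defaults to `None` and must be a callable that receives the Playwright page and returns it (anything else is ignored with a logged error), and the engine gains an `async_fetch` twin of `fetch` built on the same private `__launch_kwargs`/`__context_kwargs` helpers. A minimal sketch of driving the engine directly, assuming the remaining constructor arguments keep their defaults; the callback name, CSS selector, and URL are placeholders:

    import asyncio

    from scrapling.engines.pw import PlaywrightEngine

    def accept_cookies(page):
        # `page_action` must be a callable taking the Playwright page and returning it;
        # non-callables are dropped with a logged error as of this release.
        page.click('#accept-cookies')  # hypothetical selector
        return page

    engine = PlaywrightEngine(timeout=30000, page_action=accept_cookies)
    response = engine.fetch('https://example.com')

    # The async path awaits `page_action`, so pass an `async def` callback there (or none at all).
    async_response = asyncio.run(PlaywrightEngine(timeout=30000).async_fetch('https://example.com'))
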
scrapling/engines/static.py
CHANGED
@@ -1,33 +1,44 @@
-import logging
-
-from scrapling.core._types import Union, Optional, Dict
-from .toolbelt import Response, generate_convincing_referer, generate_headers
-
 import httpx
 from httpx._models import Response as httpxResponse
 
+from scrapling.core._types import Dict, Optional, Tuple, Union
+from scrapling.core.utils import log, lru_cache
 
+from .toolbelt import Response, generate_convincing_referer, generate_headers
+
+
+@lru_cache(typed=True)
 class StaticEngine:
-    def __init__(
+    def __init__(
+            self, url: str, proxy: Optional[str] = None, stealthy_headers: Optional[bool] = True, follow_redirects: bool = True,
+            timeout: Optional[Union[int, float]] = None, retries: Optional[int] = 3, adaptor_arguments: Tuple = None
+    ):
        """An engine that utilizes httpx library, check the `Fetcher` class for more documentation.
 
+        :param url: Target url.
+        :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
+            create a referer header as if this request had came from Google's search of this URL's domain.
+        :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
        :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
        :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
        :param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
        """
+        self.url = url
+        self.proxy = proxy
+        self.stealth = stealthy_headers
         self.timeout = timeout
         self.follow_redirects = bool(follow_redirects)
+        self.retries = retries
         self._extra_headers = generate_headers(browser_mode=False)
-
+        # Because we are using `lru_cache` for a slight optimization but both dict/dict_items are not hashable so they can't be cached
+        # So my solution here was to convert it to tuple then convert it back to dictionary again here as tuples are hashable, ofc `tuple().__hash__()`
+        self.adaptor_arguments = dict(adaptor_arguments) if adaptor_arguments else {}
 
-
-    def _headers_job(headers: Optional[Dict], url: str, stealth: bool) -> Dict:
+    def _headers_job(self, headers: Optional[Dict]) -> Dict:
         """Adds useragent to headers if it doesn't exist, generates real headers and append it to current headers, and
         finally generates a referer header that looks like if this request came from Google's search of the current URL's domain.
 
         :param headers: Current headers in the request if the user passed any
-        :param url: The Target URL.
-        :param stealth: Whether stealth mode is enabled or not.
         :return: A dictionary of the new headers.
         """
         headers = headers or {}
@@ -35,12 +46,12 @@ class StaticEngine:
         # Validate headers
         if not headers.get('user-agent') and not headers.get('User-Agent'):
             headers['User-Agent'] = generate_headers(browser_mode=False).get('User-Agent')
-
+            log.debug(f"Can't find useragent in headers so '{headers['User-Agent']}' was used.")
 
-        if stealth:
+        if self.stealth:
             extra_headers = generate_headers(browser_mode=False)
             headers.update(extra_headers)
-            headers.update({'referer': generate_convincing_referer(url)})
+            headers.update({'referer': generate_convincing_referer(self.url)})
 
         return headers
 
@@ -60,69 +71,102 @@ class StaticEngine:
             cookies=dict(response.cookies),
             headers=dict(response.headers),
             request_headers=dict(response.request.headers),
+            method=response.request.method,
             **self.adaptor_arguments
         )
 
-    def get(self,
+    def get(self, **kwargs: Dict) -> Response:
         """Make basic HTTP GET request for you but with some added flavors.
 
-        :param
-        :
-
-
-
+        :param kwargs: Any keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
+        """
+        headers = self._headers_job(kwargs.pop('headers', {}))
+        with httpx.Client(proxy=self.proxy, transport=httpx.HTTPTransport(retries=self.retries)) as client:
+            request = client.get(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+
+        return self._prepare_response(request)
+
+    async def async_get(self, **kwargs: Dict) -> Response:
+        """Make basic async HTTP GET request for you but with some added flavors.
+
+        :param kwargs: Any keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
        """
-        headers = self._headers_job(kwargs.pop('headers', {})
-        with httpx.
-            request = client.get(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+        headers = self._headers_job(kwargs.pop('headers', {}))
+        async with httpx.AsyncClient(proxy=self.proxy) as client:
+            request = await client.get(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
 
         return self._prepare_response(request)
 
-    def post(self,
+    def post(self, **kwargs: Dict) -> Response:
         """Make basic HTTP POST request for you but with some added flavors.
 
-        :param
-        :
-
-
-
+        :param kwargs: Any keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
+        """
+        headers = self._headers_job(kwargs.pop('headers', {}))
+        with httpx.Client(proxy=self.proxy, transport=httpx.HTTPTransport(retries=self.retries)) as client:
+            request = client.post(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+
+        return self._prepare_response(request)
+
+    async def async_post(self, **kwargs: Dict) -> Response:
+        """Make basic async HTTP POST request for you but with some added flavors.
+
+        :param kwargs: Any keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
        """
-        headers = self._headers_job(kwargs.pop('headers', {})
-        with httpx.
-            request = client.post(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+        headers = self._headers_job(kwargs.pop('headers', {}))
+        async with httpx.AsyncClient(proxy=self.proxy) as client:
+            request = await client.post(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
 
         return self._prepare_response(request)
 
-    def delete(self,
+    def delete(self, **kwargs: Dict) -> Response:
         """Make basic HTTP DELETE request for you but with some added flavors.
 
-        :param
-        :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
-            create a referer header as if this request had came from Google's search of this URL's domain.
-        :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
-        :param kwargs: Any additional keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
+        :param kwargs: Any keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
        """
-        headers = self._headers_job(kwargs.pop('headers', {})
-        with httpx.Client(proxy=proxy) as client:
-            request = client.delete(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+        headers = self._headers_job(kwargs.pop('headers', {}))
+        with httpx.Client(proxy=self.proxy, transport=httpx.HTTPTransport(retries=self.retries)) as client:
+            request = client.delete(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
 
         return self._prepare_response(request)
 
-    def
+    async def async_delete(self, **kwargs: Dict) -> Response:
+        """Make basic async HTTP DELETE request for you but with some added flavors.
+
+        :param kwargs: Any keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
+        """
+        headers = self._headers_job(kwargs.pop('headers', {}))
+        async with httpx.AsyncClient(proxy=self.proxy) as client:
+            request = await client.delete(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+
+        return self._prepare_response(request)
+
+    def put(self, **kwargs: Dict) -> Response:
         """Make basic HTTP PUT request for you but with some added flavors.
 
-        :param
-        :
-
-
-
+        :param kwargs: Any keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
+        """
+        headers = self._headers_job(kwargs.pop('headers', {}))
+        with httpx.Client(proxy=self.proxy, transport=httpx.HTTPTransport(retries=self.retries)) as client:
+            request = client.put(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+
+        return self._prepare_response(request)
+
+    async def async_put(self, **kwargs: Dict) -> Response:
+        """Make basic async HTTP PUT request for you but with some added flavors.
+
+        :param kwargs: Any keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
        """
-        headers = self._headers_job(kwargs.pop('headers', {})
-        with httpx.
-            request = client.put(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+        headers = self._headers_job(kwargs.pop('headers', {}))
+        async with httpx.AsyncClient(proxy=self.proxy) as client:
+            request = await client.put(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
 
         return self._prepare_response(request)
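For downstream code, the notable static.py shift is that `StaticEngine` is now constructed once per target (`url`, `proxy`, `retries` live on the instance) behind `@lru_cache(typed=True)`, which is why `adaptor_arguments` arrives as a tuple of items (dicts are not hashable) and is rebuilt with `dict()` inside `__init__`, and why every HTTP verb gains an async twin. A minimal sketch of that round-trip, assuming the engine is used directly; the URL and the Adaptor option are placeholders:

    import asyncio

    from scrapling.engines.static import StaticEngine

    # dicts can't be part of an lru_cache key, so pass the Adaptor options as a tuple of items
    adaptor_arguments = tuple({'huge_tree': True}.items())  # illustrative Adaptor option

    engine = StaticEngine('https://example.com', timeout=10, retries=3,
                          adaptor_arguments=adaptor_arguments)
    page = engine.get()                     # sync, now with httpx retries and the stored proxy/url
    page = asyncio.run(engine.async_get())  # async twin added in this release
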
scrapling/engines/toolbelt/__init__.py
CHANGED
@@ -1,20 +1,6 @@
-from .
-
-
-
-
-
-    Response,
-    do_nothing,
-    StatusText,
-    BaseFetcher,
-    get_variable_name,
-    check_type_validity,
-    check_if_engine_usable,
-)
-from .navigation import (
-    js_bypass_path,
-    intercept_route,
-    construct_cdp_url,
-    construct_proxy_dict,
-)
+from .custom import (BaseFetcher, Response, StatusText, check_if_engine_usable,
+                     check_type_validity, get_variable_name)
+from .fingerprints import (generate_convincing_referer, generate_headers,
+                           get_os_name)
+from .navigation import (async_intercept_route, construct_cdp_url,
+                         construct_proxy_dict, intercept_route, js_bypass_path)