scrapling 0.2.8__py3-none-any.whl → 0.2.91__py3-none-any.whl

Files changed (37)
  1. scrapling/__init__.py +4 -4
  2. scrapling/core/_types.py +2 -0
  3. scrapling/core/custom_types.py +88 -6
  4. scrapling/core/storage_adaptors.py +5 -6
  5. scrapling/core/translator.py +2 -2
  6. scrapling/core/utils.py +29 -27
  7. scrapling/defaults.py +2 -1
  8. scrapling/engines/camo.py +124 -24
  9. scrapling/engines/constants.py +4 -4
  10. scrapling/engines/pw.py +195 -91
  11. scrapling/engines/static.py +91 -48
  12. scrapling/engines/toolbelt/__init__.py +3 -3
  13. scrapling/engines/toolbelt/custom.py +16 -22
  14. scrapling/engines/toolbelt/fingerprints.py +3 -3
  15. scrapling/engines/toolbelt/navigation.py +21 -8
  16. scrapling/fetchers.py +231 -16
  17. scrapling/parser.py +50 -22
  18. {scrapling-0.2.8.dist-info → scrapling-0.2.91.dist-info}/METADATA +33 -18
  19. scrapling-0.2.91.dist-info/RECORD +47 -0
  20. tests/fetchers/async/__init__.py +0 -0
  21. tests/fetchers/async/test_camoufox.py +95 -0
  22. tests/fetchers/async/test_httpx.py +83 -0
  23. tests/fetchers/async/test_playwright.py +99 -0
  24. tests/fetchers/sync/__init__.py +0 -0
  25. tests/fetchers/sync/test_camoufox.py +68 -0
  26. tests/fetchers/sync/test_httpx.py +82 -0
  27. tests/fetchers/sync/test_playwright.py +87 -0
  28. tests/fetchers/test_utils.py +90 -122
  29. tests/parser/test_automatch.py +64 -9
  30. tests/parser/test_general.py +260 -218
  31. scrapling-0.2.8.dist-info/RECORD +0 -42
  32. tests/fetchers/test_camoufox.py +0 -65
  33. tests/fetchers/test_httpx.py +0 -68
  34. tests/fetchers/test_playwright.py +0 -77
  35. {scrapling-0.2.8.dist-info → scrapling-0.2.91.dist-info}/LICENSE +0 -0
  36. {scrapling-0.2.8.dist-info → scrapling-0.2.91.dist-info}/WHEEL +0 -0
  37. {scrapling-0.2.8.dist-info → scrapling-0.2.91.dist-info}/top_level.txt +0 -0
scrapling/engines/pw.py CHANGED
@@ -1,12 +1,14 @@
 import json
-import logging
 
-from scrapling.core._types import Callable, Dict, List, Optional, Union
+from scrapling.core._types import (Callable, Dict, Optional,
+                                   SelectorWaitStates, Union)
+from scrapling.core.utils import log, lru_cache
 from scrapling.engines.constants import (DEFAULT_STEALTH_FLAGS,
                                          NSTBROWSER_DEFAULT_QUERY)
 from scrapling.engines.toolbelt import (Response, StatusText,
+                                        async_intercept_route,
                                         check_type_validity, construct_cdp_url,
-                                        construct_proxy_dict, do_nothing,
+                                        construct_proxy_dict,
                                         generate_convincing_referer,
                                         generate_headers, intercept_route,
                                         js_bypass_path)
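
`SelectorWaitStates` is new in `scrapling/core/_types.py` (the +2 lines in that file's diff above). Its definition isn't shown here, but a plausible shape, matching Playwright's locator wait states, is:

# Hypothetical sketch only; the real alias lives in scrapling/core/_types.py
# and is not shown in this diff.
from typing import Literal

SelectorWaitStates = Literal["attached", "detached", "hidden", "visible"]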
@@ -19,10 +21,10 @@ class PlaywrightEngine:
         useragent: Optional[str] = None,
         network_idle: Optional[bool] = False,
         timeout: Optional[float] = 30000,
-        page_action: Callable = do_nothing,
+        page_action: Callable = None,
         wait_selector: Optional[str] = None,
         locale: Optional[str] = 'en-US',
-        wait_selector_state: Optional[str] = 'attached',
+        wait_selector_state: SelectorWaitStates = 'attached',
         stealth: Optional[bool] = False,
         real_chrome: Optional[bool] = False,
         hide_canvas: Optional[bool] = False,
@@ -74,11 +76,14 @@ class PlaywrightEngine:
         self.cdp_url = cdp_url
         self.useragent = useragent
         self.timeout = check_type_validity(timeout, [int, float], 30000)
-        if callable(page_action):
-            self.page_action = page_action
+        if page_action is not None:
+            if callable(page_action):
+                self.page_action = page_action
+            else:
+                self.page_action = None
+                log.error('[Ignored] Argument "page_action" must be callable')
         else:
-            self.page_action = do_nothing
-            logging.error('[Ignored] Argument "page_action" must be callable')
+            self.page_action = None
 
         self.wait_selector = wait_selector
         self.wait_selector_state = wait_selector_state
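
The `page_action` default moves from the removed `do_nothing` helper to `None`, with validation kept in the constructor. A minimal usage sketch of the new contract (the `scroll_down` callback is hypothetical; it must accept and return the Playwright page):

from scrapling.engines.pw import PlaywrightEngine

def scroll_down(page):
    # Hypothetical callback: runs after navigation and must return the page
    page.mouse.wheel(delta_x=0, delta_y=1000)
    return page

engine = PlaywrightEngine(page_action=scroll_down)  # callable: stored and used
engine = PlaywrightEngine(page_action="oops")       # non-callable: set to None, error logged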
@@ -94,10 +99,8 @@ class PlaywrightEngine:
             # '--disable-extensions',
         ]
 
-    def _cdp_url_logic(self, flags: Optional[List] = None) -> str:
+    def _cdp_url_logic(self) -> str:
         """Constructs new CDP URL if NSTBrowser is enabled otherwise return CDP URL as it is
-
-        :param flags: Chrome flags to be added to NSTBrowser query
         :return: CDP URL
         """
         cdp_url = self.cdp_url
@@ -106,7 +109,8 @@ class PlaywrightEngine:
             config = self.nstbrowser_config
         else:
             query = NSTBROWSER_DEFAULT_QUERY.copy()
-            if flags:
+            if self.stealth:
+                flags = self.__set_flags()
                 query.update({
                     "args": dict(zip(flags, [''] * len(flags))),  # browser args should be a dictionary
                 })
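
The `dict(zip(flags, [''] * len(flags)))` idiom exists because, per the inline comment, NSTBrowser expects browser args as a mapping rather than a list: each flag becomes a key with an empty value. With illustrative values:

# Illustrative flags only; the real ones come from __set_flags()
flags = ('--disable-webgl', '--disable-webgl2')
args = dict(zip(flags, [''] * len(flags)))
assert args == {'--disable-webgl': '', '--disable-webgl2': ''}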
@@ -122,78 +126,104 @@ class PlaywrightEngine:
 
         return cdp_url
 
+    @lru_cache(typed=True)
+    def __set_flags(self):
+        """Returns the flags that will be used while launching the browser if stealth mode is enabled"""
+        flags = DEFAULT_STEALTH_FLAGS
+        if self.hide_canvas:
+            flags += ('--fingerprinting-canvas-image-data-noise',)
+        if self.disable_webgl:
+            flags += ('--disable-webgl', '--disable-webgl-image-chromium', '--disable-webgl2',)
+
+        return flags
+
+    def __launch_kwargs(self):
+        """Creates the arguments we will use while launching playwright's browser"""
+        launch_kwargs = {'headless': self.headless, 'ignore_default_args': self.harmful_default_args, 'channel': 'chrome' if self.real_chrome else 'chromium'}
+        if self.stealth:
+            launch_kwargs.update({'args': self.__set_flags(), 'chromium_sandbox': True})
+
+        return launch_kwargs
+
+    def __context_kwargs(self):
+        """Creates the arguments for the browser context"""
+        context_kwargs = {
+            "proxy": self.proxy,
+            "locale": self.locale,
+            "color_scheme": 'dark',  # Bypasses the 'prefersLightColor' check in creepjs
+            "device_scale_factor": 2,
+            "extra_http_headers": self.extra_headers if self.extra_headers else {},
+            "user_agent": self.useragent if self.useragent else generate_headers(browser_mode=True).get('User-Agent'),
+        }
+        if self.stealth:
+            context_kwargs.update({
+                'is_mobile': False,
+                'has_touch': False,
+                # I'm thinking about disabling it to rest from all Service Workers headache but let's keep it as it is for now
+                'service_workers': 'allow',
+                'ignore_https_errors': True,
+                'screen': {'width': 1920, 'height': 1080},
+                'viewport': {'width': 1920, 'height': 1080},
+                'permissions': ['geolocation', 'notifications']
+            })
+
+        return context_kwargs
+
+    @lru_cache()
+    def __stealth_scripts(self):
+        # Basic bypasses nothing fancy as I'm still working on it
+        # But with adding these bypasses to the above config, it bypasses many online tests like
+        # https://bot.sannysoft.com/
+        # https://kaliiiiiiiiii.github.io/brotector/
+        # https://pixelscan.net/
+        # https://iphey.com/
+        # https://www.browserscan.net/bot-detection <== this one also checks for the CDP runtime fingerprint
+        # https://arh.antoinevastel.com/bots/areyouheadless/
+        # https://prescience-data.github.io/execution-monitor.html
+        return tuple(
+            js_bypass_path(script) for script in (
+                # Order is important
+                'webdriver_fully.js', 'window_chrome.js', 'navigator_plugins.js', 'pdf_viewer.js',
+                'notification_permission.js', 'screen_props.js', 'playwright_fingerprint.js'
+            )
+        )
+
     def fetch(self, url: str) -> Response:
         """Opens up the browser and do your request based on your chosen options.
 
         :param url: Target url.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
+        from playwright.sync_api import Response as PlaywrightResponse
         if not self.stealth or self.real_chrome:
             # Because rebrowser_playwright doesn't play well with real browsers
             from playwright.sync_api import sync_playwright
         else:
             from rebrowser_playwright.sync_api import sync_playwright
 
-        with sync_playwright() as p:
-            # Handle the UserAgent early
-            if self.useragent:
-                extra_headers = {}
-                useragent = self.useragent
-            else:
-                extra_headers = {}
-                useragent = generate_headers(browser_mode=True).get('User-Agent')
+        # Store the final response
+        final_response = None
 
-            # Prepare the flags before diving
-            flags = DEFAULT_STEALTH_FLAGS
-            if self.hide_canvas:
-                flags += ['--fingerprinting-canvas-image-data-noise']
-            if self.disable_webgl:
-                flags += ['--disable-webgl', '--disable-webgl-image-chromium', '--disable-webgl2']
+        def handle_response(finished_response: PlaywrightResponse):
+            nonlocal final_response
+            if finished_response.request.resource_type == "document":
+                final_response = finished_response
 
+        with sync_playwright() as p:
             # Creating the browser
             if self.cdp_url:
-                cdp_url = self._cdp_url_logic(flags if self.stealth else None)
+                cdp_url = self._cdp_url_logic()
                 browser = p.chromium.connect_over_cdp(endpoint_url=cdp_url)
             else:
-                if self.stealth:
-                    browser = p.chromium.launch(
-                        headless=self.headless, args=flags, ignore_default_args=self.harmful_default_args, chromium_sandbox=True, channel='chrome' if self.real_chrome else 'chromium'
-                    )
-                else:
-                    browser = p.chromium.launch(headless=self.headless, ignore_default_args=self.harmful_default_args, channel='chrome' if self.real_chrome else 'chromium')
-
-            # Creating the context
-            if self.stealth:
-                context = browser.new_context(
-                    locale=self.locale,
-                    is_mobile=False,
-                    has_touch=False,
-                    proxy=self.proxy,
-                    color_scheme='dark',  # Bypasses the 'prefersLightColor' check in creepjs
-                    user_agent=useragent,
-                    device_scale_factor=2,
-                    # I'm thinking about disabling it to rest from all Service Workers headache but let's keep it as it is for now
-                    service_workers="allow",
-                    ignore_https_errors=True,
-                    extra_http_headers=extra_headers,
-                    screen={"width": 1920, "height": 1080},
-                    viewport={"width": 1920, "height": 1080},
-                    permissions=["geolocation", 'notifications'],
-                )
-            else:
-                context = browser.new_context(
-                    locale=self.locale,
-                    proxy=self.proxy,
-                    color_scheme='dark',
-                    user_agent=useragent,
-                    device_scale_factor=2,
-                    extra_http_headers=extra_headers
-                )
+                browser = p.chromium.launch(**self.__launch_kwargs())
 
+            context = browser.new_context(**self.__context_kwargs())
             # Finally we are in business
             page = context.new_page()
             page.set_default_navigation_timeout(self.timeout)
             page.set_default_timeout(self.timeout)
+            # Listen for all responses
+            page.on("response", handle_response)
 
             if self.extra_headers:
                 page.set_extra_http_headers(self.extra_headers)
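
The new `page.on("response", handle_response)` listener exists because `page.goto()` returns only the first response (which may be a redirect hop or a challenge page), while the listener keeps overwriting `final_response` with the latest `document` response, i.e. the page the browser actually ended up on. The same pattern, reduced to a self-contained sketch:

from playwright.sync_api import sync_playwright

def fetch_final_status(url: str) -> int:
    # Reduced sketch of the listener pattern used in the hunk above
    final_response = None

    def handle_response(finished_response):
        nonlocal final_response
        if finished_response.request.resource_type == "document":
            final_response = finished_response  # the last document response wins

    with sync_playwright() as p:
        browser = p.chromium.launch()
        page = browser.new_page()
        page.on("response", handle_response)
        first_response = page.goto(url)  # may be a redirect hop
        page.wait_for_load_state(state="domcontentloaded")
        # Fall back to goto()'s return value if no document response was caught
        result = final_response or first_response
        status = result.status
        browser.close()
        return status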
@@ -202,29 +232,16 @@ class PlaywrightEngine:
                 page.route("**/*", intercept_route)
 
             if self.stealth:
-                # Basic bypasses nothing fancy as I'm still working on it
-                # But with adding these bypasses to the above config, it bypasses many online tests like
-                # https://bot.sannysoft.com/
-                # https://kaliiiiiiiiii.github.io/brotector/
-                # https://pixelscan.net/
-                # https://iphey.com/
-                # https://www.browserscan.net/bot-detection <== this one also checks for the CDP runtime fingerprint
-                # https://arh.antoinevastel.com/bots/areyouheadless/
-                # https://prescience-data.github.io/execution-monitor.html
-                page.add_init_script(path=js_bypass_path('webdriver_fully.js'))
-                page.add_init_script(path=js_bypass_path('window_chrome.js'))
-                page.add_init_script(path=js_bypass_path('navigator_plugins.js'))
-                page.add_init_script(path=js_bypass_path('pdf_viewer.js'))
-                page.add_init_script(path=js_bypass_path('notification_permission.js'))
-                page.add_init_script(path=js_bypass_path('screen_props.js'))
-                page.add_init_script(path=js_bypass_path('playwright_fingerprint.js'))
-
-            res = page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
+                for script in self.__stealth_scripts():
+                    page.add_init_script(path=script)
+
+            first_response = page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
             page.wait_for_load_state(state="domcontentloaded")
             if self.network_idle:
                 page.wait_for_load_state('networkidle')
 
-            page = self.page_action(page)
+            if self.page_action is not None:
+                page = self.page_action(page)
 
             if self.wait_selector and type(self.wait_selector) is str:
                 waiter = page.locator(self.wait_selector)
@@ -235,25 +252,112 @@ class PlaywrightEngine:
             if self.network_idle:
                 page.wait_for_load_state('networkidle')
 
+            response_bytes = final_response.body() if final_response else page.content().encode('utf-8')
+            # In case we didn't catch a document type somehow
+            final_response = final_response if final_response else first_response
             # This will be parsed inside `Response`
-            encoding = res.headers.get('content-type', '') or 'utf-8'  # default encoding
-
-            status_text = res.status_text
+            encoding = final_response.headers.get('content-type', '') or 'utf-8'  # default encoding
             # PlayWright API sometimes give empty status text for some reason!
-            if not status_text:
-                status_text = StatusText.get(res.status)
+            status_text = final_response.status_text or StatusText.get(final_response.status)
 
             response = Response(
-                url=res.url,
+                url=final_response.url,
                 text=page.content(),
-                body=page.content().encode('utf-8'),
-                status=res.status,
+                body=response_bytes,
+                status=final_response.status,
                 reason=status_text,
                 encoding=encoding,
                 cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
-                headers=res.all_headers(),
-                request_headers=res.request.all_headers(),
+                headers=final_response.all_headers(),
+                request_headers=final_response.request.all_headers(),
                 **self.adaptor_arguments
             )
             page.close()
             return response
+
+    async def async_fetch(self, url: str) -> Response:
+        """Async version of `fetch`
+
+        :param url: Target url.
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
+        """
+        from playwright.async_api import Response as PlaywrightResponse
+        if not self.stealth or self.real_chrome:
+            # Because rebrowser_playwright doesn't play well with real browsers
+            from playwright.async_api import async_playwright
+        else:
+            from rebrowser_playwright.async_api import async_playwright
+
+        # Store the final response
+        final_response = None
+
+        async def handle_response(finished_response: PlaywrightResponse):
+            nonlocal final_response
+            if finished_response.request.resource_type == "document":
+                final_response = finished_response
+
+        async with async_playwright() as p:
+            # Creating the browser
+            if self.cdp_url:
+                cdp_url = self._cdp_url_logic()
+                browser = await p.chromium.connect_over_cdp(endpoint_url=cdp_url)
+            else:
+                browser = await p.chromium.launch(**self.__launch_kwargs())
+
+            context = await browser.new_context(**self.__context_kwargs())
+            # Finally we are in business
+            page = await context.new_page()
+            page.set_default_navigation_timeout(self.timeout)
+            page.set_default_timeout(self.timeout)
+            # Listen for all responses
+            page.on("response", handle_response)
+
+            if self.extra_headers:
+                await page.set_extra_http_headers(self.extra_headers)
+
+            if self.disable_resources:
+                await page.route("**/*", async_intercept_route)
+
+            if self.stealth:
+                for script in self.__stealth_scripts():
+                    await page.add_init_script(path=script)
+
+            first_response = await page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
+            await page.wait_for_load_state(state="domcontentloaded")
+            if self.network_idle:
+                await page.wait_for_load_state('networkidle')
+
+            if self.page_action is not None:
+                page = await self.page_action(page)
+
+            if self.wait_selector and type(self.wait_selector) is str:
+                waiter = page.locator(self.wait_selector)
+                await waiter.first.wait_for(state=self.wait_selector_state)
+                # Wait again after waiting for the selector, helpful with protections like Cloudflare
+                await page.wait_for_load_state(state="load")
+                await page.wait_for_load_state(state="domcontentloaded")
+                if self.network_idle:
+                    await page.wait_for_load_state('networkidle')
+
+            response_bytes = await final_response.body() if final_response else (await page.content()).encode('utf-8')
+            # In case we didn't catch a document type somehow
+            final_response = final_response if final_response else first_response
+            # This will be parsed inside `Response`
+            encoding = final_response.headers.get('content-type', '') or 'utf-8'  # default encoding
+            # PlayWright API sometimes give empty status text for some reason!
+            status_text = final_response.status_text or StatusText.get(final_response.status)
+
+            response = Response(
+                url=final_response.url,
+                text=await page.content(),
+                body=response_bytes,
+                status=final_response.status,
+                reason=status_text,
+                encoding=encoding,
+                cookies={cookie['name']: cookie['value'] for cookie in await page.context.cookies()},
+                headers=await final_response.all_headers(),
+                request_headers=await final_response.request.all_headers(),
+                **self.adaptor_arguments
+            )
+            await page.close()
+            return response
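
With `async_fetch` mirroring `fetch` step for step, the engine can now run inside an event loop. A hedged usage sketch, assuming the constructor's defaults suffice as the hunks above suggest:

import asyncio

from scrapling.engines.pw import PlaywrightEngine

async def main():
    engine = PlaywrightEngine()
    response = await engine.async_fetch('https://example.com')
    print(response.status, response.reason)

asyncio.run(main())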
scrapling/engines/static.py CHANGED
@@ -1,34 +1,44 @@
-import logging
-
 import httpx
 from httpx._models import Response as httpxResponse
 
-from scrapling.core._types import Dict, Optional, Union
+from scrapling.core._types import Dict, Optional, Tuple, Union
+from scrapling.core.utils import log, lru_cache
 
 from .toolbelt import Response, generate_convincing_referer, generate_headers
 
 
+@lru_cache(typed=True)
 class StaticEngine:
-    def __init__(self, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = None, adaptor_arguments: Dict = None):
+    def __init__(
+            self, url: str, proxy: Optional[str] = None, stealthy_headers: Optional[bool] = True, follow_redirects: bool = True,
+            timeout: Optional[Union[int, float]] = None, retries: Optional[int] = 3, adaptor_arguments: Tuple = None
+    ):
         """An engine that utilizes httpx library, check the `Fetcher` class for more documentation.
 
+        :param url: Target url.
+        :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
+            create a referer header as if this request had came from Google's search of this URL's domain.
+        :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
         :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
         :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
         :param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
         """
+        self.url = url
+        self.proxy = proxy
+        self.stealth = stealthy_headers
         self.timeout = timeout
         self.follow_redirects = bool(follow_redirects)
+        self.retries = retries
         self._extra_headers = generate_headers(browser_mode=False)
-        self.adaptor_arguments = adaptor_arguments if adaptor_arguments else {}
+        # Because we are using `lru_cache` for a slight optimization but both dict/dict_items are not hashable so they can't be cached
+        # So my solution here was to convert it to tuple then convert it back to dictionary again here as tuples are hashable, ofc `tuple().__hash__()`
+        self.adaptor_arguments = dict(adaptor_arguments) if adaptor_arguments else {}
 
-    @staticmethod
-    def _headers_job(headers: Optional[Dict], url: str, stealth: bool) -> Dict:
+    def _headers_job(self, headers: Optional[Dict]) -> Dict:
         """Adds useragent to headers if it doesn't exist, generates real headers and append it to current headers, and
         finally generates a referer header that looks like if this request came from Google's search of the current URL's domain.
 
         :param headers: Current headers in the request if the user passed any
-        :param url: The Target URL.
-        :param stealth: Whether stealth mode is enabled or not.
         :return: A dictionary of the new headers.
         """
         headers = headers or {}
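
Why `adaptor_arguments` changed from `Dict` to `Tuple`: the class is now wrapped in `@lru_cache(typed=True)`, and every constructor argument must be hashable to serve as a cache key. A dict isn't; a tuple of its items is, and it round-trips losslessly:

adaptor_arguments = {'encoding': 'utf-8'}   # hypothetical arguments
as_key = tuple(adaptor_arguments.items())   # hashable, so lru_cache can key on it
assert dict(as_key) == adaptor_arguments    # __init__ converts it back, as above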
@@ -36,12 +46,12 @@ class StaticEngine:
         # Validate headers
         if not headers.get('user-agent') and not headers.get('User-Agent'):
             headers['User-Agent'] = generate_headers(browser_mode=False).get('User-Agent')
-            logging.info(f"Can't find useragent in headers so '{headers['User-Agent']}' was used.")
+            log.debug(f"Can't find useragent in headers so '{headers['User-Agent']}' was used.")
 
-        if stealth:
+        if self.stealth:
             extra_headers = generate_headers(browser_mode=False)
             headers.update(extra_headers)
-            headers.update({'referer': generate_convincing_referer(url)})
+            headers.update({'referer': generate_convincing_referer(self.url)})
 
         return headers
 
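With `_headers_job` now an instance method, stealth headers key off `self.stealth` and the referer off `self.url`. Illustrative behavior under the new shape (the exact referer string is whatever `generate_convincing_referer` produces; per the docstring it mimics a Google search of the URL's domain):

from scrapling.engines.static import StaticEngine

engine = StaticEngine('https://example.com', stealthy_headers=True)
headers = engine._headers_job({})  # no headers supplied by the caller
# A User-Agent was generated (logged at debug level) and a Google-search-style
# referer for example.com's domain was added.
assert 'User-Agent' in headers and 'referer' in headers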
@@ -61,69 +71,102 @@ class StaticEngine:
             cookies=dict(response.cookies),
             headers=dict(response.headers),
             request_headers=dict(response.request.headers),
+            method=response.request.method,
             **self.adaptor_arguments
         )
 
-    def get(self, url: str, proxy: Optional[str] = None, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+    def get(self, **kwargs: Dict) -> Response:
         """Make basic HTTP GET request for you but with some added flavors.
 
-        :param url: Target url.
-        :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
-            create a referer header as if this request had came from Google's search of this URL's domain.
-        :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
-        :param kwargs: Any additional keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
+        :param kwargs: Any keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-        headers = self._headers_job(kwargs.pop('headers', {}), url, stealthy_headers)
-        with httpx.Client(proxy=proxy) as client:
-            request = client.get(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+        headers = self._headers_job(kwargs.pop('headers', {}))
+        with httpx.Client(proxy=self.proxy, transport=httpx.HTTPTransport(retries=self.retries)) as client:
+            request = client.get(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
 
         return self._prepare_response(request)
 
-    def post(self, url: str, proxy: Optional[str] = None, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+    async def async_get(self, **kwargs: Dict) -> Response:
+        """Make basic async HTTP GET request for you but with some added flavors.
+
+        :param kwargs: Any keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
+        """
+        headers = self._headers_job(kwargs.pop('headers', {}))
+        async with httpx.AsyncClient(proxy=self.proxy) as client:
+            request = await client.get(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+
+        return self._prepare_response(request)
+
+    def post(self, **kwargs: Dict) -> Response:
         """Make basic HTTP POST request for you but with some added flavors.
 
-        :param url: Target url.
-        :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
-            create a referer header as if this request had came from Google's search of this URL's domain.
-        :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
-        :param kwargs: Any additional keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
+        :param kwargs: Any keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-        headers = self._headers_job(kwargs.pop('headers', {}), url, stealthy_headers)
-        with httpx.Client(proxy=proxy) as client:
-            request = client.post(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+        headers = self._headers_job(kwargs.pop('headers', {}))
+        with httpx.Client(proxy=self.proxy, transport=httpx.HTTPTransport(retries=self.retries)) as client:
+            request = client.post(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
 
         return self._prepare_response(request)
 
-    def delete(self, url: str, proxy: Optional[str] = None, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+    async def async_post(self, **kwargs: Dict) -> Response:
+        """Make basic async HTTP POST request for you but with some added flavors.
+
+        :param kwargs: Any keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
+        """
+        headers = self._headers_job(kwargs.pop('headers', {}))
+        async with httpx.AsyncClient(proxy=self.proxy) as client:
+            request = await client.post(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+
+        return self._prepare_response(request)
+
+    def delete(self, **kwargs: Dict) -> Response:
         """Make basic HTTP DELETE request for you but with some added flavors.
 
-        :param url: Target url.
-        :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
-            create a referer header as if this request had came from Google's search of this URL's domain.
-        :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
-        :param kwargs: Any additional keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
+        :param kwargs: Any keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
+        """
+        headers = self._headers_job(kwargs.pop('headers', {}))
+        with httpx.Client(proxy=self.proxy, transport=httpx.HTTPTransport(retries=self.retries)) as client:
+            request = client.delete(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+
+        return self._prepare_response(request)
+
+    async def async_delete(self, **kwargs: Dict) -> Response:
+        """Make basic async HTTP DELETE request for you but with some added flavors.
+
+        :param kwargs: Any keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-        headers = self._headers_job(kwargs.pop('headers', {}), url, stealthy_headers)
-        with httpx.Client(proxy=proxy) as client:
-            request = client.delete(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+        headers = self._headers_job(kwargs.pop('headers', {}))
+        async with httpx.AsyncClient(proxy=self.proxy) as client:
+            request = await client.delete(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
 
         return self._prepare_response(request)
 
-    def put(self, url: str, proxy: Optional[str] = None, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+    def put(self, **kwargs: Dict) -> Response:
         """Make basic HTTP PUT request for you but with some added flavors.
 
-        :param url: Target url.
-        :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
-            create a referer header as if this request had came from Google's search of this URL's domain.
-        :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
-        :param kwargs: Any additional keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
+        :param kwargs: Any keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
+        """
+        headers = self._headers_job(kwargs.pop('headers', {}))
+        with httpx.Client(proxy=self.proxy, transport=httpx.HTTPTransport(retries=self.retries)) as client:
+            request = client.put(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+
+        return self._prepare_response(request)
+
+    async def async_put(self, **kwargs: Dict) -> Response:
+        """Make basic async HTTP PUT request for you but with some added flavors.
+
+        :param kwargs: Any keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-        headers = self._headers_job(kwargs.pop('headers', {}), url, stealthy_headers)
-        with httpx.Client(proxy=proxy) as client:
-            request = client.put(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+        headers = self._headers_job(kwargs.pop('headers', {}))
+        async with httpx.AsyncClient(proxy=self.proxy) as client:
+            request = await client.put(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
 
         return self._prepare_response(request)
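
The net effect on `StaticEngine`'s call pattern: URL, proxy, stealthy headers, and retries now live on the (cached) engine instance, and each verb method takes only httpx kwargs, with sync and async variants side by side. A hedged sketch:

from scrapling.engines.static import StaticEngine

engine = StaticEngine('https://example.com', retries=3)
page = engine.get()  # sync GET, retried via httpx.HTTPTransport

# Async variants exist for every verb; inside an event loop:
#     page = await engine.async_get()
# Note: per this diff, the retry transport is wired into the sync clients only.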
scrapling/engines/toolbelt/__init__.py CHANGED
@@ -1,6 +1,6 @@
 from .custom import (BaseFetcher, Response, StatusText, check_if_engine_usable,
-                     check_type_validity, do_nothing, get_variable_name)
+                     check_type_validity, get_variable_name)
 from .fingerprints import (generate_convincing_referer, generate_headers,
                            get_os_name)
-from .navigation import (construct_cdp_url, construct_proxy_dict,
-                         intercept_route, js_bypass_path)
+from .navigation import (async_intercept_route, construct_cdp_url,
+                         construct_proxy_dict, intercept_route, js_bypass_path)