scrapling 0.2.8__py3-none-any.whl → 0.2.91__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. scrapling/__init__.py +4 -4
  2. scrapling/core/_types.py +2 -0
  3. scrapling/core/custom_types.py +88 -6
  4. scrapling/core/storage_adaptors.py +5 -6
  5. scrapling/core/translator.py +2 -2
  6. scrapling/core/utils.py +29 -27
  7. scrapling/defaults.py +2 -1
  8. scrapling/engines/camo.py +124 -24
  9. scrapling/engines/constants.py +4 -4
  10. scrapling/engines/pw.py +195 -91
  11. scrapling/engines/static.py +91 -48
  12. scrapling/engines/toolbelt/__init__.py +3 -3
  13. scrapling/engines/toolbelt/custom.py +16 -22
  14. scrapling/engines/toolbelt/fingerprints.py +3 -3
  15. scrapling/engines/toolbelt/navigation.py +21 -8
  16. scrapling/fetchers.py +231 -16
  17. scrapling/parser.py +50 -22
  18. {scrapling-0.2.8.dist-info → scrapling-0.2.91.dist-info}/METADATA +33 -18
  19. scrapling-0.2.91.dist-info/RECORD +47 -0
  20. tests/fetchers/async/__init__.py +0 -0
  21. tests/fetchers/async/test_camoufox.py +95 -0
  22. tests/fetchers/async/test_httpx.py +83 -0
  23. tests/fetchers/async/test_playwright.py +99 -0
  24. tests/fetchers/sync/__init__.py +0 -0
  25. tests/fetchers/sync/test_camoufox.py +68 -0
  26. tests/fetchers/sync/test_httpx.py +82 -0
  27. tests/fetchers/sync/test_playwright.py +87 -0
  28. tests/fetchers/test_utils.py +90 -122
  29. tests/parser/test_automatch.py +64 -9
  30. tests/parser/test_general.py +260 -218
  31. scrapling-0.2.8.dist-info/RECORD +0 -42
  32. tests/fetchers/test_camoufox.py +0 -65
  33. tests/fetchers/test_httpx.py +0 -68
  34. tests/fetchers/test_playwright.py +0 -77
  35. {scrapling-0.2.8.dist-info → scrapling-0.2.91.dist-info}/LICENSE +0 -0
  36. {scrapling-0.2.8.dist-info → scrapling-0.2.91.dist-info}/WHEEL +0 -0
  37. {scrapling-0.2.8.dist-info → scrapling-0.2.91.dist-info}/top_level.txt +0 -0
scrapling/engines/pw.py CHANGED
@@ -1,12 +1,14 @@
 import json
-import logging
 
-from scrapling.core._types import Callable, Dict, List, Optional, Union
+from scrapling.core._types import (Callable, Dict, Optional,
+                                   SelectorWaitStates, Union)
+from scrapling.core.utils import log, lru_cache
 from scrapling.engines.constants import (DEFAULT_STEALTH_FLAGS,
                                          NSTBROWSER_DEFAULT_QUERY)
 from scrapling.engines.toolbelt import (Response, StatusText,
+                                        async_intercept_route,
                                         check_type_validity, construct_cdp_url,
-                                        construct_proxy_dict, do_nothing,
+                                        construct_proxy_dict,
                                         generate_convincing_referer,
                                         generate_headers, intercept_route,
                                         js_bypass_path)
@@ -19,10 +21,10 @@ class PlaywrightEngine:
             useragent: Optional[str] = None,
             network_idle: Optional[bool] = False,
             timeout: Optional[float] = 30000,
-            page_action: Callable = do_nothing,
+            page_action: Callable = None,
             wait_selector: Optional[str] = None,
             locale: Optional[str] = 'en-US',
-            wait_selector_state: Optional[str] = 'attached',
+            wait_selector_state: SelectorWaitStates = 'attached',
             stealth: Optional[bool] = False,
             real_chrome: Optional[bool] = False,
             hide_canvas: Optional[bool] = False,
@@ -74,11 +76,14 @@ class PlaywrightEngine:
         self.cdp_url = cdp_url
         self.useragent = useragent
         self.timeout = check_type_validity(timeout, [int, float], 30000)
-        if callable(page_action):
-            self.page_action = page_action
+        if page_action is not None:
+            if callable(page_action):
+                self.page_action = page_action
+            else:
+                self.page_action = None
+                log.error('[Ignored] Argument "page_action" must be callable')
         else:
-            self.page_action = do_nothing
-            logging.error('[Ignored] Argument "page_action" must be callable')
+            self.page_action = None
 
         self.wait_selector = wait_selector
         self.wait_selector_state = wait_selector_state
@@ -94,10 +99,8 @@ class PlaywrightEngine:
             # '--disable-extensions',
         ]
 
-    def _cdp_url_logic(self, flags: Optional[List] = None) -> str:
+    def _cdp_url_logic(self) -> str:
         """Constructs new CDP URL if NSTBrowser is enabled otherwise return CDP URL as it is
-
-        :param flags: Chrome flags to be added to NSTBrowser query
         :return: CDP URL
         """
         cdp_url = self.cdp_url
@@ -106,7 +109,8 @@ class PlaywrightEngine:
             config = self.nstbrowser_config
         else:
             query = NSTBROWSER_DEFAULT_QUERY.copy()
-            if flags:
+            if self.stealth:
+                flags = self.__set_flags()
                 query.update({
                     "args": dict(zip(flags, [''] * len(flags))), # browser args should be a dictionary
                 })
@@ -122,78 +126,104 @@ class PlaywrightEngine:
 
         return cdp_url
 
+    @lru_cache(typed=True)
+    def __set_flags(self):
+        """Returns the flags that will be used while launching the browser if stealth mode is enabled"""
+        flags = DEFAULT_STEALTH_FLAGS
+        if self.hide_canvas:
+            flags += ('--fingerprinting-canvas-image-data-noise',)
+        if self.disable_webgl:
+            flags += ('--disable-webgl', '--disable-webgl-image-chromium', '--disable-webgl2',)
+
+        return flags
+
+    def __launch_kwargs(self):
+        """Creates the arguments we will use while launching playwright's browser"""
+        launch_kwargs = {'headless': self.headless, 'ignore_default_args': self.harmful_default_args, 'channel': 'chrome' if self.real_chrome else 'chromium'}
+        if self.stealth:
+            launch_kwargs.update({'args': self.__set_flags(), 'chromium_sandbox': True})
+
+        return launch_kwargs
+
+    def __context_kwargs(self):
+        """Creates the arguments for the browser context"""
+        context_kwargs = {
+            "proxy": self.proxy,
+            "locale": self.locale,
+            "color_scheme": 'dark', # Bypasses the 'prefersLightColor' check in creepjs
+            "device_scale_factor": 2,
+            "extra_http_headers": self.extra_headers if self.extra_headers else {},
+            "user_agent": self.useragent if self.useragent else generate_headers(browser_mode=True).get('User-Agent'),
+        }
+        if self.stealth:
+            context_kwargs.update({
+                'is_mobile': False,
+                'has_touch': False,
+                # I'm thinking about disabling it to rest from all Service Workers headache but let's keep it as it is for now
+                'service_workers': 'allow',
+                'ignore_https_errors': True,
+                'screen': {'width': 1920, 'height': 1080},
+                'viewport': {'width': 1920, 'height': 1080},
+                'permissions': ['geolocation', 'notifications']
+            })
+
+        return context_kwargs
+
+    @lru_cache()
+    def __stealth_scripts(self):
+        # Basic bypasses nothing fancy as I'm still working on it
+        # But with adding these bypasses to the above config, it bypasses many online tests like
+        # https://bot.sannysoft.com/
+        # https://kaliiiiiiiiii.github.io/brotector/
+        # https://pixelscan.net/
+        # https://iphey.com/
+        # https://www.browserscan.net/bot-detection <== this one also checks for the CDP runtime fingerprint
+        # https://arh.antoinevastel.com/bots/areyouheadless/
+        # https://prescience-data.github.io/execution-monitor.html
+        return tuple(
+            js_bypass_path(script) for script in (
+                # Order is important
+                'webdriver_fully.js', 'window_chrome.js', 'navigator_plugins.js', 'pdf_viewer.js',
+                'notification_permission.js', 'screen_props.js', 'playwright_fingerprint.js'
+            )
+        )
+
     def fetch(self, url: str) -> Response:
         """Opens up the browser and do your request based on your chosen options.
 
         :param url: Target url.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
+        from playwright.sync_api import Response as PlaywrightResponse
         if not self.stealth or self.real_chrome:
             # Because rebrowser_playwright doesn't play well with real browsers
             from playwright.sync_api import sync_playwright
         else:
             from rebrowser_playwright.sync_api import sync_playwright
 
-        with sync_playwright() as p:
-            # Handle the UserAgent early
-            if self.useragent:
-                extra_headers = {}
-                useragent = self.useragent
-            else:
-                extra_headers = {}
-                useragent = generate_headers(browser_mode=True).get('User-Agent')
+        # Store the final response
+        final_response = None
 
-            # Prepare the flags before diving
-            flags = DEFAULT_STEALTH_FLAGS
-            if self.hide_canvas:
-                flags += ['--fingerprinting-canvas-image-data-noise']
-            if self.disable_webgl:
-                flags += ['--disable-webgl', '--disable-webgl-image-chromium', '--disable-webgl2']
+        def handle_response(finished_response: PlaywrightResponse):
+            nonlocal final_response
+            if finished_response.request.resource_type == "document":
+                final_response = finished_response
 
+        with sync_playwright() as p:
             # Creating the browser
             if self.cdp_url:
-                cdp_url = self._cdp_url_logic(flags if self.stealth else None)
+                cdp_url = self._cdp_url_logic()
                 browser = p.chromium.connect_over_cdp(endpoint_url=cdp_url)
             else:
-                if self.stealth:
-                    browser = p.chromium.launch(
-                        headless=self.headless, args=flags, ignore_default_args=self.harmful_default_args, chromium_sandbox=True, channel='chrome' if self.real_chrome else 'chromium'
-                    )
-                else:
-                    browser = p.chromium.launch(headless=self.headless, ignore_default_args=self.harmful_default_args, channel='chrome' if self.real_chrome else 'chromium')
-
-            # Creating the context
-            if self.stealth:
-                context = browser.new_context(
-                    locale=self.locale,
-                    is_mobile=False,
-                    has_touch=False,
-                    proxy=self.proxy,
-                    color_scheme='dark', # Bypasses the 'prefersLightColor' check in creepjs
-                    user_agent=useragent,
-                    device_scale_factor=2,
-                    # I'm thinking about disabling it to rest from all Service Workers headache but let's keep it as it is for now
-                    service_workers="allow",
-                    ignore_https_errors=True,
-                    extra_http_headers=extra_headers,
-                    screen={"width": 1920, "height": 1080},
-                    viewport={"width": 1920, "height": 1080},
-                    permissions=["geolocation", 'notifications'],
-                )
-            else:
-                context = browser.new_context(
-                    locale=self.locale,
-                    proxy=self.proxy,
-                    color_scheme='dark',
-                    user_agent=useragent,
-                    device_scale_factor=2,
-                    extra_http_headers=extra_headers
-                )
+                browser = p.chromium.launch(**self.__launch_kwargs())
 
+            context = browser.new_context(**self.__context_kwargs())
             # Finally we are in business
             page = context.new_page()
             page.set_default_navigation_timeout(self.timeout)
             page.set_default_timeout(self.timeout)
+            # Listen for all responses
+            page.on("response", handle_response)
 
             if self.extra_headers:
                 page.set_extra_http_headers(self.extra_headers)
@@ -202,29 +232,16 @@ class PlaywrightEngine:
                 page.route("**/*", intercept_route)
 
             if self.stealth:
-                # Basic bypasses nothing fancy as I'm still working on it
-                # But with adding these bypasses to the above config, it bypasses many online tests like
-                # https://bot.sannysoft.com/
-                # https://kaliiiiiiiiii.github.io/brotector/
-                # https://pixelscan.net/
-                # https://iphey.com/
-                # https://www.browserscan.net/bot-detection <== this one also checks for the CDP runtime fingerprint
-                # https://arh.antoinevastel.com/bots/areyouheadless/
-                # https://prescience-data.github.io/execution-monitor.html
-                page.add_init_script(path=js_bypass_path('webdriver_fully.js'))
-                page.add_init_script(path=js_bypass_path('window_chrome.js'))
-                page.add_init_script(path=js_bypass_path('navigator_plugins.js'))
-                page.add_init_script(path=js_bypass_path('pdf_viewer.js'))
-                page.add_init_script(path=js_bypass_path('notification_permission.js'))
-                page.add_init_script(path=js_bypass_path('screen_props.js'))
-                page.add_init_script(path=js_bypass_path('playwright_fingerprint.js'))
-
-            res = page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
+                for script in self.__stealth_scripts():
+                    page.add_init_script(path=script)
+
+            first_response = page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
             page.wait_for_load_state(state="domcontentloaded")
             if self.network_idle:
                 page.wait_for_load_state('networkidle')
 
-            page = self.page_action(page)
+            if self.page_action is not None:
+                page = self.page_action(page)
 
             if self.wait_selector and type(self.wait_selector) is str:
                 waiter = page.locator(self.wait_selector)
@@ -235,25 +252,112 @@ class PlaywrightEngine:
                 if self.network_idle:
                     page.wait_for_load_state('networkidle')
 
+            response_bytes = final_response.body() if final_response else page.content().encode('utf-8')
+            # In case we didn't catch a document type somehow
+            final_response = final_response if final_response else first_response
             # This will be parsed inside `Response`
-            encoding = res.headers.get('content-type', '') or 'utf-8' # default encoding
-
-            status_text = res.status_text
+            encoding = final_response.headers.get('content-type', '') or 'utf-8' # default encoding
             # PlayWright API sometimes give empty status text for some reason!
-            if not status_text:
-                status_text = StatusText.get(res.status)
+            status_text = final_response.status_text or StatusText.get(final_response.status)
 
             response = Response(
-                url=res.url,
+                url=final_response.url,
                 text=page.content(),
-                body=page.content().encode('utf-8'),
-                status=res.status,
+                body=response_bytes,
+                status=final_response.status,
                 reason=status_text,
                 encoding=encoding,
                 cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
-                headers=res.all_headers(),
-                request_headers=res.request.all_headers(),
+                headers=final_response.all_headers(),
+                request_headers=final_response.request.all_headers(),
                 **self.adaptor_arguments
             )
             page.close()
             return response
+
+    async def async_fetch(self, url: str) -> Response:
+        """Async version of `fetch`
+
+        :param url: Target url.
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
+        """
+        from playwright.async_api import Response as PlaywrightResponse
+        if not self.stealth or self.real_chrome:
+            # Because rebrowser_playwright doesn't play well with real browsers
+            from playwright.async_api import async_playwright
+        else:
+            from rebrowser_playwright.async_api import async_playwright
+
+        # Store the final response
+        final_response = None
+
+        async def handle_response(finished_response: PlaywrightResponse):
+            nonlocal final_response
+            if finished_response.request.resource_type == "document":
+                final_response = finished_response
+
+        async with async_playwright() as p:
+            # Creating the browser
+            if self.cdp_url:
+                cdp_url = self._cdp_url_logic()
+                browser = await p.chromium.connect_over_cdp(endpoint_url=cdp_url)
+            else:
+                browser = await p.chromium.launch(**self.__launch_kwargs())
+
+            context = await browser.new_context(**self.__context_kwargs())
+            # Finally we are in business
+            page = await context.new_page()
+            page.set_default_navigation_timeout(self.timeout)
+            page.set_default_timeout(self.timeout)
+            # Listen for all responses
+            page.on("response", handle_response)
+
+            if self.extra_headers:
+                await page.set_extra_http_headers(self.extra_headers)
+
+            if self.disable_resources:
+                await page.route("**/*", async_intercept_route)
+
+            if self.stealth:
+                for script in self.__stealth_scripts():
+                    await page.add_init_script(path=script)
+
+            first_response = await page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
+            await page.wait_for_load_state(state="domcontentloaded")
+            if self.network_idle:
+                await page.wait_for_load_state('networkidle')
+
+            if self.page_action is not None:
+                page = await self.page_action(page)
+
+            if self.wait_selector and type(self.wait_selector) is str:
+                waiter = page.locator(self.wait_selector)
+                await waiter.first.wait_for(state=self.wait_selector_state)
+                # Wait again after waiting for the selector, helpful with protections like Cloudflare
+                await page.wait_for_load_state(state="load")
+                await page.wait_for_load_state(state="domcontentloaded")
+                if self.network_idle:
+                    await page.wait_for_load_state('networkidle')
+
+            response_bytes = await final_response.body() if final_response else (await page.content()).encode('utf-8')
+            # In case we didn't catch a document type somehow
+            final_response = final_response if final_response else first_response
+            # This will be parsed inside `Response`
+            encoding = final_response.headers.get('content-type', '') or 'utf-8' # default encoding
+            # PlayWright API sometimes give empty status text for some reason!
+            status_text = final_response.status_text or StatusText.get(final_response.status)
+
+            response = Response(
+                url=final_response.url,
+                text=await page.content(),
+                body=response_bytes,
+                status=final_response.status,
+                reason=status_text,
+                encoding=encoding,
+                cookies={cookie['name']: cookie['value'] for cookie in await page.context.cookies()},
+                headers=await final_response.all_headers(),
+                request_headers=await final_response.request.all_headers(),
                **self.adaptor_arguments
+            )
+            await page.close()
+            return response
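
The key behavioral change in both `fetch` and the new `async_fetch` is that status, headers, and body now come from the last response whose request `resource_type` is "document" rather than from `page.goto()`'s return value, so redirects resolve to the final page. Below is a minimal standalone sketch of that capture pattern using plain Playwright (not Scrapling's own API); the target URL is only an example.

from playwright.sync_api import sync_playwright

final_response = None

def handle_response(finished_response):
    global final_response
    # Keep only navigation ("document") responses, as the diff above does
    if finished_response.request.resource_type == "document":
        final_response = finished_response

with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    page = browser.new_context().new_page()
    page.on("response", handle_response)
    first_response = page.goto("https://example.com")
    page.wait_for_load_state("domcontentloaded")
    # Fall back to goto()'s return value if no document response was captured
    final_response = final_response or first_response
    print(final_response.status, final_response.url)
    browser.close()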
scrapling/engines/static.py CHANGED
@@ -1,34 +1,44 @@
-import logging
-
 import httpx
 from httpx._models import Response as httpxResponse
 
-from scrapling.core._types import Dict, Optional, Union
+from scrapling.core._types import Dict, Optional, Tuple, Union
+from scrapling.core.utils import log, lru_cache
 
 from .toolbelt import Response, generate_convincing_referer, generate_headers
 
 
+@lru_cache(typed=True)
 class StaticEngine:
-    def __init__(self, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = None, adaptor_arguments: Dict = None):
+    def __init__(
+            self, url: str, proxy: Optional[str] = None, stealthy_headers: Optional[bool] = True, follow_redirects: bool = True,
+            timeout: Optional[Union[int, float]] = None, retries: Optional[int] = 3, adaptor_arguments: Tuple = None
+    ):
         """An engine that utilizes httpx library, check the `Fetcher` class for more documentation.
 
+        :param url: Target url.
+        :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
+            create a referer header as if this request had came from Google's search of this URL's domain.
+        :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
         :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
         :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
         :param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
         """
+        self.url = url
+        self.proxy = proxy
+        self.stealth = stealthy_headers
         self.timeout = timeout
         self.follow_redirects = bool(follow_redirects)
+        self.retries = retries
         self._extra_headers = generate_headers(browser_mode=False)
-        self.adaptor_arguments = adaptor_arguments if adaptor_arguments else {}
+        # Because we are using `lru_cache` for a slight optimization but both dict/dict_items are not hashable so they can't be cached
+        # So my solution here was to convert it to tuple then convert it back to dictionary again here as tuples are hashable, ofc `tuple().__hash__()`
+        self.adaptor_arguments = dict(adaptor_arguments) if adaptor_arguments else {}
 
-    @staticmethod
-    def _headers_job(headers: Optional[Dict], url: str, stealth: bool) -> Dict:
+    def _headers_job(self, headers: Optional[Dict]) -> Dict:
         """Adds useragent to headers if it doesn't exist, generates real headers and append it to current headers, and
         finally generates a referer header that looks like if this request came from Google's search of the current URL's domain.
 
         :param headers: Current headers in the request if the user passed any
-        :param url: The Target URL.
-        :param stealth: Whether stealth mode is enabled or not.
         :return: A dictionary of the new headers.
         """
         headers = headers or {}
@@ -36,12 +46,12 @@ class StaticEngine:
         # Validate headers
         if not headers.get('user-agent') and not headers.get('User-Agent'):
             headers['User-Agent'] = generate_headers(browser_mode=False).get('User-Agent')
-            logging.info(f"Can't find useragent in headers so '{headers['User-Agent']}' was used.")
+            log.debug(f"Can't find useragent in headers so '{headers['User-Agent']}' was used.")
 
-        if stealth:
+        if self.stealth:
             extra_headers = generate_headers(browser_mode=False)
             headers.update(extra_headers)
-            headers.update({'referer': generate_convincing_referer(url)})
+            headers.update({'referer': generate_convincing_referer(self.url)})
 
         return headers
 
@@ -61,69 +71,102 @@ class StaticEngine:
             cookies=dict(response.cookies),
             headers=dict(response.headers),
             request_headers=dict(response.request.headers),
+            method=response.request.method,
             **self.adaptor_arguments
         )
 
-    def get(self, url: str, proxy: Optional[str] = None, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+    def get(self, **kwargs: Dict) -> Response:
         """Make basic HTTP GET request for you but with some added flavors.
 
-        :param url: Target url.
-        :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
-            create a referer header as if this request had came from Google's search of this URL's domain.
-        :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
-        :param kwargs: Any additional keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
+        :param kwargs: Any keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-        headers = self._headers_job(kwargs.pop('headers', {}), url, stealthy_headers)
-        with httpx.Client(proxy=proxy) as client:
-            request = client.get(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+        headers = self._headers_job(kwargs.pop('headers', {}))
+        with httpx.Client(proxy=self.proxy, transport=httpx.HTTPTransport(retries=self.retries)) as client:
+            request = client.get(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
 
         return self._prepare_response(request)
 
-    def post(self, url: str, proxy: Optional[str] = None, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+    async def async_get(self, **kwargs: Dict) -> Response:
+        """Make basic async HTTP GET request for you but with some added flavors.
+
+        :param kwargs: Any keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
+        """
+        headers = self._headers_job(kwargs.pop('headers', {}))
+        async with httpx.AsyncClient(proxy=self.proxy) as client:
+            request = await client.get(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+
+        return self._prepare_response(request)
+
+    def post(self, **kwargs: Dict) -> Response:
         """Make basic HTTP POST request for you but with some added flavors.
 
-        :param url: Target url.
-        :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
-            create a referer header as if this request had came from Google's search of this URL's domain.
-        :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
-        :param kwargs: Any additional keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
+        :param kwargs: Any keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
        """
-        headers = self._headers_job(kwargs.pop('headers', {}), url, stealthy_headers)
-        with httpx.Client(proxy=proxy) as client:
-            request = client.post(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+        headers = self._headers_job(kwargs.pop('headers', {}))
+        with httpx.Client(proxy=self.proxy, transport=httpx.HTTPTransport(retries=self.retries)) as client:
+            request = client.post(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
 
         return self._prepare_response(request)
 
-    def delete(self, url: str, proxy: Optional[str] = None, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+    async def async_post(self, **kwargs: Dict) -> Response:
+        """Make basic async HTTP POST request for you but with some added flavors.
+
+        :param kwargs: Any keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
+        """
+        headers = self._headers_job(kwargs.pop('headers', {}))
+        async with httpx.AsyncClient(proxy=self.proxy) as client:
+            request = await client.post(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+
+        return self._prepare_response(request)
+
+    def delete(self, **kwargs: Dict) -> Response:
         """Make basic HTTP DELETE request for you but with some added flavors.
 
-        :param url: Target url.
-        :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
-            create a referer header as if this request had came from Google's search of this URL's domain.
-        :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
-        :param kwargs: Any additional keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
+        :param kwargs: Any keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
+        """
+        headers = self._headers_job(kwargs.pop('headers', {}))
+        with httpx.Client(proxy=self.proxy, transport=httpx.HTTPTransport(retries=self.retries)) as client:
+            request = client.delete(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+
+        return self._prepare_response(request)
+
+    async def async_delete(self, **kwargs: Dict) -> Response:
+        """Make basic async HTTP DELETE request for you but with some added flavors.
+
+        :param kwargs: Any keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-        headers = self._headers_job(kwargs.pop('headers', {}), url, stealthy_headers)
-        with httpx.Client(proxy=proxy) as client:
-            request = client.delete(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+        headers = self._headers_job(kwargs.pop('headers', {}))
+        async with httpx.AsyncClient(proxy=self.proxy) as client:
+            request = await client.delete(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
 
         return self._prepare_response(request)
 
-    def put(self, url: str, proxy: Optional[str] = None, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+    def put(self, **kwargs: Dict) -> Response:
         """Make basic HTTP PUT request for you but with some added flavors.
 
-        :param url: Target url.
-        :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
-            create a referer header as if this request had came from Google's search of this URL's domain.
-        :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
-        :param kwargs: Any additional keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
+        :param kwargs: Any keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
+        """
+        headers = self._headers_job(kwargs.pop('headers', {}))
+        with httpx.Client(proxy=self.proxy, transport=httpx.HTTPTransport(retries=self.retries)) as client:
+            request = client.put(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+
+        return self._prepare_response(request)
+
+    async def async_put(self, **kwargs: Dict) -> Response:
+        """Make basic async HTTP PUT request for you but with some added flavors.
+
+        :param kwargs: Any keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
        """
-        headers = self._headers_job(kwargs.pop('headers', {}), url, stealthy_headers)
-        with httpx.Client(proxy=proxy) as client:
-            request = client.put(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+        headers = self._headers_job(kwargs.pop('headers', {}))
+        async with httpx.AsyncClient(proxy=self.proxy) as client:
+            request = await client.put(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
 
         return self._prepare_response(request)
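
Because `StaticEngine` is now wrapped in `lru_cache(typed=True)`, the target URL, proxy, stealth, and retry settings move into the constructor, and every HTTP verb gains an async twin. A usage sketch assuming only the constructor and methods visible in this diff; the URL and timeout values are illustrative, and `adaptor_arguments`, if passed, must be hashable (e.g. `tuple(my_dict.items())`) because it is converted back to a dict inside `__init__`:

import asyncio

from scrapling.engines.static import StaticEngine

# Target URL and client options now live on the engine instance
engine = StaticEngine('https://example.com', stealthy_headers=True, follow_redirects=True, timeout=10, retries=3)

page = engine.get()                 # sync GET; extra kwargs go straight to httpx
print(page.status, page.reason)

page = asyncio.run(engine.async_get())   # the new async twin of the same verb
print(page.status)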
scrapling/engines/toolbelt/__init__.py CHANGED
@@ -1,6 +1,6 @@
 from .custom import (BaseFetcher, Response, StatusText, check_if_engine_usable,
-                     check_type_validity, do_nothing, get_variable_name)
+                     check_type_validity, get_variable_name)
 from .fingerprints import (generate_convincing_referer, generate_headers,
                            get_os_name)
-from .navigation import (construct_cdp_url, construct_proxy_dict,
-                         intercept_route, js_bypass_path)
+from .navigation import (async_intercept_route, construct_cdp_url,
+                         construct_proxy_dict, intercept_route, js_bypass_path)
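
The toolbelt now exports `async_intercept_route` alongside `intercept_route` and drops `do_nothing`. A small sketch of how the two handlers are registered, mirroring the `pw.py` changes above (the routing calls are shown as comments because they need a live Playwright page):

from scrapling.engines.toolbelt import async_intercept_route, intercept_route

# Sync Playwright page:
#     page.route("**/*", intercept_route)
# Async Playwright page:
#     await page.route("**/*", async_intercept_route)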