scrapling 0.2.7__py3-none-any.whl → 0.2.9__py3-none-any.whl

Files changed (38)
  1. scrapling/__init__.py +5 -4
  2. scrapling/core/_types.py +2 -3
  3. scrapling/core/custom_types.py +93 -11
  4. scrapling/core/storage_adaptors.py +9 -10
  5. scrapling/core/translator.py +6 -7
  6. scrapling/core/utils.py +35 -30
  7. scrapling/defaults.py +2 -1
  8. scrapling/engines/__init__.py +2 -2
  9. scrapling/engines/camo.py +96 -26
  10. scrapling/engines/constants.py +4 -4
  11. scrapling/engines/pw.py +166 -96
  12. scrapling/engines/static.py +94 -50
  13. scrapling/engines/toolbelt/__init__.py +6 -20
  14. scrapling/engines/toolbelt/custom.py +22 -23
  15. scrapling/engines/toolbelt/fingerprints.py +7 -7
  16. scrapling/engines/toolbelt/navigation.py +25 -12
  17. scrapling/fetchers.py +233 -17
  18. scrapling/parser.py +63 -28
  19. {scrapling-0.2.7.dist-info → scrapling-0.2.9.dist-info}/METADATA +41 -25
  20. scrapling-0.2.9.dist-info/RECORD +47 -0
  21. tests/fetchers/async/__init__.py +0 -0
  22. tests/fetchers/async/test_camoufox.py +95 -0
  23. tests/fetchers/async/test_httpx.py +83 -0
  24. tests/fetchers/async/test_playwright.py +99 -0
  25. tests/fetchers/sync/__init__.py +0 -0
  26. tests/fetchers/sync/test_camoufox.py +68 -0
  27. tests/fetchers/sync/test_httpx.py +82 -0
  28. tests/fetchers/sync/test_playwright.py +87 -0
  29. tests/fetchers/test_utils.py +90 -122
  30. tests/parser/test_automatch.py +64 -9
  31. tests/parser/test_general.py +263 -219
  32. scrapling-0.2.7.dist-info/RECORD +0 -42
  33. tests/fetchers/test_camoufox.py +0 -64
  34. tests/fetchers/test_httpx.py +0 -67
  35. tests/fetchers/test_playwright.py +0 -76
  36. {scrapling-0.2.7.dist-info → scrapling-0.2.9.dist-info}/LICENSE +0 -0
  37. {scrapling-0.2.7.dist-info → scrapling-0.2.9.dist-info}/WHEEL +0 -0
  38. {scrapling-0.2.7.dist-info → scrapling-0.2.9.dist-info}/top_level.txt +0 -0
scrapling/engines/pw.py CHANGED
@@ -1,20 +1,16 @@
  import json
- import logging
- from scrapling.core._types import Union, Callable, Optional, List, Dict
-
- from scrapling.engines.constants import DEFAULT_STEALTH_FLAGS, NSTBROWSER_DEFAULT_QUERY
- from scrapling.engines.toolbelt import (
-     Response,
-     do_nothing,
-     StatusText,
-     js_bypass_path,
-     intercept_route,
-     generate_headers,
-     construct_cdp_url,
-     check_type_validity,
-     construct_proxy_dict,
-     generate_convincing_referer,
- )
+
+ from scrapling.core._types import Callable, Dict, Optional, Union
+ from scrapling.core.utils import log, lru_cache
+ from scrapling.engines.constants import (DEFAULT_STEALTH_FLAGS,
+                                          NSTBROWSER_DEFAULT_QUERY)
+ from scrapling.engines.toolbelt import (Response, StatusText,
+                                         async_intercept_route,
+                                         check_type_validity, construct_cdp_url,
+                                         construct_proxy_dict,
+                                         generate_convincing_referer,
+                                         generate_headers, intercept_route,
+                                         js_bypass_path)
 
 
  class PlaywrightEngine:
@@ -24,7 +20,7 @@ class PlaywrightEngine:
              useragent: Optional[str] = None,
              network_idle: Optional[bool] = False,
              timeout: Optional[float] = 30000,
-             page_action: Callable = do_nothing,
+             page_action: Callable = None,
              wait_selector: Optional[str] = None,
              locale: Optional[str] = 'en-US',
              wait_selector_state: Optional[str] = 'attached',
@@ -79,11 +75,14 @@ class PlaywrightEngine:
          self.cdp_url = cdp_url
          self.useragent = useragent
          self.timeout = check_type_validity(timeout, [int, float], 30000)
-         if callable(page_action):
-             self.page_action = page_action
+         if page_action is not None:
+             if callable(page_action):
+                 self.page_action = page_action
+             else:
+                 self.page_action = None
+                 log.error('[Ignored] Argument "page_action" must be callable')
          else:
-             self.page_action = do_nothing
-             logging.error('[Ignored] Argument "page_action" must be callable')
+             self.page_action = None
 
          self.wait_selector = wait_selector
          self.wait_selector_state = wait_selector_state
@@ -99,10 +98,8 @@ class PlaywrightEngine:
              # '--disable-extensions',
          ]
 
-     def _cdp_url_logic(self, flags: Optional[List] = None) -> str:
+     def _cdp_url_logic(self) -> str:
          """Constructs new CDP URL if NSTBrowser is enabled otherwise return CDP URL as it is
-
-         :param flags: Chrome flags to be added to NSTBrowser query
          :return: CDP URL
          """
          cdp_url = self.cdp_url
@@ -111,7 +108,8 @@ class PlaywrightEngine:
                  config = self.nstbrowser_config
              else:
                  query = NSTBROWSER_DEFAULT_QUERY.copy()
-                 if flags:
+                 if self.stealth:
+                     flags = self.__set_flags()
                      query.update({
                          "args": dict(zip(flags, [''] * len(flags))),  # browser args should be a dictionary
                      })
@@ -127,6 +125,68 @@ class PlaywrightEngine:
 
          return cdp_url
 
+     @lru_cache(typed=True)
+     def __set_flags(self):
+         """Returns the flags that will be used while launching the browser if stealth mode is enabled"""
+         flags = DEFAULT_STEALTH_FLAGS
+         if self.hide_canvas:
+             flags += ('--fingerprinting-canvas-image-data-noise',)
+         if self.disable_webgl:
+             flags += ('--disable-webgl', '--disable-webgl-image-chromium', '--disable-webgl2',)
+
+         return flags
+
+     def __launch_kwargs(self):
+         """Creates the arguments we will use while launching playwright's browser"""
+         launch_kwargs = {'headless': self.headless, 'ignore_default_args': self.harmful_default_args, 'channel': 'chrome' if self.real_chrome else 'chromium'}
+         if self.stealth:
+             launch_kwargs.update({'args': self.__set_flags(), 'chromium_sandbox': True})
+
+         return launch_kwargs
+
+     def __context_kwargs(self):
+         """Creates the arguments for the browser context"""
+         context_kwargs = {
+             "proxy": self.proxy,
+             "locale": self.locale,
+             "color_scheme": 'dark',  # Bypasses the 'prefersLightColor' check in creepjs
+             "device_scale_factor": 2,
+             "extra_http_headers": self.extra_headers if self.extra_headers else {},
+             "user_agent": self.useragent if self.useragent else generate_headers(browser_mode=True).get('User-Agent'),
+         }
+         if self.stealth:
+             context_kwargs.update({
+                 'is_mobile': False,
+                 'has_touch': False,
+                 # I'm thinking about disabling it to rest from all Service Workers headache but let's keep it as it is for now
+                 'service_workers': 'allow',
+                 'ignore_https_errors': True,
+                 'screen': {'width': 1920, 'height': 1080},
+                 'viewport': {'width': 1920, 'height': 1080},
+                 'permissions': ['geolocation', 'notifications']
+             })
+
+         return context_kwargs
+
+     @lru_cache()
+     def __stealth_scripts(self):
+         # Basic bypasses nothing fancy as I'm still working on it
+         # But with adding these bypasses to the above config, it bypasses many online tests like
+         # https://bot.sannysoft.com/
+         # https://kaliiiiiiiiii.github.io/brotector/
+         # https://pixelscan.net/
+         # https://iphey.com/
+         # https://www.browserscan.net/bot-detection <== this one also checks for the CDP runtime fingerprint
+         # https://arh.antoinevastel.com/bots/areyouheadless/
+         # https://prescience-data.github.io/execution-monitor.html
+         return tuple(
+             js_bypass_path(script) for script in (
+                 # Order is important
+                 'webdriver_fully.js', 'window_chrome.js', 'navigator_plugins.js', 'pdf_viewer.js',
+                 'notification_permission.js', 'screen_props.js', 'playwright_fingerprint.js'
+             )
+         )
+
      def fetch(self, url: str) -> Response:
          """Opens up the browser and do your request based on your chosen options.
 
@@ -140,61 +200,14 @@ class PlaywrightEngine:
              from rebrowser_playwright.sync_api import sync_playwright
 
          with sync_playwright() as p:
-             # Handle the UserAgent early
-             if self.useragent:
-                 extra_headers = {}
-                 useragent = self.useragent
-             else:
-                 extra_headers = {}
-                 useragent = generate_headers(browser_mode=True).get('User-Agent')
-
-             # Prepare the flags before diving
-             flags = DEFAULT_STEALTH_FLAGS
-             if self.hide_canvas:
-                 flags += ['--fingerprinting-canvas-image-data-noise']
-             if self.disable_webgl:
-                 flags += ['--disable-webgl', '--disable-webgl-image-chromium', '--disable-webgl2']
-
              # Creating the browser
              if self.cdp_url:
-                 cdp_url = self._cdp_url_logic(flags if self.stealth else None)
+                 cdp_url = self._cdp_url_logic()
                  browser = p.chromium.connect_over_cdp(endpoint_url=cdp_url)
              else:
-                 if self.stealth:
-                     browser = p.chromium.launch(
-                         headless=self.headless, args=flags, ignore_default_args=self.harmful_default_args, chromium_sandbox=True, channel='chrome' if self.real_chrome else 'chromium'
-                     )
-                 else:
-                     browser = p.chromium.launch(headless=self.headless, ignore_default_args=self.harmful_default_args, channel='chrome' if self.real_chrome else 'chromium')
-
-             # Creating the context
-             if self.stealth:
-                 context = browser.new_context(
-                     locale=self.locale,
-                     is_mobile=False,
-                     has_touch=False,
-                     proxy=self.proxy,
-                     color_scheme='dark',  # Bypasses the 'prefersLightColor' check in creepjs
-                     user_agent=useragent,
-                     device_scale_factor=2,
-                     # I'm thinking about disabling it to rest from all Service Workers headache but let's keep it as it is for now
-                     service_workers="allow",
-                     ignore_https_errors=True,
-                     extra_http_headers=extra_headers,
-                     screen={"width": 1920, "height": 1080},
-                     viewport={"width": 1920, "height": 1080},
-                     permissions=["geolocation", 'notifications'],
-                 )
-             else:
-                 context = browser.new_context(
-                     locale=self.locale,
-                     proxy=self.proxy,
-                     color_scheme='dark',
-                     user_agent=useragent,
-                     device_scale_factor=2,
-                     extra_http_headers=extra_headers
-                 )
+                 browser = p.chromium.launch(**self.__launch_kwargs())
 
+             context = browser.new_context(**self.__context_kwargs())
              # Finally we are in business
              page = context.new_page()
              page.set_default_navigation_timeout(self.timeout)
@@ -207,29 +220,16 @@ class PlaywrightEngine:
                  page.route("**/*", intercept_route)
 
              if self.stealth:
-                 # Basic bypasses nothing fancy as I'm still working on it
-                 # But with adding these bypasses to the above config, it bypasses many online tests like
-                 # https://bot.sannysoft.com/
-                 # https://kaliiiiiiiiii.github.io/brotector/
-                 # https://pixelscan.net/
-                 # https://iphey.com/
-                 # https://www.browserscan.net/bot-detection <== this one also checks for the CDP runtime fingerprint
-                 # https://arh.antoinevastel.com/bots/areyouheadless/
-                 # https://prescience-data.github.io/execution-monitor.html
-                 page.add_init_script(path=js_bypass_path('webdriver_fully.js'))
-                 page.add_init_script(path=js_bypass_path('window_chrome.js'))
-                 page.add_init_script(path=js_bypass_path('navigator_plugins.js'))
-                 page.add_init_script(path=js_bypass_path('pdf_viewer.js'))
-                 page.add_init_script(path=js_bypass_path('notification_permission.js'))
-                 page.add_init_script(path=js_bypass_path('screen_props.js'))
-                 page.add_init_script(path=js_bypass_path('playwright_fingerprint.js'))
+                 for script in self.__stealth_scripts():
+                     page.add_init_script(path=script)
 
              res = page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
              page.wait_for_load_state(state="domcontentloaded")
              if self.network_idle:
                  page.wait_for_load_state('networkidle')
 
-             page = self.page_action(page)
+             if self.page_action is not None:
+                 page = self.page_action(page)
 
              if self.wait_selector and type(self.wait_selector) is str:
                  waiter = page.locator(self.wait_selector)
@@ -242,11 +242,8 @@
 
              # This will be parsed inside `Response`
              encoding = res.headers.get('content-type', '') or 'utf-8'  # default encoding
-
-             status_text = res.status_text
              # PlayWright API sometimes give empty status text for some reason!
-             if not status_text:
-                 status_text = StatusText.get(res.status)
+             status_text = res.status_text or StatusText.get(res.status)
 
              response = Response(
                  url=res.url,
@@ -262,3 +259,76 @@
              )
              page.close()
              return response
+
+     async def async_fetch(self, url: str) -> Response:
+         """Async version of `fetch`
+
+         :param url: Target url.
+         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
+         """
+         if not self.stealth or self.real_chrome:
+             # Because rebrowser_playwright doesn't play well with real browsers
+             from playwright.async_api import async_playwright
+         else:
+             from rebrowser_playwright.async_api import async_playwright
+
+         async with async_playwright() as p:
+             # Creating the browser
+             if self.cdp_url:
+                 cdp_url = self._cdp_url_logic()
+                 browser = await p.chromium.connect_over_cdp(endpoint_url=cdp_url)
+             else:
+                 browser = await p.chromium.launch(**self.__launch_kwargs())
+
+             context = await browser.new_context(**self.__context_kwargs())
+             # Finally we are in business
+             page = await context.new_page()
+             page.set_default_navigation_timeout(self.timeout)
+             page.set_default_timeout(self.timeout)
+
+             if self.extra_headers:
+                 await page.set_extra_http_headers(self.extra_headers)
+
+             if self.disable_resources:
+                 await page.route("**/*", async_intercept_route)
+
+             if self.stealth:
+                 for script in self.__stealth_scripts():
+                     await page.add_init_script(path=script)
+
+             res = await page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
+             await page.wait_for_load_state(state="domcontentloaded")
+             if self.network_idle:
+                 await page.wait_for_load_state('networkidle')
+
+             if self.page_action is not None:
+                 page = await self.page_action(page)
+
+             if self.wait_selector and type(self.wait_selector) is str:
+                 waiter = page.locator(self.wait_selector)
+                 await waiter.first.wait_for(state=self.wait_selector_state)
+                 # Wait again after waiting for the selector, helpful with protections like Cloudflare
+                 await page.wait_for_load_state(state="load")
+                 await page.wait_for_load_state(state="domcontentloaded")
+                 if self.network_idle:
+                     await page.wait_for_load_state('networkidle')
+
+             # This will be parsed inside `Response`
+             encoding = res.headers.get('content-type', '') or 'utf-8'  # default encoding
+             # PlayWright API sometimes give empty status text for some reason!
+             status_text = res.status_text or StatusText.get(res.status)
+
+             response = Response(
+                 url=res.url,
+                 text=await page.content(),
+                 body=(await page.content()).encode('utf-8'),
+                 status=res.status,
+                 reason=status_text,
+                 encoding=encoding,
+                 cookies={cookie['name']: cookie['value'] for cookie in await page.context.cookies()},
+                 headers=await res.all_headers(),
+                 request_headers=await res.request.all_headers(),
+                 **self.adaptor_arguments
+             )
+             await page.close()
+             return response
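The pw.py changes above fold the old inline launch, context, and stealth-script blocks into cached private helpers (__set_flags, __launch_kwargs, __context_kwargs, __stealth_scripts) that are shared by the unchanged sync fetch and the new async_fetch. A minimal sketch of driving the engine directly, assuming the constructor arguments not shown in these hunks keep their defaults (the public PlayWrightFetcher in scrapling.fetchers remains the intended entry point):

import asyncio

from scrapling.engines.pw import PlaywrightEngine

# One engine instance serves both code paths after the refactor
engine = PlaywrightEngine(headless=True, timeout=30000)

# Synchronous path, unchanged public behaviour
sync_page = engine.fetch('https://example.com')
print(sync_page.status, sync_page.reason)

async def main():
    # New in 0.2.9: the async counterpart added by this diff
    async_page = await engine.async_fetch('https://example.com')
    print(async_page.status, async_page.reason)

asyncio.run(main())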
scrapling/engines/static.py CHANGED
@@ -1,33 +1,44 @@
- import logging
-
- from scrapling.core._types import Union, Optional, Dict
- from .toolbelt import Response, generate_convincing_referer, generate_headers
-
  import httpx
  from httpx._models import Response as httpxResponse
 
+ from scrapling.core._types import Dict, Optional, Tuple, Union
+ from scrapling.core.utils import log, lru_cache
 
+ from .toolbelt import Response, generate_convincing_referer, generate_headers
+
+
+ @lru_cache(typed=True)
  class StaticEngine:
-     def __init__(self, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = None, adaptor_arguments: Dict = None):
+     def __init__(
+             self, url: str, proxy: Optional[str] = None, stealthy_headers: Optional[bool] = True, follow_redirects: bool = True,
+             timeout: Optional[Union[int, float]] = None, retries: Optional[int] = 3, adaptor_arguments: Tuple = None
+     ):
          """An engine that utilizes httpx library, check the `Fetcher` class for more documentation.
 
+         :param url: Target url.
+         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
+             create a referer header as if this request had came from Google's search of this URL's domain.
+         :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
          :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
          :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
          :param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
          """
+         self.url = url
+         self.proxy = proxy
+         self.stealth = stealthy_headers
          self.timeout = timeout
          self.follow_redirects = bool(follow_redirects)
+         self.retries = retries
          self._extra_headers = generate_headers(browser_mode=False)
-         self.adaptor_arguments = adaptor_arguments if adaptor_arguments else {}
+         # Because we are using `lru_cache` for a slight optimization but both dict/dict_items are not hashable so they can't be cached
+         # So my solution here was to convert it to tuple then convert it back to dictionary again here as tuples are hashable, ofc `tuple().__hash__()`
+         self.adaptor_arguments = dict(adaptor_arguments) if adaptor_arguments else {}
 
-     @staticmethod
-     def _headers_job(headers: Optional[Dict], url: str, stealth: bool) -> Dict:
+     def _headers_job(self, headers: Optional[Dict]) -> Dict:
          """Adds useragent to headers if it doesn't exist, generates real headers and append it to current headers, and
          finally generates a referer header that looks like if this request came from Google's search of the current URL's domain.
 
          :param headers: Current headers in the request if the user passed any
-         :param url: The Target URL.
-         :param stealth: Whether stealth mode is enabled or not.
          :return: A dictionary of the new headers.
          """
          headers = headers or {}
@@ -35,12 +46,12 @@ class StaticEngine:
          # Validate headers
          if not headers.get('user-agent') and not headers.get('User-Agent'):
              headers['User-Agent'] = generate_headers(browser_mode=False).get('User-Agent')
-             logging.info(f"Can't find useragent in headers so '{headers['User-Agent']}' was used.")
+             log.debug(f"Can't find useragent in headers so '{headers['User-Agent']}' was used.")
 
-         if stealth:
+         if self.stealth:
              extra_headers = generate_headers(browser_mode=False)
              headers.update(extra_headers)
-             headers.update({'referer': generate_convincing_referer(url)})
+             headers.update({'referer': generate_convincing_referer(self.url)})
 
          return headers
 
@@ -60,69 +71,102 @@ class StaticEngine:
              cookies=dict(response.cookies),
              headers=dict(response.headers),
              request_headers=dict(response.request.headers),
+             method=response.request.method,
              **self.adaptor_arguments
          )
 
-     def get(self, url: str, proxy: Optional[str] = None, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+     def get(self, **kwargs: Dict) -> Response:
          """Make basic HTTP GET request for you but with some added flavors.
 
-         :param url: Target url.
-         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
-             create a referer header as if this request had came from Google's search of this URL's domain.
-         :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
-         :param kwargs: Any additional keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
+         :param kwargs: Any keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
+         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
+         """
+         headers = self._headers_job(kwargs.pop('headers', {}))
+         with httpx.Client(proxy=self.proxy, transport=httpx.HTTPTransport(retries=self.retries)) as client:
+             request = client.get(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+
+         return self._prepare_response(request)
+
+     async def async_get(self, **kwargs: Dict) -> Response:
+         """Make basic async HTTP GET request for you but with some added flavors.
+
+         :param kwargs: Any keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
          :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
          """
-         headers = self._headers_job(kwargs.pop('headers', {}), url, stealthy_headers)
-         with httpx.Client(proxy=proxy) as client:
-             request = client.get(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+         headers = self._headers_job(kwargs.pop('headers', {}))
+         async with httpx.AsyncClient(proxy=self.proxy) as client:
+             request = await client.get(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
 
          return self._prepare_response(request)
 
-     def post(self, url: str, proxy: Optional[str] = None, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+     def post(self, **kwargs: Dict) -> Response:
          """Make basic HTTP POST request for you but with some added flavors.
 
-         :param url: Target url.
-         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
-             create a referer header as if this request had came from Google's search of this URL's domain.
-         :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
-         :param kwargs: Any additional keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
+         :param kwargs: Any keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
+         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
+         """
+         headers = self._headers_job(kwargs.pop('headers', {}))
+         with httpx.Client(proxy=self.proxy, transport=httpx.HTTPTransport(retries=self.retries)) as client:
+             request = client.post(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+
+         return self._prepare_response(request)
+
+     async def async_post(self, **kwargs: Dict) -> Response:
+         """Make basic async HTTP POST request for you but with some added flavors.
+
+         :param kwargs: Any keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
          :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
          """
-         headers = self._headers_job(kwargs.pop('headers', {}), url, stealthy_headers)
-         with httpx.Client(proxy=proxy) as client:
-             request = client.post(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+         headers = self._headers_job(kwargs.pop('headers', {}))
+         async with httpx.AsyncClient(proxy=self.proxy) as client:
+             request = await client.post(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
 
          return self._prepare_response(request)
 
-     def delete(self, url: str, proxy: Optional[str] = None, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+     def delete(self, **kwargs: Dict) -> Response:
          """Make basic HTTP DELETE request for you but with some added flavors.
 
-         :param url: Target url.
-         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
-             create a referer header as if this request had came from Google's search of this URL's domain.
-         :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
-         :param kwargs: Any additional keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
+         :param kwargs: Any keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
          :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
          """
-         headers = self._headers_job(kwargs.pop('headers', {}), url, stealthy_headers)
-         with httpx.Client(proxy=proxy) as client:
-             request = client.delete(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+         headers = self._headers_job(kwargs.pop('headers', {}))
+         with httpx.Client(proxy=self.proxy, transport=httpx.HTTPTransport(retries=self.retries)) as client:
+             request = client.delete(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
 
          return self._prepare_response(request)
 
-     def put(self, url: str, proxy: Optional[str] = None, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+     async def async_delete(self, **kwargs: Dict) -> Response:
+         """Make basic async HTTP DELETE request for you but with some added flavors.
+
+         :param kwargs: Any keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
+         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
+         """
+         headers = self._headers_job(kwargs.pop('headers', {}))
+         async with httpx.AsyncClient(proxy=self.proxy) as client:
+             request = await client.delete(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+
+         return self._prepare_response(request)
+
+     def put(self, **kwargs: Dict) -> Response:
          """Make basic HTTP PUT request for you but with some added flavors.
 
-         :param url: Target url.
-         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
-             create a referer header as if this request had came from Google's search of this URL's domain.
-         :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
-         :param kwargs: Any additional keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
+         :param kwargs: Any keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
+         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
+         """
+         headers = self._headers_job(kwargs.pop('headers', {}))
+         with httpx.Client(proxy=self.proxy, transport=httpx.HTTPTransport(retries=self.retries)) as client:
+             request = client.put(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+
+         return self._prepare_response(request)
+
+     async def async_put(self, **kwargs: Dict) -> Response:
+         """Make basic async HTTP PUT request for you but with some added flavors.
+
+         :param kwargs: Any keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
          :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
          """
-         headers = self._headers_job(kwargs.pop('headers', {}), url, stealthy_headers)
-         with httpx.Client(proxy=proxy) as client:
-             request = client.put(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+         headers = self._headers_job(kwargs.pop('headers', {}))
+         async with httpx.AsyncClient(proxy=self.proxy) as client:
+             request = await client.put(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
 
          return self._prepare_response(request)
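StaticEngine now takes the target URL, proxy, stealthy-headers flag, and retry count at construction time (and the class is wrapped in lru_cache), so the per-request methods only accept extra httpx keyword arguments; adaptor_arguments must arrive as a hashable tuple of items, as the comment in the __init__ hunk explains. A sketch of the new call pattern with illustrative values only (the high-level Fetcher class in scrapling.fetchers remains the documented interface):

from scrapling.engines.static import StaticEngine

# dicts are not hashable, so adaptor arguments travel as a tuple of items
adaptor_arguments = tuple({'encoding': 'utf-8'}.items())

engine = StaticEngine(
    url='https://example.com',
    stealthy_headers=True,      # adds browser-like headers and a Google referer
    follow_redirects=True,
    timeout=10,
    retries=3,                  # applied through httpx.HTTPTransport(retries=...)
    adaptor_arguments=adaptor_arguments,
)

page = engine.get()             # sync GET; async_get/async_post/... are the awaitable twins
print(page.status)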
scrapling/engines/toolbelt/__init__.py CHANGED
@@ -1,20 +1,6 @@
- from .fingerprints import (
-     get_os_name,
-     generate_headers,
-     generate_convincing_referer,
- )
- from .custom import (
-     Response,
-     do_nothing,
-     StatusText,
-     BaseFetcher,
-     get_variable_name,
-     check_type_validity,
-     check_if_engine_usable,
- )
- from .navigation import (
-     js_bypass_path,
-     intercept_route,
-     construct_cdp_url,
-     construct_proxy_dict,
- )
+ from .custom import (BaseFetcher, Response, StatusText, check_if_engine_usable,
+                      check_type_validity, get_variable_name)
+ from .fingerprints import (generate_convincing_referer, generate_headers,
+                            get_os_name)
+ from .navigation import (async_intercept_route, construct_cdp_url,
+                          construct_proxy_dict, intercept_route, js_bypass_path)
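With do_nothing dropped from the toolbelt exports and page_action defaulting to None in the pw.py hunks above, callers no longer import a placeholder callback; they either omit the argument or pass their own callable. A sketch of the surviving pattern, assuming the PlayWrightFetcher keyword names from the scrapling README (treat the exact call as illustrative):

from scrapling.fetchers import PlayWrightFetcher

def scroll_to_bottom(page):
    # `page_action` still receives the Playwright page object and must return it
    page.mouse.wheel(0, 10000)
    return page

# Omitting page_action entirely replaces the old `do_nothing` default
plain = PlayWrightFetcher().fetch('https://example.com')

# A custom callable works exactly as before
scrolled = PlayWrightFetcher().fetch('https://example.com', page_action=scroll_to_bottom)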