scrapling 0.2.7__py3-none-any.whl → 0.2.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. scrapling/__init__.py +5 -4
  2. scrapling/core/_types.py +2 -3
  3. scrapling/core/custom_types.py +93 -11
  4. scrapling/core/storage_adaptors.py +9 -10
  5. scrapling/core/translator.py +6 -7
  6. scrapling/core/utils.py +35 -30
  7. scrapling/defaults.py +2 -1
  8. scrapling/engines/__init__.py +2 -2
  9. scrapling/engines/camo.py +96 -26
  10. scrapling/engines/constants.py +4 -4
  11. scrapling/engines/pw.py +166 -96
  12. scrapling/engines/static.py +94 -50
  13. scrapling/engines/toolbelt/__init__.py +6 -20
  14. scrapling/engines/toolbelt/custom.py +22 -23
  15. scrapling/engines/toolbelt/fingerprints.py +7 -7
  16. scrapling/engines/toolbelt/navigation.py +25 -12
  17. scrapling/fetchers.py +233 -17
  18. scrapling/parser.py +63 -28
  19. {scrapling-0.2.7.dist-info → scrapling-0.2.9.dist-info}/METADATA +41 -25
  20. scrapling-0.2.9.dist-info/RECORD +47 -0
  21. tests/fetchers/async/__init__.py +0 -0
  22. tests/fetchers/async/test_camoufox.py +95 -0
  23. tests/fetchers/async/test_httpx.py +83 -0
  24. tests/fetchers/async/test_playwright.py +99 -0
  25. tests/fetchers/sync/__init__.py +0 -0
  26. tests/fetchers/sync/test_camoufox.py +68 -0
  27. tests/fetchers/sync/test_httpx.py +82 -0
  28. tests/fetchers/sync/test_playwright.py +87 -0
  29. tests/fetchers/test_utils.py +90 -122
  30. tests/parser/test_automatch.py +64 -9
  31. tests/parser/test_general.py +263 -219
  32. scrapling-0.2.7.dist-info/RECORD +0 -42
  33. tests/fetchers/test_camoufox.py +0 -64
  34. tests/fetchers/test_httpx.py +0 -67
  35. tests/fetchers/test_playwright.py +0 -76
  36. {scrapling-0.2.7.dist-info → scrapling-0.2.9.dist-info}/LICENSE +0 -0
  37. {scrapling-0.2.7.dist-info → scrapling-0.2.9.dist-info}/WHEEL +0 -0
  38. {scrapling-0.2.7.dist-info → scrapling-0.2.9.dist-info}/top_level.txt +0 -0
scrapling/engines/pw.py CHANGED
@@ -1,20 +1,16 @@
  import json
- import logging
- from scrapling.core._types import Union, Callable, Optional, List, Dict
-
- from scrapling.engines.constants import DEFAULT_STEALTH_FLAGS, NSTBROWSER_DEFAULT_QUERY
- from scrapling.engines.toolbelt import (
-     Response,
-     do_nothing,
-     StatusText,
-     js_bypass_path,
-     intercept_route,
-     generate_headers,
-     construct_cdp_url,
-     check_type_validity,
-     construct_proxy_dict,
-     generate_convincing_referer,
- )
+
+ from scrapling.core._types import Callable, Dict, Optional, Union
+ from scrapling.core.utils import log, lru_cache
+ from scrapling.engines.constants import (DEFAULT_STEALTH_FLAGS,
+                                          NSTBROWSER_DEFAULT_QUERY)
+ from scrapling.engines.toolbelt import (Response, StatusText,
+                                         async_intercept_route,
+                                         check_type_validity, construct_cdp_url,
+                                         construct_proxy_dict,
+                                         generate_convincing_referer,
+                                         generate_headers, intercept_route,
+                                         js_bypass_path)


  class PlaywrightEngine:
@@ -24,7 +20,7 @@ class PlaywrightEngine:
              useragent: Optional[str] = None,
              network_idle: Optional[bool] = False,
              timeout: Optional[float] = 30000,
-             page_action: Callable = do_nothing,
+             page_action: Callable = None,
              wait_selector: Optional[str] = None,
              locale: Optional[str] = 'en-US',
              wait_selector_state: Optional[str] = 'attached',
@@ -79,11 +75,14 @@ class PlaywrightEngine:
          self.cdp_url = cdp_url
          self.useragent = useragent
          self.timeout = check_type_validity(timeout, [int, float], 30000)
-         if callable(page_action):
-             self.page_action = page_action
+         if page_action is not None:
+             if callable(page_action):
+                 self.page_action = page_action
+             else:
+                 self.page_action = None
+                 log.error('[Ignored] Argument "page_action" must be callable')
          else:
-             self.page_action = do_nothing
-             logging.error('[Ignored] Argument "page_action" must be callable')
+             self.page_action = None

          self.wait_selector = wait_selector
          self.wait_selector_state = wait_selector_state
@@ -99,10 +98,8 @@ class PlaywrightEngine:
              # '--disable-extensions',
          ]

-     def _cdp_url_logic(self, flags: Optional[List] = None) -> str:
+     def _cdp_url_logic(self) -> str:
          """Constructs new CDP URL if NSTBrowser is enabled otherwise return CDP URL as it is
-
-         :param flags: Chrome flags to be added to NSTBrowser query
          :return: CDP URL
          """
          cdp_url = self.cdp_url
@@ -111,7 +108,8 @@ class PlaywrightEngine:
              config = self.nstbrowser_config
          else:
              query = NSTBROWSER_DEFAULT_QUERY.copy()
-             if flags:
+             if self.stealth:
+                 flags = self.__set_flags()
                  query.update({
                      "args": dict(zip(flags, [''] * len(flags))), # browser args should be a dictionary
                  })
@@ -127,6 +125,68 @@ class PlaywrightEngine:

          return cdp_url

+     @lru_cache(typed=True)
+     def __set_flags(self):
+         """Returns the flags that will be used while launching the browser if stealth mode is enabled"""
+         flags = DEFAULT_STEALTH_FLAGS
+         if self.hide_canvas:
+             flags += ('--fingerprinting-canvas-image-data-noise',)
+         if self.disable_webgl:
+             flags += ('--disable-webgl', '--disable-webgl-image-chromium', '--disable-webgl2',)
+
+         return flags
+
+     def __launch_kwargs(self):
+         """Creates the arguments we will use while launching playwright's browser"""
+         launch_kwargs = {'headless': self.headless, 'ignore_default_args': self.harmful_default_args, 'channel': 'chrome' if self.real_chrome else 'chromium'}
+         if self.stealth:
+             launch_kwargs.update({'args': self.__set_flags(), 'chromium_sandbox': True})
+
+         return launch_kwargs
+
+     def __context_kwargs(self):
+         """Creates the arguments for the browser context"""
+         context_kwargs = {
+             "proxy": self.proxy,
+             "locale": self.locale,
+             "color_scheme": 'dark', # Bypasses the 'prefersLightColor' check in creepjs
+             "device_scale_factor": 2,
+             "extra_http_headers": self.extra_headers if self.extra_headers else {},
+             "user_agent": self.useragent if self.useragent else generate_headers(browser_mode=True).get('User-Agent'),
+         }
+         if self.stealth:
+             context_kwargs.update({
+                 'is_mobile': False,
+                 'has_touch': False,
+                 # I'm thinking about disabling it to rest from all Service Workers headache but let's keep it as it is for now
+                 'service_workers': 'allow',
+                 'ignore_https_errors': True,
+                 'screen': {'width': 1920, 'height': 1080},
+                 'viewport': {'width': 1920, 'height': 1080},
+                 'permissions': ['geolocation', 'notifications']
+             })
+
+         return context_kwargs
+
+     @lru_cache()
+     def __stealth_scripts(self):
+         # Basic bypasses nothing fancy as I'm still working on it
+         # But with adding these bypasses to the above config, it bypasses many online tests like
+         # https://bot.sannysoft.com/
+         # https://kaliiiiiiiiii.github.io/brotector/
+         # https://pixelscan.net/
+         # https://iphey.com/
+         # https://www.browserscan.net/bot-detection <== this one also checks for the CDP runtime fingerprint
+         # https://arh.antoinevastel.com/bots/areyouheadless/
+         # https://prescience-data.github.io/execution-monitor.html
+         return tuple(
+             js_bypass_path(script) for script in (
+                 # Order is important
+                 'webdriver_fully.js', 'window_chrome.js', 'navigator_plugins.js', 'pdf_viewer.js',
+                 'notification_permission.js', 'screen_props.js', 'playwright_fingerprint.js'
+             )
+         )
+
      def fetch(self, url: str) -> Response:
          """Opens up the browser and do your request based on your chosen options.

@@ -140,61 +200,14 @@ class PlaywrightEngine:
              from rebrowser_playwright.sync_api import sync_playwright

          with sync_playwright() as p:
-             # Handle the UserAgent early
-             if self.useragent:
-                 extra_headers = {}
-                 useragent = self.useragent
-             else:
-                 extra_headers = {}
-                 useragent = generate_headers(browser_mode=True).get('User-Agent')
-
-             # Prepare the flags before diving
-             flags = DEFAULT_STEALTH_FLAGS
-             if self.hide_canvas:
-                 flags += ['--fingerprinting-canvas-image-data-noise']
-             if self.disable_webgl:
-                 flags += ['--disable-webgl', '--disable-webgl-image-chromium', '--disable-webgl2']
-
              # Creating the browser
              if self.cdp_url:
-                 cdp_url = self._cdp_url_logic(flags if self.stealth else None)
+                 cdp_url = self._cdp_url_logic()
                  browser = p.chromium.connect_over_cdp(endpoint_url=cdp_url)
              else:
-                 if self.stealth:
-                     browser = p.chromium.launch(
-                         headless=self.headless, args=flags, ignore_default_args=self.harmful_default_args, chromium_sandbox=True, channel='chrome' if self.real_chrome else 'chromium'
-                     )
-                 else:
-                     browser = p.chromium.launch(headless=self.headless, ignore_default_args=self.harmful_default_args, channel='chrome' if self.real_chrome else 'chromium')
-
-             # Creating the context
-             if self.stealth:
-                 context = browser.new_context(
-                     locale=self.locale,
-                     is_mobile=False,
-                     has_touch=False,
-                     proxy=self.proxy,
-                     color_scheme='dark', # Bypasses the 'prefersLightColor' check in creepjs
-                     user_agent=useragent,
-                     device_scale_factor=2,
-                     # I'm thinking about disabling it to rest from all Service Workers headache but let's keep it as it is for now
-                     service_workers="allow",
-                     ignore_https_errors=True,
-                     extra_http_headers=extra_headers,
-                     screen={"width": 1920, "height": 1080},
-                     viewport={"width": 1920, "height": 1080},
-                     permissions=["geolocation", 'notifications'],
-                 )
-             else:
-                 context = browser.new_context(
-                     locale=self.locale,
-                     proxy=self.proxy,
-                     color_scheme='dark',
-                     user_agent=useragent,
-                     device_scale_factor=2,
-                     extra_http_headers=extra_headers
-                 )
+                 browser = p.chromium.launch(**self.__launch_kwargs())

+             context = browser.new_context(**self.__context_kwargs())
              # Finally we are in business
              page = context.new_page()
              page.set_default_navigation_timeout(self.timeout)
@@ -207,29 +220,16 @@ class PlaywrightEngine:
                  page.route("**/*", intercept_route)

              if self.stealth:
-                 # Basic bypasses nothing fancy as I'm still working on it
-                 # But with adding these bypasses to the above config, it bypasses many online tests like
-                 # https://bot.sannysoft.com/
-                 # https://kaliiiiiiiiii.github.io/brotector/
-                 # https://pixelscan.net/
-                 # https://iphey.com/
-                 # https://www.browserscan.net/bot-detection <== this one also checks for the CDP runtime fingerprint
-                 # https://arh.antoinevastel.com/bots/areyouheadless/
-                 # https://prescience-data.github.io/execution-monitor.html
-                 page.add_init_script(path=js_bypass_path('webdriver_fully.js'))
-                 page.add_init_script(path=js_bypass_path('window_chrome.js'))
-                 page.add_init_script(path=js_bypass_path('navigator_plugins.js'))
-                 page.add_init_script(path=js_bypass_path('pdf_viewer.js'))
-                 page.add_init_script(path=js_bypass_path('notification_permission.js'))
-                 page.add_init_script(path=js_bypass_path('screen_props.js'))
-                 page.add_init_script(path=js_bypass_path('playwright_fingerprint.js'))
+                 for script in self.__stealth_scripts():
+                     page.add_init_script(path=script)

              res = page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
              page.wait_for_load_state(state="domcontentloaded")
              if self.network_idle:
                  page.wait_for_load_state('networkidle')

-             page = self.page_action(page)
+             if self.page_action is not None:
+                 page = self.page_action(page)

              if self.wait_selector and type(self.wait_selector) is str:
                  waiter = page.locator(self.wait_selector)
@@ -242,11 +242,8 @@ class PlaywrightEngine:

              # This will be parsed inside `Response`
              encoding = res.headers.get('content-type', '') or 'utf-8' # default encoding
-
-             status_text = res.status_text
              # PlayWright API sometimes give empty status text for some reason!
-             if not status_text:
-                 status_text = StatusText.get(res.status)
+             status_text = res.status_text or StatusText.get(res.status)

              response = Response(
                  url=res.url,
@@ -262,3 +259,76 @@ class PlaywrightEngine:
              )
              page.close()
              return response
+
+     async def async_fetch(self, url: str) -> Response:
+         """Async version of `fetch`
+
+         :param url: Target url.
+         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
+         """
+         if not self.stealth or self.real_chrome:
+             # Because rebrowser_playwright doesn't play well with real browsers
+             from playwright.async_api import async_playwright
+         else:
+             from rebrowser_playwright.async_api import async_playwright
+
+         async with async_playwright() as p:
+             # Creating the browser
+             if self.cdp_url:
+                 cdp_url = self._cdp_url_logic()
+                 browser = await p.chromium.connect_over_cdp(endpoint_url=cdp_url)
+             else:
+                 browser = await p.chromium.launch(**self.__launch_kwargs())
+
+             context = await browser.new_context(**self.__context_kwargs())
+             # Finally we are in business
+             page = await context.new_page()
+             page.set_default_navigation_timeout(self.timeout)
+             page.set_default_timeout(self.timeout)
+
+             if self.extra_headers:
+                 await page.set_extra_http_headers(self.extra_headers)
+
+             if self.disable_resources:
+                 await page.route("**/*", async_intercept_route)
+
+             if self.stealth:
+                 for script in self.__stealth_scripts():
+                     await page.add_init_script(path=script)
+
+             res = await page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
+             await page.wait_for_load_state(state="domcontentloaded")
+             if self.network_idle:
+                 await page.wait_for_load_state('networkidle')
+
+             if self.page_action is not None:
+                 page = await self.page_action(page)
+
+             if self.wait_selector and type(self.wait_selector) is str:
+                 waiter = page.locator(self.wait_selector)
+                 await waiter.first.wait_for(state=self.wait_selector_state)
+                 # Wait again after waiting for the selector, helpful with protections like Cloudflare
+                 await page.wait_for_load_state(state="load")
+                 await page.wait_for_load_state(state="domcontentloaded")
+                 if self.network_idle:
+                     await page.wait_for_load_state('networkidle')
+
+             # This will be parsed inside `Response`
+             encoding = res.headers.get('content-type', '') or 'utf-8' # default encoding
+             # PlayWright API sometimes give empty status text for some reason!
+             status_text = res.status_text or StatusText.get(res.status)
+
+             response = Response(
+                 url=res.url,
+                 text=await page.content(),
+                 body=(await page.content()).encode('utf-8'),
+                 status=res.status,
+                 reason=status_text,
+                 encoding=encoding,
+                 cookies={cookie['name']: cookie['value'] for cookie in await page.context.cookies()},
+                 headers=await res.all_headers(),
+                 request_headers=await res.request.all_headers(),
+                 **self.adaptor_arguments
+             )
+             await page.close()
+             return response
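The refactor above replaces the inline launch/context setup with cached private helpers (`__set_flags`, `__launch_kwargs`, `__context_kwargs`, `__stealth_scripts`) shared by `fetch` and the new `async_fetch`, and `page_action` now defaults to `None` instead of `do_nothing`. A minimal usage sketch, assuming the engine is constructed directly and the remaining constructor defaults suffice; only the parameters visible in the hunks above come from the diff, and the callback body is hypothetical:

    import asyncio

    from scrapling.engines.pw import PlaywrightEngine

    def scroll_once(page):
        # `fetch` hands the Playwright page object to the callable and expects it back
        page.mouse.wheel(0, 600)  # hypothetical action
        return page

    engine = PlaywrightEngine(timeout=30000, page_action=scroll_once, wait_selector='h1')
    response = engine.fetch('https://example.com')  # sync path
    print(response.status, response.reason)

    # The async path awaits `page_action`, so the callable must be a coroutine there:
    # response = asyncio.run(engine.async_fetch('https://example.com'))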
scrapling/engines/static.py CHANGED
@@ -1,33 +1,44 @@
- import logging
-
- from scrapling.core._types import Union, Optional, Dict
- from .toolbelt import Response, generate_convincing_referer, generate_headers
-
  import httpx
  from httpx._models import Response as httpxResponse

+ from scrapling.core._types import Dict, Optional, Tuple, Union
+ from scrapling.core.utils import log, lru_cache

+ from .toolbelt import Response, generate_convincing_referer, generate_headers
+
+
+ @lru_cache(typed=True)
  class StaticEngine:
-     def __init__(self, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = None, adaptor_arguments: Dict = None):
+     def __init__(
+             self, url: str, proxy: Optional[str] = None, stealthy_headers: Optional[bool] = True, follow_redirects: bool = True,
+             timeout: Optional[Union[int, float]] = None, retries: Optional[int] = 3, adaptor_arguments: Tuple = None
+     ):
          """An engine that utilizes httpx library, check the `Fetcher` class for more documentation.

+         :param url: Target url.
+         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
+             create a referer header as if this request had came from Google's search of this URL's domain.
+         :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
          :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
          :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
          :param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
          """
+         self.url = url
+         self.proxy = proxy
+         self.stealth = stealthy_headers
          self.timeout = timeout
          self.follow_redirects = bool(follow_redirects)
+         self.retries = retries
          self._extra_headers = generate_headers(browser_mode=False)
-         self.adaptor_arguments = adaptor_arguments if adaptor_arguments else {}
+         # Because we are using `lru_cache` for a slight optimization but both dict/dict_items are not hashable so they can't be cached
+         # So my solution here was to convert it to tuple then convert it back to dictionary again here as tuples are hashable, ofc `tuple().__hash__()`
+         self.adaptor_arguments = dict(adaptor_arguments) if adaptor_arguments else {}

-     @staticmethod
-     def _headers_job(headers: Optional[Dict], url: str, stealth: bool) -> Dict:
+     def _headers_job(self, headers: Optional[Dict]) -> Dict:
          """Adds useragent to headers if it doesn't exist, generates real headers and append it to current headers, and
          finally generates a referer header that looks like if this request came from Google's search of the current URL's domain.

          :param headers: Current headers in the request if the user passed any
-         :param url: The Target URL.
-         :param stealth: Whether stealth mode is enabled or not.
          :return: A dictionary of the new headers.
          """
          headers = headers or {}
@@ -35,12 +46,12 @@ class StaticEngine:
          # Validate headers
          if not headers.get('user-agent') and not headers.get('User-Agent'):
              headers['User-Agent'] = generate_headers(browser_mode=False).get('User-Agent')
-             logging.info(f"Can't find useragent in headers so '{headers['User-Agent']}' was used.")
+             log.debug(f"Can't find useragent in headers so '{headers['User-Agent']}' was used.")

-         if stealth:
+         if self.stealth:
              extra_headers = generate_headers(browser_mode=False)
              headers.update(extra_headers)
-             headers.update({'referer': generate_convincing_referer(url)})
+             headers.update({'referer': generate_convincing_referer(self.url)})

          return headers

@@ -60,69 +71,102 @@ class StaticEngine:
              cookies=dict(response.cookies),
              headers=dict(response.headers),
              request_headers=dict(response.request.headers),
+             method=response.request.method,
              **self.adaptor_arguments
          )

-     def get(self, url: str, proxy: Optional[str] = None, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+     def get(self, **kwargs: Dict) -> Response:
          """Make basic HTTP GET request for you but with some added flavors.

-         :param url: Target url.
-         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
-             create a referer header as if this request had came from Google's search of this URL's domain.
-         :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
-         :param kwargs: Any additional keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
+         :param kwargs: Any keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
+         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
+         """
+         headers = self._headers_job(kwargs.pop('headers', {}))
+         with httpx.Client(proxy=self.proxy, transport=httpx.HTTPTransport(retries=self.retries)) as client:
+             request = client.get(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+
+         return self._prepare_response(request)
+
+     async def async_get(self, **kwargs: Dict) -> Response:
+         """Make basic async HTTP GET request for you but with some added flavors.
+
+         :param kwargs: Any keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
          :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
          """
-         headers = self._headers_job(kwargs.pop('headers', {}), url, stealthy_headers)
-         with httpx.Client(proxy=proxy) as client:
-             request = client.get(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+         headers = self._headers_job(kwargs.pop('headers', {}))
+         async with httpx.AsyncClient(proxy=self.proxy) as client:
+             request = await client.get(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)

          return self._prepare_response(request)

-     def post(self, url: str, proxy: Optional[str] = None, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+     def post(self, **kwargs: Dict) -> Response:
          """Make basic HTTP POST request for you but with some added flavors.

-         :param url: Target url.
-         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
-             create a referer header as if this request had came from Google's search of this URL's domain.
-         :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
-         :param kwargs: Any additional keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
+         :param kwargs: Any keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
+         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
+         """
+         headers = self._headers_job(kwargs.pop('headers', {}))
+         with httpx.Client(proxy=self.proxy, transport=httpx.HTTPTransport(retries=self.retries)) as client:
+             request = client.post(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+
+         return self._prepare_response(request)
+
+     async def async_post(self, **kwargs: Dict) -> Response:
+         """Make basic async HTTP POST request for you but with some added flavors.
+
+         :param kwargs: Any keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
          :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
          """
-         headers = self._headers_job(kwargs.pop('headers', {}), url, stealthy_headers)
-         with httpx.Client(proxy=proxy) as client:
-             request = client.post(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+         headers = self._headers_job(kwargs.pop('headers', {}))
+         async with httpx.AsyncClient(proxy=self.proxy) as client:
+             request = await client.post(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)

          return self._prepare_response(request)

-     def delete(self, url: str, proxy: Optional[str] = None, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+     def delete(self, **kwargs: Dict) -> Response:
          """Make basic HTTP DELETE request for you but with some added flavors.

-         :param url: Target url.
-         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
-             create a referer header as if this request had came from Google's search of this URL's domain.
-         :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
-         :param kwargs: Any additional keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
+         :param kwargs: Any keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
          :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
          """
-         headers = self._headers_job(kwargs.pop('headers', {}), url, stealthy_headers)
-         with httpx.Client(proxy=proxy) as client:
-             request = client.delete(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+         headers = self._headers_job(kwargs.pop('headers', {}))
+         with httpx.Client(proxy=self.proxy, transport=httpx.HTTPTransport(retries=self.retries)) as client:
+             request = client.delete(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)

          return self._prepare_response(request)

-     def put(self, url: str, proxy: Optional[str] = None, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+     async def async_delete(self, **kwargs: Dict) -> Response:
+         """Make basic async HTTP DELETE request for you but with some added flavors.
+
+         :param kwargs: Any keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
+         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
+         """
+         headers = self._headers_job(kwargs.pop('headers', {}))
+         async with httpx.AsyncClient(proxy=self.proxy) as client:
+             request = await client.delete(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+
+         return self._prepare_response(request)
+
+     def put(self, **kwargs: Dict) -> Response:
          """Make basic HTTP PUT request for you but with some added flavors.

-         :param url: Target url.
-         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
-             create a referer header as if this request had came from Google's search of this URL's domain.
-         :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
-         :param kwargs: Any additional keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
+         :param kwargs: Any keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
+         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
+         """
+         headers = self._headers_job(kwargs.pop('headers', {}))
+         with httpx.Client(proxy=self.proxy, transport=httpx.HTTPTransport(retries=self.retries)) as client:
+             request = client.put(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+
+         return self._prepare_response(request)
+
+     async def async_put(self, **kwargs: Dict) -> Response:
+         """Make basic async HTTP PUT request for you but with some added flavors.
+
+         :param kwargs: Any keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
          :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
          """
-         headers = self._headers_job(kwargs.pop('headers', {}), url, stealthy_headers)
-         with httpx.Client(proxy=proxy) as client:
-             request = client.put(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+         headers = self._headers_job(kwargs.pop('headers', {}))
+         async with httpx.AsyncClient(proxy=self.proxy) as client:
+             request = await client.put(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)

          return self._prepare_response(request)
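The StaticEngine rework above moves `url`, `proxy`, and `stealthy_headers` from the per-request methods into the constructor, adds `retries` (wired into `httpx.HTTPTransport`), adds async variants of every verb, and wraps the class in `lru_cache`, which is why `adaptor_arguments` now arrives as a hashable tuple. A rough sketch of calling it directly under those assumptions (the `adaptor_arguments` keys are hypothetical; normally the `Fetcher` classes build this for you):

    import asyncio

    from scrapling.engines.static import StaticEngine

    # tuple(dict.items()) keeps the arguments hashable for the lru_cache-wrapped class
    adaptor_arguments = tuple({'huge_tree': True}.items())  # hypothetical Adaptor settings

    engine = StaticEngine('https://example.com', proxy=None, stealthy_headers=True,
                          retries=3, adaptor_arguments=adaptor_arguments)

    page = engine.get(params={'q': 'test'})  # extra kwargs go straight to httpx's get()
    print(page.status, page.reason)

    # The async variants mirror the sync ones:
    # page = asyncio.run(engine.async_get(params={'q': 'test'}))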
scrapling/engines/toolbelt/__init__.py CHANGED
@@ -1,20 +1,6 @@
- from .fingerprints import (
-     get_os_name,
-     generate_headers,
-     generate_convincing_referer,
- )
- from .custom import (
-     Response,
-     do_nothing,
-     StatusText,
-     BaseFetcher,
-     get_variable_name,
-     check_type_validity,
-     check_if_engine_usable,
- )
- from .navigation import (
-     js_bypass_path,
-     intercept_route,
-     construct_cdp_url,
-     construct_proxy_dict,
- )
+ from .custom import (BaseFetcher, Response, StatusText, check_if_engine_usable,
+                      check_type_validity, get_variable_name)
+ from .fingerprints import (generate_convincing_referer, generate_headers,
+                            get_os_name)
+ from .navigation import (async_intercept_route, construct_cdp_url,
+                          construct_proxy_dict, intercept_route, js_bypass_path)
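Per the hunk above, the toolbelt re-exports are now grouped per module; `do_nothing` is no longer exported (`page_action` defaults to `None` instead) and `async_intercept_route` is new. Imports of the remaining names keep working unchanged, e.g.:

    from scrapling.engines.toolbelt import (Response, StatusText,
                                            async_intercept_route, intercept_route)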