scrapling 0.2.94.tar.gz → 0.2.95.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. {scrapling-0.2.94/scrapling.egg-info → scrapling-0.2.95}/PKG-INFO +4 -2
  2. {scrapling-0.2.94 → scrapling-0.2.95}/README.md +2 -0
  3. {scrapling-0.2.94 → scrapling-0.2.95}/scrapling/__init__.py +1 -1
  4. {scrapling-0.2.94 → scrapling-0.2.95}/scrapling/engines/static.py +9 -6
  5. {scrapling-0.2.94 → scrapling-0.2.95/scrapling.egg-info}/PKG-INFO +4 -2
  6. {scrapling-0.2.94 → scrapling-0.2.95}/setup.cfg +2 -2
  7. {scrapling-0.2.94 → scrapling-0.2.95}/setup.py +3 -4
  8. {scrapling-0.2.94 → scrapling-0.2.95}/LICENSE +0 -0
  9. {scrapling-0.2.94 → scrapling-0.2.95}/MANIFEST.in +0 -0
  10. {scrapling-0.2.94 → scrapling-0.2.95}/scrapling/cli.py +0 -0
  11. {scrapling-0.2.94 → scrapling-0.2.95}/scrapling/core/__init__.py +0 -0
  12. {scrapling-0.2.94 → scrapling-0.2.95}/scrapling/core/_types.py +0 -0
  13. {scrapling-0.2.94 → scrapling-0.2.95}/scrapling/core/custom_types.py +0 -0
  14. {scrapling-0.2.94 → scrapling-0.2.95}/scrapling/core/mixins.py +0 -0
  15. {scrapling-0.2.94 → scrapling-0.2.95}/scrapling/core/storage_adaptors.py +0 -0
  16. {scrapling-0.2.94 → scrapling-0.2.95}/scrapling/core/translator.py +0 -0
  17. {scrapling-0.2.94 → scrapling-0.2.95}/scrapling/core/utils.py +0 -0
  18. {scrapling-0.2.94 → scrapling-0.2.95}/scrapling/defaults.py +0 -0
  19. {scrapling-0.2.94 → scrapling-0.2.95}/scrapling/engines/__init__.py +0 -0
  20. {scrapling-0.2.94 → scrapling-0.2.95}/scrapling/engines/camo.py +0 -0
  21. {scrapling-0.2.94 → scrapling-0.2.95}/scrapling/engines/constants.py +0 -0
  22. {scrapling-0.2.94 → scrapling-0.2.95}/scrapling/engines/pw.py +0 -0
  23. {scrapling-0.2.94 → scrapling-0.2.95}/scrapling/engines/toolbelt/__init__.py +0 -0
  24. {scrapling-0.2.94 → scrapling-0.2.95}/scrapling/engines/toolbelt/bypasses/navigator_plugins.js +0 -0
  25. {scrapling-0.2.94 → scrapling-0.2.95}/scrapling/engines/toolbelt/bypasses/notification_permission.js +0 -0
  26. {scrapling-0.2.94 → scrapling-0.2.95}/scrapling/engines/toolbelt/bypasses/pdf_viewer.js +0 -0
  27. {scrapling-0.2.94 → scrapling-0.2.95}/scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +0 -0
  28. {scrapling-0.2.94 → scrapling-0.2.95}/scrapling/engines/toolbelt/bypasses/screen_props.js +0 -0
  29. {scrapling-0.2.94 → scrapling-0.2.95}/scrapling/engines/toolbelt/bypasses/webdriver_fully.js +0 -0
  30. {scrapling-0.2.94 → scrapling-0.2.95}/scrapling/engines/toolbelt/bypasses/window_chrome.js +0 -0
  31. {scrapling-0.2.94 → scrapling-0.2.95}/scrapling/engines/toolbelt/custom.py +0 -0
  32. {scrapling-0.2.94 → scrapling-0.2.95}/scrapling/engines/toolbelt/fingerprints.py +0 -0
  33. {scrapling-0.2.94 → scrapling-0.2.95}/scrapling/engines/toolbelt/navigation.py +0 -0
  34. {scrapling-0.2.94 → scrapling-0.2.95}/scrapling/fetchers.py +0 -0
  35. {scrapling-0.2.94 → scrapling-0.2.95}/scrapling/parser.py +0 -0
  36. {scrapling-0.2.94 → scrapling-0.2.95}/scrapling/py.typed +0 -0
  37. {scrapling-0.2.94 → scrapling-0.2.95}/scrapling.egg-info/SOURCES.txt +0 -0
  38. {scrapling-0.2.94 → scrapling-0.2.95}/scrapling.egg-info/dependency_links.txt +0 -0
  39. {scrapling-0.2.94 → scrapling-0.2.95}/scrapling.egg-info/entry_points.txt +0 -0
  40. {scrapling-0.2.94 → scrapling-0.2.95}/scrapling.egg-info/not-zip-safe +0 -0
  41. {scrapling-0.2.94 → scrapling-0.2.95}/scrapling.egg-info/requires.txt +0 -0
  42. {scrapling-0.2.94 → scrapling-0.2.95}/scrapling.egg-info/top_level.txt +0 -0
  43. {scrapling-0.2.94 → scrapling-0.2.95}/tests/__init__.py +0 -0
  44. {scrapling-0.2.94 → scrapling-0.2.95}/tests/fetchers/__init__.py +0 -0
  45. {scrapling-0.2.94 → scrapling-0.2.95}/tests/fetchers/async/__init__.py +0 -0
  46. {scrapling-0.2.94 → scrapling-0.2.95}/tests/fetchers/async/test_camoufox.py +0 -0
  47. {scrapling-0.2.94 → scrapling-0.2.95}/tests/fetchers/async/test_httpx.py +0 -0
  48. {scrapling-0.2.94 → scrapling-0.2.95}/tests/fetchers/async/test_playwright.py +0 -0
  49. {scrapling-0.2.94 → scrapling-0.2.95}/tests/fetchers/sync/__init__.py +0 -0
  50. {scrapling-0.2.94 → scrapling-0.2.95}/tests/fetchers/sync/test_camoufox.py +0 -0
  51. {scrapling-0.2.94 → scrapling-0.2.95}/tests/fetchers/sync/test_httpx.py +0 -0
  52. {scrapling-0.2.94 → scrapling-0.2.95}/tests/fetchers/sync/test_playwright.py +0 -0
  53. {scrapling-0.2.94 → scrapling-0.2.95}/tests/fetchers/test_utils.py +0 -0
  54. {scrapling-0.2.94 → scrapling-0.2.95}/tests/parser/__init__.py +0 -0
  55. {scrapling-0.2.94 → scrapling-0.2.95}/tests/parser/test_automatch.py +0 -0
  56. {scrapling-0.2.94 → scrapling-0.2.95}/tests/parser/test_general.py +0 -0
{scrapling-0.2.94/scrapling.egg-info → scrapling-0.2.95}/PKG-INFO
@@ -1,7 +1,7 @@
 Metadata-Version: 2.2
 Name: scrapling
-Version: 0.2.94
-Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
+Version: 0.2.95
+Summary: Scrapling is an undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy again! In an internet filled with complications,
 Home-page: https://github.com/D4Vinci/Scrapling
 Author: Karim Shoair
 Author-email: karim.shoair@pm.me
@@ -275,6 +275,8 @@ This class is built on top of [httpx](https://www.python-httpx.org/) with additi
 
 For all methods, you have `stealthy_headers` which makes `Fetcher` create and use real browser's headers then create a referer header as if this request came from Google's search of this URL's domain. It's enabled by default. You can also set the number of retries with the argument `retries` for all methods and this will make httpx retry requests if it failed for any reason. The default number of retries for all `Fetcher` methods is 3.
 
+> Hence: All headers generated by `stealthy_headers` argument can be overwritten by you through the `headers` argument
+
 You can route all traffic (HTTP and HTTPS) to a proxy for any of these methods in this format `http://username:password@localhost:8030`
 ```python
 >> page = Fetcher().get('https://httpbin.org/get', stealthy_headers=True, follow_redirects=True)
{scrapling-0.2.94 → scrapling-0.2.95}/README.md
@@ -220,6 +220,8 @@ This class is built on top of [httpx](https://www.python-httpx.org/) with additi
 
 For all methods, you have `stealthy_headers` which makes `Fetcher` create and use real browser's headers then create a referer header as if this request came from Google's search of this URL's domain. It's enabled by default. You can also set the number of retries with the argument `retries` for all methods and this will make httpx retry requests if it failed for any reason. The default number of retries for all `Fetcher` methods is 3.
 
+> Hence: All headers generated by `stealthy_headers` argument can be overwritten by you through the `headers` argument
+
 You can route all traffic (HTTP and HTTPS) to a proxy for any of these methods in this format `http://username:password@localhost:8030`
 ```python
 >> page = Fetcher().get('https://httpbin.org/get', stealthy_headers=True, follow_redirects=True)
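In practice, the note added to the README means explicit headers win over the generated stealth headers. A minimal sketch of that usage, assuming the `Fetcher.get` signature shown in the README excerpt above (the custom `User-Agent` string is purely illustrative):

```python
from scrapling import Fetcher

# `stealthy_headers` is enabled by default; per the note added in 0.2.95, any
# header supplied explicitly here is kept instead of the generated one.
page = Fetcher().get(
    'https://httpbin.org/get',
    stealthy_headers=True,
    follow_redirects=True,
    headers={'User-Agent': 'my-custom-agent/1.0'},  # illustrative value, not overwritten
    retries=5,  # bumps the default of 3 retries mentioned above
)
print(page.status)
```

Casing doesn't matter for the override check: the `static.py` change diffed below lower-cases all header keys before comparing.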
{scrapling-0.2.94 → scrapling-0.2.95}/scrapling/__init__.py
@@ -5,7 +5,7 @@ from scrapling.fetchers import (AsyncFetcher, CustomFetcher, Fetcher,
 from scrapling.parser import Adaptor, Adaptors
 
 __author__ = "Karim Shoair (karim.shoair@pm.me)"
-__version__ = "0.2.94"
+__version__ = "0.2.95"
 __copyright__ = "Copyright (c) 2024 Karim Shoair"
 
 
{scrapling-0.2.94 → scrapling-0.2.95}/scrapling/engines/static.py
@@ -42,16 +42,19 @@ class StaticEngine:
         :return: A dictionary of the new headers.
         """
         headers = headers or {}
-
-        # Validate headers
-        if not headers.get('user-agent') and not headers.get('User-Agent'):
-            headers['User-Agent'] = generate_headers(browser_mode=False).get('User-Agent')
-            log.debug(f"Can't find useragent in headers so '{headers['User-Agent']}' was used.")
+        headers_keys = set(map(str.lower, headers.keys()))
 
         if self.stealth:
             extra_headers = generate_headers(browser_mode=False)
+            # Don't overwrite user supplied headers
+            extra_headers = {key: value for key, value in extra_headers.items() if key.lower() not in headers_keys}
             headers.update(extra_headers)
-            headers.update({'referer': generate_convincing_referer(self.url)})
+            if 'referer' not in headers_keys:
+                headers.update({'referer': generate_convincing_referer(self.url)})
+
+        elif 'user-agent' not in headers_keys:
+            headers['User-Agent'] = generate_headers(browser_mode=False).get('User-Agent')
+            log.debug(f"Can't find useragent in headers so '{headers['User-Agent']}' was used.")
 
         return headers
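The refactor above boils down to a case-insensitive "user headers win" merge. A self-contained sketch of that behaviour, with `generate_headers` and `generate_convincing_referer` stubbed out since the real ones live in scrapling's toolbelt:

```python
# Standalone sketch of the 0.2.95 merge logic; both generators are stubs.
def generate_headers(browser_mode=False):
    # Stand-in for scrapling's real browser-fingerprint header generator.
    return {'User-Agent': 'Mozilla/5.0 (stub)', 'Accept': '*/*'}

def generate_convincing_referer(url):
    # Stand-in: the real helper builds a Google-search referer for url's domain.
    return 'https://www.google.com/search?q=example'

def headers_job(headers, url, stealth):
    headers = headers or {}
    # Lower-cased key set makes every check case-insensitive, so a supplied
    # 'user-agent' also blocks the generated 'User-Agent'.
    headers_keys = set(map(str.lower, headers.keys()))

    if stealth:
        extra_headers = generate_headers(browser_mode=False)
        # Drop any generated header the caller already supplied (any casing).
        extra_headers = {k: v for k, v in extra_headers.items() if k.lower() not in headers_keys}
        headers.update(extra_headers)
        if 'referer' not in headers_keys:
            headers['referer'] = generate_convincing_referer(url)
    elif 'user-agent' not in headers_keys:
        headers['User-Agent'] = generate_headers(browser_mode=False)['User-Agent']
    return headers

# The caller's user-agent survives; the convincing referer is still added.
print(headers_job({'user-agent': 'custom/1.0'}, 'https://example.com', stealth=True))
```

Note the behavioural change versus 0.2.94: previously the generated referer unconditionally replaced any user-supplied one, and the user-agent fallback ran even in stealth mode.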
 
{scrapling-0.2.94 → scrapling-0.2.95/scrapling.egg-info}/PKG-INFO
@@ -1,7 +1,7 @@
 Metadata-Version: 2.2
 Name: scrapling
-Version: 0.2.94
-Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
+Version: 0.2.95
+Summary: Scrapling is an undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy again! In an internet filled with complications,
 Home-page: https://github.com/D4Vinci/Scrapling
 Author: Karim Shoair
 Author-email: karim.shoair@pm.me
@@ -275,6 +275,8 @@ This class is built on top of [httpx](https://www.python-httpx.org/) with additi
 
 For all methods, you have `stealthy_headers` which makes `Fetcher` create and use real browser's headers then create a referer header as if this request came from Google's search of this URL's domain. It's enabled by default. You can also set the number of retries with the argument `retries` for all methods and this will make httpx retry requests if it failed for any reason. The default number of retries for all `Fetcher` methods is 3.
 
+> Hence: All headers generated by `stealthy_headers` argument can be overwritten by you through the `headers` argument
+
 You can route all traffic (HTTP and HTTPS) to a proxy for any of these methods in this format `http://username:password@localhost:8030`
 ```python
 >> page = Fetcher().get('https://httpbin.org/get', stealthy_headers=True, follow_redirects=True)
{scrapling-0.2.94 → scrapling-0.2.95}/setup.cfg
@@ -1,9 +1,9 @@
 [metadata]
 name = scrapling
-version = 0.2.94
+version = 0.2.95
 author = Karim Shoair
 author_email = karim.shoair@pm.me
-description = Scrapling is an undetectable, powerful, flexible, adaptive, and high-performance web scraping library for Python.
+description = Scrapling is an undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy again!
 license = BSD
 home_page = https://github.com/D4Vinci/Scrapling
 
{scrapling-0.2.94 → scrapling-0.2.95}/setup.py
@@ -6,10 +6,9 @@ with open("README.md", "r", encoding="utf-8") as fh:
 
 setup(
     name="scrapling",
-    version="0.2.94",
-    description="""Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
-    simplifies the process of extracting data from websites, even when they undergo structural changes, and offers
-    impressive speed improvements over many popular scraping tools.""",
+    version="0.2.95",
+    description="""Scrapling is an undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy again! In an internet filled with complications,
+    it simplifies web scraping, even when websites' design changes, while providing impressive speed that surpasses almost all alternatives.""",
     long_description=long_description,
     long_description_content_type="text/markdown",
     author="Karim Shoair",