scrapling 0.2.94__tar.gz → 0.2.95__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {scrapling-0.2.94/scrapling.egg-info → scrapling-0.2.95}/PKG-INFO +4 -2
- {scrapling-0.2.94 → scrapling-0.2.95}/README.md +2 -0
- {scrapling-0.2.94 → scrapling-0.2.95}/scrapling/__init__.py +1 -1
- {scrapling-0.2.94 → scrapling-0.2.95}/scrapling/engines/static.py +9 -6
- {scrapling-0.2.94 → scrapling-0.2.95/scrapling.egg-info}/PKG-INFO +4 -2
- {scrapling-0.2.94 → scrapling-0.2.95}/setup.cfg +2 -2
- {scrapling-0.2.94 → scrapling-0.2.95}/setup.py +3 -4
- {scrapling-0.2.94 → scrapling-0.2.95}/LICENSE +0 -0
- {scrapling-0.2.94 → scrapling-0.2.95}/MANIFEST.in +0 -0
- {scrapling-0.2.94 → scrapling-0.2.95}/scrapling/cli.py +0 -0
- {scrapling-0.2.94 → scrapling-0.2.95}/scrapling/core/__init__.py +0 -0
- {scrapling-0.2.94 → scrapling-0.2.95}/scrapling/core/_types.py +0 -0
- {scrapling-0.2.94 → scrapling-0.2.95}/scrapling/core/custom_types.py +0 -0
- {scrapling-0.2.94 → scrapling-0.2.95}/scrapling/core/mixins.py +0 -0
- {scrapling-0.2.94 → scrapling-0.2.95}/scrapling/core/storage_adaptors.py +0 -0
- {scrapling-0.2.94 → scrapling-0.2.95}/scrapling/core/translator.py +0 -0
- {scrapling-0.2.94 → scrapling-0.2.95}/scrapling/core/utils.py +0 -0
- {scrapling-0.2.94 → scrapling-0.2.95}/scrapling/defaults.py +0 -0
- {scrapling-0.2.94 → scrapling-0.2.95}/scrapling/engines/__init__.py +0 -0
- {scrapling-0.2.94 → scrapling-0.2.95}/scrapling/engines/camo.py +0 -0
- {scrapling-0.2.94 → scrapling-0.2.95}/scrapling/engines/constants.py +0 -0
- {scrapling-0.2.94 → scrapling-0.2.95}/scrapling/engines/pw.py +0 -0
- {scrapling-0.2.94 → scrapling-0.2.95}/scrapling/engines/toolbelt/__init__.py +0 -0
- {scrapling-0.2.94 → scrapling-0.2.95}/scrapling/engines/toolbelt/bypasses/navigator_plugins.js +0 -0
- {scrapling-0.2.94 → scrapling-0.2.95}/scrapling/engines/toolbelt/bypasses/notification_permission.js +0 -0
- {scrapling-0.2.94 → scrapling-0.2.95}/scrapling/engines/toolbelt/bypasses/pdf_viewer.js +0 -0
- {scrapling-0.2.94 → scrapling-0.2.95}/scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +0 -0
- {scrapling-0.2.94 → scrapling-0.2.95}/scrapling/engines/toolbelt/bypasses/screen_props.js +0 -0
- {scrapling-0.2.94 → scrapling-0.2.95}/scrapling/engines/toolbelt/bypasses/webdriver_fully.js +0 -0
- {scrapling-0.2.94 → scrapling-0.2.95}/scrapling/engines/toolbelt/bypasses/window_chrome.js +0 -0
- {scrapling-0.2.94 → scrapling-0.2.95}/scrapling/engines/toolbelt/custom.py +0 -0
- {scrapling-0.2.94 → scrapling-0.2.95}/scrapling/engines/toolbelt/fingerprints.py +0 -0
- {scrapling-0.2.94 → scrapling-0.2.95}/scrapling/engines/toolbelt/navigation.py +0 -0
- {scrapling-0.2.94 → scrapling-0.2.95}/scrapling/fetchers.py +0 -0
- {scrapling-0.2.94 → scrapling-0.2.95}/scrapling/parser.py +0 -0
- {scrapling-0.2.94 → scrapling-0.2.95}/scrapling/py.typed +0 -0
- {scrapling-0.2.94 → scrapling-0.2.95}/scrapling.egg-info/SOURCES.txt +0 -0
- {scrapling-0.2.94 → scrapling-0.2.95}/scrapling.egg-info/dependency_links.txt +0 -0
- {scrapling-0.2.94 → scrapling-0.2.95}/scrapling.egg-info/entry_points.txt +0 -0
- {scrapling-0.2.94 → scrapling-0.2.95}/scrapling.egg-info/not-zip-safe +0 -0
- {scrapling-0.2.94 → scrapling-0.2.95}/scrapling.egg-info/requires.txt +0 -0
- {scrapling-0.2.94 → scrapling-0.2.95}/scrapling.egg-info/top_level.txt +0 -0
- {scrapling-0.2.94 → scrapling-0.2.95}/tests/__init__.py +0 -0
- {scrapling-0.2.94 → scrapling-0.2.95}/tests/fetchers/__init__.py +0 -0
- {scrapling-0.2.94 → scrapling-0.2.95}/tests/fetchers/async/__init__.py +0 -0
- {scrapling-0.2.94 → scrapling-0.2.95}/tests/fetchers/async/test_camoufox.py +0 -0
- {scrapling-0.2.94 → scrapling-0.2.95}/tests/fetchers/async/test_httpx.py +0 -0
- {scrapling-0.2.94 → scrapling-0.2.95}/tests/fetchers/async/test_playwright.py +0 -0
- {scrapling-0.2.94 → scrapling-0.2.95}/tests/fetchers/sync/__init__.py +0 -0
- {scrapling-0.2.94 → scrapling-0.2.95}/tests/fetchers/sync/test_camoufox.py +0 -0
- {scrapling-0.2.94 → scrapling-0.2.95}/tests/fetchers/sync/test_httpx.py +0 -0
- {scrapling-0.2.94 → scrapling-0.2.95}/tests/fetchers/sync/test_playwright.py +0 -0
- {scrapling-0.2.94 → scrapling-0.2.95}/tests/fetchers/test_utils.py +0 -0
- {scrapling-0.2.94 → scrapling-0.2.95}/tests/parser/__init__.py +0 -0
- {scrapling-0.2.94 → scrapling-0.2.95}/tests/parser/test_automatch.py +0 -0
- {scrapling-0.2.94 → scrapling-0.2.95}/tests/parser/test_general.py +0 -0
@@ -1,7 +1,7 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: scrapling
|
3
|
-
Version: 0.2.94
|
4
|
-
Summary: Scrapling is
|
3
|
+
Version: 0.2.95
|
4
|
+
Summary: Scrapling is an undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy again! In an internet filled with complications,
|
5
5
|
Home-page: https://github.com/D4Vinci/Scrapling
|
6
6
|
Author: Karim Shoair
|
7
7
|
Author-email: karim.shoair@pm.me
|
@@ -275,6 +275,8 @@ This class is built on top of [httpx](https://www.python-httpx.org/) with additi
|
|
275
275
|
|
276
276
|
For all methods, you have `stealthy_headers` which makes `Fetcher` create and use real browser's headers then create a referer header as if this request came from Google's search of this URL's domain. It's enabled by default. You can also set the number of retries with the argument `retries` for all methods and this will make httpx retry requests if it failed for any reason. The default number of retries for all `Fetcher` methods is 3.
|
277
277
|
|
278
|
+
> Hence: All headers generated by `stealthy_headers` argument can be overwritten by you through the `headers` argument
|
279
|
+
|
278
280
|
You can route all traffic (HTTP and HTTPS) to a proxy for any of these methods in this format `http://username:password@localhost:8030`
|
279
281
|
```python
|
280
282
|
>> page = Fetcher().get('https://httpbin.org/get', stealthy_headers=True, follow_redirects=True)
|
@@ -220,6 +220,8 @@ This class is built on top of [httpx](https://www.python-httpx.org/) with additi
|
|
220
220
|
|
221
221
|
For all methods, you have `stealthy_headers` which makes `Fetcher` create and use real browser's headers then create a referer header as if this request came from Google's search of this URL's domain. It's enabled by default. You can also set the number of retries with the argument `retries` for all methods and this will make httpx retry requests if it failed for any reason. The default number of retries for all `Fetcher` methods is 3.
|
222
222
|
|
223
|
+
> Hence: All headers generated by `stealthy_headers` argument can be overwritten by you through the `headers` argument
|
224
|
+
|
223
225
|
You can route all traffic (HTTP and HTTPS) to a proxy for any of these methods in this format `http://username:password@localhost:8030`
|
224
226
|
```python
|
225
227
|
>> page = Fetcher().get('https://httpbin.org/get', stealthy_headers=True, follow_redirects=True)
|
@@ -5,7 +5,7 @@ from scrapling.fetchers import (AsyncFetcher, CustomFetcher, Fetcher,
|
|
5
5
|
from scrapling.parser import Adaptor, Adaptors
|
6
6
|
|
7
7
|
__author__ = "Karim Shoair (karim.shoair@pm.me)"
|
8
|
-
__version__ = "0.2.94"
|
8
|
+
__version__ = "0.2.95"
|
9
9
|
__copyright__ = "Copyright (c) 2024 Karim Shoair"
|
10
10
|
|
11
11
|
|
@@ -42,16 +42,19 @@ class StaticEngine:
|
|
42
42
|
:return: A dictionary of the new headers.
|
43
43
|
"""
|
44
44
|
headers = headers or {}
|
45
|
-
|
46
|
-
# Validate headers
|
47
|
-
if not headers.get('user-agent') and not headers.get('User-Agent'):
|
48
|
-
headers['User-Agent'] = generate_headers(browser_mode=False).get('User-Agent')
|
49
|
-
log.debug(f"Can't find useragent in headers so '{headers['User-Agent']}' was used.")
|
45
|
+
headers_keys = set(map(str.lower, headers.keys()))
|
50
46
|
|
51
47
|
if self.stealth:
|
52
48
|
extra_headers = generate_headers(browser_mode=False)
|
49
|
+
# Don't overwrite user supplied headers
|
50
|
+
extra_headers = {key: value for key, value in extra_headers.items() if key.lower() not in headers_keys}
|
53
51
|
headers.update(extra_headers)
|
54
|
-
|
52
|
+
if 'referer' not in headers_keys:
|
53
|
+
headers.update({'referer': generate_convincing_referer(self.url)})
|
54
|
+
|
55
|
+
elif 'user-agent' not in headers_keys:
|
56
|
+
headers['User-Agent'] = generate_headers(browser_mode=False).get('User-Agent')
|
57
|
+
log.debug(f"Can't find useragent in headers so '{headers['User-Agent']}' was used.")
|
55
58
|
|
56
59
|
return headers
|
57
60
|
|
@@ -1,7 +1,7 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: scrapling
|
3
|
-
Version: 0.2.94
|
4
|
-
Summary: Scrapling is
|
3
|
+
Version: 0.2.95
|
4
|
+
Summary: Scrapling is an undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy again! In an internet filled with complications,
|
5
5
|
Home-page: https://github.com/D4Vinci/Scrapling
|
6
6
|
Author: Karim Shoair
|
7
7
|
Author-email: karim.shoair@pm.me
|
@@ -275,6 +275,8 @@ This class is built on top of [httpx](https://www.python-httpx.org/) with additi
|
|
275
275
|
|
276
276
|
For all methods, you have `stealthy_headers` which makes `Fetcher` create and use real browser's headers then create a referer header as if this request came from Google's search of this URL's domain. It's enabled by default. You can also set the number of retries with the argument `retries` for all methods and this will make httpx retry requests if it failed for any reason. The default number of retries for all `Fetcher` methods is 3.
|
277
277
|
|
278
|
+
> Hence: All headers generated by `stealthy_headers` argument can be overwritten by you through the `headers` argument
|
279
|
+
|
278
280
|
You can route all traffic (HTTP and HTTPS) to a proxy for any of these methods in this format `http://username:password@localhost:8030`
|
279
281
|
```python
|
280
282
|
>> page = Fetcher().get('https://httpbin.org/get', stealthy_headers=True, follow_redirects=True)
|
@@ -1,9 +1,9 @@
|
|
1
1
|
[metadata]
|
2
2
|
name = scrapling
|
3
|
-
version = 0.2.94
|
3
|
+
version = 0.2.95
|
4
4
|
author = Karim Shoair
|
5
5
|
author_email = karim.shoair@pm.me
|
6
|
-
description = Scrapling is an undetectable, powerful, flexible,
|
6
|
+
description = Scrapling is an undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy again!
|
7
7
|
license = BSD
|
8
8
|
home_page = https://github.com/D4Vinci/Scrapling
|
9
9
|
|
@@ -6,10 +6,9 @@ with open("README.md", "r", encoding="utf-8") as fh:
|
|
6
6
|
|
7
7
|
setup(
|
8
8
|
name="scrapling",
|
9
|
-
version="0.2.94",
|
10
|
-
description="""Scrapling is
|
11
|
-
|
12
|
-
impressive speed improvements over many popular scraping tools.""",
|
9
|
+
version="0.2.95",
|
10
|
+
description="""Scrapling is an undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy again! In an internet filled with complications,
|
11
|
+
it simplifies web scraping, even when websites' design changes, while providing impressive speed that surpasses almost all alternatives.""",
|
13
12
|
long_description=long_description,
|
14
13
|
long_description_content_type="text/markdown",
|
15
14
|
author="Karim Shoair",
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{scrapling-0.2.94 → scrapling-0.2.95}/scrapling/engines/toolbelt/bypasses/navigator_plugins.js
RENAMED
File without changes
|
{scrapling-0.2.94 → scrapling-0.2.95}/scrapling/engines/toolbelt/bypasses/notification_permission.js
RENAMED
File without changes
|
File without changes
|
{scrapling-0.2.94 → scrapling-0.2.95}/scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js
RENAMED
File without changes
|
File without changes
|
{scrapling-0.2.94 → scrapling-0.2.95}/scrapling/engines/toolbelt/bypasses/webdriver_fully.js
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|