scrapling 0.2.4.tar.gz → 0.2.5.tar.gz
- {scrapling-0.2.4/scrapling.egg-info → scrapling-0.2.5}/PKG-INFO +1 -1
- {scrapling-0.2.4 → scrapling-0.2.5}/scrapling/__init__.py +1 -1
- {scrapling-0.2.4 → scrapling-0.2.5}/scrapling/core/custom_types.py +2 -3
- {scrapling-0.2.4 → scrapling-0.2.5}/scrapling/core/translator.py +6 -4
- {scrapling-0.2.4 → scrapling-0.2.5}/scrapling/engines/camo.py +3 -6
- {scrapling-0.2.4 → scrapling-0.2.5}/scrapling/engines/pw.py +3 -6
- {scrapling-0.2.4 → scrapling-0.2.5}/scrapling/engines/static.py +5 -1
- {scrapling-0.2.4 → scrapling-0.2.5}/scrapling/engines/toolbelt/custom.py +69 -1
- {scrapling-0.2.4 → scrapling-0.2.5}/scrapling/fetchers.py +15 -7
- {scrapling-0.2.4 → scrapling-0.2.5/scrapling.egg-info}/PKG-INFO +1 -1
- {scrapling-0.2.4 → scrapling-0.2.5}/scrapling.egg-info/SOURCES.txt +1 -0
- {scrapling-0.2.4 → scrapling-0.2.5}/setup.cfg +1 -1
- {scrapling-0.2.4 → scrapling-0.2.5}/setup.py +1 -1
- {scrapling-0.2.4 → scrapling-0.2.5}/tests/fetchers/test_camoufox.py +2 -0
- {scrapling-0.2.4 → scrapling-0.2.5}/tests/fetchers/test_playwright.py +2 -0
- scrapling-0.2.5/tests/fetchers/test_utils.py +129 -0
- {scrapling-0.2.4 → scrapling-0.2.5}/tests/parser/test_general.py +1 -1
- {scrapling-0.2.4 → scrapling-0.2.5}/LICENSE +0 -0
- {scrapling-0.2.4 → scrapling-0.2.5}/MANIFEST.in +0 -0
- {scrapling-0.2.4 → scrapling-0.2.5}/README.md +0 -0
- {scrapling-0.2.4 → scrapling-0.2.5}/scrapling/core/__init__.py +0 -0
- {scrapling-0.2.4 → scrapling-0.2.5}/scrapling/core/_types.py +0 -0
- {scrapling-0.2.4 → scrapling-0.2.5}/scrapling/core/mixins.py +0 -0
- {scrapling-0.2.4 → scrapling-0.2.5}/scrapling/core/storage_adaptors.py +0 -0
- {scrapling-0.2.4 → scrapling-0.2.5}/scrapling/core/utils.py +0 -0
- {scrapling-0.2.4 → scrapling-0.2.5}/scrapling/defaults.py +0 -0
- {scrapling-0.2.4 → scrapling-0.2.5}/scrapling/engines/__init__.py +0 -0
- {scrapling-0.2.4 → scrapling-0.2.5}/scrapling/engines/constants.py +0 -0
- {scrapling-0.2.4 → scrapling-0.2.5}/scrapling/engines/toolbelt/__init__.py +0 -0
- {scrapling-0.2.4 → scrapling-0.2.5}/scrapling/engines/toolbelt/bypasses/navigator_plugins.js +0 -0
- {scrapling-0.2.4 → scrapling-0.2.5}/scrapling/engines/toolbelt/bypasses/notification_permission.js +0 -0
- {scrapling-0.2.4 → scrapling-0.2.5}/scrapling/engines/toolbelt/bypasses/pdf_viewer.js +0 -0
- {scrapling-0.2.4 → scrapling-0.2.5}/scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +0 -0
- {scrapling-0.2.4 → scrapling-0.2.5}/scrapling/engines/toolbelt/bypasses/screen_props.js +0 -0
- {scrapling-0.2.4 → scrapling-0.2.5}/scrapling/engines/toolbelt/bypasses/webdriver_fully.js +0 -0
- {scrapling-0.2.4 → scrapling-0.2.5}/scrapling/engines/toolbelt/bypasses/window_chrome.js +0 -0
- {scrapling-0.2.4 → scrapling-0.2.5}/scrapling/engines/toolbelt/fingerprints.py +0 -0
- {scrapling-0.2.4 → scrapling-0.2.5}/scrapling/engines/toolbelt/navigation.py +0 -0
- {scrapling-0.2.4 → scrapling-0.2.5}/scrapling/parser.py +0 -0
- {scrapling-0.2.4 → scrapling-0.2.5}/scrapling/py.typed +0 -0
- {scrapling-0.2.4 → scrapling-0.2.5}/scrapling.egg-info/dependency_links.txt +0 -0
- {scrapling-0.2.4 → scrapling-0.2.5}/scrapling.egg-info/not-zip-safe +0 -0
- {scrapling-0.2.4 → scrapling-0.2.5}/scrapling.egg-info/requires.txt +0 -0
- {scrapling-0.2.4 → scrapling-0.2.5}/scrapling.egg-info/top_level.txt +0 -0
- {scrapling-0.2.4 → scrapling-0.2.5}/tests/__init__.py +0 -0
- {scrapling-0.2.4 → scrapling-0.2.5}/tests/fetchers/__init__.py +0 -0
- {scrapling-0.2.4 → scrapling-0.2.5}/tests/fetchers/test_httpx.py +0 -0
- {scrapling-0.2.4 → scrapling-0.2.5}/tests/parser/__init__.py +0 -0
- {scrapling-0.2.4 → scrapling-0.2.5}/tests/parser/test_automatch.py +0 -0
{scrapling-0.2.4 → scrapling-0.2.5}/scrapling/__init__.py
@@ -4,7 +4,7 @@ from scrapling.parser import Adaptor, Adaptors
 from scrapling.core.custom_types import TextHandler, AttributesHandler

 __author__ = "Karim Shoair (karim.shoair@pm.me)"
-__version__ = "0.2.4"
+__version__ = "0.2.5"
 __copyright__ = "Copyright (c) 2024 Karim Shoair"


{scrapling-0.2.4 → scrapling-0.2.5}/scrapling/core/custom_types.py
@@ -129,9 +129,8 @@ class TextHandlers(List[TextHandler]):


 class AttributesHandler(Mapping):
-    """A read-only mapping to use instead of the standard dictionary for the speed boost but
-    at the same time I use it to add more functionalities.
-    If standard dictionary is needed, just convert this class to dictionary with `dict` function
+    """A read-only mapping to use instead of the standard dictionary for the speed boost but at the same time I use it to add more functionalities.
+    If standard dictionary is needed, just convert this class to dictionary with `dict` function
     """
     __slots__ = ('_data',)

{scrapling-0.2.4 → scrapling-0.2.5}/scrapling/core/translator.py
@@ -1,9 +1,11 @@
 """
 Most of this file is adapted version of the translator of parsel library with some modifications simply for 1 important reason...
-
-which will be important in future releases but most importantly...
-
-
+
+To add pseudo-elements ``::text`` and ``::attr(ATTR_NAME)`` so we match Parsel/Scrapy selectors format which will be important in future releases but most importantly...
+
+So you don't have to learn a new selectors/api method like what bs4 done with soupsieve :)
+
+if you want to learn about this, head to https://cssselect.readthedocs.io/en/latest/#cssselect.FunctionalPseudoElement
 """

 import re
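These pseudo-elements behave like Parsel's, so Scrapy-style selectors carry over directly. A minimal sketch of what the translator enables (the sample HTML is made up for illustration):

from scrapling.parser import Adaptor

page = Adaptor(text='<html><body><a href="https://example.com">Click me</a></body></html>')
print(page.css('a::text'))        # the anchor's text content, Parsel-style
print(page.css('a::attr(href)'))  # the anchor's href attribute value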
{scrapling-0.2.4 → scrapling-0.2.5}/scrapling/engines/camo.py
@@ -104,13 +104,10 @@ class CamoufoxEngine:

             if self.wait_selector and type(self.wait_selector) is str:
                 waiter = page.locator(self.wait_selector)
-                waiter.wait_for(state=self.wait_selector_state)
+                waiter.first.wait_for(state=self.wait_selector_state)

-            content_type = res.headers.get('content-type', '')
-
-            encoding = 'utf-8'  # default encoding
-            if 'charset=' in content_type.lower():
-                encoding = content_type.lower().split('charset=')[-1].split(';')[0].strip()
+            # This will be parsed inside `Response`
+            encoding = res.headers.get('content-type', '') or 'utf-8'  # default encoding

             status_text = res.status_text
             # PlayWright API sometimes give empty status text for some reason!
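The switch to `waiter.first` is presumably about Playwright's strict mode: calling `locator.wait_for()` on a selector that resolves to more than one element raises a strict-mode violation, while `.first` pins the wait to the first match. A standalone sketch, assuming Playwright is installed (the URL and selector are illustrative, not from the library):

from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch()
    page = browser.new_page()
    page.goto('https://example.com')
    waiter = page.locator('p')  # a broad selector like this can match several nodes
    waiter.first.wait_for(state='attached')  # waits on the first match only, no strict-mode error
    browser.close()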
{scrapling-0.2.4 → scrapling-0.2.5}/scrapling/engines/pw.py
@@ -214,13 +214,10 @@ class PlaywrightEngine:

         if self.wait_selector and type(self.wait_selector) is str:
             waiter = page.locator(self.wait_selector)
-            waiter.wait_for(state=self.wait_selector_state)
+            waiter.first.wait_for(state=self.wait_selector_state)

-        content_type = res.headers.get('content-type', '')
-
-        encoding = 'utf-8'  # default encoding
-        if 'charset=' in content_type.lower():
-            encoding = content_type.lower().split('charset=')[-1].split(';')[0].strip()
+        # This will be parsed inside `Response`
+        encoding = res.headers.get('content-type', '') or 'utf-8'  # default encoding

         status_text = res.status_text
         # PlayWright API sometimes give empty status text for some reason!
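Handing the raw header to `Response` instead of splitting on `'charset='` also fixes a real parsing edge case: a quoted charset keeps its quotes under the naive split, which would break a later codecs lookup. A quick illustration in plain Python (the header value is made up):

content_type = 'text/html; charset="UTF-8"'
naive = content_type.lower().split('charset=')[-1].split(';')[0].strip()
print(naive)  # prints '"utf-8"' -- the quotes survive, so codecs.lookup(naive) raises LookupError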
{scrapling-0.2.4 → scrapling-0.2.5}/scrapling/engines/static.py
@@ -23,7 +23,7 @@ class StaticEngine:
     @staticmethod
     def _headers_job(headers: Optional[Dict], url: str, stealth: bool) -> Dict:
         """Adds useragent to headers if it doesn't exist, generates real headers and append it to current headers, and
-
+        finally generates a referer header that looks like if this request came from Google's search of the current URL's domain.

         :param headers: Current headers in the request if the user passed any
         :param url: The Target URL.
@@ -65,6 +65,7 @@ class StaticEngine:

     def get(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
         """Make basic HTTP GET request for you but with some added flavors.
+
         :param url: Target url.
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
             create a referer header as if this request had came from Google's search of this URL's domain.
@@ -77,6 +78,7 @@ class StaticEngine:

     def post(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
         """Make basic HTTP POST request for you but with some added flavors.
+
         :param url: Target url.
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
             create a referer header as if this request had came from Google's search of this URL's domain.
@@ -89,6 +91,7 @@ class StaticEngine:

     def delete(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
         """Make basic HTTP DELETE request for you but with some added flavors.
+
         :param url: Target url.
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
             create a referer header as if this request had came from Google's search of this URL's domain.
@@ -101,6 +104,7 @@ class StaticEngine:

     def put(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
         """Make basic HTTP PUT request for you but with some added flavors.
+
         :param url: Target url.
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
             create a referer header as if this request had came from Google's search of this URL's domain.
{scrapling-0.2.4 → scrapling-0.2.5}/scrapling/engines/toolbelt/custom.py
@@ -3,11 +3,78 @@ Functions related to custom types or type checking
 """
 import inspect
 import logging
+from email.message import Message

 from scrapling.core.custom_types import MappingProxyType
 from scrapling.parser import Adaptor, SQLiteStorageSystem
 from scrapling.core.utils import setup_basic_logging, cache
-from scrapling.core._types import Any, List, Type, Union, Optional, Dict, Callable
+from scrapling.core._types import Any, List, Type, Union, Optional, Dict, Callable, Tuple
+
+
+class ResponseEncoding:
+    __DEFAULT_ENCODING = "utf-8"
+    __ISO_8859_1_CONTENT_TYPES = {"text/plain", "text/html", "text/css", "text/javascript"}
+
+    @classmethod
+    @cache(maxsize=None)
+    def __parse_content_type(cls, header_value: str) -> Tuple[str, Dict[str, str]]:
+        """Parse content type and parameters from a content-type header value.
+
+        Uses `email.message.Message` for robust header parsing according to RFC 2045.
+
+        :param header_value: Raw content-type header string
+        :return: Tuple of (content_type, parameters_dict)
+        """
+        # Create a Message object and set the Content-Type header then get the content type and parameters
+        msg = Message()
+        msg['content-type'] = header_value
+
+        content_type = msg.get_content_type()
+        params = dict(msg.get_params(failobj=[]))
+
+        # Remove the content-type from params if present somehow
+        params.pop('content-type', None)
+
+        return content_type, params
+
+    @classmethod
+    @cache(maxsize=None)
+    def get_value(cls, content_type: Optional[str]) -> str:
+        """Determine the appropriate character encoding from a content-type header.
+
+        The encoding is determined by these rules in order:
+        1. If no content-type is provided, use UTF-8
+        2. If charset parameter is present, use that encoding
+        3. If content-type is `text/*`, use ISO-8859-1 per HTTP/1.1 spec
+        4. If content-type is application/json, use UTF-8 per RFC 4627
+        5. Default to UTF-8 if nothing else matches
+
+        :param content_type: Content-Type header value or None
+        :return: String naming the character encoding
+        """
+        if not content_type:
+            return cls.__DEFAULT_ENCODING
+
+        try:
+            content_type, params = cls.__parse_content_type(content_type)
+
+            # First check for explicit charset parameter
+            if "charset" in params:
+                encoding = params["charset"].strip("'\"")
+                "test".encode(encoding)  # Validate encoding
+                return encoding
+
+            # Apply content-type specific rules
+            if content_type in cls.__ISO_8859_1_CONTENT_TYPES:
+                return "ISO-8859-1"
+
+            if content_type == "application/json":
+                return cls.__DEFAULT_ENCODING
+
+            return cls.__DEFAULT_ENCODING
+
+        except (ValueError, LookupError, UnicodeEncodeError):
+            return cls.__DEFAULT_ENCODING


 class Response(Adaptor):
@@ -20,6 +87,7 @@ class Response(Adaptor):
         self.cookies = cookies
         self.headers = headers
         self.request_headers = request_headers
+        encoding = ResponseEncoding.get_value(encoding)
         super().__init__(text=text, body=body, url=automatch_domain or url, encoding=encoding, **adaptor_arguments)
         # For back-ward compatibility
         self.adaptor = self
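With these two hunks combined, `Response` now receives the raw Content-Type header as its `encoding` argument and resolves it through `ResponseEncoding.get_value()`. A quick sketch of the resolution rules; the expected values mirror the new test suite below:

from scrapling.engines.toolbelt.custom import ResponseEncoding

print(ResponseEncoding.get_value('text/html; charset="UTF-8"'))  # 'UTF-8' -- explicit charset wins
print(ResponseEncoding.get_value('text/html'))                   # 'ISO-8859-1' -- HTTP/1.1 default for these text/* types
print(ResponseEncoding.get_value('application/json'))            # 'utf-8' -- per RFC 4627
print(ResponseEncoding.get_value(None))                          # 'utf-8' -- fallback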
{scrapling-0.2.4 → scrapling-0.2.5}/scrapling/fetchers.py
@@ -11,6 +11,7 @@ class Fetcher(BaseFetcher):
     """
     def get(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
         """Make basic HTTP GET request for you but with some added flavors.
+
         :param url: Target url.
         :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
         :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
@@ -24,6 +25,7 @@ class Fetcher(BaseFetcher):

     def post(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
         """Make basic HTTP POST request for you but with some added flavors.
+
         :param url: Target url.
         :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
         :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
@@ -37,12 +39,14 @@ class Fetcher(BaseFetcher):

     def put(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
         """Make basic HTTP PUT request for you but with some added flavors.
+
         :param url: Target url
         :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
         :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
-            create a referer header as if this request had came from Google's search of this URL's domain.
+            create a referer header as if this request came from Google's search of this URL's domain.
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
+
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
         response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).put(url, stealthy_headers, **kwargs)
@@ -50,6 +54,7 @@ class Fetcher(BaseFetcher):

     def delete(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
         """Make basic HTTP DELETE request for you but with some added flavors.
+
         :param url: Target url
         :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
         :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
@@ -77,6 +82,7 @@ class StealthyFetcher(BaseFetcher):
     ) -> Response:
         """
        Opens up a browser and do your request based on your chosen options below.
+
         :param url: Target url.
         :param headless: Run the browser in headless/hidden (default), 'virtual' screen mode, or headful/visible mode.
         :param block_images: Prevent the loading of images through Firefox preferences.
@@ -127,14 +133,15 @@ class PlayWrightFetcher(BaseFetcher):
     Using this Fetcher class, you can do requests with:
         - Vanilla Playwright without any modifications other than the ones you chose.
         - Stealthy Playwright with the stealth mode I wrote for it. It's still a work in progress but it bypasses many online tests like bot.sannysoft.com
-
-
-
-
-
+          Some of the things stealth mode does include:
+            1) Patches the CDP runtime fingerprint.
+            2) Mimics some of the real browsers' properties by injecting several JS files and using custom options.
+            3) Using custom flags on launch to hide Playwright even more and make it faster.
+            4) Generates real browser's headers of the same type and same user OS then append it to the request.
         - Real browsers by passing the CDP URL of your browser to be controlled by the Fetcher and most of the options can be enabled on it.
         - NSTBrowser's docker browserless option by passing the CDP URL and enabling `nstbrowser_mode` option.
-
+
+    > Note that these are the main options with PlayWright but it can be mixed together.
     """
     def fetch(
         self, url: str, headless: Union[bool, str] = True, disable_resources: bool = None,
@@ -147,6 +154,7 @@ class PlayWrightFetcher(BaseFetcher):
             nstbrowser_mode: bool = False, nstbrowser_config: Optional[Dict] = None,
     ) -> Response:
         """Opens up a browser and do your request based on your chosen options below.
+
         :param url: Target url.
         :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
         :param disable_resources: Drop requests of unnecessary resources for speed boost. It depends but it made requests ~25% faster in my tests for some websites.
{scrapling-0.2.4 → scrapling-0.2.5}/setup.py
@@ -6,7 +6,7 @@ with open("README.md", "r", encoding="utf-8") as fh:

 setup(
     name="scrapling",
-    version="0.2.4",
+    version="0.2.5",
     description="""Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
 simplifies the process of extracting data from websites, even when they undergo structural changes, and offers
 impressive speed improvements over many popular scraping tools.""",
{scrapling-0.2.4 → scrapling-0.2.5}/tests/fetchers/test_camoufox.py
@@ -36,6 +36,7 @@ class TestStealthyFetcher(unittest.TestCase):
     def test_waiting_selector(self):
         """Test if waiting for a selector make page does not finish loading or not"""
         self.assertEqual(self.fetcher.fetch(self.html_url, wait_selector='h1').status, 200)
+        self.assertEqual(self.fetcher.fetch(self.html_url, wait_selector='h1', wait_selector_state='visible').status, 200)

     def test_cookies_loading(self):
         """Test if cookies are set after the request"""
@@ -56,6 +57,7 @@ class TestStealthyFetcher(unittest.TestCase):
         self.assertEqual(self.fetcher.fetch(self.html_url, block_webrtc=True, allow_webgl=True).status, 200)
         self.assertEqual(self.fetcher.fetch(self.html_url, block_webrtc=False, allow_webgl=True).status, 200)
         self.assertEqual(self.fetcher.fetch(self.html_url, block_webrtc=True, allow_webgl=False).status, 200)
+        self.assertEqual(self.fetcher.fetch(self.html_url, extra_headers={'ayo': ''}, os_randomize=True).status, 200)

     def test_infinite_timeout(self):
         """Test if infinite timeout breaks the code or not"""
{scrapling-0.2.4 → scrapling-0.2.5}/tests/fetchers/test_playwright.py
@@ -35,6 +35,7 @@ class TestPlayWrightFetcher(unittest.TestCase):
     def test_waiting_selector(self):
         """Test if waiting for a selector make page does not finish loading or not"""
         self.assertEqual(self.fetcher.fetch(self.html_url, wait_selector='h1').status, 200)
+        self.assertEqual(self.fetcher.fetch(self.html_url, wait_selector='h1', wait_selector_state='visible').status, 200)

     def test_cookies_loading(self):
         """Test if cookies are set after the request"""
@@ -56,6 +57,7 @@ class TestPlayWrightFetcher(unittest.TestCase):
         self.assertEqual(self.fetcher.fetch(self.html_url, disable_webgl=False, hide_canvas=True).status, 200)
         self.assertEqual(self.fetcher.fetch(self.html_url, stealth=True).status, 200)
         self.assertEqual(self.fetcher.fetch(self.html_url, useragent='Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:131.0) Gecko/20100101 Firefox/131.0').status, 200)
+        self.assertEqual(self.fetcher.fetch(self.html_url, extra_headers={'ayo': ''}).status, 200)

     def test_cdp_url(self):
         """Test if it's going to try to connect to cdp url or not"""
scrapling-0.2.5/tests/fetchers/test_utils.py (new file)
@@ -0,0 +1,129 @@
+import unittest
+
+from scrapling.engines.toolbelt.custom import ResponseEncoding, StatusText
+
+
+class TestPlayWrightFetcher(unittest.TestCase):
+    def setUp(self):
+        self.content_type_map = {
+            # A map generated by ChatGPT for most possible `content_type` values and the expected outcome
+            'text/html; charset=UTF-8': 'UTF-8',
+            'text/html; charset=ISO-8859-1': 'ISO-8859-1',
+            'text/html': 'ISO-8859-1',
+            'application/json; charset=UTF-8': 'UTF-8',
+            'application/json': 'utf-8',
+            'text/json': 'utf-8',
+            'application/javascript; charset=UTF-8': 'UTF-8',
+            'application/javascript': 'utf-8',
+            'text/plain; charset=UTF-8': 'UTF-8',
+            'text/plain; charset=ISO-8859-1': 'ISO-8859-1',
+            'text/plain': 'ISO-8859-1',
+            'application/xhtml+xml; charset=UTF-8': 'UTF-8',
+            'application/xhtml+xml': 'utf-8',
+            'text/html; charset=windows-1252': 'windows-1252',
+            'application/json; charset=windows-1252': 'windows-1252',
+            'text/plain; charset=windows-1252': 'windows-1252',
+            'text/html; charset="UTF-8"': 'UTF-8',
+            'text/html; charset="ISO-8859-1"': 'ISO-8859-1',
+            'text/html; charset="windows-1252"': 'windows-1252',
+            'application/json; charset="UTF-8"': 'UTF-8',
+            'application/json; charset="ISO-8859-1"': 'ISO-8859-1',
+            'application/json; charset="windows-1252"': 'windows-1252',
+            'text/json; charset="UTF-8"': 'UTF-8',
+            'application/javascript; charset="UTF-8"': 'UTF-8',
+            'application/javascript; charset="ISO-8859-1"': 'ISO-8859-1',
+            'text/plain; charset="UTF-8"': 'UTF-8',
+            'text/plain; charset="ISO-8859-1"': 'ISO-8859-1',
+            'text/plain; charset="windows-1252"': 'windows-1252',
+            'application/xhtml+xml; charset="UTF-8"': 'UTF-8',
+            'application/xhtml+xml; charset="ISO-8859-1"': 'ISO-8859-1',
+            'application/xhtml+xml; charset="windows-1252"': 'windows-1252',
+            'text/html; charset="US-ASCII"': 'US-ASCII',
+            'application/json; charset="US-ASCII"': 'US-ASCII',
+            'text/plain; charset="US-ASCII"': 'US-ASCII',
+            'text/html; charset="Shift_JIS"': 'Shift_JIS',
+            'application/json; charset="Shift_JIS"': 'Shift_JIS',
+            'text/plain; charset="Shift_JIS"': 'Shift_JIS',
+            'application/xml; charset="UTF-8"': 'UTF-8',
+            'application/xml; charset="ISO-8859-1"': 'ISO-8859-1',
+            'application/xml': 'utf-8',
+            'text/xml; charset="UTF-8"': 'UTF-8',
+            'text/xml; charset="ISO-8859-1"': 'ISO-8859-1',
+            'text/xml': 'utf-8'
+        }
+        self.status_map = {
+            100: "Continue",
+            101: "Switching Protocols",
+            102: "Processing",
+            103: "Early Hints",
+            200: "OK",
+            201: "Created",
+            202: "Accepted",
+            203: "Non-Authoritative Information",
+            204: "No Content",
+            205: "Reset Content",
+            206: "Partial Content",
+            207: "Multi-Status",
+            208: "Already Reported",
+            226: "IM Used",
+            300: "Multiple Choices",
+            301: "Moved Permanently",
+            302: "Found",
+            303: "See Other",
+            304: "Not Modified",
+            305: "Use Proxy",
+            307: "Temporary Redirect",
+            308: "Permanent Redirect",
+            400: "Bad Request",
+            401: "Unauthorized",
+            402: "Payment Required",
+            403: "Forbidden",
+            404: "Not Found",
+            405: "Method Not Allowed",
+            406: "Not Acceptable",
+            407: "Proxy Authentication Required",
+            408: "Request Timeout",
+            409: "Conflict",
+            410: "Gone",
+            411: "Length Required",
+            412: "Precondition Failed",
+            413: "Payload Too Large",
+            414: "URI Too Long",
+            415: "Unsupported Media Type",
+            416: "Range Not Satisfiable",
+            417: "Expectation Failed",
+            418: "I'm a teapot",
+            421: "Misdirected Request",
+            422: "Unprocessable Entity",
+            423: "Locked",
+            424: "Failed Dependency",
+            425: "Too Early",
+            426: "Upgrade Required",
+            428: "Precondition Required",
+            429: "Too Many Requests",
+            431: "Request Header Fields Too Large",
+            451: "Unavailable For Legal Reasons",
+            500: "Internal Server Error",
+            501: "Not Implemented",
+            502: "Bad Gateway",
+            503: "Service Unavailable",
+            504: "Gateway Timeout",
+            505: "HTTP Version Not Supported",
+            506: "Variant Also Negotiates",
+            507: "Insufficient Storage",
+            508: "Loop Detected",
+            510: "Not Extended",
+            511: "Network Authentication Required"
+        }
+
+    def test_parsing_content_type(self):
+        """Test if parsing different types of content-type returns the expected result"""
+        for header_value, expected_encoding in self.content_type_map.items():
+            self.assertEqual(ResponseEncoding.get_value(header_value), expected_encoding)
+
+    def test_parsing_response_status(self):
+        """Test if using different http responses' status codes returns the expected result"""
+        for status_code, expected_status_text in self.status_map.items():
+            self.assertEqual(StatusText.get(status_code), expected_status_text)
+
+        self.assertEqual(StatusText.get(1000), "Unknown Status Code")
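To run just this new module locally, a `unittest` discovery snippet like the following should work from the repository root (paths assume the layout above):

import unittest

suite = unittest.defaultTestLoader.discover('tests/fetchers', pattern='test_utils.py')
unittest.TextTestRunner(verbosity=2).run(suite)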
{scrapling-0.2.4 → scrapling-0.2.5}/tests/parser/test_general.py
@@ -278,7 +278,7 @@ class TestParser(unittest.TestCase):
         self.assertEqual(len(elements), 5000)
         # Converting 5000 elements to a class and doing operations on them will take time
         # Based on my tests with 100 runs, 1 loop each Scrapling (given the extra work/features) takes 10.4ms on average
-        self.assertLess(end_time - start_time, 0.1)
+        self.assertLess(end_time - start_time, 0.5)  # Locally I test on 0.1 but on GitHub actions with browsers and threading sometimes closing adds fractions of seconds


 # Use `coverage run -m unittest --verbose tests/test_parser_functions.py` instead for the coverage report