scrapling 0.2.3__py3-none-any.whl → 0.2.5__py3-none-any.whl

scrapling/__init__.py CHANGED
@@ -4,7 +4,7 @@ from scrapling.parser import Adaptor, Adaptors
  from scrapling.core.custom_types import TextHandler, AttributesHandler
 
  __author__ = "Karim Shoair (karim.shoair@pm.me)"
- __version__ = "0.2.3"
+ __version__ = "0.2.5"
  __copyright__ = "Copyright (c) 2024 Karim Shoair"
 
 
@@ -129,9 +129,8 @@ class TextHandlers(List[TextHandler]):
 
 
  class AttributesHandler(Mapping):
- """A read-only mapping to use instead of the standard dictionary for the speed boost but
- at the same time I use it to add more functionalities.
- If standard dictionary is needed, just convert this class to dictionary with `dict` function
+ """A read-only mapping to use instead of the standard dictionary for the speed boost but at the same time I use it to add more functionalities.
+ If standard dictionary is needed, just convert this class to dictionary with `dict` function
  """
  __slots__ = ('_data',)
 
@@ -1,9 +1,11 @@
  """
  Most of this file is adapted version of the translator of parsel library with some modifications simply for 1 important reason...
- To add pseudo-elements ``::text`` and ``::attr(ATTR_NAME)`` so we match Parsel/Scrapy selectors format
- which will be important in future releases but most importantly...
- so you don't have to learn a new selectors/api method like what bs4 done with soupsieve :)
- > if you want to learn about this, head to https://cssselect.readthedocs.io/en/latest/#cssselect.FunctionalPseudoElement
+
+ To add pseudo-elements ``::text`` and ``::attr(ATTR_NAME)`` so we match Parsel/Scrapy selectors format which will be important in future releases but most importantly...
+
+ So you don't have to learn a new selectors/api method like what bs4 done with soupsieve :)
+
+ if you want to learn about this, head to https://cssselect.readthedocs.io/en/latest/#cssselect.FunctionalPseudoElement
  """
 
  import re
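
The pseudo-elements this docstring refers to let you keep writing parsel/Scrapy-style selectors against Scrapling's parser. A minimal hedged sketch (the HTML snippet is purely illustrative, and the parsel-like `css` method on `Adaptor` is assumed from the library's selector API):

    from scrapling.parser import Adaptor

    # Tiny illustrative document, not taken from the package
    page = Adaptor(text='<html><body><a href="https://example.com">Example</a></body></html>')
    page.css('a::text')        # text nodes of the matched <a> elements
    page.css('a::attr(href)')  # values of their href attributes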
scrapling/engines/camo.py CHANGED
@@ -4,6 +4,7 @@ from scrapling.core._types import Union, Callable, Optional, Dict, List, Literal
  from scrapling.engines.toolbelt import (
  Response,
  do_nothing,
+ StatusText,
  get_os_name,
  intercept_route,
  check_type_validity,
@@ -103,20 +104,22 @@ class CamoufoxEngine:
 
  if self.wait_selector and type(self.wait_selector) is str:
  waiter = page.locator(self.wait_selector)
- waiter.wait_for(state=self.wait_selector_state)
+ waiter.first.wait_for(state=self.wait_selector_state)
 
- content_type = res.headers.get('content-type', '')
- # Parse charset from content-type
- encoding = 'utf-8' # default encoding
- if 'charset=' in content_type.lower():
- encoding = content_type.lower().split('charset=')[-1].split(';')[0].strip()
+ # This will be parsed inside `Response`
+ encoding = res.headers.get('content-type', '') or 'utf-8' # default encoding
+
+ status_text = res.status_text
+ # PlayWright API sometimes give empty status text for some reason!
+ if not status_text:
+ status_text = StatusText.get(res.status)
 
  response = Response(
  url=res.url,
  text=page.content(),
- body=res.body(),
+ body=page.content().encode('utf-8'),
  status=res.status,
- reason=res.status_text,
+ reason=status_text,
  encoding=encoding,
  cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
  headers=res.all_headers(),
scrapling/engines/pw.py CHANGED
@@ -6,6 +6,7 @@ from scrapling.engines.constants import DEFAULT_STEALTH_FLAGS, NSTBROWSER_DEFAUL
  from scrapling.engines.toolbelt import (
  Response,
  do_nothing,
+ StatusText,
  js_bypass_path,
  intercept_route,
  generate_headers,
@@ -213,20 +214,22 @@ class PlaywrightEngine:
 
  if self.wait_selector and type(self.wait_selector) is str:
  waiter = page.locator(self.wait_selector)
- waiter.wait_for(state=self.wait_selector_state)
+ waiter.first.wait_for(state=self.wait_selector_state)
 
- content_type = res.headers.get('content-type', '')
- # Parse charset from content-type
- encoding = 'utf-8' # default encoding
- if 'charset=' in content_type.lower():
- encoding = content_type.lower().split('charset=')[-1].split(';')[0].strip()
+ # This will be parsed inside `Response`
+ encoding = res.headers.get('content-type', '') or 'utf-8' # default encoding
+
+ status_text = res.status_text
+ # PlayWright API sometimes give empty status text for some reason!
+ if not status_text:
+ status_text = StatusText.get(res.status)
 
  response = Response(
  url=res.url,
  text=page.content(),
- body=res.body(),
+ body=page.content().encode('utf-8'),
  status=res.status,
- reason=res.status_text,
+ reason=status_text,
  encoding=encoding,
  cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
  headers=res.all_headers(),
@@ -23,7 +23,7 @@ class StaticEngine:
  @staticmethod
  def _headers_job(headers: Optional[Dict], url: str, stealth: bool) -> Dict:
  """Adds useragent to headers if it doesn't exist, generates real headers and append it to current headers, and
- finally generates a referer header that looks like if this request came from Google's search of the current URL's domain.
+ finally generates a referer header that looks like if this request came from Google's search of the current URL's domain.
 
  :param headers: Current headers in the request if the user passed any
  :param url: The Target URL.
@@ -65,6 +65,7 @@ class StaticEngine:
 
  def get(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
  """Make basic HTTP GET request for you but with some added flavors.
+
  :param url: Target url.
  :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
  create a referer header as if this request had came from Google's search of this URL's domain.
@@ -77,6 +78,7 @@ class StaticEngine:
 
  def post(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
  """Make basic HTTP POST request for you but with some added flavors.
+
  :param url: Target url.
  :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
  create a referer header as if this request had came from Google's search of this URL's domain.
@@ -89,6 +91,7 @@ class StaticEngine:
 
  def delete(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
  """Make basic HTTP DELETE request for you but with some added flavors.
+
  :param url: Target url.
  :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
  create a referer header as if this request had came from Google's search of this URL's domain.
@@ -101,6 +104,7 @@ class StaticEngine:
 
  def put(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
  """Make basic HTTP PUT request for you but with some added flavors.
+
  :param url: Target url.
  :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
  create a referer header as if this request had came from Google's search of this URL's domain.
@@ -6,6 +6,7 @@ from .fingerprints import (
  from .custom import (
  Response,
  do_nothing,
+ StatusText,
  BaseFetcher,
  get_variable_name,
  check_type_validity,
@@ -3,10 +3,78 @@ Functions related to custom types or type checking
  """
  import inspect
  import logging
+ from email.message import Message
 
- from scrapling.core.utils import setup_basic_logging
+ from scrapling.core.custom_types import MappingProxyType
  from scrapling.parser import Adaptor, SQLiteStorageSystem
- from scrapling.core._types import Any, List, Type, Union, Optional, Dict, Callable
+ from scrapling.core.utils import setup_basic_logging, cache
+ from scrapling.core._types import Any, List, Type, Union, Optional, Dict, Callable, Tuple
+
+
+ class ResponseEncoding:
+ __DEFAULT_ENCODING = "utf-8"
+ __ISO_8859_1_CONTENT_TYPES = {"text/plain", "text/html", "text/css", "text/javascript"}
+
+ @classmethod
+ @cache(maxsize=None)
+ def __parse_content_type(cls, header_value: str) -> Tuple[str, Dict[str, str]]:
+ """Parse content type and parameters from a content-type header value.
+
+ Uses `email.message.Message` for robust header parsing according to RFC 2045.
+
+ :param header_value: Raw content-type header string
+ :return: Tuple of (content_type, parameters_dict)
+ """
+ # Create a Message object and set the Content-Type header then get the content type and parameters
+ msg = Message()
+ msg['content-type'] = header_value
+
+ content_type = msg.get_content_type()
+ params = dict(msg.get_params(failobj=[]))
+
+ # Remove the content-type from params if present somehow
+ params.pop('content-type', None)
+
+ return content_type, params
+
+ @classmethod
+ @cache(maxsize=None)
+ def get_value(cls, content_type: Optional[str]) -> str:
+ """Determine the appropriate character encoding from a content-type header.
+
+ The encoding is determined by these rules in order:
+ 1. If no content-type is provided, use UTF-8
+ 2. If charset parameter is present, use that encoding
+ 3. If content-type is `text/*`, use ISO-8859-1 per HTTP/1.1 spec
+ 4. If content-type is application/json, use UTF-8 per RFC 4627
+ 5. Default to UTF-8 if nothing else matches
+
+ :param content_type: Content-Type header value or None
+ :return: String naming the character encoding
+ """
+ if not content_type:
+ return cls.__DEFAULT_ENCODING
+
+ try:
+ content_type, params = cls.__parse_content_type(content_type)
+
+ # First check for explicit charset parameter
+ if "charset" in params:
+ encoding = params["charset"].strip("'\"")
+ "test".encode(encoding) # Validate encoding
+ return encoding
+
+ # Apply content-type specific rules
+ if content_type in cls.__ISO_8859_1_CONTENT_TYPES:
+ return "ISO-8859-1"
+
+ if content_type == "application/json":
+ return cls.__DEFAULT_ENCODING
+
+ return cls.__DEFAULT_ENCODING
+
+ except (ValueError, LookupError, UnicodeEncodeError):
+ return cls.__DEFAULT_ENCODING
 
 
  class Response(Adaptor):
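
Given the rule order spelled out in the `get_value` docstring above, a short hedged sketch of the expected results (the values mirror the content-type table in the new `test_utils.py` shown later in this diff):

    from scrapling.engines.toolbelt.custom import ResponseEncoding

    ResponseEncoding.get_value('text/html; charset="windows-1252"')  # explicit charset wins -> 'windows-1252'
    ResponseEncoding.get_value('text/html')                          # text/* without charset -> 'ISO-8859-1'
    ResponseEncoding.get_value('application/json')                   # JSON without charset -> 'utf-8'
    ResponseEncoding.get_value(None)                                 # no header at all -> 'utf-8'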
@@ -19,6 +87,7 @@ class Response(Adaptor):
  self.cookies = cookies
  self.headers = headers
  self.request_headers = request_headers
+ encoding = ResponseEncoding.get_value(encoding)
  super().__init__(text=text, body=body, url=automatch_domain or url, encoding=encoding, **adaptor_arguments)
  # For back-ward compatibility
  self.adaptor = self
@@ -67,6 +136,83 @@ class BaseFetcher:
  self.adaptor_arguments.update({'automatch_domain': automatch_domain})
 
 
+ class StatusText:
+ """A class that gets the status text of response status code.
+
+ Reference: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status
+ """
+ _phrases = MappingProxyType({
+ 100: "Continue",
+ 101: "Switching Protocols",
+ 102: "Processing",
+ 103: "Early Hints",
+ 200: "OK",
+ 201: "Created",
+ 202: "Accepted",
+ 203: "Non-Authoritative Information",
+ 204: "No Content",
+ 205: "Reset Content",
+ 206: "Partial Content",
+ 207: "Multi-Status",
+ 208: "Already Reported",
+ 226: "IM Used",
+ 300: "Multiple Choices",
+ 301: "Moved Permanently",
+ 302: "Found",
+ 303: "See Other",
+ 304: "Not Modified",
+ 305: "Use Proxy",
+ 307: "Temporary Redirect",
+ 308: "Permanent Redirect",
+ 400: "Bad Request",
+ 401: "Unauthorized",
+ 402: "Payment Required",
+ 403: "Forbidden",
+ 404: "Not Found",
+ 405: "Method Not Allowed",
+ 406: "Not Acceptable",
+ 407: "Proxy Authentication Required",
+ 408: "Request Timeout",
+ 409: "Conflict",
+ 410: "Gone",
+ 411: "Length Required",
+ 412: "Precondition Failed",
+ 413: "Payload Too Large",
+ 414: "URI Too Long",
+ 415: "Unsupported Media Type",
+ 416: "Range Not Satisfiable",
+ 417: "Expectation Failed",
+ 418: "I'm a teapot",
+ 421: "Misdirected Request",
+ 422: "Unprocessable Entity",
+ 423: "Locked",
+ 424: "Failed Dependency",
+ 425: "Too Early",
+ 426: "Upgrade Required",
+ 428: "Precondition Required",
+ 429: "Too Many Requests",
+ 431: "Request Header Fields Too Large",
+ 451: "Unavailable For Legal Reasons",
+ 500: "Internal Server Error",
+ 501: "Not Implemented",
+ 502: "Bad Gateway",
+ 503: "Service Unavailable",
+ 504: "Gateway Timeout",
+ 505: "HTTP Version Not Supported",
+ 506: "Variant Also Negotiates",
+ 507: "Insufficient Storage",
+ 508: "Loop Detected",
+ 510: "Not Extended",
+ 511: "Network Authentication Required"
+ })
+
+ @classmethod
+ @cache(maxsize=128)
+ def get(cls, status_code: int) -> str:
+ """Get the phrase for a given HTTP status code."""
+ return cls._phrases.get(status_code, "Unknown Status Code")
+
+
  def check_if_engine_usable(engine: Callable) -> Union[Callable, None]:
  """This function check if the passed engine can be used by a Fetcher-type class or not.
 
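`StatusText` is what the two browser engines above fall back to when Playwright hands back an empty `status_text`. A hedged usage sketch (`res` stands in for a Playwright-style response object and is purely illustrative):

    from scrapling.engines.toolbelt import StatusText

    StatusText.get(200)   # -> 'OK'
    StatusText.get(404)   # -> 'Not Found'
    StatusText.get(1000)  # -> 'Unknown Status Code' (unmapped codes get a safe default)

    # The fallback pattern used in camo.py and pw.py above
    reason = res.status_text or StatusText.get(res.status)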
scrapling/fetchers.py CHANGED
@@ -11,6 +11,7 @@ class Fetcher(BaseFetcher):
  """
  def get(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
  """Make basic HTTP GET request for you but with some added flavors.
+
  :param url: Target url.
  :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
  :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
@@ -24,6 +25,7 @@ class Fetcher(BaseFetcher):
 
  def post(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
  """Make basic HTTP POST request for you but with some added flavors.
+
  :param url: Target url.
  :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
  :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
@@ -37,12 +39,14 @@ class Fetcher(BaseFetcher):
 
  def put(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
  """Make basic HTTP PUT request for you but with some added flavors.
+
  :param url: Target url
  :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
  :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
  :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
- create a referer header as if this request came from Google's search of this URL's domain.
+ create a referer header as if this request came from Google's search of this URL's domain.
  :param kwargs: Any additional keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
+
  :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
  """
  response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).put(url, stealthy_headers, **kwargs)
@@ -50,6 +54,7 @@ class Fetcher(BaseFetcher):
 
  def delete(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
  """Make basic HTTP DELETE request for you but with some added flavors.
+
  :param url: Target url
  :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
  :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
@@ -77,6 +82,7 @@ class StealthyFetcher(BaseFetcher):
  ) -> Response:
  """
  Opens up a browser and do your request based on your chosen options below.
+
  :param url: Target url.
  :param headless: Run the browser in headless/hidden (default), 'virtual' screen mode, or headful/visible mode.
  :param block_images: Prevent the loading of images through Firefox preferences.
@@ -127,14 +133,15 @@ class PlayWrightFetcher(BaseFetcher):
  Using this Fetcher class, you can do requests with:
  - Vanilla Playwright without any modifications other than the ones you chose.
  - Stealthy Playwright with the stealth mode I wrote for it. It's still a work in progress but it bypasses many online tests like bot.sannysoft.com
- Some of the things stealth mode does include:
- 1) Patches the CDP runtime fingerprint.
- 2) Mimics some of the real browsers' properties by injecting several JS files and using custom options.
- 3) Using custom flags on launch to hide Playwright even more and make it faster.
- 4) Generates real browser's headers of the same type and same user OS then append it to the request.
+ Some of the things stealth mode does include:
+ 1) Patches the CDP runtime fingerprint.
+ 2) Mimics some of the real browsers' properties by injecting several JS files and using custom options.
+ 3) Using custom flags on launch to hide Playwright even more and make it faster.
+ 4) Generates real browser's headers of the same type and same user OS then append it to the request.
  - Real browsers by passing the CDP URL of your browser to be controlled by the Fetcher and most of the options can be enabled on it.
  - NSTBrowser's docker browserless option by passing the CDP URL and enabling `nstbrowser_mode` option.
- > Note that these are the main options with PlayWright but it can be mixed together.
+
+ > Note that these are the main options with PlayWright but it can be mixed together.
  """
  def fetch(
  self, url: str, headless: Union[bool, str] = True, disable_resources: bool = None,
@@ -147,6 +154,7 @@ class PlayWrightFetcher(BaseFetcher):
  nstbrowser_mode: bool = False, nstbrowser_config: Optional[Dict] = None,
  ) -> Response:
  """Opens up a browser and do your request based on your chosen options below.
+
  :param url: Target url.
  :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
  :param disable_resources: Drop requests of unnecessary resources for speed boost. It depends but it made requests ~25% faster in my tests for some websites.
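
For context on how the fetchers touched by these docstring changes are used, a hedged end-to-end sketch (class names assumed to be re-exported at the package top level, the URL is purely illustrative):

    from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher  # assuming top-level exports

    page = Fetcher().get('https://example.com')   # httpx-backed request
    print(page.status, page.reason)               # Response adds status, reason, cookies, headers, request_headers

    page = StealthyFetcher().fetch('https://example.com', wait_selector='h1')  # Camoufox engine
    page = PlayWrightFetcher().fetch('https://example.com', stealth=True)      # Playwright engine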
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: scrapling
- Version: 0.2.3
+ Version: 0.2.5
  Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
  Home-page: https://github.com/D4Vinci/Scrapling
  Author: Karim Shoair
@@ -32,16 +32,16 @@ Classifier: Typing :: Typed
  Requires-Python: >=3.8
  Description-Content-Type: text/markdown
  License-File: LICENSE
- Requires-Dist: requests >=2.3
- Requires-Dist: lxml >=4.5
- Requires-Dist: cssselect >=1.2
+ Requires-Dist: requests>=2.3
+ Requires-Dist: lxml>=4.5
+ Requires-Dist: cssselect>=1.2
  Requires-Dist: w3lib
- Requires-Dist: orjson >=3
+ Requires-Dist: orjson>=3
  Requires-Dist: tldextract
  Requires-Dist: httpx[brotli,zstd]
  Requires-Dist: playwright
  Requires-Dist: rebrowser-playwright
- Requires-Dist: camoufox >=0.3.10
+ Requires-Dist: camoufox>=0.3.10
  Requires-Dist: browserforge
 
  # 🕷️ Scrapling: Undetectable, Lightning-Fast, and Adaptive Web Scraping for Python
@@ -1,22 +1,22 @@
- scrapling/__init__.py,sha256=tNB1LdlhamZYjlxqbEwo1BpsVE48Bt8acfjl8DNKpEE,435
+ scrapling/__init__.py,sha256=eSTwhUqPltd41XPPwSXElWmeRalaQEncnjBahW3KUcQ,435
  scrapling/defaults.py,sha256=blYDLiuI5DgDSLRWnUgpp21WtFOsv1BsCRCmPeg8Xc4,287
- scrapling/fetchers.py,sha256=_6mL7XSTZE1fHXBqbxE2bBHnlQP1lH-4MCiQHQd5hQs,16017
+ scrapling/fetchers.py,sha256=T3aRXvvpXDbql_2BqJ90KLtgfF4HLUOImdoxWLCcY-A,16045
  scrapling/parser.py,sha256=d2n00uF5i7W5lf0afLNRdk17ZFcNyiF9EzXLRQGA0NM,54111
  scrapling/py.typed,sha256=frcCV1k9oG9oKj3dpUqdJg1PxRT2RSN_XKdLCPjaYaY,2
  scrapling/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  scrapling/core/_types.py,sha256=nD2ZY_fitLohx3MfDmqoKJ9ZShrnRhQ8-d1SU1zEGAY,552
- scrapling/core/custom_types.py,sha256=-gMNOiByewoAUqFVrDp822V51rcWNlWVUOB6yGUL648,8403
+ scrapling/core/custom_types.py,sha256=ztE_tshJ8i5uKqqSbsN5S6MoIUSfX6SexlhRjAnkclk,8402
  scrapling/core/mixins.py,sha256=sozbpaGL1_O_x3U-ABM5aYWpnxpCLfdbcA9SG3P7weY,3532
  scrapling/core/storage_adaptors.py,sha256=Kbak0BOJX5e9I1PbUS_4sUJi2Wxw8Bv5XsaLHAu1l2Q,6218
- scrapling/core/translator.py,sha256=oU-dQCkNQOccZPrXbPW_VSgC5ll10Bb89C3ezW2lI0o,5228
+ scrapling/core/translator.py,sha256=R97lKGq1SDbx8S8Hg_w_5d4ePgukTHj_hRIKFzWiRuc,5229
  scrapling/core/utils.py,sha256=fXdANUgRBbVbOerJ94fRY9vi7n5zsbm8t3G4qQ-F3ak,3792
  scrapling/engines/__init__.py,sha256=zwMqcSdNGh-IX0d4zXazrgAeHrkqIN_v5Ia7RU1g8W0,267
- scrapling/engines/camo.py,sha256=41vp2Nh51kKuOSZ1PijsIpROpQZgFfUPybVbEX8pEXk,7530
+ scrapling/engines/camo.py,sha256=dXkdfFmf3M09RXAvaZ8CE5khsblC3Wd7_6jWfu8XO6I,7618
  scrapling/engines/constants.py,sha256=jSDA6lgbvEIB8z2m2SFzCKkvFEZnp28Mondy2__FpkM,3721
- scrapling/engines/pw.py,sha256=l5MrSW_WNBKAxAlyxbt09ka_lEGo61XKuaOgWpYmvHk,12102
- scrapling/engines/static.py,sha256=Wsp6_-soZUQJT6kHoKPkLOdHU9J50chLdYxDmQjO4FQ,7101
- scrapling/engines/toolbelt/__init__.py,sha256=BnBp34aDeohYgqdysEAAWnGZgA02YlExkc5FJLetMSo,367
- scrapling/engines/toolbelt/custom.py,sha256=8lvGHWIZoOotSTF97KgPb3CbJquel2QFx8rP8Hf2sQ4,7469
+ scrapling/engines/pw.py,sha256=Llr2HjEr5kgyffEJH5N4XdWrSWvcil9WAwE1RJnOhFo,12190
+ scrapling/engines/static.py,sha256=wzBsoOHPpN5JV1izQSSSarPBNWB-wo0BDWNFuin6ID8,7109
+ scrapling/engines/toolbelt/__init__.py,sha256=BbxfC0depVOV3i3BnBnyfjHtLcZrDbhz6c5rTRczZUc,383
+ scrapling/engines/toolbelt/custom.py,sha256=ELr3_FwUqNI27E98kz-50OA5a6hQQtoIYrZoLKsvUpM,12551
  scrapling/engines/toolbelt/fingerprints.py,sha256=kkVtZKSt2ukc0CV0g6QUvSWR0Yx5p8Mv8xiqACAsMBo,2917
  scrapling/engines/toolbelt/navigation.py,sha256=Tde5_6Wv7lOeWXMzs9D6TRaxAbJ3b-zIX6-4HggZbCQ,4017
  scrapling/engines/toolbelt/bypasses/navigator_plugins.js,sha256=tbnnk3nCXB6QEQnOhDlu3n-s7lnUTAkrUsjP6FDQIQg,2104
@@ -28,14 +28,15 @@ scrapling/engines/toolbelt/bypasses/webdriver_fully.js,sha256=hdJw4clRAJQqIdq5gI
  scrapling/engines/toolbelt/bypasses/window_chrome.js,sha256=D7hqzNGGDorh8JVlvm2YIv7Bk2CoVkG55MDIdyqhT1w,6808
  tests/__init__.py,sha256=YHFB5ftzgLQVh6gbPfbYcY4yOS9DOBp5dBa6I-qtm8U,32
  tests/fetchers/__init__.py,sha256=6H4NgARhyTcGGd3dNCKQJ8kUFdrAEMSScQL7Ga_vU3c,43
- tests/fetchers/test_camoufox.py,sha256=XPTCDZ9sj_GpCzXyvzKF_uZWhEYX6J_jh_BLeMEl8yY,2874
+ tests/fetchers/test_camoufox.py,sha256=53piGA5uuPvOx5BeUEA0bbizYihwHGxehnj5uqCr6Q0,3115
  tests/fetchers/test_httpx.py,sha256=UivOItR3-l-bXp9E6TP5Tvn2OrCdgiVkWsti-f9xdpU,3507
- tests/fetchers/test_playwright.py,sha256=YOWn89urd9NwoCHfTFj8fY4xYrRY2BeszTt5Q-TxUcs,3479
+ tests/fetchers/test_playwright.py,sha256=7qwbIU2SwjiQEbaGPA_MBo6kAXM4IBmfvy5kUvKT11M,3701
+ tests/fetchers/test_utils.py,sha256=FPPJkBrqgYxdGeWwapH8Vj8zyfYVLiTE1qSLu8eBWik,5728
  tests/parser/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  tests/parser/test_automatch.py,sha256=BeeYJi3cYCghbiZmi57z4bqcGPaoUA8GAm7MALBBkkk,2486
- tests/parser/test_general.py,sha256=NfTuGLgAm-LH0dVV0pvbRcYSNI-wSu05rdnuRzmB0m4,11664
- scrapling-0.2.3.dist-info/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
- scrapling-0.2.3.dist-info/METADATA,sha256=YGUnQmOdwDVs7rHLUCG9hpLg567s_5I0Cu8TIKKDc_Y,64785
- scrapling-0.2.3.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
- scrapling-0.2.3.dist-info/top_level.txt,sha256=ub7FkOEXeYmmYTUxd4pCrwXfBfAMIpZ1sCGmXCc14tI,16
- scrapling-0.2.3.dist-info/RECORD,,
+ tests/parser/test_general.py,sha256=qaiVzpvqESfdXYFat6QrpnMkevPYgCzIcTZK5FwdC0s,11783
+ scrapling-0.2.5.dist-info/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
+ scrapling-0.2.5.dist-info/METADATA,sha256=oY8RtycwNJXbgA4ePsVkpj6SKP6Y6vMmOgF9L3lc0mg,64780
+ scrapling-0.2.5.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+ scrapling-0.2.5.dist-info/top_level.txt,sha256=ub7FkOEXeYmmYTUxd4pCrwXfBfAMIpZ1sCGmXCc14tI,16
+ scrapling-0.2.5.dist-info/RECORD,,
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (75.5.0)
+ Generator: setuptools (75.6.0)
  Root-Is-Purelib: true
  Tag: py3-none-any
 
@@ -36,6 +36,7 @@ class TestStealthyFetcher(unittest.TestCase):
  def test_waiting_selector(self):
  """Test if waiting for a selector make page does not finish loading or not"""
  self.assertEqual(self.fetcher.fetch(self.html_url, wait_selector='h1').status, 200)
+ self.assertEqual(self.fetcher.fetch(self.html_url, wait_selector='h1', wait_selector_state='visible').status, 200)
 
  def test_cookies_loading(self):
  """Test if cookies are set after the request"""
@@ -56,6 +57,7 @@ class TestStealthyFetcher(unittest.TestCase):
  self.assertEqual(self.fetcher.fetch(self.html_url, block_webrtc=True, allow_webgl=True).status, 200)
  self.assertEqual(self.fetcher.fetch(self.html_url, block_webrtc=False, allow_webgl=True).status, 200)
  self.assertEqual(self.fetcher.fetch(self.html_url, block_webrtc=True, allow_webgl=False).status, 200)
+ self.assertEqual(self.fetcher.fetch(self.html_url, extra_headers={'ayo': ''}, os_randomize=True).status, 200)
 
  def test_infinite_timeout(self):
  """Test if infinite timeout breaks the code or not"""
@@ -35,6 +35,7 @@ class TestPlayWrightFetcher(unittest.TestCase):
  def test_waiting_selector(self):
  """Test if waiting for a selector make page does not finish loading or not"""
  self.assertEqual(self.fetcher.fetch(self.html_url, wait_selector='h1').status, 200)
+ self.assertEqual(self.fetcher.fetch(self.html_url, wait_selector='h1', wait_selector_state='visible').status, 200)
 
  def test_cookies_loading(self):
  """Test if cookies are set after the request"""
@@ -56,6 +57,7 @@ class TestPlayWrightFetcher(unittest.TestCase):
  self.assertEqual(self.fetcher.fetch(self.html_url, disable_webgl=False, hide_canvas=True).status, 200)
  self.assertEqual(self.fetcher.fetch(self.html_url, stealth=True).status, 200)
  self.assertEqual(self.fetcher.fetch(self.html_url, useragent='Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:131.0) Gecko/20100101 Firefox/131.0').status, 200)
+ self.assertEqual(self.fetcher.fetch(self.html_url, extra_headers={'ayo': ''}).status, 200)
 
  def test_cdp_url(self):
  """Test if it's going to try to connect to cdp url or not"""
@@ -0,0 +1,129 @@
+ import unittest
+
+ from scrapling.engines.toolbelt.custom import ResponseEncoding, StatusText
+
+
+ class TestPlayWrightFetcher(unittest.TestCase):
+ def setUp(self):
+ self.content_type_map = {
+ # A map generated by ChatGPT for most possible `content_type` values and the expected outcome
+ 'text/html; charset=UTF-8': 'UTF-8',
+ 'text/html; charset=ISO-8859-1': 'ISO-8859-1',
+ 'text/html': 'ISO-8859-1',
+ 'application/json; charset=UTF-8': 'UTF-8',
+ 'application/json': 'utf-8',
+ 'text/json': 'utf-8',
+ 'application/javascript; charset=UTF-8': 'UTF-8',
+ 'application/javascript': 'utf-8',
+ 'text/plain; charset=UTF-8': 'UTF-8',
+ 'text/plain; charset=ISO-8859-1': 'ISO-8859-1',
+ 'text/plain': 'ISO-8859-1',
+ 'application/xhtml+xml; charset=UTF-8': 'UTF-8',
+ 'application/xhtml+xml': 'utf-8',
+ 'text/html; charset=windows-1252': 'windows-1252',
+ 'application/json; charset=windows-1252': 'windows-1252',
+ 'text/plain; charset=windows-1252': 'windows-1252',
+ 'text/html; charset="UTF-8"': 'UTF-8',
+ 'text/html; charset="ISO-8859-1"': 'ISO-8859-1',
+ 'text/html; charset="windows-1252"': 'windows-1252',
+ 'application/json; charset="UTF-8"': 'UTF-8',
+ 'application/json; charset="ISO-8859-1"': 'ISO-8859-1',
+ 'application/json; charset="windows-1252"': 'windows-1252',
+ 'text/json; charset="UTF-8"': 'UTF-8',
+ 'application/javascript; charset="UTF-8"': 'UTF-8',
+ 'application/javascript; charset="ISO-8859-1"': 'ISO-8859-1',
+ 'text/plain; charset="UTF-8"': 'UTF-8',
+ 'text/plain; charset="ISO-8859-1"': 'ISO-8859-1',
+ 'text/plain; charset="windows-1252"': 'windows-1252',
+ 'application/xhtml+xml; charset="UTF-8"': 'UTF-8',
+ 'application/xhtml+xml; charset="ISO-8859-1"': 'ISO-8859-1',
+ 'application/xhtml+xml; charset="windows-1252"': 'windows-1252',
+ 'text/html; charset="US-ASCII"': 'US-ASCII',
+ 'application/json; charset="US-ASCII"': 'US-ASCII',
+ 'text/plain; charset="US-ASCII"': 'US-ASCII',
+ 'text/html; charset="Shift_JIS"': 'Shift_JIS',
+ 'application/json; charset="Shift_JIS"': 'Shift_JIS',
+ 'text/plain; charset="Shift_JIS"': 'Shift_JIS',
+ 'application/xml; charset="UTF-8"': 'UTF-8',
+ 'application/xml; charset="ISO-8859-1"': 'ISO-8859-1',
+ 'application/xml': 'utf-8',
+ 'text/xml; charset="UTF-8"': 'UTF-8',
+ 'text/xml; charset="ISO-8859-1"': 'ISO-8859-1',
+ 'text/xml': 'utf-8'
+ }
+ self.status_map = {
+ 100: "Continue",
+ 101: "Switching Protocols",
+ 102: "Processing",
+ 103: "Early Hints",
+ 200: "OK",
+ 201: "Created",
+ 202: "Accepted",
+ 203: "Non-Authoritative Information",
+ 204: "No Content",
+ 205: "Reset Content",
+ 206: "Partial Content",
+ 207: "Multi-Status",
+ 208: "Already Reported",
+ 226: "IM Used",
+ 300: "Multiple Choices",
+ 301: "Moved Permanently",
+ 302: "Found",
+ 303: "See Other",
+ 304: "Not Modified",
+ 305: "Use Proxy",
+ 307: "Temporary Redirect",
+ 308: "Permanent Redirect",
+ 400: "Bad Request",
+ 401: "Unauthorized",
+ 402: "Payment Required",
+ 403: "Forbidden",
+ 404: "Not Found",
+ 405: "Method Not Allowed",
+ 406: "Not Acceptable",
+ 407: "Proxy Authentication Required",
+ 408: "Request Timeout",
+ 409: "Conflict",
+ 410: "Gone",
+ 411: "Length Required",
+ 412: "Precondition Failed",
+ 413: "Payload Too Large",
+ 414: "URI Too Long",
+ 415: "Unsupported Media Type",
+ 416: "Range Not Satisfiable",
+ 417: "Expectation Failed",
+ 418: "I'm a teapot",
+ 421: "Misdirected Request",
+ 422: "Unprocessable Entity",
+ 423: "Locked",
+ 424: "Failed Dependency",
+ 425: "Too Early",
+ 426: "Upgrade Required",
+ 428: "Precondition Required",
+ 429: "Too Many Requests",
+ 431: "Request Header Fields Too Large",
+ 451: "Unavailable For Legal Reasons",
+ 500: "Internal Server Error",
+ 501: "Not Implemented",
+ 502: "Bad Gateway",
+ 503: "Service Unavailable",
+ 504: "Gateway Timeout",
+ 505: "HTTP Version Not Supported",
+ 506: "Variant Also Negotiates",
+ 507: "Insufficient Storage",
+ 508: "Loop Detected",
+ 510: "Not Extended",
+ 511: "Network Authentication Required"
+ }
+
+ def test_parsing_content_type(self):
+ """Test if parsing different types of content-type returns the expected result"""
+ for header_value, expected_encoding in self.content_type_map.items():
+ self.assertEqual(ResponseEncoding.get_value(header_value), expected_encoding)
+
+ def test_parsing_response_status(self):
+ """Test if using different http responses' status codes returns the expected result"""
+ for status_code, expected_status_text in self.status_map.items():
+ self.assertEqual(StatusText.get(status_code), expected_status_text)
+
+ self.assertEqual(StatusText.get(1000), "Unknown Status Code")
@@ -278,7 +278,7 @@ class TestParser(unittest.TestCase):
  self.assertEqual(len(elements), 5000)
  # Converting 5000 elements to a class and doing operations on them will take time
  # Based on my tests with 100 runs, 1 loop each Scrapling (given the extra work/features) takes 10.4ms on average
- self.assertLess(end_time - start_time, 0.1)
+ self.assertLess(end_time - start_time, 0.5) # Locally I test on 0.1 but on GitHub actions with browsers and threading sometimes closing adds fractions of seconds
 
 
  # Use `coverage run -m unittest --verbose tests/test_parser_functions.py` instead for the coverage report