scrapling 0.2.3__py3-none-any.whl → 0.2.5__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package versions exactly as they appear in their public registry.
scrapling/__init__.py CHANGED
@@ -4,7 +4,7 @@ from scrapling.parser import Adaptor, Adaptors
  from scrapling.core.custom_types import TextHandler, AttributesHandler
 
  __author__ = "Karim Shoair (karim.shoair@pm.me)"
- __version__ = "0.2.3"
+ __version__ = "0.2.5"
  __copyright__ = "Copyright (c) 2024 Karim Shoair"
 
 
scrapling/core/custom_types.py CHANGED
@@ -129,9 +129,8 @@ class TextHandlers(List[TextHandler]):
 
 
  class AttributesHandler(Mapping):
- """A read-only mapping to use instead of the standard dictionary for the speed boost but
- at the same time I use it to add more functionalities.
- If standard dictionary is needed, just convert this class to dictionary with `dict` function
+ """A read-only mapping to use instead of the standard dictionary for the speed boost but at the same time I use it to add more functionalities.
+ If standard dictionary is needed, just convert this class to dictionary with `dict` function
  """
  __slots__ = ('_data',)
 
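For context, the hunk above only reflows the `AttributesHandler` docstring; the behaviour it describes is unchanged. A minimal usage sketch of that read-only mapping, assuming the public `Adaptor` API (the HTML snippet is invented for illustration):

from scrapling import Adaptor

page = Adaptor('<html><body><a href="/about" id="link">About</a></body></html>')
link = page.css_first('a')

attrs = link.attrib      # an AttributesHandler, behaves like a read-only dict
print(attrs['href'])     # normal read access -> "/about"
print(dict(attrs))       # convert to a standard dictionary when mutation is needed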
scrapling/core/translator.py CHANGED
@@ -1,9 +1,11 @@
  """
  Most of this file is adapted version of the translator of parsel library with some modifications simply for 1 important reason...
- To add pseudo-elements ``::text`` and ``::attr(ATTR_NAME)`` so we match Parsel/Scrapy selectors format
- which will be important in future releases but most importantly...
- so you don't have to learn a new selectors/api method like what bs4 done with soupsieve :)
- > if you want to learn about this, head to https://cssselect.readthedocs.io/en/latest/#cssselect.FunctionalPseudoElement
+
+ To add pseudo-elements ``::text`` and ``::attr(ATTR_NAME)`` so we match Parsel/Scrapy selectors format which will be important in future releases but most importantly...
+
+ So you don't have to learn a new selectors/api method like what bs4 done with soupsieve :)
+
+ if you want to learn about this, head to https://cssselect.readthedocs.io/en/latest/#cssselect.FunctionalPseudoElement
  """
 
  import re
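As the docstring above says, this translator adds the ``::text`` and ``::attr(ATTR_NAME)`` pseudo-elements so CSS selectors follow the Parsel/Scrapy format. A short hedged sketch of what that enables, assuming the public `Adaptor.css` API (the HTML is made up):

from scrapling import Adaptor

page = Adaptor('<div><a href="/docs" class="menu">Read the docs</a></div>')

# '::text' selects the element's text, '::attr(name)' selects an attribute value
print(page.css('a::text'))        # expected: ['Read the docs']
print(page.css('a::attr(href)'))  # expected: ['/docs']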
scrapling/engines/camo.py CHANGED
@@ -4,6 +4,7 @@ from scrapling.core._types import Union, Callable, Optional, Dict, List, Literal
  from scrapling.engines.toolbelt import (
  Response,
  do_nothing,
+ StatusText,
  get_os_name,
  intercept_route,
  check_type_validity,
@@ -103,20 +104,22 @@ class CamoufoxEngine:
 
  if self.wait_selector and type(self.wait_selector) is str:
  waiter = page.locator(self.wait_selector)
- waiter.wait_for(state=self.wait_selector_state)
+ waiter.first.wait_for(state=self.wait_selector_state)
 
- content_type = res.headers.get('content-type', '')
- # Parse charset from content-type
- encoding = 'utf-8' # default encoding
- if 'charset=' in content_type.lower():
- encoding = content_type.lower().split('charset=')[-1].split(';')[0].strip()
+ # This will be parsed inside `Response`
+ encoding = res.headers.get('content-type', '') or 'utf-8' # default encoding
+
+ status_text = res.status_text
+ # PlayWright API sometimes give empty status text for some reason!
+ if not status_text:
+ status_text = StatusText.get(res.status)
 
  response = Response(
  url=res.url,
  text=page.content(),
- body=res.body(),
+ body=page.content().encode('utf-8'),
  status=res.status,
- reason=res.status_text,
+ reason=status_text,
  encoding=encoding,
  cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
  headers=res.all_headers(),
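Both browser engines (this file and pw.py below) now share the same pattern: keep Playwright's `status_text` when it is non-empty, otherwise fall back to the new `StatusText` lookup table. A standalone sketch of that fallback, assuming the 0.2.5 toolbelt export; the `resolve_reason` helper is hypothetical, added only for illustration:

from scrapling.engines.toolbelt import StatusText

def resolve_reason(status: int, status_text: str) -> str:
    # Playwright sometimes reports an empty status text, so fill it from the lookup table
    return status_text or StatusText.get(status)

print(resolve_reason(200, 'OK'))  # -> 'OK' (kept as-is)
print(resolve_reason(404, ''))    # -> 'Not Found' (filled from StatusText)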
scrapling/engines/pw.py CHANGED
@@ -6,6 +6,7 @@ from scrapling.engines.constants import DEFAULT_STEALTH_FLAGS, NSTBROWSER_DEFAUL
  from scrapling.engines.toolbelt import (
  Response,
  do_nothing,
+ StatusText,
  js_bypass_path,
  intercept_route,
  generate_headers,
@@ -213,20 +214,22 @@ class PlaywrightEngine:
 
  if self.wait_selector and type(self.wait_selector) is str:
  waiter = page.locator(self.wait_selector)
- waiter.wait_for(state=self.wait_selector_state)
+ waiter.first.wait_for(state=self.wait_selector_state)
 
- content_type = res.headers.get('content-type', '')
- # Parse charset from content-type
- encoding = 'utf-8' # default encoding
- if 'charset=' in content_type.lower():
- encoding = content_type.lower().split('charset=')[-1].split(';')[0].strip()
+ # This will be parsed inside `Response`
+ encoding = res.headers.get('content-type', '') or 'utf-8' # default encoding
+
+ status_text = res.status_text
+ # PlayWright API sometimes give empty status text for some reason!
+ if not status_text:
+ status_text = StatusText.get(res.status)
 
  response = Response(
  url=res.url,
  text=page.content(),
- body=res.body(),
+ body=page.content().encode('utf-8'),
  status=res.status,
- reason=res.status_text,
+ reason=status_text,
  encoding=encoding,
  cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
  headers=res.all_headers(),
scrapling/engines/static.py CHANGED
@@ -23,7 +23,7 @@ class StaticEngine:
  @staticmethod
  def _headers_job(headers: Optional[Dict], url: str, stealth: bool) -> Dict:
  """Adds useragent to headers if it doesn't exist, generates real headers and append it to current headers, and
- finally generates a referer header that looks like if this request came from Google's search of the current URL's domain.
+ finally generates a referer header that looks like if this request came from Google's search of the current URL's domain.
 
  :param headers: Current headers in the request if the user passed any
  :param url: The Target URL.
@@ -65,6 +65,7 @@ class StaticEngine:
 
  def get(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
  """Make basic HTTP GET request for you but with some added flavors.
+
  :param url: Target url.
  :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
  create a referer header as if this request had came from Google's search of this URL's domain.
@@ -77,6 +78,7 @@
 
  def post(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
  """Make basic HTTP POST request for you but with some added flavors.
+
  :param url: Target url.
  :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
  create a referer header as if this request had came from Google's search of this URL's domain.
@@ -89,6 +91,7 @@
 
  def delete(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
  """Make basic HTTP DELETE request for you but with some added flavors.
+
  :param url: Target url.
  :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
  create a referer header as if this request had came from Google's search of this URL's domain.
@@ -101,6 +104,7 @@
 
  def put(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
  """Make basic HTTP PUT request for you but with some added flavors.
+
  :param url: Target url.
  :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
  create a referer header as if this request had came from Google's search of this URL's domain.
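The `_headers_job` docstring above describes generating a referer that looks like the request came from a Google search for the target site's domain. A rough sketch of that idea, not the library's actual implementation, using `tldextract`, which is already a declared dependency:

import tldextract

def google_search_referer(url: str) -> str:
    # e.g. 'https://example.com/products/1' -> 'https://www.google.com/search?q=example'
    domain = tldextract.extract(url).domain
    return f'https://www.google.com/search?q={domain}'

print(google_search_referer('https://example.com/products/1'))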
scrapling/engines/toolbelt/__init__.py CHANGED
@@ -6,6 +6,7 @@ from .fingerprints import (
  from .custom import (
  Response,
  do_nothing,
+ StatusText,
  BaseFetcher,
  get_variable_name,
  check_type_validity,
scrapling/engines/toolbelt/custom.py CHANGED
@@ -3,10 +3,78 @@ Functions related to custom types or type checking
  """
  import inspect
  import logging
+ from email.message import Message
 
- from scrapling.core.utils import setup_basic_logging
+ from scrapling.core.custom_types import MappingProxyType
  from scrapling.parser import Adaptor, SQLiteStorageSystem
- from scrapling.core._types import Any, List, Type, Union, Optional, Dict, Callable
+ from scrapling.core.utils import setup_basic_logging, cache
+ from scrapling.core._types import Any, List, Type, Union, Optional, Dict, Callable, Tuple
+
+
+ class ResponseEncoding:
+ __DEFAULT_ENCODING = "utf-8"
+ __ISO_8859_1_CONTENT_TYPES = {"text/plain", "text/html", "text/css", "text/javascript"}
+
+ @classmethod
+ @cache(maxsize=None)
+ def __parse_content_type(cls, header_value: str) -> Tuple[str, Dict[str, str]]:
+ """Parse content type and parameters from a content-type header value.
+
+ Uses `email.message.Message` for robust header parsing according to RFC 2045.
+
+ :param header_value: Raw content-type header string
+ :return: Tuple of (content_type, parameters_dict)
+ """
+ # Create a Message object and set the Content-Type header then get the content type and parameters
+ msg = Message()
+ msg['content-type'] = header_value
+
+ content_type = msg.get_content_type()
+ params = dict(msg.get_params(failobj=[]))
+
+ # Remove the content-type from params if present somehow
+ params.pop('content-type', None)
+
+ return content_type, params
+
+ @classmethod
+ @cache(maxsize=None)
+ def get_value(cls, content_type: Optional[str]) -> str:
+ """Determine the appropriate character encoding from a content-type header.
+
+ The encoding is determined by these rules in order:
+ 1. If no content-type is provided, use UTF-8
+ 2. If charset parameter is present, use that encoding
+ 3. If content-type is `text/*`, use ISO-8859-1 per HTTP/1.1 spec
+ 4. If content-type is application/json, use UTF-8 per RFC 4627
+ 5. Default to UTF-8 if nothing else matches
+
+ :param content_type: Content-Type header value or None
+ :return: String naming the character encoding
+ """
+ if not content_type:
+ return cls.__DEFAULT_ENCODING
+
+ try:
+ content_type, params = cls.__parse_content_type(content_type)
+
+ # First check for explicit charset parameter
+ if "charset" in params:
+ encoding = params["charset"].strip("'\"")
+ "test".encode(encoding) # Validate encoding
+ return encoding
+
+ # Apply content-type specific rules
+ if content_type in cls.__ISO_8859_1_CONTENT_TYPES:
+ return "ISO-8859-1"
+
+ if content_type == "application/json":
+ return cls.__DEFAULT_ENCODING
+
+ return cls.__DEFAULT_ENCODING
+
+ except (ValueError, LookupError, UnicodeEncodeError):
+ return cls.__DEFAULT_ENCODING
 
 
  class Response(Adaptor):
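For a feel of what the `ResponseEncoding` rules above do in practice, here is a small illustration of the `email.message.Message` parsing it relies on, plus the resulting encodings; the expected values follow the new tests/fetchers/test_utils.py further down:

from email.message import Message

msg = Message()
msg['content-type'] = 'text/html; charset="ISO-8859-1"'
print(msg.get_content_type())            # -> 'text/html'
print(dict(msg.get_params(failobj=[])))  # -> {'text/html': '', 'charset': 'ISO-8859-1'}

from scrapling.engines.toolbelt.custom import ResponseEncoding
print(ResponseEncoding.get_value('text/html; charset="ISO-8859-1"'))  # -> 'ISO-8859-1'
print(ResponseEncoding.get_value('text/html'))                        # -> 'ISO-8859-1' (text/* rule)
print(ResponseEncoding.get_value(None))                               # -> 'utf-8' (default)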
@@ -19,6 +87,7 @@ class Response(Adaptor):
  self.cookies = cookies
  self.headers = headers
  self.request_headers = request_headers
+ encoding = ResponseEncoding.get_value(encoding)
  super().__init__(text=text, body=body, url=automatch_domain or url, encoding=encoding, **adaptor_arguments)
  # For back-ward compatibility
  self.adaptor = self
@@ -67,6 +136,83 @@ class BaseFetcher:
  self.adaptor_arguments.update({'automatch_domain': automatch_domain})
 
 
+ class StatusText:
+ """A class that gets the status text of response status code.
+
+ Reference: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status
+ """
+ _phrases = MappingProxyType({
+ 100: "Continue",
+ 101: "Switching Protocols",
+ 102: "Processing",
+ 103: "Early Hints",
+ 200: "OK",
+ 201: "Created",
+ 202: "Accepted",
+ 203: "Non-Authoritative Information",
+ 204: "No Content",
+ 205: "Reset Content",
+ 206: "Partial Content",
+ 207: "Multi-Status",
+ 208: "Already Reported",
+ 226: "IM Used",
+ 300: "Multiple Choices",
+ 301: "Moved Permanently",
+ 302: "Found",
+ 303: "See Other",
+ 304: "Not Modified",
+ 305: "Use Proxy",
+ 307: "Temporary Redirect",
+ 308: "Permanent Redirect",
+ 400: "Bad Request",
+ 401: "Unauthorized",
+ 402: "Payment Required",
+ 403: "Forbidden",
+ 404: "Not Found",
+ 405: "Method Not Allowed",
+ 406: "Not Acceptable",
+ 407: "Proxy Authentication Required",
+ 408: "Request Timeout",
+ 409: "Conflict",
+ 410: "Gone",
+ 411: "Length Required",
+ 412: "Precondition Failed",
+ 413: "Payload Too Large",
+ 414: "URI Too Long",
+ 415: "Unsupported Media Type",
+ 416: "Range Not Satisfiable",
+ 417: "Expectation Failed",
+ 418: "I'm a teapot",
+ 421: "Misdirected Request",
+ 422: "Unprocessable Entity",
+ 423: "Locked",
+ 424: "Failed Dependency",
+ 425: "Too Early",
+ 426: "Upgrade Required",
+ 428: "Precondition Required",
+ 429: "Too Many Requests",
+ 431: "Request Header Fields Too Large",
+ 451: "Unavailable For Legal Reasons",
+ 500: "Internal Server Error",
+ 501: "Not Implemented",
+ 502: "Bad Gateway",
+ 503: "Service Unavailable",
+ 504: "Gateway Timeout",
+ 505: "HTTP Version Not Supported",
+ 506: "Variant Also Negotiates",
+ 507: "Insufficient Storage",
+ 508: "Loop Detected",
+ 510: "Not Extended",
+ 511: "Network Authentication Required"
+ })
+
+ @classmethod
+ @cache(maxsize=128)
+ def get(cls, status_code: int) -> str:
+ """Get the phrase for a given HTTP status code."""
+ return cls._phrases.get(status_code, "Unknown Status Code")
+
+
  def check_if_engine_usable(engine: Callable) -> Union[Callable, None]:
  """This function check if the passed engine can be used by a Fetcher-type class or not.
 
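A minimal sketch of how the new `StatusText` helper behaves; the import path and the fallback value follow the new tests/fetchers/test_utils.py further down, and results are memoized through the `cache` decorator imported from `scrapling.core.utils`:

from scrapling.engines.toolbelt.custom import StatusText

print(StatusText.get(200))   # -> 'OK'
print(StatusText.get(429))   # -> 'Too Many Requests'
print(StatusText.get(1000))  # -> 'Unknown Status Code' (fallback for unmapped codes)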
scrapling/fetchers.py CHANGED
@@ -11,6 +11,7 @@ class Fetcher(BaseFetcher):
  """
  def get(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
  """Make basic HTTP GET request for you but with some added flavors.
+
  :param url: Target url.
  :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
  :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
@@ -24,6 +25,7 @@ class Fetcher(BaseFetcher):
 
  def post(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
  """Make basic HTTP POST request for you but with some added flavors.
+
  :param url: Target url.
  :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
  :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
@@ -37,12 +39,14 @@ class Fetcher(BaseFetcher):
 
  def put(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
  """Make basic HTTP PUT request for you but with some added flavors.
+
  :param url: Target url
  :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
  :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
  :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
- create a referer header as if this request came from Google's search of this URL's domain.
+ create a referer header as if this request came from Google's search of this URL's domain.
  :param kwargs: Any additional keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
+
  :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
  """
  response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).put(url, stealthy_headers, **kwargs)
@@ -50,6 +54,7 @@ class Fetcher(BaseFetcher):
 
  def delete(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
  """Make basic HTTP DELETE request for you but with some added flavors.
+
  :param url: Target url
  :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
  :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
@@ -77,6 +82,7 @@ class StealthyFetcher(BaseFetcher):
  ) -> Response:
  """
  Opens up a browser and do your request based on your chosen options below.
+
  :param url: Target url.
  :param headless: Run the browser in headless/hidden (default), 'virtual' screen mode, or headful/visible mode.
  :param block_images: Prevent the loading of images through Firefox preferences.
@@ -127,14 +133,15 @@ class PlayWrightFetcher(BaseFetcher):
  Using this Fetcher class, you can do requests with:
  - Vanilla Playwright without any modifications other than the ones you chose.
  - Stealthy Playwright with the stealth mode I wrote for it. It's still a work in progress but it bypasses many online tests like bot.sannysoft.com
- Some of the things stealth mode does include:
- 1) Patches the CDP runtime fingerprint.
- 2) Mimics some of the real browsers' properties by injecting several JS files and using custom options.
- 3) Using custom flags on launch to hide Playwright even more and make it faster.
- 4) Generates real browser's headers of the same type and same user OS then append it to the request.
+ Some of the things stealth mode does include:
+ 1) Patches the CDP runtime fingerprint.
+ 2) Mimics some of the real browsers' properties by injecting several JS files and using custom options.
+ 3) Using custom flags on launch to hide Playwright even more and make it faster.
+ 4) Generates real browser's headers of the same type and same user OS then append it to the request.
  - Real browsers by passing the CDP URL of your browser to be controlled by the Fetcher and most of the options can be enabled on it.
  - NSTBrowser's docker browserless option by passing the CDP URL and enabling `nstbrowser_mode` option.
- > Note that these are the main options with PlayWright but it can be mixed together.
+
+ > Note that these are the main options with PlayWright but it can be mixed together.
  """
  def fetch(
  self, url: str, headless: Union[bool, str] = True, disable_resources: bool = None,
@@ -147,6 +154,7 @@ class PlayWrightFetcher(BaseFetcher):
  nstbrowser_mode: bool = False, nstbrowser_config: Optional[Dict] = None,
  ) -> Response:
  """Opens up a browser and do your request based on your chosen options below.
+
  :param url: Target url.
  :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
  :param disable_resources: Drop requests of unnecessary resources for speed boost. It depends but it made requests ~25% faster in my tests for some websites.
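To ground the docstring tweaks above, a hedged end-to-end usage sketch; the target URL is a placeholder and the attribute names come from the `:return:` description of `Response` (this assumes the instance-based `Fetcher` API of the 0.2.x series):

from scrapling import Fetcher

page = Fetcher().get('https://quotes.toscrape.com/', stealthy_headers=True)

# `Response` is an `Adaptor` plus request metadata
print(page.status, page.reason)     # e.g. 200 OK
print(page.cookies)                 # cookies returned with the response
print(page.css_first('h1::text'))   # parse the page with the usual Adaptor API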
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: scrapling
- Version: 0.2.3
+ Version: 0.2.5
  Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
  Home-page: https://github.com/D4Vinci/Scrapling
  Author: Karim Shoair
@@ -32,16 +32,16 @@ Classifier: Typing :: Typed
  Requires-Python: >=3.8
  Description-Content-Type: text/markdown
  License-File: LICENSE
- Requires-Dist: requests >=2.3
- Requires-Dist: lxml >=4.5
- Requires-Dist: cssselect >=1.2
+ Requires-Dist: requests>=2.3
+ Requires-Dist: lxml>=4.5
+ Requires-Dist: cssselect>=1.2
  Requires-Dist: w3lib
- Requires-Dist: orjson >=3
+ Requires-Dist: orjson>=3
  Requires-Dist: tldextract
  Requires-Dist: httpx[brotli,zstd]
  Requires-Dist: playwright
  Requires-Dist: rebrowser-playwright
- Requires-Dist: camoufox >=0.3.10
+ Requires-Dist: camoufox>=0.3.10
  Requires-Dist: browserforge
 
  # 🕷️ Scrapling: Undetectable, Lightning-Fast, and Adaptive Web Scraping for Python
@@ -1,22 +1,22 @@
- scrapling/__init__.py,sha256=tNB1LdlhamZYjlxqbEwo1BpsVE48Bt8acfjl8DNKpEE,435
+ scrapling/__init__.py,sha256=eSTwhUqPltd41XPPwSXElWmeRalaQEncnjBahW3KUcQ,435
  scrapling/defaults.py,sha256=blYDLiuI5DgDSLRWnUgpp21WtFOsv1BsCRCmPeg8Xc4,287
- scrapling/fetchers.py,sha256=_6mL7XSTZE1fHXBqbxE2bBHnlQP1lH-4MCiQHQd5hQs,16017
+ scrapling/fetchers.py,sha256=T3aRXvvpXDbql_2BqJ90KLtgfF4HLUOImdoxWLCcY-A,16045
  scrapling/parser.py,sha256=d2n00uF5i7W5lf0afLNRdk17ZFcNyiF9EzXLRQGA0NM,54111
  scrapling/py.typed,sha256=frcCV1k9oG9oKj3dpUqdJg1PxRT2RSN_XKdLCPjaYaY,2
  scrapling/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  scrapling/core/_types.py,sha256=nD2ZY_fitLohx3MfDmqoKJ9ZShrnRhQ8-d1SU1zEGAY,552
- scrapling/core/custom_types.py,sha256=-gMNOiByewoAUqFVrDp822V51rcWNlWVUOB6yGUL648,8403
+ scrapling/core/custom_types.py,sha256=ztE_tshJ8i5uKqqSbsN5S6MoIUSfX6SexlhRjAnkclk,8402
  scrapling/core/mixins.py,sha256=sozbpaGL1_O_x3U-ABM5aYWpnxpCLfdbcA9SG3P7weY,3532
  scrapling/core/storage_adaptors.py,sha256=Kbak0BOJX5e9I1PbUS_4sUJi2Wxw8Bv5XsaLHAu1l2Q,6218
- scrapling/core/translator.py,sha256=oU-dQCkNQOccZPrXbPW_VSgC5ll10Bb89C3ezW2lI0o,5228
+ scrapling/core/translator.py,sha256=R97lKGq1SDbx8S8Hg_w_5d4ePgukTHj_hRIKFzWiRuc,5229
  scrapling/core/utils.py,sha256=fXdANUgRBbVbOerJ94fRY9vi7n5zsbm8t3G4qQ-F3ak,3792
  scrapling/engines/__init__.py,sha256=zwMqcSdNGh-IX0d4zXazrgAeHrkqIN_v5Ia7RU1g8W0,267
- scrapling/engines/camo.py,sha256=41vp2Nh51kKuOSZ1PijsIpROpQZgFfUPybVbEX8pEXk,7530
+ scrapling/engines/camo.py,sha256=dXkdfFmf3M09RXAvaZ8CE5khsblC3Wd7_6jWfu8XO6I,7618
  scrapling/engines/constants.py,sha256=jSDA6lgbvEIB8z2m2SFzCKkvFEZnp28Mondy2__FpkM,3721
- scrapling/engines/pw.py,sha256=l5MrSW_WNBKAxAlyxbt09ka_lEGo61XKuaOgWpYmvHk,12102
- scrapling/engines/static.py,sha256=Wsp6_-soZUQJT6kHoKPkLOdHU9J50chLdYxDmQjO4FQ,7101
- scrapling/engines/toolbelt/__init__.py,sha256=BnBp34aDeohYgqdysEAAWnGZgA02YlExkc5FJLetMSo,367
- scrapling/engines/toolbelt/custom.py,sha256=8lvGHWIZoOotSTF97KgPb3CbJquel2QFx8rP8Hf2sQ4,7469
+ scrapling/engines/pw.py,sha256=Llr2HjEr5kgyffEJH5N4XdWrSWvcil9WAwE1RJnOhFo,12190
+ scrapling/engines/static.py,sha256=wzBsoOHPpN5JV1izQSSSarPBNWB-wo0BDWNFuin6ID8,7109
+ scrapling/engines/toolbelt/__init__.py,sha256=BbxfC0depVOV3i3BnBnyfjHtLcZrDbhz6c5rTRczZUc,383
+ scrapling/engines/toolbelt/custom.py,sha256=ELr3_FwUqNI27E98kz-50OA5a6hQQtoIYrZoLKsvUpM,12551
  scrapling/engines/toolbelt/fingerprints.py,sha256=kkVtZKSt2ukc0CV0g6QUvSWR0Yx5p8Mv8xiqACAsMBo,2917
  scrapling/engines/toolbelt/navigation.py,sha256=Tde5_6Wv7lOeWXMzs9D6TRaxAbJ3b-zIX6-4HggZbCQ,4017
  scrapling/engines/toolbelt/bypasses/navigator_plugins.js,sha256=tbnnk3nCXB6QEQnOhDlu3n-s7lnUTAkrUsjP6FDQIQg,2104
@@ -28,14 +28,15 @@ scrapling/engines/toolbelt/bypasses/webdriver_fully.js,sha256=hdJw4clRAJQqIdq5gI
  scrapling/engines/toolbelt/bypasses/window_chrome.js,sha256=D7hqzNGGDorh8JVlvm2YIv7Bk2CoVkG55MDIdyqhT1w,6808
  tests/__init__.py,sha256=YHFB5ftzgLQVh6gbPfbYcY4yOS9DOBp5dBa6I-qtm8U,32
  tests/fetchers/__init__.py,sha256=6H4NgARhyTcGGd3dNCKQJ8kUFdrAEMSScQL7Ga_vU3c,43
- tests/fetchers/test_camoufox.py,sha256=XPTCDZ9sj_GpCzXyvzKF_uZWhEYX6J_jh_BLeMEl8yY,2874
+ tests/fetchers/test_camoufox.py,sha256=53piGA5uuPvOx5BeUEA0bbizYihwHGxehnj5uqCr6Q0,3115
  tests/fetchers/test_httpx.py,sha256=UivOItR3-l-bXp9E6TP5Tvn2OrCdgiVkWsti-f9xdpU,3507
- tests/fetchers/test_playwright.py,sha256=YOWn89urd9NwoCHfTFj8fY4xYrRY2BeszTt5Q-TxUcs,3479
+ tests/fetchers/test_playwright.py,sha256=7qwbIU2SwjiQEbaGPA_MBo6kAXM4IBmfvy5kUvKT11M,3701
+ tests/fetchers/test_utils.py,sha256=FPPJkBrqgYxdGeWwapH8Vj8zyfYVLiTE1qSLu8eBWik,5728
  tests/parser/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  tests/parser/test_automatch.py,sha256=BeeYJi3cYCghbiZmi57z4bqcGPaoUA8GAm7MALBBkkk,2486
- tests/parser/test_general.py,sha256=NfTuGLgAm-LH0dVV0pvbRcYSNI-wSu05rdnuRzmB0m4,11664
- scrapling-0.2.3.dist-info/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
- scrapling-0.2.3.dist-info/METADATA,sha256=YGUnQmOdwDVs7rHLUCG9hpLg567s_5I0Cu8TIKKDc_Y,64785
- scrapling-0.2.3.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
- scrapling-0.2.3.dist-info/top_level.txt,sha256=ub7FkOEXeYmmYTUxd4pCrwXfBfAMIpZ1sCGmXCc14tI,16
- scrapling-0.2.3.dist-info/RECORD,,
+ tests/parser/test_general.py,sha256=qaiVzpvqESfdXYFat6QrpnMkevPYgCzIcTZK5FwdC0s,11783
+ scrapling-0.2.5.dist-info/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
+ scrapling-0.2.5.dist-info/METADATA,sha256=oY8RtycwNJXbgA4ePsVkpj6SKP6Y6vMmOgF9L3lc0mg,64780
+ scrapling-0.2.5.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+ scrapling-0.2.5.dist-info/top_level.txt,sha256=ub7FkOEXeYmmYTUxd4pCrwXfBfAMIpZ1sCGmXCc14tI,16
+ scrapling-0.2.5.dist-info/RECORD,,
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (75.5.0)
+ Generator: setuptools (75.6.0)
  Root-Is-Purelib: true
  Tag: py3-none-any
 
tests/fetchers/test_camoufox.py CHANGED
@@ -36,6 +36,7 @@ class TestStealthyFetcher(unittest.TestCase):
  def test_waiting_selector(self):
  """Test if waiting for a selector make page does not finish loading or not"""
  self.assertEqual(self.fetcher.fetch(self.html_url, wait_selector='h1').status, 200)
+ self.assertEqual(self.fetcher.fetch(self.html_url, wait_selector='h1', wait_selector_state='visible').status, 200)
 
  def test_cookies_loading(self):
  """Test if cookies are set after the request"""
@@ -56,6 +57,7 @@ class TestStealthyFetcher(unittest.TestCase):
  self.assertEqual(self.fetcher.fetch(self.html_url, block_webrtc=True, allow_webgl=True).status, 200)
  self.assertEqual(self.fetcher.fetch(self.html_url, block_webrtc=False, allow_webgl=True).status, 200)
  self.assertEqual(self.fetcher.fetch(self.html_url, block_webrtc=True, allow_webgl=False).status, 200)
+ self.assertEqual(self.fetcher.fetch(self.html_url, extra_headers={'ayo': ''}, os_randomize=True).status, 200)
 
  def test_infinite_timeout(self):
  """Test if infinite timeout breaks the code or not"""
tests/fetchers/test_playwright.py CHANGED
@@ -35,6 +35,7 @@ class TestPlayWrightFetcher(unittest.TestCase):
  def test_waiting_selector(self):
  """Test if waiting for a selector make page does not finish loading or not"""
  self.assertEqual(self.fetcher.fetch(self.html_url, wait_selector='h1').status, 200)
+ self.assertEqual(self.fetcher.fetch(self.html_url, wait_selector='h1', wait_selector_state='visible').status, 200)
 
  def test_cookies_loading(self):
  """Test if cookies are set after the request"""
@@ -56,6 +57,7 @@ class TestPlayWrightFetcher(unittest.TestCase):
  self.assertEqual(self.fetcher.fetch(self.html_url, disable_webgl=False, hide_canvas=True).status, 200)
  self.assertEqual(self.fetcher.fetch(self.html_url, stealth=True).status, 200)
  self.assertEqual(self.fetcher.fetch(self.html_url, useragent='Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:131.0) Gecko/20100101 Firefox/131.0').status, 200)
+ self.assertEqual(self.fetcher.fetch(self.html_url, extra_headers={'ayo': ''}).status, 200)
 
  def test_cdp_url(self):
  """Test if it's going to try to connect to cdp url or not"""
tests/fetchers/test_utils.py ADDED
@@ -0,0 +1,129 @@
+ import unittest
+
+ from scrapling.engines.toolbelt.custom import ResponseEncoding, StatusText
+
+
+ class TestPlayWrightFetcher(unittest.TestCase):
+ def setUp(self):
+ self.content_type_map = {
+ # A map generated by ChatGPT for most possible `content_type` values and the expected outcome
+ 'text/html; charset=UTF-8': 'UTF-8',
+ 'text/html; charset=ISO-8859-1': 'ISO-8859-1',
+ 'text/html': 'ISO-8859-1',
+ 'application/json; charset=UTF-8': 'UTF-8',
+ 'application/json': 'utf-8',
+ 'text/json': 'utf-8',
+ 'application/javascript; charset=UTF-8': 'UTF-8',
+ 'application/javascript': 'utf-8',
+ 'text/plain; charset=UTF-8': 'UTF-8',
+ 'text/plain; charset=ISO-8859-1': 'ISO-8859-1',
+ 'text/plain': 'ISO-8859-1',
+ 'application/xhtml+xml; charset=UTF-8': 'UTF-8',
+ 'application/xhtml+xml': 'utf-8',
+ 'text/html; charset=windows-1252': 'windows-1252',
+ 'application/json; charset=windows-1252': 'windows-1252',
+ 'text/plain; charset=windows-1252': 'windows-1252',
+ 'text/html; charset="UTF-8"': 'UTF-8',
+ 'text/html; charset="ISO-8859-1"': 'ISO-8859-1',
+ 'text/html; charset="windows-1252"': 'windows-1252',
+ 'application/json; charset="UTF-8"': 'UTF-8',
+ 'application/json; charset="ISO-8859-1"': 'ISO-8859-1',
+ 'application/json; charset="windows-1252"': 'windows-1252',
+ 'text/json; charset="UTF-8"': 'UTF-8',
+ 'application/javascript; charset="UTF-8"': 'UTF-8',
+ 'application/javascript; charset="ISO-8859-1"': 'ISO-8859-1',
+ 'text/plain; charset="UTF-8"': 'UTF-8',
+ 'text/plain; charset="ISO-8859-1"': 'ISO-8859-1',
+ 'text/plain; charset="windows-1252"': 'windows-1252',
+ 'application/xhtml+xml; charset="UTF-8"': 'UTF-8',
+ 'application/xhtml+xml; charset="ISO-8859-1"': 'ISO-8859-1',
+ 'application/xhtml+xml; charset="windows-1252"': 'windows-1252',
+ 'text/html; charset="US-ASCII"': 'US-ASCII',
+ 'application/json; charset="US-ASCII"': 'US-ASCII',
+ 'text/plain; charset="US-ASCII"': 'US-ASCII',
+ 'text/html; charset="Shift_JIS"': 'Shift_JIS',
+ 'application/json; charset="Shift_JIS"': 'Shift_JIS',
+ 'text/plain; charset="Shift_JIS"': 'Shift_JIS',
+ 'application/xml; charset="UTF-8"': 'UTF-8',
+ 'application/xml; charset="ISO-8859-1"': 'ISO-8859-1',
+ 'application/xml': 'utf-8',
+ 'text/xml; charset="UTF-8"': 'UTF-8',
+ 'text/xml; charset="ISO-8859-1"': 'ISO-8859-1',
+ 'text/xml': 'utf-8'
+ }
+ self.status_map = {
+ 100: "Continue",
+ 101: "Switching Protocols",
+ 102: "Processing",
+ 103: "Early Hints",
+ 200: "OK",
+ 201: "Created",
+ 202: "Accepted",
+ 203: "Non-Authoritative Information",
+ 204: "No Content",
+ 205: "Reset Content",
+ 206: "Partial Content",
+ 207: "Multi-Status",
+ 208: "Already Reported",
+ 226: "IM Used",
+ 300: "Multiple Choices",
+ 301: "Moved Permanently",
+ 302: "Found",
+ 303: "See Other",
+ 304: "Not Modified",
+ 305: "Use Proxy",
+ 307: "Temporary Redirect",
+ 308: "Permanent Redirect",
+ 400: "Bad Request",
+ 401: "Unauthorized",
+ 402: "Payment Required",
+ 403: "Forbidden",
+ 404: "Not Found",
+ 405: "Method Not Allowed",
+ 406: "Not Acceptable",
+ 407: "Proxy Authentication Required",
+ 408: "Request Timeout",
+ 409: "Conflict",
+ 410: "Gone",
+ 411: "Length Required",
+ 412: "Precondition Failed",
+ 413: "Payload Too Large",
+ 414: "URI Too Long",
+ 415: "Unsupported Media Type",
+ 416: "Range Not Satisfiable",
+ 417: "Expectation Failed",
+ 418: "I'm a teapot",
+ 421: "Misdirected Request",
+ 422: "Unprocessable Entity",
+ 423: "Locked",
+ 424: "Failed Dependency",
+ 425: "Too Early",
+ 426: "Upgrade Required",
+ 428: "Precondition Required",
+ 429: "Too Many Requests",
+ 431: "Request Header Fields Too Large",
+ 451: "Unavailable For Legal Reasons",
+ 500: "Internal Server Error",
+ 501: "Not Implemented",
+ 502: "Bad Gateway",
+ 503: "Service Unavailable",
+ 504: "Gateway Timeout",
+ 505: "HTTP Version Not Supported",
+ 506: "Variant Also Negotiates",
+ 507: "Insufficient Storage",
+ 508: "Loop Detected",
+ 510: "Not Extended",
+ 511: "Network Authentication Required"
+ }
+
+ def test_parsing_content_type(self):
+ """Test if parsing different types of content-type returns the expected result"""
+ for header_value, expected_encoding in self.content_type_map.items():
+ self.assertEqual(ResponseEncoding.get_value(header_value), expected_encoding)
+
+ def test_parsing_response_status(self):
+ """Test if using different http responses' status codes returns the expected result"""
+ for status_code, expected_status_text in self.status_map.items():
+ self.assertEqual(StatusText.get(status_code), expected_status_text)
+
+ self.assertEqual(StatusText.get(1000), "Unknown Status Code")
tests/parser/test_general.py CHANGED
@@ -278,7 +278,7 @@ class TestParser(unittest.TestCase):
  self.assertEqual(len(elements), 5000)
  # Converting 5000 elements to a class and doing operations on them will take time
  # Based on my tests with 100 runs, 1 loop each Scrapling (given the extra work/features) takes 10.4ms on average
- self.assertLess(end_time - start_time, 0.1)
+ self.assertLess(end_time - start_time, 0.5) # Locally I test on 0.1 but on GitHub actions with browsers and threading sometimes closing adds fractions of seconds
 
 
  # Use `coverage run -m unittest --verbose tests/test_parser_functions.py` instead for the coverage report