scrapling 0.3.1__py3-none-any.whl → 0.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. scrapling/__init__.py +1 -1
  2. scrapling/cli.py +38 -51
  3. scrapling/core/_html_utils.py +3 -9
  4. scrapling/core/ai.py +5 -13
  5. scrapling/core/custom_types.py +19 -61
  6. scrapling/core/mixins.py +6 -28
  7. scrapling/core/shell.py +49 -127
  8. scrapling/core/storage.py +2 -8
  9. scrapling/core/translator.py +8 -20
  10. scrapling/core/utils/__init__.py +10 -0
  11. scrapling/core/utils/_shell.py +48 -0
  12. scrapling/core/{utils.py → utils/_utils.py} +5 -21
  13. scrapling/engines/__init__.py +0 -16
  14. scrapling/engines/_browsers/_base.py +297 -0
  15. scrapling/engines/_browsers/_camoufox.py +219 -296
  16. scrapling/engines/_browsers/_config_tools.py +2 -1
  17. scrapling/engines/_browsers/_controllers.py +201 -281
  18. scrapling/engines/_browsers/_page.py +37 -15
  19. scrapling/engines/_browsers/_validators.py +9 -15
  20. scrapling/engines/constants.py +3 -6
  21. scrapling/engines/static.py +25 -75
  22. scrapling/engines/toolbelt/__init__.py +1 -20
  23. scrapling/engines/toolbelt/convertor.py +95 -86
  24. scrapling/engines/toolbelt/custom.py +7 -99
  25. scrapling/engines/toolbelt/fingerprints.py +1 -3
  26. scrapling/engines/toolbelt/navigation.py +4 -58
  27. scrapling/fetchers.py +29 -24
  28. scrapling/parser.py +45 -122
  29. {scrapling-0.3.1.dist-info → scrapling-0.3.3.dist-info}/METADATA +54 -46
  30. scrapling-0.3.3.dist-info/RECORD +44 -0
  31. scrapling-0.3.1.dist-info/RECORD +0 -41
  32. {scrapling-0.3.1.dist-info → scrapling-0.3.3.dist-info}/WHEEL +0 -0
  33. {scrapling-0.3.1.dist-info → scrapling-0.3.3.dist-info}/entry_points.txt +0 -0
  34. {scrapling-0.3.1.dist-info → scrapling-0.3.3.dist-info}/licenses/LICENSE +0 -0
  35. {scrapling-0.3.1.dist-info → scrapling-0.3.3.dist-info}/top_level.txt +0 -0
scrapling/engines/toolbelt/convertor.py
@@ -1,10 +1,15 @@
+ from functools import lru_cache
+ from re import compile as re_compile
+
  from curl_cffi.requests import Response as CurlResponse
  from playwright.sync_api import Page as SyncPage, Response as SyncResponse
  from playwright.async_api import Page as AsyncPage, Response as AsyncResponse

  from scrapling.core.utils import log
- from scrapling.core._types import Dict, Optional
  from .custom import Response, StatusText
+ from scrapling.core._types import Dict, Optional
+
+ __CHARSET_RE__ = re_compile(r"charset=([\w-]+)")


  class ResponseFactory:
@@ -18,9 +23,19 @@ class ResponseFactory:
  """

  @classmethod
- def _process_response_history(
- cls, first_response: SyncResponse, parser_arguments: Dict
- ) -> list[Response]:
+ @lru_cache(maxsize=16)
+ def __extract_browser_encoding(cls, content_type: str | None) -> Optional[str]:
+ """Extract browser encoding from headers.
+ Ex: from header "content-type: text/html; charset=utf-8" -> "utf-8
+ """
+ if content_type:
+ # Because Playwright can't do that by themselves like all libraries for some reason :3
+ match = __CHARSET_RE__.search(content_type)
+ return match.group(1) if match else None
+ return None
+
+ @classmethod
+ def _process_response_history(cls, first_response: SyncResponse, parser_arguments: Dict) -> list[Response]:
  """Process response history to build a list of `Response` objects"""
  history = []
  current_request = first_response.request.redirected_from
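Note: the new `__extract_browser_encoding` helper (cached with `lru_cache`) replaces the heavier `ResponseEncoding` class removed from custom.py below. A minimal standalone sketch of the same regex technique, for illustration only (names outside the class are assumptions):

```python
from functools import lru_cache
from re import compile as re_compile

CHARSET_RE = re_compile(r"charset=([\w-]+)")  # same pattern as __CHARSET_RE__ above


@lru_cache(maxsize=16)
def extract_charset(content_type: str | None) -> str | None:
    """Pull the charset parameter out of a Content-Type header, if present."""
    if content_type:
        match = CHARSET_RE.search(content_type)
        return match.group(1) if match else None
    return None


print(extract_charset("text/html; charset=utf-8"))     # -> utf-8
print(extract_charset("application/json") or "utf-8")  # no charset -> falls back to utf-8
```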
@@ -32,24 +47,23 @@ class ResponseFactory:
  history.insert(
  0,
  Response(
- url=current_request.url,
- # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
- content="",
- status=current_response.status if current_response else 301,
- reason=(
- current_response.status_text
- or StatusText.get(current_response.status)
- )
- if current_response
- else StatusText.get(301),
- encoding=current_response.headers.get("content-type", "")
- or "utf-8",
- cookies=tuple(),
- headers=current_response.all_headers()
- if current_response
- else {},
- request_headers=current_request.all_headers(),
- **parser_arguments,
+ **{
+ "url": current_request.url,
+ # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
+ "content": "",
+ "status": current_response.status if current_response else 301,
+ "reason": (current_response.status_text or StatusText.get(current_response.status))
+ if current_response
+ else StatusText.get(301),
+ "encoding": cls.__extract_browser_encoding(
+ current_response.headers.get("content-type", "")
+ )
+ or "utf-8",
+ "cookies": tuple(),
+ "headers": current_response.all_headers() if current_response else {},
+ "request_headers": current_request.all_headers(),
+ **parser_arguments,
+ }
  ),
  )
  except Exception as e: # pragma: no cover
@@ -93,14 +107,11 @@ class ResponseFactory:
  if not final_response:
  raise ValueError("Failed to get a response from the page")

- # This will be parsed inside `Response`
  encoding = (
- final_response.headers.get("content-type", "") or "utf-8"
+ cls.__extract_browser_encoding(final_response.headers.get("content-type", "")) or "utf-8"
  ) # default encoding
  # PlayWright API sometimes give empty status text for some reason!
- status_text = final_response.status_text or StatusText.get(
- final_response.status
- )
+ status_text = final_response.status_text or StatusText.get(final_response.status)

  history = cls._process_response_history(first_response, parser_arguments)
  try:
@@ -110,16 +121,18 @@ class ResponseFactory:
  page_content = ""

  return Response(
- url=page.url,
- content=page_content,
- status=final_response.status,
- reason=status_text,
- encoding=encoding,
- cookies=tuple(dict(cookie) for cookie in page.context.cookies()),
- headers=first_response.all_headers(),
- request_headers=first_response.request.all_headers(),
- history=history,
- **parser_arguments,
+ **{
+ "url": page.url,
+ "content": page_content,
+ "status": final_response.status,
+ "reason": status_text,
+ "encoding": encoding,
+ "cookies": tuple(dict(cookie) for cookie in page.context.cookies()),
+ "headers": first_response.all_headers(),
+ "request_headers": first_response.request.all_headers(),
+ "history": history,
+ **parser_arguments,
+ }
  )

  @classmethod
@@ -137,24 +150,23 @@ class ResponseFactory:
  history.insert(
  0,
  Response(
- url=current_request.url,
- # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
- content="",
- status=current_response.status if current_response else 301,
- reason=(
- current_response.status_text
- or StatusText.get(current_response.status)
- )
- if current_response
- else StatusText.get(301),
- encoding=current_response.headers.get("content-type", "")
- or "utf-8",
- cookies=tuple(),
- headers=await current_response.all_headers()
- if current_response
- else {},
- request_headers=await current_request.all_headers(),
- **parser_arguments,
+ **{
+ "url": current_request.url,
+ # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
+ "content": "",
+ "status": current_response.status if current_response else 301,
+ "reason": (current_response.status_text or StatusText.get(current_response.status))
+ if current_response
+ else StatusText.get(301),
+ "encoding": cls.__extract_browser_encoding(
+ current_response.headers.get("content-type", "")
+ )
+ or "utf-8",
+ "cookies": tuple(),
+ "headers": await current_response.all_headers() if current_response else {},
+ "request_headers": await current_request.all_headers(),
+ **parser_arguments,
+ }
  ),
  )
  except Exception as e: # pragma: no cover
@@ -198,18 +210,13 @@ class ResponseFactory:
  if not final_response:
  raise ValueError("Failed to get a response from the page")

- # This will be parsed inside `Response`
  encoding = (
- final_response.headers.get("content-type", "") or "utf-8"
+ cls.__extract_browser_encoding(final_response.headers.get("content-type", "")) or "utf-8"
  ) # default encoding
  # PlayWright API sometimes give empty status text for some reason!
- status_text = final_response.status_text or StatusText.get(
- final_response.status
- )
+ status_text = final_response.status_text or StatusText.get(final_response.status)

- history = await cls._async_process_response_history(
- first_response, parser_arguments
- )
+ history = await cls._async_process_response_history(first_response, parser_arguments)
  try:
  page_content = await page.content()
  except Exception as e: # pragma: no cover
@@ -217,16 +224,18 @@ class ResponseFactory:
  page_content = ""

  return Response(
- url=page.url,
- content=page_content,
- status=final_response.status,
- reason=status_text,
- encoding=encoding,
- cookies=tuple(dict(cookie) for cookie in await page.context.cookies()),
- headers=await first_response.all_headers(),
- request_headers=await first_response.request.all_headers(),
- history=history,
- **parser_arguments,
+ **{
+ "url": page.url,
+ "content": page_content,
+ "status": final_response.status,
+ "reason": status_text,
+ "encoding": encoding,
+ "cookies": tuple(dict(cookie) for cookie in await page.context.cookies()),
+ "headers": await first_response.all_headers(),
+ "request_headers": await first_response.request.all_headers(),
+ "history": history,
+ **parser_arguments,
+ }
  )

  @staticmethod
@@ -238,17 +247,17 @@ class ResponseFactory:
  :return: A `Response` object that is the same as `Selector` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
  """
  return Response(
- url=response.url,
- content=response.content
- if isinstance(response.content, bytes)
- else response.content.encode(),
- status=response.status_code,
- reason=response.reason,
- encoding=response.encoding or "utf-8",
- cookies=dict(response.cookies),
- headers=dict(response.headers),
- request_headers=dict(response.request.headers),
- method=response.request.method,
- history=response.history, # https://github.com/lexiforest/curl_cffi/issues/82
- **parser_arguments,
+ **{
+ "url": response.url,
+ "content": response.content,
+ "status": response.status_code,
+ "reason": response.reason,
+ "encoding": response.encoding or "utf-8",
+ "cookies": dict(response.cookies),
+ "headers": dict(response.headers),
+ "request_headers": dict(response.request.headers),
+ "method": response.request.method,
+ "history": response.history, # https://github.com/lexiforest/curl_cffi/issues/82
+ **parser_arguments,
+ }
  )
scrapling/engines/toolbelt/custom.py
@@ -2,8 +2,9 @@
  Functions related to custom types or type checking
  """

- from email.message import Message
+ from functools import lru_cache

+ from scrapling.core.utils import log
  from scrapling.core._types import (
  Any,
  Dict,
@@ -12,89 +13,9 @@ from scrapling.core._types import (
  Tuple,
  )
  from scrapling.core.custom_types import MappingProxyType
- from scrapling.core.utils import log, lru_cache
  from scrapling.parser import Selector, SQLiteStorageSystem


- class ResponseEncoding:
- __DEFAULT_ENCODING = "utf-8"
- __ISO_8859_1_CONTENT_TYPES = {
- "text/plain",
- "text/html",
- "text/css",
- "text/javascript",
- }
-
- @classmethod
- @lru_cache(maxsize=128)
- def __parse_content_type(cls, header_value: str) -> Tuple[str, Dict[str, str]]:
- """Parse content type and parameters from a content-type header value.
-
- Uses `email.message.Message` for robust header parsing according to RFC 2045.
-
- :param header_value: Raw content-type header string
- :return: Tuple of (content_type, parameters_dict)
- """
- # Create a Message object and set the Content-Type header then get the content type and parameters
- msg = Message()
- msg["content-type"] = header_value
-
- content_type = msg.get_content_type()
- params = dict(msg.get_params(failobj=[]))
-
- # Remove the content-type from params if present somehow
- params.pop("content-type", None)
-
- return content_type, params
-
- @classmethod
- @lru_cache(maxsize=128)
- def get_value(
- cls, content_type: Optional[str], text: Optional[str] = "test"
- ) -> str:
- """Determine the appropriate character encoding from a content-type header.
-
- The encoding is determined by these rules in order:
- 1. If no content-type is provided, use UTF-8
- 2. If charset parameter is present, use that encoding
- 3. If content-type is `text/*`, use ISO-8859-1 per HTTP/1.1 spec
- 4. If content-type is application/json, use UTF-8 per RFC 4627
- 5. Default to UTF-8 if nothing else matches
-
- :param content_type: Content-Type header value or None
- :param text: A text to test the encoding on it
- :return: String naming the character encoding
- """
- if not content_type:
- return cls.__DEFAULT_ENCODING
-
- try:
- encoding = None
- content_type, params = cls.__parse_content_type(content_type)
-
- # First check for explicit charset parameter
- if "charset" in params:
- encoding = params["charset"].strip("'\"")
-
- # Apply content-type specific rules
- elif content_type in cls.__ISO_8859_1_CONTENT_TYPES:
- encoding = "ISO-8859-1"
-
- elif content_type == "application/json":
- encoding = cls.__DEFAULT_ENCODING
-
- if encoding:
- _ = text.encode(
- encoding
- ) # Validate encoding and validate it can encode the given text
- return encoding
-
- return cls.__DEFAULT_ENCODING
-
- except (ValueError, LookupError, UnicodeEncodeError):
- return cls.__DEFAULT_ENCODING
-
-
  class Response(Selector):
  """This class is returned by all engines as a way to unify response type between different libraries."""

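For context, the deleted `ResponseEncoding` class relied on the standard library's `email.message.Message` for RFC 2045 header parsing. A condensed sketch of that (now removed) logic, kept here only to show what the regex-based helper in convertor.py supersedes:

```python
from email.message import Message


def legacy_encoding(content_type: str, default: str = "utf-8") -> str:
    """Rough equivalent of the removed ResponseEncoding.get_value()."""
    msg = Message()
    msg["content-type"] = content_type
    params = dict(msg.get_params(failobj=[]))
    if "charset" in params:  # an explicit charset parameter wins
        return params["charset"].strip("'\"")
    if msg.get_content_type() in ("text/plain", "text/html", "text/css", "text/javascript"):
        return "ISO-8859-1"  # HTTP/1.1 default for these text/* types
    return default


print(legacy_encoding("text/html; charset=utf-8"))  # -> utf-8
print(legacy_encoding("text/html"))                 # -> ISO-8859-1
```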
@@ -119,9 +40,6 @@ class Response(Selector):
  self.headers = headers
  self.request_headers = request_headers
  self.history = history or []
- encoding = ResponseEncoding.get_value(
- encoding, content.decode("utf-8") if isinstance(content, bytes) else content
- )
  super().__init__(
  content=content,
  url=adaptive_domain or url,
@@ -129,9 +47,7 @@ class Response(Selector):
  **selector_config,
  )
  # For easier debugging while working from a Python shell
- log.info(
- f"Fetched ({status}) <{method} {url}> (referer: {request_headers.get('referer')})"
- )
+ log.info(f"Fetched ({status}) <{method} {url}> (referer: {request_headers.get('referer')})")


  class BaseFetcher:
@@ -190,18 +106,12 @@ class BaseFetcher:
  setattr(cls, key, value)
  else:
  # Yup, no fun allowed LOL
- raise AttributeError(
- f'Unknown parser argument: "{key}"; maybe you meant {cls.parser_keywords}?'
- )
+ raise AttributeError(f'Unknown parser argument: "{key}"; maybe you meant {cls.parser_keywords}?')
  else:
- raise ValueError(
- f'Unknown parser argument: "{key}"; maybe you meant {cls.parser_keywords}?'
- )
+ raise ValueError(f'Unknown parser argument: "{key}"; maybe you meant {cls.parser_keywords}?')

  if not kwargs:
- raise AttributeError(
- f"You must pass a keyword to configure, current keywords: {cls.parser_keywords}?"
- )
+ raise AttributeError(f"You must pass a keyword to configure, current keywords: {cls.parser_keywords}?")

  @classmethod
  def _generate_parser_arguments(cls) -> Dict:
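The error strings above belong to BaseFetcher's class-level parser configuration. Assuming the surrounding classmethod is the `configure` entry point referenced in the last message (the method name and accepted keywords are not visible in this hunk), usage would look roughly like:

```python
from scrapling.fetchers import Fetcher  # a BaseFetcher subclass

# Hypothetical call: sets a parser keyword as a class attribute for later fetches;
# an unrecognized keyword raises the AttributeError/ValueError shown in the diff.
Fetcher.configure(adaptive_domain="example.com")

# Calling it with no keywords raises:
# AttributeError: You must pass a keyword to configure, current keywords: ...
```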
@@ -217,9 +127,7 @@ class BaseFetcher:
  )
  if cls.adaptive_domain:
  if not isinstance(cls.adaptive_domain, str):
- log.warning(
- '[Ignored] The argument "adaptive_domain" must be of string type'
- )
+ log.warning('[Ignored] The argument "adaptive_domain" must be of string type')
  else:
  parser_arguments.update({"adaptive_domain": cls.adaptive_domain})

scrapling/engines/toolbelt/fingerprints.py
@@ -2,13 +2,13 @@
  Functions related to generating headers and fingerprints generally
  """

+ from functools import lru_cache
  from platform import system as platform_system

  from tldextract import extract
  from browserforge.headers import Browser, HeaderGenerator

  from scrapling.core._types import Dict, Optional
- from scrapling.core.utils import lru_cache

  __OS_NAME__ = platform_system()

@@ -37,8 +37,6 @@ def get_os_name() -> Optional[str]:
  "Linux": "linux",
  "Darwin": "macos",
  "Windows": "windows",
- # For the future? because why not?
- "iOS": "ios",
  }.get(__OS_NAME__)

scrapling/engines/toolbelt/navigation.py
@@ -30,9 +30,7 @@ def intercept_route(route: Route):
  :return: PlayWright `Route` object
  """
  if route.request.resource_type in DEFAULT_DISABLED_RESOURCES:
- log.debug(
- f'Blocking background resource "{route.request.url}" of type "{route.request.resource_type}"'
- )
+ log.debug(f'Blocking background resource "{route.request.url}" of type "{route.request.resource_type}"')
  route.abort()
  else:
  route.continue_()
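`intercept_route` and its async twin below are standard Playwright route handlers, so they are registered with Playwright's `page.route()` API. A minimal sketch (URL pattern and import path are assumptions based on this diff):

```python
from playwright.sync_api import sync_playwright

from scrapling.engines.toolbelt.navigation import intercept_route  # path per the file list above

with sync_playwright() as p:
    browser = p.chromium.launch()
    page = browser.new_page()
    # Every request is routed through intercept_route, which aborts requests whose
    # resource_type is in DEFAULT_DISABLED_RESOURCES and lets everything else continue.
    page.route("**/*", intercept_route)
    page.goto("https://example.com")
    browser.close()
```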
@@ -45,17 +43,13 @@ async def async_intercept_route(route: async_Route):
  :return: PlayWright `Route` object
  """
  if route.request.resource_type in DEFAULT_DISABLED_RESOURCES:
- log.debug(
- f'Blocking background resource "{route.request.url}" of type "{route.request.resource_type}"'
- )
+ log.debug(f'Blocking background resource "{route.request.url}" of type "{route.request.resource_type}"')
  await route.abort()
  else:
  await route.continue_()


- def construct_proxy_dict(
- proxy_string: str | Dict[str, str], as_tuple=False
- ) -> Optional[Dict | Tuple]:
+ def construct_proxy_dict(proxy_string: str | Dict[str, str], as_tuple=False) -> Optional[Dict | Tuple]:
  """Validate a proxy and return it in the acceptable format for Playwright
  Reference: https://playwright.dev/python/docs/network#http-proxy

@@ -65,10 +59,7 @@ def construct_proxy_dict(
  """
  if isinstance(proxy_string, str):
  proxy = urlparse(proxy_string)
- if (
- proxy.scheme not in ("http", "https", "socks4", "socks5")
- or not proxy.hostname
- ):
+ if proxy.scheme not in ("http", "https", "socks4", "socks5") or not proxy.hostname:
  raise ValueError("Invalid proxy string!")

  try:
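`construct_proxy_dict` validates a proxy string (or dict) and returns it in the format Playwright expects (see the referenced Playwright proxy docs). A rough usage sketch; the exact return shape is inferred from Playwright's proxy settings rather than shown in this hunk:

```python
from scrapling.engines.toolbelt.navigation import construct_proxy_dict  # path per the file list above

proxy = construct_proxy_dict("http://user:pass@127.0.0.1:8080")
# Expected to resemble Playwright's proxy settings, e.g.:
# {"server": "http://127.0.0.1:8080", "username": "user", "password": "pass"}

construct_proxy_dict("ftp://bad-proxy")  # raises ValueError("Invalid proxy string!")
```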
@@ -95,51 +86,6 @@ def construct_proxy_dict(
  return None


- def construct_cdp_url(cdp_url: str, query_params: Optional[Dict] = None) -> str:
- """Takes a CDP URL, reconstruct it to check it's valid, then adds encoded parameters if exists
-
- :param cdp_url: The target URL.
- :param query_params: A dictionary of the parameters to add.
- :return: The new CDP URL.
- """
- try:
- # Validate the base URL structure
- parsed = urlparse(cdp_url)
-
- # Check scheme
- if parsed.scheme not in ("ws", "wss"):
- raise ValueError("CDP URL must use 'ws://' or 'wss://' scheme")
-
- # Validate hostname and port
- if not parsed.netloc:
- raise ValueError("Invalid hostname for the CDP URL")
-
- try:
- # Checking if the port is valid (if available)
- _ = parsed.port
- except ValueError:
- # urlparse will raise `ValueError` if the port can't be casted to integer
- raise ValueError("Invalid port for the CDP URL")
-
- # Ensure the path starts with /
- path = parsed.path
- if not path.startswith("/"):
- path = "/" + path
-
- # Reconstruct the base URL with validated parts
- validated_base = f"{parsed.scheme}://{parsed.netloc}{path}"
-
- # Add query parameters
- if query_params:
- query_string = urlencode(query_params)
- return f"{validated_base}?{query_string}"
-
- return validated_base
-
- except Exception as e:
- raise ValueError(f"Invalid CDP URL: {str(e)}")
-
-
  @lru_cache(10, typed=True)
  def js_bypass_path(filename: str) -> str:
  """Takes the base filename of a JS file inside the `bypasses` folder, then return the full path of it