scrapling 0.2.99__py3-none-any.whl → 0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. scrapling/__init__.py +18 -31
  2. scrapling/cli.py +818 -20
  3. scrapling/core/_html_utils.py +348 -0
  4. scrapling/core/_types.py +34 -17
  5. scrapling/core/ai.py +611 -0
  6. scrapling/core/custom_types.py +183 -100
  7. scrapling/core/mixins.py +27 -19
  8. scrapling/core/shell.py +647 -0
  9. scrapling/core/{storage_adaptors.py → storage.py} +41 -33
  10. scrapling/core/translator.py +20 -26
  11. scrapling/core/utils.py +49 -54
  12. scrapling/engines/__init__.py +15 -6
  13. scrapling/engines/_browsers/__init__.py +2 -0
  14. scrapling/engines/_browsers/_camoufox.py +745 -0
  15. scrapling/engines/_browsers/_config_tools.py +130 -0
  16. scrapling/engines/_browsers/_controllers.py +630 -0
  17. scrapling/engines/_browsers/_page.py +93 -0
  18. scrapling/engines/_browsers/_validators.py +150 -0
  19. scrapling/engines/constants.py +101 -88
  20. scrapling/engines/static.py +667 -110
  21. scrapling/engines/toolbelt/__init__.py +20 -6
  22. scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +2 -1
  23. scrapling/engines/toolbelt/convertor.py +254 -0
  24. scrapling/engines/toolbelt/custom.py +158 -175
  25. scrapling/engines/toolbelt/fingerprints.py +32 -46
  26. scrapling/engines/toolbelt/navigation.py +68 -39
  27. scrapling/fetchers.py +227 -333
  28. scrapling/parser.py +781 -449
  29. scrapling-0.3.dist-info/METADATA +409 -0
  30. scrapling-0.3.dist-info/RECORD +41 -0
  31. {scrapling-0.2.99.dist-info → scrapling-0.3.dist-info}/WHEEL +1 -1
  32. {scrapling-0.2.99.dist-info → scrapling-0.3.dist-info}/top_level.txt +0 -1
  33. scrapling/defaults.py +0 -25
  34. scrapling/engines/camo.py +0 -339
  35. scrapling/engines/pw.py +0 -465
  36. scrapling/engines/toolbelt/bypasses/pdf_viewer.js +0 -5
  37. scrapling-0.2.99.dist-info/METADATA +0 -290
  38. scrapling-0.2.99.dist-info/RECORD +0 -49
  39. tests/__init__.py +0 -1
  40. tests/fetchers/__init__.py +0 -1
  41. tests/fetchers/async/__init__.py +0 -0
  42. tests/fetchers/async/test_camoufox.py +0 -97
  43. tests/fetchers/async/test_httpx.py +0 -85
  44. tests/fetchers/async/test_playwright.py +0 -101
  45. tests/fetchers/sync/__init__.py +0 -0
  46. tests/fetchers/sync/test_camoufox.py +0 -70
  47. tests/fetchers/sync/test_httpx.py +0 -84
  48. tests/fetchers/sync/test_playwright.py +0 -89
  49. tests/fetchers/test_utils.py +0 -97
  50. tests/parser/__init__.py +0 -0
  51. tests/parser/test_automatch.py +0 -111
  52. tests/parser/test_general.py +0 -330
  53. {scrapling-0.2.99.dist-info → scrapling-0.3.dist-info}/entry_points.txt +0 -0
  54. {scrapling-0.2.99.dist-info → scrapling-0.3.dist-info}/licenses/LICENSE +0 -0
@@ -1,6 +1,20 @@
1
- from .custom import (BaseFetcher, Response, StatusText, check_if_engine_usable,
2
- check_type_validity, get_variable_name)
3
- from .fingerprints import (generate_convincing_referer, generate_headers,
4
- get_os_name)
5
- from .navigation import (async_intercept_route, construct_cdp_url,
6
- construct_proxy_dict, intercept_route, js_bypass_path)
1
+ from .custom import (
2
+ BaseFetcher,
3
+ Response,
4
+ StatusText,
5
+ get_variable_name,
6
+ )
7
+ from .fingerprints import (
8
+ generate_convincing_referer,
9
+ generate_headers,
10
+ get_os_name,
11
+ __default_useragent__,
12
+ )
13
+ from .navigation import (
14
+ async_intercept_route,
15
+ construct_cdp_url,
16
+ construct_proxy_dict,
17
+ intercept_route,
18
+ js_bypass_path,
19
+ )
20
+ from .convertor import ResponseFactory
@@ -1,2 +1,3 @@
1
1
  // Remove playwright fingerprint => https://github.com/microsoft/playwright/commit/c9e673c6dca746384338ab6bb0cf63c7e7caa9b2#diff-087773eea292da9db5a3f27de8f1a2940cdb895383ad750c3cd8e01772a35b40R915
2
- delete __pwInitScripts;
2
+ delete window.__pwInitScripts;
3
+ delete window.__playwright__binding__;
@@ -0,0 +1,254 @@
1
+ from curl_cffi.requests import Response as CurlResponse
2
+ from playwright.sync_api import Page as SyncPage, Response as SyncResponse
3
+ from playwright.async_api import Page as AsyncPage, Response as AsyncResponse
4
+
5
+ from scrapling.core.utils import log
6
+ from scrapling.core._types import Dict, Optional
7
+ from .custom import Response, StatusText
8
+
9
+
10
class ResponseFactory:
    """
    Factory class for creating `Response` objects from various sources.

    This class provides class methods and a static method for building standardized `Response` objects
    from diverse input sources such as Playwright responses, asynchronous Playwright responses,
    and raw HTTP request responses. It supports handling response histories, constructing the proper
    response objects, and managing encoding, headers, cookies, and other attributes.
    """

    @staticmethod
    def _build_redirect_response(
        current_request,
        current_response,
        response_headers: Dict,
        request_headers: Dict,
        parser_arguments: Dict,
    ) -> Response:
        """Build the `Response` history entry for a single redirect hop.

        Shared by the sync and async history processors; the callers resolve the headers first
        because that step differs between the two Playwright APIs (plain call vs. `await`).

        :param current_request: The Playwright request of this redirect hop; only its `url` is read here.
        :param current_response: The Playwright response of this hop, or a falsy value when none is available.
        :param response_headers: Already-resolved response headers of this hop (empty when there is no response).
        :param request_headers: Already-resolved request headers of this hop.
        :param parser_arguments: Additional arguments unpacked into the `Response` constructor.
        :return: A `Response` with empty content describing this redirect hop.
        """
        if current_response:
            status = current_response.status
            reason = current_response.status_text or StatusText.get(status)
            # The raw content-type value is passed on as the encoding argument
            encoding = current_response.headers.get("content-type", "") or "utf-8"
        else:
            # No response object for this hop: fall back to a generic redirect description.
            # The encoding must be guarded as well, otherwise the attribute access on the
            # missing response raises and the rest of the history is lost.
            status = 301
            reason = StatusText.get(301)
            encoding = "utf-8"

        return Response(
            url=current_request.url,
            # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
            content="",
            status=status,
            reason=reason,
            encoding=encoding,
            cookies=tuple(),
            headers=response_headers,
            request_headers=request_headers,
            **parser_arguments,
        )

    @classmethod
    def _process_response_history(
        cls, first_response: SyncResponse, parser_arguments: Dict
    ) -> list[Response]:
        """Process response history to build a list of `Response` objects

        Follows the `redirected_from` links starting at the request of `first_response` and
        inserts every hop at the front of the list, so the hop reached last while walking
        backwards ends up first. Errors are logged and end the walk; whatever was collected
        up to that point is returned.

        :param first_response: The Playwright response whose request's redirect chain is walked.
        :param parser_arguments: Additional arguments unpacked into every `Response` created.
        :return: The list of `Response` objects built from the redirect chain (possibly empty).
        """
        history = []
        current_request = first_response.request.redirected_from

        try:
            while current_request:
                try:
                    current_response = current_request.response()
                    history.insert(
                        0,
                        cls._build_redirect_response(
                            current_request,
                            current_response,
                            current_response.all_headers() if current_response else {},
                            current_request.all_headers(),
                            parser_arguments,
                        ),
                    )
                except Exception as e:  # pragma: no cover
                    log.error(f"Error processing redirect: {e}")
                    break

                current_request = current_request.redirected_from
        except Exception as e:  # pragma: no cover
            log.error(f"Error processing response history: {e}")

        return history

    @classmethod
    def from_playwright_response(
        cls,
        page: SyncPage,
        first_response: SyncResponse,
        final_response: Optional[SyncResponse],
        parser_arguments: Dict,
    ) -> Response:
        """
        Transforms a Playwright response into an internal `Response` object, encapsulating
        the page's content, response status, headers, and relevant metadata.

        The function handles potential issues, such as empty or missing final responses,
        by falling back to the first response if necessary. Encoding and status text
        are also derived from the provided response headers or reasonable defaults.
        Additionally, the page content and cookies are extracted for further use.

        :param page: A synchronous Playwright `Page` instance that represents the current browser page. Required to retrieve the page's URL, cookies, and content.
        :param first_response: An earlier or initial Playwright `Response` object that may serve as a fallback response in the absence of the final one.
            Its headers, request headers, and redirect chain are the ones stored on the result.
        :param final_response: The last response received for the given request from the Playwright instance. Used to derive the status, status text, and encoding.
        :param parser_arguments: A dictionary containing additional arguments needed for parsing or further customization of the returned `Response`. These arguments are dynamically unpacked into
            the `Response` object.

        :return: A fully populated `Response` object containing the page's URL, content, status, headers, cookies, and other derived metadata.
        :rtype: Response
        :raises ValueError: If neither `final_response` nor `first_response` is available.
        """
        # In case we didn't catch a document type somehow
        final_response = final_response if final_response else first_response
        if not final_response:
            raise ValueError("Failed to get a response from the page")

        # This will be parsed inside `Response`
        encoding = (
            final_response.headers.get("content-type", "") or "utf-8"
        )  # default encoding
        # The Playwright API sometimes gives an empty status text for some reason!
        status_text = final_response.status_text or StatusText.get(
            final_response.status
        )

        history = cls._process_response_history(first_response, parser_arguments)
        try:
            page_content = page.content()
        except Exception as e:  # pragma: no cover
            # Best effort: an unreadable page still yields a `Response` with empty content
            log.error(f"Error getting page content: {e}")
            page_content = ""

        return Response(
            url=page.url,
            content=page_content,
            status=final_response.status,
            reason=status_text,
            encoding=encoding,
            cookies=tuple(dict(cookie) for cookie in page.context.cookies()),
            # NOTE(review): headers come from `first_response` while status/encoding come
            # from `final_response` - confirm this asymmetry is intended.
            headers=first_response.all_headers(),
            request_headers=first_response.request.all_headers(),
            history=history,
            **parser_arguments,
        )

    @classmethod
    async def _async_process_response_history(
        cls, first_response: AsyncResponse, parser_arguments: Dict
    ) -> list[Response]:
        """Process response history to build a list of `Response` objects

        Asynchronous counterpart of `_process_response_history`: follows the `redirected_from`
        links starting at the request of `first_response`, awaiting the Playwright calls, and
        inserts every hop at the front of the list. Errors are logged and end the walk;
        whatever was collected up to that point is returned.

        :param first_response: The async Playwright response whose request's redirect chain is walked.
        :param parser_arguments: Additional arguments unpacked into every `Response` created.
        :return: The list of `Response` objects built from the redirect chain (possibly empty).
        """
        history = []
        current_request = first_response.request.redirected_from

        try:
            while current_request:
                try:
                    current_response = await current_request.response()
                    response_headers = (
                        await current_response.all_headers() if current_response else {}
                    )
                    history.insert(
                        0,
                        cls._build_redirect_response(
                            current_request,
                            current_response,
                            response_headers,
                            await current_request.all_headers(),
                            parser_arguments,
                        ),
                    )
                except Exception as e:  # pragma: no cover
                    log.error(f"Error processing redirect: {e}")
                    break

                current_request = current_request.redirected_from
        except Exception as e:  # pragma: no cover
            log.error(f"Error processing response history: {e}")

        return history

    @classmethod
    async def from_async_playwright_response(
        cls,
        page: AsyncPage,
        first_response: AsyncResponse,
        final_response: Optional[AsyncResponse],
        parser_arguments: Dict,
    ) -> Response:
        """
        Transforms a Playwright response into an internal `Response` object, encapsulating
        the page's content, response status, headers, and relevant metadata.

        The function handles potential issues, such as empty or missing final responses,
        by falling back to the first response if necessary. Encoding and status text
        are also derived from the provided response headers or reasonable defaults.
        Additionally, the page content and cookies are extracted for further use.

        :param page: An asynchronous Playwright `Page` instance that represents the current browser page. Required to retrieve the page's URL, cookies, and content.
        :param first_response: An earlier or initial Playwright `Response` object that may serve as a fallback response in the absence of the final one.
            Its headers, request headers, and redirect chain are the ones stored on the result.
        :param final_response: The last response received for the given request from the Playwright instance. Used to derive the status, status text, and encoding.
        :param parser_arguments: A dictionary containing additional arguments needed for parsing or further customization of the returned `Response`. These arguments are dynamically unpacked into
            the `Response` object.

        :return: A fully populated `Response` object containing the page's URL, content, status, headers, cookies, and other derived metadata.
        :rtype: Response
        :raises ValueError: If neither `final_response` nor `first_response` is available.
        """
        # In case we didn't catch a document type somehow
        final_response = final_response if final_response else first_response
        if not final_response:
            raise ValueError("Failed to get a response from the page")

        # This will be parsed inside `Response`
        encoding = (
            final_response.headers.get("content-type", "") or "utf-8"
        )  # default encoding
        # The Playwright API sometimes gives an empty status text for some reason!
        status_text = final_response.status_text or StatusText.get(
            final_response.status
        )

        history = await cls._async_process_response_history(
            first_response, parser_arguments
        )
        try:
            page_content = await page.content()
        except Exception as e:  # pragma: no cover
            # Best effort: an unreadable page still yields a `Response` with empty content
            log.error(f"Error getting page content in async: {e}")
            page_content = ""

        return Response(
            url=page.url,
            content=page_content,
            status=final_response.status,
            reason=status_text,
            encoding=encoding,
            cookies=tuple(dict(cookie) for cookie in await page.context.cookies()),
            # NOTE(review): headers come from `first_response` while status/encoding come
            # from `final_response` - confirm this asymmetry is intended.
            headers=await first_response.all_headers(),
            request_headers=await first_response.request.all_headers(),
            history=history,
            **parser_arguments,
        )

    @staticmethod
    def from_http_request(response: CurlResponse, parser_arguments: Dict) -> Response:
        """Takes `curl_cffi` response and generates `Response` object from it.

        :param response: `curl_cffi` response object
        :param parser_arguments: Additional arguments to be passed to the `Response` object constructor.
        :return: A `Response` object that is the same as `Selector` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
        """
        # `Response` is always handed bytes here; a non-bytes body is encoded first
        body = response.content
        if not isinstance(body, bytes):
            body = body.encode()

        return Response(
            url=response.url,
            content=body,
            status=response.status_code,
            reason=response.reason,
            encoding=response.encoding or "utf-8",
            cookies=dict(response.cookies),
            headers=dict(response.headers),
            request_headers=dict(response.request.headers),
            method=response.request.method,
            history=response.history,  # https://github.com/lexiforest/curl_cffi/issues/82
            **parser_arguments,
        )