scrapling 0.2.98__py3-none-any.whl → 0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +18 -31
- scrapling/cli.py +818 -20
- scrapling/core/_html_utils.py +348 -0
- scrapling/core/_types.py +34 -17
- scrapling/core/ai.py +611 -0
- scrapling/core/custom_types.py +183 -100
- scrapling/core/mixins.py +27 -19
- scrapling/core/shell.py +647 -0
- scrapling/core/{storage_adaptors.py → storage.py} +41 -33
- scrapling/core/translator.py +20 -26
- scrapling/core/utils.py +49 -54
- scrapling/engines/__init__.py +15 -6
- scrapling/engines/_browsers/__init__.py +2 -0
- scrapling/engines/_browsers/_camoufox.py +745 -0
- scrapling/engines/_browsers/_config_tools.py +130 -0
- scrapling/engines/_browsers/_controllers.py +630 -0
- scrapling/engines/_browsers/_page.py +93 -0
- scrapling/engines/_browsers/_validators.py +150 -0
- scrapling/engines/constants.py +101 -88
- scrapling/engines/static.py +667 -110
- scrapling/engines/toolbelt/__init__.py +20 -6
- scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +2 -1
- scrapling/engines/toolbelt/convertor.py +254 -0
- scrapling/engines/toolbelt/custom.py +205 -186
- scrapling/engines/toolbelt/fingerprints.py +32 -46
- scrapling/engines/toolbelt/navigation.py +68 -39
- scrapling/fetchers.py +255 -260
- scrapling/parser.py +781 -449
- scrapling-0.3.dist-info/METADATA +409 -0
- scrapling-0.3.dist-info/RECORD +41 -0
- {scrapling-0.2.98.dist-info → scrapling-0.3.dist-info}/WHEEL +1 -1
- {scrapling-0.2.98.dist-info → scrapling-0.3.dist-info}/top_level.txt +0 -1
- scrapling/defaults.py +0 -19
- scrapling/engines/camo.py +0 -299
- scrapling/engines/pw.py +0 -428
- scrapling/engines/toolbelt/bypasses/pdf_viewer.js +0 -5
- scrapling-0.2.98.dist-info/METADATA +0 -867
- scrapling-0.2.98.dist-info/RECORD +0 -49
- tests/__init__.py +0 -1
- tests/fetchers/__init__.py +0 -1
- tests/fetchers/async/__init__.py +0 -0
- tests/fetchers/async/test_camoufox.py +0 -95
- tests/fetchers/async/test_httpx.py +0 -83
- tests/fetchers/async/test_playwright.py +0 -99
- tests/fetchers/sync/__init__.py +0 -0
- tests/fetchers/sync/test_camoufox.py +0 -68
- tests/fetchers/sync/test_httpx.py +0 -82
- tests/fetchers/sync/test_playwright.py +0 -87
- tests/fetchers/test_utils.py +0 -97
- tests/parser/__init__.py +0 -0
- tests/parser/test_automatch.py +0 -111
- tests/parser/test_general.py +0 -330
- {scrapling-0.2.98.dist-info → scrapling-0.3.dist-info}/entry_points.txt +0 -0
- {scrapling-0.2.98.dist-info → scrapling-0.3.dist-info/licenses}/LICENSE +0 -0
@@ -1,6 +1,20 @@
|
|
1
|
-
from .custom import (
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
1
|
+
from .custom import (
|
2
|
+
BaseFetcher,
|
3
|
+
Response,
|
4
|
+
StatusText,
|
5
|
+
get_variable_name,
|
6
|
+
)
|
7
|
+
from .fingerprints import (
|
8
|
+
generate_convincing_referer,
|
9
|
+
generate_headers,
|
10
|
+
get_os_name,
|
11
|
+
__default_useragent__,
|
12
|
+
)
|
13
|
+
from .navigation import (
|
14
|
+
async_intercept_route,
|
15
|
+
construct_cdp_url,
|
16
|
+
construct_proxy_dict,
|
17
|
+
intercept_route,
|
18
|
+
js_bypass_path,
|
19
|
+
)
|
20
|
+
from .convertor import ResponseFactory
|
@@ -1,2 +1,3 @@
|
|
1
1
|
// Remove playwright fingerprint => https://github.com/microsoft/playwright/commit/c9e673c6dca746384338ab6bb0cf63c7e7caa9b2#diff-087773eea292da9db5a3f27de8f1a2940cdb895383ad750c3cd8e01772a35b40R915
|
2
|
-
delete __pwInitScripts;
|
2
|
+
delete window.__pwInitScripts; // now addressed explicitly through `window` (the previous revision deleted the bare identifier)
|
3
|
+
delete window.__playwright__binding__; // NOTE(review): presumably another Playwright-injected global that reveals automation — confirm against the Playwright source
|
@@ -0,0 +1,254 @@
|
|
1
|
+
from curl_cffi.requests import Response as CurlResponse
|
2
|
+
from playwright.sync_api import Page as SyncPage, Response as SyncResponse
|
3
|
+
from playwright.async_api import Page as AsyncPage, Response as AsyncResponse
|
4
|
+
|
5
|
+
from scrapling.core.utils import log
|
6
|
+
from scrapling.core._types import Dict, Optional
|
7
|
+
from .custom import Response, StatusText
|
8
|
+
|
9
|
+
|
10
|
+
class ResponseFactory:
    """
    Factory class for creating `Response` objects from various sources.

    This class provides class and static methods for building standardized `Response` objects
    from diverse input sources such as Playwright responses, asynchronous Playwright responses,
    and raw HTTP request responses. It supports handling response histories, constructing the proper
    response objects, and managing encoding, headers, cookies, and other attributes.
    """

    @staticmethod
    def _encoding_hint(response) -> str:
        """Return the raw `content-type` header of a Playwright response, or "utf-8" as the default.

        The value is passed to `Response` unchanged; the actual charset extraction happens there.

        :param response: A sync/async Playwright response, or None when no response is available.
        :return: The `content-type` header value, or "utf-8" if the response or the header is missing/empty.
        """
        content_type = response.headers.get("content-type", "") if response else ""
        return content_type or "utf-8"

    @staticmethod
    def _reason_phrase(response, fallback_status: int = 301) -> str:
        """Return the status text for a Playwright response.

        Playwright sometimes reports an empty status text, so `StatusText` is used to look it up from
        the status code in that case.

        :param response: A sync/async Playwright response, or None when no response is available.
        :param fallback_status: Status code used for the lookup when `response` is missing.
        :return: The reason phrase to store in the `Response` object.
        """
        if not response:
            return StatusText.get(fallback_status)
        return response.status_text or StatusText.get(response.status)

    @classmethod
    def _build_redirect_entry(
        cls, request, response, headers, request_headers, parser_arguments: Dict
    ) -> Response:
        """Build the `Response` object that represents one redirect hop in the history.

        Shared by the sync and async history processors: they only differ in how the headers are
        fetched (awaited or not), so the already-resolved headers are passed in.

        :param request: The Playwright request of this redirect hop.
        :param response: The Playwright response of this hop, or None if it isn't available.
        :param headers: Already-resolved response headers (`{}` when there is no response).
        :param request_headers: Already-resolved request headers.
        :param parser_arguments: Extra keyword arguments forwarded to `Response`.
        :return: A `Response` with empty content describing the redirect hop.
        """
        return Response(
            url=request.url,
            # using response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
            content="",
            # A hop without a response object is recorded as a plain 301
            status=response.status if response else 301,
            reason=cls._reason_phrase(response),
            # Guarded like the other fields: a missing response must not abort the whole history
            encoding=cls._encoding_hint(response),
            cookies=tuple(),
            headers=headers,
            request_headers=request_headers,
            **parser_arguments,
        )

    @classmethod
    def _process_response_history(
        cls, first_response: SyncResponse, parser_arguments: Dict
    ) -> list[Response]:
        """Process response history to build a list of `Response` objects

        Walks the `redirected_from` chain backwards from the request of `first_response` and
        prepends each hop, so the returned list is ordered from the earliest redirect to the latest.
        This is best-effort: an error while processing a hop is logged and stops the walk, returning
        whatever was collected so far.

        :param first_response: The Playwright response whose request's redirect chain is inspected.
        :param parser_arguments: Extra keyword arguments forwarded to each `Response`.
        :return: The redirect history as a list of `Response` objects (possibly empty).
        """
        history = []
        current_request = first_response.request.redirected_from

        try:
            while current_request:
                try:
                    current_response = current_request.response()
                    response_headers = (
                        current_response.all_headers() if current_response else {}
                    )
                    history.insert(
                        0,
                        cls._build_redirect_entry(
                            current_request,
                            current_response,
                            response_headers,
                            current_request.all_headers(),
                            parser_arguments,
                        ),
                    )
                except Exception as e:  # pragma: no cover
                    log.error(f"Error processing redirect: {e}")
                    break

                current_request = current_request.redirected_from
        except Exception as e:  # pragma: no cover
            log.error(f"Error processing response history: {e}")

        return history

    @classmethod
    def from_playwright_response(
        cls,
        page: SyncPage,
        first_response: Optional[SyncResponse],
        final_response: Optional[SyncResponse],
        parser_arguments: Dict,
    ) -> Response:
        """
        Transforms a Playwright response into an internal `Response` object, encapsulating
        the page's content, response status, headers, and relevant metadata.

        The function handles potential issues, such as empty or missing final responses,
        by falling back to the first response if necessary (and vice versa). Encoding and status text
        are also derived from the provided response headers or reasonable defaults.
        Additionally, the page content and cookies are extracted for further use.

        :param page: A synchronous Playwright `Page` instance that represents the current browser page. Required to retrieve the page's URL, cookies, and content.
        :param first_response: An earlier or initial Playwright `Response` object. Its headers, request headers, and redirect chain are used, and it serves as a fallback when the final response is missing.
        :param final_response: The last response received for the given request from the Playwright instance. Used to derive the status, status text, and encoding.
        :param parser_arguments: A dictionary containing additional arguments needed for parsing or further customization of the returned `Response`. These arguments are dynamically unpacked into
            the `Response` object.

        :return: A fully populated `Response` object containing the page's URL, content, status, headers, cookies, and other derived metadata.
        :rtype: Response
        :raises ValueError: If neither `first_response` nor `final_response` is available.
        """
        # In case we didn't catch a document type somehow
        final_response = final_response if final_response else first_response
        if not final_response:
            raise ValueError("Failed to get a response from the page")
        # Symmetric fallback: without it, a missing first response would crash below with
        # AttributeError even though a usable final response was captured.
        first_response = first_response if first_response else final_response

        # This will be parsed inside `Response`
        encoding = cls._encoding_hint(final_response)
        status_text = cls._reason_phrase(final_response)

        history = cls._process_response_history(first_response, parser_arguments)
        try:
            page_content = page.content()
        except Exception as e:  # pragma: no cover
            log.error(f"Error getting page content: {e}")
            page_content = ""

        return Response(
            url=page.url,
            content=page_content,
            status=final_response.status,
            reason=status_text,
            encoding=encoding,
            cookies=tuple(dict(cookie) for cookie in page.context.cookies()),
            headers=first_response.all_headers(),
            request_headers=first_response.request.all_headers(),
            history=history,
            **parser_arguments,
        )

    @classmethod
    async def _async_process_response_history(
        cls, first_response: AsyncResponse, parser_arguments: Dict
    ) -> list[Response]:
        """Process response history to build a list of `Response` objects

        Async counterpart of `_process_response_history`: walks the `redirected_from` chain backwards
        and prepends each hop, so the list is ordered from the earliest redirect to the latest.
        This is best-effort: an error while processing a hop is logged and stops the walk, returning
        whatever was collected so far.

        :param first_response: The async Playwright response whose request's redirect chain is inspected.
        :param parser_arguments: Extra keyword arguments forwarded to each `Response`.
        :return: The redirect history as a list of `Response` objects (possibly empty).
        """
        history = []
        current_request = first_response.request.redirected_from

        try:
            while current_request:
                try:
                    current_response = await current_request.response()
                    response_headers = (
                        await current_response.all_headers() if current_response else {}
                    )
                    history.insert(
                        0,
                        cls._build_redirect_entry(
                            current_request,
                            current_response,
                            response_headers,
                            await current_request.all_headers(),
                            parser_arguments,
                        ),
                    )
                except Exception as e:  # pragma: no cover
                    log.error(f"Error processing redirect: {e}")
                    break

                current_request = current_request.redirected_from
        except Exception as e:  # pragma: no cover
            log.error(f"Error processing response history: {e}")

        return history

    @classmethod
    async def from_async_playwright_response(
        cls,
        page: AsyncPage,
        first_response: Optional[AsyncResponse],
        final_response: Optional[AsyncResponse],
        parser_arguments: Dict,
    ) -> Response:
        """
        Transforms an async Playwright response into an internal `Response` object, encapsulating
        the page's content, response status, headers, and relevant metadata.

        The function handles potential issues, such as empty or missing final responses,
        by falling back to the first response if necessary (and vice versa). Encoding and status text
        are also derived from the provided response headers or reasonable defaults.
        Additionally, the page content and cookies are extracted for further use.

        :param page: An asynchronous Playwright `Page` instance that represents the current browser page. Required to retrieve the page's URL, cookies, and content.
        :param first_response: An earlier or initial Playwright `Response` object. Its headers, request headers, and redirect chain are used, and it serves as a fallback when the final response is missing.
        :param final_response: The last response received for the given request from the Playwright instance. Used to derive the status, status text, and encoding.
        :param parser_arguments: A dictionary containing additional arguments needed for parsing or further customization of the returned `Response`. These arguments are dynamically unpacked into
            the `Response` object.

        :return: A fully populated `Response` object containing the page's URL, content, status, headers, cookies, and other derived metadata.
        :rtype: Response
        :raises ValueError: If neither `first_response` nor `final_response` is available.
        """
        # In case we didn't catch a document type somehow
        final_response = final_response if final_response else first_response
        if not final_response:
            raise ValueError("Failed to get a response from the page")
        # Symmetric fallback: without it, a missing first response would crash below with
        # AttributeError even though a usable final response was captured.
        first_response = first_response if first_response else final_response

        # This will be parsed inside `Response`
        encoding = cls._encoding_hint(final_response)
        status_text = cls._reason_phrase(final_response)

        history = await cls._async_process_response_history(
            first_response, parser_arguments
        )
        try:
            page_content = await page.content()
        except Exception as e:  # pragma: no cover
            log.error(f"Error getting page content in async: {e}")
            page_content = ""

        return Response(
            url=page.url,
            content=page_content,
            status=final_response.status,
            reason=status_text,
            encoding=encoding,
            cookies=tuple(dict(cookie) for cookie in await page.context.cookies()),
            headers=await first_response.all_headers(),
            request_headers=await first_response.request.all_headers(),
            history=history,
            **parser_arguments,
        )

    @staticmethod
    def from_http_request(response: CurlResponse, parser_arguments: Dict) -> Response:
        """Takes `curl_cffi` response and generates `Response` object from it.

        :param response: `curl_cffi` response object
        :param parser_arguments: Additional arguments to be passed to the `Response` object constructor.
        :return: A `Response` object that is the same as `Selector` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
        """
        return Response(
            url=response.url,
            # `Response` is always given bytes here; a non-bytes body is encoded first
            content=response.content
            if isinstance(response.content, bytes)
            else response.content.encode(),
            status=response.status_code,
            reason=response.reason,
            encoding=response.encoding or "utf-8",
            cookies=dict(response.cookies),
            headers=dict(response.headers),
            request_headers=dict(response.request.headers),
            method=response.request.method,
            history=response.history,  # https://github.com/lexiforest/curl_cffi/issues/82
            **parser_arguments,
        )
|