scrapling-0.2.99-py3-none-any.whl → scrapling-0.3.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +18 -31
- scrapling/cli.py +818 -20
- scrapling/core/_html_utils.py +348 -0
- scrapling/core/_types.py +34 -17
- scrapling/core/ai.py +611 -0
- scrapling/core/custom_types.py +183 -100
- scrapling/core/mixins.py +27 -19
- scrapling/core/shell.py +647 -0
- scrapling/core/{storage_adaptors.py → storage.py} +41 -33
- scrapling/core/translator.py +20 -26
- scrapling/core/utils.py +49 -54
- scrapling/engines/__init__.py +15 -6
- scrapling/engines/_browsers/__init__.py +2 -0
- scrapling/engines/_browsers/_camoufox.py +759 -0
- scrapling/engines/_browsers/_config_tools.py +130 -0
- scrapling/engines/_browsers/_controllers.py +644 -0
- scrapling/engines/_browsers/_page.py +93 -0
- scrapling/engines/_browsers/_validators.py +170 -0
- scrapling/engines/constants.py +101 -88
- scrapling/engines/static.py +667 -110
- scrapling/engines/toolbelt/__init__.py +20 -6
- scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +2 -1
- scrapling/engines/toolbelt/convertor.py +254 -0
- scrapling/engines/toolbelt/custom.py +158 -175
- scrapling/engines/toolbelt/fingerprints.py +32 -46
- scrapling/engines/toolbelt/navigation.py +68 -39
- scrapling/fetchers.py +239 -333
- scrapling/parser.py +781 -449
- scrapling-0.3.1.dist-info/METADATA +411 -0
- scrapling-0.3.1.dist-info/RECORD +41 -0
- {scrapling-0.2.99.dist-info → scrapling-0.3.1.dist-info}/WHEEL +1 -1
- {scrapling-0.2.99.dist-info → scrapling-0.3.1.dist-info}/top_level.txt +0 -1
- scrapling/defaults.py +0 -25
- scrapling/engines/camo.py +0 -339
- scrapling/engines/pw.py +0 -465
- scrapling/engines/toolbelt/bypasses/pdf_viewer.js +0 -5
- scrapling-0.2.99.dist-info/METADATA +0 -290
- scrapling-0.2.99.dist-info/RECORD +0 -49
- tests/__init__.py +0 -1
- tests/fetchers/__init__.py +0 -1
- tests/fetchers/async/__init__.py +0 -0
- tests/fetchers/async/test_camoufox.py +0 -97
- tests/fetchers/async/test_httpx.py +0 -85
- tests/fetchers/async/test_playwright.py +0 -101
- tests/fetchers/sync/__init__.py +0 -0
- tests/fetchers/sync/test_camoufox.py +0 -70
- tests/fetchers/sync/test_httpx.py +0 -84
- tests/fetchers/sync/test_playwright.py +0 -89
- tests/fetchers/test_utils.py +0 -97
- tests/parser/__init__.py +0 -0
- tests/parser/test_automatch.py +0 -111
- tests/parser/test_general.py +0 -330
- {scrapling-0.2.99.dist-info → scrapling-0.3.1.dist-info}/entry_points.txt +0 -0
- {scrapling-0.2.99.dist-info → scrapling-0.3.1.dist-info}/licenses/LICENSE +0 -0
scrapling/core/shell.py
ADDED
@@ -0,0 +1,647 @@
# -*- coding: utf-8 -*-
from re import sub as re_sub
from sys import stderr
from functools import wraps
from http import cookies as Cookie
from collections import namedtuple
from shlex import split as shlex_split
from tempfile import mkstemp as make_temp_file
from urllib.parse import urlparse, urlunparse, parse_qsl
from argparse import ArgumentParser, SUPPRESS
from webbrowser import open as open_in_browser
from logging import (
    DEBUG,
    INFO,
    WARNING,
    ERROR,
    CRITICAL,
    FATAL,
    getLogger,
    getLevelName,
)

from orjson import loads as json_loads, JSONDecodeError

from scrapling import __version__
from scrapling.core.custom_types import TextHandler
from scrapling.core.utils import log
from scrapling.parser import Selector, Selectors
from scrapling.core._types import (
    List,
    Optional,
    Dict,
    Tuple,
    Any,
    extraction_types,
    Generator,
)
from scrapling.fetchers import (
    Fetcher,
    AsyncFetcher,
    DynamicFetcher,
    StealthyFetcher,
    Response,
)


_known_logging_levels = {
    "debug": DEBUG,
    "info": INFO,
    "warning": WARNING,
    "error": ERROR,
    "critical": CRITICAL,
    "fatal": FATAL,
}


# Define the structure for parsed context - Simplified for Fetcher args
Request = namedtuple(
    "Request",
    [
        "method",
        "url",
        "params",
        "data",  # Can be str, bytes, or dict (for urlencoded)
        "json_data",  # Python object (dict/list) for JSON payload
        "headers",
        "cookies",
        "proxy",
        "follow_redirects",  # Added for -L flag
    ],
)


def _CookieParser(cookie_string):
    # Errors will be handled on call so the log can be specified
    cookie_parser = Cookie.SimpleCookie()
    cookie_parser.load(cookie_string)
    for key, morsel in cookie_parser.items():
        yield key, morsel.value


def _ParseHeaders(
    header_lines: List[str], parse_cookies: bool = True
) -> Tuple[Dict[str, str], Dict[str, str]]:
    """Parses headers into separate header and cookie dictionaries."""
    header_dict = dict()
    cookie_dict = dict()

    for header_line in header_lines:
        if ":" not in header_line:
            if header_line.endswith(";"):
                header_key = header_line[:-1].strip()
                header_value = ""
                header_dict[header_key] = header_value
            else:
                raise ValueError(
                    f"Could not parse header without colon: '{header_line}'."
                )
        else:
            header_key, header_value = header_line.split(":", 1)
            header_key = header_key.strip()
            header_value = header_value.strip()

            if parse_cookies:
                if header_key.lower() == "cookie":
                    try:
                        cookie_dict = {
                            key: value for key, value in _CookieParser(header_value)
                        }
                    except Exception as e:  # pragma: no cover
                        raise ValueError(
                            f"Could not parse cookie string from header '{header_value}': {e}"
                        )
                else:
                    header_dict[header_key] = header_value
            else:
                header_dict[header_key] = header_value

    return header_dict, cookie_dict


# Suppress exit on error to handle parsing errors gracefully
class NoExitArgumentParser(ArgumentParser):  # pragma: no cover
    def error(self, message):
        log.error(f"Curl arguments parsing error: {message}")
        raise ValueError(f"Curl arguments parsing error: {message}")

    def exit(self, status=0, message=None):
        if message:
            log.error(f"Scrapling shell exited with status {status}: {message}")
            self._print_message(message, stderr)
        raise ValueError(
            f"Scrapling shell exited with status {status}: {message or 'Unknown reason'}"
        )


class CurlParser:
    """Builds the argument parser for relevant curl flags from DevTools."""

    def __init__(self):
        # We will use argparse parser to parse the curl command directly instead of regex
        # We will focus more on flags that will show up on curl commands copied from DevTools's network tab
        _parser = NoExitArgumentParser(add_help=False)  # Disable default help
        # Basic curl arguments
        _parser.add_argument("curl_command_placeholder", nargs="?", help=SUPPRESS)
        _parser.add_argument("url")
        _parser.add_argument("-X", "--request", dest="method", default=None)
        _parser.add_argument("-H", "--header", action="append", default=[])
        _parser.add_argument(
            "-A", "--user-agent", help="Will be parsed from -H if present"
        )  # Note: DevTools usually includes this in -H

        # Data arguments (prioritizing types common from DevTools)
        _parser.add_argument("-d", "--data", default=None)
        _parser.add_argument(
            "--data-raw", default=None
        )  # Often used by browsers for JSON body
        _parser.add_argument("--data-binary", default=None)
        # Keep urlencode for completeness, though less common from browser copy/paste
        _parser.add_argument("--data-urlencode", action="append", default=[])
        _parser.add_argument(
            "-G", "--get", action="store_true"
        )  # Use GET and put data in URL

        _parser.add_argument(
            "-b",
            "--cookie",
            default=None,
            help="Send cookies from string/file (string format used by DevTools)",
        )

        # Proxy
        _parser.add_argument("-x", "--proxy", default=None)
        _parser.add_argument("-U", "--proxy-user", default=None)  # Basic proxy auth

        # Connection/Security
        _parser.add_argument("-k", "--insecure", action="store_true")
        _parser.add_argument(
            "--compressed", action="store_true"
        )  # Very common from browsers

        # Other flags often included but may not map directly to request args
        _parser.add_argument("-i", "--include", action="store_true")
        _parser.add_argument("-s", "--silent", action="store_true")
        _parser.add_argument("-v", "--verbose", action="store_true")

        self.parser: NoExitArgumentParser = _parser
        self._supported_methods = ("get", "post", "put", "delete")

    # --- Main Parsing Logic ---
    def parse(self, curl_command: str) -> Optional[Request]:
        """Parses the curl command string into a structured context for Fetcher."""

        clean_command = curl_command.strip().lstrip("curl").strip().replace("\\\n", " ")

        try:
            tokens = shlex_split(
                clean_command
            )  # Split the string using shell-like syntax
        except ValueError as e:  # pragma: no cover
            log.error(f"Could not split command line: {e}")
            return None

        try:
            parsed_args, unknown = self.parser.parse_known_args(tokens)
            if unknown:
                raise AttributeError(f"Unknown/Unsupported curl arguments: {unknown}")

        except ValueError:  # pragma: no cover
            return None

        except AttributeError:
            raise

        except Exception as e:  # pragma: no cover
            log.error(
                f"An unexpected error occurred during curl arguments parsing: {e}"
            )
            return None

        # --- Determine Method ---
        method = "get"  # Default
        if parsed_args.get:  # `-G` forces GET
            method = "get"

        elif parsed_args.method:
            method = parsed_args.method.strip().lower()

        # Infer POST if data is present (unless overridden by -X or -G)
        elif any(
            [
                parsed_args.data,
                parsed_args.data_raw,
                parsed_args.data_binary,
                parsed_args.data_urlencode,
            ]
        ):
            method = "post"

        headers, cookies = _ParseHeaders(parsed_args.header)

        if parsed_args.cookie:
            # We are focusing on the string format from DevTools.
            try:
                for key, value in _CookieParser(parsed_args.cookie):
                    # Update the cookie dict, potentially overwriting cookies with the same name from -H 'cookie:'
                    cookies[key] = value
                log.debug(f"Parsed cookies from -b argument: {list(cookies.keys())}")
            except Exception as e:  # pragma: no cover
                log.error(
                    f"Could not parse cookie string from -b '{parsed_args.cookie}': {e}"
                )

        # --- Process Data Payload ---
        params = dict()
        data_payload: Optional[str | bytes | Dict] = None
        json_payload: Optional[Any] = None

        # DevTools often uses --data-raw for JSON bodies
        # Precedence: --data-binary > --data-raw / -d > --data-urlencode
        if parsed_args.data_binary is not None:  # pragma: no cover
            try:
                data_payload = parsed_args.data_binary.encode("utf-8")
                log.debug("Using data from --data-binary as bytes.")
            except Exception as e:
                log.warning(
                    f"Could not encode binary data '{parsed_args.data_binary}' as bytes: {e}. Using raw string."
                )
                data_payload = parsed_args.data_binary  # Fallback to string

        elif parsed_args.data_raw is not None:
            data_payload = parsed_args.data_raw

        elif parsed_args.data is not None:
            data_payload = parsed_args.data

        elif parsed_args.data_urlencode:  # pragma: no cover
            # Combine and parse urlencoded data
            combined_data = "&".join(parsed_args.data_urlencode)
            try:
                data_payload = dict(parse_qsl(combined_data, keep_blank_values=True))
            except Exception as e:
                log.warning(
                    f"Could not parse urlencoded data '{combined_data}': {e}. Treating as raw string."
                )
                data_payload = combined_data

        # Check if raw data looks like JSON, prefer 'json' param if so
        if isinstance(data_payload, str):
            try:
                maybe_json = json_loads(data_payload)
                if isinstance(maybe_json, (dict, list)):
                    json_payload = maybe_json
                    data_payload = None
            except JSONDecodeError:
                pass  # Not JSON, keep it in data_payload

        # Handle `-G`: Move data to params if the method is GET
        if method == "get" and data_payload:  # pragma: no cover
            if isinstance(data_payload, dict):  # From --data-urlencode likely
                params.update(data_payload)
            elif isinstance(data_payload, str):
                try:
                    params.update(dict(parse_qsl(data_payload, keep_blank_values=True)))
                except ValueError:
                    log.warning(
                        f"Could not parse data '{data_payload}' into GET parameters for -G."
                    )

            if params:
                data_payload = None  # Clear data as it's moved to params
                json_payload = None  # Should not have JSON body with -G

        # --- Process Proxy ---
        proxies: Optional[Dict[str, str]] = None
        if parsed_args.proxy:
            proxy_url = (
                f"http://{parsed_args.proxy}"
                if "://" not in parsed_args.proxy
                else parsed_args.proxy
            )

            if parsed_args.proxy_user:
                user_pass = parsed_args.proxy_user
                parts = urlparse(proxy_url)
                netloc_parts = parts.netloc.split("@")
                netloc = (
                    f"{user_pass}@{netloc_parts[-1]}"
                    if len(netloc_parts) > 1
                    else f"{user_pass}@{parts.netloc}"
                )
                proxy_url = urlunparse(
                    (
                        parts.scheme,
                        netloc,
                        parts.path,
                        parts.params,
                        parts.query,
                        parts.fragment,
                    )
                )

            # Standard proxy dict format
            proxies = {"http": proxy_url, "https": proxy_url}
            log.debug(f"Using proxy configuration: {proxies}")

        # --- Final Context ---
        return Request(
            method=method,
            url=parsed_args.url,
            params=params,
            data=data_payload,
            json_data=json_payload,
            headers=headers,
            cookies=cookies,
            proxy=proxies,
            follow_redirects=True,  # Scrapling default is True
        )

    def convert2fetcher(self, curl_command: Request | str) -> Optional[Response]:
        if isinstance(curl_command, (Request, str)):
            request = (
                self.parse(curl_command)
                if isinstance(curl_command, str)
                else curl_command
            )

            # Ensure request parsing was successful before proceeding
            if request is None:  # pragma: no cover
                log.error("Failed to parse curl command, cannot convert to fetcher.")
                return None

            request_args = request._asdict()
            method = request_args.pop("method").strip().lower()
            if method in self._supported_methods:
                request_args["json"] = request_args.pop("json_data")

                # Ensure data/json are removed for non-POST/PUT methods
                if method not in ("post", "put"):
                    _ = request_args.pop("data", None)
                    _ = request_args.pop("json", None)

                try:
                    return getattr(Fetcher, method)(**request_args)
                except Exception as e:  # pragma: no cover
                    log.error(f"Error calling Fetcher.{method}: {e}")
                    return None
            else:  # pragma: no cover
                log.error(
                    f'Request method "{method}" isn\'t supported by Scrapling yet'
                )
                return None

        else:  # pragma: no cover
            log.error("Input must be a valid curl command string or a Request object.")
            return None


def show_page_in_browser(page: Selector):  # pragma: no cover
    if not page or not isinstance(page, Selector):
        log.error("Input must be of type `Selector`")
        return

    try:
        fd, fname = make_temp_file(prefix="scrapling_view_", suffix=".html")
        with open(fd, "w", encoding="utf-8") as f:
            f.write(page.body)

        open_in_browser(f"file://{fname}")
    except IOError as e:
        log.error(f"Failed to write temporary file for viewing: {e}")
    except Exception as e:
        log.error(f"An unexpected error occurred while viewing the page: {e}")


class CustomShell:
    """A custom IPython shell with minimal dependencies"""

    def __init__(self, code, log_level="debug"):
        self.code = code
        self.page = None
        self.pages = Selectors([])
        self._curl_parser = CurlParser()
        log_level = log_level.strip().lower()

        if _known_logging_levels.get(log_level):
            self.log_level = _known_logging_levels[log_level]
        else:  # pragma: no cover
            log.warning(f'Unknown log level "{log_level}", defaulting to "DEBUG"')
            self.log_level = DEBUG

        self.shell = None

        # Initialize your application components
        self.init_components()

    def init_components(self):
        """Initialize application components"""
        # This is where you'd set up your application-specific objects
        if self.log_level:
            getLogger("scrapling").setLevel(self.log_level)

        settings = Fetcher.display_config()
        settings.pop("storage", None)
        settings.pop("storage_args", None)
        log.info(f"Scrapling {__version__} shell started")
        log.info(f"Logging level is set to '{getLevelName(self.log_level)}'")
        log.info(f"Fetchers' parsing settings: {settings}")

    @staticmethod
    def banner():
        """Create a custom banner for the shell"""
        return f"""
-> Available Scrapling objects:
   - Fetcher/AsyncFetcher
   - DynamicFetcher
   - StealthyFetcher
   - Selector

-> Useful shortcuts:
   - {"get":<30} Shortcut for `Fetcher.get`
   - {"post":<30} Shortcut for `Fetcher.post`
   - {"put":<30} Shortcut for `Fetcher.put`
   - {"delete":<30} Shortcut for `Fetcher.delete`
   - {"fetch":<30} Shortcut for `DynamicFetcher.fetch`
   - {"stealthy_fetch":<30} Shortcut for `StealthyFetcher.fetch`

-> Useful commands
   - {"page / response":<30} The response object of the last page you fetched
   - {"pages":<30} Selectors object of the last 5 response objects you fetched
   - {"uncurl('curl_command')":<30} Convert curl command to a Request object. (Optimized to handle curl commands copied from DevTools network tab.)
   - {"curl2fetcher('curl_command')":<30} Convert curl command and make the request with Fetcher. (Optimized to handle curl commands copied from DevTools network tab.)
   - {"view(page)":<30} View page in a browser
   - {"help()":<30} Show this help message (Shell help)

Type 'exit' or press Ctrl+D to exit.
"""

    def update_page(self, result):  # pragma: no cover
        """Update the current page and add to pages history"""
        self.page = result
        if isinstance(result, (Response, Selector)):
            self.pages.append(result)
            if len(self.pages) > 5:
                self.pages.pop(0)  # Remove oldest item

        # Update in IPython namespace too
        if self.shell:
            self.shell.user_ns["page"] = self.page
            self.shell.user_ns["response"] = self.page
            self.shell.user_ns["pages"] = self.pages

        return result

    def create_wrapper(self, func):
        """Create a wrapper that preserves function signature but updates page"""

        @wraps(func)
        def wrapper(*args, **kwargs):
            result = func(*args, **kwargs)
            return self.update_page(result)

        return wrapper

    def get_namespace(self):
        """Create a namespace with application-specific objects"""

        # Create wrapped versions of fetch functions
        get = self.create_wrapper(Fetcher.get)
        post = self.create_wrapper(Fetcher.post)
        put = self.create_wrapper(Fetcher.put)
        delete = self.create_wrapper(Fetcher.delete)
        dynamic_fetch = self.create_wrapper(DynamicFetcher.fetch)
        stealthy_fetch = self.create_wrapper(StealthyFetcher.fetch)
        curl2fetcher = self.create_wrapper(self._curl_parser.convert2fetcher)

        # Create the namespace dictionary
        return {
            "get": get,
            "post": post,
            "put": put,
            "delete": delete,
            "Fetcher": Fetcher,
            "AsyncFetcher": AsyncFetcher,
            "fetch": dynamic_fetch,
            "DynamicFetcher": DynamicFetcher,
            "stealthy_fetch": stealthy_fetch,
            "StealthyFetcher": StealthyFetcher,
            "Selector": Selector,
            "page": self.page,
            "response": self.page,
            "pages": self.pages,
            "view": show_page_in_browser,
            "uncurl": self._curl_parser.parse,
            "curl2fetcher": curl2fetcher,
            "help": self.show_help,
        }

    def show_help(self):  # pragma: no cover
        """Show help information"""
        print(self.banner())

    def start(self):  # pragma: no cover
        """Start the interactive shell"""
        from IPython.terminal.embed import InteractiveShellEmbed

        # Get our namespace with application objects
        namespace = self.get_namespace()
        ipython_shell = InteractiveShellEmbed(
            banner1=self.banner(),
            banner2="",
            enable_tip=False,
            exit_msg="Bye Bye",
            user_ns=namespace,
        )
        self.shell = ipython_shell

        # If a command was provided, execute it and exit
        if self.code:
            log.info(f"Executing provided code: {self.code}")
            try:
                ipython_shell.run_cell(self.code, store_history=False)
            except Exception as e:
                log.error(f"Error executing initial code: {e}")
            return

        ipython_shell()


class Convertor:
    """Utils for the extract shell command"""

    _extension_map: Dict[str, extraction_types] = {
        "md": "markdown",
        "html": "html",
        "txt": "text",
    }

    @classmethod
    def _convert_to_markdown(cls, body: TextHandler) -> str:
        """Convert HTML content to Markdown"""
        from markdownify import markdownify

        return markdownify(body)

    @classmethod
    def _extract_content(
        cls,
        page: Selector,
        extraction_type: extraction_types = "markdown",
        css_selector: Optional[str] = None,
        main_content_only: bool = False,
    ) -> Generator[str, None, None]:
        """Extract the content of a Selector"""
        if not page or not isinstance(page, Selector):  # pragma: no cover
            raise TypeError("Input must be of type `Selector`")
        elif not extraction_type or extraction_type not in cls._extension_map.values():
            raise ValueError(f"Unknown extraction type: {extraction_type}")
        else:
            if main_content_only:
                page = page.css_first("body") or page

            pages = [page] if not css_selector else page.css(css_selector)
            for page in pages:
                match extraction_type:
                    case "markdown":
                        yield cls._convert_to_markdown(page.body)
                    case "html":
                        yield page.body
                    case "text":
                        txt_content = page.get_all_text(strip=True)
                        for s in (
                            "\n",
                            "\r",
                            "\t",
                            " ",
                        ):
                            # Remove consecutive white-spaces
                            txt_content = re_sub(f"[{s}]+", s, txt_content)
                        yield txt_content
                yield ""

    @classmethod
    def write_content_to_file(
        cls, page: Selector, filename: str, css_selector: Optional[str] = None
    ) -> None:
        """Write a Selector's content to a file"""
        if not page or not isinstance(page, Selector):  # pragma: no cover
            raise TypeError("Input must be of type `Selector`")
        elif not filename or not isinstance(filename, str) or not filename.strip():
            raise ValueError("Filename must be provided")
        elif not filename.endswith((".md", ".html", ".txt")):
            raise ValueError(
                "Unknown file type: filename must end with '.md', '.html', or '.txt'"
            )
        else:
            with open(filename, "w", encoding="utf-8") as f:
                extension = filename.split(".")[-1]
                f.write(
                    "".join(
                        cls._extract_content(
                            page,
                            cls._extension_map[extension],
                            css_selector=css_selector,
                        )
                    )
                )
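
For context, the new `scrapling.core.shell` module above supplies the helpers behind the interactive shell and extract commands: `CurlParser` turns a curl command (as copied from DevTools' network tab) into a `Request` namedtuple or a live `Fetcher` call via `convert2fetcher`, and `CustomShell` exposes those helpers inside an embedded IPython session. Below is a minimal sketch of using the parser directly, assuming scrapling 0.3.1 is installed; the URL, header values, and printed results are illustrative only, not taken from the package.

    from scrapling.core.shell import CurlParser

    parser = CurlParser()
    # A curl command shaped like DevTools' "Copy as cURL" output (example data)
    request = parser.parse(
        "curl 'https://example.com/api' "
        "-H 'accept: application/json' "
        "-H 'cookie: session=abc123' "
        "--data-raw '{\"q\": \"test\"}'"
    )
    if request is not None:
        print(request.method)     # "post" - inferred because a data payload is present
        print(request.headers)    # {'accept': 'application/json'}
        print(request.cookies)    # {'session': 'abc123'}
        print(request.json_data)  # {'q': 'test'} - the raw body was recognized as JSON
        # parser.convert2fetcher(request) would then perform the request via Fetcher.post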