scrapling-0.3.1-py3-none-any.whl → scrapling-0.3.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +1 -1
- scrapling/cli.py +38 -51
- scrapling/core/_html_utils.py +3 -9
- scrapling/core/ai.py +5 -13
- scrapling/core/custom_types.py +19 -61
- scrapling/core/mixins.py +6 -28
- scrapling/core/shell.py +49 -127
- scrapling/core/storage.py +2 -8
- scrapling/core/translator.py +8 -20
- scrapling/core/utils/__init__.py +10 -0
- scrapling/core/utils/_shell.py +48 -0
- scrapling/core/{utils.py → utils/_utils.py} +5 -21
- scrapling/engines/__init__.py +0 -16
- scrapling/engines/_browsers/_base.py +297 -0
- scrapling/engines/_browsers/_camoufox.py +219 -296
- scrapling/engines/_browsers/_config_tools.py +2 -1
- scrapling/engines/_browsers/_controllers.py +201 -281
- scrapling/engines/_browsers/_page.py +37 -15
- scrapling/engines/_browsers/_validators.py +9 -15
- scrapling/engines/constants.py +3 -6
- scrapling/engines/static.py +25 -75
- scrapling/engines/toolbelt/__init__.py +1 -20
- scrapling/engines/toolbelt/convertor.py +95 -86
- scrapling/engines/toolbelt/custom.py +7 -99
- scrapling/engines/toolbelt/fingerprints.py +1 -3
- scrapling/engines/toolbelt/navigation.py +4 -58
- scrapling/fetchers.py +29 -24
- scrapling/parser.py +45 -122
- {scrapling-0.3.1.dist-info → scrapling-0.3.3.dist-info}/METADATA +54 -46
- scrapling-0.3.3.dist-info/RECORD +44 -0
- scrapling-0.3.1.dist-info/RECORD +0 -41
- {scrapling-0.3.1.dist-info → scrapling-0.3.3.dist-info}/WHEEL +0 -0
- {scrapling-0.3.1.dist-info → scrapling-0.3.3.dist-info}/entry_points.txt +0 -0
- {scrapling-0.3.1.dist-info → scrapling-0.3.3.dist-info}/licenses/LICENSE +0 -0
- {scrapling-0.3.1.dist-info → scrapling-0.3.3.dist-info}/top_level.txt +0 -0
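To sanity-check a wheel-to-wheel diff like this one locally, here is a minimal sketch using only the standard library (it assumes both wheels were fetched first, e.g. with pip download scrapling==0.3.1 --no-deps and pip download scrapling==0.3.3 --no-deps into the current directory):

# Minimal sketch: compare the Python sources inside two downloaded wheels.
import difflib
import zipfile

OLD = "scrapling-0.3.1-py3-none-any.whl"
NEW = "scrapling-0.3.3-py3-none-any.whl"

with zipfile.ZipFile(OLD) as old, zipfile.ZipFile(NEW) as new:
    # Only files present in both wheels; added/removed files need separate handling.
    for name in sorted(set(old.namelist()) & set(new.namelist())):
        if not name.endswith(".py"):
            continue
        a = old.read(name).decode("utf-8").splitlines(keepends=True)
        b = new.read(name).decode("utf-8").splitlines(keepends=True)
        for line in difflib.unified_diff(a, b, fromfile=name, tofile=name):
            print(line, end="")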
scrapling/core/shell.py
CHANGED
@@ -2,7 +2,6 @@
 from re import sub as re_sub
 from sys import stderr
 from functools import wraps
-from http import cookies as Cookie
 from collections import namedtuple
 from shlex import split as shlex_split
 from tempfile import mkstemp as make_temp_file
@@ -23,25 +22,17 @@ from logging import (
 from orjson import loads as json_loads, JSONDecodeError

 from scrapling import __version__
-from scrapling.core.custom_types import TextHandler
-from scrapling.core.utils import log
 from scrapling.parser import Selector, Selectors
+from scrapling.core.custom_types import TextHandler
+from scrapling.engines.toolbelt.custom import Response
+from scrapling.core.utils import log, _ParseHeaders, _CookieParser
 from scrapling.core._types import (
-    List,
     Optional,
     Dict,
-    Tuple,
     Any,
     extraction_types,
     Generator,
 )
-from scrapling.fetchers import (
-    Fetcher,
-    AsyncFetcher,
-    DynamicFetcher,
-    StealthyFetcher,
-    Response,
-)


 _known_logging_levels = {
@@ -71,54 +62,6 @@ Request = namedtuple(
 )


-def _CookieParser(cookie_string):
-    # Errors will be handled on call so the log can be specified
-    cookie_parser = Cookie.SimpleCookie()
-    cookie_parser.load(cookie_string)
-    for key, morsel in cookie_parser.items():
-        yield key, morsel.value
-
-
-def _ParseHeaders(
-    header_lines: List[str], parse_cookies: bool = True
-) -> Tuple[Dict[str, str], Dict[str, str]]:
-    """Parses headers into separate header and cookie dictionaries."""
-    header_dict = dict()
-    cookie_dict = dict()
-
-    for header_line in header_lines:
-        if ":" not in header_line:
-            if header_line.endswith(";"):
-                header_key = header_line[:-1].strip()
-                header_value = ""
-                header_dict[header_key] = header_value
-            else:
-                raise ValueError(
-                    f"Could not parse header without colon: '{header_line}'."
-                )
-        else:
-            header_key, header_value = header_line.split(":", 1)
-            header_key = header_key.strip()
-            header_value = header_value.strip()
-
-            if parse_cookies:
-                if header_key.lower() == "cookie":
-                    try:
-                        cookie_dict = {
-                            key: value for key, value in _CookieParser(header_value)
-                        }
-                    except Exception as e:  # pragma: no cover
-                        raise ValueError(
-                            f"Could not parse cookie string from header '{header_value}': {e}"
-                        )
-                else:
-                    header_dict[header_key] = header_value
-            else:
-                header_dict[header_key] = header_value
-
-    return header_dict, cookie_dict
-
-
 # Suppress exit on error to handle parsing errors gracefully
 class NoExitArgumentParser(ArgumentParser):  # pragma: no cover
     def error(self, message):
@@ -129,15 +72,16 @@ class NoExitArgumentParser(ArgumentParser):  # pragma: no cover
         if message:
             log.error(f"Scrapling shell exited with status {status}: {message}")
             self._print_message(message, stderr)
-        raise ValueError(
-            f"Scrapling shell exited with status {status}: {message or 'Unknown reason'}"
-        )
+        raise ValueError(f"Scrapling shell exited with status {status}: {message or 'Unknown reason'}")


 class CurlParser:
     """Builds the argument parser for relevant curl flags from DevTools."""

     def __init__(self):
+        from scrapling.fetchers import Fetcher as __Fetcher
+
+        self.__fetcher = __Fetcher
         # We will use argparse parser to parse the curl command directly instead of regex
         # We will focus more on flags that will show up on curl commands copied from DevTools's network tab
         _parser = NoExitArgumentParser(add_help=False)  # Disable default help
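The hunk above moves the Fetcher import from module level into __init__. The usual motive for this pattern, and a plausible one here given that scrapling.fetchers pulls in the engine modules, is breaking an import cycle at module-load time; a minimal sketch of the pattern (class and method names are illustrative, not from the package):

# Deferred-import sketch: the heavy or circular dependency is resolved when an
# instance is created instead of when this module is first imported.
class LazyConsumer:
    def __init__(self):
        from scrapling.fetchers import Fetcher  # resolved on first instantiation

        self._fetcher = Fetcher

    def get(self, url: str):
        # Delegates to the real Fetcher only after both modules are fully loaded
        return self._fetcher.get(url)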
@@ -152,15 +96,11 @@ class CurlParser:

         # Data arguments (prioritizing types common from DevTools)
         _parser.add_argument("-d", "--data", default=None)
-        _parser.add_argument(
-            "--data-raw", default=None
-        )  # Often used by browsers for JSON body
+        _parser.add_argument("--data-raw", default=None)  # Often used by browsers for JSON body
         _parser.add_argument("--data-binary", default=None)
         # Keep urlencode for completeness, though less common from browser copy/paste
         _parser.add_argument("--data-urlencode", action="append", default=[])
-        _parser.add_argument(
-            "-G", "--get", action="store_true"
-        )  # Use GET and put data in URL
+        _parser.add_argument("-G", "--get", action="store_true")  # Use GET and put data in URL

         _parser.add_argument(
             "-b",
@@ -175,9 +115,7 @@ class CurlParser:

         # Connection/Security
         _parser.add_argument("-k", "--insecure", action="store_true")
-        _parser.add_argument(
-            "--compressed", action="store_true"
-        )  # Very common from browsers
+        _parser.add_argument("--compressed", action="store_true")  # Very common from browsers

         # Other flags often included but may not map directly to request args
         _parser.add_argument("-i", "--include", action="store_true")
@@ -194,9 +132,7 @@ class CurlParser:
         clean_command = curl_command.strip().lstrip("curl").strip().replace("\\\n", " ")

         try:
-            tokens = shlex_split(
-                clean_command
-            )  # Split the string using shell-like syntax
+            tokens = shlex_split(clean_command)  # Split the string using shell-like syntax
         except ValueError as e:  # pragma: no cover
             log.error(f"Could not split command line: {e}")
             return None
@@ -213,9 +149,7 @@ class CurlParser:
             raise

         except Exception as e:  # pragma: no cover
-            log.error(
-                f"An unexpected error occurred during curl arguments parsing: {e}"
-            )
+            log.error(f"An unexpected error occurred during curl arguments parsing: {e}")
             return None

         # --- Determine Method ---
@@ -247,9 +181,7 @@ class CurlParser:
                     cookies[key] = value
                 log.debug(f"Parsed cookies from -b argument: {list(cookies.keys())}")
             except Exception as e:  # pragma: no cover
-                log.error(
-                    f"Could not parse cookie string from -b '{parsed_args.cookie}': {e}"
-                )
+                log.error(f"Could not parse cookie string from -b '{parsed_args.cookie}': {e}")

         # --- Process Data Payload ---
         params = dict()
@@ -280,9 +212,7 @@ class CurlParser:
             try:
                 data_payload = dict(parse_qsl(combined_data, keep_blank_values=True))
             except Exception as e:
-                log.warning(
-                    f"Could not parse urlencoded data '{combined_data}': {e}. Treating as raw string."
-                )
+                log.warning(f"Could not parse urlencoded data '{combined_data}': {e}. Treating as raw string.")
                 data_payload = combined_data

         # Check if raw data looks like JSON, prefer 'json' param if so
@@ -303,9 +233,7 @@ class CurlParser:
            try:
                 params.update(dict(parse_qsl(data_payload, keep_blank_values=True)))
             except ValueError:
-                log.warning(
-                    f"Could not parse data '{data_payload}' into GET parameters for -G."
-                )
+                log.warning(f"Could not parse data '{data_payload}' into GET parameters for -G.")

         if params:
             data_payload = None  # Clear data as it's moved to params
@@ -314,21 +242,13 @@ class CurlParser:
         # --- Process Proxy ---
         proxies: Optional[Dict[str, str]] = None
         if parsed_args.proxy:
-            proxy_url = (
-                f"http://{parsed_args.proxy}"
-                if "://" not in parsed_args.proxy
-                else parsed_args.proxy
-            )
+            proxy_url = f"http://{parsed_args.proxy}" if "://" not in parsed_args.proxy else parsed_args.proxy

             if parsed_args.proxy_user:
                 user_pass = parsed_args.proxy_user
                 parts = urlparse(proxy_url)
                 netloc_parts = parts.netloc.split("@")
-                netloc = (
-                    f"{user_pass}@{netloc_parts[-1]}"
-                    if len(netloc_parts) > 1
-                    else f"{user_pass}@{parts.netloc}"
-                )
+                netloc = f"{user_pass}@{netloc_parts[-1]}" if len(netloc_parts) > 1 else f"{user_pass}@{parts.netloc}"
                 proxy_url = urlunparse(
                     (
                         parts.scheme,
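Collapsed formatting aside, the proxy logic is unchanged; a standalone sketch of what it computes (the function name is illustrative, not from the package):

# Default to http:// when no scheme is given, then inject user:pass credentials
# into the netloc, replacing any credentials already present in the URL.
from urllib.parse import urlparse, urlunparse

def normalize_proxy(proxy: str, proxy_user: str | None = None) -> str:
    proxy_url = f"http://{proxy}" if "://" not in proxy else proxy
    if proxy_user:
        parts = urlparse(proxy_url)
        netloc_parts = parts.netloc.split("@")
        netloc = f"{proxy_user}@{netloc_parts[-1]}" if len(netloc_parts) > 1 else f"{proxy_user}@{parts.netloc}"
        proxy_url = urlunparse((parts.scheme, netloc, parts.path, parts.params, parts.query, parts.fragment))
    return proxy_url

assert normalize_proxy("127.0.0.1:8080") == "http://127.0.0.1:8080"
assert normalize_proxy("http://old:creds@proxy.example:3128", "user:pass") == "http://user:pass@proxy.example:3128"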
@@ -359,11 +279,7 @@ class CurlParser:

     def convert2fetcher(self, curl_command: Request | str) -> Optional[Response]:
         if isinstance(curl_command, (Request, str)):
-            request = (
-                self.parse(curl_command)
-                if isinstance(curl_command, str)
-                else curl_command
-            )
+            request = self.parse(curl_command) if isinstance(curl_command, str) else curl_command

             # Ensure request parsing was successful before proceeding
             if request is None:  # pragma: no cover
@@ -381,14 +297,12 @@ class CurlParser:
                _ = request_args.pop("json", None)

                try:
-                    return getattr(Fetcher, method)(**request_args)
+                    return getattr(self.__fetcher, method)(**request_args)
                except Exception as e:  # pragma: no cover
                    log.error(f"Error calling Fetcher.{method}: {e}")
                    return None
            else:  # pragma: no cover
-                log.error(
-                    f'Request method "{method}" isn\'t supported by Scrapling yet'
-                )
+                log.error(f'Request method "{method}" isn\'t supported by Scrapling yet')
                return None

        else:  # pragma: no cover
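Taken together, parse() and convert2fetcher() turn a curl command copied from DevTools into a live Fetcher call; a usage sketch (the target URL is illustrative and network access is assumed):

# Hypothetical round trip: DevTools curl command -> parsed Request -> Fetcher call.
from scrapling.core.shell import CurlParser

parser = CurlParser()
page = parser.convert2fetcher(
    "curl 'https://httpbin.org/post' --data-raw '{\"key\": \"value\"}' --compressed"
)
print(page.status if page is not None else "parsing failed")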
@@ -403,7 +317,7 @@ def show_page_in_browser(page: Selector):  # pragma: no cover

     try:
         fd, fname = make_temp_file(prefix="scrapling_view_", suffix=".html")
-        with open(fd, "w", encoding=
+        with open(fd, "w", encoding=page.encoding) as f:
             f.write(page.body)

         open_in_browser(f"file://{fname}")
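The one-line change above writes the temp file with the page's own character set instead of a fixed one; a small illustration of why that matters for non-UTF-8 pages (values are illustrative):

# Round-tripping text with the page's own charset is lossless; forcing an
# unrelated charset on the same text can fail outright.
text = "Tipo de cambio: 17½ ¢"  # e.g. decoded from a latin-1 page
assert text.encode("latin-1").decode("latin-1") == text
try:
    text.encode("ascii")  # a wrong hard-coded charset raises here
except UnicodeEncodeError as exc:
    print("lossy write avoided:", exc)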
@@ -417,6 +331,19 @@ class CustomShell:
     """A custom IPython shell with minimal dependencies"""

     def __init__(self, code, log_level="debug"):
+        from IPython.terminal.embed import InteractiveShellEmbed as __InteractiveShellEmbed
+        from scrapling.fetchers import (
+            Fetcher as __Fetcher,
+            AsyncFetcher as __AsyncFetcher,
+            DynamicFetcher as __DynamicFetcher,
+            StealthyFetcher as __StealthyFetcher,
+        )
+
+        self.__InteractiveShellEmbed = __InteractiveShellEmbed
+        self.__Fetcher = __Fetcher
+        self.__AsyncFetcher = __AsyncFetcher
+        self.__DynamicFetcher = __DynamicFetcher
+        self.__StealthyFetcher = __StealthyFetcher
         self.code = code
         self.page = None
         self.pages = Selectors([])
@@ -440,7 +367,7 @@ class CustomShell:
         if self.log_level:
             getLogger("scrapling").setLevel(self.log_level)

-        settings = Fetcher.display_config()
+        settings = self.__Fetcher.display_config()
         settings.pop("storage", None)
         settings.pop("storage_args", None)
         log.info(f"Scrapling {__version__} shell started")
@@ -506,12 +433,12 @@ Type 'exit' or press Ctrl+D to exit.
         """Create a namespace with application-specific objects"""

         # Create wrapped versions of fetch functions
-        get = self.create_wrapper(Fetcher.get)
-        post = self.create_wrapper(Fetcher.post)
-        put = self.create_wrapper(Fetcher.put)
-        delete = self.create_wrapper(Fetcher.delete)
-        dynamic_fetch = self.create_wrapper(DynamicFetcher.fetch)
-        stealthy_fetch = self.create_wrapper(StealthyFetcher.fetch)
+        get = self.create_wrapper(self.__Fetcher.get)
+        post = self.create_wrapper(self.__Fetcher.post)
+        put = self.create_wrapper(self.__Fetcher.put)
+        delete = self.create_wrapper(self.__Fetcher.delete)
+        dynamic_fetch = self.create_wrapper(self.__DynamicFetcher.fetch)
+        stealthy_fetch = self.create_wrapper(self.__StealthyFetcher.fetch)
         curl2fetcher = self.create_wrapper(self._curl_parser.convert2fetcher)

         # Create the namespace dictionary
@@ -520,12 +447,12 @@ Type 'exit' or press Ctrl+D to exit.
             "post": post,
             "put": put,
             "delete": delete,
-            "Fetcher": Fetcher,
-            "AsyncFetcher": AsyncFetcher,
+            "Fetcher": self.__Fetcher,
+            "AsyncFetcher": self.__AsyncFetcher,
             "fetch": dynamic_fetch,
-            "DynamicFetcher": DynamicFetcher,
+            "DynamicFetcher": self.__DynamicFetcher,
             "stealthy_fetch": stealthy_fetch,
-            "StealthyFetcher": StealthyFetcher,
+            "StealthyFetcher": self.__StealthyFetcher,
             "Selector": Selector,
             "page": self.page,
             "response": self.page,
@@ -542,11 +469,10 @@ Type 'exit' or press Ctrl+D to exit.

     def start(self):  # pragma: no cover
         """Start the interactive shell"""
-        from IPython.terminal.embed import InteractiveShellEmbed

         # Get our namespace with application objects
         namespace = self.get_namespace()
-        ipython_shell = InteractiveShellEmbed(
+        ipython_shell = self.__InteractiveShellEmbed(
             banner1=self.banner(),
             banner2="",
             enable_tip=False,
@@ -621,20 +547,16 @@ class Convertor:
             yield ""

     @classmethod
-    def write_content_to_file(
-        cls, page: Selector, filename: str, css_selector: Optional[str] = None
-    ) -> None:
+    def write_content_to_file(cls, page: Selector, filename: str, css_selector: Optional[str] = None) -> None:
         """Write a Selector's content to a file"""
         if not page or not isinstance(page, Selector):  # pragma: no cover
             raise TypeError("Input must be of type `Selector`")
         elif not filename or not isinstance(filename, str) or not filename.strip():
             raise ValueError("Filename must be provided")
         elif not filename.endswith((".md", ".html", ".txt")):
-            raise ValueError(
-                "Unknown file type: filename must end with '.md', '.html', or '.txt'"
-            )
+            raise ValueError("Unknown file type: filename must end with '.md', '.html', or '.txt'")
         else:
-            with open(filename, "w", encoding=
+            with open(filename, "w", encoding=page.encoding) as f:
                 extension = filename.split(".")[-1]
                 f.write(
                     "".join(
scrapling/core/storage.py
CHANGED
@@ -27,11 +27,7 @@ class StorageSystemMixin(ABC):  # pragma: no cover

         try:
             extracted = tld(self.url)
-            return (
-                extracted.top_domain_under_public_suffix
-                or extracted.domain
-                or default_value
-            )
+            return extracted.top_domain_under_public_suffix or extracted.domain or default_value
         except AttributeError:
             return default_value

@@ -90,9 +86,7 @@ class SQLiteStorageSystem(StorageSystemMixin):
         self.connection.execute("PRAGMA journal_mode=WAL")
         self.cursor = self.connection.cursor()
         self._setup_database()
-        log.debug(
-            f'Storage system loaded with arguments (storage_file="{storage_file}", url="{url}")'
-        )
+        log.debug(f'Storage system loaded with arguments (storage_file="{storage_file}", url="{url}")')

     def _setup_database(self) -> None:
         self.cursor.execute("""
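The collapsed return in the first hunk keeps the same three-step fallback; a standalone sketch, assuming tld is an alias for tldextract.extract (the aliasing is an assumption; this diff does not show it):

from tldextract import extract as tld  # assumption: matches scrapling's alias

extracted = tld("https://shop.example.co.uk/path")
# Falls back from the registrable domain to the bare domain to a default.
print(extracted.top_domain_under_public_suffix or extracted.domain or "default")
# -> "example.co.uk"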
scrapling/core/translator.py
CHANGED
@@ -10,10 +10,10 @@ So you don't have to learn a new selectors/api method like what bs4 done with so

 from functools import lru_cache

-from cssselect import HTMLTranslator as OriginalHTMLTranslator
-from cssselect.parser import Element, FunctionalPseudoElement, PseudoElement
 from cssselect.xpath import ExpressionError
 from cssselect.xpath import XPathExpr as OriginalXPathExpr
+from cssselect import HTMLTranslator as OriginalHTMLTranslator
+from cssselect.parser import Element, FunctionalPseudoElement, PseudoElement

 from scrapling.core._types import Any, Optional, Protocol, Self

@@ -89,9 +89,7 @@ class TranslatorMixin:
         xpath = super().xpath_element(selector)  # type: ignore[safe-super]
         return XPathExpr.from_xpath(xpath)

-    def xpath_pseudo_element(
-        self, xpath: OriginalXPathExpr, pseudo_element: PseudoElement
-    ) -> OriginalXPathExpr:
+    def xpath_pseudo_element(self, xpath: OriginalXPathExpr, pseudo_element: PseudoElement) -> OriginalXPathExpr:
         """
         Dispatch method that transforms XPath to support the pseudo-element.
         """
@@ -99,31 +97,21 @@ class TranslatorMixin:
             method_name = f"xpath_{pseudo_element.name.replace('-', '_')}_functional_pseudo_element"
             method = getattr(self, method_name, None)
             if not method:  # pragma: no cover
-                raise ExpressionError(
-                    f"The functional pseudo-element ::{pseudo_element.name}() is unknown"
-                )
+                raise ExpressionError(f"The functional pseudo-element ::{pseudo_element.name}() is unknown")
             xpath = method(xpath, pseudo_element)
         else:
-            method_name = (
-                f"xpath_{pseudo_element.replace('-', '_')}_simple_pseudo_element"
-            )
+            method_name = f"xpath_{pseudo_element.replace('-', '_')}_simple_pseudo_element"
             method = getattr(self, method_name, None)
             if not method:  # pragma: no cover
-                raise ExpressionError(
-                    f"The pseudo-element ::{pseudo_element} is unknown"
-                )
+                raise ExpressionError(f"The pseudo-element ::{pseudo_element} is unknown")
             xpath = method(xpath)
         return xpath

     @staticmethod
-    def xpath_attr_functional_pseudo_element(
-        xpath: OriginalXPathExpr, function: FunctionalPseudoElement
-    ) -> XPathExpr:
+    def xpath_attr_functional_pseudo_element(xpath: OriginalXPathExpr, function: FunctionalPseudoElement) -> XPathExpr:
         """Support selecting attribute values using ::attr() pseudo-element"""
         if function.argument_types() not in (["STRING"], ["IDENT"]):  # pragma: no cover
-            raise ExpressionError(
-                f"Expected a single string or ident for ::attr(), got {function.arguments!r}"
-            )
+            raise ExpressionError(f"Expected a single string or ident for ::attr(), got {function.arguments!r}")
         return XPathExpr.from_xpath(xpath, attribute=function.arguments[0].value)

     @staticmethod
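The reflowed xpath_attr_functional_pseudo_element above is what powers the ::attr() CSS extension; from the public API it is used like this (URL illustrative, network access assumed):

from scrapling.fetchers import Fetcher

page = Fetcher.get("https://example.com")
# ::attr(href) selects the attribute values themselves rather than the elements.
links = page.css("a::attr(href)")
print(links)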
scrapling/core/utils/_shell.py
ADDED
@@ -0,0 +1,48 @@
+from http import cookies as Cookie
+
+
+from scrapling.core._types import (
+    List,
+    Dict,
+    Tuple,
+)
+
+
+def _CookieParser(cookie_string):
+    # Errors will be handled on call so the log can be specified
+    cookie_parser = Cookie.SimpleCookie()
+    cookie_parser.load(cookie_string)
+    for key, morsel in cookie_parser.items():
+        yield key, morsel.value
+
+
+def _ParseHeaders(header_lines: List[str], parse_cookies: bool = True) -> Tuple[Dict[str, str], Dict[str, str]]:
+    """Parses headers into separate header and cookie dictionaries."""
+    header_dict = dict()
+    cookie_dict = dict()
+
+    for header_line in header_lines:
+        if ":" not in header_line:
+            if header_line.endswith(";"):
+                header_key = header_line[:-1].strip()
+                header_value = ""
+                header_dict[header_key] = header_value
+            else:
+                raise ValueError(f"Could not parse header without colon: '{header_line}'.")
+        else:
+            header_key, header_value = header_line.split(":", 1)
+            header_key = header_key.strip()
+            header_value = header_value.strip()
+
+            if parse_cookies:
+                if header_key.lower() == "cookie":
+                    try:
+                        cookie_dict = {key: value for key, value in _CookieParser(header_value)}
+                    except Exception as e:  # pragma: no cover
+                        raise ValueError(f"Could not parse cookie string from header '{header_value}': {e}")
+                else:
+                    header_dict[header_key] = header_value
+            else:
+                header_dict[header_key] = header_value
+
+    return header_dict, cookie_dict
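A usage sketch for the relocated helpers, using the import path shell.py now uses per the hunks above:

from scrapling.core.utils import _ParseHeaders

headers, cookies = _ParseHeaders(
    ["Accept: text/html", "Cookie: session=abc123; theme=dark"]
)
print(headers)  # {'Accept': 'text/html'}
print(cookies)  # {'session': 'abc123', 'theme': 'dark'}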
scrapling/core/{utils.py → utils/_utils.py}
RENAMED
@@ -24,9 +24,7 @@ def setup_logger():
     logger = logging.getLogger("scrapling")
     logger.setLevel(logging.INFO)

-    formatter = logging.Formatter(
-        fmt="[%(asctime)s] %(levelname)s: %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
-    )
+    formatter = logging.Formatter(fmt="[%(asctime)s] %(levelname)s: %(message)s", datefmt="%Y-%m-%d %H:%M:%S")

     console_handler = logging.StreamHandler()
     console_handler.setFormatter(formatter)
@@ -61,11 +59,7 @@ class _StorageTools:
     def __clean_attributes(element: html.HtmlElement, forbidden: tuple = ()) -> Dict:
         if not element.attrib:
             return {}
-        return {
-            k: v.strip()
-            for k, v in element.attrib.items()
-            if v and v.strip() and k not in forbidden
-        }
+        return {k: v.strip() for k, v in element.attrib.items() if v and v.strip() and k not in forbidden}

     @classmethod
     def element_to_dict(cls, element: html.HtmlElement) -> Dict:
@@ -85,17 +79,11 @@ class _StorageTools:
             }
         )

-        siblings = [
-            child.tag for child in parent.iterchildren() if child != element
-        ]
+        siblings = [child.tag for child in parent.iterchildren() if child != element]
         if siblings:
             result.update({"siblings": tuple(siblings)})

-        children = [
-            child.tag
-            for child in element.iterchildren()
-            if not isinstance(child, html_forbidden)
-        ]
+        children = [child.tag for child in element.iterchildren() if not isinstance(child, html_forbidden)]
         if children:
             result.update({"children": tuple(children)})

@@ -104,11 +92,7 @@ class _StorageTools:
     @classmethod
     def _get_element_path(cls, element: html.HtmlElement):
         parent = element.getparent()
-        return tuple(
-            (element.tag,)
-            if parent is None
-            else (cls._get_element_path(parent) + (element.tag,))
-        )
+        return tuple((element.tag,) if parent is None else (cls._get_element_path(parent) + (element.tag,)))


 @lru_cache(128, typed=True)
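The last hunk's one-liner computes the root-to-element tag path recursively; a standalone illustration with lxml (the helper name is illustrative):

from lxml import html

def element_path(element) -> tuple:
    # Recurse upward until the root, accumulating tags on the way back down.
    parent = element.getparent()
    return (element.tag,) if parent is None else element_path(parent) + (element.tag,)

doc = html.fromstring("<html><body><div><p>hi</p></div></body></html>")
print(element_path(doc.find(".//div/p")))  # ('html', 'body', 'div', 'p')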
scrapling/engines/__init__.py
CHANGED
@@ -1,16 +0,0 @@
-from .constants import DEFAULT_DISABLED_RESOURCES, DEFAULT_STEALTH_FLAGS, DEFAULT_FLAGS
-from .static import FetcherSession, FetcherClient, AsyncFetcherClient
-from ._browsers import (
-    DynamicSession,
-    AsyncDynamicSession,
-    StealthySession,
-    AsyncStealthySession,
-)
-
-__all__ = [
-    "FetcherSession",
-    "DynamicSession",
-    "AsyncDynamicSession",
-    "StealthySession",
-    "AsyncStealthySession",
-]
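Emptying this __init__.py is a breaking change for anything that imported the session classes from the package root; going by the removed re-exports above, such imports should now target the defining submodules:

# Before (0.3.1): from scrapling.engines import FetcherSession, DynamicSession
# After (0.3.3), per the removed lines above:
from scrapling.engines.static import FetcherSession
from scrapling.engines._browsers import DynamicSession, StealthySession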