scrapling-0.3-py3-none-any.whl → scrapling-0.3.2-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- scrapling/__init__.py +1 -1
- scrapling/cli.py +38 -51
- scrapling/core/_html_utils.py +3 -9
- scrapling/core/ai.py +5 -13
- scrapling/core/custom_types.py +19 -61
- scrapling/core/mixins.py +6 -28
- scrapling/core/shell.py +51 -129
- scrapling/core/storage.py +2 -8
- scrapling/core/translator.py +8 -20
- scrapling/core/utils/__init__.py +10 -0
- scrapling/core/utils/_shell.py +48 -0
- scrapling/core/{utils.py → utils/_utils.py} +5 -21
- scrapling/engines/__init__.py +0 -16
- scrapling/engines/_browsers/_base.py +297 -0
- scrapling/engines/_browsers/_camoufox.py +238 -293
- scrapling/engines/_browsers/_config_tools.py +2 -1
- scrapling/engines/_browsers/_controllers.py +220 -278
- scrapling/engines/_browsers/_page.py +37 -15
- scrapling/engines/_browsers/_validators.py +29 -15
- scrapling/engines/constants.py +3 -6
- scrapling/engines/static.py +25 -75
- scrapling/engines/toolbelt/__init__.py +1 -20
- scrapling/engines/toolbelt/convertor.py +95 -86
- scrapling/engines/toolbelt/custom.py +7 -99
- scrapling/engines/toolbelt/fingerprints.py +1 -3
- scrapling/engines/toolbelt/navigation.py +4 -58
- scrapling/fetchers.py +41 -24
- scrapling/parser.py +45 -122
- {scrapling-0.3.dist-info → scrapling-0.3.2.dist-info}/METADATA +57 -47
- scrapling-0.3.2.dist-info/RECORD +44 -0
- scrapling-0.3.dist-info/RECORD +0 -41
- {scrapling-0.3.dist-info → scrapling-0.3.2.dist-info}/WHEEL +0 -0
- {scrapling-0.3.dist-info → scrapling-0.3.2.dist-info}/entry_points.txt +0 -0
- {scrapling-0.3.dist-info → scrapling-0.3.2.dist-info}/licenses/LICENSE +0 -0
- {scrapling-0.3.dist-info → scrapling-0.3.2.dist-info}/top_level.txt +0 -0
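The list above shows `scrapling/core/utils.py` being split into a package: the old module becomes `utils/_utils.py`, the shell-only helpers move into a new `utils/_shell.py`, and a small `utils/__init__.py` ties them together. Judging from the `shell.py` hunks below (which import `log`, `_ParseHeaders`, and `_CookieParser` from `scrapling.core.utils`), the package `__init__` re-exports the old names so existing imports keep working. A hypothetical sketch of such a re-exporting `__init__.py` follows; the real file's contents are not shown in this diff, only its `+10` line count:

```python
# Hypothetical sketch of scrapling/core/utils/__init__.py after the split.
# The actual file is not included in this diff; only its line count (+10) is.
from ._utils import log, setup_logger
from ._shell import _CookieParser, _ParseHeaders
```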
scrapling/core/shell.py
CHANGED
```diff
@@ -2,7 +2,6 @@
 from re import sub as re_sub
 from sys import stderr
 from functools import wraps
-from http import cookies as Cookie
 from collections import namedtuple
 from shlex import split as shlex_split
 from tempfile import mkstemp as make_temp_file
@@ -20,29 +19,20 @@ from logging import (
     getLevelName,
 )

-from IPython.terminal.embed import InteractiveShellEmbed
 from orjson import loads as json_loads, JSONDecodeError

 from scrapling import __version__
-from scrapling.core.custom_types import TextHandler
-from scrapling.core.utils import log
 from scrapling.parser import Selector, Selectors
+from scrapling.core.custom_types import TextHandler
+from scrapling.engines.toolbelt.custom import Response
+from scrapling.core.utils import log, _ParseHeaders, _CookieParser
 from scrapling.core._types import (
-    List,
     Optional,
     Dict,
-    Tuple,
     Any,
     extraction_types,
     Generator,
 )
-from scrapling.fetchers import (
-    Fetcher,
-    AsyncFetcher,
-    DynamicFetcher,
-    StealthyFetcher,
-    Response,
-)


 _known_logging_levels = {
@@ -72,54 +62,6 @@ Request = namedtuple(
 )


-def _CookieParser(cookie_string):
-    # Errors will be handled on call so the log can be specified
-    cookie_parser = Cookie.SimpleCookie()
-    cookie_parser.load(cookie_string)
-    for key, morsel in cookie_parser.items():
-        yield key, morsel.value
-
-
-def _ParseHeaders(
-    header_lines: List[str], parse_cookies: bool = True
-) -> Tuple[Dict[str, str], Dict[str, str]]:
-    """Parses headers into separate header and cookie dictionaries."""
-    header_dict = dict()
-    cookie_dict = dict()
-
-    for header_line in header_lines:
-        if ":" not in header_line:
-            if header_line.endswith(";"):
-                header_key = header_line[:-1].strip()
-                header_value = ""
-                header_dict[header_key] = header_value
-            else:
-                raise ValueError(
-                    f"Could not parse header without colon: '{header_line}'."
-                )
-        else:
-            header_key, header_value = header_line.split(":", 1)
-            header_key = header_key.strip()
-            header_value = header_value.strip()
-
-            if parse_cookies:
-                if header_key.lower() == "cookie":
-                    try:
-                        cookie_dict = {
-                            key: value for key, value in _CookieParser(header_value)
-                        }
-                    except Exception as e:  # pragma: no cover
-                        raise ValueError(
-                            f"Could not parse cookie string from header '{header_value}': {e}"
-                        )
-                else:
-                    header_dict[header_key] = header_value
-            else:
-                header_dict[header_key] = header_value
-
-    return header_dict, cookie_dict
-
-
 # Suppress exit on error to handle parsing errors gracefully
 class NoExitArgumentParser(ArgumentParser):  # pragma: no cover
     def error(self, message):
@@ -130,15 +72,16 @@ class NoExitArgumentParser(ArgumentParser):  # pragma: no cover
         if message:
             log.error(f"Scrapling shell exited with status {status}: {message}")
             self._print_message(message, stderr)
-        raise ValueError(
-            f"Scrapling shell exited with status {status}: {message or 'Unknown reason'}"
-        )
+        raise ValueError(f"Scrapling shell exited with status {status}: {message or 'Unknown reason'}")


 class CurlParser:
     """Builds the argument parser for relevant curl flags from DevTools."""

     def __init__(self):
+        from scrapling.fetchers import Fetcher as __Fetcher
+
+        self.__fetcher = __Fetcher
         # We will use argparse parser to parse the curl command directly instead of regex
         # We will focus more on flags that will show up on curl commands copied from DevTools's network tab
         _parser = NoExitArgumentParser(add_help=False)  # Disable default help
@@ -153,15 +96,11 @@ class CurlParser:

         # Data arguments (prioritizing types common from DevTools)
         _parser.add_argument("-d", "--data", default=None)
-        _parser.add_argument(
-            "--data-raw", default=None
-        )  # Often used by browsers for JSON body
+        _parser.add_argument("--data-raw", default=None)  # Often used by browsers for JSON body
         _parser.add_argument("--data-binary", default=None)
         # Keep urlencode for completeness, though less common from browser copy/paste
         _parser.add_argument("--data-urlencode", action="append", default=[])
-        _parser.add_argument(
-            "-G", "--get", action="store_true"
-        )  # Use GET and put data in URL
+        _parser.add_argument("-G", "--get", action="store_true")  # Use GET and put data in URL

         _parser.add_argument(
             "-b",
@@ -176,9 +115,7 @@ class CurlParser:

         # Connection/Security
         _parser.add_argument("-k", "--insecure", action="store_true")
-        _parser.add_argument(
-            "--compressed", action="store_true"
-        )  # Very common from browsers
+        _parser.add_argument("--compressed", action="store_true")  # Very common from browsers

         # Other flags often included but may not map directly to request args
         _parser.add_argument("-i", "--include", action="store_true")
@@ -195,9 +132,7 @@ class CurlParser:
         clean_command = curl_command.strip().lstrip("curl").strip().replace("\\\n", " ")

         try:
-            tokens = shlex_split(
-                clean_command
-            )  # Split the string using shell-like syntax
+            tokens = shlex_split(clean_command)  # Split the string using shell-like syntax
         except ValueError as e:  # pragma: no cover
             log.error(f"Could not split command line: {e}")
             return None
@@ -214,9 +149,7 @@ class CurlParser:
             raise

         except Exception as e:  # pragma: no cover
-            log.error(
-                f"An unexpected error occurred during curl arguments parsing: {e}"
-            )
+            log.error(f"An unexpected error occurred during curl arguments parsing: {e}")
             return None

         # --- Determine Method ---
@@ -248,9 +181,7 @@ class CurlParser:
                     cookies[key] = value
                 log.debug(f"Parsed cookies from -b argument: {list(cookies.keys())}")
             except Exception as e:  # pragma: no cover
-                log.error(
-                    f"Could not parse cookie string from -b '{parsed_args.cookie}': {e}"
-                )
+                log.error(f"Could not parse cookie string from -b '{parsed_args.cookie}': {e}")

         # --- Process Data Payload ---
         params = dict()
@@ -281,9 +212,7 @@ class CurlParser:
             try:
                 data_payload = dict(parse_qsl(combined_data, keep_blank_values=True))
             except Exception as e:
-                log.warning(
-                    f"Could not parse urlencoded data '{combined_data}': {e}. Treating as raw string."
-                )
+                log.warning(f"Could not parse urlencoded data '{combined_data}': {e}. Treating as raw string.")
                 data_payload = combined_data

         # Check if raw data looks like JSON, prefer 'json' param if so
@@ -304,9 +233,7 @@ class CurlParser:
             try:
                 params.update(dict(parse_qsl(data_payload, keep_blank_values=True)))
             except ValueError:
-                log.warning(
-                    f"Could not parse data '{data_payload}' into GET parameters for -G."
-                )
+                log.warning(f"Could not parse data '{data_payload}' into GET parameters for -G.")

         if params:
             data_payload = None  # Clear data as it's moved to params
@@ -315,21 +242,13 @@ class CurlParser:
         # --- Process Proxy ---
         proxies: Optional[Dict[str, str]] = None
         if parsed_args.proxy:
-            proxy_url = (
-                f"http://{parsed_args.proxy}"
-                if "://" not in parsed_args.proxy
-                else parsed_args.proxy
-            )
+            proxy_url = f"http://{parsed_args.proxy}" if "://" not in parsed_args.proxy else parsed_args.proxy

             if parsed_args.proxy_user:
                 user_pass = parsed_args.proxy_user
                 parts = urlparse(proxy_url)
                 netloc_parts = parts.netloc.split("@")
-                netloc = (
-                    f"{user_pass}@{netloc_parts[-1]}"
-                    if len(netloc_parts) > 1
-                    else f"{user_pass}@{parts.netloc}"
-                )
+                netloc = f"{user_pass}@{netloc_parts[-1]}" if len(netloc_parts) > 1 else f"{user_pass}@{parts.netloc}"
                 proxy_url = urlunparse(
                     (
                         parts.scheme,
@@ -360,11 +279,7 @@ class CurlParser:

     def convert2fetcher(self, curl_command: Request | str) -> Optional[Response]:
         if isinstance(curl_command, (Request, str)):
-            request = (
-                self.parse(curl_command)
-                if isinstance(curl_command, str)
-                else curl_command
-            )
+            request = self.parse(curl_command) if isinstance(curl_command, str) else curl_command

             # Ensure request parsing was successful before proceeding
             if request is None:  # pragma: no cover
@@ -382,20 +297,17 @@ class CurlParser:
                 _ = request_args.pop("json", None)

                 try:
-                    return getattr(Fetcher, method)(**request_args)
+                    return getattr(self.__fetcher, method)(**request_args)
                 except Exception as e:  # pragma: no cover
                     log.error(f"Error calling Fetcher.{method}: {e}")
                     return None
             else:  # pragma: no cover
-                log.error(
-                    f'Request method "{method}" isn\'t supported by Scrapling yet'
-                )
+                log.error(f'Request method "{method}" isn\'t supported by Scrapling yet')
                 return None

         else:  # pragma: no cover
             log.error("Input must be a valid curl command string or a Request object.")
-
-            return None
+            return None


 def show_page_in_browser(page: Selector):  # pragma: no cover
```
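Aside from unwrapping lines to a longer line length, the behavioral change in `CurlParser` so far is that `Fetcher` is now imported inside `__init__` and kept as `self.__fetcher`, which `convert2fetcher()` dispatches through instead of a module-level import. A small usage sketch of the parse-and-convert flow; the class and method names come from the hunks above, while the curl command itself is an invented DevTools-style example:

```python
# Usage sketch only; the curl command is invented.
from scrapling.core.shell import CurlParser

curl_command = (
    "curl 'https://httpbin.org/post' "
    "-H 'content-type: application/json' "
    "-H 'cookie: session=abc123' "
    "--data-raw '{\"query\": \"scrapling\"}' "
    "--compressed"
)

parser = CurlParser()
request = parser.parse(curl_command)        # Request namedtuple, or None if parsing fails
response = parser.convert2fetcher(request)  # dispatches to Fetcher.<method>(**request_args)
if response is not None:
    print(response.status)
```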
```diff
@@ -405,7 +317,7 @@ def show_page_in_browser(page: Selector):  # pragma: no cover

     try:
         fd, fname = make_temp_file(prefix="scrapling_view_", suffix=".html")
-        with open(fd, "w", encoding=
+        with open(fd, "w", encoding=page.encoding) as f:
             f.write(page.body)

         open_in_browser(f"file://{fname}")
@@ -419,6 +331,19 @@ class CustomShell:
     """A custom IPython shell with minimal dependencies"""

     def __init__(self, code, log_level="debug"):
+        from IPython.terminal.embed import InteractiveShellEmbed as __InteractiveShellEmbed
+        from scrapling.fetchers import (
+            Fetcher as __Fetcher,
+            AsyncFetcher as __AsyncFetcher,
+            DynamicFetcher as __DynamicFetcher,
+            StealthyFetcher as __StealthyFetcher,
+        )
+
+        self.__InteractiveShellEmbed = __InteractiveShellEmbed
+        self.__Fetcher = __Fetcher
+        self.__AsyncFetcher = __AsyncFetcher
+        self.__DynamicFetcher = __DynamicFetcher
+        self.__StealthyFetcher = __StealthyFetcher
         self.code = code
         self.page = None
         self.pages = Selectors([])
@@ -442,7 +367,7 @@ class CustomShell:
         if self.log_level:
             getLogger("scrapling").setLevel(self.log_level)

-        settings = Fetcher.display_config()
+        settings = self.__Fetcher.display_config()
         settings.pop("storage", None)
         settings.pop("storage_args", None)
         log.info(f"Scrapling {__version__} shell started")
@@ -508,12 +433,12 @@ Type 'exit' or press Ctrl+D to exit.
         """Create a namespace with application-specific objects"""

         # Create wrapped versions of fetch functions
-        get = self.create_wrapper(Fetcher.get)
-        post = self.create_wrapper(Fetcher.post)
-        put = self.create_wrapper(Fetcher.put)
-        delete = self.create_wrapper(Fetcher.delete)
-        dynamic_fetch = self.create_wrapper(DynamicFetcher.fetch)
-        stealthy_fetch = self.create_wrapper(StealthyFetcher.fetch)
+        get = self.create_wrapper(self.__Fetcher.get)
+        post = self.create_wrapper(self.__Fetcher.post)
+        put = self.create_wrapper(self.__Fetcher.put)
+        delete = self.create_wrapper(self.__Fetcher.delete)
+        dynamic_fetch = self.create_wrapper(self.__DynamicFetcher.fetch)
+        stealthy_fetch = self.create_wrapper(self.__StealthyFetcher.fetch)
         curl2fetcher = self.create_wrapper(self._curl_parser.convert2fetcher)

         # Create the namespace dictionary
@@ -522,12 +447,12 @@ Type 'exit' or press Ctrl+D to exit.
             "post": post,
             "put": put,
             "delete": delete,
-            "Fetcher": Fetcher,
-            "AsyncFetcher": AsyncFetcher,
+            "Fetcher": self.__Fetcher,
+            "AsyncFetcher": self.__AsyncFetcher,
             "fetch": dynamic_fetch,
-            "DynamicFetcher": DynamicFetcher,
+            "DynamicFetcher": self.__DynamicFetcher,
             "stealthy_fetch": stealthy_fetch,
-            "StealthyFetcher": StealthyFetcher,
+            "StealthyFetcher": self.__StealthyFetcher,
             "Selector": Selector,
             "page": self.page,
             "response": self.page,
@@ -544,9 +469,10 @@ Type 'exit' or press Ctrl+D to exit.

     def start(self):  # pragma: no cover
         """Start the interactive shell"""
+
         # Get our namespace with application objects
         namespace = self.get_namespace()
-        ipython_shell = InteractiveShellEmbed(
+        ipython_shell = self.__InteractiveShellEmbed(
             banner1=self.banner(),
             banner2="",
             enable_tip=False,
```
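The `CustomShell` hunks above follow the same pattern as `CurlParser.__init__`: IPython and the `scrapling.fetchers` classes are no longer imported at module level but on first construction, then stashed on the instance and used wherever `Fetcher`, `DynamicFetcher`, and friends were used before. A generic, runnable sketch of that deferred-import pattern; `ArgumentParser` is only a stand-in for a heavy dependency such as IPython and is not what Scrapling actually defers:

```python
# Generic sketch of the deferred-import pattern used in shell.py above.
# ArgumentParser stands in for a heavy dependency like IPython.
class Shell:
    def __init__(self):
        # Importing inside __init__ keeps importing this module cheap;
        # the dependency is only loaded when a Shell is actually created.
        from argparse import ArgumentParser

        self._parser_cls = ArgumentParser

    def build_parser(self):
        parser = self._parser_cls(description="demo")
        parser.add_argument("--url")
        return parser


shell = Shell()
print(shell.build_parser().parse_args(["--url", "https://example.com"]))
```

The trade-off is the same one the diff makes: importing `scrapling.core.shell` (for example from the CLI) no longer pays for IPython or the fetcher stack until an interactive shell is actually started.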
```diff
@@ -621,20 +547,16 @@ class Convertor:
             yield ""

     @classmethod
-    def write_content_to_file(
-        cls, page: Selector, filename: str, css_selector: Optional[str] = None
-    ) -> None:
+    def write_content_to_file(cls, page: Selector, filename: str, css_selector: Optional[str] = None) -> None:
         """Write a Selector's content to a file"""
         if not page or not isinstance(page, Selector):  # pragma: no cover
             raise TypeError("Input must be of type `Selector`")
         elif not filename or not isinstance(filename, str) or not filename.strip():
             raise ValueError("Filename must be provided")
         elif not filename.endswith((".md", ".html", ".txt")):
-            raise ValueError(
-                "Unknown file type: filename must end with '.md', '.html', or '.txt'"
-            )
+            raise ValueError("Unknown file type: filename must end with '.md', '.html', or '.txt'")
         else:
-            with open(filename, "w", encoding=
+            with open(filename, "w", encoding=page.encoding) as f:
                 extension = filename.split(".")[-1]
                 f.write(
                     "".join(
```
scrapling/core/storage.py
CHANGED
```diff
@@ -27,11 +27,7 @@ class StorageSystemMixin(ABC):  # pragma: no cover

         try:
             extracted = tld(self.url)
-            return (
-                extracted.top_domain_under_public_suffix
-                or extracted.domain
-                or default_value
-            )
+            return extracted.top_domain_under_public_suffix or extracted.domain or default_value
         except AttributeError:
             return default_value

@@ -90,9 +86,7 @@ class SQLiteStorageSystem(StorageSystemMixin):
         self.connection.execute("PRAGMA journal_mode=WAL")
         self.cursor = self.connection.cursor()
         self._setup_database()
-        log.debug(
-            f'Storage system loaded with arguments (storage_file="{storage_file}", url="{url}")'
-        )
+        log.debug(f'Storage system loaded with arguments (storage_file="{storage_file}", url="{url}")')

     def _setup_database(self) -> None:
         self.cursor.execute("""
```
scrapling/core/translator.py
CHANGED
```diff
@@ -10,10 +10,10 @@ So you don't have to learn a new selectors/api method like what bs4 done with so

 from functools import lru_cache

-from cssselect import HTMLTranslator as OriginalHTMLTranslator
-from cssselect.parser import Element, FunctionalPseudoElement, PseudoElement
 from cssselect.xpath import ExpressionError
 from cssselect.xpath import XPathExpr as OriginalXPathExpr
+from cssselect import HTMLTranslator as OriginalHTMLTranslator
+from cssselect.parser import Element, FunctionalPseudoElement, PseudoElement

 from scrapling.core._types import Any, Optional, Protocol, Self

@@ -89,9 +89,7 @@ class TranslatorMixin:
         xpath = super().xpath_element(selector)  # type: ignore[safe-super]
         return XPathExpr.from_xpath(xpath)

-    def xpath_pseudo_element(
-        self, xpath: OriginalXPathExpr, pseudo_element: PseudoElement
-    ) -> OriginalXPathExpr:
+    def xpath_pseudo_element(self, xpath: OriginalXPathExpr, pseudo_element: PseudoElement) -> OriginalXPathExpr:
         """
         Dispatch method that transforms XPath to support the pseudo-element.
         """
@@ -99,31 +97,21 @@ class TranslatorMixin:
             method_name = f"xpath_{pseudo_element.name.replace('-', '_')}_functional_pseudo_element"
             method = getattr(self, method_name, None)
             if not method:  # pragma: no cover
-                raise ExpressionError(
-                    f"The functional pseudo-element ::{pseudo_element.name}() is unknown"
-                )
+                raise ExpressionError(f"The functional pseudo-element ::{pseudo_element.name}() is unknown")
             xpath = method(xpath, pseudo_element)
         else:
-            method_name = (
-                f"xpath_{pseudo_element.replace('-', '_')}_simple_pseudo_element"
-            )
+            method_name = f"xpath_{pseudo_element.replace('-', '_')}_simple_pseudo_element"
             method = getattr(self, method_name, None)
             if not method:  # pragma: no cover
-                raise ExpressionError(
-                    f"The pseudo-element ::{pseudo_element} is unknown"
-                )
+                raise ExpressionError(f"The pseudo-element ::{pseudo_element} is unknown")
             xpath = method(xpath)
         return xpath

     @staticmethod
-    def xpath_attr_functional_pseudo_element(
-        xpath: OriginalXPathExpr, function: FunctionalPseudoElement
-    ) -> XPathExpr:
+    def xpath_attr_functional_pseudo_element(xpath: OriginalXPathExpr, function: FunctionalPseudoElement) -> XPathExpr:
         """Support selecting attribute values using ::attr() pseudo-element"""
         if function.argument_types() not in (["STRING"], ["IDENT"]):  # pragma: no cover
-            raise ExpressionError(
-                f"Expected a single string or ident for ::attr(), got {function.arguments!r}"
-            )
+            raise ExpressionError(f"Expected a single string or ident for ::attr(), got {function.arguments!r}")
         return XPathExpr.from_xpath(xpath, attribute=function.arguments[0].value)

     @staticmethod
```
scrapling/core/utils/_shell.py
ADDED
```diff
@@ -0,0 +1,48 @@
+from http import cookies as Cookie
+
+
+from scrapling.core._types import (
+    List,
+    Dict,
+    Tuple,
+)
+
+
+def _CookieParser(cookie_string):
+    # Errors will be handled on call so the log can be specified
+    cookie_parser = Cookie.SimpleCookie()
+    cookie_parser.load(cookie_string)
+    for key, morsel in cookie_parser.items():
+        yield key, morsel.value
+
+
+def _ParseHeaders(header_lines: List[str], parse_cookies: bool = True) -> Tuple[Dict[str, str], Dict[str, str]]:
+    """Parses headers into separate header and cookie dictionaries."""
+    header_dict = dict()
+    cookie_dict = dict()
+
+    for header_line in header_lines:
+        if ":" not in header_line:
+            if header_line.endswith(";"):
+                header_key = header_line[:-1].strip()
+                header_value = ""
+                header_dict[header_key] = header_value
+            else:
+                raise ValueError(f"Could not parse header without colon: '{header_line}'.")
+        else:
+            header_key, header_value = header_line.split(":", 1)
+            header_key = header_key.strip()
+            header_value = header_value.strip()
+
+            if parse_cookies:
+                if header_key.lower() == "cookie":
+                    try:
+                        cookie_dict = {key: value for key, value in _CookieParser(header_value)}
+                    except Exception as e:  # pragma: no cover
+                        raise ValueError(f"Could not parse cookie string from header '{header_value}': {e}")
+                else:
+                    header_dict[header_key] = header_value
+            else:
+                header_dict[header_key] = header_value
+
+    return header_dict, cookie_dict
```
scrapling/core/{utils.py → utils/_utils.py}
RENAMED
```diff
@@ -24,9 +24,7 @@ def setup_logger():
     logger = logging.getLogger("scrapling")
     logger.setLevel(logging.INFO)

-    formatter = logging.Formatter(
-        fmt="[%(asctime)s] %(levelname)s: %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
-    )
+    formatter = logging.Formatter(fmt="[%(asctime)s] %(levelname)s: %(message)s", datefmt="%Y-%m-%d %H:%M:%S")

     console_handler = logging.StreamHandler()
     console_handler.setFormatter(formatter)
@@ -61,11 +59,7 @@ class _StorageTools:
     def __clean_attributes(element: html.HtmlElement, forbidden: tuple = ()) -> Dict:
         if not element.attrib:
             return {}
-        return {
-            k: v.strip()
-            for k, v in element.attrib.items()
-            if v and v.strip() and k not in forbidden
-        }
+        return {k: v.strip() for k, v in element.attrib.items() if v and v.strip() and k not in forbidden}

     @classmethod
     def element_to_dict(cls, element: html.HtmlElement) -> Dict:
@@ -85,17 +79,11 @@ class _StorageTools:
             }
         )

-        siblings = [
-            child.tag for child in parent.iterchildren() if child != element
-        ]
+        siblings = [child.tag for child in parent.iterchildren() if child != element]
         if siblings:
             result.update({"siblings": tuple(siblings)})

-        children = [
-            child.tag
-            for child in element.iterchildren()
-            if not isinstance(child, html_forbidden)
-        ]
+        children = [child.tag for child in element.iterchildren() if not isinstance(child, html_forbidden)]
         if children:
             result.update({"children": tuple(children)})

@@ -104,11 +92,7 @@ class _StorageTools:
     @classmethod
     def _get_element_path(cls, element: html.HtmlElement):
         parent = element.getparent()
-        return tuple(
-            (element.tag,)
-            if parent is None
-            else (cls._get_element_path(parent) + (element.tag,))
-        )
+        return tuple((element.tag,) if parent is None else (cls._get_element_path(parent) + (element.tag,)))


 @lru_cache(128, typed=True)
```
scrapling/engines/__init__.py
CHANGED
```diff
@@ -1,16 +0,0 @@
-from .constants import DEFAULT_DISABLED_RESOURCES, DEFAULT_STEALTH_FLAGS, DEFAULT_FLAGS
-from .static import FetcherSession, FetcherClient, AsyncFetcherClient
-from ._browsers import (
-    DynamicSession,
-    AsyncDynamicSession,
-    StealthySession,
-    AsyncStealthySession,
-)
-
-__all__ = [
-    "FetcherSession",
-    "DynamicSession",
-    "AsyncDynamicSession",
-    "StealthySession",
-    "AsyncStealthySession",
-]
```
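With `scrapling/engines/__init__.py` emptied, the package no longer re-exports the session classes, so code that imported them from `scrapling.engines` needs to target the defining submodules instead. A hedged before/after sketch, assuming the submodule layout implied by the removed re-exports and the file list above (verify the paths against the installed 0.3.2 package):

```python
# Before (0.3): the engines package re-exported these names.
# from scrapling.engines import FetcherSession, DynamicSession, StealthySession

# After (0.3.2): import from the modules that defined them all along
# (paths inferred from the removed re-exports above; treat as an assumption).
from scrapling.engines.static import FetcherSession
from scrapling.engines._browsers import (
    DynamicSession,
    AsyncDynamicSession,
    StealthySession,
    AsyncStealthySession,
)
```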