scrapling 0.2.99__py3-none-any.whl → 0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +18 -31
- scrapling/cli.py +818 -20
- scrapling/core/_html_utils.py +348 -0
- scrapling/core/_types.py +34 -17
- scrapling/core/ai.py +611 -0
- scrapling/core/custom_types.py +183 -100
- scrapling/core/mixins.py +27 -19
- scrapling/core/shell.py +647 -0
- scrapling/core/{storage_adaptors.py → storage.py} +41 -33
- scrapling/core/translator.py +20 -26
- scrapling/core/utils.py +49 -54
- scrapling/engines/__init__.py +15 -6
- scrapling/engines/_browsers/__init__.py +2 -0
- scrapling/engines/_browsers/_camoufox.py +745 -0
- scrapling/engines/_browsers/_config_tools.py +130 -0
- scrapling/engines/_browsers/_controllers.py +630 -0
- scrapling/engines/_browsers/_page.py +93 -0
- scrapling/engines/_browsers/_validators.py +150 -0
- scrapling/engines/constants.py +101 -88
- scrapling/engines/static.py +667 -110
- scrapling/engines/toolbelt/__init__.py +20 -6
- scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +2 -1
- scrapling/engines/toolbelt/convertor.py +254 -0
- scrapling/engines/toolbelt/custom.py +158 -175
- scrapling/engines/toolbelt/fingerprints.py +32 -46
- scrapling/engines/toolbelt/navigation.py +68 -39
- scrapling/fetchers.py +227 -333
- scrapling/parser.py +781 -449
- scrapling-0.3.dist-info/METADATA +409 -0
- scrapling-0.3.dist-info/RECORD +41 -0
- {scrapling-0.2.99.dist-info → scrapling-0.3.dist-info}/WHEEL +1 -1
- {scrapling-0.2.99.dist-info → scrapling-0.3.dist-info}/top_level.txt +0 -1
- scrapling/defaults.py +0 -25
- scrapling/engines/camo.py +0 -339
- scrapling/engines/pw.py +0 -465
- scrapling/engines/toolbelt/bypasses/pdf_viewer.js +0 -5
- scrapling-0.2.99.dist-info/METADATA +0 -290
- scrapling-0.2.99.dist-info/RECORD +0 -49
- tests/__init__.py +0 -1
- tests/fetchers/__init__.py +0 -1
- tests/fetchers/async/__init__.py +0 -0
- tests/fetchers/async/test_camoufox.py +0 -97
- tests/fetchers/async/test_httpx.py +0 -85
- tests/fetchers/async/test_playwright.py +0 -101
- tests/fetchers/sync/__init__.py +0 -0
- tests/fetchers/sync/test_camoufox.py +0 -70
- tests/fetchers/sync/test_httpx.py +0 -84
- tests/fetchers/sync/test_playwright.py +0 -89
- tests/fetchers/test_utils.py +0 -97
- tests/parser/__init__.py +0 -0
- tests/parser/test_automatch.py +0 -111
- tests/parser/test_general.py +0 -330
- {scrapling-0.2.99.dist-info → scrapling-0.3.dist-info}/entry_points.txt +0 -0
- {scrapling-0.2.99.dist-info → scrapling-0.3.dist-info}/licenses/LICENSE +0 -0
scrapling/engines/_browsers/_page.py
ADDED
@@ -0,0 +1,93 @@
+from threading import RLock
+from dataclasses import dataclass
+
+from playwright.sync_api import Page as SyncPage
+from playwright.async_api import Page as AsyncPage
+
+from scrapling.core._types import Optional, List, Literal
+
+PageState = Literal["ready", "busy", "error"]  # States that a page can be in
+
+
+@dataclass
+class PageInfo:
+    """Information about the page and its current state"""
+
+    __slots__ = ("page", "state", "url")
+    page: SyncPage | AsyncPage
+    state: PageState
+    url: Optional[str]
+
+    def mark_busy(self, url: str = ""):
+        """Mark the page as busy"""
+        self.state = "busy"
+        self.url = url
+
+    def mark_ready(self):
+        """Mark the page as ready for new requests"""
+        self.state = "ready"
+        self.url = ""
+
+    def mark_error(self):
+        """Mark the page as having an error"""
+        self.state = "error"
+
+    def __repr__(self):
+        return f'Page(URL="{self.url!r}", state={self.state!r})'
+
+    def __eq__(self, other_page):
+        """Comparing this page to another page object."""
+        if other_page.__class__ is not self.__class__:
+            return NotImplemented
+        return self.page == other_page.page
+
+
+class PagePool:
+    """Manages a pool of browser pages/tabs with state tracking"""
+
+    __slots__ = ("max_pages", "pages", "_lock")
+
+    def __init__(self, max_pages: int = 5):
+        self.max_pages = max_pages
+        self.pages: List[PageInfo] = []
+        self._lock = RLock()
+
+    def add_page(self, page: SyncPage | AsyncPage) -> PageInfo:
+        """Add a new page to the pool"""
+        with self._lock:
+            if len(self.pages) >= self.max_pages:
+                raise RuntimeError(f"Maximum page limit ({self.max_pages}) reached")
+
+            page_info = PageInfo(page, "ready", "")
+            self.pages.append(page_info)
+            return page_info
+
+    def get_ready_page(self) -> Optional[PageInfo]:
+        """Get a page that's ready for use"""
+        with self._lock:
+            for page_info in self.pages:
+                if page_info.state == "ready":
+                    return page_info
+            return None
+
+    @property
+    def pages_count(self) -> int:
+        """Get the total number of pages"""
+        return len(self.pages)
+
+    @property
+    def ready_count(self) -> int:
+        """Get the number of ready pages"""
+        with self._lock:
+            return sum(1 for p in self.pages if p.state == "ready")
+
+    @property
+    def busy_count(self) -> int:
+        """Get the number of busy pages"""
+        with self._lock:
+            return sum(1 for p in self.pages if p.state == "busy")
+
+    def cleanup_error_pages(self):
+        """Remove pages in error state"""
+        with self._lock:
+            self.pages = [p for p in self.pages if p.state != "error"]
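For orientation, here is a minimal sketch of how the new PageInfo/PagePool pair could be exercised from calling code. The import path is assumed from the file path above, and the Playwright launch calls are only illustrative; this diff does not show how scrapling's own browser engines drive the pool.

# Illustrative sketch only: exercising PagePool/PageInfo as defined in _page.py above.
from playwright.sync_api import sync_playwright

from scrapling.engines._browsers._page import PagePool  # path assumed from the diff

with sync_playwright() as pw:
    browser = pw.chromium.launch(headless=True)
    pool = PagePool(max_pages=2)

    # Register freshly created tabs with the pool (both start in the "ready" state)
    pool.add_page(browser.new_page())
    pool.add_page(browser.new_page())

    # Claim a ready page, mark it busy while navigating, then release it
    page_info = pool.get_ready_page()
    if page_info is not None:
        page_info.mark_busy(url="https://example.com")
        page_info.page.goto(page_info.url)
        page_info.mark_ready()

    print(pool.pages_count, pool.ready_count, pool.busy_count)  # 2 2 0
    browser.close()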
scrapling/engines/_browsers/_validators.py
ADDED
@@ -0,0 +1,150 @@
+from msgspec import Struct, convert, ValidationError
+from urllib.parse import urlparse
+from pathlib import Path
+
+from scrapling.core._types import (
+    Optional,
+    Dict,
+    Callable,
+    List,
+    SelectorWaitStates,
+)
+from scrapling.engines.toolbelt import construct_proxy_dict
+
+
+class PlaywrightConfig(Struct, kw_only=True, frozen=False):
+    """Configuration struct for validation"""
+
+    max_pages: int = 1
+    cdp_url: Optional[str] = None
+    headless: bool = True
+    google_search: bool = True
+    hide_canvas: bool = False
+    disable_webgl: bool = False
+    real_chrome: bool = False
+    stealth: bool = False
+    wait: int | float = 0
+    page_action: Optional[Callable] = None
+    proxy: Optional[str | Dict[str, str]] = (
+        None  # The default value for proxy in Playwright's source is `None`
+    )
+    locale: str = "en-US"
+    extra_headers: Optional[Dict[str, str]] = None
+    useragent: Optional[str] = None
+    timeout: int | float = 30000
+    disable_resources: bool = False
+    wait_selector: Optional[str] = None
+    cookies: Optional[List[Dict]] = None
+    network_idle: bool = False
+    wait_selector_state: SelectorWaitStates = "attached"
+    selector_config: Optional[Dict] = None
+
+    def __post_init__(self):
+        """Custom validation after msgspec validation"""
+        if self.max_pages < 1 or self.max_pages > 50:
+            raise ValueError("max_pages must be between 1 and 50")
+        if self.timeout < 0:
+            raise ValueError("timeout must be >= 0")
+        if self.page_action is not None and not callable(self.page_action):
+            raise TypeError(
+                f"page_action must be callable, got {type(self.page_action).__name__}"
+            )
+        if self.proxy:
+            self.proxy = construct_proxy_dict(self.proxy, as_tuple=True)
+        if self.cdp_url:
+            self.__validate_cdp(self.cdp_url)
+        if not self.cookies:
+            self.cookies = []
+        if not self.selector_config:
+            self.selector_config = {}
+
+    @staticmethod
+    def __validate_cdp(cdp_url):
+        try:
+            # Check the scheme
+            if not cdp_url.startswith(("ws://", "wss://")):
+                raise ValueError("CDP URL must use 'ws://' or 'wss://' scheme")
+
+            # Validate hostname and port
+            if not urlparse(cdp_url).netloc:
+                raise ValueError("Invalid hostname for the CDP URL")
+
+        except AttributeError as e:
+            raise ValueError(f"Malformed CDP URL: {cdp_url}: {str(e)}")
+
+        except Exception as e:
+            raise ValueError(f"Invalid CDP URL '{cdp_url}': {str(e)}")
+
+
+class CamoufoxConfig(Struct, kw_only=True, frozen=False):
+    """Configuration struct for validation"""
+
+    max_pages: int = 1
+    headless: bool = True  # noqa: F821
+    block_images: bool = False
+    disable_resources: bool = False
+    block_webrtc: bool = False
+    allow_webgl: bool = True
+    network_idle: bool = False
+    humanize: bool | float = True
+    solve_cloudflare: bool = False
+    wait: int | float = 0
+    timeout: int | float = 30000
+    page_action: Optional[Callable] = None
+    wait_selector: Optional[str] = None
+    addons: Optional[List[str]] = None
+    wait_selector_state: SelectorWaitStates = "attached"
+    cookies: Optional[List[Dict]] = None
+    google_search: bool = True
+    extra_headers: Optional[Dict[str, str]] = None
+    proxy: Optional[str | Dict[str, str]] = (
+        None  # The default value for proxy in Playwright's source is `None`
+    )
+    os_randomize: bool = False
+    disable_ads: bool = False
+    geoip: bool = False
+    selector_config: Optional[Dict] = None
+    additional_args: Optional[Dict] = None
+
+    def __post_init__(self):
+        """Custom validation after msgspec validation"""
+        if self.max_pages < 1 or self.max_pages > 50:
+            raise ValueError("max_pages must be between 1 and 50")
+        if self.timeout < 0:
+            raise ValueError("timeout must be >= 0")
+        if self.page_action is not None and not callable(self.page_action):
+            raise TypeError(
+                f"page_action must be callable, got {type(self.page_action).__name__}"
+            )
+        if self.proxy:
+            self.proxy = construct_proxy_dict(self.proxy, as_tuple=True)
+
+        if not self.addons:
+            self.addons = []
+        else:
+            for addon in self.addons:
+                addon_path = Path(addon)
+                if not addon_path.exists():
+                    raise FileNotFoundError(f"Addon's path not found: {addon}")
+                elif not addon_path.is_dir():
+                    raise ValueError(
+                        f"Addon's path is not a folder, you need to pass a folder of the extracted addon: {addon}"
+                    )
+
+        if not self.cookies:
+            self.cookies = []
+        if self.solve_cloudflare and self.timeout < 60_000:
+            self.timeout = 60_000
+        if not self.selector_config:
+            self.selector_config = {}
+        if not self.additional_args:
+            self.additional_args = {}
+
+
+def validate(params, model):
+    try:
+        config = convert(params, model)
+    except ValidationError as e:
+        raise TypeError(f"Invalid argument type: {e}")
+
+    return config
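As a rough illustration of what the validate() helper at the end of this file does, the snippet below converts a plain keyword dict into a PlaywrightConfig via msgspec and shows the wrapped error path. The import path is assumed from the file path above; whether the browser engines call it exactly this way is not visible in this hunk.

# Hypothetical usage of the validate() helper and PlaywrightConfig struct above.
from scrapling.engines._browsers._validators import PlaywrightConfig, validate  # path assumed

# msgspec maps the dict onto the Struct and type-checks every field,
# then __post_init__ applies the extra range/proxy/CDP checks.
config = validate({"max_pages": 3, "timeout": 15_000, "stealth": True}, PlaywrightConfig)
print(config.max_pages, config.headless, config.selector_config)  # 3 True {}

# A field of the wrong type is surfaced as TypeError by the wrapper
try:
    validate({"max_pages": "three"}, PlaywrightConfig)
except TypeError as exc:
    print(exc)  # prints "Invalid argument type: ..." per the wrapper above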
scrapling/engines/constants.py
CHANGED
@@ -1,92 +1,108 @@
 # Disable loading these resources for speed
 DEFAULT_DISABLED_RESOURCES = {
+    "font",
+    "image",
+    "media",
+    "beacon",
+    "object",
+    "imageset",
+    "texttrack",
+    "websocket",
+    "csp_report",
+    "stylesheet",
 }

+HARMFUL_DEFAULT_ARGS = (
+    # This will be ignored to avoid detection more and possibly avoid the popup crashing bug abuse: https://issues.chromium.org/issues/340836884
+    "--enable-automation",
+    "--disable-popup-blocking",
+    # '--disable-component-update',
+    # '--disable-default-apps',
+    # '--disable-extensions',
+)
+
+DEFAULT_FLAGS = (
+    # Speed up chromium browsers by default
+    "--no-pings",
+    "--no-first-run",
+    "--disable-infobars",
+    "--disable-breakpad",
+    "--no-service-autorun",
+    "--homepage=about:blank",
+    "--password-store=basic",
+    "--no-default-browser-check",
+    "--disable-session-crashed-bubble",
+    "--disable-search-engine-choice-screen",
+)
+
 DEFAULT_STEALTH_FLAGS = (
     # Explanation: https://peter.sh/experiments/chromium-command-line-switches/
     # Generally this will make the browser faster and less detectable
-    '--password-store=basic',
-    '--disable-cloud-import',
-    '--disable-default-apps',
-    '--disable-print-preview',
-    '--disable-dev-shm-usage',
+    "--incognito",
+    "--test-type",
+    "--lang=en-US",
+    "--mute-audio",
+    "--disable-sync",
+    "--hide-scrollbars",
+    "--disable-logging",
+    "--start-maximized",  # For headless check bypass
+    "--enable-async-dns",
+    "--accept-lang=en-US",
+    "--use-mock-keychain",
+    "--disable-translate",
+    "--disable-extensions",
+    "--disable-voice-input",
+    "--window-position=0,0",
+    "--disable-wake-on-wifi",
+    "--ignore-gpu-blocklist",
+    "--enable-tcp-fast-open",
+    "--enable-web-bluetooth",
+    "--disable-hang-monitor",
+    "--disable-cloud-import",
+    "--disable-default-apps",
+    "--disable-print-preview",
+    "--disable-dev-shm-usage",
     # '--disable-popup-blocking',
-    '--blink-settings=primaryHoverType=2,availableHoverTypes=2,primaryPointerType=4,availablePointerTypes=4',
-    '--disable-features=AudioServiceOutOfProcess,IsolateOrigins,site-per-process,TranslateUI,BlinkGenPropertyTrees',
+    "--metrics-recording-only",
+    "--disable-crash-reporter",
+    "--disable-partial-raster",
+    "--disable-gesture-typing",
+    "--disable-checker-imaging",
+    "--disable-prompt-on-repost",
+    "--force-color-profile=srgb",
+    "--font-render-hinting=none",
+    "--aggressive-cache-discard",
+    "--disable-component-update",
+    "--disable-cookie-encryption",
+    "--disable-domain-reliability",
+    "--disable-threaded-animation",
+    "--disable-threaded-scrolling",
+    # '--disable-reading-from-canvas', # For Firefox
+    "--enable-simple-cache-backend",
+    "--disable-background-networking",
+    "--enable-surface-synchronization",
+    "--disable-image-animation-resync",
+    "--disable-renderer-backgrounding",
+    "--disable-ipc-flooding-protection",
+    "--prerender-from-omnibox=disabled",
+    "--safebrowsing-disable-auto-update",
+    "--disable-offer-upload-credit-cards",
+    "--disable-features=site-per-process",
+    "--disable-background-timer-throttling",
+    "--disable-new-content-rendering-timeout",
+    "--run-all-compositor-stages-before-draw",
+    "--disable-client-side-phishing-detection",
+    "--disable-backgrounding-occluded-windows",
+    "--disable-layer-tree-host-memory-pressure",
+    "--autoplay-policy=no-user-gesture-required",
+    "--disable-offer-store-unmasked-wallet-cards",
+    "--disable-blink-features=AutomationControlled",
+    "--webrtc-ip-handling-policy=disable_non_proxied_udp",
+    "--disable-component-extensions-with-background-pages",
+    "--force-webrtc-ip-handling-policy=disable_non_proxied_udp",
+    "--enable-features=NetworkService,NetworkServiceInProcess,TrustTokens,TrustTokensAlwaysAllowIssuance",
+    "--blink-settings=primaryHoverType=2,availableHoverTypes=2,primaryPointerType=4,availablePointerTypes=4",
+    "--disable-features=AudioServiceOutOfProcess,IsolateOrigins,site-per-process,TranslateUI,BlinkGenPropertyTrees",
 )

 # Defaulting to the docker mode, token doesn't matter in it as it's passed for the container
@@ -95,13 +111,10 @@ NSTBROWSER_DEFAULT_QUERY = {
     "headless": True,
     "autoClose": True,
     "fingerprint": {
-        "flags": {
-        "platform": 'linux',  # support: windows, mac, linux
-        "kernel": 'chromium',  # only support: chromium
-        "kernelMilestone": '128',
+        "flags": {"timezone": "BasedOnIp", "screen": "Custom"},
+        "platform": "linux",  # support: windows, mac, linux
+        "kernel": "chromium",  # only support: chromium
+        "kernelMilestone": "128",
         "hardwareConcurrency": 8,
         "deviceMemory": 8,
     },
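To make the role of these new tuples concrete, here is a hedged sketch of how they could be combined in a Chromium launch; the actual wiring lives in the new scrapling/engines/_browsers/ modules and is not part of this hunk.

# Illustrative sketch only: feeding the flag tuples above into a plain Playwright launch.
from playwright.sync_api import sync_playwright

from scrapling.engines.constants import (
    DEFAULT_FLAGS,
    DEFAULT_STEALTH_FLAGS,
    HARMFUL_DEFAULT_ARGS,
)

with sync_playwright() as pw:
    browser = pw.chromium.launch(
        headless=True,
        # Add the speed/stealth switches on top of Playwright's defaults...
        args=list(DEFAULT_FLAGS + DEFAULT_STEALTH_FLAGS),
        # ...and ask Playwright to skip the default switches flagged as harmful,
        # such as --enable-automation.
        ignore_default_args=list(HARMFUL_DEFAULT_ARGS),
    )
    page = browser.new_page()
    page.goto("https://example.com")
    browser.close()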