scrapling 0.2.99__py3-none-any.whl → 0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. scrapling/__init__.py +18 -31
  2. scrapling/cli.py +818 -20
  3. scrapling/core/_html_utils.py +348 -0
  4. scrapling/core/_types.py +34 -17
  5. scrapling/core/ai.py +611 -0
  6. scrapling/core/custom_types.py +183 -100
  7. scrapling/core/mixins.py +27 -19
  8. scrapling/core/shell.py +647 -0
  9. scrapling/core/{storage_adaptors.py → storage.py} +41 -33
  10. scrapling/core/translator.py +20 -26
  11. scrapling/core/utils.py +49 -54
  12. scrapling/engines/__init__.py +15 -6
  13. scrapling/engines/_browsers/__init__.py +2 -0
  14. scrapling/engines/_browsers/_camoufox.py +745 -0
  15. scrapling/engines/_browsers/_config_tools.py +130 -0
  16. scrapling/engines/_browsers/_controllers.py +630 -0
  17. scrapling/engines/_browsers/_page.py +93 -0
  18. scrapling/engines/_browsers/_validators.py +150 -0
  19. scrapling/engines/constants.py +101 -88
  20. scrapling/engines/static.py +667 -110
  21. scrapling/engines/toolbelt/__init__.py +20 -6
  22. scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +2 -1
  23. scrapling/engines/toolbelt/convertor.py +254 -0
  24. scrapling/engines/toolbelt/custom.py +158 -175
  25. scrapling/engines/toolbelt/fingerprints.py +32 -46
  26. scrapling/engines/toolbelt/navigation.py +68 -39
  27. scrapling/fetchers.py +227 -333
  28. scrapling/parser.py +781 -449
  29. scrapling-0.3.dist-info/METADATA +409 -0
  30. scrapling-0.3.dist-info/RECORD +41 -0
  31. {scrapling-0.2.99.dist-info → scrapling-0.3.dist-info}/WHEEL +1 -1
  32. {scrapling-0.2.99.dist-info → scrapling-0.3.dist-info}/top_level.txt +0 -1
  33. scrapling/defaults.py +0 -25
  34. scrapling/engines/camo.py +0 -339
  35. scrapling/engines/pw.py +0 -465
  36. scrapling/engines/toolbelt/bypasses/pdf_viewer.js +0 -5
  37. scrapling-0.2.99.dist-info/METADATA +0 -290
  38. scrapling-0.2.99.dist-info/RECORD +0 -49
  39. tests/__init__.py +0 -1
  40. tests/fetchers/__init__.py +0 -1
  41. tests/fetchers/async/__init__.py +0 -0
  42. tests/fetchers/async/test_camoufox.py +0 -97
  43. tests/fetchers/async/test_httpx.py +0 -85
  44. tests/fetchers/async/test_playwright.py +0 -101
  45. tests/fetchers/sync/__init__.py +0 -0
  46. tests/fetchers/sync/test_camoufox.py +0 -70
  47. tests/fetchers/sync/test_httpx.py +0 -84
  48. tests/fetchers/sync/test_playwright.py +0 -89
  49. tests/fetchers/test_utils.py +0 -97
  50. tests/parser/__init__.py +0 -0
  51. tests/parser/test_automatch.py +0 -111
  52. tests/parser/test_general.py +0 -330
  53. {scrapling-0.2.99.dist-info → scrapling-0.3.dist-info}/entry_points.txt +0 -0
  54. {scrapling-0.2.99.dist-info → scrapling-0.3.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,93 @@
1
+ from threading import RLock
2
+ from dataclasses import dataclass
3
+
4
+ from playwright.sync_api import Page as SyncPage
5
+ from playwright.async_api import Page as AsyncPage
6
+
7
+ from scrapling.core._types import Optional, List, Literal
8
+
9
+ PageState = Literal["ready", "busy", "error"] # States that a page can be in
10
+
11
+
12
@dataclass
class PageInfo:
    """Information about a browser page and its current state.

    Two ``PageInfo`` objects compare equal when they wrap the same page
    object; ``state`` and ``url`` are ignored by the comparison.
    """

    __slots__ = ("page", "state", "url")
    page: SyncPage | AsyncPage  # The wrapped Playwright page (sync or async API)
    state: PageState  # One of "ready", "busy" or "error"
    url: Optional[str]  # URL recorded by `mark_busy`; reset to "" by `mark_ready`

    def mark_busy(self, url: str = ""):
        """Mark the page as busy and record the URL it is working on.

        :param url: The URL to store on the page info (empty string by default).
        """
        self.state = "busy"
        self.url = url

    def mark_ready(self):
        """Mark the page as ready for new requests and clear the stored URL."""
        self.state = "ready"
        self.url = ""

    def mark_error(self):
        """Mark the page as having an error. The stored URL is left untouched."""
        self.state = "error"

    def __repr__(self):
        # `!r` already quotes the URL, so no extra literal quotes are placed around it
        return f"Page(URL={self.url!r}, state={self.state!r})"

    def __eq__(self, other_page):
        """Compare this page to another ``PageInfo`` by the wrapped page object.

        :return: ``NotImplemented`` when ``other_page`` is not exactly a ``PageInfo``,
            otherwise whether both wrap equal page objects.
        """
        if other_page.__class__ is not self.__class__:
            return NotImplemented
        return self.page == other_page.page
43
+
44
+
45
class PagePool:
    """Manages a pool of browser pages/tabs with state tracking.

    Every access to the underlying ``pages`` list goes through a reentrant
    lock (``RLock``), so the accessors may safely call one another.
    """

    __slots__ = ("max_pages", "pages", "_lock")

    def __init__(self, max_pages: int = 5):
        """Create an empty pool.

        :param max_pages: Maximum number of pages the pool may hold.
        """
        self.max_pages = max_pages
        self.pages: List[PageInfo] = []
        self._lock = RLock()

    def add_page(self, page: SyncPage | AsyncPage) -> PageInfo:
        """Wrap ``page`` in a ``PageInfo`` in the "ready" state and add it to the pool.

        :param page: The page object to track.
        :return: The newly created ``PageInfo``.
        :raises RuntimeError: If the pool already holds ``max_pages`` pages.
        """
        with self._lock:
            if len(self.pages) >= self.max_pages:
                raise RuntimeError(f"Maximum page limit ({self.max_pages}) reached")

            page_info = PageInfo(page, "ready", "")
            self.pages.append(page_info)
            return page_info

    def get_ready_page(self) -> Optional[PageInfo]:
        """Return the first page whose state is "ready", or ``None`` if there is none.

        The page's state is not changed here.
        """
        with self._lock:
            for page_info in self.pages:
                if page_info.state == "ready":
                    return page_info
            return None

    @property
    def pages_count(self) -> int:
        """Get the total number of pages"""
        # Take the lock like the other accessors, since `cleanup_error_pages` rebinds the list
        with self._lock:
            return len(self.pages)

    @property
    def ready_count(self) -> int:
        """Get the number of ready pages"""
        with self._lock:
            return sum(1 for p in self.pages if p.state == "ready")

    @property
    def busy_count(self) -> int:
        """Get the number of busy pages"""
        with self._lock:
            return sum(1 for p in self.pages if p.state == "busy")

    def cleanup_error_pages(self):
        """Remove pages in error state by rebuilding the list without them."""
        with self._lock:
            self.pages = [p for p in self.pages if p.state != "error"]
@@ -0,0 +1,150 @@
1
+ from msgspec import Struct, convert, ValidationError
2
+ from urllib.parse import urlparse
3
+ from pathlib import Path
4
+
5
+ from scrapling.core._types import (
6
+ Optional,
7
+ Dict,
8
+ Callable,
9
+ List,
10
+ SelectorWaitStates,
11
+ )
12
+ from scrapling.engines.toolbelt import construct_proxy_dict
13
+
14
+
15
class PlaywrightConfig(Struct, kw_only=True, frozen=False):
    """Configuration struct for validation.

    Field types are checked by msgspec; `__post_init__` then applies the
    range/shape checks and fills in empty defaults.
    """

    max_pages: int = 1
    cdp_url: Optional[str] = None
    headless: bool = True
    google_search: bool = True
    hide_canvas: bool = False
    disable_webgl: bool = False
    real_chrome: bool = False
    stealth: bool = False
    wait: int | float = 0
    page_action: Optional[Callable] = None
    # The default value for proxy in Playwright's source is `None`
    proxy: Optional[str | Dict[str, str]] = None
    locale: str = "en-US"
    extra_headers: Optional[Dict[str, str]] = None
    useragent: Optional[str] = None
    timeout: int | float = 30000
    disable_resources: bool = False
    wait_selector: Optional[str] = None
    cookies: Optional[List[Dict]] = None
    network_idle: bool = False
    wait_selector_state: SelectorWaitStates = "attached"
    selector_config: Optional[Dict] = None

    def __post_init__(self):
        """Custom validation after msgspec validation.

        :raises ValueError: If ``max_pages`` is outside 1..50, ``timeout`` is negative,
            or ``cdp_url`` fails the CDP URL checks.
        :raises TypeError: If ``page_action`` is set but is not callable.
        """
        if self.max_pages < 1 or self.max_pages > 50:
            raise ValueError("max_pages must be between 1 and 50")
        if self.timeout < 0:
            raise ValueError("timeout must be >= 0")
        if self.page_action is not None and not callable(self.page_action):
            raise TypeError(
                f"page_action must be callable, got {type(self.page_action).__name__}"
            )
        if self.proxy:
            # Replace the raw proxy value with the result of `construct_proxy_dict`
            self.proxy = construct_proxy_dict(self.proxy, as_tuple=True)
        if self.cdp_url:
            self.__validate_cdp(self.cdp_url)
        # Normalize "unset" containers to empty ones
        if not self.cookies:
            self.cookies = []
        if not self.selector_config:
            self.selector_config = {}

    @staticmethod
    def __validate_cdp(cdp_url):
        """Check that ``cdp_url`` uses a WebSocket scheme and has a network location.

        Note that the ``ValueError``s raised inside the ``try`` are caught by the
        generic handler below and re-raised with the "Invalid CDP URL" prefix.

        :param cdp_url: The CDP URL to check.
        :raises ValueError: If the URL is malformed or fails either check.
        """
        try:
            # Check the scheme
            if not cdp_url.startswith(("ws://", "wss://")):
                raise ValueError("CDP URL must use 'ws://' or 'wss://' scheme")

            # Validate hostname and port
            if not urlparse(cdp_url).netloc:
                raise ValueError("Invalid hostname for the CDP URL")

        except AttributeError as e:
            # Chain the original error so the root cause stays visible in tracebacks
            raise ValueError(f"Malformed CDP URL: {cdp_url}: {e}") from e

        except Exception as e:
            raise ValueError(f"Invalid CDP URL '{cdp_url}': {e}") from e
77
+
78
+
79
class CamoufoxConfig(Struct, kw_only=True, frozen=False):
    """Configuration struct for validation.

    msgspec checks the field types; `__post_init__` performs the remaining
    checks and replaces unset containers with empty ones.
    """

    max_pages: int = 1
    headless: bool = True  # noqa: F821
    block_images: bool = False
    disable_resources: bool = False
    block_webrtc: bool = False
    allow_webgl: bool = True
    network_idle: bool = False
    humanize: bool | float = True
    solve_cloudflare: bool = False
    wait: int | float = 0
    timeout: int | float = 30000
    page_action: Optional[Callable] = None
    wait_selector: Optional[str] = None
    addons: Optional[List[str]] = None
    wait_selector_state: SelectorWaitStates = "attached"
    cookies: Optional[List[Dict]] = None
    google_search: bool = True
    extra_headers: Optional[Dict[str, str]] = None
    # The default value for proxy in Playwright's source is `None`
    proxy: Optional[str | Dict[str, str]] = None
    os_randomize: bool = False
    disable_ads: bool = False
    geoip: bool = False
    selector_config: Optional[Dict] = None
    additional_args: Optional[Dict] = None

    def __post_init__(self):
        """Run the checks msgspec cannot express and normalize empty values.

        :raises ValueError: If ``max_pages`` is outside 1..50, ``timeout`` is negative,
            or an addon path exists but is not a directory.
        :raises TypeError: If ``page_action`` is set but is not callable.
        :raises FileNotFoundError: If an addon path does not exist.
        """
        if not 1 <= self.max_pages <= 50:
            raise ValueError("max_pages must be between 1 and 50")
        if self.timeout < 0:
            raise ValueError("timeout must be >= 0")

        action = self.page_action
        if not (action is None or callable(action)):
            raise TypeError(
                f"page_action must be callable, got {type(action).__name__}"
            )

        if self.proxy:
            self.proxy = construct_proxy_dict(self.proxy, as_tuple=True)

        # An unset/empty addons value becomes an empty list; otherwise every entry
        # must point at an existing directory
        self.addons = self.addons or []
        for addon in self.addons:
            addon_dir = Path(addon)
            if not addon_dir.exists():
                raise FileNotFoundError(f"Addon's path not found: {addon}")
            if not addon_dir.is_dir():
                raise ValueError(
                    f"Addon's path is not a folder, you need to pass a folder of the extracted addon: {addon}"
                )

        self.cookies = self.cookies or []
        if self.solve_cloudflare:
            # Enforce a minimum timeout of 60 seconds (in ms) when solving Cloudflare
            self.timeout = max(self.timeout, 60_000)
        self.selector_config = self.selector_config or {}
        self.additional_args = self.additional_args or {}
142
+
143
+
144
def validate(params, model):
    """Convert ``params`` into an instance of ``model`` using msgspec's ``convert``.

    :param params: The raw arguments to validate.
    :param model: The target type passed to ``convert`` (e.g., one of the config structs).
    :return: The converted configuration object.
    :raises TypeError: If msgspec reports a ``ValidationError``; the original
        error is chained as the cause.
    """
    try:
        config = convert(params, model)
    except ValidationError as e:
        raise TypeError(f"Invalid argument type: {e}") from e

    return config
@@ -1,92 +1,108 @@
1
1
  # Disable loading these resources for speed
2
2
  DEFAULT_DISABLED_RESOURCES = {
3
- 'font',
4
- 'image',
5
- 'media',
6
- 'beacon',
7
- 'object',
8
- 'imageset',
9
- 'texttrack',
10
- 'websocket',
11
- 'csp_report',
12
- 'stylesheet',
3
+ "font",
4
+ "image",
5
+ "media",
6
+ "beacon",
7
+ "object",
8
+ "imageset",
9
+ "texttrack",
10
+ "websocket",
11
+ "csp_report",
12
+ "stylesheet",
13
13
  }
14
14
 
15
+ HARMFUL_DEFAULT_ARGS = (
16
+ # These default args will be ignored to reduce detection and possibly avoid abuse of the popup-crashing bug: https://issues.chromium.org/issues/340836884
17
+ "--enable-automation",
18
+ "--disable-popup-blocking",
19
+ # '--disable-component-update',
20
+ # '--disable-default-apps',
21
+ # '--disable-extensions',
22
+ )
23
+
24
+ DEFAULT_FLAGS = (
25
+ # Speed up chromium browsers by default
26
+ "--no-pings",
27
+ "--no-first-run",
28
+ "--disable-infobars",
29
+ "--disable-breakpad",
30
+ "--no-service-autorun",
31
+ "--homepage=about:blank",
32
+ "--password-store=basic",
33
+ "--no-default-browser-check",
34
+ "--disable-session-crashed-bubble",
35
+ "--disable-search-engine-choice-screen",
36
+ )
37
+
15
38
  DEFAULT_STEALTH_FLAGS = (
16
39
  # Explanation: https://peter.sh/experiments/chromium-command-line-switches/
17
40
  # Generally this will make the browser faster and less detectable
18
- '--no-pings',
19
- '--incognito',
20
- '--test-type',
21
- '--lang=en-US',
22
- '--mute-audio',
23
- '--no-first-run',
24
- '--disable-sync',
25
- '--hide-scrollbars',
26
- '--disable-logging',
27
- '--start-maximized', # For headless check bypass
28
- '--enable-async-dns',
29
- '--disable-breakpad',
30
- '--disable-infobars',
31
- '--accept-lang=en-US',
32
- '--use-mock-keychain',
33
- '--disable-translate',
34
- '--disable-extensions',
35
- '--disable-voice-input',
36
- '--window-position=0,0',
37
- '--disable-wake-on-wifi',
38
- '--ignore-gpu-blocklist',
39
- '--enable-tcp-fast-open',
40
- '--enable-web-bluetooth',
41
- '--disable-hang-monitor',
42
- '--password-store=basic',
43
- '--disable-cloud-import',
44
- '--disable-default-apps',
45
- '--disable-print-preview',
46
- '--disable-dev-shm-usage',
41
+ "--incognito",
42
+ "--test-type",
43
+ "--lang=en-US",
44
+ "--mute-audio",
45
+ "--disable-sync",
46
+ "--hide-scrollbars",
47
+ "--disable-logging",
48
+ "--start-maximized", # For headless check bypass
49
+ "--enable-async-dns",
50
+ "--accept-lang=en-US",
51
+ "--use-mock-keychain",
52
+ "--disable-translate",
53
+ "--disable-extensions",
54
+ "--disable-voice-input",
55
+ "--window-position=0,0",
56
+ "--disable-wake-on-wifi",
57
+ "--ignore-gpu-blocklist",
58
+ "--enable-tcp-fast-open",
59
+ "--enable-web-bluetooth",
60
+ "--disable-hang-monitor",
61
+ "--disable-cloud-import",
62
+ "--disable-default-apps",
63
+ "--disable-print-preview",
64
+ "--disable-dev-shm-usage",
47
65
  # '--disable-popup-blocking',
48
- '--metrics-recording-only',
49
- '--disable-crash-reporter',
50
- '--disable-partial-raster',
51
- '--disable-gesture-typing',
52
- '--disable-checker-imaging',
53
- '--disable-prompt-on-repost',
54
- '--force-color-profile=srgb',
55
- '--font-render-hinting=none',
56
- '--no-default-browser-check',
57
- '--aggressive-cache-discard',
58
- '--disable-component-update',
59
- '--disable-cookie-encryption',
60
- '--disable-domain-reliability',
61
- '--disable-threaded-animation',
62
- '--disable-threaded-scrolling',
63
- # '--disable-reading-from-canvas', # For Firefox
64
- '--enable-simple-cache-backend',
65
- '--disable-background-networking',
66
- '--disable-session-crashed-bubble',
67
- '--enable-surface-synchronization',
68
- '--disable-image-animation-resync',
69
- '--disable-renderer-backgrounding',
70
- '--disable-ipc-flooding-protection',
71
- '--prerender-from-omnibox=disabled',
72
- '--safebrowsing-disable-auto-update',
73
- '--disable-offer-upload-credit-cards',
74
- '--disable-features=site-per-process',
75
- '--disable-background-timer-throttling',
76
- '--disable-new-content-rendering-timeout',
77
- '--run-all-compositor-stages-before-draw',
78
- '--disable-client-side-phishing-detection',
79
- '--disable-backgrounding-occluded-windows',
80
- '--disable-layer-tree-host-memory-pressure',
81
- '--autoplay-policy=no-user-gesture-required',
82
- '--disable-offer-store-unmasked-wallet-cards',
83
- '--disable-blink-features=AutomationControlled',
84
- '--webrtc-ip-handling-policy=disable_non_proxied_udp',
85
- '--disable-component-extensions-with-background-pages',
86
- '--force-webrtc-ip-handling-policy=disable_non_proxied_udp',
87
- '--enable-features=NetworkService,NetworkServiceInProcess,TrustTokens,TrustTokensAlwaysAllowIssuance',
88
- '--blink-settings=primaryHoverType=2,availableHoverTypes=2,primaryPointerType=4,availablePointerTypes=4',
89
- '--disable-features=AudioServiceOutOfProcess,IsolateOrigins,site-per-process,TranslateUI,BlinkGenPropertyTrees',
66
+ "--metrics-recording-only",
67
+ "--disable-crash-reporter",
68
+ "--disable-partial-raster",
69
+ "--disable-gesture-typing",
70
+ "--disable-checker-imaging",
71
+ "--disable-prompt-on-repost",
72
+ "--force-color-profile=srgb",
73
+ "--font-render-hinting=none",
74
+ "--aggressive-cache-discard",
75
+ "--disable-component-update",
76
+ "--disable-cookie-encryption",
77
+ "--disable-domain-reliability",
78
+ "--disable-threaded-animation",
79
+ "--disable-threaded-scrolling",
80
+ # '--disable-reading-from-canvas', # For Firefox
81
+ "--enable-simple-cache-backend",
82
+ "--disable-background-networking",
83
+ "--enable-surface-synchronization",
84
+ "--disable-image-animation-resync",
85
+ "--disable-renderer-backgrounding",
86
+ "--disable-ipc-flooding-protection",
87
+ "--prerender-from-omnibox=disabled",
88
+ "--safebrowsing-disable-auto-update",
89
+ "--disable-offer-upload-credit-cards",
90
+ "--disable-features=site-per-process",
91
+ "--disable-background-timer-throttling",
92
+ "--disable-new-content-rendering-timeout",
93
+ "--run-all-compositor-stages-before-draw",
94
+ "--disable-client-side-phishing-detection",
95
+ "--disable-backgrounding-occluded-windows",
96
+ "--disable-layer-tree-host-memory-pressure",
97
+ "--autoplay-policy=no-user-gesture-required",
98
+ "--disable-offer-store-unmasked-wallet-cards",
99
+ "--disable-blink-features=AutomationControlled",
100
+ "--webrtc-ip-handling-policy=disable_non_proxied_udp",
101
+ "--disable-component-extensions-with-background-pages",
102
+ "--force-webrtc-ip-handling-policy=disable_non_proxied_udp",
103
+ "--enable-features=NetworkService,NetworkServiceInProcess,TrustTokens,TrustTokensAlwaysAllowIssuance",
104
+ "--blink-settings=primaryHoverType=2,availableHoverTypes=2,primaryPointerType=4,availablePointerTypes=4",
105
+ "--disable-features=AudioServiceOutOfProcess,IsolateOrigins,site-per-process,TranslateUI,BlinkGenPropertyTrees",
90
106
  )
91
107
 
92
108
  # Defaulting to the docker mode, token doesn't matter in it as it's passed for the container
@@ -95,13 +111,10 @@ NSTBROWSER_DEFAULT_QUERY = {
95
111
  "headless": True,
96
112
  "autoClose": True,
97
113
  "fingerprint": {
98
- "flags": {
99
- "timezone": "BasedOnIp",
100
- "screen": "Custom"
101
- },
102
- "platform": 'linux', # support: windows, mac, linux
103
- "kernel": 'chromium', # only support: chromium
104
- "kernelMilestone": '128',
114
+ "flags": {"timezone": "BasedOnIp", "screen": "Custom"},
115
+ "platform": "linux", # support: windows, mac, linux
116
+ "kernel": "chromium", # only support: chromium
117
+ "kernelMilestone": "128",
105
118
  "hardwareConcurrency": 8,
106
119
  "deviceMemory": 8,
107
120
  },