scrapling 0.3.6__py3-none-any.whl → 0.3.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +1 -1
- scrapling/core/_types.py +3 -0
- scrapling/core/ai.py +2 -1
- scrapling/core/custom_types.py +20 -27
- scrapling/core/mixins.py +15 -9
- scrapling/core/shell.py +4 -3
- scrapling/core/storage.py +5 -5
- scrapling/core/translator.py +13 -8
- scrapling/engines/_browsers/_base.py +175 -21
- scrapling/engines/_browsers/_camoufox.py +95 -171
- scrapling/engines/_browsers/_config_tools.py +9 -3
- scrapling/engines/_browsers/_controllers.py +51 -101
- scrapling/engines/_browsers/_validators.py +95 -63
- scrapling/engines/static.py +678 -668
- scrapling/engines/toolbelt/convertor.py +48 -15
- scrapling/engines/toolbelt/custom.py +6 -21
- scrapling/engines/toolbelt/fingerprints.py +14 -9
- scrapling/engines/toolbelt/navigation.py +11 -3
- scrapling/fetchers/__init__.py +11 -1
- scrapling/fetchers/chrome.py +15 -4
- scrapling/fetchers/firefox.py +0 -4
- scrapling/parser.py +105 -80
- {scrapling-0.3.6.dist-info → scrapling-0.3.8.dist-info}/METADATA +7 -6
- scrapling-0.3.8.dist-info/RECORD +47 -0
- scrapling-0.3.6.dist-info/RECORD +0 -47
- {scrapling-0.3.6.dist-info → scrapling-0.3.8.dist-info}/WHEEL +0 -0
- {scrapling-0.3.6.dist-info → scrapling-0.3.8.dist-info}/entry_points.txt +0 -0
- {scrapling-0.3.6.dist-info → scrapling-0.3.8.dist-info}/licenses/LICENSE +0 -0
- {scrapling-0.3.6.dist-info → scrapling-0.3.8.dist-info}/top_level.txt +0 -0
scrapling/__init__.py
CHANGED
scrapling/core/_types.py
CHANGED
```diff
@@ -12,9 +12,11 @@ from typing import (
     Generator,
     Iterable,
     List,
+    Set,
     Literal,
     Optional,
     Pattern,
+    Sequence,
     Tuple,
     TypeVar,
     Union,
@@ -22,6 +24,7 @@ from typing import (
     Mapping,
     Awaitable,
     Protocol,
+    Coroutine,
     SupportsIndex,
 )
 
```
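The `_types` module is scrapling's single re-export point for typing names, so the new `Set`, `Sequence`, and `Coroutine` entries become importable package-wide. A minimal sketch of a downstream consumer (the function is hypothetical, not from the package):

```python
# Hypothetical consumer of the re-exported names; only the import path is real.
from scrapling.core._types import Coroutine, Sequence, Set

def schedule(pending: Sequence[Coroutine], seen: Set[str]) -> int:
    # Illustrative only: count coroutines not yet marked as seen.
    return sum(1 for task in pending if repr(task) not in seen)
```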
scrapling/core/ai.py
CHANGED
```diff
@@ -20,6 +20,7 @@ from scrapling.core._types import (
     Mapping,
     Dict,
     List,
+    Any,
     SelectorWaitStates,
     Generator,
 )
@@ -171,7 +172,7 @@ class ScraplingMCPServer:
         :param stealthy_headers: If enabled (default), it creates and adds real browser headers. It also sets the referer header as if this request came from a Google search of URL's domain.
         """
         async with FetcherSession() as session:
-            tasks = [
+            tasks: List[Any] = [
                 session.get(
                     url,
                     auth=auth,
```
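The annotation fix matters because a list of un-awaited coroutine objects has no useful inferred element type; pinning it to `List[Any]` keeps type checkers quiet around the later gather. A self-contained sketch of the same pattern, with a stand-in for `session.get` (the `fetch_all` helper is illustrative, not scrapling API):

```python
import asyncio
from typing import Any, List

async def fetch_all(urls: List[str]) -> List[str]:
    async def get(url: str) -> str:  # stand-in for session.get(url, ...)
        return url

    tasks: List[Any] = [get(url) for url in urls]  # same annotation as in the hunk above
    return await asyncio.gather(*tasks)

print(asyncio.run(fetch_all(["https://a.example", "https://b.example"])))
```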
scrapling/core/custom_types.py
CHANGED
```diff
@@ -5,6 +5,7 @@ from re import compile as re_compile, UNICODE, IGNORECASE
 from orjson import dumps, loads
 
 from scrapling.core._types import (
+    Any,
     cast,
     Dict,
     List,
@@ -14,7 +15,6 @@ from scrapling.core._types import (
     Literal,
     Pattern,
     Iterable,
-    Optional,
     Generator,
     SupportsIndex,
 )
@@ -33,23 +33,20 @@ class TextHandler(str):
 
     def __getitem__(self, key: SupportsIndex | slice) -> "TextHandler":  # pragma: no cover
         lst = super().__getitem__(key)
-        return
+        return TextHandler(lst)
 
-    def split(
-
-
-
-            [TextHandler(s) for s in super().split(sep, maxsplit)],
-        )
-    )
+    def split(
+        self, sep: str | None = None, maxsplit: SupportsIndex = -1
+    ) -> Union[List, "TextHandlers"]:  # pragma: no cover
+        return TextHandlers([TextHandler(s) for s in super().split(sep, maxsplit)])
 
-    def strip(self, chars: str = None) -> Union[str, "TextHandler"]:  # pragma: no cover
+    def strip(self, chars: str | None = None) -> Union[str, "TextHandler"]:  # pragma: no cover
         return TextHandler(super().strip(chars))
 
-    def lstrip(self, chars: str = None) -> Union[str, "TextHandler"]:  # pragma: no cover
+    def lstrip(self, chars: str | None = None) -> Union[str, "TextHandler"]:  # pragma: no cover
         return TextHandler(super().lstrip(chars))
 
-    def rstrip(self, chars: str = None) -> Union[str, "TextHandler"]:  # pragma: no cover
+    def rstrip(self, chars: str | None = None) -> Union[str, "TextHandler"]:  # pragma: no cover
         return TextHandler(super().rstrip(chars))
 
     def capitalize(self) -> Union[str, "TextHandler"]:  # pragma: no cover
@@ -64,7 +61,7 @@ class TextHandler(str):
     def expandtabs(self, tabsize: SupportsIndex = 8) -> Union[str, "TextHandler"]:  # pragma: no cover
         return TextHandler(super().expandtabs(tabsize))
 
-    def format(self, *args:
+    def format(self, *args: object, **kwargs: str) -> Union[str, "TextHandler"]:  # pragma: no cover
         return TextHandler(super().format(*args, **kwargs))
 
     def format_map(self, mapping) -> Union[str, "TextHandler"]:  # pragma: no cover
@@ -131,10 +128,11 @@ class TextHandler(str):
     def re(
         self,
         regex: str | Pattern,
-        check_match: Literal[True],
         replace_entities: bool = True,
         clean_match: bool = False,
         case_sensitive: bool = True,
+        *,
+        check_match: Literal[True],
     ) -> bool: ...
 
     @overload
@@ -179,19 +177,14 @@ class TextHandler(str):
         results = flatten(results)
 
         if not replace_entities:
-            return TextHandlers(
+            return TextHandlers([TextHandler(string) for string in results])
 
-        return TextHandlers(
-            cast(
-                List[_TextHandlerType],
-                [TextHandler(_replace_entities(s)) for s in results],
-            )
-        )
+        return TextHandlers([TextHandler(_replace_entities(s)) for s in results])
 
     def re_first(
         self,
         regex: str | Pattern,
-        default=None,
+        default: Any = None,
         replace_entities: bool = True,
         clean_match: bool = False,
         case_sensitive: bool = True,
@@ -232,8 +225,8 @@ class TextHandlers(List[TextHandler]):
     def __getitem__(self, pos: SupportsIndex | slice) -> Union[TextHandler, "TextHandlers"]:
         lst = super().__getitem__(pos)
         if isinstance(pos, slice):
-            return TextHandlers(cast(List[
-        return cast(
+            return TextHandlers(cast(List[TextHandler], lst))
+        return TextHandler(cast(TextHandler, lst))
 
     def re(
         self,
@@ -256,7 +249,7 @@ class TextHandlers(List[TextHandler]):
     def re_first(
         self,
         regex: str | Pattern,
-        default=None,
+        default: Any = None,
         replace_entities: bool = True,
         clean_match: bool = False,
         case_sensitive: bool = True,
@@ -309,9 +302,9 @@ class AttributesHandler(Mapping[str, _TextHandlerType]):
         )
 
         # Fastest read-only mapping type
-        self._data = MappingProxyType(mapping)
+        self._data: Mapping[str, Any] = MappingProxyType(mapping)
 
-    def get(self, key: str, default:
+    def get(self, key: str, default: Any = None) -> _TextHandlerType:
         """Acts like the standard dictionary `.get()` method"""
         return self._data.get(key, default)
 
```
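The common thread in these hunks: `TextHandler` subclasses `str`, and every method that would otherwise return a plain `str` re-wraps its result so chained calls keep the rich type; the `split`/`__getitem__` fixes restore exactly that. A self-contained sketch of the technique (the class name is illustrative):

```python
class RichStr(str):
    """Minimal str subclass that survives strip()/split(), like TextHandler."""

    def strip(self, chars: str | None = None) -> "RichStr":
        return RichStr(super().strip(chars))

    def split(self, sep: str | None = None, maxsplit: int = -1) -> list["RichStr"]:
        return [RichStr(s) for s in super().split(sep, maxsplit)]


parts = RichStr("  a,b  ").strip().split(",")
print([type(p).__name__ for p in parts])  # ['RichStr', 'RichStr']
```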
scrapling/core/mixins.py
CHANGED
```diff
@@ -1,3 +1,9 @@
+from scrapling.core._types import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from scrapling.parser import Selector
+
+
 class SelectorsGeneration:
     """
     Functions for generating selectors
@@ -5,7 +11,7 @@ class SelectorsGeneration:
     Inspiration: https://searchfox.org/mozilla-central/source/devtools/shared/inspector/css-logic.js#591
     """
 
-    def
+    def _general_selection(self: "Selector", selection: str = "css", full_path: bool = False) -> str:  # type: ignore[name-defined]
         """Generate a selector for the current element.
         :return: A string of the generated selector.
         """
@@ -47,29 +53,29 @@ class SelectorsGeneration:
         return " > ".join(reversed(selectorPath)) if css else "//" + "/".join(reversed(selectorPath))
 
     @property
-    def generate_css_selector(self) -> str:
+    def generate_css_selector(self: "Selector") -> str:  # type: ignore[name-defined]
         """Generate a CSS selector for the current element
         :return: A string of the generated selector.
         """
-        return self.
+        return self._general_selection()
 
     @property
-    def generate_full_css_selector(self) -> str:
+    def generate_full_css_selector(self: "Selector") -> str:  # type: ignore[name-defined]
         """Generate a complete CSS selector for the current element
         :return: A string of the generated selector.
         """
-        return self.
+        return self._general_selection(full_path=True)
 
     @property
-    def generate_xpath_selector(self) -> str:
+    def generate_xpath_selector(self: "Selector") -> str:  # type: ignore[name-defined]
         """Generate an XPath selector for the current element
         :return: A string of the generated selector.
         """
-        return self.
+        return self._general_selection("xpath")
 
     @property
-    def generate_full_xpath_selector(self) -> str:
+    def generate_full_xpath_selector(self: "Selector") -> str:  # type: ignore[name-defined]
         """Generate a complete XPath selector for the current element
         :return: A string of the generated selector.
         """
-        return self.
+        return self._general_selection("xpath", full_path=True)
```
scrapling/core/shell.py
CHANGED
```diff
@@ -31,6 +31,7 @@ from scrapling.core._types import (
     Optional,
     Dict,
     Any,
+    cast,
     extraction_types,
     Generator,
 )
@@ -540,15 +541,15 @@ class Convertor:
             raise ValueError(f"Unknown extraction type: {extraction_type}")
         else:
             if main_content_only:
-                page = page.css_first("body") or page
+                page = cast(Selector, page.css_first("body")) or page
 
-            pages = [page] if not css_selector else page.css(css_selector)
+            pages = [page] if not css_selector else cast(Selectors, page.css(css_selector))
             for page in pages:
                 match extraction_type:
                     case "markdown":
                         yield cls._convert_to_markdown(page.html_content)
                     case "html":
-                        yield page.
+                        yield page.html_content
                     case "text":
                         txt_content = page.get_all_text(strip=True)
                         for s in (
```
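Worth noting for readers unfamiliar with `typing.cast`: it only re-labels a value for the type checker and performs no runtime check or conversion, which is why wrapping the `css_first`/`css` results here is free:

```python
from typing import cast

value: object = "hello"
text = cast(str, value)  # purely static; nothing happens at runtime
assert text is value
```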
scrapling/core/storage.py
CHANGED
```diff
@@ -56,13 +56,13 @@ class StorageSystemMixin(ABC):  # pragma: no cover
     @lru_cache(128, typed=True)
     def _get_hash(identifier: str) -> str:
         """If you want to hash identifier in your storage system, use this safer"""
-
-        if isinstance(
+        _identifier = identifier.lower().strip()
+        if isinstance(_identifier, str):
             # Hash functions have to take bytes
-
+            _identifier = _identifier.encode("utf-8")
 
-        hash_value = sha256(
-        return f"{hash_value}_{len(
+        hash_value = sha256(_identifier).hexdigest()
+        return f"{hash_value}_{len(_identifier)}"  # Length to reduce collision chance
 
 
     @lru_cache(1, typed=True)
```
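The rebuilt `_get_hash` normalizes, encodes, hashes, and appends the byte length. An equivalent standalone sketch (the `isinstance` guard above is always true once `.lower().strip()` has returned a `str`, so it is folded away here):

```python
from hashlib import sha256

def get_hash(identifier: str) -> str:
    normalized = identifier.lower().strip().encode("utf-8")
    # The appended length reduces the chance of two identifiers colliding.
    return f"{sha256(normalized).hexdigest()}_{len(normalized)}"

print(get_hash("Example.com "))  # 64 hex chars followed by "_11"
```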
scrapling/core/translator.py
CHANGED
```diff
@@ -10,24 +10,23 @@ So you don't have to learn a new selectors/api method like what bs4 done with so
 
 from functools import lru_cache
 
-from cssselect.xpath import ExpressionError
-from cssselect.xpath import XPathExpr as OriginalXPathExpr
 from cssselect import HTMLTranslator as OriginalHTMLTranslator
+from cssselect.xpath import ExpressionError, XPathExpr as OriginalXPathExpr
 from cssselect.parser import Element, FunctionalPseudoElement, PseudoElement
 
-from scrapling.core._types import Any,
+from scrapling.core._types import Any, Protocol, Self
 
 
 class XPathExpr(OriginalXPathExpr):
     textnode: bool = False
-    attribute:
+    attribute: str | None = None
 
     @classmethod
     def from_xpath(
         cls,
         xpath: OriginalXPathExpr,
         textnode: bool = False,
-        attribute:
+        attribute: str | None = None,
     ) -> Self:
         x = cls(path=xpath.path, element=xpath.element, condition=xpath.condition)
         x.textnode = textnode
@@ -71,10 +70,10 @@ class XPathExpr(OriginalXPathExpr):
 
 # e.g. cssselect.GenericTranslator, cssselect.HTMLTranslator
 class TranslatorProtocol(Protocol):
-    def xpath_element(self, selector: Element) -> OriginalXPathExpr:  # pragma: no cover
+    def xpath_element(self, selector: Element) -> OriginalXPathExpr:  # pyright: ignore  # pragma: no cover
         pass
 
-    def css_to_xpath(self, css: str, prefix: str = ...) -> str:  # pragma: no cover
+    def css_to_xpath(self, css: str, prefix: str = ...) -> str:  # pyright: ignore  # pragma: no cover
         pass
 
 
@@ -121,9 +120,15 @@ class TranslatorMixin:
 
 
 class HTMLTranslator(TranslatorMixin, OriginalHTMLTranslator):
-    @lru_cache(maxsize=256)
     def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str:
         return super().css_to_xpath(css, prefix)
 
 
 translator = HTMLTranslator()
+# Using a function instead of the translator directly to avoid Pyright override error
+
+
+@lru_cache(maxsize=256)
+def css_to_xpath(query: str) -> str:
+    """Return translated XPath version of a given CSS query"""
+    return translator.css_to_xpath(query)
```
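The `@lru_cache` moved from the overriding method to a module-level function, which sidesteps the Pyright override complaint the new comment mentions (and avoids caching on `self`). Expected usage, with an output shape that is illustrative rather than exact:

```python
from scrapling.core.translator import css_to_xpath

print(css_to_xpath("div.post > a"))
# Something like:
# descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' post ')]/a
```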
scrapling/engines/_browsers/_base.py
CHANGED

```diff
@@ -2,19 +2,27 @@ from time import time
 from asyncio import sleep as asyncio_sleep, Lock
 
 from camoufox import DefaultAddons
-from playwright.sync_api import
+from playwright.sync_api import (
+    Page,
+    Frame,
+    BrowserContext,
+    Playwright,
+    Response as SyncPlaywrightResponse,
+)
 from playwright.async_api import (
-
+    Page as AsyncPage,
+    Frame as AsyncFrame,
     Playwright as AsyncPlaywright,
+    Response as AsyncPlaywrightResponse,
+    BrowserContext as AsyncBrowserContext,
 )
-from
-
-
-)
+from playwright._impl._errors import Error as PlaywrightError
+from camoufox.pkgman import installed_verstr as camoufox_version
+from camoufox.utils import launch_options as generate_launch_options
 
 from ._page import PageInfo, PagePool
 from scrapling.parser import Selector
-from scrapling.core._types import Dict, Optional
+from scrapling.core._types import Any, cast, Dict, List, Optional, Callable, TYPE_CHECKING
 from scrapling.engines.toolbelt.fingerprints import get_os_name
 from ._validators import validate, PlaywrightConfig, CamoufoxConfig
 from ._config_tools import _compiled_stealth_scripts, _launch_kwargs, _context_kwargs
@@ -28,10 +36,35 @@ class SyncSession:
         self.max_pages = max_pages
         self.page_pool = PagePool(max_pages)
         self._max_wait_for_page = 60
-        self.playwright:
-        self.context:
+        self.playwright: Playwright | Any = None
+        self.context: BrowserContext | Any = None
         self._closed = False
 
+    def __create__(self):
+        pass
+
+    def close(self):  # pragma: no cover
+        """Close all resources"""
+        if self._closed:
+            return
+
+        if self.context:
+            self.context.close()
+            self.context = None
+
+        if self.playwright:
+            self.playwright.stop()
+            self.playwright = None  # pyright: ignore
+
+        self._closed = True
+
+    def __enter__(self):
+        self.__create__()
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.close()
+
     def _get_page(
         self,
         timeout: int | float,
@@ -41,6 +74,7 @@ class SyncSession:
         """Get a new page to use"""
 
         # No need to check if a page is available or not in sync code because the code blocked before reaching here till the page closed, ofc.
+        assert self.context is not None, "Browser context not initialized"
         page = self.context.new_page()
         page.set_default_navigation_timeout(timeout)
         page.set_default_timeout(timeout)
@@ -54,7 +88,9 @@ class SyncSession:
         for script in _compiled_stealth_scripts():
            page.add_init_script(script=script)
 
-
+        page_info = self.page_pool.add_page(page)
+        page_info.mark_busy()
+        return page_info
 
     def get_pool_stats(self) -> Dict[str, int]:
         """Get statistics about the current page pool"""
@@ -64,14 +100,76 @@ class SyncSession:
             "max_pages": self.max_pages,
         }
 
+    @staticmethod
+    def _wait_for_networkidle(page: Page | Frame, timeout: Optional[int] = None):
+        """Wait for the page to become idle (no network activity) even if there are never-ending requests."""
+        try:
+            page.wait_for_load_state("networkidle", timeout=timeout)
+        except PlaywrightError:
+            pass
+
+    def _wait_for_page_stability(self, page: Page | Frame, load_dom: bool, network_idle: bool):
+        page.wait_for_load_state(state="load")
+        if load_dom:
+            page.wait_for_load_state(state="domcontentloaded")
+        if network_idle:
+            self._wait_for_networkidle(page)
 
-
+    @staticmethod
+    def _create_response_handler(page_info: PageInfo, response_container: List) -> Callable:
+        """Create a response handler that captures the final navigation response.
+
+        :param page_info: The PageInfo object containing the page
+        :param response_container: A list to store the final response (mutable container)
+        :return: A callback function for page.on("response", ...)
+        """
+
+        def handle_response(finished_response: SyncPlaywrightResponse):
+            if (
+                finished_response.request.resource_type == "document"
+                and finished_response.request.is_navigation_request()
+                and finished_response.request.frame == page_info.page.main_frame
+            ):
+                response_container[0] = finished_response
+
+        return handle_response
+
+
+class AsyncSession:
     def __init__(self, max_pages: int = 1):
-
-        self.
-        self.
+        self.max_pages = max_pages
+        self.page_pool = PagePool(max_pages)
+        self._max_wait_for_page = 60
+        self.playwright: AsyncPlaywright | Any = None
+        self.context: AsyncBrowserContext | Any = None
+        self._closed = False
         self._lock = Lock()
 
+    async def __create__(self):
+        pass
+
+    async def close(self):
+        """Close all resources"""
+        if self._closed:  # pragma: no cover
+            return
+
+        if self.context:
+            await self.context.close()
+            self.context = None  # pyright: ignore
+
+        if self.playwright:
+            await self.playwright.stop()
+            self.playwright = None  # pyright: ignore
+
+        self._closed = True
+
+    async def __aenter__(self):
+        await self.__create__()
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        await self.close()
+
     async def _get_page(
         self,
         timeout: int | float,
@@ -79,6 +177,9 @@ class AsyncSession(SyncSession):
         disable_resources: bool,
     ) -> PageInfo:  # pragma: no cover
         """Get a new page to use"""
+        if TYPE_CHECKING:
+            assert self.context is not None, "Browser context not initialized"
+
         async with self._lock:
             # If we're at max capacity after cleanup, wait for busy pages to finish
             if self.page_pool.pages_count >= self.max_pages:
@@ -107,6 +208,48 @@ class AsyncSession(SyncSession):
 
         return self.page_pool.add_page(page)
 
+    def get_pool_stats(self) -> Dict[str, int]:
+        """Get statistics about the current page pool"""
+        return {
+            "total_pages": self.page_pool.pages_count,
+            "busy_pages": self.page_pool.busy_count,
+            "max_pages": self.max_pages,
+        }
+
+    @staticmethod
+    async def _wait_for_networkidle(page: AsyncPage | AsyncFrame, timeout: Optional[int] = None):
+        """Wait for the page to become idle (no network activity) even if there are never-ending requests."""
+        try:
+            await page.wait_for_load_state("networkidle", timeout=timeout)
+        except PlaywrightError:
+            pass
+
+    async def _wait_for_page_stability(self, page: AsyncPage | AsyncFrame, load_dom: bool, network_idle: bool):
+        await page.wait_for_load_state(state="load")
+        if load_dom:
+            await page.wait_for_load_state(state="domcontentloaded")
+        if network_idle:
+            await self._wait_for_networkidle(page)
+
+    @staticmethod
+    def _create_response_handler(page_info: PageInfo, response_container: List) -> Callable:
+        """Create an async response handler that captures the final navigation response.
+
+        :param page_info: The PageInfo object containing the page
+        :param response_container: A list to store the final response (mutable container)
+        :return: A callback function for page.on("response", ...)
+        """
+
+        async def handle_response(finished_response: AsyncPlaywrightResponse):
+            if (
+                finished_response.request.resource_type == "document"
+                and finished_response.request.is_navigation_request()
+                and finished_response.request.frame == page_info.page.main_frame
+            ):
+                response_container[0] = finished_response
+
+        return handle_response
+
 
 class DynamicSessionMixin:
     def __validate__(self, **params):
@@ -133,12 +276,18 @@ class DynamicSessionMixin:
         self.wait_selector = config.wait_selector
         self.init_script = config.init_script
         self.wait_selector_state = config.wait_selector_state
+        self.extra_flags = config.extra_flags
         self.selector_config = config.selector_config
+        self.additional_args = config.additional_args
         self.page_action = config.page_action
-        self.
+        self.user_data_dir = config.user_data_dir
+        self._headers_keys = {header.lower() for header in self.extra_headers.keys()} if self.extra_headers else set()
         self.__initiate_browser_options__()
 
     def __initiate_browser_options__(self):
+        if TYPE_CHECKING:
+            assert isinstance(self.proxy, tuple)
+
         if not self.cdp_url:
             # `launch_options` is used with persistent context
             self.launch_options = dict(
@@ -152,10 +301,13 @@ class DynamicSessionMixin:
                     self.stealth,
                     self.hide_canvas,
                     self.disable_webgl,
+                    tuple(self.extra_flags) if self.extra_flags else tuple(),
                 )
             )
             self.launch_options["extra_http_headers"] = dict(self.launch_options["extra_http_headers"])
             self.launch_options["proxy"] = dict(self.launch_options["proxy"]) or None
+            self.launch_options["user_data_dir"] = self.user_data_dir
+            self.launch_options.update(cast(Dict, self.additional_args))
             self.context_options = dict()
         else:
             # while `context_options` is left to be used when cdp mode is enabled
@@ -171,11 +323,12 @@ class DynamicSessionMixin:
             )
             self.context_options["extra_http_headers"] = dict(self.context_options["extra_http_headers"])
             self.context_options["proxy"] = dict(self.context_options["proxy"]) or None
+            self.context_options.update(cast(Dict, self.additional_args))
 
 
 class StealthySessionMixin:
     def __validate__(self, **params):
-        config = validate(params, model=CamoufoxConfig)
+        config: CamoufoxConfig = validate(params, model=CamoufoxConfig)
 
         self.max_pages = config.max_pages
         self.headless = config.headless
@@ -204,15 +357,16 @@ class StealthySessionMixin:
         self.selector_config = config.selector_config
         self.additional_args = config.additional_args
         self.page_action = config.page_action
-        self.
+        self.user_data_dir = config.user_data_dir
+        self._headers_keys = {header.lower() for header in self.extra_headers.keys()} if self.extra_headers else set()
         self.__initiate_browser_options__()
 
     def __initiate_browser_options__(self):
         """Initiate browser options."""
-        self.launch_options = generate_launch_options(
+        self.launch_options: Dict[str, Any] = generate_launch_options(
             **{
                 "geoip": self.geoip,
-                "proxy": dict(self.proxy) if self.proxy else self.proxy,
+                "proxy": dict(self.proxy) if self.proxy and isinstance(self.proxy, tuple) else self.proxy,
                 "addons": self.addons,
                 "exclude_addons": [] if self.disable_ads else [DefaultAddons.UBO],
                 "headless": self.headless,
@@ -222,7 +376,7 @@ class StealthySessionMixin:
                 "block_webrtc": self.block_webrtc,
                 "block_images": self.block_images,  # Careful! it makes some websites don't finish loading at all like stackoverflow even in headful mode.
                 "os": None if self.os_randomize else get_os_name(),
-                "user_data_dir":
+                "user_data_dir": self.user_data_dir,
                 "ff_version": __ff_version_str__,
                 "firefox_user_prefs": {
                     # This is what enabling `enable_cache` does internally, so we do it from here instead
@@ -232,7 +386,7 @@ class StealthySessionMixin:
                     "browser.cache.disk_cache_ssl": True,
                     "browser.cache.disk.smart_size.enabled": True,
                 },
-                **self.additional_args,
+                **cast(Dict, self.additional_args),
             }
         )
 
```
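The structural headline of this file: both session classes now own an explicit lifecycle (`__create__`, `close`, and context-manager hooks), and `AsyncSession` no longer inherits from `SyncSession`, duplicating the pool and stability helpers with awaited equivalents instead. A hedged sketch of the resulting usage, assuming `PagePool` starts empty and that concrete subclasses override `__create__` to actually launch a browser:

```python
from scrapling.engines._browsers._base import SyncSession

# The base __create__ is a no-op, so this opens no real browser; concrete
# subclasses are expected to fill it in.
with SyncSession(max_pages=2) as session:
    print(session.get_pool_stats())
    # {'total_pages': 0, 'busy_pages': 0, 'max_pages': 2}
```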