scrapling 0.3.7__tar.gz → 0.3.8__tar.gz
This diff shows the changes between publicly released package versions as they appear in their respective public registries; it is provided for informational purposes only.
- {scrapling-0.3.7/scrapling.egg-info → scrapling-0.3.8}/PKG-INFO +6 -4
- {scrapling-0.3.7 → scrapling-0.3.8}/README.md +2 -1
- {scrapling-0.3.7 → scrapling-0.3.8}/pyproject.toml +5 -4
- {scrapling-0.3.7 → scrapling-0.3.8}/scrapling/__init__.py +1 -1
- {scrapling-0.3.7 → scrapling-0.3.8}/scrapling/engines/_browsers/_base.py +140 -9
- {scrapling-0.3.7 → scrapling-0.3.8}/scrapling/engines/_browsers/_camoufox.py +47 -164
- {scrapling-0.3.7 → scrapling-0.3.8}/scrapling/engines/_browsers/_config_tools.py +8 -2
- {scrapling-0.3.7 → scrapling-0.3.8}/scrapling/engines/_browsers/_controllers.py +25 -96
- {scrapling-0.3.7 → scrapling-0.3.8}/scrapling/engines/_browsers/_validators.py +72 -61
- {scrapling-0.3.7 → scrapling-0.3.8}/scrapling/engines/toolbelt/convertor.py +37 -2
- {scrapling-0.3.7 → scrapling-0.3.8}/scrapling/engines/toolbelt/custom.py +0 -12
- {scrapling-0.3.7 → scrapling-0.3.8}/scrapling/engines/toolbelt/fingerprints.py +6 -8
- {scrapling-0.3.7 → scrapling-0.3.8}/scrapling/fetchers/chrome.py +6 -0
- {scrapling-0.3.7 → scrapling-0.3.8/scrapling.egg-info}/PKG-INFO +6 -4
- {scrapling-0.3.7 → scrapling-0.3.8}/scrapling.egg-info/requires.txt +2 -2
- {scrapling-0.3.7 → scrapling-0.3.8}/setup.cfg +1 -1
- {scrapling-0.3.7 → scrapling-0.3.8}/LICENSE +0 -0
- {scrapling-0.3.7 → scrapling-0.3.8}/MANIFEST.in +0 -0
- {scrapling-0.3.7 → scrapling-0.3.8}/scrapling/cli.py +0 -0
- {scrapling-0.3.7 → scrapling-0.3.8}/scrapling/core/__init__.py +0 -0
- {scrapling-0.3.7 → scrapling-0.3.8}/scrapling/core/_html_utils.py +0 -0
- {scrapling-0.3.7 → scrapling-0.3.8}/scrapling/core/_types.py +0 -0
- {scrapling-0.3.7 → scrapling-0.3.8}/scrapling/core/ai.py +0 -0
- {scrapling-0.3.7 → scrapling-0.3.8}/scrapling/core/custom_types.py +0 -0
- {scrapling-0.3.7 → scrapling-0.3.8}/scrapling/core/mixins.py +0 -0
- {scrapling-0.3.7 → scrapling-0.3.8}/scrapling/core/shell.py +0 -0
- {scrapling-0.3.7 → scrapling-0.3.8}/scrapling/core/storage.py +0 -0
- {scrapling-0.3.7 → scrapling-0.3.8}/scrapling/core/translator.py +0 -0
- {scrapling-0.3.7 → scrapling-0.3.8}/scrapling/core/utils/__init__.py +0 -0
- {scrapling-0.3.7 → scrapling-0.3.8}/scrapling/core/utils/_shell.py +0 -0
- {scrapling-0.3.7 → scrapling-0.3.8}/scrapling/core/utils/_utils.py +0 -0
- {scrapling-0.3.7 → scrapling-0.3.8}/scrapling/engines/__init__.py +0 -0
- {scrapling-0.3.7 → scrapling-0.3.8}/scrapling/engines/_browsers/__init__.py +0 -0
- {scrapling-0.3.7 → scrapling-0.3.8}/scrapling/engines/_browsers/_page.py +0 -0
- {scrapling-0.3.7 → scrapling-0.3.8}/scrapling/engines/constants.py +0 -0
- {scrapling-0.3.7 → scrapling-0.3.8}/scrapling/engines/static.py +0 -0
- {scrapling-0.3.7 → scrapling-0.3.8}/scrapling/engines/toolbelt/__init__.py +0 -0
- {scrapling-0.3.7 → scrapling-0.3.8}/scrapling/engines/toolbelt/bypasses/navigator_plugins.js +0 -0
- {scrapling-0.3.7 → scrapling-0.3.8}/scrapling/engines/toolbelt/bypasses/notification_permission.js +0 -0
- {scrapling-0.3.7 → scrapling-0.3.8}/scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +0 -0
- {scrapling-0.3.7 → scrapling-0.3.8}/scrapling/engines/toolbelt/bypasses/screen_props.js +0 -0
- {scrapling-0.3.7 → scrapling-0.3.8}/scrapling/engines/toolbelt/bypasses/webdriver_fully.js +0 -0
- {scrapling-0.3.7 → scrapling-0.3.8}/scrapling/engines/toolbelt/bypasses/window_chrome.js +0 -0
- {scrapling-0.3.7 → scrapling-0.3.8}/scrapling/engines/toolbelt/navigation.py +0 -0
- {scrapling-0.3.7 → scrapling-0.3.8}/scrapling/fetchers/__init__.py +0 -0
- {scrapling-0.3.7 → scrapling-0.3.8}/scrapling/fetchers/firefox.py +0 -0
- {scrapling-0.3.7 → scrapling-0.3.8}/scrapling/fetchers/requests.py +0 -0
- {scrapling-0.3.7 → scrapling-0.3.8}/scrapling/parser.py +0 -0
- {scrapling-0.3.7 → scrapling-0.3.8}/scrapling/py.typed +0 -0
- {scrapling-0.3.7 → scrapling-0.3.8}/scrapling.egg-info/SOURCES.txt +0 -0
- {scrapling-0.3.7 → scrapling-0.3.8}/scrapling.egg-info/dependency_links.txt +0 -0
- {scrapling-0.3.7 → scrapling-0.3.8}/scrapling.egg-info/entry_points.txt +0 -0
- {scrapling-0.3.7 → scrapling-0.3.8}/scrapling.egg-info/not-zip-safe +0 -0
- {scrapling-0.3.7 → scrapling-0.3.8}/scrapling.egg-info/top_level.txt +0 -0
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: scrapling
-Version: 0.3.7
+Version: 0.3.8
 Summary: Scrapling is an undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy and effortless as it should be!
 Home-page: https://github.com/D4Vinci/Scrapling
 Author: Karim Shoair
@@ -36,6 +36,7 @@ License: BSD 3-Clause License
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 Project-URL: Homepage, https://github.com/D4Vinci/Scrapling
+Project-URL: Changelog, https://github.com/D4Vinci/Scrapling/releases
 Project-URL: Documentation, https://scrapling.readthedocs.io/en/latest/
 Project-URL: Repository, https://github.com/D4Vinci/Scrapling
 Project-URL: Bug Tracker, https://github.com/D4Vinci/Scrapling/issues
@@ -66,7 +67,7 @@ Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: lxml>=6.0.2
 Requires-Dist: cssselect>=1.3.0
-Requires-Dist: orjson>=3.11.
+Requires-Dist: orjson>=3.11.4
 Requires-Dist: tldextract>=5.3.0
 Provides-Extra: fetchers
 Requires-Dist: click>=8.3.0; extra == "fetchers"
@@ -77,7 +78,7 @@ Requires-Dist: camoufox>=0.4.11; extra == "fetchers"
 Requires-Dist: geoip2>=5.1.0; extra == "fetchers"
 Requires-Dist: msgspec>=0.19.0; extra == "fetchers"
 Provides-Extra: ai
-Requires-Dist: mcp>=1.
+Requires-Dist: mcp>=1.19.0; extra == "ai"
 Requires-Dist: markdownify>=1.2.0; extra == "ai"
 Requires-Dist: scrapling[fetchers]; extra == "ai"
 Provides-Extra: shell
@@ -157,10 +158,11 @@ Built for the modern Web, Scrapling features its own rapid parsing engine and fe
 
 <!-- sponsors -->
 
-<a href="https://www.
+<a href="https://www.scrapeless.com/en?utm_source=official&utm_term=scrapling" target="_blank" title="Effortless Web Scraping Toolkit for Business and Developers"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/scrapeless.jpg"></a>
 <a href="https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling" target="_blank" title="Evomi is your Swiss Quality Proxy Provider, starting at $0.49/GB"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/evomi.png"></a>
 <a href="https://visit.decodo.com/Dy6W0b" target="_blank" title="Try the Most Efficient Residential Proxies for Free"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png"></a>
 <a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
+<a href="https://app.cyberyozh.com/?utm_source=github&utm_medium=scrapling" target="_blank" title="We have gathered the best solutions for multi‑accounting and automation in one place."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/cyberyozh.png"></a>
 <a href="https://www.swiftproxy.net/" target="_blank" title="Unlock Reliable Proxy Services with Swiftproxy!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png"></a>
 <a href="https://www.rapidproxy.io/?ref=d4v" target="_blank" title="Affordable Access to the Proxy World – bypass CAPTCHAs blocks, and avoid additional costs."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/rapidproxy.jpg"></a>
 <a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
--- a/README.md
+++ b/README.md
@@ -67,10 +67,11 @@ Built for the modern Web, Scrapling features its own rapid parsing engine and fe
 
 <!-- sponsors -->
 
-<a href="https://www.
+<a href="https://www.scrapeless.com/en?utm_source=official&utm_term=scrapling" target="_blank" title="Effortless Web Scraping Toolkit for Business and Developers"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/scrapeless.jpg"></a>
 <a href="https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling" target="_blank" title="Evomi is your Swiss Quality Proxy Provider, starting at $0.49/GB"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/evomi.png"></a>
 <a href="https://visit.decodo.com/Dy6W0b" target="_blank" title="Try the Most Efficient Residential Proxies for Free"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png"></a>
 <a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
+<a href="https://app.cyberyozh.com/?utm_source=github&utm_medium=scrapling" target="_blank" title="We have gathered the best solutions for multi‑accounting and automation in one place."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/cyberyozh.png"></a>
 <a href="https://www.swiftproxy.net/" target="_blank" title="Unlock Reliable Proxy Services with Swiftproxy!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png"></a>
 <a href="https://www.rapidproxy.io/?ref=d4v" target="_blank" title="Affordable Access to the Proxy World – bypass CAPTCHAs blocks, and avoid additional costs."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/rapidproxy.jpg"></a>
 <a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,8 +4,8 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "scrapling"
-# Static version instead of dynamic version so we can get better layer caching while building docker, check the docker file to understand
-version = "0.3.7"
+# Static version instead of a dynamic version so we can get better layer caching while building docker, check the docker file to understand
+version = "0.3.8"
 description = "Scrapling is an undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy and effortless as it should be!"
 readme = {file = "README.md", content-type = "text/markdown"}
 license = {file = "LICENSE"}
@@ -59,7 +59,7 @@ classifiers = [
 dependencies = [
     "lxml>=6.0.2",
     "cssselect>=1.3.0",
-    "orjson>=3.11.
+    "orjson>=3.11.4",
     "tldextract>=5.3.0",
 ]
 
@@ -74,7 +74,7 @@ fetchers = [
     "msgspec>=0.19.0",
 ]
 ai = [
-    "mcp>=1.
+    "mcp>=1.19.0",
     "markdownify>=1.2.0",
     "scrapling[fetchers]",
 ]
@@ -89,6 +89,7 @@ all = [
 
 [project.urls]
 Homepage = "https://github.com/D4Vinci/Scrapling"
+Changelog = "https://github.com/D4Vinci/Scrapling/releases"
 Documentation = "https://scrapling.readthedocs.io/en/latest/"
 Repository = "https://github.com/D4Vinci/Scrapling"
 "Bug Tracker" = "https://github.com/D4Vinci/Scrapling/issues"
--- a/scrapling/engines/_browsers/_base.py
+++ b/scrapling/engines/_browsers/_base.py
@@ -2,17 +2,27 @@ from time import time
 from asyncio import sleep as asyncio_sleep, Lock
 
 from camoufox import DefaultAddons
-from playwright.sync_api import
+from playwright.sync_api import (
+    Page,
+    Frame,
+    BrowserContext,
+    Playwright,
+    Response as SyncPlaywrightResponse,
+)
 from playwright.async_api import (
-
+    Page as AsyncPage,
+    Frame as AsyncFrame,
     Playwright as AsyncPlaywright,
+    Response as AsyncPlaywrightResponse,
+    BrowserContext as AsyncBrowserContext,
 )
+from playwright._impl._errors import Error as PlaywrightError
 from camoufox.pkgman import installed_verstr as camoufox_version
 from camoufox.utils import launch_options as generate_launch_options
 
 from ._page import PageInfo, PagePool
 from scrapling.parser import Selector
-from scrapling.core._types import Any, cast, Dict, Optional, TYPE_CHECKING
+from scrapling.core._types import Any, cast, Dict, List, Optional, Callable, TYPE_CHECKING
 from scrapling.engines.toolbelt.fingerprints import get_os_name
 from ._validators import validate, PlaywrightConfig, CamoufoxConfig
 from ._config_tools import _compiled_stealth_scripts, _launch_kwargs, _context_kwargs
@@ -26,10 +36,35 @@ class SyncSession:
         self.max_pages = max_pages
         self.page_pool = PagePool(max_pages)
         self._max_wait_for_page = 60
-        self.playwright:
-        self.context:
+        self.playwright: Playwright | Any = None
+        self.context: BrowserContext | Any = None
         self._closed = False
 
+    def __create__(self):
+        pass
+
+    def close(self):  # pragma: no cover
+        """Close all resources"""
+        if self._closed:
+            return
+
+        if self.context:
+            self.context.close()
+            self.context = None
+
+        if self.playwright:
+            self.playwright.stop()
+            self.playwright = None  # pyright: ignore
+
+        self._closed = True
+
+    def __enter__(self):
+        self.__create__()
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.close()
+
     def _get_page(
         self,
         timeout: int | float,
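The `__enter__`/`__exit__` pair added above turns the session into a context manager, so the Playwright instance and browser context are released even when the body raises. A minimal sketch of the pattern in isolation (`DemoSession` is a stand-in class for illustration, not Scrapling's actual internals):

```python
# Sketch of the context-manager protocol the diff adds to SyncSession.
class DemoSession:
    def __init__(self) -> None:
        self._closed = False

    def __create__(self) -> None:
        pass  # the real class starts Playwright and opens a browser context here

    def close(self) -> None:
        if self._closed:
            return  # idempotent: calling close() twice is harmless
        # ...stop the browser context, then Playwright itself...
        self._closed = True

    def __enter__(self) -> "DemoSession":
        self.__create__()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        self.close()  # runs even if the with-block raised


with DemoSession() as session:
    ...  # fetch pages; cleanup happens automatically on exit
```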
scrapling/engines/_browsers/_base.py, continued:

@@ -53,7 +88,9 @@ class SyncSession:
         for script in _compiled_stealth_scripts():
             page.add_init_script(script=script)
 
-
+        page_info = self.page_pool.add_page(page)
+        page_info.mark_busy()
+        return page_info
 
     def get_pool_stats(self) -> Dict[str, int]:
         """Get statistics about the current page pool"""
@@ -63,17 +100,76 @@ class SyncSession:
             "max_pages": self.max_pages,
         }
 
+    @staticmethod
+    def _wait_for_networkidle(page: Page | Frame, timeout: Optional[int] = None):
+        """Wait for the page to become idle (no network activity) even if there are never-ending requests."""
+        try:
+            page.wait_for_load_state("networkidle", timeout=timeout)
+        except PlaywrightError:
+            pass
+
+    def _wait_for_page_stability(self, page: Page | Frame, load_dom: bool, network_idle: bool):
+        page.wait_for_load_state(state="load")
+        if load_dom:
+            page.wait_for_load_state(state="domcontentloaded")
+        if network_idle:
+            self._wait_for_networkidle(page)
+
+    @staticmethod
+    def _create_response_handler(page_info: PageInfo, response_container: List) -> Callable:
+        """Create a response handler that captures the final navigation response.
+
+        :param page_info: The PageInfo object containing the page
+        :param response_container: A list to store the final response (mutable container)
+        :return: A callback function for page.on("response", ...)
+        """
+
+        def handle_response(finished_response: SyncPlaywrightResponse):
+            if (
+                finished_response.request.resource_type == "document"
+                and finished_response.request.is_navigation_request()
+                and finished_response.request.frame == page_info.page.main_frame
+            ):
+                response_container[0] = finished_response
+
+        return handle_response
+
 
 class AsyncSession:
     def __init__(self, max_pages: int = 1):
         self.max_pages = max_pages
         self.page_pool = PagePool(max_pages)
         self._max_wait_for_page = 60
-        self.playwright:
-        self.context:
+        self.playwright: AsyncPlaywright | Any = None
+        self.context: AsyncBrowserContext | Any = None
         self._closed = False
         self._lock = Lock()
 
+    async def __create__(self):
+        pass
+
+    async def close(self):
+        """Close all resources"""
+        if self._closed:  # pragma: no cover
+            return
+
+        if self.context:
+            await self.context.close()
+            self.context = None  # pyright: ignore
+
+        if self.playwright:
+            await self.playwright.stop()
+            self.playwright = None  # pyright: ignore
+
+        self._closed = True
+
+    async def __aenter__(self):
+        await self.__create__()
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        await self.close()
+
     async def _get_page(
         self,
         timeout: int | float,
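`_create_response_handler` exists because Playwright reports responses through event callbacks; writing the final main-frame document response into a caller-supplied one-element list lets the caller read it after navigation without `nonlocal` tricks. A standalone sketch of the same capture pattern built on plain Playwright (the `fetch_final_response` helper is ours, not Scrapling's actual call site):

```python
from playwright.sync_api import sync_playwright, Response


def fetch_final_response(url: str) -> Response | None:
    container: list[Response | None] = [None]  # mutable slot the callback fills

    with sync_playwright() as pw:
        browser = pw.chromium.launch()
        page = browser.new_page()

        def handle_response(response: Response) -> None:
            request = response.request
            # Same filter as the diff: only the main frame's navigation document
            if (
                request.resource_type == "document"
                and request.is_navigation_request()
                and request.frame == page.main_frame
            ):
                container[0] = response

        page.on("response", handle_response)
        page.goto(url)
        browser.close()

    return container[0]
```

Redirect chains are why the filter matters: each hop emits its own response event, and the last write into `container[0]` is the document the page actually ended on.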
scrapling/engines/_browsers/_base.py, continued:

@@ -97,7 +193,6 @@ class AsyncSession:
                 f"No pages finished to clear place in the pool within the {self._max_wait_for_page}s timeout period"
             )
 
-        assert self.context is not None, "Browser context not initialized"
         page = await self.context.new_page()
         page.set_default_navigation_timeout(timeout)
         page.set_default_timeout(timeout)
@@ -121,6 +216,40 @@
             "max_pages": self.max_pages,
         }
 
+    @staticmethod
+    async def _wait_for_networkidle(page: AsyncPage | AsyncFrame, timeout: Optional[int] = None):
+        """Wait for the page to become idle (no network activity) even if there are never-ending requests."""
+        try:
+            await page.wait_for_load_state("networkidle", timeout=timeout)
+        except PlaywrightError:
+            pass
+
+    async def _wait_for_page_stability(self, page: AsyncPage | AsyncFrame, load_dom: bool, network_idle: bool):
+        await page.wait_for_load_state(state="load")
+        if load_dom:
+            await page.wait_for_load_state(state="domcontentloaded")
+        if network_idle:
+            await self._wait_for_networkidle(page)
+
+    @staticmethod
+    def _create_response_handler(page_info: PageInfo, response_container: List) -> Callable:
+        """Create an async response handler that captures the final navigation response.
+
+        :param page_info: The PageInfo object containing the page
+        :param response_container: A list to store the final response (mutable container)
+        :return: A callback function for page.on("response", ...)
+        """
+
+        async def handle_response(finished_response: AsyncPlaywrightResponse):
+            if (
+                finished_response.request.resource_type == "document"
+                and finished_response.request.is_navigation_request()
+                and finished_response.request.frame == page_info.page.main_frame
+            ):
+                response_container[0] = finished_response
+
+        return handle_response
+
 
 class DynamicSessionMixin:
     def __validate__(self, **params):
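Note how both the sync and async `_wait_for_networkidle` swallow Playwright's timeout error: pages with long-polling or streaming requests may never reach `networkidle`, and that should not fail the whole fetch. The idea in isolation (the `settle` helper name and default timeout are our own):

```python
from playwright._impl._errors import Error as PlaywrightError


async def settle(page, timeout_ms: float | None = 3_000) -> None:
    # Tolerate pages that never go network-idle instead of raising.
    try:
        await page.wait_for_load_state("networkidle", timeout=timeout_ms)
    except PlaywrightError:
        pass  # background traffic kept the network busy; proceed anyway
```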
scrapling/engines/_browsers/_base.py, continued:

@@ -147,6 +276,7 @@ class DynamicSessionMixin:
         self.wait_selector = config.wait_selector
         self.init_script = config.init_script
         self.wait_selector_state = config.wait_selector_state
+        self.extra_flags = config.extra_flags
         self.selector_config = config.selector_config
         self.additional_args = config.additional_args
         self.page_action = config.page_action
@@ -171,6 +301,7 @@ class DynamicSessionMixin:
                 self.stealth,
                 self.hide_canvas,
                 self.disable_webgl,
+                tuple(self.extra_flags) if self.extra_flags else tuple(),
             )
         )
         self.launch_options["extra_http_headers"] = dict(self.launch_options["extra_http_headers"])
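Together with the `scrapling/fetchers/chrome.py` change in the file list, this threads a new `extra_flags` option from the fetcher configuration into the browser launch arguments. A hypothetical call, inferred from this diff alone (the parameter placement is an assumption; check the 0.3.8 docs for the real signature):

```python
from scrapling.fetchers import DynamicFetcher

# Hypothetical: forward extra Chromium launch flags via the new option.
page = DynamicFetcher.fetch(
    "https://example.com",
    extra_flags=["--disable-gpu", "--lang=en-US"],
)
```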
|