scrapling 0.3.6__py3-none-any.whl → 0.3.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +1 -1
- scrapling/core/_types.py +3 -0
- scrapling/core/ai.py +2 -1
- scrapling/core/custom_types.py +20 -27
- scrapling/core/mixins.py +15 -9
- scrapling/core/shell.py +4 -3
- scrapling/core/storage.py +5 -5
- scrapling/core/translator.py +13 -8
- scrapling/engines/_browsers/_base.py +37 -14
- scrapling/engines/_browsers/_camoufox.py +76 -35
- scrapling/engines/_browsers/_config_tools.py +1 -1
- scrapling/engines/_browsers/_controllers.py +32 -11
- scrapling/engines/_browsers/_validators.py +31 -10
- scrapling/engines/static.py +678 -668
- scrapling/engines/toolbelt/convertor.py +13 -15
- scrapling/engines/toolbelt/custom.py +6 -9
- scrapling/engines/toolbelt/fingerprints.py +17 -10
- scrapling/engines/toolbelt/navigation.py +11 -3
- scrapling/fetchers/__init__.py +11 -1
- scrapling/fetchers/chrome.py +9 -4
- scrapling/fetchers/firefox.py +0 -4
- scrapling/parser.py +105 -80
- {scrapling-0.3.6.dist-info → scrapling-0.3.7.dist-info}/METADATA +3 -4
- scrapling-0.3.7.dist-info/RECORD +47 -0
- scrapling-0.3.6.dist-info/RECORD +0 -47
- {scrapling-0.3.6.dist-info → scrapling-0.3.7.dist-info}/WHEEL +0 -0
- {scrapling-0.3.6.dist-info → scrapling-0.3.7.dist-info}/entry_points.txt +0 -0
- {scrapling-0.3.6.dist-info → scrapling-0.3.7.dist-info}/licenses/LICENSE +0 -0
- {scrapling-0.3.6.dist-info → scrapling-0.3.7.dist-info}/top_level.txt +0 -0
@@ -1,3 +1,4 @@
|
|
1
|
+
from random import randint
|
1
2
|
from re import compile as re_compile
|
2
3
|
|
3
4
|
from playwright.sync_api import (
|
@@ -20,10 +21,12 @@ from ._validators import validate_fetch as _validate
|
|
20
21
|
from ._base import SyncSession, AsyncSession, StealthySessionMixin
|
21
22
|
from scrapling.core.utils import log
|
22
23
|
from scrapling.core._types import (
|
24
|
+
Any,
|
23
25
|
Dict,
|
24
26
|
List,
|
25
27
|
Optional,
|
26
28
|
Callable,
|
29
|
+
TYPE_CHECKING,
|
27
30
|
SelectorWaitStates,
|
28
31
|
)
|
29
32
|
from scrapling.engines.toolbelt.convertor import (
|
@@ -33,7 +36,7 @@ from scrapling.engines.toolbelt.convertor import (
|
|
33
36
|
from scrapling.engines.toolbelt.fingerprints import generate_convincing_referer
|
34
37
|
|
35
38
|
__CF_PATTERN__ = re_compile("challenges.cloudflare.com/cdn-cgi/challenge-platform/.*")
|
36
|
-
_UNSET = object()
|
39
|
+
_UNSET: Any = object()
|
37
40
|
|
38
41
|
|
39
42
|
class StealthySession(StealthySessionMixin, SyncSession):
|
@@ -101,6 +104,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
|
|
101
104
|
os_randomize: bool = False,
|
102
105
|
disable_ads: bool = False,
|
103
106
|
geoip: bool = False,
|
107
|
+
user_data_dir: str = "",
|
104
108
|
selector_config: Optional[Dict] = None,
|
105
109
|
additional_args: Optional[Dict] = None,
|
106
110
|
):
|
@@ -133,6 +137,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
|
|
133
137
|
:param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
|
134
138
|
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
135
139
|
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
140
|
+
:param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
|
136
141
|
:param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
|
137
142
|
:param additional_args: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
|
138
143
|
"""
|
@@ -156,6 +161,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
|
|
156
161
|
block_images=block_images,
|
157
162
|
block_webrtc=block_webrtc,
|
158
163
|
os_randomize=os_randomize,
|
164
|
+
user_data_dir=user_data_dir,
|
159
165
|
wait_selector=wait_selector,
|
160
166
|
google_search=google_search,
|
161
167
|
extra_headers=extra_headers,
|
@@ -170,9 +176,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
|
|
170
176
|
def __create__(self):
|
171
177
|
"""Create a browser for this instance and context."""
|
172
178
|
self.playwright = sync_playwright().start()
|
173
|
-
self.context = self.playwright.firefox.launch_persistent_context(
|
174
|
-
**self.launch_options
|
175
|
-
)
|
179
|
+
self.context = self.playwright.firefox.launch_persistent_context(**self.launch_options)
|
176
180
|
|
177
181
|
if self.init_script: # pragma: no cover
|
178
182
|
self.context.add_init_script(path=self.init_script)
|
@@ -203,9 +207,9 @@ class StealthySession(StealthySessionMixin, SyncSession):
|
|
203
207
|
self._closed = True
|
204
208
|
|
205
209
|
@staticmethod
|
206
|
-
def _get_page_content(page: Page) -> str
|
210
|
+
def _get_page_content(page: Page) -> str:
|
207
211
|
"""
|
208
|
-
A workaround for Playwright issue with `page.content()` on Windows. Ref.: https://github.com/microsoft/playwright/issues/16108
|
212
|
+
A workaround for the Playwright issue with `page.content()` on Windows. Ref.: https://github.com/microsoft/playwright/issues/16108
|
209
213
|
:param page: The page to extract content from.
|
210
214
|
:return:
|
211
215
|
"""
|
@@ -215,6 +219,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
|
|
215
219
|
except PlaywrightError:
|
216
220
|
page.wait_for_timeout(1000)
|
217
221
|
continue
|
222
|
+
return "" # pyright: ignore
|
218
223
|
|
219
224
|
def _solve_cloudflare(self, page: Page) -> None: # pragma: no cover
|
220
225
|
"""Solve the cloudflare challenge displayed on the playwright page passed
|
@@ -222,6 +227,10 @@ class StealthySession(StealthySessionMixin, SyncSession):
|
|
222
227
|
:param page: The targeted page
|
223
228
|
:return:
|
224
229
|
"""
|
230
|
+
try:
|
231
|
+
page.wait_for_load_state("networkidle", timeout=5000)
|
232
|
+
except PlaywrightError:
|
233
|
+
pass
|
225
234
|
challenge_type = self._detect_cloudflare(self._get_page_content(page))
|
226
235
|
if not challenge_type:
|
227
236
|
log.error("No Cloudflare challenge found.")
|
@@ -244,26 +253,35 @@ class StealthySession(StealthySessionMixin, SyncSession):
|
|
244
253
|
# Waiting for the verify spinner to disappear, checking every 1s if it disappeared
|
245
254
|
page.wait_for_timeout(500)
|
246
255
|
|
256
|
+
outer_box = {}
|
247
257
|
iframe = page.frame(url=__CF_PATTERN__)
|
248
|
-
if iframe is None:
|
249
|
-
|
250
|
-
|
258
|
+
if iframe is not None:
|
259
|
+
iframe.wait_for_load_state(state="domcontentloaded")
|
260
|
+
iframe.wait_for_load_state("networkidle")
|
251
261
|
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
262
|
+
if challenge_type != "embedded":
|
263
|
+
while not iframe.frame_element().is_visible():
|
264
|
+
# Double-checking that the iframe is loaded
|
265
|
+
page.wait_for_timeout(500)
|
266
|
+
outer_box: Any = iframe.frame_element().bounding_box()
|
267
|
+
|
268
|
+
if not iframe or not outer_box:
|
269
|
+
outer_box: Any = page.locator(box_selector).last.bounding_box()
|
256
270
|
|
257
|
-
iframe.wait_for_load_state(state="domcontentloaded")
|
258
|
-
iframe.wait_for_load_state("networkidle")
|
259
271
|
# Calculate the Captcha coordinates for any viewport
|
260
|
-
|
261
|
-
captcha_x, captcha_y = outer_box["x"] + 26, outer_box["y"] + 25
|
272
|
+
captcha_x, captcha_y = outer_box["x"] + randint(26, 28), outer_box["y"] + randint(25, 27)
|
262
273
|
|
263
274
|
# Click the captcha checkbox at the computed coordinates with the left mouse button (60 ms between press and release)
|
264
275
|
page.mouse.click(captcha_x, captcha_y, delay=60, button="left")
|
276
|
+
page.wait_for_load_state("networkidle")
|
277
|
+
if iframe is not None:
|
278
|
+
# Wait for the frame to be removed from the page
|
279
|
+
while iframe in page.frames:
|
280
|
+
page.wait_for_timeout(100)
|
265
281
|
if challenge_type != "embedded":
|
282
|
+
page.locator(box_selector).last.wait_for(state="detached")
|
266
283
|
page.locator(".zone-name-title").wait_for(state="hidden")
|
284
|
+
page.wait_for_load_state(state="load")
|
267
285
|
page.wait_for_load_state(state="domcontentloaded")
|
268
286
|
|
269
287
|
log.info("Cloudflare captcha is solved")
|
@@ -335,6 +353,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
|
|
335
353
|
if (
|
336
354
|
finished_response.request.resource_type == "document"
|
337
355
|
and finished_response.request.is_navigation_request()
|
356
|
+
and finished_response.request.frame == page_info.page.main_frame
|
338
357
|
):
|
339
358
|
final_response = finished_response
|
340
359
|
|
@@ -387,7 +406,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
|
|
387
406
|
page_info.page, first_response, final_response, params.selector_config
|
388
407
|
)
|
389
408
|
|
390
|
-
# Close the page
|
409
|
+
# Close the page to free up resources
|
391
410
|
page_info.page.close()
|
392
411
|
self.page_pool.pages.remove(page_info)
|
393
412
|
|
@@ -427,6 +446,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
|
|
427
446
|
os_randomize: bool = False,
|
428
447
|
disable_ads: bool = False,
|
429
448
|
geoip: bool = False,
|
449
|
+
user_data_dir: str = "",
|
430
450
|
selector_config: Optional[Dict] = None,
|
431
451
|
additional_args: Optional[Dict] = None,
|
432
452
|
):
|
@@ -460,6 +480,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
|
|
460
480
|
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
461
481
|
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
462
482
|
:param max_pages: The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.
|
483
|
+
:param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
|
463
484
|
:param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
|
464
485
|
:param additional_args: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
|
465
486
|
"""
|
@@ -485,6 +506,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
|
|
485
506
|
wait_selector=wait_selector,
|
486
507
|
google_search=google_search,
|
487
508
|
extra_headers=extra_headers,
|
509
|
+
user_data_dir=user_data_dir,
|
488
510
|
additional_args=additional_args,
|
489
511
|
selector_config=selector_config,
|
490
512
|
solve_cloudflare=solve_cloudflare,
|
@@ -504,7 +526,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
|
|
504
526
|
await self.context.add_init_script(path=self.init_script)
|
505
527
|
|
506
528
|
if self.cookies:
|
507
|
-
await self.context.add_cookies(self.cookies)
|
529
|
+
await self.context.add_cookies(self.cookies) # pyright: ignore [reportArgumentType]
|
508
530
|
|
509
531
|
async def __aenter__(self):
|
510
532
|
await self.__create__()
|
@@ -520,18 +542,18 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
|
|
520
542
|
|
521
543
|
if self.context:
|
522
544
|
await self.context.close()
|
523
|
-
self.context = None
|
545
|
+
self.context = None # pyright: ignore
|
524
546
|
|
525
547
|
if self.playwright:
|
526
548
|
await self.playwright.stop()
|
527
|
-
self.playwright = None
|
549
|
+
self.playwright = None # pyright: ignore
|
528
550
|
|
529
551
|
self._closed = True
|
530
552
|
|
531
553
|
@staticmethod
|
532
|
-
async def _get_page_content(page: async_Page) -> str
|
554
|
+
async def _get_page_content(page: async_Page) -> str:
|
533
555
|
"""
|
534
|
-
A workaround for Playwright issue with `page.content()` on Windows. Ref.: https://github.com/microsoft/playwright/issues/16108
|
556
|
+
A workaround for the Playwright issue with `page.content()` on Windows. Ref.: https://github.com/microsoft/playwright/issues/16108
|
535
557
|
:param page: The page to extract content from.
|
536
558
|
:return:
|
537
559
|
"""
|
@@ -541,6 +563,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
|
|
541
563
|
except PlaywrightError:
|
542
564
|
await page.wait_for_timeout(1000)
|
543
565
|
continue
|
566
|
+
return "" # pyright: ignore
|
544
567
|
|
545
568
|
async def _solve_cloudflare(self, page: async_Page):
|
546
569
|
"""Solve the cloudflare challenge displayed on the playwright page passed. The async version
|
@@ -548,6 +571,10 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
|
|
548
571
|
:param page: The async targeted page
|
549
572
|
:return:
|
550
573
|
"""
|
574
|
+
try:
|
575
|
+
await page.wait_for_load_state("networkidle", timeout=5000)
|
576
|
+
except PlaywrightError:
|
577
|
+
pass
|
551
578
|
challenge_type = self._detect_cloudflare(await self._get_page_content(page))
|
552
579
|
if not challenge_type:
|
553
580
|
log.error("No Cloudflare challenge found.")
|
@@ -570,26 +597,35 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
|
|
570
597
|
# Waiting for the verify spinner to disappear, checking every 1s if it disappeared
|
571
598
|
await page.wait_for_timeout(500)
|
572
599
|
|
600
|
+
outer_box = {}
|
573
601
|
iframe = page.frame(url=__CF_PATTERN__)
|
574
|
-
if iframe is None:
|
575
|
-
|
576
|
-
|
602
|
+
if iframe is not None:
|
603
|
+
await iframe.wait_for_load_state(state="domcontentloaded")
|
604
|
+
await iframe.wait_for_load_state("networkidle")
|
577
605
|
|
578
|
-
|
579
|
-
|
580
|
-
|
581
|
-
|
606
|
+
if challenge_type != "embedded":
|
607
|
+
while not await (await iframe.frame_element()).is_visible():
|
608
|
+
# Double-checking that the iframe is loaded
|
609
|
+
await page.wait_for_timeout(500)
|
610
|
+
outer_box: Any = await (await iframe.frame_element()).bounding_box()
|
611
|
+
|
612
|
+
if not iframe or not outer_box:
|
613
|
+
outer_box: Any = await page.locator(box_selector).last.bounding_box()
|
582
614
|
|
583
|
-
await iframe.wait_for_load_state(state="domcontentloaded")
|
584
|
-
await iframe.wait_for_load_state("networkidle")
|
585
615
|
# Calculate the Captcha coordinates for any viewport
|
586
|
-
|
587
|
-
captcha_x, captcha_y = outer_box["x"] + 26, outer_box["y"] + 25
|
616
|
+
captcha_x, captcha_y = outer_box["x"] + randint(26, 28), outer_box["y"] + randint(25, 27)
|
588
617
|
|
589
618
|
# Click the captcha checkbox at the computed coordinates with the left mouse button (60 ms between press and release)
|
590
619
|
await page.mouse.click(captcha_x, captcha_y, delay=60, button="left")
|
620
|
+
await page.wait_for_load_state("networkidle")
|
621
|
+
if iframe is not None:
|
622
|
+
# Wait for the frame to be removed from the page
|
623
|
+
while iframe in page.frames:
|
624
|
+
await page.wait_for_timeout(100)
|
591
625
|
if challenge_type != "embedded":
|
626
|
+
await page.locator(box_selector).wait_for(state="detached")
|
592
627
|
await page.locator(".zone-name-title").wait_for(state="hidden")
|
628
|
+
await page.wait_for_load_state(state="load")
|
593
629
|
await page.wait_for_load_state(state="domcontentloaded")
|
594
630
|
|
595
631
|
log.info("Cloudflare captcha is solved")
|
@@ -661,12 +697,17 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
|
|
661
697
|
if (
|
662
698
|
finished_response.request.resource_type == "document"
|
663
699
|
and finished_response.request.is_navigation_request()
|
700
|
+
and finished_response.request.frame == page_info.page.main_frame
|
664
701
|
):
|
665
702
|
final_response = finished_response
|
666
703
|
|
667
704
|
page_info = await self._get_page(params.timeout, params.extra_headers, params.disable_resources)
|
668
705
|
page_info.mark_busy(url=url)
|
669
706
|
|
707
|
+
if TYPE_CHECKING:
|
708
|
+
if not isinstance(page_info.page, async_Page):
|
709
|
+
raise TypeError
|
710
|
+
|
670
711
|
try:
|
671
712
|
# Navigate to URL and wait for a specified state
|
672
713
|
page_info.page.on("response", handle_response)
|
@@ -715,7 +756,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
|
|
715
756
|
page_info.page, first_response, final_response, params.selector_config
|
716
757
|
)
|
717
758
|
|
718
|
-
# Close the page
|
759
|
+
# Close the page to free up resources
|
719
760
|
await page_info.page.close()
|
720
761
|
self.page_pool.pages.remove(page_info)
|
721
762
|
|
@@ -10,6 +10,7 @@ from playwright.async_api import (
|
|
10
10
|
BrowserContext as AsyncBrowserContext,
|
11
11
|
Playwright as AsyncPlaywright,
|
12
12
|
Locator as AsyncLocator,
|
13
|
+
Page as async_Page,
|
13
14
|
)
|
14
15
|
from patchright.sync_api import sync_playwright as sync_patchright
|
15
16
|
from patchright.async_api import async_playwright as async_patchright
|
@@ -18,10 +19,12 @@ from scrapling.core.utils import log
|
|
18
19
|
from ._base import SyncSession, AsyncSession, DynamicSessionMixin
|
19
20
|
from ._validators import validate_fetch as _validate
|
20
21
|
from scrapling.core._types import (
|
22
|
+
Any,
|
21
23
|
Dict,
|
22
24
|
List,
|
23
25
|
Optional,
|
24
26
|
Callable,
|
27
|
+
TYPE_CHECKING,
|
25
28
|
SelectorWaitStates,
|
26
29
|
)
|
27
30
|
from scrapling.engines.toolbelt.convertor import (
|
@@ -30,7 +33,7 @@ from scrapling.engines.toolbelt.convertor import (
|
|
30
33
|
)
|
31
34
|
from scrapling.engines.toolbelt.fingerprints import generate_convincing_referer
|
32
35
|
|
33
|
-
_UNSET = object()
|
36
|
+
_UNSET: Any = object()
|
34
37
|
|
35
38
|
|
36
39
|
class DynamicSession(DynamicSessionMixin, SyncSession):
|
@@ -94,7 +97,9 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
|
|
94
97
|
network_idle: bool = False,
|
95
98
|
load_dom: bool = True,
|
96
99
|
wait_selector_state: SelectorWaitStates = "attached",
|
100
|
+
user_data_dir: str = "",
|
97
101
|
selector_config: Optional[Dict] = None,
|
102
|
+
additional_args: Optional[Dict] = None,
|
98
103
|
):
|
99
104
|
"""A browser session manager with page pooling. It uses a persistent browser context by default, with a temporary user profile directory.
|
100
105
|
|
@@ -121,7 +126,9 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
|
|
121
126
|
:param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
|
122
127
|
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
123
128
|
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
129
|
+
:param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
|
124
130
|
:param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
|
131
|
+
:param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
|
125
132
|
"""
|
126
133
|
self.__validate__(
|
127
134
|
wait=wait,
|
@@ -140,11 +147,13 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
|
|
140
147
|
hide_canvas=hide_canvas,
|
141
148
|
init_script=init_script,
|
142
149
|
network_idle=network_idle,
|
150
|
+
user_data_dir=user_data_dir,
|
143
151
|
google_search=google_search,
|
144
152
|
extra_headers=extra_headers,
|
145
153
|
wait_selector=wait_selector,
|
146
154
|
disable_webgl=disable_webgl,
|
147
155
|
selector_config=selector_config,
|
156
|
+
additional_args=additional_args,
|
148
157
|
disable_resources=disable_resources,
|
149
158
|
wait_selector_state=wait_selector_state,
|
150
159
|
)
|
@@ -154,14 +163,14 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
|
|
154
163
|
"""Create a browser for this instance and context."""
|
155
164
|
sync_context = sync_patchright if self.stealth else sync_playwright
|
156
165
|
|
157
|
-
self.playwright: Playwright = sync_context().start()
|
166
|
+
self.playwright: Playwright = sync_context().start() # pyright: ignore [reportAttributeAccessIssue]
|
158
167
|
|
159
168
|
if self.cdp_url: # pragma: no cover
|
160
169
|
self.context = self.playwright.chromium.connect_over_cdp(endpoint_url=self.cdp_url).new_context(
|
161
170
|
**self.context_options
|
162
171
|
)
|
163
172
|
else:
|
164
|
-
self.context = self.playwright.chromium.launch_persistent_context(
|
173
|
+
self.context = self.playwright.chromium.launch_persistent_context(**self.launch_options)
|
165
174
|
|
166
175
|
if self.init_script: # pragma: no cover
|
167
176
|
self.context.add_init_script(path=self.init_script)
|
@@ -187,7 +196,7 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
|
|
187
196
|
|
188
197
|
if self.playwright:
|
189
198
|
self.playwright.stop()
|
190
|
-
self.playwright = None
|
199
|
+
self.playwright = None # pyright: ignore
|
191
200
|
|
192
201
|
self._closed = True
|
193
202
|
|
@@ -254,6 +263,7 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
|
|
254
263
|
if (
|
255
264
|
finished_response.request.resource_type == "document"
|
256
265
|
and finished_response.request.is_navigation_request()
|
266
|
+
and finished_response.request.frame == page_info.page.main_frame
|
257
267
|
):
|
258
268
|
final_response = finished_response
|
259
269
|
|
@@ -299,7 +309,7 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
|
|
299
309
|
page_info.page, first_response, final_response, params.selector_config
|
300
310
|
)
|
301
311
|
|
302
|
-
# Close the page
|
312
|
+
# Close the page to free up resources
|
303
313
|
page_info.page.close()
|
304
314
|
self.page_pool.pages.remove(page_info)
|
305
315
|
|
@@ -337,7 +347,9 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
|
|
337
347
|
network_idle: bool = False,
|
338
348
|
load_dom: bool = True,
|
339
349
|
wait_selector_state: SelectorWaitStates = "attached",
|
350
|
+
user_data_dir: str = "",
|
340
351
|
selector_config: Optional[Dict] = None,
|
352
|
+
additional_args: Optional[Dict] = None,
|
341
353
|
):
|
342
354
|
"""A Browser session manager with page pooling
|
343
355
|
|
@@ -365,7 +377,9 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
|
|
365
377
|
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
366
378
|
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
367
379
|
:param max_pages: The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.
|
380
|
+
:param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
|
368
381
|
:param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
|
382
|
+
:param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
|
369
383
|
"""
|
370
384
|
|
371
385
|
self.__validate__(
|
@@ -385,11 +399,13 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
|
|
385
399
|
hide_canvas=hide_canvas,
|
386
400
|
init_script=init_script,
|
387
401
|
network_idle=network_idle,
|
402
|
+
user_data_dir=user_data_dir,
|
388
403
|
google_search=google_search,
|
389
404
|
extra_headers=extra_headers,
|
390
405
|
wait_selector=wait_selector,
|
391
406
|
disable_webgl=disable_webgl,
|
392
407
|
selector_config=selector_config,
|
408
|
+
additional_args=additional_args,
|
393
409
|
disable_resources=disable_resources,
|
394
410
|
wait_selector_state=wait_selector_state,
|
395
411
|
)
|
@@ -399,21 +415,21 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
|
|
399
415
|
"""Create a browser for this instance and context."""
|
400
416
|
async_context = async_patchright if self.stealth else async_playwright
|
401
417
|
|
402
|
-
self.playwright: AsyncPlaywright = await async_context().start()
|
418
|
+
self.playwright: AsyncPlaywright = await async_context().start() # pyright: ignore [reportAttributeAccessIssue]
|
403
419
|
|
404
420
|
if self.cdp_url:
|
405
421
|
browser = await self.playwright.chromium.connect_over_cdp(endpoint_url=self.cdp_url)
|
406
422
|
self.context: AsyncBrowserContext = await browser.new_context(**self.context_options)
|
407
423
|
else:
|
408
424
|
self.context: AsyncBrowserContext = await self.playwright.chromium.launch_persistent_context(
|
409
|
-
|
425
|
+
**self.launch_options
|
410
426
|
)
|
411
427
|
|
412
428
|
if self.init_script: # pragma: no cover
|
413
429
|
await self.context.add_init_script(path=self.init_script)
|
414
430
|
|
415
431
|
if self.cookies:
|
416
|
-
await self.context.add_cookies(self.cookies)
|
432
|
+
await self.context.add_cookies(self.cookies) # pyright: ignore
|
417
433
|
|
418
434
|
async def __aenter__(self):
|
419
435
|
await self.__create__()
|
@@ -429,11 +445,11 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
|
|
429
445
|
|
430
446
|
if self.context:
|
431
447
|
await self.context.close()
|
432
|
-
self.context = None
|
448
|
+
self.context = None # pyright: ignore
|
433
449
|
|
434
450
|
if self.playwright:
|
435
451
|
await self.playwright.stop()
|
436
|
-
self.playwright = None
|
452
|
+
self.playwright = None # pyright: ignore
|
437
453
|
|
438
454
|
self._closed = True
|
439
455
|
|
@@ -500,12 +516,17 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
|
|
500
516
|
if (
|
501
517
|
finished_response.request.resource_type == "document"
|
502
518
|
and finished_response.request.is_navigation_request()
|
519
|
+
and finished_response.request.frame == page_info.page.main_frame
|
503
520
|
):
|
504
521
|
final_response = finished_response
|
505
522
|
|
506
523
|
page_info = await self._get_page(params.timeout, params.extra_headers, params.disable_resources)
|
507
524
|
page_info.mark_busy(url=url)
|
508
525
|
|
526
|
+
if TYPE_CHECKING:
|
527
|
+
if not isinstance(page_info.page, async_Page):
|
528
|
+
raise TypeError
|
529
|
+
|
509
530
|
try:
|
510
531
|
# Navigate to URL and wait for a specified state
|
511
532
|
page_info.page.on("response", handle_response)
|
@@ -545,7 +566,7 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
|
|
545
566
|
page_info.page, first_response, final_response, params.selector_config
|
546
567
|
)
|
547
568
|
|
548
|
-
# Close the page
|
569
|
+
# Close the page to free up resources
|
549
570
|
await page_info.page.close()
|
550
571
|
self.page_pool.pages.remove(page_info)
|
551
572
|
return response
|
@@ -11,7 +11,9 @@ from scrapling.core._types import (
|
|
11
11
|
Tuple,
|
12
12
|
Optional,
|
13
13
|
Callable,
|
14
|
+
Iterable,
|
14
15
|
SelectorWaitStates,
|
16
|
+
overload,
|
15
17
|
)
|
16
18
|
from scrapling.engines.toolbelt.navigation import construct_proxy_dict
|
17
19
|
|
@@ -73,7 +75,7 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False):
|
|
73
75
|
stealth: bool = False
|
74
76
|
wait: Seconds = 0
|
75
77
|
page_action: Optional[Callable] = None
|
76
|
-
proxy: Optional[str | Dict[str, str]] = None # The default value for proxy in Playwright's source is `None`
|
78
|
+
proxy: Optional[str | Dict[str, str] | Tuple] = None # The default value for proxy in Playwright's source is `None`
|
77
79
|
locale: str = "en-US"
|
78
80
|
extra_headers: Optional[Dict[str, str]] = None
|
79
81
|
useragent: Optional[str] = None
|
@@ -81,11 +83,13 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False):
|
|
81
83
|
init_script: Optional[str] = None
|
82
84
|
disable_resources: bool = False
|
83
85
|
wait_selector: Optional[str] = None
|
84
|
-
cookies: Optional[
|
86
|
+
cookies: Optional[Iterable[Dict]] = None
|
85
87
|
network_idle: bool = False
|
86
88
|
load_dom: bool = True
|
87
89
|
wait_selector_state: SelectorWaitStates = "attached"
|
88
|
-
|
90
|
+
user_data_dir: str = ""
|
91
|
+
selector_config: Optional[Dict] = {}
|
92
|
+
additional_args: Optional[Dict] = {}
|
89
93
|
|
90
94
|
def __post_init__(self):
|
91
95
|
"""Custom validation after msgspec validation"""
|
@@ -100,6 +104,8 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False):
|
|
100
104
|
self.cookies = []
|
101
105
|
if not self.selector_config:
|
102
106
|
self.selector_config = {}
|
107
|
+
if not self.additional_args:
|
108
|
+
self.additional_args = {}
|
103
109
|
|
104
110
|
if self.init_script is not None:
|
105
111
|
_validate_file_path(self.init_script)
|
@@ -125,15 +131,16 @@ class CamoufoxConfig(Struct, kw_only=True, frozen=False):
|
|
125
131
|
wait_selector: Optional[str] = None
|
126
132
|
addons: Optional[List[str]] = None
|
127
133
|
wait_selector_state: SelectorWaitStates = "attached"
|
128
|
-
cookies: Optional[
|
134
|
+
cookies: Optional[Iterable[Dict]] = None
|
129
135
|
google_search: bool = True
|
130
136
|
extra_headers: Optional[Dict[str, str]] = None
|
131
|
-
proxy: Optional[str | Dict[str, str]] = None # The default value for proxy in Playwright's source is `None`
|
137
|
+
proxy: Optional[str | Dict[str, str] | Tuple] = None # The default value for proxy in Playwright's source is `None`
|
132
138
|
os_randomize: bool = False
|
133
139
|
disable_ads: bool = False
|
134
140
|
geoip: bool = False
|
135
|
-
|
136
|
-
|
141
|
+
user_data_dir: str = ""
|
142
|
+
selector_config: Optional[Dict] = {}
|
143
|
+
additional_args: Optional[Dict] = {}
|
137
144
|
|
138
145
|
def __post_init__(self):
|
139
146
|
"""Custom validation after msgspec validation"""
|
@@ -177,7 +184,7 @@ class FetchConfig(Struct, kw_only=True):
|
|
177
184
|
network_idle: bool = False
|
178
185
|
load_dom: bool = True
|
179
186
|
solve_cloudflare: bool = False
|
180
|
-
selector_config:
|
187
|
+
selector_config: Dict = {}
|
181
188
|
|
182
189
|
def to_dict(self):
|
183
190
|
return {f: getattr(self, f) for f in self.__struct_fields__}
|
@@ -198,7 +205,7 @@ class _fetch_params:
|
|
198
205
|
network_idle: bool
|
199
206
|
load_dom: bool
|
200
207
|
solve_cloudflare: bool
|
201
|
-
selector_config:
|
208
|
+
selector_config: Dict
|
202
209
|
|
203
210
|
|
204
211
|
def validate_fetch(params: List[Tuple], sentinel=None) -> _fetch_params:
|
@@ -222,7 +229,21 @@ def validate_fetch(params: List[Tuple], sentinel=None) -> _fetch_params:
|
|
222
229
|
return _fetch_params(**result)
|
223
230
|
|
224
231
|
|
225
|
-
|
232
|
+
@overload
|
233
|
+
def validate(params: Dict, model: type[PlaywrightConfig]) -> PlaywrightConfig: ...
|
234
|
+
|
235
|
+
|
236
|
+
@overload
|
237
|
+
def validate(params: Dict, model: type[CamoufoxConfig]) -> CamoufoxConfig: ...
|
238
|
+
|
239
|
+
|
240
|
+
@overload
|
241
|
+
def validate(params: Dict, model: type[FetchConfig]) -> FetchConfig: ...
|
242
|
+
|
243
|
+
|
244
|
+
def validate(
|
245
|
+
params: Dict, model: type[PlaywrightConfig] | type[CamoufoxConfig] | type[FetchConfig]
|
246
|
+
) -> PlaywrightConfig | CamoufoxConfig | FetchConfig:
|
226
247
|
try:
|
227
248
|
return convert(params, model)
|
228
249
|
except ValidationError as e:
|