scrapling-0.3.2.tar.gz → scrapling-0.3.4.tar.gz
This diff shows the changes between two publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
- {scrapling-0.3.2/scrapling.egg-info → scrapling-0.3.4}/PKG-INFO +10 -9
- {scrapling-0.3.2 → scrapling-0.3.4}/README.md +9 -8
- {scrapling-0.3.2 → scrapling-0.3.4}/scrapling/__init__.py +1 -1
- {scrapling-0.3.2 → scrapling-0.3.4}/scrapling/core/shell.py +19 -4
- {scrapling-0.3.2 → scrapling-0.3.4}/scrapling/engines/_browsers/_base.py +5 -5
- {scrapling-0.3.2 → scrapling-0.3.4}/scrapling/engines/_browsers/_camoufox.py +35 -14
- {scrapling-0.3.2 → scrapling-0.3.4}/scrapling/engines/_browsers/_controllers.py +0 -8
- {scrapling-0.3.2 → scrapling-0.3.4}/scrapling/parser.py +13 -9
- {scrapling-0.3.2 → scrapling-0.3.4/scrapling.egg-info}/PKG-INFO +10 -9
- {scrapling-0.3.2 → scrapling-0.3.4}/setup.cfg +1 -1
- {scrapling-0.3.2 → scrapling-0.3.4}/LICENSE +0 -0
- {scrapling-0.3.2 → scrapling-0.3.4}/MANIFEST.in +0 -0
- {scrapling-0.3.2 → scrapling-0.3.4}/pyproject.toml +0 -0
- {scrapling-0.3.2 → scrapling-0.3.4}/scrapling/cli.py +0 -0
- {scrapling-0.3.2 → scrapling-0.3.4}/scrapling/core/__init__.py +0 -0
- {scrapling-0.3.2 → scrapling-0.3.4}/scrapling/core/_html_utils.py +0 -0
- {scrapling-0.3.2 → scrapling-0.3.4}/scrapling/core/_types.py +0 -0
- {scrapling-0.3.2 → scrapling-0.3.4}/scrapling/core/ai.py +0 -0
- {scrapling-0.3.2 → scrapling-0.3.4}/scrapling/core/custom_types.py +0 -0
- {scrapling-0.3.2 → scrapling-0.3.4}/scrapling/core/mixins.py +0 -0
- {scrapling-0.3.2 → scrapling-0.3.4}/scrapling/core/storage.py +0 -0
- {scrapling-0.3.2 → scrapling-0.3.4}/scrapling/core/translator.py +0 -0
- {scrapling-0.3.2 → scrapling-0.3.4}/scrapling/core/utils/__init__.py +0 -0
- {scrapling-0.3.2 → scrapling-0.3.4}/scrapling/core/utils/_shell.py +0 -0
- {scrapling-0.3.2 → scrapling-0.3.4}/scrapling/core/utils/_utils.py +0 -0
- {scrapling-0.3.2 → scrapling-0.3.4}/scrapling/engines/__init__.py +0 -0
- {scrapling-0.3.2 → scrapling-0.3.4}/scrapling/engines/_browsers/__init__.py +0 -0
- {scrapling-0.3.2 → scrapling-0.3.4}/scrapling/engines/_browsers/_config_tools.py +0 -0
- {scrapling-0.3.2 → scrapling-0.3.4}/scrapling/engines/_browsers/_page.py +0 -0
- {scrapling-0.3.2 → scrapling-0.3.4}/scrapling/engines/_browsers/_validators.py +0 -0
- {scrapling-0.3.2 → scrapling-0.3.4}/scrapling/engines/constants.py +0 -0
- {scrapling-0.3.2 → scrapling-0.3.4}/scrapling/engines/static.py +0 -0
- {scrapling-0.3.2 → scrapling-0.3.4}/scrapling/engines/toolbelt/__init__.py +0 -0
- {scrapling-0.3.2 → scrapling-0.3.4}/scrapling/engines/toolbelt/bypasses/navigator_plugins.js +0 -0
- {scrapling-0.3.2 → scrapling-0.3.4}/scrapling/engines/toolbelt/bypasses/notification_permission.js +0 -0
- {scrapling-0.3.2 → scrapling-0.3.4}/scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +0 -0
- {scrapling-0.3.2 → scrapling-0.3.4}/scrapling/engines/toolbelt/bypasses/screen_props.js +0 -0
- {scrapling-0.3.2 → scrapling-0.3.4}/scrapling/engines/toolbelt/bypasses/webdriver_fully.js +0 -0
- {scrapling-0.3.2 → scrapling-0.3.4}/scrapling/engines/toolbelt/bypasses/window_chrome.js +0 -0
- {scrapling-0.3.2 → scrapling-0.3.4}/scrapling/engines/toolbelt/convertor.py +0 -0
- {scrapling-0.3.2 → scrapling-0.3.4}/scrapling/engines/toolbelt/custom.py +0 -0
- {scrapling-0.3.2 → scrapling-0.3.4}/scrapling/engines/toolbelt/fingerprints.py +0 -0
- {scrapling-0.3.2 → scrapling-0.3.4}/scrapling/engines/toolbelt/navigation.py +0 -0
- {scrapling-0.3.2 → scrapling-0.3.4}/scrapling/fetchers.py +0 -0
- {scrapling-0.3.2 → scrapling-0.3.4}/scrapling/py.typed +0 -0
- {scrapling-0.3.2 → scrapling-0.3.4}/scrapling.egg-info/SOURCES.txt +0 -0
- {scrapling-0.3.2 → scrapling-0.3.4}/scrapling.egg-info/dependency_links.txt +0 -0
- {scrapling-0.3.2 → scrapling-0.3.4}/scrapling.egg-info/entry_points.txt +0 -0
- {scrapling-0.3.2 → scrapling-0.3.4}/scrapling.egg-info/not-zip-safe +0 -0
- {scrapling-0.3.2 → scrapling-0.3.4}/scrapling.egg-info/requires.txt +0 -0
- {scrapling-0.3.2 → scrapling-0.3.4}/scrapling.egg-info/top_level.txt +0 -0
{scrapling-0.3.2/scrapling.egg-info → scrapling-0.3.4}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: scrapling
-Version: 0.3.2
+Version: 0.3.4
 Summary: Scrapling is an undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy and effortless as it should be!
 Home-page: https://github.com/D4Vinci/Scrapling
 Author: Karim Shoair

@@ -114,14 +114,6 @@ Dynamic: license-file
 </p>
 
 <p align="center">
-    <a href="https://scrapling.readthedocs.io/en/latest/#installation">
-        Installation
-    </a>
-    ·
-    <a href="https://scrapling.readthedocs.io/en/latest/overview/">
-        Overview
-    </a>
-    ·
     <a href="https://scrapling.readthedocs.io/en/latest/parsing/selection/">
         Selection methods
     </a>

@@ -130,6 +122,14 @@ Dynamic: license-file
         Choosing a fetcher
     </a>
     ·
+    <a href="https://scrapling.readthedocs.io/en/latest/cli/overview/">
+        CLI
+    </a>
+    ·
+    <a href="https://scrapling.readthedocs.io/en/latest/ai/mcp-server/">
+        MCP mode
+    </a>
+    ·
     <a href="https://scrapling.readthedocs.io/en/latest/tutorials/migrating_from_beautifulsoup/">
         Migrating from Beautifulsoup
     </a>

@@ -159,6 +159,7 @@ Built for the modern Web, Scrapling has its own rapid parsing engine and its fet
 
 <a href="https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling" target="_blank" title="Evomi is your Swiss Quality Proxy Provider, starting at $0.49/GB"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/evomi.png"></a>
 <a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
+<a href="https://visit.decodo.com/Dy6W0b" target="_blank" title="Try the Most Efficient Residential Proxies for Free"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png"></a>
 <a href="https://www.swiftproxy.net/" target="_blank" title="Unlock Reliable Proxy Services with Swiftproxy!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png"></a>
 <a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
 <a href="https://www.nstproxy.com/?type=flow&utm_source=scrapling" target="_blank" title="One Proxy Service, Infinite Solutions at Unbeatable Prices!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/NSTproxy.png"></a>
{scrapling-0.3.2 → scrapling-0.3.4}/README.md

@@ -24,14 +24,6 @@
 </p>
 
 <p align="center">
-    <a href="https://scrapling.readthedocs.io/en/latest/#installation">
-        Installation
-    </a>
-    ·
-    <a href="https://scrapling.readthedocs.io/en/latest/overview/">
-        Overview
-    </a>
-    ·
     <a href="https://scrapling.readthedocs.io/en/latest/parsing/selection/">
         Selection methods
     </a>

@@ -40,6 +32,14 @@
         Choosing a fetcher
     </a>
     ·
+    <a href="https://scrapling.readthedocs.io/en/latest/cli/overview/">
+        CLI
+    </a>
+    ·
+    <a href="https://scrapling.readthedocs.io/en/latest/ai/mcp-server/">
+        MCP mode
+    </a>
+    ·
     <a href="https://scrapling.readthedocs.io/en/latest/tutorials/migrating_from_beautifulsoup/">
         Migrating from Beautifulsoup
     </a>

@@ -69,6 +69,7 @@ Built for the modern Web, Scrapling has its own rapid parsing engine and its fet
 
 <a href="https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling" target="_blank" title="Evomi is your Swiss Quality Proxy Provider, starting at $0.49/GB"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/evomi.png"></a>
 <a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
+<a href="https://visit.decodo.com/Dy6W0b" target="_blank" title="Try the Most Efficient Residential Proxies for Free"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png"></a>
 <a href="https://www.swiftproxy.net/" target="_blank" title="Unlock Reliable Proxy Services with Swiftproxy!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png"></a>
 <a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
 <a href="https://www.nstproxy.com/?type=flow&utm_source=scrapling" target="_blank" title="One Proxy Service, Infinite Solutions at Unbeatable Prices!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/NSTproxy.png"></a>
{scrapling-0.3.2 → scrapling-0.3.4}/scrapling/core/shell.py

@@ -317,7 +317,7 @@ def show_page_in_browser(page: Selector):  # pragma: no cover
 
     try:
         fd, fname = make_temp_file(prefix="scrapling_view_", suffix=".html")
-        with open(fd, "
+        with open(fd, "wb") as f:
             f.write(page.body)
 
         open_in_browser(f"file://{fname}")
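The fix above switches the temp file to binary mode because `page.body` is bytes. A minimal, self-contained sketch of the same pattern using only the standard library (Scrapling's `make_temp_file` and `open_in_browser` helpers are swapped for their stdlib equivalents here, so the helper name is illustrative):

```python
import tempfile
import webbrowser

def show_bytes_in_browser(body: bytes) -> None:
    # mkstemp returns an OS-level fd plus the path; opening the fd in
    # binary mode mirrors the "wb" fix above, since the page body is bytes
    fd, fname = tempfile.mkstemp(prefix="scrapling_view_", suffix=".html")
    with open(fd, "wb") as f:
        f.write(body)
    webbrowser.open(f"file://{fname}")

show_bytes_in_browser(b"<html><body><h1>Hello</h1></body></html>")
```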
@@ -335,15 +335,25 @@ class CustomShell:
         from scrapling.fetchers import (
             Fetcher as __Fetcher,
             AsyncFetcher as __AsyncFetcher,
+            FetcherSession as __FetcherSession,
             DynamicFetcher as __DynamicFetcher,
+            DynamicSession as __DynamicSession,
+            AsyncDynamicSession as __AsyncDynamicSession,
             StealthyFetcher as __StealthyFetcher,
+            StealthySession as __StealthySession,
+            AsyncStealthySession as __AsyncStealthySession,
         )
 
         self.__InteractiveShellEmbed = __InteractiveShellEmbed
         self.__Fetcher = __Fetcher
         self.__AsyncFetcher = __AsyncFetcher
+        self.__FetcherSession = __FetcherSession
         self.__DynamicFetcher = __DynamicFetcher
+        self.__DynamicSession = __DynamicSession
+        self.__AsyncDynamicSession = __AsyncDynamicSession
         self.__StealthyFetcher = __StealthyFetcher
+        self.__StealthySession = __StealthySession
+        self.__AsyncStealthySession = __AsyncStealthySession
         self.code = code
         self.page = None
         self.pages = Selectors([])

@@ -379,9 +389,9 @@ class CustomShell:
         """Create a custom banner for the shell"""
         return f"""
 -> Available Scrapling objects:
-   - Fetcher/AsyncFetcher
-   - DynamicFetcher
-   - StealthyFetcher
+   - Fetcher/AsyncFetcher/FetcherSession
+   - DynamicFetcher/DynamicSession/AsyncDynamicSession
+   - StealthyFetcher/StealthySession/AsyncStealthySession
    - Selector
 
 -> Useful shortcuts:

@@ -449,6 +459,11 @@ Type 'exit' or press Ctrl+D to exit.
             "delete": delete,
             "Fetcher": self.__Fetcher,
             "AsyncFetcher": self.__AsyncFetcher,
+            "FetcherSession": self.__FetcherSession,
+            "DynamicSession": self.__DynamicSession,
+            "AsyncDynamicSession": self.__AsyncDynamicSession,
+            "StealthySession": self.__StealthySession,
+            "AsyncStealthySession": self.__AsyncStealthySession,
             "fetch": dynamic_fetch,
             "DynamicFetcher": self.__DynamicFetcher,
             "stealthy_fetch": stealthy_fetch,
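The shell changes above expose Scrapling's session classes alongside the one-shot fetchers, so state (cookies, connections) can be reused across requests inside the interactive shell. A hedged usage sketch, assuming `FetcherSession` is a context manager with a requests-style `.get()` as the class names suggest; check the Scrapling docs for the exact signatures:

```python
from scrapling.fetchers import FetcherSession

# Reuse one session across several requests instead of one-shot Fetcher.get calls
with FetcherSession() as session:
    page = session.get("https://example.com")
    # Responses are Selector-like objects; .status is assumed here
    print(page.status)
```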
{scrapling-0.3.2 → scrapling-0.3.4}/scrapling/engines/_browsers/_base.py

@@ -31,7 +31,7 @@ class SyncSession:
     def __init__(self, max_pages: int = 1):
         self.max_pages = max_pages
         self.page_pool = PagePool(max_pages)
-        self.
+        self._max_wait_for_page = 60
         self.playwright: Optional[Playwright] = None
         self.context: Optional[BrowserContext] = None
         self._closed = False

@@ -50,7 +50,7 @@ class SyncSession:
         # If we're at max capacity after cleanup, wait for busy pages to finish
         if self.page_pool.pages_count >= self.max_pages:
             start_time = time()
-            while time() - start_time < self.
+            while time() - start_time < self._max_wait_for_page:
                 # Wait for any pages to finish, then clean them up
                 sleep(0.05)
                 self.page_pool.close_all_finished_pages()

@@ -58,7 +58,7 @@ class SyncSession:
                 break
             else:
                 raise TimeoutError(
-                    f"No pages finished to clear place in the pool within the {self.
+                    f"No pages finished to clear place in the pool within the {self._max_wait_for_page}s timeout period"
                 )
 
         page = self.context.new_page()

@@ -111,7 +111,7 @@ class AsyncSession(SyncSession):
         # If we're at max capacity after cleanup, wait for busy pages to finish
         if self.page_pool.pages_count >= self.max_pages:
             start_time = time()
-            while time() - start_time < self.
+            while time() - start_time < self._max_wait_for_page:
                 # Wait for any pages to finish, then clean them up
                 await asyncio_sleep(0.05)
                 await self.page_pool.aclose_all_finished_pages()

@@ -119,7 +119,7 @@ class AsyncSession(SyncSession):
                 break
             else:
                 raise TimeoutError(
-                    f"No pages finished to clear place in the pool within the {self.
+                    f"No pages finished to clear place in the pool within the {self._max_wait_for_page}s timeout period"
                 )
 
         page = await self.context.new_page()
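The renamed `_max_wait_for_page` attribute feeds a poll-with-deadline loop built on Python's `while ... else`: the `else` block runs only when the loop condition expires without a `break`, which is exactly how the `TimeoutError` above is raised. A generic, self-contained sketch of the pattern (function and predicate names are illustrative):

```python
from time import sleep, time

def wait_for(predicate, timeout: float = 60.0, interval: float = 0.05) -> None:
    start_time = time()
    while time() - start_time < timeout:
        sleep(interval)
        if predicate():
            break  # condition met; the else clause is skipped
    else:
        # Reached only if the while condition went False without a break
        raise TimeoutError(f"Condition not met within the {timeout}s timeout period")

# Example with a hypothetical pool object:
# wait_for(lambda: pool.pages_count < pool.max_pages)
```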
{scrapling-0.3.2 → scrapling-0.3.4}/scrapling/engines/_browsers/_camoufox.py

@@ -14,6 +14,7 @@ from playwright.async_api import (
     Locator as AsyncLocator,
     Page as async_Page,
 )
+from playwright._impl._errors import Error as PlaywrightError
 
 from ._validators import validate, CamoufoxConfig
 from ._base import SyncSession, AsyncSession, StealthySessionMixin

@@ -173,10 +174,6 @@ class StealthySession(StealthySessionMixin, SyncSession):
             **self.launch_options
         )
 
-        # Get the default page and close it
-        default_page = self.context.pages[0]
-        default_page.close()
-
         if self.init_script:  # pragma: no cover
             self.context.add_init_script(path=self.init_script)
 

@@ -205,20 +202,34 @@ class StealthySession(StealthySessionMixin, SyncSession):
 
         self._closed = True
 
+    @staticmethod
+    def _get_page_content(page: Page) -> str | None:
+        """
+        A workaround for Playwright issue with `page.content()` on Windows. Ref.: https://github.com/microsoft/playwright/issues/16108
+        :param page: The page to extract content from.
+        :return:
+        """
+        while True:
+            try:
+                return page.content() or ""
+            except PlaywrightError:
+                page.wait_for_timeout(1000)
+                continue
+
     def _solve_cloudflare(self, page: Page) -> None:  # pragma: no cover
         """Solve the cloudflare challenge displayed on the playwright page passed
 
         :param page: The targeted page
         :return:
         """
-        challenge_type = self._detect_cloudflare(
+        challenge_type = self._detect_cloudflare(self._get_page_content(page))
         if not challenge_type:
             log.error("No Cloudflare challenge found.")
             return
         else:
             log.info(f'The turnstile version discovered is "{challenge_type}"')
             if challenge_type == "non-interactive":
-                while "<title>Just a moment...</title>" in (
+                while "<title>Just a moment...</title>" in (self._get_page_content(page)):
                     log.info("Waiting for Cloudflare wait page to disappear.")
                     page.wait_for_timeout(1000)
                     page.wait_for_load_state()

@@ -226,7 +237,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
                 return
 
             else:
-                while "Verifying you are human." in
+                while "Verifying you are human." in self._get_page_content(page):
                     # Waiting for the verify spinner to disappear, checking every 1s if it disappeared
                     page.wait_for_timeout(500)
 

@@ -482,10 +493,6 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
             **self.launch_options
         )
 
-        # Get the default page and close it
-        default_page = self.context.pages[0]
-        await default_page.close()
-
         if self.init_script:  # pragma: no cover
             await self.context.add_init_script(path=self.init_script)
 

@@ -514,20 +521,34 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
 
         self._closed = True
 
+    @staticmethod
+    async def _get_page_content(page: async_Page) -> str | None:
+        """
+        A workaround for Playwright issue with `page.content()` on Windows. Ref.: https://github.com/microsoft/playwright/issues/16108
+        :param page: The page to extract content from.
+        :return:
+        """
+        while True:
+            try:
+                return (await page.content()) or ""
+            except PlaywrightError:
+                await page.wait_for_timeout(1000)
+                continue
+
     async def _solve_cloudflare(self, page: async_Page):
         """Solve the cloudflare challenge displayed on the playwright page passed. The async version
 
         :param page: The async targeted page
         :return:
         """
-        challenge_type = self._detect_cloudflare(await
+        challenge_type = self._detect_cloudflare(await self._get_page_content(page))
         if not challenge_type:
             log.error("No Cloudflare challenge found.")
             return
         else:
             log.info(f'The turnstile version discovered is "{challenge_type}"')
             if challenge_type == "non-interactive":  # pragma: no cover
-                while "<title>Just a moment...</title>" in (await
+                while "<title>Just a moment...</title>" in (await self._get_page_content(page)):
                     log.info("Waiting for Cloudflare wait page to disappear.")
                     await page.wait_for_timeout(1000)
                     await page.wait_for_load_state()

@@ -535,7 +556,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
                 return
 
             else:
-                while "Verifying you are human." in (await
+                while "Verifying you are human." in (await self._get_page_content(page)):
                     # Waiting for the verify spinner to disappear, checking every 1s if it disappeared
                     await page.wait_for_timeout(500)
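The new `_get_page_content` helper retries `page.content()` until the page stops navigating, because Playwright raises if content is requested mid-navigation (microsoft/playwright#16108, observed on Windows). A standalone sketch of the same workaround using Playwright's public sync API instead of the private `_impl` import:

```python
from playwright.sync_api import Error as PlaywrightError, Page

def get_page_content(page: Page, retry_delay_ms: int = 1000) -> str:
    """Retry page.content() until navigation settles.
    See https://github.com/microsoft/playwright/issues/16108"""
    while True:
        try:
            return page.content() or ""
        except PlaywrightError:
            # content() can raise while the page is navigating; wait and retry
            page.wait_for_timeout(retry_delay_ms)
```

As in the diff, the retry loop is unbounded; a production variant might cap the number of attempts rather than spin forever on a permanently failing page.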
{scrapling-0.3.2 → scrapling-0.3.4}/scrapling/engines/_browsers/_controllers.py

@@ -168,10 +168,6 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
         else:
             self.context = self.playwright.chromium.launch_persistent_context(user_data_dir="", **self.launch_options)
 
-        # Get the default page and close it
-        default_page = self.context.pages[0]
-        default_page.close()
-
         if self.init_script:  # pragma: no cover
             self.context.add_init_script(path=self.init_script)
 

@@ -421,10 +417,6 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
             user_data_dir="", **self.launch_options
         )
 
-        # Get the default page and close it
-        default_page = self.context.pages[0]
-        await default_page.close()
-
         if self.init_script:  # pragma: no cover
             await self.context.add_init_script(path=self.init_script)
{scrapling-0.3.2 → scrapling-0.3.4}/scrapling/parser.py

@@ -339,7 +339,10 @@ class Selector(SelectorsGeneration):
     @property
     def html_content(self) -> TextHandler:
         """Return the inner HTML code of the element"""
-
+        content = tostring(self._root, encoding=self.encoding, method="html", with_tail=False)
+        if isinstance(content, bytes):
+            content = content.decode("utf-8")
+        return TextHandler(content)
 
     @property
     def body(self):

@@ -348,15 +351,16 @@ class Selector(SelectorsGeneration):
 
     def prettify(self) -> TextHandler:
         """Return a prettified version of the element's inner html-code"""
-
-
-
-
-
-
-            with_tail=False,
-        )
+        content = tostring(
+            self._root,
+            encoding=self.encoding,
+            pretty_print=True,
+            method="html",
+            with_tail=False,
         )
+        if isinstance(content, bytes):
+            content = content.decode("utf-8")
+        return TextHandler(content)
 
     def has_class(self, class_name: str) -> bool:
         """Check if the element has a specific class
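The `isinstance` guard added to both properties exists because lxml's `tostring` returns `bytes` for byte encodings (such as "utf-8") and `str` only when `encoding="unicode"` or `encoding=str` is requested. A quick demonstration:

```python
from lxml.etree import tostring
from lxml.html import fromstring

root = fromstring("<div><p>hi</p></div>")
# Byte encoding -> bytes; must be decoded before wrapping in a str subclass
print(type(tostring(root, encoding="utf-8", method="html")))    # <class 'bytes'>
# "unicode" encoding -> str directly
print(type(tostring(root, encoding="unicode", method="html")))  # <class 'str'>
```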
{scrapling-0.3.2 → scrapling-0.3.4/scrapling.egg-info}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: scrapling
-Version: 0.3.2
+Version: 0.3.4
 Summary: Scrapling is an undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy and effortless as it should be!
 Home-page: https://github.com/D4Vinci/Scrapling
 Author: Karim Shoair

@@ -114,14 +114,6 @@ Dynamic: license-file
 </p>
 
 <p align="center">
-    <a href="https://scrapling.readthedocs.io/en/latest/#installation">
-        Installation
-    </a>
-    ·
-    <a href="https://scrapling.readthedocs.io/en/latest/overview/">
-        Overview
-    </a>
-    ·
     <a href="https://scrapling.readthedocs.io/en/latest/parsing/selection/">
         Selection methods
     </a>

@@ -130,6 +122,14 @@ Dynamic: license-file
         Choosing a fetcher
     </a>
     ·
+    <a href="https://scrapling.readthedocs.io/en/latest/cli/overview/">
+        CLI
+    </a>
+    ·
+    <a href="https://scrapling.readthedocs.io/en/latest/ai/mcp-server/">
+        MCP mode
+    </a>
+    ·
     <a href="https://scrapling.readthedocs.io/en/latest/tutorials/migrating_from_beautifulsoup/">
         Migrating from Beautifulsoup
     </a>

@@ -159,6 +159,7 @@ Built for the modern Web, Scrapling has its own rapid parsing engine and its fet
 
 <a href="https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling" target="_blank" title="Evomi is your Swiss Quality Proxy Provider, starting at $0.49/GB"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/evomi.png"></a>
 <a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
+<a href="https://visit.decodo.com/Dy6W0b" target="_blank" title="Try the Most Efficient Residential Proxies for Free"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png"></a>
 <a href="https://www.swiftproxy.net/" target="_blank" title="Unlock Reliable Proxy Services with Swiftproxy!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png"></a>
 <a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
 <a href="https://www.nstproxy.com/?type=flow&utm_source=scrapling" target="_blank" title="One Proxy Service, Infinite Solutions at Unbeatable Prices!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/NSTproxy.png"></a>
{scrapling-0.3.2 → scrapling-0.3.4}/setup.cfg

@@ -1,6 +1,6 @@
 [metadata]
 name = scrapling
-version = 0.3.2
+version = 0.3.4
 author = Karim Shoair
 author_email = karim.shoair@pm.me
 description = Scrapling is an undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy and effortless as it should be!