scrapling 0.3__tar.gz → 0.3.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {scrapling-0.3/scrapling.egg-info → scrapling-0.3.1}/PKG-INFO +5 -3
- {scrapling-0.3 → scrapling-0.3.1}/README.md +4 -2
- {scrapling-0.3 → scrapling-0.3.1}/scrapling/__init__.py +1 -1
- {scrapling-0.3 → scrapling-0.3.1}/scrapling/core/shell.py +3 -3
- {scrapling-0.3 → scrapling-0.3.1}/scrapling/engines/_browsers/_camoufox.py +14 -0
- {scrapling-0.3 → scrapling-0.3.1}/scrapling/engines/_browsers/_controllers.py +14 -0
- {scrapling-0.3 → scrapling-0.3.1}/scrapling/engines/_browsers/_validators.py +20 -0
- {scrapling-0.3 → scrapling-0.3.1}/scrapling/fetchers.py +12 -0
- {scrapling-0.3 → scrapling-0.3.1/scrapling.egg-info}/PKG-INFO +5 -3
- {scrapling-0.3 → scrapling-0.3.1}/setup.cfg +1 -1
- {scrapling-0.3 → scrapling-0.3.1}/LICENSE +0 -0
- {scrapling-0.3 → scrapling-0.3.1}/MANIFEST.in +0 -0
- {scrapling-0.3 → scrapling-0.3.1}/pyproject.toml +0 -0
- {scrapling-0.3 → scrapling-0.3.1}/scrapling/cli.py +0 -0
- {scrapling-0.3 → scrapling-0.3.1}/scrapling/core/__init__.py +0 -0
- {scrapling-0.3 → scrapling-0.3.1}/scrapling/core/_html_utils.py +0 -0
- {scrapling-0.3 → scrapling-0.3.1}/scrapling/core/_types.py +0 -0
- {scrapling-0.3 → scrapling-0.3.1}/scrapling/core/ai.py +0 -0
- {scrapling-0.3 → scrapling-0.3.1}/scrapling/core/custom_types.py +0 -0
- {scrapling-0.3 → scrapling-0.3.1}/scrapling/core/mixins.py +0 -0
- {scrapling-0.3 → scrapling-0.3.1}/scrapling/core/storage.py +0 -0
- {scrapling-0.3 → scrapling-0.3.1}/scrapling/core/translator.py +0 -0
- {scrapling-0.3 → scrapling-0.3.1}/scrapling/core/utils.py +0 -0
- {scrapling-0.3 → scrapling-0.3.1}/scrapling/engines/__init__.py +0 -0
- {scrapling-0.3 → scrapling-0.3.1}/scrapling/engines/_browsers/__init__.py +0 -0
- {scrapling-0.3 → scrapling-0.3.1}/scrapling/engines/_browsers/_config_tools.py +0 -0
- {scrapling-0.3 → scrapling-0.3.1}/scrapling/engines/_browsers/_page.py +0 -0
- {scrapling-0.3 → scrapling-0.3.1}/scrapling/engines/constants.py +0 -0
- {scrapling-0.3 → scrapling-0.3.1}/scrapling/engines/static.py +0 -0
- {scrapling-0.3 → scrapling-0.3.1}/scrapling/engines/toolbelt/__init__.py +0 -0
- {scrapling-0.3 → scrapling-0.3.1}/scrapling/engines/toolbelt/bypasses/navigator_plugins.js +0 -0
- {scrapling-0.3 → scrapling-0.3.1}/scrapling/engines/toolbelt/bypasses/notification_permission.js +0 -0
- {scrapling-0.3 → scrapling-0.3.1}/scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +0 -0
- {scrapling-0.3 → scrapling-0.3.1}/scrapling/engines/toolbelt/bypasses/screen_props.js +0 -0
- {scrapling-0.3 → scrapling-0.3.1}/scrapling/engines/toolbelt/bypasses/webdriver_fully.js +0 -0
- {scrapling-0.3 → scrapling-0.3.1}/scrapling/engines/toolbelt/bypasses/window_chrome.js +0 -0
- {scrapling-0.3 → scrapling-0.3.1}/scrapling/engines/toolbelt/convertor.py +0 -0
- {scrapling-0.3 → scrapling-0.3.1}/scrapling/engines/toolbelt/custom.py +0 -0
- {scrapling-0.3 → scrapling-0.3.1}/scrapling/engines/toolbelt/fingerprints.py +0 -0
- {scrapling-0.3 → scrapling-0.3.1}/scrapling/engines/toolbelt/navigation.py +0 -0
- {scrapling-0.3 → scrapling-0.3.1}/scrapling/parser.py +0 -0
- {scrapling-0.3 → scrapling-0.3.1}/scrapling/py.typed +0 -0
- {scrapling-0.3 → scrapling-0.3.1}/scrapling.egg-info/SOURCES.txt +0 -0
- {scrapling-0.3 → scrapling-0.3.1}/scrapling.egg-info/dependency_links.txt +0 -0
- {scrapling-0.3 → scrapling-0.3.1}/scrapling.egg-info/entry_points.txt +0 -0
- {scrapling-0.3 → scrapling-0.3.1}/scrapling.egg-info/not-zip-safe +0 -0
- {scrapling-0.3 → scrapling-0.3.1}/scrapling.egg-info/requires.txt +0 -0
- {scrapling-0.3 → scrapling-0.3.1}/scrapling.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: scrapling
|
3
|
-
Version: 0.3
|
3
|
+
Version: 0.3.1
|
4
4
|
Summary: Scrapling is an undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy and effortless as it should be!
|
5
5
|
Home-page: https://github.com/D4Vinci/Scrapling
|
6
6
|
Author: Karim Shoair
|
@@ -155,8 +155,8 @@ Built for the modern Web, Scrapling has its own rapid parsing engine and its fet
|
|
155
155
|
<!-- sponsors -->
|
156
156
|
|
157
157
|
<a href="https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling" target="_blank" title="Evomi is your Swiss Quality Proxy Provider, starting at $0.49/GB"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/evomi.png"></a>
|
158
|
-
<a href="https://www.swiftproxy.net/" target="_blank" title="Unlock Reliable Proxy Services with Swiftproxy!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png"></a>
|
159
158
|
<a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
|
159
|
+
<a href="https://www.swiftproxy.net/" target="_blank" title="Unlock Reliable Proxy Services with Swiftproxy!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png"></a>
|
160
160
|
<a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
|
161
161
|
|
162
162
|
<!-- /sponsors -->
|
@@ -273,7 +273,7 @@ from scrapling.parser import Selector
|
|
273
273
|
|
274
274
|
page = Selector("<html>...</html>")
|
275
275
|
```
|
276
|
-
And it works exactly the same!
|
276
|
+
And it works exactly the same way!
|
277
277
|
|
278
278
|
### Async Session Management Examples
|
279
279
|
```python
|
@@ -302,6 +302,8 @@ async with AsyncStealthySession(max_pages=2) as session:
|
|
302
302
|
|
303
303
|
Scrapling v0.3 includes a powerful command-line interface:
|
304
304
|
|
305
|
+
[![asciicast](https://asciinema.org/a/736339.svg)](https://asciinema.org/a/736339)
|
306
|
+
|
305
307
|
```bash
|
306
308
|
# Launch interactive Web Scraping shell
|
307
309
|
scrapling shell
|
@@ -68,8 +68,8 @@ Built for the modern Web, Scrapling has its own rapid parsing engine and its fet
|
|
68
68
|
<!-- sponsors -->
|
69
69
|
|
70
70
|
<a href="https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling" target="_blank" title="Evomi is your Swiss Quality Proxy Provider, starting at $0.49/GB"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/evomi.png"></a>
|
71
|
-
<a href="https://www.swiftproxy.net/" target="_blank" title="Unlock Reliable Proxy Services with Swiftproxy!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png"></a>
|
72
71
|
<a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
|
72
|
+
<a href="https://www.swiftproxy.net/" target="_blank" title="Unlock Reliable Proxy Services with Swiftproxy!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png"></a>
|
73
73
|
<a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
|
74
74
|
|
75
75
|
<!-- /sponsors -->
|
@@ -186,7 +186,7 @@ from scrapling.parser import Selector
|
|
186
186
|
|
187
187
|
page = Selector("<html>...</html>")
|
188
188
|
```
|
189
|
-
And it works exactly the same!
|
189
|
+
And it works exactly the same way!
|
190
190
|
|
191
191
|
### Async Session Management Examples
|
192
192
|
```python
|
@@ -215,6 +215,8 @@ async with AsyncStealthySession(max_pages=2) as session:
|
|
215
215
|
|
216
216
|
Scrapling v0.3 includes a powerful command-line interface:
|
217
217
|
|
218
|
+
[![asciicast](https://asciinema.org/a/736339.svg)](https://asciinema.org/a/736339)
|
219
|
+
|
218
220
|
```bash
|
219
221
|
# Launch interactive Web Scraping shell
|
220
222
|
scrapling shell
|
@@ -20,7 +20,6 @@ from logging import (
|
|
20
20
|
getLevelName,
|
21
21
|
)
|
22
22
|
|
23
|
-
from IPython.terminal.embed import InteractiveShellEmbed
|
24
23
|
from orjson import loads as json_loads, JSONDecodeError
|
25
24
|
|
26
25
|
from scrapling import __version__
|
@@ -394,8 +393,7 @@ class CurlParser:
|
|
394
393
|
|
395
394
|
else: # pragma: no cover
|
396
395
|
log.error("Input must be a valid curl command string or a Request object.")
|
397
|
-
|
398
|
-
return None
|
396
|
+
return None
|
399
397
|
|
400
398
|
|
401
399
|
def show_page_in_browser(page: Selector): # pragma: no cover
|
@@ -544,6 +542,8 @@ Type 'exit' or press Ctrl+D to exit.
|
|
544
542
|
|
545
543
|
def start(self): # pragma: no cover
|
546
544
|
"""Start the interactive shell"""
|
545
|
+
from IPython.terminal.embed import InteractiveShellEmbed
|
546
|
+
|
547
547
|
# Get our namespace with application objects
|
548
548
|
namespace = self.get_namespace()
|
549
549
|
ipython_shell = InteractiveShellEmbed(
|
@@ -60,6 +60,7 @@ class StealthySession:
|
|
60
60
|
"timeout",
|
61
61
|
"page_action",
|
62
62
|
"wait_selector",
|
63
|
+
"init_script",
|
63
64
|
"addons",
|
64
65
|
"wait_selector_state",
|
65
66
|
"cookies",
|
@@ -95,6 +96,7 @@ class StealthySession:
|
|
95
96
|
timeout: int | float = 30000,
|
96
97
|
page_action: Optional[Callable] = None,
|
97
98
|
wait_selector: Optional[str] = None,
|
99
|
+
init_script: Optional[str] = None,
|
98
100
|
addons: Optional[List[str]] = None,
|
99
101
|
wait_selector_state: SelectorWaitStates = "attached",
|
100
102
|
cookies: Optional[List[Dict]] = None,
|
@@ -128,6 +130,7 @@ class StealthySession:
|
|
128
130
|
:param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
|
129
131
|
:param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
|
130
132
|
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
|
133
|
+
:param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
|
131
134
|
:param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, and spoof the WebRTC IP address.
|
132
135
|
It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region.
|
133
136
|
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
@@ -153,6 +156,7 @@ class StealthySession:
|
|
153
156
|
"timeout": timeout,
|
154
157
|
"page_action": page_action,
|
155
158
|
"wait_selector": wait_selector,
|
159
|
+
"init_script": init_script,
|
156
160
|
"addons": addons,
|
157
161
|
"wait_selector_state": wait_selector_state,
|
158
162
|
"cookies": cookies,
|
@@ -180,6 +184,7 @@ class StealthySession:
|
|
180
184
|
self.timeout = config.timeout
|
181
185
|
self.page_action = config.page_action
|
182
186
|
self.wait_selector = config.wait_selector
|
187
|
+
self.init_script = config.init_script
|
183
188
|
self.addons = config.addons
|
184
189
|
self.wait_selector_state = config.wait_selector_state
|
185
190
|
self.cookies = config.cookies
|
@@ -234,6 +239,9 @@ class StealthySession:
|
|
234
239
|
**self.launch_options
|
235
240
|
)
|
236
241
|
)
|
242
|
+
if self.init_script: # pragma: no cover
|
243
|
+
self.context.add_init_script(path=self.init_script)
|
244
|
+
|
237
245
|
if self.cookies: # pragma: no cover
|
238
246
|
self.context.add_cookies(self.cookies)
|
239
247
|
|
@@ -474,6 +482,7 @@ class AsyncStealthySession(StealthySession):
|
|
474
482
|
timeout: int | float = 30000,
|
475
483
|
page_action: Optional[Callable] = None,
|
476
484
|
wait_selector: Optional[str] = None,
|
485
|
+
init_script: Optional[str] = None,
|
477
486
|
addons: Optional[List[str]] = None,
|
478
487
|
wait_selector_state: SelectorWaitStates = "attached",
|
479
488
|
cookies: Optional[List[Dict]] = None,
|
@@ -507,6 +516,7 @@ class AsyncStealthySession(StealthySession):
|
|
507
516
|
:param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
|
508
517
|
:param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
|
509
518
|
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
|
519
|
+
:param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
|
510
520
|
:param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, and spoof the WebRTC IP address.
|
511
521
|
It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region.
|
512
522
|
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
@@ -531,6 +541,7 @@ class AsyncStealthySession(StealthySession):
|
|
531
541
|
timeout,
|
532
542
|
page_action,
|
533
543
|
wait_selector,
|
544
|
+
init_script,
|
534
545
|
addons,
|
535
546
|
wait_selector_state,
|
536
547
|
cookies,
|
@@ -557,6 +568,9 @@ class AsyncStealthySession(StealthySession):
|
|
557
568
|
**self.launch_options
|
558
569
|
)
|
559
570
|
)
|
571
|
+
if self.init_script: # pragma: no cover
|
572
|
+
await self.context.add_init_script(path=self.init_script)
|
573
|
+
|
560
574
|
if self.cookies:
|
561
575
|
await self.context.add_cookies(self.cookies)
|
562
576
|
|
@@ -60,6 +60,7 @@ class DynamicSession:
|
|
60
60
|
"disable_resources",
|
61
61
|
"network_idle",
|
62
62
|
"wait_selector",
|
63
|
+
"init_script",
|
63
64
|
"wait_selector_state",
|
64
65
|
"wait",
|
65
66
|
"playwright",
|
@@ -94,6 +95,7 @@ class DynamicSession:
|
|
94
95
|
timeout: int | float = 30000,
|
95
96
|
disable_resources: bool = False,
|
96
97
|
wait_selector: Optional[str] = None,
|
98
|
+
init_script: Optional[str] = None,
|
97
99
|
cookies: Optional[List[Dict]] = None,
|
98
100
|
network_idle: bool = False,
|
99
101
|
wait_selector_state: SelectorWaitStates = "attached",
|
@@ -112,6 +114,7 @@ class DynamicSession:
|
|
112
114
|
:param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
|
113
115
|
:param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
|
114
116
|
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
|
117
|
+
:param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
|
115
118
|
:param locale: Set the locale for the browser if wanted. The default value is `en-US`.
|
116
119
|
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
117
120
|
:param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
|
@@ -143,6 +146,7 @@ class DynamicSession:
|
|
143
146
|
"selector_config": selector_config,
|
144
147
|
"disable_resources": disable_resources,
|
145
148
|
"wait_selector": wait_selector,
|
149
|
+
"init_script": init_script,
|
146
150
|
"cookies": cookies,
|
147
151
|
"network_idle": network_idle,
|
148
152
|
"wait_selector_state": wait_selector_state,
|
@@ -168,6 +172,7 @@ class DynamicSession:
|
|
168
172
|
self.cdp_url = config.cdp_url
|
169
173
|
self.network_idle = config.network_idle
|
170
174
|
self.wait_selector = config.wait_selector
|
175
|
+
self.init_script = config.init_script
|
171
176
|
self.wait_selector_state = config.wait_selector_state
|
172
177
|
|
173
178
|
self.playwright: Optional[Playwright] = None
|
@@ -243,6 +248,9 @@ class DynamicSession:
|
|
243
248
|
user_data_dir="", **self.launch_options
|
244
249
|
)
|
245
250
|
|
251
|
+
if self.init_script: # pragma: no cover
|
252
|
+
self.context.add_init_script(path=self.init_script)
|
253
|
+
|
246
254
|
if self.cookies: # pragma: no cover
|
247
255
|
self.context.add_cookies(self.cookies)
|
248
256
|
|
@@ -409,6 +417,7 @@ class AsyncDynamicSession(DynamicSession):
|
|
409
417
|
timeout: int | float = 30000,
|
410
418
|
disable_resources: bool = False,
|
411
419
|
wait_selector: Optional[str] = None,
|
420
|
+
init_script: Optional[str] = None,
|
412
421
|
cookies: Optional[List[Dict]] = None,
|
413
422
|
network_idle: bool = False,
|
414
423
|
wait_selector_state: SelectorWaitStates = "attached",
|
@@ -427,6 +436,7 @@ class AsyncDynamicSession(DynamicSession):
|
|
427
436
|
:param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
|
428
437
|
:param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
|
429
438
|
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
|
439
|
+
:param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
|
430
440
|
:param locale: Set the locale for the browser if wanted. The default value is `en-US`.
|
431
441
|
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
432
442
|
:param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
|
@@ -459,6 +469,7 @@ class AsyncDynamicSession(DynamicSession):
|
|
459
469
|
timeout,
|
460
470
|
disable_resources,
|
461
471
|
wait_selector,
|
472
|
+
init_script,
|
462
473
|
cookies,
|
463
474
|
network_idle,
|
464
475
|
wait_selector_state,
|
@@ -494,6 +505,9 @@ class AsyncDynamicSession(DynamicSession):
|
|
494
505
|
)
|
495
506
|
)
|
496
507
|
|
508
|
+
if self.init_script: # pragma: no cover
|
509
|
+
await self.context.add_init_script(path=self.init_script)
|
510
|
+
|
497
511
|
if self.cookies:
|
498
512
|
await self.context.add_cookies(self.cookies)
|
499
513
|
|
@@ -32,6 +32,7 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False):
|
|
32
32
|
extra_headers: Optional[Dict[str, str]] = None
|
33
33
|
useragent: Optional[str] = None
|
34
34
|
timeout: int | float = 30000
|
35
|
+
init_script: Optional[str] = None
|
35
36
|
disable_resources: bool = False
|
36
37
|
wait_selector: Optional[str] = None
|
37
38
|
cookies: Optional[List[Dict]] = None
|
@@ -58,6 +59,15 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False):
|
|
58
59
|
if not self.selector_config:
|
59
60
|
self.selector_config = {}
|
60
61
|
|
62
|
+
if self.init_script is not None:
|
63
|
+
script_path = Path(self.init_script)
|
64
|
+
if not script_path.exists():
|
65
|
+
raise ValueError("Init script path not found")
|
66
|
+
elif not script_path.is_file():
|
67
|
+
raise ValueError("Init script is not a file")
|
68
|
+
elif not script_path.is_absolute():
|
69
|
+
raise ValueError("Init script is not a absolute path")
|
70
|
+
|
61
71
|
@staticmethod
|
62
72
|
def __validate_cdp(cdp_url):
|
63
73
|
try:
|
@@ -90,6 +100,7 @@ class CamoufoxConfig(Struct, kw_only=True, frozen=False):
|
|
90
100
|
solve_cloudflare: bool = False
|
91
101
|
wait: int | float = 0
|
92
102
|
timeout: int | float = 30000
|
103
|
+
init_script: Optional[str] = None
|
93
104
|
page_action: Optional[Callable] = None
|
94
105
|
wait_selector: Optional[str] = None
|
95
106
|
addons: Optional[List[str]] = None
|
@@ -131,6 +142,15 @@ class CamoufoxConfig(Struct, kw_only=True, frozen=False):
|
|
131
142
|
f"Addon's path is not a folder, you need to pass a folder of the extracted addon: {addon}"
|
132
143
|
)
|
133
144
|
|
145
|
+
if self.init_script is not None:
|
146
|
+
script_path = Path(self.init_script)
|
147
|
+
if not script_path.exists():
|
148
|
+
raise ValueError("Init script path not found")
|
149
|
+
elif not script_path.is_file():
|
150
|
+
raise ValueError("Init script is not a file")
|
151
|
+
elif not script_path.is_absolute():
|
152
|
+
raise ValueError("Init script is not a absolute path")
|
153
|
+
|
134
154
|
if not self.cookies:
|
135
155
|
self.cookies = []
|
136
156
|
if self.solve_cloudflare and self.timeout < 60_000:
|
@@ -62,6 +62,7 @@ class StealthyFetcher(BaseFetcher):
|
|
62
62
|
timeout: int | float = 30000,
|
63
63
|
page_action: Optional[Callable] = None,
|
64
64
|
wait_selector: Optional[str] = None,
|
65
|
+
init_script: Optional[str] = None,
|
65
66
|
addons: Optional[List[str]] = None,
|
66
67
|
wait_selector_state: SelectorWaitStates = "attached",
|
67
68
|
cookies: Optional[List[Dict]] = None,
|
@@ -97,6 +98,7 @@ class StealthyFetcher(BaseFetcher):
|
|
97
98
|
:param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
|
98
99
|
:param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
|
99
100
|
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
|
101
|
+
:param init_script: An absolute path to a JavaScript file to be executed on page creation with this request.
|
100
102
|
:param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, and spoof the WebRTC IP address.
|
101
103
|
It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region.
|
102
104
|
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
@@ -127,6 +129,7 @@ class StealthyFetcher(BaseFetcher):
|
|
127
129
|
disable_ads=disable_ads,
|
128
130
|
allow_webgl=allow_webgl,
|
129
131
|
page_action=page_action,
|
132
|
+
init_script=init_script,
|
130
133
|
network_idle=network_idle,
|
131
134
|
block_images=block_images,
|
132
135
|
block_webrtc=block_webrtc,
|
@@ -158,6 +161,7 @@ class StealthyFetcher(BaseFetcher):
|
|
158
161
|
timeout: int | float = 30000,
|
159
162
|
page_action: Optional[Callable] = None,
|
160
163
|
wait_selector: Optional[str] = None,
|
164
|
+
init_script: Optional[str] = None,
|
161
165
|
addons: Optional[List[str]] = None,
|
162
166
|
wait_selector_state: SelectorWaitStates = "attached",
|
163
167
|
cookies: Optional[List[Dict]] = None,
|
@@ -193,6 +197,7 @@ class StealthyFetcher(BaseFetcher):
|
|
193
197
|
:param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
|
194
198
|
:param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
|
195
199
|
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
|
200
|
+
:param init_script: An absolute path to a JavaScript file to be executed on page creation with this request.
|
196
201
|
:param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, and spoof the WebRTC IP address.
|
197
202
|
It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region.
|
198
203
|
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
@@ -223,6 +228,7 @@ class StealthyFetcher(BaseFetcher):
|
|
223
228
|
disable_ads=disable_ads,
|
224
229
|
allow_webgl=allow_webgl,
|
225
230
|
page_action=page_action,
|
231
|
+
init_script=init_script,
|
226
232
|
network_idle=network_idle,
|
227
233
|
block_images=block_images,
|
228
234
|
block_webrtc=block_webrtc,
|
@@ -276,6 +282,7 @@ class DynamicFetcher(BaseFetcher):
|
|
276
282
|
timeout: int | float = 30000,
|
277
283
|
disable_resources: bool = False,
|
278
284
|
wait_selector: Optional[str] = None,
|
285
|
+
init_script: Optional[str] = None,
|
279
286
|
cookies: Optional[Iterable[Dict]] = None,
|
280
287
|
network_idle: bool = False,
|
281
288
|
wait_selector_state: SelectorWaitStates = "attached",
|
@@ -295,6 +302,7 @@ class DynamicFetcher(BaseFetcher):
|
|
295
302
|
:param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
|
296
303
|
:param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
|
297
304
|
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
|
305
|
+
:param init_script: An absolute path to a JavaScript file to be executed on page creation with this request.
|
298
306
|
:param locale: Set the locale for the browser if wanted. The default value is `en-US`.
|
299
307
|
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
300
308
|
:param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
|
@@ -328,6 +336,7 @@ class DynamicFetcher(BaseFetcher):
|
|
328
336
|
real_chrome=real_chrome,
|
329
337
|
page_action=page_action,
|
330
338
|
hide_canvas=hide_canvas,
|
339
|
+
init_script=init_script,
|
331
340
|
network_idle=network_idle,
|
332
341
|
google_search=google_search,
|
333
342
|
extra_headers=extra_headers,
|
@@ -359,6 +368,7 @@ class DynamicFetcher(BaseFetcher):
|
|
359
368
|
timeout: int | float = 30000,
|
360
369
|
disable_resources: bool = False,
|
361
370
|
wait_selector: Optional[str] = None,
|
371
|
+
init_script: Optional[str] = None,
|
362
372
|
cookies: Optional[Iterable[Dict]] = None,
|
363
373
|
network_idle: bool = False,
|
364
374
|
wait_selector_state: SelectorWaitStates = "attached",
|
@@ -378,6 +388,7 @@ class DynamicFetcher(BaseFetcher):
|
|
378
388
|
:param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
|
379
389
|
:param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
|
380
390
|
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
|
391
|
+
:param init_script: An absolute path to a JavaScript file to be executed on page creation with this request.
|
381
392
|
:param locale: Set the locale for the browser if wanted. The default value is `en-US`.
|
382
393
|
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
383
394
|
:param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
|
@@ -412,6 +423,7 @@ class DynamicFetcher(BaseFetcher):
|
|
412
423
|
real_chrome=real_chrome,
|
413
424
|
page_action=page_action,
|
414
425
|
hide_canvas=hide_canvas,
|
426
|
+
init_script=init_script,
|
415
427
|
network_idle=network_idle,
|
416
428
|
google_search=google_search,
|
417
429
|
extra_headers=extra_headers,
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: scrapling
|
3
|
-
Version: 0.3
|
3
|
+
Version: 0.3.1
|
4
4
|
Summary: Scrapling is an undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy and effortless as it should be!
|
5
5
|
Home-page: https://github.com/D4Vinci/Scrapling
|
6
6
|
Author: Karim Shoair
|
@@ -155,8 +155,8 @@ Built for the modern Web, Scrapling has its own rapid parsing engine and its fet
|
|
155
155
|
<!-- sponsors -->
|
156
156
|
|
157
157
|
<a href="https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling" target="_blank" title="Evomi is your Swiss Quality Proxy Provider, starting at $0.49/GB"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/evomi.png"></a>
|
158
|
-
<a href="https://www.swiftproxy.net/" target="_blank" title="Unlock Reliable Proxy Services with Swiftproxy!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png"></a>
|
159
158
|
<a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
|
159
|
+
<a href="https://www.swiftproxy.net/" target="_blank" title="Unlock Reliable Proxy Services with Swiftproxy!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png"></a>
|
160
160
|
<a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
|
161
161
|
|
162
162
|
<!-- /sponsors -->
|
@@ -273,7 +273,7 @@ from scrapling.parser import Selector
|
|
273
273
|
|
274
274
|
page = Selector("<html>...</html>")
|
275
275
|
```
|
276
|
-
And it works exactly the same!
|
276
|
+
And it works exactly the same way!
|
277
277
|
|
278
278
|
### Async Session Management Examples
|
279
279
|
```python
|
@@ -302,6 +302,8 @@ async with AsyncStealthySession(max_pages=2) as session:
|
|
302
302
|
|
303
303
|
Scrapling v0.3 includes a powerful command-line interface:
|
304
304
|
|
305
|
+
[![asciicast](https://asciinema.org/a/736339.svg)](https://asciinema.org/a/736339)
|
306
|
+
|
305
307
|
```bash
|
306
308
|
# Launch interactive Web Scraping shell
|
307
309
|
scrapling shell
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[metadata]
|
2
2
|
name = scrapling
|
3
|
-
version = 0.3
|
3
|
+
version = 0.3.1
|
4
4
|
author = Karim Shoair
|
5
5
|
author_email = karim.shoair@pm.me
|
6
6
|
description = Scrapling is an undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy and effortless as it should be!
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{scrapling-0.3 → scrapling-0.3.1}/scrapling/engines/toolbelt/bypasses/notification_permission.js
RENAMED
File without changes
|
{scrapling-0.3 → scrapling-0.3.1}/scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|