scrapling 0.2.5__tar.gz → 0.2.7__tar.gz
- {scrapling-0.2.5/scrapling.egg-info → scrapling-0.2.7}/PKG-INFO +21 -11
- {scrapling-0.2.5 → scrapling-0.2.7}/README.md +19 -9
- {scrapling-0.2.5 → scrapling-0.2.7}/scrapling/__init__.py +1 -1
- {scrapling-0.2.5 → scrapling-0.2.7}/scrapling/engines/camo.py +12 -1
- {scrapling-0.2.5 → scrapling-0.2.7}/scrapling/engines/constants.py +1 -1
- {scrapling-0.2.5 → scrapling-0.2.7}/scrapling/engines/pw.py +34 -10
- {scrapling-0.2.5 → scrapling-0.2.7}/scrapling/engines/static.py +20 -8
- {scrapling-0.2.5 → scrapling-0.2.7}/scrapling/engines/toolbelt/custom.py +12 -8
- {scrapling-0.2.5 → scrapling-0.2.7}/scrapling/engines/toolbelt/fingerprints.py +1 -1
- {scrapling-0.2.5 → scrapling-0.2.7}/scrapling/fetchers.py +25 -15
- {scrapling-0.2.5 → scrapling-0.2.7/scrapling.egg-info}/PKG-INFO +21 -11
- {scrapling-0.2.5 → scrapling-0.2.7}/scrapling.egg-info/requires.txt +1 -1
- {scrapling-0.2.5 → scrapling-0.2.7}/setup.cfg +1 -1
- {scrapling-0.2.5 → scrapling-0.2.7}/setup.py +2 -2
- {scrapling-0.2.5 → scrapling-0.2.7}/LICENSE +0 -0
- {scrapling-0.2.5 → scrapling-0.2.7}/MANIFEST.in +0 -0
- {scrapling-0.2.5 → scrapling-0.2.7}/scrapling/core/__init__.py +0 -0
- {scrapling-0.2.5 → scrapling-0.2.7}/scrapling/core/_types.py +0 -0
- {scrapling-0.2.5 → scrapling-0.2.7}/scrapling/core/custom_types.py +0 -0
- {scrapling-0.2.5 → scrapling-0.2.7}/scrapling/core/mixins.py +0 -0
- {scrapling-0.2.5 → scrapling-0.2.7}/scrapling/core/storage_adaptors.py +0 -0
- {scrapling-0.2.5 → scrapling-0.2.7}/scrapling/core/translator.py +0 -0
- {scrapling-0.2.5 → scrapling-0.2.7}/scrapling/core/utils.py +0 -0
- {scrapling-0.2.5 → scrapling-0.2.7}/scrapling/defaults.py +0 -0
- {scrapling-0.2.5 → scrapling-0.2.7}/scrapling/engines/__init__.py +0 -0
- {scrapling-0.2.5 → scrapling-0.2.7}/scrapling/engines/toolbelt/__init__.py +0 -0
- {scrapling-0.2.5 → scrapling-0.2.7}/scrapling/engines/toolbelt/bypasses/navigator_plugins.js +0 -0
- {scrapling-0.2.5 → scrapling-0.2.7}/scrapling/engines/toolbelt/bypasses/notification_permission.js +0 -0
- {scrapling-0.2.5 → scrapling-0.2.7}/scrapling/engines/toolbelt/bypasses/pdf_viewer.js +0 -0
- {scrapling-0.2.5 → scrapling-0.2.7}/scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +0 -0
- {scrapling-0.2.5 → scrapling-0.2.7}/scrapling/engines/toolbelt/bypasses/screen_props.js +0 -0
- {scrapling-0.2.5 → scrapling-0.2.7}/scrapling/engines/toolbelt/bypasses/webdriver_fully.js +0 -0
- {scrapling-0.2.5 → scrapling-0.2.7}/scrapling/engines/toolbelt/bypasses/window_chrome.js +0 -0
- {scrapling-0.2.5 → scrapling-0.2.7}/scrapling/engines/toolbelt/navigation.py +0 -0
- {scrapling-0.2.5 → scrapling-0.2.7}/scrapling/parser.py +0 -0
- {scrapling-0.2.5 → scrapling-0.2.7}/scrapling/py.typed +0 -0
- {scrapling-0.2.5 → scrapling-0.2.7}/scrapling.egg-info/SOURCES.txt +0 -0
- {scrapling-0.2.5 → scrapling-0.2.7}/scrapling.egg-info/dependency_links.txt +0 -0
- {scrapling-0.2.5 → scrapling-0.2.7}/scrapling.egg-info/not-zip-safe +0 -0
- {scrapling-0.2.5 → scrapling-0.2.7}/scrapling.egg-info/top_level.txt +0 -0
- {scrapling-0.2.5 → scrapling-0.2.7}/tests/__init__.py +0 -0
- {scrapling-0.2.5 → scrapling-0.2.7}/tests/fetchers/__init__.py +0 -0
- {scrapling-0.2.5 → scrapling-0.2.7}/tests/fetchers/test_camoufox.py +0 -0
- {scrapling-0.2.5 → scrapling-0.2.7}/tests/fetchers/test_httpx.py +0 -0
- {scrapling-0.2.5 → scrapling-0.2.7}/tests/fetchers/test_playwright.py +0 -0
- {scrapling-0.2.5 → scrapling-0.2.7}/tests/fetchers/test_utils.py +0 -0
- {scrapling-0.2.5 → scrapling-0.2.7}/tests/parser/__init__.py +0 -0
- {scrapling-0.2.5 → scrapling-0.2.7}/tests/parser/test_automatch.py +0 -0
- {scrapling-0.2.5 → scrapling-0.2.7}/tests/parser/test_general.py +0 -0
{scrapling-0.2.5/scrapling.egg-info → scrapling-0.2.7}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: scrapling
-Version: 0.2.5
+Version: 0.2.7
 Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
 Home-page: https://github.com/D4Vinci/Scrapling
 Author: Karim Shoair
@@ -39,7 +39,7 @@ Requires-Dist: w3lib
 Requires-Dist: orjson>=3
 Requires-Dist: tldextract
 Requires-Dist: httpx[brotli,zstd]
-Requires-Dist: playwright
+Requires-Dist: playwright==1.48
 Requires-Dist: rebrowser-playwright
 Requires-Dist: camoufox>=0.3.10
 Requires-Dist: browserforge
@@ -90,10 +90,11 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha
 * [Text Extraction Speed Test (5000 nested elements).](#text-extraction-speed-test-5000-nested-elements)
 * [Extraction By Text Speed Test](#extraction-by-text-speed-test)
 * [Installation](#installation)
-* [Fetching Websites
-* [
-* [
-* [
+* [Fetching Websites](#fetching-websites)
+  * [Features](#features)
+  * [Fetcher class](#fetcher)
+  * [StealthyFetcher class](#stealthyfetcher)
+  * [PlayWrightFetcher class](#playwrightfetcher)
 * [Advanced Parsing Features](#advanced-parsing-features)
 * [Smart Navigation](#smart-navigation)
 * [Content-based Selection & Finding Similar Elements](#content-based-selection--finding-similar-elements)
@@ -256,7 +257,10 @@ playwright install chromium
 python -m browserforge update
 ```
 
-## Fetching Websites
+## Fetching Websites
+Fetchers are basically interfaces that do requests or fetch pages for you in a single request fashion then return an `Adaptor` object for you. This feature was introduced because the only option we had before was to fetch the page as you want then pass it manually to the `Adaptor` class to create an `Adaptor` instance and start playing around with the page.
+
+### Features
 You might be a little bit confused by now so let me clear things up. All fetcher-type classes are imported in the same way
 ```python
 from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
@@ -279,9 +283,11 @@ Also, the `Response` object returned from all fetchers is the same as `Adaptor`
 This class is built on top of [httpx](https://www.python-httpx.org/) with additional configuration options, here you can do `GET`, `POST`, `PUT`, and `DELETE` requests.
 
 For all methods, you have `stealth_headers` which makes `Fetcher` create and use real browser's headers then create a referer header as if this request came from Google's search of this URL's domain. It's enabled by default.
+
+You can route all traffic (HTTP and HTTPS) to a proxy for any of these methods in this format `http://username:password@localhost:8030`
 ```python
 >> page = Fetcher().get('https://httpbin.org/get', stealth_headers=True, follow_redirects=True)
->> page = Fetcher().post('https://httpbin.org/post', data={'key': 'value'})
+>> page = Fetcher().post('https://httpbin.org/post', data={'key': 'value'}, proxy='http://username:password@localhost:8030')
 >> page = Fetcher().put('https://httpbin.org/put', data={'key': 'value'})
 >> page = Fetcher().delete('https://httpbin.org/delete')
 ```
@@ -309,6 +315,7 @@ True
 | addons | List of Firefox addons to use. **Must be paths to extracted addons.** | ✔️ |
 | humanize | Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window. | ✔️ |
 | allow_webgl | Whether to allow WebGL. To prevent leaks, only use this for special cases. | ✔️ |
+| disable_ads | Enabled by default, this installs `uBlock Origin` addon on the browser if enabled. | ✔️ |
 | network_idle | Wait for the page until there are no network connections for at least 500 ms. | ✔️ |
 | timeout | The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000. | ✔️ |
 | wait_selector | Wait for a specific css selector to be in a specific state. | ✔️ |
@@ -336,9 +343,11 @@ Using this Fetcher class, you can make requests with:
 * Mimics some of the real browsers' properties by injecting several JS files and using custom options.
 * Using custom flags on launch to hide Playwright even more and make it faster.
 * Generates real browser's headers of the same type and same user OS then append it to the request's headers.
-3) Real browsers by passing the CDP URL of your browser to be controlled by the Fetcher and most of the options can be enabled on it.
+3) Real browsers by passing the `real_chrome` argument or the CDP URL of your browser to be controlled by the Fetcher and most of the options can be enabled on it.
 4) [NSTBrowser](https://app.nstbrowser.io/r/1vO5e5)'s [docker browserless](https://hub.docker.com/r/nstbrowser/browserless) option by passing the CDP URL and enabling `nstbrowser_mode` option.
 
+> Hence using the `real_chrome` argument requires that you have chrome browser installed on your device
+
 Add that to a lot of controlling/hiding options as you will see in the arguments list below.
 
 <details><summary><strong>Expand this for the complete list of arguments</strong></summary>
@@ -360,6 +369,8 @@ Add that to a lot of controlling/hiding options as you will see in the arguments
 | hide_canvas | Add random noise to canvas operations to prevent fingerprinting. | ✔️ |
 | disable_webgl | Disables WebGL and WebGL 2.0 support entirely. | ✔️ |
 | stealth | Enables stealth mode, always check the documentation to see what stealth mode does currently. | ✔️ |
+| real_chrome | If you have chrome browser installed on your device, enable this and the Fetcher will launch an instance of your browser and use it. | ✔️ |
+| locale | Set the locale for the browser if wanted. The default value is `en-US`. | ✔️ |
 | cdp_url | Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP. | ✔️ |
 | nstbrowser_mode | Enables NSTBrowser mode, **it have to be used with `cdp_url` argument or it will get completely ignored.** | ✔️ |
 | nstbrowser_config | The config you want to send with requests to the NSTBrowser. _If left empty, Scrapling defaults to an optimized NSTBrowser's docker browserless config._ | ✔️ |
@@ -811,8 +822,7 @@ Of course, you can find elements by text/regex, find similar elements in a more
 Yes, Scrapling instances are thread-safe. Each Adaptor instance maintains its state.
 
 ## More Sponsors!
-
-<a href="https://serpapi.com/?utm_source=scrapling"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png" height="500" width="500" alt="SerpApi Banner" ></a>
+<a href="https://serpapi.com/?utm_source=scrapling"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png" height="500" alt="SerpApi Banner" ></a>
 
 
 ## Contributing
{scrapling-0.2.5 → scrapling-0.2.7}/README.md

@@ -44,10 +44,11 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha
 * [Text Extraction Speed Test (5000 nested elements).](#text-extraction-speed-test-5000-nested-elements)
 * [Extraction By Text Speed Test](#extraction-by-text-speed-test)
 * [Installation](#installation)
-* [Fetching Websites
-* [
-* [
-* [
+* [Fetching Websites](#fetching-websites)
+  * [Features](#features)
+  * [Fetcher class](#fetcher)
+  * [StealthyFetcher class](#stealthyfetcher)
+  * [PlayWrightFetcher class](#playwrightfetcher)
 * [Advanced Parsing Features](#advanced-parsing-features)
 * [Smart Navigation](#smart-navigation)
 * [Content-based Selection & Finding Similar Elements](#content-based-selection--finding-similar-elements)
@@ -210,7 +211,10 @@ playwright install chromium
 python -m browserforge update
 ```
 
-## Fetching Websites
+## Fetching Websites
+Fetchers are basically interfaces that do requests or fetch pages for you in a single request fashion then return an `Adaptor` object for you. This feature was introduced because the only option we had before was to fetch the page as you want then pass it manually to the `Adaptor` class to create an `Adaptor` instance and start playing around with the page.
+
+### Features
 You might be a little bit confused by now so let me clear things up. All fetcher-type classes are imported in the same way
 ```python
 from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
@@ -233,9 +237,11 @@ Also, the `Response` object returned from all fetchers is the same as `Adaptor`
 This class is built on top of [httpx](https://www.python-httpx.org/) with additional configuration options, here you can do `GET`, `POST`, `PUT`, and `DELETE` requests.
 
 For all methods, you have `stealth_headers` which makes `Fetcher` create and use real browser's headers then create a referer header as if this request came from Google's search of this URL's domain. It's enabled by default.
+
+You can route all traffic (HTTP and HTTPS) to a proxy for any of these methods in this format `http://username:password@localhost:8030`
 ```python
 >> page = Fetcher().get('https://httpbin.org/get', stealth_headers=True, follow_redirects=True)
->> page = Fetcher().post('https://httpbin.org/post', data={'key': 'value'})
+>> page = Fetcher().post('https://httpbin.org/post', data={'key': 'value'}, proxy='http://username:password@localhost:8030')
 >> page = Fetcher().put('https://httpbin.org/put', data={'key': 'value'})
 >> page = Fetcher().delete('https://httpbin.org/delete')
 ```
@@ -263,6 +269,7 @@ True
 | addons | List of Firefox addons to use. **Must be paths to extracted addons.** | ✔️ |
 | humanize | Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window. | ✔️ |
 | allow_webgl | Whether to allow WebGL. To prevent leaks, only use this for special cases. | ✔️ |
+| disable_ads | Enabled by default, this installs `uBlock Origin` addon on the browser if enabled. | ✔️ |
 | network_idle | Wait for the page until there are no network connections for at least 500 ms. | ✔️ |
 | timeout | The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000. | ✔️ |
 | wait_selector | Wait for a specific css selector to be in a specific state. | ✔️ |
@@ -290,9 +297,11 @@ Using this Fetcher class, you can make requests with:
 * Mimics some of the real browsers' properties by injecting several JS files and using custom options.
 * Using custom flags on launch to hide Playwright even more and make it faster.
 * Generates real browser's headers of the same type and same user OS then append it to the request's headers.
-3) Real browsers by passing the CDP URL of your browser to be controlled by the Fetcher and most of the options can be enabled on it.
+3) Real browsers by passing the `real_chrome` argument or the CDP URL of your browser to be controlled by the Fetcher and most of the options can be enabled on it.
 4) [NSTBrowser](https://app.nstbrowser.io/r/1vO5e5)'s [docker browserless](https://hub.docker.com/r/nstbrowser/browserless) option by passing the CDP URL and enabling `nstbrowser_mode` option.
 
+> Hence using the `real_chrome` argument requires that you have chrome browser installed on your device
+
 Add that to a lot of controlling/hiding options as you will see in the arguments list below.
 
 <details><summary><strong>Expand this for the complete list of arguments</strong></summary>
@@ -314,6 +323,8 @@ Add that to a lot of controlling/hiding options as you will see in the arguments
 | hide_canvas | Add random noise to canvas operations to prevent fingerprinting. | ✔️ |
 | disable_webgl | Disables WebGL and WebGL 2.0 support entirely. | ✔️ |
 | stealth | Enables stealth mode, always check the documentation to see what stealth mode does currently. | ✔️ |
+| real_chrome | If you have chrome browser installed on your device, enable this and the Fetcher will launch an instance of your browser and use it. | ✔️ |
+| locale | Set the locale for the browser if wanted. The default value is `en-US`. | ✔️ |
 | cdp_url | Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP. | ✔️ |
 | nstbrowser_mode | Enables NSTBrowser mode, **it have to be used with `cdp_url` argument or it will get completely ignored.** | ✔️ |
 | nstbrowser_config | The config you want to send with requests to the NSTBrowser. _If left empty, Scrapling defaults to an optimized NSTBrowser's docker browserless config._ | ✔️ |
@@ -765,8 +776,7 @@ Of course, you can find elements by text/regex, find similar elements in a more
 Yes, Scrapling instances are thread-safe. Each Adaptor instance maintains its state.
 
 ## More Sponsors!
-
-<a href="https://serpapi.com/?utm_source=scrapling"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png" height="500" width="500" alt="SerpApi Banner" ></a>
+<a href="https://serpapi.com/?utm_source=scrapling"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png" height="500" alt="SerpApi Banner" ></a>
 
 
 ## Contributing
{scrapling-0.2.5 → scrapling-0.2.7}/scrapling/__init__.py

@@ -4,7 +4,7 @@ from scrapling.parser import Adaptor, Adaptors
 from scrapling.core.custom_types import TextHandler, AttributesHandler
 
 __author__ = "Karim Shoair (karim.shoair@pm.me)"
-__version__ = "0.2.5"
+__version__ = "0.2.7"
 __copyright__ = "Copyright (c) 2024 Karim Shoair"
 
 
{scrapling-0.2.5 → scrapling-0.2.7}/scrapling/engines/camo.py

@@ -12,6 +12,7 @@ from scrapling.engines.toolbelt import (
     generate_convincing_referer,
 )
 
+from camoufox import DefaultAddons
 from camoufox.sync_api import Camoufox
 
 
@@ -21,7 +22,8 @@ class CamoufoxEngine:
             block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = False, network_idle: Optional[bool] = False, humanize: Optional[Union[bool, float]] = True,
             timeout: Optional[float] = 30000, page_action: Callable = do_nothing, wait_selector: Optional[str] = None, addons: Optional[List[str]] = None,
             wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
-            proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None,
+            proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True,
+            adaptor_arguments: Dict = None,
     ):
         """An engine that utilizes Camoufox library, check the `StealthyFetcher` class for more documentation.
 
@@ -36,6 +38,7 @@ class CamoufoxEngine:
         :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
         :param allow_webgl: Whether to allow WebGL. To prevent leaks, only use this for special cases.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+        :param disable_ads: Enabled by default, this installs `uBlock Origin` addon on the browser if enabled.
         :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
         :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
        :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
@@ -54,6 +57,7 @@ class CamoufoxEngine:
         self.network_idle = bool(network_idle)
         self.google_search = bool(google_search)
         self.os_randomize = bool(os_randomize)
+        self.disable_ads = bool(disable_ads)
         self.extra_headers = extra_headers or {}
         self.proxy = construct_proxy_dict(proxy)
         self.addons = addons or []
@@ -75,9 +79,11 @@ class CamoufoxEngine:
         :param url: Target url.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
+        addons = [] if self.disable_ads else [DefaultAddons.UBO]
         with Camoufox(
             proxy=self.proxy,
             addons=self.addons,
+            exclude_addons=addons,
             headless=self.headless,
             humanize=self.humanize,
             i_know_what_im_doing=True,  # To turn warnings off with the user configurations
@@ -105,6 +111,11 @@ class CamoufoxEngine:
             if self.wait_selector and type(self.wait_selector) is str:
                 waiter = page.locator(self.wait_selector)
                 waiter.first.wait_for(state=self.wait_selector_state)
+                # Wait again after waiting for the selector, helpful with protections like Cloudflare
+                page.wait_for_load_state(state="load")
+                page.wait_for_load_state(state="domcontentloaded")
+                if self.network_idle:
+                    page.wait_for_load_state('networkidle')
 
             # This will be parsed inside `Response`
             encoding = res.headers.get('content-type', '') or 'utf-8'  # default encoding
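Note on the `disable_ads` hunks above: the option works by exclusion, since Camoufox bundles uBlock Origin as a default addon and the engine only asks Camoufox to exclude it when the flag is turned off. A minimal standalone sketch of the same logic, assuming camoufox>=0.3.10 as pinned above (the target URL is illustrative):

```python
from camoufox import DefaultAddons
from camoufox.sync_api import Camoufox

disable_ads = True  # the 0.2.7 default
# Excluding nothing keeps the bundled uBlock Origin installed; excluding
# DefaultAddons.UBO strips it — mirroring `exclude_addons=addons` above.
exclude = [] if disable_ads else [DefaultAddons.UBO]

with Camoufox(headless=True, exclude_addons=exclude, i_know_what_im_doing=True) as browser:
    page = browser.new_page()
    page.goto('https://httpbin.org/html')  # illustrative target
    print(page.title())
```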
{scrapling-0.2.5 → scrapling-0.2.7}/scrapling/engines/constants.py

@@ -44,7 +44,7 @@ DEFAULT_STEALTH_FLAGS = [
     '--disable-default-apps',
     '--disable-print-preview',
     '--disable-dev-shm-usage',
-    '--disable-popup-blocking',
+    # '--disable-popup-blocking',
     '--metrics-recording-only',
     '--disable-crash-reporter',
     '--disable-partial-raster',
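The flag is commented out rather than deleted because 0.2.7 moves the popup-blocking concern into the new `harmful_default_args` list in `pw.py` below: instead of passing the switch, the engine now strips it (and `--enable-automation`) from Chromium's defaults to reduce detection and avoid the popup-crash abuse referenced there. A sketch of that mechanism with plain Playwright, assuming these switches appear among Playwright's default Chromium arguments as the `pw.py` comment implies (URL illustrative):

```python
from playwright.sync_api import sync_playwright

# Same idea as harmful_default_args in pw.py: don't add the flag yourself;
# tell Playwright NOT to pass these defaults, so Chromium keeps its popup
# blocker and drops the automation banner switch.
harmful_default_args = ['--enable-automation', '--disable-popup-blocking']

with sync_playwright() as p:
    browser = p.chromium.launch(headless=True, ignore_default_args=harmful_default_args)
    page = browser.new_page()
    page.goto('https://httpbin.org/get')  # illustrative target
    browser.close()
```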
{scrapling-0.2.5 → scrapling-0.2.7}/scrapling/engines/pw.py

@@ -26,12 +26,14 @@ class PlaywrightEngine:
             timeout: Optional[float] = 30000,
             page_action: Callable = do_nothing,
             wait_selector: Optional[str] = None,
+            locale: Optional[str] = 'en-US',
             wait_selector_state: Optional[str] = 'attached',
-            stealth: bool = False,
-            hide_canvas: bool = True,
-            disable_webgl: bool = False,
+            stealth: Optional[bool] = False,
+            real_chrome: Optional[bool] = False,
+            hide_canvas: Optional[bool] = False,
+            disable_webgl: Optional[bool] = False,
             cdp_url: Optional[str] = None,
-            nstbrowser_mode: bool = False,
+            nstbrowser_mode: Optional[bool] = False,
             nstbrowser_config: Optional[Dict] = None,
             google_search: Optional[bool] = True,
             extra_headers: Optional[Dict[str, str]] = None,
@@ -49,8 +51,10 @@ class PlaywrightEngine:
         :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
         :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
         :param wait_selector: Wait for a specific css selector to be in a specific state.
+        :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
         :param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`.
         :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
+        :param real_chrome: If you have chrome browser installed on your device, enable this and the Fetcher will launch an instance of your browser and use it.
         :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
         :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
         :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
@@ -62,11 +66,13 @@ class PlaywrightEngine:
         :param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
         """
         self.headless = headless
+        self.locale = check_type_validity(locale, [str], 'en-US', param_name='locale')
         self.disable_resources = disable_resources
         self.network_idle = bool(network_idle)
         self.stealth = bool(stealth)
         self.hide_canvas = bool(hide_canvas)
         self.disable_webgl = bool(disable_webgl)
+        self.real_chrome = bool(real_chrome)
         self.google_search = bool(google_search)
         self.extra_headers = extra_headers or {}
         self.proxy = construct_proxy_dict(proxy)
@@ -84,6 +90,14 @@ class PlaywrightEngine:
         self.nstbrowser_mode = bool(nstbrowser_mode)
         self.nstbrowser_config = nstbrowser_config
         self.adaptor_arguments = adaptor_arguments if adaptor_arguments else {}
+        self.harmful_default_args = [
+            # This will be ignored to avoid detection more and possibly avoid the popup crashing bug abuse: https://issues.chromium.org/issues/340836884
+            '--enable-automation',
+            '--disable-popup-blocking',
+            # '--disable-component-update',
+            # '--disable-default-apps',
+            # '--disable-extensions',
+        ]
 
     def _cdp_url_logic(self, flags: Optional[List] = None) -> str:
         """Constructs new CDP URL if NSTBrowser is enabled otherwise return CDP URL as it is
@@ -119,7 +133,8 @@ class PlaywrightEngine:
         :param url: Target url.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-        if not self.stealth:
+        if not self.stealth or self.real_chrome:
+            # Because rebrowser_playwright doesn't play well with real browsers
             from playwright.sync_api import sync_playwright
         else:
             from rebrowser_playwright.sync_api import sync_playwright
@@ -130,8 +145,8 @@ class PlaywrightEngine:
             extra_headers = {}
             useragent = self.useragent
         else:
-            extra_headers =
-            useragent =
+            extra_headers = {}
+            useragent = generate_headers(browser_mode=True).get('User-Agent')
 
         # Prepare the flags before diving
         flags = DEFAULT_STEALTH_FLAGS
@@ -146,14 +161,16 @@ class PlaywrightEngine:
                 browser = p.chromium.connect_over_cdp(endpoint_url=cdp_url)
             else:
                 if self.stealth:
-                    browser = p.chromium.launch(
+                    browser = p.chromium.launch(
+                        headless=self.headless, args=flags, ignore_default_args=self.harmful_default_args, chromium_sandbox=True, channel='chrome' if self.real_chrome else 'chromium'
+                    )
                 else:
-                    browser = p.chromium.launch(headless=self.headless, ignore_default_args=
+                    browser = p.chromium.launch(headless=self.headless, ignore_default_args=self.harmful_default_args, channel='chrome' if self.real_chrome else 'chromium')
 
             # Creating the context
             if self.stealth:
                 context = browser.new_context(
-                    locale=
+                    locale=self.locale,
                     is_mobile=False,
                     has_touch=False,
                     proxy=self.proxy,
@@ -170,6 +187,8 @@ class PlaywrightEngine:
                 )
             else:
                 context = browser.new_context(
+                    locale=self.locale,
+                    proxy=self.proxy,
                     color_scheme='dark',
                     user_agent=useragent,
                     device_scale_factor=2,
@@ -215,6 +234,11 @@ class PlaywrightEngine:
             if self.wait_selector and type(self.wait_selector) is str:
                 waiter = page.locator(self.wait_selector)
                 waiter.first.wait_for(state=self.wait_selector_state)
+                # Wait again after waiting for the selector, helpful with protections like Cloudflare
+                page.wait_for_load_state(state="load")
+                page.wait_for_load_state(state="domcontentloaded")
+                if self.network_idle:
+                    page.wait_for_load_state('networkidle')
 
             # This will be parsed inside `Response`
             encoding = res.headers.get('content-type', '') or 'utf-8'  # default encoding
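From the user side, the two new engine options above surface through `PlayWrightFetcher`. A usage sketch (URL illustrative; `real_chrome=True` needs a locally installed Chrome, since the engine launches with `channel='chrome'`):

```python
from scrapling import PlayWrightFetcher

page = PlayWrightFetcher().fetch(
    'https://httpbin.org/headers',  # illustrative target
    real_chrome=True,  # drive your installed Chrome instead of the bundled Chromium
    locale='en-GB',    # forwarded to browser.new_context(locale=...); default is 'en-US'
)
print(page.status)
```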
{scrapling-0.2.5 → scrapling-0.2.7}/scrapling/engines/static.py

@@ -63,54 +63,66 @@ class StaticEngine:
             **self.adaptor_arguments
         )
 
-    def get(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+    def get(self, url: str, proxy: Optional[str] = None, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
         """Make basic HTTP GET request for you but with some added flavors.
 
         :param url: Target url.
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
             create a referer header as if this request had came from Google's search of this URL's domain.
+        :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
         headers = self._headers_job(kwargs.pop('headers', {}), url, stealthy_headers)
-        request = httpx.get(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+        with httpx.Client(proxy=proxy) as client:
+            request = client.get(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+
         return self._prepare_response(request)
 
-    def post(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+    def post(self, url: str, proxy: Optional[str] = None, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
         """Make basic HTTP POST request for you but with some added flavors.
 
         :param url: Target url.
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
             create a referer header as if this request had came from Google's search of this URL's domain.
+        :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
         headers = self._headers_job(kwargs.pop('headers', {}), url, stealthy_headers)
-        request = httpx.post(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+        with httpx.Client(proxy=proxy) as client:
+            request = client.post(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+
         return self._prepare_response(request)
 
-    def delete(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+    def delete(self, url: str, proxy: Optional[str] = None, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
         """Make basic HTTP DELETE request for you but with some added flavors.
 
         :param url: Target url.
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
             create a referer header as if this request had came from Google's search of this URL's domain.
+        :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
         headers = self._headers_job(kwargs.pop('headers', {}), url, stealthy_headers)
-        request = httpx.delete(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+        with httpx.Client(proxy=proxy) as client:
+            request = client.delete(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+
         return self._prepare_response(request)
 
-    def put(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+    def put(self, url: str, proxy: Optional[str] = None, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
         """Make basic HTTP PUT request for you but with some added flavors.
 
         :param url: Target url.
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
             create a referer header as if this request had came from Google's search of this URL's domain.
+        :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
         headers = self._headers_job(kwargs.pop('headers', {}), url, stealthy_headers)
-        request = httpx.put(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+        with httpx.Client(proxy=proxy) as client:
+            request = client.put(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+
        return self._prepare_response(request)
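Each verb now spins up a short-lived `httpx.Client` so the proxy applies to both HTTP and HTTPS traffic, while `proxy=None` keeps the old direct behavior. The equivalent bare-httpx call for reference (credentials illustrative; assumes an httpx version that accepts the singular `proxy` keyword, as the code above does):

```python
import httpx

proxy = 'http://username:password@localhost:8030'  # illustrative credentials

# What StaticEngine.get() now does under the hood
with httpx.Client(proxy=proxy) as client:
    response = client.get('https://httpbin.org/ip', follow_redirects=True, timeout=10)
print(response.status_code)
```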
{scrapling-0.2.5 → scrapling-0.2.7}/scrapling/engines/toolbelt/custom.py

@@ -39,7 +39,7 @@ class ResponseEncoding:
 
     @classmethod
     @cache(maxsize=None)
-    def get_value(cls, content_type: Optional[str]) -> str:
+    def get_value(cls, content_type: Optional[str], text: Optional[str] = 'test') -> str:
         """Determine the appropriate character encoding from a content-type header.
 
         The encoding is determined by these rules in order:
@@ -50,26 +50,30 @@ class ResponseEncoding:
         5. Default to UTF-8 if nothing else matches
 
         :param content_type: Content-Type header value or None
+        :param text: A text to test the encoding on it
         :return: String naming the character encoding
         """
         if not content_type:
             return cls.__DEFAULT_ENCODING
 
         try:
+            encoding = None
             content_type, params = cls.__parse_content_type(content_type)
 
             # First check for explicit charset parameter
             if "charset" in params:
                 encoding = params["charset"].strip("'\"")
-                "test".encode(encoding)  # Validate encoding
-                return encoding
 
             # Apply content-type specific rules
-            if content_type in cls.__ISO_8859_1_CONTENT_TYPES:
-                return "ISO-8859-1"
+            elif content_type in cls.__ISO_8859_1_CONTENT_TYPES:
+                encoding = "ISO-8859-1"
+
+            elif content_type == "application/json":
+                encoding = cls.__DEFAULT_ENCODING
 
-            if content_type == "application/json":
-                return cls.__DEFAULT_ENCODING
+            if encoding:
+                _ = text.encode(encoding)  # Validate encoding and validate it can encode the given text
+                return encoding
 
             return cls.__DEFAULT_ENCODING
 
@@ -87,7 +91,7 @@ class Response(Adaptor):
         self.cookies = cookies
         self.headers = headers
         self.request_headers = request_headers
-        encoding = ResponseEncoding.get_value(encoding)
+        encoding = ResponseEncoding.get_value(encoding, text)
         super().__init__(text=text, body=body, url=automatch_domain or url, encoding=encoding, **adaptor_arguments)
         # For back-ward compatibility
         self.adaptor = self
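The refactor defers charset validation until a candidate encoding is chosen, and validates it against the actual response text rather than the literal `"test"`, so a declared charset that cannot represent the body now falls through to UTF-8 instead of being returned blindly. A standalone sketch of the decision flow (the helper name and the content-type set are illustrative stand-ins for the class's private members):

```python
from typing import Optional

ISO_8859_1_CONTENT_TYPES = {"text/plain", "text/html"}  # illustrative subset
DEFAULT_ENCODING = "utf-8"

def pick_encoding(content_type: Optional[str], text: str = "test") -> str:
    """Mirror of the 0.2.7 flow: pick a candidate, then prove it can encode
    the given text before trusting it; otherwise fall back to UTF-8."""
    if not content_type:
        return DEFAULT_ENCODING
    try:
        encoding = None
        mime, _, charset = content_type.partition("charset=")
        mime = mime.strip().rstrip(";").strip()
        if charset:
            encoding = charset.strip("'\" ")
        elif mime in ISO_8859_1_CONTENT_TYPES:
            encoding = "ISO-8859-1"
        elif mime == "application/json":
            encoding = DEFAULT_ENCODING
        if encoding:
            _ = text.encode(encoding)  # raises if the body can't be represented
            return encoding
        return DEFAULT_ENCODING
    except (UnicodeEncodeError, LookupError):
        return DEFAULT_ENCODING

print(pick_encoding("text/html; charset=ascii", "café"))  # -> utf-8 (fallback)
print(pick_encoding("text/html; charset=utf-8", "café"))  # -> utf-8 (declared)
```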
{scrapling-0.2.5 → scrapling-0.2.7}/scrapling/engines/toolbelt/fingerprints.py

@@ -67,7 +67,7 @@ def generate_headers(browser_mode: bool = False) -> Dict:
     # So we don't raise any inconsistency red flags while websites fingerprinting us
     os_name = get_os_name()
     return HeaderGenerator(
-        browser=[Browser(name='chrome', min_version=
+        browser=[Browser(name='chrome', min_version=130)],
         os=os_name,  # None is ignored
         device='desktop'
     ).generate()
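The `min_version` bump keeps generated Chrome headers at major version 130 or newer, which lines up with the Chromium shipped by the `playwright==1.48` pin above. A standalone browserforge call for comparison (the `os`/`device` values are illustrative stand-ins for `get_os_name()`):

```python
from browserforge.headers import Browser, HeaderGenerator

headers = HeaderGenerator(
    browser=[Browser(name='chrome', min_version=130)],
    os='windows',     # fingerprints.py derives this from the real OS
    device='desktop',
).generate()
print(headers.get('User-Agent'))
```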
{scrapling-0.2.5 → scrapling-0.2.7}/scrapling/fetchers.py

@@ -9,7 +9,7 @@ class Fetcher(BaseFetcher):
 
     Any additional keyword arguments passed to the methods below are passed to the respective httpx's method directly.
     """
-    def get(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+    def get(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, proxy: Optional[str] = None, **kwargs: Dict) -> Response:
         """Make basic HTTP GET request for you but with some added flavors.
 
         :param url: Target url.
@@ -17,13 +17,14 @@ class Fetcher(BaseFetcher):
         :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
             create a referer header as if this request had came from Google's search of this URL's domain.
+        :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-        response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).get(url, stealthy_headers, **kwargs)
+        response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).get(url, proxy, stealthy_headers, **kwargs)
         return response_object
 
-    def post(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+    def post(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, proxy: Optional[str] = None, **kwargs: Dict) -> Response:
         """Make basic HTTP POST request for you but with some added flavors.
 
         :param url: Target url.
@@ -31,13 +32,14 @@ class Fetcher(BaseFetcher):
         :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
             create a referer header as if this request came from Google's search of this URL's domain.
+        :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-        response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).post(url, stealthy_headers, **kwargs)
+        response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).post(url, proxy, stealthy_headers, **kwargs)
         return response_object
 
-    def put(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+    def put(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, proxy: Optional[str] = None, **kwargs: Dict) -> Response:
         """Make basic HTTP PUT request for you but with some added flavors.
 
         :param url: Target url
@@ -45,14 +47,15 @@ class Fetcher(BaseFetcher):
         :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
             create a referer header as if this request came from Google's search of this URL's domain.
+        :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
 
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-        response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).put(url, stealthy_headers, **kwargs)
+        response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).put(url, proxy, stealthy_headers, **kwargs)
         return response_object
 
-    def delete(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+    def delete(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, proxy: Optional[str] = None, **kwargs: Dict) -> Response:
         """Make basic HTTP DELETE request for you but with some added flavors.
 
         :param url: Target url
@@ -60,10 +63,11 @@ class Fetcher(BaseFetcher):
         :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
             create a referer header as if this request came from Google's search of this URL's domain.
+        :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-        response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).delete(url, stealthy_headers, **kwargs)
+        response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).delete(url, proxy, stealthy_headers, **kwargs)
         return response_object
 
 
@@ -78,7 +82,7 @@ class StealthyFetcher(BaseFetcher):
             block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = False, network_idle: Optional[bool] = False, addons: Optional[List[str]] = None,
             timeout: Optional[float] = 30000, page_action: Callable = do_nothing, wait_selector: Optional[str] = None, humanize: Optional[Union[bool, float]] = True,
             wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None, proxy: Optional[Union[str, Dict[str, str]]] = None,
-            os_randomize: Optional[bool] = None
+            os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True,
     ) -> Response:
         """
         Opens up a browser and do your request based on your chosen options below.
@@ -92,6 +96,7 @@ class StealthyFetcher(BaseFetcher):
             This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
         :param block_webrtc: Blocks WebRTC entirely.
         :param addons: List of Firefox addons to use. Must be paths to extracted addons.
+        :param disable_ads: Enabled by default, this installs `uBlock Origin` addon on the browser if enabled.
         :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
         :param allow_webgl: Whether to allow WebGL. To prevent leaks, only use this for special cases.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
@@ -111,6 +116,7 @@ class StealthyFetcher(BaseFetcher):
             timeout=timeout,
             headless=headless,
             humanize=humanize,
+            disable_ads=disable_ads,
             allow_webgl=allow_webgl,
             page_action=page_action,
             network_idle=network_idle,
@@ -138,7 +144,7 @@ class PlayWrightFetcher(BaseFetcher):
     2) Mimics some of the real browsers' properties by injecting several JS files and using custom options.
     3) Using custom flags on launch to hide Playwright even more and make it faster.
     4) Generates real browser's headers of the same type and same user OS then append it to the request.
-    - Real browsers by passing the CDP URL of your browser to be controlled by the Fetcher and most of the options can be enabled on it.
+    - Real browsers by passing the `real_chrome` argument or the CDP URL of your browser to be controlled by the Fetcher and most of the options can be enabled on it.
     - NSTBrowser's docker browserless option by passing the CDP URL and enabling `nstbrowser_mode` option.
 
     > Note that these are the main options with PlayWright but it can be mixed together.
@@ -146,12 +152,12 @@ class PlayWrightFetcher(BaseFetcher):
     def fetch(
             self, url: str, headless: Union[bool, str] = True, disable_resources: bool = None,
             useragent: Optional[str] = None, network_idle: Optional[bool] = False, timeout: Optional[float] = 30000,
-            page_action: Callable = do_nothing, wait_selector: Optional[str] = None, wait_selector_state: Optional[str] = 'attached',
-            hide_canvas: bool =
-            proxy: Optional[Union[str, Dict[str, str]]] = None,
-            stealth: bool = False,
+            page_action: Optional[Callable] = do_nothing, wait_selector: Optional[str] = None, wait_selector_state: Optional[str] = 'attached',
+            hide_canvas: Optional[bool] = False, disable_webgl: Optional[bool] = False, extra_headers: Optional[Dict[str, str]] = None, google_search: Optional[bool] = True,
+            proxy: Optional[Union[str, Dict[str, str]]] = None, locale: Optional[str] = 'en-US',
+            stealth: Optional[bool] = False, real_chrome: Optional[bool] = False,
             cdp_url: Optional[str] = None,
-            nstbrowser_mode: bool = False, nstbrowser_config: Optional[Dict] = None,
+            nstbrowser_mode: Optional[bool] = False, nstbrowser_config: Optional[Dict] = None,
     ) -> Response:
         """Opens up a browser and do your request based on your chosen options below.
 
@@ -163,10 +169,12 @@ class PlayWrightFetcher(BaseFetcher):
         :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
         :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
+        :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
         :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
         :param wait_selector: Wait for a specific css selector to be in a specific state.
         :param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`.
         :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
+        :param real_chrome: If you have chrome browser installed on your device, enable this and the Fetcher will launch an instance of your browser and use it.
         :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
         :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
         :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
@@ -179,11 +187,13 @@ class PlayWrightFetcher(BaseFetcher):
         """
         engine = PlaywrightEngine(
             proxy=proxy,
+            locale=locale,
             timeout=timeout,
             stealth=stealth,
             cdp_url=cdp_url,
             headless=headless,
             useragent=useragent,
+            real_chrome=real_chrome,
             page_action=page_action,
             hide_canvas=hide_canvas,
             network_idle=network_idle,
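Taken together, the user-facing surface of this release: `proxy=` on every `Fetcher` verb, `disable_ads=` on `StealthyFetcher`, and `real_chrome=`/`locale=` on `PlayWrightFetcher`. A combined usage sketch (URLs and proxy credentials are illustrative):

```python
from scrapling import Fetcher, StealthyFetcher

proxy = 'http://username:password@localhost:8030'  # illustrative credentials

# The plain HTTP verbs now take the proxy directly
page = Fetcher().get('https://httpbin.org/ip', proxy=proxy)
print(page.status)

# Camoufox-based fetching keeps uBlock Origin installed by default
page = StealthyFetcher().fetch(
    'https://httpbin.org/headers',  # illustrative target
    disable_ads=True,
    os_randomize=True,
)
print(page.status)
```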
{scrapling-0.2.5 → scrapling-0.2.7/scrapling.egg-info}/PKG-INFO

Same hunks as the PKG-INFO diff at the top of this page — the sdist carries identical copies of this file at the package root and inside scrapling.egg-info.
{scrapling-0.2.5 → scrapling-0.2.7}/setup.py

@@ -6,7 +6,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
 
 setup(
     name="scrapling",
-    version="0.2.5",
+    version="0.2.7",
     description="""Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
     simplifies the process of extracting data from websites, even when they undergo structural changes, and offers
     impressive speed improvements over many popular scraping tools.""",
@@ -55,7 +55,7 @@ setup(
         "orjson>=3",
         "tldextract",
         'httpx[brotli,zstd]',
-        'playwright',
+        'playwright==1.48',  # Temporary because currently All libraries that provide CDP patches doesn't support playwright 1.49 yet
         'rebrowser-playwright',
         'camoufox>=0.3.10',
         'browserforge',