scrapling 0.2.6.tar.gz → 0.2.8.tar.gz
- {scrapling-0.2.6/scrapling.egg-info → scrapling-0.2.8}/PKG-INFO +28 -21
- {scrapling-0.2.6 → scrapling-0.2.8}/README.md +25 -18
- {scrapling-0.2.6 → scrapling-0.2.8}/scrapling/__init__.py +4 -3
- {scrapling-0.2.6 → scrapling-0.2.8}/scrapling/core/_types.py +2 -3
- {scrapling-0.2.6 → scrapling-0.2.8}/scrapling/core/custom_types.py +5 -5
- {scrapling-0.2.6 → scrapling-0.2.8}/scrapling/core/translator.py +5 -6
- {scrapling-0.2.6 → scrapling-0.2.8}/scrapling/core/utils.py +15 -12
- {scrapling-0.2.6 → scrapling-0.2.8}/scrapling/defaults.py +1 -1
- {scrapling-0.2.6 → scrapling-0.2.8}/scrapling/engines/camo.py +20 -13
- {scrapling-0.2.6 → scrapling-0.2.8}/scrapling/engines/constants.py +1 -1
- {scrapling-0.2.6 → scrapling-0.2.8}/scrapling/engines/pw.py +31 -18
- {scrapling-0.2.6 → scrapling-0.2.8}/scrapling/engines/static.py +24 -11
- scrapling-0.2.8/scrapling/engines/toolbelt/__init__.py +6 -0
- {scrapling-0.2.6 → scrapling-0.2.8}/scrapling/engines/toolbelt/custom.py +15 -10
- {scrapling-0.2.6 → scrapling-0.2.8}/scrapling/engines/toolbelt/fingerprints.py +5 -5
- {scrapling-0.2.6 → scrapling-0.2.8}/scrapling/engines/toolbelt/navigation.py +6 -6
- {scrapling-0.2.6 → scrapling-0.2.8}/scrapling/fetchers.py +23 -14
- {scrapling-0.2.6 → scrapling-0.2.8}/scrapling/parser.py +15 -8
- {scrapling-0.2.6 → scrapling-0.2.8/scrapling.egg-info}/PKG-INFO +28 -21
- {scrapling-0.2.6 → scrapling-0.2.8}/scrapling.egg-info/requires.txt +1 -1
- {scrapling-0.2.6 → scrapling-0.2.8}/setup.cfg +1 -1
- {scrapling-0.2.6 → scrapling-0.2.8}/setup.py +6 -6
- {scrapling-0.2.6 → scrapling-0.2.8}/tests/fetchers/test_camoufox.py +1 -0
- {scrapling-0.2.6 → scrapling-0.2.8}/tests/fetchers/test_httpx.py +1 -0
- {scrapling-0.2.6 → scrapling-0.2.8}/tests/fetchers/test_playwright.py +1 -0
- {scrapling-0.2.6 → scrapling-0.2.8}/tests/parser/test_general.py +3 -1
- scrapling-0.2.6/scrapling/engines/toolbelt/__init__.py +0 -20
- {scrapling-0.2.6 → scrapling-0.2.8}/LICENSE +0 -0
- {scrapling-0.2.6 → scrapling-0.2.8}/MANIFEST.in +0 -0
- {scrapling-0.2.6 → scrapling-0.2.8}/scrapling/core/__init__.py +0 -0
- {scrapling-0.2.6 → scrapling-0.2.8}/scrapling/core/mixins.py +0 -0
- {scrapling-0.2.6 → scrapling-0.2.8}/scrapling/core/storage_adaptors.py +6 -6
- {scrapling-0.2.6 → scrapling-0.2.8}/scrapling/engines/__init__.py +2 -2
- {scrapling-0.2.6 → scrapling-0.2.8}/scrapling/engines/toolbelt/bypasses/navigator_plugins.js +0 -0
- {scrapling-0.2.6 → scrapling-0.2.8}/scrapling/engines/toolbelt/bypasses/notification_permission.js +0 -0
- {scrapling-0.2.6 → scrapling-0.2.8}/scrapling/engines/toolbelt/bypasses/pdf_viewer.js +0 -0
- {scrapling-0.2.6 → scrapling-0.2.8}/scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +0 -0
- {scrapling-0.2.6 → scrapling-0.2.8}/scrapling/engines/toolbelt/bypasses/screen_props.js +0 -0
- {scrapling-0.2.6 → scrapling-0.2.8}/scrapling/engines/toolbelt/bypasses/webdriver_fully.js +0 -0
- {scrapling-0.2.6 → scrapling-0.2.8}/scrapling/engines/toolbelt/bypasses/window_chrome.js +0 -0
- {scrapling-0.2.6 → scrapling-0.2.8}/scrapling/py.typed +0 -0
- {scrapling-0.2.6 → scrapling-0.2.8}/scrapling.egg-info/SOURCES.txt +0 -0
- {scrapling-0.2.6 → scrapling-0.2.8}/scrapling.egg-info/dependency_links.txt +0 -0
- {scrapling-0.2.6 → scrapling-0.2.8}/scrapling.egg-info/not-zip-safe +0 -0
- {scrapling-0.2.6 → scrapling-0.2.8}/scrapling.egg-info/top_level.txt +0 -0
- {scrapling-0.2.6 → scrapling-0.2.8}/tests/__init__.py +0 -0
- {scrapling-0.2.6 → scrapling-0.2.8}/tests/fetchers/__init__.py +0 -0
- {scrapling-0.2.6 → scrapling-0.2.8}/tests/fetchers/test_utils.py +0 -0
- {scrapling-0.2.6 → scrapling-0.2.8}/tests/parser/__init__.py +0 -0
- {scrapling-0.2.6 → scrapling-0.2.8}/tests/parser/test_automatch.py +0 -0
{scrapling-0.2.6/scrapling.egg-info → scrapling-0.2.8}/PKG-INFO
@@ -1,7 +1,7 @@
 Metadata-Version: 2.1
 Name: scrapling
-Version: 0.2.6
-Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
+Version: 0.2.8
+Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
 Home-page: https://github.com/D4Vinci/Scrapling
 Author: Karim Shoair
 Author-email: karim.shoair@pm.me
@@ -41,7 +41,7 @@ Requires-Dist: tldextract
 Requires-Dist: httpx[brotli,zstd]
 Requires-Dist: playwright==1.48
 Requires-Dist: rebrowser-playwright
-Requires-Dist: camoufox>=0.
+Requires-Dist: camoufox>=0.4.4
 Requires-Dist: browserforge
 
 # 🕷️ Scrapling: Undetectable, Lightning-Fast, and Adaptive Web Scraping for Python
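A quick post-upgrade sanity check of the pins above (a sketch; it assumes scrapling 0.2.8 and its dependencies are already installed in the current environment):

```python
from importlib.metadata import version

print(version('scrapling'))   # expected: 0.2.8
print(version('playwright'))  # pinned to 1.48 by this release
print(version('camoufox'))    # the floor is now >=0.4.4
```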
@@ -52,7 +52,7 @@ Dealing with failing web scrapers due to anti-bot protections or website changes
 Scrapling is a high-performance, intelligent web scraping library for Python that automatically adapts to website changes while significantly outperforming popular alternatives. For both beginners and experts, Scrapling provides powerful features while maintaining simplicity.
 
 ```python
->> from scrapling.
+>> from scrapling.defaults import Fetcher, StealthyFetcher, PlayWrightFetcher
 # Fetch websites' source under the radar!
 >> page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True)
 >> print(page.status)
@@ -90,10 +90,11 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha
 * [Text Extraction Speed Test (5000 nested elements).](#text-extraction-speed-test-5000-nested-elements)
 * [Extraction By Text Speed Test](#extraction-by-text-speed-test)
 * [Installation](#installation)
-* [Fetching Websites
-* [
-* [
-* [
+* [Fetching Websites](#fetching-websites)
+  * [Features](#features)
+  * [Fetcher class](#fetcher)
+  * [StealthyFetcher class](#stealthyfetcher)
+  * [PlayWrightFetcher class](#playwrightfetcher)
 * [Advanced Parsing Features](#advanced-parsing-features)
   * [Smart Navigation](#smart-navigation)
   * [Content-based Selection & Finding Similar Elements](#content-based-selection--finding-similar-elements)
@@ -256,43 +257,48 @@ playwright install chromium
 python -m browserforge update
 ```
 
-## Fetching Websites
-
+## Fetching Websites
+Fetchers are basically interfaces that do requests or fetch pages for you in a single request fashion and then return an `Adaptor` object for you. This feature was introduced because the only option we had before was to fetch the page as you wanted it, then pass it manually to the `Adaptor` class to create an `Adaptor` instance and start playing around with the page.
+
+### Features
+You might be slightly confused by now so let me clear things up. All fetcher-type classes are imported in the same way
 ```python
 from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
 ```
-
+All of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `storage`, `storage_args`, and `debug`, which are the same ones you give to the `Adaptor` class.
 
 If you don't want to pass arguments to the generated `Adaptor` object and want to use the default values, you can use this import instead for cleaner code:
 ```python
-from scrapling.
+from scrapling.defaults import Fetcher, StealthyFetcher, PlayWrightFetcher
 ```
 then use it right away without initializing like:
 ```python
 page = StealthyFetcher.fetch('https://example.com')
 ```
 
-Also, the `Response` object returned from all fetchers is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`. All `cookies`, `headers`, and `request_headers` are always of type `dictionary`.
+Also, the `Response` object returned from all fetchers is the same as the `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`. All `cookies`, `headers`, and `request_headers` are always of type `dictionary`.
 > [!NOTE]
 > The `auto_match` argument is enabled by default which is the one you should care about the most as you will see later.
 ### Fetcher
 This class is built on top of [httpx](https://www.python-httpx.org/) with additional configuration options, here you can do `GET`, `POST`, `PUT`, and `DELETE` requests.
 
 For all methods, you have `stealth_headers` which makes `Fetcher` create and use real browser's headers then create a referer header as if this request came from Google's search of this URL's domain. It's enabled by default.
+
+You can route all traffic (HTTP and HTTPS) to a proxy for any of these methods in this format `http://username:password@localhost:8030`
 ```python
 >> page = Fetcher().get('https://httpbin.org/get', stealth_headers=True, follow_redirects=True)
->> page = Fetcher().post('https://httpbin.org/post', data={'key': 'value'})
+>> page = Fetcher().post('https://httpbin.org/post', data={'key': 'value'}, proxy='http://username:password@localhost:8030')
 >> page = Fetcher().put('https://httpbin.org/put', data={'key': 'value'})
 >> page = Fetcher().delete('https://httpbin.org/delete')
 ```
 ### StealthyFetcher
-This class is built on top of [Camoufox](https://github.com/daijro/camoufox)
+This class is built on top of [Camoufox](https://github.com/daijro/camoufox), bypassing most anti-bot protections by default. Scrapling adds extra layers of flavors and configurations to increase performance and undetectability even further.
 ```python
 >> page = StealthyFetcher().fetch('https://www.browserscan.net/bot-detection')  # Running headless by default
 >> page.status == 200
 True
 ```
-> Note: all requests done by this fetcher
+> Note: all requests done by this fetcher are waiting by default for all JS to be fully loaded and executed so you don't have to :)
 
 <details><summary><strong>For the sake of simplicity, expand this for the complete list of arguments</strong></summary>
 
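To illustrate the new `proxy` argument together with the `Response` attributes described above, a minimal sketch (the proxy URL and credentials are placeholders):

```python
from scrapling import Fetcher

# Placeholder proxy; GET, POST, PUT, and DELETE all accept the same argument
page = Fetcher().get('https://httpbin.org/get', proxy='http://username:password@localhost:8030')

print(page.status, page.reason)    # e.g. 200 OK
print(page.cookies, page.headers)  # always plain dictionaries
print(page.request_headers)        # the headers Scrapling actually sent
```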
@@ -309,6 +315,7 @@ True
 | addons | List of Firefox addons to use. **Must be paths to extracted addons.** | ✔️ |
 | humanize | Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window. | ✔️ |
 | allow_webgl | Whether to allow WebGL. To prevent leaks, only use this for special cases. | ✔️ |
+| disable_ads | Enabled by default, this installs `uBlock Origin` addon on the browser if enabled. | ✔️ |
 | network_idle | Wait for the page until there are no network connections for at least 500 ms. | ✔️ |
 | timeout | The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000. | ✔️ |
 | wait_selector | Wait for a specific css selector to be in a specific state. | ✔️ |
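A small sketch of the new `disable_ads` toggle from the row above; it defaults to True (uBlock Origin installed), so passing False opts out:

```python
from scrapling import StealthyFetcher

# disable_ads=True is the default; False skips installing uBlock Origin
page = StealthyFetcher().fetch('https://example.com', disable_ads=False)
print(page.status)
```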
@@ -327,7 +334,7 @@ This class is built on top of [Playwright](https://playwright.dev/python/) which
 >> page.css_first("#search a::attr(href)")
 'https://github.com/D4Vinci/Scrapling'
 ```
-> Note: all requests done by this fetcher
+> Note: all requests done by this fetcher are waiting by default for all JS to be fully loaded and executed so you don't have to :)
 
 Using this Fetcher class, you can make requests with:
 1) Vanilla Playwright without any modifications other than the ones you chose.
@@ -339,7 +346,7 @@ Using this Fetcher class, you can make requests with:
 3) Real browsers by passing the `real_chrome` argument or the CDP URL of your browser to be controlled by the Fetcher and most of the options can be enabled on it.
 4) [NSTBrowser](https://app.nstbrowser.io/r/1vO5e5)'s [docker browserless](https://hub.docker.com/r/nstbrowser/browserless) option by passing the CDP URL and enabling `nstbrowser_mode` option.
 
-> Hence using the `real_chrome` argument requires that you have
+> Hence using the `real_chrome` argument requires that you have Chrome browser installed on your device
 
 Add that to a lot of controlling/hiding options as you will see in the arguments list below.
 
@@ -362,7 +369,8 @@ Add that to a lot of controlling/hiding options as you will see in the arguments
 | hide_canvas | Add random noise to canvas operations to prevent fingerprinting. | ✔️ |
 | disable_webgl | Disables WebGL and WebGL 2.0 support entirely. | ✔️ |
 | stealth | Enables stealth mode, always check the documentation to see what stealth mode does currently. | ✔️ |
-| real_chrome | If you have
+| real_chrome | If you have Chrome browser installed on your device, enable this and the Fetcher will launch an instance of your browser and use it. | ✔️ |
+| locale | Set the locale for the browser if wanted. The default value is `en-US`. | ✔️ |
 | cdp_url | Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP. | ✔️ |
 | nstbrowser_mode | Enables NSTBrowser mode, **it have to be used with `cdp_url` argument or it will get completely ignored.** | ✔️ |
 | nstbrowser_config | The config you want to send with requests to the NSTBrowser. _If left empty, Scrapling defaults to an optimized NSTBrowser's docker browserless config._ | ✔️ |
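The two new rows translate directly to fetch-time arguments; a sketch (assumes Chrome is installed locally when `real_chrome=True`):

```python
from scrapling import PlayWrightFetcher

# locale is new in 0.2.8; real_chrome drives your locally installed Chrome
page = PlayWrightFetcher().fetch('https://example.com', locale='de-DE', real_chrome=True)
print(page.status)
```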
@@ -814,8 +822,7 @@ Of course, you can find elements by text/regex, find similar elements in a more
 Yes, Scrapling instances are thread-safe. Each Adaptor instance maintains its state.
 
 ## More Sponsors!
-
-<a href="https://serpapi.com/?utm_source=scrapling"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png" height="500" width="500" alt="SerpApi Banner" ></a>
+<a href="https://serpapi.com/?utm_source=scrapling"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png" height="500" alt="SerpApi Banner" ></a>
 
 
 ## Contributing
{scrapling-0.2.6 → scrapling-0.2.8}/README.md
@@ -6,7 +6,7 @@ Dealing with failing web scrapers due to anti-bot protections or website changes
 Scrapling is a high-performance, intelligent web scraping library for Python that automatically adapts to website changes while significantly outperforming popular alternatives. For both beginners and experts, Scrapling provides powerful features while maintaining simplicity.
 
 ```python
->> from scrapling.
+>> from scrapling.defaults import Fetcher, StealthyFetcher, PlayWrightFetcher
 # Fetch websites' source under the radar!
 >> page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True)
 >> print(page.status)
@@ -44,10 +44,11 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha
 * [Text Extraction Speed Test (5000 nested elements).](#text-extraction-speed-test-5000-nested-elements)
 * [Extraction By Text Speed Test](#extraction-by-text-speed-test)
 * [Installation](#installation)
-* [Fetching Websites
-* [
-* [
-* [
+* [Fetching Websites](#fetching-websites)
+  * [Features](#features)
+  * [Fetcher class](#fetcher)
+  * [StealthyFetcher class](#stealthyfetcher)
+  * [PlayWrightFetcher class](#playwrightfetcher)
 * [Advanced Parsing Features](#advanced-parsing-features)
   * [Smart Navigation](#smart-navigation)
   * [Content-based Selection & Finding Similar Elements](#content-based-selection--finding-similar-elements)
@@ -210,43 +211,48 @@ playwright install chromium
 python -m browserforge update
 ```
 
-## Fetching Websites
-
+## Fetching Websites
+Fetchers are basically interfaces that do requests or fetch pages for you in a single request fashion and then return an `Adaptor` object for you. This feature was introduced because the only option we had before was to fetch the page as you wanted it, then pass it manually to the `Adaptor` class to create an `Adaptor` instance and start playing around with the page.
+
+### Features
+You might be slightly confused by now so let me clear things up. All fetcher-type classes are imported in the same way
 ```python
 from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
 ```
-
+All of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `storage`, `storage_args`, and `debug`, which are the same ones you give to the `Adaptor` class.
 
 If you don't want to pass arguments to the generated `Adaptor` object and want to use the default values, you can use this import instead for cleaner code:
 ```python
-from scrapling.
+from scrapling.defaults import Fetcher, StealthyFetcher, PlayWrightFetcher
 ```
 then use it right away without initializing like:
 ```python
 page = StealthyFetcher.fetch('https://example.com')
 ```
 
-Also, the `Response` object returned from all fetchers is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`. All `cookies`, `headers`, and `request_headers` are always of type `dictionary`.
+Also, the `Response` object returned from all fetchers is the same as the `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`. All `cookies`, `headers`, and `request_headers` are always of type `dictionary`.
 > [!NOTE]
 > The `auto_match` argument is enabled by default which is the one you should care about the most as you will see later.
 ### Fetcher
 This class is built on top of [httpx](https://www.python-httpx.org/) with additional configuration options, here you can do `GET`, `POST`, `PUT`, and `DELETE` requests.
 
 For all methods, you have `stealth_headers` which makes `Fetcher` create and use real browser's headers then create a referer header as if this request came from Google's search of this URL's domain. It's enabled by default.
+
+You can route all traffic (HTTP and HTTPS) to a proxy for any of these methods in this format `http://username:password@localhost:8030`
 ```python
 >> page = Fetcher().get('https://httpbin.org/get', stealth_headers=True, follow_redirects=True)
->> page = Fetcher().post('https://httpbin.org/post', data={'key': 'value'})
+>> page = Fetcher().post('https://httpbin.org/post', data={'key': 'value'}, proxy='http://username:password@localhost:8030')
 >> page = Fetcher().put('https://httpbin.org/put', data={'key': 'value'})
 >> page = Fetcher().delete('https://httpbin.org/delete')
 ```
 ### StealthyFetcher
-This class is built on top of [Camoufox](https://github.com/daijro/camoufox)
+This class is built on top of [Camoufox](https://github.com/daijro/camoufox), bypassing most anti-bot protections by default. Scrapling adds extra layers of flavors and configurations to increase performance and undetectability even further.
 ```python
 >> page = StealthyFetcher().fetch('https://www.browserscan.net/bot-detection')  # Running headless by default
 >> page.status == 200
 True
 ```
-> Note: all requests done by this fetcher
+> Note: all requests done by this fetcher are waiting by default for all JS to be fully loaded and executed so you don't have to :)
 
 <details><summary><strong>For the sake of simplicity, expand this for the complete list of arguments</strong></summary>
 
@@ -263,6 +269,7 @@ True
 | addons | List of Firefox addons to use. **Must be paths to extracted addons.** | ✔️ |
 | humanize | Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window. | ✔️ |
 | allow_webgl | Whether to allow WebGL. To prevent leaks, only use this for special cases. | ✔️ |
+| disable_ads | Enabled by default, this installs `uBlock Origin` addon on the browser if enabled. | ✔️ |
 | network_idle | Wait for the page until there are no network connections for at least 500 ms. | ✔️ |
 | timeout | The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000. | ✔️ |
 | wait_selector | Wait for a specific css selector to be in a specific state. | ✔️ |
@@ -281,7 +288,7 @@ This class is built on top of [Playwright](https://playwright.dev/python/) which
 >> page.css_first("#search a::attr(href)")
 'https://github.com/D4Vinci/Scrapling'
 ```
-> Note: all requests done by this fetcher
+> Note: all requests done by this fetcher are waiting by default for all JS to be fully loaded and executed so you don't have to :)
 
 Using this Fetcher class, you can make requests with:
 1) Vanilla Playwright without any modifications other than the ones you chose.
@@ -293,7 +300,7 @@ Using this Fetcher class, you can make requests with:
 3) Real browsers by passing the `real_chrome` argument or the CDP URL of your browser to be controlled by the Fetcher and most of the options can be enabled on it.
 4) [NSTBrowser](https://app.nstbrowser.io/r/1vO5e5)'s [docker browserless](https://hub.docker.com/r/nstbrowser/browserless) option by passing the CDP URL and enabling `nstbrowser_mode` option.
 
-> Hence using the `real_chrome` argument requires that you have
+> Hence using the `real_chrome` argument requires that you have Chrome browser installed on your device
 
 Add that to a lot of controlling/hiding options as you will see in the arguments list below.
 
@@ -316,7 +323,8 @@ Add that to a lot of controlling/hiding options as you will see in the arguments
 | hide_canvas | Add random noise to canvas operations to prevent fingerprinting. | ✔️ |
 | disable_webgl | Disables WebGL and WebGL 2.0 support entirely. | ✔️ |
 | stealth | Enables stealth mode, always check the documentation to see what stealth mode does currently. | ✔️ |
-| real_chrome | If you have
+| real_chrome | If you have Chrome browser installed on your device, enable this and the Fetcher will launch an instance of your browser and use it. | ✔️ |
+| locale | Set the locale for the browser if wanted. The default value is `en-US`. | ✔️ |
 | cdp_url | Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP. | ✔️ |
 | nstbrowser_mode | Enables NSTBrowser mode, **it have to be used with `cdp_url` argument or it will get completely ignored.** | ✔️ |
 | nstbrowser_config | The config you want to send with requests to the NSTBrowser. _If left empty, Scrapling defaults to an optimized NSTBrowser's docker browserless config._ | ✔️ |
@@ -768,8 +776,7 @@ Of course, you can find elements by text/regex, find similar elements in a more
 Yes, Scrapling instances are thread-safe. Each Adaptor instance maintains its state.
 
 ## More Sponsors!
-
-<a href="https://serpapi.com/?utm_source=scrapling"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png" height="500" width="500" alt="SerpApi Banner" ></a>
+<a href="https://serpapi.com/?utm_source=scrapling"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png" height="500" alt="SerpApi Banner" ></a>
 
 
 ## Contributing
{scrapling-0.2.6 → scrapling-0.2.8}/scrapling/__init__.py
@@ -1,10 +1,11 @@
 # Declare top-level shortcuts
-from scrapling.
+from scrapling.core.custom_types import AttributesHandler, TextHandler
+from scrapling.fetchers import (CustomFetcher, Fetcher, PlayWrightFetcher,
+                                StealthyFetcher)
 from scrapling.parser import Adaptor, Adaptors
-from scrapling.core.custom_types import TextHandler, AttributesHandler
 
 __author__ = "Karim Shoair (karim.shoair@pm.me)"
-__version__ = "0.2.6"
+__version__ = "0.2.8"
 __copyright__ = "Copyright (c) 2024 Karim Shoair"
 
 
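The reshuffled imports keep the same public surface; a quick check of what the package re-exports after the upgrade (a sketch, assuming 0.2.8 is installed):

```python
import scrapling
from scrapling import Adaptor, Fetcher, PlayWrightFetcher, StealthyFetcher
from scrapling import AttributesHandler, TextHandler

print(scrapling.__version__)  # '0.2.8'
```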
{scrapling-0.2.6 → scrapling-0.2.8}/scrapling/core/_types.py
@@ -2,9 +2,8 @@
 Type definitions for type checking purposes.
 """
 
-from typing import (
-
-)
+from typing import (TYPE_CHECKING, Any, Callable, Dict, Generator, Iterable,
+                    List, Literal, Optional, Pattern, Tuple, Type, Union)
 
 try:
     from typing import Protocol
{scrapling-0.2.6 → scrapling-0.2.8}/scrapling/core/custom_types.py
@@ -1,13 +1,13 @@
 import re
-from types import MappingProxyType
 from collections.abc import Mapping
+from types import MappingProxyType
 
-from
-from scrapling.core._types import Dict, List, Union, Pattern, SupportsIndex
-
-from orjson import loads, dumps
+from orjson import dumps, loads
 from w3lib.html import replace_entities as _replace_entities
 
+from scrapling.core._types import Dict, List, Pattern, SupportsIndex, Union
+from scrapling.core.utils import _is_iterable, flatten
+
 
 class TextHandler(str):
     """Extends standard Python string by adding more functionality"""
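`TextHandler`, whose imports this hunk reorders, subclasses `str`; a minimal sketch of that relationship (its extra helpers aren't shown in this hunk):

```python
from scrapling.core.custom_types import TextHandler

text = TextHandler('Scrapling 0.2.8')
print(isinstance(text, str))  # True — every regular str method still applies
print(text.upper())           # SCRAPLING 0.2.8
```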
{scrapling-0.2.6 → scrapling-0.2.8}/scrapling/core/translator.py
@@ -10,15 +10,14 @@ So you don't have to learn a new selectors/api method like what bs4 done with so
 
 import re
 
-from w3lib.html import HTML5_WHITESPACE
-from scrapling.core.utils import cache
-from scrapling.core._types import Any, Optional, Protocol, Self
-
-from cssselect.xpath import ExpressionError
-from cssselect.xpath import XPathExpr as OriginalXPathExpr
 from cssselect import HTMLTranslator as OriginalHTMLTranslator
 from cssselect.parser import Element, FunctionalPseudoElement, PseudoElement
+from cssselect.xpath import ExpressionError
+from cssselect.xpath import XPathExpr as OriginalXPathExpr
+from w3lib.html import HTML5_WHITESPACE
 
+from scrapling.core._types import Any, Optional, Protocol, Self
+from scrapling.core.utils import cache
 
 regex = f"[{HTML5_WHITESPACE}]+"
 replace_html5_whitespaces = re.compile(regex).sub
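For context on what this module builds on: plain `cssselect` translates CSS selectors into XPath, and scrapling's translator extends that machinery. A sketch using vanilla cssselect:

```python
from cssselect import HTMLTranslator

# The base translation scrapling's translator subclasses and extends
print(HTMLTranslator().css_to_xpath('#search a'))
# roughly: descendant-or-self::*[@id = 'search']/descendant-or-self::*/a
```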
{scrapling-0.2.6 → scrapling-0.2.8}/scrapling/core/utils.py
@@ -1,22 +1,25 @@
-import re
 import logging
+import re
 from itertools import chain
-# Using cache on top of a class is brilliant way to achieve Singleton design pattern without much code
-from functools import lru_cache as cache  # functools.cache is available on Python 3.9+ only so let's keep lru_cache
-
-from scrapling.core._types import Dict, Iterable, Any, Union
 
 import orjson
 from lxml import html
 
+from scrapling.core._types import Any, Dict, Iterable, Union
+
+# Using cache on top of a class is brilliant way to achieve Singleton design pattern without much code
+# functools.cache is available on Python 3.9+ only so let's keep lru_cache
+from functools import lru_cache as cache  # isort:skip
+
+
 html_forbidden = {html.HtmlComment, }
 logging.basicConfig(
-
-
-
-
-
-
+    level=logging.ERROR,
+    format='%(asctime)s - %(levelname)s - %(message)s',
+    handlers=[
+        logging.StreamHandler()
+    ]
+)
 
 
 def is_jsonable(content: Union[bytes, str]) -> bool:
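The comment kept above refers to a small trick: wrapping a class in `lru_cache` makes every call return the one cached instance. A standalone illustration:

```python
from functools import lru_cache as cache


@cache(maxsize=None)
class Singleton:
    """Each no-argument call hits the cache and returns the same object."""


print(Singleton() is Singleton())  # True
```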
@@ -94,7 +97,7 @@ class _StorageTools:
         parent = element.getparent()
         return tuple(
             (element.tag,) if parent is None else (
-
+                cls._get_element_path(parent) + (element.tag,)
             )
         )
 
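The repaired line completes a simple recursion; a standalone sketch of what `_get_element_path` computes (the helper name here is illustrative):

```python
from lxml import html


def get_element_path(element):
    # Mirrors the hunk above: build the tag chain from the root down
    parent = element.getparent()
    return (element.tag,) if parent is None else (
        get_element_path(parent) + (element.tag,)
    )


tree = html.fromstring('<html><body><div><a href="/">x</a></div></body></html>')
print(get_element_path(tree.xpath('//a')[0]))  # ('html', 'body', 'div', 'a')
```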
{scrapling-0.2.6 → scrapling-0.2.8}/scrapling/defaults.py
@@ -1,4 +1,4 @@
-from .fetchers import Fetcher,
+from .fetchers import Fetcher, PlayWrightFetcher, StealthyFetcher
 
 # If you are going to use Fetchers with the default settings, import them from this file instead for a cleaner looking code
 Fetcher = Fetcher()
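With `defaults.py` now exporting all three pre-initialized fetchers, they can be used without instantiation, as the README sections above describe:

```python
from scrapling.defaults import Fetcher, StealthyFetcher

page = Fetcher.get('https://httpbin.org/get')  # no Fetcher() call needed
stealth_page = StealthyFetcher.fetch('https://example.com')
print(page.status, stealth_page.status)
```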
{scrapling-0.2.6 → scrapling-0.2.8}/scrapling/engines/camo.py
@@ -1,19 +1,16 @@
 import logging
-from scrapling.core._types import Union, Callable, Optional, Dict, List, Literal
-
-from scrapling.engines.toolbelt import (
-    Response,
-    do_nothing,
-    StatusText,
-    get_os_name,
-    intercept_route,
-    check_type_validity,
-    construct_proxy_dict,
-    generate_convincing_referer,
-)
 
+from camoufox import DefaultAddons
 from camoufox.sync_api import Camoufox
 
+from scrapling.core._types import (Callable, Dict, List, Literal, Optional,
+                                   Union)
+from scrapling.engines.toolbelt import (Response, StatusText,
+                                        check_type_validity,
+                                        construct_proxy_dict, do_nothing,
+                                        generate_convincing_referer,
+                                        get_os_name, intercept_route)
+
 
 class CamoufoxEngine:
     def __init__(
@@ -21,7 +18,8 @@ class CamoufoxEngine:
             block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = False, network_idle: Optional[bool] = False, humanize: Optional[Union[bool, float]] = True,
             timeout: Optional[float] = 30000, page_action: Callable = do_nothing, wait_selector: Optional[str] = None, addons: Optional[List[str]] = None,
             wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
-            proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None,
+            proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True,
+            adaptor_arguments: Dict = None,
     ):
         """An engine that utilizes Camoufox library, check the `StealthyFetcher` class for more documentation.
@@ -36,6 +34,7 @@ class CamoufoxEngine:
         :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
         :param allow_webgl: Whether to allow WebGL. To prevent leaks, only use this for special cases.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+        :param disable_ads: Enabled by default, this installs `uBlock Origin` addon on the browser if enabled.
         :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
         :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
         :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
@@ -54,6 +53,7 @@ class CamoufoxEngine:
         self.network_idle = bool(network_idle)
         self.google_search = bool(google_search)
         self.os_randomize = bool(os_randomize)
+        self.disable_ads = bool(disable_ads)
         self.extra_headers = extra_headers or {}
         self.proxy = construct_proxy_dict(proxy)
         self.addons = addons or []
@@ -75,9 +75,11 @@ class CamoufoxEngine:
         :param url: Target url.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
+        addons = [] if self.disable_ads else [DefaultAddons.UBO]
         with Camoufox(
             proxy=self.proxy,
             addons=self.addons,
+            exclude_addons=addons,
             headless=self.headless,
             humanize=self.humanize,
             i_know_what_im_doing=True,  # To turn warnings off with the user configurations
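Spelled out, the two added lines wire `disable_ads` to Camoufox's `exclude_addons` (a sketch of the same logic):

```python
from camoufox import DefaultAddons

disable_ads = True  # Scrapling's default
exclude = [] if disable_ads else [DefaultAddons.UBO]
# disable_ads=True  -> exclude nothing, Camoufox keeps its bundled uBlock Origin
# disable_ads=False -> the UBO default addon is stripped from the launch
print(exclude)
```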
@@ -105,6 +107,11 @@ class CamoufoxEngine:
             if self.wait_selector and type(self.wait_selector) is str:
                 waiter = page.locator(self.wait_selector)
                 waiter.first.wait_for(state=self.wait_selector_state)
+                # Wait again after waiting for the selector, helpful with protections like Cloudflare
+                page.wait_for_load_state(state="load")
+                page.wait_for_load_state(state="domcontentloaded")
+                if self.network_idle:
+                    page.wait_for_load_state('networkidle')
 
             # This will be parsed inside `Response`
             encoding = res.headers.get('content-type', '') or 'utf-8'  # default encoding
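From the caller's side, the re-waiting added above kicks in whenever `wait_selector` is used; a sketch with a hypothetical target and selector:

```python
from scrapling import StealthyFetcher

# After 'h1' reaches the requested state, 0.2.8 re-waits for load and
# domcontentloaded (plus networkidle when network_idle=True)
page = StealthyFetcher().fetch(
    'https://example.com',
    wait_selector='h1',
    wait_selector_state='visible',
    network_idle=True,
)
print(page.status)
```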
{scrapling-0.2.6 → scrapling-0.2.8}/scrapling/engines/constants.py
@@ -44,7 +44,7 @@ DEFAULT_STEALTH_FLAGS = [
     '--disable-default-apps',
     '--disable-print-preview',
     '--disable-dev-shm-usage',
-    '--disable-popup-blocking',
+    # '--disable-popup-blocking',
     '--metrics-recording-only',
     '--disable-crash-reporter',
     '--disable-partial-raster',
{scrapling-0.2.6 → scrapling-0.2.8}/scrapling/engines/pw.py
@@ -1,20 +1,15 @@
 import json
 import logging
-
-
-from scrapling.engines.constants import DEFAULT_STEALTH_FLAGS,
-
-
-
-
-
-
-
-    construct_cdp_url,
-    check_type_validity,
-    construct_proxy_dict,
-    generate_convincing_referer,
-)
+
+from scrapling.core._types import Callable, Dict, List, Optional, Union
+from scrapling.engines.constants import (DEFAULT_STEALTH_FLAGS,
+                                         NSTBROWSER_DEFAULT_QUERY)
+from scrapling.engines.toolbelt import (Response, StatusText,
+                                        check_type_validity, construct_cdp_url,
+                                        construct_proxy_dict, do_nothing,
+                                        generate_convincing_referer,
+                                        generate_headers, intercept_route,
+                                        js_bypass_path)
 
 
 class PlaywrightEngine:
@@ -26,6 +21,7 @@ class PlaywrightEngine:
             timeout: Optional[float] = 30000,
             page_action: Callable = do_nothing,
             wait_selector: Optional[str] = None,
+            locale: Optional[str] = 'en-US',
             wait_selector_state: Optional[str] = 'attached',
             stealth: Optional[bool] = False,
             real_chrome: Optional[bool] = False,
@@ -50,6 +46,7 @@ class PlaywrightEngine:
         :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
         :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
         :param wait_selector: Wait for a specific css selector to be in a specific state.
+        :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
         :param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`.
         :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
         :param real_chrome: If you have chrome browser installed on your device, enable this and the Fetcher will launch an instance of your browser and use it.
@@ -64,6 +61,7 @@ class PlaywrightEngine:
         :param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
         """
         self.headless = headless
+        self.locale = check_type_validity(locale, [str], 'en-US', param_name='locale')
         self.disable_resources = disable_resources
         self.network_idle = bool(network_idle)
         self.stealth = bool(stealth)
@@ -87,6 +85,14 @@ class PlaywrightEngine:
         self.nstbrowser_mode = bool(nstbrowser_mode)
         self.nstbrowser_config = nstbrowser_config
         self.adaptor_arguments = adaptor_arguments if adaptor_arguments else {}
+        self.harmful_default_args = [
+            # This will be ignored to avoid detection more and possibly avoid the popup crashing bug abuse: https://issues.chromium.org/issues/340836884
+            '--enable-automation',
+            '--disable-popup-blocking',
+            # '--disable-component-update',
+            # '--disable-default-apps',
+            # '--disable-extensions',
+        ]
 
     def _cdp_url_logic(self, flags: Optional[List] = None) -> str:
         """Constructs new CDP URL if NSTBrowser is enabled otherwise return CDP URL as it is
@@ -151,15 +157,15 @@ class PlaywrightEngine:
         else:
             if self.stealth:
                 browser = p.chromium.launch(
-                    headless=self.headless, args=flags, ignore_default_args=
+                    headless=self.headless, args=flags, ignore_default_args=self.harmful_default_args, chromium_sandbox=True, channel='chrome' if self.real_chrome else 'chromium'
                 )
             else:
-                browser = p.chromium.launch(headless=self.headless, ignore_default_args=
+                browser = p.chromium.launch(headless=self.headless, ignore_default_args=self.harmful_default_args, channel='chrome' if self.real_chrome else 'chromium')
 
             # Creating the context
             if self.stealth:
                 context = browser.new_context(
-                    locale=
+                    locale=self.locale,
                     is_mobile=False,
                     has_touch=False,
                     proxy=self.proxy,
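For reference, `ignore_default_args` is a standard Playwright launch option; a plain-Playwright sketch of the mechanism (not Scrapling's own code, and the flag list is abbreviated):

```python
from playwright.sync_api import sync_playwright

harmful_default_args = ['--enable-automation', '--disable-popup-blocking']
with sync_playwright() as p:
    # Chromium starts without these default switches, trimming obvious automation signals
    browser = p.chromium.launch(headless=True, ignore_default_args=harmful_default_args)
    page = browser.new_page()
    page.goto('https://example.com')
    print(page.title())
    browser.close()
```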
@@ -176,6 +182,8 @@ class PlaywrightEngine:
                 )
             else:
                 context = browser.new_context(
+                    locale=self.locale,
+                    proxy=self.proxy,
                     color_scheme='dark',
                     user_agent=useragent,
                     device_scale_factor=2,
@@ -221,6 +229,11 @@ class PlaywrightEngine:
             if self.wait_selector and type(self.wait_selector) is str:
                 waiter = page.locator(self.wait_selector)
                 waiter.first.wait_for(state=self.wait_selector_state)
+                # Wait again after waiting for the selector, helpful with protections like Cloudflare
+                page.wait_for_load_state(state="load")
+                page.wait_for_load_state(state="domcontentloaded")
+                if self.network_idle:
+                    page.wait_for_load_state('networkidle')
 
             # This will be parsed inside `Response`
             encoding = res.headers.get('content-type', '') or 'utf-8'  # default encoding