scrapling 0.2.7__tar.gz → 0.2.9__tar.gz
- {scrapling-0.2.7/scrapling.egg-info → scrapling-0.2.9}/PKG-INFO +41 -25
- {scrapling-0.2.7 → scrapling-0.2.9}/README.md +35 -18
- scrapling-0.2.9/scrapling/__init__.py +12 -0
- {scrapling-0.2.7 → scrapling-0.2.9}/scrapling/core/_types.py +2 -3
- {scrapling-0.2.7 → scrapling-0.2.9}/scrapling/core/custom_types.py +93 -11
- {scrapling-0.2.7 → scrapling-0.2.9}/scrapling/core/storage_adaptors.py +9 -10
- {scrapling-0.2.7 → scrapling-0.2.9}/scrapling/core/translator.py +6 -7
- {scrapling-0.2.7 → scrapling-0.2.9}/scrapling/core/utils.py +35 -30
- {scrapling-0.2.7 → scrapling-0.2.9}/scrapling/defaults.py +2 -1
- {scrapling-0.2.7 → scrapling-0.2.9}/scrapling/engines/camo.py +96 -26
- {scrapling-0.2.7 → scrapling-0.2.9}/scrapling/engines/constants.py +4 -4
- {scrapling-0.2.7 → scrapling-0.2.9}/scrapling/engines/pw.py +166 -96
- scrapling-0.2.9/scrapling/engines/static.py +172 -0
- scrapling-0.2.9/scrapling/engines/toolbelt/__init__.py +6 -0
- {scrapling-0.2.7 → scrapling-0.2.9}/scrapling/engines/toolbelt/custom.py +22 -23
- {scrapling-0.2.7 → scrapling-0.2.9}/scrapling/engines/toolbelt/fingerprints.py +7 -7
- {scrapling-0.2.7 → scrapling-0.2.9}/scrapling/engines/toolbelt/navigation.py +25 -12
- scrapling-0.2.9/scrapling/fetchers.py +432 -0
- {scrapling-0.2.7 → scrapling-0.2.9}/scrapling/parser.py +63 -28
- {scrapling-0.2.7 → scrapling-0.2.9/scrapling.egg-info}/PKG-INFO +41 -25
- {scrapling-0.2.7 → scrapling-0.2.9}/scrapling.egg-info/SOURCES.txt +8 -3
- {scrapling-0.2.7 → scrapling-0.2.9}/scrapling.egg-info/requires.txt +3 -4
- {scrapling-0.2.7 → scrapling-0.2.9}/setup.cfg +1 -1
- {scrapling-0.2.7 → scrapling-0.2.9}/setup.py +9 -10
- scrapling-0.2.9/tests/fetchers/async/test_camoufox.py +95 -0
- scrapling-0.2.9/tests/fetchers/async/test_httpx.py +83 -0
- scrapling-0.2.9/tests/fetchers/async/test_playwright.py +99 -0
- scrapling-0.2.9/tests/fetchers/sync/__init__.py +0 -0
- scrapling-0.2.9/tests/fetchers/sync/test_camoufox.py +68 -0
- scrapling-0.2.9/tests/fetchers/sync/test_httpx.py +82 -0
- scrapling-0.2.9/tests/fetchers/sync/test_playwright.py +87 -0
- scrapling-0.2.9/tests/fetchers/test_utils.py +97 -0
- scrapling-0.2.9/tests/parser/__init__.py +0 -0
- scrapling-0.2.9/tests/parser/test_automatch.py +111 -0
- scrapling-0.2.9/tests/parser/test_general.py +330 -0
- scrapling-0.2.7/scrapling/__init__.py +0 -11
- scrapling-0.2.7/scrapling/engines/static.py +0 -128
- scrapling-0.2.7/scrapling/engines/toolbelt/__init__.py +0 -20
- scrapling-0.2.7/scrapling/fetchers.py +0 -216
- scrapling-0.2.7/tests/fetchers/test_camoufox.py +0 -64
- scrapling-0.2.7/tests/fetchers/test_httpx.py +0 -67
- scrapling-0.2.7/tests/fetchers/test_playwright.py +0 -76
- scrapling-0.2.7/tests/fetchers/test_utils.py +0 -129
- scrapling-0.2.7/tests/parser/test_automatch.py +0 -56
- scrapling-0.2.7/tests/parser/test_general.py +0 -286
- {scrapling-0.2.7 → scrapling-0.2.9}/LICENSE +0 -0
- {scrapling-0.2.7 → scrapling-0.2.9}/MANIFEST.in +0 -0
- {scrapling-0.2.7 → scrapling-0.2.9}/scrapling/core/__init__.py +0 -0
- {scrapling-0.2.7 → scrapling-0.2.9}/scrapling/core/mixins.py +0 -0
- {scrapling-0.2.7 → scrapling-0.2.9}/scrapling/engines/__init__.py +2 -2
- {scrapling-0.2.7 → scrapling-0.2.9}/scrapling/engines/toolbelt/bypasses/navigator_plugins.js +0 -0
- {scrapling-0.2.7 → scrapling-0.2.9}/scrapling/engines/toolbelt/bypasses/notification_permission.js +0 -0
- {scrapling-0.2.7 → scrapling-0.2.9}/scrapling/engines/toolbelt/bypasses/pdf_viewer.js +0 -0
- {scrapling-0.2.7 → scrapling-0.2.9}/scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +0 -0
- {scrapling-0.2.7 → scrapling-0.2.9}/scrapling/engines/toolbelt/bypasses/screen_props.js +0 -0
- {scrapling-0.2.7 → scrapling-0.2.9}/scrapling/engines/toolbelt/bypasses/webdriver_fully.js +0 -0
- {scrapling-0.2.7 → scrapling-0.2.9}/scrapling/engines/toolbelt/bypasses/window_chrome.js +0 -0
- {scrapling-0.2.7 → scrapling-0.2.9}/scrapling/py.typed +0 -0
- {scrapling-0.2.7 → scrapling-0.2.9}/scrapling.egg-info/dependency_links.txt +0 -0
- {scrapling-0.2.7 → scrapling-0.2.9}/scrapling.egg-info/not-zip-safe +0 -0
- {scrapling-0.2.7 → scrapling-0.2.9}/scrapling.egg-info/top_level.txt +0 -0
- {scrapling-0.2.7 → scrapling-0.2.9}/tests/__init__.py +0 -0
- {scrapling-0.2.7 → scrapling-0.2.9}/tests/fetchers/__init__.py +0 -0
- {scrapling-0.2.7/tests/parser → scrapling-0.2.9/tests/fetchers/async}/__init__.py +0 -0
{scrapling-0.2.7/scrapling.egg-info → scrapling-0.2.9}/PKG-INFO

@@ -1,7 +1,7 @@
 Metadata-Version: 2.1
 Name: scrapling
-Version: 0.2.7
-Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
+Version: 0.2.9
+Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
 Home-page: https://github.com/D4Vinci/Scrapling
 Author: Karim Shoair
 Author-email: karim.shoair@pm.me
@@ -29,7 +29,7 @@ Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
 Classifier: Programming Language :: Python :: Implementation :: CPython
 Classifier: Typing :: Typed
-Requires-Python: >=3.
+Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: requests>=2.3
@@ -39,10 +39,9 @@ Requires-Dist: w3lib
 Requires-Dist: orjson>=3
 Requires-Dist: tldextract
 Requires-Dist: httpx[brotli,zstd]
-Requires-Dist: playwright
-Requires-Dist: rebrowser-playwright
-Requires-Dist: camoufox>=0.
-Requires-Dist: browserforge
+Requires-Dist: playwright>=1.49.1
+Requires-Dist: rebrowser-playwright>=1.49.1
+Requires-Dist: camoufox[geoip]>=0.4.9
 
 # 🕷️ Scrapling: Undetectable, Lightning-Fast, and Adaptive Web Scraping for Python
 [](https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml) [](https://badge.fury.io/py/Scrapling) [](https://pypi.org/project/scrapling/) [](https://pepy.tech/project/scrapling)
@@ -52,7 +51,7 @@ Dealing with failing web scrapers due to anti-bot protections or website changes
 Scrapling is a high-performance, intelligent web scraping library for Python that automatically adapts to website changes while significantly outperforming popular alternatives. For both beginners and experts, Scrapling provides powerful features while maintaining simplicity.
 
 ```python
->> from scrapling.
+>> from scrapling.defaults import Fetcher, AsyncFetcher, StealthyFetcher, PlayWrightFetcher
 # Fetch websites' source under the radar!
 >> page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True)
 >> print(page.status)
@@ -81,7 +80,7 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha
 
 ## Table of content
 * [Key Features](#key-features)
-* [Fetch websites as you prefer](#fetch-websites-as-you-prefer)
+* [Fetch websites as you prefer](#fetch-websites-as-you-prefer-with-async-support)
 * [Adaptive Scraping](#adaptive-scraping)
 * [Performance](#performance)
 * [Developing Experience](#developing-experience)
@@ -122,7 +121,7 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha
 
 ## Key Features
 
-### Fetch websites as you prefer
+### Fetch websites as you prefer with async support
 - **HTTP requests**: Stealthy and fast HTTP requests with `Fetcher`
 - **Stealthy fetcher**: Annoying anti-bot protection? No problem! Scrapling can bypass almost all of them with `StealthyFetcher` with default configuration!
 - **Your preferred browser**: Use your real browser with CDP, [NSTbrowser](https://app.nstbrowser.io/r/1vO5e5)'s browserless, PlayWright with stealth mode, or even vanilla PlayWright - All is possible with `PlayWrightFetcher`!
@@ -213,7 +212,7 @@ Scrapling can find elements with more methods and it returns full element `Adapt
 > All benchmarks' results are an average of 100 runs. See our [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) for methodology and to run your comparisons.
 
 ## Installation
-Scrapling is a breeze to get started with - Starting from version 0.2, we require at least Python 3.
+Scrapling is a breeze to get started with - Starting from version 0.2.9, we require at least Python 3.9 to work.
 ```bash
 pip3 install scrapling
 ```
@@ -258,47 +257,58 @@ python -m browserforge update
 ```
 
 ## Fetching Websites
-Fetchers are basically interfaces that do requests or fetch pages for you in a single request fashion then return an `Adaptor` object for you. This feature was introduced because the only option we had before was to fetch the page as you
+Fetchers are basically interfaces that do requests or fetch pages for you in a single request fashion and then return an `Adaptor` object for you. This feature was introduced because the only option we had before was to fetch the page as you wanted it, then pass it manually to the `Adaptor` class to create an `Adaptor` instance and start playing around with the page.
 
 ### Features
-You might be
+You might be slightly confused by now so let me clear things up. All fetcher-type classes are imported in the same way
 ```python
 from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
 ```
-
+All of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `keep_cdata`, `storage`, and `storage_args`, which are the same ones you give to the `Adaptor` class.
 
 If you don't want to pass arguments to the generated `Adaptor` object and want to use the default values, you can use this import instead for cleaner code:
```python
-from scrapling.
+from scrapling.defaults import Fetcher, AsyncFetcher, StealthyFetcher, PlayWrightFetcher
 ```
 then use it right away without initializing like:
 ```python
 page = StealthyFetcher.fetch('https://example.com')
 ```
 
-Also, the `Response` object returned from all fetchers is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`. All `cookies`, `headers`, and `request_headers` are always of type `dictionary`.
+Also, the `Response` object returned from all fetchers is the same as the `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`. All `cookies`, `headers`, and `request_headers` are always of type `dictionary`.
 > [!NOTE]
 > The `auto_match` argument is enabled by default which is the one you should care about the most as you will see later.
 ### Fetcher
 This class is built on top of [httpx](https://www.python-httpx.org/) with additional configuration options, here you can do `GET`, `POST`, `PUT`, and `DELETE` requests.
 
-For all methods, you have `
+For all methods, you have `stealthy_headers` which makes `Fetcher` create and use real browser's headers then create a referer header as if this request came from Google's search of this URL's domain. It's enabled by default. You can also set the number of retries with the argument `retries` for all methods and this will make httpx retry requests if it failed for any reason. The default number of retries for all `Fetcher` methods is 3.
 
 You can route all traffic (HTTP and HTTPS) to a proxy for any of these methods in this format `http://username:password@localhost:8030`
 ```python
->> page = Fetcher().get('https://httpbin.org/get',
+>> page = Fetcher().get('https://httpbin.org/get', stealthy_headers=True, follow_redirects=True)
 >> page = Fetcher().post('https://httpbin.org/post', data={'key': 'value'}, proxy='http://username:password@localhost:8030')
 >> page = Fetcher().put('https://httpbin.org/put', data={'key': 'value'})
 >> page = Fetcher().delete('https://httpbin.org/delete')
 ```
+For Async requests, you will just replace the import like below:
+```python
+>> from scrapling import AsyncFetcher
+>> page = await AsyncFetcher().get('https://httpbin.org/get', stealthy_headers=True, follow_redirects=True)
+>> page = await AsyncFetcher().post('https://httpbin.org/post', data={'key': 'value'}, proxy='http://username:password@localhost:8030')
+>> page = await AsyncFetcher().put('https://httpbin.org/put', data={'key': 'value'})
+>> page = await AsyncFetcher().delete('https://httpbin.org/delete')
+```
 ### StealthyFetcher
-This class is built on top of [Camoufox](https://github.com/daijro/camoufox)
+This class is built on top of [Camoufox](https://github.com/daijro/camoufox), bypassing most anti-bot protections by default. Scrapling adds extra layers of flavors and configurations to increase performance and undetectability even further.
 ```python
 >> page = StealthyFetcher().fetch('https://www.browserscan.net/bot-detection') # Running headless by default
 >> page.status == 200
 True
+>> page = await StealthyFetcher().async_fetch('https://www.browserscan.net/bot-detection') # the async version of fetch
+>> page.status == 200
+True
 ```
-> Note: all requests done by this fetcher
+> Note: all requests done by this fetcher are waiting by default for all JS to be fully loaded and executed so you don't have to :)
 
 <details><summary><strong>For the sake of simplicity, expand this for the complete list of arguments</strong></summary>
 
@@ -314,7 +324,8 @@ True
 | page_action | Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again. | ✔️ |
 | addons | List of Firefox addons to use. **Must be paths to extracted addons.** | ✔️ |
 | humanize | Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window. | ✔️ |
-| allow_webgl |
+| allow_webgl | Enabled by default. Disabling it WebGL not recommended as many WAFs now checks if WebGL is enabled. | ✔️ |
+| geoip | Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, & spoof the WebRTC IP address. It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region. | ✔️ |
 | disable_ads | Enabled by default, this installs `uBlock Origin` addon on the browser if enabled. | ✔️ |
 | network_idle | Wait for the page until there are no network connections for at least 500 ms. | ✔️ |
 | timeout | The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000. | ✔️ |
@@ -333,8 +344,11 @@ This class is built on top of [Playwright](https://playwright.dev/python/) which
 >> page = PlayWrightFetcher().fetch('https://www.google.com/search?q=%22Scrapling%22', disable_resources=True) # Vanilla Playwright option
 >> page.css_first("#search a::attr(href)")
 'https://github.com/D4Vinci/Scrapling'
+>> page = await PlayWrightFetcher().async_fetch('https://www.google.com/search?q=%22Scrapling%22', disable_resources=True) # the async version of fetch
+>> page.css_first("#search a::attr(href)")
+'https://github.com/D4Vinci/Scrapling'
 ```
-> Note: all requests done by this fetcher
+> Note: all requests done by this fetcher are waiting by default for all JS to be fully loaded and executed so you don't have to :)
 
 Using this Fetcher class, you can make requests with:
 1) Vanilla Playwright without any modifications other than the ones you chose.
@@ -346,7 +360,7 @@ Using this Fetcher class, you can make requests with:
 3) Real browsers by passing the `real_chrome` argument or the CDP URL of your browser to be controlled by the Fetcher and most of the options can be enabled on it.
 4) [NSTBrowser](https://app.nstbrowser.io/r/1vO5e5)'s [docker browserless](https://hub.docker.com/r/nstbrowser/browserless) option by passing the CDP URL and enabling `nstbrowser_mode` option.
 
-> Hence using the `real_chrome` argument requires that you have
+> Hence using the `real_chrome` argument requires that you have Chrome browser installed on your device
 
 Add that to a lot of controlling/hiding options as you will see in the arguments list below.
 
@@ -369,7 +383,7 @@ Add that to a lot of controlling/hiding options as you will see in the arguments
 | hide_canvas | Add random noise to canvas operations to prevent fingerprinting. | ✔️ |
 | disable_webgl | Disables WebGL and WebGL 2.0 support entirely. | ✔️ |
 | stealth | Enables stealth mode, always check the documentation to see what stealth mode does currently. | ✔️ |
-| real_chrome | If you have
+| real_chrome | If you have Chrome browser installed on your device, enable this and the Fetcher will launch an instance of your browser and use it. | ✔️ |
 | locale | Set the locale for the browser if wanted. The default value is `en-US`. | ✔️ |
 | cdp_url | Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP. | ✔️ |
 | nstbrowser_mode | Enables NSTBrowser mode, **it have to be used with `cdp_url` argument or it will get completely ignored.** | ✔️ |
@@ -437,6 +451,9 @@ You can select elements by their text content in multiple ways, here's a full ex
 >>> page.find_by_text('Tipping the Velvet') # Find the first element whose text fully matches this text
 <data='<a href="catalogue/tipping-the-velvet_99...' parent='<h3><a href="catalogue/tipping-the-velve...'>
 
+>>> page.urljoin(page.find_by_text('Tipping the Velvet').attrib['href']) # We use `page.urljoin` to return the full URL from the relative `href`
+'https://books.toscrape.com/catalogue/tipping-the-velvet_999/index.html'
+
 >>> page.find_by_text('Tipping the Velvet', first_match=False) # Get all matches if there are more
 [<data='<a href="catalogue/tipping-the-velvet_99...' parent='<h3><a href="catalogue/tipping-the-velve...'>]
 
@@ -850,7 +867,6 @@ This project includes code adapted from:
 
 ## Known Issues
 - In the auto-matching save process, the unique properties of the first element from the selection results are the only ones that get saved. So if the selector you are using selects different elements on the page that are in different locations, auto-matching will probably return to you the first element only when you relocate it later. This doesn't include combined CSS selectors (Using commas to combine more than one selector for example) as these selectors get separated and each selector gets executed alone.
-- Currently, Scrapling is not compatible with async/await.
 
 ---
 <div align="center"><small>Designed & crafted with ❤️ by Karim Shoair.</small></div><br>
{scrapling-0.2.7 → scrapling-0.2.9}/README.md

@@ -6,7 +6,7 @@ Dealing with failing web scrapers due to anti-bot protections or website changes
 Scrapling is a high-performance, intelligent web scraping library for Python that automatically adapts to website changes while significantly outperforming popular alternatives. For both beginners and experts, Scrapling provides powerful features while maintaining simplicity.
 
 ```python
->> from scrapling.
+>> from scrapling.defaults import Fetcher, AsyncFetcher, StealthyFetcher, PlayWrightFetcher
 # Fetch websites' source under the radar!
 >> page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True)
 >> print(page.status)
@@ -35,7 +35,7 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha
 
 ## Table of content
 * [Key Features](#key-features)
-* [Fetch websites as you prefer](#fetch-websites-as-you-prefer)
+* [Fetch websites as you prefer](#fetch-websites-as-you-prefer-with-async-support)
 * [Adaptive Scraping](#adaptive-scraping)
 * [Performance](#performance)
 * [Developing Experience](#developing-experience)
@@ -76,7 +76,7 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha
 
 ## Key Features
 
-### Fetch websites as you prefer
+### Fetch websites as you prefer with async support
 - **HTTP requests**: Stealthy and fast HTTP requests with `Fetcher`
 - **Stealthy fetcher**: Annoying anti-bot protection? No problem! Scrapling can bypass almost all of them with `StealthyFetcher` with default configuration!
 - **Your preferred browser**: Use your real browser with CDP, [NSTbrowser](https://app.nstbrowser.io/r/1vO5e5)'s browserless, PlayWright with stealth mode, or even vanilla PlayWright - All is possible with `PlayWrightFetcher`!
@@ -167,7 +167,7 @@ Scrapling can find elements with more methods and it returns full element `Adapt
 > All benchmarks' results are an average of 100 runs. See our [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) for methodology and to run your comparisons.
 
 ## Installation
-Scrapling is a breeze to get started with - Starting from version 0.2, we require at least Python 3.
+Scrapling is a breeze to get started with - Starting from version 0.2.9, we require at least Python 3.9 to work.
 ```bash
 pip3 install scrapling
 ```
@@ -212,47 +212,58 @@ python -m browserforge update
 ```
 
 ## Fetching Websites
-Fetchers are basically interfaces that do requests or fetch pages for you in a single request fashion then return an `Adaptor` object for you. This feature was introduced because the only option we had before was to fetch the page as you
+Fetchers are basically interfaces that do requests or fetch pages for you in a single request fashion and then return an `Adaptor` object for you. This feature was introduced because the only option we had before was to fetch the page as you wanted it, then pass it manually to the `Adaptor` class to create an `Adaptor` instance and start playing around with the page.
 
 ### Features
-You might be
+You might be slightly confused by now so let me clear things up. All fetcher-type classes are imported in the same way
 ```python
 from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
 ```
-
+All of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `keep_cdata`, `storage`, and `storage_args`, which are the same ones you give to the `Adaptor` class.
 
 If you don't want to pass arguments to the generated `Adaptor` object and want to use the default values, you can use this import instead for cleaner code:
 ```python
-from scrapling.
+from scrapling.defaults import Fetcher, AsyncFetcher, StealthyFetcher, PlayWrightFetcher
 ```
 then use it right away without initializing like:
 ```python
 page = StealthyFetcher.fetch('https://example.com')
 ```
 
-Also, the `Response` object returned from all fetchers is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`. All `cookies`, `headers`, and `request_headers` are always of type `dictionary`.
+Also, the `Response` object returned from all fetchers is the same as the `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`. All `cookies`, `headers`, and `request_headers` are always of type `dictionary`.
 > [!NOTE]
 > The `auto_match` argument is enabled by default which is the one you should care about the most as you will see later.
 ### Fetcher
 This class is built on top of [httpx](https://www.python-httpx.org/) with additional configuration options, here you can do `GET`, `POST`, `PUT`, and `DELETE` requests.
 
-For all methods, you have `
+For all methods, you have `stealthy_headers` which makes `Fetcher` create and use real browser's headers then create a referer header as if this request came from Google's search of this URL's domain. It's enabled by default. You can also set the number of retries with the argument `retries` for all methods and this will make httpx retry requests if it failed for any reason. The default number of retries for all `Fetcher` methods is 3.
 
 You can route all traffic (HTTP and HTTPS) to a proxy for any of these methods in this format `http://username:password@localhost:8030`
 ```python
->> page = Fetcher().get('https://httpbin.org/get',
+>> page = Fetcher().get('https://httpbin.org/get', stealthy_headers=True, follow_redirects=True)
 >> page = Fetcher().post('https://httpbin.org/post', data={'key': 'value'}, proxy='http://username:password@localhost:8030')
 >> page = Fetcher().put('https://httpbin.org/put', data={'key': 'value'})
 >> page = Fetcher().delete('https://httpbin.org/delete')
 ```
+For Async requests, you will just replace the import like below:
+```python
+>> from scrapling import AsyncFetcher
+>> page = await AsyncFetcher().get('https://httpbin.org/get', stealthy_headers=True, follow_redirects=True)
+>> page = await AsyncFetcher().post('https://httpbin.org/post', data={'key': 'value'}, proxy='http://username:password@localhost:8030')
+>> page = await AsyncFetcher().put('https://httpbin.org/put', data={'key': 'value'})
+>> page = await AsyncFetcher().delete('https://httpbin.org/delete')
+```
 ### StealthyFetcher
-This class is built on top of [Camoufox](https://github.com/daijro/camoufox)
+This class is built on top of [Camoufox](https://github.com/daijro/camoufox), bypassing most anti-bot protections by default. Scrapling adds extra layers of flavors and configurations to increase performance and undetectability even further.
 ```python
 >> page = StealthyFetcher().fetch('https://www.browserscan.net/bot-detection') # Running headless by default
 >> page.status == 200
 True
+>> page = await StealthyFetcher().async_fetch('https://www.browserscan.net/bot-detection') # the async version of fetch
+>> page.status == 200
+True
 ```
-> Note: all requests done by this fetcher
+> Note: all requests done by this fetcher are waiting by default for all JS to be fully loaded and executed so you don't have to :)
 
 <details><summary><strong>For the sake of simplicity, expand this for the complete list of arguments</strong></summary>
 
@@ -268,7 +279,8 @@ True
 | page_action | Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again. | ✔️ |
 | addons | List of Firefox addons to use. **Must be paths to extracted addons.** | ✔️ |
 | humanize | Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window. | ✔️ |
-| allow_webgl |
+| allow_webgl | Enabled by default. Disabling it WebGL not recommended as many WAFs now checks if WebGL is enabled. | ✔️ |
+| geoip | Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, & spoof the WebRTC IP address. It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region. | ✔️ |
 | disable_ads | Enabled by default, this installs `uBlock Origin` addon on the browser if enabled. | ✔️ |
 | network_idle | Wait for the page until there are no network connections for at least 500 ms. | ✔️ |
 | timeout | The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000. | ✔️ |
@@ -287,8 +299,11 @@ This class is built on top of [Playwright](https://playwright.dev/python/) which
 >> page = PlayWrightFetcher().fetch('https://www.google.com/search?q=%22Scrapling%22', disable_resources=True) # Vanilla Playwright option
 >> page.css_first("#search a::attr(href)")
 'https://github.com/D4Vinci/Scrapling'
+>> page = await PlayWrightFetcher().async_fetch('https://www.google.com/search?q=%22Scrapling%22', disable_resources=True) # the async version of fetch
+>> page.css_first("#search a::attr(href)")
+'https://github.com/D4Vinci/Scrapling'
 ```
-> Note: all requests done by this fetcher
+> Note: all requests done by this fetcher are waiting by default for all JS to be fully loaded and executed so you don't have to :)
 
 Using this Fetcher class, you can make requests with:
 1) Vanilla Playwright without any modifications other than the ones you chose.
@@ -300,7 +315,7 @@ Using this Fetcher class, you can make requests with:
 3) Real browsers by passing the `real_chrome` argument or the CDP URL of your browser to be controlled by the Fetcher and most of the options can be enabled on it.
 4) [NSTBrowser](https://app.nstbrowser.io/r/1vO5e5)'s [docker browserless](https://hub.docker.com/r/nstbrowser/browserless) option by passing the CDP URL and enabling `nstbrowser_mode` option.
 
-> Hence using the `real_chrome` argument requires that you have
+> Hence using the `real_chrome` argument requires that you have Chrome browser installed on your device
 
 Add that to a lot of controlling/hiding options as you will see in the arguments list below.
 
@@ -323,7 +338,7 @@ Add that to a lot of controlling/hiding options as you will see in the arguments
 | hide_canvas | Add random noise to canvas operations to prevent fingerprinting. | ✔️ |
 | disable_webgl | Disables WebGL and WebGL 2.0 support entirely. | ✔️ |
 | stealth | Enables stealth mode, always check the documentation to see what stealth mode does currently. | ✔️ |
-| real_chrome | If you have
+| real_chrome | If you have Chrome browser installed on your device, enable this and the Fetcher will launch an instance of your browser and use it. | ✔️ |
 | locale | Set the locale for the browser if wanted. The default value is `en-US`. | ✔️ |
 | cdp_url | Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP. | ✔️ |
 | nstbrowser_mode | Enables NSTBrowser mode, **it have to be used with `cdp_url` argument or it will get completely ignored.** | ✔️ |
@@ -391,6 +406,9 @@ You can select elements by their text content in multiple ways, here's a full ex
 >>> page.find_by_text('Tipping the Velvet') # Find the first element whose text fully matches this text
 <data='<a href="catalogue/tipping-the-velvet_99...' parent='<h3><a href="catalogue/tipping-the-velve...'>
 
+>>> page.urljoin(page.find_by_text('Tipping the Velvet').attrib['href']) # We use `page.urljoin` to return the full URL from the relative `href`
+'https://books.toscrape.com/catalogue/tipping-the-velvet_999/index.html'
+
 >>> page.find_by_text('Tipping the Velvet', first_match=False) # Get all matches if there are more
 [<data='<a href="catalogue/tipping-the-velvet_99...' parent='<h3><a href="catalogue/tipping-the-velve...'>]
 
@@ -804,7 +822,6 @@ This project includes code adapted from:
 
 ## Known Issues
 - In the auto-matching save process, the unique properties of the first element from the selection results are the only ones that get saved. So if the selector you are using selects different elements on the page that are in different locations, auto-matching will probably return to you the first element only when you relocate it later. This doesn't include combined CSS selectors (Using commas to combine more than one selector for example) as these selectors get separated and each selector gets executed alone.
-- Currently, Scrapling is not compatible with async/await.
 
 ---
 <div align="center"><small>Designed & crafted with ❤️ by Karim Shoair.</small></div><br>
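The README hunks above drop the old "Scrapling is not compatible with async/await" known issue and document the new `AsyncFetcher`. A minimal sketch of driving that async API concurrently, assuming scrapling 0.2.9 is installed; the `asyncio.gather` pattern and the sample URLs are our illustration, not taken from the README:

```python
import asyncio

from scrapling import AsyncFetcher  # top-level shortcut added in 0.2.9


async def main():
    fetcher = AsyncFetcher(auto_match=False)
    urls = ['https://httpbin.org/get', 'https://httpbin.org/headers']
    # Each get() is awaitable, so several requests can be awaited together
    pages = await asyncio.gather(*(fetcher.get(url, follow_redirects=True) for url in urls))
    for page in pages:
        print(page.status)  # Response exposes status/reason/cookies/headers per the README


asyncio.run(main())
```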
scrapling-0.2.9/scrapling/__init__.py

@@ -0,0 +1,12 @@
+# Declare top-level shortcuts
+from scrapling.core.custom_types import AttributesHandler, TextHandler
+from scrapling.fetchers import (AsyncFetcher, CustomFetcher, Fetcher,
+                                PlayWrightFetcher, StealthyFetcher)
+from scrapling.parser import Adaptor, Adaptors
+
+__author__ = "Karim Shoair (karim.shoair@pm.me)"
+__version__ = "0.2.9"
+__copyright__ = "Copyright (c) 2024 Karim Shoair"
+
+
+__all__ = ['Adaptor', 'Fetcher', 'AsyncFetcher', 'StealthyFetcher', 'PlayWrightFetcher']
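For orientation, a short sketch of what the new `scrapling/__init__.py` shown above re-exports (assuming the 0.2.9 package is installed; the `print` calls are only illustrative):

```python
import scrapling

print(scrapling.__version__)  # "0.2.9" per the new __init__.py
print(scrapling.__all__)      # ['Adaptor', 'Fetcher', 'AsyncFetcher', 'StealthyFetcher', 'PlayWrightFetcher']

# The same names can be imported directly from the package top level:
from scrapling import Adaptor, AsyncFetcher, Fetcher, PlayWrightFetcher, StealthyFetcher
from scrapling import AttributesHandler, TextHandler  # re-exported from scrapling.core.custom_types
```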
{scrapling-0.2.7 → scrapling-0.2.9}/scrapling/core/_types.py

@@ -2,9 +2,8 @@
 Type definitions for type checking purposes.
 """
 
-from typing import (
-
-)
+from typing import (TYPE_CHECKING, Any, Callable, Dict, Generator, Iterable,
+                    List, Literal, Optional, Pattern, Tuple, Type, Union)
 
 try:
     from typing import Protocol
{scrapling-0.2.7 → scrapling-0.2.9}/scrapling/core/custom_types.py

@@ -1,24 +1,83 @@
 import re
-from types import MappingProxyType
 from collections.abc import Mapping
+from types import MappingProxyType
 
-from
-from scrapling.core._types import Dict, List, Union, Pattern, SupportsIndex
-
-from orjson import loads, dumps
+from orjson import dumps, loads
 from w3lib.html import replace_entities as _replace_entities
 
+from scrapling.core._types import Dict, List, Pattern, SupportsIndex, Union
+from scrapling.core.utils import _is_iterable, flatten
+
 
 class TextHandler(str):
     """Extends standard Python string by adding more functionality"""
     __slots__ = ()
 
     def __new__(cls, string):
-
-        if type(string) is str:
+        if isinstance(string, str):
             return super().__new__(cls, string)
-
-
+        return super().__new__(cls, '')
+
+    # Make methods from original `str` class return `TextHandler` instead of returning `str` again
+    # Of course, this stupid workaround is only so we can keep the auto-completion working without issues in your IDE
+    # and I made sonnet write it for me :)
+    def strip(self, chars=None):
+        return TextHandler(super().strip(chars))
+
+    def lstrip(self, chars=None):
+        return TextHandler(super().lstrip(chars))
+
+    def rstrip(self, chars=None):
+        return TextHandler(super().rstrip(chars))
+
+    def capitalize(self):
+        return TextHandler(super().capitalize())
+
+    def casefold(self):
+        return TextHandler(super().casefold())
+
+    def center(self, width, fillchar=' '):
+        return TextHandler(super().center(width, fillchar))
+
+    def expandtabs(self, tabsize=8):
+        return TextHandler(super().expandtabs(tabsize))
+
+    def format(self, *args, **kwargs):
+        return TextHandler(super().format(*args, **kwargs))
+
+    def format_map(self, mapping):
+        return TextHandler(super().format_map(mapping))
+
+    def join(self, iterable):
+        return TextHandler(super().join(iterable))
+
+    def ljust(self, width, fillchar=' '):
+        return TextHandler(super().ljust(width, fillchar))
+
+    def rjust(self, width, fillchar=' '):
+        return TextHandler(super().rjust(width, fillchar))
+
+    def swapcase(self):
+        return TextHandler(super().swapcase())
+
+    def title(self):
+        return TextHandler(super().title())
+
+    def translate(self, table):
+        return TextHandler(super().translate(table))
+
+    def zfill(self, width):
+        return TextHandler(super().zfill(width))
+
+    def replace(self, old, new, count=-1):
+        return TextHandler(super().replace(old, new, count))
+
+    def upper(self):
+        return TextHandler(super().upper())
+
+    def lower(self):
+        return TextHandler(super().lower())
+    ##############
 
     def sort(self, reverse: bool = False) -> str:
         """Return a sorted version of the string"""
@@ -30,11 +89,21 @@ class TextHandler(str):
         data = re.sub(' +', ' ', data)
         return self.__class__(data.strip())
 
+    # For easy copy-paste from Scrapy/parsel code when needed :)
+    def get(self, default=None):
+        return self
+
+    def get_all(self):
+        return self
+
+    extract = get_all
+    extract_first = get
+
     def json(self) -> Dict:
         """Return json response if the response is jsonable otherwise throw error"""
-        # Using
+        # Using str function as a workaround for orjson issue with subclasses of str
         # Check this out: https://github.com/ijl/orjson/issues/445
-        return loads(self
+        return loads(str(self))
 
     def re(
             self, regex: Union[str, Pattern[str]], replace_entities: bool = True, clean_match: bool = False,
@@ -127,6 +196,19 @@ class TextHandlers(List[TextHandler]):
                 return result
         return default
 
+    # For easy copy-paste from Scrapy/parsel code when needed :)
+    def get(self, default=None):
+        """Returns the first item of the current list
+        :param default: the default value to return if the current list is empty
+        """
+        return self[0] if len(self) > 0 else default
+
+    def extract(self):
+        return self
+
+    extract_first = get
+    get_all = extract
+
 
 class AttributesHandler(Mapping):
     """A read-only mapping to use instead of the standard dictionary for the speed boost but at the same time I use it to add more functionalities.
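A small sketch of the behavior these custom_types.py hunks add (assuming scrapling 0.2.9 is installed; the sample string is ours): `str` methods now return `TextHandler` instead of plain `str`, and the parsel-style `get()`/`extract()` aliases simply return the handler itself.

```python
from scrapling.core.custom_types import TextHandler

text = TextHandler('  {"title": "Tipping the Velvet"}  ')

cleaned = text.strip()                 # strip() now returns a TextHandler, not a plain str
assert isinstance(cleaned, TextHandler)

print(cleaned.json())                  # parsed via orjson loads(str(self)): {'title': 'Tipping the Velvet'}
print(cleaned.get() is cleaned)        # True -- get()/extract_first() just return self
```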
{scrapling-0.2.7 → scrapling-0.2.9}/scrapling/core/storage_adaptors.py

@@ -1,16 +1,15 @@
-import orjson
 import sqlite3
-import logging
 import threading
-from hashlib import sha256
 from abc import ABC, abstractmethod
+from hashlib import sha256
 
-
-from scrapling.core.utils import _StorageTools, cache
-
+import orjson
 from lxml import html
 from tldextract import extract as tld
 
+from scrapling.core._types import Dict, Optional, Union
+from scrapling.core.utils import _StorageTools, log, lru_cache
+
 
 
 class StorageSystemMixin(ABC):
     # If you want to make your own storage system, you have to inherit from this
@@ -20,7 +19,7 @@ class StorageSystemMixin(ABC):
         """
         self.url = url
 
-    @
+    @lru_cache(None, typed=True)
     def _get_base_url(self, default_value: str = 'default') -> str:
         if not self.url or type(self.url) is not str:
             return default_value
@@ -52,7 +51,7 @@
         raise NotImplementedError('Storage system must implement `save` method')
 
     @staticmethod
-    @
+    @lru_cache(None, typed=True)
     def _get_hash(identifier: str) -> str:
         """If you want to hash identifier in your storage system, use this safer"""
         identifier = identifier.lower().strip()
@@ -64,7 +63,7 @@
         return f"{hash_value}_{len(identifier)}"  # Length to reduce collision chance
 
 
-@
+@lru_cache(None, typed=True)
 class SQLiteStorageSystem(StorageSystemMixin):
     """The recommended system to use, it's race condition safe and thread safe.
     Mainly built so the library can run in threaded frameworks like scrapy or threaded tools
@@ -86,7 +85,7 @@ class SQLiteStorageSystem(StorageSystemMixin):
         self.connection.execute("PRAGMA journal_mode=WAL")
         self.cursor = self.connection.cursor()
         self._setup_database()
-
+        log.debug(
             f'Storage system loaded with arguments (storage_file="{storage_file}", url="{url}")'
         )
 
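The storage hunks above swap the old `cache` helper for `lru_cache` imported from `scrapling.core.utils`. A self-contained sketch of the same memoization pattern using only the standard library; only the lower/strip line and the length-suffix comment come from the diff, while the sha256-over-encoded-bytes step and the function name are our assumptions:

```python
from functools import lru_cache
from hashlib import sha256


@lru_cache(None, typed=True)  # same decorator arguments as in the diff above
def get_hash(identifier: str) -> str:
    """Hash an identifier once; repeated calls with the same string are served from the cache."""
    identifier = identifier.lower().strip()
    hash_value = sha256(identifier.encode()).hexdigest()  # encoding step assumed, not shown in the diff
    return f"{hash_value}_{len(identifier)}"  # length suffix to reduce collision chance


print(get_hash("https://example.com"))
print(get_hash.cache_info())  # a second identical call would show up as a hit here
```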
{scrapling-0.2.7 → scrapling-0.2.9}/scrapling/core/translator.py

@@ -10,15 +10,14 @@ So you don't have to learn a new selectors/api method like what bs4 done with so
 
 import re
 
-from w3lib.html import HTML5_WHITESPACE
-from scrapling.core.utils import cache
-from scrapling.core._types import Any, Optional, Protocol, Self
-
-from cssselect.xpath import ExpressionError
-from cssselect.xpath import XPathExpr as OriginalXPathExpr
 from cssselect import HTMLTranslator as OriginalHTMLTranslator
 from cssselect.parser import Element, FunctionalPseudoElement, PseudoElement
+from cssselect.xpath import ExpressionError
+from cssselect.xpath import XPathExpr as OriginalXPathExpr
+from w3lib.html import HTML5_WHITESPACE
 
+from scrapling.core._types import Any, Optional, Protocol, Self
+from scrapling.core.utils import lru_cache
 
 regex = f"[{HTML5_WHITESPACE}]+"
 replace_html5_whitespaces = re.compile(regex).sub
@@ -140,6 +139,6 @@ class TranslatorMixin:
 
 
 class HTMLTranslator(TranslatorMixin, OriginalHTMLTranslator):
-    @
+    @lru_cache(maxsize=256)
     def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str:
         return super().css_to_xpath(css, prefix)