scrapling 0.2__tar.gz → 0.2.1__tar.gz
Sign up to get free protection for your applications and to get access to all the features.
- {scrapling-0.2/scrapling.egg-info → scrapling-0.2.1}/PKG-INFO +45 -17
- {scrapling-0.2 → scrapling-0.2.1}/README.md +43 -15
- {scrapling-0.2 → scrapling-0.2.1}/scrapling/__init__.py +1 -1
- {scrapling-0.2 → scrapling-0.2.1}/scrapling/core/utils.py +13 -1
- {scrapling-0.2 → scrapling-0.2.1}/scrapling/engines/camo.py +15 -8
- {scrapling-0.2 → scrapling-0.2.1}/scrapling/engines/pw.py +7 -2
- {scrapling-0.2 → scrapling-0.2.1}/scrapling/engines/static.py +9 -9
- {scrapling-0.2 → scrapling-0.2.1}/scrapling/engines/toolbelt/__init__.py +1 -0
- {scrapling-0.2 → scrapling-0.2.1}/scrapling/engines/toolbelt/custom.py +16 -30
- {scrapling-0.2 → scrapling-0.2.1}/scrapling/engines/toolbelt/navigation.py +34 -0
- {scrapling-0.2 → scrapling-0.2.1}/scrapling/fetchers.py +21 -13
- {scrapling-0.2 → scrapling-0.2.1}/scrapling/parser.py +8 -20
- scrapling-0.2.1/scrapling/py.typed +1 -0
- {scrapling-0.2 → scrapling-0.2.1/scrapling.egg-info}/PKG-INFO +45 -17
- {scrapling-0.2 → scrapling-0.2.1}/scrapling.egg-info/SOURCES.txt +1 -0
- {scrapling-0.2 → scrapling-0.2.1}/scrapling.egg-info/requires.txt +1 -1
- {scrapling-0.2 → scrapling-0.2.1}/setup.cfg +2 -2
- {scrapling-0.2 → scrapling-0.2.1}/setup.py +2 -2
- {scrapling-0.2 → scrapling-0.2.1}/LICENSE +0 -0
- {scrapling-0.2 → scrapling-0.2.1}/MANIFEST.in +0 -0
- {scrapling-0.2 → scrapling-0.2.1}/scrapling/core/__init__.py +0 -0
- {scrapling-0.2 → scrapling-0.2.1}/scrapling/core/_types.py +0 -0
- {scrapling-0.2 → scrapling-0.2.1}/scrapling/core/custom_types.py +0 -0
- {scrapling-0.2 → scrapling-0.2.1}/scrapling/core/mixins.py +0 -0
- {scrapling-0.2 → scrapling-0.2.1}/scrapling/core/storage_adaptors.py +0 -0
- {scrapling-0.2 → scrapling-0.2.1}/scrapling/core/translator.py +0 -0
- {scrapling-0.2 → scrapling-0.2.1}/scrapling/engines/__init__.py +0 -0
- {scrapling-0.2 → scrapling-0.2.1}/scrapling/engines/constants.py +0 -0
- {scrapling-0.2 → scrapling-0.2.1}/scrapling/engines/toolbelt/fingerprints.py +0 -0
- {scrapling-0.2 → scrapling-0.2.1}/scrapling.egg-info/dependency_links.txt +0 -0
- {scrapling-0.2 → scrapling-0.2.1}/scrapling.egg-info/not-zip-safe +0 -0
- {scrapling-0.2 → scrapling-0.2.1}/scrapling.egg-info/top_level.txt +0 -0
- {scrapling-0.2 → scrapling-0.2.1}/tests/__init__.py +0 -0
- {scrapling-0.2 → scrapling-0.2.1}/tests/fetchers/__init__.py +0 -0
- {scrapling-0.2 → scrapling-0.2.1}/tests/fetchers/test_camoufox.py +0 -0
- {scrapling-0.2 → scrapling-0.2.1}/tests/fetchers/test_httpx.py +0 -0
- {scrapling-0.2 → scrapling-0.2.1}/tests/fetchers/test_playwright.py +0 -0
- {scrapling-0.2 → scrapling-0.2.1}/tests/parser/__init__.py +0 -0
- {scrapling-0.2 → scrapling-0.2.1}/tests/parser/test_automatch.py +0 -0
- {scrapling-0.2 → scrapling-0.2.1}/tests/parser/test_general.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: scrapling
|
3
|
-
Version: 0.2
|
3
|
+
Version: 0.2.1
|
4
4
|
Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
|
5
5
|
Home-page: https://github.com/D4Vinci/Scrapling
|
6
6
|
Author: Karim Shoair
|
@@ -41,7 +41,7 @@ Requires-Dist: tldextract
|
|
41
41
|
Requires-Dist: httpx[brotli,zstd]
|
42
42
|
Requires-Dist: playwright
|
43
43
|
Requires-Dist: rebrowser-playwright
|
44
|
-
Requires-Dist: camoufox>=0.3.
|
44
|
+
Requires-Dist: camoufox>=0.3.9
|
45
45
|
Requires-Dist: browserforge
|
46
46
|
|
47
47
|
# 🕷️ Scrapling: Undetectable, Lightning-Fast, and Adaptive Web Scraping for Python
|
@@ -54,15 +54,31 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha
|
|
54
54
|
```python
|
55
55
|
>> from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
|
56
56
|
# Fetch websites' source under the radar!
|
57
|
-
>>
|
58
|
-
>> print(
|
57
|
+
>> page = StealthyFetcher().fetch('https://example.com', headless=True, network_idle=True)
|
58
|
+
>> print(page.status)
|
59
59
|
200
|
60
|
-
>> page = fetcher.adaptor
|
61
60
|
>> products = page.css('.product', auto_save=True) # Scrape data that survives website design changes!
|
62
61
|
>> # Later, if the website structure changes, pass `auto_match=True`
|
63
62
|
>> products = page.css('.product', auto_match=True) # and Scrapling still finds them!
|
64
63
|
```
|
65
64
|
|
65
|
+
# Sponsors
|
66
|
+
|
67
|
+
[Evomi](https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling) is your Swiss Quality Proxy Provider, starting at **$0.49/GB**
|
68
|
+
|
69
|
+
- 👩‍💻 **$0.49 per GB Residential Proxies**: Our price is unbeatable
|
70
|
+
- 👩‍💻 **24/7 Expert Support**: We will join your Slack Channel
|
71
|
+
- 🌍 **Global Presence**: Available in 150+ Countries
|
72
|
+
- ⚡ **Low Latency**
|
73
|
+
- 🔒 **Swiss Quality and Privacy**
|
74
|
+
- 🎁 **Free Trial**
|
75
|
+
- 🛡️ **99.9% Uptime**
|
76
|
+
- 🤝 **Special IP Pool selection**: Optimize for fast, quality or quantity of ips
|
77
|
+
- 🔧 **Easy Integration**: Compatible with most software and programming languages
|
78
|
+
|
79
|
+
[](https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling)
|
80
|
+
---
|
81
|
+
|
66
82
|
## Table of content
|
67
83
|
* [Key Features](#key-features)
|
68
84
|
* [Fetch websites as you prefer](#fetch-websites-as-you-prefer)
|
@@ -95,7 +111,7 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha
|
|
95
111
|
* [Can Scrapling replace code built on top of BeautifulSoup4?](#can-scrapling-replace-code-built-on-top-of-beautifulsoup4)
|
96
112
|
* [Can Scrapling replace code built on top of AutoScraper?](#can-scrapling-replace-code-built-on-top-of-autoscraper)
|
97
113
|
* [Is Scrapling thread-safe?](#is-scrapling-thread-safe)
|
98
|
-
* [Sponsors](#sponsors)
|
114
|
+
* [More Sponsors!](#more-sponsors)
|
99
115
|
* [Contributing](#contributing)
|
100
116
|
* [Disclaimer for Scrapling Project](#disclaimer-for-scrapling-project)
|
101
117
|
* [License](#license)
|
@@ -136,7 +152,7 @@ from scrapling import Fetcher
|
|
136
152
|
fetcher = Fetcher(auto_match=False)
|
137
153
|
|
138
154
|
# Fetch a web page and create an Adaptor instance
|
139
|
-
page = fetcher.get('https://quotes.toscrape.com/', stealthy_headers=True)
|
155
|
+
page = fetcher.get('https://quotes.toscrape.com/', stealthy_headers=True)
|
140
156
|
# Get all strings in the full page
|
141
157
|
page.get_all_text(ignore_tags=('script', 'style'))
|
142
158
|
|
@@ -246,6 +262,8 @@ All fetcher-type classes are imported in the same way
|
|
246
262
|
from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
|
247
263
|
```
|
248
264
|
And all of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `storage`, `storage_args`, and `debug` which are the same ones you give to the `Adaptor` class.
|
265
|
+
|
266
|
+
Also, the `Response` object returned from all fetchers is the same as the `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`. The `cookies`, `headers`, and `request_headers` attributes are always of type `dictionary`.
|
249
267
|
> [!NOTE]
|
250
268
|
> The `auto_match` argument is enabled by default which is the one you should care about the most as you will see later.
|
251
269
|
### Fetcher
|
@@ -265,6 +283,8 @@ This class is built on top of [Camoufox](https://github.com/daijro/camoufox) whi
|
|
265
283
|
>> page.status == 200
|
266
284
|
True
|
267
285
|
```
|
286
|
+
> Note: all requests made by this fetcher wait by default for all JS to be fully loaded and executed, so you don't have to :)
|
287
|
+
|
268
288
|
<details><summary><strong>For the sake of simplicity, expand this for the complete list of arguments</strong></summary>
|
269
289
|
|
270
290
|
| Argument | Description | Optional |
|
@@ -283,6 +303,8 @@ True
|
|
283
303
|
| network_idle | Wait for the page until there are no network connections for at least 500 ms. | ✔️ |
|
284
304
|
| timeout | The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000. | ✔️ |
|
285
305
|
| wait_selector | Wait for a specific css selector to be in a specific state. | ✔️ |
|
306
|
+
| proxy | The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only. | ✔️ |
|
307
|
+
| os_randomize | If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS. | ✔️ |
|
286
308
|
| wait_selector_state | The state to wait for the selector given with `wait_selector`. _Default state is `attached`._ | ✔️ |
|
287
309
|
|
288
310
|
</details>
|
@@ -293,9 +315,11 @@ This list isn't final so expect a lot more additions and flexibility to be added
|
|
293
315
|
This class is built on top of [Playwright](https://playwright.dev/python/) which currently provides 4 main run options but they can be mixed as you want.
|
294
316
|
```python
|
295
317
|
>> page = PlayWrightFetcher().fetch('https://www.google.com/search?q=%22Scrapling%22', disable_resources=True) # Vanilla Playwright option
|
296
|
-
>> page.
|
318
|
+
>> page.css_first("#search a::attr(href)")
|
297
319
|
'https://github.com/D4Vinci/Scrapling'
|
298
320
|
```
|
321
|
+
> Note: all requests made by this fetcher wait by default for all JS to be fully loaded and executed, so you don't have to :)
|
322
|
+
|
299
323
|
Using this Fetcher class, you can make requests with:
|
300
324
|
1) Vanilla Playwright without any modifications other than the ones you chose.
|
301
325
|
2) Stealthy Playwright with the stealth mode I wrote for it. It's still a WIP but it bypasses many online tests like [Sannysoft's](https://bot.sannysoft.com/).</br> Some of the things this fetcher's stealth mode does include:
|
@@ -323,6 +347,7 @@ Add that to a lot of controlling/hiding options as you will see in the arguments
|
|
323
347
|
| wait_selector_state | The state to wait for the selector given with `wait_selector`. _Default state is `attached`._ | ✔️ |
|
324
348
|
| google_search | Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name. | ✔️ |
|
325
349
|
| extra_headers | A dictionary of extra headers to add to the request. The referer set by the `google_search` argument takes priority over the referer set here if used together. | ✔️ |
|
350
|
+
| proxy | The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only. | ✔️ |
|
326
351
|
| hide_canvas | Add random noise to canvas operations to prevent fingerprinting. | ✔️ |
|
327
352
|
| disable_webgl | Disables WebGL and WebGL 2.0 support entirely. | ✔️ |
|
328
353
|
| stealth | Enables stealth mode, always check the documentation to see what stealth mode does currently. | ✔️ |
|
@@ -387,7 +412,7 @@ You can search for a specific ancestor of an element that satisfies a function,
|
|
387
412
|
### Content-based Selection & Finding Similar Elements
|
388
413
|
You can select elements by their text content in multiple ways, here's a full example on another website:
|
389
414
|
```python
|
390
|
-
>>> page = Fetcher().get('https://books.toscrape.com/index.html')
|
415
|
+
>>> page = Fetcher().get('https://books.toscrape.com/index.html')
|
391
416
|
|
392
417
|
>>> page.find_by_text('Tipping the Velvet') # Find the first element whose text fully matches this text
|
393
418
|
<data='<a href="catalogue/tipping-the-velvet_99...' parent='<h3><a href="catalogue/tipping-the-velve...'>
|
@@ -507,11 +532,11 @@ Now let's test the same selector in both versions
|
|
507
532
|
>> old_url = "https://web.archive.org/web/20100102003420/http://stackoverflow.com/"
|
508
533
|
>> new_url = "https://stackoverflow.com/"
|
509
534
|
>>
|
510
|
-
>> page = Fetcher(automatch_domain='stackoverflow.com').get(old_url, timeout=30)
|
535
|
+
>> page = Fetcher(automatch_domain='stackoverflow.com').get(old_url, timeout=30)
|
511
536
|
>> element1 = page.css_first(selector, auto_save=True)
|
512
537
|
>>
|
513
538
|
>> # Same selector but used in the updated website
|
514
|
-
>> page = Fetcher(automatch_domain="stackoverflow.com").get(new_url)
|
539
|
+
>> page = Fetcher(automatch_domain="stackoverflow.com").get(new_url)
|
515
540
|
>> element2 = page.css_first(selector, auto_match=True)
|
516
541
|
>>
|
517
542
|
>> if element1.text == element2.text:
|
@@ -523,7 +548,7 @@ Note that I used a new argument called `automatch_domain`, this is because for S
|
|
523
548
|
In a real-world scenario, the code will be the same except it will use the same URL for both requests so you won't need to use the `automatch_domain` argument. This is the closest example I can give to real-world cases so I hope it didn't confuse you :)
|
524
549
|
|
525
550
|
**Notes:**
|
526
|
-
1. For the two examples above I used one time the `Adaptor` class and the second time the `Fetcher` class just to show you that you can create the `Adaptor` object by yourself if you have the source or fetch the source using any `Fetcher` class then it will create the `Adaptor` object for you
|
551
|
+
1. For the two examples above I used one time the `Adaptor` class and the second time the `Fetcher` class just to show you that you can create the `Adaptor` object by yourself if you have the source or fetch the source using any `Fetcher` class then it will create the `Adaptor` object for you.
|
527
552
|
2. Passing the `auto_save` argument with the `auto_match` argument set to `False` while initializing the Adaptor/Fetcher object will only result in ignoring the `auto_save` argument value and the following warning message
|
528
553
|
```text
|
529
554
|
Argument `auto_save` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info.
|
@@ -564,7 +589,7 @@ Examples to clear any confusion :)
|
|
564
589
|
|
565
590
|
```python
|
566
591
|
>> from scrapling import Fetcher
|
567
|
-
>> page = Fetcher().get('https://quotes.toscrape.com/')
|
592
|
+
>> page = Fetcher().get('https://quotes.toscrape.com/')
|
568
593
|
# Find all elements with tag name `div`.
|
569
594
|
>> page.find_all('div')
|
570
595
|
[<data='<div class="container"> <div class="row...' parent='<body> <div class="container"> <div clas...'>,
|
@@ -727,7 +752,10 @@ There are a lot of deep details skipped here to make this as short as possible s
|
|
727
752
|
|
728
753
|
Note that implementing your storage system can be complex as there are some strict rules such as inheriting from the same abstract class, following the singleton design pattern used in other classes, and more. So make sure to read the docs first.
|
729
754
|
|
730
|
-
|
755
|
+
> [!IMPORTANT]
|
756
|
+
> A website is needed to provide detailed library documentation.<br/>
|
757
|
+
> I'm trying to rush creating the website, researching new ideas, and adding more features/tests/benchmarks but time is tight with too many spinning plates between work, personal life, and working on Scrapling. I have been working on Scrapling for months for free after all.<br/><br/>
|
758
|
+
> If you like `Scrapling` and want it to keep improving then this is a friendly reminder that you can help by supporting me through the [sponsor button](https://github.com/sponsors/D4Vinci).
|
731
759
|
|
732
760
|
## ⚡ Enlightening Questions and FAQs
|
733
761
|
This section addresses common questions about Scrapling, please read this section before opening an issue.
|
@@ -741,8 +769,8 @@ This section addresses common questions about Scrapling, please read this sectio
|
|
741
769
|
|
742
770
|
Together both are used to retrieve the element's unique properties from the database later.
|
743
771
|
4. Later, when you enable the `auto_match` parameter for both the Adaptor instance and the method call, the element properties are retrieved and Scrapling loops over all elements in the page, comparing each one's unique properties to the unique properties we already have for this element; a score is calculated for each one.
|
744
|
-
5.
|
745
|
-
6. The score for each element is stored in the table, and
|
772
|
+
5. Comparing elements is not exact but more about finding how similar these values are, so everything is taken into consideration, even the values' order, like the order in which the element class names were written before and the order in which the same element class names are written now.
|
773
|
+
6. The score for each element is stored in the table, and the element(s) with the highest combined similarity scores are returned.
|
746
774
|
|
747
775
|
### How does the auto-matching work if I didn't pass a URL while initializing the Adaptor object?
|
748
776
|
Not a big problem as it depends on your usage. The word `default` will be used in place of the URL field while saving the element's unique properties. So this will only be an issue if you used the same identifier later for a different website that you didn't pass the URL parameter while initializing it as well. The save process will overwrite the previous data and auto-matching uses the latest saved properties only.
|
@@ -773,7 +801,7 @@ Of course, you can find elements by text/regex, find similar elements in a more
|
|
773
801
|
### Is Scrapling thread-safe?
|
774
802
|
Yes, Scrapling instances are thread-safe. Each Adaptor instance maintains its state.
|
775
803
|
|
776
|
-
## Sponsors
|
804
|
+
## More Sponsors!
|
777
805
|
[](https://www.capsolver.com/?utm_source=github&utm_medium=repo&utm_campaign=scraping&utm_term=Scrapling)
|
778
806
|
|
779
807
|
## Contributing
|
@@ -8,15 +8,31 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha
|
|
8
8
|
```python
|
9
9
|
>> from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
|
10
10
|
# Fetch websites' source under the radar!
|
11
|
-
>>
|
12
|
-
>> print(
|
11
|
+
>> page = StealthyFetcher().fetch('https://example.com', headless=True, network_idle=True)
|
12
|
+
>> print(page.status)
|
13
13
|
200
|
14
|
-
>> page = fetcher.adaptor
|
15
14
|
>> products = page.css('.product', auto_save=True) # Scrape data that survives website design changes!
|
16
15
|
>> # Later, if the website structure changes, pass `auto_match=True`
|
17
16
|
>> products = page.css('.product', auto_match=True) # and Scrapling still finds them!
|
18
17
|
```
|
19
18
|
|
19
|
+
# Sponsors
|
20
|
+
|
21
|
+
[Evomi](https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling) is your Swiss Quality Proxy Provider, starting at **$0.49/GB**
|
22
|
+
|
23
|
+
- 👩‍💻 **$0.49 per GB Residential Proxies**: Our price is unbeatable
|
24
|
+
- 👩‍💻 **24/7 Expert Support**: We will join your Slack Channel
|
25
|
+
- 🌍 **Global Presence**: Available in 150+ Countries
|
26
|
+
- ⚡ **Low Latency**
|
27
|
+
- 🔒 **Swiss Quality and Privacy**
|
28
|
+
- 🎁 **Free Trial**
|
29
|
+
- 🛡️ **99.9% Uptime**
|
30
|
+
- 🤝 **Special IP Pool selection**: Optimize for fast, quality or quantity of ips
|
31
|
+
- 🔧 **Easy Integration**: Compatible with most software and programming languages
|
32
|
+
|
33
|
+
[](https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling)
|
34
|
+
---
|
35
|
+
|
20
36
|
## Table of content
|
21
37
|
* [Key Features](#key-features)
|
22
38
|
* [Fetch websites as you prefer](#fetch-websites-as-you-prefer)
|
@@ -49,7 +65,7 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha
|
|
49
65
|
* [Can Scrapling replace code built on top of BeautifulSoup4?](#can-scrapling-replace-code-built-on-top-of-beautifulsoup4)
|
50
66
|
* [Can Scrapling replace code built on top of AutoScraper?](#can-scrapling-replace-code-built-on-top-of-autoscraper)
|
51
67
|
* [Is Scrapling thread-safe?](#is-scrapling-thread-safe)
|
52
|
-
* [Sponsors](#sponsors)
|
68
|
+
* [More Sponsors!](#more-sponsors)
|
53
69
|
* [Contributing](#contributing)
|
54
70
|
* [Disclaimer for Scrapling Project](#disclaimer-for-scrapling-project)
|
55
71
|
* [License](#license)
|
@@ -90,7 +106,7 @@ from scrapling import Fetcher
|
|
90
106
|
fetcher = Fetcher(auto_match=False)
|
91
107
|
|
92
108
|
# Fetch a web page and create an Adaptor instance
|
93
|
-
page = fetcher.get('https://quotes.toscrape.com/', stealthy_headers=True)
|
109
|
+
page = fetcher.get('https://quotes.toscrape.com/', stealthy_headers=True)
|
94
110
|
# Get all strings in the full page
|
95
111
|
page.get_all_text(ignore_tags=('script', 'style'))
|
96
112
|
|
@@ -200,6 +216,8 @@ All fetcher-type classes are imported in the same way
|
|
200
216
|
from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
|
201
217
|
```
|
202
218
|
And all of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `storage`, `storage_args`, and `debug` which are the same ones you give to the `Adaptor` class.
|
219
|
+
|
220
|
+
Also, the `Response` object returned from all fetchers is the same as the `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`. The `cookies`, `headers`, and `request_headers` attributes are always of type `dictionary`.
|
203
221
|
> [!NOTE]
|
204
222
|
> The `auto_match` argument is enabled by default which is the one you should care about the most as you will see later.
|
205
223
|
### Fetcher
|
@@ -219,6 +237,8 @@ This class is built on top of [Camoufox](https://github.com/daijro/camoufox) whi
|
|
219
237
|
>> page.status == 200
|
220
238
|
True
|
221
239
|
```
|
240
|
+
> Note: all requests made by this fetcher wait by default for all JS to be fully loaded and executed, so you don't have to :)
|
241
|
+
|
222
242
|
<details><summary><strong>For the sake of simplicity, expand this for the complete list of arguments</strong></summary>
|
223
243
|
|
224
244
|
| Argument | Description | Optional |
|
@@ -237,6 +257,8 @@ True
|
|
237
257
|
| network_idle | Wait for the page until there are no network connections for at least 500 ms. | ✔️ |
|
238
258
|
| timeout | The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000. | ✔️ |
|
239
259
|
| wait_selector | Wait for a specific css selector to be in a specific state. | ✔️ |
|
260
|
+
| proxy | The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only. | ✔️ |
|
261
|
+
| os_randomize | If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS. | ✔️ |
|
240
262
|
| wait_selector_state | The state to wait for the selector given with `wait_selector`. _Default state is `attached`._ | ✔️ |
|
241
263
|
|
242
264
|
</details>
|
@@ -247,9 +269,11 @@ This list isn't final so expect a lot more additions and flexibility to be added
|
|
247
269
|
This class is built on top of [Playwright](https://playwright.dev/python/) which currently provides 4 main run options but they can be mixed as you want.
|
248
270
|
```python
|
249
271
|
>> page = PlayWrightFetcher().fetch('https://www.google.com/search?q=%22Scrapling%22', disable_resources=True) # Vanilla Playwright option
|
250
|
-
>> page.
|
272
|
+
>> page.css_first("#search a::attr(href)")
|
251
273
|
'https://github.com/D4Vinci/Scrapling'
|
252
274
|
```
|
275
|
+
> Note: all requests made by this fetcher wait by default for all JS to be fully loaded and executed, so you don't have to :)
|
276
|
+
|
253
277
|
Using this Fetcher class, you can make requests with:
|
254
278
|
1) Vanilla Playwright without any modifications other than the ones you chose.
|
255
279
|
2) Stealthy Playwright with the stealth mode I wrote for it. It's still a WIP but it bypasses many online tests like [Sannysoft's](https://bot.sannysoft.com/).</br> Some of the things this fetcher's stealth mode does include:
|
@@ -277,6 +301,7 @@ Add that to a lot of controlling/hiding options as you will see in the arguments
|
|
277
301
|
| wait_selector_state | The state to wait for the selector given with `wait_selector`. _Default state is `attached`._ | ✔️ |
|
278
302
|
| google_search | Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name. | ✔️ |
|
279
303
|
| extra_headers | A dictionary of extra headers to add to the request. The referer set by the `google_search` argument takes priority over the referer set here if used together. | ✔️ |
|
304
|
+
| proxy | The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only. | ✔️ |
|
280
305
|
| hide_canvas | Add random noise to canvas operations to prevent fingerprinting. | ✔️ |
|
281
306
|
| disable_webgl | Disables WebGL and WebGL 2.0 support entirely. | ✔️ |
|
282
307
|
| stealth | Enables stealth mode, always check the documentation to see what stealth mode does currently. | ✔️ |
|
@@ -341,7 +366,7 @@ You can search for a specific ancestor of an element that satisfies a function,
|
|
341
366
|
### Content-based Selection & Finding Similar Elements
|
342
367
|
You can select elements by their text content in multiple ways, here's a full example on another website:
|
343
368
|
```python
|
344
|
-
>>> page = Fetcher().get('https://books.toscrape.com/index.html')
|
369
|
+
>>> page = Fetcher().get('https://books.toscrape.com/index.html')
|
345
370
|
|
346
371
|
>>> page.find_by_text('Tipping the Velvet') # Find the first element whose text fully matches this text
|
347
372
|
<data='<a href="catalogue/tipping-the-velvet_99...' parent='<h3><a href="catalogue/tipping-the-velve...'>
|
@@ -461,11 +486,11 @@ Now let's test the same selector in both versions
|
|
461
486
|
>> old_url = "https://web.archive.org/web/20100102003420/http://stackoverflow.com/"
|
462
487
|
>> new_url = "https://stackoverflow.com/"
|
463
488
|
>>
|
464
|
-
>> page = Fetcher(automatch_domain='stackoverflow.com').get(old_url, timeout=30)
|
489
|
+
>> page = Fetcher(automatch_domain='stackoverflow.com').get(old_url, timeout=30)
|
465
490
|
>> element1 = page.css_first(selector, auto_save=True)
|
466
491
|
>>
|
467
492
|
>> # Same selector but used in the updated website
|
468
|
-
>> page = Fetcher(automatch_domain="stackoverflow.com").get(new_url)
|
493
|
+
>> page = Fetcher(automatch_domain="stackoverflow.com").get(new_url)
|
469
494
|
>> element2 = page.css_first(selector, auto_match=True)
|
470
495
|
>>
|
471
496
|
>> if element1.text == element2.text:
|
@@ -477,7 +502,7 @@ Note that I used a new argument called `automatch_domain`, this is because for S
|
|
477
502
|
In a real-world scenario, the code will be the same except it will use the same URL for both requests so you won't need to use the `automatch_domain` argument. This is the closest example I can give to real-world cases so I hope it didn't confuse you :)
|
478
503
|
|
479
504
|
**Notes:**
|
480
|
-
1. For the two examples above I used one time the `Adaptor` class and the second time the `Fetcher` class just to show you that you can create the `Adaptor` object by yourself if you have the source or fetch the source using any `Fetcher` class then it will create the `Adaptor` object for you
|
505
|
+
1. For the two examples above I used one time the `Adaptor` class and the second time the `Fetcher` class just to show you that you can create the `Adaptor` object by yourself if you have the source or fetch the source using any `Fetcher` class then it will create the `Adaptor` object for you.
|
481
506
|
2. Passing the `auto_save` argument with the `auto_match` argument set to `False` while initializing the Adaptor/Fetcher object will only result in ignoring the `auto_save` argument value and the following warning message
|
482
507
|
```text
|
483
508
|
Argument `auto_save` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info.
|
@@ -518,7 +543,7 @@ Examples to clear any confusion :)
|
|
518
543
|
|
519
544
|
```python
|
520
545
|
>> from scrapling import Fetcher
|
521
|
-
>> page = Fetcher().get('https://quotes.toscrape.com/')
|
546
|
+
>> page = Fetcher().get('https://quotes.toscrape.com/')
|
522
547
|
# Find all elements with tag name `div`.
|
523
548
|
>> page.find_all('div')
|
524
549
|
[<data='<div class="container"> <div class="row...' parent='<body> <div class="container"> <div clas...'>,
|
@@ -681,7 +706,10 @@ There are a lot of deep details skipped here to make this as short as possible s
|
|
681
706
|
|
682
707
|
Note that implementing your storage system can be complex as there are some strict rules such as inheriting from the same abstract class, following the singleton design pattern used in other classes, and more. So make sure to read the docs first.
|
683
708
|
|
684
|
-
|
709
|
+
> [!IMPORTANT]
|
710
|
+
> A website is needed to provide detailed library documentation.<br/>
|
711
|
+
> I'm trying to rush creating the website, researching new ideas, and adding more features/tests/benchmarks but time is tight with too many spinning plates between work, personal life, and working on Scrapling. I have been working on Scrapling for months for free after all.<br/><br/>
|
712
|
+
> If you like `Scrapling` and want it to keep improving then this is a friendly reminder that you can help by supporting me through the [sponsor button](https://github.com/sponsors/D4Vinci).
|
685
713
|
|
686
714
|
## ⚡ Enlightening Questions and FAQs
|
687
715
|
This section addresses common questions about Scrapling, please read this section before opening an issue.
|
@@ -695,8 +723,8 @@ This section addresses common questions about Scrapling, please read this sectio
|
|
695
723
|
|
696
724
|
Together both are used to retrieve the element's unique properties from the database later.
|
697
725
|
4. Later, when you enable the `auto_match` parameter for both the Adaptor instance and the method call, the element properties are retrieved and Scrapling loops over all elements in the page, comparing each one's unique properties to the unique properties we already have for this element; a score is calculated for each one.
|
698
|
-
5.
|
699
|
-
6. The score for each element is stored in the table, and
|
726
|
+
5. Comparing elements is not exact but more about finding how similar these values are, so everything is taken into consideration, even the values' order, like the order in which the element class names were written before and the order in which the same element class names are written now.
|
727
|
+
6. The score for each element is stored in the table, and the element(s) with the highest combined similarity scores are returned.
|
700
728
|
|
701
729
|
### How does the auto-matching work if I didn't pass a URL while initializing the Adaptor object?
|
702
730
|
Not a big problem as it depends on your usage. The word `default` will be used in place of the URL field while saving the element's unique properties. So this will only be an issue if you used the same identifier later for a different website that you didn't pass the URL parameter while initializing it as well. The save process will overwrite the previous data and auto-matching uses the latest saved properties only.
|
@@ -727,7 +755,7 @@ Of course, you can find elements by text/regex, find similar elements in a more
|
|
727
755
|
### Is Scrapling thread-safe?
|
728
756
|
Yes, Scrapling instances are thread-safe. Each Adaptor instance maintains its state.
|
729
757
|
|
730
|
-
## Sponsors
|
758
|
+
## More Sponsors!
|
731
759
|
[](https://www.capsolver.com/?utm_source=github&utm_medium=repo&utm_campaign=scraping&utm_term=Scrapling)
|
732
760
|
|
733
761
|
## Contributing
|
@@ -4,7 +4,7 @@ from scrapling.parser import Adaptor, Adaptors
|
|
4
4
|
from scrapling.core.custom_types import TextHandler, AttributesHandler
|
5
5
|
|
6
6
|
__author__ = "Karim Shoair (karim.shoair@pm.me)"
|
7
|
-
__version__ = "0.2"
|
7
|
+
__version__ = "0.2.1"
|
8
8
|
__copyright__ = "Copyright (c) 2024 Karim Shoair"
|
9
9
|
|
10
10
|
|
@@ -4,8 +4,9 @@ from itertools import chain
|
|
4
4
|
# Using cache on top of a class is a brilliant way to achieve the Singleton design pattern without much code
|
5
5
|
from functools import lru_cache as cache # functools.cache is available on Python 3.9+ only so let's keep lru_cache
|
6
6
|
|
7
|
-
from scrapling.core._types import Dict, Iterable, Any
|
7
|
+
from scrapling.core._types import Dict, Iterable, Any, Union
|
8
8
|
|
9
|
+
import orjson
|
9
10
|
from lxml import html
|
10
11
|
|
11
12
|
html_forbidden = {html.HtmlComment, }
|
@@ -18,6 +19,17 @@ logging.basicConfig(
|
|
18
19
|
)
|
19
20
|
|
20
21
|
|
22
|
+
def is_jsonable(content: Union[bytes, str]) -> bool:
|
23
|
+
if type(content) is bytes:
|
24
|
+
content = content.decode()
|
25
|
+
|
26
|
+
try:
|
27
|
+
_ = orjson.loads(content)
|
28
|
+
return True
|
29
|
+
except orjson.JSONDecodeError:
|
30
|
+
return False
|
31
|
+
|
32
|
+
|
21
33
|
@cache(None, typed=True)
|
22
34
|
def setup_basic_logging(level: str = 'debug'):
|
23
35
|
levels = {
|
@@ -7,6 +7,7 @@ from scrapling.engines.toolbelt import (
|
|
7
7
|
get_os_name,
|
8
8
|
intercept_route,
|
9
9
|
check_type_validity,
|
10
|
+
construct_proxy_dict,
|
10
11
|
generate_convincing_referer,
|
11
12
|
)
|
12
13
|
|
@@ -18,7 +19,8 @@ class CamoufoxEngine:
|
|
18
19
|
self, headless: Optional[Union[bool, Literal['virtual']]] = True, block_images: Optional[bool] = False, disable_resources: Optional[bool] = False,
|
19
20
|
block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = False, network_idle: Optional[bool] = False, humanize: Optional[Union[bool, float]] = True,
|
20
21
|
timeout: Optional[float] = 30000, page_action: Callable = do_nothing, wait_selector: Optional[str] = None, addons: Optional[List[str]] = None,
|
21
|
-
wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
|
22
|
+
wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
|
23
|
+
proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, adaptor_arguments: Dict = None
|
22
24
|
):
|
23
25
|
"""An engine that utilizes Camoufox library, check the `StealthyFetcher` class for more documentation.
|
24
26
|
|
@@ -33,12 +35,14 @@ class CamoufoxEngine:
|
|
33
35
|
:param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
|
34
36
|
:param allow_webgl: Whether to allow WebGL. To prevent leaks, only use this for special cases.
|
35
37
|
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
38
|
+
:param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
|
36
39
|
:param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
|
37
40
|
:param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
|
38
41
|
:param wait_selector: Wait for a specific css selector to be in a specific state.
|
39
42
|
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`.
|
40
43
|
:param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
|
41
44
|
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
45
|
+
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
42
46
|
:param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
|
43
47
|
"""
|
44
48
|
self.headless = headless
|
@@ -48,7 +52,9 @@ class CamoufoxEngine:
|
|
48
52
|
self.allow_webgl = bool(allow_webgl)
|
49
53
|
self.network_idle = bool(network_idle)
|
50
54
|
self.google_search = bool(google_search)
|
55
|
+
self.os_randomize = bool(os_randomize)
|
51
56
|
self.extra_headers = extra_headers or {}
|
57
|
+
self.proxy = construct_proxy_dict(proxy)
|
52
58
|
self.addons = addons or []
|
53
59
|
self.humanize = humanize
|
54
60
|
self.timeout = check_type_validity(timeout, [int, float], 30000)
|
@@ -66,17 +72,18 @@ class CamoufoxEngine:
|
|
66
72
|
"""Opens up the browser and do your request based on your chosen options.
|
67
73
|
|
68
74
|
:param url: Target url.
|
69
|
-
:return: A Response object
|
75
|
+
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
70
76
|
"""
|
71
77
|
with Camoufox(
|
72
|
-
|
73
|
-
block_images=self.block_images, # Careful! it makes some websites doesn't finish loading at all like stackoverflow even in headful
|
74
|
-
os=get_os_name(),
|
75
|
-
block_webrtc=self.block_webrtc,
|
76
|
-
allow_webgl=self.allow_webgl,
|
78
|
+
proxy=self.proxy,
|
77
79
|
addons=self.addons,
|
80
|
+
headless=self.headless,
|
78
81
|
humanize=self.humanize,
|
79
|
-
i_know_what_im_doing=True, # To turn warnings off with user configurations
|
82
|
+
i_know_what_im_doing=True, # To turn warnings off with the user configurations
|
83
|
+
allow_webgl=self.allow_webgl,
|
84
|
+
block_webrtc=self.block_webrtc,
|
85
|
+
block_images=self.block_images, # Careful! it makes some websites doesn't finish loading at all like stackoverflow even in headful
|
86
|
+
os=None if self.os_randomize else get_os_name(),
|
80
87
|
) as browser:
|
81
88
|
page = browser.new_page()
|
82
89
|
page.set_default_navigation_timeout(self.timeout)
|
@@ -9,8 +9,9 @@ from scrapling.engines.toolbelt import (
|
|
9
9
|
js_bypass_path,
|
10
10
|
intercept_route,
|
11
11
|
generate_headers,
|
12
|
-
check_type_validity,
|
13
12
|
construct_cdp_url,
|
13
|
+
check_type_validity,
|
14
|
+
construct_proxy_dict,
|
14
15
|
generate_convincing_referer,
|
15
16
|
)
|
16
17
|
|
@@ -33,6 +34,7 @@ class PlaywrightEngine:
|
|
33
34
|
nstbrowser_config: Optional[Dict] = None,
|
34
35
|
google_search: Optional[bool] = True,
|
35
36
|
extra_headers: Optional[Dict[str, str]] = None,
|
37
|
+
proxy: Optional[Union[str, Dict[str, str]]] = None,
|
36
38
|
adaptor_arguments: Dict = None
|
37
39
|
):
|
38
40
|
"""An engine that utilizes PlayWright library, check the `PlayWrightFetcher` class for more documentation.
|
@@ -54,6 +56,7 @@ class PlaywrightEngine:
|
|
54
56
|
:param nstbrowser_mode: Enables NSTBrowser mode, it has to be used with the `cdp_url` argument or it will be completely ignored.
|
55
57
|
:param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
|
56
58
|
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
59
|
+
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
57
60
|
:param nstbrowser_config: The config you want to send with requests to the NSTBrowser. If left empty, Scrapling defaults to an optimized NSTBrowser's docker browserless config.
|
58
61
|
:param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
|
59
62
|
"""
|
@@ -65,6 +68,7 @@ class PlaywrightEngine:
|
|
65
68
|
self.disable_webgl = bool(disable_webgl)
|
66
69
|
self.google_search = bool(google_search)
|
67
70
|
self.extra_headers = extra_headers or {}
|
71
|
+
self.proxy = construct_proxy_dict(proxy)
|
68
72
|
self.cdp_url = cdp_url
|
69
73
|
self.useragent = useragent
|
70
74
|
self.timeout = check_type_validity(timeout, [int, float], 30000)
|
@@ -112,7 +116,7 @@ class PlaywrightEngine:
|
|
112
116
|
"""Opens up the browser and do your request based on your chosen options.
|
113
117
|
|
114
118
|
:param url: Target url.
|
115
|
-
:return: A Response object
|
119
|
+
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
116
120
|
"""
|
117
121
|
if not self.stealth:
|
118
122
|
from playwright.sync_api import sync_playwright
|
@@ -151,6 +155,7 @@ class PlaywrightEngine:
|
|
151
155
|
locale='en-US',
|
152
156
|
is_mobile=False,
|
153
157
|
has_touch=False,
|
158
|
+
proxy=self.proxy,
|
154
159
|
color_scheme='dark', # Bypasses the 'prefersLightColor' check in creepjs
|
155
160
|
user_agent=useragent,
|
156
161
|
device_scale_factor=2,
|
@@ -48,7 +48,7 @@ class StaticEngine:
|
|
48
48
|
"""Takes httpx response and generates `Response` object from it.
|
49
49
|
|
50
50
|
:param response: httpx response object
|
51
|
-
:return: A Response object
|
51
|
+
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
52
52
|
"""
|
53
53
|
return Response(
|
54
54
|
url=str(response.url),
|
@@ -69,9 +69,9 @@ class StaticEngine:
|
|
69
69
|
:param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
|
70
70
|
create a referer header as if this request had come from Google's search of this URL's domain.
|
71
71
|
:param kwargs: Any additional keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
|
72
|
-
:return: A Response object
|
72
|
+
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
73
73
|
"""
|
74
|
-
headers = self._headers_job(kwargs.
|
74
|
+
headers = self._headers_job(kwargs.pop('headers', {}), url, stealthy_headers)
|
75
75
|
request = httpx.get(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
|
76
76
|
return self._prepare_response(request)
|
77
77
|
|
@@ -81,9 +81,9 @@ class StaticEngine:
|
|
81
81
|
:param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
|
82
82
|
create a referer header as if this request had come from Google's search of this URL's domain.
|
83
83
|
:param kwargs: Any additional keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
|
84
|
-
:return: A Response object
|
84
|
+
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
85
85
|
"""
|
86
|
-
headers = self._headers_job(kwargs.
|
86
|
+
headers = self._headers_job(kwargs.pop('headers', {}), url, stealthy_headers)
|
87
87
|
request = httpx.post(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
|
88
88
|
return self._prepare_response(request)
|
89
89
|
|
@@ -93,9 +93,9 @@ class StaticEngine:
|
|
93
93
|
:param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
|
94
94
|
create a referer header as if this request had come from Google's search of this URL's domain.
|
95
95
|
:param kwargs: Any additional keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
|
96
|
-
:return: A Response object
|
96
|
+
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
97
97
|
"""
|
98
|
-
headers = self._headers_job(kwargs.
|
98
|
+
headers = self._headers_job(kwargs.pop('headers', {}), url, stealthy_headers)
|
99
99
|
request = httpx.delete(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
|
100
100
|
return self._prepare_response(request)
|
101
101
|
|
@@ -105,8 +105,8 @@ class StaticEngine:
|
|
105
105
|
:param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
|
106
106
|
create a referer header as if this request had come from Google's search of this URL's domain.
|
107
107
|
:param kwargs: Any additional keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
|
108
|
-
:return: A Response object
|
108
|
+
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
109
109
|
"""
|
110
|
-
headers = self._headers_job(kwargs.
|
110
|
+
headers = self._headers_job(kwargs.pop('headers', {}), url, stealthy_headers)
|
111
111
|
request = httpx.put(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
|
112
112
|
return self._prepare_response(request)
|