scrapling 0.2.tar.gz → 0.2.2.tar.gz

Files changed (41)
  1. {scrapling-0.2/scrapling.egg-info → scrapling-0.2.2}/PKG-INFO +58 -19
  2. {scrapling-0.2 → scrapling-0.2.2}/README.md +56 -17
  3. {scrapling-0.2 → scrapling-0.2.2}/scrapling/__init__.py +1 -1
  4. {scrapling-0.2 → scrapling-0.2.2}/scrapling/core/utils.py +13 -1
  5. scrapling-0.2.2/scrapling/defaults.py +6 -0
  6. {scrapling-0.2 → scrapling-0.2.2}/scrapling/engines/camo.py +17 -10
  7. {scrapling-0.2 → scrapling-0.2.2}/scrapling/engines/pw.py +9 -4
  8. {scrapling-0.2 → scrapling-0.2.2}/scrapling/engines/static.py +11 -11
  9. {scrapling-0.2 → scrapling-0.2.2}/scrapling/engines/toolbelt/__init__.py +1 -0
  10. {scrapling-0.2 → scrapling-0.2.2}/scrapling/engines/toolbelt/custom.py +16 -31
  11. {scrapling-0.2 → scrapling-0.2.2}/scrapling/engines/toolbelt/navigation.py +34 -0
  12. {scrapling-0.2 → scrapling-0.2.2}/scrapling/fetchers.py +21 -13
  13. {scrapling-0.2 → scrapling-0.2.2}/scrapling/parser.py +19 -22
  14. scrapling-0.2.2/scrapling/py.typed +1 -0
  15. {scrapling-0.2 → scrapling-0.2.2/scrapling.egg-info}/PKG-INFO +58 -19
  16. {scrapling-0.2 → scrapling-0.2.2}/scrapling.egg-info/SOURCES.txt +2 -0
  17. {scrapling-0.2 → scrapling-0.2.2}/scrapling.egg-info/requires.txt +1 -1
  18. {scrapling-0.2 → scrapling-0.2.2}/setup.cfg +2 -2
  19. {scrapling-0.2 → scrapling-0.2.2}/setup.py +2 -2
  20. {scrapling-0.2 → scrapling-0.2.2}/LICENSE +0 -0
  21. {scrapling-0.2 → scrapling-0.2.2}/MANIFEST.in +0 -0
  22. {scrapling-0.2 → scrapling-0.2.2}/scrapling/core/__init__.py +0 -0
  23. {scrapling-0.2 → scrapling-0.2.2}/scrapling/core/_types.py +0 -0
  24. {scrapling-0.2 → scrapling-0.2.2}/scrapling/core/custom_types.py +0 -0
  25. {scrapling-0.2 → scrapling-0.2.2}/scrapling/core/mixins.py +0 -0
  26. {scrapling-0.2 → scrapling-0.2.2}/scrapling/core/storage_adaptors.py +0 -0
  27. {scrapling-0.2 → scrapling-0.2.2}/scrapling/core/translator.py +0 -0
  28. {scrapling-0.2 → scrapling-0.2.2}/scrapling/engines/__init__.py +0 -0
  29. {scrapling-0.2 → scrapling-0.2.2}/scrapling/engines/constants.py +0 -0
  30. {scrapling-0.2 → scrapling-0.2.2}/scrapling/engines/toolbelt/fingerprints.py +0 -0
  31. {scrapling-0.2 → scrapling-0.2.2}/scrapling.egg-info/dependency_links.txt +0 -0
  32. {scrapling-0.2 → scrapling-0.2.2}/scrapling.egg-info/not-zip-safe +0 -0
  33. {scrapling-0.2 → scrapling-0.2.2}/scrapling.egg-info/top_level.txt +0 -0
  34. {scrapling-0.2 → scrapling-0.2.2}/tests/__init__.py +0 -0
  35. {scrapling-0.2 → scrapling-0.2.2}/tests/fetchers/__init__.py +0 -0
  36. {scrapling-0.2 → scrapling-0.2.2}/tests/fetchers/test_camoufox.py +0 -0
  37. {scrapling-0.2 → scrapling-0.2.2}/tests/fetchers/test_httpx.py +0 -0
  38. {scrapling-0.2 → scrapling-0.2.2}/tests/fetchers/test_playwright.py +0 -0
  39. {scrapling-0.2 → scrapling-0.2.2}/tests/parser/__init__.py +0 -0
  40. {scrapling-0.2 → scrapling-0.2.2}/tests/parser/test_automatch.py +0 -0
  41. {scrapling-0.2 → scrapling-0.2.2}/tests/parser/test_general.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: scrapling
- Version: 0.2
+ Version: 0.2.2
  Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
  Home-page: https://github.com/D4Vinci/Scrapling
  Author: Karim Shoair
@@ -41,7 +41,7 @@ Requires-Dist: tldextract
  Requires-Dist: httpx[brotli,zstd]
  Requires-Dist: playwright
  Requires-Dist: rebrowser-playwright
- Requires-Dist: camoufox>=0.3.7
+ Requires-Dist: camoufox>=0.3.9
  Requires-Dist: browserforge

  # 🕷️ Scrapling: Undetectable, Lightning-Fast, and Adaptive Web Scraping for Python
@@ -52,17 +52,33 @@ Dealing with failing web scrapers due to anti-bot protections or website changes
  Scrapling is a high-performance, intelligent web scraping library for Python that automatically adapts to website changes while significantly outperforming popular alternatives. For both beginners and experts, Scrapling provides powerful features while maintaining simplicity.

  ```python
- >> from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
+ >> from scrapling.defaults import Fetcher, StealthyFetcher, PlayWrightFetcher
  # Fetch websites' source under the radar!
- >> fetcher = StealthyFetcher().fetch('https://example.com', headless=True, disable_resources=True)
- >> print(fetcher.status)
+ >> page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True)
+ >> print(page.status)
  200
- >> page = fetcher.adaptor
  >> products = page.css('.product', auto_save=True) # Scrape data that survives website design changes!
  >> # Later, if the website structure changes, pass `auto_match=True`
  >> products = page.css('.product', auto_match=True) # and Scrapling still finds them!
  ```

+ # Sponsors
+
+ [Evomi](https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling) is your Swiss Quality Proxy Provider, starting at **$0.49/GB**
+
+ - 👩‍💻 **$0.49 per GB Residential Proxies**: Our price is unbeatable
+ - 👩‍💻 **24/7 Expert Support**: We will join your Slack Channel
+ - 🌍 **Global Presence**: Available in 150+ Countries
+ - ⚡ **Low Latency**
+ - 🔒 **Swiss Quality and Privacy**
+ - 🎁 **Free Trial**
+ - 🛡️ **99.9% Uptime**
+ - 🤝 **Special IP Pool selection**: Optimize for speed, quality, or quantity of IPs
+ - 🔧 **Easy Integration**: Compatible with most software and programming languages
+
+ [![Evomi Banner](https://my.evomi.com/images/brand/cta.png)](https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling)
+ ---
+
  ## Table of content
  * [Key Features](#key-features)
  * [Fetch websites as you prefer](#fetch-websites-as-you-prefer)
@@ -95,7 +111,7 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha
  * [Can Scrapling replace code built on top of BeautifulSoup4?](#can-scrapling-replace-code-built-on-top-of-beautifulsoup4)
  * [Can Scrapling replace code built on top of AutoScraper?](#can-scrapling-replace-code-built-on-top-of-autoscraper)
  * [Is Scrapling thread-safe?](#is-scrapling-thread-safe)
- * [Sponsors](#sponsors)
+ * [More Sponsors!](#more-sponsors)
  * [Contributing](#contributing)
  * [Disclaimer for Scrapling Project](#disclaimer-for-scrapling-project)
  * [License](#license)
@@ -136,7 +152,7 @@ from scrapling import Fetcher
  fetcher = Fetcher(auto_match=False)

  # Fetch a web page and create an Adaptor instance
- page = fetcher.get('https://quotes.toscrape.com/', stealthy_headers=True).adaptor
+ page = fetcher.get('https://quotes.toscrape.com/', stealthy_headers=True)
  # Get all strings in the full page
  page.get_all_text(ignore_tags=('script', 'style'))

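The `.adaptor` hops removed throughout this diff reflect the headline API change in 0.2.2: fetchers now return a `Response` object that behaves like an `Adaptor` directly. A minimal before/after sketch of the hunk above (URL taken from the example itself):

```python
from scrapling import Fetcher

fetcher = Fetcher(auto_match=False)

# 0.2 style: parsing lived on a separate `.adaptor` object
# page = fetcher.get('https://quotes.toscrape.com/').adaptor

# 0.2.2 style: the returned Response is itself the parsing object
page = fetcher.get('https://quotes.toscrape.com/', stealthy_headers=True)
print(page.status)                                  # HTTP metadata is available directly
page.get_all_text(ignore_tags=('script', 'style'))  # and so is the Adaptor API
```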
@@ -241,11 +257,22 @@ python -m browserforge update
  ```

  ## Fetching Websites Features
- All fetcher-type classes are imported in the same way
+ You might be a little bit confused by now so let me clear things up. All fetcher-type classes are imported in the same way
  ```python
  from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
  ```
  And all of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `storage`, `storage_args`, and `debug`, which are the same ones you give to the `Adaptor` class.
+
+ If you don't want to pass arguments to the generated `Adaptor` object and want to use the default values, you can use this import instead for cleaner code:
+ ```python
+ from scrapling.defaults import Fetcher, StealthyFetcher, PlayWrightFetcher
+ ```
+ then use it right away without initializing, like:
+ ```python
+ page = StealthyFetcher.fetch('https://example.com')
+ ```
+
+ Also, the `Response` object returned from all fetchers is the same as the `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`. All `cookies`, `headers`, and `request_headers` are always of type `dictionary`.
  > [!NOTE]
  > The `auto_match` argument is enabled by default which is the one you should care about the most as you will see later.
  ### Fetcher
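To make the added `Response` attributes from the hunk above concrete, a short sketch of inspecting a fetch result (printed values illustrative):

```python
from scrapling import Fetcher

page = Fetcher().get('https://quotes.toscrape.com/')

print(page.status)           # e.g. 200
print(page.reason)           # e.g. 'OK'
print(page.cookies)          # always a dictionary
print(page.headers)          # always a dictionary
print(page.request_headers)  # always a dictionary

# Everything else is the familiar Adaptor interface
print(page.css_first('.quote .text::text'))
```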
@@ -265,6 +292,8 @@ This class is built on top of [Camoufox](https://github.com/daijro/camoufox) whi
  >> page.status == 200
  True
  ```
+ > Note: all requests done by this fetcher wait by default for all JS to be fully loaded and executed, so you don't have to :)
+
  <details><summary><strong>For the sake of simplicity, expand this for the complete list of arguments</strong></summary>

  | Argument | Description | Optional |
@@ -283,6 +312,8 @@ True
  | network_idle | Wait for the page until there are no network connections for at least 500 ms. | ✔️ |
  | timeout | The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000. | ✔️ |
  | wait_selector | Wait for a specific css selector to be in a specific state. | ✔️ |
+ | proxy | The proxy to be used with requests; it can be a string or a dictionary with the keys 'server', 'username', and 'password' only. | ✔️ |
+ | os_randomize | If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS. | ✔️ |
  | wait_selector_state | The state to wait for the selector given with `wait_selector`. _Default state is `attached`._ | ✔️ |

  </details>
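A hedged sketch of the two new `StealthyFetcher` arguments from the rows above (proxy endpoint and credentials are placeholders):

```python
from scrapling import StealthyFetcher

fetcher = StealthyFetcher()

# `proxy` as a plain string...
page = fetcher.fetch('https://example.com', proxy='http://127.0.0.1:8080')

# ...or as a dictionary limited to these three keys, plus randomized OS fingerprints
page = fetcher.fetch(
    'https://example.com',
    proxy={'server': 'http://127.0.0.1:8080', 'username': 'user', 'password': 'pass'},
    os_randomize=True,
)
```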
@@ -293,9 +324,11 @@ This list isn't final so expect a lot more additions and flexibility to be added
  This class is built on top of [Playwright](https://playwright.dev/python/) which currently provides 4 main run options but they can be mixed as you want.
  ```python
  >> page = PlayWrightFetcher().fetch('https://www.google.com/search?q=%22Scrapling%22', disable_resources=True) # Vanilla Playwright option
- >> page.adaptor.css_first("#search a::attr(href)")
+ >> page.css_first("#search a::attr(href)")
  'https://github.com/D4Vinci/Scrapling'
  ```
+ > Note: all requests done by this fetcher wait by default for all JS to be fully loaded and executed, so you don't have to :)
+
  Using this Fetcher class, you can make requests with:
  1) Vanilla Playwright without any modifications other than the ones you chose.
  2) Stealthy Playwright with the stealth mode I wrote for it. It's still a WIP but it bypasses many online tests like [Sannysoft's](https://bot.sannysoft.com/).</br> Some of the things this fetcher's stealth mode does include:
@@ -323,6 +356,7 @@ Add that to a lot of controlling/hiding options as you will see in the arguments
  | wait_selector_state | The state to wait for the selector given with `wait_selector`. _Default state is `attached`._ | ✔️ |
  | google_search | Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name. | ✔️ |
  | extra_headers | A dictionary of extra headers to add to the request. The referer set by the `google_search` argument takes priority over the referer set here if used together. | ✔️ |
+ | proxy | The proxy to be used with requests; it can be a string or a dictionary with the keys 'server', 'username', and 'password' only. | ✔️ |
  | hide_canvas | Add random noise to canvas operations to prevent fingerprinting. | ✔️ |
  | disable_webgl | Disables WebGL and WebGL 2.0 support entirely. | ✔️ |
  | stealth | Enables stealth mode, always check the documentation to see what stealth mode does currently. | ✔️ |
@@ -387,7 +421,7 @@ You can search for a specific ancestor of an element that satisfies a function,
  ### Content-based Selection & Finding Similar Elements
  You can select elements by their text content in multiple ways, here's a full example on another website:
  ```python
- >>> page = Fetcher().get('https://books.toscrape.com/index.html').adaptor
+ >>> page = Fetcher().get('https://books.toscrape.com/index.html')

  >>> page.find_by_text('Tipping the Velvet') # Find the first element whose text fully matches this text
  <data='<a href="catalogue/tipping-the-velvet_99...' parent='<h3><a href="catalogue/tipping-the-velve...'>
@@ -507,11 +541,11 @@ Now let's test the same selector in both versions
  >> old_url = "https://web.archive.org/web/20100102003420/http://stackoverflow.com/"
  >> new_url = "https://stackoverflow.com/"
  >>
- >> page = Fetcher(automatch_domain='stackoverflow.com').get(old_url, timeout=30).adaptor
+ >> page = Fetcher(automatch_domain='stackoverflow.com').get(old_url, timeout=30)
  >> element1 = page.css_first(selector, auto_save=True)
  >>
  >> # Same selector but used in the updated website
- >> page = Fetcher(automatch_domain="stackoverflow.com").get(new_url).adaptor
+ >> page = Fetcher(automatch_domain="stackoverflow.com").get(new_url)
  >> element2 = page.css_first(selector, auto_match=True)
  >>
  >> if element1.text == element2.text:
@@ -523,7 +557,7 @@ Note that I used a new argument called `automatch_domain`, this is because for S
  In a real-world scenario, the code will be the same except it will use the same URL for both requests so you won't need to use the `automatch_domain` argument. This is the closest example I can give to real-world cases so I hope it didn't confuse you :)

  **Notes:**
- 1. For the two examples above I used one time the `Adaptor` class and the second time the `Fetcher` class just to show you that you can create the `Adaptor` object by yourself if you have the source or fetch the source using any `Fetcher` class then it will create the `Adaptor` object for you on the `.adaptor` property.
+ 1. For the two examples above, I used the `Adaptor` class once and the `Fetcher` class the other time, just to show that you can create the `Adaptor` object yourself if you have the source, or fetch the source using any `Fetcher` class and it will create the `Adaptor` object for you.
  2. Passing the `auto_save` argument with the `auto_match` argument set to `False` while initializing the Adaptor/Fetcher object will only result in ignoring the `auto_save` argument value and the following warning message
  ```text
  Argument `auto_save` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info.
@@ -564,7 +598,7 @@ Examples to clear any confusion :)

  ```python
  >> from scrapling import Fetcher
- >> page = Fetcher().get('https://quotes.toscrape.com/').adaptor
+ >> page = Fetcher().get('https://quotes.toscrape.com/')
  # Find all elements with tag name `div`.
  >> page.find_all('div')
  [<data='<div class="container"> <div class="row...' parent='<body> <div class="container"> <div clas...'>,
@@ -727,7 +761,10 @@ There are a lot of deep details skipped here to make this as short as possible s

  Note that implementing your storage system can be complex as there are some strict rules such as inheriting from the same abstract class, following the singleton design pattern used in other classes, and more. So make sure to read the docs first.

- To give detailed documentation of the library, it will need a website. I'm trying to rush creating the website, researching new ideas, and adding more features/tests/benchmarks but time is tight with too many spinning plates between work, personal life, and working on Scrapling. But you can help by using the [sponsor button](https://github.com/sponsors/D4Vinci) above :)
+ > [!IMPORTANT]
+ > A website is needed to provide detailed library documentation.<br/>
+ > I'm trying to rush creating the website, researching new ideas, and adding more features/tests/benchmarks but time is tight with too many spinning plates between work, personal life, and working on Scrapling. I have been working on Scrapling for months for free after all.<br/><br/>
+ > If you like `Scrapling` and want it to keep improving then this is a friendly reminder that you can help by supporting me through the [sponsor button](https://github.com/sponsors/D4Vinci).

  ## ⚡ Enlightening Questions and FAQs
  This section addresses common questions about Scrapling, please read this section before opening an issue.
@@ -741,8 +778,8 @@ This section addresses common questions about Scrapling, please read this sectio

  Together both are used to retrieve the element's unique properties from the database later.
  4. Now later when you enable the `auto_match` parameter for both the Adaptor instance and the method call, the element properties are retrieved and Scrapling loops over all elements in the page, compares each one's unique properties to the unique properties we already have for this element, and calculates a score for each one.
- 5. The comparison between elements is not exact but more about finding how similar these values are, so everything is taken into consideration even the values' order like the order in which the element class names were written before and the order in which the same element class names are written now.
- 6. The score for each element is stored in the table, and in the end, the element(s) with the highest combined similarity scores are returned.
+ 5. Comparing elements is not exact but more about finding how similar these values are, so everything is taken into consideration, even the values' order, like the order in which the element class names were written before and the order in which the same element class names are written now.
+ 6. The score for each element is stored in the table, and the element(s) with the highest combined similarity scores are returned.

  ### How does the auto-matching work if I didn't pass a URL while initializing the Adaptor object?
  Not a big problem as it depends on your usage. The word `default` will be used in place of the URL field while saving the element's unique properties. So this will only be an issue if you used the same identifier later for a different website that you didn't pass the URL parameter while initializing it as well. The save process will overwrite the previous data and auto-matching uses the latest saved properties only.
@@ -773,8 +810,10 @@ Of course, you can find elements by text/regex, find similar elements in a more
  ### Is Scrapling thread-safe?
  Yes, Scrapling instances are thread-safe. Each Adaptor instance maintains its state.

- ## Sponsors
+ ## More Sponsors!
  [![Capsolver Banner](https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/CapSolver.png)](https://www.capsolver.com/?utm_source=github&utm_medium=repo&utm_campaign=scraping&utm_term=Scrapling)
+ <a href="https://serpapi.com/?utm_source=scrapling"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png" height="500" width="500" alt="SerpApi Banner" ></a>
+

  ## Contributing
  Everybody is invited and welcome to contribute to Scrapling. There is a lot to do!
@@ -6,17 +6,33 @@ Dealing with failing web scrapers due to anti-bot protections or website changes
  Scrapling is a high-performance, intelligent web scraping library for Python that automatically adapts to website changes while significantly outperforming popular alternatives. For both beginners and experts, Scrapling provides powerful features while maintaining simplicity.

  ```python
- >> from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
+ >> from scrapling.defaults import Fetcher, StealthyFetcher, PlayWrightFetcher
  # Fetch websites' source under the radar!
- >> fetcher = StealthyFetcher().fetch('https://example.com', headless=True, disable_resources=True)
- >> print(fetcher.status)
+ >> page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True)
+ >> print(page.status)
  200
- >> page = fetcher.adaptor
  >> products = page.css('.product', auto_save=True) # Scrape data that survives website design changes!
  >> # Later, if the website structure changes, pass `auto_match=True`
  >> products = page.css('.product', auto_match=True) # and Scrapling still finds them!
  ```

+ # Sponsors
+
+ [Evomi](https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling) is your Swiss Quality Proxy Provider, starting at **$0.49/GB**
+
+ - 👩‍💻 **$0.49 per GB Residential Proxies**: Our price is unbeatable
+ - 👩‍💻 **24/7 Expert Support**: We will join your Slack Channel
+ - 🌍 **Global Presence**: Available in 150+ Countries
+ - ⚡ **Low Latency**
+ - 🔒 **Swiss Quality and Privacy**
+ - 🎁 **Free Trial**
+ - 🛡️ **99.9% Uptime**
+ - 🤝 **Special IP Pool selection**: Optimize for speed, quality, or quantity of IPs
+ - 🔧 **Easy Integration**: Compatible with most software and programming languages
+
+ [![Evomi Banner](https://my.evomi.com/images/brand/cta.png)](https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling)
+ ---
+
  ## Table of content
  * [Key Features](#key-features)
  * [Fetch websites as you prefer](#fetch-websites-as-you-prefer)
@@ -49,7 +65,7 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha
  * [Can Scrapling replace code built on top of BeautifulSoup4?](#can-scrapling-replace-code-built-on-top-of-beautifulsoup4)
  * [Can Scrapling replace code built on top of AutoScraper?](#can-scrapling-replace-code-built-on-top-of-autoscraper)
  * [Is Scrapling thread-safe?](#is-scrapling-thread-safe)
- * [Sponsors](#sponsors)
+ * [More Sponsors!](#more-sponsors)
  * [Contributing](#contributing)
  * [Disclaimer for Scrapling Project](#disclaimer-for-scrapling-project)
  * [License](#license)
@@ -90,7 +106,7 @@ from scrapling import Fetcher
  fetcher = Fetcher(auto_match=False)

  # Fetch a web page and create an Adaptor instance
- page = fetcher.get('https://quotes.toscrape.com/', stealthy_headers=True).adaptor
+ page = fetcher.get('https://quotes.toscrape.com/', stealthy_headers=True)
  # Get all strings in the full page
  page.get_all_text(ignore_tags=('script', 'style'))

@@ -195,11 +211,22 @@ python -m browserforge update
  ```

  ## Fetching Websites Features
- All fetcher-type classes are imported in the same way
+ You might be a little bit confused by now so let me clear things up. All fetcher-type classes are imported in the same way
  ```python
  from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
  ```
  And all of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `storage`, `storage_args`, and `debug`, which are the same ones you give to the `Adaptor` class.
+
+ If you don't want to pass arguments to the generated `Adaptor` object and want to use the default values, you can use this import instead for cleaner code:
+ ```python
+ from scrapling.defaults import Fetcher, StealthyFetcher, PlayWrightFetcher
+ ```
+ then use it right away without initializing, like:
+ ```python
+ page = StealthyFetcher.fetch('https://example.com')
+ ```
+
+ Also, the `Response` object returned from all fetchers is the same as the `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`. All `cookies`, `headers`, and `request_headers` are always of type `dictionary`.
  > [!NOTE]
  > The `auto_match` argument is enabled by default which is the one you should care about the most as you will see later.
  ### Fetcher
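The initialization arguments listed in the hunk above (`auto_match`, `huge_tree`, `keep_comments`, `storage`, `storage_args`, `debug`) are forwarded to the `Adaptor` objects the fetcher creates; a short sketch (values illustrative):

```python
from scrapling import Fetcher

# These options are passed through to every Adaptor/Response this fetcher produces
fetcher = Fetcher(auto_match=True, huge_tree=True, keep_comments=False, debug=True)
page = fetcher.get('https://quotes.toscrape.com/')
```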
@@ -219,6 +246,8 @@ This class is built on top of [Camoufox](https://github.com/daijro/camoufox) whi
  >> page.status == 200
  True
  ```
+ > Note: all requests done by this fetcher wait by default for all JS to be fully loaded and executed, so you don't have to :)
+
  <details><summary><strong>For the sake of simplicity, expand this for the complete list of arguments</strong></summary>

  | Argument | Description | Optional |
@@ -237,6 +266,8 @@ True
  | network_idle | Wait for the page until there are no network connections for at least 500 ms. | ✔️ |
  | timeout | The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000. | ✔️ |
  | wait_selector | Wait for a specific css selector to be in a specific state. | ✔️ |
+ | proxy | The proxy to be used with requests; it can be a string or a dictionary with the keys 'server', 'username', and 'password' only. | ✔️ |
+ | os_randomize | If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS. | ✔️ |
  | wait_selector_state | The state to wait for the selector given with `wait_selector`. _Default state is `attached`._ | ✔️ |

  </details>
@@ -247,9 +278,11 @@ This list isn't final so expect a lot more additions and flexibility to be added
  This class is built on top of [Playwright](https://playwright.dev/python/) which currently provides 4 main run options but they can be mixed as you want.
  ```python
  >> page = PlayWrightFetcher().fetch('https://www.google.com/search?q=%22Scrapling%22', disable_resources=True) # Vanilla Playwright option
- >> page.adaptor.css_first("#search a::attr(href)")
+ >> page.css_first("#search a::attr(href)")
  'https://github.com/D4Vinci/Scrapling'
  ```
+ > Note: all requests done by this fetcher wait by default for all JS to be fully loaded and executed, so you don't have to :)
+
  Using this Fetcher class, you can make requests with:
  1) Vanilla Playwright without any modifications other than the ones you chose.
  2) Stealthy Playwright with the stealth mode I wrote for it. It's still a WIP but it bypasses many online tests like [Sannysoft's](https://bot.sannysoft.com/).</br> Some of the things this fetcher's stealth mode does include:
@@ -277,6 +310,7 @@ Add that to a lot of controlling/hiding options as you will see in the arguments
  | wait_selector_state | The state to wait for the selector given with `wait_selector`. _Default state is `attached`._ | ✔️ |
  | google_search | Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name. | ✔️ |
  | extra_headers | A dictionary of extra headers to add to the request. The referer set by the `google_search` argument takes priority over the referer set here if used together. | ✔️ |
+ | proxy | The proxy to be used with requests; it can be a string or a dictionary with the keys 'server', 'username', and 'password' only. | ✔️ |
  | hide_canvas | Add random noise to canvas operations to prevent fingerprinting. | ✔️ |
  | disable_webgl | Disables WebGL and WebGL 2.0 support entirely. | ✔️ |
  | stealth | Enables stealth mode, always check the documentation to see what stealth mode does currently. | ✔️ |
@@ -341,7 +375,7 @@ You can search for a specific ancestor of an element that satisfies a function,
  ### Content-based Selection & Finding Similar Elements
  You can select elements by their text content in multiple ways, here's a full example on another website:
  ```python
- >>> page = Fetcher().get('https://books.toscrape.com/index.html').adaptor
+ >>> page = Fetcher().get('https://books.toscrape.com/index.html')

  >>> page.find_by_text('Tipping the Velvet') # Find the first element whose text fully matches this text
  <data='<a href="catalogue/tipping-the-velvet_99...' parent='<h3><a href="catalogue/tipping-the-velve...'>
@@ -461,11 +495,11 @@ Now let's test the same selector in both versions
  >> old_url = "https://web.archive.org/web/20100102003420/http://stackoverflow.com/"
  >> new_url = "https://stackoverflow.com/"
  >>
- >> page = Fetcher(automatch_domain='stackoverflow.com').get(old_url, timeout=30).adaptor
+ >> page = Fetcher(automatch_domain='stackoverflow.com').get(old_url, timeout=30)
  >> element1 = page.css_first(selector, auto_save=True)
  >>
  >> # Same selector but used in the updated website
- >> page = Fetcher(automatch_domain="stackoverflow.com").get(new_url).adaptor
+ >> page = Fetcher(automatch_domain="stackoverflow.com").get(new_url)
  >> element2 = page.css_first(selector, auto_match=True)
  >>
  >> if element1.text == element2.text:
@@ -477,7 +511,7 @@ Note that I used a new argument called `automatch_domain`, this is because for S
  In a real-world scenario, the code will be the same except it will use the same URL for both requests so you won't need to use the `automatch_domain` argument. This is the closest example I can give to real-world cases so I hope it didn't confuse you :)

  **Notes:**
- 1. For the two examples above I used one time the `Adaptor` class and the second time the `Fetcher` class just to show you that you can create the `Adaptor` object by yourself if you have the source or fetch the source using any `Fetcher` class then it will create the `Adaptor` object for you on the `.adaptor` property.
+ 1. For the two examples above, I used the `Adaptor` class once and the `Fetcher` class the other time, just to show that you can create the `Adaptor` object yourself if you have the source, or fetch the source using any `Fetcher` class and it will create the `Adaptor` object for you.
  2. Passing the `auto_save` argument with the `auto_match` argument set to `False` while initializing the Adaptor/Fetcher object will only result in ignoring the `auto_save` argument value and the following warning message
  ```text
  Argument `auto_save` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info.
@@ -518,7 +552,7 @@ Examples to clear any confusion :)

  ```python
  >> from scrapling import Fetcher
- >> page = Fetcher().get('https://quotes.toscrape.com/').adaptor
+ >> page = Fetcher().get('https://quotes.toscrape.com/')
  # Find all elements with tag name `div`.
  >> page.find_all('div')
  [<data='<div class="container"> <div class="row...' parent='<body> <div class="container"> <div clas...'>,
@@ -681,7 +715,10 @@ There are a lot of deep details skipped here to make this as short as possible s

  Note that implementing your storage system can be complex as there are some strict rules such as inheriting from the same abstract class, following the singleton design pattern used in other classes, and more. So make sure to read the docs first.

- To give detailed documentation of the library, it will need a website. I'm trying to rush creating the website, researching new ideas, and adding more features/tests/benchmarks but time is tight with too many spinning plates between work, personal life, and working on Scrapling. But you can help by using the [sponsor button](https://github.com/sponsors/D4Vinci) above :)
+ > [!IMPORTANT]
+ > A website is needed to provide detailed library documentation.<br/>
+ > I'm trying to rush creating the website, researching new ideas, and adding more features/tests/benchmarks but time is tight with too many spinning plates between work, personal life, and working on Scrapling. I have been working on Scrapling for months for free after all.<br/><br/>
+ > If you like `Scrapling` and want it to keep improving then this is a friendly reminder that you can help by supporting me through the [sponsor button](https://github.com/sponsors/D4Vinci).

  ## ⚡ Enlightening Questions and FAQs
  This section addresses common questions about Scrapling, please read this section before opening an issue.
@@ -695,8 +732,8 @@ This section addresses common questions about Scrapling, please read this sectio

  Together both are used to retrieve the element's unique properties from the database later.
  4. Now later when you enable the `auto_match` parameter for both the Adaptor instance and the method call, the element properties are retrieved and Scrapling loops over all elements in the page, compares each one's unique properties to the unique properties we already have for this element, and calculates a score for each one.
- 5. The comparison between elements is not exact but more about finding how similar these values are, so everything is taken into consideration even the values' order like the order in which the element class names were written before and the order in which the same element class names are written now.
- 6. The score for each element is stored in the table, and in the end, the element(s) with the highest combined similarity scores are returned.
+ 5. Comparing elements is not exact but more about finding how similar these values are, so everything is taken into consideration, even the values' order, like the order in which the element class names were written before and the order in which the same element class names are written now.
+ 6. The score for each element is stored in the table, and the element(s) with the highest combined similarity scores are returned.

  ### How does the auto-matching work if I didn't pass a URL while initializing the Adaptor object?
  Not a big problem as it depends on your usage. The word `default` will be used in place of the URL field while saving the element's unique properties. So this will only be an issue if you used the same identifier later for a different website that you didn't pass the URL parameter while initializing it as well. The save process will overwrite the previous data and auto-matching uses the latest saved properties only.
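As a compact sketch of the save-then-match flow those FAQ steps describe (site and selector illustrative):

```python
from scrapling import Fetcher

fetcher = Fetcher(auto_match=True)  # auto_match must be enabled at initialization

# First run: the element is found normally and its unique properties are saved
page = fetcher.get('https://example.com')
product = page.css_first('.product', auto_save=True)

# Later run, after a redesign: the saved properties are retrieved, every element
# on the page is scored for similarity, and the highest-scoring match is returned
page = fetcher.get('https://example.com')
product = page.css_first('.product', auto_match=True)
```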
@@ -727,8 +764,10 @@ Of course, you can find elements by text/regex, find similar elements in a more
  ### Is Scrapling thread-safe?
  Yes, Scrapling instances are thread-safe. Each Adaptor instance maintains its state.

- ## Sponsors
+ ## More Sponsors!
  [![Capsolver Banner](https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/CapSolver.png)](https://www.capsolver.com/?utm_source=github&utm_medium=repo&utm_campaign=scraping&utm_term=Scrapling)
+ <a href="https://serpapi.com/?utm_source=scrapling"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png" height="500" width="500" alt="SerpApi Banner" ></a>
+

  ## Contributing
  Everybody is invited and welcome to contribute to Scrapling. There is a lot to do!
@@ -4,7 +4,7 @@ from scrapling.parser import Adaptor, Adaptors
  from scrapling.core.custom_types import TextHandler, AttributesHandler

  __author__ = "Karim Shoair (karim.shoair@pm.me)"
- __version__ = "0.2"
+ __version__ = "0.2.2"
  __copyright__ = "Copyright (c) 2024 Karim Shoair"


@@ -4,8 +4,9 @@ from itertools import chain
  # Using cache on top of a class is a brilliant way to achieve the Singleton design pattern without much code
  from functools import lru_cache as cache  # functools.cache is available on Python 3.9+ only so let's keep lru_cache

- from scrapling.core._types import Dict, Iterable, Any
+ from scrapling.core._types import Dict, Iterable, Any, Union

+ import orjson
  from lxml import html
  html_forbidden = {html.HtmlComment, }
  logging.basicConfig(
@@ -18,6 +19,17 @@ logging.basicConfig(
  )


+ def is_jsonable(content: Union[bytes, str]) -> bool:
+     if type(content) is bytes:
+         content = content.decode()
+
+     try:
+         _ = orjson.loads(content)
+         return True
+     except orjson.JSONDecodeError:
+         return False
+
+
  @cache(None, typed=True)
  def setup_basic_logging(level: str = 'debug'):
      levels = {
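A quick usage sketch of the new helper (inputs illustrative; the import path follows `scrapling/core/utils.py` from the file list above):

```python
from scrapling.core.utils import is_jsonable

print(is_jsonable('{"key": "value"}'))  # True: valid JSON text
print(is_jsonable(b'[1, 2, 3]'))        # True: bytes input is decoded first
print(is_jsonable('not json'))          # False: orjson.JSONDecodeError is caught
```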
@@ -0,0 +1,6 @@
+ from .fetchers import Fetcher, StealthyFetcher, PlayWrightFetcher
+
+ # If you are going to use Fetchers with the default settings, import them from this file instead for cleaner-looking code
+ Fetcher = Fetcher()
+ StealthyFetcher = StealthyFetcher()
+ PlayWrightFetcher = PlayWrightFetcher()
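Since the module lands at `scrapling/defaults.py` (file 5 in the list above), these pre-initialized instances are used directly, with no constructor call:

```python
# Note: these names are instances, not classes, so there is no () before the method call
from scrapling.defaults import Fetcher, StealthyFetcher, PlayWrightFetcher

page = Fetcher.get('https://quotes.toscrape.com/')
page = StealthyFetcher.fetch('https://example.com')
```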
@@ -7,6 +7,7 @@ from scrapling.engines.toolbelt import (
      get_os_name,
      intercept_route,
      check_type_validity,
+     construct_proxy_dict,
      generate_convincing_referer,
  )

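`construct_proxy_dict` itself isn't shown in this diff. Judging only by the documented `proxy` argument (a URL string, or a dict restricted to 'server', 'username', and 'password'), a normalizer along these lines is plausible — a guess for illustration, not the library's actual implementation:

```python
from typing import Dict, Optional, Union
from urllib.parse import urlparse


def construct_proxy_dict_sketch(proxy: Optional[Union[str, Dict[str, str]]]) -> Optional[Dict[str, str]]:
    """Hypothetical stand-in for the real construct_proxy_dict helper."""
    if not proxy:
        return None
    if isinstance(proxy, dict):
        # The docs say only these three keys are accepted
        return {key: proxy[key] for key in ('server', 'username', 'password') if key in proxy}
    # String form, e.g. 'http://user:pass@127.0.0.1:8080'
    parsed = urlparse(proxy)
    return {
        'server': f'{parsed.scheme}://{parsed.hostname}:{parsed.port}',
        'username': parsed.username or '',
        'password': parsed.password or '',
    }
```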
@@ -18,7 +19,8 @@ class CamoufoxEngine:
      self, headless: Optional[Union[bool, Literal['virtual']]] = True, block_images: Optional[bool] = False, disable_resources: Optional[bool] = False,
      block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = False, network_idle: Optional[bool] = False, humanize: Optional[Union[bool, float]] = True,
      timeout: Optional[float] = 30000, page_action: Callable = do_nothing, wait_selector: Optional[str] = None, addons: Optional[List[str]] = None,
-     wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None, adaptor_arguments: Dict = None
+     wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
+     proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, adaptor_arguments: Dict = None
  ):
      """An engine that utilizes Camoufox library, check the `StealthyFetcher` class for more documentation.

@@ -33,12 +35,14 @@ class CamoufoxEngine:
      :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
      :param allow_webgl: Whether to allow WebGL. To prevent leaks, only use this for special cases.
      :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+     :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
      :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
      :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
      :param wait_selector: Wait for a specific css selector to be in a specific state.
      :param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`.
      :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
      :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
+     :param proxy: The proxy to be used with requests; it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
      :param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
      """
      self.headless = headless
@@ -48,7 +52,9 @@ class CamoufoxEngine:
      self.allow_webgl = bool(allow_webgl)
      self.network_idle = bool(network_idle)
      self.google_search = bool(google_search)
+     self.os_randomize = bool(os_randomize)
      self.extra_headers = extra_headers or {}
+     self.proxy = construct_proxy_dict(proxy)
      self.addons = addons or []
      self.humanize = humanize
      self.timeout = check_type_validity(timeout, [int, float], 30000)
@@ -66,17 +72,18 @@ class CamoufoxEngine:
      """Opens up the browser and does your request based on your chosen options.

      :param url: Target url.
-     :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
+     :return: A `Response` object that is the same as the `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
      """
      with Camoufox(
-         headless=self.headless,
-         block_images=self.block_images,  # Careful! it makes some websites not finish loading at all, like stackoverflow, even in headful mode
-         os=get_os_name(),
-         block_webrtc=self.block_webrtc,
-         allow_webgl=self.allow_webgl,
+         proxy=self.proxy,
          addons=self.addons,
+         headless=self.headless,
          humanize=self.humanize,
-         i_know_what_im_doing=True,  # To turn warnings off with user configurations
+         i_know_what_im_doing=True,  # To turn warnings off with the user configurations
+         allow_webgl=self.allow_webgl,
+         block_webrtc=self.block_webrtc,
+         block_images=self.block_images,  # Careful! it makes some websites not finish loading at all, like stackoverflow, even in headful mode
+         os=None if self.os_randomize else get_os_name(),
      ) as browser:
          page = browser.new_page()
          page.set_default_navigation_timeout(self.timeout)
@@ -107,14 +114,14 @@ class CamoufoxEngine:
          response = Response(
              url=res.url,
              text=page.content(),
-             content=res.body(),
+             body=res.body(),
              status=res.status,
              reason=res.status_text,
              encoding=encoding,
              cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
              headers=res.all_headers(),
              request_headers=res.request.all_headers(),
-             adaptor_arguments=self.adaptor_arguments
+             **self.adaptor_arguments
          )
          page.close()

@@ -9,8 +9,9 @@ from scrapling.engines.toolbelt import (
      js_bypass_path,
      intercept_route,
      generate_headers,
-     check_type_validity,
      construct_cdp_url,
+     check_type_validity,
+     construct_proxy_dict,
      generate_convincing_referer,
  )

@@ -33,6 +34,7 @@ class PlaywrightEngine:
      nstbrowser_config: Optional[Dict] = None,
      google_search: Optional[bool] = True,
      extra_headers: Optional[Dict[str, str]] = None,
+     proxy: Optional[Union[str, Dict[str, str]]] = None,
      adaptor_arguments: Dict = None
  ):
      """An engine that utilizes PlayWright library, check the `PlayWrightFetcher` class for more documentation.
@@ -54,6 +56,7 @@ class PlaywrightEngine:
      :param nstbrowser_mode: Enables NSTBrowser mode, it has to be used with the `cdp_url` argument or it will get completely ignored.
      :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
      :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
+     :param proxy: The proxy to be used with requests; it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
      :param nstbrowser_config: The config you want to send with requests to the NSTBrowser. If left empty, Scrapling defaults to an optimized NSTBrowser's docker browserless config.
      :param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
      """
@@ -65,6 +68,7 @@ class PlaywrightEngine:
      self.disable_webgl = bool(disable_webgl)
      self.google_search = bool(google_search)
      self.extra_headers = extra_headers or {}
+     self.proxy = construct_proxy_dict(proxy)
      self.cdp_url = cdp_url
      self.useragent = useragent
      self.timeout = check_type_validity(timeout, [int, float], 30000)
@@ -112,7 +116,7 @@ class PlaywrightEngine:
      """Opens up the browser and does your request based on your chosen options.

      :param url: Target url.
-     :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
+     :return: A `Response` object that is the same as the `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
      """
      if not self.stealth:
          from playwright.sync_api import sync_playwright
@@ -151,6 +155,7 @@ class PlaywrightEngine:
          locale='en-US',
          is_mobile=False,
          has_touch=False,
+         proxy=self.proxy,
          color_scheme='dark',  # Bypasses the 'prefersLightColor' check in creepjs
          user_agent=useragent,
          device_scale_factor=2,
@@ -219,14 +224,14 @@ class PlaywrightEngine:
          response = Response(
              url=res.url,
              text=page.content(),
-             content=res.body(),
+             body=res.body(),
              status=res.status,
              reason=res.status_text,
              encoding=encoding,
              cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
              headers=res.all_headers(),
              request_headers=res.request.all_headers(),
-             adaptor_arguments=self.adaptor_arguments
+             **self.adaptor_arguments
          )
          page.close()
          return response
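Both engines now pass `body=` instead of `content=` and unpack `adaptor_arguments` as keyword arguments. That lines up with the documented behavior that `Response` is an `Adaptor` with HTTP metadata on top; a reduced sketch of that shape (class layout assumed from the diff and README, not copied from the source):

```python
class Adaptor:
    # Reduced: the real class carries the full parsing engine
    def __init__(self, text: str = '', body: bytes = b'', encoding: str = 'utf-8', **kwargs):
        self.text, self.body, self.encoding = text, body, encoding


class Response(Adaptor):
    # HTTP metadata layered over the parsing interface
    def __init__(self, url, status, reason, cookies, headers, request_headers, **adaptor_kwargs):
        super().__init__(**adaptor_kwargs)  # `text`, `body`, `encoding`, and parser options land here
        self.url, self.status, self.reason = url, status, reason
        self.cookies, self.headers, self.request_headers = cookies, headers, request_headers
```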