scrapling 0.2.8__tar.gz → 0.2.91__tar.gz

Files changed (65)
  1. {scrapling-0.2.8/scrapling.egg-info → scrapling-0.2.91}/PKG-INFO +33 -18
  2. {scrapling-0.2.8 → scrapling-0.2.91}/README.md +27 -10
  3. {scrapling-0.2.8 → scrapling-0.2.91}/scrapling/__init__.py +4 -4
  4. {scrapling-0.2.8 → scrapling-0.2.91}/scrapling/core/_types.py +2 -0
  5. {scrapling-0.2.8 → scrapling-0.2.91}/scrapling/core/custom_types.py +88 -6
  6. {scrapling-0.2.8 → scrapling-0.2.91}/scrapling/core/storage_adaptors.py +5 -6
  7. {scrapling-0.2.8 → scrapling-0.2.91}/scrapling/core/translator.py +2 -2
  8. {scrapling-0.2.8 → scrapling-0.2.91}/scrapling/core/utils.py +29 -27
  9. {scrapling-0.2.8 → scrapling-0.2.91}/scrapling/defaults.py +2 -1
  10. {scrapling-0.2.8 → scrapling-0.2.91}/scrapling/engines/camo.py +124 -24
  11. {scrapling-0.2.8 → scrapling-0.2.91}/scrapling/engines/constants.py +4 -4
  12. scrapling-0.2.91/scrapling/engines/pw.py +363 -0
  13. scrapling-0.2.91/scrapling/engines/static.py +172 -0
  14. scrapling-0.2.91/scrapling/engines/toolbelt/__init__.py +6 -0
  15. {scrapling-0.2.8 → scrapling-0.2.91}/scrapling/engines/toolbelt/custom.py +16 -22
  16. {scrapling-0.2.8 → scrapling-0.2.91}/scrapling/engines/toolbelt/fingerprints.py +3 -3
  17. {scrapling-0.2.8 → scrapling-0.2.91}/scrapling/engines/toolbelt/navigation.py +21 -8
  18. scrapling-0.2.91/scrapling/fetchers.py +432 -0
  19. {scrapling-0.2.8 → scrapling-0.2.91}/scrapling/parser.py +50 -22
  20. {scrapling-0.2.8 → scrapling-0.2.91/scrapling.egg-info}/PKG-INFO +33 -18
  21. {scrapling-0.2.8 → scrapling-0.2.91}/scrapling.egg-info/SOURCES.txt +8 -3
  22. scrapling-0.2.91/scrapling.egg-info/requires.txt +10 -0
  23. {scrapling-0.2.8 → scrapling-0.2.91}/setup.cfg +1 -1
  24. {scrapling-0.2.8 → scrapling-0.2.91}/setup.py +6 -8
  25. scrapling-0.2.91/tests/fetchers/async/test_camoufox.py +95 -0
  26. scrapling-0.2.91/tests/fetchers/async/test_httpx.py +83 -0
  27. scrapling-0.2.91/tests/fetchers/async/test_playwright.py +99 -0
  28. scrapling-0.2.91/tests/fetchers/sync/__init__.py +0 -0
  29. scrapling-0.2.91/tests/fetchers/sync/test_camoufox.py +68 -0
  30. scrapling-0.2.91/tests/fetchers/sync/test_httpx.py +82 -0
  31. scrapling-0.2.91/tests/fetchers/sync/test_playwright.py +87 -0
  32. scrapling-0.2.91/tests/fetchers/test_utils.py +97 -0
  33. scrapling-0.2.91/tests/parser/__init__.py +0 -0
  34. scrapling-0.2.91/tests/parser/test_automatch.py +111 -0
  35. scrapling-0.2.91/tests/parser/test_general.py +330 -0
  36. scrapling-0.2.8/scrapling/engines/pw.py +0 -259
  37. scrapling-0.2.8/scrapling/engines/static.py +0 -129
  38. scrapling-0.2.8/scrapling/engines/toolbelt/__init__.py +0 -6
  39. scrapling-0.2.8/scrapling/fetchers.py +0 -217
  40. scrapling-0.2.8/scrapling.egg-info/requires.txt +0 -11
  41. scrapling-0.2.8/tests/fetchers/test_camoufox.py +0 -65
  42. scrapling-0.2.8/tests/fetchers/test_httpx.py +0 -68
  43. scrapling-0.2.8/tests/fetchers/test_playwright.py +0 -77
  44. scrapling-0.2.8/tests/fetchers/test_utils.py +0 -129
  45. scrapling-0.2.8/tests/parser/test_automatch.py +0 -56
  46. scrapling-0.2.8/tests/parser/test_general.py +0 -288
  47. {scrapling-0.2.8 → scrapling-0.2.91}/LICENSE +0 -0
  48. {scrapling-0.2.8 → scrapling-0.2.91}/MANIFEST.in +0 -0
  49. {scrapling-0.2.8 → scrapling-0.2.91}/scrapling/core/__init__.py +0 -0
  50. {scrapling-0.2.8 → scrapling-0.2.91}/scrapling/core/mixins.py +0 -0
  51. {scrapling-0.2.8 → scrapling-0.2.91}/scrapling/engines/__init__.py +0 -0
  52. {scrapling-0.2.8 → scrapling-0.2.91}/scrapling/engines/toolbelt/bypasses/navigator_plugins.js +0 -0
  53. {scrapling-0.2.8 → scrapling-0.2.91}/scrapling/engines/toolbelt/bypasses/notification_permission.js +0 -0
  54. {scrapling-0.2.8 → scrapling-0.2.91}/scrapling/engines/toolbelt/bypasses/pdf_viewer.js +0 -0
  55. {scrapling-0.2.8 → scrapling-0.2.91}/scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +0 -0
  56. {scrapling-0.2.8 → scrapling-0.2.91}/scrapling/engines/toolbelt/bypasses/screen_props.js +0 -0
  57. {scrapling-0.2.8 → scrapling-0.2.91}/scrapling/engines/toolbelt/bypasses/webdriver_fully.js +0 -0
  58. {scrapling-0.2.8 → scrapling-0.2.91}/scrapling/engines/toolbelt/bypasses/window_chrome.js +0 -0
  59. {scrapling-0.2.8 → scrapling-0.2.91}/scrapling/py.typed +0 -0
  60. {scrapling-0.2.8 → scrapling-0.2.91}/scrapling.egg-info/dependency_links.txt +0 -0
  61. {scrapling-0.2.8 → scrapling-0.2.91}/scrapling.egg-info/not-zip-safe +0 -0
  62. {scrapling-0.2.8 → scrapling-0.2.91}/scrapling.egg-info/top_level.txt +0 -0
  63. {scrapling-0.2.8 → scrapling-0.2.91}/tests/__init__.py +0 -0
  64. {scrapling-0.2.8 → scrapling-0.2.91}/tests/fetchers/__init__.py +0 -0
  65. {scrapling-0.2.8/tests/parser → scrapling-0.2.91/tests/fetchers/async}/__init__.py +0 -0
{scrapling-0.2.8/scrapling.egg-info → scrapling-0.2.91}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: scrapling
- Version: 0.2.8
+ Version: 0.2.91
  Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
  Home-page: https://github.com/D4Vinci/Scrapling
  Author: Karim Shoair
@@ -21,7 +21,6 @@ Classifier: Topic :: Text Processing :: Markup :: HTML
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
  Classifier: Programming Language :: Python :: 3
  Classifier: Programming Language :: Python :: 3 :: Only
- Classifier: Programming Language :: Python :: 3.8
  Classifier: Programming Language :: Python :: 3.9
  Classifier: Programming Language :: Python :: 3.10
  Classifier: Programming Language :: Python :: 3.11
@@ -29,7 +28,7 @@ Classifier: Programming Language :: Python :: 3.12
  Classifier: Programming Language :: Python :: 3.13
  Classifier: Programming Language :: Python :: Implementation :: CPython
  Classifier: Typing :: Typed
- Requires-Python: >=3.8
+ Requires-Python: >=3.9
  Description-Content-Type: text/markdown
  License-File: LICENSE
  Requires-Dist: requests>=2.3
@@ -38,11 +37,10 @@ Requires-Dist: cssselect>=1.2
  Requires-Dist: w3lib
  Requires-Dist: orjson>=3
  Requires-Dist: tldextract
- Requires-Dist: httpx[brotli,zstd]
- Requires-Dist: playwright==1.48
- Requires-Dist: rebrowser-playwright
- Requires-Dist: camoufox>=0.4.4
- Requires-Dist: browserforge
+ Requires-Dist: httpx[brotli,socks,zstd]
+ Requires-Dist: playwright>=1.49.1
+ Requires-Dist: rebrowser-playwright>=1.49.1
+ Requires-Dist: camoufox[geoip]>=0.4.9

  # 🕷️ Scrapling: Undetectable, Lightning-Fast, and Adaptive Web Scraping for Python
  [![Tests](https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg)](https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml) [![PyPI version](https://badge.fury.io/py/Scrapling.svg)](https://badge.fury.io/py/Scrapling) [![Supported Python versions](https://img.shields.io/pypi/pyversions/scrapling.svg)](https://pypi.org/project/scrapling/) [![PyPI Downloads](https://static.pepy.tech/badge/scrapling)](https://pepy.tech/project/scrapling)
@@ -52,7 +50,7 @@ Dealing with failing web scrapers due to anti-bot protections or website changes
  Scrapling is a high-performance, intelligent web scraping library for Python that automatically adapts to website changes while significantly outperforming popular alternatives. For both beginners and experts, Scrapling provides powerful features while maintaining simplicity.

  ```python
- >> from scrapling.defaults import Fetcher, StealthyFetcher, PlayWrightFetcher
+ >> from scrapling.defaults import Fetcher, AsyncFetcher, StealthyFetcher, PlayWrightFetcher
  # Fetch websites' source under the radar!
  >> page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True)
  >> print(page.status)
@@ -81,7 +79,7 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha

  ## Table of content
  * [Key Features](#key-features)
- * [Fetch websites as you prefer](#fetch-websites-as-you-prefer)
+ * [Fetch websites as you prefer](#fetch-websites-as-you-prefer-with-async-support)
  * [Adaptive Scraping](#adaptive-scraping)
  * [Performance](#performance)
  * [Developing Experience](#developing-experience)
@@ -122,7 +120,7 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha

  ## Key Features

- ### Fetch websites as you prefer
+ ### Fetch websites as you prefer with async support
  - **HTTP requests**: Stealthy and fast HTTP requests with `Fetcher`
  - **Stealthy fetcher**: Annoying anti-bot protection? No problem! Scrapling can bypass almost all of them with `StealthyFetcher` with default configuration!
  - **Your preferred browser**: Use your real browser with CDP, [NSTbrowser](https://app.nstbrowser.io/r/1vO5e5)'s browserless, PlayWright with stealth mode, or even vanilla PlayWright - All is possible with `PlayWrightFetcher`!
@@ -213,7 +211,7 @@ Scrapling can find elements with more methods and it returns full element `Adapt
  > All benchmarks' results are an average of 100 runs. See our [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) for methodology and to run your comparisons.

  ## Installation
- Scrapling is a breeze to get started with - Starting from version 0.2, we require at least Python 3.8 to work.
+ Scrapling is a breeze to get started with - Starting from version 0.2.9, we require at least Python 3.9 to work.
  ```bash
  pip3 install scrapling
  ```
@@ -265,11 +263,11 @@ You might be slightly confused by now so let me clear things up. All fetcher-typ
  ```python
  from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
  ```
- All of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `storage`, `storage_args`, and `debug`, which are the same ones you give to the `Adaptor` class.
+ All of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `keep_cdata`, `storage`, and `storage_args`, which are the same ones you give to the `Adaptor` class.

  If you don't want to pass arguments to the generated `Adaptor` object and want to use the default values, you can use this import instead for cleaner code:
  ```python
- from scrapling.defaults import Fetcher, StealthyFetcher, PlayWrightFetcher
+ from scrapling.defaults import Fetcher, AsyncFetcher, StealthyFetcher, PlayWrightFetcher
  ```
  then use it right away without initializing like:
  ```python
@@ -282,21 +280,32 @@ Also, the `Response` object returned from all fetchers is the same as the `Adapt
  ### Fetcher
  This class is built on top of [httpx](https://www.python-httpx.org/) with additional configuration options, here you can do `GET`, `POST`, `PUT`, and `DELETE` requests.

- For all methods, you have `stealth_headers` which makes `Fetcher` create and use real browser's headers then create a referer header as if this request came from Google's search of this URL's domain. It's enabled by default.
+ For all methods, you have `stealthy_headers` which makes `Fetcher` create and use real browser's headers then create a referer header as if this request came from Google's search of this URL's domain. It's enabled by default. You can also set the number of retries with the argument `retries` for all methods and this will make httpx retry requests if it failed for any reason. The default number of retries for all `Fetcher` methods is 3.

  You can route all traffic (HTTP and HTTPS) to a proxy for any of these methods in this format `http://username:password@localhost:8030`
  ```python
- >> page = Fetcher().get('https://httpbin.org/get', stealth_headers=True, follow_redirects=True)
+ >> page = Fetcher().get('https://httpbin.org/get', stealthy_headers=True, follow_redirects=True)
  >> page = Fetcher().post('https://httpbin.org/post', data={'key': 'value'}, proxy='http://username:password@localhost:8030')
  >> page = Fetcher().put('https://httpbin.org/put', data={'key': 'value'})
  >> page = Fetcher().delete('https://httpbin.org/delete')
  ```
+ For Async requests, you will just replace the import like below:
+ ```python
+ >> from scrapling import AsyncFetcher
+ >> page = await AsyncFetcher().get('https://httpbin.org/get', stealthy_headers=True, follow_redirects=True)
+ >> page = await AsyncFetcher().post('https://httpbin.org/post', data={'key': 'value'}, proxy='http://username:password@localhost:8030')
+ >> page = await AsyncFetcher().put('https://httpbin.org/put', data={'key': 'value'})
+ >> page = await AsyncFetcher().delete('https://httpbin.org/delete')
+ ```
  ### StealthyFetcher
  This class is built on top of [Camoufox](https://github.com/daijro/camoufox), bypassing most anti-bot protections by default. Scrapling adds extra layers of flavors and configurations to increase performance and undetectability even further.
  ```python
  >> page = StealthyFetcher().fetch('https://www.browserscan.net/bot-detection') # Running headless by default
  >> page.status == 200
  True
+ >> page = await StealthyFetcher().async_fetch('https://www.browserscan.net/bot-detection') # the async version of fetch
+ >> page.status == 200
+ True
  ```
  > Note: all requests done by this fetcher are waiting by default for all JS to be fully loaded and executed so you don't have to :)

@@ -314,7 +323,8 @@ True
  | page_action | Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again. | ✔️ |
  | addons | List of Firefox addons to use. **Must be paths to extracted addons.** | ✔️ |
  | humanize | Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window. | ✔️ |
- | allow_webgl | Whether to allow WebGL. To prevent leaks, only use this for special cases. | ✔️ |
+ | allow_webgl | Enabled by default. Disabling it WebGL not recommended as many WAFs now checks if WebGL is enabled. | ✔️ |
+ | geoip | Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, & spoof the WebRTC IP address. It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region. | ✔️ |
  | disable_ads | Enabled by default, this installs `uBlock Origin` addon on the browser if enabled. | ✔️ |
  | network_idle | Wait for the page until there are no network connections for at least 500 ms. | ✔️ |
  | timeout | The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000. | ✔️ |
@@ -333,6 +343,9 @@ This class is built on top of [Playwright](https://playwright.dev/python/) which
  >> page = PlayWrightFetcher().fetch('https://www.google.com/search?q=%22Scrapling%22', disable_resources=True) # Vanilla Playwright option
  >> page.css_first("#search a::attr(href)")
  'https://github.com/D4Vinci/Scrapling'
+ >> page = await PlayWrightFetcher().async_fetch('https://www.google.com/search?q=%22Scrapling%22', disable_resources=True) # the async version of fetch
+ >> page.css_first("#search a::attr(href)")
+ 'https://github.com/D4Vinci/Scrapling'
  ```
  > Note: all requests done by this fetcher are waiting by default for all JS to be fully loaded and executed so you don't have to :)

@@ -437,6 +450,9 @@ You can select elements by their text content in multiple ways, here's a full ex
  >>> page.find_by_text('Tipping the Velvet') # Find the first element whose text fully matches this text
  <data='<a href="catalogue/tipping-the-velvet_99...' parent='<h3><a href="catalogue/tipping-the-velve...'>

+ >>> page.urljoin(page.find_by_text('Tipping the Velvet').attrib['href']) # We use `page.urljoin` to return the full URL from the relative `href`
+ 'https://books.toscrape.com/catalogue/tipping-the-velvet_999/index.html'
+
  >>> page.find_by_text('Tipping the Velvet', first_match=False) # Get all matches if there are more
  [<data='<a href="catalogue/tipping-the-velvet_99...' parent='<h3><a href="catalogue/tipping-the-velve...'>]

@@ -850,7 +866,6 @@ This project includes code adapted from:

  ## Known Issues
  - In the auto-matching save process, the unique properties of the first element from the selection results are the only ones that get saved. So if the selector you are using selects different elements on the page that are in different locations, auto-matching will probably return to you the first element only when you relocate it later. This doesn't include combined CSS selectors (Using commas to combine more than one selector for example) as these selectors get separated and each selector gets executed alone.
- - Currently, Scrapling is not compatible with async/await.

  ---
  <div align="center"><small>Designed & crafted with ❤️ by Karim Shoair.</small></div><br>
{scrapling-0.2.8 → scrapling-0.2.91}/README.md (+27 -10)

The README.md hunks (@@ -6,7 +6,7 @@, @@ -35,7 +35,7 @@, @@ -76,7 +76,7 @@, @@ -167,7 +167,7 @@, @@ -219,11 +219,11 @@, @@ -236,21 +236,32 @@, @@ -268,7 +279,8 @@, @@ -287,6 +299,9 @@, @@ -391,6 +406,9 @@, and @@ -804,7 +822,6 @@) are verbatim duplicates of the README portion of the PKG-INFO diff above, since PKG-INFO embeds the README as the package's long description; only the hunk offsets differ.
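The updated `Fetcher` docs in the hunks above rename `stealth_headers` to `stealthy_headers` and document a new `retries` argument that defaults to 3 for all methods. A minimal sketch of how those options could be combined, using only argument names that appear in the README text above (the URL is just an example):

```python
from scrapling import Fetcher

# stealthy_headers (formerly stealth_headers) is enabled by default;
# retries is documented to default to 3, raised here only for illustration.
page = Fetcher().get(
    'https://httpbin.org/get',
    stealthy_headers=True,
    follow_redirects=True,
    retries=5,
)
print(page.status)
```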
{scrapling-0.2.8 → scrapling-0.2.91}/scrapling/__init__.py

@@ -1,12 +1,12 @@
  # Declare top-level shortcuts
  from scrapling.core.custom_types import AttributesHandler, TextHandler
- from scrapling.fetchers import (CustomFetcher, Fetcher, PlayWrightFetcher,
- StealthyFetcher)
+ from scrapling.fetchers import (AsyncFetcher, CustomFetcher, Fetcher,
+ PlayWrightFetcher, StealthyFetcher)
  from scrapling.parser import Adaptor, Adaptors

  __author__ = "Karim Shoair (karim.shoair@pm.me)"
- __version__ = "0.2.8"
+ __version__ = "0.2.91"
  __copyright__ = "Copyright (c) 2024 Karim Shoair"


- __all__ = ['Adaptor', 'Fetcher', 'StealthyFetcher', 'PlayWrightFetcher']
+ __all__ = ['Adaptor', 'Fetcher', 'AsyncFetcher', 'StealthyFetcher', 'PlayWrightFetcher']
{scrapling-0.2.8 → scrapling-0.2.91}/scrapling/core/_types.py

@@ -5,6 +5,8 @@ Type definitions for type checking purposes.
  from typing import (TYPE_CHECKING, Any, Callable, Dict, Generator, Iterable,
  List, Literal, Optional, Pattern, Tuple, Type, Union)

+ SelectorWaitStates = Literal["attached", "detached", "hidden", "visible"]
+
  try:
  from typing import Protocol
  except ImportError:
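The new `SelectorWaitStates` alias is a plain `typing.Literal`. As a small illustration of what such an alias buys, a type checker will reject any state outside the four allowed strings; the `wait_for` function below is hypothetical, not Scrapling API:

```python
from typing import Literal

SelectorWaitStates = Literal["attached", "detached", "hidden", "visible"]

def wait_for(selector: str, state: SelectorWaitStates = "attached") -> None:
    """Hypothetical helper: `state` only accepts the four literal values."""
    print(f"waiting for {selector} to become {state}")

wait_for("#search", state="visible")    # OK
# wait_for("#search", state="loaded")   # rejected by mypy/pyright
```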
{scrapling-0.2.8 → scrapling-0.2.91}/scrapling/core/custom_types.py

@@ -14,11 +14,70 @@ class TextHandler(str):
  __slots__ = ()

  def __new__(cls, string):
- # Because str is immutable and we can't override __init__
- if type(string) is str:
+ if isinstance(string, str):
  return super().__new__(cls, string)
- else:
- return super().__new__(cls, '')
+ return super().__new__(cls, '')
+
+ # Make methods from original `str` class return `TextHandler` instead of returning `str` again
+ # Of course, this stupid workaround is only so we can keep the auto-completion working without issues in your IDE
+ # and I made sonnet write it for me :)
+ def strip(self, chars=None):
+ return TextHandler(super().strip(chars))
+
+ def lstrip(self, chars=None):
+ return TextHandler(super().lstrip(chars))
+
+ def rstrip(self, chars=None):
+ return TextHandler(super().rstrip(chars))
+
+ def capitalize(self):
+ return TextHandler(super().capitalize())
+
+ def casefold(self):
+ return TextHandler(super().casefold())
+
+ def center(self, width, fillchar=' '):
+ return TextHandler(super().center(width, fillchar))
+
+ def expandtabs(self, tabsize=8):
+ return TextHandler(super().expandtabs(tabsize))
+
+ def format(self, *args, **kwargs):
+ return TextHandler(super().format(*args, **kwargs))
+
+ def format_map(self, mapping):
+ return TextHandler(super().format_map(mapping))
+
+ def join(self, iterable):
+ return TextHandler(super().join(iterable))
+
+ def ljust(self, width, fillchar=' '):
+ return TextHandler(super().ljust(width, fillchar))
+
+ def rjust(self, width, fillchar=' '):
+ return TextHandler(super().rjust(width, fillchar))
+
+ def swapcase(self):
+ return TextHandler(super().swapcase())
+
+ def title(self):
+ return TextHandler(super().title())
+
+ def translate(self, table):
+ return TextHandler(super().translate(table))
+
+ def zfill(self, width):
+ return TextHandler(super().zfill(width))
+
+ def replace(self, old, new, count=-1):
+ return TextHandler(super().replace(old, new, count))
+
+ def upper(self):
+ return TextHandler(super().upper())
+
+ def lower(self):
+ return TextHandler(super().lower())
+ ##############

  def sort(self, reverse: bool = False) -> str:
  """Return a sorted version of the string"""
@@ -30,11 +89,21 @@ class TextHandler(str):
  data = re.sub(' +', ' ', data)
  return self.__class__(data.strip())

+ # For easy copy-paste from Scrapy/parsel code when needed :)
+ def get(self, default=None):
+ return self
+
+ def get_all(self):
+ return self
+
+ extract = get_all
+ extract_first = get
+
  def json(self) -> Dict:
  """Return json response if the response is jsonable otherwise throw error"""
- # Using __str__ function as a workaround for orjson issue with subclasses of str
+ # Using str function as a workaround for orjson issue with subclasses of str
  # Check this out: https://github.com/ijl/orjson/issues/445
- return loads(self.__str__())
+ return loads(str(self))

  def re(
  self, regex: Union[str, Pattern[str]], replace_entities: bool = True, clean_match: bool = False,
@@ -127,6 +196,19 @@ class TextHandlers(List[TextHandler]):
  return result
  return default

+ # For easy copy-paste from Scrapy/parsel code when needed :)
+ def get(self, default=None):
+ """Returns the first item of the current list
+ :param default: the default value to return if the current list is empty
+ """
+ return self[0] if len(self) > 0 else default
+
+ def extract(self):
+ return self
+
+ extract_first = get
+ get_all = extract
+

  class AttributesHandler(Mapping):
  """A read-only mapping to use instead of the standard dictionary for the speed boost but at the same time I use it to add more functionalities.
{scrapling-0.2.8 → scrapling-0.2.91}/scrapling/core/storage_adaptors.py

@@ -1,4 +1,3 @@
- import logging
  import sqlite3
  import threading
  from abc import ABC, abstractmethod
@@ -9,7 +8,7 @@ from lxml import html
  from tldextract import extract as tld

  from scrapling.core._types import Dict, Optional, Union
- from scrapling.core.utils import _StorageTools, cache
+ from scrapling.core.utils import _StorageTools, log, lru_cache


  class StorageSystemMixin(ABC):
@@ -20,7 +19,7 @@ class StorageSystemMixin(ABC):
  """
  self.url = url

- @cache(None, typed=True)
+ @lru_cache(None, typed=True)
  def _get_base_url(self, default_value: str = 'default') -> str:
  if not self.url or type(self.url) is not str:
  return default_value
@@ -52,7 +51,7 @@ class StorageSystemMixin(ABC):
  raise NotImplementedError('Storage system must implement `save` method')

  @staticmethod
- @cache(None, typed=True)
+ @lru_cache(None, typed=True)
  def _get_hash(identifier: str) -> str:
  """If you want to hash identifier in your storage system, use this safer"""
  identifier = identifier.lower().strip()
@@ -64,7 +63,7 @@ class StorageSystemMixin(ABC):
  return f"{hash_value}_{len(identifier)}" # Length to reduce collision chance


- @cache(None, typed=True)
+ @lru_cache(None, typed=True)
  class SQLiteStorageSystem(StorageSystemMixin):
  """The recommended system to use, it's race condition safe and thread safe.
  Mainly built so the library can run in threaded frameworks like scrapy or threaded tools
@@ -86,7 +85,7 @@ class SQLiteStorageSystem(StorageSystemMixin):
  self.connection.execute("PRAGMA journal_mode=WAL")
  self.cursor = self.connection.cursor()
  self._setup_database()
- logging.debug(
+ log.debug(
  f'Storage system loaded with arguments (storage_file="{storage_file}", url="{url}")'
  )
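The `@lru_cache(None, typed=True)` decorator sitting on top of `SQLiteStorageSystem` is the class-as-singleton trick referenced in the utils.py comment further down: because the decorated class object becomes the cached callable, constructing it twice with equal arguments returns the same instance. A standalone sketch of the pattern; the `Connection` class is purely illustrative:

```python
from functools import lru_cache

@lru_cache(None, typed=True)
class Connection:
    """Illustrative only: repeated calls with equal args yield one instance."""
    def __init__(self, path: str):
        self.path = path

a = Connection('storage.db')
b = Connection('storage.db')
print(a is b)   # True, so it behaves like a per-arguments singleton
```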
{scrapling-0.2.8 → scrapling-0.2.91}/scrapling/core/translator.py

@@ -17,7 +17,7 @@ from cssselect.xpath import XPathExpr as OriginalXPathExpr
  from w3lib.html import HTML5_WHITESPACE

  from scrapling.core._types import Any, Optional, Protocol, Self
- from scrapling.core.utils import cache
+ from scrapling.core.utils import lru_cache

  regex = f"[{HTML5_WHITESPACE}]+"
  replace_html5_whitespaces = re.compile(regex).sub
@@ -139,6 +139,6 @@ class TranslatorMixin:


  class HTMLTranslator(TranslatorMixin, OriginalHTMLTranslator):
- @cache(maxsize=256)
+ @lru_cache(maxsize=256)
  def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str:
  return super().css_to_xpath(css, prefix)
{scrapling-0.2.8 → scrapling-0.2.91}/scrapling/core/utils.py

@@ -9,17 +9,36 @@ from scrapling.core._types import Any, Dict, Iterable, Union

  # Using cache on top of a class is brilliant way to achieve Singleton design pattern without much code
  # functools.cache is available on Python 3.9+ only so let's keep lru_cache
- from functools import lru_cache as cache # isort:skip
-
+ from functools import lru_cache # isort:skip

  html_forbidden = {html.HtmlComment, }
- logging.basicConfig(
- level=logging.ERROR,
- format='%(asctime)s - %(levelname)s - %(message)s',
- handlers=[
- logging.StreamHandler()
- ]
- )
+
+
+ @lru_cache(1, typed=True)
+ def setup_logger():
+ """Create and configure a logger with a standard format.
+
+ :returns: logging.Logger: Configured logger instance
+ """
+ logger = logging.getLogger('scrapling')
+ logger.setLevel(logging.INFO)
+
+ formatter = logging.Formatter(
+ fmt="[%(asctime)s] %(levelname)s: %(message)s",
+ datefmt="%Y-%m-%d %H:%M:%S"
+ )
+
+ console_handler = logging.StreamHandler()
+ console_handler.setFormatter(formatter)
+
+ # Add handler to logger (if not already added)
+ if not logger.handlers:
+ logger.addHandler(console_handler)
+
+ return logger
+
+
+ log = setup_logger()


  def is_jsonable(content: Union[bytes, str]) -> bool:
@@ -33,23 +52,6 @@ def is_jsonable(content: Union[bytes, str]) -> bool:
  return False


- @cache(None, typed=True)
- def setup_basic_logging(level: str = 'debug'):
- levels = {
- 'debug': logging.DEBUG,
- 'info': logging.INFO,
- 'warning': logging.WARNING,
- 'error': logging.ERROR,
- 'critical': logging.CRITICAL
- }
- formatter = logging.Formatter("[%(asctime)s] %(levelname)s: %(message)s", "%Y-%m-%d %H:%M:%S")
- lvl = levels[level.lower()]
- handler = logging.StreamHandler()
- handler.setFormatter(formatter)
- # Configure the root logger
- logging.basicConfig(level=lvl, handlers=[handler])
-
-
  def flatten(lst: Iterable):
  return list(chain.from_iterable(lst))

@@ -113,7 +115,7 @@ class _StorageTools:
  # return _impl


- @cache(None, typed=True)
+ @lru_cache(None, typed=True)
  def clean_spaces(string):
  string = string.replace('\t', ' ')
  string = re.sub('[\n|\r]', '', string)
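Because `setup_logger` is wrapped in `lru_cache(1, typed=True)`, every call returns the same configured `'scrapling'` logger, which is why modules such as storage_adaptors.py above can simply import the module-level `log` object. A short usage sketch under that assumption:

```python
from scrapling.core.utils import log, setup_logger

# Repeated calls hit the cache, so the stream handler is attached exactly once.
assert setup_logger() is setup_logger()
assert setup_logger() is log

log.info("fetch finished")   # shared, pre-configured logger
```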
{scrapling-0.2.8 → scrapling-0.2.91}/scrapling/defaults.py

@@ -1,6 +1,7 @@
- from .fetchers import Fetcher, PlayWrightFetcher, StealthyFetcher
+ from .fetchers import AsyncFetcher, Fetcher, PlayWrightFetcher, StealthyFetcher

  # If you are going to use Fetchers with the default settings, import them from this file instead for a cleaner looking code
  Fetcher = Fetcher()
+ AsyncFetcher = AsyncFetcher()
  StealthyFetcher = StealthyFetcher()
  PlayWrightFetcher = PlayWrightFetcher()
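Since defaults.py now pre-instantiates `AsyncFetcher` alongside the other fetchers, the README's "use it right away without initializing" pattern extends to async code. A minimal end-to-end sketch (the URLs are placeholders):

```python
import asyncio

from scrapling.defaults import AsyncFetcher, Fetcher

# These names are already instances, so no parentheses before the HTTP verbs.
page = Fetcher.get('https://httpbin.org/get', follow_redirects=True)
print(page.status)

async def main():
    page = await AsyncFetcher.get('https://httpbin.org/get')
    print(page.status)

asyncio.run(main())
```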