scrapling 0.2.8.tar.gz → 0.2.9.tar.gz

Files changed (63)
  1. {scrapling-0.2.8/scrapling.egg-info → scrapling-0.2.9}/PKG-INFO +32 -16
  2. {scrapling-0.2.8 → scrapling-0.2.9}/README.md +27 -10
  3. {scrapling-0.2.8 → scrapling-0.2.9}/scrapling/__init__.py +4 -4
  4. {scrapling-0.2.8 → scrapling-0.2.9}/scrapling/core/custom_types.py +88 -6
  5. {scrapling-0.2.8 → scrapling-0.2.9}/scrapling/core/storage_adaptors.py +5 -6
  6. {scrapling-0.2.8 → scrapling-0.2.9}/scrapling/core/translator.py +2 -2
  7. {scrapling-0.2.8 → scrapling-0.2.9}/scrapling/core/utils.py +29 -27
  8. {scrapling-0.2.8 → scrapling-0.2.9}/scrapling/defaults.py +2 -1
  9. {scrapling-0.2.8 → scrapling-0.2.9}/scrapling/engines/camo.py +89 -15
  10. {scrapling-0.2.8 → scrapling-0.2.9}/scrapling/engines/constants.py +4 -4
  11. {scrapling-0.2.8 → scrapling-0.2.9}/scrapling/engines/pw.py +158 -83
  12. scrapling-0.2.9/scrapling/engines/static.py +172 -0
  13. scrapling-0.2.9/scrapling/engines/toolbelt/__init__.py +6 -0
  14. {scrapling-0.2.8 → scrapling-0.2.9}/scrapling/engines/toolbelt/custom.py +20 -22
  15. {scrapling-0.2.8 → scrapling-0.2.9}/scrapling/engines/toolbelt/fingerprints.py +3 -3
  16. {scrapling-0.2.8 → scrapling-0.2.9}/scrapling/engines/toolbelt/navigation.py +21 -8
  17. scrapling-0.2.9/scrapling/fetchers.py +432 -0
  18. {scrapling-0.2.8 → scrapling-0.2.9}/scrapling/parser.py +49 -21
  19. {scrapling-0.2.8 → scrapling-0.2.9/scrapling.egg-info}/PKG-INFO +32 -16
  20. {scrapling-0.2.8 → scrapling-0.2.9}/scrapling.egg-info/SOURCES.txt +8 -3
  21. {scrapling-0.2.8 → scrapling-0.2.9}/scrapling.egg-info/requires.txt +3 -4
  22. {scrapling-0.2.8 → scrapling-0.2.9}/setup.cfg +1 -1
  23. {scrapling-0.2.8 → scrapling-0.2.9}/setup.py +5 -6
  24. scrapling-0.2.9/tests/fetchers/async/test_camoufox.py +95 -0
  25. scrapling-0.2.9/tests/fetchers/async/test_httpx.py +83 -0
  26. scrapling-0.2.9/tests/fetchers/async/test_playwright.py +99 -0
  27. scrapling-0.2.9/tests/fetchers/sync/__init__.py +0 -0
  28. scrapling-0.2.9/tests/fetchers/sync/test_camoufox.py +68 -0
  29. scrapling-0.2.9/tests/fetchers/sync/test_httpx.py +82 -0
  30. scrapling-0.2.9/tests/fetchers/sync/test_playwright.py +87 -0
  31. scrapling-0.2.9/tests/fetchers/test_utils.py +97 -0
  32. scrapling-0.2.9/tests/parser/__init__.py +0 -0
  33. scrapling-0.2.9/tests/parser/test_automatch.py +111 -0
  34. scrapling-0.2.9/tests/parser/test_general.py +330 -0
  35. scrapling-0.2.8/scrapling/engines/static.py +0 -129
  36. scrapling-0.2.8/scrapling/engines/toolbelt/__init__.py +0 -6
  37. scrapling-0.2.8/scrapling/fetchers.py +0 -217
  38. scrapling-0.2.8/tests/fetchers/test_camoufox.py +0 -65
  39. scrapling-0.2.8/tests/fetchers/test_httpx.py +0 -68
  40. scrapling-0.2.8/tests/fetchers/test_playwright.py +0 -77
  41. scrapling-0.2.8/tests/fetchers/test_utils.py +0 -129
  42. scrapling-0.2.8/tests/parser/test_automatch.py +0 -56
  43. scrapling-0.2.8/tests/parser/test_general.py +0 -288
  44. {scrapling-0.2.8 → scrapling-0.2.9}/LICENSE +0 -0
  45. {scrapling-0.2.8 → scrapling-0.2.9}/MANIFEST.in +0 -0
  46. {scrapling-0.2.8 → scrapling-0.2.9}/scrapling/core/__init__.py +0 -0
  47. {scrapling-0.2.8 → scrapling-0.2.9}/scrapling/core/_types.py +0 -0
  48. {scrapling-0.2.8 → scrapling-0.2.9}/scrapling/core/mixins.py +0 -0
  49. {scrapling-0.2.8 → scrapling-0.2.9}/scrapling/engines/__init__.py +0 -0
  50. {scrapling-0.2.8 → scrapling-0.2.9}/scrapling/engines/toolbelt/bypasses/navigator_plugins.js +0 -0
  51. {scrapling-0.2.8 → scrapling-0.2.9}/scrapling/engines/toolbelt/bypasses/notification_permission.js +0 -0
  52. {scrapling-0.2.8 → scrapling-0.2.9}/scrapling/engines/toolbelt/bypasses/pdf_viewer.js +0 -0
  53. {scrapling-0.2.8 → scrapling-0.2.9}/scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +0 -0
  54. {scrapling-0.2.8 → scrapling-0.2.9}/scrapling/engines/toolbelt/bypasses/screen_props.js +0 -0
  55. {scrapling-0.2.8 → scrapling-0.2.9}/scrapling/engines/toolbelt/bypasses/webdriver_fully.js +0 -0
  56. {scrapling-0.2.8 → scrapling-0.2.9}/scrapling/engines/toolbelt/bypasses/window_chrome.js +0 -0
  57. {scrapling-0.2.8 → scrapling-0.2.9}/scrapling/py.typed +0 -0
  58. {scrapling-0.2.8 → scrapling-0.2.9}/scrapling.egg-info/dependency_links.txt +0 -0
  59. {scrapling-0.2.8 → scrapling-0.2.9}/scrapling.egg-info/not-zip-safe +0 -0
  60. {scrapling-0.2.8 → scrapling-0.2.9}/scrapling.egg-info/top_level.txt +0 -0
  61. {scrapling-0.2.8 → scrapling-0.2.9}/tests/__init__.py +0 -0
  62. {scrapling-0.2.8 → scrapling-0.2.9}/tests/fetchers/__init__.py +0 -0
  63. {scrapling-0.2.8/tests/parser → scrapling-0.2.9/tests/fetchers/async}/__init__.py +0 -0

PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: scrapling
- Version: 0.2.8
+ Version: 0.2.9
  Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
  Home-page: https://github.com/D4Vinci/Scrapling
  Author: Karim Shoair
@@ -29,7 +29,7 @@ Classifier: Programming Language :: Python :: 3.12
  Classifier: Programming Language :: Python :: 3.13
  Classifier: Programming Language :: Python :: Implementation :: CPython
  Classifier: Typing :: Typed
- Requires-Python: >=3.8
+ Requires-Python: >=3.9
  Description-Content-Type: text/markdown
  License-File: LICENSE
  Requires-Dist: requests>=2.3
@@ -39,10 +39,9 @@ Requires-Dist: w3lib
  Requires-Dist: orjson>=3
  Requires-Dist: tldextract
  Requires-Dist: httpx[brotli,zstd]
- Requires-Dist: playwright==1.48
- Requires-Dist: rebrowser-playwright
- Requires-Dist: camoufox>=0.4.4
- Requires-Dist: browserforge
+ Requires-Dist: playwright>=1.49.1
+ Requires-Dist: rebrowser-playwright>=1.49.1
+ Requires-Dist: camoufox[geoip]>=0.4.9

  # 🕷️ Scrapling: Undetectable, Lightning-Fast, and Adaptive Web Scraping for Python
  [![Tests](https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg)](https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml) [![PyPI version](https://badge.fury.io/py/Scrapling.svg)](https://badge.fury.io/py/Scrapling) [![Supported Python versions](https://img.shields.io/pypi/pyversions/scrapling.svg)](https://pypi.org/project/scrapling/) [![PyPI Downloads](https://static.pepy.tech/badge/scrapling)](https://pepy.tech/project/scrapling)
@@ -52,7 +51,7 @@ Dealing with failing web scrapers due to anti-bot protections or website changes
  Scrapling is a high-performance, intelligent web scraping library for Python that automatically adapts to website changes while significantly outperforming popular alternatives. For both beginners and experts, Scrapling provides powerful features while maintaining simplicity.

  ```python
- >> from scrapling.defaults import Fetcher, StealthyFetcher, PlayWrightFetcher
+ >> from scrapling.defaults import Fetcher, AsyncFetcher, StealthyFetcher, PlayWrightFetcher
  # Fetch websites' source under the radar!
  >> page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True)
  >> print(page.status)
@@ -81,7 +80,7 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha

  ## Table of content
  * [Key Features](#key-features)
- * [Fetch websites as you prefer](#fetch-websites-as-you-prefer)
+ * [Fetch websites as you prefer](#fetch-websites-as-you-prefer-with-async-support)
  * [Adaptive Scraping](#adaptive-scraping)
  * [Performance](#performance)
  * [Developing Experience](#developing-experience)
@@ -122,7 +121,7 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha

  ## Key Features

- ### Fetch websites as you prefer
+ ### Fetch websites as you prefer with async support
  - **HTTP requests**: Stealthy and fast HTTP requests with `Fetcher`
  - **Stealthy fetcher**: Annoying anti-bot protection? No problem! Scrapling can bypass almost all of them with `StealthyFetcher` with default configuration!
  - **Your preferred browser**: Use your real browser with CDP, [NSTbrowser](https://app.nstbrowser.io/r/1vO5e5)'s browserless, PlayWright with stealth mode, or even vanilla PlayWright - All is possible with `PlayWrightFetcher`!
@@ -213,7 +212,7 @@ Scrapling can find elements with more methods and it returns full element `Adapt
  > All benchmarks' results are an average of 100 runs. See our [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) for methodology and to run your comparisons.

  ## Installation
- Scrapling is a breeze to get started with - Starting from version 0.2, we require at least Python 3.8 to work.
+ Scrapling is a breeze to get started with - Starting from version 0.2.9, we require at least Python 3.9 to work.
  ```bash
  pip3 install scrapling
  ```
@@ -265,11 +264,11 @@ You might be slightly confused by now so let me clear things up. All fetcher-typ
  ```python
  from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
  ```
- All of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `storage`, `storage_args`, and `debug`, which are the same ones you give to the `Adaptor` class.
+ All of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `keep_cdata`, `storage`, and `storage_args`, which are the same ones you give to the `Adaptor` class.

  If you don't want to pass arguments to the generated `Adaptor` object and want to use the default values, you can use this import instead for cleaner code:
  ```python
- from scrapling.defaults import Fetcher, StealthyFetcher, PlayWrightFetcher
+ from scrapling.defaults import Fetcher, AsyncFetcher, StealthyFetcher, PlayWrightFetcher
  ```
  then use it right away without initializing like:
  ```python
@@ -282,21 +281,32 @@ Also, the `Response` object returned from all fetchers is the same as the `Adapt
  ### Fetcher
  This class is built on top of [httpx](https://www.python-httpx.org/) with additional configuration options, here you can do `GET`, `POST`, `PUT`, and `DELETE` requests.

- For all methods, you have `stealth_headers` which makes `Fetcher` create and use real browser's headers then create a referer header as if this request came from Google's search of this URL's domain. It's enabled by default.
+ For all methods, you have `stealthy_headers` which makes `Fetcher` create and use real browser's headers then create a referer header as if this request came from Google's search of this URL's domain. It's enabled by default. You can also set the number of retries with the argument `retries` for all methods and this will make httpx retry requests if it failed for any reason. The default number of retries for all `Fetcher` methods is 3.

  You can route all traffic (HTTP and HTTPS) to a proxy for any of these methods in this format `http://username:password@localhost:8030`
  ```python
- >> page = Fetcher().get('https://httpbin.org/get', stealth_headers=True, follow_redirects=True)
+ >> page = Fetcher().get('https://httpbin.org/get', stealthy_headers=True, follow_redirects=True)
  >> page = Fetcher().post('https://httpbin.org/post', data={'key': 'value'}, proxy='http://username:password@localhost:8030')
  >> page = Fetcher().put('https://httpbin.org/put', data={'key': 'value'})
  >> page = Fetcher().delete('https://httpbin.org/delete')
  ```
+ For Async requests, you will just replace the import like below:
+ ```python
+ >> from scrapling import AsyncFetcher
+ >> page = await AsyncFetcher().get('https://httpbin.org/get', stealthy_headers=True, follow_redirects=True)
+ >> page = await AsyncFetcher().post('https://httpbin.org/post', data={'key': 'value'}, proxy='http://username:password@localhost:8030')
+ >> page = await AsyncFetcher().put('https://httpbin.org/put', data={'key': 'value'})
+ >> page = await AsyncFetcher().delete('https://httpbin.org/delete')
+ ```
  ### StealthyFetcher
  This class is built on top of [Camoufox](https://github.com/daijro/camoufox), bypassing most anti-bot protections by default. Scrapling adds extra layers of flavors and configurations to increase performance and undetectability even further.
  ```python
  >> page = StealthyFetcher().fetch('https://www.browserscan.net/bot-detection') # Running headless by default
  >> page.status == 200
  True
+ >> page = await StealthyFetcher().async_fetch('https://www.browserscan.net/bot-detection') # the async version of fetch
+ >> page.status == 200
+ True
  ```
  > Note: all requests done by this fetcher are waiting by default for all JS to be fully loaded and executed so you don't have to :)

@@ -314,7 +324,8 @@ True
  | page_action | Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again. | ✔️ |
  | addons | List of Firefox addons to use. **Must be paths to extracted addons.** | ✔️ |
  | humanize | Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window. | ✔️ |
- | allow_webgl | Whether to allow WebGL. To prevent leaks, only use this for special cases. | ✔️ |
+ | allow_webgl | Enabled by default. Disabling it WebGL not recommended as many WAFs now checks if WebGL is enabled. | ✔️ |
+ | geoip | Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, & spoof the WebRTC IP address. It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region. | ✔️ |
  | disable_ads | Enabled by default, this installs `uBlock Origin` addon on the browser if enabled. | ✔️ |
  | network_idle | Wait for the page until there are no network connections for at least 500 ms. | ✔️ |
  | timeout | The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000. | ✔️ |
@@ -333,6 +344,9 @@ This class is built on top of [Playwright](https://playwright.dev/python/) which
  >> page = PlayWrightFetcher().fetch('https://www.google.com/search?q=%22Scrapling%22', disable_resources=True) # Vanilla Playwright option
  >> page.css_first("#search a::attr(href)")
  'https://github.com/D4Vinci/Scrapling'
+ >> page = await PlayWrightFetcher().async_fetch('https://www.google.com/search?q=%22Scrapling%22', disable_resources=True) # the async version of fetch
+ >> page.css_first("#search a::attr(href)")
+ 'https://github.com/D4Vinci/Scrapling'
  ```
  > Note: all requests done by this fetcher are waiting by default for all JS to be fully loaded and executed so you don't have to :)

@@ -437,6 +451,9 @@ You can select elements by their text content in multiple ways, here's a full ex
  >>> page.find_by_text('Tipping the Velvet') # Find the first element whose text fully matches this text
  <data='<a href="catalogue/tipping-the-velvet_99...' parent='<h3><a href="catalogue/tipping-the-velve...'>

+ >>> page.urljoin(page.find_by_text('Tipping the Velvet').attrib['href']) # We use `page.urljoin` to return the full URL from the relative `href`
+ 'https://books.toscrape.com/catalogue/tipping-the-velvet_999/index.html'
+
  >>> page.find_by_text('Tipping the Velvet', first_match=False) # Get all matches if there are more
  [<data='<a href="catalogue/tipping-the-velvet_99...' parent='<h3><a href="catalogue/tipping-the-velve...'>]

@@ -850,7 +867,6 @@ This project includes code adapted from:

  ## Known Issues
  - In the auto-matching save process, the unique properties of the first element from the selection results are the only ones that get saved. So if the selector you are using selects different elements on the page that are in different locations, auto-matching will probably return to you the first element only when you relocate it later. This doesn't include combined CSS selectors (Using commas to combine more than one selector for example) as these selectors get separated and each selector gets executed alone.
- - Currently, Scrapling is not compatible with async/await.

  ---
  <div align="center"><small>Designed & crafted with ❤️ by Karim Shoair.</small></div><br>
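
The headline change visible in PKG-INFO is async support: the new `AsyncFetcher` export, the dropped "not compatible with async/await" known issue, and the Python >=3.9 floor. Based only on the README snippets shown in the diff above, a minimal end-to-end sketch of the async API might look like this; the httpbin URL is just an example, and `retries` is mentioned rather than passed:

```python
import asyncio

from scrapling import AsyncFetcher


async def main():
    fetcher = AsyncFetcher()
    # `stealthy_headers` (renamed from `stealth_headers`) and `retries` are the options
    # described in the README changes above; per that text, retries defaults to 3.
    page = await fetcher.get('https://httpbin.org/get',
                             stealthy_headers=True, follow_redirects=True)
    print(page.status)  # the Response object exposes the HTTP status, as shown above


asyncio.run(main())
```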

README.md

@@ -6,7 +6,7 @@ Dealing with failing web scrapers due to anti-bot protections or website changes
  Scrapling is a high-performance, intelligent web scraping library for Python that automatically adapts to website changes while significantly outperforming popular alternatives. For both beginners and experts, Scrapling provides powerful features while maintaining simplicity.

  ```python
- >> from scrapling.defaults import Fetcher, StealthyFetcher, PlayWrightFetcher
+ >> from scrapling.defaults import Fetcher, AsyncFetcher, StealthyFetcher, PlayWrightFetcher
  # Fetch websites' source under the radar!
  >> page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True)
  >> print(page.status)
@@ -35,7 +35,7 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha

  ## Table of content
  * [Key Features](#key-features)
- * [Fetch websites as you prefer](#fetch-websites-as-you-prefer)
+ * [Fetch websites as you prefer](#fetch-websites-as-you-prefer-with-async-support)
  * [Adaptive Scraping](#adaptive-scraping)
  * [Performance](#performance)
  * [Developing Experience](#developing-experience)
@@ -76,7 +76,7 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha

  ## Key Features

- ### Fetch websites as you prefer
+ ### Fetch websites as you prefer with async support
  - **HTTP requests**: Stealthy and fast HTTP requests with `Fetcher`
  - **Stealthy fetcher**: Annoying anti-bot protection? No problem! Scrapling can bypass almost all of them with `StealthyFetcher` with default configuration!
  - **Your preferred browser**: Use your real browser with CDP, [NSTbrowser](https://app.nstbrowser.io/r/1vO5e5)'s browserless, PlayWright with stealth mode, or even vanilla PlayWright - All is possible with `PlayWrightFetcher`!
@@ -167,7 +167,7 @@ Scrapling can find elements with more methods and it returns full element `Adapt
  > All benchmarks' results are an average of 100 runs. See our [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) for methodology and to run your comparisons.

  ## Installation
- Scrapling is a breeze to get started with - Starting from version 0.2, we require at least Python 3.8 to work.
+ Scrapling is a breeze to get started with - Starting from version 0.2.9, we require at least Python 3.9 to work.
  ```bash
  pip3 install scrapling
  ```
@@ -219,11 +219,11 @@ You might be slightly confused by now so let me clear things up. All fetcher-typ
  ```python
  from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
  ```
- All of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `storage`, `storage_args`, and `debug`, which are the same ones you give to the `Adaptor` class.
+ All of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `keep_cdata`, `storage`, and `storage_args`, which are the same ones you give to the `Adaptor` class.

  If you don't want to pass arguments to the generated `Adaptor` object and want to use the default values, you can use this import instead for cleaner code:
  ```python
- from scrapling.defaults import Fetcher, StealthyFetcher, PlayWrightFetcher
+ from scrapling.defaults import Fetcher, AsyncFetcher, StealthyFetcher, PlayWrightFetcher
  ```
  then use it right away without initializing like:
  ```python
@@ -236,21 +236,32 @@ Also, the `Response` object returned from all fetchers is the same as the `Adapt
  ### Fetcher
  This class is built on top of [httpx](https://www.python-httpx.org/) with additional configuration options, here you can do `GET`, `POST`, `PUT`, and `DELETE` requests.

- For all methods, you have `stealth_headers` which makes `Fetcher` create and use real browser's headers then create a referer header as if this request came from Google's search of this URL's domain. It's enabled by default.
+ For all methods, you have `stealthy_headers` which makes `Fetcher` create and use real browser's headers then create a referer header as if this request came from Google's search of this URL's domain. It's enabled by default. You can also set the number of retries with the argument `retries` for all methods and this will make httpx retry requests if it failed for any reason. The default number of retries for all `Fetcher` methods is 3.

  You can route all traffic (HTTP and HTTPS) to a proxy for any of these methods in this format `http://username:password@localhost:8030`
  ```python
- >> page = Fetcher().get('https://httpbin.org/get', stealth_headers=True, follow_redirects=True)
+ >> page = Fetcher().get('https://httpbin.org/get', stealthy_headers=True, follow_redirects=True)
  >> page = Fetcher().post('https://httpbin.org/post', data={'key': 'value'}, proxy='http://username:password@localhost:8030')
  >> page = Fetcher().put('https://httpbin.org/put', data={'key': 'value'})
  >> page = Fetcher().delete('https://httpbin.org/delete')
  ```
+ For Async requests, you will just replace the import like below:
+ ```python
+ >> from scrapling import AsyncFetcher
+ >> page = await AsyncFetcher().get('https://httpbin.org/get', stealthy_headers=True, follow_redirects=True)
+ >> page = await AsyncFetcher().post('https://httpbin.org/post', data={'key': 'value'}, proxy='http://username:password@localhost:8030')
+ >> page = await AsyncFetcher().put('https://httpbin.org/put', data={'key': 'value'})
+ >> page = await AsyncFetcher().delete('https://httpbin.org/delete')
+ ```
  ### StealthyFetcher
  This class is built on top of [Camoufox](https://github.com/daijro/camoufox), bypassing most anti-bot protections by default. Scrapling adds extra layers of flavors and configurations to increase performance and undetectability even further.
  ```python
  >> page = StealthyFetcher().fetch('https://www.browserscan.net/bot-detection') # Running headless by default
  >> page.status == 200
  True
+ >> page = await StealthyFetcher().async_fetch('https://www.browserscan.net/bot-detection') # the async version of fetch
+ >> page.status == 200
+ True
  ```
  > Note: all requests done by this fetcher are waiting by default for all JS to be fully loaded and executed so you don't have to :)

@@ -268,7 +279,8 @@ True
  | page_action | Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again. | ✔️ |
  | addons | List of Firefox addons to use. **Must be paths to extracted addons.** | ✔️ |
  | humanize | Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window. | ✔️ |
- | allow_webgl | Whether to allow WebGL. To prevent leaks, only use this for special cases. | ✔️ |
+ | allow_webgl | Enabled by default. Disabling it WebGL not recommended as many WAFs now checks if WebGL is enabled. | ✔️ |
+ | geoip | Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, & spoof the WebRTC IP address. It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region. | ✔️ |
  | disable_ads | Enabled by default, this installs `uBlock Origin` addon on the browser if enabled. | ✔️ |
  | network_idle | Wait for the page until there are no network connections for at least 500 ms. | ✔️ |
  | timeout | The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000. | ✔️ |
@@ -287,6 +299,9 @@ This class is built on top of [Playwright](https://playwright.dev/python/) which
  >> page = PlayWrightFetcher().fetch('https://www.google.com/search?q=%22Scrapling%22', disable_resources=True) # Vanilla Playwright option
  >> page.css_first("#search a::attr(href)")
  'https://github.com/D4Vinci/Scrapling'
+ >> page = await PlayWrightFetcher().async_fetch('https://www.google.com/search?q=%22Scrapling%22', disable_resources=True) # the async version of fetch
+ >> page.css_first("#search a::attr(href)")
+ 'https://github.com/D4Vinci/Scrapling'
  ```
  > Note: all requests done by this fetcher are waiting by default for all JS to be fully loaded and executed so you don't have to :)

@@ -391,6 +406,9 @@ You can select elements by their text content in multiple ways, here's a full ex
  >>> page.find_by_text('Tipping the Velvet') # Find the first element whose text fully matches this text
  <data='<a href="catalogue/tipping-the-velvet_99...' parent='<h3><a href="catalogue/tipping-the-velve...'>

+ >>> page.urljoin(page.find_by_text('Tipping the Velvet').attrib['href']) # We use `page.urljoin` to return the full URL from the relative `href`
+ 'https://books.toscrape.com/catalogue/tipping-the-velvet_999/index.html'
+
  >>> page.find_by_text('Tipping the Velvet', first_match=False) # Get all matches if there are more
  [<data='<a href="catalogue/tipping-the-velvet_99...' parent='<h3><a href="catalogue/tipping-the-velve...'>]

@@ -804,7 +822,6 @@ This project includes code adapted from:

  ## Known Issues
  - In the auto-matching save process, the unique properties of the first element from the selection results are the only ones that get saved. So if the selector you are using selects different elements on the page that are in different locations, auto-matching will probably return to you the first element only when you relocate it later. This doesn't include combined CSS selectors (Using commas to combine more than one selector for example) as these selectors get separated and each selector gets executed alone.
- - Currently, Scrapling is not compatible with async/await.

  ---
  <div align="center"><small>Designed & crafted with ❤️ by Karim Shoair.</small></div><br>
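
The new `geoip` option documented in the StealthyFetcher table is mainly useful together with a proxy. A hedged sketch: `geoip=True` and `async_fetch` come straight from the diff above, while the proxy URL is a placeholder and the assumption that `async_fetch` accepts a `proxy` argument in the same `http://user:pass@host:port` format as `Fetcher` is not confirmed by this diff:

```python
import asyncio

from scrapling import StealthyFetcher


async def main():
    fetcher = StealthyFetcher()
    page = await fetcher.async_fetch(
        'https://www.browserscan.net/bot-detection',
        geoip=True,  # spoof longitude/latitude, timezone, locale and the WebRTC IP to match the exit IP
        proxy='http://username:password@localhost:8030',  # placeholder; assumed to be supported here
    )
    print(page.status)


asyncio.run(main())
```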

scrapling/__init__.py

@@ -1,12 +1,12 @@
  # Declare top-level shortcuts
  from scrapling.core.custom_types import AttributesHandler, TextHandler
- from scrapling.fetchers import (CustomFetcher, Fetcher, PlayWrightFetcher,
-                                 StealthyFetcher)
+ from scrapling.fetchers import (AsyncFetcher, CustomFetcher, Fetcher,
+                                 PlayWrightFetcher, StealthyFetcher)
  from scrapling.parser import Adaptor, Adaptors

  __author__ = "Karim Shoair (karim.shoair@pm.me)"
- __version__ = "0.2.8"
+ __version__ = "0.2.9"
  __copyright__ = "Copyright (c) 2024 Karim Shoair"


- __all__ = ['Adaptor', 'Fetcher', 'StealthyFetcher', 'PlayWrightFetcher']
+ __all__ = ['Adaptor', 'Fetcher', 'AsyncFetcher', 'StealthyFetcher', 'PlayWrightFetcher']
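
A quick sanity check after upgrading, using only names that appear in this `__init__.py` diff:

```python
import scrapling

print(scrapling.__version__)                # '0.2.9' per the diff above
print('AsyncFetcher' in scrapling.__all__)  # True - AsyncFetcher is now exported at the top level
```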

scrapling/core/custom_types.py

@@ -14,11 +14,70 @@ class TextHandler(str):
      __slots__ = ()

      def __new__(cls, string):
-         # Because str is immutable and we can't override __init__
-         if type(string) is str:
+         if isinstance(string, str):
              return super().__new__(cls, string)
-         else:
-             return super().__new__(cls, '')
+         return super().__new__(cls, '')
+
+     # Make methods from original `str` class return `TextHandler` instead of returning `str` again
+     # Of course, this stupid workaround is only so we can keep the auto-completion working without issues in your IDE
+     # and I made sonnet write it for me :)
+     def strip(self, chars=None):
+         return TextHandler(super().strip(chars))
+
+     def lstrip(self, chars=None):
+         return TextHandler(super().lstrip(chars))
+
+     def rstrip(self, chars=None):
+         return TextHandler(super().rstrip(chars))
+
+     def capitalize(self):
+         return TextHandler(super().capitalize())
+
+     def casefold(self):
+         return TextHandler(super().casefold())
+
+     def center(self, width, fillchar=' '):
+         return TextHandler(super().center(width, fillchar))
+
+     def expandtabs(self, tabsize=8):
+         return TextHandler(super().expandtabs(tabsize))
+
+     def format(self, *args, **kwargs):
+         return TextHandler(super().format(*args, **kwargs))
+
+     def format_map(self, mapping):
+         return TextHandler(super().format_map(mapping))
+
+     def join(self, iterable):
+         return TextHandler(super().join(iterable))
+
+     def ljust(self, width, fillchar=' '):
+         return TextHandler(super().ljust(width, fillchar))
+
+     def rjust(self, width, fillchar=' '):
+         return TextHandler(super().rjust(width, fillchar))
+
+     def swapcase(self):
+         return TextHandler(super().swapcase())
+
+     def title(self):
+         return TextHandler(super().title())
+
+     def translate(self, table):
+         return TextHandler(super().translate(table))
+
+     def zfill(self, width):
+         return TextHandler(super().zfill(width))
+
+     def replace(self, old, new, count=-1):
+         return TextHandler(super().replace(old, new, count))
+
+     def upper(self):
+         return TextHandler(super().upper())
+
+     def lower(self):
+         return TextHandler(super().lower())
+     ##############

      def sort(self, reverse: bool = False) -> str:
          """Return a sorted version of the string"""
@@ -30,11 +89,21 @@ class TextHandler(str):
          data = re.sub(' +', ' ', data)
          return self.__class__(data.strip())

+     # For easy copy-paste from Scrapy/parsel code when needed :)
+     def get(self, default=None):
+         return self
+
+     def get_all(self):
+         return self
+
+     extract = get_all
+     extract_first = get
+
      def json(self) -> Dict:
          """Return json response if the response is jsonable otherwise throw error"""
-         # Using __str__ function as a workaround for orjson issue with subclasses of str
+         # Using str function as a workaround for orjson issue with subclasses of str
          # Check this out: https://github.com/ijl/orjson/issues/445
-         return loads(self.__str__())
+         return loads(str(self))

      def re(
              self, regex: Union[str, Pattern[str]], replace_entities: bool = True, clean_match: bool = False,
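
All of these wrappers follow one pattern: the inherited `str` methods return plain `str`, so each one is re-wrapped to keep returning `TextHandler` across chained calls, and the parsel-style `get`/`extract` aliases let copied Scrapy code read naturally (the `TextHandlers` list gains matching aliases in the next hunk). A self-contained sketch of the same idea with an illustrative class name, not Scrapling's actual code:

```python
class WrappedText(str):
    """Minimal illustration of the TextHandler pattern above."""
    __slots__ = ()

    def __new__(cls, string):
        # Fall back to an empty string for non-str input, as the diff does
        return super().__new__(cls, string if isinstance(string, str) else '')

    # Re-wrap inherited methods so the subclass type survives chaining
    def strip(self, chars=None):
        return WrappedText(super().strip(chars))

    def upper(self):
        return WrappedText(super().upper())

    # parsel/Scrapy-style alias
    def get(self, default=None):
        return self

    extract_first = get


text = WrappedText('  tipping the velvet  ')
print(type(text.strip().upper()).__name__)  # 'WrappedText' rather than plain 'str'
print(text.strip().get())                   # 'tipping the velvet'
```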
@@ -127,6 +196,19 @@ class TextHandlers(List[TextHandler]):
                  return result
          return default

+     # For easy copy-paste from Scrapy/parsel code when needed :)
+     def get(self, default=None):
+         """Returns the first item of the current list
+         :param default: the default value to return if the current list is empty
+         """
+         return self[0] if len(self) > 0 else default
+
+     def extract(self):
+         return self
+
+     extract_first = get
+     get_all = extract
+

  class AttributesHandler(Mapping):
      """A read-only mapping to use instead of the standard dictionary for the speed boost but at the same time I use it to add more functionalities.

scrapling/core/storage_adaptors.py

@@ -1,4 +1,3 @@
- import logging
  import sqlite3
  import threading
  from abc import ABC, abstractmethod
@@ -9,7 +8,7 @@ from lxml import html
  from tldextract import extract as tld

  from scrapling.core._types import Dict, Optional, Union
- from scrapling.core.utils import _StorageTools, cache
+ from scrapling.core.utils import _StorageTools, log, lru_cache


  class StorageSystemMixin(ABC):
@@ -20,7 +19,7 @@ class StorageSystemMixin(ABC):
          """
          self.url = url

-     @cache(None, typed=True)
+     @lru_cache(None, typed=True)
      def _get_base_url(self, default_value: str = 'default') -> str:
          if not self.url or type(self.url) is not str:
              return default_value
@@ -52,7 +51,7 @@ class StorageSystemMixin(ABC):
          raise NotImplementedError('Storage system must implement `save` method')

      @staticmethod
-     @cache(None, typed=True)
+     @lru_cache(None, typed=True)
      def _get_hash(identifier: str) -> str:
          """If you want to hash identifier in your storage system, use this safer"""
          identifier = identifier.lower().strip()
@@ -64,7 +63,7 @@ class StorageSystemMixin(ABC):
          return f"{hash_value}_{len(identifier)}"  # Length to reduce collision chance


- @cache(None, typed=True)
+ @lru_cache(None, typed=True)
  class SQLiteStorageSystem(StorageSystemMixin):
      """The recommended system to use, it's race condition safe and thread safe.
      Mainly built so the library can run in threaded frameworks like scrapy or threaded tools
@@ -86,7 +85,7 @@ class SQLiteStorageSystem(StorageSystemMixin):
          self.connection.execute("PRAGMA journal_mode=WAL")
          self.cursor = self.connection.cursor()
          self._setup_database()
-         logging.debug(
+         log.debug(
              f'Storage system loaded with arguments (storage_file="{storage_file}", url="{url}")'
          )

scrapling/core/translator.py

@@ -17,7 +17,7 @@ from cssselect.xpath import XPathExpr as OriginalXPathExpr
  from w3lib.html import HTML5_WHITESPACE

  from scrapling.core._types import Any, Optional, Protocol, Self
- from scrapling.core.utils import cache
+ from scrapling.core.utils import lru_cache

  regex = f"[{HTML5_WHITESPACE}]+"
  replace_html5_whitespaces = re.compile(regex).sub
@@ -139,6 +139,6 @@ class TranslatorMixin:


  class HTMLTranslator(TranslatorMixin, OriginalHTMLTranslator):
-     @cache(maxsize=256)
+     @lru_cache(maxsize=256)
      def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str:
          return super().css_to_xpath(css, prefix)
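
`css_to_xpath` keeps its memoization; only the imported name changes (`cache` → `lru_cache`). For illustration, the same caching idea applied directly to cssselect (one of Scrapling's dependencies) outside the library; the selector is arbitrary:

```python
from functools import lru_cache

from cssselect import HTMLTranslator

_translator = HTMLTranslator()


@lru_cache(maxsize=256)
def css_to_xpath(css: str) -> str:
    # Repeated selectors skip re-parsing, mirroring the decorated method in the diff
    return _translator.css_to_xpath(css)


print(css_to_xpath('h3 > a'))
print(css_to_xpath('h3 > a'))
print(css_to_xpath.cache_info())  # the second call should show up as a cache hit
```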

scrapling/core/utils.py

@@ -9,17 +9,36 @@ from scrapling.core._types import Any, Dict, Iterable, Union

  # Using cache on top of a class is brilliant way to achieve Singleton design pattern without much code
  # functools.cache is available on Python 3.9+ only so let's keep lru_cache
- from functools import lru_cache as cache  # isort:skip
-
+ from functools import lru_cache  # isort:skip

  html_forbidden = {html.HtmlComment, }
- logging.basicConfig(
-     level=logging.ERROR,
-     format='%(asctime)s - %(levelname)s - %(message)s',
-     handlers=[
-         logging.StreamHandler()
-     ]
- )
+
+
+ @lru_cache(1, typed=True)
+ def setup_logger():
+     """Create and configure a logger with a standard format.
+
+     :returns: logging.Logger: Configured logger instance
+     """
+     logger = logging.getLogger('scrapling')
+     logger.setLevel(logging.INFO)
+
+     formatter = logging.Formatter(
+         fmt="[%(asctime)s] %(levelname)s: %(message)s",
+         datefmt="%Y-%m-%d %H:%M:%S"
+     )
+
+     console_handler = logging.StreamHandler()
+     console_handler.setFormatter(formatter)
+
+     # Add handler to logger (if not already added)
+     if not logger.handlers:
+         logger.addHandler(console_handler)
+
+     return logger
+
+
+ log = setup_logger()


  def is_jsonable(content: Union[bytes, str]) -> bool:
@@ -33,23 +52,6 @@ def is_jsonable(content: Union[bytes, str]) -> bool:
          return False


- @cache(None, typed=True)
- def setup_basic_logging(level: str = 'debug'):
-     levels = {
-         'debug': logging.DEBUG,
-         'info': logging.INFO,
-         'warning': logging.WARNING,
-         'error': logging.ERROR,
-         'critical': logging.CRITICAL
-     }
-     formatter = logging.Formatter("[%(asctime)s] %(levelname)s: %(message)s", "%Y-%m-%d %H:%M:%S")
-     lvl = levels[level.lower()]
-     handler = logging.StreamHandler()
-     handler.setFormatter(formatter)
-     # Configure the root logger
-     logging.basicConfig(level=lvl, handlers=[handler])
-
-
  def flatten(lst: Iterable):
      return list(chain.from_iterable(lst))

@@ -113,7 +115,7 @@ class _StorageTools:
  # return _impl


- @cache(None, typed=True)
+ @lru_cache(None, typed=True)
  def clean_spaces(string):
      string = string.replace('\t', ' ')
      string = re.sub('[\n|\r]', '', string)
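
Practically, replacing `logging.basicConfig` on the root logger with a dedicated `scrapling` logger means applications can tune the library's verbosity without touching their own handlers. A small sketch, assuming scrapling 0.2.9 is installed:

```python
import logging

from scrapling.core.utils import log  # the module-level logger created by setup_logger() above

# Scrapling now logs through the named 'scrapling' logger instead of the root logger,
# so its level can be adjusted independently of the application's own logging setup:
logging.getLogger('scrapling').setLevel(logging.DEBUG)
log.debug('visible now that the scrapling logger level is DEBUG')
```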

scrapling/defaults.py

@@ -1,6 +1,7 @@
- from .fetchers import Fetcher, PlayWrightFetcher, StealthyFetcher
+ from .fetchers import AsyncFetcher, Fetcher, PlayWrightFetcher, StealthyFetcher

  # If you are going to use Fetchers with the default settings, import them from this file instead for a cleaner looking code
  Fetcher = Fetcher()
+ AsyncFetcher = AsyncFetcher()
  StealthyFetcher = StealthyFetcher()
  PlayWrightFetcher = PlayWrightFetcher()
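
As the README section earlier in this diff notes, everything exported from `scrapling.defaults` is already an instance, so it can be used without instantiating anything; the URLs below are placeholders:

```python
from scrapling.defaults import AsyncFetcher, Fetcher, StealthyFetcher

# No Fetcher() / StealthyFetcher() calls needed - these names are pre-built instances
page = Fetcher.get('https://quotes.toscrape.com/')
print(page.status)

stealth_page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True)
print(stealth_page.status)

# AsyncFetcher from defaults works the same way inside a coroutine:
#     page = await AsyncFetcher.get('https://quotes.toscrape.com/')
```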