scrapling 0.2.6__tar.gz → 0.2.8__tar.gz

Files changed (50)
  1. {scrapling-0.2.6/scrapling.egg-info → scrapling-0.2.8}/PKG-INFO +28 -21
  2. {scrapling-0.2.6 → scrapling-0.2.8}/README.md +25 -18
  3. {scrapling-0.2.6 → scrapling-0.2.8}/scrapling/__init__.py +4 -3
  4. {scrapling-0.2.6 → scrapling-0.2.8}/scrapling/core/_types.py +2 -3
  5. {scrapling-0.2.6 → scrapling-0.2.8}/scrapling/core/custom_types.py +5 -5
  6. {scrapling-0.2.6 → scrapling-0.2.8}/scrapling/core/translator.py +5 -6
  7. {scrapling-0.2.6 → scrapling-0.2.8}/scrapling/core/utils.py +15 -12
  8. {scrapling-0.2.6 → scrapling-0.2.8}/scrapling/defaults.py +1 -1
  9. {scrapling-0.2.6 → scrapling-0.2.8}/scrapling/engines/camo.py +20 -13
  10. {scrapling-0.2.6 → scrapling-0.2.8}/scrapling/engines/constants.py +1 -1
  11. {scrapling-0.2.6 → scrapling-0.2.8}/scrapling/engines/pw.py +31 -18
  12. {scrapling-0.2.6 → scrapling-0.2.8}/scrapling/engines/static.py +24 -11
  13. scrapling-0.2.8/scrapling/engines/toolbelt/__init__.py +6 -0
  14. {scrapling-0.2.6 → scrapling-0.2.8}/scrapling/engines/toolbelt/custom.py +15 -10
  15. {scrapling-0.2.6 → scrapling-0.2.8}/scrapling/engines/toolbelt/fingerprints.py +5 -5
  16. {scrapling-0.2.6 → scrapling-0.2.8}/scrapling/engines/toolbelt/navigation.py +6 -6
  17. {scrapling-0.2.6 → scrapling-0.2.8}/scrapling/fetchers.py +23 -14
  18. {scrapling-0.2.6 → scrapling-0.2.8}/scrapling/parser.py +15 -8
  19. {scrapling-0.2.6 → scrapling-0.2.8/scrapling.egg-info}/PKG-INFO +28 -21
  20. {scrapling-0.2.6 → scrapling-0.2.8}/scrapling.egg-info/requires.txt +1 -1
  21. {scrapling-0.2.6 → scrapling-0.2.8}/setup.cfg +1 -1
  22. {scrapling-0.2.6 → scrapling-0.2.8}/setup.py +6 -6
  23. {scrapling-0.2.6 → scrapling-0.2.8}/tests/fetchers/test_camoufox.py +1 -0
  24. {scrapling-0.2.6 → scrapling-0.2.8}/tests/fetchers/test_httpx.py +1 -0
  25. {scrapling-0.2.6 → scrapling-0.2.8}/tests/fetchers/test_playwright.py +1 -0
  26. {scrapling-0.2.6 → scrapling-0.2.8}/tests/parser/test_general.py +3 -1
  27. scrapling-0.2.6/scrapling/engines/toolbelt/__init__.py +0 -20
  28. {scrapling-0.2.6 → scrapling-0.2.8}/LICENSE +0 -0
  29. {scrapling-0.2.6 → scrapling-0.2.8}/MANIFEST.in +0 -0
  30. {scrapling-0.2.6 → scrapling-0.2.8}/scrapling/core/__init__.py +0 -0
  31. {scrapling-0.2.6 → scrapling-0.2.8}/scrapling/core/mixins.py +0 -0
  32. {scrapling-0.2.6 → scrapling-0.2.8}/scrapling/core/storage_adaptors.py +6 -6
  33. {scrapling-0.2.6 → scrapling-0.2.8}/scrapling/engines/__init__.py +2 -2
  34. {scrapling-0.2.6 → scrapling-0.2.8}/scrapling/engines/toolbelt/bypasses/navigator_plugins.js +0 -0
  35. {scrapling-0.2.6 → scrapling-0.2.8}/scrapling/engines/toolbelt/bypasses/notification_permission.js +0 -0
  36. {scrapling-0.2.6 → scrapling-0.2.8}/scrapling/engines/toolbelt/bypasses/pdf_viewer.js +0 -0
  37. {scrapling-0.2.6 → scrapling-0.2.8}/scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +0 -0
  38. {scrapling-0.2.6 → scrapling-0.2.8}/scrapling/engines/toolbelt/bypasses/screen_props.js +0 -0
  39. {scrapling-0.2.6 → scrapling-0.2.8}/scrapling/engines/toolbelt/bypasses/webdriver_fully.js +0 -0
  40. {scrapling-0.2.6 → scrapling-0.2.8}/scrapling/engines/toolbelt/bypasses/window_chrome.js +0 -0
  41. {scrapling-0.2.6 → scrapling-0.2.8}/scrapling/py.typed +0 -0
  42. {scrapling-0.2.6 → scrapling-0.2.8}/scrapling.egg-info/SOURCES.txt +0 -0
  43. {scrapling-0.2.6 → scrapling-0.2.8}/scrapling.egg-info/dependency_links.txt +0 -0
  44. {scrapling-0.2.6 → scrapling-0.2.8}/scrapling.egg-info/not-zip-safe +0 -0
  45. {scrapling-0.2.6 → scrapling-0.2.8}/scrapling.egg-info/top_level.txt +0 -0
  46. {scrapling-0.2.6 → scrapling-0.2.8}/tests/__init__.py +0 -0
  47. {scrapling-0.2.6 → scrapling-0.2.8}/tests/fetchers/__init__.py +0 -0
  48. {scrapling-0.2.6 → scrapling-0.2.8}/tests/fetchers/test_utils.py +0 -0
  49. {scrapling-0.2.6 → scrapling-0.2.8}/tests/parser/__init__.py +0 -0
  50. {scrapling-0.2.6 → scrapling-0.2.8}/tests/parser/test_automatch.py +0 -0
@@ -1,7 +1,7 @@
  Metadata-Version: 2.1
  Name: scrapling
- Version: 0.2.6
- Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
+ Version: 0.2.8
+ Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
  Home-page: https://github.com/D4Vinci/Scrapling
  Author: Karim Shoair
  Author-email: karim.shoair@pm.me
@@ -41,7 +41,7 @@ Requires-Dist: tldextract
  Requires-Dist: httpx[brotli,zstd]
  Requires-Dist: playwright==1.48
  Requires-Dist: rebrowser-playwright
- Requires-Dist: camoufox>=0.3.10
+ Requires-Dist: camoufox>=0.4.4
  Requires-Dist: browserforge

  # 🕷️ Scrapling: Undetectable, Lightning-Fast, and Adaptive Web Scraping for Python
@@ -52,7 +52,7 @@ Dealing with failing web scrapers due to anti-bot protections or website changes
  Scrapling is a high-performance, intelligent web scraping library for Python that automatically adapts to website changes while significantly outperforming popular alternatives. For both beginners and experts, Scrapling provides powerful features while maintaining simplicity.

  ```python
- >> from scrapling.default import Fetcher, StealthyFetcher, PlayWrightFetcher
+ >> from scrapling.defaults import Fetcher, StealthyFetcher, PlayWrightFetcher
  # Fetch websites' source under the radar!
  >> page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True)
  >> print(page.status)
@@ -90,10 +90,11 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha
  * [Text Extraction Speed Test (5000 nested elements).](#text-extraction-speed-test-5000-nested-elements)
  * [Extraction By Text Speed Test](#extraction-by-text-speed-test)
  * [Installation](#installation)
- * [Fetching Websites Features](#fetching-websites-features)
- * [Fetcher](#fetcher)
- * [StealthyFetcher](#stealthyfetcher)
- * [PlayWrightFetcher](#playwrightfetcher)
+ * [Fetching Websites](#fetching-websites)
+ * [Features](#features)
+ * [Fetcher class](#fetcher)
+ * [StealthyFetcher class](#stealthyfetcher)
+ * [PlayWrightFetcher class](#playwrightfetcher)
  * [Advanced Parsing Features](#advanced-parsing-features)
  * [Smart Navigation](#smart-navigation)
  * [Content-based Selection & Finding Similar Elements](#content-based-selection--finding-similar-elements)
@@ -256,43 +257,48 @@ playwright install chromium
  python -m browserforge update
  ```

- ## Fetching Websites Features
- You might be a little bit confused by now so let me clear things up. All fetcher-type classes are imported in the same way
+ ## Fetching Websites
+ Fetchers are basically interfaces that do requests or fetch pages for you in a single request fashion and then return an `Adaptor` object for you. This feature was introduced because the only option we had before was to fetch the page as you wanted it, then pass it manually to the `Adaptor` class to create an `Adaptor` instance and start playing around with the page.
+
+ ### Features
+ You might be slightly confused by now so let me clear things up. All fetcher-type classes are imported in the same way
  ```python
  from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
  ```
- And all of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `storage`, `storage_args`, and `debug` which are the same ones you give to the `Adaptor` class.
+ All of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `storage`, `storage_args`, and `debug`, which are the same ones you give to the `Adaptor` class.

  If you don't want to pass arguments to the generated `Adaptor` object and want to use the default values, you can use this import instead for cleaner code:
  ```python
- from scrapling.default import Fetcher, StealthyFetcher, PlayWrightFetcher
+ from scrapling.defaults import Fetcher, StealthyFetcher, PlayWrightFetcher
  ```
  then use it right away without initializing like:
  ```python
  page = StealthyFetcher.fetch('https://example.com')
  ```

- Also, the `Response` object returned from all fetchers is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`. All `cookies`, `headers`, and `request_headers` are always of type `dictionary`.
+ Also, the `Response` object returned from all fetchers is the same as the `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`. All `cookies`, `headers`, and `request_headers` are always of type `dictionary`.
  > [!NOTE]
  > The `auto_match` argument is enabled by default which is the one you should care about the most as you will see later.
  ### Fetcher
  This class is built on top of [httpx](https://www.python-httpx.org/) with additional configuration options, here you can do `GET`, `POST`, `PUT`, and `DELETE` requests.

  For all methods, you have `stealth_headers` which makes `Fetcher` create and use real browser's headers then create a referer header as if this request came from Google's search of this URL's domain. It's enabled by default.
+
+ You can route all traffic (HTTP and HTTPS) to a proxy for any of these methods in this format `http://username:password@localhost:8030`
  ```python
  >> page = Fetcher().get('https://httpbin.org/get', stealth_headers=True, follow_redirects=True)
- >> page = Fetcher().post('https://httpbin.org/post', data={'key': 'value'})
+ >> page = Fetcher().post('https://httpbin.org/post', data={'key': 'value'}, proxy='http://username:password@localhost:8030')
  >> page = Fetcher().put('https://httpbin.org/put', data={'key': 'value'})
  >> page = Fetcher().delete('https://httpbin.org/delete')
  ```
  ### StealthyFetcher
- This class is built on top of [Camoufox](https://github.com/daijro/camoufox) which by default bypasses most of the anti-bot protections. Scrapling adds extra layers of flavors and configurations to increase performance and undetectability even further.
+ This class is built on top of [Camoufox](https://github.com/daijro/camoufox), bypassing most anti-bot protections by default. Scrapling adds extra layers of flavors and configurations to increase performance and undetectability even further.
  ```python
  >> page = StealthyFetcher().fetch('https://www.browserscan.net/bot-detection') # Running headless by default
  >> page.status == 200
  True
  ```
- > Note: all requests done by this fetcher is waiting by default for all JS to be fully loaded and executed so you don't have to :)
+ > Note: all requests done by this fetcher are waiting by default for all JS to be fully loaded and executed so you don't have to :)

  <details><summary><strong>For the sake of simplicity, expand this for the complete list of arguments</strong></summary>
 
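For reference, a minimal sketch of the two usage changes this hunk documents. The `scrapling.default` module is now `scrapling.defaults`, and `Fetcher` methods accept a `proxy` argument; the credentials below are placeholders:

```python
# Sketch only: renamed defaults import plus the new proxy argument.
from scrapling.defaults import Fetcher  # pre-initialized; was scrapling.default before 0.2.8

# Route the request through an authenticated proxy (placeholder URL)
page = Fetcher.post(
    'https://httpbin.org/post',
    data={'key': 'value'},
    proxy='http://username:password@localhost:8030',
)
print(page.status, page.reason)    # extra attributes Response adds on top of Adaptor
print(page.cookies, page.headers)  # always plain dictionaries
```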
@@ -309,6 +315,7 @@ True
  | addons | List of Firefox addons to use. **Must be paths to extracted addons.** | ✔️ |
  | humanize | Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window. | ✔️ |
  | allow_webgl | Whether to allow WebGL. To prevent leaks, only use this for special cases. | ✔️ |
+ | disable_ads | Enabled by default, this installs `uBlock Origin` addon on the browser if enabled. | ✔️ |
  | network_idle | Wait for the page until there are no network connections for at least 500 ms. | ✔️ |
  | timeout | The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000. | ✔️ |
  | wait_selector | Wait for a specific css selector to be in a specific state. | ✔️ |
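A short sketch of the new `disable_ads` flag from the table above; it is on by default, so passing it explicitly only matters when turning it off:

```python
# Sketch only: disable_ads controls whether the uBlock Origin addon is installed.
from scrapling.defaults import StealthyFetcher

page = StealthyFetcher.fetch(
    'https://www.browserscan.net/bot-detection',
    disable_ads=False,  # keep ads; omit the argument (default True) to block them
    network_idle=True,
)
print(page.status)
```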
@@ -327,7 +334,7 @@ This class is built on top of [Playwright](https://playwright.dev/python/) which
  >> page.css_first("#search a::attr(href)")
  'https://github.com/D4Vinci/Scrapling'
  ```
- > Note: all requests done by this fetcher is waiting by default for all JS to be fully loaded and executed so you don't have to :)
+ > Note: all requests done by this fetcher are waiting by default for all JS to be fully loaded and executed so you don't have to :)

  Using this Fetcher class, you can make requests with:
  1) Vanilla Playwright without any modifications other than the ones you chose.
@@ -339,7 +346,7 @@ Using this Fetcher class, you can make requests with:
  3) Real browsers by passing the `real_chrome` argument or the CDP URL of your browser to be controlled by the Fetcher and most of the options can be enabled on it.
  4) [NSTBrowser](https://app.nstbrowser.io/r/1vO5e5)'s [docker browserless](https://hub.docker.com/r/nstbrowser/browserless) option by passing the CDP URL and enabling `nstbrowser_mode` option.

- > Hence using the `real_chrome` argument requires that you have chrome browser installed on your device
+ > Hence using the `real_chrome` argument requires that you have Chrome browser installed on your device

  Add that to a lot of controlling/hiding options as you will see in the arguments list below.

@@ -362,7 +369,8 @@ Add that to a lot of controlling/hiding options as you will see in the arguments
  | hide_canvas | Add random noise to canvas operations to prevent fingerprinting. | ✔️ |
  | disable_webgl | Disables WebGL and WebGL 2.0 support entirely. | ✔️ |
  | stealth | Enables stealth mode, always check the documentation to see what stealth mode does currently. | ✔️ |
- | real_chrome | If you have chrome browser installed on your device, enable this and the Fetcher will launch an instance of your browser and use it. | ✔️ |
+ | real_chrome | If you have Chrome browser installed on your device, enable this and the Fetcher will launch an instance of your browser and use it. | ✔️ |
+ | locale | Set the locale for the browser if wanted. The default value is `en-US`. | ✔️ |
  | cdp_url | Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP. | ✔️ |
  | nstbrowser_mode | Enables NSTBrowser mode, **it have to be used with `cdp_url` argument or it will get completely ignored.** | ✔️ |
  | nstbrowser_config | The config you want to send with requests to the NSTBrowser. _If left empty, Scrapling defaults to an optimized NSTBrowser's docker browserless config._ | ✔️ |
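A sketch of the new `locale` option alongside `real_chrome`, both from the table above; it assumes Chrome is installed locally:

```python
# Sketch only: locale is new in 0.2.8; real_chrome launches the locally installed Chrome.
from scrapling.defaults import PlayWrightFetcher

page = PlayWrightFetcher.fetch(
    'https://www.google.com/search?q=%22Scrapling%22',
    locale='en-US',    # the default value per the table above
    real_chrome=True,  # requires Chrome on the device
)
print(page.css_first("#search a::attr(href)"))
```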
@@ -814,8 +822,7 @@ Of course, you can find elements by text/regex, find similar elements in a more
  Yes, Scrapling instances are thread-safe. Each Adaptor instance maintains its state.

  ## More Sponsors!
- [![Capsolver Banner](https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/CapSolver.png)](https://www.capsolver.com/?utm_source=github&utm_medium=repo&utm_campaign=scraping&utm_term=Scrapling)
- <a href="https://serpapi.com/?utm_source=scrapling"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png" height="500" width="500" alt="SerpApi Banner" ></a>
+ <a href="https://serpapi.com/?utm_source=scrapling"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png" height="500" alt="SerpApi Banner" ></a>


  ## Contributing
@@ -6,7 +6,7 @@ Dealing with failing web scrapers due to anti-bot protections or website changes
  Scrapling is a high-performance, intelligent web scraping library for Python that automatically adapts to website changes while significantly outperforming popular alternatives. For both beginners and experts, Scrapling provides powerful features while maintaining simplicity.

  ```python
- >> from scrapling.default import Fetcher, StealthyFetcher, PlayWrightFetcher
+ >> from scrapling.defaults import Fetcher, StealthyFetcher, PlayWrightFetcher
  # Fetch websites' source under the radar!
  >> page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True)
  >> print(page.status)
@@ -44,10 +44,11 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha
  * [Text Extraction Speed Test (5000 nested elements).](#text-extraction-speed-test-5000-nested-elements)
  * [Extraction By Text Speed Test](#extraction-by-text-speed-test)
  * [Installation](#installation)
- * [Fetching Websites Features](#fetching-websites-features)
- * [Fetcher](#fetcher)
- * [StealthyFetcher](#stealthyfetcher)
- * [PlayWrightFetcher](#playwrightfetcher)
+ * [Fetching Websites](#fetching-websites)
+ * [Features](#features)
+ * [Fetcher class](#fetcher)
+ * [StealthyFetcher class](#stealthyfetcher)
+ * [PlayWrightFetcher class](#playwrightfetcher)
  * [Advanced Parsing Features](#advanced-parsing-features)
  * [Smart Navigation](#smart-navigation)
  * [Content-based Selection & Finding Similar Elements](#content-based-selection--finding-similar-elements)
@@ -210,43 +211,48 @@ playwright install chromium
  python -m browserforge update
  ```

- ## Fetching Websites Features
- You might be a little bit confused by now so let me clear things up. All fetcher-type classes are imported in the same way
+ ## Fetching Websites
+ Fetchers are basically interfaces that do requests or fetch pages for you in a single request fashion and then return an `Adaptor` object for you. This feature was introduced because the only option we had before was to fetch the page as you wanted it, then pass it manually to the `Adaptor` class to create an `Adaptor` instance and start playing around with the page.
+
+ ### Features
+ You might be slightly confused by now so let me clear things up. All fetcher-type classes are imported in the same way
  ```python
  from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
  ```
- And all of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `storage`, `storage_args`, and `debug` which are the same ones you give to the `Adaptor` class.
+ All of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `storage`, `storage_args`, and `debug`, which are the same ones you give to the `Adaptor` class.

  If you don't want to pass arguments to the generated `Adaptor` object and want to use the default values, you can use this import instead for cleaner code:
  ```python
- from scrapling.default import Fetcher, StealthyFetcher, PlayWrightFetcher
+ from scrapling.defaults import Fetcher, StealthyFetcher, PlayWrightFetcher
  ```
  then use it right away without initializing like:
  ```python
  page = StealthyFetcher.fetch('https://example.com')
  ```

- Also, the `Response` object returned from all fetchers is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`. All `cookies`, `headers`, and `request_headers` are always of type `dictionary`.
+ Also, the `Response` object returned from all fetchers is the same as the `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`. All `cookies`, `headers`, and `request_headers` are always of type `dictionary`.
  > [!NOTE]
  > The `auto_match` argument is enabled by default which is the one you should care about the most as you will see later.
  ### Fetcher
  This class is built on top of [httpx](https://www.python-httpx.org/) with additional configuration options, here you can do `GET`, `POST`, `PUT`, and `DELETE` requests.

  For all methods, you have `stealth_headers` which makes `Fetcher` create and use real browser's headers then create a referer header as if this request came from Google's search of this URL's domain. It's enabled by default.
+
+ You can route all traffic (HTTP and HTTPS) to a proxy for any of these methods in this format `http://username:password@localhost:8030`
  ```python
  >> page = Fetcher().get('https://httpbin.org/get', stealth_headers=True, follow_redirects=True)
- >> page = Fetcher().post('https://httpbin.org/post', data={'key': 'value'})
+ >> page = Fetcher().post('https://httpbin.org/post', data={'key': 'value'}, proxy='http://username:password@localhost:8030')
  >> page = Fetcher().put('https://httpbin.org/put', data={'key': 'value'})
  >> page = Fetcher().delete('https://httpbin.org/delete')
  ```
  ### StealthyFetcher
- This class is built on top of [Camoufox](https://github.com/daijro/camoufox) which by default bypasses most of the anti-bot protections. Scrapling adds extra layers of flavors and configurations to increase performance and undetectability even further.
+ This class is built on top of [Camoufox](https://github.com/daijro/camoufox), bypassing most anti-bot protections by default. Scrapling adds extra layers of flavors and configurations to increase performance and undetectability even further.
  ```python
  >> page = StealthyFetcher().fetch('https://www.browserscan.net/bot-detection') # Running headless by default
  >> page.status == 200
  True
  ```
- > Note: all requests done by this fetcher is waiting by default for all JS to be fully loaded and executed so you don't have to :)
+ > Note: all requests done by this fetcher are waiting by default for all JS to be fully loaded and executed so you don't have to :)

  <details><summary><strong>For the sake of simplicity, expand this for the complete list of arguments</strong></summary>
 
@@ -263,6 +269,7 @@ True
  | addons | List of Firefox addons to use. **Must be paths to extracted addons.** | ✔️ |
  | humanize | Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window. | ✔️ |
  | allow_webgl | Whether to allow WebGL. To prevent leaks, only use this for special cases. | ✔️ |
+ | disable_ads | Enabled by default, this installs `uBlock Origin` addon on the browser if enabled. | ✔️ |
  | network_idle | Wait for the page until there are no network connections for at least 500 ms. | ✔️ |
  | timeout | The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000. | ✔️ |
  | wait_selector | Wait for a specific css selector to be in a specific state. | ✔️ |
@@ -281,7 +288,7 @@ This class is built on top of [Playwright](https://playwright.dev/python/) which
  >> page.css_first("#search a::attr(href)")
  'https://github.com/D4Vinci/Scrapling'
  ```
- > Note: all requests done by this fetcher is waiting by default for all JS to be fully loaded and executed so you don't have to :)
+ > Note: all requests done by this fetcher are waiting by default for all JS to be fully loaded and executed so you don't have to :)

  Using this Fetcher class, you can make requests with:
  1) Vanilla Playwright without any modifications other than the ones you chose.
@@ -293,7 +300,7 @@ Using this Fetcher class, you can make requests with:
  3) Real browsers by passing the `real_chrome` argument or the CDP URL of your browser to be controlled by the Fetcher and most of the options can be enabled on it.
  4) [NSTBrowser](https://app.nstbrowser.io/r/1vO5e5)'s [docker browserless](https://hub.docker.com/r/nstbrowser/browserless) option by passing the CDP URL and enabling `nstbrowser_mode` option.

- > Hence using the `real_chrome` argument requires that you have chrome browser installed on your device
+ > Hence using the `real_chrome` argument requires that you have Chrome browser installed on your device

  Add that to a lot of controlling/hiding options as you will see in the arguments list below.

@@ -316,7 +323,8 @@ Add that to a lot of controlling/hiding options as you will see in the arguments
  | hide_canvas | Add random noise to canvas operations to prevent fingerprinting. | ✔️ |
  | disable_webgl | Disables WebGL and WebGL 2.0 support entirely. | ✔️ |
  | stealth | Enables stealth mode, always check the documentation to see what stealth mode does currently. | ✔️ |
- | real_chrome | If you have chrome browser installed on your device, enable this and the Fetcher will launch an instance of your browser and use it. | ✔️ |
+ | real_chrome | If you have Chrome browser installed on your device, enable this and the Fetcher will launch an instance of your browser and use it. | ✔️ |
+ | locale | Set the locale for the browser if wanted. The default value is `en-US`. | ✔️ |
  | cdp_url | Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP. | ✔️ |
  | nstbrowser_mode | Enables NSTBrowser mode, **it have to be used with `cdp_url` argument or it will get completely ignored.** | ✔️ |
  | nstbrowser_config | The config you want to send with requests to the NSTBrowser. _If left empty, Scrapling defaults to an optimized NSTBrowser's docker browserless config._ | ✔️ |
@@ -768,8 +776,7 @@ Of course, you can find elements by text/regex, find similar elements in a more
  Yes, Scrapling instances are thread-safe. Each Adaptor instance maintains its state.

  ## More Sponsors!
- [![Capsolver Banner](https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/CapSolver.png)](https://www.capsolver.com/?utm_source=github&utm_medium=repo&utm_campaign=scraping&utm_term=Scrapling)
- <a href="https://serpapi.com/?utm_source=scrapling"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png" height="500" width="500" alt="SerpApi Banner" ></a>
+ <a href="https://serpapi.com/?utm_source=scrapling"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png" height="500" alt="SerpApi Banner" ></a>


  ## Contributing
@@ -1,10 +1,11 @@
  # Declare top-level shortcuts
- from scrapling.fetchers import Fetcher, StealthyFetcher, PlayWrightFetcher, CustomFetcher
+ from scrapling.core.custom_types import AttributesHandler, TextHandler
+ from scrapling.fetchers import (CustomFetcher, Fetcher, PlayWrightFetcher,
+                                 StealthyFetcher)
  from scrapling.parser import Adaptor, Adaptors
- from scrapling.core.custom_types import TextHandler, AttributesHandler

  __author__ = "Karim Shoair (karim.shoair@pm.me)"
- __version__ = "0.2.6"
+ __version__ = "0.2.8"
  __copyright__ = "Copyright (c) 2024 Karim Shoair"


@@ -2,9 +2,8 @@
  Type definitions for type checking purposes.
  """

- from typing import (
-     Dict, Optional, Union, Callable, Any, List, Tuple, Pattern, Generator, Iterable, Type, TYPE_CHECKING, Literal
- )
+ from typing import (TYPE_CHECKING, Any, Callable, Dict, Generator, Iterable,
+                     List, Literal, Optional, Pattern, Tuple, Type, Union)

  try:
      from typing import Protocol
@@ -1,13 +1,13 @@
  import re
- from types import MappingProxyType
  from collections.abc import Mapping
+ from types import MappingProxyType

- from scrapling.core.utils import _is_iterable, flatten
- from scrapling.core._types import Dict, List, Union, Pattern, SupportsIndex
-
- from orjson import loads, dumps
+ from orjson import dumps, loads
  from w3lib.html import replace_entities as _replace_entities

+ from scrapling.core._types import Dict, List, Pattern, SupportsIndex, Union
+ from scrapling.core.utils import _is_iterable, flatten
+

  class TextHandler(str):
      """Extends standard Python string by adding more functionality"""
@@ -10,15 +10,14 @@ So you don't have to learn a new selectors/api method like what bs4 done with so

  import re

- from w3lib.html import HTML5_WHITESPACE
- from scrapling.core.utils import cache
- from scrapling.core._types import Any, Optional, Protocol, Self
-
- from cssselect.xpath import ExpressionError
- from cssselect.xpath import XPathExpr as OriginalXPathExpr
  from cssselect import HTMLTranslator as OriginalHTMLTranslator
  from cssselect.parser import Element, FunctionalPseudoElement, PseudoElement
+ from cssselect.xpath import ExpressionError
+ from cssselect.xpath import XPathExpr as OriginalXPathExpr
+ from w3lib.html import HTML5_WHITESPACE

+ from scrapling.core._types import Any, Optional, Protocol, Self
+ from scrapling.core.utils import cache

  regex = f"[{HTML5_WHITESPACE}]+"
  replace_html5_whitespaces = re.compile(regex).sub
@@ -1,22 +1,25 @@
- import re
  import logging
+ import re
  from itertools import chain
- # Using cache on top of a class is brilliant way to achieve Singleton design pattern without much code
- from functools import lru_cache as cache  # functools.cache is available on Python 3.9+ only so let's keep lru_cache
-
- from scrapling.core._types import Dict, Iterable, Any, Union

  import orjson
  from lxml import html

+ from scrapling.core._types import Any, Dict, Iterable, Union
+
+ # Using cache on top of a class is brilliant way to achieve Singleton design pattern without much code
+ # functools.cache is available on Python 3.9+ only so let's keep lru_cache
+ from functools import lru_cache as cache  # isort:skip
+
+
  html_forbidden = {html.HtmlComment, }
  logging.basicConfig(
-     level=logging.ERROR,
-     format='%(asctime)s - %(levelname)s - %(message)s',
-     handlers=[
-         logging.StreamHandler()
-     ]
- )
+         level=logging.ERROR,
+         format='%(asctime)s - %(levelname)s - %(message)s',
+         handlers=[
+             logging.StreamHandler()
+         ]
+     )


  def is_jsonable(content: Union[bytes, str]) -> bool:
@@ -94,7 +97,7 @@ class _StorageTools:
          parent = element.getparent()
          return tuple(
              (element.tag,) if parent is None else (
-                 cls._get_element_path(parent) + (element.tag,)
+                     cls._get_element_path(parent) + (element.tag,)
              )
          )

@@ -1,4 +1,4 @@
- from .fetchers import Fetcher, StealthyFetcher, PlayWrightFetcher
+ from .fetchers import Fetcher, PlayWrightFetcher, StealthyFetcher

  # If you are going to use Fetchers with the default settings, import them from this file instead for a cleaner looking code
  Fetcher = Fetcher()
@@ -1,19 +1,16 @@
  import logging
- from scrapling.core._types import Union, Callable, Optional, Dict, List, Literal
-
- from scrapling.engines.toolbelt import (
-     Response,
-     do_nothing,
-     StatusText,
-     get_os_name,
-     intercept_route,
-     check_type_validity,
-     construct_proxy_dict,
-     generate_convincing_referer,
- )

+ from camoufox import DefaultAddons
  from camoufox.sync_api import Camoufox

+ from scrapling.core._types import (Callable, Dict, List, Literal, Optional,
+                                    Union)
+ from scrapling.engines.toolbelt import (Response, StatusText,
+                                         check_type_validity,
+                                         construct_proxy_dict, do_nothing,
+                                         generate_convincing_referer,
+                                         get_os_name, intercept_route)
+

  class CamoufoxEngine:
      def __init__(
@@ -21,7 +18,8 @@ class CamoufoxEngine:
              block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = False, network_idle: Optional[bool] = False, humanize: Optional[Union[bool, float]] = True,
              timeout: Optional[float] = 30000, page_action: Callable = do_nothing, wait_selector: Optional[str] = None, addons: Optional[List[str]] = None,
              wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
-             proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, adaptor_arguments: Dict = None
+             proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True,
+             adaptor_arguments: Dict = None,
      ):
          """An engine that utilizes Camoufox library, check the `StealthyFetcher` class for more documentation.

@@ -36,6 +34,7 @@ class CamoufoxEngine:
          :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
          :param allow_webgl: Whether to allow WebGL. To prevent leaks, only use this for special cases.
          :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+         :param disable_ads: Enabled by default, this installs `uBlock Origin` addon on the browser if enabled.
          :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
          :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
          :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
@@ -54,6 +53,7 @@ class CamoufoxEngine:
          self.network_idle = bool(network_idle)
          self.google_search = bool(google_search)
          self.os_randomize = bool(os_randomize)
+         self.disable_ads = bool(disable_ads)
          self.extra_headers = extra_headers or {}
          self.proxy = construct_proxy_dict(proxy)
          self.addons = addons or []
@@ -75,9 +75,11 @@ class CamoufoxEngine:
          :param url: Target url.
          :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
          """
+         addons = [] if self.disable_ads else [DefaultAddons.UBO]
          with Camoufox(
              proxy=self.proxy,
              addons=self.addons,
+             exclude_addons=addons,
              headless=self.headless,
              humanize=self.humanize,
              i_know_what_im_doing=True,  # To turn warnings off with the user configurations
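The seemingly inverted list above follows from Camoufox bundling uBlock Origin as a default addon: `exclude_addons` removes default addons, so an empty exclusion list keeps the ad blocker installed. A standalone sketch of that mechanism, assuming Camoufox's documented sync API:

```python
# Sketch only: how exclude_addons interacts with Camoufox's bundled uBlock Origin.
from camoufox import DefaultAddons
from camoufox.sync_api import Camoufox

disable_ads = True  # Scrapling's default in 0.2.8

# Excluding UBO re-enables ads; excluding nothing keeps the default ad blocker.
exclude = [] if disable_ads else [DefaultAddons.UBO]

with Camoufox(headless=True, exclude_addons=exclude) as browser:
    page = browser.new_page()
    page.goto('https://example.com')
```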
@@ -105,6 +107,11 @@ class CamoufoxEngine:
              if self.wait_selector and type(self.wait_selector) is str:
                  waiter = page.locator(self.wait_selector)
                  waiter.first.wait_for(state=self.wait_selector_state)
+                 # Wait again after waiting for the selector, helpful with protections like Cloudflare
+                 page.wait_for_load_state(state="load")
+                 page.wait_for_load_state(state="domcontentloaded")
+                 if self.network_idle:
+                     page.wait_for_load_state('networkidle')

              # This will be parsed inside `Response`
              encoding = res.headers.get('content-type', '') or 'utf-8'  # default encoding
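Both engines gain this same post-selector wait sequence in 0.2.8. A standalone sketch of it against plain Playwright's sync API; the URL and selector are placeholders:

```python
# Sketch only: re-check load states after the awaited selector appears, since
# anti-bot walls (e.g. Cloudflare) can swap the DOM after the first match.
from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    page = browser.new_page()
    page.goto('https://example.com')

    page.locator('h1').first.wait_for(state='attached')
    page.wait_for_load_state(state='load')
    page.wait_for_load_state(state='domcontentloaded')
    page.wait_for_load_state('networkidle')  # Scrapling only does this when network_idle is set
    browser.close()
```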
@@ -44,7 +44,7 @@ DEFAULT_STEALTH_FLAGS = [
      '--disable-default-apps',
      '--disable-print-preview',
      '--disable-dev-shm-usage',
-     '--disable-popup-blocking',
+     # '--disable-popup-blocking',
      '--metrics-recording-only',
      '--disable-crash-reporter',
      '--disable-partial-raster',
@@ -1,20 +1,15 @@
  import json
  import logging
- from scrapling.core._types import Union, Callable, Optional, List, Dict
-
- from scrapling.engines.constants import DEFAULT_STEALTH_FLAGS, NSTBROWSER_DEFAULT_QUERY
- from scrapling.engines.toolbelt import (
-     Response,
-     do_nothing,
-     StatusText,
-     js_bypass_path,
-     intercept_route,
-     generate_headers,
-     construct_cdp_url,
-     check_type_validity,
-     construct_proxy_dict,
-     generate_convincing_referer,
- )
+
+ from scrapling.core._types import Callable, Dict, List, Optional, Union
+ from scrapling.engines.constants import (DEFAULT_STEALTH_FLAGS,
+                                          NSTBROWSER_DEFAULT_QUERY)
+ from scrapling.engines.toolbelt import (Response, StatusText,
+                                         check_type_validity, construct_cdp_url,
+                                         construct_proxy_dict, do_nothing,
+                                         generate_convincing_referer,
+                                         generate_headers, intercept_route,
+                                         js_bypass_path)


  class PlaywrightEngine:
@@ -26,6 +21,7 @@ class PlaywrightEngine:
              timeout: Optional[float] = 30000,
              page_action: Callable = do_nothing,
              wait_selector: Optional[str] = None,
+             locale: Optional[str] = 'en-US',
              wait_selector_state: Optional[str] = 'attached',
              stealth: Optional[bool] = False,
              real_chrome: Optional[bool] = False,
@@ -50,6 +46,7 @@ class PlaywrightEngine:
          :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
          :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
          :param wait_selector: Wait for a specific css selector to be in a specific state.
+         :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
          :param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`.
          :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
          :param real_chrome: If you have chrome browser installed on your device, enable this and the Fetcher will launch an instance of your browser and use it.
@@ -64,6 +61,7 @@ class PlaywrightEngine:
          :param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
          """
          self.headless = headless
+         self.locale = check_type_validity(locale, [str], 'en-US', param_name='locale')
          self.disable_resources = disable_resources
          self.network_idle = bool(network_idle)
          self.stealth = bool(stealth)
@@ -87,6 +85,14 @@ class PlaywrightEngine:
          self.nstbrowser_mode = bool(nstbrowser_mode)
          self.nstbrowser_config = nstbrowser_config
          self.adaptor_arguments = adaptor_arguments if adaptor_arguments else {}
+         self.harmful_default_args = [
+             # This will be ignored to avoid detection more and possibly avoid the popup crashing bug abuse: https://issues.chromium.org/issues/340836884
+             '--enable-automation',
+             '--disable-popup-blocking',
+             # '--disable-component-update',
+             # '--disable-default-apps',
+             # '--disable-extensions',
+         ]

      def _cdp_url_logic(self, flags: Optional[List] = None) -> str:
          """Constructs new CDP URL if NSTBrowser is enabled otherwise return CDP URL as it is
@@ -151,15 +157,15 @@ class PlaywrightEngine:
          else:
              if self.stealth:
                  browser = p.chromium.launch(
-                     headless=self.headless, args=flags, ignore_default_args=['--enable-automation'], chromium_sandbox=True, channel='chrome' if self.real_chrome else 'chromium'
+                     headless=self.headless, args=flags, ignore_default_args=self.harmful_default_args, chromium_sandbox=True, channel='chrome' if self.real_chrome else 'chromium'
                  )
              else:
-                 browser = p.chromium.launch(headless=self.headless, ignore_default_args=['--enable-automation'], channel='chrome' if self.real_chrome else 'chromium')
+                 browser = p.chromium.launch(headless=self.headless, ignore_default_args=self.harmful_default_args, channel='chrome' if self.real_chrome else 'chromium')

          # Creating the context
          if self.stealth:
              context = browser.new_context(
-                 locale='en-US',
+                 locale=self.locale,
                  is_mobile=False,
                  has_touch=False,
                  proxy=self.proxy,
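What `harmful_default_args` does at launch time: Playwright's `ignore_default_args` strips the listed switches from Chromium's generated command line. A minimal sketch with the list copied from the hunk above:

```python
# Sketch only: launch Chromium without the switches Scrapling now suppresses.
from playwright.sync_api import sync_playwright

# Dropping --enable-automation reduces the detection surface; dropping
# --disable-popup-blocking sidesteps the popup crash abuse linked above.
harmful_default_args = ['--enable-automation', '--disable-popup-blocking']

with sync_playwright() as p:
    browser = p.chromium.launch(
        headless=True,
        ignore_default_args=harmful_default_args,
        channel='chromium',
    )
    browser.close()
```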
@@ -176,6 +182,8 @@ class PlaywrightEngine:
              )
          else:
              context = browser.new_context(
+                 locale=self.locale,
+                 proxy=self.proxy,
                  color_scheme='dark',
                  user_agent=useragent,
                  device_scale_factor=2,
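The non-stealth context now also receives `locale` and `proxy`. A sketch of the equivalent plain-Playwright call; the proxy dict shape follows Playwright's API and the values are placeholders:

```python
# Sketch only: a browser context carrying the newly threaded-through locale and proxy.
from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    context = browser.new_context(
        locale='en-US',
        proxy={'server': 'http://localhost:8030',
               'username': 'username', 'password': 'password'},
        color_scheme='dark',
    )
    page = context.new_page()
    browser.close()
```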
@@ -221,6 +229,11 @@ class PlaywrightEngine:
              if self.wait_selector and type(self.wait_selector) is str:
                  waiter = page.locator(self.wait_selector)
                  waiter.first.wait_for(state=self.wait_selector_state)
+                 # Wait again after waiting for the selector, helpful with protections like Cloudflare
+                 page.wait_for_load_state(state="load")
+                 page.wait_for_load_state(state="domcontentloaded")
+                 if self.network_idle:
+                     page.wait_for_load_state('networkidle')

              # This will be parsed inside `Response`
              encoding = res.headers.get('content-type', '') or 'utf-8'  # default encoding