scrapling 0.1.2__py3-none-any.whl → 0.2.1__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (35) hide show
  1. scrapling/__init__.py +4 -3
  2. scrapling/core/__init__.py +0 -0
  3. scrapling/core/_types.py +25 -0
  4. scrapling/{custom_types.py → core/custom_types.py} +48 -3
  5. scrapling/{mixins.py → core/mixins.py} +22 -7
  6. scrapling/{storage_adaptors.py → core/storage_adaptors.py} +2 -2
  7. scrapling/{translator.py → core/translator.py} +2 -12
  8. scrapling/{utils.py → core/utils.py} +14 -61
  9. scrapling/engines/__init__.py +7 -0
  10. scrapling/engines/camo.py +128 -0
  11. scrapling/engines/constants.py +108 -0
  12. scrapling/engines/pw.py +237 -0
  13. scrapling/engines/static.py +112 -0
  14. scrapling/engines/toolbelt/__init__.py +19 -0
  15. scrapling/engines/toolbelt/custom.py +154 -0
  16. scrapling/engines/toolbelt/fingerprints.py +81 -0
  17. scrapling/engines/toolbelt/navigation.py +108 -0
  18. scrapling/fetchers.py +198 -0
  19. scrapling/parser.py +223 -70
  20. scrapling/py.typed +1 -0
  21. scrapling-0.2.1.dist-info/METADATA +835 -0
  22. scrapling-0.2.1.dist-info/RECORD +33 -0
  23. {scrapling-0.1.2.dist-info → scrapling-0.2.1.dist-info}/WHEEL +1 -1
  24. {scrapling-0.1.2.dist-info → scrapling-0.2.1.dist-info}/top_level.txt +1 -0
  25. tests/__init__.py +1 -0
  26. tests/fetchers/__init__.py +1 -0
  27. tests/fetchers/test_camoufox.py +62 -0
  28. tests/fetchers/test_httpx.py +67 -0
  29. tests/fetchers/test_playwright.py +74 -0
  30. tests/parser/__init__.py +0 -0
  31. tests/parser/test_automatch.py +56 -0
  32. tests/parser/test_general.py +286 -0
  33. scrapling-0.1.2.dist-info/METADATA +0 -477
  34. scrapling-0.1.2.dist-info/RECORD +0 -12
  35. {scrapling-0.1.2.dist-info → scrapling-0.2.1.dist-info}/LICENSE +0 -0
@@ -0,0 +1,33 @@
1
+ scrapling/__init__.py,sha256=x8S2Da-4KgUBzNYdM9ahYw3hDw5875KnpDliQWxQiGo,435
2
+ scrapling/fetchers.py,sha256=_6mL7XSTZE1fHXBqbxE2bBHnlQP1lH-4MCiQHQd5hQs,16017
3
+ scrapling/parser.py,sha256=VGbrARu2hxXyKLbUgtdtht_tljDYPT1jaWZWgoncv5U,53551
4
+ scrapling/py.typed,sha256=frcCV1k9oG9oKj3dpUqdJg1PxRT2RSN_XKdLCPjaYaY,2
5
+ scrapling/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
+ scrapling/core/_types.py,sha256=nD2ZY_fitLohx3MfDmqoKJ9ZShrnRhQ8-d1SU1zEGAY,552
7
+ scrapling/core/custom_types.py,sha256=-gMNOiByewoAUqFVrDp822V51rcWNlWVUOB6yGUL648,8403
8
+ scrapling/core/mixins.py,sha256=sozbpaGL1_O_x3U-ABM5aYWpnxpCLfdbcA9SG3P7weY,3532
9
+ scrapling/core/storage_adaptors.py,sha256=Kbak0BOJX5e9I1PbUS_4sUJi2Wxw8Bv5XsaLHAu1l2Q,6218
10
+ scrapling/core/translator.py,sha256=oU-dQCkNQOccZPrXbPW_VSgC5ll10Bb89C3ezW2lI0o,5228
11
+ scrapling/core/utils.py,sha256=fXdANUgRBbVbOerJ94fRY9vi7n5zsbm8t3G4qQ-F3ak,3792
12
+ scrapling/engines/__init__.py,sha256=zwMqcSdNGh-IX0d4zXazrgAeHrkqIN_v5Ia7RU1g8W0,267
13
+ scrapling/engines/camo.py,sha256=P8kPxP0awgV-AGMibMNDJUaxZC9oYDP64Ei_dk9D3jA,7549
14
+ scrapling/engines/constants.py,sha256=jSDA6lgbvEIB8z2m2SFzCKkvFEZnp28Mondy2__FpkM,3721
15
+ scrapling/engines/pw.py,sha256=JKPdJkfz--8YyngLxFNwEyWF0O3_o5xR7zQCxF1D8Cs,12121
16
+ scrapling/engines/static.py,sha256=dY1iLBe7YhzRJYd9MM8P7hbqF44cpwOgTJ6CkIVfaRA,7120
17
+ scrapling/engines/toolbelt/__init__.py,sha256=BnBp34aDeohYgqdysEAAWnGZgA02YlExkc5FJLetMSo,367
18
+ scrapling/engines/toolbelt/custom.py,sha256=XB_oINjmVnigODxfP9hl-teRy0BkJqfrEprWDAqO-Jo,7473
19
+ scrapling/engines/toolbelt/fingerprints.py,sha256=kkVtZKSt2ukc0CV0g6QUvSWR0Yx5p8Mv8xiqACAsMBo,2917
20
+ scrapling/engines/toolbelt/navigation.py,sha256=Tde5_6Wv7lOeWXMzs9D6TRaxAbJ3b-zIX6-4HggZbCQ,4017
21
+ tests/__init__.py,sha256=YHFB5ftzgLQVh6gbPfbYcY4yOS9DOBp5dBa6I-qtm8U,32
22
+ tests/fetchers/__init__.py,sha256=6H4NgARhyTcGGd3dNCKQJ8kUFdrAEMSScQL7Ga_vU3c,43
23
+ tests/fetchers/test_camoufox.py,sha256=XPTCDZ9sj_GpCzXyvzKF_uZWhEYX6J_jh_BLeMEl8yY,2874
24
+ tests/fetchers/test_httpx.py,sha256=UivOItR3-l-bXp9E6TP5Tvn2OrCdgiVkWsti-f9xdpU,3507
25
+ tests/fetchers/test_playwright.py,sha256=YOWn89urd9NwoCHfTFj8fY4xYrRY2BeszTt5Q-TxUcs,3479
26
+ tests/parser/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
27
+ tests/parser/test_automatch.py,sha256=BeeYJi3cYCghbiZmi57z4bqcGPaoUA8GAm7MALBBkkk,2486
28
+ tests/parser/test_general.py,sha256=NfTuGLgAm-LH0dVV0pvbRcYSNI-wSu05rdnuRzmB0m4,11664
29
+ scrapling-0.2.1.dist-info/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
30
+ scrapling-0.2.1.dist-info/METADATA,sha256=aeExP8jl7VQxIUnfvvo4QxIeasqfziscacOrOoHOuXk,64155
31
+ scrapling-0.2.1.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
32
+ scrapling-0.2.1.dist-info/top_level.txt,sha256=ub7FkOEXeYmmYTUxd4pCrwXfBfAMIpZ1sCGmXCc14tI,16
33
+ scrapling-0.2.1.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (75.2.0)
2
+ Generator: setuptools (75.5.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
tests/__init__.py ADDED
@@ -0,0 +1 @@
1
+ """Package for test project."""
@@ -0,0 +1 @@
1
+ # Because I'm too lazy to mock requests :)
@@ -0,0 +1,62 @@
1
+ import unittest
2
+ import pytest_httpbin
3
+
4
+ from scrapling import StealthyFetcher
5
+
6
+
7
+ @pytest_httpbin.use_class_based_httpbin
8
+ # @pytest_httpbin.use_class_based_httpbin_secure
9
+ class TestStealthyFetcher(unittest.TestCase):
10
+ def setUp(self):
11
+ self.fetcher = StealthyFetcher(auto_match=False)
12
+ url = self.httpbin.url
13
+ self.status_200 = f'{url}/status/200'
14
+ self.status_404 = f'{url}/status/404'
15
+ self.status_501 = f'{url}/status/501'
16
+ self.basic_url = f'{url}/get'
17
+ self.html_url = f'{url}/html'
18
+ self.delayed_url = f'{url}/delay/10' # 10 Seconds delay response
19
+ self.cookies_url = f"{url}/cookies/set/test/value"
20
+
21
+ def test_basic_fetch(self):
22
+ """Test doing basic fetch request with multiple statuses"""
23
+ self.assertEqual(self.fetcher.fetch(self.status_200).status, 200)
24
+ self.assertEqual(self.fetcher.fetch(self.status_404).status, 404)
25
+ self.assertEqual(self.fetcher.fetch(self.status_501).status, 501)
26
+
27
+ def test_networkidle(self):
28
+ """Test that waiting for `networkidle` does not prevent the page from finishing loading"""
29
+ self.assertEqual(self.fetcher.fetch(self.basic_url, network_idle=True).status, 200)
30
+
31
+ def test_blocking_resources(self):
32
+ """Test that blocking resources does not prevent the page from finishing loading"""
33
+ self.assertEqual(self.fetcher.fetch(self.basic_url, block_images=True).status, 200)
34
+ self.assertEqual(self.fetcher.fetch(self.basic_url, disable_resources=True).status, 200)
35
+
36
+ def test_waiting_selector(self):
37
+ """Test that waiting for a selector does not prevent the page from finishing loading"""
38
+ self.assertEqual(self.fetcher.fetch(self.html_url, wait_selector='h1').status, 200)
39
+
40
+ def test_cookies_loading(self):
41
+ """Test if cookies are set after the request"""
42
+ self.assertEqual(self.fetcher.fetch(self.cookies_url).cookies, {'test': 'value'})
43
+
44
+ def test_automation(self):
45
+ """Test whether page automation breaks the code or not"""
46
+ def scroll_page(page):
47
+ page.mouse.wheel(10, 0)
48
+ page.mouse.move(100, 400)
49
+ page.mouse.up()
50
+ return page
51
+
52
+ self.assertEqual(self.fetcher.fetch(self.html_url, page_action=scroll_page).status, 200)
53
+
54
+ def test_properties(self):
55
+ """Test whether different arguments break the code or not"""
56
+ self.assertEqual(self.fetcher.fetch(self.html_url, block_webrtc=True, allow_webgl=True).status, 200)
57
+ self.assertEqual(self.fetcher.fetch(self.html_url, block_webrtc=False, allow_webgl=True).status, 200)
58
+ self.assertEqual(self.fetcher.fetch(self.html_url, block_webrtc=True, allow_webgl=False).status, 200)
59
+
60
+ def test_infinite_timeout(self):
61
+ """Test if infinite timeout breaks the code or not"""
62
+ self.assertEqual(self.fetcher.fetch(self.delayed_url, timeout=None).status, 200)
@@ -0,0 +1,67 @@
1
+ import unittest
2
+ import pytest_httpbin
3
+
4
+ from scrapling import Fetcher
5
+
6
+
7
+ @pytest_httpbin.use_class_based_httpbin
8
+ class TestFetcher(unittest.TestCase):
9
+ def setUp(self):
10
+ self.fetcher = Fetcher(auto_match=False)
11
+ url = self.httpbin.url
12
+ self.status_200 = f'{url}/status/200'
13
+ self.status_404 = f'{url}/status/404'
14
+ self.status_501 = f'{url}/status/501'
15
+ self.basic_url = f'{url}/get'
16
+ self.post_url = f'{url}/post'
17
+ self.put_url = f'{url}/put'
18
+ self.delete_url = f'{url}/delete'
19
+ self.html_url = f'{url}/html'
20
+
21
+ def test_basic_get(self):
22
+ """Test doing basic get request with multiple statuses"""
23
+ self.assertEqual(self.fetcher.get(self.status_200).status, 200)
24
+ self.assertEqual(self.fetcher.get(self.status_404).status, 404)
25
+ self.assertEqual(self.fetcher.get(self.status_501).status, 501)
26
+
27
+ def test_get_properties(self):
28
+ """Test whether different arguments with a GET request break the code or not"""
29
+ self.assertEqual(self.fetcher.get(self.status_200, stealthy_headers=True).status, 200)
30
+ self.assertEqual(self.fetcher.get(self.status_200, follow_redirects=True).status, 200)
31
+ self.assertEqual(self.fetcher.get(self.status_200, timeout=None).status, 200)
32
+ self.assertEqual(
33
+ self.fetcher.get(self.status_200, stealthy_headers=True, follow_redirects=True, timeout=None).status,
34
+ 200
35
+ )
36
+
37
+ def test_post_properties(self):
38
+ """Test whether different arguments with a POST request break the code or not"""
39
+ self.assertEqual(self.fetcher.post(self.post_url, data={'key': 'value'}).status, 200)
40
+ self.assertEqual(self.fetcher.post(self.post_url, data={'key': 'value'}, stealthy_headers=True).status, 200)
41
+ self.assertEqual(self.fetcher.post(self.post_url, data={'key': 'value'}, follow_redirects=True).status, 200)
42
+ self.assertEqual(self.fetcher.post(self.post_url, data={'key': 'value'}, timeout=None).status, 200)
43
+ self.assertEqual(
44
+ self.fetcher.post(self.post_url, data={'key': 'value'}, stealthy_headers=True, follow_redirects=True, timeout=None).status,
45
+ 200
46
+ )
47
+
48
+ def test_put_properties(self):
49
+ """Test whether different arguments with a PUT request break the code or not"""
50
+ self.assertEqual(self.fetcher.put(self.put_url, data={'key': 'value'}).status, 200)
51
+ self.assertEqual(self.fetcher.put(self.put_url, data={'key': 'value'}, stealthy_headers=True).status, 200)
52
+ self.assertEqual(self.fetcher.put(self.put_url, data={'key': 'value'}, follow_redirects=True).status, 200)
53
+ self.assertEqual(self.fetcher.put(self.put_url, data={'key': 'value'}, timeout=None).status, 200)
54
+ self.assertEqual(
55
+ self.fetcher.put(self.put_url, data={'key': 'value'}, stealthy_headers=True, follow_redirects=True, timeout=None).status,
56
+ 200
57
+ )
58
+
59
+ def test_delete_properties(self):
60
+ """Test whether different arguments with a DELETE request break the code or not"""
61
+ self.assertEqual(self.fetcher.delete(self.delete_url, stealthy_headers=True).status, 200)
62
+ self.assertEqual(self.fetcher.delete(self.delete_url, follow_redirects=True).status, 200)
63
+ self.assertEqual(self.fetcher.delete(self.delete_url, timeout=None).status, 200)
64
+ self.assertEqual(
65
+ self.fetcher.delete(self.delete_url, stealthy_headers=True, follow_redirects=True, timeout=None).status,
66
+ 200
67
+ )
@@ -0,0 +1,74 @@
1
+ import unittest
2
+ import pytest_httpbin
3
+
4
+ from scrapling import PlayWrightFetcher
5
+
6
+
7
+ @pytest_httpbin.use_class_based_httpbin
8
+ # @pytest_httpbin.use_class_based_httpbin_secure
9
+ class TestPlayWrightFetcher(unittest.TestCase):
10
+ def setUp(self):
11
+ self.fetcher = PlayWrightFetcher(auto_match=False)
12
+ url = self.httpbin.url
13
+ self.status_200 = f'{url}/status/200'
14
+ self.status_404 = f'{url}/status/404'
15
+ self.status_501 = f'{url}/status/501'
16
+ self.basic_url = f'{url}/get'
17
+ self.html_url = f'{url}/html'
18
+ self.delayed_url = f'{url}/delay/10' # 10 Seconds delay response
19
+ self.cookies_url = f"{url}/cookies/set/test/value"
20
+
21
+ def test_basic_fetch(self):
22
+ """Test doing basic fetch request with multiple statuses"""
23
+ self.assertEqual(self.fetcher.fetch(self.status_200).status, 200)
24
+ self.assertEqual(self.fetcher.fetch(self.status_404).status, 404)
25
+ self.assertEqual(self.fetcher.fetch(self.status_501).status, 501)
26
+
27
+ def test_networkidle(self):
28
+ """Test that waiting for `networkidle` does not prevent the page from finishing loading"""
29
+ self.assertEqual(self.fetcher.fetch(self.basic_url, network_idle=True).status, 200)
30
+
31
+ def test_blocking_resources(self):
32
+ """Test that blocking resources does not prevent the page from finishing loading"""
33
+ self.assertEqual(self.fetcher.fetch(self.basic_url, disable_resources=True).status, 200)
34
+
35
+ def test_waiting_selector(self):
36
+ """Test that waiting for a selector does not prevent the page from finishing loading"""
37
+ self.assertEqual(self.fetcher.fetch(self.html_url, wait_selector='h1').status, 200)
38
+
39
+ def test_cookies_loading(self):
40
+ """Test if cookies are set after the request"""
41
+ self.assertEqual(self.fetcher.fetch(self.cookies_url).cookies, {'test': 'value'})
42
+
43
+ def test_automation(self):
44
+ """Test whether page automation breaks the code or not"""
45
+ def scroll_page(page):
46
+ page.mouse.wheel(10, 0)
47
+ page.mouse.move(100, 400)
48
+ page.mouse.up()
49
+ return page
50
+
51
+ self.assertEqual(self.fetcher.fetch(self.html_url, page_action=scroll_page).status, 200)
52
+
53
+ def test_properties(self):
54
+ """Test whether different arguments break the code or not"""
55
+ self.assertEqual(self.fetcher.fetch(self.html_url, disable_webgl=True, hide_canvas=False).status, 200)
56
+ self.assertEqual(self.fetcher.fetch(self.html_url, disable_webgl=False, hide_canvas=True).status, 200)
57
+ self.assertEqual(self.fetcher.fetch(self.html_url, stealth=True).status, 200)
58
+ self.assertEqual(self.fetcher.fetch(self.html_url, useragent='Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:131.0) Gecko/20100101 Firefox/131.0').status, 200)
59
+
60
+ def test_cdp_url(self):
61
+ """Test if it's going to try to connect to cdp url or not"""
62
+ with self.assertRaises(ValueError):
63
+ _ = self.fetcher.fetch(self.html_url, cdp_url='blahblah')
64
+
65
+ with self.assertRaises(ValueError):
66
+ _ = self.fetcher.fetch(self.html_url, cdp_url='blahblah', nstbrowser_mode=True)
67
+
68
+ with self.assertRaises(Exception):
69
+ # There's no type for this error in PlayWright, it's just `Error`
70
+ _ = self.fetcher.fetch(self.html_url, cdp_url='ws://blahblah')
71
+
72
+ def test_infinite_timeout(self):
73
+ """Test if infinite timeout breaks the code or not"""
74
+ self.assertEqual(self.fetcher.fetch(self.delayed_url, timeout=None).status, 200)
File without changes
@@ -0,0 +1,56 @@
1
+ import unittest
2
+
3
+ from scrapling import Adaptor
4
+
5
+
6
+ class TestParserAutoMatch(unittest.TestCase):
7
+
8
+ def test_element_relocation(self):
9
+ """Test relocating element after structure change"""
10
+ original_html = '''
11
+ <div class="container">
12
+ <section class="products">
13
+ <article class="product" id="p1">
14
+ <h3>Product 1</h3>
15
+ <p class="description">Description 1</p>
16
+ </article>
17
+ <article class="product" id="p2">
18
+ <h3>Product 2</h3>
19
+ <p class="description">Description 2</p>
20
+ </article>
21
+ </section>
22
+ </div>
23
+ '''
24
+ changed_html = '''
25
+ <div class="new-container">
26
+ <div class="product-wrapper">
27
+ <section class="products">
28
+ <article class="product new-class" data-id="p1">
29
+ <div class="product-info">
30
+ <h3>Product 1</h3>
31
+ <p class="new-description">Description 1</p>
32
+ </div>
33
+ </article>
34
+ <article class="product new-class" data-id="p2">
35
+ <div class="product-info">
36
+ <h3>Product 2</h3>
37
+ <p class="new-description">Description 2</p>
38
+ </div>
39
+ </article>
40
+ </section>
41
+ </div>
42
+ </div>
43
+ '''
44
+
45
+ old_page = Adaptor(original_html, url='example.com', auto_match=True, debug=True)
46
+ new_page = Adaptor(changed_html, url='example.com', auto_match=True, debug=True)
47
+
48
+ # 'p1' was used as an ID and now it's not, and all the path elements have changed
49
+ # Also at the same time testing auto-match vs combined selectors
50
+ _ = old_page.css('#p1, #p2', auto_save=True)[0]
51
+ relocated = new_page.css('#p1', auto_match=True)
52
+
53
+ self.assertIsNotNone(relocated)
54
+ self.assertEqual(relocated[0].attrib['data-id'], 'p1')
55
+ self.assertTrue(relocated[0].has_class('new-class'))
56
+ self.assertEqual(relocated[0].css('.new-description')[0].text, 'Description 1')
@@ -0,0 +1,286 @@
1
+
2
+ import pickle
3
+ import unittest
4
+ from scrapling import Adaptor
5
+ from cssselect import SelectorError, SelectorSyntaxError
6
+
7
+
8
+ class TestParser(unittest.TestCase):
9
+ def setUp(self):
10
+ self.html = '''
11
+ <html>
12
+ <head>
13
+ <title>Complex Web Page</title>
14
+ <style>
15
+ .hidden { display: none; }
16
+ </style>
17
+ </head>
18
+ <body>
19
+ <header>
20
+ <nav>
21
+ <ul>
22
+ <li><a href="#home">Home</a></li>
23
+ <li><a href="#about">About</a></li>
24
+ <li><a href="#contact">Contact</a></li>
25
+ </ul>
26
+ </nav>
27
+ </header>
28
+ <main>
29
+ <section id="products" schema='{"jsonable": "data"}'>
30
+ <h2>Products</h2>
31
+ <div class="product-list">
32
+ <article class="product" data-id="1">
33
+ <h3>Product 1</h3>
34
+ <p class="description">This is product 1</p>
35
+ <span class="price">$10.99</span>
36
+ <div class="hidden stock">In stock: 5</div>
37
+ </article>
38
+ <article class="product" data-id="2">
39
+ <h3>Product 2</h3>
40
+ <p class="description">This is product 2</p>
41
+ <span class="price">$20.99</span>
42
+ <div class="hidden stock">In stock: 3</div>
43
+ </article>
44
+ <article class="product" data-id="3">
45
+ <h3>Product 3</h3>
46
+ <p class="description">This is product 3</p>
47
+ <span class="price">$15.99</span>
48
+ <div class="hidden stock">Out of stock</div>
49
+ </article>
50
+ </div>
51
+ </section>
52
+ <section id="reviews">
53
+ <h2>Customer Reviews</h2>
54
+ <div class="review-list">
55
+ <div class="review" data-rating="5">
56
+ <p class="review-text">Great product!</p>
57
+ <span class="reviewer">John Doe</span>
58
+ </div>
59
+ <div class="review" data-rating="4">
60
+ <p class="review-text">Good value for money.</p>
61
+ <span class="reviewer">Jane Smith</span>
62
+ </div>
63
+ </div>
64
+ </section>
65
+ </main>
66
+ <footer>
67
+ <p>&copy; 2024 Our Company</p>
68
+ </footer>
69
+ <script id="page-data" type="application/json">
70
+ {"lastUpdated": "2024-09-22T10:30:00Z", "totalProducts": 3}
71
+ </script>
72
+ </body>
73
+ </html>
74
+ '''
75
+ self.page = Adaptor(self.html, auto_match=False, debug=False)
76
+
77
+ def test_css_selector(self):
78
+ """Test Selecting elements with complex CSS selectors"""
79
+ elements = self.page.css('main #products .product-list article.product')
80
+ self.assertEqual(len(elements), 3)
81
+
82
+ in_stock_products = self.page.css(
83
+ 'main #products .product-list article.product:not(:contains("Out of stock"))')
84
+ self.assertEqual(len(in_stock_products), 2)
85
+
86
+ def test_xpath_selector(self):
87
+ """Test Selecting elements with Complex XPath selectors"""
88
+ reviews = self.page.xpath(
89
+ '//section[@id="reviews"]//div[contains(@class, "review") and @data-rating >= 4]'
90
+ )
91
+ self.assertEqual(len(reviews), 2)
92
+
93
+ high_priced_products = self.page.xpath(
94
+ '//article[contains(@class, "product")]'
95
+ '[number(translate(substring-after(.//span[@class="price"], "$"), ",", "")) > 15]'
96
+ )
97
+ self.assertEqual(len(high_priced_products), 2)
98
+
99
+ def test_find_by_text(self):
100
+ """Test Selecting elements with Text matching"""
101
+ stock_info = self.page.find_by_regex(r'In stock: \d+', first_match=False)
102
+ self.assertEqual(len(stock_info), 2)
103
+
104
+ stock_info = self.page.find_by_regex(r'In stock: \d+', first_match=True, case_sensitive=True)
105
+ self.assertEqual(stock_info.text, 'In stock: 5')
106
+
107
+ stock_info = self.page.find_by_text(r'In stock:', partial=True, first_match=False)
108
+ self.assertEqual(len(stock_info), 2)
109
+
110
+ out_of_stock = self.page.find_by_text('Out of stock', partial=False, first_match=False)
111
+ self.assertEqual(len(out_of_stock), 1)
112
+
113
+ def test_find_similar_elements(self):
114
+ """Test Finding similar elements of an element"""
115
+ first_product = self.page.css_first('.product')
116
+ similar_products = first_product.find_similar()
117
+ self.assertEqual(len(similar_products), 2)
118
+
119
+ first_review = self.page.find('div', class_='review')
120
+ similar_high_rated_reviews = [
121
+ review
122
+ for review in first_review.find_similar()
123
+ if int(review.attrib.get('data-rating', 0)) >= 4
124
+ ]
125
+ self.assertEqual(len(similar_high_rated_reviews), 1)
126
+
127
+ def test_expected_errors(self):
128
+ """Test that the expected errors are raised for invalid arguments"""
129
+ with self.assertRaises(ValueError):
130
+ _ = Adaptor(auto_match=False)
131
+
132
+ with self.assertRaises(TypeError):
133
+ _ = Adaptor(root="ayo", auto_match=False)
134
+
135
+ with self.assertRaises(TypeError):
136
+ _ = Adaptor(text=1, auto_match=False)
137
+
138
+ with self.assertRaises(TypeError):
139
+ _ = Adaptor(body=1, auto_match=False)
140
+
141
+ with self.assertRaises(ValueError):
142
+ _ = Adaptor(self.html, storage=object, auto_match=True)
143
+
144
+ def test_pickleable(self):
145
+ """Test that objects aren't pickleable"""
146
+ table = self.page.css('.product-list')[0]
147
+ with self.assertRaises(TypeError): # Adaptors
148
+ pickle.dumps(table)
149
+
150
+ with self.assertRaises(TypeError): # Adaptor
151
+ pickle.dumps(table[0])
152
+
153
+ def test_overridden(self):
154
+ """Test overridden functions"""
155
+ table = self.page.css('.product-list')[0]
156
+ self.assertTrue(issubclass(type(table.__str__()), str))
157
+ self.assertTrue(issubclass(type(table.__repr__()), str))
158
+ self.assertTrue(issubclass(type(table.attrib.__str__()), str))
159
+ self.assertTrue(issubclass(type(table.attrib.__repr__()), str))
160
+
161
+ def test_bad_selector(self):
162
+ """Test object can handle bad selector"""
163
+ with self.assertRaises((SelectorError, SelectorSyntaxError,)):
164
+ self.page.css('4 ayo')
165
+
166
+ with self.assertRaises((SelectorError, SelectorSyntaxError,)):
167
+ self.page.xpath('4 ayo')
168
+
169
+ def test_selectors_generation(self):
170
+ """Try to create selectors for all elements in the page"""
171
+ def _traverse(element: Adaptor):
172
+ self.assertTrue(type(element.generate_css_selector) is str)
173
+ self.assertTrue(type(element.generate_xpath_selector) is str)
174
+ for branch in element.children:
175
+ _traverse(branch)
176
+
177
+ _traverse(self.page)
178
+
179
+ def test_getting_all_text(self):
180
+ """Test getting all text"""
181
+ self.assertNotEqual(self.page.get_all_text(), '')
182
+
183
+ def test_element_navigation(self):
184
+ """Test moving in the page from selected element"""
185
+ table = self.page.css('.product-list')[0]
186
+
187
+ self.assertIsNot(table.path, [])
188
+ self.assertNotEqual(table.html_content, '')
189
+ self.assertNotEqual(table.prettify(), '')
190
+
191
+ parent = table.parent
192
+ self.assertEqual(parent.attrib['id'], 'products')
193
+
194
+ children = table.children
195
+ self.assertEqual(len(children), 3)
196
+
197
+ parent_siblings = parent.siblings
198
+ self.assertEqual(len(parent_siblings), 1)
199
+
200
+ child = table.find({'data-id': "1"})
201
+ next_element = child.next
202
+ self.assertEqual(next_element.attrib['data-id'], '2')
203
+
204
+ prev_element = next_element.previous
205
+ self.assertEqual(prev_element.tag, child.tag)
206
+
207
+ all_prices = self.page.css('.price')
208
+ products_with_prices = [
209
+ price.find_ancestor(lambda p: p.has_class('product'))
210
+ for price in all_prices
211
+ ]
212
+ self.assertEqual(len(products_with_prices), 3)
213
+
214
+ def test_empty_return(self):
215
+ """Test cases where functions shouldn't have results"""
216
+ test_html = """
217
+ <html>
218
+ <span id="a"><a></a><!--comment--></span>
219
+ <span id="b"><!--comment--><a></a></span>
220
+ </html>"""
221
+ soup = Adaptor(test_html, auto_match=False, keep_comments=False)
222
+ html_tag = soup.css('html')[0]
223
+ self.assertEqual(html_tag.path, [])
224
+ self.assertEqual(html_tag.siblings, [])
225
+ self.assertEqual(html_tag.parent, None)
226
+ self.assertEqual(html_tag.find_ancestor(lambda e: e), None)
227
+
228
+ self.assertEqual(soup.css('#a a')[0].next, None)
229
+ self.assertEqual(soup.css('#b a')[0].previous, None)
230
+
231
+ def test_text_to_json(self):
232
+ """Test converting text to json"""
233
+ script_content = self.page.css('#page-data::text')[0]
234
+ self.assertTrue(issubclass(type(script_content.sort()), str))
235
+ page_data = script_content.json()
236
+ self.assertEqual(page_data['totalProducts'], 3)
237
+ self.assertTrue('lastUpdated' in page_data)
238
+
239
+ def test_regex_on_text(self):
240
+ """Test doing regex on a selected text"""
241
+ element = self.page.css('[data-id="1"] .price')[0]
242
+ match = element.re_first(r'[\.\d]+')
243
+ self.assertEqual(match, '10.99')
244
+ match = element.text.re(r'(\d+)', replace_entities=False)
245
+ self.assertEqual(len(match), 2)
246
+
247
+ def test_attribute_operations(self):
248
+ """Test operations on elements attributes"""
249
+ products = self.page.css('.product')
250
+ product_ids = [product.attrib['data-id'] for product in products]
251
+ self.assertEqual(product_ids, ['1', '2', '3'])
252
+ self.assertTrue('data-id' in products[0].attrib)
253
+
254
+ reviews = self.page.css('.review')
255
+ review_ratings = [int(review.attrib['data-rating']) for review in reviews]
256
+ self.assertEqual(sum(review_ratings) / len(review_ratings), 4.5)
257
+
258
+ key_value = list(products[0].attrib.search_values('1', partial=False))
259
+ self.assertEqual(list(key_value[0].keys()), ['data-id'])
260
+
261
+ key_value = list(products[0].attrib.search_values('1', partial=True))
262
+ self.assertEqual(list(key_value[0].keys()), ['data-id'])
263
+
264
+ attr_json = self.page.css_first('#products').attrib['schema'].json()
265
+ self.assertEqual(attr_json, {'jsonable': 'data'})
266
+ self.assertEqual(type(self.page.css('#products')[0].attrib.json_string), bytes)
267
+
268
+ def test_performance(self):
269
+ """Test parsing and selecting speed"""
270
+ import time
271
+ large_html = '<html><body>' + '<div class="item">' * 5000 + '</div>' * 5000 + '</body></html>'
272
+
273
+ start_time = time.time()
274
+ parsed = Adaptor(large_html, auto_match=False, debug=False)
275
+ elements = parsed.css('.item')
276
+ end_time = time.time()
277
+
278
+ self.assertEqual(len(elements), 5000)
279
+ # Converting 5000 elements to a class and doing operations on them will take time
280
+ # Based on my tests with 100 runs, 1 loop each Scrapling (given the extra work/features) takes 10.4ms on average
281
+ self.assertLess(end_time - start_time, 0.1)
282
+
283
+
284
+ # Use `coverage run -m unittest --verbose tests/parser/test_general.py` instead for the coverage report
285
+ # if __name__ == '__main__':
286
+ # unittest.main(verbosity=2)