scrapling 0.2.7__py3-none-any.whl → 0.2.9__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (38)
  1. scrapling/__init__.py +5 -4
  2. scrapling/core/_types.py +2 -3
  3. scrapling/core/custom_types.py +93 -11
  4. scrapling/core/storage_adaptors.py +9 -10
  5. scrapling/core/translator.py +6 -7
  6. scrapling/core/utils.py +35 -30
  7. scrapling/defaults.py +2 -1
  8. scrapling/engines/__init__.py +2 -2
  9. scrapling/engines/camo.py +96 -26
  10. scrapling/engines/constants.py +4 -4
  11. scrapling/engines/pw.py +166 -96
  12. scrapling/engines/static.py +94 -50
  13. scrapling/engines/toolbelt/__init__.py +6 -20
  14. scrapling/engines/toolbelt/custom.py +22 -23
  15. scrapling/engines/toolbelt/fingerprints.py +7 -7
  16. scrapling/engines/toolbelt/navigation.py +25 -12
  17. scrapling/fetchers.py +233 -17
  18. scrapling/parser.py +63 -28
  19. {scrapling-0.2.7.dist-info → scrapling-0.2.9.dist-info}/METADATA +41 -25
  20. scrapling-0.2.9.dist-info/RECORD +47 -0
  21. tests/fetchers/async/__init__.py +0 -0
  22. tests/fetchers/async/test_camoufox.py +95 -0
  23. tests/fetchers/async/test_httpx.py +83 -0
  24. tests/fetchers/async/test_playwright.py +99 -0
  25. tests/fetchers/sync/__init__.py +0 -0
  26. tests/fetchers/sync/test_camoufox.py +68 -0
  27. tests/fetchers/sync/test_httpx.py +82 -0
  28. tests/fetchers/sync/test_playwright.py +87 -0
  29. tests/fetchers/test_utils.py +90 -122
  30. tests/parser/test_automatch.py +64 -9
  31. tests/parser/test_general.py +263 -219
  32. scrapling-0.2.7.dist-info/RECORD +0 -42
  33. tests/fetchers/test_camoufox.py +0 -64
  34. tests/fetchers/test_httpx.py +0 -67
  35. tests/fetchers/test_playwright.py +0 -76
  36. {scrapling-0.2.7.dist-info → scrapling-0.2.9.dist-info}/LICENSE +0 -0
  37. {scrapling-0.2.7.dist-info → scrapling-0.2.9.dist-info}/WHEEL +0 -0
  38. {scrapling-0.2.7.dist-info → scrapling-0.2.9.dist-info}/top_level.txt +0 -0
@@ -1,10 +1,11 @@
1
- import unittest
1
+ import asyncio
2
2
 
3
- from scrapling import Adaptor
3
+ import pytest
4
4
 
5
+ from scrapling import Adaptor
5
6
 
6
- class TestParserAutoMatch(unittest.TestCase):
7
7
 
8
+ class TestParserAutoMatch:
8
9
  def test_element_relocation(self):
9
10
  """Test relocating element after structure change"""
10
11
  original_html = '''
@@ -42,15 +43,69 @@ class TestParserAutoMatch(unittest.TestCase):
42
43
  </div>
43
44
  '''
44
45
 
45
- old_page = Adaptor(original_html, url='example.com', auto_match=True, debug=True)
46
- new_page = Adaptor(changed_html, url='example.com', auto_match=True, debug=True)
46
+ old_page = Adaptor(original_html, url='example.com', auto_match=True)
47
+ new_page = Adaptor(changed_html, url='example.com', auto_match=True)
48
+
49
+ # 'p1' was used as ID and now it's not and all the path elements have changes
50
+ # Also at the same time testing auto-match vs combined selectors
51
+ _ = old_page.css('#p1, #p2', auto_save=True)[0]
52
+ relocated = new_page.css('#p1', auto_match=True)
53
+
54
+ assert relocated is not None
55
+ assert relocated[0].attrib['data-id'] == 'p1'
56
+ assert relocated[0].has_class('new-class')
57
+ assert relocated[0].css('.new-description')[0].text == 'Description 1'
58
+
59
+ @pytest.mark.asyncio
60
+ async def test_element_relocation_async(self):
61
+ """Test relocating element after structure change in async mode"""
62
+ original_html = '''
63
+ <div class="container">
64
+ <section class="products">
65
+ <article class="product" id="p1">
66
+ <h3>Product 1</h3>
67
+ <p class="description">Description 1</p>
68
+ </article>
69
+ <article class="product" id="p2">
70
+ <h3>Product 2</h3>
71
+ <p class="description">Description 2</p>
72
+ </article>
73
+ </section>
74
+ </div>
75
+ '''
76
+ changed_html = '''
77
+ <div class="new-container">
78
+ <div class="product-wrapper">
79
+ <section class="products">
80
+ <article class="product new-class" data-id="p1">
81
+ <div class="product-info">
82
+ <h3>Product 1</h3>
83
+ <p class="new-description">Description 1</p>
84
+ </div>
85
+ </article>
86
+ <article class="product new-class" data-id="p2">
87
+ <div class="product-info">
88
+ <h3>Product 2</h3>
89
+ <p class="new-description">Description 2</p>
90
+ </div>
91
+ </article>
92
+ </section>
93
+ </div>
94
+ </div>
95
+ '''
96
+
97
+ # Simulate async operation
98
+ await asyncio.sleep(0.1) # Minimal async operation
99
+
100
+ old_page = Adaptor(original_html, url='example.com', auto_match=True)
101
+ new_page = Adaptor(changed_html, url='example.com', auto_match=True)
47
102
 
48
103
  # 'p1' was used as ID and now it's not and all the path elements have changes
49
104
  # Also at the same time testing auto-match vs combined selectors
50
105
  _ = old_page.css('#p1, #p2', auto_save=True)[0]
51
106
  relocated = new_page.css('#p1', auto_match=True)
52
107
 
53
- self.assertIsNotNone(relocated)
54
- self.assertEqual(relocated[0].attrib['data-id'], 'p1')
55
- self.assertTrue(relocated[0].has_class('new-class'))
56
- self.assertEqual(relocated[0].css('.new-description')[0].text, 'Description 1')
108
+ assert relocated is not None
109
+ assert relocated[0].attrib['data-id'] == 'p1'
110
+ assert relocated[0].has_class('new-class')
111
+ assert relocated[0].css('.new-description')[0].text == 'Description 1'
@@ -1,286 +1,330 @@
1
-
2
1
  import pickle
3
- import unittest
4
- from scrapling import Adaptor
2
+ import time
3
+
4
+ import pytest
5
5
  from cssselect import SelectorError, SelectorSyntaxError
6
6
 
7
+ from scrapling import Adaptor
8
+
7
9
 
8
- class TestParser(unittest.TestCase):
9
- def setUp(self):
10
- self.html = '''
11
- <html>
12
- <head>
13
- <title>Complex Web Page</title>
14
- <style>
15
- .hidden { display: none; }
16
- </style>
17
- </head>
18
- <body>
19
- <header>
20
- <nav>
21
- <ul>
22
- <li><a href="#home">Home</a></li>
23
- <li><a href="#about">About</a></li>
24
- <li><a href="#contact">Contact</a></li>
25
- </ul>
26
- </nav>
27
- </header>
28
- <main>
29
- <section id="products" schema='{"jsonable": "data"}'>
30
- <h2>Products</h2>
31
- <div class="product-list">
32
- <article class="product" data-id="1">
33
- <h3>Product 1</h3>
34
- <p class="description">This is product 1</p>
35
- <span class="price">$10.99</span>
36
- <div class="hidden stock">In stock: 5</div>
37
- </article>
38
- <article class="product" data-id="2">
39
- <h3>Product 2</h3>
40
- <p class="description">This is product 2</p>
41
- <span class="price">$20.99</span>
42
- <div class="hidden stock">In stock: 3</div>
43
- </article>
44
- <article class="product" data-id="3">
45
- <h3>Product 3</h3>
46
- <p class="description">This is product 3</p>
47
- <span class="price">$15.99</span>
48
- <div class="hidden stock">Out of stock</div>
49
- </article>
10
+ @pytest.fixture
11
+ def html_content():
12
+ return '''
13
+ <html>
14
+ <head>
15
+ <title>Complex Web Page</title>
16
+ <style>
17
+ .hidden { display: none; }
18
+ </style>
19
+ </head>
20
+ <body>
21
+ <header>
22
+ <nav>
23
+ <ul>
24
+ <li><a href="#home">Home</a></li>
25
+ <li><a href="#about">About</a></li>
26
+ <li><a href="#contact">Contact</a></li>
27
+ </ul>
28
+ </nav>
29
+ </header>
30
+ <main>
31
+ <section id="products" schema='{"jsonable": "data"}'>
32
+ <h2>Products</h2>
33
+ <div class="product-list">
34
+ <article class="product" data-id="1">
35
+ <h3>Product 1</h3>
36
+ <p class="description">This is product 1</p>
37
+ <span class="price">$10.99</span>
38
+ <div class="hidden stock">In stock: 5</div>
39
+ </article>
40
+ <article class="product" data-id="2">
41
+ <h3>Product 2</h3>
42
+ <p class="description">This is product 2</p>
43
+ <span class="price">$20.99</span>
44
+ <div class="hidden stock">In stock: 3</div>
45
+ </article>
46
+ <article class="product" data-id="3">
47
+ <h3>Product 3</h3>
48
+ <p class="description">This is product 3</p>
49
+ <span class="price">$15.99</span>
50
+ <div class="hidden stock">Out of stock</div>
51
+ </article>
52
+ </div>
53
+ </section>
54
+ <section id="reviews">
55
+ <h2>Customer Reviews</h2>
56
+ <div class="review-list">
57
+ <div class="review" data-rating="5">
58
+ <p class="review-text">Great product!</p>
59
+ <span class="reviewer">John Doe</span>
50
60
  </div>
51
- </section>
52
- <section id="reviews">
53
- <h2>Customer Reviews</h2>
54
- <div class="review-list">
55
- <div class="review" data-rating="5">
56
- <p class="review-text">Great product!</p>
57
- <span class="reviewer">John Doe</span>
58
- </div>
59
- <div class="review" data-rating="4">
60
- <p class="review-text">Good value for money.</p>
61
- <span class="reviewer">Jane Smith</span>
62
- </div>
61
+ <div class="review" data-rating="4">
62
+ <p class="review-text">Good value for money.</p>
63
+ <span class="reviewer">Jane Smith</span>
63
64
  </div>
64
- </section>
65
- </main>
66
- <footer>
67
- <p>&copy; 2024 Our Company</p>
68
- </footer>
69
- <script id="page-data" type="application/json">
70
- {"lastUpdated": "2024-09-22T10:30:00Z", "totalProducts": 3}
71
- </script>
72
- </body>
73
- </html>
74
- '''
75
- self.page = Adaptor(self.html, auto_match=False, debug=False)
76
-
77
- def test_css_selector(self):
78
- """Test Selecting elements with complex CSS selectors"""
79
- elements = self.page.css('main #products .product-list article.product')
80
- self.assertEqual(len(elements), 3)
81
-
82
- in_stock_products = self.page.css(
65
+ </div>
66
+ </section>
67
+ </main>
68
+ <footer>
69
+ <p>&copy; 2024 Our Company</p>
70
+ </footer>
71
+ <script id="page-data" type="application/json">
72
+ {"lastUpdated": "2024-09-22T10:30:00Z", "totalProducts": 3}
73
+ </script>
74
+ </body>
75
+ </html>
76
+ '''
77
+
78
+
79
+ @pytest.fixture
80
+ def page(html_content):
81
+ return Adaptor(html_content, auto_match=False)
82
+
83
+
84
+ # CSS Selector Tests
85
+ class TestCSSSelectors:
86
+ def test_basic_product_selection(self, page):
87
+ """Test selecting all product elements"""
88
+ elements = page.css('main #products .product-list article.product')
89
+ assert len(elements) == 3
90
+
91
+ def test_in_stock_product_selection(self, page):
92
+ """Test selecting in-stock products"""
93
+ in_stock_products = page.css(
83
94
  'main #products .product-list article.product:not(:contains("Out of stock"))')
84
- self.assertEqual(len(in_stock_products), 2)
95
+ assert len(in_stock_products) == 2
96
+
85
97
 
86
- def test_xpath_selector(self):
87
- """Test Selecting elements with Complex XPath selectors"""
88
- reviews = self.page.xpath(
98
+ # XPath Selector Tests
99
+ class TestXPathSelectors:
100
+ def test_high_rating_reviews(self, page):
101
+ """Test selecting reviews with high ratings"""
102
+ reviews = page.xpath(
89
103
  '//section[@id="reviews"]//div[contains(@class, "review") and @data-rating >= 4]'
90
104
  )
91
- self.assertEqual(len(reviews), 2)
105
+ assert len(reviews) == 2
92
106
 
93
- high_priced_products = self.page.xpath(
107
+ def test_high_priced_products(self, page):
108
+ """Test selecting products above a certain price"""
109
+ high_priced_products = page.xpath(
94
110
  '//article[contains(@class, "product")]'
95
111
  '[number(translate(substring-after(.//span[@class="price"], "$"), ",", "")) > 15]'
96
112
  )
97
- self.assertEqual(len(high_priced_products), 2)
113
+ assert len(high_priced_products) == 2
114
+
115
+
116
+ # Text Matching Tests
117
+ class TestTextMatching:
118
+ def test_regex_multiple_matches(self, page):
119
+ """Test finding multiple matches with regex"""
120
+ stock_info = page.find_by_regex(r'In stock: \d+', first_match=False)
121
+ assert len(stock_info) == 2
98
122
 
99
- def test_find_by_text(self):
100
- """Test Selecting elements with Text matching"""
101
- stock_info = self.page.find_by_regex(r'In stock: \d+', first_match=False)
102
- self.assertEqual(len(stock_info), 2)
123
+ def test_regex_first_match(self, page):
124
+ """Test finding the first match with regex"""
125
+ stock_info = page.find_by_regex(r'In stock: \d+', first_match=True, case_sensitive=True)
126
+ assert stock_info.text == 'In stock: 5'
103
127
 
104
- stock_info = self.page.find_by_regex(r'In stock: \d+', first_match=True, case_sensitive=True)
105
- self.assertEqual(stock_info.text, 'In stock: 5')
128
+ def test_partial_text_match(self, page):
129
+ """Test finding elements with partial text match"""
130
+ stock_info = page.find_by_text(r'In stock:', partial=True, first_match=False)
131
+ assert len(stock_info) == 2
106
132
 
107
- stock_info = self.page.find_by_text(r'In stock:', partial=True, first_match=False)
108
- self.assertEqual(len(stock_info), 2)
133
+ def test_exact_text_match(self, page):
134
+ """Test finding elements with exact text match"""
135
+ out_of_stock = page.find_by_text('Out of stock', partial=False, first_match=False)
136
+ assert len(out_of_stock) == 1
109
137
 
110
- out_of_stock = self.page.find_by_text('Out of stock', partial=False, first_match=False)
111
- self.assertEqual(len(out_of_stock), 1)
112
138
 
113
- def test_find_similar_elements(self):
114
- """Test Finding similar elements of an element"""
115
- first_product = self.page.css_first('.product')
139
+ # Similar Elements Tests
140
+ class TestSimilarElements:
141
+ def test_finding_similar_products(self, page):
142
+ """Test finding similar product elements"""
143
+ first_product = page.css_first('.product')
116
144
  similar_products = first_product.find_similar()
117
- self.assertEqual(len(similar_products), 2)
145
+ assert len(similar_products) == 2
118
146
 
119
- first_review = self.page.find('div', class_='review')
147
+ def test_finding_similar_reviews(self, page):
148
+ """Test finding similar review elements with additional filtering"""
149
+ first_review = page.find('div', class_='review')
120
150
  similar_high_rated_reviews = [
121
151
  review
122
152
  for review in first_review.find_similar()
123
153
  if int(review.attrib.get('data-rating', 0)) >= 4
124
154
  ]
125
- self.assertEqual(len(similar_high_rated_reviews), 1)
155
+ assert len(similar_high_rated_reviews) == 1
126
156
 
127
- def test_expected_errors(self):
128
- """Test errors that should raised if it does"""
129
- with self.assertRaises(ValueError):
157
+
158
+ # Error Handling Tests
159
+ class TestErrorHandling:
160
+ def test_invalid_adaptor_initialization(self):
161
+ """Test various invalid Adaptor initializations"""
162
+ # No arguments
163
+ with pytest.raises(ValueError):
130
164
  _ = Adaptor(auto_match=False)
131
165
 
132
- with self.assertRaises(TypeError):
166
+ # Invalid argument types
167
+ with pytest.raises(TypeError):
133
168
  _ = Adaptor(root="ayo", auto_match=False)
134
169
 
135
- with self.assertRaises(TypeError):
170
+ with pytest.raises(TypeError):
136
171
  _ = Adaptor(text=1, auto_match=False)
137
172
 
138
- with self.assertRaises(TypeError):
173
+ with pytest.raises(TypeError):
139
174
  _ = Adaptor(body=1, auto_match=False)
140
175
 
141
- with self.assertRaises(ValueError):
142
- _ = Adaptor(self.html, storage=object, auto_match=True)
143
-
144
- def test_pickleable(self):
145
- """Test that objects aren't pickleable"""
146
- table = self.page.css('.product-list')[0]
147
- with self.assertRaises(TypeError): # Adaptors
148
- pickle.dumps(table)
149
-
150
- with self.assertRaises(TypeError): # Adaptor
151
- pickle.dumps(table[0])
152
-
153
- def test_overridden(self):
154
- """Test overridden functions"""
155
- table = self.page.css('.product-list')[0]
156
- self.assertTrue(issubclass(type(table.__str__()), str))
157
- self.assertTrue(issubclass(type(table.__repr__()), str))
158
- self.assertTrue(issubclass(type(table.attrib.__str__()), str))
159
- self.assertTrue(issubclass(type(table.attrib.__repr__()), str))
160
-
161
- def test_bad_selector(self):
162
- """Test object can handle bad selector"""
163
- with self.assertRaises((SelectorError, SelectorSyntaxError,)):
164
- self.page.css('4 ayo')
165
-
166
- with self.assertRaises((SelectorError, SelectorSyntaxError,)):
167
- self.page.xpath('4 ayo')
176
+ def test_invalid_storage(self, page, html_content):
177
+ """Test invalid storage parameter"""
178
+ with pytest.raises(ValueError):
179
+ _ = Adaptor(html_content, storage=object, auto_match=True)
168
180
 
169
- def test_selectors_generation(self):
170
- """Try to create selectors for all elements in the page"""
171
- def _traverse(element: Adaptor):
172
- self.assertTrue(type(element.generate_css_selector) is str)
173
- self.assertTrue(type(element.generate_xpath_selector) is str)
174
- for branch in element.children:
175
- _traverse(branch)
181
+ def test_bad_selectors(self, page):
182
+ """Test handling of invalid selectors"""
183
+ with pytest.raises((SelectorError, SelectorSyntaxError)):
184
+ page.css('4 ayo')
176
185
 
177
- _traverse(self.page)
186
+ with pytest.raises((SelectorError, SelectorSyntaxError)):
187
+ page.xpath('4 ayo')
178
188
 
179
- def test_getting_all_text(self):
180
- """Test getting all text"""
181
- self.assertNotEqual(self.page.get_all_text(), '')
182
189
 
183
- def test_element_navigation(self):
184
- """Test moving in the page from selected element"""
185
- table = self.page.css('.product-list')[0]
190
+ # Pickling and Object Representation Tests
191
+ class TestPicklingAndRepresentation:
192
+ def test_unpickleable_objects(self, page):
193
+ """Test that Adaptor objects cannot be pickled"""
194
+ table = page.css('.product-list')[0]
195
+ with pytest.raises(TypeError):
196
+ pickle.dumps(table)
186
197
 
187
- self.assertIsNot(table.path, [])
188
- self.assertNotEqual(table.html_content, '')
189
- self.assertNotEqual(table.prettify(), '')
198
+ with pytest.raises(TypeError):
199
+ pickle.dumps(table[0])
190
200
 
201
+ def test_string_representations(self, page):
202
+ """Test custom string representations of objects"""
203
+ table = page.css('.product-list')[0]
204
+ assert issubclass(type(table.__str__()), str)
205
+ assert issubclass(type(table.__repr__()), str)
206
+ assert issubclass(type(table.attrib.__str__()), str)
207
+ assert issubclass(type(table.attrib.__repr__()), str)
208
+
209
+
210
+ # Navigation and Traversal Tests
211
+ class TestElementNavigation:
212
+ def test_basic_navigation_properties(self, page):
213
+ """Test basic navigation properties of elements"""
214
+ table = page.css('.product-list')[0]
215
+ assert table.path is not None
216
+ assert table.html_content != ''
217
+ assert table.prettify() != ''
218
+
219
+ def test_parent_and_sibling_navigation(self, page):
220
+ """Test parent and sibling navigation"""
221
+ table = page.css('.product-list')[0]
191
222
  parent = table.parent
192
- self.assertEqual(parent.attrib['id'], 'products')
193
-
194
- children = table.children
195
- self.assertEqual(len(children), 3)
223
+ assert parent.attrib['id'] == 'products'
196
224
 
197
225
  parent_siblings = parent.siblings
198
- self.assertEqual(len(parent_siblings), 1)
226
+ assert len(parent_siblings) == 1
199
227
 
200
- child = table.find({'data-id': "1"})
228
+ def test_child_navigation(self, page):
229
+ """Test child navigation"""
230
+ table = page.css('.product-list')[0]
231
+ children = table.children
232
+ assert len(children) == 3
233
+
234
+ def test_next_and_previous_navigation(self, page):
235
+ """Test next and previous element navigation"""
236
+ child = page.css('.product-list')[0].find({'data-id': "1"})
201
237
  next_element = child.next
202
- self.assertEqual(next_element.attrib['data-id'], '2')
238
+ assert next_element.attrib['data-id'] == '2'
203
239
 
204
240
  prev_element = next_element.previous
205
- self.assertEqual(prev_element.tag, child.tag)
241
+ assert prev_element.tag == child.tag
206
242
 
207
- all_prices = self.page.css('.price')
243
+ def test_ancestor_finding(self, page):
244
+ """Test finding ancestors of elements"""
245
+ all_prices = page.css('.price')
208
246
  products_with_prices = [
209
247
  price.find_ancestor(lambda p: p.has_class('product'))
210
248
  for price in all_prices
211
249
  ]
212
- self.assertEqual(len(products_with_prices), 3)
213
-
214
- def test_empty_return(self):
215
- """Test cases where functions shouldn't have results"""
216
- test_html = """
217
- <html>
218
- <span id="a"><a></a><!--comment--></span>
219
- <span id="b"><!--comment--><a></a></span>
220
- </html>"""
221
- soup = Adaptor(test_html, auto_match=False, keep_comments=False)
222
- html_tag = soup.css('html')[0]
223
- self.assertEqual(html_tag.path, [])
224
- self.assertEqual(html_tag.siblings, [])
225
- self.assertEqual(html_tag.parent, None)
226
- self.assertEqual(html_tag.find_ancestor(lambda e: e), None)
227
-
228
- self.assertEqual(soup.css('#a a')[0].next, None)
229
- self.assertEqual(soup.css('#b a')[0].previous, None)
230
-
231
- def test_text_to_json(self):
232
- """Test converting text to json"""
233
- script_content = self.page.css('#page-data::text')[0]
234
- self.assertTrue(issubclass(type(script_content.sort()), str))
250
+ assert len(products_with_prices) == 3
251
+
252
+
253
+ # JSON and Attribute Tests
254
+ class TestJSONAndAttributes:
255
+ def test_json_conversion(self, page):
256
+ """Test converting content to JSON"""
257
+ script_content = page.css('#page-data::text')[0]
258
+ assert issubclass(type(script_content.sort()), str)
235
259
  page_data = script_content.json()
236
- self.assertEqual(page_data['totalProducts'], 3)
237
- self.assertTrue('lastUpdated' in page_data)
238
-
239
- def test_regex_on_text(self):
240
- """Test doing regex on a selected text"""
241
- element = self.page.css('[data-id="1"] .price')[0]
242
- match = element.re_first(r'[\.\d]+')
243
- self.assertEqual(match, '10.99')
244
- match = element.text.re(r'(\d+)', replace_entities=False)
245
- self.assertEqual(len(match), 2)
246
-
247
- def test_attribute_operations(self):
248
- """Test operations on elements attributes"""
249
- products = self.page.css('.product')
260
+ assert page_data['totalProducts'] == 3
261
+ assert 'lastUpdated' in page_data
262
+
263
+ def test_attribute_operations(self, page):
264
+ """Test various attribute-related operations"""
265
+ # Product ID extraction
266
+ products = page.css('.product')
250
267
  product_ids = [product.attrib['data-id'] for product in products]
251
- self.assertEqual(product_ids, ['1', '2', '3'])
252
- self.assertTrue('data-id' in products[0].attrib)
268
+ assert product_ids == ['1', '2', '3']
269
+ assert 'data-id' in products[0].attrib
253
270
 
254
- reviews = self.page.css('.review')
271
+ # Review rating calculations
272
+ reviews = page.css('.review')
255
273
  review_ratings = [int(review.attrib['data-rating']) for review in reviews]
256
- self.assertEqual(sum(review_ratings) / len(review_ratings), 4.5)
274
+ assert sum(review_ratings) / len(review_ratings) == 4.5
257
275
 
276
+ # Attribute searching
258
277
  key_value = list(products[0].attrib.search_values('1', partial=False))
259
- self.assertEqual(list(key_value[0].keys()), ['data-id'])
278
+ assert list(key_value[0].keys()) == ['data-id']
260
279
 
261
280
  key_value = list(products[0].attrib.search_values('1', partial=True))
262
- self.assertEqual(list(key_value[0].keys()), ['data-id'])
281
+ assert list(key_value[0].keys()) == ['data-id']
282
+
283
+ # JSON attribute conversion
284
+ attr_json = page.css_first('#products').attrib['schema'].json()
285
+ assert attr_json == {'jsonable': 'data'}
286
+ assert isinstance(page.css('#products')[0].attrib.json_string, bytes)
287
+
288
+
289
+ # Performance Test
290
+ def test_large_html_parsing_performance():
291
+ """Test parsing and selecting performance on large HTML"""
292
+ large_html = '<html><body>' + '<div class="item">' * 5000 + '</div>' * 5000 + '</body></html>'
293
+
294
+ start_time = time.time()
295
+ parsed = Adaptor(large_html, auto_match=False)
296
+ elements = parsed.css('.item')
297
+ end_time = time.time()
298
+
299
+ assert len(elements) == 5000
300
+ # Converting 5000 elements to a class and doing operations on them will take time
301
+ # Based on my tests with 100 runs, 1 loop each Scrapling (given the extra work/features) takes 10.4ms on average
302
+ assert end_time - start_time < 0.5 # Locally I test on 0.1 but on GitHub actions with browsers and threading sometimes closing adds fractions of seconds
303
+
304
+
305
+ # Selector Generation Test
306
+ def test_selectors_generation(page):
307
+ """Try to create selectors for all elements in the page"""
263
308
 
264
- attr_json = self.page.css_first('#products').attrib['schema'].json()
265
- self.assertEqual(attr_json, {'jsonable': 'data'})
266
- self.assertEqual(type(self.page.css('#products')[0].attrib.json_string), bytes)
309
+ def _traverse(element: Adaptor):
310
+ assert isinstance(element.generate_css_selector, str)
311
+ assert isinstance(element.generate_xpath_selector, str)
312
+ for branch in element.children:
313
+ _traverse(branch)
267
314
 
268
- def test_performance(self):
269
- """Test parsing and selecting speed"""
270
- import time
271
- large_html = '<html><body>' + '<div class="item">' * 5000 + '</div>' * 5000 + '</body></html>'
315
+ _traverse(page)
272
316
 
273
- start_time = time.time()
274
- parsed = Adaptor(large_html, auto_match=False, debug=False)
275
- elements = parsed.css('.item')
276
- end_time = time.time()
277
317
 
278
- self.assertEqual(len(elements), 5000)
279
- # Converting 5000 elements to a class and doing operations on them will take time
280
- # Based on my tests with 100 runs, 1 loop each Scrapling (given the extra work/features) takes 10.4ms on average
281
- self.assertLess(end_time - start_time, 0.5) # Locally I test on 0.1 but on GitHub actions with browsers and threading sometimes closing adds fractions of seconds
318
+ # Miscellaneous Tests
319
+ def test_getting_all_text(page):
320
+ """Test getting all text from the page"""
321
+ assert page.get_all_text() != ''
282
322
 
283
323
 
284
- # Use `coverage run -m unittest --verbose tests/test_parser_functions.py` instead for the coverage report
285
- # if __name__ == '__main__':
286
- # unittest.main(verbosity=2)
324
+ def test_regex_on_text(page):
325
+ """Test regex operations on text"""
326
+ element = page.css('[data-id="1"] .price')[0]
327
+ match = element.re_first(r'[\.\d]+')
328
+ assert match == '10.99'
329
+ match = element.text.re(r'(\d+)', replace_entities=False)
330
+ assert len(match) == 2