scrapling 0.1.2__py3-none-any.whl → 0.2.1__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- scrapling/__init__.py +4 -3
- scrapling/core/__init__.py +0 -0
- scrapling/core/_types.py +25 -0
- scrapling/{custom_types.py → core/custom_types.py} +48 -3
- scrapling/{mixins.py → core/mixins.py} +22 -7
- scrapling/{storage_adaptors.py → core/storage_adaptors.py} +2 -2
- scrapling/{translator.py → core/translator.py} +2 -12
- scrapling/{utils.py → core/utils.py} +14 -61
- scrapling/engines/__init__.py +7 -0
- scrapling/engines/camo.py +128 -0
- scrapling/engines/constants.py +108 -0
- scrapling/engines/pw.py +237 -0
- scrapling/engines/static.py +112 -0
- scrapling/engines/toolbelt/__init__.py +19 -0
- scrapling/engines/toolbelt/custom.py +154 -0
- scrapling/engines/toolbelt/fingerprints.py +81 -0
- scrapling/engines/toolbelt/navigation.py +108 -0
- scrapling/fetchers.py +198 -0
- scrapling/parser.py +223 -70
- scrapling/py.typed +1 -0
- scrapling-0.2.1.dist-info/METADATA +835 -0
- scrapling-0.2.1.dist-info/RECORD +33 -0
- {scrapling-0.1.2.dist-info → scrapling-0.2.1.dist-info}/WHEEL +1 -1
- {scrapling-0.1.2.dist-info → scrapling-0.2.1.dist-info}/top_level.txt +1 -0
- tests/__init__.py +1 -0
- tests/fetchers/__init__.py +1 -0
- tests/fetchers/test_camoufox.py +62 -0
- tests/fetchers/test_httpx.py +67 -0
- tests/fetchers/test_playwright.py +74 -0
- tests/parser/__init__.py +0 -0
- tests/parser/test_automatch.py +56 -0
- tests/parser/test_general.py +286 -0
- scrapling-0.1.2.dist-info/METADATA +0 -477
- scrapling-0.1.2.dist-info/RECORD +0 -12
- {scrapling-0.1.2.dist-info → scrapling-0.2.1.dist-info}/LICENSE +0 -0
@@ -0,0 +1,33 @@
|
|
1
|
+
scrapling/__init__.py,sha256=x8S2Da-4KgUBzNYdM9ahYw3hDw5875KnpDliQWxQiGo,435
|
2
|
+
scrapling/fetchers.py,sha256=_6mL7XSTZE1fHXBqbxE2bBHnlQP1lH-4MCiQHQd5hQs,16017
|
3
|
+
scrapling/parser.py,sha256=VGbrARu2hxXyKLbUgtdtht_tljDYPT1jaWZWgoncv5U,53551
|
4
|
+
scrapling/py.typed,sha256=frcCV1k9oG9oKj3dpUqdJg1PxRT2RSN_XKdLCPjaYaY,2
|
5
|
+
scrapling/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
6
|
+
scrapling/core/_types.py,sha256=nD2ZY_fitLohx3MfDmqoKJ9ZShrnRhQ8-d1SU1zEGAY,552
|
7
|
+
scrapling/core/custom_types.py,sha256=-gMNOiByewoAUqFVrDp822V51rcWNlWVUOB6yGUL648,8403
|
8
|
+
scrapling/core/mixins.py,sha256=sozbpaGL1_O_x3U-ABM5aYWpnxpCLfdbcA9SG3P7weY,3532
|
9
|
+
scrapling/core/storage_adaptors.py,sha256=Kbak0BOJX5e9I1PbUS_4sUJi2Wxw8Bv5XsaLHAu1l2Q,6218
|
10
|
+
scrapling/core/translator.py,sha256=oU-dQCkNQOccZPrXbPW_VSgC5ll10Bb89C3ezW2lI0o,5228
|
11
|
+
scrapling/core/utils.py,sha256=fXdANUgRBbVbOerJ94fRY9vi7n5zsbm8t3G4qQ-F3ak,3792
|
12
|
+
scrapling/engines/__init__.py,sha256=zwMqcSdNGh-IX0d4zXazrgAeHrkqIN_v5Ia7RU1g8W0,267
|
13
|
+
scrapling/engines/camo.py,sha256=P8kPxP0awgV-AGMibMNDJUaxZC9oYDP64Ei_dk9D3jA,7549
|
14
|
+
scrapling/engines/constants.py,sha256=jSDA6lgbvEIB8z2m2SFzCKkvFEZnp28Mondy2__FpkM,3721
|
15
|
+
scrapling/engines/pw.py,sha256=JKPdJkfz--8YyngLxFNwEyWF0O3_o5xR7zQCxF1D8Cs,12121
|
16
|
+
scrapling/engines/static.py,sha256=dY1iLBe7YhzRJYd9MM8P7hbqF44cpwOgTJ6CkIVfaRA,7120
|
17
|
+
scrapling/engines/toolbelt/__init__.py,sha256=BnBp34aDeohYgqdysEAAWnGZgA02YlExkc5FJLetMSo,367
|
18
|
+
scrapling/engines/toolbelt/custom.py,sha256=XB_oINjmVnigODxfP9hl-teRy0BkJqfrEprWDAqO-Jo,7473
|
19
|
+
scrapling/engines/toolbelt/fingerprints.py,sha256=kkVtZKSt2ukc0CV0g6QUvSWR0Yx5p8Mv8xiqACAsMBo,2917
|
20
|
+
scrapling/engines/toolbelt/navigation.py,sha256=Tde5_6Wv7lOeWXMzs9D6TRaxAbJ3b-zIX6-4HggZbCQ,4017
|
21
|
+
tests/__init__.py,sha256=YHFB5ftzgLQVh6gbPfbYcY4yOS9DOBp5dBa6I-qtm8U,32
|
22
|
+
tests/fetchers/__init__.py,sha256=6H4NgARhyTcGGd3dNCKQJ8kUFdrAEMSScQL7Ga_vU3c,43
|
23
|
+
tests/fetchers/test_camoufox.py,sha256=XPTCDZ9sj_GpCzXyvzKF_uZWhEYX6J_jh_BLeMEl8yY,2874
|
24
|
+
tests/fetchers/test_httpx.py,sha256=UivOItR3-l-bXp9E6TP5Tvn2OrCdgiVkWsti-f9xdpU,3507
|
25
|
+
tests/fetchers/test_playwright.py,sha256=YOWn89urd9NwoCHfTFj8fY4xYrRY2BeszTt5Q-TxUcs,3479
|
26
|
+
tests/parser/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
27
|
+
tests/parser/test_automatch.py,sha256=BeeYJi3cYCghbiZmi57z4bqcGPaoUA8GAm7MALBBkkk,2486
|
28
|
+
tests/parser/test_general.py,sha256=NfTuGLgAm-LH0dVV0pvbRcYSNI-wSu05rdnuRzmB0m4,11664
|
29
|
+
scrapling-0.2.1.dist-info/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
|
30
|
+
scrapling-0.2.1.dist-info/METADATA,sha256=aeExP8jl7VQxIUnfvvo4QxIeasqfziscacOrOoHOuXk,64155
|
31
|
+
scrapling-0.2.1.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
|
32
|
+
scrapling-0.2.1.dist-info/top_level.txt,sha256=ub7FkOEXeYmmYTUxd4pCrwXfBfAMIpZ1sCGmXCc14tI,16
|
33
|
+
scrapling-0.2.1.dist-info/RECORD,,
|
tests/__init__.py
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
"""Package for test project."""
|
@@ -0,0 +1 @@
|
|
1
|
+
# Because I'm too lazy to mock requests :)
|
@@ -0,0 +1,62 @@
|
|
1
|
+
import unittest
|
2
|
+
import pytest_httpbin
|
3
|
+
|
4
|
+
from scrapling import StealthyFetcher
|
5
|
+
|
6
|
+
|
7
|
+
@pytest_httpbin.use_class_based_httpbin
# @pytest_httpbin.use_class_based_httpbin_secure
class TestStealthyFetcher(unittest.TestCase):
    """Smoke tests for ``StealthyFetcher`` against a local httpbin instance."""

    def setUp(self):
        self.fetcher = StealthyFetcher(auto_match=False)
        base = self.httpbin.url
        self.status_200 = f'{base}/status/200'
        self.status_404 = f'{base}/status/404'
        self.status_501 = f'{base}/status/501'
        self.basic_url = f'{base}/get'
        self.html_url = f'{base}/html'
        self.delayed_url = f'{base}/delay/10'  # httpbin delays this response by 10 seconds
        self.cookies_url = f"{base}/cookies/set/test/value"

    def test_basic_fetch(self):
        """Fetching should surface the server's status code unchanged."""
        for target, expected in (
                (self.status_200, 200),
                (self.status_404, 404),
                (self.status_501, 501),
        ):
            self.assertEqual(self.fetcher.fetch(target).status, expected)

    def test_networkidle(self):
        """Waiting for `networkidle` must not stop the page from finishing loading."""
        response = self.fetcher.fetch(self.basic_url, network_idle=True)
        self.assertEqual(response.status, 200)

    def test_blocking_resources(self):
        """Blocking images/resources must not stop the page from finishing loading."""
        self.assertEqual(self.fetcher.fetch(self.basic_url, block_images=True).status, 200)
        self.assertEqual(self.fetcher.fetch(self.basic_url, disable_resources=True).status, 200)

    def test_waiting_selector(self):
        """Waiting on a selector must not stop the page from finishing loading."""
        response = self.fetcher.fetch(self.html_url, wait_selector='h1')
        self.assertEqual(response.status, 200)

    def test_cookies_loading(self):
        """Cookies set by the server should be exposed on the response."""
        self.assertEqual(self.fetcher.fetch(self.cookies_url).cookies, {'test': 'value'})

    def test_automation(self):
        """A `page_action` callback should run without breaking the fetch."""
        def scroll_page(page):
            page.mouse.wheel(10, 0)
            page.mouse.move(100, 400)
            page.mouse.up()
            return page

        self.assertEqual(self.fetcher.fetch(self.html_url, page_action=scroll_page).status, 200)

    def test_properties(self):
        """Every webrtc/webgl flag combination should still fetch successfully."""
        for block_webrtc, allow_webgl in ((True, True), (False, True), (True, False)):
            response = self.fetcher.fetch(
                self.html_url, block_webrtc=block_webrtc, allow_webgl=allow_webgl
            )
            self.assertEqual(response.status, 200)

    def test_infinite_timeout(self):
        """`timeout=None` (no timeout) should still complete the request."""
        self.assertEqual(self.fetcher.fetch(self.delayed_url, timeout=None).status, 200)
|
@@ -0,0 +1,67 @@
|
|
1
|
+
import unittest
|
2
|
+
import pytest_httpbin
|
3
|
+
|
4
|
+
from scrapling import Fetcher
|
5
|
+
|
6
|
+
|
7
|
+
@pytest_httpbin.use_class_based_httpbin
class TestFetcher(unittest.TestCase):
    """Smoke tests for the plain HTTP ``Fetcher`` against a local httpbin instance."""

    def setUp(self):
        self.fetcher = Fetcher(auto_match=False)
        base = self.httpbin.url
        self.status_200 = f'{base}/status/200'
        self.status_404 = f'{base}/status/404'
        self.status_501 = f'{base}/status/501'
        self.basic_url = f'{base}/get'
        self.post_url = f'{base}/post'
        self.put_url = f'{base}/put'
        self.delete_url = f'{base}/delete'
        self.html_url = f'{base}/html'

    def test_basic_get(self):
        """GET should surface the server's status code unchanged."""
        for target, expected in (
                (self.status_200, 200),
                (self.status_404, 404),
                (self.status_501, 501),
        ):
            self.assertEqual(self.fetcher.get(target).status, expected)

    def test_get_properties(self):
        """GET should succeed with every optional-argument combination."""
        for extra in (
                {'stealthy_headers': True},
                {'follow_redirects': True},
                {'timeout': None},
                {'stealthy_headers': True, 'follow_redirects': True, 'timeout': None},
        ):
            self.assertEqual(self.fetcher.get(self.status_200, **extra).status, 200)

    def test_post_properties(self):
        """POST should succeed with every optional-argument combination."""
        for extra in (
                {},
                {'stealthy_headers': True},
                {'follow_redirects': True},
                {'timeout': None},
                {'stealthy_headers': True, 'follow_redirects': True, 'timeout': None},
        ):
            response = self.fetcher.post(self.post_url, data={'key': 'value'}, **extra)
            self.assertEqual(response.status, 200)

    def test_put_properties(self):
        """PUT should succeed with every optional-argument combination."""
        for extra in (
                {},
                {'stealthy_headers': True},
                {'follow_redirects': True},
                {'timeout': None},
                {'stealthy_headers': True, 'follow_redirects': True, 'timeout': None},
        ):
            response = self.fetcher.put(self.put_url, data={'key': 'value'}, **extra)
            self.assertEqual(response.status, 200)

    def test_delete_properties(self):
        """DELETE should succeed with every optional-argument combination."""
        for extra in (
                {'stealthy_headers': True},
                {'follow_redirects': True},
                {'timeout': None},
                {'stealthy_headers': True, 'follow_redirects': True, 'timeout': None},
        ):
            self.assertEqual(self.fetcher.delete(self.delete_url, **extra).status, 200)
|
@@ -0,0 +1,74 @@
|
|
1
|
+
import unittest
|
2
|
+
import pytest_httpbin
|
3
|
+
|
4
|
+
from scrapling import PlayWrightFetcher
|
5
|
+
|
6
|
+
|
7
|
+
@pytest_httpbin.use_class_based_httpbin
# @pytest_httpbin.use_class_based_httpbin_secure
class TestPlayWrightFetcher(unittest.TestCase):
    """Smoke tests for ``PlayWrightFetcher`` against a local httpbin instance."""

    def setUp(self):
        self.fetcher = PlayWrightFetcher(auto_match=False)
        base = self.httpbin.url
        self.status_200 = f'{base}/status/200'
        self.status_404 = f'{base}/status/404'
        self.status_501 = f'{base}/status/501'
        self.basic_url = f'{base}/get'
        self.html_url = f'{base}/html'
        self.delayed_url = f'{base}/delay/10'  # httpbin delays this response by 10 seconds
        self.cookies_url = f"{base}/cookies/set/test/value"

    def test_basic_fetch(self):
        """Fetching should surface the server's status code unchanged."""
        for target, expected in (
                (self.status_200, 200),
                (self.status_404, 404),
                (self.status_501, 501),
        ):
            self.assertEqual(self.fetcher.fetch(target).status, expected)

    def test_networkidle(self):
        """Waiting for `networkidle` must not stop the page from finishing loading."""
        response = self.fetcher.fetch(self.basic_url, network_idle=True)
        self.assertEqual(response.status, 200)

    def test_blocking_resources(self):
        """Blocking resources must not stop the page from finishing loading."""
        response = self.fetcher.fetch(self.basic_url, disable_resources=True)
        self.assertEqual(response.status, 200)

    def test_waiting_selector(self):
        """Waiting on a selector must not stop the page from finishing loading."""
        response = self.fetcher.fetch(self.html_url, wait_selector='h1')
        self.assertEqual(response.status, 200)

    def test_cookies_loading(self):
        """Cookies set by the server should be exposed on the response."""
        self.assertEqual(self.fetcher.fetch(self.cookies_url).cookies, {'test': 'value'})

    def test_automation(self):
        """A `page_action` callback should run without breaking the fetch."""
        def scroll_page(page):
            page.mouse.wheel(10, 0)
            page.mouse.move(100, 400)
            page.mouse.up()
            return page

        self.assertEqual(self.fetcher.fetch(self.html_url, page_action=scroll_page).status, 200)

    def test_properties(self):
        """Assorted stealth/rendering arguments should not break fetching."""
        self.assertEqual(self.fetcher.fetch(self.html_url, disable_webgl=True, hide_canvas=False).status, 200)
        self.assertEqual(self.fetcher.fetch(self.html_url, disable_webgl=False, hide_canvas=True).status, 200)
        self.assertEqual(self.fetcher.fetch(self.html_url, stealth=True).status, 200)
        self.assertEqual(self.fetcher.fetch(self.html_url, useragent='Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:131.0) Gecko/20100101 Firefox/131.0').status, 200)

    def test_cdp_url(self):
        """Invalid CDP URLs must be rejected; a well-formed one is attempted."""
        with self.assertRaises(ValueError):
            _ = self.fetcher.fetch(self.html_url, cdp_url='blahblah')

        with self.assertRaises(ValueError):
            _ = self.fetcher.fetch(self.html_url, cdp_url='blahblah', nstbrowser_mode=True)

        with self.assertRaises(Exception):
            # There's no type for this error in PlayWright, it's just `Error`
            _ = self.fetcher.fetch(self.html_url, cdp_url='ws://blahblah')

    def test_infinite_timeout(self):
        """`timeout=None` (no timeout) should still complete the request."""
        self.assertEqual(self.fetcher.fetch(self.delayed_url, timeout=None).status, 200)
|
tests/parser/__init__.py
ADDED
File without changes
|
@@ -0,0 +1,56 @@
|
|
1
|
+
import unittest
|
2
|
+
|
3
|
+
from scrapling import Adaptor
|
4
|
+
|
5
|
+
|
6
|
+
class TestParserAutoMatch(unittest.TestCase):
    """Tests for the parser's auto-match element relocation feature."""

    def test_element_relocation(self):
        """A saved selector should relocate its element after the page structure changes."""
        original_html = '''
        <div class="container">
            <section class="products">
                <article class="product" id="p1">
                    <h3>Product 1</h3>
                    <p class="description">Description 1</p>
                </article>
                <article class="product" id="p2">
                    <h3>Product 2</h3>
                    <p class="description">Description 2</p>
                </article>
            </section>
        </div>
        '''
        changed_html = '''
        <div class="new-container">
            <div class="product-wrapper">
                <section class="products">
                    <article class="product new-class" data-id="p1">
                        <div class="product-info">
                            <h3>Product 1</h3>
                            <p class="new-description">Description 1</p>
                        </div>
                    </article>
                    <article class="product new-class" data-id="p2">
                        <div class="product-info">
                            <h3>Product 2</h3>
                            <p class="new-description">Description 2</p>
                        </div>
                    </article>
                </section>
            </div>
        </div>
        '''

        old_page = Adaptor(original_html, url='example.com', auto_match=True, debug=True)
        new_page = Adaptor(changed_html, url='example.com', auto_match=True, debug=True)

        # 'p1' was used as ID and now it's not, and all the path elements have changed.
        # Also exercises auto-match against a combined (comma) selector at the same time.
        _ = old_page.css('#p1, #p2', auto_save=True)[0]
        relocated = new_page.css('#p1', auto_match=True)

        self.assertIsNotNone(relocated)
        self.assertEqual(relocated[0].attrib['data-id'], 'p1')
        self.assertTrue(relocated[0].has_class('new-class'))
        self.assertEqual(relocated[0].css('.new-description')[0].text, 'Description 1')
|
@@ -0,0 +1,286 @@
|
|
1
|
+
|
2
|
+
import pickle
|
3
|
+
import unittest
|
4
|
+
from scrapling import Adaptor
|
5
|
+
from cssselect import SelectorError, SelectorSyntaxError
|
6
|
+
|
7
|
+
|
8
|
+
class TestParser(unittest.TestCase):
    """Functional tests for the ``Adaptor`` parser over a fixed HTML fixture."""

    def setUp(self):
        self.html = '''
        <html>
        <head>
            <title>Complex Web Page</title>
            <style>
                .hidden { display: none; }
            </style>
        </head>
        <body>
            <header>
                <nav>
                    <ul>
                        <li><a href="#home">Home</a></li>
                        <li><a href="#about">About</a></li>
                        <li><a href="#contact">Contact</a></li>
                    </ul>
                </nav>
            </header>
            <main>
                <section id="products" schema='{"jsonable": "data"}'>
                    <h2>Products</h2>
                    <div class="product-list">
                        <article class="product" data-id="1">
                            <h3>Product 1</h3>
                            <p class="description">This is product 1</p>
                            <span class="price">$10.99</span>
                            <div class="hidden stock">In stock: 5</div>
                        </article>
                        <article class="product" data-id="2">
                            <h3>Product 2</h3>
                            <p class="description">This is product 2</p>
                            <span class="price">$20.99</span>
                            <div class="hidden stock">In stock: 3</div>
                        </article>
                        <article class="product" data-id="3">
                            <h3>Product 3</h3>
                            <p class="description">This is product 3</p>
                            <span class="price">$15.99</span>
                            <div class="hidden stock">Out of stock</div>
                        </article>
                    </div>
                </section>
                <section id="reviews">
                    <h2>Customer Reviews</h2>
                    <div class="review-list">
                        <div class="review" data-rating="5">
                            <p class="review-text">Great product!</p>
                            <span class="reviewer">John Doe</span>
                        </div>
                        <div class="review" data-rating="4">
                            <p class="review-text">Good value for money.</p>
                            <span class="reviewer">Jane Smith</span>
                        </div>
                    </div>
                </section>
            </main>
            <footer>
                <p>&copy; 2024 Our Company</p>
            </footer>
            <script id="page-data" type="application/json">
                {"lastUpdated": "2024-09-22T10:30:00Z", "totalProducts": 3}
            </script>
        </body>
        </html>
        '''
        self.page = Adaptor(self.html, auto_match=False, debug=False)

    def test_css_selector(self):
        """Complex CSS selectors should match the expected elements."""
        matches = self.page.css('main #products .product-list article.product')
        self.assertEqual(len(matches), 3)

        in_stock_products = self.page.css(
            'main #products .product-list article.product:not(:contains("Out of stock"))')
        self.assertEqual(len(in_stock_products), 2)

    def test_xpath_selector(self):
        """Complex XPath selectors should match the expected elements."""
        highly_rated = self.page.xpath(
            '//section[@id="reviews"]//div[contains(@class, "review") and @data-rating >= 4]'
        )
        self.assertEqual(len(highly_rated), 2)

        pricey = self.page.xpath(
            '//article[contains(@class, "product")]'
            '[number(translate(substring-after(.//span[@class="price"], "$"), ",", "")) > 15]'
        )
        self.assertEqual(len(pricey), 2)

    def test_find_by_text(self):
        """Selecting elements by text/regex matching should work."""
        stock_info = self.page.find_by_regex(r'In stock: \d+', first_match=False)
        self.assertEqual(len(stock_info), 2)

        stock_info = self.page.find_by_regex(r'In stock: \d+', first_match=True, case_sensitive=True)
        self.assertEqual(stock_info.text, 'In stock: 5')

        stock_info = self.page.find_by_text(r'In stock:', partial=True, first_match=False)
        self.assertEqual(len(stock_info), 2)

        out_of_stock = self.page.find_by_text('Out of stock', partial=False, first_match=False)
        self.assertEqual(len(out_of_stock), 1)

    def test_find_similar_elements(self):
        """`find_similar` should locate structurally similar siblings."""
        first_product = self.page.css_first('.product')
        self.assertEqual(len(first_product.find_similar()), 2)

        first_review = self.page.find('div', class_='review')
        high_rated_similars = [
            candidate
            for candidate in first_review.find_similar()
            if int(candidate.attrib.get('data-rating', 0)) >= 4
        ]
        self.assertEqual(len(high_rated_similars), 1)

    def test_expected_errors(self):
        """Invalid constructor arguments should raise the documented errors."""
        with self.assertRaises(ValueError):
            _ = Adaptor(auto_match=False)

        with self.assertRaises(TypeError):
            _ = Adaptor(root="ayo", auto_match=False)

        with self.assertRaises(TypeError):
            _ = Adaptor(text=1, auto_match=False)

        with self.assertRaises(TypeError):
            _ = Adaptor(body=1, auto_match=False)

        with self.assertRaises(ValueError):
            _ = Adaptor(self.html, storage=object, auto_match=True)

    def test_pickleable(self):
        """Adaptor objects should refuse to be pickled."""
        node = self.page.css('.product-list')[0]
        with self.assertRaises(TypeError):  # Adaptors
            pickle.dumps(node)

        with self.assertRaises(TypeError):  # Adaptor
            pickle.dumps(node[0])

    def test_overridden(self):
        """Overridden dunder methods should still return string types."""
        node = self.page.css('.product-list')[0]
        self.assertTrue(issubclass(type(node.__str__()), str))
        self.assertTrue(issubclass(type(node.__repr__()), str))
        self.assertTrue(issubclass(type(node.attrib.__str__()), str))
        self.assertTrue(issubclass(type(node.attrib.__repr__()), str))

    def test_bad_selector(self):
        """Malformed selectors should raise selector errors, not crash."""
        with self.assertRaises((SelectorError, SelectorSyntaxError,)):
            self.page.css('4 ayo')

        with self.assertRaises((SelectorError, SelectorSyntaxError,)):
            self.page.xpath('4 ayo')

    def test_selectors_generation(self):
        """Selector generation should produce a string for every element in the tree."""
        def _traverse(element: Adaptor):
            self.assertTrue(type(element.generate_css_selector) is str)
            self.assertTrue(type(element.generate_xpath_selector) is str)
            for branch in element.children:
                _traverse(branch)

        _traverse(self.page)

    def test_getting_all_text(self):
        """`get_all_text` should return non-empty text for the fixture page."""
        self.assertNotEqual(self.page.get_all_text(), '')

    def test_element_navigation(self):
        """Navigation from a selected element (parent/children/siblings/next/previous) works."""
        product_list = self.page.css('.product-list')[0]

        self.assertIsNot(product_list.path, [])
        self.assertNotEqual(product_list.html_content, '')
        self.assertNotEqual(product_list.prettify(), '')

        parent = product_list.parent
        self.assertEqual(parent.attrib['id'], 'products')

        self.assertEqual(len(product_list.children), 3)

        self.assertEqual(len(parent.siblings), 1)

        first_child = product_list.find({'data-id': "1"})
        following = first_child.next
        self.assertEqual(following.attrib['data-id'], '2')

        preceding = following.previous
        self.assertEqual(preceding.tag, first_child.tag)

        all_prices = self.page.css('.price')
        products_with_prices = [
            price.find_ancestor(lambda p: p.has_class('product'))
            for price in all_prices
        ]
        self.assertEqual(len(products_with_prices), 3)

    def test_empty_return(self):
        """Navigation past the document edges should return empty/None."""
        test_html = """
        <html>
            <span id="a"><a></a><!--comment--></span>
            <span id="b"><!--comment--><a></a></span>
        </html>"""
        soup = Adaptor(test_html, auto_match=False, keep_comments=False)
        html_tag = soup.css('html')[0]
        self.assertEqual(html_tag.path, [])
        self.assertEqual(html_tag.siblings, [])
        self.assertEqual(html_tag.parent, None)
        self.assertEqual(html_tag.find_ancestor(lambda e: e), None)

        self.assertEqual(soup.css('#a a')[0].next, None)
        self.assertEqual(soup.css('#b a')[0].previous, None)

    def test_text_to_json(self):
        """Selected text should be convertible to JSON."""
        script_content = self.page.css('#page-data::text')[0]
        self.assertTrue(issubclass(type(script_content.sort()), str))
        page_data = script_content.json()
        self.assertEqual(page_data['totalProducts'], 3)
        self.assertTrue('lastUpdated' in page_data)

    def test_regex_on_text(self):
        """Regex helpers on selected text should extract expected values."""
        element = self.page.css('[data-id="1"] .price')[0]
        self.assertEqual(element.re_first(r'[\.\d]+'), '10.99')
        digit_groups = element.text.re(r'(\d+)', replace_entities=False)
        self.assertEqual(len(digit_groups), 2)

    def test_attribute_operations(self):
        """Attribute access, search, and JSON conversion should work."""
        products = self.page.css('.product')
        self.assertEqual([product.attrib['data-id'] for product in products], ['1', '2', '3'])
        self.assertTrue('data-id' in products[0].attrib)

        reviews = self.page.css('.review')
        ratings = [int(review.attrib['data-rating']) for review in reviews]
        self.assertEqual(sum(ratings) / len(ratings), 4.5)

        exact_hits = list(products[0].attrib.search_values('1', partial=False))
        self.assertEqual(list(exact_hits[0].keys()), ['data-id'])

        partial_hits = list(products[0].attrib.search_values('1', partial=True))
        self.assertEqual(list(partial_hits[0].keys()), ['data-id'])

        attr_json = self.page.css_first('#products').attrib['schema'].json()
        self.assertEqual(attr_json, {'jsonable': 'data'})
        self.assertEqual(type(self.page.css('#products')[0].attrib.json_string), bytes)

    def test_performance(self):
        """Parsing and selecting 5000 elements should stay fast."""
        import time
        large_html = '<html><body>' + '<div class="item">' * 5000 + '</div>' * 5000 + '</body></html>'

        started = time.time()
        parsed = Adaptor(large_html, auto_match=False, debug=False)
        items = parsed.css('.item')
        finished = time.time()

        self.assertEqual(len(items), 5000)
        # Converting 5000 elements to a class and doing operations on them will take time
        # Based on my tests with 100 runs, 1 loop each Scrapling (given the extra work/features) takes 10.4ms on average
        self.assertLess(finished - started, 0.1)
|
282
|
+
|
283
|
+
|
284
|
+
# Use `coverage run -m unittest --verbose tests/test_parser_functions.py` instead for the coverage report
|
285
|
+
# if __name__ == '__main__':
|
286
|
+
# unittest.main(verbosity=2)
|