scrapling 0.1.2__py3-none-any.whl → 0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +4 -3
- scrapling/core/__init__.py +0 -0
- scrapling/core/_types.py +25 -0
- scrapling/{custom_types.py → core/custom_types.py} +48 -3
- scrapling/{mixins.py → core/mixins.py} +22 -7
- scrapling/{storage_adaptors.py → core/storage_adaptors.py} +2 -2
- scrapling/{translator.py → core/translator.py} +2 -12
- scrapling/{utils.py → core/utils.py} +2 -61
- scrapling/engines/__init__.py +7 -0
- scrapling/engines/camo.py +121 -0
- scrapling/engines/constants.py +108 -0
- scrapling/engines/pw.py +232 -0
- scrapling/engines/static.py +112 -0
- scrapling/engines/toolbelt/__init__.py +18 -0
- scrapling/engines/toolbelt/custom.py +168 -0
- scrapling/engines/toolbelt/fingerprints.py +81 -0
- scrapling/engines/toolbelt/navigation.py +74 -0
- scrapling/fetchers.py +190 -0
- scrapling/parser.py +216 -51
- scrapling-0.2.dist-info/METADATA +807 -0
- scrapling-0.2.dist-info/RECORD +32 -0
- {scrapling-0.1.2.dist-info → scrapling-0.2.dist-info}/WHEEL +1 -1
- {scrapling-0.1.2.dist-info → scrapling-0.2.dist-info}/top_level.txt +1 -0
- tests/__init__.py +1 -0
- tests/fetchers/__init__.py +1 -0
- tests/fetchers/test_camoufox.py +62 -0
- tests/fetchers/test_httpx.py +67 -0
- tests/fetchers/test_playwright.py +74 -0
- tests/parser/__init__.py +0 -0
- tests/parser/test_automatch.py +56 -0
- tests/parser/test_general.py +286 -0
- scrapling-0.1.2.dist-info/METADATA +0 -477
- scrapling-0.1.2.dist-info/RECORD +0 -12
- {scrapling-0.1.2.dist-info → scrapling-0.2.dist-info}/LICENSE +0 -0
scrapling-0.2.dist-info/RECORD
ADDED
@@ -0,0 +1,32 @@
+scrapling/__init__.py,sha256=cSitNNcOc3Ud0zZvaLy5NDfZ4c8_UCLWe7FfTBazKnY,433
+scrapling/fetchers.py,sha256=KD2moKWPYEcu7Lq4zIeBXcusmhFlPPueYSjyl8fMpLQ,15365
+scrapling/parser.py,sha256=oC1I9_jDP4zemU6V9e6wDyP-CQk2aMhJzSF2BGSBGp0,54253
+scrapling/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+scrapling/core/_types.py,sha256=nD2ZY_fitLohx3MfDmqoKJ9ZShrnRhQ8-d1SU1zEGAY,552
+scrapling/core/custom_types.py,sha256=-gMNOiByewoAUqFVrDp822V51rcWNlWVUOB6yGUL648,8403
+scrapling/core/mixins.py,sha256=sozbpaGL1_O_x3U-ABM5aYWpnxpCLfdbcA9SG3P7weY,3532
+scrapling/core/storage_adaptors.py,sha256=Kbak0BOJX5e9I1PbUS_4sUJi2Wxw8Bv5XsaLHAu1l2Q,6218
+scrapling/core/translator.py,sha256=oU-dQCkNQOccZPrXbPW_VSgC5ll10Bb89C3ezW2lI0o,5228
+scrapling/core/utils.py,sha256=o35SxakRw5Bq_hpOiHu1KaSWrOBxeQpEMuOzG88NCqE,3530
+scrapling/engines/__init__.py,sha256=zwMqcSdNGh-IX0d4zXazrgAeHrkqIN_v5Ia7RU1g8W0,267
+scrapling/engines/camo.py,sha256=Cq8960Uz-y-__4OJviHXPPhjbbVz1ILt9koaPic2x8w,6954
+scrapling/engines/constants.py,sha256=jSDA6lgbvEIB8z2m2SFzCKkvFEZnp28Mondy2__FpkM,3721
+scrapling/engines/pw.py,sha256=cx1B0mfatEoGYpFkDt5zPg_cb0lKU0mu4MjuuU-COes,11805
+scrapling/engines/static.py,sha256=K-tT8mEfJY0Ix_gZceazeFIYmZ_ko4nyqZptj6POYmM,7159
+scrapling/engines/toolbelt/__init__.py,sha256=3zWs5aiV8QP5ua-cvIBkCRaDhmjWEEx_xycVpdp3ur4,341
+scrapling/engines/toolbelt/custom.py,sha256=cqXQ2UdzoH0IXBAa0ySg_90kPhlP-f2fLAauJUAMFOs,8167
+scrapling/engines/toolbelt/fingerprints.py,sha256=kkVtZKSt2ukc0CV0g6QUvSWR0Yx5p8Mv8xiqACAsMBo,2917
+scrapling/engines/toolbelt/navigation.py,sha256=04Y1zjkVAgmvbgM3tHn6NsAruh5x6ESH1w0EW8CdVxo,2452
+tests/__init__.py,sha256=YHFB5ftzgLQVh6gbPfbYcY4yOS9DOBp5dBa6I-qtm8U,32
+tests/fetchers/__init__.py,sha256=6H4NgARhyTcGGd3dNCKQJ8kUFdrAEMSScQL7Ga_vU3c,43
+tests/fetchers/test_camoufox.py,sha256=XPTCDZ9sj_GpCzXyvzKF_uZWhEYX6J_jh_BLeMEl8yY,2874
+tests/fetchers/test_httpx.py,sha256=UivOItR3-l-bXp9E6TP5Tvn2OrCdgiVkWsti-f9xdpU,3507
+tests/fetchers/test_playwright.py,sha256=YOWn89urd9NwoCHfTFj8fY4xYrRY2BeszTt5Q-TxUcs,3479
+tests/parser/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+tests/parser/test_automatch.py,sha256=BeeYJi3cYCghbiZmi57z4bqcGPaoUA8GAm7MALBBkkk,2486
+tests/parser/test_general.py,sha256=NfTuGLgAm-LH0dVV0pvbRcYSNI-wSu05rdnuRzmB0m4,11664
+scrapling-0.2.dist-info/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
+scrapling-0.2.dist-info/METADATA,sha256=yieOuAeWNwx5UMtQN-E1bsNnKEum4xGgPUynOgbG7m0,61418
+scrapling-0.2.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
+scrapling-0.2.dist-info/top_level.txt,sha256=ub7FkOEXeYmmYTUxd4pCrwXfBfAMIpZ1sCGmXCc14tI,16
+scrapling-0.2.dist-info/RECORD,,
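For reference, each RECORD entry is a CSV row of the form `path,sha256=<urlsafe-base64 digest>,<size in bytes>`; the RECORD file itself is listed last with the hash and size fields left empty, since it cannot hash itself. A minimal sketch of reading such a file (the helper name is ours, not part of scrapling):

import csv

def read_record(path):
    # Each row: file path, "sha256=<digest>" (or empty), size in bytes (or empty)
    with open(path, newline='') as f:
        for file_path, file_hash, size in csv.reader(f):
            yield file_path, file_hash or None, int(size) if size else None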
tests/__init__.py
ADDED
@@ -0,0 +1 @@
+"""Package for test project."""
tests/fetchers/__init__.py
ADDED
@@ -0,0 +1 @@
+# Because I'm too lazy to mock requests :)
tests/fetchers/test_camoufox.py
ADDED
@@ -0,0 +1,62 @@
+import unittest
+import pytest_httpbin
+
+from scrapling import StealthyFetcher
+
+
+@pytest_httpbin.use_class_based_httpbin
+# @pytest_httpbin.use_class_based_httpbin_secure
+class TestStealthyFetcher(unittest.TestCase):
+    def setUp(self):
+        self.fetcher = StealthyFetcher(auto_match=False)
+        url = self.httpbin.url
+        self.status_200 = f'{url}/status/200'
+        self.status_404 = f'{url}/status/404'
+        self.status_501 = f'{url}/status/501'
+        self.basic_url = f'{url}/get'
+        self.html_url = f'{url}/html'
+        self.delayed_url = f'{url}/delay/10'  # 10 seconds delayed response
+        self.cookies_url = f"{url}/cookies/set/test/value"
+
+    def test_basic_fetch(self):
+        """Test doing a basic fetch request with multiple statuses"""
+        self.assertEqual(self.fetcher.fetch(self.status_200).status, 200)
+        self.assertEqual(self.fetcher.fetch(self.status_404).status, 404)
+        self.assertEqual(self.fetcher.fetch(self.status_501).status, 501)
+
+    def test_networkidle(self):
+        """Test whether waiting for `networkidle` stops the page from finishing loading"""
+        self.assertEqual(self.fetcher.fetch(self.basic_url, network_idle=True).status, 200)
+
+    def test_blocking_resources(self):
+        """Test whether blocking resources stops the page from finishing loading"""
+        self.assertEqual(self.fetcher.fetch(self.basic_url, block_images=True).status, 200)
+        self.assertEqual(self.fetcher.fetch(self.basic_url, disable_resources=True).status, 200)
+
+    def test_waiting_selector(self):
+        """Test whether waiting for a selector stops the page from finishing loading"""
+        self.assertEqual(self.fetcher.fetch(self.html_url, wait_selector='h1').status, 200)
+
+    def test_cookies_loading(self):
+        """Test that cookies are set after the request"""
+        self.assertEqual(self.fetcher.fetch(self.cookies_url).cookies, {'test': 'value'})
+
+    def test_automation(self):
+        """Test whether page automation breaks the fetch"""
+        def scroll_page(page):
+            page.mouse.wheel(10, 0)
+            page.mouse.move(100, 400)
+            page.mouse.up()
+            return page
+
+        self.assertEqual(self.fetcher.fetch(self.html_url, page_action=scroll_page).status, 200)
+
+    def test_properties(self):
+        """Test whether different argument combinations break the fetch"""
+        self.assertEqual(self.fetcher.fetch(self.html_url, block_webrtc=True, allow_webgl=True).status, 200)
+        self.assertEqual(self.fetcher.fetch(self.html_url, block_webrtc=False, allow_webgl=True).status, 200)
+        self.assertEqual(self.fetcher.fetch(self.html_url, block_webrtc=True, allow_webgl=False).status, 200)
+
+    def test_infinite_timeout(self):
+        """Test whether an infinite timeout breaks the fetch"""
+        self.assertEqual(self.fetcher.fetch(self.delayed_url, timeout=None).status, 200)
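Taken together, these tests sketch the StealthyFetcher (Camoufox) surface: `fetch()` returns a response whose `.status` and `.cookies` are directly inspectable. A minimal usage sketch, assuming only the options exercised above (the target URL is a placeholder):

from scrapling import StealthyFetcher

fetcher = StealthyFetcher(auto_match=False)
page = fetcher.fetch(
    'https://example.com',   # placeholder target
    network_idle=True,       # wait for the network to go idle
    block_images=True,       # don't load images
    wait_selector='h1',      # wait for this selector before returning
    timeout=None,            # None means no timeout at all
)
print(page.status, page.cookies)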
tests/fetchers/test_httpx.py
ADDED
@@ -0,0 +1,67 @@
+import unittest
+import pytest_httpbin
+
+from scrapling import Fetcher
+
+
+@pytest_httpbin.use_class_based_httpbin
+class TestFetcher(unittest.TestCase):
+    def setUp(self):
+        self.fetcher = Fetcher(auto_match=False)
+        url = self.httpbin.url
+        self.status_200 = f'{url}/status/200'
+        self.status_404 = f'{url}/status/404'
+        self.status_501 = f'{url}/status/501'
+        self.basic_url = f'{url}/get'
+        self.post_url = f'{url}/post'
+        self.put_url = f'{url}/put'
+        self.delete_url = f'{url}/delete'
+        self.html_url = f'{url}/html'
+
+    def test_basic_get(self):
+        """Test doing a basic GET request with multiple statuses"""
+        self.assertEqual(self.fetcher.get(self.status_200).status, 200)
+        self.assertEqual(self.fetcher.get(self.status_404).status, 404)
+        self.assertEqual(self.fetcher.get(self.status_501).status, 501)
+
+    def test_get_properties(self):
+        """Test whether different arguments break GET requests"""
+        self.assertEqual(self.fetcher.get(self.status_200, stealthy_headers=True).status, 200)
+        self.assertEqual(self.fetcher.get(self.status_200, follow_redirects=True).status, 200)
+        self.assertEqual(self.fetcher.get(self.status_200, timeout=None).status, 200)
+        self.assertEqual(
+            self.fetcher.get(self.status_200, stealthy_headers=True, follow_redirects=True, timeout=None).status,
+            200
+        )
+
+    def test_post_properties(self):
+        """Test whether different arguments break POST requests"""
+        self.assertEqual(self.fetcher.post(self.post_url, data={'key': 'value'}).status, 200)
+        self.assertEqual(self.fetcher.post(self.post_url, data={'key': 'value'}, stealthy_headers=True).status, 200)
+        self.assertEqual(self.fetcher.post(self.post_url, data={'key': 'value'}, follow_redirects=True).status, 200)
+        self.assertEqual(self.fetcher.post(self.post_url, data={'key': 'value'}, timeout=None).status, 200)
+        self.assertEqual(
+            self.fetcher.post(self.post_url, data={'key': 'value'}, stealthy_headers=True, follow_redirects=True, timeout=None).status,
+            200
+        )
+
+    def test_put_properties(self):
+        """Test whether different arguments break PUT requests"""
+        self.assertEqual(self.fetcher.put(self.put_url, data={'key': 'value'}).status, 200)
+        self.assertEqual(self.fetcher.put(self.put_url, data={'key': 'value'}, stealthy_headers=True).status, 200)
+        self.assertEqual(self.fetcher.put(self.put_url, data={'key': 'value'}, follow_redirects=True).status, 200)
+        self.assertEqual(self.fetcher.put(self.put_url, data={'key': 'value'}, timeout=None).status, 200)
+        self.assertEqual(
+            self.fetcher.put(self.put_url, data={'key': 'value'}, stealthy_headers=True, follow_redirects=True, timeout=None).status,
+            200
+        )
+
+    def test_delete_properties(self):
+        """Test whether different arguments break DELETE requests"""
+        self.assertEqual(self.fetcher.delete(self.delete_url, stealthy_headers=True).status, 200)
+        self.assertEqual(self.fetcher.delete(self.delete_url, follow_redirects=True).status, 200)
+        self.assertEqual(self.fetcher.delete(self.delete_url, timeout=None).status, 200)
+        self.assertEqual(
+            self.fetcher.delete(self.delete_url, stealthy_headers=True, follow_redirects=True, timeout=None).status,
+            200
+        )
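The suite above implies that Fetcher exposes one method per HTTP verb, all sharing the `stealthy_headers`, `follow_redirects`, and `timeout` keywords. A minimal sketch based only on those calls (URLs are placeholders):

from scrapling import Fetcher

fetcher = Fetcher(auto_match=False)

page = fetcher.get('https://example.com', stealthy_headers=True, follow_redirects=True)
print(page.status)

# POST/PUT also take a `data` payload; DELETE takes only the shared keywords (per these tests)
fetcher.post('https://example.com/api', data={'key': 'value'}, timeout=None)
fetcher.put('https://example.com/api', data={'key': 'value'})
fetcher.delete('https://example.com/api', stealthy_headers=True)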
tests/fetchers/test_playwright.py
ADDED
@@ -0,0 +1,74 @@
+import unittest
+import pytest_httpbin
+
+from scrapling import PlayWrightFetcher
+
+
+@pytest_httpbin.use_class_based_httpbin
+# @pytest_httpbin.use_class_based_httpbin_secure
+class TestPlayWrightFetcher(unittest.TestCase):
+    def setUp(self):
+        self.fetcher = PlayWrightFetcher(auto_match=False)
+        url = self.httpbin.url
+        self.status_200 = f'{url}/status/200'
+        self.status_404 = f'{url}/status/404'
+        self.status_501 = f'{url}/status/501'
+        self.basic_url = f'{url}/get'
+        self.html_url = f'{url}/html'
+        self.delayed_url = f'{url}/delay/10'  # 10 seconds delayed response
+        self.cookies_url = f"{url}/cookies/set/test/value"
+
+    def test_basic_fetch(self):
+        """Test doing a basic fetch request with multiple statuses"""
+        self.assertEqual(self.fetcher.fetch(self.status_200).status, 200)
+        self.assertEqual(self.fetcher.fetch(self.status_404).status, 404)
+        self.assertEqual(self.fetcher.fetch(self.status_501).status, 501)
+
+    def test_networkidle(self):
+        """Test whether waiting for `networkidle` stops the page from finishing loading"""
+        self.assertEqual(self.fetcher.fetch(self.basic_url, network_idle=True).status, 200)
+
+    def test_blocking_resources(self):
+        """Test whether blocking resources stops the page from finishing loading"""
+        self.assertEqual(self.fetcher.fetch(self.basic_url, disable_resources=True).status, 200)
+
+    def test_waiting_selector(self):
+        """Test whether waiting for a selector stops the page from finishing loading"""
+        self.assertEqual(self.fetcher.fetch(self.html_url, wait_selector='h1').status, 200)
+
+    def test_cookies_loading(self):
+        """Test that cookies are set after the request"""
+        self.assertEqual(self.fetcher.fetch(self.cookies_url).cookies, {'test': 'value'})
+
+    def test_automation(self):
+        """Test whether page automation breaks the fetch"""
+        def scroll_page(page):
+            page.mouse.wheel(10, 0)
+            page.mouse.move(100, 400)
+            page.mouse.up()
+            return page
+
+        self.assertEqual(self.fetcher.fetch(self.html_url, page_action=scroll_page).status, 200)
+
+    def test_properties(self):
+        """Test whether different argument combinations break the fetch"""
+        self.assertEqual(self.fetcher.fetch(self.html_url, disable_webgl=True, hide_canvas=False).status, 200)
+        self.assertEqual(self.fetcher.fetch(self.html_url, disable_webgl=False, hide_canvas=True).status, 200)
+        self.assertEqual(self.fetcher.fetch(self.html_url, stealth=True).status, 200)
+        self.assertEqual(self.fetcher.fetch(self.html_url, useragent='Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:131.0) Gecko/20100101 Firefox/131.0').status, 200)
+
+    def test_cdp_url(self):
+        """Test whether it tries to connect to the CDP URL or not"""
+        with self.assertRaises(ValueError):
+            _ = self.fetcher.fetch(self.html_url, cdp_url='blahblah')
+
+        with self.assertRaises(ValueError):
+            _ = self.fetcher.fetch(self.html_url, cdp_url='blahblah', nstbrowser_mode=True)
+
+        with self.assertRaises(Exception):
+            # There's no specific type for this error in Playwright; it's just `Error`
+            _ = self.fetcher.fetch(self.html_url, cdp_url='ws://blahblah')
+
+    def test_infinite_timeout(self):
+        """Test whether an infinite timeout breaks the fetch"""
+        self.assertEqual(self.fetcher.fetch(self.delayed_url, timeout=None).status, 200)
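As above, but for the Playwright engine: `page_action` receives the live Playwright page object, and `cdp_url` is validated before any connection is attempted (a non-WebSocket value raises ValueError). A minimal sketch using only options from these tests (the URL is a placeholder):

from scrapling import PlayWrightFetcher

fetcher = PlayWrightFetcher(auto_match=False)

def scroll_page(page):
    # Must return the page so fetching can continue
    page.mouse.wheel(10, 0)
    return page

page = fetcher.fetch(
    'https://example.com',   # placeholder target
    stealth=True,            # stealth patches, as in test_properties
    hide_canvas=True,
    disable_webgl=False,
    page_action=scroll_page,
)
print(page.status)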
tests/parser/__init__.py
ADDED
File without changes
tests/parser/test_automatch.py
ADDED
@@ -0,0 +1,56 @@
+import unittest
+
+from scrapling import Adaptor
+
+
+class TestParserAutoMatch(unittest.TestCase):
+
+    def test_element_relocation(self):
+        """Test relocating an element after a structure change"""
+        original_html = '''
+        <div class="container">
+            <section class="products">
+                <article class="product" id="p1">
+                    <h3>Product 1</h3>
+                    <p class="description">Description 1</p>
+                </article>
+                <article class="product" id="p2">
+                    <h3>Product 2</h3>
+                    <p class="description">Description 2</p>
+                </article>
+            </section>
+        </div>
+        '''
+        changed_html = '''
+        <div class="new-container">
+            <div class="product-wrapper">
+                <section class="products">
+                    <article class="product new-class" data-id="p1">
+                        <div class="product-info">
+                            <h3>Product 1</h3>
+                            <p class="new-description">Description 1</p>
+                        </div>
+                    </article>
+                    <article class="product new-class" data-id="p2">
+                        <div class="product-info">
+                            <h3>Product 2</h3>
+                            <p class="new-description">Description 2</p>
+                        </div>
+                    </article>
+                </section>
+            </div>
+        </div>
+        '''
+
+        old_page = Adaptor(original_html, url='example.com', auto_match=True, debug=True)
+        new_page = Adaptor(changed_html, url='example.com', auto_match=True, debug=True)
+
+        # 'p1' was used as an ID and now it's not, and all the path elements have changed
+        # Also testing auto-match against combined selectors at the same time
+        _ = old_page.css('#p1, #p2', auto_save=True)[0]
+        relocated = new_page.css('#p1', auto_match=True)
+
+        self.assertIsNotNone(relocated)
+        self.assertEqual(relocated[0].attrib['data-id'], 'p1')
+        self.assertTrue(relocated[0].has_class('new-class'))
+        self.assertEqual(relocated[0].css('.new-description')[0].text, 'Description 1')
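The workflow this test encodes, as a standalone sketch: selecting with `auto_save=True` stores the element's fingerprint under the page's URL, and a later `auto_match=True` selection relocates it even after the markup changes (the HTML snippets here are trimmed stand-ins for the ones above):

from scrapling import Adaptor

old_html = '<div><article class="product" id="p1"><h3>Product 1</h3></article></div>'
new_html = '<div><article class="product new-class" data-id="p1"><h3>Product 1</h3></article></div>'

# The shared `url` is what ties the saved fingerprint to the later lookup
old_page = Adaptor(old_html, url='example.com', auto_match=True)
new_page = Adaptor(new_html, url='example.com', auto_match=True)

_ = old_page.css('#p1', auto_save=True)           # save the element's fingerprint
relocated = new_page.css('#p1', auto_match=True)  # relocate it despite the changed markup
print(relocated[0].attrib['data-id'])             # 'p1'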
tests/parser/test_general.py
ADDED
@@ -0,0 +1,286 @@
+
+import pickle
+import unittest
+from scrapling import Adaptor
+from cssselect import SelectorError, SelectorSyntaxError
+
+
+class TestParser(unittest.TestCase):
+    def setUp(self):
+        self.html = '''
+        <html>
+        <head>
+            <title>Complex Web Page</title>
+            <style>
+                .hidden { display: none; }
+            </style>
+        </head>
+        <body>
+            <header>
+                <nav>
+                    <ul>
+                        <li><a href="#home">Home</a></li>
+                        <li><a href="#about">About</a></li>
+                        <li><a href="#contact">Contact</a></li>
+                    </ul>
+                </nav>
+            </header>
+            <main>
+                <section id="products" schema='{"jsonable": "data"}'>
+                    <h2>Products</h2>
+                    <div class="product-list">
+                        <article class="product" data-id="1">
+                            <h3>Product 1</h3>
+                            <p class="description">This is product 1</p>
+                            <span class="price">$10.99</span>
+                            <div class="hidden stock">In stock: 5</div>
+                        </article>
+                        <article class="product" data-id="2">
+                            <h3>Product 2</h3>
+                            <p class="description">This is product 2</p>
+                            <span class="price">$20.99</span>
+                            <div class="hidden stock">In stock: 3</div>
+                        </article>
+                        <article class="product" data-id="3">
+                            <h3>Product 3</h3>
+                            <p class="description">This is product 3</p>
+                            <span class="price">$15.99</span>
+                            <div class="hidden stock">Out of stock</div>
+                        </article>
+                    </div>
+                </section>
+                <section id="reviews">
+                    <h2>Customer Reviews</h2>
+                    <div class="review-list">
+                        <div class="review" data-rating="5">
+                            <p class="review-text">Great product!</p>
+                            <span class="reviewer">John Doe</span>
+                        </div>
+                        <div class="review" data-rating="4">
+                            <p class="review-text">Good value for money.</p>
+                            <span class="reviewer">Jane Smith</span>
+                        </div>
+                    </div>
+                </section>
+            </main>
+            <footer>
+                <p>© 2024 Our Company</p>
+            </footer>
+            <script id="page-data" type="application/json">
+                {"lastUpdated": "2024-09-22T10:30:00Z", "totalProducts": 3}
+            </script>
+        </body>
+        </html>
+        '''
+        self.page = Adaptor(self.html, auto_match=False, debug=False)
+
+    def test_css_selector(self):
+        """Test selecting elements with complex CSS selectors"""
+        elements = self.page.css('main #products .product-list article.product')
+        self.assertEqual(len(elements), 3)
+
+        in_stock_products = self.page.css(
+            'main #products .product-list article.product:not(:contains("Out of stock"))')
+        self.assertEqual(len(in_stock_products), 2)
+
+    def test_xpath_selector(self):
+        """Test selecting elements with complex XPath selectors"""
+        reviews = self.page.xpath(
+            '//section[@id="reviews"]//div[contains(@class, "review") and @data-rating >= 4]'
+        )
+        self.assertEqual(len(reviews), 2)
+
+        high_priced_products = self.page.xpath(
+            '//article[contains(@class, "product")]'
+            '[number(translate(substring-after(.//span[@class="price"], "$"), ",", "")) > 15]'
+        )
+        self.assertEqual(len(high_priced_products), 2)
+
+    def test_find_by_text(self):
+        """Test selecting elements by text matching"""
+        stock_info = self.page.find_by_regex(r'In stock: \d+', first_match=False)
+        self.assertEqual(len(stock_info), 2)
+
+        stock_info = self.page.find_by_regex(r'In stock: \d+', first_match=True, case_sensitive=True)
+        self.assertEqual(stock_info.text, 'In stock: 5')
+
+        stock_info = self.page.find_by_text(r'In stock:', partial=True, first_match=False)
+        self.assertEqual(len(stock_info), 2)
+
+        out_of_stock = self.page.find_by_text('Out of stock', partial=False, first_match=False)
+        self.assertEqual(len(out_of_stock), 1)
+
+    def test_find_similar_elements(self):
+        """Test finding elements similar to a given element"""
+        first_product = self.page.css_first('.product')
+        similar_products = first_product.find_similar()
+        self.assertEqual(len(similar_products), 2)
+
+        first_review = self.page.find('div', class_='review')
+        similar_high_rated_reviews = [
+            review
+            for review in first_review.find_similar()
+            if int(review.attrib.get('data-rating', 0)) >= 4
+        ]
+        self.assertEqual(len(similar_high_rated_reviews), 1)
+
+    def test_expected_errors(self):
+        """Test errors that should be raised"""
+        with self.assertRaises(ValueError):
+            _ = Adaptor(auto_match=False)
+
+        with self.assertRaises(TypeError):
+            _ = Adaptor(root="ayo", auto_match=False)
+
+        with self.assertRaises(TypeError):
+            _ = Adaptor(text=1, auto_match=False)
+
+        with self.assertRaises(TypeError):
+            _ = Adaptor(body=1, auto_match=False)
+
+        with self.assertRaises(ValueError):
+            _ = Adaptor(self.html, storage=object, auto_match=True)
+
+    def test_pickleable(self):
+        """Test that objects aren't pickleable"""
+        table = self.page.css('.product-list')[0]
+        with self.assertRaises(TypeError):  # Adaptors
+            pickle.dumps(table)
+
+        with self.assertRaises(TypeError):  # Adaptor
+            pickle.dumps(table[0])
+
+    def test_overridden(self):
+        """Test overridden functions"""
+        table = self.page.css('.product-list')[0]
+        self.assertTrue(issubclass(type(table.__str__()), str))
+        self.assertTrue(issubclass(type(table.__repr__()), str))
+        self.assertTrue(issubclass(type(table.attrib.__str__()), str))
+        self.assertTrue(issubclass(type(table.attrib.__repr__()), str))
+
+    def test_bad_selector(self):
+        """Test that the object can handle a bad selector"""
+        with self.assertRaises((SelectorError, SelectorSyntaxError,)):
+            self.page.css('4 ayo')
+
+        with self.assertRaises((SelectorError, SelectorSyntaxError,)):
+            self.page.xpath('4 ayo')
+
+    def test_selectors_generation(self):
+        """Try to create selectors for all elements in the page"""
+        def _traverse(element: Adaptor):
+            self.assertTrue(type(element.generate_css_selector) is str)
+            self.assertTrue(type(element.generate_xpath_selector) is str)
+            for branch in element.children:
+                _traverse(branch)
+
+        _traverse(self.page)
+
+    def test_getting_all_text(self):
+        """Test getting all text"""
+        self.assertNotEqual(self.page.get_all_text(), '')
+
+    def test_element_navigation(self):
+        """Test moving around the page from a selected element"""
+        table = self.page.css('.product-list')[0]
+
+        self.assertIsNot(table.path, [])
+        self.assertNotEqual(table.html_content, '')
+        self.assertNotEqual(table.prettify(), '')
+
+        parent = table.parent
+        self.assertEqual(parent.attrib['id'], 'products')
+
+        children = table.children
+        self.assertEqual(len(children), 3)
+
+        parent_siblings = parent.siblings
+        self.assertEqual(len(parent_siblings), 1)
+
+        child = table.find({'data-id': "1"})
+        next_element = child.next
+        self.assertEqual(next_element.attrib['data-id'], '2')
+
+        prev_element = next_element.previous
+        self.assertEqual(prev_element.tag, child.tag)
+
+        all_prices = self.page.css('.price')
+        products_with_prices = [
+            price.find_ancestor(lambda p: p.has_class('product'))
+            for price in all_prices
+        ]
+        self.assertEqual(len(products_with_prices), 3)
+
+    def test_empty_return(self):
+        """Test cases where functions shouldn't have results"""
+        test_html = """
+        <html>
+            <span id="a"><a></a><!--comment--></span>
+            <span id="b"><!--comment--><a></a></span>
+        </html>"""
+        soup = Adaptor(test_html, auto_match=False, keep_comments=False)
+        html_tag = soup.css('html')[0]
+        self.assertEqual(html_tag.path, [])
+        self.assertEqual(html_tag.siblings, [])
+        self.assertEqual(html_tag.parent, None)
+        self.assertEqual(html_tag.find_ancestor(lambda e: e), None)
+
+        self.assertEqual(soup.css('#a a')[0].next, None)
+        self.assertEqual(soup.css('#b a')[0].previous, None)
+
+    def test_text_to_json(self):
+        """Test converting text to JSON"""
+        script_content = self.page.css('#page-data::text')[0]
+        self.assertTrue(issubclass(type(script_content.sort()), str))
+        page_data = script_content.json()
+        self.assertEqual(page_data['totalProducts'], 3)
+        self.assertTrue('lastUpdated' in page_data)
+
+    def test_regex_on_text(self):
+        """Test running regex on selected text"""
+        element = self.page.css('[data-id="1"] .price')[0]
+        match = element.re_first(r'[\.\d]+')
+        self.assertEqual(match, '10.99')
+        match = element.text.re(r'(\d+)', replace_entities=False)
+        self.assertEqual(len(match), 2)
+
+    def test_attribute_operations(self):
+        """Test operations on element attributes"""
+        products = self.page.css('.product')
+        product_ids = [product.attrib['data-id'] for product in products]
+        self.assertEqual(product_ids, ['1', '2', '3'])
+        self.assertTrue('data-id' in products[0].attrib)
+
+        reviews = self.page.css('.review')
+        review_ratings = [int(review.attrib['data-rating']) for review in reviews]
+        self.assertEqual(sum(review_ratings) / len(review_ratings), 4.5)
+
+        key_value = list(products[0].attrib.search_values('1', partial=False))
+        self.assertEqual(list(key_value[0].keys()), ['data-id'])
+
+        key_value = list(products[0].attrib.search_values('1', partial=True))
+        self.assertEqual(list(key_value[0].keys()), ['data-id'])
+
+        attr_json = self.page.css_first('#products').attrib['schema'].json()
+        self.assertEqual(attr_json, {'jsonable': 'data'})
+        self.assertEqual(type(self.page.css('#products')[0].attrib.json_string), bytes)
+
+    def test_performance(self):
+        """Test parsing and selecting speed"""
+        import time
+        large_html = '<html><body>' + '<div class="item">' * 5000 + '</div>' * 5000 + '</body></html>'
+
+        start_time = time.time()
+        parsed = Adaptor(large_html, auto_match=False, debug=False)
+        elements = parsed.css('.item')
+        end_time = time.time()
+
+        self.assertEqual(len(elements), 5000)
+        # Converting 5000 elements to a class and doing operations on them will take time
+        # Based on my tests (100 runs, 1 loop each), Scrapling takes 10.4ms on average, given the extra work/features
+        self.assertLess(end_time - start_time, 0.1)
+
+
+# Use `coverage run -m unittest --verbose tests/test_parser_functions.py` instead for the coverage report
+# if __name__ == '__main__':
+#     unittest.main(verbosity=2)
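For quick reference, the selection APIs this suite exercises, condensed into one sketch (the HTML is a placeholder; everything else mirrors calls from the tests above):

from scrapling import Adaptor

page = Adaptor(
    '<html><body>'
    '<span class="price">$10.99</span>'
    '<span class="price">$15.99</span>'
    '</body></html>',
    auto_match=False,
)

prices = page.css('.price')                      # CSS selection -> list of elements
first = page.css_first('.price')                 # shortcut for the first match
by_xpath = page.xpath('//span[@class="price"]')  # XPath selection
by_text = page.find_by_text('$10.99', partial=False, first_match=True)
siblings = first.find_similar()                  # structurally similar elements
amount = first.re_first(r'[\.\d]+')              # regex over the element's text -> '10.99'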