scrapling 0.2.8__py3-none-any.whl → 0.2.9__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- scrapling/__init__.py +4 -4
- scrapling/core/custom_types.py +88 -6
- scrapling/core/storage_adaptors.py +5 -6
- scrapling/core/translator.py +2 -2
- scrapling/core/utils.py +29 -27
- scrapling/defaults.py +2 -1
- scrapling/engines/camo.py +89 -15
- scrapling/engines/constants.py +4 -4
- scrapling/engines/pw.py +158 -83
- scrapling/engines/static.py +91 -48
- scrapling/engines/toolbelt/__init__.py +3 -3
- scrapling/engines/toolbelt/custom.py +20 -22
- scrapling/engines/toolbelt/fingerprints.py +3 -3
- scrapling/engines/toolbelt/navigation.py +21 -8
- scrapling/fetchers.py +229 -14
- scrapling/parser.py +49 -21
- {scrapling-0.2.8.dist-info → scrapling-0.2.9.dist-info}/METADATA +32 -16
- scrapling-0.2.9.dist-info/RECORD +47 -0
- tests/fetchers/async/__init__.py +0 -0
- tests/fetchers/async/test_camoufox.py +95 -0
- tests/fetchers/async/test_httpx.py +83 -0
- tests/fetchers/async/test_playwright.py +99 -0
- tests/fetchers/sync/__init__.py +0 -0
- tests/fetchers/sync/test_camoufox.py +68 -0
- tests/fetchers/sync/test_httpx.py +82 -0
- tests/fetchers/sync/test_playwright.py +87 -0
- tests/fetchers/test_utils.py +90 -122
- tests/parser/test_automatch.py +64 -9
- tests/parser/test_general.py +260 -218
- scrapling-0.2.8.dist-info/RECORD +0 -42
- tests/fetchers/test_camoufox.py +0 -65
- tests/fetchers/test_httpx.py +0 -68
- tests/fetchers/test_playwright.py +0 -77
- {scrapling-0.2.8.dist-info → scrapling-0.2.9.dist-info}/LICENSE +0 -0
- {scrapling-0.2.8.dist-info → scrapling-0.2.9.dist-info}/WHEEL +0 -0
- {scrapling-0.2.8.dist-info → scrapling-0.2.9.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,99 @@
|
|
1
|
+
import pytest
|
2
|
+
import pytest_httpbin
|
3
|
+
|
4
|
+
from scrapling import PlayWrightFetcher
|
5
|
+
|
6
|
+
|
7
|
+
@pytest_httpbin.use_class_based_httpbin
|
8
|
+
class TestPlayWrightFetcherAsync:
|
9
|
+
@pytest.fixture
|
10
|
+
def fetcher(self):
|
11
|
+
return PlayWrightFetcher(auto_match=False)
|
12
|
+
|
13
|
+
@pytest.fixture
|
14
|
+
def urls(self, httpbin):
|
15
|
+
return {
|
16
|
+
'status_200': f'{httpbin.url}/status/200',
|
17
|
+
'status_404': f'{httpbin.url}/status/404',
|
18
|
+
'status_501': f'{httpbin.url}/status/501',
|
19
|
+
'basic_url': f'{httpbin.url}/get',
|
20
|
+
'html_url': f'{httpbin.url}/html',
|
21
|
+
'delayed_url': f'{httpbin.url}/delay/10',
|
22
|
+
'cookies_url': f"{httpbin.url}/cookies/set/test/value"
|
23
|
+
}
|
24
|
+
|
25
|
+
@pytest.mark.asyncio
|
26
|
+
async def test_basic_fetch(self, fetcher, urls):
|
27
|
+
"""Test doing basic fetch request with multiple statuses"""
|
28
|
+
response = await fetcher.async_fetch(urls['status_200'])
|
29
|
+
assert response.status == 200
|
30
|
+
|
31
|
+
@pytest.mark.asyncio
|
32
|
+
async def test_networkidle(self, fetcher, urls):
|
33
|
+
"""Test if waiting for `networkidle` make page does not finish loading or not"""
|
34
|
+
response = await fetcher.async_fetch(urls['basic_url'], network_idle=True)
|
35
|
+
assert response.status == 200
|
36
|
+
|
37
|
+
@pytest.mark.asyncio
|
38
|
+
async def test_blocking_resources(self, fetcher, urls):
|
39
|
+
"""Test if blocking resources make page does not finish loading or not"""
|
40
|
+
response = await fetcher.async_fetch(urls['basic_url'], disable_resources=True)
|
41
|
+
assert response.status == 200
|
42
|
+
|
43
|
+
@pytest.mark.asyncio
|
44
|
+
async def test_waiting_selector(self, fetcher, urls):
|
45
|
+
"""Test if waiting for a selector make page does not finish loading or not"""
|
46
|
+
response1 = await fetcher.async_fetch(urls['html_url'], wait_selector='h1')
|
47
|
+
assert response1.status == 200
|
48
|
+
|
49
|
+
response2 = await fetcher.async_fetch(urls['html_url'], wait_selector='h1', wait_selector_state='visible')
|
50
|
+
assert response2.status == 200
|
51
|
+
|
52
|
+
@pytest.mark.asyncio
|
53
|
+
async def test_cookies_loading(self, fetcher, urls):
|
54
|
+
"""Test if cookies are set after the request"""
|
55
|
+
response = await fetcher.async_fetch(urls['cookies_url'])
|
56
|
+
assert response.cookies == {'test': 'value'}
|
57
|
+
|
58
|
+
@pytest.mark.asyncio
|
59
|
+
async def test_automation(self, fetcher, urls):
|
60
|
+
"""Test if automation break the code or not"""
|
61
|
+
async def scroll_page(page):
|
62
|
+
await page.mouse.wheel(10, 0)
|
63
|
+
await page.mouse.move(100, 400)
|
64
|
+
await page.mouse.up()
|
65
|
+
return page
|
66
|
+
|
67
|
+
response = await fetcher.async_fetch(urls['html_url'], page_action=scroll_page)
|
68
|
+
assert response.status == 200
|
69
|
+
|
70
|
+
@pytest.mark.parametrize("kwargs", [
|
71
|
+
{"disable_webgl": True, "hide_canvas": False},
|
72
|
+
{"disable_webgl": False, "hide_canvas": True},
|
73
|
+
{"stealth": True},
|
74
|
+
{"useragent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:131.0) Gecko/20100101 Firefox/131.0'},
|
75
|
+
{"extra_headers": {'ayo': ''}}
|
76
|
+
])
|
77
|
+
@pytest.mark.asyncio
|
78
|
+
async def test_properties(self, fetcher, urls, kwargs):
|
79
|
+
"""Test if different arguments breaks the code or not"""
|
80
|
+
response = await fetcher.async_fetch(urls['html_url'], **kwargs)
|
81
|
+
assert response.status == 200
|
82
|
+
|
83
|
+
@pytest.mark.asyncio
|
84
|
+
async def test_cdp_url_invalid(self, fetcher, urls):
|
85
|
+
"""Test if invalid CDP URLs raise appropriate exceptions"""
|
86
|
+
with pytest.raises(ValueError):
|
87
|
+
await fetcher.async_fetch(urls['html_url'], cdp_url='blahblah')
|
88
|
+
|
89
|
+
with pytest.raises(ValueError):
|
90
|
+
await fetcher.async_fetch(urls['html_url'], cdp_url='blahblah', nstbrowser_mode=True)
|
91
|
+
|
92
|
+
with pytest.raises(Exception):
|
93
|
+
await fetcher.async_fetch(urls['html_url'], cdp_url='ws://blahblah')
|
94
|
+
|
95
|
+
@pytest.mark.asyncio
|
96
|
+
async def test_infinite_timeout(self, fetcher, urls):
|
97
|
+
"""Test if infinite timeout breaks the code or not"""
|
98
|
+
response = await fetcher.async_fetch(urls['delayed_url'], timeout=None)
|
99
|
+
assert response.status == 200
|
File without changes
|
@@ -0,0 +1,68 @@
|
|
1
|
+
import pytest
|
2
|
+
import pytest_httpbin
|
3
|
+
|
4
|
+
from scrapling import StealthyFetcher
|
5
|
+
|
6
|
+
|
7
|
+
@pytest_httpbin.use_class_based_httpbin
|
8
|
+
class TestStealthyFetcher:
|
9
|
+
@pytest.fixture(scope="class")
|
10
|
+
def fetcher(self):
|
11
|
+
"""Fixture to create a StealthyFetcher instance for the entire test class"""
|
12
|
+
return StealthyFetcher(auto_match=False)
|
13
|
+
|
14
|
+
@pytest.fixture(autouse=True)
|
15
|
+
def setup_urls(self, httpbin):
|
16
|
+
"""Fixture to set up URLs for testing"""
|
17
|
+
self.status_200 = f'{httpbin.url}/status/200'
|
18
|
+
self.status_404 = f'{httpbin.url}/status/404'
|
19
|
+
self.status_501 = f'{httpbin.url}/status/501'
|
20
|
+
self.basic_url = f'{httpbin.url}/get'
|
21
|
+
self.html_url = f'{httpbin.url}/html'
|
22
|
+
self.delayed_url = f'{httpbin.url}/delay/10' # 10 Seconds delay response
|
23
|
+
self.cookies_url = f"{httpbin.url}/cookies/set/test/value"
|
24
|
+
|
25
|
+
def test_basic_fetch(self, fetcher):
|
26
|
+
"""Test doing basic fetch request with multiple statuses"""
|
27
|
+
assert fetcher.fetch(self.status_200).status == 200
|
28
|
+
assert fetcher.fetch(self.status_404).status == 404
|
29
|
+
assert fetcher.fetch(self.status_501).status == 501
|
30
|
+
|
31
|
+
def test_networkidle(self, fetcher):
|
32
|
+
"""Test if waiting for `networkidle` make page does not finish loading or not"""
|
33
|
+
assert fetcher.fetch(self.basic_url, network_idle=True).status == 200
|
34
|
+
|
35
|
+
def test_blocking_resources(self, fetcher):
|
36
|
+
"""Test if blocking resources make page does not finish loading or not"""
|
37
|
+
assert fetcher.fetch(self.basic_url, block_images=True).status == 200
|
38
|
+
assert fetcher.fetch(self.basic_url, disable_resources=True).status == 200
|
39
|
+
|
40
|
+
def test_waiting_selector(self, fetcher):
|
41
|
+
"""Test if waiting for a selector make page does not finish loading or not"""
|
42
|
+
assert fetcher.fetch(self.html_url, wait_selector='h1').status == 200
|
43
|
+
assert fetcher.fetch(self.html_url, wait_selector='h1', wait_selector_state='visible').status == 200
|
44
|
+
|
45
|
+
def test_cookies_loading(self, fetcher):
|
46
|
+
"""Test if cookies are set after the request"""
|
47
|
+
assert fetcher.fetch(self.cookies_url).cookies == {'test': 'value'}
|
48
|
+
|
49
|
+
def test_automation(self, fetcher):
|
50
|
+
"""Test if automation break the code or not"""
|
51
|
+
def scroll_page(page):
|
52
|
+
page.mouse.wheel(10, 0)
|
53
|
+
page.mouse.move(100, 400)
|
54
|
+
page.mouse.up()
|
55
|
+
return page
|
56
|
+
|
57
|
+
assert fetcher.fetch(self.html_url, page_action=scroll_page).status == 200
|
58
|
+
|
59
|
+
def test_properties(self, fetcher):
|
60
|
+
"""Test if different arguments breaks the code or not"""
|
61
|
+
assert fetcher.fetch(self.html_url, block_webrtc=True, allow_webgl=True).status == 200
|
62
|
+
assert fetcher.fetch(self.html_url, block_webrtc=False, allow_webgl=True).status == 200
|
63
|
+
assert fetcher.fetch(self.html_url, block_webrtc=True, allow_webgl=False).status == 200
|
64
|
+
assert fetcher.fetch(self.html_url, extra_headers={'ayo': ''}, os_randomize=True).status == 200
|
65
|
+
|
66
|
+
def test_infinite_timeout(self, fetcher):
|
67
|
+
"""Test if infinite timeout breaks the code or not"""
|
68
|
+
assert fetcher.fetch(self.delayed_url, timeout=None).status == 200
|
@@ -0,0 +1,82 @@
|
|
1
|
+
import pytest
|
2
|
+
import pytest_httpbin
|
3
|
+
|
4
|
+
from scrapling import Fetcher
|
5
|
+
|
6
|
+
|
7
|
+
@pytest_httpbin.use_class_based_httpbin
|
8
|
+
class TestFetcher:
|
9
|
+
@pytest.fixture(scope="class")
|
10
|
+
def fetcher(self):
|
11
|
+
"""Fixture to create a Fetcher instance for the entire test class"""
|
12
|
+
return Fetcher(auto_match=False)
|
13
|
+
|
14
|
+
@pytest.fixture(autouse=True)
|
15
|
+
def setup_urls(self, httpbin):
|
16
|
+
"""Fixture to set up URLs for testing"""
|
17
|
+
self.status_200 = f'{httpbin.url}/status/200'
|
18
|
+
self.status_404 = f'{httpbin.url}/status/404'
|
19
|
+
self.status_501 = f'{httpbin.url}/status/501'
|
20
|
+
self.basic_url = f'{httpbin.url}/get'
|
21
|
+
self.post_url = f'{httpbin.url}/post'
|
22
|
+
self.put_url = f'{httpbin.url}/put'
|
23
|
+
self.delete_url = f'{httpbin.url}/delete'
|
24
|
+
self.html_url = f'{httpbin.url}/html'
|
25
|
+
|
26
|
+
def test_basic_get(self, fetcher):
|
27
|
+
"""Test doing basic get request with multiple statuses"""
|
28
|
+
assert fetcher.get(self.status_200).status == 200
|
29
|
+
assert fetcher.get(self.status_404).status == 404
|
30
|
+
assert fetcher.get(self.status_501).status == 501
|
31
|
+
|
32
|
+
def test_get_properties(self, fetcher):
|
33
|
+
"""Test if different arguments with GET request breaks the code or not"""
|
34
|
+
assert fetcher.get(self.status_200, stealthy_headers=True).status == 200
|
35
|
+
assert fetcher.get(self.status_200, follow_redirects=True).status == 200
|
36
|
+
assert fetcher.get(self.status_200, timeout=None).status == 200
|
37
|
+
assert fetcher.get(
|
38
|
+
self.status_200,
|
39
|
+
stealthy_headers=True,
|
40
|
+
follow_redirects=True,
|
41
|
+
timeout=None
|
42
|
+
).status == 200
|
43
|
+
|
44
|
+
def test_post_properties(self, fetcher):
|
45
|
+
"""Test if different arguments with POST request breaks the code or not"""
|
46
|
+
assert fetcher.post(self.post_url, data={'key': 'value'}).status == 200
|
47
|
+
assert fetcher.post(self.post_url, data={'key': 'value'}, stealthy_headers=True).status == 200
|
48
|
+
assert fetcher.post(self.post_url, data={'key': 'value'}, follow_redirects=True).status == 200
|
49
|
+
assert fetcher.post(self.post_url, data={'key': 'value'}, timeout=None).status == 200
|
50
|
+
assert fetcher.post(
|
51
|
+
self.post_url,
|
52
|
+
data={'key': 'value'},
|
53
|
+
stealthy_headers=True,
|
54
|
+
follow_redirects=True,
|
55
|
+
timeout=None
|
56
|
+
).status == 200
|
57
|
+
|
58
|
+
def test_put_properties(self, fetcher):
|
59
|
+
"""Test if different arguments with PUT request breaks the code or not"""
|
60
|
+
assert fetcher.put(self.put_url, data={'key': 'value'}).status == 200
|
61
|
+
assert fetcher.put(self.put_url, data={'key': 'value'}, stealthy_headers=True).status == 200
|
62
|
+
assert fetcher.put(self.put_url, data={'key': 'value'}, follow_redirects=True).status == 200
|
63
|
+
assert fetcher.put(self.put_url, data={'key': 'value'}, timeout=None).status == 200
|
64
|
+
assert fetcher.put(
|
65
|
+
self.put_url,
|
66
|
+
data={'key': 'value'},
|
67
|
+
stealthy_headers=True,
|
68
|
+
follow_redirects=True,
|
69
|
+
timeout=None
|
70
|
+
).status == 200
|
71
|
+
|
72
|
+
def test_delete_properties(self, fetcher):
|
73
|
+
"""Test if different arguments with DELETE request breaks the code or not"""
|
74
|
+
assert fetcher.delete(self.delete_url, stealthy_headers=True).status == 200
|
75
|
+
assert fetcher.delete(self.delete_url, follow_redirects=True).status == 200
|
76
|
+
assert fetcher.delete(self.delete_url, timeout=None).status == 200
|
77
|
+
assert fetcher.delete(
|
78
|
+
self.delete_url,
|
79
|
+
stealthy_headers=True,
|
80
|
+
follow_redirects=True,
|
81
|
+
timeout=None
|
82
|
+
).status == 200
|
@@ -0,0 +1,87 @@
|
|
1
|
+
import pytest
|
2
|
+
import pytest_httpbin
|
3
|
+
|
4
|
+
from scrapling import PlayWrightFetcher
|
5
|
+
|
6
|
+
|
7
|
+
@pytest_httpbin.use_class_based_httpbin
|
8
|
+
class TestPlayWrightFetcher:
|
9
|
+
|
10
|
+
@pytest.fixture(scope="class")
|
11
|
+
def fetcher(self):
|
12
|
+
"""Fixture to create a StealthyFetcher instance for the entire test class"""
|
13
|
+
return PlayWrightFetcher(auto_match=False)
|
14
|
+
|
15
|
+
@pytest.fixture(autouse=True)
|
16
|
+
def setup_urls(self, httpbin):
|
17
|
+
"""Fixture to set up URLs for testing"""
|
18
|
+
self.status_200 = f'{httpbin.url}/status/200'
|
19
|
+
self.status_404 = f'{httpbin.url}/status/404'
|
20
|
+
self.status_501 = f'{httpbin.url}/status/501'
|
21
|
+
self.basic_url = f'{httpbin.url}/get'
|
22
|
+
self.html_url = f'{httpbin.url}/html'
|
23
|
+
self.delayed_url = f'{httpbin.url}/delay/10' # 10 Seconds delay response
|
24
|
+
self.cookies_url = f"{httpbin.url}/cookies/set/test/value"
|
25
|
+
|
26
|
+
def test_basic_fetch(self, fetcher):
|
27
|
+
"""Test doing basic fetch request with multiple statuses"""
|
28
|
+
assert fetcher.fetch(self.status_200).status == 200
|
29
|
+
# There's a bug with playwright makes it crashes if a URL returns status code 4xx/5xx without body, let's disable this till they reply to my issue report
|
30
|
+
# assert fetcher.fetch(self.status_404).status == 404
|
31
|
+
# assert fetcher.fetch(self.status_501).status == 501
|
32
|
+
|
33
|
+
def test_networkidle(self, fetcher):
|
34
|
+
"""Test if waiting for `networkidle` make page does not finish loading or not"""
|
35
|
+
assert fetcher.fetch(self.basic_url, network_idle=True).status == 200
|
36
|
+
|
37
|
+
def test_blocking_resources(self, fetcher):
|
38
|
+
"""Test if blocking resources make page does not finish loading or not"""
|
39
|
+
assert fetcher.fetch(self.basic_url, disable_resources=True).status == 200
|
40
|
+
|
41
|
+
def test_waiting_selector(self, fetcher):
|
42
|
+
"""Test if waiting for a selector make page does not finish loading or not"""
|
43
|
+
assert fetcher.fetch(self.html_url, wait_selector='h1').status == 200
|
44
|
+
assert fetcher.fetch(self.html_url, wait_selector='h1', wait_selector_state='visible').status == 200
|
45
|
+
|
46
|
+
def test_cookies_loading(self, fetcher):
|
47
|
+
"""Test if cookies are set after the request"""
|
48
|
+
assert fetcher.fetch(self.cookies_url).cookies == {'test': 'value'}
|
49
|
+
|
50
|
+
def test_automation(self, fetcher):
|
51
|
+
"""Test if automation break the code or not"""
|
52
|
+
|
53
|
+
def scroll_page(page):
|
54
|
+
page.mouse.wheel(10, 0)
|
55
|
+
page.mouse.move(100, 400)
|
56
|
+
page.mouse.up()
|
57
|
+
return page
|
58
|
+
|
59
|
+
assert fetcher.fetch(self.html_url, page_action=scroll_page).status == 200
|
60
|
+
|
61
|
+
@pytest.mark.parametrize("kwargs", [
|
62
|
+
{"disable_webgl": True, "hide_canvas": False},
|
63
|
+
{"disable_webgl": False, "hide_canvas": True},
|
64
|
+
{"stealth": True},
|
65
|
+
{"useragent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:131.0) Gecko/20100101 Firefox/131.0'},
|
66
|
+
{"extra_headers": {'ayo': ''}}
|
67
|
+
])
|
68
|
+
def test_properties(self, fetcher, kwargs):
|
69
|
+
"""Test if different arguments breaks the code or not"""
|
70
|
+
response = fetcher.fetch(self.html_url, **kwargs)
|
71
|
+
assert response.status == 200
|
72
|
+
|
73
|
+
def test_cdp_url_invalid(self, fetcher):
|
74
|
+
"""Test if invalid CDP URLs raise appropriate exceptions"""
|
75
|
+
with pytest.raises(ValueError):
|
76
|
+
fetcher.fetch(self.html_url, cdp_url='blahblah')
|
77
|
+
|
78
|
+
with pytest.raises(ValueError):
|
79
|
+
fetcher.fetch(self.html_url, cdp_url='blahblah', nstbrowser_mode=True)
|
80
|
+
|
81
|
+
with pytest.raises(Exception):
|
82
|
+
fetcher.fetch(self.html_url, cdp_url='ws://blahblah')
|
83
|
+
|
84
|
+
def test_infinite_timeout(self, fetcher, ):
|
85
|
+
"""Test if infinite timeout breaks the code or not"""
|
86
|
+
response = fetcher.fetch(self.delayed_url, timeout=None)
|
87
|
+
assert response.status == 200
|
tests/fetchers/test_utils.py
CHANGED
@@ -1,129 +1,97 @@
|
|
1
|
-
import
|
1
|
+
import pytest
|
2
2
|
|
3
3
|
from scrapling.engines.toolbelt.custom import ResponseEncoding, StatusText
|
4
4
|
|
5
5
|
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
self.status_map = {
|
55
|
-
100: "Continue",
|
56
|
-
101: "Switching Protocols",
|
57
|
-
102: "Processing",
|
58
|
-
103: "Early Hints",
|
59
|
-
200: "OK",
|
60
|
-
201: "Created",
|
61
|
-
202: "Accepted",
|
62
|
-
203: "Non-Authoritative Information",
|
63
|
-
204: "No Content",
|
64
|
-
205: "Reset Content",
|
65
|
-
206: "Partial Content",
|
66
|
-
207: "Multi-Status",
|
67
|
-
208: "Already Reported",
|
68
|
-
226: "IM Used",
|
69
|
-
300: "Multiple Choices",
|
70
|
-
301: "Moved Permanently",
|
71
|
-
302: "Found",
|
72
|
-
303: "See Other",
|
73
|
-
304: "Not Modified",
|
74
|
-
305: "Use Proxy",
|
75
|
-
307: "Temporary Redirect",
|
76
|
-
308: "Permanent Redirect",
|
77
|
-
400: "Bad Request",
|
78
|
-
401: "Unauthorized",
|
79
|
-
402: "Payment Required",
|
80
|
-
403: "Forbidden",
|
81
|
-
404: "Not Found",
|
82
|
-
405: "Method Not Allowed",
|
83
|
-
406: "Not Acceptable",
|
84
|
-
407: "Proxy Authentication Required",
|
85
|
-
408: "Request Timeout",
|
86
|
-
409: "Conflict",
|
87
|
-
410: "Gone",
|
88
|
-
411: "Length Required",
|
89
|
-
412: "Precondition Failed",
|
90
|
-
413: "Payload Too Large",
|
91
|
-
414: "URI Too Long",
|
92
|
-
415: "Unsupported Media Type",
|
93
|
-
416: "Range Not Satisfiable",
|
94
|
-
417: "Expectation Failed",
|
95
|
-
418: "I'm a teapot",
|
96
|
-
421: "Misdirected Request",
|
97
|
-
422: "Unprocessable Entity",
|
98
|
-
423: "Locked",
|
99
|
-
424: "Failed Dependency",
|
100
|
-
425: "Too Early",
|
101
|
-
426: "Upgrade Required",
|
102
|
-
428: "Precondition Required",
|
103
|
-
429: "Too Many Requests",
|
104
|
-
431: "Request Header Fields Too Large",
|
105
|
-
451: "Unavailable For Legal Reasons",
|
106
|
-
500: "Internal Server Error",
|
107
|
-
501: "Not Implemented",
|
108
|
-
502: "Bad Gateway",
|
109
|
-
503: "Service Unavailable",
|
110
|
-
504: "Gateway Timeout",
|
111
|
-
505: "HTTP Version Not Supported",
|
112
|
-
506: "Variant Also Negotiates",
|
113
|
-
507: "Insufficient Storage",
|
114
|
-
508: "Loop Detected",
|
115
|
-
510: "Not Extended",
|
116
|
-
511: "Network Authentication Required"
|
117
|
-
}
|
6
|
+
@pytest.fixture
|
7
|
+
def content_type_map():
|
8
|
+
return {
|
9
|
+
# A map generated by ChatGPT for most possible `content_type` values and the expected outcome
|
10
|
+
'text/html; charset=UTF-8': 'UTF-8',
|
11
|
+
'text/html; charset=ISO-8859-1': 'ISO-8859-1',
|
12
|
+
'text/html': 'ISO-8859-1',
|
13
|
+
'application/json; charset=UTF-8': 'UTF-8',
|
14
|
+
'application/json': 'utf-8',
|
15
|
+
'text/json': 'utf-8',
|
16
|
+
'application/javascript; charset=UTF-8': 'UTF-8',
|
17
|
+
'application/javascript': 'utf-8',
|
18
|
+
'text/plain; charset=UTF-8': 'UTF-8',
|
19
|
+
'text/plain; charset=ISO-8859-1': 'ISO-8859-1',
|
20
|
+
'text/plain': 'ISO-8859-1',
|
21
|
+
'application/xhtml+xml; charset=UTF-8': 'UTF-8',
|
22
|
+
'application/xhtml+xml': 'utf-8',
|
23
|
+
'text/html; charset=windows-1252': 'windows-1252',
|
24
|
+
'application/json; charset=windows-1252': 'windows-1252',
|
25
|
+
'text/plain; charset=windows-1252': 'windows-1252',
|
26
|
+
'text/html; charset="UTF-8"': 'UTF-8',
|
27
|
+
'text/html; charset="ISO-8859-1"': 'ISO-8859-1',
|
28
|
+
'text/html; charset="windows-1252"': 'windows-1252',
|
29
|
+
'application/json; charset="UTF-8"': 'UTF-8',
|
30
|
+
'application/json; charset="ISO-8859-1"': 'ISO-8859-1',
|
31
|
+
'application/json; charset="windows-1252"': 'windows-1252',
|
32
|
+
'text/json; charset="UTF-8"': 'UTF-8',
|
33
|
+
'application/javascript; charset="UTF-8"': 'UTF-8',
|
34
|
+
'application/javascript; charset="ISO-8859-1"': 'ISO-8859-1',
|
35
|
+
'text/plain; charset="UTF-8"': 'UTF-8',
|
36
|
+
'text/plain; charset="ISO-8859-1"': 'ISO-8859-1',
|
37
|
+
'text/plain; charset="windows-1252"': 'windows-1252',
|
38
|
+
'application/xhtml+xml; charset="UTF-8"': 'UTF-8',
|
39
|
+
'application/xhtml+xml; charset="ISO-8859-1"': 'ISO-8859-1',
|
40
|
+
'application/xhtml+xml; charset="windows-1252"': 'windows-1252',
|
41
|
+
'text/html; charset="US-ASCII"': 'US-ASCII',
|
42
|
+
'application/json; charset="US-ASCII"': 'US-ASCII',
|
43
|
+
'text/plain; charset="US-ASCII"': 'US-ASCII',
|
44
|
+
'text/html; charset="Shift_JIS"': 'Shift_JIS',
|
45
|
+
'application/json; charset="Shift_JIS"': 'Shift_JIS',
|
46
|
+
'text/plain; charset="Shift_JIS"': 'Shift_JIS',
|
47
|
+
'application/xml; charset="UTF-8"': 'UTF-8',
|
48
|
+
'application/xml; charset="ISO-8859-1"': 'ISO-8859-1',
|
49
|
+
'application/xml': 'utf-8',
|
50
|
+
'text/xml; charset="UTF-8"': 'UTF-8',
|
51
|
+
'text/xml; charset="ISO-8859-1"': 'ISO-8859-1',
|
52
|
+
'text/xml': 'utf-8'
|
53
|
+
}
|
118
54
|
|
119
|
-
def test_parsing_content_type(self):
|
120
|
-
"""Test if parsing different types of content-type returns the expected result"""
|
121
|
-
for header_value, expected_encoding in self.content_type_map.items():
|
122
|
-
self.assertEqual(ResponseEncoding.get_value(header_value), expected_encoding)
|
123
55
|
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
56
|
+
@pytest.fixture
|
57
|
+
def status_map():
|
58
|
+
return {
|
59
|
+
100: "Continue", 101: "Switching Protocols", 102: "Processing", 103: "Early Hints",
|
60
|
+
200: "OK", 201: "Created", 202: "Accepted", 203: "Non-Authoritative Information",
|
61
|
+
204: "No Content", 205: "Reset Content", 206: "Partial Content", 207: "Multi-Status",
|
62
|
+
208: "Already Reported", 226: "IM Used", 300: "Multiple Choices",
|
63
|
+
301: "Moved Permanently", 302: "Found", 303: "See Other", 304: "Not Modified",
|
64
|
+
305: "Use Proxy", 307: "Temporary Redirect", 308: "Permanent Redirect",
|
65
|
+
400: "Bad Request", 401: "Unauthorized", 402: "Payment Required", 403: "Forbidden",
|
66
|
+
404: "Not Found", 405: "Method Not Allowed", 406: "Not Acceptable",
|
67
|
+
407: "Proxy Authentication Required", 408: "Request Timeout", 409: "Conflict",
|
68
|
+
410: "Gone", 411: "Length Required", 412: "Precondition Failed",
|
69
|
+
413: "Payload Too Large", 414: "URI Too Long", 415: "Unsupported Media Type",
|
70
|
+
416: "Range Not Satisfiable", 417: "Expectation Failed", 418: "I'm a teapot",
|
71
|
+
421: "Misdirected Request", 422: "Unprocessable Entity", 423: "Locked",
|
72
|
+
424: "Failed Dependency", 425: "Too Early", 426: "Upgrade Required",
|
73
|
+
428: "Precondition Required", 429: "Too Many Requests",
|
74
|
+
431: "Request Header Fields Too Large", 451: "Unavailable For Legal Reasons",
|
75
|
+
500: "Internal Server Error", 501: "Not Implemented", 502: "Bad Gateway",
|
76
|
+
503: "Service Unavailable", 504: "Gateway Timeout",
|
77
|
+
505: "HTTP Version Not Supported", 506: "Variant Also Negotiates",
|
78
|
+
507: "Insufficient Storage", 508: "Loop Detected", 510: "Not Extended",
|
79
|
+
511: "Network Authentication Required"
|
80
|
+
}
|
128
81
|
|
129
|
-
|
82
|
+
|
83
|
+
def test_parsing_content_type(content_type_map):
|
84
|
+
"""Test if parsing different types of content-type returns the expected result"""
|
85
|
+
for header_value, expected_encoding in content_type_map.items():
|
86
|
+
assert ResponseEncoding.get_value(header_value) == expected_encoding
|
87
|
+
|
88
|
+
|
89
|
+
def test_parsing_response_status(status_map):
|
90
|
+
"""Test if using different http responses' status codes returns the expected result"""
|
91
|
+
for status_code, expected_status_text in status_map.items():
|
92
|
+
assert StatusText.get(status_code) == expected_status_text
|
93
|
+
|
94
|
+
|
95
|
+
def test_unknown_status_code():
|
96
|
+
"""Test handling of an unknown status code"""
|
97
|
+
assert StatusText.get(1000) == "Unknown Status Code"
|
tests/parser/test_automatch.py
CHANGED
@@ -1,10 +1,11 @@
|
|
1
|
-
import
|
1
|
+
import asyncio
|
2
2
|
|
3
|
-
|
3
|
+
import pytest
|
4
4
|
|
5
|
+
from scrapling import Adaptor
|
5
6
|
|
6
|
-
class TestParserAutoMatch(unittest.TestCase):
|
7
7
|
|
8
|
+
class TestParserAutoMatch:
|
8
9
|
def test_element_relocation(self):
|
9
10
|
"""Test relocating element after structure change"""
|
10
11
|
original_html = '''
|
@@ -42,15 +43,69 @@ class TestParserAutoMatch(unittest.TestCase):
|
|
42
43
|
</div>
|
43
44
|
'''
|
44
45
|
|
45
|
-
old_page = Adaptor(original_html, url='example.com', auto_match=True
|
46
|
-
new_page = Adaptor(changed_html, url='example.com', auto_match=True
|
46
|
+
old_page = Adaptor(original_html, url='example.com', auto_match=True)
|
47
|
+
new_page = Adaptor(changed_html, url='example.com', auto_match=True)
|
48
|
+
|
49
|
+
# 'p1' was used as ID and now it's not and all the path elements have changes
|
50
|
+
# Also at the same time testing auto-match vs combined selectors
|
51
|
+
_ = old_page.css('#p1, #p2', auto_save=True)[0]
|
52
|
+
relocated = new_page.css('#p1', auto_match=True)
|
53
|
+
|
54
|
+
assert relocated is not None
|
55
|
+
assert relocated[0].attrib['data-id'] == 'p1'
|
56
|
+
assert relocated[0].has_class('new-class')
|
57
|
+
assert relocated[0].css('.new-description')[0].text == 'Description 1'
|
58
|
+
|
59
|
+
@pytest.mark.asyncio
|
60
|
+
async def test_element_relocation_async(self):
|
61
|
+
"""Test relocating element after structure change in async mode"""
|
62
|
+
original_html = '''
|
63
|
+
<div class="container">
|
64
|
+
<section class="products">
|
65
|
+
<article class="product" id="p1">
|
66
|
+
<h3>Product 1</h3>
|
67
|
+
<p class="description">Description 1</p>
|
68
|
+
</article>
|
69
|
+
<article class="product" id="p2">
|
70
|
+
<h3>Product 2</h3>
|
71
|
+
<p class="description">Description 2</p>
|
72
|
+
</article>
|
73
|
+
</section>
|
74
|
+
</div>
|
75
|
+
'''
|
76
|
+
changed_html = '''
|
77
|
+
<div class="new-container">
|
78
|
+
<div class="product-wrapper">
|
79
|
+
<section class="products">
|
80
|
+
<article class="product new-class" data-id="p1">
|
81
|
+
<div class="product-info">
|
82
|
+
<h3>Product 1</h3>
|
83
|
+
<p class="new-description">Description 1</p>
|
84
|
+
</div>
|
85
|
+
</article>
|
86
|
+
<article class="product new-class" data-id="p2">
|
87
|
+
<div class="product-info">
|
88
|
+
<h3>Product 2</h3>
|
89
|
+
<p class="new-description">Description 2</p>
|
90
|
+
</div>
|
91
|
+
</article>
|
92
|
+
</section>
|
93
|
+
</div>
|
94
|
+
</div>
|
95
|
+
'''
|
96
|
+
|
97
|
+
# Simulate async operation
|
98
|
+
await asyncio.sleep(0.1) # Minimal async operation
|
99
|
+
|
100
|
+
old_page = Adaptor(original_html, url='example.com', auto_match=True)
|
101
|
+
new_page = Adaptor(changed_html, url='example.com', auto_match=True)
|
47
102
|
|
48
103
|
# 'p1' was used as ID and now it's not and all the path elements have changes
|
49
104
|
# Also at the same time testing auto-match vs combined selectors
|
50
105
|
_ = old_page.css('#p1, #p2', auto_save=True)[0]
|
51
106
|
relocated = new_page.css('#p1', auto_match=True)
|
52
107
|
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
108
|
+
assert relocated is not None
|
109
|
+
assert relocated[0].attrib['data-id'] == 'p1'
|
110
|
+
assert relocated[0].has_class('new-class')
|
111
|
+
assert relocated[0].css('.new-description')[0].text == 'Description 1'
|