scrapling 0.2.8__py3-none-any.whl → 0.2.9__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (36) hide show
  1. scrapling/__init__.py +4 -4
  2. scrapling/core/custom_types.py +88 -6
  3. scrapling/core/storage_adaptors.py +5 -6
  4. scrapling/core/translator.py +2 -2
  5. scrapling/core/utils.py +29 -27
  6. scrapling/defaults.py +2 -1
  7. scrapling/engines/camo.py +89 -15
  8. scrapling/engines/constants.py +4 -4
  9. scrapling/engines/pw.py +158 -83
  10. scrapling/engines/static.py +91 -48
  11. scrapling/engines/toolbelt/__init__.py +3 -3
  12. scrapling/engines/toolbelt/custom.py +20 -22
  13. scrapling/engines/toolbelt/fingerprints.py +3 -3
  14. scrapling/engines/toolbelt/navigation.py +21 -8
  15. scrapling/fetchers.py +229 -14
  16. scrapling/parser.py +49 -21
  17. {scrapling-0.2.8.dist-info → scrapling-0.2.9.dist-info}/METADATA +32 -16
  18. scrapling-0.2.9.dist-info/RECORD +47 -0
  19. tests/fetchers/async/__init__.py +0 -0
  20. tests/fetchers/async/test_camoufox.py +95 -0
  21. tests/fetchers/async/test_httpx.py +83 -0
  22. tests/fetchers/async/test_playwright.py +99 -0
  23. tests/fetchers/sync/__init__.py +0 -0
  24. tests/fetchers/sync/test_camoufox.py +68 -0
  25. tests/fetchers/sync/test_httpx.py +82 -0
  26. tests/fetchers/sync/test_playwright.py +87 -0
  27. tests/fetchers/test_utils.py +90 -122
  28. tests/parser/test_automatch.py +64 -9
  29. tests/parser/test_general.py +260 -218
  30. scrapling-0.2.8.dist-info/RECORD +0 -42
  31. tests/fetchers/test_camoufox.py +0 -65
  32. tests/fetchers/test_httpx.py +0 -68
  33. tests/fetchers/test_playwright.py +0 -77
  34. {scrapling-0.2.8.dist-info → scrapling-0.2.9.dist-info}/LICENSE +0 -0
  35. {scrapling-0.2.8.dist-info → scrapling-0.2.9.dist-info}/WHEEL +0 -0
  36. {scrapling-0.2.8.dist-info → scrapling-0.2.9.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,99 @@
1
+ import pytest
2
+ import pytest_httpbin
3
+
4
+ from scrapling import PlayWrightFetcher
5
+
6
+
7
+ @pytest_httpbin.use_class_based_httpbin
8
+ class TestPlayWrightFetcherAsync:
9
+ @pytest.fixture
10
+ def fetcher(self):
11
+ return PlayWrightFetcher(auto_match=False)
12
+
13
+ @pytest.fixture
14
+ def urls(self, httpbin):
15
+ return {
16
+ 'status_200': f'{httpbin.url}/status/200',
17
+ 'status_404': f'{httpbin.url}/status/404',
18
+ 'status_501': f'{httpbin.url}/status/501',
19
+ 'basic_url': f'{httpbin.url}/get',
20
+ 'html_url': f'{httpbin.url}/html',
21
+ 'delayed_url': f'{httpbin.url}/delay/10',
22
+ 'cookies_url': f"{httpbin.url}/cookies/set/test/value"
23
+ }
24
+
25
+ @pytest.mark.asyncio
26
+ async def test_basic_fetch(self, fetcher, urls):
27
+ """Test doing basic fetch request with multiple statuses"""
28
+ response = await fetcher.async_fetch(urls['status_200'])
29
+ assert response.status == 200
30
+
31
+ @pytest.mark.asyncio
32
+ async def test_networkidle(self, fetcher, urls):
33
+ """Test whether waiting for `networkidle` prevents the page from finishing loading"""
34
+ response = await fetcher.async_fetch(urls['basic_url'], network_idle=True)
35
+ assert response.status == 200
36
+
37
+ @pytest.mark.asyncio
38
+ async def test_blocking_resources(self, fetcher, urls):
39
+ """Test whether blocking resources prevents the page from finishing loading"""
40
+ response = await fetcher.async_fetch(urls['basic_url'], disable_resources=True)
41
+ assert response.status == 200
42
+
43
+ @pytest.mark.asyncio
44
+ async def test_waiting_selector(self, fetcher, urls):
45
+ """Test whether waiting for a selector prevents the page from finishing loading"""
46
+ response1 = await fetcher.async_fetch(urls['html_url'], wait_selector='h1')
47
+ assert response1.status == 200
48
+
49
+ response2 = await fetcher.async_fetch(urls['html_url'], wait_selector='h1', wait_selector_state='visible')
50
+ assert response2.status == 200
51
+
52
+ @pytest.mark.asyncio
53
+ async def test_cookies_loading(self, fetcher, urls):
54
+ """Test if cookies are set after the request"""
55
+ response = await fetcher.async_fetch(urls['cookies_url'])
56
+ assert response.cookies == {'test': 'value'}
57
+
58
+ @pytest.mark.asyncio
59
+ async def test_automation(self, fetcher, urls):
60
+ """Test whether automation breaks the code"""
61
+ async def scroll_page(page):
62
+ await page.mouse.wheel(10, 0)
63
+ await page.mouse.move(100, 400)
64
+ await page.mouse.up()
65
+ return page
66
+
67
+ response = await fetcher.async_fetch(urls['html_url'], page_action=scroll_page)
68
+ assert response.status == 200
69
+
70
+ @pytest.mark.parametrize("kwargs", [
71
+ {"disable_webgl": True, "hide_canvas": False},
72
+ {"disable_webgl": False, "hide_canvas": True},
73
+ {"stealth": True},
74
+ {"useragent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:131.0) Gecko/20100101 Firefox/131.0'},
75
+ {"extra_headers": {'ayo': ''}}
76
+ ])
77
+ @pytest.mark.asyncio
78
+ async def test_properties(self, fetcher, urls, kwargs):
79
+ """Test whether different arguments break the code"""
80
+ response = await fetcher.async_fetch(urls['html_url'], **kwargs)
81
+ assert response.status == 200
82
+
83
+ @pytest.mark.asyncio
84
+ async def test_cdp_url_invalid(self, fetcher, urls):
85
+ """Test if invalid CDP URLs raise appropriate exceptions"""
86
+ with pytest.raises(ValueError):
87
+ await fetcher.async_fetch(urls['html_url'], cdp_url='blahblah')
88
+
89
+ with pytest.raises(ValueError):
90
+ await fetcher.async_fetch(urls['html_url'], cdp_url='blahblah', nstbrowser_mode=True)
91
+
92
+ with pytest.raises(Exception):
93
+ await fetcher.async_fetch(urls['html_url'], cdp_url='ws://blahblah')
94
+
95
+ @pytest.mark.asyncio
96
+ async def test_infinite_timeout(self, fetcher, urls):
97
+ """Test if infinite timeout breaks the code or not"""
98
+ response = await fetcher.async_fetch(urls['delayed_url'], timeout=None)
99
+ assert response.status == 200
File without changes
@@ -0,0 +1,68 @@
1
+ import pytest
2
+ import pytest_httpbin
3
+
4
+ from scrapling import StealthyFetcher
5
+
6
+
7
+ @pytest_httpbin.use_class_based_httpbin
8
+ class TestStealthyFetcher:
9
+ @pytest.fixture(scope="class")
10
+ def fetcher(self):
11
+ """Fixture to create a StealthyFetcher instance for the entire test class"""
12
+ return StealthyFetcher(auto_match=False)
13
+
14
+ @pytest.fixture(autouse=True)
15
+ def setup_urls(self, httpbin):
16
+ """Fixture to set up URLs for testing"""
17
+ self.status_200 = f'{httpbin.url}/status/200'
18
+ self.status_404 = f'{httpbin.url}/status/404'
19
+ self.status_501 = f'{httpbin.url}/status/501'
20
+ self.basic_url = f'{httpbin.url}/get'
21
+ self.html_url = f'{httpbin.url}/html'
22
+ self.delayed_url = f'{httpbin.url}/delay/10' # 10 Seconds delay response
23
+ self.cookies_url = f"{httpbin.url}/cookies/set/test/value"
24
+
25
+ def test_basic_fetch(self, fetcher):
26
+ """Test doing basic fetch request with multiple statuses"""
27
+ assert fetcher.fetch(self.status_200).status == 200
28
+ assert fetcher.fetch(self.status_404).status == 404
29
+ assert fetcher.fetch(self.status_501).status == 501
30
+
31
+ def test_networkidle(self, fetcher):
32
+ """Test whether waiting for `networkidle` prevents the page from finishing loading"""
33
+ assert fetcher.fetch(self.basic_url, network_idle=True).status == 200
34
+
35
+ def test_blocking_resources(self, fetcher):
36
+ """Test whether blocking resources prevents the page from finishing loading"""
37
+ assert fetcher.fetch(self.basic_url, block_images=True).status == 200
38
+ assert fetcher.fetch(self.basic_url, disable_resources=True).status == 200
39
+
40
+ def test_waiting_selector(self, fetcher):
41
+ """Test whether waiting for a selector prevents the page from finishing loading"""
42
+ assert fetcher.fetch(self.html_url, wait_selector='h1').status == 200
43
+ assert fetcher.fetch(self.html_url, wait_selector='h1', wait_selector_state='visible').status == 200
44
+
45
+ def test_cookies_loading(self, fetcher):
46
+ """Test if cookies are set after the request"""
47
+ assert fetcher.fetch(self.cookies_url).cookies == {'test': 'value'}
48
+
49
+ def test_automation(self, fetcher):
50
+ """Test whether automation breaks the code"""
51
+ def scroll_page(page):
52
+ page.mouse.wheel(10, 0)
53
+ page.mouse.move(100, 400)
54
+ page.mouse.up()
55
+ return page
56
+
57
+ assert fetcher.fetch(self.html_url, page_action=scroll_page).status == 200
58
+
59
+ def test_properties(self, fetcher):
60
+ """Test whether different arguments break the code"""
61
+ assert fetcher.fetch(self.html_url, block_webrtc=True, allow_webgl=True).status == 200
62
+ assert fetcher.fetch(self.html_url, block_webrtc=False, allow_webgl=True).status == 200
63
+ assert fetcher.fetch(self.html_url, block_webrtc=True, allow_webgl=False).status == 200
64
+ assert fetcher.fetch(self.html_url, extra_headers={'ayo': ''}, os_randomize=True).status == 200
65
+
66
+ def test_infinite_timeout(self, fetcher):
67
+ """Test if infinite timeout breaks the code or not"""
68
+ assert fetcher.fetch(self.delayed_url, timeout=None).status == 200
@@ -0,0 +1,82 @@
1
+ import pytest
2
+ import pytest_httpbin
3
+
4
+ from scrapling import Fetcher
5
+
6
+
7
+ @pytest_httpbin.use_class_based_httpbin
8
+ class TestFetcher:
9
+ @pytest.fixture(scope="class")
10
+ def fetcher(self):
11
+ """Fixture to create a Fetcher instance for the entire test class"""
12
+ return Fetcher(auto_match=False)
13
+
14
+ @pytest.fixture(autouse=True)
15
+ def setup_urls(self, httpbin):
16
+ """Fixture to set up URLs for testing"""
17
+ self.status_200 = f'{httpbin.url}/status/200'
18
+ self.status_404 = f'{httpbin.url}/status/404'
19
+ self.status_501 = f'{httpbin.url}/status/501'
20
+ self.basic_url = f'{httpbin.url}/get'
21
+ self.post_url = f'{httpbin.url}/post'
22
+ self.put_url = f'{httpbin.url}/put'
23
+ self.delete_url = f'{httpbin.url}/delete'
24
+ self.html_url = f'{httpbin.url}/html'
25
+
26
+ def test_basic_get(self, fetcher):
27
+ """Test doing basic get request with multiple statuses"""
28
+ assert fetcher.get(self.status_200).status == 200
29
+ assert fetcher.get(self.status_404).status == 404
30
+ assert fetcher.get(self.status_501).status == 501
31
+
32
+ def test_get_properties(self, fetcher):
33
+ """Test whether different arguments with a GET request break the code"""
34
+ assert fetcher.get(self.status_200, stealthy_headers=True).status == 200
35
+ assert fetcher.get(self.status_200, follow_redirects=True).status == 200
36
+ assert fetcher.get(self.status_200, timeout=None).status == 200
37
+ assert fetcher.get(
38
+ self.status_200,
39
+ stealthy_headers=True,
40
+ follow_redirects=True,
41
+ timeout=None
42
+ ).status == 200
43
+
44
+ def test_post_properties(self, fetcher):
45
+ """Test whether different arguments with a POST request break the code"""
46
+ assert fetcher.post(self.post_url, data={'key': 'value'}).status == 200
47
+ assert fetcher.post(self.post_url, data={'key': 'value'}, stealthy_headers=True).status == 200
48
+ assert fetcher.post(self.post_url, data={'key': 'value'}, follow_redirects=True).status == 200
49
+ assert fetcher.post(self.post_url, data={'key': 'value'}, timeout=None).status == 200
50
+ assert fetcher.post(
51
+ self.post_url,
52
+ data={'key': 'value'},
53
+ stealthy_headers=True,
54
+ follow_redirects=True,
55
+ timeout=None
56
+ ).status == 200
57
+
58
+ def test_put_properties(self, fetcher):
59
+ """Test whether different arguments with a PUT request break the code"""
60
+ assert fetcher.put(self.put_url, data={'key': 'value'}).status == 200
61
+ assert fetcher.put(self.put_url, data={'key': 'value'}, stealthy_headers=True).status == 200
62
+ assert fetcher.put(self.put_url, data={'key': 'value'}, follow_redirects=True).status == 200
63
+ assert fetcher.put(self.put_url, data={'key': 'value'}, timeout=None).status == 200
64
+ assert fetcher.put(
65
+ self.put_url,
66
+ data={'key': 'value'},
67
+ stealthy_headers=True,
68
+ follow_redirects=True,
69
+ timeout=None
70
+ ).status == 200
71
+
72
+ def test_delete_properties(self, fetcher):
73
+ """Test whether different arguments with a DELETE request break the code"""
74
+ assert fetcher.delete(self.delete_url, stealthy_headers=True).status == 200
75
+ assert fetcher.delete(self.delete_url, follow_redirects=True).status == 200
76
+ assert fetcher.delete(self.delete_url, timeout=None).status == 200
77
+ assert fetcher.delete(
78
+ self.delete_url,
79
+ stealthy_headers=True,
80
+ follow_redirects=True,
81
+ timeout=None
82
+ ).status == 200
@@ -0,0 +1,87 @@
1
+ import pytest
2
+ import pytest_httpbin
3
+
4
+ from scrapling import PlayWrightFetcher
5
+
6
+
7
+ @pytest_httpbin.use_class_based_httpbin
8
+ class TestPlayWrightFetcher:
9
+
10
+ @pytest.fixture(scope="class")
11
+ def fetcher(self):
12
+ """Fixture to create a PlayWrightFetcher instance for the entire test class"""
13
+ return PlayWrightFetcher(auto_match=False)
14
+
15
+ @pytest.fixture(autouse=True)
16
+ def setup_urls(self, httpbin):
17
+ """Fixture to set up URLs for testing"""
18
+ self.status_200 = f'{httpbin.url}/status/200'
19
+ self.status_404 = f'{httpbin.url}/status/404'
20
+ self.status_501 = f'{httpbin.url}/status/501'
21
+ self.basic_url = f'{httpbin.url}/get'
22
+ self.html_url = f'{httpbin.url}/html'
23
+ self.delayed_url = f'{httpbin.url}/delay/10' # 10 Seconds delay response
24
+ self.cookies_url = f"{httpbin.url}/cookies/set/test/value"
25
+
26
+ def test_basic_fetch(self, fetcher):
27
+ """Test doing basic fetch request with multiple statuses"""
28
+ assert fetcher.fetch(self.status_200).status == 200
29
+ # There's a bug in Playwright that makes it crash if a URL returns a 4xx/5xx status code without a body, so these checks are disabled until they reply to my issue report
30
+ # assert fetcher.fetch(self.status_404).status == 404
31
+ # assert fetcher.fetch(self.status_501).status == 501
32
+
33
+ def test_networkidle(self, fetcher):
34
+ """Test whether waiting for `networkidle` prevents the page from finishing loading"""
35
+ assert fetcher.fetch(self.basic_url, network_idle=True).status == 200
36
+
37
+ def test_blocking_resources(self, fetcher):
38
+ """Test whether blocking resources prevents the page from finishing loading"""
39
+ assert fetcher.fetch(self.basic_url, disable_resources=True).status == 200
40
+
41
+ def test_waiting_selector(self, fetcher):
42
+ """Test whether waiting for a selector prevents the page from finishing loading"""
43
+ assert fetcher.fetch(self.html_url, wait_selector='h1').status == 200
44
+ assert fetcher.fetch(self.html_url, wait_selector='h1', wait_selector_state='visible').status == 200
45
+
46
+ def test_cookies_loading(self, fetcher):
47
+ """Test if cookies are set after the request"""
48
+ assert fetcher.fetch(self.cookies_url).cookies == {'test': 'value'}
49
+
50
+ def test_automation(self, fetcher):
51
+ """Test whether automation breaks the code"""
52
+
53
+ def scroll_page(page):
54
+ page.mouse.wheel(10, 0)
55
+ page.mouse.move(100, 400)
56
+ page.mouse.up()
57
+ return page
58
+
59
+ assert fetcher.fetch(self.html_url, page_action=scroll_page).status == 200
60
+
61
+ @pytest.mark.parametrize("kwargs", [
62
+ {"disable_webgl": True, "hide_canvas": False},
63
+ {"disable_webgl": False, "hide_canvas": True},
64
+ {"stealth": True},
65
+ {"useragent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:131.0) Gecko/20100101 Firefox/131.0'},
66
+ {"extra_headers": {'ayo': ''}}
67
+ ])
68
+ def test_properties(self, fetcher, kwargs):
69
+ """Test whether different arguments break the code"""
70
+ response = fetcher.fetch(self.html_url, **kwargs)
71
+ assert response.status == 200
72
+
73
+ def test_cdp_url_invalid(self, fetcher):
74
+ """Test if invalid CDP URLs raise appropriate exceptions"""
75
+ with pytest.raises(ValueError):
76
+ fetcher.fetch(self.html_url, cdp_url='blahblah')
77
+
78
+ with pytest.raises(ValueError):
79
+ fetcher.fetch(self.html_url, cdp_url='blahblah', nstbrowser_mode=True)
80
+
81
+ with pytest.raises(Exception):
82
+ fetcher.fetch(self.html_url, cdp_url='ws://blahblah')
83
+
84
+ def test_infinite_timeout(self, fetcher, ):
85
+ """Test if infinite timeout breaks the code or not"""
86
+ response = fetcher.fetch(self.delayed_url, timeout=None)
87
+ assert response.status == 200
@@ -1,129 +1,97 @@
1
- import unittest
1
+ import pytest
2
2
 
3
3
  from scrapling.engines.toolbelt.custom import ResponseEncoding, StatusText
4
4
 
5
5
 
6
- class TestPlayWrightFetcher(unittest.TestCase):
7
- def setUp(self):
8
- self.content_type_map = {
9
- # A map generated by ChatGPT for most possible `content_type` values and the expected outcome
10
- 'text/html; charset=UTF-8': 'UTF-8',
11
- 'text/html; charset=ISO-8859-1': 'ISO-8859-1',
12
- 'text/html': 'ISO-8859-1',
13
- 'application/json; charset=UTF-8': 'UTF-8',
14
- 'application/json': 'utf-8',
15
- 'text/json': 'utf-8',
16
- 'application/javascript; charset=UTF-8': 'UTF-8',
17
- 'application/javascript': 'utf-8',
18
- 'text/plain; charset=UTF-8': 'UTF-8',
19
- 'text/plain; charset=ISO-8859-1': 'ISO-8859-1',
20
- 'text/plain': 'ISO-8859-1',
21
- 'application/xhtml+xml; charset=UTF-8': 'UTF-8',
22
- 'application/xhtml+xml': 'utf-8',
23
- 'text/html; charset=windows-1252': 'windows-1252',
24
- 'application/json; charset=windows-1252': 'windows-1252',
25
- 'text/plain; charset=windows-1252': 'windows-1252',
26
- 'text/html; charset="UTF-8"': 'UTF-8',
27
- 'text/html; charset="ISO-8859-1"': 'ISO-8859-1',
28
- 'text/html; charset="windows-1252"': 'windows-1252',
29
- 'application/json; charset="UTF-8"': 'UTF-8',
30
- 'application/json; charset="ISO-8859-1"': 'ISO-8859-1',
31
- 'application/json; charset="windows-1252"': 'windows-1252',
32
- 'text/json; charset="UTF-8"': 'UTF-8',
33
- 'application/javascript; charset="UTF-8"': 'UTF-8',
34
- 'application/javascript; charset="ISO-8859-1"': 'ISO-8859-1',
35
- 'text/plain; charset="UTF-8"': 'UTF-8',
36
- 'text/plain; charset="ISO-8859-1"': 'ISO-8859-1',
37
- 'text/plain; charset="windows-1252"': 'windows-1252',
38
- 'application/xhtml+xml; charset="UTF-8"': 'UTF-8',
39
- 'application/xhtml+xml; charset="ISO-8859-1"': 'ISO-8859-1',
40
- 'application/xhtml+xml; charset="windows-1252"': 'windows-1252',
41
- 'text/html; charset="US-ASCII"': 'US-ASCII',
42
- 'application/json; charset="US-ASCII"': 'US-ASCII',
43
- 'text/plain; charset="US-ASCII"': 'US-ASCII',
44
- 'text/html; charset="Shift_JIS"': 'Shift_JIS',
45
- 'application/json; charset="Shift_JIS"': 'Shift_JIS',
46
- 'text/plain; charset="Shift_JIS"': 'Shift_JIS',
47
- 'application/xml; charset="UTF-8"': 'UTF-8',
48
- 'application/xml; charset="ISO-8859-1"': 'ISO-8859-1',
49
- 'application/xml': 'utf-8',
50
- 'text/xml; charset="UTF-8"': 'UTF-8',
51
- 'text/xml; charset="ISO-8859-1"': 'ISO-8859-1',
52
- 'text/xml': 'utf-8'
53
- }
54
- self.status_map = {
55
- 100: "Continue",
56
- 101: "Switching Protocols",
57
- 102: "Processing",
58
- 103: "Early Hints",
59
- 200: "OK",
60
- 201: "Created",
61
- 202: "Accepted",
62
- 203: "Non-Authoritative Information",
63
- 204: "No Content",
64
- 205: "Reset Content",
65
- 206: "Partial Content",
66
- 207: "Multi-Status",
67
- 208: "Already Reported",
68
- 226: "IM Used",
69
- 300: "Multiple Choices",
70
- 301: "Moved Permanently",
71
- 302: "Found",
72
- 303: "See Other",
73
- 304: "Not Modified",
74
- 305: "Use Proxy",
75
- 307: "Temporary Redirect",
76
- 308: "Permanent Redirect",
77
- 400: "Bad Request",
78
- 401: "Unauthorized",
79
- 402: "Payment Required",
80
- 403: "Forbidden",
81
- 404: "Not Found",
82
- 405: "Method Not Allowed",
83
- 406: "Not Acceptable",
84
- 407: "Proxy Authentication Required",
85
- 408: "Request Timeout",
86
- 409: "Conflict",
87
- 410: "Gone",
88
- 411: "Length Required",
89
- 412: "Precondition Failed",
90
- 413: "Payload Too Large",
91
- 414: "URI Too Long",
92
- 415: "Unsupported Media Type",
93
- 416: "Range Not Satisfiable",
94
- 417: "Expectation Failed",
95
- 418: "I'm a teapot",
96
- 421: "Misdirected Request",
97
- 422: "Unprocessable Entity",
98
- 423: "Locked",
99
- 424: "Failed Dependency",
100
- 425: "Too Early",
101
- 426: "Upgrade Required",
102
- 428: "Precondition Required",
103
- 429: "Too Many Requests",
104
- 431: "Request Header Fields Too Large",
105
- 451: "Unavailable For Legal Reasons",
106
- 500: "Internal Server Error",
107
- 501: "Not Implemented",
108
- 502: "Bad Gateway",
109
- 503: "Service Unavailable",
110
- 504: "Gateway Timeout",
111
- 505: "HTTP Version Not Supported",
112
- 506: "Variant Also Negotiates",
113
- 507: "Insufficient Storage",
114
- 508: "Loop Detected",
115
- 510: "Not Extended",
116
- 511: "Network Authentication Required"
117
- }
6
+ @pytest.fixture
7
+ def content_type_map():
8
+ return {
9
+ # A map generated by ChatGPT for most possible `content_type` values and the expected outcome
10
+ 'text/html; charset=UTF-8': 'UTF-8',
11
+ 'text/html; charset=ISO-8859-1': 'ISO-8859-1',
12
+ 'text/html': 'ISO-8859-1',
13
+ 'application/json; charset=UTF-8': 'UTF-8',
14
+ 'application/json': 'utf-8',
15
+ 'text/json': 'utf-8',
16
+ 'application/javascript; charset=UTF-8': 'UTF-8',
17
+ 'application/javascript': 'utf-8',
18
+ 'text/plain; charset=UTF-8': 'UTF-8',
19
+ 'text/plain; charset=ISO-8859-1': 'ISO-8859-1',
20
+ 'text/plain': 'ISO-8859-1',
21
+ 'application/xhtml+xml; charset=UTF-8': 'UTF-8',
22
+ 'application/xhtml+xml': 'utf-8',
23
+ 'text/html; charset=windows-1252': 'windows-1252',
24
+ 'application/json; charset=windows-1252': 'windows-1252',
25
+ 'text/plain; charset=windows-1252': 'windows-1252',
26
+ 'text/html; charset="UTF-8"': 'UTF-8',
27
+ 'text/html; charset="ISO-8859-1"': 'ISO-8859-1',
28
+ 'text/html; charset="windows-1252"': 'windows-1252',
29
+ 'application/json; charset="UTF-8"': 'UTF-8',
30
+ 'application/json; charset="ISO-8859-1"': 'ISO-8859-1',
31
+ 'application/json; charset="windows-1252"': 'windows-1252',
32
+ 'text/json; charset="UTF-8"': 'UTF-8',
33
+ 'application/javascript; charset="UTF-8"': 'UTF-8',
34
+ 'application/javascript; charset="ISO-8859-1"': 'ISO-8859-1',
35
+ 'text/plain; charset="UTF-8"': 'UTF-8',
36
+ 'text/plain; charset="ISO-8859-1"': 'ISO-8859-1',
37
+ 'text/plain; charset="windows-1252"': 'windows-1252',
38
+ 'application/xhtml+xml; charset="UTF-8"': 'UTF-8',
39
+ 'application/xhtml+xml; charset="ISO-8859-1"': 'ISO-8859-1',
40
+ 'application/xhtml+xml; charset="windows-1252"': 'windows-1252',
41
+ 'text/html; charset="US-ASCII"': 'US-ASCII',
42
+ 'application/json; charset="US-ASCII"': 'US-ASCII',
43
+ 'text/plain; charset="US-ASCII"': 'US-ASCII',
44
+ 'text/html; charset="Shift_JIS"': 'Shift_JIS',
45
+ 'application/json; charset="Shift_JIS"': 'Shift_JIS',
46
+ 'text/plain; charset="Shift_JIS"': 'Shift_JIS',
47
+ 'application/xml; charset="UTF-8"': 'UTF-8',
48
+ 'application/xml; charset="ISO-8859-1"': 'ISO-8859-1',
49
+ 'application/xml': 'utf-8',
50
+ 'text/xml; charset="UTF-8"': 'UTF-8',
51
+ 'text/xml; charset="ISO-8859-1"': 'ISO-8859-1',
52
+ 'text/xml': 'utf-8'
53
+ }
118
54
 
119
- def test_parsing_content_type(self):
120
- """Test if parsing different types of content-type returns the expected result"""
121
- for header_value, expected_encoding in self.content_type_map.items():
122
- self.assertEqual(ResponseEncoding.get_value(header_value), expected_encoding)
123
55
 
124
- def test_parsing_response_status(self):
125
- """Test if using different http responses' status codes returns the expected result"""
126
- for status_code, expected_status_text in self.status_map.items():
127
- self.assertEqual(StatusText.get(status_code), expected_status_text)
56
+ @pytest.fixture
57
+ def status_map():
58
+ return {
59
+ 100: "Continue", 101: "Switching Protocols", 102: "Processing", 103: "Early Hints",
60
+ 200: "OK", 201: "Created", 202: "Accepted", 203: "Non-Authoritative Information",
61
+ 204: "No Content", 205: "Reset Content", 206: "Partial Content", 207: "Multi-Status",
62
+ 208: "Already Reported", 226: "IM Used", 300: "Multiple Choices",
63
+ 301: "Moved Permanently", 302: "Found", 303: "See Other", 304: "Not Modified",
64
+ 305: "Use Proxy", 307: "Temporary Redirect", 308: "Permanent Redirect",
65
+ 400: "Bad Request", 401: "Unauthorized", 402: "Payment Required", 403: "Forbidden",
66
+ 404: "Not Found", 405: "Method Not Allowed", 406: "Not Acceptable",
67
+ 407: "Proxy Authentication Required", 408: "Request Timeout", 409: "Conflict",
68
+ 410: "Gone", 411: "Length Required", 412: "Precondition Failed",
69
+ 413: "Payload Too Large", 414: "URI Too Long", 415: "Unsupported Media Type",
70
+ 416: "Range Not Satisfiable", 417: "Expectation Failed", 418: "I'm a teapot",
71
+ 421: "Misdirected Request", 422: "Unprocessable Entity", 423: "Locked",
72
+ 424: "Failed Dependency", 425: "Too Early", 426: "Upgrade Required",
73
+ 428: "Precondition Required", 429: "Too Many Requests",
74
+ 431: "Request Header Fields Too Large", 451: "Unavailable For Legal Reasons",
75
+ 500: "Internal Server Error", 501: "Not Implemented", 502: "Bad Gateway",
76
+ 503: "Service Unavailable", 504: "Gateway Timeout",
77
+ 505: "HTTP Version Not Supported", 506: "Variant Also Negotiates",
78
+ 507: "Insufficient Storage", 508: "Loop Detected", 510: "Not Extended",
79
+ 511: "Network Authentication Required"
80
+ }
128
81
 
129
- self.assertEqual(StatusText.get(1000), "Unknown Status Code")
82
+
83
+ def test_parsing_content_type(content_type_map):
84
+ """Test if parsing different types of content-type returns the expected result"""
85
+ for header_value, expected_encoding in content_type_map.items():
86
+ assert ResponseEncoding.get_value(header_value) == expected_encoding
87
+
88
+
89
+ def test_parsing_response_status(status_map):
90
+ """Test if using different http responses' status codes returns the expected result"""
91
+ for status_code, expected_status_text in status_map.items():
92
+ assert StatusText.get(status_code) == expected_status_text
93
+
94
+
95
+ def test_unknown_status_code():
96
+ """Test handling of an unknown status code"""
97
+ assert StatusText.get(1000) == "Unknown Status Code"
@@ -1,10 +1,11 @@
1
- import unittest
1
+ import asyncio
2
2
 
3
- from scrapling import Adaptor
3
+ import pytest
4
4
 
5
+ from scrapling import Adaptor
5
6
 
6
- class TestParserAutoMatch(unittest.TestCase):
7
7
 
8
+ class TestParserAutoMatch:
8
9
  def test_element_relocation(self):
9
10
  """Test relocating element after structure change"""
10
11
  original_html = '''
@@ -42,15 +43,69 @@ class TestParserAutoMatch(unittest.TestCase):
42
43
  </div>
43
44
  '''
44
45
 
45
- old_page = Adaptor(original_html, url='example.com', auto_match=True, debug=True)
46
- new_page = Adaptor(changed_html, url='example.com', auto_match=True, debug=True)
46
+ old_page = Adaptor(original_html, url='example.com', auto_match=True)
47
+ new_page = Adaptor(changed_html, url='example.com', auto_match=True)
48
+
49
+ # 'p1' was used as an ID and now it isn't, and all the path elements have changed
50
+ # Also at the same time testing auto-match vs combined selectors
51
+ _ = old_page.css('#p1, #p2', auto_save=True)[0]
52
+ relocated = new_page.css('#p1', auto_match=True)
53
+
54
+ assert relocated is not None
55
+ assert relocated[0].attrib['data-id'] == 'p1'
56
+ assert relocated[0].has_class('new-class')
57
+ assert relocated[0].css('.new-description')[0].text == 'Description 1'
58
+
59
+ @pytest.mark.asyncio
60
+ async def test_element_relocation_async(self):
61
+ """Test relocating element after structure change in async mode"""
62
+ original_html = '''
63
+ <div class="container">
64
+ <section class="products">
65
+ <article class="product" id="p1">
66
+ <h3>Product 1</h3>
67
+ <p class="description">Description 1</p>
68
+ </article>
69
+ <article class="product" id="p2">
70
+ <h3>Product 2</h3>
71
+ <p class="description">Description 2</p>
72
+ </article>
73
+ </section>
74
+ </div>
75
+ '''
76
+ changed_html = '''
77
+ <div class="new-container">
78
+ <div class="product-wrapper">
79
+ <section class="products">
80
+ <article class="product new-class" data-id="p1">
81
+ <div class="product-info">
82
+ <h3>Product 1</h3>
83
+ <p class="new-description">Description 1</p>
84
+ </div>
85
+ </article>
86
+ <article class="product new-class" data-id="p2">
87
+ <div class="product-info">
88
+ <h3>Product 2</h3>
89
+ <p class="new-description">Description 2</p>
90
+ </div>
91
+ </article>
92
+ </section>
93
+ </div>
94
+ </div>
95
+ '''
96
+
97
+ # Simulate async operation
98
+ await asyncio.sleep(0.1) # Minimal async operation
99
+
100
+ old_page = Adaptor(original_html, url='example.com', auto_match=True)
101
+ new_page = Adaptor(changed_html, url='example.com', auto_match=True)
47
102
 
48
103
  # 'p1' was used as an ID and now it isn't, and all the path elements have changed
49
104
  # Also at the same time testing auto-match vs combined selectors
50
105
  _ = old_page.css('#p1, #p2', auto_save=True)[0]
51
106
  relocated = new_page.css('#p1', auto_match=True)
52
107
 
53
- self.assertIsNotNone(relocated)
54
- self.assertEqual(relocated[0].attrib['data-id'], 'p1')
55
- self.assertTrue(relocated[0].has_class('new-class'))
56
- self.assertEqual(relocated[0].css('.new-description')[0].text, 'Description 1')
108
+ assert relocated is not None
109
+ assert relocated[0].attrib['data-id'] == 'p1'
110
+ assert relocated[0].has_class('new-class')
111
+ assert relocated[0].css('.new-description')[0].text == 'Description 1'