scrapling 0.2.98__py3-none-any.whl → 0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +18 -31
- scrapling/cli.py +818 -20
- scrapling/core/_html_utils.py +348 -0
- scrapling/core/_types.py +34 -17
- scrapling/core/ai.py +611 -0
- scrapling/core/custom_types.py +183 -100
- scrapling/core/mixins.py +27 -19
- scrapling/core/shell.py +647 -0
- scrapling/core/{storage_adaptors.py → storage.py} +41 -33
- scrapling/core/translator.py +20 -26
- scrapling/core/utils.py +49 -54
- scrapling/engines/__init__.py +15 -6
- scrapling/engines/_browsers/__init__.py +2 -0
- scrapling/engines/_browsers/_camoufox.py +745 -0
- scrapling/engines/_browsers/_config_tools.py +130 -0
- scrapling/engines/_browsers/_controllers.py +630 -0
- scrapling/engines/_browsers/_page.py +93 -0
- scrapling/engines/_browsers/_validators.py +150 -0
- scrapling/engines/constants.py +101 -88
- scrapling/engines/static.py +667 -110
- scrapling/engines/toolbelt/__init__.py +20 -6
- scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +2 -1
- scrapling/engines/toolbelt/convertor.py +254 -0
- scrapling/engines/toolbelt/custom.py +205 -186
- scrapling/engines/toolbelt/fingerprints.py +32 -46
- scrapling/engines/toolbelt/navigation.py +68 -39
- scrapling/fetchers.py +255 -260
- scrapling/parser.py +781 -449
- scrapling-0.3.dist-info/METADATA +409 -0
- scrapling-0.3.dist-info/RECORD +41 -0
- {scrapling-0.2.98.dist-info → scrapling-0.3.dist-info}/WHEEL +1 -1
- {scrapling-0.2.98.dist-info → scrapling-0.3.dist-info}/top_level.txt +0 -1
- scrapling/defaults.py +0 -19
- scrapling/engines/camo.py +0 -299
- scrapling/engines/pw.py +0 -428
- scrapling/engines/toolbelt/bypasses/pdf_viewer.js +0 -5
- scrapling-0.2.98.dist-info/METADATA +0 -867
- scrapling-0.2.98.dist-info/RECORD +0 -49
- tests/__init__.py +0 -1
- tests/fetchers/__init__.py +0 -1
- tests/fetchers/async/__init__.py +0 -0
- tests/fetchers/async/test_camoufox.py +0 -95
- tests/fetchers/async/test_httpx.py +0 -83
- tests/fetchers/async/test_playwright.py +0 -99
- tests/fetchers/sync/__init__.py +0 -0
- tests/fetchers/sync/test_camoufox.py +0 -68
- tests/fetchers/sync/test_httpx.py +0 -82
- tests/fetchers/sync/test_playwright.py +0 -87
- tests/fetchers/test_utils.py +0 -97
- tests/parser/__init__.py +0 -0
- tests/parser/test_automatch.py +0 -111
- tests/parser/test_general.py +0 -330
- {scrapling-0.2.98.dist-info → scrapling-0.3.dist-info}/entry_points.txt +0 -0
- {scrapling-0.2.98.dist-info → scrapling-0.3.dist-info/licenses}/LICENSE +0 -0
@@ -1,49 +0,0 @@
|
|
1
|
-
scrapling/__init__.py,sha256=S-SWj9O2r0Tu8Z-mPxDJ-z3h5k-bBfhFOETaCY4A9dc,1510
|
2
|
-
scrapling/cli.py,sha256=7yTsMhVAqqS8Z27T5dFKrR9_X8vuFjBlwYgAF22W7T8,1292
|
3
|
-
scrapling/defaults.py,sha256=MAn2MMLBFvoe4i3u_qlp6YEvGUiCjNPPDux1cFCdpsU,866
|
4
|
-
scrapling/fetchers.py,sha256=xwVCjAg0VCXwhB2igSLQvb0D0bOPGfg5WNtxgE7m-W0,34987
|
5
|
-
scrapling/parser.py,sha256=1xS1UjCm1GVnKcVAtup9rSE1xuYPxXOgJe-8LJE5gUk,53956
|
6
|
-
scrapling/py.typed,sha256=frcCV1k9oG9oKj3dpUqdJg1PxRT2RSN_XKdLCPjaYaY,2
|
7
|
-
scrapling/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
8
|
-
scrapling/core/_types.py,sha256=dKVi_dUxdxNtTr7sj7ySkHXDfrsmjFTfpCQeO5tGuBY,670
|
9
|
-
scrapling/core/custom_types.py,sha256=EWGx5t5scHEB1SMsitzc8duskq-5f-Qaj40IWkNTRzM,12947
|
10
|
-
scrapling/core/mixins.py,sha256=sozbpaGL1_O_x3U-ABM5aYWpnxpCLfdbcA9SG3P7weY,3532
|
11
|
-
scrapling/core/storage_adaptors.py,sha256=gZbUpHtLOL7o_oZbES_o40r39zShxTeTM8YK6dXA5Zo,6214
|
12
|
-
scrapling/core/translator.py,sha256=3a2VX9KR-q-GzwT1OgGDv1UlzIkvBggkQXUdiMyL-4c,5277
|
13
|
-
scrapling/core/utils.py,sha256=KX88B3tV1-SgCAr69TUN3LfmsTDcLnEhYJiPuWd31yA,3704
|
14
|
-
scrapling/engines/__init__.py,sha256=zA7tzqcDXP0hllwmjVewNHWipIA4JSU9mRG4J-cud0c,267
|
15
|
-
scrapling/engines/camo.py,sha256=oYKA0l3EpOcQW2APRj5FEmslqtp9A8i_ZljqlKvIDeI,16129
|
16
|
-
scrapling/engines/constants.py,sha256=Gb_nXFoBB4ujJkd05SKkenMe1UDiRYQA3dkmA3DunLg,3723
|
17
|
-
scrapling/engines/pw.py,sha256=cZraIBWd9ulEGEdhETIGmpevi62CN9JGcUU1OIDdxkA,21369
|
18
|
-
scrapling/engines/static.py,sha256=EjdaR0beqWfEKKavT7vlBnozoayQaVpqeVtaOuzd384,9306
|
19
|
-
scrapling/engines/toolbelt/__init__.py,sha256=VQDdYm1zY9Apno6d8UrULk29vUjllZrQqD8mXL1E2Fc,402
|
20
|
-
scrapling/engines/toolbelt/custom.py,sha256=_-baGB8oOOHogbaddtGsq_K_01ccOjOkGA6tOKk28hM,12811
|
21
|
-
scrapling/engines/toolbelt/fingerprints.py,sha256=Zzoqq3p6X_8D7eTxACz3z96cBZWWK61iKOGo2sZUtlg,2924
|
22
|
-
scrapling/engines/toolbelt/navigation.py,sha256=fMjDgicqy2MoZZll2h5EvrrxkL6yNrC09v8isTpwAt0,4565
|
23
|
-
scrapling/engines/toolbelt/bypasses/navigator_plugins.js,sha256=tbnnk3nCXB6QEQnOhDlu3n-s7lnUTAkrUsjP6FDQIQg,2104
|
24
|
-
scrapling/engines/toolbelt/bypasses/notification_permission.js,sha256=poPM3o5WYgEX-EdiUfDCllpWfc3Umvw4jr2u6O6elus,237
|
25
|
-
scrapling/engines/toolbelt/bypasses/pdf_viewer.js,sha256=mKjjSuP1-BOGC_2WhRYHJo_LP7lTBi2KXmP_zsHO_tI,173
|
26
|
-
scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js,sha256=3RP1AE_XZRvpupeV_i-WSNVqRxyUy0qd8rQV8j_4j3U,221
|
27
|
-
scrapling/engines/toolbelt/bypasses/screen_props.js,sha256=fZEuHMQ1-fYuxxUMoQXUvVWYUkPUbblkfMfpiLvBY7w,599
|
28
|
-
scrapling/engines/toolbelt/bypasses/webdriver_fully.js,sha256=hdJw4clRAJQqIdq5gIFC_eC-x7C1i2ab01KV5ylmOBs,728
|
29
|
-
scrapling/engines/toolbelt/bypasses/window_chrome.js,sha256=D7hqzNGGDorh8JVlvm2YIv7Bk2CoVkG55MDIdyqhT1w,6808
|
30
|
-
tests/__init__.py,sha256=YHFB5ftzgLQVh6gbPfbYcY4yOS9DOBp5dBa6I-qtm8U,32
|
31
|
-
tests/fetchers/__init__.py,sha256=6H4NgARhyTcGGd3dNCKQJ8kUFdrAEMSScQL7Ga_vU3c,43
|
32
|
-
tests/fetchers/test_utils.py,sha256=ANFu-4FFhtyGFGIwJksUO2M2tTTcKU2M_t6F2aav8lM,4967
|
33
|
-
tests/fetchers/async/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
34
|
-
tests/fetchers/async/test_camoufox.py,sha256=BANJ0TVqEdsjkYlsyU-q_spfaMsqTLOBQU8LUDurL9I,3685
|
35
|
-
tests/fetchers/async/test_httpx.py,sha256=6WgsvqV1-rYTjZ9na5x-wt49C3Ur9D99HXBFbewO0gc,3888
|
36
|
-
tests/fetchers/async/test_playwright.py,sha256=rr_3vB9LWclbl7PBNMH2MNU6CsirvJAIx_LsI9mLil0,4106
|
37
|
-
tests/fetchers/sync/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
38
|
-
tests/fetchers/sync/test_camoufox.py,sha256=IcDXPAWSSJnYT6psDFKSbCeym5n7hCrMPYQEghaOX3A,3165
|
39
|
-
tests/fetchers/sync/test_httpx.py,sha256=xItYWjnDOIswKJzua2tDq8Oy43nTeFl0O1bci7lzGmg,3615
|
40
|
-
tests/fetchers/sync/test_playwright.py,sha256=MEyDRaMyxDIWupG7f_xz0f0jd9Cpbd5rXCPz6qUy8cs,3818
|
41
|
-
tests/parser/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
42
|
-
tests/parser/test_automatch.py,sha256=SxsNdExE8zz8AcPRQFBUjZ3Q_1-tPOd9dzVvMSZpOYQ,4908
|
43
|
-
tests/parser/test_general.py,sha256=dyfOsc8lleoY4AxcfDUBUaD1i95xecfYuTUhKBsYjwo,12100
|
44
|
-
scrapling-0.2.98.dist-info/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
|
45
|
-
scrapling-0.2.98.dist-info/METADATA,sha256=Un_ROxrGIvk_8w-ECQbwKAcJzYyx3MTWS1DHt9FRqdI,69718
|
46
|
-
scrapling-0.2.98.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
|
47
|
-
scrapling-0.2.98.dist-info/entry_points.txt,sha256=DHyt2Blxy0P5OE2HRcP95Wz9_xo2ERCDcNqrJjYS3o8,49
|
48
|
-
scrapling-0.2.98.dist-info/top_level.txt,sha256=ub7FkOEXeYmmYTUxd4pCrwXfBfAMIpZ1sCGmXCc14tI,16
|
49
|
-
scrapling-0.2.98.dist-info/RECORD,,
|
tests/__init__.py
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
"""Package for test project."""
|
tests/fetchers/__init__.py
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
# Because I'm too lazy to mock requests :)
|
tests/fetchers/async/__init__.py
DELETED
File without changes
|
@@ -1,95 +0,0 @@
|
|
1
|
-
import pytest
|
2
|
-
import pytest_httpbin
|
3
|
-
|
4
|
-
from scrapling import StealthyFetcher
|
5
|
-
|
6
|
-
|
7
|
-
@pytest_httpbin.use_class_based_httpbin
|
8
|
-
@pytest.mark.asyncio
|
9
|
-
class TestStealthyFetcher:
|
10
|
-
@pytest.fixture(scope="class")
|
11
|
-
def fetcher(self):
|
12
|
-
return StealthyFetcher(auto_match=False)
|
13
|
-
|
14
|
-
@pytest.fixture(scope="class")
|
15
|
-
def urls(self, httpbin):
|
16
|
-
url = httpbin.url
|
17
|
-
return {
|
18
|
-
'status_200': f'{url}/status/200',
|
19
|
-
'status_404': f'{url}/status/404',
|
20
|
-
'status_501': f'{url}/status/501',
|
21
|
-
'basic_url': f'{url}/get',
|
22
|
-
'html_url': f'{url}/html',
|
23
|
-
'delayed_url': f'{url}/delay/10', # 10 Seconds delay response
|
24
|
-
'cookies_url': f"{url}/cookies/set/test/value"
|
25
|
-
}
|
26
|
-
|
27
|
-
async def test_basic_fetch(self, fetcher, urls):
|
28
|
-
"""Test doing basic fetch request with multiple statuses"""
|
29
|
-
assert (await fetcher.async_fetch(urls['status_200'])).status == 200
|
30
|
-
assert (await fetcher.async_fetch(urls['status_404'])).status == 404
|
31
|
-
assert (await fetcher.async_fetch(urls['status_501'])).status == 501
|
32
|
-
|
33
|
-
async def test_networkidle(self, fetcher, urls):
|
34
|
-
"""Test if waiting for `networkidle` make page does not finish loading or not"""
|
35
|
-
assert (await fetcher.async_fetch(urls['basic_url'], network_idle=True)).status == 200
|
36
|
-
|
37
|
-
async def test_blocking_resources(self, fetcher, urls):
|
38
|
-
"""Test if blocking resources make page does not finish loading or not"""
|
39
|
-
assert (await fetcher.async_fetch(urls['basic_url'], block_images=True)).status == 200
|
40
|
-
assert (await fetcher.async_fetch(urls['basic_url'], disable_resources=True)).status == 200
|
41
|
-
|
42
|
-
async def test_waiting_selector(self, fetcher, urls):
|
43
|
-
"""Test if waiting for a selector make page does not finish loading or not"""
|
44
|
-
assert (await fetcher.async_fetch(urls['html_url'], wait_selector='h1')).status == 200
|
45
|
-
assert (await fetcher.async_fetch(
|
46
|
-
urls['html_url'],
|
47
|
-
wait_selector='h1',
|
48
|
-
wait_selector_state='visible'
|
49
|
-
)).status == 200
|
50
|
-
|
51
|
-
async def test_cookies_loading(self, fetcher, urls):
|
52
|
-
"""Test if cookies are set after the request"""
|
53
|
-
response = await fetcher.async_fetch(urls['cookies_url'])
|
54
|
-
assert response.cookies == {'test': 'value'}
|
55
|
-
|
56
|
-
async def test_automation(self, fetcher, urls):
|
57
|
-
"""Test if automation break the code or not"""
|
58
|
-
|
59
|
-
async def scroll_page(page):
|
60
|
-
await page.mouse.wheel(10, 0)
|
61
|
-
await page.mouse.move(100, 400)
|
62
|
-
await page.mouse.up()
|
63
|
-
return page
|
64
|
-
|
65
|
-
assert (await fetcher.async_fetch(urls['html_url'], page_action=scroll_page)).status == 200
|
66
|
-
|
67
|
-
async def test_properties(self, fetcher, urls):
|
68
|
-
"""Test if different arguments breaks the code or not"""
|
69
|
-
assert (await fetcher.async_fetch(
|
70
|
-
urls['html_url'],
|
71
|
-
block_webrtc=True,
|
72
|
-
allow_webgl=True
|
73
|
-
)).status == 200
|
74
|
-
|
75
|
-
assert (await fetcher.async_fetch(
|
76
|
-
urls['html_url'],
|
77
|
-
block_webrtc=False,
|
78
|
-
allow_webgl=True
|
79
|
-
)).status == 200
|
80
|
-
|
81
|
-
assert (await fetcher.async_fetch(
|
82
|
-
urls['html_url'],
|
83
|
-
block_webrtc=True,
|
84
|
-
allow_webgl=False
|
85
|
-
)).status == 200
|
86
|
-
|
87
|
-
assert (await fetcher.async_fetch(
|
88
|
-
urls['html_url'],
|
89
|
-
extra_headers={'ayo': ''},
|
90
|
-
os_randomize=True
|
91
|
-
)).status == 200
|
92
|
-
|
93
|
-
async def test_infinite_timeout(self, fetcher, urls):
|
94
|
-
"""Test if infinite timeout breaks the code or not"""
|
95
|
-
assert (await fetcher.async_fetch(urls['delayed_url'], timeout=None)).status == 200
|
@@ -1,83 +0,0 @@
|
|
1
|
-
import pytest
|
2
|
-
import pytest_httpbin
|
3
|
-
|
4
|
-
from scrapling.fetchers import AsyncFetcher
|
5
|
-
|
6
|
-
|
7
|
-
@pytest_httpbin.use_class_based_httpbin
|
8
|
-
@pytest.mark.asyncio
|
9
|
-
class TestAsyncFetcher:
|
10
|
-
@pytest.fixture(scope="class")
|
11
|
-
def fetcher(self):
|
12
|
-
return AsyncFetcher(auto_match=True)
|
13
|
-
|
14
|
-
@pytest.fixture(scope="class")
|
15
|
-
def urls(self, httpbin):
|
16
|
-
return {
|
17
|
-
'status_200': f'{httpbin.url}/status/200',
|
18
|
-
'status_404': f'{httpbin.url}/status/404',
|
19
|
-
'status_501': f'{httpbin.url}/status/501',
|
20
|
-
'basic_url': f'{httpbin.url}/get',
|
21
|
-
'post_url': f'{httpbin.url}/post',
|
22
|
-
'put_url': f'{httpbin.url}/put',
|
23
|
-
'delete_url': f'{httpbin.url}/delete',
|
24
|
-
'html_url': f'{httpbin.url}/html'
|
25
|
-
}
|
26
|
-
|
27
|
-
async def test_basic_get(self, fetcher, urls):
|
28
|
-
"""Test doing basic get request with multiple statuses"""
|
29
|
-
assert (await fetcher.get(urls['status_200'])).status == 200
|
30
|
-
assert (await fetcher.get(urls['status_404'])).status == 404
|
31
|
-
assert (await fetcher.get(urls['status_501'])).status == 501
|
32
|
-
|
33
|
-
async def test_get_properties(self, fetcher, urls):
|
34
|
-
"""Test if different arguments with GET request breaks the code or not"""
|
35
|
-
assert (await fetcher.get(urls['status_200'], stealthy_headers=True)).status == 200
|
36
|
-
assert (await fetcher.get(urls['status_200'], follow_redirects=True)).status == 200
|
37
|
-
assert (await fetcher.get(urls['status_200'], timeout=None)).status == 200
|
38
|
-
assert (await fetcher.get(
|
39
|
-
urls['status_200'],
|
40
|
-
stealthy_headers=True,
|
41
|
-
follow_redirects=True,
|
42
|
-
timeout=None
|
43
|
-
)).status == 200
|
44
|
-
|
45
|
-
async def test_post_properties(self, fetcher, urls):
|
46
|
-
"""Test if different arguments with POST request breaks the code or not"""
|
47
|
-
assert (await fetcher.post(urls['post_url'], data={'key': 'value'})).status == 200
|
48
|
-
assert (await fetcher.post(urls['post_url'], data={'key': 'value'}, stealthy_headers=True)).status == 200
|
49
|
-
assert (await fetcher.post(urls['post_url'], data={'key': 'value'}, follow_redirects=True)).status == 200
|
50
|
-
assert (await fetcher.post(urls['post_url'], data={'key': 'value'}, timeout=None)).status == 200
|
51
|
-
assert (await fetcher.post(
|
52
|
-
urls['post_url'],
|
53
|
-
data={'key': 'value'},
|
54
|
-
stealthy_headers=True,
|
55
|
-
follow_redirects=True,
|
56
|
-
timeout=None
|
57
|
-
)).status == 200
|
58
|
-
|
59
|
-
async def test_put_properties(self, fetcher, urls):
|
60
|
-
"""Test if different arguments with PUT request breaks the code or not"""
|
61
|
-
assert (await fetcher.put(urls['put_url'], data={'key': 'value'})).status in [200, 405]
|
62
|
-
assert (await fetcher.put(urls['put_url'], data={'key': 'value'}, stealthy_headers=True)).status in [200, 405]
|
63
|
-
assert (await fetcher.put(urls['put_url'], data={'key': 'value'}, follow_redirects=True)).status in [200, 405]
|
64
|
-
assert (await fetcher.put(urls['put_url'], data={'key': 'value'}, timeout=None)).status in [200, 405]
|
65
|
-
assert (await fetcher.put(
|
66
|
-
urls['put_url'],
|
67
|
-
data={'key': 'value'},
|
68
|
-
stealthy_headers=True,
|
69
|
-
follow_redirects=True,
|
70
|
-
timeout=None
|
71
|
-
)).status in [200, 405]
|
72
|
-
|
73
|
-
async def test_delete_properties(self, fetcher, urls):
|
74
|
-
"""Test if different arguments with DELETE request breaks the code or not"""
|
75
|
-
assert (await fetcher.delete(urls['delete_url'], stealthy_headers=True)).status == 200
|
76
|
-
assert (await fetcher.delete(urls['delete_url'], follow_redirects=True)).status == 200
|
77
|
-
assert (await fetcher.delete(urls['delete_url'], timeout=None)).status == 200
|
78
|
-
assert (await fetcher.delete(
|
79
|
-
urls['delete_url'],
|
80
|
-
stealthy_headers=True,
|
81
|
-
follow_redirects=True,
|
82
|
-
timeout=None
|
83
|
-
)).status == 200
|
@@ -1,99 +0,0 @@
|
|
1
|
-
import pytest
|
2
|
-
import pytest_httpbin
|
3
|
-
|
4
|
-
from scrapling import PlayWrightFetcher
|
5
|
-
|
6
|
-
|
7
|
-
@pytest_httpbin.use_class_based_httpbin
|
8
|
-
class TestPlayWrightFetcherAsync:
|
9
|
-
@pytest.fixture
|
10
|
-
def fetcher(self):
|
11
|
-
return PlayWrightFetcher(auto_match=False)
|
12
|
-
|
13
|
-
@pytest.fixture
|
14
|
-
def urls(self, httpbin):
|
15
|
-
return {
|
16
|
-
'status_200': f'{httpbin.url}/status/200',
|
17
|
-
'status_404': f'{httpbin.url}/status/404',
|
18
|
-
'status_501': f'{httpbin.url}/status/501',
|
19
|
-
'basic_url': f'{httpbin.url}/get',
|
20
|
-
'html_url': f'{httpbin.url}/html',
|
21
|
-
'delayed_url': f'{httpbin.url}/delay/10',
|
22
|
-
'cookies_url': f"{httpbin.url}/cookies/set/test/value"
|
23
|
-
}
|
24
|
-
|
25
|
-
@pytest.mark.asyncio
|
26
|
-
async def test_basic_fetch(self, fetcher, urls):
|
27
|
-
"""Test doing basic fetch request with multiple statuses"""
|
28
|
-
response = await fetcher.async_fetch(urls['status_200'])
|
29
|
-
assert response.status == 200
|
30
|
-
|
31
|
-
@pytest.mark.asyncio
|
32
|
-
async def test_networkidle(self, fetcher, urls):
|
33
|
-
"""Test if waiting for `networkidle` make page does not finish loading or not"""
|
34
|
-
response = await fetcher.async_fetch(urls['basic_url'], network_idle=True)
|
35
|
-
assert response.status == 200
|
36
|
-
|
37
|
-
@pytest.mark.asyncio
|
38
|
-
async def test_blocking_resources(self, fetcher, urls):
|
39
|
-
"""Test if blocking resources make page does not finish loading or not"""
|
40
|
-
response = await fetcher.async_fetch(urls['basic_url'], disable_resources=True)
|
41
|
-
assert response.status == 200
|
42
|
-
|
43
|
-
@pytest.mark.asyncio
|
44
|
-
async def test_waiting_selector(self, fetcher, urls):
|
45
|
-
"""Test if waiting for a selector make page does not finish loading or not"""
|
46
|
-
response1 = await fetcher.async_fetch(urls['html_url'], wait_selector='h1')
|
47
|
-
assert response1.status == 200
|
48
|
-
|
49
|
-
response2 = await fetcher.async_fetch(urls['html_url'], wait_selector='h1', wait_selector_state='visible')
|
50
|
-
assert response2.status == 200
|
51
|
-
|
52
|
-
@pytest.mark.asyncio
|
53
|
-
async def test_cookies_loading(self, fetcher, urls):
|
54
|
-
"""Test if cookies are set after the request"""
|
55
|
-
response = await fetcher.async_fetch(urls['cookies_url'])
|
56
|
-
assert response.cookies == {'test': 'value'}
|
57
|
-
|
58
|
-
@pytest.mark.asyncio
|
59
|
-
async def test_automation(self, fetcher, urls):
|
60
|
-
"""Test if automation break the code or not"""
|
61
|
-
async def scroll_page(page):
|
62
|
-
await page.mouse.wheel(10, 0)
|
63
|
-
await page.mouse.move(100, 400)
|
64
|
-
await page.mouse.up()
|
65
|
-
return page
|
66
|
-
|
67
|
-
response = await fetcher.async_fetch(urls['html_url'], page_action=scroll_page)
|
68
|
-
assert response.status == 200
|
69
|
-
|
70
|
-
@pytest.mark.parametrize("kwargs", [
|
71
|
-
{"disable_webgl": True, "hide_canvas": False},
|
72
|
-
{"disable_webgl": False, "hide_canvas": True},
|
73
|
-
# {"stealth": True}, # causes issues with Github Actions
|
74
|
-
{"useragent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:131.0) Gecko/20100101 Firefox/131.0'},
|
75
|
-
{"extra_headers": {'ayo': ''}}
|
76
|
-
])
|
77
|
-
@pytest.mark.asyncio
|
78
|
-
async def test_properties(self, fetcher, urls, kwargs):
|
79
|
-
"""Test if different arguments breaks the code or not"""
|
80
|
-
response = await fetcher.async_fetch(urls['html_url'], **kwargs)
|
81
|
-
assert response.status == 200
|
82
|
-
|
83
|
-
@pytest.mark.asyncio
|
84
|
-
async def test_cdp_url_invalid(self, fetcher, urls):
|
85
|
-
"""Test if invalid CDP URLs raise appropriate exceptions"""
|
86
|
-
with pytest.raises(ValueError):
|
87
|
-
await fetcher.async_fetch(urls['html_url'], cdp_url='blahblah')
|
88
|
-
|
89
|
-
with pytest.raises(ValueError):
|
90
|
-
await fetcher.async_fetch(urls['html_url'], cdp_url='blahblah', nstbrowser_mode=True)
|
91
|
-
|
92
|
-
with pytest.raises(Exception):
|
93
|
-
await fetcher.async_fetch(urls['html_url'], cdp_url='ws://blahblah')
|
94
|
-
|
95
|
-
@pytest.mark.asyncio
|
96
|
-
async def test_infinite_timeout(self, fetcher, urls):
|
97
|
-
"""Test if infinite timeout breaks the code or not"""
|
98
|
-
response = await fetcher.async_fetch(urls['delayed_url'], timeout=None)
|
99
|
-
assert response.status == 200
|
tests/fetchers/sync/__init__.py
DELETED
File without changes
|
@@ -1,68 +0,0 @@
|
|
1
|
-
import pytest
|
2
|
-
import pytest_httpbin
|
3
|
-
|
4
|
-
from scrapling import StealthyFetcher
|
5
|
-
|
6
|
-
|
7
|
-
@pytest_httpbin.use_class_based_httpbin
|
8
|
-
class TestStealthyFetcher:
|
9
|
-
@pytest.fixture(scope="class")
|
10
|
-
def fetcher(self):
|
11
|
-
"""Fixture to create a StealthyFetcher instance for the entire test class"""
|
12
|
-
return StealthyFetcher(auto_match=False)
|
13
|
-
|
14
|
-
@pytest.fixture(autouse=True)
|
15
|
-
def setup_urls(self, httpbin):
|
16
|
-
"""Fixture to set up URLs for testing"""
|
17
|
-
self.status_200 = f'{httpbin.url}/status/200'
|
18
|
-
self.status_404 = f'{httpbin.url}/status/404'
|
19
|
-
self.status_501 = f'{httpbin.url}/status/501'
|
20
|
-
self.basic_url = f'{httpbin.url}/get'
|
21
|
-
self.html_url = f'{httpbin.url}/html'
|
22
|
-
self.delayed_url = f'{httpbin.url}/delay/10' # 10 Seconds delay response
|
23
|
-
self.cookies_url = f"{httpbin.url}/cookies/set/test/value"
|
24
|
-
|
25
|
-
def test_basic_fetch(self, fetcher):
|
26
|
-
"""Test doing basic fetch request with multiple statuses"""
|
27
|
-
assert fetcher.fetch(self.status_200).status == 200
|
28
|
-
assert fetcher.fetch(self.status_404).status == 404
|
29
|
-
assert fetcher.fetch(self.status_501).status == 501
|
30
|
-
|
31
|
-
def test_networkidle(self, fetcher):
|
32
|
-
"""Test if waiting for `networkidle` make page does not finish loading or not"""
|
33
|
-
assert fetcher.fetch(self.basic_url, network_idle=True).status == 200
|
34
|
-
|
35
|
-
def test_blocking_resources(self, fetcher):
|
36
|
-
"""Test if blocking resources make page does not finish loading or not"""
|
37
|
-
assert fetcher.fetch(self.basic_url, block_images=True).status == 200
|
38
|
-
assert fetcher.fetch(self.basic_url, disable_resources=True).status == 200
|
39
|
-
|
40
|
-
def test_waiting_selector(self, fetcher):
|
41
|
-
"""Test if waiting for a selector make page does not finish loading or not"""
|
42
|
-
assert fetcher.fetch(self.html_url, wait_selector='h1').status == 200
|
43
|
-
assert fetcher.fetch(self.html_url, wait_selector='h1', wait_selector_state='visible').status == 200
|
44
|
-
|
45
|
-
def test_cookies_loading(self, fetcher):
|
46
|
-
"""Test if cookies are set after the request"""
|
47
|
-
assert fetcher.fetch(self.cookies_url).cookies == {'test': 'value'}
|
48
|
-
|
49
|
-
def test_automation(self, fetcher):
|
50
|
-
"""Test if automation break the code or not"""
|
51
|
-
def scroll_page(page):
|
52
|
-
page.mouse.wheel(10, 0)
|
53
|
-
page.mouse.move(100, 400)
|
54
|
-
page.mouse.up()
|
55
|
-
return page
|
56
|
-
|
57
|
-
assert fetcher.fetch(self.html_url, page_action=scroll_page).status == 200
|
58
|
-
|
59
|
-
def test_properties(self, fetcher):
|
60
|
-
"""Test if different arguments breaks the code or not"""
|
61
|
-
assert fetcher.fetch(self.html_url, block_webrtc=True, allow_webgl=True).status == 200
|
62
|
-
assert fetcher.fetch(self.html_url, block_webrtc=False, allow_webgl=True).status == 200
|
63
|
-
assert fetcher.fetch(self.html_url, block_webrtc=True, allow_webgl=False).status == 200
|
64
|
-
assert fetcher.fetch(self.html_url, extra_headers={'ayo': ''}, os_randomize=True).status == 200
|
65
|
-
|
66
|
-
def test_infinite_timeout(self, fetcher):
|
67
|
-
"""Test if infinite timeout breaks the code or not"""
|
68
|
-
assert fetcher.fetch(self.delayed_url, timeout=None).status == 200
|
@@ -1,82 +0,0 @@
|
|
1
|
-
import pytest
|
2
|
-
import pytest_httpbin
|
3
|
-
|
4
|
-
from scrapling import Fetcher
|
5
|
-
|
6
|
-
|
7
|
-
@pytest_httpbin.use_class_based_httpbin
|
8
|
-
class TestFetcher:
|
9
|
-
@pytest.fixture(scope="class")
|
10
|
-
def fetcher(self):
|
11
|
-
"""Fixture to create a Fetcher instance for the entire test class"""
|
12
|
-
return Fetcher(auto_match=False)
|
13
|
-
|
14
|
-
@pytest.fixture(autouse=True)
|
15
|
-
def setup_urls(self, httpbin):
|
16
|
-
"""Fixture to set up URLs for testing"""
|
17
|
-
self.status_200 = f'{httpbin.url}/status/200'
|
18
|
-
self.status_404 = f'{httpbin.url}/status/404'
|
19
|
-
self.status_501 = f'{httpbin.url}/status/501'
|
20
|
-
self.basic_url = f'{httpbin.url}/get'
|
21
|
-
self.post_url = f'{httpbin.url}/post'
|
22
|
-
self.put_url = f'{httpbin.url}/put'
|
23
|
-
self.delete_url = f'{httpbin.url}/delete'
|
24
|
-
self.html_url = f'{httpbin.url}/html'
|
25
|
-
|
26
|
-
def test_basic_get(self, fetcher):
|
27
|
-
"""Test doing basic get request with multiple statuses"""
|
28
|
-
assert fetcher.get(self.status_200).status == 200
|
29
|
-
assert fetcher.get(self.status_404).status == 404
|
30
|
-
assert fetcher.get(self.status_501).status == 501
|
31
|
-
|
32
|
-
def test_get_properties(self, fetcher):
|
33
|
-
"""Test if different arguments with GET request breaks the code or not"""
|
34
|
-
assert fetcher.get(self.status_200, stealthy_headers=True).status == 200
|
35
|
-
assert fetcher.get(self.status_200, follow_redirects=True).status == 200
|
36
|
-
assert fetcher.get(self.status_200, timeout=None).status == 200
|
37
|
-
assert fetcher.get(
|
38
|
-
self.status_200,
|
39
|
-
stealthy_headers=True,
|
40
|
-
follow_redirects=True,
|
41
|
-
timeout=None
|
42
|
-
).status == 200
|
43
|
-
|
44
|
-
def test_post_properties(self, fetcher):
|
45
|
-
"""Test if different arguments with POST request breaks the code or not"""
|
46
|
-
assert fetcher.post(self.post_url, data={'key': 'value'}).status == 200
|
47
|
-
assert fetcher.post(self.post_url, data={'key': 'value'}, stealthy_headers=True).status == 200
|
48
|
-
assert fetcher.post(self.post_url, data={'key': 'value'}, follow_redirects=True).status == 200
|
49
|
-
assert fetcher.post(self.post_url, data={'key': 'value'}, timeout=None).status == 200
|
50
|
-
assert fetcher.post(
|
51
|
-
self.post_url,
|
52
|
-
data={'key': 'value'},
|
53
|
-
stealthy_headers=True,
|
54
|
-
follow_redirects=True,
|
55
|
-
timeout=None
|
56
|
-
).status == 200
|
57
|
-
|
58
|
-
def test_put_properties(self, fetcher):
|
59
|
-
"""Test if different arguments with PUT request breaks the code or not"""
|
60
|
-
assert fetcher.put(self.put_url, data={'key': 'value'}).status == 200
|
61
|
-
assert fetcher.put(self.put_url, data={'key': 'value'}, stealthy_headers=True).status == 200
|
62
|
-
assert fetcher.put(self.put_url, data={'key': 'value'}, follow_redirects=True).status == 200
|
63
|
-
assert fetcher.put(self.put_url, data={'key': 'value'}, timeout=None).status == 200
|
64
|
-
assert fetcher.put(
|
65
|
-
self.put_url,
|
66
|
-
data={'key': 'value'},
|
67
|
-
stealthy_headers=True,
|
68
|
-
follow_redirects=True,
|
69
|
-
timeout=None
|
70
|
-
).status == 200
|
71
|
-
|
72
|
-
def test_delete_properties(self, fetcher):
|
73
|
-
"""Test if different arguments with DELETE request breaks the code or not"""
|
74
|
-
assert fetcher.delete(self.delete_url, stealthy_headers=True).status == 200
|
75
|
-
assert fetcher.delete(self.delete_url, follow_redirects=True).status == 200
|
76
|
-
assert fetcher.delete(self.delete_url, timeout=None).status == 200
|
77
|
-
assert fetcher.delete(
|
78
|
-
self.delete_url,
|
79
|
-
stealthy_headers=True,
|
80
|
-
follow_redirects=True,
|
81
|
-
timeout=None
|
82
|
-
).status == 200
|
@@ -1,87 +0,0 @@
|
|
1
|
-
import pytest
|
2
|
-
import pytest_httpbin
|
3
|
-
|
4
|
-
from scrapling import PlayWrightFetcher
|
5
|
-
|
6
|
-
|
7
|
-
@pytest_httpbin.use_class_based_httpbin
|
8
|
-
class TestPlayWrightFetcher:
|
9
|
-
|
10
|
-
@pytest.fixture(scope="class")
|
11
|
-
def fetcher(self):
|
12
|
-
"""Fixture to create a StealthyFetcher instance for the entire test class"""
|
13
|
-
return PlayWrightFetcher(auto_match=False)
|
14
|
-
|
15
|
-
@pytest.fixture(autouse=True)
|
16
|
-
def setup_urls(self, httpbin):
|
17
|
-
"""Fixture to set up URLs for testing"""
|
18
|
-
self.status_200 = f'{httpbin.url}/status/200'
|
19
|
-
self.status_404 = f'{httpbin.url}/status/404'
|
20
|
-
self.status_501 = f'{httpbin.url}/status/501'
|
21
|
-
self.basic_url = f'{httpbin.url}/get'
|
22
|
-
self.html_url = f'{httpbin.url}/html'
|
23
|
-
self.delayed_url = f'{httpbin.url}/delay/10' # 10 Seconds delay response
|
24
|
-
self.cookies_url = f"{httpbin.url}/cookies/set/test/value"
|
25
|
-
|
26
|
-
def test_basic_fetch(self, fetcher):
|
27
|
-
"""Test doing basic fetch request with multiple statuses"""
|
28
|
-
assert fetcher.fetch(self.status_200).status == 200
|
29
|
-
# There's a bug with playwright makes it crashes if a URL returns status code 4xx/5xx without body, let's disable this till they reply to my issue report
|
30
|
-
# assert fetcher.fetch(self.status_404).status == 404
|
31
|
-
# assert fetcher.fetch(self.status_501).status == 501
|
32
|
-
|
33
|
-
def test_networkidle(self, fetcher):
|
34
|
-
"""Test if waiting for `networkidle` make page does not finish loading or not"""
|
35
|
-
assert fetcher.fetch(self.basic_url, network_idle=True).status == 200
|
36
|
-
|
37
|
-
def test_blocking_resources(self, fetcher):
|
38
|
-
"""Test if blocking resources make page does not finish loading or not"""
|
39
|
-
assert fetcher.fetch(self.basic_url, disable_resources=True).status == 200
|
40
|
-
|
41
|
-
def test_waiting_selector(self, fetcher):
|
42
|
-
"""Test if waiting for a selector make page does not finish loading or not"""
|
43
|
-
assert fetcher.fetch(self.html_url, wait_selector='h1').status == 200
|
44
|
-
assert fetcher.fetch(self.html_url, wait_selector='h1', wait_selector_state='visible').status == 200
|
45
|
-
|
46
|
-
def test_cookies_loading(self, fetcher):
|
47
|
-
"""Test if cookies are set after the request"""
|
48
|
-
assert fetcher.fetch(self.cookies_url).cookies == {'test': 'value'}
|
49
|
-
|
50
|
-
def test_automation(self, fetcher):
|
51
|
-
"""Test if automation break the code or not"""
|
52
|
-
|
53
|
-
def scroll_page(page):
|
54
|
-
page.mouse.wheel(10, 0)
|
55
|
-
page.mouse.move(100, 400)
|
56
|
-
page.mouse.up()
|
57
|
-
return page
|
58
|
-
|
59
|
-
assert fetcher.fetch(self.html_url, page_action=scroll_page).status == 200
|
60
|
-
|
61
|
-
@pytest.mark.parametrize("kwargs", [
|
62
|
-
{"disable_webgl": True, "hide_canvas": False},
|
63
|
-
{"disable_webgl": False, "hide_canvas": True},
|
64
|
-
# {"stealth": True}, # causes issues with Github Actions
|
65
|
-
{"useragent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:131.0) Gecko/20100101 Firefox/131.0'},
|
66
|
-
{"extra_headers": {'ayo': ''}}
|
67
|
-
])
|
68
|
-
def test_properties(self, fetcher, kwargs):
|
69
|
-
"""Test if different arguments breaks the code or not"""
|
70
|
-
response = fetcher.fetch(self.html_url, **kwargs)
|
71
|
-
assert response.status == 200
|
72
|
-
|
73
|
-
def test_cdp_url_invalid(self, fetcher):
|
74
|
-
"""Test if invalid CDP URLs raise appropriate exceptions"""
|
75
|
-
with pytest.raises(ValueError):
|
76
|
-
fetcher.fetch(self.html_url, cdp_url='blahblah')
|
77
|
-
|
78
|
-
with pytest.raises(ValueError):
|
79
|
-
fetcher.fetch(self.html_url, cdp_url='blahblah', nstbrowser_mode=True)
|
80
|
-
|
81
|
-
with pytest.raises(Exception):
|
82
|
-
fetcher.fetch(self.html_url, cdp_url='ws://blahblah')
|
83
|
-
|
84
|
-
def test_infinite_timeout(self, fetcher, ):
|
85
|
-
"""Test if infinite timeout breaks the code or not"""
|
86
|
-
response = fetcher.fetch(self.delayed_url, timeout=None)
|
87
|
-
assert response.status == 200
|