scrapling-0.2.7-py3-none-any.whl → scrapling-0.2.9-py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- scrapling/__init__.py +5 -4
- scrapling/core/_types.py +2 -3
- scrapling/core/custom_types.py +93 -11
- scrapling/core/storage_adaptors.py +9 -10
- scrapling/core/translator.py +6 -7
- scrapling/core/utils.py +35 -30
- scrapling/defaults.py +2 -1
- scrapling/engines/__init__.py +2 -2
- scrapling/engines/camo.py +96 -26
- scrapling/engines/constants.py +4 -4
- scrapling/engines/pw.py +166 -96
- scrapling/engines/static.py +94 -50
- scrapling/engines/toolbelt/__init__.py +6 -20
- scrapling/engines/toolbelt/custom.py +22 -23
- scrapling/engines/toolbelt/fingerprints.py +7 -7
- scrapling/engines/toolbelt/navigation.py +25 -12
- scrapling/fetchers.py +233 -17
- scrapling/parser.py +63 -28
- {scrapling-0.2.7.dist-info → scrapling-0.2.9.dist-info}/METADATA +41 -25
- scrapling-0.2.9.dist-info/RECORD +47 -0
- tests/fetchers/async/__init__.py +0 -0
- tests/fetchers/async/test_camoufox.py +95 -0
- tests/fetchers/async/test_httpx.py +83 -0
- tests/fetchers/async/test_playwright.py +99 -0
- tests/fetchers/sync/__init__.py +0 -0
- tests/fetchers/sync/test_camoufox.py +68 -0
- tests/fetchers/sync/test_httpx.py +82 -0
- tests/fetchers/sync/test_playwright.py +87 -0
- tests/fetchers/test_utils.py +90 -122
- tests/parser/test_automatch.py +64 -9
- tests/parser/test_general.py +263 -219
- scrapling-0.2.7.dist-info/RECORD +0 -42
- tests/fetchers/test_camoufox.py +0 -64
- tests/fetchers/test_httpx.py +0 -67
- tests/fetchers/test_playwright.py +0 -76
- {scrapling-0.2.7.dist-info → scrapling-0.2.9.dist-info}/LICENSE +0 -0
- {scrapling-0.2.7.dist-info → scrapling-0.2.9.dist-info}/WHEEL +0 -0
- {scrapling-0.2.7.dist-info → scrapling-0.2.9.dist-info}/top_level.txt +0 -0
scrapling-0.2.7.dist-info/RECORD
DELETED
@@ -1,42 +0,0 @@
|
|
1
|
-
scrapling/__init__.py,sha256=WjvhJ6xkiSHp7St2YJYYJIsiKL8WDYuAQ_qIsg03v-0,435
|
2
|
-
scrapling/defaults.py,sha256=blYDLiuI5DgDSLRWnUgpp21WtFOsv1BsCRCmPeg8Xc4,287
|
3
|
-
scrapling/fetchers.py,sha256=vjAsa-oleb7FfYsxqmEUVZGNxdo7LMVuiLuyjIGySQE,17417
|
4
|
-
scrapling/parser.py,sha256=d2n00uF5i7W5lf0afLNRdk17ZFcNyiF9EzXLRQGA0NM,54111
|
5
|
-
scrapling/py.typed,sha256=frcCV1k9oG9oKj3dpUqdJg1PxRT2RSN_XKdLCPjaYaY,2
|
6
|
-
scrapling/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
7
|
-
scrapling/core/_types.py,sha256=nD2ZY_fitLohx3MfDmqoKJ9ZShrnRhQ8-d1SU1zEGAY,552
|
8
|
-
scrapling/core/custom_types.py,sha256=ztE_tshJ8i5uKqqSbsN5S6MoIUSfX6SexlhRjAnkclk,8402
|
9
|
-
scrapling/core/mixins.py,sha256=sozbpaGL1_O_x3U-ABM5aYWpnxpCLfdbcA9SG3P7weY,3532
|
10
|
-
scrapling/core/storage_adaptors.py,sha256=Kbak0BOJX5e9I1PbUS_4sUJi2Wxw8Bv5XsaLHAu1l2Q,6218
|
11
|
-
scrapling/core/translator.py,sha256=R97lKGq1SDbx8S8Hg_w_5d4ePgukTHj_hRIKFzWiRuc,5229
|
12
|
-
scrapling/core/utils.py,sha256=fXdANUgRBbVbOerJ94fRY9vi7n5zsbm8t3G4qQ-F3ak,3792
|
13
|
-
scrapling/engines/__init__.py,sha256=zwMqcSdNGh-IX0d4zXazrgAeHrkqIN_v5Ia7RU1g8W0,267
|
14
|
-
scrapling/engines/camo.py,sha256=Lw_uZ5SMBy3T6MkCNOMPk1i51Lnpfd0M7HyAUJAzKIg,8284
|
15
|
-
scrapling/engines/constants.py,sha256=WTn-X4kFIDWjXTiqOT0tm4XT5pijcdohFyZ0Af2C5Xc,3723
|
16
|
-
scrapling/engines/pw.py,sha256=ZRmbFNQWzvxUHVrIUcKefyg6fDpBrN6erdatDpcLBaw,13762
|
17
|
-
scrapling/engines/static.py,sha256=ryVCIjTpVLNlCxSf_NYwDSdsoDbafnsGpkCoCROPhlI,8021
|
18
|
-
scrapling/engines/toolbelt/__init__.py,sha256=BbxfC0depVOV3i3BnBnyfjHtLcZrDbhz6c5rTRczZUc,383
|
19
|
-
scrapling/engines/toolbelt/custom.py,sha256=KopO0SVWzFoNB8LbFDQhtErm8KCid6nkQcGqRaItC6U,12752
|
20
|
-
scrapling/engines/toolbelt/fingerprints.py,sha256=T9HQejHzAnHsD5EIXvrYVC5siiG5q2gOOXVIIANmzMc,2917
|
21
|
-
scrapling/engines/toolbelt/navigation.py,sha256=Tde5_6Wv7lOeWXMzs9D6TRaxAbJ3b-zIX6-4HggZbCQ,4017
|
22
|
-
scrapling/engines/toolbelt/bypasses/navigator_plugins.js,sha256=tbnnk3nCXB6QEQnOhDlu3n-s7lnUTAkrUsjP6FDQIQg,2104
|
23
|
-
scrapling/engines/toolbelt/bypasses/notification_permission.js,sha256=poPM3o5WYgEX-EdiUfDCllpWfc3Umvw4jr2u6O6elus,237
|
24
|
-
scrapling/engines/toolbelt/bypasses/pdf_viewer.js,sha256=mKjjSuP1-BOGC_2WhRYHJo_LP7lTBi2KXmP_zsHO_tI,173
|
25
|
-
scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js,sha256=3RP1AE_XZRvpupeV_i-WSNVqRxyUy0qd8rQV8j_4j3U,221
|
26
|
-
scrapling/engines/toolbelt/bypasses/screen_props.js,sha256=fZEuHMQ1-fYuxxUMoQXUvVWYUkPUbblkfMfpiLvBY7w,599
|
27
|
-
scrapling/engines/toolbelt/bypasses/webdriver_fully.js,sha256=hdJw4clRAJQqIdq5gIFC_eC-x7C1i2ab01KV5ylmOBs,728
|
28
|
-
scrapling/engines/toolbelt/bypasses/window_chrome.js,sha256=D7hqzNGGDorh8JVlvm2YIv7Bk2CoVkG55MDIdyqhT1w,6808
|
29
|
-
tests/__init__.py,sha256=YHFB5ftzgLQVh6gbPfbYcY4yOS9DOBp5dBa6I-qtm8U,32
|
30
|
-
tests/fetchers/__init__.py,sha256=6H4NgARhyTcGGd3dNCKQJ8kUFdrAEMSScQL7Ga_vU3c,43
|
31
|
-
tests/fetchers/test_camoufox.py,sha256=53piGA5uuPvOx5BeUEA0bbizYihwHGxehnj5uqCr6Q0,3115
|
32
|
-
tests/fetchers/test_httpx.py,sha256=UivOItR3-l-bXp9E6TP5Tvn2OrCdgiVkWsti-f9xdpU,3507
|
33
|
-
tests/fetchers/test_playwright.py,sha256=7qwbIU2SwjiQEbaGPA_MBo6kAXM4IBmfvy5kUvKT11M,3701
|
34
|
-
tests/fetchers/test_utils.py,sha256=FPPJkBrqgYxdGeWwapH8Vj8zyfYVLiTE1qSLu8eBWik,5728
|
35
|
-
tests/parser/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
36
|
-
tests/parser/test_automatch.py,sha256=BeeYJi3cYCghbiZmi57z4bqcGPaoUA8GAm7MALBBkkk,2486
|
37
|
-
tests/parser/test_general.py,sha256=qaiVzpvqESfdXYFat6QrpnMkevPYgCzIcTZK5FwdC0s,11783
|
38
|
-
scrapling-0.2.7.dist-info/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
|
39
|
-
scrapling-0.2.7.dist-info/METADATA,sha256=kYARTFqiiLsL_cvnU03pf2I1E5N_NmJk25gbeLzSR4M,66607
|
40
|
-
scrapling-0.2.7.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
|
41
|
-
scrapling-0.2.7.dist-info/top_level.txt,sha256=ub7FkOEXeYmmYTUxd4pCrwXfBfAMIpZ1sCGmXCc14tI,16
|
42
|
-
scrapling-0.2.7.dist-info/RECORD,,
|
tests/fetchers/test_camoufox.py
DELETED
@@ -1,64 +0,0 @@
|
|
1
|
-
import unittest
|
2
|
-
import pytest_httpbin
|
3
|
-
|
4
|
-
from scrapling import StealthyFetcher
|
5
|
-
|
6
|
-
|
7
|
-
@pytest_httpbin.use_class_based_httpbin
|
8
|
-
# @pytest_httpbin.use_class_based_httpbin_secure
|
9
|
-
class TestStealthyFetcher(unittest.TestCase):
|
10
|
-
def setUp(self):
|
11
|
-
self.fetcher = StealthyFetcher(auto_match=False)
|
12
|
-
url = self.httpbin.url
|
13
|
-
self.status_200 = f'{url}/status/200'
|
14
|
-
self.status_404 = f'{url}/status/404'
|
15
|
-
self.status_501 = f'{url}/status/501'
|
16
|
-
self.basic_url = f'{url}/get'
|
17
|
-
self.html_url = f'{url}/html'
|
18
|
-
self.delayed_url = f'{url}/delay/10' # 10 Seconds delay response
|
19
|
-
self.cookies_url = f"{url}/cookies/set/test/value"
|
20
|
-
|
21
|
-
def test_basic_fetch(self):
|
22
|
-
"""Test doing basic fetch request with multiple statuses"""
|
23
|
-
self.assertEqual(self.fetcher.fetch(self.status_200).status, 200)
|
24
|
-
self.assertEqual(self.fetcher.fetch(self.status_404).status, 404)
|
25
|
-
self.assertEqual(self.fetcher.fetch(self.status_501).status, 501)
|
26
|
-
|
27
|
-
def test_networkidle(self):
|
28
|
-
"""Test if waiting for `networkidle` make page does not finish loading or not"""
|
29
|
-
self.assertEqual(self.fetcher.fetch(self.basic_url, network_idle=True).status, 200)
|
30
|
-
|
31
|
-
def test_blocking_resources(self):
|
32
|
-
"""Test if blocking resources make page does not finish loading or not"""
|
33
|
-
self.assertEqual(self.fetcher.fetch(self.basic_url, block_images=True).status, 200)
|
34
|
-
self.assertEqual(self.fetcher.fetch(self.basic_url, disable_resources=True).status, 200)
|
35
|
-
|
36
|
-
def test_waiting_selector(self):
|
37
|
-
"""Test if waiting for a selector make page does not finish loading or not"""
|
38
|
-
self.assertEqual(self.fetcher.fetch(self.html_url, wait_selector='h1').status, 200)
|
39
|
-
self.assertEqual(self.fetcher.fetch(self.html_url, wait_selector='h1', wait_selector_state='visible').status, 200)
|
40
|
-
|
41
|
-
def test_cookies_loading(self):
|
42
|
-
"""Test if cookies are set after the request"""
|
43
|
-
self.assertEqual(self.fetcher.fetch(self.cookies_url).cookies, {'test': 'value'})
|
44
|
-
|
45
|
-
def test_automation(self):
|
46
|
-
"""Test if automation break the code or not"""
|
47
|
-
def scroll_page(page):
|
48
|
-
page.mouse.wheel(10, 0)
|
49
|
-
page.mouse.move(100, 400)
|
50
|
-
page.mouse.up()
|
51
|
-
return page
|
52
|
-
|
53
|
-
self.assertEqual(self.fetcher.fetch(self.html_url, page_action=scroll_page).status, 200)
|
54
|
-
|
55
|
-
def test_properties(self):
|
56
|
-
"""Test if different arguments breaks the code or not"""
|
57
|
-
self.assertEqual(self.fetcher.fetch(self.html_url, block_webrtc=True, allow_webgl=True).status, 200)
|
58
|
-
self.assertEqual(self.fetcher.fetch(self.html_url, block_webrtc=False, allow_webgl=True).status, 200)
|
59
|
-
self.assertEqual(self.fetcher.fetch(self.html_url, block_webrtc=True, allow_webgl=False).status, 200)
|
60
|
-
self.assertEqual(self.fetcher.fetch(self.html_url, extra_headers={'ayo': ''}, os_randomize=True).status, 200)
|
61
|
-
|
62
|
-
def test_infinite_timeout(self):
|
63
|
-
"""Test if infinite timeout breaks the code or not"""
|
64
|
-
self.assertEqual(self.fetcher.fetch(self.delayed_url, timeout=None).status, 200)
|
tests/fetchers/test_httpx.py
DELETED
@@ -1,67 +0,0 @@
|
|
1
|
-
import unittest
|
2
|
-
import pytest_httpbin
|
3
|
-
|
4
|
-
from scrapling import Fetcher
|
5
|
-
|
6
|
-
|
7
|
-
@pytest_httpbin.use_class_based_httpbin
|
8
|
-
class TestFetcher(unittest.TestCase):
|
9
|
-
def setUp(self):
|
10
|
-
self.fetcher = Fetcher(auto_match=False)
|
11
|
-
url = self.httpbin.url
|
12
|
-
self.status_200 = f'{url}/status/200'
|
13
|
-
self.status_404 = f'{url}/status/404'
|
14
|
-
self.status_501 = f'{url}/status/501'
|
15
|
-
self.basic_url = f'{url}/get'
|
16
|
-
self.post_url = f'{url}/post'
|
17
|
-
self.put_url = f'{url}/put'
|
18
|
-
self.delete_url = f'{url}/delete'
|
19
|
-
self.html_url = f'{url}/html'
|
20
|
-
|
21
|
-
def test_basic_get(self):
|
22
|
-
"""Test doing basic get request with multiple statuses"""
|
23
|
-
self.assertEqual(self.fetcher.get(self.status_200).status, 200)
|
24
|
-
self.assertEqual(self.fetcher.get(self.status_404).status, 404)
|
25
|
-
self.assertEqual(self.fetcher.get(self.status_501).status, 501)
|
26
|
-
|
27
|
-
def test_get_properties(self):
|
28
|
-
"""Test if different arguments with GET request breaks the code or not"""
|
29
|
-
self.assertEqual(self.fetcher.get(self.status_200, stealthy_headers=True).status, 200)
|
30
|
-
self.assertEqual(self.fetcher.get(self.status_200, follow_redirects=True).status, 200)
|
31
|
-
self.assertEqual(self.fetcher.get(self.status_200, timeout=None).status, 200)
|
32
|
-
self.assertEqual(
|
33
|
-
self.fetcher.get(self.status_200, stealthy_headers=True, follow_redirects=True, timeout=None).status,
|
34
|
-
200
|
35
|
-
)
|
36
|
-
|
37
|
-
def test_post_properties(self):
|
38
|
-
"""Test if different arguments with POST request breaks the code or not"""
|
39
|
-
self.assertEqual(self.fetcher.post(self.post_url, data={'key': 'value'}).status, 200)
|
40
|
-
self.assertEqual(self.fetcher.post(self.post_url, data={'key': 'value'}, stealthy_headers=True).status, 200)
|
41
|
-
self.assertEqual(self.fetcher.post(self.post_url, data={'key': 'value'}, follow_redirects=True).status, 200)
|
42
|
-
self.assertEqual(self.fetcher.post(self.post_url, data={'key': 'value'}, timeout=None).status, 200)
|
43
|
-
self.assertEqual(
|
44
|
-
self.fetcher.post(self.post_url, data={'key': 'value'}, stealthy_headers=True, follow_redirects=True, timeout=None).status,
|
45
|
-
200
|
46
|
-
)
|
47
|
-
|
48
|
-
def test_put_properties(self):
|
49
|
-
"""Test if different arguments with PUT request breaks the code or not"""
|
50
|
-
self.assertEqual(self.fetcher.put(self.put_url, data={'key': 'value'}).status, 200)
|
51
|
-
self.assertEqual(self.fetcher.put(self.put_url, data={'key': 'value'}, stealthy_headers=True).status, 200)
|
52
|
-
self.assertEqual(self.fetcher.put(self.put_url, data={'key': 'value'}, follow_redirects=True).status, 200)
|
53
|
-
self.assertEqual(self.fetcher.put(self.put_url, data={'key': 'value'}, timeout=None).status, 200)
|
54
|
-
self.assertEqual(
|
55
|
-
self.fetcher.put(self.put_url, data={'key': 'value'}, stealthy_headers=True, follow_redirects=True, timeout=None).status,
|
56
|
-
200
|
57
|
-
)
|
58
|
-
|
59
|
-
def test_delete_properties(self):
|
60
|
-
"""Test if different arguments with DELETE request breaks the code or not"""
|
61
|
-
self.assertEqual(self.fetcher.delete(self.delete_url, stealthy_headers=True).status, 200)
|
62
|
-
self.assertEqual(self.fetcher.delete(self.delete_url, follow_redirects=True).status, 200)
|
63
|
-
self.assertEqual(self.fetcher.delete(self.delete_url, timeout=None).status, 200)
|
64
|
-
self.assertEqual(
|
65
|
-
self.fetcher.delete(self.delete_url, stealthy_headers=True, follow_redirects=True, timeout=None).status,
|
66
|
-
200
|
67
|
-
)
|
tests/fetchers/test_playwright.py
DELETED
@@ -1,76 +0,0 @@
|
|
1
|
-
import unittest
|
2
|
-
import pytest_httpbin
|
3
|
-
|
4
|
-
from scrapling import PlayWrightFetcher
|
5
|
-
|
6
|
-
|
7
|
-
@pytest_httpbin.use_class_based_httpbin
|
8
|
-
# @pytest_httpbin.use_class_based_httpbin_secure
|
9
|
-
class TestPlayWrightFetcher(unittest.TestCase):
|
10
|
-
def setUp(self):
|
11
|
-
self.fetcher = PlayWrightFetcher(auto_match=False)
|
12
|
-
url = self.httpbin.url
|
13
|
-
self.status_200 = f'{url}/status/200'
|
14
|
-
self.status_404 = f'{url}/status/404'
|
15
|
-
self.status_501 = f'{url}/status/501'
|
16
|
-
self.basic_url = f'{url}/get'
|
17
|
-
self.html_url = f'{url}/html'
|
18
|
-
self.delayed_url = f'{url}/delay/10' # 10 Seconds delay response
|
19
|
-
self.cookies_url = f"{url}/cookies/set/test/value"
|
20
|
-
|
21
|
-
def test_basic_fetch(self):
|
22
|
-
"""Test doing basic fetch request with multiple statuses"""
|
23
|
-
self.assertEqual(self.fetcher.fetch(self.status_200).status, 200)
|
24
|
-
self.assertEqual(self.fetcher.fetch(self.status_404).status, 404)
|
25
|
-
self.assertEqual(self.fetcher.fetch(self.status_501).status, 501)
|
26
|
-
|
27
|
-
def test_networkidle(self):
|
28
|
-
"""Test if waiting for `networkidle` make page does not finish loading or not"""
|
29
|
-
self.assertEqual(self.fetcher.fetch(self.basic_url, network_idle=True).status, 200)
|
30
|
-
|
31
|
-
def test_blocking_resources(self):
|
32
|
-
"""Test if blocking resources make page does not finish loading or not"""
|
33
|
-
self.assertEqual(self.fetcher.fetch(self.basic_url, disable_resources=True).status, 200)
|
34
|
-
|
35
|
-
def test_waiting_selector(self):
|
36
|
-
"""Test if waiting for a selector make page does not finish loading or not"""
|
37
|
-
self.assertEqual(self.fetcher.fetch(self.html_url, wait_selector='h1').status, 200)
|
38
|
-
self.assertEqual(self.fetcher.fetch(self.html_url, wait_selector='h1', wait_selector_state='visible').status, 200)
|
39
|
-
|
40
|
-
def test_cookies_loading(self):
|
41
|
-
"""Test if cookies are set after the request"""
|
42
|
-
self.assertEqual(self.fetcher.fetch(self.cookies_url).cookies, {'test': 'value'})
|
43
|
-
|
44
|
-
def test_automation(self):
|
45
|
-
"""Test if automation break the code or not"""
|
46
|
-
def scroll_page(page):
|
47
|
-
page.mouse.wheel(10, 0)
|
48
|
-
page.mouse.move(100, 400)
|
49
|
-
page.mouse.up()
|
50
|
-
return page
|
51
|
-
|
52
|
-
self.assertEqual(self.fetcher.fetch(self.html_url, page_action=scroll_page).status, 200)
|
53
|
-
|
54
|
-
def test_properties(self):
|
55
|
-
"""Test if different arguments breaks the code or not"""
|
56
|
-
self.assertEqual(self.fetcher.fetch(self.html_url, disable_webgl=True, hide_canvas=False).status, 200)
|
57
|
-
self.assertEqual(self.fetcher.fetch(self.html_url, disable_webgl=False, hide_canvas=True).status, 200)
|
58
|
-
self.assertEqual(self.fetcher.fetch(self.html_url, stealth=True).status, 200)
|
59
|
-
self.assertEqual(self.fetcher.fetch(self.html_url, useragent='Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:131.0) Gecko/20100101 Firefox/131.0').status, 200)
|
60
|
-
self.assertEqual(self.fetcher.fetch(self.html_url, extra_headers={'ayo': ''}).status, 200)
|
61
|
-
|
62
|
-
def test_cdp_url(self):
|
63
|
-
"""Test if it's going to try to connect to cdp url or not"""
|
64
|
-
with self.assertRaises(ValueError):
|
65
|
-
_ = self.fetcher.fetch(self.html_url, cdp_url='blahblah')
|
66
|
-
|
67
|
-
with self.assertRaises(ValueError):
|
68
|
-
_ = self.fetcher.fetch(self.html_url, cdp_url='blahblah', nstbrowser_mode=True)
|
69
|
-
|
70
|
-
with self.assertRaises(Exception):
|
71
|
-
# There's no type for this error in PlayWright, it's just `Error`
|
72
|
-
_ = self.fetcher.fetch(self.html_url, cdp_url='ws://blahblah')
|
73
|
-
|
74
|
-
def test_infinite_timeout(self):
|
75
|
-
"""Test if infinite timeout breaks the code or not"""
|
76
|
-
self.assertEqual(self.fetcher.fetch(self.delayed_url, timeout=None).status, 200)
|
File without changes
|
File without changes
|
File without changes
|