scrapling 0.2.7__py3-none-any.whl → 0.2.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +5 -4
- scrapling/core/_types.py +2 -3
- scrapling/core/custom_types.py +93 -11
- scrapling/core/storage_adaptors.py +9 -10
- scrapling/core/translator.py +6 -7
- scrapling/core/utils.py +35 -30
- scrapling/defaults.py +2 -1
- scrapling/engines/__init__.py +2 -2
- scrapling/engines/camo.py +96 -26
- scrapling/engines/constants.py +4 -4
- scrapling/engines/pw.py +166 -96
- scrapling/engines/static.py +94 -50
- scrapling/engines/toolbelt/__init__.py +6 -20
- scrapling/engines/toolbelt/custom.py +22 -23
- scrapling/engines/toolbelt/fingerprints.py +7 -7
- scrapling/engines/toolbelt/navigation.py +25 -12
- scrapling/fetchers.py +233 -17
- scrapling/parser.py +63 -28
- {scrapling-0.2.7.dist-info → scrapling-0.2.9.dist-info}/METADATA +41 -25
- scrapling-0.2.9.dist-info/RECORD +47 -0
- tests/fetchers/async/__init__.py +0 -0
- tests/fetchers/async/test_camoufox.py +95 -0
- tests/fetchers/async/test_httpx.py +83 -0
- tests/fetchers/async/test_playwright.py +99 -0
- tests/fetchers/sync/__init__.py +0 -0
- tests/fetchers/sync/test_camoufox.py +68 -0
- tests/fetchers/sync/test_httpx.py +82 -0
- tests/fetchers/sync/test_playwright.py +87 -0
- tests/fetchers/test_utils.py +90 -122
- tests/parser/test_automatch.py +64 -9
- tests/parser/test_general.py +263 -219
- scrapling-0.2.7.dist-info/RECORD +0 -42
- tests/fetchers/test_camoufox.py +0 -64
- tests/fetchers/test_httpx.py +0 -67
- tests/fetchers/test_playwright.py +0 -76
- {scrapling-0.2.7.dist-info → scrapling-0.2.9.dist-info}/LICENSE +0 -0
- {scrapling-0.2.7.dist-info → scrapling-0.2.9.dist-info}/WHEEL +0 -0
- {scrapling-0.2.7.dist-info → scrapling-0.2.9.dist-info}/top_level.txt +0 -0
scrapling-0.2.7.dist-info/RECORD
DELETED
@@ -1,42 +0,0 @@
|
|
1
|
-
scrapling/__init__.py,sha256=WjvhJ6xkiSHp7St2YJYYJIsiKL8WDYuAQ_qIsg03v-0,435
|
2
|
-
scrapling/defaults.py,sha256=blYDLiuI5DgDSLRWnUgpp21WtFOsv1BsCRCmPeg8Xc4,287
|
3
|
-
scrapling/fetchers.py,sha256=vjAsa-oleb7FfYsxqmEUVZGNxdo7LMVuiLuyjIGySQE,17417
|
4
|
-
scrapling/parser.py,sha256=d2n00uF5i7W5lf0afLNRdk17ZFcNyiF9EzXLRQGA0NM,54111
|
5
|
-
scrapling/py.typed,sha256=frcCV1k9oG9oKj3dpUqdJg1PxRT2RSN_XKdLCPjaYaY,2
|
6
|
-
scrapling/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
7
|
-
scrapling/core/_types.py,sha256=nD2ZY_fitLohx3MfDmqoKJ9ZShrnRhQ8-d1SU1zEGAY,552
|
8
|
-
scrapling/core/custom_types.py,sha256=ztE_tshJ8i5uKqqSbsN5S6MoIUSfX6SexlhRjAnkclk,8402
|
9
|
-
scrapling/core/mixins.py,sha256=sozbpaGL1_O_x3U-ABM5aYWpnxpCLfdbcA9SG3P7weY,3532
|
10
|
-
scrapling/core/storage_adaptors.py,sha256=Kbak0BOJX5e9I1PbUS_4sUJi2Wxw8Bv5XsaLHAu1l2Q,6218
|
11
|
-
scrapling/core/translator.py,sha256=R97lKGq1SDbx8S8Hg_w_5d4ePgukTHj_hRIKFzWiRuc,5229
|
12
|
-
scrapling/core/utils.py,sha256=fXdANUgRBbVbOerJ94fRY9vi7n5zsbm8t3G4qQ-F3ak,3792
|
13
|
-
scrapling/engines/__init__.py,sha256=zwMqcSdNGh-IX0d4zXazrgAeHrkqIN_v5Ia7RU1g8W0,267
|
14
|
-
scrapling/engines/camo.py,sha256=Lw_uZ5SMBy3T6MkCNOMPk1i51Lnpfd0M7HyAUJAzKIg,8284
|
15
|
-
scrapling/engines/constants.py,sha256=WTn-X4kFIDWjXTiqOT0tm4XT5pijcdohFyZ0Af2C5Xc,3723
|
16
|
-
scrapling/engines/pw.py,sha256=ZRmbFNQWzvxUHVrIUcKefyg6fDpBrN6erdatDpcLBaw,13762
|
17
|
-
scrapling/engines/static.py,sha256=ryVCIjTpVLNlCxSf_NYwDSdsoDbafnsGpkCoCROPhlI,8021
|
18
|
-
scrapling/engines/toolbelt/__init__.py,sha256=BbxfC0depVOV3i3BnBnyfjHtLcZrDbhz6c5rTRczZUc,383
|
19
|
-
scrapling/engines/toolbelt/custom.py,sha256=KopO0SVWzFoNB8LbFDQhtErm8KCid6nkQcGqRaItC6U,12752
|
20
|
-
scrapling/engines/toolbelt/fingerprints.py,sha256=T9HQejHzAnHsD5EIXvrYVC5siiG5q2gOOXVIIANmzMc,2917
|
21
|
-
scrapling/engines/toolbelt/navigation.py,sha256=Tde5_6Wv7lOeWXMzs9D6TRaxAbJ3b-zIX6-4HggZbCQ,4017
|
22
|
-
scrapling/engines/toolbelt/bypasses/navigator_plugins.js,sha256=tbnnk3nCXB6QEQnOhDlu3n-s7lnUTAkrUsjP6FDQIQg,2104
|
23
|
-
scrapling/engines/toolbelt/bypasses/notification_permission.js,sha256=poPM3o5WYgEX-EdiUfDCllpWfc3Umvw4jr2u6O6elus,237
|
24
|
-
scrapling/engines/toolbelt/bypasses/pdf_viewer.js,sha256=mKjjSuP1-BOGC_2WhRYHJo_LP7lTBi2KXmP_zsHO_tI,173
|
25
|
-
scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js,sha256=3RP1AE_XZRvpupeV_i-WSNVqRxyUy0qd8rQV8j_4j3U,221
|
26
|
-
scrapling/engines/toolbelt/bypasses/screen_props.js,sha256=fZEuHMQ1-fYuxxUMoQXUvVWYUkPUbblkfMfpiLvBY7w,599
|
27
|
-
scrapling/engines/toolbelt/bypasses/webdriver_fully.js,sha256=hdJw4clRAJQqIdq5gIFC_eC-x7C1i2ab01KV5ylmOBs,728
|
28
|
-
scrapling/engines/toolbelt/bypasses/window_chrome.js,sha256=D7hqzNGGDorh8JVlvm2YIv7Bk2CoVkG55MDIdyqhT1w,6808
|
29
|
-
tests/__init__.py,sha256=YHFB5ftzgLQVh6gbPfbYcY4yOS9DOBp5dBa6I-qtm8U,32
|
30
|
-
tests/fetchers/__init__.py,sha256=6H4NgARhyTcGGd3dNCKQJ8kUFdrAEMSScQL7Ga_vU3c,43
|
31
|
-
tests/fetchers/test_camoufox.py,sha256=53piGA5uuPvOx5BeUEA0bbizYihwHGxehnj5uqCr6Q0,3115
|
32
|
-
tests/fetchers/test_httpx.py,sha256=UivOItR3-l-bXp9E6TP5Tvn2OrCdgiVkWsti-f9xdpU,3507
|
33
|
-
tests/fetchers/test_playwright.py,sha256=7qwbIU2SwjiQEbaGPA_MBo6kAXM4IBmfvy5kUvKT11M,3701
|
34
|
-
tests/fetchers/test_utils.py,sha256=FPPJkBrqgYxdGeWwapH8Vj8zyfYVLiTE1qSLu8eBWik,5728
|
35
|
-
tests/parser/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
36
|
-
tests/parser/test_automatch.py,sha256=BeeYJi3cYCghbiZmi57z4bqcGPaoUA8GAm7MALBBkkk,2486
|
37
|
-
tests/parser/test_general.py,sha256=qaiVzpvqESfdXYFat6QrpnMkevPYgCzIcTZK5FwdC0s,11783
|
38
|
-
scrapling-0.2.7.dist-info/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
|
39
|
-
scrapling-0.2.7.dist-info/METADATA,sha256=kYARTFqiiLsL_cvnU03pf2I1E5N_NmJk25gbeLzSR4M,66607
|
40
|
-
scrapling-0.2.7.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
|
41
|
-
scrapling-0.2.7.dist-info/top_level.txt,sha256=ub7FkOEXeYmmYTUxd4pCrwXfBfAMIpZ1sCGmXCc14tI,16
|
42
|
-
scrapling-0.2.7.dist-info/RECORD,,
|
tests/fetchers/test_camoufox.py
DELETED
@@ -1,64 +0,0 @@
|
|
1
|
-
import unittest
|
2
|
-
import pytest_httpbin
|
3
|
-
|
4
|
-
from scrapling import StealthyFetcher
|
5
|
-
|
6
|
-
|
7
|
-
@pytest_httpbin.use_class_based_httpbin
|
8
|
-
# @pytest_httpbin.use_class_based_httpbin_secure
|
9
|
-
class TestStealthyFetcher(unittest.TestCase):
|
10
|
-
def setUp(self):
|
11
|
-
self.fetcher = StealthyFetcher(auto_match=False)
|
12
|
-
url = self.httpbin.url
|
13
|
-
self.status_200 = f'{url}/status/200'
|
14
|
-
self.status_404 = f'{url}/status/404'
|
15
|
-
self.status_501 = f'{url}/status/501'
|
16
|
-
self.basic_url = f'{url}/get'
|
17
|
-
self.html_url = f'{url}/html'
|
18
|
-
self.delayed_url = f'{url}/delay/10' # 10 Seconds delay response
|
19
|
-
self.cookies_url = f"{url}/cookies/set/test/value"
|
20
|
-
|
21
|
-
def test_basic_fetch(self):
|
22
|
-
"""Test doing basic fetch request with multiple statuses"""
|
23
|
-
self.assertEqual(self.fetcher.fetch(self.status_200).status, 200)
|
24
|
-
self.assertEqual(self.fetcher.fetch(self.status_404).status, 404)
|
25
|
-
self.assertEqual(self.fetcher.fetch(self.status_501).status, 501)
|
26
|
-
|
27
|
-
def test_networkidle(self):
|
28
|
-
"""Test if waiting for `networkidle` make page does not finish loading or not"""
|
29
|
-
self.assertEqual(self.fetcher.fetch(self.basic_url, network_idle=True).status, 200)
|
30
|
-
|
31
|
-
def test_blocking_resources(self):
|
32
|
-
"""Test if blocking resources make page does not finish loading or not"""
|
33
|
-
self.assertEqual(self.fetcher.fetch(self.basic_url, block_images=True).status, 200)
|
34
|
-
self.assertEqual(self.fetcher.fetch(self.basic_url, disable_resources=True).status, 200)
|
35
|
-
|
36
|
-
def test_waiting_selector(self):
|
37
|
-
"""Test if waiting for a selector make page does not finish loading or not"""
|
38
|
-
self.assertEqual(self.fetcher.fetch(self.html_url, wait_selector='h1').status, 200)
|
39
|
-
self.assertEqual(self.fetcher.fetch(self.html_url, wait_selector='h1', wait_selector_state='visible').status, 200)
|
40
|
-
|
41
|
-
def test_cookies_loading(self):
|
42
|
-
"""Test if cookies are set after the request"""
|
43
|
-
self.assertEqual(self.fetcher.fetch(self.cookies_url).cookies, {'test': 'value'})
|
44
|
-
|
45
|
-
def test_automation(self):
|
46
|
-
"""Test if automation break the code or not"""
|
47
|
-
def scroll_page(page):
|
48
|
-
page.mouse.wheel(10, 0)
|
49
|
-
page.mouse.move(100, 400)
|
50
|
-
page.mouse.up()
|
51
|
-
return page
|
52
|
-
|
53
|
-
self.assertEqual(self.fetcher.fetch(self.html_url, page_action=scroll_page).status, 200)
|
54
|
-
|
55
|
-
def test_properties(self):
|
56
|
-
"""Test if different arguments breaks the code or not"""
|
57
|
-
self.assertEqual(self.fetcher.fetch(self.html_url, block_webrtc=True, allow_webgl=True).status, 200)
|
58
|
-
self.assertEqual(self.fetcher.fetch(self.html_url, block_webrtc=False, allow_webgl=True).status, 200)
|
59
|
-
self.assertEqual(self.fetcher.fetch(self.html_url, block_webrtc=True, allow_webgl=False).status, 200)
|
60
|
-
self.assertEqual(self.fetcher.fetch(self.html_url, extra_headers={'ayo': ''}, os_randomize=True).status, 200)
|
61
|
-
|
62
|
-
def test_infinite_timeout(self):
|
63
|
-
"""Test if infinite timeout breaks the code or not"""
|
64
|
-
self.assertEqual(self.fetcher.fetch(self.delayed_url, timeout=None).status, 200)
|
tests/fetchers/test_httpx.py
DELETED
@@ -1,67 +0,0 @@
|
|
1
|
-
import unittest
|
2
|
-
import pytest_httpbin
|
3
|
-
|
4
|
-
from scrapling import Fetcher
|
5
|
-
|
6
|
-
|
7
|
-
@pytest_httpbin.use_class_based_httpbin
|
8
|
-
class TestFetcher(unittest.TestCase):
|
9
|
-
def setUp(self):
|
10
|
-
self.fetcher = Fetcher(auto_match=False)
|
11
|
-
url = self.httpbin.url
|
12
|
-
self.status_200 = f'{url}/status/200'
|
13
|
-
self.status_404 = f'{url}/status/404'
|
14
|
-
self.status_501 = f'{url}/status/501'
|
15
|
-
self.basic_url = f'{url}/get'
|
16
|
-
self.post_url = f'{url}/post'
|
17
|
-
self.put_url = f'{url}/put'
|
18
|
-
self.delete_url = f'{url}/delete'
|
19
|
-
self.html_url = f'{url}/html'
|
20
|
-
|
21
|
-
def test_basic_get(self):
|
22
|
-
"""Test doing basic get request with multiple statuses"""
|
23
|
-
self.assertEqual(self.fetcher.get(self.status_200).status, 200)
|
24
|
-
self.assertEqual(self.fetcher.get(self.status_404).status, 404)
|
25
|
-
self.assertEqual(self.fetcher.get(self.status_501).status, 501)
|
26
|
-
|
27
|
-
def test_get_properties(self):
|
28
|
-
"""Test if different arguments with GET request breaks the code or not"""
|
29
|
-
self.assertEqual(self.fetcher.get(self.status_200, stealthy_headers=True).status, 200)
|
30
|
-
self.assertEqual(self.fetcher.get(self.status_200, follow_redirects=True).status, 200)
|
31
|
-
self.assertEqual(self.fetcher.get(self.status_200, timeout=None).status, 200)
|
32
|
-
self.assertEqual(
|
33
|
-
self.fetcher.get(self.status_200, stealthy_headers=True, follow_redirects=True, timeout=None).status,
|
34
|
-
200
|
35
|
-
)
|
36
|
-
|
37
|
-
def test_post_properties(self):
|
38
|
-
"""Test if different arguments with POST request breaks the code or not"""
|
39
|
-
self.assertEqual(self.fetcher.post(self.post_url, data={'key': 'value'}).status, 200)
|
40
|
-
self.assertEqual(self.fetcher.post(self.post_url, data={'key': 'value'}, stealthy_headers=True).status, 200)
|
41
|
-
self.assertEqual(self.fetcher.post(self.post_url, data={'key': 'value'}, follow_redirects=True).status, 200)
|
42
|
-
self.assertEqual(self.fetcher.post(self.post_url, data={'key': 'value'}, timeout=None).status, 200)
|
43
|
-
self.assertEqual(
|
44
|
-
self.fetcher.post(self.post_url, data={'key': 'value'}, stealthy_headers=True, follow_redirects=True, timeout=None).status,
|
45
|
-
200
|
46
|
-
)
|
47
|
-
|
48
|
-
def test_put_properties(self):
|
49
|
-
"""Test if different arguments with PUT request breaks the code or not"""
|
50
|
-
self.assertEqual(self.fetcher.put(self.put_url, data={'key': 'value'}).status, 200)
|
51
|
-
self.assertEqual(self.fetcher.put(self.put_url, data={'key': 'value'}, stealthy_headers=True).status, 200)
|
52
|
-
self.assertEqual(self.fetcher.put(self.put_url, data={'key': 'value'}, follow_redirects=True).status, 200)
|
53
|
-
self.assertEqual(self.fetcher.put(self.put_url, data={'key': 'value'}, timeout=None).status, 200)
|
54
|
-
self.assertEqual(
|
55
|
-
self.fetcher.put(self.put_url, data={'key': 'value'}, stealthy_headers=True, follow_redirects=True, timeout=None).status,
|
56
|
-
200
|
57
|
-
)
|
58
|
-
|
59
|
-
def test_delete_properties(self):
|
60
|
-
"""Test if different arguments with DELETE request breaks the code or not"""
|
61
|
-
self.assertEqual(self.fetcher.delete(self.delete_url, stealthy_headers=True).status, 200)
|
62
|
-
self.assertEqual(self.fetcher.delete(self.delete_url, follow_redirects=True).status, 200)
|
63
|
-
self.assertEqual(self.fetcher.delete(self.delete_url, timeout=None).status, 200)
|
64
|
-
self.assertEqual(
|
65
|
-
self.fetcher.delete(self.delete_url, stealthy_headers=True, follow_redirects=True, timeout=None).status,
|
66
|
-
200
|
67
|
-
)
|
@@ -1,76 +0,0 @@
|
|
1
|
-
import unittest
|
2
|
-
import pytest_httpbin
|
3
|
-
|
4
|
-
from scrapling import PlayWrightFetcher
|
5
|
-
|
6
|
-
|
7
|
-
@pytest_httpbin.use_class_based_httpbin
|
8
|
-
# @pytest_httpbin.use_class_based_httpbin_secure
|
9
|
-
class TestPlayWrightFetcher(unittest.TestCase):
|
10
|
-
def setUp(self):
|
11
|
-
self.fetcher = PlayWrightFetcher(auto_match=False)
|
12
|
-
url = self.httpbin.url
|
13
|
-
self.status_200 = f'{url}/status/200'
|
14
|
-
self.status_404 = f'{url}/status/404'
|
15
|
-
self.status_501 = f'{url}/status/501'
|
16
|
-
self.basic_url = f'{url}/get'
|
17
|
-
self.html_url = f'{url}/html'
|
18
|
-
self.delayed_url = f'{url}/delay/10' # 10 Seconds delay response
|
19
|
-
self.cookies_url = f"{url}/cookies/set/test/value"
|
20
|
-
|
21
|
-
def test_basic_fetch(self):
|
22
|
-
"""Test doing basic fetch request with multiple statuses"""
|
23
|
-
self.assertEqual(self.fetcher.fetch(self.status_200).status, 200)
|
24
|
-
self.assertEqual(self.fetcher.fetch(self.status_404).status, 404)
|
25
|
-
self.assertEqual(self.fetcher.fetch(self.status_501).status, 501)
|
26
|
-
|
27
|
-
def test_networkidle(self):
|
28
|
-
"""Test if waiting for `networkidle` make page does not finish loading or not"""
|
29
|
-
self.assertEqual(self.fetcher.fetch(self.basic_url, network_idle=True).status, 200)
|
30
|
-
|
31
|
-
def test_blocking_resources(self):
|
32
|
-
"""Test if blocking resources make page does not finish loading or not"""
|
33
|
-
self.assertEqual(self.fetcher.fetch(self.basic_url, disable_resources=True).status, 200)
|
34
|
-
|
35
|
-
def test_waiting_selector(self):
|
36
|
-
"""Test if waiting for a selector make page does not finish loading or not"""
|
37
|
-
self.assertEqual(self.fetcher.fetch(self.html_url, wait_selector='h1').status, 200)
|
38
|
-
self.assertEqual(self.fetcher.fetch(self.html_url, wait_selector='h1', wait_selector_state='visible').status, 200)
|
39
|
-
|
40
|
-
def test_cookies_loading(self):
|
41
|
-
"""Test if cookies are set after the request"""
|
42
|
-
self.assertEqual(self.fetcher.fetch(self.cookies_url).cookies, {'test': 'value'})
|
43
|
-
|
44
|
-
def test_automation(self):
|
45
|
-
"""Test if automation break the code or not"""
|
46
|
-
def scroll_page(page):
|
47
|
-
page.mouse.wheel(10, 0)
|
48
|
-
page.mouse.move(100, 400)
|
49
|
-
page.mouse.up()
|
50
|
-
return page
|
51
|
-
|
52
|
-
self.assertEqual(self.fetcher.fetch(self.html_url, page_action=scroll_page).status, 200)
|
53
|
-
|
54
|
-
def test_properties(self):
|
55
|
-
"""Test if different arguments breaks the code or not"""
|
56
|
-
self.assertEqual(self.fetcher.fetch(self.html_url, disable_webgl=True, hide_canvas=False).status, 200)
|
57
|
-
self.assertEqual(self.fetcher.fetch(self.html_url, disable_webgl=False, hide_canvas=True).status, 200)
|
58
|
-
self.assertEqual(self.fetcher.fetch(self.html_url, stealth=True).status, 200)
|
59
|
-
self.assertEqual(self.fetcher.fetch(self.html_url, useragent='Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:131.0) Gecko/20100101 Firefox/131.0').status, 200)
|
60
|
-
self.assertEqual(self.fetcher.fetch(self.html_url, extra_headers={'ayo': ''}).status, 200)
|
61
|
-
|
62
|
-
def test_cdp_url(self):
|
63
|
-
"""Test if it's going to try to connect to cdp url or not"""
|
64
|
-
with self.assertRaises(ValueError):
|
65
|
-
_ = self.fetcher.fetch(self.html_url, cdp_url='blahblah')
|
66
|
-
|
67
|
-
with self.assertRaises(ValueError):
|
68
|
-
_ = self.fetcher.fetch(self.html_url, cdp_url='blahblah', nstbrowser_mode=True)
|
69
|
-
|
70
|
-
with self.assertRaises(Exception):
|
71
|
-
# There's no type for this error in PlayWright, it's just `Error`
|
72
|
-
_ = self.fetcher.fetch(self.html_url, cdp_url='ws://blahblah')
|
73
|
-
|
74
|
-
def test_infinite_timeout(self):
|
75
|
-
"""Test if infinite timeout breaks the code or not"""
|
76
|
-
self.assertEqual(self.fetcher.fetch(self.delayed_url, timeout=None).status, 200)
|
File without changes
|
File without changes
|
File without changes
|