scrapling 0.2.6__py3-none-any.whl → 0.2.8__py3-none-any.whl
- scrapling/__init__.py +4 -3
- scrapling/core/_types.py +2 -3
- scrapling/core/custom_types.py +5 -5
- scrapling/core/storage_adaptors.py +6 -6
- scrapling/core/translator.py +5 -6
- scrapling/core/utils.py +15 -12
- scrapling/defaults.py +1 -1
- scrapling/engines/__init__.py +2 -2
- scrapling/engines/camo.py +20 -13
- scrapling/engines/constants.py +1 -1
- scrapling/engines/pw.py +31 -18
- scrapling/engines/static.py +24 -11
- scrapling/engines/toolbelt/__init__.py +6 -20
- scrapling/engines/toolbelt/custom.py +15 -10
- scrapling/engines/toolbelt/fingerprints.py +5 -5
- scrapling/engines/toolbelt/navigation.py +6 -6
- scrapling/fetchers.py +23 -14
- scrapling/parser.py +15 -8
- {scrapling-0.2.6.dist-info → scrapling-0.2.8.dist-info}/METADATA +28 -21
- scrapling-0.2.8.dist-info/RECORD +42 -0
- tests/fetchers/test_camoufox.py +1 -0
- tests/fetchers/test_httpx.py +1 -0
- tests/fetchers/test_playwright.py +1 -0
- tests/parser/test_general.py +3 -1
- scrapling-0.2.6.dist-info/RECORD +0 -42
- {scrapling-0.2.6.dist-info → scrapling-0.2.8.dist-info}/LICENSE +0 -0
- {scrapling-0.2.6.dist-info → scrapling-0.2.8.dist-info}/WHEEL +0 -0
- {scrapling-0.2.6.dist-info → scrapling-0.2.8.dist-info}/top_level.txt +0 -0
scrapling/__init__.py
CHANGED
@@ -1,10 +1,11 @@
 # Declare top-level shortcuts
-from scrapling.
+from scrapling.core.custom_types import AttributesHandler, TextHandler
+from scrapling.fetchers import (CustomFetcher, Fetcher, PlayWrightFetcher,
+                                StealthyFetcher)
 from scrapling.parser import Adaptor, Adaptors
-from scrapling.core.custom_types import TextHandler, AttributesHandler
 
 __author__ = "Karim Shoair (karim.shoair@pm.me)"
-__version__ = "0.2.6"
+__version__ = "0.2.8"
 __copyright__ = "Copyright (c) 2024 Karim Shoair"
 
 
scrapling/core/_types.py
CHANGED
@@ -2,9 +2,8 @@
 Type definitions for type checking purposes.
 """
 
-from typing import (
-
-)
+from typing import (TYPE_CHECKING, Any, Callable, Dict, Generator, Iterable,
+                    List, Literal, Optional, Pattern, Tuple, Type, Union)
 
 try:
     from typing import Protocol
scrapling/core/custom_types.py
CHANGED
@@ -1,13 +1,13 @@
 import re
-from types import MappingProxyType
 from collections.abc import Mapping
+from types import MappingProxyType
 
-from
-from scrapling.core._types import Dict, List, Union, Pattern, SupportsIndex
-
-from orjson import loads, dumps
+from orjson import dumps, loads
 from w3lib.html import replace_entities as _replace_entities
 
+from scrapling.core._types import Dict, List, Pattern, SupportsIndex, Union
+from scrapling.core.utils import _is_iterable, flatten
+
 
 class TextHandler(str):
     """Extends standard Python string by adding more functionality"""
scrapling/core/storage_adaptors.py
CHANGED
@@ -1,16 +1,16 @@
-import orjson
-import sqlite3
 import logging
+import sqlite3
 import threading
-from hashlib import sha256
 from abc import ABC, abstractmethod
+from hashlib import sha256
 
-
-from scrapling.core.utils import _StorageTools, cache
-
+import orjson
 from lxml import html
 from tldextract import extract as tld
 
+from scrapling.core._types import Dict, Optional, Union
+from scrapling.core.utils import _StorageTools, cache
+
 
 class StorageSystemMixin(ABC):
     # If you want to make your own storage system, you have to inherit from this
scrapling/core/translator.py
CHANGED
@@ -10,15 +10,14 @@ So you don't have to learn a new selectors/api method like what bs4 done with so
 
 import re
 
-from w3lib.html import HTML5_WHITESPACE
-from scrapling.core.utils import cache
-from scrapling.core._types import Any, Optional, Protocol, Self
-
-from cssselect.xpath import ExpressionError
-from cssselect.xpath import XPathExpr as OriginalXPathExpr
 from cssselect import HTMLTranslator as OriginalHTMLTranslator
 from cssselect.parser import Element, FunctionalPseudoElement, PseudoElement
+from cssselect.xpath import ExpressionError
+from cssselect.xpath import XPathExpr as OriginalXPathExpr
+from w3lib.html import HTML5_WHITESPACE
 
+from scrapling.core._types import Any, Optional, Protocol, Self
+from scrapling.core.utils import cache
 
 regex = f"[{HTML5_WHITESPACE}]+"
 replace_html5_whitespaces = re.compile(regex).sub
scrapling/core/utils.py
CHANGED
@@ -1,22 +1,25 @@
-import re
 import logging
+import re
 from itertools import chain
-# Using cache on top of a class is brilliant way to achieve Singleton design pattern without much code
-from functools import lru_cache as cache  # functools.cache is available on Python 3.9+ only so let's keep lru_cache
-
-from scrapling.core._types import Dict, Iterable, Any, Union
 
 import orjson
 from lxml import html
 
+from scrapling.core._types import Any, Dict, Iterable, Union
+
+# Using cache on top of a class is brilliant way to achieve Singleton design pattern without much code
+# functools.cache is available on Python 3.9+ only so let's keep lru_cache
+from functools import lru_cache as cache  # isort:skip
+
+
 html_forbidden = {html.HtmlComment, }
 logging.basicConfig(
-
-
-
-
-
-
+    level=logging.ERROR,
+    format='%(asctime)s - %(levelname)s - %(message)s',
+    handlers=[
+        logging.StreamHandler()
+    ]
+)
 
 
 def is_jsonable(content: Union[bytes, str]) -> bool:
@@ -94,7 +97,7 @@ class _StorageTools:
         parent = element.getparent()
         return tuple(
             (element.tag,) if parent is None else (
-
+                cls._get_element_path(parent) + (element.tag,)
             )
         )
 
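The utils.py hunk above pins the library's logging defaults: `logging.basicConfig` with `level=logging.ERROR`, a timestamped format, and a plain `StreamHandler`. Because the standard library's `basicConfig()` is a no-op once the root logger already has handlers, a script that wants more verbose output can configure logging before importing the package. A minimal sketch of that, not part of the diff itself:

```python
import logging

# Configure the root logger first so scrapling's own basicConfig() call
# (level=ERROR) finds it already set up and leaves it alone.
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s - %(levelname)s - %(message)s')

import scrapling  # noqa: E402  (imported after logging setup on purpose)
```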
scrapling/defaults.py
CHANGED
@@ -1,4 +1,4 @@
-from .fetchers import Fetcher,
+from .fetchers import Fetcher, PlayWrightFetcher, StealthyFetcher
 
 # If you are going to use Fetchers with the default settings, import them from this file instead for a cleaner looking code
 Fetcher = Fetcher()
scrapling/engines/__init__.py
CHANGED
@@ -1,7 +1,7 @@
 from .camo import CamoufoxEngine
-from .static import StaticEngine
-from .pw import PlaywrightEngine
 from .constants import DEFAULT_DISABLED_RESOURCES, DEFAULT_STEALTH_FLAGS
+from .pw import PlaywrightEngine
+from .static import StaticEngine
 from .toolbelt import check_if_engine_usable
 
 __all__ = ['CamoufoxEngine', 'PlaywrightEngine']
scrapling/engines/camo.py
CHANGED
@@ -1,19 +1,16 @@
 import logging
-from scrapling.core._types import Union, Callable, Optional, Dict, List, Literal
-
-from scrapling.engines.toolbelt import (
-    Response,
-    do_nothing,
-    StatusText,
-    get_os_name,
-    intercept_route,
-    check_type_validity,
-    construct_proxy_dict,
-    generate_convincing_referer,
-)
 
+from camoufox import DefaultAddons
 from camoufox.sync_api import Camoufox
 
+from scrapling.core._types import (Callable, Dict, List, Literal, Optional,
+                                    Union)
+from scrapling.engines.toolbelt import (Response, StatusText,
+                                        check_type_validity,
+                                        construct_proxy_dict, do_nothing,
+                                        generate_convincing_referer,
+                                        get_os_name, intercept_route)
+
 
 class CamoufoxEngine:
     def __init__(
@@ -21,7 +18,8 @@ class CamoufoxEngine:
             block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = False, network_idle: Optional[bool] = False, humanize: Optional[Union[bool, float]] = True,
             timeout: Optional[float] = 30000, page_action: Callable = do_nothing, wait_selector: Optional[str] = None, addons: Optional[List[str]] = None,
             wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
-            proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None,
+            proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True,
+            adaptor_arguments: Dict = None,
     ):
         """An engine that utilizes Camoufox library, check the `StealthyFetcher` class for more documentation.
 
@@ -36,6 +34,7 @@ class CamoufoxEngine:
         :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
         :param allow_webgl: Whether to allow WebGL. To prevent leaks, only use this for special cases.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+        :param disable_ads: Enabled by default, this installs `uBlock Origin` addon on the browser if enabled.
         :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
         :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
        :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
@@ -54,6 +53,7 @@ class CamoufoxEngine:
         self.network_idle = bool(network_idle)
         self.google_search = bool(google_search)
         self.os_randomize = bool(os_randomize)
+        self.disable_ads = bool(disable_ads)
         self.extra_headers = extra_headers or {}
         self.proxy = construct_proxy_dict(proxy)
         self.addons = addons or []
@@ -75,9 +75,11 @@ class CamoufoxEngine:
         :param url: Target url.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
+        addons = [] if self.disable_ads else [DefaultAddons.UBO]
         with Camoufox(
             proxy=self.proxy,
             addons=self.addons,
+            exclude_addons=addons,
             headless=self.headless,
             humanize=self.humanize,
             i_know_what_im_doing=True,  # To turn warnings off with the user configurations
@@ -105,6 +107,11 @@ class CamoufoxEngine:
             if self.wait_selector and type(self.wait_selector) is str:
                 waiter = page.locator(self.wait_selector)
                 waiter.first.wait_for(state=self.wait_selector_state)
+                # Wait again after waiting for the selector, helpful with protections like Cloudflare
+                page.wait_for_load_state(state="load")
+                page.wait_for_load_state(state="domcontentloaded")
+                if self.network_idle:
+                    page.wait_for_load_state('networkidle')
 
             # This will be parsed inside `Response`
             encoding = res.headers.get('content-type', '') or 'utf-8'  # default encoding
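The camo.py hunks above wire a new `disable_ads` switch through to Camoufox (keeping the bundled uBlock Origin addon unless it is turned off) and add extra load-state waits after `wait_selector`. A minimal usage sketch against the public fetcher API added later in this diff; the target URL is just a placeholder:

```python
from scrapling import StealthyFetcher

# disable_ads defaults to True (uBlock Origin stays installed); pass False to exclude it.
# network_idle=True also triggers the extra 'networkidle' wait added in this release.
page = StealthyFetcher().fetch(
    'https://example.com',   # placeholder URL
    disable_ads=True,
    network_idle=True,
    headless=True,
)
print(page.status)
```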
scrapling/engines/constants.py
CHANGED
@@ -44,7 +44,7 @@ DEFAULT_STEALTH_FLAGS = [
     '--disable-default-apps',
     '--disable-print-preview',
     '--disable-dev-shm-usage',
-    '--disable-popup-blocking',
+    # '--disable-popup-blocking',
     '--metrics-recording-only',
     '--disable-crash-reporter',
     '--disable-partial-raster',
scrapling/engines/pw.py
CHANGED
@@ -1,20 +1,15 @@
 import json
 import logging
-
-
-from scrapling.engines.constants import DEFAULT_STEALTH_FLAGS,
-
-
-
-
-
-
-
-    construct_cdp_url,
-    check_type_validity,
-    construct_proxy_dict,
-    generate_convincing_referer,
-)
+
+from scrapling.core._types import Callable, Dict, List, Optional, Union
+from scrapling.engines.constants import (DEFAULT_STEALTH_FLAGS,
+                                         NSTBROWSER_DEFAULT_QUERY)
+from scrapling.engines.toolbelt import (Response, StatusText,
+                                        check_type_validity, construct_cdp_url,
+                                        construct_proxy_dict, do_nothing,
+                                        generate_convincing_referer,
+                                        generate_headers, intercept_route,
+                                        js_bypass_path)
 
 
 class PlaywrightEngine:
@@ -26,6 +21,7 @@ class PlaywrightEngine:
             timeout: Optional[float] = 30000,
             page_action: Callable = do_nothing,
             wait_selector: Optional[str] = None,
+            locale: Optional[str] = 'en-US',
             wait_selector_state: Optional[str] = 'attached',
             stealth: Optional[bool] = False,
             real_chrome: Optional[bool] = False,
@@ -50,6 +46,7 @@ class PlaywrightEngine:
         :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
         :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
         :param wait_selector: Wait for a specific css selector to be in a specific state.
+        :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
         :param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`.
         :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
         :param real_chrome: If you have chrome browser installed on your device, enable this and the Fetcher will launch an instance of your browser and use it.
@@ -64,6 +61,7 @@ class PlaywrightEngine:
         :param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
         """
         self.headless = headless
+        self.locale = check_type_validity(locale, [str], 'en-US', param_name='locale')
         self.disable_resources = disable_resources
         self.network_idle = bool(network_idle)
         self.stealth = bool(stealth)
@@ -87,6 +85,14 @@ class PlaywrightEngine:
         self.nstbrowser_mode = bool(nstbrowser_mode)
         self.nstbrowser_config = nstbrowser_config
         self.adaptor_arguments = adaptor_arguments if adaptor_arguments else {}
+        self.harmful_default_args = [
+            # This will be ignored to avoid detection more and possibly avoid the popup crashing bug abuse: https://issues.chromium.org/issues/340836884
+            '--enable-automation',
+            '--disable-popup-blocking',
+            # '--disable-component-update',
+            # '--disable-default-apps',
+            # '--disable-extensions',
+        ]
 
     def _cdp_url_logic(self, flags: Optional[List] = None) -> str:
         """Constructs new CDP URL if NSTBrowser is enabled otherwise return CDP URL as it is
@@ -151,15 +157,15 @@ class PlaywrightEngine:
             else:
                 if self.stealth:
                     browser = p.chromium.launch(
-                        headless=self.headless, args=flags, ignore_default_args=
+                        headless=self.headless, args=flags, ignore_default_args=self.harmful_default_args, chromium_sandbox=True, channel='chrome' if self.real_chrome else 'chromium'
                     )
                 else:
-                    browser = p.chromium.launch(headless=self.headless, ignore_default_args=
+                    browser = p.chromium.launch(headless=self.headless, ignore_default_args=self.harmful_default_args, channel='chrome' if self.real_chrome else 'chromium')
 
             # Creating the context
             if self.stealth:
                 context = browser.new_context(
-                    locale=
+                    locale=self.locale,
                     is_mobile=False,
                     has_touch=False,
                     proxy=self.proxy,
@@ -176,6 +182,8 @@ class PlaywrightEngine:
                 )
             else:
                 context = browser.new_context(
+                    locale=self.locale,
+                    proxy=self.proxy,
                     color_scheme='dark',
                     user_agent=useragent,
                     device_scale_factor=2,
@@ -221,6 +229,11 @@ class PlaywrightEngine:
             if self.wait_selector and type(self.wait_selector) is str:
                 waiter = page.locator(self.wait_selector)
                 waiter.first.wait_for(state=self.wait_selector_state)
+                # Wait again after waiting for the selector, helpful with protections like Cloudflare
+                page.wait_for_load_state(state="load")
+                page.wait_for_load_state(state="domcontentloaded")
+                if self.network_idle:
+                    page.wait_for_load_state('networkidle')
 
             # This will be parsed inside `Response`
             encoding = res.headers.get('content-type', '') or 'utf-8'  # default encoding
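The pw.py changes add a `locale` option (validated with `check_type_validity` and passed to `browser.new_context()` in both the stealth and plain paths), a shared `harmful_default_args` list for `ignore_default_args`, and the same post-selector load-state waits as the Camoufox engine. A hedged sketch of how the new argument surfaces through the public `PlayWrightFetcher` defined later in this diff; URL and locale value are only examples:

```python
from scrapling import PlayWrightFetcher

# locale is forwarded to Playwright's new_context(); 'en-US' remains the default
page = PlayWrightFetcher().fetch(
    'https://example.com',   # placeholder URL
    locale='de-DE',
    stealth=True,
)
print(page.status)
```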
scrapling/engines/static.py
CHANGED
@@ -1,11 +1,12 @@
 import logging
 
-from scrapling.core._types import Union, Optional, Dict
-from .toolbelt import Response, generate_convincing_referer, generate_headers
-
 import httpx
 from httpx._models import Response as httpxResponse
 
+from scrapling.core._types import Dict, Optional, Union
+
+from .toolbelt import Response, generate_convincing_referer, generate_headers
+
 
 class StaticEngine:
     def __init__(self, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = None, adaptor_arguments: Dict = None):
@@ -63,54 +64,66 @@ class StaticEngine:
             **self.adaptor_arguments
         )
 
-    def get(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+    def get(self, url: str, proxy: Optional[str] = None, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
         """Make basic HTTP GET request for you but with some added flavors.
 
         :param url: Target url.
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
             create a referer header as if this request had came from Google's search of this URL's domain.
+        :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
         headers = self._headers_job(kwargs.pop('headers', {}), url, stealthy_headers)
-
+        with httpx.Client(proxy=proxy) as client:
+            request = client.get(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+
         return self._prepare_response(request)
 
-    def post(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+    def post(self, url: str, proxy: Optional[str] = None, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
         """Make basic HTTP POST request for you but with some added flavors.
 
         :param url: Target url.
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
             create a referer header as if this request had came from Google's search of this URL's domain.
+        :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
         headers = self._headers_job(kwargs.pop('headers', {}), url, stealthy_headers)
-
+        with httpx.Client(proxy=proxy) as client:
+            request = client.post(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+
         return self._prepare_response(request)
 
-    def delete(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+    def delete(self, url: str, proxy: Optional[str] = None, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
         """Make basic HTTP DELETE request for you but with some added flavors.
 
         :param url: Target url.
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
             create a referer header as if this request had came from Google's search of this URL's domain.
+        :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
         headers = self._headers_job(kwargs.pop('headers', {}), url, stealthy_headers)
-
+        with httpx.Client(proxy=proxy) as client:
+            request = client.delete(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+
        return self._prepare_response(request)
 
-    def put(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+    def put(self, url: str, proxy: Optional[str] = None, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
         """Make basic HTTP PUT request for you but with some added flavors.
 
         :param url: Target url.
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
             create a referer header as if this request had came from Google's search of this URL's domain.
+        :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
         headers = self._headers_job(kwargs.pop('headers', {}), url, stealthy_headers)
-
+        with httpx.Client(proxy=proxy) as client:
+            request = client.put(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+
         return self._prepare_response(request)
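The static.py hunks replace the previous direct httpx calls with a short-lived `httpx.Client(proxy=...)` per request, which is what lets every verb accept its own `proxy` string. A small sketch of that pattern in isolation (the proxy credentials and URL are placeholders, and the single-proxy `proxy=` keyword requires an httpx version that supports it, as the new code assumes):

```python
import httpx

# One client per request so the proxy can change from call to call,
# mirroring what StaticEngine.get/post/put/delete now do internally.
proxy = 'http://username:password@localhost:8030'  # placeholder credentials
with httpx.Client(proxy=proxy) as client:
    response = client.get('https://httpbin.org/ip', follow_redirects=True, timeout=10)
print(response.status_code)
```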
scrapling/engines/toolbelt/__init__.py
CHANGED
@@ -1,20 +1,6 @@
-from .
-
-
-
-
-
-    Response,
-    do_nothing,
-    StatusText,
-    BaseFetcher,
-    get_variable_name,
-    check_type_validity,
-    check_if_engine_usable,
-)
-from .navigation import (
-    js_bypass_path,
-    intercept_route,
-    construct_cdp_url,
-    construct_proxy_dict,
-)
+from .custom import (BaseFetcher, Response, StatusText, check_if_engine_usable,
+                     check_type_validity, do_nothing, get_variable_name)
+from .fingerprints import (generate_convincing_referer, generate_headers,
+                           get_os_name)
+from .navigation import (construct_cdp_url, construct_proxy_dict,
+                         intercept_route, js_bypass_path)
scrapling/engines/toolbelt/custom.py
CHANGED
@@ -5,10 +5,11 @@ import inspect
 import logging
 from email.message import Message
 
+from scrapling.core._types import (Any, Callable, Dict, List, Optional, Tuple,
+                                   Type, Union)
 from scrapling.core.custom_types import MappingProxyType
+from scrapling.core.utils import cache, setup_basic_logging
 from scrapling.parser import Adaptor, SQLiteStorageSystem
-from scrapling.core.utils import setup_basic_logging, cache
-from scrapling.core._types import Any, List, Type, Union, Optional, Dict, Callable, Tuple
 
 
 class ResponseEncoding:
@@ -39,7 +40,7 @@ class ResponseEncoding:
 
     @classmethod
     @cache(maxsize=None)
-    def get_value(cls, content_type: Optional[str]) -> str:
+    def get_value(cls, content_type: Optional[str], text: Optional[str] = 'test') -> str:
         """Determine the appropriate character encoding from a content-type header.
 
         The encoding is determined by these rules in order:
@@ -50,26 +51,30 @@ class ResponseEncoding:
         5. Default to UTF-8 if nothing else matches
 
         :param content_type: Content-Type header value or None
+        :param text: A text to test the encoding on it
         :return: String naming the character encoding
         """
         if not content_type:
             return cls.__DEFAULT_ENCODING
 
         try:
+            encoding = None
             content_type, params = cls.__parse_content_type(content_type)
 
             # First check for explicit charset parameter
             if "charset" in params:
                 encoding = params["charset"].strip("'\"")
-                "test".encode(encoding)  # Validate encoding
-                return encoding
 
             # Apply content-type specific rules
-
-
+            elif content_type in cls.__ISO_8859_1_CONTENT_TYPES:
+                encoding = "ISO-8859-1"
+
+            elif content_type == "application/json":
+                encoding = cls.__DEFAULT_ENCODING
 
-            if
-
+            if encoding:
+                _ = text.encode(encoding)  # Validate encoding and validate it can encode the given text
+                return encoding
 
             return cls.__DEFAULT_ENCODING
 
@@ -87,7 +92,7 @@ class Response(Adaptor):
         self.cookies = cookies
         self.headers = headers
         self.request_headers = request_headers
-        encoding = ResponseEncoding.get_value(encoding)
+        encoding = ResponseEncoding.get_value(encoding, text)
         super().__init__(text=text, body=body, url=automatch_domain or url, encoding=encoding, **adaptor_arguments)
         # For back-ward compatibility
         self.adaptor = self
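The `ResponseEncoding.get_value()` change above makes the charset decision depend on the response text as well: whichever encoding is picked (from the charset parameter or from the content-type rules) is only returned if it can actually encode that text, otherwise the method falls through to the default. A rough illustration of the intended behaviour, assuming the internal class is importable as shown and that the class default is UTF-8:

```python
from scrapling.engines.toolbelt.custom import ResponseEncoding

# Charset taken from the header, then validated against the text it must encode
print(ResponseEncoding.get_value('text/html; charset=ISO-8859-1', 'café'))

# No charset parameter: content-type rules apply (JSON maps to the default encoding)
print(ResponseEncoding.get_value('application/json', '{"ok": true}'))
```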
scrapling/engines/toolbelt/fingerprints.py
CHANGED
@@ -4,12 +4,12 @@ Functions related to generating headers and fingerprints generally
 
 import platform
 
-from
-from
-
+from browserforge.fingerprints import Fingerprint, FingerprintGenerator
+from browserforge.headers import Browser, HeaderGenerator
 from tldextract import extract
-
-from
+
+from scrapling.core._types import Dict, Union
+from scrapling.core.utils import cache
 
 
 @cache(None, typed=True)
scrapling/engines/toolbelt/navigation.py
CHANGED
@@ -2,16 +2,16 @@
 Functions related to files and URLs
 """
 
-import os
 import logging
-
+import os
+from urllib.parse import urlencode, urlparse
+
+from playwright.sync_api import Route
 
+from scrapling.core._types import Dict, Optional, Union
 from scrapling.core.utils import cache
-from scrapling.core._types import Union, Dict, Optional
 from scrapling.engines.constants import DEFAULT_DISABLED_RESOURCES
 
-from playwright.sync_api import Route
-
 
 def intercept_route(route: Route) -> Union[Route, None]:
     """This is just a route handler but it drops requests that its type falls in `DEFAULT_DISABLED_RESOURCES`
@@ -43,7 +43,7 @@ def construct_proxy_dict(proxy_string: Union[str, Dict[str, str]]) -> Union[Dict
             }
         except ValueError:
             # Urllib will say that one of the parameters above can't be casted to the correct type like `int` for port etc...
-            raise TypeError(
+            raise TypeError('The proxy argument\'s string is in invalid format!')
 
     elif isinstance(proxy_string, dict):
         valid_keys = ('server', 'username', 'password', )
scrapling/fetchers.py
CHANGED
@@ -1,7 +1,8 @@
-from scrapling.core._types import
-
-from scrapling.engines
-
+from scrapling.core._types import (Callable, Dict, List, Literal, Optional,
+                                   Union)
+from scrapling.engines import (CamoufoxEngine, PlaywrightEngine, StaticEngine,
+                               check_if_engine_usable)
+from scrapling.engines.toolbelt import BaseFetcher, Response, do_nothing
 
 
 class Fetcher(BaseFetcher):
@@ -9,7 +10,7 @@ class Fetcher(BaseFetcher):
 
     Any additional keyword arguments passed to the methods below are passed to the respective httpx's method directly.
     """
-    def get(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+    def get(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, proxy: Optional[str] = None, **kwargs: Dict) -> Response:
         """Make basic HTTP GET request for you but with some added flavors.
 
         :param url: Target url.
@@ -17,13 +18,14 @@ class Fetcher(BaseFetcher):
         :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
             create a referer header as if this request had came from Google's search of this URL's domain.
+        :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-        response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).get(url, stealthy_headers, **kwargs)
+        response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).get(url, proxy, stealthy_headers, **kwargs)
         return response_object
 
-    def post(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+    def post(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, proxy: Optional[str] = None, **kwargs: Dict) -> Response:
         """Make basic HTTP POST request for you but with some added flavors.
 
         :param url: Target url.
@@ -31,13 +33,14 @@ class Fetcher(BaseFetcher):
         :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
             create a referer header as if this request came from Google's search of this URL's domain.
+        :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-        response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).post(url, stealthy_headers, **kwargs)
+        response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).post(url, proxy, stealthy_headers, **kwargs)
         return response_object
 
-    def put(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+    def put(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, proxy: Optional[str] = None, **kwargs: Dict) -> Response:
         """Make basic HTTP PUT request for you but with some added flavors.
 
         :param url: Target url
@@ -45,14 +48,15 @@ class Fetcher(BaseFetcher):
         :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
             create a referer header as if this request came from Google's search of this URL's domain.
+        :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
 
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-        response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).put(url, stealthy_headers, **kwargs)
+        response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).put(url, proxy, stealthy_headers, **kwargs)
         return response_object
 
-    def delete(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+    def delete(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, proxy: Optional[str] = None, **kwargs: Dict) -> Response:
         """Make basic HTTP DELETE request for you but with some added flavors.
 
         :param url: Target url
@@ -60,10 +64,11 @@ class Fetcher(BaseFetcher):
         :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
             create a referer header as if this request came from Google's search of this URL's domain.
+        :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-        response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).delete(url, stealthy_headers, **kwargs)
+        response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).delete(url, proxy, stealthy_headers, **kwargs)
         return response_object
 
 
@@ -78,7 +83,7 @@ class StealthyFetcher(BaseFetcher):
             block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = False, network_idle: Optional[bool] = False, addons: Optional[List[str]] = None,
             timeout: Optional[float] = 30000, page_action: Callable = do_nothing, wait_selector: Optional[str] = None, humanize: Optional[Union[bool, float]] = True,
             wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None, proxy: Optional[Union[str, Dict[str, str]]] = None,
-            os_randomize: Optional[bool] = None
+            os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True,
     ) -> Response:
         """
         Opens up a browser and do your request based on your chosen options below.
@@ -92,6 +97,7 @@ class StealthyFetcher(BaseFetcher):
             This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
         :param block_webrtc: Blocks WebRTC entirely.
         :param addons: List of Firefox addons to use. Must be paths to extracted addons.
+        :param disable_ads: Enabled by default, this installs `uBlock Origin` addon on the browser if enabled.
         :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
         :param allow_webgl: Whether to allow WebGL. To prevent leaks, only use this for special cases.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
@@ -111,6 +117,7 @@ class StealthyFetcher(BaseFetcher):
             timeout=timeout,
             headless=headless,
             humanize=humanize,
+            disable_ads=disable_ads,
             allow_webgl=allow_webgl,
             page_action=page_action,
             network_idle=network_idle,
@@ -148,7 +155,7 @@ class PlayWrightFetcher(BaseFetcher):
             useragent: Optional[str] = None, network_idle: Optional[bool] = False, timeout: Optional[float] = 30000,
             page_action: Optional[Callable] = do_nothing, wait_selector: Optional[str] = None, wait_selector_state: Optional[str] = 'attached',
             hide_canvas: Optional[bool] = False, disable_webgl: Optional[bool] = False, extra_headers: Optional[Dict[str, str]] = None, google_search: Optional[bool] = True,
-            proxy: Optional[Union[str, Dict[str, str]]] = None,
+            proxy: Optional[Union[str, Dict[str, str]]] = None, locale: Optional[str] = 'en-US',
             stealth: Optional[bool] = False, real_chrome: Optional[bool] = False,
             cdp_url: Optional[str] = None,
             nstbrowser_mode: Optional[bool] = False, nstbrowser_config: Optional[Dict] = None,
@@ -163,6 +170,7 @@ class PlayWrightFetcher(BaseFetcher):
         :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
         :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
+        :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
         :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
         :param wait_selector: Wait for a specific css selector to be in a specific state.
         :param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`.
@@ -180,6 +188,7 @@ class PlayWrightFetcher(BaseFetcher):
         """
         engine = PlaywrightEngine(
             proxy=proxy,
+            locale=locale,
             timeout=timeout,
             stealth=stealth,
             cdp_url=cdp_url,
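Taken together, the fetchers.py changes surface three new user-facing knobs in 0.2.8: a per-request `proxy` string on every `Fetcher` verb, `disable_ads` on `StealthyFetcher.fetch`, and `locale` on `PlayWrightFetcher.fetch`. A combined sketch using the pre-initialized fetchers from `scrapling.defaults`; all URLs and the proxy address are placeholders:

```python
from scrapling.defaults import Fetcher, StealthyFetcher, PlayWrightFetcher

# New per-request proxy argument on the plain HTTP fetcher
page = Fetcher.post('https://httpbin.org/post', data={'key': 'value'},
                    proxy='http://username:password@localhost:8030')

# New disable_ads flag on the Camoufox-based fetcher (True by default)
page = StealthyFetcher.fetch('https://example.com', disable_ads=False)

# New locale option on the Playwright-based fetcher
page = PlayWrightFetcher.fetch('https://example.com', locale='en-US')
```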
scrapling/parser.py
CHANGED
@@ -1,16 +1,23 @@
+import inspect
 import os
 import re
-import inspect
 from difflib import SequenceMatcher
 
-from
-from
-from scrapling.core.custom_types import TextHandler, TextHandlers, AttributesHandler
-from scrapling.core.storage_adaptors import SQLiteStorageSystem, StorageSystemMixin, _StorageTools
-from scrapling.core.utils import setup_basic_logging, logging, clean_spaces, flatten, html_forbidden, is_jsonable
-from scrapling.core._types import Any, Dict, List, Tuple, Optional, Pattern, Union, Callable, Generator, SupportsIndex, Iterable
+from cssselect import SelectorError, SelectorSyntaxError
+from cssselect import parse as split_selectors
 from lxml import etree, html
-
+
+from scrapling.core._types import (Any, Callable, Dict, Generator, Iterable,
+                                   List, Optional, Pattern, SupportsIndex,
+                                   Tuple, Union)
+from scrapling.core.custom_types import (AttributesHandler, TextHandler,
+                                         TextHandlers)
+from scrapling.core.mixins import SelectorsGeneration
+from scrapling.core.storage_adaptors import (SQLiteStorageSystem,
+                                             StorageSystemMixin, _StorageTools)
+from scrapling.core.translator import HTMLTranslator
+from scrapling.core.utils import (clean_spaces, flatten, html_forbidden,
+                                  is_jsonable, logging, setup_basic_logging)
 
 
 class Adaptor(SelectorsGeneration):
{scrapling-0.2.6.dist-info → scrapling-0.2.8.dist-info}/METADATA
CHANGED
@@ -1,7 +1,7 @@
 Metadata-Version: 2.1
 Name: scrapling
-Version: 0.2.6
-Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
+Version: 0.2.8
+Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
 Home-page: https://github.com/D4Vinci/Scrapling
 Author: Karim Shoair
 Author-email: karim.shoair@pm.me
@@ -41,7 +41,7 @@ Requires-Dist: tldextract
 Requires-Dist: httpx[brotli,zstd]
 Requires-Dist: playwright==1.48
 Requires-Dist: rebrowser-playwright
-Requires-Dist: camoufox>=0.
+Requires-Dist: camoufox>=0.4.4
 Requires-Dist: browserforge
 
 # 🕷️ Scrapling: Undetectable, Lightning-Fast, and Adaptive Web Scraping for Python
@@ -52,7 +52,7 @@ Dealing with failing web scrapers due to anti-bot protections or website changes
 Scrapling is a high-performance, intelligent web scraping library for Python that automatically adapts to website changes while significantly outperforming popular alternatives. For both beginners and experts, Scrapling provides powerful features while maintaining simplicity.
 
 ```python
->> from scrapling.
+>> from scrapling.defaults import Fetcher, StealthyFetcher, PlayWrightFetcher
 # Fetch websites' source under the radar!
 >> page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True)
 >> print(page.status)
@@ -90,10 +90,11 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha
   * [Text Extraction Speed Test (5000 nested elements).](#text-extraction-speed-test-5000-nested-elements)
   * [Extraction By Text Speed Test](#extraction-by-text-speed-test)
 * [Installation](#installation)
-* [Fetching Websites
-  * [
-  * [
-  * [
+* [Fetching Websites](#fetching-websites)
+  * [Features](#features)
+  * [Fetcher class](#fetcher)
+  * [StealthyFetcher class](#stealthyfetcher)
+  * [PlayWrightFetcher class](#playwrightfetcher)
 * [Advanced Parsing Features](#advanced-parsing-features)
   * [Smart Navigation](#smart-navigation)
   * [Content-based Selection & Finding Similar Elements](#content-based-selection--finding-similar-elements)
@@ -256,43 +257,48 @@ playwright install chromium
 python -m browserforge update
 ```
 
-## Fetching Websites
-
+## Fetching Websites
+Fetchers are basically interfaces that do requests or fetch pages for you in a single request fashion and then return an `Adaptor` object for you. This feature was introduced because the only option we had before was to fetch the page as you wanted it, then pass it manually to the `Adaptor` class to create an `Adaptor` instance and start playing around with the page.
+
+### Features
+You might be slightly confused by now so let me clear things up. All fetcher-type classes are imported in the same way
 ```python
 from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
 ```
-
+All of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `storage`, `storage_args`, and `debug`, which are the same ones you give to the `Adaptor` class.
 
 If you don't want to pass arguments to the generated `Adaptor` object and want to use the default values, you can use this import instead for cleaner code:
 ```python
-from scrapling.
+from scrapling.defaults import Fetcher, StealthyFetcher, PlayWrightFetcher
 ```
 then use it right away without initializing like:
 ```python
 page = StealthyFetcher.fetch('https://example.com')
 ```
 
-Also, the `Response` object returned from all fetchers is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`. All `cookies`, `headers`, and `request_headers` are always of type `dictionary`.
+Also, the `Response` object returned from all fetchers is the same as the `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`. All `cookies`, `headers`, and `request_headers` are always of type `dictionary`.
 > [!NOTE]
 > The `auto_match` argument is enabled by default which is the one you should care about the most as you will see later.
 ### Fetcher
 This class is built on top of [httpx](https://www.python-httpx.org/) with additional configuration options, here you can do `GET`, `POST`, `PUT`, and `DELETE` requests.
 
 For all methods, you have `stealth_headers` which makes `Fetcher` create and use real browser's headers then create a referer header as if this request came from Google's search of this URL's domain. It's enabled by default.
+
+You can route all traffic (HTTP and HTTPS) to a proxy for any of these methods in this format `http://username:password@localhost:8030`
 ```python
 >> page = Fetcher().get('https://httpbin.org/get', stealth_headers=True, follow_redirects=True)
->> page = Fetcher().post('https://httpbin.org/post', data={'key': 'value'})
+>> page = Fetcher().post('https://httpbin.org/post', data={'key': 'value'}, proxy='http://username:password@localhost:8030')
 >> page = Fetcher().put('https://httpbin.org/put', data={'key': 'value'})
 >> page = Fetcher().delete('https://httpbin.org/delete')
 ```
 ### StealthyFetcher
-This class is built on top of [Camoufox](https://github.com/daijro/camoufox)
+This class is built on top of [Camoufox](https://github.com/daijro/camoufox), bypassing most anti-bot protections by default. Scrapling adds extra layers of flavors and configurations to increase performance and undetectability even further.
 ```python
 >> page = StealthyFetcher().fetch('https://www.browserscan.net/bot-detection')  # Running headless by default
 >> page.status == 200
 True
 ```
-> Note: all requests done by this fetcher
+> Note: all requests done by this fetcher are waiting by default for all JS to be fully loaded and executed so you don't have to :)
 
 <details><summary><strong>For the sake of simplicity, expand this for the complete list of arguments</strong></summary>
 
@@ -309,6 +315,7 @@ True
 | addons | List of Firefox addons to use. **Must be paths to extracted addons.** | ✔️ |
 | humanize | Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window. | ✔️ |
 | allow_webgl | Whether to allow WebGL. To prevent leaks, only use this for special cases. | ✔️ |
+| disable_ads | Enabled by default, this installs `uBlock Origin` addon on the browser if enabled. | ✔️ |
 | network_idle | Wait for the page until there are no network connections for at least 500 ms. | ✔️ |
 | timeout | The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000. | ✔️ |
 | wait_selector | Wait for a specific css selector to be in a specific state. | ✔️ |
@@ -327,7 +334,7 @@ This class is built on top of [Playwright](https://playwright.dev/python/) which
 >> page.css_first("#search a::attr(href)")
 'https://github.com/D4Vinci/Scrapling'
 ```
-> Note: all requests done by this fetcher
+> Note: all requests done by this fetcher are waiting by default for all JS to be fully loaded and executed so you don't have to :)
 
 Using this Fetcher class, you can make requests with:
 1) Vanilla Playwright without any modifications other than the ones you chose.
@@ -339,7 +346,7 @@ Using this Fetcher class, you can make requests with:
 3) Real browsers by passing the `real_chrome` argument or the CDP URL of your browser to be controlled by the Fetcher and most of the options can be enabled on it.
 4) [NSTBrowser](https://app.nstbrowser.io/r/1vO5e5)'s [docker browserless](https://hub.docker.com/r/nstbrowser/browserless) option by passing the CDP URL and enabling `nstbrowser_mode` option.
 
-> Hence using the `real_chrome` argument requires that you have
+> Hence using the `real_chrome` argument requires that you have Chrome browser installed on your device
 
 Add that to a lot of controlling/hiding options as you will see in the arguments list below.
 
@@ -362,7 +369,8 @@ Add that to a lot of controlling/hiding options as you will see in the arguments
 | hide_canvas | Add random noise to canvas operations to prevent fingerprinting. | ✔️ |
 | disable_webgl | Disables WebGL and WebGL 2.0 support entirely. | ✔️ |
 | stealth | Enables stealth mode, always check the documentation to see what stealth mode does currently. | ✔️ |
-| real_chrome | If you have
+| real_chrome | If you have Chrome browser installed on your device, enable this and the Fetcher will launch an instance of your browser and use it. | ✔️ |
+| locale | Set the locale for the browser if wanted. The default value is `en-US`. | ✔️ |
 | cdp_url | Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP. | ✔️ |
 | nstbrowser_mode | Enables NSTBrowser mode, **it have to be used with `cdp_url` argument or it will get completely ignored.** | ✔️ |
 | nstbrowser_config | The config you want to send with requests to the NSTBrowser. _If left empty, Scrapling defaults to an optimized NSTBrowser's docker browserless config._ | ✔️ |
@@ -814,8 +822,7 @@ Of course, you can find elements by text/regex, find similar elements in a more
 Yes, Scrapling instances are thread-safe. Each Adaptor instance maintains its state.
 
 ## More Sponsors!
-
-<a href="https://serpapi.com/?utm_source=scrapling"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png" height="500" width="500" alt="SerpApi Banner" ></a>
+<a href="https://serpapi.com/?utm_source=scrapling"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png" height="500" alt="SerpApi Banner" ></a>
 
 
 ## Contributing
scrapling-0.2.8.dist-info/RECORD
ADDED
@@ -0,0 +1,42 @@
+scrapling/__init__.py,sha256=0-gw4uqckCs7ikl6sHiB5c6y0AelpgefqJkBmSd7j1k,469
+scrapling/defaults.py,sha256=qO6zAS7k5_QXvbjuoBv87fUMqASGMuM2dVry9J9auv0,287
+scrapling/fetchers.py,sha256=iw1wEuFg14akJYpSg9webfBjAL341Pnofn4IkWahGlE,17486
+scrapling/parser.py,sha256=suXggr39GimLnnLm9ivM1CQ40AoDwGke2sgnWszqFqk,54331
+scrapling/py.typed,sha256=frcCV1k9oG9oKj3dpUqdJg1PxRT2RSN_XKdLCPjaYaY,2
+scrapling/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+scrapling/core/_types.py,sha256=__HJ2JTk5vx5eg_7HAJmDjaHrMDIaoxNG8fadLLyKV8,566
+scrapling/core/custom_types.py,sha256=8GCgcZL-IT5lP6titxL-RPCiItQSuJZjSlFIGCDxoSs,8402
+scrapling/core/mixins.py,sha256=sozbpaGL1_O_x3U-ABM5aYWpnxpCLfdbcA9SG3P7weY,3532
+scrapling/core/storage_adaptors.py,sha256=Q2-G7oDqoIqlIBEmnUsKwSzM2lNGNUPKtTbMjTV9178,6218
+scrapling/core/translator.py,sha256=WN_xPyYrD1MjLPv8Ar8zHNTPC_iYsW29kkjET4hbFI0,5228
+scrapling/core/utils.py,sha256=RajDRSPkVmszjpwNy8NIz8ZlUxPox8j2rSractr7Q9s,3779
+scrapling/engines/__init__.py,sha256=zA7tzqcDXP0hllwmjVewNHWipIA4JSU9mRG4J-cud0c,267
+scrapling/engines/camo.py,sha256=fmpGMW5T7we5cQC8muyvVo_A27yAqc5csm7dO_2jHiE,8446
+scrapling/engines/constants.py,sha256=WTn-X4kFIDWjXTiqOT0tm4XT5pijcdohFyZ0Af2C5Xc,3723
+scrapling/engines/pw.py,sha256=kWbkHm2vnQYeGuJnicKlAL1HrBKuXoFtyRMNFXLs4VY,13962
+scrapling/engines/static.py,sha256=h629IjT78YbhjFYBVSli53lKiYrG3929TAaZ7TA-j-Y,8022
+scrapling/engines/toolbelt/__init__.py,sha256=0tSsxMH5ALOMPXrLkr8mTH7LWg9QfIse4Ij9vUFgYjY,391
+scrapling/engines/toolbelt/custom.py,sha256=tab_wJmN6onvu2U8tDXeJ9jn6A47jTkmxSBoc-w8dIk,12789
+scrapling/engines/toolbelt/fingerprints.py,sha256=Y3FW8uqxxeNK3v6vBVvki8VjeG5oRxSwim4Q2Hv_cRk,2917
+scrapling/engines/toolbelt/navigation.py,sha256=Okpl4ynlLn2cUpSiaaoXDSOdDOXhvxNOOGphE_HXc5k,4016
+scrapling/engines/toolbelt/bypasses/navigator_plugins.js,sha256=tbnnk3nCXB6QEQnOhDlu3n-s7lnUTAkrUsjP6FDQIQg,2104
+scrapling/engines/toolbelt/bypasses/notification_permission.js,sha256=poPM3o5WYgEX-EdiUfDCllpWfc3Umvw4jr2u6O6elus,237
+scrapling/engines/toolbelt/bypasses/pdf_viewer.js,sha256=mKjjSuP1-BOGC_2WhRYHJo_LP7lTBi2KXmP_zsHO_tI,173
+scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js,sha256=3RP1AE_XZRvpupeV_i-WSNVqRxyUy0qd8rQV8j_4j3U,221
+scrapling/engines/toolbelt/bypasses/screen_props.js,sha256=fZEuHMQ1-fYuxxUMoQXUvVWYUkPUbblkfMfpiLvBY7w,599
+scrapling/engines/toolbelt/bypasses/webdriver_fully.js,sha256=hdJw4clRAJQqIdq5gIFC_eC-x7C1i2ab01KV5ylmOBs,728
+scrapling/engines/toolbelt/bypasses/window_chrome.js,sha256=D7hqzNGGDorh8JVlvm2YIv7Bk2CoVkG55MDIdyqhT1w,6808
+tests/__init__.py,sha256=YHFB5ftzgLQVh6gbPfbYcY4yOS9DOBp5dBa6I-qtm8U,32
+tests/fetchers/__init__.py,sha256=6H4NgARhyTcGGd3dNCKQJ8kUFdrAEMSScQL7Ga_vU3c,43
+tests/fetchers/test_camoufox.py,sha256=-1v_0mXeBcAVW932nkFws1HIDCodGbpNYniSnVMHeeU,3116
+tests/fetchers/test_httpx.py,sha256=rrw9q4KdDAHpQVa4sTmw278Yv1OlwY_SKPbpBPLVN7c,3508
+tests/fetchers/test_playwright.py,sha256=xwhRmlw7WBrtqyilZsoMHkHpyAx7iXQ-YexDMJURTao,3702
+tests/fetchers/test_utils.py,sha256=FPPJkBrqgYxdGeWwapH8Vj8zyfYVLiTE1qSLu8eBWik,5728
+tests/parser/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+tests/parser/test_automatch.py,sha256=BeeYJi3cYCghbiZmi57z4bqcGPaoUA8GAm7MALBBkkk,2486
+tests/parser/test_general.py,sha256=sPbwQRka9Mh8MDz2Sto8Rwg78t0SWWxELgzhTVPEplE,11785
+scrapling-0.2.8.dist-info/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
+scrapling-0.2.8.dist-info/METADATA,sha256=0As--zWykpljObaw8DZQJr6udpHm4NyRN-dfUOUrhBc,66605
+scrapling-0.2.8.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+scrapling-0.2.8.dist-info/top_level.txt,sha256=ub7FkOEXeYmmYTUxd4pCrwXfBfAMIpZ1sCGmXCc14tI,16
+scrapling-0.2.8.dist-info/RECORD,,
tests/fetchers/test_camoufox.py
CHANGED
tests/fetchers/test_httpx.py
CHANGED
tests/parser/test_general.py
CHANGED
scrapling-0.2.6.dist-info/RECORD
DELETED
@@ -1,42 +0,0 @@
-scrapling/__init__.py,sha256=NnIpEZcBGs5Pu2TjqPCacC7N6LN37SbnniBU1AhgdXs,435
-scrapling/defaults.py,sha256=blYDLiuI5DgDSLRWnUgpp21WtFOsv1BsCRCmPeg8Xc4,287
-scrapling/fetchers.py,sha256=-gc-Yo1MjF_4cdJ-5rxZqNC0owxFXTFoEBj08BFEYPs,16361
-scrapling/parser.py,sha256=d2n00uF5i7W5lf0afLNRdk17ZFcNyiF9EzXLRQGA0NM,54111
-scrapling/py.typed,sha256=frcCV1k9oG9oKj3dpUqdJg1PxRT2RSN_XKdLCPjaYaY,2
-scrapling/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-scrapling/core/_types.py,sha256=nD2ZY_fitLohx3MfDmqoKJ9ZShrnRhQ8-d1SU1zEGAY,552
-scrapling/core/custom_types.py,sha256=ztE_tshJ8i5uKqqSbsN5S6MoIUSfX6SexlhRjAnkclk,8402
-scrapling/core/mixins.py,sha256=sozbpaGL1_O_x3U-ABM5aYWpnxpCLfdbcA9SG3P7weY,3532
-scrapling/core/storage_adaptors.py,sha256=Kbak0BOJX5e9I1PbUS_4sUJi2Wxw8Bv5XsaLHAu1l2Q,6218
-scrapling/core/translator.py,sha256=R97lKGq1SDbx8S8Hg_w_5d4ePgukTHj_hRIKFzWiRuc,5229
-scrapling/core/utils.py,sha256=fXdANUgRBbVbOerJ94fRY9vi7n5zsbm8t3G4qQ-F3ak,3792
-scrapling/engines/__init__.py,sha256=zwMqcSdNGh-IX0d4zXazrgAeHrkqIN_v5Ia7RU1g8W0,267
-scrapling/engines/camo.py,sha256=dXkdfFmf3M09RXAvaZ8CE5khsblC3Wd7_6jWfu8XO6I,7618
-scrapling/engines/constants.py,sha256=jSDA6lgbvEIB8z2m2SFzCKkvFEZnp28Mondy2__FpkM,3721
-scrapling/engines/pw.py,sha256=gMWJAZYpJbFK-GiyRrpVrMjyMqSSetE6hf8kmf0zR2o,12729
-scrapling/engines/static.py,sha256=wzBsoOHPpN5JV1izQSSSarPBNWB-wo0BDWNFuin6ID8,7109
-scrapling/engines/toolbelt/__init__.py,sha256=BbxfC0depVOV3i3BnBnyfjHtLcZrDbhz6c5rTRczZUc,383
-scrapling/engines/toolbelt/custom.py,sha256=ELr3_FwUqNI27E98kz-50OA5a6hQQtoIYrZoLKsvUpM,12551
-scrapling/engines/toolbelt/fingerprints.py,sha256=T9HQejHzAnHsD5EIXvrYVC5siiG5q2gOOXVIIANmzMc,2917
-scrapling/engines/toolbelt/navigation.py,sha256=Tde5_6Wv7lOeWXMzs9D6TRaxAbJ3b-zIX6-4HggZbCQ,4017
-scrapling/engines/toolbelt/bypasses/navigator_plugins.js,sha256=tbnnk3nCXB6QEQnOhDlu3n-s7lnUTAkrUsjP6FDQIQg,2104
-scrapling/engines/toolbelt/bypasses/notification_permission.js,sha256=poPM3o5WYgEX-EdiUfDCllpWfc3Umvw4jr2u6O6elus,237
-scrapling/engines/toolbelt/bypasses/pdf_viewer.js,sha256=mKjjSuP1-BOGC_2WhRYHJo_LP7lTBi2KXmP_zsHO_tI,173
-scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js,sha256=3RP1AE_XZRvpupeV_i-WSNVqRxyUy0qd8rQV8j_4j3U,221
-scrapling/engines/toolbelt/bypasses/screen_props.js,sha256=fZEuHMQ1-fYuxxUMoQXUvVWYUkPUbblkfMfpiLvBY7w,599
-scrapling/engines/toolbelt/bypasses/webdriver_fully.js,sha256=hdJw4clRAJQqIdq5gIFC_eC-x7C1i2ab01KV5ylmOBs,728
-scrapling/engines/toolbelt/bypasses/window_chrome.js,sha256=D7hqzNGGDorh8JVlvm2YIv7Bk2CoVkG55MDIdyqhT1w,6808
-tests/__init__.py,sha256=YHFB5ftzgLQVh6gbPfbYcY4yOS9DOBp5dBa6I-qtm8U,32
-tests/fetchers/__init__.py,sha256=6H4NgARhyTcGGd3dNCKQJ8kUFdrAEMSScQL7Ga_vU3c,43
-tests/fetchers/test_camoufox.py,sha256=53piGA5uuPvOx5BeUEA0bbizYihwHGxehnj5uqCr6Q0,3115
-tests/fetchers/test_httpx.py,sha256=UivOItR3-l-bXp9E6TP5Tvn2OrCdgiVkWsti-f9xdpU,3507
-tests/fetchers/test_playwright.py,sha256=7qwbIU2SwjiQEbaGPA_MBo6kAXM4IBmfvy5kUvKT11M,3701
-tests/fetchers/test_utils.py,sha256=FPPJkBrqgYxdGeWwapH8Vj8zyfYVLiTE1qSLu8eBWik,5728
-tests/parser/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-tests/parser/test_automatch.py,sha256=BeeYJi3cYCghbiZmi57z4bqcGPaoUA8GAm7MALBBkkk,2486
-tests/parser/test_general.py,sha256=qaiVzpvqESfdXYFat6QrpnMkevPYgCzIcTZK5FwdC0s,11783
-scrapling-0.2.6.dist-info/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
-scrapling-0.2.6.dist-info/METADATA,sha256=cFOu2nlkXDsjyjkIt9kDu1nKKvS14xYH2LT4_VNH5j0,65362
-scrapling-0.2.6.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
-scrapling-0.2.6.dist-info/top_level.txt,sha256=ub7FkOEXeYmmYTUxd4pCrwXfBfAMIpZ1sCGmXCc14tI,16
-scrapling-0.2.6.dist-info/RECORD,,
{scrapling-0.2.6.dist-info → scrapling-0.2.8.dist-info}/LICENSE
File without changes
{scrapling-0.2.6.dist-info → scrapling-0.2.8.dist-info}/WHEEL
File without changes
{scrapling-0.2.6.dist-info → scrapling-0.2.8.dist-info}/top_level.txt
File without changes