scrapling 0.2.8__py3-none-any.whl → 0.2.9__py3-none-any.whl
- scrapling/__init__.py +4 -4
- scrapling/core/custom_types.py +88 -6
- scrapling/core/storage_adaptors.py +5 -6
- scrapling/core/translator.py +2 -2
- scrapling/core/utils.py +29 -27
- scrapling/defaults.py +2 -1
- scrapling/engines/camo.py +89 -15
- scrapling/engines/constants.py +4 -4
- scrapling/engines/pw.py +158 -83
- scrapling/engines/static.py +91 -48
- scrapling/engines/toolbelt/__init__.py +3 -3
- scrapling/engines/toolbelt/custom.py +20 -22
- scrapling/engines/toolbelt/fingerprints.py +3 -3
- scrapling/engines/toolbelt/navigation.py +21 -8
- scrapling/fetchers.py +229 -14
- scrapling/parser.py +49 -21
- {scrapling-0.2.8.dist-info → scrapling-0.2.9.dist-info}/METADATA +32 -16
- scrapling-0.2.9.dist-info/RECORD +47 -0
- tests/fetchers/async/__init__.py +0 -0
- tests/fetchers/async/test_camoufox.py +95 -0
- tests/fetchers/async/test_httpx.py +83 -0
- tests/fetchers/async/test_playwright.py +99 -0
- tests/fetchers/sync/__init__.py +0 -0
- tests/fetchers/sync/test_camoufox.py +68 -0
- tests/fetchers/sync/test_httpx.py +82 -0
- tests/fetchers/sync/test_playwright.py +87 -0
- tests/fetchers/test_utils.py +90 -122
- tests/parser/test_automatch.py +64 -9
- tests/parser/test_general.py +260 -218
- scrapling-0.2.8.dist-info/RECORD +0 -42
- tests/fetchers/test_camoufox.py +0 -65
- tests/fetchers/test_httpx.py +0 -68
- tests/fetchers/test_playwright.py +0 -77
- {scrapling-0.2.8.dist-info → scrapling-0.2.9.dist-info}/LICENSE +0 -0
- {scrapling-0.2.8.dist-info → scrapling-0.2.9.dist-info}/WHEEL +0 -0
- {scrapling-0.2.8.dist-info → scrapling-0.2.9.dist-info}/top_level.txt +0 -0
scrapling/engines/toolbelt/custom.py
CHANGED
```diff
@@ -2,13 +2,12 @@
 Functions related to custom types or type checking
 """
 import inspect
-import logging
 from email.message import Message
 
 from scrapling.core._types import (Any, Callable, Dict, List, Optional, Tuple,
                                    Type, Union)
 from scrapling.core.custom_types import MappingProxyType
-from scrapling.core.utils import
+from scrapling.core.utils import log, lru_cache
 from scrapling.parser import Adaptor, SQLiteStorageSystem
 
 
@@ -17,7 +16,7 @@ class ResponseEncoding:
     __ISO_8859_1_CONTENT_TYPES = {"text/plain", "text/html", "text/css", "text/javascript"}
 
     @classmethod
-    @
+    @lru_cache(maxsize=None)
     def __parse_content_type(cls, header_value: str) -> Tuple[str, Dict[str, str]]:
         """Parse content type and parameters from a content-type header value.
 
@@ -39,7 +38,7 @@ class ResponseEncoding:
         return content_type, params
 
     @classmethod
-    @
+    @lru_cache(maxsize=None)
     def get_value(cls, content_type: Optional[str], text: Optional[str] = 'test') -> str:
         """Determine the appropriate character encoding from a content-type header.
 
```
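The 0.2.8 `cache` helper gives way to `lru_cache` here, imported from `scrapling.core.utils` (presumably re-exporting `functools.lru_cache`). Stacked under `@classmethod`, it memoizes one result per distinct header value, so repeated responses with the same `Content-Type` skip re-parsing. A minimal sketch of the same pattern with a hypothetical `ContentTypeParser`:

```python
from functools import lru_cache


class ContentTypeParser:
    @classmethod
    @lru_cache(maxsize=None)  # cache key is (cls, header_value); unbounded
    def parse(cls, header_value: str) -> str:
        print(f"parsing {header_value!r}")  # runs once per distinct header
        return header_value.split(";")[0].strip()


ContentTypeParser.parse("text/html; charset=utf-8")  # parses and caches
ContentTypeParser.parse("text/html; charset=utf-8")  # served from the cache
```

Note the decorator order: `@classmethod` must sit on top so `lru_cache` wraps the raw function; the reverse order fails on Python versions before 3.9.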
```diff
@@ -85,7 +84,10 @@ class ResponseEncoding:
 class Response(Adaptor):
     """This class is returned by all engines as a way to unify response type between different libraries."""
 
-
+    _is_response_result_logged = False  # Class-level flag, initialized to False
+
+    def __init__(self, url: str, text: str, body: bytes, status: int, reason: str, cookies: Dict, headers: Dict, request_headers: Dict,
+                 encoding: str = 'utf-8', method: str = 'GET', **adaptor_arguments: Dict):
         automatch_domain = adaptor_arguments.pop('automatch_domain', None)
         self.status = status
         self.reason = reason
@@ -96,6 +98,10 @@ class Response(Adaptor):
         super().__init__(text=text, body=body, url=automatch_domain or url, encoding=encoding, **adaptor_arguments)
         # For back-ward compatibility
         self.adaptor = self
+        # For easier debugging while working from a Python shell
+        if not Response._is_response_result_logged:
+            log.info(f'Fetched ({status}) <{method} {url}> (referer: {request_headers.get("referer")})')
+            Response._is_response_result_logged = True
 
     # def __repr__(self):
     #     return f'<{self.__class__.__name__} [{self.status} {self.reason}]>'
```
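The flag is class-level and never reset, so only the first `Response` constructed in a process emits the `Fetched ...` info line; every later response stays silent. A toy reproduction of that pattern, using a hypothetical `OneTimeLog` class:

```python
import logging

logging.basicConfig(level=logging.INFO)
log = logging.getLogger("scrapling")


class OneTimeLog:
    _logged = False  # shared by every instance, never reset

    def __init__(self, status: int, url: str):
        if not OneTimeLog._logged:
            log.info(f"Fetched ({status}) <GET {url}>")
            OneTimeLog._logged = True  # later instances skip the log call


OneTimeLog(200, "https://example.com")          # logs once
OneTimeLog(404, "https://example.com/missing")  # silent
```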
```diff
@@ -104,8 +110,8 @@ class Response(Adaptor):
 class BaseFetcher:
     def __init__(
             self, huge_tree: bool = True, keep_comments: Optional[bool] = False, auto_match: Optional[bool] = True,
-            storage: Any = SQLiteStorageSystem, storage_args: Optional[Dict] = None,
-            automatch_domain: Optional[str] = None,
+            storage: Any = SQLiteStorageSystem, storage_args: Optional[Dict] = None,
+            automatch_domain: Optional[str] = None, keep_cdata: Optional[bool] = False,
     ):
         """Arguments below are the same from the Adaptor class so you can pass them directly, the rest of Adaptor's arguments
         are detected and passed automatically from the Fetcher based on the response for accessibility.
@@ -113,6 +119,7 @@ class BaseFetcher:
         :param huge_tree: Enabled by default, should always be enabled when parsing large HTML documents. This controls
             libxml2 feature that forbids parsing certain large documents to protect from possible memory exhaustion.
         :param keep_comments: While parsing the HTML body, drop comments or not. Disabled by default for obvious reasons
+        :param keep_cdata: While parsing the HTML body, drop cdata or not. Disabled by default for cleaner HTML.
         :param auto_match: Globally turn-off the auto-match feature in all functions, this argument takes higher
             priority over all auto-match related arguments/functions in the class.
         :param storage: The storage class to be passed for auto-matching functionalities, see ``Docs`` for more info.
@@ -120,23 +127,20 @@ class BaseFetcher:
             If empty, default values will be used.
         :param automatch_domain: For cases where you want to automatch selectors across different websites as if they were on the same website, use this argument to unify them.
             Otherwise, the domain of the request is used by default.
-        :param debug: Enable debug mode
         """
         # Adaptor class parameters
         # I won't validate Adaptor's class parameters here again, I will leave it to be validated later
         self.adaptor_arguments = dict(
             huge_tree=huge_tree,
             keep_comments=keep_comments,
+            keep_cdata=keep_cdata,
             auto_match=auto_match,
             storage=storage,
-            storage_args=storage_args
-            debug=debug,
+            storage_args=storage_args
         )
-        # If the user used fetchers first, then configure the logger from here instead of the `Adaptor` class
-        setup_basic_logging(level='debug' if debug else 'info')
         if automatch_domain:
             if type(automatch_domain) is not str:
-
+                log.warning('[Ignored] The argument "automatch_domain" must be of string type')
             else:
                 self.adaptor_arguments.update({'automatch_domain': automatch_domain})
 
```
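A short usage sketch of the new `keep_cdata` flag; it rides along in `adaptor_arguments` to the parser exactly like `keep_comments` (the URL is a placeholder):

```python
from scrapling.fetchers import Fetcher

# CDATA sections are dropped by default for cleaner HTML; opt back in here.
fetcher = Fetcher(keep_cdata=True, keep_comments=False)
page = fetcher.get("https://example.com", timeout=10)
print(page.status)
```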
```diff
@@ -212,7 +216,7 @@ class StatusText:
     })
 
     @classmethod
-    @
+    @lru_cache(maxsize=128)
     def get(cls, status_code: int) -> str:
         """Get the phrase for a given HTTP status code."""
         return cls._phrases.get(status_code, "Unknown Status Code")
@@ -279,7 +283,7 @@ def check_type_validity(variable: Any, valid_types: Union[List[Type], None], def
         error_msg = f'Argument "{var_name}" cannot be None'
         if critical:
             raise TypeError(error_msg)
-
+        log.error(f'[Ignored] {error_msg}')
         return default_value
 
     # If no valid_types specified and variable has a value, return it
@@ -292,13 +296,7 @@ def check_type_validity(variable: Any, valid_types: Union[List[Type], None], def
         error_msg = f'Argument "{var_name}" must be of type {" or ".join(type_names)}'
         if critical:
             raise TypeError(error_msg)
-
+        log.error(f'[Ignored] {error_msg}')
         return default_value
 
     return variable
-
-
-# Pew Pew
-def do_nothing(page):
-    # Just works as a filler for `page_action` argument in browser engines
-    return page
```
scrapling/engines/toolbelt/fingerprints.py
CHANGED
```diff
@@ -9,10 +9,10 @@ from browserforge.headers import Browser, HeaderGenerator
 from tldextract import extract
 
 from scrapling.core._types import Dict, Union
-from scrapling.core.utils import
+from scrapling.core.utils import lru_cache
 
 
-@
+@lru_cache(None, typed=True)
 def generate_convincing_referer(url: str) -> str:
     """Takes the domain from the URL without the subdomain/suffix and make it look like you were searching google for this website
 
@@ -26,7 +26,7 @@ def generate_convincing_referer(url: str) -> str:
     return f'https://www.google.com/search?q={website_name}'
 
 
-@
+@lru_cache(None, typed=True)
 def get_os_name() -> Union[str, None]:
     """Get the current OS name in the same format needed for browserforge
 
```
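Based on the docstring and the return statement in the hunk above, the helper turns any URL into a Google-search referer for its bare site name, and `lru_cache(None, typed=True)` makes repeated calls for the same URL free. An illustrative call; the module path follows the file list above, and the printed output is inferred from the code shown, not verified:

```python
from scrapling.engines.toolbelt.fingerprints import generate_convincing_referer

# tldextract strips the subdomain and public suffix, leaving the bare site name
print(generate_convincing_referer("https://blog.example.co.uk/post/1"))
# expected: https://www.google.com/search?q=example
```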
scrapling/engines/toolbelt/navigation.py
CHANGED
```diff
@@ -1,28 +1,41 @@
 """
 Functions related to files and URLs
 """
-
-import logging
 import os
 from urllib.parse import urlencode, urlparse
 
+from playwright.async_api import Route as async_Route
 from playwright.sync_api import Route
 
 from scrapling.core._types import Dict, Optional, Union
-from scrapling.core.utils import
+from scrapling.core.utils import log, lru_cache
 from scrapling.engines.constants import DEFAULT_DISABLED_RESOURCES
 
 
-def intercept_route(route: Route)
+def intercept_route(route: Route):
+    """This is just a route handler but it drops requests that its type falls in `DEFAULT_DISABLED_RESOURCES`
+
+    :param route: PlayWright `Route` object of the current page
+    :return: PlayWright `Route` object
+    """
+    if route.request.resource_type in DEFAULT_DISABLED_RESOURCES:
+        log.debug(f'Blocking background resource "{route.request.url}" of type "{route.request.resource_type}"')
+        route.abort()
+    else:
+        route.continue_()
+
+
+async def async_intercept_route(route: async_Route):
     """This is just a route handler but it drops requests that its type falls in `DEFAULT_DISABLED_RESOURCES`
 
     :param route: PlayWright `Route` object of the current page
     :return: PlayWright `Route` object
     """
     if route.request.resource_type in DEFAULT_DISABLED_RESOURCES:
-
-
-
+        log.debug(f'Blocking background resource "{route.request.url}" of type "{route.request.resource_type}"')
+        await route.abort()
+    else:
+        await route.continue_()
 
 
 def construct_proxy_dict(proxy_string: Union[str, Dict[str, str]]) -> Union[Dict, None]:
@@ -97,7 +110,7 @@ def construct_cdp_url(cdp_url: str, query_params: Optional[Dict] = None) -> str:
         raise ValueError(f"Invalid CDP URL: {str(e)}")
 
 
-@
+@lru_cache(None, typed=True)
 def js_bypass_path(filename: str) -> str:
     """Takes the base filename of JS file inside the `bypasses` folder then return the full path of it
 
```
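Both handlers plug into Playwright's routing API; the async variant is registered the same way on a page from `playwright.async_api`. A minimal sketch of the sync wiring, assuming the module path from the file list above:

```python
from playwright.sync_api import sync_playwright

from scrapling.engines.toolbelt.navigation import intercept_route

with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    page = browser.new_page()
    # Funnel every request through the handler; anything whose resource_type
    # appears in DEFAULT_DISABLED_RESOURCES is aborted instead of fetched.
    page.route("**/*", intercept_route)
    page.goto("https://example.com")
    browser.close()
```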
scrapling/fetchers.py
CHANGED
```diff
@@ -2,7 +2,7 @@ from scrapling.core._types import (Callable, Dict, List, Literal, Optional,
                                    Union)
 from scrapling.engines import (CamoufoxEngine, PlaywrightEngine, StaticEngine,
                                check_if_engine_usable)
-from scrapling.engines.toolbelt import BaseFetcher, Response
+from scrapling.engines.toolbelt import BaseFetcher, Response
 
 
 class Fetcher(BaseFetcher):
@@ -10,7 +10,9 @@ class Fetcher(BaseFetcher):
 
     Any additional keyword arguments passed to the methods below are passed to the respective httpx's method directly.
     """
-    def get(
+    def get(
+            self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True,
+            proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
         """Make basic HTTP GET request for you but with some added flavors.
 
         :param url: Target url.
@@ -19,13 +21,17 @@ class Fetcher(BaseFetcher):
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
             create a referer header as if this request had came from Google's search of this URL's domain.
         :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
+        :param retries: The number of retries to do through httpx if the request failed for any reason. The default is 3 retries.
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-
+        adaptor_arguments = tuple(self.adaptor_arguments.items())
+        response_object = StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries, adaptor_arguments=adaptor_arguments).get(**kwargs)
         return response_object
 
-    def post(
+    def post(
+            self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True,
+            proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
         """Make basic HTTP POST request for you but with some added flavors.
 
         :param url: Target url.
@@ -34,13 +40,17 @@ class Fetcher(BaseFetcher):
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
             create a referer header as if this request came from Google's search of this URL's domain.
         :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
+        :param retries: The number of retries to do through httpx if the request failed for any reason. The default is 3 retries.
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-
+        adaptor_arguments = tuple(self.adaptor_arguments.items())
+        response_object = StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries, adaptor_arguments=adaptor_arguments).post(**kwargs)
         return response_object
 
-    def put(
+    def put(
+            self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True,
+            proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
         """Make basic HTTP PUT request for you but with some added flavors.
 
         :param url: Target url
@@ -49,14 +59,18 @@ class Fetcher(BaseFetcher):
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
             create a referer header as if this request came from Google's search of this URL's domain.
         :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
+        :param retries: The number of retries to do through httpx if the request failed for any reason. The default is 3 retries.
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
 
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-
+        adaptor_arguments = tuple(self.adaptor_arguments.items())
+        response_object = StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries, adaptor_arguments=adaptor_arguments).put(**kwargs)
         return response_object
 
-    def delete(
+    def delete(
+            self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True,
+            proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
         """Make basic HTTP DELETE request for you but with some added flavors.
 
         :param url: Target url
@@ -65,10 +79,90 @@ class Fetcher(BaseFetcher):
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
             create a referer header as if this request came from Google's search of this URL's domain.
         :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
+        :param retries: The number of retries to do through httpx if the request failed for any reason. The default is 3 retries.
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-
+        adaptor_arguments = tuple(self.adaptor_arguments.items())
+        response_object = StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries, adaptor_arguments=adaptor_arguments).delete(**kwargs)
+        return response_object
+
+
+class AsyncFetcher(Fetcher):
+    async def get(
+            self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True,
+            proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
+        """Make basic HTTP GET request for you but with some added flavors.
+
+        :param url: Target url.
+        :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
+        :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
+        :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
+            create a referer header as if this request had came from Google's search of this URL's domain.
+        :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
+        :param retries: The number of retries to do through httpx if the request failed for any reason. The default is 3 retries.
+        :param kwargs: Any additional keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
+        """
+        adaptor_arguments = tuple(self.adaptor_arguments.items())
+        response_object = await StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries=retries, adaptor_arguments=adaptor_arguments).async_get(**kwargs)
+        return response_object
+
+    async def post(
+            self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True,
+            proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
+        """Make basic HTTP POST request for you but with some added flavors.
+
+        :param url: Target url.
+        :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
+        :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
+        :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
+            create a referer header as if this request came from Google's search of this URL's domain.
+        :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
+        :param retries: The number of retries to do through httpx if the request failed for any reason. The default is 3 retries.
+        :param kwargs: Any additional keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
+        """
+        adaptor_arguments = tuple(self.adaptor_arguments.items())
+        response_object = await StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries=retries, adaptor_arguments=adaptor_arguments).async_post(**kwargs)
+        return response_object
+
+    async def put(
+            self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True,
+            proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
+        """Make basic HTTP PUT request for you but with some added flavors.
+
+        :param url: Target url
+        :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
+        :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
+        :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
+            create a referer header as if this request came from Google's search of this URL's domain.
+        :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
+        :param retries: The number of retries to do through httpx if the request failed for any reason. The default is 3 retries.
+        :param kwargs: Any additional keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
+        """
+        adaptor_arguments = tuple(self.adaptor_arguments.items())
+        response_object = await StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries=retries, adaptor_arguments=adaptor_arguments).async_post(**kwargs)
+        return response_object
+
+    async def delete(
+            self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True,
+            proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
+        """Make basic HTTP DELETE request for you but with some added flavors.
+
+        :param url: Target url
+        :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
+        :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
+        :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
+            create a referer header as if this request came from Google's search of this URL's domain.
+        :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
+        :param retries: The number of retries to do through httpx if the request failed for any reason. The default is 3 retries.
+        :param kwargs: Any additional keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
+        """
+        adaptor_arguments = tuple(self.adaptor_arguments.items())
+        response_object = await StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries=retries, adaptor_arguments=adaptor_arguments).async_delete(**kwargs)
         return response_object
 
 
```
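Taken together, these hunks add a `retries` knob to every HTTP verb and introduce `AsyncFetcher` as an awaitable mirror of `Fetcher` backed by the new `StaticEngine.async_*` methods. A usage sketch; note that the `AsyncFetcher.put` body shown above still calls `async_post`, so PUT requests through `AsyncFetcher` in 0.2.9 are actually sent as POST:

```python
import asyncio

from scrapling.fetchers import AsyncFetcher, Fetcher

# Sync: `retries` now defaults to 3 and is forwarded to StaticEngine/httpx.
page = Fetcher().get("https://example.com", timeout=10, retries=5)
print(page.status, page.reason)


async def main():
    # Async: identical signatures, just awaited.
    page = await AsyncFetcher().get("https://example.com", retries=5)
    print(page.status, page.reason)


asyncio.run(main())
```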
```diff
@@ -80,10 +174,10 @@ class StealthyFetcher(BaseFetcher):
     """
     def fetch(
             self, url: str, headless: Optional[Union[bool, Literal['virtual']]] = True, block_images: Optional[bool] = False, disable_resources: Optional[bool] = False,
-            block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] =
-            timeout: Optional[float] = 30000, page_action: Callable =
+            block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = True, network_idle: Optional[bool] = False, addons: Optional[List[str]] = None,
+            timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, humanize: Optional[Union[bool, float]] = True,
             wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None, proxy: Optional[Union[str, Dict[str, str]]] = None,
-            os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True,
+            os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True, geoip: Optional[bool] = False,
     ) -> Response:
         """
         Opens up a browser and do your request based on your chosen options below.
@@ -99,7 +193,9 @@ class StealthyFetcher(BaseFetcher):
         :param addons: List of Firefox addons to use. Must be paths to extracted addons.
         :param disable_ads: Enabled by default, this installs `uBlock Origin` addon on the browser if enabled.
         :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
-        :param allow_webgl:
+        :param allow_webgl: Enabled by default. Disabling it WebGL not recommended as many WAFs now checks if WebGL is enabled.
+        :param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, & spoof the WebRTC IP address.
+            It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
         :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
         :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
@@ -113,6 +209,7 @@ class StealthyFetcher(BaseFetcher):
         """
         engine = CamoufoxEngine(
             proxy=proxy,
+            geoip=geoip,
             addons=addons,
             timeout=timeout,
             headless=headless,
@@ -133,6 +230,64 @@ class StealthyFetcher(BaseFetcher):
         )
         return engine.fetch(url)
 
+    async def async_fetch(
+            self, url: str, headless: Optional[Union[bool, Literal['virtual']]] = True, block_images: Optional[bool] = False, disable_resources: Optional[bool] = False,
+            block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = True, network_idle: Optional[bool] = False, addons: Optional[List[str]] = None,
+            timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, humanize: Optional[Union[bool, float]] = True,
+            wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None, proxy: Optional[Union[str, Dict[str, str]]] = None,
+            os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True, geoip: Optional[bool] = False,
+    ) -> Response:
+        """
+        Opens up a browser and do your request based on your chosen options below.
+
+        :param url: Target url.
+        :param headless: Run the browser in headless/hidden (default), 'virtual' screen mode, or headful/visible mode.
+        :param block_images: Prevent the loading of images through Firefox preferences.
+            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
+        :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends but it made requests ~25% faster in my tests for some websites.
+            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
+            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
+        :param block_webrtc: Blocks WebRTC entirely.
+        :param addons: List of Firefox addons to use. Must be paths to extracted addons.
+        :param disable_ads: Enabled by default, this installs `uBlock Origin` addon on the browser if enabled.
+        :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
+        :param allow_webgl: Enabled by default. Disabling it WebGL not recommended as many WAFs now checks if WebGL is enabled.
+        :param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, & spoof the WebRTC IP address.
+            It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region.
+        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+        :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
+        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
+        :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
+        :param wait_selector: Wait for a specific css selector to be in a specific state.
+        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`.
+        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
+        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
+        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
+        """
+        engine = CamoufoxEngine(
+            proxy=proxy,
+            geoip=geoip,
+            addons=addons,
+            timeout=timeout,
+            headless=headless,
+            humanize=humanize,
+            disable_ads=disable_ads,
+            allow_webgl=allow_webgl,
+            page_action=page_action,
+            network_idle=network_idle,
+            block_images=block_images,
+            block_webrtc=block_webrtc,
+            os_randomize=os_randomize,
+            wait_selector=wait_selector,
+            google_search=google_search,
+            extra_headers=extra_headers,
+            disable_resources=disable_resources,
+            wait_selector_state=wait_selector_state,
+            adaptor_arguments=self.adaptor_arguments,
+        )
+        return await engine.async_fetch(url)
+
 
 class PlayWrightFetcher(BaseFetcher):
     """A `Fetcher` class type that provide many options, all of them are based on PlayWright.
```
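The new `geoip` flag is designed to pair with a proxy, and `async_fetch` exposes the whole Camoufox flow to asyncio code. A hedged sketch (the proxy address is the placeholder format from the docstrings):

```python
import asyncio

from scrapling.fetchers import StealthyFetcher


async def main():
    page = await StealthyFetcher().async_fetch(
        "https://example.com",
        geoip=True,  # derive timezone, locale, and WebRTC IP from the exit IP
        proxy="http://username:password@localhost:8030",
    )
    print(page.status)


asyncio.run(main())
```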
```diff
@@ -153,7 +308,7 @@ class PlayWrightFetcher(BaseFetcher):
     def fetch(
             self, url: str, headless: Union[bool, str] = True, disable_resources: bool = None,
             useragent: Optional[str] = None, network_idle: Optional[bool] = False, timeout: Optional[float] = 30000,
-            page_action: Optional[Callable] =
+            page_action: Optional[Callable] = None, wait_selector: Optional[str] = None, wait_selector_state: Optional[str] = 'attached',
             hide_canvas: Optional[bool] = False, disable_webgl: Optional[bool] = False, extra_headers: Optional[Dict[str, str]] = None, google_search: Optional[bool] = True,
             proxy: Optional[Union[str, Dict[str, str]]] = None, locale: Optional[str] = 'en-US',
             stealth: Optional[bool] = False, real_chrome: Optional[bool] = False,
@@ -210,6 +365,66 @@ class PlayWrightFetcher(BaseFetcher):
         )
         return engine.fetch(url)
 
+    async def async_fetch(
+            self, url: str, headless: Union[bool, str] = True, disable_resources: bool = None,
+            useragent: Optional[str] = None, network_idle: Optional[bool] = False, timeout: Optional[float] = 30000,
+            page_action: Optional[Callable] = None, wait_selector: Optional[str] = None, wait_selector_state: Optional[str] = 'attached',
+            hide_canvas: Optional[bool] = False, disable_webgl: Optional[bool] = False, extra_headers: Optional[Dict[str, str]] = None, google_search: Optional[bool] = True,
+            proxy: Optional[Union[str, Dict[str, str]]] = None, locale: Optional[str] = 'en-US',
+            stealth: Optional[bool] = False, real_chrome: Optional[bool] = False,
+            cdp_url: Optional[str] = None,
+            nstbrowser_mode: Optional[bool] = False, nstbrowser_config: Optional[Dict] = None,
+    ) -> Response:
+        """Opens up a browser and do your request based on your chosen options below.
+
+        :param url: Target url.
+        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
+        :param disable_resources: Drop requests of unnecessary resources for speed boost. It depends but it made requests ~25% faster in my tests for some websites.
+            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
+            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
+        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
+        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
+        :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
+        :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
+        :param wait_selector: Wait for a specific css selector to be in a specific state.
+        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`.
+        :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
+        :param real_chrome: If you have chrome browser installed on your device, enable this and the Fetcher will launch an instance of your browser and use it.
+        :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
+        :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
+        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
+        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
+        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
+        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
+        :param nstbrowser_mode: Enables NSTBrowser mode, it have to be used with `cdp_url` argument or it will get completely ignored.
+        :param nstbrowser_config: The config you want to send with requests to the NSTBrowser. If left empty, Scrapling defaults to an optimized NSTBrowser's docker browserless config.
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
+        """
+        engine = PlaywrightEngine(
+            proxy=proxy,
+            locale=locale,
+            timeout=timeout,
+            stealth=stealth,
+            cdp_url=cdp_url,
+            headless=headless,
+            useragent=useragent,
+            real_chrome=real_chrome,
+            page_action=page_action,
+            hide_canvas=hide_canvas,
+            network_idle=network_idle,
+            google_search=google_search,
+            extra_headers=extra_headers,
+            wait_selector=wait_selector,
+            disable_webgl=disable_webgl,
+            nstbrowser_mode=nstbrowser_mode,
+            nstbrowser_config=nstbrowser_config,
+            disable_resources=disable_resources,
+            wait_selector_state=wait_selector_state,
+            adaptor_arguments=self.adaptor_arguments,
+        )
+        return await engine.async_fetch(url)
+
 
 class CustomFetcher(BaseFetcher):
     def fetch(self, url: str, browser_engine, **kwargs) -> Response:
```