scrapling 0.2.7__py3-none-any.whl → 0.2.9__py3-none-any.whl
- scrapling/__init__.py +5 -4
- scrapling/core/_types.py +2 -3
- scrapling/core/custom_types.py +93 -11
- scrapling/core/storage_adaptors.py +9 -10
- scrapling/core/translator.py +6 -7
- scrapling/core/utils.py +35 -30
- scrapling/defaults.py +2 -1
- scrapling/engines/__init__.py +2 -2
- scrapling/engines/camo.py +96 -26
- scrapling/engines/constants.py +4 -4
- scrapling/engines/pw.py +166 -96
- scrapling/engines/static.py +94 -50
- scrapling/engines/toolbelt/__init__.py +6 -20
- scrapling/engines/toolbelt/custom.py +22 -23
- scrapling/engines/toolbelt/fingerprints.py +7 -7
- scrapling/engines/toolbelt/navigation.py +25 -12
- scrapling/fetchers.py +233 -17
- scrapling/parser.py +63 -28
- {scrapling-0.2.7.dist-info → scrapling-0.2.9.dist-info}/METADATA +41 -25
- scrapling-0.2.9.dist-info/RECORD +47 -0
- tests/fetchers/async/__init__.py +0 -0
- tests/fetchers/async/test_camoufox.py +95 -0
- tests/fetchers/async/test_httpx.py +83 -0
- tests/fetchers/async/test_playwright.py +99 -0
- tests/fetchers/sync/__init__.py +0 -0
- tests/fetchers/sync/test_camoufox.py +68 -0
- tests/fetchers/sync/test_httpx.py +82 -0
- tests/fetchers/sync/test_playwright.py +87 -0
- tests/fetchers/test_utils.py +90 -122
- tests/parser/test_automatch.py +64 -9
- tests/parser/test_general.py +263 -219
- scrapling-0.2.7.dist-info/RECORD +0 -42
- tests/fetchers/test_camoufox.py +0 -64
- tests/fetchers/test_httpx.py +0 -67
- tests/fetchers/test_playwright.py +0 -76
- {scrapling-0.2.7.dist-info → scrapling-0.2.9.dist-info}/LICENSE +0 -0
- {scrapling-0.2.7.dist-info → scrapling-0.2.9.dist-info}/WHEEL +0 -0
- {scrapling-0.2.7.dist-info → scrapling-0.2.9.dist-info}/top_level.txt +0 -0
scrapling/engines/toolbelt/custom.py
CHANGED
@@ -2,13 +2,13 @@
 Functions related to custom types or type checking
 """
 import inspect
-import logging
 from email.message import Message
 
+from scrapling.core._types import (Any, Callable, Dict, List, Optional, Tuple,
+                                   Type, Union)
 from scrapling.core.custom_types import MappingProxyType
+from scrapling.core.utils import log, lru_cache
 from scrapling.parser import Adaptor, SQLiteStorageSystem
-from scrapling.core.utils import setup_basic_logging, cache
-from scrapling.core._types import Any, List, Type, Union, Optional, Dict, Callable, Tuple
 
 
 class ResponseEncoding:
@@ -16,7 +16,7 @@ class ResponseEncoding:
     __ISO_8859_1_CONTENT_TYPES = {"text/plain", "text/html", "text/css", "text/javascript"}
 
     @classmethod
-    @cache(maxsize=None)
+    @lru_cache(maxsize=None)
     def __parse_content_type(cls, header_value: str) -> Tuple[str, Dict[str, str]]:
         """Parse content type and parameters from a content-type header value.
 
@@ -38,7 +38,7 @@ class ResponseEncoding:
         return content_type, params
 
     @classmethod
-    @cache(maxsize=None)
+    @lru_cache(maxsize=None)
     def get_value(cls, content_type: Optional[str], text: Optional[str] = 'test') -> str:
         """Determine the appropriate character encoding from a content-type header.
 
@@ -84,7 +84,10 @@
 class Response(Adaptor):
     """This class is returned by all engines as a way to unify response type between different libraries."""
 
-    def __init__(self, url: str, text: str, body: bytes, status: int, reason: str, cookies: Dict, headers: Dict, request_headers: Dict, encoding: str = 'utf-8', **adaptor_arguments: Dict):
+    _is_response_result_logged = False  # Class-level flag, initialized to False
+
+    def __init__(self, url: str, text: str, body: bytes, status: int, reason: str, cookies: Dict, headers: Dict, request_headers: Dict,
+                 encoding: str = 'utf-8', method: str = 'GET', **adaptor_arguments: Dict):
         automatch_domain = adaptor_arguments.pop('automatch_domain', None)
         self.status = status
         self.reason = reason
@@ -95,6 +98,10 @@ class Response(Adaptor):
         super().__init__(text=text, body=body, url=automatch_domain or url, encoding=encoding, **adaptor_arguments)
         # For back-ward compatibility
         self.adaptor = self
+        # For easier debugging while working from a Python shell
+        if not Response._is_response_result_logged:
+            log.info(f'Fetched ({status}) <{method} {url}> (referer: {request_headers.get("referer")})')
+            Response._is_response_result_logged = True
 
     # def __repr__(self):
     #     return f'<{self.__class__.__name__} [{self.status} {self.reason}]>'
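The class-level `_is_response_result_logged` flag above makes the fetch log fire only once per process. A minimal standalone sketch of this log-once pattern (illustrative, not Scrapling's exact code):

import logging

log = logging.getLogger('demo')

class OnceLoggedResponse:
    _logged_once = False  # class-level flag shared by all instances

    def __init__(self, status: int, url: str):
        if not OnceLoggedResponse._logged_once:
            log.info('Fetched (%s) <GET %s>', status, url)
            OnceLoggedResponse._logged_once = True  # later instances skip the log call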
@@ -103,8 +110,8 @@
 class BaseFetcher:
     def __init__(
             self, huge_tree: bool = True, keep_comments: Optional[bool] = False, auto_match: Optional[bool] = True,
-            storage: Any = SQLiteStorageSystem, storage_args: Optional[Dict] = None,
-            automatch_domain: Optional[str] = None,
+            storage: Any = SQLiteStorageSystem, storage_args: Optional[Dict] = None,
+            automatch_domain: Optional[str] = None, keep_cdata: Optional[bool] = False,
     ):
         """Arguments below are the same from the Adaptor class so you can pass them directly, the rest of Adaptor's arguments
         are detected and passed automatically from the Fetcher based on the response for accessibility.
@@ -112,6 +119,7 @@ class BaseFetcher:
         :param huge_tree: Enabled by default, should always be enabled when parsing large HTML documents. This controls
             libxml2 feature that forbids parsing certain large documents to protect from possible memory exhaustion.
         :param keep_comments: While parsing the HTML body, drop comments or not. Disabled by default for obvious reasons
+        :param keep_cdata: While parsing the HTML body, drop cdata or not. Disabled by default for cleaner HTML.
         :param auto_match: Globally turn-off the auto-match feature in all functions, this argument takes higher
             priority over all auto-match related arguments/functions in the class.
         :param storage: The storage class to be passed for auto-matching functionalities, see ``Docs`` for more info.
@@ -119,23 +127,20 @@
             If empty, default values will be used.
         :param automatch_domain: For cases where you want to automatch selectors across different websites as if they were on the same website, use this argument to unify them.
             Otherwise, the domain of the request is used by default.
-        :param debug: Enable debug mode
         """
         # Adaptor class parameters
         # I won't validate Adaptor's class parameters here again, I will leave it to be validated later
         self.adaptor_arguments = dict(
             huge_tree=huge_tree,
             keep_comments=keep_comments,
+            keep_cdata=keep_cdata,
             auto_match=auto_match,
             storage=storage,
-            storage_args=storage_args,
-            debug=debug,
+            storage_args=storage_args
         )
-        # If the user used fetchers first, then configure the logger from here instead of the `Adaptor` class
-        setup_basic_logging(level='debug' if debug else 'info')
         if automatch_domain:
             if type(automatch_domain) is not str:
-                logging.warning('[Ignored] The argument "automatch_domain" must be of string type')
+                log.warning('[Ignored] The argument "automatch_domain" must be of string type')
             else:
                 self.adaptor_arguments.update({'automatch_domain': automatch_domain})
 
@@ -211,7 +216,7 @@ class StatusText:
     })
 
     @classmethod
-    @cache(maxsize=128)
+    @lru_cache(maxsize=128)
     def get(cls, status_code: int) -> str:
         """Get the phrase for a given HTTP status code."""
         return cls._phrases.get(status_code, "Unknown Status Code")
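Stacking `@classmethod` over `@lru_cache` caches lookups keyed on `(cls, status_code)`, so repeated status lookups cost at most one dict access. A standalone sketch with a hypothetical two-entry phrase table:

from functools import lru_cache

class StatusText:
    _phrases = {200: 'OK', 404: 'Not Found'}

    @classmethod
    @lru_cache(maxsize=128)
    def get(cls, status_code: int) -> str:
        return cls._phrases.get(status_code, 'Unknown Status Code')

print(StatusText.get(404))  # 'Not Found'; an identical second call is served from the cache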
@@ -278,7 +283,7 @@ def check_type_validity(variable: Any, valid_types: Union[List[Type], None], def
         error_msg = f'Argument "{var_name}" cannot be None'
         if critical:
             raise TypeError(error_msg)
-        logging.error(f'[Ignored] {error_msg}')
+        log.error(f'[Ignored] {error_msg}')
         return default_value
 
     # If no valid_types specified and variable has a value, return it
@@ -291,13 +296,7 @@ def check_type_validity(variable: Any, valid_types: Union[List[Type], None], def
         error_msg = f'Argument "{var_name}" must be of type {" or ".join(type_names)}'
         if critical:
             raise TypeError(error_msg)
-        logging.error(f'[Ignored] {error_msg}')
+        log.error(f'[Ignored] {error_msg}')
         return default_value
 
     return variable
-
-
-# Pew Pew
-def do_nothing(page):
-    # Just works as a filler for `page_action` argument in browser engines
-    return page
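With the `do_nothing` filler removed, `page_action` defaults to `None` in the fetchers below, so the engines presumably guard the callback before invoking it. A hedged sketch of that guard (assumed pattern, not copied from the engines):

from typing import Callable, Optional

def apply_page_action(page, page_action: Optional[Callable] = None):
    # Only call the user's automation hook when one was actually provided
    if page_action is not None:
        page = page_action(page)  # the hook must return the page object
    return page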
scrapling/engines/toolbelt/fingerprints.py
CHANGED
@@ -4,15 +4,15 @@ Functions related to generating headers and fingerprints generally
 
 import platform
 
-from browserforge.headers import Browser, HeaderGenerator
-from browserforge.fingerprints import Fingerprint, FingerprintGenerator
-
+from browserforge.fingerprints import Fingerprint, FingerprintGenerator
+from browserforge.headers import Browser, HeaderGenerator
 from tldextract import extract
-
-from scrapling.core.utils import cache
+
+from scrapling.core._types import Dict, Union
+from scrapling.core.utils import lru_cache
 
 
-@cache(None, typed=True)
+@lru_cache(None, typed=True)
 def generate_convincing_referer(url: str) -> str:
     """Takes the domain from the URL without the subdomain/suffix and make it look like you were searching google for this website
 
@@ -26,7 +26,7 @@ def generate_convincing_referer(url: str) -> str:
     return f'https://www.google.com/search?q={website_name}'
 
 
-@cache(None, typed=True)
+@lru_cache(None, typed=True)
 def get_os_name() -> Union[str, None]:
     """Get the current OS name in the same format needed for browserforge
 
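`generate_convincing_referer` reduces to extracting the registrable domain and embedding it in a Google search URL, which matches the context lines above. A standalone sketch (requires `tldextract`):

from tldextract import extract

def fake_google_referer(url: str) -> str:
    website_name = extract(url).domain  # domain without the subdomain/suffix
    return f'https://www.google.com/search?q={website_name}'

print(fake_google_referer('https://www.example.co.uk/page'))
# https://www.google.com/search?q=example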
scrapling/engines/toolbelt/navigation.py
CHANGED
@@ -1,28 +1,41 @@
 """
 Functions related to files and URLs
 """
-
 import os
-import logging
-from urllib.parse import urlencode, urlparse
+from urllib.parse import urlencode, urlparse
+
+from playwright.async_api import Route as async_Route
+from playwright.sync_api import Route
 
-from scrapling.core.utils import cache
-from scrapling.core._types import Union, Dict, Optional
+from scrapling.core._types import Dict, Optional, Union
+from scrapling.core.utils import log, lru_cache
 from scrapling.engines.constants import DEFAULT_DISABLED_RESOURCES
 
-from playwright.sync_api import Route
+
+def intercept_route(route: Route):
+    """This is just a route handler but it drops requests that its type falls in `DEFAULT_DISABLED_RESOURCES`
+
+    :param route: PlayWright `Route` object of the current page
+    :return: PlayWright `Route` object
+    """
+    if route.request.resource_type in DEFAULT_DISABLED_RESOURCES:
+        log.debug(f'Blocking background resource "{route.request.url}" of type "{route.request.resource_type}"')
+        route.abort()
+    else:
+        route.continue_()
 
 
-def intercept_route(route: Route):
+async def async_intercept_route(route: async_Route):
     """This is just a route handler but it drops requests that its type falls in `DEFAULT_DISABLED_RESOURCES`
 
     :param route: PlayWright `Route` object of the current page
     :return: PlayWright `Route` object
     """
     if route.request.resource_type in DEFAULT_DISABLED_RESOURCES:
-        route.abort()
-    else:
-        route.continue_()
+        log.debug(f'Blocking background resource "{route.request.url}" of type "{route.request.resource_type}"')
+        await route.abort()
+    else:
+        await route.continue_()
 
 
 def construct_proxy_dict(proxy_string: Union[str, Dict[str, str]]) -> Union[Dict, None]:
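Handlers like `intercept_route` are registered on a Playwright page so every request passes through them; the registration itself is not part of this diff. An illustrative wiring using the standard Playwright API:

from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch()
    page = browser.new_page()
    page.route('**/*', intercept_route)  # route every request through the filter
    page.goto('https://example.com')
    browser.close()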
@@ -43,7 +56,7 @@ def construct_proxy_dict(proxy_string: Union[str, Dict[str, str]]) -> Union[Dict
             }
         except ValueError:
             # Urllib will say that one of the parameters above can't be casted to the correct type like `int` for port etc...
-            raise TypeError(f'The proxy argument\'s string is in invalid format!')
+            raise TypeError('The proxy argument\'s string is in invalid format!')
 
     elif isinstance(proxy_string, dict):
         valid_keys = ('server', 'username', 'password', )
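`construct_proxy_dict` normalizes a proxy URL string into the server/username/password mapping the browser engines expect. A minimal standard-library equivalent of that conversion:

from urllib.parse import urlparse

def proxy_string_to_dict(proxy_string: str) -> dict:
    parsed = urlparse(proxy_string)
    return {
        'server': f'{parsed.scheme}://{parsed.hostname}:{parsed.port}',
        'username': parsed.username or '',
        'password': parsed.password or '',
    }

print(proxy_string_to_dict('http://username:password@localhost:8030'))
# {'server': 'http://localhost:8030', 'username': 'username', 'password': 'password'}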
@@ -97,7 +110,7 @@ def construct_cdp_url(cdp_url: str, query_params: Optional[Dict] = None) -> str:
         raise ValueError(f"Invalid CDP URL: {str(e)}")
 
 
-@cache(None, typed=True)
+@lru_cache(None, typed=True)
 def js_bypass_path(filename: str) -> str:
     """Takes the base filename of JS file inside the `bypasses` folder then return the full path of it
 
scrapling/fetchers.py
CHANGED
@@ -1,7 +1,8 @@
-from scrapling.core._types import Dict, Optional, Union, Callable, List, Literal
-
-from scrapling.engines import check_if_engine_usable, CamoufoxEngine, PlaywrightEngine, StaticEngine
-from scrapling.engines.toolbelt import BaseFetcher, Response, do_nothing
+from scrapling.core._types import (Callable, Dict, List, Literal, Optional,
+                                   Union)
+from scrapling.engines import (CamoufoxEngine, PlaywrightEngine, StaticEngine,
+                               check_if_engine_usable)
+from scrapling.engines.toolbelt import BaseFetcher, Response
 
 
 class Fetcher(BaseFetcher):
@@ -9,7 +10,9 @@ class Fetcher(BaseFetcher):
 
     Any additional keyword arguments passed to the methods below are passed to the respective httpx's method directly.
     """
-    def get(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, proxy: Optional[str] = None, **kwargs: Dict) -> Response:
+    def get(
+            self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True,
+            proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
         """Make basic HTTP GET request for you but with some added flavors.
 
         :param url: Target url.
@@ -18,13 +21,17 @@ class Fetcher(BaseFetcher):
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
             create a referer header as if this request had came from Google's search of this URL's domain.
         :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
+        :param retries: The number of retries to do through httpx if the request failed for any reason. The default is 3 retries.
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-        response_object = StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).get(**kwargs)
+        adaptor_arguments = tuple(self.adaptor_arguments.items())
+        response_object = StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries, adaptor_arguments=adaptor_arguments).get(**kwargs)
         return response_object
 
-    def post(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, proxy: Optional[str] = None, **kwargs: Dict) -> Response:
+    def post(
+            self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True,
+            proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
         """Make basic HTTP POST request for you but with some added flavors.
 
         :param url: Target url.
@@ -33,13 +40,17 @@ class Fetcher(BaseFetcher):
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
             create a referer header as if this request came from Google's search of this URL's domain.
         :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
+        :param retries: The number of retries to do through httpx if the request failed for any reason. The default is 3 retries.
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-        response_object = StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).post(**kwargs)
+        adaptor_arguments = tuple(self.adaptor_arguments.items())
+        response_object = StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries, adaptor_arguments=adaptor_arguments).post(**kwargs)
         return response_object
 
-    def put(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, proxy: Optional[str] = None, **kwargs: Dict) -> Response:
+    def put(
+            self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True,
+            proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
         """Make basic HTTP PUT request for you but with some added flavors.
 
         :param url: Target url
@@ -48,14 +59,96 @@
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
             create a referer header as if this request came from Google's search of this URL's domain.
         :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
+        :param retries: The number of retries to do through httpx if the request failed for any reason. The default is 3 retries.
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
 
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-        response_object = StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).put(**kwargs)
+        adaptor_arguments = tuple(self.adaptor_arguments.items())
+        response_object = StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries, adaptor_arguments=adaptor_arguments).put(**kwargs)
+        return response_object
+
+    def delete(
+            self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True,
+            proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
+        """Make basic HTTP DELETE request for you but with some added flavors.
+
+        :param url: Target url
+        :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
+        :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
+        :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
+            create a referer header as if this request came from Google's search of this URL's domain.
+        :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
+        :param retries: The number of retries to do through httpx if the request failed for any reason. The default is 3 retries.
+        :param kwargs: Any additional keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
+        """
+        adaptor_arguments = tuple(self.adaptor_arguments.items())
+        response_object = StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries, adaptor_arguments=adaptor_arguments).delete(**kwargs)
+        return response_object
+
+
+class AsyncFetcher(Fetcher):
+    async def get(
+            self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True,
+            proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
+        """Make basic HTTP GET request for you but with some added flavors.
+
+        :param url: Target url.
+        :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
+        :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
+        :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
+            create a referer header as if this request had came from Google's search of this URL's domain.
+        :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
+        :param retries: The number of retries to do through httpx if the request failed for any reason. The default is 3 retries.
+        :param kwargs: Any additional keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
+        """
+        adaptor_arguments = tuple(self.adaptor_arguments.items())
+        response_object = await StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries=retries, adaptor_arguments=adaptor_arguments).async_get(**kwargs)
+        return response_object
+
+    async def post(
+            self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True,
+            proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
+        """Make basic HTTP POST request for you but with some added flavors.
+
+        :param url: Target url.
+        :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
+        :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
+        :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
+            create a referer header as if this request came from Google's search of this URL's domain.
+        :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
+        :param retries: The number of retries to do through httpx if the request failed for any reason. The default is 3 retries.
+        :param kwargs: Any additional keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
+        """
+        adaptor_arguments = tuple(self.adaptor_arguments.items())
+        response_object = await StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries=retries, adaptor_arguments=adaptor_arguments).async_post(**kwargs)
+        return response_object
+
+    async def put(
+            self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True,
+            proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
+        """Make basic HTTP PUT request for you but with some added flavors.
+
+        :param url: Target url
+        :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
+        :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
+        :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
+            create a referer header as if this request came from Google's search of this URL's domain.
+        :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
+        :param retries: The number of retries to do through httpx if the request failed for any reason. The default is 3 retries.
+        :param kwargs: Any additional keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
+        """
+        adaptor_arguments = tuple(self.adaptor_arguments.items())
+        response_object = await StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries=retries, adaptor_arguments=adaptor_arguments).async_post(**kwargs)
         return response_object
 
-    def delete(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, proxy: Optional[str] = None, **kwargs: Dict) -> Response:
+    async def delete(
+            self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True,
+            proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
         """Make basic HTTP DELETE request for you but with some added flavors.
 
         :param url: Target url
@@ -64,10 +157,12 @@ class Fetcher(BaseFetcher):
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
             create a referer header as if this request came from Google's search of this URL's domain.
         :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
+        :param retries: The number of retries to do through httpx if the request failed for any reason. The default is 3 retries.
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-        response_object = StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).delete(**kwargs)
+        adaptor_arguments = tuple(self.adaptor_arguments.items())
+        response_object = await StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries=retries, adaptor_arguments=adaptor_arguments).async_delete(**kwargs)
         return response_object
 
 
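Together, these hunks give the sync `Fetcher` and the new `AsyncFetcher` the same verb methods plus the `retries` knob. A usage sketch, assuming the package is installed and the classes are exported as defined above:

import asyncio

from scrapling.fetchers import AsyncFetcher, Fetcher

page = Fetcher().get('https://example.com', timeout=15, retries=5)
print(page.status, page.reason)

async def main():
    page = await AsyncFetcher().get('https://example.com', retries=5)
    print(page.status)

asyncio.run(main())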
@@ -79,10 +174,10 @@ class StealthyFetcher(BaseFetcher):
     """
     def fetch(
             self, url: str, headless: Optional[Union[bool, Literal['virtual']]] = True, block_images: Optional[bool] = False, disable_resources: Optional[bool] = False,
-            block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = False, network_idle: Optional[bool] = False, addons: Optional[List[str]] = None,
-            timeout: Optional[float] = 30000, page_action: Callable = do_nothing, wait_selector: Optional[str] = None, humanize: Optional[Union[bool, float]] = True,
+            block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = True, network_idle: Optional[bool] = False, addons: Optional[List[str]] = None,
+            timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, humanize: Optional[Union[bool, float]] = True,
             wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None, proxy: Optional[Union[str, Dict[str, str]]] = None,
-            os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True,
+            os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True, geoip: Optional[bool] = False,
     ) -> Response:
         """
         Opens up a browser and do your request based on your chosen options below.
@@ -98,7 +193,9 @@ class StealthyFetcher(BaseFetcher):
         :param addons: List of Firefox addons to use. Must be paths to extracted addons.
         :param disable_ads: Enabled by default, this installs `uBlock Origin` addon on the browser if enabled.
         :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
-        :param allow_webgl:
+        :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
+        :param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, & spoof the WebRTC IP address.
+            It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
         :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
         :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
@@ -112,6 +209,7 @@ class StealthyFetcher(BaseFetcher):
         """
         engine = CamoufoxEngine(
             proxy=proxy,
+            geoip=geoip,
             addons=addons,
             timeout=timeout,
             headless=headless,
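The new `geoip` flag is passed straight through to `CamoufoxEngine`. An illustrative call combining it with a proxy, as the docstring recommends:

from scrapling.fetchers import StealthyFetcher

page = StealthyFetcher().fetch(
    'https://example.com',
    proxy='http://username:password@localhost:8030',
    geoip=True,  # derive timezone, locale, and the spoofed WebRTC IP from the proxy's exit IP
)
print(page.status)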
@@ -132,6 +230,64 @@
         )
         return engine.fetch(url)
 
+    async def async_fetch(
+            self, url: str, headless: Optional[Union[bool, Literal['virtual']]] = True, block_images: Optional[bool] = False, disable_resources: Optional[bool] = False,
+            block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = True, network_idle: Optional[bool] = False, addons: Optional[List[str]] = None,
+            timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, humanize: Optional[Union[bool, float]] = True,
+            wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None, proxy: Optional[Union[str, Dict[str, str]]] = None,
+            os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True, geoip: Optional[bool] = False,
+    ) -> Response:
+        """
+        Opens up a browser and do your request based on your chosen options below.
+
+        :param url: Target url.
+        :param headless: Run the browser in headless/hidden (default), 'virtual' screen mode, or headful/visible mode.
+        :param block_images: Prevent the loading of images through Firefox preferences.
+            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
+        :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends but it made requests ~25% faster in my tests for some websites.
+            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
+            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
+        :param block_webrtc: Blocks WebRTC entirely.
+        :param addons: List of Firefox addons to use. Must be paths to extracted addons.
+        :param disable_ads: Enabled by default, this installs `uBlock Origin` addon on the browser if enabled.
+        :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
+        :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
+        :param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, & spoof the WebRTC IP address.
+            It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region.
+        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+        :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
+        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
+        :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
+        :param wait_selector: Wait for a specific css selector to be in a specific state.
+        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`.
+        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
+        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
+        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
+        """
+        engine = CamoufoxEngine(
+            proxy=proxy,
+            geoip=geoip,
+            addons=addons,
+            timeout=timeout,
+            headless=headless,
+            humanize=humanize,
+            disable_ads=disable_ads,
+            allow_webgl=allow_webgl,
+            page_action=page_action,
+            network_idle=network_idle,
+            block_images=block_images,
+            block_webrtc=block_webrtc,
+            os_randomize=os_randomize,
+            wait_selector=wait_selector,
+            google_search=google_search,
+            extra_headers=extra_headers,
+            disable_resources=disable_resources,
+            wait_selector_state=wait_selector_state,
+            adaptor_arguments=self.adaptor_arguments,
+        )
+        return await engine.async_fetch(url)
+
 
 class PlayWrightFetcher(BaseFetcher):
     """A `Fetcher` class type that provide many options, all of them are based on PlayWright.
@@ -152,7 +308,7 @@ class PlayWrightFetcher(BaseFetcher):
     def fetch(
             self, url: str, headless: Union[bool, str] = True, disable_resources: bool = None,
             useragent: Optional[str] = None, network_idle: Optional[bool] = False, timeout: Optional[float] = 30000,
-            page_action: Optional[Callable] = do_nothing, wait_selector: Optional[str] = None, wait_selector_state: Optional[str] = 'attached',
+            page_action: Optional[Callable] = None, wait_selector: Optional[str] = None, wait_selector_state: Optional[str] = 'attached',
             hide_canvas: Optional[bool] = False, disable_webgl: Optional[bool] = False, extra_headers: Optional[Dict[str, str]] = None, google_search: Optional[bool] = True,
             proxy: Optional[Union[str, Dict[str, str]]] = None, locale: Optional[str] = 'en-US',
             stealth: Optional[bool] = False, real_chrome: Optional[bool] = False,
@@ -209,6 +365,66 @@
         )
         return engine.fetch(url)
 
+    async def async_fetch(
+            self, url: str, headless: Union[bool, str] = True, disable_resources: bool = None,
+            useragent: Optional[str] = None, network_idle: Optional[bool] = False, timeout: Optional[float] = 30000,
+            page_action: Optional[Callable] = None, wait_selector: Optional[str] = None, wait_selector_state: Optional[str] = 'attached',
+            hide_canvas: Optional[bool] = False, disable_webgl: Optional[bool] = False, extra_headers: Optional[Dict[str, str]] = None, google_search: Optional[bool] = True,
+            proxy: Optional[Union[str, Dict[str, str]]] = None, locale: Optional[str] = 'en-US',
+            stealth: Optional[bool] = False, real_chrome: Optional[bool] = False,
+            cdp_url: Optional[str] = None,
+            nstbrowser_mode: Optional[bool] = False, nstbrowser_config: Optional[Dict] = None,
+    ) -> Response:
+        """Opens up a browser and do your request based on your chosen options below.
+
+        :param url: Target url.
+        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
+        :param disable_resources: Drop requests of unnecessary resources for speed boost. It depends but it made requests ~25% faster in my tests for some websites.
+            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
+            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
+        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
+        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
+        :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
+        :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
+        :param wait_selector: Wait for a specific css selector to be in a specific state.
+        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`.
+        :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
+        :param real_chrome: If you have chrome browser installed on your device, enable this and the Fetcher will launch an instance of your browser and use it.
+        :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
+        :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
+        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
+        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
+        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
+        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
+        :param nstbrowser_mode: Enables NSTBrowser mode, it have to be used with `cdp_url` argument or it will get completely ignored.
+        :param nstbrowser_config: The config you want to send with requests to the NSTBrowser. If left empty, Scrapling defaults to an optimized NSTBrowser's docker browserless config.
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
+        """
+        engine = PlaywrightEngine(
+            proxy=proxy,
+            locale=locale,
+            timeout=timeout,
+            stealth=stealth,
+            cdp_url=cdp_url,
+            headless=headless,
+            useragent=useragent,
+            real_chrome=real_chrome,
+            page_action=page_action,
+            hide_canvas=hide_canvas,
+            network_idle=network_idle,
+            google_search=google_search,
+            extra_headers=extra_headers,
+            wait_selector=wait_selector,
+            disable_webgl=disable_webgl,
+            nstbrowser_mode=nstbrowser_mode,
+            nstbrowser_config=nstbrowser_config,
+            disable_resources=disable_resources,
+            wait_selector_state=wait_selector_state,
+            adaptor_arguments=self.adaptor_arguments,
+        )
+        return await engine.async_fetch(url)
+
 
 class CustomFetcher(BaseFetcher):
     def fetch(self, url: str, browser_engine, **kwargs) -> Response: