scrapling 0.2.98__py3-none-any.whl → 0.2.99__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +1 -1
- scrapling/defaults.py +11 -5
- scrapling/engines/camo.py +42 -2
- scrapling/engines/pw.py +38 -1
- scrapling/engines/toolbelt/custom.py +65 -29
- scrapling/fetchers.py +142 -41
- scrapling/parser.py +1 -1
- scrapling-0.2.99.dist-info/METADATA +290 -0
- {scrapling-0.2.98.dist-info → scrapling-0.2.99.dist-info}/RECORD +19 -19
- {scrapling-0.2.98.dist-info → scrapling-0.2.99.dist-info}/WHEEL +1 -1
- tests/fetchers/async/test_camoufox.py +3 -1
- tests/fetchers/async/test_httpx.py +3 -1
- tests/fetchers/async/test_playwright.py +3 -1
- tests/fetchers/sync/test_camoufox.py +3 -1
- tests/fetchers/sync/test_httpx.py +3 -1
- tests/fetchers/sync/test_playwright.py +3 -1
- scrapling-0.2.98.dist-info/METADATA +0 -867
- {scrapling-0.2.98.dist-info → scrapling-0.2.99.dist-info}/entry_points.txt +0 -0
- {scrapling-0.2.98.dist-info → scrapling-0.2.99.dist-info/licenses}/LICENSE +0 -0
- {scrapling-0.2.98.dist-info → scrapling-0.2.99.dist-info}/top_level.txt +0 -0
scrapling/fetchers.py
CHANGED
@@ -10,9 +10,10 @@ class Fetcher(BaseFetcher):
 
     Any additional keyword arguments passed to the methods below are passed to the respective httpx's method directly.
     """
+    @classmethod
     def get(
-
-            proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
+            cls, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
+            proxy: Optional[str] = None, retries: Optional[int] = 3, custom_config: Dict = None, **kwargs: Dict) -> Response:
         """Make basic HTTP GET request for you but with some added flavors.
 
         :param url: Target url.
@@ -22,16 +23,23 @@ class Fetcher(BaseFetcher):
             create a referer header as if this request had came from Google's search of this URL's domain.
         :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
         :param retries: The number of retries to do through httpx if the request failed for any reason. The default is 3 retries.
+        :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-
+        if not custom_config:
+            custom_config = {}
+        elif not isinstance(custom_config, dict):
+            ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
+
+        adaptor_arguments = tuple({**cls._generate_parser_arguments(), **custom_config}.items())
         response_object = StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries, adaptor_arguments=adaptor_arguments).get(**kwargs)
         return response_object
 
+    @classmethod
     def post(
-
-            proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
+            cls, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
+            proxy: Optional[str] = None, retries: Optional[int] = 3, custom_config: Dict = None, **kwargs: Dict) -> Response:
         """Make basic HTTP POST request for you but with some added flavors.
 
         :param url: Target url.
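Every one of the eight rewritten methods repeats this exact prologue: `custom_config` defaults to an empty dict and is merged over the class-level parser arguments with a dict union, so per-request keys win. Below is a minimal sketch of that precedence; the keys are illustrative stand-ins, not scrapling's actual parser defaults. Note that the `elif` branch as committed only constructs the `ValueError` — Python discards an exception instance that is never raised — so a non-dict `custom_config` would fall through to the merge rather than fail with this message:

    # Illustrative sketch of the merge semantics above; the keys are made up.
    class_level = {'keep_comments': False, 'keep_cdata': False}  # stands in for cls._generate_parser_arguments()
    custom_config = {'keep_comments': True}                      # per-request override

    # Later keys win in a dict union, so the request-level value takes effect.
    adaptor_arguments = tuple({**class_level, **custom_config}.items())
    assert dict(adaptor_arguments)['keep_comments'] is True

    # A raising variant of the type check (the diff builds the error but never raises it):
    if not isinstance(custom_config, dict):
        raise ValueError(f"The custom parser config must be of type dictionary, got {type(custom_config)}")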
@@ -41,16 +49,23 @@ class Fetcher(BaseFetcher):
             create a referer header as if this request came from Google's search of this URL's domain.
         :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
         :param retries: The number of retries to do through httpx if the request failed for any reason. The default is 3 retries.
+        :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-
+        if not custom_config:
+            custom_config = {}
+        elif not isinstance(custom_config, dict):
+            ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
+
+        adaptor_arguments = tuple({**cls._generate_parser_arguments(), **custom_config}.items())
         response_object = StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries, adaptor_arguments=adaptor_arguments).post(**kwargs)
         return response_object
 
+    @classmethod
     def put(
-
-            proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
+            cls, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
+            proxy: Optional[str] = None, retries: Optional[int] = 3, custom_config: Dict = None, **kwargs: Dict) -> Response:
         """Make basic HTTP PUT request for you but with some added flavors.
 
         :param url: Target url
@@ -60,17 +75,24 @@ class Fetcher(BaseFetcher):
             create a referer header as if this request came from Google's search of this URL's domain.
         :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
         :param retries: The number of retries to do through httpx if the request failed for any reason. The default is 3 retries.
+        :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
 
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-
+        if not custom_config:
+            custom_config = {}
+        elif not isinstance(custom_config, dict):
+            ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
+
+        adaptor_arguments = tuple({**cls._generate_parser_arguments(), **custom_config}.items())
         response_object = StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries, adaptor_arguments=adaptor_arguments).put(**kwargs)
         return response_object
 
+    @classmethod
     def delete(
-
-            proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
+            cls, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
+            proxy: Optional[str] = None, retries: Optional[int] = 3, custom_config: Dict = None, **kwargs: Dict) -> Response:
         """Make basic HTTP DELETE request for you but with some added flavors.
 
         :param url: Target url
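Since every method now carries `@classmethod`, the HTTP shortcuts are called straight on the class, with no instance. A usage sketch against the 0.2.99 signatures above; the URL is a placeholder and `keep_comments` is only an illustrative parser key:

    from scrapling.fetchers import Fetcher

    # Plain GET with a per-request parser override; no Fetcher() instantiation needed.
    page = Fetcher.get(
        'https://example.com',                  # placeholder target
        timeout=10,
        retries=3,
        custom_config={'keep_comments': True},  # overrides the class-level parser setting for this call only
    )
    print(page.status, page.reason)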
@@ -80,18 +102,25 @@ class Fetcher(BaseFetcher):
             create a referer header as if this request came from Google's search of this URL's domain.
         :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
         :param retries: The number of retries to do through httpx if the request failed for any reason. The default is 3 retries.
+        :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-
+        if not custom_config:
+            custom_config = {}
+        elif not isinstance(custom_config, dict):
+            ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
+
+        adaptor_arguments = tuple({**cls._generate_parser_arguments(), **custom_config}.items())
         response_object = StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries, adaptor_arguments=adaptor_arguments).delete(**kwargs)
         return response_object
 
 
 class AsyncFetcher(Fetcher):
+    @classmethod
     async def get(
-
-            proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
+            cls, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
+            proxy: Optional[str] = None, retries: Optional[int] = 3, custom_config: Dict = None, **kwargs: Dict) -> Response:
         """Make basic HTTP GET request for you but with some added flavors.
 
         :param url: Target url.
@@ -101,16 +130,23 @@ class AsyncFetcher(Fetcher):
             create a referer header as if this request had came from Google's search of this URL's domain.
         :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
         :param retries: The number of retries to do through httpx if the request failed for any reason. The default is 3 retries.
+        :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-
+        if not custom_config:
+            custom_config = {}
+        elif not isinstance(custom_config, dict):
+            ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
+
+        adaptor_arguments = tuple({**cls._generate_parser_arguments(), **custom_config}.items())
         response_object = await StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries=retries, adaptor_arguments=adaptor_arguments).async_get(**kwargs)
         return response_object
 
+    @classmethod
     async def post(
-
-            proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
+            cls, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
+            proxy: Optional[str] = None, retries: Optional[int] = 3, custom_config: Dict = None, **kwargs: Dict) -> Response:
         """Make basic HTTP POST request for you but with some added flavors.
 
         :param url: Target url.
@@ -120,16 +156,23 @@ class AsyncFetcher(Fetcher):
             create a referer header as if this request came from Google's search of this URL's domain.
         :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
         :param retries: The number of retries to do through httpx if the request failed for any reason. The default is 3 retries.
+        :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-
+        if not custom_config:
+            custom_config = {}
+        elif not isinstance(custom_config, dict):
+            ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
+
+        adaptor_arguments = tuple({**cls._generate_parser_arguments(), **custom_config}.items())
         response_object = await StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries=retries, adaptor_arguments=adaptor_arguments).async_post(**kwargs)
         return response_object
 
+    @classmethod
     async def put(
-
-            proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
+            cls, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
+            proxy: Optional[str] = None, retries: Optional[int] = 3, custom_config: Dict = None, **kwargs: Dict) -> Response:
         """Make basic HTTP PUT request for you but with some added flavors.
 
         :param url: Target url
@@ -139,16 +182,23 @@ class AsyncFetcher(Fetcher):
             create a referer header as if this request came from Google's search of this URL's domain.
         :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
         :param retries: The number of retries to do through httpx if the request failed for any reason. The default is 3 retries.
+        :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-
+        if not custom_config:
+            custom_config = {}
+        elif not isinstance(custom_config, dict):
+            ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
+
+        adaptor_arguments = tuple({**cls._generate_parser_arguments(), **custom_config}.items())
         response_object = await StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries=retries, adaptor_arguments=adaptor_arguments).async_put(**kwargs)
         return response_object
 
+    @classmethod
     async def delete(
-
-            proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
+            cls, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
+            proxy: Optional[str] = None, retries: Optional[int] = 3, custom_config: Dict = None, **kwargs: Dict) -> Response:
         """Make basic HTTP DELETE request for you but with some added flavors.
 
         :param url: Target url
@@ -158,10 +208,16 @@ class AsyncFetcher(Fetcher):
             create a referer header as if this request came from Google's search of this URL's domain.
         :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
         :param retries: The number of retries to do through httpx if the request failed for any reason. The default is 3 retries.
+        :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-
+        if not custom_config:
+            custom_config = {}
+        elif not isinstance(custom_config, dict):
+            ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
+
+        adaptor_arguments = tuple({**cls._generate_parser_arguments(), **custom_config}.items())
         response_object = await StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries=retries, adaptor_arguments=adaptor_arguments).async_delete(**kwargs)
         return response_object
 
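The async methods mirror the sync signatures exactly, `custom_config` included, so the same call shape works under `await`. A sketch with a placeholder URL and an illustrative parser key:

    import asyncio

    from scrapling.fetchers import AsyncFetcher

    async def main():
        # Identical arguments to the sync Fetcher.get, awaited.
        page = await AsyncFetcher.get(
            'https://example.com',                  # placeholder target
            custom_config={'keep_comments': True},  # illustrative parser override
        )
        print(page.status)

    asyncio.run(main())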
@@ -172,12 +228,14 @@ class StealthyFetcher(BaseFetcher):
     It works as real browsers passing almost all online tests/protections based on Camoufox.
     Other added flavors include setting the faked OS fingerprints to match the user's OS and the referer of every request is set as if this request came from Google's search of this URL's domain.
     """
+    @classmethod
     def fetch(
-
-            block_webrtc: bool = False, allow_webgl: bool = True, network_idle: bool = False, addons: Optional[List[str]] = None,
+            cls, url: str, headless: Union[bool, Literal['virtual']] = True, block_images: bool = False, disable_resources: bool = False,
+            block_webrtc: bool = False, allow_webgl: bool = True, network_idle: bool = False, addons: Optional[List[str]] = None, wait: Optional[int] = 0,
             timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, humanize: Optional[Union[bool, float]] = True,
             wait_selector_state: SelectorWaitStates = 'attached', google_search: bool = True, extra_headers: Optional[Dict[str, str]] = None,
             proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: bool = False, disable_ads: bool = False, geoip: bool = False,
+            custom_config: Dict = None, additional_arguments: Dict = None
     ) -> Response:
         """
         Opens up a browser and do your request based on your chosen options below.
@@ -198,16 +256,25 @@ class StealthyFetcher(BaseFetcher):
             It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
        :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
-        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
+        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000.
+        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning `Response` object.
         :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
         :param wait_selector: Wait for a specific css selector to be in a specific state.
         :param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`.
         :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
         :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
         :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
+        :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
+        :param additional_arguments: Additional arguments to be passed to Camoufox as additional settings and it takes higher priority than Scrapling's settings.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
+        if not custom_config:
+            custom_config = {}
+        elif not isinstance(custom_config, dict):
+            ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
+
         engine = CamoufoxEngine(
+            wait=wait,
             proxy=proxy,
             geoip=geoip,
             addons=addons,
@@ -226,16 +293,19 @@ class StealthyFetcher(BaseFetcher):
             extra_headers=extra_headers,
             disable_resources=disable_resources,
             wait_selector_state=wait_selector_state,
-            adaptor_arguments=
+            adaptor_arguments={**cls._generate_parser_arguments(), **custom_config},
+            additional_arguments=additional_arguments or {}
         )
         return engine.fetch(url)
 
+    @classmethod
     async def async_fetch(
-
-            block_webrtc: bool = False, allow_webgl: bool = True, network_idle: bool = False, addons: Optional[List[str]] = None,
+            cls, url: str, headless: Union[bool, Literal['virtual']] = True, block_images: bool = False, disable_resources: bool = False,
+            block_webrtc: bool = False, allow_webgl: bool = True, network_idle: bool = False, addons: Optional[List[str]] = None, wait: Optional[int] = 0,
             timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, humanize: Optional[Union[bool, float]] = True,
             wait_selector_state: SelectorWaitStates = 'attached', google_search: bool = True, extra_headers: Optional[Dict[str, str]] = None,
             proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: bool = False, disable_ads: bool = False, geoip: bool = False,
+            custom_config: Dict = None, additional_arguments: Dict = None
     ) -> Response:
         """
         Opens up a browser and do your request based on your chosen options below.
@@ -257,15 +327,24 @@ class StealthyFetcher(BaseFetcher):
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
         :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
         :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
+        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning `Response` object.
         :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
         :param wait_selector: Wait for a specific css selector to be in a specific state.
         :param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`.
         :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
         :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
         :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
+        :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
+        :param additional_arguments: Additional arguments to be passed to Camoufox as additional settings and it takes higher priority than Scrapling's settings.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
+        if not custom_config:
+            custom_config = {}
+        elif not isinstance(custom_config, dict):
+            ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
+
         engine = CamoufoxEngine(
+            wait=wait,
             proxy=proxy,
             geoip=geoip,
             addons=addons,
@@ -284,7 +363,8 @@ class StealthyFetcher(BaseFetcher):
             extra_headers=extra_headers,
             disable_resources=disable_resources,
             wait_selector_state=wait_selector_state,
-            adaptor_arguments=
+            adaptor_arguments={**cls._generate_parser_arguments(), **custom_config},
+            additional_arguments=additional_arguments or {}
         )
         return await engine.async_fetch(url)
 
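Both StealthyFetcher entry points gain `wait` (an extra pause in milliseconds before the page closes and the `Response` is built) and `additional_arguments`, which is forwarded to Camoufox and outranks Scrapling's own settings. A hedged sketch; the key inside `additional_arguments` is a hypothetical stand-in, substitute a real Camoufox option:

    from scrapling.fetchers import StealthyFetcher

    page = StealthyFetcher.fetch(
        'https://example.com',                       # placeholder target
        wait=2000,                                   # linger 2 s after everything finishes
        custom_config={'keep_comments': True},       # illustrative parser override
        additional_arguments={'some_option': True},  # hypothetical Camoufox setting, wins over Scrapling's defaults
    )
    print(page.status)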
@@ -305,15 +385,17 @@ class PlayWrightFetcher(BaseFetcher):
 
     > Note that these are the main options with PlayWright but it can be mixed together.
     """
+    @classmethod
     def fetch(
-
-            useragent: Optional[str] = None, network_idle: bool = False, timeout: Optional[float] = 30000,
+            cls, url: str, headless: Union[bool, str] = True, disable_resources: bool = None,
+            useragent: Optional[str] = None, network_idle: bool = False, timeout: Optional[float] = 30000, wait: Optional[int] = 0,
             page_action: Optional[Callable] = None, wait_selector: Optional[str] = None, wait_selector_state: SelectorWaitStates = 'attached',
             hide_canvas: bool = False, disable_webgl: bool = False, extra_headers: Optional[Dict[str, str]] = None, google_search: bool = True,
             proxy: Optional[Union[str, Dict[str, str]]] = None, locale: Optional[str] = 'en-US',
             stealth: bool = False, real_chrome: bool = False,
             cdp_url: Optional[str] = None,
             nstbrowser_mode: bool = False, nstbrowser_config: Optional[Dict] = None,
+            custom_config: Dict = None
     ) -> Response:
         """Opens up a browser and do your request based on your chosen options below.
 
@@ -324,7 +406,8 @@ class PlayWrightFetcher(BaseFetcher):
             This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
         :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
-        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
+        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000.
+        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning `Response` object.
         :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
         :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
         :param wait_selector: Wait for a specific css selector to be in a specific state.
@@ -339,9 +422,16 @@ class PlayWrightFetcher(BaseFetcher):
         :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
         :param nstbrowser_mode: Enables NSTBrowser mode, it have to be used with `cdp_url` argument or it will get completely ignored.
         :param nstbrowser_config: The config you want to send with requests to the NSTBrowser. If left empty, Scrapling defaults to an optimized NSTBrowser's docker browserless config.
+        :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
+        if not custom_config:
+            custom_config = {}
+        elif not isinstance(custom_config, dict):
+            ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
+
         engine = PlaywrightEngine(
+            wait=wait,
             proxy=proxy,
             locale=locale,
             timeout=timeout,
@@ -361,19 +451,21 @@ class PlayWrightFetcher(BaseFetcher):
             nstbrowser_config=nstbrowser_config,
             disable_resources=disable_resources,
             wait_selector_state=wait_selector_state,
-            adaptor_arguments=
+            adaptor_arguments={**cls._generate_parser_arguments(), **custom_config},
         )
         return engine.fetch(url)
 
+    @classmethod
     async def async_fetch(
-
-            useragent: Optional[str] = None, network_idle: bool = False, timeout: Optional[float] = 30000,
+            cls, url: str, headless: Union[bool, str] = True, disable_resources: bool = None,
+            useragent: Optional[str] = None, network_idle: bool = False, timeout: Optional[float] = 30000, wait: Optional[int] = 0,
             page_action: Optional[Callable] = None, wait_selector: Optional[str] = None, wait_selector_state: SelectorWaitStates = 'attached',
             hide_canvas: bool = False, disable_webgl: bool = False, extra_headers: Optional[Dict[str, str]] = None, google_search: bool = True,
             proxy: Optional[Union[str, Dict[str, str]]] = None, locale: Optional[str] = 'en-US',
             stealth: bool = False, real_chrome: bool = False,
             cdp_url: Optional[str] = None,
             nstbrowser_mode: bool = False, nstbrowser_config: Optional[Dict] = None,
+            custom_config: Dict = None
     ) -> Response:
         """Opens up a browser and do your request based on your chosen options below.
 
@@ -384,7 +476,8 @@ class PlayWrightFetcher(BaseFetcher):
             This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
         :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
-        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
+        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000.
+        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning `Response` object.
         :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
         :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
         :param wait_selector: Wait for a specific css selector to be in a specific state.
@@ -399,9 +492,16 @@ class PlayWrightFetcher(BaseFetcher):
         :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
         :param nstbrowser_mode: Enables NSTBrowser mode, it have to be used with `cdp_url` argument or it will get completely ignored.
         :param nstbrowser_config: The config you want to send with requests to the NSTBrowser. If left empty, Scrapling defaults to an optimized NSTBrowser's docker browserless config.
+        :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
+        if not custom_config:
+            custom_config = {}
+        elif not isinstance(custom_config, dict):
+            ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
+
         engine = PlaywrightEngine(
+            wait=wait,
             proxy=proxy,
             locale=locale,
             timeout=timeout,
@@ -421,12 +521,13 @@ class PlayWrightFetcher(BaseFetcher):
             nstbrowser_config=nstbrowser_config,
             disable_resources=disable_resources,
             wait_selector_state=wait_selector_state,
-            adaptor_arguments=
+            adaptor_arguments={**cls._generate_parser_arguments(), **custom_config},
         )
         return await engine.async_fetch(url)
 
 
 class CustomFetcher(BaseFetcher):
-
-
+    @classmethod
+    def fetch(cls, url: str, browser_engine, **kwargs) -> Response:
+        engine = check_if_engine_usable(browser_engine)(adaptor_arguments=cls._generate_parser_arguments(), **kwargs)
         return engine.fetch(url)
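PlayWrightFetcher picks up the same `wait` and `custom_config` knobs in both sync and async form, and CustomFetcher.fetch becomes a classmethod as well. A sketch of the sync call, with a placeholder URL and an illustrative parser key as before:

    from scrapling.fetchers import PlayWrightFetcher

    page = PlayWrightFetcher.fetch(
        'https://example.com',                  # placeholder target
        network_idle=True,
        wait=1000,                              # hold the page one extra second before returning
        custom_config={'keep_comments': True},  # illustrative parser override
    )
    print(page.status)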
scrapling/parser.py
CHANGED
@@ -39,7 +39,7 @@ class Adaptor(SelectorsGeneration):
                  root: Optional[html.HtmlElement] = None,
                  keep_comments: Optional[bool] = False,
                  keep_cdata: Optional[bool] = False,
-                 auto_match: Optional[bool] =
+                 auto_match: Optional[bool] = False,
                  storage: Any = SQLiteStorageSystem,
                  storage_args: Optional[Dict] = None,
                  **kwargs
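The lone parser change sets the `auto_match` default to `False`, making element auto-matching opt-in when an `Adaptor` is built directly. A sketch, assuming plain-HTML input:

    from scrapling.parser import Adaptor

    html = '<html><body><p class="msg">hi</p></body></html>'

    # auto_match now defaults to False; pass auto_match=True to opt back in.
    page = Adaptor(text=html, auto_match=False)
    print(page.css_first('.msg::text'))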