scrapling 0.2.97__py3-none-any.whl → 0.2.99__py3-none-any.whl

scrapling/fetchers.py CHANGED
@@ -10,9 +10,10 @@ class Fetcher(BaseFetcher):
 
     Any additional keyword arguments passed to the methods below are passed to the respective httpx's method directly.
     """
+    @classmethod
     def get(
-            self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
-            proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
+            cls, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
+            proxy: Optional[str] = None, retries: Optional[int] = 3, custom_config: Dict = None, **kwargs: Dict) -> Response:
         """Make basic HTTP GET request for you but with some added flavors.
 
         :param url: Target url.
@@ -22,16 +23,23 @@ class Fetcher(BaseFetcher):
         create a referer header as if this request had come from Google's search of this URL's domain.
         :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
         :param retries: The number of retries to do through httpx if the request failed for any reason. The default is 3 retries.
+        :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameter's value.
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-        adaptor_arguments = tuple(self.adaptor_arguments.items())
+        if not custom_config:
+            custom_config = {}
+        elif not isinstance(custom_config, dict):
+            raise ValueError(f"The custom parser config must be of type dictionary, got {custom_config.__class__}")
+
+        adaptor_arguments = tuple({**cls._generate_parser_arguments(), **custom_config}.items())
         response_object = StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries, adaptor_arguments=adaptor_arguments).get(**kwargs)
         return response_object
 
+    @classmethod
     def post(
-            self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
-            proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
+            cls, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
+            proxy: Optional[str] = None, retries: Optional[int] = 3, custom_config: Dict = None, **kwargs: Dict) -> Response:
         """Make basic HTTP POST request for you but with some added flavors.
 
         :param url: Target url.
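
With this release the HTTP shortcut methods become classmethods and accept a per-request `custom_config` mapping that is merged over the class-level parser arguments (`cls._generate_parser_arguments()`), with the per-request keys winning. A minimal usage sketch of the new call style (the URL is a placeholder, and `keep_comments` is assumed to be a valid parser argument based on the `Adaptor` signature later in this diff):

    from scrapling.fetchers import Fetcher

    # No instance needed anymore: get/post/put/delete are classmethods in 0.2.99.
    page = Fetcher.get(
        'https://example.com',                   # placeholder URL
        retries=3,
        custom_config={'keep_comments': True},   # per-request parser override, merged over the class config
    )
    print(page.status)
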
@@ -41,16 +49,23 @@ class Fetcher(BaseFetcher):
         create a referer header as if this request came from Google's search of this URL's domain.
         :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
         :param retries: The number of retries to do through httpx if the request failed for any reason. The default is 3 retries.
+        :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameter's value.
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-        adaptor_arguments = tuple(self.adaptor_arguments.items())
+        if not custom_config:
+            custom_config = {}
+        elif not isinstance(custom_config, dict):
+            raise ValueError(f"The custom parser config must be of type dictionary, got {custom_config.__class__}")
+
+        adaptor_arguments = tuple({**cls._generate_parser_arguments(), **custom_config}.items())
         response_object = StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries, adaptor_arguments=adaptor_arguments).post(**kwargs)
         return response_object
 
+    @classmethod
     def put(
-            self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
-            proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
+            cls, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
+            proxy: Optional[str] = None, retries: Optional[int] = 3, custom_config: Dict = None, **kwargs: Dict) -> Response:
         """Make basic HTTP PUT request for you but with some added flavors.
 
         :param url: Target url
@@ -60,17 +75,24 @@ class Fetcher(BaseFetcher):
         create a referer header as if this request came from Google's search of this URL's domain.
         :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
         :param retries: The number of retries to do through httpx if the request failed for any reason. The default is 3 retries.
+        :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameter's value.
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
 
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-        adaptor_arguments = tuple(self.adaptor_arguments.items())
+        if not custom_config:
+            custom_config = {}
+        elif not isinstance(custom_config, dict):
+            raise ValueError(f"The custom parser config must be of type dictionary, got {custom_config.__class__}")
+
+        adaptor_arguments = tuple({**cls._generate_parser_arguments(), **custom_config}.items())
         response_object = StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries, adaptor_arguments=adaptor_arguments).put(**kwargs)
         return response_object
 
+    @classmethod
     def delete(
-            self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
-            proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
+            cls, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
+            proxy: Optional[str] = None, retries: Optional[int] = 3, custom_config: Dict = None, **kwargs: Dict) -> Response:
         """Make basic HTTP DELETE request for you but with some added flavors.
 
         :param url: Target url
@@ -80,18 +102,25 @@ class Fetcher(BaseFetcher):
         create a referer header as if this request came from Google's search of this URL's domain.
         :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
         :param retries: The number of retries to do through httpx if the request failed for any reason. The default is 3 retries.
+        :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameter's value.
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-        adaptor_arguments = tuple(self.adaptor_arguments.items())
+        if not custom_config:
+            custom_config = {}
+        elif not isinstance(custom_config, dict):
+            raise ValueError(f"The custom parser config must be of type dictionary, got {custom_config.__class__}")
+
+        adaptor_arguments = tuple({**cls._generate_parser_arguments(), **custom_config}.items())
         response_object = StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries, adaptor_arguments=adaptor_arguments).delete(**kwargs)
         return response_object
 
 
 class AsyncFetcher(Fetcher):
+    @classmethod
     async def get(
-            self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
-            proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
+            cls, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
+            proxy: Optional[str] = None, retries: Optional[int] = 3, custom_config: Dict = None, **kwargs: Dict) -> Response:
         """Make basic HTTP GET request for you but with some added flavors.
 
         :param url: Target url.
@@ -101,16 +130,23 @@ class AsyncFetcher(Fetcher):
         create a referer header as if this request had come from Google's search of this URL's domain.
         :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
         :param retries: The number of retries to do through httpx if the request failed for any reason. The default is 3 retries.
+        :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameter's value.
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-        adaptor_arguments = tuple(self.adaptor_arguments.items())
+        if not custom_config:
+            custom_config = {}
+        elif not isinstance(custom_config, dict):
+            raise ValueError(f"The custom parser config must be of type dictionary, got {custom_config.__class__}")
+
+        adaptor_arguments = tuple({**cls._generate_parser_arguments(), **custom_config}.items())
         response_object = await StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries=retries, adaptor_arguments=adaptor_arguments).async_get(**kwargs)
         return response_object
 
+    @classmethod
     async def post(
-            self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
-            proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
+            cls, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
+            proxy: Optional[str] = None, retries: Optional[int] = 3, custom_config: Dict = None, **kwargs: Dict) -> Response:
         """Make basic HTTP POST request for you but with some added flavors.
 
         :param url: Target url.
@@ -120,16 +156,23 @@ class AsyncFetcher(Fetcher):
         create a referer header as if this request came from Google's search of this URL's domain.
         :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
         :param retries: The number of retries to do through httpx if the request failed for any reason. The default is 3 retries.
+        :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameter's value.
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-        adaptor_arguments = tuple(self.adaptor_arguments.items())
+        if not custom_config:
+            custom_config = {}
+        elif not isinstance(custom_config, dict):
+            raise ValueError(f"The custom parser config must be of type dictionary, got {custom_config.__class__}")
+
+        adaptor_arguments = tuple({**cls._generate_parser_arguments(), **custom_config}.items())
         response_object = await StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries=retries, adaptor_arguments=adaptor_arguments).async_post(**kwargs)
         return response_object
 
+    @classmethod
     async def put(
-            self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
-            proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
+            cls, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
+            proxy: Optional[str] = None, retries: Optional[int] = 3, custom_config: Dict = None, **kwargs: Dict) -> Response:
         """Make basic HTTP PUT request for you but with some added flavors.
 
         :param url: Target url
@@ -139,16 +182,23 @@ class AsyncFetcher(Fetcher):
         create a referer header as if this request came from Google's search of this URL's domain.
         :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
         :param retries: The number of retries to do through httpx if the request failed for any reason. The default is 3 retries.
+        :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameter's value.
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-        adaptor_arguments = tuple(self.adaptor_arguments.items())
+        if not custom_config:
+            custom_config = {}
+        elif not isinstance(custom_config, dict):
+            raise ValueError(f"The custom parser config must be of type dictionary, got {custom_config.__class__}")
+
+        adaptor_arguments = tuple({**cls._generate_parser_arguments(), **custom_config}.items())
         response_object = await StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries=retries, adaptor_arguments=adaptor_arguments).async_put(**kwargs)
         return response_object
 
+    @classmethod
     async def delete(
-            self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
-            proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
+            cls, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
+            proxy: Optional[str] = None, retries: Optional[int] = 3, custom_config: Dict = None, **kwargs: Dict) -> Response:
         """Make basic HTTP DELETE request for you but with some added flavors.
 
         :param url: Target url
@@ -158,10 +208,16 @@ class AsyncFetcher(Fetcher):
         create a referer header as if this request came from Google's search of this URL's domain.
         :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
         :param retries: The number of retries to do through httpx if the request failed for any reason. The default is 3 retries.
+        :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameter's value.
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-        adaptor_arguments = tuple(self.adaptor_arguments.items())
+        if not custom_config:
+            custom_config = {}
+        elif not isinstance(custom_config, dict):
+            raise ValueError(f"The custom parser config must be of type dictionary, got {custom_config.__class__}")
+
+        adaptor_arguments = tuple({**cls._generate_parser_arguments(), **custom_config}.items())
         response_object = await StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries=retries, adaptor_arguments=adaptor_arguments).async_delete(**kwargs)
         return response_object
 
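
The `AsyncFetcher` methods receive the same classmethod and `custom_config` treatment; the only difference is that they are awaited. A minimal sketch under the same assumptions as above:

    import asyncio

    from scrapling.fetchers import AsyncFetcher

    async def main():
        # Same call style as the sync Fetcher, but awaitable.
        page = await AsyncFetcher.get('https://example.com', custom_config={'keep_comments': True})
        print(page.status)

    asyncio.run(main())
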
@@ -172,12 +228,14 @@ class StealthyFetcher(BaseFetcher):
     It works as real browsers passing almost all online tests/protections based on Camoufox.
     Other added flavors include setting the faked OS fingerprints to match the user's OS and the referer of every request is set as if this request came from Google's search of this URL's domain.
     """
+    @classmethod
     def fetch(
-            self, url: str, headless: Union[bool, Literal['virtual']] = True, block_images: bool = False, disable_resources: bool = False,
-            block_webrtc: bool = False, allow_webgl: bool = True, network_idle: bool = False, addons: Optional[List[str]] = None,
+            cls, url: str, headless: Union[bool, Literal['virtual']] = True, block_images: bool = False, disable_resources: bool = False,
+            block_webrtc: bool = False, allow_webgl: bool = True, network_idle: bool = False, addons: Optional[List[str]] = None, wait: Optional[int] = 0,
             timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, humanize: Optional[Union[bool, float]] = True,
             wait_selector_state: SelectorWaitStates = 'attached', google_search: bool = True, extra_headers: Optional[Dict[str, str]] = None,
             proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: bool = False, disable_ads: bool = False, geoip: bool = False,
+            custom_config: Dict = None, additional_arguments: Dict = None
     ) -> Response:
         """
         Opens up a browser and does your request based on your chosen options below.
@@ -198,16 +256,25 @@ class StealthyFetcher(BaseFetcher):
         It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
         :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
-        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
+        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000.
+        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
         :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
         :param wait_selector: Wait for a specific css selector to be in a specific state.
         :param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`.
         :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
         :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
         :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
+        :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameter's value.
+        :param additional_arguments: Additional arguments to be passed to Camoufox as additional settings, and they take higher priority than Scrapling's settings.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
+        if not custom_config:
+            custom_config = {}
+        elif not isinstance(custom_config, dict):
+            raise ValueError(f"The custom parser config must be of type dictionary, got {custom_config.__class__}")
+
         engine = CamoufoxEngine(
+            wait=wait,
             proxy=proxy,
             geoip=geoip,
             addons=addons,
@@ -226,16 +293,19 @@ class StealthyFetcher(BaseFetcher):
             extra_headers=extra_headers,
             disable_resources=disable_resources,
             wait_selector_state=wait_selector_state,
-            adaptor_arguments=self.adaptor_arguments,
+            adaptor_arguments={**cls._generate_parser_arguments(), **custom_config},
+            additional_arguments=additional_arguments or {}
         )
         return engine.fetch(url)
 
+    @classmethod
     async def async_fetch(
-            self, url: str, headless: Union[bool, Literal['virtual']] = True, block_images: bool = False, disable_resources: bool = False,
-            block_webrtc: bool = False, allow_webgl: bool = True, network_idle: bool = False, addons: Optional[List[str]] = None,
+            cls, url: str, headless: Union[bool, Literal['virtual']] = True, block_images: bool = False, disable_resources: bool = False,
+            block_webrtc: bool = False, allow_webgl: bool = True, network_idle: bool = False, addons: Optional[List[str]] = None, wait: Optional[int] = 0,
             timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, humanize: Optional[Union[bool, float]] = True,
             wait_selector_state: SelectorWaitStates = 'attached', google_search: bool = True, extra_headers: Optional[Dict[str, str]] = None,
             proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: bool = False, disable_ads: bool = False, geoip: bool = False,
+            custom_config: Dict = None, additional_arguments: Dict = None
     ) -> Response:
         """
         Opens up a browser and does your request based on your chosen options below.
@@ -257,15 +327,24 @@ class StealthyFetcher(BaseFetcher):
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
         :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
         :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
+        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
         :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
         :param wait_selector: Wait for a specific css selector to be in a specific state.
         :param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`.
         :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
         :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
         :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
+        :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameter's value.
+        :param additional_arguments: Additional arguments to be passed to Camoufox as additional settings, and they take higher priority than Scrapling's settings.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
+        if not custom_config:
+            custom_config = {}
+        elif not isinstance(custom_config, dict):
+            raise ValueError(f"The custom parser config must be of type dictionary, got {custom_config.__class__}")
+
         engine = CamoufoxEngine(
+            wait=wait,
             proxy=proxy,
             geoip=geoip,
             addons=addons,
@@ -284,7 +363,8 @@ class StealthyFetcher(BaseFetcher):
             extra_headers=extra_headers,
             disable_resources=disable_resources,
             wait_selector_state=wait_selector_state,
-            adaptor_arguments=self.adaptor_arguments,
+            adaptor_arguments={**cls._generate_parser_arguments(), **custom_config},
+            additional_arguments=additional_arguments or {}
         )
         return await engine.async_fetch(url)
 
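
`StealthyFetcher` additionally gains `wait` (milliseconds to linger after everything finishes before the page is closed) and `additional_arguments` (a dict forwarded to Camoufox that takes priority over Scrapling's own Camoufox settings). A rough sketch of the new surface; the `additional_arguments` key below is a hypothetical placeholder, so consult Camoufox's documentation for real setting names:

    from scrapling.fetchers import StealthyFetcher

    page = StealthyFetcher.fetch(
        'https://example.com',                   # placeholder URL
        headless=True,
        wait=2000,                               # new: wait 2s before the page is closed
        custom_config={'keep_cdata': True},      # per-request parser override
        additional_arguments={'some_camoufox_setting': True},  # hypothetical key, passed to Camoufox verbatim
    )
    print(page.status)
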
@@ -305,15 +385,17 @@ class PlayWrightFetcher(BaseFetcher):
 
     > Note that these are the main options with PlayWright, but they can be mixed together.
     """
+    @classmethod
     def fetch(
-            self, url: str, headless: Union[bool, str] = True, disable_resources: bool = None,
-            useragent: Optional[str] = None, network_idle: bool = False, timeout: Optional[float] = 30000,
+            cls, url: str, headless: Union[bool, str] = True, disable_resources: bool = None,
+            useragent: Optional[str] = None, network_idle: bool = False, timeout: Optional[float] = 30000, wait: Optional[int] = 0,
             page_action: Optional[Callable] = None, wait_selector: Optional[str] = None, wait_selector_state: SelectorWaitStates = 'attached',
             hide_canvas: bool = False, disable_webgl: bool = False, extra_headers: Optional[Dict[str, str]] = None, google_search: bool = True,
             proxy: Optional[Union[str, Dict[str, str]]] = None, locale: Optional[str] = 'en-US',
             stealth: bool = False, real_chrome: bool = False,
             cdp_url: Optional[str] = None,
             nstbrowser_mode: bool = False, nstbrowser_config: Optional[Dict] = None,
+            custom_config: Dict = None
     ) -> Response:
         """Opens up a browser and does your request based on your chosen options below.
 
@@ -324,7 +406,8 @@ class PlayWrightFetcher(BaseFetcher):
             This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
         :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
-        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
+        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000.
+        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
         :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
         :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
         :param wait_selector: Wait for a specific css selector to be in a specific state.
@@ -339,9 +422,16 @@ class PlayWrightFetcher(BaseFetcher):
         :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
         :param nstbrowser_mode: Enables NSTBrowser mode; it has to be used with the `cdp_url` argument or it will get completely ignored.
         :param nstbrowser_config: The config you want to send with requests to the NSTBrowser. If left empty, Scrapling defaults to an optimized NSTBrowser's docker browserless config.
+        :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameter's value.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
+        if not custom_config:
+            custom_config = {}
+        elif not isinstance(custom_config, dict):
+            raise ValueError(f"The custom parser config must be of type dictionary, got {custom_config.__class__}")
+
         engine = PlaywrightEngine(
+            wait=wait,
             proxy=proxy,
             locale=locale,
             timeout=timeout,
@@ -361,19 +451,21 @@ class PlayWrightFetcher(BaseFetcher):
             nstbrowser_config=nstbrowser_config,
             disable_resources=disable_resources,
             wait_selector_state=wait_selector_state,
-            adaptor_arguments=self.adaptor_arguments,
+            adaptor_arguments={**cls._generate_parser_arguments(), **custom_config},
         )
         return engine.fetch(url)
 
+    @classmethod
     async def async_fetch(
-            self, url: str, headless: Union[bool, str] = True, disable_resources: bool = None,
-            useragent: Optional[str] = None, network_idle: bool = False, timeout: Optional[float] = 30000,
+            cls, url: str, headless: Union[bool, str] = True, disable_resources: bool = None,
+            useragent: Optional[str] = None, network_idle: bool = False, timeout: Optional[float] = 30000, wait: Optional[int] = 0,
             page_action: Optional[Callable] = None, wait_selector: Optional[str] = None, wait_selector_state: SelectorWaitStates = 'attached',
             hide_canvas: bool = False, disable_webgl: bool = False, extra_headers: Optional[Dict[str, str]] = None, google_search: bool = True,
             proxy: Optional[Union[str, Dict[str, str]]] = None, locale: Optional[str] = 'en-US',
             stealth: bool = False, real_chrome: bool = False,
             cdp_url: Optional[str] = None,
             nstbrowser_mode: bool = False, nstbrowser_config: Optional[Dict] = None,
+            custom_config: Dict = None
     ) -> Response:
         """Opens up a browser and does your request based on your chosen options below.
 
@@ -384,7 +476,8 @@ class PlayWrightFetcher(BaseFetcher):
             This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
         :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
-        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
+        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000.
+        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
         :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
         :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
         :param wait_selector: Wait for a specific css selector to be in a specific state.
@@ -399,9 +492,16 @@ class PlayWrightFetcher(BaseFetcher):
         :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
         :param nstbrowser_mode: Enables NSTBrowser mode; it has to be used with the `cdp_url` argument or it will get completely ignored.
         :param nstbrowser_config: The config you want to send with requests to the NSTBrowser. If left empty, Scrapling defaults to an optimized NSTBrowser's docker browserless config.
+        :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameter's value.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
+        if not custom_config:
+            custom_config = {}
+        elif not isinstance(custom_config, dict):
+            raise ValueError(f"The custom parser config must be of type dictionary, got {custom_config.__class__}")
+
         engine = PlaywrightEngine(
+            wait=wait,
             proxy=proxy,
             locale=locale,
             timeout=timeout,
@@ -421,12 +521,13 @@ class PlayWrightFetcher(BaseFetcher):
             nstbrowser_config=nstbrowser_config,
             disable_resources=disable_resources,
             wait_selector_state=wait_selector_state,
-            adaptor_arguments=self.adaptor_arguments,
+            adaptor_arguments={**cls._generate_parser_arguments(), **custom_config},
         )
         return await engine.async_fetch(url)
 
 
 class CustomFetcher(BaseFetcher):
-    def fetch(self, url: str, browser_engine, **kwargs) -> Response:
-        engine = check_if_engine_usable(browser_engine)(adaptor_arguments=self.adaptor_arguments, **kwargs)
+    @classmethod
+    def fetch(cls, url: str, browser_engine, **kwargs) -> Response:
+        engine = check_if_engine_usable(browser_engine)(adaptor_arguments=cls._generate_parser_arguments(), **kwargs)
         return engine.fetch(url)
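
`PlayWrightFetcher` follows the same pattern (`@classmethod`, `wait`, and `custom_config`), and `CustomFetcher.fetch` now pulls parser arguments from `cls._generate_parser_arguments()` instead of an instance attribute. A minimal sketch with a placeholder URL:

    from scrapling.fetchers import PlayWrightFetcher

    page = PlayWrightFetcher.fetch(
        'https://example.com',        # placeholder URL
        stealth=True,
        wait=1000,                    # new: linger 1s before returning the Response
        custom_config={'keep_comments': False},
    )
    print(page.status)
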
scrapling/parser.py CHANGED
@@ -17,7 +17,7 @@ from scrapling.core.custom_types import (AttributesHandler, TextHandler,
 from scrapling.core.mixins import SelectorsGeneration
 from scrapling.core.storage_adaptors import (SQLiteStorageSystem,
                                              StorageSystemMixin, _StorageTools)
-from scrapling.core.translator import HTMLTranslator
+from scrapling.core.translator import translator_instance
 from scrapling.core.utils import (clean_spaces, flatten, html_forbidden,
                                   is_jsonable, log)
 
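
Replacing the `HTMLTranslator` class import with a `translator_instance` import suggests the CSS-to-XPath translator is now built once per process and shared, rather than instantiated on every `css()` call (see the selector hunks below). A sketch of that pattern, assuming scrapling's translator subclasses cssselect's; the module body below is hypothetical, not scrapling's actual `translator.py`:

    # translator.py (sketch)
    from cssselect import HTMLTranslator


    class _ScraplingTranslator(HTMLTranslator):
        """scrapling's CSS extensions (e.g. the `::text` pseudo-element) would live here."""


    # Built once at import time; every Adaptor reuses this instance instead of
    # constructing a fresh translator on each css() call.
    translator_instance = _ScraplingTranslator()
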
@@ -26,7 +26,7 @@ class Adaptor(SelectorsGeneration):
     __slots__ = (
         'url', 'encoding', '__auto_match_enabled', '_root', '_storage',
         '__keep_comments', '__huge_tree_enabled', '__attributes', '__text', '__tag',
-        '__keep_cdata', '__raw_body'
+        '__keep_cdata'
     )
 
     def __init__(
@@ -39,7 +39,7 @@ class Adaptor(SelectorsGeneration):
             root: Optional[html.HtmlElement] = None,
             keep_comments: Optional[bool] = False,
             keep_cdata: Optional[bool] = False,
-            auto_match: Optional[bool] = True,
+            auto_match: Optional[bool] = False,
             storage: Any = SQLiteStorageSystem,
             storage_args: Optional[Dict] = None,
             **kwargs
@@ -72,20 +72,17 @@ class Adaptor(SelectorsGeneration):
             raise ValueError("Adaptor class needs text, body, or root arguments to work")
 
         self.__text = ''
-        self.__raw_body = ''
         if root is None:
             if text is None:
                 if not body or not isinstance(body, bytes):
                     raise TypeError(f"body argument must be valid and of type bytes, got {body.__class__}")
 
                 body = body.replace(b"\x00", b"").strip()
-                self.__raw_body = body.replace(b"\x00", b"").strip().decode()
             else:
                 if not isinstance(text, str):
                     raise TypeError(f"text argument must be of type str, got {text.__class__}")
 
                 body = text.strip().replace("\x00", "").encode(encoding) or b"<html/>"
-                self.__raw_body = text.strip()
 
         # https://lxml.de/api/lxml.etree.HTMLParser-class.html
         parser = html.HTMLParser(
@@ -250,10 +247,7 @@ class Adaptor(SelectorsGeneration):
         """Return the inner html code of the element"""
         return TextHandler(etree.tostring(self._root, encoding='unicode', method='html', with_tail=False))
 
-    @property
-    def body(self) -> TextHandler:
-        """Return raw HTML code of the element/page without any processing when possible or return `Adaptor.html_content`"""
-        return TextHandler(self.__raw_body) or self.html_content
+    body = html_content
 
     def prettify(self) -> TextHandler:
         """Return a prettified version of the element's inner html-code"""
@@ -476,7 +470,7 @@ class Adaptor(SelectorsGeneration):
         try:
             if not self.__auto_match_enabled or ',' not in selector:
                 # No need to split selectors in this case, let's save some CPU cycles :)
-                xpath_selector = HTMLTranslator().css_to_xpath(selector)
+                xpath_selector = translator_instance.css_to_xpath(selector)
                 return self.xpath(xpath_selector, identifier or selector, auto_match, auto_save, percentage)
 
             results = []
@@ -484,7 +478,7 @@ class Adaptor(SelectorsGeneration):
             for single_selector in split_selectors(selector):
                 # I'm doing this only so the `save` function saves data correctly for combined selectors
                 # Like using the ',' to combine two different selectors that point to different elements.
-                xpath_selector = HTMLTranslator().css_to_xpath(single_selector.canonical())
+                xpath_selector = translator_instance.css_to_xpath(single_selector.canonical())
                 results += self.xpath(
                     xpath_selector, identifier or single_selector.canonical(), auto_match, auto_save, percentage
                 )
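
Taken together, the parser changes mean three things for callers: `auto_match` is now opt-in (its default flipped from `True` to `False`), `Adaptor.body` is a plain alias of `Adaptor.html_content` (the re-serialized element) instead of a cached copy of the raw input, and CSS selectors are compiled through the shared `translator_instance`. A small sketch of the resulting behavior (the HTML string is a placeholder):

    from scrapling.parser import Adaptor

    page = Adaptor(text='<html><body><p>hello</p></body></html>')

    # auto_match defaults to False now; pass auto_match=True explicitly
    # if you relied on the old 0.2.97 default.

    # `body` is no longer the untouched input string; it is the same
    # serialized HTML that `html_content` returns:
    assert page.body == page.html_content

    print(page.css('p::text'))  # roughly ['hello'], compiled via translator_instance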