scrape-do-python 0.1.0__py3-none-any.whl

scrape_do/client.py ADDED
@@ -0,0 +1,804 @@
+ """Synchronous HTTP client for the Scrape.do API.
+
+ Defines the primary `ScrapeDoClient` used for executing proxy
+ requests. Handles automatic error routing, customizable retry strategies,
+ telemetry tracking, and secure, isolated connection pooling.
+ """
+
+ import os
+ import time
+ import random
+ import logging
+ import ssl
+ from pydantic import HttpUrl
+ from httpx import (
+     Client,
+     Limits,
+     BaseTransport,
+     RequestError
+ )
+ from httpx._config import DEFAULT_LIMITS
+ from httpx._types import (
+     TimeoutTypes,
+     CertTypes,
+     RequestExtensions
+ )
+ from httpx._client import (
+     UseClientDefault,
+     USE_CLIENT_DEFAULT
+ )
+
+ from typing import (
+     Dict,
+     List,
+     Optional,
+     Self,
+     Any,
+     Union,
+     Unpack,
+     Callable,
+     Literal,
+     TypeAlias,
+     TypedDict
+ )
+ from types import TracebackType
+ from .models import (
+     RequestParameters,
+     PreparedScrapeDoRequest,
+     ScrapeDoResponse,
+     PayloadType,
+     HttpMethod,
+     RequestParametersDict
+ )
+ from .exceptions import APIConnectionError, RotatedSessionError
+
+
+ logger = logging.getLogger("scrape_do")
+
+
+ # --- Type Definitions ---
+
+ SyncSessionValidator: TypeAlias = Callable[[ScrapeDoResponse], bool]
+ """
+ Defines the expected signature of the custom function meant to be passed
+ to the `ScrapeDoClient.execute` method's `session_validator` argument.
+ """
+
+
+ class SyncClientEventHooks(TypedDict, total=False):
+     """
+     Configuration dictionary for SDK-native lifecycle hooks.
+
+     Unlike native HTTPX event hooks, which fire on every transport-level
+     execution (and can corrupt telemetry during automatic retries), these
+     SDK hooks map cleanly to the logical request lifecycle.
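+
+     example: Wiring a Retry Logger
+         A minimal sketch of attaching a `retry` hook; the hook body below
+         is illustrative, not part of the SDK:
+
+         ```python
+         def log_retry(attempt, request, response, error) -> None:
+             # `response` is the failed ScrapeDoResponse, or None when the
+             # retry was triggered by a transport-level httpx.RequestError.
+             print(f"Retry #{attempt + 1}; transport error: {error!r}")
+
+         hooks: SyncClientEventHooks = {"retry": [log_retry]}
+         client = ScrapeDoClient(event_hooks=hooks)
+         ```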
75
+ """
76
+
77
+ request: List[
78
+ Callable[[PreparedScrapeDoRequest], None]
79
+ ]
80
+ """
81
+ Fires exactly once per logical execution, immediately before the retry
82
+ loop begins. Receives the `PreparedScrapeDoRequest` object that will be
83
+ used to exececute the request. Useful for logging the request being
84
+ executed.
85
+ """
86
+ response: List[
87
+ Callable[[ScrapeDoResponse], None]
88
+ ]
89
+ """
90
+ Fires exactly once per logical execution, immediately after the proxy
91
+ returns a response and the `session_validator` (if any) passes.
92
+ Receives the request's `ScrapeDoResponse` object. Useful for
93
+ logging only the final response after all retries, which can be either
94
+ a successful response, a non-retryable error, or a final retryable error
95
+ after `max_attempts` has been exhausted.
96
+ """
97
+ retry: List[
98
+ Callable[
99
+ [
100
+ int,
101
+ PreparedScrapeDoRequest,
102
+ Optional[ScrapeDoResponse],
103
+ Optional[Exception]
104
+ ],
105
+ None
106
+ ]
107
+ ]
108
+ """
109
+ Fires inside the execution loop ONLY when a proxy gateway error
110
+ (or an httpx.RequestError) occurs and the SDK decides to retry. Receives
111
+ the current attempt number, the prepared request, and either the failed
112
+ response (if it exists) or the `httpx.RequestError` that caused the retry.
113
+ Useful for tracking proxy instability or manually raising an exception to
114
+ abort the retry loop.
115
+ """
116
+
117
+
118
+ # --- Client Default Backoff Strategy ---
119
+
120
+
121
+ def default_backoff_strategy(attempt: int) -> float:
122
+ """Calculates a jittered exponential backoff for rate-limit retries.
123
+
124
+ This is the default function used by the `ScrapeDoClient` to determine how
125
+ long to wait before retrying a rate-limited request when the
126
+ `retry_backoff` parameter is set to `None`.
127
+
128
+ Args:
129
+ attempt (int): The number of retries made so far, starting from 0
130
+
131
+ info: Additional Information
132
+ The `jitter` here is a random number between 0.1 and 1 generated
133
+ by the `random.uniform` function.
134
+
135
+ Returns:
136
+ The number of seconds to sleep, calculated as (2^attempt) + jitter.
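+
+     example: Computed Delays
+         An illustrative doctest; attempt 2 sleeps for 4 seconds plus a
+         jitter drawn from [0.1, 1.0]:
+
+         ```python
+         >>> delay = default_backoff_strategy(2)
+         >>> 4.1 <= delay <= 5.0
+         True
+         ```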
+     """
+
+     return (2.0**attempt) + random.uniform(0.1, 1.0)
+
+
+ class ScrapeDoClient:
+     """Synchronous HTTP client for executing Scrape.do API requests.
+
+     Facilitates interactions with the Scrape.do API by managing an
+     `httpx.Client` instance to provide strict type-checking for request
+     parameters, custom error parsing, and session tracking, while keeping
+     the network configuration as flexible as possible.
+
+     abstract: Features
+         - Local API parameter validation via the `RequestParameters`
+           Pydantic model.
+
+         - Status-code error parsing and customizable retry intervals for
+           rate-limited requests.
+
+         - Strongly-typed interface for responses via the `ScrapeDoResponse`
+           Pydantic model.
+
+     info: Concurrency Limit and Server Errors
+         This client intercepts and manages Scrape.do's specific gateway
+         errors (429, 502, 510), automatically applying a customizable retry
+         strategy before the error can reach the application.
+
+     tip: SDK Event Hooks (`event_hooks`)
+         This client implements SDK-specific event hooks mimicking the
+         structure of `httpx` native event hooks. See
+         [`SyncClientEventHooks`][scrape_do.client.SyncClientEventHooks] for
+         available lifecycle hooks and their required signatures.
+
+     tip: Additional `httpx.Client` Configuration
+         The following `httpx.Client` parameters can be provided as keyword
+         arguments and will be passed directly to the underlying object.
+
+         - `verify`
+         - `cert`
+         - `http1`
+         - `http2`
+         - `timeout`
+         - `limits`
+         - `transport`
+         - `default_encoding`
+
+         Additionally, the following `httpx.Client.request` parameters can be
+         provided as keyword arguments during request execution.
+
+         - `timeout` (`r_timeout`)
+         - `extensions`
+
+         For more information on their behaviour and default values, please
+         consult the official
+         [`httpx`](https://www.python-httpx.org/api/#client) documentation.
+
+     warning: Unsupported HTTPX Client Arguments
+         The underlying `httpx.Client` object is strictly managed by the
+         instance to prevent invalid configurations from being sent to the
+         Scrape.do API. For this reason, arguments not listed in the previous
+         section are intentionally blocked and cannot be changed.
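+
+     example: Basic Usage
+         A minimal sketch; the token value is a placeholder, and `render`
+         is forwarded as a Scrape.do API parameter:
+
+         ```python
+         client = ScrapeDoClient(api_token="YOUR_TOKEN", retry_backoff=1.5)
+         with client:
+             response = client.get("https://example.com", render=True)
+         ```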
+
+     Args:
+         api_token (Optional[str]): The Scrape.do API key. If omitted, the
+             client will attempt to load it from the 'SCRAPE_DO_API_KEY'
+             environment variable.
+         max_retries (int): The maximum number of retry attempts for
+             retryable Scrape.do gateway errors (HTTP 429, 502, and 510).
+         retry_backoff (Union[float, Callable[[int], float]]): The strategy
+             used to calculate the delay between retries. Can be a static
+             `float` (seconds) or a callable that accepts the current attempt
+             number (0-indexed) and returns a float. Defaults to a jittered
+             exponential backoff when set to `None`.
+         event_hooks (Optional[SyncClientEventHooks]): A dictionary of
+             SDK-native hooks to execute at different points of the request
+             lifecycle.
+         verify (Union[ssl.SSLContext, str, bool]): Configures SSL
+             certificate verification. Defaults to `True` (secure).
+         cert (Optional[CertTypes]): Client-side certificates for mutual TLS
+             authentication.
+         http1 (bool): Enable HTTP/1.1 support.
+         http2 (bool): Enable HTTP/2 multiplexing for higher concurrency.
+         timeout (TimeoutTypes): The default timeout (in seconds) applied to
+             all network phases. Defaults to 60s, raised from httpx's 5s
+             default to accommodate Scrape.do proxy round-trips
+             (browser rendering, geo-routing, fingerprinting).
+         limits (Limits): Configuration for maximum connection pool sizes.
+         transport (Optional[BaseTransport]): A completely custom transport
+             engine.
+         default_encoding (Union[str, Callable[[bytes], str]]): The fallback
+             text encoding used if a target website omits a charset header.
+     """
+     def __init__(
+         self,
+         api_token: Optional[str] = None,
+         max_retries: int = 3,
+         retry_backoff: Optional[Union[float, Callable[[int], float]]] = None,
+         event_hooks: Optional[SyncClientEventHooks] = None,
+         *,
+         verify: Union[ssl.SSLContext, str, bool] = True,
+         cert: Optional[CertTypes] = None,
+         http1: bool = True,
+         http2: bool = False,
+         timeout: TimeoutTypes = 60.0,
+         limits: Limits = DEFAULT_LIMITS,
+         transport: Optional[BaseTransport] = None,
+         default_encoding: Union[str, Callable[[bytes], str]] = "utf-8"
+     ) -> None:
+         self.api_token = api_token or os.getenv("SCRAPE_DO_API_KEY")
+         if not self.api_token:
+             raise ValueError(
+                 "Scrape.do API token must be provided explicitly or set via"
+                 " the 'SCRAPE_DO_API_KEY' environment variable."
+             )
+
+         self.max_retries = max_retries
+
+         if retry_backoff is not None:
+             self.retry_backoff = retry_backoff
+         else:
+             self.retry_backoff = default_backoff_strategy
+
+         self.event_hooks: SyncClientEventHooks = event_hooks or {}
+
+         self._http_client = Client(
+             verify=verify,
+             cert=cert,
+             trust_env=False,
+             http1=http1,
+             http2=http2,
+             timeout=timeout,
+             limits=limits,
+             transport=transport,
+             default_encoding=default_encoding
+         )
+
+     def close(self) -> None:
+         """Closes the underlying HTTPX connection pool.
+
+         It is recommended to use the client as a context manager to ensure
+         resources are released automatically.
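+
+         example: Context-Managed Lifecycle
+             A short sketch, assuming the 'SCRAPE_DO_API_KEY' environment
+             variable is set:
+
+             ```python
+             with ScrapeDoClient() as client:
+                 ...  # execute requests here; the pool closes on exit
+             ```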
+         """
+         self._http_client.close()
+
+     def __enter__(self) -> Self:
+         """Returns the client instance for use as a context manager.
+
+         Returns:
+             The `ScrapeDoClient` instance with its HTTPX connection pool
+             ready for use.
+         """
+         return self
+
+     def __exit__(
+         self,
+         exc_type: Optional[type[BaseException]],
+         exc_val: Optional[BaseException],
+         exc_tb: Optional[TracebackType]
+     ) -> Literal[False]:
+         """Calls the `close` method to close the underlying HTTPX connection
+         pool without swallowing any exceptions.
+
+         Args:
+             exc_type (Optional[type[BaseException]]): The type of the
+                 exception.
+             exc_val (Optional[BaseException]): The instance of the
+                 exception.
+             exc_tb (Optional[TracebackType]): The traceback information.
+
+         Returns:
+             `False`, since no exceptions are swallowed.
+         """
+         self.close()
+         return False
+
+     def execute(
+         self,
+         request: PreparedScrapeDoRequest,
+         session_validator: Optional[SyncSessionValidator] = None,
+         *,
+         r_timeout: Union[TimeoutTypes, UseClientDefault] = USE_CLIENT_DEFAULT,
+         extensions: Optional[RequestExtensions] = None
+     ) -> ScrapeDoResponse:
+         """Executes a fully prepared and validated Scrape.do request.
+
+         Acts as the core execution funnel, applying the retry backoff
+         logic, evaluating gateway errors and sessions, and isolating
+         cookies between sequential executions.
+
+         tip: Intended Usage
+             Use this method if you have manually constructed a
+             `PreparedScrapeDoRequest` object for bulk routing, custom
+             configurations, or task reusability.
+
+         warning: Sessions (`sessionId`)
+             If you configure a request with a `session_id`, Scrape.do will
+             attempt to route your traffic through the same proxy address.
+             However, it can still silently rotate this address for various
+             reasons. If it rotates during a multi-step scraping task, any
+             accumulated target-specific WAF state or cookies will be lost,
+             which may cause the task to fail.
+
+         tip: Validating Sessions (`session_validator`)
+             - To prevent unexpected errors due to dropped sessions, you can
+               pass a custom function to this method's `session_validator`
+               argument.
+
+             - This function will be called internally by the client after
+               each stateful request (`sessionId is not None`) to determine
+               whether or not a `RotatedSessionError` exception should be
+               raised to signal that the session is no longer valid.
+
+             - The function should take the current request's
+               `ScrapeDoResponse` object as its only argument and return a
+               single `bool` value.
+
+             - If the function returns `True`, this method will raise a
+               `RotatedSessionError` instead of returning the response
+               object. (The request's `ScrapeDoResponse` object can still be
+               accessed later via the exception's `response` attribute.)
+               Otherwise, no additional action is taken. See the example
+               below.
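+
+         example: A Session Validator Sketch
+             A hypothetical heuristic; the `text` attribute access is hedged
+             with `getattr`, and `prepared` stands in for an existing
+             `PreparedScrapeDoRequest`:
+
+             ```python
+             def session_dropped(response: ScrapeDoResponse) -> bool:
+                 # True => treat the session as rotated and raise
+                 # RotatedSessionError instead of returning the response.
+                 return "Please sign in" in getattr(response, "text", "")
+
+             client.execute(prepared, session_validator=session_dropped)
+             ```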
+
+         Args:
+             request (PreparedScrapeDoRequest): The validated request
+                 payload.
+             r_timeout (Union[TimeoutTypes, UseClientDefault]): A
+                 request-specific timeout override.
+             session_validator (Optional[SyncSessionValidator]): A custom
+                 function called to determine whether or not to raise a
+                 `RotatedSessionError` exception.
+             extensions (Optional[RequestExtensions]): Advanced HTTPX
+                 extensions for this specific request.
+
+         Returns:
+             The `ScrapeDoResponse` object containing the target's data.
+
+         Raises:
+             APIConnectionError: If the underlying network transport drops
+                 entirely (e.g., DNS failure).
+             RotatedSessionError: If a `session_validator` is provided, the
+                 request was made with a `session_id` argument, and the
+                 `session_validator` returned `True`.
+         """
+
+         # Fire request event hooks
+         if "request" in self.event_hooks:
+             for req_hook in self.event_hooks["request"]:
+                 req_hook(request)
+
+         httpx_kwargs = request.to_httpx_kwargs(token=self.api_token)
+         session_id = request.api_params.session_id
+
+         if r_timeout is not USE_CLIENT_DEFAULT:
+             httpx_kwargs["timeout"] = r_timeout
+         if extensions is not None:
+             httpx_kwargs["extensions"] = extensions
+
+         try:
+             for attempt in range(self.max_retries + 1):
+                 try:
+                     raw_resp = self._http_client.request(**httpx_kwargs)
+                     scrape_response = ScrapeDoResponse(request, raw_resp)
+
+                     # Strictly aligned with Scrape.do documented gateway errors
+                     is_retryable_status = (
+                         raw_resp.status_code in (429, 502, 510)
+                     )
+
+                     if scrape_response.is_proxy_error and is_retryable_status:
+                         if attempt < self.max_retries:
+
+                             # Fire retry hook and pass the response
+                             if "retry" in self.event_hooks:
+                                 for retry_hook in self.event_hooks["retry"]:
+                                     retry_hook(
+                                         attempt,
+                                         request,
+                                         scrape_response,
+                                         None
+                                     )
+
+                             if callable(self.retry_backoff):
+                                 time.sleep(self.retry_backoff(attempt))
+                             else:
+                                 time.sleep(float(self.retry_backoff))
+                             continue
+
+                         # If attempt == max_retries, fall through
+                         # to return the failed ScrapeDoResponse to the user.
+
+                     # Call the validator if session_id is not None
+                     if (
+                         session_validator is not None
+                         and session_id is not None
+                     ):
+                         # Raise an exception if the validator returns True
+                         if session_validator(scrape_response):
+                             raise RotatedSessionError(
+                                 (
+                                     f"User-Defined Session Validator Failed | "
+                                     f"Status: {raw_resp.status_code}"
+                                 ),
+                                 raw_resp,
+                                 request,
+                                 scrape_response
+                             )
+
+                     # Fires on a success, OR on a final retryable error once
+                     # retries are exhausted.
+                     if "response" in self.event_hooks:
+                         for resp_hook in self.event_hooks["response"]:
+                             resp_hook(scrape_response)
+
+                     return scrape_response
+
+                 except RequestError as e:
+                     if attempt == self.max_retries:
+                         raise APIConnectionError(
+                             f"Network transport failed: {str(e)}",
+                             request
+                         ) from e
+
+                     # Fire retry hook and pass the exception
+                     if "retry" in self.event_hooks:
+                         for retry_hook in self.event_hooks["retry"]:
+                             retry_hook(
+                                 attempt,
+                                 request,
+                                 None,
+                                 e
+                             )
+
+                     if callable(self.retry_backoff):
+                         time.sleep(self.retry_backoff(attempt))
+                     else:
+                         time.sleep(float(self.retry_backoff))
+
+             # Only reachable when max_retries < 0 (the loop never runs)
+             raise RuntimeError(
+                 "Execution loop exhausted without returning a response."
+             )
+         finally:
+             # Prevent cookie bleed between requests
+             self._http_client.cookies.clear()
+
+     def execute_from_url(
+         self,
+         method: HttpMethod,
+         full_url: str,
+         headers: Optional[Dict[str, str]] = None,
+         body: Optional[Union[Dict[str, Any], str, bytes]] = None,
+         payload_type: PayloadType = "json",
+         session_validator: Optional[SyncSessionValidator] = None,
+         *,
+         r_timeout: Union[TimeoutTypes, UseClientDefault] = USE_CLIENT_DEFAULT,
+         extensions: Optional[RequestExtensions] = None
+     ) -> ScrapeDoResponse:
+         """Executes a request using a raw, pre-configured `api.scrape.do` URL.
+
+         tip: Intended Usage
+             This method is designed for scenarios where you have generated a
+             Scrape.do URL elsewhere and simply need to execute it. It parses
+             the URL to extract and validate the parameters, then passes the
+             resulting `PreparedScrapeDoRequest` to the `execute` method.
+
+         info: URL Format
+             The `api.scrape.do` URL can be either URL-encoded or not. In
+             both cases, its parameters are extracted and properly re-encoded
+             before the request is sent.
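+
+         example: Executing a Pre-Built URL
+             A sketch; the target URL and parameters are illustrative:
+
+             ```python
+             url = (
+                 "https://api.scrape.do/"
+                 "?url=https%3A%2F%2Fexample.com&render=true"
+             )
+             response = client.execute_from_url("GET", url)
+             ```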
+
+         Args:
+             method (HttpMethod): The HTTP method to forward to the target
+                 website.
+             full_url (str): The complete, pre-formatted `api.scrape.do`
+                 endpoint.
+             headers (Optional[Dict[str, str]]): Custom HTTP headers to
+                 forward to the target.
+             body (Optional[Union[Dict[str, Any], str, bytes]]): The payload
+                 to send to the target website.
+             payload_type (PayloadType): Dictates how the client encodes the
+                 `body` (e.g., 'json', 'data').
+             session_validator (Optional[SyncSessionValidator]): A custom
+                 function called to determine whether or not to raise a
+                 `RotatedSessionError` exception. (See the
+                 `ScrapeDoClient.execute` docstring for more information.)
+             r_timeout (Union[TimeoutTypes, UseClientDefault]): A
+                 request-specific timeout override.
+             extensions (Optional[RequestExtensions]): Advanced HTTPX
+                 extensions.
+
+         Raises:
+             APIConnectionError: If the underlying network transport drops
+                 entirely (e.g., DNS failure).
+             RotatedSessionError: If a `session_validator` is provided, the
+                 request was made with a `session_id` argument, and the
+                 `session_validator` returned `True`.
+
+         Returns:
+             The `ScrapeDoResponse` object containing the target's data.
+         """
+         req = PreparedScrapeDoRequest(
+             api_params=RequestParameters.from_url(full_url),
+             method=method,
+             headers=headers,
+             body=body,
+             payload_type=payload_type
+         )
+         return self.execute(
+             req,
+             session_validator,
+             r_timeout=r_timeout,
+             extensions=extensions
+         )
+
+     def request(
+         self,
+         method: HttpMethod,
+         target_url: str,
+         params: Optional[RequestParameters] = None,
+         session_validator: Optional[SyncSessionValidator] = None,
+         *,
+         headers: Optional[Dict[str, str]] = None,
+         body: Optional[Union[Dict[str, Any], str, bytes]] = None,
+         payload_type: PayloadType = "json",
+         r_timeout: Union[TimeoutTypes, UseClientDefault] = USE_CLIENT_DEFAULT,
+         extensions: Optional[RequestExtensions] = None,
+         **api_kwargs: Unpack[RequestParametersDict]
+     ) -> ScrapeDoResponse:
+         """Interface for building and executing a Scrape.do request.
+
+         Depending on the parameter configuration, it either constructs a
+         `PreparedScrapeDoRequest` object and passes it to the `execute`
+         method, or calls the `execute_from_url` method on the `target_url`.
+
+         info: Parameter Configuration
+             This method provides smart routing based on the arguments
+             provided. You can configure the request in three distinct ways,
+             as shown in the example below:
+
+             - **Keyword Arguments (Default):** Pass the target URL and
+               Scrape.do parameters directly as `**api_kwargs`
+               (`render=True`, `geoCode="us"`).
+
+             - **Pre-built Parameters:** Pass a fully validated
+               `RequestParameters` object via the `params` argument.
+
+             - **Raw Scrape.do URL:** Pass a full `api.scrape.do` URL as the
+               `target_url`.
+
+         warning: Parameter Restrictions
+             To prevent silent overwrites and routing ambiguity, the client
+             enforces that only one of the parameter configurations can be
+             used at a time.
+
+             - When using the default **Keyword Arguments** (`**api_kwargs`)
+               configuration, passing a value to the `params` argument, or
+               an `api.scrape.do` URL to the `target_url` argument, will
+               raise a `ValueError`.
+
+             - When using the **Pre-built Parameters** (`params`)
+               configuration, passing any `**api_kwargs` argument, or an
+               `api.scrape.do` URL to the `target_url` argument, will raise
+               a `ValueError`.
+
+             - When using the **Raw Scrape.do URL** configuration, passing
+               any `**api_kwargs` argument, or a value to the `params`
+               argument, will raise a `ValueError`.
+
+         warning: Pre-built Parameters Configuration
+             When passing an already constructed `RequestParameters` instance
+             to the `params` argument, its `url` attribute will be ignored
+             and replaced by the provided `target_url`.
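+
+         example: The Three Configurations
+             Sketches of each routing style; URLs and parameter values are
+             illustrative:
+
+             ```python
+             # 1) Keyword arguments
+             client.request("GET", "https://example.com", render=True)
+
+             # 2) Pre-built parameters (`params.url` is replaced by
+             #    the provided target_url)
+             params = RequestParameters.model_validate(
+                 {"url": "https://example.com", "render": True}
+             )
+             client.request("GET", "https://example.com", params=params)
+
+             # 3) Raw Scrape.do URL as the single source of truth
+             raw = "https://api.scrape.do/?url=https%3A%2F%2Fexample.com"
+             client.request("GET", raw)
+             ```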
+
+         Args:
+             method (HttpMethod): The HTTP method to forward to the target
+                 website.
+             target_url (str): The destination website URL
+                 (or a raw Scrape.do endpoint).
+             params (Optional[RequestParameters]): A pre-validated parameter
+                 object.
+             session_validator (Optional[SyncSessionValidator]): A custom
+                 function called to determine whether or not to raise a
+                 `RotatedSessionError` exception. (See the
+                 `ScrapeDoClient.execute` docstring for more information.)
+             headers (Optional[Dict[str, str]]): Custom HTTP headers to
+                 forward to the target.
+             body (Optional[Union[Dict[str, Any], str, bytes]]): The payload
+                 to send to the target website.
+             payload_type (PayloadType): Dictates how the client encodes the
+                 `body`.
+             r_timeout (Union[TimeoutTypes, UseClientDefault]):
+                 Request-specific timeout override.
+             extensions (Optional[RequestExtensions]): Advanced HTTPX
+                 extensions.
+             **api_kwargs (Unpack[RequestParametersDict]): Scrape.do API
+                 configuration parameters (e.g., `render=True`).
+
+         Returns:
+             The `ScrapeDoResponse` object containing the target's data.
+
+         Raises:
+             ValueError: If configuration constraints are violated.
+             APIConnectionError: If the underlying network transport drops
+                 entirely (e.g., DNS failure).
+             RotatedSessionError: If a `session_validator` is provided, the
+                 request was made with a `session_id` argument, and the
+                 `session_validator` returned `True`.
+         """
+         if "api.scrape.do" in target_url.lower():
+             if params is not None or api_kwargs:
+                 raise ValueError((
+                     "You provided a raw api.scrape.do URL but also provided "
+                     "additional parameters. When using a raw Scrape.do URL, "
+                     "it must be the single source of truth. Please remove "
+                     "the kwargs/params or pass the target URL instead."
+                 ))
+             return self.execute_from_url(
+                 method,
+                 target_url,
+                 headers,
+                 body,
+                 payload_type,
+                 session_validator,
+                 r_timeout=r_timeout,
+                 extensions=extensions
+             )
+
+         if params is not None and api_kwargs:
+             raise ValueError(
+                 "You cannot provide both a 'RequestParameters' object and "
+                 "explicit **api_kwargs. Choose one method of configuration."
+             )
+
+         if params is None:
+             params = RequestParameters.model_validate(
+                 {"url": target_url, **api_kwargs})
+         else:
+             params.url = HttpUrl(target_url)
+
+         req = PreparedScrapeDoRequest(
+             api_params=params,
+             method=method,
+             headers=headers,
+             body=body,
+             payload_type=payload_type
+         )
+         return self.execute(
+             req,
+             session_validator,
+             r_timeout=r_timeout,
+             extensions=extensions
+         )
+
+     # --- Method Wrappers ---
+
+     def get(
+         self,
+         url: str,
+         params: Optional[RequestParameters] = None,
+         session_validator: Optional[SyncSessionValidator] = None,
+         *,
+         headers: Optional[Dict[str, str]] = None,
+         r_timeout: Union[TimeoutTypes, UseClientDefault] = USE_CLIENT_DEFAULT,
+         extensions: Optional[RequestExtensions] = None,
+         **api_kwargs: Unpack[RequestParametersDict]
+     ) -> ScrapeDoResponse:
+         """Wrapper for executing a GET request.
+
+         Inherits the smart routing logic, parameter validation, and
+         execution constraints of the base
+         [request][scrape_do.client.ScrapeDoClient.request] method.
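+
+         example:
+             A one-line sketch; `geoCode` is forwarded as a Scrape.do API
+             parameter:
+
+             ```python
+             response = client.get("https://example.com", geoCode="us")
+             ```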
+
+         Args:
+             url (str): The target website URL (or a raw Scrape.do URL).
+             params (Optional[RequestParameters]): A pre-validated parameter
+                 object.
+             session_validator (Optional[SyncSessionValidator]): A custom
+                 function called to determine whether or not to raise a
+                 `RotatedSessionError` exception. (See the
+                 `ScrapeDoClient.execute` docstring for more information.)
+             headers (Optional[Dict[str, str]]): Custom HTTP headers to
+                 forward.
+             r_timeout (Union[TimeoutTypes, UseClientDefault]):
+                 Request-specific timeout override.
+             extensions (Optional[RequestExtensions]): Advanced HTTPX
+                 extensions.
+             **api_kwargs (Unpack[RequestParametersDict]): Scrape.do API
+                 configuration parameters.
+
+         Raises:
+             ValueError: If configuration constraints are violated.
+             APIConnectionError: If the underlying network transport drops
+                 entirely (e.g., DNS failure).
+             RotatedSessionError: If a `session_validator` is provided, the
+                 request was made with a `session_id` argument, and the
+                 `session_validator` returned `True`.
+
+         Returns:
+             The `ScrapeDoResponse` object containing the target's data.
+         """
+         return self.request(
+             "GET",
+             url,
+             params=params,
+             session_validator=session_validator,
+             headers=headers,
+             r_timeout=r_timeout,
+             extensions=extensions,
+             **api_kwargs
+         )
+
+     def post(
+         self,
+         url: str,
+         params: Optional[RequestParameters] = None,
+         session_validator: Optional[SyncSessionValidator] = None,
+         *,
+         body: Optional[Union[Dict[str, Any], str, bytes]] = None,
+         headers: Optional[Dict[str, str]] = None,
+         payload_type: PayloadType = "json",
+         r_timeout: Union[TimeoutTypes, UseClientDefault] = USE_CLIENT_DEFAULT,
+         extensions: Optional[RequestExtensions] = None,
+         **api_kwargs: Unpack[RequestParametersDict]
+     ) -> ScrapeDoResponse:
+         """Wrapper for executing a POST request.
+
+         Inherits the smart routing logic, parameter validation, and
+         execution constraints of the base
+         [request][scrape_do.client.ScrapeDoClient.request] method.
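+
+         example:
+             A sketch posting a JSON body; the endpoint and payload are
+             illustrative:
+
+             ```python
+             response = client.post(
+                 "https://example.com/search",
+                 body={"query": "laptops"},
+                 payload_type="json",
+             )
+             ```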
+
+         Args:
+             url (str): The target website URL (or a raw Scrape.do URL).
+             params (Optional[RequestParameters]): A pre-validated parameter
+                 object.
+             session_validator (Optional[SyncSessionValidator]): A custom
+                 function called to determine whether or not to raise a
+                 `RotatedSessionError` exception. (See the
+                 `ScrapeDoClient.execute` docstring for more information.)
+             body (Optional[Union[Dict[str, Any], str, bytes]]): The payload
+                 to send to the target website.
+             headers (Optional[Dict[str, str]]): Custom HTTP headers to
+                 forward.
+             payload_type (PayloadType): Dictates how the client encodes the
+                 `body`.
+             r_timeout (Union[TimeoutTypes, UseClientDefault]):
+                 Request-specific timeout override.
+             extensions (Optional[RequestExtensions]): Advanced HTTPX
+                 extensions.
+             **api_kwargs (Unpack[RequestParametersDict]): Scrape.do API
+                 configuration parameters.
+
+         Raises:
+             ValueError: If configuration constraints are violated.
+             APIConnectionError: If the underlying network transport drops
+                 entirely (e.g., DNS failure).
+             RotatedSessionError: If a `session_validator` is provided, the
+                 request was made with a `session_id` argument, and the
+                 `session_validator` returned `True`.
+
+         Returns:
+             The `ScrapeDoResponse` object containing the target's data.
+         """
+         return self.request(
+             "POST",
+             url,
+             params=params,
+             session_validator=session_validator,
+             headers=headers,
+             body=body,
+             payload_type=payload_type,
+             r_timeout=r_timeout,
+             extensions=extensions,
+             **api_kwargs
+         )