scrape_do_python-0.1.0-py3-none-any.whl
- scrape_do/__init__.py +0 -0
- scrape_do/abc.py +0 -0
- scrape_do/async_client.py +0 -0
- scrape_do/client.py +804 -0
- scrape_do/constants.py +84 -0
- scrape_do/exceptions.py +238 -0
- scrape_do/models/__init__.py +79 -0
- scrape_do/models/browser_actions.py +332 -0
- scrape_do/models/enums.py +76 -0
- scrape_do/models/parameters.py +840 -0
- scrape_do/models/request.py +232 -0
- scrape_do/models/response.py +890 -0
- scrape_do/namespaces/__init__.py +0 -0
- scrape_do/namespaces/amazon.py +0 -0
- scrape_do/namespaces/google.py +0 -0
- scrape_do/namespaces/jobs.py +0 -0
- scrape_do_python-0.1.0.dist-info/METADATA +134 -0
- scrape_do_python-0.1.0.dist-info/RECORD +21 -0
- scrape_do_python-0.1.0.dist-info/WHEEL +5 -0
- scrape_do_python-0.1.0.dist-info/licenses/LICENSE +21 -0
- scrape_do_python-0.1.0.dist-info/top_level.txt +1 -0
scrape_do/client.py
ADDED
@@ -0,0 +1,804 @@
"""Synchronous HTTP client for the Scrape.do API.

Defines the primary `ScrapeDoClient` used for executing proxy
requests. Handles automatic error routing, customizable retry strategies,
telemetry tracking, and secure, isolated connection pooling.
"""

import os
import time
import random
import logging
import ssl
from pydantic import HttpUrl
from httpx import (
    Client,
    Limits,
    BaseTransport,
    RequestError
)
from httpx._config import DEFAULT_LIMITS
from httpx._types import (
    TimeoutTypes,
    CertTypes,
    RequestExtensions
)
from httpx._client import (
    UseClientDefault,
    USE_CLIENT_DEFAULT
)

from typing import (
    Dict,
    List,
    Optional,
    Self,
    Any,
    Union,
    Unpack,
    Callable,
    Literal,
    TypeAlias,
    TypedDict
)
from types import TracebackType
from .models import (
    RequestParameters,
    PreparedScrapeDoRequest,
    ScrapeDoResponse,
    PayloadType,
    HttpMethod,
    RequestParametersDict
)
from .exceptions import APIConnectionError, RotatedSessionError


logger = logging.getLogger("scrape_do")


# --- Type Definitions ---

SyncSessionValidator: TypeAlias = Callable[[ScrapeDoResponse], bool]
"""
Defines the expected signature of the custom function meant to be passed
to the `ScrapeDoClient.execute` method's `session_validator` argument.
"""


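# A minimal sketch of a conforming validator (illustrative only, not part
# of the public API): it reuses the `is_proxy_error` flag that `execute`
# itself consults; a real validator would usually inspect response content
# for signs that the target dropped the session (e.g., a login page).
def _example_session_validator(resp: ScrapeDoResponse) -> bool:
    return resp.is_proxy_error

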
class SyncClientEventHooks(TypedDict, total=False):
    """
    Configuration dictionary for SDK-native lifecycle hooks.

    Unlike native HTTPX event hooks which fire on every transport-level
    execution (and can corrupt telemetry during automatic retries), these SDK
    hooks map cleanly to the logical request lifecycle.
    """

    request: List[
        Callable[[PreparedScrapeDoRequest], None]
    ]
    """
    Fires exactly once per logical execution, immediately before the retry
    loop begins. Receives the `PreparedScrapeDoRequest` object that will be
    used to execute the request. Useful for logging the request being
    executed.
    """
    response: List[
        Callable[[ScrapeDoResponse], None]
    ]
    """
    Fires exactly once per logical execution, immediately after the proxy
    returns a response and the `session_validator` (if any) passes.
    Receives the request's `ScrapeDoResponse` object. Useful for
    logging only the final response after all retries, which can be either
    a successful response, a non-retryable error, or a final retryable error
    after `max_retries` has been exhausted.
    """
    retry: List[
        Callable[
            [
                int,
                PreparedScrapeDoRequest,
                Optional[ScrapeDoResponse],
                Optional[Exception]
            ],
            None
        ]
    ]
    """
    Fires inside the execution loop ONLY when a proxy gateway error
    (or an httpx.RequestError) occurs and the SDK decides to retry. Receives
    the current attempt number, the prepared request, and either the failed
    response (if it exists) or the `httpx.RequestError` that caused the retry.
    Useful for tracking proxy instability or manually raising an exception to
    abort the retry loop.
    """


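# A minimal sketch wiring all three SDK hooks to the module logger
# (illustrative only; the log messages are assumptions, not SDK behaviour):
def _example_event_hooks() -> SyncClientEventHooks:
    def on_request(req: PreparedScrapeDoRequest) -> None:
        logger.debug("dispatching prepared request: %r", req)

    def on_response(resp: ScrapeDoResponse) -> None:
        logger.debug("final response: %r", resp)

    def on_retry(
        attempt: int,
        req: PreparedScrapeDoRequest,
        resp: Optional[ScrapeDoResponse],
        exc: Optional[Exception]
    ) -> None:
        # Exactly one of `resp` / `exc` is populated per the hook contract.
        logger.warning("retry #%d triggered by %r", attempt, resp or exc)

    return {
        "request": [on_request],
        "response": [on_response],
        "retry": [on_retry]
    }

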
# --- Client Default Backoff Strategy ---


def default_backoff_strategy(attempt: int) -> float:
    """Calculates a jittered exponential backoff for rate-limit retries.

    This is the default function used by the `ScrapeDoClient` to determine how
    long to wait before retrying a rate-limited request when the
    `retry_backoff` parameter is set to `None`.

    Args:
        attempt (int): The number of retries made so far, starting from 0.

    info: Additional Information
        The `jitter` here is a random number between 0.1 and 1 generated
        by the `random.uniform` function.

    Returns:
        The number of seconds to sleep, calculated as (2^attempt) + jitter.
    """

    return (2.0**attempt) + random.uniform(0.1, 1.0)


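# A minimal sketch of an alternative strategy (illustrative only): a linear
# ramp with no jitter, suitable for passing as the client's `retry_backoff`.
def _example_linear_backoff(attempt: int) -> float:
    # Sleeps 1.5s, 3.0s, 4.5s, ... for attempts 0, 1, 2, ...
    return 1.5 * (attempt + 1)

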
class ScrapeDoClient:
    """Synchronous HTTP client for executing Scrape.do API requests.

    Aims to facilitate interactions with the Scrape.do API by managing an
    `httpx.Client` instance to provide strict type-checking for request
    parameters, custom error parsing, and session tracking while keeping the
    network configurations as flexible as possible.

    abstract: Features
        - Local API parameter validation via the `RequestParameters` Pydantic
          model.

        - Status code error parsing and customizable retry intervals for
          rate-limited requests.

        - Strongly-typed interface for responses via the `ScrapeDoResponse`
          Pydantic model.

    info: Concurrency Limit and Server Errors
        This client intercepts and manages Scrape.do's specific gateway errors
        (429, 502, 510), automatically applying a customizable retry strategy
        before the error can reach the application.

    tip: SDK Event Hooks (`event_hooks`)
        This client implements SDK-specific event hooks mimicking the
        structure of `httpx` native event hooks. See
        [`SyncClientEventHooks`][scrape_do.client.SyncClientEventHooks] for
        available lifecycle hooks and their required signatures.

    tip: Additional `httpx.Client` Configuration
        The following `httpx.Client` parameters can be provided as keyword
        arguments and will be passed directly to the underlying object.

        - `verify`
        - `cert`
        - `http1`
        - `http2`
        - `timeout`
        - `limits`
        - `transport`
        - `default_encoding`

        Additionally, the following `httpx.Client.request` parameters can be
        provided as keyword arguments during request execution.

        - `timeout` (`r_timeout`)
        - `extensions`

        For more information on their behaviour and default values, please
        consult the official
        [`httpx`](https://www.python-httpx.org/api/#client) documentation.

    warning: Unsupported HTTPX Client Arguments
        The underlying `httpx.Client` object is strictly managed by the
        instance to prevent invalid configurations from being sent to the
        Scrape.do API. For this reason, arguments not listed in the previous
        section are intentionally blocked and shouldn't be changed.

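    example: Instantiating the Client
        A minimal sketch (the token value is a placeholder):

            client = ScrapeDoClient(
                api_token="YOUR_TOKEN",
                max_retries=5,
                retry_backoff=2.0,
                http2=True
            )
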
    Args:
        api_token (Optional[str]): The Scrape.do API key. If omitted, the
            client will attempt to load it from the 'SCRAPE_DO_API_KEY'
            environment variable.
        max_retries (int): The maximum number of retry attempts for retryable
            Scrape.do gateway errors (HTTP 429, 502, and 510).
        retry_backoff (Union[float, Callable[[int], float]]): The strategy
            used to calculate the delay between retries. Can be a static
            `float` (seconds) or a callable that accepts the current attempt
            number (0-indexed) and returns a float. Defaults to a jittered
            exponential backoff when set to `None`.
        event_hooks (Optional[SyncClientEventHooks]): A dictionary of
            SDK-native hooks to execute during different points of the request
            lifecycle.
        verify (Union[ssl.SSLContext, str, bool]): Configures SSL certificate
            verification. Defaults to True (secure).
        cert (Optional[CertTypes]): Client-side certificates for mutual TLS
            authentication.
        http1 (bool): Enable HTTP/1.1 support.
        http2 (bool): Enable HTTP/2 multiplexing for higher concurrency.
        timeout (TimeoutTypes): The default timeout (in seconds) applied to
            all network phases. Defaults to 60s, raised from httpx's 5s
            default to accommodate Scrape.do proxy round-trips
            (browser rendering, geo-routing, fingerprinting).
        limits (Limits): Configuration for maximum connection pool sizes.
        transport (Optional[BaseTransport]): A completely custom transport
            engine.
        default_encoding (Union[str, Callable[[bytes], str]]): The fallback
            text encoding used if a target website omits a charset header.
    """
    def __init__(
        self,
        api_token: Optional[str] = None,
        max_retries: int = 3,
        retry_backoff: Optional[Union[float, Callable[[int], float]]] = None,
        event_hooks: Optional[SyncClientEventHooks] = None,
        *,
        verify: Union[ssl.SSLContext, str, bool] = True,
        cert: Optional[CertTypes] = None,
        http1: bool = True,
        http2: bool = False,
        timeout: TimeoutTypes = 60.0,
        limits: Limits = DEFAULT_LIMITS,
        transport: Optional[BaseTransport] = None,
        default_encoding: Union[str, Callable[[bytes], str]] = "utf-8"
    ) -> None:
        self.api_token = api_token or os.getenv("SCRAPE_DO_API_KEY")
        if not self.api_token:
            raise ValueError(
                "Scrape.do API token must be provided explicitly or set via"
                " the 'SCRAPE_DO_API_KEY' environment variable."
            )

        self.max_retries = max_retries

        if retry_backoff is not None:
            self.retry_backoff = retry_backoff
        else:
            self.retry_backoff = default_backoff_strategy

        self.event_hooks: SyncClientEventHooks = event_hooks or {}

        self._http_client = Client(
            verify=verify,
            cert=cert,
            trust_env=False,
            http1=http1,
            http2=http2,
            timeout=timeout,
            limits=limits,
            transport=transport,
            default_encoding=default_encoding
        )

    def close(self) -> None:
        """Closes the underlying HTTPX connection pool.

        It is recommended to use the client as a context manager to ensure
        resources are released automatically.
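
        example: Context-Managed Usage
            A minimal sketch (the token value is a placeholder):

                with ScrapeDoClient(api_token="YOUR_TOKEN") as client:
                    resp = client.get("https://example.com")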
"""
|
|
280
|
+
self._http_client.close()
|
|
281
|
+
|
|
282
|
+
def __enter__(self) -> Self:
|
|
283
|
+
"""Initializes the HTTPX connection pool and returns the context
|
|
284
|
+
manager object.
|
|
285
|
+
|
|
286
|
+
Returns:
|
|
287
|
+
The `ScrapeDoClient` instance with an opened HTTPX connection pool
|
|
288
|
+
"""
|
|
289
|
+
return self
|
|
290
|
+
|
|
291
|
+
def __exit__(
|
|
292
|
+
self,
|
|
293
|
+
exc_type: Optional[type[BaseException]],
|
|
294
|
+
exc_val: Optional[BaseException],
|
|
295
|
+
exc_tb: Optional[TracebackType]
|
|
296
|
+
) -> Literal[False]:
|
|
297
|
+
"""Calls the `close` method to close the underlying HTTPX connection
|
|
298
|
+
pool without swallowing any exceptions.
|
|
299
|
+
|
|
300
|
+
Args:
|
|
301
|
+
exc_type (Optional[type[BaseException]]): The type of the
|
|
302
|
+
exception.
|
|
303
|
+
exc_val (Optional[BaseException]): The instance of the exception.
|
|
304
|
+
exc_tb (Optional[TracebackType]): The traceback information.
|
|
305
|
+
|
|
306
|
+
Returns:
|
|
307
|
+
`False`, since no exceptions are swallowed
|
|
308
|
+
"""
|
|
309
|
+
self.close()
|
|
310
|
+
return False
|
|
311
|
+
|
|
312
|
+
def execute(
|
|
313
|
+
self,
|
|
314
|
+
request: PreparedScrapeDoRequest,
|
|
315
|
+
session_validator: Optional[SyncSessionValidator] = None,
|
|
316
|
+
*,
|
|
317
|
+
r_timeout: Union[TimeoutTypes, UseClientDefault] = USE_CLIENT_DEFAULT,
|
|
318
|
+
extensions: Optional[RequestExtensions] = None
|
|
319
|
+
) -> ScrapeDoResponse:
|
|
320
|
+
"""Executes a fully prepared and validated Scrape.do request.
|
|
321
|
+
|
|
322
|
+
Acts as the core execution funnel, applying the retry
|
|
323
|
+
backoff logic, evaluating gateway errors and sessions,
|
|
324
|
+
and isolating cookies between sequential executions.
|
|
325
|
+
|
|
326
|
+
tip: Intended Usage
|
|
327
|
+
Use this method if you have manually constructed a
|
|
328
|
+
`PreparedScrapeDoRequest` object for bulk routing,
|
|
329
|
+
custom configurations, or task reusability.
|
|
330
|
+
|
|
331
|
+
warning: Sessions (`sessionId`)
|
|
332
|
+
If you configure a request with a `session_id`, Scrape.do will
|
|
333
|
+
attempt to route your traffic through the same proxy address.
|
|
334
|
+
However, it can still silently rotate this address for various
|
|
335
|
+
reasons. If it rotates during a multi-step scraping task, any
|
|
336
|
+
target-specific WAF state or cookies accumulated will be lost,
|
|
337
|
+
which may cause the task to fail.
|
|
338
|
+
|
|
339
|
+
tip: Validating Sessions (`session_validator`)
|
|
340
|
+
- In order to prevent unexpected errors due to dropped sessions,
|
|
341
|
+
you can pass a custom function to the client's `execute` method
|
|
342
|
+
`session_validator` argument.
|
|
343
|
+
|
|
344
|
+
- This function will be called internally by the client after each
|
|
345
|
+
stateful request (`sessionId is not None`) to determine whether
|
|
346
|
+
or not a `RotatedSessionError` exception should be raised to
|
|
347
|
+
signal that this session is no longer valid.
|
|
348
|
+
|
|
349
|
+
- The function should take the current request's `ScrapeDoResponse`
|
|
350
|
+
object as its only argument, and return a single `bool` value.
|
|
351
|
+
|
|
352
|
+
- If the function evaluates to `True`, this method will raise the
|
|
353
|
+
`RotatedSessionError` instead of returning the response object.
|
|
354
|
+
(The request's `ScrapeDoResponse` object can still be accessed
|
|
355
|
+
later on using the exception's `response` attribute.) Otherwise,
|
|
356
|
+
no additional action is taken.
|
|
357
|
+
|
|
358
|
+
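        example: Executing with a Validator
            A minimal sketch (assumes `req` is a `PreparedScrapeDoRequest`
            built elsewhere with a `session_id` set; reusing the response's
            `is_proxy_error` flag as the validation rule is purely
            illustrative):

                resp = client.execute(
                    req,
                    session_validator=lambda r: r.is_proxy_error
                )
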
        Args:
            request (PreparedScrapeDoRequest): The validated request payload.
            r_timeout (Union[TimeoutTypes, UseClientDefault]): A
                request-specific timeout override.
            session_validator (Optional[SyncSessionValidator]): A custom
                function to be called in order to determine whether or not to
                raise a `RotatedSessionError` exception.
            extensions (Optional[RequestExtensions]): Advanced HTTPX
                extensions for this specific request.

        Returns:
            The `ScrapeDoResponse` object containing the target's data.

        Raises:
            APIConnectionError: If the underlying network transport drops
                entirely (e.g., DNS failure).
            RotatedSessionError: If a `session_validator` is provided, the
                request was made with a `session_id` argument, and the
                `session_validator` returned `True`.
        """

        # Fire Request Event Hooks
        if "request" in self.event_hooks:
            for req_hook in self.event_hooks["request"]:
                req_hook(request)

        httpx_kwargs = request.to_httpx_kwargs(token=self.api_token)
        session_id = request.api_params.session_id

        if r_timeout is not USE_CLIENT_DEFAULT:
            httpx_kwargs["timeout"] = r_timeout
        if extensions is not None:
            httpx_kwargs["extensions"] = extensions

        try:
            for attempt in range(self.max_retries + 1):
                try:
                    raw_resp = self._http_client.request(**httpx_kwargs)
                    scrape_response = ScrapeDoResponse(request, raw_resp)

                    # Strictly aligned with Scrape.do documented gateway errors
                    is_retryable_status = (
                        raw_resp.status_code in (429, 502, 510)
                    )

                    if scrape_response.is_proxy_error and is_retryable_status:
                        if attempt < self.max_retries:

                            # Fire retry hook and pass response
                            if "retry" in self.event_hooks:
                                for retry_hook in self.event_hooks["retry"]:
                                    retry_hook(
                                        attempt,
                                        request,
                                        scrape_response,
                                        None
                                    )

                            if callable(self.retry_backoff):
                                time.sleep(self.retry_backoff(attempt))
                            else:
                                time.sleep(float(self.retry_backoff))
                            continue

                        # If attempt == max_retries, fall through
                        # to return the failed ScrapeDoResponse to the user.

                    # Call validator if session_id is not None
                    if (
                        session_validator is not None
                        and session_id is not None
                    ):
                        # Raise exception if validator returns True
                        if session_validator(scrape_response):
                            raise RotatedSessionError(
                                (
                                    "User-Defined Session Validator Failed | "
                                    f"Status: {raw_resp.status_code}"
                                ),
                                raw_resp,
                                request,
                                scrape_response
                            )

                    # Fires on a success, OR on a final 502 if
                    # retries are exhausted.
                    if "response" in self.event_hooks:
                        for resp_hook in self.event_hooks["response"]:
                            resp_hook(scrape_response)

                    return scrape_response

                except RequestError as e:
                    if attempt == self.max_retries:
                        raise APIConnectionError(
                            f"Network transport failed: {str(e)}",
                            request
                        ) from e

                    # Fire retry hook and pass the exception
                    if "retry" in self.event_hooks:
                        for retry_hook in self.event_hooks["retry"]:
                            retry_hook(
                                attempt,
                                request,
                                None,
                                e
                            )

                    if callable(self.retry_backoff):
                        time.sleep(self.retry_backoff(attempt))
                    else:
                        time.sleep(float(self.retry_backoff))

            # Only reachable if max_retries < 0 left the loop body unentered.
            raise RuntimeError(
                "Execution loop exhausted without returning a response."
            )
        finally:
            # Prevent cookie bleed between requests
            self._http_client.cookies.clear()

    def execute_from_url(
        self,
        method: HttpMethod,
        full_url: str,
        headers: Optional[Dict[str, str]] = None,
        body: Optional[Union[Dict[str, Any], str, bytes]] = None,
        payload_type: PayloadType = "json",
        session_validator: Optional[SyncSessionValidator] = None,
        *,
        r_timeout: Union[TimeoutTypes, UseClientDefault] = USE_CLIENT_DEFAULT,
        extensions: Optional[RequestExtensions] = None
    ) -> ScrapeDoResponse:
        """Executes a request using a raw, pre-configured `api.scrape.do` URL.

        tip: Intended Usage
            This method is designed for scenarios where you have generated a
            Scrape.do URL elsewhere and simply need to execute it. It parses
            the URL to extract and validate the parameters, and then passes the
            `PreparedScrapeDoRequest` to the `execute` method.

        info: URL Format
            The `api.scrape.do` URL can be either URL-encoded or not. Both
            will have their parameters extracted and be properly re-encoded
            before the request is sent.

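        example: Executing a Pre-Built URL
            A minimal sketch (the token and target embedded in the URL are
            placeholders):

                resp = client.execute_from_url(
                    "GET",
                    "https://api.scrape.do/?token=TOKEN&url=https://example.com"
                )
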
        Args:
            method (HttpMethod): The HTTP method to forward to the target
                website.
            full_url (str): The complete, pre-formatted `api.scrape.do`
                endpoint.
            headers (Optional[Dict[str, str]]): Custom HTTP headers to forward
                to the target.
            body (Optional[Union[Dict[str, Any], str, bytes]]): The payload to
                send to the target website.
            payload_type (PayloadType): Dictates how the client encodes the
                `body` (e.g., 'json', 'data').
            session_validator (Optional[SyncSessionValidator]): A custom
                function to be called in order to determine whether or not to
                raise a `RotatedSessionError` exception. (See
                `ScrapeDoClient.execute` docstring for more information.)
            r_timeout (Union[TimeoutTypes, UseClientDefault]): A
                request-specific timeout override.
            extensions (Optional[RequestExtensions]): Advanced HTTPX
                extensions.

        Raises:
            APIConnectionError: If the underlying network transport drops
                entirely (e.g., DNS failure).
            RotatedSessionError: If a `session_validator` is provided, the
                request was made with a `session_id` argument, and the
                `session_validator` returned `True`.

        Returns:
            The `ScrapeDoResponse` object containing the target's data.
        """
        req = PreparedScrapeDoRequest(
            api_params=RequestParameters.from_url(full_url),
            method=method,
            headers=headers,
            body=body,
            payload_type=payload_type
        )
        return self.execute(
            req,
            session_validator,
            r_timeout=r_timeout,
            extensions=extensions
        )

    def request(
        self,
        method: HttpMethod,
        target_url: str,
        params: Optional[RequestParameters] = None,
        session_validator: Optional[SyncSessionValidator] = None,
        *,
        headers: Optional[Dict[str, str]] = None,
        body: Optional[Union[Dict[str, Any], str, bytes]] = None,
        payload_type: PayloadType = "json",
        r_timeout: Union[TimeoutTypes, UseClientDefault] = USE_CLIENT_DEFAULT,
        extensions: Optional[RequestExtensions] = None,
        **api_kwargs: Unpack[RequestParametersDict]
    ) -> ScrapeDoResponse:
        """Interface for building and executing a Scrape.do request.

        Depending on the parameter configuration it either constructs a
        `PreparedScrapeDoRequest` object and passes it to the
        `execute` method, or calls the `execute_from_url` method on
        the `target_url`.

        info: Parameter Configuration
            This method provides smart routing based on the arguments provided.
            You can configure the request in three distinct ways (see the
            sketch below):

            - **Keyword Arguments (Default):** Pass the target URL and
              Scrape.do parameters directly as `**api_kwargs`
              (`render=True`, `geoCode="us"`).

            - **Pre-built Parameters:** Pass a fully validated
              `RequestParameters` object via the `params` argument.

            - **Raw Scrape.do URL:** Pass a full `api.scrape.do` URL as the
              `target_url`.

        warning: Parameter Restrictions
            To prevent silent overwrites and routing ambiguity, the client
            enforces that only one of the parameter configurations can be
            used at a time.

            - When using the default **Keyword Arguments** (`**api_kwargs`)
              configuration, passing a value to the `params` argument, or an
              `api.scrape.do` URL to the `target_url` argument, will raise a
              `ValueError`.

            - When using the **Pre-built Parameters** (`params`) configuration,
              passing any `**api_kwargs` argument, or an `api.scrape.do` URL
              to the `target_url` argument, will raise a `ValueError`.

            - When using the **Raw Scrape.do URL** configuration, passing any
              `**api_kwargs` argument, or a value to the `params` argument,
              will raise a `ValueError`.

        warning: Pre-built Parameters Configuration
            When passing an already constructed `RequestParameters` instance
            to the `params` argument, its `url` attribute will be ignored and
            replaced by the provided `target_url`.

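        example: Keyword-Argument Configuration
            A minimal sketch using the default configuration mode (the
            kwargs mirror the `render=True` / `geoCode="us"` parameters
            named above; the target URL is a placeholder):

                resp = client.request(
                    "GET",
                    "https://example.com",
                    render=True,
                    geoCode="us"
                )
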
        Args:
            method (HttpMethod): The HTTP method to forward to the target
                website.
            target_url (str): The destination website URL
                (or a raw Scrape.do endpoint).
            params (Optional[RequestParameters]): A pre-validated parameter
                object.
            session_validator (Optional[SyncSessionValidator]): A custom
                function to be called in order to determine whether or not to
                raise a `RotatedSessionError` exception. (See
                `ScrapeDoClient.execute` docstring for more information.)
            headers (Optional[Dict[str, str]]): Custom HTTP headers to forward
                to the target.
            body (Optional[Union[Dict[str, Any], str, bytes]]): The payload to
                send to the target website.
            payload_type (PayloadType): Dictates how the client encodes the
                `body`.
            r_timeout (Union[TimeoutTypes, UseClientDefault]): Request-specific
                timeout override.
            extensions (Optional[RequestExtensions]): Advanced HTTPX
                extensions.
            **api_kwargs (Unpack[RequestParametersDict]): Scrape.do API
                configuration parameters (e.g., `render=True`).

        Returns:
            The `ScrapeDoResponse` object containing the target's data.

        Raises:
            ValueError: If configuration constraints are violated.
            APIConnectionError: If the underlying network transport drops
                entirely (e.g., DNS failure).
            RotatedSessionError: If a `session_validator` is provided, the
                request was made with a `session_id` argument, and the
                `session_validator` returned `True`.
        """
        if "api.scrape.do" in target_url.lower():
            if params is not None or api_kwargs:
                raise ValueError(
                    "You provided a raw api.scrape.do URL but also provided "
                    "additional parameters. When using a raw Scrape.do URL, "
                    "it must be the single source of truth. Please remove the "
                    "kwargs/params or pass the target URL instead."
                )
            return self.execute_from_url(
                method,
                target_url,
                headers,
                body,
                payload_type,
                session_validator,
                r_timeout=r_timeout,
                extensions=extensions
            )

        if params is not None and api_kwargs:
            raise ValueError(
                "You cannot provide both a 'RequestParameters' object and "
                "explicit **api_kwargs. Choose one method of configuration."
            )

        if params is None:
            params = RequestParameters.model_validate(
                {"url": target_url, **api_kwargs})
        else:
            params.url = HttpUrl(target_url)

        req = PreparedScrapeDoRequest(
            api_params=params,
            method=method,
            headers=headers,
            body=body,
            payload_type=payload_type
        )
        return self.execute(
            req,
            session_validator,
            r_timeout=r_timeout,
            extensions=extensions
        )

    # --- Method Wrappers ---

    def get(
        self,
        url: str,
        params: Optional[RequestParameters] = None,
        session_validator: Optional[SyncSessionValidator] = None,
        *,
        headers: Optional[Dict[str, str]] = None,
        r_timeout: Union[TimeoutTypes, UseClientDefault] = USE_CLIENT_DEFAULT,
        extensions: Optional[RequestExtensions] = None,
        **api_kwargs: Unpack[RequestParametersDict]
    ) -> ScrapeDoResponse:
        """Wrapper for executing a GET request.

        Inherits the smart routing logic, parameter validation, and execution
        constraints of the base
        [request][scrape_do.client.ScrapeDoClient.request] method.

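        example: Simple GET
            A minimal sketch (the target URL is a placeholder; `render` is
            one of the `RequestParametersDict` kwargs):

                resp = client.get("https://example.com", render=True)
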
        Args:
            url (str): The target website URL (or raw Scrape.do URL).
            params (Optional[RequestParameters]): A pre-validated parameter
                object.
            session_validator (Optional[SyncSessionValidator]): A custom
                function to be called in order to determine whether or not to
                raise a `RotatedSessionError` exception. (See
                `ScrapeDoClient.execute` docstring for more information.)
            headers (Optional[Dict[str, str]]): Custom HTTP headers to forward.
            r_timeout (Union[TimeoutTypes, UseClientDefault]): Request-specific
                timeout override.
            extensions (Optional[RequestExtensions]): Advanced HTTPX
                extensions.
            **api_kwargs (Unpack[RequestParametersDict]): Scrape.do API
                configuration parameters.

        Raises:
            ValueError: If configuration constraints are violated.
            APIConnectionError: If the underlying network transport drops
                entirely (e.g., DNS failure).
            RotatedSessionError: If a `session_validator` is provided, the
                request was made with a `session_id` argument, and the
                `session_validator` returned `True`.

        Returns:
            The `ScrapeDoResponse` object containing the target's data.
        """
        return self.request(
            "GET",
            url,
            params=params,
            session_validator=session_validator,
            headers=headers,
            r_timeout=r_timeout,
            extensions=extensions,
            **api_kwargs
        )

    def post(
        self,
        url: str,
        params: Optional[RequestParameters] = None,
        session_validator: Optional[SyncSessionValidator] = None,
        *,
        body: Optional[Union[Dict[str, Any], str, bytes]] = None,
        headers: Optional[Dict[str, str]] = None,
        payload_type: PayloadType = "json",
        r_timeout: Union[TimeoutTypes, UseClientDefault] = USE_CLIENT_DEFAULT,
        extensions: Optional[RequestExtensions] = None,
        **api_kwargs: Unpack[RequestParametersDict]
    ) -> ScrapeDoResponse:
        """Wrapper for executing a POST request.

        Inherits the smart routing logic, parameter validation, and execution
        constraints of the base
        [request][scrape_do.client.ScrapeDoClient.request] method.

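        example: JSON POST
            A minimal sketch (the endpoint and payload are placeholders):

                resp = client.post(
                    "https://example.com/api",
                    body={"q": "test"},
                    payload_type="json"
                )
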
        Args:
            url (str): The target website URL (or raw Scrape.do URL).
            params (Optional[RequestParameters]): A pre-validated parameter
                object.
            session_validator (Optional[SyncSessionValidator]): A custom
                function to be called in order to determine whether or not to
                raise a `RotatedSessionError` exception. (See
                `ScrapeDoClient.execute` docstring for more information.)
            body (Optional[Union[Dict[str, Any], str, bytes]]): The payload to
                send to the target website.
            headers (Optional[Dict[str, str]]): Custom HTTP headers to forward.
            payload_type (PayloadType): Dictates how the client encodes the
                `body`.
            r_timeout (Union[TimeoutTypes, UseClientDefault]): Request-specific
                timeout override.
            extensions (Optional[RequestExtensions]): Advanced HTTPX
                extensions.
            **api_kwargs (Unpack[RequestParametersDict]): Scrape.do API
                configuration parameters.

        Raises:
            ValueError: If configuration constraints are violated.
            APIConnectionError: If the underlying network transport drops
                entirely (e.g., DNS failure).
            RotatedSessionError: If a `session_validator` is provided, the
                request was made with a `session_id` argument, and the
                `session_validator` returned `True`.

        Returns:
            The `ScrapeDoResponse` object containing the target's data.
        """
        return self.request(
            "POST",
            url,
            params=params,
            session_validator=session_validator,
            headers=headers,
            body=body,
            payload_type=payload_type,
            r_timeout=r_timeout,
            extensions=extensions,
            **api_kwargs
        )