scrape-do-python 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrape_do/__init__.py +0 -0
- scrape_do/abc.py +0 -0
- scrape_do/async_client.py +0 -0
- scrape_do/client.py +804 -0
- scrape_do/constants.py +84 -0
- scrape_do/exceptions.py +238 -0
- scrape_do/models/__init__.py +79 -0
- scrape_do/models/browser_actions.py +332 -0
- scrape_do/models/enums.py +76 -0
- scrape_do/models/parameters.py +840 -0
- scrape_do/models/request.py +232 -0
- scrape_do/models/response.py +890 -0
- scrape_do/namespaces/__init__.py +0 -0
- scrape_do/namespaces/amazon.py +0 -0
- scrape_do/namespaces/google.py +0 -0
- scrape_do/namespaces/jobs.py +0 -0
- scrape_do_python-0.1.0.dist-info/METADATA +134 -0
- scrape_do_python-0.1.0.dist-info/RECORD +21 -0
- scrape_do_python-0.1.0.dist-info/WHEEL +5 -0
- scrape_do_python-0.1.0.dist-info/licenses/LICENSE +21 -0
- scrape_do_python-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,890 @@
|
|
|
1
|
+
"""Custom data models for the Scrape.do's API HTTP response
|
|
2
|
+
|
|
3
|
+
Encapsulates the httpx.Response object to provide a strongly-typed interface
|
|
4
|
+
for the response data sent back by the Scrape.do API. It parses nested JSON
|
|
5
|
+
payloads, extracts proxy telemetry, and attempts to determine whether non-2xx
|
|
6
|
+
responses are coming from the target website, or from Scrape.do's gateway
|
|
7
|
+
failures.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
import os
|
|
13
|
+
import base64
|
|
14
|
+
import re
|
|
15
|
+
import httpx
|
|
16
|
+
from functools import cached_property
|
|
17
|
+
from typing import (
|
|
18
|
+
Optional,
|
|
19
|
+
Union,
|
|
20
|
+
List,
|
|
21
|
+
Self,
|
|
22
|
+
Any,
|
|
23
|
+
Dict
|
|
24
|
+
)
|
|
25
|
+
from pydantic import (
|
|
26
|
+
BaseModel,
|
|
27
|
+
Field,
|
|
28
|
+
HttpUrl,
|
|
29
|
+
ConfigDict
|
|
30
|
+
)
|
|
31
|
+
from .request import PreparedScrapeDoRequest
|
|
32
|
+
from ..exceptions import (
|
|
33
|
+
APIResponseError,
|
|
34
|
+
TargetError,
|
|
35
|
+
BadRequestError,
|
|
36
|
+
ServerError,
|
|
37
|
+
AuthenticationError,
|
|
38
|
+
AuthenticationThrottleError,
|
|
39
|
+
RateLimitError
|
|
40
|
+
)
|
|
41
|
+
|
|
42
|
+
# -------------------------
|
|
43
|
+
# JSON Response Info Models
|
|
44
|
+
# -------------------------
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class ScrapeDoNetworkRequest(BaseModel):
    """Represents an intercepted HTTP network request made by the headless
    browser.

    When rendering JavaScript, the browser makes subsequent requests to fetch
    CSS, images, and background API data which Scrape.do returns in the
    `networkRequests` field when `returnJSON=true`

    Attributes:
        url (HttpUrl): The absolute URL of the requested resource.
        method (str): The HTTP method used (e.g., GET, POST).
        status (int): The HTTP status code returned by the resource server.
        request_headers (Dict[str, str]): The headers sent by the headless
            browser.
        request_body (Optional[str]): The payload sent with the request,
            if any.
        response_body (Optional[str]): The payload returned by the server,
            if captured.
        response_headers (Dict[str, str]): The headers returned by the
            resource server.
    """
    model_config = ConfigDict(populate_by_name=True)
    url: HttpUrl
    method: str
    status: int
    # The other models in this module alias every camelCase key of the
    # Scrape.do JSON payload (e.g. "payloadData", "requestId"); these fields
    # previously had no aliases and would silently stay at their defaults
    # when populated from camelCase JSON. `populate_by_name=True` keeps the
    # snake_case names working as well, so this is backward compatible.
    request_headers: Dict[str, str] = Field(
        alias="requestHeaders", default_factory=dict
    )
    request_body: Optional[str] = Field(alias="requestBody", default=None)
    response_body: Optional[str] = Field(alias="responseBody", default=None)
    response_headers: Dict[str, str] = Field(
        alias="responseHeaders", default_factory=dict
    )
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
class ScrapeDoWebSocketFrame(BaseModel):
    """A single WebSocket frame payload captured by the headless browser.

    Attributes:
        opcode (int): WebSocket frame operation code (1 = text, 2 = binary).
        mask (bool): Whether the payload data is masked.
        payload_data (str): The message content carried over the socket.
    """
    model_config = ConfigDict(populate_by_name=True)
    opcode: int
    mask: bool
    payload_data: str = Field(alias="payloadData")
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
class ScrapeDoWebSocketEvent(BaseModel):
    """Chrome DevTools Protocol (CDP) event metadata for a WebSocket message.

    Attributes:
        request_id (str): Unique identifier of this WebSocket connection.
        timestamp (float): Epoch timestamp at which the event fired.
        response (ScrapeDoWebSocketFrame): The underlying frame carrying the
            payload.
    """
    model_config = ConfigDict(populate_by_name=True)
    request_id: str = Field(alias="requestId")
    timestamp: float
    response: ScrapeDoWebSocketFrame
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
class ScrapeDoWebsocketRequest(BaseModel):
    """A complete WebSocket message intercepted while the page rendered.

    Attributes:
        type (str): Traffic direction (e.g., "sent" or "received").
        event (ScrapeDoWebSocketEvent): Raw DevTools Protocol event data.
    """
    model_config = ConfigDict(populate_by_name=True)
    type: str
    event: ScrapeDoWebSocketEvent

    @property
    def is_text(self) -> bool:
        """Whether this message carries readable text.

        Returns:
            `True` if the underlying frame opcode is 1 (Text).
        """
        # Opcode 1 marks a text frame in the WebSocket protocol.
        opcode = self.event.response.opcode
        return opcode == 1
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
class ScrapeDoActionResult(BaseModel):
    """The execution outcome of one programmatic browser action.

    Attributes:
        action (str): Name of the executed action (e.g., "Click", "Wait").
        index (int): Position of this action in the submitted action array.
        success (bool): Whether the action completed without throwing.
        error (Optional[str]): Failure message when the action did not
            succeed.
        response (Optional[Union[Dict[str, Any], str]]): Data produced by
            the action, typically populated when `ExecuteAction` runs
            custom JavaScript.
    """
    model_config = ConfigDict(populate_by_name=True)
    action: str
    index: int
    success: bool
    error: Optional[str] = None
    response: Optional[Union[Dict[str, Any], str]] = None
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
class ScrapeDoScreenshot(BaseModel):
    """Represents a captured screenshot generated during the scraping process.

    Attributes:
        screenshot_type (str): The configuration used (e.g., "FullScreenShot").
        b64_image (Optional[str]): The Base64 encoded string of the PNG image
            data.
        error (Optional[str]): The failure reason if the screenshot could not
            be captured.
    """
    model_config = ConfigDict(populate_by_name=True)
    screenshot_type: str = Field(alias="type")
    b64_image: Optional[str] = Field(alias="image", default=None)
    error: Optional[str] = None

    def to_bytes(self) -> bytes:
        """
        Convenience method to convert the `b64_image` string into a bytes
        object using the `base64` standard python library

        Raises:
            ValueError: If the instance's `b64_image` attribute is empty

        Returns:
            bytes object returned by `base64.b64decode(b64_image)`
        """
        if not self.b64_image:
            raise ValueError(
                f"No image data was found in the screenshot response | "
                f"Screenshot Type: {self.screenshot_type} | "
                f"Error String: {self.error}"
            )

        return base64.b64decode(self.b64_image)

    def to_file(self, path: Union[str, os.PathLike]) -> Path:
        """
        Convenience method to save the base64-encoded screenshot

        warning: File Type
            Scrape.do returns base64-encoded `.png` image data, so `path`
            should end in `/file_name.png`

        Args:
            path (Union[str, os.PathLike]): Image file will be saved to this
                path

        Returns:
            resolved `pathlib.Path` object of the `path` parameter
        """
        r_path = Path(path).resolve()
        # `Path.write_bytes` handles opening/closing the file for us and
        # writes in binary mode, replacing the manual open()/write() pair.
        r_path.write_bytes(self.to_bytes())
        return r_path
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
class ScrapeDoFrame(BaseModel):
    """An isolated, cross-origin iframe discovered on the target webpage.

    Attributes:
        url (HttpUrl): Absolute source URL of the iframe.
        content (Optional[str]): Rendered HTML inside the iframe, when
            available.
    """
    model_config = ConfigDict(populate_by_name=True)
    url: HttpUrl
    content: Optional[str] = None
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
# --------------------
|
|
228
|
+
# Main Response Model
|
|
229
|
+
# --------------------
|
|
230
|
+
|
|
231
|
+
class ScrapeDoResponse:
|
|
232
|
+
"""A unified data model for all HTTP responses returned by the Scrape.do
|
|
233
|
+
API.
|
|
234
|
+
|
|
235
|
+
This model encapsulates the underlying HTTPX network response to provide
|
|
236
|
+
a flexible, strongly-typed interface.
|
|
237
|
+
|
|
238
|
+
abstract: Different Response Types
|
|
239
|
+
Because Scrape.do alters its response format based on the request
|
|
240
|
+
parameters, this model attempts to route property access to the
|
|
241
|
+
correct underlying data source.
|
|
242
|
+
|
|
243
|
+
info: Additional Information
|
|
244
|
+
The following are some of the parameters that change the format of the
|
|
245
|
+
HTTP response returned by Scrape.do.
|
|
246
|
+
|
|
247
|
+
- `return_json=True` : Returns a JSON string containing information
|
|
248
|
+
about the request instead of the target website's raw HTML
|
|
249
|
+
|
|
250
|
+
- `transparent_response=True` : Causes the HTTP response returned by
|
|
251
|
+
Scrape.do to mirror the exact status code of the HTTP response
|
|
252
|
+
it got from the target website
|
|
253
|
+
|
|
254
|
+
- `pure_cookies=True` : Tells Scrape.do to return the original
|
|
255
|
+
`Set-Cookie` headers it got from the target website instead of
|
|
256
|
+
bundling them into its `scrape.do-cookies` response header
|
|
257
|
+
|
|
258
|
+
Attributes:
|
|
259
|
+
request (PreparedScrapeDoRequest): The original, validated request
|
|
260
|
+
configuration.
|
|
261
|
+
httpx_response (httpx.Response): The unmutated network response object.
|
|
262
|
+
target_status_code (Optional[int]): The status code returned by the
|
|
263
|
+
destination server.
|
|
264
|
+
text (str): The primary payload of the target website
|
|
265
|
+
(HTML or inner JSON string).
|
|
266
|
+
target_headers (httpx.Headers): The target's headers, without
|
|
267
|
+
proxy telemetry headers.
|
|
268
|
+
cookies (Optional[httpx.Cookies]): Extracted cookies returned by the
|
|
269
|
+
target.
|
|
270
|
+
resolved_url (Optional[str]): The final destination URL after all
|
|
271
|
+
redirects.
|
|
272
|
+
target_url (Optional[str]): The original destination URL requested.
|
|
273
|
+
scrape_do_status_code (Optional[int]): The status code of the
|
|
274
|
+
Scrape.do gateway.
|
|
275
|
+
request_cost (Optional[float]): API billing credits consumed by this
|
|
276
|
+
specific execution.
|
|
277
|
+
remaining_credits (Optional[float]): Total API billing credits
|
|
278
|
+
remaining on your account.
|
|
279
|
+
rid (Optional[str]): The specific proxy node Routing ID utilized
|
|
280
|
+
rate (Optional[str]): Current rate limit metrics for the provided API
|
|
281
|
+
token.
|
|
282
|
+
request_id (Optional[str]): Unique UUID assigned to this request by
|
|
283
|
+
the gateway.
|
|
284
|
+
auth (Optional[int]): Authentication status against the
|
|
285
|
+
Scrape.do gateway.
|
|
286
|
+
initial_status_code (Optional[int]): Target's status extracted
|
|
287
|
+
strictly from proxy headers.
|
|
288
|
+
scrape_do_headers (httpx.Headers): Filtered headers containing only
|
|
289
|
+
Scrape.do telemetry.
|
|
290
|
+
frames (Optional[List[ScrapeDoFrame]]): Isolated cross-origin iframes
|
|
291
|
+
discovered on the page.
|
|
292
|
+
network_requests (Optional[List[ScrapeDoNetworkRequest]]): Background
|
|
293
|
+
HTTP calls made by the browser.
|
|
294
|
+
websocket_requests (Optional[List[ScrapeDoWebsocketRequest]]):
|
|
295
|
+
Intercepted bidirectional WebSocket traffic.
|
|
296
|
+
action_results (Optional[List[ScrapeDoActionResult]]): Execution
|
|
297
|
+
outcomes of programmatic DOM actions.
|
|
298
|
+
screenshots (Optional[List[ScrapeDoScreenshot]]): Captured Base64
|
|
299
|
+
screenshots.
|
|
300
|
+
"""
|
|
301
|
+
def __init__(
|
|
302
|
+
self,
|
|
303
|
+
request: PreparedScrapeDoRequest,
|
|
304
|
+
response: httpx.Response
|
|
305
|
+
):
|
|
306
|
+
# Raw Request and Response
|
|
307
|
+
self._raw_request = request
|
|
308
|
+
self._raw_response = response
|
|
309
|
+
|
|
310
|
+
# Response Flags
|
|
311
|
+
self._is_json = request.api_params.return_json
|
|
312
|
+
self._is_transparent = request.api_params.transparent_response
|
|
313
|
+
self._is_pure_cookies = request.api_params.pure_cookies
|
|
314
|
+
|
|
315
|
+
# JSON Parsing
|
|
316
|
+
self._parsed_json: Optional[Dict[str, Any]] = None
|
|
317
|
+
if self._is_json:
|
|
318
|
+
parsed = response.json()
|
|
319
|
+
try:
|
|
320
|
+
parsed = response.json()
|
|
321
|
+
if isinstance(parsed, dict):
|
|
322
|
+
self._parsed_json = parsed
|
|
323
|
+
except ValueError:
|
|
324
|
+
# If Scrape.do crashed and returned HTML despite
|
|
325
|
+
# returnJSON=True, we swallow the error here so the
|
|
326
|
+
# `is_proxy_error` heuristic can properly route it as a
|
|
327
|
+
# ServerError later.
|
|
328
|
+
pass
|
|
329
|
+
|
|
330
|
+
@cached_property
|
|
331
|
+
def is_proxy_error(self) -> bool:
|
|
332
|
+
"""Heuristic to determine whether a non-2xx status code error
|
|
333
|
+
is coming directly from the target website, or whether it's coming
|
|
334
|
+
from the Scrape.do gateway
|
|
335
|
+
|
|
336
|
+
info: Additional Information
|
|
337
|
+
Scrape.do usually sends JSON error messages when there's an
|
|
338
|
+
infrastructure error, so we try to parse the response's payload
|
|
339
|
+
as JSON regardless of whether or not `return_json=True`.
|
|
340
|
+
|
|
341
|
+
- IF `Payload Is Parsable JSON` :
|
|
342
|
+
- Check if the returned JSON contatins one of the standard
|
|
343
|
+
error keys (`message`, `Error`, `detail`, `Message`,
|
|
344
|
+
or `errorMessage`). If it does, then the error is coming
|
|
345
|
+
from Scrape.do, so return `True`
|
|
346
|
+
|
|
347
|
+
- Otherwise, check if the returned JSON contains the
|
|
348
|
+
`statusCode` key. If it does, and its value matches the
|
|
349
|
+
status code returned by the original httpx response, then
|
|
350
|
+
the error is probably coming from the `target website`, so
|
|
351
|
+
return `False`.
|
|
352
|
+
|
|
353
|
+
- If the value doesn't match or the `statusCode` key is
|
|
354
|
+
missing, fallback to `Payload Is Not Parsable JSON` logic.
|
|
355
|
+
|
|
356
|
+
- IF `Payload Is Not Parsable JSON` :
|
|
357
|
+
- Scrape.do sends telemetry headers when a request is
|
|
358
|
+
successfuly completed, so if the response has the
|
|
359
|
+
`scrape.do-intial-status-code` header and its value is not
|
|
360
|
+
empty, the error is probably coming from the
|
|
361
|
+
`target website`, so return `False`. Otherwise, it's
|
|
362
|
+
probably a Scrape.do error, so return `True`
|
|
363
|
+
|
|
364
|
+
info: `transparent_response=True`
|
|
365
|
+
When `trasparent_response=True`, Scrape.do can still send its
|
|
366
|
+
own error status codes when there's an infrastructure failure, so
|
|
367
|
+
we can't rely on the `scrape_do_status_code` to determine where
|
|
368
|
+
the error is coming from. With this in mind, this method aims
|
|
369
|
+
to provide a solution by analysing the response's structure as a
|
|
370
|
+
whole.
|
|
371
|
+
|
|
372
|
+
Returns:
|
|
373
|
+
`True` if it's a Scrape.do error, or `False` if it's a target
|
|
374
|
+
website error
|
|
375
|
+
"""
|
|
376
|
+
raw_status = self._raw_response.status_code
|
|
377
|
+
has_intial_status_code = self.initial_status_code is not None
|
|
378
|
+
parsed_json = None
|
|
379
|
+
|
|
380
|
+
try:
|
|
381
|
+
parsed_json = self._raw_response.json()
|
|
382
|
+
except ValueError:
|
|
383
|
+
pass
|
|
384
|
+
|
|
385
|
+
if isinstance(parsed_json, dict):
|
|
386
|
+
error_keys = [
|
|
387
|
+
"message",
|
|
388
|
+
"Error",
|
|
389
|
+
"detail",
|
|
390
|
+
"Message",
|
|
391
|
+
"errorMessage"
|
|
392
|
+
]
|
|
393
|
+
|
|
394
|
+
if any(k in parsed_json for k in error_keys):
|
|
395
|
+
return True
|
|
396
|
+
|
|
397
|
+
status_code_match = (
|
|
398
|
+
"statusCode" in parsed_json
|
|
399
|
+
and int(parsed_json["statusCode"]) == raw_status
|
|
400
|
+
)
|
|
401
|
+
|
|
402
|
+
if status_code_match:
|
|
403
|
+
return False
|
|
404
|
+
|
|
405
|
+
return not has_intial_status_code
|
|
406
|
+
|
|
407
|
+
@property
|
|
408
|
+
def httpx_response(self) -> httpx.Response:
|
|
409
|
+
"""Exposes the raw, underlying HTTPX response.
|
|
410
|
+
|
|
411
|
+
info: Intended Usage
|
|
412
|
+
Accessing this bypasses all SDK normalization. It's provided as an
|
|
413
|
+
escape hatch for specific use cases where the original response
|
|
414
|
+
object is needed.
|
|
415
|
+
|
|
416
|
+
Returns:
|
|
417
|
+
The raw httpx response object.
|
|
418
|
+
"""
|
|
419
|
+
return self._raw_response
|
|
420
|
+
|
|
421
|
+
@property
|
|
422
|
+
def status_code(self) -> int:
|
|
423
|
+
"""Convenience accessor for the underlying HTTPX response status code.
|
|
424
|
+
|
|
425
|
+
Equivalent to `response.httpx_response.status_code`. Distinct from
|
|
426
|
+
`target_status_code` and `scrape_do_status_code`, which interpret the
|
|
427
|
+
Scrape.do response envelope.
|
|
428
|
+
|
|
429
|
+
Returns:
|
|
430
|
+
The HTTP status code of the response received from `api.scrape.do`.
|
|
431
|
+
"""
|
|
432
|
+
return self.httpx_response.status_code
|
|
433
|
+
|
|
434
|
+
@property
|
|
435
|
+
def request(self) -> PreparedScrapeDoRequest:
|
|
436
|
+
"""Exposes the original, validated request configuration.
|
|
437
|
+
|
|
438
|
+
Returns:
|
|
439
|
+
The `PreparedScrapeDoRequest` configuration that generated this
|
|
440
|
+
response.
|
|
441
|
+
"""
|
|
442
|
+
return self._raw_request
|
|
443
|
+
|
|
444
|
+
@property
|
|
445
|
+
def scrape_do_status_code(self) -> Optional[int]:
|
|
446
|
+
"""The HTTP status code returned by the Scrape.do gateway
|
|
447
|
+
infrastructure.
|
|
448
|
+
|
|
449
|
+
info: Transparent Response
|
|
450
|
+
If `transparent_response=True` was used, the gateway hides its own
|
|
451
|
+
status code, and this property will return `None`.
|
|
452
|
+
|
|
453
|
+
Returns:
|
|
454
|
+
The proxy gateway status code (e.g., 200, 429, 502).
|
|
455
|
+
"""
|
|
456
|
+
if self._is_transparent:
|
|
457
|
+
return None
|
|
458
|
+
|
|
459
|
+
return self._raw_response.status_code
|
|
460
|
+
|
|
461
|
+
@property
|
|
462
|
+
def target_status_code(self) -> Optional[int]:
|
|
463
|
+
"""The HTTP status code returned by the destination website.
|
|
464
|
+
|
|
465
|
+
info: Additional Information
|
|
466
|
+
- If `self.is_proxy_error=True`, the target website was never
|
|
467
|
+
reached, so return `None`
|
|
468
|
+
|
|
469
|
+
- If `transparent_response=True`, the original status code from
|
|
470
|
+
the httpx response is returned
|
|
471
|
+
|
|
472
|
+
- If `return_json=True`, the `statusCode` field from the response's
|
|
473
|
+
JSON is returned
|
|
474
|
+
|
|
475
|
+
- If it's not a proxy error, and both parameters are set to false,
|
|
476
|
+
the `ScrapeDoResponse.initial_status_code` property value is
|
|
477
|
+
returned
|
|
478
|
+
|
|
479
|
+
Returns:
|
|
480
|
+
The target website's status code (e.g., 200, 403, 404).
|
|
481
|
+
"""
|
|
482
|
+
if self.is_proxy_error:
|
|
483
|
+
return None
|
|
484
|
+
|
|
485
|
+
if self._is_transparent:
|
|
486
|
+
return self._raw_response.status_code
|
|
487
|
+
|
|
488
|
+
if self._parsed_json:
|
|
489
|
+
return self._parsed_json.get("statusCode")
|
|
490
|
+
|
|
491
|
+
return self.initial_status_code
|
|
492
|
+
|
|
493
|
+
@property
|
|
494
|
+
def text(self) -> str:
|
|
495
|
+
"""The primary textual payload of the target website.
|
|
496
|
+
|
|
497
|
+
info: Additional Information
|
|
498
|
+
Depending on the request parameters, this will return
|
|
499
|
+
either the raw HTML byte stream or the extracted `content` string
|
|
500
|
+
from within Scrape.do's JSON wrapper.
|
|
501
|
+
|
|
502
|
+
Returns:
|
|
503
|
+
The HTML or JSON string payload from the target.
|
|
504
|
+
"""
|
|
505
|
+
if self._parsed_json:
|
|
506
|
+
return self._parsed_json.get(
|
|
507
|
+
"content",
|
|
508
|
+
self._raw_response.text
|
|
509
|
+
)
|
|
510
|
+
|
|
511
|
+
return self._raw_response.text
|
|
512
|
+
|
|
513
|
+
@property
|
|
514
|
+
def target_headers(self) -> httpx.Headers:
|
|
515
|
+
"""The HTTP headers returned by the destination server.
|
|
516
|
+
|
|
517
|
+
info: Additional Information
|
|
518
|
+
This property automatically filters all internal `scrape.do-` proxy
|
|
519
|
+
telemetry headers, providing a clean representation of
|
|
520
|
+
the target's response.
|
|
521
|
+
|
|
522
|
+
Returns:
|
|
523
|
+
The filtered headers from the target website.
|
|
524
|
+
"""
|
|
525
|
+
clean_headers = {
|
|
526
|
+
k: v for k, v in self._raw_response.headers.items()
|
|
527
|
+
if not k.lower().startswith("scrape.do-")
|
|
528
|
+
}
|
|
529
|
+
return httpx.Headers(clean_headers)
|
|
530
|
+
|
|
531
|
+
# --- Scrape.do Headers ---
|
|
532
|
+
|
|
533
|
+
@property
|
|
534
|
+
def scrape_do_headers(self) -> Optional[httpx.Headers]:
|
|
535
|
+
"""Filters the response headers to isolate Scrape.do's specific
|
|
536
|
+
infrastructure telemetry.
|
|
537
|
+
|
|
538
|
+
Returns:
|
|
539
|
+
Only headers prefixed with `scrape.do-`, or None if no
|
|
540
|
+
`scrape.do-` headers are found
|
|
541
|
+
"""
|
|
542
|
+
headers = {
|
|
543
|
+
k: v for k, v in self._raw_response.headers.items()
|
|
544
|
+
if k.lower().startswith("scrape.do-")
|
|
545
|
+
}
|
|
546
|
+
if not headers:
|
|
547
|
+
return None
|
|
548
|
+
return httpx.Headers(headers)
|
|
549
|
+
|
|
550
|
+
@property
|
|
551
|
+
def request_cost(self) -> Optional[float]:
|
|
552
|
+
"""The amount of API billing credits consumed by this specific
|
|
553
|
+
execution.
|
|
554
|
+
|
|
555
|
+
Returns:
|
|
556
|
+
The value returned in the scapre_do_headers casted to a
|
|
557
|
+
float, or `None` if the `scrape.do-request-cost`
|
|
558
|
+
header is missing
|
|
559
|
+
"""
|
|
560
|
+
cost = self._raw_response.headers.get("scrape.do-request-cost")
|
|
561
|
+
return float(cost) if cost else None
|
|
562
|
+
|
|
563
|
+
@property
|
|
564
|
+
def initial_status_code(self) -> Optional[int]:
|
|
565
|
+
"""The target website's HTTP status code, extracted directly from the
|
|
566
|
+
proxy headers.
|
|
567
|
+
|
|
568
|
+
Returns:
|
|
569
|
+
The status code casted to an int, or None if the
|
|
570
|
+
`scrape.do-intial-status-code` header is missing.
|
|
571
|
+
"""
|
|
572
|
+
initial_status_code = self._raw_response.headers.get(
|
|
573
|
+
"scrape.do-initial-status-code"
|
|
574
|
+
)
|
|
575
|
+
|
|
576
|
+
return int(initial_status_code) if initial_status_code else None
|
|
577
|
+
|
|
578
|
+
@property
|
|
579
|
+
def request_id(self) -> Optional[str]:
|
|
580
|
+
"""The unique UUID assigned to this request by the Scrape.do gateway.
|
|
581
|
+
|
|
582
|
+
Returns:
|
|
583
|
+
The internal tracking ID, or None if the `scrape.do-request-id`
|
|
584
|
+
header is missing
|
|
585
|
+
"""
|
|
586
|
+
return self._raw_response.headers.get("scrape.do-request-id")
|
|
587
|
+
|
|
588
|
+
@property
|
|
589
|
+
def resolved_url(self) -> Optional[str]:
|
|
590
|
+
"""The final destination URL after all server-side and client-side
|
|
591
|
+
redirects.
|
|
592
|
+
|
|
593
|
+
Returns:
|
|
594
|
+
The absolute URL where the browser ultimately landed, or None if
|
|
595
|
+
the `scrape.do-resolved-url` header is missing
|
|
596
|
+
"""
|
|
597
|
+
return self._raw_response.headers.get("scrape.do-resolved-url")
|
|
598
|
+
|
|
599
|
+
@property
|
|
600
|
+
def target_url(self) -> Optional[str]:
|
|
601
|
+
"""The original destination URL requested by the SDK.
|
|
602
|
+
|
|
603
|
+
Returns:
|
|
604
|
+
The initial target URL, or None if the `scrape.do-target-url`
|
|
605
|
+
header is missing
|
|
606
|
+
"""
|
|
607
|
+
return self._raw_response.headers.get("scrape.do-target-url")
|
|
608
|
+
|
|
609
|
+
@property
|
|
610
|
+
def auth(self) -> Optional[int]:
|
|
611
|
+
"""Indicates the authentication status against the Scrape.do gateway.
|
|
612
|
+
|
|
613
|
+
Returns:
|
|
614
|
+
The authentication flag value casted to an int, or None if the
|
|
615
|
+
`scrape.do-auth` header is missing
|
|
616
|
+
"""
|
|
617
|
+
auth = self._raw_response.headers.get("scrape.do-auth")
|
|
618
|
+
return int(auth) if auth else None
|
|
619
|
+
|
|
620
|
+
@property
|
|
621
|
+
def rate(self) -> Optional[str]:
|
|
622
|
+
"""The current rate limit metrics for the provided API token.
|
|
623
|
+
|
|
624
|
+
Returns:
|
|
625
|
+
A string representing current concurrency thresholds, or None if
|
|
626
|
+
the `scrape.do-rate` header is missing
|
|
627
|
+
"""
|
|
628
|
+
return self._raw_response.headers.get("scrape.do-rate")
|
|
629
|
+
|
|
630
|
+
@property
|
|
631
|
+
def remaining_credits(self) -> Optional[float]:
|
|
632
|
+
"""The total number of API billing credits remaining on your account.
|
|
633
|
+
|
|
634
|
+
Returns:
|
|
635
|
+
The remaining account balance casted to a float, or None if the
|
|
636
|
+
`scrape.do-remaining-credits` header is missing
|
|
637
|
+
"""
|
|
638
|
+
remaining_credits = self._raw_response.headers.get(
|
|
639
|
+
"scrape.do-remaining-credits"
|
|
640
|
+
)
|
|
641
|
+
|
|
642
|
+
return float(remaining_credits) if remaining_credits else None
|
|
643
|
+
|
|
644
|
+
@property
|
|
645
|
+
def rid(self) -> Optional[str]:
|
|
646
|
+
"""The specific proxy node Routing ID utilized for this connection.
|
|
647
|
+
|
|
648
|
+
info: Session ID
|
|
649
|
+
If `session_id` was provided in the parameters,
|
|
650
|
+
this Routing ID is used by the `ScrapeDoClient` to verify that
|
|
651
|
+
sticky sessions are maintaining the same node.
|
|
652
|
+
|
|
653
|
+
Returns:
|
|
654
|
+
The internal routing identifier, or None if the `scrape.do-rid`
|
|
655
|
+
header is missing
|
|
656
|
+
"""
|
|
657
|
+
return self._raw_response.headers.get("scrape.do-rid")
|
|
658
|
+
|
|
659
|
+
@property
|
|
660
|
+
def cookies(self) -> Optional[httpx.Cookies]:
|
|
661
|
+
"""Extracts and parses cookies returned by the target server.
|
|
662
|
+
|
|
663
|
+
info: Additional Information
|
|
664
|
+
If `pure_cookies=True` is active, it returns the httpx response's
|
|
665
|
+
`cookies` attribute. Otherwise, it decodes the custom
|
|
666
|
+
`scrape.do-cookies` string into a `httpx.Cookies` object
|
|
667
|
+
|
|
668
|
+
Returns:
|
|
669
|
+
A `httpx.Cookies` object containing all cookies.
|
|
670
|
+
"""
|
|
671
|
+
if self._is_pure_cookies:
|
|
672
|
+
return self._raw_response.cookies
|
|
673
|
+
|
|
674
|
+
cookies = self._raw_response.headers.get("scrape.do-cookies")
|
|
675
|
+
if cookies:
|
|
676
|
+
# Parse Cookies (c1=v1;c2=v2;...)
|
|
677
|
+
pattern = re.compile(r"([^=;]+)=([^;]*)")
|
|
678
|
+
matches = re.findall(pattern, cookies)
|
|
679
|
+
if not matches:
|
|
680
|
+
return None
|
|
681
|
+
cookie_dict = {n: v for n, v in matches}
|
|
682
|
+
return httpx.Cookies(cookie_dict)
|
|
683
|
+
|
|
684
|
+
return None
|
|
685
|
+
|
|
686
|
+
# --- Scrape.do JSON ---
|
|
687
|
+
|
|
688
|
+
@property
|
|
689
|
+
def frames(self) -> Optional[List[ScrapeDoFrame]]:
|
|
690
|
+
"""Extracts isolated cross-origin iframes discovered during page
|
|
691
|
+
rendering.
|
|
692
|
+
|
|
693
|
+
info: Prerequisites
|
|
694
|
+
Requires `render=True`, `return_json=True`, and `show_frames=True`
|
|
695
|
+
|
|
696
|
+
Returns:
|
|
697
|
+
A list of typed Pydantic models representing frames.
|
|
698
|
+
"""
|
|
699
|
+
if self._parsed_json and "frames" in self._parsed_json:
|
|
700
|
+
return [
|
|
701
|
+
ScrapeDoFrame(**f) for f in self._parsed_json["frames"]
|
|
702
|
+
]
|
|
703
|
+
return None
|
|
704
|
+
|
|
705
|
+
@property
|
|
706
|
+
def network_requests(self) -> Optional[List[ScrapeDoNetworkRequest]]:
|
|
707
|
+
"""Intercepts background network traffic triggered by the headless
|
|
708
|
+
browser.
|
|
709
|
+
|
|
710
|
+
info: Prerequisites
|
|
711
|
+
Requires `render=True` and `return_json=True`.
|
|
712
|
+
|
|
713
|
+
Returns:
|
|
714
|
+
A list of typed models detailing HTTP calls.
|
|
715
|
+
"""
|
|
716
|
+
if self._parsed_json and "networkRequests" in self._parsed_json:
|
|
717
|
+
return [
|
|
718
|
+
ScrapeDoNetworkRequest(**nr) for nr
|
|
719
|
+
in self._parsed_json["networkRequests"]
|
|
720
|
+
]
|
|
721
|
+
return None
|
|
722
|
+
|
|
723
|
+
@property
|
|
724
|
+
def websocket_requests(self) -> Optional[List[ScrapeDoWebsocketRequest]]:
|
|
725
|
+
"""Intercepts bidirectional WebSocket traffic initiated by the target
|
|
726
|
+
website.
|
|
727
|
+
|
|
728
|
+
info: Prerequisites
|
|
729
|
+
Requires `render=True`, `return_json=True`, and
|
|
730
|
+
`show_websocket_requests=True`
|
|
731
|
+
|
|
732
|
+
Returns:
|
|
733
|
+
A list of typed models detailing socket events.
|
|
734
|
+
"""
|
|
735
|
+
if self._parsed_json and "websocketRequests" in self._parsed_json:
|
|
736
|
+
return [
|
|
737
|
+
ScrapeDoWebsocketRequest(**ws) for ws
|
|
738
|
+
in self._parsed_json["websocketRequests"]
|
|
739
|
+
]
|
|
740
|
+
return None
|
|
741
|
+
|
|
742
|
+
@property
|
|
743
|
+
def action_results(self) -> Optional[List[ScrapeDoActionResult]]:
|
|
744
|
+
"""Details the success or failure of programmatic DOM interactions.
|
|
745
|
+
|
|
746
|
+
Returns:
|
|
747
|
+
A list of typed models mapping sequentially to the actions defined
|
|
748
|
+
in the `play_with_browser` array.
|
|
749
|
+
"""
|
|
750
|
+
if self._parsed_json and "actionResults" in self._parsed_json:
|
|
751
|
+
return [
|
|
752
|
+
ScrapeDoActionResult(**ar) for ar
|
|
753
|
+
in self._parsed_json["actionResults"]
|
|
754
|
+
]
|
|
755
|
+
return None
|
|
756
|
+
|
|
757
|
+
@property
def screenshots(self) -> Optional[List[ScrapeDoScreenshot]]:
    """Base64 screenshots generated during rendering.

    info: Prerequisites
        Requires `render=True`, `return_json=True`, and a valid screenshot
        parameter (e.g., `full_screenshot=True`).

    Returns:
        A list of typed models containing the image data, or ``None`` when
        the payload is absent or does not include screenshots.
    """
    payload = self._parsed_json
    # NOTE: the API spells this key with a capital "S" ("screenShots").
    if not payload or "screenShots" not in payload:
        return None
    return [
        ScrapeDoScreenshot(**entry)
        for entry in payload["screenShots"]
    ]
def raise_for_status(self) -> Self:
    """Evaluates the response and raises a mapped exception if the request
    failed.

    info: Additional Information
        Utilizes the `is_proxy_error` heuristic to determine if
        the failure originated from the Scrape.do proxy infrastructure or
        from the target website.

    Returns:
        The current `ScrapeDoResponse` instance,
        allowing for method chaining.

    Raises:
        TargetError: If the proxy succeeded, but the target website
            returned an error code (e.g., a 403 Cloudflare block or a 404
            Not Found).
        BadRequestError: If the request was malformed
            (HTTP 400 from Scrape.do).
        AuthenticationError: If your Scrape.do API token is invalid
            (HTTP 401).
        AuthenticationThrottleError: If your specific token has been
            temporarily locked by the Scrape.do authentication server to
            prevent abuse. (HTTP 401)
        RateLimitError: If you exceed your account's concurrent request
            limit (HTTP 429).
        ServerError: If the Scrape.do gateway experiences an issue
            (HTTP 502/510).
        APIResponseError: A generic fallback for unmapped Scrape.do proxy
            errors.
    """

    # Fast path: the target reported a non-error status (1xx-3xx).
    if self.target_status_code and self.target_status_code < 400:
        return self

    # Extract a human-readable error message so we can tell an auth
    # throttle apart from a plain authentication failure.
    error_msg = None
    if self._parsed_json:
        # The API is inconsistent about which key carries the message.
        error_keys = [
            "message",
            "Error",
            "detail",
            "Message",
            "errorMessage"
        ]

        for k in error_keys:
            if k in self._parsed_json:
                error_msg = self._parsed_json[k]
                break

    elif self.text:
        error_msg = self.text

    # FIX: previously `is_throttled = None` plus an unguarded `in` test;
    # a non-string JSON error value (dict/list/int) raised TypeError
    # here instead of the mapped API exception below.
    throttled_msg = "temporarily throttled by the authentication server"
    is_throttled = isinstance(error_msg, str) and throttled_msg in error_msg

    raw_status = self._raw_response.status_code

    # Route to Proxy Infrastructure Errors
    if self.is_proxy_error:

        if raw_status == 400:
            raise BadRequestError(
                self._raw_response,
                self._raw_request,
                self
            )

        elif raw_status == 401:
            if is_throttled:
                raise AuthenticationThrottleError(
                    self._raw_response,
                    self._raw_request,
                    self
                )

            raise AuthenticationError(
                self._raw_response,
                self._raw_request,
                self
            )

        elif raw_status == 429:
            raise RateLimitError(
                self._raw_response,
                self._raw_request,
                self
            )
        elif raw_status in (502, 510):
            raise ServerError(
                self._raw_response,
                self._raw_request,
                self
            )

        # Generic fallback for unmapped proxy-side status codes.
        raise APIResponseError(
            self._raw_response,
            self._raw_request,
            self
        )

    # If is_proxy_error is False, then it's a TargetError

    status_code = self.target_status_code or self._raw_response.status_code
    raise TargetError(
        f"Target rejected request with status: {status_code}",
        status_code,
        self._raw_response,
        self._raw_request,
        self
    )