scrape-do-python 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,890 @@
1
+ """Custom data models for the Scrape.do's API HTTP response
2
+
3
+ Encapsulates the httpx.Response object to provide a strongly-typed interface
4
+ for the response data sent back by the Scrape.do API. It parses nested JSON
5
+ payloads, extracts proxy telemetry, and attempts to determine whether non-2xx
6
+ responses are coming from the target website, or from Scrape.do's gateway
7
+ failures.
8
+ """
9
+
10
+ from __future__ import annotations
11
+ from pathlib import Path
12
+ import os
13
+ import base64
14
+ import re
15
+ import httpx
16
+ from functools import cached_property
17
+ from typing import (
18
+ Optional,
19
+ Union,
20
+ List,
21
+ Self,
22
+ Any,
23
+ Dict
24
+ )
25
+ from pydantic import (
26
+ BaseModel,
27
+ Field,
28
+ HttpUrl,
29
+ ConfigDict
30
+ )
31
+ from .request import PreparedScrapeDoRequest
32
+ from ..exceptions import (
33
+ APIResponseError,
34
+ TargetError,
35
+ BadRequestError,
36
+ ServerError,
37
+ AuthenticationError,
38
+ AuthenticationThrottleError,
39
+ RateLimitError
40
+ )
41
+
42
+ # -------------------------
43
+ # JSON Response Info Models
44
+ # -------------------------
45
+
46
+
47
class ScrapeDoNetworkRequest(BaseModel):
    """Represents an intercepted HTTP network request made by the headless
    browser.

    When rendering JavaScript, the browser makes subsequent requests to fetch
    CSS, images, and background API data, which Scrape.do returns in the
    `networkRequests` field when `returnJSON=true`.

    Attributes:
        url (HttpUrl): The absolute URL of the requested resource.
        method (str): The HTTP method used (e.g., GET, POST).
        status (int): The HTTP status code returned by the resource server.
        request_headers (Dict[str, str]): The headers sent by the headless
            browser.
        request_body (Optional[str]): The payload sent with the request,
            if any.
        response_body (Optional[str]): The payload returned by the server,
            if captured.
        response_headers (Dict[str, str]): The headers returned by the
            resource server.
    """
    model_config = ConfigDict(populate_by_name=True)
    url: HttpUrl
    method: str
    status: int
    # NOTE(review): sibling models alias camelCase API keys (e.g.
    # "payloadData", "requestId"), but these snake_case fields declare no
    # alias — confirm the API really emits snake_case keys for them.
    request_headers: Dict[str, str] = Field(default_factory=dict)
    request_body: Optional[str] = None
    response_body: Optional[str] = None
    response_headers: Dict[str, str] = Field(default_factory=dict)
76
+
77
+
78
class ScrapeDoWebSocketFrame(BaseModel):
    """Represents the underlying payload of an intercepted WebSocket message.

    Attributes:
        opcode (int): The WebSocket frame operation code
            (1 for text, 2 for binary).
        mask (bool): Indicates if the payload data is masked.
        payload_data (str): The actual message content transferred over the
            socket. Populated from the API's camelCase `payloadData` key.
    """
    model_config = ConfigDict(populate_by_name=True)
    opcode: int
    mask: bool
    payload_data: str = Field(alias="payloadData")
92
+
93
+
94
class ScrapeDoWebSocketEvent(BaseModel):
    """Represents the Chrome DevTools Protocol (CDP) event metadata for a
    WebSocket.

    Attributes:
        request_id (str): The unique identifier for this specific
            WebSocket connection. Populated from the API's `requestId` key.
        timestamp (float): The exact epoch timestamp when the event occurred.
        response (ScrapeDoWebSocketFrame): The underlying frame containing the
            payload.
    """
    model_config = ConfigDict(populate_by_name=True)
    request_id: str = Field(alias="requestId")
    timestamp: float
    response: ScrapeDoWebSocketFrame
109
+
110
+
111
class ScrapeDoWebsocketRequest(BaseModel):
    """A single WebSocket message intercepted while the page was rendering.

    Attributes:
        type (str): The direction of the traffic (e.g., "sent" or "received").
        event (ScrapeDoWebSocketEvent): The raw DevTools Protocol event data.
    """
    model_config = ConfigDict(populate_by_name=True)
    type: str
    event: ScrapeDoWebSocketEvent

    @property
    def is_text(self) -> bool:
        """Whether this WebSocket payload is readable text.

        Returns:
            `True` if the underlying frame opcode is 1 (Text).
        """
        frame = self.event.response
        return frame.opcode == 1
130
+
131
+
132
class ScrapeDoActionResult(BaseModel):
    """Represents the execution outcome of a specific programmatic browser
    action.

    Attributes:
        action (str): The name of the action executed (e.g., "Click", "Wait").
        index (int): The sequence index of this action in the original request
            array.
        success (bool): Indicates whether the action completed without
            throwing an error.
        error (Optional[str]): The error message if the action failed.
        response (Optional[Union[Dict[str, Any], str]]): Data returned by the
            action, typically populated when using the `ExecuteAction` to run
            custom JavaScript.
    """
    model_config = ConfigDict(populate_by_name=True)
    action: str
    index: int
    success: bool
    error: Optional[str] = None
    response: Optional[Union[Dict[str, Any], str]] = None
153
+
154
+
155
class ScrapeDoScreenshot(BaseModel):
    """Represents a captured screenshot generated during the scraping process.

    Attributes:
        screenshot_type (str): The configuration used (e.g., "FullScreenShot").
            Populated from the API's `type` key.
        b64_image (Optional[str]): The Base64 encoded string of the PNG image
            data. Populated from the API's `image` key.
        error (Optional[str]): The failure reason if the screenshot could not
            be captured.
    """
    model_config = ConfigDict(populate_by_name=True)
    screenshot_type: str = Field(alias="type")
    b64_image: Optional[str] = Field(alias="image", default=None)
    error: Optional[str] = None

    def to_bytes(self) -> bytes:
        """Convert the `b64_image` string into a bytes object using the
        `base64` standard python library.

        Raises:
            ValueError: If the instance's `b64_image` attribute is empty.

        Returns:
            bytes object returned by `base64.b64decode(b64_image)`.
        """
        if not self.b64_image:
            # Surface the API-provided failure reason (if any) instead of
            # letting base64 fail on an empty/None payload.
            raise ValueError(
                "No image data was found in the screenshot response | "
                f"Screenshot Type: {self.screenshot_type} | "
                f"Error String: {self.error}"
            )

        return base64.b64decode(self.b64_image)

    def to_file(self, path: Union[str, os.PathLike]) -> Path:
        """Save the base64-encoded screenshot to disk.

        warning: File Type
            Scrape.do returns base64-encoded `.png` image data, so `path`
            should end in `/file_name.png`.

        Args:
            path (Union[str, os.PathLike]): Image file will be saved to this
                path.

        Returns:
            Resolved `pathlib.Path` object of the `path` parameter.
        """
        r_path = Path(path).resolve()
        # pathlib opens, writes, and closes the file in one call.
        r_path.write_bytes(self.to_bytes())
        return r_path
212
+
213
+
214
class ScrapeDoFrame(BaseModel):
    """Represents an isolated, cross-origin iframe discovered on the target
    webpage.

    Attributes:
        url (HttpUrl): The absolute source URL of the iframe.
        content (Optional[str]): The rendered HTML content inside the iframe.
    """
    model_config = ConfigDict(populate_by_name=True)
    url: HttpUrl
    content: Optional[str] = None
225
+
226
+
227
+ # --------------------
228
+ # Main Response Model
229
+ # --------------------
230
+
231
class ScrapeDoResponse:
    """A unified data model for all HTTP responses returned by the Scrape.do
    API.

    This model encapsulates the underlying HTTPX network response to provide
    a flexible, strongly-typed interface.

    abstract: Different Response Types
        Because Scrape.do alters its response format based on the request
        parameters, this model attempts to route property access to the
        correct underlying data source.

    info: Additional Information
        The following are some of the parameters that change the format of the
        HTTP response returned by Scrape.do.

        - `return_json=True` : Returns a JSON string containing information
            about the request instead of the target website's raw HTML

        - `transparent_response=True` : Causes the HTTP response returned by
            Scrape.do to mirror the exact status code of the HTTP response
            it got from the target website

        - `pure_cookies=True` : Tells Scrape.do to return the original
            `Set-Cookie` headers it got from the target website instead of
            bundling them into its `scrape.do-cookies` response header

    Attributes:
        request (PreparedScrapeDoRequest): The original, validated request
            configuration.
        httpx_response (httpx.Response): The unmutated network response object.
        target_status_code (Optional[int]): The status code returned by the
            destination server.
        text (str): The primary payload of the target website
            (HTML or inner JSON string).
        target_headers (httpx.Headers): The target's headers, without
            proxy telemetry headers.
        cookies (Optional[httpx.Cookies]): Extracted cookies returned by the
            target.
        resolved_url (Optional[str]): The final destination URL after all
            redirects.
        target_url (Optional[str]): The original destination URL requested.
        scrape_do_status_code (Optional[int]): The status code of the
            Scrape.do gateway.
        request_cost (Optional[float]): API billing credits consumed by this
            specific execution.
        remaining_credits (Optional[float]): Total API billing credits
            remaining on your account.
        rid (Optional[str]): The specific proxy node Routing ID utilized.
        rate (Optional[str]): Current rate limit metrics for the provided API
            token.
        request_id (Optional[str]): Unique UUID assigned to this request by
            the gateway.
        auth (Optional[int]): Authentication status against the
            Scrape.do gateway.
        initial_status_code (Optional[int]): Target's status extracted
            strictly from proxy headers.
        scrape_do_headers (httpx.Headers): Filtered headers containing only
            Scrape.do telemetry.
        frames (Optional[List[ScrapeDoFrame]]): Isolated cross-origin iframes
            discovered on the page.
        network_requests (Optional[List[ScrapeDoNetworkRequest]]): Background
            HTTP calls made by the browser.
        websocket_requests (Optional[List[ScrapeDoWebsocketRequest]]):
            Intercepted bidirectional WebSocket traffic.
        action_results (Optional[List[ScrapeDoActionResult]]): Execution
            outcomes of programmatic DOM actions.
        screenshots (Optional[List[ScrapeDoScreenshot]]): Captured Base64
            screenshots.
    """

    # JSON keys Scrape.do uses to carry gateway error messages. Shared by
    # `is_proxy_error` and `raise_for_status` so the two heuristics cannot
    # drift apart.
    _ERROR_KEYS = ("message", "Error", "detail", "Message", "errorMessage")

    def __init__(
        self,
        request: PreparedScrapeDoRequest,
        response: httpx.Response
    ):
        # Raw Request and Response
        self._raw_request = request
        self._raw_response = response

        # Response Flags
        self._is_json = request.api_params.return_json
        self._is_transparent = request.api_params.transparent_response
        self._is_pure_cookies = request.api_params.pure_cookies

        # JSON Parsing.
        # Bug fix: the previous implementation called `response.json()`
        # once *outside* the try block, so a non-JSON payload raised
        # ValueError out of the constructor and the except branch was dead.
        self._parsed_json: Optional[Dict[str, Any]] = None
        if self._is_json:
            try:
                parsed = response.json()
            except ValueError:
                # If Scrape.do crashed and returned HTML despite
                # returnJSON=True, we swallow the error here so the
                # `is_proxy_error` heuristic can properly route it as a
                # ServerError later.
                pass
            else:
                if isinstance(parsed, dict):
                    self._parsed_json = parsed

    @cached_property
    def is_proxy_error(self) -> bool:
        """Heuristic to determine whether a non-2xx status code error
        is coming directly from the target website, or whether it's coming
        from the Scrape.do gateway.

        info: Additional Information
            Scrape.do usually sends JSON error messages when there's an
            infrastructure error, so we try to parse the response's payload
            as JSON regardless of whether or not `return_json=True`.

            - IF `Payload Is Parsable JSON` :
                - Check if the returned JSON contains one of the standard
                  error keys (`message`, `Error`, `detail`, `Message`,
                  or `errorMessage`). If it does, then the error is coming
                  from Scrape.do, so return `True`

                - Otherwise, check if the returned JSON contains the
                  `statusCode` key. If it does, and its value matches the
                  status code returned by the original httpx response, then
                  the error is probably coming from the `target website`, so
                  return `False`.

                - If the value doesn't match or the `statusCode` key is
                  missing, fallback to `Payload Is Not Parsable JSON` logic.

            - IF `Payload Is Not Parsable JSON` :
                - Scrape.do sends telemetry headers when a request is
                  successfully completed, so if the response has the
                  `scrape.do-initial-status-code` header and its value is not
                  empty, the error is probably coming from the
                  `target website`, so return `False`. Otherwise, it's
                  probably a Scrape.do error, so return `True`

        info: `transparent_response=True`
            When `transparent_response=True`, Scrape.do can still send its
            own error status codes when there's an infrastructure failure, so
            we can't rely on the `scrape_do_status_code` to determine where
            the error is coming from. With this in mind, this method aims
            to provide a solution by analysing the response's structure as a
            whole.

        Returns:
            `True` if it's a Scrape.do error, or `False` if it's a target
            website error
        """
        raw_status = self._raw_response.status_code
        has_initial_status_code = self.initial_status_code is not None

        try:
            parsed_json = self._raw_response.json()
        except ValueError:
            parsed_json = None

        if isinstance(parsed_json, dict):
            if any(k in parsed_json for k in self._ERROR_KEYS):
                return True

            # Robustness: a missing, None, or non-numeric "statusCode" must
            # fall through to the header heuristic instead of raising.
            try:
                status_code_match = (
                    int(parsed_json["statusCode"]) == raw_status
                )
            except (KeyError, TypeError, ValueError):
                status_code_match = False

            if status_code_match:
                return False

        return not has_initial_status_code

    @property
    def httpx_response(self) -> httpx.Response:
        """Exposes the raw, underlying HTTPX response.

        info: Intended Usage
            Accessing this bypasses all SDK normalization. It's provided as an
            escape hatch for specific use cases where the original response
            object is needed.

        Returns:
            The raw httpx response object.
        """
        return self._raw_response

    @property
    def status_code(self) -> int:
        """Convenience accessor for the underlying HTTPX response status code.

        Equivalent to `response.httpx_response.status_code`. Distinct from
        `target_status_code` and `scrape_do_status_code`, which interpret the
        Scrape.do response envelope.

        Returns:
            The HTTP status code of the response received from `api.scrape.do`.
        """
        return self.httpx_response.status_code

    @property
    def request(self) -> PreparedScrapeDoRequest:
        """Exposes the original, validated request configuration.

        Returns:
            The `PreparedScrapeDoRequest` configuration that generated this
            response.
        """
        return self._raw_request

    @property
    def scrape_do_status_code(self) -> Optional[int]:
        """The HTTP status code returned by the Scrape.do gateway
        infrastructure.

        info: Transparent Response
            If `transparent_response=True` was used, the gateway hides its own
            status code, and this property will return `None`.

        Returns:
            The proxy gateway status code (e.g., 200, 429, 502).
        """
        if self._is_transparent:
            return None

        return self._raw_response.status_code

    @property
    def target_status_code(self) -> Optional[int]:
        """The HTTP status code returned by the destination website.

        info: Additional Information
            - If `self.is_proxy_error=True`, the target website was never
              reached, so return `None`

            - If `transparent_response=True`, the original status code from
              the httpx response is returned

            - If `return_json=True`, the `statusCode` field from the response's
              JSON is returned

            - If it's not a proxy error, and both parameters are set to false,
              the `ScrapeDoResponse.initial_status_code` property value is
              returned

        Returns:
            The target website's status code (e.g., 200, 403, 404).
        """
        if self.is_proxy_error:
            return None

        if self._is_transparent:
            return self._raw_response.status_code

        if self._parsed_json:
            return self._parsed_json.get("statusCode")

        return self.initial_status_code

    @property
    def text(self) -> str:
        """The primary textual payload of the target website.

        info: Additional Information
            Depending on the request parameters, this will return
            either the raw HTML text or the extracted `content` string
            from within Scrape.do's JSON wrapper.

        Returns:
            The HTML or JSON string payload from the target.
        """
        if self._parsed_json:
            return self._parsed_json.get(
                "content",
                self._raw_response.text
            )

        return self._raw_response.text

    @property
    def target_headers(self) -> httpx.Headers:
        """The HTTP headers returned by the destination server.

        info: Additional Information
            This property automatically filters all internal `scrape.do-` proxy
            telemetry headers, providing a clean representation of
            the target's response.

        Returns:
            The filtered headers from the target website.
        """
        clean_headers = {
            k: v for k, v in self._raw_response.headers.items()
            if not k.lower().startswith("scrape.do-")
        }
        return httpx.Headers(clean_headers)

    # --- Scrape.do Headers ---

    @property
    def scrape_do_headers(self) -> Optional[httpx.Headers]:
        """Filters the response headers to isolate Scrape.do's specific
        infrastructure telemetry.

        Returns:
            Only headers prefixed with `scrape.do-`, or None if no
            `scrape.do-` headers are found
        """
        headers = {
            k: v for k, v in self._raw_response.headers.items()
            if k.lower().startswith("scrape.do-")
        }
        if not headers:
            return None
        return httpx.Headers(headers)

    @property
    def request_cost(self) -> Optional[float]:
        """The amount of API billing credits consumed by this specific
        execution.

        Returns:
            The value returned in the `scrape_do_headers` casted to a
            float, or `None` if the `scrape.do-request-cost`
            header is missing
        """
        cost = self._raw_response.headers.get("scrape.do-request-cost")
        return float(cost) if cost else None

    @property
    def initial_status_code(self) -> Optional[int]:
        """The target website's HTTP status code, extracted directly from the
        proxy headers.

        Returns:
            The status code casted to an int, or None if the
            `scrape.do-initial-status-code` header is missing.
        """
        initial_status_code = self._raw_response.headers.get(
            "scrape.do-initial-status-code"
        )

        return int(initial_status_code) if initial_status_code else None

    @property
    def request_id(self) -> Optional[str]:
        """The unique UUID assigned to this request by the Scrape.do gateway.

        Returns:
            The internal tracking ID, or None if the `scrape.do-request-id`
            header is missing
        """
        return self._raw_response.headers.get("scrape.do-request-id")

    @property
    def resolved_url(self) -> Optional[str]:
        """The final destination URL after all server-side and client-side
        redirects.

        Returns:
            The absolute URL where the browser ultimately landed, or None if
            the `scrape.do-resolved-url` header is missing
        """
        return self._raw_response.headers.get("scrape.do-resolved-url")

    @property
    def target_url(self) -> Optional[str]:
        """The original destination URL requested by the SDK.

        Returns:
            The initial target URL, or None if the `scrape.do-target-url`
            header is missing
        """
        return self._raw_response.headers.get("scrape.do-target-url")

    @property
    def auth(self) -> Optional[int]:
        """Indicates the authentication status against the Scrape.do gateway.

        Returns:
            The authentication flag value casted to an int, or None if the
            `scrape.do-auth` header is missing
        """
        auth = self._raw_response.headers.get("scrape.do-auth")
        return int(auth) if auth else None

    @property
    def rate(self) -> Optional[str]:
        """The current rate limit metrics for the provided API token.

        Returns:
            A string representing current concurrency thresholds, or None if
            the `scrape.do-rate` header is missing
        """
        return self._raw_response.headers.get("scrape.do-rate")

    @property
    def remaining_credits(self) -> Optional[float]:
        """The total number of API billing credits remaining on your account.

        Returns:
            The remaining account balance casted to a float, or None if the
            `scrape.do-remaining-credits` header is missing
        """
        remaining_credits = self._raw_response.headers.get(
            "scrape.do-remaining-credits"
        )

        return float(remaining_credits) if remaining_credits else None

    @property
    def rid(self) -> Optional[str]:
        """The specific proxy node Routing ID utilized for this connection.

        info: Session ID
            If `session_id` was provided in the parameters,
            this Routing ID is used by the `ScrapeDoClient` to verify that
            sticky sessions are maintaining the same node.

        Returns:
            The internal routing identifier, or None if the `scrape.do-rid`
            header is missing
        """
        return self._raw_response.headers.get("scrape.do-rid")

    @property
    def cookies(self) -> Optional[httpx.Cookies]:
        """Extracts and parses cookies returned by the target server.

        info: Additional Information
            If `pure_cookies=True` is active, it returns the httpx response's
            `cookies` attribute. Otherwise, it decodes the custom
            `scrape.do-cookies` string into a `httpx.Cookies` object

        Returns:
            A `httpx.Cookies` object containing all cookies.
        """
        if self._is_pure_cookies:
            return self._raw_response.cookies

        cookies = self._raw_response.headers.get("scrape.do-cookies")
        if cookies:
            # Parse Cookies (c1=v1;c2=v2;...)
            pattern = re.compile(r"([^=;]+)=([^;]*)")
            matches = pattern.findall(cookies)
            if not matches:
                return None
            return httpx.Cookies(dict(matches))

        return None

    # --- Scrape.do JSON ---

    @property
    def frames(self) -> Optional[List[ScrapeDoFrame]]:
        """Extracts isolated cross-origin iframes discovered during page
        rendering.

        info: Prerequisites
            Requires `render=True`, `return_json=True`, and `show_frames=True`

        Returns:
            A list of typed Pydantic models representing frames.
        """
        if self._parsed_json and "frames" in self._parsed_json:
            return [
                ScrapeDoFrame(**f) for f in self._parsed_json["frames"]
            ]
        return None

    @property
    def network_requests(self) -> Optional[List[ScrapeDoNetworkRequest]]:
        """Intercepts background network traffic triggered by the headless
        browser.

        info: Prerequisites
            Requires `render=True` and `return_json=True`.

        Returns:
            A list of typed models detailing HTTP calls.
        """
        if self._parsed_json and "networkRequests" in self._parsed_json:
            return [
                ScrapeDoNetworkRequest(**nr) for nr
                in self._parsed_json["networkRequests"]
            ]
        return None

    @property
    def websocket_requests(self) -> Optional[List[ScrapeDoWebsocketRequest]]:
        """Intercepts bidirectional WebSocket traffic initiated by the target
        website.

        info: Prerequisites
            Requires `render=True`, `return_json=True`, and
            `show_websocket_requests=True`

        Returns:
            A list of typed models detailing socket events.
        """
        if self._parsed_json and "websocketRequests" in self._parsed_json:
            return [
                ScrapeDoWebsocketRequest(**ws) for ws
                in self._parsed_json["websocketRequests"]
            ]
        return None

    @property
    def action_results(self) -> Optional[List[ScrapeDoActionResult]]:
        """Details the success or failure of programmatic DOM interactions.

        Returns:
            A list of typed models mapping sequentially to the actions defined
            in the `play_with_browser` array.
        """
        if self._parsed_json and "actionResults" in self._parsed_json:
            return [
                ScrapeDoActionResult(**ar) for ar
                in self._parsed_json["actionResults"]
            ]
        return None

    @property
    def screenshots(self) -> Optional[List[ScrapeDoScreenshot]]:
        """Extracts generated Base64 screenshots from the JSON payload.

        info: Prerequisites
            Requires `render=True`, `return_json=True`, and a valid screenshot
            parameter (e.g., `full_screenshot=True`).

        Returns:
            A list of typed models containing the image data.
        """
        if self._parsed_json and "screenShots" in self._parsed_json:
            return [
                ScrapeDoScreenshot(**s) for s in
                self._parsed_json["screenShots"]
            ]
        return None

    def raise_for_status(self) -> Self:
        """Evaluates the response and raises a mapped exception if the request
        failed.

        info: Additional Information
            Utilizes the `is_proxy_error` heuristic to determine if
            the failure originated from the Scrape.do proxy infrastructure or
            from the target website.

        Returns:
            The current `ScrapeDoResponse` instance,
            allowing for method chaining.

        Raises:
            TargetError: If the proxy succeeded, but the target website
                returned an error code (e.g., a 403 Cloudflare block or a 404
                Not Found).
            BadRequestError: If the request was malformed
                (HTTP 400 from Scrape.do).
            AuthenticationError: If your Scrape.do API token is invalid
                (HTTP 401).
            AuthenticationThrottleError: If your specific token has been
                temporarily locked by the Scrape.do authentication server to
                prevent abuse. (HTTP 401)
            RateLimitError: If you exceed your account's concurrent request
                limit (HTTP 429).
            ServerError: If the Scrape.do gateway experiences an issue
                (HTTP 502/510).
            APIResponseError: A generic fallback for unmapped Scrape.do proxy
                errors.
        """
        if self.target_status_code and self.target_status_code < 400:
            return self

        # Extract the gateway error message (if any) to detect an
        # authentication throttle.
        error_msg = None
        if self._parsed_json:
            for key in self._ERROR_KEYS:
                if key in self._parsed_json:
                    error_msg = self._parsed_json[key]
                    break
        elif self.text:
            error_msg = self.text

        # Robustness: error values may be non-string JSON (e.g. a dict), and
        # `in` on a dict would raise TypeError — only substring-match strings.
        throttled_msg = "temporarily throttled by the authentication server"
        is_throttled = (
            isinstance(error_msg, str) and throttled_msg in error_msg
        )

        raw_status = self._raw_response.status_code

        # Route to Proxy Infrastructure Errors
        if self.is_proxy_error:
            if raw_status == 400:
                raise BadRequestError(
                    self._raw_response,
                    self._raw_request,
                    self
                )

            if raw_status == 401:
                if is_throttled:
                    raise AuthenticationThrottleError(
                        self._raw_response,
                        self._raw_request,
                        self
                    )

                raise AuthenticationError(
                    self._raw_response,
                    self._raw_request,
                    self
                )

            if raw_status == 429:
                raise RateLimitError(
                    self._raw_response,
                    self._raw_request,
                    self
                )

            if raw_status in (502, 510):
                raise ServerError(
                    self._raw_response,
                    self._raw_request,
                    self
                )

            raise APIResponseError(
                self._raw_response,
                self._raw_request,
                self
            )

        # If is_proxy_error is False, then it's a TargetError
        status_code = self.target_status_code or self._raw_response.status_code
        raise TargetError(
            f"Target rejected request with status: {status_code}",
            status_code,
            self._raw_response,
            self._raw_request,
            self
        )