scrape_do_python-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
scrape_do/constants.py ADDED
@@ -0,0 +1,84 @@
1
+ """
2
+ Defines constants with valid parameter values expected by the Scrape.do API
3
+
4
+ Attributes:
5
+ _SUPER_SUPPORTED_COUNTRIES (set[str]): The complete list of ISO 3166-1
6
+ alpha-2 country codes supported when `super=True`.
7
+
8
+ _DATACENTER_SUPPORTED_COUNTRIES (set[str]): The restricted list of ISO
9
+ 3166-1 alpha-2 country codes supported when `super=False`.
10
+
11
+ _ZIPCODE_FORMATS (dict[str, re.Pattern]): Pre-compiled regular expressions
12
+ mapping lowercase country codes to their strict regional postal code
13
+ formats.
14
+
15
+ _SUPER_ONLY_COUNTRIES (set[str]): ISO 3166-1 alpha-2 country codes
16
+ supported only when `super=True`
17
+
18
+ _ZIPCODE_ALLOWED_COUNTRIES (set[str]): Set of country codes for which the
19
+ `postal_code` parameter is allowed
20
+
21
+ _ZIPCODE_NOT_ALLOWED_COUNTRIES (set[str]): Set of country codes for which
22
+ the `postal_code` parameter is not allowed
23
+ """
24
+
25
+ import re
26
+
27
+
28
+ _SUPER_SUPPORTED_COUNTRIES = {
29
+ "ad", "ae", "af", "ag", "al", "am", "ao", "ar", "as", "at",
30
+ "au", "aw", "az", "ba", "bb", "bd", "be", "bf", "bg", "bh",
31
+ "bi", "bj", "bm", "bn", "bo", "br", "bs", "bt", "bw", "by",
32
+ "bz", "ca", "cd", "cf", "cg", "ch", "ci", "cl", "cm", "cn",
33
+ "co", "cr", "cu", "cv", "cy", "cz", "de", "dj", "dk", "dm",
34
+ "do", "dz", "ec", "ee", "eg", "er", "es", "et", "fi", "fj",
35
+ "fm", "fr", "ga", "gb", "gd", "ge", "gh", "gi", "gm", "gn",
36
+ "gq", "gr", "gt", "gu", "gw", "gy", "hk", "hn", "hr", "ht",
37
+ "hu", "id", "ie", "il", "in", "iq", "ir", "is", "it", "jm",
38
+ "jo", "jp", "ke", "kg", "kh", "ki", "km", "kn", "kp", "kr",
39
+ "kw", "ky", "kz", "la", "lb", "lc", "li", "lk", "lr", "ls",
40
+ "lt", "lu", "lv", "ly", "ma", "mc", "md", "me", "mg", "mh",
41
+ "mk", "ml", "mm", "mn", "mo", "mq", "mr", "mt", "mu", "mv",
42
+ "mw", "mx", "my", "mz", "na", "ne", "ng", "ni", "nl", "no",
43
+ "np", "nr", "nz", "om", "pa", "pe", "pg", "ph", "pk", "pl",
44
+ "pr", "pt", "pw", "py", "qa", "ro", "rs", "ru", "rw", "sa",
45
+ "sb", "sc", "sd", "se", "sg", "si", "sk", "sl", "sn", "so",
46
+ "sr", "ss", "st", "sv", "sy", "sz", "td", "tg", "th", "tj",
47
+ "tl", "tm", "tn", "to", "tr", "tt", "tv", "tw", "tz", "ua",
48
+ "ug", "us", "uy", "uz", "vc", "ve", "vg", "vi", "vn", "vu",
49
+ "ws", "ye", "za", "zm", "zw"
50
+ }
51
+
52
+
53
+ _DATACENTER_SUPPORTED_COUNTRIES = {
54
+ "ae", "al", "ar", "at", "au", "br", "ca", "ch", "cl", "cn",
55
+ "cr", "cy", "cz", "de", "dk", "ee", "eg", "es", "fi", "fr",
56
+ "gb", "gr", "hr", "ie", "it", "jp", "lt", "lv", "mt", "nl",
57
+ "no", "pk", "pl", "pt", "ro", "rs", "ru", "se", "sg", "si",
58
+ "sk", "tr", "ua", "us", "za"
59
+ }
60
+
61
+ _ZIPCODE_FORMATS = {
62
+ "us": re.compile(r"^\d{5}$"),
63
+ "gb": re.compile(r"^[A-Z0-9\s]{2,8}$", re.IGNORECASE),
64
+ "de": re.compile(r"^\d{5}$"),
65
+ "fr": re.compile(r"^\d{5}$"),
66
+ "ca": re.compile(r"^[A-Z]\d[A-Z]\s?\d[A-Z]\d$", re.IGNORECASE),
67
+ "au": re.compile(r"^\d{4}$"),
68
+ "in": re.compile(r"^\d{6}$"),
69
+ "nl": re.compile(r"^\d{4}[A-Z]{2}$", re.IGNORECASE),
70
+ "it": re.compile(r"^\d{5}$"),
71
+ "es": re.compile(r"^\d{5}$"),
72
+ "br": re.compile(r"^(\d{5}|\d{8})$"),
73
+ "jp": re.compile(r"^\d{3}-?\d{4}$")
74
+ }
75
+
76
+ _SUPER_ONLY_COUNTRIES = {c for c in _SUPER_SUPPORTED_COUNTRIES
77
+ if c not in _DATACENTER_SUPPORTED_COUNTRIES
78
+ }
79
+
80
+ _ZIPCODE_ALLOWED_COUNTRIES = set(_ZIPCODE_FORMATS.keys())
81
+
82
+ _ZIPCODE_NOT_ALLOWED_COUNTRIES = {c for c in _SUPER_SUPPORTED_COUNTRIES
83
+ if c not in _ZIPCODE_ALLOWED_COUNTRIES
84
+ }
@@ -0,0 +1,238 @@
1
+ """Custom exception hierarchy and network error routing for the Scrape.do SDK.
2
+
3
+ Dynamically parses API failures, distinguishes proxy infrastructure errors
4
+ from target website blocks, and exposes programmatic flags for retry
5
+ strategies.
6
+ """
7
+
8
+ from __future__ import annotations
9
+ import httpx
10
+ from typing import Optional, TYPE_CHECKING
11
+
12
+ if TYPE_CHECKING:
13
+ from .models import PreparedScrapeDoRequest, ScrapeDoResponse
14
+
15
+
16
class ScrapeDoError(Exception):
    """The base exception for all errors raised by the SDK.

    Catching this exception guarantees that any error originating strictly
    from the SDK or the proxy network is handled.

    Args:
        message (str): Error message to be displayed
        request (Optional[PreparedScrapeDoRequest]): Object containing the
            request's information if it exists, otherwise `None`.
        response (Optional[ScrapeDoResponse]): Object containing the response's
            information if it exists, otherwise `None`

    Attributes:
        status_code (Optional[int]): The target status code taken from
            `response`, or `None` when no response is available.
    """
    def __init__(
        self,
        message: str,
        request: Optional[PreparedScrapeDoRequest] = None,
        response: Optional[ScrapeDoResponse] = None
    ):
        super().__init__(message)
        self.message = message
        self.request = request
        self.response = response
        # BUG FIX: status_code was only assigned when a response existed,
        # so reading it on a response-less error raised AttributeError.
        # Always define it, falling back to None.
        self.status_code = (
            response.target_status_code if response is not None else None
        )
41
+
42
+
43
class APIConnectionError(ScrapeDoError):
    """Raised when the SDK cannot reach the Scrape.do gateway at all.

    Signals a network-level failure — e.g. DNS resolution problems, a local
    internet outage, or a hard socket timeout — before any HTTP exchange with
    the gateway could take place.
    """
49
+
50
+
51
class TargetError(ScrapeDoError):
    """Raised when the Scrape.do proxy connects, but the target website fails.

    Triggered when `transparent_response=True` is in effect, explicitly
    flagging that the destination URL answered with a non-2xx status code.

    Args:
        message (str): The raw response body or error message from the target.
        target_status_code (int): The HTTP status code returned by the target
            website.
        raw_response (httpx.Response): The raw HTTP response object.
        request (Optional[PreparedScrapeDoRequest]): Object containing the
            request's information if it exists, otherwise `None`.
        response (Optional[ScrapeDoResponse]): Object containing the response's
            information if it exists, otherwise `None`
    """

    def __init__(
        self,
        message: str,
        target_status_code: int,
        raw_response: httpx.Response,
        request: Optional[PreparedScrapeDoRequest] = None,
        response: Optional[ScrapeDoResponse] = None
    ):
        self.target_status_code = target_status_code
        self.raw_response = raw_response
        full_message = (
            f"Target website returned status {target_status_code}: {message}"
        )
        super().__init__(full_message, request, response)

    @property
    def is_waf_block(self) -> bool:
        """Programmatic flag: did the target website block the proxy?

        Returns:
            `True` if status code is either `401` or `403`, `False` otherwise
        """
        return self.target_status_code in {401, 403}

    @property
    def is_throttled(self) -> bool:
        """Programmatic flag: did the target apply rate limiting?

        Returns:
            `True` if status code is `429`, `False` otherwise
        """
        return self.target_status_code == 429
104
+
105
+
106
class AuthenticationThrottleError(ScrapeDoError):
    """Raised when high-frequency invalid requests trigger an authentication
    ban.

    Args:
        raw_response (httpx.Response): The raw HTTP response object.
        request (Optional[PreparedScrapeDoRequest]): Object containing the
            request's information if it exists, otherwise `None`.
        response (Optional[ScrapeDoResponse]): Object containing the response's
            information if it exists, otherwise `None`
    """
    def __init__(
        self,
        raw_response: httpx.Response,
        request: Optional[PreparedScrapeDoRequest] = None,
        response: Optional[ScrapeDoResponse] = None
    ):
        # BUG FIX: the adjacent string literals previously concatenated to
        # "...throttled by theauthentication server." — a trailing space
        # was missing on the first fragment.
        msg = ("Your request has been temporarily throttled by the "
               "authentication server."
               )
        self.raw_response = raw_response

        super().__init__(msg, request, response)
129
+
130
+
131
class APIResponseError(ScrapeDoError):
    """Dynamically parses and represents a Scrape.do API infrastructure error.

    This acts as the base exception for all non-2xx HTTP responses returned
    by the Scrape.do gateway. It parses the JSON payloads to extract
    human-readable error messages.

    Args:
        raw_response (httpx.Response): The raw HTTP response object.
        request (Optional[PreparedScrapeDoRequest]): Object containing the
            request's information if it exists, otherwise `None`.
        response (Optional[ScrapeDoResponse]): Object containing the response's
            information if it exists, otherwise `None`
    """

    # Known JSON keys, in priority order, that may carry the API's
    # human-readable error message.
    _ERROR_KEYS = ("detail", "Error", "errorMessage", "message", "Message")

    def __init__(
        self,
        raw_response: httpx.Response,
        request: Optional[PreparedScrapeDoRequest] = None,
        response: Optional[ScrapeDoResponse] = None
    ):
        self.raw_response = raw_response
        self.raw_status_code = raw_response.status_code
        self.message = f"Unknown API Error. Body: {raw_response.text}"

        # Attempt to parse known JSON keys.
        try:
            data = raw_response.json()
        except ValueError:
            data = None

        # BUG FIX: the payload may legally be a non-object (list, string,
        # number); `key in data` / `data[key]` on those could raise
        # TypeError. Only inspect dict payloads; otherwise keep the
        # "Unknown API Error" fallback.
        if isinstance(data, dict):
            for key in self._ERROR_KEYS:
                value = data.get(key)
                if isinstance(value, str):
                    self.message = value
                    break

        super().__init__(
            # BUG FIX: a space was missing between the two f-string
            # fragments, producing "...an error.Status: ...".
            (
                f"API returned an error. "
                f"Status: {self.raw_status_code} | Message: {self.message}"
            ),
            request,
            response
        )
180
+
181
+
182
+ # --- Specific API Response Subclasses ---
183
+
184
class AuthenticationError(APIResponseError):
    """Raised when the API answers with HTTP 401 (Unauthorized).

    The supplied API token is missing or not valid.
    """
190
+
191
+
192
class BadRequestError(APIResponseError):
    """Raised when the API answers with HTTP 400 (Bad Request).

    The Scrape.do servers rejected the request configuration.
    """
198
+
199
+
200
class RateLimitError(APIResponseError):
    """Raised when the API answers with HTTP 429 (Too Many Requests).

    The account has exceeded its concurrent request limit.
    """
206
+
207
+
208
class ServerError(APIResponseError):
    """Raised when the API answers with an HTTP 500+ status code.

    Points to a gateway failure or a proxy pool outage.
    """
214
+
215
+
216
class RotatedSessionError(ScrapeDoError):
    """Raised when a user-defined `session_validator` detects lost state.

    The target website's session state has been lost (e.g., logged out,
    CAPTCHA triggered), indicating that Scrape.do silently rotated the
    proxy exit node.

    Args:
        message (str): Error message to be displayed
        raw_response (httpx.Response): The raw HTTP response object.
        request (PreparedScrapeDoRequest): Object containing the
            request's information
        response (Optional[ScrapeDoResponse]): Object containing the response's
            information
    """
    def __init__(
        self,
        message: str,
        raw_response: httpx.Response,
        request: PreparedScrapeDoRequest,
        response: ScrapeDoResponse
    ):
        super().__init__(message, request, response)
        self.raw_response = raw_response
@@ -0,0 +1,79 @@
1
+ """Public API for the data models and Scrape.do API contracts
2
+
3
+ Aggregates domain models into a unified namespace to expose all necessary type
4
+ hints, browser actions, and configuration contracts required to interact with
5
+ the API.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from .browser_actions import (
11
+ ClickAction,
12
+ WaitAction,
13
+ WaitSelectorAction,
14
+ ScrollXAction,
15
+ ScrollYAction,
16
+ ScrollToAction,
17
+ FillAction,
18
+ ExecuteAction,
19
+ ScreenShotAction,
20
+ WaitForRequestCompletionAction,
21
+ BrowserAction
22
+ )
23
+ from .enums import (
24
+ RegionCodeType,
25
+ WaitUntilType,
26
+ DeviceType,
27
+ OutputType,
28
+ HttpMethod,
29
+ PayloadType
30
+ )
31
+ from .request import (
32
+ PreparedScrapeDoRequest
33
+ )
34
+ from .parameters import (
35
+ RequestParameters,
36
+ RequestParametersDict
37
+ )
38
+ from .response import (
39
+ ScrapeDoNetworkRequest,
40
+ ScrapeDoWebSocketFrame,
41
+ ScrapeDoWebSocketEvent,
42
+ ScrapeDoWebsocketRequest,
43
+ ScrapeDoActionResult,
44
+ ScrapeDoScreenshot,
45
+ ScrapeDoFrame,
46
+ ScrapeDoResponse
47
+ )
48
+
49
+
50
# Public API surface, grouped to mirror the import sections above:
# browser actions, enums, request/parameter contracts, then response models.
_BROWSER_ACTION_EXPORTS = [
    "ClickAction",
    "WaitAction",
    "WaitSelectorAction",
    "ScrollXAction",
    "ScrollYAction",
    "ScrollToAction",
    "FillAction",
    "ExecuteAction",
    "ScreenShotAction",
    "WaitForRequestCompletionAction",
    "BrowserAction",
]
_ENUM_EXPORTS = [
    "RegionCodeType",
    "WaitUntilType",
    "DeviceType",
    "OutputType",
    "HttpMethod",
    "PayloadType",
]
_REQUEST_EXPORTS = [
    "RequestParametersDict",
    "RequestParameters",
    "PreparedScrapeDoRequest",
]
_RESPONSE_EXPORTS = [
    "ScrapeDoNetworkRequest",
    "ScrapeDoWebSocketFrame",
    "ScrapeDoWebSocketEvent",
    "ScrapeDoWebsocketRequest",
    "ScrapeDoActionResult",
    "ScrapeDoScreenshot",
    "ScrapeDoFrame",
    "ScrapeDoResponse",
]

__all__ = (
    _BROWSER_ACTION_EXPORTS
    + _ENUM_EXPORTS
    + _REQUEST_EXPORTS
    + _RESPONSE_EXPORTS
)