scrape-do-python 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrape_do/__init__.py +0 -0
- scrape_do/abc.py +0 -0
- scrape_do/async_client.py +0 -0
- scrape_do/client.py +804 -0
- scrape_do/constants.py +84 -0
- scrape_do/exceptions.py +238 -0
- scrape_do/models/__init__.py +79 -0
- scrape_do/models/browser_actions.py +332 -0
- scrape_do/models/enums.py +76 -0
- scrape_do/models/parameters.py +840 -0
- scrape_do/models/request.py +232 -0
- scrape_do/models/response.py +890 -0
- scrape_do/namespaces/__init__.py +0 -0
- scrape_do/namespaces/amazon.py +0 -0
- scrape_do/namespaces/google.py +0 -0
- scrape_do/namespaces/jobs.py +0 -0
- scrape_do_python-0.1.0.dist-info/METADATA +134 -0
- scrape_do_python-0.1.0.dist-info/RECORD +21 -0
- scrape_do_python-0.1.0.dist-info/WHEEL +5 -0
- scrape_do_python-0.1.0.dist-info/licenses/LICENSE +21 -0
- scrape_do_python-0.1.0.dist-info/top_level.txt +1 -0
scrape_do/constants.py
ADDED
@@ -0,0 +1,84 @@
"""
Defines constants with valid parameter values expected by the Scrape.do API

Attributes:
    _SUPER_SUPPORTED_COUNTRIES (set[str]): The complete list of ISO 3166-1
        alpha-2 country codes supported when `super=True`.

    _DATACENTER_SUPPORTED_COUNTRIES (set[str]): The restricted list of ISO
        3166-1 alpha-2 country codes supported when `super=False`.

    _ZIPCODE_FORMATS (dict[str, re.Pattern]): Pre-compiled regular expressions
        mapping lowercase country codes to their strict regional postal code
        formats.

    _SUPER_ONLY_COUNTRIES (set[str]): ISO 3166-1 alpha-2 country codes
        supported only when `super=True`

    _ZIPCODE_ALLOWED_COUNTRIES (set[str]): Set of country codes for which the
        `postal_code` parameter is allowed

    _ZIPCODE_NOT_ALLOWED_COUNTRIES (set[str]): Set of country codes for which
        the `postal_code` parameter is not allowed
"""

import re


_SUPER_SUPPORTED_COUNTRIES = {
    "ad", "ae", "af", "ag", "al", "am", "ao", "ar", "as", "at",
    "au", "aw", "az", "ba", "bb", "bd", "be", "bf", "bg", "bh",
    "bi", "bj", "bm", "bn", "bo", "br", "bs", "bt", "bw", "by",
    "bz", "ca", "cd", "cf", "cg", "ch", "ci", "cl", "cm", "cn",
    "co", "cr", "cu", "cv", "cy", "cz", "de", "dj", "dk", "dm",
    "do", "dz", "ec", "ee", "eg", "er", "es", "et", "fi", "fj",
    "fm", "fr", "ga", "gb", "gd", "ge", "gh", "gi", "gm", "gn",
    "gq", "gr", "gt", "gu", "gw", "gy", "hk", "hn", "hr", "ht",
    "hu", "id", "ie", "il", "in", "iq", "ir", "is", "it", "jm",
    "jo", "jp", "ke", "kg", "kh", "ki", "km", "kn", "kp", "kr",
    "kw", "ky", "kz", "la", "lb", "lc", "li", "lk", "lr", "ls",
    "lt", "lu", "lv", "ly", "ma", "mc", "md", "me", "mg", "mh",
    "mk", "ml", "mm", "mn", "mo", "mq", "mr", "mt", "mu", "mv",
    "mw", "mx", "my", "mz", "na", "ne", "ng", "ni", "nl", "no",
    "np", "nr", "nz", "om", "pa", "pe", "pg", "ph", "pk", "pl",
    "pr", "pt", "pw", "py", "qa", "ro", "rs", "ru", "rw", "sa",
    "sb", "sc", "sd", "se", "sg", "si", "sk", "sl", "sn", "so",
    "sr", "ss", "st", "sv", "sy", "sz", "td", "tg", "th", "tj",
    "tl", "tm", "tn", "to", "tr", "tt", "tv", "tw", "tz", "ua",
    "ug", "us", "uy", "uz", "vc", "ve", "vg", "vi", "vn", "vu",
    "ws", "ye", "za", "zm", "zw"
}


_DATACENTER_SUPPORTED_COUNTRIES = {
    "ae", "al", "ar", "at", "au", "br", "ca", "ch", "cl", "cn",
    "cr", "cy", "cz", "de", "dk", "ee", "eg", "es", "fi", "fr",
    "gb", "gr", "hr", "ie", "it", "jp", "lt", "lv", "mt", "nl",
    "no", "pk", "pl", "pt", "ro", "rs", "ru", "se", "sg", "si",
    "sk", "tr", "ua", "us", "za"
}

_ZIPCODE_FORMATS = {
    "us": re.compile(r"^\d{5}$"),
    "gb": re.compile(r"^[A-Z0-9\s]{2,8}$", re.IGNORECASE),
    "de": re.compile(r"^\d{5}$"),
    "fr": re.compile(r"^\d{5}$"),
    "ca": re.compile(r"^[A-Z]\d[A-Z]\s?\d[A-Z]\d$", re.IGNORECASE),
    "au": re.compile(r"^\d{4}$"),
    "in": re.compile(r"^\d{6}$"),
    "nl": re.compile(r"^\d{4}[A-Z]{2}$", re.IGNORECASE),
    "it": re.compile(r"^\d{5}$"),
    "es": re.compile(r"^\d{5}$"),
    "br": re.compile(r"^(\d{5}|\d{8})$"),
    "jp": re.compile(r"^\d{3}-?\d{4}$")
}

_SUPER_ONLY_COUNTRIES = {c for c in _SUPER_SUPPORTED_COUNTRIES
                         if c not in _DATACENTER_SUPPORTED_COUNTRIES
                         }

_ZIPCODE_ALLOWED_COUNTRIES = set(_ZIPCODE_FORMATS.keys())

_ZIPCODE_NOT_ALLOWED_COUNTRIES = {c for c in _SUPER_SUPPORTED_COUNTRIES
                                  if c not in _ZIPCODE_ALLOWED_COUNTRIES
                                  }
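The module ends by deriving three views from the two hand-maintained sets: countries that require `super=True`, countries whose postal codes can be validated, and countries where `postal_code` must be rejected. The sketch below shows one way a client-side validator could consume these constants; it is not part of the package, `validate_geo` is a hypothetical helper, and since the underscore-prefixed names are module-private, such a validator would realistically live inside the SDK itself.

# A minimal validation sketch, assuming the constants above; `validate_geo`
# is hypothetical and not part of the published package.
from typing import Optional

from scrape_do.constants import (
    _DATACENTER_SUPPORTED_COUNTRIES,
    _SUPER_SUPPORTED_COUNTRIES,
    _ZIPCODE_FORMATS,
)


def validate_geo(country: str, postal_code: Optional[str], super_proxy: bool) -> None:
    country = country.lower()
    # super=True unlocks the full country list; otherwise only datacenter geos
    pool = _SUPER_SUPPORTED_COUNTRIES if super_proxy else _DATACENTER_SUPPORTED_COUNTRIES
    if country not in pool:
        raise ValueError(f"country {country!r} is not supported with super={super_proxy}")
    if postal_code is not None:
        pattern = _ZIPCODE_FORMATS.get(country)
        if pattern is None:
            raise ValueError(f"postal_code is not allowed for country {country!r}")
        if not pattern.match(postal_code):
            raise ValueError(f"{postal_code!r} does not match the {country!r} format")


validate_geo("gb", "SW1A 1AA", super_proxy=True)   # ok: full pool, valid postcode
validate_geo("nl", "1012AB", super_proxy=False)    # ok: datacenter geo, valid code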
scrape_do/exceptions.py
ADDED
@@ -0,0 +1,238 @@
"""Custom exception hierarchy and network error routing for the Scrape.do SDK.

Dynamically parses API failures, distinguishes proxy infrastructure errors
from target website blocks, and exposes programmatic flags for retry
strategies.
"""

from __future__ import annotations
import httpx
from typing import Optional, TYPE_CHECKING

if TYPE_CHECKING:
    from .models import PreparedScrapeDoRequest, ScrapeDoResponse


class ScrapeDoError(Exception):
    """The base exception for all errors raised by the SDK.

    Catching this exception guarantees that any error originating strictly
    from the SDK or the proxy network is handled.

    Args:
        message (str): Error message to be displayed.
        request (Optional[PreparedScrapeDoRequest]): Object containing the
            request's information if it exists, otherwise `None`.
        response (Optional[ScrapeDoResponse]): Object containing the response's
            information if it exists, otherwise `None`.
    """
    def __init__(
        self,
        message: str,
        request: Optional[PreparedScrapeDoRequest] = None,
        response: Optional[ScrapeDoResponse] = None
    ):
        super().__init__(message)
        self.message = message
        self.request = request
        self.response = response
        if response is not None:
            self.status_code = response.target_status_code


class APIConnectionError(ScrapeDoError):
    """Raised when the SDK fails to connect to the Scrape.do gateway entirely.

    This indicates a network-level failure such as DNS resolution issues,
    local internet outages, or hard socket timeouts.
    """


class TargetError(ScrapeDoError):
    """Raised when the Scrape.do proxy connects, but the target website fails.

    This exception is triggered when `transparent_response=True` is used,
    explicitly flagging that the destination URL returned a non-2xx status
    code.

    Args:
        message (str): The raw response body or error message from the target.
        target_status_code (int): The HTTP status code returned by the target
            website.
        raw_response (httpx.Response): The raw HTTP response object.
        request (Optional[PreparedScrapeDoRequest]): Object containing the
            request's information if it exists, otherwise `None`.
        response (Optional[ScrapeDoResponse]): Object containing the response's
            information if it exists, otherwise `None`.
    """

    def __init__(
        self,
        message: str,
        target_status_code: int,
        raw_response: httpx.Response,
        request: Optional[PreparedScrapeDoRequest] = None,
        response: Optional[ScrapeDoResponse] = None
    ):
        self.target_status_code = target_status_code
        self.raw_response = raw_response
        super().__init__(
            f"Target website returned status {target_status_code}: {message}",
            request,
            response
        )

    @property
    def is_waf_block(self) -> bool:
        """
        Programmatic flag to identify if the target website blocked the proxy.

        Returns:
            `True` if status code is either `401` or `403`, `False` otherwise.
        """
        return self.target_status_code in (401, 403)

    @property
    def is_throttled(self) -> bool:
        """
        Programmatic flag to identify target-level rate limiting.

        Returns:
            `True` if status code is `429`, `False` otherwise.
        """
        return self.target_status_code == 429


class AuthenticationThrottleError(ScrapeDoError):
    """Raised when high-frequency invalid requests trigger an authentication
    ban.

    Args:
        raw_response (httpx.Response): The raw HTTP response object.
        request (Optional[PreparedScrapeDoRequest]): Object containing the
            request's information if it exists, otherwise `None`.
        response (Optional[ScrapeDoResponse]): Object containing the response's
            information if it exists, otherwise `None`.
    """
    def __init__(
        self,
        raw_response: httpx.Response,
        request: Optional[PreparedScrapeDoRequest] = None,
        response: Optional[ScrapeDoResponse] = None
    ):
        msg = ("Your request has been temporarily throttled by the "
               "authentication server."
               )
        self.raw_response = raw_response

        super().__init__(msg, request, response)


class APIResponseError(ScrapeDoError):
    """Dynamically parses and represents a Scrape.do API infrastructure error.

    This acts as the base exception for all non-2xx HTTP responses returned
    by the Scrape.do gateway. It parses the JSON payloads to extract
    human-readable error messages.

    Args:
        raw_response (httpx.Response): The raw HTTP response object.
        request (Optional[PreparedScrapeDoRequest]): Object containing the
            request's information if it exists, otherwise `None`.
        response (Optional[ScrapeDoResponse]): Object containing the response's
            information if it exists, otherwise `None`.
    """

    def __init__(
        self,
        raw_response: httpx.Response,
        request: Optional[PreparedScrapeDoRequest] = None,
        response: Optional[ScrapeDoResponse] = None
    ):
        self.raw_response = raw_response
        self.raw_status_code = raw_response.status_code
        self.message = f"Unknown API Error. Body: {raw_response.text}"

        # Attempt to parse known JSON keys
        try:
            data = raw_response.json()
            for key in ("detail",
                        "Error",
                        "errorMessage",
                        "message",
                        "Message"
                        ):
                if key in data and isinstance(data[key], str):
                    self.message = data[key]
                    break

        except ValueError:
            pass

        super().__init__(
            (
                "API returned an error. "
                f"Status: {self.raw_status_code} | Message: {self.message}"
            ),
            request,
            response
        )


# --- Specific API Response Subclasses ---

class AuthenticationError(APIResponseError):
    """Raised when the API returns an HTTP 401 (Unauthorized).

    Indicates that the provided API token is missing or invalid.
    """
    pass


class BadRequestError(APIResponseError):
    """Raised when the API returns an HTTP 400 (Bad Request).

    Indicates that the Scrape.do servers rejected the request configuration.
    """
    pass


class RateLimitError(APIResponseError):
    """Raised when the API returns an HTTP 429 (Too Many Requests).

    Indicates that the account has exceeded its concurrent request limit.
    """
    pass


class ServerError(APIResponseError):
    """Raised when the API returns an HTTP 500+ status code.

    Indicates a gateway failure or proxy pool outage.
    """
    pass


class RotatedSessionError(ScrapeDoError):
    """
    Raised when a user-defined `session_validator` determines that the target
    website's state has been lost (e.g., logged out, CAPTCHA triggered),
    indicating that Scrape.do silently rotated the proxy exit node.

    Args:
        message (str): Error message to be displayed.
        raw_response (httpx.Response): The raw HTTP response object.
        request (PreparedScrapeDoRequest): Object containing the
            request's information.
        response (ScrapeDoResponse): Object containing the response's
            information.
    """
    def __init__(
        self,
        message: str,
        raw_response: httpx.Response,
        request: PreparedScrapeDoRequest,
        response: ScrapeDoResponse
    ):
        self.raw_response = raw_response
        super().__init__(message, request, response)
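The `is_waf_block` and `is_throttled` flags exist so retry logic can branch on the failure class instead of string-matching error messages. A sketch of such a loop follows; it is not part of the package, and `client.scrape` stands in for whatever request method the SDK's client actually exposes.

# A hedged retry sketch built on the exception hierarchy above.
# `client.scrape` is an assumed method name, not a confirmed SDK API.
import time

from scrape_do.exceptions import RateLimitError, ScrapeDoError, TargetError


def fetch_with_retry(client, url: str, attempts: int = 3):
    for attempt in range(attempts):
        try:
            return client.scrape(url)
        except TargetError as exc:
            if exc.is_throttled:
                time.sleep(2 ** attempt)  # target-level 429: back off, retry
                continue
            raise  # is_waf_block (401/403) or other target failure: no retry
        except RateLimitError:
            time.sleep(2 ** attempt)      # account concurrency limit exceeded
    raise ScrapeDoError(f"giving up on {url} after {attempts} attempts")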
scrape_do/models/__init__.py
ADDED
@@ -0,0 +1,79 @@
"""Public API for the data models and Scrape.do API contracts.

Aggregates domain models into a unified namespace to expose all necessary type
hints, browser actions, and configuration contracts required to interact with
the API.
"""

from __future__ import annotations

from .browser_actions import (
    ClickAction,
    WaitAction,
    WaitSelectorAction,
    ScrollXAction,
    ScrollYAction,
    ScrollToAction,
    FillAction,
    ExecuteAction,
    ScreenShotAction,
    WaitForRequestCompletionAction,
    BrowserAction
)
from .enums import (
    RegionCodeType,
    WaitUntilType,
    DeviceType,
    OutputType,
    HttpMethod,
    PayloadType
)
from .request import (
    PreparedScrapeDoRequest
)
from .parameters import (
    RequestParameters,
    RequestParametersDict
)
from .response import (
    ScrapeDoNetworkRequest,
    ScrapeDoWebSocketFrame,
    ScrapeDoWebSocketEvent,
    ScrapeDoWebsocketRequest,
    ScrapeDoActionResult,
    ScrapeDoScreenshot,
    ScrapeDoFrame,
    ScrapeDoResponse
)


__all__ = [
    "ClickAction",
    "WaitAction",
    "WaitSelectorAction",
    "ScrollXAction",
    "ScrollYAction",
    "ScrollToAction",
    "FillAction",
    "ExecuteAction",
    "ScreenShotAction",
    "WaitForRequestCompletionAction",
    "BrowserAction",
    "RegionCodeType",
    "WaitUntilType",
    "DeviceType",
    "OutputType",
    "HttpMethod",
    "PayloadType",
    "RequestParametersDict",
    "RequestParameters",
    "PreparedScrapeDoRequest",
    "ScrapeDoNetworkRequest",
    "ScrapeDoWebSocketFrame",
    "ScrapeDoWebSocketEvent",
    "ScrapeDoWebsocketRequest",
    "ScrapeDoActionResult",
    "ScrapeDoScreenshot",
    "ScrapeDoFrame",
    "ScrapeDoResponse"
]
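Because the re-exports above flatten the submodules, shallow and deep import paths resolve to the same objects. A quick sketch, not part of the package:

# The flat namespace re-exports submodule objects unchanged.
from scrape_do import models
from scrape_do.models.browser_actions import ClickAction

assert models.ClickAction is ClickAction  # same object via either path
print(models.__all__)  # the 28 names listed above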