scrape-do-python 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrape_do/__init__.py +0 -0
- scrape_do/abc.py +0 -0
- scrape_do/async_client.py +0 -0
- scrape_do/client.py +804 -0
- scrape_do/constants.py +84 -0
- scrape_do/exceptions.py +238 -0
- scrape_do/models/__init__.py +79 -0
- scrape_do/models/browser_actions.py +332 -0
- scrape_do/models/enums.py +76 -0
- scrape_do/models/parameters.py +840 -0
- scrape_do/models/request.py +232 -0
- scrape_do/models/response.py +890 -0
- scrape_do/namespaces/__init__.py +0 -0
- scrape_do/namespaces/amazon.py +0 -0
- scrape_do/namespaces/google.py +0 -0
- scrape_do/namespaces/jobs.py +0 -0
- scrape_do_python-0.1.0.dist-info/METADATA +134 -0
- scrape_do_python-0.1.0.dist-info/RECORD +21 -0
- scrape_do_python-0.1.0.dist-info/WHEEL +5 -0
- scrape_do_python-0.1.0.dist-info/licenses/LICENSE +21 -0
- scrape_do_python-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,232 @@
|
|
|
1
|
+
"""Serialization layer and HTTP transport preparation.
|
|
2
|
+
|
|
3
|
+
Bridges the gap between the strictly validated Pydantic models and the
|
|
4
|
+
underlying HTTP client. It wraps the API parameters, handles URL
|
|
5
|
+
encoding, manages payload typing, and injects authentication token
|
|
6
|
+
before network execution.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
import warnings
|
|
11
|
+
from typing import (
|
|
12
|
+
Optional,
|
|
13
|
+
Self,
|
|
14
|
+
Any,
|
|
15
|
+
Dict,
|
|
16
|
+
Union
|
|
17
|
+
)
|
|
18
|
+
from pydantic import (
|
|
19
|
+
BaseModel,
|
|
20
|
+
Field,
|
|
21
|
+
model_validator,
|
|
22
|
+
)
|
|
23
|
+
from .parameters import RequestParameters
|
|
24
|
+
from .enums import (
|
|
25
|
+
PayloadType,
|
|
26
|
+
HttpMethod
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
# ------------------------
|
|
30
|
+
# PreparedScrapeDoRequest
|
|
31
|
+
# ------------------------
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class PreparedScrapeDoRequest(BaseModel):
    """Represents a fully validated, ready-to-execute API call.

    info: Payload Type
        - If `payload_type='json'`, the `body` will be sent to
          `httpx.request()` through the `json` parameter

        - If `payload_type='raw'`, the `body` will be sent to
          `httpx.request()` through the `content` parameter

        - If `payload_type='form'` the `body` will be sent to
          `httpx.request()` through the `data` parameter

    Attributes:
        api_params (RequestParameters): Validated parameters to pass to the
            API
        method (HttpMethod): HTTP method to forward to the target website
        headers (Optional[Dict[str, str]]): Custom HTTP headers to forward
        body (Optional[Union[Dict[str, Any], str, bytes]]): Payload to send to
            the target website (JSON dict, string, or bytes)
        payload_type (PayloadType): Dictates how httpx should encode
            the body. Defaults to 'json'.
    """

    api_params: RequestParameters = Field(
        ...,
        description="The validated parameters to pass to the API. "
    )
    method: HttpMethod = Field(
        default="GET",
        description="The HTTP method to forward to the target website."
    )
    headers: Optional[Dict[str, str]] = Field(
        default=None,
        description="The HTTP headers to forward."
    )
    body: Optional[Union[Dict[str, Any], str, bytes]] = Field(
        default=None,
        description="The payload to send (JSON dict, string, or bytes)."
    )

    payload_type: PayloadType = Field(
        default="json",
        description="Dictates how httpx should encode the body."
    )

    @model_validator(mode="after")
    def cross_validate_http_components(self) -> Self:
        """Cross-references standard HTTP request components (Method, Headers,
        Body) against the Scrape.do specific parameters to ensure the
        configuration will be respected by the proxy network.

        info: Headers
            - Raises a ValueError if none of the header flags is set to
              true in `RequestParameters` and custom headers are provided

            - Raises a ValueError if one of the header flags is set to
              true in `RequestParameters` and no custom headers are
              provided

            - Raises a ValueError if `RequestParameters.extra_headers` is
              set to true and any of the provided headers don't start with
              the required `sd-` prefix.

        info: Method
            - Raises a ValueError if `RequestParameters.render` is set to
              true and `method != "GET"`

        info: Body
            - Emits a UserWarning if a `body` is provided and `method=GET`
              or `method=HEAD`

        Returns:
            The validated instance from which the method was called

        Raises:
            ValueError: If any of the validation steps fails
        """
        # --- Header Validation ---

        # At least one of these flags is required for Scrape.do to route
        # user-supplied headers; without one the API silently drops them.
        has_header_flag = (
            self.api_params.custom_headers or
            self.api_params.extra_headers or
            self.api_params.forward_headers
        )

        if self.headers:
            if not has_header_flag:
                raise ValueError(
                    "You provided 'headers' for the HTTP request, but no "
                    "header routing flag (custom_headers, extra_headers, or "
                    "forward_headers) was enabled in your RequestParameters. "
                    "Scrape.do will ignore these headers."
                )

            # Extra Headers Prefix Check: extra_headers mode only injects
            # headers whose names carry the 'sd-' prefix.
            if self.api_params.extra_headers:
                invalid_keys = [
                    k for k in self.headers.keys()
                    if not k.lower().startswith("sd-")
                ]
                if invalid_keys:
                    raise ValueError(
                        "When 'extra_headers=True' is used, Scrape.do "
                        "requires all injected headers to be prefixed with "
                        f"'sd-'. Invalid headers found: {invalid_keys}. "
                    )
        else:
            if has_header_flag:
                raise ValueError(
                    "One of the header routing flags (custom_headers, "
                    "extra_headers, or forward_headers) is enabled in your "
                    "RequestParameters, but no 'headers' were provided"
                )

        # --- Headless Browser Method Constraint ---

        if self.api_params.render and self.method != "GET":
            raise ValueError(
                "The JavaScript render feature (render=true) works only with"
                " the 'GET' method."
            )

        # --- Payload Type Validation ---
        # Ensure the declared encoding matches the Python type of the body,
        # mirroring how httpx's json=/data=/content= parameters are typed.
        if self.body is not None:
            if (
                self.payload_type in ("json", "form")
                and not isinstance(self.body, dict)
            ):
                raise ValueError(
                    f"When payload_type is '{self.payload_type}', "
                    "the body must be a Python dictionary. "
                    f"Received: {type(self.body).__name__}."
                )
            if (
                self.payload_type == "raw"
                and not isinstance(self.body, (str, bytes))
            ):
                raise ValueError(
                    "When payload_type is 'raw', "
                    "the body must be a string or bytes. "
                    f"Received: {type(self.body).__name__}."
                )

        # --- Warnings ---

        # Body with GET/HEAD Warning: non-fatal because some targets do
        # accept it, but it violates the HTTP spec and may be dropped.
        if self.body is not None and self.method in ("GET", "HEAD"):
            warnings.warn(
                (
                    f"Providing a body payload with a {self.method} request "
                    "violates standard HTTP specifications and may be ignored by "
                    "the target website."
                ),
                UserWarning
            )

        return self

    def to_httpx_kwargs(self, token: Optional[str] = None) -> Dict[str, Any]:
        """Packages the validated object into a dictionary ready for httpx
        unpacking.

        info: Token
            The optional `token` parameter is the user's Scrape.do API key and
            is added here only for convenience. It can also be manually
            inserted into the resulting `httpx_kwargs` dictionary as the value
            to the `token` key if it isn't provided

        Args:
            token (Optional[str]): The Scrape.do API key to include in the
                dictionary

        Returns:
            Keyword arguments strictly formatted for `httpx.request()`.
        """

        params = self.api_params.to_api_params()

        if token is not None:
            params["token"] = token

        kwargs: Dict[str, Any] = {
            "method": self.method,
            "url": "https://api.scrape.do/",
            "params": params
        }

        if self.headers:
            kwargs["headers"] = self.headers

        if self.body is not None:
            # Route the payload to the httpx keyword that matches its
            # declared encoding (see the class docstring).
            if self.payload_type == "json":
                kwargs["json"] = self.body
            elif self.payload_type == "form":
                kwargs["data"] = self.body
            else:
                kwargs["content"] = self.body

        return kwargs