scrape-do-python 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,232 @@
1
+ """Serialization layer and HTTP transport preparation.
2
+
3
+ Bridges the gap between the strictly validated Pydantic models and the
4
+ underlying HTTP client. It wraps the API parameters, handles URL
5
+ encoding, manages payload typing, and injects the authentication token
6
+ before network execution.
7
+ """
8
+
9
+ from __future__ import annotations
10
+ import warnings
11
+ from typing import (
12
+ Optional,
13
+ Self,
14
+ Any,
15
+ Dict,
16
+ Union
17
+ )
18
+ from pydantic import (
19
+ BaseModel,
20
+ Field,
21
+ model_validator,
22
+ )
23
+ from .parameters import RequestParameters
24
+ from .enums import (
25
+ PayloadType,
26
+ HttpMethod
27
+ )
28
+
29
+ # ------------------------
30
+ # PreparedScrapeDoRequest
31
+ # ------------------------
32
+
33
+
34
class PreparedScrapeDoRequest(BaseModel):
    """A validated Scrape.do API call that is ready to be handed to httpx.

    The model pairs the Scrape.do `RequestParameters` with the plain HTTP
    pieces (method, headers, body) and verifies on construction that both
    sides are consistent with each other.

    info: Payload Type
        `payload_type` selects which `httpx.request()` keyword carries the
        `body`:

        - `payload_type='json'` -> `json`
        - `payload_type='form'` -> `data`
        - `payload_type='raw'`  -> `content`

    Attributes:
        api_params (RequestParameters): Validated parameters to pass to the
            API
        method (HttpMethod): HTTP method to forward to the target website.
            Defaults to 'GET'.
        headers (Optional[Dict[str, str]]): Custom HTTP headers to forward
        body (Optional[Union[Dict[str, Any], str, bytes]]): Payload to send
            (JSON dict, string, or bytes)
        payload_type (PayloadType): Dictates how httpx should encode the
            body. Defaults to 'json'.
    """

    api_params: RequestParameters = Field(
        ...,
        description="The validated parameters to pass to the API. ",
    )
    method: HttpMethod = Field(
        default="GET",
        description="The HTTP method to forward to the target website.",
    )
    headers: Optional[Dict[str, str]] = Field(
        default=None,
        description="The HTTP headers to forward.",
    )
    body: Optional[Union[Dict[str, Any], str, bytes]] = Field(
        default=None,
        description="The payload to send (JSON dict, string, or bytes).",
    )
    payload_type: PayloadType = Field(
        default="json",
        description="Dictates how httpx should encode the body.",
    )

    @model_validator(mode="after")
    def cross_validate_http_components(self) -> Self:
        """Check method, headers and body against the Scrape.do parameters.

        The checks run in this order; the first failing one raises:

        1. Headers
            - `headers` given, but none of `custom_headers`,
              `extra_headers` or `forward_headers` is enabled
            - one of those flags is enabled, but no `headers` were given
            - `extra_headers` is enabled and a header name does not start
              with `sd-` (compared case-insensitively)
        2. Method
            - `render` is enabled and `method` is not `"GET"`
        3. Body
            - `payload_type` is `'json'` or `'form'` and `body` is not a
              dict
            - `payload_type` is `'raw'` and `body` is not `str`/`bytes`

        Once every check has passed, a `UserWarning` is emitted if a `body`
        accompanies a `GET` or `HEAD` request.

        Returns:
            The validated instance from which the method was called

        Raises:
            ValueError: If any of the validation steps fails
        """
        flags = self.api_params
        routing_enabled = bool(
            flags.custom_headers
            or flags.extra_headers
            or flags.forward_headers
        )

        # --- Headers: the routing flags and the header mapping must agree ---
        if not self.headers:
            if routing_enabled:
                raise ValueError(
                    "One of the header routing flags (custom_headers, "
                    "extra_headers, or forward_headers) is enabled in your "
                    "RequestParameters, but no 'headers' were provided"
                )
        else:
            if not routing_enabled:
                raise ValueError(
                    "You provided 'headers' for the HTTP request, but no "
                    "header routing flag (custom_headers, extra_headers, or "
                    "forward_headers) was enabled in your RequestParameters. "
                    "Scrape.do will ignore these headers."
                )

            # With extra_headers every header name must carry the 'sd-'
            # prefix; collect all offenders so they are reported together.
            if flags.extra_headers:
                unprefixed = [
                    name for name in self.headers
                    if not name.lower().startswith("sd-")
                ]
                if unprefixed:
                    raise ValueError(
                        "When 'extra_headers=True' is used, Scrape.do "
                        "requires all injected headers to be prefixed with "
                        f"'sd-'. Invalid headers found: {unprefixed}. "
                    )

        # --- Method: rendering is restricted to GET ---
        if flags.render and self.method != "GET":
            raise ValueError(
                "The JavaScript render feature (render=true) works only with"
                " the 'GET' method."
            )

        # --- Body: its Python type must match the declared payload type ---
        payload = self.body
        if payload is not None:
            kind = self.payload_type
            received = type(payload).__name__

            if kind in ("json", "form") and not isinstance(payload, dict):
                raise ValueError(
                    f"When payload_type is '{kind}', "
                    "the body must be a Python dictionary. "
                    f" Received: {received}."
                )
            if kind == "raw" and not isinstance(payload, (str, bytes)):
                raise ValueError(
                    "When payload_type is 'raw', "
                    "the body must be a string or bytes. "
                    f"Received: {received}."
                )

        # --- Warning only: a body on GET/HEAD is allowed but questionable ---
        if payload is not None and self.method in ("GET", "HEAD"):
            warnings.warn(
                (
                    f"Providing a body payload with a {self.method} request "
                    "violates standard HTTP specifications and may be "
                    "ignored by the target website."
                ),
                UserWarning,
            )

        return self

    def to_httpx_kwargs(self, token: Optional[str] = None) -> Dict[str, Any]:
        """Build the keyword arguments for an `httpx.request()` call.

        info: Token
            `token` is the user's Scrape.do API key and is accepted here
            only for convenience. When it is given, it is stored under the
            `token` key of the `params` entry; when it is omitted, no
            `token` key is written by this method.

        Args:
            token (Optional[str]): The Scrape.do API key to include in the
                query parameters

        Returns:
            A dictionary with `method`, `url` and `params`, plus `headers`
            when headers are set and exactly one of `json`, `data` or
            `content` when a body is set.
        """
        query = self.api_params.to_api_params()
        if token is not None:
            query["token"] = token

        request_kwargs: Dict[str, Any] = {
            "method": self.method,
            "url": "https://api.scrape.do/",
            "params": query,
        }

        if self.headers:
            request_kwargs["headers"] = self.headers

        if self.body is None:
            return request_kwargs

        # Pick the httpx keyword that matches the declared payload encoding.
        if self.payload_type == "json":
            body_keyword = "json"
        elif self.payload_type == "form":
            body_keyword = "data"
        else:
            body_keyword = "content"
        request_kwargs[body_keyword] = self.body

        return request_kwargs